Complete OCR annotation pipeline#3
Conversation
There was a problem hiding this comment.
Code Review
This pull request introduces a browser-based OCR Annotation Interface package (annotation_OCR), featuring a Flask web server, an indexing script (ocr_index.py), session storage management (store.py), study agreement metrics computation (study_agreement.py), and study session bundle generation (study_sessions.py). It also includes front-end assets (HTML, CSS, JS) and updates dependency configurations (pyproject.toml, uv.lock). The review feedback highlights critical security and robustness issues, including path traversal vulnerabilities in server.py and store.py due to unvalidated client-supplied paths and session IDs, non-portable hardcoded absolute directories in ocr_index.py, potential AttributeError crashes in server.py and study_agreement.py when parsing malformed JSON files, and missing global error handlers in Flask that lead to generic 500 Internal Server Errors instead of clean 400/404 API responses.
| manifest_items, index_summary, study_config = resolve_session_source( | ||
| study_bundle_path=study_bundle_path, | ||
| manifest_path=manifest_path, | ||
| ocr_root=Path(payload.get("ocr_root") or defaults["ocr_root"]), | ||
| raw_root=Path(payload.get("raw_root") or defaults["raw_root"]), |
There was a problem hiding this comment.
Allowing the client to specify arbitrary absolute paths for manifest_path, study_bundle_path, ocr_root, and raw_root poses a path traversal and arbitrary file read vulnerability. An attacker could craft a request to read sensitive JSON files or scan unauthorized directories on the server. Consider validating that these paths are resolved and restricted to a safe, pre-defined root directory.
There was a problem hiding this comment.
I mean yeah but the server listen on a local port that is ssh forwarded, if the attacker already has a ssh link up, we have more problems to worry about.
| def session_dir(session_id: str) -> Path: | ||
| return SESSIONS_DIR / session_id |
There was a problem hiding this comment.
The session_id parameter is received directly from client requests and is used to construct file paths. On Windows systems, backslashes (\\) can be used as path separators, allowing directory traversal (e.g., ..\\..\\). To prevent this, validate that session_id conforms to a strict alphanumeric/slug format before resolving the path.
| def session_dir(session_id: str) -> Path: | |
| return SESSIONS_DIR / session_id | |
| def session_dir(session_id: str) -> Path: | |
| if not re.match(r"^[A-Za-z0-9_.-]+$", session_id): | |
| raise ValueError("Invalid session ID format") | |
| return SESSIONS_DIR / session_id |
| DEFAULT_RAW_ROOT = Path( | ||
| "/data/workspace/charles/pdf_ocr_deepseek/DeepSeekOCR_Ardian_raw_3kdocs" | ||
| ) |
There was a problem hiding this comment.
| def load_study_bundle(path: Path) -> dict[str, Any]: | ||
| payload = load_json(path) | ||
| sessions = payload.get("sessions") if isinstance(payload, dict) else None | ||
| if payload.get("bundle_type") != "ocr_table_study_bundle" or not isinstance( | ||
| sessions, list | ||
| ): | ||
| raise ValueError(f"invalid study bundle in {path}") |
There was a problem hiding this comment.
If payload is not a dictionary (e.g., if the JSON file is empty, contains a list, or fails to load and returns None), calling payload.get on line 124 will raise an AttributeError instead of raising the intended ValueError. Ensure payload is validated as a dictionary first.
| def load_study_bundle(path: Path) -> dict[str, Any]: | |
| payload = load_json(path) | |
| sessions = payload.get("sessions") if isinstance(payload, dict) else None | |
| if payload.get("bundle_type") != "ocr_table_study_bundle" or not isinstance( | |
| sessions, list | |
| ): | |
| raise ValueError(f"invalid study bundle in {path}") | |
| def load_study_bundle(path: Path) -> dict[str, Any]: | |
| payload = load_json(path) | |
| if not isinstance(payload, dict): | |
| raise ValueError(f"invalid study bundle in {path}") | |
| sessions = payload.get("sessions") | |
| if payload.get("bundle_type") != "ocr_table_study_bundle" or not isinstance( | |
| sessions, list | |
| ): | |
| raise ValueError(f"invalid study bundle in {path}") |
| def load_precomputed_manifest( | ||
| manifest_path: Path, | ||
| ) -> tuple[list[dict[str, Any]], dict[str, Any]]: | ||
| payload = json.loads(manifest_path.read_text(encoding="utf-8")) | ||
| items = payload.get("items") | ||
| if not isinstance(items, list): | ||
| raise ValueError(f"invalid manifest items in {manifest_path}") |
There was a problem hiding this comment.
If the precomputed manifest JSON file contains a non-dictionary top-level element (like a list), payload.get will raise an AttributeError. Validate that payload is a dictionary before calling .get().
| def load_precomputed_manifest( | |
| manifest_path: Path, | |
| ) -> tuple[list[dict[str, Any]], dict[str, Any]]: | |
| payload = json.loads(manifest_path.read_text(encoding="utf-8")) | |
| items = payload.get("items") | |
| if not isinstance(items, list): | |
| raise ValueError(f"invalid manifest items in {manifest_path}") | |
| def load_precomputed_manifest( | |
| manifest_path: Path, | |
| ) -> tuple[list[dict[str, Any]], dict[str, Any]]: | |
| payload = json.loads(manifest_path.read_text(encoding="utf-8")) | |
| if not isinstance(payload, dict): | |
| raise ValueError(f"invalid manifest format in {manifest_path}") | |
| items = payload.get("items") | |
| if not isinstance(items, list): | |
| raise ValueError(f"invalid manifest items in {manifest_path}") |
| def load_study_bundle(bundle_path: Path) -> dict[str, Any]: | ||
| payload = json.loads(bundle_path.read_text(encoding="utf-8")) | ||
| sessions = payload.get("sessions") | ||
| if payload.get("bundle_type") != "ocr_table_study_bundle" or not isinstance( | ||
| sessions, list | ||
| ): | ||
| raise ValueError(f"invalid study bundle in {bundle_path}") |
There was a problem hiding this comment.
If the study bundle JSON file contains a non-dictionary top-level element, payload.get will raise an AttributeError. Validate that payload is a dictionary before calling .get().
| def load_study_bundle(bundle_path: Path) -> dict[str, Any]: | |
| payload = json.loads(bundle_path.read_text(encoding="utf-8")) | |
| sessions = payload.get("sessions") | |
| if payload.get("bundle_type") != "ocr_table_study_bundle" or not isinstance( | |
| sessions, list | |
| ): | |
| raise ValueError(f"invalid study bundle in {bundle_path}") | |
| def load_study_bundle(bundle_path: Path) -> dict[str, Any]: | |
| payload = json.loads(bundle_path.read_text(encoding="utf-8")) | |
| if not isinstance(payload, dict): | |
| raise ValueError(f"invalid study bundle in {bundle_path}") | |
| sessions = payload.get("sessions") | |
| if payload.get("bundle_type") != "ocr_table_study_bundle" or not isinstance( | |
| sessions, list | |
| ): | |
| raise ValueError(f"invalid study bundle in {bundle_path}") |
| def create_app(default_session_id: str | None, build_defaults: dict[str, Any]) -> Flask: | ||
| app = Flask(__name__, template_folder="templates", static_folder="static") | ||
| app.config["DEFAULT_SESSION_ID"] = default_session_id | ||
| app.config["BUILD_DEFAULTS"] = build_defaults |
There was a problem hiding this comment.
Currently, exceptions like FileNotFoundError (when a session doesn't exist) and KeyError (when an item is not in the manifest) are not caught, which results in Flask returning a generic 500 Internal Server Error. Registering global error handlers in Flask to map these to 404 and 400 respectively would greatly improve API robustness and client-side error handling.
| def create_app(default_session_id: str | None, build_defaults: dict[str, Any]) -> Flask: | |
| app = Flask(__name__, template_folder="templates", static_folder="static") | |
| app.config["DEFAULT_SESSION_ID"] = default_session_id | |
| app.config["BUILD_DEFAULTS"] = build_defaults | |
| def create_app(default_session_id: str | None, build_defaults: dict[str, Any]) -> Flask: | |
| app = Flask(__name__, template_folder="templates", static_folder="static") | |
| app.config["DEFAULT_SESSION_ID"] = default_session_id | |
| app.config["BUILD_DEFAULTS"] = build_defaults | |
| @app.errorhandler(FileNotFoundError) | |
| def handle_file_not_found(e): | |
| return jsonify(error=str(e)), 404 | |
| @app.errorhandler(KeyError) | |
| def handle_key_error(e): | |
| return jsonify(error=str(e)), 400 |
There was a problem hiding this comment.
Pull request overview
Adds a new annotation_OCR package that provides a browser-based OCR table annotation workflow, including manifest generation, session persistence, study-bundle assignment, annotation UI, and agreement analysis.
Changes:
- Adds Flask server, frontend templates/static assets, and file-backed annotation storage.
- Adds OCR/table manifest indexing, study-session bundle generation, summary regeneration, and agreement analysis scripts.
- Adds runtime dependencies for the web UI and Markdown sanitization, plus a standalone OCR cleanup script.
Reviewed changes
Copilot reviewed 15 out of 17 changed files in this pull request and generated 7 comments.
Show a summary per file
| File | Description |
|---|---|
.gitignore |
Ignores generated OCR annotation session artifacts. |
pyproject.toml |
Adds Flask, Markdown, and Bleach dependencies. |
uv.lock |
Locks new runtime dependencies and transitive packages. |
scripts/fix_broken_dollar_overlap.py |
Adds OCR cleanup script for broken dollar markers. |
annotation_OCR/__init__.py |
Introduces the annotation package. |
annotation_OCR/README.md |
Documents running the annotation server and study workflow. |
annotation_OCR/manifests/README.md |
Documents reusable manifests and study bundles. |
annotation_OCR/ocr_index.py |
Builds page/table annotation manifests from OCR outputs. |
annotation_OCR/server.py |
Implements Flask routes for sessions, items, images, annotations, and summaries. |
annotation_OCR/store.py |
Implements file-backed session metadata, logs, current annotations, and summaries. |
annotation_OCR/study_sessions.py |
Generates balanced study-session bundles. |
annotation_OCR/study_agreement.py |
Computes agreement and accept/reject metrics. |
annotation_OCR/summarize.py |
Regenerates per-session and combined summaries. |
annotation_OCR/templates/index.html |
Adds main annotation UI template. |
annotation_OCR/templates/landing.html |
Adds landing/session creation UI. |
annotation_OCR/static/app.js |
Adds client-side annotation, navigation, image, and shortcut behavior. |
annotation_OCR/static/style.css |
Adds styling for the annotation interface. |
💡 Add Copilot custom instructions for smarter, more guided reviews. Learn how to get started.
| for item in manifest: | ||
| status = current.get(item["item_id"], {}).get("overall_status", "unreviewed") | ||
| status_counts[status] = status_counts.get(status, 0) + 1 | ||
|
|
||
| next_unreviewed_index = None | ||
| for index, item in enumerate(manifest): | ||
| if item["item_id"] not in current: | ||
| next_unreviewed_index = index | ||
| break | ||
|
|
||
| return { | ||
| "metadata": metadata, | ||
| "item_count": len(manifest), | ||
| "reviewed_count": len(current), |
| target = Path(raw_png_path).resolve() | ||
| raw_root = Path(item.get("raw_root") or "/").resolve() | ||
| if not target.is_relative_to(raw_root): | ||
| abort(400, description="raw image outside raw root") |
| The 15- and 16-annotator bundles use 1500 unique tables with 300 triple-coded | ||
| agreement tables. The 14-annotator bundle lowers the agreement subset to 220 so | ||
| all session quotas still stay within the 120 to 140 target range. |
|
|
||
| - `1500 unique + 300 triple-coded` requires `2100` total annotations. | ||
| - That fits 15 or 16 annotators while keeping each session in the `120–140` range. | ||
| - For 14 annotators, the bundle uses `220` agreement tables instead, for `1940` total annotations and per-session targets of `138–139`. |
| bundle = load_study_bundle(bundle_path) | ||
| claimed = claimed_study_slots(bundle_path) | ||
| sessions = bundle["sessions"] | ||
| next_session = None | ||
| for entry in sessions: | ||
| slot = entry.get("slot") | ||
| if isinstance(slot, int) and slot not in claimed: | ||
| next_session = entry | ||
| break |
There was a problem hiding this comment.
yeah that's a good point
| if 0 <= page_index < len(page_pngs): | ||
| return page_pngs[page_index] |
No description provided.