vectorize-io · Sanderhoff-alt · Jun 11, 2026
diff --git a/.env.example b/.env.example
@@ -79,6 +79,17 @@ HINDSIGHT_API_LOG_LEVEL=info
 # korean_lindera/lindera(korean), ngram(min,max), edge_ngram(min,max)
 # HINDSIGHT_API_TEXT_SEARCH_EXTENSION_PG_SEARCH_TOKENIZER=
 
+# File Parser (Optional - uses markitdown by default)
+# HINDSIGHT_API_FILE_PARSER=markitdown
+# Enable image/scanned-document OCR for MarkItDown using an OpenAI-compatible vision model.
+# Dedicated OCR settings fall back to HINDSIGHT_API_LLM_API_KEY, HINDSIGHT_API_LLM_BASE_URL,
+# and HINDSIGHT_API_LLM_MODEL when unset. While OCR is disabled, markitdown rejects .jpg/.jpeg/.png files.
+# HINDSIGHT_API_FILE_PARSER_MARKITDOWN_OCR_ENABLED=false
+# HINDSIGHT_API_FILE_PARSER_MARKITDOWN_OCR_API_KEY=
+# HINDSIGHT_API_FILE_PARSER_MARKITDOWN_OCR_BASE_URL=
+# HINDSIGHT_API_FILE_PARSER_MARKITDOWN_OCR_MODEL=
+# HINDSIGHT_API_FILE_PARSER_MARKITDOWN_OCR_PROMPT=
+
 # Embeddings Configuration (Optional - uses local by default)
 # Provider: "local" (default), "onnx", "tei", "openai", "cohere", "google", "openrouter", "zeroentropy", "litellm", or "litellm-sdk"
 # HINDSIGHT_API_EMBEDDINGS_PROVIDER=local

diff --git a/hindsight-api-slim/hindsight_api/api/http.py b/hindsight-api-slim/hindsight_api/api/http.py
@@ -6493,7 +6493,7 @@ async def api_retain(
         description="Upload files (PDF, DOCX, etc.), convert them to markdown, and retain as memories.\n\n"
         "This endpoint handles file upload, conversion, and memory creation in a single operation.\n\n"
         "**Features:**\n"
-        "- Supports PDF, DOCX, PPTX, XLSX, images (with OCR), audio (with transcription)\n"
+        "- Supports PDF, DOCX, PPTX, XLSX, images (parser-dependent OCR), audio (with transcription)\n"
         "- Automatic file-to-markdown conversion using pluggable parsers\n"
         "- Files stored in object storage (PostgreSQL by default, S3 for production)\n"
         "- Each file becomes a separate document with optional metadata/tags\n"

diff --git a/hindsight-api-slim/hindsight_api/config.py b/hindsight-api-slim/hindsight_api/config.py
@@ -422,6 +422,11 @@ def normalize_config_dict(config: dict[str, Any]) -> dict[str, Any]:
 ENV_FILE_STORAGE_AZURE_ACCOUNT_KEY = "HINDSIGHT_API_FILE_STORAGE_AZURE_ACCOUNT_KEY"
 ENV_FILE_PARSER = "HINDSIGHT_API_FILE_PARSER"
 ENV_FILE_PARSER_ALLOWLIST = "HINDSIGHT_API_FILE_PARSER_ALLOWLIST"
+ENV_FILE_PARSER_MARKITDOWN_OCR_ENABLED = "HINDSIGHT_API_FILE_PARSER_MARKITDOWN_OCR_ENABLED"
+ENV_FILE_PARSER_MARKITDOWN_OCR_API_KEY = "HINDSIGHT_API_FILE_PARSER_MARKITDOWN_OCR_API_KEY"
+ENV_FILE_PARSER_MARKITDOWN_OCR_BASE_URL = "HINDSIGHT_API_FILE_PARSER_MARKITDOWN_OCR_BASE_URL"
+ENV_FILE_PARSER_MARKITDOWN_OCR_MODEL = "HINDSIGHT_API_FILE_PARSER_MARKITDOWN_OCR_MODEL"
+ENV_FILE_PARSER_MARKITDOWN_OCR_PROMPT = "HINDSIGHT_API_FILE_PARSER_MARKITDOWN_OCR_PROMPT"
 ENV_FILE_PARSER_IRIS_TOKEN = "HINDSIGHT_API_FILE_PARSER_IRIS_TOKEN"
 ENV_FILE_PARSER_IRIS_ORG_ID = "HINDSIGHT_API_FILE_PARSER_IRIS_ORG_ID"
 ENV_FILE_PARSER_LLAMA_PARSE_API_KEY = "HINDSIGHT_API_FILE_PARSER_LLAMA_PARSE_API_KEY"
@@ -839,6 +844,10 @@ def _parse_strategy_boosts(raw: str | None) -> dict[str, str]:
 DEFAULT_FILE_STORAGE_TYPE = "native"  # PostgreSQL BYTEA storage
 DEFAULT_FILE_PARSER = "markitdown"  # Default parser fallback chain (comma-separated, e.g. "iris,markitdown")
 DEFAULT_FILE_PARSER_ALLOWLIST = None  # Allowlist of parsers clients may request (None = all registered parsers)
+DEFAULT_FILE_PARSER_MARKITDOWN_OCR_ENABLED = False
+DEFAULT_FILE_PARSER_MARKITDOWN_OCR_PROMPT = """You are a precise OCR and document transcription engine.
+
+Extract all visible text from the image or scanned document without translating it. Preserve the original language, wording, numbers, punctuation, and reading order. Reconstruct headings, lists, key-value fields, stamps, and tables as clean Markdown. If text is unclear, mark it as [unclear] instead of inventing content. Return only the extracted Markdown."""
 DEFAULT_FILE_CONVERSION_MAX_BATCH_SIZE_MB = 100  # Max total batch size in MB (all files combined)
 DEFAULT_FILE_CONVERSION_MAX_BATCH_SIZE = 10  # Max files per batch upload
 DEFAULT_ENABLE_FILE_UPLOAD_API = True  # Enable file upload endpoint
@@ -1597,6 +1606,11 @@ class HindsightConfig:
     embeddings_zeroentropy_encoding_format: str = DEFAULT_EMBEDDINGS_ZEROENTROPY_ENCODING_FORMAT
     embeddings_zeroentropy_batch_size: int = DEFAULT_EMBEDDINGS_ZEROENTROPY_BATCH_SIZE
     embeddings_zeroentropy_latency: str | None = DEFAULT_EMBEDDINGS_ZEROENTROPY_LATENCY
+    file_parser_markitdown_ocr_enabled: bool = DEFAULT_FILE_PARSER_MARKITDOWN_OCR_ENABLED
+    file_parser_markitdown_ocr_api_key: str | None = None
+    file_parser_markitdown_ocr_base_url: str | None = None
+    file_parser_markitdown_ocr_model: str | None = None
+    file_parser_markitdown_ocr_prompt: str = DEFAULT_FILE_PARSER_MARKITDOWN_OCR_PROMPT
 
     # Class-level sets for configuration categorization
 
@@ -1637,6 +1651,8 @@ class HindsightConfig:
         "file_storage_gcs_service_account_key",
         "file_storage_azure_account_key",
         # File parser credentials
+        "file_parser_markitdown_ocr_api_key",
+        "file_parser_markitdown_ocr_base_url",
         "file_parser_iris_token",
         "file_parser_llama_parse_api_key",
     }
@@ -2334,6 +2350,22 @@ def from_env(cls) -> "HindsightConfig":
             file_parser_allowlist=_parse_str_list(os.getenv(ENV_FILE_PARSER_ALLOWLIST))
             if os.getenv(ENV_FILE_PARSER_ALLOWLIST)
             else None,
+            file_parser_markitdown_ocr_enabled=os.getenv(
+                ENV_FILE_PARSER_MARKITDOWN_OCR_ENABLED,
+                str(DEFAULT_FILE_PARSER_MARKITDOWN_OCR_ENABLED),
+            ).lower()
+            in ("1", "true", "yes", "on"),
+            file_parser_markitdown_ocr_api_key=os.getenv(ENV_FILE_PARSER_MARKITDOWN_OCR_API_KEY)
+            or os.getenv(ENV_LLM_API_KEY)
+            or None,
+            file_parser_markitdown_ocr_base_url=os.getenv(ENV_FILE_PARSER_MARKITDOWN_OCR_BASE_URL)
+            or os.getenv(ENV_LLM_BASE_URL)
+            or None,
+            file_parser_markitdown_ocr_model=os.getenv(ENV_FILE_PARSER_MARKITDOWN_OCR_MODEL) or llm_model,
+            file_parser_markitdown_ocr_prompt=os.getenv(
+                ENV_FILE_PARSER_MARKITDOWN_OCR_PROMPT,
+                DEFAULT_FILE_PARSER_MARKITDOWN_OCR_PROMPT,
+            ),
             file_parser_iris_token=os.getenv(ENV_FILE_PARSER_IRIS_TOKEN) or None,
             file_parser_iris_org_id=os.getenv(ENV_FILE_PARSER_IRIS_ORG_ID) or None,
             file_parser_llama_parse_api_key=os.getenv(ENV_FILE_PARSER_LLAMA_PARSE_API_KEY) or None,

diff --git a/hindsight-api-slim/hindsight_api/engine/memory_engine.py b/hindsight-api-slim/hindsight_api/engine/memory_engine.py
@@ -2746,7 +2746,16 @@ async def _init_connection(conn: asyncpg.Connection) -> None:
 
         self._parser_registry = FileParserRegistry()
         try:
-            self._parser_registry.register(MarkitdownParser())
+            self._parser_registry.register(
+                MarkitdownParser(
+                    ocr_enabled=config.file_parser_markitdown_ocr_enabled,
+                    ocr_api_key=config.file_parser_markitdown_ocr_api_key,
+                    ocr_base_url=config.file_parser_markitdown_ocr_base_url,
+                    ocr_model=config.file_parser_markitdown_ocr_model,
+                    ocr_prompt=config.file_parser_markitdown_ocr_prompt,
+                    ocr_default_headers=config.llm_default_headers,
+                )
+            )
             logger.debug("Registered markitdown parser")
         except ImportError:
             logger.warning("markitdown not available - file parsing disabled")

diff --git a/hindsight-api-slim/hindsight_api/engine/parsers/markitdown.py b/hindsight-api-slim/hindsight_api/engine/parsers/markitdown.py
@@ -2,9 +2,12 @@
 
 import asyncio
 import logging
+import os
 import tempfile
 from pathlib import Path
 
+from hindsight_api.config import DEFAULT_FILE_PARSER_MARKITDOWN_OCR_PROMPT
+
 from .base import FileParser
 
 logger = logging.getLogger(__name__)
@@ -15,31 +18,89 @@ class MarkitdownParser(FileParser):
     Markitdown file parser.
 
     Uses Microsoft's markitdown library to convert various file formats
-    to markdown including PDF, Office docs, images (via OCR), audio, HTML.
+    to markdown including PDF, Office docs, images with optional OCR,
+    audio, HTML.
 
     Supported formats:
     - PDF (.pdf)
     - Word (.docx, .doc)
     - PowerPoint (.pptx, .ppt)
     - Excel (.xlsx, .xls)
-    - Images (.jpg, .jpeg, .png) - with OCR
+    - Images (.jpg, .jpeg, .png) - optional OCR
     - HTML (.html, .htm)
     - Text (.txt, .md)
     - Audio (.mp3, .wav) - with transcription
     """
 
-    def __init__(self):
+    def __init__(
+        self,
+        *,
+        ocr_enabled: bool = False,
+        ocr_api_key: str | None = None,
+        ocr_base_url: str | None = None,
+        ocr_model: str | None = None,
+        ocr_prompt: str | None = None,
+        ocr_default_headers: dict | None = None,
+    ):
         """Initialize markitdown parser."""
         # Lazy import to avoid requiring markitdown for all users
         try:
             from markitdown import MarkItDown
-
-            self._markitdown = MarkItDown()
         except ImportError as e:
             raise ImportError(
                 "markitdown package is required for file parsing. Install with: pip install markitdown"
             ) from e
 
+        self._ocr_enabled = ocr_enabled
+        kwargs = {}
+        if ocr_enabled:
+            kwargs = self._build_ocr_kwargs(
+                api_key=ocr_api_key,
+                base_url=ocr_base_url,
+                model=ocr_model,
+                prompt=ocr_prompt,
+                default_headers=ocr_default_headers,
+            )
+
+        self._markitdown = MarkItDown(**kwargs)
+
+    def _build_ocr_kwargs(
+        self,
+        *,
+        api_key: str | None,
+        base_url: str | None,
+        model: str | None,
+        prompt: str | None,
+        default_headers: dict | None,
+    ) -> dict:
+        """Build the ``MarkItDown`` kwargs (client, model, prompt) for OpenAI-compatible image OCR.
+
+        Raises ValueError when no model is given, or when neither ``api_key`` nor OPENAI_API_KEY is set.
+        """
+        # Normalize once so a value padded with whitespace is not sent verbatim as the model name.
+        model = (model or "").strip()
+        if not model:
+            raise ValueError(
+                "Markitdown OCR is enabled but no model is configured. "
+                "Set HINDSIGHT_API_FILE_PARSER_MARKITDOWN_OCR_MODEL or HINDSIGHT_API_LLM_MODEL."
+            )
+        if not api_key and not os.getenv("OPENAI_API_KEY"):
+            raise ValueError(
+                "Markitdown OCR is enabled but no API key is configured. "
+                "Set HINDSIGHT_API_FILE_PARSER_MARKITDOWN_OCR_API_KEY, HINDSIGHT_API_LLM_API_KEY, or OPENAI_API_KEY."
+            )
+
+        from openai import OpenAI  # lazy import: only needed when OCR is enabled
+
+        # Forward only the settings that were provided; omitted ones are left to the OpenAI client.
+        candidates = {"api_key": api_key, "base_url": base_url, "default_headers": default_headers}
+        client_kwargs = {name: value for name, value in candidates.items() if value}
+        return {
+            "llm_client": OpenAI(**client_kwargs),
+            "llm_model": model,
+            "llm_prompt": prompt or DEFAULT_FILE_PARSER_MARKITDOWN_OCR_PROMPT,
+        }
+
     async def convert(self, file_data: bytes, filename: str) -> str:
         """Parse file to markdown using markitdown."""
         # markitdown is synchronous, so we run it in executor to avoid blocking
@@ -48,6 +109,13 @@ async def convert(self, file_data: bytes, filename: str) -> str:
 
     def _convert_sync(self, file_data: bytes, filename: str) -> str:
         """Synchronous parsing (runs in thread pool)."""
+        if self._is_image_file(filename) and not self._ocr_enabled:
+            raise RuntimeError(
+                "Image OCR is not enabled for the markitdown parser. "
+                "Set HINDSIGHT_API_FILE_PARSER_MARKITDOWN_OCR_ENABLED=true and configure a vision-capable "
+                "OpenAI-compatible model, or choose an OCR-capable parser."
+            )
+
         # Write to temp file (markitdown requires file path)
         with tempfile.NamedTemporaryFile(suffix=Path(filename).suffix, delete=False) as tmp:
             tmp.write(file_data)
@@ -73,6 +141,11 @@ def _convert_sync(self, file_data: bytes, filename: str) -> str:
             except Exception:
                 pass
 
+    @staticmethod
+    def _is_image_file(filename: str) -> bool:
+        """Return True when ``filename`` ends in .jpg/.jpeg/.png (case-insensitive), i.e. needs OCR for useful text."""
+        return Path(filename).suffix.lower() in {".jpg", ".jpeg", ".png"}
+
     def supports(self, filename: str, content_type: str | None = None) -> bool:
         """Check if markitdown supports this file type."""
         # Supported extensions (from markitdown docs)
@@ -85,7 +158,7 @@ def supports(self, filename: str, content_type: str | None = None) -> bool:
             ".ppt",
             ".xlsx",
             ".xls",
-            # Images (with OCR)
+            # Images (optional OCR)
             ".jpg",
             ".jpeg",
             ".png",

diff --git a/hindsight-api-slim/tests/test_config_validation.py b/hindsight-api-slim/tests/test_config_validation.py
@@ -403,6 +403,53 @@ def test_llm_output_language_empty_string_is_unset(monkeypatch):
     assert config.llm_output_language is None
 
 
+def test_markitdown_ocr_defaults_disabled(monkeypatch):
+    """OCR must stay off when the enable flag is absent from the environment."""
+    from hindsight_api.config import HindsightConfig
+
+    monkeypatch.setenv("HINDSIGHT_API_LLM_PROVIDER", "mock")
+    monkeypatch.delenv("HINDSIGHT_API_FILE_PARSER_MARKITDOWN_OCR_ENABLED", raising=False)
+    assert HindsightConfig.from_env().file_parser_markitdown_ocr_enabled is False
+
+
+def test_markitdown_ocr_falls_back_to_main_llm_config(monkeypatch):
+    from hindsight_api.config import DEFAULT_FILE_PARSER_MARKITDOWN_OCR_PROMPT, HindsightConfig
+    for suffix in ("API_KEY", "BASE_URL", "MODEL", "PROMPT"):  # ambient OCR overrides would mask the fallback
+        monkeypatch.delenv(f"HINDSIGHT_API_FILE_PARSER_MARKITDOWN_OCR_{suffix}", raising=False)
+    monkeypatch.setenv("HINDSIGHT_API_FILE_PARSER_MARKITDOWN_OCR_ENABLED", "true")
+    monkeypatch.setenv("HINDSIGHT_API_LLM_PROVIDER", "mock")
+    monkeypatch.setenv("HINDSIGHT_API_LLM_API_KEY", "main-key")
+    monkeypatch.setenv("HINDSIGHT_API_LLM_BASE_URL", "https://main.example/v1")
+    monkeypatch.setenv("HINDSIGHT_API_LLM_MODEL", "main-vision-model")
+    config = HindsightConfig.from_env()
+    assert config.file_parser_markitdown_ocr_enabled is True
+    assert config.file_parser_markitdown_ocr_api_key == "main-key"
+    assert config.file_parser_markitdown_ocr_base_url == "https://main.example/v1"
+    assert config.file_parser_markitdown_ocr_model == "main-vision-model"
+    assert config.file_parser_markitdown_ocr_prompt == DEFAULT_FILE_PARSER_MARKITDOWN_OCR_PROMPT
+
+
+def test_markitdown_ocr_specific_config_overrides_main_llm_config(monkeypatch):
+    from hindsight_api.config import HindsightConfig
+
+    monkeypatch.setenv("HINDSIGHT_API_FILE_PARSER_MARKITDOWN_OCR_ENABLED", "true")
+    monkeypatch.setenv("HINDSIGHT_API_FILE_PARSER_MARKITDOWN_OCR_API_KEY", "parser-key")
+    monkeypatch.setenv("HINDSIGHT_API_FILE_PARSER_MARKITDOWN_OCR_BASE_URL", "https://parser.example/v1")
+    monkeypatch.setenv("HINDSIGHT_API_FILE_PARSER_MARKITDOWN_OCR_MODEL", "parser-vision-model")
+    monkeypatch.setenv("HINDSIGHT_API_FILE_PARSER_MARKITDOWN_OCR_PROMPT", "Extract this document exactly.")
+    monkeypatch.setenv("HINDSIGHT_API_LLM_PROVIDER", "mock")
+    monkeypatch.setenv("HINDSIGHT_API_LLM_API_KEY", "main-key")
+    monkeypatch.setenv("HINDSIGHT_API_LLM_BASE_URL", "https://main.example/v1")
+    monkeypatch.setenv("HINDSIGHT_API_LLM_MODEL", "main-vision-model")
+    # Both the dedicated OCR and main LLM settings are present; the dedicated ones must win.
+    config = HindsightConfig.from_env()
+    assert config.file_parser_markitdown_ocr_enabled is True
+    assert config.file_parser_markitdown_ocr_api_key == "parser-key"
+    assert config.file_parser_markitdown_ocr_base_url == "https://parser.example/v1"
+    assert config.file_parser_markitdown_ocr_model == "parser-vision-model"
+    assert config.file_parser_markitdown_ocr_prompt == "Extract this document exactly."
+
+
 def test_llm_reasoning_effort_defaults_to_low(monkeypatch):
     from hindsight_api.config import HindsightConfig