Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,17 @@ HINDSIGHT_API_LOG_LEVEL=info
# korean_lindera/lindera(korean), ngram(min,max), edge_ngram(min,max)
# HINDSIGHT_API_TEXT_SEARCH_EXTENSION_PG_SEARCH_TOKENIZER=

# File Parser (Optional - uses markitdown by default)
# HINDSIGHT_API_FILE_PARSER=markitdown
# Enable image/scanned-document OCR for MarkItDown using an OpenAI-compatible vision model.
# The OCR API key, base URL, and model fall back to HINDSIGHT_API_LLM_API_KEY,
# HINDSIGHT_API_LLM_BASE_URL, and HINDSIGHT_API_LLM_MODEL respectively when unset;
# the OCR prompt falls back to a built-in default prompt.
# HINDSIGHT_API_FILE_PARSER_MARKITDOWN_OCR_ENABLED=false
# HINDSIGHT_API_FILE_PARSER_MARKITDOWN_OCR_API_KEY=
# HINDSIGHT_API_FILE_PARSER_MARKITDOWN_OCR_BASE_URL=
# HINDSIGHT_API_FILE_PARSER_MARKITDOWN_OCR_MODEL=
# HINDSIGHT_API_FILE_PARSER_MARKITDOWN_OCR_PROMPT=

# Embeddings Configuration (Optional - uses local by default)
# Provider: "local" (default), "onnx", "tei", "openai", "cohere", "google", "openrouter", "zeroentropy", "litellm", or "litellm-sdk"
# HINDSIGHT_API_EMBEDDINGS_PROVIDER=local
Expand Down
2 changes: 1 addition & 1 deletion hindsight-api-slim/hindsight_api/api/http.py
Original file line number Diff line number Diff line change
Expand Up @@ -6493,7 +6493,7 @@ async def api_retain(
description="Upload files (PDF, DOCX, etc.), convert them to markdown, and retain as memories.\n\n"
"This endpoint handles file upload, conversion, and memory creation in a single operation.\n\n"
"**Features:**\n"
"- Supports PDF, DOCX, PPTX, XLSX, images (with OCR), audio (with transcription)\n"
"- Supports PDF, DOCX, PPTX, XLSX, images (parser-dependent OCR), audio (with transcription)\n"
"- Automatic file-to-markdown conversion using pluggable parsers\n"
"- Files stored in object storage (PostgreSQL by default, S3 for production)\n"
"- Each file becomes a separate document with optional metadata/tags\n"
Expand Down
32 changes: 32 additions & 0 deletions hindsight-api-slim/hindsight_api/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -422,6 +422,11 @@ def normalize_config_dict(config: dict[str, Any]) -> dict[str, Any]:
ENV_FILE_STORAGE_AZURE_ACCOUNT_KEY = "HINDSIGHT_API_FILE_STORAGE_AZURE_ACCOUNT_KEY"
ENV_FILE_PARSER = "HINDSIGHT_API_FILE_PARSER"
ENV_FILE_PARSER_ALLOWLIST = "HINDSIGHT_API_FILE_PARSER_ALLOWLIST"
ENV_FILE_PARSER_MARKITDOWN_OCR_ENABLED = "HINDSIGHT_API_FILE_PARSER_MARKITDOWN_OCR_ENABLED"
ENV_FILE_PARSER_MARKITDOWN_OCR_API_KEY = "HINDSIGHT_API_FILE_PARSER_MARKITDOWN_OCR_API_KEY"
ENV_FILE_PARSER_MARKITDOWN_OCR_BASE_URL = "HINDSIGHT_API_FILE_PARSER_MARKITDOWN_OCR_BASE_URL"
ENV_FILE_PARSER_MARKITDOWN_OCR_MODEL = "HINDSIGHT_API_FILE_PARSER_MARKITDOWN_OCR_MODEL"
ENV_FILE_PARSER_MARKITDOWN_OCR_PROMPT = "HINDSIGHT_API_FILE_PARSER_MARKITDOWN_OCR_PROMPT"
ENV_FILE_PARSER_IRIS_TOKEN = "HINDSIGHT_API_FILE_PARSER_IRIS_TOKEN"
ENV_FILE_PARSER_IRIS_ORG_ID = "HINDSIGHT_API_FILE_PARSER_IRIS_ORG_ID"
ENV_FILE_PARSER_LLAMA_PARSE_API_KEY = "HINDSIGHT_API_FILE_PARSER_LLAMA_PARSE_API_KEY"
Expand Down Expand Up @@ -839,6 +844,10 @@ def _parse_strategy_boosts(raw: str | None) -> dict[str, str]:
DEFAULT_FILE_STORAGE_TYPE = "native" # PostgreSQL BYTEA storage
DEFAULT_FILE_PARSER = "markitdown" # Default parser fallback chain (comma-separated, e.g. "iris,markitdown")
DEFAULT_FILE_PARSER_ALLOWLIST = None # Allowlist of parsers clients may request (None = all registered parsers)
DEFAULT_FILE_PARSER_MARKITDOWN_OCR_ENABLED = False
DEFAULT_FILE_PARSER_MARKITDOWN_OCR_PROMPT = """You are a precise OCR and document transcription engine.

Extract all visible text from the image or scanned document without translating it. Preserve the original language, wording, numbers, punctuation, and reading order. Reconstruct headings, lists, key-value fields, stamps, and tables as clean Markdown. If text is unclear, mark it as [unclear] instead of inventing content. Return only the extracted Markdown."""
DEFAULT_FILE_CONVERSION_MAX_BATCH_SIZE_MB = 100 # Max total batch size in MB (all files combined)
DEFAULT_FILE_CONVERSION_MAX_BATCH_SIZE = 10 # Max files per batch upload
DEFAULT_ENABLE_FILE_UPLOAD_API = True # Enable file upload endpoint
Expand Down Expand Up @@ -1597,6 +1606,11 @@ class HindsightConfig:
embeddings_zeroentropy_encoding_format: str = DEFAULT_EMBEDDINGS_ZEROENTROPY_ENCODING_FORMAT
embeddings_zeroentropy_batch_size: int = DEFAULT_EMBEDDINGS_ZEROENTROPY_BATCH_SIZE
embeddings_zeroentropy_latency: str | None = DEFAULT_EMBEDDINGS_ZEROENTROPY_LATENCY
file_parser_markitdown_ocr_enabled: bool = DEFAULT_FILE_PARSER_MARKITDOWN_OCR_ENABLED
file_parser_markitdown_ocr_api_key: str | None = None
file_parser_markitdown_ocr_base_url: str | None = None
file_parser_markitdown_ocr_model: str | None = None
file_parser_markitdown_ocr_prompt: str = DEFAULT_FILE_PARSER_MARKITDOWN_OCR_PROMPT

# Class-level sets for configuration categorization

Expand Down Expand Up @@ -1637,6 +1651,8 @@ class HindsightConfig:
"file_storage_gcs_service_account_key",
"file_storage_azure_account_key",
# File parser credentials
"file_parser_markitdown_ocr_api_key",
"file_parser_markitdown_ocr_base_url",
"file_parser_iris_token",
"file_parser_llama_parse_api_key",
}
Expand Down Expand Up @@ -2334,6 +2350,22 @@ def from_env(cls) -> "HindsightConfig":
file_parser_allowlist=_parse_str_list(os.getenv(ENV_FILE_PARSER_ALLOWLIST))
if os.getenv(ENV_FILE_PARSER_ALLOWLIST)
else None,
file_parser_markitdown_ocr_enabled=os.getenv(
ENV_FILE_PARSER_MARKITDOWN_OCR_ENABLED,
str(DEFAULT_FILE_PARSER_MARKITDOWN_OCR_ENABLED),
).lower()
in ("1", "true", "yes", "on"),
file_parser_markitdown_ocr_api_key=os.getenv(ENV_FILE_PARSER_MARKITDOWN_OCR_API_KEY)
or os.getenv(ENV_LLM_API_KEY)
or None,
file_parser_markitdown_ocr_base_url=os.getenv(ENV_FILE_PARSER_MARKITDOWN_OCR_BASE_URL)
or os.getenv(ENV_LLM_BASE_URL)
or None,
file_parser_markitdown_ocr_model=os.getenv(ENV_FILE_PARSER_MARKITDOWN_OCR_MODEL) or llm_model,
file_parser_markitdown_ocr_prompt=os.getenv(
ENV_FILE_PARSER_MARKITDOWN_OCR_PROMPT,
DEFAULT_FILE_PARSER_MARKITDOWN_OCR_PROMPT,
),
file_parser_iris_token=os.getenv(ENV_FILE_PARSER_IRIS_TOKEN) or None,
file_parser_iris_org_id=os.getenv(ENV_FILE_PARSER_IRIS_ORG_ID) or None,
file_parser_llama_parse_api_key=os.getenv(ENV_FILE_PARSER_LLAMA_PARSE_API_KEY) or None,
Expand Down
11 changes: 10 additions & 1 deletion hindsight-api-slim/hindsight_api/engine/memory_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -2746,7 +2746,16 @@ async def _init_connection(conn: asyncpg.Connection) -> None:

self._parser_registry = FileParserRegistry()
try:
self._parser_registry.register(MarkitdownParser())
self._parser_registry.register(
MarkitdownParser(
ocr_enabled=config.file_parser_markitdown_ocr_enabled,
ocr_api_key=config.file_parser_markitdown_ocr_api_key,
ocr_base_url=config.file_parser_markitdown_ocr_base_url,
ocr_model=config.file_parser_markitdown_ocr_model,
ocr_prompt=config.file_parser_markitdown_ocr_prompt,
ocr_default_headers=config.llm_default_headers,
)
)
logger.debug("Registered markitdown parser")
except ImportError:
logger.warning("markitdown not available - file parsing disabled")
Expand Down
85 changes: 79 additions & 6 deletions hindsight-api-slim/hindsight_api/engine/parsers/markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,12 @@

import asyncio
import logging
import os
import tempfile
from pathlib import Path

from hindsight_api.config import DEFAULT_FILE_PARSER_MARKITDOWN_OCR_PROMPT

from .base import FileParser

logger = logging.getLogger(__name__)
Expand All @@ -15,31 +18,89 @@ class MarkitdownParser(FileParser):
Markitdown file parser.

Uses Microsoft's markitdown library to convert various file formats
to markdown including PDF, Office docs, images (via OCR), audio, HTML.
to markdown including PDF, Office docs, images with optional OCR,
audio, HTML.

Supported formats:
- PDF (.pdf)
- Word (.docx, .doc)
- PowerPoint (.pptx, .ppt)
- Excel (.xlsx, .xls)
- Images (.jpg, .jpeg, .png) - with OCR
- Images (.jpg, .jpeg, .png) - optional OCR
- HTML (.html, .htm)
- Text (.txt, .md)
- Audio (.mp3, .wav) - with transcription
"""

def __init__(
    self,
    *,
    ocr_enabled: bool = False,
    ocr_api_key: str | None = None,
    ocr_base_url: str | None = None,
    ocr_model: str | None = None,
    ocr_prompt: str | None = None,
    ocr_default_headers: dict | None = None,
):
    """Initialize the markitdown parser, optionally with LLM-backed image OCR.

    All parameters are keyword-only and default to "OCR disabled", so a
    bare ``MarkitdownParser()`` call keeps working.

    Args:
        ocr_enabled: When true, the converter is created with the LLM
            client settings produced by ``_build_ocr_kwargs``.
        ocr_api_key: API key for the OCR client.
        ocr_base_url: Base URL for the OCR client.
        ocr_model: Model name used for OCR.
        ocr_prompt: Prompt sent with each image; ``_build_ocr_kwargs``
            substitutes the default prompt when this is empty.
        ocr_default_headers: Extra headers passed to the OCR client.

    Raises:
        ImportError: If the markitdown package is not installed.
        ValueError: Raised by ``_build_ocr_kwargs`` when OCR is enabled
            but no model or API key is configured.
    """
    # Lazy import to avoid requiring markitdown for all users. Only the
    # import sits in the try block so unrelated errors are not relabelled.
    try:
        from markitdown import MarkItDown
    except ImportError as e:
        raise ImportError(
            "markitdown package is required for file parsing. Install with: pip install markitdown"
        ) from e

    self._ocr_enabled = ocr_enabled

    # Build the converter exactly once: with no extra kwargs when OCR is
    # off, with the LLM client settings when it is on.
    kwargs = {}
    if ocr_enabled:
        kwargs = self._build_ocr_kwargs(
            api_key=ocr_api_key,
            base_url=ocr_base_url,
            model=ocr_model,
            prompt=ocr_prompt,
            default_headers=ocr_default_headers,
        )

    self._markitdown = MarkItDown(**kwargs)

def _build_ocr_kwargs(
self,
*,
api_key: str | None,
base_url: str | None,
model: str | None,
prompt: str | None,
default_headers: dict | None,
) -> dict:
"""Build MarkItDown kwargs for OpenAI-compatible image OCR."""
if not model or not model.strip():
raise ValueError(
"Markitdown OCR is enabled but no model is configured. "
"Set HINDSIGHT_API_FILE_PARSER_MARKITDOWN_OCR_MODEL or HINDSIGHT_API_LLM_MODEL."
)
if not api_key and not os.getenv("OPENAI_API_KEY"):
raise ValueError(
"Markitdown OCR is enabled but no API key is configured. "
"Set HINDSIGHT_API_FILE_PARSER_MARKITDOWN_OCR_API_KEY, HINDSIGHT_API_LLM_API_KEY, or OPENAI_API_KEY."
)

from openai import OpenAI

client_kwargs = {}
if api_key:
client_kwargs["api_key"] = api_key
if base_url:
client_kwargs["base_url"] = base_url
if default_headers:
client_kwargs["default_headers"] = default_headers

return {
"llm_client": OpenAI(**client_kwargs),
"llm_model": model,
"llm_prompt": prompt or DEFAULT_FILE_PARSER_MARKITDOWN_OCR_PROMPT,
}

async def convert(self, file_data: bytes, filename: str) -> str:
"""Parse file to markdown using markitdown."""
# markitdown is synchronous, so we run it in executor to avoid blocking
Expand All @@ -48,6 +109,13 @@ async def convert(self, file_data: bytes, filename: str) -> str:

def _convert_sync(self, file_data: bytes, filename: str) -> str:
"""Synchronous parsing (runs in thread pool)."""
if self._is_image_file(filename) and not self._ocr_enabled:
raise RuntimeError(
"Image OCR is not enabled for the markitdown parser. "
"Set HINDSIGHT_API_FILE_PARSER_MARKITDOWN_OCR_ENABLED=true and configure a vision-capable "
"OpenAI-compatible model, or choose an OCR-capable parser."
)

# Write to temp file (markitdown requires file path)
with tempfile.NamedTemporaryFile(suffix=Path(filename).suffix, delete=False) as tmp:
tmp.write(file_data)
Expand All @@ -73,6 +141,11 @@ def _convert_sync(self, file_data: bytes, filename: str) -> str:
except Exception:
pass

@staticmethod
def _is_image_file(filename: str) -> bool:
"""Return whether the file type needs OCR to extract useful text."""
return Path(filename).suffix.lower() in {".jpg", ".jpeg", ".png"}

def supports(self, filename: str, content_type: str | None = None) -> bool:
"""Check if markitdown supports this file type."""
# Supported extensions (from markitdown docs)
Expand All @@ -85,7 +158,7 @@ def supports(self, filename: str, content_type: str | None = None) -> bool:
".ppt",
".xlsx",
".xls",
# Images (with OCR)
# Images (optional OCR)
".jpg",
".jpeg",
".png",
Expand Down
47 changes: 47 additions & 0 deletions hindsight-api-slim/tests/test_config_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -403,6 +403,53 @@ def test_llm_output_language_empty_string_is_unset(monkeypatch):
assert config.llm_output_language is None


def test_markitdown_ocr_defaults_disabled(monkeypatch):
    """Markitdown OCR is off when its enable flag is absent from the environment."""
    from hindsight_api.config import HindsightConfig

    # Clear the flag so a value exported in the developer/CI shell cannot
    # leak in; this test is about the default, not the ambient setting.
    monkeypatch.delenv("HINDSIGHT_API_FILE_PARSER_MARKITDOWN_OCR_ENABLED", raising=False)
    monkeypatch.setenv("HINDSIGHT_API_LLM_PROVIDER", "mock")

    config = HindsightConfig.from_env()
    assert config.file_parser_markitdown_ocr_enabled is False


def test_markitdown_ocr_falls_back_to_main_llm_config(monkeypatch):
    """Unset dedicated OCR settings inherit the main LLM key, base URL and model."""
    from hindsight_api.config import DEFAULT_FILE_PARSER_MARKITDOWN_OCR_PROMPT, HindsightConfig

    # The fallback only applies when the dedicated variables are unset, so
    # remove any that the surrounding environment may have exported.
    for dedicated_var in (
        "HINDSIGHT_API_FILE_PARSER_MARKITDOWN_OCR_API_KEY",
        "HINDSIGHT_API_FILE_PARSER_MARKITDOWN_OCR_BASE_URL",
        "HINDSIGHT_API_FILE_PARSER_MARKITDOWN_OCR_MODEL",
        "HINDSIGHT_API_FILE_PARSER_MARKITDOWN_OCR_PROMPT",
    ):
        monkeypatch.delenv(dedicated_var, raising=False)

    monkeypatch.setenv("HINDSIGHT_API_FILE_PARSER_MARKITDOWN_OCR_ENABLED", "true")
    monkeypatch.setenv("HINDSIGHT_API_LLM_PROVIDER", "mock")
    monkeypatch.setenv("HINDSIGHT_API_LLM_API_KEY", "main-key")
    monkeypatch.setenv("HINDSIGHT_API_LLM_BASE_URL", "https://main.example/v1")
    monkeypatch.setenv("HINDSIGHT_API_LLM_MODEL", "main-vision-model")

    config = HindsightConfig.from_env()
    assert config.file_parser_markitdown_ocr_enabled is True
    assert config.file_parser_markitdown_ocr_api_key == "main-key"
    assert config.file_parser_markitdown_ocr_base_url == "https://main.example/v1"
    assert config.file_parser_markitdown_ocr_model == "main-vision-model"
    assert config.file_parser_markitdown_ocr_prompt == DEFAULT_FILE_PARSER_MARKITDOWN_OCR_PROMPT


def test_markitdown_ocr_specific_config_overrides_main_llm_config(monkeypatch):
    """Dedicated OCR environment variables take precedence over the main LLM settings."""
    from hindsight_api.config import HindsightConfig

    # Both the dedicated OCR values and the main LLM values are present;
    # the assertions below check that the dedicated ones are chosen.
    environment = {
        "HINDSIGHT_API_FILE_PARSER_MARKITDOWN_OCR_ENABLED": "true",
        "HINDSIGHT_API_FILE_PARSER_MARKITDOWN_OCR_API_KEY": "parser-key",
        "HINDSIGHT_API_FILE_PARSER_MARKITDOWN_OCR_BASE_URL": "https://parser.example/v1",
        "HINDSIGHT_API_FILE_PARSER_MARKITDOWN_OCR_MODEL": "parser-vision-model",
        "HINDSIGHT_API_FILE_PARSER_MARKITDOWN_OCR_PROMPT": "Extract this document exactly.",
        "HINDSIGHT_API_LLM_PROVIDER": "mock",
        "HINDSIGHT_API_LLM_API_KEY": "main-key",
        "HINDSIGHT_API_LLM_BASE_URL": "https://main.example/v1",
        "HINDSIGHT_API_LLM_MODEL": "main-vision-model",
    }
    for variable, value in environment.items():
        monkeypatch.setenv(variable, value)

    loaded = HindsightConfig.from_env()

    assert loaded.file_parser_markitdown_ocr_enabled is True
    assert loaded.file_parser_markitdown_ocr_api_key == "parser-key"
    assert loaded.file_parser_markitdown_ocr_base_url == "https://parser.example/v1"
    assert loaded.file_parser_markitdown_ocr_model == "parser-vision-model"
    assert loaded.file_parser_markitdown_ocr_prompt == "Extract this document exactly."


def test_llm_reasoning_effort_defaults_to_low(monkeypatch):
from hindsight_api.config import HindsightConfig

Expand Down
Loading
Loading