diff --git a/hindsight-api-slim/hindsight_api/config.py b/hindsight-api-slim/hindsight_api/config.py index 152b33a44..24962d2e2 100644 --- a/hindsight-api-slim/hindsight_api/config.py +++ b/hindsight-api-slim/hindsight_api/config.py @@ -396,6 +396,7 @@ def normalize_config_dict(config: dict[str, Any]) -> dict[str, Any]: # Retain settings ENV_RETAIN_MAX_COMPLETION_TOKENS = "HINDSIGHT_API_RETAIN_MAX_COMPLETION_TOKENS" ENV_RETAIN_CHUNK_SIZE = "HINDSIGHT_API_RETAIN_CHUNK_SIZE" +ENV_RETAIN_CHUNK_OVERFLOW_FACTOR = "HINDSIGHT_API_RETAIN_CHUNK_OVERFLOW_FACTOR" ENV_RETAIN_EXTRACT_CAUSAL_LINKS = "HINDSIGHT_API_RETAIN_EXTRACT_CAUSAL_LINKS" ENV_RETAIN_EXTRACTION_MODE = "HINDSIGHT_API_RETAIN_EXTRACTION_MODE" ENV_RETAIN_MISSION = "HINDSIGHT_API_RETAIN_MISSION" @@ -818,6 +819,9 @@ def _parse_strategy_boosts(raw: str | None) -> dict[str, str]: # Retain settings DEFAULT_RETAIN_MAX_COMPLETION_TOKENS = 64000 # Max tokens for fact extraction LLM call DEFAULT_RETAIN_CHUNK_SIZE = 3000 # Max chars per chunk for fact extraction +DEFAULT_RETAIN_CHUNK_OVERFLOW_FACTOR = ( + 1.5 # A whole JSONL line / conversation turn is kept intact up to this × chunk size +) DEFAULT_RETAIN_EXTRACT_CAUSAL_LINKS = True # Extract causal links between facts DEFAULT_RETAIN_EXTRACTION_MODE = "concise" # Extraction mode: "concise", "verbose", or "custom" RETAIN_EXTRACTION_MODES = ("concise", "verbose", "custom", "verbatim", "chunks") # Allowed extraction modes @@ -1066,6 +1070,25 @@ def _parse_positive_int(name: str, raw: str | None, default: int) -> int: return parsed +def _parse_chunk_overflow_factor(raw: str | None) -> float: + """Parse the retain chunk overflow factor (a float >= 1.0). + + The factor is how far a single JSONL line / conversation turn may exceed the + chunk-size budget while still being kept whole. A value below 1.0 would split + units that already fit the budget, so it fails fast. Falls back to + ``DEFAULT_RETAIN_CHUNK_OVERFLOW_FACTOR`` when unset/empty. + """ + if raw is None or raw == "": + return DEFAULT_RETAIN_CHUNK_OVERFLOW_FACTOR + try: + parsed = float(raw) + except ValueError as e: + raise ValueError(f"{ENV_RETAIN_CHUNK_OVERFLOW_FACTOR} must be a number, got {raw!r}") from e + if parsed < 1.0: + raise ValueError(f"{ENV_RETAIN_CHUNK_OVERFLOW_FACTOR} must be >= 1.0, got {parsed}") + return parsed + + def _parse_optional_positive_int(name: str, raw: str | None) -> int | None: """Parse an optional env var that must be a positive integer when set.""" if raw is None or raw == "": @@ -1426,6 +1449,7 @@ class HindsightConfig: # Retain settings retain_max_completion_tokens: int retain_chunk_size: int + retain_chunk_overflow_factor: float # Keep a JSONL line / conversation turn whole up to this × chunk size retain_extract_causal_links: bool retain_extraction_mode: str retain_mission: str | None @@ -1649,6 +1673,7 @@ class HindsightConfig: "mcp_enabled_tools", # Retention settings (behavioral) "retain_chunk_size", + "retain_chunk_overflow_factor", "retain_extraction_mode", "retain_mission", "retain_custom_instructions", @@ -2294,6 +2319,7 @@ def from_env(cls) -> "HindsightConfig": os.getenv(ENV_RETAIN_MAX_COMPLETION_TOKENS, str(DEFAULT_RETAIN_MAX_COMPLETION_TOKENS)) ), retain_chunk_size=int(os.getenv(ENV_RETAIN_CHUNK_SIZE, str(DEFAULT_RETAIN_CHUNK_SIZE))), + retain_chunk_overflow_factor=_parse_chunk_overflow_factor(os.getenv(ENV_RETAIN_CHUNK_OVERFLOW_FACTOR)), retain_extract_causal_links=os.getenv( ENV_RETAIN_EXTRACT_CAUSAL_LINKS, str(DEFAULT_RETAIN_EXTRACT_CAUSAL_LINKS) ).lower() diff --git a/hindsight-api-slim/hindsight_api/engine/memory_engine.py b/hindsight-api-slim/hindsight_api/engine/memory_engine.py index 0099b0148..1e5c8b23b 100644 --- a/hindsight-api-slim/hindsight_api/engine/memory_engine.py +++ b/hindsight-api-slim/hindsight_api/engine/memory_engine.py @@ -412,6 +412,19 @@ def _is_oracledb_integrity_error(e: Exception) -> bool: return isinstance(e, oracledb.IntegrityError) +@dataclass +class RetainChunkParams: + """Bank-resolved parameters that govern how retain content is chunked. + + Both fields must mirror what the orchestrator uses so that chunk-count + estimates (for per-document ``chunk_index`` offsets) match the chunk + boundaries the orchestrator actually produces. + """ + + chunk_size: int + overflow_factor: float + + @dataclass class _SubBatchSplit: """Result of packing retain contents into sub-batches. @@ -3225,7 +3238,7 @@ async def retain_batch_async( # with, so the offsets match the chunk_index values it assigns. from .retain import fact_extraction, fact_storage - sub_chunk_size = await self._resolve_retain_chunk_size(bank_id, request_context, strategy) + chunk_params = await self._resolve_retain_chunk_params(bank_id, request_context, strategy) chunk_offsets: dict[str, int] = {} # In update_mode="append", retain_batch prepends the existing document @@ -3248,7 +3261,7 @@ async def retain_batch_async( existing_text = await fact_storage.get_document_content(conn, bank_id, append_doc_id) if existing_text: append_prepend_chunks[append_doc_id] = len( - fact_extraction.chunk_text(existing_text, sub_chunk_size) + fact_extraction.chunk_text(existing_text, chunk_params.chunk_size, chunk_params.overflow_factor) ) for i, (sub_batch, sub_origins) in enumerate(zip(sub_batches, origin_indices), 1): @@ -3302,7 +3315,13 @@ async def retain_batch_async( # document continues the sequence. if sub_doc_id: sub_chunk_count = sum( - len(fact_extraction.chunk_text(item.get("content", "") or "", sub_chunk_size)) + len( + fact_extraction.chunk_text( + item.get("content", "") or "", + chunk_params.chunk_size, + chunk_params.overflow_factor, + ) + ) for item in sub_batch ) # retain_batch only prepends the existing body on the global @@ -3413,19 +3432,20 @@ async def _submit_post_insert_maintenance( except Exception as e: logger.warning(f"Failed to submit graph maintenance task for bank {bank_id}: {e}") - async def _resolve_retain_chunk_size( + async def _resolve_retain_chunk_params( self, bank_id: str, request_context: "RequestContext", strategy: str | None, - ) -> int: - """Resolve the effective ``retain_chunk_size`` for a bank. + ) -> "RetainChunkParams": + """Resolve the effective chunking parameters for a bank. Mirrors the bank-config + strategy resolution that ``_retain_batch_async_internal`` applies before handing config to the orchestrator, so chunk-count estimates used for per-document chunk_index offsets match the chunk_index values the orchestrator - actually assigns. + actually assigns. Both the chunk size and the overflow factor must match, + since the overflow factor affects how oversized units are chunked. """ from hindsight_api.config_resolver import apply_strategy @@ -3433,7 +3453,10 @@ async def _resolve_retain_chunk_size( effective_strategy = strategy or resolved_config.retain_default_strategy if effective_strategy: resolved_config = apply_strategy(resolved_config, effective_strategy) - return getattr(resolved_config, "retain_chunk_size", 3000) + return RetainChunkParams( + chunk_size=getattr(resolved_config, "retain_chunk_size", 3000), + overflow_factor=getattr(resolved_config, "retain_chunk_overflow_factor", 1.5), + ) async def _retain_batch_async_internal( self, diff --git a/hindsight-api-slim/hindsight_api/engine/retain/fact_extraction.py b/hindsight-api-slim/hindsight_api/engine/retain/fact_extraction.py index 7cbe29153..71770ff55 100644 --- a/hindsight-api-slim/hindsight_api/engine/retain/fact_extraction.py +++ b/hindsight-api-slim/hindsight_api/engine/retain/fact_extraction.py @@ -447,18 +447,21 @@ def _split_oversized_unit(text: str, max_chars: int) -> list[str]: return splitter.split_text(text) -def chunk_text(text: str, max_chars: int) -> list[str]: +def chunk_text(text: str, max_chars: int, overflow_factor: float = _CHUNK_OVERFLOW_FACTOR) -> list[str]: """ Split text into chunks, preserving conversation structure when possible. For JSON conversation arrays (user/assistant turns) and JSONL (newline-delimited JSON objects), splits at turn/line boundaries so no object is split across chunks. - A single turn/line that overflows is kept whole up to ``_CHUNK_OVERFLOW_FACTOR``× + A single turn/line that overflows is kept whole up to ``overflow_factor``× the budget, then split as text. For plain text, uses sentence-aware splitting. Args: text: Input text to chunk (plain text, JSON conversation, or JSONL) max_chars: Maximum characters per chunk (default 120k ≈ 30k tokens) + overflow_factor: How far a single turn/line may exceed ``max_chars`` and + still be kept whole (>= 1.0). Raise it to keep larger structured units + (e.g. long JSONL messages) intact; see issue #2136. Returns: List of text chunks, roughly under max_chars @@ -472,12 +475,12 @@ def chunk_text(text: str, max_chars: int) -> list[str]: parsed = json.loads(text) if isinstance(parsed, list) and all(isinstance(turn, dict) for turn in parsed): # This looks like a conversation - chunk at turn boundaries - return _chunk_conversation(parsed, max_chars) + return _chunk_conversation(parsed, max_chars, overflow_factor) except (json.JSONDecodeError, ValueError): pass # Try to parse as JSONL (newline-delimited JSON objects, e.g. session logs) - jsonl_chunks = _chunk_jsonl(text, max_chars) + jsonl_chunks = _chunk_jsonl(text, max_chars, overflow_factor) if jsonl_chunks is not None: return jsonl_chunks @@ -485,19 +488,23 @@ def chunk_text(text: str, max_chars: int) -> list[str]: return _split_oversized_unit(text, max_chars) -def _chunk_conversation(turns: list[dict], max_chars: int) -> list[str]: +def _chunk_conversation( + turns: list[dict], max_chars: int, overflow_factor: float = _CHUNK_OVERFLOW_FACTOR +) -> list[str]: """ Chunk a conversation array at turn boundaries, preserving complete turns. Args: turns: List of conversation turn dicts (with 'role' and 'content' keys) max_chars: Maximum characters per chunk + overflow_factor: How far a single turn may exceed ``max_chars`` and still + be kept whole (>= 1.0). Returns: List of JSON-serialized chunks, each containing complete turns """ - overflow_limit = int(max_chars * _CHUNK_OVERFLOW_FACTOR) + overflow_limit = int(max_chars * overflow_factor) chunks = [] current_chunk = [] @@ -536,18 +543,20 @@ def _flush() -> None: return chunks if chunks else [json.dumps(turns, ensure_ascii=False)] -def _chunk_jsonl(text: str, max_chars: int) -> list[str] | None: +def _chunk_jsonl(text: str, max_chars: int, overflow_factor: float = _CHUNK_OVERFLOW_FACTOR) -> list[str] | None: """Chunk newline-delimited JSON (JSONL) at line boundaries. Detects JSONL — two or more non-empty lines, each a complete JSON object — and packs whole lines into chunks so no line is split across chunks (multiple short lines may share a chunk). A line that overflows is kept whole up to - ``_CHUNK_OVERFLOW_FACTOR``× the budget, then split as text. Returns ``None`` + ``overflow_factor``× the budget, then split as text. Returns ``None`` if the input is not JSONL, so the caller falls back to plain-text splitting. Args: text: Input text to inspect/chunk. max_chars: Maximum characters per chunk. + overflow_factor: How far a single line may exceed ``max_chars`` and still + be kept whole (>= 1.0). Returns: List of JSONL chunks (lines joined by newline), or ``None`` if not JSONL. @@ -564,7 +573,7 @@ def _chunk_jsonl(text: str, max_chars: int) -> list[str] | None: if not isinstance(obj, dict): return None - overflow_limit = int(max_chars * _CHUNK_OVERFLOW_FACTOR) + overflow_limit = int(max_chars * overflow_factor) chunks: list[str] = [] current_chunk: list[str] = [] @@ -1739,7 +1748,7 @@ async def extract_facts_from_text( - chunks: List of tuples (chunk_text, fact_count) for each chunk - usage: Aggregated token usage across all LLM calls """ - chunks = chunk_text(text, max_chars=config.retain_chunk_size) + chunks = chunk_text(text, max_chars=config.retain_chunk_size, overflow_factor=config.retain_chunk_overflow_factor) # Log chunk count before starting LLM requests total_chars = sum(len(c) for c in chunks) @@ -1923,7 +1932,9 @@ async def extract_facts_from_contents_batch_api( prompt, response_schema = _build_extraction_prompt_and_schema(config) for content_index, item in enumerate(contents): - chunks = chunk_text(item.content, max_chars=config.retain_chunk_size) + chunks = chunk_text( + item.content, max_chars=config.retain_chunk_size, overflow_factor=config.retain_chunk_overflow_factor + ) for chunk_index_in_content, chunk in enumerate(chunks): all_chunks_info.append((chunk, content_index, chunk_index_in_content, item.event_date, item.context)) @@ -2344,7 +2355,7 @@ def _extract_facts_chunks( global_chunk_idx = 0 for content_index, content in enumerate(contents): - chunks = chunk_text(content.content, config.retain_chunk_size) + chunks = chunk_text(content.content, config.retain_chunk_size, config.retain_chunk_overflow_factor) for chunk in chunks: chunks_metadata.append( ChunkMetadata( diff --git a/hindsight-api-slim/hindsight_api/engine/retain/orchestrator.py b/hindsight-api-slim/hindsight_api/engine/retain/orchestrator.py index 0b7dc4b8f..73449623b 100644 --- a/hindsight-api-slim/hindsight_api/engine/retain/orchestrator.py +++ b/hindsight-api-slim/hindsight_api/engine/retain/orchestrator.py @@ -864,10 +864,11 @@ async def retain_batch( # retain code paths. chunk_batch_size = getattr(config, "retain_chunk_batch_size", 100) chunk_size = getattr(config, "retain_chunk_size", 3000) + overflow_factor = getattr(config, "retain_chunk_overflow_factor", 1.5) all_pre_chunks: list[str] = [] chunk_to_content: list[int] = [] # maps chunk index -> index into contents for content_idx, content in enumerate(contents): - content_chunks = fact_extraction.chunk_text(content.content, chunk_size) + content_chunks = fact_extraction.chunk_text(content.content, chunk_size, overflow_factor) all_pre_chunks.extend(content_chunks) chunk_to_content.extend([content_idx] * len(content_chunks)) @@ -2249,7 +2250,8 @@ def _chunk_contents_for_delta(contents: list[RetainContent], config) -> dict[int global_chunk_idx = 0 for content in contents: chunk_size = getattr(config, "retain_chunk_size", 3000) - chunks = fact_extraction.chunk_text(content.content, chunk_size) + overflow_factor = getattr(config, "retain_chunk_overflow_factor", 1.5) + chunks = fact_extraction.chunk_text(content.content, chunk_size, overflow_factor) for chunk_text in chunks: result[global_chunk_idx] = chunk_text global_chunk_idx += 1 diff --git a/hindsight-api-slim/tests/test_chunking.py b/hindsight-api-slim/tests/test_chunking.py index 865535df6..5482c4b38 100644 --- a/hindsight-api-slim/tests/test_chunking.py +++ b/hindsight-api-slim/tests/test_chunking.py @@ -231,6 +231,42 @@ def test_chunk_conversation_huge_turn_is_split(): assert len(chunk) <= int(20 * OVERFLOW_FACTOR) +# --------------------------------------------------------------------------- +# Configurable overflow factor (issue #2136) +# --------------------------------------------------------------------------- + + +def test_overflow_factor_keeps_larger_jsonl_line_whole(): + """A raised overflow_factor keeps a line whole that the default would split.""" + # 49-char line, budget 20: default cap 30 splits it, but cap 3x = 60 keeps it. + big = json.dumps({"c": "y" * 40}) # 49 chars + small = json.dumps({"c": "ok"}) + text = "\n".join([big, small]) + assert len(big) == 49 + + default = chunk_text(text, max_chars=20) + raised = chunk_text(text, max_chars=20, overflow_factor=3.0) + + # Default splits the big line into text fragments (more than 2 chunks). + assert len(default) > 2 + # With the raised factor the big line is kept intact as its own chunk. + assert raised == [big, small] + + +def test_overflow_factor_keeps_larger_conversation_turn_whole(): + """A raised overflow_factor keeps a conversation turn whole, not split as text.""" + turns = [{"c": "y" * 40}, {"c": "ok"}] + text = json.dumps(turns) + + default = chunk_text(text, max_chars=20) + raised = chunk_text(text, max_chars=20, overflow_factor=4.0) + + # Default splits the huge turn into text fragments. + assert len(default) > 2 + # With the raised factor every chunk is a valid JSON array of whole turns. + assert [json.loads(c) for c in raised] == [[turns[0]], [turns[1]]] + + # --------------------------------------------------------------------------- # Detection guard # --------------------------------------------------------------------------- diff --git a/hindsight-api-slim/tests/test_config_validation.py b/hindsight-api-slim/tests/test_config_validation.py index 5872ae209..cca3b4e9e 100644 --- a/hindsight-api-slim/tests/test_config_validation.py +++ b/hindsight-api-slim/tests/test_config_validation.py @@ -20,6 +20,7 @@ def setup_test_env(): "HINDSIGHT_API_RETAIN_MAX_COMPLETION_TOKENS", "HINDSIGHT_API_CONSOLIDATION_MAX_COMPLETION_TOKENS", "HINDSIGHT_API_RETAIN_CHUNK_SIZE", + "HINDSIGHT_API_RETAIN_CHUNK_OVERFLOW_FACTOR", "HINDSIGHT_API_LLM_PROVIDER", "HINDSIGHT_API_LLM_MODEL", "HINDSIGHT_API_LLM_REASONING_EFFORT", @@ -106,6 +107,39 @@ def test_valid_retain_config_succeeds(): assert config.retain_chunk_size == 3000 +def test_retain_chunk_overflow_factor_defaults_to_1_5(): + """The chunk overflow factor defaults to 1.5 when unset.""" + from hindsight_api.config import HindsightConfig + + os.environ["HINDSIGHT_API_LLM_PROVIDER"] = "mock" + os.environ.pop("HINDSIGHT_API_RETAIN_CHUNK_OVERFLOW_FACTOR", None) + + config = HindsightConfig.from_env() + assert config.retain_chunk_overflow_factor == 1.5 + + +def test_retain_chunk_overflow_factor_reads_from_env(): + """The chunk overflow factor is read from the environment.""" + from hindsight_api.config import HindsightConfig + + os.environ["HINDSIGHT_API_LLM_PROVIDER"] = "mock" + os.environ["HINDSIGHT_API_RETAIN_CHUNK_OVERFLOW_FACTOR"] = "3" + + config = HindsightConfig.from_env() + assert config.retain_chunk_overflow_factor == 3.0 + + +def test_retain_chunk_overflow_factor_below_one_fails(): + """A factor below 1.0 would split units that already fit — fail fast.""" + from hindsight_api.config import HindsightConfig + + os.environ["HINDSIGHT_API_LLM_PROVIDER"] = "mock" + os.environ["HINDSIGHT_API_RETAIN_CHUNK_OVERFLOW_FACTOR"] = "0.5" + + with pytest.raises(ValueError, match="RETAIN_CHUNK_OVERFLOW_FACTOR"): + HindsightConfig.from_env() + + def test_semantic_min_similarity_reads_from_env(): """Semantic retrieval min similarity can be configured at the server level.""" from hindsight_api.config import HindsightConfig diff --git a/hindsight-api-slim/tests/test_hierarchical_config.py b/hindsight-api-slim/tests/test_hierarchical_config.py index 3b2ecb2eb..abd0ec01f 100644 --- a/hindsight-api-slim/tests/test_hierarchical_config.py +++ b/hindsight-api-slim/tests/test_hierarchical_config.py @@ -113,6 +113,7 @@ async def test_hierarchical_fields_categorization(): assert "retain_mission" in configurable assert "retain_custom_instructions" in configurable assert "retain_chunk_size" in configurable + assert "retain_chunk_overflow_factor" in configurable assert "enable_observations" in configurable assert "consolidation_llm_batch_size" in configurable assert "consolidation_source_facts_max_tokens" in configurable @@ -139,7 +140,7 @@ async def test_hierarchical_fields_categorization(): assert "consolidation_llm_parallelism" in configurable # Verify count is correct - assert len(configurable) == 38 + assert len(configurable) == 39 # Verify credential fields (NEVER exposed) assert "llm_api_key" in credentials diff --git a/hindsight-docs/docs/developer/configuration.md b/hindsight-docs/docs/developer/configuration.md index fb22da937..15836ad76 100644 --- a/hindsight-docs/docs/developer/configuration.md +++ b/hindsight-docs/docs/developer/configuration.md @@ -1017,6 +1017,7 @@ Controls the retain (memory ingestion) pipeline. |----------|-------------|---------| | `HINDSIGHT_API_RETAIN_MAX_COMPLETION_TOKENS` | Max completion tokens for fact extraction LLM calls | `64000` | | `HINDSIGHT_API_RETAIN_CHUNK_SIZE` | Max characters per chunk for fact extraction. Larger chunks extract fewer LLM calls but may lose context. | `3000` | +| `HINDSIGHT_API_RETAIN_CHUNK_OVERFLOW_FACTOR` | Keep a single JSONL line or conversation turn whole up to this multiple of the chunk size before splitting it as text. Raise it to keep long structured messages in a single chunk. Hierarchical (per-bank). | `1.5` | | `HINDSIGHT_API_RETAIN_EXTRACTION_MODE` | Fact extraction mode: `concise`, `verbose`, `verbatim`, `chunks`, or `custom` | `concise` | | `HINDSIGHT_API_RETAIN_MISSION` | What this bank should pay attention to during extraction. Steers the LLM without replacing the extraction rules — works alongside any extraction mode. | - | | `HINDSIGHT_API_RETAIN_CUSTOM_INSTRUCTIONS` | Full prompt override for fact extraction (only used when mode is `custom`). Replaces built-in extraction rules entirely. | - | diff --git a/skills/hindsight-docs/references/developer/configuration.md b/skills/hindsight-docs/references/developer/configuration.md index 4abdc795c..57e8d7a3f 100644 --- a/skills/hindsight-docs/references/developer/configuration.md +++ b/skills/hindsight-docs/references/developer/configuration.md @@ -1017,6 +1017,7 @@ Controls the retain (memory ingestion) pipeline. |----------|-------------|---------| | `HINDSIGHT_API_RETAIN_MAX_COMPLETION_TOKENS` | Max completion tokens for fact extraction LLM calls | `64000` | | `HINDSIGHT_API_RETAIN_CHUNK_SIZE` | Max characters per chunk for fact extraction. Larger chunks extract fewer LLM calls but may lose context. | `3000` | +| `HINDSIGHT_API_RETAIN_CHUNK_OVERFLOW_FACTOR` | Keep a single JSONL line or conversation turn whole up to this multiple of the chunk size before splitting it as text. Raise it to keep long structured messages in a single chunk. Hierarchical (per-bank). | `1.5` | | `HINDSIGHT_API_RETAIN_EXTRACTION_MODE` | Fact extraction mode: `concise`, `verbose`, `verbatim`, `chunks`, or `custom` | `concise` | | `HINDSIGHT_API_RETAIN_MISSION` | What this bank should pay attention to during extraction. Steers the LLM without replacing the extraction rules — works alongside any extraction mode. | - | | `HINDSIGHT_API_RETAIN_CUSTOM_INSTRUCTIONS` | Full prompt override for fact extraction (only used when mode is `custom`). Replaces built-in extraction rules entirely. | - |