sileod · sileod · Mar 22, 2026
diff --git a/TASK_AUTHORING_GUIDE.md b/TASK_AUTHORING_GUIDE.md
@@ -24,6 +24,9 @@ Every task should provide:
 
 `Task.generate_example(...)` automatically adds metadata:
 - `_task`, `_level`, `_config`, `_time`, `_prompt_tokens`, `_cot_tokens`.
+- `_score_answer = {"version": ..., "hash": ..., "commit": ...}`.
+
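+If you change `score_answer` in a way that breaks scorer compatibility, bump `score_answer_version` (default `0`). If entries generated under the old version must still be scorable, also point `score_answer_history[old_version]` at a legacy file or commit; otherwise scoring an entry that records the old version raises a `KeyError` (unless its recorded hash still matches the current scorer).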
 
 ## Config and Difficulty Scaling
 Base `Config` protected fields:

diff --git a/reasoning_core/__init__.py b/reasoning_core/__init__.py
@@ -89,7 +89,7 @@ def __getattribute__(self,_): raise RuntimeError("score_answer should not use se
 
 
+# Dispatch through score_answer_for_entry so that the scorer matching the
+# version recorded in each entry's metadata is the one that runs.
 scorers = {
-    k: lambda answer, entry, task_name=k: DATASETS[task_name].score_answer(SelfMock(), answer, entry)
+    k: lambda answer, entry, task_name=k: DATASETS[task_name].score_answer_for_entry(answer, entry, SelfMock())
     for k in DATASETS.keys()
 }
 

diff --git a/reasoning_core/score_answer_history.py b/reasoning_core/score_answer_history.py
@@ -0,0 +1,131 @@
+import functools
+import hashlib
+import inspect
+import linecache
+import subprocess
+import sys
+import textwrap
+from pathlib import Path
+
+
+# Memoizes historical scorers that were already loaded, keyed by
+# (task module, task class name, version string, repr of the history spec).
+_SCORE_ANSWER_CACHE = {}
+# Parent of the directory containing this file; used as the working directory
+# for git commands and as the base for relative history paths.
+# NOTE(review): presumably the git checkout root -- confirm for installed packages.
+_REPO_ROOT = Path(__file__).resolve().parents[1]
+
+
+def repo_root():
+    """Return the directory treated as the repository root (``_REPO_ROOT``)."""
+    return _REPO_ROOT
+
+
+@functools.lru_cache(maxsize=1)
+def current_git_commit():
+    """Return the ``HEAD`` commit hash of the repository, or ``None``.
+
+    The lookup is best-effort: if ``git`` is missing, the directory is not a
+    repository, or the output cannot be decoded, ``None`` is returned instead
+    of raising.
+
+    The result is memoized for the lifetime of the process. This function is
+    called once per generated example, and spawning a ``git`` subprocess each
+    time would dominate generation cost.
+    """
+    try:
+        output = subprocess.check_output(
+            ["git", "rev-parse", "HEAD"],
+            cwd=repo_root(),
+            text=True,
+            stderr=subprocess.DEVNULL,
+        )
+    except (OSError, subprocess.SubprocessError, UnicodeDecodeError):
+        # OSError: git executable or cwd unavailable.
+        # SubprocessError: git exited non-zero (e.g. not a repository).
+        return None
+    return output.strip()
+
+
+def score_answer_hash(task_cls, fn=None):
+    """Return a 12-character SHA-1 fingerprint of a scorer.
+
+    Args:
+        task_cls: Task class whose ``score_answer`` is hashed when ``fn`` is
+            not given.
+        fn: Optional callable to hash instead of ``task_cls.score_answer``.
+
+    Returns:
+        The first 12 hex digits of the SHA-1 of the scorer's dedented source.
+        When the source cannot be retrieved, a fingerprint derived from the
+        callable's name and code object is hashed instead.
+    """
+    if fn is None:
+        fn = task_cls.score_answer
+    try:
+        source = textwrap.dedent(inspect.getsource(fn))
+    except (OSError, TypeError):
+        # repr(fn) would embed a memory address and make the hash differ
+        # between runs, so fall back to a reproducible description.
+        source = _stable_callable_fingerprint(fn)
+    return hashlib.sha1(source.encode()).hexdigest()[:12]
+
+
+def _stable_callable_fingerprint(fn):
+    """Describe ``fn`` without memory addresses, for use when source is unavailable."""
+    target = getattr(fn, "__func__", fn)  # unwrap bound methods
+    fallback_name = type(target).__name__
+    identity = f"{getattr(target, '__module__', None)}.{getattr(target, '__qualname__', fallback_name)}"
+    code = getattr(target, "__code__", None)
+    if code is None:
+        return identity
+    # Only plain constants are included: nested code objects and sets have
+    # reprs that are not stable across runs.
+    simple_consts = [
+        const for const in code.co_consts
+        if isinstance(const, (str, bytes, int, float, type(None)))
+    ]
+    # Bytecode is only comparable within one interpreter version.
+    return f"{identity}:{code.co_code.hex()}:{code.co_names!r}:{code.co_varnames!r}:{simple_consts!r}"
+
+
+def _score_answer_sources(task_cls):
+    """Return the task's ``score_answer_history`` with every version key as a string."""
+    history = getattr(task_cls, "score_answer_history", {}) or {}
+    sources = {}
+    for version, spec in dict(history).items():
+        sources[str(version)] = spec
+    return sources
+
+
+def _default_history_path(task_cls):
+    """Return the POSIX path of the task's source file relative to the repository root.
+
+    Raises:
+        RuntimeError: If the source file cannot be located, or if it lies
+            outside the repository root.
+    """
+    located = inspect.getsourcefile(task_cls)
+    if located is None:
+        raise RuntimeError(f"Could not locate source file for {task_cls.__name__}")
+    source_path = Path(located).resolve()
+    try:
+        relative = source_path.relative_to(repo_root())
+    except ValueError as exc:
+        raise RuntimeError(
+            f"Task source {source_path} is outside the repository root; set an explicit history path."
+        ) from exc
+    return relative.as_posix()
+
+
+def _resolve_history_callable(task_cls, namespace, spec):
+    """Pick the historical scorer out of an executed module namespace.
+
+    Lookup order:
+      1. ``spec["callable"]``, taken directly from ``namespace``.
+      2. The method ``spec["method"]`` (default ``"score_answer"``) on the
+         class ``spec["class_name"]`` (default: the task class name).
+      3. A module-level name equal to the method name.
+
+    Raises:
+        KeyError: If none of the lookups succeeds.
+    """
+    options = dict(spec or {})
+    explicit_name = options.get("callable")
+    if explicit_name:
+        return namespace[explicit_name]
+
+    class_name = options.get("class_name", task_cls.__name__)
+    method_name = options.get("method", "score_answer")
+    candidate_cls = namespace.get(class_name)
+    if candidate_cls is not None and hasattr(candidate_cls, method_name):
+        return getattr(candidate_cls, method_name)
+    if method_name not in namespace:
+        raise KeyError(f"Could not resolve a historical scorer for {task_cls.task_name} from spec {options}")
+    return namespace[method_name]
+
+
+def _load_history_source(task_cls, version, spec):
+    """Fetch the source text of a historical scorer.
+
+    Args:
+        task_cls: Task class the history entry belongs to; used to derive the
+            default file path for commit-based specs.
+        version: History key, used only in the error message.
+        spec: Either a commit string, or a mapping with ``"file"`` (path to a
+            legacy file, relative paths resolved against the repository root)
+            or ``"commit"`` (optionally with ``"path"``).
+
+    Returns:
+        A ``(source, filename, spec)`` tuple where ``spec`` is the normalized
+        mapping form.
+
+    Raises:
+        KeyError: If the spec defines neither ``"file"`` nor ``"commit"``.
+    """
+    spec = {"commit": spec} if isinstance(spec, str) else dict(spec or {})
+    if "file" in spec:
+        source_path = Path(spec["file"])
+        if not source_path.is_absolute():
+            source_path = repo_root() / source_path
+        # Python source defaults to UTF-8; do not depend on the locale encoding.
+        return source_path.read_text(encoding="utf-8"), str(source_path), spec
+
+    commit = spec.get("commit")
+    if not commit:
+        raise KeyError(f"score_answer_history[{version!r}] must define either 'file' or 'commit'.")
+    history_path = spec.get("path") or _default_history_path(task_cls)
+    source = subprocess.check_output(
+        ["git", "show", f"{commit}:{history_path}"],
+        cwd=repo_root(),
+        text=True,
+        encoding="utf-8",
+    )
+    return source, f"{commit}:{history_path}", spec
+
+
+def _load_historical_score_answer(task_cls, version, spec):
+    """Load, execute and cache the scorer registered for a historical version.
+
+    The historical source is executed in a fresh namespace and the scorer is
+    then located with ``_resolve_history_callable``. Results are memoized in
+    ``_SCORE_ANSWER_CACHE``.
+
+    The source is run with ``exec``; it must come from trusted repository
+    content only.
+    NOTE(review): executing a whole historical task module re-runs its
+    top-level code, including any subclass registration hooks -- confirm this
+    is acceptable.
+    """
+    cache_key = (task_cls.__module__, task_cls.__name__, str(version), repr(spec))
+    if cache_key in _SCORE_ANSWER_CACHE:
+        return _SCORE_ANSWER_CACHE[cache_key]
+
+    source, filename, spec = _load_history_source(task_cls, version, spec)
+    if "file" not in spec:
+        # Commit-based sources have no file on disk. Registering the text in
+        # linecache lets inspect.getsource() (and therefore score_answer_hash)
+        # and tracebacks see the historical code. mtime=None keeps
+        # linecache.checkcache() from evicting the entry.
+        linecache.cache[filename] = (len(source), None, source.splitlines(keepends=True), filename)
+
+    # Give the executed code the identity of the live task module so that
+    # relative imports inside the historical source can be resolved.
+    live_module = sys.modules.get(task_cls.__module__)
+    package = getattr(live_module, "__package__", None)
+    if package is None:
+        package = task_cls.__module__.rpartition(".")[0]
+    namespace = {
+        "__builtins__": __builtins__,
+        "__name__": task_cls.__module__,
+        "__package__": package,
+    }
+    exec(compile(source, filename, "exec"), namespace)
+    fn = _resolve_history_callable(task_cls, namespace, spec)
+    _SCORE_ANSWER_CACHE[cache_key] = fn
+    return fn
+
+
+def resolve_score_answer_fn(task_cls, entry=None):
+    """Return the scorer that matches the version or hash recorded in ``entry``.
+
+    The requested version is the first value present among
+    ``metadata["_score_answer_version"]``, ``metadata["score_answer_version"]``
+    and ``metadata["_score_answer"]["version"]``. A value of ``0`` counts as
+    present; only ``None`` and ``""`` are treated as missing.
+
+    Args:
+        task_cls: Task class providing ``score_answer``,
+            ``score_answer_version`` and ``score_answer_history``.
+        entry: Object with a ``metadata`` attribute, a mapping with a
+            ``"metadata"`` key, or ``None``.
+
+    Returns:
+        The current ``task_cls.score_answer`` or a historical scorer.
+
+    Raises:
+        KeyError: If the entry requests a version that is neither current nor
+            registered in ``score_answer_history``, and its recorded hash does
+            not match the current scorer.
+    """
+    metadata = _entry_metadata(entry)
+    score_meta = metadata.get("_score_answer") or {}
+    requested_version = _first_present(
+        metadata.get("_score_answer_version"),
+        metadata.get("score_answer_version"),
+        score_meta.get("version"),
+    )
+    requested_hash = metadata.get("_score_answer_hash") or score_meta.get("hash")
+
+    if requested_version is None:
+        # No version recorded: fall back to matching by hash, if one exists.
+        # The current hash is computed only here and below because hashing
+        # reads source code and the common path does not need it.
+        if requested_hash and requested_hash != score_answer_hash(task_cls, task_cls.score_answer):
+            for version, spec in _score_answer_sources(task_cls).items():
+                fn = _load_historical_score_answer(task_cls, version, spec)
+                if score_answer_hash(task_cls, fn) == requested_hash:
+                    return fn
+        return task_cls.score_answer
+
+    requested_version = str(requested_version)
+    if requested_version == str(getattr(task_cls, "score_answer_version", 0)):
+        return task_cls.score_answer
+
+    spec = _score_answer_sources(task_cls).get(requested_version)
+    if spec is None:
+        # A version bump without a scorer change still resolves by hash.
+        if requested_hash == score_answer_hash(task_cls, task_cls.score_answer):
+            return task_cls.score_answer
+        raise KeyError(
+            f"Unknown score_answer version {requested_version!r} for task {task_cls.task_name}. "
+            f"Set score_answer_history[{requested_version!r}] to a legacy file or commit."
+        )
+    return _load_historical_score_answer(task_cls, requested_version, spec)
+
+
+def _entry_metadata(entry):
+    """Return the metadata mapping of ``entry`` (attribute or key), or ``{}``."""
+    if entry is None:
+        return {}
+    metadata = getattr(entry, "metadata", None)
+    if not metadata:
+        # Mapping-style entries expose metadata as a key. Objects without
+        # ``.get`` simply have no metadata.
+        getter = getattr(entry, "get", None)
+        if callable(getter):
+            metadata = getter("metadata", None)
+    return metadata or {}
+
+
+def _first_present(*values):
+    """Return the first value that is neither ``None`` nor ``""`` (``0`` is kept)."""
+    for value in values:
+        if value is not None and value != "":
+            return value
+    return None
diff --git a/reasoning_core/template.py b/reasoning_core/template.py
@@ -16,6 +16,7 @@
 import signal
 from contextlib import contextmanager
 from inflection import underscore
+from .score_answer_history import current_git_commit, resolve_score_answer_fn, score_answer_hash
 import tiktoken
 import psutil 
 
@@ -142,6 +143,9 @@ def prepr_task_name(name):
 
 
 class Task(ProceduralDataset):
+    score_answer_version = 0
+    score_answer_history = {}
+
     def __init_subclass__(cls):
         cls.task_name = getattr(cls, 'task_name', prepr_task_name(cls.__name__))
         register_dataset(cls.task_name, cls)
@@ -159,6 +163,18 @@ def __init__(self, config=dict(), timeout=10, seed=None, _level=0, *a, **kwa):
         self.balancing_key_ratio = 0.5
         self.tokenizer = tiktoken.get_encoding("o200k_base")
 
+    @classmethod
+    def score_answer_hash(cls, fn=None):
+        """Return the scorer hash for ``fn``, or for this class's ``score_answer`` when omitted."""
+        return score_answer_hash(task_cls=cls, fn=fn)
+
+    @classmethod
+    def resolve_score_answer_fn(cls, entry=None):
+        """Return the scorer function that applies to ``entry`` for this task class."""
+        return resolve_score_answer_fn(task_cls=cls, entry=entry)
+
+    @classmethod
+    def score_answer_for_entry(cls, answer, entry, scorer_self):
+        """Score ``answer`` with the scorer resolved for ``entry``.
+
+        ``scorer_self`` is passed as the scorer's ``self`` argument.
+        """
+        scorer = cls.resolve_score_answer_fn(entry)
+        return scorer(scorer_self, answer, entry)
+
     def generate(self):
         """To override, return one problem"""
         #return Problem(metadata=edict(), answer="")
@@ -273,6 +289,11 @@ def inner():
                 problem.metadata['_config'] = self.config.to_dict()
                 problem.metadata['_prompt_tokens'] = prompt_tokens
                 problem.metadata['_cot_tokens'] = cot_tokens
+                problem.metadata['_score_answer'] = edict({
+                    'version': self.score_answer_version,
+                    'hash': self.score_answer_hash(self.score_answer),
+                    'commit': current_git_commit(),
+                })
 
                 problem.balancing_key = self.balancing_key(problem)
                 problem.deduplication_key = self.deduplication_key(problem)

diff --git a/tests/test_score_answer_versioning.py b/tests/test_score_answer_versioning.py
@@ -0,0 +1,106 @@
+import pytest
+import tiktoken
+
+import reasoning_core
+import reasoning_core.template as template
+from reasoning_core import score_answer
+from reasoning_core.template import Config, Problem, Task
+
+
+class _DummyTokenizer:
+    """Stand-in tokenizer that produces one token per character."""
+
+    def encode(self, text):
+        return [char for char in text]
+
+
+@pytest.fixture(autouse=True)
+def _patch_tokenizer(monkeypatch):
+    """Replace ``tiktoken.get_encoding`` with a factory for ``_DummyTokenizer`` in every test."""
+
+    def _fake_get_encoding(_):
+        return _DummyTokenizer()
+
+    monkeypatch.setattr(tiktoken, "get_encoding", _fake_get_encoding)
+
+
+class DefaultVersionTask(Task):
+    """Task that keeps the inherited scorer version; scores by exact match after stripping."""
+
+    task_name = "default_version_task"
+
+    def __init__(self):
+        super().__init__(config=Config())
+
+    def generate(self):
+        metadata = {"instance": "Return Alpha Beta."}
+        return Problem(metadata=metadata, answer="Alpha Beta")
+
+    def prompt(self, metadata):
+        return metadata["instance"]
+
+    def score_answer(self, answer, entry):
+        given = str(answer).strip()
+        expected = str(entry.answer).strip()
+        return float(given == expected)
+
+
+class SparseVersionTask(Task):
+    """Task at scorer version 1 with no history registered by default.
+
+    The scorer ignores all whitespace and letter case.
+    """
+
+    task_name = "sparse_version_task"
+    score_answer_version = 1
+    score_answer_history = {}
+
+    def __init__(self):
+        super().__init__(config=Config())
+
+    def generate(self):
+        metadata = {"instance": "Normalize spacing."}
+        return Problem(metadata=metadata, answer="Alpha Beta")
+
+    def prompt(self, metadata):
+        return metadata["instance"]
+
+    def score_answer(self, answer, entry):
+        def normalize(text):
+            return "".join(str(text).split()).lower()
+
+        return float(normalize(answer) == normalize(entry.answer))
+
+
+def _register_test_task(task_cls):
+    """Register ``task_cls`` in the global dataset and scorer tables under its task name."""
+
+    def _scorer(answer, entry):
+        return task_cls.score_answer_for_entry(answer, entry, object())
+
+    reasoning_core.DATASETS[task_cls.task_name] = task_cls
+    reasoning_core.scorers[task_cls.task_name] = _scorer
+
+
+def test_generated_examples_record_default_version_and_commit(monkeypatch):
+    """Generated examples carry the scorer version, hash and (patched) commit."""
+    monkeypatch.setattr(template, "current_git_commit", lambda: "deadbeef")
+    example = DefaultVersionTask().generate_example()
+    recorded = example.metadata["_score_answer"]
+
+    expected_hash = DefaultVersionTask.score_answer_hash(DefaultVersionTask.score_answer)
+    assert recorded == {
+        "version": 0,
+        "hash": expected_hash,
+        "commit": "deadbeef",
+    }
+
+
+def test_global_score_answer_loads_legacy_scorer_from_file(tmp_path, monkeypatch):
+    """An entry recorded at version 0 is scored with the legacy file, not the current scorer."""
+    legacy_path = tmp_path / "legacy_sparse_version_task.py"
+    legacy_path.write_text(
+        "def score_answer(self, answer, entry):\n"
+        "    return float(str(answer).strip().lower() == str(entry.answer).strip().lower())\n",
+        encoding="utf-8",
+    )
+    # monkeypatch restores the class attribute afterwards, so the legacy
+    # history does not leak into other tests.
+    monkeypatch.setattr(SparseVersionTask, "score_answer_history", {0: {"file": str(legacy_path)}})
+    _register_test_task(SparseVersionTask)
+
+    entry = Problem(
+        metadata={
+            "_task": "sparse_version_task",
+            "_score_answer": {"version": 0},
+        },
+        answer="Alpha Beta",
+    )
+
+    # The legacy scorer keeps inner whitespace significant; the current one would accept this.
+    assert score_answer("alphabeta", entry) == 0.0
+    assert score_answer("alpha beta", entry) == 1.0
+
+
+def test_unknown_legacy_version_requires_sparse_history_registration(monkeypatch):
+    """Scoring an entry from an unregistered old version raises a ``KeyError``."""
+    # monkeypatch restores the class attribute afterwards, keeping tests order-independent.
+    monkeypatch.setattr(SparseVersionTask, "score_answer_history", {})
+    _register_test_task(SparseVersionTask)
+
+    entry = Problem(
+        metadata={
+            "_task": "sparse_version_task",
+            "_score_answer": {"version": 0},
+        },
+        answer="Alpha Beta",
+    )
+
+    with pytest.raises(KeyError, match="score_answer_history"):
+        score_answer("alpha beta", entry)