Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions TASK_AUTHORING_GUIDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@ Every task should provide:

`Task.generate_example(...)` automatically adds metadata:
- `_task`, `_level`, `_config`, `_time`, `_prompt_tokens`, `_cot_tokens`.
- `_score_answer = {"version": ..., "hash": ..., "commit": ...}`.

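If you break scorer compatibility, bump `score_answer_version` (default `0`). If examples generated under the old version must remain scorable, also point `score_answer_history[old_version]` at a legacy file or commit; without that entry, scoring such an example raises a `KeyError` (unless its recorded hash still matches the current scorer).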

## Config and Difficulty Scaling
Base `Config` protected fields:
Expand Down
2 changes: 1 addition & 1 deletion reasoning_core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ def __getattribute__(self,_): raise RuntimeError("score_answer should not use se


scorers = {
k: lambda answer, entry, task_name=k: DATASETS[task_name].score_answer(SelfMock(), answer, entry)
k: lambda answer, entry, task_name=k: DATASETS[task_name].score_answer_for_entry(answer, entry, SelfMock())
for k in DATASETS.keys()
}

Expand Down
131 changes: 131 additions & 0 deletions reasoning_core/score_answer_history.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
import hashlib
import inspect
import subprocess
import textwrap
from pathlib import Path


# Process-wide memo of historical scorers that have already been loaded.
_SCORE_ANSWER_CACHE: dict = {}
# Parent of the directory containing this module; used as the working
# directory for git commands and as the base for relative history paths.
_REPO_ROOT: Path = Path(__file__).resolve().parents[1]


def repo_root() -> Path:
    """Return the directory one level above this module's package directory."""
    return _REPO_ROOT


# Sentinel distinguishing "not looked up yet" from a cached ``None`` result.
_GIT_COMMIT_UNSET = object()
_git_commit = _GIT_COMMIT_UNSET


def current_git_commit():
    """Return the ``HEAD`` commit hash of the repository, or ``None``.

    The lookup is best-effort: if ``git`` is missing, the directory is not a
    repository, or the command fails, ``None`` is returned instead of raising.

    The result (including a ``None`` failure) is memoised for the lifetime of
    the process so that callers stamping every generated example do not spawn
    one ``git`` subprocess per example.
    """
    global _git_commit
    if _git_commit is _GIT_COMMIT_UNSET:
        try:
            _git_commit = subprocess.check_output(
                ["git", "rev-parse", "HEAD"],
                cwd=repo_root(),
                text=True,
                stderr=subprocess.DEVNULL,
            ).strip()
        except (OSError, subprocess.SubprocessError):
            # OSError: git executable or working directory unavailable.
            # SubprocessError: git ran but exited non-zero (e.g. not a repo).
            _git_commit = None
    return _git_commit


def _sourceless_fingerprint(fn):
    """Describe *fn* without its source text and without memory addresses.

    Used only when ``inspect.getsource`` cannot find the source. The text is
    built from the module name, qualified name and (when a code object exists)
    the raw bytecode and referenced names, so it is identical across processes
    running the same interpreter. It is coarser than a source hash.
    """
    parts = [
        str(getattr(fn, "__module__", None)),
        getattr(fn, "__qualname__", None) or type(fn).__qualname__,
    ]
    code = getattr(fn, "__code__", None)
    if code is not None:
        parts.append(code.co_code.hex())
        parts.append(",".join(code.co_names))
    return "\n".join(parts)


def score_answer_hash(task_cls, fn=None):
    """Return a 12-character hex fingerprint of a scorer implementation.

    Args:
        task_cls: Task class whose ``score_answer`` is hashed when *fn* is
            not given.
        fn: Optional callable to hash instead of ``task_cls.score_answer``.

    Returns:
        The first 12 hex digits of the SHA-1 of the callable's dedented
        source. When the source is unavailable, a fingerprint built from the
        callable's name and bytecode is hashed instead; unlike ``repr(fn)``
        it contains no memory address, so the value is reproducible.
    """
    if fn is None:
        fn = task_cls.score_answer
    try:
        fingerprint = textwrap.dedent(inspect.getsource(fn))
    except (OSError, TypeError):
        fingerprint = _sourceless_fingerprint(fn)
    return hashlib.sha1(fingerprint.encode()).hexdigest()[:12]


def _score_answer_sources(task_cls):
    """Return ``task_cls.score_answer_history`` with every version key as a string.

    A missing, ``None`` or empty history yields an empty dict.
    """
    history = getattr(task_cls, "score_answer_history", None)
    if not history:
        return {}
    normalized = {}
    for version, spec in dict(history).items():
        normalized[str(version)] = spec
    return normalized


def _default_history_path(task_cls):
    """Return the POSIX path of *task_cls*'s source file relative to ``repo_root()``.

    Raises:
        RuntimeError: If the source file cannot be located, or if it lies
            outside the repository root.
    """
    located = inspect.getsourcefile(task_cls)
    if located is None:
        raise RuntimeError(f"Could not locate source file for {task_cls.__name__}")

    absolute = Path(located).resolve()
    try:
        relative = absolute.relative_to(repo_root())
    except ValueError as exc:
        raise RuntimeError(
            f"Task source {absolute} is outside the repository root; set an explicit history path."
        ) from exc
    return relative.as_posix()


def _resolve_history_callable(task_cls, namespace, spec):
    """Pick the historical scorer out of an executed legacy *namespace*.

    Lookup order:
      1. ``spec["callable"]`` — a top-level name in *namespace*.
      2. ``spec["method"]`` (default ``"score_answer"``) on the class named
         ``spec["class_name"]`` (default ``task_cls.__name__``).
      3. ``spec["method"]`` as a top-level name in *namespace*.

    Args:
        task_cls: Current task class; supplies the default class name and the
            label used in error messages.
        namespace: Globals dict produced by executing the legacy source.
        spec: History spec mapping, or ``None`` for all defaults.

    Raises:
        KeyError: If no matching callable is found.
    """
    spec = dict(spec or {})
    # Fall back to the class name so building the error message cannot fail.
    task_label = getattr(task_cls, "task_name", task_cls.__name__)

    callable_name = spec.get("callable")
    if callable_name:
        try:
            return namespace[callable_name]
        except KeyError:
            # Same exception type as before, but say which task and spec failed.
            raise KeyError(
                f"Historical scorer source for {task_label} does not define "
                f"{callable_name!r} (spec {spec})"
            ) from None

    class_name = spec.get("class_name", task_cls.__name__)
    method_name = spec.get("method", "score_answer")
    history_cls = namespace.get(class_name)
    if history_cls is not None and hasattr(history_cls, method_name):
        return getattr(history_cls, method_name)
    if method_name in namespace:
        return namespace[method_name]
    raise KeyError(f"Could not resolve a historical scorer for {task_label} from spec {spec}")


def _load_history_source(task_cls, version, spec):
    """Fetch the source text of a historical scorer.

    Args:
        task_cls: Task class; used to derive the default path for commit specs.
        version: Version key, used only in the error message.
        spec: Either a commit string, or a mapping with ``"file"`` (a path,
            resolved against ``repo_root()`` when relative) or ``"commit"``
            (plus an optional ``"path"`` inside that commit).

    Returns:
        ``(source, filename, spec)`` where *filename* labels the source for
        ``compile`` and *spec* is the normalised mapping.

    Raises:
        KeyError: If the spec names neither a file nor a commit.
        OSError, subprocess.CalledProcessError: Propagated from reading the
            file or from ``git show``.
    """
    spec = {"commit": spec} if isinstance(spec, str) else dict(spec or {})
    if "file" in spec:
        source_path = Path(spec["file"])
        if not source_path.is_absolute():
            source_path = repo_root() / source_path
        # Python source defaults to UTF-8; do not depend on the locale encoding.
        return source_path.read_text(encoding="utf-8"), str(source_path), spec

    commit = spec.get("commit")
    if not commit:
        raise KeyError(f"score_answer_history[{version!r}] must define either 'file' or 'commit'.")
    history_path = spec.get("path") or _default_history_path(task_cls)
    source = subprocess.check_output(
        ["git", "show", f"{commit}:{history_path}"],
        cwd=repo_root(),
        text=True,
        encoding="utf-8",  # decode git's output as UTF-8, not the locale default
    )
    return source, f"{commit}:{history_path}", spec


def _load_historical_score_answer(task_cls, version, spec):
    """Load, execute and memoise the historical scorer described by *spec*.

    The legacy source is fetched with ``_load_history_source``, executed in a
    fresh namespace, and the scorer is picked out of that namespace with
    ``_resolve_history_callable``. Results are cached in
    ``_SCORE_ANSWER_CACHE`` under ``(module, class name, version, repr(spec))``.

    SECURITY: the source is run with ``exec``. ``score_answer_history`` must
    only reference trusted files or commits of this repository.

    NOTE(review): executing a whole task module re-runs its class
    definitions; if the task base class registers subclasses when they are
    defined, the historical class may replace the live registration — confirm.
    """
    import linecache
    import sys

    cache_key = (task_cls.__module__, task_cls.__name__, str(version), repr(spec))
    if cache_key in _SCORE_ANSWER_CACHE:
        return _SCORE_ANSWER_CACHE[cache_key]

    source, filename, spec = _load_history_source(task_cls, version, spec)

    # Give the legacy code the identity of the task's own module so relative
    # imports (``from .x import y``) resolve; without ``__name__`` and
    # ``__package__`` in globals they cannot be resolved.
    module_name = task_cls.__module__
    package = getattr(sys.modules.get(module_name), "__package__", None)
    if package is None:
        package = module_name.rpartition(".")[0]
    namespace = {
        "__builtins__": __builtins__,
        "__name__": module_name,
        "__package__": package,
    }

    if "file" not in spec:
        # Commit-sourced code has no file on disk; register the text so
        # inspect.getsource (used for scorer hashing) can find it. A ``None``
        # mtime makes linecache.checkcache leave the entry alone.
        linecache.cache[filename] = (len(source), None, source.splitlines(True), filename)

    exec(compile(source, filename, "exec"), namespace)
    fn = _resolve_history_callable(task_cls, namespace, spec)
    _SCORE_ANSWER_CACHE[cache_key] = fn
    return fn


def _entry_metadata(entry):
    """Return the metadata mapping of *entry* (attribute or key), or ``{}``."""
    if entry is None:
        return {}
    metadata = getattr(entry, "metadata", None)
    # Only fall back to mapping access when the entry actually supports it;
    # an object with empty ``metadata`` and no ``.get`` must not crash.
    if not metadata and callable(getattr(entry, "get", None)):
        metadata = entry.get("metadata")
    return metadata or {}


def _first_present(*candidates):
    """Return the first candidate that is neither ``None`` nor ``""``.

    Unlike an ``or`` chain this keeps falsy-but-meaningful values such as the
    default scorer version ``0``.
    """
    for candidate in candidates:
        if candidate is not None and candidate != "":
            return candidate
    return None


def resolve_score_answer_fn(task_cls, entry=None):
    """Return the scorer that matches the version recorded in *entry*.

    The requested version is read from ``metadata["_score_answer_version"]``,
    ``metadata["score_answer_version"]`` or ``metadata["_score_answer"]["version"]``
    (first one present); the requested hash from ``metadata["_score_answer_hash"]``
    or ``metadata["_score_answer"]["hash"]``.

    * No version recorded: if a hash is recorded and differs from the current
      scorer's hash, the first history entry with that hash is returned;
      otherwise the current scorer is returned.
    * Version equals ``task_cls.score_answer_version``: current scorer.
    * Version found in ``score_answer_history``: that historical scorer.
    * Unknown version: current scorer if the recorded hash matches it.

    Args:
        task_cls: Task class providing ``score_answer`` and version info.
        entry: Object with a ``metadata`` attribute, mapping with a
            ``"metadata"`` key, or ``None``.

    Raises:
        KeyError: If the entry requests an unknown version whose hash does not
            match the current scorer.
    """
    metadata = _entry_metadata(entry)
    score_meta = metadata.get("_score_answer") or {}
    requested_version = _first_present(
        metadata.get("_score_answer_version"),
        metadata.get("score_answer_version"),
        score_meta.get("version"),
    )
    requested_hash = metadata.get("_score_answer_hash") or score_meta.get("hash")
    current_scorer = task_cls.score_answer

    # The source hash is computed only on the branches that need it, so the
    # common "same version" path does not re-read and re-hash source per call.
    if requested_version is None:
        if requested_hash and requested_hash != score_answer_hash(task_cls, current_scorer):
            for version, spec in _score_answer_sources(task_cls).items():
                fn = _load_historical_score_answer(task_cls, version, spec)
                if score_answer_hash(task_cls, fn) == requested_hash:
                    return fn
        return current_scorer

    requested_version = str(requested_version)
    if requested_version == str(getattr(task_cls, "score_answer_version", 0)):
        return current_scorer

    spec = _score_answer_sources(task_cls).get(requested_version)
    if spec is not None:
        return _load_historical_score_answer(task_cls, requested_version, spec)
    if requested_hash == score_answer_hash(task_cls, current_scorer):
        return current_scorer
    raise KeyError(
        f"Unknown score_answer version {requested_version!r} for task {task_cls.task_name}. "
        f"Set score_answer_history[{requested_version!r}] to a legacy file or commit."
    )
21 changes: 21 additions & 0 deletions reasoning_core/template.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import signal
from contextlib import contextmanager
from inflection import underscore
from .score_answer_history import current_git_commit, resolve_score_answer_fn, score_answer_hash
import tiktoken
import psutil

Expand Down Expand Up @@ -142,6 +143,9 @@ def prepr_task_name(name):


class Task(ProceduralDataset):
score_answer_version = 0
score_answer_history = {}

def __init_subclass__(cls):
cls.task_name = getattr(cls, 'task_name', prepr_task_name(cls.__name__))
register_dataset(cls.task_name, cls)
Expand All @@ -159,6 +163,18 @@ def __init__(self, config=dict(), timeout=10, seed=None, _level=0, *a, **kwa):
self.balancing_key_ratio = 0.5
self.tokenizer = tiktoken.get_encoding("o200k_base")

@classmethod
def score_answer_hash(cls, fn=None):
    """Return the module-level ``score_answer_hash`` fingerprint for this class.

    *fn* is forwarded unchanged.
    """
    digest = score_answer_hash(cls, fn)
    return digest

@classmethod
def resolve_score_answer_fn(cls, entry=None):
    """Delegate to the module-level ``resolve_score_answer_fn`` for this class and *entry*."""
    scorer = resolve_score_answer_fn(cls, entry)
    return scorer

@classmethod
def score_answer_for_entry(cls, answer, entry, scorer_self):
    """Score *answer* against *entry* with the scorer resolved for that entry.

    The resolved scorer is called as an unbound function, with *scorer_self*
    passed in the ``self`` position.
    """
    scorer = cls.resolve_score_answer_fn(entry)
    return scorer(scorer_self, answer, entry)

def generate(self):
"""To override, return one problem"""
#return Problem(metadata=edict(), answer="")
Expand Down Expand Up @@ -273,6 +289,11 @@ def inner():
problem.metadata['_config'] = self.config.to_dict()
problem.metadata['_prompt_tokens'] = prompt_tokens
problem.metadata['_cot_tokens'] = cot_tokens
problem.metadata['_score_answer'] = edict({
'version': self.score_answer_version,
'hash': self.score_answer_hash(self.score_answer),
'commit': current_git_commit(),
})

problem.balancing_key = self.balancing_key(problem)
problem.deduplication_key = self.deduplication_key(problem)
Expand Down
106 changes: 106 additions & 0 deletions tests/test_score_answer_versioning.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
import pytest
import tiktoken

import reasoning_core
import reasoning_core.template as template
from reasoning_core import score_answer
from reasoning_core.template import Config, Problem, Task


class _DummyTokenizer:
    """Stand-in tokenizer that produces one token per character."""

    def encode(self, text):
        """Return the items of *text* as a list."""
        return [char for char in text]


@pytest.fixture(autouse=True)
def _patch_tokenizer(monkeypatch):
    """Replace ``tiktoken.get_encoding`` with a factory for ``_DummyTokenizer`` in every test."""

    def fake_get_encoding(_):
        return _DummyTokenizer()

    monkeypatch.setattr(tiktoken, "get_encoding", fake_get_encoding)


class DefaultVersionTask(Task):
    """Minimal task that keeps the inherited scorer version and history."""

    task_name = "default_version_task"

    def __init__(self):
        super().__init__(config=Config())

    def generate(self):
        """Return one fixed problem."""
        metadata = {"instance": "Return Alpha Beta."}
        return Problem(metadata=metadata, answer="Alpha Beta")

    def prompt(self, metadata):
        """Use the stored instance text as the prompt."""
        return metadata["instance"]

    def score_answer(self, answer, entry):
        """Return 1.0 on an exact match after stripping outer whitespace, else 0.0."""
        given = str(answer).strip()
        expected = str(entry.answer).strip()
        return float(given == expected)


class SparseVersionTask(Task):
    """Task at scorer version 1 with no history registered by default."""

    task_name = "sparse_version_task"
    score_answer_version = 1
    score_answer_history = {}

    def __init__(self):
        super().__init__(config=Config())

    def generate(self):
        """Return one fixed problem."""
        metadata = {"instance": "Normalize spacing."}
        return Problem(metadata=metadata, answer="Alpha Beta")

    def prompt(self, metadata):
        """Use the stored instance text as the prompt."""
        return metadata["instance"]

    def score_answer(self, answer, entry):
        """Return 1.0 when both sides match after removing all whitespace and lowercasing."""

        def normalize(text):
            return "".join(str(text).split()).lower()

        return float(normalize(answer) == normalize(entry.answer))


def _register_test_task(task_cls):
    """Register *task_cls* in ``reasoning_core.DATASETS`` and ``reasoning_core.scorers``.

    The registered scorer goes through ``score_answer_for_entry`` with a
    fresh ``object()`` as the scorer's ``self``.
    """

    def scorer(answer, entry):
        return task_cls.score_answer_for_entry(answer, entry, object())

    reasoning_core.DATASETS[task_cls.task_name] = task_cls
    reasoning_core.scorers[task_cls.task_name] = scorer


def test_generated_examples_record_default_version_and_commit(monkeypatch):
    """Generated examples carry version 0, the scorer hash and the (patched) commit."""
    monkeypatch.setattr(template, "current_git_commit", lambda: "deadbeef")

    example = DefaultVersionTask().generate_example()

    expected = {
        "version": 0,
        "hash": DefaultVersionTask.score_answer_hash(DefaultVersionTask.score_answer),
        "commit": "deadbeef",
    }
    assert example.metadata["_score_answer"] == expected


def test_global_score_answer_loads_legacy_scorer_from_file(tmp_path, monkeypatch):
    """An entry stamped with version 0 is scored by the legacy file, not the current scorer.

    The legacy scorer only strips and lowercases, so ``"alphabeta"`` must not
    match ``"Alpha Beta"`` even though the current (version 1) scorer accepts it.
    """
    legacy_path = tmp_path / "legacy_sparse_version_task.py"
    legacy_path.write_text(
        "def score_answer(self, answer, entry):\n"
        " return float(str(answer).strip().lower() == str(entry.answer).strip().lower())\n",
        encoding="utf-8",
    )
    # monkeypatch restores the class attribute afterwards, so the class is not
    # left pointing at a tmp_path file that no longer exists.
    monkeypatch.setattr(SparseVersionTask, "score_answer_history", {0: {"file": str(legacy_path)}})
    _register_test_task(SparseVersionTask)

    entry = Problem(
        metadata={
            "_task": "sparse_version_task",
            "_score_answer": {"version": 0},
        },
        answer="Alpha Beta",
    )

    assert score_answer("alphabeta", entry) == 0.0
    assert score_answer("alpha beta", entry) == 1.0


def test_unknown_legacy_version_requires_sparse_history_registration():
    """Scoring an entry stamped with an unregistered version raises ``KeyError``."""
    # Plain assignment on purpose: this also clears any history a previous
    # test left on the class.
    SparseVersionTask.score_answer_history = {}
    _register_test_task(SparseVersionTask)

    metadata = {
        "_task": "sparse_version_task",
        "_score_answer": {"version": 0},
    }
    entry = Problem(metadata=metadata, answer="Alpha Beta")

    with pytest.raises(KeyError, match="score_answer_history"):
        score_answer("alpha beta", entry)