Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
255 changes: 255 additions & 0 deletions .github/workflows/validate-results-json.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,255 @@
# Checks the JSON files submitted under results/ when a pull request changes them.
name: Validate Results JSON

# Run only for pull requests that touch one of the two results directories.
on:
pull_request:
paths:
- "results/metrics/**"
- "results/pipeline_descriptions/**"

# The job only reads repository contents; no write scopes are requested.
permissions:
contents: read

jobs:
validate-json:
runs-on: ubuntu-latest

steps:
# fetch-depth: 0 requests the full history rather than a shallow clone.
# NOTE(review): presumably needed so that the pull request's base and head
# commits are both present for the `git diff` in the validation step.
- name: Checkout repository
uses: actions/checkout@v6.0.2
with:
fetch-depth: 0

# Interpreter for the inline `python -` validation script run by the next step.
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.11'

- name: Validate changed JSON files
env:
BASE_SHA: ${{ github.event.pull_request.base.sha }}
HEAD_SHA: ${{ github.event.pull_request.head.sha }}
run: |
set -euo pipefail

# Collect the results JSON files touched by this pull request.
#  * Three-dot range: diff from the merge base of base and head, so files that
#    changed only on the base branch after the PR forked are not picked up.
#  * core.quotePath=false: print non-ASCII file names verbatim instead of
#    C-style quoted; a quoted name would not exist on disk and the Python
#    script would silently skip it.
#  * --diff-filter=ACMRD keeps added, copied, modified, renamed and deleted
#    entries; deleted files are skipped later when they are not found on disk.
mapfile -t changed_json_files < <(
  git -c core.quotePath=false diff --name-only --diff-filter=ACMRD \
    "${BASE_SHA}...${HEAD_SHA}" \
    -- ':(glob)results/metrics/**/*.json' \
       ':(glob)results/pipeline_descriptions/**/*.json'
)

# Nothing to validate: succeed early without starting Python.
if [ "${#changed_json_files[@]}" -eq 0 ]; then
  echo "No changed JSON files to validate."
  exit 0
fi
Comment thread
QuentinJGMace marked this conversation as resolved.

# Hand the list to the Python script below: one path per line in changed_json_files.txt.
printf '%s\n' "${changed_json_files[@]}" > changed_json_files.txt

python - <<'PY'
import json
Comment thread
QuentinJGMace marked this conversation as resolved.
import sys
from pathlib import Path

# Paths written by the shell step, one per line; blank lines are ignored.
# Decode as UTF-8 explicitly so the result does not depend on the runner's
# locale (the JSON files themselves are opened as UTF-8 as well).
changed_lines = Path("changed_json_files.txt").read_text(encoding="utf-8").splitlines()
paths = [Path(name) for name in map(str.strip, changed_lines) if name]

# Human-readable validation failures; every check appends here and the script
# reports them all together before exiting.
errors = []

# Model names are taken only from paths that follow the expected layout
#   results/{metrics,pipeline_descriptions}/<model>/<file>.json
# i.e. exactly four path components, the third being the model folder.
model_names = {
    candidate.parts[2]
    for candidate in paths
    if candidate.suffix == ".json"
    and len(candidate.parts) == 4
    and candidate.parts[0] == "results"
    and candidate.parts[1] in {"metrics", "pipeline_descriptions"}
}

# A JSON file under results/metrics or results/pipeline_descriptions whose path
# does not have exactly four components is misplaced; report each one.
results_subdirs = {"metrics", "pipeline_descriptions"}
for candidate in paths:
    parts = candidate.parts
    if len(parts) < 2 or len(parts) == 4:
        continue
    if parts[0] != "results" or parts[1] not in results_subdirs:
        continue
    if candidate.suffix != ".json":
        continue
    errors.append(
        f"Unexpected JSON file location: {candidate}. Expected layout "
        "results/{metrics,pipeline_descriptions}/<model>/<file>.json."
    )

# File names that every model folder under results/metrics must contain.
# Each name has the form vidore_v3_<dataset>.json.
_required_dataset_suffixes = (
    "computer_science",
    "energy",
    "finance_en",
    "finance_fr",
    "hr",
    "industrial",
    "pharmaceuticals",
    "physics",
)
required_metrics_file_names_for_submission = {
    f"vidore_v3_{dataset}.json" for dataset in _required_dataset_suffixes
}

# Changed files that sit at results/metrics/<model>/<file>.json.
metrics_paths = [
    candidate
    for candidate in paths
    if candidate.suffix == ".json"
    and len(candidate.parts) == 4
    and candidate.parts[:2] == ("results", "metrics")
]
Comment thread
QuentinJGMace marked this conversation as resolved.
# Changed files that sit at results/pipeline_descriptions/<model>/<file>.json.
description_paths = [
    candidate
    for candidate in paths
    if candidate.suffix == ".json"
    and len(candidate.parts) == 4
    and candidate.parts[:2] == ("results", "pipeline_descriptions")
]

# Root-level keys every pipeline description JSON must provide.
required_description_keys = {
    # identification
    "pipeline_name", "pipeline_alias", "description",
    # implementation location
    "class_name", "module_path",
    # pipeline graph
    "nodes", "edges",
}

# Keys required at the root of a metrics JSON file.
required_top_level_keys = {
    "split",
    "language",
    "pipeline_args",
    "aggregated_metrics",
}

# Sections required inside "aggregated_metrics".
required_aggregated_metrics_keys = {
    "overall",
    "by_language",
    "timing",
}

# Metric names required in "aggregated_metrics.overall" and in
# "aggregated_metrics.by_language.english", grouped by family.
required_overall_metric_keys = {
    "map", "recip_rank", "num_queries",
    "P_1", "P_5", "P_10", "P_20",
    "recall_1", "recall_5", "recall_10", "recall_20", "recall_50", "recall_100",
    "ndcg_cut_1", "ndcg_cut_5", "ndcg_cut_10", "ndcg_cut_20", "ndcg_cut_100",
    "map_cut_1", "map_cut_10", "map_cut_100",
}

# Fields required in "aggregated_metrics.timing".
required_timing_keys = {
    "total_retrieval_time_milliseconds",
    "indexing_time_milliseconds",
    "search_time_milliseconds",
    "indexing_throughput_ms_per_doc",
    "search_throughput_ms_per_query",
    "num_queries",
    "num_corpus",
}

def add_not_all_required_files_errors(folder_path, actual_files, required_files):
    """Record one error naming the required files absent from a model folder.

    Args:
        folder_path: Folder being checked; used as the prefix of the message.
        actual_files: Iterable of file names found in that folder.
        required_files: Set of file names that must be present.

    Appends to the module-level ``errors`` list; does nothing when every
    required file is present.
    """
    absent = required_files.difference(actual_files)
    if not absent:
        return
    errors.append(f"{folder_path}: missing files: {', '.join(sorted(absent))}")

def validate_required_files_for_submission(metrics_folder_path):
    """Check that a model's metrics folder contains every required metrics file.

    Lists the ``*.json`` files directly inside *metrics_folder_path* and
    delegates the comparison (and error recording) to
    ``add_not_all_required_files_errors``.
    """
    present_names = {entry.name for entry in Path(metrics_folder_path).glob("*.json")}
    add_not_all_required_files_errors(
        metrics_folder_path,
        present_names,
        required_metrics_file_names_for_submission,
    )

def add_missing_key_errors(file_path, actual_keys, required_keys, section_name):
    """Record one error naming the required keys absent from a JSON section.

    Args:
        file_path: File being validated; used as the prefix of the message.
        actual_keys: Iterable of keys present in the section.
        required_keys: Set of keys the section must contain.
        section_name: Label of the section, e.g. ``"root"``.

    Appends to the module-level ``errors`` list; does nothing when no key is
    missing.
    """
    absent = required_keys.difference(actual_keys)
    if not absent:
        return
    errors.append(f"{file_path}: missing keys in {section_name}: {', '.join(sorted(absent))}")

def validate_description_schema(file_path, payload):
    """Validate the root of a pipeline description JSON document.

    Args:
        file_path: File being validated; used as the prefix of error messages.
        payload: Parsed JSON content of the file.

    Appends to the module-level ``errors`` list when the root is not a JSON
    object or when any key in ``required_description_keys`` is missing.
    """
    # Valid JSON may have a list, string or number at the root; report that
    # instead of failing on ``.keys()`` with an AttributeError.
    if not isinstance(payload, dict):
        errors.append(f"{file_path}: root must be an object")
        return
    add_missing_key_errors(file_path, payload.keys(), required_description_keys, "root")

def _validate_object_section(file_path, value, dotted_name, required_keys):
    """Check that *value* is a JSON object containing every key in *required_keys*.

    Appends to the module-level ``errors`` list. Returns ``True`` when *value*
    is a dict (even if some keys are missing) so the caller knows whether it
    can look inside the section, ``False`` otherwise.
    """
    if not isinstance(value, dict):
        errors.append(f"{file_path}: key '{dotted_name}' must be an object")
        return False
    add_missing_key_errors(file_path, value.keys(), required_keys, dotted_name)
    return True


def validate_metrics_schema(file_path, payload):
    """Validate the structure of a metrics JSON document.

    Checks, in order: the root keys, then ``aggregated_metrics`` and its
    ``overall``, ``by_language.english`` and ``timing`` sections. Each section
    must be a JSON object containing its required keys.

    Args:
        file_path: File being validated; used as the prefix of error messages.
        payload: Parsed JSON content of the file.

    Every problem found is appended to the module-level ``errors`` list;
    nothing is returned or raised.
    """
    # Valid JSON may have a list, string or number at the root; report that
    # instead of failing on ``.keys()`` with an AttributeError.
    if not isinstance(payload, dict):
        errors.append(f"{file_path}: root must be an object")
        return

    add_missing_key_errors(file_path, payload.keys(), required_top_level_keys, "root")

    aggregated = payload.get("aggregated_metrics")
    if not _validate_object_section(
        file_path,
        aggregated,
        "aggregated_metrics",
        required_aggregated_metrics_keys,
    ):
        # Without an aggregated_metrics object there is nothing further to inspect.
        return

    _validate_object_section(
        file_path,
        aggregated.get("overall"),
        "aggregated_metrics.overall",
        required_overall_metric_keys,
    )

    # by_language has no required keys of its own beyond the "english" entry,
    # which must carry the same metrics as "overall".
    by_language = aggregated.get("by_language")
    if not isinstance(by_language, dict):
        errors.append(f"{file_path}: key 'aggregated_metrics.by_language' must be an object")
    else:
        _validate_object_section(
            file_path,
            by_language.get("english"),
            "aggregated_metrics.by_language.english",
            required_overall_metric_keys,
        )

    _validate_object_section(
        file_path,
        aggregated.get("timing"),
        "aggregated_metrics.timing",
        required_timing_keys,
    )

# Submission completeness, checked once per model touched by the pull request.
# Iterate in sorted order so the error report is stable from run to run
# (iteration order of a set of strings is not).
for model_name in sorted(model_names):
    metrics_folder = Path("results/metrics") / model_name
    validate_required_files_for_submission(metrics_folder)

    # Every model must also ship results/pipeline_descriptions/<model>/description.json.
    description_file_path = Path("results/pipeline_descriptions") / model_name / "description.json"
    if not description_file_path.exists():
        errors.append(f"{description_file_path}: missing pipeline description file for model {model_name}")

# Parse every changed file and apply the schema that matches its location.
for path in paths:
    if not path.exists():
        # Deleted files are part of the diff selection, and a file might have
        # been removed or renamed after it; there is nothing to parse.
        continue
    try:
        with path.open("r", encoding="utf-8") as fh:
            payload = json.load(fh)
    except json.JSONDecodeError as exc:
        errors.append(f"{path}: line {exc.lineno}, column {exc.colno} -> {exc.msg}")
        continue
    except UnicodeDecodeError as exc:
        # Bytes that are not valid UTF-8 fail while reading, before JSON
        # parsing starts; report it like any other validation error instead
        # of aborting the whole run with a traceback.
        errors.append(f"{path}: not valid UTF-8 -> {exc}")
        continue

    # Enforce metrics schema only for files under results/metrics/*/*.json,
    # and the description schema only under results/pipeline_descriptions/*/*.json.
    if path in metrics_paths:
        validate_metrics_schema(path, payload)
    elif path in description_paths:
        validate_description_schema(path, payload)

# Final report: list every collected error and fail the job, or confirm success.
if errors:
    report_lines = ["JSON validation failed:", *(f"- {message}" for message in errors)]
    print("\n".join(report_lines))
    sys.exit(1)

print(f"Validated {len(paths)} JSON file(s) successfully.")
PY
2 changes: 0 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -167,5 +167,3 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/

results/
Loading