xataio · RitwijParmar · Jun 5, 2026
diff --git a/apps/dbagent/README.md b/apps/dbagent/README.md
@@ -78,3 +78,17 @@ vim .env.eval
 Update you `.env.local` file to contain: `EVAL=true`
 
 Ensure you have docker installed and run: `pnpm run eval`
+
+Each eval writes replay artifacts under the configured `EVAL_FOLDER`:
+
+- `human.txt`: readable prompt, answer, and tool-result transcript
+- `replay.json`: structured replay manifest with model metadata, prompts, tool calls, tool results, and failure diagnostics
+- `response.json`: raw Vercel AI SDK response for deep debugging
+- `evalResult.json`: pass/fail result for the case
+
+The test-run folder also includes `evalResults.csv`. In addition to pass/fail and UI links, the CSV includes diagnostic columns for:
+
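+- `classifications`: pipe-separated (`|`) high-level failure categories such as `missing-expected-tool`, `unexpected-tool-call`, `tool-error`, `no-tool-result`, `malformed-request`, `missing-system-prompt`, `missing-user-prompt`, or `empty-final-answer`; empty when nothing was classified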
+- `expected_tools`, `observed_tools`, `missing_expected_tools`, and `unexpected_tools`: pipe-separated (`|`) tool names; empty when there are none
+
+This makes model/provider regressions easier to triage without opening every raw trace. For example, when a tool-choice eval fails, first filter the `classifications` column of `evalResults.csv` for `missing-expected-tool` or `unexpected-tool-call`, then open the eval UI link from the `ui` column and inspect `replay.json` to see the exact prompt, model, tool-call sequence, arguments, and result/error previews.
diff --git a/apps/dbagent/src/app/api/evals/route.ts b/apps/dbagent/src/app/api/evals/route.ts
@@ -3,6 +3,7 @@ import { NextRequest } from 'next/server';
 import path from 'path';
 import { z } from 'zod';
 import { evalResponseSchema } from '~/evals/api-schemas';
+import { EVAL_REPLAY_FILE_NAME, EVAL_RESULT_FILE_NAME } from '~/evals/lib/consts';
 import { env } from '~/lib/env/server';
 
 export async function GET(request: NextRequest) {
@@ -43,21 +44,12 @@ export async function GET(request: NextRequest) {
       })
     );
 
-    filesWithContents.sort((a, b) => {
-      if (a.fileName === 'human.txt') {
-        return -1;
-      }
-      if (b.fileName === 'human.txt') {
-        return 1;
-      }
-      if (a.fileName === 'evalResult.json') {
-        return 1;
-      }
-      if (b.fileName === 'evalResult.json') {
-        return -1;
-      }
-      return 0;
-    });
+    const leadingFiles = ['human.txt', EVAL_REPLAY_FILE_NAME, 'response.json'];
+    const getFileOrder = (fileName: string) => { // rank: known artifacts < any other file < evalResult.json
+      const index = fileName === EVAL_RESULT_FILE_NAME ? leadingFiles.length + 1 : leadingFiles.indexOf(fileName);
+      return index === -1 ? leadingFiles.length : index; // unrecognised files stay ahead of the verdict, as before
+    };
+    filesWithContents.sort((a, b) => getFileOrder(a.fileName) - getFileOrder(b.fileName));
 
     const response = evalResponseSchema.parse({ files: filesWithContents });
 

diff --git a/apps/dbagent/src/evals/chat/tool-choice.test.ts b/apps/dbagent/src/evals/chat/tool-choice.test.ts
@@ -168,7 +168,14 @@ describe.concurrent('tool_choice', () => {
     const result = await evalChat({
       messages: [{ role: 'user', content: prompt }],
       dbConnection: dbConfig.connectionString,
-      expect
+      expect,
+      traceMetadata: {
+        scenario: 'tool-choice',
+        toolPolicy: {
+          expectedToolCalls: toolCalls,
+          allowOtherTools
+        }
+      }
     });
 
     const allToolCalls = result.steps.flatMap((step) => step.toolCalls);

diff --git a/apps/dbagent/src/evals/eval-reporter.ts b/apps/dbagent/src/evals/eval-reporter.ts
@@ -5,8 +5,13 @@ import { TestCase } from 'vitest/node';
 import { Reporter } from 'vitest/reporters';
 import { delay } from '~/utils/delay';
 import { env } from '../lib/env/eval';
-import { EVAL_RESULT_FILE_NAME, EVAL_RESULTS_CSV_FILE_NAME, EVAL_RESULTS_FILE_NAME } from './lib/consts';
-import { EvalResult, evalResultSchema, evalSummarySchema } from './lib/schemas';
+import {
+  EVAL_REPLAY_FILE_NAME,
+  EVAL_RESULT_FILE_NAME,
+  EVAL_RESULTS_CSV_FILE_NAME,
+  EVAL_RESULTS_FILE_NAME
+} from './lib/consts';
+import { evalReplayManifestSchema, EvalResult, evalResultSchema, evalSummarySchema } from './lib/schemas';
 import { ensureTestRunTraceFolderExists, ensureTraceFolderExists, testNameToEvalId } from './lib/test-id';
 
 const getEnv = () => {
@@ -37,9 +42,18 @@ export default class EvalReporter implements Reporter {
     fs.writeFileSync(path.join(evalTraceFolder, EVAL_RESULTS_FILE_NAME), JSON.stringify(testResults, null, 2));
 
     const csvTestResults = testResults.map((testResult) => {
+      const replayFile = testResult.logFiles.find((logFile) => path.basename(logFile) === EVAL_REPLAY_FILE_NAME);
+      const replay = replayFile
+        ? evalReplayManifestSchema.parse(JSON.parse(fs.readFileSync(replayFile, 'utf-8')))
+        : undefined;
       const result: any = {
         id: testResult.id,
         result: testResult.result,
+        classifications: replay?.diagnostics.classifications.join('|') ?? '',
+        expected_tools: replay?.diagnostics.expectedToolCalls.join('|') ?? '',
+        observed_tools: replay?.diagnostics.observedToolCalls.join('|') ?? '',
+        missing_expected_tools: replay?.diagnostics.missingExpectedToolCalls.join('|') ?? '',
+        unexpected_tools: replay?.diagnostics.unexpectedToolCalls.join('|') ?? '',
         ui: `http://localhost:4001/evals?folder=${evalTraceFolder}&evalId=${testResult.id}`
       };
       testResult.logFiles.forEach((logFile, index) => {

diff --git a/apps/dbagent/src/evals/lib/chat-runner.ts b/apps/dbagent/src/evals/lib/chat-runner.ts
@@ -6,16 +6,19 @@ import { getTools } from '~/lib/ai/tools';
 import { Connection, Project } from '~/lib/db/schema';
 import { env } from '~/lib/env/eval';
 import { getTargetDbPool } from '~/lib/targetdb/db';
+import { EvalTraceMetadata } from './schemas';
 import { traceVercelAiResponse } from './trace';
 
 export const evalChat = async ({
   messages,
   dbConnection,
-  expect
+  expect,
+  traceMetadata
 }: {
   messages: CoreMessage[] | Omit<SDKMessage, 'id'>[];
   dbConnection: string;
   expect: ExpectStatic;
+  traceMetadata?: EvalTraceMetadata;
 }) => {
   const project: Project = {
     id: 'projectId',
@@ -41,7 +44,7 @@ export const evalChat = async ({
       tools,
       messages
     });
-    traceVercelAiResponse(response, expect);
+    traceVercelAiResponse(response, expect, traceMetadata);
     return response;
   } finally {
     await targetDb.end();

diff --git a/apps/dbagent/src/evals/lib/consts.ts b/apps/dbagent/src/evals/lib/consts.ts
@@ -1,3 +1,4 @@
 export const EVAL_RESULT_FILE_NAME = 'evalResult.json';
 export const EVAL_RESULTS_FILE_NAME = 'evalResults.json';
 export const EVAL_RESULTS_CSV_FILE_NAME = 'evalResults.csv';
+export const EVAL_REPLAY_FILE_NAME = 'replay.json';
diff --git a/apps/dbagent/src/evals/lib/schemas.ts b/apps/dbagent/src/evals/lib/schemas.ts
@@ -17,3 +17,85 @@ export const evalSummarySchema = z
   })
   .strict();
 export type EvalSummary = z.infer<typeof evalSummarySchema>;
+
+export const evalToolPolicySchema = z
+  .object({
+    expectedToolCalls: z.array(z.string()).default([]),
+    allowOtherTools: z.boolean().default(true)
+  })
+  .strict();
+export type EvalToolPolicy = z.infer<typeof evalToolPolicySchema>;
+
+export const evalTraceMetadataSchema = z
+  .object({
+    scenario: z.string().optional(),
+    toolPolicy: evalToolPolicySchema.optional()
+  })
+  .strict();
+export type EvalTraceMetadata = z.infer<typeof evalTraceMetadataSchema>;
+
+export const evalToolCallSchema = z
+  .object({
+    step: z.number().int().positive(),
+    toolCallId: z.string().optional(),
+    toolName: z.string(),
+    args: z.unknown().optional(),
+    hasResult: z.boolean(),
+    resultPreview: z.string().optional(),
+    error: z.string().optional()
+  })
+  .strict();
+export type EvalToolCall = z.infer<typeof evalToolCallSchema>;
+
+export const evalFailureClassificationSchema = z.enum([
+  'malformed-request',
+  'missing-system-prompt',
+  'missing-user-prompt',
+  'missing-expected-tool',
+  'unexpected-tool-call',
+  'tool-error',
+  'no-tool-result',
+  'empty-final-answer'
+]);
+export type EvalFailureClassification = z.infer<typeof evalFailureClassificationSchema>;
+
+export const evalReplayManifestSchema = z
+  .object({
+    schemaVersion: z.literal(1),
+    id: z.string(),
+    metadata: evalTraceMetadataSchema,
+    provider: z
+      .object({
+        model: z.string().optional(),
+        requestBodyCaptured: z.boolean()
+      })
+      .strict(),
+    prompts: z
+      .object({
+        system: z.string().optional(),
+        user: z.string().optional()
+      })
+      .strict(),
+    steps: z.array(
+      z
+        .object({
+          index: z.number().int().positive(),
+          text: z.string(),
+          toolCalls: z.array(evalToolCallSchema)
+        })
+        .strict()
+    ),
+    finalAnswer: z.string(),
+    diagnostics: z
+      .object({
+        classifications: z.array(evalFailureClassificationSchema),
+        expectedToolCalls: z.array(z.string()),
+        observedToolCalls: z.array(z.string()),
+        missingExpectedToolCalls: z.array(z.string()),
+        unexpectedToolCalls: z.array(z.string()),
+        toolErrors: z.array(evalToolCallSchema)
+      })
+      .strict()
+  })
+  .strict();
+export type EvalReplayManifest = z.infer<typeof evalReplayManifestSchema>;
diff --git a/apps/dbagent/src/evals/lib/trace.test.ts b/apps/dbagent/src/evals/lib/trace.test.ts
@@ -0,0 +1,129 @@
+import { describe, expect, it } from 'vitest';
+import { buildReplayManifest } from './trace';
+
+const response = {
+  request: {
+    body: JSON.stringify({
+      model: 'test-provider/test-model',
+      system: [{ text: 'You are a PostgreSQL agent.' }],
+      messages: [{ content: [{ text: 'Describe the dogs table' }] }]
+    })
+  },
+  text: 'The dogs table has an id and name column.',
+  steps: [
+    {
+      text: '',
+      toolCalls: [
+        {
+          toolCallId: 'call-1',
+          toolName: 'describeTable',
+          args: { table: 'dogs' }
+        }
+      ],
+      toolResults: [
+        {
+          toolCallId: 'call-1',
+          toolName: 'describeTable',
+          args: { table: 'dogs' },
+          result: { columns: ['id', 'name'] }
+        }
+      ]
+    }
+  ]
+} as any;
+
+describe('buildReplayManifest', () => {
+  it('captures prompts, model metadata and tool calls for replayable eval diagnostics', () => {
+    const manifest = buildReplayManifest({
+      id: 'describe_table',
+      response,
+      metadata: {
+        scenario: 'tool-choice',
+        toolPolicy: {
+          expectedToolCalls: ['describeTable'],
+          allowOtherTools: false
+        }
+      }
+    });
+
+    expect(manifest.provider.model).toBe('test-provider/test-model');
+    expect(manifest.prompts.user).toBe('Describe the dogs table');
+    expect(manifest.steps[0]?.toolCalls[0]).toMatchObject({
+      step: 1,
+      toolName: 'describeTable',
+      hasResult: true
+    });
+    expect(manifest.diagnostics.classifications).toEqual([]);
+  });
+
+  it('classifies missing expected tools and disallowed extra tools', () => {
+    const manifest = buildReplayManifest({
+      id: 'wrong_tool',
+      response,
+      metadata: {
+        toolPolicy: {
+          expectedToolCalls: ['getTablesAndInstanceInfo'],
+          allowOtherTools: false
+        }
+      }
+    });
+
+    expect(manifest.diagnostics.classifications).toEqual(['missing-expected-tool', 'unexpected-tool-call']);
+    expect(manifest.diagnostics.missingExpectedToolCalls).toEqual(['getTablesAndInstanceInfo']);
+    expect(manifest.diagnostics.unexpectedToolCalls).toEqual(['describeTable']);
+  });
+
+  it('classifies malformed provider request bodies and tool failures', () => {
+    const manifest = buildReplayManifest({
+      id: 'tool_error',
+      response: {
+        ...response,
+        request: { body: '{not-json' },
+        text: '',
+        steps: [
+          {
+            text: '',
+            toolCalls: [{ toolCallId: 'call-1', toolName: 'safeExplainQuery', args: { query: 'select * from dogs' } }],
+            toolResults: [
+              {
+                toolCallId: 'call-1',
+                toolName: 'safeExplainQuery',
+                result: { error: 'permission denied' }
+              }
+            ]
+          }
+        ]
+      }
+    });
+
+    expect(manifest.diagnostics.classifications).toEqual([
+      'malformed-request',
+      'missing-system-prompt',
+      'missing-user-prompt',
+      'tool-error',
+      'empty-final-answer'
+    ]);
+    expect(manifest.diagnostics.toolErrors[0]).toMatchObject({
+      toolName: 'safeExplainQuery',
+      error: 'permission denied'
+    });
+  });
+
+  it('classifies tool calls that never produce a result', () => {
+    const manifest = buildReplayManifest({
+      id: 'missing_tool_result',
+      response: {
+        ...response,
+        steps: [
+          {
+            text: '',
+            toolCalls: [{ toolCallId: 'call-1', toolName: 'getSlowQueries', args: {} }],
+            toolResults: []
+          }
+        ]
+      }
+    });
+
+    expect(manifest.diagnostics.classifications).toEqual(['no-tool-result']);
+  });
+});