Skip to content
This repository was archived by the owner on Jun 15, 2026. It is now read-only.
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions apps/dbagent/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -78,3 +78,17 @@ vim .env.eval
Update your `.env.local` file to contain: `EVAL=true`

Ensure you have docker installed and run: `pnpm run eval`

Each eval writes replay artifacts under the configured `EVAL_FOLDER`:

- `human.txt`: readable prompt, answer, and tool-result transcript
- `replay.json`: structured replay manifest with model metadata, prompts, tool calls, tool results, and failure diagnostics
- `response.json`: raw Vercel AI SDK response for deep debugging
- `evalResult.json`: pass/fail result for the case

The test-run folder also includes `evalResults.csv`. In addition to pass/fail and UI links, the CSV includes diagnostic columns for:

- `classifications`: high-level failure categories such as `missing-expected-tool`, `unexpected-tool-call`, `tool-error`, `no-tool-result`, `malformed-request`, or `empty-final-answer`
- `expected_tools`, `observed_tools`, `missing_expected_tools`, and `unexpected_tools`

This makes model/provider regressions easier to triage without opening every raw trace. For example, when a tool-choice eval fails, first filter `evalResults.csv` by `missing-expected-tool` or `unexpected-tool-call`, then open the linked eval UI and inspect `replay.json` to see the exact prompt, model, tool-call sequence, arguments, and result/error previews.
22 changes: 7 additions & 15 deletions apps/dbagent/src/app/api/evals/route.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import { NextRequest } from 'next/server';
import path from 'path';
import { z } from 'zod';
import { evalResponseSchema } from '~/evals/api-schemas';
import { EVAL_REPLAY_FILE_NAME, EVAL_RESULT_FILE_NAME } from '~/evals/lib/consts';
import { env } from '~/lib/env/server';

export async function GET(request: NextRequest) {
Expand Down Expand Up @@ -43,21 +44,12 @@ export async function GET(request: NextRequest) {
})
);

filesWithContents.sort((a, b) => {
if (a.fileName === 'human.txt') {
return -1;
}
if (b.fileName === 'human.txt') {
return 1;
}
if (a.fileName === 'evalResult.json') {
return 1;
}
if (b.fileName === 'evalResult.json') {
return -1;
}
return 0;
});
const fileOrder = ['human.txt', EVAL_REPLAY_FILE_NAME, 'response.json', EVAL_RESULT_FILE_NAME];
const getFileOrder = (fileName: string) => {
const index = fileOrder.indexOf(fileName);
return index === -1 ? fileOrder.length : index;
};
filesWithContents.sort((a, b) => getFileOrder(a.fileName) - getFileOrder(b.fileName));

const response = evalResponseSchema.parse({ files: filesWithContents });

Expand Down
9 changes: 8 additions & 1 deletion apps/dbagent/src/evals/chat/tool-choice.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,14 @@ describe.concurrent('tool_choice', () => {
const result = await evalChat({
messages: [{ role: 'user', content: prompt }],
dbConnection: dbConfig.connectionString,
expect
expect,
traceMetadata: {
scenario: 'tool-choice',
toolPolicy: {
expectedToolCalls: toolCalls,
allowOtherTools
}
}
});

const allToolCalls = result.steps.flatMap((step) => step.toolCalls);
Expand Down
18 changes: 16 additions & 2 deletions apps/dbagent/src/evals/eval-reporter.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,13 @@ import { TestCase } from 'vitest/node';
import { Reporter } from 'vitest/reporters';
import { delay } from '~/utils/delay';
import { env } from '../lib/env/eval';
import { EVAL_RESULT_FILE_NAME, EVAL_RESULTS_CSV_FILE_NAME, EVAL_RESULTS_FILE_NAME } from './lib/consts';
import { EvalResult, evalResultSchema, evalSummarySchema } from './lib/schemas';
import {
EVAL_REPLAY_FILE_NAME,
EVAL_RESULT_FILE_NAME,
EVAL_RESULTS_CSV_FILE_NAME,
EVAL_RESULTS_FILE_NAME
} from './lib/consts';
import { evalReplayManifestSchema, EvalResult, evalResultSchema, evalSummarySchema } from './lib/schemas';
import { ensureTestRunTraceFolderExists, ensureTraceFolderExists, testNameToEvalId } from './lib/test-id';

const getEnv = () => {
Expand Down Expand Up @@ -37,9 +42,18 @@ export default class EvalReporter implements Reporter {
fs.writeFileSync(path.join(evalTraceFolder, EVAL_RESULTS_FILE_NAME), JSON.stringify(testResults, null, 2));

const csvTestResults = testResults.map((testResult) => {
const replayFile = testResult.logFiles.find((logFile) => path.basename(logFile) === EVAL_REPLAY_FILE_NAME);
const replay = replayFile
? evalReplayManifestSchema.parse(JSON.parse(fs.readFileSync(replayFile, 'utf-8')))
: undefined;
const result: any = {
id: testResult.id,
result: testResult.result,
classifications: replay?.diagnostics.classifications.join('|') ?? '',
expected_tools: replay?.diagnostics.expectedToolCalls.join('|') ?? '',
observed_tools: replay?.diagnostics.observedToolCalls.join('|') ?? '',
missing_expected_tools: replay?.diagnostics.missingExpectedToolCalls.join('|') ?? '',
unexpected_tools: replay?.diagnostics.unexpectedToolCalls.join('|') ?? '',
ui: `http://localhost:4001/evals?folder=${evalTraceFolder}&evalId=${testResult.id}`
};
testResult.logFiles.forEach((logFile, index) => {
Expand Down
7 changes: 5 additions & 2 deletions apps/dbagent/src/evals/lib/chat-runner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,19 @@ import { getTools } from '~/lib/ai/tools';
import { Connection, Project } from '~/lib/db/schema';
import { env } from '~/lib/env/eval';
import { getTargetDbPool } from '~/lib/targetdb/db';
import { EvalTraceMetadata } from './schemas';
import { traceVercelAiResponse } from './trace';

export const evalChat = async ({
messages,
dbConnection,
expect
expect,
traceMetadata
}: {
messages: CoreMessage[] | Omit<SDKMessage, 'id'>[];
dbConnection: string;
expect: ExpectStatic;
traceMetadata?: EvalTraceMetadata;
}) => {
const project: Project = {
id: 'projectId',
Expand All @@ -41,7 +44,7 @@ export const evalChat = async ({
tools,
messages
});
traceVercelAiResponse(response, expect);
traceVercelAiResponse(response, expect, traceMetadata);
return response;
} finally {
await targetDb.end();
Expand Down
1 change: 1 addition & 0 deletions apps/dbagent/src/evals/lib/consts.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
/** File name of the per-eval pass/fail result. */
export const EVAL_RESULT_FILE_NAME = 'evalResult.json';
/** File name of the JSON file the eval reporter writes with the results of a whole test run. */
export const EVAL_RESULTS_FILE_NAME = 'evalResults.json';
/** File name of the CSV summary placed in the test-run folder. */
export const EVAL_RESULTS_CSV_FILE_NAME = 'evalResults.csv';
/** File name of the per-eval structured replay manifest (see `evalReplayManifestSchema`). */
export const EVAL_REPLAY_FILE_NAME = 'replay.json';
82 changes: 82 additions & 0 deletions apps/dbagent/src/evals/lib/schemas.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,85 @@ export const evalSummarySchema = z
})
.strict();
export type EvalSummary = z.infer<typeof evalSummarySchema>;

/**
 * Tool expectations attached to an eval.
 *
 * - `expectedToolCalls`: names of the tools the eval expects to be called; defaults to an empty list.
 * - `allowOtherTools`: whether calls to tools outside `expectedToolCalls` are acceptable; defaults to `true`.
 *
 * The object is strict, so unknown keys fail validation.
 */
export const evalToolPolicySchema = z
.object({
expectedToolCalls: z.array(z.string()).default([]),
allowOtherTools: z.boolean().default(true)
})
.strict();
/** Parsed (output) shape of {@link evalToolPolicySchema}; both fields are present after defaults are applied. */
export type EvalToolPolicy = z.infer<typeof evalToolPolicySchema>;

/**
 * Optional caller-supplied context recorded alongside an eval trace.
 *
 * - `scenario`: free-form label for the kind of eval (e.g. the tool-choice tests pass `'tool-choice'`).
 * - `toolPolicy`: tool expectations, validated by `evalToolPolicySchema`.
 *
 * The object is strict, so unknown keys fail validation.
 */
export const evalTraceMetadataSchema = z
.object({
scenario: z.string().optional(),
toolPolicy: evalToolPolicySchema.optional()
})
.strict();
/** Parsed (output) shape of {@link evalTraceMetadataSchema}. */
export type EvalTraceMetadata = z.infer<typeof evalTraceMetadataSchema>;

/**
 * One tool call as recorded in a replay manifest.
 *
 * The object is strict, so unknown keys fail validation.
 */
export const evalToolCallSchema = z
.object({
// Positive integer; the tests expect the first step's calls to carry `step: 1`.
step: z.number().int().positive(),
toolCallId: z.string().optional(),
toolName: z.string(),
// Arguments are kept as-is without validation, since each tool defines its own argument shape.
args: z.unknown().optional(),
// Whether a result was recorded for this call.
hasResult: z.boolean(),
// NOTE(review): presumably a shortened string rendering of the tool result; confirm against trace.ts.
resultPreview: z.string().optional(),
// Error message associated with the call, when there is one.
error: z.string().optional()
})
.strict();
/** Parsed (output) shape of {@link evalToolCallSchema}. */
export type EvalToolCall = z.infer<typeof evalToolCallSchema>;

/**
 * High-level failure categories attached to a replay manifest's diagnostics.
 * The eval reporter joins these with `|` into the `classifications` CSV column.
 *
 * NOTE(review): the per-value notes below describe the cases exercised in trace.test.ts;
 * the exact trigger conditions live in trace.ts and should be confirmed there.
 */
export const evalFailureClassificationSchema = z.enum([
// Reported when the provider request body is not valid JSON.
'malformed-request',
// Reported together with 'malformed-request' in the tests; presumably no system prompt could be extracted.
'missing-system-prompt',
// Reported together with 'malformed-request' in the tests; presumably no user prompt could be extracted.
'missing-user-prompt',
// A tool listed in the policy's expectedToolCalls was not observed.
'missing-expected-tool',
// A tool outside expectedToolCalls was observed while allowOtherTools was false.
'unexpected-tool-call',
// A tool result carried an error.
'tool-error',
// A tool call had no corresponding tool result.
'no-tool-result',
// The final response text was empty.
'empty-final-answer'
]);
/** Union of the failure classification string literals. */
export type EvalFailureClassification = z.infer<typeof evalFailureClassificationSchema>;

/**
 * Schema of the structured replay manifest (`replay.json`) written for an eval.
 * The eval reporter parses the file with this schema to fill the diagnostic CSV columns.
 *
 * Every nested object is strict, so a manifest with unknown keys fails validation.
 */
export const evalReplayManifestSchema = z
.object({
// Format version; only version 1 is accepted.
schemaVersion: z.literal(1),
id: z.string(),
metadata: evalTraceMetadataSchema,
provider: z
.object({
model: z.string().optional(),
// NOTE(review): presumably true when the provider request body was available and parseable; confirm in trace.ts.
requestBodyCaptured: z.boolean()
})
.strict(),
// Prompts recovered for the eval; either may be absent.
prompts: z
.object({
system: z.string().optional(),
user: z.string().optional()
})
.strict(),
steps: z.array(
z
.object({
// Positive integer step index.
index: z.number().int().positive(),
text: z.string(),
toolCalls: z.array(evalToolCallSchema)
})
.strict()
),
finalAnswer: z.string(),
// Failure triage data; the string arrays hold tool names.
diagnostics: z
.object({
classifications: z.array(evalFailureClassificationSchema),
expectedToolCalls: z.array(z.string()),
observedToolCalls: z.array(z.string()),
missingExpectedToolCalls: z.array(z.string()),
unexpectedToolCalls: z.array(z.string()),
toolErrors: z.array(evalToolCallSchema)
})
.strict()
})
.strict();
/** Parsed (output) shape of {@link evalReplayManifestSchema}. */
export type EvalReplayManifest = z.infer<typeof evalReplayManifestSchema>;
129 changes: 129 additions & 0 deletions apps/dbagent/src/evals/lib/trace.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
import { describe, expect, it } from 'vitest';
import { buildReplayManifest } from './trace';

// Baseline fixture shared by every test: a successful single-step response in which
// `describeTable` is called once and returns a result, followed by a non-empty final text.
// Individual tests spread this object and override `request`, `text` or `steps`.
const response = {
request: {
// The request body is a JSON string carrying the model name, system prompt and user message.
body: JSON.stringify({
model: 'test-provider/test-model',
system: [{ text: 'You are a PostgreSQL agent.' }],
messages: [{ content: [{ text: 'Describe the dogs table' }] }]
})
},
text: 'The dogs table has an id and name column.',
steps: [
{
text: '',
toolCalls: [
{
toolCallId: 'call-1',
toolName: 'describeTable',
args: { table: 'dogs' }
}
],
// The result shares `toolCallId` with the call above.
toolResults: [
{
toolCallId: 'call-1',
toolName: 'describeTable',
args: { table: 'dogs' },
result: { columns: ['id', 'name'] }
}
]
}
]
// Cast to `any`: the literal is a partial stand-in, not a full SDK response object.
} as any;

// Unit tests for buildReplayManifest: each case feeds a hand-built response (and optional
// metadata) and checks the resulting manifest fields and diagnostic classifications.
describe('buildReplayManifest', () => {
// Happy path: the expected tool is called and returns a result, so nothing is classified as a failure.
it('captures prompts, model metadata and tool calls for replayable eval diagnostics', () => {
const manifest = buildReplayManifest({
id: 'describe_table',
response,
metadata: {
scenario: 'tool-choice',
toolPolicy: {
expectedToolCalls: ['describeTable'],
allowOtherTools: false
}
}
});

expect(manifest.provider.model).toBe('test-provider/test-model');
expect(manifest.prompts.user).toBe('Describe the dogs table');
expect(manifest.steps[0]?.toolCalls[0]).toMatchObject({
step: 1,
toolName: 'describeTable',
hasResult: true
});
expect(manifest.diagnostics.classifications).toEqual([]);
});

// The fixture calls `describeTable`, but the policy expects a different tool and forbids extras,
// so the expected tool is reported missing and the observed one as unexpected.
it('classifies missing expected tools and disallowed extra tools', () => {
const manifest = buildReplayManifest({
id: 'wrong_tool',
response,
metadata: {
toolPolicy: {
expectedToolCalls: ['getTablesAndInstanceInfo'],
allowOtherTools: false
}
}
});

expect(manifest.diagnostics.classifications).toEqual(['missing-expected-tool', 'unexpected-tool-call']);
expect(manifest.diagnostics.missingExpectedToolCalls).toEqual(['getTablesAndInstanceInfo']);
expect(manifest.diagnostics.unexpectedToolCalls).toEqual(['describeTable']);
});

// Combines three failure sources in one response: an unparseable request body, a tool result
// that carries an error, and an empty final text.
it('classifies malformed provider request bodies and tool failures', () => {
const manifest = buildReplayManifest({
id: 'tool_error',
response: {
...response,
request: { body: '{not-json' },
text: '',
steps: [
{
text: '',
toolCalls: [{ toolCallId: 'call-1', toolName: 'safeExplainQuery', args: { query: 'select * from dogs' } }],
toolResults: [
{
toolCallId: 'call-1',
toolName: 'safeExplainQuery',
result: { error: 'permission denied' }
}
]
}
]
}
});

// toEqual on an array is order-sensitive, so this also pins the order of the classifications.
expect(manifest.diagnostics.classifications).toEqual([
'malformed-request',
'missing-system-prompt',
'missing-user-prompt',
'tool-error',
'empty-final-answer'
]);
expect(manifest.diagnostics.toolErrors[0]).toMatchObject({
toolName: 'safeExplainQuery',
error: 'permission denied'
});
});

// A tool call with an empty `toolResults` list; no metadata is passed, so no tool policy applies.
it('classifies tool calls that never produce a result', () => {
const manifest = buildReplayManifest({
id: 'missing_tool_result',
response: {
...response,
steps: [
{
text: '',
toolCalls: [{ toolCallId: 'call-1', toolName: 'getSlowQueries', args: {} }],
toolResults: []
}
]
}
});

expect(manifest.diagnostics.classifications).toEqual(['no-tool-result']);
});
});
Loading