diff --git a/AGENTS.md b/AGENTS.md index 3bde42b41..ba82fc5fc 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -311,7 +311,7 @@ During normal plan review, an Archive sidebar tab provides the same browsing via | `/api/external-annotations` | POST | Add external annotations (single or batch `{ annotations: [...] }`) | | `/api/external-annotations` | PATCH | Update fields on a single annotation (`?id=`) | | `/api/external-annotations` | DELETE | Remove by `?id=`, `?source=`, or clear all | -| `/api/agents/capabilities` | GET | Check available agent providers (claude, codex, tour) | +| `/api/agents/capabilities` | GET | Check available agent providers (claude, codex, tour, cursor) | | `/api/agents/jobs/stream` | GET | SSE stream for real-time agent job status updates | | `/api/agents/jobs` | GET | Snapshot of agent jobs (polling fallback, `?since=N` for version gating) | | `/api/agents/jobs` | POST | Launch an agent job (body: `{ provider, command, label }`) | diff --git a/apps/pi-extension/server/agent-jobs.ts b/apps/pi-extension/server/agent-jobs.ts index d4a492402..f426826ac 100644 --- a/apps/pi-extension/server/agent-jobs.ts +++ b/apps/pi-extension/server/agent-jobs.ts @@ -21,6 +21,7 @@ import { AGENT_HEARTBEAT_INTERVAL_MS, } from "../generated/agent-jobs.js"; import { formatClaudeLogEvent } from "../generated/claude-review.js"; +import { formatCursorLogEvent, parseCursorModelsOutput, type CursorModel } from "../generated/cursor-review.js"; import { json, parseBody } from "./helpers.js"; // --------------------------------------------------------------------------- @@ -32,6 +33,15 @@ const JOBS = `${BASE}/jobs`; const JOBS_STREAM = `${JOBS}/stream`; const CAPABILITIES = `${BASE}/capabilities`; +// Providers whose command is owned by the server. Client-supplied argv is never +// spawned for these — buildCommand must produce the command or the launch fails. +const SERVER_BUILT_PROVIDERS: ReadonlySet = new Set([ + "claude", + "codex", + "tour", + "cursor", +]); + // --------------------------------------------------------------------------- // which() helper for Node.js // --------------------------------------------------------------------------- @@ -54,7 +64,7 @@ export interface AgentJobHandlerOptions { mode: "plan" | "review" | "annotate"; getServerUrl: () => string; getCwd: () => string; - /** Server-side command builder for known providers (codex, claude, tour). */ + /** Server-side command builder for known providers (codex, claude, tour, cursor). */ buildCommand?: (provider: string, config?: Record) => Promise<{ command: string[]; outputPath?: string; @@ -84,6 +94,24 @@ export interface AgentJobHandlerOptions { onJobComplete?: (job: AgentJobInfo, meta: { outputPath?: string; stdout?: string; cwd?: string }) => void | Promise; } +/** + * Best-effort Cursor model catalog from `agent models`, parsed once. Empty when + * discovery fails or the CLI is unauthenticated — the UI falls back to an + * `auto`-only picker. Account-specific, so never hardcoded. + */ +function discoverCursorModels(): CursorModel[] { + try { + const out = execFileSync("agent", ["models"], { + timeout: 5000, + stdio: ["ignore", "pipe", "ignore"], + encoding: "utf8", + }); + return parseCursorModelsOutput(out); + } catch { + return []; + } +} + export function createAgentJobHandler(options: AgentJobHandlerOptions) { const { mode, getServerUrl, getCwd } = options; @@ -94,10 +122,19 @@ export function createAgentJobHandler(options: AgentJobHandlerOptions) { let version = 0; // --- Capability detection (run once) --- + // Cursor CLI's binary is literally named `agent` (NOT `cursor`). When present, + // discover its account-specific model catalog so the UI doesn't hardcode ids. + const cursorAvailable = mode === "review" && whichCmd("agent"); const capabilities: AgentCapability[] = [ { id: "claude", name: "Claude Code", available: whichCmd("claude") }, { id: "codex", name: "Codex CLI", available: whichCmd("codex") }, { id: "tour", name: "Code Tour", available: whichCmd("claude") || whichCmd("codex") }, + { + id: "cursor", + name: "Cursor CLI", + available: cursorAvailable, + ...(cursorAvailable ? { models: discoverCursorModels() } : {}), + }, ]; const capabilitiesResponse: AgentCapabilities = { mode, @@ -183,31 +220,45 @@ export function createAgentJobHandler(options: AgentJobHandlerOptions) { if (spawnOptions?.cwd) jobOutputPaths.set(`${id}:cwd`, spawnOptions.cwd); broadcast({ type: "job:started", job: { ...info } }); - // --- Stdout capture (Claude JSONL streaming) --- + // --- Stdout capture (Claude/Cursor stream-json) --- let stdoutBuf = ""; if (captureStdout && proc.stdout) { + // Format one complete JSONL line into a live-log delta (skip result + // events — handled in onJobComplete). + const emitLogLine = (line: string) => { + if (!line.trim()) return; + // Tour jobs with the Claude engine also stream Claude JSONL. + if (provider === "claude" || spawnOptions?.engine === "claude") { + const formatted = formatClaudeLogEvent(line); + if (formatted !== null) broadcast({ type: "job:log", jobId: id, delta: formatted + '\n' }); + return; + } + // Cursor: map stream-json events (init/assistant/tool_call/result) + // into readable log deltas, applying the partial-output dedup rule. + if (provider === "cursor") { + const formatted = formatCursorLogEvent(line); + if (formatted !== null) broadcast({ type: "job:log", jobId: id, delta: formatted + '\n' }); + return; + } + try { + const event = JSON.parse(line); + if (event.type === 'result') return; + } catch { /* not JSON — forward as raw log */ } + broadcast({ type: "job:log", jobId: id, delta: line + '\n' }); + }; + // stream-json output is NDJSON and chunk boundaries are arbitrary — + // carry the trailing partial line until a later chunk completes it, + // otherwise records split across chunks are dropped from live logs. + let logLineCarry = ""; proc.stdout.on("data", (chunk: Buffer) => { const text = chunk.toString(); stdoutBuf += text; - - // Forward JSONL lines as log events - const lines = text.split('\n'); - for (const line of lines) { - if (!line.trim()) continue; - // Tour jobs with the Claude engine also stream Claude JSONL. - if (provider === "claude" || spawnOptions?.engine === "claude") { - const formatted = formatClaudeLogEvent(line); - if (formatted !== null) { - broadcast({ type: "job:log", jobId: id, delta: formatted + '\n' }); - } - continue; - } - try { - const event = JSON.parse(line); - if (event.type === 'result') continue; - } catch { /* not JSON — forward as raw log */ } - broadcast({ type: "job:log", jobId: id, delta: line + '\n' }); - } + const lines = (logLineCarry + text).split('\n'); + logLineCarry = lines.pop() ?? ""; + for (const line of lines) emitLogLine(line); + }); + proc.stdout.on("end", () => { + if (logLineCarry) emitLogLine(logLineCarry); }); } @@ -265,8 +316,15 @@ export function createAgentJobHandler(options: AgentJobHandlerOptions) { stdout: captureStdout ? stdoutBuf : undefined, cwd: jobCwd, }); - } catch { - // Result ingestion failure shouldn't prevent job completion broadcast + } catch (err) { + // Claude/Codex are fail-open; Cursor is fail-closed — an unexpected + // throw during prompt-enforced ingestion must fail the job, not pass + // it. (The Cursor handler normally fails by mutation and never throws; + // this guards future refactors.) + if (provider === "cursor") { + entry.info.status = "failed"; + entry.info.error = err instanceof Error ? err.message : "Cursor result ingestion failed"; + } } } jobOutputPaths.delete(id); @@ -418,6 +476,19 @@ export function createAgentJobHandler(options: AgentJobHandlerOptions) { return true; } + // Fail-closed enforcement for server-owned providers: the command MUST + // be built server-side. Client-supplied argv is never spawned for these + // providers — a null/throwing builder becomes an error, not a fallback. + if (SERVER_BUILT_PROVIDERS.has(provider)) { + if (!options.buildCommand) { + json(res, { error: `Provider ${provider} requires server-built command` }, 400); + return true; + } + // Discard any client-supplied argv so a null build cleanly hits the + // `command.length === 0` guard below instead of falling through. + command = []; + } + // Try server-side command building for known providers let captureStdout = false; let stdinPrompt: string | undefined; diff --git a/apps/pi-extension/server/serverReview.ts b/apps/pi-extension/server/serverReview.ts index 94598038d..8743c252a 100644 --- a/apps/pi-extension/server/serverReview.ts +++ b/apps/pi-extension/server/serverReview.ts @@ -90,6 +90,12 @@ import { transformClaudeFindings, } from "../generated/claude-review.js"; import { createTourSession, TOUR_EMPTY_OUTPUT_ERROR } from "../generated/tour-review.js"; +import { + CURSOR_REVIEW_PROMPT, + buildCursorCommand, + parseCursorStreamOutput, + transformCursorFindings, +} from "../generated/cursor-review.js"; import { WorkspaceReviewSession, type WorkspaceDiffType, @@ -567,6 +573,17 @@ export async function startReviewServer(options: { return { command, stdinPrompt, prompt, cwd, label: jobLabel, captureStdout: true, model, effort, prUrl: launchPrUrl, diffScope: launchDiffScope, diffContext }; } + if (provider === "cursor") { + // Cursor has no schema flag — its marker-block output contract lives in + // CURSOR_REVIEW_PROMPT. captureStdout is required (the marker block comes + // back on stdout, like Claude). buildCursorCommand passes the prompt as + // the trailing argv arg and threads --workspace=cwd to match the spawn cwd. + const model = typeof config?.model === "string" && config.model ? config.model : undefined; + const prompt = CURSOR_REVIEW_PROMPT + "\n\n---\n\n" + userMessage; + const { command } = buildCursorCommand(prompt, model, cwd); + return { command, prompt, cwd, label: jobLabel, captureStdout: true, model, prUrl: launchPrUrl, diffScope: launchDiffScope, diffContext }; + } + return null; }, @@ -636,6 +653,49 @@ export async function startReviewServer(options: { return; } + // --- Cursor path --- + // FAIL-CLOSED: Cursor output is prompt-enforced (no schema flag), so any + // missing/malformed/schema/transform/insertion failure must MUTATE the job + // to failed — NEVER throw (agent-jobs.ts swallows throws, silently leaving + // an exit-0 job marked done). Mirrors the Tour fail-closed pattern below. + if (job.provider === "cursor") { + const output = meta.stdout ? parseCursorStreamOutput(meta.stdout) : null; + if (!output) { + job.status = "failed"; + job.error = "Cursor review output missing or unparseable (no valid marker JSON)."; + return; + } + + // Derive the verdict from finding severities (like Claude) rather than + // trusting Cursor's free-form `correctness` string. Cursor has no schema + // flag, so a model value like "not correct" would be stored verbatim and + // the detail panel (any string containing "correct" except "incorrect" → + // green) would invert the displayed result. + const hasImportant = output.findings.some((f) => f.severity === "important"); + job.summary = { + correctness: hasImportant ? "Issues Found" : "Correct", + explanation: output.summary.explanation, + confidence: output.summary.confidence, + }; + + if (output.findings.length > 0) { + const annotations = transformCursorFindings( + output.findings, + job.source, + cwd, + workspace ? (filePath) => workspace.normalizeAnnotationPath(filePath) : undefined, + ) + .map(a => ({ ...a, ...jobPrContext, ...(jobDiffScope && { diffScope: jobDiffScope }) })); + const result = externalAnnotations.addAnnotations({ annotations }); + if ("error" in result) { + job.status = "failed"; + job.error = `Cursor annotation insertion failed: ${result.error}`; + return; + } + } + return; + } + if (job.provider === "tour") { const { summary } = await tour.onJobComplete({ job, meta }); if (summary) { diff --git a/apps/pi-extension/vendor.sh b/apps/pi-extension/vendor.sh index 5c9a32544..e0ecadfdc 100755 --- a/apps/pi-extension/vendor.sh +++ b/apps/pi-extension/vendor.sh @@ -13,7 +13,7 @@ for f in feedback-templates prompts review-core diff-paths cli-pagination jj-cor done # Vendor review agent modules from packages/server/ — rewrite imports for generated/ layout -for f in agent-review-message codex-review claude-review path-utils; do +for f in agent-review-message codex-review claude-review cursor-review path-utils; do src="../../packages/server/$f.ts" printf '// @generated — DO NOT EDIT. Source: packages/server/%s.ts\n' "$f" | cat - "$src" \ | sed 's|from "./vcs"|from "./review-core.js"|' \ diff --git a/packages/review-editor/dock/panels/ReviewAgentJobDetailPanel.tsx b/packages/review-editor/dock/panels/ReviewAgentJobDetailPanel.tsx index d1c2b2ff5..6afa5a5d1 100644 --- a/packages/review-editor/dock/panels/ReviewAgentJobDetailPanel.tsx +++ b/packages/review-editor/dock/panels/ReviewAgentJobDetailPanel.tsx @@ -350,7 +350,7 @@ function ProviderPill({ provider, engine, model }: { provider: string; engine?: const engineLabel = engine === 'codex' ? 'Codex' : 'Claude'; label = model && engine !== 'codex' ? `Tour · ${engineLabel} ${model.charAt(0).toUpperCase() + model.slice(1)}` : `Tour · ${engineLabel}`; } else { - label = provider === 'claude' ? 'Claude' : provider === 'codex' ? 'Codex' : 'Shell'; + label = provider === 'claude' ? 'Claude' : provider === 'codex' ? 'Codex' : provider === 'cursor' ? 'Cursor' : 'Shell'; } return ( = new Set([ + "claude", + "codex", + "tour", + "cursor", +]); + // --------------------------------------------------------------------------- // Factory // --------------------------------------------------------------------------- @@ -95,6 +105,26 @@ export interface AgentJobHandlerOptions { onJobComplete?: (job: AgentJobInfo, meta: { outputPath?: string; stdout?: string; cwd?: string }) => void | Promise; } + +/** + * Best-effort Cursor model catalog from `agent models`, parsed once and cached. + * Empty when discovery fails or the CLI is unauthenticated — the UI falls back + * to an `auto`-only picker. Account-specific, so never hardcoded. + */ +function discoverCursorModels(): CursorModel[] { + try { + const res = Bun.spawnSync(["agent", "models"], { + stdout: "pipe", + stderr: "ignore", + timeout: 5000, + }); + if (!res.success) return []; + return parseCursorModelsOutput(new TextDecoder().decode(res.stdout)); + } catch { + return []; + } +} + export function createAgentJobHandler(options: AgentJobHandlerOptions): AgentJobHandler { const { mode, getServerUrl, getCwd } = options; @@ -106,10 +136,19 @@ export function createAgentJobHandler(options: AgentJobHandlerOptions): AgentJob let version = 0; // --- Capability detection (run once) --- + // Cursor CLI's binary is literally named `agent` (NOT `cursor`). When present, + // discover its account-specific model catalog so the UI doesn't hardcode ids. + const cursorAvailable = mode === "review" && !!Bun.which("agent"); const capabilities: AgentCapability[] = [ { id: "claude", name: "Claude Code", available: !!Bun.which("claude") }, { id: "codex", name: "Codex CLI", available: !!Bun.which("codex") }, { id: "tour", name: "Code Tour", available: !!Bun.which("claude") || !!Bun.which("codex") }, + { + id: "cursor", + name: "Cursor CLI", + available: cursorAvailable, + ...(cursorAvailable ? { models: discoverCursorModels() } : {}), + }, ]; const capabilitiesResponse: AgentCapabilities = { mode, @@ -235,32 +274,44 @@ export function createAgentJobHandler(options: AgentJobHandlerOptions): AgentJob let stdoutBuf = ""; const stdoutDone = (captureStdout && proc.stdout && typeof proc.stdout !== "number") ? (async () => { + // Format one complete JSONL line into a live-log delta (skip result + // events — those are handled in onJobComplete). + const emitLogLine = (line: string) => { + if (!line.trim()) return; + // Claude: format JSONL into readable text. Tour jobs with the + // Claude engine also stream Claude JSONL, so key off engine too. + if (provider === "claude" || spawnOptions?.engine === "claude") { + const formatted = formatClaudeLogEvent(line); + if (formatted !== null) broadcast({ type: "job:log", jobId: id, delta: formatted + '\n' }); + return; + } + // Cursor: map stream-json events (init/assistant/tool_call/result) + // into readable log deltas, applying the partial-output dedup rule. + if (provider === "cursor") { + const formatted = formatCursorLogEvent(line); + if (formatted !== null) broadcast({ type: "job:log", jobId: id, delta: formatted + '\n' }); + return; + } + try { + const event = JSON.parse(line); + if (event.type === 'result') return; + } catch { /* not JSON — forward as raw log */ } + broadcast({ type: "job:log", jobId: id, delta: line + '\n' }); + }; try { const reader = proc!.stdout as unknown as AsyncIterable; + // stream-json output is NDJSON and chunk boundaries are arbitrary — + // carry the trailing partial line until a later chunk completes it, + // otherwise records split across chunks are dropped from live logs. + let logLineCarry = ""; for await (const chunk of reader) { const text = typeof chunk === "string" ? chunk : new TextDecoder().decode(chunk); stdoutBuf += text; - - // Forward JSONL lines as log events (skip result events) - const lines = text.split('\n'); - for (const line of lines) { - if (!line.trim()) continue; - // Claude: format JSONL into readable text. Tour jobs with the - // Claude engine also stream Claude JSONL, so key off engine too. - if (provider === "claude" || spawnOptions?.engine === "claude") { - const formatted = formatClaudeLogEvent(line); - if (formatted !== null) { - broadcast({ type: "job:log", jobId: id, delta: formatted + '\n' }); - } - continue; - } - try { - const event = JSON.parse(line); - if (event.type === 'result') continue; // handled in onJobComplete - } catch { /* not JSON — forward as raw log */ } - broadcast({ type: "job:log", jobId: id, delta: line + '\n' }); - } + const lines = (logLineCarry + text).split('\n'); + logLineCarry = lines.pop() ?? ""; + for (const line of lines) emitLogLine(line); } + if (logLineCarry) emitLogLine(logLineCarry); } catch { // Stream closed } @@ -293,8 +344,16 @@ export function createAgentJobHandler(options: AgentJobHandlerOptions): AgentJob stdout: captureStdout ? stdoutBuf : undefined, cwd: jobCwd, }); - } catch { - // Result ingestion failure shouldn't prevent job completion broadcast + } catch (err) { + // Claude/Codex are fail-open: an ingestion error is logged but does + // not change the terminal state. Cursor is fail-closed — its findings + // are prompt-enforced, so an unexpected throw here must surface as a + // failed job rather than a green one. (The Cursor handler normally + // fails by mutation and never throws; this guards future refactors.) + if (provider === "cursor") { + entry.info.status = "failed"; + entry.info.error = err instanceof Error ? err.message : "Cursor result ingestion failed"; + } } } jobOutputPaths.delete(id); @@ -444,6 +503,22 @@ export function createAgentJobHandler(options: AgentJobHandlerOptions): AgentJob ); } + // Fail-closed enforcement for server-owned providers: the command MUST + // be built server-side. Client-supplied argv is never spawned for these + // providers — a null/throwing builder becomes an error, not a fallback. + const isServerBuiltProvider = SERVER_BUILT_PROVIDERS.has(provider); + if (isServerBuiltProvider) { + if (!options.buildCommand) { + return Response.json( + { error: `Provider ${provider} requires server-built command` }, + { status: 400 }, + ); + } + // Discard any client-supplied argv so a null build cleanly hits the + // `command.length === 0` guard below instead of falling through. + command = []; + } + // Try server-side command building for known providers let captureStdout = false; let stdinPrompt: string | undefined; diff --git a/packages/server/cursor-review.test.ts b/packages/server/cursor-review.test.ts new file mode 100644 index 000000000..c1feb0cb4 --- /dev/null +++ b/packages/server/cursor-review.test.ts @@ -0,0 +1,354 @@ +import { describe, expect, test } from "bun:test"; +import { + CURSOR_MARKER_OPEN, + CURSOR_MARKER_CLOSE, + buildCursorCommand, + reduceCursorStreamEvents, + parseCursorStreamOutput, + formatCursorLogEvent, + transformCursorFindings, + validateCursorReviewOutput, + parseCursorModelsOutput, + type CursorFinding, + type CursorReviewOutput, +} from "./cursor-review"; + +// --------------------------------------------------------------------------- +// Helpers — build NDJSON the way Cursor `stream-json` does. +// --------------------------------------------------------------------------- + +/** Wrap a JSON object as one NDJSON record (line + newline). */ +function ndjson(obj: unknown): string { + return JSON.stringify(obj) + "\n"; +} + +/** A real partial-output assistant delta: timestamp_ms present, model_call_id absent. */ +function assistantDelta(text: string, ts = 1): string { + return ndjson({ type: "assistant", timestamp_ms: ts, message: { content: text } }); +} + +/** A duplicate assistant flush: model_call_id present → must be skipped. */ +function assistantDuplicate(text: string, ts = 1): string { + return ndjson({ type: "assistant", timestamp_ms: ts, model_call_id: "call_1", message: { content: text } }); +} + +/** Wrap review JSON in the marker block. */ +function markerBlock(payload: unknown): string { + return `${CURSOR_MARKER_OPEN}\n${JSON.stringify(payload, null, 2)}\n${CURSOR_MARKER_CLOSE}`; +} + +const validReview: CursorReviewOutput = { + findings: [ + { + file: "packages/server/review.ts", + line: 123, + end_line: 130, + severity: "important", + description: "Null deref on the unhappy path.", + reasoning: "The guard above only covers the happy path.", + }, + ], + summary: { correctness: "Issues Found", explanation: "One important issue.", confidence: 0.85 }, +}; + +/** Split a string into N near-equal chunks to simulate arbitrary pipe boundaries. */ +function chunkify(s: string, parts: number): string[] { + const size = Math.ceil(s.length / parts); + const out: string[] = []; + for (let i = 0; i < s.length; i += size) out.push(s.slice(i, i + size)); + return out; +} + +// --------------------------------------------------------------------------- +// Command construction +// --------------------------------------------------------------------------- + +describe("buildCursorCommand", () => { + test("uses the `agent` binary and read-only flags, prompt as trailing argv", () => { + const { command } = buildCursorCommand("review this", "gpt-5", "/repo"); + expect(command[0]).toBe("agent"); + expect(command).toContain("-p"); + // Read-only posture: --mode ask + --sandbox enabled, NO --force/--yolo. + expect(command).toContain("--mode"); + expect(command[command.indexOf("--mode") + 1]).toBe("ask"); + expect(command).toContain("--sandbox"); + expect(command[command.indexOf("--sandbox") + 1]).toBe("enabled"); + expect(command).not.toContain("--force"); + expect(command).not.toContain("--yolo"); + // Headless print mode needs --trust to skip the interactive workspace-trust prompt. + expect(command).toContain("--trust"); + // stream-json + partial output for live logs. + expect(command).toContain("--output-format"); + expect(command[command.indexOf("--output-format") + 1]).toBe("stream-json"); + expect(command).toContain("--stream-partial-output"); + // workspace matches launch cwd; model present. + expect(command[command.indexOf("--workspace") + 1]).toBe("/repo"); + expect(command[command.indexOf("--model") + 1]).toBe("gpt-5"); + // Prompt is the trailing positional argv arg — agent reads it from argv, not stdin. + expect(command[command.length - 1]).toBe("review this"); + }); + + test("omits --model when model is Auto or empty", () => { + expect(buildCursorCommand("p", "Auto", "/repo").command).not.toContain("--model"); + expect(buildCursorCommand("p", "", "/repo").command).not.toContain("--model"); + expect(buildCursorCommand("p", undefined, "/repo").command).not.toContain("--model"); + }); + + test("omits --workspace when no cwd is provided", () => { + expect(buildCursorCommand("p").command).not.toContain("--workspace"); + }); +}); + +// --------------------------------------------------------------------------- +// NDJSON reduction — line buffering across arbitrary chunk boundaries +// --------------------------------------------------------------------------- + +describe("reduceCursorStreamEvents", () => { + test("reconstructs canonical text regardless of how the buffer is chunked", () => { + const stdout = + ndjson({ type: "system", subtype: "init", model: "Auto" }) + + assistantDelta("Looking at ") + + assistantDelta("the diff.") + + ndjson({ type: "result", result: " Done." }); + + // Whole buffer (the accumulated stdout is what onJobComplete parses). + const whole = reduceCursorStreamEvents(stdout); + expect(whole.canonicalText).toBe("Looking at the diff. Done."); + + // Re-joining arbitrary chunk splits must yield identical canonical text — + // proves pipe chunk boundaries never corrupt a record. + for (const parts of [2, 3, 7, stdout.length]) { + const rejoined = chunkify(stdout, parts).join(""); + expect(reduceCursorStreamEvents(rejoined).canonicalText).toBe("Looking at the diff. Done."); + } + }); + + test("filters duplicate partial-output flushes (model_call_id present)", () => { + const stdout = + assistantDelta("real ") + + assistantDuplicate("DUPLICATE") + // skipped: model_call_id present + assistantDelta("text"); + const { canonicalText } = reduceCursorStreamEvents(stdout); + expect(canonicalText).toBe("real text"); + expect(canonicalText).not.toContain("DUPLICATE"); + }); + + test("ignores malformed lines without aborting the reduction", () => { + const stdout = assistantDelta("a") + "this is not json\n" + assistantDelta("b"); + const { canonicalText, recordCount } = reduceCursorStreamEvents(stdout); + expect(canonicalText).toBe("ab"); + expect(recordCount).toBe(2); // malformed line not counted + }); +}); + +// --------------------------------------------------------------------------- +// Marker extraction — last block wins +// --------------------------------------------------------------------------- + +describe("parseCursorStreamOutput", () => { + test("extracts the LAST complete marker block from multi-message output", () => { + // An earlier (stale/draft) block followed by the final authoritative one. + const staleReview = { findings: [], summary: { correctness: "Correct", explanation: "draft", confidence: 0.1 } }; + const stdout = + assistantDelta("Here is a draft:\n" + markerBlock(staleReview) + "\n") + + assistantDelta("On reflection:\n" + markerBlock(validReview)); + + const out = parseCursorStreamOutput(stdout); + expect(out).not.toBeNull(); + expect(out!.summary.correctness).toBe("Issues Found"); + expect(out!.findings).toHaveLength(1); + expect(out!.findings[0].file).toBe("packages/server/review.ts"); + }); + + test("parses a single marker block spread across NDJSON records and chunks", () => { + const full = "Commentary first.\n" + markerBlock(validReview); + // Split the marker text itself across multiple assistant deltas. + const half = Math.floor(full.length / 2); + const stdout = assistantDelta(full.slice(0, half)) + assistantDelta(full.slice(half)); + // Then chunk the raw stdout at arbitrary byte boundaries. + const rejoined = chunkify(stdout, 9).join(""); + const out = parseCursorStreamOutput(rejoined); + expect(out).not.toBeNull(); + expect(out!.findings).toHaveLength(1); + }); + + test("returns null when the marker block is missing", () => { + const stdout = assistantDelta("I found no issues. Everything looks correct."); + expect(parseCursorStreamOutput(stdout)).toBeNull(); + }); + + test("returns null when the marker block is opened but never closed", () => { + const stdout = assistantDelta(`prose ${CURSOR_MARKER_OPEN}\n{"findings":[]`); + expect(parseCursorStreamOutput(stdout)).toBeNull(); + }); + + test("returns null when the marker block contains malformed JSON", () => { + const stdout = assistantDelta(`${CURSOR_MARKER_OPEN}\n{ findings: [ }\n${CURSOR_MARKER_CLOSE}`); + expect(parseCursorStreamOutput(stdout)).toBeNull(); + }); + + test("returns null when JSON is valid but schema-invalid", () => { + // Missing summary.confidence (and bad severity) — must fail validation. + const bad = { + findings: [{ file: "x.ts", line: 1, severity: "blocker", description: "d" }], + summary: { correctness: "Issues Found", explanation: "e" }, + }; + const stdout = assistantDelta(markerBlock(bad)); + expect(parseCursorStreamOutput(stdout)).toBeNull(); + }); + + test("returns null on empty stdout", () => { + expect(parseCursorStreamOutput("")).toBeNull(); + expect(parseCursorStreamOutput(" \n ")).toBeNull(); + }); + + test("accepts an empty finding set with a valid summary", () => { + const clean = { findings: [], summary: { correctness: "Correct", explanation: "No issues.", confidence: 1 } }; + const out = parseCursorStreamOutput(assistantDelta(markerBlock(clean))); + expect(out).not.toBeNull(); + expect(out!.findings).toHaveLength(0); + expect(out!.summary.correctness).toBe("Correct"); + }); +}); + +// --------------------------------------------------------------------------- +// Schema validator (direct) +// --------------------------------------------------------------------------- + +describe("validateCursorReviewOutput", () => { + test("defaults end_line to line and reasoning to empty string", () => { + const out = validateCursorReviewOutput({ + findings: [{ file: "a.ts", line: 5, severity: "nit", description: "d" }], + summary: { correctness: "Issues Found", explanation: "e", confidence: 0.5 }, + }); + expect(out).not.toBeNull(); + expect(out!.findings[0].end_line).toBe(5); + expect(out!.findings[0].reasoning).toBe(""); + }); + + test("rejects non-object, missing findings array, and bad severity", () => { + expect(validateCursorReviewOutput(null)).toBeNull(); + expect(validateCursorReviewOutput({ summary: { correctness: "c", explanation: "e", confidence: 1 } })).toBeNull(); + expect( + validateCursorReviewOutput({ + findings: [{ file: "a.ts", line: 1, severity: "high", description: "d" }], + summary: { correctness: "c", explanation: "e", confidence: 1 }, + }), + ).toBeNull(); + }); +}); + +// --------------------------------------------------------------------------- +// Log formatting +// --------------------------------------------------------------------------- + +describe("formatCursorLogEvent", () => { + test("maps init / assistant-delta / tool_call / result and skips noise", () => { + expect(formatCursorLogEvent(ndjson({ type: "system", subtype: "init", model: "Auto", session_id: "s1" }))).toContain("model=Auto"); + expect(formatCursorLogEvent(assistantDelta("hello").trim())).toBe("hello"); + expect(formatCursorLogEvent(ndjson({ type: "tool_call", subtype: "started", name: "read_file", args: { path: "x.ts" } }))).toContain("[read_file]"); + expect(formatCursorLogEvent(ndjson({ type: "tool_call", subtype: "completed", name: "read_file" }))).toBe("[read_file] completed"); + expect(formatCursorLogEvent(ndjson({ type: "result", duration_ms: 42, request_id: "r1" }))).toContain("42ms"); + }); + + test("skips duplicate assistant flushes and malformed lines", () => { + expect(formatCursorLogEvent(assistantDuplicate("dup").trim())).toBeNull(); + expect(formatCursorLogEvent("not json")).toBeNull(); + expect(formatCursorLogEvent(ndjson({ type: "user" }).trim())).toBeNull(); + }); + + test("skips the final buffered assistant flush (no timestamp_ms) under partial output", () => { + // End-of-turn flush carries no timestamp_ms and repeats already-streamed + // deltas — showing it would duplicate the whole assistant output in live logs. + expect( + formatCursorLogEvent(ndjson({ type: "assistant", message: { content: "FULL FLUSH" } }).trim()), + ).toBeNull(); + }); +}); + +// --------------------------------------------------------------------------- +// Finding → annotation transform +// --------------------------------------------------------------------------- + +describe("transformCursorFindings", () => { + const findings: CursorFinding[] = [ + { + file: "/repo/packages/server/review.ts", + line: 10, + end_line: 12, + severity: "important", + description: "Off-by-one in the loop bound.", + reasoning: "Index reaches length, overrunning the array.", + }, + ]; + + test("produces the shared annotation shape with author Cursor and severity-prefixed text", () => { + const [a] = transformCursorFindings(findings, "agent-abcd1234", "/repo"); + expect(a.source).toBe("agent-abcd1234"); + expect(a.filePath).toBe("packages/server/review.ts"); // relativized to cwd + expect(a.lineStart).toBe(10); + expect(a.lineEnd).toBe(12); + expect(a.side).toBe("new"); + expect(a.scope).toBe("line"); + expect(a.type).toBe("comment"); + expect(a.author).toBe("Cursor"); + expect(a.severity).toBe("important"); + expect(a.text).toBe("[important] Off-by-one in the loop bound."); + expect(a.reasoning).toBe("Index reaches length, overrunning the array."); + }); + + test("applies the workspace path transform when provided", () => { + const [a] = transformCursorFindings(findings, "src", "/repo", (p) => `child/${p}`); + expect(a.filePath).toBe("child/packages/server/review.ts"); + }); + + test("drops findings without a file or numeric line", () => { + const dirty = [ + ...findings, + { file: "", line: 1, end_line: 1, severity: "nit", description: "x", reasoning: "" } as CursorFinding, + { file: "y.ts", line: undefined as unknown as number, end_line: 1, severity: "nit", description: "x", reasoning: "" } as CursorFinding, + ]; + expect(transformCursorFindings(dirty, "src", "/repo")).toHaveLength(1); + }); + + test("defaults lineEnd to lineStart when end_line is absent", () => { + const f = [{ file: "/repo/a.ts", line: 7, severity: "nit", description: "d", reasoning: "" } as unknown as CursorFinding]; + expect(transformCursorFindings(f, "src", "/repo")[0].lineEnd).toBe(7); + }); +}); + +describe("parseCursorModelsOutput", () => { + test("parses `id - Label` lines and skips the header + tip", () => { + const out = [ + "Available models", + "", + "auto - Auto", + "gpt-5.2 - GPT-5.2", + "claude-opus-4-8-thinking-high - Opus 4.8 1M Thinking", + "composer-2.5 - Composer 2.5 (current)", + "", + "Tip: use --model (or /model in interactive mode) to switch.", + ].join("\n"); + const models = parseCursorModelsOutput(out); + expect(models).toEqual([ + { id: "auto", label: "Auto" }, + { id: "gpt-5.2", label: "GPT-5.2" }, + { id: "claude-opus-4-8-thinking-high", label: "Opus 4.8 1M Thinking" }, + { id: "composer-2.5", label: "Composer 2.5 (current)" }, + ]); + }); + + test("returns [] for unauthenticated / empty output", () => { + expect(parseCursorModelsOutput("No models available for this account.")).toEqual([]); + expect(parseCursorModelsOutput("")).toEqual([]); + expect(parseCursorModelsOutput("Not logged in")).toEqual([]); + }); + + test("dedupes repeated ids, keeping first", () => { + const models = parseCursorModelsOutput("auto - Auto\nauto - Auto Again\ngpt-5.2 - GPT-5.2"); + expect(models).toEqual([ + { id: "auto", label: "Auto" }, + { id: "gpt-5.2", label: "GPT-5.2" }, + ]); + }); +}); diff --git a/packages/server/cursor-review.ts b/packages/server/cursor-review.ts new file mode 100644 index 000000000..19e9e0ce3 --- /dev/null +++ b/packages/server/cursor-review.ts @@ -0,0 +1,578 @@ +import { toRelativePath } from "./path-utils"; + +/** + * Cursor CLI Review Agent — prompt, command builder, NDJSON stream reducer, + * marker-block parser, and finding transformer. + * + * Cursor differs from Claude/Codex in exactly one way: its headless CLI + * (the binary is literally named `agent`) exposes no schema-validation flag + * (`--json-schema` / `--output-schema` / `structured_output`). Its final task + * output is prose. That earns strict *parsing* — Cursor is told to emit a + * marker-delimited JSON block, and we extract the LAST complete block from the + * reconstructed canonical text. It does NOT earn extra isolation, + * finding-policing, env scrubbing, or a per-job marker nonce. Cursor inherits + * process.env like Claude/Codex; its read-only posture comes only from the + * command flags (`--mode ask` + `--sandbox enabled` + no `--force`). + * + * Cursor `stream-json` is decoded with a real line-buffered NDJSON reducer + * (pipe chunk boundaries are NOT NDJSON record boundaries), producing live log + * deltas and the canonical assistant/result text used for marker extraction. + */ + +// --------------------------------------------------------------------------- +// Static marker — v1 uses a STATIC marker plus last-block selection (no nonce). +// If testing shows the model echoing the marker from the prompt, add a per-job +// nonce then — not before (per approach doc §Structured Output Policy). +// --------------------------------------------------------------------------- + +export const CURSOR_MARKER_OPEN = ""; +export const CURSOR_MARKER_CLOSE = ""; + +// --------------------------------------------------------------------------- +// Types — mirror ClaudeFinding/ClaudeReviewOutput, plus a freeform summary. +// --------------------------------------------------------------------------- + +export type CursorSeverity = "important" | "nit" | "pre_existing"; + +export interface CursorFinding { + file: string; + line: number; + end_line: number; + severity: CursorSeverity; + description: string; + reasoning: string; +} + +export interface CursorReviewSummary { + correctness: string; + explanation: string; + confidence: number; +} + +export interface CursorReviewOutput { + findings: CursorFinding[]; + summary: CursorReviewSummary; +} + +const VALID_SEVERITIES: ReadonlySet = new Set([ + "important", + "nit", + "pre_existing", +]); + +// --------------------------------------------------------------------------- +// Schema validator — hand-rolled (no Ajv): Cursor output is prompt-enforced, +// so validation is the floor that turns prose back into a trusted object. +// --------------------------------------------------------------------------- + +/** + * Validate a parsed object against the Cursor review schema. + * Returns the typed output, or null if the shape is invalid. + * + * Empty findings are valid as long as the JSON itself is valid and carries a + * valid summary (per approach doc §Structured Output Policy). + */ +export function validateCursorReviewOutput(parsed: unknown): CursorReviewOutput | null { + if (!parsed || typeof parsed !== "object") return null; + const obj = parsed as Record; + + if (!Array.isArray(obj.findings)) return null; + + const summary = obj.summary; + if (!summary || typeof summary !== "object") return null; + const s = summary as Record; + if (typeof s.correctness !== "string") return null; + if (typeof s.explanation !== "string") return null; + if (typeof s.confidence !== "number") return null; + + const findings: CursorFinding[] = []; + for (const raw of obj.findings) { + if (!raw || typeof raw !== "object") return null; + const f = raw as Record; + if (typeof f.file !== "string") return null; + if (typeof f.line !== "number") return null; + if (typeof f.severity !== "string" || !VALID_SEVERITIES.has(f.severity)) return null; + if (typeof f.description !== "string") return null; + + findings.push({ + file: f.file, + line: f.line, + end_line: typeof f.end_line === "number" ? f.end_line : f.line, + severity: f.severity as CursorSeverity, + description: f.description, + reasoning: typeof f.reasoning === "string" ? f.reasoning : "", + }); + } + + return { + findings, + summary: { + correctness: s.correctness, + explanation: s.explanation, + confidence: s.confidence, + }, + }; +} + +// --------------------------------------------------------------------------- +// Review prompt — investigation-first methodology tuned for Cursor's agentic +// model, ending with the marker-block output contract (Cursor has no schema +// flag, so the contract IS the prompt). Project-specific rules are NOT baked in +// here: the agent is told to discover and honor repo guidance files at review +// time, so this same prompt works for any repo `plannotator review` is run on. +// --------------------------------------------------------------------------- + +export const CURSOR_REVIEW_PROMPT = `# Code Review + +## Your role +You are a senior engineer reviewing a code change. Find the bugs a maintainer +would fix before merging — logic errors, regressions, broken edge cases, +security holes with a real exploit path, data-loss risks. You are not a linter +or style checker. Optimize for findings that get resolved, not for volume: a +few real, well-evidenced bugs beat a long list of nits. + +## Method — investigate before you report +You have tools. Use them. Do not judge the diff in isolation: + - Read the full function and module around each change, not just the hunk. + - Trace call sites and data flow to see how the changed code is actually used. + - Read any repo guidance and honor it if present: CLAUDE.md, REVIEW.md, + AGENTS.md, .cursor/rules/*, and .cursor/BUGBOT.md (root and any nested under + the directories you're reviewing). Treat these as authoritative for this + repo and respect any skip/ignore rules they define. + - Before reporting, check whether the issue is already handled elsewhere (a + guard, try/catch, fallback, type guarantee). + - Look at sibling code and tests: is the pattern you're flagging used safely + elsewhere? Is the failure path you fear already covered? + +## What to look for + - Correctness: logic errors, regressions, broken edge cases, off-by-one, + wrong results, build/type breakage. + - Robustness: unhandled errors on realistic paths, resource/process leaks, + races with observable consequences, state left inconsistent on failure. + - Security (only when the change introduces it): execution of + user/client-controlled input, trusting unvalidated external or model + output, weakened trust/read-only guarantees, newly exposed surfaces. + - Contracts & attribution: wrong file/line/path mapping, scope confusion, or a + change to one side of a contract (types, API shape, parallel + implementations) without the matching change on the other side. + - Clear, citable violations of the repo guidance files above. + +## Validate every finding +Confirm each candidate is real by tracing the actual code path. If you can't +show how it triggers, drop it — prefer silence over a false positive. The +confirmation you write becomes the \`reasoning\` field: what triggers it, what +breaks, and why it isn't already handled. + +## Severity — report what a maintainer would actually fix + important — should be fixed before merge: data loss, silent wrong results, + security with an exploit path, crashes/leaks, contract drift that breaks a + real consumer, fail-open where it must fail-closed. + + nit — minor and non-blocking: a real but low-impact edge case, missing + handling on an unlikely path, a robustness gap with an easy workaround. + + pre_existing — a genuine bug in surrounding code NOT introduced by this + change. Only flag when it directly affects the changed code path. + +## Do NOT report + - Formatting, naming, or style preferences (unless a repo guideline requires). + - "Consider adding tests" — unless the change adds non-trivial logic with no + test coverage at all. + - Hypothetical problems needing exotic conditions with no real code path. + - Speculative refactors or "this could be cleaner" outside the change's scope. + +## Hard constraints +- This is a read-only review. Do NOT modify files. +- Do NOT post any comments to GitHub or GitLab. Do NOT use gh pr comment, glab, + or any commenting tool. +- Never approve or block the change. Your only output is findings. + +## Output contract +Your only machine-readable output is a single marker-delimited JSON block. +Any natural-language commentary you write must come BEFORE the final marker +block. Emit the block exactly once, as the last thing in your response: + +${CURSOR_MARKER_OPEN} +{ + "findings": [ + { + "file": "packages/server/review.ts", + "line": 123, + "end_line": 123, + "severity": "important", + "description": "The issue...", + "reasoning": "Why this is a real issue..." + } + ], + "summary": { + "correctness": "Issues Found", + "explanation": "One important issue was found.", + "confidence": 0.85 + } +} +${CURSOR_MARKER_CLOSE} + +Schema: +- findings: array of objects, each with + - file: string (path as shown in the diff) + - line: integer (start line, post-change numbering) + - end_line: integer (end line; equal to line for a single line) + - severity: one of "important", "nit", "pre_existing" + - description: string (one paragraph) — state the IMPACT (what breaks, for + whom) and the TRIGGER (when it happens); suggest a minimal fix if obvious + - reasoning: string (how the issue was confirmed) +- summary: object with + - correctness: string ("Correct" or "Issues Found") + - explanation: string (one sentence) + - confidence: number between 0 and 1 + +Cite file/line from the new (post-change) code. One finding per distinct bug — +do not stack unrelated issues. If no issues are found, return an empty +"findings" array with a valid summary.`; + +// --------------------------------------------------------------------------- +// Command builder +// --------------------------------------------------------------------------- + +export interface CursorCommandResult { + command: string[]; +} + +/** + * Build the `agent -p` command. NOTE the binary is `agent`, NOT `cursor`. + * + * Read-only posture comes entirely from `--mode ask` + `--sandbox enabled` and + * the absence of `--force`/`--yolo`. `--trust` is required in headless print + * mode: without it Cursor stops on an interactive workspace-trust prompt that a + * background job can never answer. It is safe here because the run is already + * constrained to read-only ask mode with the sandbox enabled. + * + * The prompt is the trailing positional argument — `agent` reads task text from + * argv (`[prompt...]`), not stdin. `--model` is omitted when the model is + * `Auto`/empty so Cursor uses its default model selection. `--workspace` is set + * to the launch cwd when provided, matching the spawn cwd. + */ +export function buildCursorCommand(prompt: string, model?: string, cwd?: string): CursorCommandResult { + // `auto` is Cursor's default model id — omit --model so the CLI chooses. + const useModel = !!model && model.toLowerCase() !== "auto"; + + return { + command: [ + "agent", + "-p", + "--mode", + "ask", + "--output-format", + "stream-json", + "--stream-partial-output", + "--trust", + ...(cwd ? ["--workspace", cwd] : []), + "--sandbox", + "enabled", + ...(useModel ? ["--model", model] : []), + // Prompt is the trailing positional arg — agent reads it from argv, not stdin. + prompt, + ], + }; +} + +// --------------------------------------------------------------------------- +// Model discovery — parse `agent models` output. The spawn is runtime-specific +// (it lives in each server's agent-jobs adapter); this pure parser is shared. +// --------------------------------------------------------------------------- + +export interface CursorModel { + id: string; + label: string; +} + +/** + * Parse the text output of `agent models` / `agent --list-models` into a model + * catalog. The CLI prints one model per line as ` -