From 9e82d417a699dd4e30912318415af105dd0c324a Mon Sep 17 00:00:00 2001 From: Michael Ramos Date: Sat, 13 Jun 2026 13:51:52 -0700 Subject: [PATCH 1/7] feat(review): add Cursor CLI as an experimental review agent provider MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds 'cursor' as a third background review provider alongside Claude, Codex, and Code Tour, plugged into the existing /api/agents/jobs system (not Ask AI). Design principle: match the existing providers, don't out-engineer them. Cursor's only real difference is prose output (no schema flag), which earns strict PARSING only — no extra isolation, grounding, or finding-policing. Backend: - packages/server/cursor-review.ts: command builder (read-only flags, prompt on stdin), line-buffered NDJSON stream reducer, static-marker last-block JSON extraction, schema validation, finding->annotation transform. - agent-jobs.ts: capability detection (binary is 'agent', not 'cursor'), fail-closed server-built-provider guard (client argv is never spawned for built-in providers), cursor stdout-log dispatch. - review.ts onJobComplete: fail-closed BY MUTATION (never throws — the runner swallows throws), PR/diffScope attribution, workspace path normalization. - Full Pi parity (agent-jobs.ts, serverReview.ts, vendored via vendor.sh). UI (capability-gated, invisible unless the Cursor CLI is installed): - AgentsTab/useAgentSettings: Cursor as a review-only engine with an experimental label and Auto model default. Read-only posture is flags-only: --mode ask + --sandbox enabled, no --force. Env is inherited like Claude/Codex. Tests: packages/server/cursor-review.test.ts (22, high-value only). Full typecheck + affected review-agent test families pass. OUTSTANDING (not automatable here): empirically verify 'agent -p --mode ask' is non-mutating with an authenticated Cursor account before trusting in production. Ship as experimental until confirmed. --- AGENTS.md | 2 +- apps/pi-extension/server/agent-jobs.ts | 34 ++ apps/pi-extension/server/serverReview.ts | 54 +++ apps/pi-extension/vendor.sh | 2 +- packages/server/agent-jobs.ts | 37 ++ packages/server/cursor-review.test.ts | 308 ++++++++++++++ packages/server/cursor-review.ts | 515 +++++++++++++++++++++++ packages/server/review.ts | 54 +++ packages/ui/components/AgentsTab.tsx | 91 +++- packages/ui/hooks/useAgentSettings.ts | 32 +- 10 files changed, 1109 insertions(+), 20 deletions(-) create mode 100644 packages/server/cursor-review.test.ts create mode 100644 packages/server/cursor-review.ts diff --git a/AGENTS.md b/AGENTS.md index 3bde42b41..ba82fc5fc 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -311,7 +311,7 @@ During normal plan review, an Archive sidebar tab provides the same browsing via | `/api/external-annotations` | POST | Add external annotations (single or batch `{ annotations: [...] }`) | | `/api/external-annotations` | PATCH | Update fields on a single annotation (`?id=`) | | `/api/external-annotations` | DELETE | Remove by `?id=`, `?source=`, or clear all | -| `/api/agents/capabilities` | GET | Check available agent providers (claude, codex, tour) | +| `/api/agents/capabilities` | GET | Check available agent providers (claude, codex, tour, cursor) | | `/api/agents/jobs/stream` | GET | SSE stream for real-time agent job status updates | | `/api/agents/jobs` | GET | Snapshot of agent jobs (polling fallback, `?since=N` for version gating) | | `/api/agents/jobs` | POST | Launch an agent job (body: `{ provider, command, label }`) | diff --git a/apps/pi-extension/server/agent-jobs.ts b/apps/pi-extension/server/agent-jobs.ts index d4a492402..696c56212 100644 --- a/apps/pi-extension/server/agent-jobs.ts +++ b/apps/pi-extension/server/agent-jobs.ts @@ -21,6 +21,7 @@ import { AGENT_HEARTBEAT_INTERVAL_MS, } from "../generated/agent-jobs.js"; import { formatClaudeLogEvent } from "../generated/claude-review.js"; +import { formatCursorLogEvent } from "../generated/cursor-review.js"; import { json, parseBody } from "./helpers.js"; // --------------------------------------------------------------------------- @@ -32,6 +33,15 @@ const JOBS = `${BASE}/jobs`; const JOBS_STREAM = `${JOBS}/stream`; const CAPABILITIES = `${BASE}/capabilities`; +// Providers whose command is owned by the server. Client-supplied argv is never +// spawned for these — buildCommand must produce the command or the launch fails. +const SERVER_BUILT_PROVIDERS: ReadonlySet = new Set([ + "claude", + "codex", + "tour", + "cursor", +]); + // --------------------------------------------------------------------------- // which() helper for Node.js // --------------------------------------------------------------------------- @@ -98,6 +108,8 @@ export function createAgentJobHandler(options: AgentJobHandlerOptions) { { id: "claude", name: "Claude Code", available: whichCmd("claude") }, { id: "codex", name: "Codex CLI", available: whichCmd("codex") }, { id: "tour", name: "Code Tour", available: whichCmd("claude") || whichCmd("codex") }, + // Cursor CLI's binary is literally named `agent` (NOT `cursor`). + { id: "cursor", name: "Cursor CLI", available: whichCmd("agent") }, ]; const capabilitiesResponse: AgentCapabilities = { mode, @@ -202,6 +214,15 @@ export function createAgentJobHandler(options: AgentJobHandlerOptions) { } continue; } + // Cursor: map stream-json events (init/assistant/tool_call/result) + // into readable log deltas, applying the partial-output dedup rule. + if (provider === "cursor") { + const formatted = formatCursorLogEvent(line); + if (formatted !== null) { + broadcast({ type: "job:log", jobId: id, delta: formatted + '\n' }); + } + continue; + } try { const event = JSON.parse(line); if (event.type === 'result') continue; @@ -418,6 +439,19 @@ export function createAgentJobHandler(options: AgentJobHandlerOptions) { return true; } + // Fail-closed enforcement for server-owned providers: the command MUST + // be built server-side. Client-supplied argv is never spawned for these + // providers — a null/throwing builder becomes an error, not a fallback. + if (SERVER_BUILT_PROVIDERS.has(provider)) { + if (!options.buildCommand) { + json(res, { error: `Provider ${provider} requires server-built command` }, 400); + return true; + } + // Discard any client-supplied argv so a null build cleanly hits the + // `command.length === 0` guard below instead of falling through. + command = []; + } + // Try server-side command building for known providers let captureStdout = false; let stdinPrompt: string | undefined; diff --git a/apps/pi-extension/server/serverReview.ts b/apps/pi-extension/server/serverReview.ts index 94598038d..20a8d7282 100644 --- a/apps/pi-extension/server/serverReview.ts +++ b/apps/pi-extension/server/serverReview.ts @@ -90,6 +90,12 @@ import { transformClaudeFindings, } from "../generated/claude-review.js"; import { createTourSession, TOUR_EMPTY_OUTPUT_ERROR } from "../generated/tour-review.js"; +import { + CURSOR_REVIEW_PROMPT, + buildCursorCommand, + parseCursorStreamOutput, + transformCursorFindings, +} from "../generated/cursor-review.js"; import { WorkspaceReviewSession, type WorkspaceDiffType, @@ -567,6 +573,17 @@ export async function startReviewServer(options: { return { command, stdinPrompt, prompt, cwd, label: jobLabel, captureStdout: true, model, effort, prUrl: launchPrUrl, diffScope: launchDiffScope, diffContext }; } + if (provider === "cursor") { + // Cursor has no schema flag — its marker-block output contract lives in + // CURSOR_REVIEW_PROMPT. captureStdout is required (the marker block comes + // back on stdout, like Claude). buildCursorCommand puts the prompt on + // stdin and threads --workspace=cwd to match the spawn cwd. + const model = typeof config?.model === "string" && config.model ? config.model : undefined; + const prompt = CURSOR_REVIEW_PROMPT + "\n\n---\n\n" + userMessage; + const { command, stdinPrompt } = buildCursorCommand(prompt, model, cwd); + return { command, stdinPrompt, prompt, cwd, label: jobLabel, captureStdout: true, model, prUrl: launchPrUrl, diffScope: launchDiffScope, diffContext }; + } + return null; }, @@ -636,6 +653,43 @@ export async function startReviewServer(options: { return; } + // --- Cursor path --- + // FAIL-CLOSED: Cursor output is prompt-enforced (no schema flag), so any + // missing/malformed/schema/transform/insertion failure must MUTATE the job + // to failed — NEVER throw (agent-jobs.ts swallows throws, silently leaving + // an exit-0 job marked done). Mirrors the Tour fail-closed pattern below. + if (job.provider === "cursor") { + const output = meta.stdout ? parseCursorStreamOutput(meta.stdout) : null; + if (!output) { + job.status = "failed"; + job.error = "Cursor review output missing or unparseable (no valid marker JSON)."; + return; + } + + job.summary = { + correctness: output.summary.correctness, + explanation: output.summary.explanation, + confidence: output.summary.confidence, + }; + + if (output.findings.length > 0) { + const annotations = transformCursorFindings( + output.findings, + job.source, + cwd, + workspace ? (filePath) => workspace.normalizeAnnotationPath(filePath) : undefined, + ) + .map(a => ({ ...a, ...jobPrContext, ...(jobDiffScope && { diffScope: jobDiffScope }) })); + const result = externalAnnotations.addAnnotations({ annotations }); + if ("error" in result) { + job.status = "failed"; + job.error = `Cursor annotation insertion failed: ${result.error}`; + return; + } + } + return; + } + if (job.provider === "tour") { const { summary } = await tour.onJobComplete({ job, meta }); if (summary) { diff --git a/apps/pi-extension/vendor.sh b/apps/pi-extension/vendor.sh index 5c9a32544..e0ecadfdc 100755 --- a/apps/pi-extension/vendor.sh +++ b/apps/pi-extension/vendor.sh @@ -13,7 +13,7 @@ for f in feedback-templates prompts review-core diff-paths cli-pagination jj-cor done # Vendor review agent modules from packages/server/ — rewrite imports for generated/ layout -for f in agent-review-message codex-review claude-review path-utils; do +for f in agent-review-message codex-review claude-review cursor-review path-utils; do src="../../packages/server/$f.ts" printf '// @generated — DO NOT EDIT. Source: packages/server/%s.ts\n' "$f" | cat - "$src" \ | sed 's|from "./vcs"|from "./review-core.js"|' \ diff --git a/packages/server/agent-jobs.ts b/packages/server/agent-jobs.ts index 9a1424415..c6ee893bf 100644 --- a/packages/server/agent-jobs.ts +++ b/packages/server/agent-jobs.ts @@ -9,6 +9,7 @@ */ import { formatClaudeLogEvent } from "./claude-review"; +import { formatCursorLogEvent } from "./cursor-review"; import { type AgentJobInfo, type AgentJobEvent, @@ -46,6 +47,15 @@ const JOBS = `${BASE}/jobs`; const JOBS_STREAM = `${JOBS}/stream`; const CAPABILITIES = `${BASE}/capabilities`; +// Providers whose command is owned by the server. Client-supplied argv is never +// spawned for these — buildCommand must produce the command or the launch fails. +const SERVER_BUILT_PROVIDERS: ReadonlySet = new Set([ + "claude", + "codex", + "tour", + "cursor", +]); + // --------------------------------------------------------------------------- // Factory // --------------------------------------------------------------------------- @@ -110,6 +120,8 @@ export function createAgentJobHandler(options: AgentJobHandlerOptions): AgentJob { id: "claude", name: "Claude Code", available: !!Bun.which("claude") }, { id: "codex", name: "Codex CLI", available: !!Bun.which("codex") }, { id: "tour", name: "Code Tour", available: !!Bun.which("claude") || !!Bun.which("codex") }, + // Cursor CLI's binary is literally named `agent` (NOT `cursor`). + { id: "cursor", name: "Cursor CLI", available: !!Bun.which("agent") }, ]; const capabilitiesResponse: AgentCapabilities = { mode, @@ -254,6 +266,15 @@ export function createAgentJobHandler(options: AgentJobHandlerOptions): AgentJob } continue; } + // Cursor: map stream-json events (init/assistant/tool_call/result) + // into readable log deltas, applying the partial-output dedup rule. + if (provider === "cursor") { + const formatted = formatCursorLogEvent(line); + if (formatted !== null) { + broadcast({ type: "job:log", jobId: id, delta: formatted + '\n' }); + } + continue; + } try { const event = JSON.parse(line); if (event.type === 'result') continue; // handled in onJobComplete @@ -444,6 +465,22 @@ export function createAgentJobHandler(options: AgentJobHandlerOptions): AgentJob ); } + // Fail-closed enforcement for server-owned providers: the command MUST + // be built server-side. Client-supplied argv is never spawned for these + // providers — a null/throwing builder becomes an error, not a fallback. + const isServerBuiltProvider = SERVER_BUILT_PROVIDERS.has(provider); + if (isServerBuiltProvider) { + if (!options.buildCommand) { + return Response.json( + { error: `Provider ${provider} requires server-built command` }, + { status: 400 }, + ); + } + // Discard any client-supplied argv so a null build cleanly hits the + // `command.length === 0` guard below instead of falling through. + command = []; + } + // Try server-side command building for known providers let captureStdout = false; let stdinPrompt: string | undefined; diff --git a/packages/server/cursor-review.test.ts b/packages/server/cursor-review.test.ts new file mode 100644 index 000000000..bd724037d --- /dev/null +++ b/packages/server/cursor-review.test.ts @@ -0,0 +1,308 @@ +import { describe, expect, test } from "bun:test"; +import { + CURSOR_MARKER_OPEN, + CURSOR_MARKER_CLOSE, + buildCursorCommand, + reduceCursorStreamEvents, + parseCursorStreamOutput, + formatCursorLogEvent, + transformCursorFindings, + validateCursorReviewOutput, + type CursorFinding, + type CursorReviewOutput, +} from "./cursor-review"; + +// --------------------------------------------------------------------------- +// Helpers — build NDJSON the way Cursor `stream-json` does. +// --------------------------------------------------------------------------- + +/** Wrap a JSON object as one NDJSON record (line + newline). */ +function ndjson(obj: unknown): string { + return JSON.stringify(obj) + "\n"; +} + +/** A real partial-output assistant delta: timestamp_ms present, model_call_id absent. */ +function assistantDelta(text: string, ts = 1): string { + return ndjson({ type: "assistant", timestamp_ms: ts, message: { content: text } }); +} + +/** A duplicate assistant flush: model_call_id present → must be skipped. */ +function assistantDuplicate(text: string, ts = 1): string { + return ndjson({ type: "assistant", timestamp_ms: ts, model_call_id: "call_1", message: { content: text } }); +} + +/** Wrap review JSON in the marker block. */ +function markerBlock(payload: unknown): string { + return `${CURSOR_MARKER_OPEN}\n${JSON.stringify(payload, null, 2)}\n${CURSOR_MARKER_CLOSE}`; +} + +const validReview: CursorReviewOutput = { + findings: [ + { + file: "packages/server/review.ts", + line: 123, + end_line: 130, + severity: "important", + description: "Null deref on the unhappy path.", + reasoning: "The guard above only covers the happy path.", + }, + ], + summary: { correctness: "Issues Found", explanation: "One important issue.", confidence: 0.85 }, +}; + +/** Split a string into N near-equal chunks to simulate arbitrary pipe boundaries. */ +function chunkify(s: string, parts: number): string[] { + const size = Math.ceil(s.length / parts); + const out: string[] = []; + for (let i = 0; i < s.length; i += size) out.push(s.slice(i, i + size)); + return out; +} + +// --------------------------------------------------------------------------- +// Command construction +// --------------------------------------------------------------------------- + +describe("buildCursorCommand", () => { + test("uses the `agent` binary and read-only flags, prompt on stdin", () => { + const { command, stdinPrompt } = buildCursorCommand("review this", "gpt-5", "/repo"); + expect(command[0]).toBe("agent"); + expect(command).toContain("-p"); + // Read-only posture: --mode ask + --sandbox enabled, NO --force/--yolo. + expect(command).toContain("--mode"); + expect(command[command.indexOf("--mode") + 1]).toBe("ask"); + expect(command).toContain("--sandbox"); + expect(command[command.indexOf("--sandbox") + 1]).toBe("enabled"); + expect(command).not.toContain("--force"); + expect(command).not.toContain("--yolo"); + // stream-json + partial output for live logs. + expect(command).toContain("--output-format"); + expect(command[command.indexOf("--output-format") + 1]).toBe("stream-json"); + expect(command).toContain("--stream-partial-output"); + // workspace matches launch cwd; model present. + expect(command[command.indexOf("--workspace") + 1]).toBe("/repo"); + expect(command[command.indexOf("--model") + 1]).toBe("gpt-5"); + // Prompt is NOT in argv — it goes on stdin. + expect(stdinPrompt).toBe("review this"); + expect(command).not.toContain("review this"); + }); + + test("omits --model when model is Auto or empty", () => { + expect(buildCursorCommand("p", "Auto", "/repo").command).not.toContain("--model"); + expect(buildCursorCommand("p", "", "/repo").command).not.toContain("--model"); + expect(buildCursorCommand("p", undefined, "/repo").command).not.toContain("--model"); + }); + + test("omits --workspace when no cwd is provided", () => { + expect(buildCursorCommand("p").command).not.toContain("--workspace"); + }); +}); + +// --------------------------------------------------------------------------- +// NDJSON reduction — line buffering across arbitrary chunk boundaries +// --------------------------------------------------------------------------- + +describe("reduceCursorStreamEvents", () => { + test("reconstructs canonical text regardless of how the buffer is chunked", () => { + const stdout = + ndjson({ type: "system", subtype: "init", model: "Auto" }) + + assistantDelta("Looking at ") + + assistantDelta("the diff.") + + ndjson({ type: "result", result: " Done." }); + + // Whole buffer (the accumulated stdout is what onJobComplete parses). + const whole = reduceCursorStreamEvents(stdout); + expect(whole.canonicalText).toBe("Looking at the diff. Done."); + + // Re-joining arbitrary chunk splits must yield identical canonical text — + // proves pipe chunk boundaries never corrupt a record. + for (const parts of [2, 3, 7, stdout.length]) { + const rejoined = chunkify(stdout, parts).join(""); + expect(reduceCursorStreamEvents(rejoined).canonicalText).toBe("Looking at the diff. Done."); + } + }); + + test("filters duplicate partial-output flushes (model_call_id present)", () => { + const stdout = + assistantDelta("real ") + + assistantDuplicate("DUPLICATE") + // skipped: model_call_id present + assistantDelta("text"); + const { canonicalText } = reduceCursorStreamEvents(stdout); + expect(canonicalText).toBe("real text"); + expect(canonicalText).not.toContain("DUPLICATE"); + }); + + test("ignores malformed lines without aborting the reduction", () => { + const stdout = assistantDelta("a") + "this is not json\n" + assistantDelta("b"); + const { canonicalText, recordCount } = reduceCursorStreamEvents(stdout); + expect(canonicalText).toBe("ab"); + expect(recordCount).toBe(2); // malformed line not counted + }); +}); + +// --------------------------------------------------------------------------- +// Marker extraction — last block wins +// --------------------------------------------------------------------------- + +describe("parseCursorStreamOutput", () => { + test("extracts the LAST complete marker block from multi-message output", () => { + // An earlier (stale/draft) block followed by the final authoritative one. + const staleReview = { findings: [], summary: { correctness: "Correct", explanation: "draft", confidence: 0.1 } }; + const stdout = + assistantDelta("Here is a draft:\n" + markerBlock(staleReview) + "\n") + + assistantDelta("On reflection:\n" + markerBlock(validReview)); + + const out = parseCursorStreamOutput(stdout); + expect(out).not.toBeNull(); + expect(out!.summary.correctness).toBe("Issues Found"); + expect(out!.findings).toHaveLength(1); + expect(out!.findings[0].file).toBe("packages/server/review.ts"); + }); + + test("parses a single marker block spread across NDJSON records and chunks", () => { + const full = "Commentary first.\n" + markerBlock(validReview); + // Split the marker text itself across multiple assistant deltas. + const half = Math.floor(full.length / 2); + const stdout = assistantDelta(full.slice(0, half)) + assistantDelta(full.slice(half)); + // Then chunk the raw stdout at arbitrary byte boundaries. + const rejoined = chunkify(stdout, 9).join(""); + const out = parseCursorStreamOutput(rejoined); + expect(out).not.toBeNull(); + expect(out!.findings).toHaveLength(1); + }); + + test("returns null when the marker block is missing", () => { + const stdout = assistantDelta("I found no issues. Everything looks correct."); + expect(parseCursorStreamOutput(stdout)).toBeNull(); + }); + + test("returns null when the marker block is opened but never closed", () => { + const stdout = assistantDelta(`prose ${CURSOR_MARKER_OPEN}\n{"findings":[]`); + expect(parseCursorStreamOutput(stdout)).toBeNull(); + }); + + test("returns null when the marker block contains malformed JSON", () => { + const stdout = assistantDelta(`${CURSOR_MARKER_OPEN}\n{ findings: [ }\n${CURSOR_MARKER_CLOSE}`); + expect(parseCursorStreamOutput(stdout)).toBeNull(); + }); + + test("returns null when JSON is valid but schema-invalid", () => { + // Missing summary.confidence (and bad severity) — must fail validation. + const bad = { + findings: [{ file: "x.ts", line: 1, severity: "blocker", description: "d" }], + summary: { correctness: "Issues Found", explanation: "e" }, + }; + const stdout = assistantDelta(markerBlock(bad)); + expect(parseCursorStreamOutput(stdout)).toBeNull(); + }); + + test("returns null on empty stdout", () => { + expect(parseCursorStreamOutput("")).toBeNull(); + expect(parseCursorStreamOutput(" \n ")).toBeNull(); + }); + + test("accepts an empty finding set with a valid summary", () => { + const clean = { findings: [], summary: { correctness: "Correct", explanation: "No issues.", confidence: 1 } }; + const out = parseCursorStreamOutput(assistantDelta(markerBlock(clean))); + expect(out).not.toBeNull(); + expect(out!.findings).toHaveLength(0); + expect(out!.summary.correctness).toBe("Correct"); + }); +}); + +// --------------------------------------------------------------------------- +// Schema validator (direct) +// --------------------------------------------------------------------------- + +describe("validateCursorReviewOutput", () => { + test("defaults end_line to line and reasoning to empty string", () => { + const out = validateCursorReviewOutput({ + findings: [{ file: "a.ts", line: 5, severity: "nit", description: "d" }], + summary: { correctness: "Issues Found", explanation: "e", confidence: 0.5 }, + }); + expect(out).not.toBeNull(); + expect(out!.findings[0].end_line).toBe(5); + expect(out!.findings[0].reasoning).toBe(""); + }); + + test("rejects non-object, missing findings array, and bad severity", () => { + expect(validateCursorReviewOutput(null)).toBeNull(); + expect(validateCursorReviewOutput({ summary: { correctness: "c", explanation: "e", confidence: 1 } })).toBeNull(); + expect( + validateCursorReviewOutput({ + findings: [{ file: "a.ts", line: 1, severity: "high", description: "d" }], + summary: { correctness: "c", explanation: "e", confidence: 1 }, + }), + ).toBeNull(); + }); +}); + +// --------------------------------------------------------------------------- +// Log formatting +// --------------------------------------------------------------------------- + +describe("formatCursorLogEvent", () => { + test("maps init / assistant-delta / tool_call / result and skips noise", () => { + expect(formatCursorLogEvent(ndjson({ type: "system", subtype: "init", model: "Auto", session_id: "s1" }))).toContain("model=Auto"); + expect(formatCursorLogEvent(assistantDelta("hello").trim())).toBe("hello"); + expect(formatCursorLogEvent(ndjson({ type: "tool_call", subtype: "started", name: "read_file", args: { path: "x.ts" } }))).toContain("[read_file]"); + expect(formatCursorLogEvent(ndjson({ type: "tool_call", subtype: "completed", name: "read_file" }))).toBe("[read_file] completed"); + expect(formatCursorLogEvent(ndjson({ type: "result", duration_ms: 42, request_id: "r1" }))).toContain("42ms"); + }); + + test("skips duplicate assistant flushes and malformed lines", () => { + expect(formatCursorLogEvent(assistantDuplicate("dup").trim())).toBeNull(); + expect(formatCursorLogEvent("not json")).toBeNull(); + expect(formatCursorLogEvent(ndjson({ type: "user" }).trim())).toBeNull(); + }); +}); + +// --------------------------------------------------------------------------- +// Finding → annotation transform +// --------------------------------------------------------------------------- + +describe("transformCursorFindings", () => { + const findings: CursorFinding[] = [ + { + file: "/repo/packages/server/review.ts", + line: 10, + end_line: 12, + severity: "important", + description: "Off-by-one in the loop bound.", + reasoning: "Index reaches length, overrunning the array.", + }, + ]; + + test("produces the shared annotation shape with author Cursor and severity-prefixed text", () => { + const [a] = transformCursorFindings(findings, "agent-abcd1234", "/repo"); + expect(a.source).toBe("agent-abcd1234"); + expect(a.filePath).toBe("packages/server/review.ts"); // relativized to cwd + expect(a.lineStart).toBe(10); + expect(a.lineEnd).toBe(12); + expect(a.side).toBe("new"); + expect(a.scope).toBe("line"); + expect(a.type).toBe("comment"); + expect(a.author).toBe("Cursor"); + expect(a.severity).toBe("important"); + expect(a.text).toBe("[important] Off-by-one in the loop bound."); + expect(a.reasoning).toBe("Index reaches length, overrunning the array."); + }); + + test("applies the workspace path transform when provided", () => { + const [a] = transformCursorFindings(findings, "src", "/repo", (p) => `child/${p}`); + expect(a.filePath).toBe("child/packages/server/review.ts"); + }); + + test("drops findings without a file or numeric line", () => { + const dirty = [ + ...findings, + { file: "", line: 1, end_line: 1, severity: "nit", description: "x", reasoning: "" } as CursorFinding, + { file: "y.ts", line: undefined as unknown as number, end_line: 1, severity: "nit", description: "x", reasoning: "" } as CursorFinding, + ]; + expect(transformCursorFindings(dirty, "src", "/repo")).toHaveLength(1); + }); + + test("defaults lineEnd to lineStart when end_line is absent", () => { + const f = [{ file: "/repo/a.ts", line: 7, severity: "nit", description: "d", reasoning: "" } as unknown as CursorFinding]; + expect(transformCursorFindings(f, "src", "/repo")[0].lineEnd).toBe(7); + }); +}); diff --git a/packages/server/cursor-review.ts b/packages/server/cursor-review.ts new file mode 100644 index 000000000..fe6f4526f --- /dev/null +++ b/packages/server/cursor-review.ts @@ -0,0 +1,515 @@ +import { toRelativePath } from "./path-utils"; + +/** + * Cursor CLI Review Agent — prompt, command builder, NDJSON stream reducer, + * marker-block parser, and finding transformer. + * + * Cursor differs from Claude/Codex in exactly one way: its headless CLI + * (the binary is literally named `agent`) exposes no schema-validation flag + * (`--json-schema` / `--output-schema` / `structured_output`). Its final task + * output is prose. That earns strict *parsing* — Cursor is told to emit a + * marker-delimited JSON block, and we extract the LAST complete block from the + * reconstructed canonical text. It does NOT earn extra isolation, + * finding-policing, env scrubbing, or a per-job marker nonce. Cursor inherits + * process.env like Claude/Codex; its read-only posture comes only from the + * command flags (`--mode ask` + `--sandbox enabled` + no `--force`). + * + * Cursor `stream-json` is decoded with a real line-buffered NDJSON reducer + * (pipe chunk boundaries are NOT NDJSON record boundaries), producing live log + * deltas and the canonical assistant/result text used for marker extraction. + */ + +// --------------------------------------------------------------------------- +// Static marker — v1 uses a STATIC marker plus last-block selection (no nonce). +// If testing shows the model echoing the marker from the prompt, add a per-job +// nonce then — not before (per approach doc §Structured Output Policy). +// --------------------------------------------------------------------------- + +export const CURSOR_MARKER_OPEN = ""; +export const CURSOR_MARKER_CLOSE = ""; + +// --------------------------------------------------------------------------- +// Types — mirror ClaudeFinding/ClaudeReviewOutput, plus a freeform summary. +// --------------------------------------------------------------------------- + +export type CursorSeverity = "important" | "nit" | "pre_existing"; + +export interface CursorFinding { + file: string; + line: number; + end_line: number; + severity: CursorSeverity; + description: string; + reasoning: string; +} + +export interface CursorReviewSummary { + correctness: string; + explanation: string; + confidence: number; +} + +export interface CursorReviewOutput { + findings: CursorFinding[]; + summary: CursorReviewSummary; +} + +const VALID_SEVERITIES: ReadonlySet = new Set([ + "important", + "nit", + "pre_existing", +]); + +// --------------------------------------------------------------------------- +// Schema validator — hand-rolled (no Ajv): Cursor output is prompt-enforced, +// so validation is the floor that turns prose back into a trusted object. +// --------------------------------------------------------------------------- + +/** + * Validate a parsed object against the Cursor review schema. + * Returns the typed output, or null if the shape is invalid. + * + * Empty findings are valid as long as the JSON itself is valid and carries a + * valid summary (per approach doc §Structured Output Policy). + */ +export function validateCursorReviewOutput(parsed: unknown): CursorReviewOutput | null { + if (!parsed || typeof parsed !== "object") return null; + const obj = parsed as Record; + + if (!Array.isArray(obj.findings)) return null; + + const summary = obj.summary; + if (!summary || typeof summary !== "object") return null; + const s = summary as Record; + if (typeof s.correctness !== "string") return null; + if (typeof s.explanation !== "string") return null; + if (typeof s.confidence !== "number") return null; + + const findings: CursorFinding[] = []; + for (const raw of obj.findings) { + if (!raw || typeof raw !== "object") return null; + const f = raw as Record; + if (typeof f.file !== "string") return null; + if (typeof f.line !== "number") return null; + if (typeof f.severity !== "string" || !VALID_SEVERITIES.has(f.severity)) return null; + if (typeof f.description !== "string") return null; + + findings.push({ + file: f.file, + line: f.line, + end_line: typeof f.end_line === "number" ? f.end_line : f.line, + severity: f.severity as CursorSeverity, + description: f.description, + reasoning: typeof f.reasoning === "string" ? f.reasoning : "", + }); + } + + return { + findings, + summary: { + correctness: s.correctness, + explanation: s.explanation, + confidence: s.confidence, + }, + }; +} + +// --------------------------------------------------------------------------- +// Review prompt — mirrors CLAUDE_REVIEW_PROMPT, ending with the marker-block +// output contract (Cursor has no schema flag, so the contract is the prompt). +// --------------------------------------------------------------------------- + +export const CURSOR_REVIEW_PROMPT = `# Cursor Code Review System Prompt + +## Identity +You are a code review system. Your job is to find bugs that would break +production. You are not a linter, formatter, or style checker unless +project guidance files explicitly expand your scope. + +## Pipeline + +Step 1: Gather context + - Retrieve the PR diff or local diff (gh pr diff, git diff, or jj diff) + - Read CLAUDE.md and REVIEW.md at the repo root and in every directory + containing modified files + - Build a map of which rules apply to which file paths + - Identify any skip rules (paths, patterns, or file types to ignore) + +Step 2: Review for issues + - Logic errors, regressions, broken edge cases, build failures, and code + that will produce wrong results. + - Security vulnerabilities with concrete exploit paths, race conditions, and + incorrect assumptions about trust boundaries. + - Code quality: unnecessary duplication, missed reuse of existing utilities, + overly complex implementations a senior engineer would care about. + - Guideline compliance: clear, unambiguous violations of CLAUDE.md / REVIEW.md + where you can cite the exact rule broken. Respect all skip rules. + +Step 3: Validate each candidate finding + - Trace the actual code path to confirm the issue is real. + - Check whether the issue is handled elsewhere (try/catch, upstream guard, + fallback logic, type system guarantees). + - If validation fails, drop the finding silently. + - If validation passes, write a clear reasoning chain — this becomes the + \`reasoning\` field. + +Step 4: Classify each validated finding with exactly one severity: + + important — A bug that should be fixed before merging. Build failures, clear + logic errors, security vulnerabilities with exploit paths, data loss risks, + race conditions with observable consequences. + + nit — A minor issue worth fixing but non-blocking. Style deviations from + project guidelines, code quality concerns, unlikely-but-worth-noting edge + cases, convention violations that don't affect correctness. + + pre_existing — A bug that exists in the surrounding codebase but was NOT + introduced by this change. Only flag when directly relevant to the changed + code path. + +## Hard constraints +- Never approve or block the change. +- Never comment on formatting or code style unless guidance files say to. +- Never flag missing test coverage unless guidance files say to. +- Never invent rules — only enforce what CLAUDE.md or REVIEW.md state. +- Prefer silence over false positives — when in doubt, drop the finding. +- Do NOT modify files. This is a read-only review. +- Do NOT post any comments to GitHub or GitLab. Do NOT use gh pr comment, glab, + or any commenting tool. + +## Output contract +Your only machine-readable output is a single marker-delimited JSON block. +Any natural-language commentary you write must come BEFORE the final marker +block. Emit the block exactly once, as the last thing in your response: + +${CURSOR_MARKER_OPEN} +{ + "findings": [ + { + "file": "packages/server/review.ts", + "line": 123, + "end_line": 123, + "severity": "important", + "description": "The issue...", + "reasoning": "Why this is a real issue..." + } + ], + "summary": { + "correctness": "Issues Found", + "explanation": "One important issue was found.", + "confidence": 0.85 + } +} +${CURSOR_MARKER_CLOSE} + +Schema: +- findings: array of objects, each with + - file: string (path as shown in the diff) + - line: integer (start line, post-change numbering) + - end_line: integer (end line; equal to line for a single line) + - severity: one of "important", "nit", "pre_existing" + - description: string (one paragraph) + - reasoning: string (how the issue was confirmed) +- summary: object with + - correctness: string ("Correct" or "Issues Found") + - explanation: string (one sentence) + - confidence: number between 0 and 1 + +If no issues are found, return an empty "findings" array with a valid summary.`; + +// --------------------------------------------------------------------------- +// Command builder +// --------------------------------------------------------------------------- + +export interface CursorCommandResult { + command: string[]; + /** Prompt text written to stdin (preferred over argv, like Claude). */ + stdinPrompt: string; +} + +/** + * Build the `agent -p` command. NOTE the binary is `agent`, NOT `cursor`. + * + * Read-only posture comes entirely from `--mode ask` + `--sandbox enabled` and + * the absence of `--force`/`--yolo`. The prompt goes on stdin (preferred over + * argv — avoids quoting issues and argv size limits). `--model` is omitted when + * the model is `Auto`/empty so Cursor uses its default model selection. + * `--workspace` is set to the launch cwd when provided, matching the spawn cwd. + */ +export function buildCursorCommand(prompt: string, model?: string, cwd?: string): CursorCommandResult { + const useModel = !!model && model !== "Auto"; + + return { + command: [ + "agent", + "-p", + "--mode", + "ask", + "--output-format", + "stream-json", + "--stream-partial-output", + ...(cwd ? ["--workspace", cwd] : []), + "--sandbox", + "enabled", + ...(useModel ? ["--model", model] : []), + ], + stdinPrompt: prompt, + }; +} + +// --------------------------------------------------------------------------- +// Stream reduction — line-buffered NDJSON reducer. +// --------------------------------------------------------------------------- + +interface CursorStreamEvent { + type?: string; + subtype?: string; + timestamp_ms?: number; + model_call_id?: string; + message?: { content?: unknown }; + text?: string; + [key: string]: unknown; +} + +/** + * A partial-output assistant event is a real new text delta only when + * `timestamp_ms` is present AND `model_call_id` is absent. All other assistant + * flushes are duplicate re-emissions (per approach doc §Log Streaming). + */ +function isRealAssistantDelta(event: CursorStreamEvent): boolean { + return event.timestamp_ms !== undefined && event.model_call_id === undefined; +} + +/** Pull readable text out of an assistant event's content (string or parts). */ +function extractAssistantText(event: CursorStreamEvent): string { + if (typeof event.text === "string") return event.text; + const content = event.message?.content; + if (typeof content === "string") return content; + if (Array.isArray(content)) { + return content + .filter((p): p is { type?: string; text?: string } => !!p && typeof p === "object") + .filter((p) => p.type === "text" && typeof p.text === "string") + .map((p) => p.text as string) + .join(""); + } + return ""; +} + +export interface CursorStreamReduction { + /** Canonical assistant/result text — the substrate for marker extraction. */ + canonicalText: string; + /** Number of NDJSON records that parsed successfully. */ + recordCount: number; +} + +/** + * Reduce a complete Cursor `stream-json` stdout buffer into canonical text. + * + * Critically line-buffered: only complete lines (terminated by `\n`) are parsed. + * A trailing partial line with no newline is treated as complete (the process + * has exited by the time this runs on the accumulated buffer), but mid-stream + * chunk boundaries never corrupt a record because we join the whole buffer + * before splitting on newlines. + */ +export function reduceCursorStreamEvents(stdout: string): CursorStreamReduction { + let canonicalText = ""; + let recordCount = 0; + + if (!stdout) return { canonicalText, recordCount }; + + for (const rawLine of stdout.split("\n")) { + const line = rawLine.trim(); + if (!line) continue; + + let event: CursorStreamEvent; + try { + event = JSON.parse(line) as CursorStreamEvent; + } catch { + continue; // not a complete/valid NDJSON record — skip + } + recordCount++; + + if (event.type === "assistant") { + // For partial streaming, only append real new deltas. If no event in the + // stream carries timestamp_ms (partial streaming disabled), append the + // full assistant text once. + if (event.timestamp_ms !== undefined) { + if (isRealAssistantDelta(event)) canonicalText += extractAssistantText(event); + } else if (event.model_call_id === undefined) { + canonicalText += extractAssistantText(event); + } + } else if (event.type === "result") { + const resultText = + typeof event.result === "string" + ? event.result + : typeof event.text === "string" + ? event.text + : ""; + if (resultText) canonicalText += resultText; + } + } + + return { canonicalText, recordCount }; +} + +// --------------------------------------------------------------------------- +// Marker-block parsing — reduce → take the LAST complete marker block → parse → +// schema-validate. Returns null on any failure (caller fails the job). +// --------------------------------------------------------------------------- + +/** Extract the content of the LAST complete marker block from canonical text. */ +function extractLastMarkerBlock(text: string): string | null { + let result: string | null = null; + let searchFrom = 0; + + while (true) { + const open = text.indexOf(CURSOR_MARKER_OPEN, searchFrom); + if (open === -1) break; + const contentStart = open + CURSOR_MARKER_OPEN.length; + const close = text.indexOf(CURSOR_MARKER_CLOSE, contentStart); + if (close === -1) break; // no matching close — block is incomplete + result = text.slice(contentStart, close); + searchFrom = close + CURSOR_MARKER_CLOSE.length; + } + + return result; +} + +/** + * Parse Cursor `stream-json` stdout into a validated review output. + * + * Pipeline: line-buffered NDJSON reduce → reconstruct canonical text → take the + * LAST complete marker block → JSON.parse → schema-validate. Returns null on + * ANY failure (missing marker, malformed JSON, schema mismatch) so the caller + * can fail the job. + */ +export function parseCursorStreamOutput(stdout: string): CursorReviewOutput | null { + if (!stdout || !stdout.trim()) return null; + + const { canonicalText } = reduceCursorStreamEvents(stdout); + if (!canonicalText) return null; + + const block = extractLastMarkerBlock(canonicalText); + if (block === null) return null; + + let parsed: unknown; + try { + parsed = JSON.parse(block.trim()); + } catch { + return null; + } + + return validateCursorReviewOutput(parsed); +} + +// --------------------------------------------------------------------------- +// Live log formatter — maps one NDJSON event to a readable log line. +// --------------------------------------------------------------------------- + +/** + * Format one Cursor `stream-json` line for the LiveLogViewer. + * Returns a human-readable string, or null if the line should be skipped. + * + * Mirrors formatClaudeLogEvent. Applies the same partial-output dedup rule as + * the reducer: an assistant delta is shown only when `timestamp_ms` is present + * and `model_call_id` is absent. + */ +export function formatCursorLogEvent(line: string): string | null { + let event: CursorStreamEvent; + try { + event = JSON.parse(line) as CursorStreamEvent; + } catch { + return null; + } + + switch (event.type) { + case "system": { + if (event.subtype === "init") { + const model = typeof event.model === "string" ? event.model : undefined; + const sessionId = typeof event.session_id === "string" ? event.session_id : undefined; + const bits = ["[init]"]; + if (model) bits.push(`model=${model}`); + if (sessionId) bits.push(`session=${sessionId}`); + return bits.join(" "); + } + return null; + } + case "assistant": { + // Apply partial-output dedup: only show real new deltas. If the event + // carries no timestamp_ms at all (non-partial flush), show it once. + if (event.timestamp_ms !== undefined && !isRealAssistantDelta(event)) return null; + const text = extractAssistantText(event); + return text ? text : null; + } + case "tool_call": { + const name = typeof event.name === "string" ? event.name : "tool"; + if (event.subtype === "completed") { + return `[${name}] completed`; + } + // started (or unspecified) — show concise args, never full file contents. + const args = + typeof event.args === "string" + ? event.args.slice(0, 100) + : event.args !== undefined + ? JSON.stringify(event.args).slice(0, 100) + : ""; + return `[${name}] ${args}`.trimEnd(); + } + case "result": { + const duration = + typeof event.duration_ms === "number" ? `${event.duration_ms}ms` : undefined; + const requestId = typeof event.request_id === "string" ? event.request_id : undefined; + const bits = ["[result]"]; + if (duration) bits.push(duration); + if (requestId) bits.push(`request=${requestId}`); + return bits.length > 1 ? bits.join(" ") : null; + } + default: + return null; + } +} + +// --------------------------------------------------------------------------- +// Finding transform — Cursor findings → external annotations. +// Identical shape to transformClaudeFindings, with author "Cursor". +// --------------------------------------------------------------------------- + +export interface CursorReviewAnnotationInput { + source: string; + filePath: string; + lineStart: number; + lineEnd: number; + type: string; + side: string; + scope: string; + text: string; + severity: CursorSeverity; + reasoning: string; + author: string; +} + +/** Transform Cursor findings into the external annotation format. */ +export function transformCursorFindings( + findings: CursorFinding[], + source: string, + cwd?: string, + pathTransform?: (path: string) => string, +): CursorReviewAnnotationInput[] { + return findings + .filter((f) => f.file && typeof f.line === "number") + .map((f) => ({ + source, + filePath: pathTransform + ? pathTransform(toRelativePath(f.file, cwd)) + : toRelativePath(f.file, cwd), + lineStart: f.line, + lineEnd: f.end_line ?? f.line, + type: "comment", + side: "new", + scope: "line", + text: `[${f.severity}] ${f.description}`, + severity: f.severity, + reasoning: f.reasoning, + author: "Cursor", + })); +} diff --git a/packages/server/review.ts b/packages/server/review.ts index 33acff0d1..cc6fd245d 100644 --- a/packages/server/review.ts +++ b/packages/server/review.ts @@ -58,6 +58,12 @@ import { transformClaudeFindings, } from "./claude-review"; import { createTourSession, TOUR_EMPTY_OUTPUT_ERROR } from "./tour/tour-review"; +import { + CURSOR_REVIEW_PROMPT, + buildCursorCommand, + parseCursorStreamOutput, + transformCursorFindings, +} from "./cursor-review"; import { loadConfig, saveConfig, detectGitUser, getServerConfig } from "./config"; import { type PRMetadata, type PRReviewFileComment, type PRStackTree, type PRListItem, fetchPR, fetchPRFileContent, fetchPRContext, submitPRReview, fetchPRViewedFiles, markPRFilesViewed, fetchPRStack, fetchPRList, getPRUser, parsePRUrl, prRefFromMetadata, isSameProject, getDisplayRepo, getMRLabel, getMRNumberLabel } from "./pr"; import { AI_QUERY_ENDPOINT, createAIRuntime } from "./ai-runtime"; @@ -538,6 +544,17 @@ export async function startReviewServer( return { command, stdinPrompt, prompt, cwd, label: jobLabel, captureStdout: true, model, effort, prUrl: launchPrUrl, diffScope: launchDiffScope, diffContext }; } + if (provider === "cursor") { + // Cursor has no schema flag — its marker-block output contract lives in + // CURSOR_REVIEW_PROMPT. captureStdout is required (the marker block comes + // back on stdout, like Claude). buildCursorCommand puts the prompt on + // stdin and threads --workspace=cwd to match the spawn cwd. + const model = typeof config?.model === "string" && config.model ? config.model : undefined; + const prompt = CURSOR_REVIEW_PROMPT + "\n\n---\n\n" + userMessage; + const { command, stdinPrompt } = buildCursorCommand(prompt, model, cwd); + return { command, stdinPrompt, prompt, cwd, label: jobLabel, captureStdout: true, model, prUrl: launchPrUrl, diffScope: launchDiffScope, diffContext }; + } + return null; }, @@ -611,6 +628,43 @@ export async function startReviewServer( return; } + // --- Cursor path --- + // FAIL-CLOSED: Cursor output is prompt-enforced (no schema flag), so any + // missing/malformed/schema/transform/insertion failure must MUTATE the job + // to failed — NEVER throw (agent-jobs.ts swallows throws, silently leaving + // an exit-0 job marked done). Mirrors the Tour fail-closed pattern below. + if (job.provider === "cursor") { + const output = meta.stdout ? parseCursorStreamOutput(meta.stdout) : null; + if (!output) { + job.status = "failed"; + job.error = "Cursor review output missing or unparseable (no valid marker JSON)."; + return; + } + + job.summary = { + correctness: output.summary.correctness, + explanation: output.summary.explanation, + confidence: output.summary.confidence, + }; + + if (output.findings.length > 0) { + const annotations = transformCursorFindings( + output.findings, + job.source, + cwd, + workspace ? (filePath) => workspace.normalizeAnnotationPath(filePath) : undefined, + ) + .map(a => ({ ...a, ...jobPrContext, ...(jobDiffScope && { diffScope: jobDiffScope }) })); + const result = externalAnnotations.addAnnotations({ annotations }); + if ("error" in result) { + job.status = "failed"; + job.error = `Cursor annotation insertion failed: ${result.error}`; + return; + } + } + return; + } + // --- Tour path --- if (job.provider === "tour") { const { summary } = await tour.onJobComplete({ job, meta }); diff --git a/packages/ui/components/AgentsTab.tsx b/packages/ui/components/AgentsTab.tsx index 55ec9f6d4..73fa8c3e6 100644 --- a/packages/ui/components/AgentsTab.tsx +++ b/packages/ui/components/AgentsTab.tsx @@ -17,7 +17,7 @@ import { isTerminalStatus } from '@plannotator/shared/agent-jobs'; import { cn } from '../lib/utils'; import { ReviewAgentsIcon } from './ReviewAgentsIcon'; import { useAgentSettings } from '../hooks/useAgentSettings'; -import type { AgentEngine, AgentMode } from '../hooks/useAgentSettings'; +import type { AgentEngine, AgentMode, ReviewEngine } from '../hooks/useAgentSettings'; // --- Agent option catalogs (shared across review + tour engine dropdowns) --- @@ -68,6 +68,12 @@ const TOUR_CLAUDE_MODELS: Array<{ value: string; label: string }> = [ { value: 'opus', label: 'Opus (thorough)' }, ]; +// Cursor v1 minimal catalog — just Auto. Model discovery waits on a +// provider-metadata contract (see approach doc § Model Discovery). +const CURSOR_MODELS: Array<{ value: string; label: string }> = [ + { value: 'Auto', label: 'Auto' }, +]; + const MODE_LABEL: Record = { review: 'Code Review', tour: 'Code Tour', @@ -78,6 +84,14 @@ const ENGINE_LABEL: Record = { codex: 'Codex', }; +// Review-only label map. Keeps Tour's ENGINE_LABEL exhaustive over the +// narrow AgentEngine union while letting the review surface offer Cursor. +const REVIEW_ENGINE_LABEL: Record = { + claude: 'Claude', + codex: 'Codex', + cursor: 'Cursor', +}; + interface AgentsTabProps { jobs: AgentJobInfo[]; capabilities: AgentCapabilities | null; @@ -146,6 +160,7 @@ function catalogLabel(list: Array<{ value: string; label: string }>, value: stri } function formatModel(provider: string, engine: string | undefined, model: string): string { + if (provider === 'cursor') return catalogLabel(CURSOR_MODELS, model); if (provider === 'codex' || engine === 'codex') return catalogLabel(CODEX_MODELS, model); if (provider === 'tour' && engine === 'claude') return catalogLabel(TOUR_CLAUDE_MODELS, model); return catalogLabel(CLAUDE_MODELS, model); @@ -376,6 +391,7 @@ export const AgentsTab: React.FC = ({ codexModel, codexReasoning, codexFast, + cursorModel, tourClaudeModel, tourClaudeEffort, tourCodexModel, @@ -389,6 +405,7 @@ export const AgentsTab: React.FC = ({ setCodexModel, setCodexReasoning, setCodexFast, + setCursorModel, setTourClaudeModel, setTourClaudeEffort, setTourCodexModel, @@ -399,7 +416,9 @@ export const AgentsTab: React.FC = ({ const claudeAvailable = capabilities?.providers.some((p) => p.id === 'claude' && p.available) ?? false; const codexAvailable = capabilities?.providers.some((p) => p.id === 'codex' && p.available) ?? false; const tourAvailable = capabilities?.providers.some((p) => p.id === 'tour' && p.available) ?? false; + const cursorAvailable = capabilities?.providers.some((p) => p.id === 'cursor' && p.available) ?? false; + // Tour engines (narrow union). Cursor is NOT included here — it is review-only. const availableEngines = useMemo(() => { const engines: AgentEngine[] = []; if (claudeAvailable) engines.push('claude'); @@ -407,15 +426,25 @@ export const AgentsTab: React.FC = ({ return engines; }, [claudeAvailable, codexAvailable]); + // Review engines (wide union) = tour engines + cursor when available. + const availableReviewEngines = useMemo(() => { + const engines: ReviewEngine[] = [...availableEngines]; + if (cursorAvailable) engines.push('cursor'); + return engines; + }, [availableEngines, cursorAvailable]); + const availableModes = useMemo(() => { const modes: AgentMode[] = []; - if (availableEngines.length > 0) modes.push('review'); + if (availableReviewEngines.length > 0) modes.push('review'); if (tourAvailable && availableEngines.length > 0) modes.push('tour'); return modes; - }, [availableEngines.length, tourAvailable]); + }, [availableReviewEngines.length, availableEngines.length, tourAvailable]); const firstAvailableEngine = availableEngines[0] ?? null; + const firstAvailableReviewEngine = availableReviewEngines[0] ?? null; const engineAvailable = (engine: AgentEngine) => engine === 'claude' ? claudeAvailable : codexAvailable; + const reviewEngineAvailable = (engine: ReviewEngine) => + engine === 'cursor' ? cursorAvailable : engineAvailable(engine); // Reconcile mode + engine choices against live capabilities. Runs when // capabilities change or the stored selection becomes invalid. @@ -424,13 +453,17 @@ export const AgentsTab: React.FC = ({ if (!selectedMode || !availableModes.includes(selectedMode)) { setSelectedMode(availableModes[0]); } - if (!firstAvailableEngine) return; - if (!engineAvailable(reviewEngine)) setReviewEngine(firstAvailableEngine); - if (!engineAvailable(tourEngine)) setTourEngine(firstAvailableEngine); + if (firstAvailableReviewEngine && !reviewEngineAvailable(reviewEngine)) { + setReviewEngine(firstAvailableReviewEngine); + } + if (firstAvailableEngine && !engineAvailable(tourEngine)) { + setTourEngine(firstAvailableEngine); + } }, [ capabilities, availableModes, firstAvailableEngine, + firstAvailableReviewEngine, selectedMode, reviewEngine, tourEngine, @@ -466,10 +499,19 @@ export const AgentsTab: React.FC = ({ ); type LaunchParams = Parameters[0]; - const buildReviewLaunch = (engine: AgentEngine): LaunchParams => { + const buildReviewLaunch = (engine: ReviewEngine): LaunchParams => { if (engine === 'claude') { return { provider: 'claude', label: 'Code Review', model: claudeModel, effort: claudeEffort }; } + if (engine === 'cursor') { + // Omission ⇒ Auto: drop model client-side when Auto so the POST carries + // no model and the server applies Cursor's default behavior. + return { + provider: 'cursor', + label: 'Code Review', + ...(cursorModel && cursorModel !== 'Auto' ? { model: cursorModel } : {}), + }; + } return { provider: 'codex', label: 'Code Review', @@ -489,7 +531,7 @@ export const AgentsTab: React.FC = ({ }); const canLaunch = selectedMode === 'review' - ? engineAvailable(reviewEngine) + ? reviewEngineAvailable(reviewEngine) : selectedMode === 'tour' ? tourAvailable && engineAvailable(tourEngine) : false; @@ -501,6 +543,7 @@ export const AgentsTab: React.FC = ({ const modeOptions = availableModes.map((mode) => ({ value: mode, label: MODE_LABEL[mode] })); const engineOptions = availableEngines.map((engine) => ({ value: engine, label: ENGINE_LABEL[engine] })); + const reviewEngineOptions = availableReviewEngines.map((engine) => ({ value: engine, label: REVIEW_ENGINE_LABEL[engine] })); const renderStaticChoice = (label: string, icon?: React.ReactNode) => (
{icon} @@ -508,16 +551,23 @@ export const AgentsTab: React.FC = ({
); - const renderEngineSelect = (value: AgentEngine, onChange: (engine: AgentEngine) => void) => ( + // Engine picker, parameterized by the engine list + options so the review + // surface can use the wide (cursor-inclusive) variant while Tour stays narrow. + const renderEngineSelect = ( + value: T, + onChange: (engine: T) => void, + options: Array<{ value: string; label: string }>, + fallbackLabel: string, + ) => ( - {availableEngines.length > 1 ? ( + {options.length > 1 ? ( onChange(next as AgentEngine)} + options={options} + onChange={(next) => onChange(next as T)} /> ) : ( - renderStaticChoice(engineOptions[0]?.label ?? ENGINE_LABEL[value]) + renderStaticChoice(options[0]?.label ?? fallbackLabel) )} ); @@ -549,7 +599,7 @@ export const AgentsTab: React.FC = ({ {selectedMode === 'review' && ( <> - {renderEngineSelect(reviewEngine, setReviewEngine)} + {renderEngineSelect(reviewEngine, setReviewEngine, reviewEngineOptions, REVIEW_ENGINE_LABEL[reviewEngine])} {reviewEngine === 'claude' && ( <> @@ -573,12 +623,23 @@ export const AgentsTab: React.FC = ({ )} + {reviewEngine === 'cursor' && ( + <> +
+ experimental + Findings are prompt-enforced +
+ + + + + )} )} {selectedMode === 'tour' && ( <> - {renderEngineSelect(tourEngine, setTourEngine)} + {renderEngineSelect(tourEngine, setTourEngine, engineOptions, ENGINE_LABEL[tourEngine])} ; } +// Cursor has no per-model sub-settings (no effort/reasoning), so a flat +// { model } section is enough — deliberately simpler than Claude/Codex. +interface CursorSection { + model: string; // 'Auto' or a discovered/custom model string +} + export type AgentMode = 'review' | 'tour'; export type AgentEngine = 'claude' | 'codex'; +// Review-only engine union. Tour stays on the narrow AgentEngine so its +// exhaustive Record maps remain valid without change. +export type ReviewEngine = AgentEngine | 'cursor'; interface AgentSettingsState { selectedMode?: AgentMode; - reviewEngine: AgentEngine; + reviewEngine: ReviewEngine; tourEngine: AgentEngine; claude: ClaudeSection; codex: CodexSection; + cursor: CursorSection; tourClaude: ClaudeSection; tourCodex: CodexSection; } @@ -43,6 +54,7 @@ const initialState: AgentSettingsState = { tourEngine: 'claude', claude: { model: DEFAULT_CLAUDE_MODEL, perModel: {} }, codex: { model: DEFAULT_CODEX_MODEL, perModel: {} }, + cursor: { model: DEFAULT_CURSOR_MODEL }, tourClaude: { model: DEFAULT_TOUR_CLAUDE_MODEL, perModel: {} }, tourCodex: { model: DEFAULT_TOUR_CODEX_MODEL, perModel: {} }, }; @@ -70,6 +82,11 @@ function parseEngine(value: unknown): AgentEngine { return value === 'codex' ? 'codex' : 'claude'; } +function parseReviewEngine(value: unknown): ReviewEngine { + if (value === 'cursor') return 'cursor'; + return parseEngine(value); +} + function parseMode(value: unknown): AgentMode | undefined { if (value === 'review' || value === 'tour') return value; return undefined; @@ -82,7 +99,7 @@ function readCookie(): AgentSettingsState { const parsed = JSON.parse(raw); return { selectedMode: parseMode(parsed.selectedMode) ?? initialState.selectedMode, - reviewEngine: parseEngine(parsed.reviewEngine), + reviewEngine: parseReviewEngine(parsed.reviewEngine), tourEngine: parseEngine(parsed.tourEngine), claude: { model: typeof parsed.claude?.model === 'string' ? parsed.claude.model : DEFAULT_CLAUDE_MODEL, @@ -92,6 +109,9 @@ function readCookie(): AgentSettingsState { model: typeof parsed.codex?.model === 'string' ? parsed.codex.model : DEFAULT_CODEX_MODEL, perModel: sanitizeCodexPerModel(parsed.codex?.perModel), }, + cursor: { + model: typeof parsed.cursor?.model === 'string' ? parsed.cursor.model : DEFAULT_CURSOR_MODEL, + }, tourClaude: { model: typeof parsed.tourClaude?.model === 'string' ? parsed.tourClaude.model : DEFAULT_TOUR_CLAUDE_MODEL, perModel: parsed.tourClaude?.perModel ?? {}, @@ -117,7 +137,7 @@ export function useAgentSettings() { setState((s) => ({ ...s, selectedMode: mode })); }, []); - const setReviewEngine = useCallback((engine: AgentEngine) => { + const setReviewEngine = useCallback((engine: ReviewEngine) => { setState((s) => ({ ...s, reviewEngine: engine })); }, []); @@ -155,6 +175,10 @@ export function useAgentSettings() { setState((s) => ({ ...s, codex: { ...s.codex, model } })); }, []); + const setCursorModel = useCallback((model: string) => { + setState((s) => ({ ...s, cursor: { ...s.cursor, model } })); + }, []); + const patchCodex = useCallback( ( section: 'codex' | 'tourCodex', @@ -223,6 +247,7 @@ export function useAgentSettings() { codexModel: state.codex.model, codexReasoning, codexFast, + cursorModel: state.cursor.model, tourClaudeModel: state.tourClaude.model, tourClaudeEffort, tourCodexModel: state.tourCodex.model, @@ -236,6 +261,7 @@ export function useAgentSettings() { setCodexModel, setCodexReasoning, setCodexFast, + setCursorModel, setTourClaudeModel, setTourClaudeEffort, setTourCodexModel, From 98e18b9098bc8d567552eea0406bf687737c138e Mon Sep 17 00:00:00 2001 From: Michael Ramos Date: Sat, 13 Jun 2026 14:17:54 -0700 Subject: [PATCH 2/7] fix(review): address Cursor provider review feedback (PR #912) - [P1] Pass the review prompt as the trailing positional argv arg instead of stdin. 'agent' reads task text from argv ([prompt...]); on stdin it received an empty prompt and never saw the diff/instructions, so no real review ran. - [P2] Add --trust to the headless command. Without it 'agent -p' stops on an interactive workspace-trust prompt a background job can't answer, failing the first review in any untrusted repo. Safe alongside --mode ask + --sandbox. - [P2] Derive the job verdict from finding severities (like Claude) instead of storing Cursor's free-form summary.correctness. A value like 'not correct' was accepted verbatim and the detail panel (string contains 'correct' except 'incorrect' -> green) would invert the displayed result. - [P3] Carry a partial-line buffer across stdout chunks in the live-log drainer. stream-json is NDJSON with arbitrary chunk boundaries; records split across chunks were dropped from live logs (fixes Claude's same latent bug too). All changes mirrored into the Pi runtime (serverReview.ts, agent-jobs.ts). Typecheck green; 58 tests pass across cursor-review + affected families. --- apps/pi-extension/server/agent-jobs.ts | 63 +++++++++++++----------- apps/pi-extension/server/serverReview.ts | 16 ++++-- packages/server/agent-jobs.ts | 61 ++++++++++++----------- packages/server/cursor-review.test.ts | 11 +++-- packages/server/cursor-review.ts | 19 ++++--- packages/server/review.ts | 16 ++++-- 6 files changed, 106 insertions(+), 80 deletions(-) diff --git a/apps/pi-extension/server/agent-jobs.ts b/apps/pi-extension/server/agent-jobs.ts index 696c56212..34648d767 100644 --- a/apps/pi-extension/server/agent-jobs.ts +++ b/apps/pi-extension/server/agent-jobs.ts @@ -195,40 +195,45 @@ export function createAgentJobHandler(options: AgentJobHandlerOptions) { if (spawnOptions?.cwd) jobOutputPaths.set(`${id}:cwd`, spawnOptions.cwd); broadcast({ type: "job:started", job: { ...info } }); - // --- Stdout capture (Claude JSONL streaming) --- + // --- Stdout capture (Claude/Cursor stream-json) --- let stdoutBuf = ""; if (captureStdout && proc.stdout) { + // Format one complete JSONL line into a live-log delta (skip result + // events — handled in onJobComplete). + const emitLogLine = (line: string) => { + if (!line.trim()) return; + // Tour jobs with the Claude engine also stream Claude JSONL. + if (provider === "claude" || spawnOptions?.engine === "claude") { + const formatted = formatClaudeLogEvent(line); + if (formatted !== null) broadcast({ type: "job:log", jobId: id, delta: formatted + '\n' }); + return; + } + // Cursor: map stream-json events (init/assistant/tool_call/result) + // into readable log deltas, applying the partial-output dedup rule. + if (provider === "cursor") { + const formatted = formatCursorLogEvent(line); + if (formatted !== null) broadcast({ type: "job:log", jobId: id, delta: formatted + '\n' }); + return; + } + try { + const event = JSON.parse(line); + if (event.type === 'result') return; + } catch { /* not JSON — forward as raw log */ } + broadcast({ type: "job:log", jobId: id, delta: line + '\n' }); + }; + // stream-json output is NDJSON and chunk boundaries are arbitrary — + // carry the trailing partial line until a later chunk completes it, + // otherwise records split across chunks are dropped from live logs. + let logLineCarry = ""; proc.stdout.on("data", (chunk: Buffer) => { const text = chunk.toString(); stdoutBuf += text; - - // Forward JSONL lines as log events - const lines = text.split('\n'); - for (const line of lines) { - if (!line.trim()) continue; - // Tour jobs with the Claude engine also stream Claude JSONL. - if (provider === "claude" || spawnOptions?.engine === "claude") { - const formatted = formatClaudeLogEvent(line); - if (formatted !== null) { - broadcast({ type: "job:log", jobId: id, delta: formatted + '\n' }); - } - continue; - } - // Cursor: map stream-json events (init/assistant/tool_call/result) - // into readable log deltas, applying the partial-output dedup rule. - if (provider === "cursor") { - const formatted = formatCursorLogEvent(line); - if (formatted !== null) { - broadcast({ type: "job:log", jobId: id, delta: formatted + '\n' }); - } - continue; - } - try { - const event = JSON.parse(line); - if (event.type === 'result') continue; - } catch { /* not JSON — forward as raw log */ } - broadcast({ type: "job:log", jobId: id, delta: line + '\n' }); - } + const lines = (logLineCarry + text).split('\n'); + logLineCarry = lines.pop() ?? ""; + for (const line of lines) emitLogLine(line); + }); + proc.stdout.on("end", () => { + if (logLineCarry) emitLogLine(logLineCarry); }); } diff --git a/apps/pi-extension/server/serverReview.ts b/apps/pi-extension/server/serverReview.ts index 20a8d7282..8743c252a 100644 --- a/apps/pi-extension/server/serverReview.ts +++ b/apps/pi-extension/server/serverReview.ts @@ -576,12 +576,12 @@ export async function startReviewServer(options: { if (provider === "cursor") { // Cursor has no schema flag — its marker-block output contract lives in // CURSOR_REVIEW_PROMPT. captureStdout is required (the marker block comes - // back on stdout, like Claude). buildCursorCommand puts the prompt on - // stdin and threads --workspace=cwd to match the spawn cwd. + // back on stdout, like Claude). buildCursorCommand passes the prompt as + // the trailing argv arg and threads --workspace=cwd to match the spawn cwd. const model = typeof config?.model === "string" && config.model ? config.model : undefined; const prompt = CURSOR_REVIEW_PROMPT + "\n\n---\n\n" + userMessage; - const { command, stdinPrompt } = buildCursorCommand(prompt, model, cwd); - return { command, stdinPrompt, prompt, cwd, label: jobLabel, captureStdout: true, model, prUrl: launchPrUrl, diffScope: launchDiffScope, diffContext }; + const { command } = buildCursorCommand(prompt, model, cwd); + return { command, prompt, cwd, label: jobLabel, captureStdout: true, model, prUrl: launchPrUrl, diffScope: launchDiffScope, diffContext }; } return null; @@ -666,8 +666,14 @@ export async function startReviewServer(options: { return; } + // Derive the verdict from finding severities (like Claude) rather than + // trusting Cursor's free-form `correctness` string. Cursor has no schema + // flag, so a model value like "not correct" would be stored verbatim and + // the detail panel (any string containing "correct" except "incorrect" → + // green) would invert the displayed result. + const hasImportant = output.findings.some((f) => f.severity === "important"); job.summary = { - correctness: output.summary.correctness, + correctness: hasImportant ? "Issues Found" : "Correct", explanation: output.summary.explanation, confidence: output.summary.confidence, }; diff --git a/packages/server/agent-jobs.ts b/packages/server/agent-jobs.ts index c6ee893bf..c53c2f090 100644 --- a/packages/server/agent-jobs.ts +++ b/packages/server/agent-jobs.ts @@ -247,41 +247,44 @@ export function createAgentJobHandler(options: AgentJobHandlerOptions): AgentJob let stdoutBuf = ""; const stdoutDone = (captureStdout && proc.stdout && typeof proc.stdout !== "number") ? (async () => { + // Format one complete JSONL line into a live-log delta (skip result + // events — those are handled in onJobComplete). + const emitLogLine = (line: string) => { + if (!line.trim()) return; + // Claude: format JSONL into readable text. Tour jobs with the + // Claude engine also stream Claude JSONL, so key off engine too. + if (provider === "claude" || spawnOptions?.engine === "claude") { + const formatted = formatClaudeLogEvent(line); + if (formatted !== null) broadcast({ type: "job:log", jobId: id, delta: formatted + '\n' }); + return; + } + // Cursor: map stream-json events (init/assistant/tool_call/result) + // into readable log deltas, applying the partial-output dedup rule. + if (provider === "cursor") { + const formatted = formatCursorLogEvent(line); + if (formatted !== null) broadcast({ type: "job:log", jobId: id, delta: formatted + '\n' }); + return; + } + try { + const event = JSON.parse(line); + if (event.type === 'result') return; + } catch { /* not JSON — forward as raw log */ } + broadcast({ type: "job:log", jobId: id, delta: line + '\n' }); + }; try { const reader = proc!.stdout as unknown as AsyncIterable; + // stream-json output is NDJSON and chunk boundaries are arbitrary — + // carry the trailing partial line until a later chunk completes it, + // otherwise records split across chunks are dropped from live logs. + let logLineCarry = ""; for await (const chunk of reader) { const text = typeof chunk === "string" ? chunk : new TextDecoder().decode(chunk); stdoutBuf += text; - - // Forward JSONL lines as log events (skip result events) - const lines = text.split('\n'); - for (const line of lines) { - if (!line.trim()) continue; - // Claude: format JSONL into readable text. Tour jobs with the - // Claude engine also stream Claude JSONL, so key off engine too. - if (provider === "claude" || spawnOptions?.engine === "claude") { - const formatted = formatClaudeLogEvent(line); - if (formatted !== null) { - broadcast({ type: "job:log", jobId: id, delta: formatted + '\n' }); - } - continue; - } - // Cursor: map stream-json events (init/assistant/tool_call/result) - // into readable log deltas, applying the partial-output dedup rule. - if (provider === "cursor") { - const formatted = formatCursorLogEvent(line); - if (formatted !== null) { - broadcast({ type: "job:log", jobId: id, delta: formatted + '\n' }); - } - continue; - } - try { - const event = JSON.parse(line); - if (event.type === 'result') continue; // handled in onJobComplete - } catch { /* not JSON — forward as raw log */ } - broadcast({ type: "job:log", jobId: id, delta: line + '\n' }); - } + const lines = (logLineCarry + text).split('\n'); + logLineCarry = lines.pop() ?? ""; + for (const line of lines) emitLogLine(line); } + if (logLineCarry) emitLogLine(logLineCarry); } catch { // Stream closed } diff --git a/packages/server/cursor-review.test.ts b/packages/server/cursor-review.test.ts index bd724037d..04ba82c48 100644 --- a/packages/server/cursor-review.test.ts +++ b/packages/server/cursor-review.test.ts @@ -63,8 +63,8 @@ function chunkify(s: string, parts: number): string[] { // --------------------------------------------------------------------------- describe("buildCursorCommand", () => { - test("uses the `agent` binary and read-only flags, prompt on stdin", () => { - const { command, stdinPrompt } = buildCursorCommand("review this", "gpt-5", "/repo"); + test("uses the `agent` binary and read-only flags, prompt as trailing argv", () => { + const { command } = buildCursorCommand("review this", "gpt-5", "/repo"); expect(command[0]).toBe("agent"); expect(command).toContain("-p"); // Read-only posture: --mode ask + --sandbox enabled, NO --force/--yolo. @@ -74,6 +74,8 @@ describe("buildCursorCommand", () => { expect(command[command.indexOf("--sandbox") + 1]).toBe("enabled"); expect(command).not.toContain("--force"); expect(command).not.toContain("--yolo"); + // Headless print mode needs --trust to skip the interactive workspace-trust prompt. + expect(command).toContain("--trust"); // stream-json + partial output for live logs. expect(command).toContain("--output-format"); expect(command[command.indexOf("--output-format") + 1]).toBe("stream-json"); @@ -81,9 +83,8 @@ describe("buildCursorCommand", () => { // workspace matches launch cwd; model present. expect(command[command.indexOf("--workspace") + 1]).toBe("/repo"); expect(command[command.indexOf("--model") + 1]).toBe("gpt-5"); - // Prompt is NOT in argv — it goes on stdin. - expect(stdinPrompt).toBe("review this"); - expect(command).not.toContain("review this"); + // Prompt is the trailing positional argv arg — agent reads it from argv, not stdin. + expect(command[command.length - 1]).toBe("review this"); }); test("omits --model when model is Auto or empty", () => { diff --git a/packages/server/cursor-review.ts b/packages/server/cursor-review.ts index fe6f4526f..2347f8221 100644 --- a/packages/server/cursor-review.ts +++ b/packages/server/cursor-review.ts @@ -223,18 +223,21 @@ If no issues are found, return an empty "findings" array with a valid summary.`; export interface CursorCommandResult { command: string[]; - /** Prompt text written to stdin (preferred over argv, like Claude). */ - stdinPrompt: string; } /** * Build the `agent -p` command. NOTE the binary is `agent`, NOT `cursor`. * * Read-only posture comes entirely from `--mode ask` + `--sandbox enabled` and - * the absence of `--force`/`--yolo`. The prompt goes on stdin (preferred over - * argv — avoids quoting issues and argv size limits). `--model` is omitted when - * the model is `Auto`/empty so Cursor uses its default model selection. - * `--workspace` is set to the launch cwd when provided, matching the spawn cwd. + * the absence of `--force`/`--yolo`. `--trust` is required in headless print + * mode: without it Cursor stops on an interactive workspace-trust prompt that a + * background job can never answer. It is safe here because the run is already + * constrained to read-only ask mode with the sandbox enabled. + * + * The prompt is the trailing positional argument — `agent` reads task text from + * argv (`[prompt...]`), not stdin. `--model` is omitted when the model is + * `Auto`/empty so Cursor uses its default model selection. `--workspace` is set + * to the launch cwd when provided, matching the spawn cwd. */ export function buildCursorCommand(prompt: string, model?: string, cwd?: string): CursorCommandResult { const useModel = !!model && model !== "Auto"; @@ -248,12 +251,14 @@ export function buildCursorCommand(prompt: string, model?: string, cwd?: string) "--output-format", "stream-json", "--stream-partial-output", + "--trust", ...(cwd ? ["--workspace", cwd] : []), "--sandbox", "enabled", ...(useModel ? ["--model", model] : []), + // Prompt is the trailing positional arg — agent reads it from argv, not stdin. + prompt, ], - stdinPrompt: prompt, }; } diff --git a/packages/server/review.ts b/packages/server/review.ts index cc6fd245d..515e0917c 100644 --- a/packages/server/review.ts +++ b/packages/server/review.ts @@ -547,12 +547,12 @@ export async function startReviewServer( if (provider === "cursor") { // Cursor has no schema flag — its marker-block output contract lives in // CURSOR_REVIEW_PROMPT. captureStdout is required (the marker block comes - // back on stdout, like Claude). buildCursorCommand puts the prompt on - // stdin and threads --workspace=cwd to match the spawn cwd. + // back on stdout, like Claude). buildCursorCommand passes the prompt as + // the trailing argv arg and threads --workspace=cwd to match the spawn cwd. const model = typeof config?.model === "string" && config.model ? config.model : undefined; const prompt = CURSOR_REVIEW_PROMPT + "\n\n---\n\n" + userMessage; - const { command, stdinPrompt } = buildCursorCommand(prompt, model, cwd); - return { command, stdinPrompt, prompt, cwd, label: jobLabel, captureStdout: true, model, prUrl: launchPrUrl, diffScope: launchDiffScope, diffContext }; + const { command } = buildCursorCommand(prompt, model, cwd); + return { command, prompt, cwd, label: jobLabel, captureStdout: true, model, prUrl: launchPrUrl, diffScope: launchDiffScope, diffContext }; } return null; @@ -641,8 +641,14 @@ export async function startReviewServer( return; } + // Derive the verdict from finding severities (like Claude) rather than + // trusting Cursor's free-form `correctness` string. Cursor has no schema + // flag, so a model value like "not correct" would be stored verbatim and + // the detail panel (any string containing "correct" except "incorrect" → + // green) would invert the displayed result. + const hasImportant = output.findings.some((f) => f.severity === "important"); job.summary = { - correctness: output.summary.correctness, + correctness: hasImportant ? "Issues Found" : "Correct", explanation: output.summary.explanation, confidence: output.summary.confidence, }; From ac41eac49796c1226c9f598352477b90e0da65e5 Mon Sep 17 00:00:00 2001 From: Michael Ramos Date: Sat, 13 Jun 2026 19:24:41 -0700 Subject: [PATCH 3/7] fix(review): address second-pass Cursor review feedback (PR #912) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - [P2] Skip Cursor's final buffered assistant flush in live logs. We always run with --stream-partial-output, so only real deltas (timestamp_ms present, model_call_id absent) should show; the no-timestamp end-of-turn flush repeats the whole assistant output. Added a test. - [important] Verify the 'agent' binary is actually Cursor before offering it. 'agent' is a generic name; capability detection now probes 'agent about --format json' (fast, unauthenticated) and requires a cliVersion field, so an unrelated 'agent' on PATH is never spawned with review content. - [nit] Cursor is fail-closed: an unexpected throw in onJobComplete now flips the job to failed instead of being swallowed (Claude/Codex stay fail-open). - [P3] Cursor jobs now label as 'Cursor' (was 'Shell') in the detail panel. - [nit] Cursor's single-option model row renders static text, not a dead dropdown. - [nit] Pi buildCommand JSDoc now lists cursor. All server changes mirrored Bun<->Pi. Typecheck green; 59 tests pass. Note: the argv-size finding was triaged and intentionally NOT changed — Cursor passes the prompt as argv exactly like Codex already does (parity, not a regression). The diff is only embedded for full-stack/workspace/unknown-VCS reviews; normal reviews hand the agent a git instruction or URL. --- apps/pi-extension/server/agent-jobs.ts | 40 ++++++++++++++--- .../dock/panels/ReviewAgentJobDetailPanel.tsx | 2 +- packages/server/agent-jobs.ts | 43 +++++++++++++++++-- packages/server/cursor-review.test.ts | 8 ++++ packages/server/cursor-review.ts | 9 ++-- packages/ui/components/AgentsTab.tsx | 6 ++- 6 files changed, 94 insertions(+), 14 deletions(-) diff --git a/apps/pi-extension/server/agent-jobs.ts b/apps/pi-extension/server/agent-jobs.ts index 34648d767..e3742fba4 100644 --- a/apps/pi-extension/server/agent-jobs.ts +++ b/apps/pi-extension/server/agent-jobs.ts @@ -64,7 +64,7 @@ export interface AgentJobHandlerOptions { mode: "plan" | "review" | "annotate"; getServerUrl: () => string; getCwd: () => string; - /** Server-side command builder for known providers (codex, claude, tour). */ + /** Server-side command builder for known providers (codex, claude, tour, cursor). */ buildCommand?: (provider: string, config?: Record) => Promise<{ command: string[]; outputPath?: string; @@ -94,6 +94,28 @@ export interface AgentJobHandlerOptions { onJobComplete?: (job: AgentJobInfo, meta: { outputPath?: string; stdout?: string; cwd?: string }) => void | Promise; } +/** + * `agent` is a very generic binary name — other tools ship binaries called + * `agent` too, so finding one on PATH does not prove it is Cursor. Verify its + * identity before offering Cursor as a provider. Cheap one-time probe: `agent + * about --format json` is fast, works unauthenticated, and only the Cursor CLI + * emits a JSON object with a `cliVersion` field. Any failure → not Cursor. + */ +function isCursorAgentAvailable(): boolean { + if (!whichCmd("agent")) return false; + try { + const out = execFileSync("agent", ["about", "--format", "json"], { + timeout: 3000, + stdio: ["ignore", "pipe", "ignore"], + encoding: "utf8", + }); + const parsed = JSON.parse(out) as { cliVersion?: unknown }; + return parsed != null && typeof parsed.cliVersion === "string"; + } catch { + return false; + } +} + export function createAgentJobHandler(options: AgentJobHandlerOptions) { const { mode, getServerUrl, getCwd } = options; @@ -108,8 +130,9 @@ export function createAgentJobHandler(options: AgentJobHandlerOptions) { { id: "claude", name: "Claude Code", available: whichCmd("claude") }, { id: "codex", name: "Codex CLI", available: whichCmd("codex") }, { id: "tour", name: "Code Tour", available: whichCmd("claude") || whichCmd("codex") }, - // Cursor CLI's binary is literally named `agent` (NOT `cursor`). - { id: "cursor", name: "Cursor CLI", available: whichCmd("agent") }, + // Cursor CLI's binary is literally named `agent` (NOT `cursor`), so verify + // identity rather than trusting the name alone (see isCursorAgentAvailable). + { id: "cursor", name: "Cursor CLI", available: isCursorAgentAvailable() }, ]; const capabilitiesResponse: AgentCapabilities = { mode, @@ -291,8 +314,15 @@ export function createAgentJobHandler(options: AgentJobHandlerOptions) { stdout: captureStdout ? stdoutBuf : undefined, cwd: jobCwd, }); - } catch { - // Result ingestion failure shouldn't prevent job completion broadcast + } catch (err) { + // Claude/Codex are fail-open; Cursor is fail-closed — an unexpected + // throw during prompt-enforced ingestion must fail the job, not pass + // it. (The Cursor handler normally fails by mutation and never throws; + // this guards future refactors.) + if (provider === "cursor") { + entry.info.status = "failed"; + entry.info.error = err instanceof Error ? err.message : "Cursor result ingestion failed"; + } } } jobOutputPaths.delete(id); diff --git a/packages/review-editor/dock/panels/ReviewAgentJobDetailPanel.tsx b/packages/review-editor/dock/panels/ReviewAgentJobDetailPanel.tsx index d1c2b2ff5..6afa5a5d1 100644 --- a/packages/review-editor/dock/panels/ReviewAgentJobDetailPanel.tsx +++ b/packages/review-editor/dock/panels/ReviewAgentJobDetailPanel.tsx @@ -350,7 +350,7 @@ function ProviderPill({ provider, engine, model }: { provider: string; engine?: const engineLabel = engine === 'codex' ? 'Codex' : 'Claude'; label = model && engine !== 'codex' ? `Tour · ${engineLabel} ${model.charAt(0).toUpperCase() + model.slice(1)}` : `Tour · ${engineLabel}`; } else { - label = provider === 'claude' ? 'Claude' : provider === 'codex' ? 'Codex' : 'Shell'; + label = provider === 'claude' ? 'Claude' : provider === 'codex' ? 'Codex' : provider === 'cursor' ? 'Cursor' : 'Shell'; } return ( void | Promise; } +/** + * `agent` is a very generic binary name — other tools ship binaries called + * `agent` too, so finding one on PATH does not prove it is Cursor. Confirm its + * identity before offering Cursor as a provider; otherwise selecting Cursor + * could spawn an unrelated tool with review content as an argument. + * + * Cheap one-time probe: `agent about --format json` is fast, works + * unauthenticated, and only the Cursor CLI emits a JSON object with a + * `cliVersion` field. Any failure/timeout/mismatch → treat as not Cursor. + */ +function isCursorAgentAvailable(): boolean { + if (!Bun.which("agent")) return false; + try { + const res = Bun.spawnSync(["agent", "about", "--format", "json"], { + stdout: "pipe", + stderr: "ignore", + timeout: 3000, + }); + if (!res.success) return false; + const parsed = JSON.parse(new TextDecoder().decode(res.stdout)) as { cliVersion?: unknown }; + return parsed != null && typeof parsed.cliVersion === "string"; + } catch { + return false; + } +} + export function createAgentJobHandler(options: AgentJobHandlerOptions): AgentJobHandler { const { mode, getServerUrl, getCwd } = options; @@ -120,8 +146,9 @@ export function createAgentJobHandler(options: AgentJobHandlerOptions): AgentJob { id: "claude", name: "Claude Code", available: !!Bun.which("claude") }, { id: "codex", name: "Codex CLI", available: !!Bun.which("codex") }, { id: "tour", name: "Code Tour", available: !!Bun.which("claude") || !!Bun.which("codex") }, - // Cursor CLI's binary is literally named `agent` (NOT `cursor`). - { id: "cursor", name: "Cursor CLI", available: !!Bun.which("agent") }, + // Cursor CLI's binary is literally named `agent` (NOT `cursor`), so verify + // identity rather than trusting the name alone (see isCursorAgentAvailable). + { id: "cursor", name: "Cursor CLI", available: isCursorAgentAvailable() }, ]; const capabilitiesResponse: AgentCapabilities = { mode, @@ -317,8 +344,16 @@ export function createAgentJobHandler(options: AgentJobHandlerOptions): AgentJob stdout: captureStdout ? stdoutBuf : undefined, cwd: jobCwd, }); - } catch { - // Result ingestion failure shouldn't prevent job completion broadcast + } catch (err) { + // Claude/Codex are fail-open: an ingestion error is logged but does + // not change the terminal state. Cursor is fail-closed — its findings + // are prompt-enforced, so an unexpected throw here must surface as a + // failed job rather than a green one. (The Cursor handler normally + // fails by mutation and never throws; this guards future refactors.) + if (provider === "cursor") { + entry.info.status = "failed"; + entry.info.error = err instanceof Error ? err.message : "Cursor result ingestion failed"; + } } } jobOutputPaths.delete(id); diff --git a/packages/server/cursor-review.test.ts b/packages/server/cursor-review.test.ts index 04ba82c48..1cf501f2b 100644 --- a/packages/server/cursor-review.test.ts +++ b/packages/server/cursor-review.test.ts @@ -255,6 +255,14 @@ describe("formatCursorLogEvent", () => { expect(formatCursorLogEvent("not json")).toBeNull(); expect(formatCursorLogEvent(ndjson({ type: "user" }).trim())).toBeNull(); }); + + test("skips the final buffered assistant flush (no timestamp_ms) under partial output", () => { + // End-of-turn flush carries no timestamp_ms and repeats already-streamed + // deltas — showing it would duplicate the whole assistant output in live logs. + expect( + formatCursorLogEvent(ndjson({ type: "assistant", message: { content: "FULL FLUSH" } }).trim()), + ).toBeNull(); + }); }); // --------------------------------------------------------------------------- diff --git a/packages/server/cursor-review.ts b/packages/server/cursor-review.ts index 2347f8221..9902d42c1 100644 --- a/packages/server/cursor-review.ts +++ b/packages/server/cursor-review.ts @@ -440,9 +440,12 @@ export function formatCursorLogEvent(line: string): string | null { return null; } case "assistant": { - // Apply partial-output dedup: only show real new deltas. If the event - // carries no timestamp_ms at all (non-partial flush), show it once. - if (event.timestamp_ms !== undefined && !isRealAssistantDelta(event)) return null; + // We always launch with --stream-partial-output, so the only assistant + // events worth showing live are real new deltas (timestamp_ms present, + // model_call_id absent). Everything else repeats already-streamed text: + // pre-tool-call duplicate flushes (model_call_id present) and the final + // buffered flush at end of turn (no timestamp_ms). Skip both. + if (!isRealAssistantDelta(event)) return null; const text = extractAssistantText(event); return text ? text : null; } diff --git a/packages/ui/components/AgentsTab.tsx b/packages/ui/components/AgentsTab.tsx index 73fa8c3e6..aa1b0e146 100644 --- a/packages/ui/components/AgentsTab.tsx +++ b/packages/ui/components/AgentsTab.tsx @@ -630,7 +630,11 @@ export const AgentsTab: React.FC = ({ Findings are prompt-enforced - + {CURSOR_MODELS.length > 1 ? ( + + ) : ( + renderStaticChoice(catalogLabel(CURSOR_MODELS, cursorModel)) + )} )} From a7f53fdd9e1825f6f8bd2e8eb5acd6d89389875b Mon Sep 17 00:00:00 2001 From: Michael Ramos Date: Sat, 13 Jun 2026 19:29:00 -0700 Subject: [PATCH 4/7] docs(review): clarify why the stream reducer keeps the assistant flush the live-log formatter drops Self-review follow-up: the assistant-dedup rule differs on purpose between reduceCursorStreamEvents (keeps the no-timestamp end-of-turn flush as a marker-parse safety net) and formatCursorLogEvent (drops it to avoid duplicating output in live logs). Documents the relationship so the two aren't mistakenly unified. Comment-only. --- packages/server/cursor-review.ts | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/packages/server/cursor-review.ts b/packages/server/cursor-review.ts index 9902d42c1..2b50b3cea 100644 --- a/packages/server/cursor-review.ts +++ b/packages/server/cursor-review.ts @@ -335,9 +335,15 @@ export function reduceCursorStreamEvents(stdout: string): CursorStreamReduction recordCount++; if (event.type === "assistant") { - // For partial streaming, only append real new deltas. If no event in the - // stream carries timestamp_ms (partial streaming disabled), append the - // full assistant text once. + // Append real new deltas (timestamp_ms present, model_call_id absent). + // The no-timestamp branch below intentionally KEEPS the end-of-turn flush + // — it covers both partial-streaming-disabled output (the full message + // arrives once) and the enabled-mode final flush, and keeping it is a + // parse-robustness safety net: it guarantees the marker block is present + // even if the deltas didn't fully carry it (extractLastMarkerBlock takes + // the LAST block, so the duplicate is harmless). This is deliberately MORE + // lenient than formatCursorLogEvent, which drops the flush so live logs + // don't repeat the whole assistant output. Do not "unify" the two. if (event.timestamp_ms !== undefined) { if (isRealAssistantDelta(event)) canonicalText += extractAssistantText(event); } else if (event.model_call_id === undefined) { From 39f10d85cb967c435891d6ab1c03d5893b497fd6 Mon Sep 17 00:00:00 2001 From: Michael Ramos Date: Sat, 13 Jun 2026 20:01:04 -0700 Subject: [PATCH 5/7] feat(review): upgrade the Cursor review prompt (investigation-first, repo-guidance-aware) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces the homemade Claude-derived prompt with an investigation-first prompt tuned for Cursor's agentic model: - Tells the agent to use tools — read the full module, trace call sites, check sibling code/tests, confirm the issue isn't already handled — before reporting. - Discovers and honors repo guidance if present (CLAUDE.md, REVIEW.md, AGENTS.md, .cursor/rules/*, .cursor/BUGBOT.md root+nested). So project-specific rules flow in from the repo instead of being hardcoded — same prompt works for any repo, and a repo's BUGBOT.md serves both Cursor Bugbot and this CLI review. - Adds an explicit 'Do NOT report' list (formatting/naming, speculative refactors, 'consider adding tests', exotic hypotheticals) to cut noise. - Requires impact + trigger in each finding; one finding per distinct bug. - Optimizes for findings a maintainer would actually fix. Output contract unchanged: same marker block and the same important/nit/pre_existing severities the parser, transform, and UI depend on. No test asserts prompt text; typecheck + 23 tests pass. --- packages/server/cursor-review.ts | 127 +++++++++++++++++-------------- 1 file changed, 70 insertions(+), 57 deletions(-) diff --git a/packages/server/cursor-review.ts b/packages/server/cursor-review.ts index 2b50b3cea..9653939c6 100644 --- a/packages/server/cursor-review.ts +++ b/packages/server/cursor-review.ts @@ -115,67 +115,77 @@ export function validateCursorReviewOutput(parsed: unknown): CursorReviewOutput } // --------------------------------------------------------------------------- -// Review prompt — mirrors CLAUDE_REVIEW_PROMPT, ending with the marker-block -// output contract (Cursor has no schema flag, so the contract is the prompt). +// Review prompt — investigation-first methodology tuned for Cursor's agentic +// model, ending with the marker-block output contract (Cursor has no schema +// flag, so the contract IS the prompt). Project-specific rules are NOT baked in +// here: the agent is told to discover and honor repo guidance files at review +// time, so this same prompt works for any repo `plannotator review` is run on. // --------------------------------------------------------------------------- -export const CURSOR_REVIEW_PROMPT = `# Cursor Code Review System Prompt - -## Identity -You are a code review system. Your job is to find bugs that would break -production. You are not a linter, formatter, or style checker unless -project guidance files explicitly expand your scope. - -## Pipeline - -Step 1: Gather context - - Retrieve the PR diff or local diff (gh pr diff, git diff, or jj diff) - - Read CLAUDE.md and REVIEW.md at the repo root and in every directory - containing modified files - - Build a map of which rules apply to which file paths - - Identify any skip rules (paths, patterns, or file types to ignore) - -Step 2: Review for issues - - Logic errors, regressions, broken edge cases, build failures, and code - that will produce wrong results. - - Security vulnerabilities with concrete exploit paths, race conditions, and - incorrect assumptions about trust boundaries. - - Code quality: unnecessary duplication, missed reuse of existing utilities, - overly complex implementations a senior engineer would care about. - - Guideline compliance: clear, unambiguous violations of CLAUDE.md / REVIEW.md - where you can cite the exact rule broken. Respect all skip rules. - -Step 3: Validate each candidate finding - - Trace the actual code path to confirm the issue is real. - - Check whether the issue is handled elsewhere (try/catch, upstream guard, - fallback logic, type system guarantees). - - If validation fails, drop the finding silently. - - If validation passes, write a clear reasoning chain — this becomes the - \`reasoning\` field. - -Step 4: Classify each validated finding with exactly one severity: - - important — A bug that should be fixed before merging. Build failures, clear - logic errors, security vulnerabilities with exploit paths, data loss risks, - race conditions with observable consequences. - - nit — A minor issue worth fixing but non-blocking. Style deviations from - project guidelines, code quality concerns, unlikely-but-worth-noting edge - cases, convention violations that don't affect correctness. - - pre_existing — A bug that exists in the surrounding codebase but was NOT - introduced by this change. Only flag when directly relevant to the changed - code path. +export const CURSOR_REVIEW_PROMPT = `# Code Review + +## Your role +You are a senior engineer reviewing a code change. Find the bugs a maintainer +would fix before merging — logic errors, regressions, broken edge cases, +security holes with a real exploit path, data-loss risks. You are not a linter +or style checker. Optimize for findings that get resolved, not for volume: a +few real, well-evidenced bugs beat a long list of nits. + +## Method — investigate before you report +You have tools. Use them. Do not judge the diff in isolation: + - Read the full function and module around each change, not just the hunk. + - Trace call sites and data flow to see how the changed code is actually used. + - Read any repo guidance and honor it if present: CLAUDE.md, REVIEW.md, + AGENTS.md, .cursor/rules/*, and .cursor/BUGBOT.md (root and any nested under + the directories you're reviewing). Treat these as authoritative for this + repo and respect any skip/ignore rules they define. + - Before reporting, check whether the issue is already handled elsewhere (a + guard, try/catch, fallback, type guarantee). + - Look at sibling code and tests: is the pattern you're flagging used safely + elsewhere? Is the failure path you fear already covered? + +## What to look for + - Correctness: logic errors, regressions, broken edge cases, off-by-one, + wrong results, build/type breakage. + - Robustness: unhandled errors on realistic paths, resource/process leaks, + races with observable consequences, state left inconsistent on failure. + - Security (only when the change introduces it): execution of + user/client-controlled input, trusting unvalidated external or model + output, weakened trust/read-only guarantees, newly exposed surfaces. + - Contracts & attribution: wrong file/line/path mapping, scope confusion, or a + change to one side of a contract (types, API shape, parallel + implementations) without the matching change on the other side. + - Clear, citable violations of the repo guidance files above. + +## Validate every finding +Confirm each candidate is real by tracing the actual code path. If you can't +show how it triggers, drop it — prefer silence over a false positive. The +confirmation you write becomes the \`reasoning\` field: what triggers it, what +breaks, and why it isn't already handled. + +## Severity — report what a maintainer would actually fix + important — should be fixed before merge: data loss, silent wrong results, + security with an exploit path, crashes/leaks, contract drift that breaks a + real consumer, fail-open where it must fail-closed. + + nit — minor and non-blocking: a real but low-impact edge case, missing + handling on an unlikely path, a robustness gap with an easy workaround. + + pre_existing — a genuine bug in surrounding code NOT introduced by this + change. Only flag when it directly affects the changed code path. + +## Do NOT report + - Formatting, naming, or style preferences (unless a repo guideline requires). + - "Consider adding tests" — unless the change adds non-trivial logic with no + test coverage at all. + - Hypothetical problems needing exotic conditions with no real code path. + - Speculative refactors or "this could be cleaner" outside the change's scope. ## Hard constraints -- Never approve or block the change. -- Never comment on formatting or code style unless guidance files say to. -- Never flag missing test coverage unless guidance files say to. -- Never invent rules — only enforce what CLAUDE.md or REVIEW.md state. -- Prefer silence over false positives — when in doubt, drop the finding. -- Do NOT modify files. This is a read-only review. +- This is a read-only review. Do NOT modify files. - Do NOT post any comments to GitHub or GitLab. Do NOT use gh pr comment, glab, or any commenting tool. +- Never approve or block the change. Your only output is findings. ## Output contract Your only machine-readable output is a single marker-delimited JSON block. @@ -208,14 +218,17 @@ Schema: - line: integer (start line, post-change numbering) - end_line: integer (end line; equal to line for a single line) - severity: one of "important", "nit", "pre_existing" - - description: string (one paragraph) + - description: string (one paragraph) — state the IMPACT (what breaks, for + whom) and the TRIGGER (when it happens); suggest a minimal fix if obvious - reasoning: string (how the issue was confirmed) - summary: object with - correctness: string ("Correct" or "Issues Found") - explanation: string (one sentence) - confidence: number between 0 and 1 -If no issues are found, return an empty "findings" array with a valid summary.`; +Cite file/line from the new (post-change) code. One finding per distinct bug — +do not stack unrelated issues. If no issues are found, return an empty +"findings" array with a valid summary.`; // --------------------------------------------------------------------------- // Command builder From dce4006449f0659e90947b81615e6cf99b859861 Mon Sep 17 00:00:00 2001 From: Michael Ramos Date: Sat, 13 Jun 2026 20:29:22 -0700 Subject: [PATCH 6/7] feat(review): discover Cursor models dynamically from `agent models` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces the hardcoded single-entry Cursor model list with the real, account-specific catalog. `agent models` returns ~139 models (current ones: Opus 4.8, GPT-5.5, Fable 5, Gemini 3.1, Grok, etc.) in a stable `id - Label` format — hardcoding any subset would go stale immediately. - cursor-review.ts: parseCursorModelsOutput() — runtime-agnostic parser for the `agent models` text (skips header/tip, dedupes). Shared, vendored to Pi. - agent-jobs.ts (Bun + Pi): discoverCursorModels() spawns `agent models` at review-server capability detection (best-effort, cached; empty when the CLI is unauthenticated) and attaches the catalog to the cursor AgentCapability. - AgentCapability gains an optional `models` field (deliberate, per approach doc — not smuggled). Cursor capability detection now also gated to review mode. - UI: AgentsTab drives the model picker from capabilities.cursor.models, falling back to an `auto`-only list when none are reported. Default model id is now `auto` (lowercase, matching the real catalog); buildCursorCommand omits --model for `auto` case-insensitively. Verified the parser against the live CLI: parses all 139 models. Typecheck green; 61 tests pass (incl. 3 new parser tests). Binary rebuilt. --- apps/pi-extension/server/agent-jobs.ts | 33 +++++++++++++++++++--- packages/server/agent-jobs.ts | 34 ++++++++++++++++++++--- packages/server/cursor-review.test.ts | 37 +++++++++++++++++++++++++ packages/server/cursor-review.ts | 38 +++++++++++++++++++++++++- packages/shared/agent-jobs.ts | 7 +++++ packages/ui/components/AgentsTab.tsx | 30 ++++++++++++++------ packages/ui/hooks/useAgentSettings.ts | 4 ++- 7 files changed, 164 insertions(+), 19 deletions(-) diff --git a/apps/pi-extension/server/agent-jobs.ts b/apps/pi-extension/server/agent-jobs.ts index e3742fba4..24f91b123 100644 --- a/apps/pi-extension/server/agent-jobs.ts +++ b/apps/pi-extension/server/agent-jobs.ts @@ -21,7 +21,7 @@ import { AGENT_HEARTBEAT_INTERVAL_MS, } from "../generated/agent-jobs.js"; import { formatClaudeLogEvent } from "../generated/claude-review.js"; -import { formatCursorLogEvent } from "../generated/cursor-review.js"; +import { formatCursorLogEvent, parseCursorModelsOutput, type CursorModel } from "../generated/cursor-review.js"; import { json, parseBody } from "./helpers.js"; // --------------------------------------------------------------------------- @@ -116,6 +116,24 @@ function isCursorAgentAvailable(): boolean { } } +/** + * Best-effort Cursor model catalog from `agent models`, parsed once. Empty when + * discovery fails or the CLI is unauthenticated — the UI falls back to an + * `auto`-only picker. Account-specific, so never hardcoded. + */ +function discoverCursorModels(): CursorModel[] { + try { + const out = execFileSync("agent", ["models"], { + timeout: 5000, + stdio: ["ignore", "pipe", "ignore"], + encoding: "utf8", + }); + return parseCursorModelsOutput(out); + } catch { + return []; + } +} + export function createAgentJobHandler(options: AgentJobHandlerOptions) { const { mode, getServerUrl, getCwd } = options; @@ -126,13 +144,20 @@ export function createAgentJobHandler(options: AgentJobHandlerOptions) { let version = 0; // --- Capability detection (run once) --- + // Cursor CLI's binary is literally named `agent` (NOT `cursor`), so verify + // identity rather than trusting the name alone. When present, also discover + // its account-specific model catalog so the UI doesn't hardcode model ids. + const cursorAvailable = mode === "review" && isCursorAgentAvailable(); const capabilities: AgentCapability[] = [ { id: "claude", name: "Claude Code", available: whichCmd("claude") }, { id: "codex", name: "Codex CLI", available: whichCmd("codex") }, { id: "tour", name: "Code Tour", available: whichCmd("claude") || whichCmd("codex") }, - // Cursor CLI's binary is literally named `agent` (NOT `cursor`), so verify - // identity rather than trusting the name alone (see isCursorAgentAvailable). - { id: "cursor", name: "Cursor CLI", available: isCursorAgentAvailable() }, + { + id: "cursor", + name: "Cursor CLI", + available: cursorAvailable, + ...(cursorAvailable ? { models: discoverCursorModels() } : {}), + }, ]; const capabilitiesResponse: AgentCapabilities = { mode, diff --git a/packages/server/agent-jobs.ts b/packages/server/agent-jobs.ts index 9750c7432..653aeef97 100644 --- a/packages/server/agent-jobs.ts +++ b/packages/server/agent-jobs.ts @@ -9,7 +9,7 @@ */ import { formatClaudeLogEvent } from "./claude-review"; -import { formatCursorLogEvent } from "./cursor-review"; +import { formatCursorLogEvent, parseCursorModelsOutput, type CursorModel } from "./cursor-review"; import { type AgentJobInfo, type AgentJobEvent, @@ -131,6 +131,25 @@ function isCursorAgentAvailable(): boolean { } } +/** + * Best-effort Cursor model catalog from `agent models`, parsed once and cached. + * Empty when discovery fails or the CLI is unauthenticated — the UI falls back + * to an `auto`-only picker. Account-specific, so never hardcoded. + */ +function discoverCursorModels(): CursorModel[] { + try { + const res = Bun.spawnSync(["agent", "models"], { + stdout: "pipe", + stderr: "ignore", + timeout: 5000, + }); + if (!res.success) return []; + return parseCursorModelsOutput(new TextDecoder().decode(res.stdout)); + } catch { + return []; + } +} + export function createAgentJobHandler(options: AgentJobHandlerOptions): AgentJobHandler { const { mode, getServerUrl, getCwd } = options; @@ -142,13 +161,20 @@ export function createAgentJobHandler(options: AgentJobHandlerOptions): AgentJob let version = 0; // --- Capability detection (run once) --- + // Cursor CLI's binary is literally named `agent` (NOT `cursor`), so verify + // identity rather than trusting the name alone. When present, also discover + // its account-specific model catalog so the UI doesn't hardcode model ids. + const cursorAvailable = mode === "review" && isCursorAgentAvailable(); const capabilities: AgentCapability[] = [ { id: "claude", name: "Claude Code", available: !!Bun.which("claude") }, { id: "codex", name: "Codex CLI", available: !!Bun.which("codex") }, { id: "tour", name: "Code Tour", available: !!Bun.which("claude") || !!Bun.which("codex") }, - // Cursor CLI's binary is literally named `agent` (NOT `cursor`), so verify - // identity rather than trusting the name alone (see isCursorAgentAvailable). - { id: "cursor", name: "Cursor CLI", available: isCursorAgentAvailable() }, + { + id: "cursor", + name: "Cursor CLI", + available: cursorAvailable, + ...(cursorAvailable ? { models: discoverCursorModels() } : {}), + }, ]; const capabilitiesResponse: AgentCapabilities = { mode, diff --git a/packages/server/cursor-review.test.ts b/packages/server/cursor-review.test.ts index 1cf501f2b..c1feb0cb4 100644 --- a/packages/server/cursor-review.test.ts +++ b/packages/server/cursor-review.test.ts @@ -8,6 +8,7 @@ import { formatCursorLogEvent, transformCursorFindings, validateCursorReviewOutput, + parseCursorModelsOutput, type CursorFinding, type CursorReviewOutput, } from "./cursor-review"; @@ -315,3 +316,39 @@ describe("transformCursorFindings", () => { expect(transformCursorFindings(f, "src", "/repo")[0].lineEnd).toBe(7); }); }); + +describe("parseCursorModelsOutput", () => { + test("parses `id - Label` lines and skips the header + tip", () => { + const out = [ + "Available models", + "", + "auto - Auto", + "gpt-5.2 - GPT-5.2", + "claude-opus-4-8-thinking-high - Opus 4.8 1M Thinking", + "composer-2.5 - Composer 2.5 (current)", + "", + "Tip: use --model (or /model in interactive mode) to switch.", + ].join("\n"); + const models = parseCursorModelsOutput(out); + expect(models).toEqual([ + { id: "auto", label: "Auto" }, + { id: "gpt-5.2", label: "GPT-5.2" }, + { id: "claude-opus-4-8-thinking-high", label: "Opus 4.8 1M Thinking" }, + { id: "composer-2.5", label: "Composer 2.5 (current)" }, + ]); + }); + + test("returns [] for unauthenticated / empty output", () => { + expect(parseCursorModelsOutput("No models available for this account.")).toEqual([]); + expect(parseCursorModelsOutput("")).toEqual([]); + expect(parseCursorModelsOutput("Not logged in")).toEqual([]); + }); + + test("dedupes repeated ids, keeping first", () => { + const models = parseCursorModelsOutput("auto - Auto\nauto - Auto Again\ngpt-5.2 - GPT-5.2"); + expect(models).toEqual([ + { id: "auto", label: "Auto" }, + { id: "gpt-5.2", label: "GPT-5.2" }, + ]); + }); +}); diff --git a/packages/server/cursor-review.ts b/packages/server/cursor-review.ts index 9653939c6..19e9e0ce3 100644 --- a/packages/server/cursor-review.ts +++ b/packages/server/cursor-review.ts @@ -253,7 +253,8 @@ export interface CursorCommandResult { * to the launch cwd when provided, matching the spawn cwd. */ export function buildCursorCommand(prompt: string, model?: string, cwd?: string): CursorCommandResult { - const useModel = !!model && model !== "Auto"; + // `auto` is Cursor's default model id — omit --model so the CLI chooses. + const useModel = !!model && model.toLowerCase() !== "auto"; return { command: [ @@ -275,6 +276,41 @@ export function buildCursorCommand(prompt: string, model?: string, cwd?: string) }; } +// --------------------------------------------------------------------------- +// Model discovery — parse `agent models` output. The spawn is runtime-specific +// (it lives in each server's agent-jobs adapter); this pure parser is shared. +// --------------------------------------------------------------------------- + +export interface CursorModel { + id: string; + label: string; +} + +/** + * Parse the text output of `agent models` / `agent --list-models` into a model + * catalog. The CLI prints one model per line as ` -