From d5aed7bea1ae9ff873c280b7e704057706435a77 Mon Sep 17 00:00:00 2001
From: James Grugett
Date: Thu, 30 Apr 2026 16:07:41 -0700
Subject: [PATCH 1/6] Add DeepSeek V4 provider

---
 agents/base2/base2-free-deepseek-v4.ts        |  11 +
 agents/types/agent-definition.ts              |   2 +
 common/src/constants/free-agents.ts           |  11 +-
 common/src/constants/freebuff-models.ts       |   7 +
 common/src/constants/model-config.ts          |   3 +
 .../types/agent-definition.ts                 |   2 +
 evals/buffbench/main-single-eval.ts           |   2 +-
 packages/internal/src/env-schema.ts           |   2 +
 packages/internal/src/env.ts                  |   1 +
 .../completions/__tests__/completions.test.ts | 113 +++
 web/src/app/api/v1/chat/completions/_post.ts  | 119 ++-
 web/src/llm-api/deepseek.ts                   | 769 ++++++++++++++++++
 web/src/server/free-session/config.ts         |   2 +
 13 files changed, 1004 insertions(+), 40 deletions(-)
 create mode 100644 agents/base2/base2-free-deepseek-v4.ts
 create mode 100644 web/src/llm-api/deepseek.ts

diff --git a/agents/base2/base2-free-deepseek-v4.ts b/agents/base2/base2-free-deepseek-v4.ts
new file mode 100644
index 000000000..19ca5a891
--- /dev/null
+++ b/agents/base2/base2-free-deepseek-v4.ts
@@ -0,0 +1,11 @@
+import { createBase2 } from './base2'
+
+const definition = {
+  ...createBase2('free', {
+    noAskUser: true,
+    model: 'deepseek/deepseek-v4-pro',
+  }),
+  id: 'base2-free-deepseek-v4',
+  displayName: 'Buffy the DeepSeek V4 Free Orchestrator',
+}
+export default definition
diff --git a/agents/types/agent-definition.ts b/agents/types/agent-definition.ts
index 088dd1dca..2d05e4e0b 100644
--- a/agents/types/agent-definition.ts
+++ b/agents/types/agent-definition.ts
@@ -415,6 +415,8 @@ export type ModelName =
   | 'qwen/qwen3-30b-a3b:nitro'
 
   // DeepSeek
+  | 'deepseek/deepseek-v4-pro'
+  | 'deepseek-v4-pro'
   | 'deepseek/deepseek-chat-v3-0324'
   | 'deepseek/deepseek-chat-v3-0324:nitro'
   | 'deepseek/deepseek-r1-0528'
diff --git a/common/src/constants/free-agents.ts b/common/src/constants/free-agents.ts
index 6d22152c5..e8a0d19b6 100644
--- a/common/src/constants/free-agents.ts
+++ b/common/src/constants/free-agents.ts
@@ -1,6 +1,9 @@
 import { parseAgentId } from '../util/agent-id-parsing'
 
-import { SUPPORTED_FREEBUFF_MODELS } from './freebuff-models'
+import {
+  FREEBUFF_DEEPSEEK_V4_PRO_MODEL_ID,
+  SUPPORTED_FREEBUFF_MODELS,
+} from './freebuff-models'
 
 import type { CostMode } from './model-config'
 
@@ -16,7 +19,10 @@ export const FREE_COST_MODE = 'free' as const
  * excluded — they're spawned by the root, so counting them would inflate
  * every user's apparent activity.
  */
-export const FREEBUFF_ROOT_AGENT_IDS = ['base2-free'] as const
+export const FREEBUFF_ROOT_AGENT_IDS = [
+  'base2-free',
+  'base2-free-deepseek-v4',
+] as const
 const FREEBUFF_ROOT_AGENT_ID_SET: ReadonlySet<string> = new Set(
   FREEBUFF_ROOT_AGENT_IDS,
 )
@@ -35,6 +41,7 @@ const FREEBUFF_ALLOWED_MODEL_IDS = SUPPORTED_FREEBUFF_MODELS.map(
 export const FREE_MODE_AGENT_MODELS: Record<string, Set<string>> = {
   // Root orchestrator
   'base2-free': new Set(FREEBUFF_ALLOWED_MODEL_IDS),
+  'base2-free-deepseek-v4': new Set([FREEBUFF_DEEPSEEK_V4_PRO_MODEL_ID]),
 
   // File exploration agents
   'file-picker': new Set(['google/gemini-2.5-flash-lite']),
diff --git a/common/src/constants/freebuff-models.ts b/common/src/constants/freebuff-models.ts
index 246731a3f..84daca5d8 100644
--- a/common/src/constants/freebuff-models.ts
+++ b/common/src/constants/freebuff-models.ts
@@ -22,6 +22,7 @@ export interface FreebuffModelOption {
  * `getFreebuffDeploymentAvailabilityLabel()` instead.
 
*/ export const FREEBUFF_DEPLOYMENT_HOURS_LABEL = '9am ET-5pm PT every day' export const FREEBUFF_GEMINI_PRO_MODEL_ID = 'google/gemini-3.1-pro-preview' +export const FREEBUFF_DEEPSEEK_V4_PRO_MODEL_ID = 'deepseek/deepseek-v4-pro' export const FREEBUFF_GLM_MODEL_ID = 'z-ai/glm-5.1' export const FREEBUFF_KIMI_MODEL_ID = 'moonshotai/kimi-k2.6' export const FREEBUFF_MINIMAX_MODEL_ID = 'minimax/minimax-m2.7' @@ -48,6 +49,12 @@ export const FREEBUFF_MODELS = [ tagline: 'Deepest, 1/day', availability: 'always', }, + { + id: FREEBUFF_DEEPSEEK_V4_PRO_MODEL_ID, + displayName: 'DeepSeek V4 Pro', + tagline: 'Experimental', + availability: 'always', + }, { id: FREEBUFF_MINIMAX_MODEL_ID, displayName: 'MiniMax M2.7', diff --git a/common/src/constants/model-config.ts b/common/src/constants/model-config.ts index 9be6d31e0..ced599fc2 100644 --- a/common/src/constants/model-config.ts +++ b/common/src/constants/model-config.ts @@ -6,6 +6,7 @@ export const ALLOWED_MODEL_PREFIXES = [ 'openai', 'google', 'x-ai', + 'deepseek', ] as const export const costModes = [ @@ -55,6 +56,8 @@ export type openrouterModel = export const deepseekModels = { deepseekChat: 'deepseek-chat', deepseekReasoner: 'deepseek-reasoner', + deepseekV4ProDirect: 'deepseek-v4-pro', + deepseekV4Pro: 'deepseek/deepseek-v4-pro', } as const export type DeepseekModel = (typeof deepseekModels)[keyof typeof deepseekModels] diff --git a/common/src/templates/initial-agents-dir/types/agent-definition.ts b/common/src/templates/initial-agents-dir/types/agent-definition.ts index 088dd1dca..2d05e4e0b 100644 --- a/common/src/templates/initial-agents-dir/types/agent-definition.ts +++ b/common/src/templates/initial-agents-dir/types/agent-definition.ts @@ -415,6 +415,8 @@ export type ModelName = | 'qwen/qwen3-30b-a3b:nitro' // DeepSeek + | 'deepseek/deepseek-v4-pro' + | 'deepseek-v4-pro' | 'deepseek/deepseek-chat-v3-0324' | 'deepseek/deepseek-chat-v3-0324:nitro' | 'deepseek/deepseek-r1-0528' diff --git a/evals/buffbench/main-single-eval.ts b/evals/buffbench/main-single-eval.ts index 6eceac7a5..bff2d322b 100644 --- a/evals/buffbench/main-single-eval.ts +++ b/evals/buffbench/main-single-eval.ts @@ -7,7 +7,7 @@ async function main() { await runBuffBench({ evalDataPaths: [path.join(__dirname, 'eval-codebuff.json')], - agents: ['base2-free-evals'], + agents: ['base2-free-deepseek-v4'], taskIds: ['server-agent-validation'], saveTraces, }) diff --git a/packages/internal/src/env-schema.ts b/packages/internal/src/env-schema.ts index a8af80f06..232309ba0 100644 --- a/packages/internal/src/env-schema.ts +++ b/packages/internal/src/env-schema.ts @@ -8,6 +8,7 @@ export const serverEnvSchema = clientEnvSchema.extend({ ANTHROPIC_API_KEY: z.string().min(1), FIREWORKS_API_KEY: z.string().min(1), CANOPYWAVE_API_KEY: z.string().min(1).optional(), + DEEPSEEK_API_KEY: z.string().min(1).optional(), SILICONFLOW_API_KEY: z.string().min(1).optional(), LINKUP_API_KEY: z.string().min(1), CONTEXT7_API_KEY: z.string().optional(), @@ -92,6 +93,7 @@ export const serverProcessEnv: ServerInput = { ANTHROPIC_API_KEY: process.env.ANTHROPIC_API_KEY, FIREWORKS_API_KEY: process.env.FIREWORKS_API_KEY, CANOPYWAVE_API_KEY: process.env.CANOPYWAVE_API_KEY, + DEEPSEEK_API_KEY: process.env.DEEPSEEK_API_KEY, SILICONFLOW_API_KEY: process.env.SILICONFLOW_API_KEY, LINKUP_API_KEY: process.env.LINKUP_API_KEY, CONTEXT7_API_KEY: process.env.CONTEXT7_API_KEY, diff --git a/packages/internal/src/env.ts b/packages/internal/src/env.ts index 3c3f60ce8..6edcea4d7 100644 --- a/packages/internal/src/env.ts +++ 
b/packages/internal/src/env.ts
@@ -18,6 +18,7 @@ if (isCI) {
   ensureEnvDefault('ANTHROPIC_API_KEY', 'test')
   ensureEnvDefault('FIREWORKS_API_KEY', 'test')
   ensureEnvDefault('CANOPYWAVE_API_KEY', 'test')
+  ensureEnvDefault('DEEPSEEK_API_KEY', 'test')
   ensureEnvDefault('LINKUP_API_KEY', 'test')
   ensureEnvDefault('GRAVITY_API_KEY', 'test')
   ensureEnvDefault('IPINFO_TOKEN', 'test')
diff --git a/web/src/app/api/v1/chat/completions/__tests__/completions.test.ts b/web/src/app/api/v1/chat/completions/__tests__/completions.test.ts
index cf846131c..12965104b 100644
--- a/web/src/app/api/v1/chat/completions/__tests__/completions.test.ts
+++ b/web/src/app/api/v1/chat/completions/__tests__/completions.test.ts
@@ -2,6 +2,7 @@ import { afterEach, beforeEach, describe, expect, mock, it } from 'bun:test'
 import { NextRequest } from 'next/server'
 
 import {
+  FREEBUFF_DEEPSEEK_V4_PRO_MODEL_ID,
   FREEBUFF_GEMINI_PRO_MODEL_ID,
   FREEBUFF_GLM_MODEL_ID,
   isFreebuffDeploymentHours,
@@ -147,6 +148,13 @@
       status: 'running',
     }
   }
+  if (runId === 'run-free-deepseek-v4') {
+    return {
+      agent_id: 'base2-free-deepseek-v4',
+      ancestor_run_ids: [],
+      status: 'running',
+    }
+  }
   if (runId === 'run-reviewer-direct') {
     return {
       agent_id: 'code-reviewer-lite',
@@ -823,6 +831,111 @@
     FETCH_PATH_TEST_TIMEOUT_MS,
   )
 
+  it(
+    'lets the DeepSeek V4 free agent use the direct DeepSeek provider',
+    async () => {
+      const fetchedBodies: Record<string, unknown>[] = []
+      const fetchedUrls: string[] = []
+      const fetchViaDeepSeek = mock(
+        async (url: string | URL | Request, init?: RequestInit) => {
+          fetchedUrls.push(String(url))
+          fetchedBodies.push(JSON.parse(init?.body as string))
+          return new Response(
+            JSON.stringify({
+              id: 'test-id',
+              model: 'deepseek-v4-pro',
+              choices: [{ message: { content: 'test response' } }],
+              usage: {
+                prompt_tokens: 10,
+                prompt_cache_hit_tokens: 4,
+                completion_tokens: 20,
+                total_tokens: 30,
+              },
+            }),
+            {
+              status: 200,
+              headers: { 'Content-Type': 'application/json' },
+            },
+          )
+        },
+      ) as unknown as typeof globalThis.fetch
+
+      const req = new NextRequest(
+        'http://localhost:3000/api/v1/chat/completions',
+        {
+          method: 'POST',
+          headers: allowedFreeModeHeaders('test-api-key-new-free'),
+          body: JSON.stringify({
+            model: FREEBUFF_DEEPSEEK_V4_PRO_MODEL_ID,
+            stream: false,
+            codebuff_metadata: {
+              run_id: 'run-free-deepseek-v4',
+              client_id: 'test-client-id-123',
+              cost_mode: 'free',
+            },
+          }),
+        },
+      )
+
+      const response = await postChatCompletions({
+        req,
+        getUserInfoFromApiKey: mockGetUserInfoFromApiKey,
+        logger: mockLogger,
+        trackEvent: mockTrackEvent,
+        getUserUsageData: mockGetUserUsageData,
+        getAgentRunFromId: mockGetAgentRunFromId,
+        fetch: fetchViaDeepSeek,
+        insertMessageBigquery: mockInsertMessageBigquery,
+        loggerWithContext: mockLoggerWithContext,
+        checkSessionAdmissible: mockCheckSessionAdmissibleAllow,
+      })
+
+      const body = await response.json()
+      expect(response.status).toBe(200)
+      expect(fetchedUrls[0]).toBe('https://api.deepseek.com/chat/completions')
+      expect(fetchedBodies[0].model).toBe('deepseek-v4-pro')
+      expect(body.model).toBe(FREEBUFF_DEEPSEEK_V4_PRO_MODEL_ID)
+      expect(body.provider).toBe('DeepSeek')
+    },
+    FETCH_PATH_TEST_TIMEOUT_MS,
+  )
+
+  it('rejects the DeepSeek V4 free agent when it requests another free model', async () => {
+    const req = new NextRequest(
+      'http://localhost:3000/api/v1/chat/completions',
+      {
+        method: 'POST',
+        headers: 
allowedFreeModeHeaders('test-api-key-new-free'), + body: JSON.stringify({ + model: FREEBUFF_GEMINI_PRO_MODEL_ID, + stream: false, + codebuff_metadata: { + run_id: 'run-free-deepseek-v4', + client_id: 'test-client-id-123', + cost_mode: 'free', + }, + }), + }, + ) + + const response = await postChatCompletions({ + req, + getUserInfoFromApiKey: mockGetUserInfoFromApiKey, + logger: mockLogger, + trackEvent: mockTrackEvent, + getUserUsageData: mockGetUserUsageData, + getAgentRunFromId: mockGetAgentRunFromId, + fetch: mockFetch, + insertMessageBigquery: mockInsertMessageBigquery, + loggerWithContext: mockLoggerWithContext, + checkSessionAdmissible: mockCheckSessionAdmissibleAllow, + }) + + const body = await response.json() + expect(response.status).toBe(403) + expect(body.error).toBe('free_mode_invalid_agent_model') + }) + it('lets freebuff use Gemini 3.1 Pro through the free-mode allowlist', async () => { const req = new NextRequest( 'http://localhost:3000/api/v1/chat/completions', diff --git a/web/src/app/api/v1/chat/completions/_post.ts b/web/src/app/api/v1/chat/completions/_post.ts index 0a7771d46..4c507c291 100644 --- a/web/src/app/api/v1/chat/completions/_post.ts +++ b/web/src/app/api/v1/chat/completions/_post.ts @@ -48,6 +48,12 @@ import { handleFireworksStream, isFireworksModel, } from '@/llm-api/fireworks' +import { + DeepSeekError, + handleDeepSeekNonStream, + handleDeepSeekStream, + isDeepSeekModel, +} from '@/llm-api/deepseek' import { SiliconFlowError, handleSiliconFlowNonStream, @@ -597,12 +603,15 @@ export async function postChatCompletions(params: { // Handle streaming vs non-streaming try { if (bodyStream) { - // Streaming request — route to SiliconFlow/CanopyWave/Fireworks for supported models + // Streaming request — route supported models to direct providers. const useSiliconFlow = false // isSiliconFlowModel(typedBody.model) const useCanopyWave = isCanopyWaveModel(typedBody.model) - const useFireworks = !useCanopyWave && isFireworksModel(typedBody.model) + const useDeepSeek = !useCanopyWave && isDeepSeekModel(typedBody.model) + const useFireworks = + !useCanopyWave && !useDeepSeek && isFireworksModel(typedBody.model) const useOpenAIDirect = !useCanopyWave && + !useDeepSeek && !useFireworks && isOpenAIDirectModel(typedBody.model) const stream = useSiliconFlow @@ -625,8 +634,8 @@ export async function postChatCompletions(params: { logger, insertMessageBigquery, }) - : useFireworks - ? await handleFireworksStream({ + : useDeepSeek + ? await handleDeepSeekStream({ body: typedBody, userId, stripeCustomerId, @@ -635,8 +644,8 @@ export async function postChatCompletions(params: { logger, insertMessageBigquery, }) - : useOpenAIDirect - ? await handleOpenAIStream({ + : useFireworks + ? await handleFireworksStream({ body: typedBody, userId, stripeCustomerId, @@ -645,16 +654,26 @@ export async function postChatCompletions(params: { logger, insertMessageBigquery, }) - : await handleOpenRouterStream({ - body: typedBody, - userId, - stripeCustomerId, - agentId, - openrouterApiKey, - fetch, - logger, - insertMessageBigquery, - }) + : useOpenAIDirect + ? 
await handleOpenAIStream({ + body: typedBody, + userId, + stripeCustomerId, + agentId, + fetch, + logger, + insertMessageBigquery, + }) + : await handleOpenRouterStream({ + body: typedBody, + userId, + stripeCustomerId, + agentId, + openrouterApiKey, + fetch, + logger, + insertMessageBigquery, + }) trackEvent({ event: AnalyticsEvent.CHAT_COMPLETIONS_STREAM_STARTED, @@ -679,9 +698,14 @@ export async function postChatCompletions(params: { const model = typedBody.model const useSiliconFlow = false // isSiliconFlowModel(model) const useCanopyWave = isCanopyWaveModel(model) - const useFireworks = !useCanopyWave && isFireworksModel(model) + const useDeepSeek = !useCanopyWave && isDeepSeekModel(model) + const useFireworks = + !useCanopyWave && !useDeepSeek && isFireworksModel(model) const shouldUseOpenAIEndpoint = - !useCanopyWave && !useFireworks && isOpenAIDirectModel(model) + !useCanopyWave && + !useDeepSeek && + !useFireworks && + isOpenAIDirectModel(model) const nonStreamRequest = useSiliconFlow ? handleSiliconFlowNonStream({ @@ -703,8 +727,8 @@ export async function postChatCompletions(params: { logger, insertMessageBigquery, }) - : useFireworks - ? handleFireworksNonStream({ + : useDeepSeek + ? handleDeepSeekNonStream({ body: typedBody, userId, stripeCustomerId, @@ -713,8 +737,8 @@ export async function postChatCompletions(params: { logger, insertMessageBigquery, }) - : shouldUseOpenAIEndpoint - ? handleOpenAINonStream({ + : useFireworks + ? handleFireworksNonStream({ body: typedBody, userId, stripeCustomerId, @@ -723,16 +747,26 @@ export async function postChatCompletions(params: { logger, insertMessageBigquery, }) - : handleOpenRouterNonStream({ - body: typedBody, - userId, - stripeCustomerId, - agentId, - openrouterApiKey, - fetch, - logger, - insertMessageBigquery, - }) + : shouldUseOpenAIEndpoint + ? handleOpenAINonStream({ + body: typedBody, + userId, + stripeCustomerId, + agentId, + fetch, + logger, + insertMessageBigquery, + }) + : handleOpenRouterNonStream({ + body: typedBody, + userId, + stripeCustomerId, + agentId, + openrouterApiKey, + fetch, + logger, + insertMessageBigquery, + }) const result = await nonStreamRequest trackEvent({ @@ -761,6 +795,10 @@ export async function postChatCompletions(params: { if (error instanceof CanopyWaveError) { canopywaveError = error } + let deepseekError: DeepSeekError | undefined + if (error instanceof DeepSeekError) { + deepseekError = error + } let siliconflowError: SiliconFlowError | undefined if (error instanceof SiliconFlowError) { siliconflowError = error @@ -776,11 +814,13 @@ export async function postChatCompletions(params: { ? 'SiliconFlow' : canopywaveError ? 'CanopyWave' - : fireworksError - ? 'Fireworks' - : openaiError - ? 'OpenAI' - : 'OpenRouter' + : deepseekError + ? 'DeepSeek' + : fireworksError + ? 'Fireworks' + : openaiError + ? 'OpenAI' + : 'OpenRouter' logger.error( { error: getErrorObject(error), @@ -798,6 +838,7 @@ export async function postChatCompletions(params: { openrouterError ?? fireworksError ?? canopywaveError ?? + deepseekError ?? siliconflowError ?? openaiError )?.statusCode, @@ -805,6 +846,7 @@ export async function postChatCompletions(params: { openrouterError ?? fireworksError ?? canopywaveError ?? + deepseekError ?? siliconflowError ?? 
openaiError )?.statusText, @@ -838,6 +880,9 @@ export async function postChatCompletions(params: { if (error instanceof CanopyWaveError) { return NextResponse.json(error.toJSON(), { status: error.statusCode }) } + if (error instanceof DeepSeekError) { + return NextResponse.json(error.toJSON(), { status: error.statusCode }) + } if (error instanceof SiliconFlowError) { return NextResponse.json(error.toJSON(), { status: error.statusCode }) } diff --git a/web/src/llm-api/deepseek.ts b/web/src/llm-api/deepseek.ts new file mode 100644 index 000000000..12ac66265 --- /dev/null +++ b/web/src/llm-api/deepseek.ts @@ -0,0 +1,769 @@ +import { Agent } from 'undici' + +import { deepseekModels } from '@codebuff/common/constants/model-config' +import { PROFIT_MARGIN } from '@codebuff/common/constants/limits' +import { getErrorObject } from '@codebuff/common/util/error' +import { env } from '@codebuff/internal/env' + +import { + consumeCreditsForMessage, + extractRequestMetadata, + insertMessageToBigQuery, +} from './helpers' + +import type { UsageData } from './helpers' +import type { InsertMessageBigqueryFn } from '@codebuff/common/types/contracts/bigquery' +import type { Logger } from '@codebuff/common/types/contracts/logger' +import type { ChatCompletionRequestBody } from './types' + +const DEEPSEEK_BASE_URL = 'https://api.deepseek.com' + +// Extended timeout for deep-thinking models that can take +// a long time to start streaming. +const DEEPSEEK_HEADERS_TIMEOUT_MS = 30 * 60 * 1000 + +const deepseekAgent = new Agent({ + headersTimeout: DEEPSEEK_HEADERS_TIMEOUT_MS, + bodyTimeout: 0, +}) + +// DeepSeek per-token pricing (dollars per token) +interface DeepSeekPricing { + inputCostPerToken: number + cachedInputCostPerToken: number + outputCostPerToken: number +} + +const DEEPSEEK_V4_PRO_PRICING: DeepSeekPricing = { + inputCostPerToken: 0.435 / 1_000_000, + cachedInputCostPerToken: 0.003625 / 1_000_000, + outputCostPerToken: 0.87 / 1_000_000, +} + +/** Single source of truth for DeepSeek model metadata and pricing. + * Kept as one map so adding a model can't drift between routing and billing. */ +const DEEPSEEK_MODELS: Record< + string, + { deepseekId: string; pricing: DeepSeekPricing } +> = { + [deepseekModels.deepseekV4ProDirect]: { + deepseekId: deepseekModels.deepseekV4ProDirect, + pricing: DEEPSEEK_V4_PRO_PRICING, + }, + [deepseekModels.deepseekV4Pro]: { + deepseekId: deepseekModels.deepseekV4ProDirect, + pricing: DEEPSEEK_V4_PRO_PRICING, + }, +} + +const DEEPSEEK_ROUTED_MODELS = new Set(Object.keys(DEEPSEEK_MODELS)) + +export function isDeepSeekModel(model: string): boolean { + return DEEPSEEK_ROUTED_MODELS.has(model) +} + +function getDeepSeekModelId(openrouterModel: string): string { + return DEEPSEEK_MODELS[openrouterModel]?.deepseekId ?? openrouterModel +} + +function getDeepSeekPricing(model: string): DeepSeekPricing { + const entry = DEEPSEEK_MODELS[model] + if (!entry) { + throw new Error(`No DeepSeek pricing found for model: ${model}`) + } + return entry.pricing +} + +type StreamState = { + responseText: string + reasoningText: string + ttftMs: number | null + billedAlready: boolean +} + +type LineResult = { + state: StreamState + billedCredits?: number + patchedLine: string +} + +function toDeepSeekReasoningEffort(effort: unknown): 'high' | 'max' { + return effort === 'max' || effort === 'xhigh' ? 
'max' : 'high'
+}
+
+function createDeepSeekRequest(params: {
+  body: ChatCompletionRequestBody
+  originalModel: string
+  fetch: typeof globalThis.fetch
+}) {
+  const { body, originalModel, fetch } = params
+  const deepseekBody: Record<string, unknown> = {
+    ...body,
+    model: getDeepSeekModelId(originalModel),
+  }
+
+  // DeepSeek uses `thinking` instead of OpenRouter's `reasoning`.
+  if (deepseekBody.reasoning && typeof deepseekBody.reasoning === 'object') {
+    const reasoning = deepseekBody.reasoning as {
+      enabled?: boolean
+      effort?: 'high' | 'medium' | 'low'
+    }
+    deepseekBody.thinking = {
+      type: reasoning.enabled === false ? 'disabled' : 'enabled',
+      reasoning_effort: toDeepSeekReasoningEffort(reasoning.effort),
+    }
+  } else if (deepseekBody.reasoning_effort) {
+    deepseekBody.thinking = {
+      type: 'enabled',
+      reasoning_effort: toDeepSeekReasoningEffort(
+        deepseekBody.reasoning_effort,
+      ),
+    }
+  }
+  delete deepseekBody.reasoning
+  delete deepseekBody.reasoning_effort
+
+  // Strip OpenRouter-specific / internal fields
+  delete deepseekBody.provider
+  delete deepseekBody.transforms
+  delete deepseekBody.codebuff_metadata
+  delete deepseekBody.usage
+
+  // For streaming, request usage in the final chunk
+  if (deepseekBody.stream) {
+    deepseekBody.stream_options = { include_usage: true }
+  }
+
+  if (!env.DEEPSEEK_API_KEY) {
+    throw new Error('DEEPSEEK_API_KEY is not configured')
+  }
+
+  return fetch(`${DEEPSEEK_BASE_URL}/chat/completions`, {
+    method: 'POST',
+    headers: {
+      Authorization: `Bearer ${env.DEEPSEEK_API_KEY}`,
+      'Content-Type': 'application/json',
+    },
+    body: JSON.stringify(deepseekBody),
+    // @ts-expect-error - dispatcher is a valid undici option not in fetch types
+    dispatcher: deepseekAgent,
+  })
+}
+
+function extractUsageAndCost(
+  usage: Record<string, unknown> | undefined | null,
+  model: string,
+): UsageData {
+  if (!usage)
+    return {
+      inputTokens: 0,
+      outputTokens: 0,
+      cacheReadInputTokens: 0,
+      reasoningTokens: 0,
+      cost: 0,
+    }
+  const completionDetails = usage.completion_tokens_details as
+    | Record<string, unknown>
+    | undefined
+    | null
+
+  const inputTokens =
+    typeof usage.prompt_tokens === 'number' ? usage.prompt_tokens : 0
+  const outputTokens =
+    typeof usage.completion_tokens === 'number' ? usage.completion_tokens : 0
+  const cacheReadInputTokens =
+    typeof usage.prompt_cache_hit_tokens === 'number'
+      ? usage.prompt_cache_hit_tokens
+      : 0
+  const reasoningTokens =
+    typeof completionDetails?.reasoning_tokens === 'number'
+      ? 
completionDetails.reasoning_tokens + : 0 + + const pricing = getDeepSeekPricing(model) + const nonCachedInputTokens = Math.max(0, inputTokens - cacheReadInputTokens) + const cost = + nonCachedInputTokens * pricing.inputCostPerToken + + cacheReadInputTokens * pricing.cachedInputCostPerToken + + outputTokens * pricing.outputCostPerToken + + return { + inputTokens, + outputTokens, + cacheReadInputTokens, + reasoningTokens, + cost, + } +} + +export async function handleDeepSeekNonStream({ + body, + userId, + stripeCustomerId, + agentId, + fetch, + logger, + insertMessageBigquery, +}: { + body: ChatCompletionRequestBody + userId: string + stripeCustomerId?: string | null + agentId: string + fetch: typeof globalThis.fetch + logger: Logger + insertMessageBigquery: InsertMessageBigqueryFn +}) { + const originalModel = body.model + const startTime = new Date() + const { clientId, clientRequestId, costMode } = extractRequestMetadata({ + body, + logger, + }) + + const response = await createDeepSeekRequest({ body, originalModel, fetch }) + + if (!response.ok) { + throw await parseDeepSeekError(response) + } + + const data = await response.json() + const content = data.choices?.[0]?.message?.content ?? '' + const reasoningText = + data.choices?.[0]?.message?.reasoning_content ?? + data.choices?.[0]?.message?.reasoning ?? + '' + const usageData = extractUsageAndCost(data.usage, originalModel) + + insertMessageToBigQuery({ + messageId: data.id, + userId, + startTime, + request: body, + reasoningText, + responseText: content, + usageData, + logger, + insertMessageBigquery, + }).catch((error) => { + logger.error({ error }, 'Failed to insert message into BigQuery') + }) + + const billedCredits = await consumeCreditsForMessage({ + messageId: data.id, + userId, + stripeCustomerId, + agentId, + clientId, + clientRequestId, + startTime, + model: originalModel, + reasoningText, + responseText: content, + usageData, + byok: false, + logger, + costMode, + ttftMs: null, // Non-stream - no TTFT to report + }) + + // Overwrite cost so SDK calculates exact credits we charged + if (data.usage) { + data.usage.cost = creditsToFakeCost(billedCredits) + data.usage.cost_details = { upstream_inference_cost: 0 } + } + + // Normalise model name back to OpenRouter format for client compatibility + data.model = originalModel + if (!data.provider) data.provider = 'DeepSeek' + + return data +} + +export async function handleDeepSeekStream({ + body, + userId, + stripeCustomerId, + agentId, + fetch, + logger, + insertMessageBigquery, +}: { + body: ChatCompletionRequestBody + userId: string + stripeCustomerId?: string | null + agentId: string + fetch: typeof globalThis.fetch + logger: Logger + insertMessageBigquery: InsertMessageBigqueryFn +}) { + const originalModel = body.model + const startTime = new Date() + const { clientId, clientRequestId, costMode } = extractRequestMetadata({ + body, + logger, + }) + + const response = await createDeepSeekRequest({ body, originalModel, fetch }) + + if (!response.ok) { + throw await parseDeepSeekError(response) + } + + const reader = response.body?.getReader() + if (!reader) { + throw new Error('Failed to get response reader') + } + + let heartbeatInterval: NodeJS.Timeout + let state: StreamState = { + responseText: '', + reasoningText: '', + ttftMs: null, + billedAlready: false, + } + let clientDisconnected = false + + const stream = new ReadableStream({ + async start(controller) { + const decoder = new TextDecoder() + let buffer = '' + + controller.enqueue( + new TextEncoder().encode(`: 
connected ${new Date().toISOString()}\n`),
+      )
+
+      heartbeatInterval = setInterval(() => {
+        if (!clientDisconnected) {
+          try {
+            controller.enqueue(
+              new TextEncoder().encode(
+                `: heartbeat ${new Date().toISOString()}\n\n`,
+              ),
+            )
+          } catch {
+            // client disconnected
+          }
+        }
+      }, 30000)
+
+      try {
+        let done = false
+        while (!done) {
+          const result = await reader.read()
+          done = result.done
+          const value = result.value
+
+          if (done) break
+
+          buffer += decoder.decode(value, { stream: true })
+          let lineEnd = buffer.indexOf('\n')
+
+          while (lineEnd !== -1) {
+            const line = buffer.slice(0, lineEnd + 1)
+            buffer = buffer.slice(lineEnd + 1)
+
+            const lineResult = await handleLine({
+              userId,
+              stripeCustomerId,
+              agentId,
+              clientId,
+              clientRequestId,
+              costMode,
+              startTime,
+              request: body,
+              originalModel,
+              line,
+              state,
+              logger,
+              insertMessage: insertMessageBigquery,
+            })
+            state = lineResult.state
+
+            if (!clientDisconnected) {
+              try {
+                controller.enqueue(
+                  new TextEncoder().encode(lineResult.patchedLine),
+                )
+              } catch {
+                logger.warn(
+                  'Client disconnected during stream, continuing for billing',
+                )
+                clientDisconnected = true
+              }
+            }
+
+            lineEnd = buffer.indexOf('\n')
+          }
+        }
+
+        if (!clientDisconnected) {
+          controller.close()
+        }
+      } catch (error) {
+        if (!clientDisconnected) {
+          controller.error(error)
+        } else {
+          logger.warn(
+            getErrorObject(error),
+            'Error after client disconnect in DeepSeek stream',
+          )
+        }
+      } finally {
+        clearInterval(heartbeatInterval)
+      }
+    },
+    cancel() {
+      clearInterval(heartbeatInterval)
+      clientDisconnected = true
+      logger.warn(
+        {
+          clientDisconnected,
+          responseTextLength: state.responseText.length,
+          reasoningTextLength: state.reasoningText.length,
+        },
+        'Client cancelled stream, continuing DeepSeek consumption for billing',
+      )
+    },
+  })
+
+  return stream
+}
+
+async function handleLine({
+  userId,
+  stripeCustomerId,
+  agentId,
+  clientId,
+  clientRequestId,
+  costMode,
+  startTime,
+  request,
+  originalModel,
+  line,
+  state,
+  logger,
+  insertMessage,
+}: {
+  userId: string
+  stripeCustomerId?: string | null
+  agentId: string
+  clientId: string | null
+  clientRequestId: string | null
+  costMode: string | undefined
+  startTime: Date
+  request: unknown
+  originalModel: string
+  line: string
+  state: StreamState
+  logger: Logger
+  insertMessage: InsertMessageBigqueryFn
+}): Promise<LineResult> {
+  if (!line.startsWith('data: ')) {
+    return { state, patchedLine: line }
+  }
+
+  const raw = line.slice('data: '.length)
+  if (raw === '[DONE]\n' || raw === '[DONE]') {
+    return { state, patchedLine: line }
+  }
+
+  let obj: Record<string, unknown>
+  try {
+    obj = JSON.parse(raw)
+  } catch (error) {
+    logger.warn(
+      { error: getErrorObject(error, { includeRawError: true }) },
+      'Received non-JSON DeepSeek response',
+    )
+    return { state, patchedLine: line }
+  }
+
+  // Patch model and provider for SDK compatibility
+  if (obj.model) obj.model = originalModel
+  if (!obj.provider) obj.provider = 'DeepSeek'
+
+  // Process the chunk for billing / state tracking
+  const result = await handleResponse({
+    userId,
+    stripeCustomerId,
+    agentId,
+    clientId,
+    clientRequestId,
+    costMode,
+    startTime,
+    request,
+    originalModel,
+    data: obj,
+    state,
+    logger,
+    insertMessage,
+  })
+
+  // If this is the final chunk with billing, overwrite cost in the patched object
+  if (result.billedCredits !== undefined && obj.usage) {
+    const usage = obj.usage as Record<string, unknown>
+    usage.cost = creditsToFakeCost(result.billedCredits)
+    usage.cost_details = { upstream_inference_cost: 0 }
+  }
+
+  const 
patchedLine = `data: ${JSON.stringify(obj)}\n`
+  return {
+    state: result.state,
+    billedCredits: result.billedCredits,
+    patchedLine,
+  }
+}
+
+function isFinalChunk(data: Record<string, unknown>): boolean {
+  const choices = data.choices as Array<Record<string, unknown>> | undefined
+  if (!choices || choices.length === 0) return true
+  return choices.some((c) => c.finish_reason != null)
+}
+
+async function handleResponse({
+  userId,
+  stripeCustomerId,
+  agentId,
+  clientId,
+  clientRequestId,
+  costMode,
+  startTime,
+  request,
+  originalModel,
+  data,
+  state,
+  logger,
+  insertMessage,
+}: {
+  userId: string
+  stripeCustomerId?: string | null
+  agentId: string
+  clientId: string | null
+  clientRequestId: string | null
+  costMode: string | undefined
+  startTime: Date
+  request: unknown
+  originalModel: string
+  data: Record<string, unknown>
+  state: StreamState
+  logger: Logger
+  insertMessage: InsertMessageBigqueryFn
+}): Promise<{ state: StreamState; billedCredits?: number }> {
+  state = handleStreamChunk({
+    data,
+    state,
+    startTime,
+    logger,
+    userId,
+    agentId,
+    model: originalModel,
+  })
+
+  // Some providers send cumulative usage on EVERY chunk (not just the final one),
+  // so we must only bill once on the final chunk to avoid charging N times.
+  if (
+    'error' in data ||
+    !data.usage ||
+    state.billedAlready ||
+    !isFinalChunk(data)
+  ) {
+    // Strip usage from non-final chunks and duplicate final chunks
+    // so the SDK doesn't see multiple usage objects
+    if (data.usage && (!isFinalChunk(data) || state.billedAlready)) {
+      delete data.usage
+    }
+    return { state }
+  }
+
+  const usageData = extractUsageAndCost(
+    data.usage as Record<string, unknown>,
+    originalModel,
+  )
+  const messageId = typeof data.id === 'string' ? data.id : 'unknown'
+
+  state.billedAlready = true
+
+  insertMessageToBigQuery({
+    messageId,
+    userId,
+    startTime,
+    request,
+    reasoningText: state.reasoningText,
+    responseText: state.responseText,
+    usageData,
+    logger,
+    insertMessageBigquery: insertMessage,
+  }).catch((error) => {
+    logger.error({ error }, 'Failed to insert message into BigQuery')
+  })
+
+  const billedCredits = await consumeCreditsForMessage({
+    messageId,
+    userId,
+    stripeCustomerId,
+    agentId,
+    clientId,
+    clientRequestId,
+    startTime,
+    model: originalModel,
+    reasoningText: state.reasoningText,
+    responseText: state.responseText,
+    usageData,
+    byok: false,
+    logger,
+    costMode,
+    ttftMs: state.ttftMs,
+  })
+
+  return { state, billedCredits }
+}
+
+function handleStreamChunk({
+  data,
+  state,
+  startTime,
+  logger,
+  userId,
+  agentId,
+  model,
+}: {
+  data: Record<string, unknown>
+  state: StreamState
+  startTime: Date
+  logger: Logger
+  userId: string
+  agentId: string
+  model: string
+}): StreamState {
+  const MAX_BUFFER_SIZE = 1 * 1024 * 1024
+
+  if ('error' in data) {
+    const errorData = data.error as Record<string, unknown>
+    logger.error(
+      {
+        userId,
+        agentId,
+        model,
+        errorCode: errorData?.code,
+        errorType: errorData?.type,
+        errorMessage: errorData?.message,
+      },
+      'Received error chunk in DeepSeek stream',
+    )
+    return state
+  }
+
+  const choices = data.choices as Array<Record<string, unknown>> | undefined
+  if (!choices?.length) {
+    return state
+  }
+  const choice = choices[0]
+  const delta = choice.delta as Record<string, unknown> | undefined
+
+  const contentDelta = typeof delta?.content === 'string' ? 
delta.content : ''
+  if (state.responseText.length < MAX_BUFFER_SIZE) {
+    state.responseText += contentDelta
+    if (state.responseText.length >= MAX_BUFFER_SIZE) {
+      state.responseText =
+        state.responseText.slice(0, MAX_BUFFER_SIZE) + '\n---[TRUNCATED]---'
+      logger.warn(
+        { userId, agentId, model },
+        'Response text buffer truncated at 1MB',
+      )
+    }
+  }
+
+  const reasoningDelta =
+    typeof delta?.reasoning_content === 'string'
+      ? delta.reasoning_content
+      : typeof delta?.reasoning === 'string'
+        ? delta.reasoning
+        : ''
+
+  // Track time to first token (TTFT) - set on first meaningful delta (content, reasoning, or tool_calls)
+  const hasToolCallsDelta =
+    delta?.tool_calls != null && (delta.tool_calls as unknown[])?.length > 0
+  if (
+    state.ttftMs === null &&
+    (contentDelta !== '' || reasoningDelta !== '' || hasToolCallsDelta)
+  ) {
+    state.ttftMs = Date.now() - startTime.getTime()
+  }
+
+  if (state.reasoningText.length < MAX_BUFFER_SIZE) {
+    state.reasoningText += reasoningDelta
+    if (state.reasoningText.length >= MAX_BUFFER_SIZE) {
+      state.reasoningText =
+        state.reasoningText.slice(0, MAX_BUFFER_SIZE) + '\n---[TRUNCATED]---'
+      logger.warn(
+        { userId, agentId, model },
+        'Reasoning text buffer truncated at 1MB',
+      )
+    }
+  }
+
+  return state
+}
+
+export class DeepSeekError extends Error {
+  constructor(
+    public readonly statusCode: number,
+    public readonly statusText: string,
+    public readonly errorBody: {
+      error: {
+        message: string
+        code: string | number | null
+        type?: string | null
+      }
+    },
+  ) {
+    super(errorBody.error.message)
+    this.name = 'DeepSeekError'
+  }
+
+  toJSON() {
+    return {
+      error: {
+        message: this.errorBody.error.message,
+        code: this.errorBody.error.code,
+        type: this.errorBody.error.type,
+      },
+    }
+  }
+}
+
+async function parseDeepSeekError(response: Response): Promise<DeepSeekError> {
+  const errorText = await response.text()
+  let errorBody: DeepSeekError['errorBody']
+  try {
+    const parsed = JSON.parse(errorText)
+    if (parsed?.error?.message) {
+      errorBody = {
+        error: {
+          message: parsed.error.message,
+          code: parsed.error.code ?? null,
+          type: parsed.error.type ?? null,
+        },
+      }
+    } else {
+      errorBody = {
+        error: {
+          message: errorText || response.statusText,
+          code: response.status,
+        },
+      }
+    }
+  } catch {
+    errorBody = {
+      error: {
+        message: errorText || response.statusText,
+        code: response.status,
+      },
+    }
+  }
+  return new DeepSeekError(response.status, response.statusText, errorBody)
+}
+
+function creditsToFakeCost(credits: number): number {
+  return credits / ((1 + PROFIT_MARGIN) * 100)
+}
diff --git a/web/src/server/free-session/config.ts b/web/src/server/free-session/config.ts
index cbde91678..bb5ee7e48 100644
--- a/web/src/server/free-session/config.ts
+++ b/web/src/server/free-session/config.ts
@@ -1,4 +1,5 @@
 import {
+  FREEBUFF_DEEPSEEK_V4_PRO_MODEL_ID,
   FREEBUFF_GEMINI_PRO_MODEL_ID,
   FREEBUFF_GLM_MODEL_ID,
   FREEBUFF_KIMI_MODEL_ID,
@@ -54,6 +55,7 @@ export function getSessionGraceMs(): number {
  * queue).
 
*/
 const INSTANT_ADMIT_CAPACITY: Record<string, number> = {
+  [FREEBUFF_DEEPSEEK_V4_PRO_MODEL_ID]: 50,
   [FREEBUFF_GEMINI_PRO_MODEL_ID]: 50,
   [FREEBUFF_GLM_MODEL_ID]: 50,
   [FREEBUFF_KIMI_MODEL_ID]: 50,

From 415161ca5bc5be1e6b0550b9598217a9a5f67497 Mon Sep 17 00:00:00 2001
From: James Grugett
Date: Thu, 30 Apr 2026 17:01:40 -0700
Subject: [PATCH 2/6] Stabilize free-mode rate limit test

---
 .../completions/__tests__/completions.test.ts | 80 +++++++++++--------
 1 file changed, 47 insertions(+), 33 deletions(-)

diff --git a/web/src/app/api/v1/chat/completions/__tests__/completions.test.ts b/web/src/app/api/v1/chat/completions/__tests__/completions.test.ts
index 12965104b..253e85c6e 100644
--- a/web/src/app/api/v1/chat/completions/__tests__/completions.test.ts
+++ b/web/src/app/api/v1/chat/completions/__tests__/completions.test.ts
@@ -8,10 +8,7 @@ import {
   isFreebuffDeploymentHours,
 } from '@codebuff/common/constants/freebuff-models'
 import { formatQuotaResetCountdown, postChatCompletions } from '../_post'
-import {
-  checkFreeModeRateLimit,
-  resetFreeModeRateLimits,
-} from '../free-mode-rate-limiter'
+import { resetFreeModeRateLimits } from '../free-mode-rate-limiter'
 
 import type { TrackEventFn } from '@codebuff/common/types/contracts/analytics'
 import type { InsertMessageBigqueryFn } from '@codebuff/common/types/contracts/bigquery'
@@ -49,6 +46,10 @@ describe('/api/v1/chat/completions POST endpoint', () => {
       id: 'user-new-free-gemini',
       banned: false,
     },
+    'test-api-key-reviewer-rate-limit': {
+      id: 'user-reviewer-rate-limit',
+      banned: false,
+    },
   }
 
   const mockGetUserInfoFromApiKey: GetUserInfoFromApiKeyFn = async ({
@@ -1006,36 +1007,49 @@
     expect(body.error).toBe('free_mode_invalid_agent_hierarchy')
   })
 
-  it('counts child reviewer Gemini requests toward the free-mode request limit', async () => {
-    const response = await postChatCompletions({
-      req: new NextRequest('http://localhost:3000/api/v1/chat/completions', {
-        method: 'POST',
-        headers: allowedFreeModeHeaders('test-api-key-new-free-gemini'),
-        body: JSON.stringify({
-          model: FREEBUFF_GEMINI_PRO_MODEL_ID,
-          stream: false,
-          codebuff_metadata: {
-            run_id: 'run-reviewer-child',
-            client_id: 'test-client-id-123',
-            cost_mode: 'free',
-          },
-        }),
-      }),
-      getUserInfoFromApiKey: mockGetUserInfoFromApiKey,
-      logger: mockLogger,
-      trackEvent: mockTrackEvent,
-      getUserUsageData: mockGetUserUsageData,
-      getAgentRunFromId: mockGetAgentRunFromId,
-      fetch: mockFetch,
-      insertMessageBigquery: mockInsertMessageBigquery,
-      loggerWithContext: mockLoggerWithContext,
-      checkSessionAdmissible: mockCheckSessionAdmissibleAllow,
-    })
+  it(
+    'counts child reviewer Gemini requests toward the free-mode request limit',
+    async () => {
+      const createRequest = () =>
+        new NextRequest('http://localhost:3000/api/v1/chat/completions', {
+          method: 'POST',
+          headers: allowedFreeModeHeaders('test-api-key-reviewer-rate-limit'),
+          body: JSON.stringify({
+            model: FREEBUFF_GEMINI_PRO_MODEL_ID,
+            stream: false,
+            codebuff_metadata: {
+              run_id: 'run-reviewer-child',
+              client_id: 'test-client-id-123',
+              cost_mode: 'free',
+            },
+          }),
+        })
 
-    expect(response.status).toBe(200)
-    expect(checkFreeModeRateLimit('user-new-free-gemini').limited).toBe(false)
-    expect(checkFreeModeRateLimit('user-new-free-gemini').limited).toBe(true)
-  })
+      const createPostParams = () => ({
+        req: createRequest(),
+        getUserInfoFromApiKey: mockGetUserInfoFromApiKey,
+        logger: mockLogger,
+        trackEvent: mockTrackEvent,
+        getUserUsageData: mockGetUserUsageData,
+        
getAgentRunFromId: mockGetAgentRunFromId, + fetch: mockFetch, + insertMessageBigquery: mockInsertMessageBigquery, + loggerWithContext: mockLoggerWithContext, + checkSessionAdmissible: mockCheckSessionAdmissibleAllow, + }) + + const firstResponse = await postChatCompletions(createPostParams()) + const secondResponse = await postChatCompletions(createPostParams()) + const limitedResponse = await postChatCompletions(createPostParams()) + + expect(firstResponse.status).toBe(200) + expect(secondResponse.status).toBe(200) + expect(limitedResponse.status).toBe(429) + const body = await limitedResponse.json() + expect(body.error).toBe('free_mode_rate_limited') + }, + FETCH_PATH_TEST_TIMEOUT_MS, + ) it( 'skips credit check when in FREE mode even with 0 credits', From d5135d6534e94dc89a16973c2ae37f100f5153a0 Mon Sep 17 00:00:00 2001 From: James Grugett Date: Fri, 1 May 2026 12:13:32 -0700 Subject: [PATCH 3/6] Preserve DeepSeek reasoning for tool calls --- .../agent-runtime/src/tools/stream-parser.ts | 5 +++ ...to-openai-compatible-chat-messages.test.ts | 37 +++++++++++++++++++ ...vert-to-openai-compatible-chat-messages.ts | 7 ++++ 3 files changed, 49 insertions(+) diff --git a/packages/agent-runtime/src/tools/stream-parser.ts b/packages/agent-runtime/src/tools/stream-parser.ts index 8dbda8bdc..5abd5ee50 100644 --- a/packages/agent-runtime/src/tools/stream-parser.ts +++ b/packages/agent-runtime/src/tools/stream-parser.ts @@ -276,6 +276,11 @@ export async function processStream( } if (chunk.type === 'reasoning') { + if (chunk.text) { + assistantMessages.push( + assistantMessage({ type: 'reasoning', text: chunk.text }), + ) + } onResponseChunk({ type: 'reasoning_delta', text: chunk.text, diff --git a/packages/internal/src/openai-compatible/chat/convert-to-openai-compatible-chat-messages.test.ts b/packages/internal/src/openai-compatible/chat/convert-to-openai-compatible-chat-messages.test.ts index a24d72499..2f2274567 100644 --- a/packages/internal/src/openai-compatible/chat/convert-to-openai-compatible-chat-messages.test.ts +++ b/packages/internal/src/openai-compatible/chat/convert-to-openai-compatible-chat-messages.test.ts @@ -509,6 +509,43 @@ describe('provider-specific metadata merging', () => { ]) }) + it('should preserve assistant reasoning content with tool calls', () => { + const result = convertToOpenAICompatibleChatMessages([ + { + role: 'assistant', + content: [ + { type: 'reasoning', text: 'Need the date first. ' }, + { type: 'reasoning', text: 'Then call weather.' }, + { type: 'text', text: 'Checking that now...' }, + { + type: 'tool-call', + toolCallId: 'call1', + toolName: 'get_weather', + input: { location: 'Hangzhou' }, + }, + ], + }, + ]) + + expect(result).toEqual([ + { + role: 'assistant', + content: 'Checking that now...', + reasoning_content: 'Need the date first. 
Then call weather.', + tool_calls: [ + { + id: 'call1', + type: 'function', + function: { + name: 'get_weather', + arguments: JSON.stringify({ location: 'Hangzhou' }), + }, + }, + ], + }, + ]) + }) + it('should handle a single tool role message with multiple tool-result parts', () => { const result = convertToOpenAICompatibleChatMessages([ { diff --git a/packages/internal/src/openai-compatible/chat/convert-to-openai-compatible-chat-messages.ts b/packages/internal/src/openai-compatible/chat/convert-to-openai-compatible-chat-messages.ts index 30a27cf6c..ec1945a8f 100644 --- a/packages/internal/src/openai-compatible/chat/convert-to-openai-compatible-chat-messages.ts +++ b/packages/internal/src/openai-compatible/chat/convert-to-openai-compatible-chat-messages.ts @@ -65,6 +65,7 @@ export function convertToOpenAICompatibleChatMessages( case 'assistant': { let text = '' + let reasoningContent = '' const toolCalls: Array<{ id: string type: 'function' @@ -78,6 +79,10 @@ export function convertToOpenAICompatibleChatMessages( text += part.text break } + case 'reasoning': { + reasoningContent += part.text + break + } case 'tool-call': { toolCalls.push({ id: part.toolCallId, @@ -96,6 +101,8 @@ export function convertToOpenAICompatibleChatMessages( messages.push({ role: 'assistant', content: text, + reasoning_content: + reasoningContent.length > 0 ? reasoningContent : undefined, tool_calls: toolCalls.length > 0 ? toolCalls : undefined, ...metadata, }) From e4a97a623073176046906c6f89b6ae4e6f7ee384 Mon Sep 17 00:00:00 2001 From: James Grugett Date: Fri, 1 May 2026 15:11:47 -0700 Subject: [PATCH 4/6] Gate reasoning history replay --- common/src/constants/freebuff-models.ts | 6 ------ packages/agent-runtime/src/constants.ts | 4 ++++ packages/agent-runtime/src/tools/stream-parser.ts | 3 ++- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/common/src/constants/freebuff-models.ts b/common/src/constants/freebuff-models.ts index 84daca5d8..c03099bd4 100644 --- a/common/src/constants/freebuff-models.ts +++ b/common/src/constants/freebuff-models.ts @@ -49,12 +49,6 @@ export const FREEBUFF_MODELS = [ tagline: 'Deepest, 1/day', availability: 'always', }, - { - id: FREEBUFF_DEEPSEEK_V4_PRO_MODEL_ID, - displayName: 'DeepSeek V4 Pro', - tagline: 'Experimental', - availability: 'always', - }, { id: FREEBUFF_MINIMAX_MODEL_ID, displayName: 'MiniMax M2.7', diff --git a/packages/agent-runtime/src/constants.ts b/packages/agent-runtime/src/constants.ts index d2981d456..16508a0bb 100644 --- a/packages/agent-runtime/src/constants.ts +++ b/packages/agent-runtime/src/constants.ts @@ -9,3 +9,7 @@ export const globalStopSequence = `${JSON.stringify(endsAgentStepParam)}` * to diff sequential requests and find what's breaking prompt caching. */ export const CACHE_DEBUG_FULL_LOGGING = false + +// Keep disabled by default to preserve mainline behavior until reasoning-token +// replay has been tested more thoroughly. 
+export const INCLUDE_REASONING_IN_MESSAGE_HISTORY = false diff --git a/packages/agent-runtime/src/tools/stream-parser.ts b/packages/agent-runtime/src/tools/stream-parser.ts index 5abd5ee50..df4e33bef 100644 --- a/packages/agent-runtime/src/tools/stream-parser.ts +++ b/packages/agent-runtime/src/tools/stream-parser.ts @@ -8,6 +8,7 @@ import { import { generateCompactId } from '@codebuff/common/util/string' import { processStreamWithTools } from '../tool-stream-parser' +import { INCLUDE_REASONING_IN_MESSAGE_HISTORY } from '../constants' import { executeCustomToolCall, executeToolCall, @@ -276,7 +277,7 @@ export async function processStream( } if (chunk.type === 'reasoning') { - if (chunk.text) { + if (INCLUDE_REASONING_IN_MESSAGE_HISTORY && chunk.text) { assistantMessages.push( assistantMessage({ type: 'reasoning', text: chunk.text }), ) From 63e3ded3f7a7f5451ef19396ea9b3f3c6d911e92 Mon Sep 17 00:00:00 2001 From: James Grugett Date: Fri, 1 May 2026 15:58:14 -0700 Subject: [PATCH 5/6] Stabilize Gemini thinker rate limit test --- .../chat/completions/__tests__/completions.test.ts | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/web/src/app/api/v1/chat/completions/__tests__/completions.test.ts b/web/src/app/api/v1/chat/completions/__tests__/completions.test.ts index 1ceb730c5..f73b568bb 100644 --- a/web/src/app/api/v1/chat/completions/__tests__/completions.test.ts +++ b/web/src/app/api/v1/chat/completions/__tests__/completions.test.ts @@ -53,6 +53,10 @@ describe('/api/v1/chat/completions POST endpoint', () => { id: 'user-reviewer-rate-limit', banned: false, }, + 'test-api-key-gemini-rate-limit': { + id: 'user-gemini-rate-limit', + banned: false, + }, } const mockGetUserInfoFromApiKey: GetUserInfoFromApiKeyFn = async ({ @@ -1100,10 +1104,14 @@ describe('/api/v1/chat/completions POST endpoint', () => { it( 'counts child Gemini thinker requests toward the free-mode request limit', async () => { + expect(checkFreeModeRateLimit('user-gemini-rate-limit').limited).toBe( + false, + ) + const createRequest = () => new NextRequest('http://localhost:3000/api/v1/chat/completions', { method: 'POST', - headers: allowedFreeModeHeaders('test-api-key-new-free-gemini'), + headers: allowedFreeModeHeaders('test-api-key-gemini-rate-limit'), body: JSON.stringify({ model: FREEBUFF_GEMINI_PRO_MODEL_ID, stream: false, @@ -1130,11 +1138,9 @@ describe('/api/v1/chat/completions POST endpoint', () => { }) const firstResponse = await postChatCompletions(createPostParams()) - const secondResponse = await postChatCompletions(createPostParams()) const limitedResponse = await postChatCompletions(createPostParams()) expect(firstResponse.status).toBe(200) - expect(secondResponse.status).toBe(200) expect(limitedResponse.status).toBe(429) const body = await limitedResponse.json() expect(body.error).toBe('free_mode_rate_limited') From ac3312462b9cf2a23825992951e648897e9aac2c Mon Sep 17 00:00:00 2001 From: James Grugett Date: Fri, 1 May 2026 16:17:01 -0700 Subject: [PATCH 6/6] Inject free mode rate limiter in completions tests --- .../completions/__tests__/completions.test.ts | 17 ++++++++++++++--- web/src/app/api/v1/chat/completions/_post.ts | 7 ++++++- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/web/src/app/api/v1/chat/completions/__tests__/completions.test.ts b/web/src/app/api/v1/chat/completions/__tests__/completions.test.ts index f73b568bb..f5f329d25 100644 --- a/web/src/app/api/v1/chat/completions/__tests__/completions.test.ts +++ 
b/web/src/app/api/v1/chat/completions/__tests__/completions.test.ts @@ -1104,9 +1104,18 @@ describe('/api/v1/chat/completions POST endpoint', () => { it( 'counts child Gemini thinker requests toward the free-mode request limit', async () => { - expect(checkFreeModeRateLimit('user-gemini-rate-limit').limited).toBe( - false, - ) + let rateLimitChecks = 0 + const checkFreeModeRateLimitForTest = mock((userId: string) => { + expect(userId).toBe('user-gemini-rate-limit') + rateLimitChecks += 1 + return rateLimitChecks === 1 + ? { limited: false as const } + : { + limited: true as const, + windowName: '1 second', + retryAfterMs: 1_000, + } + }) const createRequest = () => new NextRequest('http://localhost:3000/api/v1/chat/completions', { @@ -1135,6 +1144,7 @@ describe('/api/v1/chat/completions POST endpoint', () => { insertMessageBigquery: mockInsertMessageBigquery, loggerWithContext: mockLoggerWithContext, checkSessionAdmissible: mockCheckSessionAdmissibleAllow, + checkFreeModeRateLimit: checkFreeModeRateLimitForTest, }) const firstResponse = await postChatCompletions(createPostParams()) @@ -1144,6 +1154,7 @@ describe('/api/v1/chat/completions POST endpoint', () => { expect(limitedResponse.status).toBe(429) const body = await limitedResponse.json() expect(body.error).toBe('free_mode_rate_limited') + expect(checkFreeModeRateLimitForTest).toHaveBeenCalledTimes(2) }, FETCH_PATH_TEST_TIMEOUT_MS, ) diff --git a/web/src/app/api/v1/chat/completions/_post.ts b/web/src/app/api/v1/chat/completions/_post.ts index bc37e3dfe..fd435cf3e 100644 --- a/web/src/app/api/v1/chat/completions/_post.ts +++ b/web/src/app/api/v1/chat/completions/_post.ts @@ -78,7 +78,7 @@ import { getFreeModeCountryAccess } from '@/server/free-mode-country' import type { SessionGateResult } from '@/server/free-session/public-api' import { extractApiKeyFromHeader } from '@/util/auth' import { withDefaultProperties } from '@codebuff/common/analytics' -import { checkFreeModeRateLimit } from './free-mode-rate-limiter' +import { checkFreeModeRateLimit as defaultCheckFreeModeRateLimit } from './free-mode-rate-limiter' export const formatQuotaResetCountdown = ( nextQuotaReset: string | null | undefined, @@ -117,6 +117,7 @@ export const formatQuotaResetCountdown = ( } export type CheckSessionAdmissibleFn = typeof checkSessionAdmissible +export type CheckFreeModeRateLimitFn = typeof defaultCheckFreeModeRateLimit type GateRejectCode = Extract['code'] @@ -147,6 +148,9 @@ export async function postChatCompletions(params: { /** Optional override for the freebuff waiting-room gate. Defaults to the * real check backed by Postgres; tests inject a no-op. */ checkSessionAdmissible?: CheckSessionAdmissibleFn + /** Optional override for the free-mode rate limiter. Tests inject this to + * avoid coupling to process-global limiter state. */ + checkFreeModeRateLimit?: CheckFreeModeRateLimitFn }) { const { req, @@ -159,6 +163,7 @@ export async function postChatCompletions(params: { ensureSubscriberBlockGrant, getUserPreferences, checkSessionAdmissible: checkSession = checkSessionAdmissible, + checkFreeModeRateLimit = defaultCheckFreeModeRateLimit, } = params let { logger } = params let { trackEvent } = params
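
Usage sketch (not part of the patch series): with the injectable limiter from
PATCH 6/6, a completions test can pin rate-limit behavior without touching the
process-global limiter state. `postChatCompletions` and
`CheckFreeModeRateLimitFn` are the exports added in `_post.ts` above; the
`alwaysLimited` stub and `baseParams` names are hypothetical, for illustration
only.

import { mock } from 'bun:test'

import { postChatCompletions } from '../_post'

import type { CheckFreeModeRateLimitFn } from '../_post'

// Hypothetical stub: report every user as over the free-mode limit, using the
// same return shape the injected mock in the test above uses.
const alwaysLimited: CheckFreeModeRateLimitFn = mock(() => ({
  limited: true as const,
  windowName: '1 second',
  retryAfterMs: 1_000,
}))

// Injected alongside the other dependencies (baseParams is assumed to hold
// the req and mocks shown in the tests above):
//   const response = await postChatCompletions({
//     ...baseParams,
//     checkFreeModeRateLimit: alwaysLimited,
//   })
//   expect(response.status).toBe(429)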