From d5aed7bea1ae9ff873c280b7e704057706435a77 Mon Sep 17 00:00:00 2001
From: James Grugett
Date: Thu, 30 Apr 2026 16:07:41 -0700
Subject: [PATCH 1/6] Add DeepSeek V4 provider

---
 agents/base2/base2-free-deepseek-v4.ts        |  11 +
 agents/types/agent-definition.ts              |   2 +
 common/src/constants/free-agents.ts           |  11 +-
 common/src/constants/freebuff-models.ts       |   7 +
 common/src/constants/model-config.ts          |   3 +
 .../types/agent-definition.ts                 |   2 +
 evals/buffbench/main-single-eval.ts           |   2 +-
 packages/internal/src/env-schema.ts           |   2 +
 packages/internal/src/env.ts                  |   1 +
 .../completions/__tests__/completions.test.ts | 113 +++
 web/src/app/api/v1/chat/completions/_post.ts  | 119 ++-
 web/src/llm-api/deepseek.ts                   | 769 ++++++++++++++++++
 web/src/server/free-session/config.ts         |   2 +
 13 files changed, 1004 insertions(+), 40 deletions(-)
 create mode 100644 agents/base2/base2-free-deepseek-v4.ts
 create mode 100644 web/src/llm-api/deepseek.ts

diff --git a/agents/base2/base2-free-deepseek-v4.ts b/agents/base2/base2-free-deepseek-v4.ts
new file mode 100644
index 000000000..19ca5a891
--- /dev/null
+++ b/agents/base2/base2-free-deepseek-v4.ts
@@ -0,0 +1,11 @@
+import { createBase2 } from './base2'
+
+const definition = {
+  ...createBase2('free', {
+    noAskUser: true,
+    model: 'deepseek/deepseek-v4-pro',
+  }),
+  id: 'base2-free-deepseek-v4',
+  displayName: 'Buffy the DeepSeek V4 Free Orchestrator',
+}
+export default definition
diff --git a/agents/types/agent-definition.ts b/agents/types/agent-definition.ts
index 088dd1dca..2d05e4e0b 100644
--- a/agents/types/agent-definition.ts
+++ b/agents/types/agent-definition.ts
@@ -415,6 +415,8 @@ export type ModelName =
   | 'qwen/qwen3-30b-a3b:nitro'
 
   // DeepSeek
+  | 'deepseek/deepseek-v4-pro'
+  | 'deepseek-v4-pro'
   | 'deepseek/deepseek-chat-v3-0324'
   | 'deepseek/deepseek-chat-v3-0324:nitro'
   | 'deepseek/deepseek-r1-0528'
diff --git a/common/src/constants/free-agents.ts b/common/src/constants/free-agents.ts
index 6d22152c5..e8a0d19b6 100644
--- a/common/src/constants/free-agents.ts
+++ b/common/src/constants/free-agents.ts
@@ -1,6 +1,9 @@
 import { parseAgentId } from '../util/agent-id-parsing'
 
-import { SUPPORTED_FREEBUFF_MODELS } from './freebuff-models'
+import {
+  FREEBUFF_DEEPSEEK_V4_PRO_MODEL_ID,
+  SUPPORTED_FREEBUFF_MODELS,
+} from './freebuff-models'
 
 import type { CostMode } from './model-config'
 
@@ -16,7 +19,10 @@ export const FREE_COST_MODE = 'free' as const
  * excluded — they're spawned by the root, so counting them would inflate
  * every user's apparent activity.
  */
-export const FREEBUFF_ROOT_AGENT_IDS = ['base2-free'] as const
+export const FREEBUFF_ROOT_AGENT_IDS = [
+  'base2-free',
+  'base2-free-deepseek-v4',
+] as const
 const FREEBUFF_ROOT_AGENT_ID_SET: ReadonlySet<string> = new Set(
   FREEBUFF_ROOT_AGENT_IDS,
 )
@@ -35,6 +41,7 @@ const FREEBUFF_ALLOWED_MODEL_IDS = SUPPORTED_FREEBUFF_MODELS.map(
 export const FREE_MODE_AGENT_MODELS: Record<string, Set<string>> = {
   // Root orchestrator
   'base2-free': new Set(FREEBUFF_ALLOWED_MODEL_IDS),
+  'base2-free-deepseek-v4': new Set([FREEBUFF_DEEPSEEK_V4_PRO_MODEL_ID]),
 
   // File exploration agents
   'file-picker': new Set(['google/gemini-2.5-flash-lite']),
diff --git a/common/src/constants/freebuff-models.ts b/common/src/constants/freebuff-models.ts
index 246731a3f..84daca5d8 100644
--- a/common/src/constants/freebuff-models.ts
+++ b/common/src/constants/freebuff-models.ts
@@ -22,6 +22,7 @@ export interface FreebuffModelOption {
  * `getFreebuffDeploymentAvailabilityLabel()` instead.
 
*/ export const FREEBUFF_DEPLOYMENT_HOURS_LABEL = '9am ET-5pm PT every day' export const FREEBUFF_GEMINI_PRO_MODEL_ID = 'google/gemini-3.1-pro-preview' +export const FREEBUFF_DEEPSEEK_V4_PRO_MODEL_ID = 'deepseek/deepseek-v4-pro' export const FREEBUFF_GLM_MODEL_ID = 'z-ai/glm-5.1' export const FREEBUFF_KIMI_MODEL_ID = 'moonshotai/kimi-k2.6' export const FREEBUFF_MINIMAX_MODEL_ID = 'minimax/minimax-m2.7' @@ -48,6 +49,12 @@ export const FREEBUFF_MODELS = [ tagline: 'Deepest, 1/day', availability: 'always', }, + { + id: FREEBUFF_DEEPSEEK_V4_PRO_MODEL_ID, + displayName: 'DeepSeek V4 Pro', + tagline: 'Experimental', + availability: 'always', + }, { id: FREEBUFF_MINIMAX_MODEL_ID, displayName: 'MiniMax M2.7', diff --git a/common/src/constants/model-config.ts b/common/src/constants/model-config.ts index 9be6d31e0..ced599fc2 100644 --- a/common/src/constants/model-config.ts +++ b/common/src/constants/model-config.ts @@ -6,6 +6,7 @@ export const ALLOWED_MODEL_PREFIXES = [ 'openai', 'google', 'x-ai', + 'deepseek', ] as const export const costModes = [ @@ -55,6 +56,8 @@ export type openrouterModel = export const deepseekModels = { deepseekChat: 'deepseek-chat', deepseekReasoner: 'deepseek-reasoner', + deepseekV4ProDirect: 'deepseek-v4-pro', + deepseekV4Pro: 'deepseek/deepseek-v4-pro', } as const export type DeepseekModel = (typeof deepseekModels)[keyof typeof deepseekModels] diff --git a/common/src/templates/initial-agents-dir/types/agent-definition.ts b/common/src/templates/initial-agents-dir/types/agent-definition.ts index 088dd1dca..2d05e4e0b 100644 --- a/common/src/templates/initial-agents-dir/types/agent-definition.ts +++ b/common/src/templates/initial-agents-dir/types/agent-definition.ts @@ -415,6 +415,8 @@ export type ModelName = | 'qwen/qwen3-30b-a3b:nitro' // DeepSeek + | 'deepseek/deepseek-v4-pro' + | 'deepseek-v4-pro' | 'deepseek/deepseek-chat-v3-0324' | 'deepseek/deepseek-chat-v3-0324:nitro' | 'deepseek/deepseek-r1-0528' diff --git a/evals/buffbench/main-single-eval.ts b/evals/buffbench/main-single-eval.ts index 6eceac7a5..bff2d322b 100644 --- a/evals/buffbench/main-single-eval.ts +++ b/evals/buffbench/main-single-eval.ts @@ -7,7 +7,7 @@ async function main() { await runBuffBench({ evalDataPaths: [path.join(__dirname, 'eval-codebuff.json')], - agents: ['base2-free-evals'], + agents: ['base2-free-deepseek-v4'], taskIds: ['server-agent-validation'], saveTraces, }) diff --git a/packages/internal/src/env-schema.ts b/packages/internal/src/env-schema.ts index a8af80f06..232309ba0 100644 --- a/packages/internal/src/env-schema.ts +++ b/packages/internal/src/env-schema.ts @@ -8,6 +8,7 @@ export const serverEnvSchema = clientEnvSchema.extend({ ANTHROPIC_API_KEY: z.string().min(1), FIREWORKS_API_KEY: z.string().min(1), CANOPYWAVE_API_KEY: z.string().min(1).optional(), + DEEPSEEK_API_KEY: z.string().min(1).optional(), SILICONFLOW_API_KEY: z.string().min(1).optional(), LINKUP_API_KEY: z.string().min(1), CONTEXT7_API_KEY: z.string().optional(), @@ -92,6 +93,7 @@ export const serverProcessEnv: ServerInput = { ANTHROPIC_API_KEY: process.env.ANTHROPIC_API_KEY, FIREWORKS_API_KEY: process.env.FIREWORKS_API_KEY, CANOPYWAVE_API_KEY: process.env.CANOPYWAVE_API_KEY, + DEEPSEEK_API_KEY: process.env.DEEPSEEK_API_KEY, SILICONFLOW_API_KEY: process.env.SILICONFLOW_API_KEY, LINKUP_API_KEY: process.env.LINKUP_API_KEY, CONTEXT7_API_KEY: process.env.CONTEXT7_API_KEY, diff --git a/packages/internal/src/env.ts b/packages/internal/src/env.ts index 3c3f60ce8..6edcea4d7 100644 --- a/packages/internal/src/env.ts +++ 
b/packages/internal/src/env.ts
@@ -18,6 +18,7 @@ if (isCI) {
   ensureEnvDefault('ANTHROPIC_API_KEY', 'test')
   ensureEnvDefault('FIREWORKS_API_KEY', 'test')
   ensureEnvDefault('CANOPYWAVE_API_KEY', 'test')
+  ensureEnvDefault('DEEPSEEK_API_KEY', 'test')
   ensureEnvDefault('LINKUP_API_KEY', 'test')
   ensureEnvDefault('GRAVITY_API_KEY', 'test')
   ensureEnvDefault('IPINFO_TOKEN', 'test')
diff --git a/web/src/app/api/v1/chat/completions/__tests__/completions.test.ts b/web/src/app/api/v1/chat/completions/__tests__/completions.test.ts
index cf846131c..12965104b 100644
--- a/web/src/app/api/v1/chat/completions/__tests__/completions.test.ts
+++ b/web/src/app/api/v1/chat/completions/__tests__/completions.test.ts
@@ -2,6 +2,7 @@ import { afterEach, beforeEach, describe, expect, mock, it } from 'bun:test'
 import { NextRequest } from 'next/server'
 
 import {
+  FREEBUFF_DEEPSEEK_V4_PRO_MODEL_ID,
   FREEBUFF_GEMINI_PRO_MODEL_ID,
   FREEBUFF_GLM_MODEL_ID,
   isFreebuffDeploymentHours,
@@ -147,6 +148,13 @@
       status: 'running',
     }
   }
+  if (runId === 'run-free-deepseek-v4') {
+    return {
+      agent_id: 'base2-free-deepseek-v4',
+      ancestor_run_ids: [],
+      status: 'running',
+    }
+  }
   if (runId === 'run-reviewer-direct') {
     return {
       agent_id: 'code-reviewer-lite',
@@ -823,6 +831,111 @@
     FETCH_PATH_TEST_TIMEOUT_MS,
   )
 
+  it(
+    'lets the DeepSeek V4 free agent use the direct DeepSeek provider',
+    async () => {
+      const fetchedBodies: Record<string, unknown>[] = []
+      const fetchedUrls: string[] = []
+      const fetchViaDeepSeek = mock(
+        async (url: string | URL | Request, init?: RequestInit) => {
+          fetchedUrls.push(String(url))
+          fetchedBodies.push(JSON.parse(init?.body as string))
+          return new Response(
+            JSON.stringify({
+              id: 'test-id',
+              model: 'deepseek-v4-pro',
+              choices: [{ message: { content: 'test response' } }],
+              usage: {
+                prompt_tokens: 10,
+                prompt_cache_hit_tokens: 4,
+                completion_tokens: 20,
+                total_tokens: 30,
+              },
+            }),
+            {
+              status: 200,
+              headers: { 'Content-Type': 'application/json' },
+            },
+          )
+        },
+      ) as unknown as typeof globalThis.fetch
+
+      const req = new NextRequest(
+        'http://localhost:3000/api/v1/chat/completions',
+        {
+          method: 'POST',
+          headers: allowedFreeModeHeaders('test-api-key-new-free'),
+          body: JSON.stringify({
+            model: FREEBUFF_DEEPSEEK_V4_PRO_MODEL_ID,
+            stream: false,
+            codebuff_metadata: {
+              run_id: 'run-free-deepseek-v4',
+              client_id: 'test-client-id-123',
+              cost_mode: 'free',
+            },
+          }),
+        },
+      )
+
+      const response = await postChatCompletions({
+        req,
+        getUserInfoFromApiKey: mockGetUserInfoFromApiKey,
+        logger: mockLogger,
+        trackEvent: mockTrackEvent,
+        getUserUsageData: mockGetUserUsageData,
+        getAgentRunFromId: mockGetAgentRunFromId,
+        fetch: fetchViaDeepSeek,
+        insertMessageBigquery: mockInsertMessageBigquery,
+        loggerWithContext: mockLoggerWithContext,
+        checkSessionAdmissible: mockCheckSessionAdmissibleAllow,
+      })
+
+      const body = await response.json()
+      expect(response.status).toBe(200)
+      expect(fetchedUrls[0]).toBe('https://api.deepseek.com/chat/completions')
+      expect(fetchedBodies[0].model).toBe('deepseek-v4-pro')
+      expect(body.model).toBe(FREEBUFF_DEEPSEEK_V4_PRO_MODEL_ID)
+      expect(body.provider).toBe('DeepSeek')
+    },
+    FETCH_PATH_TEST_TIMEOUT_MS,
+  )
+
+  it('rejects the DeepSeek V4 free agent when it requests another free model', async () => {
+    const req = new NextRequest(
+      'http://localhost:3000/api/v1/chat/completions',
+      {
+        method: 'POST',
+        headers: 
allowedFreeModeHeaders('test-api-key-new-free'), + body: JSON.stringify({ + model: FREEBUFF_GEMINI_PRO_MODEL_ID, + stream: false, + codebuff_metadata: { + run_id: 'run-free-deepseek-v4', + client_id: 'test-client-id-123', + cost_mode: 'free', + }, + }), + }, + ) + + const response = await postChatCompletions({ + req, + getUserInfoFromApiKey: mockGetUserInfoFromApiKey, + logger: mockLogger, + trackEvent: mockTrackEvent, + getUserUsageData: mockGetUserUsageData, + getAgentRunFromId: mockGetAgentRunFromId, + fetch: mockFetch, + insertMessageBigquery: mockInsertMessageBigquery, + loggerWithContext: mockLoggerWithContext, + checkSessionAdmissible: mockCheckSessionAdmissibleAllow, + }) + + const body = await response.json() + expect(response.status).toBe(403) + expect(body.error).toBe('free_mode_invalid_agent_model') + }) + it('lets freebuff use Gemini 3.1 Pro through the free-mode allowlist', async () => { const req = new NextRequest( 'http://localhost:3000/api/v1/chat/completions', diff --git a/web/src/app/api/v1/chat/completions/_post.ts b/web/src/app/api/v1/chat/completions/_post.ts index 0a7771d46..4c507c291 100644 --- a/web/src/app/api/v1/chat/completions/_post.ts +++ b/web/src/app/api/v1/chat/completions/_post.ts @@ -48,6 +48,12 @@ import { handleFireworksStream, isFireworksModel, } from '@/llm-api/fireworks' +import { + DeepSeekError, + handleDeepSeekNonStream, + handleDeepSeekStream, + isDeepSeekModel, +} from '@/llm-api/deepseek' import { SiliconFlowError, handleSiliconFlowNonStream, @@ -597,12 +603,15 @@ export async function postChatCompletions(params: { // Handle streaming vs non-streaming try { if (bodyStream) { - // Streaming request — route to SiliconFlow/CanopyWave/Fireworks for supported models + // Streaming request — route supported models to direct providers. const useSiliconFlow = false // isSiliconFlowModel(typedBody.model) const useCanopyWave = isCanopyWaveModel(typedBody.model) - const useFireworks = !useCanopyWave && isFireworksModel(typedBody.model) + const useDeepSeek = !useCanopyWave && isDeepSeekModel(typedBody.model) + const useFireworks = + !useCanopyWave && !useDeepSeek && isFireworksModel(typedBody.model) const useOpenAIDirect = !useCanopyWave && + !useDeepSeek && !useFireworks && isOpenAIDirectModel(typedBody.model) const stream = useSiliconFlow @@ -625,8 +634,8 @@ export async function postChatCompletions(params: { logger, insertMessageBigquery, }) - : useFireworks - ? await handleFireworksStream({ + : useDeepSeek + ? await handleDeepSeekStream({ body: typedBody, userId, stripeCustomerId, @@ -635,8 +644,8 @@ export async function postChatCompletions(params: { logger, insertMessageBigquery, }) - : useOpenAIDirect - ? await handleOpenAIStream({ + : useFireworks + ? await handleFireworksStream({ body: typedBody, userId, stripeCustomerId, @@ -645,16 +654,26 @@ export async function postChatCompletions(params: { logger, insertMessageBigquery, }) - : await handleOpenRouterStream({ - body: typedBody, - userId, - stripeCustomerId, - agentId, - openrouterApiKey, - fetch, - logger, - insertMessageBigquery, - }) + : useOpenAIDirect + ? 
await handleOpenAIStream({ + body: typedBody, + userId, + stripeCustomerId, + agentId, + fetch, + logger, + insertMessageBigquery, + }) + : await handleOpenRouterStream({ + body: typedBody, + userId, + stripeCustomerId, + agentId, + openrouterApiKey, + fetch, + logger, + insertMessageBigquery, + }) trackEvent({ event: AnalyticsEvent.CHAT_COMPLETIONS_STREAM_STARTED, @@ -679,9 +698,14 @@ export async function postChatCompletions(params: { const model = typedBody.model const useSiliconFlow = false // isSiliconFlowModel(model) const useCanopyWave = isCanopyWaveModel(model) - const useFireworks = !useCanopyWave && isFireworksModel(model) + const useDeepSeek = !useCanopyWave && isDeepSeekModel(model) + const useFireworks = + !useCanopyWave && !useDeepSeek && isFireworksModel(model) const shouldUseOpenAIEndpoint = - !useCanopyWave && !useFireworks && isOpenAIDirectModel(model) + !useCanopyWave && + !useDeepSeek && + !useFireworks && + isOpenAIDirectModel(model) const nonStreamRequest = useSiliconFlow ? handleSiliconFlowNonStream({ @@ -703,8 +727,8 @@ export async function postChatCompletions(params: { logger, insertMessageBigquery, }) - : useFireworks - ? handleFireworksNonStream({ + : useDeepSeek + ? handleDeepSeekNonStream({ body: typedBody, userId, stripeCustomerId, @@ -713,8 +737,8 @@ export async function postChatCompletions(params: { logger, insertMessageBigquery, }) - : shouldUseOpenAIEndpoint - ? handleOpenAINonStream({ + : useFireworks + ? handleFireworksNonStream({ body: typedBody, userId, stripeCustomerId, @@ -723,16 +747,26 @@ export async function postChatCompletions(params: { logger, insertMessageBigquery, }) - : handleOpenRouterNonStream({ - body: typedBody, - userId, - stripeCustomerId, - agentId, - openrouterApiKey, - fetch, - logger, - insertMessageBigquery, - }) + : shouldUseOpenAIEndpoint + ? handleOpenAINonStream({ + body: typedBody, + userId, + stripeCustomerId, + agentId, + fetch, + logger, + insertMessageBigquery, + }) + : handleOpenRouterNonStream({ + body: typedBody, + userId, + stripeCustomerId, + agentId, + openrouterApiKey, + fetch, + logger, + insertMessageBigquery, + }) const result = await nonStreamRequest trackEvent({ @@ -761,6 +795,10 @@ export async function postChatCompletions(params: { if (error instanceof CanopyWaveError) { canopywaveError = error } + let deepseekError: DeepSeekError | undefined + if (error instanceof DeepSeekError) { + deepseekError = error + } let siliconflowError: SiliconFlowError | undefined if (error instanceof SiliconFlowError) { siliconflowError = error @@ -776,11 +814,13 @@ export async function postChatCompletions(params: { ? 'SiliconFlow' : canopywaveError ? 'CanopyWave' - : fireworksError - ? 'Fireworks' - : openaiError - ? 'OpenAI' - : 'OpenRouter' + : deepseekError + ? 'DeepSeek' + : fireworksError + ? 'Fireworks' + : openaiError + ? 'OpenAI' + : 'OpenRouter' logger.error( { error: getErrorObject(error), @@ -798,6 +838,7 @@ export async function postChatCompletions(params: { openrouterError ?? fireworksError ?? canopywaveError ?? + deepseekError ?? siliconflowError ?? openaiError )?.statusCode, @@ -805,6 +846,7 @@ export async function postChatCompletions(params: { openrouterError ?? fireworksError ?? canopywaveError ?? + deepseekError ?? siliconflowError ?? 
openaiError )?.statusText, @@ -838,6 +880,9 @@ export async function postChatCompletions(params: { if (error instanceof CanopyWaveError) { return NextResponse.json(error.toJSON(), { status: error.statusCode }) } + if (error instanceof DeepSeekError) { + return NextResponse.json(error.toJSON(), { status: error.statusCode }) + } if (error instanceof SiliconFlowError) { return NextResponse.json(error.toJSON(), { status: error.statusCode }) } diff --git a/web/src/llm-api/deepseek.ts b/web/src/llm-api/deepseek.ts new file mode 100644 index 000000000..12ac66265 --- /dev/null +++ b/web/src/llm-api/deepseek.ts @@ -0,0 +1,769 @@ +import { Agent } from 'undici' + +import { deepseekModels } from '@codebuff/common/constants/model-config' +import { PROFIT_MARGIN } from '@codebuff/common/constants/limits' +import { getErrorObject } from '@codebuff/common/util/error' +import { env } from '@codebuff/internal/env' + +import { + consumeCreditsForMessage, + extractRequestMetadata, + insertMessageToBigQuery, +} from './helpers' + +import type { UsageData } from './helpers' +import type { InsertMessageBigqueryFn } from '@codebuff/common/types/contracts/bigquery' +import type { Logger } from '@codebuff/common/types/contracts/logger' +import type { ChatCompletionRequestBody } from './types' + +const DEEPSEEK_BASE_URL = 'https://api.deepseek.com' + +// Extended timeout for deep-thinking models that can take +// a long time to start streaming. +const DEEPSEEK_HEADERS_TIMEOUT_MS = 30 * 60 * 1000 + +const deepseekAgent = new Agent({ + headersTimeout: DEEPSEEK_HEADERS_TIMEOUT_MS, + bodyTimeout: 0, +}) + +// DeepSeek per-token pricing (dollars per token) +interface DeepSeekPricing { + inputCostPerToken: number + cachedInputCostPerToken: number + outputCostPerToken: number +} + +const DEEPSEEK_V4_PRO_PRICING: DeepSeekPricing = { + inputCostPerToken: 0.435 / 1_000_000, + cachedInputCostPerToken: 0.003625 / 1_000_000, + outputCostPerToken: 0.87 / 1_000_000, +} + +/** Single source of truth for DeepSeek model metadata and pricing. + * Kept as one map so adding a model can't drift between routing and billing. */ +const DEEPSEEK_MODELS: Record< + string, + { deepseekId: string; pricing: DeepSeekPricing } +> = { + [deepseekModels.deepseekV4ProDirect]: { + deepseekId: deepseekModels.deepseekV4ProDirect, + pricing: DEEPSEEK_V4_PRO_PRICING, + }, + [deepseekModels.deepseekV4Pro]: { + deepseekId: deepseekModels.deepseekV4ProDirect, + pricing: DEEPSEEK_V4_PRO_PRICING, + }, +} + +const DEEPSEEK_ROUTED_MODELS = new Set(Object.keys(DEEPSEEK_MODELS)) + +export function isDeepSeekModel(model: string): boolean { + return DEEPSEEK_ROUTED_MODELS.has(model) +} + +function getDeepSeekModelId(openrouterModel: string): string { + return DEEPSEEK_MODELS[openrouterModel]?.deepseekId ?? openrouterModel +} + +function getDeepSeekPricing(model: string): DeepSeekPricing { + const entry = DEEPSEEK_MODELS[model] + if (!entry) { + throw new Error(`No DeepSeek pricing found for model: ${model}`) + } + return entry.pricing +} + +type StreamState = { + responseText: string + reasoningText: string + ttftMs: number | null + billedAlready: boolean +} + +type LineResult = { + state: StreamState + billedCredits?: number + patchedLine: string +} + +function toDeepSeekReasoningEffort(effort: unknown): 'high' | 'max' { + return effort === 'max' || effort === 'xhigh' ? 
'max' : 'high'
+}
+
+function createDeepSeekRequest(params: {
+  body: ChatCompletionRequestBody
+  originalModel: string
+  fetch: typeof globalThis.fetch
+}) {
+  const { body, originalModel, fetch } = params
+  const deepseekBody: Record<string, unknown> = {
+    ...body,
+    model: getDeepSeekModelId(originalModel),
+  }
+
+  // DeepSeek uses `thinking` instead of OpenRouter's `reasoning`.
+  if (deepseekBody.reasoning && typeof deepseekBody.reasoning === 'object') {
+    const reasoning = deepseekBody.reasoning as {
+      enabled?: boolean
+      effort?: 'high' | 'medium' | 'low'
+    }
+    deepseekBody.thinking = {
+      type: reasoning.enabled === false ? 'disabled' : 'enabled',
+      reasoning_effort: toDeepSeekReasoningEffort(reasoning.effort),
+    }
+  } else if (deepseekBody.reasoning_effort) {
+    deepseekBody.thinking = {
+      type: 'enabled',
+      reasoning_effort: toDeepSeekReasoningEffort(
+        deepseekBody.reasoning_effort,
+      ),
+    }
+  }
+  delete deepseekBody.reasoning
+  delete deepseekBody.reasoning_effort
+
+  // Strip OpenRouter-specific / internal fields
+  delete deepseekBody.provider
+  delete deepseekBody.transforms
+  delete deepseekBody.codebuff_metadata
+  delete deepseekBody.usage
+
+  // For streaming, request usage in the final chunk
+  if (deepseekBody.stream) {
+    deepseekBody.stream_options = { include_usage: true }
+  }
+
+  if (!env.DEEPSEEK_API_KEY) {
+    throw new Error('DEEPSEEK_API_KEY is not configured')
+  }
+
+  return fetch(`${DEEPSEEK_BASE_URL}/chat/completions`, {
+    method: 'POST',
+    headers: {
+      Authorization: `Bearer ${env.DEEPSEEK_API_KEY}`,
+      'Content-Type': 'application/json',
+    },
+    body: JSON.stringify(deepseekBody),
+    // @ts-expect-error - dispatcher is a valid undici option not in fetch types
+    dispatcher: deepseekAgent,
+  })
+}
+
+function extractUsageAndCost(
+  usage: Record<string, unknown> | undefined | null,
+  model: string,
+): UsageData {
+  if (!usage)
+    return {
+      inputTokens: 0,
+      outputTokens: 0,
+      cacheReadInputTokens: 0,
+      reasoningTokens: 0,
+      cost: 0,
+    }
+  const completionDetails = usage.completion_tokens_details as
+    | Record<string, unknown>
+    | undefined
+    | null
+
+  const inputTokens =
+    typeof usage.prompt_tokens === 'number' ? usage.prompt_tokens : 0
+  const outputTokens =
+    typeof usage.completion_tokens === 'number' ? usage.completion_tokens : 0
+  const cacheReadInputTokens =
+    typeof usage.prompt_cache_hit_tokens === 'number'
+      ? usage.prompt_cache_hit_tokens
+      : 0
+  const reasoningTokens =
+    typeof completionDetails?.reasoning_tokens === 'number'
+      ? 
completionDetails.reasoning_tokens + : 0 + + const pricing = getDeepSeekPricing(model) + const nonCachedInputTokens = Math.max(0, inputTokens - cacheReadInputTokens) + const cost = + nonCachedInputTokens * pricing.inputCostPerToken + + cacheReadInputTokens * pricing.cachedInputCostPerToken + + outputTokens * pricing.outputCostPerToken + + return { + inputTokens, + outputTokens, + cacheReadInputTokens, + reasoningTokens, + cost, + } +} + +export async function handleDeepSeekNonStream({ + body, + userId, + stripeCustomerId, + agentId, + fetch, + logger, + insertMessageBigquery, +}: { + body: ChatCompletionRequestBody + userId: string + stripeCustomerId?: string | null + agentId: string + fetch: typeof globalThis.fetch + logger: Logger + insertMessageBigquery: InsertMessageBigqueryFn +}) { + const originalModel = body.model + const startTime = new Date() + const { clientId, clientRequestId, costMode } = extractRequestMetadata({ + body, + logger, + }) + + const response = await createDeepSeekRequest({ body, originalModel, fetch }) + + if (!response.ok) { + throw await parseDeepSeekError(response) + } + + const data = await response.json() + const content = data.choices?.[0]?.message?.content ?? '' + const reasoningText = + data.choices?.[0]?.message?.reasoning_content ?? + data.choices?.[0]?.message?.reasoning ?? + '' + const usageData = extractUsageAndCost(data.usage, originalModel) + + insertMessageToBigQuery({ + messageId: data.id, + userId, + startTime, + request: body, + reasoningText, + responseText: content, + usageData, + logger, + insertMessageBigquery, + }).catch((error) => { + logger.error({ error }, 'Failed to insert message into BigQuery') + }) + + const billedCredits = await consumeCreditsForMessage({ + messageId: data.id, + userId, + stripeCustomerId, + agentId, + clientId, + clientRequestId, + startTime, + model: originalModel, + reasoningText, + responseText: content, + usageData, + byok: false, + logger, + costMode, + ttftMs: null, // Non-stream - no TTFT to report + }) + + // Overwrite cost so SDK calculates exact credits we charged + if (data.usage) { + data.usage.cost = creditsToFakeCost(billedCredits) + data.usage.cost_details = { upstream_inference_cost: 0 } + } + + // Normalise model name back to OpenRouter format for client compatibility + data.model = originalModel + if (!data.provider) data.provider = 'DeepSeek' + + return data +} + +export async function handleDeepSeekStream({ + body, + userId, + stripeCustomerId, + agentId, + fetch, + logger, + insertMessageBigquery, +}: { + body: ChatCompletionRequestBody + userId: string + stripeCustomerId?: string | null + agentId: string + fetch: typeof globalThis.fetch + logger: Logger + insertMessageBigquery: InsertMessageBigqueryFn +}) { + const originalModel = body.model + const startTime = new Date() + const { clientId, clientRequestId, costMode } = extractRequestMetadata({ + body, + logger, + }) + + const response = await createDeepSeekRequest({ body, originalModel, fetch }) + + if (!response.ok) { + throw await parseDeepSeekError(response) + } + + const reader = response.body?.getReader() + if (!reader) { + throw new Error('Failed to get response reader') + } + + let heartbeatInterval: NodeJS.Timeout + let state: StreamState = { + responseText: '', + reasoningText: '', + ttftMs: null, + billedAlready: false, + } + let clientDisconnected = false + + const stream = new ReadableStream({ + async start(controller) { + const decoder = new TextDecoder() + let buffer = '' + + controller.enqueue( + new TextEncoder().encode(`: 
connected ${new Date().toISOString()}\n`),
+      )
+
+      heartbeatInterval = setInterval(() => {
+        if (!clientDisconnected) {
+          try {
+            controller.enqueue(
+              new TextEncoder().encode(
+                `: heartbeat ${new Date().toISOString()}\n\n`,
+              ),
+            )
+          } catch {
+            // client disconnected
+          }
+        }
+      }, 30000)
+
+      try {
+        let done = false
+        while (!done) {
+          const result = await reader.read()
+          done = result.done
+          const value = result.value
+
+          if (done) break
+
+          buffer += decoder.decode(value, { stream: true })
+          let lineEnd = buffer.indexOf('\n')
+
+          while (lineEnd !== -1) {
+            const line = buffer.slice(0, lineEnd + 1)
+            buffer = buffer.slice(lineEnd + 1)
+
+            const lineResult = await handleLine({
+              userId,
+              stripeCustomerId,
+              agentId,
+              clientId,
+              clientRequestId,
+              costMode,
+              startTime,
+              request: body,
+              originalModel,
+              line,
+              state,
+              logger,
+              insertMessage: insertMessageBigquery,
+            })
+            state = lineResult.state
+
+            if (!clientDisconnected) {
+              try {
+                controller.enqueue(
+                  new TextEncoder().encode(lineResult.patchedLine),
+                )
+              } catch {
+                logger.warn(
+                  'Client disconnected during stream, continuing for billing',
+                )
+                clientDisconnected = true
+              }
+            }
+
+            lineEnd = buffer.indexOf('\n')
+          }
+        }
+
+        if (!clientDisconnected) {
+          controller.close()
+        }
+      } catch (error) {
+        if (!clientDisconnected) {
+          controller.error(error)
+        } else {
+          logger.warn(
+            getErrorObject(error),
+            'Error after client disconnect in DeepSeek stream',
+          )
+        }
+      } finally {
+        clearInterval(heartbeatInterval)
+      }
+    },
+    cancel() {
+      clearInterval(heartbeatInterval)
+      clientDisconnected = true
+      logger.warn(
+        {
+          clientDisconnected,
+          responseTextLength: state.responseText.length,
+          reasoningTextLength: state.reasoningText.length,
+        },
+        'Client cancelled stream, continuing DeepSeek consumption for billing',
+      )
+    },
+  })
+
+  return stream
+}
+
+async function handleLine({
+  userId,
+  stripeCustomerId,
+  agentId,
+  clientId,
+  clientRequestId,
+  costMode,
+  startTime,
+  request,
+  originalModel,
+  line,
+  state,
+  logger,
+  insertMessage,
+}: {
+  userId: string
+  stripeCustomerId?: string | null
+  agentId: string
+  clientId: string | null
+  clientRequestId: string | null
+  costMode: string | undefined
+  startTime: Date
+  request: unknown
+  originalModel: string
+  line: string
+  state: StreamState
+  logger: Logger
+  insertMessage: InsertMessageBigqueryFn
+}): Promise<LineResult> {
+  if (!line.startsWith('data: ')) {
+    return { state, patchedLine: line }
+  }
+
+  const raw = line.slice('data: '.length)
+  if (raw === '[DONE]\n' || raw === '[DONE]') {
+    return { state, patchedLine: line }
+  }
+
+  let obj: Record<string, unknown>
+  try {
+    obj = JSON.parse(raw)
+  } catch (error) {
+    logger.warn(
+      { error: getErrorObject(error, { includeRawError: true }) },
+      'Received non-JSON DeepSeek response',
+    )
+    return { state, patchedLine: line }
+  }
+
+  // Patch model and provider for SDK compatibility
+  if (obj.model) obj.model = originalModel
+  if (!obj.provider) obj.provider = 'DeepSeek'
+
+  // Process the chunk for billing / state tracking
+  const result = await handleResponse({
+    userId,
+    stripeCustomerId,
+    agentId,
+    clientId,
+    clientRequestId,
+    costMode,
+    startTime,
+    request,
+    originalModel,
+    data: obj,
+    state,
+    logger,
+    insertMessage,
+  })
+
+  // If this is the final chunk with billing, overwrite cost in the patched object
+  if (result.billedCredits !== undefined && obj.usage) {
+    const usage = obj.usage as Record<string, unknown>
+    usage.cost = creditsToFakeCost(result.billedCredits)
+    usage.cost_details = { upstream_inference_cost: 0 }
+  }
+
+  const 
patchedLine = `data: ${JSON.stringify(obj)}\n`
+  return {
+    state: result.state,
+    billedCredits: result.billedCredits,
+    patchedLine,
+  }
+}
+
+function isFinalChunk(data: Record<string, unknown>): boolean {
+  const choices = data.choices as Array<Record<string, unknown>> | undefined
+  if (!choices || choices.length === 0) return true
+  return choices.some((c) => c.finish_reason != null)
+}
+
+async function handleResponse({
+  userId,
+  stripeCustomerId,
+  agentId,
+  clientId,
+  clientRequestId,
+  costMode,
+  startTime,
+  request,
+  originalModel,
+  data,
+  state,
+  logger,
+  insertMessage,
+}: {
+  userId: string
+  stripeCustomerId?: string | null
+  agentId: string
+  clientId: string | null
+  clientRequestId: string | null
+  costMode: string | undefined
+  startTime: Date
+  request: unknown
+  originalModel: string
+  data: Record<string, unknown>
+  state: StreamState
+  logger: Logger
+  insertMessage: InsertMessageBigqueryFn
+}): Promise<{ state: StreamState; billedCredits?: number }> {
+  state = handleStreamChunk({
+    data,
+    state,
+    startTime,
+    logger,
+    userId,
+    agentId,
+    model: originalModel,
+  })
+
+  // Some providers send cumulative usage on EVERY chunk (not just the final one),
+  // so we must only bill once on the final chunk to avoid charging N times.
+  if (
+    'error' in data ||
+    !data.usage ||
+    state.billedAlready ||
+    !isFinalChunk(data)
+  ) {
+    // Strip usage from non-final chunks and duplicate final chunks
+    // so the SDK doesn't see multiple usage objects
+    if (data.usage && (!isFinalChunk(data) || state.billedAlready)) {
+      delete data.usage
+    }
+    return { state }
+  }
+
+  const usageData = extractUsageAndCost(
+    data.usage as Record<string, unknown>,
+    originalModel,
+  )
+  const messageId = typeof data.id === 'string' ? data.id : 'unknown'
+
+  state.billedAlready = true
+
+  insertMessageToBigQuery({
+    messageId,
+    userId,
+    startTime,
+    request,
+    reasoningText: state.reasoningText,
+    responseText: state.responseText,
+    usageData,
+    logger,
+    insertMessageBigquery: insertMessage,
+  }).catch((error) => {
+    logger.error({ error }, 'Failed to insert message into BigQuery')
+  })
+
+  const billedCredits = await consumeCreditsForMessage({
+    messageId,
+    userId,
+    stripeCustomerId,
+    agentId,
+    clientId,
+    clientRequestId,
+    startTime,
+    model: originalModel,
+    reasoningText: state.reasoningText,
+    responseText: state.responseText,
+    usageData,
+    byok: false,
+    logger,
+    costMode,
+    ttftMs: state.ttftMs,
+  })
+
+  return { state, billedCredits }
+}
+
+function handleStreamChunk({
+  data,
+  state,
+  startTime,
+  logger,
+  userId,
+  agentId,
+  model,
+}: {
+  data: Record<string, unknown>
+  state: StreamState
+  startTime: Date
+  logger: Logger
+  userId: string
+  agentId: string
+  model: string
+}): StreamState {
+  const MAX_BUFFER_SIZE = 1 * 1024 * 1024
+
+  if ('error' in data) {
+    const errorData = data.error as Record<string, unknown>
+    logger.error(
+      {
+        userId,
+        agentId,
+        model,
+        errorCode: errorData?.code,
+        errorType: errorData?.type,
+        errorMessage: errorData?.message,
+      },
+      'Received error chunk in DeepSeek stream',
+    )
+    return state
+  }
+
+  const choices = data.choices as Array<Record<string, unknown>> | undefined
+  if (!choices?.length) {
+    return state
+  }
+  const choice = choices[0]
+  const delta = choice.delta as Record<string, unknown> | undefined
+
+  const contentDelta = typeof delta?.content === 'string' ? 
delta.content : ''
+  if (state.responseText.length < MAX_BUFFER_SIZE) {
+    state.responseText += contentDelta
+    if (state.responseText.length >= MAX_BUFFER_SIZE) {
+      state.responseText =
+        state.responseText.slice(0, MAX_BUFFER_SIZE) + '\n---[TRUNCATED]---'
+      logger.warn(
+        { userId, agentId, model },
+        'Response text buffer truncated at 1MB',
+      )
+    }
+  }
+
+  const reasoningDelta =
+    typeof delta?.reasoning_content === 'string'
+      ? delta.reasoning_content
+      : typeof delta?.reasoning === 'string'
+        ? delta.reasoning
+        : ''
+
+  // Track time to first token (TTFT) - set on first meaningful delta (content, reasoning, or tool_calls)
+  const hasToolCallsDelta =
+    delta?.tool_calls != null && (delta.tool_calls as unknown[])?.length > 0
+  if (
+    state.ttftMs === null &&
+    (contentDelta !== '' || reasoningDelta !== '' || hasToolCallsDelta)
+  ) {
+    state.ttftMs = Date.now() - startTime.getTime()
+  }
+
+  if (state.reasoningText.length < MAX_BUFFER_SIZE) {
+    state.reasoningText += reasoningDelta
+    if (state.reasoningText.length >= MAX_BUFFER_SIZE) {
+      state.reasoningText =
+        state.reasoningText.slice(0, MAX_BUFFER_SIZE) + '\n---[TRUNCATED]---'
+      logger.warn(
+        { userId, agentId, model },
+        'Reasoning text buffer truncated at 1MB',
+      )
+    }
+  }
+
+  return state
+}
+
+export class DeepSeekError extends Error {
+  constructor(
+    public readonly statusCode: number,
+    public readonly statusText: string,
+    public readonly errorBody: {
+      error: {
+        message: string
+        code: string | number | null
+        type?: string | null
+      }
+    },
+  ) {
+    super(errorBody.error.message)
+    this.name = 'DeepSeekError'
+  }
+
+  toJSON() {
+    return {
+      error: {
+        message: this.errorBody.error.message,
+        code: this.errorBody.error.code,
+        type: this.errorBody.error.type,
+      },
+    }
+  }
+}
+
+async function parseDeepSeekError(response: Response): Promise<DeepSeekError> {
+  const errorText = await response.text()
+  let errorBody: DeepSeekError['errorBody']
+  try {
+    const parsed = JSON.parse(errorText)
+    if (parsed?.error?.message) {
+      errorBody = {
+        error: {
+          message: parsed.error.message,
+          code: parsed.error.code ?? null,
+          type: parsed.error.type ?? null,
+        },
+      }
+    } else {
+      errorBody = {
+        error: {
+          message: errorText || response.statusText,
+          code: response.status,
+        },
+      }
+    }
+  } catch {
+    errorBody = {
+      error: {
+        message: errorText || response.statusText,
+        code: response.status,
+      },
+    }
+  }
+  return new DeepSeekError(response.status, response.statusText, errorBody)
+}
+
+function creditsToFakeCost(credits: number): number {
+  return credits / ((1 + PROFIT_MARGIN) * 100)
+}
diff --git a/web/src/server/free-session/config.ts b/web/src/server/free-session/config.ts
index cbde91678..bb5ee7e48 100644
--- a/web/src/server/free-session/config.ts
+++ b/web/src/server/free-session/config.ts
@@ -1,4 +1,5 @@
 import {
+  FREEBUFF_DEEPSEEK_V4_PRO_MODEL_ID,
   FREEBUFF_GEMINI_PRO_MODEL_ID,
   FREEBUFF_GLM_MODEL_ID,
   FREEBUFF_KIMI_MODEL_ID,
@@ -54,6 +55,7 @@ export function getSessionGraceMs(): number {
  * queue).
 
*/
 const INSTANT_ADMIT_CAPACITY: Record<string, number> = {
+  [FREEBUFF_DEEPSEEK_V4_PRO_MODEL_ID]: 50,
   [FREEBUFF_GEMINI_PRO_MODEL_ID]: 50,
   [FREEBUFF_GLM_MODEL_ID]: 50,
   [FREEBUFF_KIMI_MODEL_ID]: 50,

From 415161ca5bc5be1e6b0550b9598217a9a5f67497 Mon Sep 17 00:00:00 2001
From: James Grugett
Date: Thu, 30 Apr 2026 17:01:40 -0700
Subject: [PATCH 2/6] Stabilize free-mode rate limit test

---
 .../completions/__tests__/completions.test.ts | 80 +++++++++++--------
 1 file changed, 47 insertions(+), 33 deletions(-)

diff --git a/web/src/app/api/v1/chat/completions/__tests__/completions.test.ts b/web/src/app/api/v1/chat/completions/__tests__/completions.test.ts
index 12965104b..253e85c6e 100644
--- a/web/src/app/api/v1/chat/completions/__tests__/completions.test.ts
+++ b/web/src/app/api/v1/chat/completions/__tests__/completions.test.ts
@@ -8,10 +8,7 @@ import {
   isFreebuffDeploymentHours,
 } from '@codebuff/common/constants/freebuff-models'
 import { formatQuotaResetCountdown, postChatCompletions } from '../_post'
-import {
-  checkFreeModeRateLimit,
-  resetFreeModeRateLimits,
-} from '../free-mode-rate-limiter'
+import { resetFreeModeRateLimits } from '../free-mode-rate-limiter'
 
 import type { TrackEventFn } from '@codebuff/common/types/contracts/analytics'
 import type { InsertMessageBigqueryFn } from '@codebuff/common/types/contracts/bigquery'
@@ -49,6 +46,10 @@ describe('/api/v1/chat/completions POST endpoint', () => {
       id: 'user-new-free-gemini',
       banned: false,
     },
+    'test-api-key-reviewer-rate-limit': {
+      id: 'user-reviewer-rate-limit',
+      banned: false,
+    },
   }
 
   const mockGetUserInfoFromApiKey: GetUserInfoFromApiKeyFn = async ({
@@ -1006,36 +1007,49 @@
     expect(body.error).toBe('free_mode_invalid_agent_hierarchy')
   })
 
-  it('counts child reviewer Gemini requests toward the free-mode request limit', async () => {
-    const response = await postChatCompletions({
-      req: new NextRequest('http://localhost:3000/api/v1/chat/completions', {
-        method: 'POST',
-        headers: allowedFreeModeHeaders('test-api-key-new-free-gemini'),
-        body: JSON.stringify({
-          model: FREEBUFF_GEMINI_PRO_MODEL_ID,
-          stream: false,
-          codebuff_metadata: {
-            run_id: 'run-reviewer-child',
-            client_id: 'test-client-id-123',
-            cost_mode: 'free',
-          },
-        }),
-      }),
-      getUserInfoFromApiKey: mockGetUserInfoFromApiKey,
-      logger: mockLogger,
-      trackEvent: mockTrackEvent,
-      getUserUsageData: mockGetUserUsageData,
-      getAgentRunFromId: mockGetAgentRunFromId,
-      fetch: mockFetch,
-      insertMessageBigquery: mockInsertMessageBigquery,
-      loggerWithContext: mockLoggerWithContext,
-      checkSessionAdmissible: mockCheckSessionAdmissibleAllow,
-    })
+  it(
+    'counts child reviewer Gemini requests toward the free-mode request limit',
+    async () => {
+      const createRequest = () =>
+        new NextRequest('http://localhost:3000/api/v1/chat/completions', {
+          method: 'POST',
+          headers: allowedFreeModeHeaders('test-api-key-reviewer-rate-limit'),
+          body: JSON.stringify({
+            model: FREEBUFF_GEMINI_PRO_MODEL_ID,
+            stream: false,
+            codebuff_metadata: {
+              run_id: 'run-reviewer-child',
+              client_id: 'test-client-id-123',
+              cost_mode: 'free',
+            },
+          }),
+        })
 
-    expect(response.status).toBe(200)
-    expect(checkFreeModeRateLimit('user-new-free-gemini').limited).toBe(false)
-    expect(checkFreeModeRateLimit('user-new-free-gemini').limited).toBe(true)
-  })
+      const createPostParams = () => ({
+        req: createRequest(),
+        getUserInfoFromApiKey: mockGetUserInfoFromApiKey,
+        logger: mockLogger,
+        trackEvent: mockTrackEvent,
+        getUserUsageData: mockGetUserUsageData,
+        
getAgentRunFromId: mockGetAgentRunFromId, + fetch: mockFetch, + insertMessageBigquery: mockInsertMessageBigquery, + loggerWithContext: mockLoggerWithContext, + checkSessionAdmissible: mockCheckSessionAdmissibleAllow, + }) + + const firstResponse = await postChatCompletions(createPostParams()) + const secondResponse = await postChatCompletions(createPostParams()) + const limitedResponse = await postChatCompletions(createPostParams()) + + expect(firstResponse.status).toBe(200) + expect(secondResponse.status).toBe(200) + expect(limitedResponse.status).toBe(429) + const body = await limitedResponse.json() + expect(body.error).toBe('free_mode_rate_limited') + }, + FETCH_PATH_TEST_TIMEOUT_MS, + ) it( 'skips credit check when in FREE mode even with 0 credits', From d5135d6534e94dc89a16973c2ae37f100f5153a0 Mon Sep 17 00:00:00 2001 From: James Grugett Date: Fri, 1 May 2026 12:13:32 -0700 Subject: [PATCH 3/6] Preserve DeepSeek reasoning for tool calls --- .../agent-runtime/src/tools/stream-parser.ts | 5 +++ ...to-openai-compatible-chat-messages.test.ts | 37 +++++++++++++++++++ ...vert-to-openai-compatible-chat-messages.ts | 7 ++++ 3 files changed, 49 insertions(+) diff --git a/packages/agent-runtime/src/tools/stream-parser.ts b/packages/agent-runtime/src/tools/stream-parser.ts index 8dbda8bdc..5abd5ee50 100644 --- a/packages/agent-runtime/src/tools/stream-parser.ts +++ b/packages/agent-runtime/src/tools/stream-parser.ts @@ -276,6 +276,11 @@ export async function processStream( } if (chunk.type === 'reasoning') { + if (chunk.text) { + assistantMessages.push( + assistantMessage({ type: 'reasoning', text: chunk.text }), + ) + } onResponseChunk({ type: 'reasoning_delta', text: chunk.text, diff --git a/packages/internal/src/openai-compatible/chat/convert-to-openai-compatible-chat-messages.test.ts b/packages/internal/src/openai-compatible/chat/convert-to-openai-compatible-chat-messages.test.ts index a24d72499..2f2274567 100644 --- a/packages/internal/src/openai-compatible/chat/convert-to-openai-compatible-chat-messages.test.ts +++ b/packages/internal/src/openai-compatible/chat/convert-to-openai-compatible-chat-messages.test.ts @@ -509,6 +509,43 @@ describe('provider-specific metadata merging', () => { ]) }) + it('should preserve assistant reasoning content with tool calls', () => { + const result = convertToOpenAICompatibleChatMessages([ + { + role: 'assistant', + content: [ + { type: 'reasoning', text: 'Need the date first. ' }, + { type: 'reasoning', text: 'Then call weather.' }, + { type: 'text', text: 'Checking that now...' }, + { + type: 'tool-call', + toolCallId: 'call1', + toolName: 'get_weather', + input: { location: 'Hangzhou' }, + }, + ], + }, + ]) + + expect(result).toEqual([ + { + role: 'assistant', + content: 'Checking that now...', + reasoning_content: 'Need the date first. 
Then call weather.', + tool_calls: [ + { + id: 'call1', + type: 'function', + function: { + name: 'get_weather', + arguments: JSON.stringify({ location: 'Hangzhou' }), + }, + }, + ], + }, + ]) + }) + it('should handle a single tool role message with multiple tool-result parts', () => { const result = convertToOpenAICompatibleChatMessages([ { diff --git a/packages/internal/src/openai-compatible/chat/convert-to-openai-compatible-chat-messages.ts b/packages/internal/src/openai-compatible/chat/convert-to-openai-compatible-chat-messages.ts index 30a27cf6c..ec1945a8f 100644 --- a/packages/internal/src/openai-compatible/chat/convert-to-openai-compatible-chat-messages.ts +++ b/packages/internal/src/openai-compatible/chat/convert-to-openai-compatible-chat-messages.ts @@ -65,6 +65,7 @@ export function convertToOpenAICompatibleChatMessages( case 'assistant': { let text = '' + let reasoningContent = '' const toolCalls: Array<{ id: string type: 'function' @@ -78,6 +79,10 @@ export function convertToOpenAICompatibleChatMessages( text += part.text break } + case 'reasoning': { + reasoningContent += part.text + break + } case 'tool-call': { toolCalls.push({ id: part.toolCallId, @@ -96,6 +101,8 @@ export function convertToOpenAICompatibleChatMessages( messages.push({ role: 'assistant', content: text, + reasoning_content: + reasoningContent.length > 0 ? reasoningContent : undefined, tool_calls: toolCalls.length > 0 ? toolCalls : undefined, ...metadata, }) From e4a97a623073176046906c6f89b6ae4e6f7ee384 Mon Sep 17 00:00:00 2001 From: James Grugett Date: Fri, 1 May 2026 15:11:47 -0700 Subject: [PATCH 4/6] Gate reasoning history replay --- common/src/constants/freebuff-models.ts | 6 ------ packages/agent-runtime/src/constants.ts | 4 ++++ packages/agent-runtime/src/tools/stream-parser.ts | 3 ++- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/common/src/constants/freebuff-models.ts b/common/src/constants/freebuff-models.ts index 84daca5d8..c03099bd4 100644 --- a/common/src/constants/freebuff-models.ts +++ b/common/src/constants/freebuff-models.ts @@ -49,12 +49,6 @@ export const FREEBUFF_MODELS = [ tagline: 'Deepest, 1/day', availability: 'always', }, - { - id: FREEBUFF_DEEPSEEK_V4_PRO_MODEL_ID, - displayName: 'DeepSeek V4 Pro', - tagline: 'Experimental', - availability: 'always', - }, { id: FREEBUFF_MINIMAX_MODEL_ID, displayName: 'MiniMax M2.7', diff --git a/packages/agent-runtime/src/constants.ts b/packages/agent-runtime/src/constants.ts index d2981d456..16508a0bb 100644 --- a/packages/agent-runtime/src/constants.ts +++ b/packages/agent-runtime/src/constants.ts @@ -9,3 +9,7 @@ export const globalStopSequence = `${JSON.stringify(endsAgentStepParam)}` * to diff sequential requests and find what's breaking prompt caching. */ export const CACHE_DEBUG_FULL_LOGGING = false + +// Keep disabled by default to preserve mainline behavior until reasoning-token +// replay has been tested more thoroughly. 
+export const INCLUDE_REASONING_IN_MESSAGE_HISTORY = false diff --git a/packages/agent-runtime/src/tools/stream-parser.ts b/packages/agent-runtime/src/tools/stream-parser.ts index 5abd5ee50..df4e33bef 100644 --- a/packages/agent-runtime/src/tools/stream-parser.ts +++ b/packages/agent-runtime/src/tools/stream-parser.ts @@ -8,6 +8,7 @@ import { import { generateCompactId } from '@codebuff/common/util/string' import { processStreamWithTools } from '../tool-stream-parser' +import { INCLUDE_REASONING_IN_MESSAGE_HISTORY } from '../constants' import { executeCustomToolCall, executeToolCall, @@ -276,7 +277,7 @@ export async function processStream( } if (chunk.type === 'reasoning') { - if (chunk.text) { + if (INCLUDE_REASONING_IN_MESSAGE_HISTORY && chunk.text) { assistantMessages.push( assistantMessage({ type: 'reasoning', text: chunk.text }), ) From 63e3ded3f7a7f5451ef19396ea9b3f3c6d911e92 Mon Sep 17 00:00:00 2001 From: James Grugett Date: Fri, 1 May 2026 15:58:14 -0700 Subject: [PATCH 5/6] Stabilize Gemini thinker rate limit test --- .../chat/completions/__tests__/completions.test.ts | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/web/src/app/api/v1/chat/completions/__tests__/completions.test.ts b/web/src/app/api/v1/chat/completions/__tests__/completions.test.ts index 1ceb730c5..f73b568bb 100644 --- a/web/src/app/api/v1/chat/completions/__tests__/completions.test.ts +++ b/web/src/app/api/v1/chat/completions/__tests__/completions.test.ts @@ -53,6 +53,10 @@ describe('/api/v1/chat/completions POST endpoint', () => { id: 'user-reviewer-rate-limit', banned: false, }, + 'test-api-key-gemini-rate-limit': { + id: 'user-gemini-rate-limit', + banned: false, + }, } const mockGetUserInfoFromApiKey: GetUserInfoFromApiKeyFn = async ({ @@ -1100,10 +1104,14 @@ describe('/api/v1/chat/completions POST endpoint', () => { it( 'counts child Gemini thinker requests toward the free-mode request limit', async () => { + expect(checkFreeModeRateLimit('user-gemini-rate-limit').limited).toBe( + false, + ) + const createRequest = () => new NextRequest('http://localhost:3000/api/v1/chat/completions', { method: 'POST', - headers: allowedFreeModeHeaders('test-api-key-new-free-gemini'), + headers: allowedFreeModeHeaders('test-api-key-gemini-rate-limit'), body: JSON.stringify({ model: FREEBUFF_GEMINI_PRO_MODEL_ID, stream: false, @@ -1130,11 +1138,9 @@ describe('/api/v1/chat/completions POST endpoint', () => { }) const firstResponse = await postChatCompletions(createPostParams()) - const secondResponse = await postChatCompletions(createPostParams()) const limitedResponse = await postChatCompletions(createPostParams()) expect(firstResponse.status).toBe(200) - expect(secondResponse.status).toBe(200) expect(limitedResponse.status).toBe(429) const body = await limitedResponse.json() expect(body.error).toBe('free_mode_rate_limited') From ac3312462b9cf2a23825992951e648897e9aac2c Mon Sep 17 00:00:00 2001 From: James Grugett Date: Fri, 1 May 2026 16:17:01 -0700 Subject: [PATCH 6/6] Inject free mode rate limiter in completions tests --- .../completions/__tests__/completions.test.ts | 17 ++++++++++++++--- web/src/app/api/v1/chat/completions/_post.ts | 7 ++++++- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/web/src/app/api/v1/chat/completions/__tests__/completions.test.ts b/web/src/app/api/v1/chat/completions/__tests__/completions.test.ts index f73b568bb..f5f329d25 100644 --- a/web/src/app/api/v1/chat/completions/__tests__/completions.test.ts +++ 
b/web/src/app/api/v1/chat/completions/__tests__/completions.test.ts @@ -1104,9 +1104,18 @@ describe('/api/v1/chat/completions POST endpoint', () => { it( 'counts child Gemini thinker requests toward the free-mode request limit', async () => { - expect(checkFreeModeRateLimit('user-gemini-rate-limit').limited).toBe( - false, - ) + let rateLimitChecks = 0 + const checkFreeModeRateLimitForTest = mock((userId: string) => { + expect(userId).toBe('user-gemini-rate-limit') + rateLimitChecks += 1 + return rateLimitChecks === 1 + ? { limited: false as const } + : { + limited: true as const, + windowName: '1 second', + retryAfterMs: 1_000, + } + }) const createRequest = () => new NextRequest('http://localhost:3000/api/v1/chat/completions', { @@ -1135,6 +1144,7 @@ describe('/api/v1/chat/completions POST endpoint', () => { insertMessageBigquery: mockInsertMessageBigquery, loggerWithContext: mockLoggerWithContext, checkSessionAdmissible: mockCheckSessionAdmissibleAllow, + checkFreeModeRateLimit: checkFreeModeRateLimitForTest, }) const firstResponse = await postChatCompletions(createPostParams()) @@ -1144,6 +1154,7 @@ describe('/api/v1/chat/completions POST endpoint', () => { expect(limitedResponse.status).toBe(429) const body = await limitedResponse.json() expect(body.error).toBe('free_mode_rate_limited') + expect(checkFreeModeRateLimitForTest).toHaveBeenCalledTimes(2) }, FETCH_PATH_TEST_TIMEOUT_MS, ) diff --git a/web/src/app/api/v1/chat/completions/_post.ts b/web/src/app/api/v1/chat/completions/_post.ts index bc37e3dfe..fd435cf3e 100644 --- a/web/src/app/api/v1/chat/completions/_post.ts +++ b/web/src/app/api/v1/chat/completions/_post.ts @@ -78,7 +78,7 @@ import { getFreeModeCountryAccess } from '@/server/free-mode-country' import type { SessionGateResult } from '@/server/free-session/public-api' import { extractApiKeyFromHeader } from '@/util/auth' import { withDefaultProperties } from '@codebuff/common/analytics' -import { checkFreeModeRateLimit } from './free-mode-rate-limiter' +import { checkFreeModeRateLimit as defaultCheckFreeModeRateLimit } from './free-mode-rate-limiter' export const formatQuotaResetCountdown = ( nextQuotaReset: string | null | undefined, @@ -117,6 +117,7 @@ export const formatQuotaResetCountdown = ( } export type CheckSessionAdmissibleFn = typeof checkSessionAdmissible +export type CheckFreeModeRateLimitFn = typeof defaultCheckFreeModeRateLimit type GateRejectCode = Extract['code'] @@ -147,6 +148,9 @@ export async function postChatCompletions(params: { /** Optional override for the freebuff waiting-room gate. Defaults to the * real check backed by Postgres; tests inject a no-op. */ checkSessionAdmissible?: CheckSessionAdmissibleFn + /** Optional override for the free-mode rate limiter. Tests inject this to + * avoid coupling to process-global limiter state. */ + checkFreeModeRateLimit?: CheckFreeModeRateLimitFn }) { const { req, @@ -159,6 +163,7 @@ export async function postChatCompletions(params: { ensureSubscriberBlockGrant, getUserPreferences, checkSessionAdmissible: checkSession = checkSessionAdmissible, + checkFreeModeRateLimit = defaultCheckFreeModeRateLimit, } = params let { logger } = params let { trackEvent } = params
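
Usage sketch (not part of the patch series): with the injectable limiter from
PATCH 6/6, a completions test can pin rate-limit behavior without touching the
process-global limiter state. `postChatCompletions` and
`CheckFreeModeRateLimitFn` are the exports added in `_post.ts` above; the
`alwaysLimited` stub and `baseParams` names are hypothetical, for illustration
only.

import { mock } from 'bun:test'

import { postChatCompletions } from '../_post'

import type { CheckFreeModeRateLimitFn } from '../_post'

// Hypothetical stub: report every user as over the free-mode limit, using the
// same return shape the injected mock in the test above uses.
const alwaysLimited: CheckFreeModeRateLimitFn = mock(() => ({
  limited: true as const,
  windowName: '1 second',
  retryAfterMs: 1_000,
}))

// Injected alongside the other dependencies (baseParams is assumed to hold
// the req and mocks shown in the tests above):
//   const response = await postChatCompletions({
//     ...baseParams,
//     checkFreeModeRateLimit: alwaysLimited,
//   })
//   expect(response.status).toBe(429)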