MoonshotAI · bj456736 · Jun 9, 2026 · chatgpt-codex-connector · Jun 9, 2026
diff --git a/packages/agent-core/src/agent/context/index.ts b/packages/agent-core/src/agent/context/index.ts
@@ -47,6 +47,14 @@ export class ContextMemory {
       toolCalls: [],
       origin,
     });
+    // When the user message contains video file references, remind the model
+    // to use ReadMediaFile instead of writing Python scripts to extract frames.
+    if (hasVideoTag(content)) {
+      this.appendSystemReminder(
+        'The user provided a video file. Use the ReadMediaFile tool to read and analyze the video content directly. Do not write Python scripts or other code to extract frames from the video.',
+        { kind: 'injection', variant: 'host' },
+      );
+    }
   }
 
   appendSystemReminder(content: string, origin: PromptOrigin): void {
@@ -307,6 +315,14 @@ export class ContextMemory {
   }
 }
 
+/** Reports whether any text part embeds a `<video path="…"></video>` tag with a whitespace-only body. */
+function hasVideoTag(content: readonly ContentPart[]): boolean {
+  const videoTag = /<video\s+path="[^"]+"\s*>\s*<\/video>/;
+  return content.some((part) => {
+    return part.type === 'text' && videoTag.test(part.text);
+  });
+}
+
 function toolResultOutputForModel(result: ExecutableToolResult): string | ContentPart[] {
   const output = result.output;
   if (typeof output === 'string') {

diff --git a/packages/agent-core/test/agent/context.test.ts b/packages/agent-core/test/agent/context.test.ts
@@ -218,6 +218,45 @@ describe('Agent context', () => {
     await ctx.expectResumeMatches();
   });
 
+  it('adds a system reminder when user message contains a video tag', async () => {
+    // Arrange: set up the test agent and queue a single mocked text response.
+    const ctx = testAgent();
+    ctx.configure();
+    ctx.mockNextResponse({ type: 'text', text: 'got it' });
+    // Act: prompt with text that embeds a <video path="…"></video> tag.
+    const videoPrompt = '分析这个视频 <video path="/tmp/test.mp4"></video>';
+    await ctx.rpc.prompt({ input: [{ type: 'text', text: videoPrompt }] });
+    await ctx.untilTurnEnd();
+
+    // Assert: the history recorded for the last LLM call contains the reminder text.
+    const lastCall = ctx.llmCalls.at(-1);
+    expect(lastCall).toBeDefined();
+    const allText = lastCall!.history
+      .flatMap((message) => message.content)
+      .map((part) => (part.type === 'text' ? part.text : ''))
+      .join('');
+    expect(allText).toContain('The user provided a video file');
+    expect(allText).toContain('ReadMediaFile');
+    await ctx.expectResumeMatches();
+  });
+
+  it('does not add a video reminder when user message has no video tag', async () => {
+    // A plain-text prompt (no <video> tag) must leave the history free of the reminder.
+    const ctx = testAgent();
+    ctx.configure();
+    ctx.mockNextResponse({ type: 'text', text: 'ok' });
+    await ctx.rpc.prompt({ input: [{ type: 'text', text: 'hello' }] });
+    await ctx.untilTurnEnd();
+    const lastCall = ctx.llmCalls.at(-1);
+    expect(lastCall).toBeDefined();
+    const allText = lastCall!.history
+      .flatMap((message) => message.content)
+      .map((part) => (part.type === 'text' ? part.text : ''))
+      .join('');
+    expect(allText).not.toContain('The user provided a video file');
+    await ctx.expectResumeMatches();
+  });
+
   it('keeps system reminders separate from real user prompts', async () => {
     const ctx = testAgent();
     ctx.configure();
@@ -237,6 +276,7 @@ describe('Agent context', () => {
         user: text "<system-reminder>\\nRemember the host note.\\n</system-reminder>"
         user: text "Real user prompt"
     `);
+    await ctx.expectResumeMatches();
   });
 
   it('defers system reminders until pending tool results are recorded and resumed', async () => {