From a178dde1e6bc0d94170bd01bffa67f789f0dc5c0 Mon Sep 17 00:00:00 2001
From: Drew Stone <drewstone329@gmail.com>
Date: Wed, 1 Jul 2026 20:38:12 -0600
Subject: [PATCH] feat(runtime): reasoning-aware routerChatWithUsage

Two gaps, each hit twice by real experiment runs in agent-lab (R230 local
routing, R231 provider-invariance grid):

1. reasoning controls were dropped: the request body forwarded only
   model/messages/temperature/max_tokens, so a thinking model asked for a
   binary decision burned its whole budget inside the think block. On a
   slow (CPU-local) backend that becomes a client timeout, and 20/20 audit
   cells died as "fetch failed". opts.reasoningEffort is now forwarded as
   reasoning_effort ('none' is the load-bearing value for routing/gating).

2. reasoning and content were conflated: OpenRouter returns reasoning in
   a separate field with clean content; Groq inlines a <think> block into
   content. Downstream single-token parsers read the reasoning prose
   (which quotes both option tokens) and misread decisions, making the
   SAME weights look broken on one provider and fine on another (R231:
   groq qwen3-32b "10/20" vs openrouter "20/20", both actually correct).
   parseChatResult now splits both shapes into RouterChatResult.reasoning
   and always-clean content; an unclosed think block (budget exhausted
   mid-thought) yields empty content, which is honest: no answer was
   emitted.

Additive: no call-site changes required; non-thinking responses are
byte-identical. 6 new tests cover effort forwarding, both provider
shapes, reasoning_content (DeepSeek/Kimi), unclosed think, and the
unchanged non-thinking path.
---
 src/runtime/router-client.complete.test.ts | 79 ++++++++++++++++++++++
 src/runtime/router-client.ts               | 63 ++++++++++++++++-
 2 files changed, 139 insertions(+), 3 deletions(-)
diff --git a/src/runtime/router-client.complete.test.ts b/src/runtime/router-client.complete.test.ts
index 697811ea..15d1c329 100644
--- a/src/runtime/router-client.complete.test.ts
+++ b/src/runtime/router-client.complete.test.ts
@@ -87,3 +87,82 @@ describe('RouterConfig.complete — the injected completion transport', () => {
     expect(fetchSpy).toHaveBeenCalledOnce()
   })
 })
+
+describe('reasoning-aware parsing and reasoning_effort forwarding', () => {
+  type Transport = (body: Record<string, unknown>) => Promise<unknown>
+
+  // Router config whose injected `complete` transport stands in for the network call.
+  const configWith = (complete: Transport) => ({
+    routerBaseUrl: 'http://router.test/v1',
+    routerKey: 'k',
+    model: 'qwen/qwen3-32b',
+    complete,
+  })
+
+  // Transport that ignores the request body and answers with `message` as the only choice.
+  const replying =
+    (message: Record<string, unknown>, extra: Record<string, unknown> = {}): Transport =>
+    async () => ({ choices: [{ message }], ...extra })
+
+  // Sends a single user turn through routerChatWithUsage over the given transport.
+  const ask = (complete: Transport, prompt = 'route') =>
+    routerChatWithUsage(configWith(complete), [{ role: 'user', content: prompt }])
+
+  it('forwards reasoningEffort as reasoning_effort, omits it when unset', async () => {
+    const bodies: Record<string, unknown>[] = []
+    const recording: Transport = async (body) => {
+      bodies.push(body)
+      return { choices: [{ message: { content: 'ABSTAIN' } }] }
+    }
+    const turn = [{ role: 'user', content: 'route' }]
+    // First call opts in to 'none'; the second passes no opts at all.
+    await routerChatWithUsage(configWith(recording), turn, { reasoningEffort: 'none' })
+    await routerChatWithUsage(configWith(recording), turn)
+    const [withEffort, withoutEffort = {}] = bodies
+    expect(withEffort?.reasoning_effort).toBe('none')
+    expect('reasoning_effort' in withoutEffort).toBe(false)
+  })
+
+  it('splits OpenRouter-style separate reasoning field from content', async () => {
+    const { content, reasoning } = await ask(
+      replying({ content: 'ABSTAIN', reasoning: 'taskFamilyMatches is false...' }),
+    )
+    expect(content).toBe('ABSTAIN')
+    expect(reasoning).toBe('taskFamilyMatches is false...')
+  })
+
+  it('strips Groq-style inline <think> block out of content into reasoning', async () => {
+    // Regression guard: when reasoning prose (which quotes both option tokens) stayed in
+    // content, a single-token parser misread the decision, so identical weights looked
+    // broken on Groq and fine on OpenRouter.
+    const inlined =
+      '<think>\nShould I EXECUTE_AUDIT? taskFamilyMatches is false, so no.\n</think>\n\nABSTAIN'
+    const { content, reasoning } = await ask(replying({ content: inlined }))
+    expect(content).toBe('ABSTAIN')
+    expect(reasoning).toContain('taskFamilyMatches is false')
+  })
+
+  it('unclosed <think> (budget exhausted mid-thought) yields empty content, all reasoning', async () => {
+    // No closing tag, as when the token budget runs out mid-thought.
+    const truncated = '<think>\nstill thinking about the features'
+    const { content, reasoning } = await ask(replying({ content: truncated }))
+    expect(content).toBe('')
+    expect(reasoning).toContain('still thinking')
+  })
+
+  it('reasoning_content (DeepSeek/Kimi field name) is honored', async () => {
+    const { content, reasoning } = await ask(
+      replying({ content: 'EXECUTE_AUDIT', reasoning_content: 'all four true' }),
+    )
+    expect(content).toBe('EXECUTE_AUDIT')
+    expect(reasoning).toBe('all four true')
+  })
+
+  it('non-thinking responses are unchanged (no reasoning key)', async () => {
+    // The response carries usage but no reasoning field and no think block.
+    const usage = { prompt_tokens: 1, completion_tokens: 1 }
+    const result = await ask(replying({ content: 'pong' }, { usage }), 'hi')
+    expect(result.content).toBe('pong')
+    expect('reasoning' in result).toBe(false)
+  })
+})
diff --git a/src/runtime/router-client.ts b/src/runtime/router-client.ts
index 9b36847c..3629be20 100644
--- a/src/runtime/router-client.ts
+++ b/src/runtime/router-client.ts
@@ -28,7 +28,17 @@ export interface RouterConfig {
 }
 
 export interface RouterChatResult {
+  /** The final answer, with any inline `<think>...</think>` block stripped into `reasoning`. */
   content: string
+  /**
+   * Thinking-model reasoning, when the provider surfaced it — either as a separate
+   * `reasoning`/`reasoning_content` message field (OpenRouter style) or inlined into
+   * `content` as a `<think>` block (Groq style). Undefined for non-thinking models.
+   * Downstream parsers that match single-token answers must read `content`, which is
+   * clean either way; before this split, Groq-style inlining made the same model look
+   * broken on one provider and fine on another.
+   */
+  reasoning?: string
   /** REAL usage, or undefined when the provider reported none. */
   usage?: { input: number; output: number }
   /** Derived from usage via `estimateCost` when the model is priced; else undefined. */
@@ -38,7 +48,20 @@ export interface RouterChatResult {
 export async function routerChatWithUsage(
   cfg: RouterConfig,
   messages: Array<{ role: string; content: string }>,
-  opts?: { temperature?: number; signal?: AbortSignal; maxTokens?: number },
+  opts?: {
+    temperature?: number
+    signal?: AbortSignal
+    maxTokens?: number
+    /**
+     * Reasoning control for thinking models, forwarded as `reasoning_effort`.
+     * 'none' is the load-bearing value: binary/single-token decisions (routing,
+     * gating) on a thinking model otherwise burn the whole token budget inside
+     * the think block — on slow backends (CPU-local) that turns into a client
+     * timeout, not just waste. Providers that ignore the field are handled by
+     * the reasoning/content split in `parseChatResult`.
+     */
+    reasoningEffort?: 'none' | 'low' | 'medium' | 'high'
+  },
 ): Promise<RouterChatResult> {
   const url = `${cfg.routerBaseUrl.replace(/\/$/, '')}/chat/completions`
   const headers = { 'content-type': 'application/json', authorization: `Bearer ${cfg.routerKey}` }
@@ -50,6 +73,7 @@ export async function routerChatWithUsage(
     messages,
     temperature,
     max_tokens: opts?.maxTokens ?? 8192,
+    ...(opts?.reasoningEffort ? { reasoning_effort: opts.reasoningEffort } : {}),
   })
   // Injected transport short-circuits the network: the offline benchmark seam. It owns its own
   // determinism, so the fetch-specific transient-retry/temperature-handling below does not apply.
@@ -88,7 +112,9 @@ export async function routerChatWithUsage(
 
 function parseChatResult(json: unknown, model: string): RouterChatResult {
   const data = json as {
-    choices?: Array<{ message?: { content?: string } }>
+    choices?: Array<{
+      message?: { content?: string; reasoning?: string; reasoning_content?: string }
+    }>
     usage?: { prompt_tokens?: number; completion_tokens?: number }
   }
   const u = data.usage
@@ -98,13 +124,44 @@ function parseChatResult(json: unknown, model: string): RouterChatResult {
       : undefined
   const costUsd =
     usage && isModelPriced(model) ? estimateCost(usage.input, usage.output, model) : undefined
+  const msg = data.choices?.[0]?.message
+  const { content, reasoning } = splitReasoning(
+    msg?.content ?? '',
+    msg?.reasoning ?? msg?.reasoning_content,
+  )
   return {
-    content: data.choices?.[0]?.message?.content ?? '',
+    content,
+    ...(reasoning ? { reasoning } : {}),
     ...(usage ? { usage } : {}),
     ...(costUsd !== undefined ? { costUsd } : {}),
   }
 }
 
+/**
+ * Split provider output into the final answer and thinking-model reasoning. Reasoning arrives
+ * as a separate field (OpenRouter `reasoning`, DeepSeek/Kimi `reasoning_content`) and/or as a
+ * `<think>...</think>` block at the HEAD of `content` (Groq, some local runtimes); field
+ * reasoning is placed before inline reasoning. Only a leading block is split: a `<think>`
+ * mentioned mid-answer is ordinary answer text, and treating it as a tag would truncate a
+ * non-thinking response. An unclosed leading `<think>` (max_tokens hit mid-thought) yields
+ * empty content and everything as reasoning, since no final answer was emitted.
+ */
+function splitReasoning(
+  rawContent: string,
+  fieldReasoning: string | undefined,
+): { content: string; reasoning?: string } {
+  const head = rawContent.trimStart()
+  if (!head.startsWith('<think>')) {
+    return { content: rawContent, ...(fieldReasoning ? { reasoning: fieldReasoning } : {}) }
+  }
+  const body = head.slice('<think>'.length)
+  const close = body.indexOf('</think>')
+  const inline = (close === -1 ? body : body.slice(0, close)).trim()
+  const content = close === -1 ? '' : body.slice(close + '</think>'.length).trim()
+  const reasoning = [fieldReasoning, inline].filter(Boolean).join('\n')
+  return { content, ...(reasoning ? { reasoning } : {}) }
+}
+
 /** A tool-call the model emitted (provider-neutral; mirrors the runtime's ToolCallRequest). */
 export interface RouterToolCall {
   id: string