From a178dde1e6bc0d94170bd01bffa67f789f0dc5c0 Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Wed, 1 Jul 2026 20:38:12 -0600 Subject: [PATCH] feat(runtime): reasoning-aware routerChatWithUsage Two gaps, each hit twice by real experiment runs in agent-lab (R230 local routing, R231 provider-invariance grid): 1. reasoning controls were droppable: the request body forwarded only model/messages/temperature/max_tokens, so a thinking model on a binary decision burned its whole budget inside the think block; on a slow (CPU-local) backend that becomes a client timeout, and 20/20 audit cells died as "fetch failed". opts.reasoningEffort now forwards as reasoning_effort ('none' is the load-bearing value for routing/gating). 2. reasoning and content were conflated: OpenRouter returns reasoning in a separate field with clean content; Groq inlines a <think>...</think> block into content. Downstream single-token parsers read the reasoning prose (which quotes both option tokens) and misread decisions, making the SAME weights look broken on one provider and fine on another (R231: groq qwen3-32b "10/20" vs openrouter "20/20", both actually correct). parseChatResult now splits both shapes into RouterChatResult.reasoning and always-clean content; an unclosed think block (budget exhausted mid-thought) yields empty content, which is honest: no answer was emitted. Additive: no call-site changes required; non-thinking responses are byte-identical. 9 new tests cover effort forwarding, both provider shapes, reasoning_content (DeepSeek/Kimi), unclosed think, and the unchanged non-thinking path. 
--- src/runtime/router-client.complete.test.ts | 79 ++++++++++++++++++++++ src/runtime/router-client.ts | 63 ++++++++++++++++- 2 files changed, 139 insertions(+), 3 deletions(-) diff --git a/src/runtime/router-client.complete.test.ts b/src/runtime/router-client.complete.test.ts index 697811ea..15d1c329 100644 --- a/src/runtime/router-client.complete.test.ts +++ b/src/runtime/router-client.complete.test.ts @@ -87,3 +87,82 @@ describe('RouterConfig.complete — the injected completion transport', () => { expect(fetchSpy).toHaveBeenCalledOnce() }) }) + +describe('reasoning-aware parsing and reasoning_effort forwarding', () => { + const cfg = (complete: (body: Record) => Promise) => ({ + routerBaseUrl: 'http://router.test/v1', + routerKey: 'k', + model: 'qwen/qwen3-32b', + complete, + }) + + it('forwards reasoningEffort as reasoning_effort, omits it when unset', async () => { + const seen: Record[] = [] + const complete = async (body: Record) => { + seen.push(body) + return { choices: [{ message: { content: 'ABSTAIN' } }] } + } + await routerChatWithUsage(cfg(complete), [{ role: 'user', content: 'route' }], { + reasoningEffort: 'none', + }) + await routerChatWithUsage(cfg(complete), [{ role: 'user', content: 'route' }]) + expect(seen[0]?.reasoning_effort).toBe('none') + expect('reasoning_effort' in (seen[1] ?? {})).toBe(false) + }) + + it('splits OpenRouter-style separate reasoning field from content', async () => { + const complete = async () => ({ + choices: [{ message: { content: 'ABSTAIN', reasoning: 'taskFamilyMatches is false...' 
} }], + }) + const res = await routerChatWithUsage(cfg(complete), [{ role: 'user', content: 'route' }]) + expect(res.content).toBe('ABSTAIN') + expect(res.reasoning).toBe('taskFamilyMatches is false...') + }) + + it('strips Groq-style inline block out of content into reasoning', async () => { + // Before the split, a single-token parser reading content saw the reasoning prose + // (which quotes both option tokens) and misread the decision — the same model + // looked broken on Groq and fine on OpenRouter. + const complete = async () => ({ + choices: [ + { + message: { + content: + '\nShould I EXECUTE_AUDIT? taskFamilyMatches is false, so no.\n\n\nABSTAIN', + }, + }, + ], + }) + const res = await routerChatWithUsage(cfg(complete), [{ role: 'user', content: 'route' }]) + expect(res.content).toBe('ABSTAIN') + expect(res.reasoning).toContain('taskFamilyMatches is false') + }) + + it('unclosed (budget exhausted mid-thought) yields empty content, all reasoning', async () => { + const complete = async () => ({ + choices: [{ message: { content: '\nstill thinking about the features' } }], + }) + const res = await routerChatWithUsage(cfg(complete), [{ role: 'user', content: 'route' }]) + expect(res.content).toBe('') + expect(res.reasoning).toContain('still thinking') + }) + + it('reasoning_content (DeepSeek/Kimi field name) is honored', async () => { + const complete = async () => ({ + choices: [{ message: { content: 'EXECUTE_AUDIT', reasoning_content: 'all four true' } }], + }) + const res = await routerChatWithUsage(cfg(complete), [{ role: 'user', content: 'route' }]) + expect(res.content).toBe('EXECUTE_AUDIT') + expect(res.reasoning).toBe('all four true') + }) + + it('non-thinking responses are unchanged (no reasoning key)', async () => { + const complete = async () => ({ + choices: [{ message: { content: 'pong' } }], + usage: { prompt_tokens: 1, completion_tokens: 1 }, + }) + const res = await routerChatWithUsage(cfg(complete), [{ role: 'user', content: 'hi' }]) + 
expect(res.content).toBe('pong') + expect('reasoning' in res).toBe(false) + }) +}) diff --git a/src/runtime/router-client.ts b/src/runtime/router-client.ts index 9b36847c..3629be20 100644 --- a/src/runtime/router-client.ts +++ b/src/runtime/router-client.ts @@ -28,7 +28,17 @@ export interface RouterConfig { } export interface RouterChatResult { + /** The final answer, with any inline `...` block stripped into `reasoning`. */ content: string + /** + * Thinking-model reasoning, when the provider surfaced it — either as a separate + * `reasoning`/`reasoning_content` message field (OpenRouter style) or inlined into + * `content` as a `` block (Groq style). Undefined for non-thinking models. + * Downstream parsers that match single-token answers must read `content`, which is + * clean either way; before this split, Groq-style inlining made the same model look + * broken on one provider and fine on another. + */ + reasoning?: string /** REAL usage, or undefined when the provider reported none. */ usage?: { input: number; output: number } /** Derived from usage via `estimateCost` when the model is priced; else undefined. */ @@ -38,7 +48,20 @@ export interface RouterChatResult { export async function routerChatWithUsage( cfg: RouterConfig, messages: Array<{ role: string; content: string }>, - opts?: { temperature?: number; signal?: AbortSignal; maxTokens?: number }, + opts?: { + temperature?: number + signal?: AbortSignal + maxTokens?: number + /** + * Reasoning control for thinking models, forwarded as `reasoning_effort`. + * 'none' is the load-bearing value: binary/single-token decisions (routing, + * gating) on a thinking model otherwise burn the whole token budget inside + * the think block — on slow backends (CPU-local) that turns into a client + * timeout, not just waste. Providers that ignore the field are handled by + * the reasoning/content split in `parseChatResult`. 
+ */ + reasoningEffort?: 'none' | 'low' | 'medium' | 'high' + }, ): Promise { const url = `${cfg.routerBaseUrl.replace(/\/$/, '')}/chat/completions` const headers = { 'content-type': 'application/json', authorization: `Bearer ${cfg.routerKey}` } @@ -50,6 +73,7 @@ export async function routerChatWithUsage( messages, temperature, max_tokens: opts?.maxTokens ?? 8192, + ...(opts?.reasoningEffort ? { reasoning_effort: opts.reasoningEffort } : {}), }) // Injected transport short-circuits the network: the offline benchmark seam. It owns its own // determinism, so the fetch-specific transient-retry/temperature-handling below does not apply. @@ -88,7 +112,9 @@ export async function routerChatWithUsage( function parseChatResult(json: unknown, model: string): RouterChatResult { const data = json as { - choices?: Array<{ message?: { content?: string } }> + choices?: Array<{ + message?: { content?: string; reasoning?: string; reasoning_content?: string } + }> usage?: { prompt_tokens?: number; completion_tokens?: number } } const u = data.usage @@ -98,13 +124,44 @@ function parseChatResult(json: unknown, model: string): RouterChatResult { : undefined const costUsd = usage && isModelPriced(model) ? estimateCost(usage.input, usage.output, model) : undefined + const msg = data.choices?.[0]?.message + const { content, reasoning } = splitReasoning( + msg?.content ?? '', + msg?.reasoning ?? msg?.reasoning_content, + ) return { - content: data.choices?.[0]?.message?.content ?? '', + content, + ...(reasoning ? { reasoning } : {}), ...(usage ? { usage } : {}), ...(costUsd !== undefined ? { costUsd } : {}), } } +/** + * Normalize the two ways providers surface thinking-model reasoning into one shape: + * a separate field (OpenRouter: `reasoning`, DeepSeek/Kimi: `reasoning_content`) or a + * `...` block inlined at the head of `content` (Groq, some local runtimes). 
+ * An UNCLOSED `` (the model hit max_tokens mid-thought) yields empty content and + * everything as reasoning — which is honest: no final answer was emitted. + */ +function splitReasoning( + rawContent: string, + fieldReasoning: string | undefined, +): { content: string; reasoning?: string } { + const open = rawContent.indexOf('') + if (open !== -1) { + const close = rawContent.indexOf('', open) + const inline = close !== -1 ? rawContent.slice(open + 7, close) : rawContent.slice(open + 7) + const rest = + close !== -1 + ? rawContent.slice(0, open) + rawContent.slice(close + 8) + : rawContent.slice(0, open) + const reasoning = [fieldReasoning, inline.trim()].filter(Boolean).join('\n') + return { content: rest.trim(), ...(reasoning ? { reasoning } : {}) } + } + return { content: rawContent, ...(fieldReasoning ? { reasoning: fieldReasoning } : {}) } +} + /** A tool-call the model emitted (provider-neutral; mirrors the runtime's ToolCallRequest). */ export interface RouterToolCall { id: string