diff --git a/clients/python/pyproject.toml b/clients/python/pyproject.toml
index 77ef438..6594020 100644
--- a/clients/python/pyproject.toml
+++ b/clients/python/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "agent-eval-rpc"
-version = "0.100.3"
+version = "0.101.0"
 description = "Python RPC client for @tangle-network/agent-eval — judge content against rubrics over HTTP or stdio RPC. Eval logic runs in the Node runtime; this package is a thin wire client."
 readme = "README.md"
 requires-python = ">=3.10"
diff --git a/clients/python/src/agent_eval_rpc/__init__.py b/clients/python/src/agent_eval_rpc/__init__.py
index bb13857..c289ecd 100644
--- a/clients/python/src/agent_eval_rpc/__init__.py
+++ b/clients/python/src/agent_eval_rpc/__init__.py
@@ -58,7 +58,7 @@
 try:
     __version__ = version("agent-eval-rpc")
 except PackageNotFoundError:
-    __version__ = "0.100.3"
+    __version__ = "0.101.0"
 
 __all__ = [
     "Client",
diff --git a/package.json b/package.json
index 5f07bc9..0ed7ac8 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@tangle-network/agent-eval",
-  "version": "0.100.3",
+  "version": "0.101.0",
   "description": "Evaluate and improve AI agents from runs, traces, judges, and feedback. Compare candidates, cluster failures, measure lift, and gate releases.",
   "homepage": "https://github.com/tangle-network/agent-eval#readme",
   "repository": {
diff --git a/src/agent-profile.test.ts b/src/agent-profile.test.ts
index 6c17186..9a70c87 100644
--- a/src/agent-profile.test.ts
+++ b/src/agent-profile.test.ts
@@ -4,6 +4,9 @@ import {
   agentProfileHash,
   agentProfileId,
   agentProfileModelId,
+  CODING_HARNESSES,
+  expandProfileAxes,
+  harnessAxisOf,
 } from './agent-profile'
 
 const base: AgentProfile = {
@@ -159,3 +162,90 @@ describe('agentProfileModelId', () => {
     )
   })
 })
+
+describe('CODING_HARNESSES', () => {
+  it('is the canonical primary coding-harness set', () => {
+    expect([...CODING_HARNESSES]).toEqual(['opencode', 'claude-code', 'codex', 'kimi-code'])
+  })
+})
+
+describe('expandProfileAxes', () => {
+  const axisBase: AgentProfile = { name: 'agent', model: { default: 'deepseek-v4-flash' } }
+
+  it('defaults to CODING_HARNESSES × the base model — one compatible cell per harness', () => {
+    const profiles = expandProfileAxes({ base: axisBase })
+    // An unprefixed model id is compatible with every harness → one cell each.
+    expect(profiles).toHaveLength(CODING_HARNESSES.length)
+    expect(profiles.map((p) => harnessAxisOf(p)?.harness).sort()).toEqual(
+      [...CODING_HARNESSES].sort(),
+    )
+    for (const p of profiles) expect(p.model?.default).toBe('deepseek-v4-flash')
+  })
+
+  it('crosses harnesses × models with a distinct id per cell (no collapse)', () => {
+    const profiles = expandProfileAxes({
+      base: axisBase,
+      harnesses: ['opencode', 'codex'],
+      models: ['m-a', 'm-b'],
+    })
+    expect(profiles).toHaveLength(4)
+    expect(new Set(profiles.map((p) => agentProfileId(p))).size).toBe(4)
+  })
+
+  it('drops (harness, model) pairs a vendor-locked harness cannot run', () => {
+    const pairs = expandProfileAxes({
+      base: axisBase,
+      harnesses: ['claude-code', 'codex'],
+      models: ['anthropic/claude-x', 'openai/gpt-x'],
+    })
+      .map((p) => harnessAxisOf(p))
+      .filter(Boolean)
+    expect(pairs).toContainEqual({ harness: 'claude-code', model: 'anthropic/claude-x' })
+    expect(pairs).toContainEqual({ harness: 'codex', model: 'openai/gpt-x' })
+    expect(pairs).not.toContainEqual({ harness: 'claude-code', model: 'openai/gpt-x' })
+    expect(pairs).toHaveLength(2)
+  })
+
+  it('router-backed harness (opencode) accepts any provider', () => {
+    expect(
+      expandProfileAxes({
+        base: axisBase,
+        harnesses: ['opencode'],
+        models: ['anthropic/x', 'openai/y'],
+      }),
+    ).toHaveLength(2)
+  })
+
+  it('keepIncompatible retains an otherwise-dropped pair', () => {
+    expect(
+      expandProfileAxes({
+        base: axisBase,
+        harnesses: ['claude-code'],
+        models: ['openai/gpt-x'],
+        keepIncompatible: true,
+      }),
+    ).toHaveLength(1)
+  })
+
+  it('carries harness + model in metadata and round-trips via harnessAxisOf', () => {
+    const [p] = expandProfileAxes({ base: axisBase, harnesses: ['opencode'], models: ['m1'] })
+    expect(p?.metadata?.harness).toBe('opencode')
+    expect(p?.metadata?.harnessModel).toBe('m1')
+    expect(harnessAxisOf(p as AgentProfile)).toEqual({ harness: 'opencode', model: 'm1' })
+  })
+
+  it('fails loud on no harnesses / no models / all-incompatible', () => {
+    expect(() => expandProfileAxes({ base: axisBase, harnesses: [] })).toThrow(/no harnesses/)
+    expect(() => expandProfileAxes({ base: { name: 'x' } })).toThrow(/no models/)
+    expect(() =>
+      expandProfileAxes({ base: axisBase, harnesses: ['claude-code'], models: ['openai/gpt-x'] }),
+    ).toThrow(/incompatible/)
+  })
+})
+
+describe('harnessAxisOf', () => {
+  it('returns undefined for a profile not produced by expandProfileAxes', () => {
+    expect(harnessAxisOf({ metadata: undefined })).toBeUndefined()
+    expect(harnessAxisOf({ metadata: { foo: 'bar' } })).toBeUndefined()
+  })
+})
diff --git a/src/agent-profile.ts b/src/agent-profile.ts
index a4a1e19..42efabe 100644
--- a/src/agent-profile.ts
+++ b/src/agent-profile.ts
@@ -1,9 +1,108 @@
 import { createHash } from 'node:crypto'
 import type { AgentProfile } from '@tangle-network/agent-interface'
+import { type HarnessType, harnessSupportsModel } from '@tangle-network/agent-interface'
 import { ValidationError } from './errors'
 import { canonicalize } from './pre-registration'
 
-export type { AgentProfile } from '@tangle-network/agent-interface'
+export type { AgentProfile, HarnessType } from '@tangle-network/agent-interface'
+
+/**
+ * The agentic coding harnesses an eval sweeps by default — the ones we care about
+ * ranking. This is the SINGLE source of that list; consumers import it instead of
+ * re-declaring their own (a re-declared list is how the fleet drifts). Pass an
+ * explicit `harnesses` (e.g. `harnessTypeSchema.options` for literally every known
+ * harness) to widen beyond these.
+ */
+export const CODING_HARNESSES: readonly HarnessType[] = [
+  'opencode',
+  'claude-code',
+  'codex',
+  'kimi-code',
+]
+
+export interface ProfileAxisSpec {
+  /** The domain profile to sweep. Its prompt/tools/skills are held fixed; only the
+   * harness and model vary. `model.default` is the fallback model. */
+  base: AgentProfile
+  /** Harnesses to cross. Default: {@link CODING_HARNESSES}. */
+  harnesses?: readonly HarnessType[]
+  /** Models to cross. Default: `[base.model.default]` — one model, i.e. today's
+   * single-model behaviour, so omitting this never changes an existing run. */
+  models?: readonly string[]
+  /** Keep (harness, model) pairs the harness can't run instead of dropping them.
+   * Default: drop (via `harnessSupportsModel`), so a vendor-locked harness paired
+   * with a foreign model doesn't become a guaranteed-failing cell. */
+  keepIncompatible?: boolean
+}
+
+/**
+ * Expand a base profile across the harness × model matrix into the `AgentProfile[]`
+ * that `runProfileMatrix` / `selfImprove` score — the ONE place "which harnesses ×
+ * which models do we evaluate" lives, so no product hand-rolls its own harness list
+ * or column→profile mapping (the pattern that let those copies drift and silently
+ * break the harness pivot).
+ *
+ * Each cell clones `base`, sets `model.default`, and stamps `metadata.harness` +
+ * `metadata.harnessModel` (both hash-bearing, so every cell gets a distinct
+ * `agentProfileId` row and results join back by harness/model via {@link harnessAxisOf}
+ * with no hand-recomputed key). Incompatible pairs are dropped unless `keepIncompatible`.
+ *
+ * Omit `harnesses`/`models` to sweep the full default set — the "turn it on for
+ * everything we care about" switch, identical in shape whether one harness or all.
+ */
+export function expandProfileAxes(spec: ProfileAxisSpec): AgentProfile[] {
+  const harnesses = spec.harnesses ?? CODING_HARNESSES
+  if (harnesses.length === 0) throw new ValidationError('expandProfileAxes: no harnesses to sweep')
+  const baseModel = spec.base.model?.default
+  const models = spec.models ?? (baseModel ? [baseModel] : [])
+  if (models.length === 0) {
+    throw new ValidationError(
+      'expandProfileAxes: no models to sweep — base profile has no model.default and none were supplied',
+    )
+  }
+  const out: AgentProfile[] = []
+  const seen = new Set<string>()
+  for (const harness of harnesses) {
+    for (const model of models) {
+      if (!spec.keepIncompatible && !harnessSupportsModel(harness, model)) continue
+      const profile: AgentProfile = {
+        ...spec.base,
+        name: `${spec.base.name ?? 'agent'}/${harness}/${model}`,
+        model: { ...spec.base.model, default: model },
+        metadata: { ...(spec.base.metadata ?? {}), harness, harnessModel: model },
+      }
+      const id = agentProfileId(profile)
+      if (seen.has(id)) continue
+      seen.add(id)
+      out.push(profile)
+    }
+  }
+  if (out.length === 0) {
+    throw new ValidationError(
+      `expandProfileAxes: every (harness, model) pair was incompatible (harnesses=[${harnesses.join(', ')}], models=[${models.join(', ')}]). Widen the models or pass keepIncompatible.`,
+    )
+  }
+  return out
+}
+
+/**
+ * Read the (harness, model) a matrix cell ran under, off a profile or a result row's
+ * profile — the join-back for a `byHarness` pivot. Returns undefined when
+ * `metadata.harness` / `metadata.harnessModel` are not both strings (the profile wasn't
+ * stamped by {@link expandProfileAxes}); the harness is cast to `HarnessType`, unvalidated.
+ * Group `result.byProfile` by this rather than recomputing an id (that broke old pivots).
+ */
+export function harnessAxisOf(
+  profile: Pick<AgentProfile, 'metadata'>,
+): { harness: HarnessType; model: string } | undefined {
+  const m = profile.metadata as Record<string, unknown> | undefined
+  const harness = m?.harness
+  const model = m?.harnessModel
+  if (typeof harness === 'string' && typeof model === 'string') {
+    return { harness: harness as HarnessType, model }
+  }
+  return undefined
+}
 
 /**
  * Collision-resistant, path-safe, human-readable profile id for eval artifacts.
diff --git a/src/campaign/presets/run-profile-matrix.ts b/src/campaign/presets/run-profile-matrix.ts
index aeeb4b8..5b3c055 100644
--- a/src/campaign/presets/run-profile-matrix.ts
+++ b/src/campaign/presets/run-profile-matrix.ts
@@ -39,7 +39,9 @@ import {
   agentProfileHash,
   agentProfileId,
   agentProfileModelId,
+  harnessAxisOf,
 } from '../../agent-profile'
+import { type AgentProfileCell, buildAgentProfileCell } from '../../agent-profile-cell'
 import { AgentEvalError } from '../../errors'
 import {
   assertRealBackend,
@@ -213,6 +215,10 @@ interface BuildRecordArgs {
   splitTag: RunSplitTag
   commitSha: string
   matrixId: string
+  /** The (profile, harness, model, dimensions) identity of this cell. When provided it
+   * is written to the record as `agentProfile` (omitted otherwise), so results group by
+   * `groupRunsByAgentProfileCell` (harness/model aware) instead of profileId alone. */
+  agentProfileCell?: AgentProfileCell
   scenario?: TScenario
   corpusText?: (
     artifact: TArtifact,
@@ -303,6 +309,7 @@ function buildRunRecord(
     outcome,
     splitTag,
     scenarioId: cell.scenarioId,
+    ...(args.agentProfileCell ? { agentProfile: args.agentProfileCell } : {}),
     ...(cell.error ? { failureMode: cell.error } : {}),
   }
 
@@ -409,6 +416,19 @@ export async function runProfileMatrix(
       runDir: join(opts.runDir, sanitize(profileId)),
     })
 
+    // The canonical (profile, harness, model) identity shared by every record in this
+    // column, so results group by `groupRunsByAgentProfileCell` (harness/model aware).
+    // The harness is read via `harnessAxisOf` from the stamp `expandProfileAxes` left on
+    // the profile; a profile that wasn't axis-expanded gets a cell with no `harness`
+    // key. Built once per profile, before the per-cell loop, and reused for each record.
+    const axis = harnessAxisOf(profile)
+    const agentProfileCell = await buildAgentProfileCell({
+      profileId,
+      sourceProfile: { kind: 'agent-interface-profile', hash: profileHash },
+      model,
+      ...(axis ? { harness: { id: axis.harness } } : {}),
+    })
+
     const profileRecords: RunRecord[] = []
     for (const cell of campaign.cells) {
       const record = buildRunRecord({
@@ -420,6 +440,7 @@ export async function runProfileMatrix(
         splitTag,
         commitSha: opts.commitSha,
         matrixId,
+        agentProfileCell,
         scenario: scenarioById.get(cell.scenarioId),
         corpusText: opts.corpusText,
       })
diff --git a/src/index.ts b/src/index.ts
index a89e6c9..e35ecc7 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -705,8 +705,15 @@ export { buildTrajectory } from './trajectory'
 
 // ── Auxiliary statistical + decision modules ─────────────────────────
 
-export type { AgentProfile } from './agent-profile'
-export { agentProfileHash, agentProfileId, agentProfileModelId } from './agent-profile'
+export type { AgentProfile, HarnessType, ProfileAxisSpec } from './agent-profile'
+export {
+  agentProfileHash,
+  agentProfileId,
+  agentProfileModelId,
+  CODING_HARNESSES,
+  expandProfileAxes,
+  harnessAxisOf,
+} from './agent-profile'
 export type { BaselineOptions, BaselineReport, MetricSamples, MetricVerdict } from './baseline'
 export { compareToBaseline, iqr, welchsTTest } from './baseline'
 export type {
diff --git a/tests/campaign/run-profile-matrix.test.ts b/tests/campaign/run-profile-matrix.test.ts
index 1878a4b..c60eb90 100644
--- a/tests/campaign/run-profile-matrix.test.ts
+++ b/tests/campaign/run-profile-matrix.test.ts
@@ -1,5 +1,11 @@
 import { describe, expect, it } from 'vitest'
-import { type AgentProfile, agentProfileHash, agentProfileId } from '../../src/agent-profile'
+import {
+  type AgentProfile,
+  agentProfileHash,
+  agentProfileId,
+  expandProfileAxes,
+} from '../../src/agent-profile'
+import { groupRunsByAgentProfileCell } from '../../src/agent-profile-cell'
 import {
   inMemoryCampaignStorage,
   type JudgeConfig,
@@ -293,4 +299,32 @@ describe('runProfileMatrix', () => {
       runProfileMatrix({ ...baseOpts(), profiles: [], dispatch: realDispatch }),
     ).rejects.toBeInstanceOf(ProfileMatrixError)
   })
+
+  it('attaches a harness-bearing AgentProfileCell so runs group by harness', async () => {
+    const axisBase: AgentProfile = {
+      name: 'agent',
+      model: { default: 'test-model@2025-01-01' },
+      prompt: { systemPrompt: 'p' },
+    }
+    const profiles = expandProfileAxes({
+      base: axisBase,
+      harnesses: ['opencode', 'codex'],
+      models: ['test-model@2025-01-01'],
+    })
+    expect(profiles).toHaveLength(2)
+
+    const result = await runProfileMatrix({ ...baseOpts(), profiles, dispatch: realDispatch })
+
+    // Every record carries the canonical cell, and its harness is one of the two that
+    // `expandProfileAxes` stamped — no metadata smuggling, no hand-recomputed key.
+    for (const rec of result.records) {
+      expect(rec.agentProfile?.harness?.id).toMatch(/^(opencode|codex)$/)
+    }
+    expect(new Set(result.records.map((r) => r.agentProfile?.harness?.id))).toEqual(
+      new Set(['opencode', 'codex']),
+    )
+    // The EXISTING grouping (not a bespoke pivot) separates the two harnesses:
+    // one cell per (profile, harness, model), independent of scenario/rep.
+    expect(groupRunsByAgentProfileCell(result.records).size).toBe(2)
+  })
 })