Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion clients/python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "hatchling.build"

[project]
name = "agent-eval-rpc"
version = "0.100.3"
version = "0.101.0"
description = "Python RPC client for @tangle-network/agent-eval — judge content against rubrics over HTTP or stdio RPC. Eval logic runs in the Node runtime; this package is a thin wire client."
readme = "README.md"
requires-python = ">=3.10"
Expand Down
2 changes: 1 addition & 1 deletion clients/python/src/agent_eval_rpc/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@
try:
__version__ = version("agent-eval-rpc")
except PackageNotFoundError:
__version__ = "0.100.3"
__version__ = "0.101.0"

__all__ = [
"Client",
Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "@tangle-network/agent-eval",
"version": "0.100.3",
"version": "0.101.0",
"description": "Evaluate and improve AI agents from runs, traces, judges, and feedback. Compare candidates, cluster failures, measure lift, and gate releases.",
"homepage": "https://github.com/tangle-network/agent-eval#readme",
"repository": {
Expand Down
90 changes: 90 additions & 0 deletions src/agent-profile.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ import {
agentProfileHash,
agentProfileId,
agentProfileModelId,
CODING_HARNESSES,
expandProfileAxes,
harnessAxisOf,
} from './agent-profile'

const base: AgentProfile = {
Expand Down Expand Up @@ -159,3 +162,90 @@ describe('agentProfileModelId', () => {
)
})
})

// Pins the exact membership and order of the default harness set.
describe('CODING_HARNESSES', () => {
  it('is the canonical primary coding-harness set', () => {
    const canonical = ['opencode', 'claude-code', 'codex', 'kimi-code']
    const actual = Array.from(CODING_HARNESSES)
    expect(actual).toEqual(canonical)
  })
})

// Behavioural contract of expandProfileAxes: defaults, crossing, compatibility
// filtering, metadata stamping, and the three fail-loud paths.
describe('expandProfileAxes', () => {
  const seed: AgentProfile = { name: 'agent', model: { default: 'deepseek-v4-flash' } }

  it('defaults to CODING_HARNESSES × the base model — one compatible cell per harness', () => {
    const cells = expandProfileAxes({ base: seed })
    // The base model id carries no provider prefix, so no harness rejects it.
    expect(cells).toHaveLength(CODING_HARNESSES.length)

    const sweptHarnesses = cells.map((cell) => harnessAxisOf(cell)?.harness)
    expect(sweptHarnesses.sort()).toEqual([...CODING_HARNESSES].sort())

    cells.forEach((cell) => {
      expect(cell.model?.default).toBe('deepseek-v4-flash')
    })
  })

  it('crosses harnesses × models with a distinct id per cell (no collapse)', () => {
    const cells = expandProfileAxes({
      base: seed,
      harnesses: ['opencode', 'codex'],
      models: ['m-a', 'm-b'],
    })
    expect(cells).toHaveLength(4)

    const distinctIds = new Set(cells.map((cell) => agentProfileId(cell)))
    expect(distinctIds.size).toBe(4)
  })

  it('drops (harness, model) pairs a vendor-locked harness cannot run', () => {
    const cells = expandProfileAxes({
      base: seed,
      harnesses: ['claude-code', 'codex'],
      models: ['anthropic/claude-x', 'openai/gpt-x'],
    })
    const survivors = cells.map((cell) => harnessAxisOf(cell)).filter(Boolean)

    expect(survivors).toContainEqual({ harness: 'claude-code', model: 'anthropic/claude-x' })
    expect(survivors).toContainEqual({ harness: 'codex', model: 'openai/gpt-x' })
    expect(survivors).not.toContainEqual({ harness: 'claude-code', model: 'openai/gpt-x' })
    expect(survivors).toHaveLength(2)
  })

  it('router-backed harness (opencode) accepts any provider', () => {
    const cells = expandProfileAxes({
      base: seed,
      harnesses: ['opencode'],
      models: ['anthropic/x', 'openai/y'],
    })
    expect(cells).toHaveLength(2)
  })

  it('keepIncompatible retains an otherwise-dropped pair', () => {
    const cells = expandProfileAxes({
      base: seed,
      harnesses: ['claude-code'],
      models: ['openai/gpt-x'],
      keepIncompatible: true,
    })
    expect(cells).toHaveLength(1)
  })

  it('carries harness + model in metadata and round-trips via harnessAxisOf', () => {
    const only = expandProfileAxes({ base: seed, harnesses: ['opencode'], models: ['m1'] })[0]
    expect(only?.metadata?.harness).toBe('opencode')
    expect(only?.metadata?.harnessModel).toBe('m1')
    expect(harnessAxisOf(only as AgentProfile)).toEqual({ harness: 'opencode', model: 'm1' })
  })

  it('fails loud on no harnesses / no models / all-incompatible', () => {
    const withoutHarnesses = () => expandProfileAxes({ base: seed, harnesses: [] })
    const withoutModels = () => expandProfileAxes({ base: { name: 'x' } })
    const allIncompatible = () =>
      expandProfileAxes({ base: seed, harnesses: ['claude-code'], models: ['openai/gpt-x'] })

    expect(withoutHarnesses).toThrow(/no harnesses/)
    expect(withoutModels).toThrow(/no models/)
    expect(allIncompatible).toThrow(/incompatible/)
  })
})

// harnessAxisOf must not invent an axis for profiles that carry no axis stamp.
describe('harnessAxisOf', () => {
  it('returns undefined for a profile not produced by expandProfileAxes', () => {
    const withoutMetadata = harnessAxisOf({ metadata: undefined })
    const withUnrelatedMetadata = harnessAxisOf({ metadata: { foo: 'bar' } })

    expect(withoutMetadata).toBeUndefined()
    expect(withUnrelatedMetadata).toBeUndefined()
  })
})
101 changes: 100 additions & 1 deletion src/agent-profile.ts
Original file line number Diff line number Diff line change
@@ -1,9 +1,108 @@
import { createHash } from 'node:crypto'
import type { AgentProfile } from '@tangle-network/agent-interface'
import { type HarnessType, harnessSupportsModel } from '@tangle-network/agent-interface'
import { ValidationError } from './errors'
import { canonicalize } from './pre-registration'

export type { AgentProfile } from '@tangle-network/agent-interface'
export type { AgentProfile, HarnessType } from '@tangle-network/agent-interface'

/**
 * The agentic coding harnesses an eval sweeps by default — the ones we care about
 * ranking. This is the SINGLE source of that list; consumers import it instead of
 * re-declaring their own (a re-declared list is how the fleet drifts). Pass an
 * explicit `harnesses` (e.g. `harnessTypeSchema.options` for literally every known
 * harness) to widen beyond these.
 *
 * The array is frozen: `readonly` is erased at runtime, so without the freeze a
 * plain-JS caller (or a cast) could push/splice this shared instance and silently
 * change the default sweep for every later caller. Copy it (`[...CODING_HARNESSES]`)
 * before sorting or otherwise mutating.
 */
export const CODING_HARNESSES: readonly HarnessType[] = Object.freeze([
  'opencode',
  'claude-code',
  'codex',
  'kimi-code',
])

/**
* Input to `expandProfileAxes`: one base profile plus the harness and model axes to
* cross it over. Only `base` is required; both axes have defaults.
*/
export interface ProfileAxisSpec {
/** The domain profile to sweep. Its prompt/tools/skills are held fixed; only the
* harness and model vary. `model.default` is the fallback model used when
* `models` is omitted. */
base: AgentProfile
/** Harnesses to cross. Default: {@link CODING_HARNESSES}. An explicitly empty
* list is rejected by `expandProfileAxes` rather than treated as "use the default". */
harnesses?: readonly HarnessType[]
/** Models to cross. Default: `[base.model.default]` — one model, i.e. today's
* single-model behaviour, so omitting this never changes an existing run. If the
* resulting list is empty (none supplied and no `base.model.default`),
* `expandProfileAxes` throws. */
models?: readonly string[]
/** Keep (harness, model) pairs the harness can't run instead of dropping them.
* Default: drop (via `harnessSupportsModel`), so a vendor-locked harness paired
* with a foreign model doesn't become a guaranteed-failing cell. */
keepIncompatible?: boolean
}

/**
 * Cross a base profile over a harness axis and a model axis, producing one
 * `AgentProfile` per runnable (harness, model) cell.
 *
 * Axes: `spec.harnesses` falls back to {@link CODING_HARNESSES}; `spec.models` falls
 * back to the single `base.model.default` when that is set.
 *
 * Each emitted cell is a shallow copy of `base` with:
 *  - `name` set to `<base name or "agent">/<harness>/<model>`,
 *  - `model.default` set to the cell's model (other `model` fields carried over),
 *  - `metadata.harness` / `metadata.harnessModel` stamped on top of the base
 *    metadata, which is what {@link harnessAxisOf} reads back.
 *
 * Cells are produced harness-major (all models for the first harness, then the
 * next). Pairs rejected by `harnessSupportsModel` are skipped unless
 * `spec.keepIncompatible` is set, and a cell whose `agentProfileId` was already
 * emitted is skipped so the result never contains two profiles with the same id.
 *
 * @param spec Base profile plus optional axes; see {@link ProfileAxisSpec}.
 * @returns The surviving cells, in generation order. Never empty.
 * @throws ValidationError when the harness axis is empty, when the model axis is
 *   empty, or when no cell survives the compatibility filter.
 */
export function expandProfileAxes(spec: ProfileAxisSpec): AgentProfile[] {
  const { base, keepIncompatible } = spec

  const harnessAxis = spec.harnesses ?? CODING_HARNESSES
  if (harnessAxis.length === 0) {
    throw new ValidationError('expandProfileAxes: no harnesses to sweep')
  }

  const fallbackModel = base.model?.default
  const modelAxis = spec.models ?? (fallbackModel ? [fallbackModel] : [])
  if (modelAxis.length === 0) {
    throw new ValidationError(
      'expandProfileAxes: no models to sweep — base profile has no model.default and none were supplied',
    )
  }

  // Keyed by profile id; a Map iterates in insertion order, so the first cell
  // generated for a given id wins and generation order is preserved.
  const cellsById = new Map<string, AgentProfile>()
  for (const harness of harnessAxis) {
    for (const model of modelAxis) {
      // keepIncompatible short-circuits the compatibility lookup entirely.
      const runnable = keepIncompatible || harnessSupportsModel(harness, model)
      if (!runnable) continue

      const cell: AgentProfile = {
        ...base,
        name: `${base.name ?? 'agent'}/${harness}/${model}`,
        model: { ...base.model, default: model },
        metadata: { ...(base.metadata ?? {}), harness, harnessModel: model },
      }
      const cellId = agentProfileId(cell)
      if (!cellsById.has(cellId)) {
        cellsById.set(cellId, cell)
      }
    }
  }

  if (cellsById.size === 0) {
    throw new ValidationError(
      `expandProfileAxes: every (harness, model) pair was incompatible (harnesses=[${harnessAxis.join(', ')}], models=[${modelAxis.join(', ')}]). Widen the models or pass keepIncompatible.`,
    )
  }
  return [...cellsById.values()]
}

/**
 * Recover the (harness, model) pair stamped on a matrix cell's metadata — the key a
 * `byHarness` pivot should group on, read straight off the profile rather than
 * re-derived from an id.
 *
 * @param profile Anything carrying an `AgentProfile`-style `metadata` bag.
 * @returns `{ harness, model }` when both `metadata.harness` and
 *   `metadata.harnessModel` are strings; `undefined` otherwise (e.g. the profile
 *   did not come from {@link expandProfileAxes}). The harness string is not
 *   validated against the known harness set, only cast.
 */
export function harnessAxisOf(
  profile: Pick<AgentProfile, 'metadata'>,
): { harness: HarnessType; model: string } | undefined {
  // Missing metadata behaves like an empty bag: both fields read as undefined.
  const { harness, harnessModel } = (profile.metadata ?? {}) as Record<string, unknown>
  if (typeof harness !== 'string' || typeof harnessModel !== 'string') {
    return undefined
  }
  return { harness: harness as HarnessType, model: harnessModel }
}

/**
* Collision-resistant, path-safe, human-readable profile id for eval artifacts.
Expand Down
21 changes: 21 additions & 0 deletions src/campaign/presets/run-profile-matrix.ts
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,9 @@ import {
agentProfileHash,
agentProfileId,
agentProfileModelId,
harnessAxisOf,
} from '../../agent-profile'
import { type AgentProfileCell, buildAgentProfileCell } from '../../agent-profile-cell'
import { AgentEvalError } from '../../errors'
import {
assertRealBackend,
Expand Down Expand Up @@ -213,6 +215,10 @@ interface BuildRecordArgs<TScenario extends Scenario, TArtifact> {
splitTag: RunSplitTag
commitSha: string
matrixId: string
/** The (profile, harness, model, dimensions) identity of this cell — attached to
* every record so results group by the canonical `groupRunsByAgentProfileCell`
* (harness/model aware) instead of profileId alone. */
agentProfileCell?: AgentProfileCell
scenario?: TScenario
corpusText?: (
artifact: TArtifact,
Expand Down Expand Up @@ -303,6 +309,7 @@ function buildRunRecord<TScenario extends Scenario, TArtifact>(
outcome,
splitTag,
scenarioId: cell.scenarioId,
...(args.agentProfileCell ? { agentProfile: args.agentProfileCell } : {}),
...(cell.error ? { failureMode: cell.error } : {}),
}

Expand Down Expand Up @@ -409,6 +416,19 @@ export async function runProfileMatrix<TScenario extends Scenario, TArtifact>(
runDir: join(opts.runDir, sanitize(profileId)),
})

// The canonical (profile, harness, model) identity for every record in this
// column, so results group by `groupRunsByAgentProfileCell` (harness/model
// aware). Harness comes from the axis stamp `expandProfileAxes` left on the
// profile; a profile that wasn't axis-expanded simply has no harness in its
// cell (unchanged grouping). Built once per profile.
const axis = harnessAxisOf(profile)
const agentProfileCell = await buildAgentProfileCell({
profileId,
sourceProfile: { kind: 'agent-interface-profile', hash: profileHash },
model,
...(axis ? { harness: { id: axis.harness } } : {}),
})

const profileRecords: RunRecord[] = []
for (const cell of campaign.cells) {
const record = buildRunRecord({
Expand All @@ -420,6 +440,7 @@ export async function runProfileMatrix<TScenario extends Scenario, TArtifact>(
splitTag,
commitSha: opts.commitSha,
matrixId,
agentProfileCell,
scenario: scenarioById.get(cell.scenarioId),
corpusText: opts.corpusText,
})
Expand Down
11 changes: 9 additions & 2 deletions src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -705,8 +705,15 @@ export { buildTrajectory } from './trajectory'

// ── Auxiliary statistical + decision modules ─────────────────────────

export type { AgentProfile } from './agent-profile'
export { agentProfileHash, agentProfileId, agentProfileModelId } from './agent-profile'
export type { AgentProfile, HarnessType, ProfileAxisSpec } from './agent-profile'
export {
agentProfileHash,
agentProfileId,
agentProfileModelId,
CODING_HARNESSES,
expandProfileAxes,
harnessAxisOf,
} from './agent-profile'
export type { BaselineOptions, BaselineReport, MetricSamples, MetricVerdict } from './baseline'
export { compareToBaseline, iqr, welchsTTest } from './baseline'
export type {
Expand Down
36 changes: 35 additions & 1 deletion tests/campaign/run-profile-matrix.test.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
import { describe, expect, it } from 'vitest'
import { type AgentProfile, agentProfileHash, agentProfileId } from '../../src/agent-profile'
import {
type AgentProfile,
agentProfileHash,
agentProfileId,
expandProfileAxes,
} from '../../src/agent-profile'
import { groupRunsByAgentProfileCell } from '../../src/agent-profile-cell'
import {
inMemoryCampaignStorage,
type JudgeConfig,
Expand Down Expand Up @@ -293,4 +299,32 @@ describe('runProfileMatrix', () => {
runProfileMatrix({ ...baseOpts(), profiles: [], dispatch: realDispatch }),
).rejects.toBeInstanceOf(ProfileMatrixError)
})

it('attaches a harness-bearing AgentProfileCell so runs group by harness', async () => {
  const pinnedModel = 'test-model@2025-01-01'
  const seed: AgentProfile = {
    name: 'agent',
    model: { default: pinnedModel },
    prompt: { systemPrompt: 'p' },
  }
  const cells = expandProfileAxes({
    base: seed,
    harnesses: ['opencode', 'codex'],
    models: [pinnedModel],
  })
  expect(cells).toHaveLength(2)

  const { records } = await runProfileMatrix({
    ...baseOpts(),
    profiles: cells,
    dispatch: realDispatch,
  })

  // Each record must expose the harness that the axis expansion stamped on its
  // profile, read from the attached cell rather than recomputed from an id.
  const harnessIds = records.map((record) => record.agentProfile?.harness?.id)
  for (const harnessId of harnessIds) {
    expect(harnessId).toMatch(/^(opencode|codex)$/)
  }
  expect(new Set(harnessIds)).toEqual(new Set(['opencode', 'codex']))

  // The stock grouping helper, not a custom pivot, must yield one group per
  // harness here: two harnesses crossed with one model.
  const grouped = groupRunsByAgentProfileCell(records)
  expect(grouped.size).toBe(2)
})
})
Loading