tangle-network · drewstone · Jul 1, 2026 · Jul 1, 2026 · Jul 1, 2026
diff --git a/clients/python/pyproject.toml b/clients/python/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "agent-eval-rpc"
-version = "0.100.3"
+version = "0.101.0"
 description = "Python RPC client for @tangle-network/agent-eval — judge content against rubrics over HTTP or stdio RPC. Eval logic runs in the Node runtime; this package is a thin wire client."
 readme = "README.md"
 requires-python = ">=3.10"

diff --git a/clients/python/src/agent_eval_rpc/__init__.py b/clients/python/src/agent_eval_rpc/__init__.py
@@ -58,7 +58,7 @@
 try:
     __version__ = version("agent-eval-rpc")
 except PackageNotFoundError:
-    __version__ = "0.100.3"
+    __version__ = "0.101.0"
 
 __all__ = [
     "Client",

diff --git a/package.json b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@tangle-network/agent-eval",
-  "version": "0.100.3",
+  "version": "0.101.0",
   "description": "Evaluate and improve AI agents from runs, traces, judges, and feedback. Compare candidates, cluster failures, measure lift, and gate releases.",
   "homepage": "https://github.com/tangle-network/agent-eval#readme",
   "repository": {

diff --git a/src/agent-profile.test.ts b/src/agent-profile.test.ts
@@ -4,6 +4,9 @@ import {
   agentProfileHash,
   agentProfileId,
   agentProfileModelId,
+  CODING_HARNESSES,
+  expandProfileAxes,
+  harnessAxisOf,
 } from './agent-profile'
 
 const base: AgentProfile = {
@@ -159,3 +162,90 @@ describe('agentProfileModelId', () => {
     )
   })
 })
+
+describe('CODING_HARNESSES', () => {
+  // Pins both the membership and the order of the default harness list.
+  it('is the canonical primary coding-harness set', () =>
+    expect(Array.from(CODING_HARNESSES)).toEqual(['opencode', 'claude-code', 'codex', 'kimi-code']))
+})
+
+describe('expandProfileAxes', () => {
+  const axisBase: AgentProfile = { name: 'agent', model: { default: 'deepseek-v4-flash' } }
+
+  it('defaults to CODING_HARNESSES × the base model — one compatible cell per harness', () => {
+    const cells = expandProfileAxes({ base: axisBase })
+    // The base model id has no vendor prefix, so every harness accepts it and the
+    // sweep yields exactly one cell per default harness.
+    expect(cells).toHaveLength(CODING_HARNESSES.length)
+    const sweptHarnesses = cells.map((cell) => harnessAxisOf(cell)?.harness)
+    expect(sweptHarnesses.sort()).toEqual([...CODING_HARNESSES].sort())
+    for (const cell of cells) expect(cell.model?.default).toBe('deepseek-v4-flash')
+  })
+
+  it('crosses harnesses × models with a distinct id per cell (no collapse)', () => {
+    const cells = expandProfileAxes({
+      base: axisBase,
+      models: ['m-a', 'm-b'],
+      harnesses: ['opencode', 'codex'],
+    })
+    const distinctIds = new Set(cells.map((cell) => agentProfileId(cell)))
+    expect([cells.length, distinctIds.size]).toEqual([4, 4])
+  })
+
+  it('drops (harness, model) pairs a vendor-locked harness cannot run', () => {
+    const cells = expandProfileAxes({
+      base: axisBase,
+      models: ['anthropic/claude-x', 'openai/gpt-x'],
+      harnesses: ['claude-code', 'codex'],
+    })
+    // Of the four candidate pairs only the two same-vendor ones may survive.
+    const axes = cells.map((cell) => harnessAxisOf(cell)).filter((axis) => axis !== undefined)
+    expect(axes).toHaveLength(2)
+    expect(axes).toContainEqual({ harness: 'claude-code', model: 'anthropic/claude-x' })
+    expect(axes).toContainEqual({ harness: 'codex', model: 'openai/gpt-x' })
+    expect(axes).not.toContainEqual({ harness: 'claude-code', model: 'openai/gpt-x' })
+  })
+
+  it('router-backed harness (opencode) accepts any provider', () => {
+    // Two different vendor prefixes against one harness: both cells must be kept.
+    const cells = expandProfileAxes({
+      base: axisBase,
+      harnesses: ['opencode'],
+      models: ['anthropic/x', 'openai/y'],
+    })
+    expect(cells).toHaveLength(2)
+  })
+
+  it('keepIncompatible retains an otherwise-dropped pair', () => {
+    // The same pair makes expandProfileAxes throw /incompatible/ without the flag.
+    const cells = expandProfileAxes({
+      keepIncompatible: true,
+      base: axisBase,
+      harnesses: ['claude-code'],
+      models: ['openai/gpt-x'],
+    })
+    expect(cells).toHaveLength(1)
+  })
+
+  it('carries harness + model in metadata and round-trips via harnessAxisOf', () => {
+    const first = expandProfileAxes({ base: axisBase, harnesses: ['opencode'], models: ['m1'] })[0]
+    expect(first?.metadata?.harness).toBe('opencode')
+    expect(first?.metadata?.harnessModel).toBe('m1')
+    expect(harnessAxisOf(first as AgentProfile)).toEqual({ harness: 'opencode', model: 'm1' })
+  })
+
+  it('fails loud on no harnesses / no models / all-incompatible', () => {
+    const sweep = (spec: Parameters<typeof expandProfileAxes>[0]) => () => expandProfileAxes(spec)
+    expect(sweep({ base: axisBase, harnesses: [] })).toThrow(/no harnesses/)
+    expect(sweep({ base: { name: 'x' } })).toThrow(/no models/)
+    const locked = sweep({ base: axisBase, harnesses: ['claude-code'], models: ['openai/gpt-x'] })
+    expect(locked).toThrow(/incompatible/)
+  })
+})
+
+describe('harnessAxisOf', () => {
+  it('returns undefined for a profile not produced by expandProfileAxes', () => {
+    const unstamped = [{ metadata: undefined }, { metadata: { foo: 'bar' } }]
+    for (const profile of unstamped) expect(harnessAxisOf(profile)).toBeUndefined()
+  })
+})
diff --git a/src/agent-profile.ts b/src/agent-profile.ts
@@ -1,9 +1,108 @@
 import { createHash } from 'node:crypto'
 import type { AgentProfile } from '@tangle-network/agent-interface'
+import { type HarnessType, harnessSupportsModel } from '@tangle-network/agent-interface'
 import { ValidationError } from './errors'
 import { canonicalize } from './pre-registration'
 
-export type { AgentProfile } from '@tangle-network/agent-interface'
+export type { AgentProfile, HarnessType } from '@tangle-network/agent-interface'
+
+/**
+ * Default harness axis: the agentic coding harnesses an eval sweeps — and ranks —
+ * when the caller names none. Intended as the only copy of this list: import it
+ * rather than declaring a local one, because separately kept lists drift apart.
+ * To go beyond these four, pass an explicit `harnesses` array (for example
+ * `harnessTypeSchema.options`, which presumably enumerates every known harness).
+ */
+export const CODING_HARNESSES: ReadonlyArray<HarnessType> = [
+  'opencode',
+  'claude-code',
+  'codex',
+  'kimi-code',
+]
+
+export interface ProfileAxisSpec {
+  /** Profile being swept. Each cell copies it, changing only `name`, `model.default`
+   *  and the added `metadata` stamps; its `model.default` is the fallback model. */
+  base: AgentProfile
+  /** Harness axis. When omitted, {@link CODING_HARNESSES} is used. */
+  harnesses?: ReadonlyArray<HarnessType>
+  /** Model axis. When omitted, the single model `base.model.default` is used, so a
+   *  spec without `models` sweeps exactly one model per harness. */
+  models?: ReadonlyArray<string>
+  /** Set to keep (harness, model) pairs that `harnessSupportsModel` rejects. When
+   *  unset those pairs are dropped, so a harness is never paired with a model it
+   *  cannot run. */
+  keepIncompatible?: boolean
+}
+
+/**
+ * Build the harness × model sweep for one base profile: the `AgentProfile[]` fed to
+ * matrix runners such as `runProfileMatrix`. Keeping the cross product here means no
+ * caller maintains its own harness list or column→profile mapping.
+ *
+ * Every cell is a shallow copy of `base` named `<base name>/<harness>/<model>`, with
+ * `model.default` set to the cell's model and `metadata.harness` /
+ * `metadata.harnessModel` stamped so {@link harnessAxisOf} can read the axis back.
+ * Harnesses default to {@link CODING_HARNESSES}; models default to `base.model.default`.
+ * Pairs rejected by `harnessSupportsModel` are skipped unless `keepIncompatible` is
+ * set, and a cell whose `agentProfileId` repeats an earlier one is emitted only once.
+ *
+ * @throws ValidationError when there are no harnesses, no models, or no cell survives
+ *   the compatibility filter.
+ */
+export function expandProfileAxes(spec: ProfileAxisSpec): AgentProfile[] {
+  const harnesses = spec.harnesses ?? CODING_HARNESSES
+  if (harnesses.length === 0) throw new ValidationError('expandProfileAxes: no harnesses to sweep')
+  const { base, keepIncompatible } = spec
+  const fallbackModel = base.model?.default
+  const models = spec.models ?? (fallbackModel ? [fallbackModel] : [])
+  if (models.length === 0) {
+    throw new ValidationError(
+      'expandProfileAxes: no models to sweep — base profile has no model.default and none were supplied',
+    )
+  }
+  // Keyed by agentProfileId. A Map iterates in insertion order, so the result stays
+  // harness-major and the first cell produced for an id is the one that is kept.
+  const cellsById = new Map<string, AgentProfile>()
+  for (const harness of harnesses) {
+    for (const model of models) {
+      if (!(keepIncompatible || harnessSupportsModel(harness, model))) continue
+      const cell: AgentProfile = {
+        ...base,
+        name: `${base.name ?? 'agent'}/${harness}/${model}`,
+        model: { ...base.model, default: model },
+        metadata: { ...(base.metadata ?? {}), harness, harnessModel: model },
+      }
+      const cellId = agentProfileId(cell)
+      if (!cellsById.has(cellId)) cellsById.set(cellId, cell)
+    }
+  }
+  if (cellsById.size === 0) {
+    throw new ValidationError(
+      `expandProfileAxes: every (harness, model) pair was incompatible (harnesses=[${harnesses.join(', ')}], models=[${models.join(', ')}]). Widen the models or pass keepIncompatible.`,
+    )
+  }
+  return Array.from(cellsById.values())
+}
+
+/**
+ * Recover the (harness, model) pair stamped on a matrix cell by
+ * {@link expandProfileAxes}, from the profile itself or from any object exposing
+ * the same `metadata`. Group results by this value rather than re-deriving a key.
+ * Yields `undefined` unless `metadata.harness` and `metadata.harnessModel` are
+ * both strings — e.g. for a profile that carries no axis stamp at all.
+ */
+export function harnessAxisOf(
+  profile: Pick<AgentProfile, 'metadata'>,
+): { harness: HarnessType; model: string } | undefined {
+  // Metadata is read as an untyped bag; a missing bag behaves like an empty one.
+  const stamps = (profile.metadata ?? {}) as Record<string, unknown>
+  const { harness, harnessModel } = stamps
+  // Guard clause: without two string stamps there is no axis to report.
+  if (typeof harness !== 'string' || typeof harnessModel !== 'string') return undefined
+  // The cast is unchecked — any string stored under `harness` is returned as-is.
+  return { harness: harness as HarnessType, model: harnessModel }
+}
 
 /**
  * Collision-resistant, path-safe, human-readable profile id for eval artifacts.

diff --git a/src/campaign/presets/run-profile-matrix.ts b/src/campaign/presets/run-profile-matrix.ts
@@ -39,7 +39,9 @@ import {
   agentProfileHash,
   agentProfileId,
   agentProfileModelId,
+  harnessAxisOf,
 } from '../../agent-profile'
+import { type AgentProfileCell, buildAgentProfileCell } from '../../agent-profile-cell'
 import { AgentEvalError } from '../../errors'
 import {
   assertRealBackend,
@@ -213,6 +215,10 @@ interface BuildRecordArgs<TScenario extends Scenario, TArtifact> {
   splitTag: RunSplitTag
   commitSha: string
   matrixId: string
+  /** The (profile, harness, model, dimensions) identity of this cell — attached to
+   *  every record so results group by the canonical `groupRunsByAgentProfileCell`
+   *  (harness/model aware) instead of profileId alone. */
+  agentProfileCell?: AgentProfileCell
   scenario?: TScenario
   corpusText?: (
     artifact: TArtifact,
@@ -303,6 +309,7 @@ function buildRunRecord<TScenario extends Scenario, TArtifact>(
     outcome,
     splitTag,
     scenarioId: cell.scenarioId,
+    ...(args.agentProfileCell ? { agentProfile: args.agentProfileCell } : {}),
     ...(cell.error ? { failureMode: cell.error } : {}),
   }
 
@@ -409,6 +416,19 @@ export async function runProfileMatrix<TScenario extends Scenario, TArtifact>(
       runDir: join(opts.runDir, sanitize(profileId)),
     })
 
+    // The canonical (profile, harness, model) identity for every record in this
+    // column, so results group by `groupRunsByAgentProfileCell` (harness/model
+    // aware). Harness comes from the axis stamp `expandProfileAxes` left on the
+    // profile; a profile that wasn't axis-expanded simply has no harness in its
+    // cell (unchanged grouping). Built once per profile.
+    const axis = harnessAxisOf(profile)
+    const agentProfileCell = await buildAgentProfileCell({
+      profileId,
+      sourceProfile: { kind: 'agent-interface-profile', hash: profileHash },
+      model,
+      ...(axis ? { harness: { id: axis.harness } } : {}),
+    })
+
     const profileRecords: RunRecord[] = []
     for (const cell of campaign.cells) {
       const record = buildRunRecord({
@@ -420,6 +440,7 @@ export async function runProfileMatrix<TScenario extends Scenario, TArtifact>(
         splitTag,
         commitSha: opts.commitSha,
         matrixId,
+        agentProfileCell,
         scenario: scenarioById.get(cell.scenarioId),
         corpusText: opts.corpusText,
       })

diff --git a/src/index.ts b/src/index.ts
@@ -705,8 +705,15 @@ export { buildTrajectory } from './trajectory'
 
 // ── Auxiliary statistical + decision modules ─────────────────────────
 
-export type { AgentProfile } from './agent-profile'
-export { agentProfileHash, agentProfileId, agentProfileModelId } from './agent-profile'
+export type { AgentProfile, HarnessType, ProfileAxisSpec } from './agent-profile'
+export {
+  agentProfileHash,
+  agentProfileId,
+  agentProfileModelId,
+  CODING_HARNESSES,
+  expandProfileAxes,
+  harnessAxisOf,
+} from './agent-profile'
 export type { BaselineOptions, BaselineReport, MetricSamples, MetricVerdict } from './baseline'
 export { compareToBaseline, iqr, welchsTTest } from './baseline'
 export type {

diff --git a/tests/campaign/run-profile-matrix.test.ts b/tests/campaign/run-profile-matrix.test.ts
@@ -1,5 +1,11 @@
 import { describe, expect, it } from 'vitest'
-import { type AgentProfile, agentProfileHash, agentProfileId } from '../../src/agent-profile'
+import {
+  type AgentProfile,
+  agentProfileHash,
+  agentProfileId,
+  expandProfileAxes,
+} from '../../src/agent-profile'
+import { groupRunsByAgentProfileCell } from '../../src/agent-profile-cell'
 import {
   inMemoryCampaignStorage,
   type JudgeConfig,
@@ -293,4 +299,32 @@ describe('runProfileMatrix', () => {
       runProfileMatrix({ ...baseOpts(), profiles: [], dispatch: realDispatch }),
     ).rejects.toBeInstanceOf(ProfileMatrixError)
   })
+
+  it('attaches a harness-bearing AgentProfileCell so runs group by harness', async () => {
+    const modelId = 'test-model@2025-01-01'
+    const base: AgentProfile = { name: 'agent', model: { default: modelId }, prompt: { systemPrompt: 'p' } }
+    const sweep = expandProfileAxes({
+      base,
+      harnesses: ['opencode', 'codex'],
+      models: [modelId],
+    })
+    expect(sweep).toHaveLength(2)
+
+    const { records } = await runProfileMatrix({
+      ...baseOpts(),
+      profiles: sweep,
+      dispatch: realDispatch,
+    })
+
+    // Each record must expose its cell, and that cell's harness must be one of the
+    // two swept — read straight off the record, with no key recomputed by the test.
+    const harnessIds = records.map((record) => record.agentProfile?.harness?.id)
+    for (const harnessId of harnessIds) {
+      expect(harnessId).toMatch(/^(opencode|codex)$/)
+    }
+    expect(new Set(harnessIds)).toEqual(new Set(['opencode', 'codex']))
+    // The library's own grouping helper (not an ad-hoc pivot) must split the two
+    // harnesses: one group per (profile, harness, model), whatever the scenario/rep.
+    expect(groupRunsByAgentProfileCell(records).size).toBe(2)
+  })
 })