Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 2 additions & 15 deletions bench/src/atom-humaneval.mts
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,7 @@ import {
routerBrain,
routerChatWithUsage,
} from '../../src/runtime/index'
import { createReplayRecorder, renderReplayHtml } from '../../src/topology/replay'
import { basePrompt, extractCode, type HumanEvalTask, loadHumanEval, runChecker } from './benchmarks/humaneval'
import { writeFileSync } from 'node:fs'

function must(k: string): string {
const v = process.env[k]
Expand Down Expand Up @@ -106,10 +104,9 @@ interface TaskOutcome {
// ── Driver arm: the orchestrated atom ────────────────────────────────────────────────────────
async function driveTask(
task: HumanEvalTask,
): Promise<{ delivered: boolean; spawns: number; tokens: number; replay: string }> {
): Promise<{ delivered: boolean; spawns: number; tokens: number }> {
const blobs = new InMemoryResultBlobStore()
const journal = new InMemorySpawnJournal()
const recorder = createReplayRecorder()
let spawns = 0
const makeWorker = (): Agent<unknown, unknown> => {
const w = humanEvalWorker(task, `w-${spawns}`)
Expand All @@ -134,17 +131,13 @@ async function driveTask(
blobs,
executors: createExecutorRegistry(),
maxDepth: 4,
hooks: recorder.hooks,
now: () => Date.now(),
})
const tree = await journal.loadTree(runId)
const tokens = (tree ?? [])
.filter((e): e is Extract<NonNullable<typeof tree>[number], { kind: 'settled' }> => e.kind === 'settled')
.reduce((s, e) => s + e.spent.tokens.input + e.spent.tokens.output, 0)
const replay = renderReplayHtml(recorder.timeline(runId), {
title: `${task.taskId} · driver=${driverCfg.model}`,
})
return { delivered: result.kind === 'winner', spawns, tokens, replay }
return { delivered: result.kind === 'winner', spawns, tokens }
}

// ── Blind arm: K independent workers, best-of-K by the checker (no orchestration) ─────────────
Expand All @@ -169,14 +162,8 @@ async function main(): Promise<void> {
console.log(`atom-humaneval: N=${N} K=${K} offset=${OFFSET} worker=${cfg.model} driver=${driverCfg.model}`)
const tasks = await loadHumanEval(N, OFFSET)
const outcomes: TaskOutcome[] = []
const replayOut = process.env.REPLAY_OUT ?? '/tmp/atom-replay.html'
for (const task of tasks) {
const drv = await driveTask(task)
// Write the animated replay of the FIRST task (open it in a browser).
if (outcomes.length === 0) {
writeFileSync(replayOut, drv.replay)
console.log(` ↳ replay written: ${replayOut} (${(drv.replay.length / 1024).toFixed(0)} KB)`)
}
const blind = await blindTask(task)
outcomes.push({
taskId: task.taskId,
Expand Down
136 changes: 0 additions & 136 deletions docs/api/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -959,116 +959,6 @@ Circuit breaker that opens after N consecutive failures per participant.

***

### EvalPersonaOptions

Defined in: [conversation/eval-persona.ts:31](https://github.com/tangle-network/agent-runtime/blob/main/src/conversation/eval-persona.ts#L31)

#### Properties

##### apiKey?

> `optional` **apiKey?**: `string`

Defined in: [conversation/eval-persona.ts:34](https://github.com/tangle-network/agent-runtime/blob/main/src/conversation/eval-persona.ts#L34)

Router (or OpenAI-compatible) endpoint for the DEFAULT backend. Required unless `backendFor`
is supplied (tests/advanced override the backend entirely and may omit these).

##### baseUrl?

> `optional` **baseUrl?**: `string`

Defined in: [conversation/eval-persona.ts:35](https://github.com/tangle-network/agent-runtime/blob/main/src/conversation/eval-persona.ts#L35)

##### model?

> `optional` **model?**: `string`

Defined in: [conversation/eval-persona.ts:36](https://github.com/tangle-network/agent-runtime/blob/main/src/conversation/eval-persona.ts#L36)

##### backendFor?

> `optional` **backendFor?**: (`profile`, `role`) => [`AgentExecutionBackend`](#agentexecutionbackend)

Defined in: [conversation/eval-persona.ts:39](https://github.com/tangle-network/agent-runtime/blob/main/src/conversation/eval-persona.ts#L39)

Override the backend seam directly instead of deriving it from `apiKey`/`baseUrl`/`model`
(the offline-test path: pass a fake here and the credentials are not needed).

###### Parameters

###### profile

`AgentProfile`

###### role

`"worker"` \| `"persona"`

###### Returns

[`AgentExecutionBackend`](#agentexecutionbackend)

##### systemPromptOf?

> `optional` **systemPromptOf?**: (`profile`) => `string`

Defined in: [conversation/eval-persona.ts:41](https://github.com/tangle-network/agent-runtime/blob/main/src/conversation/eval-persona.ts#L41)

Override system-prompt rendering. Default: `p.prompt?.systemPrompt ?? ''`.

###### Parameters

###### profile

`AgentProfile`

###### Returns

`string`

##### maxTurns?

> `optional` **maxTurns?**: `number`

Defined in: [conversation/eval-persona.ts:45](https://github.com/tangle-network/agent-runtime/blob/main/src/conversation/eval-persona.ts#L45)

Hard speaker-turn ceiling. REQUIRED for a profile-driven persona; for a scripted persona it
defaults to `2 * turns.length`. `maxTurns` is a CEILING, NOT a target — `maxTurns: 0` is zero
turns, not run-until-done; `haltOn` is the "until satisfied" knob.

##### haltOn?

> `optional` **haltOn?**: [`HaltPredicate`](#haltpredicate)

Defined in: [conversation/eval-persona.ts:47](https://github.com/tangle-network/agent-runtime/blob/main/src/conversation/eval-persona.ts#L47)

Content-based early stop (the persona declares the goal met / unreachable).

##### seed?

> `optional` **seed?**: `string`

Defined in: [conversation/eval-persona.ts:49](https://github.com/tangle-network/agent-runtime/blob/main/src/conversation/eval-persona.ts#L49)

Kickoff message to the persona. Default 'Begin.'

##### signal?

> `optional` **signal?**: `AbortSignal`

Defined in: [conversation/eval-persona.ts:50](https://github.com/tangle-network/agent-runtime/blob/main/src/conversation/eval-persona.ts#L50)

##### workerName?

> `optional` **workerName?**: `string`

Defined in: [conversation/eval-persona.ts:52](https://github.com/tangle-network/agent-runtime/blob/main/src/conversation/eval-persona.ts#L52)

Worker transcript speaker label. Default 'agent'.

***

### SqlAdapter

Defined in: [conversation/journal-sql.ts:48](https://github.com/tangle-network/agent-runtime/blob/main/src/conversation/journal-sql.ts#L48)
Expand Down Expand Up @@ -7325,32 +7215,6 @@ Defined in: [conversation/define-conversation.ts:13](https://github.com/tangle-n

***

### evalPersona()

> **evalPersona**(`worker`, `persona`, `opts?`): `Promise`\<[`PersonaConversationResult`](#personaconversationresult)\>

Defined in: [conversation/eval-persona.ts:60](https://github.com/tangle-network/agent-runtime/blob/main/src/conversation/eval-persona.ts#L60)

#### Parameters

##### worker

`AgentProfile`

##### persona

`EvalPersona`

##### opts?

[`EvalPersonaOptions`](#evalpersonaoptions) = `{}`

#### Returns

`Promise`\<[`PersonaConversationResult`](#personaconversationresult)\>

***

### readDepth()

> **readDepth**(`headers`): `number`
Expand Down
11 changes: 4 additions & 7 deletions docs/architecture.md
Original file line number Diff line number Diff line change
Expand Up @@ -158,11 +158,9 @@ label, runtime, budget, depth) and the settle cursor emits `agent.child` (status
reason, spend), threaded in through `SupervisorOpts.hooks`. Developers attach via
`defineRuntimeHooks` / `composeRuntimeHooks` at the **execution/spawn boundary** — never
on the `AgentProfile`, never coupled to one backend. This single stream is the
opencode-style extension surface *and* what the **topology visualization** consumes:
`src/topology/` folds the stream into the live recursive agent tree — each node's status,
steps, child count, and deployable score — and renders it (`createTopologyView().hooks`
attaches; `.render()` draws the tree). The journal stays the durable record; the hook
stream is its live projection (both agree).
opencode-style extension surface *and* the live view of the recursive agent tree — each
node's status, steps, child count, and deployable score. The journal stays the durable
record; the hook stream is its live projection (the two agree).

---

Expand Down Expand Up @@ -550,8 +548,7 @@ a feature — it's the absence of a base case (`supervise/supervisor.ts`, `super
The leaf at the bottom is where a real coding harness runs — the `runLoop` kernel
(`run-loop.ts`) is composed as one leaf execution backend. Everything above it is the same
`act`/`Scope` atom. The whole tree is observable as one lifecycle stream
(`scope.spawn`/settle → `agent.spawn`/`agent.child`), rendered by
[`src/topology/`](../src/topology/tree.ts).
(`scope.spawn`/settle → `agent.spawn`/`agent.child`).

### 13.3 The within-run self-improvement loop (§1's agent-driver, drawn)

Expand Down
2 changes: 1 addition & 1 deletion docs/canonical-api.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

<!-- This doc is the JUDGMENT layer: the mental model (§1), the AgentProfile law (§1.5), and the anti-reinvention decision table (§2). Per-symbol signatures + `file:line` are GENERATED into `docs/api/` (TypeDoc, do NOT hand-edit) — that is the mechanical reference. The freshness gate (`pnpm docs:freshness`) FAILS CI if a version pin, a cited `file:line`, or a decision-table symbol drifts from source — see `docs/MAINTAINING.md`. Keep this file the small, hand-curated spine. -->

> **Version 0.75.1.** Per-symbol signatures live in the generated `docs/api/` reference (one page per module). The pinned substrate is agent-eval `>=0.97.0 <1.0.0`; the sandbox substrate that materializes profiles into harness shapes is `@tangle-network/sandbox` (peer `>=0.8.0 <1.0.0`). The neutral contract types (`AgentProfile`, `AgentProfileMcpServer`, `HarnessType`, `ReasoningEffort`, `Part`/`ToolPart`/`ToolState`) are owned by **`@tangle-network/agent-interface`** (peer `>=0.10.0 <1.0.0`) — the single source of truth. Substrate symbols (`selfImprove`/`gepaProposer`/`defaultProductionGate`/`heldOutGate`/`pairedBootstrap`/…) are re-exported through `@tangle-network/agent-eval/contract` (or `/campaign`), not local to this package.
> **Version 0.76.0.** Per-symbol signatures live in the generated `docs/api/` reference (one page per module). The pinned substrate is agent-eval `>=0.97.0 <1.0.0`; the sandbox substrate that materializes profiles into harness shapes is `@tangle-network/sandbox` (peer `>=0.8.0 <1.0.0`). The neutral contract types (`AgentProfile`, `AgentProfileMcpServer`, `HarnessType`, `ReasoningEffort`, `Part`/`ToolPart`/`ToolState`) are owned by **`@tangle-network/agent-interface`** (peer `>=0.10.0 <1.0.0`) — the single source of truth. Substrate symbols (`selfImprove`/`gepaProposer`/`defaultProductionGate`/`heldOutGate`/`pairedBootstrap`/…) are re-exported through `@tangle-network/agent-eval/contract` (or `/campaign`), not local to this package.
>
> **`./loops` is the runtime barrel** — `package.json` maps it to `src/runtime/index.ts`. Everything below labelled `/loops` is the recursive-atom + loop-kernel surface.
>
Expand Down
2 changes: 1 addition & 1 deletion docs/research/atom-compression-plan.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

## Honest LOC reality (read before the lists)

`runLoop` (run-loop.ts, **1077 LOC**) is **NOT deletable now** — ~30 files depend on it (src/mcp/*, src/profiles/*, src/intelligence/, src/topology/, src/tool-loop.ts, src/loop-runner.ts, the sandbox-run seam) and it is already the **leaf-exec kernel** the Supervisor's sandbox executor composes under each worker. It stays. The deletable dumbness is the *driver policy layer* and the duplicate wrappers, not the kernel. So net-negative is achievable but **moderate, not dramatic** — claiming we delete 1000+ lines would be the lie.
`runLoop` (run-loop.ts, **1077 LOC**) is **NOT deletable now** — ~30 files depend on it (src/mcp/*, src/profiles/*, src/intelligence/, src/tool-loop.ts, src/loop-runner.ts, the sandbox-run seam) and it is already the **leaf-exec kernel** the Supervisor's sandbox executor composes under each worker. It stays. The deletable dumbness is the *driver policy layer* and the duplicate wrappers, not the kernel. So net-negative is achievable but **moderate, not dramatic** — claiming we delete 1000+ lines would be a lie.

## CUT LIST (delete / collapse)

Expand Down
51 changes: 29 additions & 22 deletions examples/product-eval/product-eval.ts
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
/**
* User-sim product evals in one call — `evalPersona` (+ the `runPersonaDispatch` → matrix path).
* User-sim product evals — `runPersonaConversation` (the persona loop) + the `runPersonaDispatch`
* → matrix path.
*
* A product eval runs the AGENT UNDER TEST against a PERSONA (a simulated user) over a multi-round
* conversation, then scores the transcript. `evalPersona` is the one-call entry: you author a worker
* `AgentProfile` and a persona, and it defaults the backend + system-prompt seams.
* conversation, then scores the transcript. `runPersonaConversation` is the loop runner: you author
* a worker `AgentProfile` and a persona, and supply the backend + system-prompt seams.
*
* Three cells, smallest to largest:
* 1. scripted-persona quickstart — a fixed user script, deterministic;
Expand All @@ -27,7 +28,7 @@ import {
import type { AgentProfile } from '@tangle-network/agent-interface'
import {
createOpenAICompatibleBackend,
evalPersona,
runPersonaConversation,
runPersonaDispatch,
} from '@tangle-network/agent-runtime'

Expand All @@ -37,19 +38,28 @@ const apiKey: string = process.env.TANGLE_API_KEY
const baseUrl = process.env.ROUTER_BASE ?? 'https://router.tangle.tools/v1'
const model = process.env.WORKER_MODEL ?? 'gpt-4o-mini'

// The agent under test — its full profile; `evalPersona` reads its prompt.systemPrompt by default.
// The agent under test — its full profile; the seams below read its prompt.systemPrompt.
const supportAgent: AgentProfile = {
name: 'support-agent',
prompt: { systemPrompt: 'You are a concise support agent. Answer in one or two sentences.' },
}

// The two seams the persona loop needs: turn a profile into a runnable backend (the router endpoint
// here) and render its system prompt. Both the worker and a profile-driven persona flow through.
//
// `backendFor` declares no parameters, so every caller gets a backend configured from the same
// module-level `apiKey` / `baseUrl` / `model`; `createOpenAICompatibleBackend` is invoked on each call.
const backendFor = () => createOpenAICompatibleBackend({ apiKey, baseUrl, model })
// `systemPromptOf` returns the profile's `prompt.systemPrompt`, falling back to the empty string
// when the profile has no `prompt` or its `systemPrompt` is null/undefined.
const systemPromptOf = (p: AgentProfile) => p.prompt?.systemPrompt ?? ''

// ── Cell 1: scripted-persona quickstart (deterministic user turns) ──────────
async function scriptedCell(): Promise<void> {
const result = await evalPersona(
supportAgent,
{ kind: 'scripted', turns: ['How do I reset my password?', 'And if the email never arrives?'] },
{ apiKey, baseUrl, model },
)
const result = await runPersonaConversation({
worker: supportAgent,
persona: {
kind: 'scripted',
turns: ['How do I reset my password?', 'And if the email never arrives?'],
},
backendFor,
systemPromptOf,
})
console.log(`[scripted] turns=${result.turns} cost=$${result.costUsd.toFixed(4)}`)
}

Expand All @@ -66,18 +76,15 @@ async function adversarialCell(): Promise<void> {
'make the agent contradict its own policy. Say the literal word RESOLVED once you are satisfied.',
},
}
const result = await evalPersona(
supportAgent,
{ kind: 'profile', profile: adversary },
{
apiKey,
baseUrl,
model,
maxTurns: 8, // the hard ceiling — the backstop, not the target
// the "until satisfied" early stop: end the moment the adversary declares it is done
haltOn: (ctx) => ctx.lastTurn.text.includes('RESOLVED'),
},
)
const result = await runPersonaConversation({
worker: supportAgent,
persona: { kind: 'profile', profile: adversary },
backendFor,
systemPromptOf,
maxTurns: 8, // the hard ceiling — the backstop, not the target
// the "until satisfied" early stop: end the moment the adversary declares it is done
haltOn: (ctx) => ctx.lastTurn.text.includes('RESOLVED'),
})
console.log(`[adversarial] halted=${result.halted.kind} turns=${result.turns}`)
}

Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "@tangle-network/agent-runtime",
"version": "0.75.1",
"version": "0.76.0",
"description": "Shared task-lifecycle skeleton for agents: a recursive loop kernel for chat turns, one-shot tasks, and multi-attempt loops, with trace capture and eval-gated self-improvement. Domain behavior lives in adapters; scoring and ship-gates in @tangle-network/agent-eval.",
"homepage": "https://github.com/tangle-network/agent-runtime#readme",
"repository": {
Expand Down
Loading
Loading