//! `tracker_design` — teaching tool. Returns a system_prompt + a library of
//! archetypes + the existing-tracker landscape so the agent can compose a
//! well-shaped tracker spec and call `artifact_create` with confidence.
//!
//! Archetype-driven (not intent-driven) for v1: server stays stateless, no
//! synthesis cost, transparent to the agent. Intent-driven tailoring is
//! deferred until archetype selection proves frustrating in practice.
use crate::librarian::catalog::{artifact, augmentation};
use crate::librarian::tools::ToolContext;
use anyhow::Result;
use serde::Deserialize;
use serde_json::{json, Value};
/// Arguments accepted by the `tracker_design` tool.
#[derive(Default, Deserialize)]
struct Args {
    /// Optional free-form description of what the caller wants to track
    /// (e.g. "tracker for v7.1 flag rollout"). The value is echoed back in
    /// the response as-is; it is reserved for future intent-driven tailoring.
    #[serde(default)]
    intent: Option<String>,
}
/// Version tag reported as `design_version` in the tool response.
const DESIGN_VERSION: &str = "1";
/// Cap for inline existing-trackers list. Above this, agent should call
/// `artifact_find {kind:"tracker"}` directly.
const EXISTING_TRACKERS_CAP: usize = 30;
pub fn archetypes() -> Value {
json!([
archetype_deployment_state(),
archetype_failure_table(),
archetype_metric_baseline(),
archetype_audit_issues(),
archetype_task_list(),
archetype_reflective(),
archetype_goal(),
])
}
/// Returns the `deployment_state` archetype as a JSON object.
///
/// The object carries the archetype `name`, a `when_to_use` description, an
/// example params value with a matching JSON Schema, an example render
/// template, a body skeleton, and a starting augmentation prompt. The example
/// models one feature flag with a per-environment `enabled`/`since` map.
fn archetype_deployment_state() -> Value {
json!({
"name": "deployment_state",
"when_to_use": "Tracking the current state of a feature flag, env rollout, or config value across environments. State changes per commit/deploy. Examples: 'v7.1 flag pending server restart', 'recipe rollout post-fix audit'.",
"params_shape_example": {
"flag_name": "intent_classifier_v71",
"envs": {
"dev": { "enabled": true, "since": "2026-04-12" },
"stage":{ "enabled": true, "since": "2026-04-15" },
"prod": { "enabled": false, "since": null }
},
"last_changed_commit": "abc1234"
},
"params_schema_example": {
"type": "object",
"required": ["flag_name", "envs"],
"properties": {
"flag_name": { "type": "string" },
"envs": {
"type": "object",
"additionalProperties": {
"type": "object",
"required": ["enabled"],
"properties": {
"enabled": { "type": "boolean" },
"since": { "type": ["string", "null"] }
}
}
},
"last_changed_commit": { "type": ["string", "null"] }
}
},
"render_template_example": "**Flag:** `{{ flag_name }}` \n\n| env | enabled | since |\n|-----|:-------:|-------|\n{% for env, s in envs|items %}| {{ env }} | {{ \"✅\" if s.enabled else \"❌\" }} | {{ s.since or \"—\" }} |\n{% endfor %}\n_Last changed: {{ last_changed_commit or \"—\" }}_",
"body_skeleton": "## Why this flag exists\n\n_Brief: what the flag controls, who owns it._\n\n## Rollout plan\n\n_Steps and gates._\n\n## History\n\n_Append dated session blocks: ### YYYY-MM-DD — <event>_",
"prompt_template": "Maintain the live state of feature flag `<NAME>` across environments. Pull current values from settings files via `gather_from: file`. When envs disagree with the deployed commit (`gather_from: git_log`), prefer the most recent commit-derived value and note the divergence in body history. Keep params strictly mechanical — narrative belongs in body."
})
}
/// Returns the `failure_table` archetype as a JSON object.
///
/// Besides the fields shared by the other archetypes (name, when_to_use,
/// params example + schema, render template, body skeleton, prompt template)
/// this one also sets `entry_collection` to `"failures"`, the params key that
/// holds the array of F-N entries.
fn archetype_failure_table() -> Value {
json!({
"name": "failure_table",
"when_to_use": "Numbered failure list (F-1..F-N) from a test/eval suite. Status flips often as fixes land. Examples: 'eval test suite tracker', 'chat-runtime quality audit'.",
"params_shape_example": {
"failures": [
{ "id": "F-1", "status": "fail", "owner": "@mareurs", "last_seen": "2026-04-29", "notes": "regression after temporal fix" },
{ "id": "F-2", "status": "pass", "owner": "@mareurs", "last_seen": "2026-04-30", "notes": "fixed in abc123" }
],
"suite": "chat-eval-v3"
},
"params_schema_example": {
"type": "object",
"required": ["failures"],
"properties": {
"suite": { "type": "string" },
"failures": {
"type": "array",
"items": {
"type": "object",
"required": ["id", "status"],
"properties": {
"id": { "type": "string", "pattern": "^F-\\d+$" },
"status": { "type": "string", "enum": ["fail", "pass", "flaky", "wontfix"] },
"owner": { "type": "string" },
"last_seen": { "type": "string" },
"notes": { "type": "string" }
}
}
}
}
},
"render_template_example": "**Suite:** `{{ suite }}` — {{ failures|selectattr(\"status\",\"equalto\",\"fail\")|list|length }} failing / {{ failures|length }} total\n\n| id | status | owner | last seen | notes |\n|----|--------|-------|-----------|-------|\n{% for f in failures %}| {{ f.id }} | {{ f.status }} | {{ f.owner or \"—\" }} | {{ f.last_seen or \"—\" }} | {{ f.notes or \"\" }} |\n{% endfor %}",
"body_skeleton": "## Suite methodology\n\n_What the suite tests, how it runs, where results live._\n\n## Per-failure detail\n\n_Optional deeper notes per F-N when warranted._\n\n## History\n\n_### YYYY-MM-DD — <event>_",
"prompt_template": "Maintain the F-N failure list. After each suite run (gather_from: file pointing at the latest junit/json report), update each failure's status, last_seen, and notes. Add new F-N entries for new failures (next free integer). Never delete an entry — mark fixed entries as pass with a notes line citing the commit. Body holds methodology and per-failure deep dives.",
"entry_collection": "failures"
})
}
/// Returns the `metric_baseline` archetype as a JSON object.
///
/// The object carries the archetype `name`, a `when_to_use` description, an
/// example params value (baseline, current, dated sessions) with a matching
/// JSON Schema, an example render template, a body skeleton, and a starting
/// augmentation prompt.
fn archetype_metric_baseline() -> Value {
json!({
"name": "metric_baseline",
"when_to_use": "Living benchmark log with a baseline + dated session deltas. Examples: 'retrieval-improvement', 'eval-implementation T0/T1 deltas'.",
"params_shape_example": {
"baseline": { "P@5": 0.145, "R@5": 0.724, "captured": "2026-04-24" },
"current": { "P@5": 0.193, "R@5": 0.781, "captured": "2026-05-01" },
"sessions": [
{ "date": "2026-04-29", "label": "T-A prose-lane", "deltas": { "P@5": "+0.017" } },
{ "date": "2026-04-30", "label": "T-B docx-driven", "deltas": { "P@5": "+0.031" } }
]
},
"params_schema_example": {
"type": "object",
"required": ["baseline", "current"],
"properties": {
"baseline": { "type": "object", "additionalProperties": true },
"current": { "type": "object", "additionalProperties": true },
"sessions": {
"type": "array",
"items": {
"type": "object",
"required": ["date", "label"],
"properties": {
"date": { "type": "string" },
"label": { "type": "string" },
"deltas": { "type": "object", "additionalProperties": true }
}
}
}
}
},
// `deltas` is optional in the schema above, so the inner loop is guarded:
// piping a missing value through `|items` is an error rather than an empty
// iteration, which would break rendering for a schema-valid session.
"render_template_example": "**Baseline** ({{ baseline.captured }}): {% for k, v in baseline|items %}{% if k != 'captured' %}{{ k }}={{ v }} {% endif %}{% endfor %}\n**Current** ({{ current.captured }}): {% for k, v in current|items %}{% if k != 'captured' %}{{ k }}={{ v }} {% endif %}{% endfor %}\n\n## Sessions\n{% for s in sessions %}- **{{ s.date }} — {{ s.label }}**: {% if s.deltas %}{% for k, v in s.deltas|items %}{{ k }}={{ v }} {% endfor %}{% endif %}\n{% endfor %}",
"body_skeleton": "## What we're measuring\n\n_Metrics, dataset, harness._\n\n## Method log\n\n_Append per-session writeups: why we ran this trial, what we changed, what we learned._\n\n### YYYY-MM-DD — <session label>",
"prompt_template": "Maintain baseline + current metrics + per-session deltas. After each benchmark run (gather_from: file pointing at metrics JSON), update `current` and append a session entry. Don't move `baseline` unless an explicit re-baselining is decided in body. Narrative (why we ran this, what we learned) lives in body, not params."
})
}
/// Returns the `audit_issues` archetype as a JSON object.
///
/// The object carries the archetype `name`, a `when_to_use` description, an
/// example params value (a numbered `issues` array) with a matching JSON
/// Schema, an example render template, a body skeleton, and a starting
/// augmentation prompt. The schema requires `n`, `title`, `severity` and
/// `status` per issue; the remaining provenance fields are optional.
fn archetype_audit_issues() -> Value {
json!({
"name": "audit_issues",
"when_to_use": "Numbered audit output: issue table with severity, status, owner. Examples: 'chunking-pipeline audit', 'production-trace audit'.",
"params_shape_example": {
"issues": [
{ "n": 1, "title": "Long PDFs split mid-sentence", "severity": "high", "status": "fixed", "owner": "@mareurs",
"severity_reason": "splits within a paragraph break sentence boundaries; downstream retrieval scores 0.05 lower P@5",
"ref_kind": "code_symbol", "md_file": "docs/chunking-audit.md", "md_line": 42,
"raw_ref": "src/chunking/splitter.rs::Splitter::next_chunk",
"first_seen_commit": "abc1234", "first_seen_at": "2026-04-12T09:30:00Z",
"last_verified_at": "2026-05-15T14:20:00Z" },
{ "n": 2, "title": "Headers lost in xlsx", "severity": "med", "status": "open", "owner": "@mareurs" }
]
},
"params_schema_example": {
"type": "object",
"required": ["issues"],
"properties": {
"issues": {
"type": "array",
"items": {
"type": "object",
"required": ["n", "title", "severity", "status"],
"properties": {
"n": { "type": "integer", "minimum": 1 },
"title": { "type": "string" },
"severity": { "type": "string", "enum": ["high", "med", "low"] },
"status": { "type": "string", "enum": ["open", "in-progress", "fixed", "wontfix"] },
"owner": { "type": "string" },
"severity_reason": { "type": "string", "description": "Optional rationale for the severity rating." },
"ref_kind": { "type": "string", "description": "Optional: nature of the target ref (e.g. code_symbol, file_path, url, line)." },
"md_file": { "type": "string", "description": "Optional: path of the markdown document where this finding originated." },
"md_line": { "type": "integer", "minimum": 1, "description": "Optional: line number within md_file." },
"raw_ref": { "type": "string", "description": "Optional: the exact ref string as it appeared in the source." },
"first_seen_commit": { "type": "string", "description": "Optional: git commit SHA where the issue was first observed." },
"first_seen_at": { "type": "string", "format": "date-time", "description": "Optional: ISO-8601 timestamp of first observation." },
"last_verified_at": { "type": "string", "format": "date-time", "description": "Optional: ISO-8601 timestamp of the most recent check that re-confirmed the issue." }
}
}
}
}
},
"render_template_example": "| # | Issue | Severity | Status | Owner |\n|--:|-------|:--------:|:------:|-------|\n{% for i in issues %}| {{ i.n }} | {{ i.title }} | {{ i.severity }} | {{ i.status }} | {{ i.owner or \"—\" }} |\n{% endfor %}",
"body_skeleton": "## Audit scope and methodology\n\n_What was audited, when, by whom._\n\n## Per-issue detail\n\n_For each issue: Symptom / Root cause / Fix / Predicted impact._\n\n## History\n\n_### YYYY-MM-DD — <event>_",
"prompt_template": "Maintain the numbered issue table. Status flips drive updates: as issues are fixed, mark `fixed` with a body note. Don't renumber. New issues get the next integer. Body has per-issue Symptom/RootCause/Fix sections — update those when status changes."
})
}
/// Returns the `task_list` archetype as a JSON object.
///
/// The object carries the archetype `name`, a `when_to_use` description, an
/// example params value (optional `phases` plus required `tasks`) with a
/// matching JSON Schema, an example render template, a body skeleton, and a
/// starting augmentation prompt.
fn archetype_task_list() -> Value {
json!({
"name": "task_list",
"when_to_use": "Followup queue or phase-based task list with done/in-progress/open status. Examples: this very tracker (`artifact-augmentation-followups`), 'knowledge-injection-future-improvements'.",
"params_shape_example": {
"phases": [
{ "n": 1, "title": "render_template + params_schema", "status": "code-complete" },
{ "n": 2, "title": "refresh_stale tool", "status": "open" }
],
"tasks": [
{ "id": "T-1", "task": "Merge feat branch", "status": "done", "phase": 0 },
{ "id": "T-2", "task": "Schema v4 columns", "status": "done", "phase": 1 },
{ "id": "T-9", "task": "refresh_stale design", "status": "open", "phase": 2 }
]
},
"params_schema_example": {
"type": "object",
"required": ["tasks"],
"properties": {
"phases": {
"type": "array",
"items": {
"type": "object",
"required": ["n", "title", "status"],
"properties": {
"n": { "type": "integer", "minimum": 0 },
"title": { "type": "string" },
"status": { "type": "string", "enum": ["open", "in-progress", "code-complete", "done", "blocked", "dropped"] }
}
}
},
"tasks": {
"type": "array",
"items": {
"type": "object",
"required": ["id", "task", "status"],
"properties": {
"id": { "type": "string", "pattern": "^T-\\d+$" },
"task": { "type": "string" },
"status": { "type": "string", "enum": ["open", "in-progress", "done", "blocked", "dropped"] },
"phase": { "type": "integer", "minimum": 0 },
"notes": { "type": "string" }
}
}
}
}
},
"render_template_example": "## Phase status\n\n| Phase | Title | Status |\n|------:|-------|--------|\n{% for p in phases %}| {{ p.n }} | {{ p.title }} | {{ p.status }} |\n{% endfor %}\n## Tasks\n\n| ID | Task | Status | Phase |\n|---:|------|--------|------:|\n{% for t in tasks %}| {{ t.id }} | {{ t.task }} | {{ t.status }} | {{ t.phase if t.phase is defined else \"—\" }} |\n{% endfor %}",
"body_skeleton": "## Why this initiative exists\n\n_Brief context._\n\n## Phase descriptions\n\n_For each phase: Why / Shape / Open questions / Acceptance._\n\n## History\n\n_### YYYY-MM-DD — <event>_",
"prompt_template": "Maintain the phase + task tables. Mark tasks done as commits land (gather_from: git_log filtered by relevant paths). Add new tasks under the right phase as scope expands. Don't delete completed tasks — they're part of the record. Phase descriptions live in body, individual task one-liners stay in params."
})
}
/// Returns the `reflective` archetype as a JSON object.
///
/// The object carries the archetype `name`, a `when_to_use` description, a
/// deliberately small params example (`status`, `started`) with a matching
/// JSON Schema that has no required keys, a one-line render template, a body
/// skeleton, and a starting augmentation prompt.
fn archetype_reflective() -> Value {
json!({
"name": "reflective",
"when_to_use": "Design brainstorm, decision log, options-being-weighed document. Content requires JUDGMENT, not gathering. Examples: 'plan-lifecycle-tracking', 'heuristic-code-analysis', 'agent-memory-research'. Keep params minimal or empty — the body IS the tracker.",
"params_shape_example": {
"status": "scoping",
"started": "2026-04-21"
},
"params_schema_example": {
"type": "object",
"properties": {
"status": { "type": "string", "enum": ["scoping", "active", "deferred", "decided", "archived"] },
"started": { "type": "string" }
}
},
"render_template_example": "_**Status:** {{ status or \"scoping\" }}{% if started %} — **Started:** {{ started }}{% endif %}_",
"body_skeleton": "## Why this exists\n\n_The problem we're scoping._\n\n## Options being weighed\n\n- **Option A** — ...\n- **Option B** — ...\n\n## Anti-goals\n\n_What we're explicitly NOT trying to solve._\n\n## Decision deferred / made\n\n_If decided: when, by whom, why. If deferred: under what conditions we'd revisit._\n\n## History\n\n_### YYYY-MM-DD — <event>_",
"prompt_template": "This is a reflective tracker — prose-driven, not state-driven. On refresh, do NOT rewrite body sections; only update the lightweight status line in params if the user explicitly changes status. New options or decisions go in body via append. Augmentation refresh should be rare — most updates here are human edits."
})
}
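// Phase 1 — per-archetype reconciliation clauses. These are the strings the LLM
// reads from rule 1 of the augmentation prompt. After Phase 1 lands they are
// also the strings `goal_aggregation::child_status_pure` is unit-tested against,
// keeping prompt and code in sync.
//
// Edit here, NOT inline in `archetype_goal()`'s prompt JSON.
//
// NOTE(review): no clause constants are defined at this point in the file;
// rule 1 is currently written inline in `archetype_goal()`'s `prompt_template`.
// Confirm whether the constants were moved or removed, then update this comment.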
/// Returns the `goal` archetype as a JSON object.
///
/// The object carries the archetype `name`, a `when_to_use` description, an
/// example params value (criterion, status, acceptance signals, children,
/// progress log, gather sources) with a matching JSON Schema, an example
/// render template, a body skeleton, and the aggregation prompt the
/// augmentation refresh follows.
///
/// NOTE(review): the prompt's STOP CONDITION lists "One progress_log entry
/// appended" while rule 3 says to skip the append on a no-change refresh —
/// confirm which wording is intended.
fn archetype_goal() -> Value {
json!({
"name": "goal",
"when_to_use": "Tracking an outcome-stated objective whose completion depends on a named criterion and on aggregated state of sibling/child artifacts. Use when the work has a definable 'done' line, decomposes into typed sub-trackers (tests, tasks, metrics, audits), and survives across sessions. Examples: 'all flaky tests resolved + suite green for 3 runs', 'retrieval P@5 reaches 0.20 on benchmark X', 'plan-lifecycle subsystem ships behind feature flag'. Not for: open-ended research (use `reflective`), single-metric tracking (use `metric_baseline`), bare task lists with no completion semantics (use `task_list`), goals with fewer than 2 child sub-trackers (the container archetype's job is aggregation — without 2+ children to aggregate, use the underlying archetype directly).",
"params_shape_example": {
"criterion": "Retrieval pipeline P@5 ≥ 0.20 on benchmark-25tc, with no regression on R@5",
"status": "active",
"blocked_reason": null,
"acceptance_signals": [
{"description": "P@5 ≥ 0.20 on benchmark-25tc", "met": false, "evidence": "metric_baseline C-1: current.P@5=0.193 >= 0.20", "kind": "metric_threshold", "evidence_child_id": "C-1", "metric_key": "P@5", "op": ">=", "threshold": 0.20},
{"description": "No new failures in chat-eval-v3", "met": true, "evidence": "failure_table C-2: 0/12 fail|flaky", "kind": "failure_table_clean", "evidence_child_id": "C-2"},
{"description": "All reranker-tuning tasks done", "met": false, "evidence": "task_list C-3: 4/7 done", "kind": "task_list_complete", "evidence_child_id": "C-3"},
{"description": "Out-of-band human review complete", "met": false, "evidence": "pending stakeholder sign-off", "kind": "freeform"}
],
"children": [
{"id": "C-1", "artifact_id": "a1b2c3d4", "title": "Retrieval Benchmark", "archetype": "metric_baseline", "status": "in-progress"},
{"id": "C-2", "artifact_id": "d4e5f6a7", "title": "chat-eval-v3 failures", "archetype": "failure_table", "status": "active"},
{"id": "C-3", "artifact_id": "b9c8d7e6", "title": "Reranker tuning tasks", "archetype": "task_list", "status": "done"}
],
"progress_log": [
{"date": "2026-05-12", "note": "Reranker tuning landed. P@5 0.145 → 0.193.", "evidence_commits": ["abc1234"], "evidence_artifacts": ["a1b2c3d4"]},
{"date": "2026-05-14", "note": "chat-eval-v3 stable. Need final 7pt P@5.", "evidence_commits": [], "evidence_artifacts": ["d4e5f6a7"]}
],
"gather_from": [
{
"source": "git_log",
"since": "last_refreshed_at",
"limit": 30,
"grep": "<path or component pattern named in your criterion>"
}
]
},
"params_schema_example": {
"type": "object",
"required": ["criterion", "status", "children"],
"properties": {
"criterion": { "type": "string" },
"status": { "type": "string", "enum": ["scoping","active","pending-confirmation","done","blocked","abandoned"] },
"blocked_reason": { "type": ["string","null"] },
"acceptance_signals": {
"type": "array",
"items": {
"type": "object",
"required": ["description","met"],
"properties": {
"description": { "type": "string" },
"met": { "type": "boolean" },
"evidence": { "type": "string" },
"kind": { "type": "string", "enum": ["freeform","audit_issues_open_count","failure_table_clean","task_list_complete","metric_threshold","reflective_decided","deployment_envs_enabled"], "description": "Optional discriminator for Rust-side evaluation (amendment D4). Default freeform — human-evaluated. Other kinds drive deterministic .met derivation from the cited child's params." },
"evidence_child_id": { "type": "string", "pattern": "^C-\\d+$", "description": "Required for non-freeform kinds — names the child whose params satisfy the signal." },
"max_open": { "type": "integer", "minimum": 0, "description": "audit_issues_open_count only — max allowed status=open count." },
"metric_key": { "type": "string", "description": "metric_threshold only — key path under child's params.current to read." },
"op": { "type": "string", "enum": [">=",">","<=","<","=="], "description": "metric_threshold only — comparison operator." },
"threshold": { "type": "number", "description": "metric_threshold only — RHS of the comparison." },
"envs": { "type": ["array","null"], "items": {"type": "string"}, "description": "deployment_envs_enabled only — subset of envs required enabled. null = all envs must be enabled." }
}
}
},
"children": {
"type": "array",
"items": {
"type": "object",
"required": ["id","artifact_id","title","archetype","status"],
"properties": {
"id": { "type": "string", "pattern": "^C-\\d+$" },
"artifact_id": { "type": "string" },
"title": { "type": "string" },
"archetype": { "type": "string" },
"status": { "type": "string", "enum": ["pending","active","in-progress","done","blocked","orphan","unknown"] }
}
}
},
"progress_log": {
"type": "array",
"items": {
"type": "object",
"required": ["date","note"],
"properties": {
"date": { "type": "string" },
"note": { "type": "string" },
"evidence_commits": { "type": "array", "items": { "type": "string" } },
"evidence_artifacts": { "type": "array", "items": { "type": "string" } }
}
}
}
}
},
// The "Recent progress" list walks the log newest-first and stops after
// three rows via `loop.index`. The `slice(n)` filter is NOT a "take n": it
// splits the sequence into n columns, so `|slice(3)|first` would show
// ceil(len/3) entries instead of the last three.
"render_template_example": "**Goal:** {{ criterion }}\n**Status:** {{ status }}{% if blocked_reason %} — _blocked: {{ blocked_reason }}_{% endif %}\n\n{% if acceptance_signals %}**Acceptance signals** — {{ acceptance_signals|selectattr(\"met\")|list|length }}/{{ acceptance_signals|length }} met\n\n| signal | met | evidence |\n|--------|:---:|----------|\n{% for s in acceptance_signals %}| {{ s.description }} | {{ \"✅\" if s.met else \"❌\" }} | {{ s.evidence or \"—\" }} |\n{% endfor %}{% endif %}\n\n**Children** — {{ children|selectattr(\"status\",\"equalto\",\"done\")|list|length }}/{{ children|length }} done\n\n| id | title | archetype | status |\n|---:|-------|-----------|--------|\n{% for c in children %}| {{ c.id }} | {{ c.title }} | {{ c.archetype }} | {{ c.status }} |\n{% endfor %}\n\n{% if progress_log %}**Recent progress** _(last 3 of {{ progress_log|length }})_\n\n{% for p in progress_log|reverse %}{% if loop.index <= 3 %}- **{{ p.date }}**: {{ p.note }}\n{% endif %}{% endfor %}{% endif %}",
"body_skeleton": "## Why this goal exists\n\n_Briefly: the business / engineering driver. Two to four sentences._\n\n## Acceptance criteria (prose)\n\n_Long-form acceptance criteria. Mirrors `acceptance_signals` in params but with rationale, counterexamples, and what's explicitly out of scope._\n\n## Decomposition rationale\n\n_Why these children, in this archetype mix. When new children are spawned mid-refresh, the synthesizer appends a one-paragraph rationale here citing the trigger._\n\n## History\n\n_### YYYY-MM-DD — <event>_\n",
"prompt_template": "Maintain a goal-tracker. Your job is **aggregation**, not evaluation: reconcile each child's state into the goal's params using ground truth supplied by the refresh pipeline. Do not recompute children's evidence — trust the child's own params.\n\nINPUTS (gather):\n- This goal's current params.\n- `context.deterministic_child_statuses` — an array, one entry per linked child, of `{child_id, artifact_id, archetype, status, basis}`. `basis` is `\"deterministic\"` for archetypes the Rust kernel resolved, `\"needs parent context\"` for `metric_baseline` (you evaluate it via rule 1b), `\"unknown archetype\"` for any archetype the kernel doesn't know, `\"no augmentation\"` if the child has no augmentation row, or `\"child unreachable\"` if the artifact_id has no row in the catalog.\n- Optional: commit log scoped to paths the criterion names (gather_from: git_log).\n\nUPDATE RULES:\n\n1. Reconcile each `children[].status` from the child's actual status:\n\n a. **First, copy ground truth from `context.deterministic_child_statuses`.** For every entry whose `basis == \"deterministic\"`, copy its `status` into `children[id].status` verbatim. Do not reinterpret — these archetypes are handled by the Rust kernel and its verdict is authoritative.\n\n b. **For entries whose `basis != \"deterministic\"`** — e.g. `basis == \"needs parent context\"` (a `metric_baseline` child with no citing `acceptance_signals[kind=metric_threshold]`), `basis == \"unknown archetype\"` (archetype not yet in the Rust kernel), or `basis == \"no augmentation\"` (child has no augmentation row) — leave `children[id].status` at the value the entry carries (the gather produced a best-guess `\"active\"`) and append a progress_log note flagging the gap so a future refresh can close it.\n\n c. **For entries with `basis == \"child unreachable\"`** → set `children[id].status = \"orphan\"`. Do NOT delete the row.\n\n2. 
For each `acceptance_signals[i]`, set `.met` by looking up the cited child's params and copying the relevant value forward. Do not re-derive the underlying metric — read it from the child's params verbatim and compare against the signal's description. Update the `evidence` string to cite the child id and the specific datum.\n\n3. `refresh_meta` is **read-only** for you — Rust computes it. Copy `context.refresh_meta` verbatim into `params.refresh_meta` (every field, byte-identical). Then optionally append at most one entry to `progress_log` IFF `refresh_meta.children_status_delta` is non-empty OR `refresh_meta.commit_count_since_last > 0`. Skip the append on a no-change refresh — Rust increments `refresh_meta.unchanged_refreshes`. **When you DO append, populate every field of the new entry — none are optional in practice:**\n - `date`: today (UTC, `YYYY-MM-DD`).\n - `note`: one-line summary — cite the children that flipped status and the commits that landed.\n - `evidence_commits`: short `hash` strings selected from `context.git_log`, restricted to commits whose `subject` indicates work on a path or component named in this goal's `criterion`. The gather is already date-windowed (the augmentation's `gather_from: git_log` should set `since: \"last_refreshed_at\"` so `context.git_log` only carries commits after `refresh_meta.last_refresh_at`). `refresh_meta.commit_count_since_last` equals `len(context.git_log)`; if zero, leave `[]`.\n - `evidence_artifacts`: child `artifact_id`s for each entry in `refresh_meta.children_status_delta` — look up the delta's `child_id` in `params.children` and copy its `artifact_id`. If the delta is empty, leave `[]`.\n\n4. AUTO-CLOSE GATE — **Rust enforces this; you may propose `status: \"done\"` but the merge will be rejected unless ALL conditions hold:**\n a. `len(children) >= 2` (amendment D9 — single-child or empty goals should use the underlying archetype directly)\n b. Every `children[].status == \"done\"`\n c. 
Every `acceptance_signals[].met` is true\n If you flip status to \"done\" with any condition unmet, the augment call returns a recoverable error citing the failing condition. Leave status unchanged in that case; the prior body's History entry stays as-is.\n\n5. SCOPE GROWTH: if your aggregation surfaces a missing sub-objective, you MAY add **at most one** new child per refresh (amendment D10 — Rust rejects merges that introduce more than one new `children[].id`). Defer the rest to a follow-up refresh.\n a. Call artifact(action=\"create\", kind=\"tracker\", augment={}) with the appropriate existing archetype (failure_table, task_list, metric_baseline, audit_issues, reflective, deployment_state, or nested goal).\n b. Call artifact(action=\"link\", src_id=THIS_GOAL_ID, dst_id=NEW_CHILD_ID, rel=\"child\").\n c. Add the new child to `children[]` with the next free C-N id.\n d. Append one paragraph to body \"Decomposition rationale\" citing the trigger.\n\nSTOP CONDITION (you are done with this refresh when):\n- All children reconciled.\n- One progress_log entry appended.\n- Auto-close gate evaluated.\n- Output: the new params object. Body edits only for History append or Decomposition rationale append on scope growth.\n\nBody holds rationale and history; params hold mechanical state. Keep them separated."
})
}
/// Markdown design guide returned verbatim as `system_prompt` in the tool
/// response. It walks the agent through picking an archetype, writing the
/// augmentation prompt, shaping params/schema/template/body, and finally
/// calling `artifact_create`.
///
/// This is a raw string (`r#"…"#`): backslashes are literal, so quotes inside
/// the template examples must be written as plain `"` — never `\"`.
const SYSTEM_PROMPT: &str = r#"# How to design a tracker
A tracker is an artifact that mixes **live state** (params, refreshed often by gather sources) with **prose** (body, edited rarely by humans). The art of designing a good tracker is putting the right thing in the right place.
## Step 1 — Pick an archetype
Match the user's intent to one of the 7 archetypes. Use this decision sketch:
- **Will state change mechanically per commit/run/deploy?** → `deployment_state`, `failure_table`, `metric_baseline`, `audit_issues`, or `task_list`.
- **Is the content options/decisions/research that requires human judgment?** → `reflective`.
- **Is the structure a numbered table?** → `failure_table` (F-N), `audit_issues` (numbered), or `task_list` (T-N).
- **Is it metrics over time with sessions?** → `metric_baseline`.
- **Is it a feature flag or env state?** → `deployment_state`.
- **Is it an outcome with a definable 'done' line that aggregates 2+ child trackers?** → `goal`.
- **None fit cleanly?** Combine — e.g. start with `task_list`'s schema and add `metric_baseline`'s sessions array. Archetypes are starting points, not rules.
If you're unsure, ask the user which pattern fits before composing.
## Step 2 — Write the augmentation prompt
The `prompt` field is a standing instruction the augmentation refresh follows. Rules:
- **Imperative voice.** "Maintain the F-N table" not "this tracks failures".
- **Name the gather sources.** Be explicit which gather sources feed which fields. The synthesizer needs to know.
- **Conflict resolution.** When sources disagree, say which wins. Common: "newer commit beats older params", "params win if `last_seen` is within 24h".
- **Body vs params boundary.** State the rule: "narrative belongs in body, mechanical state in params".
- **Length budget hint.** "Body section under 200 lines, params under 50 entries."
The archetype's `prompt_template` is a starting point — customize for the user's domain.
## Step 3 — Design the params
- **Live state only.** No multi-paragraph strings, no rationale prose.
- **Stable keys.** Renaming a key breaks the template. Pick well, don't churn.
- **Flat-as-possible.** Templates iterate cleanly over flat arrays/dicts. Deep nesting hurts.
- **Use the archetype's `params_shape_example` as a literal starting point**, then trim/extend.
- **Don't put computed-from-other-fields data in params.** Compute in template.
## Step 4 — Decide the schema discipline
- **Early life:** loose schema with `additionalProperties: true`. Let the shape settle over 2-3 refreshes before locking down.
- **Mature:** add `required`, `enum`, `pattern` constraints. Schema lock prevents drift across refreshes.
- **Skip schema entirely** for `reflective` trackers — they don't have meaningful structured params.
- **Validation triggers** on `artifact_augment` (initial seed) and every `artifact_augment(merge=true)` merge. Violations leave params untouched and return a recoverable error.
## Step 5 — Compose the render_template
- MiniJinja syntax. Common patterns:
- `{% for x in items %}...{% endfor %}` for lists
- `{% for k, v in dict|items %}...{% endfor %}` for dicts
- `{{ items|length }}` for counts
- `{{ items|selectattr("status","equalto","fail")|list|length }}` for filtered counts
- `{{ value or "—" }}` for null fallback
- **Render output** is injected between the `[LIVE]` header and the body excerpt in `librarian_context`. Keep it scannable — tables and short status lines.
- **No template** is fine for `reflective` trackers — omit the field.
## Step 5b — Make entries filterable (optional)
If a tracker's per-entry rows should be queryable (e.g. "show only the open hardware items"), set `entry_collection` in the augmentation to the params key holding the array of entry objects (e.g. `"failures"`). This enables `artifact(get, id=..., entry_filter={field:{op:value}})`, which returns the matching rows using the same filter syntax as `artifact(find)`. Only archetypes that keep entries in params (e.g. `failure_table`, `task_list`) support this; `reflective` trackers keep entries in prose — retrofit them first (see `docs/conventions/retrofitting-trackers-for-filtering.md`).
## Step 6 — Sketch the body skeleton
- Each archetype has a `body_skeleton`. Use it.
- Body sections are written by humans (or AI in `artifact_refresh` synthesis), edited rarely.
- Always include a **History** section for dated session blocks (`### YYYY-MM-DD — <event>`). This is the universal cross-project pattern.
## Step 7 — Check for collisions
The `existing_trackers` field in this response lists current trackers. Before creating:
- **Same concern already tracked?** Edit existing, don't fork.
- **Related tracker exists?** Use `artifact_link` to wire them after creation.
- **Naming collision?** Use a more specific title.
## Anti-patterns
- ❌ **Narrative in params.** Multi-sentence strings = use body.
- ❌ **Live state in body.** Flag values, F-N statuses, metric numbers = use params.
- ❌ **Premature schema lock-in.** First 2-3 refreshes will reveal shape changes.
- ❌ **Manual tracker file AND `kind: tracker` artifact for the same concern.** Pick one. Manual `docs/trackers/<name>.md` is for content where humans drive; `kind: tracker` augmented artifact is for content where gather+refresh drives.
- ❌ **Over-gathering.** Each gather source costs tokens at refresh time. Only pull what the prompt actually needs.
- ❌ **Empty render_template.** If you set it, make it useful. Don't ship a one-liner template that adds no value over the prompt blockquote.
## Final step
Call `artifact_create` with `kind=tracker`, `status=active`, and `augment={prompt,params}`:
- `path`: `docs/trackers/<slug>.md` (or project equivalent)
- `title`: human-readable
- `topic`: terse keyword for search
- `prompt`: the augmentation prompt you wrote in Step 2
- `params`: the initial params shape from Step 3 (matching schema if set)
- `params_schema`: optional, per Step 4
- `render_template`: optional, per Step 5
- `body`: from Step 6's skeleton, filled with initial content
The artifact + augmentation are created atomically.
"#;
/// Entry point for the `tracker_design` tool.
///
/// Returns a JSON envelope containing the design version, the system prompt,
/// the archetype library, up to `EXISTING_TRACKERS_CAP` existing trackers,
/// the total tracker count, the echoed `intent`, and a `next_step` hint.
/// When more trackers exist than the cap allows, an
/// `existing_trackers_overflow_hint` string is added as well.
///
/// # Errors
/// Propagates any error returned by `augmentation::list_all_ids`,
/// `artifact::get`, or `augmentation::get`.
pub async fn call(ctx: &ToolContext, args: Value) -> Result<Value> {
    // Arguments that fail to deserialize are replaced by `Args::default()`.
    let parsed: Args = serde_json::from_value(args).unwrap_or_default();
    let catalog = ctx.catalog.lock();
    let augmented_ids = augmentation::list_all_ids(&catalog)?;

    let mut listed: Vec<Value> = Vec::new();
    let mut tracker_count = 0usize;
    for artifact_id in augmented_ids.iter() {
        // Ids with no artifact row, or whose artifact is not a tracker, are skipped.
        let row = match artifact::get(&catalog, artifact_id)? {
            Some(row) if row.kind == "tracker" => row,
            _ => continue,
        };
        tracker_count += 1;

        // Past the cap we keep counting but stop collecting (and stop
        // fetching augmentation rows).
        if listed.len() < EXISTING_TRACKERS_CAP {
            let augmentation_row = augmentation::get(&catalog, artifact_id)?;
            let last_refreshed_at = augmentation_row
                .as_ref()
                .and_then(|aug| aug.last_refreshed_at.clone());
            let refresh_count = augmentation_row
                .as_ref()
                .map(|aug| aug.refresh_count)
                .unwrap_or(0);
            listed.push(json!({
                "id": artifact_id,
                "title": row.title,
                "kind": row.kind,
                "abs_path": row.abs_path.display().to_string(),
                "last_refreshed_at": last_refreshed_at,
                "refresh_count": refresh_count,
            }));
        }
    }

    let mut envelope = json!({
        "design_version": DESIGN_VERSION,
        "system_prompt": SYSTEM_PROMPT,
        "archetypes": archetypes(),
        "existing_trackers": listed,
        "existing_trackers_total": tracker_count,
        "intent": parsed.intent,
        "next_step": "Pick archetype. Compose spec (prompt, params, render_template, params_schema, body). Call artifact_create with kind=tracker, status=active, and augment={prompt,params}.",
    });

    // Only mention the overflow when the inline list was actually truncated.
    if tracker_count > EXISTING_TRACKERS_CAP {
        let hint = format!(
            "Showing {EXISTING_TRACKERS_CAP} of {tracker_count}. For full list use artifact_find {{\"kind\":\"tracker\"}}."
        );
        envelope["existing_trackers_overflow_hint"] = json!(hint);
    }
    Ok(envelope)
}
#[cfg(test)]
mod tests {
use super::*;
use crate::librarian::catalog::{augmentation, Catalog};
use crate::librarian::current_project::CurrentProject;
use crate::librarian::workspace::WorkspaceConfig;
use jsonschema::validator_for;
use std::sync::Arc;
/// Builds a `ToolContext` for tests: an in-memory catalog, an empty
/// workspace config, no rules, no embedding, no artifact store, and a
/// fixed current project located at `/test/x/y` with git root `/test/x`.
fn mk_ctx() -> ToolContext {
    let catalog = Catalog::open_in_memory().unwrap();
    let workspace = WorkspaceConfig {
        roots: Vec::new(),
        ignore: Vec::new(),
        rules: Vec::new(),
        umbrellas: Vec::new(),
    };
    let project = CurrentProject {
        abs_path: std::path::PathBuf::from("/test/x/y"),
        git_root: std::path::PathBuf::from("/test/x"),
        umbrella: None,
    };
    ToolContext {
        catalog: Arc::new(parking_lot::Mutex::new(catalog)),
        workspace: Arc::new(workspace),
        rules: Arc::new(Vec::new()),
        embedding: None,
        artifact_store: None,
        current_project: Some(Arc::new(project)),
    }
}
/// Against an empty catalog, the envelope still carries the design version,
/// a substantial system prompt, all seven archetypes, and a `next_step`
/// that points at `artifact_create`.
#[tokio::test]
async fn returns_design_envelope() {
    let ctx = mk_ctx();
    let envelope = call(&ctx, json!({})).await.unwrap();

    assert_eq!(envelope["design_version"], "1");

    let prompt = envelope["system_prompt"].as_str().unwrap();
    assert!(prompt.len() > 1000);

    let archetype_list = envelope["archetypes"].as_array().unwrap();
    assert_eq!(archetype_list.len(), 7);

    let next_step = envelope["next_step"].as_str().unwrap();
    assert!(next_step.contains("artifact_create"));
}
/// Seeds one `tracker` artifact and one `decision` artifact (both with an
/// augmentation row) and checks that only the tracker is listed.
#[tokio::test]
async fn lists_existing_trackers_only() {
    let ctx = mk_ctx();
    {
        // Scope the guard so the catalog is unlocked before `call` runs.
        let catalog = ctx.catalog.lock();
        let stamp = chrono::Utc::now().timestamp_millis();
        let fixtures = [("t1", "tracker"), ("d1", "decision")];
        for (artifact_id, artifact_kind) in fixtures {
            let artifact_row = artifact::ArtifactRow {
                id: artifact_id.to_string(),
                abs_path: std::path::PathBuf::from(format!("/test/r/{artifact_id}.md")),
                kind: artifact_kind.into(),
                status: "active".into(),
                title: Some(format!("Title {artifact_id}")),
                owners: vec![],
                tags: vec![],
                topic: None,
                time_scope: None,
                source: None,
                created_at: stamp,
                updated_at: stamp,
                file_mtime: stamp,
                file_sha256: "x".into(),
                confidence: 1.0,
            };
            artifact::upsert(&catalog, &artifact_row).unwrap();

            let augmentation_row = augmentation::AugmentationRow {
                artifact_id: artifact_id.into(),
                prompt: "p".into(),
                params: "{}".into(),
                last_refreshed_at: None,
                refresh_count: 0,
                created_at: "2026-01-01T00:00:00.000Z".into(),
                updated_at: "2026-01-01T00:00:00.000Z".into(),
                render_template: None,
                params_schema: None,
                append_mode: false,
                history_cap: None,
                entry_collection: None,
            };
            augmentation::upsert(&catalog, &augmentation_row).unwrap();
        }
    }

    let envelope = call(&ctx, json!({})).await.unwrap();
    let listed = envelope["existing_trackers"].as_array().unwrap();
    assert_eq!(listed.len(), 1);

    let only = &listed[0];
    assert_eq!(only["id"], "t1");
    assert_eq!(only["kind"], "tracker");
}
/// Seeds five more trackers than `EXISTING_TRACKERS_CAP` and checks that the
/// inline list is truncated to the cap, the total reports every tracker,
/// and the overflow hint points at `artifact_find`.
#[tokio::test]
async fn overflow_hint_when_above_cap() {
    let seeded = EXISTING_TRACKERS_CAP + 5;
    let ctx = mk_ctx();
    {
        // Scope the guard so the catalog is unlocked before `call` runs.
        let catalog = ctx.catalog.lock();
        let stamp = chrono::Utc::now().timestamp_millis();
        for artifact_id in (0..seeded).map(|n| format!("t{n}")) {
            let artifact_row = artifact::ArtifactRow {
                id: artifact_id.clone(),
                abs_path: std::path::PathBuf::from(format!("/test/r/{artifact_id}.md")),
                kind: "tracker".into(),
                status: "active".into(),
                title: None,
                owners: vec![],
                tags: vec![],
                topic: None,
                time_scope: None,
                source: None,
                created_at: stamp,
                updated_at: stamp,
                file_mtime: stamp,
                file_sha256: "x".into(),
                confidence: 1.0,
            };
            artifact::upsert(&catalog, &artifact_row).unwrap();

            let augmentation_row = augmentation::AugmentationRow {
                artifact_id,
                prompt: "p".into(),
                params: "{}".into(),
                last_refreshed_at: None,
                refresh_count: 0,
                created_at: "2026-01-01T00:00:00.000Z".into(),
                updated_at: "2026-01-01T00:00:00.000Z".into(),
                render_template: None,
                params_schema: None,
                append_mode: false,
                history_cap: None,
                entry_collection: None,
            };
            augmentation::upsert(&catalog, &augmentation_row).unwrap();
        }
    }

    let envelope = call(&ctx, json!({})).await.unwrap();

    let listed = envelope["existing_trackers"].as_array().unwrap();
    assert_eq!(listed.len(), EXISTING_TRACKERS_CAP);

    let reported_total = envelope["existing_trackers_total"].as_u64().unwrap() as usize;
    assert_eq!(reported_total, seeded);

    let hint = envelope["existing_trackers_overflow_hint"]
        .as_str()
        .unwrap();
    assert!(hint.contains("artifact_find"));
}
/// Every archetype's `params_shape_example` must validate against that same
/// archetype's `params_schema_example`. This catches drift between the two
/// as archetypes evolve.
///
/// Plain synchronous test: nothing here awaits, so no async runtime is
/// started for it.
#[test]
fn each_archetype_self_consistent() {
    let library = archetypes();
    for arch in library.as_array().unwrap() {
        let name = arch["name"].as_str().unwrap();
        let schema = &arch["params_schema_example"];
        let example = &arch["params_shape_example"];

        // A schema that does not even compile is reported separately from
        // an example that fails validation.
        let validator = validator_for(schema)
            .unwrap_or_else(|e| panic!("archetype '{name}' has invalid schema: {e}"));

        let errors: Vec<String> = validator
            .iter_errors(example)
            .map(|e| e.to_string())
            .collect();
        assert!(
            errors.is_empty(),
            "archetype '{name}' params_shape_example does not validate against params_schema_example: {errors:?}"
        );
    }
}
/// Every archetype that ships a `render_template_example` must render its
/// own `params_shape_example` without error — catches template/schema drift.
/// Archetypes without a template are skipped.
///
/// Plain synchronous test: nothing here awaits, so no async runtime is
/// started for it.
#[test]
fn each_archetype_template_renders_against_example_params() {
    let library = archetypes();
    for arch in library.as_array().unwrap() {
        let name = arch["name"].as_str().unwrap();
        let Some(tmpl) = arch["render_template_example"].as_str() else {
            continue;
        };
        let params = &arch["params_shape_example"];
        // Only success/failure matters here; the rendered output is discarded.
        if let Err(e) = crate::librarian::tools::render::render_params(tmpl, params) {
            panic!("archetype '{name}' template fails on its example: {e}");
        }
    }
}
/// The `failure_table` archetype must expose an `entry_collection` field
/// whose value is `"failures"`.
#[test]
fn failure_table_archetype_has_entry_collection_field() {
    let archetype = archetype_failure_table();
    let advertised = archetype["entry_collection"].as_str();
    assert_eq!(
        advertised,
        Some("failures"),
        "failure_table archetype must advertise entry_collection = \"failures\""
    );
}
/// `archetypes()` must return exactly seven entries and one of them must be
/// named `goal`.
///
/// Plain synchronous test: nothing here awaits, so no async runtime is
/// started for it.
#[test]
fn goal_archetype_present_and_registered() {
    let library = archetypes();
    let entries = library.as_array().unwrap();
    assert_eq!(entries.len(), 7, "expected 7 archetypes including goal");

    let names: Vec<&str> = entries
        .iter()
        .map(|entry| entry["name"].as_str().unwrap())
        .collect();
    assert!(
        names.contains(&"goal"),
        "goal archetype missing from archetypes() — got {names:?}"
    );
}
/// Drift tripwire between the goal archetype's prompt and the
/// goal-aggregation kernel.
///
/// For each probed archetype the kernel is asked for a verdict on canonical
/// params. If the kernel answers `Unknown`, rule 1 of the prompt must name
/// that archetype (the LLM-fallback path); otherwise rule 1 must reference
/// `deterministic_child_statuses` as the source of truth. Adding an
/// archetype to the kernel without collapsing its prompt clause — or
/// removing one while leaving the "copy verbatim" framing — should trip
/// this test.
///
/// `metric_baseline` is probed separately through `child_status_in_context`
/// with a citing `MetricThreshold` signal (D8): it must resolve to a
/// non-`Unknown` status and must not appear as a dedicated LLM-fallback
/// bullet in rule 1.
///
/// Closes Phase 1 of the I1 refactor: rule 1 lives in Rust, prompt is the
/// consumer. Extended in T-8 for D8 coverage.
#[test]
fn prompt_rule_1_matches_rust_kernel_coverage() {
    use crate::librarian::tools::goal_aggregation::{
        child_status_in_context, child_status_pure, AcceptanceSignal, AcceptanceSignalSpec,
        ChildStatus, ThresholdOp,
    };
    use serde_json::json;

    let goal = super::archetype_goal();
    let prompt = goal
        .get("prompt_template")
        .and_then(|p| p.as_str())
        .unwrap();
    // Everything before rule 2 — covers rules 1a/1b/1c and the gather framing.
    let rule_1_section = prompt
        .split("2. For each `acceptance_signals[i]")
        .next()
        .unwrap();

    // Each archetype × canonical params known to elicit a definite verdict.
    let probes = [
        (
            "failure_table",
            json!({"failures":[{"id":"F-1","status":"pass"}]}),
        ),
        ("task_list", json!({"tasks":[{"id":"T-1","status":"done"}]})),
        ("audit_issues", json!({"issues":[{"n":1,"status":"fixed"}]})),
        ("reflective", json!({"status":"decided"})),
        ("goal", json!({"status":"done"})),
        (
            "deployment_state",
            json!({"envs":{"prod":{"enabled":true}}}),
        ),
    ];

    for (arch, params) in probes.iter() {
        let verdict = child_status_pure(arch, params);
        if verdict != ChildStatus::Unknown {
            // Archetype IS Rust-handled. Rule 1a's copy-verbatim clause
            // governs it, so the prompt must name the gather context key
            // the LLM reads ground truth from.
            assert!(
                rule_1_section.contains("deterministic_child_statuses"),
                "{arch} resolved in kernel — prompt rule 1 must reference \
                 deterministic_child_statuses as the source of truth"
            );
        } else {
            // Archetype is NOT Rust-handled in this phase. The prompt MUST
            // mention it explicitly under rule 1b's LLM-fallback list.
            assert!(
                rule_1_section.contains(arch),
                "{arch} returns Unknown in kernel — prompt rule 1b must mention it \
                 for the LLM fallback path"
            );
        }
    }

    // D8 — metric_baseline must be Rust-handled when a metric_threshold
    // signal cites it. Probe with a canonical citing signal, then assert the
    // verdict is non-Unknown AND the prompt does not single it out as an
    // LLM-only fallback case.
    let metric_params = json!({"current":{"P@5":0.21}});
    let citing = vec![AcceptanceSignal {
        description: "metric_baseline contract".into(),
        met: false,
        evidence: String::new(),
        spec: AcceptanceSignalSpec::MetricThreshold {
            evidence_child_id: "C-M".into(),
            metric_key: "P@5".into(),
            op: ThresholdOp::Gte,
            threshold: 0.20,
        },
    }];
    let child_lookup = vec![(
        "C-M".to_string(),
        "metric_baseline".to_string(),
        metric_params.clone(),
    )];
    let mb_status = child_status_in_context(
        "metric_baseline",
        "C-M",
        &metric_params,
        &citing,
        &child_lookup,
    );
    assert_ne!(
        mb_status,
        ChildStatus::Unknown,
        "metric_baseline with citing signal must resolve deterministically (D8)"
    );
    assert!(
        !rule_1_section.contains("metric_baseline child →"),
        "metric_baseline is now Rust-handled when cited — prompt must not \
         carry the legacy per-archetype LLM-evaluation bullet for it"
    );
}
/// H-8 follow-through — rule 3 of the goal-tracker prompt must spell out how
/// the LLM populates `progress_log[].evidence_commits` and
/// `evidence_artifacts`. The Rust kernel supplies `context.git_log`,
/// `refresh_meta.last_refresh_at` and `refresh_meta.children_status_delta`;
/// the prompt has to anchor the LLM to those names. Without that the LLM
/// leaves `evidence_commits` empty (the failure mode H-8 surfaced in the
/// 2026-05-17 audit).
#[test]
fn goal_prompt_rule_3_anchors_evidence_fields_to_gather_context() {
    let goal = super::archetype_goal();
    let prompt = goal
        .get("prompt_template")
        .and_then(|p| p.as_str())
        .unwrap();

    // Carve out rule 3 — the text after "3. `refresh_meta`" up to the rule 4
    // header.
    let after_rule_3_header = prompt.split("3. `refresh_meta`").nth(1);
    let rule_3 = after_rule_3_header
        .and_then(|tail| tail.split("4. AUTO-CLOSE GATE").next())
        .expect("rule 3 must be present and precede rule 4");

    // Evidence-commits anchoring — both the source (`context.git_log`) and
    // the date cutoff (`refresh_meta.last_refresh_at`) must be named so the
    // LLM knows where to look and how far back to scan.
    let names_commit_source =
        rule_3.contains("evidence_commits") && rule_3.contains("context.git_log");
    assert!(
        names_commit_source,
        "rule 3 must instruct the LLM to populate evidence_commits from context.git_log"
    );
    let names_cutoff = rule_3.contains("refresh_meta.last_refresh_at");
    assert!(
        names_cutoff,
        "rule 3 must anchor evidence_commits selection to refresh_meta.last_refresh_at"
    );

    // Evidence-artifacts anchoring — the source (`children_status_delta`)
    // must be named so the LLM can map child_id → artifact_id.
    let names_artifact_source =
        rule_3.contains("evidence_artifacts") && rule_3.contains("children_status_delta");
    assert!(
        names_artifact_source,
        "rule 3 must instruct the LLM to populate evidence_artifacts from children_status_delta"
    );

    // Regression guard — the old misleading claim about field ownership must
    // not return. `date`, `note`, `evidence_commits`, and
    // `evidence_artifacts` are ALL LLM-owned; only `refresh_meta` is
    // Rust-owned.
    let repeats_old_claim = rule_3.contains("`note` is the only LLM-owned field");
    assert!(
        !repeats_old_claim,
        "rule 3 must not claim note is the only LLM-owned field — date/evidence_* are also LLM-owned"
    );
}
}