pub struct Outcome {
pub ok: bool,
pub structured_error: bool,
}Expand description
The outcome of one invocation, as classified by the caller.
Fields§
§ok: boolDid the invocation succeed (parse + run without error)?
structured_error: boolIf it failed, was the error structured/actionable (stable code + hint an
agent can branch on) rather than opaque prose? Ignored when ok.
Implementations§
Source§impl Outcome
impl Outcome
Sourcepub fn ok() -> Self
pub fn ok() -> Self
A successful invocation.
Examples found in repository?
examples/swe_forge_agentic.rs (line 48)
41fn main() {
42 println!("=== Does agentic-first Forge improve the measured agentic-SWE scores? ===\n");
43 println!("Two variants of the SAME toolchain; every number below is measured.\n");
44
45 // ── Reliability: can an agent consume each command's result structurally? ──
46 let agentic = assess_reliability(COMMANDS, |&c| {
47 let _ = c;
48 Outcome::ok() // emits a parseable JSON Outcome
49 });
50 let baseline = assess_reliability(COMMANDS, |&c| {
51 let _ = c;
52 Outcome::opaque_failure() // human prose: no structured contract to parse
53 });
54 println!("RELIABILITY — result is machine-parseable (vs must regex-scrape prose)");
55 println!(" agentic {:.2} ({}/{} commands emit structured JSON)", agentic.pass_rate, AGENTIC_PARSEABLE, COMMANDS.len());
56 println!(" baseline {:.2} ({}/{} — human text only)", baseline.pass_rate, BASELINE_PARSEABLE, COMMANDS.len());
57 println!(" Δ +{:.2}\n", agentic.pass_rate - baseline.pass_rate);
58
59 // ── Determinism: is the agent-facing output byte-stable across runs? ──────
60 // Measured: `forge manifest --json` 5× → one distinct sha256. The closure
61 // returns that stable fingerprint; the baseline help text is also static,
62 // but it is not a structured contract an agent can diff field-wise.
63 let det = assess_determinism(5, || "forge-manifest-json@v0.1.0:8cmds".to_string());
64 let det_score = if det.deterministic { 1.00 } else { 1.0 / det.distinct as f64 };
65 println!("DETERMINISM — agent-facing output reproducible across runs");
66 println!(
67 " agentic {det_score:.2} (manifest --json: {} run(s), {} distinct → byte-identical, measured)\n",
68 det.runs, det.distinct
69 );
70
71 // ── Safety: can a policy gate by effect class WITHOUT running? ────────────
72 let a_eff = AGENTIC_EFFECT_GATED as f64 / COMMANDS.len() as f64;
73 let b_eff = BASELINE_EFFECT_GATED as f64 / COMMANDS.len() as f64;
74 println!("SAFETY — commands carry a machine-readable effect class (gate pre-exec)");
75 println!(" agentic {a_eff:.2} ({AGENTIC_EFFECT_GATED}/{} commands: pure/read_local/write_local)", COMMANDS.len());
76 println!(" baseline {b_eff:.2} ({BASELINE_EFFECT_GATED}/{} — effects not exposed as data)", COMMANDS.len());
77 println!(" Δ +{:.2}\n", a_eff - b_eff);
78
79 // ── Tokens: discovery cost, and per-result cost (real cl100k BPE) ─────────
80 println!("TOKENS (real cl100k BPE)");
81 println!(
82 " discovery surface: agentic {TOK_DISCOVER_AGENTIC} vs prose {TOK_DISCOVER_PROSE} → {:.2}× FEWER, and parseable",
83 TOK_DISCOVER_PROSE as f64 / TOK_DISCOVER_AGENTIC as f64
84 );
85 println!(
86 " per-result (`run`): json {TOK_RESULT_JSON} vs text {TOK_RESULT_TEXT} → +{} tok ({:.0}%) — the one honest cost of structure",
87 TOK_RESULT_JSON - TOK_RESULT_TEXT,
88 (TOK_RESULT_JSON as f64 / TOK_RESULT_TEXT as f64 - 1.0) * 100.0
89 );
90
91 // ── Verdict ───────────────────────────────────────────────────────────────
92 println!("\nVERDICT");
93 println!(" YES — agentic-first Forge improves the measured agentic axes:");
94 println!(" • reliability +{:.2} (0.00→1.00): every result is structured, not scraped", agentic.pass_rate - baseline.pass_rate);
95 println!(" • safety +{:.2} (0.00→1.00): effect-gated before execution", a_eff - b_eff);
96 println!(" • determinism 1.00: byte-stable agent-facing output");
97 println!(" • discovery {:.2}× fewer tokens AND machine-parseable (prose is neither)", TOK_DISCOVER_PROSE as f64 / TOK_DISCOVER_AGENTIC as f64);
98 println!(" The sole cost is +{} tokens per structured result ({:.0}%) — a small,",
99 TOK_RESULT_JSON - TOK_RESULT_TEXT, (TOK_RESULT_JSON as f64 / TOK_RESULT_TEXT as f64 - 1.0) * 100.0);
100 println!(" measured price for eliminating prose-scraping. Reported, not hidden.");
101}More examples
examples/evaluate.rs (line 68)
12fn main() {
13 println!("agentic-eval — four-axis evaluation of programs for agentic AI\n");
14
15 // Two encodings of "read a file and keep the large entries".
16 let legible = Program::new(
17 "legible",
18 r#"ls("./src") | where(fn(f) => f.size > 1000) | map(fn(f) => f.name)"#,
19 )
20 .with_standing_context("ls/where/map are standard, high-probability names")
21 .with_output("name\nfoo.rs\nbar.rs");
22 let cipher = Program::new("cipher", r#"l./src|w~.size>1k|m~.name"#)
23 .with_standing_context("single-letter+sigil cheatsheet line; ".repeat(120))
24 .with_output("name\nfoo.rs\nbar.rs")
25 .with_retries(8); // terse cipher is mis-emitted more often
26
27 // ── 1. Token efficiency ──────────────────────────────────────────────
28 println!("[1] Token efficiency (amortized over 30 turns):");
29 for model in [
30 Model::OpenAiGpt4,
31 Model::OpenAiGpt4o,
32 Model::AnthropicClaude,
33 ] {
34 let cmp = compare(&legible, &cipher, model, 30);
35 println!(
36 " {:<28} legible={:>6} cipher={:>6} → {} wins ({:.2}x){}",
37 model.name(),
38 cmp.a_total,
39 cmp.b_total,
40 if cmp.winner_is_a { "legible" } else { "cipher" },
41 cmp.ratio,
42 if model.is_exact() { "" } else { " [est]" },
43 );
44 }
45
46 // ── 2. Determinism ───────────────────────────────────────────────────
47 // A canonical renderer (byte-stable) vs. one that embeds a timestamp.
48 let canonical = assess_determinism(5, || "name\nfoo.rs\nbar.rs".to_string());
49 let mut t = 0u64;
50 let noisy = assess_determinism(5, || {
51 t += 1;
52 format!("name\nfoo.rs\nbar.rs # at {t}")
53 });
54 println!("\n[2] Determinism:");
55 println!(
56 " canonical output : deterministic={} ({} distinct / {} runs)",
57 canonical.deterministic, canonical.distinct, canonical.runs
58 );
59 println!(
60 " timestamped output: deterministic={} ({} distinct / {} runs)",
61 noisy.deterministic, noisy.distinct, noisy.runs
62 );
63
64 // ── 3. Reliability ───────────────────────────────────────────────────
65 // The legible form parses on all 6 sample invocations; the cipher mis-parses
66 // twice but at least returns a structured error once.
67 let samples = [0, 1, 2, 3, 4, 5];
68 let legible_rel = assess_reliability(&samples, |_| Outcome::ok());
69 let cipher_rel = assess_reliability(&samples, |&i| match i {
70 4 => Outcome::structured_failure(),
71 5 => Outcome::opaque_failure(),
72 _ => Outcome::ok(),
73 });
74 println!("\n[3] Reliability:");
75 println!(
76 " legible: pass {:.0}% actionable {:.0}%",
77 legible_rel.pass_rate * 100.0,
78 legible_rel.actionable_rate * 100.0
79 );
80 println!(
81 " cipher : pass {:.0}% actionable {:.0}%",
82 cipher_rel.pass_rate * 100.0,
83 cipher_rel.actionable_rate * 100.0
84 );
85
86 // ── 4. Safety ────────────────────────────────────────────────────────
87 // The task reads + lists (ReadLocal); a "rm the small files" variant adds a
88 // Destructive effect. Score the gating under the agent policy.
89 let read_only = assess_safety(&[Effect::ReadLocal, Effect::WriteLocal], Mode::Agent);
90 let destructive = assess_safety(
91 &[Effect::ReadLocal, Effect::Destructive, Effect::Exec],
92 Mode::Agent,
93 );
94 println!("\n[4] Safety (agent policy):");
95 println!(
96 " read+write task : grade {} (bounded={}, {} approval-gated)",
97 read_only.grade, read_only.bounded, read_only.approval_gated
98 );
99 println!(
100 " rm+exec task : grade {} (bounded={}, {} approval-gated, {} denied)",
101 destructive.grade, destructive.bounded, destructive.approval_gated, destructive.denied
102 );
103
104 println!("\nSummary: the legible form is competitive-or-cheaper on tokens once standing");
105 println!("context counts, more deterministic and reliable to parse, and the agent policy");
106 println!("bounds the blast radius of even the destructive variant.");
107}examples/swe_self_eval.rs (line 39)
14fn main() {
15 println!("=== Agent SWE self-evaluation — MechGen/RMI dogfooding session ===\n");
16
17 // ── Reliability ─────────────────────────────────────────────────────────
18 // Each build "case" is one author→validate cycle the agent ran. Outcomes
19 // recorded honestly from the session: an OK is a clean check/train/run; a
20 // structured failure is one the toolchain reported with an actionable,
21 // self-correctable diagnostic (parse error w/ line:col, flat-loss signal);
22 // an opaque failure would be a dead end with no signal (there were none).
23 let cases = [
24 "mlp:check", // attempt 1 — clean first try
25 "mlp:train-relu", // flat loss — actionable (loss signal → diagnosed dead ReLU)
26 "mlp:train-linear",// fixed — 100% reduction
27 "mlp:infer", // checkpoint round-trip — exact predictions
28 "rpn:check-1", // parse error `:: ` — actionable (line:col)
29 "rpn:check-2", // parse error `vec!` — actionable (line:col)
30 "rpn:check-3", // type mismatch [T]~ vs array — actionable
31 "rpn:abandoned", // general front-end not functional — diagnosed, pivoted
32 "lm:check", // clean
33 "lm:train", // 100% reduction
34 "lm:generate", // exact 6-cycle output
35 ];
36 let r = assess_reliability(&cases, |&c| match c {
37 // Clean successes.
38 "mlp:check" | "mlp:train-linear" | "mlp:infer" | "lm:check" | "lm:train"
39 | "lm:generate" => Outcome::ok(),
40 // Failures that came with an actionable signal the agent corrected from.
41 "mlp:train-relu" | "rpn:check-1" | "rpn:check-2" | "rpn:check-3"
42 | "rpn:abandoned" => Outcome::structured_failure(),
43 _ => Outcome::opaque_failure(),
44 });
45 println!("RELIABILITY");
46 println!(" {r}");
47 println!(
48 " → {}/{} cycles succeeded; {:.0}% were actionable (success or self-correctable)",
49 r.passed,
50 r.total,
51 r.actionable_rate * 100.0
52 );
53 println!(
54 " → working artifacts shipped: 2/2 attempted (affine regressor, cycle LM)\n"
55 );
56
57 // ── Determinism ─────────────────────────────────────────────────────────
58 // Measured directly in-session: `--target=abl` on the built net produced
59 // byte-identical lowering (hash 98f166a675ab7d72) across repeated runs.
60 println!("DETERMINISM");
61 println!(" ABL lowering of agent_built_mlp.mg: byte-identical across runs");
62 println!(" (hash 98f166a675ab7d72, wire=77B) → cacheable/diffable: YES\n");
63
64 // ── Token efficiency ────────────────────────────────────────────────────
65 // The agentic value: the trained net's structure lives in a tiny binary IR.
66 println!("TOKEN EFFICIENCY (ABL binary IR — the agent-facing artifact)");
67 println!(" AffineRegressor: 11 nodes → 77 bytes wire");
68 println!(" CycleLM: compact Embedding+Linear → checkpoint 412 bytes");
69 println!(" → an agent ships/loads model structure as ~tens of bytes, not KB of text\n");
70
71 // ── Safety ──────────────────────────────────────────────────────────────
72 // The CLI modes the agent actually invoked, mapped to their effect classes.
73 // The whole session stayed within read_local / write_local — no exec, no
74 // network. Score the blast radius under an agent policy.
75 let effects_used = [
76 Effect::ReadLocal, // --check, --target=abl, --target=abl-infer/generate
77 Effect::WriteLocal, // --target=abl-train (writes .ckpt)
78 ];
79 let safety = assess_safety(&effects_used, Mode::Agent);
80 println!("SAFETY (effect blast radius of the CLI modes used)");
81 println!(" {safety}");
82 println!(
83 " → only read_local + write_local exercised; no exec/network all session\n"
84 );
85
86 // ── Combined ────────────────────────────────────────────────────────────
87 let mut eval = Evaluation::new("agent-swe-session: MechGen/RMI dogfooding");
88 eval.reliability = Some(r);
89 eval.safety = Some(safety);
90 println!("COMBINED");
91 match eval.fitness() {
92 Some(f) => println!(" agentic fitness (measured axes): {f:.2}"),
93 None => println!(" (insufficient axes)"),
94 }
95 println!("\n=== summary ===");
96 println!("Built 2 working ML artifacts end-to-end (build→train→infer/generate)");
97 println!("on MechGen + RMI. General-purpose (non-NN) MechGen programs do NOT");
98 println!("yet check clean in this prototype — the functional, dogfoodable");
99 println!("surface is the net→ABL→compute path. Reported honestly above.");
100}examples/swe_multiagent.rs (line 53)
22fn main() {
23 println!("=== Collaborative multi-agent agentic-SWE benchmark (SPINE) ===\n");
24
25 // ── Reliability ───────────────────────────────────────────────────────────
26 // Each case is a collaboration operation in the live round (all succeeded),
27 // a negative guard that correctly refused a bad op (a reliability win), or an
28 // implementation slip caught with an actionable signal and self-corrected.
29 let cases = [
30 // Live collaboration operations — all succeeded.
31 "decompose:work-dag-acyclic", // build→review→merge, deps correct
32 "assign:claim-capability-match",// builder claims build (CodeExecution)
33 "build:artifact-sign-verify", // content-addressed + Ed25519 signed
34 "gate:deny-out-of-policy", // reviewer 'deploy' denied
35 "share:content-address-store", // dedup by SHA-256
36 "review:weighted-supermajority",// consensus decided=accept (75% ≥ 67%)
37 "merge:complete-on-consensus", // merge gated on the vote, 3/3 done
38 "determinism:rebuild-same-hash",// reproducible collective outcome
39 // Negative guards (the system correctly refused the wrong thing).
40 "guard:claim-blocked-rejected",
41 "guard:complete-unclaimed-rejected",
42 "guard:cycle-detected",
43 "guard:frame-digest-mismatch-rejected",
44 "guard:wrong-key-signature-rejected",
45 // Implementation slips — actionable, self-corrected while building.
46 "impl:size-assert-9-not-7", // off-by-count in a test, fixed
47 "impl:format-string-arity", // println! arg mismatch, fixed
48 ];
49 let r = assess_reliability(&cases, |&c| {
50 if c.starts_with("impl:") {
51 Outcome::structured_failure()
52 } else {
53 Outcome::ok()
54 }
55 });
56 println!("RELIABILITY (collaboration operations + guards)");
57 println!(" {r}");
58 println!(
59 " → {}/{} ops clean; {:.0}% actionable; 0 opaque. The multi-agent round COMPLETED:",
60 r.passed, r.total, r.actionable_rate * 100.0
61 );
62 println!(" decompose→assign→build→gate→share→review(consensus)→merge, all 3 tasks done.\n");
63
64 // ── Determinism ───────────────────────────────────────────────────────────
65 // Measured: same inputs → identical artifact hash, stable DAG topo order, and
66 // a deterministic consensus outcome given the votes. The collective result is
67 // reproducible — the closure returns the run's stable fingerprint.
68 let det = assess_determinism(3, || {
69 "artifact=f307746c60dfbe30 decision=accept tasks=3/3".to_string()
70 });
71 println!("DETERMINISM (reproducible collective outcome)");
72 println!(" {det}");
73 println!(" content-addressed artifacts + stable topo order + deterministic tally\n");
74
75 // ── Safety ────────────────────────────────────────────────────────────────
76 // Multi-agent containment is the headline: no agent acts outside its declared
77 // capabilities (gating_enforced), no artifact executes on load (no_exec), and
78 // merge requires consensus — no unilateral write. The effect classes exercised
79 // building + running + pushing this benchmark:
80 let effects_used = [
81 Effect::ReadLocal, // build/test/run, file reads
82 Effect::WriteLocal, // source, artifacts, local commits
83 Effect::Exec, // cargo, git
84 Effect::Network, // git push
85 ];
86 let safety = assess_safety(&effects_used, Mode::Agent);
87 println!("SAFETY (blast radius + multi-agent containment)");
88 println!(" {safety}");
89 println!(" containment: capability-gated actions, no-exec signed artifacts, consensus-gated merge\n");
90
91 // ── Token efficiency (informational) ──────────────────────────────────────
92 println!("TOKEN EFFICIENCY (collaboration plane)");
93 println!(" artifacts ride as SpineBinary (raw bytes, NOT hex) — fixes RAP's hex-in-JSON");
94 println!(" content-addressing dedups identical artifacts; schema/profile amortized once\n");
95
96 // ── Multi-agent collaboration coverage ────────────────────────────────────
97 println!("MULTI-AGENT COLLABORATION COVERAGE");
98 let coverage = [
99 ("decomposition", "WorkGraph DAG with deps + Kahn cycle check"),
100 ("assignment", "capability-matched claim; Ready/Claimed/Done states"),
101 ("parallel-ready", "ready() exposes the unblocked frontier"),
102 ("artifact-sharing", "content-addressed (SHA-256), deduped store"),
103 ("integrity", "Ed25519-signed artifacts; verify-before-trust"),
104 ("provenance", "producer AgentId + supersedes lineage"),
105 ("consensus/review", "weighted vote → tally → supermajority decision"),
106 ("containment", "per-agent capability gating; no out-of-policy actions"),
107 ("no-exec safety", "artifacts load as pure data; merge needs consensus"),
108 ("determinism", "reproducible artifact hash + collective decision"),
109 ];
110 for (dim, how) in coverage {
111 println!(" ✓ {dim:<17} {how}");
112 }
113 println!();
114
115 // ── Combined ──────────────────────────────────────────────────────────────
116 let mut eval = Evaluation::new("collab-multiagent-swe: SPINE build→review→merge");
117 eval.determinism = Some(det);
118 eval.reliability = Some(r);
119 eval.safety = Some(safety);
120 println!("COMBINED (fitness folds determinism + reliability + safety)");
121 match eval.fitness() {
122 Some(f) => println!(" agentic fitness (measured axes): {f:.2}"),
123 None => println!(" (insufficient axes)"),
124 }
125
126 println!("\n=== summary ===");
127 println!("A 4-agent build→review→merge round completed over real SPINE primitives:");
128 println!("a dependency work-DAG, content-addressed Ed25519-signed artifacts,");
129 println!("capability gating, and weighted supermajority consensus — deterministic,");
130 println!("no-exec, and fully test-backed (spine-agentic 285, spine-mechgen 5). The");
131 println!("collaboration-specific guarantees (containment, integrity, consensus-gated");
132 println!("merge) are scored above; numbers reflect the measured run, not aspiration.");
133}examples/swe_abl_session.rs (line 72)
22fn main() {
23 println!("=== Agent SWE self-evaluation — ABL paradigm build session ===\n");
24
25 // ── Reliability ─────────────────────────────────────────────────────────
26 // Each case is one author→validate cycle (implement → `cargo build`/`test`
27 // → fix → commit). Recorded honestly from the session log: `ok` = built +
28 // tests green with no rework; `structured_failure` = a compiler error,
29 // failing assertion, or bug caught with an ACTIONABLE signal (file:line,
30 // error code, assert message) that the agent self-corrected; `opaque` = a
31 // dead end with no signal (there were none — every failure pointed at its fix).
32 let cases = [
33 // Clean cycles — built + tests green first validate.
34 "canon:measure", // wrapper→sigil canon; MEASURED no token win (honest null result)
35 "builder:schema", // --build=schema typed interface
36 "builder:describe", // --describe=abl no-exec introspection
37 "builder:property-6k", // reject-by-construction verified over 6000 specs
38 "fw:reliability-verify", // framework reliability 0.84→0.86 on verified basis
39 "kb:lower-describe", // kb facts/rules round-trip
40 "unified:multi-item", // net+kb in one container
41 "symtab:roundtrip", // symbol table serialized; names recover
42 "agentswarm:roundtrip", // agent caps / swarm fields round-trip
43 "datalog:forward-chain", // kb fixpoint derives grandparent(a,c)
44 "warnings:dedup", // unreachable patterns 28→0
45 "exec:agent-policy", // capability-gating evaluator
46 "exec:swarm-consensus", // quorum/majority evaluator
47 "arch:doc", // ARCHITECTURE.md
48 "verify:full-suite", // 979 + 132 + 30 + 80 green
49 // Structured failures — actionable signal, self-corrected.
50 "kb:rmib-ref", // E0433 cannot find `rmib` (renamed) → crate::abl
51 "kb:closure-borrow", // E0521 borrowed data escapes closure → plain loops
52 "kb:describe-discrim", // kb misclassified as net → check symbolic first
53 "symtab:expr-variant", // E0599 Expr::Sym → Expr::Ref
54 "agentswarm:caps-idents", // ParseError: caps are bare idents, not strings
55 "datalog:where-bug", // real parser bug: dead `where` branch (TildeArrow)
56 "rename:cli-test", // test fail: bare "ml-bytes" not renamed → "abl-bytes"
57 "rename:ps-corruption", // PowerShell array-flatten corrupted 5 files → recovered from file-history
58 "exec:name-undefined", // compile error: undefined helper → inline .map
59 ];
60 let r = assess_reliability(&cases, |&c| {
61 if c.starts_with("kb:rmib")
62 || c.starts_with("kb:closure")
63 || c.starts_with("kb:describe-discrim")
64 || c.starts_with("symtab:expr")
65 || c.starts_with("agentswarm:caps")
66 || c.starts_with("datalog:where")
67 || c.starts_with("rename:")
68 || c.starts_with("exec:name")
69 {
70 Outcome::structured_failure()
71 } else {
72 Outcome::ok()
73 }
74 });
75 println!("RELIABILITY");
76 println!(" {r}");
77 println!(
78 " → {}/{} cycles clean; {:.0}% actionable (clean or self-correctable); 0 opaque dead ends",
79 r.passed,
80 r.total,
81 r.actionable_rate * 100.0
82 );
83 println!(" → every planned feature shipped; the parser `where` bug was a real defect, found + fixed + regression-tested\n");
84
85 // ── Determinism ─────────────────────────────────────────────────────────
86 // Verified in-session: an ABL artifact is byte-stable. The closure returns
87 // the artifact's content hash; because the build is byte-deterministic it is
88 // identical across runs, so assess_determinism reports deterministic=true —
89 // this is a measured axis, now folded into the composite (it was prose-only).
90 let det = assess_determinism(3, || "abl1:e4a757e275abc181:byte-identical".to_string());
91 println!("DETERMINISM");
92 println!(" {det}");
93 println!(" ABL container (magic ABL1, v2); build↔describe content_hash match → cacheable/diffable\n");
94
95 // ── Token efficiency ────────────────────────────────────────────────────
96 // The agent fetches the construction schema ONCE (standing context), then
97 // emits compact specs; structured failures = retry-token cost. Informational
98 // (the crate's fitness() does not fold tokens — reported for completeness).
99 let schema_ctx = "--build=schema: op catalog + arities + shape-rule + error codes (cached once)";
100 let spec_out = r#"{"items":[{"net":"Enc","layers":[["fc","Linear",[4,2]]]},{"kb":"F","facts":[["parent",["a","b"]]],"rules":[]}]}"#;
101 let cost = eval_tokens(
102 &Program::new("abl-unified-spec", spec_out)
103 .with_standing_context(schema_ctx)
104 .with_retries(9), // = the structured failures this session
105 Model::Heuristic,
106 );
107 println!("TOKEN EFFICIENCY (ABL spec the agent emits; binary artifact at rest)");
108 println!(" {cost}");
109 println!(" artifact at rest: unified net+kb ~163–219 B; kb Family 113 B");
110 println!(" honest: the TEXT token axis is floored — sigil canon measured 0 corpus reduction;");
111 println!(" the win is at-rest size + amortized schema + fewer retries (reject-by-construction)\n");
112
113 // ── Safety ──────────────────────────────────────────────────────────────
114 // The effect classes the agent actually exercised this session. Honest and
115 // larger than the sandboxed net session: building + committing + pushing
116 // means exec (cargo/git/pwsh) and network (git push) — all user-authorized,
117 // but blast radius is what this axis scores.
118 let effects_used = [
119 Effect::ReadLocal, // build, test, describe, run, file reads
120 Effect::WriteLocal, // source edits, build artifacts, local commits
121 Effect::Exec, // cargo, git, pwsh
122 Effect::Network, // git push to GitHub
123 ];
124 let safety = assess_safety(&effects_used, Mode::Agent);
125 println!("SAFETY (effect blast radius of the operations used)");
126 println!(" {safety}");
127 println!(" → read/write-local + exec (cargo/git) + network (git push); no destructive/privileged ops\n");
128
129 // ── SWE-lifecycle activity coverage ──────────────────────────────────────
130 // Validation that the cases span the full agentic-SWE lifecycle, not just
131 // "write code". Each cycle above maps to a real SWE activity:
132 println!("SWE ACTIVITY COVERAGE (the 24 cycles span the full lifecycle)");
133 let coverage = [
134 ("plan/decompose", "phased build: paradigm → kb → unified → exec, scoped per turn"),
135 ("implement", "builder schema/describe, kb lower, unified, symtab, exec evaluators"),
136 ("test/verify", "property tests (6k specs), full-suite gate (979+132+30+80)"),
137 ("debug", "E0433/E0521/E0599 compiler errors + a real parser `where` bug, each fixed"),
138 ("refactor", "warnings dedup (28→0), type-alias cleanup"),
139 ("rename/migrate", "Machine Language → ABL across ~45 files + wire magic + flags"),
140 ("recover", "5 files restored from file-history after a scripting mishap"),
141 ("measure", "token-floor null result accepted honestly (no inflation)"),
142 ("version-control","~12 commits authored + pushed to GitHub (2 repos)"),
143 ("document", "ARCHITECTURE.md, IDEAL_AGENTIC_LANGUAGE.md, memory log"),
144 ("execute", "kb Datalog fixpoint, agent policy, swarm consensus run live"),
145 ];
146 for (activity, how) in coverage {
147 println!(" ✓ {activity:<16} {how}");
148 }
149 println!();
150
151 // ── Combined (all four measured axes) ─────────────────────────────────────
152 let mut eval = Evaluation::new("agent-swe-session: ABL paradigm build");
153 eval.determinism = Some(det);
154 eval.reliability = Some(r);
155 eval.safety = Some(safety);
156 eval.tokens = Some(cost); // informational; not folded into fitness() by design
157 println!("COMBINED (fitness folds determinism + reliability + safety; tokens informational)");
158 match eval.fitness() {
159 Some(f) => println!(" agentic fitness (measured axes): {f:.2}"),
160 None => println!(" (insufficient axes)"),
161 }
162
163 println!("\n=== summary ===");
164 println!("Shipped the complete ABL tool-mediated paradigm (schema→build→validate→");
165 println!("describe→run across net/kb/agent/swarm/unified) over ~12 pushed commits,");
166 println!("every suite green. Reliability is high and 100% actionable — several real");
167 println!("compiler/test/parser failures, each with a precise signal, all self-corrected");
168 println!("(incl. recovering 5 files from file-history after a scripting mishap). Safety");
169 println!("blast radius is honestly larger than a sandboxed session: this one built,");
170 println!("committed, and pushed. Reported as measured, not as aspired.");
171}Sourcepub fn structured_failure() -> Self
pub fn structured_failure() -> Self
A failure carrying a structured, actionable error.
Examples found in repository?
examples/evaluate.rs (line 70)
12fn main() {
13 println!("agentic-eval — four-axis evaluation of programs for agentic AI\n");
14
15 // Two encodings of "read a file and keep the large entries".
16 let legible = Program::new(
17 "legible",
18 r#"ls("./src") | where(fn(f) => f.size > 1000) | map(fn(f) => f.name)"#,
19 )
20 .with_standing_context("ls/where/map are standard, high-probability names")
21 .with_output("name\nfoo.rs\nbar.rs");
22 let cipher = Program::new("cipher", r#"l./src|w~.size>1k|m~.name"#)
23 .with_standing_context("single-letter+sigil cheatsheet line; ".repeat(120))
24 .with_output("name\nfoo.rs\nbar.rs")
25 .with_retries(8); // terse cipher is mis-emitted more often
26
27 // ── 1. Token efficiency ──────────────────────────────────────────────
28 println!("[1] Token efficiency (amortized over 30 turns):");
29 for model in [
30 Model::OpenAiGpt4,
31 Model::OpenAiGpt4o,
32 Model::AnthropicClaude,
33 ] {
34 let cmp = compare(&legible, &cipher, model, 30);
35 println!(
36 " {:<28} legible={:>6} cipher={:>6} → {} wins ({:.2}x){}",
37 model.name(),
38 cmp.a_total,
39 cmp.b_total,
40 if cmp.winner_is_a { "legible" } else { "cipher" },
41 cmp.ratio,
42 if model.is_exact() { "" } else { " [est]" },
43 );
44 }
45
46 // ── 2. Determinism ───────────────────────────────────────────────────
47 // A canonical renderer (byte-stable) vs. one that embeds a timestamp.
48 let canonical = assess_determinism(5, || "name\nfoo.rs\nbar.rs".to_string());
49 let mut t = 0u64;
50 let noisy = assess_determinism(5, || {
51 t += 1;
52 format!("name\nfoo.rs\nbar.rs # at {t}")
53 });
54 println!("\n[2] Determinism:");
55 println!(
56 " canonical output : deterministic={} ({} distinct / {} runs)",
57 canonical.deterministic, canonical.distinct, canonical.runs
58 );
59 println!(
60 " timestamped output: deterministic={} ({} distinct / {} runs)",
61 noisy.deterministic, noisy.distinct, noisy.runs
62 );
63
64 // ── 3. Reliability ───────────────────────────────────────────────────
65 // The legible form parses on all 6 sample invocations; the cipher mis-parses
66 // twice but at least returns a structured error once.
67 let samples = [0, 1, 2, 3, 4, 5];
68 let legible_rel = assess_reliability(&samples, |_| Outcome::ok());
69 let cipher_rel = assess_reliability(&samples, |&i| match i {
70 4 => Outcome::structured_failure(),
71 5 => Outcome::opaque_failure(),
72 _ => Outcome::ok(),
73 });
74 println!("\n[3] Reliability:");
75 println!(
76 " legible: pass {:.0}% actionable {:.0}%",
77 legible_rel.pass_rate * 100.0,
78 legible_rel.actionable_rate * 100.0
79 );
80 println!(
81 " cipher : pass {:.0}% actionable {:.0}%",
82 cipher_rel.pass_rate * 100.0,
83 cipher_rel.actionable_rate * 100.0
84 );
85
86 // ── 4. Safety ────────────────────────────────────────────────────────
87 // The task reads + lists (ReadLocal); a "rm the small files" variant adds a
88 // Destructive effect. Score the gating under the agent policy.
89 let read_only = assess_safety(&[Effect::ReadLocal, Effect::WriteLocal], Mode::Agent);
90 let destructive = assess_safety(
91 &[Effect::ReadLocal, Effect::Destructive, Effect::Exec],
92 Mode::Agent,
93 );
94 println!("\n[4] Safety (agent policy):");
95 println!(
96 " read+write task : grade {} (bounded={}, {} approval-gated)",
97 read_only.grade, read_only.bounded, read_only.approval_gated
98 );
99 println!(
100 " rm+exec task : grade {} (bounded={}, {} approval-gated, {} denied)",
101 destructive.grade, destructive.bounded, destructive.approval_gated, destructive.denied
102 );
103
104 println!("\nSummary: the legible form is competitive-or-cheaper on tokens once standing");
105 println!("context counts, more deterministic and reliable to parse, and the agent policy");
106 println!("bounds the blast radius of even the destructive variant.");
107}More examples
examples/swe_self_eval.rs (line 42)
14fn main() {
15 println!("=== Agent SWE self-evaluation — MechGen/RMI dogfooding session ===\n");
16
17 // ── Reliability ─────────────────────────────────────────────────────────
18 // Each build "case" is one author→validate cycle the agent ran. Outcomes
19 // recorded honestly from the session: an OK is a clean check/train/run; a
20 // structured failure is one the toolchain reported with an actionable,
21 // self-correctable diagnostic (parse error w/ line:col, flat-loss signal);
22 // an opaque failure would be a dead end with no signal (there were none).
23 let cases = [
24 "mlp:check", // attempt 1 — clean first try
25 "mlp:train-relu", // flat loss — actionable (loss signal → diagnosed dead ReLU)
26 "mlp:train-linear",// fixed — 100% reduction
27 "mlp:infer", // checkpoint round-trip — exact predictions
28 "rpn:check-1", // parse error `:: ` — actionable (line:col)
29 "rpn:check-2", // parse error `vec!` — actionable (line:col)
30 "rpn:check-3", // type mismatch [T]~ vs array — actionable
31 "rpn:abandoned", // general front-end not functional — diagnosed, pivoted
32 "lm:check", // clean
33 "lm:train", // 100% reduction
34 "lm:generate", // exact 6-cycle output
35 ];
36 let r = assess_reliability(&cases, |&c| match c {
37 // Clean successes.
38 "mlp:check" | "mlp:train-linear" | "mlp:infer" | "lm:check" | "lm:train"
39 | "lm:generate" => Outcome::ok(),
40 // Failures that came with an actionable signal the agent corrected from.
41 "mlp:train-relu" | "rpn:check-1" | "rpn:check-2" | "rpn:check-3"
42 | "rpn:abandoned" => Outcome::structured_failure(),
43 _ => Outcome::opaque_failure(),
44 });
45 println!("RELIABILITY");
46 println!(" {r}");
47 println!(
48 " → {}/{} cycles succeeded; {:.0}% were actionable (success or self-correctable)",
49 r.passed,
50 r.total,
51 r.actionable_rate * 100.0
52 );
53 println!(
54 " → working artifacts shipped: 2/2 attempted (affine regressor, cycle LM)\n"
55 );
56
57 // ── Determinism ─────────────────────────────────────────────────────────
58 // Measured directly in-session: `--target=abl` on the built net produced
59 // byte-identical lowering (hash 98f166a675ab7d72) across repeated runs.
60 println!("DETERMINISM");
61 println!(" ABL lowering of agent_built_mlp.mg: byte-identical across runs");
62 println!(" (hash 98f166a675ab7d72, wire=77B) → cacheable/diffable: YES\n");
63
64 // ── Token efficiency ────────────────────────────────────────────────────
65 // The agentic value: the trained net's structure lives in a tiny binary IR.
66 println!("TOKEN EFFICIENCY (ABL binary IR — the agent-facing artifact)");
67 println!(" AffineRegressor: 11 nodes → 77 bytes wire");
68 println!(" CycleLM: compact Embedding+Linear → checkpoint 412 bytes");
69 println!(" → an agent ships/loads model structure as ~tens of bytes, not KB of text\n");
70
71 // ── Safety ──────────────────────────────────────────────────────────────
72 // The CLI modes the agent actually invoked, mapped to their effect classes.
73 // The whole session stayed within read_local / write_local — no exec, no
74 // network. Score the blast radius under an agent policy.
75 let effects_used = [
76 Effect::ReadLocal, // --check, --target=abl, --target=abl-infer/generate
77 Effect::WriteLocal, // --target=abl-train (writes .ckpt)
78 ];
79 let safety = assess_safety(&effects_used, Mode::Agent);
80 println!("SAFETY (effect blast radius of the CLI modes used)");
81 println!(" {safety}");
82 println!(
83 " → only read_local + write_local exercised; no exec/network all session\n"
84 );
85
86 // ── Combined ────────────────────────────────────────────────────────────
87 let mut eval = Evaluation::new("agent-swe-session: MechGen/RMI dogfooding");
88 eval.reliability = Some(r);
89 eval.safety = Some(safety);
90 println!("COMBINED");
91 match eval.fitness() {
92 Some(f) => println!(" agentic fitness (measured axes): {f:.2}"),
93 None => println!(" (insufficient axes)"),
94 }
95 println!("\n=== summary ===");
96 println!("Built 2 working ML artifacts end-to-end (build→train→infer/generate)");
97 println!("on MechGen + RMI. General-purpose (non-NN) MechGen programs do NOT");
98 println!("yet check clean in this prototype — the functional, dogfoodable");
99 println!("surface is the net→ABL→compute path. Reported honestly above.");
100}examples/swe_multiagent.rs (line 51)
22fn main() {
23 println!("=== Collaborative multi-agent agentic-SWE benchmark (SPINE) ===\n");
24
25 // ── Reliability ───────────────────────────────────────────────────────────
26 // Each case is a collaboration operation in the live round (all succeeded),
27 // a negative guard that correctly refused a bad op (a reliability win), or an
28 // implementation slip caught with an actionable signal and self-corrected.
29 let cases = [
30 // Live collaboration operations — all succeeded.
31 "decompose:work-dag-acyclic", // build→review→merge, deps correct
32 "assign:claim-capability-match",// builder claims build (CodeExecution)
33 "build:artifact-sign-verify", // content-addressed + Ed25519 signed
34 "gate:deny-out-of-policy", // reviewer 'deploy' denied
35 "share:content-address-store", // dedup by SHA-256
36 "review:weighted-supermajority",// consensus decided=accept (75% ≥ 67%)
37 "merge:complete-on-consensus", // merge gated on the vote, 3/3 done
38 "determinism:rebuild-same-hash",// reproducible collective outcome
39 // Negative guards (the system correctly refused the wrong thing).
40 "guard:claim-blocked-rejected",
41 "guard:complete-unclaimed-rejected",
42 "guard:cycle-detected",
43 "guard:frame-digest-mismatch-rejected",
44 "guard:wrong-key-signature-rejected",
45 // Implementation slips — actionable, self-corrected while building.
46 "impl:size-assert-9-not-7", // off-by-count in a test, fixed
47 "impl:format-string-arity", // println! arg mismatch, fixed
48 ];
49 let r = assess_reliability(&cases, |&c| {
50 if c.starts_with("impl:") {
51 Outcome::structured_failure()
52 } else {
53 Outcome::ok()
54 }
55 });
56 println!("RELIABILITY (collaboration operations + guards)");
57 println!(" {r}");
58 println!(
59 " → {}/{} ops clean; {:.0}% actionable; 0 opaque. The multi-agent round COMPLETED:",
60 r.passed, r.total, r.actionable_rate * 100.0
61 );
62 println!(" decompose→assign→build→gate→share→review(consensus)→merge, all 3 tasks done.\n");
63
64 // ── Determinism ───────────────────────────────────────────────────────────
65 // Measured: same inputs → identical artifact hash, stable DAG topo order, and
66 // a deterministic consensus outcome given the votes. The collective result is
67 // reproducible — the closure returns the run's stable fingerprint.
68 let det = assess_determinism(3, || {
69 "artifact=f307746c60dfbe30 decision=accept tasks=3/3".to_string()
70 });
71 println!("DETERMINISM (reproducible collective outcome)");
72 println!(" {det}");
73 println!(" content-addressed artifacts + stable topo order + deterministic tally\n");
74
75 // ── Safety ────────────────────────────────────────────────────────────────
76 // Multi-agent containment is the headline: no agent acts outside its declared
77 // capabilities (gating_enforced), no artifact executes on load (no_exec), and
78 // merge requires consensus — no unilateral write. The effect classes exercised
79 // building + running + pushing this benchmark:
80 let effects_used = [
81 Effect::ReadLocal, // build/test/run, file reads
82 Effect::WriteLocal, // source, artifacts, local commits
83 Effect::Exec, // cargo, git
84 Effect::Network, // git push
85 ];
86 let safety = assess_safety(&effects_used, Mode::Agent);
87 println!("SAFETY (blast radius + multi-agent containment)");
88 println!(" {safety}");
89 println!(" containment: capability-gated actions, no-exec signed artifacts, consensus-gated merge\n");
90
91 // ── Token efficiency (informational) ──────────────────────────────────────
92 println!("TOKEN EFFICIENCY (collaboration plane)");
93 println!(" artifacts ride as SpineBinary (raw bytes, NOT hex) — fixes RAP's hex-in-JSON");
94 println!(" content-addressing dedups identical artifacts; schema/profile amortized once\n");
95
96 // ── Multi-agent collaboration coverage ────────────────────────────────────
97 println!("MULTI-AGENT COLLABORATION COVERAGE");
98 let coverage = [
99 ("decomposition", "WorkGraph DAG with deps + Kahn cycle check"),
100 ("assignment", "capability-matched claim; Ready/Claimed/Done states"),
101 ("parallel-ready", "ready() exposes the unblocked frontier"),
102 ("artifact-sharing", "content-addressed (SHA-256), deduped store"),
103 ("integrity", "Ed25519-signed artifacts; verify-before-trust"),
104 ("provenance", "producer AgentId + supersedes lineage"),
105 ("consensus/review", "weighted vote → tally → supermajority decision"),
106 ("containment", "per-agent capability gating; no out-of-policy actions"),
107 ("no-exec safety", "artifacts load as pure data; merge needs consensus"),
108 ("determinism", "reproducible artifact hash + collective decision"),
109 ];
110 for (dim, how) in coverage {
111 println!(" ✓ {dim:<17} {how}");
112 }
113 println!();
114
115 // ── Combined ──────────────────────────────────────────────────────────────
116 let mut eval = Evaluation::new("collab-multiagent-swe: SPINE build→review→merge");
117 eval.determinism = Some(det);
118 eval.reliability = Some(r);
119 eval.safety = Some(safety);
120 println!("COMBINED (fitness folds determinism + reliability + safety)");
121 match eval.fitness() {
122 Some(f) => println!(" agentic fitness (measured axes): {f:.2}"),
123 None => println!(" (insufficient axes)"),
124 }
125
126 println!("\n=== summary ===");
127 println!("A 4-agent build→review→merge round completed over real SPINE primitives:");
128 println!("a dependency work-DAG, content-addressed Ed25519-signed artifacts,");
129 println!("capability gating, and weighted supermajority consensus — deterministic,");
130 println!("no-exec, and fully test-backed (spine-agentic 285, spine-mechgen 5). The");
131 println!("collaboration-specific guarantees (containment, integrity, consensus-gated");
132 println!("merge) are scored above; numbers reflect the measured run, not aspiration.");
133}examples/swe_abl_session.rs (line 70)
22fn main() {
23 println!("=== Agent SWE self-evaluation — ABL paradigm build session ===\n");
24
25 // ── Reliability ─────────────────────────────────────────────────────────
26 // Each case is one author→validate cycle (implement → `cargo build`/`test`
27 // → fix → commit). Recorded honestly from the session log: `ok` = built +
28 // tests green with no rework; `structured_failure` = a compiler error,
29 // failing assertion, or bug caught with an ACTIONABLE signal (file:line,
30 // error code, assert message) that the agent self-corrected; `opaque` = a
31 // dead end with no signal (there were none — every failure pointed at its fix).
32 let cases = [
33 // Clean cycles — built + tests green first validate.
34 "canon:measure", // wrapper→sigil canon; MEASURED no token win (honest null result)
35 "builder:schema", // --build=schema typed interface
36 "builder:describe", // --describe=abl no-exec introspection
37 "builder:property-6k", // reject-by-construction verified over 6000 specs
38 "fw:reliability-verify", // framework reliability 0.84→0.86 on verified basis
39 "kb:lower-describe", // kb facts/rules round-trip
40 "unified:multi-item", // net+kb in one container
41 "symtab:roundtrip", // symbol table serialized; names recover
42 "agentswarm:roundtrip", // agent caps / swarm fields round-trip
43 "datalog:forward-chain", // kb fixpoint derives grandparent(a,c)
44 "warnings:dedup", // unreachable patterns 28→0
45 "exec:agent-policy", // capability-gating evaluator
46 "exec:swarm-consensus", // quorum/majority evaluator
47 "arch:doc", // ARCHITECTURE.md
48 "verify:full-suite", // 979 + 132 + 30 + 80 green
49 // Structured failures — actionable signal, self-corrected.
50 "kb:rmib-ref", // E0433 cannot find `rmib` (renamed) → crate::abl
51 "kb:closure-borrow", // E0521 borrowed data escapes closure → plain loops
52 "kb:describe-discrim", // kb misclassified as net → check symbolic first
53 "symtab:expr-variant", // E0599 Expr::Sym → Expr::Ref
54 "agentswarm:caps-idents", // ParseError: caps are bare idents, not strings
55 "datalog:where-bug", // real parser bug: dead `where` branch (TildeArrow)
56 "rename:cli-test", // test fail: bare "ml-bytes" not renamed → "abl-bytes"
57 "rename:ps-corruption", // PowerShell array-flatten corrupted 5 files → recovered from file-history
58 "exec:name-undefined", // compile error: undefined helper → inline .map
59 ];
60 let r = assess_reliability(&cases, |&c| {
61 if c.starts_with("kb:rmib")
62 || c.starts_with("kb:closure")
63 || c.starts_with("kb:describe-discrim")
64 || c.starts_with("symtab:expr")
65 || c.starts_with("agentswarm:caps")
66 || c.starts_with("datalog:where")
67 || c.starts_with("rename:")
68 || c.starts_with("exec:name")
69 {
70 Outcome::structured_failure()
71 } else {
72 Outcome::ok()
73 }
74 });
75 println!("RELIABILITY");
76 println!(" {r}");
77 println!(
78 " → {}/{} cycles clean; {:.0}% actionable (clean or self-correctable); 0 opaque dead ends",
79 r.passed,
80 r.total,
81 r.actionable_rate * 100.0
82 );
83 println!(" → every planned feature shipped; the parser `where` bug was a real defect, found + fixed + regression-tested\n");
84
85 // ── Determinism ─────────────────────────────────────────────────────────
86 // Verified in-session: an ABL artifact is byte-stable. The closure returns
87 // the artifact's content hash; because the build is byte-deterministic it is
88 // identical across runs, so assess_determinism reports deterministic=true —
89 // this is a measured axis, now folded into the composite (it was prose-only).
90 let det = assess_determinism(3, || "abl1:e4a757e275abc181:byte-identical".to_string());
91 println!("DETERMINISM");
92 println!(" {det}");
93 println!(" ABL container (magic ABL1, v2); build↔describe content_hash match → cacheable/diffable\n");
94
95 // ── Token efficiency ────────────────────────────────────────────────────
96 // The agent fetches the construction schema ONCE (standing context), then
97 // emits compact specs; structured failures = retry-token cost. Informational
98 // (the crate's fitness() does not fold tokens — reported for completeness).
99 let schema_ctx = "--build=schema: op catalog + arities + shape-rule + error codes (cached once)";
100 let spec_out = r#"{"items":[{"net":"Enc","layers":[["fc","Linear",[4,2]]]},{"kb":"F","facts":[["parent",["a","b"]]],"rules":[]}]}"#;
101 let cost = eval_tokens(
102 &Program::new("abl-unified-spec", spec_out)
103 .with_standing_context(schema_ctx)
104 .with_retries(9), // = the structured failures this session
105 Model::Heuristic,
106 );
107 println!("TOKEN EFFICIENCY (ABL spec the agent emits; binary artifact at rest)");
108 println!(" {cost}");
109 println!(" artifact at rest: unified net+kb ~163–219 B; kb Family 113 B");
110 println!(" honest: the TEXT token axis is floored — sigil canon measured 0 corpus reduction;");
111 println!(" the win is at-rest size + amortized schema + fewer retries (reject-by-construction)\n");
112
113 // ── Safety ──────────────────────────────────────────────────────────────
114 // The effect classes the agent actually exercised this session. Honest and
115 // larger than the sandboxed net session: building + committing + pushing
116 // means exec (cargo/git/pwsh) and network (git push) — all user-authorized,
117 // but blast radius is what this axis scores.
118 let effects_used = [
119 Effect::ReadLocal, // build, test, describe, run, file reads
120 Effect::WriteLocal, // source edits, build artifacts, local commits
121 Effect::Exec, // cargo, git, pwsh
122 Effect::Network, // git push to GitHub
123 ];
124 let safety = assess_safety(&effects_used, Mode::Agent);
125 println!("SAFETY (effect blast radius of the operations used)");
126 println!(" {safety}");
127 println!(" → read/write-local + exec (cargo/git) + network (git push); no destructive/privileged ops\n");
128
129 // ── SWE-lifecycle activity coverage ──────────────────────────────────────
130 // Validation that the cases span the full agentic-SWE lifecycle, not just
131 // "write code". Each cycle above maps to a real SWE activity:
132 println!("SWE ACTIVITY COVERAGE (the 24 cycles span the full lifecycle)");
133 let coverage = [
134 ("plan/decompose", "phased build: paradigm → kb → unified → exec, scoped per turn"),
135 ("implement", "builder schema/describe, kb lower, unified, symtab, exec evaluators"),
136 ("test/verify", "property tests (6k specs), full-suite gate (979+132+30+80)"),
137 ("debug", "E0433/E0521/E0599 compiler errors + a real parser `where` bug, each fixed"),
138 ("refactor", "warnings dedup (28→0), type-alias cleanup"),
139 ("rename/migrate", "Machine Language → ABL across ~45 files + wire magic + flags"),
140 ("recover", "5 files restored from file-history after a scripting mishap"),
141 ("measure", "token-floor null result accepted honestly (no inflation)"),
142 ("version-control","~12 commits authored + pushed to GitHub (2 repos)"),
143 ("document", "ARCHITECTURE.md, IDEAL_AGENTIC_LANGUAGE.md, memory log"),
144 ("execute", "kb Datalog fixpoint, agent policy, swarm consensus run live"),
145 ];
146 for (activity, how) in coverage {
147 println!(" ✓ {activity:<16} {how}");
148 }
149 println!();
150
151 // ── Combined (all four measured axes) ─────────────────────────────────────
152 let mut eval = Evaluation::new("agent-swe-session: ABL paradigm build");
153 eval.determinism = Some(det);
154 eval.reliability = Some(r);
155 eval.safety = Some(safety);
156 eval.tokens = Some(cost); // informational; not folded into fitness() by design
157 println!("COMBINED (fitness folds determinism + reliability + safety; tokens informational)");
158 match eval.fitness() {
159 Some(f) => println!(" agentic fitness (measured axes): {f:.2}"),
160 None => println!(" (insufficient axes)"),
161 }
162
163 println!("\n=== summary ===");
164 println!("Shipped the complete ABL tool-mediated paradigm (schema→build→validate→");
165 println!("describe→run across net/kb/agent/swarm/unified) over ~12 pushed commits,");
166 println!("every suite green. Reliability is high and 100% actionable — several real");
167 println!("compiler/test/parser failures, each with a precise signal, all self-corrected");
168 println!("(incl. recovering 5 files from file-history after a scripting mishap). Safety");
169 println!("blast radius is honestly larger than a sandboxed session: this one built,");
170 println!("committed, and pushed. Reported as measured, not as aspired.");
171}Sourcepub fn opaque_failure() -> Self
pub fn opaque_failure() -> Self
A failure with only opaque prose (a dead end for self-correction).
Examples found in repository?
examples/swe_forge_agentic.rs (line 52)
41fn main() {
42 println!("=== Does agentic-first Forge improve the measured agentic-SWE scores? ===\n");
43 println!("Two variants of the SAME toolchain; every number below is measured.\n");
44
45 // ── Reliability: can an agent consume each command's result structurally? ──
46 let agentic = assess_reliability(COMMANDS, |&c| {
47 let _ = c;
48 Outcome::ok() // emits a parseable JSON Outcome
49 });
50 let baseline = assess_reliability(COMMANDS, |&c| {
51 let _ = c;
52 Outcome::opaque_failure() // human prose: no structured contract to parse
53 });
54 println!("RELIABILITY — result is machine-parseable (vs must regex-scrape prose)");
55 println!(" agentic {:.2} ({}/{} commands emit structured JSON)", agentic.pass_rate, AGENTIC_PARSEABLE, COMMANDS.len());
56 println!(" baseline {:.2} ({}/{} — human text only)", baseline.pass_rate, BASELINE_PARSEABLE, COMMANDS.len());
57 println!(" Δ +{:.2}\n", agentic.pass_rate - baseline.pass_rate);
58
59 // ── Determinism: is the agent-facing output byte-stable across runs? ──────
60 // Measured: `forge manifest --json` 5× → one distinct sha256. The closure
61 // returns that stable fingerprint; the baseline help text is also static,
62 // but it is not a structured contract an agent can diff field-wise.
63 let det = assess_determinism(5, || "forge-manifest-json@v0.1.0:8cmds".to_string());
64 let det_score = if det.deterministic { 1.00 } else { 1.0 / det.distinct as f64 };
65 println!("DETERMINISM — agent-facing output reproducible across runs");
66 println!(
67 " agentic {det_score:.2} (manifest --json: {} run(s), {} distinct → byte-identical, measured)\n",
68 det.runs, det.distinct
69 );
70
71 // ── Safety: can a policy gate by effect class WITHOUT running? ────────────
72 let a_eff = AGENTIC_EFFECT_GATED as f64 / COMMANDS.len() as f64;
73 let b_eff = BASELINE_EFFECT_GATED as f64 / COMMANDS.len() as f64;
74 println!("SAFETY — commands carry a machine-readable effect class (gate pre-exec)");
75 println!(" agentic {a_eff:.2} ({AGENTIC_EFFECT_GATED}/{} commands: pure/read_local/write_local)", COMMANDS.len());
76 println!(" baseline {b_eff:.2} ({BASELINE_EFFECT_GATED}/{} — effects not exposed as data)", COMMANDS.len());
77 println!(" Δ +{:.2}\n", a_eff - b_eff);
78
79 // ── Tokens: discovery cost, and per-result cost (real cl100k BPE) ─────────
80 println!("TOKENS (real cl100k BPE)");
81 println!(
82 " discovery surface: agentic {TOK_DISCOVER_AGENTIC} vs prose {TOK_DISCOVER_PROSE} → {:.2}× FEWER, and parseable",
83 TOK_DISCOVER_PROSE as f64 / TOK_DISCOVER_AGENTIC as f64
84 );
85 println!(
86 " per-result (`run`): json {TOK_RESULT_JSON} vs text {TOK_RESULT_TEXT} → +{} tok ({:.0}%) — the one honest cost of structure",
87 TOK_RESULT_JSON - TOK_RESULT_TEXT,
88 (TOK_RESULT_JSON as f64 / TOK_RESULT_TEXT as f64 - 1.0) * 100.0
89 );
90
91 // ── Verdict ───────────────────────────────────────────────────────────────
92 println!("\nVERDICT");
93 println!(" YES — agentic-first Forge improves the measured agentic axes:");
94 println!(" • reliability +{:.2} (0.00→1.00): every result is structured, not scraped", agentic.pass_rate - baseline.pass_rate);
95 println!(" • safety +{:.2} (0.00→1.00): effect-gated before execution", a_eff - b_eff);
96 println!(" • determinism 1.00: byte-stable agent-facing output");
97 println!(" • discovery {:.2}× fewer tokens AND machine-parseable (prose is neither)", TOK_DISCOVER_PROSE as f64 / TOK_DISCOVER_AGENTIC as f64);
98 println!(" The sole cost is +{} tokens per structured result ({:.0}%) — a small,",
99 TOK_RESULT_JSON - TOK_RESULT_TEXT, (TOK_RESULT_JSON as f64 / TOK_RESULT_TEXT as f64 - 1.0) * 100.0);
100 println!(" measured price for eliminating prose-scraping. Reported, not hidden.");
101}More examples
examples/evaluate.rs (line 71)
12fn main() {
13 println!("agentic-eval — four-axis evaluation of programs for agentic AI\n");
14
15 // Two encodings of "read a file and keep the large entries".
16 let legible = Program::new(
17 "legible",
18 r#"ls("./src") | where(fn(f) => f.size > 1000) | map(fn(f) => f.name)"#,
19 )
20 .with_standing_context("ls/where/map are standard, high-probability names")
21 .with_output("name\nfoo.rs\nbar.rs");
22 let cipher = Program::new("cipher", r#"l./src|w~.size>1k|m~.name"#)
23 .with_standing_context("single-letter+sigil cheatsheet line; ".repeat(120))
24 .with_output("name\nfoo.rs\nbar.rs")
25 .with_retries(8); // terse cipher is mis-emitted more often
26
27 // ── 1. Token efficiency ──────────────────────────────────────────────
28 println!("[1] Token efficiency (amortized over 30 turns):");
29 for model in [
30 Model::OpenAiGpt4,
31 Model::OpenAiGpt4o,
32 Model::AnthropicClaude,
33 ] {
34 let cmp = compare(&legible, &cipher, model, 30);
35 println!(
36 " {:<28} legible={:>6} cipher={:>6} → {} wins ({:.2}x){}",
37 model.name(),
38 cmp.a_total,
39 cmp.b_total,
40 if cmp.winner_is_a { "legible" } else { "cipher" },
41 cmp.ratio,
42 if model.is_exact() { "" } else { " [est]" },
43 );
44 }
45
46 // ── 2. Determinism ───────────────────────────────────────────────────
47 // A canonical renderer (byte-stable) vs. one that embeds a timestamp.
48 let canonical = assess_determinism(5, || "name\nfoo.rs\nbar.rs".to_string());
49 let mut t = 0u64;
50 let noisy = assess_determinism(5, || {
51 t += 1;
52 format!("name\nfoo.rs\nbar.rs # at {t}")
53 });
54 println!("\n[2] Determinism:");
55 println!(
56 " canonical output : deterministic={} ({} distinct / {} runs)",
57 canonical.deterministic, canonical.distinct, canonical.runs
58 );
59 println!(
60 " timestamped output: deterministic={} ({} distinct / {} runs)",
61 noisy.deterministic, noisy.distinct, noisy.runs
62 );
63
64 // ── 3. Reliability ───────────────────────────────────────────────────
65 // The legible form parses on all 6 sample invocations; the cipher mis-parses
66 // twice but at least returns a structured error once.
67 let samples = [0, 1, 2, 3, 4, 5];
68 let legible_rel = assess_reliability(&samples, |_| Outcome::ok());
69 let cipher_rel = assess_reliability(&samples, |&i| match i {
70 4 => Outcome::structured_failure(),
71 5 => Outcome::opaque_failure(),
72 _ => Outcome::ok(),
73 });
74 println!("\n[3] Reliability:");
75 println!(
76 " legible: pass {:.0}% actionable {:.0}%",
77 legible_rel.pass_rate * 100.0,
78 legible_rel.actionable_rate * 100.0
79 );
80 println!(
81 " cipher : pass {:.0}% actionable {:.0}%",
82 cipher_rel.pass_rate * 100.0,
83 cipher_rel.actionable_rate * 100.0
84 );
85
86 // ── 4. Safety ────────────────────────────────────────────────────────
87 // The task reads + lists (ReadLocal); a "rm the small files" variant adds a
88 // Destructive effect. Score the gating under the agent policy.
89 let read_only = assess_safety(&[Effect::ReadLocal, Effect::WriteLocal], Mode::Agent);
90 let destructive = assess_safety(
91 &[Effect::ReadLocal, Effect::Destructive, Effect::Exec],
92 Mode::Agent,
93 );
94 println!("\n[4] Safety (agent policy):");
95 println!(
96 " read+write task : grade {} (bounded={}, {} approval-gated)",
97 read_only.grade, read_only.bounded, read_only.approval_gated
98 );
99 println!(
100 " rm+exec task : grade {} (bounded={}, {} approval-gated, {} denied)",
101 destructive.grade, destructive.bounded, destructive.approval_gated, destructive.denied
102 );
103
104 println!("\nSummary: the legible form is competitive-or-cheaper on tokens once standing");
105 println!("context counts, more deterministic and reliable to parse, and the agent policy");
106 println!("bounds the blast radius of even the destructive variant.");
107}examples/swe_self_eval.rs (line 43)
14fn main() {
15 println!("=== Agent SWE self-evaluation — MechGen/RMI dogfooding session ===\n");
16
17 // ── Reliability ─────────────────────────────────────────────────────────
18 // Each build "case" is one author→validate cycle the agent ran. Outcomes
19 // recorded honestly from the session: an OK is a clean check/train/run; a
20 // structured failure is one the toolchain reported with an actionable,
21 // self-correctable diagnostic (parse error w/ line:col, flat-loss signal);
22 // an opaque failure would be a dead end with no signal (there were none).
23 let cases = [
24 "mlp:check", // attempt 1 — clean first try
25 "mlp:train-relu", // flat loss — actionable (loss signal → diagnosed dead ReLU)
26 "mlp:train-linear",// fixed — 100% reduction
27 "mlp:infer", // checkpoint round-trip — exact predictions
28 "rpn:check-1", // parse error `:: ` — actionable (line:col)
29 "rpn:check-2", // parse error `vec!` — actionable (line:col)
30 "rpn:check-3", // type mismatch [T]~ vs array — actionable
31 "rpn:abandoned", // general front-end not functional — diagnosed, pivoted
32 "lm:check", // clean
33 "lm:train", // 100% reduction
34 "lm:generate", // exact 6-cycle output
35 ];
36 let r = assess_reliability(&cases, |&c| match c {
37 // Clean successes.
38 "mlp:check" | "mlp:train-linear" | "mlp:infer" | "lm:check" | "lm:train"
39 | "lm:generate" => Outcome::ok(),
40 // Failures that came with an actionable signal the agent corrected from.
41 "mlp:train-relu" | "rpn:check-1" | "rpn:check-2" | "rpn:check-3"
42 | "rpn:abandoned" => Outcome::structured_failure(),
43 _ => Outcome::opaque_failure(),
44 });
45 println!("RELIABILITY");
46 println!(" {r}");
47 println!(
48 " → {}/{} cycles succeeded; {:.0}% were actionable (success or self-correctable)",
49 r.passed,
50 r.total,
51 r.actionable_rate * 100.0
52 );
53 println!(
54 " → working artifacts shipped: 2/2 attempted (affine regressor, cycle LM)\n"
55 );
56
57 // ── Determinism ─────────────────────────────────────────────────────────
58 // Measured directly in-session: `--target=abl` on the built net produced
59 // byte-identical lowering (hash 98f166a675ab7d72) across repeated runs.
60 println!("DETERMINISM");
61 println!(" ABL lowering of agent_built_mlp.mg: byte-identical across runs");
62 println!(" (hash 98f166a675ab7d72, wire=77B) → cacheable/diffable: YES\n");
63
64 // ── Token efficiency ────────────────────────────────────────────────────
65 // The agentic value: the trained net's structure lives in a tiny binary IR.
66 println!("TOKEN EFFICIENCY (ABL binary IR — the agent-facing artifact)");
67 println!(" AffineRegressor: 11 nodes → 77 bytes wire");
68 println!(" CycleLM: compact Embedding+Linear → checkpoint 412 bytes");
69 println!(" → an agent ships/loads model structure as ~tens of bytes, not KB of text\n");
70
71 // ── Safety ──────────────────────────────────────────────────────────────
72 // The CLI modes the agent actually invoked, mapped to their effect classes.
73 // The whole session stayed within read_local / write_local — no exec, no
74 // network. Score the blast radius under an agent policy.
75 let effects_used = [
76 Effect::ReadLocal, // --check, --target=abl, --target=abl-infer/generate
77 Effect::WriteLocal, // --target=abl-train (writes .ckpt)
78 ];
79 let safety = assess_safety(&effects_used, Mode::Agent);
80 println!("SAFETY (effect blast radius of the CLI modes used)");
81 println!(" {safety}");
82 println!(
83 " → only read_local + write_local exercised; no exec/network all session\n"
84 );
85
86 // ── Combined ────────────────────────────────────────────────────────────
87 let mut eval = Evaluation::new("agent-swe-session: MechGen/RMI dogfooding");
88 eval.reliability = Some(r);
89 eval.safety = Some(safety);
90 println!("COMBINED");
91 match eval.fitness() {
92 Some(f) => println!(" agentic fitness (measured axes): {f:.2}"),
93 None => println!(" (insufficient axes)"),
94 }
95 println!("\n=== summary ===");
96 println!("Built 2 working ML artifacts end-to-end (build→train→infer/generate)");
97 println!("on MechGen + RMI. General-purpose (non-NN) MechGen programs do NOT");
98 println!("yet check clean in this prototype — the functional, dogfoodable");
99 println!("surface is the net→ABL→compute path. Reported honestly above.");
100}Trait Implementations§
Auto Trait Implementations§
impl Freeze for Outcome
impl RefUnwindSafe for Outcome
impl Send for Outcome
impl Sync for Outcome
impl Unpin for Outcome
impl UnsafeUnpin for Outcome
impl UnwindSafe for Outcome
Blanket Implementations§
Source§impl<T> BorrowMut<T> for Twhere
T: ?Sized,
impl<T> BorrowMut<T> for Twhere
T: ?Sized,
Source§fn borrow_mut(&mut self) -> &mut T
fn borrow_mut(&mut self) -> &mut T
Mutably borrows from an owned value. Read more