Skip to main content

agentic_eval/
ontology.rs

1//! A complete, self-describing **ontology** of `agentic-eval` itself.
2//!
3//! Agentic-first design means a consumer should never have to read prose docs to
4//! use the library — it should be able to *discover* every capability through a
5//! compact, machine-readable manifest and expand any entry on demand. That is the
6//! same progressive-disclosure pattern this crate measures (a cheap root index +
7//! `describe(...)` for detail), applied reflexively to the crate's own surface:
8//! its four [axes](crate), the [`Effect`] taxonomy with the policy [`Decision`] each
9//! effect gets under each [`Mode`], the supported tokenizer [`Model`]s, and the
10//! built-in CLI command classifier.
11//!
12//! The ontology is **deterministic** (built from static data in fixed order, no map
13//! iteration), **compact** ([`manifest`] is a few hundred tokens), and
14//! **machine-readable** (every type derives `serde::Serialize` under the `serde`
15//! feature). Start at [`manifest`], expand with [`describe`], or take the whole
16//! structured catalog with [`ontology`].
17//!
18//! ```
19//! // Discover the surface without reading docs:
20//! let root = agentic_eval::ontology::manifest();
21//! assert!(root.contains("axes:"));
22//! // Expand one entry on demand:
23//! let safety = agentic_eval::ontology::describe("safety").unwrap();
24//! assert!(safety.contains("assess_safety"));
25//! let rm = agentic_eval::ontology::describe("destructive").unwrap();
26//! assert!(rm.contains("agent=approve")); // the policy decision, machine-checkable
27//! ```
28
29use crate::commands::{commands_for, known_command_count};
30use crate::safety::{Effect, Mode};
31use crate::tokens::Model;
32
33/// The crate's own version (so the ontology never drifts from the build).
34pub const VERSION: &str = env!("CARGO_PKG_VERSION");
35
/// One of the four evaluation axes — what it measures and how to invoke it.
///
/// Implements `PartialEq`/`Eq` so catalog entries can be compared structurally
/// (e.g. in determinism checks) instead of via their rendered text.
#[cfg_attr(feature = "serde", derive(serde::Serialize))]
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct AxisDoc {
    /// Axis name (`tokens` / `determinism` / `reliability` / `safety`).
    pub name: &'static str,
    /// One-line summary of what the axis scores.
    pub summary: &'static str,
    /// The public entry-point functions for this axis.
    pub entry_points: &'static [&'static str],
    /// Whether assessing it needs the program to *run* (a caller closure), vs.
    /// working on text/declared effects alone.
    pub needs_execution: bool,
    /// The report type(s) the axis produces, as a display string.
    pub output_type: &'static str,
}
52
/// An effect class with the policy decision it receives under each mode.
///
/// Implements `PartialEq`/`Eq` so entries can be compared structurally instead
/// of via their rendered text.
#[cfg_attr(feature = "serde", derive(serde::Serialize))]
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct EffectDoc {
    /// Canonical snake_case effect name.
    pub name: &'static str,
    /// One-line description of the effect class.
    pub summary: &'static str,
    /// Whether the class is dangerous (should be gated for an agent).
    pub dangerous: bool,
    /// The decision a human gets (`allow` / `approve` / `deny`).
    pub human_decision: &'static str,
    /// The decision an agent gets under the default policy.
    pub agent_decision: &'static str,
    /// A few CLI commands the built-in classifier maps to this effect
    /// (may be empty).
    pub example_commands: Vec<&'static str>,
}
70
/// A supported tokenizer/model.
///
/// Implements `PartialEq`/`Eq` so entries can be compared structurally.
#[cfg_attr(feature = "serde", derive(serde::Serialize))]
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ModelDoc {
    /// Display name (tokenizer family).
    pub name: &'static str,
    /// Whether this build counts it exactly (a real BPE) vs. an estimate.
    pub exact: bool,
}
80
81/// The complete, structured ontology of the crate.
82#[cfg_attr(feature = "serde", derive(serde::Serialize))]
83#[derive(Debug, Clone)]
84pub struct Ontology {
85    /// Crate name.
86    pub crate_name: &'static str,
87    /// Crate version (matches the build).
88    pub version: &'static str,
89    /// One-line description of the crate's purpose.
90    pub summary: &'static str,
91    /// The four evaluation axes.
92    pub axes: Vec<AxisDoc>,
93    /// The effect taxonomy with per-mode policy decisions.
94    pub effects: Vec<EffectDoc>,
95    /// The operating modes the policy distinguishes.
96    pub modes: Vec<&'static str>,
97    /// The tokenizer models token efficiency can be counted under.
98    pub models: Vec<ModelDoc>,
99    /// How many CLI commands the built-in classifier recognizes.
100    pub known_commands: usize,
101    /// Programming languages with curated agentic profiles (name + fitness).
102    pub languages: Vec<SubjectDoc>,
103    /// AI frameworks with curated agentic profiles (name + fitness).
104    pub frameworks: Vec<SubjectDoc>,
105    /// VM/sandbox systems with curated agentic profiles (name + fitness).
106    pub vms: Vec<SubjectDoc>,
107    /// Web stacks / wire protocols with curated agentic profiles
108    /// (name + fitness).
109    pub web_stacks: Vec<SubjectDoc>,
110}
111
/// A profiled evaluation subject (language, framework, VM/sandbox system, or
/// web stack / wire protocol) — compact index entry; expand with [`describe`]
/// for full per-axis scores and evidence.
///
/// Implements `PartialEq` (not `Eq`: `fitness` is a float) so index entries can
/// be compared structurally.
#[cfg_attr(feature = "serde", derive(serde::Serialize))]
#[derive(Debug, Clone, PartialEq)]
pub struct SubjectDoc {
    /// Canonical name (`rust`, `pytorch`, …).
    pub name: &'static str,
    /// Composite agentic fitness (0.0–1.0, mean of axes).
    pub fitness: f64,
}
122
123/// The four evaluation axes, in canonical order.
124pub fn axes() -> Vec<AxisDoc> {
125    vec![
126        AxisDoc {
127            name: "tokens",
128            summary: "token efficiency: the four cost terms an agent pays — standing \
129                      context, input, output, retries — amortized over a session; plus \
130                      output scaling (per-item cost) and prompt-cache savings",
131            entry_points: &[
132                "evaluate",
133                "evaluate_with",
134                "compare",
135                "rank",
136                "rank_with",
137                "assess_scaling",
138                "assess_cache",
139                "cacheable_prefix_tokens",
140            ],
141            needs_execution: false,
142            output_type: "AgentCost | ScalingReport | CacheReport",
143        },
144        AxisDoc {
145            name: "determinism",
146            summary: "whether a program's output is byte-identical across repeated runs \
147                      (so an agent can parse, cache, and diff it)",
148            entry_points: &["assess_determinism", "stable_across"],
149            needs_execution: true,
150            output_type: "DeterminismReport",
151        },
152        AxisDoc {
153            name: "reliability",
154            summary: "success rate over representative invocations, whether failures are \
155                      structured/actionable rather than dead ends, and graded error \
156                      quality (code/message/location/fix)",
157            entry_points: &["assess_reliability", "assess_error_quality"],
158            needs_execution: true,
159            output_type: "ReliabilityReport | ErrorQualityReport",
160        },
161        AxisDoc {
162            name: "safety",
163            summary: "the fraction of a program's dangerous blast radius that is gated \
164                      (approval/denied) under an agent policy; plus reversibility \
165                      (recoverable blast radius) and data-exfiltration exposure",
166            entry_points: &[
167                "assess_safety",
168                "assess_safety_named",
169                "assess_safety_script",
170                "assess_reversibility",
171                "assess_exfiltration",
172            ],
173            needs_execution: false,
174            output_type: "SafetyReport | ReversibilityReport | ExfiltrationReport",
175        },
176    ]
177}
178
179/// The effect taxonomy, each annotated with the policy decision it gets under each
180/// mode and a few example commands the built-in classifier maps to it.
181pub fn effects() -> Vec<EffectDoc> {
182    Effect::all()
183        .into_iter()
184        .map(|e| EffectDoc {
185            name: e.name(),
186            summary: e.summary(),
187            dangerous: e.is_dangerous(),
188            human_decision: e.decision(Mode::Human).name(),
189            agent_decision: e.decision(Mode::Agent).name(),
190            example_commands: commands_for(e).iter().take(4).copied().collect(),
191        })
192        .collect()
193}
194
195/// The tokenizer models token efficiency can be counted under (exact or estimated).
196pub fn models() -> Vec<ModelDoc> {
197    Model::all()
198        .into_iter()
199        .map(|m| ModelDoc {
200            name: m.name(),
201            exact: m.is_exact(),
202        })
203        .collect()
204}
205
206/// Compact index of the profiled programming languages.
207pub fn languages() -> Vec<SubjectDoc> {
208    crate::languages::Language::all()
209        .iter()
210        .map(|&l| {
211            let p = crate::languages::profile(l);
212            SubjectDoc {
213                name: l.name(),
214                fitness: p.fitness(),
215            }
216        })
217        .collect()
218}
219
220/// Compact index of the profiled AI frameworks.
221pub fn frameworks() -> Vec<SubjectDoc> {
222    crate::frameworks::Framework::all()
223        .iter()
224        .map(|&f| {
225            let p = crate::frameworks::profile(f);
226            SubjectDoc {
227                name: f.name(),
228                fitness: p.fitness(),
229            }
230        })
231        .collect()
232}
233
234/// Compact index of the profiled VM/sandbox systems.
235pub fn vms() -> Vec<SubjectDoc> {
236    crate::vms::Vm::all()
237        .iter()
238        .map(|&v| {
239            let p = crate::vms::profile(v);
240            SubjectDoc {
241                name: v.name(),
242                fitness: p.fitness(),
243            }
244        })
245        .collect()
246}
247
248/// Compact index of the profiled web stacks / wire protocols.
249pub fn web_stacks() -> Vec<SubjectDoc> {
250    crate::web::WebStack::all()
251        .iter()
252        .map(|&w| {
253            let p = crate::web::profile(w);
254            SubjectDoc {
255                name: w.name(),
256                fitness: p.fitness(),
257            }
258        })
259        .collect()
260}
261
262/// The complete structured ontology of the crate.
263pub fn ontology() -> Ontology {
264    Ontology {
265        crate_name: "agentic-eval",
266        version: VERSION,
267        summary: "evaluate programs, programming languages, AI frameworks, \
268                  VM/sandbox systems, and web stacks / wire protocols for \
269                  agentic AI use across four axes — token efficiency, \
270                  determinism, reliability, and safety (frameworks add \
271                  discoverability; VM systems use agent-native axes: \
272                  start-latency, density, isolation, snapshotting, agent-control; \
273                  web stacks use streaming, tool-discoverability, \
274                  encoding-efficiency, interop, security-primitives)",
275        axes: axes(),
276        effects: effects(),
277        modes: Mode::all().iter().map(|m| m.name()).collect(),
278        models: models(),
279        known_commands: known_command_count(),
280        languages: languages(),
281        frameworks: frameworks(),
282        vms: vms(),
283        web_stacks: web_stacks(),
284    }
285}
286
287/// A compact, deterministic **manifest** — the cheap discovery root. Lists the axes,
288/// effect classes, modes, models, and command count, plus how to expand any entry
289/// with [`describe`]. A few hundred tokens; the agentic-first entry point.
290pub fn manifest() -> String {
291    let o = ontology();
292    let mut s = String::new();
293    s.push_str(&format!("{} {} — {}\n", o.crate_name, o.version, o.summary));
294    s.push_str("axes: ");
295    s.push_str(&o.axes.iter().map(|a| a.name).collect::<Vec<_>>().join(", "));
296    s.push_str(&format!("\neffects({}): ", o.effects.len()));
297    s.push_str(
298        &o.effects
299            .iter()
300            .map(|e| e.name)
301            .collect::<Vec<_>>()
302            .join(" "),
303    );
304    s.push_str("\nmodes: ");
305    s.push_str(&o.modes.join(", "));
306    s.push_str("\nmodels: ");
307    s.push_str(
308        &o.models
309            .iter()
310            .map(|m| m.name)
311            .collect::<Vec<_>>()
312            .join(", "),
313    );
314    s.push_str(&format!(
315        "\ncommands: {} classified across {} effect classes",
316        o.known_commands,
317        o.effects.len()
318    ));
319    s.push_str(&format!("\nlanguages({}): ", o.languages.len()));
320    s.push_str(
321        &o.languages
322            .iter()
323            .map(|l| l.name)
324            .collect::<Vec<_>>()
325            .join(" "),
326    );
327    s.push_str(&format!("\nframeworks({}): ", o.frameworks.len()));
328    s.push_str(
329        &o.frameworks
330            .iter()
331            .map(|f| f.name)
332            .collect::<Vec<_>>()
333            .join(" "),
334    );
335    s.push_str(&format!("\nvms({}): ", o.vms.len()));
336    s.push_str(&o.vms.iter().map(|v| v.name).collect::<Vec<_>>().join(" "));
337    s.push_str(&format!("\nweb_stacks({}): ", o.web_stacks.len()));
338    s.push_str(
339        &o.web_stacks
340            .iter()
341            .map(|w| w.name)
342            .collect::<Vec<_>>()
343            .join(" "),
344    );
345    s.push_str(
346        "\ndescribe(<axis|effect|model|language|framework|vm|web|\"axes\"|\"effects\"|\"models\"|\"languages\"|\"frameworks\"|\"vms\"|\"web\">) for detail",
347    );
348    s
349}
350
351/// Expand one ontology entry by name (case-insensitive): an axis (`tokens`…), an
352/// effect (`destructive`…), a model (`gpt4`/`claude`…), or a section keyword
353/// (`axes` / `effects` / `models` / `modes` / `commands`). Returns `None` for an
354/// unknown query — the [`manifest`] lists every valid name.
355pub fn describe(query: &str) -> Option<String> {
356    let q = query.trim().to_ascii_lowercase();
357    let o = ontology();
358
359    // Section keywords expand the whole group.
360    match q.as_str() {
361        "axes" => {
362            return Some(
363                o.axes
364                    .iter()
365                    .map(describe_axis)
366                    .collect::<Vec<_>>()
367                    .join("\n"),
368            )
369        }
370        "effects" => {
371            return Some(
372                o.effects
373                    .iter()
374                    .map(describe_effect)
375                    .collect::<Vec<_>>()
376                    .join("\n"),
377            )
378        }
379        "models" => {
380            return Some(
381                o.models
382                    .iter()
383                    .map(|m| format!("{} (exact={})", m.name, m.exact))
384                    .collect::<Vec<_>>()
385                    .join("\n"),
386            )
387        }
388        "modes" => return Some(o.modes.join(", ")),
389        "commands" => {
390            return Some(format!(
391                "{} CLI commands classified; describe an effect (e.g. \"network\") for examples",
392                o.known_commands
393            ))
394        }
395        "languages" => {
396            return Some(
397                crate::languages::rank_languages()
398                    .iter()
399                    .map(|p| p.to_string())
400                    .collect::<Vec<_>>()
401                    .join("\n"),
402            )
403        }
404        "frameworks" => {
405            return Some(
406                crate::frameworks::rank_frameworks()
407                    .iter()
408                    .map(|p| p.to_string())
409                    .collect::<Vec<_>>()
410                    .join("\n"),
411            )
412        }
413        "vms" => {
414            return Some(
415                crate::vms::rank_vms()
416                    .iter()
417                    .map(|p| p.to_string())
418                    .collect::<Vec<_>>()
419                    .join("\n"),
420            )
421        }
422        "web" | "web-stacks" | "web_stacks" => {
423            return Some(
424                crate::web::rank_web_stacks()
425                    .iter()
426                    .map(|p| p.to_string())
427                    .collect::<Vec<_>>()
428                    .join("\n"),
429            )
430        }
431        _ => {}
432    }
433
434    // A specific language (full profile + evidence).
435    if let Some(l) = crate::languages::Language::from_name(&q) {
436        let p = crate::languages::profile(l);
437        let mut s = p.to_string();
438        for e in &p.evidence {
439            s.push_str("\n  - ");
440            s.push_str(e);
441        }
442        return Some(s);
443    }
444    // A specific framework (full profile + evidence).
445    if let Some(fw) = crate::frameworks::Framework::from_name(&q) {
446        let p = crate::frameworks::profile(fw);
447        let mut s = p.to_string();
448        for e in &p.evidence {
449            s.push_str("\n  - ");
450            s.push_str(e);
451        }
452        return Some(s);
453    }
454    // A specific VM/sandbox system (full profile + evidence).
455    if let Some(v) = crate::vms::Vm::from_name(&q) {
456        let p = crate::vms::profile(v);
457        let mut s = p.to_string();
458        for e in &p.evidence {
459            s.push_str("\n  - ");
460            s.push_str(e);
461        }
462        return Some(s);
463    }
464    // A specific web stack / wire protocol (full profile + evidence).
465    if let Some(w) = crate::web::WebStack::from_name(&q) {
466        let p = crate::web::profile(w);
467        let mut s = p.to_string();
468        for e in &p.evidence {
469            s.push_str("\n  - ");
470            s.push_str(e);
471        }
472        return Some(s);
473    }
474
475    // A specific axis.
476    if let Some(a) = o.axes.iter().find(|a| a.name == q) {
477        return Some(describe_axis(a));
478    }
479    // A specific effect (by canonical name).
480    if let Some(e) =
481        Effect::from_name(&q).and_then(|e| o.effects.iter().find(|d| d.name == e.name()))
482    {
483        return Some(describe_effect(e));
484    }
485    // A specific model (accepts the same aliases as Model::from_name).
486    if let Some(m) = Model::from_name(&q) {
487        return Some(format!("{} (exact={})", m.name(), m.is_exact()));
488    }
489    None
490}
491
492fn describe_axis(a: &AxisDoc) -> String {
493    format!(
494        "axis {}: {}\n  output: {}  needs_execution: {}\n  entry_points: {}",
495        a.name,
496        a.summary,
497        a.output_type,
498        a.needs_execution,
499        a.entry_points.join(", ")
500    )
501}
502
503fn describe_effect(e: &EffectDoc) -> String {
504    format!(
505        "effect {}: {}\n  dangerous: {}  human={}  agent={}\n  e.g. {}",
506        e.name,
507        e.summary,
508        e.dangerous,
509        e.human_decision,
510        e.agent_decision,
511        if e.example_commands.is_empty() {
512            "(none)".to_string()
513        } else {
514            e.example_commands.join(", ")
515        }
516    )
517}
518
519impl std::fmt::Display for Ontology {
520    /// The manifest followed by every axis and effect expanded — a complete,
521    /// human-readable dump of the ontology.
522    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
523        writeln!(f, "{}", manifest())?;
524        writeln!(f, "\n# axes")?;
525        for a in &self.axes {
526            writeln!(f, "{}", describe_axis(a))?;
527        }
528        writeln!(f, "\n# effects")?;
529        for e in &self.effects {
530            writeln!(f, "{}", describe_effect(e))?;
531        }
532        Ok(())
533    }
534}
535
#[cfg(test)]
mod tests {
    use super::*;

    /// The manifest names every axis, effect and subject group, and stays small.
    #[test]
    fn manifest_is_compact_and_lists_every_section() {
        let m = manifest();

        // Crate identity.
        assert!(m.contains("agentic-eval"));
        assert!(m.contains(VERSION));

        // Axes and effects are all named.
        for axis in ["tokens", "determinism", "reliability", "safety"] {
            assert!(m.contains(axis), "manifest lists axis {axis}: {m}");
        }
        for e in Effect::all() {
            assert!(m.contains(e.name()), "manifest lists effect {}", e.name());
        }

        // Each subject group has a counted index line…
        assert!(m.contains("languages("), "manifest lists languages");
        assert!(m.contains("frameworks("), "manifest lists frameworks");
        assert!(m.contains("vms("), "manifest lists vms");
        assert!(m.contains("web_stacks("), "manifest lists web stacks");
        // …that names representative members.
        assert!(m.contains("mechgen") && m.contains("rmi"));
        assert!(m.contains("aethervm") && m.contains("firecracker"));
        assert!(m.contains("spine") && m.contains("grpc"));

        // Size budget: a discovery root, not a documentation dump.
        assert!(m.len() < 1800, "manifest stays compact ({} bytes)", m.len());
    }

    /// Subject groups expand to ranked tables; single subjects (including
    /// aliases) expand to a profile followed by evidence bullets.
    #[test]
    fn describe_expands_languages_frameworks_vms_and_web() {
        // Whole-group expansions.
        let language_table = describe("languages").unwrap();
        assert!(language_table.contains("rust") && language_table.contains("fitness"));
        let framework_table = describe("frameworks").unwrap();
        assert!(framework_table.contains("pytorch") && framework_table.contains("discoverability"));
        let vm_table = describe("vms").unwrap();
        assert!(vm_table.contains("firecracker") && vm_table.contains("agent-control"));
        let web = describe("web").unwrap();
        assert!(
            web.contains("spine") && web.contains("streaming"),
            "describe(\"web\") should list ranked web stacks with the streaming axis"
        );
        assert_eq!(
            describe("web-stacks").unwrap(),
            web,
            "describe(\"web-stacks\") alias matches describe(\"web\")"
        );

        // Single-subject expansions.
        let rust = describe("rust").unwrap();
        assert!(rust.contains("reliability") && rust.contains("\n  - "));
        let torch = describe("torch").unwrap(); // framework alias
        assert!(torch.contains("pytorch"));
        let aether = describe("aethervm").unwrap();
        assert!(aether.contains("snapshot") && aether.contains("\n  - "));
        assert!(describe("kvm").unwrap().contains("qemu-kvm")); // vm alias
        let spine = describe("spine").unwrap();
        assert!(
            spine.contains("fitness") && spine.contains("\n  - "),
            "describe(\"spine\") expands to profile + evidence bullets"
        );
        assert!(describe("openai").unwrap().contains("openai-api")); // web alias

        // Effects must still resolve even though subjects are checked first.
        // Lookup order: sections → languages → frameworks → vms → web →
        // axes → effects → models. A subject name would shadow an effect on
        // collision; effect/axis names are disjoint from subject names today.
        assert!(describe("destructive").unwrap().contains("agent="));
    }

    /// Section sizes match the underlying enums and the effect docs carry the
    /// real policy decisions.
    #[test]
    fn ontology_is_complete_and_consistent() {
        let catalog = ontology();
        assert_eq!(catalog.axes.len(), 4);
        assert_eq!(catalog.effects.len(), 8); // every Effect variant
        assert_eq!(catalog.modes.len(), 2);
        assert_eq!(catalog.models.len(), 4);
        assert_eq!(catalog.web_stacks.len(), 7); // every WebStack variant
        assert!(
            catalog.web_stacks.iter().any(|w| w.name == "spine"),
            "web_stacks index includes SPINE"
        );
        assert!(
            catalog.known_commands > 100,
            "classifier ontology is substantial"
        );

        // Policy decisions recorded for two representative effects.
        let destructive = catalog
            .effects
            .iter()
            .find(|e| e.name == "destructive")
            .unwrap();
        assert!(destructive.dangerous);
        assert_eq!(destructive.human_decision, "allow");
        assert_eq!(destructive.agent_decision, "approve");
        let privileged = catalog
            .effects
            .iter()
            .find(|e| e.name == "privileged")
            .unwrap();
        assert_eq!(privileged.agent_decision, "deny");
    }

    /// Axes, effects, models and section keywords all expand; unknown names
    /// do not.
    #[test]
    fn describe_expands_axes_effects_models_and_keywords() {
        assert!(describe("safety").unwrap().contains("assess_safety"));
        assert!(describe("TOKENS").unwrap().contains("AgentCost")); // case-insensitive
        let destructive = describe("destructive").unwrap();
        assert!(destructive.contains("agent=approve"));
        // A model alias resolves to its tokenizer family.
        assert!(describe("gpt4").unwrap().contains("cl100k"));
        // Section keywords expand the whole group.
        assert!(describe("effects").unwrap().contains("privileged"));
        assert!(describe("models").unwrap().contains("heuristic"));
        // Anything else is rejected.
        assert!(describe("does-not-exist").is_none());
    }

    /// Repeated calls render byte-identical text.
    #[test]
    fn manifest_and_describe_are_deterministic() {
        assert_eq!(manifest(), manifest());
        assert_eq!(describe("effects"), describe("effects"));
        // Same for the full Display dump.
        assert_eq!(ontology().to_string(), ontology().to_string());
    }

    /// `VERSION` is the Cargo package version and appears in the manifest.
    #[test]
    fn version_matches_the_crate() {
        assert_eq!(VERSION, env!("CARGO_PKG_VERSION"));
        assert!(manifest().contains(env!("CARGO_PKG_VERSION")));
    }
}