Skip to main content

agentic_eval/
frameworks.rs

1//! Evaluating **AI frameworks** for agentic AI use.
2//!
3//! Where [`languages`](crate::languages) profiles the language an agent writes
4//! in, this module profiles the *AI framework* an agent builds with — the
5//! library it must discover, drive, and debug autonomously. Same four axes,
6//! framework-flavored:
7//!
8//! - **token efficiency** — how many tokens a working model/pipeline costs
9//!   (API verbosity, config boilerplate, import surface).
10//! - **determinism** — seeded-run reproducibility, version stability, and
11//!   whether artifacts (checkpoints, graphs) are byte-stable.
12//! - **reliability** — when the agent misuses the API, does it get an early
13//!   structured error (shape checks at graph build) or a runtime tensor
14//!   explosion three layers deep?
15//! - **safety** — does loading/running third-party artifacts execute arbitrary
16//!   code (pickle!), and is the compute surface effect-gated?
17//!
18//! Plus one framework-specific axis the others don't need:
19//!
20//! - **discoverability** — can an agent learn the surface *from the framework
21//!   itself* (machine-readable schemas/ontology, introspectable ops, stable
22//!   programmatic docs) instead of scraping prose?
23//!
24//! Profiles are curated 0.0–1.0 static judgments with `evidence`, like the
25//! language profiles — deterministic, serializable, comparable.
26//!
27//! ```
28//! use agentic_eval::frameworks::{profile, rank_frameworks, Framework};
29//! let torch = profile(Framework::PyTorch);
30//! assert!(torch.evidence.len() >= 3);
31//! let ranked = rank_frameworks();
32//! assert!(ranked[0].fitness() >= ranked[ranked.len() - 1].fitness());
33//! ```
34
/// AI frameworks with curated agentic profiles.
///
/// Each variant has a canonical lowercase name ([`Framework::name`]) and can
/// be parsed back from that name or one of its aliases
/// ([`Framework::from_name`]).
#[cfg_attr(feature = "serde", derive(serde::Serialize))]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Framework {
    /// PyTorch (canonical name `pytorch`).
    PyTorch,
    /// TensorFlow (canonical name `tensorflow`).
    TensorFlow,
    /// JAX (canonical name `jax`).
    Jax,
    /// Hugging Face Transformers (canonical name `transformers`).
    HuggingFaceTransformers,
    /// ONNX Runtime (canonical name `onnxruntime`).
    OnnxRuntime,
    /// scikit-learn (canonical name `sklearn`).
    ScikitLearn,
    /// Candle (canonical name `candle`).
    Candle,
    /// Burn (canonical name `burn`).
    Burn,
    /// RecursiveMachineIntelligence (RMI) — the built-in agentic-first framework of
    /// MachineGenetics (MechGen): self-describing ontology + manifest, binary-first
    /// effect-typed compute. Scored on the same axes as everything else.
    /// Canonical name `rmi`.
    FramewerxRmi,
}

impl Framework {
    /// Every profiled framework, in declaration order.
    ///
    /// The order is fixed, so anything derived from it is deterministic.
    pub fn all() -> [Framework; 9] {
        [
            Self::PyTorch,
            Self::TensorFlow,
            Self::Jax,
            Self::HuggingFaceTransformers,
            Self::OnnxRuntime,
            Self::ScikitLearn,
            Self::Candle,
            Self::Burn,
            Self::FramewerxRmi,
        ]
    }

    /// Canonical lowercase name.
    ///
    /// Passing the returned string to [`Framework::from_name`] yields the same
    /// variant again.
    pub fn name(self) -> &'static str {
        match self {
            Self::PyTorch => "pytorch",
            Self::TensorFlow => "tensorflow",
            Self::Jax => "jax",
            Self::HuggingFaceTransformers => "transformers",
            Self::OnnxRuntime => "onnxruntime",
            Self::ScikitLearn => "sklearn",
            Self::Candle => "candle",
            Self::Burn => "burn",
            Self::FramewerxRmi => "rmi",
        }
    }

    /// Parse a framework name, ignoring ASCII case.
    ///
    /// Accepted spellings (canonical name first):
    ///
    /// - `pytorch`, `torch`
    /// - `tensorflow`, `tf`
    /// - `jax`
    /// - `transformers`, `hf`, `huggingface`
    /// - `onnxruntime`, `onnx`, `ort`
    /// - `sklearn`, `scikit-learn`, `scikit`
    /// - `candle`
    /// - `burn`
    /// - `rmi`, `recursivemachineintelligence`, `framewerx`
    ///
    /// Returns `None` for anything else. The input is compared as given: it is
    /// not trimmed, so surrounding whitespace makes the lookup fail.
    pub fn from_name(name: &str) -> Option<Framework> {
        // Every spelling is lowercase ASCII, so `eq_ignore_ascii_case` matches
        // exactly what lowercasing the input and comparing would match, without
        // allocating a temporary `String` per lookup.
        const SPELLINGS: &[(&str, Framework)] = &[
            ("pytorch", Framework::PyTorch),
            ("torch", Framework::PyTorch),
            ("tensorflow", Framework::TensorFlow),
            ("tf", Framework::TensorFlow),
            ("jax", Framework::Jax),
            ("transformers", Framework::HuggingFaceTransformers),
            ("hf", Framework::HuggingFaceTransformers),
            ("huggingface", Framework::HuggingFaceTransformers),
            ("onnxruntime", Framework::OnnxRuntime),
            ("onnx", Framework::OnnxRuntime),
            ("ort", Framework::OnnxRuntime),
            ("sklearn", Framework::ScikitLearn),
            ("scikit-learn", Framework::ScikitLearn),
            ("scikit", Framework::ScikitLearn),
            ("candle", Framework::Candle),
            ("burn", Framework::Burn),
            ("rmi", Framework::FramewerxRmi),
            ("recursivemachineintelligence", Framework::FramewerxRmi),
            ("framewerx", Framework::FramewerxRmi),
        ];
        SPELLINGS
            .iter()
            .find(|(spelling, _)| spelling.eq_ignore_ascii_case(name))
            .map(|&(_, fw)| fw)
    }
}
102
/// A curated agentic profile of an AI framework: the four shared axes plus
/// framework-specific **discoverability**, with evidence.
///
/// Axis scores are static judgments on a 0.0–1.0 scale, where higher means a
/// better fit for agentic use. [`FrameworkProfile::fitness`] averages the five
/// axes into a single number.
#[cfg_attr(feature = "serde", derive(serde::Serialize))]
#[derive(Debug, Clone)]
pub struct FrameworkProfile {
    /// Which framework this profiles.
    pub framework: Framework,
    /// Token cost of a working model/pipeline (1.0 = very compact).
    pub token_efficiency: f64,
    /// Seeded reproducibility + artifact/version stability.
    pub determinism: f64,
    /// Early, structured failure on API misuse (vs late tensor explosions).
    pub reliability: f64,
    /// Artifact-loading and execution blast radius (pickle ≈ arbitrary code).
    pub safety: f64,
    /// Can an agent learn the surface from the framework itself
    /// (schemas, ontology, introspection) instead of prose docs?
    pub discoverability: f64,
    /// Why: one evidence string per notable factor.
    pub evidence: Vec<&'static str>,
}
124
125impl FrameworkProfile {
126    /// Composite agentic fitness: unweighted mean of all five axes.
127    pub fn fitness(&self) -> f64 {
128        (self.token_efficiency
129            + self.determinism
130            + self.reliability
131            + self.safety
132            + self.discoverability)
133            / 5.0
134    }
135}
136
137impl std::fmt::Display for FrameworkProfile {
138    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
139        write!(
140            f,
141            "{}: fitness {:.2} (tokens {:.2}, determinism {:.2}, reliability {:.2}, safety {:.2}, discoverability {:.2})",
142            self.framework.name(),
143            self.fitness(),
144            self.token_efficiency,
145            self.determinism,
146            self.reliability,
147            self.safety,
148            self.discoverability
149        )
150    }
151}
152
153/// The curated profile for `fw` (static, documented judgments — see module docs).
154pub fn profile(fw: Framework) -> FrameworkProfile {
155    match fw {
156        Framework::PyTorch => FrameworkProfile {
157            framework: fw,
158            token_efficiency: 0.7,
159            determinism: 0.5,
160            reliability: 0.5,
161            safety: 0.3,
162            discoverability: 0.5,
163            evidence: vec![
164                "dominant in LLM training data: agents emit competent PyTorch with few tokens of guidance",
165                "eager execution defers shape errors to runtime, mid-forward — late self-correction signal",
166                "determinism requires opt-in flags (use_deterministic_algorithms) and still has CUDA caveats",
167                "torch.load = pickle = arbitrary code execution on artifact load (weights_only mitigates, not default historically)",
168                "good runtime introspection (modules walkable) but no machine-readable surface schema",
169            ],
170        },
171        Framework::TensorFlow => FrameworkProfile {
172            framework: fw,
173            token_efficiency: 0.5,
174            determinism: 0.55,
175            reliability: 0.55,
176            safety: 0.45,
177            discoverability: 0.5,
178            evidence: vec![
179                "graph mode catches shape errors at build, but the Keras/TF1/TF2 API strata cost agent tokens and confusion",
180                "SavedModel is a real schema'd artifact (better than pickle)",
181                "version churn between majors broke much trained-data knowledge",
182                "op-level determinism is opt-in and incomplete on GPU",
183            ],
184        },
185        Framework::Jax => FrameworkProfile {
186            framework: fw,
187            token_efficiency: 0.65,
188            determinism: 0.85,
189            reliability: 0.6,
190            safety: 0.55,
191            discoverability: 0.45,
192            evidence: vec![
193                "functional purity + explicit PRNG keys: the most reproducible mainstream choice",
194                "jit tracing errors (abstract tracer leaks) are notoriously confusing for agents",
195                "compact numpy-like surface, but the ecosystem (flax/optax/orbax) adds standing context",
196                "no pickle-by-default artifacts; checkpoint formats are schema'd",
197            ],
198        },
199        Framework::HuggingFaceTransformers => FrameworkProfile {
200            framework: fw,
201            token_efficiency: 0.85,
202            determinism: 0.45,
203            reliability: 0.5,
204            safety: 0.4,
205            discoverability: 0.7,
206            evidence: vec![
207                "pipeline()/AutoModel: a working LLM in ~3 lines — best token economy of the set",
208                "Hub model cards + config.json are machine-readable (good discoverability)",
209                "trust_remote_code executes arbitrary hub code; safetensors fixed weights but custom code remains the hole",
210                "version pinning matters: behavior drifts across releases; remote artifacts mutate",
211            ],
212        },
213        Framework::OnnxRuntime => FrameworkProfile {
214            framework: fw,
215            token_efficiency: 0.6,
216            determinism: 0.8,
217            reliability: 0.7,
218            safety: 0.75,
219            discoverability: 0.75,
220            evidence: vec![
221                "ONNX graphs are fully schema'd protobuf: an agent can introspect every op/shape without running",
222                "inference-only scope: small, stable API; graph validation catches malformed models at load",
223                "no code execution in artifacts (data-only format) — the safest artifact story here",
224                "training support is marginal; agents needing training must go elsewhere",
225            ],
226        },
227        Framework::ScikitLearn => FrameworkProfile {
228            framework: fw,
229            token_efficiency: 0.8,
230            determinism: 0.75,
231            reliability: 0.7,
232            safety: 0.35,
233            discoverability: 0.6,
234            evidence: vec![
235                "fit/predict uniformity: one API shape across ~all estimators (cheap for agents to generalize)",
236                "random_state threading gives easy reproducibility",
237                "get_params()/set_params() is machine-walkable; estimator tags exist but are semi-private",
238                "joblib/pickle persistence = arbitrary code execution on load",
239            ],
240        },
241        Framework::Candle => FrameworkProfile {
242            framework: fw,
243            token_efficiency: 0.55,
244            determinism: 0.8,
245            reliability: 0.75,
246            safety: 0.7,
247            discoverability: 0.4,
248            evidence: vec![
249                "Rust: compile-time dimension/type errors catch agent mistakes pre-run; cargo reproducibility",
250                "safetensors-native (data-only artifacts)",
251                "far less training-data representation: agents need more tokens of guidance than for PyTorch",
252                "smaller op surface; no machine-readable self-description",
253            ],
254        },
255        Framework::Burn => FrameworkProfile {
256            framework: fw,
257            token_efficiency: 0.5,
258            determinism: 0.8,
259            reliability: 0.8,
260            safety: 0.7,
261            discoverability: 0.45,
262            evidence: vec![
263                "type-state tensors (rank/dtype in the type) catch shape misuse at compile time — strongest static reliability of the set",
264                "backend-generic (wgpu/candle/ndarray) with cargo-locked reproducibility",
265                "youngest ecosystem; thin training-data presence costs agent tokens",
266                "derive-macro module system is introspectable in-code but lacks a runtime ontology",
267            ],
268        },
269        // The defining premise of this profile: for agentic ML use the model
270        // artifact IS the binary Agentic Binary Language IR — what the agent emits, ships, loads,
271        // and introspects — not text source. Axes are anchored to MEASURED
272        // numbers (MechGen benchmarks/IR_ARTIFACT_REPORT.md), not estimates.
273        Framework::FramewerxRmi => FrameworkProfile {
274            framework: fw,
275            // CORRECTED 0.80→0.75. The byte win (144 B vs 440 B text) is real
276            // for STORAGE/TRANSPORT/LOAD but is NOT a token win for an LLM
277            // EMITTING the model: measured, Agentic Binary Language-as-base64 ≈ 106 tokens and hex
278            // ≈ 144, vs ~134 text tokens — the byte advantage evaporates under
279            // base64/hex emission (LLMs emit tokens, not raw bytes). The only
280            // genuine token edge over PyTorch (0.70) is zero import/config
281            // boilerplate (you write the `net {}` block, no `import torch`). So
282            // ~0.75, not 0.80. The byte compaction is credited where it actually
283            // pays off — determinism (byte-stable) and safety (no-exec load).
284            token_efficiency: 0.75,
285            // Measured byte-identical across emissions (cmp → identical):
286            // content-hashable cache keys, meaningful diffs. Exceeds PyTorch
287            // (0.50), whose pickle artifacts / runs aren't byte-stable.
288            determinism: 0.9,
289            // Agent emits structured BYTES, not text — the text syntax-error
290            // class is gone; plus shape inference + typed Result on every
291            // Backend op and exact-F32 fallback for quant/half paths. +0.02 →
292            // 0.86: the tool-mediated construction layer (`--build=abl`) is now
293            // PROPERTY-VERIFIED reject-by-construction over 6000 generated specs
294            // — no structurally-valid net is ever spuriously refused (3000
295            // cases lower to clean-resolving, deterministically-constructed
296            // source) AND no invalid net ever reaches an artifact (3000 cases:
297            // unknown op / wrong arity / non-positive dim / shape mismatch each
298            // caught with a machine-readable code+fix BEFORE construction). This
299            // is the framework analogue of the language's verified soundness.
300            // Held at 0.86 (below ~0.90): prototype; property-tested, not proven;
301            // and stronger than Burn's type-state reliability (0.80) because the
302            // errors are machine-actionable, not Rust type-error prose.
303            reliability: 0.86,
304            // Verified: Agentic Binary Language decode is pure bounds-checked data — loading a
305            // model CANNOT execute code (`--from=abl-bytes` round-trips
306            // structure without running). Contrast torch.load=pickle=arbitrary
307            // code (PyTorch safety 0.30). Plus effect-typed compute.
308            safety: 0.88,
309            discoverability: 0.95,
310            evidence: vec![
311                "token (MEASURED, honest): the 144 B Agentic Binary Language artifact is 56–67% smaller than text for STORAGE/TRANSPORT/LOAD, but that byte win does NOT survive LLM emission — base64 ≈ 106 tokens / hex ≈ 144 vs ~134 text tokens. The real token edge over PyTorch is just zero import/config boilerplate (write the `net {}` block, no `import`). Byte compaction is credited under determinism/safety, not token",
312                "deterministic artifact (MEASURED byte-identical across emissions): content-hashable cache keys + meaningful diffs; deterministic ontology/manifest. Exceeds frameworks whose artifacts/runs aren't byte-stable",
313                "reliability (PROPERTY-VERIFIED): emitting structured IR bytes removes the text syntax-error class; the tool-mediated `--build=abl` layer is reject-by-construction, verified over 6000 generated specs — every valid net lowers to clean, deterministically-constructed source; every invalid net (unknown op / wrong arity / bad dims / shape mismatch) is caught with a machine-readable code+fix BEFORE any artifact exists. Plus shape inference + typed Result on every Backend op; quant/half paths fall back to exact F32 rather than silently degrading",
314                "safety (VERIFIED): Agentic Binary Language load is bounds-checked data decode with NO code execution (vs torch.load=pickle=arbitrary code); effect-typed compute; driver-checked CUDA construction",
315                "discoverable from itself: FrameworkOntology + token-compact manifest()/describe(), plus a typed self-describing construction schema (`--build=schema`: op catalog/arities/shape-rule/error-codes, deterministic + drift-guarded against the validator) and no-exec structured introspection (`--describe=abl`: decode the artifact as pure data into JSON). Young framework — minimal training-data presence, so agents rely on this self-description (the design bet)",
316            ],
317        },
318    }
319}
320
321/// Profiles for all frameworks, in [`Framework::all`] order (deterministic).
322pub fn profiles() -> Vec<FrameworkProfile> {
323    Framework::all().iter().map(|&f| profile(f)).collect()
324}
325
326/// All profiles ranked best-first by [`FrameworkProfile::fitness`]
327/// (stable order on ties).
328pub fn rank_frameworks() -> Vec<FrameworkProfile> {
329    let mut v = profiles();
330    v.sort_by(|a, b| {
331        b.fitness()
332            .partial_cmp(&a.fitness())
333            .unwrap_or(std::cmp::Ordering::Equal)
334    });
335    v
336}
337
/// Compare two frameworks: positive deltas mean `a` fits agentic use better.
///
/// Produced by [`compare_frameworks`]. Every delta is computed as subject
/// minus baseline (`a − b`).
#[cfg_attr(feature = "serde", derive(serde::Serialize))]
#[derive(Debug, Clone)]
pub struct FrameworkComparison {
    /// First framework (the subject).
    pub a: FrameworkProfile,
    /// Second framework (the baseline).
    pub b: FrameworkProfile,
    /// `a.fitness() - b.fitness()`.
    pub fitness_delta: f64,
    /// Axis name → delta (a − b), in fixed axis order.
    pub axis_deltas: Vec<(&'static str, f64)>,
}
351
352/// Compare framework `a` against baseline `b` across all five axes.
353pub fn compare_frameworks(a: Framework, b: Framework) -> FrameworkComparison {
354    let pa = profile(a);
355    let pb = profile(b);
356    let axis_deltas = vec![
357        ("tokens", pa.token_efficiency - pb.token_efficiency),
358        ("determinism", pa.determinism - pb.determinism),
359        ("reliability", pa.reliability - pb.reliability),
360        ("safety", pa.safety - pb.safety),
361        ("discoverability", pa.discoverability - pb.discoverability),
362    ];
363    FrameworkComparison {
364        fitness_delta: pa.fitness() - pb.fitness(),
365        a: pa,
366        b: pb,
367        axis_deltas,
368    }
369}
370
371impl std::fmt::Display for FrameworkComparison {
372    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
373        writeln!(
374            f,
375            "{} vs {}: fitness delta {:+.2}",
376            self.a.framework.name(),
377            self.b.framework.name(),
378            self.fitness_delta
379        )?;
380        for (axis, d) in &self.axis_deltas {
381            writeln!(f, "  {axis}: {d:+.2}")?;
382        }
383        Ok(())
384    }
385}
386
// Unit tests for the framework profiles. They pin relative judgments and
// structural invariants rather than exact scores.
#[cfg(test)]
mod tests {
    use super::*;

    /// RMI must beat PyTorch on determinism and safety, and the four listed
    /// RMI axes must stay below 0.98.
    #[test]
    fn rmi_ir_artifact_claims_stay_measured() {
        // The rmi profile's IR-as-artifact axes are anchored to measured facts
        // (benchmarks/IR_ARTIFACT_REPORT.md). These guard the *direction* of
        // those measured advantages — not a target number.
        let rmi = profile(Framework::FramewerxRmi);
        let torch = profile(Framework::PyTorch);
        // Measured byte-identical artifact ⇒ strictly more deterministic than
        // pickle-based PyTorch.
        assert!(rmi.determinism > torch.determinism, "byte-stable IR must beat pickle on determinism");
        // Verified no-exec data decode ⇒ strictly safer artifact than torch.load.
        assert!(rmi.safety > torch.safety, "no-pickle load must beat torch.load on safety");
        // Honesty caps: nothing maxed for a young framework.
        // NOTE(review): `discoverability` is not in the list below, so it is
        // not capped by this test — confirm whether that is intentional.
        for v in [rmi.token_efficiency, rmi.determinism, rmi.reliability, rmi.safety] {
            assert!(v < 0.98, "axis {v} implausibly high");
        }
    }

    /// Every framework has at least three evidence lines and all five axis
    /// scores inside the closed range 0.0..=1.0.
    #[test]
    fn every_framework_profiles_with_evidence() {
        for fw in Framework::all() {
            let p = profile(fw);
            assert!(
                p.evidence.len() >= 3,
                "{} needs ≥3 evidence lines",
                fw.name()
            );
            for s in [
                p.token_efficiency,
                p.determinism,
                p.reliability,
                p.safety,
                p.discoverability,
            ] {
                assert!((0.0..=1.0).contains(&s), "{} score out of range", fw.name());
            }
        }
    }

    /// Canonical names parse back to their variant, a few aliases resolve
    /// (including an upper-case one), and an unknown name yields `None`.
    #[test]
    fn from_name_roundtrip_and_aliases() {
        for fw in Framework::all() {
            assert_eq!(Framework::from_name(fw.name()), Some(fw));
        }
        assert_eq!(Framework::from_name("torch"), Some(Framework::PyTorch));
        assert_eq!(
            Framework::from_name("HF"),
            Some(Framework::HuggingFaceTransformers)
        );
        assert_eq!(Framework::from_name("rmi"), Some(Framework::FramewerxRmi));
        assert_eq!(Framework::from_name("caffe"), None);
    }

    /// Two rankings list the frameworks in the same order, and fitness never
    /// increases from one entry to the next.
    #[test]
    fn ranking_is_deterministic_and_sorted() {
        let r1 = rank_frameworks();
        let r2 = rank_frameworks();
        let n1: Vec<_> = r1.iter().map(|p| p.framework.name()).collect();
        let n2: Vec<_> = r2.iter().map(|p| p.framework.name()).collect();
        assert_eq!(n1, n2);
        // Adjacent pairs: each entry is at least as fit as the one after it.
        for w in r1.windows(2) {
            assert!(w[0].fitness() >= w[1].fitness());
        }
    }

    /// Pins the direction of four cross-framework axis comparisons.
    #[test]
    fn axis_judgments_hold_directionally() {
        let torch = profile(Framework::PyTorch);
        let jax = profile(Framework::Jax);
        let ort = profile(Framework::OnnxRuntime);
        let hf = profile(Framework::HuggingFaceTransformers);
        let burn = profile(Framework::Burn);
        assert!(
            jax.determinism > torch.determinism,
            "explicit PRNG keys beat opt-in flags"
        );
        assert!(ort.safety > torch.safety, "data-only artifacts beat pickle");
        assert!(
            hf.token_efficiency > burn.token_efficiency,
            "pipeline() in 3 lines beats young Rust ecosystem"
        );
        assert!(
            burn.reliability > torch.reliability,
            "type-state tensors catch shape misuse pre-run"
        );
    }

    /// The mean of the five axis deltas equals the fitness delta (within
    /// 1e-9), and the rendered comparison names both frameworks.
    #[test]
    fn comparison_deltas_are_consistent() {
        let cmp = compare_frameworks(Framework::FramewerxRmi, Framework::PyTorch);
        let sum: f64 = cmp.axis_deltas.iter().map(|(_, d)| d).sum();
        assert!((sum / 5.0 - cmp.fitness_delta).abs() < 1e-9);
        assert!(format!("{cmp}").contains("rmi vs pytorch"));
    }
}