Skip to main content

agentic_eval/
vms.rs

1//! Evaluating **VM / sandbox systems** for agentic AI use.
2//!
3//! An agent runtime does not run one long-lived VM; it spawns a *fleet* of
4//! short-lived, isolated execution environments — one per tool call, code run,
5//! or sub-agent — and tears them down. That workload rewards different
6//! properties than a classic datacenter VM, so this module scores VM/sandbox
7//! systems on five agent-native axes:
8//!
9//! - **start-latency** — how fast a fresh, isolated sandbox is ready. Agent
10//!   loops spawn constantly; cold-start dominates wall-clock.
11//! - **density** — sandboxes per host (per-instance memory/CPU overhead). Fleet
12//!   economics: how many concurrent agents fit on a box.
13//! - **isolation** — strength of the security boundary for *untrusted,
14//!   agent-generated* code. Hardware virtualization beats a shared kernel.
15//! - **snapshotting** — instant fork / snapshot-restore of a warm template
16//!   (copy-on-write), so an agent can branch a primed context per call and keep
17//!   warm pools.
18//! - **agent-control** — is the control plane programmatic and *agent/tool-native*
19//!   (an agent can discover and drive lifecycle directly), or bring-your-own glue?
20//!
21//! Profiles are curated 0.0–1.0 static judgments with `evidence`, like the
22//! [`languages`](crate::languages) and [`frameworks`](crate::frameworks)
23//! profiles — deterministic, serializable, comparable. Scores reflect each
24//! system's design center for the *ephemeral agent-sandbox* workload, not its
25//! fitness for every use; a great long-lived datacenter VM can still rank low
26//! here, and that is the point.
27//!
28//! ```
29//! use agentic_eval::vms::{profile, rank_vms, Vm};
30//! let fc = profile(Vm::Firecracker);
31//! assert!(fc.evidence.len() >= 3);
32//! let ranked = rank_vms();
33//! assert!(ranked[0].fitness() >= ranked[ranked.len() - 1].fitness());
34//! ```
35
/// VM / sandbox systems with curated agentic profiles.
///
/// Variants are declared in the same fixed order that [`Vm::all`] returns.
/// Each variant has a canonical lowercase name (see [`Vm::name`]) and a
/// curated profile (see [`profile`]).
#[cfg_attr(feature = "serde", derive(serde::Serialize))]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Vm {
    /// AetherVM (HyperMachine) — the agentic-first hypervisor this ecosystem
    /// ships: copy-on-write sandbox spawn, an MCP tool surface, and an agent
    /// runtime. Scored on the same axes as everything else.
    AetherVm,
    /// Firecracker — the KVM-based microVM monitor with a minimal device model.
    Firecracker,
    /// Cloud Hypervisor — a Rust VMM on KVM with a lean device model.
    CloudHypervisor,
    /// gVisor — a userspace kernel (the Sentry) that runs as the OCI runtime
    /// `runsc`.
    Gvisor,
    /// Kata Containers — OCI/CRI containers backed by a per-workload microVM.
    KataContainers,
    /// QEMU/KVM — full machine virtualization with a rich device model.
    QemuKvm,
    /// Docker — shared-kernel containers (namespaces + cgroups + seccomp).
    Docker,
}
52
53impl Vm {
54    /// All profiled VM/sandbox systems, in fixed (deterministic) order.
55    pub fn all() -> [Vm; 7] {
56        [
57            Vm::AetherVm,
58            Vm::Firecracker,
59            Vm::CloudHypervisor,
60            Vm::Gvisor,
61            Vm::KataContainers,
62            Vm::QemuKvm,
63            Vm::Docker,
64        ]
65    }
66
67    /// Canonical lowercase name.
68    pub fn name(self) -> &'static str {
69        match self {
70            Vm::AetherVm => "aethervm",
71            Vm::Firecracker => "firecracker",
72            Vm::CloudHypervisor => "cloud-hypervisor",
73            Vm::Gvisor => "gvisor",
74            Vm::KataContainers => "kata",
75            Vm::QemuKvm => "qemu-kvm",
76            Vm::Docker => "docker",
77        }
78    }
79
80    /// Parse a (case-insensitive) name; accepts common aliases
81    /// (`fc`, `chv`, `runsc`, `kvm`, `runc`, …).
82    pub fn from_name(name: &str) -> Option<Vm> {
83        match name.to_ascii_lowercase().as_str() {
84            "aethervm" | "aether" | "hypermachine" => Some(Vm::AetherVm),
85            "firecracker" | "fc" => Some(Vm::Firecracker),
86            "cloud-hypervisor" | "cloudhv" | "chv" => Some(Vm::CloudHypervisor),
87            "gvisor" | "runsc" => Some(Vm::Gvisor),
88            "kata" | "kata-containers" => Some(Vm::KataContainers),
89            "qemu-kvm" | "qemu" | "kvm" => Some(Vm::QemuKvm),
90            "docker" | "runc" | "containers" => Some(Vm::Docker),
91            _ => None,
92        }
93    }
94}
95
96/// A curated agentic profile of a VM/sandbox system across the five
97/// agent-native axes, with evidence.
98#[cfg_attr(feature = "serde", derive(serde::Serialize))]
99#[derive(Debug, Clone)]
100pub struct VmProfile {
101    /// Which system this profiles.
102    pub vm: Vm,
103    /// Cold-start speed of a fresh isolated sandbox (1.0 = sub-100ms).
104    pub start_latency: f64,
105    /// Sandboxes per host / low per-instance overhead (1.0 = microVM/container class).
106    pub density: f64,
107    /// Security-boundary strength for untrusted agent-generated code
108    /// (1.0 = hardware virtualization, minimal attack surface).
109    pub isolation: f64,
110    /// Instant CoW fork / snapshot-restore for agent branching and warm pools.
111    pub snapshotting: f64,
112    /// Agent/tool-native, discoverable control plane to drive lifecycle.
113    pub agent_control: f64,
114    /// Why: one evidence string per notable factor.
115    pub evidence: Vec<&'static str>,
116}
117
118impl VmProfile {
119    /// Composite agentic fitness: unweighted mean of all five axes.
120    pub fn fitness(&self) -> f64 {
121        (self.start_latency
122            + self.density
123            + self.isolation
124            + self.snapshotting
125            + self.agent_control)
126            / 5.0
127    }
128}
129
130impl std::fmt::Display for VmProfile {
131    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
132        write!(
133            f,
134            "{}: fitness {:.2} (start {:.2}, density {:.2}, isolation {:.2}, snapshot {:.2}, agent-control {:.2})",
135            self.vm.name(),
136            self.fitness(),
137            self.start_latency,
138            self.density,
139            self.isolation,
140            self.snapshotting,
141            self.agent_control
142        )
143    }
144}
145
146/// The curated profile for `vm` (static, documented judgments — see module docs).
147pub fn profile(vm: Vm) -> VmProfile {
148    match vm {
149        Vm::AetherVm => VmProfile {
150            vm,
151            start_latency: 0.8,
152            density: 0.85,
153            isolation: 0.8,
154            snapshotting: 0.9,
155            agent_control: 0.95,
156            evidence: vec![
157                "copy-on-write spawn from a warm template (SandboxPool) forks a sub-VM without a full boot; HV2 is hosted on KVM/WHPX/HVF",
158                "page-level CoW shares one base template across the fleet — high agent-sandbox density by design",
159                "hardware-assisted isolation (KVM/WHPX/HVF) for HV2 plus an optional bare-metal Type-1 (HV1); younger and less battle-tested at scale than Firecracker/QEMU",
160                "CoW memory templates + first-class snapshot/restore make instant agent branching the core primitive — fork a primed context per tool call",
161                "agentic-first control plane: a ~30-tool MCP registry projected to OpenAI/Anthropic/Gemini, an agent runtime, and a REST agent API — an agent drives lifecycle natively without bespoke glue",
162            ],
163        },
164        Vm::Firecracker => VmProfile {
165            vm,
166            start_latency: 0.9,
167            density: 0.9,
168            isolation: 0.85,
169            snapshotting: 0.8,
170            agent_control: 0.5,
171            evidence: vec![
172                "powers AWS Lambda/Fargate: ~125 ms boot to userspace — the microVM cold-start reference",
173                "minimal device model + jailer → a few MB of overhead and thousands of microVMs per host",
174                "KVM hardware isolation with a deliberately tiny attack surface; the most battle-tested microVM",
175                "snapshot/restore enables warm-clone pools for sandbox prewarming",
176                "control is a REST API over a unix socket — programmatic but not agent/tool-native; orchestration is bring-your-own",
177            ],
178        },
179        Vm::CloudHypervisor => VmProfile {
180            vm,
181            start_latency: 0.85,
182            density: 0.8,
183            isolation: 0.85,
184            snapshotting: 0.8,
185            agent_control: 0.5,
186            evidence: vec![
187                "modern Rust VMM: boot near Firecracker with a richer (but still lean) device model",
188                "lightweight; strong density with slightly more overhead than Firecracker",
189                "KVM isolation, and the VMM itself is memory-safe Rust",
190                "live migration + snapshot/restore for warm clones",
191                "REST / D-Bus control plane; capable but not agent-native",
192            ],
193        },
194        Vm::Gvisor => VmProfile {
195            vm,
196            start_latency: 0.85,
197            density: 0.85,
198            isolation: 0.6,
199            snapshotting: 0.4,
200            agent_control: 0.55,
201            evidence: vec![
202                "userspace kernel (the Sentry) intercepts guest syscalls: container-fast start, no full boot",
203                "container-class density; runs as an OCI runtime (runsc)",
204                "smaller attack surface than raw namespaces, but it is a userspace kernel — a Sentry bug is host-reachable, not hardware-contained",
205                "checkpoint/restore exists but is limited/experimental versus first-class VM snapshots",
206                "Docker/Kubernetes drive it via OCI; convenient, but the control plane is not agent-native",
207            ],
208        },
209        Vm::KataContainers => VmProfile {
210            vm,
211            start_latency: 0.65,
212            density: 0.6,
213            isolation: 0.85,
214            snapshotting: 0.4,
215            agent_control: 0.6,
216            evidence: vec![
217                "OCI/CRI containers backed by a per-workload microVM: hardware isolation with the container UX",
218                "per-pod VM overhead sits above namespaces — density between a microVM and a container",
219                "hardware-virt isolation per workload — strong for untrusted code",
220                "templating exists, but live snapshot/branch is limited",
221                "Kubernetes/containerd-native (RuntimeClass); standard tooling, not agent-native",
222            ],
223        },
224        Vm::QemuKvm => VmProfile {
225            vm,
226            start_latency: 0.4,
227            density: 0.45,
228            isolation: 0.9,
229            snapshotting: 0.85,
230            agent_control: 0.45,
231            evidence: vec![
232                "full machine virtualization: rich device model but multi-second boot (the microvm machine type helps; the base is heavy)",
233                "full per-VM overhead → far lower density than microVMs for ephemeral agent sandboxes",
234                "the mature hardware-virt reference — though the large device-model attack surface has a long CVE history",
235                "mature savevm snapshots + live migration: strong state capture",
236                "libvirt/QMP is powerful but heavy, and not designed for an agent to drive directly",
237            ],
238        },
239        Vm::Docker => VmProfile {
240            vm,
241            start_latency: 0.95,
242            density: 0.95,
243            isolation: 0.35,
244            snapshotting: 0.4,
245            agent_control: 0.6,
246            evidence: vec![
247                "near-instant container start — the fastest spawn of the set",
248                "shared-kernel containers: the highest density (minimal per-instance overhead)",
249                "namespaces + cgroups + seccomp share the host kernel — a single kernel LPE escapes the sandbox; widely considered insufficient for untrusted agent-generated code",
250                "image layers / commit capture the filesystem, not live memory; CRIU checkpoint is niche",
251                "the Docker API is what most agent-sandbox stacks drive today (ubiquitous tooling) — control is excellent, isolation is the weak point",
252            ],
253        },
254    }
255}
256
257/// Profiles for all systems, in [`Vm::all`] order (deterministic).
258pub fn profiles() -> Vec<VmProfile> {
259    Vm::all().iter().map(|&v| profile(v)).collect()
260}
261
262/// All profiles ranked best-first by [`VmProfile::fitness`]
263/// (stable order on ties).
264pub fn rank_vms() -> Vec<VmProfile> {
265    let mut v = profiles();
266    v.sort_by(|a, b| {
267        b.fitness()
268            .partial_cmp(&a.fitness())
269            .unwrap_or(std::cmp::Ordering::Equal)
270    });
271    v
272}
273
274/// Compare two systems: positive deltas mean `a` fits agentic use better.
275#[cfg_attr(feature = "serde", derive(serde::Serialize))]
276#[derive(Debug, Clone)]
277pub struct VmComparison {
278    /// First system (the subject).
279    pub a: VmProfile,
280    /// Second system (the baseline).
281    pub b: VmProfile,
282    /// `a.fitness() - b.fitness()`.
283    pub fitness_delta: f64,
284    /// Axis name → delta (a − b), in fixed axis order.
285    pub axis_deltas: Vec<(&'static str, f64)>,
286}
287
288/// Compare system `a` against baseline `b` across all five axes.
289pub fn compare_vms(a: Vm, b: Vm) -> VmComparison {
290    let pa = profile(a);
291    let pb = profile(b);
292    let axis_deltas = vec![
293        ("start-latency", pa.start_latency - pb.start_latency),
294        ("density", pa.density - pb.density),
295        ("isolation", pa.isolation - pb.isolation),
296        ("snapshotting", pa.snapshotting - pb.snapshotting),
297        ("agent-control", pa.agent_control - pb.agent_control),
298    ];
299    VmComparison {
300        fitness_delta: pa.fitness() - pb.fitness(),
301        a: pa,
302        b: pb,
303        axis_deltas,
304    }
305}
306
307impl std::fmt::Display for VmComparison {
308    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
309        writeln!(
310            f,
311            "{} vs {}: fitness delta {:+.2}",
312            self.a.vm.name(),
313            self.b.vm.name(),
314            self.fitness_delta
315        )?;
316        for (axis, d) in &self.axis_deltas {
317            writeln!(f, "  {axis}: {d:+.2}")?;
318        }
319        Ok(())
320    }
321}
322
#[cfg(test)]
mod tests {
    use super::*;

    /// Every system ships at least three evidence lines and keeps all five
    /// axis scores inside the closed range 0.0..=1.0.
    #[test]
    fn every_vm_profiles_with_evidence() {
        for vm in Vm::all() {
            let label = vm.name();
            let VmProfile {
                start_latency,
                density,
                isolation,
                snapshotting,
                agent_control,
                evidence,
                ..
            } = profile(vm);

            assert!(evidence.len() >= 3, "{} needs ≥3 evidence lines", label);

            let scores = [
                start_latency,
                density,
                isolation,
                snapshotting,
                agent_control,
            ];
            for score in scores.iter() {
                assert!((0.0..=1.0).contains(score), "{} score out of range", label);
            }
        }
    }

    /// Canonical names parse back to their system; aliases resolve
    /// case-insensitively; unknown names are rejected.
    #[test]
    fn from_name_roundtrip_and_aliases() {
        for vm in Vm::all() {
            assert_eq!(Vm::from_name(vm.name()), Some(vm));
        }

        let cases = [
            ("FC", Some(Vm::Firecracker)),
            ("kvm", Some(Vm::QemuKvm)),
            ("runc", Some(Vm::Docker)),
            ("hypermachine", Some(Vm::AetherVm)),
            ("virtualbox", None),
        ];
        for (input, expected) in cases.iter() {
            assert_eq!(Vm::from_name(input), *expected);
        }
    }

    /// Two rankings agree on order, and fitness never increases down the list.
    #[test]
    fn ranking_is_deterministic_and_sorted() {
        let names_of = |ranked: &[VmProfile]| -> Vec<&'static str> {
            ranked.iter().map(|p| p.vm.name()).collect()
        };

        let first = rank_vms();
        let second = rank_vms();
        assert_eq!(names_of(&first), names_of(&second));

        for (better, worse) in first.iter().zip(first.iter().skip(1)) {
            assert!(better.fitness() >= worse.fitness());
        }
    }

    /// Spot-checks that the curated scores keep the expected relative order
    /// between systems on each axis.
    #[test]
    fn axis_judgments_hold_directionally() {
        let (aether, fc) = (profile(Vm::AetherVm), profile(Vm::Firecracker));
        let (qemu, docker) = (profile(Vm::QemuKvm), profile(Vm::Docker));
        let gvisor = profile(Vm::Gvisor);

        assert!(
            fc.start_latency > qemu.start_latency,
            "a microVM cold-starts far faster than a full machine VM"
        );
        assert!(
            qemu.isolation > docker.isolation,
            "hardware virtualization beats a shared host kernel for untrusted code"
        );
        assert!(
            docker.density > qemu.density,
            "shared-kernel containers pack far denser than full VMs"
        );
        assert!(
            aether.agent_control > fc.agent_control,
            "an MCP-native control plane beats a bring-your-own REST socket"
        );
        assert!(
            fc.isolation > gvisor.isolation,
            "hardware isolation beats a userspace kernel for untrusted code"
        );
        assert!(
            aether.snapshotting >= qemu.snapshotting,
            "first-class CoW branching is at least as strong as savevm snapshots"
        );
    }

    /// The mean of the per-axis deltas matches the fitness delta, and the
    /// rendered report names both systems.
    #[test]
    fn comparison_deltas_are_consistent() {
        let cmp = compare_vms(Vm::AetherVm, Vm::Firecracker);

        let mean_delta = cmp.axis_deltas.iter().map(|(_, d)| d).sum::<f64>() / 5.0;
        assert!((mean_delta - cmp.fitness_delta).abs() < 1e-9);

        let rendered = cmp.to_string();
        assert!(rendered.contains("aethervm vs firecracker"));
    }
}