Skip to main content

agentic_eval/
safety.rs

1//! Safety: given the effects a program performs, how much of its blast radius is
2//! *gated* (requires approval, or denied) versus allowed under an agent policy?
3//!
4//! For an agent operating with real capabilities, the safety question is not "is
5//! this code correct" but "what is the worst this can do, and is the dangerous
6//! part gated?" This module classifies a program by the [`Effect`]s it performs,
7//! applies a default-deny-for-dangerous agent [`Policy`], and scores how much of
8//! the dangerous surface is held behind approval/denial. A program whose only
9//! dangerous effects are approval-gated scores high; one that runs privileged or
10//! executes arbitrary commands unconditionally scores low.
11
12/// The effect class of an operation — the single property safety reasons about.
13/// Ordered from harmless to most dangerous.
14#[cfg_attr(feature = "serde", derive(serde::Serialize))]
15#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
16pub enum Effect {
17    /// No observable effect (pure computation).
18    Pure,
19    /// Reads local state (filesystem reads, env, process listing).
20    ReadLocal,
21    /// Creates/modifies local state non-destructively (write, mkdir).
22    WriteLocal,
23    /// Performs network I/O.
24    Network,
25    /// Affects other processes (kill, signal).
26    Process,
27    /// Irreversibly removes/overwrites local state (rm, truncate, drop).
28    Destructive,
29    /// Executes an arbitrary external command (shell passthrough).
30    Exec,
31    /// Requires elevated privileges / affects system-wide state.
32    Privileged,
33}
34
35impl Effect {
36    /// The effect's canonical snake_case name (inverse of [`Self::from_name`]).
37    pub fn name(self) -> &'static str {
38        match self {
39            Effect::Pure => "pure",
40            Effect::ReadLocal => "read_local",
41            Effect::WriteLocal => "write_local",
42            Effect::Network => "network",
43            Effect::Process => "process",
44            Effect::Destructive => "destructive",
45            Effect::Exec => "exec",
46            Effect::Privileged => "privileged",
47        }
48    }
49
50    /// Parse an effect from its snake_case name (the inverse of [`Self::name`]).
51    /// Accepts the same spellings other effect taxonomies use (e.g. AetherShell's
52    /// `safety::Effect::as_str`), so a host system's effect classifier can be mapped
53    /// straight in. Returns `None` for an unknown name.
54    pub fn from_name(name: &str) -> Option<Effect> {
55        Some(match name {
56            "pure" => Effect::Pure,
57            "read_local" => Effect::ReadLocal,
58            "write_local" => Effect::WriteLocal,
59            "network" => Effect::Network,
60            "process" => Effect::Process,
61            "destructive" => Effect::Destructive,
62            "exec" => Effect::Exec,
63            "privileged" => Effect::Privileged,
64            _ => return None,
65        })
66    }
67
68    /// Whether this class is "dangerous" — capable of irreversible or
69    /// out-of-sandbox harm, so it *should* be gated for an agent.
70    pub fn is_dangerous(self) -> bool {
71        matches!(
72            self,
73            Effect::Destructive | Effect::Process | Effect::Exec | Effect::Privileged
74        )
75    }
76
77    /// Every effect class, in danger order (harmless → most dangerous). The
78    /// enumerator for building a complete ontology over the taxonomy.
79    pub fn all() -> [Effect; 8] {
80        [
81            Effect::Pure,
82            Effect::ReadLocal,
83            Effect::WriteLocal,
84            Effect::Network,
85            Effect::Process,
86            Effect::Destructive,
87            Effect::Exec,
88            Effect::Privileged,
89        ]
90    }
91
92    /// A one-line, human/agent-readable summary of what this effect class is.
93    pub fn summary(self) -> &'static str {
94        match self {
95            Effect::Pure => "no observable effect (pure computation)",
96            Effect::ReadLocal => "reads local state (filesystem, env, process listing)",
97            Effect::WriteLocal => "creates or modifies local state non-destructively",
98            Effect::Network => "performs network I/O",
99            Effect::Process => "affects other processes (kill, signal, priority)",
100            Effect::Destructive => "irreversibly removes or overwrites local state",
101            Effect::Exec => "executes an arbitrary external command",
102            Effect::Privileged => "requires elevated privileges / affects system-wide state",
103        }
104    }
105
106    /// The policy [`Decision`] for this effect under `mode` (sugar for [`decide`]).
107    pub fn decision(self, mode: Mode) -> Decision {
108        decide(self, mode)
109    }
110}
111
112/// Who is operating: a human at a REPL, or an autonomous agent.
113#[cfg_attr(feature = "serde", derive(serde::Serialize))]
114#[derive(Debug, Clone, Copy, PartialEq, Eq)]
115pub enum Mode {
116    /// A human at a REPL — default-allow.
117    Human,
118    /// An autonomous agent — default-deny for dangerous effect classes.
119    Agent,
120}
121
122impl Mode {
123    /// The mode's canonical lowercase name.
124    pub fn name(self) -> &'static str {
125        match self {
126            Mode::Human => "human",
127            Mode::Agent => "agent",
128        }
129    }
130
131    /// Both modes, for enumerating the policy over the full ontology.
132    pub fn all() -> [Mode; 2] {
133        [Mode::Human, Mode::Agent]
134    }
135}
136
137/// The policy decision for an effect under a mode.
138#[cfg_attr(feature = "serde", derive(serde::Serialize))]
139#[derive(Debug, Clone, Copy, PartialEq, Eq)]
140pub enum Decision {
141    /// Runs without friction.
142    Allow,
143    /// Refused unless an approval token / human-in-the-loop confirms.
144    Approve,
145    /// Refused outright, no approval path.
146    Deny,
147}
148
149impl Decision {
150    /// The decision's canonical lowercase name (`allow` / `approve` / `deny`).
151    pub fn name(self) -> &'static str {
152        match self {
153            Decision::Allow => "allow",
154            Decision::Approve => "approve",
155            Decision::Deny => "deny",
156        }
157    }
158}
159
160/// The default agent policy: humans get default-allow (great errors instead of
161/// friction); agents get default-deny for the dangerous classes. This mirrors the
162/// AetherShell agentic-first model so the score reflects a real, shipped policy.
163pub fn decide(effect: Effect, mode: Mode) -> Decision {
164    match mode {
165        Mode::Human => Decision::Allow,
166        Mode::Agent => match effect {
167            Effect::Pure | Effect::ReadLocal | Effect::WriteLocal | Effect::Network => {
168                Decision::Allow
169            }
170            Effect::Process | Effect::Destructive | Effect::Exec => Decision::Approve,
171            Effect::Privileged => Decision::Deny,
172        },
173    }
174}
175
176/// The safety assessment of a program described by the effects it performs.
177#[cfg_attr(feature = "serde", derive(serde::Serialize))]
178#[derive(Debug, Clone)]
179pub struct SafetyReport {
180    /// The mode the assessment was run under.
181    pub mode: Mode,
182    /// Total number of effects assessed.
183    pub effects: usize,
184    /// Effects allowed to run without friction.
185    pub allowed: usize,
186    /// Effects requiring approval before running.
187    pub approval_gated: usize,
188    /// Effects denied outright.
189    pub denied: usize,
190    /// Dangerous effects that the policy would let run *without* gating. For the
191    /// default agent policy this is 0 (every dangerous class is gated/denied); a
192    /// permissive policy could leave some ungated.
193    pub dangerous_ungated: usize,
194    /// True iff no dangerous effect is left ungated — the blast radius is bounded.
195    pub bounded: bool,
196    /// 0.0–1.0 safety score: the fraction of dangerous effects that are gated
197    /// (approval or deny). 1.0 when there are no dangerous effects, or all are
198    /// gated. Lower as more dangerous effects run unchecked.
199    pub score: f64,
200    /// A letter grade derived from `score` (A ≥ .9, B ≥ .75, C ≥ .5, D ≥ .25, F).
201    pub grade: char,
202}
203
204/// Assess a program's safety from the effects it performs, under `mode`.
205pub fn assess_safety(effects: &[Effect], mode: Mode) -> SafetyReport {
206    let (mut allowed, mut approval_gated, mut denied, mut dangerous, mut dangerous_ungated) =
207        (0, 0, 0, 0usize, 0usize);
208    for &e in effects {
209        let d = decide(e, mode);
210        match d {
211            Decision::Allow => allowed += 1,
212            Decision::Approve => approval_gated += 1,
213            Decision::Deny => denied += 1,
214        }
215        if e.is_dangerous() {
216            dangerous += 1;
217            if d == Decision::Allow {
218                dangerous_ungated += 1;
219            }
220        }
221    }
222    let score = if dangerous == 0 {
223        1.0
224    } else {
225        (dangerous - dangerous_ungated) as f64 / dangerous as f64
226    };
227    let grade = if score >= 0.9 {
228        'A'
229    } else if score >= 0.75 {
230        'B'
231    } else if score >= 0.5 {
232        'C'
233    } else if score >= 0.25 {
234        'D'
235    } else {
236        'F'
237    };
238    SafetyReport {
239        mode,
240        effects: effects.len(),
241        allowed,
242        approval_gated,
243        denied,
244        dangerous_ungated,
245        bounded: dangerous_ungated == 0,
246        score,
247        grade,
248    }
249}
250
251impl std::fmt::Display for SafetyReport {
252    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
253        write!(
254            f,
255            "grade {} bounded={} (allowed={} approval-gated={} denied={}, {} dangerous ungated)",
256            self.grade,
257            self.bounded,
258            self.allowed,
259            self.approval_gated,
260            self.denied,
261            self.dangerous_ungated
262        )
263    }
264}
265
266/// Assess safety from operation *names* plus a `classify` closure mapping each name
267/// to its [`Effect`] (e.g. a host's effect classifier). Names the classifier returns
268/// `None` for are skipped. Convenience over [`assess_safety`] when you start from
269/// names rather than effects.
270///
271/// ```
272/// use agentic_eval::safety::{assess_safety_named, Effect, Mode};
273/// let classify = |n: &str| match n {
274///     "read" => Some(Effect::ReadLocal),
275///     "rm" => Some(Effect::Destructive),
276///     _ => None,
277/// };
278/// let r = assess_safety_named(&["read", "rm", "unknown"], classify, Mode::Agent);
279/// assert!(r.bounded); // rm is approval-gated; unknown is skipped
280/// ```
281pub fn assess_safety_named<F: Fn(&str) -> Option<Effect>>(
282    names: &[&str],
283    classify: F,
284    mode: Mode,
285) -> SafetyReport {
286    let effects: Vec<Effect> = names.iter().filter_map(|n| classify(n)).collect();
287    assess_safety(&effects, mode)
288}
289
290/// How much of a program's *dangerous* blast radius is **reversible** — backed by an
291/// undo/rollback (transaction, trash, snapshot) rather than permanent. Gating (see
292/// [`assess_safety`]) bounds *whether* a dangerous effect runs; reversibility bounds
293/// *the damage if it does*. Together they describe the real recoverable blast radius.
294#[cfg_attr(feature = "serde", derive(serde::Serialize))]
295#[derive(Debug, Clone)]
296pub struct ReversibilityReport {
297    /// Number of dangerous effects considered.
298    pub dangerous: usize,
299    /// Dangerous effects that are reversible (an undo/rollback exists).
300    pub reversible: usize,
301    /// Dangerous effects with no undo path.
302    pub irreversible: usize,
303    /// Fraction of dangerous effects that are reversible (1.0 if none are dangerous).
304    pub score: f64,
305    /// True iff every dangerous effect is reversible — the blast radius is recoverable.
306    pub recoverable: bool,
307}
308
309/// Assess reversibility from `(effect, reversible)` pairs — each operation's effect
310/// class plus whether it has an undo/rollback. Only dangerous effects count toward
311/// the score (a pure read is trivially safe regardless of "reversibility").
312pub fn assess_reversibility(ops: &[(Effect, bool)]) -> ReversibilityReport {
313    let mut dangerous = 0usize;
314    let mut reversible = 0usize;
315    for &(effect, rev) in ops {
316        if effect.is_dangerous() {
317            dangerous += 1;
318            if rev {
319                reversible += 1;
320            }
321        }
322    }
323    let irreversible = dangerous - reversible;
324    let score = if dangerous == 0 {
325        1.0
326    } else {
327        reversible as f64 / dangerous as f64
328    };
329    ReversibilityReport {
330        dangerous,
331        reversible,
332        irreversible,
333        score,
334        recoverable: irreversible == 0,
335    }
336}
337
338impl std::fmt::Display for ReversibilityReport {
339    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
340        write!(
341            f,
342            "reversible {}/{} dangerous (score {:.2}, recoverable={})",
343            self.reversible, self.dangerous, self.score, self.recoverable
344        )
345    }
346}
347
348/// Whether a program has a data-**exfiltration path**: it both reads local/sensitive
349/// state (a *source*) and can send data out (a *sink* — network or arbitrary exec).
350/// The dangerous combination is source ∧ sink; either alone is not an exfil path.
351#[cfg_attr(feature = "serde", derive(serde::Serialize))]
352#[derive(Debug, Clone)]
353pub struct ExfiltrationReport {
354    /// A data source is present (reads local state).
355    pub has_source: bool,
356    /// A network egress sink is present.
357    pub has_network: bool,
358    /// An arbitrary-exec sink is present (a covert channel).
359    pub has_exec: bool,
360    /// True iff both a source and at least one sink are present.
361    pub exposed: bool,
362    /// 0.0–1.0 exfiltration risk: 0 when no path exists; higher as the sink gets
363    /// more capable (network < exec < both).
364    pub risk: f64,
365}
366
367/// Assess data-exfiltration exposure from the effects a program performs — a read
368/// source ([`Effect::ReadLocal`]) combined with an egress sink ([`Effect::Network`]
369/// or [`Effect::Exec`]).
370pub fn assess_exfiltration(effects: &[Effect]) -> ExfiltrationReport {
371    let has_source = effects.contains(&Effect::ReadLocal);
372    let has_network = effects.contains(&Effect::Network);
373    let has_exec = effects.contains(&Effect::Exec);
374    let exposed = has_source && (has_network || has_exec);
375    let risk = match (exposed, has_network, has_exec) {
376        (false, _, _) => 0.0,
377        (true, true, true) => 1.0,
378        (true, _, true) => 0.9,
379        (true, true, false) => 0.6,
380        _ => 0.0,
381    };
382    ExfiltrationReport {
383        has_source,
384        has_network,
385        has_exec,
386        exposed,
387        risk,
388    }
389}
390
391impl std::fmt::Display for ExfiltrationReport {
392    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
393        write!(
394            f,
395            "exfiltration risk {:.2} (exposed={}; source={} network={} exec={})",
396            self.risk, self.exposed, self.has_source, self.has_network, self.has_exec
397        )
398    }
399}
400
401#[cfg(test)]
402mod tests {
403    use super::*;
404
405    #[test]
406    fn agent_policy_gates_every_dangerous_class() {
407        // A program that reads, writes, deletes, execs, and needs privilege.
408        let effects = [
409            Effect::ReadLocal,
410            Effect::WriteLocal,
411            Effect::Destructive,
412            Effect::Exec,
413            Effect::Privileged,
414        ];
415        let r = assess_safety(&effects, Mode::Agent);
416        assert!(r.bounded, "no dangerous effect left ungated");
417        assert_eq!(r.dangerous_ungated, 0);
418        assert_eq!(r.score, 1.0);
419        assert_eq!(r.grade, 'A');
420        assert_eq!(r.denied, 1); // privileged
421        assert_eq!(r.approval_gated, 2); // destructive + exec
422        assert_eq!(r.allowed, 2); // read + write
423    }
424
425    #[test]
426    fn human_mode_allows_everything_so_dangerous_is_ungated() {
427        let effects = [Effect::Destructive, Effect::Exec];
428        let r = assess_safety(&effects, Mode::Human);
429        assert_eq!(r.allowed, 2);
430        assert!(
431            !r.bounded,
432            "human mode does not gate — blast radius unbounded"
433        );
434        assert_eq!(r.dangerous_ungated, 2);
435        assert_eq!(r.score, 0.0);
436        assert_eq!(r.grade, 'F');
437    }
438
439    #[test]
440    fn pure_program_is_trivially_safe() {
441        let r = assess_safety(&[Effect::Pure, Effect::ReadLocal], Mode::Agent);
442        assert_eq!(r.score, 1.0); // no dangerous effects at all
443        assert!(r.bounded);
444        assert_eq!(r.grade, 'A');
445    }
446
447    #[test]
448    fn effects_are_ordered_by_danger() {
449        assert!(Effect::Pure < Effect::Destructive);
450        assert!(Effect::Network < Effect::Privileged);
451        assert!(Effect::Destructive.is_dangerous());
452        assert!(!Effect::ReadLocal.is_dangerous());
453    }
454
455    #[test]
456    fn from_name_round_trips_every_effect() {
457        for e in [
458            Effect::Pure,
459            Effect::ReadLocal,
460            Effect::WriteLocal,
461            Effect::Network,
462            Effect::Process,
463            Effect::Destructive,
464            Effect::Exec,
465            Effect::Privileged,
466        ] {
467            assert_eq!(Effect::from_name(e.name()), Some(e));
468        }
469        assert_eq!(Effect::from_name("nonsense"), None);
470    }
471
472    #[test]
473    fn assess_safety_named_maps_and_skips_unknown() {
474        // Classifier maps names → effects via the canonical names; unknowns skipped.
475        let r = assess_safety_named(
476            &["read_local", "destructive", "exec", "??unknown??"],
477            Effect::from_name,
478            Mode::Agent,
479        );
480        assert_eq!(r.effects, 3, "unknown name skipped");
481        assert!(r.bounded); // destructive + exec are approval-gated
482        assert_eq!(r.approval_gated, 2);
483        assert_eq!(r.grade, 'A');
484    }
485
486    #[test]
487    fn reversibility_scores_only_dangerous_effects() {
488        // A read + a reversible delete (rollback) + an irreversible exec.
489        let ops = [
490            (Effect::ReadLocal, false), // not dangerous → ignored
491            (Effect::Destructive, true),
492            (Effect::Exec, false),
493        ];
494        let r = assess_reversibility(&ops);
495        assert_eq!(r.dangerous, 2);
496        assert_eq!(r.reversible, 1);
497        assert_eq!(r.irreversible, 1);
498        assert_eq!(r.score, 0.5);
499        assert!(!r.recoverable);
500
501        // All dangerous effects reversible → recoverable, score 1.0.
502        let ok = assess_reversibility(&[(Effect::Destructive, true), (Effect::Process, true)]);
503        assert!(ok.recoverable && ok.score == 1.0);
504        // No dangerous effects → vacuously recoverable.
505        assert_eq!(assess_reversibility(&[(Effect::Pure, false)]).score, 1.0);
506    }
507
508    #[test]
509    fn exfiltration_needs_both_source_and_sink() {
510        // Read + network → exposed (network sink).
511        let r = assess_exfiltration(&[Effect::ReadLocal, Effect::Network]);
512        assert!(r.exposed && r.risk == 0.6);
513        // Read + exec → higher risk.
514        assert_eq!(
515            assess_exfiltration(&[Effect::ReadLocal, Effect::Exec]).risk,
516            0.9
517        );
518        // Read + network + exec → max risk.
519        assert_eq!(
520            assess_exfiltration(&[Effect::ReadLocal, Effect::Network, Effect::Exec]).risk,
521            1.0
522        );
523        // Source alone or sink alone → no path.
524        assert!(!assess_exfiltration(&[Effect::ReadLocal]).exposed);
525        assert!(!assess_exfiltration(&[Effect::Network]).exposed);
526        assert_eq!(assess_exfiltration(&[Effect::Network]).risk, 0.0);
527    }
528}