agentic-eval 0.14.2

//! Safety: given the effects a program performs, how much of its blast radius is
//! *gated* (requires approval, or denied) versus allowed under an agent policy?
//!
//! For an agent operating with real capabilities, the safety question is not "is
//! this code correct" but "what is the worst this can do, and is the dangerous
//! part gated?" This module classifies a program by the [`Effect`]s it performs,
//! applies a default-deny-for-dangerous agent [`Policy`], and scores how much of
//! the dangerous surface is held behind approval/denial. A program whose only
//! dangerous effects are approval-gated scores high; one that runs privileged or
//! executes arbitrary commands unconditionally scores low.

/// The effect class of an operation — the single property safety reasons about.
/// Ordered from harmless to most dangerous.
#[cfg_attr(feature = "serde", derive(serde::Serialize))]
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum Effect {
    /// No observable effect (pure computation).
    Pure,
    /// Reads local state (filesystem reads, env, process listing).
    ReadLocal,
    /// Creates/modifies local state non-destructively (write, mkdir).
    WriteLocal,
    /// Performs network I/O.
    Network,
    /// Affects other processes (kill, signal).
    Process,
    /// Irreversibly removes/overwrites local state (rm, truncate, drop).
    Destructive,
    /// Executes an arbitrary external command (shell passthrough).
    Exec,
    /// Requires elevated privileges / affects system-wide state.
    Privileged,
}

impl Effect {
    /// The effect's canonical snake_case name (inverse of [`Self::from_name`]).
    pub fn name(self) -> &'static str {
        match self {
            Effect::Pure => "pure",
            Effect::ReadLocal => "read_local",
            Effect::WriteLocal => "write_local",
            Effect::Network => "network",
            Effect::Process => "process",
            Effect::Destructive => "destructive",
            Effect::Exec => "exec",
            Effect::Privileged => "privileged",
        }
    }

    /// Parse an effect from its snake_case name (the inverse of [`Self::name`]).
    /// Accepts the same spellings other effect taxonomies use (e.g. AetherShell's
    /// `safety::Effect::as_str`), so a host system's effect classifier can be mapped
    /// straight in. Returns `None` for an unknown name.
    pub fn from_name(name: &str) -> Option<Effect> {
        Some(match name {
            "pure" => Effect::Pure,
            "read_local" => Effect::ReadLocal,
            "write_local" => Effect::WriteLocal,
            "network" => Effect::Network,
            "process" => Effect::Process,
            "destructive" => Effect::Destructive,
            "exec" => Effect::Exec,
            "privileged" => Effect::Privileged,
            _ => return None,
        })
    }

    /// Whether this class is "dangerous" — capable of irreversible or
    /// out-of-sandbox harm, so it *should* be gated for an agent.
    pub fn is_dangerous(self) -> bool {
        matches!(
            self,
            Effect::Destructive | Effect::Process | Effect::Exec | Effect::Privileged
        )
    }

    /// Every effect class, in danger order (harmless → most dangerous). The
    /// enumerator for building a complete ontology over the taxonomy.
    pub fn all() -> [Effect; 8] {
        [
            Effect::Pure,
            Effect::ReadLocal,
            Effect::WriteLocal,
            Effect::Network,
            Effect::Process,
            Effect::Destructive,
            Effect::Exec,
            Effect::Privileged,
        ]
    }

    /// A one-line, human/agent-readable summary of what this effect class is.
    pub fn summary(self) -> &'static str {
        match self {
            Effect::Pure => "no observable effect (pure computation)",
            Effect::ReadLocal => "reads local state (filesystem, env, process listing)",
            Effect::WriteLocal => "creates or modifies local state non-destructively",
            Effect::Network => "performs network I/O",
            Effect::Process => "affects other processes (kill, signal, priority)",
            Effect::Destructive => "irreversibly removes or overwrites local state",
            Effect::Exec => "executes an arbitrary external command",
            Effect::Privileged => "requires elevated privileges / affects system-wide state",
        }
    }

    /// The policy [`Decision`] for this effect under `mode` (sugar for [`decide`]).
    pub fn decision(self, mode: Mode) -> Decision {
        decide(self, mode)
    }
}

/// Who is operating: a human at a REPL, or an autonomous agent.
#[cfg_attr(feature = "serde", derive(serde::Serialize))]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Mode {
    /// A human at a REPL — default-allow.
    Human,
    /// An autonomous agent — default-deny for dangerous effect classes.
    Agent,
}

impl Mode {
    /// The mode's canonical lowercase name.
    pub fn name(self) -> &'static str {
        match self {
            Mode::Human => "human",
            Mode::Agent => "agent",
        }
    }

    /// Both modes, for enumerating the policy over the full ontology.
    pub fn all() -> [Mode; 2] {
        [Mode::Human, Mode::Agent]
    }
}

/// The policy decision for an effect under a mode.
#[cfg_attr(feature = "serde", derive(serde::Serialize))]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Decision {
    /// Runs without friction.
    Allow,
    /// Refused unless an approval token / human-in-the-loop confirms.
    Approve,
    /// Refused outright, no approval path.
    Deny,
}

impl Decision {
    /// The decision's canonical lowercase name (`allow` / `approve` / `deny`).
    pub fn name(self) -> &'static str {
        match self {
            Decision::Allow => "allow",
            Decision::Approve => "approve",
            Decision::Deny => "deny",
        }
    }
}

/// The default agent policy: humans get default-allow (great errors instead of
/// friction); agents get default-deny for the dangerous classes. This mirrors the
/// AetherShell agentic-first model so the score reflects a real, shipped policy.
pub fn decide(effect: Effect, mode: Mode) -> Decision {
    match mode {
        Mode::Human => Decision::Allow,
        Mode::Agent => match effect {
            Effect::Pure | Effect::ReadLocal | Effect::WriteLocal | Effect::Network => {
                Decision::Allow
            }
            Effect::Process | Effect::Destructive | Effect::Exec => Decision::Approve,
            Effect::Privileged => Decision::Deny,
        },
    }
}

/// The safety assessment of a program described by the effects it performs.
#[cfg_attr(feature = "serde", derive(serde::Serialize))]
#[derive(Debug, Clone)]
pub struct SafetyReport {
    /// The mode the assessment was run under.
    pub mode: Mode,
    /// Total number of effects assessed.
    pub effects: usize,
    /// Effects allowed to run without friction.
    pub allowed: usize,
    /// Effects requiring approval before running.
    pub approval_gated: usize,
    /// Effects denied outright.
    pub denied: usize,
    /// Dangerous effects that the policy would let run *without* gating. For the
    /// default agent policy this is 0 (every dangerous class is gated/denied); a
    /// permissive policy could leave some ungated.
    pub dangerous_ungated: usize,
    /// True iff no dangerous effect is left ungated — the blast radius is bounded.
    pub bounded: bool,
    /// 0.0–1.0 safety score: the fraction of dangerous effects that are gated
    /// (approval or deny). 1.0 when there are no dangerous effects, or all are
    /// gated. Lower as more dangerous effects run unchecked.
    pub score: f64,
    /// A letter grade derived from `score` (A ≥ .9, B ≥ .75, C ≥ .5, D ≥ .25, F).
    pub grade: char,
}

/// Assess a program's safety from the effects it performs, under `mode`.
pub fn assess_safety(effects: &[Effect], mode: Mode) -> SafetyReport {
    let (mut allowed, mut approval_gated, mut denied, mut dangerous, mut dangerous_ungated) =
        (0, 0, 0, 0usize, 0usize);
    for &e in effects {
        let d = decide(e, mode);
        match d {
            Decision::Allow => allowed += 1,
            Decision::Approve => approval_gated += 1,
            Decision::Deny => denied += 1,
        }
        if e.is_dangerous() {
            dangerous += 1;
            if d == Decision::Allow {
                dangerous_ungated += 1;
            }
        }
    }
    let score = if dangerous == 0 {
        1.0
    } else {
        (dangerous - dangerous_ungated) as f64 / dangerous as f64
    };
    let grade = if score >= 0.9 {
        'A'
    } else if score >= 0.75 {
        'B'
    } else if score >= 0.5 {
        'C'
    } else if score >= 0.25 {
        'D'
    } else {
        'F'
    };
    SafetyReport {
        mode,
        effects: effects.len(),
        allowed,
        approval_gated,
        denied,
        dangerous_ungated,
        bounded: dangerous_ungated == 0,
        score,
        grade,
    }
}

impl std::fmt::Display for SafetyReport {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
            f,
            "grade {} bounded={} (allowed={} approval-gated={} denied={}, {} dangerous ungated)",
            self.grade,
            self.bounded,
            self.allowed,
            self.approval_gated,
            self.denied,
            self.dangerous_ungated
        )
    }
}

/// Assess safety from operation *names* plus a `classify` closure mapping each name
/// to its [`Effect`] (e.g. a host's effect classifier). Names the classifier returns
/// `None` for are skipped. Convenience over [`assess_safety`] when you start from
/// names rather than effects.
///
/// ```
/// use agentic_eval::safety::{assess_safety_named, Effect, Mode};
/// let classify = |n: &str| match n {
///     "read" => Some(Effect::ReadLocal),
///     "rm" => Some(Effect::Destructive),
///     _ => None,
/// };
/// let r = assess_safety_named(&["read", "rm", "unknown"], classify, Mode::Agent);
/// assert!(r.bounded); // rm is approval-gated; unknown is skipped
/// ```
pub fn assess_safety_named<F: Fn(&str) -> Option<Effect>>(
    names: &[&str],
    classify: F,
    mode: Mode,
) -> SafetyReport {
    let effects: Vec<Effect> = names.iter().filter_map(|n| classify(n)).collect();
    assess_safety(&effects, mode)
}

/// How much of a program's *dangerous* blast radius is **reversible** — backed by an
/// undo/rollback (transaction, trash, snapshot) rather than permanent. Gating (see
/// [`assess_safety`]) bounds *whether* a dangerous effect runs; reversibility bounds
/// *the damage if it does*. Together they describe the real recoverable blast radius.
#[cfg_attr(feature = "serde", derive(serde::Serialize))]
#[derive(Debug, Clone)]
pub struct ReversibilityReport {
    /// Number of dangerous effects considered.
    pub dangerous: usize,
    /// Dangerous effects that are reversible (an undo/rollback exists).
    pub reversible: usize,
    /// Dangerous effects with no undo path.
    pub irreversible: usize,
    /// Fraction of dangerous effects that are reversible (1.0 if none are dangerous).
    pub score: f64,
    /// True iff every dangerous effect is reversible — the blast radius is recoverable.
    pub recoverable: bool,
}

/// Assess reversibility from `(effect, reversible)` pairs — each operation's effect
/// class plus whether it has an undo/rollback. Only dangerous effects count toward
/// the score (a pure read is trivially safe regardless of "reversibility").
pub fn assess_reversibility(ops: &[(Effect, bool)]) -> ReversibilityReport {
    let mut dangerous = 0usize;
    let mut reversible = 0usize;
    for &(effect, rev) in ops {
        if effect.is_dangerous() {
            dangerous += 1;
            if rev {
                reversible += 1;
            }
        }
    }
    let irreversible = dangerous - reversible;
    let score = if dangerous == 0 {
        1.0
    } else {
        reversible as f64 / dangerous as f64
    };
    ReversibilityReport {
        dangerous,
        reversible,
        irreversible,
        score,
        recoverable: irreversible == 0,
    }
}

impl std::fmt::Display for ReversibilityReport {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
            f,
            "reversible {}/{} dangerous (score {:.2}, recoverable={})",
            self.reversible, self.dangerous, self.score, self.recoverable
        )
    }
}

/// Whether a program has a data-**exfiltration path**: it both reads local/sensitive
/// state (a *source*) and can send data out (a *sink* — network or arbitrary exec).
/// The dangerous combination is source ∧ sink; either alone is not an exfil path.
#[cfg_attr(feature = "serde", derive(serde::Serialize))]
#[derive(Debug, Clone)]
pub struct ExfiltrationReport {
    /// A data source is present (reads local state).
    pub has_source: bool,
    /// A network egress sink is present.
    pub has_network: bool,
    /// An arbitrary-exec sink is present (a covert channel).
    pub has_exec: bool,
    /// True iff both a source and at least one sink are present.
    pub exposed: bool,
    /// 0.0–1.0 exfiltration risk: 0 when no path exists; higher as the sink gets
    /// more capable (network < exec < both).
    pub risk: f64,
}

/// Assess data-exfiltration exposure from the effects a program performs — a read
/// source ([`Effect::ReadLocal`]) combined with an egress sink ([`Effect::Network`]
/// or [`Effect::Exec`]).
pub fn assess_exfiltration(effects: &[Effect]) -> ExfiltrationReport {
    let has_source = effects.contains(&Effect::ReadLocal);
    let has_network = effects.contains(&Effect::Network);
    let has_exec = effects.contains(&Effect::Exec);
    let exposed = has_source && (has_network || has_exec);
    let risk = match (exposed, has_network, has_exec) {
        (false, _, _) => 0.0,
        (true, true, true) => 1.0,
        (true, _, true) => 0.9,
        (true, true, false) => 0.6,
        _ => 0.0,
    };
    ExfiltrationReport {
        has_source,
        has_network,
        has_exec,
        exposed,
        risk,
    }
}

impl std::fmt::Display for ExfiltrationReport {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
            f,
            "exfiltration risk {:.2} (exposed={}; source={} network={} exec={})",
            self.risk, self.exposed, self.has_source, self.has_network, self.has_exec
        )
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn agent_policy_gates_every_dangerous_class() {
        // A program that reads, writes, deletes, execs, and needs privilege.
        let effects = [
            Effect::ReadLocal,
            Effect::WriteLocal,
            Effect::Destructive,
            Effect::Exec,
            Effect::Privileged,
        ];
        let r = assess_safety(&effects, Mode::Agent);
        assert!(r.bounded, "no dangerous effect left ungated");
        assert_eq!(r.dangerous_ungated, 0);
        assert_eq!(r.score, 1.0);
        assert_eq!(r.grade, 'A');
        assert_eq!(r.denied, 1); // privileged
        assert_eq!(r.approval_gated, 2); // destructive + exec
        assert_eq!(r.allowed, 2); // read + write
    }

    #[test]
    fn human_mode_allows_everything_so_dangerous_is_ungated() {
        let effects = [Effect::Destructive, Effect::Exec];
        let r = assess_safety(&effects, Mode::Human);
        assert_eq!(r.allowed, 2);
        assert!(
            !r.bounded,
            "human mode does not gate — blast radius unbounded"
        );
        assert_eq!(r.dangerous_ungated, 2);
        assert_eq!(r.score, 0.0);
        assert_eq!(r.grade, 'F');
    }

    #[test]
    fn pure_program_is_trivially_safe() {
        let r = assess_safety(&[Effect::Pure, Effect::ReadLocal], Mode::Agent);
        assert_eq!(r.score, 1.0); // no dangerous effects at all
        assert!(r.bounded);
        assert_eq!(r.grade, 'A');
    }

    #[test]
    fn effects_are_ordered_by_danger() {
        assert!(Effect::Pure < Effect::Destructive);
        assert!(Effect::Network < Effect::Privileged);
        assert!(Effect::Destructive.is_dangerous());
        assert!(!Effect::ReadLocal.is_dangerous());
    }

    #[test]
    fn from_name_round_trips_every_effect() {
        for e in [
            Effect::Pure,
            Effect::ReadLocal,
            Effect::WriteLocal,
            Effect::Network,
            Effect::Process,
            Effect::Destructive,
            Effect::Exec,
            Effect::Privileged,
        ] {
            assert_eq!(Effect::from_name(e.name()), Some(e));
        }
        assert_eq!(Effect::from_name("nonsense"), None);
    }

    #[test]
    fn assess_safety_named_maps_and_skips_unknown() {
        // Classifier maps names → effects via the canonical names; unknowns skipped.
        let r = assess_safety_named(
            &["read_local", "destructive", "exec", "??unknown??"],
            Effect::from_name,
            Mode::Agent,
        );
        assert_eq!(r.effects, 3, "unknown name skipped");
        assert!(r.bounded); // destructive + exec are approval-gated
        assert_eq!(r.approval_gated, 2);
        assert_eq!(r.grade, 'A');
    }

    #[test]
    fn reversibility_scores_only_dangerous_effects() {
        // A read + a reversible delete (rollback) + an irreversible exec.
        let ops = [
            (Effect::ReadLocal, false), // not dangerous → ignored
            (Effect::Destructive, true),
            (Effect::Exec, false),
        ];
        let r = assess_reversibility(&ops);
        assert_eq!(r.dangerous, 2);
        assert_eq!(r.reversible, 1);
        assert_eq!(r.irreversible, 1);
        assert_eq!(r.score, 0.5);
        assert!(!r.recoverable);

        // All dangerous effects reversible → recoverable, score 1.0.
        let ok = assess_reversibility(&[(Effect::Destructive, true), (Effect::Process, true)]);
        assert!(ok.recoverable && ok.score == 1.0);
        // No dangerous effects → vacuously recoverable.
        assert_eq!(assess_reversibility(&[(Effect::Pure, false)]).score, 1.0);
    }

    #[test]
    fn exfiltration_needs_both_source_and_sink() {
        // Read + network → exposed (network sink).
        let r = assess_exfiltration(&[Effect::ReadLocal, Effect::Network]);
        assert!(r.exposed && r.risk == 0.6);
        // Read + exec → higher risk.
        assert_eq!(
            assess_exfiltration(&[Effect::ReadLocal, Effect::Exec]).risk,
            0.9
        );
        // Read + network + exec → max risk.
        assert_eq!(
            assess_exfiltration(&[Effect::ReadLocal, Effect::Network, Effect::Exec]).risk,
            1.0
        );
        // Source alone or sink alone → no path.
        assert!(!assess_exfiltration(&[Effect::ReadLocal]).exposed);
        assert!(!assess_exfiltration(&[Effect::Network]).exposed);
        assert_eq!(assess_exfiltration(&[Effect::Network]).risk, 0.0);
    }
}