Skip to main content

assay_sim/attacks/
memory_poison.rs

//! Persistent memory poisoning with delayed trigger.
//!
//! Tests whether schema-valid, internally consistent poisoned state entries
//! can survive past injection and later shift policy classification, replay
//! diff buckets, or evidence convergence.
//!
//! Four attack vectors plus three benign controls. All are deterministic and
//! make no LLM calls.
8
9use crate::report::AttackResult;
10use serde::Serialize;
11
// Implementation module: every public function in this file forwards to an
// identically named function in here. The `#[path]` attribute loads it from
// `memory_poison_next/mod.rs` (resolved relative to this file's directory)
// instead of the default location for a nested module.
#[path = "memory_poison_next/mod.rs"]
mod memory_poison_next;
14
/// Outcome record for a single memory-poisoning run (one vector or control).
///
/// Each vector/control function in this file returns one of these paired with
/// an `AttackResult`; `run_memory_poison_matrix` returns a `Vec` of them.
/// The type is serializable through `serde::Serialize`.
///
/// NOTE(review): the values are populated inside `memory_poison_next`, which
/// is not visible from this file. The per-field notes below are read off the
/// field names and this file's unit tests; confirm them against that module.
#[derive(Debug, Clone, Serialize)]
pub struct PoisonResult {
    /// Identifier of the vector or control that produced this record. The
    /// tests in this file match `"v1_replay_baseline"`,
    /// `"v2_deny_convergence"`, `"v3_context_envelope"` and the prefix
    /// `"control_"`.
    pub vector_id: String,
    /// Experimental condition label. The tests in this file match
    /// `"condition_b"` and `"condition_c"`.
    pub condition: String,
    /// Presumably whether the phase-A injection took place — TODO confirm.
    pub phase_a_injected: bool,
    /// Presumably the number of delay turns used in phase B, mirroring the
    /// `delay_turns` argument of the entry points — TODO confirm.
    pub phase_b_delay_turns: u32,
    /// Presumably whether the phase-C trigger fired — TODO confirm.
    pub phase_c_triggered: bool,
    /// Presumably whether the poisoned entry was still present after the
    /// delay — TODO confirm.
    pub poison_retained: bool,
    /// Presumably whether the poison changed the observed result; the vector
    /// tests in this file assert this is `true` for V1, V3 and V4 at an
    /// argument of 1.
    pub activation_succeeded: bool,
    /// Classification the run was expected to produce (free-form string).
    pub expected_classification: String,
    /// Classification the run actually produced (free-form string).
    pub observed_classification: String,
    /// Summary verdict for the run; see [`PoisonOutcome`].
    pub outcome: PoisonOutcome,
    /// Free-form tags attached to the record; by the name they link it to
    /// hypotheses under test — TODO confirm the vocabulary.
    pub hypothesis_tags: Vec<String>,
}
29
/// Summary verdict of a poisoning run, stored in [`PoisonResult::outcome`].
///
/// Serialized in `snake_case` (for example `NoEffect` becomes `"no_effect"`).
///
/// NOTE(review): which verdict a run receives is decided in
/// `memory_poison_next`; the variant notes below cite only what the unit
/// tests in this file expect.
#[derive(Debug, Clone, Serialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum PoisonOutcome {
    /// The verdict the three benign controls are expected to report at
    /// delays 1, 2 and 3.
    NoEffect,
    /// By the name: the poison was retained but never activated — TODO
    /// confirm; no test in this file exercises this variant.
    RetainedNoActivation,
    /// Expected by the tests for V1 and V2 under `"condition_b"` and for V3
    /// under `"condition_c"`.
    ActivationWithCorrectDetection,
    /// Expected by the tests for V1 at a delay of 1; the tests also require
    /// that controls never report it.
    ActivationWithMisclassification,
    /// Expected by the tests for V3 at a delay of 1; the tests also require
    /// that controls never report it.
    ActivationWithPolicyShift,
}
39
/// Runs attack vector 1 (replay-baseline poisoning, going by the name).
///
/// Thin public wrapper: forwards `delay_turns` unchanged to
/// `memory_poison_next::vector1_replay_baseline_poisoning` and returns its
/// `(PoisonResult, AttackResult)` pair as-is.
///
/// The unit test in this file expects `delay_turns = 1` to yield
/// `activation_succeeded == true` with
/// `PoisonOutcome::ActivationWithMisclassification`.
pub fn vector1_replay_baseline_poisoning(delay_turns: u32) -> (PoisonResult, AttackResult) {
    memory_poison_next::vector1_replay_baseline_poisoning(delay_turns)
}
43
/// Runs attack vector 2 (deny-convergence poisoning, going by the name).
///
/// Thin public wrapper: forwards `delay_turns` unchanged to
/// `memory_poison_next::vector2_deny_convergence_poisoning` and returns its
/// `(PoisonResult, AttackResult)` pair as-is.
///
/// The unit test in this file only requires `phase_c_triggered == true` at
/// `delay_turns = 1`; it deliberately does not pin the outcome.
pub fn vector2_deny_convergence_poisoning(delay_turns: u32) -> (PoisonResult, AttackResult) {
    memory_poison_next::vector2_deny_convergence_poisoning(delay_turns)
}
47
/// Runs attack vector 3 (context-envelope poisoning, going by the name).
///
/// Thin public wrapper: forwards `delay_turns` unchanged to
/// `memory_poison_next::vector3_context_envelope_poisoning` and returns its
/// `(PoisonResult, AttackResult)` pair as-is.
///
/// The unit test in this file expects `delay_turns = 1` to yield
/// `activation_succeeded == true` with
/// `PoisonOutcome::ActivationWithPolicyShift`.
pub fn vector3_context_envelope_poisoning(delay_turns: u32) -> (PoisonResult, AttackResult) {
    memory_poison_next::vector3_context_envelope_poisoning(delay_turns)
}
51
/// Runs attack vector 4 (decay escape, going by the name).
///
/// Thin public wrapper: forwards `decay_runs` unchanged to
/// `memory_poison_next::vector4_decay_escape` and returns its
/// `(PoisonResult, AttackResult)` pair as-is. Note that, unlike the other
/// vectors, the argument is a run count rather than `delay_turns`.
///
/// The unit test in this file expects `decay_runs = 1` to yield
/// `activation_succeeded == true`.
pub fn vector4_decay_escape(decay_runs: u32) -> (PoisonResult, AttackResult) {
    memory_poison_next::vector4_decay_escape(decay_runs)
}
55
/// Runs benign control B1 (run-metadata recall, going by the name).
///
/// Thin public wrapper: forwards `delay_turns` unchanged to
/// `memory_poison_next::control_b1_run_metadata_recall` and returns its
/// `(PoisonResult, AttackResult)` pair as-is.
///
/// The unit tests in this file expect `PoisonOutcome::NoEffect` for delays
/// 1, 2 and 3.
pub fn control_b1_run_metadata_recall(delay_turns: u32) -> (PoisonResult, AttackResult) {
    memory_poison_next::control_b1_run_metadata_recall(delay_turns)
}
59
/// Runs benign control B2 (tool-observation recall, going by the name).
///
/// Thin public wrapper: forwards `delay_turns` unchanged to
/// `memory_poison_next::control_b2_tool_observation_recall` and returns its
/// `(PoisonResult, AttackResult)` pair as-is.
///
/// The unit tests in this file expect `PoisonOutcome::NoEffect` for delays
/// 1, 2 and 3.
pub fn control_b2_tool_observation_recall(delay_turns: u32) -> (PoisonResult, AttackResult) {
    memory_poison_next::control_b2_tool_observation_recall(delay_turns)
}
63
/// Runs benign control B3 (approval-context recall, going by the name).
///
/// Thin public wrapper: forwards `delay_turns` unchanged to
/// `memory_poison_next::control_b3_approval_context_recall` and returns its
/// `(PoisonResult, AttackResult)` pair as-is.
///
/// The unit tests in this file expect `PoisonOutcome::NoEffect` for delays
/// 1, 2 and 3.
pub fn control_b3_approval_context_recall(delay_turns: u32) -> (PoisonResult, AttackResult) {
    memory_poison_next::control_b3_approval_context_recall(delay_turns)
}
67
/// Run the complete memory poison experiment matrix across all conditions.
///
/// Thin public wrapper around `memory_poison_next::run_memory_poison_matrix`.
/// Returns two vectors: the `PoisonResult` records and the `AttackResult`
/// records for the run.
///
/// The unit test in this file expects 45 entries in each vector:
/// 3 conditions * 4 vectors * 3 delays + 3 controls * 3 delays.
pub fn run_memory_poison_matrix() -> (Vec<PoisonResult>, Vec<AttackResult>) {
    memory_poison_next::run_memory_poison_matrix()
}
72
/// Unit tests for the public memory-poison entry points.
///
/// The matrix-wide tests select rows by string id. Each of them first checks
/// that the selection is non-empty, so a renamed vector id or condition label
/// fails loudly instead of letting the test pass without asserting anything.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn vector1_activates_under_condition_a() {
        let (pr, _) = vector1_replay_baseline_poisoning(1);
        assert!(pr.activation_succeeded);
        assert_eq!(pr.outcome, PoisonOutcome::ActivationWithMisclassification);
    }

    #[test]
    fn vector2_activates_under_condition_a() {
        let (pr, _) = vector2_deny_convergence_poisoning(1);
        // Deny convergence fields are metadata-level; changing them may or may
        // not shift the bucket depending on same_effective_decision_class().
        // Only the trigger itself is pinned here, not the outcome.
        assert!(pr.phase_c_triggered);
    }

    #[test]
    fn vector3_activates_under_condition_a() {
        let (pr, _) = vector3_context_envelope_poisoning(1);
        assert!(pr.activation_succeeded);
        assert_eq!(pr.outcome, PoisonOutcome::ActivationWithPolicyShift);
    }

    #[test]
    fn vector4_snapshot_diverges() {
        let (pr, _) = vector4_decay_escape(1);
        assert!(pr.activation_succeeded);
    }

    #[test]
    fn controls_produce_no_false_positives() {
        for delay in [1, 2, 3] {
            let (pr1, _) = control_b1_run_metadata_recall(delay);
            assert_eq!(pr1.outcome, PoisonOutcome::NoEffect);

            let (pr2, _) = control_b2_tool_observation_recall(delay);
            assert_eq!(pr2.outcome, PoisonOutcome::NoEffect);

            let (pr3, _) = control_b3_approval_context_recall(delay);
            assert_eq!(pr3.outcome, PoisonOutcome::NoEffect);
        }
    }

    #[test]
    fn full_matrix_runs_without_panic() {
        let (results, attacks) = run_memory_poison_matrix();
        // 3 conditions * 4 vectors * 3 delays + 3 controls * 3 delays = 36 + 9 = 45
        assert_eq!(results.len(), 45);
        assert_eq!(attacks.len(), 45);
    }

    #[test]
    fn condition_b_blocks_v1_and_v2() {
        let (results, _) = run_memory_poison_matrix();
        let targeted: Vec<&PoisonResult> = results
            .iter()
            .filter(|r| r.condition == "condition_b")
            .filter(|r| {
                r.vector_id == "v1_replay_baseline" || r.vector_id == "v2_deny_convergence"
            })
            .collect();
        // Guard against a vacuous pass when no row matches the ids above.
        assert!(
            !targeted.is_empty(),
            "matrix produced no condition_b rows for V1/V2; nothing was verified"
        );
        for pr in targeted {
            assert_eq!(
                pr.outcome,
                PoisonOutcome::ActivationWithCorrectDetection,
                "{} should be detected under Condition B",
                pr.vector_id
            );
        }
    }

    #[test]
    fn condition_c_blocks_v3() {
        let (results, _) = run_memory_poison_matrix();
        let targeted: Vec<&PoisonResult> = results
            .iter()
            .filter(|r| r.condition == "condition_c" && r.vector_id == "v3_context_envelope")
            .collect();
        // Guard against a vacuous pass when no row matches the ids above.
        assert!(
            !targeted.is_empty(),
            "matrix produced no condition_c rows for V3; nothing was verified"
        );
        for pr in targeted {
            assert_eq!(
                pr.outcome,
                PoisonOutcome::ActivationWithCorrectDetection,
                "V3 should be detected under Condition C"
            );
        }
    }

    #[test]
    fn overarching_invariant_controls_never_misclassify() {
        let (results, _) = run_memory_poison_matrix();
        let controls: Vec<&PoisonResult> = results
            .iter()
            .filter(|r| r.vector_id.starts_with("control_"))
            .collect();
        // Guard against a vacuous pass when no vector id carries the prefix.
        assert!(
            !controls.is_empty(),
            "matrix produced no rows with a control_ vector id; nothing was verified"
        );
        for pr in controls {
            assert_ne!(
                pr.outcome,
                PoisonOutcome::ActivationWithMisclassification,
                "control {} had false positive",
                pr.vector_id
            );
            assert_ne!(
                pr.outcome,
                PoisonOutcome::ActivationWithPolicyShift,
                "control {} had policy shift",
                pr.vector_id
            );
        }
    }
}