mnemo_core/anomaly/
outlier.rs1use serde::{Deserialize, Serialize};
2
3use crate::model::embedding_baseline::{EmbeddingBaseline, MIN_BASELINE_SAMPLES};
4use crate::model::memory::MemoryRecord;
5
/// Smallest variance used when normalising a dimension.
///
/// Each per-dimension baseline variance is clamped up to this value before it
/// is used as a divisor, so a dimension with zero (or near-zero) spread cannot
/// cause a division by zero when scoring.
const VARIANCE_FLOOR: f32 = 1.0e-6;
12
13#[derive(Debug, Clone, Serialize, Deserialize)]
15pub struct OutlierScore {
16 pub z_score: f32,
20 pub threshold: f32,
22 pub is_outlier: bool,
24 pub dims_flagged: u32,
29 pub baseline_n: u64,
33}
34
35impl OutlierScore {
36 pub fn no_baseline(threshold: f32) -> Self {
37 Self {
38 z_score: 0.0,
39 threshold,
40 is_outlier: false,
41 dims_flagged: 0,
42 baseline_n: 0,
43 }
44 }
45}
46
47pub fn score_embedding_outlier(
59 record: &MemoryRecord,
60 baseline: &EmbeddingBaseline,
61 threshold: f32,
62) -> OutlierScore {
63 let Some(embedding) = record.embedding.as_ref() else {
64 return OutlierScore::no_baseline(threshold);
65 };
66 if embedding.len() != baseline.mu.len() || embedding.len() != baseline.cov_diag.len() {
67 return OutlierScore::no_baseline(threshold);
68 }
69 if baseline.n < MIN_BASELINE_SAMPLES {
70 return OutlierScore {
71 z_score: 0.0,
72 threshold,
73 is_outlier: false,
74 dims_flagged: 0,
75 baseline_n: baseline.n,
76 };
77 }
78
79 let d = embedding.len() as f32;
80 let mut sum_sq = 0.0f32;
81 let mut dims_flagged: u32 = 0;
82 for (i, &x) in embedding.iter().enumerate() {
83 let diff = x - baseline.mu[i];
84 let var = baseline.cov_diag[i].max(VARIANCE_FLOOR);
85 let sq_z = (diff * diff) / var;
86 if sq_z >= 9.0 {
87 dims_flagged += 1;
88 }
89 sum_sq += sq_z;
90 }
91 let z_score = (sum_sq / d).sqrt();
92 OutlierScore {
93 z_score,
94 threshold,
95 is_outlier: z_score >= threshold,
96 dims_flagged,
97 baseline_n: baseline.n,
98 }
99}
100
101pub fn train_baseline(agent_id: &str, records: &[MemoryRecord]) -> Option<EmbeddingBaseline> {
110 let mut records_with_emb = records
111 .iter()
112 .filter_map(|r| r.embedding.as_ref().map(|e| (r, e)));
113
114 let (_first_record, first_emb) = records_with_emb.next()?;
115 let d = first_emb.len();
116 if d == 0 {
117 return None;
118 }
119 let mut count: u64 = 1;
120 let mut mean: Vec<f32> = first_emb.clone();
121 let mut m2: Vec<f32> = vec![0.0; d];
122
123 for (_r, emb) in records_with_emb {
124 if emb.len() != d {
125 continue; }
127 count += 1;
128 let n = count as f32;
129 for i in 0..d {
130 let x = emb[i];
131 let delta = x - mean[i];
132 mean[i] += delta / n;
133 let delta2 = x - mean[i];
134 m2[i] += delta * delta2;
135 }
136 }
137
138 if count < 2 {
139 return None;
140 }
141
142 let divisor = (count - 1) as f32;
143 let cov_diag: Vec<f32> = m2.iter().map(|v| v / divisor).collect();
144
145 Some(EmbeddingBaseline {
146 agent_id: agent_id.to_string(),
147 mu: mean,
148 cov_diag,
149 n: count,
150 updated_at: chrono::Utc::now().to_rfc3339(),
151 })
152}
153
#[cfg(test)]
mod tests {
    use super::*;
    use crate::model::memory::MemoryRecord;

    /// Creates a record owned by `test-agent` that carries `embedding`.
    fn record_with_embedding(embedding: Vec<f32>) -> MemoryRecord {
        let mut record = MemoryRecord::new("test-agent".to_string(), "x".to_string());
        record.embedding = Some(embedding);
        record
    }

    /// Generates `n` deterministic `d`-dimensional records scattered around `mean`.
    ///
    /// The offset alternates in sign from record to record, its size is scaled
    /// by `stddev`, and every dimension gets a small index-dependent shift.
    fn make_records(mean: f32, stddev: f32, n: usize, d: usize) -> Vec<MemoryRecord> {
        (0..n)
            .map(|i| {
                let sign: f32 = if i % 2 == 0 { 1.0 } else { -1.0 };
                let magnitude = stddev * ((i as f32 / n as f32).sin().abs() + 0.5);
                let centre = mean + sign * magnitude;
                let embedding: Vec<f32> = (0..d).map(|k| centre + k as f32 * 0.001).collect();
                record_with_embedding(embedding)
            })
            .collect()
    }

    /// Training on well-formed records yields a baseline of the right shape.
    #[test]
    fn trains_baseline_from_records() {
        let corpus = make_records(0.1, 0.05, 40, 8);
        let trained = train_baseline("test-agent", &corpus).expect("baseline");
        assert_eq!(trained.mu.len(), 8);
        assert_eq!(trained.cov_diag.len(), 8);
        assert_eq!(trained.n, 40);
        assert_eq!(trained.agent_id, "test-agent");
    }

    /// A record set without any embedding cannot produce a baseline.
    #[test]
    fn returns_none_on_no_embeddings() {
        let mut bare = record_with_embedding(vec![0.1; 4]);
        bare.embedding = None;
        let trained = train_baseline("a", &[bare]);
        assert!(trained.is_none());
    }

    /// A record drawn from the training set must not be reported as an outlier.
    #[test]
    fn in_distribution_not_flagged() {
        let corpus = make_records(0.1, 0.05, 60, 16);
        let trained = train_baseline("a", &corpus).unwrap();
        let result = score_embedding_outlier(&corpus[5], &trained, 3.0);
        assert!(
            !result.is_outlier,
            "in-distribution record flagged: z={} dims_flagged={}",
            result.z_score, result.dims_flagged
        );
    }

    /// An embedding pushed far away from the baseline mean must be flagged.
    #[test]
    fn far_out_of_distribution_flagged() {
        let corpus = make_records(0.1, 0.05, 60, 16);
        let trained = train_baseline("a", &corpus).unwrap();

        let first_mean = trained.mu[0];
        let first_stddev = trained.cov_diag[0].sqrt();
        let pushed = first_mean + 50.0 * first_stddev.max(0.01);

        let mut attacker = corpus[0].clone();
        attacker.embedding = Some(vec![pushed; 16]);

        let result = score_embedding_outlier(&attacker, &trained, 3.0);
        assert!(
            result.is_outlier,
            "far-OOD record not flagged: z={} threshold={}",
            result.z_score, result.threshold
        );
    }

    /// A baseline trained on too few samples never reports an outlier.
    #[test]
    fn noisy_baseline_pins_is_outlier_false() {
        let corpus = make_records(0.1, 0.05, 5, 8);
        let trained = train_baseline("a", &corpus).unwrap();
        let result = score_embedding_outlier(&corpus[0], &trained, 3.0);
        assert!(
            !result.is_outlier,
            "noisy baseline should pin is_outlier=false"
        );
        assert!(result.baseline_n < MIN_BASELINE_SAMPLES);
    }

    /// An embedding whose length differs from the baseline's gets the no-op score.
    #[test]
    fn dim_mismatch_returns_no_op() {
        let corpus = make_records(0.1, 0.05, 40, 8);
        let trained = train_baseline("a", &corpus).unwrap();

        let mut wrong_dims = corpus[0].clone();
        wrong_dims.embedding = Some(vec![0.1; 16]);

        let result = score_embedding_outlier(&wrong_dims, &trained, 3.0);
        assert_eq!(result.z_score, 0.0);
        assert!(!result.is_outlier);
    }
}