zeph_tools/
anomaly.rs

// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
// SPDX-License-Identifier: MIT OR Apache-2.0

//! Sliding-window anomaly detection for tool execution patterns.
5
6use std::collections::VecDeque;
7
/// Severity of a detected anomaly.
///
/// Variants are declared in ascending order of severity, so the derived
/// ordering gives `Warning < Critical`. This lets callers compare severities
/// or keep the worst one seen with `max`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum AnomalySeverity {
    /// The failure ratio reached the warning threshold but not the critical one.
    Warning,
    /// The failure ratio reached the critical threshold.
    Critical,
}
14
15/// A detected anomaly in tool execution patterns.
16#[derive(Debug, Clone)]
17pub struct Anomaly {
18    pub severity: AnomalySeverity,
19    pub description: String,
20}
21
22/// Tracks recent tool execution outcomes and detects anomalous patterns.
23#[derive(Debug)]
24pub struct AnomalyDetector {
25    window: VecDeque<Outcome>,
26    window_size: usize,
27    error_threshold: f64,
28    critical_threshold: f64,
29}
30
/// Result of a single tool execution as stored in the detector window.
#[derive(Debug, Clone, Copy)]
enum Outcome {
    /// The tool ran successfully.
    Success,
    /// The tool execution failed.
    Error,
    /// The tool execution was blocked.
    Blocked,
}
37
38impl AnomalyDetector {
39    #[must_use]
40    pub fn new(window_size: usize, error_threshold: f64, critical_threshold: f64) -> Self {
41        Self {
42            window: VecDeque::with_capacity(window_size),
43            window_size,
44            error_threshold,
45            critical_threshold,
46        }
47    }
48
49    /// Record a successful tool execution.
50    pub fn record_success(&mut self) {
51        self.push(Outcome::Success);
52    }
53
54    /// Record a failed tool execution.
55    pub fn record_error(&mut self) {
56        self.push(Outcome::Error);
57    }
58
59    /// Record a blocked tool execution.
60    pub fn record_blocked(&mut self) {
61        self.push(Outcome::Blocked);
62    }
63
64    /// Record a quality failure (`ToolNotFound`, `InvalidParameters`, `TypeMismatch`) that
65    /// originated from a reasoning-enhanced model. Counts as an error for anomaly
66    /// detection purposes and logs a `reasoning_amplification` warning.
67    ///
68    /// Per arXiv:2510.22977, reasoning models amplify tool hallucinations — this
69    /// method makes such failures visible in the anomaly window.
70    pub fn record_reasoning_quality_failure(&mut self, model_name: &str, tool_name: &str) {
71        self.push(Outcome::Error);
72        tracing::warn!(
73            model = model_name,
74            tool = tool_name,
75            category = "reasoning_amplification",
76            "quality failure from reasoning model — CoT may amplify tool hallucination (arXiv:2510.22977)"
77        );
78    }
79
80    fn push(&mut self, outcome: Outcome) {
81        if self.window.len() >= self.window_size {
82            self.window.pop_front();
83        }
84        self.window.push_back(outcome);
85    }
86
87    /// Check the current window for anomalies.
88    #[must_use]
89    #[allow(clippy::cast_precision_loss)]
90    pub fn check(&self) -> Option<Anomaly> {
91        if self.window.len() < 3 {
92            return None;
93        }
94
95        let total = self.window.len();
96        let errors = self
97            .window
98            .iter()
99            .filter(|o| matches!(o, Outcome::Error | Outcome::Blocked))
100            .count();
101
102        let ratio = errors as f64 / total as f64;
103
104        if ratio >= self.critical_threshold {
105            Some(Anomaly {
106                severity: AnomalySeverity::Critical,
107                description: format!(
108                    "error rate {:.0}% ({errors}/{total}) exceeds critical threshold",
109                    ratio * 100.0,
110                ),
111            })
112        } else if ratio >= self.error_threshold {
113            Some(Anomaly {
114                severity: AnomalySeverity::Warning,
115                description: format!(
116                    "error rate {:.0}% ({errors}/{total}) exceeds warning threshold",
117                    ratio * 100.0,
118                ),
119            })
120        } else {
121            None
122        }
123    }
124
125    /// Reset the sliding window.
126    pub fn reset(&mut self) {
127        self.window.clear();
128    }
129}
130
131impl Default for AnomalyDetector {
132    fn default() -> Self {
133        Self::new(10, 0.5, 0.8)
134    }
135}
136
/// Returns `true` when `model_name` matches a known reasoning-enhanced model pattern.
///
/// Reasoning models (o1, o3, o4-mini, `QwQ`, `DeepSeek-R1`, etc.) are more prone to
/// tool hallucination than standard models per arXiv:2510.22977. This helper enables
/// callers to conditionally emit `reasoning_amplification` warnings.
///
/// Matching is ASCII case-insensitive. Families recognised by a name prefix
/// (OpenAI o-series, Claude extended thinking) are tested against both the full
/// name and its final `/`-separated segment, so provider-qualified ids such as
/// `openai/o3-mini` are recognised as well.
#[must_use]
pub fn is_reasoning_model(model_name: &str) -> bool {
    let lower = model_name.to_ascii_lowercase();
    // Drop any provider/namespace prefix: "openai/o3-mini" -> "o3-mini".
    let base = lower
        .rsplit_once('/')
        .map_or(lower.as_str(), |(_, tail)| tail);
    // Checking the full name too keeps every previously matching input matching.
    let candidates = [lower.as_str(), base];

    // OpenAI o-series: o1, o3, o4-mini, o1-mini, o1-preview, o3-mini
    let openai_o = candidates.iter().any(|&name| {
        ["o1", "o3", "o4"]
            .iter()
            .any(|&prefix| name.starts_with(prefix))
    });
    // QwQ reasoning models
    let qwq = lower.contains("qwq");
    // DeepSeek R1 and variants
    let deepseek_r1 = lower.contains("deepseek-r1") || lower.contains("deepseek_r1");
    // Claude extended thinking (prefixed with "claude" and contains "think")
    let claude_think = candidates
        .iter()
        .any(|&name| name.starts_with("claude") && name.contains("think"));

    openai_o || qwq || deepseek_r1 || claude_think
}
155
#[cfg(test)]
mod tests {
    use super::*;

    /// Detector with the given window and the 0.5 / 0.8 thresholds used by most tests.
    fn detector(window_size: usize) -> AnomalyDetector {
        AnomalyDetector::new(window_size, 0.5, 0.8)
    }

    /// Applies `record` to `det` exactly `times` times.
    fn repeat(det: &mut AnomalyDetector, times: usize, record: fn(&mut AnomalyDetector)) {
        (0..times).for_each(|_| record(det));
    }

    /// Severity currently reported by `det`, if any.
    fn severity_of(det: &AnomalyDetector) -> Option<AnomalySeverity> {
        det.check().map(|anomaly| anomaly.severity)
    }

    #[test]
    fn no_anomaly_on_success() {
        let mut det = AnomalyDetector::default();
        repeat(&mut det, 10, AnomalyDetector::record_success);
        assert!(det.check().is_none());
    }

    #[test]
    fn warning_on_half_errors() {
        let mut det = detector(10);
        repeat(&mut det, 5, AnomalyDetector::record_success);
        repeat(&mut det, 5, AnomalyDetector::record_error);
        assert_eq!(severity_of(&det), Some(AnomalySeverity::Warning));
    }

    #[test]
    fn critical_on_high_errors() {
        let mut det = detector(10);
        repeat(&mut det, 2, AnomalyDetector::record_success);
        repeat(&mut det, 8, AnomalyDetector::record_error);
        assert_eq!(severity_of(&det), Some(AnomalySeverity::Critical));
    }

    #[test]
    fn blocked_counts_as_error() {
        let mut det = detector(10);
        repeat(&mut det, 2, AnomalyDetector::record_success);
        repeat(&mut det, 8, AnomalyDetector::record_blocked);
        assert_eq!(severity_of(&det), Some(AnomalySeverity::Critical));
    }

    #[test]
    fn window_slides() {
        let mut det = detector(5);
        repeat(&mut det, 5, AnomalyDetector::record_error);
        assert!(det.check().is_some());

        // Five successes push every earlier error out of the window.
        repeat(&mut det, 5, AnomalyDetector::record_success);
        assert!(det.check().is_none());
    }

    #[test]
    fn too_few_samples_returns_none() {
        let mut det = AnomalyDetector::default();
        repeat(&mut det, 2, AnomalyDetector::record_error);
        assert!(det.check().is_none());
    }

    #[test]
    fn reset_clears_window() {
        let mut det = detector(5);
        repeat(&mut det, 5, AnomalyDetector::record_error);
        assert!(det.check().is_some());

        det.reset();
        assert!(det.check().is_none());
    }

    #[test]
    fn default_thresholds() {
        let det = AnomalyDetector::default();
        assert_eq!(det.window_size, 10);
        assert!((det.error_threshold - 0.5).abs() < f64::EPSILON);
        assert!((det.critical_threshold - 0.8).abs() < f64::EPSILON);
    }

    #[test]
    fn is_reasoning_model_openai_o_series() {
        let reasoning = ["o1", "o1-mini", "o1-preview", "o3", "o3-mini", "o4-mini"];
        for name in reasoning {
            assert!(is_reasoning_model(name));
        }

        let standard = ["gpt-4o", "gpt-4o-mini"];
        for name in standard {
            assert!(!is_reasoning_model(name));
        }
    }

    #[test]
    fn is_reasoning_model_other_families() {
        let reasoning = [
            "QwQ-32B",
            "deepseek-r1",
            "deepseek-r1-distill-qwen-14b",
            "claude-3-opus-think",
        ];
        for name in reasoning {
            assert!(is_reasoning_model(name));
        }

        let standard = ["claude-3-opus", "qwen2.5:14b"];
        for name in standard {
            assert!(!is_reasoning_model(name));
        }
    }

    #[test]
    fn record_reasoning_quality_failure_increments_error_count() {
        let mut det = detector(10);
        // Six reasoning quality failures in a window of ten.
        for _ in 0..6 {
            det.record_reasoning_quality_failure("o1", "shell");
        }
        // 6/6 = 100%, which is above the critical threshold.
        assert_eq!(severity_of(&det), Some(AnomalySeverity::Critical));
    }
}
zeph_tools/anomaly.rs

zeph_tools/
anomaly.rs