Skip to main content

zeph_tools/
anomaly.rs

1// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
2// SPDX-License-Identifier: MIT OR Apache-2.0
3
4//! Sliding-window anomaly detection for tool execution patterns.
5
6use std::collections::VecDeque;
7
/// How serious a detected anomaly is.
///
/// Marked `#[non_exhaustive]`, so code outside this crate must keep a wildcard
/// arm when matching on it.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum AnomalySeverity {
    /// The failure ratio reached the detector's warning threshold.
    Warning,
    /// The failure ratio reached the detector's critical threshold.
    Critical,
}
15
16/// A detected anomaly in tool execution patterns.
17#[derive(Debug, Clone)]
18pub struct Anomaly {
19    pub severity: AnomalySeverity,
20    pub description: String,
21}
22
23/// Tracks recent tool execution outcomes and detects anomalous patterns.
24#[derive(Debug)]
25pub struct AnomalyDetector {
26    window: VecDeque<Outcome>,
27    window_size: usize,
28    error_threshold: f64,
29    critical_threshold: f64,
30}
31
/// Result of a single tool execution as stored in the sliding window.
#[derive(Debug, Clone, Copy)]
enum Outcome {
    /// The tool ran successfully.
    Success,
    /// The tool execution failed.
    Error,
    /// The tool execution was blocked.
    Blocked,
}
38
39impl AnomalyDetector {
40    #[must_use]
41    pub fn new(window_size: usize, error_threshold: f64, critical_threshold: f64) -> Self {
42        Self {
43            window: VecDeque::with_capacity(window_size),
44            window_size,
45            error_threshold,
46            critical_threshold,
47        }
48    }
49
50    /// Record a successful tool execution.
51    pub fn record_success(&mut self) {
52        self.push(Outcome::Success);
53    }
54
55    /// Record a failed tool execution.
56    pub fn record_error(&mut self) {
57        self.push(Outcome::Error);
58    }
59
60    /// Record a blocked tool execution.
61    pub fn record_blocked(&mut self) {
62        self.push(Outcome::Blocked);
63    }
64
65    /// Record a quality failure (`ToolNotFound`, `InvalidParameters`, `TypeMismatch`) that
66    /// originated from a reasoning-enhanced model. Counts as an error for anomaly
67    /// detection purposes and logs a `reasoning_amplification` warning.
68    ///
69    /// Per arXiv:2510.22977, reasoning models amplify tool hallucinations — this
70    /// method makes such failures visible in the anomaly window.
71    pub fn record_reasoning_quality_failure(&mut self, model_name: &str, tool_name: &str) {
72        self.push(Outcome::Error);
73        tracing::warn!(
74            model = model_name,
75            tool = tool_name,
76            category = "reasoning_amplification",
77            "quality failure from reasoning model — CoT may amplify tool hallucination (arXiv:2510.22977)"
78        );
79    }
80
81    fn push(&mut self, outcome: Outcome) {
82        if self.window.len() >= self.window_size {
83            self.window.pop_front();
84        }
85        self.window.push_back(outcome);
86    }
87
88    /// Check the current window for anomalies.
89    #[must_use]
90    #[allow(clippy::cast_precision_loss)]
91    pub fn check(&self) -> Option<Anomaly> {
92        if self.window.len() < 3 {
93            return None;
94        }
95
96        let total = self.window.len();
97        let errors = self
98            .window
99            .iter()
100            .filter(|o| matches!(o, Outcome::Error | Outcome::Blocked))
101            .count();
102
103        let ratio = errors as f64 / total as f64;
104
105        if ratio >= self.critical_threshold {
106            Some(Anomaly {
107                severity: AnomalySeverity::Critical,
108                description: format!(
109                    "error rate {:.0}% ({errors}/{total}) exceeds critical threshold",
110                    ratio * 100.0,
111                ),
112            })
113        } else if ratio >= self.error_threshold {
114            Some(Anomaly {
115                severity: AnomalySeverity::Warning,
116                description: format!(
117                    "error rate {:.0}% ({errors}/{total}) exceeds warning threshold",
118                    ratio * 100.0,
119                ),
120            })
121        } else {
122            None
123        }
124    }
125
126    /// Reset the sliding window.
127    pub fn reset(&mut self) {
128        self.window.clear();
129    }
130}
131
132impl Default for AnomalyDetector {
133    fn default() -> Self {
134        Self::new(10, 0.5, 0.8)
135    }
136}
137
/// Returns `true` when `model_name` matches a known reasoning-enhanced model pattern.
///
/// Reasoning models (o1, o3, o4-mini, `QwQ`, `DeepSeek-R1`, etc.) are more prone to
/// tool hallucination than standard models per arXiv:2510.22977. This helper enables
/// callers to conditionally emit `reasoning_amplification` warnings.
///
/// Matching is ASCII case-insensitive. Family-prefix checks (OpenAI o-series and
/// Claude) are applied both to the whole name and to the part after the last `/`,
/// so provider-qualified identifiers such as `openai/o3-mini` are recognised too.
#[must_use]
pub fn is_reasoning_model(model_name: &str) -> bool {
    let lower = model_name.to_ascii_lowercase();
    // Provider-qualified names put the provider first ("openai/o3-mini"); the bare
    // model id is whatever follows the last '/'. Without a '/' this is the whole name.
    let base = lower
        .rsplit_once('/')
        .map_or(lower.as_str(), |(_, tail)| tail);
    // Checking the full name as well keeps every previously matched name matching.
    let has_prefix = |prefix: &str| lower.starts_with(prefix) || base.starts_with(prefix);

    // OpenAI o-series: o1, o3, o4-mini, o1-mini, o1-preview, o3-mini
    let openai_o = ["o1", "o3", "o4"].iter().any(|&prefix| has_prefix(prefix));
    // QwQ reasoning models
    let qwq = lower.contains("qwq");
    // DeepSeek R1 and variants
    let deepseek_r1 = lower.contains("deepseek-r1") || lower.contains("deepseek_r1");
    // Claude extended thinking (model id starts with "claude" and contains "think")
    let claude_think = has_prefix("claude") && lower.contains("think");

    openai_o || qwq || deepseek_r1 || claude_think
}
156
#[cfg(test)]
mod tests {
    use super::*;

    /// Detector using the thresholds shared by these tests (warning 0.5, critical 0.8).
    fn detector(window_size: usize) -> AnomalyDetector {
        AnomalyDetector::new(window_size, 0.5, 0.8)
    }

    /// Calls `record` exactly `times` times.
    fn repeat(times: usize, mut record: impl FnMut()) {
        (0..times).for_each(|_| record());
    }

    /// Severity currently reported by `check`, if any.
    fn severity(det: &AnomalyDetector) -> Option<AnomalySeverity> {
        det.check().map(|anomaly| anomaly.severity)
    }

    #[test]
    fn no_anomaly_on_success() {
        let mut det = AnomalyDetector::default();
        repeat(10, || det.record_success());
        assert!(det.check().is_none());
    }

    #[test]
    fn warning_on_half_errors() {
        let mut det = detector(10);
        repeat(5, || det.record_success());
        repeat(5, || det.record_error());
        assert_eq!(severity(&det), Some(AnomalySeverity::Warning));
    }

    #[test]
    fn critical_on_high_errors() {
        let mut det = detector(10);
        repeat(2, || det.record_success());
        repeat(8, || det.record_error());
        assert_eq!(severity(&det), Some(AnomalySeverity::Critical));
    }

    #[test]
    fn blocked_counts_as_error() {
        let mut det = detector(10);
        repeat(2, || det.record_success());
        repeat(8, || det.record_blocked());
        assert_eq!(severity(&det), Some(AnomalySeverity::Critical));
    }

    #[test]
    fn window_slides() {
        let mut det = detector(5);
        repeat(5, || det.record_error());
        assert!(det.check().is_some());

        // Five successes push every earlier error out of the window.
        repeat(5, || det.record_success());
        assert!(det.check().is_none());
    }

    #[test]
    fn too_few_samples_returns_none() {
        let mut det = AnomalyDetector::default();
        repeat(2, || det.record_error());
        assert!(det.check().is_none());
    }

    #[test]
    fn reset_clears_window() {
        let mut det = detector(5);
        repeat(5, || det.record_error());
        assert!(det.check().is_some());

        det.reset();
        assert!(det.check().is_none());
    }

    #[test]
    fn default_thresholds() {
        let det = AnomalyDetector::default();
        assert_eq!(det.window_size, 10);
        assert!((det.error_threshold - 0.5).abs() < f64::EPSILON);
        assert!((det.critical_threshold - 0.8).abs() < f64::EPSILON);
    }

    #[test]
    fn is_reasoning_model_openai_o_series() {
        for name in ["o1", "o1-mini", "o1-preview", "o3", "o3-mini", "o4-mini"] {
            assert!(is_reasoning_model(name), "{name} should be a reasoning model");
        }
        for name in ["gpt-4o", "gpt-4o-mini"] {
            assert!(!is_reasoning_model(name), "{name} should not be a reasoning model");
        }
    }

    #[test]
    fn is_reasoning_model_other_families() {
        for name in [
            "QwQ-32B",
            "deepseek-r1",
            "deepseek-r1-distill-qwen-14b",
            "claude-3-opus-think",
        ] {
            assert!(is_reasoning_model(name), "{name} should be a reasoning model");
        }
        for name in ["claude-3-opus", "qwen2.5:14b"] {
            assert!(!is_reasoning_model(name), "{name} should not be a reasoning model");
        }
    }

    #[test]
    fn record_reasoning_quality_failure_increments_error_count() {
        let mut det = detector(10);
        // Six reasoning quality failures in a window of ten: 6/6 = 100%,
        // which is above the critical threshold.
        repeat(6, || det.record_reasoning_quality_failure("o1", "shell"));
        assert_eq!(severity(&det), Some(AnomalySeverity::Critical));
    }
}