Skip to main content

entrenar/monitor/
andon.rs

//! Andon Alerting System (ENT-045)
//!
//! Toyota Way 自働化 (Jidoka): automation with a human touch.
//! Automatically detect abnormalities, stop training, and alert humans.
5
6use std::sync::atomic::{AtomicBool, Ordering};
7use std::sync::Arc;
8
/// Severity of an alert, listed from least to most severe.
///
/// The derived `PartialOrd`/`Ord` follow declaration order, so
/// `Info < Warning < Error < Critical`; keep the variants sorted by severity
/// when adding new ones. `Hash` is derived so a level can be used as a
/// `HashMap`/`HashSet` key (for example to count alerts per level).
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum AlertLevel {
    /// Informational (training started/completed)
    Info,
    /// Warning (accuracy dip, gradient spike)
    Warning,
    /// Error (training diverged, NaN detected)
    Error,
    /// Critical (all experts failed, system fault)
    Critical,
}
21
22impl AlertLevel {
23    /// Get string representation
24    pub fn as_str(&self) -> &'static str {
25        match self {
26            AlertLevel::Info => "INFO",
27            AlertLevel::Warning => "WARNING",
28            AlertLevel::Error => "ERROR",
29            AlertLevel::Critical => "CRITICAL",
30        }
31    }
32
33    /// Get emoji for display
34    pub fn emoji(&self) -> &'static str {
35        match self {
36            AlertLevel::Info => "ℹ️",
37            AlertLevel::Warning => "⚠️",
38            AlertLevel::Error => "❌",
39            AlertLevel::Critical => "🛑",
40        }
41    }
42}
43
44/// An alert triggered by the monitoring system
45#[derive(Debug, Clone)]
46pub struct Alert {
47    /// Severity level
48    pub level: AlertLevel,
49    /// Alert message
50    pub message: String,
51    /// Source of the alert
52    pub source: String,
53    /// Timestamp (unix millis)
54    pub timestamp: u64,
55    /// Optional metric value that triggered the alert
56    pub value: Option<f64>,
57}
58
59impl Alert {
60    /// Create a new alert
61    pub fn new(level: AlertLevel, message: impl Into<String>) -> Self {
62        let timestamp = std::time::SystemTime::now()
63            .duration_since(std::time::UNIX_EPOCH)
64            .map(|d| d.as_millis() as u64)
65            .unwrap_or(0);
66
67        Self { level, message: message.into(), source: String::new(), timestamp, value: None }
68    }
69
70    /// Set the source
71    pub fn with_source(mut self, source: impl Into<String>) -> Self {
72        self.source = source.into();
73        self
74    }
75
76    /// Set the value
77    pub fn with_value(mut self, value: f64) -> Self {
78        self.value = Some(value);
79        self
80    }
81
82    /// Create an info alert
83    pub fn info(message: impl Into<String>) -> Self {
84        Self::new(AlertLevel::Info, message)
85    }
86
87    /// Create a warning alert
88    pub fn warning(message: impl Into<String>) -> Self {
89        Self::new(AlertLevel::Warning, message)
90    }
91
92    /// Create an error alert
93    pub fn error(message: impl Into<String>) -> Self {
94        Self::new(AlertLevel::Error, message)
95    }
96
97    /// Create a critical alert
98    pub fn critical(message: impl Into<String>) -> Self {
99        Self::new(AlertLevel::Critical, message)
100    }
101}
102
/// Settings that control how the Andon system reacts to alerts.
#[derive(Debug, Clone)]
pub struct AndonConfig {
    /// Whether an error-level alert stops training
    pub stop_on_error: bool,
    /// Whether a critical-level alert stops training
    pub stop_on_critical: bool,
    /// Whether alerts are written to stderr
    pub log_alerts: bool,
}
113
114impl Default for AndonConfig {
115    fn default() -> Self {
116        Self { stop_on_error: true, stop_on_critical: true, log_alerts: true }
117    }
118}
119
120/// Andon system for training monitoring
121///
122/// Toyota Way principle: 自働化 (Jidoka) - build in quality
123#[derive(Debug)]
124pub struct AndonSystem {
125    config: AndonConfig,
126    stop_flag: Arc<AtomicBool>,
127    alert_history: Vec<Alert>,
128}
129
130impl AndonSystem {
131    /// Create a new Andon system
132    pub fn new() -> Self {
133        Self::with_config(AndonConfig::default())
134    }
135
136    /// Create with custom config
137    pub fn with_config(config: AndonConfig) -> Self {
138        Self { config, stop_flag: Arc::new(AtomicBool::new(false)), alert_history: Vec::new() }
139    }
140
141    /// Get a clone of the stop flag for checking in training loop
142    pub fn stop_flag(&self) -> Arc<AtomicBool> {
143        Arc::clone(&self.stop_flag)
144    }
145
146    /// Check if stop has been requested
147    pub fn should_stop(&self) -> bool {
148        self.stop_flag.load(Ordering::SeqCst)
149    }
150
151    /// Trigger an alert
152    pub fn trigger(&mut self, alert: Alert) {
153        // Log if configured
154        if self.config.log_alerts {
155            eprintln!(
156                "{} [{}] {}: {}",
157                alert.level.emoji(),
158                alert.level.as_str(),
159                alert.source,
160                alert.message
161            );
162        }
163
164        // Check if we should stop
165        let should_stop = match alert.level {
166            AlertLevel::Critical => self.config.stop_on_critical,
167            AlertLevel::Error => self.config.stop_on_error,
168            AlertLevel::Info | AlertLevel::Warning => false,
169        };
170
171        if should_stop {
172            self.stop_flag.store(true, Ordering::SeqCst);
173            if self.config.log_alerts {
174                eprintln!("🛑 ANDON: Training stopped due to {} alert", alert.level.as_str());
175            }
176        }
177
178        // Store in history
179        self.alert_history.push(alert);
180    }
181
182    /// Trigger an info alert
183    pub fn info(&mut self, message: impl Into<String>) {
184        self.trigger(Alert::info(message));
185    }
186
187    /// Trigger a warning alert
188    pub fn warning(&mut self, message: impl Into<String>) {
189        self.trigger(Alert::warning(message));
190    }
191
192    /// Trigger an error alert
193    pub fn error(&mut self, message: impl Into<String>) {
194        self.trigger(Alert::error(message));
195    }
196
197    /// Trigger a critical alert
198    pub fn critical(&mut self, message: impl Into<String>) {
199        self.trigger(Alert::critical(message));
200    }
201
202    /// Get alert history
203    pub fn history(&self) -> &[Alert] {
204        &self.alert_history
205    }
206
207    /// Get alerts of a specific level
208    pub fn alerts_by_level(&self, level: AlertLevel) -> Vec<&Alert> {
209        self.alert_history.iter().filter(|a| a.level == level).collect()
210    }
211
212    /// Clear stop flag (for retry)
213    pub fn reset(&mut self) {
214        self.stop_flag.store(false, Ordering::SeqCst);
215    }
216}
217
218impl Default for AndonSystem {
219    fn default() -> Self {
220        Self::new()
221    }
222}
223
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_alert_levels_ordered() {
        assert!(AlertLevel::Info < AlertLevel::Warning);
        assert!(AlertLevel::Warning < AlertLevel::Error);
        assert!(AlertLevel::Error < AlertLevel::Critical);
    }

    #[test]
    fn test_alert_creation() {
        let alert = Alert::critical("Test failure").with_source("test").with_value(42.0);

        assert_eq!(alert.level, AlertLevel::Critical);
        assert_eq!(alert.message, "Test failure");
        assert_eq!(alert.source, "test");
        assert_eq!(alert.value, Some(42.0));
    }

    #[test]
    fn test_andon_new() {
        let andon = AndonSystem::new();
        assert!(!andon.should_stop());
        assert!(andon.history().is_empty());
    }

    #[test]
    fn test_andon_info_no_stop() {
        let mut andon = AndonSystem::new();
        andon.info("Training started");
        assert!(!andon.should_stop());
        assert_eq!(andon.history().len(), 1);
    }

    #[test]
    fn test_andon_warning_no_stop() {
        let mut andon = AndonSystem::new();
        andon.warning("Accuracy dip detected");
        assert!(!andon.should_stop());
    }

    #[test]
    fn test_andon_error_stops() {
        let mut andon = AndonSystem::new();
        andon.error("NaN detected in loss");
        assert!(andon.should_stop());
    }

    #[test]
    fn test_andon_critical_stops() {
        let mut andon = AndonSystem::new();
        andon.critical("All experts failed");
        assert!(andon.should_stop());
    }

    #[test]
    fn test_andon_reset() {
        let mut andon = AndonSystem::new();
        andon.critical("Test");
        assert!(andon.should_stop());
        andon.reset();
        assert!(!andon.should_stop());
        // `reset` only clears the flag; the alert that caused the stop is kept.
        assert_eq!(andon.history().len(), 1);
    }

    #[test]
    fn test_andon_configurable() {
        let config =
            AndonConfig { stop_on_error: false, stop_on_critical: true, log_alerts: false };
        let mut andon = AndonSystem::with_config(config);

        andon.error("This should not stop");
        assert!(!andon.should_stop());

        andon.critical("This should stop");
        assert!(andon.should_stop());
    }

    #[test]
    fn test_stop_flag_shared() {
        let mut andon = AndonSystem::new();
        let flag = andon.stop_flag();

        assert!(!flag.load(Ordering::SeqCst));
        andon.critical("Stop!");
        assert!(flag.load(Ordering::SeqCst));
    }

    #[test]
    fn test_alert_level_as_str() {
        assert_eq!(AlertLevel::Info.as_str(), "INFO");
        assert_eq!(AlertLevel::Warning.as_str(), "WARNING");
        assert_eq!(AlertLevel::Error.as_str(), "ERROR");
        assert_eq!(AlertLevel::Critical.as_str(), "CRITICAL");
    }

    #[test]
    fn test_alert_level_emoji() {
        assert_eq!(AlertLevel::Info.emoji(), "ℹ️");
        assert_eq!(AlertLevel::Warning.emoji(), "⚠️");
        assert_eq!(AlertLevel::Error.emoji(), "❌");
        assert_eq!(AlertLevel::Critical.emoji(), "🛑");
    }

    #[test]
    fn test_alerts_by_level() {
        let mut andon = AndonSystem::new();
        andon.info("Info 1");
        andon.warning("Warning 1");
        andon.info("Info 2");

        let info_alerts = andon.alerts_by_level(AlertLevel::Info);
        assert_eq!(info_alerts.len(), 2);

        let warning_alerts = andon.alerts_by_level(AlertLevel::Warning);
        assert_eq!(warning_alerts.len(), 1);
    }

    #[test]
    fn test_andon_default() {
        let andon = AndonSystem::default();
        assert!(!andon.should_stop());
    }

    #[test]
    fn test_alert_clone() {
        let alert = Alert::info("Test").with_source("test").with_value(1.0);
        let cloned = alert.clone();
        assert_eq!(alert.message, cloned.message);
        assert_eq!(alert.source, cloned.source);
        assert_eq!(alert.value, cloned.value);
    }

    #[test]
    // Calling `clone()` on a `Copy` type is deliberate: the test exists to
    // exercise the derived `Clone` impl as well as the implicit copy.
    #[allow(clippy::clone_on_copy)]
    fn test_alert_level_clone_copy() {
        let level = AlertLevel::Warning;
        let copied = level;
        let cloned = level.clone();
        assert_eq!(level, copied);
        assert_eq!(level, cloned);
    }

    #[test]
    fn test_trigger_should_stop_all_alert_level_variants() {
        let levels =
            [AlertLevel::Info, AlertLevel::Warning, AlertLevel::Error, AlertLevel::Critical];

        // Check every level against every stop-policy combination so that each
        // arm of the match in `trigger()` is driven by the config, not a constant.
        for stop_on_error in [false, true] {
            for stop_on_critical in [false, true] {
                for level in levels {
                    let expected = match level {
                        AlertLevel::Critical => stop_on_critical,
                        AlertLevel::Error => stop_on_error,
                        AlertLevel::Info | AlertLevel::Warning => false,
                    };

                    let config = AndonConfig { stop_on_error, stop_on_critical, log_alerts: false };
                    let mut andon = AndonSystem::with_config(config);
                    andon.trigger(Alert::new(level, "test"));

                    assert_eq!(
                        andon.should_stop(),
                        expected,
                        "level {level:?}, stop_on_error={stop_on_error}, \
                         stop_on_critical={stop_on_critical}"
                    );
                    // The alert is recorded whether or not it stops training.
                    assert_eq!(andon.history().len(), 1);
                }
            }
        }
    }
}