Skip to main content

entrenar/monitor/inference/safety_andon/
andon.rs

1//! Safety Andon for inference monitoring
2
3use super::emergency::EmergencyCondition;
4use super::sil::SafetyIntegrityLevel;
5use crate::monitor::andon::{Alert, AndonSystem};
6use crate::monitor::inference::path::DecisionPath;
7use crate::monitor::inference::trace::DecisionTrace;
8
9/// Safety Andon for inference monitoring
10///
11/// # Features
12/// - Confidence monitoring
13/// - Latency monitoring
14/// - Emergency condition detection
15/// - Integration with base Andon system
16pub struct SafetyAndon {
17    /// Base Andon system
18    andon: AndonSystem,
19    /// Safety integrity level
20    sil: SafetyIntegrityLevel,
21    /// Minimum acceptable confidence
22    pub(crate) min_confidence: f32,
23    /// Maximum acceptable latency in nanoseconds
24    pub(crate) max_latency_ns: u64,
25    /// Consecutive low-confidence counter
26    low_confidence_count: usize,
27    /// Threshold for low confidence alert
28    pub(crate) low_confidence_threshold: usize,
29    /// Alert on unknown classification
30    pub(crate) alert_on_unknown: bool,
31}
32
33impl SafetyAndon {
34    /// Create a new safety Andon system
35    pub fn new(sil: SafetyIntegrityLevel) -> Self {
36        Self {
37            andon: AndonSystem::new(),
38            min_confidence: sil.min_confidence(),
39            max_latency_ns: sil.max_latency_ns(),
40            sil,
41            low_confidence_count: 0,
42            low_confidence_threshold: 5,
43            alert_on_unknown: true,
44        }
45    }
46
47    /// Set custom confidence threshold
48    pub fn with_min_confidence(mut self, threshold: f32) -> Self {
49        self.min_confidence = threshold;
50        self
51    }
52
53    /// Set custom latency threshold
54    pub fn with_max_latency_ns(mut self, max_ns: u64) -> Self {
55        self.max_latency_ns = max_ns;
56        self
57    }
58
59    /// Set consecutive low-confidence threshold
60    pub fn with_low_confidence_threshold(mut self, threshold: usize) -> Self {
61        self.low_confidence_threshold = threshold;
62        self
63    }
64
65    /// Disable unknown classification alerts
66    pub fn without_unknown_alerts(mut self) -> Self {
67        self.alert_on_unknown = false;
68        self
69    }
70
71    /// Check a trace against safety rules
72    pub fn check_trace<P: DecisionPath>(
73        &mut self,
74        trace: &DecisionTrace<P>,
75        latency_budget_ns: u64,
76    ) {
77        let confidence = trace.confidence();
78        let latency_ns = trace.latency_ns;
79
80        // Check for invalid output
81        if trace.output.is_nan() || trace.output.is_infinite() {
82            self.trigger_emergency(EmergencyCondition::InvalidOutput);
83            return;
84        }
85
86        // Check confidence
87        if confidence < self.min_confidence {
88            self.low_confidence_count += 1;
89
90            if self.low_confidence_count >= self.low_confidence_threshold {
91                self.trigger_emergency(EmergencyCondition::ConsecutiveLowConfidence {
92                    count: self.low_confidence_count,
93                    threshold: self.min_confidence,
94                });
95            } else {
96                self.andon.trigger(
97                    Alert::warning(format!(
98                        "Low confidence: {:.1}% (threshold: {:.1}%)",
99                        confidence * 100.0,
100                        self.min_confidence * 100.0
101                    ))
102                    .with_source("SafetyAndon")
103                    .with_value(f64::from(confidence)),
104                );
105            }
106        } else {
107            self.low_confidence_count = 0;
108        }
109
110        // Check latency
111        let effective_budget = latency_budget_ns.min(self.max_latency_ns);
112        if latency_ns > effective_budget {
113            let latency_ms = latency_ns as f64 / 1_000_000.0;
114            let budget_ms = effective_budget as f64 / 1_000_000.0;
115
116            if latency_ns > self.max_latency_ns * 2 {
117                // Critical: more than 2x over budget
118                self.trigger_emergency(EmergencyCondition::DecisionTimeout {
119                    max_ms: budget_ms as f32,
120                });
121            } else {
122                self.andon.trigger(
123                    Alert::warning(format!(
124                        "Latency exceeded: {latency_ms:.2}ms > {budget_ms:.2}ms budget"
125                    ))
126                    .with_source("SafetyAndon")
127                    .with_value(latency_ms),
128                );
129            }
130        }
131    }
132
133    /// Trigger an emergency condition
134    pub fn trigger_emergency(&mut self, condition: EmergencyCondition) {
135        let alert =
136            Alert::new(condition.alert_level(), condition.message()).with_source("SafetyAndon");
137        self.andon.trigger(alert);
138    }
139
140    /// Check if stop has been requested
141    pub fn should_stop(&self) -> bool {
142        self.andon.should_stop()
143    }
144
145    /// Reset the Andon system
146    pub fn reset(&mut self) {
147        self.andon.reset();
148        self.low_confidence_count = 0;
149    }
150
151    /// Get alert history
152    pub fn history(&self) -> &[Alert] {
153        self.andon.history()
154    }
155
156    /// Get the safety integrity level
157    pub fn sil(&self) -> SafetyIntegrityLevel {
158        self.sil
159    }
160
161    /// Get reference to the underlying Andon system
162    pub fn andon(&self) -> &AndonSystem {
163        &self.andon
164    }
165}
166
167impl Default for SafetyAndon {
168    fn default() -> Self {
169        Self::new(SafetyIntegrityLevel::QM)
170    }
171}