mockforge_chaos/alerts.rs

//! Alert system for chaos events

use crate::analytics::MetricsBucket;
use chrono::{DateTime, Utc};
use parking_lot::RwLock;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::sync::Arc;
use tracing::{debug, info, warn};

/// Alert severity level
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum AlertSeverity {
    Info,
    Warning,
    Critical,
}

/// Alert type
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type")]
pub enum AlertType {
    /// High event rate detected
    HighEventRate {
        events_per_minute: usize,
        threshold: usize,
    },
    /// High latency detected
    HighLatency {
        avg_latency_ms: f64,
        threshold_ms: u64,
    },
    /// High fault rate detected
    HighFaultRate {
        faults_per_minute: usize,
        threshold: usize,
    },
    /// Rate limit violations
    RateLimitViolations {
        violations_per_minute: usize,
        threshold: usize,
    },
    /// Endpoint under stress
    EndpointStress {
        endpoint: String,
        events_per_minute: usize,
        threshold: usize,
    },
    /// Chaos impact high
    HighImpact { severity_score: f64, threshold: f64 },
    /// Custom alert
    Custom {
        message: String,
        metadata: HashMap<String, String>,
    },
}

/// Alert
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Alert {
    /// Unique alert ID
    pub id: String,
    /// Alert timestamp
    pub timestamp: DateTime<Utc>,
    /// Alert severity
    pub severity: AlertSeverity,
    /// Alert type and details
    pub alert_type: AlertType,
    /// Alert message
    pub message: String,
    /// Is this alert resolved
    pub resolved: bool,
    /// Resolution timestamp
    pub resolved_at: Option<DateTime<Utc>>,
}

impl Alert {
    /// Create a new alert
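    ///
    /// A minimal sketch (the severity, type, and message are illustrative):
    ///
    /// ```ignore
    /// let alert = Alert::new(
    ///     AlertSeverity::Info,
    ///     AlertType::Custom {
    ///         message: "demo".to_string(),
    ///         metadata: HashMap::new(),
    ///     },
    ///     "Something happened",
    /// );
    /// assert!(!alert.resolved);
    /// ```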
    pub fn new(severity: AlertSeverity, alert_type: AlertType, message: impl Into<String>) -> Self {
        Self {
            id: uuid::Uuid::new_v4().to_string(),
            timestamp: Utc::now(),
            severity,
            alert_type,
            message: message.into(),
            resolved: false,
            resolved_at: None,
        }
    }

    /// Resolve this alert
    pub fn resolve(&mut self) {
        self.resolved = true;
        self.resolved_at = Some(Utc::now());
    }
}

/// Alert rule configuration
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AlertRule {
    /// Rule ID
    pub id: String,
    /// Rule name
    pub name: String,
    /// Is this rule enabled
    pub enabled: bool,
    /// Severity for alerts from this rule
    pub severity: AlertSeverity,
    /// Rule type
    pub rule_type: AlertRuleType,
}

/// Alert rule types
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type")]
pub enum AlertRuleType {
    /// Alert on event rate threshold
    EventRateThreshold {
        threshold: usize,
        window_minutes: i64,
    },
    /// Alert on latency threshold
    LatencyThreshold {
        threshold_ms: u64,
        window_minutes: i64,
    },
    /// Alert on fault rate threshold
    FaultRateThreshold {
        threshold: usize,
        window_minutes: i64,
    },
    /// Alert on rate limit violations
    RateLimitThreshold {
        threshold: usize,
        window_minutes: i64,
    },
    /// Alert on endpoint stress
    EndpointThreshold {
        endpoint: String,
        threshold: usize,
        window_minutes: i64,
    },
    /// Alert on chaos impact score
    ImpactThreshold { threshold: f64, window_minutes: i64 },
}

impl AlertRule {
    /// Create a new alert rule
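    ///
    /// A minimal sketch (the id, name, and threshold are illustrative):
    ///
    /// ```ignore
    /// let rule = AlertRule::new(
    ///     "high_event_rate",
    ///     "High event rate",
    ///     AlertSeverity::Warning,
    ///     AlertRuleType::EventRateThreshold { threshold: 50, window_minutes: 1 },
    /// );
    /// assert!(rule.enabled);
    /// ```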
    pub fn new(
        id: impl Into<String>,
        name: impl Into<String>,
        severity: AlertSeverity,
        rule_type: AlertRuleType,
    ) -> Self {
        Self {
            id: id.into(),
            name: name.into(),
            enabled: true,
            severity,
            rule_type,
        }
    }

    /// Check if this rule should fire based on metrics
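    ///
    /// Returns `Some(Alert)` when the rule is enabled and the metrics exceed its
    /// threshold, otherwise `None`. A rough sketch of driving it by hand
    /// (assumes a configured `rule`; the bucket setup mirrors the unit tests below):
    ///
    /// ```ignore
    /// let mut bucket = MetricsBucket::new(Utc::now(), TimeBucket::Minute);
    /// bucket.total_events = 120;
    /// if let Some(alert) = rule.evaluate(&[bucket]) {
    ///     println!("fired: {}", alert.message);
    /// }
    /// ```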
    pub fn evaluate(&self, metrics: &[MetricsBucket]) -> Option<Alert> {
        if !self.enabled || metrics.is_empty() {
            return None;
        }

        match &self.rule_type {
            AlertRuleType::EventRateThreshold { threshold, .. } => {
                let total_events: usize = metrics.iter().map(|m| m.total_events).sum();
                let events_per_minute = total_events / metrics.len().max(1);

                if events_per_minute > *threshold {
                    Some(Alert::new(
                        self.severity,
                        AlertType::HighEventRate {
                            events_per_minute,
                            threshold: *threshold,
                        },
                        format!(
                            "High chaos event rate detected: {} events/min (threshold: {})",
                            events_per_minute, threshold
                        ),
                    ))
                } else {
                    None
                }
            }
            AlertRuleType::LatencyThreshold { threshold_ms, .. } => {
                let avg_latency: f64 =
                    metrics.iter().map(|m| m.avg_latency_ms).sum::<f64>() / metrics.len() as f64;

                if avg_latency > *threshold_ms as f64 {
                    Some(Alert::new(
                        self.severity,
                        AlertType::HighLatency {
                            avg_latency_ms: avg_latency,
                            threshold_ms: *threshold_ms,
                        },
                        format!(
                            "High latency detected: {:.0}ms (threshold: {}ms)",
                            avg_latency, threshold_ms
                        ),
                    ))
                } else {
                    None
                }
            }
            AlertRuleType::FaultRateThreshold { threshold, .. } => {
                let total_faults: usize = metrics.iter().map(|m| m.total_faults).sum();
                let faults_per_minute = total_faults / metrics.len().max(1);

                if faults_per_minute > *threshold {
                    Some(Alert::new(
                        self.severity,
                        AlertType::HighFaultRate {
                            faults_per_minute,
                            threshold: *threshold,
                        },
                        format!(
                            "High fault injection rate detected: {} faults/min (threshold: {})",
                            faults_per_minute, threshold
                        ),
                    ))
                } else {
                    None
                }
            }
            AlertRuleType::RateLimitThreshold { threshold, .. } => {
                let total_violations: usize = metrics.iter().map(|m| m.rate_limit_violations).sum();
                let violations_per_minute = total_violations / metrics.len().max(1);

                if violations_per_minute > *threshold {
                    Some(Alert::new(
                        self.severity,
                        AlertType::RateLimitViolations {
                            violations_per_minute,
                            threshold: *threshold,
                        },
                        format!(
                            "High rate limit violations: {} violations/min (threshold: {})",
                            violations_per_minute, threshold
                        ),
                    ))
                } else {
                    None
                }
            }
            AlertRuleType::EndpointThreshold {
                endpoint,
                threshold,
                ..
            } => {
                let endpoint_events: usize = metrics
                    .iter()
                    .map(|m| m.affected_endpoints.get(endpoint).copied().unwrap_or(0))
                    .sum();
                let events_per_minute = endpoint_events / metrics.len().max(1);

                if events_per_minute > *threshold {
                    Some(Alert::new(
                        self.severity,
                        AlertType::EndpointStress {
                            endpoint: endpoint.clone(),
                            events_per_minute,
                            threshold: *threshold,
                        },
                        format!(
                            "Endpoint '{}' under chaos stress: {} events/min (threshold: {})",
                            endpoint, events_per_minute, threshold
                        ),
                    ))
                } else {
                    None
                }
            }
            AlertRuleType::ImpactThreshold { .. } => {
                // Evaluating this rule requires the `ChaosImpact` score from the
                // analytics module; until that integration is wired in, this rule
                // never fires.
                None
            }
        }
    }
}

/// Alert handler trait
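///
/// Implementors receive every alert fired by [`AlertManager`]. A minimal sketch
/// of a custom handler (the `Vec`-backed collector here is purely illustrative):
///
/// ```ignore
/// struct CollectingHandler(parking_lot::Mutex<Vec<Alert>>);
///
/// impl AlertHandler for CollectingHandler {
///     fn handle(&self, alert: &Alert) {
///         self.0.lock().push(alert.clone());
///     }
/// }
/// ```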
pub trait AlertHandler: Send + Sync {
    /// Handle an alert
    fn handle(&self, alert: &Alert);
}

/// Console alert handler (logs to console)
pub struct ConsoleAlertHandler;

impl AlertHandler for ConsoleAlertHandler {
    fn handle(&self, alert: &Alert) {
        match alert.severity {
            AlertSeverity::Info => info!("[ALERT] {}: {}", alert.id, alert.message),
            AlertSeverity::Warning => warn!("[ALERT] {}: {}", alert.id, alert.message),
            AlertSeverity::Critical => {
                tracing::error!("[ALERT] {}: {}", alert.id, alert.message)
            }
        }
    }
}

/// Alert manager
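///
/// Typical flow (illustrative sketch): register rules, periodically evaluate them
/// against recent metrics, then inspect or resolve the resulting alerts.
/// `recent_buckets` below is a placeholder for whatever `Vec<MetricsBucket>` the
/// analytics layer provides.
///
/// ```ignore
/// let manager = AlertManager::new();
/// manager.add_rule(AlertRule::new(
///     "latency",
///     "High latency",
///     AlertSeverity::Critical,
///     AlertRuleType::LatencyThreshold { threshold_ms: 500, window_minutes: 5 },
/// ));
///
/// manager.evaluate_rules(&recent_buckets);
/// for alert in manager.get_active_alerts() {
///     println!("{} [{:?}] {}", alert.id, alert.severity, alert.message);
/// }
/// ```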
pub struct AlertManager {
    /// Alert rules
    rules: Arc<RwLock<HashMap<String, AlertRule>>>,
    /// Active alerts
    active_alerts: Arc<RwLock<HashMap<String, Alert>>>,
    /// Alert history
    alert_history: Arc<RwLock<Vec<Alert>>>,
    /// Alert handlers
    handlers: Arc<RwLock<Vec<Box<dyn AlertHandler>>>>,
    /// Maximum history size
    max_history: usize,
}

impl AlertManager {
    /// Create a new alert manager
    pub fn new() -> Self {
        Self {
            rules: Arc::new(RwLock::new(HashMap::new())),
            active_alerts: Arc::new(RwLock::new(HashMap::new())),
            alert_history: Arc::new(RwLock::new(Vec::new())),
            handlers: Arc::new(RwLock::new(vec![Box::new(ConsoleAlertHandler)])),
            max_history: 1000,
        }
    }

    /// Add an alert rule
    pub fn add_rule(&self, rule: AlertRule) {
        let id = rule.id.clone();
        let mut rules = self.rules.write();
        rules.insert(id.clone(), rule);
        info!("Added alert rule: {}", id);
    }

    /// Remove an alert rule
    pub fn remove_rule(&self, id: &str) -> Option<AlertRule> {
        let mut rules = self.rules.write();
        let removed = rules.remove(id);
        if removed.is_some() {
            info!("Removed alert rule: {}", id);
        }
        removed
    }

    /// Enable/disable a rule
    pub fn set_rule_enabled(&self, id: &str, enabled: bool) -> Result<(), String> {
        let mut rules = self.rules.write();
        if let Some(rule) = rules.get_mut(id) {
            rule.enabled = enabled;
            info!("Alert rule '{}' {}", id, if enabled { "enabled" } else { "disabled" });
            Ok(())
        } else {
            Err(format!("Rule '{}' not found", id))
        }
    }

    /// Get all rules
    pub fn get_rules(&self) -> Vec<AlertRule> {
        let rules = self.rules.read();
        rules.values().cloned().collect()
    }

    /// Evaluate all rules against metrics
    pub fn evaluate_rules(&self, metrics: &[MetricsBucket]) {
        let rules = self.rules.read();

        for rule in rules.values() {
            if let Some(alert) = rule.evaluate(metrics) {
                self.fire_alert(alert);
            }
        }
    }

    /// Fire an alert
    pub fn fire_alert(&self, alert: Alert) {
        debug!("Firing alert: {} - {}", alert.id, alert.message);

        // Store in active alerts
        {
            let mut active = self.active_alerts.write();
            active.insert(alert.id.clone(), alert.clone());
        }

        // Add to history
        {
            let mut history = self.alert_history.write();
            history.push(alert.clone());

            // Trim history if needed
            if history.len() > self.max_history {
                let excess = history.len() - self.max_history;
                history.drain(0..excess);
            }
        }

        // Notify handlers
        let handlers = self.handlers.read();
        for handler in handlers.iter() {
            handler.handle(&alert);
        }
    }

    /// Resolve an alert
    pub fn resolve_alert(&self, alert_id: &str) -> Result<(), String> {
        let mut alert = {
            let mut active = self.active_alerts.write();
            active.remove(alert_id)
        };

        if let Some(ref mut alert_ref) = alert {
            alert_ref.resolve();

            // Update in history
            let mut history = self.alert_history.write();
            if let Some(historical_alert) = history.iter_mut().find(|a| a.id == alert_id) {
                *historical_alert = alert_ref.clone();
            }

            info!("Resolved alert: {}", alert_id);
            Ok(())
        } else {
            Err(format!("Alert '{}' not found", alert_id))
        }
    }

    /// Get active alerts
    pub fn get_active_alerts(&self) -> Vec<Alert> {
        let active = self.active_alerts.read();
        active.values().cloned().collect()
    }
    /// Get alert history (oldest first; `limit` caps the number of entries returned)
    pub fn get_alert_history(&self, limit: Option<usize>) -> Vec<Alert> {
        let history = self.alert_history.read();
        let mut alerts: Vec<_> = history.clone();

        if let Some(limit) = limit {
            alerts.truncate(limit);
        }

        alerts
    }

    /// Add a custom alert handler
    pub fn add_handler(&self, handler: Box<dyn AlertHandler>) {
        let mut handlers = self.handlers.write();
        handlers.push(handler);
    }

    /// Clear all alerts
    pub fn clear_alerts(&self) {
        let mut active = self.active_alerts.write();
        let mut history = self.alert_history.write();
        active.clear();
        history.clear();
        info!("Cleared all alerts");
    }
}

impl Default for AlertManager {
    fn default() -> Self {
        Self::new()
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::analytics::MetricsBucket;

    #[test]
    fn test_alert_creation() {
        let alert = Alert::new(
            AlertSeverity::Warning,
            AlertType::HighEventRate {
                events_per_minute: 100,
                threshold: 50,
            },
            "Test alert",
        );

        assert_eq!(alert.severity, AlertSeverity::Warning);
        assert!(!alert.resolved);
    }

    #[test]
    fn test_alert_resolve() {
        let mut alert = Alert::new(
            AlertSeverity::Info,
            AlertType::Custom {
                message: "test".to_string(),
                metadata: HashMap::new(),
            },
            "Test",
        );

        alert.resolve();
        assert!(alert.resolved);
        assert!(alert.resolved_at.is_some());
    }

    #[test]
    fn test_alert_rule_evaluation() {
        let rule = AlertRule::new(
            "test_rule",
            "Test Rule",
            AlertSeverity::Warning,
            AlertRuleType::EventRateThreshold {
                threshold: 50,
                window_minutes: 1,
            },
        );

        let mut bucket = MetricsBucket::new(Utc::now(), crate::analytics::TimeBucket::Minute);
        bucket.total_events = 100;

        let alert = rule.evaluate(&[bucket]);
        assert!(alert.is_some());

        let alert = alert.unwrap();
        assert_eq!(alert.severity, AlertSeverity::Warning);
    }

    #[test]
    fn test_alert_manager() {
        let manager = AlertManager::new();

        let rule = AlertRule::new(
            "test_rule",
            "Test Rule",
            AlertSeverity::Info,
            AlertRuleType::EventRateThreshold {
                threshold: 10,
                window_minutes: 1,
            },
        );

        manager.add_rule(rule);

        let rules = manager.get_rules();
        assert_eq!(rules.len(), 1);

        manager.remove_rule("test_rule");
        let rules = manager.get_rules();
        assert_eq!(rules.len(), 0);
    }
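
    // A minimal sketch of the fire/resolve lifecycle, using only items defined
    // in this module.
    #[test]
    fn test_fire_and_resolve_alert() {
        let manager = AlertManager::new();

        let alert = Alert::new(
            AlertSeverity::Critical,
            AlertType::Custom {
                message: "manual".to_string(),
                metadata: HashMap::new(),
            },
            "Manually fired alert",
        );
        let alert_id = alert.id.clone();

        manager.fire_alert(alert);
        assert_eq!(manager.get_active_alerts().len(), 1);

        manager.resolve_alert(&alert_id).unwrap();
        assert!(manager.get_active_alerts().is_empty());
        assert!(manager.get_alert_history(None)[0].resolved);
    }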
}