scirs2_sparse/realtime_performance_monitor/
alerts.rs

//! Alert Management System
//!
//! This module provides a comprehensive alerting system for performance
//! monitoring, including rule-based alerts and notifications.
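//!
//! # Example
//!
//! A minimal usage sketch; the module path below assumes this file lives at
//! `scirs2_sparse::realtime_performance_monitor::alerts`, and the rule shown
//! (`deep_queue` / `queue_depth`) is purely illustrative:
//!
//! ```ignore
//! use scirs2_sparse::realtime_performance_monitor::alerts::{
//!     AlertCondition, AlertManager, AlertRule, AlertSeverity,
//! };
//!
//! // Create a manager that keeps up to 1000 historical alerts and ships with
//! // the default rule set.
//! let mut manager = AlertManager::new(1000);
//!
//! // Register a custom rule that fires when `queue_depth` exceeds 100.
//! manager.add_rule(AlertRule {
//!     id: "deep_queue".to_string(),
//!     metric_name: "queue_depth".to_string(),
//!     condition: AlertCondition::GreaterThan,
//!     threshold: 100.0,
//!     severity: AlertSeverity::Warning,
//!     enabled: true,
//!     processor_types: vec![],
//!     cooldown_seconds: 60,
//!     description: "Work queue is backing up".to_string(),
//!     auto_resolve: true,
//!     resolution_threshold: Some(50.0),
//! });
//! ```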

use super::metrics::{PerformanceSample, ProcessorType};
use std::collections::{HashMap, VecDeque};
use std::time::{SystemTime, UNIX_EPOCH};

/// Performance alert
#[derive(Debug, Clone)]
pub struct Alert {
    pub id: String,
    pub timestamp: u64,
    pub severity: AlertSeverity,
    pub message: String,
    pub processor_type: ProcessorType,
    pub processor_id: String,
    pub metric_name: String,
    pub threshold_value: f64,
    pub actual_value: f64,
    pub resolved: bool,
    pub resolution_timestamp: Option<u64>,
    pub duration_ms: Option<u64>,
}

/// Alert severity levels
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum AlertSeverity {
    Info,
    Warning,
    Error,
    Critical,
}

impl std::fmt::Display for AlertSeverity {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            AlertSeverity::Info => write!(f, "INFO"),
            AlertSeverity::Warning => write!(f, "WARNING"),
            AlertSeverity::Error => write!(f, "ERROR"),
            AlertSeverity::Critical => write!(f, "CRITICAL"),
        }
    }
}

/// Alert rule configuration
#[derive(Debug, Clone)]
pub struct AlertRule {
    pub id: String,
    pub metric_name: String,
    pub condition: AlertCondition,
    pub threshold: f64,
    pub severity: AlertSeverity,
    pub enabled: bool,
    pub processor_types: Vec<ProcessorType>,
    pub cooldown_seconds: u64,
    pub description: String,
    pub auto_resolve: bool,
    pub resolution_threshold: Option<f64>,
}

/// Alert condition types
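///
/// `PercentageIncrease` and `PercentageDecrease` are evaluated against the
/// optional baseline passed to `AlertManager::process_sample`; for example,
/// `PercentageIncrease(20.0)` fires when the baseline is positive and the
/// metric is more than 20% above it.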
#[derive(Debug, Clone)]
pub enum AlertCondition {
    GreaterThan,
    LessThan,
    Equals,
    NotEquals,
    PercentageIncrease(f64), // Compared to baseline
    PercentageDecrease(f64), // Compared to baseline
    RateOfChange(f64),       // Rate of change per second
}

/// Notification channels for alerts
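///
/// `Console` and `Log` are delivered immediately; `Email`, `Webhook`, and
/// `File` are currently placeholders that `send_notifications` does not yet
/// wire up.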
#[derive(Debug, Clone)]
pub enum NotificationChannel {
    Console,
    Log,
    Email { address: String },
    Webhook { url: String },
    File { path: String },
}

/// Alert management system
#[derive(Debug)]
pub struct AlertManager {
    pub active_alerts: HashMap<String, Alert>,
    pub alert_history: VecDeque<Alert>,
    pub notification_channels: Vec<NotificationChannel>,
    pub alert_rules: Vec<AlertRule>,
    max_history: usize,
    rule_cooldowns: HashMap<String, u64>, // rule_id -> last_triggered_timestamp
}

impl AlertManager {
    /// Create a new alert manager
    pub fn new(max_history: usize) -> Self {
        Self {
            active_alerts: HashMap::new(),
            alert_history: VecDeque::with_capacity(max_history),
            notification_channels: vec![NotificationChannel::Console],
            alert_rules: Self::create_default_alert_rules(),
            max_history,
            rule_cooldowns: HashMap::new(),
        }
    }

    /// Add an alert rule
    pub fn add_rule(&mut self, rule: AlertRule) {
        self.alert_rules.push(rule);
    }

    /// Remove an alert rule
    pub fn remove_rule(&mut self, rule_id: &str) {
        self.alert_rules.retain(|rule| rule.id != rule_id);
        self.rule_cooldowns.remove(rule_id);
    }

    /// Enable/disable a rule
    pub fn set_rule_enabled(&mut self, rule_id: &str, enabled: bool) {
        if let Some(rule) = self.alert_rules.iter_mut().find(|r| r.id == rule_id) {
            rule.enabled = enabled;
        }
    }

    /// Process a performance sample for alert evaluation
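    ///
    /// A minimal call sketch; `sample` is assumed to come from the metrics
    /// collector in `super::metrics`, and the optional baseline is only used
    /// by percentage-based conditions:
    ///
    /// ```ignore
    /// manager.process_sample(&sample, Some(baseline_value));
    /// ```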
    pub fn process_sample(&mut self, sample: &PerformanceSample, baseline: Option<f64>) {
        let current_time = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .unwrap_or_default()
            .as_secs();

        let mut alerts_to_trigger = Vec::new();

        for rule in &self.alert_rules {
            if !rule.enabled {
                continue;
            }

            // Check if rule applies to this processor type
            if !rule.processor_types.is_empty()
                && !rule.processor_types.contains(&sample.processor_type)
            {
                continue;
            }

            // Check cooldown (saturating to stay safe if the clock moves backwards)
            if let Some(&last_triggered) = self.rule_cooldowns.get(&rule.id) {
                if current_time.saturating_sub(last_triggered) < rule.cooldown_seconds {
                    continue;
                }
            }

            // Get the metric value and evaluate the rule's condition against its threshold
            if let Some(metric_value) = sample.get_metric(&rule.metric_name) {
                let triggered = self.evaluate_condition(
                    &rule.condition,
                    metric_value,
                    rule.threshold,
                    baseline,
                );
                if triggered {
                    let alert = self.create_alert(rule, sample, metric_value);
                    alerts_to_trigger.push((alert, rule.id.clone()));
                }
            }
        }

        // Trigger alerts after iteration
        for (alert, rule_id) in alerts_to_trigger {
            self.trigger_alert(alert);
            self.rule_cooldowns.insert(rule_id, current_time);
        }

        // Check for auto-resolution of existing alerts
        self.check_auto_resolution(sample);
    }

    /// Evaluate an alert condition against a rule's threshold
    fn evaluate_condition(
        &self,
        condition: &AlertCondition,
        value: f64,
        threshold: f64,
        baseline: Option<f64>,
    ) -> bool {
        match condition {
            AlertCondition::GreaterThan => value > threshold,
            AlertCondition::LessThan => value < threshold,
            AlertCondition::Equals => (value - threshold).abs() < f64::EPSILON,
            AlertCondition::NotEquals => (value - threshold).abs() >= f64::EPSILON,
            AlertCondition::PercentageIncrease(pct) => {
                if let Some(baseline_val) = baseline {
                    if baseline_val > 0.0 {
                        let increase = (value - baseline_val) / baseline_val;
                        increase > pct / 100.0
                    } else {
                        false
                    }
                } else {
                    false
                }
            }
            AlertCondition::PercentageDecrease(pct) => {
                if let Some(baseline_val) = baseline {
                    if baseline_val > 0.0 {
                        let decrease = (baseline_val - value) / baseline_val;
                        decrease > pct / 100.0
                    } else {
                        false
                    }
                } else {
                    false
                }
            }
            AlertCondition::RateOfChange(_rate) => {
                // Evaluating a rate of change requires the previous sample value,
                // which is not tracked yet; never triggers for now.
                false
            }
        }
    }

    /// Create an alert from a rule and sample
    fn create_alert(
        &self,
        rule: &AlertRule,
        sample: &PerformanceSample,
        metric_value: f64,
    ) -> Alert {
        let timestamp = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .unwrap_or_default()
            .as_millis() as u64;

        let alert_id = format!(
            "{}:{}:{}:{}",
            rule.id, sample.processor_type, sample.processor_id, timestamp
        );

        let message = format!(
            "{}: {} = {:.2} (threshold: {:.2}) for {}:{}",
            rule.description,
            rule.metric_name,
            metric_value,
            rule.threshold,
            sample.processor_type,
            sample.processor_id
        );

        Alert {
            id: alert_id,
            timestamp,
            severity: rule.severity,
            message,
            processor_type: sample.processor_type,
            processor_id: sample.processor_id.clone(),
            metric_name: rule.metric_name.clone(),
            threshold_value: rule.threshold,
            actual_value: metric_value,
            resolved: false,
            resolution_timestamp: None,
            duration_ms: None,
        }
    }

    /// Trigger an alert
    fn trigger_alert(&mut self, alert: Alert) {
        // Add to active alerts
        self.active_alerts.insert(alert.id.clone(), alert.clone());

        // Add to history
        if self.alert_history.len() >= self.max_history {
            self.alert_history.pop_front();
        }
        self.alert_history.push_back(alert.clone());

        // Send notifications
        self.send_notifications(&alert);
    }

    /// Send notifications for an alert
    fn send_notifications(&self, alert: &Alert) {
        for channel in &self.notification_channels {
            match channel {
                NotificationChannel::Console => {
                    println!(
                        "[{}] {}: {}",
                        alert.severity, alert.timestamp, alert.message
                    );
                }
                NotificationChannel::Log => {
                    log::warn!(
                        "[{}] {}: {}",
                        alert.severity,
                        alert.timestamp,
                        alert.message
                    );
                }
                NotificationChannel::Email { address: _address } => {
                    // Email notification would be implemented here
                }
                NotificationChannel::Webhook { url: _url } => {
                    // Webhook notification would be implemented here
                }
                NotificationChannel::File { path: _path } => {
                    // File notification would be implemented here
                }
            }
        }
    }

    /// Check for auto-resolution of active alerts
    fn check_auto_resolution(&mut self, sample: &PerformanceSample) {
        let mut resolved_alerts = Vec::new();

        for alert in self.active_alerts.values_mut() {
            if alert.processor_type != sample.processor_type
                || alert.processor_id != sample.processor_id
            {
                continue;
            }

            // Find the rule that watches this alert's metric
            if let Some(rule) = self
                .alert_rules
                .iter()
                .find(|r| r.metric_name == alert.metric_name)
            {
                if rule.auto_resolve {
                    if let Some(metric_value) = sample.get_metric(&alert.metric_name) {
                        let should_resolve = match rule.resolution_threshold {
                            Some(threshold) => match rule.condition {
                                AlertCondition::GreaterThan => metric_value <= threshold,
                                AlertCondition::LessThan => metric_value >= threshold,
                                _ => false,
                            },
                            None => {
                                // Fall back to the original trigger threshold
                                match rule.condition {
                                    AlertCondition::GreaterThan => metric_value <= rule.threshold,
                                    AlertCondition::LessThan => metric_value >= rule.threshold,
                                    _ => false,
                                }
                            }
                        };

                        if should_resolve {
                            alert.resolved = true;
                            alert.resolution_timestamp = Some(sample.timestamp);
                            alert.duration_ms =
                                Some(sample.timestamp.saturating_sub(alert.timestamp));
                            resolved_alerts.push(alert.id.clone());
                        }
                    }
                }
            }
        }

        // Remove resolved alerts from the active list
        for alert_id in resolved_alerts {
            self.active_alerts.remove(&alert_id);
        }
    }

    /// Manually resolve an alert
    pub fn resolve_alert(&mut self, alert_id: &str) -> bool {
        if let Some(alert) = self.active_alerts.get_mut(alert_id) {
            let current_time = SystemTime::now()
                .duration_since(UNIX_EPOCH)
                .unwrap_or_default()
                .as_millis() as u64;

            alert.resolved = true;
            alert.resolution_timestamp = Some(current_time);
            alert.duration_ms = Some(current_time.saturating_sub(alert.timestamp));

            self.active_alerts.remove(alert_id);
            true
        } else {
            false
        }
    }

    /// Get active alerts
    pub fn get_active_alerts(&self) -> Vec<&Alert> {
        self.active_alerts.values().collect()
    }

    /// Get alerts by severity
    pub fn get_alerts_by_severity(&self, severity: AlertSeverity) -> Vec<&Alert> {
        self.active_alerts
            .values()
            .filter(|alert| alert.severity == severity)
            .collect()
    }

    /// Get alert statistics
    pub fn get_alert_stats(&self) -> AlertStats {
        let total_alerts = self.alert_history.len();
        let active_alerts = self.active_alerts.len();

        let mut severity_counts = HashMap::new();
        for alert in &self.alert_history {
            *severity_counts.entry(alert.severity).or_insert(0) += 1;
        }

        let resolved_alerts = self
            .alert_history
            .iter()
            .filter(|alert| alert.resolved)
            .count();

        let avg_resolution_time = if resolved_alerts > 0 {
            let total_duration: u64 = self
                .alert_history
                .iter()
                .filter_map(|alert| alert.duration_ms)
                .sum();
            Some(total_duration as f64 / resolved_alerts as f64)
        } else {
            None
        };

        AlertStats {
            total_alerts,
            active_alerts,
            resolved_alerts,
            severity_counts,
            avg_resolution_time_ms: avg_resolution_time,
        }
    }

    /// Create default alert rules
    pub fn create_default_alert_rules() -> Vec<AlertRule> {
        vec![
            AlertRule {
                id: "high_execution_time".to_string(),
                metric_name: "execution_time_ms".to_string(),
                condition: AlertCondition::GreaterThan,
                threshold: 1000.0,
                severity: AlertSeverity::Warning,
                enabled: true,
                processor_types: vec![],
                cooldown_seconds: 60,
                description: "High execution time detected".to_string(),
                auto_resolve: true,
                resolution_threshold: Some(500.0),
            },
            AlertRule {
                id: "low_throughput".to_string(),
                metric_name: "throughput_ops_per_sec".to_string(),
                condition: AlertCondition::LessThan,
                threshold: 10.0,
                severity: AlertSeverity::Error,
                enabled: true,
                processor_types: vec![],
                cooldown_seconds: 120,
                description: "Low throughput detected".to_string(),
                auto_resolve: true,
                resolution_threshold: Some(50.0),
            },
            AlertRule {
                id: "high_error_rate".to_string(),
                metric_name: "error_rate".to_string(),
                condition: AlertCondition::GreaterThan,
                threshold: 0.1,
                severity: AlertSeverity::Critical,
                enabled: true,
                processor_types: vec![],
                cooldown_seconds: 30,
                description: "High error rate detected".to_string(),
                auto_resolve: true,
                resolution_threshold: Some(0.05),
            },
            AlertRule {
                id: "low_cache_hit_ratio".to_string(),
                metric_name: "cache_hit_ratio".to_string(),
                condition: AlertCondition::LessThan,
                threshold: 0.5,
                severity: AlertSeverity::Warning,
                enabled: true,
                processor_types: vec![],
                cooldown_seconds: 300,
                description: "Low cache hit ratio detected".to_string(),
                auto_resolve: true,
                resolution_threshold: Some(0.7),
            },
        ]
    }

    /// Clear all alerts
    pub fn clear_all_alerts(&mut self) {
        self.active_alerts.clear();
        self.alert_history.clear();
        self.rule_cooldowns.clear();
    }

    /// Add a notification channel
    pub fn add_notification_channel(&mut self, channel: NotificationChannel) {
        self.notification_channels.push(channel);
    }
}

/// Alert statistics
#[derive(Debug, Clone)]
pub struct AlertStats {
    pub total_alerts: usize,
    pub active_alerts: usize,
    pub resolved_alerts: usize,
    pub severity_counts: HashMap<AlertSeverity, usize>,
    pub avg_resolution_time_ms: Option<f64>,
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_alert_creation() {
        let alert = Alert {
            id: "test-alert".to_string(),
            timestamp: 12345,
            severity: AlertSeverity::Warning,
            message: "Test alert".to_string(),
            processor_type: ProcessorType::QuantumInspired,
            processor_id: "test-processor".to_string(),
            metric_name: "execution_time_ms".to_string(),
            threshold_value: 1000.0,
            actual_value: 1500.0,
            resolved: false,
            resolution_timestamp: None,
            duration_ms: None,
        };

        assert_eq!(alert.severity, AlertSeverity::Warning);
        assert!(!alert.resolved);
    }

    #[test]
    fn test_alert_manager_creation() {
        let manager = AlertManager::new(1000);
        assert_eq!(manager.max_history, 1000);
        assert!(!manager.alert_rules.is_empty()); // Should have default rules
    }

    #[test]
    fn test_add_remove_rules() {
        let mut manager = AlertManager::new(1000);
        let initial_count = manager.alert_rules.len();

        let rule = AlertRule {
            id: "test_rule".to_string(),
            metric_name: "test_metric".to_string(),
            condition: AlertCondition::GreaterThan,
            threshold: 100.0,
            severity: AlertSeverity::Info,
            enabled: true,
            processor_types: vec![],
            cooldown_seconds: 60,
            description: "Test rule".to_string(),
            auto_resolve: false,
            resolution_threshold: None,
        };

        manager.add_rule(rule);
        assert_eq!(manager.alert_rules.len(), initial_count + 1);

        manager.remove_rule("test_rule");
        assert_eq!(manager.alert_rules.len(), initial_count);
    }

    #[test]
    fn test_alert_severity_ordering() {
        assert!(AlertSeverity::Critical > AlertSeverity::Error);
        assert!(AlertSeverity::Error > AlertSeverity::Warning);
        assert!(AlertSeverity::Warning > AlertSeverity::Info);
    }

    #[test]
    fn test_alert_severity_display() {
        assert_eq!(AlertSeverity::Info.to_string(), "INFO");
        assert_eq!(AlertSeverity::Warning.to_string(), "WARNING");
        assert_eq!(AlertSeverity::Error.to_string(), "ERROR");
        assert_eq!(AlertSeverity::Critical.to_string(), "CRITICAL");
    }

    #[test]
    fn test_alert_stats() {
        let manager = AlertManager::new(1000);
        let stats = manager.get_alert_stats();

        assert_eq!(stats.total_alerts, 0);
        assert_eq!(stats.active_alerts, 0);
        assert_eq!(stats.resolved_alerts, 0);
    }

    #[test]
    fn test_rule_enable_disable() {
        let mut manager = AlertManager::new(1000);

        if let Some(rule) = manager.alert_rules.first() {
            let rule_id = rule.id.clone();
            let initial_enabled = rule.enabled;

            manager.set_rule_enabled(&rule_id, !initial_enabled);

            if let Some(updated_rule) = manager.alert_rules.iter().find(|r| r.id == rule_id) {
                assert_eq!(updated_rule.enabled, !initial_enabled);
            }
        }
    }

    #[test]
    fn test_notification_channels() {
        let mut manager = AlertManager::new(1000);
        let initial_count = manager.notification_channels.len();

        manager.add_notification_channel(NotificationChannel::Email {
            address: "test@example.com".to_string(),
        });

        assert_eq!(manager.notification_channels.len(), initial_count + 1);
    }
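
    // A small sanity check of `evaluate_condition` against explicit threshold
    // and baseline values; the numbers used here are illustrative.
    #[test]
    fn test_evaluate_condition_thresholds() {
        let manager = AlertManager::new(10);

        assert!(manager.evaluate_condition(&AlertCondition::GreaterThan, 150.0, 100.0, None));
        assert!(!manager.evaluate_condition(&AlertCondition::GreaterThan, 50.0, 100.0, None));
        assert!(manager.evaluate_condition(&AlertCondition::LessThan, 5.0, 10.0, None));

        // Percentage conditions compare against the supplied baseline, so the
        // threshold argument is unused for them.
        assert!(manager.evaluate_condition(
            &AlertCondition::PercentageIncrease(20.0),
            130.0,
            0.0,
            Some(100.0)
        ));
    }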
}