chie_core/alerting.rs

//! Alerting system with configurable thresholds.
//!
//! This module provides a comprehensive alerting system for monitoring various metrics
//! and triggering alerts when thresholds are exceeded.
//!
//! # Features
//!
//! - Multiple severity levels (Info, Warning, Error, Critical)
//! - Configurable thresholds for various metrics
//! - Alert suppression to prevent alert fatigue
//! - Alert history tracking
//! - Per-metric check intervals
//!
//! # Example
//!
//! ```
//! use chie_core::alerting::{AlertManager, AlertSeverity, ThresholdConfig, AlertMetric};
//!
//! let mut manager = AlertManager::new();
//!
//! // Configure a threshold for high storage usage
//! let threshold = ThresholdConfig {
//!     metric: AlertMetric::StorageUsagePercent,
//!     warning_threshold: 75.0,
//!     error_threshold: 90.0,
//!     critical_threshold: 95.0,
//!     check_interval_secs: 60,
//! };
//! manager.add_threshold(threshold);
//!
//! // Check a metric value
//! manager.check_metric(AlertMetric::StorageUsagePercent, 92.0);
//!
//! // Get active alerts
//! let alerts = manager.get_active_alerts();
//! for alert in alerts {
//!     println!("Alert: {:?} - {}", alert.severity, alert.message);
//! }
//! ```

use std::collections::HashMap;
use std::time::{Duration, SystemTime};

/// Alert severity levels.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum AlertSeverity {
    /// Informational alerts.
    Info,
    /// Warning alerts that require attention.
    Warning,
    /// Error alerts that indicate problems.
    Error,
    /// Critical alerts that require immediate action.
    Critical,
}

/// Alert metric types that can be monitored.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum AlertMetric {
    /// Storage usage percentage (0-100).
    StorageUsagePercent,
    /// Bandwidth usage in bytes per second.
    BandwidthUsageBps,
    /// CPU usage percentage (0-100).
    CpuUsagePercent,
    /// Memory usage percentage (0-100).
    MemoryUsagePercent,
    /// Error rate (errors per second).
    ErrorRate,
    /// Request latency in milliseconds.
    LatencyMs,
    /// Failed chunk verification count.
    FailedVerifications,
    /// Peer reputation score (0-100).
    PeerReputation,
    /// Cache hit rate percentage (0-100).
    CacheHitRate,
    /// Queue depth (number of pending requests).
    QueueDepth,
}

impl AlertMetric {
    /// Get a human-readable name for the metric.
    #[must_use]
    #[inline]
    pub fn name(&self) -> &'static str {
        match self {
            Self::StorageUsagePercent => "Storage Usage",
            Self::BandwidthUsageBps => "Bandwidth Usage",
            Self::CpuUsagePercent => "CPU Usage",
            Self::MemoryUsagePercent => "Memory Usage",
            Self::ErrorRate => "Error Rate",
            Self::LatencyMs => "Latency",
            Self::FailedVerifications => "Failed Verifications",
            Self::PeerReputation => "Peer Reputation",
            Self::CacheHitRate => "Cache Hit Rate",
            Self::QueueDepth => "Queue Depth",
        }
    }

    /// Get the unit for the metric.
    #[must_use]
    #[inline]
    pub fn unit(&self) -> &'static str {
        match self {
            Self::StorageUsagePercent
            | Self::CpuUsagePercent
            | Self::MemoryUsagePercent
            | Self::CacheHitRate => "%",
            Self::BandwidthUsageBps => "bps",
            Self::ErrorRate => "errors/sec",
            Self::LatencyMs => "ms",
            Self::FailedVerifications | Self::QueueDepth => "count",
            Self::PeerReputation => "score",
        }
    }
}

/// Configuration for threshold-based alerting.
#[derive(Debug, Clone)]
pub struct ThresholdConfig {
    /// The metric type to monitor.
    pub metric: AlertMetric,
    /// Warning threshold value.
    pub warning_threshold: f64,
    /// Error threshold value.
    pub error_threshold: f64,
    /// Critical threshold value.
    pub critical_threshold: f64,
    /// Minimum interval between checks in seconds.
    pub check_interval_secs: u64,
}

impl ThresholdConfig {
    /// Determine the severity level for a given metric value.
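    ///
    /// # Example
    ///
    /// A small illustration of how a value maps to a severity; the threshold
    /// values below are arbitrary, chosen only for the example:
    ///
    /// ```
    /// use chie_core::alerting::{AlertMetric, AlertSeverity, ThresholdConfig};
    ///
    /// let config = ThresholdConfig {
    ///     metric: AlertMetric::CpuUsagePercent,
    ///     warning_threshold: 70.0,
    ///     error_threshold: 85.0,
    ///     critical_threshold: 95.0,
    ///     check_interval_secs: 30,
    /// };
    ///
    /// assert_eq!(config.evaluate(50.0), None);
    /// assert_eq!(config.evaluate(72.0), Some(AlertSeverity::Warning));
    /// assert_eq!(config.evaluate(97.0), Some(AlertSeverity::Critical));
    /// ```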
    #[must_use]
    #[inline]
    pub fn evaluate(&self, value: f64) -> Option<AlertSeverity> {
        if value >= self.critical_threshold {
            Some(AlertSeverity::Critical)
        } else if value >= self.error_threshold {
            Some(AlertSeverity::Error)
        } else if value >= self.warning_threshold {
            Some(AlertSeverity::Warning)
        } else {
            None
        }
    }
}

/// An alert triggered by a threshold violation.
#[derive(Debug, Clone)]
pub struct Alert {
    /// Unique alert ID.
    pub id: String,
    /// Alert severity.
    pub severity: AlertSeverity,
    /// The metric that triggered the alert.
    pub metric: AlertMetric,
    /// The measured value.
    pub value: f64,
    /// The threshold that was exceeded.
    pub threshold: f64,
    /// Alert message.
    pub message: String,
    /// Timestamp when the alert was created.
    pub timestamp: SystemTime,
    /// Whether the alert is still active.
    pub active: bool,
}

impl Alert {
    /// Create a new alert.
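    ///
    /// The alert starts out active, with an auto-generated ID and a formatted
    /// message. For example (values chosen arbitrarily for illustration):
    ///
    /// ```
    /// use chie_core::alerting::{Alert, AlertMetric, AlertSeverity};
    ///
    /// let alert = Alert::new(AlertSeverity::Warning, AlertMetric::CpuUsagePercent, 82.5, 75.0);
    /// assert!(alert.active);
    /// assert!(alert.message.contains("CPU Usage"));
    /// ```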
    #[must_use]
    pub fn new(severity: AlertSeverity, metric: AlertMetric, value: f64, threshold: f64) -> Self {
        let id = format!(
            "{:?}_{:?}_{}",
            metric,
            severity,
            SystemTime::now()
                .duration_since(SystemTime::UNIX_EPOCH)
                .unwrap_or_default()
                .as_millis()
        );

        let message = format!(
            "{} {} is {:.2}{} (threshold: {:.2}{})",
            severity_emoji(severity),
            metric.name(),
            value,
            metric.unit(),
            threshold,
            metric.unit()
        );

        Self {
            id,
            severity,
            metric,
            value,
            threshold,
            message,
            timestamp: SystemTime::now(),
            active: true,
        }
    }

    /// Resolve the alert.
    #[inline]
    pub fn resolve(&mut self) {
        self.active = false;
    }

    /// Get the age of the alert in seconds.
    #[must_use]
    #[inline]
    pub fn age_secs(&self) -> u64 {
        SystemTime::now()
            .duration_since(self.timestamp)
            .unwrap_or_default()
            .as_secs()
    }
}

/// Alert manager for threshold-based monitoring.
pub struct AlertManager {
    /// Configured thresholds.
    thresholds: HashMap<AlertMetric, ThresholdConfig>,
    /// Active alerts.
    active_alerts: Vec<Alert>,
    /// Alert history (resolved alerts).
    alert_history: Vec<Alert>,
    /// Maximum number of alerts to keep in history.
    max_history_size: usize,
    /// Last check time for each metric.
    last_check: HashMap<AlertMetric, SystemTime>,
    /// Suppression window in seconds.
    suppression_window_secs: u64,
}

impl Default for AlertManager {
    fn default() -> Self {
        Self::new()
    }
}

impl AlertManager {
    /// Create a new alert manager.
    #[must_use]
    #[inline]
    pub fn new() -> Self {
        Self {
            thresholds: HashMap::new(),
            active_alerts: Vec::new(),
            alert_history: Vec::new(),
            max_history_size: 1000,
            last_check: HashMap::new(),
            suppression_window_secs: 300, // 5 minutes default
        }
    }

    /// Create a new alert manager with custom configuration.
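    ///
    /// # Example
    ///
    /// A minimal sketch; the limits below are illustrative, not recommended
    /// defaults:
    ///
    /// ```
    /// use chie_core::alerting::AlertManager;
    ///
    /// // Keep up to 500 resolved alerts and suppress duplicate alerts for 10 minutes.
    /// let manager = AlertManager::with_config(500, 600);
    /// assert_eq!(manager.active_alert_count(), 0);
    /// ```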
    #[must_use]
    #[inline]
    pub fn with_config(max_history_size: usize, suppression_window_secs: u64) -> Self {
        Self {
            thresholds: HashMap::new(),
            active_alerts: Vec::new(),
            alert_history: Vec::new(),
            max_history_size,
            last_check: HashMap::new(),
            suppression_window_secs,
        }
    }

    /// Add a threshold configuration.
    pub fn add_threshold(&mut self, config: ThresholdConfig) {
        self.thresholds.insert(config.metric, config);
    }

    /// Remove a threshold configuration.
    pub fn remove_threshold(&mut self, metric: AlertMetric) {
        self.thresholds.remove(&metric);
    }

    /// Check a metric value against configured thresholds.
    ///
    /// Does nothing if no threshold is configured for the metric, or if the
    /// metric's `check_interval_secs` has not yet elapsed since the last check.
    /// A value at or above a threshold raises an alert unless an equivalent
    /// alert is still within the suppression window; a value below the warning
    /// threshold resolves any active alerts for the metric.
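    ///
    /// # Example
    ///
    /// A minimal end-to-end sketch (the queue-depth thresholds are arbitrary,
    /// and the check interval is set to zero so every call is evaluated):
    ///
    /// ```
    /// use chie_core::alerting::{AlertManager, AlertMetric, ThresholdConfig};
    ///
    /// let mut manager = AlertManager::new();
    /// manager.add_threshold(ThresholdConfig {
    ///     metric: AlertMetric::QueueDepth,
    ///     warning_threshold: 100.0,
    ///     error_threshold: 500.0,
    ///     critical_threshold: 1000.0,
    ///     check_interval_secs: 0,
    /// });
    ///
    /// manager.check_metric(AlertMetric::QueueDepth, 750.0);
    /// assert_eq!(manager.active_alert_count(), 1);
    ///
    /// // Dropping back below the warning threshold resolves the alert.
    /// manager.check_metric(AlertMetric::QueueDepth, 10.0);
    /// assert_eq!(manager.active_alert_count(), 0);
    /// ```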
    pub fn check_metric(&mut self, metric: AlertMetric, value: f64) {
        let Some(config) = self.thresholds.get(&metric) else {
            return;
        };

        // Check if we should skip this check due to interval
        if let Some(last_check_time) = self.last_check.get(&metric) {
            let elapsed = SystemTime::now()
                .duration_since(*last_check_time)
                .unwrap_or_default();
            if elapsed < Duration::from_secs(config.check_interval_secs) {
                return;
            }
        }

        self.last_check.insert(metric, SystemTime::now());

        // Evaluate the metric
        if let Some(severity) = config.evaluate(value) {
            let threshold = match severity {
                AlertSeverity::Critical => config.critical_threshold,
                AlertSeverity::Error => config.error_threshold,
                AlertSeverity::Warning => config.warning_threshold,
                AlertSeverity::Info => 0.0,
            };

            // Check if we should suppress this alert
            if !self.should_suppress_alert(metric, severity) {
                let alert = Alert::new(severity, metric, value, threshold);
                self.active_alerts.push(alert);
            }
        } else {
            // Resolve any active alerts for this metric
            self.resolve_alerts_for_metric(metric);
        }
    }

    /// Check if an alert should be suppressed.
    #[must_use]
    fn should_suppress_alert(&self, metric: AlertMetric, severity: AlertSeverity) -> bool {
        let suppression_duration = Duration::from_secs(self.suppression_window_secs);

        self.active_alerts.iter().any(|alert| {
            alert.metric == metric
                && alert.severity == severity
                && alert.active
                && SystemTime::now()
                    .duration_since(alert.timestamp)
                    .unwrap_or_default()
                    < suppression_duration
        })
    }

    /// Resolve all active alerts for a metric.
    fn resolve_alerts_for_metric(&mut self, metric: AlertMetric) {
        for alert in &mut self.active_alerts {
            if alert.metric == metric && alert.active {
                alert.resolve();
            }
        }

        // Move resolved alerts to history
        let mut remaining = Vec::new();
        let mut resolved = Vec::new();

        for alert in self.active_alerts.drain(..) {
            if alert.active {
                remaining.push(alert);
            } else {
                resolved.push(alert);
            }
        }

        self.active_alerts = remaining;
        self.alert_history.extend(resolved);

        // Trim history if needed
        if self.alert_history.len() > self.max_history_size {
            let excess = self.alert_history.len() - self.max_history_size;
            self.alert_history.drain(0..excess);
        }
    }

    /// Get all active alerts.
    #[must_use]
    #[inline]
    pub fn get_active_alerts(&self) -> &[Alert] {
        &self.active_alerts
    }

    /// Get active alerts for a specific metric.
    #[must_use]
    #[inline]
    pub fn get_alerts_for_metric(&self, metric: AlertMetric) -> Vec<&Alert> {
        self.active_alerts
            .iter()
            .filter(|a| a.metric == metric && a.active)
            .collect()
    }

    /// Get active alerts by severity.
    #[must_use]
    #[inline]
    pub fn get_alerts_by_severity(&self, severity: AlertSeverity) -> Vec<&Alert> {
        self.active_alerts
            .iter()
            .filter(|a| a.severity == severity && a.active)
            .collect()
    }

    /// Get the alert history.
    #[must_use]
    #[inline]
    pub fn get_alert_history(&self) -> &[Alert] {
        &self.alert_history
    }

    /// Clear all resolved alerts from history.
    pub fn clear_history(&mut self) {
        self.alert_history.clear();
    }

    /// Get the count of active alerts.
    #[must_use]
    #[inline]
    pub fn active_alert_count(&self) -> usize {
        self.active_alerts.len()
    }

    /// Get the count of critical alerts.
    #[must_use]
    #[inline]
    pub fn critical_alert_count(&self) -> usize {
        self.active_alerts
            .iter()
            .filter(|a| a.severity == AlertSeverity::Critical && a.active)
            .count()
    }

    /// Check if there are any critical alerts.
    #[must_use]
    #[inline]
    pub fn has_critical_alerts(&self) -> bool {
        self.critical_alert_count() > 0
    }
}

/// Get an emoji for an alert severity.
#[must_use]
#[inline]
fn severity_emoji(severity: AlertSeverity) -> &'static str {
    match severity {
        AlertSeverity::Info => "ℹ️",
        AlertSeverity::Warning => "⚠️",
        AlertSeverity::Error => "❌",
        AlertSeverity::Critical => "🚨",
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_threshold_evaluation() {
        let config = ThresholdConfig {
            metric: AlertMetric::StorageUsagePercent,
            warning_threshold: 75.0,
            error_threshold: 90.0,
            critical_threshold: 95.0,
            check_interval_secs: 60,
        };

        assert_eq!(config.evaluate(50.0), None);
        assert_eq!(config.evaluate(75.0), Some(AlertSeverity::Warning));
        assert_eq!(config.evaluate(90.0), Some(AlertSeverity::Error));
        assert_eq!(config.evaluate(95.0), Some(AlertSeverity::Critical));
    }

    #[test]
    fn test_alert_creation() {
        let alert = Alert::new(
            AlertSeverity::Warning,
            AlertMetric::StorageUsagePercent,
            85.0,
            75.0,
        );

        assert_eq!(alert.severity, AlertSeverity::Warning);
        assert_eq!(alert.metric, AlertMetric::StorageUsagePercent);
        assert_eq!(alert.value, 85.0);
        assert_eq!(alert.threshold, 75.0);
        assert!(alert.active);
    }

    #[test]
    fn test_alert_manager_basic() {
        let mut manager = AlertManager::new();

        let config = ThresholdConfig {
            metric: AlertMetric::StorageUsagePercent,
            warning_threshold: 75.0,
            error_threshold: 90.0,
            critical_threshold: 95.0,
            check_interval_secs: 0,
        };
        manager.add_threshold(config);

        // Check below threshold
        manager.check_metric(AlertMetric::StorageUsagePercent, 50.0);
        assert_eq!(manager.active_alert_count(), 0);

        // Check above warning threshold
        manager.check_metric(AlertMetric::StorageUsagePercent, 80.0);
        assert_eq!(manager.active_alert_count(), 1);

        // Resolve by checking below threshold
        manager.check_metric(AlertMetric::StorageUsagePercent, 50.0);
        assert_eq!(manager.active_alert_count(), 0);
    }

    #[test]
    fn test_alert_severity_filtering() {
        let mut manager = AlertManager::new();

        let config = ThresholdConfig {
            metric: AlertMetric::StorageUsagePercent,
            warning_threshold: 75.0,
            error_threshold: 90.0,
            critical_threshold: 95.0,
            check_interval_secs: 0,
        };
        manager.add_threshold(config);

        manager.check_metric(AlertMetric::StorageUsagePercent, 96.0);

        let critical_alerts = manager.get_alerts_by_severity(AlertSeverity::Critical);
        assert_eq!(critical_alerts.len(), 1);
        assert!(manager.has_critical_alerts());
    }

    #[test]
    fn test_multiple_metrics() {
        let mut manager = AlertManager::new();

        let storage_config = ThresholdConfig {
            metric: AlertMetric::StorageUsagePercent,
            warning_threshold: 75.0,
            error_threshold: 90.0,
            critical_threshold: 95.0,
            check_interval_secs: 0,
        };
        manager.add_threshold(storage_config);

        let cpu_config = ThresholdConfig {
            metric: AlertMetric::CpuUsagePercent,
            warning_threshold: 70.0,
            error_threshold: 85.0,
            critical_threshold: 95.0,
            check_interval_secs: 0,
        };
        manager.add_threshold(cpu_config);

        manager.check_metric(AlertMetric::StorageUsagePercent, 92.0);
        manager.check_metric(AlertMetric::CpuUsagePercent, 88.0);

        assert_eq!(manager.active_alert_count(), 2);
    }

    #[test]
    fn test_alert_history() {
        let mut manager = AlertManager::with_config(100, 0);

        let config = ThresholdConfig {
            metric: AlertMetric::StorageUsagePercent,
            warning_threshold: 75.0,
            error_threshold: 90.0,
            critical_threshold: 95.0,
            check_interval_secs: 0,
        };
        manager.add_threshold(config);

        // Trigger and resolve an alert
        manager.check_metric(AlertMetric::StorageUsagePercent, 80.0);
        assert_eq!(manager.active_alert_count(), 1);

        manager.check_metric(AlertMetric::StorageUsagePercent, 50.0);
        assert_eq!(manager.active_alert_count(), 0);
        assert_eq!(manager.get_alert_history().len(), 1);

        manager.clear_history();
        assert_eq!(manager.get_alert_history().len(), 0);
    }

    #[test]
    fn test_metric_type_info() {
        assert_eq!(AlertMetric::StorageUsagePercent.name(), "Storage Usage");
        assert_eq!(AlertMetric::StorageUsagePercent.unit(), "%");
        assert_eq!(AlertMetric::BandwidthUsageBps.unit(), "bps");
        assert_eq!(AlertMetric::LatencyMs.unit(), "ms");
    }
}