// rustkernel_core/observability/alerting.rs

//! Alert Rules and Routing
//!
//! Defines alert rules for kernel health and performance monitoring.
//!
//! # Features
//!
//! - Alert rule definition with conditions
//! - Severity levels and routing
//! - SLO violation alerts
//! - Receiver definitions for external alerting systems (Slack, PagerDuty,
//!   generic webhooks, email)
//!
//! # Example
//!
//! ```rust,ignore
//! use std::time::Duration;
//!
//! use rustkernel_core::observability::alerting::{AlertConfig, AlertRule, AlertSeverity};
//!
//! let config = AlertConfig::default()
//!     .add_rule(
//!         AlertRule::new("high_latency")
//!             .condition("avg_latency_ms > 100")
//!             .severity(AlertSeverity::Warning)
//!             .for_duration(Duration::from_secs(60))
//!     );
//! ```
25
26use serde::{Deserialize, Serialize};
27use std::time::Duration;
28
29/// Alert severity levels
30#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
31#[serde(rename_all = "lowercase")]
32pub enum AlertSeverity {
33    /// Informational alert
34    Info,
35    /// Warning - may need attention
36    #[default]
37    Warning,
38    /// Critical - needs immediate attention
39    Critical,
40    /// Page - wake someone up
41    Page,
42}
43
44impl std::fmt::Display for AlertSeverity {
45    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
46        match self {
47            Self::Info => write!(f, "info"),
48            Self::Warning => write!(f, "warning"),
49            Self::Critical => write!(f, "critical"),
50            Self::Page => write!(f, "page"),
51        }
52    }
53}
54
55/// Alert state
56#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
57#[serde(rename_all = "lowercase")]
58pub enum AlertState {
59    /// Alert is not firing
60    #[default]
61    Ok,
62    /// Alert condition is pending (within for_duration)
63    Pending,
64    /// Alert is firing
65    Firing,
66    /// Alert has been acknowledged
67    Acknowledged,
68    /// Alert has been resolved
69    Resolved,
70}
71
72/// Alert configuration
73#[derive(Debug, Clone, Serialize, Deserialize)]
74pub struct AlertConfig {
75    /// Enable alerting
76    pub enabled: bool,
77    /// Alert rules
78    pub rules: Vec<AlertRule>,
79    /// Alert routing configuration
80    pub routing: AlertRouting,
81    /// Evaluation interval
82    pub evaluation_interval: Duration,
83    /// Resolve timeout (auto-resolve after this duration of OK)
84    pub resolve_timeout: Duration,
85}
86
87impl Default for AlertConfig {
88    fn default() -> Self {
89        Self {
90            enabled: true,
91            rules: Vec::new(),
92            routing: AlertRouting::default(),
93            evaluation_interval: Duration::from_secs(15),
94            resolve_timeout: Duration::from_secs(300),
95        }
96    }
97}
98
99impl AlertConfig {
100    /// Add an alert rule
101    pub fn add_rule(mut self, rule: AlertRule) -> Self {
102        self.rules.push(rule);
103        self
104    }
105
106    /// Set routing configuration
107    pub fn with_routing(mut self, routing: AlertRouting) -> Self {
108        self.routing = routing;
109        self
110    }
111
112    /// Set evaluation interval
113    pub fn with_evaluation_interval(mut self, interval: Duration) -> Self {
114        self.evaluation_interval = interval;
115        self
116    }
117
118    /// Add default kernel health rules
119    pub fn with_default_rules(mut self) -> Self {
120        self.rules.push(AlertRule::kernel_unhealthy());
121        self.rules.push(AlertRule::high_latency());
122        self.rules.push(AlertRule::high_error_rate());
123        self.rules.push(AlertRule::queue_depth());
124        self.rules.push(AlertRule::gpu_memory());
125        self
126    }
127}
128
129/// Alert rule definition
130#[derive(Debug, Clone, Serialize, Deserialize)]
131pub struct AlertRule {
132    /// Rule name
133    pub name: String,
134    /// Rule description
135    pub description: String,
136    /// Alert condition expression
137    pub condition: String,
138    /// Severity level
139    pub severity: AlertSeverity,
140    /// Duration condition must be true before firing
141    pub for_duration: Duration,
142    /// Labels for routing
143    pub labels: std::collections::HashMap<String, String>,
144    /// Annotations for alert message
145    pub annotations: std::collections::HashMap<String, String>,
146    /// Kernels this rule applies to (empty = all)
147    pub kernel_filter: Vec<String>,
148    /// Domains this rule applies to (empty = all)
149    pub domain_filter: Vec<String>,
150}
151
152impl AlertRule {
153    /// Create a new alert rule
154    pub fn new(name: impl Into<String>) -> Self {
155        Self {
156            name: name.into(),
157            description: String::new(),
158            condition: String::new(),
159            severity: AlertSeverity::Warning,
160            for_duration: Duration::from_secs(0),
161            labels: std::collections::HashMap::new(),
162            annotations: std::collections::HashMap::new(),
163            kernel_filter: Vec::new(),
164            domain_filter: Vec::new(),
165        }
166    }
167
168    /// Set description
169    pub fn description(mut self, desc: impl Into<String>) -> Self {
170        self.description = desc.into();
171        self
172    }
173
174    /// Set condition
175    pub fn condition(mut self, cond: impl Into<String>) -> Self {
176        self.condition = cond.into();
177        self
178    }
179
180    /// Set severity
181    pub fn severity(mut self, severity: AlertSeverity) -> Self {
182        self.severity = severity;
183        self
184    }
185
186    /// Set for_duration
187    pub fn for_duration(mut self, duration: Duration) -> Self {
188        self.for_duration = duration;
189        self
190    }
191
192    /// Add a label
193    pub fn label(mut self, key: impl Into<String>, value: impl Into<String>) -> Self {
194        self.labels.insert(key.into(), value.into());
195        self
196    }
197
198    /// Add an annotation
199    pub fn annotation(mut self, key: impl Into<String>, value: impl Into<String>) -> Self {
200        self.annotations.insert(key.into(), value.into());
201        self
202    }
203
204    /// Filter to specific kernels
205    pub fn for_kernels(mut self, kernels: Vec<String>) -> Self {
206        self.kernel_filter = kernels;
207        self
208    }
209
210    /// Filter to specific domains
211    pub fn for_domains(mut self, domains: Vec<String>) -> Self {
212        self.domain_filter = domains;
213        self
214    }
215
216    // Predefined rules
217
218    /// Kernel unhealthy rule
219    pub fn kernel_unhealthy() -> Self {
220        Self::new("KernelUnhealthy")
221            .description("Kernel is reporting unhealthy status")
222            .condition("health_status != healthy")
223            .severity(AlertSeverity::Critical)
224            .for_duration(Duration::from_secs(30))
225            .annotation("summary", "Kernel {{ $labels.kernel_id }} is unhealthy")
226    }
227
228    /// High latency rule
229    pub fn high_latency() -> Self {
230        Self::new("KernelHighLatency")
231            .description("Kernel message latency is above threshold")
232            .condition("avg_latency_ms > 100")
233            .severity(AlertSeverity::Warning)
234            .for_duration(Duration::from_secs(60))
235            .annotation(
236                "summary",
237                "Kernel {{ $labels.kernel_id }} has high latency ({{ $value }}ms)",
238            )
239    }
240
241    /// High error rate rule
242    pub fn high_error_rate() -> Self {
243        Self::new("KernelHighErrorRate")
244            .description("Kernel error rate is above threshold")
245            .condition("error_rate > 0.01")
246            .severity(AlertSeverity::Warning)
247            .for_duration(Duration::from_secs(300))
248            .annotation(
249                "summary",
250                "Kernel {{ $labels.kernel_id }} has high error rate ({{ $value }})",
251            )
252    }
253
254    /// Queue depth rule
255    pub fn queue_depth() -> Self {
256        Self::new("KernelQueueDepth")
257            .description("Kernel message queue is getting full")
258            .condition("queue_depth > 1000")
259            .severity(AlertSeverity::Warning)
260            .for_duration(Duration::from_secs(60))
261            .annotation(
262                "summary",
263                "Kernel {{ $labels.kernel_id }} queue depth is high ({{ $value }})",
264            )
265    }
266
267    /// GPU memory rule
268    pub fn gpu_memory() -> Self {
269        Self::new("GPUMemoryHigh")
270            .description("GPU memory usage is above 90%")
271            .condition("gpu_memory_percent > 90")
272            .severity(AlertSeverity::Critical)
273            .for_duration(Duration::from_secs(60))
274            .annotation(
275                "summary",
276                "GPU memory usage is critically high ({{ $value }}%)",
277            )
278    }
279
280    /// SLO violation rule
281    pub fn slo_violation(slo_name: impl Into<String>) -> Self {
282        let name = slo_name.into();
283        Self::new(format!("SLOViolation_{}", name))
284            .description(format!("SLO '{}' is being violated", name))
285            .condition(format!("slo_{}_compliance < target", name))
286            .severity(AlertSeverity::Warning)
287            .for_duration(Duration::from_secs(300))
288            .label("slo", name.clone())
289            .annotation(
290                "summary",
291                format!("SLO '{}' compliance is below target", name),
292            )
293    }
294}
295
296/// Alert routing configuration
297#[derive(Debug, Clone, Default, Serialize, Deserialize)]
298pub struct AlertRouting {
299    /// Default receiver
300    pub default_receiver: Option<String>,
301    /// Routes based on labels
302    pub routes: Vec<AlertRoute>,
303    /// Receiver configurations
304    pub receivers: Vec<AlertReceiver>,
305}
306
307impl AlertRouting {
308    /// Add a route
309    pub fn add_route(mut self, route: AlertRoute) -> Self {
310        self.routes.push(route);
311        self
312    }
313
314    /// Add a receiver
315    pub fn add_receiver(mut self, receiver: AlertReceiver) -> Self {
316        self.receivers.push(receiver);
317        self
318    }
319
320    /// Set default receiver
321    pub fn with_default(mut self, receiver: impl Into<String>) -> Self {
322        self.default_receiver = Some(receiver.into());
323        self
324    }
325}
326
327/// Alert route
328#[derive(Debug, Clone, Serialize, Deserialize)]
329pub struct AlertRoute {
330    /// Label matchers
331    pub matchers: std::collections::HashMap<String, String>,
332    /// Receiver name
333    pub receiver: String,
334    /// Continue matching after this route
335    pub continue_matching: bool,
336    /// Group by labels
337    pub group_by: Vec<String>,
338    /// Group wait duration
339    pub group_wait: Duration,
340    /// Group interval
341    pub group_interval: Duration,
342}
343
344impl AlertRoute {
345    /// Create a new route
346    pub fn new(receiver: impl Into<String>) -> Self {
347        Self {
348            matchers: std::collections::HashMap::new(),
349            receiver: receiver.into(),
350            continue_matching: false,
351            group_by: Vec::new(),
352            group_wait: Duration::from_secs(30),
353            group_interval: Duration::from_secs(300),
354        }
355    }
356
357    /// Add a matcher
358    pub fn match_label(mut self, key: impl Into<String>, value: impl Into<String>) -> Self {
359        self.matchers.insert(key.into(), value.into());
360        self
361    }
362
363    /// Set group by
364    pub fn group_by(mut self, labels: Vec<String>) -> Self {
365        self.group_by = labels;
366        self
367    }
368}
369
370/// Alert receiver
371#[derive(Debug, Clone, Serialize, Deserialize)]
372pub struct AlertReceiver {
373    /// Receiver name
374    pub name: String,
375    /// Receiver type
376    pub receiver_type: ReceiverType,
377}
378
379impl AlertReceiver {
380    /// Create a new receiver
381    pub fn new(name: impl Into<String>, receiver_type: ReceiverType) -> Self {
382        Self {
383            name: name.into(),
384            receiver_type,
385        }
386    }
387
388    /// Slack receiver
389    pub fn slack(name: impl Into<String>, webhook_url: impl Into<String>) -> Self {
390        Self::new(
391            name,
392            ReceiverType::Slack {
393                webhook_url: webhook_url.into(),
394                channel: None,
395            },
396        )
397    }
398
399    /// PagerDuty receiver
400    pub fn pagerduty(name: impl Into<String>, service_key: impl Into<String>) -> Self {
401        Self::new(
402            name,
403            ReceiverType::PagerDuty {
404                service_key: service_key.into(),
405            },
406        )
407    }
408
409    /// Webhook receiver
410    pub fn webhook(name: impl Into<String>, url: impl Into<String>) -> Self {
411        Self::new(name, ReceiverType::Webhook { url: url.into() })
412    }
413}
414
415/// Receiver type
416#[derive(Debug, Clone, Serialize, Deserialize)]
417#[serde(tag = "type", rename_all = "snake_case")]
418pub enum ReceiverType {
419    /// Slack webhook
420    Slack {
421        /// Slack webhook URL
422        webhook_url: String,
423        /// Slack channel override
424        channel: Option<String>,
425    },
426    /// PagerDuty
427    PagerDuty {
428        /// PagerDuty service key
429        service_key: String,
430    },
431    /// Generic webhook
432    Webhook {
433        /// Webhook URL
434        url: String,
435    },
436    /// Email
437    Email {
438        /// Email recipients
439        to: Vec<String>,
440        /// Sender address
441        from: String,
442        /// SMTP server address
443        smtp_server: String,
444    },
445    /// Log only (for testing)
446    Log,
447}
448
449/// An active alert instance
450#[derive(Debug, Clone, Serialize)]
451pub struct Alert {
452    /// Alert rule name
453    pub rule_name: String,
454    /// Current state
455    pub state: AlertState,
456    /// Severity
457    pub severity: AlertSeverity,
458    /// Labels
459    pub labels: std::collections::HashMap<String, String>,
460    /// Annotations
461    pub annotations: std::collections::HashMap<String, String>,
462    /// When the alert started firing
463    pub started_at: Option<chrono::DateTime<chrono::Utc>>,
464    /// When the alert was last updated
465    pub updated_at: chrono::DateTime<chrono::Utc>,
466    /// Current value that triggered the alert
467    pub value: Option<f64>,
468}
469
470impl Alert {
471    /// Create a new alert
472    pub fn new(rule: &AlertRule) -> Self {
473        Self {
474            rule_name: rule.name.clone(),
475            state: AlertState::Pending,
476            severity: rule.severity,
477            labels: rule.labels.clone(),
478            annotations: rule.annotations.clone(),
479            started_at: None,
480            updated_at: chrono::Utc::now(),
481            value: None,
482        }
483    }
484
485    /// Transition to firing state
486    pub fn fire(&mut self) {
487        if self.state != AlertState::Firing {
488            self.state = AlertState::Firing;
489            self.started_at = Some(chrono::Utc::now());
490        }
491        self.updated_at = chrono::Utc::now();
492    }
493
494    /// Transition to resolved state
495    pub fn resolve(&mut self) {
496        self.state = AlertState::Resolved;
497        self.updated_at = chrono::Utc::now();
498    }
499
500    /// Acknowledge the alert
501    pub fn acknowledge(&mut self) {
502        self.state = AlertState::Acknowledged;
503        self.updated_at = chrono::Utc::now();
504    }
505}
506
#[cfg(test)]
mod tests {
    use super::*;

    /// Builder calls are reflected in the resulting rule.
    #[test]
    fn test_alert_rule() {
        let hold = Duration::from_secs(60);
        let built = AlertRule::new("test_rule")
            .description("Test rule")
            .condition("error_rate > 0.01")
            .severity(AlertSeverity::Warning)
            .for_duration(hold);

        assert_eq!(built.name, "test_rule");
        assert_eq!(built.severity, AlertSeverity::Warning);
    }

    /// Predefined rules carry the expected severities.
    #[test]
    fn test_predefined_rules() {
        assert_eq!(
            AlertRule::kernel_unhealthy().severity,
            AlertSeverity::Critical
        );
        assert_eq!(AlertRule::high_latency().severity, AlertSeverity::Warning);
    }

    /// `with_default_rules` adds at least one rule to an empty config.
    #[test]
    fn test_alert_config() {
        let cfg = AlertConfig::default().with_default_rules();
        assert!(!cfg.rules.is_empty());
    }

    /// Walks an alert through pending -> firing -> acknowledged -> resolved.
    #[test]
    fn test_alert_state() {
        let mut alert = Alert::new(&AlertRule::kernel_unhealthy());
        assert_eq!(alert.state, AlertState::Pending);

        alert.fire();
        assert_eq!(alert.state, AlertState::Firing);
        assert!(alert.started_at.is_some());

        alert.acknowledge();
        assert_eq!(alert.state, AlertState::Acknowledged);

        alert.resolve();
        assert_eq!(alert.state, AlertState::Resolved);
    }

    /// Convenience constructors keep the receiver name they were given.
    #[test]
    fn test_receivers() {
        let slack_rx = AlertReceiver::slack("slack-ops", "https://hooks.slack.com/xxx");
        assert_eq!(slack_rx.name, "slack-ops");

        let pagerduty_rx = AlertReceiver::pagerduty("pagerduty-ops", "service-key");
        assert_eq!(pagerduty_rx.name, "pagerduty-ops");
    }
}