1use serde::{Deserialize, Serialize};
27use std::time::Duration;
28
/// Severity level carried by an [`AlertRule`] and copied onto every [`Alert`]
/// created from it.
///
/// Serialized with serde using the lowercase variant name (`rename_all = "lowercase"`).
/// No ordering is derived, so severities can be compared for equality only.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum AlertSeverity {
    /// Informational severity.
    Info,
    /// Warning severity; this is the `Default` value.
    #[default]
    Warning,
    /// Critical severity.
    Critical,
    /// Paging severity.
    // NOTE(review): presumably meant to page an on-call responder — confirm
    // against the notification code, which is not visible here.
    Page,
}
43
44impl std::fmt::Display for AlertSeverity {
45 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
46 match self {
47 Self::Info => write!(f, "info"),
48 Self::Warning => write!(f, "warning"),
49 Self::Critical => write!(f, "critical"),
50 Self::Page => write!(f, "page"),
51 }
52 }
53}
54
/// Lifecycle state of an [`Alert`].
///
/// Serialized with serde using the lowercase variant name (`rename_all = "lowercase"`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum AlertState {
    /// The `Default` state.
    #[default]
    Ok,
    /// State assigned by `Alert::new`.
    Pending,
    /// State set by `Alert::fire`.
    Firing,
    /// State set by `Alert::acknowledge`.
    Acknowledged,
    /// State set by `Alert::resolve`.
    Resolved,
}
71
/// Top-level alerting configuration: the rule set, the routing table and the
/// timing parameters.
///
/// `Default` yields an enabled configuration with no rules, default routing,
/// a 15 s evaluation interval and a 300 s resolve timeout.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AlertConfig {
    /// Whether alerting is enabled (`true` by default).
    pub enabled: bool,
    /// Alert rules, in insertion order.
    pub rules: Vec<AlertRule>,
    /// Routing configuration (routes, receivers, default receiver).
    pub routing: AlertRouting,
    /// Evaluation interval (15 s by default).
    // NOTE(review): presumably how often rules are evaluated — the evaluator is
    // not visible in this file; confirm.
    pub evaluation_interval: Duration,
    /// Resolve timeout (300 s by default).
    // NOTE(review): presumably the time after which an alert that stopped
    // matching is treated as resolved — confirm against the evaluator.
    pub resolve_timeout: Duration,
}
86
87impl Default for AlertConfig {
88 fn default() -> Self {
89 Self {
90 enabled: true,
91 rules: Vec::new(),
92 routing: AlertRouting::default(),
93 evaluation_interval: Duration::from_secs(15),
94 resolve_timeout: Duration::from_secs(300),
95 }
96 }
97}
98
99impl AlertConfig {
100 pub fn add_rule(mut self, rule: AlertRule) -> Self {
102 self.rules.push(rule);
103 self
104 }
105
106 pub fn with_routing(mut self, routing: AlertRouting) -> Self {
108 self.routing = routing;
109 self
110 }
111
112 pub fn with_evaluation_interval(mut self, interval: Duration) -> Self {
114 self.evaluation_interval = interval;
115 self
116 }
117
118 pub fn with_default_rules(mut self) -> Self {
120 self.rules.push(AlertRule::kernel_unhealthy());
121 self.rules.push(AlertRule::high_latency());
122 self.rules.push(AlertRule::high_error_rate());
123 self.rules.push(AlertRule::queue_depth());
124 self.rules.push(AlertRule::gpu_memory());
125 self
126 }
127}
128
/// Definition of a single alert rule.
///
/// Built with `AlertRule::new` plus the chained setter methods, or with one of
/// the predefined constructors such as `AlertRule::kernel_unhealthy`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AlertRule {
    /// Rule name; copied into `Alert::rule_name` by `Alert::new`.
    pub name: String,
    /// Free-form description of the rule (empty by default).
    pub description: String,
    /// Condition expression stored as text, e.g. `error_rate > 0.01`.
    // NOTE(review): this file only stores the string; how and where it is
    // parsed/evaluated is not visible here.
    pub condition: String,
    /// Severity copied onto alerts created from this rule (`Warning` by default).
    pub severity: AlertSeverity,
    /// Duration associated with the rule (zero by default).
    // NOTE(review): presumably how long the condition must hold before the
    // alert fires — confirm against the evaluator.
    pub for_duration: Duration,
    /// Labels copied onto alerts created from this rule.
    pub labels: std::collections::HashMap<String, String>,
    /// Annotations copied onto alerts created from this rule.
    pub annotations: std::collections::HashMap<String, String>,
    /// Kernel filter (empty by default).
    // NOTE(review): presumably restricts the rule to the listed kernels, with
    // empty meaning "all" — semantics are not shown in this file; confirm.
    pub kernel_filter: Vec<String>,
    /// Domain filter (empty by default).
    // NOTE(review): presumably restricts the rule to the listed domains —
    // semantics are not shown in this file; confirm.
    pub domain_filter: Vec<String>,
}
151
152impl AlertRule {
153 pub fn new(name: impl Into<String>) -> Self {
155 Self {
156 name: name.into(),
157 description: String::new(),
158 condition: String::new(),
159 severity: AlertSeverity::Warning,
160 for_duration: Duration::from_secs(0),
161 labels: std::collections::HashMap::new(),
162 annotations: std::collections::HashMap::new(),
163 kernel_filter: Vec::new(),
164 domain_filter: Vec::new(),
165 }
166 }
167
168 pub fn description(mut self, desc: impl Into<String>) -> Self {
170 self.description = desc.into();
171 self
172 }
173
174 pub fn condition(mut self, cond: impl Into<String>) -> Self {
176 self.condition = cond.into();
177 self
178 }
179
180 pub fn severity(mut self, severity: AlertSeverity) -> Self {
182 self.severity = severity;
183 self
184 }
185
186 pub fn for_duration(mut self, duration: Duration) -> Self {
188 self.for_duration = duration;
189 self
190 }
191
192 pub fn label(mut self, key: impl Into<String>, value: impl Into<String>) -> Self {
194 self.labels.insert(key.into(), value.into());
195 self
196 }
197
198 pub fn annotation(mut self, key: impl Into<String>, value: impl Into<String>) -> Self {
200 self.annotations.insert(key.into(), value.into());
201 self
202 }
203
204 pub fn for_kernels(mut self, kernels: Vec<String>) -> Self {
206 self.kernel_filter = kernels;
207 self
208 }
209
210 pub fn for_domains(mut self, domains: Vec<String>) -> Self {
212 self.domain_filter = domains;
213 self
214 }
215
216 pub fn kernel_unhealthy() -> Self {
220 Self::new("KernelUnhealthy")
221 .description("Kernel is reporting unhealthy status")
222 .condition("health_status != healthy")
223 .severity(AlertSeverity::Critical)
224 .for_duration(Duration::from_secs(30))
225 .annotation("summary", "Kernel {{ $labels.kernel_id }} is unhealthy")
226 }
227
228 pub fn high_latency() -> Self {
230 Self::new("KernelHighLatency")
231 .description("Kernel message latency is above threshold")
232 .condition("avg_latency_ms > 100")
233 .severity(AlertSeverity::Warning)
234 .for_duration(Duration::from_secs(60))
235 .annotation(
236 "summary",
237 "Kernel {{ $labels.kernel_id }} has high latency ({{ $value }}ms)",
238 )
239 }
240
241 pub fn high_error_rate() -> Self {
243 Self::new("KernelHighErrorRate")
244 .description("Kernel error rate is above threshold")
245 .condition("error_rate > 0.01")
246 .severity(AlertSeverity::Warning)
247 .for_duration(Duration::from_secs(300))
248 .annotation(
249 "summary",
250 "Kernel {{ $labels.kernel_id }} has high error rate ({{ $value }})",
251 )
252 }
253
254 pub fn queue_depth() -> Self {
256 Self::new("KernelQueueDepth")
257 .description("Kernel message queue is getting full")
258 .condition("queue_depth > 1000")
259 .severity(AlertSeverity::Warning)
260 .for_duration(Duration::from_secs(60))
261 .annotation(
262 "summary",
263 "Kernel {{ $labels.kernel_id }} queue depth is high ({{ $value }})",
264 )
265 }
266
267 pub fn gpu_memory() -> Self {
269 Self::new("GPUMemoryHigh")
270 .description("GPU memory usage is above 90%")
271 .condition("gpu_memory_percent > 90")
272 .severity(AlertSeverity::Critical)
273 .for_duration(Duration::from_secs(60))
274 .annotation(
275 "summary",
276 "GPU memory usage is critically high ({{ $value }}%)",
277 )
278 }
279
280 pub fn slo_violation(slo_name: impl Into<String>) -> Self {
282 let name = slo_name.into();
283 Self::new(format!("SLOViolation_{}", name))
284 .description(format!("SLO '{}' is being violated", name))
285 .condition(format!("slo_{}_compliance < target", name))
286 .severity(AlertSeverity::Warning)
287 .for_duration(Duration::from_secs(300))
288 .label("slo", name.clone())
289 .annotation(
290 "summary",
291 format!("SLO '{}' compliance is below target", name),
292 )
293 }
294}
295
/// Routing configuration for alerts: a list of routes, the receivers they can
/// refer to, and an optional default receiver.
///
/// `Default` yields no default receiver and empty route/receiver lists.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct AlertRouting {
    /// Name of the default receiver, if any.
    // NOTE(review): presumably used when no route matches — the dispatch code
    // is not visible in this file; confirm.
    pub default_receiver: Option<String>,
    /// Routes, in insertion order.
    pub routes: Vec<AlertRoute>,
    /// Configured receivers, in insertion order.
    pub receivers: Vec<AlertReceiver>,
}
306
307impl AlertRouting {
308 pub fn add_route(mut self, route: AlertRoute) -> Self {
310 self.routes.push(route);
311 self
312 }
313
314 pub fn add_receiver(mut self, receiver: AlertReceiver) -> Self {
316 self.receivers.push(receiver);
317 self
318 }
319
320 pub fn with_default(mut self, receiver: impl Into<String>) -> Self {
322 self.default_receiver = Some(receiver.into());
323 self
324 }
325}
326
/// A single routing entry that associates a set of label matchers with a
/// receiver name and grouping parameters.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AlertRoute {
    /// Label key → value pairs, added through `AlertRoute::match_label`.
    // NOTE(review): presumably every pair must match an alert's labels for the
    // route to apply — the matching code is not visible here; confirm.
    pub matchers: std::collections::HashMap<String, String>,
    /// Name of the receiver this route targets.
    // NOTE(review): presumably must equal an `AlertReceiver::name` — confirm.
    pub receiver: String,
    /// `false` when built with `AlertRoute::new`.
    // NOTE(review): presumably whether later routes are still tried after this
    // one matches — confirm.
    pub continue_matching: bool,
    /// Label names used for grouping (empty by default).
    pub group_by: Vec<String>,
    /// Group wait time (30 s when built with `AlertRoute::new`).
    pub group_wait: Duration,
    /// Group interval (300 s when built with `AlertRoute::new`).
    pub group_interval: Duration,
}
343
344impl AlertRoute {
345 pub fn new(receiver: impl Into<String>) -> Self {
347 Self {
348 matchers: std::collections::HashMap::new(),
349 receiver: receiver.into(),
350 continue_matching: false,
351 group_by: Vec::new(),
352 group_wait: Duration::from_secs(30),
353 group_interval: Duration::from_secs(300),
354 }
355 }
356
357 pub fn match_label(mut self, key: impl Into<String>, value: impl Into<String>) -> Self {
359 self.matchers.insert(key.into(), value.into());
360 self
361 }
362
363 pub fn group_by(mut self, labels: Vec<String>) -> Self {
365 self.group_by = labels;
366 self
367 }
368}
369
/// A named notification receiver together with its transport configuration.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AlertReceiver {
    /// Receiver name.
    // NOTE(review): presumably the name routes refer to via
    // `AlertRoute::receiver` / `AlertRouting::default_receiver` — confirm.
    pub name: String,
    /// Transport kind and its settings.
    pub receiver_type: ReceiverType,
}
378
379impl AlertReceiver {
380 pub fn new(name: impl Into<String>, receiver_type: ReceiverType) -> Self {
382 Self {
383 name: name.into(),
384 receiver_type,
385 }
386 }
387
388 pub fn slack(name: impl Into<String>, webhook_url: impl Into<String>) -> Self {
390 Self::new(
391 name,
392 ReceiverType::Slack {
393 webhook_url: webhook_url.into(),
394 channel: None,
395 },
396 )
397 }
398
399 pub fn pagerduty(name: impl Into<String>, service_key: impl Into<String>) -> Self {
401 Self::new(
402 name,
403 ReceiverType::PagerDuty {
404 service_key: service_key.into(),
405 },
406 )
407 }
408
409 pub fn webhook(name: impl Into<String>, url: impl Into<String>) -> Self {
411 Self::new(name, ReceiverType::Webhook { url: url.into() })
412 }
413}
414
/// Transport used by an [`AlertReceiver`], with its per-transport settings.
///
/// Serialized as an internally tagged enum: the variant is stored in a `type`
/// field whose value is the snake_case variant name.
// NOTE(review): with `rename_all = "snake_case"` the `PagerDuty` variant is
// tagged `pager_duty`, not `pagerduty` — confirm this matches existing configs.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum ReceiverType {
    /// Slack incoming webhook.
    Slack {
        /// Webhook URL.
        webhook_url: String,
        /// Optional channel; `AlertReceiver::slack` leaves it `None`.
        channel: Option<String>,
    },
    /// PagerDuty integration.
    PagerDuty {
        /// PagerDuty service key.
        service_key: String,
    },
    /// Generic webhook.
    Webhook {
        /// Target URL.
        url: String,
    },
    /// Email delivery. `AlertReceiver` has no dedicated helper for this
    /// variant; build it with `AlertReceiver::new`.
    Email {
        /// Recipient addresses.
        to: Vec<String>,
        /// Sender address.
        from: String,
        /// SMTP server to send through.
        smtp_server: String,
    },
    /// Log-only receiver with no settings.
    Log,
}
448
/// A live alert instance created from an [`AlertRule`].
///
/// Only `Serialize` is derived; this type cannot be deserialized.
#[derive(Debug, Clone, Serialize)]
pub struct Alert {
    /// Name of the rule this alert was created from.
    pub rule_name: String,
    /// Current lifecycle state (`Pending` right after `Alert::new`).
    pub state: AlertState,
    /// Severity copied from the rule.
    pub severity: AlertSeverity,
    /// Labels copied from the rule.
    pub labels: std::collections::HashMap<String, String>,
    /// Annotations copied from the rule.
    pub annotations: std::collections::HashMap<String, String>,
    /// UTC time of the most recent transition into `Firing`; `None` until
    /// `Alert::fire` has been called.
    pub started_at: Option<chrono::DateTime<chrono::Utc>>,
    /// UTC time of creation or of the last `fire`/`resolve`/`acknowledge` call.
    pub updated_at: chrono::DateTime<chrono::Utc>,
    /// Optional numeric value; `None` after `Alert::new`.
    // NOTE(review): presumably the measured value that triggered the alert —
    // nothing in this file writes it; confirm.
    pub value: Option<f64>,
}
469
470impl Alert {
471 pub fn new(rule: &AlertRule) -> Self {
473 Self {
474 rule_name: rule.name.clone(),
475 state: AlertState::Pending,
476 severity: rule.severity,
477 labels: rule.labels.clone(),
478 annotations: rule.annotations.clone(),
479 started_at: None,
480 updated_at: chrono::Utc::now(),
481 value: None,
482 }
483 }
484
485 pub fn fire(&mut self) {
487 if self.state != AlertState::Firing {
488 self.state = AlertState::Firing;
489 self.started_at = Some(chrono::Utc::now());
490 }
491 self.updated_at = chrono::Utc::now();
492 }
493
494 pub fn resolve(&mut self) {
496 self.state = AlertState::Resolved;
497 self.updated_at = chrono::Utc::now();
498 }
499
500 pub fn acknowledge(&mut self) {
502 self.state = AlertState::Acknowledged;
503 self.updated_at = chrono::Utc::now();
504 }
505}
506
#[cfg(test)]
mod tests {
    use super::*;

    /// The fluent builder keeps the name given to `new` and the severity set on it.
    #[test]
    fn test_alert_rule() {
        let AlertRule { name, severity, .. } = AlertRule::new("test_rule")
            .description("Test rule")
            .condition("error_rate > 0.01")
            .severity(AlertSeverity::Warning)
            .for_duration(Duration::from_secs(60));

        assert_eq!(name, "test_rule");
        assert_eq!(severity, AlertSeverity::Warning);
    }

    /// Predefined rules carry their documented severities.
    #[test]
    fn test_predefined_rules() {
        assert_eq!(
            AlertRule::kernel_unhealthy().severity,
            AlertSeverity::Critical
        );
        assert_eq!(AlertRule::high_latency().severity, AlertSeverity::Warning);
    }

    /// `with_default_rules` adds at least one rule to an empty configuration.
    #[test]
    fn test_alert_config() {
        let rules = AlertConfig::default().with_default_rules().rules;
        assert!(!rules.is_empty());
    }

    /// Walks an alert through pending → firing → acknowledged → resolved.
    #[test]
    fn test_alert_state() {
        let source_rule = AlertRule::kernel_unhealthy();
        let mut subject = Alert::new(&source_rule);
        assert_eq!(subject.state, AlertState::Pending);

        subject.fire();
        assert_eq!(subject.state, AlertState::Firing);
        assert!(subject.started_at.is_some());

        subject.acknowledge();
        assert_eq!(subject.state, AlertState::Acknowledged);

        subject.resolve();
        assert_eq!(subject.state, AlertState::Resolved);
    }

    /// Receiver helpers keep the name they were given.
    #[test]
    fn test_receivers() {
        let slack_name = AlertReceiver::slack("slack-ops", "https://hooks.slack.com/xxx").name;
        assert_eq!(slack_name, "slack-ops");

        let pagerduty_name = AlertReceiver::pagerduty("pagerduty-ops", "service-key").name;
        assert_eq!(pagerduty_name, "pagerduty-ops");
    }
}