use serde::{Deserialize, Serialize};
use std::time::Duration;
/// Severity level attached to alert rules and to the alerts created from them.
///
/// Serde reads and writes each variant as its lowercase name (`"info"`, `"warning"`,
/// `"critical"`, `"page"`). [`Default`] yields [`AlertSeverity::Warning`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum AlertSeverity {
    /// Informational level.
    Info,
    /// Warning level; this is the default severity.
    #[default]
    Warning,
    /// Critical level.
    Critical,
    /// Paging level.
    // NOTE(review): presumably intended to page an on-call responder; nothing in this file
    // acts on the distinction — confirm against the notification code.
    Page,
}
impl std::fmt::Display for AlertSeverity {
    /// Writes the lowercase severity name (`info`, `warning`, `critical` or `page`).
    ///
    /// The name is emitted through [`std::fmt::Formatter::pad`], so formatter flags are
    /// honoured: `format!("{:>8}", AlertSeverity::Info)` yields `"    info"`. A precision
    /// such as `{:.3}` truncates the name, as it does for any `&str`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let name = match self {
            Self::Info => "info",
            Self::Warning => "warning",
            Self::Critical => "critical",
            Self::Page => "page",
        };
        // `write!(f, "...")` would bypass width/fill/alignment handling; `pad` applies it.
        f.pad(name)
    }
}
/// Lifecycle state of an [`Alert`].
///
/// Serde reads and writes each variant as its lowercase name. [`Default`] yields
/// [`AlertState::Ok`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum AlertState {
    /// Default state.
    #[default]
    Ok,
    /// State a freshly constructed alert starts in (see [`Alert::new`]).
    Pending,
    /// State set by [`Alert::fire`].
    Firing,
    /// State set by [`Alert::acknowledge`].
    Acknowledged,
    /// State set by [`Alert::resolve`].
    Resolved,
}
/// Top-level alerting configuration: the rule set, the routing table and timing settings.
///
/// Build one with [`AlertConfig::default`] and the consuming builder methods on this type.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AlertConfig {
    /// Alerting on/off switch; `true` in the default configuration.
    // NOTE(review): how this flag is consumed is not visible in this file.
    pub enabled: bool,
    /// Alert rules, in insertion order.
    pub rules: Vec<AlertRule>,
    /// Receivers and the routes that select them.
    pub routing: AlertRouting,
    /// Evaluation interval; 15 seconds in the default configuration.
    // NOTE(review): presumably the period between rule evaluations — the evaluator is not
    // part of this file.
    pub evaluation_interval: Duration,
    /// Resolve timeout; 300 seconds in the default configuration.
    pub resolve_timeout: Duration,
}
impl Default for AlertConfig {
fn default() -> Self {
Self {
enabled: true,
rules: Vec::new(),
routing: AlertRouting::default(),
evaluation_interval: Duration::from_secs(15),
resolve_timeout: Duration::from_secs(300),
}
}
}
impl AlertConfig {
    /// Appends `rule` after any rules already present and returns the configuration.
    pub fn add_rule(mut self, rule: AlertRule) -> Self {
        self.rules.push(rule);
        self
    }

    /// Replaces the routing table with `routing` and returns the configuration.
    pub fn with_routing(self, routing: AlertRouting) -> Self {
        Self { routing, ..self }
    }

    /// Sets the evaluation interval to `interval` and returns the configuration.
    pub fn with_evaluation_interval(self, interval: Duration) -> Self {
        Self {
            evaluation_interval: interval,
            ..self
        }
    }

    /// Appends the five predefined rules, in this order: kernel unhealthy, high latency,
    /// high error rate, queue depth, GPU memory. Existing rules are kept.
    pub fn with_default_rules(mut self) -> Self {
        self.rules.extend([
            AlertRule::kernel_unhealthy(),
            AlertRule::high_latency(),
            AlertRule::high_error_rate(),
            AlertRule::queue_depth(),
            AlertRule::gpu_memory(),
        ]);
        self
    }
}
/// Declarative description of a single alert: what to check, how severe it is, and the
/// metadata copied onto alerts created from it.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AlertRule {
    /// Rule name; copied into [`Alert::rule_name`] by [`Alert::new`].
    pub name: String,
    /// Human-readable description of the rule.
    pub description: String,
    /// Condition expression, stored as free-form text (e.g. `"error_rate > 0.01"`).
    // NOTE(review): the expression grammar and its evaluator are not part of this file.
    pub condition: String,
    /// Severity given to alerts created from this rule.
    pub severity: AlertSeverity,
    /// Hold duration for the rule; zero for a rule made with [`AlertRule::new`].
    // NOTE(review): presumably how long the condition must hold before the alert fires
    // (like Prometheus `for`) — confirm against the evaluator.
    pub for_duration: Duration,
    /// Key/value labels copied onto alerts created from this rule.
    pub labels: std::collections::HashMap<String, String>,
    /// Key/value annotations copied onto alerts created from this rule. The predefined
    /// rules store a `summary` template here.
    pub annotations: std::collections::HashMap<String, String>,
    /// Kernel names associated with the rule; empty by default.
    // NOTE(review): presumably restricts the rule to these kernels — filter semantics
    // (including what an empty list means) are not visible here.
    pub kernel_filter: Vec<String>,
    /// Domain names associated with the rule; empty by default.
    // NOTE(review): presumably restricts the rule to these domains — TODO confirm.
    pub domain_filter: Vec<String>,
}
impl AlertRule {
    /// Creates a rule called `name` with empty description and condition,
    /// [`AlertSeverity::Warning`], a zero hold duration, and no labels, annotations or
    /// filters.
    pub fn new(name: impl Into<String>) -> Self {
        Self {
            name: name.into(),
            description: String::new(),
            condition: String::new(),
            severity: AlertSeverity::Warning,
            for_duration: Duration::ZERO,
            labels: Default::default(),
            annotations: Default::default(),
            kernel_filter: Vec::new(),
            domain_filter: Vec::new(),
        }
    }

    /// Sets the human-readable description.
    pub fn description(self, desc: impl Into<String>) -> Self {
        Self {
            description: desc.into(),
            ..self
        }
    }

    /// Sets the condition expression text.
    pub fn condition(self, cond: impl Into<String>) -> Self {
        Self {
            condition: cond.into(),
            ..self
        }
    }

    /// Sets the severity.
    pub fn severity(self, severity: AlertSeverity) -> Self {
        Self { severity, ..self }
    }

    /// Sets the hold duration.
    pub fn for_duration(self, duration: Duration) -> Self {
        Self {
            for_duration: duration,
            ..self
        }
    }

    /// Inserts a label, replacing any existing value stored under the same key.
    pub fn label(mut self, key: impl Into<String>, value: impl Into<String>) -> Self {
        let (key, value) = (key.into(), value.into());
        self.labels.insert(key, value);
        self
    }

    /// Inserts an annotation, replacing any existing value stored under the same key.
    pub fn annotation(mut self, key: impl Into<String>, value: impl Into<String>) -> Self {
        let (key, value) = (key.into(), value.into());
        self.annotations.insert(key, value);
        self
    }

    /// Replaces the kernel filter list with `kernels`.
    pub fn for_kernels(self, kernels: Vec<String>) -> Self {
        Self {
            kernel_filter: kernels,
            ..self
        }
    }

    /// Replaces the domain filter list with `domains`.
    pub fn for_domains(self, domains: Vec<String>) -> Self {
        Self {
            domain_filter: domains,
            ..self
        }
    }

    /// Shared constructor for the predefined rules: every one of them sets a description,
    /// a condition, a severity, a hold duration in seconds and a `summary` annotation.
    fn preset(
        name: &str,
        description: &str,
        condition: &str,
        severity: AlertSeverity,
        hold_secs: u64,
        summary: &str,
    ) -> Self {
        Self::new(name)
            .description(description)
            .condition(condition)
            .severity(severity)
            .for_duration(Duration::from_secs(hold_secs))
            .annotation("summary", summary)
    }

    /// Predefined critical rule `KernelUnhealthy`: condition `health_status != healthy`,
    /// 30-second hold duration.
    pub fn kernel_unhealthy() -> Self {
        Self::preset(
            "KernelUnhealthy",
            "Kernel is reporting unhealthy status",
            "health_status != healthy",
            AlertSeverity::Critical,
            30,
            "Kernel {{ $labels.kernel_id }} is unhealthy",
        )
    }

    /// Predefined warning rule `KernelHighLatency`: condition `avg_latency_ms > 100`,
    /// 60-second hold duration.
    pub fn high_latency() -> Self {
        Self::preset(
            "KernelHighLatency",
            "Kernel message latency is above threshold",
            "avg_latency_ms > 100",
            AlertSeverity::Warning,
            60,
            "Kernel {{ $labels.kernel_id }} has high latency ({{ $value }}ms)",
        )
    }

    /// Predefined warning rule `KernelHighErrorRate`: condition `error_rate > 0.01`,
    /// 300-second hold duration.
    pub fn high_error_rate() -> Self {
        Self::preset(
            "KernelHighErrorRate",
            "Kernel error rate is above threshold",
            "error_rate > 0.01",
            AlertSeverity::Warning,
            300,
            "Kernel {{ $labels.kernel_id }} has high error rate ({{ $value }})",
        )
    }

    /// Predefined warning rule `KernelQueueDepth`: condition `queue_depth > 1000`,
    /// 60-second hold duration.
    pub fn queue_depth() -> Self {
        Self::preset(
            "KernelQueueDepth",
            "Kernel message queue is getting full",
            "queue_depth > 1000",
            AlertSeverity::Warning,
            60,
            "Kernel {{ $labels.kernel_id }} queue depth is high ({{ $value }})",
        )
    }

    /// Predefined critical rule `GPUMemoryHigh`: condition `gpu_memory_percent > 90`,
    /// 60-second hold duration.
    pub fn gpu_memory() -> Self {
        Self::preset(
            "GPUMemoryHigh",
            "GPU memory usage is above 90%",
            "gpu_memory_percent > 90",
            AlertSeverity::Critical,
            60,
            "GPU memory usage is critically high ({{ $value }}%)",
        )
    }

    /// Builds a warning rule for the SLO called `slo_name`.
    ///
    /// The rule is named `SLOViolation_<name>`, uses the condition
    /// `slo_<name>_compliance < target`, has a 300-second hold duration, carries the label
    /// `slo = <name>` and a `summary` annotation. The name is interpolated verbatim.
    pub fn slo_violation(slo_name: impl Into<String>) -> Self {
        let slo = slo_name.into();
        let rule = Self::new(format!("SLOViolation_{slo}"))
            .description(format!("SLO '{slo}' is being violated"))
            .condition(format!("slo_{slo}_compliance < target"))
            .severity(AlertSeverity::Warning)
            .for_duration(Duration::from_secs(300))
            .annotation(
                "summary",
                format!("SLO '{slo}' compliance is below target"),
            );
        // Attach the label last so the owned name can be moved in without cloning.
        rule.label("slo", slo)
    }
}
/// Routing table: the known receivers, the routes that select them, and an optional
/// default receiver name.
///
/// [`Default`] yields no default receiver, no routes and no receivers.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct AlertRouting {
    /// Name of the default receiver, if one is set.
    // NOTE(review): presumably used when no route matches — the matching logic is not in
    // this file.
    pub default_receiver: Option<String>,
    /// Routes, in insertion order.
    pub routes: Vec<AlertRoute>,
    /// Receiver definitions, in insertion order.
    pub receivers: Vec<AlertReceiver>,
}
impl AlertRouting {
    /// Appends `route` after any routes already present and returns the routing table.
    pub fn add_route(mut self, route: AlertRoute) -> Self {
        self.routes.push(route);
        self
    }

    /// Appends `receiver` after any receivers already present and returns the routing table.
    pub fn add_receiver(mut self, receiver: AlertReceiver) -> Self {
        self.receivers.push(receiver);
        self
    }

    /// Sets the default receiver name, replacing any previous one.
    pub fn with_default(self, receiver: impl Into<String>) -> Self {
        Self {
            default_receiver: Some(receiver.into()),
            ..self
        }
    }
}
/// One routing entry: a set of label matchers and the receiver they point to, plus
/// grouping settings.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AlertRoute {
    /// Label name to label value pairs associated with this route.
    // NOTE(review): presumably every pair must equal the alert's label for the route to
    // match — the matching code is not part of this file.
    pub matchers: std::collections::HashMap<String, String>,
    /// Name of the receiver this route points to.
    pub receiver: String,
    /// `false` for routes made with [`AlertRoute::new`].
    // NOTE(review): presumably "keep evaluating later routes after this one matches" —
    // TODO confirm.
    pub continue_matching: bool,
    /// Label names used for grouping; empty for routes made with [`AlertRoute::new`].
    pub group_by: Vec<String>,
    /// Group wait; 30 seconds for routes made with [`AlertRoute::new`].
    pub group_wait: Duration,
    /// Group interval; 300 seconds for routes made with [`AlertRoute::new`].
    pub group_interval: Duration,
}
impl AlertRoute {
    /// Creates a route to `receiver` with no matchers, no grouping labels,
    /// `continue_matching` off, a 30-second group wait and a 300-second group interval.
    pub fn new(receiver: impl Into<String>) -> Self {
        let receiver = receiver.into();
        Self {
            matchers: Default::default(),
            receiver,
            continue_matching: false,
            group_by: Vec::new(),
            group_wait: Duration::from_secs(30),
            group_interval: Duration::from_secs(300),
        }
    }

    /// Adds a label matcher, replacing any existing matcher stored under the same key.
    pub fn match_label(mut self, key: impl Into<String>, value: impl Into<String>) -> Self {
        let (key, value) = (key.into(), value.into());
        self.matchers.insert(key, value);
        self
    }

    /// Replaces the grouping label list with `labels`.
    pub fn group_by(self, labels: Vec<String>) -> Self {
        Self {
            group_by: labels,
            ..self
        }
    }
}
/// A named notification target.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AlertReceiver {
    /// Receiver name.
    // NOTE(review): presumably the name that `AlertRoute::receiver` and
    // `AlertRouting::default_receiver` refer to — the lookup is not in this file.
    pub name: String,
    /// Kind of target and its connection settings.
    pub receiver_type: ReceiverType,
}
impl AlertReceiver {
    /// Creates a receiver called `name` that delivers through `receiver_type`.
    pub fn new(name: impl Into<String>, receiver_type: ReceiverType) -> Self {
        let name = name.into();
        Self {
            name,
            receiver_type,
        }
    }

    /// Creates a Slack receiver for `webhook_url` with no channel set.
    pub fn slack(name: impl Into<String>, webhook_url: impl Into<String>) -> Self {
        let target = ReceiverType::Slack {
            webhook_url: webhook_url.into(),
            channel: None,
        };
        Self::new(name, target)
    }

    /// Creates a PagerDuty receiver that uses `service_key`.
    pub fn pagerduty(name: impl Into<String>, service_key: impl Into<String>) -> Self {
        let target = ReceiverType::PagerDuty {
            service_key: service_key.into(),
        };
        Self::new(name, target)
    }

    /// Creates a generic webhook receiver for `url`.
    pub fn webhook(name: impl Into<String>, url: impl Into<String>) -> Self {
        let target = ReceiverType::Webhook { url: url.into() };
        Self::new(name, target)
    }
}
/// Kind of notification target together with its connection settings.
///
/// Serialized as an internally tagged enum: a `type` field holds the snake_case variant
/// name (`slack`, `pager_duty`, `webhook`, `email`, `log`) next to the variant's own
/// fields.
// NOTE(review): the derived `Debug` prints `webhook_url` and `service_key` verbatim.
// These are presumably credentials — avoid logging receivers with `{:?}`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum ReceiverType {
    /// Slack incoming webhook.
    Slack {
        /// Webhook URL to post to.
        webhook_url: String,
        /// Optional channel; `None` when built through `AlertReceiver::slack`.
        channel: Option<String>,
    },
    /// PagerDuty service.
    ///
    /// The canonical tag is `pager_duty`. The one-word spelling `pagerduty`, which matches
    /// the `AlertReceiver::pagerduty` constructor, is also accepted when deserializing.
    /// Serialized output is unchanged.
    #[serde(alias = "pagerduty")]
    PagerDuty {
        /// Service key used for the PagerDuty target.
        service_key: String,
    },
    /// Generic webhook.
    Webhook {
        /// Target URL.
        url: String,
    },
    /// E-mail target.
    Email {
        /// Recipient addresses.
        to: Vec<String>,
        /// Sender address.
        from: String,
        /// SMTP server to send through.
        smtp_server: String,
    },
    /// Log target; carries no settings.
    Log,
}
/// A live alert instance created from an [`AlertRule`].
///
/// Only `Serialize` is derived, so alerts can be written out but not read back.
#[derive(Debug, Clone, Serialize)]
pub struct Alert {
    /// Name of the rule this alert was created from.
    pub rule_name: String,
    /// Current lifecycle state.
    pub state: AlertState,
    /// Severity copied from the rule at creation time.
    pub severity: AlertSeverity,
    /// Labels copied from the rule at creation time.
    pub labels: std::collections::HashMap<String, String>,
    /// Annotations copied from the rule at creation time.
    pub annotations: std::collections::HashMap<String, String>,
    /// When the alert last entered the firing state; `None` until it first fires.
    pub started_at: Option<chrono::DateTime<chrono::Utc>>,
    /// When the alert was created or last changed through one of its state methods.
    pub updated_at: chrono::DateTime<chrono::Utc>,
    /// Optional numeric value; `None` for a freshly created alert.
    // NOTE(review): presumably the measured value that triggered the alert — nothing in
    // this file writes it.
    pub value: Option<f64>,
}
impl Alert {
    /// Creates a pending alert for `rule`.
    ///
    /// The rule's name, severity, labels and annotations are copied. `started_at` and
    /// `value` start as `None`, and `updated_at` is set to the current UTC time.
    pub fn new(rule: &AlertRule) -> Self {
        Self {
            rule_name: rule.name.clone(),
            state: AlertState::Pending,
            severity: rule.severity,
            labels: rule.labels.clone(),
            annotations: rule.annotations.clone(),
            started_at: None,
            updated_at: chrono::Utc::now(),
            value: None,
        }
    }

    /// Moves the alert to [`AlertState::Firing`].
    ///
    /// When the alert was not already firing, `started_at` is set to the same instant as
    /// `updated_at`. When it was already firing, only `updated_at` is refreshed, so the
    /// original start time is kept.
    // NOTE(review): firing an `Acknowledged` alert also takes the transition branch, which
    // drops the acknowledgement and restarts `started_at`. Confirm that is intended if the
    // evaluator calls `fire` on every cycle.
    pub fn fire(&mut self) {
        // Read the clock once so the transition has a single, consistent timestamp.
        let now = chrono::Utc::now();
        if self.state != AlertState::Firing {
            self.state = AlertState::Firing;
            self.started_at = Some(now);
        }
        self.updated_at = now;
    }

    /// Moves the alert to [`AlertState::Resolved`] from any state and refreshes
    /// `updated_at`. `started_at` is left untouched.
    pub fn resolve(&mut self) {
        self.state = AlertState::Resolved;
        self.updated_at = chrono::Utc::now();
    }

    /// Moves the alert to [`AlertState::Acknowledged`] from any state and refreshes
    /// `updated_at`. `started_at` is left untouched.
    pub fn acknowledge(&mut self) {
        self.state = AlertState::Acknowledged;
        self.updated_at = chrono::Utc::now();
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// The builder chain keeps the supplied name and severity.
    #[test]
    fn test_alert_rule() {
        let built = AlertRule::new("test_rule")
            .description("Test rule")
            .condition("error_rate > 0.01")
            .severity(AlertSeverity::Warning)
            .for_duration(Duration::from_secs(60));

        assert_eq!(built.name, "test_rule");
        assert_eq!(built.severity, AlertSeverity::Warning);
    }

    /// Predefined rules carry their expected severities.
    #[test]
    fn test_predefined_rules() {
        let expectations = [
            (AlertRule::kernel_unhealthy(), AlertSeverity::Critical),
            (AlertRule::high_latency(), AlertSeverity::Warning),
        ];

        for (rule, expected) in expectations {
            assert_eq!(rule.severity, expected);
        }
    }

    /// Adding the default rules leaves the configuration with at least one rule.
    #[test]
    fn test_alert_config() {
        let with_defaults = AlertConfig::default().with_default_rules();

        assert!(!with_defaults.rules.is_empty());
    }

    /// An alert walks through pending, firing, acknowledged and resolved.
    #[test]
    fn test_alert_state() {
        let source_rule = AlertRule::kernel_unhealthy();
        let mut tracked = Alert::new(&source_rule);
        assert_eq!(tracked.state, AlertState::Pending);

        tracked.fire();
        assert_eq!(tracked.state, AlertState::Firing);
        assert!(tracked.started_at.is_some());

        tracked.acknowledge();
        assert_eq!(tracked.state, AlertState::Acknowledged);

        tracked.resolve();
        assert_eq!(tracked.state, AlertState::Resolved);
    }

    /// The convenience constructors keep the receiver name they were given.
    #[test]
    fn test_receivers() {
        let slack_receiver = AlertReceiver::slack("slack-ops", "https://hooks.slack.com/xxx");
        assert_eq!(slack_receiver.name, "slack-ops");

        let pager_receiver = AlertReceiver::pagerduty("pagerduty-ops", "service-key");
        assert_eq!(pager_receiver.name, "pagerduty-ops");
    }
}