use anyhow::{anyhow, Result};
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::sync::{Arc, RwLock};
use crate::monitoring_metrics::{MetricsCollector, PerformanceMetrics};
/// Destination for [`Alert`]s.
///
/// Each implementation decides what "handling" means. The two implementations
/// visible in this file print the alert to stdout (`ConsoleAlertHandler`) or
/// log the delivery it would make (`SlackAlertHandler`).
pub trait AlertHandler {
/// Takes ownership of `alert` and processes it.
///
/// The `Result` lets an implementation report a failure; both
/// implementations visible in this file always return `Ok(())`.
fn handle_alert(&self, alert: Alert) -> Result<()>;
}
/// One alert notification, passed by value to an [`AlertHandler`].
#[derive(Debug, Clone)]
pub struct Alert {
/// Category of the condition being reported.
pub alert_type: AlertType,
/// Human-readable description; this is the text the handlers in this file
/// print or log.
pub message: String,
/// How serious the alert is.
pub severity: AlertSeverity,
/// UTC time attached to the alert.
pub timestamp: DateTime<Utc>,
/// Named numeric values attached to the alert.
/// NOTE(review): presumably the measurements that triggered it — confirm
/// against the code that constructs alerts.
pub metrics: HashMap<String, f64>,
}
/// Category of condition an [`Alert`] reports.
///
/// `PartialEq`, `Eq` and `Hash` are derived so alert kinds can be compared
/// and used as `HashMap`/`HashSet` keys (for example to group or de-duplicate
/// alerts), matching how `HealthStatus` in this file is already comparable.
///
/// The variants are not described in this file beyond their names.
/// NOTE(review): the first four look like they pair with the limits in
/// `AlertThresholds` — confirm against the code that raises alerts.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum AlertType {
    HighLatency,
    LowThroughput,
    HighErrorRate,
    LowCacheHitRate,
    QualityDrift,
    PerformanceDrift,
    ResourceExhaustion,
    SystemFailure,
}
/// Severity level of an [`Alert`].
///
/// Variants are declared from least to most severe. The derived
/// `PartialOrd`/`Ord` follow declaration order, so
/// `AlertSeverity::Critical > AlertSeverity::Warning` holds and handlers can
/// filter with comparisons such as `severity >= AlertSeverity::Critical`.
/// Reordering the variants would change those comparisons.
///
/// The `Debug` name of each variant is what `ConsoleAlertHandler` upper-cases
/// and prints, so renaming a variant changes that output.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum AlertSeverity {
    Info,
    Warning,
    Critical,
    Emergency,
}
/// Limits used to decide when an alert should be raised.
///
/// Units follow the field-name suffixes (`_ms`, `_rps`, `_mb`).
/// NOTE(review): the rate fields look like fractions in `0.0..=1.0` rather
/// than percentages (the defaults are `0.05` and `0.8`) — confirm against the
/// code that compares against them, which is not visible here.
#[derive(Debug, Clone)]
pub struct AlertThresholds {
    /// Upper limit for P95 latency, in milliseconds.
    pub max_p95_latency_ms: f64,
    /// Lower limit for throughput, in requests per second.
    pub min_throughput_rps: f64,
    /// Upper limit for the error rate.
    pub max_error_rate: f64,
    /// Lower limit for the cache hit rate.
    pub min_cache_hit_rate: f64,
    /// Upper limit for quality drift.
    pub max_quality_drift: f64,
    /// Upper limit for memory usage, in megabytes.
    pub max_memory_usage_mb: f64,
    /// Upper limit for GPU memory usage, in megabytes.
    pub max_gpu_memory_mb: f64,
}

impl Default for AlertThresholds {
    /// Returns the built-in limits: 500 ms P95 latency, 100 rps throughput,
    /// 0.05 error rate, 0.8 cache hit rate, 0.1 quality drift, 4096 MB memory
    /// and 8192 MB GPU memory.
    fn default() -> Self {
        // Latency and throughput.
        let max_p95_latency_ms = 500.0;
        let min_throughput_rps = 100.0;

        // Rates and drift.
        let max_error_rate = 0.05;
        let min_cache_hit_rate = 0.8;
        let max_quality_drift = 0.1;

        // Memory ceilings.
        let max_memory_usage_mb = 4096.0;
        let max_gpu_memory_mb = 8192.0;

        AlertThresholds {
            max_p95_latency_ms,
            min_throughput_rps,
            max_error_rate,
            min_cache_hit_rate,
            max_quality_drift,
            max_memory_usage_mb,
            max_gpu_memory_mb,
        }
    }
}
pub struct ConsoleAlertHandler;
impl AlertHandler for ConsoleAlertHandler {
fn handle_alert(&self, alert: Alert) -> Result<()> {
println!(
"ALERT [{}]: {} - {}",
format!("{:?}", alert.severity).to_uppercase(),
alert.message,
alert.timestamp.format("%Y-%m-%d %H:%M:%S UTC")
);
Ok(())
}
}
pub struct SlackAlertHandler {
pub webhook_url: String,
}
impl AlertHandler for SlackAlertHandler {
fn handle_alert(&self, alert: Alert) -> Result<()> {
tracing::info!(
"Would send Slack alert to {}: {}",
self.webhook_url,
alert.message
);
Ok(())
}
}
/// Coarse health level, used both for individual components and for the
/// overall result of a check.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub enum HealthStatus {
/// No problem detected.
Healthy,
/// Still serving, but a threshold was exceeded. `HealthChecker::check_health`
/// uses this when P95 latency, error rate or memory usage is over its limit.
Degraded,
/// Not able to serve. `HealthChecker` uses this when models are not loaded.
Unhealthy,
}
/// Outcome of one health, readiness or liveness check.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealthCheckResult {
/// Overall status for the whole check.
pub status: HealthStatus,
/// UTC time at which the result was assembled.
pub timestamp: DateTime<Utc>,
/// Per-component findings keyed by component name. `HealthChecker` uses the
/// keys "service", "models", "cache", "latency", "errors" and "memory".
pub components: HashMap<String, ComponentHealth>,
/// Free-form extra information. Every `HealthChecker` method in this file
/// leaves it empty.
pub details: HashMap<String, String>,
}
/// Health of a single component inside a [`HealthCheckResult`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ComponentHealth {
/// Status of this component alone.
pub status: HealthStatus,
/// Human-readable summary, e.g. "Models not loaded".
pub message: String,
/// UTC time at which this component was evaluated.
pub last_check: DateTime<Utc>,
/// Numeric values backing the status (for example "p95" or "error_rate");
/// may be empty.
pub metrics: HashMap<String, f64>,
}
/// Produces liveness, readiness and full health reports for the service.
pub struct HealthChecker {
/// Whether models are loaded. Starts as `false`; changed through
/// `set_models_loaded` and read by the readiness and health checks.
models_loaded: Arc<RwLock<bool>>,
/// Time of the most recent `update_last_request_time` call (initially the
/// construction time). No method visible in this file reads it.
last_request_time: Arc<RwLock<DateTime<Utc>>>,
/// Error rate (errors divided by requests) above which `check_health`
/// reports `Degraded`.
error_rate_threshold: f64,
/// P95 latency in milliseconds above which `check_health` reports `Degraded`.
latency_threshold_ms: f64,
/// Memory usage in megabytes above which `check_health` reports `Degraded`.
memory_threshold_mb: f64,
/// Collector queried for the cache hit rate and the Prometheus export.
metrics: Arc<MetricsCollector>,
}
impl HealthChecker {
pub fn new(metrics: Arc<MetricsCollector>) -> Self {
Self {
models_loaded: Arc::new(RwLock::new(false)),
last_request_time: Arc::new(RwLock::new(Utc::now())),
error_rate_threshold: 0.1, latency_threshold_ms: 1000.0, memory_threshold_mb: 8192.0, metrics,
}
}
pub fn set_models_loaded(&self, loaded: bool) -> Result<()> {
let mut status = self
.models_loaded
.write()
.map_err(|e| anyhow!("Failed to write lock: {}", e))?;
*status = loaded;
Ok(())
}
pub fn update_last_request_time(&self) -> Result<()> {
let mut time = self
.last_request_time
.write()
.map_err(|e| anyhow!("Failed to write lock: {}", e))?;
*time = Utc::now();
Ok(())
}
pub fn check_liveness(&self) -> HealthCheckResult {
let mut components = HashMap::new();
components.insert(
"service".to_string(),
ComponentHealth {
status: HealthStatus::Healthy,
message: "Service is running".to_string(),
last_check: Utc::now(),
metrics: HashMap::new(),
},
);
HealthCheckResult {
status: HealthStatus::Healthy,
timestamp: Utc::now(),
components,
details: HashMap::new(),
}
}
pub fn check_readiness(&self) -> HealthCheckResult {
let mut components = HashMap::new();
let mut overall_status = HealthStatus::Healthy;
let models_loaded = self.models_loaded.read().map(|g| *g).unwrap_or(false);
if !models_loaded {
overall_status = HealthStatus::Unhealthy;
components.insert(
"models".to_string(),
ComponentHealth {
status: HealthStatus::Unhealthy,
message: "Models not loaded".to_string(),
last_check: Utc::now(),
metrics: HashMap::new(),
},
);
} else {
components.insert(
"models".to_string(),
ComponentHealth {
status: HealthStatus::Healthy,
message: "Models loaded and ready".to_string(),
last_check: Utc::now(),
metrics: HashMap::new(),
},
);
}
let cache_hit_rate = self.metrics.get_cache_hit_rate();
components.insert(
"cache".to_string(),
ComponentHealth {
status: HealthStatus::Healthy,
message: format!("Cache hit rate: {:.2}%", cache_hit_rate * 100.0),
last_check: Utc::now(),
metrics: [("hit_rate".to_string(), cache_hit_rate)]
.into_iter()
.collect(),
},
);
HealthCheckResult {
status: overall_status,
timestamp: Utc::now(),
components,
details: HashMap::new(),
}
}
pub fn check_health(&self, performance_metrics: &PerformanceMetrics) -> HealthCheckResult {
let mut components = HashMap::new();
let mut overall_status = HealthStatus::Healthy;
let models_loaded = self.models_loaded.read().map(|g| *g).unwrap_or(false);
if !models_loaded {
overall_status = HealthStatus::Unhealthy;
components.insert(
"models".to_string(),
ComponentHealth {
status: HealthStatus::Unhealthy,
message: "Models not loaded".to_string(),
last_check: Utc::now(),
metrics: HashMap::new(),
},
);
} else {
components.insert(
"models".to_string(),
ComponentHealth {
status: HealthStatus::Healthy,
message: "Models operational".to_string(),
last_check: Utc::now(),
metrics: HashMap::new(),
},
);
}
let latency_status =
if performance_metrics.latency.p95_latency_ms > self.latency_threshold_ms {
if overall_status == HealthStatus::Healthy {
overall_status = HealthStatus::Degraded;
}
HealthStatus::Degraded
} else {
HealthStatus::Healthy
};
components.insert(
"latency".to_string(),
ComponentHealth {
status: latency_status,
message: format!(
"P95 latency: {:.2}ms",
performance_metrics.latency.p95_latency_ms
),
last_check: Utc::now(),
metrics: [
(
"p50".to_string(),
performance_metrics.latency.p50_latency_ms,
),
(
"p95".to_string(),
performance_metrics.latency.p95_latency_ms,
),
(
"p99".to_string(),
performance_metrics.latency.p99_latency_ms,
),
]
.into_iter()
.collect(),
},
);
let error_rate = if performance_metrics.throughput.total_requests > 0 {
performance_metrics.errors.total_errors as f64
/ performance_metrics.throughput.total_requests as f64
} else {
0.0
};
let error_status = if error_rate > self.error_rate_threshold {
if overall_status == HealthStatus::Healthy {
overall_status = HealthStatus::Degraded;
}
HealthStatus::Degraded
} else {
HealthStatus::Healthy
};
components.insert(
"errors".to_string(),
ComponentHealth {
status: error_status,
message: format!("Error rate: {:.2}%", error_rate * 100.0),
last_check: Utc::now(),
metrics: [("error_rate".to_string(), error_rate)]
.into_iter()
.collect(),
},
);
let memory_status =
if performance_metrics.resources.memory_usage_mb > self.memory_threshold_mb {
if overall_status == HealthStatus::Healthy {
overall_status = HealthStatus::Degraded;
}
HealthStatus::Degraded
} else {
HealthStatus::Healthy
};
components.insert(
"memory".to_string(),
ComponentHealth {
status: memory_status,
message: format!(
"Memory usage: {:.2}MB / {:.2}MB",
performance_metrics.resources.memory_usage_mb, self.memory_threshold_mb
),
last_check: Utc::now(),
metrics: [
(
"usage_mb".to_string(),
performance_metrics.resources.memory_usage_mb,
),
("threshold_mb".to_string(), self.memory_threshold_mb),
]
.into_iter()
.collect(),
},
);
let cache_hit_rate = self.metrics.get_cache_hit_rate();
components.insert(
"cache".to_string(),
ComponentHealth {
status: HealthStatus::Healthy,
message: format!("Cache hit rate: {:.2}%", cache_hit_rate * 100.0),
last_check: Utc::now(),
metrics: [("hit_rate".to_string(), cache_hit_rate)]
.into_iter()
.collect(),
},
);
HealthCheckResult {
status: overall_status,
timestamp: Utc::now(),
components,
details: HashMap::new(),
}
}
pub fn get_metrics_endpoint(&self) -> Result<String> {
self.metrics.export_prometheus()
}
}