ai_lib/error_handling/
monitoring.rs

1//! Error monitoring and alerting
2
3use crate::error_handling::ErrorContext;
4use crate::metrics::Metrics;
5use crate::types::AiLibError;
6use serde::{Deserialize, Serialize};
7use std::sync::Arc;
8use std::time::Duration;
9
10/// Error monitoring configuration
11#[derive(Debug, Clone, Serialize, Deserialize)]
12pub struct ErrorThresholds {
13    /// Maximum error rate (errors per second)
14    pub error_rate_threshold: f64,
15    /// Maximum consecutive errors before alerting
16    pub consecutive_errors: u32,
17    /// Time window for error rate calculation
18    pub time_window: Duration,
19}
20
21impl Default for ErrorThresholds {
22    fn default() -> Self {
23        Self {
24            error_rate_threshold: 0.1, // 10% error rate
25            consecutive_errors: 5,
26            time_window: Duration::from_secs(60),
27        }
28    }
29}
30
31/// Error monitor for tracking and alerting
32pub struct ErrorMonitor {
33    metrics: Arc<dyn Metrics>,
34    #[allow(dead_code)] // Reserved for future alerting functionality
35    alert_thresholds: ErrorThresholds,
36}
37
38impl ErrorMonitor {
39    /// Create a new error monitor
40    pub fn new(metrics: Arc<dyn Metrics>, alert_thresholds: ErrorThresholds) -> Self {
41        Self {
42            metrics,
43            alert_thresholds,
44        }
45    }
46
47    /// Record an error and check for alerts
48    pub async fn record_error(&self, error: &AiLibError, context: &ErrorContext) {
49        // Record error metrics
50        self.metrics.incr_counter("errors.total", 1).await;
51        self.metrics
52            .incr_counter(&format!("errors.{}", self.error_type_name(error)), 1)
53            .await;
54
55        // Check if we should send an alert
56        if self.should_alert(error, context).await {
57            self.send_alert(error, context).await;
58        }
59    }
60
61    /// Check if an alert should be sent
62    async fn should_alert(&self, error: &AiLibError, _context: &ErrorContext) -> bool {
63        // This is a simplified implementation
64        // In a real system, you would check error rates, consecutive errors, etc.
65        matches!(
66            error,
67            AiLibError::RateLimitExceeded(_) | AiLibError::ProviderError(_)
68        )
69    }
70
71    /// Send an alert (placeholder implementation)
72    async fn send_alert(&self, error: &AiLibError, context: &ErrorContext) {
73        // In a real implementation, this would send alerts via email, Slack, etc.
74        // Error monitoring: In production, this would send alerts via email, Slack, etc.
75        // For now, we just log the error context
76        let _ = (error, context);
77    }
78
79    /// Get error type name for metrics
80    fn error_type_name(&self, error: &AiLibError) -> String {
81        match error {
82            AiLibError::RateLimitExceeded(_) => "rate_limit".to_string(),
83            AiLibError::NetworkError(_) => "network".to_string(),
84            AiLibError::AuthenticationError(_) => "authentication".to_string(),
85            AiLibError::ProviderError(_) => "provider".to_string(),
86            AiLibError::TimeoutError(_) => "timeout".to_string(),
87            _ => "unknown".to_string(),
88        }
89    }
90}