kotoba_monitoring/
lib.rs

1//! # KotobaDB Monitoring & Metrics
2//!
3//! Comprehensive monitoring and metrics collection system for KotobaDB.
4//! Provides health checks, performance monitoring, and Prometheus integration.
5
6pub mod metrics_collector;
7pub mod health_checker;
8
9pub use metrics_collector::*;
10pub use health_checker::*;
11
12use std::collections::HashMap;
13use std::sync::Arc;
14use std::time::{Duration, Instant};
15use serde::{Deserialize, Serialize};
16use tokio::sync::RwLock;
17use chrono::{DateTime, Utc};
18
19/// Monitoring configuration
20#[derive(Debug, Clone, Serialize, Deserialize)]
21pub struct MonitoringConfig {
22    /// Enable metrics collection
23    pub enable_metrics: bool,
24    /// Enable health checks
25    pub enable_health_checks: bool,
26    /// Metrics collection interval
27    pub collection_interval: Duration,
28    /// Health check interval
29    pub health_check_interval: Duration,
30    /// Metrics retention period
31    pub retention_period: Duration,
32    /// Maximum stored metrics points
33    pub max_metrics_points: usize,
34    /// Prometheus export configuration
35    pub prometheus_config: PrometheusConfig,
36    /// Alerting configuration
37    pub alerting_config: AlertingConfig,
38}
39
40impl Default for MonitoringConfig {
41    fn default() -> Self {
42        Self {
43            enable_metrics: true,
44            enable_health_checks: true,
45            collection_interval: Duration::from_secs(15),
46            health_check_interval: Duration::from_secs(30),
47            retention_period: Duration::from_secs(3600), // 1 hour
48            max_metrics_points: 10000,
49            prometheus_config: PrometheusConfig::default(),
50            alerting_config: AlertingConfig::default(),
51        }
52    }
53}
54
55/// Prometheus export configuration
56#[derive(Debug, Clone, Serialize, Deserialize)]
57pub struct PrometheusConfig {
58    /// Enable Prometheus export
59    pub enabled: bool,
60    /// HTTP server address
61    pub address: String,
62    /// HTTP server port
63    pub port: u16,
64    /// Metrics path
65    pub path: String,
66    /// Global labels
67    pub global_labels: HashMap<String, String>,
68}
69
70impl Default for PrometheusConfig {
71    fn default() -> Self {
72        Self {
73            enabled: true,
74            address: "127.0.0.1".to_string(),
75            port: 9090,
76            path: "/metrics".to_string(),
77            global_labels: HashMap::new(),
78        }
79    }
80}
81
82/// Alerting configuration
83#[derive(Debug, Clone, Serialize, Deserialize)]
84pub struct AlertingConfig {
85    /// Enable alerting
86    pub enabled: bool,
87    /// Alert rules
88    pub rules: Vec<AlertRule>,
89    /// Notification endpoints
90    pub notifications: Vec<NotificationConfig>,
91}
92
93impl Default for AlertingConfig {
94    fn default() -> Self {
95        Self {
96            enabled: false,
97            rules: Vec::new(),
98            notifications: Vec::new(),
99        }
100    }
101}
102
103/// Alert rule definition
104#[derive(Debug, Clone, Serialize, Deserialize)]
105pub struct AlertRule {
106    /// Rule name
107    pub name: String,
108    /// Rule description
109    pub description: String,
110    /// Metric query expression
111    pub query: String,
112    /// Alert threshold
113    pub threshold: AlertThreshold,
114    /// Evaluation interval
115    pub evaluation_interval: Duration,
116    /// Alert severity
117    pub severity: AlertSeverity,
118    /// Labels for the alert
119    pub labels: HashMap<String, String>,
120}
121
122/// Alert threshold
123#[derive(Debug, Clone, Serialize, Deserialize)]
124pub enum AlertThreshold {
125    /// Greater than threshold
126    GreaterThan(f64),
127    /// Less than threshold
128    LessThan(f64),
129    /// Equal to threshold
130    Equal(f64),
131    /// Not equal to threshold
132    NotEqual(f64),
133}
134
135/// Alert severity levels
136#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
137pub enum AlertSeverity {
138    Info,
139    Warning,
140    Error,
141    Critical,
142}
143
144/// Notification configuration
145#[derive(Debug, Clone, Serialize, Deserialize)]
146pub struct NotificationConfig {
147    /// Notification type
148    pub notification_type: NotificationType,
149    /// Configuration specific to the notification type
150    pub config: HashMap<String, String>,
151}
152
153/// Notification types
154#[derive(Debug, Clone, Serialize, Deserialize)]
155pub enum NotificationType {
156    Email,
157    Slack,
158    Webhook,
159    PagerDuty,
160}
161
162/// Metric data point
163#[derive(Debug, Clone)]
164pub struct MetricPoint {
165    /// Metric name
166    pub name: String,
167    /// Metric value
168    pub value: f64,
169    /// Timestamp
170    pub timestamp: DateTime<Utc>,
171    /// Labels
172    pub labels: HashMap<String, String>,
173}
174
175/// Health check result
176#[derive(Debug, Clone)]
177pub struct HealthCheckResult {
178    /// Check name
179    pub name: String,
180    /// Check status
181    pub status: HealthStatus,
182    /// Check message
183    pub message: String,
184    /// Check duration
185    pub duration: Duration,
186    /// Additional details
187    pub details: HashMap<String, String>,
188}
189
/// Health status of a single check or of the system as a whole.
///
/// Statuses are totally ordered from worst to best:
/// `Unknown < Unhealthy < Degraded < Healthy`, so a "better" status compares
/// greater and `Iterator::min` over a set of statuses yields the worst one.
/// The ordering is implemented by hand rather than derived because a derived
/// ordering would follow declaration order and rank `Healthy` lowest.
///
/// The [`Default`] value is [`HealthStatus::Unknown`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum HealthStatus {
    /// Highest-ranked status.
    Healthy,
    /// Ranked between `Unhealthy` and `Healthy`.
    Degraded,
    /// Ranked between `Unknown` and `Degraded`.
    Unhealthy,
    /// Lowest-ranked status; also the default.
    Unknown,
}

impl HealthStatus {
    /// Numeric rank backing the ordering; a larger rank means healthier.
    const fn health_rank(self) -> u8 {
        match self {
            HealthStatus::Unknown => 0,
            HealthStatus::Unhealthy => 1,
            HealthStatus::Degraded => 2,
            HealthStatus::Healthy => 3,
        }
    }
}

impl Ord for HealthStatus {
    /// Compares two statuses by rank (`Unknown` lowest, `Healthy` highest).
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        self.health_rank().cmp(&other.health_rank())
    }
}

impl PartialOrd for HealthStatus {
    /// Delegates to [`Ord::cmp`] so both orderings always agree.
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}

impl Default for HealthStatus {
    /// Returns [`HealthStatus::Unknown`].
    fn default() -> Self {
        HealthStatus::Unknown
    }
}
204
205/// Overall system health
206#[derive(Debug, Clone, Default)]
207pub struct SystemHealth {
208    /// Overall status
209    pub overall_status: HealthStatus,
210    /// Individual check results
211    pub checks: Vec<HealthCheckResult>,
212    /// System uptime
213    pub uptime: Duration,
214    /// Last health check time
215    pub last_check: DateTime<Utc>,
216}
217
218/// Performance metrics
219#[derive(Debug, Clone)]
220pub struct PerformanceMetrics {
221    /// Query performance
222    pub query_metrics: QueryMetrics,
223    /// Storage performance
224    pub storage_metrics: StorageMetrics,
225    /// System performance
226    pub system_metrics: SystemMetrics,
227    /// Timestamp
228    pub timestamp: DateTime<Utc>,
229}
230
/// Query performance figures.
///
/// A plain data carrier. How the figures are computed (counting window,
/// what counts as "slow") is not defined in this file.
#[derive(Debug, Clone)]
pub struct QueryMetrics {
    /// Total number of queries.
    pub total_queries: u64,
    /// Query rate, in queries per second.
    pub queries_per_second: f64,
    /// Average query latency, in milliseconds.
    pub avg_query_latency_ms: f64,
    /// p95 query latency, in milliseconds.
    pub p95_query_latency_ms: f64,
    /// p99 query latency, in milliseconds.
    pub p99_query_latency_ms: f64,
    /// Number of slow queries.
    pub slow_queries: u64,
    /// Number of failed queries.
    pub failed_queries: u64,
}
242
/// Storage performance figures.
///
/// A plain data carrier; how the figures are measured is not defined in
/// this file.
#[derive(Debug, Clone)]
pub struct StorageMetrics {
    /// Total size, in bytes.
    pub total_size_bytes: u64,
    /// Used size, in bytes.
    pub used_size_bytes: u64,
    /// Number of read operations.
    pub read_operations: u64,
    /// Number of write operations.
    pub write_operations: u64,
    /// Read throughput, in bytes per second.
    pub read_bytes_per_sec: f64,
    /// Write throughput, in bytes per second.
    pub write_bytes_per_sec: f64,
    /// Cache hit rate (the scale, fraction or percentage, is not established
    /// in this file).
    pub cache_hit_rate: f64,
    /// I/O latency, in milliseconds.
    pub io_latency_ms: f64,
}
255
/// System-level performance figures.
///
/// A plain data carrier; how the figures are sampled is not defined in this
/// file.
#[derive(Debug, Clone)]
pub struct SystemMetrics {
    /// CPU usage, as a percentage.
    pub cpu_usage_percent: f64,
    /// Memory usage, in bytes.
    pub memory_usage_bytes: u64,
    /// Memory usage, as a percentage.
    pub memory_usage_percent: f64,
    /// Disk usage, in bytes.
    pub disk_usage_bytes: u64,
    /// Disk usage, as a percentage.
    pub disk_usage_percent: f64,
    /// Network bytes received.
    pub network_rx_bytes: u64,
    /// Network bytes transmitted.
    pub network_tx_bytes: u64,
}
267
268/// Monitoring errors
269#[derive(Debug, thiserror::Error)]
270pub enum MonitoringError {
271    #[error("Metrics collection error: {0}")]
272    MetricsCollection(String),
273
274    #[error("Health check error: {0}")]
275    HealthCheck(String),
276
277    #[error("Prometheus export error: {0}")]
278    PrometheusExport(String),
279
280    #[error("Configuration error: {0}")]
281    Configuration(String),
282
283    #[error("Storage error: {0}")]
284    Storage(String),
285
286    #[error("HTTP server error: {0}")]
287    HttpServer(String),
288}
289
#[cfg(test)]
mod tests {
    use super::*;

    /// The default monitoring configuration enables metrics and health
    /// checks and collects metrics every 15 seconds.
    #[test]
    fn test_monitoring_config_default() {
        let MonitoringConfig {
            enable_metrics,
            enable_health_checks,
            collection_interval,
            ..
        } = MonitoringConfig::default();

        assert!(enable_metrics);
        assert!(enable_health_checks);
        assert_eq!(collection_interval, Duration::from_secs(15));
    }

    /// The default Prometheus configuration is enabled and serves
    /// `/metrics` on port 9090.
    #[test]
    fn test_prometheus_config_default() {
        let defaults = PrometheusConfig::default();

        assert!(defaults.enabled);
        assert_eq!(defaults.port, 9090);
        assert_eq!(defaults.path, "/metrics");
    }

    /// Each status must compare greater than the one listed after it.
    /// Relies on `HealthStatus` supporting `>` comparisons.
    #[test]
    fn test_health_status_ordering() {
        let best_to_worst = [
            HealthStatus::Healthy,
            HealthStatus::Degraded,
            HealthStatus::Unhealthy,
            HealthStatus::Unknown,
        ];

        for pair in best_to_worst.windows(2) {
            assert!(pair[0] > pair[1]);
        }
    }

    /// A metric point keeps the name, value, and labels it was built with.
    #[test]
    fn test_metric_point_creation() {
        let labels: HashMap<String, String> =
            std::iter::once(("service".to_string(), "kotoba-db".to_string())).collect();

        let sample = MetricPoint {
            name: String::from("query_latency"),
            value: 15.5,
            timestamp: Utc::now(),
            labels,
        };

        assert_eq!(sample.name, "query_latency");
        assert_eq!(sample.value, 15.5);
        assert!(sample.labels.contains_key("service"));
    }
}
331        assert!(point.labels.contains_key("service"));
332    }
333}