pulseengine_mcp_logging/
metrics.rs

1//! Metrics collection for observability and monitoring
2//!
3//! This module provides comprehensive metrics collection for:
4//! - Request performance monitoring
5//! - System health metrics
6//! - Business logic metrics
7//! - Error tracking and classification
8
9use serde::{Deserialize, Serialize};
10use std::collections::HashMap;
11use std::sync::Arc;
12use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
13use tokio::sync::RwLock;
14
15/// Global metrics collector instance
16pub struct MetricsCollector {
17    /// Request metrics
18    request_metrics: Arc<RwLock<RequestMetrics>>,
19
20    /// System health metrics
21    health_metrics: Arc<RwLock<HealthMetrics>>,
22
23    /// Business metrics
24    business_metrics: Arc<RwLock<BusinessMetrics>>,
25
26    /// Error metrics
27    error_metrics: Arc<RwLock<ErrorMetrics>>,
28
29    /// Start time for uptime calculation
30    start_time: Instant,
31}
32
33/// Request performance metrics
34#[derive(Debug, Clone, Serialize, Deserialize)]
35pub struct RequestMetrics {
36    /// Total number of requests
37    pub total_requests: u64,
38
39    /// Successful requests
40    pub successful_requests: u64,
41
42    /// Failed requests
43    pub failed_requests: u64,
44
45    /// Average response time in milliseconds
46    pub avg_response_time_ms: f64,
47
48    /// 95th percentile response time
49    pub p95_response_time_ms: f64,
50
51    /// 99th percentile response time
52    pub p99_response_time_ms: f64,
53
54    /// Current active requests
55    pub active_requests: u64,
56
57    /// Requests per tool
58    pub requests_by_tool: HashMap<String, u64>,
59
60    /// Response times by tool
61    pub response_times_by_tool: HashMap<String, Vec<f64>>,
62
63    /// Rate limiting hits
64    pub rate_limit_hits: u64,
65
66    /// Request throughput (requests per second)
67    pub requests_per_second: f64,
68
69    /// Last update timestamp
70    pub last_updated: u64,
71}
72
73/// System health metrics
74#[derive(Debug, Clone, Serialize, Deserialize)]
75pub struct HealthMetrics {
76    /// CPU usage percentage
77    pub cpu_usage_percent: Option<f64>,
78
79    /// Memory usage in MB
80    pub memory_usage_mb: Option<f64>,
81
82    /// Memory usage percentage
83    pub memory_usage_percent: Option<f64>,
84
85    /// Disk usage percentage
86    pub disk_usage_percent: Option<f64>,
87
88    /// Network latency to Loxone server
89    pub loxone_latency_ms: Option<f64>,
90
91    /// Connection pool statistics
92    pub connection_pool_active: Option<u32>,
93    pub connection_pool_idle: Option<u32>,
94    pub connection_pool_max: Option<u32>,
95
96    /// System uptime in seconds
97    pub uptime_seconds: u64,
98
99    /// Garbage collection metrics (if available)
100    pub gc_collections: Option<u64>,
101    pub gc_time_ms: Option<f64>,
102
103    /// Last health check status
104    pub last_health_check_success: bool,
105    pub last_health_check_time: u64,
106}
107
108/// Business logic metrics specific to Loxone MCP
109#[derive(Debug, Clone, Serialize, Deserialize)]
110pub struct BusinessMetrics {
111    /// Device operations
112    pub device_operations_total: u64,
113    pub device_operations_success: u64,
114    pub device_operations_failed: u64,
115
116    /// Operations by device type
117    pub operations_by_device_type: HashMap<String, u64>,
118
119    /// Operations by room
120    pub operations_by_room: HashMap<String, u64>,
121
122    /// Loxone API calls
123    pub loxone_api_calls_total: u64,
124    pub loxone_api_calls_success: u64,
125    pub loxone_api_calls_failed: u64,
126
127    /// Structure refresh operations
128    pub structure_refreshes: u64,
129    pub last_structure_refresh: u64,
130
131    /// Authentication operations
132    pub auth_attempts: u64,
133    pub auth_successes: u64,
134    pub auth_failures: u64,
135
136    /// Cache hit/miss ratios
137    pub cache_hits: u64,
138    pub cache_misses: u64,
139
140    /// Schema validation metrics
141    pub schema_validations_total: u64,
142    pub schema_validations_failed: u64,
143
144    /// Request coalescing effectiveness
145    pub coalesced_requests: u64,
146    pub coalescing_time_saved_ms: f64,
147}
148
149/// Error classification and tracking
150#[derive(Debug, Clone, Serialize, Deserialize)]
151pub struct ErrorMetrics {
152    /// Total errors
153    pub total_errors: u64,
154
155    /// Errors by classification
156    pub client_errors: u64,
157    pub server_errors: u64,
158    pub network_errors: u64,
159    pub auth_errors: u64,
160    pub business_errors: u64,
161
162    /// Errors by tool
163    pub errors_by_tool: HashMap<String, u64>,
164
165    /// Error rates
166    pub error_rate_5min: f64,
167    pub error_rate_1hour: f64,
168    pub error_rate_24hour: f64,
169
170    /// Most recent errors (last 10)
171    pub recent_errors: Vec<ErrorRecord>,
172
173    /// Error patterns
174    pub timeout_errors: u64,
175    pub connection_errors: u64,
176    pub validation_errors: u64,
177    pub device_control_errors: u64,
178}
179
180/// Individual error record for tracking
181#[derive(Debug, Clone, Serialize, Deserialize)]
182pub struct ErrorRecord {
183    pub timestamp: u64,
184    pub error_type: String,
185    pub error_message: String,
186    pub tool_name: String,
187    pub request_id: String,
188    pub duration_ms: u64,
189}
190
191impl Default for RequestMetrics {
192    fn default() -> Self {
193        Self {
194            total_requests: 0,
195            successful_requests: 0,
196            failed_requests: 0,
197            avg_response_time_ms: 0.0,
198            p95_response_time_ms: 0.0,
199            p99_response_time_ms: 0.0,
200            active_requests: 0,
201            requests_by_tool: HashMap::new(),
202            response_times_by_tool: HashMap::new(),
203            rate_limit_hits: 0,
204            requests_per_second: 0.0,
205            last_updated: current_timestamp(),
206        }
207    }
208}
209
210impl Default for HealthMetrics {
211    fn default() -> Self {
212        Self {
213            cpu_usage_percent: None,
214            memory_usage_mb: None,
215            memory_usage_percent: None,
216            disk_usage_percent: None,
217            loxone_latency_ms: None,
218            connection_pool_active: None,
219            connection_pool_idle: None,
220            connection_pool_max: None,
221            uptime_seconds: 0,
222            gc_collections: None,
223            gc_time_ms: None,
224            last_health_check_success: false,
225            last_health_check_time: current_timestamp(),
226        }
227    }
228}
229
230impl Default for BusinessMetrics {
231    fn default() -> Self {
232        Self {
233            device_operations_total: 0,
234            device_operations_success: 0,
235            device_operations_failed: 0,
236            operations_by_device_type: HashMap::new(),
237            operations_by_room: HashMap::new(),
238            loxone_api_calls_total: 0,
239            loxone_api_calls_success: 0,
240            loxone_api_calls_failed: 0,
241            structure_refreshes: 0,
242            last_structure_refresh: 0,
243            auth_attempts: 0,
244            auth_successes: 0,
245            auth_failures: 0,
246            cache_hits: 0,
247            cache_misses: 0,
248            schema_validations_total: 0,
249            schema_validations_failed: 0,
250            coalesced_requests: 0,
251            coalescing_time_saved_ms: 0.0,
252        }
253    }
254}
255
256impl Default for ErrorMetrics {
257    fn default() -> Self {
258        Self {
259            total_errors: 0,
260            client_errors: 0,
261            server_errors: 0,
262            network_errors: 0,
263            auth_errors: 0,
264            business_errors: 0,
265            errors_by_tool: HashMap::new(),
266            error_rate_5min: 0.0,
267            error_rate_1hour: 0.0,
268            error_rate_24hour: 0.0,
269            recent_errors: Vec::new(),
270            timeout_errors: 0,
271            connection_errors: 0,
272            validation_errors: 0,
273            device_control_errors: 0,
274        }
275    }
276}
277
278impl Default for MetricsCollector {
279    fn default() -> Self {
280        Self::new()
281    }
282}
283
284impl MetricsCollector {
285    /// Create a new metrics collector
286    pub fn new() -> Self {
287        Self {
288            request_metrics: Arc::new(RwLock::new(RequestMetrics::default())),
289            health_metrics: Arc::new(RwLock::new(HealthMetrics::default())),
290            business_metrics: Arc::new(RwLock::new(BusinessMetrics::default())),
291            error_metrics: Arc::new(RwLock::new(ErrorMetrics::default())),
292            start_time: Instant::now(),
293        }
294    }
295
296    /// Record a request start
297    pub async fn record_request_start(&self, tool_name: &str) {
298        let mut metrics = self.request_metrics.write().await;
299        metrics.total_requests += 1;
300        metrics.active_requests += 1;
301        *metrics
302            .requests_by_tool
303            .entry(tool_name.to_string())
304            .or_insert(0) += 1;
305        metrics.last_updated = current_timestamp();
306    }
307
308    /// Record a request completion
309    pub async fn record_request_end(&self, tool_name: &str, duration: Duration, success: bool) {
310        let duration_ms = duration.as_millis() as f64;
311        let mut metrics = self.request_metrics.write().await;
312
313        metrics.active_requests = metrics.active_requests.saturating_sub(1);
314
315        if success {
316            metrics.successful_requests += 1;
317        } else {
318            metrics.failed_requests += 1;
319        }
320
321        // Update response times
322        metrics
323            .response_times_by_tool
324            .entry(tool_name.to_string())
325            .or_insert_with(Vec::new)
326            .push(duration_ms);
327
328        // Keep only last 1000 response times per tool for memory efficiency
329        if let Some(times) = metrics.response_times_by_tool.get_mut(tool_name) {
330            if times.len() > 1000 {
331                times.drain(..times.len() - 1000);
332            }
333        }
334
335        // Recalculate averages and percentiles
336        self.update_response_time_statistics(&mut metrics).await;
337        metrics.last_updated = current_timestamp();
338    }
339
340    /// Record a rate limit hit
341    pub async fn record_rate_limit_hit(&self) {
342        let mut metrics = self.request_metrics.write().await;
343        metrics.rate_limit_hits += 1;
344        metrics.last_updated = current_timestamp();
345    }
346
347    /// Record an error
348    pub async fn record_error<E: crate::ErrorClassification>(
349        &self,
350        tool_name: &str,
351        request_id: &str,
352        error: &E,
353        duration: Duration,
354    ) {
355        let mut metrics = self.error_metrics.write().await;
356        metrics.total_errors += 1;
357
358        // Classify error using the trait
359        if error.is_auth_error() {
360            metrics.auth_errors += 1;
361            let mut business = self.business_metrics.write().await;
362            business.auth_failures += 1;
363        } else if error.is_connection_error() {
364            metrics.network_errors += 1;
365            metrics.connection_errors += 1;
366        } else if error.is_timeout() {
367            metrics.network_errors += 1;
368            metrics.timeout_errors += 1;
369        } else if error.is_retryable() {
370            metrics.server_errors += 1;
371        } else {
372            metrics.client_errors += 1;
373        }
374
375        // Track by tool
376        *metrics
377            .errors_by_tool
378            .entry(tool_name.to_string())
379            .or_insert(0) += 1;
380
381        // Add to recent errors (keep last 10)
382        let error_record = ErrorRecord {
383            timestamp: current_timestamp(),
384            error_type: error.error_type().to_string(),
385            error_message: error.to_string(),
386            tool_name: tool_name.to_string(),
387            request_id: request_id.to_string(),
388            duration_ms: duration.as_millis() as u64,
389        };
390
391        metrics.recent_errors.push(error_record);
392        if metrics.recent_errors.len() > 10 {
393            metrics.recent_errors.remove(0);
394        }
395    }
396
397    /// Record a device operation
398    pub async fn record_device_operation(
399        &self,
400        device_type: Option<&str>,
401        room_name: Option<&str>,
402        success: bool,
403    ) {
404        let mut metrics = self.business_metrics.write().await;
405        metrics.device_operations_total += 1;
406
407        if success {
408            metrics.device_operations_success += 1;
409        } else {
410            metrics.device_operations_failed += 1;
411        }
412
413        if let Some(dev_type) = device_type {
414            *metrics
415                .operations_by_device_type
416                .entry(dev_type.to_string())
417                .or_insert(0) += 1;
418        }
419
420        if let Some(room) = room_name {
421            *metrics
422                .operations_by_room
423                .entry(room.to_string())
424                .or_insert(0) += 1;
425        }
426    }
427
428    /// Record a Loxone API call
429    pub async fn record_loxone_api_call(&self, success: bool) {
430        let mut metrics = self.business_metrics.write().await;
431        metrics.loxone_api_calls_total += 1;
432
433        if success {
434            metrics.loxone_api_calls_success += 1;
435        } else {
436            metrics.loxone_api_calls_failed += 1;
437        }
438    }
439
440    /// Record schema validation
441    pub async fn record_schema_validation(&self, success: bool) {
442        let mut metrics = self.business_metrics.write().await;
443        metrics.schema_validations_total += 1;
444
445        if !success {
446            metrics.schema_validations_failed += 1;
447        }
448    }
449
450    /// Update health metrics
451    pub async fn update_health_metrics(
452        &self,
453        cpu_usage: Option<f64>,
454        memory_usage_mb: Option<f64>,
455        loxone_latency_ms: Option<f64>,
456        health_check_success: bool,
457    ) {
458        let mut metrics = self.health_metrics.write().await;
459        metrics.cpu_usage_percent = cpu_usage;
460        metrics.memory_usage_mb = memory_usage_mb;
461        metrics.loxone_latency_ms = loxone_latency_ms;
462        metrics.uptime_seconds = self.start_time.elapsed().as_secs();
463        metrics.last_health_check_success = health_check_success;
464        metrics.last_health_check_time = current_timestamp();
465    }
466
467    /// Get comprehensive metrics snapshot
468    pub async fn get_metrics_snapshot(&self) -> MetricsSnapshot {
469        let request_metrics = self.request_metrics.read().await.clone();
470        let health_metrics = self.health_metrics.read().await.clone();
471        let business_metrics = self.business_metrics.read().await.clone();
472        let error_metrics = self.error_metrics.read().await.clone();
473
474        MetricsSnapshot {
475            request_metrics,
476            health_metrics,
477            business_metrics,
478            error_metrics,
479            snapshot_timestamp: current_timestamp(),
480        }
481    }
482
483    /// Update response time statistics
484    async fn update_response_time_statistics(&self, metrics: &mut RequestMetrics) {
485        let mut all_times = Vec::new();
486        for times in metrics.response_times_by_tool.values() {
487            all_times.extend(times);
488        }
489
490        if !all_times.is_empty() {
491            all_times
492                .sort_by(|a: &f64, b: &f64| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
493
494            // Calculate average
495            metrics.avg_response_time_ms = all_times.iter().sum::<f64>() / all_times.len() as f64;
496
497            // Calculate percentiles
498            if all_times.len() >= 20 {
499                let p95_idx = (all_times.len() as f64 * 0.95) as usize;
500                let p99_idx = (all_times.len() as f64 * 0.99) as usize;
501                metrics.p95_response_time_ms = all_times[p95_idx.min(all_times.len() - 1)];
502                metrics.p99_response_time_ms = all_times[p99_idx.min(all_times.len() - 1)];
503            }
504        }
505    }
506}
507
508/// Complete metrics snapshot
509#[derive(Debug, Clone, Serialize, Deserialize)]
510pub struct MetricsSnapshot {
511    pub request_metrics: RequestMetrics,
512    pub health_metrics: HealthMetrics,
513    pub business_metrics: BusinessMetrics,
514    pub error_metrics: ErrorMetrics,
515    pub snapshot_timestamp: u64,
516}
517
518impl MetricsSnapshot {
519    /// Calculate error rate
520    pub fn error_rate(&self) -> f64 {
521        if self.request_metrics.total_requests == 0 {
522            0.0
523        } else {
524            self.request_metrics.failed_requests as f64 / self.request_metrics.total_requests as f64
525        }
526    }
527
528    /// Calculate success rate
529    pub fn success_rate(&self) -> f64 {
530        1.0 - self.error_rate()
531    }
532
533    /// Get availability percentage
534    pub fn availability_percentage(&self) -> f64 {
535        if self.health_metrics.last_health_check_success {
536            99.9 // Assume high availability if last check was successful
537        } else {
538            95.0 // Degraded if health check failed
539        }
540    }
541}
542
/// Current wall-clock time as whole seconds since the Unix epoch.
///
/// If the system clock reports a time before the epoch, `0` is returned
/// instead of an error.
fn current_timestamp() -> u64 {
    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(since_epoch) => since_epoch.as_secs(),
        Err(_) => 0,
    }
}
550
551/// Global metrics collector instance
552static METRICS: once_cell::sync::Lazy<MetricsCollector> =
553    once_cell::sync::Lazy::new(MetricsCollector::new);
554
555/// Get the global metrics collector
556pub fn get_metrics() -> &'static MetricsCollector {
557    &METRICS
558}
559
#[cfg(test)]
mod tests {
    use super::*;
    use std::time::Duration;

    /// Minimal error that classifies itself as an authentication error only.
    #[derive(Debug)]
    struct MockAuthError;

    impl std::fmt::Display for MockAuthError {
        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
            write!(f, "Test auth error")
        }
    }

    impl std::error::Error for MockAuthError {}

    impl crate::ErrorClassification for MockAuthError {
        fn error_type(&self) -> &str {
            "auth_error"
        }
        fn is_retryable(&self) -> bool {
            false
        }
        fn is_timeout(&self) -> bool {
            false
        }
        fn is_auth_error(&self) -> bool {
            true
        }
        fn is_connection_error(&self) -> bool {
            false
        }
    }

    #[tokio::test]
    async fn test_metrics_collection() {
        let collector = MetricsCollector::new();

        // The duration is passed explicitly, so no real waiting is needed.
        collector.record_request_start("test_tool").await;
        collector
            .record_request_end("test_tool", Duration::from_millis(10), true)
            .await;

        let snapshot = collector.get_metrics_snapshot().await;
        let requests = &snapshot.request_metrics;
        assert_eq!(requests.total_requests, 1);
        assert_eq!(requests.successful_requests, 1);
        assert_eq!(requests.failed_requests, 0);
        assert_eq!(requests.active_requests, 0);
        assert_eq!(requests.requests_by_tool.get("test_tool"), Some(&1));
        assert!((requests.avg_response_time_ms - 10.0).abs() < 1e-6);
        assert_eq!(snapshot.error_rate(), 0.0);
        assert_eq!(snapshot.success_rate(), 1.0);
    }

    #[tokio::test]
    async fn test_error_recording() {
        let collector = MetricsCollector::new();

        let error = MockAuthError;
        collector
            .record_error("test_tool", "req-123", &error, Duration::from_millis(100))
            .await;

        let snapshot = collector.get_metrics_snapshot().await;
        let errors = &snapshot.error_metrics;
        assert_eq!(errors.total_errors, 1);
        assert_eq!(errors.auth_errors, 1);
        assert_eq!(errors.client_errors, 0);
        assert_eq!(errors.errors_by_tool.get("test_tool"), Some(&1));
        assert_eq!(snapshot.business_metrics.auth_failures, 1);

        assert_eq!(errors.recent_errors.len(), 1);
        let record = &errors.recent_errors[0];
        assert_eq!(record.error_type, "auth_error");
        assert_eq!(record.error_message, "Test auth error");
        assert_eq!(record.tool_name, "test_tool");
        assert_eq!(record.request_id, "req-123");
        assert_eq!(record.duration_ms, 100);
    }

    #[tokio::test]
    async fn test_recent_errors_are_capped() {
        let collector = MetricsCollector::new();

        let error = MockAuthError;
        for i in 0..12 {
            collector
                .record_error("test_tool", &format!("req-{i}"), &error, Duration::ZERO)
                .await;
        }

        let snapshot = collector.get_metrics_snapshot().await;
        let errors = &snapshot.error_metrics;
        assert_eq!(errors.total_errors, 12);
        assert_eq!(errors.recent_errors.len(), 10);
        // The two oldest records were dropped; the newest is last.
        assert_eq!(errors.recent_errors[0].request_id, "req-2");
        assert_eq!(errors.recent_errors[9].request_id, "req-11");
    }
}