bws_web_server/monitoring/metrics.rs

//! Metrics collection and reporting for BWS Web Server
//!
//! This module provides metrics collection, aggregation, and reporting
//! functionality for monitoring server performance and health.
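//!
//! A minimal usage sketch (illustrative only; marked `ignore` so it is not
//! compiled as a doctest):
//!
//! ```ignore
//! use std::time::Duration;
//!
//! let collector = MetricsCollector::new();
//! collector.record_request(200, Duration::from_millis(25), 2048);
//! let snapshot = collector.get_metrics().unwrap();
//! println!("{}", snapshot.to_json());
//! ```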

use crate::core::{BwsResult, HealthStatus};
use std::collections::HashMap;
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::{Arc, RwLock};
use std::time::{Duration, Instant};

/// Metrics collector for BWS Web Server
#[derive(Debug)]
pub struct MetricsCollector {
    /// Request counters by status code
    request_counts: Arc<RwLock<HashMap<u16, AtomicU64>>>,

    /// Response time histogram
    response_times: Arc<RwLock<Vec<Duration>>>,

    /// Active connections counter
    active_connections: AtomicU64,

    /// Total bytes served
    bytes_served: AtomicU64,

    /// Error counters by type
    error_counts: Arc<RwLock<HashMap<String, AtomicU64>>>,

    /// Server start time
    start_time: Instant,
}

impl MetricsCollector {
    /// Create a new metrics collector
    pub fn new() -> Self {
        Self {
            request_counts: Arc::new(RwLock::new(HashMap::new())),
            response_times: Arc::new(RwLock::new(Vec::new())),
            active_connections: AtomicU64::new(0),
            bytes_served: AtomicU64::new(0),
            error_counts: Arc::new(RwLock::new(HashMap::new())),
            start_time: Instant::now(),
        }
    }

    /// Record a request with its status code, response time, and bytes served
    pub fn record_request(&self, status_code: u16, response_time: Duration, bytes: u64) {
        // Increment request counter
        if let Ok(mut counts) = self.request_counts.write() {
            counts
                .entry(status_code)
                .or_insert_with(|| AtomicU64::new(0))
                .fetch_add(1, Ordering::Relaxed);
        }

        // Record response time (keep only recent samples)
        if let Ok(mut times) = self.response_times.write() {
            times.push(response_time);
            // Keep only last 1000 samples
            if times.len() > 1000 {
                let excess = times.len() - 1000;
                times.drain(0..excess);
            }
        }

        // Add bytes served
        self.bytes_served.fetch_add(bytes, Ordering::Relaxed);
    }

    /// Increment active connections
    pub fn increment_connections(&self) {
        self.active_connections.fetch_add(1, Ordering::Relaxed);
    }

    /// Decrement active connections
    pub fn decrement_connections(&self) {
        // Saturate at zero so an unbalanced call cannot wrap the counter around to u64::MAX.
        let _ = self
            .active_connections
            .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |n| Some(n.saturating_sub(1)));
    }

    /// Record an error
    pub fn record_error(&self, error_type: &str) {
        if let Ok(mut errors) = self.error_counts.write() {
            errors
                .entry(error_type.to_string())
                .or_insert_with(|| AtomicU64::new(0))
                .fetch_add(1, Ordering::Relaxed);
        }
    }

    /// Get current metrics snapshot
    pub fn get_metrics(&self) -> BwsResult<MetricsSnapshot> {
        let request_counts = self
            .request_counts
            .read()
            .map_err(|_| {
                crate::core::BwsError::Internal("Failed to read request counts".to_string())
            })?
            .iter()
            .map(|(k, v)| (*k, v.load(Ordering::Relaxed)))
            .collect();

        let response_times = self
            .response_times
            .read()
            .map_err(|_| {
                crate::core::BwsError::Internal("Failed to read response times".to_string())
            })?
            .clone();

        let error_counts = self
            .error_counts
            .read()
            .map_err(|_| {
                crate::core::BwsError::Internal("Failed to read error counts".to_string())
            })?
            .iter()
            .map(|(k, v)| (k.clone(), v.load(Ordering::Relaxed)))
            .collect();

        let avg_response_time = if response_times.is_empty() {
            Duration::from_millis(0)
        } else {
            let total: Duration = response_times.iter().sum();
            total / response_times.len() as u32
        };

        let p95_response_time = if response_times.is_empty() {
            Duration::from_millis(0)
        } else {
            let mut sorted_times = response_times.clone();
            sorted_times.sort();
            let index = (sorted_times.len() as f64 * 0.95) as usize;
            sorted_times.get(index).copied().unwrap_or_default()
        };

        Ok(MetricsSnapshot {
            uptime: self.start_time.elapsed(),
            request_counts,
            active_connections: self.active_connections.load(Ordering::Relaxed),
            bytes_served: self.bytes_served.load(Ordering::Relaxed),
            avg_response_time,
            p95_response_time,
            error_counts,
            health_status: self.get_health_status(),
        })
    }

    /// Get overall health status based on metrics
    fn get_health_status(&self) -> HealthStatus {
        let active_connections = self.active_connections.load(Ordering::Relaxed);
        let error_rate = self.calculate_error_rate();

        if error_rate > 0.1 {
            HealthStatus::Unhealthy
        } else if error_rate > 0.05 || active_connections > 1000 {
            HealthStatus::Degraded
        } else {
            HealthStatus::Healthy
        }
    }

    /// Calculate current error rate
    fn calculate_error_rate(&self) -> f64 {
        let request_counts = match self.request_counts.read() {
            Ok(counts) => counts,
            Err(_) => return 0.0,
        };

        let total_requests: u64 = request_counts
            .values()
            .map(|v| v.load(Ordering::Relaxed))
            .sum();

        if total_requests == 0 {
            return 0.0;
        }

        let error_requests: u64 = request_counts
            .iter()
            .filter(|(status, _)| **status >= 400)
            .map(|(_, count)| count.load(Ordering::Relaxed))
            .sum();

        error_requests as f64 / total_requests as f64
    }
}

impl Default for MetricsCollector {
    fn default() -> Self {
        Self::new()
    }
}

/// Snapshot of current metrics
#[derive(Debug, Clone)]
pub struct MetricsSnapshot {
    /// Server uptime
    pub uptime: Duration,

    /// Request counts by status code
    pub request_counts: HashMap<u16, u64>,

    /// Current active connections
    pub active_connections: u64,

    /// Total bytes served
    pub bytes_served: u64,

    /// Average response time
    pub avg_response_time: Duration,

    /// 95th percentile response time
    pub p95_response_time: Duration,

    /// Error counts by type
    pub error_counts: HashMap<String, u64>,

    /// Overall health status
    pub health_status: HealthStatus,
}

impl MetricsSnapshot {
    /// Convert to JSON for API responses
    pub fn to_json(&self) -> serde_json::Value {
        serde_json::json!({
            "uptime_seconds": self.uptime.as_secs(),
            "request_counts": self.request_counts,
            "active_connections": self.active_connections,
            "bytes_served": self.bytes_served,
            // `as_millis()` returns u128; cast down so serde_json can always
            // represent the value as a plain JSON number.
            "avg_response_time_ms": self.avg_response_time.as_millis() as u64,
            "p95_response_time_ms": self.p95_response_time.as_millis() as u64,
            "error_counts": self.error_counts,
            "health_status": format!("{:?}", self.health_status),
        })
    }
}

/// Global metrics instance
static METRICS: std::sync::OnceLock<MetricsCollector> = std::sync::OnceLock::new();

/// Get global metrics collector
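///
/// A minimal usage sketch; the call site is illustrative and assumes this
/// collector is shared by request-handling code (marked `ignore` so it is not
/// compiled as a doctest):
///
/// ```ignore
/// let m = metrics();
/// m.increment_connections();
/// m.record_request(200, std::time::Duration::from_millis(12), 2048);
/// m.decrement_connections();
/// println!("{}", m.get_metrics().unwrap().to_json());
/// ```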
pub fn metrics() -> &'static MetricsCollector {
    METRICS.get_or_init(MetricsCollector::new)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_metrics_collection() {
        let collector = MetricsCollector::new();

        // Record some requests
        collector.record_request(200, Duration::from_millis(100), 1024);
        collector.record_request(404, Duration::from_millis(50), 512);
        collector.record_request(500, Duration::from_millis(200), 256);

        let metrics = collector.get_metrics().unwrap();

        assert_eq!(metrics.request_counts.get(&200), Some(&1));
        assert_eq!(metrics.request_counts.get(&404), Some(&1));
        assert_eq!(metrics.request_counts.get(&500), Some(&1));
        assert_eq!(metrics.bytes_served, 1792);
    }

    #[test]
    fn test_connection_tracking() {
        let collector = MetricsCollector::new();

        collector.increment_connections();
        collector.increment_connections();
        assert_eq!(collector.active_connections.load(Ordering::Relaxed), 2);

        collector.decrement_connections();
        assert_eq!(collector.active_connections.load(Ordering::Relaxed), 1);
    }
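
    // Illustrative test sketch for the error-rate thresholds used by
    // get_health_status (rate above 10% => Unhealthy, above 5% => Degraded);
    // the request mix below is chosen only to exercise those thresholds.
    #[test]
    fn test_error_rate_and_health_status() {
        let collector = MetricsCollector::new();

        // 18 successful requests and 2 server errors: error rate = 2/20 = 0.10.
        for _ in 0..18 {
            collector.record_request(200, Duration::from_millis(10), 128);
        }
        for _ in 0..2 {
            collector.record_request(500, Duration::from_millis(10), 128);
        }

        assert!((collector.calculate_error_rate() - 0.10).abs() < 1e-9);
        // 0.10 does not exceed the 0.10 unhealthy threshold, but it exceeds
        // 0.05, so the collector reports Degraded.
        assert!(matches!(collector.get_health_status(), HealthStatus::Degraded));
    }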
}