1use crate::persistence::{MetricsPersistence, PersistenceConfig};
10use serde::{Deserialize, Serialize};
11use std::collections::HashMap;
12use std::sync::Arc;
13use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
14use tokio::sync::RwLock;
15
16pub struct MetricsCollector {
18 request_metrics: Arc<RwLock<RequestMetrics>>,
20
21 health_metrics: Arc<RwLock<HealthMetrics>>,
23
24 business_metrics: Arc<RwLock<BusinessMetrics>>,
26
27 error_metrics: Arc<RwLock<ErrorMetrics>>,
29
30 start_time: Instant,
32
33 persistence: Option<Arc<MetricsPersistence>>,
35}
36
37#[derive(Debug, Clone, Serialize, Deserialize)]
39pub struct RequestMetrics {
40 pub total_requests: u64,
42
43 pub successful_requests: u64,
45
46 pub failed_requests: u64,
48
49 pub avg_response_time_ms: f64,
51
52 pub p95_response_time_ms: f64,
54
55 pub p99_response_time_ms: f64,
57
58 pub active_requests: u64,
60
61 pub requests_by_tool: HashMap<String, u64>,
63
64 pub response_times_by_tool: HashMap<String, Vec<f64>>,
66
67 pub rate_limit_hits: u64,
69
70 pub requests_per_second: f64,
72
73 pub last_updated: u64,
75}
76
77#[derive(Debug, Clone, Serialize, Deserialize)]
79pub struct HealthMetrics {
80 pub cpu_usage_percent: Option<f64>,
82
83 pub memory_usage_mb: Option<f64>,
85
86 pub memory_usage_percent: Option<f64>,
88
89 pub disk_usage_percent: Option<f64>,
91
92 pub loxone_latency_ms: Option<f64>,
94
95 pub connection_pool_active: Option<u32>,
97 pub connection_pool_idle: Option<u32>,
98 pub connection_pool_max: Option<u32>,
99
100 pub uptime_seconds: u64,
102
103 pub gc_collections: Option<u64>,
105 pub gc_time_ms: Option<f64>,
106
107 pub last_health_check_success: bool,
109 pub last_health_check_time: u64,
110}
111
112#[derive(Debug, Clone, Serialize, Deserialize)]
114pub struct BusinessMetrics {
115 pub device_operations_total: u64,
117 pub device_operations_success: u64,
118 pub device_operations_failed: u64,
119
120 pub operations_by_device_type: HashMap<String, u64>,
122
123 pub operations_by_room: HashMap<String, u64>,
125
126 pub loxone_api_calls_total: u64,
128 pub loxone_api_calls_success: u64,
129 pub loxone_api_calls_failed: u64,
130
131 pub structure_refreshes: u64,
133 pub last_structure_refresh: u64,
134
135 pub auth_attempts: u64,
137 pub auth_successes: u64,
138 pub auth_failures: u64,
139
140 pub cache_hits: u64,
142 pub cache_misses: u64,
143
144 pub schema_validations_total: u64,
146 pub schema_validations_failed: u64,
147
148 pub coalesced_requests: u64,
150 pub coalescing_time_saved_ms: f64,
151}
152
153#[derive(Debug, Clone, Serialize, Deserialize)]
155pub struct ErrorMetrics {
156 pub total_errors: u64,
158
159 pub client_errors: u64,
161 pub server_errors: u64,
162 pub network_errors: u64,
163 pub auth_errors: u64,
164 pub business_errors: u64,
165
166 pub errors_by_tool: HashMap<String, u64>,
168
169 pub error_rate_5min: f64,
171 pub error_rate_1hour: f64,
172 pub error_rate_24hour: f64,
173
174 pub recent_errors: Vec<ErrorRecord>,
176
177 pub timeout_errors: u64,
179 pub connection_errors: u64,
180 pub validation_errors: u64,
181 pub device_control_errors: u64,
182}
183
184#[derive(Debug, Clone, Serialize, Deserialize)]
186pub struct ErrorRecord {
187 pub timestamp: u64,
188 pub error_type: String,
189 pub error_message: String,
190 pub tool_name: String,
191 pub request_id: String,
192 pub duration_ms: u64,
193}
194
195impl Default for RequestMetrics {
196 fn default() -> Self {
197 Self {
198 total_requests: 0,
199 successful_requests: 0,
200 failed_requests: 0,
201 avg_response_time_ms: 0.0,
202 p95_response_time_ms: 0.0,
203 p99_response_time_ms: 0.0,
204 active_requests: 0,
205 requests_by_tool: HashMap::new(),
206 response_times_by_tool: HashMap::new(),
207 rate_limit_hits: 0,
208 requests_per_second: 0.0,
209 last_updated: current_timestamp(),
210 }
211 }
212}
213
214impl Default for HealthMetrics {
215 fn default() -> Self {
216 Self {
217 cpu_usage_percent: None,
218 memory_usage_mb: None,
219 memory_usage_percent: None,
220 disk_usage_percent: None,
221 loxone_latency_ms: None,
222 connection_pool_active: None,
223 connection_pool_idle: None,
224 connection_pool_max: None,
225 uptime_seconds: 0,
226 gc_collections: None,
227 gc_time_ms: None,
228 last_health_check_success: false,
229 last_health_check_time: current_timestamp(),
230 }
231 }
232}
233
234impl Default for BusinessMetrics {
235 fn default() -> Self {
236 Self {
237 device_operations_total: 0,
238 device_operations_success: 0,
239 device_operations_failed: 0,
240 operations_by_device_type: HashMap::new(),
241 operations_by_room: HashMap::new(),
242 loxone_api_calls_total: 0,
243 loxone_api_calls_success: 0,
244 loxone_api_calls_failed: 0,
245 structure_refreshes: 0,
246 last_structure_refresh: 0,
247 auth_attempts: 0,
248 auth_successes: 0,
249 auth_failures: 0,
250 cache_hits: 0,
251 cache_misses: 0,
252 schema_validations_total: 0,
253 schema_validations_failed: 0,
254 coalesced_requests: 0,
255 coalescing_time_saved_ms: 0.0,
256 }
257 }
258}
259
260impl Default for ErrorMetrics {
261 fn default() -> Self {
262 Self {
263 total_errors: 0,
264 client_errors: 0,
265 server_errors: 0,
266 network_errors: 0,
267 auth_errors: 0,
268 business_errors: 0,
269 errors_by_tool: HashMap::new(),
270 error_rate_5min: 0.0,
271 error_rate_1hour: 0.0,
272 error_rate_24hour: 0.0,
273 recent_errors: Vec::new(),
274 timeout_errors: 0,
275 connection_errors: 0,
276 validation_errors: 0,
277 device_control_errors: 0,
278 }
279 }
280}
281
282impl Default for MetricsCollector {
283 fn default() -> Self {
284 Self::new()
285 }
286}
287
288impl MetricsCollector {
289 pub fn new() -> Self {
291 Self {
292 request_metrics: Arc::new(RwLock::new(RequestMetrics::default())),
293 health_metrics: Arc::new(RwLock::new(HealthMetrics::default())),
294 business_metrics: Arc::new(RwLock::new(BusinessMetrics::default())),
295 error_metrics: Arc::new(RwLock::new(ErrorMetrics::default())),
296 start_time: Instant::now(),
297 persistence: None,
298 }
299 }
300
301 pub fn with_persistence(persistence_config: PersistenceConfig) -> Result<Self, std::io::Error> {
303 let persistence = Arc::new(MetricsPersistence::new(persistence_config)?);
304 Ok(Self {
305 request_metrics: Arc::new(RwLock::new(RequestMetrics::default())),
306 health_metrics: Arc::new(RwLock::new(HealthMetrics::default())),
307 business_metrics: Arc::new(RwLock::new(BusinessMetrics::default())),
308 error_metrics: Arc::new(RwLock::new(ErrorMetrics::default())),
309 start_time: Instant::now(),
310 persistence: Some(persistence),
311 })
312 }
313
314 pub async fn enable_persistence(
316 &self,
317 _persistence_config: PersistenceConfig,
318 ) -> Result<(), std::io::Error> {
319 Ok(())
322 }
323
324 pub async fn save_snapshot(&self) -> Result<(), std::io::Error> {
326 if let Some(persistence) = &self.persistence {
327 let snapshot = self.get_metrics_snapshot().await;
328 persistence.save_snapshot(snapshot).await?;
329 }
330 Ok(())
331 }
332
333 pub async fn record_request_start(&self, tool_name: &str) {
335 let mut metrics = self.request_metrics.write().await;
336 metrics.total_requests += 1;
337 metrics.active_requests += 1;
338 *metrics
339 .requests_by_tool
340 .entry(tool_name.to_string())
341 .or_insert(0) += 1;
342 metrics.last_updated = current_timestamp();
343 }
344
345 pub async fn record_request_end(&self, tool_name: &str, duration: Duration, success: bool) {
347 #[allow(clippy::cast_precision_loss)]
348 let duration_ms = duration.as_millis() as f64;
349 let mut metrics = self.request_metrics.write().await;
350
351 metrics.active_requests = metrics.active_requests.saturating_sub(1);
352
353 if success {
354 metrics.successful_requests += 1;
355 } else {
356 metrics.failed_requests += 1;
357 }
358
359 metrics
361 .response_times_by_tool
362 .entry(tool_name.to_string())
363 .or_insert_with(Vec::new)
364 .push(duration_ms);
365
366 if let Some(times) = metrics.response_times_by_tool.get_mut(tool_name) {
368 if times.len() > 1000 {
369 times.drain(..times.len() - 1000);
370 }
371 }
372
373 Self::update_response_time_statistics(&mut metrics);
375 metrics.last_updated = current_timestamp();
376 }
377
378 pub async fn record_rate_limit_hit(&self) {
380 let mut metrics = self.request_metrics.write().await;
381 metrics.rate_limit_hits += 1;
382 metrics.last_updated = current_timestamp();
383 }
384
385 pub async fn record_error<E: crate::ErrorClassification>(
387 &self,
388 tool_name: &str,
389 request_id: &str,
390 error: &E,
391 duration: Duration,
392 ) {
393 let mut metrics = self.error_metrics.write().await;
394 metrics.total_errors += 1;
395
396 if error.is_auth_error() {
398 metrics.auth_errors += 1;
399 let mut business = self.business_metrics.write().await;
400 business.auth_failures += 1;
401 } else if error.is_connection_error() {
402 metrics.network_errors += 1;
403 metrics.connection_errors += 1;
404 } else if error.is_timeout() {
405 metrics.network_errors += 1;
406 metrics.timeout_errors += 1;
407 } else if error.is_retryable() {
408 metrics.server_errors += 1;
409 } else {
410 metrics.client_errors += 1;
411 }
412
413 *metrics
415 .errors_by_tool
416 .entry(tool_name.to_string())
417 .or_insert(0) += 1;
418
419 let error_record = ErrorRecord {
421 timestamp: current_timestamp(),
422 error_type: error.error_type().to_string(),
423 error_message: error.to_string(),
424 tool_name: tool_name.to_string(),
425 request_id: request_id.to_string(),
426 duration_ms: duration.as_millis().try_into().unwrap_or(u64::MAX),
427 };
428
429 metrics.recent_errors.push(error_record);
430 if metrics.recent_errors.len() > 100 {
431 metrics.recent_errors.remove(0);
432 }
433 }
434
435 pub async fn record_device_operation(
437 &self,
438 device_type: Option<&str>,
439 room_name: Option<&str>,
440 success: bool,
441 ) {
442 let mut metrics = self.business_metrics.write().await;
443 metrics.device_operations_total += 1;
444
445 if success {
446 metrics.device_operations_success += 1;
447 } else {
448 metrics.device_operations_failed += 1;
449 }
450
451 if let Some(dev_type) = device_type {
452 *metrics
453 .operations_by_device_type
454 .entry(dev_type.to_string())
455 .or_insert(0) += 1;
456 }
457
458 if let Some(room) = room_name {
459 *metrics
460 .operations_by_room
461 .entry(room.to_string())
462 .or_insert(0) += 1;
463 }
464 }
465
466 pub async fn record_loxone_api_call(&self, success: bool) {
468 let mut metrics = self.business_metrics.write().await;
469 metrics.loxone_api_calls_total += 1;
470
471 if success {
472 metrics.loxone_api_calls_success += 1;
473 } else {
474 metrics.loxone_api_calls_failed += 1;
475 }
476 }
477
478 pub async fn record_schema_validation(&self, success: bool) {
480 let mut metrics = self.business_metrics.write().await;
481 metrics.schema_validations_total += 1;
482
483 if !success {
484 metrics.schema_validations_failed += 1;
485 }
486 }
487
488 pub async fn update_health_metrics(
490 &self,
491 cpu_usage: Option<f64>,
492 memory_usage_mb: Option<f64>,
493 loxone_latency_ms: Option<f64>,
494 health_check_success: bool,
495 ) {
496 let mut metrics = self.health_metrics.write().await;
497 metrics.cpu_usage_percent = cpu_usage;
498 metrics.memory_usage_mb = memory_usage_mb;
499 metrics.loxone_latency_ms = loxone_latency_ms;
500 metrics.uptime_seconds = self.start_time.elapsed().as_secs();
501 metrics.last_health_check_success = health_check_success;
502 metrics.last_health_check_time = current_timestamp();
503 }
504
505 pub async fn get_metrics_snapshot(&self) -> MetricsSnapshot {
507 let request_metrics = self.request_metrics.read().await.clone();
508 let health_metrics = self.health_metrics.read().await.clone();
509 let business_metrics = self.business_metrics.read().await.clone();
510 let error_metrics = self.error_metrics.read().await.clone();
511
512 MetricsSnapshot {
513 request_metrics,
514 health_metrics,
515 business_metrics,
516 error_metrics,
517 snapshot_timestamp: current_timestamp(),
518 }
519 }
520
521 fn update_response_time_statistics(metrics: &mut RequestMetrics) {
523 let mut all_times = Vec::new();
524 for times in metrics.response_times_by_tool.values() {
525 all_times.extend(times);
526 }
527
528 if !all_times.is_empty() {
529 all_times
530 .sort_by(|a: &f64, b: &f64| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
531
532 #[allow(clippy::cast_precision_loss)]
534 {
535 metrics.avg_response_time_ms =
536 all_times.iter().sum::<f64>() / all_times.len() as f64;
537 }
538
539 if all_times.len() >= 2 {
541 #[allow(
542 clippy::cast_precision_loss,
543 clippy::cast_possible_truncation,
544 clippy::cast_sign_loss
545 )]
546 let p95_idx = (all_times.len() as f64 * 0.95) as usize;
547 #[allow(
548 clippy::cast_precision_loss,
549 clippy::cast_possible_truncation,
550 clippy::cast_sign_loss
551 )]
552 let p99_idx = (all_times.len() as f64 * 0.99) as usize;
553 metrics.p95_response_time_ms = all_times[p95_idx.min(all_times.len() - 1)];
554 metrics.p99_response_time_ms = all_times[p99_idx.min(all_times.len() - 1)];
555 } else if !all_times.is_empty() {
556 metrics.p95_response_time_ms = *all_times.last().unwrap();
558 metrics.p99_response_time_ms = *all_times.last().unwrap();
559 }
560 }
561 }
562}
563
564#[derive(Debug, Clone, Serialize, Deserialize)]
566pub struct MetricsSnapshot {
567 pub request_metrics: RequestMetrics,
568 pub health_metrics: HealthMetrics,
569 pub business_metrics: BusinessMetrics,
570 pub error_metrics: ErrorMetrics,
571 pub snapshot_timestamp: u64,
572}
573
574impl MetricsSnapshot {
575 pub fn error_rate(&self) -> f64 {
577 if self.request_metrics.total_requests == 0 {
578 0.0
579 } else {
580 #[allow(clippy::cast_precision_loss)]
581 {
582 self.request_metrics.failed_requests as f64
583 / self.request_metrics.total_requests as f64
584 }
585 }
586 }
587
588 pub fn success_rate(&self) -> f64 {
590 1.0 - self.error_rate()
591 }
592
593 pub fn availability_percentage(&self) -> f64 {
595 if self.health_metrics.last_health_check_success {
596 99.9 } else {
598 95.0 }
600 }
601}
602
/// Returns the current Unix time in whole seconds.
///
/// Returns `0` if the system clock reports a time before the Unix epoch.
pub fn current_timestamp() -> u64 {
    SystemTime::now()
        .duration_since(UNIX_EPOCH)
        // A pre-epoch clock yields an error; treat it as a zero duration.
        .unwrap_or_default()
        .as_secs()
}
610
611static METRICS: once_cell::sync::Lazy<MetricsCollector> =
613 once_cell::sync::Lazy::new(MetricsCollector::new);
614
615pub fn get_metrics() -> &'static MetricsCollector {
617 &METRICS
618}
619
// Further tests for this module are loaded from the file `metrics_tests.rs`.
#[cfg(test)]
#[path = "metrics_tests.rs"]
mod metrics_tests;
623
#[cfg(test)]
mod tests {
    use super::*;
    use std::time::Duration;

    /// Minimal error that classifies itself as an authentication failure.
    #[derive(Debug)]
    struct MockAuthError;

    impl std::fmt::Display for MockAuthError {
        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
            write!(f, "Test auth error")
        }
    }

    impl std::error::Error for MockAuthError {}

    impl crate::ErrorClassification for MockAuthError {
        fn error_type(&self) -> &'static str {
            "auth_error"
        }
        fn is_retryable(&self) -> bool {
            false
        }
        fn is_timeout(&self) -> bool {
            false
        }
        fn is_auth_error(&self) -> bool {
            true
        }
        fn is_connection_error(&self) -> bool {
            false
        }
    }

    /// A started-and-finished request is reflected in the request counters.
    #[tokio::test]
    async fn test_metrics_collection() {
        let collector = MetricsCollector::new();

        collector.record_request_start("test_tool").await;
        // The duration is passed in explicitly, so no real waiting is needed.
        collector
            .record_request_end("test_tool", Duration::from_millis(10), true)
            .await;

        let snapshot = collector.get_metrics_snapshot().await;
        assert_eq!(snapshot.request_metrics.total_requests, 1);
        assert_eq!(snapshot.request_metrics.successful_requests, 1);
        assert_eq!(snapshot.request_metrics.failed_requests, 0);
        assert_eq!(snapshot.request_metrics.active_requests, 0);
        assert!(snapshot.request_metrics.avg_response_time_ms > 0.0);
    }

    /// An auth error is counted, logged, and mirrored into the auth-failure
    /// counter of the business metrics.
    #[tokio::test]
    async fn test_error_recording() {
        let collector = MetricsCollector::new();

        let error = MockAuthError;
        collector
            .record_error("test_tool", "req-123", &error, Duration::from_millis(100))
            .await;

        let snapshot = collector.get_metrics_snapshot().await;
        assert_eq!(snapshot.error_metrics.total_errors, 1);
        assert_eq!(snapshot.error_metrics.auth_errors, 1);
        assert_eq!(snapshot.error_metrics.recent_errors.len(), 1);
        assert_eq!(
            snapshot.error_metrics.errors_by_tool.get("test_tool"),
            Some(&1)
        );
        assert_eq!(snapshot.business_metrics.auth_failures, 1);
    }
}