1use serde::{Deserialize, Serialize};
10use std::collections::HashMap;
11use std::sync::Arc;
12use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
13use tokio::sync::RwLock;
14
15pub struct MetricsCollector {
17 request_metrics: Arc<RwLock<RequestMetrics>>,
19
20 health_metrics: Arc<RwLock<HealthMetrics>>,
22
23 business_metrics: Arc<RwLock<BusinessMetrics>>,
25
26 error_metrics: Arc<RwLock<ErrorMetrics>>,
28
29 start_time: Instant,
31}
32
33#[derive(Debug, Clone, Serialize, Deserialize)]
35pub struct RequestMetrics {
36 pub total_requests: u64,
38
39 pub successful_requests: u64,
41
42 pub failed_requests: u64,
44
45 pub avg_response_time_ms: f64,
47
48 pub p95_response_time_ms: f64,
50
51 pub p99_response_time_ms: f64,
53
54 pub active_requests: u64,
56
57 pub requests_by_tool: HashMap<String, u64>,
59
60 pub response_times_by_tool: HashMap<String, Vec<f64>>,
62
63 pub rate_limit_hits: u64,
65
66 pub requests_per_second: f64,
68
69 pub last_updated: u64,
71}
72
73#[derive(Debug, Clone, Serialize, Deserialize)]
75pub struct HealthMetrics {
76 pub cpu_usage_percent: Option<f64>,
78
79 pub memory_usage_mb: Option<f64>,
81
82 pub memory_usage_percent: Option<f64>,
84
85 pub disk_usage_percent: Option<f64>,
87
88 pub loxone_latency_ms: Option<f64>,
90
91 pub connection_pool_active: Option<u32>,
93 pub connection_pool_idle: Option<u32>,
94 pub connection_pool_max: Option<u32>,
95
96 pub uptime_seconds: u64,
98
99 pub gc_collections: Option<u64>,
101 pub gc_time_ms: Option<f64>,
102
103 pub last_health_check_success: bool,
105 pub last_health_check_time: u64,
106}
107
108#[derive(Debug, Clone, Serialize, Deserialize)]
110pub struct BusinessMetrics {
111 pub device_operations_total: u64,
113 pub device_operations_success: u64,
114 pub device_operations_failed: u64,
115
116 pub operations_by_device_type: HashMap<String, u64>,
118
119 pub operations_by_room: HashMap<String, u64>,
121
122 pub loxone_api_calls_total: u64,
124 pub loxone_api_calls_success: u64,
125 pub loxone_api_calls_failed: u64,
126
127 pub structure_refreshes: u64,
129 pub last_structure_refresh: u64,
130
131 pub auth_attempts: u64,
133 pub auth_successes: u64,
134 pub auth_failures: u64,
135
136 pub cache_hits: u64,
138 pub cache_misses: u64,
139
140 pub schema_validations_total: u64,
142 pub schema_validations_failed: u64,
143
144 pub coalesced_requests: u64,
146 pub coalescing_time_saved_ms: f64,
147}
148
149#[derive(Debug, Clone, Serialize, Deserialize)]
151pub struct ErrorMetrics {
152 pub total_errors: u64,
154
155 pub client_errors: u64,
157 pub server_errors: u64,
158 pub network_errors: u64,
159 pub auth_errors: u64,
160 pub business_errors: u64,
161
162 pub errors_by_tool: HashMap<String, u64>,
164
165 pub error_rate_5min: f64,
167 pub error_rate_1hour: f64,
168 pub error_rate_24hour: f64,
169
170 pub recent_errors: Vec<ErrorRecord>,
172
173 pub timeout_errors: u64,
175 pub connection_errors: u64,
176 pub validation_errors: u64,
177 pub device_control_errors: u64,
178}
179
180#[derive(Debug, Clone, Serialize, Deserialize)]
182pub struct ErrorRecord {
183 pub timestamp: u64,
184 pub error_type: String,
185 pub error_message: String,
186 pub tool_name: String,
187 pub request_id: String,
188 pub duration_ms: u64,
189}
190
191impl Default for RequestMetrics {
192 fn default() -> Self {
193 Self {
194 total_requests: 0,
195 successful_requests: 0,
196 failed_requests: 0,
197 avg_response_time_ms: 0.0,
198 p95_response_time_ms: 0.0,
199 p99_response_time_ms: 0.0,
200 active_requests: 0,
201 requests_by_tool: HashMap::new(),
202 response_times_by_tool: HashMap::new(),
203 rate_limit_hits: 0,
204 requests_per_second: 0.0,
205 last_updated: current_timestamp(),
206 }
207 }
208}
209
210impl Default for HealthMetrics {
211 fn default() -> Self {
212 Self {
213 cpu_usage_percent: None,
214 memory_usage_mb: None,
215 memory_usage_percent: None,
216 disk_usage_percent: None,
217 loxone_latency_ms: None,
218 connection_pool_active: None,
219 connection_pool_idle: None,
220 connection_pool_max: None,
221 uptime_seconds: 0,
222 gc_collections: None,
223 gc_time_ms: None,
224 last_health_check_success: false,
225 last_health_check_time: current_timestamp(),
226 }
227 }
228}
229
230impl Default for BusinessMetrics {
231 fn default() -> Self {
232 Self {
233 device_operations_total: 0,
234 device_operations_success: 0,
235 device_operations_failed: 0,
236 operations_by_device_type: HashMap::new(),
237 operations_by_room: HashMap::new(),
238 loxone_api_calls_total: 0,
239 loxone_api_calls_success: 0,
240 loxone_api_calls_failed: 0,
241 structure_refreshes: 0,
242 last_structure_refresh: 0,
243 auth_attempts: 0,
244 auth_successes: 0,
245 auth_failures: 0,
246 cache_hits: 0,
247 cache_misses: 0,
248 schema_validations_total: 0,
249 schema_validations_failed: 0,
250 coalesced_requests: 0,
251 coalescing_time_saved_ms: 0.0,
252 }
253 }
254}
255
256impl Default for ErrorMetrics {
257 fn default() -> Self {
258 Self {
259 total_errors: 0,
260 client_errors: 0,
261 server_errors: 0,
262 network_errors: 0,
263 auth_errors: 0,
264 business_errors: 0,
265 errors_by_tool: HashMap::new(),
266 error_rate_5min: 0.0,
267 error_rate_1hour: 0.0,
268 error_rate_24hour: 0.0,
269 recent_errors: Vec::new(),
270 timeout_errors: 0,
271 connection_errors: 0,
272 validation_errors: 0,
273 device_control_errors: 0,
274 }
275 }
276}
277
278impl Default for MetricsCollector {
279 fn default() -> Self {
280 Self::new()
281 }
282}
283
284impl MetricsCollector {
285 pub fn new() -> Self {
287 Self {
288 request_metrics: Arc::new(RwLock::new(RequestMetrics::default())),
289 health_metrics: Arc::new(RwLock::new(HealthMetrics::default())),
290 business_metrics: Arc::new(RwLock::new(BusinessMetrics::default())),
291 error_metrics: Arc::new(RwLock::new(ErrorMetrics::default())),
292 start_time: Instant::now(),
293 }
294 }
295
296 pub async fn record_request_start(&self, tool_name: &str) {
298 let mut metrics = self.request_metrics.write().await;
299 metrics.total_requests += 1;
300 metrics.active_requests += 1;
301 *metrics
302 .requests_by_tool
303 .entry(tool_name.to_string())
304 .or_insert(0) += 1;
305 metrics.last_updated = current_timestamp();
306 }
307
308 pub async fn record_request_end(&self, tool_name: &str, duration: Duration, success: bool) {
310 let duration_ms = duration.as_millis() as f64;
311 let mut metrics = self.request_metrics.write().await;
312
313 metrics.active_requests = metrics.active_requests.saturating_sub(1);
314
315 if success {
316 metrics.successful_requests += 1;
317 } else {
318 metrics.failed_requests += 1;
319 }
320
321 metrics
323 .response_times_by_tool
324 .entry(tool_name.to_string())
325 .or_insert_with(Vec::new)
326 .push(duration_ms);
327
328 if let Some(times) = metrics.response_times_by_tool.get_mut(tool_name) {
330 if times.len() > 1000 {
331 times.drain(..times.len() - 1000);
332 }
333 }
334
335 self.update_response_time_statistics(&mut metrics).await;
337 metrics.last_updated = current_timestamp();
338 }
339
340 pub async fn record_rate_limit_hit(&self) {
342 let mut metrics = self.request_metrics.write().await;
343 metrics.rate_limit_hits += 1;
344 metrics.last_updated = current_timestamp();
345 }
346
347 pub async fn record_error<E: crate::ErrorClassification>(
349 &self,
350 tool_name: &str,
351 request_id: &str,
352 error: &E,
353 duration: Duration,
354 ) {
355 let mut metrics = self.error_metrics.write().await;
356 metrics.total_errors += 1;
357
358 if error.is_auth_error() {
360 metrics.auth_errors += 1;
361 let mut business = self.business_metrics.write().await;
362 business.auth_failures += 1;
363 } else if error.is_connection_error() {
364 metrics.network_errors += 1;
365 metrics.connection_errors += 1;
366 } else if error.is_timeout() {
367 metrics.network_errors += 1;
368 metrics.timeout_errors += 1;
369 } else if error.is_retryable() {
370 metrics.server_errors += 1;
371 } else {
372 metrics.client_errors += 1;
373 }
374
375 *metrics
377 .errors_by_tool
378 .entry(tool_name.to_string())
379 .or_insert(0) += 1;
380
381 let error_record = ErrorRecord {
383 timestamp: current_timestamp(),
384 error_type: error.error_type().to_string(),
385 error_message: error.to_string(),
386 tool_name: tool_name.to_string(),
387 request_id: request_id.to_string(),
388 duration_ms: duration.as_millis() as u64,
389 };
390
391 metrics.recent_errors.push(error_record);
392 if metrics.recent_errors.len() > 10 {
393 metrics.recent_errors.remove(0);
394 }
395 }
396
397 pub async fn record_device_operation(
399 &self,
400 device_type: Option<&str>,
401 room_name: Option<&str>,
402 success: bool,
403 ) {
404 let mut metrics = self.business_metrics.write().await;
405 metrics.device_operations_total += 1;
406
407 if success {
408 metrics.device_operations_success += 1;
409 } else {
410 metrics.device_operations_failed += 1;
411 }
412
413 if let Some(dev_type) = device_type {
414 *metrics
415 .operations_by_device_type
416 .entry(dev_type.to_string())
417 .or_insert(0) += 1;
418 }
419
420 if let Some(room) = room_name {
421 *metrics
422 .operations_by_room
423 .entry(room.to_string())
424 .or_insert(0) += 1;
425 }
426 }
427
428 pub async fn record_loxone_api_call(&self, success: bool) {
430 let mut metrics = self.business_metrics.write().await;
431 metrics.loxone_api_calls_total += 1;
432
433 if success {
434 metrics.loxone_api_calls_success += 1;
435 } else {
436 metrics.loxone_api_calls_failed += 1;
437 }
438 }
439
440 pub async fn record_schema_validation(&self, success: bool) {
442 let mut metrics = self.business_metrics.write().await;
443 metrics.schema_validations_total += 1;
444
445 if !success {
446 metrics.schema_validations_failed += 1;
447 }
448 }
449
450 pub async fn update_health_metrics(
452 &self,
453 cpu_usage: Option<f64>,
454 memory_usage_mb: Option<f64>,
455 loxone_latency_ms: Option<f64>,
456 health_check_success: bool,
457 ) {
458 let mut metrics = self.health_metrics.write().await;
459 metrics.cpu_usage_percent = cpu_usage;
460 metrics.memory_usage_mb = memory_usage_mb;
461 metrics.loxone_latency_ms = loxone_latency_ms;
462 metrics.uptime_seconds = self.start_time.elapsed().as_secs();
463 metrics.last_health_check_success = health_check_success;
464 metrics.last_health_check_time = current_timestamp();
465 }
466
467 pub async fn get_metrics_snapshot(&self) -> MetricsSnapshot {
469 let request_metrics = self.request_metrics.read().await.clone();
470 let health_metrics = self.health_metrics.read().await.clone();
471 let business_metrics = self.business_metrics.read().await.clone();
472 let error_metrics = self.error_metrics.read().await.clone();
473
474 MetricsSnapshot {
475 request_metrics,
476 health_metrics,
477 business_metrics,
478 error_metrics,
479 snapshot_timestamp: current_timestamp(),
480 }
481 }
482
483 async fn update_response_time_statistics(&self, metrics: &mut RequestMetrics) {
485 let mut all_times = Vec::new();
486 for times in metrics.response_times_by_tool.values() {
487 all_times.extend(times);
488 }
489
490 if !all_times.is_empty() {
491 all_times
492 .sort_by(|a: &f64, b: &f64| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
493
494 metrics.avg_response_time_ms = all_times.iter().sum::<f64>() / all_times.len() as f64;
496
497 if all_times.len() >= 20 {
499 let p95_idx = (all_times.len() as f64 * 0.95) as usize;
500 let p99_idx = (all_times.len() as f64 * 0.99) as usize;
501 metrics.p95_response_time_ms = all_times[p95_idx.min(all_times.len() - 1)];
502 metrics.p99_response_time_ms = all_times[p99_idx.min(all_times.len() - 1)];
503 }
504 }
505 }
506}
507
508#[derive(Debug, Clone, Serialize, Deserialize)]
510pub struct MetricsSnapshot {
511 pub request_metrics: RequestMetrics,
512 pub health_metrics: HealthMetrics,
513 pub business_metrics: BusinessMetrics,
514 pub error_metrics: ErrorMetrics,
515 pub snapshot_timestamp: u64,
516}
517
518impl MetricsSnapshot {
519 pub fn error_rate(&self) -> f64 {
521 if self.request_metrics.total_requests == 0 {
522 0.0
523 } else {
524 self.request_metrics.failed_requests as f64 / self.request_metrics.total_requests as f64
525 }
526 }
527
528 pub fn success_rate(&self) -> f64 {
530 1.0 - self.error_rate()
531 }
532
533 pub fn availability_percentage(&self) -> f64 {
535 if self.health_metrics.last_health_check_success {
536 99.9 } else {
538 95.0 }
540 }
541}
542
/// Returns the current wall-clock time as whole seconds since the Unix epoch.
///
/// Falls back to `0` if the system clock reports a time before the epoch.
fn current_timestamp() -> u64 {
    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(since_epoch) => since_epoch.as_secs(),
        Err(_) => 0,
    }
}
550
551static METRICS: once_cell::sync::Lazy<MetricsCollector> =
553 once_cell::sync::Lazy::new(MetricsCollector::new);
554
555pub fn get_metrics() -> &'static MetricsCollector {
557 &METRICS
558}
559
#[cfg(test)]
mod tests {
    use super::*;
    use std::time::Duration;

    /// A request that is started and then finished successfully shows up in
    /// the snapshot's request counters and average response time.
    #[tokio::test]
    async fn test_metrics_collection() {
        let metrics = MetricsCollector::new();
        let elapsed = Duration::from_millis(10);

        metrics.record_request_start("test_tool").await;
        // Stand-in for the request's work; the recorded duration is the
        // explicit `elapsed` argument below, not this wall-clock wait.
        tokio::time::sleep(elapsed).await;
        metrics.record_request_end("test_tool", elapsed, true).await;

        let requests = metrics.get_metrics_snapshot().await.request_metrics;
        assert_eq!(requests.total_requests, 1);
        assert_eq!(requests.successful_requests, 1);
        assert!(requests.avg_response_time_ms > 0.0);
    }

    /// An error classified as an auth error is counted in the totals, in the
    /// auth category and in the recent-error list.
    #[tokio::test]
    async fn test_error_recording() {
        /// Minimal error type that classifies itself as an auth error only.
        #[derive(Debug)]
        struct MockAuthError;

        impl std::fmt::Display for MockAuthError {
            fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
                f.write_str("Test auth error")
            }
        }

        impl std::error::Error for MockAuthError {}

        impl crate::ErrorClassification for MockAuthError {
            fn error_type(&self) -> &str {
                "auth_error"
            }
            fn is_retryable(&self) -> bool {
                false
            }
            fn is_timeout(&self) -> bool {
                false
            }
            fn is_auth_error(&self) -> bool {
                true
            }
            fn is_connection_error(&self) -> bool {
                false
            }
        }

        let metrics = MetricsCollector::new();
        metrics
            .record_error(
                "test_tool",
                "req-123",
                &MockAuthError,
                Duration::from_millis(100),
            )
            .await;

        let errors = metrics.get_metrics_snapshot().await.error_metrics;
        assert_eq!(errors.total_errors, 1);
        assert_eq!(errors.auth_errors, 1);
        assert_eq!(errors.recent_errors.len(), 1);
    }
}