oxirs_embed/
monitoring.rs

1//! Comprehensive monitoring and metrics system for embedding models
2//!
3//! This module provides real-time performance monitoring, drift detection,
4//! and comprehensive metrics collection for production embedding systems.
5
6use anyhow::{anyhow, Result};
7use chrono::{DateTime, Utc};
8use scirs2_core::random::{Random, Rng};
9use serde::{Deserialize, Serialize};
10use std::collections::{HashMap, VecDeque};
11use std::sync::{Arc, RwLock};
12use std::time::Duration;
13use tokio::sync::Mutex;
14use tokio::task::JoinHandle;
15use tracing::{debug, error, info, warn};
16
17/// Comprehensive performance metrics for embedding systems
18#[derive(Debug, Clone, Serialize, Deserialize, Default)]
19pub struct PerformanceMetrics {
20    /// Request latency metrics
21    pub latency: LatencyMetrics,
22    /// Throughput metrics
23    pub throughput: ThroughputMetrics,
24    /// Resource utilization metrics
25    pub resources: ResourceMetrics,
26    /// Quality metrics
27    pub quality: QualityMetrics,
28    /// Error metrics
29    pub errors: ErrorMetrics,
30    /// Cache performance
31    pub cache: CacheMetrics,
32    /// Model drift metrics
33    pub drift: DriftMetrics,
34}
35
36/// Latency tracking and analysis
37#[derive(Debug, Clone, Serialize, Deserialize)]
38pub struct LatencyMetrics {
39    /// Average embedding generation time (ms)
40    pub avg_embedding_time_ms: f64,
41    /// P50 latency (ms)
42    pub p50_latency_ms: f64,
43    /// P95 latency (ms)
44    pub p95_latency_ms: f64,
45    /// P99 latency (ms)
46    pub p99_latency_ms: f64,
47    /// Maximum latency observed (ms)
48    pub max_latency_ms: f64,
49    /// Minimum latency observed (ms)
50    pub min_latency_ms: f64,
51    /// End-to-end request latency (ms)
52    pub end_to_end_latency_ms: f64,
53    /// Model inference latency (ms)
54    pub model_inference_time_ms: f64,
55    /// Queue wait time (ms)
56    pub queue_wait_time_ms: f64,
57    /// Total measurements
58    pub total_measurements: u64,
59}
60
61/// Throughput monitoring
62#[derive(Debug, Clone, Serialize, Deserialize)]
63pub struct ThroughputMetrics {
64    /// Requests per second
65    pub requests_per_second: f64,
66    /// Embeddings generated per second
67    pub embeddings_per_second: f64,
68    /// Batches processed per second
69    pub batches_per_second: f64,
70    /// Peak throughput achieved
71    pub peak_throughput: f64,
72    /// Current concurrent requests
73    pub concurrent_requests: u32,
74    /// Maximum concurrent requests handled
75    pub max_concurrent_requests: u32,
76    /// Total requests processed
77    pub total_requests: u64,
78    /// Failed requests
79    pub failed_requests: u64,
80    /// Success rate
81    pub success_rate: f64,
82}
83
84/// Resource utilization tracking
85#[derive(Debug, Clone, Serialize, Deserialize)]
86pub struct ResourceMetrics {
87    /// CPU utilization percentage
88    pub cpu_utilization_percent: f64,
89    /// Memory usage in MB
90    pub memory_usage_mb: f64,
91    /// GPU utilization percentage
92    pub gpu_utilization_percent: f64,
93    /// GPU memory usage in MB
94    pub gpu_memory_usage_mb: f64,
95    /// Network I/O in MB/s
96    pub network_io_mbps: f64,
97    /// Disk I/O in MB/s
98    pub disk_io_mbps: f64,
99    /// Peak memory usage
100    pub peak_memory_mb: f64,
101    /// Peak GPU memory usage
102    pub peak_gpu_memory_mb: f64,
103}
104
105/// Quality assessment metrics
106#[derive(Debug, Clone, Serialize, Deserialize)]
107pub struct QualityMetrics {
108    /// Average embedding quality score
109    pub avg_quality_score: f64,
110    /// Embedding space isotropy
111    pub isotropy_score: f64,
112    /// Neighborhood preservation
113    pub neighborhood_preservation: f64,
114    /// Clustering quality
115    pub clustering_quality: f64,
116    /// Similarity correlation
117    pub similarity_correlation: f64,
118    /// Quality degradation alerts
119    pub quality_alerts: u32,
120    /// Last quality assessment time
121    pub last_assessment: DateTime<Utc>,
122}
123
124/// Error tracking and analysis
125#[derive(Debug, Clone, Serialize, Deserialize)]
126pub struct ErrorMetrics {
127    /// Total errors
128    pub total_errors: u64,
129    /// Error rate per hour
130    pub error_rate_per_hour: f64,
131    /// Errors by type
132    pub errors_by_type: HashMap<String, u64>,
133    /// Critical errors
134    pub critical_errors: u64,
135    /// Timeout errors
136    pub timeout_errors: u64,
137    /// Model errors
138    pub model_errors: u64,
139    /// System errors
140    pub system_errors: u64,
141    /// Last error time
142    pub last_error: Option<DateTime<Utc>>,
143}
144
145/// Cache performance metrics
146#[derive(Debug, Clone, Serialize, Deserialize)]
147pub struct CacheMetrics {
148    /// Overall cache hit rate
149    pub hit_rate: f64,
150    /// L1 cache hit rate
151    pub l1_hit_rate: f64,
152    /// L2 cache hit rate
153    pub l2_hit_rate: f64,
154    /// L3 cache hit rate
155    pub l3_hit_rate: f64,
156    /// Cache memory usage MB
157    pub cache_memory_mb: f64,
158    /// Cache evictions
159    pub cache_evictions: u64,
160    /// Time saved by caching (seconds)
161    pub time_saved_seconds: f64,
162}
163
164/// Model drift detection metrics
165#[derive(Debug, Clone, Serialize, Deserialize)]
166pub struct DriftMetrics {
167    /// Embedding quality drift
168    pub quality_drift_score: f64,
169    /// Performance degradation score
170    pub performance_drift_score: f64,
171    /// Distribution shift detected
172    pub distribution_shift: bool,
173    /// Concept drift score
174    pub concept_drift_score: f64,
175    /// Data quality issues
176    pub data_quality_issues: u32,
177    /// Drift detection alerts
178    pub drift_alerts: u32,
179    /// Last drift assessment
180    pub last_drift_check: DateTime<Utc>,
181}
182
183/// Performance monitoring manager
184pub struct PerformanceMonitor {
185    /// Current metrics
186    metrics: Arc<RwLock<PerformanceMetrics>>,
187    /// Latency measurements window
188    latency_window: Arc<Mutex<VecDeque<f64>>>,
189    /// Throughput measurements window
190    throughput_window: Arc<Mutex<VecDeque<f64>>>,
191    /// Error tracking
192    error_log: Arc<Mutex<VecDeque<ErrorEvent>>>,
193    /// Quality assessments
194    quality_history: Arc<Mutex<VecDeque<QualityAssessment>>>,
195    /// Monitoring configuration
196    config: MonitoringConfig,
197    /// Background monitoring tasks
198    monitoring_tasks: Vec<JoinHandle<()>>,
199    /// Alert handlers
200    alert_handlers: Vec<Box<dyn AlertHandler + Send + Sync>>,
201}
202
203/// Monitoring configuration
204#[derive(Debug, Clone)]
205pub struct MonitoringConfig {
206    /// Metrics collection interval (seconds)
207    pub collection_interval_seconds: u64,
208    /// Latency window size for percentile calculations
209    pub latency_window_size: usize,
210    /// Throughput window size
211    pub throughput_window_size: usize,
212    /// Quality assessment interval (seconds)
213    pub quality_assessment_interval_seconds: u64,
214    /// Drift detection interval (seconds)
215    pub drift_detection_interval_seconds: u64,
216    /// Enable real-time alerting
217    pub enable_alerting: bool,
218    /// Alert thresholds
219    pub alert_thresholds: AlertThresholds,
220    /// Metrics export configuration
221    pub export_config: ExportConfig,
222}
223
224/// Alert threshold configuration
225#[derive(Debug, Clone)]
226pub struct AlertThresholds {
227    /// Maximum acceptable P95 latency (ms)
228    pub max_p95_latency_ms: f64,
229    /// Minimum acceptable throughput (req/s)
230    pub min_throughput_rps: f64,
231    /// Maximum acceptable error rate
232    pub max_error_rate: f64,
233    /// Minimum acceptable cache hit rate
234    pub min_cache_hit_rate: f64,
235    /// Maximum acceptable quality drift
236    pub max_quality_drift: f64,
237    /// Maximum acceptable memory usage (MB)
238    pub max_memory_usage_mb: f64,
239    /// Maximum acceptable GPU memory usage (MB)
240    pub max_gpu_memory_mb: f64,
241}
242
243/// Metrics export configuration
244#[derive(Debug, Clone)]
245pub struct ExportConfig {
246    /// Enable Prometheus metrics export
247    pub enable_prometheus: bool,
248    /// Prometheus metrics port
249    pub prometheus_port: u16,
250    /// Enable OpenTelemetry export
251    pub enable_opentelemetry: bool,
252    /// OTLP endpoint
253    pub otlp_endpoint: Option<String>,
254    /// Export interval (seconds)
255    pub export_interval_seconds: u64,
256    /// Enable JSON metrics export
257    pub enable_json_export: bool,
258    /// JSON export path
259    pub json_export_path: Option<String>,
260}
261
262/// Error event for tracking
263#[derive(Debug, Clone)]
264pub struct ErrorEvent {
265    pub timestamp: DateTime<Utc>,
266    pub error_type: String,
267    pub error_message: String,
268    pub severity: ErrorSeverity,
269    pub context: HashMap<String, String>,
270}
271
272/// Error severity levels
273#[derive(Debug, Clone, Serialize, Deserialize)]
274pub enum ErrorSeverity {
275    Low,
276    Medium,
277    High,
278    Critical,
279}
280
281/// Quality assessment record
282#[derive(Debug, Clone)]
283pub struct QualityAssessment {
284    pub timestamp: DateTime<Utc>,
285    pub quality_score: f64,
286    pub metrics: HashMap<String, f64>,
287    pub assessment_details: String,
288}
289
290/// Alert handling trait
291pub trait AlertHandler {
292    fn handle_alert(&self, alert: Alert) -> Result<()>;
293}
294
295/// Alert types
296#[derive(Debug, Clone)]
297pub struct Alert {
298    pub alert_type: AlertType,
299    pub message: String,
300    pub severity: AlertSeverity,
301    pub timestamp: DateTime<Utc>,
302    pub metrics: HashMap<String, f64>,
303}
304
305/// Alert types
306#[derive(Debug, Clone)]
307pub enum AlertType {
308    HighLatency,
309    LowThroughput,
310    HighErrorRate,
311    LowCacheHitRate,
312    QualityDrift,
313    PerformanceDrift,
314    ResourceExhaustion,
315    SystemFailure,
316}
317
318/// Alert severity levels
319#[derive(Debug, Clone)]
320pub enum AlertSeverity {
321    Info,
322    Warning,
323    Critical,
324    Emergency,
325}
326
327impl Default for MonitoringConfig {
328    fn default() -> Self {
329        Self {
330            collection_interval_seconds: 10,
331            latency_window_size: 1000,
332            throughput_window_size: 100,
333            quality_assessment_interval_seconds: 300, // 5 minutes
334            drift_detection_interval_seconds: 3600,   // 1 hour
335            enable_alerting: true,
336            alert_thresholds: AlertThresholds::default(),
337            export_config: ExportConfig::default(),
338        }
339    }
340}
341
342impl Default for AlertThresholds {
343    fn default() -> Self {
344        Self {
345            max_p95_latency_ms: 500.0,
346            min_throughput_rps: 100.0,
347            max_error_rate: 0.05,    // 5%
348            min_cache_hit_rate: 0.8, // 80%
349            max_quality_drift: 0.1,
350            max_memory_usage_mb: 4096.0, // 4GB
351            max_gpu_memory_mb: 8192.0,   // 8GB
352        }
353    }
354}
355
356impl Default for ExportConfig {
357    fn default() -> Self {
358        Self {
359            enable_prometheus: true,
360            prometheus_port: 9090,
361            enable_opentelemetry: false,
362            otlp_endpoint: None,
363            export_interval_seconds: 60,
364            enable_json_export: false,
365            json_export_path: None,
366        }
367    }
368}
369
370impl Default for LatencyMetrics {
371    fn default() -> Self {
372        Self {
373            avg_embedding_time_ms: 0.0,
374            p50_latency_ms: 0.0,
375            p95_latency_ms: 0.0,
376            p99_latency_ms: 0.0,
377            max_latency_ms: 0.0,
378            min_latency_ms: f64::MAX,
379            end_to_end_latency_ms: 0.0,
380            model_inference_time_ms: 0.0,
381            queue_wait_time_ms: 0.0,
382            total_measurements: 0,
383        }
384    }
385}
386
387impl Default for ThroughputMetrics {
388    fn default() -> Self {
389        Self {
390            requests_per_second: 0.0,
391            embeddings_per_second: 0.0,
392            batches_per_second: 0.0,
393            peak_throughput: 0.0,
394            concurrent_requests: 0,
395            max_concurrent_requests: 0,
396            total_requests: 0,
397            failed_requests: 0,
398            success_rate: 1.0,
399        }
400    }
401}
402
403impl Default for ResourceMetrics {
404    fn default() -> Self {
405        Self {
406            cpu_utilization_percent: 0.0,
407            memory_usage_mb: 0.0,
408            gpu_utilization_percent: 0.0,
409            gpu_memory_usage_mb: 0.0,
410            network_io_mbps: 0.0,
411            disk_io_mbps: 0.0,
412            peak_memory_mb: 0.0,
413            peak_gpu_memory_mb: 0.0,
414        }
415    }
416}
417
418impl Default for QualityMetrics {
419    fn default() -> Self {
420        Self {
421            avg_quality_score: 0.0,
422            isotropy_score: 0.0,
423            neighborhood_preservation: 0.0,
424            clustering_quality: 0.0,
425            similarity_correlation: 0.0,
426            quality_alerts: 0,
427            last_assessment: Utc::now(),
428        }
429    }
430}
431
432impl Default for ErrorMetrics {
433    fn default() -> Self {
434        Self {
435            total_errors: 0,
436            error_rate_per_hour: 0.0,
437            errors_by_type: HashMap::new(),
438            critical_errors: 0,
439            timeout_errors: 0,
440            model_errors: 0,
441            system_errors: 0,
442            last_error: None,
443        }
444    }
445}
446
447impl Default for CacheMetrics {
448    fn default() -> Self {
449        Self {
450            hit_rate: 0.0,
451            l1_hit_rate: 0.0,
452            l2_hit_rate: 0.0,
453            l3_hit_rate: 0.0,
454            cache_memory_mb: 0.0,
455            cache_evictions: 0,
456            time_saved_seconds: 0.0,
457        }
458    }
459}
460
461impl Default for DriftMetrics {
462    fn default() -> Self {
463        Self {
464            quality_drift_score: 0.0,
465            performance_drift_score: 0.0,
466            distribution_shift: false,
467            concept_drift_score: 0.0,
468            data_quality_issues: 0,
469            drift_alerts: 0,
470            last_drift_check: Utc::now(),
471        }
472    }
473}
474
475impl PerformanceMonitor {
476    /// Create new performance monitor
477    pub fn new(config: MonitoringConfig) -> Self {
478        Self {
479            metrics: Arc::new(RwLock::new(PerformanceMetrics::default())),
480            latency_window: Arc::new(Mutex::new(VecDeque::with_capacity(
481                config.latency_window_size,
482            ))),
483            throughput_window: Arc::new(Mutex::new(VecDeque::with_capacity(
484                config.throughput_window_size,
485            ))),
486            error_log: Arc::new(Mutex::new(VecDeque::with_capacity(1000))),
487            quality_history: Arc::new(Mutex::new(VecDeque::with_capacity(100))),
488            config,
489            monitoring_tasks: Vec::new(),
490            alert_handlers: Vec::new(),
491        }
492    }
493
494    /// Start monitoring services
495    pub async fn start(&mut self) -> Result<()> {
496        info!("Starting performance monitoring system");
497
498        // Start metrics collection task
499        let metrics_task = self.start_metrics_collection().await;
500        self.monitoring_tasks.push(metrics_task);
501
502        // Start drift detection task
503        let drift_task = self.start_drift_detection().await;
504        self.monitoring_tasks.push(drift_task);
505
506        // Start quality assessment task
507        let quality_task = self.start_quality_assessment().await;
508        self.monitoring_tasks.push(quality_task);
509
510        // Start metrics export task
511        if self.config.export_config.enable_prometheus {
512            let export_task = self.start_metrics_export().await;
513            self.monitoring_tasks.push(export_task);
514        }
515
516        info!("Performance monitoring system started successfully");
517        Ok(())
518    }
519
520    /// Stop monitoring services
521    pub async fn stop(&mut self) {
522        info!("Stopping performance monitoring system");
523
524        for task in self.monitoring_tasks.drain(..) {
525            task.abort();
526        }
527
528        info!("Performance monitoring system stopped");
529    }
530
531    /// Record request latency
532    pub async fn record_latency(&self, latency_ms: f64) {
533        let mut window = self.latency_window.lock().await;
534
535        // Add to sliding window
536        if window.len() >= self.config.latency_window_size {
537            window.pop_front();
538        }
539        window.push_back(latency_ms);
540
541        // Update metrics
542        {
543            let mut metrics = self.metrics.write().expect("rwlock should not be poisoned");
544            metrics.latency.total_measurements += 1;
545
546            // Update min/max
547            metrics.latency.max_latency_ms = metrics.latency.max_latency_ms.max(latency_ms);
548            metrics.latency.min_latency_ms = metrics.latency.min_latency_ms.min(latency_ms);
549
550            // Update average (rolling average)
551            let alpha = 0.1; // Exponential smoothing factor
552            metrics.latency.avg_embedding_time_ms =
553                alpha * latency_ms + (1.0 - alpha) * metrics.latency.avg_embedding_time_ms;
554
555            // Calculate percentiles from window
556            let mut sorted_latencies: Vec<f64> = window.iter().copied().collect();
557            sorted_latencies.sort_by(|a, b| {
558                a.partial_cmp(b)
559                    .expect("latency values should be comparable")
560            });
561
562            if !sorted_latencies.is_empty() {
563                let len = sorted_latencies.len();
564                metrics.latency.p50_latency_ms = sorted_latencies[len * 50 / 100];
565                metrics.latency.p95_latency_ms = sorted_latencies[len * 95 / 100];
566                metrics.latency.p99_latency_ms = sorted_latencies[len * 99 / 100];
567            }
568        }
569
570        // Check for alerts
571        if self.config.enable_alerting {
572            self.check_latency_alerts(latency_ms).await;
573        }
574    }
575
576    /// Record throughput measurement
577    pub async fn record_throughput(&self, requests_per_second: f64) {
578        let mut window = self.throughput_window.lock().await;
579
580        // Add to sliding window
581        if window.len() >= self.config.throughput_window_size {
582            window.pop_front();
583        }
584        window.push_back(requests_per_second);
585
586        // Update metrics
587        {
588            let mut metrics = self.metrics.write().expect("rwlock should not be poisoned");
589            metrics.throughput.requests_per_second = requests_per_second;
590            metrics.throughput.peak_throughput =
591                metrics.throughput.peak_throughput.max(requests_per_second);
592
593            // Calculate average throughput
594            let avg_throughput = window.iter().sum::<f64>() / window.len() as f64;
595            metrics.throughput.requests_per_second = avg_throughput;
596        }
597
598        // Check for alerts
599        if self.config.enable_alerting {
600            self.check_throughput_alerts(requests_per_second).await;
601        }
602    }
603
604    /// Record error event
605    pub async fn record_error(&self, error_event: ErrorEvent) {
606        let mut error_log = self.error_log.lock().await;
607
608        // Add to error log
609        if error_log.len() >= 1000 {
610            error_log.pop_front();
611        }
612        error_log.push_back(error_event.clone());
613
614        // Update metrics
615        {
616            let mut metrics = self.metrics.write().expect("rwlock should not be poisoned");
617            metrics.errors.total_errors += 1;
618            metrics.errors.last_error = Some(error_event.timestamp);
619
620            // Update error counts by type
621            *metrics
622                .errors
623                .errors_by_type
624                .entry(error_event.error_type.clone())
625                .or_insert(0) += 1;
626
627            // Update error type counters
628            if let ErrorSeverity::Critical = error_event.severity {
629                metrics.errors.critical_errors += 1
630            }
631
632            if error_event.error_type.contains("timeout") {
633                metrics.errors.timeout_errors += 1;
634            } else if error_event.error_type.contains("model") {
635                metrics.errors.model_errors += 1;
636            } else {
637                metrics.errors.system_errors += 1;
638            }
639
640            // Calculate error rate
641            let total_requests = metrics.throughput.total_requests;
642            if total_requests > 0 {
643                metrics.errors.error_rate_per_hour =
644                    (metrics.errors.total_errors as f64 / total_requests as f64) * 3600.0;
645            }
646        }
647
648        // Handle critical errors immediately
649        if matches!(error_event.severity, ErrorSeverity::Critical) {
650            self.handle_critical_error(error_event).await;
651        }
652    }
653
654    /// Update resource metrics
655    pub async fn update_resource_metrics(&self, resources: ResourceMetrics) {
656        {
657            let mut metrics = self.metrics.write().expect("rwlock should not be poisoned");
658
659            // Update peak values
660            metrics.resources.peak_memory_mb = metrics
661                .resources
662                .peak_memory_mb
663                .max(resources.memory_usage_mb);
664            metrics.resources.peak_gpu_memory_mb = metrics
665                .resources
666                .peak_gpu_memory_mb
667                .max(resources.gpu_memory_usage_mb);
668
669            metrics.resources = resources.clone();
670        }
671
672        // Check resource alerts
673        if self.config.enable_alerting {
674            self.check_resource_alerts(resources).await;
675        }
676    }
677
678    /// Update cache metrics
679    pub async fn update_cache_metrics(&self, cache_metrics: CacheMetrics) {
680        {
681            let mut metrics = self.metrics.write().expect("rwlock should not be poisoned");
682            metrics.cache = cache_metrics.clone();
683        }
684
685        // Check cache performance alerts
686        if self.config.enable_alerting
687            && cache_metrics.hit_rate < self.config.alert_thresholds.min_cache_hit_rate
688        {
689            self.send_alert(Alert {
690                alert_type: AlertType::LowCacheHitRate,
691                message: format!(
692                    "Cache hit rate dropped to {:.2}%",
693                    cache_metrics.hit_rate * 100.0
694                ),
695                severity: AlertSeverity::Warning,
696                timestamp: Utc::now(),
697                metrics: HashMap::from([
698                    ("hit_rate".to_string(), cache_metrics.hit_rate),
699                    (
700                        "threshold".to_string(),
701                        self.config.alert_thresholds.min_cache_hit_rate,
702                    ),
703                ]),
704            })
705            .await;
706        }
707    }
708
709    /// Get current metrics snapshot
710    pub fn get_metrics(&self) -> PerformanceMetrics {
711        self.metrics
712            .read()
713            .expect("rwlock should not be poisoned")
714            .clone()
715    }
716
717    /// Add alert handler
718    pub fn add_alert_handler(&mut self, handler: Box<dyn AlertHandler + Send + Sync>) {
719        self.alert_handlers.push(handler);
720    }
721
722    /// Start metrics collection background task
723    async fn start_metrics_collection(&self) -> JoinHandle<()> {
724        let metrics = Arc::clone(&self.metrics);
725        let interval = Duration::from_secs(self.config.collection_interval_seconds);
726
727        tokio::spawn(async move {
728            let mut interval_timer = tokio::time::interval(interval);
729
730            loop {
731                interval_timer.tick().await;
732
733                // Collect system metrics
734                let system_metrics = Self::collect_system_metrics().await;
735
736                // Update metrics
737                {
738                    let mut metrics = metrics.write().expect("rwlock should not be poisoned");
739                    metrics.resources = system_metrics;
740                }
741
742                debug!("Collected system metrics");
743            }
744        })
745    }
746
747    /// Start drift detection background task
748    async fn start_drift_detection(&self) -> JoinHandle<()> {
749        let metrics = Arc::clone(&self.metrics);
750        let quality_history = Arc::clone(&self.quality_history);
751        let interval = Duration::from_secs(self.config.drift_detection_interval_seconds);
752
753        tokio::spawn(async move {
754            let mut interval_timer = tokio::time::interval(interval);
755
756            loop {
757                interval_timer.tick().await;
758
759                // Perform drift detection
760                let drift_metrics = Self::detect_drift(&quality_history).await;
761
762                // Update metrics
763                {
764                    let mut metrics = metrics.write().expect("rwlock should not be poisoned");
765                    metrics.drift = drift_metrics;
766                    metrics.drift.last_drift_check = Utc::now();
767                }
768
769                info!("Performed drift detection analysis");
770            }
771        })
772    }
773
774    /// Start quality assessment background task
775    async fn start_quality_assessment(&self) -> JoinHandle<()> {
776        let metrics = Arc::clone(&self.metrics);
777        let quality_history = Arc::clone(&self.quality_history);
778        let interval = Duration::from_secs(self.config.quality_assessment_interval_seconds);
779
780        tokio::spawn(async move {
781            let mut interval_timer = tokio::time::interval(interval);
782
783            loop {
784                interval_timer.tick().await;
785
786                // Perform quality assessment
787                let quality_assessment = Self::assess_quality().await;
788
789                // Add to history
790                {
791                    let mut history = quality_history.lock().await;
792                    if history.len() >= 100 {
793                        history.pop_front();
794                    }
795                    history.push_back(quality_assessment.clone());
796                }
797
798                // Update metrics
799                {
800                    let mut metrics = metrics.write().expect("rwlock should not be poisoned");
801                    metrics.quality.avg_quality_score = quality_assessment.quality_score;
802                    metrics.quality.last_assessment = quality_assessment.timestamp;
803
804                    // Update quality metrics from assessment details
805                    for (key, value) in &quality_assessment.metrics {
806                        match key.as_str() {
807                            "isotropy" => metrics.quality.isotropy_score = *value,
808                            "neighborhood_preservation" => {
809                                metrics.quality.neighborhood_preservation = *value
810                            }
811                            "clustering_quality" => metrics.quality.clustering_quality = *value,
812                            "similarity_correlation" => {
813                                metrics.quality.similarity_correlation = *value
814                            }
815                            _ => {}
816                        }
817                    }
818                }
819
820                info!(
821                    "Performed quality assessment: score = {:.3}",
822                    quality_assessment.quality_score
823                );
824            }
825        })
826    }
827
828    /// Start metrics export background task
829    async fn start_metrics_export(&self) -> JoinHandle<()> {
830        let metrics = Arc::clone(&self.metrics);
831        let export_config = self.config.export_config.clone();
832        let interval = Duration::from_secs(export_config.export_interval_seconds);
833
834        tokio::spawn(async move {
835            let mut interval_timer = tokio::time::interval(interval);
836
837            loop {
838                interval_timer.tick().await;
839
840                // Export metrics
841                let current_metrics = metrics
842                    .read()
843                    .expect("rwlock should not be poisoned")
844                    .clone();
845
846                if export_config.enable_prometheus {
847                    Self::export_prometheus_metrics(&current_metrics).await;
848                }
849
850                if export_config.enable_json_export {
851                    if let Some(ref path) = export_config.json_export_path {
852                        Self::export_json_metrics(&current_metrics, path).await;
853                    }
854                }
855
856                debug!("Exported metrics");
857            }
858        })
859    }
860
861    /// Collect system resource metrics
862    async fn collect_system_metrics() -> ResourceMetrics {
863        // Simulate system metrics collection
864        // In production, this would use actual system monitoring libraries
865        let mut random = Random::default();
866        ResourceMetrics {
867            cpu_utilization_percent: random.random::<f64>() * 100.0,
868            memory_usage_mb: 1024.0 + random.random::<f64>() * 2048.0,
869            gpu_utilization_percent: random.random::<f64>() * 100.0,
870            gpu_memory_usage_mb: 2048.0 + random.random::<f64>() * 4096.0,
871            network_io_mbps: random.random::<f64>() * 100.0,
872            disk_io_mbps: random.random::<f64>() * 50.0,
873            peak_memory_mb: 3072.0,
874            peak_gpu_memory_mb: 6144.0,
875        }
876    }
877
878    /// Detect model and performance drift
879    async fn detect_drift(
880        quality_history: &Arc<Mutex<VecDeque<QualityAssessment>>>,
881    ) -> DriftMetrics {
882        let history = quality_history.lock().await;
883
884        if history.len() < 2 {
885            return DriftMetrics::default();
886        }
887
888        // Calculate quality drift
889        let recent_quality = history
890            .back()
891            .expect("quality history should not be empty")
892            .quality_score;
893        let baseline_quality = history
894            .front()
895            .expect("quality history should not be empty")
896            .quality_score;
897        let quality_drift = (recent_quality - baseline_quality).abs() / baseline_quality;
898
899        // Simulate other drift metrics
900        let mut random = Random::default();
901        DriftMetrics {
902            quality_drift_score: quality_drift,
903            performance_drift_score: random.random::<f64>() * 0.1,
904            distribution_shift: quality_drift > 0.1,
905            concept_drift_score: random.random::<f64>() * 0.05,
906            data_quality_issues: if quality_drift > 0.2 { 1 } else { 0 },
907            drift_alerts: if quality_drift > 0.15 { 1 } else { 0 },
908            last_drift_check: Utc::now(),
909        }
910    }
911
912    /// Assess embedding quality
913    async fn assess_quality() -> QualityAssessment {
914        // Simulate quality assessment
915        // In production, this would perform actual quality metrics calculation
916        let mut random = Random::default();
917        let quality_score = 0.8 + random.random::<f64>() * 0.2;
918
919        let mut metrics = HashMap::new();
920        metrics.insert("isotropy".to_string(), 0.7 + random.random::<f64>() * 0.3);
921        metrics.insert(
922            "neighborhood_preservation".to_string(),
923            0.8 + random.random::<f64>() * 0.2,
924        );
925        metrics.insert(
926            "clustering_quality".to_string(),
927            0.75 + random.random::<f64>() * 0.25,
928        );
929        metrics.insert(
930            "similarity_correlation".to_string(),
931            0.85 + random.random::<f64>() * 0.15,
932        );
933
934        QualityAssessment {
935            timestamp: Utc::now(),
936            quality_score,
937            metrics,
938            assessment_details: format!(
939                "Quality assessment completed with score: {quality_score:.3}"
940            ),
941        }
942    }
943
944    /// Export metrics to Prometheus format
945    async fn export_prometheus_metrics(metrics: &PerformanceMetrics) {
946        // In production, this would export to Prometheus
947        debug!(
948            "Exporting Prometheus metrics: P95 latency = {:.2}ms",
949            metrics.latency.p95_latency_ms
950        );
951    }
952
953    /// Export metrics to JSON file
954    async fn export_json_metrics(metrics: &PerformanceMetrics, path: &str) {
955        match serde_json::to_string_pretty(metrics) {
956            Ok(json) => {
957                if let Err(e) = tokio::fs::write(path, json).await {
958                    error!("Failed to export JSON metrics: {}", e);
959                }
960            }
961            Err(e) => error!("Failed to serialize metrics to JSON: {}", e),
962        }
963    }
964
965    /// Check latency alerts
966    async fn check_latency_alerts(&self, latency_ms: f64) {
967        if latency_ms > self.config.alert_thresholds.max_p95_latency_ms {
968            self.send_alert(Alert {
969                alert_type: AlertType::HighLatency,
970                message: format!("High latency detected: {latency_ms:.2}ms"),
971                severity: AlertSeverity::Warning,
972                timestamp: Utc::now(),
973                metrics: HashMap::from([
974                    ("latency_ms".to_string(), latency_ms),
975                    (
976                        "threshold_ms".to_string(),
977                        self.config.alert_thresholds.max_p95_latency_ms,
978                    ),
979                ]),
980            })
981            .await;
982        }
983    }
984
985    /// Check throughput alerts
986    async fn check_throughput_alerts(&self, throughput_rps: f64) {
987        if throughput_rps < self.config.alert_thresholds.min_throughput_rps {
988            self.send_alert(Alert {
989                alert_type: AlertType::LowThroughput,
990                message: format!("Low throughput detected: {throughput_rps:.2} req/s"),
991                severity: AlertSeverity::Warning,
992                timestamp: Utc::now(),
993                metrics: HashMap::from([
994                    ("throughput_rps".to_string(), throughput_rps),
995                    (
996                        "threshold_rps".to_string(),
997                        self.config.alert_thresholds.min_throughput_rps,
998                    ),
999                ]),
1000            })
1001            .await;
1002        }
1003    }
1004
1005    /// Check resource alerts
1006    async fn check_resource_alerts(&self, resources: ResourceMetrics) {
1007        if resources.memory_usage_mb > self.config.alert_thresholds.max_memory_usage_mb {
1008            self.send_alert(Alert {
1009                alert_type: AlertType::ResourceExhaustion,
1010                message: format!("High memory usage: {:.1}MB", resources.memory_usage_mb),
1011                severity: AlertSeverity::Critical,
1012                timestamp: Utc::now(),
1013                metrics: HashMap::from([
1014                    ("memory_mb".to_string(), resources.memory_usage_mb),
1015                    (
1016                        "threshold_mb".to_string(),
1017                        self.config.alert_thresholds.max_memory_usage_mb,
1018                    ),
1019                ]),
1020            })
1021            .await;
1022        }
1023
1024        if resources.gpu_memory_usage_mb > self.config.alert_thresholds.max_gpu_memory_mb {
1025            self.send_alert(Alert {
1026                alert_type: AlertType::ResourceExhaustion,
1027                message: format!(
1028                    "High GPU memory usage: {:.1}MB",
1029                    resources.gpu_memory_usage_mb
1030                ),
1031                severity: AlertSeverity::Critical,
1032                timestamp: Utc::now(),
1033                metrics: HashMap::from([
1034                    ("gpu_memory_mb".to_string(), resources.gpu_memory_usage_mb),
1035                    (
1036                        "threshold_mb".to_string(),
1037                        self.config.alert_thresholds.max_gpu_memory_mb,
1038                    ),
1039                ]),
1040            })
1041            .await;
1042        }
1043    }
1044
1045    /// Send alert to all registered handlers
1046    async fn send_alert(&self, alert: Alert) {
1047        warn!(
1048            "Alert triggered: {:?} - {}",
1049            alert.alert_type, alert.message
1050        );
1051
1052        for handler in &self.alert_handlers {
1053            if let Err(e) = handler.handle_alert(alert.clone()) {
1054                error!("Alert handler failed: {}", e);
1055            }
1056        }
1057    }
1058
1059    /// Handle critical errors immediately
1060    async fn handle_critical_error(&self, error_event: ErrorEvent) {
1061        error!(
1062            "Critical error occurred: {} - {}",
1063            error_event.error_type, error_event.error_message
1064        );
1065
1066        self.send_alert(Alert {
1067            alert_type: AlertType::SystemFailure,
1068            message: format!("Critical error: {}", error_event.error_message),
1069            severity: AlertSeverity::Emergency,
1070            timestamp: error_event.timestamp,
1071            metrics: HashMap::new(),
1072        })
1073        .await;
1074    }
1075
1076    /// Get performance summary
1077    pub fn get_performance_summary(&self) -> String {
1078        let metrics = self.metrics.read().expect("rwlock should not be poisoned");
1079
1080        format!(
1081            "Performance Summary:\n\
1082             - P95 Latency: {:.2}ms\n\
1083             - Throughput: {:.1} req/s\n\
1084             - Error Rate: {:.3}%\n\
1085             - Cache Hit Rate: {:.1}%\n\
1086             - Memory Usage: {:.1}MB\n\
1087             - Quality Score: {:.3}",
1088            metrics.latency.p95_latency_ms,
1089            metrics.throughput.requests_per_second,
1090            (metrics.errors.total_errors as f64 / metrics.throughput.total_requests.max(1) as f64)
1091                * 100.0,
1092            metrics.cache.hit_rate * 100.0,
1093            metrics.resources.memory_usage_mb,
1094            metrics.quality.avg_quality_score
1095        )
1096    }
1097}
1098
1099/// Console alert handler implementation
1100pub struct ConsoleAlertHandler;
1101
1102impl AlertHandler for ConsoleAlertHandler {
1103    fn handle_alert(&self, alert: Alert) -> Result<()> {
1104        println!(
1105            "🚨 ALERT [{}]: {} - {}",
1106            format!("{:?}", alert.severity).to_uppercase(),
1107            alert.message,
1108            alert.timestamp.format("%Y-%m-%d %H:%M:%S UTC")
1109        );
1110        Ok(())
1111    }
1112}
1113
1114/// Slack alert handler (placeholder)
1115pub struct SlackAlertHandler {
1116    pub webhook_url: String,
1117}
1118
1119impl AlertHandler for SlackAlertHandler {
1120    fn handle_alert(&self, alert: Alert) -> Result<()> {
1121        // In production, this would send to Slack
1122        info!(
1123            "Would send Slack alert to {}: {}",
1124            self.webhook_url, alert.message
1125        );
1126        Ok(())
1127    }
1128}
1129
1130// ====================================================================================
1131// ENHANCED MONITORING WITH SCIRS2-CORE METRICS
1132// ====================================================================================
1133
1134use scirs2_core::metrics::{Counter, Gauge, Histogram, MetricsRegistry, Timer};
1135
1136/// Enhanced metrics collector using scirs2_core::metrics
1137pub struct MetricsCollector {
1138    // Counters
1139    requests_total: Arc<Counter>,
1140    embeddings_generated_total: Arc<Counter>,
1141    errors_total: Arc<Counter>,
1142    cache_hits: Arc<Counter>,
1143    cache_misses: Arc<Counter>,
1144
1145    // Gauges
1146    concurrent_requests: Arc<Gauge>,
1147    memory_usage_bytes: Arc<Gauge>,
1148    gpu_memory_bytes: Arc<Gauge>,
1149    cpu_utilization: Arc<Gauge>,
1150    gpu_utilization: Arc<Gauge>,
1151
1152    // Histograms
1153    request_latency: Arc<Histogram>,
1154    embedding_generation_time: Arc<Histogram>,
1155    batch_size: Arc<Histogram>,
1156
1157    // Timers
1158    inference_timer: Arc<Timer>,
1159    preprocessing_timer: Arc<Timer>,
1160    postprocessing_timer: Arc<Timer>,
1161
1162    // Registry
1163    registry: Arc<MetricsRegistry>,
1164}
1165
1166impl MetricsCollector {
1167    /// Create a new metrics collector with scirs2-core metrics
1168    pub fn new() -> Self {
1169        let registry = Arc::new(MetricsRegistry::new());
1170
1171        // Create counters
1172        let requests_total = Arc::new(Counter::new("embed_requests_total".to_string()));
1173        let embeddings_generated_total =
1174            Arc::new(Counter::new("embeddings_generated_total".to_string()));
1175        let errors_total = Arc::new(Counter::new("embed_errors_total".to_string()));
1176        let cache_hits = Arc::new(Counter::new("embed_cache_hits_total".to_string()));
1177        let cache_misses = Arc::new(Counter::new("embed_cache_misses_total".to_string()));
1178
1179        // Create gauges
1180        let concurrent_requests = Arc::new(Gauge::new("embed_concurrent_requests".to_string()));
1181        let memory_usage_bytes = Arc::new(Gauge::new("embed_memory_usage_bytes".to_string()));
1182        let gpu_memory_bytes = Arc::new(Gauge::new("embed_gpu_memory_bytes".to_string()));
1183        let cpu_utilization = Arc::new(Gauge::new("embed_cpu_utilization".to_string()));
1184        let gpu_utilization = Arc::new(Gauge::new("embed_gpu_utilization".to_string()));
1185
1186        // Create histograms
1187        let request_latency = Arc::new(Histogram::with_buckets(
1188            "embed_request_latency_ms".to_string(),
1189            vec![
1190                1.0, 5.0, 10.0, 25.0, 50.0, 100.0, 250.0, 500.0, 1000.0, 2500.0, 5000.0,
1191            ],
1192        ));
1193        let embedding_generation_time = Arc::new(Histogram::with_buckets(
1194            "embed_generation_time_ms".to_string(),
1195            vec![0.1, 0.5, 1.0, 2.5, 5.0, 10.0, 25.0, 50.0, 100.0],
1196        ));
1197        let batch_size = Arc::new(Histogram::with_buckets(
1198            "embed_batch_size".to_string(),
1199            vec![1.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0],
1200        ));
1201
1202        // Create timers
1203        let inference_timer = Arc::new(Timer::new("embed_inference_duration".to_string()));
1204        let preprocessing_timer = Arc::new(Timer::new("embed_preprocessing_duration".to_string()));
1205        let postprocessing_timer =
1206            Arc::new(Timer::new("embed_postprocessing_duration".to_string()));
1207
1208        Self {
1209            requests_total,
1210            embeddings_generated_total,
1211            errors_total,
1212            cache_hits,
1213            cache_misses,
1214            concurrent_requests,
1215            memory_usage_bytes,
1216            gpu_memory_bytes,
1217            cpu_utilization,
1218            gpu_utilization,
1219            request_latency,
1220            embedding_generation_time,
1221            batch_size,
1222            inference_timer,
1223            preprocessing_timer,
1224            postprocessing_timer,
1225            registry,
1226        }
1227    }
1228
1229    /// Record a request start
1230    pub fn record_request_start(&self) {
1231        self.requests_total.inc();
1232        self.concurrent_requests.inc();
1233    }
1234
1235    /// Record a request completion
1236    pub fn record_request_complete(&self, latency_ms: f64) {
1237        self.concurrent_requests.dec();
1238        self.request_latency.observe(latency_ms);
1239    }
1240
1241    /// Record embedding generation
1242    pub fn record_embeddings(&self, count: u64, generation_time_ms: f64) {
1243        self.embeddings_generated_total.add(count);
1244        self.embedding_generation_time.observe(generation_time_ms);
1245    }
1246
1247    /// Record an error
1248    pub fn record_error(&self) {
1249        self.errors_total.inc();
1250    }
1251
1252    /// Record cache hit
1253    pub fn record_cache_hit(&self) {
1254        self.cache_hits.inc();
1255    }
1256
1257    /// Record cache miss
1258    pub fn record_cache_miss(&self) {
1259        self.cache_misses.inc();
1260    }
1261
1262    /// Update resource metrics
1263    pub fn update_resource_metrics(&self, cpu: f64, memory_mb: f64, gpu: f64, gpu_memory_mb: f64) {
1264        self.cpu_utilization.set(cpu);
1265        self.memory_usage_bytes.set(memory_mb * 1024.0 * 1024.0);
1266        self.gpu_utilization.set(gpu);
1267        self.gpu_memory_bytes.set(gpu_memory_mb * 1024.0 * 1024.0);
1268    }
1269
1270    /// Get cache hit rate
1271    pub fn get_cache_hit_rate(&self) -> f64 {
1272        let hits = self.cache_hits.get();
1273        let misses = self.cache_misses.get();
1274        let total = hits + misses;
1275        if total == 0 {
1276            return 0.0;
1277        }
1278        hits as f64 / total as f64
1279    }
1280
1281    /// Export metrics in Prometheus format
1282    pub fn export_prometheus(&self) -> Result<String> {
1283        self.registry
1284            .export_prometheus()
1285            .map_err(|e| anyhow!("Failed to export prometheus metrics: {:?}", e))
1286    }
1287}
1288
1289impl Default for MetricsCollector {
1290    fn default() -> Self {
1291        Self::new()
1292    }
1293}
1294
1295// ====================================================================================
1296// HEALTH CHECK FUNCTIONALITY
1297// ====================================================================================
1298
1299/// Health status for the embedding service
1300#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
1301pub enum HealthStatus {
1302    /// Service is healthy and operational
1303    Healthy,
1304    /// Service is degraded but operational
1305    Degraded,
1306    /// Service is unhealthy
1307    Unhealthy,
1308}
1309
1310/// Health check result
1311#[derive(Debug, Clone, Serialize, Deserialize)]
1312pub struct HealthCheckResult {
1313    /// Overall health status
1314    pub status: HealthStatus,
1315    /// Timestamp of health check
1316    pub timestamp: DateTime<Utc>,
1317    /// Individual component health
1318    pub components: HashMap<String, ComponentHealth>,
1319    /// Additional details
1320    pub details: HashMap<String, String>,
1321}
1322
1323/// Component health information
1324#[derive(Debug, Clone, Serialize, Deserialize)]
1325pub struct ComponentHealth {
1326    /// Component status
1327    pub status: HealthStatus,
1328    /// Component message
1329    pub message: String,
1330    /// Last check time
1331    pub last_check: DateTime<Utc>,
1332    /// Additional metrics
1333    pub metrics: HashMap<String, f64>,
1334}
1335
1336/// Health checker for embedding service
1337pub struct HealthChecker {
1338    /// Model load status
1339    models_loaded: Arc<RwLock<bool>>,
1340    /// Last successful request time
1341    last_request_time: Arc<RwLock<DateTime<Utc>>>,
1342    /// Error rate threshold
1343    error_rate_threshold: f64,
1344    /// Latency threshold (ms)
1345    latency_threshold_ms: f64,
1346    /// Memory threshold (MB)
1347    memory_threshold_mb: f64,
1348    /// Metrics collector
1349    metrics: Arc<MetricsCollector>,
1350}
1351
1352impl HealthChecker {
1353    /// Create a new health checker
1354    pub fn new(metrics: Arc<MetricsCollector>) -> Self {
1355        Self {
1356            models_loaded: Arc::new(RwLock::new(false)),
1357            last_request_time: Arc::new(RwLock::new(Utc::now())),
1358            error_rate_threshold: 0.1,    // 10%
1359            latency_threshold_ms: 1000.0, // 1 second
1360            memory_threshold_mb: 8192.0,  // 8GB
1361            metrics,
1362        }
1363    }
1364
1365    /// Set models loaded status
1366    pub fn set_models_loaded(&self, loaded: bool) -> Result<()> {
1367        let mut status = self
1368            .models_loaded
1369            .write()
1370            .map_err(|e| anyhow!("Failed to write lock: {}", e))?;
1371        *status = loaded;
1372        Ok(())
1373    }
1374
1375    /// Update last request time
1376    pub fn update_last_request_time(&self) -> Result<()> {
1377        let mut time = self
1378            .last_request_time
1379            .write()
1380            .map_err(|e| anyhow!("Failed to write lock: {}", e))?;
1381        *time = Utc::now();
1382        Ok(())
1383    }
1384
1385    /// Perform liveness check (basic service availability)
1386    pub fn check_liveness(&self) -> HealthCheckResult {
1387        let mut components = HashMap::new();
1388
1389        // Check if service is running (always healthy if we can respond)
1390        components.insert(
1391            "service".to_string(),
1392            ComponentHealth {
1393                status: HealthStatus::Healthy,
1394                message: "Service is running".to_string(),
1395                last_check: Utc::now(),
1396                metrics: HashMap::new(),
1397            },
1398        );
1399
1400        HealthCheckResult {
1401            status: HealthStatus::Healthy,
1402            timestamp: Utc::now(),
1403            components,
1404            details: HashMap::new(),
1405        }
1406    }
1407
1408    /// Perform readiness check (service ready to handle requests)
1409    pub fn check_readiness(&self) -> HealthCheckResult {
1410        let mut components = HashMap::new();
1411        let mut overall_status = HealthStatus::Healthy;
1412
1413        // Check if models are loaded
1414        let models_loaded = self.models_loaded.read().map(|g| *g).unwrap_or(false);
1415        if !models_loaded {
1416            overall_status = HealthStatus::Unhealthy;
1417            components.insert(
1418                "models".to_string(),
1419                ComponentHealth {
1420                    status: HealthStatus::Unhealthy,
1421                    message: "Models not loaded".to_string(),
1422                    last_check: Utc::now(),
1423                    metrics: HashMap::new(),
1424                },
1425            );
1426        } else {
1427            components.insert(
1428                "models".to_string(),
1429                ComponentHealth {
1430                    status: HealthStatus::Healthy,
1431                    message: "Models loaded and ready".to_string(),
1432                    last_check: Utc::now(),
1433                    metrics: HashMap::new(),
1434                },
1435            );
1436        }
1437
1438        // Check cache availability
1439        let cache_hit_rate = self.metrics.get_cache_hit_rate();
1440        components.insert(
1441            "cache".to_string(),
1442            ComponentHealth {
1443                status: HealthStatus::Healthy,
1444                message: format!("Cache hit rate: {:.2}%", cache_hit_rate * 100.0),
1445                last_check: Utc::now(),
1446                metrics: [("hit_rate".to_string(), cache_hit_rate)]
1447                    .into_iter()
1448                    .collect(),
1449            },
1450        );
1451
1452        HealthCheckResult {
1453            status: overall_status,
1454            timestamp: Utc::now(),
1455            components,
1456            details: HashMap::new(),
1457        }
1458    }
1459
1460    /// Perform comprehensive health check
1461    pub fn check_health(&self, performance_metrics: &PerformanceMetrics) -> HealthCheckResult {
1462        let mut components = HashMap::new();
1463        let mut overall_status = HealthStatus::Healthy;
1464
1465        // Check models
1466        let models_loaded = self.models_loaded.read().map(|g| *g).unwrap_or(false);
1467        if !models_loaded {
1468            overall_status = HealthStatus::Unhealthy;
1469            components.insert(
1470                "models".to_string(),
1471                ComponentHealth {
1472                    status: HealthStatus::Unhealthy,
1473                    message: "Models not loaded".to_string(),
1474                    last_check: Utc::now(),
1475                    metrics: HashMap::new(),
1476                },
1477            );
1478        } else {
1479            components.insert(
1480                "models".to_string(),
1481                ComponentHealth {
1482                    status: HealthStatus::Healthy,
1483                    message: "Models operational".to_string(),
1484                    last_check: Utc::now(),
1485                    metrics: HashMap::new(),
1486                },
1487            );
1488        }
1489
1490        // Check latency
1491        let latency_status =
1492            if performance_metrics.latency.p95_latency_ms > self.latency_threshold_ms {
1493                if overall_status == HealthStatus::Healthy {
1494                    overall_status = HealthStatus::Degraded;
1495                }
1496                HealthStatus::Degraded
1497            } else {
1498                HealthStatus::Healthy
1499            };
1500
1501        components.insert(
1502            "latency".to_string(),
1503            ComponentHealth {
1504                status: latency_status,
1505                message: format!(
1506                    "P95 latency: {:.2}ms",
1507                    performance_metrics.latency.p95_latency_ms
1508                ),
1509                last_check: Utc::now(),
1510                metrics: [
1511                    (
1512                        "p50".to_string(),
1513                        performance_metrics.latency.p50_latency_ms,
1514                    ),
1515                    (
1516                        "p95".to_string(),
1517                        performance_metrics.latency.p95_latency_ms,
1518                    ),
1519                    (
1520                        "p99".to_string(),
1521                        performance_metrics.latency.p99_latency_ms,
1522                    ),
1523                ]
1524                .into_iter()
1525                .collect(),
1526            },
1527        );
1528
1529        // Check error rate
1530        let error_rate = if performance_metrics.throughput.total_requests > 0 {
1531            performance_metrics.errors.total_errors as f64
1532                / performance_metrics.throughput.total_requests as f64
1533        } else {
1534            0.0
1535        };
1536
1537        let error_status = if error_rate > self.error_rate_threshold {
1538            if overall_status == HealthStatus::Healthy {
1539                overall_status = HealthStatus::Degraded;
1540            }
1541            HealthStatus::Degraded
1542        } else {
1543            HealthStatus::Healthy
1544        };
1545
1546        components.insert(
1547            "errors".to_string(),
1548            ComponentHealth {
1549                status: error_status,
1550                message: format!("Error rate: {:.2}%", error_rate * 100.0),
1551                last_check: Utc::now(),
1552                metrics: [("error_rate".to_string(), error_rate)]
1553                    .into_iter()
1554                    .collect(),
1555            },
1556        );
1557
1558        // Check memory
1559        let memory_status =
1560            if performance_metrics.resources.memory_usage_mb > self.memory_threshold_mb {
1561                if overall_status == HealthStatus::Healthy {
1562                    overall_status = HealthStatus::Degraded;
1563                }
1564                HealthStatus::Degraded
1565            } else {
1566                HealthStatus::Healthy
1567            };
1568
1569        components.insert(
1570            "memory".to_string(),
1571            ComponentHealth {
1572                status: memory_status,
1573                message: format!(
1574                    "Memory usage: {:.2}MB / {:.2}MB",
1575                    performance_metrics.resources.memory_usage_mb, self.memory_threshold_mb
1576                ),
1577                last_check: Utc::now(),
1578                metrics: [
1579                    (
1580                        "usage_mb".to_string(),
1581                        performance_metrics.resources.memory_usage_mb,
1582                    ),
1583                    ("threshold_mb".to_string(), self.memory_threshold_mb),
1584                ]
1585                .into_iter()
1586                .collect(),
1587            },
1588        );
1589
1590        // Check cache
1591        let cache_hit_rate = self.metrics.get_cache_hit_rate();
1592        components.insert(
1593            "cache".to_string(),
1594            ComponentHealth {
1595                status: HealthStatus::Healthy,
1596                message: format!("Cache hit rate: {:.2}%", cache_hit_rate * 100.0),
1597                last_check: Utc::now(),
1598                metrics: [("hit_rate".to_string(), cache_hit_rate)]
1599                    .into_iter()
1600                    .collect(),
1601            },
1602        );
1603
1604        HealthCheckResult {
1605            status: overall_status,
1606            timestamp: Utc::now(),
1607            components,
1608            details: HashMap::new(),
1609        }
1610    }
1611
1612    /// Get metrics endpoint (Prometheus format)
1613    pub fn get_metrics_endpoint(&self) -> Result<String> {
1614        self.metrics.export_prometheus()
1615    }
1616}
1617
1618#[cfg(test)]
1619mod tests {
1620    use super::*;
1621
1622    #[tokio::test]
1623    async fn test_performance_monitor_creation() {
1624        let config = MonitoringConfig::default();
1625        let monitor = PerformanceMonitor::new(config);
1626
1627        let metrics = monitor.get_metrics();
1628        assert_eq!(metrics.latency.total_measurements, 0);
1629        assert_eq!(metrics.throughput.total_requests, 0);
1630    }
1631
1632    #[tokio::test]
1633    async fn test_latency_recording() {
1634        let config = MonitoringConfig::default();
1635        let monitor = PerformanceMonitor::new(config);
1636
1637        monitor.record_latency(100.0).await;
1638        monitor.record_latency(150.0).await;
1639        monitor.record_latency(120.0).await;
1640
1641        let metrics = monitor.get_metrics();
1642        assert_eq!(metrics.latency.total_measurements, 3);
1643        assert_eq!(metrics.latency.max_latency_ms, 150.0);
1644        assert_eq!(metrics.latency.min_latency_ms, 100.0);
1645    }
1646
1647    #[tokio::test]
1648    async fn test_error_recording() {
1649        let config = MonitoringConfig::default();
1650        let monitor = PerformanceMonitor::new(config);
1651
1652        let error_event = ErrorEvent {
1653            timestamp: Utc::now(),
1654            error_type: "timeout".to_string(),
1655            error_message: "Request timeout".to_string(),
1656            severity: ErrorSeverity::Medium,
1657            context: HashMap::new(),
1658        };
1659
1660        monitor.record_error(error_event).await;
1661
1662        let metrics = monitor.get_metrics();
1663        assert_eq!(metrics.errors.total_errors, 1);
1664        assert_eq!(metrics.errors.timeout_errors, 1);
1665    }
1666
1667    #[test]
1668    fn test_alert_thresholds_default() {
1669        let thresholds = AlertThresholds::default();
1670        assert_eq!(thresholds.max_p95_latency_ms, 500.0);
1671        assert_eq!(thresholds.min_throughput_rps, 100.0);
1672        assert_eq!(thresholds.max_error_rate, 0.05);
1673    }
1674
1675    #[test]
1676    fn test_console_alert_handler() {
1677        let handler = ConsoleAlertHandler;
1678        let alert = Alert {
1679            alert_type: AlertType::HighLatency,
1680            message: "Test alert".to_string(),
1681            severity: AlertSeverity::Warning,
1682            timestamp: Utc::now(),
1683            metrics: HashMap::new(),
1684        };
1685
1686        assert!(handler.handle_alert(alert).is_ok());
1687    }
1688
1689    #[test]
1690    fn test_metrics_collector_creation() {
1691        let collector = MetricsCollector::new();
1692        // Just verify it was created successfully
1693        assert_eq!(collector.requests_total.get(), 0);
1694    }
1695
1696    #[test]
1697    fn test_metrics_collector_counters() {
1698        let collector = MetricsCollector::new();
1699
1700        collector.record_request_start();
1701        collector.record_request_complete(50.0);
1702
1703        assert_eq!(collector.requests_total.get(), 1);
1704    }
1705
1706    #[test]
1707    fn test_metrics_collector_cache_hit_rate() {
1708        let collector = MetricsCollector::new();
1709
1710        collector.record_cache_hit();
1711        collector.record_cache_hit();
1712        collector.record_cache_miss();
1713
1714        let hit_rate = collector.get_cache_hit_rate();
1715        assert!((hit_rate - 0.666).abs() < 0.01); // ~66.6%
1716    }
1717
1718    #[test]
1719    fn test_metrics_collector_resource_update() {
1720        let collector = MetricsCollector::new();
1721
1722        collector.update_resource_metrics(0.75, 2048.0, 0.5, 4096.0);
1723
1724        assert_eq!(collector.cpu_utilization.get(), 0.75);
1725        assert_eq!(collector.memory_usage_bytes.get(), 2048.0 * 1024.0 * 1024.0);
1726        assert_eq!(collector.gpu_utilization.get(), 0.5);
1727        assert_eq!(collector.gpu_memory_bytes.get(), 4096.0 * 1024.0 * 1024.0);
1728    }
1729
1730    #[test]
1731    fn test_health_checker_liveness() {
1732        let metrics = Arc::new(MetricsCollector::new());
1733        let checker = HealthChecker::new(metrics);
1734
1735        let result = checker.check_liveness();
1736        assert_eq!(result.status, HealthStatus::Healthy);
1737        assert!(result.components.contains_key("service"));
1738    }
1739
1740    #[test]
1741    fn test_health_checker_readiness_no_models() {
1742        let metrics = Arc::new(MetricsCollector::new());
1743        let checker = HealthChecker::new(metrics);
1744
1745        let result = checker.check_readiness();
1746        assert_eq!(result.status, HealthStatus::Unhealthy);
1747        assert!(result.components.contains_key("models"));
1748    }
1749
1750    #[test]
1751    fn test_health_checker_readiness_with_models() {
1752        let metrics = Arc::new(MetricsCollector::new());
1753        let checker = HealthChecker::new(metrics);
1754
1755        checker
1756            .set_models_loaded(true)
1757            .expect("Failed to set models loaded");
1758
1759        let result = checker.check_readiness();
1760        assert_eq!(result.status, HealthStatus::Healthy);
1761    }
1762
1763    #[test]
1764    fn test_health_checker_comprehensive() {
1765        let metrics = Arc::new(MetricsCollector::new());
1766        let checker = HealthChecker::new(metrics);
1767
1768        checker
1769            .set_models_loaded(true)
1770            .expect("Failed to set models loaded");
1771
1772        let perf_metrics = PerformanceMetrics::default();
1773
1774        let result = checker.check_health(&perf_metrics);
1775        assert_eq!(result.status, HealthStatus::Healthy);
1776        assert!(result.components.contains_key("models"));
1777        assert!(result.components.contains_key("latency"));
1778        assert!(result.components.contains_key("errors"));
1779        assert!(result.components.contains_key("memory"));
1780    }
1781
1782    #[test]
1783    fn test_health_checker_degraded_latency() {
1784        let metrics = Arc::new(MetricsCollector::new());
1785        let checker = HealthChecker::new(metrics);
1786
1787        checker
1788            .set_models_loaded(true)
1789            .expect("Failed to set models loaded");
1790
1791        let mut perf_metrics = PerformanceMetrics::default();
1792        perf_metrics.latency.p95_latency_ms = 2000.0; // Above threshold
1793
1794        let result = checker.check_health(&perf_metrics);
1795        assert_eq!(result.status, HealthStatus::Degraded);
1796    }
1797
1798    #[test]
1799    fn test_prometheus_export() {
1800        let collector = MetricsCollector::new();
1801
1802        collector.record_request_start();
1803        collector.record_embeddings(5, 25.0);
1804
1805        let prometheus_output = collector.export_prometheus();
1806        assert!(prometheus_output.is_ok());
1807
1808        let _output = prometheus_output.unwrap_or_default();
1809        // Check that metrics were recorded (may not always be in prometheus output depending on implementation)
1810        assert_eq!(collector.requests_total.get(), 1);
1811        assert_eq!(collector.embeddings_generated_total.get(), 5);
1812    }
1813}
oxirs_embed/monitoring.rs

oxirs_embed/
monitoring.rs