// oxirs_embed/monitoring_metrics.rs

//! Metric types, collectors, and aggregators for embedding monitoring.
//!
//! This module contains all metric struct definitions and the enhanced
//! `MetricsCollector` backed by `scirs2_core::metrics`.

use anyhow::{anyhow, Result};
use chrono::{DateTime, Utc};
use scirs2_core::metrics::{Counter, Gauge, Histogram, MetricsRegistry, Timer};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::sync::Arc;

13/// Error severity levels
14#[derive(Debug, Clone, Serialize, Deserialize)]
15pub enum ErrorSeverity {
16    Low,
17    Medium,
18    High,
19    Critical,
20}
21
22/// Error event for tracking
23#[derive(Debug, Clone)]
24pub struct ErrorEvent {
25    pub timestamp: DateTime<Utc>,
26    pub error_type: String,
27    pub error_message: String,
28    pub severity: ErrorSeverity,
29    pub context: HashMap<String, String>,
30}
31
32/// Quality assessment record
33#[derive(Debug, Clone)]
34pub struct QualityAssessment {
35    pub timestamp: DateTime<Utc>,
36    pub quality_score: f64,
37    pub metrics: HashMap<String, f64>,
38    pub assessment_details: String,
39}
40
41/// Comprehensive performance metrics for embedding systems
42#[derive(Debug, Clone, Serialize, Deserialize, Default)]
43pub struct PerformanceMetrics {
44    /// Request latency metrics
45    pub latency: LatencyMetrics,
46    /// Throughput metrics
47    pub throughput: ThroughputMetrics,
48    /// Resource utilization metrics
49    pub resources: ResourceMetrics,
50    /// Quality metrics
51    pub quality: QualityMetrics,
52    /// Error metrics
53    pub errors: ErrorMetrics,
54    /// Cache performance
55    pub cache: CacheMetrics,
56    /// Model drift metrics
57    pub drift: DriftMetrics,
58}
59
60/// Latency tracking and analysis
61#[derive(Debug, Clone, Serialize, Deserialize)]
62pub struct LatencyMetrics {
63    /// Average embedding generation time (ms)
64    pub avg_embedding_time_ms: f64,
65    /// P50 latency (ms)
66    pub p50_latency_ms: f64,
67    /// P95 latency (ms)
68    pub p95_latency_ms: f64,
69    /// P99 latency (ms)
70    pub p99_latency_ms: f64,
71    /// Maximum latency observed (ms)
72    pub max_latency_ms: f64,
73    /// Minimum latency observed (ms)
74    pub min_latency_ms: f64,
75    /// End-to-end request latency (ms)
76    pub end_to_end_latency_ms: f64,
77    /// Model inference latency (ms)
78    pub model_inference_time_ms: f64,
79    /// Queue wait time (ms)
80    pub queue_wait_time_ms: f64,
81    /// Total measurements
82    pub total_measurements: u64,
83}
84
85/// Throughput monitoring
86#[derive(Debug, Clone, Serialize, Deserialize)]
87pub struct ThroughputMetrics {
88    /// Requests per second
89    pub requests_per_second: f64,
90    /// Embeddings generated per second
91    pub embeddings_per_second: f64,
92    /// Batches processed per second
93    pub batches_per_second: f64,
94    /// Peak throughput achieved
95    pub peak_throughput: f64,
96    /// Current concurrent requests
97    pub concurrent_requests: u32,
98    /// Maximum concurrent requests handled
99    pub max_concurrent_requests: u32,
100    /// Total requests processed
101    pub total_requests: u64,
102    /// Failed requests
103    pub failed_requests: u64,
104    /// Success rate
105    pub success_rate: f64,
106}
107
108/// Resource utilization tracking
109#[derive(Debug, Clone, Serialize, Deserialize)]
110pub struct ResourceMetrics {
111    /// CPU utilization percentage
112    pub cpu_utilization_percent: f64,
113    /// Memory usage in MB
114    pub memory_usage_mb: f64,
115    /// GPU utilization percentage
116    pub gpu_utilization_percent: f64,
117    /// GPU memory usage in MB
118    pub gpu_memory_usage_mb: f64,
119    /// Network I/O in MB/s
120    pub network_io_mbps: f64,
121    /// Disk I/O in MB/s
122    pub disk_io_mbps: f64,
123    /// Peak memory usage
124    pub peak_memory_mb: f64,
125    /// Peak GPU memory usage
126    pub peak_gpu_memory_mb: f64,
127}
128
129/// Quality assessment metrics
130#[derive(Debug, Clone, Serialize, Deserialize)]
131pub struct QualityMetrics {
132    /// Average embedding quality score
133    pub avg_quality_score: f64,
134    /// Embedding space isotropy
135    pub isotropy_score: f64,
136    /// Neighborhood preservation
137    pub neighborhood_preservation: f64,
138    /// Clustering quality
139    pub clustering_quality: f64,
140    /// Similarity correlation
141    pub similarity_correlation: f64,
142    /// Quality degradation alerts
143    pub quality_alerts: u32,
144    /// Last quality assessment time
145    pub last_assessment: DateTime<Utc>,
146}
147
148/// Error tracking and analysis
149#[derive(Debug, Clone, Serialize, Deserialize)]
150pub struct ErrorMetrics {
151    /// Total errors
152    pub total_errors: u64,
153    /// Error rate per hour
154    pub error_rate_per_hour: f64,
155    /// Errors by type
156    pub errors_by_type: HashMap<String, u64>,
157    /// Critical errors
158    pub critical_errors: u64,
159    /// Timeout errors
160    pub timeout_errors: u64,
161    /// Model errors
162    pub model_errors: u64,
163    /// System errors
164    pub system_errors: u64,
165    /// Last error time
166    pub last_error: Option<DateTime<Utc>>,
167}
168
169/// Cache performance metrics
170#[derive(Debug, Clone, Serialize, Deserialize)]
171pub struct CacheMetrics {
172    /// Overall cache hit rate
173    pub hit_rate: f64,
174    /// L1 cache hit rate
175    pub l1_hit_rate: f64,
176    /// L2 cache hit rate
177    pub l2_hit_rate: f64,
178    /// L3 cache hit rate
179    pub l3_hit_rate: f64,
180    /// Cache memory usage MB
181    pub cache_memory_mb: f64,
182    /// Cache evictions
183    pub cache_evictions: u64,
184    /// Time saved by caching (seconds)
185    pub time_saved_seconds: f64,
186}
187
188/// Model drift detection metrics
189#[derive(Debug, Clone, Serialize, Deserialize)]
190pub struct DriftMetrics {
191    /// Embedding quality drift
192    pub quality_drift_score: f64,
193    /// Performance degradation score
194    pub performance_drift_score: f64,
195    /// Distribution shift detected
196    pub distribution_shift: bool,
197    /// Concept drift score
198    pub concept_drift_score: f64,
199    /// Data quality issues
200    pub data_quality_issues: u32,
201    /// Drift detection alerts
202    pub drift_alerts: u32,
203    /// Last drift assessment
204    pub last_drift_check: DateTime<Utc>,
205}
206
207impl Default for LatencyMetrics {
208    fn default() -> Self {
209        Self {
210            avg_embedding_time_ms: 0.0,
211            p50_latency_ms: 0.0,
212            p95_latency_ms: 0.0,
213            p99_latency_ms: 0.0,
214            max_latency_ms: 0.0,
215            min_latency_ms: f64::MAX,
216            end_to_end_latency_ms: 0.0,
217            model_inference_time_ms: 0.0,
218            queue_wait_time_ms: 0.0,
219            total_measurements: 0,
220        }
221    }
222}
223
224impl Default for ThroughputMetrics {
225    fn default() -> Self {
226        Self {
227            requests_per_second: 0.0,
228            embeddings_per_second: 0.0,
229            batches_per_second: 0.0,
230            peak_throughput: 0.0,
231            concurrent_requests: 0,
232            max_concurrent_requests: 0,
233            total_requests: 0,
234            failed_requests: 0,
235            success_rate: 1.0,
236        }
237    }
238}
239
240impl Default for ResourceMetrics {
241    fn default() -> Self {
242        Self {
243            cpu_utilization_percent: 0.0,
244            memory_usage_mb: 0.0,
245            gpu_utilization_percent: 0.0,
246            gpu_memory_usage_mb: 0.0,
247            network_io_mbps: 0.0,
248            disk_io_mbps: 0.0,
249            peak_memory_mb: 0.0,
250            peak_gpu_memory_mb: 0.0,
251        }
252    }
253}
254
255impl Default for QualityMetrics {
256    fn default() -> Self {
257        Self {
258            avg_quality_score: 0.0,
259            isotropy_score: 0.0,
260            neighborhood_preservation: 0.0,
261            clustering_quality: 0.0,
262            similarity_correlation: 0.0,
263            quality_alerts: 0,
264            last_assessment: Utc::now(),
265        }
266    }
267}
268
269impl Default for ErrorMetrics {
270    fn default() -> Self {
271        Self {
272            total_errors: 0,
273            error_rate_per_hour: 0.0,
274            errors_by_type: HashMap::new(),
275            critical_errors: 0,
276            timeout_errors: 0,
277            model_errors: 0,
278            system_errors: 0,
279            last_error: None,
280        }
281    }
282}
283
284impl Default for CacheMetrics {
285    fn default() -> Self {
286        Self {
287            hit_rate: 0.0,
288            l1_hit_rate: 0.0,
289            l2_hit_rate: 0.0,
290            l3_hit_rate: 0.0,
291            cache_memory_mb: 0.0,
292            cache_evictions: 0,
293            time_saved_seconds: 0.0,
294        }
295    }
296}
297
298impl Default for DriftMetrics {
299    fn default() -> Self {
300        Self {
301            quality_drift_score: 0.0,
302            performance_drift_score: 0.0,
303            distribution_shift: false,
304            concept_drift_score: 0.0,
305            data_quality_issues: 0,
306            drift_alerts: 0,
307            last_drift_check: Utc::now(),
308        }
309    }
310}
311
// ====================================================================================
// ENHANCED MONITORING WITH SCIRS2-CORE METRICS
// ====================================================================================

316/// Enhanced metrics collector using scirs2_core::metrics
317pub struct MetricsCollector {
318    // Counters
319    pub(crate) requests_total: Arc<Counter>,
320    pub(crate) embeddings_generated_total: Arc<Counter>,
321    pub(crate) errors_total: Arc<Counter>,
322    pub(crate) cache_hits: Arc<Counter>,
323    pub(crate) cache_misses: Arc<Counter>,
324
325    // Gauges
326    pub(crate) concurrent_requests: Arc<Gauge>,
327    pub(crate) memory_usage_bytes: Arc<Gauge>,
328    pub(crate) gpu_memory_bytes: Arc<Gauge>,
329    pub(crate) cpu_utilization: Arc<Gauge>,
330    pub(crate) gpu_utilization: Arc<Gauge>,
331
332    // Histograms
333    pub(crate) request_latency: Arc<Histogram>,
334    pub(crate) embedding_generation_time: Arc<Histogram>,
335    pub(crate) batch_size: Arc<Histogram>,
336
337    // Timers
338    pub(crate) inference_timer: Arc<Timer>,
339    pub(crate) preprocessing_timer: Arc<Timer>,
340    pub(crate) postprocessing_timer: Arc<Timer>,
341
342    // Registry
343    pub(crate) registry: Arc<MetricsRegistry>,
344}
345
346impl MetricsCollector {
347    /// Create a new metrics collector with scirs2-core metrics
348    pub fn new() -> Self {
349        let registry = Arc::new(MetricsRegistry::new());
350
351        // Create counters
352        let requests_total = Arc::new(Counter::new("embed_requests_total".to_string()));
353        let embeddings_generated_total =
354            Arc::new(Counter::new("embeddings_generated_total".to_string()));
355        let errors_total = Arc::new(Counter::new("embed_errors_total".to_string()));
356        let cache_hits = Arc::new(Counter::new("embed_cache_hits_total".to_string()));
357        let cache_misses = Arc::new(Counter::new("embed_cache_misses_total".to_string()));
358
359        // Create gauges
360        let concurrent_requests = Arc::new(Gauge::new("embed_concurrent_requests".to_string()));
361        let memory_usage_bytes = Arc::new(Gauge::new("embed_memory_usage_bytes".to_string()));
362        let gpu_memory_bytes = Arc::new(Gauge::new("embed_gpu_memory_bytes".to_string()));
363        let cpu_utilization = Arc::new(Gauge::new("embed_cpu_utilization".to_string()));
364        let gpu_utilization = Arc::new(Gauge::new("embed_gpu_utilization".to_string()));
365
366        // Create histograms
367        let request_latency = Arc::new(Histogram::with_buckets(
368            "embed_request_latency_ms".to_string(),
369            vec![
370                1.0, 5.0, 10.0, 25.0, 50.0, 100.0, 250.0, 500.0, 1000.0, 2500.0, 5000.0,
371            ],
372        ));
373        let embedding_generation_time = Arc::new(Histogram::with_buckets(
374            "embed_generation_time_ms".to_string(),
375            vec![0.1, 0.5, 1.0, 2.5, 5.0, 10.0, 25.0, 50.0, 100.0],
376        ));
377        let batch_size = Arc::new(Histogram::with_buckets(
378            "embed_batch_size".to_string(),
379            vec![1.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0],
380        ));
381
382        // Create timers
383        let inference_timer = Arc::new(Timer::new("embed_inference_duration".to_string()));
384        let preprocessing_timer = Arc::new(Timer::new("embed_preprocessing_duration".to_string()));
385        let postprocessing_timer =
386            Arc::new(Timer::new("embed_postprocessing_duration".to_string()));
387
388        Self {
389            requests_total,
390            embeddings_generated_total,
391            errors_total,
392            cache_hits,
393            cache_misses,
394            concurrent_requests,
395            memory_usage_bytes,
396            gpu_memory_bytes,
397            cpu_utilization,
398            gpu_utilization,
399            request_latency,
400            embedding_generation_time,
401            batch_size,
402            inference_timer,
403            preprocessing_timer,
404            postprocessing_timer,
405            registry,
406        }
407    }
408
409    /// Record a request start
410    pub fn record_request_start(&self) {
411        self.requests_total.inc();
412        self.concurrent_requests.inc();
413    }
414
415    /// Record a request completion
416    pub fn record_request_complete(&self, latency_ms: f64) {
417        self.concurrent_requests.dec();
418        self.request_latency.observe(latency_ms);
419    }
420
421    /// Record embedding generation
422    pub fn record_embeddings(&self, count: u64, generation_time_ms: f64) {
423        self.embeddings_generated_total.add(count);
424        self.embedding_generation_time.observe(generation_time_ms);
425    }
426
427    /// Record an error
428    pub fn record_error(&self) {
429        self.errors_total.inc();
430    }
431
432    /// Record cache hit
433    pub fn record_cache_hit(&self) {
434        self.cache_hits.inc();
435    }
436
437    /// Record cache miss
438    pub fn record_cache_miss(&self) {
439        self.cache_misses.inc();
440    }
441
442    /// Update resource metrics
443    pub fn update_resource_metrics(&self, cpu: f64, memory_mb: f64, gpu: f64, gpu_memory_mb: f64) {
444        self.cpu_utilization.set(cpu);
445        self.memory_usage_bytes.set(memory_mb * 1024.0 * 1024.0);
446        self.gpu_utilization.set(gpu);
447        self.gpu_memory_bytes.set(gpu_memory_mb * 1024.0 * 1024.0);
448    }
449
450    /// Get cache hit rate
451    pub fn get_cache_hit_rate(&self) -> f64 {
452        let hits = self.cache_hits.get();
453        let misses = self.cache_misses.get();
454        let total = hits + misses;
455        if total == 0 {
456            return 0.0;
457        }
458        hits as f64 / total as f64
459    }
460
461    /// Export metrics in Prometheus format
462    pub fn export_prometheus(&self) -> Result<String> {
463        self.registry
464            .export_prometheus()
465            .map_err(|e| anyhow!("Failed to export prometheus metrics: {:?}", e))
466    }
467}
468
469impl Default for MetricsCollector {
470    fn default() -> Self {
471        Self::new()
472    }
473}