1use anyhow::{anyhow, Result};
7use chrono::{DateTime, Utc};
8use scirs2_core::metrics::{Counter, Gauge, Histogram, MetricsRegistry, Timer};
9use serde::{Deserialize, Serialize};
10use std::collections::HashMap;
11use std::sync::Arc;
12
/// Severity level attached to an [`ErrorEvent`].
///
/// Variants are listed from `Low` to `Critical`. No ordering or equality
/// traits are derived, so the declaration order is informational only and
/// values must be inspected with pattern matching.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum ErrorSeverity {
    /// Lowest severity level.
    Low,
    /// Medium severity level.
    Medium,
    /// High severity level.
    High,
    /// Highest severity level.
    Critical,
}
21
/// A single error occurrence together with its classification and context.
///
/// Unlike the metric structs in this file, this type does not derive
/// `Serialize`/`Deserialize`.
#[derive(Debug, Clone)]
pub struct ErrorEvent {
    /// UTC timestamp associated with the error.
    pub timestamp: DateTime<Utc>,
    /// Free-form error category name.
    // NOTE(review): presumably the key used in `ErrorMetrics::errors_by_type` — confirm.
    pub error_type: String,
    /// Human-readable error description.
    pub error_message: String,
    /// Severity assigned to this error.
    pub severity: ErrorSeverity,
    /// Additional string key/value pairs describing the error.
    pub context: HashMap<String, String>,
}
31
/// Result of one quality assessment.
///
/// This type does not derive `Serialize`/`Deserialize`.
#[derive(Debug, Clone)]
pub struct QualityAssessment {
    /// UTC timestamp associated with the assessment.
    pub timestamp: DateTime<Utc>,
    /// Overall quality score.
    // NOTE(review): the valid range of the score is not visible here — confirm.
    pub quality_score: f64,
    /// Named numeric metrics belonging to this assessment.
    pub metrics: HashMap<String, f64>,
    /// Free-form textual details about the assessment.
    pub assessment_details: String,
}
40
/// Aggregate of every metric category defined in this file.
///
/// `Default` is derived, so each field starts from the `Default`
/// implementation of its own type.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct PerformanceMetrics {
    /// Latency figures.
    pub latency: LatencyMetrics,
    /// Throughput figures.
    pub throughput: ThroughputMetrics,
    /// Resource-utilization figures.
    pub resources: ResourceMetrics,
    /// Quality figures.
    pub quality: QualityMetrics,
    /// Error counts and rates.
    pub errors: ErrorMetrics,
    /// Cache figures.
    pub cache: CacheMetrics,
    /// Drift-monitoring figures.
    pub drift: DriftMetrics,
}
59
/// Latency figures; per the `_ms` field suffixes, values are in milliseconds.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LatencyMetrics {
    /// Average embedding time.
    pub avg_embedding_time_ms: f64,
    /// 50th-percentile latency.
    pub p50_latency_ms: f64,
    /// 95th-percentile latency.
    pub p95_latency_ms: f64,
    /// 99th-percentile latency.
    pub p99_latency_ms: f64,
    /// Maximum latency.
    pub max_latency_ms: f64,
    /// Minimum latency. The `Default` impl initializes this to `f64::MAX`,
    /// not `0.0`.
    pub min_latency_ms: f64,
    /// End-to-end latency.
    pub end_to_end_latency_ms: f64,
    /// Model inference time.
    pub model_inference_time_ms: f64,
    /// Queue wait time.
    pub queue_wait_time_ms: f64,
    /// Total number of measurements.
    pub total_measurements: u64,
}
84
/// Throughput figures: rates, concurrency and request totals.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ThroughputMetrics {
    /// Requests per second.
    pub requests_per_second: f64,
    /// Embeddings per second.
    pub embeddings_per_second: f64,
    /// Batches per second.
    pub batches_per_second: f64,
    /// Peak throughput.
    // NOTE(review): the unit (requests vs. embeddings per second) is not visible here — confirm.
    pub peak_throughput: f64,
    /// Number of concurrent requests.
    pub concurrent_requests: u32,
    /// Maximum number of concurrent requests.
    pub max_concurrent_requests: u32,
    /// Total number of requests.
    pub total_requests: u64,
    /// Number of failed requests.
    pub failed_requests: u64,
    /// Success rate. The `Default` impl initializes this to `1.0`.
    // NOTE(review): the 1.0 default suggests a 0.0–1.0 fraction rather than a percentage — confirm.
    pub success_rate: f64,
}
107
/// Resource-utilization figures; units follow the field-name suffixes
/// (`_percent`, `_mb`, `_mbps`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ResourceMetrics {
    /// CPU utilization, in percent.
    pub cpu_utilization_percent: f64,
    /// Memory usage, in MB.
    pub memory_usage_mb: f64,
    /// GPU utilization, in percent.
    pub gpu_utilization_percent: f64,
    /// GPU memory usage, in MB.
    pub gpu_memory_usage_mb: f64,
    /// Network I/O rate (`mbps`).
    pub network_io_mbps: f64,
    /// Disk I/O rate (`mbps`).
    pub disk_io_mbps: f64,
    /// Peak memory usage, in MB.
    pub peak_memory_mb: f64,
    /// Peak GPU memory usage, in MB.
    pub peak_gpu_memory_mb: f64,
}
128
/// Quality figures.
// NOTE(review): score ranges and how each score is computed are not visible
// in this file — confirm against the code that fills these fields.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QualityMetrics {
    /// Average quality score.
    pub avg_quality_score: f64,
    /// Isotropy score.
    pub isotropy_score: f64,
    /// Neighborhood-preservation score.
    pub neighborhood_preservation: f64,
    /// Clustering-quality score.
    pub clustering_quality: f64,
    /// Similarity-correlation score.
    pub similarity_correlation: f64,
    /// Number of quality alerts.
    pub quality_alerts: u32,
    /// UTC timestamp of the last assessment. The `Default` impl sets this to
    /// the time of construction (`Utc::now()`).
    pub last_assessment: DateTime<Utc>,
}
147
/// Error counts and rates.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ErrorMetrics {
    /// Total number of errors.
    pub total_errors: u64,
    /// Error rate per hour.
    pub error_rate_per_hour: f64,
    /// Error counts keyed by error-type name.
    pub errors_by_type: HashMap<String, u64>,
    /// Number of critical errors.
    pub critical_errors: u64,
    /// Number of timeout errors.
    pub timeout_errors: u64,
    /// Number of model errors.
    pub model_errors: u64,
    /// Number of system errors.
    pub system_errors: u64,
    /// UTC timestamp of the last error; `None` in the `Default` value.
    pub last_error: Option<DateTime<Utc>>,
}
168
/// Cache figures, overall and per cache level (`l1`/`l2`/`l3`).
// NOTE(review): whether the hit rates are 0.0–1.0 fractions or percentages is
// not established by this struct — confirm against the code that fills it.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CacheMetrics {
    /// Overall hit rate.
    pub hit_rate: f64,
    /// L1 hit rate.
    pub l1_hit_rate: f64,
    /// L2 hit rate.
    pub l2_hit_rate: f64,
    /// L3 hit rate.
    pub l3_hit_rate: f64,
    /// Cache memory, in MB.
    pub cache_memory_mb: f64,
    /// Number of cache evictions.
    pub cache_evictions: u64,
    /// Time saved, in seconds.
    pub time_saved_seconds: f64,
}
187
/// Drift-monitoring figures.
// NOTE(review): score ranges and thresholds are not visible in this file —
// confirm against the drift-detection code.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DriftMetrics {
    /// Quality drift score.
    pub quality_drift_score: f64,
    /// Performance drift score.
    pub performance_drift_score: f64,
    /// Distribution-shift flag; `false` in the `Default` value.
    pub distribution_shift: bool,
    /// Concept drift score.
    pub concept_drift_score: f64,
    /// Number of data-quality issues.
    pub data_quality_issues: u32,
    /// Number of drift alerts.
    pub drift_alerts: u32,
    /// UTC timestamp of the last drift check. The `Default` impl sets this to
    /// the time of construction (`Utc::now()`).
    pub last_drift_check: DateTime<Utc>,
}
206
207impl Default for LatencyMetrics {
208 fn default() -> Self {
209 Self {
210 avg_embedding_time_ms: 0.0,
211 p50_latency_ms: 0.0,
212 p95_latency_ms: 0.0,
213 p99_latency_ms: 0.0,
214 max_latency_ms: 0.0,
215 min_latency_ms: f64::MAX,
216 end_to_end_latency_ms: 0.0,
217 model_inference_time_ms: 0.0,
218 queue_wait_time_ms: 0.0,
219 total_measurements: 0,
220 }
221 }
222}
223
224impl Default for ThroughputMetrics {
225 fn default() -> Self {
226 Self {
227 requests_per_second: 0.0,
228 embeddings_per_second: 0.0,
229 batches_per_second: 0.0,
230 peak_throughput: 0.0,
231 concurrent_requests: 0,
232 max_concurrent_requests: 0,
233 total_requests: 0,
234 failed_requests: 0,
235 success_rate: 1.0,
236 }
237 }
238}
239
240impl Default for ResourceMetrics {
241 fn default() -> Self {
242 Self {
243 cpu_utilization_percent: 0.0,
244 memory_usage_mb: 0.0,
245 gpu_utilization_percent: 0.0,
246 gpu_memory_usage_mb: 0.0,
247 network_io_mbps: 0.0,
248 disk_io_mbps: 0.0,
249 peak_memory_mb: 0.0,
250 peak_gpu_memory_mb: 0.0,
251 }
252 }
253}
254
255impl Default for QualityMetrics {
256 fn default() -> Self {
257 Self {
258 avg_quality_score: 0.0,
259 isotropy_score: 0.0,
260 neighborhood_preservation: 0.0,
261 clustering_quality: 0.0,
262 similarity_correlation: 0.0,
263 quality_alerts: 0,
264 last_assessment: Utc::now(),
265 }
266 }
267}
268
269impl Default for ErrorMetrics {
270 fn default() -> Self {
271 Self {
272 total_errors: 0,
273 error_rate_per_hour: 0.0,
274 errors_by_type: HashMap::new(),
275 critical_errors: 0,
276 timeout_errors: 0,
277 model_errors: 0,
278 system_errors: 0,
279 last_error: None,
280 }
281 }
282}
283
284impl Default for CacheMetrics {
285 fn default() -> Self {
286 Self {
287 hit_rate: 0.0,
288 l1_hit_rate: 0.0,
289 l2_hit_rate: 0.0,
290 l3_hit_rate: 0.0,
291 cache_memory_mb: 0.0,
292 cache_evictions: 0,
293 time_saved_seconds: 0.0,
294 }
295 }
296}
297
298impl Default for DriftMetrics {
299 fn default() -> Self {
300 Self {
301 quality_drift_score: 0.0,
302 performance_drift_score: 0.0,
303 distribution_shift: false,
304 concept_drift_score: 0.0,
305 data_quality_issues: 0,
306 drift_alerts: 0,
307 last_drift_check: Utc::now(),
308 }
309 }
310}
311
/// Holds the counters, gauges, histograms and timers used for embedding
/// metrics, plus the registry used by `export_prometheus`.
///
/// Every metric is wrapped in an `Arc`. The metric names quoted below are the
/// ones assigned in `MetricsCollector::new`.
pub struct MetricsCollector {
    // --- Counters ---
    /// Counter named `embed_requests_total`; incremented by `record_request_start`.
    pub(crate) requests_total: Arc<Counter>,
    /// Counter named `embeddings_generated_total`; advanced by `record_embeddings`.
    pub(crate) embeddings_generated_total: Arc<Counter>,
    /// Counter named `embed_errors_total`; incremented by `record_error`.
    pub(crate) errors_total: Arc<Counter>,
    /// Counter named `embed_cache_hits_total`; incremented by `record_cache_hit`.
    pub(crate) cache_hits: Arc<Counter>,
    /// Counter named `embed_cache_misses_total`; incremented by `record_cache_miss`.
    pub(crate) cache_misses: Arc<Counter>,

    // --- Gauges ---
    /// Gauge named `embed_concurrent_requests`; incremented on request start
    /// and decremented on request completion.
    pub(crate) concurrent_requests: Arc<Gauge>,
    /// Gauge named `embed_memory_usage_bytes`; set by `update_resource_metrics`.
    pub(crate) memory_usage_bytes: Arc<Gauge>,
    /// Gauge named `embed_gpu_memory_bytes`; set by `update_resource_metrics`.
    pub(crate) gpu_memory_bytes: Arc<Gauge>,
    /// Gauge named `embed_cpu_utilization`; set by `update_resource_metrics`.
    pub(crate) cpu_utilization: Arc<Gauge>,
    /// Gauge named `embed_gpu_utilization`; set by `update_resource_metrics`.
    pub(crate) gpu_utilization: Arc<Gauge>,

    // --- Histograms ---
    /// Histogram named `embed_request_latency_ms`; fed by `record_request_complete`.
    pub(crate) request_latency: Arc<Histogram>,
    /// Histogram named `embed_generation_time_ms`; fed by `record_embeddings`.
    pub(crate) embedding_generation_time: Arc<Histogram>,
    /// Histogram named `embed_batch_size`. No method in this file's
    /// `impl MetricsCollector` observes into it.
    pub(crate) batch_size: Arc<Histogram>,

    // --- Timers ---
    // None of the three timers is used by the methods in this file's
    // `impl MetricsCollector`; being `pub(crate)`, they may be driven from
    // elsewhere in the crate.
    /// Timer named `embed_inference_duration`.
    pub(crate) inference_timer: Arc<Timer>,
    /// Timer named `embed_preprocessing_duration`.
    pub(crate) preprocessing_timer: Arc<Timer>,
    /// Timer named `embed_postprocessing_duration`.
    pub(crate) postprocessing_timer: Arc<Timer>,

    /// Registry that `export_prometheus` exports from.
    pub(crate) registry: Arc<MetricsRegistry>,
}
345
346impl MetricsCollector {
347 pub fn new() -> Self {
349 let registry = Arc::new(MetricsRegistry::new());
350
351 let requests_total = Arc::new(Counter::new("embed_requests_total".to_string()));
353 let embeddings_generated_total =
354 Arc::new(Counter::new("embeddings_generated_total".to_string()));
355 let errors_total = Arc::new(Counter::new("embed_errors_total".to_string()));
356 let cache_hits = Arc::new(Counter::new("embed_cache_hits_total".to_string()));
357 let cache_misses = Arc::new(Counter::new("embed_cache_misses_total".to_string()));
358
359 let concurrent_requests = Arc::new(Gauge::new("embed_concurrent_requests".to_string()));
361 let memory_usage_bytes = Arc::new(Gauge::new("embed_memory_usage_bytes".to_string()));
362 let gpu_memory_bytes = Arc::new(Gauge::new("embed_gpu_memory_bytes".to_string()));
363 let cpu_utilization = Arc::new(Gauge::new("embed_cpu_utilization".to_string()));
364 let gpu_utilization = Arc::new(Gauge::new("embed_gpu_utilization".to_string()));
365
366 let request_latency = Arc::new(Histogram::with_buckets(
368 "embed_request_latency_ms".to_string(),
369 vec![
370 1.0, 5.0, 10.0, 25.0, 50.0, 100.0, 250.0, 500.0, 1000.0, 2500.0, 5000.0,
371 ],
372 ));
373 let embedding_generation_time = Arc::new(Histogram::with_buckets(
374 "embed_generation_time_ms".to_string(),
375 vec![0.1, 0.5, 1.0, 2.5, 5.0, 10.0, 25.0, 50.0, 100.0],
376 ));
377 let batch_size = Arc::new(Histogram::with_buckets(
378 "embed_batch_size".to_string(),
379 vec![1.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0],
380 ));
381
382 let inference_timer = Arc::new(Timer::new("embed_inference_duration".to_string()));
384 let preprocessing_timer = Arc::new(Timer::new("embed_preprocessing_duration".to_string()));
385 let postprocessing_timer =
386 Arc::new(Timer::new("embed_postprocessing_duration".to_string()));
387
388 Self {
389 requests_total,
390 embeddings_generated_total,
391 errors_total,
392 cache_hits,
393 cache_misses,
394 concurrent_requests,
395 memory_usage_bytes,
396 gpu_memory_bytes,
397 cpu_utilization,
398 gpu_utilization,
399 request_latency,
400 embedding_generation_time,
401 batch_size,
402 inference_timer,
403 preprocessing_timer,
404 postprocessing_timer,
405 registry,
406 }
407 }
408
409 pub fn record_request_start(&self) {
411 self.requests_total.inc();
412 self.concurrent_requests.inc();
413 }
414
415 pub fn record_request_complete(&self, latency_ms: f64) {
417 self.concurrent_requests.dec();
418 self.request_latency.observe(latency_ms);
419 }
420
421 pub fn record_embeddings(&self, count: u64, generation_time_ms: f64) {
423 self.embeddings_generated_total.add(count);
424 self.embedding_generation_time.observe(generation_time_ms);
425 }
426
427 pub fn record_error(&self) {
429 self.errors_total.inc();
430 }
431
432 pub fn record_cache_hit(&self) {
434 self.cache_hits.inc();
435 }
436
437 pub fn record_cache_miss(&self) {
439 self.cache_misses.inc();
440 }
441
442 pub fn update_resource_metrics(&self, cpu: f64, memory_mb: f64, gpu: f64, gpu_memory_mb: f64) {
444 self.cpu_utilization.set(cpu);
445 self.memory_usage_bytes.set(memory_mb * 1024.0 * 1024.0);
446 self.gpu_utilization.set(gpu);
447 self.gpu_memory_bytes.set(gpu_memory_mb * 1024.0 * 1024.0);
448 }
449
450 pub fn get_cache_hit_rate(&self) -> f64 {
452 let hits = self.cache_hits.get();
453 let misses = self.cache_misses.get();
454 let total = hits + misses;
455 if total == 0 {
456 return 0.0;
457 }
458 hits as f64 / total as f64
459 }
460
461 pub fn export_prometheus(&self) -> Result<String> {
463 self.registry
464 .export_prometheus()
465 .map_err(|e| anyhow!("Failed to export prometheus metrics: {:?}", e))
466 }
467}
468
469impl Default for MetricsCollector {
470 fn default() -> Self {
471 Self::new()
472 }
473}