rexis_rag/caching/
metrics.rs

//! # Cache Metrics and Monitoring
//!
//! Performance metrics and monitoring for the caching layer.
4
5use super::{CacheStats, OverallCacheMetrics};
6use serde::{Deserialize, Serialize};
7use std::collections::HashMap;
8use std::time::{Duration, SystemTime};
9
10/// Cache metrics collector
11pub struct MetricsCollector {
12    /// Per-cache metrics
13    cache_metrics: HashMap<String, CacheStats>,
14
15    /// Operation timings
16    operation_timings: OperationTimings,
17
18    /// Memory tracking
19    memory_tracker: MemoryTracker,
20
21    /// Performance analyzer
22    analyzer: PerformanceAnalyzer,
23
24    /// Metrics history
25    history: MetricsHistory,
26}
27
28/// Operation timing statistics
29#[derive(Debug, Clone, Serialize, Deserialize)]
30pub struct OperationTimings {
31    /// Get operation timings
32    pub get_timings: TimingStats,
33
34    /// Put operation timings
35    pub put_timings: TimingStats,
36
37    /// Remove operation timings
38    pub remove_timings: TimingStats,
39
40    /// Eviction timings
41    pub eviction_timings: TimingStats,
42
43    /// Compression timings
44    pub compression_timings: TimingStats,
45}
46
47/// Timing statistics for an operation
48#[derive(Debug, Clone, Serialize, Deserialize)]
49pub struct TimingStats {
50    /// Total operations
51    pub count: u64,
52
53    /// Total time in microseconds
54    pub total_us: u64,
55
56    /// Average time in microseconds
57    pub avg_us: f32,
58
59    /// Minimum time
60    pub min_us: u64,
61
62    /// Maximum time
63    pub max_us: u64,
64
65    /// 50th percentile
66    pub p50_us: u64,
67
68    /// 95th percentile
69    pub p95_us: u64,
70
71    /// 99th percentile
72    pub p99_us: u64,
73}
74
75/// Memory usage tracker
76#[derive(Debug, Clone)]
77pub struct MemoryTracker {
78    /// Current memory usage
79    pub current_bytes: usize,
80
81    /// Peak memory usage
82    pub peak_bytes: usize,
83
84    /// Memory saved through compression
85    pub compression_saved_bytes: usize,
86
87    /// Memory saved through deduplication
88    pub deduplication_saved_bytes: usize,
89
90    /// Memory pressure events
91    pub pressure_events: Vec<MemoryPressureEvent>,
92}
93
94/// Memory pressure event
95#[derive(Debug, Clone, Serialize, Deserialize)]
96pub struct MemoryPressureEvent {
97    /// When the event occurred
98    pub timestamp: SystemTime,
99
100    /// Memory usage at time of event
101    pub memory_bytes: usize,
102
103    /// Pressure level (0.0 to 1.0)
104    pub pressure_level: f32,
105
106    /// Action taken
107    pub action: PressureAction,
108
109    /// Memory freed
110    pub freed_bytes: usize,
111}
112
113/// Actions taken under memory pressure
114#[derive(Debug, Clone, Serialize, Deserialize)]
115pub enum PressureAction {
116    /// Evicted entries
117    Eviction { count: usize },
118
119    /// Compressed entries
120    Compression { count: usize },
121
122    /// Cleared entire cache
123    ClearCache,
124
125    /// No action needed
126    None,
127}
128
129/// Performance analyzer
130#[derive(Debug, Clone)]
131pub struct PerformanceAnalyzer {
132    /// Hit rate over time
133    pub hit_rate_history: Vec<(SystemTime, f32)>,
134
135    /// Operations per second history
136    pub ops_history: Vec<(SystemTime, f32)>,
137
138    /// Latency percentiles over time
139    pub latency_history: Vec<(SystemTime, LatencySnapshot)>,
140
141    /// Efficiency score history
142    pub efficiency_history: Vec<(SystemTime, f32)>,
143}
144
145/// Latency snapshot at a point in time
146#[derive(Debug, Clone, Serialize, Deserialize)]
147pub struct LatencySnapshot {
148    pub p50_us: u64,
149    pub p95_us: u64,
150    pub p99_us: u64,
151    pub max_us: u64,
152}
153
154/// Metrics history for trend analysis
155#[derive(Debug, Clone)]
156pub struct MetricsHistory {
157    /// Historical snapshots
158    pub snapshots: Vec<MetricsSnapshot>,
159
160    /// Maximum history size
161    pub max_size: usize,
162
163    /// Snapshot interval
164    pub interval: Duration,
165
166    /// Last snapshot time
167    pub last_snapshot: SystemTime,
168}
169
170/// Point-in-time metrics snapshot
171#[derive(Debug, Clone, Serialize, Deserialize)]
172pub struct MetricsSnapshot {
173    /// Snapshot timestamp
174    pub timestamp: SystemTime,
175
176    /// Overall metrics
177    pub overall: OverallCacheMetrics,
178
179    /// Individual cache stats
180    pub cache_stats: HashMap<String, CacheStats>,
181
182    /// Memory usage
183    pub memory_bytes: usize,
184
185    /// Active operations
186    pub active_operations: u32,
187}
188
189impl MetricsCollector {
190    /// Create new metrics collector
191    pub fn new() -> Self {
192        Self {
193            cache_metrics: HashMap::new(),
194            operation_timings: OperationTimings::default(),
195            memory_tracker: MemoryTracker::new(),
196            analyzer: PerformanceAnalyzer::new(),
197            history: MetricsHistory::new(1000, Duration::from_secs(60)),
198        }
199    }
200
201    /// Record cache operation
202    pub fn record_operation(&mut self, cache: &str, operation: Operation, duration: Duration) {
203        let duration_us = duration.as_micros() as u64;
204
205        match operation {
206            Operation::Get { hit } => {
207                self.operation_timings.get_timings.record(duration_us);
208
209                if let Some(stats) = self.cache_metrics.get_mut(cache) {
210                    if hit {
211                        stats.hits += 1;
212                    } else {
213                        stats.misses += 1;
214                    }
215                    stats.hit_rate = stats.hits as f32 / (stats.hits + stats.misses) as f32;
216                }
217            }
218            Operation::Put => {
219                self.operation_timings.put_timings.record(duration_us);
220            }
221            Operation::Remove => {
222                self.operation_timings.remove_timings.record(duration_us);
223            }
224            Operation::Evict => {
225                self.operation_timings.eviction_timings.record(duration_us);
226
227                if let Some(stats) = self.cache_metrics.get_mut(cache) {
228                    stats.evictions += 1;
229                }
230            }
231        }
232    }
233
234    /// Update memory usage
235    pub fn update_memory(&mut self, cache: &str, bytes: usize) {
236        self.memory_tracker.current_bytes = bytes;
237        self.memory_tracker.peak_bytes = self.memory_tracker.peak_bytes.max(bytes);
238
239        if let Some(stats) = self.cache_metrics.get_mut(cache) {
240            stats.memory_usage = bytes;
241        }
242
243        // Check for memory pressure
244        let pressure = self.calculate_memory_pressure();
245        if pressure > 0.8 {
246            self.memory_tracker
247                .pressure_events
248                .push(MemoryPressureEvent {
249                    timestamp: SystemTime::now(),
250                    memory_bytes: bytes,
251                    pressure_level: pressure,
252                    action: PressureAction::None,
253                    freed_bytes: 0,
254                });
255        }
256    }
257
258    /// Calculate memory pressure (0.0 to 1.0)
259    fn calculate_memory_pressure(&self) -> f32 {
260        // Simplified - would use system memory in real implementation
261        const MAX_MEMORY: usize = 1024 * 1024 * 1024; // 1GB
262        (self.memory_tracker.current_bytes as f32 / MAX_MEMORY as f32).min(1.0)
263    }
264
265    /// Take metrics snapshot
266    pub fn snapshot(&mut self) -> MetricsSnapshot {
267        let overall = self.calculate_overall_metrics();
268
269        MetricsSnapshot {
270            timestamp: SystemTime::now(),
271            overall,
272            cache_stats: self.cache_metrics.clone(),
273            memory_bytes: self.memory_tracker.current_bytes,
274            active_operations: 0, // Would track active operations
275        }
276    }
277
278    /// Calculate overall metrics
279    fn calculate_overall_metrics(&self) -> OverallCacheMetrics {
280        let total_hits: u64 = self.cache_metrics.values().map(|s| s.hits).sum();
281        let total_misses: u64 = self.cache_metrics.values().map(|s| s.misses).sum();
282        let total_ops = total_hits + total_misses;
283
284        let hit_rate = if total_ops > 0 {
285            total_hits as f32 / total_ops as f32
286        } else {
287            0.0
288        };
289
290        // Calculate time saved (estimated)
291        let avg_cache_time = self.operation_timings.get_timings.avg_us;
292        let avg_miss_time = avg_cache_time * 10.0; // Assume cache is 10x faster
293        let time_saved_ms = (total_hits as f32 * (avg_miss_time - avg_cache_time)) / 1000.0;
294
295        // Calculate efficiency score
296        let efficiency_score = hit_rate * 0.4
297            + (1.0 - self.calculate_memory_pressure()) * 0.3
298            + (time_saved_ms / 1000.0).min(1.0) * 0.3;
299
300        OverallCacheMetrics {
301            memory_saved: self.memory_tracker.compression_saved_bytes
302                + self.memory_tracker.deduplication_saved_bytes,
303            time_saved_ms,
304            efficiency_score,
305            memory_pressure: self.calculate_memory_pressure(),
306            ops_per_second: self.calculate_ops_per_second(),
307        }
308    }
309
310    /// Calculate operations per second
311    fn calculate_ops_per_second(&self) -> f32 {
312        // Would calculate based on recent operations
313        100.0 // Placeholder
314    }
315
316    /// Get performance report
317    pub fn get_report(&self) -> PerformanceReport {
318        PerformanceReport {
319            summary: self.get_summary(),
320            recommendations: self.generate_recommendations(),
321            alerts: self.generate_alerts(),
322            trends: self.analyze_trends(),
323        }
324    }
325
326    /// Get summary statistics
327    fn get_summary(&self) -> SummaryStats {
328        let total_hits: u64 = self.cache_metrics.values().map(|s| s.hits).sum();
329        let total_misses: u64 = self.cache_metrics.values().map(|s| s.misses).sum();
330
331        SummaryStats {
332            total_operations: total_hits + total_misses,
333            overall_hit_rate: if total_hits + total_misses > 0 {
334                total_hits as f32 / (total_hits + total_misses) as f32
335            } else {
336                0.0
337            },
338            memory_usage_mb: self.memory_tracker.current_bytes as f32 / (1024.0 * 1024.0),
339            avg_latency_us: self.operation_timings.get_timings.avg_us,
340            efficiency_score: self.calculate_overall_metrics().efficiency_score,
341        }
342    }
343
344    /// Generate performance recommendations
345    fn generate_recommendations(&self) -> Vec<String> {
346        let mut recommendations = Vec::new();
347
348        // Check hit rate
349        let summary = self.get_summary();
350        if summary.overall_hit_rate < 0.5 {
351            recommendations.push(
352                "Low hit rate detected. Consider increasing cache size or adjusting eviction policy.".to_string()
353            );
354        }
355
356        // Check memory pressure
357        if self.calculate_memory_pressure() > 0.8 {
358            recommendations
359                .push("High memory pressure. Enable compression or reduce cache size.".to_string());
360        }
361
362        // Check latency
363        if self.operation_timings.get_timings.p99_us > 1000 {
364            recommendations.push(
365                "High cache latency detected. Consider optimizing data structures.".to_string(),
366            );
367        }
368
369        recommendations
370    }
371
372    /// Generate alerts for issues
373    fn generate_alerts(&self) -> Vec<Alert> {
374        let mut alerts = Vec::new();
375
376        if self.calculate_memory_pressure() > 0.9 {
377            alerts.push(Alert {
378                severity: AlertSeverity::Critical,
379                message: "Critical memory pressure - cache may start dropping entries".to_string(),
380                timestamp: SystemTime::now(),
381            });
382        }
383
384        if self.get_summary().overall_hit_rate < 0.3 {
385            alerts.push(Alert {
386                severity: AlertSeverity::Warning,
387                message: "Very low cache hit rate - cache may not be effective".to_string(),
388                timestamp: SystemTime::now(),
389            });
390        }
391
392        alerts
393    }
394
395    /// Analyze performance trends
396    fn analyze_trends(&self) -> TrendAnalysis {
397        TrendAnalysis {
398            hit_rate_trend: Trend::Stable,
399            memory_trend: Trend::Increasing,
400            latency_trend: Trend::Stable,
401            efficiency_trend: Trend::Stable,
402        }
403    }
404}
405
/// Kind of cache operation reported to [`MetricsCollector::record_operation`].
///
/// Implements `PartialEq`/`Eq` so operations can be compared with `==`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Operation {
    /// A lookup; `hit` is `true` when it was served from the cache.
    Get { hit: bool },
    /// A put operation.
    Put,
    /// A remove operation.
    Remove,
    /// An eviction.
    Evict,
}
414
415/// Performance report
416#[derive(Debug, Clone, Serialize, Deserialize)]
417pub struct PerformanceReport {
418    pub summary: SummaryStats,
419    pub recommendations: Vec<String>,
420    pub alerts: Vec<Alert>,
421    pub trends: TrendAnalysis,
422}
423
424/// Summary statistics
425#[derive(Debug, Clone, Serialize, Deserialize)]
426pub struct SummaryStats {
427    pub total_operations: u64,
428    pub overall_hit_rate: f32,
429    pub memory_usage_mb: f32,
430    pub avg_latency_us: f32,
431    pub efficiency_score: f32,
432}
433
434/// Performance alert
435#[derive(Debug, Clone, Serialize, Deserialize)]
436pub struct Alert {
437    pub severity: AlertSeverity,
438    pub message: String,
439    pub timestamp: SystemTime,
440}
441
442/// Alert severity levels
443#[derive(Debug, Clone, Serialize, Deserialize)]
444pub enum AlertSeverity {
445    Info,
446    Warning,
447    Critical,
448}
449
450/// Trend analysis results
451#[derive(Debug, Clone, Serialize, Deserialize)]
452pub struct TrendAnalysis {
453    pub hit_rate_trend: Trend,
454    pub memory_trend: Trend,
455    pub latency_trend: Trend,
456    pub efficiency_trend: Trend,
457}
458
459/// Trend direction
460#[derive(Debug, Clone, Serialize, Deserialize)]
461pub enum Trend {
462    Increasing,
463    Decreasing,
464    Stable,
465    Volatile,
466}
467
468// Implementations
469
470impl Default for OperationTimings {
471    fn default() -> Self {
472        Self {
473            get_timings: TimingStats::default(),
474            put_timings: TimingStats::default(),
475            remove_timings: TimingStats::default(),
476            eviction_timings: TimingStats::default(),
477            compression_timings: TimingStats::default(),
478        }
479    }
480}
481
482impl Default for TimingStats {
483    fn default() -> Self {
484        Self {
485            count: 0,
486            total_us: 0,
487            avg_us: 0.0,
488            min_us: u64::MAX,
489            max_us: 0,
490            p50_us: 0,
491            p95_us: 0,
492            p99_us: 0,
493        }
494    }
495}
496
497impl TimingStats {
498    /// Record a timing measurement
499    pub fn record(&mut self, duration_us: u64) {
500        self.count += 1;
501        self.total_us += duration_us;
502        self.avg_us = self.total_us as f32 / self.count as f32;
503        self.min_us = self.min_us.min(duration_us);
504        self.max_us = self.max_us.max(duration_us);
505
506        // Update percentiles (simplified - would use proper algorithm)
507        self.p50_us = self.avg_us as u64;
508        self.p95_us = (self.avg_us * 1.5) as u64;
509        self.p99_us = (self.avg_us * 2.0) as u64;
510    }
511}
512
513impl MemoryTracker {
514    pub fn new() -> Self {
515        Self {
516            current_bytes: 0,
517            peak_bytes: 0,
518            compression_saved_bytes: 0,
519            deduplication_saved_bytes: 0,
520            pressure_events: Vec::new(),
521        }
522    }
523}
524
525impl PerformanceAnalyzer {
526    pub fn new() -> Self {
527        Self {
528            hit_rate_history: Vec::new(),
529            ops_history: Vec::new(),
530            latency_history: Vec::new(),
531            efficiency_history: Vec::new(),
532        }
533    }
534}
535
536impl MetricsHistory {
537    pub fn new(max_size: usize, interval: Duration) -> Self {
538        Self {
539            snapshots: Vec::new(),
540            max_size,
541            interval,
542            last_snapshot: SystemTime::now(),
543        }
544    }
545
546    pub fn add_snapshot(&mut self, snapshot: MetricsSnapshot) {
547        self.snapshots.push(snapshot);
548        if self.snapshots.len() > self.max_size {
549            self.snapshots.remove(0);
550        }
551        self.last_snapshot = SystemTime::now();
552    }
553}
554
#[cfg(test)]
mod tests {
    use super::*;

    /// Exact statistics (count, total, mean, min, max) after three samples.
    #[test]
    fn test_timing_stats() {
        let mut stats = TimingStats::default();

        stats.record(100);
        stats.record(200);
        stats.record(150);

        assert_eq!(stats.count, 3);
        assert_eq!(stats.total_us, 450);
        assert_eq!(stats.avg_us, 150.0);
        assert_eq!(stats.min_us, 100);
        assert_eq!(stats.max_us, 200);
    }

    /// A recorded hit updates the per-cache counters and shows up in the report.
    #[test]
    fn test_metrics_collector() {
        let mut collector = MetricsCollector::new();

        collector.cache_metrics.insert(
            "test".to_string(),
            CacheStats {
                total_entries: 100,
                hits: 80,
                misses: 20,
                hit_rate: 0.8,
                memory_usage: 1024,
                avg_access_time_us: 10.0,
                evictions: 5,
                last_cleanup: SystemTime::now(),
            },
        );

        collector.record_operation(
            "test",
            Operation::Get { hit: true },
            Duration::from_micros(10),
        );

        let stats = &collector.cache_metrics["test"];
        assert_eq!(stats.hits, 81);
        assert_eq!(stats.misses, 20);

        let report = collector.get_report();
        assert_eq!(report.summary.total_operations, 101);
        assert!((report.summary.overall_hit_rate - 81.0 / 101.0).abs() < 1e-6);
    }

    /// The tracker starts empty; current usage follows updates while the
    /// peak only ever grows.
    #[test]
    fn test_memory_tracker() {
        let tracker = MemoryTracker::new();
        assert_eq!(tracker.current_bytes, 0);
        assert_eq!(tracker.peak_bytes, 0);
        assert_eq!(tracker.compression_saved_bytes, 0);
        assert_eq!(tracker.deduplication_saved_bytes, 0);
        assert!(tracker.pressure_events.is_empty());

        let mut collector = MetricsCollector::new();
        collector.update_memory("test", 2048);
        collector.update_memory("test", 1024);

        assert_eq!(collector.memory_tracker.current_bytes, 1024);
        assert_eq!(collector.memory_tracker.peak_bytes, 2048);
        // Far below the 0.8 pressure threshold of the 1 GiB budget.
        assert!(collector.memory_tracker.pressure_events.is_empty());
    }
}