ipfrs_semantic/
diagnostics.rs

1//! Index diagnostics and health monitoring
2//!
3//! This module provides comprehensive diagnostic tools for monitoring
4//! index health, detecting performance issues, and providing actionable insights.
5
6use crate::hnsw::VectorIndex;
7use std::time::{Duration, Instant};
8
/// Coarse health classification attached to a [`DiagnosticReport`].
///
/// `diagnose_index` derives the variant from the most severe issue it
/// recorded; the per-variant notes below state which severity maps to it.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum HealthStatus {
    /// No issue of `Warning` severity or above was found
    /// (informational issues alone still count as healthy).
    Healthy,
    /// The worst issue found was a `Warning`; the index remains usable.
    Warning,
    /// The worst issue found was an `Error`; performance is significantly affected.
    Degraded,
    /// At least one `Critical` issue was found; the index is critically impaired.
    Critical,
}
21
22/// Detailed diagnostic report for an index
23#[derive(Debug, Clone)]
24pub struct DiagnosticReport {
25    /// Overall health status
26    pub status: HealthStatus,
27    /// Index size (number of vectors)
28    pub size: usize,
29    /// Memory usage estimate in bytes
30    pub memory_usage: usize,
31    /// Issues detected
32    pub issues: Vec<DiagnosticIssue>,
33    /// Recommendations for improvement
34    pub recommendations: Vec<String>,
35    /// Performance metrics
36    pub performance: PerformanceMetrics,
37}
38
39/// A specific issue detected during diagnostics
40#[derive(Debug, Clone)]
41pub struct DiagnosticIssue {
42    /// Severity level
43    pub severity: IssueSeverity,
44    /// Issue category
45    pub category: IssueCategory,
46    /// Human-readable description
47    pub description: String,
48    /// Suggested fix or mitigation
49    pub suggested_fix: Option<String>,
50}
51
/// How serious a [`DiagnosticIssue`] is.
///
/// The derived ordering follows declaration order, so
/// `Info < Warning < Error < Critical` and the worst issue in a collection
/// can be found with `max()`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum IssueSeverity {
    /// Purely informational; no action required
    Info,
    /// Should be looked at, but the index still works
    Warning,
    /// Functionality is significantly affected
    Error,
    /// Needs immediate attention
    Critical,
}
64
/// Area of the index that a [`DiagnosticIssue`] relates to.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum IssueCategory {
    /// Memory consumption (e.g. a very large estimated footprint)
    Memory,
    /// Query speed or scalability (e.g. very large or high-dimensional indexes)
    Performance,
    /// How the index is configured
    Configuration,
    /// Quality of the stored data
    DataQuality,
    /// Shape or contents of the index itself (e.g. an empty index)
    IndexStructure,
}
79
/// Query performance figures attached to a [`DiagnosticReport`].
///
/// Every field is optional; `None` means the figure was not collected.
#[derive(Debug, Clone)]
pub struct PerformanceMetrics {
    /// Mean latency of a single query, when known
    pub avg_query_latency: Option<Duration>,
    /// Fraction of lookups served from cache, in the range 0.0 - 1.0
    pub cache_hit_rate: Option<f32>,
    /// Estimated number of queries the index can serve per second
    pub estimated_qps: Option<f32>,
}
90
91/// Run comprehensive diagnostics on an index
92pub fn diagnose_index(index: &VectorIndex) -> DiagnosticReport {
93    let mut issues = Vec::new();
94    let mut recommendations = Vec::new();
95
96    let size = index.len();
97    let dimension = index.dimension();
98
99    // Estimate memory usage
100    // Each vector: dimension * 4 bytes (f32) + overhead
101    // HNSW graph: ~(M * 2 * size * 8 bytes) for connections
102    let vector_memory = size * dimension * 4;
103    let graph_memory = size * 16 * 8; // Assuming M=16
104    let overhead = size * 100; // Mappings and other overhead
105    let memory_usage = vector_memory + graph_memory + overhead;
106
107    // Check for size-related issues
108    if size == 0 {
109        issues.push(DiagnosticIssue {
110            severity: IssueSeverity::Warning,
111            category: IssueCategory::IndexStructure,
112            description: "Index is empty".to_string(),
113            suggested_fix: Some("Add vectors to the index before querying".to_string()),
114        });
115    } else if size > 10_000_000 {
116        issues.push(DiagnosticIssue {
117            severity: IssueSeverity::Warning,
118            category: IssueCategory::Performance,
119            description: format!("Very large index ({} vectors)", size),
120            suggested_fix: Some("Consider using DiskANN for datasets > 10M vectors".to_string()),
121        });
122        recommendations
123            .push("Consider partitioning the index or using distributed search".to_string());
124    }
125
126    // Check memory usage
127    if memory_usage > 10 * 1024 * 1024 * 1024 {
128        // > 10GB
129        issues.push(DiagnosticIssue {
130            severity: IssueSeverity::Warning,
131            category: IssueCategory::Memory,
132            description: format!("High memory usage: ~{:.2} GB", memory_usage as f64 / 1e9),
133            suggested_fix: Some("Consider using quantization or DiskANN".to_string()),
134        });
135    }
136
137    // Check dimension
138    if dimension > 2048 {
139        issues.push(DiagnosticIssue {
140            severity: IssueSeverity::Info,
141            category: IssueCategory::Performance,
142            description: format!("High dimensionality: {}", dimension),
143            suggested_fix: Some("Consider dimensionality reduction or PCA".to_string()),
144        });
145        recommendations
146            .push("High-dimensional vectors may benefit from dimensionality reduction".to_string());
147    }
148
149    // Determine overall health status
150    let status = if issues.iter().any(|i| i.severity == IssueSeverity::Critical) {
151        HealthStatus::Critical
152    } else if issues.iter().any(|i| i.severity == IssueSeverity::Error) {
153        HealthStatus::Degraded
154    } else if issues.iter().any(|i| i.severity == IssueSeverity::Warning) {
155        HealthStatus::Warning
156    } else {
157        HealthStatus::Healthy
158    };
159
160    DiagnosticReport {
161        status,
162        size,
163        memory_usage,
164        issues,
165        recommendations,
166        performance: PerformanceMetrics {
167            avg_query_latency: None,
168            cache_hit_rate: None,
169            estimated_qps: None,
170        },
171    }
172}
173
/// Performance profiler for search operations
///
/// The profiler does not time anything itself: callers measure each query and
/// hand the latency to [`SearchProfiler::record_query`]. Throughput is derived
/// from the wall-clock time since the profiler was created or last reset.
pub struct SearchProfiler {
    /// Moment the profiler was created or last reset; QPS is measured from here
    start_time: Instant,
    /// Number of latencies recorded since `start_time`
    query_count: usize,
    /// Sum of all recorded latencies
    total_duration: Duration,
    /// Smallest recorded latency, `None` until the first query
    min_latency: Option<Duration>,
    /// Largest recorded latency, `None` until the first query
    max_latency: Option<Duration>,
}

impl SearchProfiler {
    /// Create a new search profiler
    ///
    /// The throughput clock starts at the moment of construction.
    pub fn new() -> Self {
        Self {
            start_time: Instant::now(),
            query_count: 0,
            total_duration: Duration::ZERO,
            min_latency: None,
            max_latency: None,
        }
    }

    /// Record a query execution
    ///
    /// `duration` is the latency of one completed query as measured by the
    /// caller. The running total saturates instead of panicking if it would
    /// exceed what a `Duration` can hold.
    pub fn record_query(&mut self, duration: Duration) {
        self.query_count += 1;
        self.total_duration = self.total_duration.saturating_add(duration);

        self.min_latency = Some(
            self.min_latency
                .map_or(duration, |fastest| fastest.min(duration)),
        );
        self.max_latency = Some(
            self.max_latency
                .map_or(duration, |slowest| slowest.max(duration)),
        );
    }

    /// Get profiling statistics
    ///
    /// Returns a snapshot of everything recorded so far:
    ///
    /// * `avg_latency` is zero when no query has been recorded.
    /// * `qps` is the recorded query count divided by the fractional number of
    ///   seconds since creation/reset, so it is meaningful during the first
    ///   second as well; it is `0.0` only when no time at all has elapsed.
    pub fn stats(&self) -> ProfilerStats {
        const NANOS_PER_SEC: u128 = 1_000_000_000;

        let avg_latency = if self.query_count == 0 {
            Duration::ZERO
        } else {
            // Divide in u128 nanoseconds: narrowing the count to u32 would
            // truncate it (and divide by zero for multiples of 2^32).
            let avg_nanos = self.total_duration.as_nanos() / self.query_count as u128;
            // The average never exceeds the total, so the seconds part fits in
            // u64; the remainder is below one second and fits in u32.
            Duration::new(
                (avg_nanos / NANOS_PER_SEC) as u64,
                (avg_nanos % NANOS_PER_SEC) as u32,
            )
        };

        let elapsed = self.start_time.elapsed();
        // Use fractional seconds for the guard too; whole seconds would report
        // zero throughput for everything measured in under a second.
        let elapsed_secs = elapsed.as_secs_f64();
        let qps = if elapsed_secs > 0.0 {
            self.query_count as f64 / elapsed_secs
        } else {
            0.0
        };

        ProfilerStats {
            total_queries: self.query_count,
            avg_latency,
            min_latency: self.min_latency,
            max_latency: self.max_latency,
            qps,
            elapsed,
        }
    }

    /// Reset the profiler
    ///
    /// Discards every recorded latency and restarts the throughput clock.
    pub fn reset(&mut self) {
        self.start_time = Instant::now();
        self.query_count = 0;
        self.total_duration = Duration::ZERO;
        self.min_latency = None;
        self.max_latency = None;
    }
}

impl Default for SearchProfiler {
    /// Equivalent to [`SearchProfiler::new`].
    fn default() -> Self {
        Self::new()
    }
}

/// Statistics from search profiling
///
/// A point-in-time snapshot produced by [`SearchProfiler::stats`].
#[derive(Debug, Clone)]
pub struct ProfilerStats {
    /// Total number of queries executed
    pub total_queries: usize,
    /// Average query latency (zero when no query was recorded)
    pub avg_latency: Duration,
    /// Minimum query latency, `None` when no query was recorded
    pub min_latency: Option<Duration>,
    /// Maximum query latency, `None` when no query was recorded
    pub max_latency: Option<Duration>,
    /// Queries per second over `elapsed`
    pub qps: f64,
    /// Total elapsed time since the profiler was created or reset
    pub elapsed: Duration,
}
268
269/// Index health monitor with periodic checks
270pub struct HealthMonitor {
271    /// Last diagnostic report
272    last_report: Option<DiagnosticReport>,
273    /// Last check time
274    last_check: Option<Instant>,
275    /// Check interval
276    check_interval: Duration,
277}
278
279impl HealthMonitor {
280    /// Create a new health monitor
281    pub fn new(check_interval: Duration) -> Self {
282        Self {
283            last_report: None,
284            last_check: None,
285            check_interval,
286        }
287    }
288
289    /// Check if a health check is due
290    pub fn should_check(&self) -> bool {
291        match self.last_check {
292            Some(last) => last.elapsed() >= self.check_interval,
293            None => true,
294        }
295    }
296
297    /// Perform a health check
298    pub fn check(&mut self, index: &VectorIndex) -> &DiagnosticReport {
299        self.last_report = Some(diagnose_index(index));
300        self.last_check = Some(Instant::now());
301        self.last_report.as_ref().unwrap()
302    }
303
304    /// Get the last diagnostic report
305    pub fn last_report(&self) -> Option<&DiagnosticReport> {
306        self.last_report.as_ref()
307    }
308
309    /// Get time since last check
310    pub fn time_since_last_check(&self) -> Option<Duration> {
311        self.last_check.map(|t| t.elapsed())
312    }
313}
314
315impl Default for HealthMonitor {
316    fn default() -> Self {
317        Self::new(Duration::from_secs(300)) // 5 minutes default
318    }
319}
320
#[cfg(test)]
mod tests {
    use super::*;

    /// An index without vectors must be flagged with an index-structure issue.
    #[test]
    fn test_diagnose_empty_index() {
        let empty = VectorIndex::with_defaults(128).unwrap();
        let report = diagnose_index(&empty);

        assert_eq!(report.size, 0);
        assert!(!report.issues.is_empty());

        let has_structure_issue = report
            .issues
            .iter()
            .any(|issue| issue.category == IssueCategory::IndexStructure);
        assert!(has_structure_issue);
    }

    /// A small populated index is at worst in the `Warning` state.
    #[test]
    fn test_diagnose_normal_index() {
        let mut index = VectorIndex::with_defaults(128).unwrap();
        let cid = "bafybeigdyrzt5sfp7udm7hu76uh7y26nf3efuylqabf3oclgtqy55fbzdi"
            .parse::<ipfrs_core::Cid>()
            .unwrap();
        index.insert(&cid, &vec![0.1; 128]).unwrap();

        let report = diagnose_index(&index);

        assert_eq!(report.size, 1);
        assert!(matches!(
            report.status,
            HealthStatus::Healthy | HealthStatus::Warning
        ));
    }

    /// Count, average, minimum and maximum reflect the recorded latencies.
    #[test]
    fn test_search_profiler() {
        let mut profiler = SearchProfiler::new();
        for millis in [10, 20, 15] {
            profiler.record_query(Duration::from_millis(millis));
        }

        let stats = profiler.stats();
        let avg_millis = stats.avg_latency.as_millis();

        assert_eq!(stats.total_queries, 3);
        assert!(avg_millis >= 10);
        assert!(avg_millis <= 20);
        assert_eq!(stats.min_latency, Some(Duration::from_millis(10)));
        assert_eq!(stats.max_latency, Some(Duration::from_millis(20)));
    }

    /// A check is due initially, not right after checking, and again once the
    /// interval has elapsed.
    #[test]
    fn test_health_monitor() {
        let interval = Duration::from_millis(100);
        let mut monitor = HealthMonitor::new(interval);
        let index = VectorIndex::with_defaults(128).unwrap();

        assert!(monitor.should_check());

        monitor.check(&index);

        assert!(!monitor.should_check());
        assert!(monitor.last_report().is_some());

        // Wait comfortably longer than the interval before polling again.
        std::thread::sleep(Duration::from_millis(150));
        assert!(monitor.should_check());
    }

    /// Resetting wipes the query count and both latency extremes.
    #[test]
    fn test_profiler_reset() {
        let mut profiler = SearchProfiler::new();
        for millis in [10, 20] {
            profiler.record_query(Duration::from_millis(millis));
        }

        assert_eq!(profiler.stats().total_queries, 2);

        profiler.reset();

        assert_eq!(profiler.stats().total_queries, 0);
        assert_eq!(profiler.stats().min_latency, None);
        assert_eq!(profiler.stats().max_latency, None);
    }
}