oxify_vector/
profiling.rs

1//! Query Profiling and Analysis
2//!
3//! Tools for analyzing and profiling vector search queries to help optimize performance.
4//!
5//! ## Features
6//!
7//! - **Query Profiling**: Detailed performance analysis of search operations
8//! - **Bottleneck Detection**: Identify performance bottlenecks
9//! - **Recommendations**: Get optimization suggestions based on query patterns
10//! - **Index Health**: Check index health and identify issues
11//!
12//! ## Example
13//!
14//! ```rust
15//! use oxify_vector::profiling::{QueryProfiler, ProfilingConfig};
16//! use oxify_vector::{VectorSearchIndex, SearchConfig};
17//! use std::collections::HashMap;
18//!
19//! # fn example() -> anyhow::Result<()> {
20//! // Create index
21//! let mut embeddings = HashMap::new();
22//! embeddings.insert("doc1".to_string(), vec![0.1, 0.2, 0.3]);
23//! let mut index = VectorSearchIndex::new(SearchConfig::default());
24//! index.build(&embeddings)?;
25//!
26//! // Profile a query
27//! let config = ProfilingConfig::default();
28//! let mut profiler = QueryProfiler::new(config);
29//!
30//! let query = vec![0.2, 0.3, 0.4];
31//! let profile = profiler.profile_search(|| {
32//!     index.search(&query, 10)
33//! })?;
34//!
35//! println!("Query took: {:?}", profile.total_duration);
36//! println!("Recommendations: {:?}", profile.recommendations);
37//! # Ok(())
38//! # }
39//! ```
40
41use anyhow::Result;
42use serde::{Deserialize, Serialize};
43use std::time::{Duration, Instant};
44
45use crate::types::SearchResult;
46
/// Profiling configuration
///
/// Settings consumed by [`QueryProfiler`]. `Default` yields detailed timing on,
/// memory profiling off, a 100 ms slow-query threshold and recommendations on.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProfilingConfig {
    /// Enable detailed timing breakdowns
    // NOTE(review): nothing in this module reads this flag — presumably reserved
    // for (or consumed by) code elsewhere; confirm before relying on it.
    pub detailed_timing: bool,
    /// Enable memory profiling
    // NOTE(review): nothing in this module reads this flag either — confirm.
    pub memory_profiling: bool,
    /// Threshold for slow query detection (milliseconds)
    ///
    /// A query counts as slow when its elapsed time, in whole milliseconds,
    /// is strictly greater than this value.
    pub slow_query_threshold_ms: u64,
    /// Enable automatic recommendations
    ///
    /// When `false`, profiles are returned with an empty recommendation list.
    pub enable_recommendations: bool,
}
59
60impl Default for ProfilingConfig {
61    fn default() -> Self {
62        Self {
63            detailed_timing: true,
64            memory_profiling: false,
65            slow_query_threshold_ms: 100,
66            enable_recommendations: true,
67        }
68    }
69}
70
/// Performance bottleneck type
///
/// Classification stored in [`QueryProfile::bottleneck`]. The detection code in
/// this module only ever yields [`Bottleneck::DatasetSize`] or
/// [`Bottleneck::None`]; the other variants have recommendations mapped to them
/// but are not produced by the detection logic visible here.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum Bottleneck {
    /// Query vector is too large (high dimensionality)
    HighDimensionality,
    /// Dataset is too large for current strategy
    DatasetSize,
    /// Filter is not selective enough
    FilterSelectivity,
    /// k value is too high
    HighK,
    /// No specific bottleneck detected
    // Shares its name with `Option::None`; this file always writes it
    // fully qualified as `Bottleneck::None`.
    None,
}
85
/// Optimization recommendation
///
/// A single piece of tuning advice, produced either while profiling a query or
/// by the index health check.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Recommendation {
    /// Recommendation type
    ///
    /// Free-form label such as `"Index Strategy"` or `"Dimensionality"`.
    pub category: String,
    /// Description of the recommendation
    pub description: String,
    /// Expected performance improvement
    pub impact: ImpactLevel,
}
96
/// Impact level of a recommendation
///
/// Coarse estimate of how much a [`Recommendation`] is expected to help.
/// The percentage bands below are descriptive only; no code in this module
/// computes or checks them.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum ImpactLevel {
    /// High impact (>50% improvement)
    High,
    /// Medium impact (20-50% improvement)
    Medium,
    /// Low impact (<20% improvement)
    Low,
}
107
/// Query profile result
///
/// Returned by `QueryProfiler::profile_search` after the search closure has
/// completed successfully.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QueryProfile {
    /// Total query duration
    ///
    /// Elapsed time measured with `Instant` around the search closure only.
    pub total_duration: Duration,
    /// Number of results returned
    pub result_count: usize,
    /// Detected bottleneck
    pub bottleneck: Bottleneck,
    /// Optimization recommendations
    ///
    /// Always empty when `ProfilingConfig::enable_recommendations` is `false`.
    pub recommendations: Vec<Recommendation>,
    /// Whether this is considered a slow query
    ///
    /// `true` when the duration in whole milliseconds is strictly greater than
    /// `ProfilingConfig::slow_query_threshold_ms`.
    pub is_slow_query: bool,
}
122
/// Query profiler
///
/// Times a search closure and derives a [`QueryProfile`] from the elapsed time
/// and the number of results. Its only state is the configuration.
#[derive(Debug, Clone)]
pub struct QueryProfiler {
    /// Settings applied to every profiled query; readable through `config()`.
    config: ProfilingConfig,
}
128
129impl QueryProfiler {
130    /// Create a new query profiler
131    pub fn new(config: ProfilingConfig) -> Self {
132        Self { config }
133    }
134
135    /// Profile a search operation
136    ///
137    /// # Arguments
138    /// * `f` - Function that performs the search
139    pub fn profile_search<F>(&mut self, f: F) -> Result<QueryProfile>
140    where
141        F: FnOnce() -> Result<Vec<SearchResult>>,
142    {
143        let start = Instant::now();
144        let results = f()?;
145        let duration = start.elapsed();
146
147        let result_count = results.len();
148        let is_slow_query = duration.as_millis() > self.config.slow_query_threshold_ms as u128;
149
150        let bottleneck = self.detect_bottleneck(&results, duration);
151        let recommendations = if self.config.enable_recommendations {
152            self.generate_recommendations(&bottleneck, duration, result_count)
153        } else {
154            Vec::new()
155        };
156
157        Ok(QueryProfile {
158            total_duration: duration,
159            result_count,
160            bottleneck,
161            recommendations,
162            is_slow_query,
163        })
164    }
165
166    /// Detect performance bottlenecks
167    fn detect_bottleneck(&self, _results: &[SearchResult], duration: Duration) -> Bottleneck {
168        // Analyze query characteristics to detect bottlenecks
169        if duration.as_millis() > 1000 {
170            // Very slow query - likely dataset size issue
171            Bottleneck::DatasetSize
172        } else {
173            Bottleneck::None
174        }
175    }
176
177    /// Generate optimization recommendations
178    fn generate_recommendations(
179        &self,
180        bottleneck: &Bottleneck,
181        duration: Duration,
182        result_count: usize,
183    ) -> Vec<Recommendation> {
184        let mut recommendations = Vec::new();
185
186        match bottleneck {
187            Bottleneck::DatasetSize => {
188                recommendations.push(Recommendation {
189                    category: "Index Strategy".to_string(),
190                    description:
191                        "Consider using HNSW or IVF-PQ for approximate search on large datasets"
192                            .to_string(),
193                    impact: ImpactLevel::High,
194                });
195            }
196            Bottleneck::HighDimensionality => {
197                recommendations.push(Recommendation {
198                    category: "Dimensionality".to_string(),
199                    description: "Consider using dimensionality reduction (PCA) or quantization"
200                        .to_string(),
201                    impact: ImpactLevel::Medium,
202                });
203            }
204            Bottleneck::FilterSelectivity => {
205                recommendations.push(Recommendation {
206                    category: "Filtering".to_string(),
207                    description: "Use pre-filtering for highly selective filters".to_string(),
208                    impact: ImpactLevel::Medium,
209                });
210            }
211            Bottleneck::HighK => {
212                recommendations.push(Recommendation {
213                    category: "Query Parameters".to_string(),
214                    description: "Reduce k value if you don't need all top results".to_string(),
215                    impact: ImpactLevel::Low,
216                });
217            }
218            Bottleneck::None => {}
219        }
220
221        // Add general recommendations for slow queries
222        if duration.as_millis() > self.config.slow_query_threshold_ms as u128 && result_count > 100
223        {
224            recommendations.push(Recommendation {
225                category: "Result Count".to_string(),
226                description: "Consider reducing k to improve query speed".to_string(),
227                impact: ImpactLevel::Low,
228            });
229        }
230
231        recommendations
232    }
233
234    /// Get profiler configuration
235    pub fn config(&self) -> &ProfilingConfig {
236        &self.config
237    }
238}
239
/// Index health checker
///
/// Stateless unit type: `check_health` derives its recommendations purely from
/// the figures passed to it.
#[derive(Debug)]
pub struct IndexHealthChecker;
243
244impl IndexHealthChecker {
245    /// Create a new health checker
246    pub fn new() -> Self {
247        Self
248    }
249
250    /// Check index health and return recommendations
251    ///
252    /// # Arguments
253    /// * `num_vectors` - Number of vectors in index
254    /// * `dimensions` - Vector dimensions
255    /// * `avg_query_time_ms` - Average query time in milliseconds
256    pub fn check_health(
257        &self,
258        num_vectors: usize,
259        dimensions: usize,
260        avg_query_time_ms: f64,
261    ) -> Vec<Recommendation> {
262        let mut recommendations = Vec::new();
263
264        // Check if dimensionality is too high
265        if dimensions > 1024 {
266            recommendations.push(Recommendation {
267                category: "Dimensionality".to_string(),
268                description: format!(
269                    "Vector dimensionality ({}) is very high. Consider dimensionality reduction.",
270                    dimensions
271                ),
272                impact: ImpactLevel::Medium,
273            });
274        }
275
276        // Check if dataset is large but queries are slow
277        if num_vectors > 100_000 && avg_query_time_ms > 50.0 {
278            recommendations.push(Recommendation {
279                category: "Index Strategy".to_string(),
280                description: "Large dataset with slow queries. Consider using HNSW or IVF-PQ."
281                    .to_string(),
282                impact: ImpactLevel::High,
283            });
284        }
285
286        // Check if dataset is huge
287        if num_vectors > 10_000_000 {
288            recommendations.push(Recommendation {
289                category: "Scalability".to_string(),
290                description: "Very large dataset. Consider distributed search with sharding."
291                    .to_string(),
292                impact: ImpactLevel::High,
293            });
294        }
295
296        // Check if queries are consistently slow
297        if avg_query_time_ms > 100.0 {
298            recommendations.push(Recommendation {
299                category: "Performance".to_string(),
300                description:
301                    "Queries are slow. Consider enabling SIMD optimizations or using quantization."
302                        .to_string(),
303                impact: ImpactLevel::High,
304            });
305        }
306
307        recommendations
308    }
309}
310
311impl Default for IndexHealthChecker {
312    fn default() -> Self {
313        Self::new()
314    }
315}
316
#[cfg(test)]
mod tests {
    //! Unit tests for the profiler and the index health checker.
    //!
    //! Assertions pin the exact recommendation that each rule produces rather
    //! than only checking that "something" was returned.

    use super::*;

    #[test]
    fn test_profiling_config_default() {
        let config = ProfilingConfig::default();
        assert!(config.detailed_timing);
        assert!(!config.memory_profiling);
        assert!(config.enable_recommendations);
        assert_eq!(config.slow_query_threshold_ms, 100);
    }

    #[test]
    fn test_query_profiler_creation() {
        let config = ProfilingConfig::default();
        let profiler = QueryProfiler::new(config);
        assert!(profiler.config().enable_recommendations);
    }

    #[test]
    fn test_profile_fast_query() {
        let config = ProfilingConfig::default();
        let mut profiler = QueryProfiler::new(config);

        let profile = profiler
            .profile_search(|| -> Result<Vec<SearchResult>> {
                // Simulate fast query
                std::thread::sleep(Duration::from_millis(10));
                Ok(vec![SearchResult {
                    entity_id: "doc1".to_string(),
                    score: 0.95,
                    distance: 0.05,
                    rank: 1,
                }])
            })
            .unwrap();

        assert_eq!(profile.result_count, 1);
        assert!(!profile.is_slow_query);
    }

    #[test]
    fn test_profile_slow_query() {
        let config = ProfilingConfig {
            slow_query_threshold_ms: 50,
            ..Default::default()
        };
        let mut profiler = QueryProfiler::new(config);

        let profile = profiler
            .profile_search(|| -> Result<Vec<SearchResult>> {
                // Simulate slow query
                std::thread::sleep(Duration::from_millis(150));
                Ok(vec![])
            })
            .unwrap();

        assert!(profile.is_slow_query);
        assert!(profile.total_duration.as_millis() >= 150);
        assert_eq!(profile.result_count, 0);
    }

    #[test]
    fn test_profile_search_propagates_error() {
        let config = ProfilingConfig::default();
        let mut profiler = QueryProfiler::new(config);

        // Any std error converts into the closure's error type via `into()`.
        let outcome = profiler
            .profile_search(|| -> Result<Vec<SearchResult>> { Err(std::fmt::Error.into()) });

        assert!(outcome.is_err());
    }

    #[test]
    fn test_bottleneck_detection_slow_query() {
        let config = ProfilingConfig::default();
        let profiler = QueryProfiler::new(config);

        let results = vec![];
        let duration = Duration::from_millis(2000);

        let bottleneck = profiler.detect_bottleneck(&results, duration);
        assert_eq!(bottleneck, Bottleneck::DatasetSize);
    }

    #[test]
    fn test_bottleneck_detection_fast_query() {
        let config = ProfilingConfig::default();
        let profiler = QueryProfiler::new(config);

        let results = vec![];
        let duration = Duration::from_millis(10);

        let bottleneck = profiler.detect_bottleneck(&results, duration);
        assert_eq!(bottleneck, Bottleneck::None);
    }

    #[test]
    fn test_generate_recommendations_dataset_size() {
        let config = ProfilingConfig::default();
        let profiler = QueryProfiler::new(config);

        let recommendations = profiler.generate_recommendations(
            &Bottleneck::DatasetSize,
            Duration::from_millis(100),
            10,
        );

        // 100 ms does not exceed the 100 ms threshold, so only the
        // bottleneck-specific recommendation is expected.
        assert_eq!(recommendations.len(), 1);
        assert_eq!(recommendations[0].category, "Index Strategy");
        assert_eq!(recommendations[0].impact, ImpactLevel::High);
    }

    #[test]
    fn test_generate_recommendations_other_bottlenecks() {
        let config = ProfilingConfig::default();
        let profiler = QueryProfiler::new(config);
        let fast = Duration::from_millis(5);

        let cases = [
            (
                Bottleneck::HighDimensionality,
                "Dimensionality",
                ImpactLevel::Medium,
            ),
            (
                Bottleneck::FilterSelectivity,
                "Filtering",
                ImpactLevel::Medium,
            ),
            (Bottleneck::HighK, "Query Parameters", ImpactLevel::Low),
        ];

        for (bottleneck, category, impact) in cases.iter() {
            let recommendations = profiler.generate_recommendations(bottleneck, fast, 10);
            assert_eq!(recommendations.len(), 1);
            assert_eq!(recommendations[0].category, *category);
            assert_eq!(recommendations[0].impact, *impact);
        }

        assert!(profiler
            .generate_recommendations(&Bottleneck::None, fast, 10)
            .is_empty());
    }

    #[test]
    fn test_generate_recommendations_high_k() {
        let config = ProfilingConfig::default();
        let profiler = QueryProfiler::new(config);

        let recommendations =
            profiler.generate_recommendations(&Bottleneck::None, Duration::from_millis(150), 200);

        // Slow query with many results: exactly the "reduce k" advice.
        assert_eq!(recommendations.len(), 1);
        assert_eq!(recommendations[0].category, "Result Count");
        assert_eq!(recommendations[0].impact, ImpactLevel::Low);
    }

    #[test]
    fn test_index_health_checker_creation() {
        let checker = IndexHealthChecker::new();
        let recommendations = checker.check_health(1000, 768, 10.0);
        assert!(recommendations.is_empty()); // Small dataset, reasonable performance
    }

    #[test]
    fn test_index_health_high_dimensionality() {
        let checker = IndexHealthChecker::new();
        let recommendations = checker.check_health(10_000, 2048, 10.0);

        assert!(!recommendations.is_empty());
        assert!(recommendations
            .iter()
            .any(|r| r.category == "Dimensionality"));
    }

    #[test]
    fn test_index_health_large_dataset_slow() {
        let checker = IndexHealthChecker::new();
        let recommendations = checker.check_health(200_000, 768, 100.0);

        // 200k vectors at 100 ms trips the large-and-slow rule (> 50 ms).
        assert!(!recommendations.is_empty());
        assert!(recommendations
            .iter()
            .any(|r| r.category == "Index Strategy"));
    }

    #[test]
    fn test_index_health_very_large_dataset() {
        let checker = IndexHealthChecker::new();
        let recommendations = checker.check_health(15_000_000, 768, 50.0);

        assert!(!recommendations.is_empty());
        assert!(recommendations.iter().any(|r| r.category == "Scalability"));
    }

    #[test]
    fn test_recommendation_impact_levels() {
        let high_impact = Recommendation {
            category: "Test".to_string(),
            description: "Test".to_string(),
            impact: ImpactLevel::High,
        };

        let medium_impact = Recommendation {
            category: "Test".to_string(),
            description: "Test".to_string(),
            impact: ImpactLevel::Medium,
        };

        let low_impact = Recommendation {
            category: "Test".to_string(),
            description: "Test".to_string(),
            impact: ImpactLevel::Low,
        };

        assert_eq!(high_impact.impact, ImpactLevel::High);
        assert_eq!(medium_impact.impact, ImpactLevel::Medium);
        assert_eq!(low_impact.impact, ImpactLevel::Low);
    }
}