Skip to main content

oxirs_vec/
dynamic_index_selector.rs

1//! Dynamic index selection for optimal query performance
2//!
3//! This module provides runtime index selection based on query characteristics,
4//! automatically choosing the best index type (HNSW, NSG, IVF, PQ, etc.) for each query.
5//!
6//! # Features
7//!
8//! - **Automatic Strategy Selection**: Uses cost-based query planning
9//! - **Multiple Index Support**: Maintains multiple indices for different use cases
10//! - **Performance Learning**: Tracks actual performance to improve future selections
11//! - **Adaptive Parameters**: Automatically tunes parameters based on query requirements
12//!
13//! # Example
14//!
15//! ```rust,ignore
16//! use oxirs_vec::dynamic_index_selector::{DynamicIndexSelector, IndexSelectorConfig};
17//! use oxirs_vec::{Vector, VectorIndex};
18//!
19//! let config = IndexSelectorConfig::default();
20//! let mut selector = DynamicIndexSelector::new(config).unwrap();
21//!
22//! // Add vectors - they'll be indexed in all configured indices
23//! for i in 0..1000 {
24//!     let vec = Vector::new(vec![i as f32, (i * 2) as f32]);
25//!     selector.add(format!("vec_{}", i), vec).unwrap();
26//! }
27//!
28//! // Build all indices
29//! selector.build().unwrap();
30//!
31//! // Search - automatically selects best index
32//! let query = Vector::new(vec![500.0, 1000.0]);
33//! let results = selector.search_knn(&query, 10).unwrap();
34//! ```
35
36use crate::query_planning::*;
37use crate::{hnsw::HnswIndex, ivf::IvfIndex, lsh::LshIndex, nsg::NsgIndex};
38use crate::{Vector, VectorIndex};
39use anyhow::Result;
40use std::collections::HashMap;
41use std::sync::{Arc, RwLock};
42use tracing::{debug, info};
43
44/// Configuration for dynamic index selector
45#[derive(Debug, Clone)]
46pub struct IndexSelectorConfig {
47    /// Enable HNSW index
48    pub enable_hnsw: bool,
49    /// Enable NSG index
50    pub enable_nsg: bool,
51    /// Enable IVF index
52    pub enable_ivf: bool,
53    /// Enable LSH index
54    pub enable_lsh: bool,
55    /// Minimum recall requirement (0.0 to 1.0)
56    pub min_recall: f32,
57    /// Maximum acceptable latency (milliseconds)
58    pub max_latency_ms: f64,
59    /// Enable performance learning
60    pub enable_learning: bool,
61    /// Build all indices immediately
62    pub eager_build: bool,
63}
64
65impl Default for IndexSelectorConfig {
66    fn default() -> Self {
67        Self {
68            enable_hnsw: true,
69            enable_nsg: true,
70            enable_ivf: true,
71            enable_lsh: false, // LSH is less commonly used
72            min_recall: 0.90,
73            max_latency_ms: 100.0,
74            enable_learning: true,
75            eager_build: true,
76        }
77    }
78}
79
80/// Dynamic index selector with multiple index backends
81pub struct DynamicIndexSelector {
82    config: IndexSelectorConfig,
83    hnsw_index: Option<HnswIndex>,
84    nsg_index: Option<NsgIndex>,
85    ivf_index: Option<IvfIndex>,
86    lsh_index: Option<LshIndex>,
87    query_planner: Arc<RwLock<QueryPlanner>>,
88    data: Vec<(String, Vector)>,
89    is_built: bool,
90    performance_stats: Arc<RwLock<PerformanceStats>>,
91}
92
93/// Performance statistics for learning
94#[derive(Debug, Clone, Default)]
95struct PerformanceStats {
96    strategy_latencies: HashMap<QueryStrategy, Vec<f64>>,
97    strategy_recalls: HashMap<QueryStrategy, Vec<f32>>,
98    total_queries: usize,
99}
100
101impl PerformanceStats {
102    fn record(&mut self, strategy: QueryStrategy, latency_ms: f64, recall: f32) {
103        self.strategy_latencies
104            .entry(strategy)
105            .or_default()
106            .push(latency_ms);
107
108        self.strategy_recalls
109            .entry(strategy)
110            .or_default()
111            .push(recall);
112
113        self.total_queries += 1;
114    }
115
116    fn avg_latency(&self, strategy: QueryStrategy) -> Option<f64> {
117        self.strategy_latencies
118            .get(&strategy)
119            .and_then(|latencies| {
120                if latencies.is_empty() {
121                    None
122                } else {
123                    Some(latencies.iter().sum::<f64>() / latencies.len() as f64)
124                }
125            })
126    }
127
128    fn avg_recall(&self, strategy: QueryStrategy) -> Option<f32> {
129        self.strategy_recalls.get(&strategy).and_then(|recalls| {
130            if recalls.is_empty() {
131                None
132            } else {
133                Some(recalls.iter().sum::<f32>() / recalls.len() as f32)
134            }
135        })
136    }
137}
138
139impl DynamicIndexSelector {
140    /// Create a new dynamic index selector
141    pub fn new(config: IndexSelectorConfig) -> Result<Self> {
142        // Determine available indices based on config
143        let mut available_indices = Vec::new();
144        if config.enable_hnsw {
145            available_indices.push(QueryStrategy::HnswApproximate);
146        }
147        if config.enable_nsg {
148            available_indices.push(QueryStrategy::NsgApproximate);
149        }
150        if config.enable_ivf {
151            available_indices.push(QueryStrategy::IvfCoarse);
152        }
153        if config.enable_lsh {
154            available_indices.push(QueryStrategy::LocalitySensitiveHashing);
155        }
156
157        if available_indices.is_empty() {
158            return Err(anyhow::anyhow!("At least one index type must be enabled"));
159        }
160
161        // Create initial index statistics
162        let index_stats = IndexStatistics {
163            vector_count: 0,
164            dimensions: 0,
165            available_indices,
166            avg_latencies: HashMap::new(),
167            avg_recalls: HashMap::new(),
168        };
169
170        let cost_model = CostModel::default();
171        let query_planner = Arc::new(RwLock::new(QueryPlanner::new(cost_model, index_stats)));
172
173        Ok(Self {
174            config,
175            hnsw_index: None,
176            nsg_index: None,
177            ivf_index: None,
178            lsh_index: None,
179            query_planner,
180            data: Vec::new(),
181            is_built: false,
182            performance_stats: Arc::new(RwLock::new(PerformanceStats::default())),
183        })
184    }
185
186    /// Add a vector to all enabled indices
187    pub fn add(&mut self, uri: String, vector: Vector) -> Result<()> {
188        if self.is_built && self.config.eager_build {
189            return Err(anyhow::anyhow!(
190                "Cannot add vectors after indices are built in eager mode"
191            ));
192        }
193
194        self.data.push((uri, vector));
195        Ok(())
196    }
197
198    /// Build all enabled indices
199    pub fn build(&mut self) -> Result<()> {
200        if self.data.is_empty() {
201            return Err(anyhow::anyhow!("No vectors to index"));
202        }
203
204        let dimensions = self.data[0].1.dimensions;
205        let vector_count = self.data.len();
206
207        info!(
208            "Building dynamic index selector with {} vectors, {} dimensions",
209            vector_count, dimensions
210        );
211
212        // Build HNSW index
213        if self.config.enable_hnsw {
214            debug!("Building HNSW index");
215            let mut hnsw = HnswIndex::new(Default::default())?;
216            for (uri, vec) in &self.data {
217                hnsw.insert(uri.clone(), vec.clone())?;
218            }
219            self.hnsw_index = Some(hnsw);
220        }
221
222        // Build NSG index
223        if self.config.enable_nsg {
224            debug!("Building NSG index");
225            let mut nsg = NsgIndex::new(Default::default())?;
226            for (uri, vec) in &self.data {
227                nsg.insert(uri.clone(), vec.clone())?;
228            }
229            nsg.build()?;
230            self.nsg_index = Some(nsg);
231        }
232
233        // Build IVF index
234        if self.config.enable_ivf {
235            debug!("Building IVF index");
236            let mut ivf = IvfIndex::new(Default::default())?;
237            for (uri, vec) in &self.data {
238                ivf.insert(uri.clone(), vec.clone())?;
239            }
240            // IVF trains clusters automatically during insertion
241            self.ivf_index = Some(ivf);
242        }
243
244        // Build LSH index
245        if self.config.enable_lsh {
246            debug!("Building LSH index");
247            let lsh = LshIndex::new(Default::default());
248            let mut lsh_mut = lsh;
249            for (uri, vec) in &self.data {
250                lsh_mut.insert(uri.clone(), vec.clone())?;
251            }
252            self.lsh_index = Some(lsh_mut);
253        }
254
255        // Update query planner statistics
256        let mut planner = self
257            .query_planner
258            .write()
259            .expect("query_planner write lock should not be poisoned");
260        planner.update_index_metadata(vector_count, dimensions);
261
262        self.is_built = true;
263
264        info!("Dynamic index selector built successfully");
265
266        Ok(())
267    }
268
269    /// Search with automatic index selection
270    pub fn search_knn(&self, query: &Vector, k: usize) -> Result<Vec<(String, f32)>> {
271        if !self.is_built {
272            return Err(anyhow::anyhow!("Indices not built. Call build() first."));
273        }
274
275        // Create query characteristics
276        let query_chars = QueryCharacteristics {
277            k,
278            dimensions: query.dimensions,
279            min_recall: self.config.min_recall,
280            max_latency_ms: self.config.max_latency_ms,
281            query_type: VectorQueryType::Single,
282        };
283
284        // Get query plan
285        let planner = self
286            .query_planner
287            .read()
288            .expect("query_planner read lock should not be poisoned");
289        let plan = planner.plan(&query_chars)?;
290        drop(planner); // Release read lock
291
292        debug!(
293            "Selected strategy: {:?} (estimated cost: {:.2} µs, recall: {:.2})",
294            plan.strategy, plan.estimated_cost_us, plan.estimated_recall
295        );
296
297        // Execute query using selected strategy
298        let start = std::time::Instant::now();
299        let results = self.execute_strategy(plan.strategy, query, k)?;
300        let elapsed = start.elapsed().as_secs_f64() * 1000.0; // Convert to ms
301
302        // Record performance if learning is enabled
303        if self.config.enable_learning {
304            let mut stats = self
305                .performance_stats
306                .write()
307                .expect("performance_stats write lock should not be poisoned");
308            stats.record(plan.strategy, elapsed, plan.estimated_recall);
309            drop(stats);
310
311            // Update query planner with actual performance
312            let mut planner = self
313                .query_planner
314                .write()
315                .expect("query_planner write lock should not be poisoned");
316            if let Some(avg_latency) = self
317                .performance_stats
318                .read()
319                .expect("performance_stats read lock should not be poisoned")
320                .avg_latency(plan.strategy)
321            {
322                planner.update_statistics(plan.strategy, avg_latency, plan.estimated_recall);
323            }
324        }
325
326        Ok(results)
327    }
328
329    /// Execute query using specific strategy
330    fn execute_strategy(
331        &self,
332        strategy: QueryStrategy,
333        query: &Vector,
334        k: usize,
335    ) -> Result<Vec<(String, f32)>> {
336        match strategy {
337            QueryStrategy::HnswApproximate => {
338                if let Some(ref index) = self.hnsw_index {
339                    index.search_knn(query, k)
340                } else {
341                    Err(anyhow::anyhow!("HNSW index not available"))
342                }
343            }
344            QueryStrategy::NsgApproximate => {
345                if let Some(ref index) = self.nsg_index {
346                    index.search_knn(query, k)
347                } else {
348                    Err(anyhow::anyhow!("NSG index not available"))
349                }
350            }
351            QueryStrategy::IvfCoarse => {
352                if let Some(ref index) = self.ivf_index {
353                    index.search_knn(query, k)
354                } else {
355                    Err(anyhow::anyhow!("IVF index not available"))
356                }
357            }
358            QueryStrategy::LocalitySensitiveHashing => {
359                if let Some(ref index) = self.lsh_index {
360                    index.search_knn(query, k)
361                } else {
362                    Err(anyhow::anyhow!("LSH index not available"))
363                }
364            }
365            _ => Err(anyhow::anyhow!(
366                "Strategy {:?} not supported by dynamic selector",
367                strategy
368            )),
369        }
370    }
371
372    /// Get performance statistics
373    pub fn get_stats(&self) -> HashMap<String, String> {
374        let mut stats = HashMap::new();
375        let perf_stats = self
376            .performance_stats
377            .read()
378            .expect("performance_stats read lock should not be poisoned");
379
380        stats.insert(
381            "total_queries".to_string(),
382            perf_stats.total_queries.to_string(),
383        );
384        stats.insert("vector_count".to_string(), self.data.len().to_string());
385        stats.insert("is_built".to_string(), self.is_built.to_string());
386
387        // Add per-strategy stats
388        for strategy in &[
389            QueryStrategy::HnswApproximate,
390            QueryStrategy::NsgApproximate,
391            QueryStrategy::IvfCoarse,
392            QueryStrategy::LocalitySensitiveHashing,
393        ] {
394            if let Some(avg_lat) = perf_stats.avg_latency(*strategy) {
395                stats.insert(
396                    format!("{:?}_avg_latency_ms", strategy),
397                    format!("{:.2}", avg_lat),
398                );
399            }
400            if let Some(avg_rec) = perf_stats.avg_recall(*strategy) {
401                stats.insert(
402                    format!("{:?}_avg_recall", strategy),
403                    format!("{:.2}", avg_rec),
404                );
405            }
406        }
407
408        stats
409    }
410
411    /// Check if indices are built
412    pub fn is_built(&self) -> bool {
413        self.is_built
414    }
415
416    /// Get number of vectors
417    pub fn len(&self) -> usize {
418        self.data.len()
419    }
420
421    /// Check if empty
422    pub fn is_empty(&self) -> bool {
423        self.data.is_empty()
424    }
425}
426
427#[cfg(test)]
428mod tests {
429    use super::*;
430
431    #[test]
432    fn test_dynamic_selector_creation() {
433        let config = IndexSelectorConfig::default();
434        let selector = DynamicIndexSelector::new(config);
435        assert!(selector.is_ok());
436    }
437
438    #[test]
439    fn test_add_vectors() {
440        let config = IndexSelectorConfig::default();
441        let mut selector = DynamicIndexSelector::new(config).unwrap();
442
443        for i in 0..10 {
444            let vec = Vector::new(vec![i as f32, (i * 2) as f32]);
445            selector.add(format!("vec_{}", i), vec).unwrap();
446        }
447
448        assert_eq!(selector.len(), 10);
449    }
450
451    #[test]
452    fn test_build_and_search() {
453        let config = IndexSelectorConfig {
454            enable_hnsw: true,
455            enable_nsg: true,
456            enable_ivf: false, // Disable IVF to speed up test
457            enable_lsh: false,
458            ..Default::default()
459        };
460        let mut selector = DynamicIndexSelector::new(config).unwrap();
461
462        // Add test vectors
463        for i in 0..50 {
464            let vec = Vector::new(vec![i as f32, (i * 2) as f32, (i * 3) as f32]);
465            selector.add(format!("vec_{}", i), vec).unwrap();
466        }
467
468        // Build indices
469        selector.build().unwrap();
470        assert!(selector.is_built());
471
472        // Search
473        let query = Vector::new(vec![25.0, 50.0, 75.0]);
474        let results = selector.search_knn(&query, 5).unwrap();
475
476        assert_eq!(results.len(), 5);
477        // Results should be sorted by similarity
478        for i in 1..results.len() {
479            assert!(results[i - 1].1 >= results[i].1);
480        }
481    }
482
483    #[test]
484    fn test_performance_learning() {
485        let config = IndexSelectorConfig {
486            enable_hnsw: true,
487            enable_nsg: true,
488            enable_ivf: false, // Disable IVF to avoid training requirement
489            enable_lsh: false,
490            enable_learning: true,
491            ..Default::default()
492        };
493        let mut selector = DynamicIndexSelector::new(config).unwrap();
494
495        // Add vectors
496        for i in 0..30 {
497            let vec = Vector::new(vec![i as f32, (i * 2) as f32]);
498            selector.add(format!("vec_{}", i), vec).unwrap();
499        }
500
501        selector.build().unwrap();
502
503        // Perform multiple searches to build up statistics
504        for _ in 0..5 {
505            let query = Vector::new(vec![15.0, 30.0]);
506            let _ = selector.search_knn(&query, 5);
507        }
508
509        // Check that statistics were recorded
510        let stats = selector.get_stats();
511        assert!(stats.contains_key("total_queries"));
512        let total_queries: usize = stats.get("total_queries").unwrap().parse().unwrap();
513        assert!(total_queries >= 5);
514    }
515}