oxify_vector/
optimizer.rs

1//! Query Optimizer
2//!
3//! Automatically selects the best search strategy based on:
4//! - Dataset size (brute force vs HNSW vs IVF-PQ)
5//! - Query characteristics (batch size, filter selectivity)
6//! - Performance requirements (accuracy vs speed trade-off)
7//!
8//! ## Example
9//!
10//! ```rust
11//! use oxify_vector::optimizer::{QueryOptimizer, OptimizerConfig, SearchStrategy};
12//!
13//! let config = OptimizerConfig::default();
14//! let optimizer = QueryOptimizer::new(config);
15//!
16//! // Recommend strategy based on dataset size
17//! let strategy = optimizer.recommend_strategy(1_000_000, 0.95);
18//! assert_eq!(strategy, SearchStrategy::IvfPq);
19//! ```
20
21use serde::{Deserialize, Serialize};
22
23/// Search strategy recommendation
24#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
25pub enum SearchStrategy {
26    /// Brute-force exact search (best for small datasets < 10K vectors)
27    BruteForce,
28    /// HNSW approximate search (best for medium datasets 10K-1M vectors)
29    Hnsw,
30    /// IVF-PQ approximate search (best for large datasets > 1M vectors)
31    IvfPq,
32    /// Distributed search across multiple shards
33    Distributed,
34}
35
36/// Optimizer configuration
37#[derive(Debug, Clone, Serialize, Deserialize)]
38pub struct OptimizerConfig {
39    /// Size threshold for switching from brute-force to HNSW
40    pub brute_force_threshold: usize,
41    /// Size threshold for switching from HNSW to IVF-PQ
42    pub hnsw_threshold: usize,
43    /// Size threshold for switching to distributed search
44    pub distributed_threshold: usize,
45    /// Minimum recall requirement (0.0 to 1.0)
46    pub min_recall: f32,
47    /// Whether to enable query plan caching
48    pub enable_caching: bool,
49}
50
51impl Default for OptimizerConfig {
52    fn default() -> Self {
53        Self {
54            brute_force_threshold: 10_000,
55            hnsw_threshold: 1_000_000,
56            distributed_threshold: 10_000_000,
57            min_recall: 0.90,
58            enable_caching: true,
59        }
60    }
61}
62
63impl OptimizerConfig {
64    /// Create config optimized for high accuracy
65    pub fn high_accuracy() -> Self {
66        Self {
67            brute_force_threshold: 50_000, // Use exact search longer
68            hnsw_threshold: 5_000_000,
69            distributed_threshold: 10_000_000,
70            min_recall: 0.98,
71            enable_caching: true,
72        }
73    }
74
75    /// Create config optimized for speed
76    pub fn high_speed() -> Self {
77        Self {
78            brute_force_threshold: 5_000, // Switch to ANN earlier
79            hnsw_threshold: 500_000,
80            distributed_threshold: 5_000_000,
81            min_recall: 0.80,
82            enable_caching: true,
83        }
84    }
85
86    /// Create config optimized for memory efficiency
87    pub fn memory_efficient() -> Self {
88        Self {
89            brute_force_threshold: 10_000,
90            hnsw_threshold: 100_000, // Use IVF-PQ earlier for compression
91            distributed_threshold: 10_000_000,
92            min_recall: 0.90,
93            enable_caching: false, // Disable caching to save memory
94        }
95    }
96}
97
98/// Query optimizer for selecting search strategies
99#[derive(Debug, Clone)]
100pub struct QueryOptimizer {
101    config: OptimizerConfig,
102}
103
104impl QueryOptimizer {
105    /// Create a new query optimizer
106    pub fn new(config: OptimizerConfig) -> Self {
107        Self { config }
108    }
109
110    /// Recommend search strategy based on dataset size and recall requirement
111    ///
112    /// # Arguments
113    /// * `num_vectors` - Number of vectors in the dataset
114    /// * `required_recall` - Required recall (0.0 to 1.0)
115    pub fn recommend_strategy(&self, num_vectors: usize, required_recall: f32) -> SearchStrategy {
116        // For very high recall requirements, use exact search if feasible
117        if required_recall >= 0.99 && num_vectors < self.config.brute_force_threshold * 2 {
118            return SearchStrategy::BruteForce;
119        }
120
121        // Select based on dataset size
122        if num_vectors < self.config.brute_force_threshold {
123            SearchStrategy::BruteForce
124        } else if num_vectors < self.config.hnsw_threshold {
125            SearchStrategy::Hnsw
126        } else if num_vectors < self.config.distributed_threshold {
127            SearchStrategy::IvfPq
128        } else {
129            SearchStrategy::Distributed
130        }
131    }
132
133    /// Estimate whether pre-filtering or post-filtering is more efficient
134    ///
135    /// # Arguments
136    /// * `num_vectors` - Total number of vectors
137    /// * `filter_selectivity` - Estimated fraction of vectors matching filter (0.0 to 1.0)
138    ///
139    /// # Returns
140    /// `true` if pre-filtering is recommended, `false` for post-filtering
141    pub fn recommend_prefiltering(&self, num_vectors: usize, filter_selectivity: f32) -> bool {
142        // Pre-filtering is better when filter is highly selective (< 10% match)
143        // Post-filtering is better when most vectors match
144
145        // For small datasets, post-filtering is fine
146        if num_vectors < 1000 {
147            return false;
148        }
149
150        // Use pre-filtering if filter removes > 90% of vectors
151        filter_selectivity < 0.10
152    }
153
154    /// Estimate optimal batch size for batch search
155    ///
156    /// # Arguments
157    /// * `num_queries` - Number of queries to process
158    /// * `num_vectors` - Number of vectors in dataset
159    ///
160    /// # Returns
161    /// Recommended batch size
162    pub fn recommend_batch_size(&self, num_queries: usize, num_vectors: usize) -> usize {
163        // For small query counts, no batching needed
164        if num_queries < 10 {
165            return num_queries;
166        }
167
168        // Balance between parallelism and memory usage
169        // Larger datasets → smaller batches to avoid memory pressure
170        let base_batch_size = if num_vectors < 100_000 {
171            1000
172        } else if num_vectors < 1_000_000 {
173            500
174        } else {
175            100
176        };
177
178        base_batch_size.min(num_queries)
179    }
180
181    /// Estimate search cost (relative units)
182    ///
183    /// Returns estimated computational cost for comparison between strategies.
184    #[allow(dead_code)]
185    pub fn estimate_cost(&self, strategy: SearchStrategy, num_vectors: usize, k: usize) -> f64 {
186        match strategy {
187            SearchStrategy::BruteForce => {
188                // O(n * d) where d is dimension (assume 768)
189                num_vectors as f64 * 768.0 * k as f64
190            }
191            SearchStrategy::Hnsw => {
192                // O(log(n) * ef_search) - sub-linear
193                let ef_search = 50.0;
194                (num_vectors as f64).log2() * ef_search * k as f64
195            }
196            SearchStrategy::IvfPq => {
197                // O(nprobe * cluster_size) - depends on nprobe
198                let nprobe = 16.0;
199                let avg_cluster_size = num_vectors as f64 / 256.0;
200                nprobe * avg_cluster_size * k as f64
201            }
202            SearchStrategy::Distributed => {
203                // Similar to IVF-PQ but with network overhead
204                let nprobe = 16.0;
205                let avg_cluster_size = num_vectors as f64 / 256.0;
206                nprobe * avg_cluster_size * k as f64 * 1.5 // 50% network overhead
207            }
208        }
209    }
210
211    /// Get optimizer configuration
212    pub fn config(&self) -> &OptimizerConfig {
213        &self.config
214    }
215}
216
217/// Query plan for execution
218#[derive(Debug, Clone)]
219pub struct QueryPlan {
220    /// Recommended search strategy
221    pub strategy: SearchStrategy,
222    /// Whether to use pre-filtering
223    pub use_prefiltering: bool,
224    /// Recommended batch size
225    pub batch_size: usize,
226    /// Estimated cost
227    pub estimated_cost: f64,
228}
229
230impl QueryPlan {
231    /// Create a query plan
232    pub fn new(
233        strategy: SearchStrategy,
234        use_prefiltering: bool,
235        batch_size: usize,
236        estimated_cost: f64,
237    ) -> Self {
238        Self {
239            strategy,
240            use_prefiltering,
241            batch_size,
242            estimated_cost,
243        }
244    }
245
246    /// Create an optimized query plan
247    pub fn optimize(
248        optimizer: &QueryOptimizer,
249        num_vectors: usize,
250        num_queries: usize,
251        k: usize,
252        filter_selectivity: Option<f32>,
253        required_recall: f32,
254    ) -> Self {
255        let strategy = optimizer.recommend_strategy(num_vectors, required_recall);
256        let use_prefiltering = if let Some(selectivity) = filter_selectivity {
257            optimizer.recommend_prefiltering(num_vectors, selectivity)
258        } else {
259            false
260        };
261        let batch_size = optimizer.recommend_batch_size(num_queries, num_vectors);
262        let estimated_cost = optimizer.estimate_cost(strategy, num_vectors, k);
263
264        Self::new(strategy, use_prefiltering, batch_size, estimated_cost)
265    }
266}
267
268#[cfg(test)]
269mod tests {
270    use super::*;
271
272    #[test]
273    fn test_recommend_strategy_small_dataset() {
274        let optimizer = QueryOptimizer::new(OptimizerConfig::default());
275        let strategy = optimizer.recommend_strategy(5_000, 0.95);
276        assert_eq!(strategy, SearchStrategy::BruteForce);
277    }
278
279    #[test]
280    fn test_recommend_strategy_medium_dataset() {
281        let optimizer = QueryOptimizer::new(OptimizerConfig::default());
282        let strategy = optimizer.recommend_strategy(100_000, 0.95);
283        assert_eq!(strategy, SearchStrategy::Hnsw);
284    }
285
286    #[test]
287    fn test_recommend_strategy_large_dataset() {
288        let optimizer = QueryOptimizer::new(OptimizerConfig::default());
289        let strategy = optimizer.recommend_strategy(2_000_000, 0.95);
290        assert_eq!(strategy, SearchStrategy::IvfPq);
291    }
292
293    #[test]
294    fn test_recommend_strategy_very_large_dataset() {
295        let optimizer = QueryOptimizer::new(OptimizerConfig::default());
296        let strategy = optimizer.recommend_strategy(20_000_000, 0.95);
297        assert_eq!(strategy, SearchStrategy::Distributed);
298    }
299
300    #[test]
301    fn test_recommend_strategy_high_recall() {
302        let optimizer = QueryOptimizer::new(OptimizerConfig::default());
303        // High recall requirement should prefer exact search for small enough datasets
304        let strategy = optimizer.recommend_strategy(15_000, 0.99);
305        assert_eq!(strategy, SearchStrategy::BruteForce);
306    }
307
308    #[test]
309    fn test_recommend_prefiltering_selective() {
310        let optimizer = QueryOptimizer::new(OptimizerConfig::default());
311        // Highly selective filter (5% match) should use pre-filtering
312        let use_prefilter = optimizer.recommend_prefiltering(100_000, 0.05);
313        assert!(use_prefilter);
314    }
315
316    #[test]
317    fn test_recommend_prefiltering_not_selective() {
318        let optimizer = QueryOptimizer::new(OptimizerConfig::default());
319        // Non-selective filter (50% match) should use post-filtering
320        let use_prefilter = optimizer.recommend_prefiltering(100_000, 0.50);
321        assert!(!use_prefilter);
322    }
323
324    #[test]
325    fn test_recommend_batch_size() {
326        let optimizer = QueryOptimizer::new(OptimizerConfig::default());
327
328        // Small query count
329        let batch_size = optimizer.recommend_batch_size(5, 100_000);
330        assert_eq!(batch_size, 5);
331
332        // Large query count, small dataset
333        let batch_size = optimizer.recommend_batch_size(2000, 50_000);
334        assert_eq!(batch_size, 1000);
335
336        // Large query count, large dataset
337        let batch_size = optimizer.recommend_batch_size(2000, 2_000_000);
338        assert_eq!(batch_size, 100);
339    }
340
341    #[test]
342    fn test_estimate_cost() {
343        let optimizer = QueryOptimizer::new(OptimizerConfig::default());
344
345        // Brute force should be most expensive for large datasets
346        let brute_cost = optimizer.estimate_cost(SearchStrategy::BruteForce, 100_000, 10);
347        let hnsw_cost = optimizer.estimate_cost(SearchStrategy::Hnsw, 100_000, 10);
348
349        assert!(brute_cost > hnsw_cost);
350    }
351
352    #[test]
353    fn test_high_accuracy_config() {
354        let config = OptimizerConfig::high_accuracy();
355        assert_eq!(config.brute_force_threshold, 50_000);
356        assert_eq!(config.min_recall, 0.98);
357    }
358
359    #[test]
360    fn test_high_speed_config() {
361        let config = OptimizerConfig::high_speed();
362        assert_eq!(config.brute_force_threshold, 5_000);
363        assert_eq!(config.min_recall, 0.80);
364    }
365
366    #[test]
367    fn test_memory_efficient_config() {
368        let config = OptimizerConfig::memory_efficient();
369        assert_eq!(config.hnsw_threshold, 100_000);
370        assert!(!config.enable_caching);
371    }
372
373    #[test]
374    fn test_query_plan_optimize() {
375        let optimizer = QueryOptimizer::new(OptimizerConfig::default());
376
377        // Create optimized plan
378        let plan = QueryPlan::optimize(
379            &optimizer,
380            100_000,    // num_vectors
381            50,         // num_queries
382            10,         // k
383            Some(0.05), // filter_selectivity
384            0.95,       // required_recall
385        );
386
387        assert_eq!(plan.strategy, SearchStrategy::Hnsw);
388        assert!(plan.use_prefiltering); // Selective filter
389        assert!(plan.batch_size <= 50);
390        assert!(plan.estimated_cost > 0.0);
391    }
392}