avocado_core/index.rs

//! In-memory vector index for similarity search
//!
//! Phase 1 uses brute-force cosine similarity, which is fine for <10K spans.
//! Phase 2 uses HNSW for fast approximate nearest neighbor search (10-100x faster).
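//!
//! # Example
//!
//! A minimal usage sketch (illustrative only; `load_spans` and `query_embedding` are
//! placeholders for however the caller obtains spans with embeddings and a query vector):
//!
//! ```ignore
//! let spans: Vec<Span> = load_spans();          // spans with embeddings attached
//! let index = VectorIndex::build(spans);        // build the HNSW index in memory
//! let results = index.search(&query_embedding, 10)?;
//! for scored in &results {
//!     println!("{:.3}  lines {}-{}", scored.score, scored.span.start_line, scored.span.end_line);
//! }
//! ```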

use crate::types::{Result, ScoredSpan, Span};
use crate::embedding;
use hnsw_rs::prelude::*;
use hnsw_rs::api::AnnT;
use std::path::Path;
use std::fs;
use std::time::Instant;
use sha2::{Digest, Sha256};
use serde::{Serialize, Deserialize};

/// In-memory vector index for fast similarity search
///
/// Uses HNSW (Hierarchical Navigable Small World) for approximate nearest neighbor search.
/// This provides O(log n) search time instead of O(n) brute-force, making it 10-100x faster
/// for large repositories (10K+ spans).
pub struct VectorIndex {
    /// HNSW index for fast approximate search (None for an empty index)
    /// Note: When loaded from disk, we rebuild HNSW to avoid lifetime issues.
    /// This is still faster than building from scratch since we have the spans cached.
    hnsw: Option<Hnsw<'static, f32, DistCosine>>,
    /// Spans indexed by their position in the HNSW index
    spans: Vec<Span>,
    /// Dimension of embeddings (typically 1536 for ada-002)
    dimension: usize,
}

impl VectorIndex {
    /// Build an index from spans using HNSW
    ///
    /// # Arguments
    ///
    /// * `spans` - Vector of spans with embeddings
    ///
    /// # Returns
    ///
    /// A new VectorIndex ready for searching
    ///
    /// # HNSW Parameters
    ///
    /// - `max_nb_connection` (the `m` parameter): maximum connections per node (32 = good balance of speed/quality)
    /// - `ef_construction`: size of the candidate list during construction (200 = high quality)
    /// - `ef_search`: size of the candidate list during search (at least 50, capped at 200)
    ///
    /// These parameters provide >95% recall@50 while being 10-100x faster than brute-force.
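    ///
    /// # Example
    ///
    /// A minimal sketch (illustrative; assumes `spans` already carry embeddings of a
    /// consistent dimension, since spans without embeddings are skipped):
    ///
    /// ```ignore
    /// let index = VectorIndex::build(spans);
    /// assert!(!index.is_empty());
    /// ```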
    pub fn build(spans: Vec<Span>) -> Self {
        let start_time = Instant::now();
        let total_spans = spans.len();
        if spans.is_empty() {
            // Return an empty index - no HNSW needed
            return Self {
                hnsw: None,
                spans: Vec::new(),
                dimension: embedding::embedding_dimension(), // Use the current embedding dimension
            };
        }

        // Determine the dimension from the first span with an embedding
        let dimension = spans
            .iter()
            .find_map(|s| s.embedding.as_ref().map(|e| e.len()))
            .unwrap_or_else(embedding::embedding_dimension); // Fall back to the current embedding dimension

        // Create the HNSW index.
        // Hnsw::new signature: (max_nb_connection, max_elements, max_layer, ef_construction, distance_fn)
        // Parameters tuned for a good balance of speed and quality:
        // - max_nb_connection=32: maximum connections per node (must be <= 256);
        //   this is also the "m" parameter (number of bi-directional links)
        // - max_elements: expected number of elements (use spans.len() for efficiency)
        // - max_layer: maximum layers (16 is a good default)
        // - ef_construction=200: high-quality index construction
        // Note: the dimension is inferred from the first insertion, not passed to new()
        let max_nb_connection = 32; // Must be <= 256; this is also "m"
        let max_elements = spans.len();
        let max_layer = 16; // Maximum layers in the graph
        let ef_construction = 200; // Construction quality parameter
        let hnsw = Hnsw::<f32, DistCosine>::new(
            max_nb_connection,     // m parameter (bi-directional links per node)
            max_elements,          // Expected number of elements
            max_layer,             // Maximum layers
            ef_construction,       // Construction quality
            DistCosine::default(), // Distance function
        );

        // Insert all spans with embeddings into HNSW
        let mut inserted = 0usize;
        for (idx, span) in spans.iter().enumerate() {
            if let Some(embedding) = &span.embedding {
                if embedding.len() == dimension {
                    // Insert into HNSW with the span index as the payload
                    // hnsw_rs expects a tuple: (&[f32], usize) where usize is the payload
                    hnsw.insert((embedding.as_slice(), idx));
                    inserted += 1;
                }
            }
        }

        let elapsed_ms = start_time.elapsed().as_millis();
        log::info!(
            "VectorIndex::build: indexed {} of {} spans (dim={}): {}ms",
            inserted,
            total_spans,
            dimension,
            elapsed_ms
        );

        Self {
            hnsw: Some(hnsw),
            spans,
            dimension,
        }
    }

    /// Search for similar spans using HNSW approximate nearest neighbor search
    ///
    /// # Arguments
    ///
    /// * `query_embedding` - The query vector
    /// * `k` - Number of results to return
    ///
    /// # Returns
    ///
    /// Vector of scored spans, sorted by relevance (highest score first)
    ///
    /// # Performance
    ///
    /// - O(log n) search time instead of O(n) brute-force
    /// - 10-100x faster for large repositories (10K+ spans)
    /// - >95% recall@k (quality maintained)
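    ///
    /// # Example
    ///
    /// A minimal sketch (illustrative; `query_embedding` must have the same dimension as
    /// the indexed embeddings, otherwise an empty result is returned):
    ///
    /// ```ignore
    /// let results = index.search(&query_embedding, 10)?;
    /// if let Some(best) = results.first() {
    ///     println!("best match: {:.3}", best.score); // scores are in [0, 1], 1 = identical
    /// }
    /// ```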
    pub fn search(&self, query_embedding: &[f32], k: usize) -> Result<Vec<ScoredSpan>> {
        if self.spans.is_empty() || query_embedding.len() != self.dimension {
            return Ok(Vec::new());
        }

        // If there is no HNSW (empty index), return empty results
        let hnsw = match &self.hnsw {
            Some(h) => h,
            None => return Ok(Vec::new()),
        };

        // Use an ef_search in [50, 200], scaled with k, for a good balance of speed and quality.
        // Higher ef_search = better recall but slower.
        let ef_search = (k * 2).clamp(50, 200);

        // Search HNSW for approximate nearest neighbors
        let search_results = hnsw.search(
            query_embedding,
            k.min(self.spans.len()),
            ef_search,
        );

        // Convert HNSW results to ScoredSpan.
        // HNSW returns Neighbour structs with a distance and a data id (our span index).
        // The distance is cosine distance (0 = identical, 2 = opposite); convert it to a
        // similarity score in [0, 1]: score = 1 - (distance / 2), where 1 = identical.
        let mut results: Vec<ScoredSpan> = search_results
            .into_iter()
            .filter_map(|neighbour| {
                let idx = neighbour.d_id; // The payload we stored (span index)
                let distance = neighbour.distance;

                // Clamp to keep the score in the valid range [0, 1]
                let score = (1.0 - (distance / 2.0)).clamp(0.0, 1.0);

                self.spans.get(idx).map(|span| ScoredSpan {
                    span: span.clone(),
                    score,
                })
            })
            .collect();

        // Sort by score descending (HNSW results are already sorted, but ensure it)
        results.sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap_or(std::cmp::Ordering::Equal));

        Ok(results)
    }

    /// Get the number of spans in the index
    pub fn len(&self) -> usize {
        self.spans.len()
    }

    /// Check if the index is empty
    pub fn is_empty(&self) -> bool {
        self.spans.is_empty()
    }

    /// Get all spans (for debugging and persistence)
    pub fn spans(&self) -> &[Span] {
        &self.spans
    }

    /// Save the HNSW index to disk for persistence (Phase 2.1)
    ///
    /// Saves both the HNSW graph structure and the spans metadata.
    /// This will allow instant loading on subsequent queries once `hnsw_rs`
    /// supports owned loading; for now we only reload from cached spans.
    ///
    /// # Arguments
    ///
    /// * `cache_dir` - Directory to save index files
    ///
    /// # Returns
    ///
    /// Ok(()) if successful
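    ///
    /// # Example
    ///
    /// A minimal sketch (illustrative; the cache directory path is an assumption, not a
    /// fixed convention):
    ///
    /// ```ignore
    /// let index = VectorIndex::build(spans);
    /// index.save_to_disk(Path::new(".avocado/index_cache"))?;
    /// ```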
    pub fn save_to_disk(&self, cache_dir: &Path) -> Result<()> {
        // Create the directory if needed
        fs::create_dir_all(cache_dir)
            .map_err(|e| crate::types::Error::Other(anyhow::anyhow!("Failed to create cache directory: {}", e)))?;

        // Save the HNSW index using its native dump format.
        // Note: We don't currently load this dump back due to lifetime
        // limitations in `hnsw_rs` (see docs/HNSW_LIFETIME_ISSUE.md).
        // The files are kept so we can take advantage of them once
        // the upstream library supports owned loading.
        if let Some(hnsw) = &self.hnsw {
            // HNSW file_dump creates two files: {basename}.hnsw.graph and {basename}.hnsw.data
            let basename = "index";
            // Write to a temporary dir then atomically move into place
            let tmp_dir = cache_dir.join(".tmp");
            let _ = fs::remove_dir_all(&tmp_dir);
            fs::create_dir_all(&tmp_dir)
                .map_err(|e| crate::types::Error::Other(anyhow::anyhow!("Failed to create tmp dir: {}", e)))?;
            hnsw.file_dump(&tmp_dir, basename)
                .map_err(|e| crate::types::Error::Other(anyhow::anyhow!("Failed to dump HNSW index: {}", e)))?;
            // Move files into place
            for ext in &["hnsw.graph", "hnsw.data"] {
                let src = tmp_dir.join(format!("{}.{}", basename, ext));
                let dst = cache_dir.join(format!("{}.{}", basename, ext));
                fs::rename(&src, &dst)
                    .map_err(|e| crate::types::Error::Other(anyhow::anyhow!("Failed to move {}: {}", ext, e)))?;
            }
            let _ = fs::remove_dir_all(&tmp_dir);
        }

        // Save spans metadata separately (for validation and reference)
        let spans_path = cache_dir.join("spans.bin");
        let spans_tmp = cache_dir.join("spans.bin.tmp");
        let spans_data = bincode::serialize(&self.spans)
            .map_err(|e| crate::types::Error::Other(anyhow::anyhow!("Failed to serialize spans: {}", e)))?;
        fs::write(&spans_tmp, &spans_data)
            .map_err(|e| crate::types::Error::Other(anyhow::anyhow!("Failed to write spans cache: {}", e)))?;
        fs::rename(&spans_tmp, &spans_path)
            .map_err(|e| crate::types::Error::Other(anyhow::anyhow!("Failed to move spans cache: {}", e)))?;

        // Save the dimension for validation
        let dim_path = cache_dir.join("dimension.bin");
        let dim_tmp = cache_dir.join("dimension.bin.tmp");
        let dim_data = bincode::serialize(&self.dimension)
            .map_err(|e| crate::types::Error::Other(anyhow::anyhow!("Failed to serialize dimension: {}", e)))?;
        fs::write(&dim_tmp, &dim_data)
            .map_err(|e| crate::types::Error::Other(anyhow::anyhow!("Failed to write dimension cache: {}", e)))?;
        fs::rename(&dim_tmp, &dim_path)
            .map_err(|e| crate::types::Error::Other(anyhow::anyhow!("Failed to move dimension cache: {}", e)))?;

        // Save metadata with checksum and version
        let checksum = {
            let mut hasher = Sha256::new();
            hasher.update(&spans_data);
            format!("{:x}", hasher.finalize())
        };
        let meta = IndexMeta {
            version: 1,
            backend: "hnsw_rs".to_string(),
            span_count: self.spans.len(),
            dimension: self.dimension,
            spans_checksum: checksum,
        };
        let meta_json = serde_json::to_vec(&meta)
            .map_err(|e| crate::types::Error::Other(anyhow::anyhow!("Failed to serialize index meta: {}", e)))?;
        let meta_path = cache_dir.join("index.meta.json");
        let meta_tmp = cache_dir.join("index.meta.json.tmp");
        fs::write(&meta_tmp, &meta_json)
            .map_err(|e| crate::types::Error::Other(anyhow::anyhow!("Failed to write index meta: {}", e)))?;
        fs::rename(&meta_tmp, &meta_path)
            .map_err(|e| crate::types::Error::Other(anyhow::anyhow!("Failed to move index meta: {}", e)))?;

        Ok(())
    }

    /// Load the HNSW index from disk (Phase 2.1)
    ///
    /// **Note**: Due to lifetime constraints in the `hnsw_rs` library, we cannot directly
    /// load the HNSW structure from disk. Instead, we load the cached spans and rebuild HNSW.
    /// This is still faster than loading spans from SQLite, but not as fast as loading
    /// the HNSW structure directly would be.
    ///
    /// For large repositories (>10K spans), consider using server mode, which keeps the
    /// index in memory across queries for optimal performance.
    ///
    /// # Arguments
    ///
    /// * `cache_dir` - Directory containing index files
    ///
    /// # Returns
    ///
    /// Ok(Some(VectorIndex)) if the cache exists and validates, Ok(None) if no cache is
    /// present, or Err if the cache is corrupted or incompatible
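    ///
    /// # Example
    ///
    /// A minimal sketch of a load-or-rebuild fallback (illustrative; `load_spans` and the
    /// cache path are placeholders):
    ///
    /// ```ignore
    /// let cache_dir = Path::new(".avocado/index_cache");
    /// let index = match VectorIndex::load_from_disk(cache_dir)? {
    ///     Some(index) => index,
    ///     None => VectorIndex::build(load_spans()),
    /// };
    /// ```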
    pub fn load_from_disk(cache_dir: &Path) -> Result<Option<Self>> {
        // Check if the cache files exist
        let spans_path = cache_dir.join("spans.bin");
        let dim_path = cache_dir.join("dimension.bin");
        let meta_path = cache_dir.join("index.meta.json");

        if !spans_path.exists() || !dim_path.exists() || !meta_path.exists() {
            return Ok(None);
        }

        // Read and validate metadata
        let meta_json = fs::read(&meta_path)
            .map_err(|e| crate::types::Error::Other(anyhow::anyhow!("Failed to read index meta: {}", e)))?;
        let meta: IndexMeta = serde_json::from_slice(&meta_json)
            .map_err(|e| crate::types::Error::Other(anyhow::anyhow!("Failed to parse index meta: {}", e)))?;
        if meta.version != 1 {
            return Err(crate::types::Error::Other(anyhow::anyhow!("Unsupported index version: {}", meta.version)));
        }

        // Load the dimension
        let dim_data = fs::read(&dim_path)
            .map_err(|e| crate::types::Error::Other(anyhow::anyhow!("Failed to read dimension cache: {}", e)))?;
        let cached_dimension: usize = bincode::deserialize(&dim_data)
            .map_err(|e| crate::types::Error::Other(anyhow::anyhow!("Failed to deserialize dimension: {}", e)))?;

        // Load the spans
        let spans_data = fs::read(&spans_path)
            .map_err(|e| crate::types::Error::Other(anyhow::anyhow!("Failed to read spans cache: {}", e)))?;
        // Validate the checksum
        let mut hasher = Sha256::new();
        hasher.update(&spans_data);
        let actual_checksum = format!("{:x}", hasher.finalize());
        if actual_checksum != meta.spans_checksum {
            return Err(crate::types::Error::Other(anyhow::anyhow!(
                "Index cache checksum mismatch"
            )));
        }
        let spans: Vec<Span> = bincode::deserialize(&spans_data)
            .map_err(|e| crate::types::Error::Other(anyhow::anyhow!("Failed to deserialize spans: {}", e)))?;

        // Basic validation: ensure the cached dimension matches the spans' embeddings
        let actual_dimension = spans
            .iter()
            .find_map(|s| s.embedding.as_ref().map(|e| e.len()))
            .unwrap_or(cached_dimension);
        if actual_dimension != cached_dimension {
            return Err(crate::types::Error::Other(anyhow::anyhow!(
                "Cached index dimension mismatch (expected {}, found {})",
                cached_dimension,
                actual_dimension
            )));
        }

        // Rebuild HNSW from the cached spans.
        //
        // LIMITATION: The `hnsw_rs` library has lifetime constraints that prevent us from
        // directly loading and storing the HNSW structure. The loaded HNSW has a lifetime
        // tied to HnswIo, which conflicts with our need for a 'static lifetime in VectorIndex.
        //
        // WORKAROUND: We cache spans (fast to load) and rebuild HNSW (slower but necessary).
        // This is still faster than loading spans from SQLite, but rebuilding HNSW from
        // 50K+ spans can take 1-2 minutes.
        //
        // RECOMMENDATION: For large repositories, use server mode, which keeps the index
        // in memory across queries, avoiding rebuilds entirely.
        //
        // FUTURE: Consider alternative HNSW libraries (hora, instant-distance) or contribute
        // serialization improvements to hnsw_rs that support owned HNSW structures.
        Ok(Some(Self::build(spans)))
    }
}

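/// On-disk metadata describing a cached index (written as `index.meta.json`).
///
/// Illustrative example of the serialized form (field values are made up):
///
/// ```json
/// {
///   "version": 1,
///   "backend": "hnsw_rs",
///   "span_count": 12345,
///   "dimension": 1536,
///   "spans_checksum": "ab12..."
/// }
/// ```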
#[derive(Debug, Serialize, Deserialize)]
struct IndexMeta {
    version: u32,
    backend: String,
    span_count: usize,
    dimension: usize,
    spans_checksum: String,
}

/// Calculate the cosine similarity between two vectors
///
/// # Arguments
///
/// * `a` - First vector
/// * `b` - Second vector
///
/// # Returns
///
/// Cosine similarity score in [-1.0, 1.0] (higher is more similar); returns 0.0 if the
/// vectors have different lengths or either has zero magnitude
///
/// # Formula
///
/// ```text
/// cosine_sim(a, b) = (a · b) / (||a|| * ||b||)
/// ```
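///
/// # Example
///
/// A small illustrative check (marked `ignore` since the public path of this function may
/// vary):
///
/// ```ignore
/// let a = [1.0_f32, 0.0, 0.0];
/// let b = [1.0_f32, 0.0, 0.0];
/// assert!((cosine_similarity(&a, &b) - 1.0).abs() < 1e-6); // identical vectors -> ~1.0
/// ```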
pub fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.len() != b.len() {
        return 0.0;
    }

    let dot: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
    let norm_a: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
    let norm_b: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();

    if norm_a == 0.0 || norm_b == 0.0 {
        return 0.0;
    }

    dot / (norm_a * norm_b)
}

#[cfg(test)]
mod tests {
    use super::*;
    use uuid::Uuid;

    fn create_test_span(embedding: Vec<f32>) -> Span {
        Span {
            id: Uuid::new_v4().to_string(),
            artifact_id: "test".to_string(),
            start_line: 1,
            end_line: 10,
            text: "test text".to_string(),
            embedding: Some(embedding),
            embedding_model: Some("test".to_string()),
            token_count: 10,
            metadata: None,
        }
    }

    #[test]
    fn test_cosine_similarity_identical() {
        let a = vec![1.0, 0.0, 0.0];
        let b = vec![1.0, 0.0, 0.0];
        let sim = cosine_similarity(&a, &b);
        assert!((sim - 1.0).abs() < 0.0001);
    }

    #[test]
    fn test_cosine_similarity_orthogonal() {
        let a = vec![1.0, 0.0];
        let b = vec![0.0, 1.0];
        let sim = cosine_similarity(&a, &b);
        assert!(sim.abs() < 0.0001); // Should be ~0
    }

    #[test]
    fn test_cosine_similarity_opposite() {
        let a = vec![1.0, 0.0];
        let b = vec![-1.0, 0.0];
        let sim = cosine_similarity(&a, &b);
        assert!((sim + 1.0).abs() < 0.0001); // Should be ~-1
    }

    #[test]
    fn test_vector_index_search() {
        let spans = vec![
            create_test_span(vec![1.0, 0.0, 0.0]),
            create_test_span(vec![0.0, 1.0, 0.0]),
            create_test_span(vec![0.9, 0.1, 0.0]),
        ];

        let index = VectorIndex::build(spans);
        assert_eq!(index.len(), 3);

        let query = vec![1.0, 0.0, 0.0];
        let results = index.search(&query, 2).unwrap();

        assert_eq!(results.len(), 2);
        // The first result should be the most similar
        assert!(results[0].score > results[1].score);
    }

    #[test]
    fn test_empty_index() {
        let index = VectorIndex::build(vec![]);
        assert!(index.is_empty());

        let results = index.search(&[1.0, 0.0], 10).unwrap();
        assert_eq!(results.len(), 0);
    }
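
    // Additional check (sketch): a save/load round trip through the on-disk cache.
    // Assumes the test process can write to the system temp directory; the directory
    // name is arbitrary and is removed at the end.
    #[test]
    fn test_save_and_load_roundtrip() {
        let spans = vec![
            create_test_span(vec![1.0, 0.0, 0.0]),
            create_test_span(vec![0.0, 1.0, 0.0]),
        ];
        let index = VectorIndex::build(spans);

        let cache_dir = std::env::temp_dir().join(format!("avocado_index_test_{}", Uuid::new_v4()));
        index.save_to_disk(&cache_dir).unwrap();

        // Loading rebuilds HNSW from the cached spans (see load_from_disk docs)
        let loaded = VectorIndex::load_from_disk(&cache_dir)
            .unwrap()
            .expect("cache files should exist after save");
        assert_eq!(loaded.len(), index.len());

        // The rebuilt index should still rank the matching span first with a high score
        let results = loaded.search(&[1.0, 0.0, 0.0], 1).unwrap();
        assert_eq!(results.len(), 1);
        assert!(results[0].score > 0.9);

        let _ = std::fs::remove_dir_all(&cache_dir);
    }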
}