// avocado_core/index.rs
1//! In-memory vector index for similarity search
2//!
3//! Phase 1 uses brute-force cosine similarity, which is fine for <10K spans.
4//! Phase 2 uses HNSW for fast approximate nearest neighbor search (10-100x faster).
5
6use crate::types::{Result, ScoredSpan, Span};
7use crate::embedding;
8use hnsw_rs::prelude::*;
9use hnsw_rs::api::AnnT;
10use std::path::Path;
11use std::fs;
12use std::time::Instant;
13use sha2::{Digest, Sha256};
14use serde::{Serialize, Deserialize};
15
/// In-memory vector index for fast similarity search
///
/// Uses HNSW (Hierarchical Navigable Small World) for approximate nearest neighbor search.
/// This provides O(log n) search time instead of O(n) brute-force, making it 10-100x faster
/// for large repositories (10K+ spans).
pub struct VectorIndex {
    /// HNSW index for fast approximate search (`None` for an empty index).
    /// Note: when loaded from disk the graph is rebuilt rather than deserialized,
    /// because `hnsw_rs` ties a loaded graph's lifetime to its `HnswIo` and we need
    /// `'static` here. Rebuilding is still faster than starting from scratch since
    /// the spans themselves are cached.
    hnsw: Option<Hnsw<'static, f32, DistCosine>>,
    /// Spans addressed by insertion index; the HNSW payload for each vector is the
    /// span's position in this Vec.
    spans: Vec<Span>,
    /// Dimension of the embeddings (typically 1536 for ada-002); queries with a
    /// different length are rejected by `search`.
    dimension: usize,
}
31
32impl VectorIndex {
33 /// Build an index from spans using HNSW
34 ///
35 /// # Arguments
36 ///
37 /// * `spans` - Vector of spans with embeddings
38 ///
39 /// # Returns
40 ///
41 /// A new VectorIndex ready for searching
42 ///
43 /// # HNSW Parameters
44 ///
45 /// - `m`: Maximum number of connections per node (16 = good balance of speed/quality)
46 /// - `ef_construction`: Size of candidate list during construction (200 = high quality)
47 /// - `ef_search`: Size of candidate list during search (50 = good balance)
48 ///
49 /// These parameters provide >95% recall@50 while being 10-100x faster than brute-force.
50 pub fn build(spans: Vec<Span>) -> Self {
51 let start_time = Instant::now();
52 let total_spans = spans.len();
53 if spans.is_empty() {
54 // Return empty index - no HNSW needed
55 return Self {
56 hnsw: None,
57 spans: Vec::new(),
58 dimension: embedding::embedding_dimension(), // Use current embedding dimension
59 };
60 }
61
62 // Determine dimension from first span with embedding
63 let dimension = spans
64 .iter()
65 .find_map(|s| s.embedding.as_ref().map(|e| e.len()))
66 .unwrap_or_else(|| embedding::embedding_dimension()); // Use current embedding dimension
67
68 // Create HNSW index
69 // HNSW::new signature: (max_nb_connection, max_elements, max_layer, ef_construction, distance_fn)
70 // Parameters tuned for good balance of speed and quality:
71 // - max_nb_connection=32: Maximum connections per node (must be <= 256)
72 // This is also the "m" parameter (number of bi-directional links)
73 // - max_elements: Expected number of elements (use spans.len() for efficiency)
74 // - max_layer: Maximum layers (16 is a good default)
75 // - ef_construction=200: High quality index construction
76 // Note: Dimension is inferred from first insertion, not passed to new()
77 let max_nb_connection = 32; // Must be <= 256, this is also "m"
78 let max_elements = spans.len();
79 let max_layer = 16; // Maximum layers in the graph
80 let ef_construction = 200; // Construction quality parameter
81 let hnsw = Hnsw::<f32, DistCosine>::new(
82 max_nb_connection, // m parameter (bi-directional links per node)
83 max_elements, // Expected number of elements
84 max_layer, // Maximum layers
85 ef_construction, // Construction quality
86 DistCosine::default(), // Distance function
87 );
88
89 // Insert all spans with embeddings into HNSW
90 let mut inserted = 0usize;
91 for (idx, span) in spans.iter().enumerate() {
92 if let Some(embedding) = &span.embedding {
93 if embedding.len() == dimension {
94 // Insert into HNSW with span index as data
95 // hnsw_rs expects a tuple: (&[f32], usize) where usize is the payload
96 hnsw.insert((embedding.as_slice(), idx));
97 inserted += 1;
98 }
99 }
100 }
101
102 let elapsed_ms = start_time.elapsed().as_millis();
103 log::info!(
104 "VectorIndex::build: indexed {} of {} spans (dim={}): {}ms",
105 inserted,
106 total_spans,
107 dimension,
108 elapsed_ms
109 );
110
111 Self {
112 hnsw: Some(hnsw),
113 spans,
114 dimension,
115 }
116 }
117
118 /// Search for similar spans using HNSW approximate nearest neighbor search
119 ///
120 /// # Arguments
121 ///
122 /// * `query_embedding` - The query vector
123 /// * `k` - Number of results to return
124 ///
125 /// # Returns
126 ///
127 /// Vector of scored spans, sorted by relevance (highest score first)
128 ///
129 /// # Performance
130 ///
131 /// - O(log n) search time instead of O(n) brute-force
132 /// - 10-100x faster for large repositories (10K+ spans)
133 /// - >95% recall@k (quality maintained)
134 pub fn search(&self, query_embedding: &[f32], k: usize) -> Result<Vec<ScoredSpan>> {
135 if self.spans.is_empty() || query_embedding.len() != self.dimension {
136 return Ok(Vec::new());
137 }
138
139 // If no HNSW (empty index), return empty results
140 let hnsw = match &self.hnsw {
141 Some(h) => h,
142 None => return Ok(Vec::new()),
143 };
144
145 // Use ef_search = max(k * 2, 50) for good balance of speed and quality
146 // Higher ef_search = better recall but slower
147 let ef_search = (k * 2).max(50).min(200);
148
149 // Search HNSW for approximate nearest neighbors
150 let search_results = hnsw.search(
151 query_embedding,
152 k.min(self.spans.len()),
153 ef_search,
154 );
155
156 // Convert HNSW results to ScoredSpan
157 // HNSW returns Neighbour structs with distance and data (our span index)
158 // Distance is cosine distance (0 = identical, 2 = opposite)
159 // Convert to similarity score: score = 1 - (distance / 2)
160 let results: Vec<ScoredSpan> = search_results
161 .into_iter()
162 .filter_map(|neighbour| {
163 let idx = neighbour.d_id; // The data we stored (span index)
164 let distance = neighbour.distance;
165
166 // HNSW uses distance (0 = identical, 2 = opposite)
167 // Convert to similarity score: score = 1 - (distance / 2)
168 // This gives us a score in [0, 1] range where 1 = identical
169 let score: f32 = 1.0 - (distance / 2.0);
170
171 // Ensure score is in valid range [0, 1]
172 let score = score.max(0.0_f32).min(1.0_f32);
173
174 self.spans.get(idx).map(|span| ScoredSpan {
175 span: span.clone(),
176 score,
177 })
178 })
179 .collect();
180
181 // Sort by score descending (HNSW results are already sorted, but ensure it)
182 let mut results = results;
183 results.sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap_or(std::cmp::Ordering::Equal));
184
185 Ok(results)
186 }
187
188 /// Get the number of spans in the index
189 pub fn len(&self) -> usize {
190 self.spans.len()
191 }
192
193 /// Check if the index is empty
194 pub fn is_empty(&self) -> bool {
195 self.spans.is_empty()
196 }
197
198 /// Get all spans (for debugging and persistence)
199 pub fn spans(&self) -> &[Span] {
200 &self.spans
201 }
202
203 /// Save HNSW index to disk for persistence (Phase 2.1)
204 ///
205 /// Saves both the HNSW graph structure and the spans metadata.
206 /// This allows instant loading on subsequent queries once `hnsw_rs`
207 /// supports owned loading; for now we only reload from cached spans.
208 ///
209 /// # Arguments
210 ///
211 /// * `cache_dir` - Directory to save index files
212 ///
213 /// # Returns
214 ///
215 /// Ok(()) if successful
216 pub fn save_to_disk(&self, cache_dir: &Path) -> Result<()> {
217 // Create directory if needed
218 fs::create_dir_all(cache_dir)
219 .map_err(|e| crate::types::Error::Other(anyhow::anyhow!("Failed to create cache directory: {}", e)))?;
220
221 // Save HNSW index using its native dump format
222 // Note: We don't currently load this dump back due to lifetime
223 // limitations in `hnsw_rs` (see docs/HNSW_LIFETIME_ISSUE.md).
224 // The files are kept so we can take advantage of them once
225 // the upstream library supports owned loading.
226 if let Some(hnsw) = &self.hnsw {
227 // HNSW file_dump creates two files: {basename}.hnsw.graph and {basename}.hnsw.data
228 let basename = "index";
229 // Write to a temporary dir then atomically move into place
230 let tmp_dir = cache_dir.join(".tmp");
231 let _ = fs::remove_dir_all(&tmp_dir);
232 fs::create_dir_all(&tmp_dir)
233 .map_err(|e| crate::types::Error::Other(anyhow::anyhow!("Failed to create tmp dir: {}", e)))?;
234 hnsw.file_dump(&tmp_dir, basename)
235 .map_err(|e| crate::types::Error::Other(anyhow::anyhow!("Failed to dump HNSW index: {}", e)))?;
236 // Move files into place
237 for ext in &["hnsw.graph", "hnsw.data"] {
238 let src = tmp_dir.join(format!("{}.{}", basename, ext));
239 let dst = cache_dir.join(format!("{}.{}", basename, ext));
240 fs::rename(&src, &dst)
241 .map_err(|e| crate::types::Error::Other(anyhow::anyhow!("Failed to move {}: {}", ext, e)))?;
242 }
243 let _ = fs::remove_dir_all(&tmp_dir);
244 }
245
246 // Save spans metadata separately (for validation and reference)
247 let spans_path = cache_dir.join("spans.bin");
248 let spans_tmp = cache_dir.join("spans.bin.tmp");
249 let spans_data = bincode::serialize(&self.spans)
250 .map_err(|e| crate::types::Error::Other(anyhow::anyhow!("Failed to serialize spans: {}", e)))?;
251 fs::write(&spans_tmp, &spans_data)
252 .map_err(|e| crate::types::Error::Other(anyhow::anyhow!("Failed to write spans cache: {}", e)))?;
253 fs::rename(&spans_tmp, &spans_path)
254 .map_err(|e| crate::types::Error::Other(anyhow::anyhow!("Failed to move spans cache: {}", e)))?;
255
256 // Save dimension for validation
257 let dim_path = cache_dir.join("dimension.bin");
258 let dim_tmp = cache_dir.join("dimension.bin.tmp");
259 let dim_data = bincode::serialize(&self.dimension)
260 .map_err(|e| crate::types::Error::Other(anyhow::anyhow!("Failed to serialize dimension: {}", e)))?;
261 fs::write(&dim_tmp, &dim_data)
262 .map_err(|e| crate::types::Error::Other(anyhow::anyhow!("Failed to write dimension cache: {}", e)))?;
263 fs::rename(&dim_tmp, &dim_path)
264 .map_err(|e| crate::types::Error::Other(anyhow::anyhow!("Failed to move dimension cache: {}", e)))?;
265
266 // Save metadata with checksum and version
267 let checksum = {
268 let mut hasher = Sha256::new();
269 hasher.update(&spans_data);
270 format!("{:x}", hasher.finalize())
271 };
272 let meta = IndexMeta {
273 version: 1,
274 backend: "hnsw_rs".to_string(),
275 span_count: self.spans.len(),
276 dimension: self.dimension,
277 spans_checksum: checksum,
278 };
279 let meta_json = serde_json::to_vec(&meta)
280 .map_err(|e| crate::types::Error::Other(anyhow::anyhow!("Failed to serialize index meta: {}", e)))?;
281 let meta_path = cache_dir.join("index.meta.json");
282 let meta_tmp = cache_dir.join("index.meta.json.tmp");
283 fs::write(&meta_tmp, &meta_json)
284 .map_err(|e| crate::types::Error::Other(anyhow::anyhow!("Failed to write index meta: {}", e)))?;
285 fs::rename(&meta_tmp, &meta_path)
286 .map_err(|e| crate::types::Error::Other(anyhow::anyhow!("Failed to move index meta: {}", e)))?;
287
288 Ok(())
289 }
290
291 /// Load HNSW index from disk (Phase 2.1)
292 ///
293 /// **Note**: Due to lifetime constraints in the `hnsw_rs` library, we cannot directly
294 /// load the HNSW structure from disk. Instead, we load cached spans and rebuild HNSW.
295 /// This is still faster than loading spans from SQLite, but not as fast as loading
296 /// the HNSW structure directly would be.
297 ///
298 /// For large repositories (>10K spans), consider using server mode which keeps the
299 /// index in memory across queries for optimal performance.
300 ///
301 /// # Arguments
302 ///
303 /// * `cache_dir` - Directory containing index files
304 ///
305 /// # Returns
306 ///
307 /// Some(VectorIndex) if loaded successfully, None otherwise
308 pub fn load_from_disk(cache_dir: &Path) -> Result<Option<Self>> {
309 use std::fs;
310 // Check if cache files exist
311 let spans_path = cache_dir.join("spans.bin");
312 let dim_path = cache_dir.join("dimension.bin");
313 let meta_path = cache_dir.join("index.meta.json");
314
315 if !spans_path.exists() || !dim_path.exists() || !meta_path.exists() {
316 return Ok(None);
317 }
318 // Read and validate metadata
319 let meta_json = fs::read(&meta_path)
320 .map_err(|e| crate::types::Error::Other(anyhow::anyhow!("Failed to read index meta: {}", e)))?;
321 let meta: IndexMeta = serde_json::from_slice(&meta_json)
322 .map_err(|e| crate::types::Error::Other(anyhow::anyhow!("Failed to parse index meta: {}", e)))?;
323 if meta.version != 1 {
324 return Err(crate::types::Error::Other(anyhow::anyhow!("Unsupported index version: {}", meta.version)));
325 }
326
327 // Load dimension
328 let dim_data = fs::read(&dim_path)
329 .map_err(|e| crate::types::Error::Other(anyhow::anyhow!("Failed to read dimension cache: {}", e)))?;
330 let cached_dimension: usize = bincode::deserialize(&dim_data)
331 .map_err(|e| crate::types::Error::Other(anyhow::anyhow!("Failed to deserialize dimension: {}", e)))?;
332
333 // Load spans
334 let spans_data = fs::read(&spans_path)
335 .map_err(|e| crate::types::Error::Other(anyhow::anyhow!("Failed to read spans cache: {}", e)))?;
336 // Validate checksum
337 let mut hasher = Sha256::new();
338 hasher.update(&spans_data);
339 let actual_checksum = format!("{:x}", hasher.finalize());
340 if actual_checksum != meta.spans_checksum {
341 return Err(crate::types::Error::Other(anyhow::anyhow!(
342 "Index cache checksum mismatch"
343 )));
344 }
345 let spans: Vec<Span> = bincode::deserialize(&spans_data)
346 .map_err(|e| crate::types::Error::Other(anyhow::anyhow!("Failed to deserialize spans: {}", e)))?;
347
348 // Basic validation: ensure cached dimension matches spans embeddings
349 let actual_dimension = spans
350 .iter()
351 .find_map(|s| s.embedding.as_ref().map(|e| e.len()))
352 .unwrap_or(cached_dimension);
353 if actual_dimension != cached_dimension {
354 return Err(crate::types::Error::Other(anyhow::anyhow!(
355 "Cached index dimension mismatch (expected {}, found {})",
356 cached_dimension,
357 actual_dimension
358 )));
359 }
360
361 // Rebuild HNSW from cached spans
362 //
363 // LIMITATION: The `hnsw_rs` library has lifetime constraints that prevent us from
364 // directly loading and storing the HNSW structure. The loaded HNSW has a lifetime
365 // tied to HnswIo, which conflicts with our need for 'static lifetime in VectorIndex.
366 //
367 // WORKAROUND: We cache spans (fast to load) and rebuild HNSW (slower but necessary).
368 // This is still faster than loading spans from SQLite, but rebuilding HNSW from
369 // 50K+ spans can take 1-2 minutes.
370 //
371 // RECOMMENDATION: For large repositories, use server mode which keeps the index
372 // in memory across queries, avoiding rebuilds entirely.
373 //
374 // FUTURE: Consider alternative HNSW libraries (hora, instant-distance) or contribute
375 // serialization improvements to hnsw_rs that support owned HNSW structures.
376 Ok(Some(Self::build(spans)))
377 }
378}
379
/// On-disk metadata describing a persisted index (`index.meta.json`).
/// Field names are the JSON schema; do not rename without bumping `version`.
#[derive(Debug, Serialize, Deserialize)]
struct IndexMeta {
    /// Cache format version; loaders reject anything other than 1.
    version: u32,
    /// Name of the ANN backend that produced the dump ("hnsw_rs").
    backend: String,
    /// Number of spans serialized into `spans.bin`.
    span_count: usize,
    /// Embedding dimension used by the index.
    dimension: usize,
    /// Hex-encoded SHA-256 of the serialized spans, for corruption detection.
    spans_checksum: String,
}
388
/// Calculate cosine similarity between two vectors
///
/// # Arguments
///
/// * `a` - First vector
/// * `b` - Second vector
///
/// # Returns
///
/// Cosine similarity in [-1.0, 1.0]: 1.0 = same direction, 0.0 = orthogonal,
/// -1.0 = opposite. (The previous doc claimed [0.0, 1.0], but opposite vectors
/// genuinely yield -1.0 — see the `test_cosine_similarity_opposite` unit test.)
/// Returns 0.0 when the vectors differ in length or either has zero magnitude.
///
/// # Formula
///
/// ```text
/// cosine_sim(a, b) = (a · b) / (||a|| * ||b||)
/// ```
pub fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    // Mismatched dimensions cannot be compared meaningfully.
    if a.len() != b.len() {
        return 0.0;
    }

    let dot: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
    let norm_a: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
    let norm_b: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();

    // Guard against division by zero for zero-magnitude vectors.
    if norm_a == 0.0 || norm_b == 0.0 {
        return 0.0;
    }

    dot / (norm_a * norm_b)
}
420
#[cfg(test)]
mod tests {
    use super::*;
    use uuid::Uuid;

    /// Build a minimal span carrying the given embedding.
    fn create_test_span(embedding: Vec<f32>) -> Span {
        Span {
            id: Uuid::new_v4().to_string(),
            artifact_id: "test".to_string(),
            start_line: 1,
            end_line: 10,
            text: "test text".to_string(),
            embedding: Some(embedding),
            embedding_model: Some("test".to_string()),
            token_count: 10,
            metadata: None,
        }
    }

    #[test]
    fn test_cosine_similarity_identical() {
        // A vector compared against itself must score ~1.
        let v = vec![1.0, 0.0, 0.0];
        assert!((cosine_similarity(&v, &v) - 1.0).abs() < 0.0001);
    }

    #[test]
    fn test_cosine_similarity_orthogonal() {
        // Perpendicular vectors score ~0.
        let x_axis = vec![1.0, 0.0];
        let y_axis = vec![0.0, 1.0];
        assert!(cosine_similarity(&x_axis, &y_axis).abs() < 0.0001);
    }

    #[test]
    fn test_cosine_similarity_opposite() {
        // Anti-parallel vectors score ~-1.
        let pos = vec![1.0, 0.0];
        let neg = vec![-1.0, 0.0];
        assert!((cosine_similarity(&pos, &neg) + 1.0).abs() < 0.0001);
    }

    #[test]
    fn test_vector_index_search() {
        let index = VectorIndex::build(vec![
            create_test_span(vec![1.0, 0.0, 0.0]),
            create_test_span(vec![0.0, 1.0, 0.0]),
            create_test_span(vec![0.9, 0.1, 0.0]),
        ]);
        assert_eq!(index.len(), 3);

        let hits = index.search(&[1.0, 0.0, 0.0], 2).unwrap();
        assert_eq!(hits.len(), 2);
        // Results come back best-first.
        assert!(hits[0].score > hits[1].score);
    }

    #[test]
    fn test_empty_index() {
        let index = VectorIndex::build(Vec::new());
        assert!(index.is_empty());
        assert!(index.search(&[1.0, 0.0], 10).unwrap().is_empty());
    }
}
491}