// codelens_engine/embedding_store.rs
//! Abstraction layer for vector embedding storage.
//! Default implementation uses sqlite-vec; trait allows future swap to Qdrant/LanceDB.

use anyhow::Result;
use serde::Serialize;
use std::collections::BTreeMap;

8/// A single embedding chunk ready for storage.
9#[derive(Debug, Clone)]
10pub struct EmbeddingChunk {
11    pub file_path: String,
12    pub symbol_name: String,
13    pub kind: String,
14    pub line: usize,
15    pub signature: String,
16    pub name_path: String,
17    pub text: String,
18    /// Primary embedding: code signature + identifier split
19    pub embedding: Vec<f32>,
20    /// Optional secondary embedding: docstring/comment (for dual-vector search)
21    pub doc_embedding: Option<Vec<f32>>,
22}
23
24/// Result of a vector similarity search.
25#[derive(Debug, Clone, Serialize)]
26pub struct ScoredChunk {
27    pub file_path: String,
28    pub symbol_name: String,
29    pub kind: String,
30    pub line: usize,
31    pub signature: String,
32    pub name_path: String,
33    pub score: f64,
34}
35
36/// Trait for vector embedding storage backends.
37/// Implementations handle persistence, indexing, and similarity search.
38pub trait EmbeddingStore: Send + Sync {
39    /// Insert or update embedding chunks. Replaces ALL existing entries.
40    fn upsert(&self, chunks: &[EmbeddingChunk]) -> Result<usize>;
41
42    /// Append embedding chunks without clearing existing data.
43    fn insert(&self, chunks: &[EmbeddingChunk]) -> Result<usize>;
44
45    /// Search for chunks similar to the query embedding vector.
46    fn search(&self, query_vec: &[f32], top_k: usize) -> Result<Vec<ScoredChunk>>;
47
48    /// Dual-vector search: blend code embedding score with doc embedding score.
49    /// `doc_weight` controls the balance (0.0 = code only, 1.0 = doc only).
50    fn search_dual(
51        &self,
52        query_vec: &[f32],
53        top_k: usize,
54        doc_weight: f64,
55    ) -> Result<Vec<ScoredChunk>> {
56        // Default: fallback to single-vector search
57        let _ = doc_weight;
58        self.search(query_vec, top_k)
59    }
60
61    /// Delete all embeddings for files matching the given paths.
62    fn delete_by_file(&self, file_paths: &[&str]) -> Result<usize>;
63
64    /// Clear all stored embeddings.
65    fn clear(&self) -> Result<()>;
66
67    /// Number of stored chunks.
68    fn count(&self) -> Result<usize>;
69
70    /// Retrieve a single stored chunk and embedding by symbol identity.
71    fn get_embedding(
72        &self,
73        _file_path: &str,
74        _symbol_name: &str,
75    ) -> Result<Option<EmbeddingChunk>> {
76        Ok(None)
77    }
78
79    /// Retrieve stored chunks matching previously ranked search results so
80    /// callers can batch exact-vector follow-up work without per-result lookups.
81    fn embeddings_for_scored_chunks(&self, chunks: &[ScoredChunk]) -> Result<Vec<EmbeddingChunk>> {
82        let mut resolved = Vec::with_capacity(chunks.len());
83        for chunk in chunks {
84            if let Some(embedding) = self.get_embedding(&chunk.file_path, &chunk.symbol_name)? {
85                resolved.push(embedding);
86            }
87        }
88        Ok(resolved)
89    }
90
91    /// Retrieve all stored chunks with their embeddings for batch analysis.
92    fn all_with_embeddings(&self) -> Result<Vec<EmbeddingChunk>> {
93        Ok(Vec::new()) // Default: not supported
94    }
95
96    /// Retrieve stored chunks for the given files so incremental indexing can
97    /// reuse unchanged embeddings without materializing the full index.
98    fn embeddings_for_files(&self, file_paths: &[&str]) -> Result<Vec<EmbeddingChunk>> {
99        let file_set: std::collections::BTreeSet<&str> = file_paths.iter().copied().collect();
100        Ok(self
101            .all_with_embeddings()?
102            .into_iter()
103            .filter(|chunk| file_set.contains(chunk.file_path.as_str()))
104            .collect())
105    }
106
107    /// Stream stored chunks in bounded batches so callers can avoid loading the
108    /// entire embedding index into memory.
109    fn for_each_embedding_batch(
110        &self,
111        batch_size: usize,
112        visitor: &mut dyn FnMut(Vec<EmbeddingChunk>) -> Result<()>,
113    ) -> Result<()> {
114        if batch_size == 0 {
115            return Ok(());
116        }
117
118        let all = self.all_with_embeddings()?;
119        for chunk_batch in all.chunks(batch_size) {
120            visitor(chunk_batch.to_vec())?;
121        }
122        Ok(())
123    }
124
125    /// Stream stored chunks grouped by file path for per-file analysis without
126    /// requiring callers to materialize the entire index first.
127    /// Full and incremental reindex reconciliation rely on this grouping.
128    fn for_each_file_embeddings(
129        &self,
130        visitor: &mut dyn FnMut(String, Vec<EmbeddingChunk>) -> Result<()>,
131    ) -> Result<()> {
132        let mut by_file: BTreeMap<String, Vec<EmbeddingChunk>> = BTreeMap::new();
133        for chunk in self.all_with_embeddings()? {
134            by_file
135                .entry(chunk.file_path.clone())
136                .or_default()
137                .push(chunk);
138        }
139        for (file_path, chunks) in by_file {
140            visitor(file_path, chunks)?;
141        }
142        Ok(())
143    }
144}