Skip to main content

codelens_engine/embedding/
engine_impl.rs

1use crate::db::IndexDb;
2use crate::embedding_store::{EmbeddingChunk, ScoredChunk};
3use crate::project::ProjectRoot;
4use anyhow::{Context, Result};
5use fastembed::TextEmbedding;
6use sha2::{Digest, Sha256};
7use std::collections::{HashMap, HashSet};
8use std::sync::Arc;
9
10use super::cache::{
11    ReusableEmbeddingKey, TextEmbeddingCache, reusable_embedding_key_for_chunk,
12    reusable_embedding_key_for_symbol,
13};
14use super::chunk_ops::{
15    CategoryScore, DuplicatePair, OutlierSymbol, StoredChunkKey, cosine_similarity,
16    duplicate_candidate_limit, duplicate_pair_key, stored_chunk_key, stored_chunk_key_for_score,
17};
18use super::ffi;
19use super::prompt::{
20    build_embedding_text, extract_leading_doc, is_test_only_symbol, split_identifier,
21};
22use super::runtime::{configured_rerank_blend, embed_batch_size, max_embed_symbols};
23use super::vec_store::SqliteVecStore;
24use super::{
25    CHANGED_FILE_QUERY_CHUNK, DEFAULT_DUPLICATE_SCAN_BATCH_SIZE, EmbeddingEngine,
26    EmbeddingFreshnessReport, EmbeddingIndexInfo, EmbeddingRuntimeInfo, QueryEmbeddingCacheStats,
27    SemanticMatch,
28};
29use rusqlite::Connection;
30
31impl EmbeddingEngine {
32    fn configured_query_embed_cache_size() -> usize {
33        std::env::var("CODELENS_QUERY_EMBED_CACHE_SIZE")
34            .ok()
35            .and_then(|value| value.trim().parse::<usize>().ok())
36            .unwrap_or(4096)
37            .min(50_000)
38    }
39
40    fn normalize_query_for_cache(query: &str) -> String {
41        query.split_whitespace().collect::<Vec<_>>().join(" ")
42    }
43
44    fn query_cache_key(&self, query: &str) -> String {
45        let normalized = Self::normalize_query_for_cache(query);
46        let mut hasher = Sha256::new();
47        hasher.update(b"cache-v1\n");
48        hasher.update(self.model_name.as_bytes());
49        hasher.update(b"\n");
50        hasher.update(self.runtime_info.backend.as_bytes());
51        hasher.update(b"\n");
52        hasher.update(self.runtime_info.max_length.to_string().as_bytes());
53        hasher.update(b"\n");
54        hasher.update(normalized.as_bytes());
55        format!("{:x}", hasher.finalize())
56    }
57
58    fn embed_texts_cached(&self, texts: &[&str]) -> Result<Vec<Vec<f32>>> {
59        if texts.is_empty() {
60            return Ok(Vec::new());
61        }
62
63        let mut resolved: Vec<Option<Vec<f32>>> = vec![None; texts.len()];
64        let mut missing_order: Vec<String> = Vec::new();
65        let mut missing_positions: HashMap<String, Vec<usize>> = HashMap::new();
66
67        {
68            let mut cache = self
69                .text_embed_cache
70                .lock()
71                .map_err(|_| anyhow::anyhow!("text embedding cache lock"))?;
72            for (index, text) in texts.iter().enumerate() {
73                if let Some(cached) = cache.get(text) {
74                    resolved[index] = Some(cached);
75                } else {
76                    let key = (*text).to_owned();
77                    if !missing_positions.contains_key(&key) {
78                        missing_order.push(key.clone());
79                    }
80                    missing_positions.entry(key).or_default().push(index);
81                }
82            }
83        }
84
85        if !missing_order.is_empty() {
86            let missing_refs: Vec<&str> = missing_order.iter().map(String::as_str).collect();
87            let embeddings = self
88                .model
89                .lock()
90                .map_err(|_| anyhow::anyhow!("model lock"))?
91                .embed(missing_refs, None)
92                .context("text embedding failed")?;
93
94            let mut cache = self
95                .text_embed_cache
96                .lock()
97                .map_err(|_| anyhow::anyhow!("text embedding cache lock"))?;
98            for (text, embedding) in missing_order.into_iter().zip(embeddings) {
99                cache.insert(text.clone(), embedding.clone());
100                if let Some(indices) = missing_positions.remove(&text) {
101                    for index in indices {
102                        resolved[index] = Some(embedding.clone());
103                    }
104                }
105            }
106        }
107
108        resolved
109            .into_iter()
110            .map(|item| item.ok_or_else(|| anyhow::anyhow!("missing embedding cache entry")))
111            .collect()
112    }
113
114    pub fn embed_query_cached(&self, query: &str) -> Result<Vec<f32>> {
115        let max_entries = Self::configured_query_embed_cache_size();
116        if max_entries == 0 {
117            return self
118                .embed_texts_cached(&[query])?
119                .into_iter()
120                .next()
121                .ok_or_else(|| anyhow::anyhow!("missing query embedding"));
122        }
123        let normalized = Self::normalize_query_for_cache(query);
124        let cache_key = self.query_cache_key(&normalized);
125        if let Some(embedding) = self.store.get_query_embedding(&cache_key)? {
126            return Ok(embedding);
127        }
128        let embedding = self
129            .embed_texts_cached(&[normalized.as_str()])?
130            .into_iter()
131            .next()
132            .ok_or_else(|| anyhow::anyhow!("missing query embedding"))?;
133        self.store
134            .put_query_embedding(&cache_key, &normalized, &embedding)?;
135        let _ = self.store.prune_query_embeddings(max_entries)?;
136        Ok(embedding)
137    }
138
139    pub fn prewarm_queries(&self, queries: &[String]) -> Result<usize> {
140        let max_entries = Self::configured_query_embed_cache_size();
141        if max_entries == 0 || queries.is_empty() {
142            return Ok(0);
143        }
144        let mut prewarmed = 0usize;
145        for query in queries {
146            if query.trim().is_empty() {
147                continue;
148            }
149            let _ = self.embed_query_cached(query)?;
150            prewarmed += 1;
151        }
152        Ok(prewarmed)
153    }
154
155    pub fn query_cache_stats(&self) -> Result<QueryEmbeddingCacheStats> {
156        let max_entries = Self::configured_query_embed_cache_size();
157        let entries = if max_entries == 0 {
158            0
159        } else {
160            self.store.query_cache_count()?
161        };
162        Ok(QueryEmbeddingCacheStats {
163            enabled: max_entries > 0,
164            entries,
165            max_entries,
166        })
167    }
168
169    pub fn new(project: &ProjectRoot) -> Result<Self> {
170        let (model, dimension, model_name, runtime_info) = super::runtime::load_codesearch_model()?;
171
172        let db_dir = project.as_path().join(".codelens/index");
173        std::fs::create_dir_all(&db_dir)?;
174        let db_path = db_dir.join("embeddings.db");
175
176        let store = SqliteVecStore::new(&db_path, dimension, &model_name)?;
177
178        Ok(Self {
179            model: std::sync::Mutex::new(model),
180            store,
181            model_name,
182            runtime_info,
183            text_embed_cache: std::sync::Mutex::new(TextEmbeddingCache::new(
184                super::runtime::configured_embedding_text_cache_size(),
185            )),
186            indexing: std::sync::atomic::AtomicBool::new(false),
187        })
188    }
189
190    pub fn model_name(&self) -> &str {
191        &self.model_name
192    }
193
194    pub fn runtime_info(&self) -> &EmbeddingRuntimeInfo {
195        &self.runtime_info
196    }
197
198    /// Index all symbols from the project's symbol database into the embedding index.
199    ///
200    /// Reconciles the embedding store file-by-file so unchanged symbols can
201    /// reuse their existing vectors and only changed/new symbols are re-embedded.
202    /// Caps at a configurable max to prevent runaway on huge projects.
203    /// Returns true if a full reindex is currently in progress.
204    pub fn is_indexing(&self) -> bool {
205        self.indexing.load(std::sync::atomic::Ordering::Relaxed)
206    }
207
208    pub fn index_from_project(&self, project: &ProjectRoot) -> Result<usize> {
209        // Guard against concurrent full reindex (14s+ operation)
210        if self
211            .indexing
212            .compare_exchange(
213                false,
214                true,
215                std::sync::atomic::Ordering::AcqRel,
216                std::sync::atomic::Ordering::Relaxed,
217            )
218            .is_err()
219        {
220            anyhow::bail!(
221                "Embedding indexing already in progress — wait for the current run to complete before retrying."
222            );
223        }
224        // RAII guard to reset the flag on any exit path
225        struct IndexGuard<'a>(&'a std::sync::atomic::AtomicBool);
226        impl Drop for IndexGuard<'_> {
227            fn drop(&mut self) {
228                self.0.store(false, std::sync::atomic::Ordering::Release);
229            }
230        }
231        let _guard = IndexGuard(&self.indexing);
232
233        let db_path = crate::db::index_db_path(project.as_path());
234        let symbol_db = IndexDb::open(&db_path)?;
235        let batch_size = embed_batch_size();
236        let max_symbols = max_embed_symbols();
237        let mut total_indexed = 0usize;
238        let mut total_seen = 0usize;
239        let mut model = None;
240        let mut existing_embeddings: HashMap<
241            String,
242            HashMap<ReusableEmbeddingKey, EmbeddingChunk>,
243        > = HashMap::new();
244        let mut current_db_files = HashSet::new();
245        let mut capped = false;
246
247        self.store
248            .for_each_file_embeddings(&mut |file_path, chunks| {
249                existing_embeddings.insert(
250                    file_path,
251                    chunks
252                        .into_iter()
253                        .map(|chunk| (reusable_embedding_key_for_chunk(&chunk), chunk))
254                        .collect(),
255                );
256                Ok(())
257            })?;
258
259        symbol_db.for_each_file_symbols_with_bytes(|file_path, symbols| {
260            current_db_files.insert(file_path.clone());
261            if capped {
262                return Ok(());
263            }
264
265            let source = std::fs::read_to_string(project.as_path().join(&file_path)).ok();
266            let relevant_symbols: Vec<_> = symbols
267                .into_iter()
268                .filter(|sym| !is_test_only_symbol(sym, source.as_deref()))
269                .collect();
270
271            if relevant_symbols.is_empty() {
272                self.store.delete_by_file(&[file_path.as_str()])?;
273                existing_embeddings.remove(&file_path);
274                return Ok(());
275            }
276
277            if total_seen + relevant_symbols.len() > max_symbols {
278                capped = true;
279                return Ok(());
280            }
281            total_seen += relevant_symbols.len();
282
283            let existing_for_file = existing_embeddings.remove(&file_path).unwrap_or_default();
284            total_indexed += self.reconcile_file_embeddings(
285                &file_path,
286                relevant_symbols,
287                source.as_deref(),
288                existing_for_file,
289                batch_size,
290                &mut model,
291            )?;
292            Ok(())
293        })?;
294
295        let removed_files: Vec<String> = existing_embeddings
296            .into_keys()
297            .filter(|file_path| !current_db_files.contains(file_path))
298            .collect();
299        if !removed_files.is_empty() {
300            let removed_refs: Vec<&str> = removed_files.iter().map(String::as_str).collect();
301            self.store.delete_by_file(&removed_refs)?;
302        }
303
304        Ok(total_indexed)
305    }
306
307    pub fn ensure_index_fresh_for_project(
308        &self,
309        project: &ProjectRoot,
310    ) -> Result<EmbeddingFreshnessReport> {
311        if self
312            .indexing
313            .compare_exchange(
314                false,
315                true,
316                std::sync::atomic::Ordering::AcqRel,
317                std::sync::atomic::Ordering::Relaxed,
318            )
319            .is_err()
320        {
321            anyhow::bail!(
322                "Embedding indexing already in progress — wait for the current run to complete before retrying."
323            );
324        }
325
326        struct IndexGuard<'a>(&'a std::sync::atomic::AtomicBool);
327        impl Drop for IndexGuard<'_> {
328            fn drop(&mut self) {
329                self.0.store(false, std::sync::atomic::Ordering::Release);
330            }
331        }
332        let _guard = IndexGuard(&self.indexing);
333
334        let db_path = crate::db::index_db_path(project.as_path());
335        let symbol_db = IndexDb::open(&db_path)?;
336        let batch_size = embed_batch_size();
337        let mut report = EmbeddingFreshnessReport::default();
338        let mut existing_embeddings: HashMap<
339            String,
340            HashMap<ReusableEmbeddingKey, EmbeddingChunk>,
341        > = HashMap::new();
342        let mut current_db_files = HashSet::new();
343        let mut model = None;
344
345        self.store
346            .for_each_file_embeddings(&mut |file_path, chunks| {
347                existing_embeddings.insert(
348                    file_path,
349                    chunks
350                        .into_iter()
351                        .map(|chunk| (reusable_embedding_key_for_chunk(&chunk), chunk))
352                        .collect(),
353                );
354                Ok(())
355            })?;
356
357        if existing_embeddings.is_empty() {
358            return Ok(report);
359        }
360
361        symbol_db.for_each_file_symbols_with_bytes(|file_path, symbols| {
362            current_db_files.insert(file_path.clone());
363            let Some(existing_for_file) = existing_embeddings.get(&file_path) else {
364                report.skipped_new_files += 1;
365                return Ok(());
366            };
367
368            report.checked_files += 1;
369            let source = std::fs::read_to_string(project.as_path().join(&file_path)).ok();
370            let relevant_symbols: Vec<_> = symbols
371                .into_iter()
372                .filter(|sym| !is_test_only_symbol(sym, source.as_deref()))
373                .collect();
374
375            if relevant_symbols.is_empty() {
376                self.store.delete_by_file(&[file_path.as_str()])?;
377                existing_embeddings.remove(&file_path);
378                report.refreshed_files += 1;
379                return Ok(());
380            }
381
382            let current_keys = relevant_symbols
383                .iter()
384                .map(|sym| {
385                    let text = build_embedding_text(sym, source.as_deref());
386                    reusable_embedding_key_for_symbol(sym, &text)
387                })
388                .collect::<HashSet<_>>();
389            let stored_keys = existing_for_file.keys().cloned().collect::<HashSet<_>>();
390
391            if current_keys == stored_keys {
392                existing_embeddings.remove(&file_path);
393                report.unchanged_files += 1;
394                return Ok(());
395            }
396
397            let existing_for_file = existing_embeddings.remove(&file_path).unwrap_or_default();
398            report.indexed_symbols += self.reconcile_file_embeddings(
399                &file_path,
400                relevant_symbols,
401                source.as_deref(),
402                existing_for_file,
403                batch_size,
404                &mut model,
405            )?;
406            report.refreshed_files += 1;
407            Ok(())
408        })?;
409
410        let removed_files: Vec<String> = existing_embeddings
411            .into_keys()
412            .filter(|file_path| !current_db_files.contains(file_path))
413            .collect();
414        if !removed_files.is_empty() {
415            let removed_refs: Vec<&str> = removed_files.iter().map(String::as_str).collect();
416            report.removed_files = self.store.delete_by_file(&removed_refs)?;
417        }
418
419        Ok(report)
420    }
421
422    /// Extract NL→code bridge candidates from indexed symbols.
423    /// For each symbol with a docstring, produces a (docstring_first_line, symbol_name) pair.
424    /// The caller writes these to `.codelens/bridges.json` for project-specific NL bridging.
425    pub fn generate_bridge_candidates(
426        &self,
427        project: &ProjectRoot,
428    ) -> Result<Vec<(String, String)>> {
429        let db_path = crate::db::index_db_path(project.as_path());
430        let symbol_db = IndexDb::open(&db_path)?;
431        let mut bridges: Vec<(String, String)> = Vec::new();
432        let mut seen_nl = HashSet::new();
433
434        symbol_db.for_each_file_symbols_with_bytes(|file_path, symbols| {
435            let source = std::fs::read_to_string(project.as_path().join(&file_path)).ok();
436            for sym in &symbols {
437                if is_test_only_symbol(sym, source.as_deref()) {
438                    continue;
439                }
440                let doc = source.as_deref().and_then(|src| {
441                    extract_leading_doc(src, sym.start_byte as usize, sym.end_byte as usize)
442                });
443                let doc = match doc {
444                    Some(d) if !d.is_empty() => d,
445                    _ => continue,
446                };
447
448                // Build code term: symbol_name + split words
449                let split = split_identifier(&sym.name);
450                let code_term = if split != sym.name {
451                    format!("{} {}", sym.name, split)
452                } else {
453                    sym.name.clone()
454                };
455
456                // Extract short NL phrases (3-6 words) from the docstring.
457                // This produces multiple bridge entries per symbol, each matching
458                // common NL query patterns like "render template" or "parse url".
459                let first_line = doc.lines().next().unwrap_or("").trim().to_lowercase();
460                // Remove trailing period/punctuation
461                let clean = first_line.trim_end_matches(|c: char| c.is_ascii_punctuation());
462                let words: Vec<&str> = clean.split_whitespace().collect();
463                if words.len() < 2 {
464                    continue;
465                }
466
467                // Generate short N-gram keys (2-4 words from the start)
468                for window in 2..=words.len().min(4) {
469                    let key = words[..window].join(" ");
470                    if key.len() < 5 || key.len() > 60 {
471                        continue;
472                    }
473                    if seen_nl.insert(key.clone()) {
474                        bridges.push((key, code_term.clone()));
475                    }
476                }
477
478                // Also add split_identifier words as a bridge key
479                // so "render template" → render_template
480                if split != sym.name && !seen_nl.contains(&split.to_lowercase()) {
481                    let lowered = split.to_lowercase();
482                    if lowered.split_whitespace().count() >= 2 && seen_nl.insert(lowered.clone()) {
483                        bridges.push((lowered, code_term.clone()));
484                    }
485                }
486            }
487            Ok(())
488        })?;
489
490        Ok(bridges)
491    }
492
493    fn reconcile_file_embeddings<'a>(
494        &'a self,
495        file_path: &str,
496        symbols: Vec<crate::db::SymbolWithFile>,
497        source: Option<&str>,
498        mut existing_embeddings: HashMap<ReusableEmbeddingKey, EmbeddingChunk>,
499        batch_size: usize,
500        model: &mut Option<std::sync::MutexGuard<'a, TextEmbedding>>,
501    ) -> Result<usize> {
502        let mut reconciled_chunks = Vec::with_capacity(symbols.len());
503        let mut batch_texts: Vec<String> = Vec::with_capacity(batch_size);
504        let mut batch_meta: Vec<crate::db::SymbolWithFile> = Vec::with_capacity(batch_size);
505
506        for sym in symbols {
507            let text = build_embedding_text(&sym, source);
508            if let Some(existing) =
509                existing_embeddings.remove(&reusable_embedding_key_for_symbol(&sym, &text))
510            {
511                reconciled_chunks.push(EmbeddingChunk {
512                    file_path: sym.file_path.clone(),
513                    symbol_name: sym.name.clone(),
514                    kind: sym.kind.clone(),
515                    line: sym.line as usize,
516                    signature: sym.signature.clone(),
517                    name_path: sym.name_path.clone(),
518                    text,
519                    embedding: existing.embedding,
520                    doc_embedding: existing.doc_embedding,
521                });
522                continue;
523            }
524
525            batch_texts.push(text);
526            batch_meta.push(sym);
527
528            if batch_texts.len() >= batch_size {
529                if model.is_none() {
530                    *model = Some(
531                        self.model
532                            .lock()
533                            .map_err(|_| anyhow::anyhow!("model lock"))?,
534                    );
535                }
536                reconciled_chunks.extend(Self::embed_chunks(
537                    model.as_mut().expect("model lock initialized"),
538                    &batch_texts,
539                    &batch_meta,
540                )?);
541                batch_texts.clear();
542                batch_meta.clear();
543            }
544        }
545
546        if !batch_texts.is_empty() {
547            if model.is_none() {
548                *model = Some(
549                    self.model
550                        .lock()
551                        .map_err(|_| anyhow::anyhow!("model lock"))?,
552                );
553            }
554            reconciled_chunks.extend(Self::embed_chunks(
555                model.as_mut().expect("model lock initialized"),
556                &batch_texts,
557                &batch_meta,
558            )?);
559        }
560
561        self.store.delete_by_file(&[file_path])?;
562        if reconciled_chunks.is_empty() {
563            return Ok(0);
564        }
565        self.store.insert(&reconciled_chunks)
566    }
567
568    fn embed_chunks(
569        model: &mut TextEmbedding,
570        texts: &[String],
571        meta: &[crate::db::SymbolWithFile],
572    ) -> Result<Vec<EmbeddingChunk>> {
573        let batch_refs: Vec<&str> = texts.iter().map(|s| s.as_str()).collect();
574        let embeddings = model.embed(batch_refs, None).context("embedding failed")?;
575
576        Ok(meta
577            .iter()
578            .zip(embeddings)
579            .zip(texts.iter())
580            .map(|((sym, emb), text)| EmbeddingChunk {
581                file_path: sym.file_path.clone(),
582                symbol_name: sym.name.clone(),
583                kind: sym.kind.clone(),
584                line: sym.line as usize,
585                signature: sym.signature.clone(),
586                name_path: sym.name_path.clone(),
587                text: text.clone(),
588                embedding: emb,
589                doc_embedding: None,
590            })
591            .collect())
592    }
593
594    /// Embed one batch of texts and upsert immediately, then the caller drops the batch.
595    fn flush_batch(
596        model: &mut TextEmbedding,
597        store: &SqliteVecStore,
598        texts: &[String],
599        meta: &[crate::db::SymbolWithFile],
600    ) -> Result<usize> {
601        let chunks = Self::embed_chunks(model, texts, meta)?;
602        store.insert(&chunks)
603    }
604
605    /// Search for symbols semantically similar to the query.
606    pub fn search(&self, query: &str, max_results: usize) -> Result<Vec<SemanticMatch>> {
607        let results = self.search_scored(query, max_results)?;
608        Ok(results.into_iter().map(SemanticMatch::from).collect())
609    }
610
611    /// Search returning raw ScoredChunks with optional reranking.
612    ///
613    /// Pipeline: bi-encoder → candidate pool (3× requested) → rerank → top-N.
614    /// Reranking uses query-document text overlap scoring to refine bi-encoder
615    /// cosine similarity. This catches cases where embedding similarity is high
616    /// but the actual text relevance is low (or vice versa).
617    pub fn search_scored(&self, query: &str, max_results: usize) -> Result<Vec<ScoredChunk>> {
618        let query_embedding = self.embed_query_cached(query)?;
619
620        // Fetch N× candidates for reranking headroom (default 5×, override via
621        // CODELENS_RERANK_FACTOR). More candidates = better rerank quality at
622        // marginal latency cost (sqlite-vec scan is fast).
623        let factor = std::env::var("CODELENS_RERANK_FACTOR")
624            .ok()
625            .and_then(|v| v.parse::<usize>().ok())
626            .unwrap_or(5);
627        let candidate_count = max_results.saturating_mul(factor).max(max_results);
628        let mut candidates = self.store.search(&query_embedding, candidate_count)?;
629
630        if candidates.len() <= max_results {
631            return Ok(candidates);
632        }
633
634        // Lightweight rerank: blend bi-encoder score with text overlap signal.
635        // This is a stopgap until a proper cross-encoder is plugged in.
636        let query_lower = query.to_lowercase();
637        let query_tokens: Vec<&str> = query_lower
638            .split(|c: char| c.is_whitespace() || c == '_' || c == '-')
639            .filter(|t| t.len() >= 2)
640            .collect();
641
642        if query_tokens.is_empty() {
643            candidates.truncate(max_results);
644            return Ok(candidates);
645        }
646
647        let blend = configured_rerank_blend();
648        for chunk in &mut candidates {
649            // Build searchable text: symbol_name + split identifier words +
650            // name_path (parent context) + signature + file_path.
651            // split_identifier turns "parseSymbols" into "parse Symbols" for
652            // better NL token matching.
653            let split_name = split_identifier(&chunk.symbol_name);
654            let searchable = format!(
655                "{} {} {} {} {}",
656                chunk.symbol_name.to_lowercase(),
657                split_name.to_lowercase(),
658                chunk.name_path.to_lowercase(),
659                chunk.signature.to_lowercase(),
660                chunk.file_path.to_lowercase(),
661            );
662            let overlap = query_tokens
663                .iter()
664                .filter(|t| searchable.contains(**t))
665                .count() as f64;
666            let overlap_ratio = overlap / query_tokens.len().max(1) as f64;
667            // Blend: configurable bi-encoder + text overlap (default 75/25)
668            chunk.score = chunk.score * blend + overlap_ratio * (1.0 - blend);
669        }
670
671        candidates.sort_by(|a, b| {
672            b.score
673                .partial_cmp(&a.score)
674                .unwrap_or(std::cmp::Ordering::Equal)
675        });
676        candidates.truncate(max_results);
677        Ok(candidates)
678    }
679
680    /// Incrementally re-index only the given files.
681    pub fn index_changed_files(
682        &self,
683        project: &ProjectRoot,
684        changed_files: &[&str],
685    ) -> Result<usize> {
686        if changed_files.is_empty() {
687            return Ok(0);
688        }
689        let batch_size = embed_batch_size();
690        let mut existing_embeddings: HashMap<ReusableEmbeddingKey, EmbeddingChunk> = HashMap::new();
691        for file_chunk in changed_files.chunks(CHANGED_FILE_QUERY_CHUNK) {
692            for chunk in self.store.embeddings_for_files(file_chunk)? {
693                existing_embeddings.insert(reusable_embedding_key_for_chunk(&chunk), chunk);
694            }
695        }
696        self.store.delete_by_file(changed_files)?;
697
698        let db_path = crate::db::index_db_path(project.as_path());
699        let symbol_db = IndexDb::open(&db_path)?;
700
701        let mut total_indexed = 0usize;
702        let mut batch_texts: Vec<String> = Vec::with_capacity(batch_size);
703        let mut batch_meta: Vec<crate::db::SymbolWithFile> = Vec::with_capacity(batch_size);
704        let mut batch_reused: Vec<EmbeddingChunk> = Vec::with_capacity(batch_size);
705        let mut file_cache: std::collections::HashMap<String, Option<String>> =
706            std::collections::HashMap::new();
707        let mut model = None;
708
709        for file_chunk in changed_files.chunks(CHANGED_FILE_QUERY_CHUNK) {
710            let relevant = symbol_db.symbols_for_files(file_chunk)?;
711            for sym in relevant {
712                let source = file_cache.entry(sym.file_path.clone()).or_insert_with(|| {
713                    std::fs::read_to_string(project.as_path().join(&sym.file_path)).ok()
714                });
715                if is_test_only_symbol(&sym, source.as_deref()) {
716                    continue;
717                }
718                let text = build_embedding_text(&sym, source.as_deref());
719                if let Some(existing) =
720                    existing_embeddings.remove(&reusable_embedding_key_for_symbol(&sym, &text))
721                {
722                    batch_reused.push(EmbeddingChunk {
723                        file_path: sym.file_path.clone(),
724                        symbol_name: sym.name.clone(),
725                        kind: sym.kind.clone(),
726                        line: sym.line as usize,
727                        signature: sym.signature.clone(),
728                        name_path: sym.name_path.clone(),
729                        text,
730                        embedding: existing.embedding,
731                        doc_embedding: existing.doc_embedding,
732                    });
733                    if batch_reused.len() >= batch_size {
734                        total_indexed += self.store.insert(&batch_reused)?;
735                        batch_reused.clear();
736                    }
737                    continue;
738                }
739                batch_texts.push(text);
740                batch_meta.push(sym);
741
742                if batch_texts.len() >= batch_size {
743                    if model.is_none() {
744                        model = Some(
745                            self.model
746                                .lock()
747                                .map_err(|_| anyhow::anyhow!("model lock"))?,
748                        );
749                    }
750                    total_indexed += Self::flush_batch(
751                        model.as_mut().expect("model lock initialized"),
752                        &self.store,
753                        &batch_texts,
754                        &batch_meta,
755                    )?;
756                    batch_texts.clear();
757                    batch_meta.clear();
758                }
759            }
760        }
761
762        if !batch_reused.is_empty() {
763            total_indexed += self.store.insert(&batch_reused)?;
764        }
765
766        if !batch_texts.is_empty() {
767            if model.is_none() {
768                model = Some(
769                    self.model
770                        .lock()
771                        .map_err(|_| anyhow::anyhow!("model lock"))?,
772                );
773            }
774            total_indexed += Self::flush_batch(
775                model.as_mut().expect("model lock initialized"),
776                &self.store,
777                &batch_texts,
778                &batch_meta,
779            )?;
780        }
781
782        Ok(total_indexed)
783    }
784
785    /// Whether the embedding index has been populated.
786    pub fn is_indexed(&self) -> bool {
787        self.store.count().unwrap_or(0) > 0
788    }
789
790    pub fn index_info(&self) -> EmbeddingIndexInfo {
791        EmbeddingIndexInfo {
792            model_name: self.model_name.clone(),
793            indexed_symbols: self.store.count().unwrap_or(0),
794        }
795    }
796
797    pub fn inspect_existing_index(project: &ProjectRoot) -> Result<Option<EmbeddingIndexInfo>> {
798        let db_path = project.as_path().join(".codelens/index/embeddings.db");
799        if !db_path.exists() {
800            return Ok(None);
801        }
802
803        let conn =
804            crate::db::open_derived_sqlite_with_recovery(&db_path, "embedding index", || {
805                ffi::register_sqlite_vec()?;
806                let conn = Connection::open(&db_path)?;
807                conn.execute_batch("PRAGMA busy_timeout=5000;")?;
808                conn.query_row("PRAGMA schema_version", [], |_row| Ok(()))?;
809                Ok(conn)
810            })?;
811
812        let model_name: Option<String> = conn
813            .query_row(
814                "SELECT value FROM meta WHERE key = 'model' LIMIT 1",
815                [],
816                |row| row.get(0),
817            )
818            .ok();
819        let indexed_symbols: usize = conn
820            .query_row("SELECT COUNT(*) FROM symbols", [], |row| {
821                row.get::<_, i64>(0)
822            })
823            .map(|count| count.max(0) as usize)
824            .unwrap_or(0);
825
826        Ok(model_name.map(|model_name| EmbeddingIndexInfo {
827            model_name,
828            indexed_symbols,
829        }))
830    }
831
832    // ── Embedding-powered analysis ─────────────────────────────────
833
834    /// Find code symbols most similar to the given symbol.
835    pub fn find_similar_code(
836        &self,
837        file_path: &str,
838        symbol_name: &str,
839        max_results: usize,
840    ) -> Result<Vec<SemanticMatch>> {
841        let target = self
842            .store
843            .get_embedding(file_path, symbol_name)?
844            .ok_or_else(|| anyhow::anyhow!("Symbol '{}' not found in index", symbol_name))?;
845
846        let oversample = max_results.saturating_add(8).max(1);
847        let scored = self
848            .store
849            .search(&target.embedding, oversample)?
850            .into_iter()
851            .filter(|c| !(c.file_path == file_path && c.symbol_name == symbol_name))
852            .take(max_results)
853            .map(SemanticMatch::from)
854            .collect();
855        Ok(scored)
856    }
857
858    /// Find near-duplicate code pairs across the codebase.
859    /// Returns pairs with cosine similarity above the threshold (default 0.85).
860    pub fn find_duplicates(&self, threshold: f64, max_pairs: usize) -> Result<Vec<DuplicatePair>> {
861        let mut pairs = Vec::new();
862        let mut seen_pairs = HashSet::new();
863        let mut embedding_cache: HashMap<StoredChunkKey, Arc<EmbeddingChunk>> = HashMap::new();
864        let candidate_limit = duplicate_candidate_limit(max_pairs);
865        let mut done = false;
866
867        self.store
868            .for_each_embedding_batch(DEFAULT_DUPLICATE_SCAN_BATCH_SIZE, &mut |batch| {
869                if done {
870                    return Ok(());
871                }
872
873                let mut candidate_lists = Vec::with_capacity(batch.len());
874                let mut missing_candidates = Vec::new();
875                let mut missing_keys = HashSet::new();
876
877                for chunk in &batch {
878                    if pairs.len() >= max_pairs {
879                        done = true;
880                        break;
881                    }
882
883                    let filtered: Vec<ScoredChunk> = self
884                        .store
885                        .search(&chunk.embedding, candidate_limit)?
886                        .into_iter()
887                        .filter(|candidate| {
888                            !(chunk.file_path == candidate.file_path
889                                && chunk.symbol_name == candidate.symbol_name
890                                && chunk.line == candidate.line
891                                && chunk.signature == candidate.signature
892                                && chunk.name_path == candidate.name_path)
893                        })
894                        .collect();
895
896                    for candidate in &filtered {
897                        let cache_key = stored_chunk_key_for_score(candidate);
898                        if !embedding_cache.contains_key(&cache_key)
899                            && missing_keys.insert(cache_key)
900                        {
901                            missing_candidates.push(candidate.clone());
902                        }
903                    }
904
905                    candidate_lists.push(filtered);
906                }
907
908                if !missing_candidates.is_empty() {
909                    for candidate_chunk in self
910                        .store
911                        .embeddings_for_scored_chunks(&missing_candidates)?
912                    {
913                        embedding_cache
914                            .entry(stored_chunk_key(&candidate_chunk))
915                            .or_insert_with(|| Arc::new(candidate_chunk));
916                    }
917                }
918
919                for (chunk, candidates) in batch.iter().zip(candidate_lists.iter()) {
920                    if pairs.len() >= max_pairs {
921                        done = true;
922                        break;
923                    }
924
925                    for candidate in candidates {
926                        let pair_key = duplicate_pair_key(
927                            &chunk.file_path,
928                            &chunk.symbol_name,
929                            &candidate.file_path,
930                            &candidate.symbol_name,
931                        );
932                        if !seen_pairs.insert(pair_key) {
933                            continue;
934                        }
935
936                        let Some(candidate_chunk) =
937                            embedding_cache.get(&stored_chunk_key_for_score(candidate))
938                        else {
939                            continue;
940                        };
941
942                        let sim = cosine_similarity(&chunk.embedding, &candidate_chunk.embedding);
943                        if sim < threshold {
944                            continue;
945                        }
946
947                        pairs.push(DuplicatePair {
948                            symbol_a: format!("{}:{}", chunk.file_path, chunk.symbol_name),
949                            symbol_b: format!(
950                                "{}:{}",
951                                candidate_chunk.file_path, candidate_chunk.symbol_name
952                            ),
953                            file_a: chunk.file_path.clone(),
954                            file_b: candidate_chunk.file_path.clone(),
955                            line_a: chunk.line,
956                            line_b: candidate_chunk.line,
957                            similarity: sim,
958                        });
959                        if pairs.len() >= max_pairs {
960                            done = true;
961                            break;
962                        }
963                    }
964                }
965                Ok(())
966            })?;
967
968        pairs.sort_by(|a, b| {
969            b.similarity
970                .partial_cmp(&a.similarity)
971                .unwrap_or(std::cmp::Ordering::Equal)
972        });
973        Ok(pairs)
974    }
975}
976
977impl EmbeddingEngine {
978    /// Classify a code symbol into one of the given categories using zero-shot embedding similarity.
979    pub fn classify_symbol(
980        &self,
981        file_path: &str,
982        symbol_name: &str,
983        categories: &[&str],
984    ) -> Result<Vec<CategoryScore>> {
985        let target = match self.store.get_embedding(file_path, symbol_name)? {
986            Some(target) => target,
987            None => self
988                .store
989                .all_with_embeddings()?
990                .into_iter()
991                .find(|c| c.file_path == file_path && c.symbol_name == symbol_name)
992                .ok_or_else(|| anyhow::anyhow!("Symbol '{}' not found in index", symbol_name))?,
993        };
994
995        let embeddings = self.embed_texts_cached(categories)?;
996
997        let mut scores: Vec<CategoryScore> = categories
998            .iter()
999            .zip(embeddings.iter())
1000            .map(|(cat, emb)| CategoryScore {
1001                category: cat.to_string(),
1002                score: cosine_similarity(&target.embedding, emb),
1003            })
1004            .collect();
1005
1006        scores.sort_by(|a, b| {
1007            b.score
1008                .partial_cmp(&a.score)
1009                .unwrap_or(std::cmp::Ordering::Equal)
1010        });
1011        Ok(scores)
1012    }
1013
1014    /// Find symbols that are outliers — semantically distant from their file's other symbols.
1015    pub fn find_misplaced_code(&self, max_results: usize) -> Result<Vec<OutlierSymbol>> {
1016        let mut outliers = Vec::new();
1017
1018        self.store
1019            .for_each_file_embeddings(&mut |file_path, chunks| {
1020                if chunks.len() < 2 {
1021                    return Ok(());
1022                }
1023
1024                for (idx, chunk) in chunks.iter().enumerate() {
1025                    let mut sim_sum = 0.0;
1026                    let mut count = 0;
1027                    for (other_idx, other_chunk) in chunks.iter().enumerate() {
1028                        if other_idx == idx {
1029                            continue;
1030                        }
1031                        sim_sum += cosine_similarity(&chunk.embedding, &other_chunk.embedding);
1032                        count += 1;
1033                    }
1034                    if count > 0 {
1035                        let avg_sim = sim_sum / count as f64; // Lower means more misplaced.
1036                        outliers.push(OutlierSymbol {
1037                            file_path: file_path.clone(),
1038                            symbol_name: chunk.symbol_name.clone(),
1039                            kind: chunk.kind.clone(),
1040                            line: chunk.line,
1041                            avg_similarity_to_file: avg_sim,
1042                        });
1043                    }
1044                }
1045                Ok(())
1046            })?;
1047
1048        outliers.sort_by(|a, b| {
1049            a.avg_similarity_to_file
1050                .partial_cmp(&b.avg_similarity_to_file)
1051                .unwrap_or(std::cmp::Ordering::Equal)
1052        });
1053        outliers.truncate(max_results);
1054        Ok(outliers)
1055    }
1056}