1use crate::db::IndexDb;
2use crate::embedding_store::{EmbeddingChunk, ScoredChunk};
3use crate::project::ProjectRoot;
4use anyhow::{Context, Result};
5use fastembed::TextEmbedding;
6use sha2::{Digest, Sha256};
7use std::collections::{HashMap, HashSet};
8use std::sync::Arc;
9
10use super::cache::{
11 ReusableEmbeddingKey, TextEmbeddingCache, reusable_embedding_key_for_chunk,
12 reusable_embedding_key_for_symbol,
13};
14use super::chunk_ops::{
15 CategoryScore, DuplicatePair, OutlierSymbol, StoredChunkKey, cosine_similarity,
16 duplicate_candidate_limit, duplicate_pair_key, stored_chunk_key, stored_chunk_key_for_score,
17};
18use super::ffi;
19use super::prompt::{
20 build_embedding_text, extract_leading_doc, is_test_only_symbol, split_identifier,
21};
22use super::runtime::{configured_rerank_blend, embed_batch_size, max_embed_symbols};
23use super::vec_store::SqliteVecStore;
24use super::{
25 CHANGED_FILE_QUERY_CHUNK, DEFAULT_DUPLICATE_SCAN_BATCH_SIZE, EmbeddingEngine,
26 EmbeddingFreshnessReport, EmbeddingIndexInfo, EmbeddingRuntimeInfo, QueryEmbeddingCacheStats,
27 SemanticMatch,
28};
29use rusqlite::Connection;
30
31impl EmbeddingEngine {
32 fn configured_query_embed_cache_size() -> usize {
33 std::env::var("CODELENS_QUERY_EMBED_CACHE_SIZE")
34 .ok()
35 .and_then(|value| value.trim().parse::<usize>().ok())
36 .unwrap_or(4096)
37 .min(50_000)
38 }
39
40 fn normalize_query_for_cache(query: &str) -> String {
41 query.split_whitespace().collect::<Vec<_>>().join(" ")
42 }
43
44 fn query_cache_key(&self, query: &str) -> String {
45 let normalized = Self::normalize_query_for_cache(query);
46 let mut hasher = Sha256::new();
47 hasher.update(b"cache-v1\n");
48 hasher.update(self.model_name.as_bytes());
49 hasher.update(b"\n");
50 hasher.update(self.runtime_info.backend.as_bytes());
51 hasher.update(b"\n");
52 hasher.update(self.runtime_info.max_length.to_string().as_bytes());
53 hasher.update(b"\n");
54 hasher.update(normalized.as_bytes());
55 format!("{:x}", hasher.finalize())
56 }
57
58 fn embed_texts_cached(&self, texts: &[&str]) -> Result<Vec<Vec<f32>>> {
59 if texts.is_empty() {
60 return Ok(Vec::new());
61 }
62
63 let mut resolved: Vec<Option<Vec<f32>>> = vec![None; texts.len()];
64 let mut missing_order: Vec<String> = Vec::new();
65 let mut missing_positions: HashMap<String, Vec<usize>> = HashMap::new();
66
67 {
68 let mut cache = self
69 .text_embed_cache
70 .lock()
71 .map_err(|_| anyhow::anyhow!("text embedding cache lock"))?;
72 for (index, text) in texts.iter().enumerate() {
73 if let Some(cached) = cache.get(text) {
74 resolved[index] = Some(cached);
75 } else {
76 let key = (*text).to_owned();
77 if !missing_positions.contains_key(&key) {
78 missing_order.push(key.clone());
79 }
80 missing_positions.entry(key).or_default().push(index);
81 }
82 }
83 }
84
85 if !missing_order.is_empty() {
86 let missing_refs: Vec<&str> = missing_order.iter().map(String::as_str).collect();
87 let embeddings = self
88 .model
89 .lock()
90 .map_err(|_| anyhow::anyhow!("model lock"))?
91 .embed(missing_refs, None)
92 .context("text embedding failed")?;
93
94 let mut cache = self
95 .text_embed_cache
96 .lock()
97 .map_err(|_| anyhow::anyhow!("text embedding cache lock"))?;
98 for (text, embedding) in missing_order.into_iter().zip(embeddings) {
99 cache.insert(text.clone(), embedding.clone());
100 if let Some(indices) = missing_positions.remove(&text) {
101 for index in indices {
102 resolved[index] = Some(embedding.clone());
103 }
104 }
105 }
106 }
107
108 resolved
109 .into_iter()
110 .map(|item| item.ok_or_else(|| anyhow::anyhow!("missing embedding cache entry")))
111 .collect()
112 }
113
114 pub fn embed_query_cached(&self, query: &str) -> Result<Vec<f32>> {
115 let max_entries = Self::configured_query_embed_cache_size();
116 if max_entries == 0 {
117 return self
118 .embed_texts_cached(&[query])?
119 .into_iter()
120 .next()
121 .ok_or_else(|| anyhow::anyhow!("missing query embedding"));
122 }
123 let normalized = Self::normalize_query_for_cache(query);
124 let cache_key = self.query_cache_key(&normalized);
125 if let Some(embedding) = self.store.get_query_embedding(&cache_key)? {
126 return Ok(embedding);
127 }
128 let embedding = self
129 .embed_texts_cached(&[normalized.as_str()])?
130 .into_iter()
131 .next()
132 .ok_or_else(|| anyhow::anyhow!("missing query embedding"))?;
133 self.store
134 .put_query_embedding(&cache_key, &normalized, &embedding)?;
135 let _ = self.store.prune_query_embeddings(max_entries)?;
136 Ok(embedding)
137 }
138
139 pub fn prewarm_queries(&self, queries: &[String]) -> Result<usize> {
140 let max_entries = Self::configured_query_embed_cache_size();
141 if max_entries == 0 || queries.is_empty() {
142 return Ok(0);
143 }
144 let mut prewarmed = 0usize;
145 for query in queries {
146 if query.trim().is_empty() {
147 continue;
148 }
149 let _ = self.embed_query_cached(query)?;
150 prewarmed += 1;
151 }
152 Ok(prewarmed)
153 }
154
155 pub fn query_cache_stats(&self) -> Result<QueryEmbeddingCacheStats> {
156 let max_entries = Self::configured_query_embed_cache_size();
157 let entries = if max_entries == 0 {
158 0
159 } else {
160 self.store.query_cache_count()?
161 };
162 Ok(QueryEmbeddingCacheStats {
163 enabled: max_entries > 0,
164 entries,
165 max_entries,
166 })
167 }
168
169 pub fn new(project: &ProjectRoot) -> Result<Self> {
170 let (model, dimension, model_name, runtime_info) = super::runtime::load_codesearch_model()?;
171
172 let db_dir = project.as_path().join(".codelens/index");
173 std::fs::create_dir_all(&db_dir)?;
174 let db_path = db_dir.join("embeddings.db");
175
176 let store = SqliteVecStore::new(&db_path, dimension, &model_name)?;
177
178 Ok(Self {
179 model: std::sync::Mutex::new(model),
180 store,
181 model_name,
182 runtime_info,
183 text_embed_cache: std::sync::Mutex::new(TextEmbeddingCache::new(
184 super::runtime::configured_embedding_text_cache_size(),
185 )),
186 indexing: std::sync::atomic::AtomicBool::new(false),
187 })
188 }
189
190 pub fn model_name(&self) -> &str {
191 &self.model_name
192 }
193
194 pub fn runtime_info(&self) -> &EmbeddingRuntimeInfo {
195 &self.runtime_info
196 }
197
198 pub fn is_indexing(&self) -> bool {
205 self.indexing.load(std::sync::atomic::Ordering::Relaxed)
206 }
207
208 pub fn index_from_project(&self, project: &ProjectRoot) -> Result<usize> {
209 if self
211 .indexing
212 .compare_exchange(
213 false,
214 true,
215 std::sync::atomic::Ordering::AcqRel,
216 std::sync::atomic::Ordering::Relaxed,
217 )
218 .is_err()
219 {
220 anyhow::bail!(
221 "Embedding indexing already in progress — wait for the current run to complete before retrying."
222 );
223 }
224 struct IndexGuard<'a>(&'a std::sync::atomic::AtomicBool);
226 impl Drop for IndexGuard<'_> {
227 fn drop(&mut self) {
228 self.0.store(false, std::sync::atomic::Ordering::Release);
229 }
230 }
231 let _guard = IndexGuard(&self.indexing);
232
233 let db_path = crate::db::index_db_path(project.as_path());
234 let symbol_db = IndexDb::open(&db_path)?;
235 let batch_size = embed_batch_size();
236 let max_symbols = max_embed_symbols();
237 let mut total_indexed = 0usize;
238 let mut total_seen = 0usize;
239 let mut model = None;
240 let mut existing_embeddings: HashMap<
241 String,
242 HashMap<ReusableEmbeddingKey, EmbeddingChunk>,
243 > = HashMap::new();
244 let mut current_db_files = HashSet::new();
245 let mut capped = false;
246
247 self.store
248 .for_each_file_embeddings(&mut |file_path, chunks| {
249 existing_embeddings.insert(
250 file_path,
251 chunks
252 .into_iter()
253 .map(|chunk| (reusable_embedding_key_for_chunk(&chunk), chunk))
254 .collect(),
255 );
256 Ok(())
257 })?;
258
259 symbol_db.for_each_file_symbols_with_bytes(|file_path, symbols| {
260 current_db_files.insert(file_path.clone());
261 if capped {
262 return Ok(());
263 }
264
265 let source = std::fs::read_to_string(project.as_path().join(&file_path)).ok();
266 let relevant_symbols: Vec<_> = symbols
267 .into_iter()
268 .filter(|sym| !is_test_only_symbol(sym, source.as_deref()))
269 .collect();
270
271 if relevant_symbols.is_empty() {
272 self.store.delete_by_file(&[file_path.as_str()])?;
273 existing_embeddings.remove(&file_path);
274 return Ok(());
275 }
276
277 if total_seen + relevant_symbols.len() > max_symbols {
278 capped = true;
279 return Ok(());
280 }
281 total_seen += relevant_symbols.len();
282
283 let existing_for_file = existing_embeddings.remove(&file_path).unwrap_or_default();
284 total_indexed += self.reconcile_file_embeddings(
285 &file_path,
286 relevant_symbols,
287 source.as_deref(),
288 existing_for_file,
289 batch_size,
290 &mut model,
291 )?;
292 Ok(())
293 })?;
294
295 let removed_files: Vec<String> = existing_embeddings
296 .into_keys()
297 .filter(|file_path| !current_db_files.contains(file_path))
298 .collect();
299 if !removed_files.is_empty() {
300 let removed_refs: Vec<&str> = removed_files.iter().map(String::as_str).collect();
301 self.store.delete_by_file(&removed_refs)?;
302 }
303
304 Ok(total_indexed)
305 }
306
307 pub fn ensure_index_fresh_for_project(
308 &self,
309 project: &ProjectRoot,
310 ) -> Result<EmbeddingFreshnessReport> {
311 if self
312 .indexing
313 .compare_exchange(
314 false,
315 true,
316 std::sync::atomic::Ordering::AcqRel,
317 std::sync::atomic::Ordering::Relaxed,
318 )
319 .is_err()
320 {
321 anyhow::bail!(
322 "Embedding indexing already in progress — wait for the current run to complete before retrying."
323 );
324 }
325
326 struct IndexGuard<'a>(&'a std::sync::atomic::AtomicBool);
327 impl Drop for IndexGuard<'_> {
328 fn drop(&mut self) {
329 self.0.store(false, std::sync::atomic::Ordering::Release);
330 }
331 }
332 let _guard = IndexGuard(&self.indexing);
333
334 let db_path = crate::db::index_db_path(project.as_path());
335 let symbol_db = IndexDb::open(&db_path)?;
336 let batch_size = embed_batch_size();
337 let mut report = EmbeddingFreshnessReport::default();
338 let mut existing_embeddings: HashMap<
339 String,
340 HashMap<ReusableEmbeddingKey, EmbeddingChunk>,
341 > = HashMap::new();
342 let mut current_db_files = HashSet::new();
343 let mut model = None;
344
345 self.store
346 .for_each_file_embeddings(&mut |file_path, chunks| {
347 existing_embeddings.insert(
348 file_path,
349 chunks
350 .into_iter()
351 .map(|chunk| (reusable_embedding_key_for_chunk(&chunk), chunk))
352 .collect(),
353 );
354 Ok(())
355 })?;
356
357 if existing_embeddings.is_empty() {
358 return Ok(report);
359 }
360
361 symbol_db.for_each_file_symbols_with_bytes(|file_path, symbols| {
362 current_db_files.insert(file_path.clone());
363 let Some(existing_for_file) = existing_embeddings.get(&file_path) else {
364 report.skipped_new_files += 1;
365 return Ok(());
366 };
367
368 report.checked_files += 1;
369 let source = std::fs::read_to_string(project.as_path().join(&file_path)).ok();
370 let relevant_symbols: Vec<_> = symbols
371 .into_iter()
372 .filter(|sym| !is_test_only_symbol(sym, source.as_deref()))
373 .collect();
374
375 if relevant_symbols.is_empty() {
376 self.store.delete_by_file(&[file_path.as_str()])?;
377 existing_embeddings.remove(&file_path);
378 report.refreshed_files += 1;
379 return Ok(());
380 }
381
382 let current_keys = relevant_symbols
383 .iter()
384 .map(|sym| {
385 let text = build_embedding_text(sym, source.as_deref());
386 reusable_embedding_key_for_symbol(sym, &text)
387 })
388 .collect::<HashSet<_>>();
389 let stored_keys = existing_for_file.keys().cloned().collect::<HashSet<_>>();
390
391 if current_keys == stored_keys {
392 existing_embeddings.remove(&file_path);
393 report.unchanged_files += 1;
394 return Ok(());
395 }
396
397 let existing_for_file = existing_embeddings.remove(&file_path).unwrap_or_default();
398 report.indexed_symbols += self.reconcile_file_embeddings(
399 &file_path,
400 relevant_symbols,
401 source.as_deref(),
402 existing_for_file,
403 batch_size,
404 &mut model,
405 )?;
406 report.refreshed_files += 1;
407 Ok(())
408 })?;
409
410 let removed_files: Vec<String> = existing_embeddings
411 .into_keys()
412 .filter(|file_path| !current_db_files.contains(file_path))
413 .collect();
414 if !removed_files.is_empty() {
415 let removed_refs: Vec<&str> = removed_files.iter().map(String::as_str).collect();
416 report.removed_files = self.store.delete_by_file(&removed_refs)?;
417 }
418
419 Ok(report)
420 }
421
422 pub fn generate_bridge_candidates(
426 &self,
427 project: &ProjectRoot,
428 ) -> Result<Vec<(String, String)>> {
429 let db_path = crate::db::index_db_path(project.as_path());
430 let symbol_db = IndexDb::open(&db_path)?;
431 let mut bridges: Vec<(String, String)> = Vec::new();
432 let mut seen_nl = HashSet::new();
433
434 symbol_db.for_each_file_symbols_with_bytes(|file_path, symbols| {
435 let source = std::fs::read_to_string(project.as_path().join(&file_path)).ok();
436 for sym in &symbols {
437 if is_test_only_symbol(sym, source.as_deref()) {
438 continue;
439 }
440 let doc = source.as_deref().and_then(|src| {
441 extract_leading_doc(src, sym.start_byte as usize, sym.end_byte as usize)
442 });
443 let doc = match doc {
444 Some(d) if !d.is_empty() => d,
445 _ => continue,
446 };
447
448 let split = split_identifier(&sym.name);
450 let code_term = if split != sym.name {
451 format!("{} {}", sym.name, split)
452 } else {
453 sym.name.clone()
454 };
455
456 let first_line = doc.lines().next().unwrap_or("").trim().to_lowercase();
460 let clean = first_line.trim_end_matches(|c: char| c.is_ascii_punctuation());
462 let words: Vec<&str> = clean.split_whitespace().collect();
463 if words.len() < 2 {
464 continue;
465 }
466
467 for window in 2..=words.len().min(4) {
469 let key = words[..window].join(" ");
470 if key.len() < 5 || key.len() > 60 {
471 continue;
472 }
473 if seen_nl.insert(key.clone()) {
474 bridges.push((key, code_term.clone()));
475 }
476 }
477
478 if split != sym.name && !seen_nl.contains(&split.to_lowercase()) {
481 let lowered = split.to_lowercase();
482 if lowered.split_whitespace().count() >= 2 && seen_nl.insert(lowered.clone()) {
483 bridges.push((lowered, code_term.clone()));
484 }
485 }
486 }
487 Ok(())
488 })?;
489
490 Ok(bridges)
491 }
492
493 fn reconcile_file_embeddings<'a>(
494 &'a self,
495 file_path: &str,
496 symbols: Vec<crate::db::SymbolWithFile>,
497 source: Option<&str>,
498 mut existing_embeddings: HashMap<ReusableEmbeddingKey, EmbeddingChunk>,
499 batch_size: usize,
500 model: &mut Option<std::sync::MutexGuard<'a, TextEmbedding>>,
501 ) -> Result<usize> {
502 let mut reconciled_chunks = Vec::with_capacity(symbols.len());
503 let mut batch_texts: Vec<String> = Vec::with_capacity(batch_size);
504 let mut batch_meta: Vec<crate::db::SymbolWithFile> = Vec::with_capacity(batch_size);
505
506 for sym in symbols {
507 let text = build_embedding_text(&sym, source);
508 if let Some(existing) =
509 existing_embeddings.remove(&reusable_embedding_key_for_symbol(&sym, &text))
510 {
511 reconciled_chunks.push(EmbeddingChunk {
512 file_path: sym.file_path.clone(),
513 symbol_name: sym.name.clone(),
514 kind: sym.kind.clone(),
515 line: sym.line as usize,
516 signature: sym.signature.clone(),
517 name_path: sym.name_path.clone(),
518 text,
519 embedding: existing.embedding,
520 doc_embedding: existing.doc_embedding,
521 });
522 continue;
523 }
524
525 batch_texts.push(text);
526 batch_meta.push(sym);
527
528 if batch_texts.len() >= batch_size {
529 if model.is_none() {
530 *model = Some(
531 self.model
532 .lock()
533 .map_err(|_| anyhow::anyhow!("model lock"))?,
534 );
535 }
536 reconciled_chunks.extend(Self::embed_chunks(
537 model.as_mut().expect("model lock initialized"),
538 &batch_texts,
539 &batch_meta,
540 )?);
541 batch_texts.clear();
542 batch_meta.clear();
543 }
544 }
545
546 if !batch_texts.is_empty() {
547 if model.is_none() {
548 *model = Some(
549 self.model
550 .lock()
551 .map_err(|_| anyhow::anyhow!("model lock"))?,
552 );
553 }
554 reconciled_chunks.extend(Self::embed_chunks(
555 model.as_mut().expect("model lock initialized"),
556 &batch_texts,
557 &batch_meta,
558 )?);
559 }
560
561 self.store.delete_by_file(&[file_path])?;
562 if reconciled_chunks.is_empty() {
563 return Ok(0);
564 }
565 self.store.insert(&reconciled_chunks)
566 }
567
568 fn embed_chunks(
569 model: &mut TextEmbedding,
570 texts: &[String],
571 meta: &[crate::db::SymbolWithFile],
572 ) -> Result<Vec<EmbeddingChunk>> {
573 let batch_refs: Vec<&str> = texts.iter().map(|s| s.as_str()).collect();
574 let embeddings = model.embed(batch_refs, None).context("embedding failed")?;
575
576 Ok(meta
577 .iter()
578 .zip(embeddings)
579 .zip(texts.iter())
580 .map(|((sym, emb), text)| EmbeddingChunk {
581 file_path: sym.file_path.clone(),
582 symbol_name: sym.name.clone(),
583 kind: sym.kind.clone(),
584 line: sym.line as usize,
585 signature: sym.signature.clone(),
586 name_path: sym.name_path.clone(),
587 text: text.clone(),
588 embedding: emb,
589 doc_embedding: None,
590 })
591 .collect())
592 }
593
594 fn flush_batch(
596 model: &mut TextEmbedding,
597 store: &SqliteVecStore,
598 texts: &[String],
599 meta: &[crate::db::SymbolWithFile],
600 ) -> Result<usize> {
601 let chunks = Self::embed_chunks(model, texts, meta)?;
602 store.insert(&chunks)
603 }
604
605 pub fn search(&self, query: &str, max_results: usize) -> Result<Vec<SemanticMatch>> {
607 let results = self.search_scored(query, max_results)?;
608 Ok(results.into_iter().map(SemanticMatch::from).collect())
609 }
610
611 pub fn search_scored(&self, query: &str, max_results: usize) -> Result<Vec<ScoredChunk>> {
618 let query_embedding = self.embed_query_cached(query)?;
619
620 let factor = std::env::var("CODELENS_RERANK_FACTOR")
624 .ok()
625 .and_then(|v| v.parse::<usize>().ok())
626 .unwrap_or(5);
627 let candidate_count = max_results.saturating_mul(factor).max(max_results);
628 let mut candidates = self.store.search(&query_embedding, candidate_count)?;
629
630 if candidates.len() <= max_results {
631 return Ok(candidates);
632 }
633
634 let query_lower = query.to_lowercase();
637 let query_tokens: Vec<&str> = query_lower
638 .split(|c: char| c.is_whitespace() || c == '_' || c == '-')
639 .filter(|t| t.len() >= 2)
640 .collect();
641
642 if query_tokens.is_empty() {
643 candidates.truncate(max_results);
644 return Ok(candidates);
645 }
646
647 let blend = configured_rerank_blend();
648 for chunk in &mut candidates {
649 let split_name = split_identifier(&chunk.symbol_name);
654 let searchable = format!(
655 "{} {} {} {} {}",
656 chunk.symbol_name.to_lowercase(),
657 split_name.to_lowercase(),
658 chunk.name_path.to_lowercase(),
659 chunk.signature.to_lowercase(),
660 chunk.file_path.to_lowercase(),
661 );
662 let overlap = query_tokens
663 .iter()
664 .filter(|t| searchable.contains(**t))
665 .count() as f64;
666 let overlap_ratio = overlap / query_tokens.len().max(1) as f64;
667 chunk.score = chunk.score * blend + overlap_ratio * (1.0 - blend);
669 }
670
671 candidates.sort_by(|a, b| {
672 b.score
673 .partial_cmp(&a.score)
674 .unwrap_or(std::cmp::Ordering::Equal)
675 });
676 candidates.truncate(max_results);
677 Ok(candidates)
678 }
679
680 pub fn index_changed_files(
682 &self,
683 project: &ProjectRoot,
684 changed_files: &[&str],
685 ) -> Result<usize> {
686 if changed_files.is_empty() {
687 return Ok(0);
688 }
689 let batch_size = embed_batch_size();
690 let mut existing_embeddings: HashMap<ReusableEmbeddingKey, EmbeddingChunk> = HashMap::new();
691 for file_chunk in changed_files.chunks(CHANGED_FILE_QUERY_CHUNK) {
692 for chunk in self.store.embeddings_for_files(file_chunk)? {
693 existing_embeddings.insert(reusable_embedding_key_for_chunk(&chunk), chunk);
694 }
695 }
696 self.store.delete_by_file(changed_files)?;
697
698 let db_path = crate::db::index_db_path(project.as_path());
699 let symbol_db = IndexDb::open(&db_path)?;
700
701 let mut total_indexed = 0usize;
702 let mut batch_texts: Vec<String> = Vec::with_capacity(batch_size);
703 let mut batch_meta: Vec<crate::db::SymbolWithFile> = Vec::with_capacity(batch_size);
704 let mut batch_reused: Vec<EmbeddingChunk> = Vec::with_capacity(batch_size);
705 let mut file_cache: std::collections::HashMap<String, Option<String>> =
706 std::collections::HashMap::new();
707 let mut model = None;
708
709 for file_chunk in changed_files.chunks(CHANGED_FILE_QUERY_CHUNK) {
710 let relevant = symbol_db.symbols_for_files(file_chunk)?;
711 for sym in relevant {
712 let source = file_cache.entry(sym.file_path.clone()).or_insert_with(|| {
713 std::fs::read_to_string(project.as_path().join(&sym.file_path)).ok()
714 });
715 if is_test_only_symbol(&sym, source.as_deref()) {
716 continue;
717 }
718 let text = build_embedding_text(&sym, source.as_deref());
719 if let Some(existing) =
720 existing_embeddings.remove(&reusable_embedding_key_for_symbol(&sym, &text))
721 {
722 batch_reused.push(EmbeddingChunk {
723 file_path: sym.file_path.clone(),
724 symbol_name: sym.name.clone(),
725 kind: sym.kind.clone(),
726 line: sym.line as usize,
727 signature: sym.signature.clone(),
728 name_path: sym.name_path.clone(),
729 text,
730 embedding: existing.embedding,
731 doc_embedding: existing.doc_embedding,
732 });
733 if batch_reused.len() >= batch_size {
734 total_indexed += self.store.insert(&batch_reused)?;
735 batch_reused.clear();
736 }
737 continue;
738 }
739 batch_texts.push(text);
740 batch_meta.push(sym);
741
742 if batch_texts.len() >= batch_size {
743 if model.is_none() {
744 model = Some(
745 self.model
746 .lock()
747 .map_err(|_| anyhow::anyhow!("model lock"))?,
748 );
749 }
750 total_indexed += Self::flush_batch(
751 model.as_mut().expect("model lock initialized"),
752 &self.store,
753 &batch_texts,
754 &batch_meta,
755 )?;
756 batch_texts.clear();
757 batch_meta.clear();
758 }
759 }
760 }
761
762 if !batch_reused.is_empty() {
763 total_indexed += self.store.insert(&batch_reused)?;
764 }
765
766 if !batch_texts.is_empty() {
767 if model.is_none() {
768 model = Some(
769 self.model
770 .lock()
771 .map_err(|_| anyhow::anyhow!("model lock"))?,
772 );
773 }
774 total_indexed += Self::flush_batch(
775 model.as_mut().expect("model lock initialized"),
776 &self.store,
777 &batch_texts,
778 &batch_meta,
779 )?;
780 }
781
782 Ok(total_indexed)
783 }
784
785 pub fn is_indexed(&self) -> bool {
787 self.store.count().unwrap_or(0) > 0
788 }
789
790 pub fn index_info(&self) -> EmbeddingIndexInfo {
791 EmbeddingIndexInfo {
792 model_name: self.model_name.clone(),
793 indexed_symbols: self.store.count().unwrap_or(0),
794 }
795 }
796
797 pub fn inspect_existing_index(project: &ProjectRoot) -> Result<Option<EmbeddingIndexInfo>> {
798 let db_path = project.as_path().join(".codelens/index/embeddings.db");
799 if !db_path.exists() {
800 return Ok(None);
801 }
802
803 let conn =
804 crate::db::open_derived_sqlite_with_recovery(&db_path, "embedding index", || {
805 ffi::register_sqlite_vec()?;
806 let conn = Connection::open(&db_path)?;
807 conn.execute_batch("PRAGMA busy_timeout=5000;")?;
808 conn.query_row("PRAGMA schema_version", [], |_row| Ok(()))?;
809 Ok(conn)
810 })?;
811
812 let model_name: Option<String> = conn
813 .query_row(
814 "SELECT value FROM meta WHERE key = 'model' LIMIT 1",
815 [],
816 |row| row.get(0),
817 )
818 .ok();
819 let indexed_symbols: usize = conn
820 .query_row("SELECT COUNT(*) FROM symbols", [], |row| {
821 row.get::<_, i64>(0)
822 })
823 .map(|count| count.max(0) as usize)
824 .unwrap_or(0);
825
826 Ok(model_name.map(|model_name| EmbeddingIndexInfo {
827 model_name,
828 indexed_symbols,
829 }))
830 }
831
832 pub fn find_similar_code(
836 &self,
837 file_path: &str,
838 symbol_name: &str,
839 max_results: usize,
840 ) -> Result<Vec<SemanticMatch>> {
841 let target = self
842 .store
843 .get_embedding(file_path, symbol_name)?
844 .ok_or_else(|| anyhow::anyhow!("Symbol '{}' not found in index", symbol_name))?;
845
846 let oversample = max_results.saturating_add(8).max(1);
847 let scored = self
848 .store
849 .search(&target.embedding, oversample)?
850 .into_iter()
851 .filter(|c| !(c.file_path == file_path && c.symbol_name == symbol_name))
852 .take(max_results)
853 .map(SemanticMatch::from)
854 .collect();
855 Ok(scored)
856 }
857
858 pub fn find_duplicates(&self, threshold: f64, max_pairs: usize) -> Result<Vec<DuplicatePair>> {
861 let mut pairs = Vec::new();
862 let mut seen_pairs = HashSet::new();
863 let mut embedding_cache: HashMap<StoredChunkKey, Arc<EmbeddingChunk>> = HashMap::new();
864 let candidate_limit = duplicate_candidate_limit(max_pairs);
865 let mut done = false;
866
867 self.store
868 .for_each_embedding_batch(DEFAULT_DUPLICATE_SCAN_BATCH_SIZE, &mut |batch| {
869 if done {
870 return Ok(());
871 }
872
873 let mut candidate_lists = Vec::with_capacity(batch.len());
874 let mut missing_candidates = Vec::new();
875 let mut missing_keys = HashSet::new();
876
877 for chunk in &batch {
878 if pairs.len() >= max_pairs {
879 done = true;
880 break;
881 }
882
883 let filtered: Vec<ScoredChunk> = self
884 .store
885 .search(&chunk.embedding, candidate_limit)?
886 .into_iter()
887 .filter(|candidate| {
888 !(chunk.file_path == candidate.file_path
889 && chunk.symbol_name == candidate.symbol_name
890 && chunk.line == candidate.line
891 && chunk.signature == candidate.signature
892 && chunk.name_path == candidate.name_path)
893 })
894 .collect();
895
896 for candidate in &filtered {
897 let cache_key = stored_chunk_key_for_score(candidate);
898 if !embedding_cache.contains_key(&cache_key)
899 && missing_keys.insert(cache_key)
900 {
901 missing_candidates.push(candidate.clone());
902 }
903 }
904
905 candidate_lists.push(filtered);
906 }
907
908 if !missing_candidates.is_empty() {
909 for candidate_chunk in self
910 .store
911 .embeddings_for_scored_chunks(&missing_candidates)?
912 {
913 embedding_cache
914 .entry(stored_chunk_key(&candidate_chunk))
915 .or_insert_with(|| Arc::new(candidate_chunk));
916 }
917 }
918
919 for (chunk, candidates) in batch.iter().zip(candidate_lists.iter()) {
920 if pairs.len() >= max_pairs {
921 done = true;
922 break;
923 }
924
925 for candidate in candidates {
926 let pair_key = duplicate_pair_key(
927 &chunk.file_path,
928 &chunk.symbol_name,
929 &candidate.file_path,
930 &candidate.symbol_name,
931 );
932 if !seen_pairs.insert(pair_key) {
933 continue;
934 }
935
936 let Some(candidate_chunk) =
937 embedding_cache.get(&stored_chunk_key_for_score(candidate))
938 else {
939 continue;
940 };
941
942 let sim = cosine_similarity(&chunk.embedding, &candidate_chunk.embedding);
943 if sim < threshold {
944 continue;
945 }
946
947 pairs.push(DuplicatePair {
948 symbol_a: format!("{}:{}", chunk.file_path, chunk.symbol_name),
949 symbol_b: format!(
950 "{}:{}",
951 candidate_chunk.file_path, candidate_chunk.symbol_name
952 ),
953 file_a: chunk.file_path.clone(),
954 file_b: candidate_chunk.file_path.clone(),
955 line_a: chunk.line,
956 line_b: candidate_chunk.line,
957 similarity: sim,
958 });
959 if pairs.len() >= max_pairs {
960 done = true;
961 break;
962 }
963 }
964 }
965 Ok(())
966 })?;
967
968 pairs.sort_by(|a, b| {
969 b.similarity
970 .partial_cmp(&a.similarity)
971 .unwrap_or(std::cmp::Ordering::Equal)
972 });
973 Ok(pairs)
974 }
975}
976
977impl EmbeddingEngine {
978 pub fn classify_symbol(
980 &self,
981 file_path: &str,
982 symbol_name: &str,
983 categories: &[&str],
984 ) -> Result<Vec<CategoryScore>> {
985 let target = match self.store.get_embedding(file_path, symbol_name)? {
986 Some(target) => target,
987 None => self
988 .store
989 .all_with_embeddings()?
990 .into_iter()
991 .find(|c| c.file_path == file_path && c.symbol_name == symbol_name)
992 .ok_or_else(|| anyhow::anyhow!("Symbol '{}' not found in index", symbol_name))?,
993 };
994
995 let embeddings = self.embed_texts_cached(categories)?;
996
997 let mut scores: Vec<CategoryScore> = categories
998 .iter()
999 .zip(embeddings.iter())
1000 .map(|(cat, emb)| CategoryScore {
1001 category: cat.to_string(),
1002 score: cosine_similarity(&target.embedding, emb),
1003 })
1004 .collect();
1005
1006 scores.sort_by(|a, b| {
1007 b.score
1008 .partial_cmp(&a.score)
1009 .unwrap_or(std::cmp::Ordering::Equal)
1010 });
1011 Ok(scores)
1012 }
1013
1014 pub fn find_misplaced_code(&self, max_results: usize) -> Result<Vec<OutlierSymbol>> {
1016 let mut outliers = Vec::new();
1017
1018 self.store
1019 .for_each_file_embeddings(&mut |file_path, chunks| {
1020 if chunks.len() < 2 {
1021 return Ok(());
1022 }
1023
1024 for (idx, chunk) in chunks.iter().enumerate() {
1025 let mut sim_sum = 0.0;
1026 let mut count = 0;
1027 for (other_idx, other_chunk) in chunks.iter().enumerate() {
1028 if other_idx == idx {
1029 continue;
1030 }
1031 sim_sum += cosine_similarity(&chunk.embedding, &other_chunk.embedding);
1032 count += 1;
1033 }
1034 if count > 0 {
1035 let avg_sim = sim_sum / count as f64; outliers.push(OutlierSymbol {
1037 file_path: file_path.clone(),
1038 symbol_name: chunk.symbol_name.clone(),
1039 kind: chunk.kind.clone(),
1040 line: chunk.line,
1041 avg_similarity_to_file: avg_sim,
1042 });
1043 }
1044 }
1045 Ok(())
1046 })?;
1047
1048 outliers.sort_by(|a, b| {
1049 a.avg_similarity_to_file
1050 .partial_cmp(&b.avg_similarity_to_file)
1051 .unwrap_or(std::cmp::Ordering::Equal)
1052 });
1053 outliers.truncate(max_results);
1054 Ok(outliers)
1055 }
1056}