codelens_engine/embedding/
duplicates.rs1use crate::embedding_store::{EmbeddingChunk, ScoredChunk};
8use anyhow::Result;
9use std::collections::{HashMap, HashSet};
10use std::sync::Arc;
11
12use super::chunk_ops::{
13 DuplicatePair, SIGNATURE_ONLY_COSINE_FLOOR, SIGNATURE_ONLY_JACCARD_CEIL, StoredChunkKey,
14 body_token_jaccard, cosine_similarity, duplicate_candidate_limit, duplicate_pair_key,
15 stored_chunk_key, stored_chunk_key_for_score,
16};
17use super::{DEFAULT_DUPLICATE_SCAN_BATCH_SIZE, EmbeddingEngine, SemanticMatch};
18
19impl EmbeddingEngine {
20 pub fn find_similar_code(
24 &self,
25 file_path: &str,
26 symbol_name: &str,
27 max_results: usize,
28 ) -> Result<Vec<SemanticMatch>> {
29 let target = self
30 .store
31 .get_embedding(file_path, symbol_name)?
32 .ok_or_else(|| anyhow::anyhow!("Symbol '{}' not found in index", symbol_name))?;
33
34 let oversample = max_results.saturating_add(8).max(1);
35 let scored = self
36 .store
37 .search(&target.embedding, oversample)?
38 .into_iter()
39 .filter(|c| !(c.file_path == file_path && c.symbol_name == symbol_name))
40 .take(max_results)
41 .map(SemanticMatch::from)
42 .collect();
43 Ok(scored)
44 }
45
46 pub fn find_duplicates(&self, threshold: f64, max_pairs: usize) -> Result<Vec<DuplicatePair>> {
49 self.find_duplicates_in_scope(threshold, max_pairs, None)
50 }
51
/// Canonicalises a user-supplied scope path for prefix matching.
///
/// Steps, in order:
/// 1. surrounding whitespace is trimmed;
/// 2. backslashes are converted to forward slashes;
/// 3. leading `./` segments and trailing `/` characters are removed.
///
/// Separator conversion must happen *before* the trimming in step 3,
/// otherwise Windows-style input such as `.\src\` keeps its leading `./`
/// and trailing `/` and can never match a stored file path.
///
/// Returns `None` when no scope was given or when the scope normalises to
/// the empty string or `"."` (i.e. "everything").
fn normalize_duplicate_scope(scope: Option<&str>) -> Option<String> {
    let unified = scope?.trim().replace('\\', "/");
    let trimmed = unified.trim_start_matches("./").trim_end_matches('/');
    if trimmed.is_empty() || trimmed == "." {
        None
    } else {
        Some(trimmed.to_owned())
    }
}
64
/// Reports whether `file_path` is the scope itself or lies beneath it.
///
/// Leading `./` segments on `file_path` are ignored. A path is inside the
/// scope when it equals `scope` exactly, or when it starts with `scope`
/// immediately followed by `/` — so `src` covers `src/a.rs` but not
/// `srcx/a.rs`.
fn file_in_duplicate_scope(scope: &str, file_path: &str) -> bool {
    let relative = file_path.trim_start_matches("./");
    match relative.strip_prefix(scope) {
        // An empty remainder means the path is exactly the scope.
        Some(rest) => rest.is_empty() || rest.starts_with('/'),
        None => false,
    }
}
72
73 fn duplicate_pair_matches_scope(scope: Option<&str>, file_a: &str, file_b: &str) -> bool {
74 let Some(scope) = scope else {
75 return true;
76 };
77 Self::file_in_duplicate_scope(scope, file_a) || Self::file_in_duplicate_scope(scope, file_b)
78 }
79
80 pub fn find_duplicates_in_scope(
85 &self,
86 threshold: f64,
87 max_pairs: usize,
88 scope: Option<&str>,
89 ) -> Result<Vec<DuplicatePair>> {
90 if max_pairs == 0 {
91 return Ok(Vec::new());
92 }
93
94 let scope = Self::normalize_duplicate_scope(scope);
95 let mut pairs = Vec::new();
96 let mut seen_pairs = HashSet::new();
97 let mut embedding_cache: HashMap<StoredChunkKey, Arc<EmbeddingChunk>> = HashMap::new();
98 let candidate_limit = duplicate_candidate_limit(max_pairs);
99 let mut done = false;
100
101 let mut visit_batch = |batch: Vec<EmbeddingChunk>| {
102 if done {
103 return Ok(());
104 }
105
106 let mut candidate_lists = Vec::with_capacity(batch.len());
107 let mut missing_candidates = Vec::new();
108 let mut missing_keys = HashSet::new();
109
110 for chunk in &batch {
111 if pairs.len() >= max_pairs {
112 done = true;
113 break;
114 }
115
116 let filtered: Vec<ScoredChunk> = self
117 .store
118 .search(&chunk.embedding, candidate_limit)?
119 .into_iter()
120 .filter(|candidate| {
121 !(chunk.file_path == candidate.file_path
122 && chunk.symbol_name == candidate.symbol_name
123 && chunk.line == candidate.line
124 && chunk.signature == candidate.signature
125 && chunk.name_path == candidate.name_path)
126 })
127 .filter(|candidate| {
128 Self::duplicate_pair_matches_scope(
129 scope.as_deref(),
130 &chunk.file_path,
131 &candidate.file_path,
132 )
133 })
134 .collect();
135
136 for candidate in &filtered {
137 let cache_key = stored_chunk_key_for_score(candidate);
138 if !embedding_cache.contains_key(&cache_key) && missing_keys.insert(cache_key) {
139 missing_candidates.push(candidate.clone());
140 }
141 }
142
143 candidate_lists.push(filtered);
144 }
145
146 if !missing_candidates.is_empty() {
147 for candidate_chunk in self
148 .store
149 .embeddings_for_scored_chunks(&missing_candidates)?
150 {
151 embedding_cache
152 .entry(stored_chunk_key(&candidate_chunk))
153 .or_insert_with(|| Arc::new(candidate_chunk));
154 }
155 }
156
157 for (chunk, candidates) in batch.iter().zip(candidate_lists.iter()) {
158 if pairs.len() >= max_pairs {
159 done = true;
160 break;
161 }
162
163 for candidate in candidates {
164 let pair_key = duplicate_pair_key(
165 &chunk.file_path,
166 &chunk.symbol_name,
167 &candidate.file_path,
168 &candidate.symbol_name,
169 );
170 if !seen_pairs.insert(pair_key) {
171 continue;
172 }
173
174 let Some(candidate_chunk) =
175 embedding_cache.get(&stored_chunk_key_for_score(candidate))
176 else {
177 continue;
178 };
179
180 let sim = cosine_similarity(&chunk.embedding, &candidate_chunk.embedding);
181 let effective_threshold = effective_duplicate_threshold(
184 threshold,
185 &chunk.file_path,
186 &candidate_chunk.file_path,
187 );
188 if sim < effective_threshold {
189 continue;
190 }
191
192 let jaccard = body_token_jaccard(&chunk.text, &candidate_chunk.text);
200 let signature_only_match = matches!(
201 (sim >= SIGNATURE_ONLY_COSINE_FLOOR, jaccard),
202 (true, Some(j)) if j < SIGNATURE_ONLY_JACCARD_CEIL
203 );
204
205 pairs.push(DuplicatePair {
206 symbol_a: format!("{}:{}", chunk.file_path, chunk.symbol_name),
207 symbol_b: format!(
208 "{}:{}",
209 candidate_chunk.file_path, candidate_chunk.symbol_name
210 ),
211 file_a: chunk.file_path.clone(),
212 file_b: candidate_chunk.file_path.clone(),
213 line_a: chunk.line,
214 line_b: candidate_chunk.line,
215 similarity: sim,
216 body_token_jaccard: jaccard,
217 signature_only_match,
218 kind_a: chunk.kind.clone(),
219 kind_b: candidate_chunk.kind.clone(),
220 });
221 if pairs.len() >= max_pairs {
222 done = true;
223 break;
224 }
225 }
226 }
227 Ok(())
228 };
229
230 if let Some(scope) = scope.as_deref() {
231 self.store.for_each_embedding_batch_in_scope(
232 scope,
233 DEFAULT_DUPLICATE_SCAN_BATCH_SIZE,
234 &mut visit_batch,
235 )?;
236 } else {
237 self.store
238 .for_each_embedding_batch(DEFAULT_DUPLICATE_SCAN_BATCH_SIZE, &mut visit_batch)?;
239 }
240
241 pairs.sort_by(|a, b| {
242 b.similarity
243 .partial_cmp(&a.similarity)
244 .unwrap_or(std::cmp::Ordering::Equal)
245 });
246 Ok(pairs)
247 }
248}
249
/// Minimum similarity required before a pair involving a structured
/// config/data/markup file is reported as a duplicate.
const STRUCTURED_FILETYPE_DUPLICATE_FLOOR: f64 = 0.95;

/// Returns the duplicate-similarity floor implied by `path`'s extension.
///
/// Files ending in `yml`, `yaml`, `json`, `toml`, `lock`, `md`, `cfg` or
/// `ini` (compared ASCII case-insensitively) get
/// `STRUCTURED_FILETYPE_DUPLICATE_FLOOR`. Every other path, including one
/// with no extension, gets `0.0`, i.e. no extra floor.
fn structured_filetype_floor(path: &str) -> f64 {
    const STRUCTURED_EXTENSIONS: [&str; 8] =
        ["yml", "yaml", "json", "toml", "lock", "md", "cfg", "ini"];

    let is_structured = std::path::Path::new(path)
        .extension()
        .and_then(|ext| ext.to_str())
        .is_some_and(|ext| {
            STRUCTURED_EXTENSIONS
                .iter()
                .any(|known| ext.eq_ignore_ascii_case(known))
        });

    if is_structured {
        STRUCTURED_FILETYPE_DUPLICATE_FLOOR
    } else {
        0.0
    }
}
275
276fn effective_duplicate_threshold(base: f64, file_a: &str, file_b: &str) -> f64 {
279 base.max(structured_filetype_floor(file_a))
280 .max(structured_filetype_floor(file_b))
281}
282
#[cfg(test)]
mod g6_filetype_threshold_tests {
    use super::{effective_duplicate_threshold, structured_filetype_floor};

    /// Structured extensions map to the 0.95 floor, regardless of case or
    /// directory prefix.
    #[test]
    fn structured_filetypes_get_higher_floor() {
        for path in [
            "ci.yml",
            "a/b/config.yaml",
            "Cargo.lock",
            "data.json",
            "Config.TOML",
        ] {
            assert_eq!(structured_filetype_floor(path), 0.95);
        }
    }

    /// Source files and extension-less paths get no floor.
    #[test]
    fn code_and_unknown_filetypes_get_no_floor() {
        for path in ["src/main.rs", "app.py", "noext"] {
            assert_eq!(structured_filetype_floor(path), 0.0);
        }
    }

    /// One structured file on either side is enough to raise the threshold.
    #[test]
    fn effective_threshold_raises_when_either_side_structured() {
        for (left, right) in [("a.yml", "b.rs"), ("a.rs", "b.yaml")] {
            assert_eq!(effective_duplicate_threshold(0.85, left, right), 0.95);
        }
    }

    /// Code-only pairs keep the caller's threshold.
    #[test]
    fn effective_threshold_keeps_base_for_code_pairs() {
        let threshold = effective_duplicate_threshold(0.85, "a.rs", "b.rs");
        assert_eq!(threshold, 0.85);
    }

    /// A base stricter than the floor is never lowered.
    #[test]
    fn effective_threshold_respects_stricter_base() {
        let t = effective_duplicate_threshold(0.97, "a.yml", "b.yml");
        assert!((t - 0.97).abs() < 1e-9, "got {t}");
    }
}