Skip to main content

codelens_engine/embedding/
duplicates.rs

1//! Duplicate / similarity detection methods for [`EmbeddingEngine`].
2//!
3//! Extracted from `engine_impl.rs` to keep each file focused on a single
4//! responsibility. All six methods (and their private helpers) live here so
5//! the private helpers remain in scope for the callers within this file.
6
7use crate::embedding_store::{EmbeddingChunk, ScoredChunk};
8use anyhow::Result;
9use std::collections::{HashMap, HashSet};
10use std::sync::Arc;
11
12use super::chunk_ops::{
13    DuplicatePair, SIGNATURE_ONLY_COSINE_FLOOR, SIGNATURE_ONLY_JACCARD_CEIL, StoredChunkKey,
14    body_token_jaccard, cosine_similarity, duplicate_candidate_limit, duplicate_pair_key,
15    stored_chunk_key, stored_chunk_key_for_score,
16};
17use super::{DEFAULT_DUPLICATE_SCAN_BATCH_SIZE, EmbeddingEngine, SemanticMatch};
18
19impl EmbeddingEngine {
20    // ── Embedding-powered analysis ─────────────────────────────────
21
22    /// Find code symbols most similar to the given symbol.
23    pub fn find_similar_code(
24        &self,
25        file_path: &str,
26        symbol_name: &str,
27        max_results: usize,
28    ) -> Result<Vec<SemanticMatch>> {
29        let target = self
30            .store
31            .get_embedding(file_path, symbol_name)?
32            .ok_or_else(|| anyhow::anyhow!("Symbol '{}' not found in index", symbol_name))?;
33
34        let oversample = max_results.saturating_add(8).max(1);
35        let scored = self
36            .store
37            .search(&target.embedding, oversample)?
38            .into_iter()
39            .filter(|c| !(c.file_path == file_path && c.symbol_name == symbol_name))
40            .take(max_results)
41            .map(SemanticMatch::from)
42            .collect();
43        Ok(scored)
44    }
45
    /// Find near-duplicate code pairs across the codebase.
    ///
    /// Thin wrapper: forwards `threshold` and `max_pairs` unchanged to
    /// [`Self::find_duplicates_in_scope`] with no scope, so every stored
    /// chunk is used as an anchor.
    ///
    /// `threshold` is the minimum cosine similarity a pair must reach and
    /// `max_pairs` caps the number of returned pairs. This function applies
    /// no default of its own; the 0.85 default mentioned elsewhere in this
    /// file is presumably supplied by callers — NOTE(review): confirm.
    pub fn find_duplicates(&self, threshold: f64, max_pairs: usize) -> Result<Vec<DuplicatePair>> {
        self.find_duplicates_in_scope(threshold, max_pairs, None)
    }
51
52    fn normalize_duplicate_scope(scope: Option<&str>) -> Option<String> {
53        let scope = scope?
54            .trim()
55            .trim_start_matches("./")
56            .trim_end_matches('/')
57            .replace('\\', "/");
58        if scope.is_empty() || scope == "." {
59            None
60        } else {
61            Some(scope)
62        }
63    }
64
65    fn file_in_duplicate_scope(scope: &str, file_path: &str) -> bool {
66        let file_path = file_path.trim_start_matches("./");
67        file_path == scope
68            || file_path
69                .strip_prefix(scope)
70                .is_some_and(|suffix| suffix.starts_with('/'))
71    }
72
73    fn duplicate_pair_matches_scope(scope: Option<&str>, file_a: &str, file_b: &str) -> bool {
74        let Some(scope) = scope else {
75            return true;
76        };
77        Self::file_in_duplicate_scope(scope, file_a) || Self::file_in_duplicate_scope(scope, file_b)
78    }
79
    /// Find near-duplicate code pairs, using scoped anchors when `scope` is provided.
    ///
    /// Candidate search remains global, so cross-boundary duplicates remain
    /// visible without paying a full-corpus anchor scan for narrow scopes.
    ///
    /// Anchors are delivered by the store in batches. Each batch is processed
    /// in three phases: (1) search the store for candidates near every anchor,
    /// (2) load the embeddings of candidates not yet cached, (3) score each
    /// anchor/candidate pair and record those that pass the threshold.
    ///
    /// * `threshold` – minimum cosine similarity; raised per pair by
    ///   `effective_duplicate_threshold` based on the two file paths.
    /// * `max_pairs` – upper bound on returned pairs; `0` returns an empty
    ///   list without touching the store.
    /// * `scope` – optional path prefix, normalized with
    ///   `normalize_duplicate_scope`; `None` (or a scope that normalizes to
    ///   `None`) uses every stored chunk as an anchor.
    ///
    /// The result is sorted by descending similarity. Collection stops as soon
    /// as `max_pairs` pairs have been recorded in scan order, so the result is
    /// the first `max_pairs` qualifying pairs found, not necessarily the
    /// globally most similar ones.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by the store (search, embedding lookup,
    /// or batch iteration).
    pub fn find_duplicates_in_scope(
        &self,
        threshold: f64,
        max_pairs: usize,
        scope: Option<&str>,
    ) -> Result<Vec<DuplicatePair>> {
        if max_pairs == 0 {
            return Ok(Vec::new());
        }

        let scope = Self::normalize_duplicate_scope(scope);
        let mut pairs = Vec::new();
        // Keys of pairs already examined, so a pair is considered only once.
        let mut seen_pairs = HashSet::new();
        // Candidate chunks (with embedding and text) loaded so far; shared
        // across batches so each candidate is fetched at most once.
        let mut embedding_cache: HashMap<StoredChunkKey, Arc<EmbeddingChunk>> = HashMap::new();
        let candidate_limit = duplicate_candidate_limit(max_pairs);
        // Set once `max_pairs` is reached; later batches then return at once.
        let mut done = false;

        let mut visit_batch = |batch: Vec<EmbeddingChunk>| {
            if done {
                return Ok(());
            }

            // One candidate list per processed anchor, in batch order.
            let mut candidate_lists = Vec::with_capacity(batch.len());
            // Candidates whose full chunk is not cached yet, deduplicated by
            // `missing_keys`.
            let mut missing_candidates = Vec::new();
            let mut missing_keys = HashSet::new();

            // Phase 1: collect candidates for every anchor in the batch.
            for chunk in &batch {
                if pairs.len() >= max_pairs {
                    done = true;
                    break;
                }

                let filtered: Vec<ScoredChunk> = self
                    .store
                    .search(&chunk.embedding, candidate_limit)?
                    .into_iter()
                    // Drop the anchor itself when it appears in its own results.
                    .filter(|candidate| {
                        !(chunk.file_path == candidate.file_path
                            && chunk.symbol_name == candidate.symbol_name
                            && chunk.line == candidate.line
                            && chunk.signature == candidate.signature
                            && chunk.name_path == candidate.name_path)
                    })
                    // Keep the pair only if at least one side is inside the scope.
                    .filter(|candidate| {
                        Self::duplicate_pair_matches_scope(
                            scope.as_deref(),
                            &chunk.file_path,
                            &candidate.file_path,
                        )
                    })
                    .collect();

                for candidate in &filtered {
                    let cache_key = stored_chunk_key_for_score(candidate);
                    if !embedding_cache.contains_key(&cache_key) && missing_keys.insert(cache_key) {
                        missing_candidates.push(candidate.clone());
                    }
                }

                candidate_lists.push(filtered);
            }

            // Phase 2: load all uncached candidate chunks for this batch with
            // a single store call.
            if !missing_candidates.is_empty() {
                for candidate_chunk in self
                    .store
                    .embeddings_for_scored_chunks(&missing_candidates)?
                {
                    embedding_cache
                        .entry(stored_chunk_key(&candidate_chunk))
                        .or_insert_with(|| Arc::new(candidate_chunk));
                }
            }

            // Phase 3: score anchor/candidate pairs. `zip` stops at the
            // shorter side, so anchors skipped by an early `break` in phase 1
            // are not scored.
            for (chunk, candidates) in batch.iter().zip(candidate_lists.iter()) {
                if pairs.len() >= max_pairs {
                    done = true;
                    break;
                }

                for candidate in candidates {
                    let pair_key = duplicate_pair_key(
                        &chunk.file_path,
                        &chunk.symbol_name,
                        &candidate.file_path,
                        &candidate.symbol_name,
                    );
                    // NOTE(review): the pair is marked as seen before the
                    // cache lookup and threshold check below, so a pair
                    // skipped by either is not reconsidered later — confirm
                    // this is intended.
                    if !seen_pairs.insert(pair_key) {
                        continue;
                    }

                    // Candidates the store returned no chunk for are skipped.
                    let Some(candidate_chunk) =
                        embedding_cache.get(&stored_chunk_key_for_score(candidate))
                    else {
                        continue;
                    };

                    let sim = cosine_similarity(&chunk.embedding, &candidate_chunk.embedding);
                    // G6: structured/config filetypes need a higher floor than
                    // code, because boilerplate structure inflates cosine.
                    let effective_threshold = effective_duplicate_threshold(
                        threshold,
                        &chunk.file_path,
                        &candidate_chunk.file_path,
                    );
                    // NOTE(review): a NaN `sim` would not be rejected here,
                    // because `NaN < x` is false — confirm that
                    // `cosine_similarity` never returns NaN.
                    if sim < effective_threshold {
                        continue;
                    }

                    // #299: a high embedding cosine can match on
                    // signature + identifier shape alone — three
                    // namespaced wrappers around the same helper
                    // produced 0.94–0.96 pairs even though their
                    // predicates diverged. Tag the pair when body token
                    // Jaccard contradicts the cosine so callers can
                    // suppress signature-only matches.
                    let jaccard = body_token_jaccard(&chunk.text, &candidate_chunk.text);
                    let signature_only_match = matches!(
                        (sim >= SIGNATURE_ONLY_COSINE_FLOOR, jaccard),
                        (true, Some(j)) if j < SIGNATURE_ONLY_JACCARD_CEIL
                    );

                    pairs.push(DuplicatePair {
                        symbol_a: format!("{}:{}", chunk.file_path, chunk.symbol_name),
                        symbol_b: format!(
                            "{}:{}",
                            candidate_chunk.file_path, candidate_chunk.symbol_name
                        ),
                        file_a: chunk.file_path.clone(),
                        file_b: candidate_chunk.file_path.clone(),
                        line_a: chunk.line,
                        line_b: candidate_chunk.line,
                        similarity: sim,
                        body_token_jaccard: jaccard,
                        signature_only_match,
                        kind_a: chunk.kind.clone(),
                        kind_b: candidate_chunk.kind.clone(),
                    });
                    if pairs.len() >= max_pairs {
                        done = true;
                        break;
                    }
                }
            }
            Ok(())
        };

        // With a scope only in-scope chunks are requested as anchors;
        // otherwise every stored chunk is visited.
        if let Some(scope) = scope.as_deref() {
            self.store.for_each_embedding_batch_in_scope(
                scope,
                DEFAULT_DUPLICATE_SCAN_BATCH_SIZE,
                &mut visit_batch,
            )?;
        } else {
            self.store
                .for_each_embedding_batch(DEFAULT_DUPLICATE_SCAN_BATCH_SIZE, &mut visit_batch)?;
        }

        // Most similar pairs first; incomparable values (NaN) compare as equal.
        pairs.sort_by(|a, b| {
            b.similarity
                .partial_cmp(&a.similarity)
                .unwrap_or(std::cmp::Ordering::Equal)
        });
        Ok(pairs)
    }
248}
249
250// ── G6: filetype-aware duplicate threshold ─────────────────────────
251// Structured/config filetypes (CI YAML, lockfiles, JSON/TOML, Markdown)
252// share boilerplate structure that inflates embedding cosine, producing
253// false-positive "duplicate" pairs at the default 0.85 floor. Raise the
254// floor for those filetypes; code files keep the caller's threshold so
255// behavior is unchanged for code-vs-code pairs.
256
/// Minimum cosine similarity required of a duplicate pair that involves a
/// structured/config filetype (CI YAML, lockfiles, JSON/TOML/INI, Markdown),
/// whose shared boilerplate inflates embedding cosine.
const STRUCTURED_FILETYPE_DUPLICATE_FLOOR: f64 = 0.95;

/// Look up the similarity floor implied by the extension of `path`.
///
/// Returns [`STRUCTURED_FILETYPE_DUPLICATE_FLOOR`] for the structured/config
/// extensions listed below (compared ASCII case-insensitively) and `0.0` for
/// everything else, including paths without an extension.
fn structured_filetype_floor(path: &str) -> f64 {
    /// Extensions treated as structured/config, lowercase and without a dot.
    const STRUCTURED_EXTENSIONS: [&str; 8] =
        ["yml", "yaml", "json", "toml", "lock", "md", "cfg", "ini"];

    let is_structured = std::path::Path::new(path)
        .extension()
        .and_then(|ext| ext.to_str())
        .is_some_and(|ext| {
            STRUCTURED_EXTENSIONS
                .iter()
                .any(|known| ext.eq_ignore_ascii_case(known))
        });

    if is_structured {
        STRUCTURED_FILETYPE_DUPLICATE_FLOOR
    } else {
        0.0
    }
}
275
276/// Effective duplicate threshold for a pair: the higher of the caller's
277/// `base` and the structured-filetype floor of either file.
278fn effective_duplicate_threshold(base: f64, file_a: &str, file_b: &str) -> f64 {
279    base.max(structured_filetype_floor(file_a))
280        .max(structured_filetype_floor(file_b))
281}
282
#[cfg(test)]
mod g6_filetype_threshold_tests {
    use super::{effective_duplicate_threshold, structured_filetype_floor};

    /// Structured/config paths, including a mixed-case extension, resolve to
    /// the raised floor.
    #[test]
    fn structured_filetypes_get_higher_floor() {
        for path in [
            "ci.yml",
            "a/b/config.yaml",
            "Cargo.lock",
            "data.json",
            "Config.TOML",
        ] {
            assert_eq!(structured_filetype_floor(path), 0.95);
        }
    }

    /// Code files and paths without an extension impose no floor.
    #[test]
    fn code_and_unknown_filetypes_get_no_floor() {
        for path in ["src/main.rs", "app.py", "noext"] {
            assert_eq!(structured_filetype_floor(path), 0.0);
        }
    }

    /// A structured file on either side of the pair raises the threshold.
    #[test]
    fn effective_threshold_raises_when_either_side_structured() {
        for (file_a, file_b) in [("a.yml", "b.rs"), ("a.rs", "b.yaml")] {
            assert_eq!(effective_duplicate_threshold(0.85, file_a, file_b), 0.95);
        }
    }

    /// Code-vs-code pairs keep the caller's threshold untouched.
    #[test]
    fn effective_threshold_keeps_base_for_code_pairs() {
        let threshold = effective_duplicate_threshold(0.85, "a.rs", "b.rs");
        assert_eq!(threshold, 0.85);
    }

    /// A caller threshold above the structured floor wins.
    #[test]
    fn effective_threshold_respects_stricter_base() {
        let t = effective_duplicate_threshold(0.97, "a.yml", "b.yml");
        let deviation = (t - 0.97).abs();
        assert!(deviation < 1e-9, "got {t}");
    }
}