Skip to main content

codelens_engine/embedding/
chunk_ops.rs

1use crate::embedding_store::{EmbeddingChunk, ScoredChunk};
2use serde::Serialize;
3
/// Identity key for a stored embedding chunk.
///
/// Tuple layout, in order: `(file_path, symbol_name, line, signature,
/// name_path)` — the fields copied out by [`stored_chunk_key`] and
/// [`stored_chunk_key_for_score`].
pub type StoredChunkKey = (String, String, usize, String, String);
5
6pub fn stored_chunk_key(chunk: &EmbeddingChunk) -> StoredChunkKey {
7    (
8        chunk.file_path.clone(),
9        chunk.symbol_name.clone(),
10        chunk.line,
11        chunk.signature.clone(),
12        chunk.name_path.clone(),
13    )
14}
15
16pub fn stored_chunk_key_for_score(chunk: &ScoredChunk) -> StoredChunkKey {
17    (
18        chunk.file_path.clone(),
19        chunk.symbol_name.clone(),
20        chunk.line,
21        chunk.signature.clone(),
22        chunk.name_path.clone(),
23    )
24}
25
/// Number of candidates to consider when up to `max_pairs` duplicate pairs
/// are requested.
///
/// The limit is four times `max_pairs` (saturating on overflow), bounded to
/// the inclusive range `32..=128`.
pub fn duplicate_candidate_limit(max_pairs: usize) -> usize {
    const FLOOR: usize = 32;
    const CEILING: usize = 128;

    let scaled = max_pairs.saturating_mul(4);
    scaled.max(FLOOR).min(CEILING)
}
29
/// Order-independent key for a pair of `(file, symbol)` endpoints.
///
/// Both endpoints are copied into owned `(String, String)` tuples and
/// returned with the lexicographically smaller one first, so
/// `duplicate_pair_key(a, b)` and `duplicate_pair_key(b, a)` yield the same
/// key.
pub fn duplicate_pair_key(
    file_a: &str,
    symbol_a: &str,
    file_b: &str,
    symbol_b: &str,
) -> ((String, String), (String, String)) {
    let mut first = (file_a.to_owned(), symbol_a.to_owned());
    let mut second = (file_b.to_owned(), symbol_b.to_owned());
    // Only reorder when the second endpoint sorts strictly before the first.
    if second < first {
        std::mem::swap(&mut first, &mut second);
    }
    (first, second)
}
44
/// #299: cosine-similarity floor for the signature-only diagnostic.
///
/// A pair whose embedding similarity reaches this value is a candidate for
/// the signature-only check (see `DuplicatePair::signature_only_match`,
/// which documents the band as ≥0.85). Pairs below it stay as vanilla
/// near-duplicates.
///
/// NOTE(review): confirm at the call site whether the comparison is `>=`
/// or `>`.
pub const SIGNATURE_ONLY_COSINE_FLOOR: f64 = 0.85;
49
/// #299: token-Jaccard ceiling for the signature-only diagnostic.
///
/// When the two bodies' token sets have a Jaccard index below this value
/// (shared tokens are fewer than half of the combined token set), a high
/// cosine is treated as a signature/identifier-shape collision rather than
/// real code duplication. The Jaccard value is the one produced by
/// [`body_token_jaccard`].
pub const SIGNATURE_ONLY_JACCARD_CEIL: f64 = 0.5;
55
/// #299: splits body text into a set of normalised tokens.
///
/// A token is a maximal run of ASCII alphanumeric characters, lowercased.
/// Every other character — whitespace, punctuation, `_`, and any non-ASCII
/// character — acts as a separator, so `collect_files` contributes
/// `collect` and `files`. Tokens shorter than two characters are dropped,
/// as are the keyword-style stop tokens listed in `STOPWORDS`, so that the
/// resulting set reflects the named identifiers in the body rather than
/// control-flow boilerplate.
pub fn body_tokens(text: &str) -> std::collections::HashSet<String> {
    const STOPWORDS: &[&str] = &[
        "fn", "let", "mut", "pub", "use", "mod", "if", "else", "for", "while", "loop", "match",
        "return", "self", "true", "false", "as", "in", "of", "the", "and", "or", "not", "is",
        "this", "that", "ok", "err", "none", "some", "result",
    ];

    text.split(|ch: char| !ch.is_ascii_alphanumeric())
        // Pieces are pure ASCII, so byte length equals character count;
        // this also discards the empty pieces between adjacent separators.
        .filter(|word| word.len() >= 2)
        .map(str::to_ascii_lowercase)
        // Stopwords are matched after lowercasing, so `Result` is dropped too.
        .filter(|word| !STOPWORDS.contains(&word.as_str()))
        .collect()
}
87
88/// #299: token-set Jaccard. Returns `None` when both sides are empty;
89/// `Some(0.0)` when only one side is empty.
90pub fn body_token_jaccard(text_a: &str, text_b: &str) -> Option<f64> {
91    let a = body_tokens(text_a);
92    let b = body_tokens(text_b);
93    if a.is_empty() && b.is_empty() {
94        return None;
95    }
96    let inter = a.intersection(&b).count() as f64;
97    let union = a.union(&b).count() as f64;
98    if union == 0.0 {
99        return Some(0.0);
100    }
101    Some(inter / union)
102}
103
/// Cosine similarity of two f32 embedding vectors.
///
/// The dot product and both squared norms are accumulated in f32 in a
/// single pass over the paired elements; the accumulators are widened to
/// f64 only for the square roots and the final division.
///
/// Returns `0.0` when either vector has zero norm (including empty input).
///
/// The slices are expected to have equal length. This is checked with a
/// `debug_assert_eq!` only; when debug assertions are disabled, the extra
/// elements of the longer slice are ignored because the pairing stops at
/// the shorter one.
pub fn cosine_similarity(a: &[f32], b: &[f32]) -> f64 {
    debug_assert_eq!(a.len(), b.len());

    let (dot, sq_a, sq_b) = a.iter().zip(b).fold(
        (0.0f32, 0.0f32, 0.0f32),
        |(dot, sq_a, sq_b), (&x, &y)| (dot + x * y, sq_a + x * x, sq_b + y * y),
    );

    let len_a = f64::from(sq_a).sqrt();
    let len_b = f64::from(sq_b).sqrt();
    if len_a == 0.0 || len_b == 0.0 {
        return 0.0;
    }
    f64::from(dot) / (len_a * len_b)
}
128
/// One reported pair of similar symbols, with the `_a`/`_b` fields
/// describing the two sides.
///
/// Serialized via `serde`; `body_token_jaccard` is omitted when `None`,
/// `signature_only_match` is omitted when `false`, and `kind_a`/`kind_b`
/// are never serialized.
#[derive(Debug, Clone, Serialize)]
pub struct DuplicatePair {
    /// Symbol name of side A.
    pub symbol_a: String,
    /// Symbol name of side B.
    pub symbol_b: String,
    /// File containing side A.
    pub file_a: String,
    /// File containing side B.
    pub file_b: String,
    /// Line of side A (presumably the symbol's line in `file_a` — confirm
    /// against the producer).
    pub line_a: usize,
    /// Line of side B (presumably the symbol's line in `file_b` — confirm
    /// against the producer).
    pub line_b: usize,
    /// Embedding-based similarity of the two sides (the cosine score that
    /// the #299 notes below refer to).
    pub similarity: f64,
    /// #299: token-level Jaccard between the two function bodies. The
    /// embedding-based `similarity` blends signature + identifier
    /// shapes, so two namespaced wrappers calling the same helper with
    /// different predicates (e.g. `collect_files(root, p1)` vs
    /// `collect_files(root, p2)`) score in the 0.94–0.96 band even
    /// though the bodies diverge. The Jaccard component is computed on
    /// alphanumeric tokens of the indexed `text` and lets consumers
    /// downgrade or filter signature-only matches. `None` when one or
    /// both bodies were not indexed in the embedding store.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub body_token_jaccard: Option<f64>,
    /// #299: convenience flag — true when the embedding similarity is
    /// high (≥0.85) but `body_token_jaccard` is low (<0.5), i.e. the
    /// pair likely matches only on signature/identifier shape. Always
    /// derived from `body_token_jaccard`; never set when the Jaccard is
    /// missing.
    // `std::ops::Not::not` returns true for `false`, so the field is left
    // out of the serialized output unless the flag is set.
    #[serde(skip_serializing_if = "std::ops::Not::not")]
    pub signature_only_match: bool,
    /// G7: symbol kind ("function"/"method"/"variable"/"constant"/…) for
    /// each side, copied from the embedding chunk. The
    /// cleanup_duplicate_logic workflow uses these to suppress same-file
    /// *cross-symbol data* pairs — adjacent local variables/constants whose
    /// short declarations score a high cosine (0.9+) but are distinct
    /// values, not shared logic. Filter-internal only: `#[serde(skip)]`
    /// keeps it out of the response payload so consumer output is unchanged.
    #[serde(skip)]
    pub kind_a: String,
    /// G7: symbol kind of side B; see `kind_a`. Also excluded from the
    /// serialized payload.
    #[serde(skip)]
    pub kind_b: String,
}
168
/// A category label paired with a numeric score; serializable via `serde`.
///
/// NOTE(review): the score's range and meaning are set by the producer,
/// which is not defined in this module — confirm before documenting further.
#[derive(Debug, Clone, Serialize)]
pub struct CategoryScore {
    /// Category label.
    pub category: String,
    /// Score assigned to `category`.
    pub score: f64,
}
174
/// A symbol reported as an outlier, identified by file, name, kind and line;
/// serializable via `serde`.
#[derive(Debug, Clone, Serialize)]
pub struct OutlierSymbol {
    /// File containing the symbol.
    pub file_path: String,
    /// Name of the symbol.
    pub symbol_name: String,
    /// Symbol kind (presumably the same vocabulary as
    /// `DuplicatePair::kind_a`, e.g. "function"/"method" — confirm).
    pub kind: String,
    /// Line of the symbol (presumably within `file_path` — confirm).
    pub line: usize,
    /// Average similarity between this symbol and its file.
    /// NOTE(review): presumably the mean embedding similarity to the other
    /// symbols in `file_path`; verify against the code that fills this in.
    pub avg_similarity_to_file: f64,
}
183
/// Serialises an embedding to raw bytes.
///
/// Each `f32` is written as its four little-endian bytes, in element order,
/// so the result is `4 * embedding.len()` bytes long.
pub fn embedding_to_bytes(embedding: &[f32]) -> Vec<u8> {
    let mut bytes = Vec::with_capacity(embedding.len() * std::mem::size_of::<f32>());
    for value in embedding {
        bytes.extend_from_slice(&value.to_le_bytes());
    }
    bytes
}
187
#[cfg(test)]
mod tests {
    use super::*;

    /// Stopwords and tokens shorter than two characters never reach the set.
    #[test]
    fn body_tokens_strips_stopwords_and_short_tokens() {
        let toks = body_tokens("fn foo(x: i32) -> i32 { let mut y = x + 1; y }");
        // `fn`, `let`, `mut` removed; single letters (x, y) skipped
        assert!(toks.contains("foo"));
        assert!(toks.contains("i32"));
        assert!(!toks.contains("fn"));
        assert!(!toks.contains("let"));
        assert!(!toks.contains("mut"));
        assert!(!toks.contains("x"));
        assert!(!toks.contains("y"));
    }

    /// Identical token sets have intersection == union, so the index is 1.
    #[test]
    fn body_token_jaccard_identical_bodies_is_one() {
        let body = "fn collect(root: &Path) -> Vec<PathBuf> { walker(root, predicate_a) }";
        let j = body_token_jaccard(body, body).unwrap();
        assert!((j - 1.0).abs() < 1e-9);
    }

    /// #299 reproduction: same shape, different predicate identifier.
    ///
    /// Identifiers are split on `_`, so each body tokenises to nine tokens
    /// (collect, root, path, vec, pathbuf, files, supports, graph, plus
    /// `call` on one side and `import` on the other). Eight are shared and
    /// the union has ten, giving 0.8. This particular pair therefore sits
    /// *above* `SIGNATURE_ONLY_JACCARD_CEIL`; the index only drops below
    /// the ceiling once more of the identifiers diverge.
    #[test]
    fn body_token_jaccard_diverging_predicates_is_partial_overlap() {
        let a = "fn collect_a(root: &Path) -> Vec<PathBuf> { collect_files(root, supports_call_graph) }";
        let b = "fn collect_b(root: &Path) -> Vec<PathBuf> { collect_files(root, supports_import_graph) }";
        let j = body_token_jaccard(a, b).unwrap();
        assert!((j - 0.8).abs() < 1e-9);
    }

    /// No shared tokens: the intersection is empty, so the index is 0.
    #[test]
    fn body_token_jaccard_disjoint_returns_zero() {
        let j = body_token_jaccard("alpha beta gamma", "delta epsilon zeta").unwrap();
        assert!(j.abs() < 1e-9);
    }

    /// With no tokens on either side the index is undefined.
    #[test]
    fn body_token_jaccard_both_empty_returns_none() {
        assert!(body_token_jaccard("", "").is_none());
    }

    /// Exactly one empty side is defined and equals zero, in either order.
    #[test]
    fn body_token_jaccard_one_side_empty_returns_zero() {
        assert_eq!(body_token_jaccard("alpha beta", ""), Some(0.0));
        assert_eq!(body_token_jaccard("", "alpha beta"), Some(0.0));
    }
}
237}