codelens_engine/embedding/
chunk_ops.rs1use crate::embedding_store::{EmbeddingChunk, ScoredChunk};
2use serde::Serialize;
3
/// Tuple used to identify a stored chunk.
///
/// Field order is `(file_path, symbol_name, line, signature, name_path)`,
/// matching how the key-building functions in this module fill it.
pub type StoredChunkKey = (String, String, usize, String, String);
5
6pub fn stored_chunk_key(chunk: &EmbeddingChunk) -> StoredChunkKey {
7 (
8 chunk.file_path.clone(),
9 chunk.symbol_name.clone(),
10 chunk.line,
11 chunk.signature.clone(),
12 chunk.name_path.clone(),
13 )
14}
15
16pub fn stored_chunk_key_for_score(chunk: &ScoredChunk) -> StoredChunkKey {
17 (
18 chunk.file_path.clone(),
19 chunk.symbol_name.clone(),
20 chunk.line,
21 chunk.signature.clone(),
22 chunk.name_path.clone(),
23 )
24}
25
/// Derives a candidate count from the requested number of pairs.
///
/// The result is four times `max_pairs` (saturating on overflow), bounded to
/// the inclusive range `32..=128`.
pub fn duplicate_candidate_limit(max_pairs: usize) -> usize {
    const LOWER_BOUND: usize = 32;
    const UPPER_BOUND: usize = 128;

    let widened = max_pairs.saturating_mul(4);
    widened.max(LOWER_BOUND).min(UPPER_BOUND)
}
29
/// Builds an order-independent key for a pair of `(file, symbol)` entries.
///
/// The two entries are compared lexicographically (file first, then symbol)
/// and the smaller one is placed first, so swapping the `a` and `b` arguments
/// yields the same key.
pub fn duplicate_pair_key(
    file_a: &str,
    symbol_a: &str,
    file_b: &str,
    symbol_b: &str,
) -> ((String, String), (String, String)) {
    let first = (file_a, symbol_a);
    let second = (file_b, symbol_b);

    // Decide the ordering on the borrowed pairs, then allocate the owned key.
    let (low, high) = if first <= second {
        (first, second)
    } else {
        (second, first)
    };

    (
        (low.0.to_owned(), low.1.to_owned()),
        (high.0.to_owned(), high.1.to_owned()),
    )
}
44
/// Cosine-similarity threshold (`0.85`) for "signature-only" matches.
///
/// NOTE(review): this constant is not referenced in the visible part of the
/// file; presumably a pair must reach at least this cosine similarity before
/// it can be flagged as a signature-only match. Confirm against the caller.
pub const SIGNATURE_ONLY_COSINE_FLOOR: f64 = 0.85;
49
/// Body-token Jaccard threshold (`0.5`) for "signature-only" matches.
///
/// NOTE(review): this constant is not referenced in the visible part of the
/// file; presumably a pair whose body-token overlap is below this value is
/// treated as matching on signature only. Confirm against the caller.
pub const SIGNATURE_ONLY_JACCARD_CEIL: f64 = 0.5;
55
/// Extracts the set of distinct word-like tokens from `text`.
///
/// A token is a maximal run of ASCII letters and digits; every other
/// character (including `_` and non-ASCII characters) acts as a separator.
/// Tokens are lowercased, then discarded when they are shorter than two
/// characters or appear in the built-in stopword list.
pub fn body_tokens(text: &str) -> std::collections::HashSet<String> {
    const STOPWORDS: &[&str] = &[
        "fn", "let", "mut", "pub", "use", "mod", "if", "else", "for", "while", "loop", "match",
        "return", "self", "true", "false", "as", "in", "of", "the", "and", "or", "not", "is",
        "this", "that", "ok", "err", "none", "some", "result",
    ];

    text.split(|ch: char| !ch.is_ascii_alphanumeric())
        // Consecutive separators yield empty pieces; the length filter drops
        // those together with single-character tokens.
        .filter(|word| word.len() >= 2)
        .map(str::to_ascii_lowercase)
        .filter(|word| !STOPWORDS.contains(&word.as_str()))
        .collect()
}
87
88pub fn body_token_jaccard(text_a: &str, text_b: &str) -> Option<f64> {
91 let a = body_tokens(text_a);
92 let b = body_tokens(text_b);
93 if a.is_empty() && b.is_empty() {
94 return None;
95 }
96 let inter = a.intersection(&b).count() as f64;
97 let union = a.union(&b).count() as f64;
98 if union == 0.0 {
99 return Some(0.0);
100 }
101 Some(inter / union)
102}
103
/// Computes the cosine similarity of two vectors.
///
/// The dot product and both squared norms are accumulated in `f32` in a
/// single pass and only widened to `f64` for the square roots and the final
/// division.
///
/// Returns `0.0` when either vector has zero magnitude (including empty
/// input).
///
/// # Panics
///
/// In debug builds, panics when `a` and `b` differ in length. Release builds
/// do not check; the extra trailing elements of the longer slice are ignored.
pub fn cosine_similarity(a: &[f32], b: &[f32]) -> f64 {
    debug_assert_eq!(a.len(), b.len());

    let (dot, sq_a, sq_b) = a.iter().zip(b.iter()).fold(
        (0.0f32, 0.0f32, 0.0f32),
        |(dot, sq_a, sq_b), (&x, &y)| (dot + x * y, sq_a + x * x, sq_b + y * y),
    );

    let magnitude_a = f64::from(sq_a).sqrt();
    let magnitude_b = f64::from(sq_b).sqrt();
    if magnitude_a == 0.0 || magnitude_b == 0.0 {
        return 0.0;
    }
    f64::from(dot) / (magnitude_a * magnitude_b)
}
128
/// A pair of symbols (`a` and `b`) reported together with a similarity score.
///
/// The struct is serializable; `kind_a` and `kind_b` are never written to the
/// serialized output, and two further fields are written only when they carry
/// information (see the field docs).
#[derive(Debug, Clone, Serialize)]
pub struct DuplicatePair {
    /// Name of the first symbol.
    pub symbol_a: String,
    /// Name of the second symbol.
    pub symbol_b: String,
    /// File containing the first symbol.
    pub file_a: String,
    /// File containing the second symbol.
    pub file_b: String,
    /// Line of the first symbol.
    pub line_a: usize,
    /// Line of the second symbol.
    pub line_b: usize,
    /// Similarity score of the pair.
    /// NOTE(review): presumably the embedding cosine similarity — confirm
    /// against the code that constructs this struct.
    pub similarity: f64,
    /// Body-token Jaccard score for the pair, when one is available.
    /// Left out of the serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub body_token_jaccard: Option<f64>,
    /// Flag marking the pair as a signature-only match.
    /// Left out of the serialized output when `false`.
    /// NOTE(review): the criteria for setting it are not visible here.
    #[serde(skip_serializing_if = "std::ops::Not::not")]
    pub signature_only_match: bool,
    /// Kind of the first symbol; never serialized.
    #[serde(skip)]
    pub kind_a: String,
    /// Kind of the second symbol; never serialized.
    #[serde(skip)]
    pub kind_b: String,
}
168
/// A category label paired with a numeric score.
///
/// NOTE(review): the scale and meaning of `score` are not visible in this
/// file — confirm against the producer.
#[derive(Debug, Clone, Serialize)]
pub struct CategoryScore {
    /// Category name.
    pub category: String,
    /// Score assigned to the category.
    pub score: f64,
}
174
/// A symbol reported as an outlier, together with its location.
///
/// NOTE(review): the outlier criterion itself is not visible in this file.
#[derive(Debug, Clone, Serialize)]
pub struct OutlierSymbol {
    /// File containing the symbol.
    pub file_path: String,
    /// Name of the symbol.
    pub symbol_name: String,
    /// Kind of the symbol.
    pub kind: String,
    /// Line of the symbol.
    pub line: usize,
    /// Average similarity of this symbol to its file.
    /// NOTE(review): presumably the mean similarity to the other symbols in
    /// the same file — confirm against the producer.
    pub avg_similarity_to_file: f64,
}
183
/// Serializes an embedding into a flat byte buffer.
///
/// Each `f32` is written as its four little-endian bytes, in input order, so
/// the output length is always `4 * embedding.len()`.
pub fn embedding_to_bytes(embedding: &[f32]) -> Vec<u8> {
    let mut bytes = Vec::with_capacity(embedding.len() * std::mem::size_of::<f32>());
    for value in embedding {
        bytes.extend_from_slice(&value.to_le_bytes());
    }
    bytes
}
187
#[cfg(test)]
mod tests {
    use super::*;

    /// Identifiers and type names survive tokenization; keywords and
    /// single-character names do not.
    #[test]
    fn body_tokens_strips_stopwords_and_short_tokens() {
        let tokens = body_tokens("fn foo(x: i32) -> i32 { let mut y = x + 1; y }");
        for kept in ["foo", "i32"] {
            assert!(tokens.contains(kept), "expected token {kept:?} to be kept");
        }
        for dropped in ["fn", "let", "mut", "x", "y"] {
            assert!(
                !tokens.contains(dropped),
                "expected token {dropped:?} to be dropped"
            );
        }
    }

    /// Comparing a body with itself yields a score of exactly one (within
    /// floating-point tolerance).
    #[test]
    fn body_token_jaccard_identical_bodies_is_one() {
        let source = "fn collect(root: &Path) -> Vec<PathBuf> { walker(root, predicate_a) }";
        let score = body_token_jaccard(source, source).unwrap();
        assert!((score - 1.0).abs() < 1e-9);
    }

    /// Two wrappers that differ only in the predicate they pass overlap
    /// partially: the score is strictly between zero and one.
    ///
    /// For these particular inputs 8 of the 10 distinct tokens are shared
    /// (score 0.8), so only the open range is asserted here, not a comparison
    /// against `SIGNATURE_ONLY_JACCARD_CEIL`.
    #[test]
    fn body_token_jaccard_diverging_predicates_below_ceil() {
        let first = "fn collect_a(root: &Path) -> Vec<PathBuf> { collect_files(root, supports_call_graph) }";
        let second = "fn collect_b(root: &Path) -> Vec<PathBuf> { collect_files(root, supports_import_graph) }";
        let score = body_token_jaccard(first, second).unwrap();
        assert!(score < 1.0);
        assert!(score > 0.0);
    }

    /// Texts with no token in common score zero.
    #[test]
    fn body_token_jaccard_disjoint_returns_zero() {
        let score = body_token_jaccard("alpha beta gamma", "delta epsilon zeta").unwrap();
        assert!(score.abs() < 1e-9);
    }

    /// With no tokens on either side the score is undefined.
    #[test]
    fn body_token_jaccard_both_empty_returns_none() {
        assert_eq!(body_token_jaccard("", ""), None);
    }
}