use crate::embedding_store::{EmbeddingChunk, ScoredChunk};
use serde::Serialize;
/// Tuple key built from a stored chunk's fields.
///
/// Components, in the order produced by `stored_chunk_key` and
/// `stored_chunk_key_for_score`:
/// `(file_path, symbol_name, line, signature, name_path)`.
pub type StoredChunkKey = (String, String, usize, String, String);
/// Builds the [`StoredChunkKey`] for an [`EmbeddingChunk`].
///
/// The key is an owned copy of the chunk's `file_path`, `symbol_name`, `line`,
/// `signature` and `name_path`, in that order; the chunk itself is only borrowed.
pub fn stored_chunk_key(chunk: &EmbeddingChunk) -> StoredChunkKey {
    let file_path = chunk.file_path.clone();
    let symbol_name = chunk.symbol_name.clone();
    let line = chunk.line;
    let signature = chunk.signature.clone();
    let name_path = chunk.name_path.clone();
    (file_path, symbol_name, line, signature, name_path)
}
/// Builds the [`StoredChunkKey`] for a [`ScoredChunk`].
///
/// Uses the same five fields, in the same order, as `stored_chunk_key`:
/// `file_path`, `symbol_name`, `line`, `signature`, `name_path`.
pub fn stored_chunk_key_for_score(chunk: &ScoredChunk) -> StoredChunkKey {
    let file_path = chunk.file_path.clone();
    let symbol_name = chunk.symbol_name.clone();
    let line = chunk.line;
    let signature = chunk.signature.clone();
    let name_path = chunk.name_path.clone();
    (file_path, symbol_name, line, signature, name_path)
}
/// Returns how many candidates to consider for a requested number of pairs.
///
/// The result is `max_pairs * 4` (saturating on overflow), bounded to the
/// inclusive range `32..=128`.
pub fn duplicate_candidate_limit(max_pairs: usize) -> usize {
    const PER_PAIR: usize = 4;
    const LOWER_BOUND: usize = 32;
    const UPPER_BOUND: usize = 128;

    let scaled = max_pairs.saturating_mul(PER_PAIR);
    scaled.max(LOWER_BOUND).min(UPPER_BOUND)
}
/// Builds an order-independent key for a pair of `(file, symbol)` entries.
///
/// Each side becomes an owned `(file, symbol)` tuple; the lexicographically
/// smaller tuple is returned first, so swapping the `a` and `b` arguments
/// yields the same key.
pub fn duplicate_pair_key(
    file_a: &str,
    symbol_a: &str,
    file_b: &str,
    symbol_b: &str,
) -> ((String, String), (String, String)) {
    let mut first = (file_a.to_owned(), symbol_a.to_owned());
    let mut second = (file_b.to_owned(), symbol_b.to_owned());
    // Put the pair into canonical (ascending) order.
    if first > second {
        std::mem::swap(&mut first, &mut second);
    }
    (first, second)
}
/// Cosine-similarity threshold (0.85) associated with "signature-only" matches.
///
/// NOTE(review): the code that consumes this constant is not in this part of the
/// file; presumably a pair must reach at least this cosine similarity to be
/// considered — confirm against the caller.
pub const SIGNATURE_ONLY_COSINE_FLOOR: f64 = 0.85;
/// Body-token Jaccard threshold (0.5) associated with "signature-only" matches.
///
/// NOTE(review): presumably a pair whose `body_token_jaccard` falls below this
/// value (while its cosine similarity is high) is flagged as matching on
/// signature only — confirm against the caller.
pub const SIGNATURE_ONLY_JACCARD_CEIL: f64 = 0.5;
/// Extracts the set of distinct, lowercased word tokens from `text`.
///
/// A token is a maximal run of ASCII letters and digits; every other character
/// (including `_` and all non-ASCII characters) acts as a separator. Tokens
/// shorter than two characters and tokens found in the built-in stopword list
/// (common Rust keywords and filler words) are discarded.
pub fn body_tokens(text: &str) -> std::collections::HashSet<String> {
    const STOPWORDS: &[&str] = &[
        "fn", "let", "mut", "pub", "use", "mod", "if", "else", "for", "while", "loop", "match",
        "return", "self", "true", "false", "as", "in", "of", "the", "and", "or", "not", "is",
        "this", "that", "ok", "err", "none", "some", "result",
    ];

    text.split(|c: char| !c.is_ascii_alphanumeric())
        // Runs are pure ASCII, so byte length equals character count.
        .filter(|run| run.len() >= 2)
        .map(str::to_ascii_lowercase)
        // Stopwords are matched after lowercasing, so "Ok" and "ok" are both dropped.
        .filter(|word| !STOPWORDS.contains(&word.as_str()))
        .collect()
}
/// Computes the Jaccard similarity of the token sets of two texts.
///
/// Both texts are tokenized with `body_tokens`. Returns `None` when neither
/// text yields any token (the similarity is undefined); otherwise returns
/// `Some(|A ∩ B| / |A ∪ B|)`, a value in `[0.0, 1.0]`.
pub fn body_token_jaccard(text_a: &str, text_b: &str) -> Option<f64> {
    let a = body_tokens(text_a);
    let b = body_tokens(text_b);
    if a.is_empty() && b.is_empty() {
        return None;
    }
    let shared = a.intersection(&b).count();
    // |A ∪ B| = |A| + |B| - |A ∩ B|. At least one set is non-empty here, so the
    // union size is >= 1 and no separate zero-denominator check is needed.
    let combined = a.len() + b.len() - shared;
    Some(shared as f64 / combined as f64)
}
/// Computes the cosine similarity between two vectors.
///
/// Returns `0.0` when either vector has zero magnitude (including empty
/// input). The slices are expected to have equal length; this is checked only
/// in debug builds, and in release builds extra trailing elements of the
/// longer slice are ignored.
///
/// All sums are accumulated in `f64`: squaring an `f32` component in `f32`
/// can overflow to infinity (yielding NaN) or underflow to zero (yielding a
/// spurious `0.0`), whereas the square of any finite `f32` is representable
/// in `f64`.
pub fn cosine_similarity(a: &[f32], b: &[f32]) -> f64 {
    debug_assert_eq!(a.len(), b.len());
    let (mut dot, mut norm_a_sq, mut norm_b_sq) = (0.0f64, 0.0f64, 0.0f64);
    for (&x, &y) in a.iter().zip(b.iter()) {
        // Widen before multiplying so the products themselves are computed in f64.
        let (x, y) = (f64::from(x), f64::from(y));
        dot += x * y;
        norm_a_sq += x * x;
        norm_b_sq += y * y;
    }
    let norm_a = norm_a_sq.sqrt();
    let norm_b = norm_b_sq.sqrt();
    if norm_a == 0.0 || norm_b == 0.0 {
        0.0
    } else {
        dot / (norm_a * norm_b)
    }
}
/// A pair of symbols reported together with a similarity score.
///
/// Serializable via serde; `kind_a`/`kind_b` are excluded from the serialized
/// form, and two other fields are emitted only conditionally (see field docs).
#[derive(Debug, Clone, Serialize)]
pub struct DuplicatePair {
/// Name of the first symbol in the pair.
pub symbol_a: String,
/// Name of the second symbol in the pair.
pub symbol_b: String,
/// File containing `symbol_a`.
pub file_a: String,
/// File containing `symbol_b`.
pub file_b: String,
/// Line associated with `symbol_a` (presumably its definition line — confirm).
pub line_a: usize,
/// Line associated with `symbol_b` (presumably its definition line — confirm).
pub line_b: usize,
/// Similarity score for the pair (presumably embedding cosine similarity — confirm).
pub similarity: f64,
/// Optional body-token Jaccard score; omitted from serialized output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub body_token_jaccard: Option<f64>,
/// Flag marking a signature-only match; omitted from serialized output when `false`.
#[serde(skip_serializing_if = "std::ops::Not::not")]
pub signature_only_match: bool,
/// Kind of `symbol_a`; never serialized.
#[serde(skip)]
pub kind_a: String,
/// Kind of `symbol_b`; never serialized.
#[serde(skip)]
pub kind_b: String,
}
/// A category label paired with a numeric score; serializable via serde.
#[derive(Debug, Clone, Serialize)]
pub struct CategoryScore {
/// Category label.
pub category: String,
/// Score assigned to `category` (scale and meaning are defined by the producer — not visible here).
pub score: f64,
}
/// A symbol reported as an outlier, with its location and a similarity figure;
/// serializable via serde.
#[derive(Debug, Clone, Serialize)]
pub struct OutlierSymbol {
/// Path of the file containing the symbol.
pub file_path: String,
/// Name of the symbol.
pub symbol_name: String,
/// Kind of the symbol (the set of possible values is not visible here).
pub kind: String,
/// Line associated with the symbol (presumably its definition line — confirm).
pub line: usize,
/// NOTE(review): presumably the symbol's average similarity to the other
/// symbols in the same file — confirm against the code that fills it in.
pub avg_similarity_to_file: f64,
}
/// Serializes an embedding into a flat byte buffer.
///
/// Each `f32` is written as its 4-byte little-endian representation, in
/// order, so the result has `embedding.len() * 4` bytes.
pub fn embedding_to_bytes(embedding: &[f32]) -> Vec<u8> {
    let mut bytes = Vec::with_capacity(embedding.len() * std::mem::size_of::<f32>());
    for value in embedding {
        bytes.extend_from_slice(&value.to_le_bytes());
    }
    bytes
}
// Unit tests for the tokenizer (`body_tokens`) and the Jaccard helper
// (`body_token_jaccard`).
#[cfg(test)]
mod tests {
use super::*;
// Stopwords ("fn", "let", "mut") and single-character tokens ("x", "y") are
// dropped, while ordinary identifiers and type names are kept.
#[test]
fn body_tokens_strips_stopwords_and_short_tokens() {
let toks = body_tokens("fn foo(x: i32) -> i32 { let mut y = x + 1; y }");
assert!(toks.contains("foo"));
assert!(toks.contains("i32"));
assert!(!toks.contains("fn"));
assert!(!toks.contains("let"));
assert!(!toks.contains("mut"));
assert!(!toks.contains("x"));
assert!(!toks.contains("y"));
}
// Comparing a body with itself gives identical token sets, hence a score of 1.
#[test]
fn body_token_jaccard_identical_bodies_is_one() {
let body = "fn collect(root: &Path) -> Vec<PathBuf> { walker(root, predicate_a) }";
let j = body_token_jaccard(body, body).unwrap();
assert!((j - 1.0).abs() < 1e-9);
}
// Two bodies that differ in a single token ("call" vs "import") score strictly
// between 0 and 1.
// NOTE(review): with the current tokenizer these inputs share 8 of 10 distinct
// tokens (Jaccard 0.8), which is above SIGNATURE_ONLY_JACCARD_CEIL (0.5); despite
// "below_ceil" in the name, the test only asserts 0 < j < 1.
#[test]
fn body_token_jaccard_diverging_predicates_below_ceil() {
let a = "fn collect_a(root: &Path) -> Vec<PathBuf> { collect_files(root, supports_call_graph) }";
let b = "fn collect_b(root: &Path) -> Vec<PathBuf> { collect_files(root, supports_import_graph) }";
let j = body_token_jaccard(a, b).unwrap();
assert!(j < 1.0);
assert!(j > 0.0);
}
// Texts with no token in common score exactly 0 (returned as `Some`, not `None`).
#[test]
fn body_token_jaccard_disjoint_returns_zero() {
let j = body_token_jaccard("alpha beta gamma", "delta epsilon zeta").unwrap();
assert!(j.abs() < 1e-9);
}
// When neither input produces any token the similarity is undefined: `None`.
#[test]
fn body_token_jaccard_both_empty_returns_none() {
assert!(body_token_jaccard("", "").is_none());
}
}