Skip to main content

codelens_engine/embedding/
chunk_ops.rs

1use crate::embedding_store::{EmbeddingChunk, ScoredChunk};
2use serde::Serialize;
3
/// Owned tuple key identifying a stored chunk.
///
/// The components are, in order: `(file_path, symbol_name, line, signature, name_path)`.
pub type StoredChunkKey = (String, String, usize, String, String);
5
6pub fn stored_chunk_key(chunk: &EmbeddingChunk) -> StoredChunkKey {
7    (
8        chunk.file_path.clone(),
9        chunk.symbol_name.clone(),
10        chunk.line,
11        chunk.signature.clone(),
12        chunk.name_path.clone(),
13    )
14}
15
16pub fn stored_chunk_key_for_score(chunk: &ScoredChunk) -> StoredChunkKey {
17    (
18        chunk.file_path.clone(),
19        chunk.symbol_name.clone(),
20        chunk.line,
21        chunk.signature.clone(),
22        chunk.name_path.clone(),
23    )
24}
25
/// Derives a candidate limit from the requested number of duplicate pairs.
///
/// The result is `max_pairs * 4` (saturating instead of overflowing), clamped to the
/// inclusive range `32..=128`.
pub fn duplicate_candidate_limit(max_pairs: usize) -> usize {
    const OVERSAMPLE_FACTOR: usize = 4;
    const MIN_CANDIDATES: usize = 32;
    const MAX_CANDIDATES: usize = 128;

    max_pairs
        .saturating_mul(OVERSAMPLE_FACTOR)
        .clamp(MIN_CANDIDATES, MAX_CANDIDATES)
}
29
/// Builds an order-independent key for a pair of `(file, symbol)` identifiers.
///
/// The two `(file, symbol)` tuples are compared lexicographically (file first, then
/// symbol) and the smaller one is placed first, so calling this with the two sides
/// swapped yields the same key.
pub fn duplicate_pair_key(
    file_a: &str,
    symbol_a: &str,
    file_b: &str,
    symbol_b: &str,
) -> ((String, String), (String, String)) {
    let left = (file_a, symbol_a);
    let right = (file_b, symbol_b);

    // Decide the order on the borrowed tuples; `str` and `String` compare identically,
    // so this matches ordering the owned tuples.
    let (first, second) = if left <= right {
        (left, right)
    } else {
        (right, left)
    };

    (
        (first.0.to_owned(), first.1.to_owned()),
        (second.0.to_owned(), second.1.to_owned()),
    )
}
44
/// Cosine similarity of two `f32` embedding vectors, returned as `f64`.
///
/// The dot product and both squared norms are accumulated in `f32` in a single zipped
/// pass; the square roots and the final division are done in `f64`.
///
/// Returns `0.0` when either vector has zero norm, which includes empty input.
///
/// The result is clamped to `[-1.0, 1.0]`. Without the clamp, rounding in
/// `sqrt(n) * sqrt(n)` can push the quotient marginally out of range (two identical
/// `[1.0, 1.0, 1.0]` vectors yield `1.0000000000000002`), which turns into `NaN` as
/// soon as a caller feeds the value to `acos`.
///
/// Both slices are expected to have the same length. This is only checked in debug
/// builds; in release builds the trailing elements of the longer slice are ignored.
pub fn cosine_similarity(a: &[f32], b: &[f32]) -> f64 {
    debug_assert_eq!(a.len(), b.len());

    // One straightforward pass over both slices, kept simple so the optimizer is free
    // to unroll it. NOTE(review): strict float addition order may limit
    // auto-vectorization; profile before relying on it.
    let (mut dot, mut norm_a, mut norm_b) = (0.0f32, 0.0f32, 0.0f32);
    for (x, y) in a.iter().zip(b.iter()) {
        dot += x * y;
        norm_a += x * x;
        norm_b += y * y;
    }

    let norm_a = f64::from(norm_a).sqrt();
    let norm_b = f64::from(norm_b).sqrt();
    if norm_a == 0.0 || norm_b == 0.0 {
        return 0.0;
    }

    // Clamp away rounding overshoot so the value is a valid cosine.
    (f64::from(dot) / (norm_a * norm_b)).clamp(-1.0, 1.0)
}
69
70#[derive(Debug, Clone, Serialize)]
71pub struct DuplicatePair {
72    pub symbol_a: String,
73    pub symbol_b: String,
74    pub file_a: String,
75    pub file_b: String,
76    pub line_a: usize,
77    pub line_b: usize,
78    pub similarity: f64,
79}
80
81#[derive(Debug, Clone, Serialize)]
82pub struct CategoryScore {
83    pub category: String,
84    pub score: f64,
85}
86
87#[derive(Debug, Clone, Serialize)]
88pub struct OutlierSymbol {
89    pub file_path: String,
90    pub symbol_name: String,
91    pub kind: String,
92    pub line: usize,
93    pub avg_similarity_to_file: f64,
94}
95
/// Serializes an embedding into a flat byte buffer.
///
/// Each `f32` is written as its 4 little-endian bytes, in input order, so the output
/// length is `4 * embedding.len()`. An empty slice yields an empty vector.
pub fn embedding_to_bytes(embedding: &[f32]) -> Vec<u8> {
    // The final size is known up front, so allocate once.
    let mut bytes = Vec::with_capacity(embedding.len() * std::mem::size_of::<f32>());
    for value in embedding {
        bytes.extend_from_slice(&value.to_le_bytes());
    }
    bytes
}