// codelens_engine/embedding/chunk_ops.rs
use crate::embedding_store::{EmbeddingChunk, ScoredChunk};
use serde::Serialize;

/// Tuple key identifying a stored chunk.
///
/// The key builders in this file fill the slots, in order, from a chunk's
/// `file_path`, `symbol_name`, `line`, `signature` and `name_path` fields.
pub type StoredChunkKey = (
    String, // file_path
    String, // symbol_name
    usize,  // line
    String, // signature
    String, // name_path
);
5
6pub fn stored_chunk_key(chunk: &EmbeddingChunk) -> StoredChunkKey {
7 (
8 chunk.file_path.clone(),
9 chunk.symbol_name.clone(),
10 chunk.line,
11 chunk.signature.clone(),
12 chunk.name_path.clone(),
13 )
14}
15
16pub fn stored_chunk_key_for_score(chunk: &ScoredChunk) -> StoredChunkKey {
17 (
18 chunk.file_path.clone(),
19 chunk.symbol_name.clone(),
20 chunk.line,
21 chunk.signature.clone(),
22 chunk.name_path.clone(),
23 )
24}
25
/// Returns a candidate limit derived from `max_pairs`.
///
/// The result is `max_pairs * 4` (saturating on overflow), bounded to the
/// inclusive range `32..=128`.
pub fn duplicate_candidate_limit(max_pairs: usize) -> usize {
    const FACTOR: usize = 4;
    const FLOOR: usize = 32;
    const CEILING: usize = 128;

    let scaled = max_pairs.saturating_mul(FACTOR);
    scaled.max(FLOOR).min(CEILING)
}
29
/// Builds an order-independent key for a pair of `(file, symbol)` entries.
///
/// Each side is turned into an owned `(String, String)` tuple and the two
/// tuples are returned smaller-first (tuple ordering: file, then symbol), so
/// swapping the `a` and `b` arguments yields the same key.
pub fn duplicate_pair_key(
    file_a: &str,
    symbol_a: &str,
    file_b: &str,
    symbol_b: &str,
) -> ((String, String), (String, String)) {
    let first = (String::from(file_a), String::from(symbol_a));
    let second = (String::from(file_b), String::from(symbol_b));

    match first.cmp(&second) {
        // Only swap when the `a` side sorts strictly after the `b` side.
        std::cmp::Ordering::Greater => (second, first),
        _ => (first, second),
    }
}
44
/// Computes the cosine similarity of two `f32` vectors as an `f64`.
///
/// The dot product and both squared norms are accumulated in `f32`; the
/// values are widened to `f64` only for the square roots and the final
/// division. Returns `0.0` when either vector has zero norm (including empty
/// input).
///
/// # Panics
///
/// In debug builds, panics if the slices differ in length. In release builds
/// the assertion is compiled out and only the common prefix is used, because
/// the elements are paired with `zip`.
pub fn cosine_similarity(a: &[f32], b: &[f32]) -> f64 {
    debug_assert_eq!(a.len(), b.len());

    // Single pass, accumulating left-to-right in f32.
    let (dot, sq_a, sq_b) = a.iter().zip(b).fold(
        (0.0f32, 0.0f32, 0.0f32),
        |(dot, sq_a, sq_b), (&x, &y)| (dot + x * y, sq_a + x * x, sq_b + y * y),
    );

    let len_a = f64::from(sq_a).sqrt();
    let len_b = f64::from(sq_b).sqrt();

    // A zero-length vector has no direction; report no similarity.
    if len_a == 0.0 || len_b == 0.0 {
        return 0.0;
    }

    f64::from(dot) / (len_a * len_b)
}
69
70#[derive(Debug, Clone, Serialize)]
71pub struct DuplicatePair {
72 pub symbol_a: String,
73 pub symbol_b: String,
74 pub file_a: String,
75 pub file_b: String,
76 pub line_a: usize,
77 pub line_b: usize,
78 pub similarity: f64,
79}
80
81#[derive(Debug, Clone, Serialize)]
82pub struct CategoryScore {
83 pub category: String,
84 pub score: f64,
85}
86
87#[derive(Debug, Clone, Serialize)]
88pub struct OutlierSymbol {
89 pub file_path: String,
90 pub symbol_name: String,
91 pub kind: String,
92 pub line: usize,
93 pub avg_similarity_to_file: f64,
94}
95
/// Serializes an embedding into a flat byte buffer.
///
/// Each `f32` is written as its 4 little-endian bytes, in input order, so the
/// result has length `embedding.len() * 4`.
pub fn embedding_to_bytes(embedding: &[f32]) -> Vec<u8> {
    let mut bytes = Vec::with_capacity(embedding.len() * std::mem::size_of::<f32>());
    for value in embedding {
        bytes.extend_from_slice(&value.to_le_bytes());
    }
    bytes
}