canon_core/
embedding.rs
1//! Embedding node representing a vector derived from a chunk
2//!
3//! Stores the vector in i16 format (quantized) for determinism and storage efficiency.
4//! Per CP-010: unit vectors are scaled by 32767, quantized with round_ties_even.
5//! Per CP-003 §4.3: canonical operations use integer math.
6
7use serde::{Deserialize, Serialize};
8use uuid::Uuid;
9
/// An embedding vector derived from a chunk
///
/// Stores the vector in i16 format (quantized) for determinism and storage efficiency.
/// The embedding ID is derived deterministically via BLAKE3-16 from chunk_id + model_hash,
/// so regenerating the same chunk with the same model yields the same ID.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Embedding {
    /// Unique identifier for this embedding (BLAKE3-16 of chunk_id || model_hash)
    pub id: Uuid,

    /// Parent chunk ID
    pub chunk_id: Uuid,

    /// The embedding vector (i16 quantized, scale = 32767)
    pub vector: Vec<i16>,

    /// Hash of the model weights used to generate this embedding
    pub model_hash: [u8; 32],

    /// Dimensionality of the vector (== vector.len(), cached for storage/lookup)
    pub dim: u16,

    /// Precomputed L2 norm of the quantized vector (for similarity computation)
    /// Per CP-001: stored for efficient cosine similarity without recomputation
    pub l2_norm: f32,

    /// Version of the embedding generation process (default 0)
    pub embedding_version: u32,
}
38
39impl Embedding {
40    /// Create a new embedding from an f32 vector.
41    ///
42    /// Per CP-010 §3.4-3.5:
43    /// 1. Normalize f32 vector to unit length
44    /// 2. Quantize to i16 with round_ties_even (scale = 32767)
45    ///
46    /// The embedding ID is deterministic: BLAKE3-16(chunk_id || model_hash || embedding_version).
47    pub fn new(chunk_id: Uuid, vector_f32: &[f32], model_hash: [u8; 32], embedding_version: u32) -> Self {
48        // 1. Normalize f32 vector to unit length
49        let normalized = normalize_l2(vector_f32);
50
51        // 2. Quantize to i16 (scale by 32767, round_ties_even per CP-003 §4.3)
52        let quantized: Vec<i16> = normalized
53            .iter()
54            .map(|&v| quantize_f32_to_i16(v))
55            .collect();
56
57        let dim = quantized.len() as u16;
58
59        // Compute L2 norm of quantized vector
60        let l2_norm = compute_l2_norm(&quantized);
61
62        // Deterministic ID: BLAKE3-16(chunk_id || model_hash || embedding_version)
63        let id_bytes = crate::id::generate_composite_id(&[
64            chunk_id.as_bytes(),
65            &model_hash,
66            &embedding_version.to_le_bytes(),
67        ]);
68        let id = Uuid::from_bytes(id_bytes);
69
70        Self {
71            id,
72            chunk_id,
73            vector: quantized,
74            model_hash,
75            dim,
76            l2_norm,
77            embedding_version,
78        }
79    }
80
81    /// Create an embedding directly from pre-quantized i16 values.
82    ///
83    /// Used when loading from storage where quantization already occurred.
84    pub fn from_quantized(
85        chunk_id: Uuid,
86        vector: Vec<i16>,
87        model_hash: [u8; 32],
88        embedding_version: u32,
89    ) -> Self {
90        let dim = vector.len() as u16;
91        let l2_norm = compute_l2_norm(&vector);
92        let id_bytes = crate::id::generate_composite_id(&[
93            chunk_id.as_bytes(),
94            &model_hash,
95            &embedding_version.to_le_bytes(),
96        ]);
97        let id = Uuid::from_bytes(id_bytes);
98
99        Self {
100            id,
101            chunk_id,
102            vector,
103            model_hash,
104            dim,
105            l2_norm,
106            embedding_version,
107        }
108    }
109
110    /// Create an embedding from pre-quantized values with a precomputed L2 norm.
111    ///
112    /// Used when loading from storage where the norm was already stored.
113    pub fn from_quantized_with_norm(
114        chunk_id: Uuid,
115        vector: Vec<i16>,
116        model_hash: [u8; 32],
117        l2_norm: f32,
118        embedding_version: u32,
119    ) -> Self {
120        let dim = vector.len() as u16;
121        let id_bytes = crate::id::generate_composite_id(&[
122            chunk_id.as_bytes(),
123            &model_hash,
124            &embedding_version.to_le_bytes(),
125        ]);
126        let id = Uuid::from_bytes(id_bytes);
127
128        Self {
129            id,
130            chunk_id,
131            vector,
132            model_hash,
133            dim,
134            l2_norm,
135            embedding_version,
136        }
137    }
138
139    /// Convert the quantized vector back to f32 (approximate).
140    pub fn to_f32(&self) -> Vec<f32> {
141        self.vector.iter().map(|&v| v as f32 / 32767.0).collect()
142    }
143
144    /// Compute integer dot product between this embedding and another i16 vector.
145    ///
146    /// Per CP-003 §4.5: all similarity computations use integer math.
147    /// Returns i64 to avoid overflow (384 dims * 32767^2 fits in i64).
148    pub fn integer_dot_product(&self, other: &[i16]) -> i64 {
149        if self.vector.len() != other.len() {
150            return 0;
151        }
152
153        self.vector
154            .iter()
155            .zip(other.iter())
156            .map(|(&a, &b)| (a as i64) * (b as i64))
157            .sum()
158    }
159
160    /// Compute the squared L2 norm of the quantized vector (integer).
161    ///
162    /// This avoids sqrt and floating-point entirely.
163    pub fn norm_squared(&self) -> i64 {
164        self.vector
165            .iter()
166            .map(|&v| (v as i64) * (v as i64))
167            .sum()
168    }
169
170    /// Compute L2 norm as f32 (for display/diagnostics only, not canonical).
171    pub fn norm_f32(&self) -> f32 {
172        (self.norm_squared() as f64).sqrt() as f32
173    }
174
175    /// Compute cosine similarity using integer math.
176    ///
177    /// Returns f32 for convenience, but the dot product and norms are
178    /// computed entirely in integer arithmetic first.
179    pub fn cosine_similarity(&self, other: &Embedding) -> f32 {
180        if self.vector.len() != other.vector.len() {
181            return 0.0;
182        }
183
184        let dot = self.integer_dot_product(&other.vector);
185        let norm_a = self.norm_squared();
186        let norm_b = other.norm_squared();
187
188        if norm_a == 0 || norm_b == 0 {
189            return 0.0;
190        }
191
192        // Final division: integer results → f32
193        // dot / sqrt(norm_a * norm_b)
194        let denom = ((norm_a as f64) * (norm_b as f64)).sqrt();
195        (dot as f64 / denom) as f32
196    }
197}
198
/// Scale a vector so its Euclidean length is 1.0.
///
/// A zero vector (norm == 0) cannot be normalized and is returned unchanged.
fn normalize_l2(vector: &[f32]) -> Vec<f32> {
    let sum_of_squares: f32 = vector.iter().map(|&x| x * x).sum();
    let norm = sum_of_squares.sqrt();
    if norm == 0.0 {
        vector.to_vec()
    } else {
        vector.iter().map(|&x| x / norm).collect()
    }
}
207
/// Euclidean (L2) norm of a quantized i16 vector.
///
/// The sum of squares is accumulated in i64 to avoid overflow, and the
/// square root is taken in f64 before narrowing to f32.
fn compute_l2_norm(vector: &[i16]) -> f32 {
    let mut sum_sq: i64 = 0;
    for &component in vector {
        let c = i64::from(component);
        sum_sq += c * c;
    }
    (sum_sq as f64).sqrt() as f32
}
213
/// Map an f32 in [-1.0, 1.0] onto an i16 in [-32767, 32767].
///
/// Per CP-003 §4.3: uses round_ties_even for deterministic rounding.
/// Dead-zone: magnitudes below 1e-7 collapse to exactly 0.
fn quantize_f32_to_i16(val: f32) -> i16 {
    // Dead-zone per CP-003 §4.3: treat near-zero noise as zero.
    if val.abs() < 1e-7 {
        0
    } else {
        (val * 32767.0)
            .round_ties_even()
            .clamp(-32767.0, 32767.0) as i16
    }
}
227
/// Identity-based equality: embeddings compare equal when their identifying
/// metadata matches. The `vector` and `l2_norm` fields are deliberately not
/// compared.
/// NOTE(review): this assumes the deterministic ID scheme (chunk_id ||
/// model_hash || embedding_version) guarantees identical metadata implies
/// identical vectors — confirm callers never construct two embeddings with
/// the same metadata but different vectors.
impl PartialEq for Embedding {
    fn eq(&self, other: &Self) -> bool {
        self.id == other.id
            && self.chunk_id == other.chunk_id
            && self.model_hash == other.model_hash
            && self.dim == other.dim
            && self.embedding_version == other.embedding_version
    }
}

// `Eq` is sound here because `eq` compares only integer/byte fields; the
// f32 `l2_norm` (which could be NaN and break reflexivity) is excluded.
impl Eq for Embedding {}
239
#[cfg(test)]
mod tests {
    use super::*;

    // The ID must come from the project's composite BLAKE3-16 scheme, not a
    // UUID v5 namespace hash.
    #[test]
    fn test_embedding_id_is_blake3_not_uuid_v5() {
        let chunk_id = Uuid::from_bytes([42u8; 16]);
        let model_hash = [1u8; 32];
        let vector = vec![1.0, 0.0];

        let emb = Embedding::new(chunk_id, &vector, model_hash, 0);

        // Verify ID matches BLAKE3-16(chunk_id || model_hash || embedding_version)
        let expected = crate::id::generate_composite_id(&[
            chunk_id.as_bytes(),
            &model_hash,
            &0u32.to_le_bytes(),
        ]);
        assert_eq!(emb.id.as_bytes(), &expected);
    }

    // A unit input vector should quantize to full scale on its nonzero axis.
    #[test]
    fn test_embedding_creation_quantized() {
        let chunk_id = Uuid::from_bytes([0u8; 16]);
        // [1.0, 0.0] -> Normalized [1.0, 0.0] -> Quantized [32767, 0]
        let vector = vec![1.0, 0.0];
        let model_hash = [0u8; 32];

        let emb = Embedding::new(chunk_id, &vector, model_hash, 0);

        assert_eq!(emb.vector[0], 32767);
        assert_eq!(emb.vector[1], 0);
        assert!((emb.norm_f32() - 32767.0).abs() < 1.0);
    }

    // Pins the CP-003 §4.3 rounding mode: ties go to the even integer.
    #[test]
    fn test_quantize_round_ties_even() {
        // 0.5 * 32767 = 16383.5 → should round to 16384 (even)
        let result = quantize_f32_to_i16(0.5);
        // round_ties_even(16383.5) = 16384
        assert_eq!(result, 16384);
    }

    // Values with magnitude below 1e-7 must collapse to exactly 0.
    #[test]
    fn test_quantize_dead_zone() {
        assert_eq!(quantize_f32_to_i16(0.0), 0);
        assert_eq!(quantize_f32_to_i16(1e-8), 0);  // Below dead-zone threshold
        assert_eq!(quantize_f32_to_i16(-1e-8), 0); // Below dead-zone threshold
    }

    // Canonical similarity math is pure integer arithmetic (CP-003 §4.5).
    #[test]
    fn test_integer_dot_product() {
        let chunk_id = Uuid::from_bytes([0u8; 16]);
        let model_hash = [0u8; 32];

        let emb = Embedding::from_quantized(chunk_id, vec![100, 200, 300], model_hash, 0);
        let other = vec![1i16, 2, 3];

        // 100*1 + 200*2 + 300*3 = 100 + 400 + 900 = 1400
        assert_eq!(emb.integer_dot_product(&other), 1400);
    }

    // Parallel, orthogonal, and anti-parallel vectors hit +1, 0, and -1.
    #[test]
    fn test_cosine_similarity() {
        let chunk_id = Uuid::from_bytes([0u8; 16]);
        let model_hash = [0u8; 32];

        let emb1 = Embedding::new(chunk_id, &[1.0, 0.0], model_hash, 0);
        let emb2 = Embedding::new(chunk_id, &[1.0, 0.0], model_hash, 0);
        let emb3 = Embedding::new(chunk_id, &[0.0, 1.0], model_hash, 0); // Orthogonal
        let emb4 = Embedding::new(chunk_id, &[-1.0, 0.0], model_hash, 0); // Opposite

        assert!((emb1.cosine_similarity(&emb2) - 1.0).abs() < 0.01);
        assert!(emb1.cosine_similarity(&emb3).abs() < 0.01);
        assert!((emb1.cosine_similarity(&emb4) + 1.0).abs() < 0.01);
    }

    // Same inputs must always produce the same ID (no randomness/time).
    #[test]
    fn test_embedding_id_determinism() {
        let chunk_id = Uuid::from_bytes([42u8; 16]);
        let model_hash = [7u8; 32];
        let vector = vec![0.5, -0.3, 0.8];

        let emb1 = Embedding::new(chunk_id, &vector, model_hash, 0);
        let emb2 = Embedding::new(chunk_id, &vector, model_hash, 0);
        assert_eq!(emb1.id, emb2.id);
    }

    // The storage-load constructor must keep the vector untouched and set dim.
    #[test]
    fn test_from_quantized() {
        let chunk_id = Uuid::from_bytes([0u8; 16]);
        let model_hash = [0u8; 32];
        let vec = vec![32767i16, 0, -32767];

        let emb = Embedding::from_quantized(chunk_id, vec.clone(), model_hash, 0);
        assert_eq!(emb.vector, vec);
        assert_eq!(emb.dim, 3);
    }

    // l2_norm is populated at construction time (CP-001 precompute).
    #[test]
    fn test_embedding_l2_norm_computed() {
        let chunk_id = Uuid::from_bytes([0u8; 16]);
        let model_hash = [0u8; 32];

        // Non-zero vector should have positive norm
        let vector = vec![0.5, 0.5, 0.5, 0.5];
        let emb = Embedding::new(chunk_id, &vector, model_hash, 0);
        assert!(emb.l2_norm > 0.0, "l2_norm should be positive for non-zero vectors");

        // For a unit vector quantized to 32767, norm should be close to 32767
        let unit_vec = vec![1.0, 0.0];
        let emb2 = Embedding::new(chunk_id, &unit_vec, model_hash, 0);
        assert!((emb2.l2_norm - 32767.0).abs() < 1.0);
    }

    // from_quantized recomputes the norm from the raw i16 components.
    #[test]
    fn test_l2_norm_from_quantized() {
        let chunk_id = Uuid::from_bytes([0u8; 16]);
        let model_hash = [0u8; 32];
        let vec = vec![100i16, 200, 300];

        let emb = Embedding::from_quantized(chunk_id, vec.clone(), model_hash, 0);

        // Manual calculation: sqrt(100^2 + 200^2 + 300^2) = sqrt(10000 + 40000 + 90000) = sqrt(140000)
        let expected = (140000.0_f64).sqrt() as f32;
        assert!((emb.l2_norm - expected).abs() < 0.01);
    }

    // from_quantized_with_norm must trust the caller's norm verbatim.
    #[test]
    fn test_l2_norm_with_precomputed() {
        let chunk_id = Uuid::from_bytes([0u8; 16]);
        let model_hash = [0u8; 32];
        let vec = vec![100i16, 200, 300];
        let precomputed_norm = 374.17;

        let emb = Embedding::from_quantized_with_norm(chunk_id, vec, model_hash, precomputed_norm, 0);
        assert_eq!(emb.l2_norm, precomputed_norm);
    }
}