manifoldb_vector/distance/
sparse.rs

1//! Sparse vector distance functions.
2//!
3//! This module provides distance and similarity functions optimized for
4//! sparse vectors, where only non-zero elements are stored.
5//!
6//! Sparse vectors are represented as sorted `&[(u32, f32)]` slices where
7//! each pair is `(index, value)`.
8
9/// Calculate the dot product between two sparse vectors.
10///
11/// Both vectors must be sorted by index in ascending order.
12/// Time complexity: O(n + m) where n and m are the number of non-zero elements.
13///
14/// # Example
15///
16/// ```
17/// use manifoldb_vector::distance::sparse::sparse_dot_product;
18///
19/// let a = [(0, 1.0), (2, 2.0), (5, 3.0)];
20/// let b = [(1, 1.0), (2, 2.0), (5, 1.0)];
21/// // Only indices 2 and 5 overlap: 2*2 + 3*1 = 7
22/// assert!((sparse_dot_product(&a, &b) - 7.0).abs() < 1e-6);
23/// ```
24#[inline]
25#[must_use]
26pub fn sparse_dot_product(a: &[(u32, f32)], b: &[(u32, f32)]) -> f32 {
27    let mut result = 0.0;
28    let mut i = 0;
29    let mut j = 0;
30
31    while i < a.len() && j < b.len() {
32        let (idx_a, val_a) = a[i];
33        let (idx_b, val_b) = b[j];
34
35        match idx_a.cmp(&idx_b) {
36            std::cmp::Ordering::Less => i += 1,
37            std::cmp::Ordering::Greater => j += 1,
38            std::cmp::Ordering::Equal => {
39                result += val_a * val_b;
40                i += 1;
41                j += 1;
42            }
43        }
44    }
45
46    result
47}
48
49/// Calculate the sum of squares (squared L2 norm) of a sparse vector.
50#[inline]
51#[must_use]
52pub fn sparse_sum_of_squares(v: &[(u32, f32)]) -> f32 {
53    v.iter().map(|&(_, val)| val * val).sum()
54}
55
56/// Calculate the L2 norm (magnitude) of a sparse vector.
57#[inline]
58#[must_use]
59pub fn sparse_l2_norm(v: &[(u32, f32)]) -> f32 {
60    sparse_sum_of_squares(v).sqrt()
61}
62
63/// Calculate the cosine similarity between two sparse vectors.
64///
65/// Returns a value in the range [-1, 1] where:
66/// - 1 means identical direction
67/// - 0 means orthogonal (no common indices or zero dot product)
68/// - -1 means opposite direction
69///
70/// Returns 0.0 if either vector has zero magnitude.
71#[inline]
72#[must_use]
73pub fn sparse_cosine_similarity(a: &[(u32, f32)], b: &[(u32, f32)]) -> f32 {
74    let dot = sparse_dot_product(a, b);
75    let norm_a = sparse_l2_norm(a);
76    let norm_b = sparse_l2_norm(b);
77
78    if norm_a == 0.0 || norm_b == 0.0 {
79        return 0.0;
80    }
81
82    dot / (norm_a * norm_b)
83}
84
85/// Calculate the cosine similarity using pre-computed norms.
86///
87/// Returns `None` if either norm is zero.
88#[inline]
89#[must_use]
90pub fn sparse_cosine_similarity_with_norms(
91    a: &[(u32, f32)],
92    b: &[(u32, f32)],
93    norm_a: f32,
94    norm_b: f32,
95) -> Option<f32> {
96    if norm_a == 0.0 || norm_b == 0.0 {
97        return None;
98    }
99
100    let dot = sparse_dot_product(a, b);
101    Some(dot / (norm_a * norm_b))
102}
103
104/// Calculate the cosine distance between two sparse vectors.
105///
106/// Cosine distance = 1 - cosine_similarity, returning a value in [0, 2].
107#[inline]
108#[must_use]
109pub fn sparse_cosine_distance(a: &[(u32, f32)], b: &[(u32, f32)]) -> f32 {
110    1.0 - sparse_cosine_similarity(a, b)
111}
112
113/// Calculate the squared Euclidean (L2) distance between two sparse vectors.
114///
115/// For sparse vectors, this is computed as:
116/// ||a - b||^2 = ||a||^2 + ||b||^2 - 2 * (a . b)
117///
118/// This avoids the sqrt operation for cases where only relative distances matter.
119#[inline]
120#[must_use]
121pub fn sparse_euclidean_distance_squared(a: &[(u32, f32)], b: &[(u32, f32)]) -> f32 {
122    let norm_a_sq = sparse_sum_of_squares(a);
123    let norm_b_sq = sparse_sum_of_squares(b);
124    let dot = sparse_dot_product(a, b);
125
126    // ||a - b||^2 = ||a||^2 + ||b||^2 - 2(a . b)
127    (norm_a_sq + norm_b_sq - 2.0 * dot).max(0.0)
128}
129
130/// Calculate the Euclidean (L2) distance between two sparse vectors.
131#[inline]
132#[must_use]
133pub fn sparse_euclidean_distance(a: &[(u32, f32)], b: &[(u32, f32)]) -> f32 {
134    sparse_euclidean_distance_squared(a, b).sqrt()
135}
136
137/// Pre-computed L2 norm for efficient repeated sparse cosine similarity calculations.
138#[derive(Debug, Clone, Copy)]
139pub struct SparseCachedNorm {
140    /// The squared L2 norm (sum of squares)
141    norm_squared: f32,
142    /// The L2 norm (square root of sum of squares)
143    norm: f32,
144}
145
146impl SparseCachedNorm {
147    /// Compute and cache the L2 norm of a sparse vector.
148    #[must_use]
149    pub fn new(v: &[(u32, f32)]) -> Self {
150        let norm_squared = sparse_sum_of_squares(v);
151        let norm = norm_squared.sqrt();
152        Self { norm_squared, norm }
153    }
154
155    /// Get the cached L2 norm.
156    #[inline]
157    #[must_use]
158    pub const fn norm(&self) -> f32 {
159        self.norm
160    }
161
162    /// Get the cached squared L2 norm.
163    #[inline]
164    #[must_use]
165    pub const fn norm_squared(&self) -> f32 {
166        self.norm_squared
167    }
168
169    /// Check if the vector has zero magnitude.
170    #[inline]
171    #[must_use]
172    pub fn is_zero(&self) -> bool {
173        self.norm == 0.0
174    }
175}
176
177/// Sparse distance metric for comparing sparse vectors.
178#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
179pub enum SparseDistanceMetric {
180    /// Euclidean (L2) distance.
181    Euclidean,
182    /// Cosine distance (1 - cosine similarity).
183    Cosine,
184    /// Dot product (negative, for max similarity).
185    DotProduct,
186}
187
188impl SparseDistanceMetric {
189    /// Calculate the distance between two sparse vectors using this metric.
190    #[inline]
191    #[must_use]
192    pub fn calculate(&self, a: &[(u32, f32)], b: &[(u32, f32)]) -> f32 {
193        match self {
194            Self::Euclidean => sparse_euclidean_distance(a, b),
195            Self::Cosine => sparse_cosine_distance(a, b),
196            Self::DotProduct => -sparse_dot_product(a, b),
197        }
198    }
199
200    /// Calculate distance using cached norms (more efficient for repeated queries).
201    #[inline]
202    #[must_use]
203    pub fn calculate_with_norms(
204        &self,
205        a: &[(u32, f32)],
206        b: &[(u32, f32)],
207        norm_a: f32,
208        norm_b: f32,
209    ) -> f32 {
210        match self {
211            Self::Euclidean => {
212                // ||a - b||^2 = ||a||^2 + ||b||^2 - 2(a . b)
213                let norm_a_sq = norm_a * norm_a;
214                let norm_b_sq = norm_b * norm_b;
215                let dot = sparse_dot_product(a, b);
216                (norm_a_sq + norm_b_sq - 2.0 * dot).max(0.0).sqrt()
217            }
218            Self::Cosine => {
219                sparse_cosine_similarity_with_norms(a, b, norm_a, norm_b).map_or(1.0, |s| 1.0 - s)
220            }
221            Self::DotProduct => -sparse_dot_product(a, b),
222        }
223    }
224}
225
226#[cfg(test)]
227mod tests {
228    use super::*;
229
230    const EPSILON: f32 = 1e-5;
231
232    fn assert_near(a: f32, b: f32, epsilon: f32) {
233        assert!(
234            (a - b).abs() < epsilon,
235            "assertion failed: {} !~ {} (diff: {})",
236            a,
237            b,
238            (a - b).abs()
239        );
240    }
241
242    #[test]
243    fn test_sparse_dot_product_no_overlap() {
244        let a = [(0, 1.0), (2, 2.0)];
245        let b = [(1, 1.0), (3, 1.0)];
246        assert_near(sparse_dot_product(&a, &b), 0.0, EPSILON);
247    }
248
249    #[test]
250    fn test_sparse_dot_product_full_overlap() {
251        let a = [(0, 1.0), (2, 2.0), (5, 3.0)];
252        let b = [(0, 1.0), (2, 2.0), (5, 3.0)];
253        // 1*1 + 2*2 + 3*3 = 1 + 4 + 9 = 14
254        assert_near(sparse_dot_product(&a, &b), 14.0, EPSILON);
255    }
256
257    #[test]
258    fn test_sparse_dot_product_partial_overlap() {
259        let a = [(0, 1.0), (2, 2.0), (5, 3.0)];
260        let b = [(1, 1.0), (2, 2.0), (5, 1.0)];
261        // Only indices 2 and 5 overlap: 2*2 + 3*1 = 7
262        assert_near(sparse_dot_product(&a, &b), 7.0, EPSILON);
263    }
264
265    #[test]
266    fn test_sparse_dot_product_empty() {
267        let a: [(u32, f32); 0] = [];
268        let b = [(0, 1.0)];
269        assert_near(sparse_dot_product(&a, &b), 0.0, EPSILON);
270    }
271
272    #[test]
273    fn test_sparse_l2_norm() {
274        let v = [(0, 3.0), (5, 4.0)];
275        assert_near(sparse_l2_norm(&v), 5.0, EPSILON);
276    }
277
278    #[test]
279    fn test_sparse_cosine_similarity_identical() {
280        let a = [(0, 1.0), (1, 2.0)];
281        assert_near(sparse_cosine_similarity(&a, &a), 1.0, EPSILON);
282    }
283
284    #[test]
285    fn test_sparse_cosine_similarity_orthogonal() {
286        let a = [(0, 1.0)];
287        let b = [(1, 1.0)];
288        assert_near(sparse_cosine_similarity(&a, &b), 0.0, EPSILON);
289    }
290
291    #[test]
292    fn test_sparse_cosine_similarity_opposite() {
293        let a = [(0, 1.0)];
294        let b = [(0, -1.0)];
295        assert_near(sparse_cosine_similarity(&a, &b), -1.0, EPSILON);
296    }
297
298    #[test]
299    fn test_sparse_cosine_distance() {
300        let a = [(0, 1.0), (1, 2.0)];
301        assert_near(sparse_cosine_distance(&a, &a), 0.0, EPSILON);
302
303        let b = [(2, 1.0)];
304        assert_near(sparse_cosine_distance(&a, &b), 1.0, EPSILON);
305    }
306
307    #[test]
308    fn test_sparse_euclidean_distance() {
309        let a = [(0, 0.0)];
310        let b = [(0, 3.0), (1, 4.0)];
311        assert_near(sparse_euclidean_distance(&a, &b), 5.0, EPSILON);
312    }
313
314    #[test]
315    fn test_sparse_euclidean_distance_no_overlap() {
316        // a = [3, 0, 0], b = [0, 4, 0]
317        let a = [(0, 3.0)];
318        let b = [(1, 4.0)];
319        // ||a - b||^2 = 9 + 16 = 25
320        assert_near(sparse_euclidean_distance(&a, &b), 5.0, EPSILON);
321    }
322
323    #[test]
324    fn test_sparse_cached_norm() {
325        let v = [(0, 3.0), (5, 4.0)];
326        let cached = SparseCachedNorm::new(&v);
327
328        assert_near(cached.norm(), 5.0, EPSILON);
329        assert_near(cached.norm_squared(), 25.0, EPSILON);
330        assert!(!cached.is_zero());
331    }
332
333    #[test]
334    fn test_sparse_cached_norm_empty() {
335        let v: [(u32, f32); 0] = [];
336        let cached = SparseCachedNorm::new(&v);
337        assert!(cached.is_zero());
338    }
339
340    #[test]
341    fn test_sparse_distance_metric() {
342        let a = [(0, 3.0), (1, 4.0)];
343        let b = [(0, 3.0), (1, 4.0)];
344
345        assert_near(SparseDistanceMetric::Euclidean.calculate(&a, &b), 0.0, EPSILON);
346        assert_near(SparseDistanceMetric::Cosine.calculate(&a, &b), 0.0, EPSILON);
347        assert_near(SparseDistanceMetric::DotProduct.calculate(&a, &b), -25.0, EPSILON);
348    }
349
350    #[test]
351    fn test_sparse_cosine_with_norms() {
352        let a = [(0, 3.0), (1, 4.0)];
353        let b = [(0, 3.0), (1, 4.0)];
354        let norm = sparse_l2_norm(&a);
355
356        let sim = sparse_cosine_similarity_with_norms(&a, &b, norm, norm);
357        assert!(sim.is_some());
358        assert_near(sim.unwrap(), 1.0, EPSILON);
359    }
360
361    #[test]
362    fn test_sparse_cosine_with_zero_norm() {
363        let a: [(u32, f32); 0] = [];
364        let b = [(0, 1.0)];
365
366        let sim = sparse_cosine_similarity_with_norms(&a, &b, 0.0, 1.0);
367        assert!(sim.is_none());
368    }
369}