Skip to main content

grafeo_core/index/vector/
distance.rs

1//! Distance metrics for vector similarity search.
2//!
3//! Provides efficient computation of various distance metrics between vectors.
4//! All functions expect vectors of equal length.
5//!
6//! # SIMD Acceleration
7//!
8//! This module automatically uses SIMD instructions when available:
9//! - **AVX2** (x86_64): 8 floats per instruction, ~6x speedup
10//! - **SSE** (x86_64): 4 floats per instruction, ~3x speedup
11//! - **NEON** (aarch64): 4 floats per instruction, ~3x speedup
12//!
13//! Use [`simd_support`] to check which instruction set is being used.
14
15use serde::{Deserialize, Serialize};
16
17use super::simd;
18
19/// Distance metric for vector similarity computation.
20///
21/// Different metrics are suited for different embedding types:
22/// - **Cosine**: Best for normalized embeddings (most text embeddings)
23/// - **Euclidean**: Best for raw embeddings where magnitude matters
24/// - **DotProduct**: Best for maximum inner product search
25/// - **Manhattan**: Alternative to Euclidean, less sensitive to outliers
26#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default, Serialize, Deserialize)]
27pub enum DistanceMetric {
28    /// Cosine distance: 1 - cosine_similarity.
29    ///
30    /// Range: [0, 2], where 0 = identical direction, 2 = opposite direction.
31    /// Best for normalized embeddings (most text/sentence embeddings).
32    #[default]
33    Cosine,
34
35    /// Euclidean (L2) distance: `sqrt(sum((a[i] - b[i])^2))`.
36    ///
37    /// Range: [0, infinity), where 0 = identical vectors.
38    /// Best when magnitude matters.
39    Euclidean,
40
41    /// Negative dot product: `-sum(a[i] * b[i])`.
42    ///
43    /// Returns negative so that smaller = more similar (for min-heap).
44    /// Best for maximum inner product search (MIPS).
45    DotProduct,
46
47    /// Manhattan (L1) distance: `sum(|a[i] - b[i]|)`.
48    ///
49    /// Range: [0, infinity), where 0 = identical vectors.
50    /// Less sensitive to outliers than Euclidean.
51    Manhattan,
52}
53
54impl DistanceMetric {
55    /// Returns the name of the metric as a string.
56    #[must_use]
57    pub const fn name(&self) -> &'static str {
58        match self {
59            Self::Cosine => "cosine",
60            Self::Euclidean => "euclidean",
61            Self::DotProduct => "dot_product",
62            Self::Manhattan => "manhattan",
63        }
64    }
65
66    /// Parses a metric from a string (case-insensitive).
67    ///
68    /// # Examples
69    ///
70    /// ```
71    /// use grafeo_core::index::vector::DistanceMetric;
72    ///
73    /// assert_eq!(DistanceMetric::from_str("cosine"), Some(DistanceMetric::Cosine));
74    /// assert_eq!(DistanceMetric::from_str("EUCLIDEAN"), Some(DistanceMetric::Euclidean));
75    /// assert_eq!(DistanceMetric::from_str("l2"), Some(DistanceMetric::Euclidean));
76    /// assert_eq!(DistanceMetric::from_str("invalid"), None);
77    /// ```
78    #[must_use]
79    pub fn from_str(s: &str) -> Option<Self> {
80        match s.to_lowercase().as_str() {
81            "cosine" | "cos" => Some(Self::Cosine),
82            "euclidean" | "l2" | "euclid" => Some(Self::Euclidean),
83            "dot_product" | "dotproduct" | "dot" | "inner_product" | "ip" => Some(Self::DotProduct),
84            "manhattan" | "l1" | "taxicab" => Some(Self::Manhattan),
85            _ => None,
86        }
87    }
88}
89
90/// Returns the active SIMD instruction set name.
91///
92/// Useful for diagnostics and performance tuning.
93///
94/// # Returns
95///
96/// One of: `"avx2"`, `"sse"`, `"neon"`, or `"scalar"`.
97///
98/// # Examples
99///
100/// ```
101/// use grafeo_core::index::vector::simd_support;
102///
103/// let support = simd_support();
104/// println!("Using SIMD: {}", support);
105/// ```
106#[must_use]
107#[inline]
108pub fn simd_support() -> &'static str {
109    simd::simd_support()
110}
111
112/// Computes the distance between two vectors using the specified metric.
113///
114/// This function automatically uses SIMD acceleration when available,
115/// providing 3-6x speedup on modern CPUs.
116///
117/// # Panics
118///
119/// Debug-asserts that vectors have equal length. In release builds,
120/// mismatched lengths may cause incorrect results.
121///
122/// # Examples
123///
124/// ```
125/// use grafeo_core::index::vector::{compute_distance, DistanceMetric};
126///
127/// let a = [1.0f32, 0.0, 0.0];
128/// let b = [0.0f32, 1.0, 0.0];
129///
130/// // Cosine distance between orthogonal vectors = 1.0
131/// let dist = compute_distance(&a, &b, DistanceMetric::Cosine);
132/// assert!((dist - 1.0).abs() < 0.001);
133///
134/// // Euclidean distance = sqrt(2)
135/// let dist = compute_distance(&a, &b, DistanceMetric::Euclidean);
136/// assert!((dist - 1.414).abs() < 0.01);
137/// ```
138#[inline]
139pub fn compute_distance(a: &[f32], b: &[f32], metric: DistanceMetric) -> f32 {
140    simd::compute_distance_simd(a, b, metric)
141}
142
143/// Computes cosine distance: 1 - cosine_similarity.
144///
145/// Cosine similarity = dot(a, b) / (||a|| * ||b||)
146/// Cosine distance = 1 - cosine_similarity
147///
148/// Range: [0, 2] where 0 = same direction, 1 = orthogonal, 2 = opposite.
149///
150/// Uses SIMD acceleration when available.
151#[inline]
152pub fn cosine_distance(a: &[f32], b: &[f32]) -> f32 {
153    simd::cosine_distance_simd(a, b)
154}
155
156/// Computes cosine similarity: dot(a, b) / (||a|| * ||b||).
157///
158/// Range: [-1, 1] where 1 = same direction, 0 = orthogonal, -1 = opposite.
159#[inline]
160pub fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
161    1.0 - cosine_distance(a, b)
162}
163
164/// Computes Euclidean (L2) distance: `sqrt(sum((a[i] - b[i])^2))`.
165///
166/// Uses SIMD acceleration when available.
167#[inline]
168pub fn euclidean_distance(a: &[f32], b: &[f32]) -> f32 {
169    simd::euclidean_distance_simd(a, b)
170}
171
172/// Computes squared Euclidean distance: `sum((a[i] - b[i])^2)`.
173///
174/// Use this when you only need to compare distances (avoids sqrt).
175/// Uses SIMD acceleration when available.
176#[inline]
177pub fn euclidean_distance_squared(a: &[f32], b: &[f32]) -> f32 {
178    simd::euclidean_distance_squared_simd(a, b)
179}
180
181/// Computes dot product: `sum(a[i] * b[i])`.
182///
183/// Uses SIMD acceleration when available.
184#[inline]
185pub fn dot_product(a: &[f32], b: &[f32]) -> f32 {
186    simd::dot_product_simd(a, b)
187}
188
189/// Computes Manhattan (L1) distance: `sum(|a[i] - b[i]|)`.
190///
191/// Uses SIMD acceleration when available.
192#[inline]
193pub fn manhattan_distance(a: &[f32], b: &[f32]) -> f32 {
194    simd::manhattan_distance_simd(a, b)
195}
196
197/// Normalizes a vector to unit length (L2 norm = 1).
198///
199/// Returns the original magnitude. If magnitude is zero, returns 0.0
200/// and leaves the vector unchanged.
201#[inline]
202pub fn normalize(v: &mut [f32]) -> f32 {
203    let mut norm = 0.0f32;
204    for &x in v.iter() {
205        norm += x * x;
206    }
207    let norm = norm.sqrt();
208
209    if norm > f32::EPSILON {
210        for x in v.iter_mut() {
211            *x /= norm;
212        }
213    }
214
215    norm
216}
217
218/// Computes the L2 norm (magnitude) of a vector.
219#[inline]
220pub fn l2_norm(v: &[f32]) -> f32 {
221    let mut sum = 0.0f32;
222    for &x in v {
223        sum += x * x;
224    }
225    sum.sqrt()
226}
227
228#[cfg(test)]
229mod tests {
230    use super::*;
231
232    const EPSILON: f32 = 1e-5;
233
234    fn approx_eq(a: f32, b: f32) -> bool {
235        (a - b).abs() < EPSILON
236    }
237
238    #[test]
239    fn test_cosine_distance_identical() {
240        let a = [1.0f32, 2.0, 3.0];
241        let b = [1.0f32, 2.0, 3.0];
242        assert!(approx_eq(cosine_distance(&a, &b), 0.0));
243    }
244
245    #[test]
246    fn test_cosine_distance_orthogonal() {
247        let a = [1.0f32, 0.0, 0.0];
248        let b = [0.0f32, 1.0, 0.0];
249        assert!(approx_eq(cosine_distance(&a, &b), 1.0));
250    }
251
252    #[test]
253    fn test_cosine_distance_opposite() {
254        let a = [1.0f32, 0.0, 0.0];
255        let b = [-1.0f32, 0.0, 0.0];
256        assert!(approx_eq(cosine_distance(&a, &b), 2.0));
257    }
258
259    #[test]
260    fn test_euclidean_distance_identical() {
261        let a = [1.0f32, 2.0, 3.0];
262        let b = [1.0f32, 2.0, 3.0];
263        assert!(approx_eq(euclidean_distance(&a, &b), 0.0));
264    }
265
266    #[test]
267    fn test_euclidean_distance_unit_vectors() {
268        let a = [1.0f32, 0.0, 0.0];
269        let b = [0.0f32, 1.0, 0.0];
270        assert!(approx_eq(euclidean_distance(&a, &b), 2.0f32.sqrt()));
271    }
272
273    #[test]
274    fn test_euclidean_distance_3_4_5() {
275        let a = [0.0f32, 0.0];
276        let b = [3.0f32, 4.0];
277        assert!(approx_eq(euclidean_distance(&a, &b), 5.0));
278    }
279
280    #[test]
281    fn test_dot_product() {
282        let a = [1.0f32, 2.0, 3.0];
283        let b = [4.0f32, 5.0, 6.0];
284        // 1*4 + 2*5 + 3*6 = 4 + 10 + 18 = 32
285        assert!(approx_eq(dot_product(&a, &b), 32.0));
286    }
287
288    #[test]
289    fn test_manhattan_distance() {
290        let a = [1.0f32, 2.0, 3.0];
291        let b = [4.0f32, 6.0, 3.0];
292        // |1-4| + |2-6| + |3-3| = 3 + 4 + 0 = 7
293        assert!(approx_eq(manhattan_distance(&a, &b), 7.0));
294    }
295
296    #[test]
297    fn test_normalize() {
298        let mut v = [3.0f32, 4.0];
299        let orig_norm = normalize(&mut v);
300        assert!(approx_eq(orig_norm, 5.0));
301        assert!(approx_eq(v[0], 0.6));
302        assert!(approx_eq(v[1], 0.8));
303        assert!(approx_eq(l2_norm(&v), 1.0));
304    }
305
306    #[test]
307    fn test_normalize_zero_vector() {
308        let mut v = [0.0f32, 0.0, 0.0];
309        let norm = normalize(&mut v);
310        assert!(approx_eq(norm, 0.0));
311        // Vector should remain unchanged
312        assert!(approx_eq(v[0], 0.0));
313    }
314
315    #[test]
316    fn test_compute_distance_dispatch() {
317        let a = [1.0f32, 0.0];
318        let b = [0.0f32, 1.0];
319
320        let cos = compute_distance(&a, &b, DistanceMetric::Cosine);
321        let euc = compute_distance(&a, &b, DistanceMetric::Euclidean);
322        let man = compute_distance(&a, &b, DistanceMetric::Manhattan);
323
324        assert!(approx_eq(cos, 1.0)); // Orthogonal
325        assert!(approx_eq(euc, 2.0f32.sqrt()));
326        assert!(approx_eq(man, 2.0));
327    }
328
329    #[test]
330    fn test_metric_from_str() {
331        assert_eq!(
332            DistanceMetric::from_str("cosine"),
333            Some(DistanceMetric::Cosine)
334        );
335        assert_eq!(
336            DistanceMetric::from_str("COSINE"),
337            Some(DistanceMetric::Cosine)
338        );
339        assert_eq!(
340            DistanceMetric::from_str("cos"),
341            Some(DistanceMetric::Cosine)
342        );
343
344        assert_eq!(
345            DistanceMetric::from_str("euclidean"),
346            Some(DistanceMetric::Euclidean)
347        );
348        assert_eq!(
349            DistanceMetric::from_str("l2"),
350            Some(DistanceMetric::Euclidean)
351        );
352
353        assert_eq!(
354            DistanceMetric::from_str("dot_product"),
355            Some(DistanceMetric::DotProduct)
356        );
357        assert_eq!(
358            DistanceMetric::from_str("ip"),
359            Some(DistanceMetric::DotProduct)
360        );
361
362        assert_eq!(
363            DistanceMetric::from_str("manhattan"),
364            Some(DistanceMetric::Manhattan)
365        );
366        assert_eq!(
367            DistanceMetric::from_str("l1"),
368            Some(DistanceMetric::Manhattan)
369        );
370
371        assert_eq!(DistanceMetric::from_str("invalid"), None);
372    }
373
374    #[test]
375    fn test_metric_name() {
376        assert_eq!(DistanceMetric::Cosine.name(), "cosine");
377        assert_eq!(DistanceMetric::Euclidean.name(), "euclidean");
378        assert_eq!(DistanceMetric::DotProduct.name(), "dot_product");
379        assert_eq!(DistanceMetric::Manhattan.name(), "manhattan");
380    }
381
382    #[test]
383    fn test_high_dimensional() {
384        // Test with 384-dim vectors (common embedding size)
385        let a: Vec<f32> = (0..384).map(|i| (i as f32) / 384.0).collect();
386        let b: Vec<f32> = (0..384).map(|i| ((383 - i) as f32) / 384.0).collect();
387
388        let cos = cosine_distance(&a, &b);
389        let euc = euclidean_distance(&a, &b);
390
391        // Just verify they produce reasonable values
392        assert!((0.0..=2.0).contains(&cos));
393        assert!(euc >= 0.0);
394    }
395
396    // ── Edge case tests ─────────────────────────────────────────────
397
398    #[test]
399    fn test_single_dimension() {
400        let a = [5.0f32];
401        let b = [3.0f32];
402        assert!(approx_eq(euclidean_distance(&a, &b), 2.0));
403        assert!(approx_eq(manhattan_distance(&a, &b), 2.0));
404    }
405
406    #[test]
407    fn test_zero_vectors_euclidean() {
408        let a = [0.0f32, 0.0, 0.0];
409        let b = [0.0f32, 0.0, 0.0];
410        assert!(approx_eq(euclidean_distance(&a, &b), 0.0));
411    }
412
413    #[test]
414    fn test_zero_vectors_cosine() {
415        let a = [0.0f32, 0.0, 0.0];
416        let b = [0.0f32, 0.0, 0.0];
417        let d = cosine_distance(&a, &b);
418        // Zero vectors have undefined cosine; should not panic
419        assert!(!d.is_nan() || d.is_nan()); // Just verify no panic
420    }
421
422    #[test]
423    fn test_one_zero_vector_cosine() {
424        let a = [1.0f32, 0.0, 0.0];
425        let b = [0.0f32, 0.0, 0.0];
426        let d = cosine_distance(&a, &b);
427        // Should not panic; result depends on implementation
428        assert!(d.is_finite() || d.is_nan());
429    }
430
431    #[test]
432    fn test_identical_vectors_all_metrics() {
433        let v = [0.5f32, -0.3, 0.8, 1.2];
434        assert!(approx_eq(cosine_distance(&v, &v), 0.0));
435        assert!(approx_eq(euclidean_distance(&v, &v), 0.0));
436        assert!(approx_eq(manhattan_distance(&v, &v), 0.0));
437    }
438
439    #[test]
440    fn test_negative_values() {
441        let a = [-1.0f32, -2.0, -3.0];
442        let b = [-1.0f32, -2.0, -3.0];
443        assert!(approx_eq(cosine_distance(&a, &b), 0.0));
444        assert!(approx_eq(euclidean_distance(&a, &b), 0.0));
445    }
446
447    #[test]
448    fn test_dot_product_orthogonal() {
449        let a = [1.0f32, 0.0];
450        let b = [0.0f32, 1.0];
451        assert!(approx_eq(dot_product(&a, &b), 0.0));
452    }
453
454    #[test]
455    fn test_dot_product_negative() {
456        let a = [1.0f32, 0.0];
457        let b = [-1.0f32, 0.0];
458        assert!(approx_eq(dot_product(&a, &b), -1.0));
459    }
460
461    #[test]
462    fn test_manhattan_single_axis_diff() {
463        let a = [0.0f32, 0.0, 0.0];
464        let b = [0.0f32, 5.0, 0.0];
465        assert!(approx_eq(manhattan_distance(&a, &b), 5.0));
466    }
467
468    #[test]
469    fn test_cosine_similarity_range() {
470        // Cosine similarity should be in [-1, 1], distance in [0, 2]
471        let a = [0.3f32, 0.7, -0.2];
472        let b = [0.6f32, -0.1, 0.9];
473        let d = cosine_distance(&a, &b);
474        assert!((0.0 - EPSILON..=2.0 + EPSILON).contains(&d));
475    }
476
477    #[test]
478    fn test_normalize_already_normalized() {
479        let mut v = [0.6f32, 0.8]; // Already unit length
480        let norm = normalize(&mut v);
481        assert!(approx_eq(norm, 1.0));
482        assert!(approx_eq(l2_norm(&v), 1.0));
483    }
484
485    #[test]
486    fn test_normalize_single_element() {
487        let mut v = [7.0f32];
488        normalize(&mut v);
489        assert!(approx_eq(v[0], 1.0));
490    }
491
492    #[test]
493    fn test_large_values() {
494        let a = [1e10f32, 1e10, 1e10];
495        let b = [1e10f32, 1e10, 1e10];
496        assert!(approx_eq(euclidean_distance(&a, &b), 0.0));
497        assert!(approx_eq(cosine_distance(&a, &b), 0.0));
498    }
499
500    #[test]
501    fn test_very_small_values() {
502        let a = [1e-10f32, 1e-10];
503        let b = [1e-10f32, 1e-10];
504        assert!(approx_eq(euclidean_distance(&a, &b), 0.0));
505    }
506
507    #[test]
508    fn test_compute_distance_dot_product() {
509        let a = [1.0f32, 2.0, 3.0];
510        let b = [4.0f32, 5.0, 6.0];
511        let d = compute_distance(&a, &b, DistanceMetric::DotProduct);
512        // dot_product returns negative for sorting: -32.0
513        assert!(approx_eq(d, -32.0));
514    }
515}