manifoldb_vector/distance/
mod.rs

1//! Distance functions for vector similarity.
2//!
3//! This module provides both SIMD-optimized and scalar implementations of
4//! distance functions for vector similarity search.
5//!
6//! # Dense Vectors
7//!
8//! When the `simd` feature is enabled (default), this module uses the `wide` crate
9//! for portable SIMD operations that work across:
10//! - x86/x86_64: SSE2, SSE4.1, AVX, AVX2
11//! - ARM: NEON
12//! - WebAssembly: SIMD128
13//!
14//! The SIMD implementations process 8 floats at a time using `f32x8` vectors,
15//! with a scalar fallback for the remainder.
16//!
17//! # Distance Metrics
18//!
19//! - **Euclidean (L2)**: Standard Euclidean distance
20//! - **Cosine**: Angular distance (1 - cosine similarity)
21//! - **Dot Product**: Inner product (negated for distance)
22//! - **Manhattan (L1)**: Sum of absolute differences
23//! - **Chebyshev (Lāˆž)**: Maximum absolute difference
24//!
25//! # Sparse Vectors
26//!
27//! The [`sparse`] module provides efficient distance calculations for sparse vectors,
28//! represented as sorted `(index, value)` pairs. These are optimized for vectors
29//! with few non-zero elements, such as SPLADE embeddings.
30//!
31//! # Binary Vectors
32//!
33//! The [`binary`] module provides efficient distance calculations for bit-packed
34//! binary vectors, using hardware popcount instructions for Hamming distance.
35//!
36//! # Performance
37//!
38//! For 1536-dimensional vectors (e.g., OpenAI embeddings), the SIMD implementations
39//! typically achieve 5-10x speedup compared to scalar implementations.
40//!
41//! # Features
42//!
43//! - `simd` (default): Enable SIMD-optimized distance calculations
44//! - `scalar`: Force scalar implementations (useful for debugging)
45
46pub mod binary;
47
48#[cfg(not(feature = "scalar"))]
49mod simd;
50
51#[cfg(feature = "scalar")]
52mod scalar;
53
54pub mod sparse;
55
56// Re-export the appropriate implementation
57#[cfg(not(feature = "scalar"))]
58pub use simd::{
59    chebyshev_distance, cosine_distance, cosine_similarity, cosine_similarity_with_norms,
60    dot_product, euclidean_distance, euclidean_distance_squared, l2_norm, manhattan_distance,
61    sum_of_squares, CachedNorm,
62};
63
64#[cfg(feature = "scalar")]
65pub use scalar::{
66    chebyshev_distance, cosine_distance, cosine_similarity, cosine_similarity_with_norms,
67    dot_product, euclidean_distance, euclidean_distance_squared, l2_norm, manhattan_distance,
68    sum_of_squares, CachedNorm,
69};
70
71/// Distance metric for comparing vectors.
72#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
73pub enum DistanceMetric {
74    /// Euclidean (L2) distance.
75    Euclidean,
76    /// Cosine distance (1 - cosine similarity).
77    Cosine,
78    /// Dot product (negative, for max similarity).
79    DotProduct,
80    /// Manhattan (L1) distance - sum of absolute differences.
81    Manhattan,
82    /// Chebyshev (Lāˆž) distance - maximum absolute difference.
83    Chebyshev,
84}
85
86impl DistanceMetric {
87    /// Calculate the distance between two vectors using this metric.
88    #[inline]
89    #[must_use]
90    pub fn calculate(&self, a: &[f32], b: &[f32]) -> f32 {
91        match self {
92            Self::Euclidean => euclidean_distance(a, b),
93            Self::Cosine => cosine_distance(a, b),
94            Self::DotProduct => -dot_product(a, b),
95            Self::Manhattan => manhattan_distance(a, b),
96            Self::Chebyshev => chebyshev_distance(a, b),
97        }
98    }
99
100    /// Calculate distance using cached norms (more efficient for repeated queries).
101    ///
102    /// Note: Norms are only used for Cosine distance. For other metrics, the norms
103    /// are ignored and the standard calculation is performed.
104    #[inline]
105    #[must_use]
106    pub fn calculate_with_norms(&self, a: &[f32], b: &[f32], norm_a: f32, norm_b: f32) -> f32 {
107        match self {
108            Self::Euclidean => euclidean_distance(a, b),
109            Self::Cosine => {
110                cosine_similarity_with_norms(a, b, norm_a, norm_b).map_or(1.0, |s| 1.0 - s)
111            }
112            Self::DotProduct => -dot_product(a, b),
113            Self::Manhattan => manhattan_distance(a, b),
114            Self::Chebyshev => chebyshev_distance(a, b),
115        }
116    }
117}
118
119#[cfg(test)]
120mod tests {
121    use super::*;
122
123    const EPSILON: f32 = 1e-5;
124
125    fn assert_near(a: f32, b: f32, epsilon: f32) {
126        assert!(
127            (a - b).abs() < epsilon,
128            "assertion failed: {} !~ {} (diff: {})",
129            a,
130            b,
131            (a - b).abs()
132        );
133    }
134
135    #[test]
136    fn test_euclidean_distance() {
137        let a = [0.0, 0.0];
138        let b = [3.0, 4.0];
139        assert_near(euclidean_distance(&a, &b), 5.0, EPSILON);
140    }
141
142    #[test]
143    fn test_euclidean_distance_squared() {
144        let a = [0.0, 0.0];
145        let b = [3.0, 4.0];
146        assert_near(euclidean_distance_squared(&a, &b), 25.0, EPSILON);
147    }
148
149    #[test]
150    fn test_euclidean_distance_large() {
151        // Test with 1536-dim vectors (OpenAI embedding size)
152        let a: Vec<f32> = (0..1536).map(|i| i as f32 * 0.001).collect();
153        let b: Vec<f32> = (0..1536).map(|i| (i + 1) as f32 * 0.001).collect();
154
155        let dist = euclidean_distance(&a, &b);
156        // All differences are 0.001, so squared sum = 1536 * 0.000001 = 0.001536
157        // sqrt(0.001536) ā‰ˆ 0.0392
158        assert!(dist > 0.039 && dist < 0.040, "Expected ~0.0392, got {}", dist);
159    }
160
161    #[test]
162    fn test_cosine_similarity_identical() {
163        let a = [1.0, 0.0];
164        let b = [1.0, 0.0];
165        assert_near(cosine_similarity(&a, &b), 1.0, EPSILON);
166    }
167
168    #[test]
169    fn test_cosine_similarity_orthogonal() {
170        let c = [1.0, 0.0];
171        let d = [0.0, 1.0];
172        assert_near(cosine_similarity(&c, &d), 0.0, EPSILON);
173    }
174
175    #[test]
176    fn test_cosine_similarity_opposite() {
177        let a = [1.0, 0.0];
178        let b = [-1.0, 0.0];
179        assert_near(cosine_similarity(&a, &b), -1.0, EPSILON);
180    }
181
182    #[test]
183    fn test_cosine_distance() {
184        let a = [1.0, 0.0];
185        let b = [1.0, 0.0];
186        assert_near(cosine_distance(&a, &b), 0.0, EPSILON);
187
188        let c = [1.0, 0.0];
189        let d = [0.0, 1.0];
190        assert_near(cosine_distance(&c, &d), 1.0, EPSILON);
191    }
192
193    #[test]
194    fn test_dot_product() {
195        let a = [1.0, 2.0, 3.0];
196        let b = [4.0, 5.0, 6.0];
197        assert_near(dot_product(&a, &b), 32.0, EPSILON);
198    }
199
200    #[test]
201    fn test_dot_product_large() {
202        // Test with 1536-dim vectors
203        let a: Vec<f32> = (0..1536).map(|i| 1.0 / (i + 1) as f32).collect();
204        let b: Vec<f32> = (0..1536).map(|i| (i + 1) as f32).collect();
205
206        let dot = dot_product(&a, &b);
207        // Sum of 1/(i+1) * (i+1) = sum of 1s = 1536
208        assert_near(dot, 1536.0, EPSILON);
209    }
210
211    #[test]
212    fn test_distance_metric_calculate() {
213        let a = [3.0, 4.0];
214        let b = [0.0, 0.0];
215
216        assert_near(DistanceMetric::Euclidean.calculate(&a, &b), 5.0, EPSILON);
217
218        // Cosine distance for [3,4] and [0,0] - zero vector handling
219        let c = [1.0, 0.0];
220        let d = [1.0, 0.0];
221        assert_near(DistanceMetric::Cosine.calculate(&c, &d), 0.0, EPSILON);
222
223        assert_near(DistanceMetric::DotProduct.calculate(&a, &b), 0.0, EPSILON);
224
225        // Manhattan distance: |3-0| + |4-0| = 7
226        assert_near(DistanceMetric::Manhattan.calculate(&a, &b), 7.0, EPSILON);
227
228        // Chebyshev distance: max(|3-0|, |4-0|) = 4
229        assert_near(DistanceMetric::Chebyshev.calculate(&a, &b), 4.0, EPSILON);
230    }
231
232    #[test]
233    fn test_manhattan_distance_metric() {
234        let a = [1.0, 2.0, 3.0];
235        let b = [4.0, 6.0, 9.0];
236        // |1-4| + |2-6| + |3-9| = 3 + 4 + 6 = 13
237        assert_near(DistanceMetric::Manhattan.calculate(&a, &b), 13.0, EPSILON);
238    }
239
240    #[test]
241    fn test_chebyshev_distance_metric() {
242        let a = [1.0, 2.0, 3.0];
243        let b = [4.0, 6.0, 9.0];
244        // max(|1-4|, |2-6|, |3-9|) = max(3, 4, 6) = 6
245        assert_near(DistanceMetric::Chebyshev.calculate(&a, &b), 6.0, EPSILON);
246    }
247
248    #[test]
249    fn test_cached_norm() {
250        let v = [3.0, 4.0];
251        let cached = CachedNorm::new(&v);
252
253        assert_near(cached.norm(), 5.0, EPSILON);
254        assert_near(cached.norm_squared(), 25.0, EPSILON);
255    }
256
257    #[test]
258    fn test_cosine_similarity_with_norms() {
259        let a = [1.0, 0.0];
260        let b = [1.0, 0.0];
261        let norm_a = 1.0;
262        let norm_b = 1.0;
263
264        let sim = cosine_similarity_with_norms(&a, &b, norm_a, norm_b);
265        assert!(sim.is_some());
266        assert_near(sim.unwrap(), 1.0, EPSILON);
267    }
268
269    #[test]
270    fn test_cosine_similarity_with_zero_norm() {
271        let a = [0.0, 0.0];
272        let b = [1.0, 0.0];
273
274        let sim = cosine_similarity_with_norms(&a, &b, 0.0, 1.0);
275        assert!(sim.is_none());
276    }
277}