manifoldb_vector/distance/
binary.rs

1//! Distance functions for binary vectors.
2//!
3//! This module provides efficient distance calculations for bit-packed binary vectors
4//! using hardware popcount instructions.
5//!
6//! # Hamming Distance
7//!
8//! Hamming distance counts the number of positions where corresponding bits differ.
9//! It's computed efficiently using XOR (to find differing bits) followed by popcount
10//! (to count them).
11//!
12//! Modern CPUs provide hardware popcount instructions:
13//! - x86/x86_64: POPCNT instruction (SSE4.2+)
14//! - ARM: CNT instruction (NEON)
15//!
16//! Rust's `u64::count_ones()` automatically uses these intrinsics when available.
17
18use crate::types::BinaryEmbedding;
19
20/// Calculate the Hamming distance between two binary embeddings.
21///
22/// Hamming distance is the number of positions where corresponding bits differ.
23/// It's computed as: popcount(a XOR b)
24///
25/// # Panics
26///
27/// Debug-panics if embeddings have different dimensions.
28///
29/// # Example
30///
31/// ```
32/// use manifoldb_vector::types::BinaryEmbedding;
33/// use manifoldb_vector::distance::binary::hamming_distance;
34///
35/// let a = BinaryEmbedding::new(vec![0b1111_0000u64], 8).unwrap();
36/// let b = BinaryEmbedding::new(vec![0b1010_1010u64], 8).unwrap();
37///
38/// // Bits that differ: positions 1, 3, 4, 6 = 4 differences
39/// assert_eq!(hamming_distance(&a, &b), 4);
40/// ```
41#[inline]
42#[must_use]
43pub fn hamming_distance(a: &BinaryEmbedding, b: &BinaryEmbedding) -> u32 {
44    debug_assert_eq!(a.dimension(), b.dimension(), "binary embeddings must have same dimension");
45
46    a.data().iter().zip(b.data().iter()).map(|(&x, &y)| (x ^ y).count_ones()).sum()
47}
48
49/// Calculate the Hamming distance between two raw u64 slices.
50///
51/// This is a lower-level function that operates directly on bit-packed data.
52/// For most use cases, prefer [`hamming_distance`] which works with [`BinaryEmbedding`].
53///
54/// # Panics
55///
56/// Debug-panics if slices have different lengths.
57#[inline]
58#[must_use]
59pub fn hamming_distance_raw(a: &[u64], b: &[u64]) -> u32 {
60    debug_assert_eq!(a.len(), b.len(), "bit vectors must have same length");
61
62    a.iter().zip(b.iter()).map(|(&x, &y)| (x ^ y).count_ones()).sum()
63}
64
65/// Calculate the normalized Hamming distance (Hamming distance / dimension).
66///
67/// Returns a value in [0.0, 1.0] where:
68/// - 0.0 means identical
69/// - 1.0 means completely opposite (all bits differ)
70///
71/// # Panics
72///
73/// Debug-panics if embeddings have different dimensions.
74#[inline]
75#[must_use]
76pub fn hamming_distance_normalized(a: &BinaryEmbedding, b: &BinaryEmbedding) -> f32 {
77    let distance = hamming_distance(a, b);
78    distance as f32 / a.dimension() as f32
79}
80
81/// Calculate the Jaccard similarity between two binary embeddings.
82///
83/// Jaccard similarity = |A ∩ B| / |A ∪ B| = popcount(a AND b) / popcount(a OR b)
84///
85/// Returns a value in [0.0, 1.0] where:
86/// - 1.0 means identical (same bits set)
87/// - 0.0 means completely disjoint (no common bits)
88///
89/// Returns 1.0 if both embeddings are all zeros (by convention).
90///
91/// # Panics
92///
93/// Debug-panics if embeddings have different dimensions.
94#[inline]
95#[must_use]
96pub fn jaccard_similarity(a: &BinaryEmbedding, b: &BinaryEmbedding) -> f32 {
97    debug_assert_eq!(a.dimension(), b.dimension(), "binary embeddings must have same dimension");
98
99    let mut intersection: u32 = 0;
100    let mut union: u32 = 0;
101
102    for (&x, &y) in a.data().iter().zip(b.data().iter()) {
103        intersection += (x & y).count_ones();
104        union += (x | y).count_ones();
105    }
106
107    if union == 0 {
108        1.0 // Both are all zeros - consider them identical
109    } else {
110        intersection as f32 / union as f32
111    }
112}
113
114/// Calculate the Jaccard distance between two binary embeddings.
115///
116/// Jaccard distance = 1 - Jaccard similarity
117///
118/// # Panics
119///
120/// Debug-panics if embeddings have different dimensions.
121#[inline]
122#[must_use]
123pub fn jaccard_distance(a: &BinaryEmbedding, b: &BinaryEmbedding) -> f32 {
124    1.0 - jaccard_similarity(a, b)
125}
126
127/// Distance metric for comparing binary vectors.
128#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
129pub enum BinaryDistanceMetric {
130    /// Hamming distance (count of differing bits).
131    Hamming,
132    /// Normalized Hamming distance (Hamming / dimension), range [0, 1].
133    HammingNormalized,
134    /// Jaccard distance (1 - Jaccard similarity), range [0, 1].
135    Jaccard,
136}
137
138impl BinaryDistanceMetric {
139    /// Calculate the distance between two binary embeddings using this metric.
140    ///
141    /// # Panics
142    ///
143    /// Debug-panics if embeddings have different dimensions.
144    #[inline]
145    #[must_use]
146    pub fn calculate(&self, a: &BinaryEmbedding, b: &BinaryEmbedding) -> f32 {
147        match self {
148            Self::Hamming => hamming_distance(a, b) as f32,
149            Self::HammingNormalized => hamming_distance_normalized(a, b),
150            Self::Jaccard => jaccard_distance(a, b),
151        }
152    }
153}
154
155#[cfg(test)]
156mod tests {
157    use super::*;
158
159    const EPSILON: f32 = 1e-6;
160
161    fn assert_near(a: f32, b: f32, epsilon: f32) {
162        assert!(
163            (a - b).abs() < epsilon,
164            "assertion failed: {} !~ {} (diff: {})",
165            a,
166            b,
167            (a - b).abs()
168        );
169    }
170
171    #[test]
172    fn test_hamming_distance_identical() {
173        let a = BinaryEmbedding::new(vec![0xFFFF_FFFF_FFFF_FFFFu64], 64).unwrap();
174        let b = BinaryEmbedding::new(vec![0xFFFF_FFFF_FFFF_FFFFu64], 64).unwrap();
175        assert_eq!(hamming_distance(&a, &b), 0);
176    }
177
178    #[test]
179    fn test_hamming_distance_opposite() {
180        let a = BinaryEmbedding::new(vec![0x0000_0000_0000_0000u64], 64).unwrap();
181        let b = BinaryEmbedding::new(vec![0xFFFF_FFFF_FFFF_FFFFu64], 64).unwrap();
182        assert_eq!(hamming_distance(&a, &b), 64);
183    }
184
185    #[test]
186    fn test_hamming_distance_half() {
187        let a = BinaryEmbedding::new(vec![0xFFFF_FFFF_0000_0000u64], 64).unwrap();
188        let b = BinaryEmbedding::new(vec![0x0000_0000_FFFF_FFFFu64], 64).unwrap();
189        assert_eq!(hamming_distance(&a, &b), 64);
190    }
191
192    #[test]
193    fn test_hamming_distance_one_bit() {
194        let a = BinaryEmbedding::new(vec![0b0000_0001u64], 8).unwrap();
195        let b = BinaryEmbedding::new(vec![0b0000_0000u64], 8).unwrap();
196        assert_eq!(hamming_distance(&a, &b), 1);
197    }
198
199    #[test]
200    fn test_hamming_distance_multi_word() {
201        let a = BinaryEmbedding::new(vec![0xFFFF_FFFF_FFFF_FFFFu64, 0x0000_0000_0000_0000u64], 128)
202            .unwrap();
203        let b = BinaryEmbedding::new(vec![0x0000_0000_0000_0000u64, 0xFFFF_FFFF_FFFF_FFFFu64], 128)
204            .unwrap();
205        assert_eq!(hamming_distance(&a, &b), 128);
206    }
207
208    #[test]
209    fn test_hamming_distance_raw() {
210        let a = [0xFFFF_0000u64, 0x0000_FFFFu64];
211        let b = [0x0000_FFFFu64, 0xFFFF_0000u64];
212        assert_eq!(hamming_distance_raw(&a, &b), 64);
213    }
214
215    #[test]
216    fn test_hamming_distance_normalized() {
217        let a = BinaryEmbedding::zeros(100).unwrap();
218        let b = BinaryEmbedding::ones(100).unwrap();
219        assert_near(hamming_distance_normalized(&a, &b), 1.0, EPSILON);
220
221        let c = BinaryEmbedding::zeros(100).unwrap();
222        assert_near(hamming_distance_normalized(&a, &c), 0.0, EPSILON);
223    }
224
225    #[test]
226    fn test_jaccard_similarity_identical() {
227        let a = BinaryEmbedding::new(vec![0b1111_0000u64], 8).unwrap();
228        let b = BinaryEmbedding::new(vec![0b1111_0000u64], 8).unwrap();
229        assert_near(jaccard_similarity(&a, &b), 1.0, EPSILON);
230    }
231
232    #[test]
233    fn test_jaccard_similarity_disjoint() {
234        let a = BinaryEmbedding::new(vec![0b1111_0000u64], 8).unwrap();
235        let b = BinaryEmbedding::new(vec![0b0000_1111u64], 8).unwrap();
236        // No intersection, union = 8 bits
237        assert_near(jaccard_similarity(&a, &b), 0.0, EPSILON);
238    }
239
240    #[test]
241    fn test_jaccard_similarity_partial_overlap() {
242        let a = BinaryEmbedding::new(vec![0b1111_1100u64], 8).unwrap();
243        let b = BinaryEmbedding::new(vec![0b0011_1111u64], 8).unwrap();
244        // Intersection: 4 bits (middle 4)
245        // Union: 8 bits
246        assert_near(jaccard_similarity(&a, &b), 4.0 / 8.0, EPSILON);
247    }
248
249    #[test]
250    fn test_jaccard_similarity_both_zero() {
251        let a = BinaryEmbedding::zeros(64).unwrap();
252        let b = BinaryEmbedding::zeros(64).unwrap();
253        assert_near(jaccard_similarity(&a, &b), 1.0, EPSILON);
254    }
255
256    #[test]
257    fn test_jaccard_distance() {
258        let a = BinaryEmbedding::new(vec![0b1111_0000u64], 8).unwrap();
259        let b = BinaryEmbedding::new(vec![0b1111_0000u64], 8).unwrap();
260        assert_near(jaccard_distance(&a, &b), 0.0, EPSILON);
261    }
262
263    #[test]
264    fn test_binary_distance_metric_calculate() {
265        let a = BinaryEmbedding::new(vec![0b1111_0000u64], 8).unwrap();
266        let b = BinaryEmbedding::new(vec![0b1010_1010u64], 8).unwrap();
267
268        // Hamming: 4 bits differ
269        assert_near(BinaryDistanceMetric::Hamming.calculate(&a, &b), 4.0, EPSILON);
270
271        // Normalized: 4/8 = 0.5
272        assert_near(BinaryDistanceMetric::HammingNormalized.calculate(&a, &b), 0.5, EPSILON);
273    }
274}