// oxirs_vec/advanced_metrics.rs

//! Advanced distance and similarity metrics for vectors.
//!
//! This module provides statistical and domain-specific metrics including:
//! - Pearson correlation coefficient
//! - Spearman rank correlation
//! - Jaccard similarity for binary vectors
//! - Hamming distance for binary vectors
//! - Jensen-Shannon divergence
//! - Earth Mover's Distance (EMD)
//! - Wasserstein distance
//! - KL divergence
//! - Hellinger distance
//! - Mahalanobis distance
15use anyhow::Result;
16
17use crate::{Vector, VectorData};
18
19impl Vector {
20    /// Calculate Pearson correlation coefficient
21    pub fn pearson_correlation(&self, other: &Vector) -> Result<f32> {
22        if self.dimensions != other.dimensions {
23            return Err(anyhow::anyhow!("Vector dimensions must match"));
24        }
25
26        let x = self.as_f32();
27        let y = other.as_f32();
28        let n = x.len() as f32;
29
30        let sum_x: f32 = x.iter().sum();
31        let sum_y: f32 = y.iter().sum();
32        let sum_x2: f32 = x.iter().map(|v| v * v).sum();
33        let sum_y2: f32 = y.iter().map(|v| v * v).sum();
34        let sum_xy: f32 = x.iter().zip(&y).map(|(a, b)| a * b).sum();
35
36        let numerator = n * sum_xy - sum_x * sum_y;
37        let denominator = ((n * sum_x2 - sum_x * sum_x) * (n * sum_y2 - sum_y * sum_y)).sqrt();
38
39        if denominator == 0.0 {
40            Ok(0.0)
41        } else {
42            Ok(numerator / denominator)
43        }
44    }
45
46    /// Calculate Spearman rank correlation
47    pub fn spearman_correlation(&self, other: &Vector) -> Result<f32> {
48        if self.dimensions != other.dimensions {
49            return Err(anyhow::anyhow!("Vector dimensions must match"));
50        }
51
52        let x = self.as_f32();
53        let y = other.as_f32();
54
55        // Convert to ranks
56        let x_ranks = to_ranks(&x);
57        let y_ranks = to_ranks(&y);
58
59        // Calculate Pearson correlation on ranks
60        let rank_vec_x = Vector::new(x_ranks);
61        let rank_vec_y = Vector::new(y_ranks);
62
63        rank_vec_x.pearson_correlation(&rank_vec_y)
64    }
65
66    /// Calculate Jaccard similarity for binary vectors
67    pub fn jaccard_similarity(&self, other: &Vector) -> Result<f32> {
68        if self.dimensions != other.dimensions {
69            return Err(anyhow::anyhow!("Vector dimensions must match"));
70        }
71
72        match (&self.values, &other.values) {
73            (VectorData::Binary(a), VectorData::Binary(b)) => {
74                let mut intersection = 0u32;
75                let mut union = 0u32;
76
77                for (byte_a, byte_b) in a.iter().zip(b) {
78                    let and_result = byte_a & byte_b;
79                    let or_result = byte_a | byte_b;
80
81                    intersection += and_result.count_ones();
82                    union += or_result.count_ones();
83                }
84
85                if union == 0 {
86                    Ok(0.0)
87                } else {
88                    Ok(intersection as f32 / union as f32)
89                }
90            }
91            _ => {
92                // Convert to binary using threshold of 0.5
93                let a_binary = Vector::to_binary(&self.as_f32(), 0.5);
94                let b_binary = Vector::to_binary(&other.as_f32(), 0.5);
95
96                let vec_a = Vector::binary(a_binary);
97                let vec_b = Vector::binary(b_binary);
98
99                vec_a.jaccard_similarity(&vec_b)
100            }
101        }
102    }
103
104    /// Calculate Hamming distance for binary vectors
105    pub fn hamming_distance(&self, other: &Vector) -> Result<u32> {
106        if self.dimensions != other.dimensions {
107            return Err(anyhow::anyhow!("Vector dimensions must match"));
108        }
109
110        match (&self.values, &other.values) {
111            (VectorData::Binary(a), VectorData::Binary(b)) => {
112                let mut distance = 0u32;
113
114                for (byte_a, byte_b) in a.iter().zip(b) {
115                    let xor_result = byte_a ^ byte_b;
116                    distance += xor_result.count_ones();
117                }
118
119                Ok(distance)
120            }
121            _ => {
122                // Convert to binary and calculate
123                let a_binary = Vector::to_binary(&self.as_f32(), 0.5);
124                let b_binary = Vector::to_binary(&other.as_f32(), 0.5);
125
126                let vec_a = Vector::binary(a_binary);
127                let vec_b = Vector::binary(b_binary);
128
129                vec_a.hamming_distance(&vec_b)
130            }
131        }
132    }
133
134    /// Calculate Jensen-Shannon divergence
135    pub fn jensen_shannon_divergence(&self, other: &Vector) -> Result<f32> {
136        if self.dimensions != other.dimensions {
137            return Err(anyhow::anyhow!("Vector dimensions must match"));
138        }
139
140        let p = normalize_to_probability(&self.as_f32())?;
141        let q = normalize_to_probability(&other.as_f32())?;
142
143        // Calculate average distribution
144        let m: Vec<f32> = p.iter().zip(&q).map(|(a, b)| (a + b) / 2.0).collect();
145
146        // Calculate KL divergences
147        let kl_pm = kl_divergence_raw(&p, &m)?;
148        let kl_qm = kl_divergence_raw(&q, &m)?;
149
150        Ok((kl_pm + kl_qm) / 2.0)
151    }
152
153    /// Calculate Kullback-Leibler divergence
154    pub fn kl_divergence(&self, other: &Vector) -> Result<f32> {
155        if self.dimensions != other.dimensions {
156            return Err(anyhow::anyhow!("Vector dimensions must match"));
157        }
158
159        let p = normalize_to_probability(&self.as_f32())?;
160        let q = normalize_to_probability(&other.as_f32())?;
161
162        kl_divergence_raw(&p, &q)
163    }
164
165    /// Calculate Hellinger distance
166    pub fn hellinger_distance(&self, other: &Vector) -> Result<f32> {
167        if self.dimensions != other.dimensions {
168            return Err(anyhow::anyhow!("Vector dimensions must match"));
169        }
170
171        let p = normalize_to_probability(&self.as_f32())?;
172        let q = normalize_to_probability(&other.as_f32())?;
173
174        let sum: f32 = p
175            .iter()
176            .zip(&q)
177            .map(|(a, b)| (a.sqrt() - b.sqrt()).powi(2))
178            .sum();
179
180        Ok((sum / 2.0).sqrt())
181    }
182
183    /// Calculate Earth Mover's Distance (1D approximation)
184    pub fn earth_movers_distance(&self, other: &Vector) -> Result<f32> {
185        if self.dimensions != other.dimensions {
186            return Err(anyhow::anyhow!("Vector dimensions must match"));
187        }
188
189        // Normalize to probability distributions
190        let p = normalize_to_probability(&self.as_f32())?;
191        let q = normalize_to_probability(&other.as_f32())?;
192
193        // Calculate cumulative distributions
194        let mut cdf_p = vec![0.0; p.len()];
195        let mut cdf_q = vec![0.0; q.len()];
196
197        cdf_p[0] = p[0];
198        cdf_q[0] = q[0];
199
200        for i in 1..p.len() {
201            cdf_p[i] = cdf_p[i - 1] + p[i];
202            cdf_q[i] = cdf_q[i - 1] + q[i];
203        }
204
205        // EMD is the L1 distance between CDFs
206        let emd: f32 = cdf_p.iter().zip(&cdf_q).map(|(a, b)| (a - b).abs()).sum();
207
208        Ok(emd)
209    }
210
211    /// Calculate Wasserstein distance (1-Wasserstein, same as EMD for 1D)
212    pub fn wasserstein_distance(&self, other: &Vector) -> Result<f32> {
213        self.earth_movers_distance(other)
214    }
215
216    /// Calculate Mahalanobis distance (simplified version without covariance matrix)
217    /// This version assumes diagonal covariance (independent dimensions)
218    pub fn mahalanobis_distance(&self, other: &Vector, variance: &[f32]) -> Result<f32> {
219        if self.dimensions != other.dimensions || self.dimensions != variance.len() {
220            return Err(anyhow::anyhow!("Dimensions must match"));
221        }
222
223        let x = self.as_f32();
224        let y = other.as_f32();
225
226        let distance_sq: f32 = x
227            .iter()
228            .zip(&y)
229            .zip(variance)
230            .map(|((a, b), &var)| {
231                if var > 0.0 {
232                    (a - b).powi(2) / var
233                } else {
234                    0.0
235                }
236            })
237            .sum();
238
239        Ok(distance_sq.sqrt())
240    }
241}
242
/// Convert values to fractional ranks (1-based); tied values receive the
/// average of the ranks they span, e.g. `[1.0, 2.0, 2.0, 3.0]` →
/// `[1.0, 2.5, 2.5, 4.0]`.
fn to_ranks(values: &[f32]) -> Vec<f32> {
    let mut indexed: Vec<(usize, f32)> = values.iter().copied().enumerate().collect();

    // `total_cmp` is a strict total order (IEEE 754 totalOrder), so the sort
    // is well-defined even if NaN values slip in — unlike the previous
    // `partial_cmp(..).unwrap_or(Equal)` comparator, which is inconsistent
    // in the presence of NaN. Ordinary finite values sort identically.
    indexed.sort_by(|a, b| a.1.total_cmp(&b.1));

    let mut ranks = vec![0.0; values.len()];
    let mut i = 0;

    while i < indexed.len() {
        // Advance j past the run of values tied with indexed[i].
        let mut j = i;
        while j < indexed.len() && indexed[j].1 == indexed[i].1 {
            j += 1;
        }

        // 1-based ranks i+1..=j average to (i + j) / 2 + 0.5.
        let avg_rank = (i + j) as f32 / 2.0 + 0.5;
        for k in i..j {
            ranks[indexed[k].0] = avg_rank;
        }

        i = j;
    }

    ranks
}
270
271/// Normalize vector to probability distribution
272fn normalize_to_probability(values: &[f32]) -> Result<Vec<f32>> {
273    // Ensure non-negative values
274    let min_val = values.iter().fold(f32::INFINITY, |a, &b| a.min(b));
275    let shifted: Vec<f32> = if min_val < 0.0 {
276        values.iter().map(|v| v - min_val).collect()
277    } else {
278        values.to_vec()
279    };
280
281    let sum: f32 = shifted.iter().sum();
282
283    if sum == 0.0 {
284        // Uniform distribution if all zeros
285        Ok(vec![1.0 / values.len() as f32; values.len()])
286    } else {
287        Ok(shifted.iter().map(|v| v / sum).collect())
288    }
289}
290
291/// Calculate KL divergence between two probability distributions
292fn kl_divergence_raw(p: &[f32], q: &[f32]) -> Result<f32> {
293    if p.len() != q.len() {
294        return Err(anyhow::anyhow!("Distributions must have same length"));
295    }
296
297    let mut kl = 0.0;
298    for (pi, qi) in p.iter().zip(q) {
299        if *pi > 0.0 && *qi > 0.0 {
300            kl += pi * (pi / qi).ln();
301        } else if *pi > 0.0 && *qi == 0.0 {
302            return Ok(f32::INFINITY);
303        }
304    }
305
306    Ok(kl)
307}
308
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_pearson_correlation() {
        // A linear relationship must yield a perfect correlation of 1.
        let a = Vector::new(vec![1.0, 2.0, 3.0, 4.0, 5.0]);
        let b = Vector::new(vec![2.0, 4.0, 6.0, 8.0, 10.0]);

        let r = a.pearson_correlation(&b).unwrap();
        assert!((r - 1.0).abs() < 1e-6);
    }

    #[test]
    fn test_spearman_correlation() {
        // Monotonic but non-linear data still gives perfect rank correlation.
        let a = Vector::new(vec![1.0, 2.0, 3.0, 4.0, 5.0]);
        let b = Vector::new(vec![1.0, 4.0, 9.0, 16.0, 25.0]);

        let rho = a.spearman_correlation(&b).unwrap();
        assert!((rho - 1.0).abs() < 1e-6);
    }

    #[test]
    fn test_jaccard_similarity() {
        let a = Vector::binary(vec![0b11110000]);
        let b = Vector::binary(vec![0b11001100]);

        // Intersection 0b11000000 has 2 bits; union 0b11111100 has 6.
        let sim = a.jaccard_similarity(&b).unwrap();
        assert!((sim - 2.0 / 6.0).abs() < 1e-6);
    }

    #[test]
    fn test_hamming_distance() {
        let a = Vector::binary(vec![0b11110000]);
        let b = Vector::binary(vec![0b11001100]);

        // XOR is 0b00111100: four differing bits.
        assert_eq!(a.hamming_distance(&b).unwrap(), 4);
    }

    #[test]
    fn test_jensen_shannon_divergence() {
        let uniform = Vector::new(vec![0.25, 0.25, 0.25, 0.25]);
        let skewed = Vector::new(vec![0.5, 0.3, 0.1, 0.1]);

        let jsd = uniform.jensen_shannon_divergence(&skewed).unwrap();
        assert!((0.0..=1.0).contains(&jsd));

        // Identical distributions diverge by zero.
        let jsd_self = uniform.jensen_shannon_divergence(&uniform).unwrap();
        assert!(jsd_self.abs() < 1e-6);
    }

    #[test]
    fn test_hellinger_distance() {
        let a = Vector::new(vec![0.25, 0.25, 0.25, 0.25]);
        let b = Vector::new(vec![0.25, 0.25, 0.25, 0.25]);

        // Equal distributions are at distance zero.
        assert!(a.hellinger_distance(&b).unwrap().abs() < 1e-6);

        // A point mass differs from the uniform distribution.
        let point = Vector::new(vec![1.0, 0.0, 0.0, 0.0]);
        assert!(a.hellinger_distance(&point).unwrap() > 0.0);
    }

    #[test]
    fn test_earth_movers_distance() {
        let left = Vector::new(vec![1.0, 0.0, 0.0, 0.0]);
        let right = Vector::new(vec![0.0, 0.0, 0.0, 1.0]);

        // Moving mass across the support costs something.
        assert!(left.earth_movers_distance(&right).unwrap() > 0.0);

        // No transport needed between identical distributions.
        assert!(left.earth_movers_distance(&left).unwrap().abs() < 1e-6);
    }

    #[test]
    fn test_mahalanobis_distance() {
        let a = Vector::new(vec![1.0, 2.0, 3.0]);
        let b = Vector::new(vec![4.0, 5.0, 6.0]);
        let variance = vec![1.0, 2.0, 3.0];

        assert!(a.mahalanobis_distance(&b, &variance).unwrap() > 0.0);

        // A vector is at zero distance from itself.
        assert!(a.mahalanobis_distance(&a, &variance).unwrap().abs() < 1e-6);
    }
}