oxirs_vec/
advanced_metrics.rs1use anyhow::Result;
16
17use crate::{Vector, VectorData};
18
19impl Vector {
20 pub fn pearson_correlation(&self, other: &Vector) -> Result<f32> {
22 if self.dimensions != other.dimensions {
23 return Err(anyhow::anyhow!("Vector dimensions must match"));
24 }
25
26 let x = self.as_f32();
27 let y = other.as_f32();
28 let n = x.len() as f32;
29
30 let sum_x: f32 = x.iter().sum();
31 let sum_y: f32 = y.iter().sum();
32 let sum_x2: f32 = x.iter().map(|v| v * v).sum();
33 let sum_y2: f32 = y.iter().map(|v| v * v).sum();
34 let sum_xy: f32 = x.iter().zip(&y).map(|(a, b)| a * b).sum();
35
36 let numerator = n * sum_xy - sum_x * sum_y;
37 let denominator = ((n * sum_x2 - sum_x * sum_x) * (n * sum_y2 - sum_y * sum_y)).sqrt();
38
39 if denominator == 0.0 {
40 Ok(0.0)
41 } else {
42 Ok(numerator / denominator)
43 }
44 }
45
46 pub fn spearman_correlation(&self, other: &Vector) -> Result<f32> {
48 if self.dimensions != other.dimensions {
49 return Err(anyhow::anyhow!("Vector dimensions must match"));
50 }
51
52 let x = self.as_f32();
53 let y = other.as_f32();
54
55 let x_ranks = to_ranks(&x);
57 let y_ranks = to_ranks(&y);
58
59 let rank_vec_x = Vector::new(x_ranks);
61 let rank_vec_y = Vector::new(y_ranks);
62
63 rank_vec_x.pearson_correlation(&rank_vec_y)
64 }
65
66 pub fn jaccard_similarity(&self, other: &Vector) -> Result<f32> {
68 if self.dimensions != other.dimensions {
69 return Err(anyhow::anyhow!("Vector dimensions must match"));
70 }
71
72 match (&self.values, &other.values) {
73 (VectorData::Binary(a), VectorData::Binary(b)) => {
74 let mut intersection = 0u32;
75 let mut union = 0u32;
76
77 for (byte_a, byte_b) in a.iter().zip(b) {
78 let and_result = byte_a & byte_b;
79 let or_result = byte_a | byte_b;
80
81 intersection += and_result.count_ones();
82 union += or_result.count_ones();
83 }
84
85 if union == 0 {
86 Ok(0.0)
87 } else {
88 Ok(intersection as f32 / union as f32)
89 }
90 }
91 _ => {
92 let a_binary = Vector::to_binary(&self.as_f32(), 0.5);
94 let b_binary = Vector::to_binary(&other.as_f32(), 0.5);
95
96 let vec_a = Vector::binary(a_binary);
97 let vec_b = Vector::binary(b_binary);
98
99 vec_a.jaccard_similarity(&vec_b)
100 }
101 }
102 }
103
104 pub fn hamming_distance(&self, other: &Vector) -> Result<u32> {
106 if self.dimensions != other.dimensions {
107 return Err(anyhow::anyhow!("Vector dimensions must match"));
108 }
109
110 match (&self.values, &other.values) {
111 (VectorData::Binary(a), VectorData::Binary(b)) => {
112 let mut distance = 0u32;
113
114 for (byte_a, byte_b) in a.iter().zip(b) {
115 let xor_result = byte_a ^ byte_b;
116 distance += xor_result.count_ones();
117 }
118
119 Ok(distance)
120 }
121 _ => {
122 let a_binary = Vector::to_binary(&self.as_f32(), 0.5);
124 let b_binary = Vector::to_binary(&other.as_f32(), 0.5);
125
126 let vec_a = Vector::binary(a_binary);
127 let vec_b = Vector::binary(b_binary);
128
129 vec_a.hamming_distance(&vec_b)
130 }
131 }
132 }
133
134 pub fn jensen_shannon_divergence(&self, other: &Vector) -> Result<f32> {
136 if self.dimensions != other.dimensions {
137 return Err(anyhow::anyhow!("Vector dimensions must match"));
138 }
139
140 let p = normalize_to_probability(&self.as_f32())?;
141 let q = normalize_to_probability(&other.as_f32())?;
142
143 let m: Vec<f32> = p.iter().zip(&q).map(|(a, b)| (a + b) / 2.0).collect();
145
146 let kl_pm = kl_divergence_raw(&p, &m)?;
148 let kl_qm = kl_divergence_raw(&q, &m)?;
149
150 Ok((kl_pm + kl_qm) / 2.0)
151 }
152
153 pub fn kl_divergence(&self, other: &Vector) -> Result<f32> {
155 if self.dimensions != other.dimensions {
156 return Err(anyhow::anyhow!("Vector dimensions must match"));
157 }
158
159 let p = normalize_to_probability(&self.as_f32())?;
160 let q = normalize_to_probability(&other.as_f32())?;
161
162 kl_divergence_raw(&p, &q)
163 }
164
165 pub fn hellinger_distance(&self, other: &Vector) -> Result<f32> {
167 if self.dimensions != other.dimensions {
168 return Err(anyhow::anyhow!("Vector dimensions must match"));
169 }
170
171 let p = normalize_to_probability(&self.as_f32())?;
172 let q = normalize_to_probability(&other.as_f32())?;
173
174 let sum: f32 = p
175 .iter()
176 .zip(&q)
177 .map(|(a, b)| (a.sqrt() - b.sqrt()).powi(2))
178 .sum();
179
180 Ok((sum / 2.0).sqrt())
181 }
182
183 pub fn earth_movers_distance(&self, other: &Vector) -> Result<f32> {
185 if self.dimensions != other.dimensions {
186 return Err(anyhow::anyhow!("Vector dimensions must match"));
187 }
188
189 let p = normalize_to_probability(&self.as_f32())?;
191 let q = normalize_to_probability(&other.as_f32())?;
192
193 let mut cdf_p = vec![0.0; p.len()];
195 let mut cdf_q = vec![0.0; q.len()];
196
197 cdf_p[0] = p[0];
198 cdf_q[0] = q[0];
199
200 for i in 1..p.len() {
201 cdf_p[i] = cdf_p[i - 1] + p[i];
202 cdf_q[i] = cdf_q[i - 1] + q[i];
203 }
204
205 let emd: f32 = cdf_p.iter().zip(&cdf_q).map(|(a, b)| (a - b).abs()).sum();
207
208 Ok(emd)
209 }
210
211 pub fn wasserstein_distance(&self, other: &Vector) -> Result<f32> {
213 self.earth_movers_distance(other)
214 }
215
216 pub fn mahalanobis_distance(&self, other: &Vector, variance: &[f32]) -> Result<f32> {
219 if self.dimensions != other.dimensions || self.dimensions != variance.len() {
220 return Err(anyhow::anyhow!("Dimensions must match"));
221 }
222
223 let x = self.as_f32();
224 let y = other.as_f32();
225
226 let distance_sq: f32 = x
227 .iter()
228 .zip(&y)
229 .zip(variance)
230 .map(|((a, b), &var)| {
231 if var > 0.0 {
232 (a - b).powi(2) / var
233 } else {
234 0.0
235 }
236 })
237 .sum();
238
239 Ok(distance_sq.sqrt())
240 }
241}
242
243fn to_ranks(values: &[f32]) -> Vec<f32> {
245 let mut indexed: Vec<(usize, f32)> = values.iter().enumerate().map(|(i, &v)| (i, v)).collect();
246
247 indexed.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal));
248
249 let mut ranks = vec![0.0; values.len()];
250 let mut i = 0;
251
252 while i < indexed.len() {
253 let mut j = i;
254 while j < indexed.len() && indexed[j].1 == indexed[i].1 {
256 j += 1;
257 }
258
259 let avg_rank = (i + j) as f32 / 2.0 + 0.5;
261 for k in i..j {
262 ranks[indexed[k].0] = avg_rank;
263 }
264
265 i = j;
266 }
267
268 ranks
269}
270
271fn normalize_to_probability(values: &[f32]) -> Result<Vec<f32>> {
273 let min_val = values.iter().fold(f32::INFINITY, |a, &b| a.min(b));
275 let shifted: Vec<f32> = if min_val < 0.0 {
276 values.iter().map(|v| v - min_val).collect()
277 } else {
278 values.to_vec()
279 };
280
281 let sum: f32 = shifted.iter().sum();
282
283 if sum == 0.0 {
284 Ok(vec![1.0 / values.len() as f32; values.len()])
286 } else {
287 Ok(shifted.iter().map(|v| v / sum).collect())
288 }
289}
290
291fn kl_divergence_raw(p: &[f32], q: &[f32]) -> Result<f32> {
293 if p.len() != q.len() {
294 return Err(anyhow::anyhow!("Distributions must have same length"));
295 }
296
297 let mut kl = 0.0;
298 for (pi, qi) in p.iter().zip(q) {
299 if *pi > 0.0 && *qi > 0.0 {
300 kl += pi * (pi / qi).ln();
301 } else if *pi > 0.0 && *qi == 0.0 {
302 return Ok(f32::INFINITY);
303 }
304 }
305
306 Ok(kl)
307}
308
309#[cfg(test)]
310mod tests {
311 use super::*;
312
313 #[test]
314 fn test_pearson_correlation() {
315 let vec1 = Vector::new(vec![1.0, 2.0, 3.0, 4.0, 5.0]);
316 let vec2 = Vector::new(vec![2.0, 4.0, 6.0, 8.0, 10.0]);
317
318 let correlation = vec1.pearson_correlation(&vec2).unwrap();
319 assert!((correlation - 1.0).abs() < 1e-6); }
321
322 #[test]
323 fn test_spearman_correlation() {
324 let vec1 = Vector::new(vec![1.0, 2.0, 3.0, 4.0, 5.0]);
325 let vec2 = Vector::new(vec![1.0, 4.0, 9.0, 16.0, 25.0]); let correlation = vec1.spearman_correlation(&vec2).unwrap();
328 assert!((correlation - 1.0).abs() < 1e-6); }
330
331 #[test]
332 fn test_jaccard_similarity() {
333 let vec1 = Vector::binary(vec![0b11110000]);
334 let vec2 = Vector::binary(vec![0b11001100]);
335
336 let similarity = vec1.jaccard_similarity(&vec2).unwrap();
337 assert!((similarity - 2.0 / 6.0).abs() < 1e-6);
340 }
341
342 #[test]
343 fn test_hamming_distance() {
344 let vec1 = Vector::binary(vec![0b11110000]);
345 let vec2 = Vector::binary(vec![0b11001100]);
346
347 let distance = vec1.hamming_distance(&vec2).unwrap();
348 assert_eq!(distance, 4);
350 }
351
352 #[test]
353 fn test_jensen_shannon_divergence() {
354 let vec1 = Vector::new(vec![0.25, 0.25, 0.25, 0.25]);
355 let vec2 = Vector::new(vec![0.5, 0.3, 0.1, 0.1]);
356
357 let jsd = vec1.jensen_shannon_divergence(&vec2).unwrap();
358 assert!((0.0..=1.0).contains(&jsd));
359
360 let jsd_same = vec1.jensen_shannon_divergence(&vec1).unwrap();
362 assert!(jsd_same.abs() < 1e-6);
363 }
364
365 #[test]
366 fn test_hellinger_distance() {
367 let vec1 = Vector::new(vec![0.25, 0.25, 0.25, 0.25]);
368 let vec2 = Vector::new(vec![0.25, 0.25, 0.25, 0.25]);
369
370 let distance = vec1.hellinger_distance(&vec2).unwrap();
371 assert!(distance.abs() < 1e-6); let vec3 = Vector::new(vec![1.0, 0.0, 0.0, 0.0]);
374 let distance2 = vec1.hellinger_distance(&vec3).unwrap();
375 assert!(distance2 > 0.0);
376 }
377
378 #[test]
379 fn test_earth_movers_distance() {
380 let vec1 = Vector::new(vec![1.0, 0.0, 0.0, 0.0]);
381 let vec2 = Vector::new(vec![0.0, 0.0, 0.0, 1.0]);
382
383 let emd = vec1.earth_movers_distance(&vec2).unwrap();
384 assert!(emd > 0.0); let emd_same = vec1.earth_movers_distance(&vec1).unwrap();
388 assert!(emd_same.abs() < 1e-6);
389 }
390
391 #[test]
392 fn test_mahalanobis_distance() {
393 let vec1 = Vector::new(vec![1.0, 2.0, 3.0]);
394 let vec2 = Vector::new(vec![4.0, 5.0, 6.0]);
395 let variance = vec![1.0, 2.0, 3.0];
396
397 let distance = vec1.mahalanobis_distance(&vec2, &variance).unwrap();
398 assert!(distance > 0.0);
399
400 let self_distance = vec1.mahalanobis_distance(&vec1, &variance).unwrap();
402 assert!(self_distance.abs() < 1e-6);
403 }
404}