#![allow(dead_code)]
use rayon::prelude::*;
#[derive(Debug, Clone, PartialEq)]
#[allow(missing_docs)]
pub enum SimilarityError {
DimensionMismatch { left: usize, right: usize },
EmptyVector,
ZeroNorm,
HistogramLengthMismatch { left: usize, right: usize },
EmptyHistogram,
NegativeHistogramValue,
EmptyCorpus,
ZeroTopK,
}
impl std::fmt::Display for SimilarityError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Self::DimensionMismatch { left, right } => {
write!(f, "dimension mismatch: {left} vs {right}")
}
Self::EmptyVector => write!(f, "feature vector is empty"),
Self::ZeroNorm => write!(f, "cannot compute cosine similarity with zero-norm vector"),
Self::HistogramLengthMismatch { left, right } => {
write!(f, "histogram length mismatch: {left} vs {right}")
}
Self::EmptyHistogram => write!(f, "histogram is empty"),
Self::NegativeHistogramValue => write!(f, "histogram contains negative values"),
Self::EmptyCorpus => write!(f, "corpus is empty"),
Self::ZeroTopK => write!(f, "top-K must be at least 1"),
}
}
}
impl std::error::Error for SimilarityError {}
pub type SimilarityResult<T> = Result<T, SimilarityError>;
#[derive(Debug, Clone, PartialEq)]
pub struct FeatureVector {
data: Vec<f64>,
}
impl FeatureVector {
pub fn new(data: Vec<f64>) -> SimilarityResult<Self> {
if data.is_empty() {
return Err(SimilarityError::EmptyVector);
}
Ok(Self { data })
}
#[must_use]
pub fn len(&self) -> usize {
self.data.len()
}
#[must_use]
pub fn is_empty(&self) -> bool {
self.data.is_empty()
}
#[must_use]
pub fn as_slice(&self) -> &[f64] {
&self.data
}
#[must_use]
pub fn l2_norm(&self) -> f64 {
self.data.iter().map(|&x| x * x).sum::<f64>().sqrt()
}
#[must_use]
pub fn l1_norm(&self) -> f64 {
self.data.iter().map(|&x| x.abs()).sum()
}
#[must_use]
pub fn l2_normalized(&self) -> Option<Self> {
let norm = self.l2_norm();
if norm < 1e-15 {
return None;
}
Some(Self {
data: self.data.iter().map(|&x| x / norm).collect(),
})
}
#[must_use]
pub fn z_score_normalized(&self) -> Option<Self> {
let n = self.data.len() as f64;
let mean = self.data.iter().sum::<f64>() / n;
let variance = self.data.iter().map(|&x| (x - mean).powi(2)).sum::<f64>() / n;
let std = variance.sqrt();
if std < 1e-15 {
return None;
}
Some(Self {
data: self.data.iter().map(|&x| (x - mean) / std).collect(),
})
}
}
#[derive(Debug, Clone, Default)]
pub struct CosineSimilarity;
impl CosineSimilarity {
pub fn compute(&self, a: &FeatureVector, b: &FeatureVector) -> SimilarityResult<f64> {
if a.len() != b.len() {
return Err(SimilarityError::DimensionMismatch {
left: a.len(),
right: b.len(),
});
}
let norm_a = a.l2_norm();
let norm_b = b.l2_norm();
if norm_a < 1e-15 || norm_b < 1e-15 {
return Err(SimilarityError::ZeroNorm);
}
let dot: f64 = a
.as_slice()
.iter()
.zip(b.as_slice().iter())
.map(|(&x, &y)| x * y)
.sum();
Ok((dot / (norm_a * norm_b)).clamp(-1.0, 1.0))
}
pub fn distance(&self, a: &FeatureVector, b: &FeatureVector) -> SimilarityResult<f64> {
Ok(1.0 - self.compute(a, b)?)
}
}
#[derive(Debug, Clone, Default)]
pub struct EarthMoverDistance;
impl EarthMoverDistance {
pub fn compute(&self, a: &[f64], b: &[f64]) -> SimilarityResult<f64> {
if a.is_empty() || b.is_empty() {
return Err(SimilarityError::EmptyHistogram);
}
if a.len() != b.len() {
return Err(SimilarityError::HistogramLengthMismatch {
left: a.len(),
right: b.len(),
});
}
if a.iter().any(|&v| v < 0.0) || b.iter().any(|&v| v < 0.0) {
return Err(SimilarityError::NegativeHistogramValue);
}
let sum_a: f64 = a.iter().sum();
let sum_b: f64 = b.iter().sum();
if sum_a < 1e-15 && sum_b < 1e-15 {
return Ok(0.0);
}
let norm_a = if sum_a < 1e-15 { 1.0 } else { sum_a };
let norm_b = if sum_b < 1e-15 { 1.0 } else { sum_b };
let mut cdf_a = 0.0_f64;
let mut cdf_b = 0.0_f64;
let mut emd = 0.0_f64;
for (&va, &vb) in a.iter().zip(b.iter()) {
cdf_a += va / norm_a;
cdf_b += vb / norm_b;
emd += (cdf_a - cdf_b).abs();
}
Ok(emd)
}
pub fn normalized(&self, a: &[f64], b: &[f64]) -> SimilarityResult<f64> {
let emd = self.compute(a, b)?;
Ok(emd / a.len() as f64)
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SimilarityMode {
Cosine,
EmdSimilarity,
}
#[derive(Debug, Clone)]
pub struct SimilarityMatrix {
pub matrix: Vec<Vec<f64>>,
pub n: usize,
}
impl SimilarityMatrix {
pub fn compute(vectors: &[FeatureVector], mode: SimilarityMode) -> SimilarityResult<Self> {
if vectors.is_empty() {
return Err(SimilarityError::EmptyCorpus);
}
let n = vectors.len();
let cos = CosineSimilarity;
let emd = EarthMoverDistance;
let upper: Vec<Vec<f64>> = (0..n)
.into_par_iter()
.map(|i| {
let mut row = vec![0.0_f64; n];
for j in i..n {
if i == j {
row[j] = 1.0;
} else {
let sim = match mode {
SimilarityMode::Cosine => {
cos.compute(&vectors[i], &vectors[j]).unwrap_or(0.0)
}
SimilarityMode::EmdSimilarity => {
let d = emd
.normalized(vectors[i].as_slice(), vectors[j].as_slice())
.unwrap_or(1.0);
(1.0 - d).clamp(0.0, 1.0)
}
};
row[j] = sim;
}
}
row
})
.collect();
let mut matrix = upper;
for i in 0..n {
for j in 0..i {
let val = matrix[j][i];
matrix[i][j] = val;
}
}
Ok(Self { matrix, n })
}
#[must_use]
pub fn get(&self, i: usize, j: usize) -> Option<f64> {
self.matrix.get(i).and_then(|row| row.get(j)).copied()
}
}
#[derive(Debug, Clone, PartialEq)]
pub struct SimilarityHit {
pub index: usize,
pub score: f64,
}
#[derive(Debug)]
pub struct AudioSimilaritySearch {
corpus: Vec<FeatureVector>,
mode: SimilarityMode,
}
impl AudioSimilaritySearch {
pub fn new(corpus: Vec<FeatureVector>, mode: SimilarityMode) -> SimilarityResult<Self> {
if corpus.is_empty() {
return Err(SimilarityError::EmptyCorpus);
}
Ok(Self { corpus, mode })
}
pub fn search(
&self,
query: &FeatureVector,
k: usize,
min_score: Option<f64>,
) -> SimilarityResult<Vec<SimilarityHit>> {
if k == 0 {
return Err(SimilarityError::ZeroTopK);
}
let cos = CosineSimilarity;
let emd = EarthMoverDistance;
let threshold = min_score.unwrap_or(f64::NEG_INFINITY);
let scores: Vec<SimilarityResult<f64>> = self
.corpus
.par_iter()
.map(|item| match self.mode {
SimilarityMode::Cosine => cos.compute(query, item),
SimilarityMode::EmdSimilarity => {
let d = emd.normalized(query.as_slice(), item.as_slice())?;
Ok((1.0 - d).clamp(0.0, 1.0))
}
})
.collect();
let mut hits: Vec<SimilarityHit> = Vec::with_capacity(self.corpus.len());
for (idx, score_res) in scores.into_iter().enumerate() {
let score = score_res?;
if score >= threshold {
hits.push(SimilarityHit { index: idx, score });
}
}
hits.sort_by(|a, b| {
b.score
.partial_cmp(&a.score)
.unwrap_or(std::cmp::Ordering::Equal)
});
hits.truncate(k);
Ok(hits)
}
#[must_use]
pub fn len(&self) -> usize {
self.corpus.len()
}
#[must_use]
pub fn is_empty(&self) -> bool {
self.corpus.is_empty()
}
}
#[cfg(test)]
mod tests {
use super::*;
fn fv(data: Vec<f64>) -> FeatureVector {
FeatureVector::new(data).expect("test vector must not be empty")
}
#[test]
fn test_feature_vector_l2_norm() {
let v = fv(vec![3.0, 4.0]);
assert!((v.l2_norm() - 5.0).abs() < 1e-10);
}
#[test]
fn test_feature_vector_l1_norm() {
let v = fv(vec![-1.0, 2.0, -3.0]);
assert!((v.l1_norm() - 6.0).abs() < 1e-10);
}
#[test]
fn test_feature_vector_l2_normalized() {
let v = fv(vec![3.0, 4.0]);
let n = v.l2_normalized().expect("should not be zero");
assert!((n.l2_norm() - 1.0).abs() < 1e-10);
}
#[test]
fn test_feature_vector_empty_rejected() {
let res = FeatureVector::new(vec![]);
assert!(matches!(res, Err(SimilarityError::EmptyVector)));
}
#[test]
fn test_cosine_identical_vectors() {
let a = fv(vec![1.0, 2.0, 3.0]);
let cos = CosineSimilarity;
let sim = cos.compute(&a, &a).expect("should succeed");
assert!((sim - 1.0).abs() < 1e-10, "identical vectors have sim=1.0");
}
#[test]
fn test_cosine_orthogonal_vectors() {
let a = fv(vec![1.0, 0.0, 0.0]);
let b = fv(vec![0.0, 1.0, 0.0]);
let cos = CosineSimilarity;
let sim = cos.compute(&a, &b).expect("should succeed");
assert!((sim - 0.0).abs() < 1e-10, "orthogonal vectors have sim=0.0");
}
#[test]
fn test_cosine_opposite_vectors() {
let a = fv(vec![1.0, 0.0]);
let b = fv(vec![-1.0, 0.0]);
let cos = CosineSimilarity;
let sim = cos.compute(&a, &b).expect("should succeed");
assert!((sim - (-1.0)).abs() < 1e-10);
}
#[test]
fn test_cosine_dimension_mismatch() {
let a = fv(vec![1.0, 2.0]);
let b = fv(vec![1.0, 2.0, 3.0]);
let cos = CosineSimilarity;
let err = cos.compute(&a, &b).unwrap_err();
assert!(matches!(err, SimilarityError::DimensionMismatch { .. }));
}
#[test]
fn test_cosine_zero_norm_error() {
let a = fv(vec![0.0, 0.0]);
let b = fv(vec![1.0, 0.0]);
let cos = CosineSimilarity;
let err = cos.compute(&a, &b).unwrap_err();
assert!(matches!(err, SimilarityError::ZeroNorm));
}
#[test]
fn test_emd_identical_distributions() {
let hist = vec![0.1, 0.4, 0.3, 0.2];
let emd_calc = EarthMoverDistance;
let d = emd_calc.compute(&hist, &hist).expect("should succeed");
assert!(d.abs() < 1e-10, "identical distributions have EMD=0");
}
#[test]
fn test_emd_completely_different() {
let n = 10;
let mut a = vec![0.0; n];
let mut b = vec![0.0; n];
a[0] = 1.0;
b[n - 1] = 1.0;
let emd_calc = EarthMoverDistance;
let d = emd_calc.compute(&a, &b).expect("should succeed");
assert!(d > 0.0, "different distributions should have EMD > 0");
}
#[test]
fn test_emd_length_mismatch() {
let a = vec![1.0, 2.0];
let b = vec![1.0, 2.0, 3.0];
let emd_calc = EarthMoverDistance;
let err = emd_calc.compute(&a, &b).unwrap_err();
assert!(matches!(
err,
SimilarityError::HistogramLengthMismatch { .. }
));
}
#[test]
fn test_similarity_matrix_diagonal_is_one() {
let vectors = vec![
fv(vec![1.0, 0.0, 0.0]),
fv(vec![0.0, 1.0, 0.0]),
fv(vec![0.0, 0.0, 1.0]),
];
let mat =
SimilarityMatrix::compute(&vectors, SimilarityMode::Cosine).expect("should succeed");
for i in 0..3 {
assert!(
(mat.get(i, i).expect("in bounds") - 1.0).abs() < 1e-10,
"diagonal should be 1.0"
);
}
}
#[test]
fn test_similarity_matrix_is_symmetric() {
let vectors = vec![
fv(vec![1.0, 2.0, 3.0]),
fv(vec![4.0, 5.0, 6.0]),
fv(vec![7.0, 8.0, 9.0]),
];
let mat =
SimilarityMatrix::compute(&vectors, SimilarityMode::Cosine).expect("should succeed");
for i in 0..3 {
for j in 0..3 {
let a = mat.get(i, j).expect("in bounds");
let b = mat.get(j, i).expect("in bounds");
assert!(
(a - b).abs() < 1e-10,
"matrix should be symmetric: [{i}][{j}]={a:.6} != [{j}][{i}]={b:.6}"
);
}
}
}
#[test]
fn test_search_returns_top_k() {
let corpus = vec![
fv(vec![1.0, 0.0]),
fv(vec![0.0, 1.0]),
fv(vec![1.0, 1.0]),
fv(vec![-1.0, 0.0]),
];
let search =
AudioSimilaritySearch::new(corpus, SimilarityMode::Cosine).expect("should succeed");
let query = fv(vec![1.0, 0.0]);
let hits = search.search(&query, 2, None).expect("should succeed");
assert_eq!(hits.len(), 2, "should return exactly 2 results");
assert_eq!(hits[0].index, 0, "most similar should be index 0");
}
#[test]
fn test_search_with_min_score_filters() {
let corpus = vec![
fv(vec![1.0, 0.0]),
fv(vec![0.0, 1.0]), ];
let search =
AudioSimilaritySearch::new(corpus, SimilarityMode::Cosine).expect("should succeed");
let query = fv(vec![1.0, 0.0]);
let hits = search
.search(&query, 10, Some(0.5))
.expect("should succeed");
assert_eq!(
hits.len(),
1,
"only the identical vector should pass the threshold"
);
assert_eq!(hits[0].index, 0);
}
#[test]
fn test_search_empty_corpus_error() {
let err = AudioSimilaritySearch::new(vec![], SimilarityMode::Cosine).unwrap_err();
assert!(matches!(err, SimilarityError::EmptyCorpus));
}
#[test]
fn test_search_zero_k_error() {
let corpus = vec![fv(vec![1.0, 2.0])];
let search =
AudioSimilaritySearch::new(corpus, SimilarityMode::Cosine).expect("should succeed");
let query = fv(vec![1.0, 2.0]);
let err = search.search(&query, 0, None).unwrap_err();
assert!(matches!(err, SimilarityError::ZeroTopK));
}
}