use crate::primitives::Vector;
use crate::AprenderError;
pub fn cosine_similarity(a: &Vector<f64>, b: &Vector<f64>) -> Result<f64, AprenderError> {
if a.len() != b.len() {
return Err(AprenderError::Other(
"Vectors must have same length".to_string(),
));
}
if a.is_empty() {
return Err(AprenderError::Other("Vectors cannot be empty".to_string()));
}
let dot_product: f64 = a
.as_slice()
.iter()
.zip(b.as_slice())
.map(|(x, y)| x * y)
.sum();
let norm_a: f64 = a.as_slice().iter().map(|x| x * x).sum::<f64>().sqrt();
let norm_b: f64 = b.as_slice().iter().map(|x| x * x).sum::<f64>().sqrt();
if norm_a == 0.0 || norm_b == 0.0 {
return Ok(0.0); }
Ok(dot_product / (norm_a * norm_b))
}
pub fn jaccard_similarity<S: AsRef<str>>(a: &[S], b: &[S]) -> Result<f64, AprenderError> {
if a.is_empty() && b.is_empty() {
return Ok(1.0); }
if a.is_empty() || b.is_empty() {
return Ok(0.0); }
let set_a: std::collections::HashSet<&str> = a.iter().map(AsRef::as_ref).collect();
let set_b: std::collections::HashSet<&str> = b.iter().map(AsRef::as_ref).collect();
let intersection_size = set_a.intersection(&set_b).count();
let union_size = set_a.union(&set_b).count();
if union_size == 0 {
return Ok(0.0);
}
Ok(intersection_size as f64 / union_size as f64)
}
pub fn edit_distance(a: &str, b: &str) -> Result<usize, AprenderError> {
let a_chars: Vec<char> = a.chars().collect();
let b_chars: Vec<char> = b.chars().collect();
let m = a_chars.len();
let n = b_chars.len();
if m == 0 {
return Ok(n);
}
if n == 0 {
return Ok(m);
}
let mut dp = vec![vec![0usize; n + 1]; m + 1];
#[allow(clippy::needless_range_loop)]
for i in 0..=m {
dp[i][0] = i;
}
#[allow(clippy::needless_range_loop)]
for j in 0..=n {
dp[0][j] = j;
}
for i in 1..=m {
for j in 1..=n {
let cost = usize::from(a_chars[i - 1] != b_chars[j - 1]);
dp[i][j] = std::cmp::min(
std::cmp::min(
dp[i - 1][j] + 1, dp[i][j - 1] + 1, ),
dp[i - 1][j - 1] + cost, );
}
}
Ok(dp[m][n])
}
pub fn edit_distance_similarity(a: &str, b: &str) -> Result<f64, AprenderError> {
let distance = edit_distance(a, b)?;
let max_len = std::cmp::max(a.len(), b.len());
if max_len == 0 {
return Ok(1.0); }
Ok(1.0 - (distance as f64 / max_len as f64))
}
fn init_similarity_matrix(n: usize) -> Vec<Vec<f64>> {
let mut matrix = vec![vec![0.0; n]; n];
for (i, row) in matrix.iter_mut().enumerate() {
row[i] = 1.0;
}
matrix
}
fn store_symmetric_similarity(
similarities: &mut [Vec<f64>],
vectors: &[Vector<f64>],
i: usize,
j: usize,
) -> Result<(), AprenderError> {
let sim = cosine_similarity(&vectors[i], &vectors[j])?;
similarities[i][j] = sim;
similarities[j][i] = sim;
Ok(())
}
pub fn pairwise_cosine_similarity(vectors: &[Vector<f64>]) -> Result<Vec<Vec<f64>>, AprenderError> {
if vectors.is_empty() {
return Ok(Vec::new());
}
let n = vectors.len();
let mut similarities = init_similarity_matrix(n);
for i in 0..n {
for j in (i + 1)..n {
store_symmetric_similarity(&mut similarities, vectors, i, j)?;
}
}
Ok(similarities)
}
pub fn top_k_similar(
query: &Vector<f64>,
documents: &[Vector<f64>],
k: usize,
) -> Result<Vec<(usize, f64)>, AprenderError> {
if documents.is_empty() {
return Ok(Vec::new());
}
let mut similarities: Vec<(usize, f64)> = documents
.iter()
.enumerate()
.map(|(idx, doc)| {
let sim = cosine_similarity(query, doc).unwrap_or(0.0);
(idx, sim)
})
.collect();
similarities.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
similarities.truncate(k);
Ok(similarities)
}
#[cfg(test)]
#[path = "similarity_tests.rs"]
mod tests;
#[cfg(test)]
#[path = "similarity_contract_falsify.rs"]
mod similarity_contract_falsify;