use std::cmp;
use std::collections::HashSet;
use std::iter::FromIterator;
use super::ngrams::word_ngrams;
pub fn hamming_distance (word1: &str, word2: &str) -> Result<usize, &'static str> {
let chars1: Vec<char> = word1.chars().collect();
let chars2: Vec<char> = word2.chars().collect();
if chars1.len() != chars2.len() {
return Err("The words are not the same size");
}
let count = chars1.iter()
.zip(chars2.iter())
.filter(|(c1, c2)| c1 != c2)
.count();
Ok(count)
}
pub fn levenshtein_distance (word1: &str, word2: &str) -> usize {
compute_seq_distance(word1.chars().collect(), word2.chars().collect(), false)
}
pub fn optimal_string_alignment_distance (word1: &str, word2: &str) -> usize {
compute_seq_distance(word1.chars().collect(), word2.chars().collect(), true)
}
fn compute_seq_distance<T> (items1: Vec<T>, items2: Vec<T>, osa: bool) -> usize
where T: std::cmp::Eq {
if items1 == items2 {
0
} else if items1.is_empty() {
items2.len()
} else if items2.is_empty() {
items1.len()
} else {
let mut curr1: Vec<usize> = (0..=items2.len()).collect();
let mut curr2 = vec![0; items2.len()+1];
for i in 0..items1.len() {
let mut curr = vec![0; curr1.len()];
curr[0] = i+1;
for j in 0..items2.len() {
let cost = if items1[i] == items2[j] { 0 } else { 1 };
curr[j+1] = cmp::min(1+curr[j], cmp::min(1+curr1[j+1], cost+curr1[j]));
if osa && i>1 && j>1 && items1[i] == items2[j-1] && items1[i-1] == items2[j] {
curr[j+1] = cmp::min(curr[j+1], cost+curr2[j-1]);
}
}
curr2 = curr1;
curr1 = curr;
}
curr1.pop().unwrap_or_else(|| 0) }
}
pub fn sorenson_dice_similarity (word1: &str, word2: &str) -> f64 {
sorenson_dice_similarity_n(word1, word2, 2)
}
pub fn sorenson_dice_similarity_n (word1: &str, word2: &str, n: usize) -> f64 {
let ngrams1: HashSet<_> = HashSet::from_iter(word_ngrams(word1, n));
let ngrams2: HashSet<_> = HashSet::from_iter(word_ngrams(word2, n));
if ngrams1.is_empty() || ngrams2.is_empty() {
1.0
} else {
let total_size = (ngrams1.len() + ngrams2.len()) as f64;
let intersection: HashSet<_> = ngrams1.intersection(&ngrams2).collect();
let intersection_size = intersection.len() as f64;
(2.0 * intersection_size) / total_size
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_hamming_distance () {
assert_eq!(Ok(0), hamming_distance("", ""));
assert_eq!(Ok(0), hamming_distance("string", "string"));
assert_eq!(Ok(3), hamming_distance("this string", "that strong"));
assert_eq!(Ok(4), hamming_distance("This string", "that strong"));
assert_eq!(Err("The words are not the same size"), hamming_distance("no", "not"));
}
#[test]
fn test_levenshtein_distance () {
assert_eq!(0, levenshtein_distance("sitting", "sitting"));
assert_eq!(7, levenshtein_distance("sitting", ""));
assert_eq!(6, levenshtein_distance("", "kitten"));
assert_eq!(3, levenshtein_distance("sitting", "kitten"));
assert_eq!(3, levenshtein_distance("Sunday", "Saturday"));
assert_eq!(4, levenshtein_distance("Sunday", "saturday"));
assert_eq!(2, levenshtein_distance("cast", "cats"));
}
#[test]
fn test_optimal_string_alignment_distance () {
assert_eq!(3, optimal_string_alignment_distance("kitten", "sitting"));
assert_eq!(3, optimal_string_alignment_distance("this string", "that strnig"));
assert_eq!(3, optimal_string_alignment_distance("this string", "that strnig"));
assert_eq!(1, optimal_string_alignment_distance("cast", "cats"));
}
fn test_approx_same (num1: f64, num2: f64, eps: f64) {
assert!((num1-num2).abs() < eps);
}
#[test]
fn test_sorenson_dice_similarity () {
test_approx_same(0.8, sorenson_dice_similarity("Healed", "Sealed"), 0.01);
test_approx_same(0.55, sorenson_dice_similarity("Healed", "Healthy"), 0.01);
test_approx_same(0.44, sorenson_dice_similarity("Healed", "Heard"), 0.01);
test_approx_same(0.40, sorenson_dice_similarity("Healed", "Herded"), 0.01);
test_approx_same(0.25, sorenson_dice_similarity("Healed", "Help"), 0.01);
test_approx_same(0.0, sorenson_dice_similarity("Healed", "Sold"), 0.01);
test_approx_same(1.0, sorenson_dice_similarity("abcde fghi", "abcde fghi"), 0.01);
test_approx_same(0.5882, sorenson_dice_similarity("sympathize", "sympthise"), 0.01);
test_approx_same(0.5882, sorenson_dice_similarity_n("sympathize", "sympthise", 2), 0.01);
test_approx_same(0.4, sorenson_dice_similarity_n("sympathize", "sympthise", 3), 0.01);
test_approx_same(0.1538, sorenson_dice_similarity_n("sympathize", "sympthise", 4), 0.01);
}
}