use crate::utils::StringWrapper;
use crate::algorithms::{Similarity, SimilarityMetric};
use std::cmp::min;
pub fn generic_levenshtein<'a, 'b, Iter1, Iter2, Elem1, Elem2>(a: &'a Iter1, b: &'b Iter2) -> usize
where
&'a Iter1: IntoIterator<Item = Elem1>,
&'b Iter2: IntoIterator<Item = Elem2>,
Elem1: PartialEq<Elem2>,
{
let b_len = b.into_iter().count();
let mut cache: Vec<usize> = (1..b_len + 1).collect();
let mut result = b_len;
for (i, a_elem) in a.into_iter().enumerate() {
result = i + 1;
let mut distance_b = i;
for (j, b_elem) in b.into_iter().enumerate() {
let cost = usize::from(a_elem != b_elem);
let distance_a = distance_b + cost;
distance_b = cache[j];
result = min(result + 1, min(distance_a, distance_b + 1));
cache[j] = result;
}
}
result
}
pub fn levenshtein(a: &str, b: &str) -> usize {
generic_levenshtein(&StringWrapper(a), &StringWrapper(b))
}
pub fn normalized_levenshtein(a: &str, b: &str) -> f64 {
if a.is_empty() && b.is_empty() {
return 1.0;
}
1.0 - (levenshtein(a, b) as f64) / (a.chars().count().max(b.chars().count()) as f64)
}
pub struct Levenshtein;
pub struct NormalizedLevenshtein;
impl SimilarityMetric for Levenshtein {
fn compute_metric(&self, a: &str, b: &str) -> Similarity {
Similarity::Usize(levenshtein(a, b))
}
}
impl SimilarityMetric for NormalizedLevenshtein {
fn compute_metric(&self, a: &str, b: &str) -> Similarity {
Similarity::Float(normalized_levenshtein(a, b))
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn levenshtein_empty() {
assert_eq!(0, levenshtein("", ""));
}
#[test]
fn levenshtein_same() {
assert_eq!(0, levenshtein("levenshtein", "levenshtein"));
}
#[test]
fn levenshtein_diff_short() {
assert_eq!(3, levenshtein("kitten", "sitting"));
}
#[test]
fn levenshtein_diff_with_space() {
assert_eq!(5, levenshtein("hello, world", "bye, world"));
}
#[test]
fn levenshtein_diff_multibyte() {
assert_eq!(3, levenshtein("öঙ香", "abc"));
assert_eq!(3, levenshtein("abc", "öঙ香"));
}
#[test]
fn levenshtein_diff_longer() {
let a = "The quick brown fox jumped over the angry dog.";
let b = "Lorem ipsum dolor sit amet, dicta latine an eam.";
assert_eq!(37, levenshtein(a, b));
}
#[test]
fn levenshtein_first_empty() {
assert_eq!(7, levenshtein("", "sitting"));
}
#[test]
fn levenshtein_second_empty() {
assert_eq!(6, levenshtein("kitten", ""));
}
#[test]
fn normalized_levenshtein_diff_short() {
assert_delta!(0.57142, normalized_levenshtein("kitten", "sitting"));
}
#[test]
fn normalized_levenshtein_for_empty_strings() {
assert_delta!(1.0, normalized_levenshtein("", ""));
}
#[test]
fn normalized_levenshtein_first_empty() {
assert_delta!(0.0, normalized_levenshtein("", "second"));
}
#[test]
fn normalized_levenshtein_second_empty() {
assert_delta!(0.0, normalized_levenshtein("first", ""));
}
#[test]
fn normalized_levenshtein_identical_strings() {
assert_delta!(1.0, normalized_levenshtein("identical", "identical"));
}
}