use crate::algorithms::{Similarity, SimilarityMetric};
use crate::utils::StringWrapper;
use std::cmp::{max, min};
fn generic_jaro<'a, 'b, Iter1, Iter2, Elem1, Elem2>(a: &'a Iter1, b: &'b Iter2) -> f64
where
&'a Iter1: IntoIterator<Item = Elem1>,
&'b Iter2: IntoIterator<Item = Elem2>,
Elem1: PartialEq<Elem2>,
{
let a_len = a.into_iter().count();
let b_len = b.into_iter().count();
if a_len == 0 && b_len == 0 {
return 1.0;
} else if a_len == 0 || b_len == 0 {
return 0.0;
}
let mut search_range = max(a_len, b_len) / 2;
search_range = search_range.saturating_sub(1);
let mut flags_memory = vec![false; a_len + b_len];
let (a_flags, b_flags) = flags_memory.split_at_mut(a_len);
let mut matches = 0_usize;
for (i, a_elem) in a.into_iter().enumerate() {
let min_bound = if i > search_range {
i - search_range
} else {
0
};
let max_bound = min(b_len, i + search_range + 1);
for (j, b_elem) in b.into_iter().enumerate().take(max_bound) {
if min_bound <= j && a_elem == b_elem && !b_flags[j] {
a_flags[i] = true;
b_flags[j] = true;
matches += 1;
break;
}
}
}
let mut transpositions = 0_usize;
if matches != 0 {
let mut b_iter = b_flags.iter().zip(b);
for (a_flag, ch1) in a_flags.iter().zip(a) {
if *a_flag {
loop {
if let Some((b_flag, ch2)) = b_iter.next() {
if !*b_flag {
continue;
}
if ch1 != ch2 {
transpositions += 1;
}
break;
}
}
}
}
}
transpositions /= 2;
if matches == 0 {
0.0
} else {
((matches as f64 / a_len as f64)
+ (matches as f64 / b_len as f64)
+ ((matches - transpositions) as f64 / matches as f64))
/ 3.0
}
}
fn generic_jaro_winkler<'a, 'b, Iter1, Iter2, Elem1, Elem2>(a: &'a Iter1, b: &'b Iter2) -> f64
where
&'a Iter1: IntoIterator<Item = Elem1>,
&'b Iter2: IntoIterator<Item = Elem2>,
Elem1: PartialEq<Elem2>,
{
let sim = generic_jaro(a, b);
if sim > 0.7 {
let prefix_length = a
.into_iter()
.take(4)
.zip(b)
.take_while(|(a_elem, b_elem)| a_elem == b_elem)
.count();
sim + 0.1 * prefix_length as f64 * (1.0 - sim)
} else {
sim
}
}
pub fn jaro(a: &str, b: &str) -> f64 {
generic_jaro(&StringWrapper(a), &StringWrapper(b))
}
pub fn jaro_winkler(a: &str, b: &str) -> f64 {
generic_jaro_winkler(&StringWrapper(a), &StringWrapper(b))
}
pub struct Jaro;
pub struct JaroWinkler;
impl SimilarityMetric for Jaro {
fn compute_metric(&self, a: &str, b: &str) -> Similarity {
Similarity::Float(jaro(a, b))
}
}
impl SimilarityMetric for JaroWinkler {
fn compute_metric(&self, a: &str, b: &str) -> Similarity {
Similarity::Float(jaro_winkler(a, b))
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn jaro_both_empty() {
assert_eq!(1.0, jaro("", ""));
}
#[test]
fn jaro_first_empty() {
assert_eq!(0.0, jaro("", "jaro"));
}
#[test]
fn jaro_second_empty() {
assert_eq!(0.0, jaro("distance", ""));
}
#[test]
fn jaro_same() {
assert_eq!(1.0, jaro("jaro", "jaro"));
}
#[test]
fn jaro_multibyte() {
assert_delta!(0.818, jaro("testabctest", "testöঙ香test"), 0.001);
assert_delta!(0.818, jaro("testöঙ香test", "testabctest"), 0.001);
}
#[test]
fn jaro_diff_short() {
assert_delta!(0.767, jaro("dixon", "dicksonx"), 0.001);
}
#[test]
fn jaro_diff_one_character() {
assert_eq!(0.0, jaro("a", "b"));
}
#[test]
fn jaro_same_one_character() {
assert_eq!(1.0, jaro("a", "a"));
}
#[test]
fn generic_jaro_diff() {
assert_eq!(0.0, generic_jaro(&[1, 2], &[3, 4]));
}
#[test]
fn jaro_diff_one_and_two() {
assert_delta!(0.83, jaro("a", "ab"), 0.01);
}
#[test]
fn jaro_diff_two_and_one() {
assert_delta!(0.83, jaro("ab", "a"), 0.01);
}
#[test]
fn jaro_diff_no_transposition() {
assert_delta!(0.822, jaro("dwayne", "duane"), 0.001);
}
#[test]
fn jaro_diff_with_transposition() {
assert_delta!(0.944, jaro("martha", "marhta"), 0.001);
assert_delta!(0.6, jaro("a jke", "jane a k"), 0.001);
}
#[test]
fn jaro_names() {
assert_delta!(
0.392,
jaro("Friedrich Nietzsche", "Jean-Paul Sartre"),
0.001
);
}
#[test]
fn jaro_winkler_both_empty() {
assert_eq!(1.0, jaro_winkler("", ""));
}
#[test]
fn jaro_winkler_first_empty() {
assert_eq!(0.0, jaro_winkler("", "jaro-winkler"));
}
#[test]
fn jaro_winkler_second_empty() {
assert_eq!(0.0, jaro_winkler("distance", ""));
}
#[test]
fn jaro_winkler_same() {
assert_eq!(1.0, jaro_winkler("Jaro-Winkler", "Jaro-Winkler"));
}
#[test]
fn jaro_winkler_multibyte() {
assert_delta!(0.89, jaro_winkler("testabctest", "testöঙ香test"), 0.001);
assert_delta!(0.89, jaro_winkler("testöঙ香test", "testabctest"), 0.001);
}
#[test]
fn jaro_winkler_diff_short() {
assert_delta!(0.813, jaro_winkler("dixon", "dicksonx"), 0.001);
assert_delta!(0.813, jaro_winkler("dicksonx", "dixon"), 0.001);
}
#[test]
fn jaro_winkler_diff_one_character() {
assert_eq!(0.0, jaro_winkler("a", "b"));
}
#[test]
fn jaro_winkler_same_one_character() {
assert_eq!(1.0, jaro_winkler("a", "a"));
}
#[test]
fn jaro_winkler_diff_no_transposition() {
assert_delta!(0.84, jaro_winkler("dwayne", "duane"), 0.001);
}
#[test]
fn jaro_winkler_diff_with_transposition() {
assert_delta!(0.961, jaro_winkler("martha", "marhta"), 0.001);
assert_delta!(0.6, jaro_winkler("a jke", "jane a k"), 0.001);
}
#[test]
fn jaro_winkler_names() {
assert_delta!(
0.452,
jaro_winkler("Friedrich Nietzsche", "Fran-Paul Sartre"),
0.001
);
}
#[test]
fn jaro_winkler_long_prefix() {
assert_delta!(0.866, jaro_winkler("cheeseburger", "cheese fries"), 0.001);
}
#[test]
fn jaro_winkler_more_names() {
assert_delta!(0.868, jaro_winkler("Thorkel", "Thorgier"), 0.001);
}
#[test]
fn jaro_winkler_length_of_one() {
assert_delta!(0.738, jaro_winkler("Dinsdale", "D"), 0.001);
}
#[test]
fn jaro_winkler_very_long_prefix() {
assert_delta!(
0.98519,
jaro_winkler("thequickbrownfoxjumpedoverx", "thequickbrownfoxjumpedovery")
);
}
}