use std::cmp::{min, max};
use std::iter::repeat;
use std::ptr;
pub fn jaro(a: &str, b: &str) -> f64 {
if a == b {
return 1.0;
}
if a.len() == 0 || b.len() == 0 {
return 0.0;
}
let a = a.to_uppercase();
let b = b.to_uppercase();
let search_range = max(0, (max(a.len(), b.len()) / 2) - 1);
let mut b_consumed: Vec<bool> = repeat(false).take(b.len()).collect::<Vec<bool>>();
let mut matches = 0.0;
let mut transpositions = 0.0;
let mut b_match_index = 0;
for (i, a_char) in a.chars().enumerate() {
let min_bound = if i > search_range {
max(0, i - search_range)
} else {
0
};
let max_bound = min(b.len() - 1, i + search_range);
if min_bound > max_bound {
continue;
}
for (j, b_char) in b.chars().enumerate() {
if (min_bound <= j && j <= max_bound) && (a_char == b_char && !b_consumed[j]) {
b_consumed[j] = true;
matches += 1.0;
if j < b_match_index {
transpositions += 1.0;
}
b_match_index = j;
break;
}
}
}
if matches == 0.0 {
0.0
} else {
(1.0 / 3.0) *
((matches / a.len() as f64) + (matches / b.len() as f64) +
((matches - transpositions) / matches))
}
}
pub fn jaro_winkler<T:ToString + ?Sized>(a: &T, b: &T) -> f64 {
let a = a.to_string();
let b = b.to_string();
let jaro_distance = jaro(&a, &b);
let prefrix_length = a.chars()
.zip(b.chars())
.take_while(|&(a_char, b_char)| a_char == b_char)
.count();
let jaro_winkler_distance = jaro_distance +
(0.1 * prefrix_length as f64 * (1.0 - jaro_distance));
if jaro_winkler_distance <= 1.0 {
jaro_winkler_distance
} else {
1.0
}
}
pub fn levenshtein(a: &str, b: &str) -> usize {
if a == b {
return 0;
} else if a.len() == 0 {
return b.len();
} else if b.len() == 0 {
return a.len();
}
let mut prev_distances: Vec<u16> = (0..(b.len() as u16 + 1)).collect::<Vec<u16>>();
let mut curr_distances: Vec<u16> = repeat(0).take(b.len()+1).collect::<Vec<u16>>();
for (i, a_char) in a.chars().enumerate() {
curr_distances[0] = i as u16 + 1;
for (j, b_char) in b.chars().enumerate() {
let cost = if a_char == b_char {
0
} else {
1
};
curr_distances[j + 1] = min(curr_distances[j] + 1,
min(prev_distances[j + 1] + 1, prev_distances[j] + cost));
}
unsafe {
let slice_ptr = (&curr_distances[..]).as_ptr();
ptr::copy(slice_ptr, prev_distances.as_mut_ptr(), prev_distances.len());
}
}
curr_distances[b.len()] as usize
}
pub fn levenshtein_against_vec(a: &str, v: &[&str]) -> Vec<usize> {
let mut r: Vec<usize> = Vec::with_capacity(v.len());
for b in v.iter() {
r.push(levenshtein(a, b));
}
r
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn levenshtein_empty_string() {
assert_eq!(0, levenshtein("", ""));
let test: String = "hey".to_owned();
assert_eq!(0, levenshtein(&test, &test))
}
#[test]
fn levenshtein_same_string() {
assert_eq!(0, levenshtein("kitten", "kitten"))
}
#[test]
fn levenshtein_diff_short_string() {
assert_eq!(3, levenshtein("kitten", "sitting"))
}
#[test]
fn levenshtein_diff_spaced_sring() {
assert_eq!(5, levenshtein("hello, world", "bye, world"))
}
#[test]
fn levenshtein_diff_long_string() {
let a = "The quick brown fox jumped over the angry dog.";
let b = "Lorem ipsum dolor sit amet, dicta latine an eam.";
assert_eq!(37, levenshtein(a, b))
}
#[test]
fn levenshtein_first_empty() {
assert_eq!(7, levenshtein("", "sitting"))
}
#[test]
fn levenshtein_second_empty() {
assert_eq!(6, levenshtein("kitten", ""))
}
#[test]
fn jaro_empty_string() {
assert!((1.0 - jaro("", "")).abs() < 0.001)
}
#[test]
fn jaro_first_empty() {
assert!((0.0 - jaro("", "jaro")).abs() < 0.001)
}
#[test]
fn jaro_second_empty() {
assert!((0.0 - jaro("distance", "")).abs() < 0.001)
}
#[test]
fn jaro_same() {
assert!((1.0 - jaro("jaro", "jaro")).abs() < 0.001);
}
#[test]
fn jaro_2() {
assert!( (100.0 * jaro("FUCK", "FUKC").abs()) > 90.0 );
assert!( (100.0 * jaro("Fuck", "FUKC").abs()) > 90.0 );
}
#[test]
fn jaro_diff_short() {
assert!((0.767 - jaro("dixon", "dicksonx")).abs() < 0.001)
}
#[test]
fn jaro_diff_no_transposition() {
assert!((0.944 - jaro("martha", "marhta")).abs() < 0.001)
}
#[test]
fn jaro_diff_with_transposition() {
assert!((0.411 - jaro("Friedrich Nietzsche", "Jean-Paul Sartre")).abs() < 0.001)
}
#[test]
fn jaro_1() {
assert!((0.411 - jaro(&"Friedrich Nietzsche".to_owned(), &"Jean-Paul Sartre".to_owned())).abs() < 0.001)
}
#[test]
fn levenshtein_only_strings() {
let vec: Vec<String> = vec!["test".to_owned(), "bibi".to_owned()];
let mut vv: Vec<&str> = Vec::new();
for v in &vec {
vv.push(&v);
}
let test = "test".to_owned();
assert_eq!(levenshtein_against_vec(&test, &vv[..]), [0, 4])
}
}