use hashbrown::HashSet;
use itertools::Itertools;
use ndarray::{Array, ShapeBuilder};
use std::cmp::{max, min};
#[allow(clippy::many_single_char_names)]
pub fn edit_distance(x: &str, y: &str, substitution_cost: usize, transpositions: bool) -> f64 {
let x_len = x.chars().count();
let y_len = y.chars().count();
let mut lev = Array::<i32, _>::zeros((x_len + 1, y_len + 1).f());
for idx in 1..=x_len {
lev[[idx, 0]] = idx as i32
}
for idx in 1..=y_len {
lev[[0, idx]] = idx as i32
}
for (x_idx, c1) in x.chars().enumerate() {
for (y_idx, c2) in y.chars().enumerate() {
let a = lev[[x_idx, y_idx + 1]] + 1;
let b = lev[[x_idx + 1, y_idx]] + 1;
let mut c = lev[[x_idx, y_idx]];
if c1 != c2 {
c += substitution_cost as i32;
}
c = min(min(a, b), c);
if transpositions & (x_idx > 1) & (y_idx > 1) {
if (x.chars().nth(x_idx - 1).unwrap() == c2)
& (y.chars().nth(y_idx - 1).unwrap() == c1)
{
c = min(c, lev[[x_idx - 1, y_idx - 1]] + 1);
}
}
lev[[x_idx + 1, y_idx + 1]] = c;
}
}
f64::from(lev[[x_len, y_len]])
}
pub fn dice_similarity(x: &str, y: &str) -> f64 {
if (x.len() < 2) | (y.len() < 2) {
0.0
} else if x == y {
1.0
} else {
let mut x_set: HashSet<(char, char)> = HashSet::with_capacity(5);
for (char_1, char_2) in x.chars().tuple_windows() {
x_set.insert((char_1, char_2));
}
let mut y_set: HashSet<(char, char)> = HashSet::with_capacity(x_set.len());
for (char_1, char_2) in y.chars().tuple_windows() {
y_set.insert((char_1, char_2));
}
let intersection_len = x_set.intersection(&y_set).count();
(2 * intersection_len) as f64 / (x_set.len() + y_set.len()) as f64
}
}
pub fn jaro_similarity(x: &str, y: &str) -> f64 {
let x_chars: Vec<char> = x.chars().collect::<Vec<char>>();
let y_chars: Vec<char> = y.chars().collect::<Vec<char>>();
let x_len = x_chars.len();
let y_len = y_chars.len();
let match_bound = max(x_len, y_len);
let mut transpositions = 0;
let mut flagged_1: Vec<usize> = Vec::with_capacity(5);
let mut flagged_2: Vec<usize> = Vec::with_capacity(5);
for (x_idx, x_char) in x_chars.iter().enumerate() {
let upperbound = min(x_idx + match_bound, y_len - 1);
let lowerbound = max(0, x_idx as i32 - match_bound as i32) as usize;
for (j, y_char_el) in y_chars
.iter()
.enumerate()
.take(upperbound + 1)
.skip(lowerbound)
{
if (x_char == y_char_el) & !flagged_2.contains(&j) {
flagged_1.push(x_idx);
flagged_2.push(j);
break;
}
}
}
flagged_2.sort_unstable();
let matches = flagged_1.len();
if matches == 0 {
return 0.0;
}
for (i, j) in flagged_1.iter().zip(flagged_2.iter()) {
if x_chars[*i] != y_chars[*j] {
transpositions += 1
}
}
(matches as f64 / x_len as f64
+ matches as f64 / y_len as f64
+ (matches as f64 - f64::from(transpositions / 2)) / matches as f64)
/ 3.0
}
pub fn jaro_winkler_similarity(x: &str, y: &str, p: f64, max_l: usize) -> f64 {
if (p * max_l as f64 <= 0.0) | (p * max_l as f64 >= 1.0) {
panic!("{} not in (0, 1)!", p * max_l as f64)
}
let jaro_sim = jaro_similarity(x, y);
let mut l = 0;
for (s1_i, s2_i) in x.chars().zip(y.chars()) {
if s1_i == s2_i {
l += 1;
if l == max_l {
break;
}
} else {
break;
}
}
jaro_sim + (l as f64 * p * (1.0 - jaro_sim))
}
#[cfg(test)]
mod tests {
use crate::metrics::string::*;
use float_cmp::ApproxEqUlps;
#[test]
fn test_dice_similarity() {
let res = dice_similarity("yesterday", "today");
assert!(((res * 100.).round() / 100.).approx_eq_ulps(&0.33, 2));
assert!(dice_similarity("healed", "sealed").approx_eq_ulps(&0.8, 2));
assert!(dice_similarity("", "").approx_eq_ulps(&0.0, 2));
assert!(dice_similarity("1", "test").approx_eq_ulps(&0.0, 2));
assert!(dice_similarity("test", "test").approx_eq_ulps(&1.0, 2));
}
#[test]
fn test_jaro_similarity() {
let res = jaro_similarity("AABABCAAAC", "ABAACBAAAC");
assert!(((res * 1000.).round() / 1000.).approx_eq_ulps(&0.933, 2));
let res = jaro_similarity("SHACKLEFORD", "SHACKELFORD");
assert!(((res * 1000.).round() / 1000.).approx_eq_ulps(&0.970, 2));
assert!(jaro_similarity("", "").approx_eq_ulps(&0.0, 2));
assert!(jaro_similarity("1", "2").approx_eq_ulps(&0.0, 2));
assert!(jaro_similarity("test", "test").approx_eq_ulps(&1.0, 2));
}
#[test]
fn test_jaro_winkler_similarity() {
let res = jaro_winkler_similarity("SHACKLEFORD", "SHACKELFORD", 0.1, 4);
assert!(((res * 1000.).round() / 1000.).approx_eq_ulps(&0.982, 2));
assert!(jaro_winkler_similarity("", "", 0.1, 4).approx_eq_ulps(&0.0, 2));
assert!(jaro_winkler_similarity("1", "2", 0.1, 4).approx_eq_ulps(&0.0, 2));
assert!(jaro_winkler_similarity("test", "test", 0.1, 4).approx_eq_ulps(&1.0, 2));
}
#[test]
#[should_panic]
fn test_jaro_winkler_similarity_invalid() {
jaro_winkler_similarity("AABABCAAAC", "ABAACBAAAC", 0.5, 4);
}
#[test]
fn test_edit_distance() {
let res = edit_distance("yesterday", "today", 1, false);
assert!(res.approx_eq_ulps(&5.0, 2));
}
}