use std::cmp::min;
use std::ops::Range;
use std::cell::RefCell;
use crate::utils::common_prefix_size;
use crate::utils::Zippable;
const DEFAULT_CAPACITY: usize = 25;
pub struct Jaro {
matches1: RefCell<Vec<bool>>,
matches2: RefCell<Vec<bool>>,
}
impl Jaro {
pub fn new() -> Self {
Self {
matches1: RefCell::new(Vec::with_capacity(DEFAULT_CAPACITY)),
matches2: RefCell::new(Vec::with_capacity(DEFAULT_CAPACITY)),
}
}
pub fn similarity<T: PartialEq + Copy>(&self, slice1: &[T], slice2: &[T]) -> f64 {
match (slice1.len(), slice2.len()) {
(0, 0) => { return 1.0; }
(_, 0) => { return 0.0; }
(0, _) => { return 0.0; }
(_, _) => { }
}
let prefix = common_prefix_size(slice1, slice2);
let mut slice1 = &slice1[prefix..];
let mut slice2 = &slice2[prefix..];
if slice2.len() < slice1.len() {
std::mem::swap(&mut slice1, &mut slice2);
}
let matches1 = &mut *self.matches1.borrow_mut();
let matches2 = &mut *self.matches2.borrow_mut();
matches1.clear();
matches2.clear();
matches1.resize(slice1.len(), false);
matches2.resize(slice2.len(), false);
let mut matches = 0;
let len1 = slice1.len();
let len2 = slice2.len();
let i2_range = max!(1, (len1 + prefix) / 2, (len2 + prefix) / 2) - 1;
let mut i1 = 0;
for (x1, match1) in (slice1, &mut matches1[..]).zip() {
let rng = get_range(i1, i2_range, len2);
if rng.start >= rng.end { continue; }
for (x2, match2) in (&slice2[rng.clone()], &mut matches2[rng]).zip() {
if !*match2 && x1 == x2 {
*match1 = true;
*match2 = true;
matches += 1;
break;
}
}
i1 += 1;
}
if prefix + matches == 0 { return 0.0; }
let mut trans = 0;
if matches != 0 {
let matched1 = (slice1, matches1).zip().filter_map(|(x, m)| some_if(*m, *x));
let matched2 = (slice2, matches2).zip().filter_map(|(x, m)| some_if(*m, *x));
trans = (matched1, matched2).zip()
.filter(|(x1, x2)| x1 != x2)
.count();
}
let matches = (prefix + matches) as f64;
let trans = trans as f64;
let len1 = (prefix + len1) as f64;
let len2 = (prefix + len2) as f64;
(matches/len1 + matches/len2 + ((matches - trans/2.) / matches)) / 3.
}
pub fn rel_dist<T: PartialEq + Copy>(&self, slice1: &[T], slice2: &[T]) -> f64 {
1.0 - self.similarity(slice1, slice2)
}
}
#[inline]
fn get_range(i1: usize, search_range: usize, len2: usize) -> Range<usize> {
let lo = i1 - min(search_range, i1);
let up = min(i1 + search_range + 1, len2);
lo .. up
}
#[inline]
fn some_if<T>(cond: bool, val: T) -> Option<T> {
if cond { Some(val) } else { None }
}
#[cfg(test)]
mod tests {
use super::Jaro;
fn floor3(num: f64) -> f64 {
let p = 10usize.pow(3) as f64;
(num * p).floor() / p
}
#[test]
fn equality() {
let jaro = Jaro::new();
let sample = [
(1., vec![]),
(1., vec![1]),
(1., vec![1, 2]),
(1., vec![1, 2, 3]),
];
for (d, s) in sample.iter() {
assert_eq!(jaro.similarity(s, s), *d);
}
}
#[test]
fn inequality() {
let jaro = Jaro::new();
let sample = [
(0., vec![1], vec![2]),
(0., vec![1, 1], vec![2, 2]),
(0., vec![1, 1, 1], vec![2, 2, 2]),
];
for (d, s1, s2) in sample.iter() {
assert_eq!(jaro.similarity(s1, s2), *d);
}
}
#[test]
fn prefix() {
let jaro = Jaro::new();
let sample = [
(0.916, vec![1, 2, 3, 4], vec![1, 2, 3]),
(0.833, vec![1, 2, 3, 4], vec![1, 2]),
(0.750, vec![1, 2, 3, 4], vec![1]),
];
for (d, s1, s2) in sample.iter() {
assert_eq!(floor3(jaro.similarity(s1, s2)), *d);
assert_eq!(floor3(jaro.similarity(s2, s1)), *d);
}
}
#[test]
fn postfix() {
let jaro = Jaro::new();
let sample = [
(0.916, vec![1, 2, 3, 4], vec![2, 3, 4]),
(0.000, vec![1, 2, 3, 4], vec![3, 4]),
(0.000, vec![1, 2, 3, 4], vec![4]),
];
for (d, s1, s2) in sample.iter() {
assert_eq!(floor3(jaro.similarity(s1, s2)), *d);
assert_eq!(floor3(jaro.similarity(s2, s1)), *d);
}
}
#[test]
fn match_distance() {
let jaro = Jaro::new();
let sample = [
(0.000, vec![1, 2, 3, 4, 5], vec![3, 0, 0, 0, 0]),
(0.466, vec![1, 2, 3, 4, 5], vec![0, 3, 0, 0, 0]),
(0.466, vec![1, 2, 3, 4, 5], vec![0, 0, 3, 0, 0]),
(0.466, vec![1, 2, 3, 4, 5], vec![0, 0, 0, 3, 0]),
(0.000, vec![1, 2, 3, 4, 5], vec![0, 0, 0, 0, 3]),
];
for (d, s1, s2) in sample.iter() {
assert_eq!(floor3(jaro.similarity(s1, s2)), *d);
assert_eq!(floor3(jaro.similarity(s2, s1)), *d);
}
}
#[test]
fn add_del_continuous() {
let jaro = Jaro::new();
let sample = [
(0.933, vec![1, 2, 3, 4], vec![0, 1, 2, 3, 4]),
(0.888, vec![1, 2, 3, 4], vec![0, 0, 1, 2, 3, 4]),
(0.000, vec![1, 2, 3, 4], vec![0, 0, 0, 1, 2, 3, 4]),
(0.000, vec![1, 2, 3, 4], vec![0, 0, 0, 0, 1, 2, 3, 4]),
(0.933, vec![1, 2, 3, 4], vec![1, 2, 0, 3, 4]),
(0.888, vec![1, 2, 3, 4], vec![1, 2, 0, 0, 3, 4]),
(0.595, vec![1, 2, 3, 4], vec![1, 2, 0, 0, 0, 3, 4]),
(0.583, vec![1, 2, 3, 4], vec![1, 2, 0, 0, 0, 0, 3, 4]),
(0.933, vec![1, 2, 3, 4], vec![1, 2, 3, 4, 0]),
(0.888, vec![1, 2, 3, 4], vec![1, 2, 3, 4, 0, 0]),
(0.857, vec![1, 2, 3, 4], vec![1, 2, 3, 4, 0, 0, 0]),
(0.833, vec![1, 2, 3, 4], vec![1, 2, 3, 4, 0, 0, 0, 0]),
];
for (d, s1, s2) in sample.iter() {
assert_eq!(floor3(jaro.similarity(s1, s2)), *d);
assert_eq!(floor3(jaro.similarity(s2, s1)), *d);
}
}
#[test]
fn sub_continuous() {
let jaro = Jaro::new();
let sample = [
(0.833, vec![1, 2, 3, 4], vec![1, 2, 3, 0]),
(0.666, vec![1, 2, 3, 4], vec![1, 2, 0, 0]),
(0.500, vec![1, 2, 3, 4], vec![1, 0, 0, 0]),
(0.000, vec![1, 2, 3, 4], vec![0, 0, 0, 0]),
(0.500, vec![1, 2, 3, 4], vec![0, 0, 0, 4]),
(0.666, vec![1, 2, 3, 4], vec![0, 0, 3, 4]),
(0.833, vec![1, 2, 3, 4], vec![0, 2, 3, 4]),
];
for (d, s1, s2) in sample.iter() {
assert_eq!(floor3(jaro.similarity(s1, s2)), *d);
assert_eq!(floor3(jaro.similarity(s2, s1)), *d);
}
}
#[test]
fn add_del_intermittent() {
let jaro = Jaro::new();
let sample = [
(0.933, vec![1, 2, 3, 4], vec![1, 2, 3, 4, 0]),
(0.888, vec![1, 2, 3, 4], vec![1, 2, 3, 0, 4, 0]),
(0.857, vec![1, 2, 3, 4], vec![1, 2, 0, 3, 0, 4, 0]),
(0.833, vec![1, 2, 3, 4], vec![1, 0, 2, 0, 3, 0, 4, 0]),
(0.694, vec![1, 2, 3, 4], vec![0, 1, 0, 2, 0, 3, 0, 4, 0]),
(0.708, vec![1, 2, 3, 4], vec![0, 1, 0, 2, 0, 3, 0, 4]),
(0.595, vec![1, 2, 3, 4], vec![0, 1, 0, 2, 0, 3, 4]),
(0.888, vec![1, 2, 3, 4], vec![0, 1, 0, 2, 3, 4]),
(0.933, vec![1, 2, 3, 4], vec![0, 1, 2, 3, 4]),
];
for (d, s1, s2) in sample.iter() {
assert_eq!(floor3(jaro.similarity(s1, s2)), *d);
assert_eq!(floor3(jaro.similarity(s2, s1)), *d);
}
}
#[test]
fn sub_intermittent() {
let jaro = Jaro::new();
let sample = [
(0.833, vec![1, 2, 3, 4], vec![1, 2, 3, 0]),
(0.666, vec![1, 2, 3, 4], vec![1, 0, 3, 0]),
(0.833, vec![1, 2, 3, 4], vec![0, 2, 3, 4]),
(0.666, vec![1, 2, 3, 4], vec![0, 2, 0, 4]),
];
for (d, s1, s2) in sample.iter() {
assert_eq!(floor3(jaro.similarity(s1, s2)), *d);
assert_eq!(floor3(jaro.similarity(s2, s1)), *d);
}
}
#[test]
fn transpose() {
let jaro = Jaro::new();
let sample = [
(0.916, vec![1, 2, 3, 4], vec![2, 1, 3, 4]),
(0.916, vec![1, 2, 3, 4], vec![1, 2, 4, 3]),
(0.833, vec![1, 2, 3, 4], vec![2, 1, 4, 3]),
];
for (d, s1, s2) in sample.iter() {
assert_eq!(floor3(jaro.similarity(s1, s2)), *d);
assert_eq!(floor3(jaro.similarity(s2, s1)), *d);
}
}
#[test]
fn rel_dist() {
let jaro = Jaro::new();
let sample = [
(0.000, vec![], vec![]),
(1.000, vec![1, 2, 3, 4], vec![]),
(0.166, vec![1, 2, 3, 4], vec![1, 2]),
(0.083, vec![1, 2, 3, 4], vec![2, 3, 4]),
(0.500, vec![1, 2, 3, 4], vec![0, 0, 3, 0]),
(0.111, vec![1, 2, 3, 4], vec![0, 0, 1, 2, 3, 4]),
(0.333, vec![1, 2, 3, 4], vec![1, 2, 0, 0]),
(0.166, vec![1, 2, 3, 4], vec![2, 1, 4, 3]),
(0.500, vec![1, 2, 3, 4], vec![4, 3, 2, 1]),
];
for (d, s1, s2) in sample.iter() {
assert_eq!(floor3(jaro.rel_dist(s1, s2)), *d);
assert_eq!(floor3(jaro.rel_dist(s2, s1)), *d);
}
}
#[test]
fn growth() {
let jaro = Jaro::new();
for len in (1..1001).step_by(100) {
let mut v1 = Vec::with_capacity(len);
let mut v2 = Vec::with_capacity(len);
v1.resize(len, 1);
v2.resize(len, 2);
assert_eq!(jaro.similarity(&v1, &v1), 1.0);
assert_eq!(jaro.similarity(&v1, &[]), 0.0);
assert_eq!(jaro.similarity(&v1, &v2), 0.0);
}
}
}