use crate::slice::Jaro;
/// Upper bound on the number of leading element pairs that
/// `JaroWinkler::similarity` inspects when counting the common prefix.
const MAX_PREFIX: usize = 4;
/// Scaling factor a scorer created with `JaroWinkler::new` starts with.
const DEFAULT_SCALING: f64 = 0.1;
/// Jaro-Winkler similarity scorer for slices.
///
/// The score is the Jaro similarity of the two slices, raised in proportion
/// to the length of their common prefix.
pub struct JaroWinkler {
/// Weight applied per matching prefix element when boosting the Jaro score.
scaling: f64,
/// Underlying Jaro scorer that supplies the base similarity.
jaro: Jaro,
}
impl JaroWinkler {
    /// Creates a scorer that uses `DEFAULT_SCALING` as its scaling factor.
    pub fn new() -> JaroWinkler {
        JaroWinkler {
            scaling: DEFAULT_SCALING,
            jaro: Jaro::new(),
        }
    }

    /// Sets the weight given to each matching prefix element.
    ///
    /// The accepted range is `0.0..=0.25`. At most `MAX_PREFIX` (4) prefix
    /// elements are counted, so a factor above 0.25 could make the prefix
    /// bonus exceed the remaining distance to a perfect score.
    ///
    /// # Panics
    ///
    /// Panics if `scaling` is NaN, greater than 0.25, or less than 0.0.
    pub fn set_scaling(&mut self, scaling: f64) {
        // NaN compares false against both bounds below, so without this
        // check it would be stored and turn every later score into NaN.
        if scaling.is_nan() {
            panic!("Scaling factor should not be NaN");
        }
        if scaling > 0.25 {
            panic!("Scaling factor should not be greater than 0.25");
        }
        if scaling < 0.0 {
            panic!("Scaling factor should not be less than 0.0");
        }
        self.scaling = scaling;
    }

    /// Returns the Jaro-Winkler similarity of `slice1` and `slice2`.
    ///
    /// The result is `jaro + prefix * scaling * (1 - jaro)`, where `jaro` is
    /// the score reported by the inner Jaro scorer and `prefix` is the number
    /// of leading positions (at most `MAX_PREFIX`) at which both slices hold
    /// equal elements. A Jaro score of exactly 0 is returned unchanged.
    pub fn similarity<T: Copy + PartialEq>(&self, slice1: &[T], slice2: &[T]) -> f64 {
        let jaro_sim = self.jaro.similarity(slice1, slice2);
        if jaro_sim == 0. {
            return 0.;
        }

        // Length of the shared prefix, capped at MAX_PREFIX elements.
        let prefix_size = slice1
            .iter()
            .zip(slice2.iter())
            .take(MAX_PREFIX)
            .take_while(|(x1, x2)| x1 == x2)
            .count() as f64;

        jaro_sim + prefix_size * self.scaling * (1. - jaro_sim)
    }

    /// Returns the relative distance between the slices, defined as
    /// `1.0 - similarity(slice1, slice2)`.
    pub fn rel_dist<T: Copy + PartialEq>(&self, slice1: &[T], slice2: &[T]) -> f64 {
        1.0 - self.similarity(slice1, slice2)
    }
}
#[cfg(test)]
mod tests {
    use super::JaroWinkler;

    /// Reference sequence most cases are compared against.
    const BASE: [i32; 4] = [1, 2, 3, 4];

    /// Truncates `num` to three decimal places (rounding toward negative infinity).
    fn floor3(num: f64) -> f64 {
        let scale = 1000_f64;
        (num * scale).floor() / scale
    }

    /// Asserts that `similarity(a, b)` equals `expected` exactly (no truncation).
    fn assert_exact(scorer: &JaroWinkler, expected: f64, a: &[i32], b: &[i32]) {
        assert_eq!(scorer.similarity(a, b), expected);
    }

    /// Asserts that the similarity, truncated to three decimals, equals
    /// `expected` for both argument orders.
    fn assert_symmetric(scorer: &JaroWinkler, expected: f64, a: &[i32], b: &[i32]) {
        assert_eq!(floor3(scorer.similarity(a, b)), expected);
        assert_eq!(floor3(scorer.similarity(b, a)), expected);
    }

    /// Asserts that the relative distance, truncated to three decimals, equals
    /// `expected` for both argument orders.
    fn assert_symmetric_dist(scorer: &JaroWinkler, expected: f64, a: &[i32], b: &[i32]) {
        assert_eq!(floor3(scorer.rel_dist(a, b)), expected);
        assert_eq!(floor3(scorer.rel_dist(b, a)), expected);
    }

    /// A sequence compared with itself scores exactly 1, including the empty one.
    #[test]
    fn equality() {
        let jw = JaroWinkler::new();
        assert_exact(&jw, 1., &[], &[]);
        assert_exact(&jw, 1., &[1], &[1]);
        assert_exact(&jw, 1., &[1, 2], &[1, 2]);
        assert_exact(&jw, 1., &[1, 2, 3], &[1, 2, 3]);
    }

    /// Sequences with no element in common score exactly 0.
    #[test]
    fn inequality() {
        let jw = JaroWinkler::new();
        assert_exact(&jw, 0., &[1], &[2]);
        assert_exact(&jw, 0., &[1, 1], &[2, 2]);
        assert_exact(&jw, 0., &[1, 1, 1], &[2, 2, 2]);
    }

    /// The second sequence is a proper prefix of the first.
    #[test]
    fn prefix() {
        let jw = JaroWinkler::new();
        assert_symmetric(&jw, 0.941, &BASE, &[1, 2, 3]);
        assert_symmetric(&jw, 0.866, &BASE, &[1, 2]);
        assert_symmetric(&jw, 0.775, &BASE, &[1]);
    }

    /// The second sequence is a proper suffix of the first.
    #[test]
    fn postfix() {
        let jw = JaroWinkler::new();
        assert_symmetric(&jw, 0.916, &BASE, &[2, 3, 4]);
        assert_symmetric(&jw, 0.000, &BASE, &[3, 4]);
        assert_symmetric(&jw, 0.000, &BASE, &[4]);
    }

    /// A single shared element placed at each possible offset.
    #[test]
    fn match_distance() {
        let jw = JaroWinkler::new();
        let five = [1, 2, 3, 4, 5];
        assert_symmetric(&jw, 0.000, &five, &[3, 0, 0, 0, 0]);
        assert_symmetric(&jw, 0.466, &five, &[0, 3, 0, 0, 0]);
        assert_symmetric(&jw, 0.466, &five, &[0, 0, 3, 0, 0]);
        assert_symmetric(&jw, 0.466, &five, &[0, 0, 0, 3, 0]);
        assert_symmetric(&jw, 0.000, &five, &[0, 0, 0, 0, 3]);
    }

    /// Contiguous runs of extra elements at the front, middle and back.
    #[test]
    fn add_del_continuous() {
        let jw = JaroWinkler::new();
        // Run inserted at the front.
        assert_symmetric(&jw, 0.933, &BASE, &[0, 1, 2, 3, 4]);
        assert_symmetric(&jw, 0.888, &BASE, &[0, 0, 1, 2, 3, 4]);
        assert_symmetric(&jw, 0.000, &BASE, &[0, 0, 0, 1, 2, 3, 4]);
        assert_symmetric(&jw, 0.000, &BASE, &[0, 0, 0, 0, 1, 2, 3, 4]);
        // Run inserted in the middle.
        assert_symmetric(&jw, 0.946, &BASE, &[1, 2, 0, 3, 4]);
        assert_symmetric(&jw, 0.911, &BASE, &[1, 2, 0, 0, 3, 4]);
        assert_symmetric(&jw, 0.676, &BASE, &[1, 2, 0, 0, 0, 3, 4]);
        assert_symmetric(&jw, 0.666, &BASE, &[1, 2, 0, 0, 0, 0, 3, 4]);
        // Run appended at the back.
        assert_symmetric(&jw, 0.960, &BASE, &[1, 2, 3, 4, 0]);
        assert_symmetric(&jw, 0.933, &BASE, &[1, 2, 3, 4, 0, 0]);
        assert_symmetric(&jw, 0.914, &BASE, &[1, 2, 3, 4, 0, 0, 0]);
        assert_symmetric(&jw, 0.900, &BASE, &[1, 2, 3, 4, 0, 0, 0, 0]);
    }

    /// Contiguous runs of elements replaced from the back and from the front.
    #[test]
    fn sub_continuous() {
        let jw = JaroWinkler::new();
        assert_symmetric(&jw, 0.883, &BASE, &[1, 2, 3, 0]);
        assert_symmetric(&jw, 0.733, &BASE, &[1, 2, 0, 0]);
        assert_symmetric(&jw, 0.550, &BASE, &[1, 0, 0, 0]);
        assert_symmetric(&jw, 0.000, &BASE, &[0, 0, 0, 0]);
        assert_symmetric(&jw, 0.500, &BASE, &[0, 0, 0, 4]);
        assert_symmetric(&jw, 0.666, &BASE, &[0, 0, 3, 4]);
        assert_symmetric(&jw, 0.833, &BASE, &[0, 2, 3, 4]);
    }

    /// Extra elements interleaved between the original ones.
    #[test]
    fn add_del_intermittent() {
        let jw = JaroWinkler::new();
        assert_symmetric(&jw, 0.960, &BASE, &[1, 2, 3, 4, 0]);
        assert_symmetric(&jw, 0.922, &BASE, &[1, 2, 3, 0, 4, 0]);
        assert_symmetric(&jw, 0.885, &BASE, &[1, 2, 0, 3, 0, 4, 0]);
        assert_symmetric(&jw, 0.850, &BASE, &[1, 0, 2, 0, 3, 0, 4, 0]);
        assert_symmetric(&jw, 0.694, &BASE, &[0, 1, 0, 2, 0, 3, 0, 4, 0]);
        assert_symmetric(&jw, 0.708, &BASE, &[0, 1, 0, 2, 0, 3, 0, 4]);
        assert_symmetric(&jw, 0.595, &BASE, &[0, 1, 0, 2, 0, 3, 4]);
        assert_symmetric(&jw, 0.888, &BASE, &[0, 1, 0, 2, 3, 4]);
        assert_symmetric(&jw, 0.933, &BASE, &[0, 1, 2, 3, 4]);
    }

    /// Non-adjacent elements replaced.
    #[test]
    fn sub_intermittent() {
        let jw = JaroWinkler::new();
        assert_symmetric(&jw, 0.883, &BASE, &[1, 2, 3, 0]);
        assert_symmetric(&jw, 0.700, &BASE, &[1, 0, 3, 0]);
        assert_symmetric(&jw, 0.833, &BASE, &[0, 2, 3, 4]);
        assert_symmetric(&jw, 0.666, &BASE, &[0, 2, 0, 4]);
    }

    /// Adjacent elements swapped.
    #[test]
    fn transpose() {
        let jw = JaroWinkler::new();
        assert_symmetric(&jw, 0.916, &BASE, &[2, 1, 3, 4]);
        assert_symmetric(&jw, 0.933, &BASE, &[1, 2, 4, 3]);
        assert_symmetric(&jw, 0.833, &BASE, &[2, 1, 4, 3]);
    }

    /// Relative distance for a mix of the cases above.
    #[test]
    fn rel_dist() {
        let jw = JaroWinkler::new();
        assert_symmetric_dist(&jw, 0.000, &[], &[]);
        assert_symmetric_dist(&jw, 1.000, &BASE, &[]);
        assert_symmetric_dist(&jw, 0.133, &BASE, &[1, 2]);
        assert_symmetric_dist(&jw, 0.083, &BASE, &[2, 3, 4]);
        assert_symmetric_dist(&jw, 0.500, &BASE, &[0, 0, 3, 0]);
        assert_symmetric_dist(&jw, 0.111, &BASE, &[0, 0, 1, 2, 3, 4]);
        assert_symmetric_dist(&jw, 0.266, &BASE, &[1, 2, 0, 0]);
        assert_symmetric_dist(&jw, 0.166, &BASE, &[2, 1, 4, 3]);
        assert_symmetric_dist(&jw, 0.500, &BASE, &[4, 3, 2, 1]);
    }

    /// Exact 1.0 / 0.0 results hold for inputs from 1 to 901 elements long.
    #[test]
    fn growth() {
        let jw = JaroWinkler::new();
        for len in (1..=1000).step_by(100) {
            let ones = vec![1; len];
            let twos = vec![2; len];
            assert_eq!(jw.similarity(&ones, &ones), 1.0);
            assert_eq!(jw.similarity(&ones, &[]), 0.0);
            assert_eq!(jw.similarity(&ones, &twos), 0.0);
        }
    }
}