vec_vp_tree/dist/
string.rs

1// Copyright 2016 Austin Bonander
2//
3// Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
4// http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
5// http://opensource.org/licenses/MIT>, at your option. This file may not be
6// copied, modified, or distributed except according to those terms.
7
8//! Distance functions for strings (`strsim` feature, enabled by default)
9use strsim;
10
11use super::{DistFn, KnownDist};
12
13use std::cmp;
14
15const JARO_SCALE_FACTOR: f64 = 100.;
16
17/// Hamming distance function for strings. Default as the fastest.
18///
19/// Returns the number of characters that are different between the two strings.
20///
21/// If the strings are not the same length, adds the length difference as well.
22#[derive(Copy, Clone)]
23pub struct Hamming;
24
25/// Levenshtein distance (edit-distance) function for strings.
26///
27/// Returns the the minimum number of insertions, deletions, and substitutions required to change
28/// one string into the other.
29#[derive(Copy, Clone)]
30pub struct Levenshtein;
31
32/// Damerau-Levenshtein distance function for strings.
33///
34/// Returns the the minimum number of insertions, deletions, and substitutions required to change
35/// one string into the other, also counts swaps of adjacent characters.
36#[derive(Copy, Clone)]
37pub struct DamerauLevenshtein;
38
39/// Distance function for strings using the Jaro similarity factor.
40///
41/// Returns a number in `[0, 100]`, inversely proportional to the similarity between
42/// the two strings, where 0 is exactly the same, and 100 is not at all similar.
43#[derive(Copy, Clone)]
44pub struct Jaro;
45
46/// Distance function for strings using the Jaro similarity factor,
47/// optimized for when strings are expected to have a common prefix.
48///
49/// Returns a number in `[0, 100]`, inversely proportional to the similarity between
50/// the two strings, where 0 is exactly the same, and 100 is not at all similar.
51#[derive(Copy, Clone)]
52pub struct JaroWinkler;
53
54macro_rules! impl_dist_fn {
55    ($($ty:ty = $distfn:path),*) => (
56        $(
57            impl<'a> DistFn<&'a str> for $ty {
58                fn dist(&self, left: &&'a str, right: &&'a str) -> u64 {
59                    $distfn(left, right) as u64
60                }
61            }
62
63            impl DistFn<String> for $ty {
64                fn dist(&self, left: &String, right: &String) -> u64 {
65                    $distfn(left, right) as u64
66                }
67            }
68        )*
69    )
70}
71
72impl_dist_fn! {
73    Hamming = hamming_dist,
74    Levenshtein = strsim::levenshtein,
75    DamerauLevenshtein = strsim::damerau_levenshtein,
76    Jaro = jaro_factor,
77    JaroWinkler = jaro_winkler_factor
78}
79
80impl<'a> KnownDist for &'a str {
81    /// The fastest distance function for strings.
82    type DistFn = Hamming;
83    fn dist_fn() -> Hamming {
84        Hamming
85    }
86}
87
88impl KnownDist for String {
89    /// The fastest distance function for strings.
90    type DistFn = Hamming;
91    fn dist_fn() -> Hamming {
92        Hamming
93    }
94}
95
96fn hamming_dist(left: &str, right: &str) -> u64 {
97    let len = cmp::min(left.len(), right.len());
98    let diff = cmp::max(left.len(), right.len()) - len;
99
100    (strsim::hamming(&left[..len], &right[..len]).unwrap() + diff) as u64
101}
102
103fn jaro_factor(left: &str, right: &str) -> u64 {
104    strsim::jaro(left, right).mul_add(JARO_SCALE_FACTOR, -JARO_SCALE_FACTOR).round() as u64
105}
106
107fn jaro_winkler_factor(left: &str, right: &str) -> u64 {
108    strsim::jaro_winkler(left, right).mul_add(JARO_SCALE_FACTOR, -JARO_SCALE_FACTOR).round() as u64
109}