vec_vp_tree/dist/string.rs
1// Copyright 2016 Austin Bonander
2//
3// Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
4// http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
5// http://opensource.org/licenses/MIT>, at your option. This file may not be
6// copied, modified, or distributed except according to those terms.
7
8//! Distance functions for strings (`strsim` feature, enabled by default)
9use strsim;
10
11use super::{DistFn, KnownDist};
12
13use std::cmp;
14
15const JARO_SCALE_FACTOR: f64 = 100.;
16
17/// Hamming distance function for strings. Default as the fastest.
18///
19/// Returns the number of characters that are different between the two strings.
20///
21/// If the strings are not the same length, adds the length difference as well.
22#[derive(Copy, Clone)]
23pub struct Hamming;
24
25/// Levenshtein distance (edit-distance) function for strings.
26///
27/// Returns the the minimum number of insertions, deletions, and substitutions required to change
28/// one string into the other.
29#[derive(Copy, Clone)]
30pub struct Levenshtein;
31
32/// Damerau-Levenshtein distance function for strings.
33///
34/// Returns the the minimum number of insertions, deletions, and substitutions required to change
35/// one string into the other, also counts swaps of adjacent characters.
36#[derive(Copy, Clone)]
37pub struct DamerauLevenshtein;
38
39/// Distance function for strings using the Jaro similarity factor.
40///
41/// Returns a number in `[0, 100]`, inversely proportional to the similarity between
42/// the two strings, where 0 is exactly the same, and 100 is not at all similar.
43#[derive(Copy, Clone)]
44pub struct Jaro;
45
46/// Distance function for strings using the Jaro similarity factor,
47/// optimized for when strings are expected to have a common prefix.
48///
49/// Returns a number in `[0, 100]`, inversely proportional to the similarity between
50/// the two strings, where 0 is exactly the same, and 100 is not at all similar.
51#[derive(Copy, Clone)]
52pub struct JaroWinkler;
53
54macro_rules! impl_dist_fn {
55 ($($ty:ty = $distfn:path),*) => (
56 $(
57 impl<'a> DistFn<&'a str> for $ty {
58 fn dist(&self, left: &&'a str, right: &&'a str) -> u64 {
59 $distfn(left, right) as u64
60 }
61 }
62
63 impl DistFn<String> for $ty {
64 fn dist(&self, left: &String, right: &String) -> u64 {
65 $distfn(left, right) as u64
66 }
67 }
68 )*
69 )
70}
71
72impl_dist_fn! {
73 Hamming = hamming_dist,
74 Levenshtein = strsim::levenshtein,
75 DamerauLevenshtein = strsim::damerau_levenshtein,
76 Jaro = jaro_factor,
77 JaroWinkler = jaro_winkler_factor
78}
79
80impl<'a> KnownDist for &'a str {
81 /// The fastest distance function for strings.
82 type DistFn = Hamming;
83 fn dist_fn() -> Hamming {
84 Hamming
85 }
86}
87
88impl KnownDist for String {
89 /// The fastest distance function for strings.
90 type DistFn = Hamming;
91 fn dist_fn() -> Hamming {
92 Hamming
93 }
94}
95
96fn hamming_dist(left: &str, right: &str) -> u64 {
97 let len = cmp::min(left.len(), right.len());
98 let diff = cmp::max(left.len(), right.len()) - len;
99
100 (strsim::hamming(&left[..len], &right[..len]).unwrap() + diff) as u64
101}
102
103fn jaro_factor(left: &str, right: &str) -> u64 {
104 strsim::jaro(left, right).mul_add(JARO_SCALE_FACTOR, -JARO_SCALE_FACTOR).round() as u64
105}
106
107fn jaro_winkler_factor(left: &str, right: &str) -> u64 {
108 strsim::jaro_winkler(left, right).mul_add(JARO_SCALE_FACTOR, -JARO_SCALE_FACTOR).round() as u64
109}