1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
//! This module provides the raw algorithms used for string similarity. These can be used directly if raw weights are
//! required between two strings, but most users should prefer the functionality in the crate root.

use sliding_windows::{IterExt, Storage};
use std::collections::HashSet;
use util::round_score_decimal;

const SPACE: char = ' ';

/// Trait defining a single similarity algorithm.
pub trait SimilarityAlgorithm {
    /// Create a new instance of this algorithm. It's recommended to keep a single instance around as long as possible,
    /// as this allows any temporary storage to be recycled without additional allocations.
    fn new() -> Self;

    /// Gets the similarity of a given pair of strings using this algorithm. Two identical strings will
    /// always produce `1.0`, and mismatching length 1 strings produce `0.0`. If either string is empty, `0.0` will be
    /// returned. Implementations within the crate round the weight to 5 decimal places, but other implementations don't
    /// need to do the same.
    fn get_similarity(&mut self, a: &str, b: &str) -> f32;
}

/// Sorensen-Dice coefficient algorithm.
pub struct SorensenDice(Storage<char>);
impl SorensenDice {
    fn str_to_char_set(&mut self, s: &str) -> HashSet<(char, char)> {
        if s.len() < 2 {
            return HashSet::new();
        }
        let windows = s.chars().sliding_windows(&mut self.0);
        windows
            .map(|x| {
                let mut iter = x.iter();
                (iter.next().map(|it| *it), iter.next().map(|it| *it))
            })
            .filter_map(|entry| {
                if let (Some(a), Some(b)) = entry {
                    if a != SPACE && b != SPACE {
                        Some((a, b))
                    } else {
                        None
                    }
                } else {
                    None
                }
            })
            .collect::<HashSet<(char, char)>>()
    }
}
impl SimilarityAlgorithm for SorensenDice {
    fn new() -> SorensenDice {
        SorensenDice(Storage::new(2))
    }

    fn get_similarity(&mut self, a: &str, b: &str) -> f32 {
        if a == b {
            1f32
        } else if a.len() == 1 && b.len() == 1 {
            0f32
        } else if a.len() == 0 || b.len() == 0 {
            0f32
        } else {
            let acs = self.str_to_char_set(a);
            let bcs = self.str_to_char_set(b);

            let union = acs.len() + bcs.len();
            let intersect = acs.intersection(&bcs).count();

            round_score_decimal((2f32 * intersect as f32) / union as f32)
        }
    }
}

/// Levenshtein edit distance algorithm.
// Add a hidden, unused param to prevent direct construction.
pub struct Levenshtein(bool);
impl SimilarityAlgorithm for Levenshtein {
    fn new() -> Levenshtein {
        Levenshtein(false)
    }

    fn get_similarity(&mut self, s: &str, t: &str) -> f32 {
        use std::cmp::{min, max};

        let n = s.len();
        let m = t.len();

        if s == t {
            1f32
        } else if n == 1 && m == 1 {
            0f32
        } else if n == 0 || m == 0 {
            0f32
        } else {
            // For a description of the algorithm, see
            // https://people.cs.pitt.edu/~kirk/cs1501/Pruhs/Spring2006/assignments/editdistance/Levenshtein%20Distance.htm

            // Get character vector for both strings
            let s_chars = s.chars().collect::<Vec<char>>();
            let t_chars = t.chars().collect::<Vec<char>>();

            // Build the matrix
            let mut rows: Vec<Vec<usize>> = Vec::with_capacity(m);
            for i in 0..n + 1 {
                let mut row = Vec::with_capacity(n + 1);
                for j in 0..m + 1 {
                    if i == 0 { 
                        row.push(j);
                    } else if j == 0 {
                        row.push(i);
                    } else {
                        row.push(0);
                    }
                }
                rows.push(row);
            }

            // Iterate over the strings
            for i in 1..n + 1 {
                for j in 1..m + 1 {
                    let cost = if s_chars[i - 1] == t_chars[j - 1] {
                        0
                    } else {
                        1
                    };

                    let above = 1 + rows[i-1][j];
                    let left = 1 + rows[i][j-1];
                    let diag = cost + rows[i-1][j-1];

                    rows[i][j] = min(min(above, left), diag);
                }
            }

            round_score_decimal(1f32 - (rows[n][m] as f32 / max(n, m) as f32))
        }
    }
}

#[cfg(test)]
mod test {
    use super::*;

    // Test to catch sliding_window panics
    #[test]
    fn test_sorensen_bigrams_single_char() {
        assert_eq!(0, SorensenDice::new().str_to_char_set("b").len());
    }

    #[test]
    fn test_sorensen_bigrams_correctness() {
        let mut bigrams = HashSet::new();
        bigrams.insert(('r', 'u'));
        bigrams.insert(('u', 's'));
        bigrams.insert(('s', 't'));
        assert_eq!(bigrams, SorensenDice::new().str_to_char_set("rust"));
    }

    #[test]
    fn test_sorensen_eq_strs() {
        assert_eq!(1f32, SorensenDice::new().get_similarity("string", "string"));
    }

    #[test]
    fn test_sorensen_one_char() {
        assert_eq!(0f32, SorensenDice::new().get_similarity("a", "b"));
    }

    #[test]
    fn test_sorensen_empty_str() {
        assert_eq!(0f32, SorensenDice::new().get_similarity("string", ""));
    }

    // Added due to sliding_window panic when either string is size 1
    #[test]
    fn test_sorensen_one_short() {
        assert_eq!(0f32, SorensenDice::new().get_similarity("rust", "b"))
    }

    #[test]
    fn test_sorensen_correctness() {
        let mut inst = SorensenDice::new();
        assert_eq!(0.66667f32, inst.get_similarity("rust", "bust"));
        assert_eq!(0.0f32, inst.get_similarity("rust", "ritz"));
        assert_eq!(0.72727f32, inst.get_similarity("chance", "enhance"));
    }

    #[test]
    fn test_levenshtein_eq_strs() {
        assert_eq!(1f32, Levenshtein::new().get_similarity("string", "string"));
    }

    #[test]
    fn test_levenshtein_one_char() {
        assert_eq!(0f32, Levenshtein::new().get_similarity("a", "b"));
    }

    #[test]
    fn test_levenshtein_empty_str() {
        assert_eq!(0f32, Levenshtein::new().get_similarity("string", ""));
    }

    #[test]
    fn test_levenshtein_correctness() {
        let mut inst = Levenshtein::new();
        assert_eq!(0.75f32, inst.get_similarity("rust", "bust"));
        assert_eq!(0.25f32, inst.get_similarity("rust", "ritz"));
        assert_eq!(0.71429f32, inst.get_similarity("chance", "enhance"));
    }

    #[cfg(feature = "nightly")]
    mod bench {
        use super::super::*;
        use test::Bencher;

        fn get_words_contents() -> String {
            use std::fs::File;
            use std::io::prelude::*;
            // Load the words file.
            let path = concat!(env!("CARGO_MANIFEST_DIR"), "/words_alpha.txt");
            let mut contents = String::new();
            {
                let mut f = File::open(path).expect("Unable to load words_alpha.txt");
                f.read_to_string(&mut contents)
                    .expect("Unable to read words_alpha.txt");
            }
            contents
        }

        fn get_words(contents: &str) -> Vec<&str> {
            contents
                .lines()
                .map(|it| it.trim_right_matches("\n").trim_right_matches("\r"))
                .collect()
        }

        #[bench]
        fn bench_sorensen_pairs(bench: &mut Bencher) {
            let contents = get_words_contents();
            let words = get_words(&contents);
            let words_iter = words.iter();

            bench.iter(|| {
                let mut inst = SorensenDice::new();
                words_iter
                    .clone()
                    .map(|word| inst.str_to_char_set(word).len())
                    .collect::<Vec<_>>()
            });
        }

        #[bench]
        fn bench_complete_sorensen(bench: &mut Bencher) {
            let contents = get_words_contents();
            let words = get_words(&contents);
            let words_iter = words.iter();

            bench.iter(|| {
                let mut inst = SorensenDice::new();
                words_iter
                    .clone()
                    .map(|word| inst.get_similarity("rust", word))
                    .collect::<Vec<f32>>()
            });
        }

        #[bench]
        fn bench_complete_levenshtein(bench: &mut Bencher) {
            let contents = get_words_contents();
            let words = get_words(&contents);
            let words_iter = words.iter();

            bench.iter(|| {
                let mut inst = Levenshtein::new();
                words_iter
                    .clone()
                    .map(|word| inst.get_similarity("rust", word))
                    .collect::<Vec<f32>>()
            });
        }
    }
}