fuzzy_string_distance/
lib.rs

1//! Fuzzy string comparisons using [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance)
2//!
3//! Whereas simple string comparison is very sensitive to typos, Levenshtein Distance gives the minimum number of single-character edits (insertions, deletions or substitutions) required to change one word into the other. This gives us a sliding scale between 0 (strings are identical) and the length of the longer string (strings are unrelated), which can be used for fuzzy comparisons that can be resilient to typos, minor mistakes, and inconsistent spelling.
4//!
5//! ```
6//! use fuzzy_string_distance::levenshtein_distance;
7//! assert_eq!(1, levenshtein_distance(&"rust", &"rusty")); // insert y
8//! assert_eq!(3, levenshtein_distance(&"bug", &"")); // delete all characters
9//! assert_eq!(2, levenshtein_distance(&"typography", &"typpgrapy")); // fix both typos
10//! ```
11//!
12
13/// Returns the minimum number of single character insertions, deletions or substitutions
14/// required to convert the source string to the target string, known as the Levenshtein distance.
15///
16/// This is like a fuzzy [Eq], where a distance of 0 means the strings are equal
17/// and the distance can be up to the length of the longer string if they are completely unrelated.
18///
19/// See also:
20/// - [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance)
21///
22/// Note, this compares strings on a unicode scalar value basis, as per [str::chars]. While
23/// this comparison is less likely to cut a 'character' in two than a byte by byte basis, it
24/// still does not compare grapheme clusters.
25pub fn levenshtein_distance(source: &str, target: &str) -> usize {
26    // If either input is empty then the shortest transformation is all deletions or insertions
27    // from/to an empty string, which will be equal to the number of characters in the other input
28    // This check also guards against any index out of bounds issues in the main implementation
29    let target_chars = target.chars().count();
30    let source_chars = source.chars().count();
31    if source.is_empty() {
32        return target_chars;
33    }
34    if target.is_empty() {
35        return source_chars;
36    }
37
38    // We'll have a matrix A of `source` length + 1 rows and `target` length + 1 columns
39    // This stores the edit distances for prefixes of source and target from the empty string
40    // through to the entire inputs.
41    // A[0, 0] is therefore "" to "" which is 0, and A[source length + 1, target length + 1] is
42    // the edit distance from source to target.
43    // We only need to store two rows at a time so we never construct this matrix.
44
45    let mut edit_distances = vec![0; target_chars + 1];
46    // First row of edit distances are converting an empty string `source` to prefixes of length 0
47    // to the entire `target`, "" to "" is 0 edits, "" to one character is one insertion, and
48    // so on through to the entire target string.
49    for (i, x) in edit_distances.iter_mut().enumerate() {
50        *x = i;
51    }
52
53    for i in 0..source_chars {
54        // Step through each subsequent row of the matrix of edit distances, each time looking at
55        // a prefix of `source` one character longer
56        let mut new_edit_distances = vec![0; target_chars + 1];
57        // We're on the i+1 prefix of characters in `source`, so converting this to an empty string
58        // (the 0 character prefix of target) is purely deletions equal to the length of the
59        // source.
60        new_edit_distances[0] = i + 1;
61
62        for j in 0..target_chars {
63            // Step through columns for the prefixes of `target` on this prefix of `source` row.
64            // For a source of "kitten" and a target of "sitting", if we were up to i = 1 and
65            // j = 2 then this would look like a source of "ki" we already have the distance for
66            // converting to "si" and we now need to work out the distance to convert to "sit".
67            // We're now calculating the edit distance for A[i + 1, j + 1]
68
69            // At A[i, j + 1] we have the cost to reach the same `target` prefix with a source
70            // that was one character shorter, so we can delete the extraneous character and the
71            // distance could be 1 greater
72            let deletion = edit_distances[j + 1] + 1;
73            // At A[i + 1, j] we have the cost to reach a shorter `target` prefix with the same
74            // source, so we can insert the extra character and the distance could be 1 greater
75            let insertion = new_edit_distances[j] + 1;
76            // We can unwrap here because we're taking an element from both iterators within
77            // their respective bounds of 0 to source_chars -1 and target_chars - 1
78            let source_char = source.chars().skip(i).next().unwrap();
79            let target_char = target.chars().skip(j).next().unwrap();
80            let substitution = if source_char == target_char {
81                // If the `source` character at i and the `target` character at j match, we
82                // don't need to transform anything
83                edit_distances[j]
84            } else {
85                // Otherwise we can transform the character to match the target, and the distance
86                // could be 1 greater
87                edit_distances[j] + 1
88            };
89
90            // We always pick the cheapest option from the 3 we could do, which populates
91            // A[i + 1, j + 1]
92            new_edit_distances[j + 1] = std::cmp::min(
93                deletion, std::cmp::min(insertion, substitution)
94            );
95        }
96
97        edit_distances = new_edit_distances;
98    }
99    // The distance from `target` to `source` will be the final entry in the array as this
100    // is the full strings of both with no characters ignored.
101    edit_distances[target_chars]
102}
103
104/// Returns the minimum number of single character insertions, deletions or substitutions
105/// required to convert the source string to the target string, known as the Levenshtein distance,
106/// ignoring ASCII case differences.
107///
108/// This is like a fuzzy [eq_ignore_ascii_case](str::eq_ignore_ascii_case), where a distance
109/// of 0 means the strings are equal and the distance can be up to the length of the longer
110/// string if they are completely unrelated.
111///
112/// See also:
113/// - [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance)
114///
115/// Note, this compares strings on a unicode scalar value basis, as per [str::chars]. While
116/// this comparison is less likely to cut a 'character' in two than a byte by byte basis, it
117/// still does not compare grapheme clusters.
118pub fn levenshtein_distance_ignore_ascii_case(source: &str, target: &str) -> usize {
119    levenshtein_distance(&source.to_ascii_lowercase(), &target.to_ascii_lowercase())
120}
121
122/// A modified Levenshtein distance that matches from the source string to an arbitrary substring
123/// of the target string, returning the minimum number of single character insertions, deletions
124/// or substitutions required to convert the source string to match any substring in the target.
125///
126/// This is like a fuzzy [str::contains], where a distance of 0 means the source string is equal
127/// to a substring in the target and the distance can be up to the length of the longer string
128/// if they are completely unrelated.
129///
130/// It can help to think of the source string being a query to search against a list of items each
131/// with a longer target string, short searches that match exactly against part of the target
132/// string will have the minimum number of edits 0, and as a few characters need to be modified to
133/// match against part of the target the distances will increase.
134///
135/// ```
136/// use fuzzy_string_distance::local_levenshtein_distance;
137/// // trivial match to substring
138/// assert_eq!(0, local_levenshtein_distance(&"long", &"A long sentence"));
139/// // local distance is asymmetric, here we have to delete almost all the search term
140/// assert_eq!(11, local_levenshtein_distance(&"A long sentence", &"long"));
141/// ```
142///
143/// See also:
144/// - [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance)
145/// - [Fuzzy Substring Matching: On-device Fuzzy Friend Search at Snapchat](http://arxiv.org/pdf/2211.02767)
146///
147/// Note, this compares strings on a unicode scalar value basis, as per [str::chars]. While
148/// this comparison is less likely to cut a 'character' in two than a byte by byte basis, it
149/// still does not compare grapheme clusters.
150pub fn local_levenshtein_distance(source: &str, target: &str) -> usize {
151    // If either input is empty then the shortest transformation is all deletions or insertions
152    // from/to an empty string.
153    // This check also guards against any index out of bounds issues in the main implementation
154    let target_chars = target.chars().count();
155    let source_chars = source.chars().count();
156    if source.is_empty() {
157        // We can trivially match a 0 length substring in target with no edits
158        return 0;
159    }
160    if target.is_empty() {
161        // We only allow matches against substrings of the target, so we still need to delete all
162        // the source characters if matching against "".
163        return source_chars;
164    }
165
166    // We'll have a matrix A of `source` length + 1 rows and `target` length + 1 columns
167    // This stores the edit distances for prefixes of source and target from the empty string
168    // through to the entire inputs.
169    // A[0, 0] is therefore "" to "" which is 0, and A[source length + 1, target length + 1] is
170    // the edit distance from source to target.
171    // We only need to store two rows at a time so we never construct this matrix.
172
173    let mut edit_distances = vec![0; target_chars + 1];
174    // Unlike in Levenshtein distance, we do not initialise the first row of the edit distances
175    // to non zero values. These distances for converting an empty string `source` to prefixes of
176    // length 0 to the entire `target`, "" to "" are all zero, as we do not want to penalise
177    // starting a match further into the string. If the target was "racecar" and the source was
178    // "car" this should also be 0 edits.
179
180    for i in 0..source_chars {
181        // Step through each subsequent row of the matrix of edit distances, each time looking at
182        // a prefix of `source` one character longer
183        let mut new_edit_distances = vec![0; target_chars + 1];
184        // We're on the i+1 prefix of characters in `source`, so converting this to an empty string
185        // (the 0 character prefix of target) is purely deletions equal to the length of the
186        // source.
187        new_edit_distances[0] = i + 1;
188
189        for j in 0..target_chars {
190            // Step through columns for the prefixes of `target` on this prefix of `source` row.
191            // For a source of "kitten" and a target of "sitting", if we were up to i = 1 and
192            // j = 2 then this would look like a source of "ki" we already have the distance for
193            // converting to "si" and we now need to work out the distance to convert to "sit".
194            // We're now calculating the edit distance for A[i + 1, j + 1]
195
196            // At A[i, j + 1] we have the cost to reach the same `target` prefix with a source
197            // that was one character shorter, so we can delete the extraneous character and the
198            // distance could be 1 greater
199            let deletion = edit_distances[j + 1] + 1;
200            // At A[i + 1, j] we have the cost to reach a shorter `target` prefix with the same
201            // source, so we can insert the extra character and the distance could be 1 greater
202            let insertion = new_edit_distances[j] + 1;
203            // We can unwrap here because we're taking an element from both iterators within
204            // their respective bounds of 0 to source_chars -1 and target_chars - 1
205            let source_char = source.chars().skip(i).next().unwrap();
206            let target_char = target.chars().skip(j).next().unwrap();
207            let substitution = if source_char == target_char {
208                // If the `source` character at i and the `target` character at j match, we
209                // don't need to transform anything
210                edit_distances[j]
211            } else {
212                // Otherwise we can transform the character to match the target, and the distance
213                // could be 1 greater
214                edit_distances[j] + 1
215            };
216
217            // We always pick the cheapest option from the 3 we could do, which populates
218            // A[i + 1, j + 1]
219            new_edit_distances[j + 1] = std::cmp::min(
220                deletion, std::cmp::min(insertion, substitution)
221            );
222        }
223
224        edit_distances = new_edit_distances;
225    }
226    // The distance from `target` to `source` will be the final entry in the array as this
227    // is the full strings of both with no characters ignored. We just want to match against any
228    // substring of target, so by taking the minimum value in the final row we allow arbitrary
229    // suffixes in the target to be ignored for the match. Since we initialised the first row
230    // to 0 to allow arbitrary prefixes of the target to be ignored for the match, together this
231    // returns the smallest edits required to match any substring of target.
232    edit_distances.into_iter().min().unwrap()
233}
234
235/// A modified Levenshtein distance that matches from the source string to an arbitrary substring
236/// of the target string, returning the minimum number of single character insertions, deletions
237/// or substitutions required to convert the source string to match any substring in the target,
238/// ignoring ASCII case differences
239///
240/// This is like a fuzzy [str::contains] that ignores case, where a distance of 0 means the source
241/// string is equal to a substring in the target and the distance can be up to the length of the
242/// longer string if they are completely unrelated.
243///
244/// It can help to think of the source string being a query to search against a list of items each
245/// with a longer target string, short searches that match exactly against part of the target
246/// string will have the minimum number of edits 0, and as a few characters need to be modified to
247/// match against part of the target the distances will increase.
248///
249/// ```
250/// use fuzzy_string_distance::local_levenshtein_distance_ignore_ascii_case;
251/// // trivial match to substring
252/// assert_eq!(0, local_levenshtein_distance_ignore_ascii_case(&"LONG", &"A long sentence"));
253/// // local distance is asymmetric, here we have to delete almost all the search term
254/// assert_eq!(11, local_levenshtein_distance_ignore_ascii_case(&"A long sentence", &"LONG"));
255/// ```
256///
257/// See also:
258/// - [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance)
259/// - [Fuzzy Substring Matching: On-device Fuzzy Friend Search at Snapchat](http://arxiv.org/pdf/2211.02767)
260///
261/// Note, this compares strings on a unicode scalar value basis, as per [str::chars]. While
262/// this comparison is less likely to cut a 'character' in two than a byte by byte basis, it
263/// still does not compare grapheme clusters.
264pub fn local_levenshtein_distance_ignore_ascii_case(source: &str, target: &str) -> usize {
265    local_levenshtein_distance(&source.to_ascii_lowercase(), &target.to_ascii_lowercase())
266}
267
268#[cfg(test)]
269mod tests {
270    use super::*;
271
272    #[test]
273    fn transforming_input() {
274        let kitten = "kitten";
275        let sitting = "sitting";
276        let result = levenshtein_distance(&kitten, &sitting);
277        assert_eq!(result, 3);
278    }
279
280    #[test]
281    fn adding_a_character() {
282        let result = levenshtein_distance(&"rust", &"rusty");
283        assert_eq!(result, 1);
284    }
285
286    #[test]
287    fn removing_characters() {
288        let result = levenshtein_distance(&"ferrisground", &"run");
289        // run is present in the source input, so shortest transformation is removing the other
290        // characters
291        assert_eq!(result, 9);
292    }
293
294    #[test]
295    fn empty_source() {
296        let result = levenshtein_distance(&"", &"rust");
297        assert_eq!(result, 4);
298    }
299
300    #[test]
301    fn empty_target() {
302        let result = levenshtein_distance(&"bug", &"");
303        assert_eq!(result, 3);
304    }
305
306    #[test]
307    fn multiple_transformations() {
308        let result = levenshtein_distance(&"Edit distance", &"Eddy");
309        // Edd already present in input, so can delete all the other characters and insert y,
310        // so 3 edits fewer than the source input
311        assert_eq!(result, 10);
312    }
313
314    #[test]
315    fn unrelated() {
316        let result = levenshtein_distance(&"unrelated", &"SCREAMING");
317        assert_eq!(result, 9);
318    }
319
320    #[test]
321    fn slightly_related_ignoring_case() {
322        let result = levenshtein_distance_ignore_ascii_case(&"unrelated", &"SCREAMING");
323        assert_eq!(result, 7);
324    }
325
326    #[test]
327    fn non_english() {
328        let result = levenshtein_distance(&"El delfín español", &"Dolphin");
329        assert_eq!(result, 15);
330    }
331
332    #[test]
333    fn graphemes() {
334        let result = levenshtein_distance(&"🧑‍🔬", &"🧑");
335        // Split scientist into just person emoji
336        assert_eq!(result, 2);
337    }
338
339    #[test]
340    fn non_english_local() {
341        let result = local_levenshtein_distance(&"Dolphin", &"El delfín español");
342        // delfín -> Dolphin is 5 edits
343        assert_eq!(result, 5);
344        // local distance is asymmetric, search term is going to have to be modified to match
345        // entire target as with non local distance
346        let result = local_levenshtein_distance(&"El delfín español", &"Dolphin");
347        assert_eq!(result, 15);
348    }
349
350    #[test]
351    fn search_term() {
352        let result = local_levenshtein_distance(&"Piñata", &"Pinecone tree");
353        // Pineco -> Piñata is 4 edits
354        assert_eq!(result, 4);
355    }
356
357    #[test]
358    fn no_search() {
359        let result = local_levenshtein_distance(&"", &"A long sentence");
360        // trivial match
361        assert_eq!(result, 0);
362        let result = local_levenshtein_distance(&"A long sentence", &"");
363        // local distance is asymmetric, have to delete entire search term
364        assert_eq!(result, 15);
365    }
366
367    #[test]
368    fn one_character_term() {
369        let result = local_levenshtein_distance(&"g", &"A long sentence");
370        assert_eq!(result, 0);
371    }
372
373    #[test]
374    fn slightly_related_ignoring_case_local() {
375        let result = local_levenshtein_distance_ignore_ascii_case(&"SCREAMING", &"unrelated");
376        assert_eq!(result, 7);
377        let result = local_levenshtein_distance_ignore_ascii_case(&"SCREAM", &"unrelated");
378        assert_eq!(result, 4);
379    }
380}