fuzzy_string_distance/lib.rs
1//! Fuzzy string comparisons using [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance)
2//!
3//! Whereas simple string comparison is very sensitive to typos, Levenshtein Distance gives the minimum number of single-character edits (insertions, deletions or substitutions) required to change one word into the other. This gives us a sliding scale between 0 (strings are identical) and the length of the longer string (strings are unrelated), which can be used for fuzzy comparisons that can be resilient to typos, minor mistakes, and inconsistent spelling.
4//!
5//! ```
6//! use fuzzy_string_distance::levenshtein_distance;
7//! assert_eq!(1, levenshtein_distance(&"rust", &"rusty")); // insert y
8//! assert_eq!(3, levenshtein_distance(&"bug", &"")); // delete all characters
9//! assert_eq!(2, levenshtein_distance(&"typography", &"typpgrapy")); // fix both typos
10//! ```
11//!
12
13/// Returns the minimum number of single character insertions, deletions or substitutions
14/// required to convert the source string to the target string, known as the Levenshtein distance.
15///
16/// This is like a fuzzy [Eq], where a distance of 0 means the strings are equal
17/// and the distance can be up to the length of the longer string if they are completely unrelated.
18///
19/// See also:
20/// - [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance)
21///
22/// Note, this compares strings on a unicode scalar value basis, as per [str::chars]. While
23/// this comparison is less likely to cut a 'character' in two than a byte by byte basis, it
24/// still does not compare grapheme clusters.
25pub fn levenshtein_distance(source: &str, target: &str) -> usize {
26 // If either input is empty then the shortest transformation is all deletions or insertions
27 // from/to an empty string, which will be equal to the number of characters in the other input
28 // This check also guards against any index out of bounds issues in the main implementation
29 let target_chars = target.chars().count();
30 let source_chars = source.chars().count();
31 if source.is_empty() {
32 return target_chars;
33 }
34 if target.is_empty() {
35 return source_chars;
36 }
37
38 // We'll have a matrix A of `source` length + 1 rows and `target` length + 1 columns
39 // This stores the edit distances for prefixes of source and target from the empty string
40 // through to the entire inputs.
41 // A[0, 0] is therefore "" to "" which is 0, and A[source length + 1, target length + 1] is
42 // the edit distance from source to target.
43 // We only need to store two rows at a time so we never construct this matrix.
44
45 let mut edit_distances = vec![0; target_chars + 1];
46 // First row of edit distances are converting an empty string `source` to prefixes of length 0
47 // to the entire `target`, "" to "" is 0 edits, "" to one character is one insertion, and
48 // so on through to the entire target string.
49 for (i, x) in edit_distances.iter_mut().enumerate() {
50 *x = i;
51 }
52
53 for i in 0..source_chars {
54 // Step through each subsequent row of the matrix of edit distances, each time looking at
55 // a prefix of `source` one character longer
56 let mut new_edit_distances = vec![0; target_chars + 1];
57 // We're on the i+1 prefix of characters in `source`, so converting this to an empty string
58 // (the 0 character prefix of target) is purely deletions equal to the length of the
59 // source.
60 new_edit_distances[0] = i + 1;
61
62 for j in 0..target_chars {
63 // Step through columns for the prefixes of `target` on this prefix of `source` row.
64 // For a source of "kitten" and a target of "sitting", if we were up to i = 1 and
65 // j = 2 then this would look like a source of "ki" we already have the distance for
66 // converting to "si" and we now need to work out the distance to convert to "sit".
67 // We're now calculating the edit distance for A[i + 1, j + 1]
68
69 // At A[i, j + 1] we have the cost to reach the same `target` prefix with a source
70 // that was one character shorter, so we can delete the extraneous character and the
71 // distance could be 1 greater
72 let deletion = edit_distances[j + 1] + 1;
73 // At A[i + 1, j] we have the cost to reach a shorter `target` prefix with the same
74 // source, so we can insert the extra character and the distance could be 1 greater
75 let insertion = new_edit_distances[j] + 1;
76 // We can unwrap here because we're taking an element from both iterators within
77 // their respective bounds of 0 to source_chars -1 and target_chars - 1
78 let source_char = source.chars().skip(i).next().unwrap();
79 let target_char = target.chars().skip(j).next().unwrap();
80 let substitution = if source_char == target_char {
81 // If the `source` character at i and the `target` character at j match, we
82 // don't need to transform anything
83 edit_distances[j]
84 } else {
85 // Otherwise we can transform the character to match the target, and the distance
86 // could be 1 greater
87 edit_distances[j] + 1
88 };
89
90 // We always pick the cheapest option from the 3 we could do, which populates
91 // A[i + 1, j + 1]
92 new_edit_distances[j + 1] = std::cmp::min(
93 deletion, std::cmp::min(insertion, substitution)
94 );
95 }
96
97 edit_distances = new_edit_distances;
98 }
99 // The distance from `target` to `source` will be the final entry in the array as this
100 // is the full strings of both with no characters ignored.
101 edit_distances[target_chars]
102}
103
104/// Returns the minimum number of single character insertions, deletions or substitutions
105/// required to convert the source string to the target string, known as the Levenshtein distance,
106/// ignoring ASCII case differences.
107///
108/// This is like a fuzzy [eq_ignore_ascii_case](str::eq_ignore_ascii_case), where a distance
109/// of 0 means the strings are equal and the distance can be up to the length of the longer
110/// string if they are completely unrelated.
111///
112/// See also:
113/// - [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance)
114///
115/// Note, this compares strings on a unicode scalar value basis, as per [str::chars]. While
116/// this comparison is less likely to cut a 'character' in two than a byte by byte basis, it
117/// still does not compare grapheme clusters.
118pub fn levenshtein_distance_ignore_ascii_case(source: &str, target: &str) -> usize {
119 levenshtein_distance(&source.to_ascii_lowercase(), &target.to_ascii_lowercase())
120}
121
122/// A modified Levenshtein distance that matches from the source string to an arbitrary substring
123/// of the target string, returning the minimum number of single character insertions, deletions
124/// or substitutions required to convert the source string to match any substring in the target.
125///
126/// This is like a fuzzy [str::contains], where a distance of 0 means the source string is equal
127/// to a substring in the target and the distance can be up to the length of the longer string
128/// if they are completely unrelated.
129///
130/// It can help to think of the source string being a query to search against a list of items each
131/// with a longer target string, short searches that match exactly against part of the target
132/// string will have the minimum number of edits 0, and as a few characters need to be modified to
133/// match against part of the target the distances will increase.
134///
135/// ```
136/// use fuzzy_string_distance::local_levenshtein_distance;
137/// // trivial match to substring
138/// assert_eq!(0, local_levenshtein_distance(&"long", &"A long sentence"));
139/// // local distance is asymmetric, here we have to delete almost all the search term
140/// assert_eq!(11, local_levenshtein_distance(&"A long sentence", &"long"));
141/// ```
142///
143/// See also:
144/// - [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance)
145/// - [Fuzzy Substring Matching: On-device Fuzzy Friend Search at Snapchat](http://arxiv.org/pdf/2211.02767)
146///
147/// Note, this compares strings on a unicode scalar value basis, as per [str::chars]. While
148/// this comparison is less likely to cut a 'character' in two than a byte by byte basis, it
149/// still does not compare grapheme clusters.
150pub fn local_levenshtein_distance(source: &str, target: &str) -> usize {
151 // If either input is empty then the shortest transformation is all deletions or insertions
152 // from/to an empty string.
153 // This check also guards against any index out of bounds issues in the main implementation
154 let target_chars = target.chars().count();
155 let source_chars = source.chars().count();
156 if source.is_empty() {
157 // We can trivially match a 0 length substring in target with no edits
158 return 0;
159 }
160 if target.is_empty() {
161 // We only allow matches against substrings of the target, so we still need to delete all
162 // the source characters if matching against "".
163 return source_chars;
164 }
165
166 // We'll have a matrix A of `source` length + 1 rows and `target` length + 1 columns
167 // This stores the edit distances for prefixes of source and target from the empty string
168 // through to the entire inputs.
169 // A[0, 0] is therefore "" to "" which is 0, and A[source length + 1, target length + 1] is
170 // the edit distance from source to target.
171 // We only need to store two rows at a time so we never construct this matrix.
172
173 let mut edit_distances = vec![0; target_chars + 1];
174 // Unlike in Levenshtein distance, we do not initialise the first row of the edit distances
175 // to non zero values. These distances for converting an empty string `source` to prefixes of
176 // length 0 to the entire `target`, "" to "" are all zero, as we do not want to penalise
177 // starting a match further into the string. If the target was "racecar" and the source was
178 // "car" this should also be 0 edits.
179
180 for i in 0..source_chars {
181 // Step through each subsequent row of the matrix of edit distances, each time looking at
182 // a prefix of `source` one character longer
183 let mut new_edit_distances = vec![0; target_chars + 1];
184 // We're on the i+1 prefix of characters in `source`, so converting this to an empty string
185 // (the 0 character prefix of target) is purely deletions equal to the length of the
186 // source.
187 new_edit_distances[0] = i + 1;
188
189 for j in 0..target_chars {
190 // Step through columns for the prefixes of `target` on this prefix of `source` row.
191 // For a source of "kitten" and a target of "sitting", if we were up to i = 1 and
192 // j = 2 then this would look like a source of "ki" we already have the distance for
193 // converting to "si" and we now need to work out the distance to convert to "sit".
194 // We're now calculating the edit distance for A[i + 1, j + 1]
195
196 // At A[i, j + 1] we have the cost to reach the same `target` prefix with a source
197 // that was one character shorter, so we can delete the extraneous character and the
198 // distance could be 1 greater
199 let deletion = edit_distances[j + 1] + 1;
200 // At A[i + 1, j] we have the cost to reach a shorter `target` prefix with the same
201 // source, so we can insert the extra character and the distance could be 1 greater
202 let insertion = new_edit_distances[j] + 1;
203 // We can unwrap here because we're taking an element from both iterators within
204 // their respective bounds of 0 to source_chars -1 and target_chars - 1
205 let source_char = source.chars().skip(i).next().unwrap();
206 let target_char = target.chars().skip(j).next().unwrap();
207 let substitution = if source_char == target_char {
208 // If the `source` character at i and the `target` character at j match, we
209 // don't need to transform anything
210 edit_distances[j]
211 } else {
212 // Otherwise we can transform the character to match the target, and the distance
213 // could be 1 greater
214 edit_distances[j] + 1
215 };
216
217 // We always pick the cheapest option from the 3 we could do, which populates
218 // A[i + 1, j + 1]
219 new_edit_distances[j + 1] = std::cmp::min(
220 deletion, std::cmp::min(insertion, substitution)
221 );
222 }
223
224 edit_distances = new_edit_distances;
225 }
226 // The distance from `target` to `source` will be the final entry in the array as this
227 // is the full strings of both with no characters ignored. We just want to match against any
228 // substring of target, so by taking the minimum value in the final row we allow arbitrary
229 // suffixes in the target to be ignored for the match. Since we initialised the first row
230 // to 0 to allow arbitrary prefixes of the target to be ignored for the match, together this
231 // returns the smallest edits required to match any substring of target.
232 edit_distances.into_iter().min().unwrap()
233}
234
235/// A modified Levenshtein distance that matches from the source string to an arbitrary substring
236/// of the target string, returning the minimum number of single character insertions, deletions
237/// or substitutions required to convert the source string to match any substring in the target,
238/// ignoring ASCII case differences
239///
240/// This is like a fuzzy [str::contains] that ignores case, where a distance of 0 means the source
241/// string is equal to a substring in the target and the distance can be up to the length of the
242/// longer string if they are completely unrelated.
243///
244/// It can help to think of the source string being a query to search against a list of items each
245/// with a longer target string, short searches that match exactly against part of the target
246/// string will have the minimum number of edits 0, and as a few characters need to be modified to
247/// match against part of the target the distances will increase.
248///
249/// ```
250/// use fuzzy_string_distance::local_levenshtein_distance_ignore_ascii_case;
251/// // trivial match to substring
252/// assert_eq!(0, local_levenshtein_distance_ignore_ascii_case(&"LONG", &"A long sentence"));
253/// // local distance is asymmetric, here we have to delete almost all the search term
254/// assert_eq!(11, local_levenshtein_distance_ignore_ascii_case(&"A long sentence", &"LONG"));
255/// ```
256///
257/// See also:
258/// - [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance)
259/// - [Fuzzy Substring Matching: On-device Fuzzy Friend Search at Snapchat](http://arxiv.org/pdf/2211.02767)
260///
261/// Note, this compares strings on a unicode scalar value basis, as per [str::chars]. While
262/// this comparison is less likely to cut a 'character' in two than a byte by byte basis, it
263/// still does not compare grapheme clusters.
264pub fn local_levenshtein_distance_ignore_ascii_case(source: &str, target: &str) -> usize {
265 local_levenshtein_distance(&source.to_ascii_lowercase(), &target.to_ascii_lowercase())
266}
267
268#[cfg(test)]
269mod tests {
270 use super::*;
271
272 #[test]
273 fn transforming_input() {
274 let kitten = "kitten";
275 let sitting = "sitting";
276 let result = levenshtein_distance(&kitten, &sitting);
277 assert_eq!(result, 3);
278 }
279
280 #[test]
281 fn adding_a_character() {
282 let result = levenshtein_distance(&"rust", &"rusty");
283 assert_eq!(result, 1);
284 }
285
286 #[test]
287 fn removing_characters() {
288 let result = levenshtein_distance(&"ferrisground", &"run");
289 // run is present in the source input, so shortest transformation is removing the other
290 // characters
291 assert_eq!(result, 9);
292 }
293
294 #[test]
295 fn empty_source() {
296 let result = levenshtein_distance(&"", &"rust");
297 assert_eq!(result, 4);
298 }
299
300 #[test]
301 fn empty_target() {
302 let result = levenshtein_distance(&"bug", &"");
303 assert_eq!(result, 3);
304 }
305
306 #[test]
307 fn multiple_transformations() {
308 let result = levenshtein_distance(&"Edit distance", &"Eddy");
309 // Edd already present in input, so can delete all the other characters and insert y,
310 // so 3 edits fewer than the source input
311 assert_eq!(result, 10);
312 }
313
314 #[test]
315 fn unrelated() {
316 let result = levenshtein_distance(&"unrelated", &"SCREAMING");
317 assert_eq!(result, 9);
318 }
319
320 #[test]
321 fn slightly_related_ignoring_case() {
322 let result = levenshtein_distance_ignore_ascii_case(&"unrelated", &"SCREAMING");
323 assert_eq!(result, 7);
324 }
325
326 #[test]
327 fn non_english() {
328 let result = levenshtein_distance(&"El delfín español", &"Dolphin");
329 assert_eq!(result, 15);
330 }
331
332 #[test]
333 fn graphemes() {
334 let result = levenshtein_distance(&"🧑🔬", &"🧑");
335 // Split scientist into just person emoji
336 assert_eq!(result, 2);
337 }
338
339 #[test]
340 fn non_english_local() {
341 let result = local_levenshtein_distance(&"Dolphin", &"El delfín español");
342 // delfín -> Dolphin is 5 edits
343 assert_eq!(result, 5);
344 // local distance is asymmetric, search term is going to have to be modified to match
345 // entire target as with non local distance
346 let result = local_levenshtein_distance(&"El delfín español", &"Dolphin");
347 assert_eq!(result, 15);
348 }
349
350 #[test]
351 fn search_term() {
352 let result = local_levenshtein_distance(&"Piñata", &"Pinecone tree");
353 // Pineco -> Piñata is 4 edits
354 assert_eq!(result, 4);
355 }
356
357 #[test]
358 fn no_search() {
359 let result = local_levenshtein_distance(&"", &"A long sentence");
360 // trivial match
361 assert_eq!(result, 0);
362 let result = local_levenshtein_distance(&"A long sentence", &"");
363 // local distance is asymmetric, have to delete entire search term
364 assert_eq!(result, 15);
365 }
366
367 #[test]
368 fn one_character_term() {
369 let result = local_levenshtein_distance(&"g", &"A long sentence");
370 assert_eq!(result, 0);
371 }
372
373 #[test]
374 fn slightly_related_ignoring_case_local() {
375 let result = local_levenshtein_distance_ignore_ascii_case(&"SCREAMING", &"unrelated");
376 assert_eq!(result, 7);
377 let result = local_levenshtein_distance_ignore_ascii_case(&"SCREAM", &"unrelated");
378 assert_eq!(result, 4);
379 }
380}