nu_protocol/lev_distance.rs
1// This file is copied from the rust compiler project:
2// https://github.com/rust-lang/rust/blob/cf9ed0dd5836201843d28bbad50abfbe1913af2a/compiler/rustc_span/src/lev_distance.rs#L1
3// https://github.com/rust-lang/rust/blob/cf9ed0dd5836201843d28bbad50abfbe1913af2a/LICENSE-MIT
4//
5// - the rust compiler-specific symbol::Symbol has been replaced by &str
6// - unstable feature .then_some has been replaced by an if ... else expression
7
8//! Levenshtein distances.
9//!
10//! The [Levenshtein distance] is a metric for measuring the difference between two strings.
11//!
12//! [Levenshtein distance]: https://en.wikipedia.org/wiki/Levenshtein_distance
13
14use std::cmp;
15
16/// Finds the Levenshtein distance between two strings.
17///
18/// Returns None if the distance exceeds the limit.
19pub fn lev_distance(a: &str, b: &str, limit: usize) -> Option<usize> {
20 let n = a.chars().count();
21 let m = b.chars().count();
22 let min_dist = m.abs_diff(n);
23
24 if min_dist > limit {
25 return None;
26 }
27 if n == 0 || m == 0 {
28 return Some(min_dist);
29 }
30
31 let mut dcol: Vec<_> = (0..=m).collect();
32
33 for (i, sc) in a.chars().enumerate() {
34 let mut current = i;
35 dcol[0] = current + 1;
36
37 for (j, tc) in b.chars().enumerate() {
38 let next = dcol[j + 1];
39 if sc == tc {
40 dcol[j + 1] = current;
41 } else {
42 dcol[j + 1] = cmp::min(current, next);
43 dcol[j + 1] = cmp::min(dcol[j + 1], dcol[j]) + 1;
44 }
45 current = next;
46 }
47 }
48
49 if dcol[m] <= limit {
50 Some(dcol[m])
51 } else {
52 None
53 }
54}
55
56/// Finds the Levenshtein distance between two strings.
57pub fn levenshtein_distance(a: &str, b: &str) -> usize {
58 lev_distance(a, b, usize::MAX).unwrap_or(usize::MAX)
59}
60/// Provides a word similarity score between two words that accounts for substrings being more
61/// meaningful than a typical Levenshtein distance. The lower the score, the closer the match.
62/// 0 is an identical match.
63///
64/// Uses the Levenshtein distance between the two strings and removes the cost of the length
65/// difference. If this is 0 then it is either a substring match or a full word match, in the
66/// substring match case we detect this and return `1`. To prevent finding meaningless substrings,
67/// eg. "in" in "shrink", we only perform this subtraction of length difference if one of the words
68/// is not greater than twice the length of the other. For cases where the words are close in size
69/// but not an exact substring then the cost of the length difference is discounted by half.
70///
71/// Returns `None` if the distance exceeds the limit.
72pub fn lev_distance_with_substrings(a: &str, b: &str, limit: usize) -> Option<usize> {
73 let n = a.chars().count();
74 let m = b.chars().count();
75
76 // Check one isn't less than half the length of the other. If this is true then there is a
77 // big difference in length.
78 let big_len_diff = (n * 2) < m || (m * 2) < n;
79 let len_diff = m.abs_diff(n);
80 let lev = lev_distance(a, b, limit + len_diff)?;
81
82 // This is the crux, subtracting length difference means exact substring matches will now be 0
83 let score = lev - len_diff;
84
85 // If the score is 0 but the words have different lengths then it's a substring match not a full
86 // word match
87 let score = if score == 0 && len_diff > 0 && !big_len_diff {
88 1 // Exact substring match, but not a total word match so return non-zero
89 } else if !big_len_diff {
90 // Not a big difference in length, discount cost of length difference
91 score + len_diff.div_ceil(2)
92 } else {
93 // A big difference in length, add back the difference in length to the score
94 score + len_diff
95 };
96
97 if score <= limit { Some(score) } else { None }
98}
99
100/// Finds the best match for given word in the given iterator where substrings are meaningful.
101///
102/// A version of [`find_best_match_for_name`] that uses [`lev_distance_with_substrings`] as the score
103/// for word similarity. This takes an optional distance limit which defaults to one-third of the
104/// given word.
105///
106/// Besides the modified Levenshtein, we use case insensitive comparison to improve accuracy
107/// on an edge case with a lower(upper)case letters mismatch.
108pub fn find_best_match_for_name_with_substrings<'c>(
109 candidates: &[&'c str],
110 lookup: &str,
111 dist: Option<usize>,
112) -> Option<&'c str> {
113 find_best_match_for_name_impl(true, candidates, lookup, dist)
114}
115
116/// Finds the best match for a given word in the given iterator.
117///
118/// As a loose rule to avoid the obviously incorrect suggestions, it takes
119/// an optional limit for the maximum allowable edit distance, which defaults
120/// to one-third of the given word.
121///
122/// Besides Levenshtein, we use case insensitive comparison to improve accuracy
123/// on an edge case with a lower(upper)case letters mismatch.
124#[allow(dead_code)]
125pub fn find_best_match_for_name<'c>(
126 candidates: &[&'c str],
127 lookup: &str,
128 dist: Option<usize>,
129) -> Option<&'c str> {
130 find_best_match_for_name_impl(false, candidates, lookup, dist)
131}
132
133#[cold]
134fn find_best_match_for_name_impl<'c>(
135 use_substring_score: bool,
136 candidates: &[&'c str],
137 lookup: &str,
138 dist: Option<usize>,
139) -> Option<&'c str> {
140 let lookup_uppercase = lookup.to_uppercase();
141
142 // Priority of matches:
143 // 1. Exact case insensitive match
144 // 2. Levenshtein distance match
145 // 3. Sorted word match
146 if let Some(c) = candidates
147 .iter()
148 .find(|c| c.to_uppercase() == lookup_uppercase)
149 {
150 return Some(*c);
151 }
152
153 let mut dist = dist.unwrap_or_else(|| cmp::max(lookup.len(), 3) / 3);
154 let mut best = None;
155 for c in candidates {
156 let lev_dist = if use_substring_score {
157 lev_distance_with_substrings(lookup, c, dist)
158 } else {
159 lev_distance(lookup, c, dist)
160 };
161 match lev_dist {
162 Some(0) => return Some(*c),
163 Some(d) => {
164 dist = d - 1;
165 best = Some(*c);
166 }
167 None => {}
168 }
169 }
170 if best.is_some() {
171 return best;
172 }
173
174 find_match_by_sorted_words(candidates, lookup)
175}
176
177fn find_match_by_sorted_words<'c>(iter_names: &[&'c str], lookup: &str) -> Option<&'c str> {
178 iter_names.iter().fold(None, |result, candidate| {
179 if sort_by_words(candidate) == sort_by_words(lookup) {
180 Some(*candidate)
181 } else {
182 result
183 }
184 })
185}
186
187fn sort_by_words(name: &str) -> String {
188 let mut split_words: Vec<&str> = name.split('_').collect();
189 // We are sorting primitive &strs and can use unstable sort here.
190 split_words.sort_unstable();
191 split_words.join("_")
192}
193
194// This file is copied from the rust compiler project:
195// https://github.com/rust-lang/rust/blob/cf9ed0dd5836201843d28bbad50abfbe1913af2a/compiler/rustc_span/src/lev_distance.rs#L1
196// https://github.com/rust-lang/rust/blob/cf9ed0dd5836201843d28bbad50abfbe1913af2a/LICENSE-MIT
197
198// Permission is hereby granted, free of charge, to any
199// person obtaining a copy of this software and associated
200// documentation files (the "Software"), to deal in the
201// Software without restriction, including without
202// limitation the rights to use, copy, modify, merge,
203// publish, distribute, sublicense, and/or sell copies of
204// the Software, and to permit persons to whom the Software
205// is furnished to do so, subject to the following
206// conditions:
207
208// The above copyright notice and this permission notice
209// shall be included in all copies or substantial portions
210// of the Software.
211
212// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
213// ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
214// TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
215// PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
216// SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
217// CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
218// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
219// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
220// DEALINGS IN THE SOFTWARE.
221// Footer