nu_protocol/
lev_distance.rs

1// This file is copied from the rust compiler project:
2// https://github.com/rust-lang/rust/blob/cf9ed0dd5836201843d28bbad50abfbe1913af2a/compiler/rustc_span/src/lev_distance.rs#L1
3// https://github.com/rust-lang/rust/blob/cf9ed0dd5836201843d28bbad50abfbe1913af2a/LICENSE-MIT
4//
5// - the rust compiler-specific symbol::Symbol has been replaced by &str
6// - unstable feature .then_some has been replaced by an if ... else expression
7
8//! Levenshtein distances.
9//!
10//! The [Levenshtein distance] is a metric for measuring the difference between two strings.
11//!
12//! [Levenshtein distance]: https://en.wikipedia.org/wiki/Levenshtein_distance
13
14use std::cmp;
15
16/// Finds the Levenshtein distance between two strings.
17///
18/// Returns None if the distance exceeds the limit.
19pub fn lev_distance(a: &str, b: &str, limit: usize) -> Option<usize> {
20    let n = a.chars().count();
21    let m = b.chars().count();
22    let min_dist = m.abs_diff(n);
23
24    if min_dist > limit {
25        return None;
26    }
27    if n == 0 || m == 0 {
28        return Some(min_dist);
29    }
30
31    let mut dcol: Vec<_> = (0..=m).collect();
32
33    for (i, sc) in a.chars().enumerate() {
34        let mut current = i;
35        dcol[0] = current + 1;
36
37        for (j, tc) in b.chars().enumerate() {
38            let next = dcol[j + 1];
39            if sc == tc {
40                dcol[j + 1] = current;
41            } else {
42                dcol[j + 1] = cmp::min(current, next);
43                dcol[j + 1] = cmp::min(dcol[j + 1], dcol[j]) + 1;
44            }
45            current = next;
46        }
47    }
48
49    if dcol[m] <= limit {
50        Some(dcol[m])
51    } else {
52        None
53    }
54}
55
56/// Finds the Levenshtein distance between two strings.
57pub fn levenshtein_distance(a: &str, b: &str) -> usize {
58    lev_distance(a, b, usize::MAX).unwrap_or(usize::MAX)
59}
60/// Provides a word similarity score between two words that accounts for substrings being more
61/// meaningful than a typical Levenshtein distance. The lower the score, the closer the match.
62/// 0 is an identical match.
63///
64/// Uses the Levenshtein distance between the two strings and removes the cost of the length
65/// difference. If this is 0 then it is either a substring match or a full word match, in the
66/// substring match case we detect this and return `1`. To prevent finding meaningless substrings,
67/// eg. "in" in "shrink", we only perform this subtraction of length difference if one of the words
68/// is not greater than twice the length of the other. For cases where the words are close in size
69/// but not an exact substring then the cost of the length difference is discounted by half.
70///
71/// Returns `None` if the distance exceeds the limit.
72pub fn lev_distance_with_substrings(a: &str, b: &str, limit: usize) -> Option<usize> {
73    let n = a.chars().count();
74    let m = b.chars().count();
75
76    // Check one isn't less than half the length of the other. If this is true then there is a
77    // big difference in length.
78    let big_len_diff = (n * 2) < m || (m * 2) < n;
79    let len_diff = m.abs_diff(n);
80    let lev = lev_distance(a, b, limit + len_diff)?;
81
82    // This is the crux, subtracting length difference means exact substring matches will now be 0
83    let score = lev - len_diff;
84
85    // If the score is 0 but the words have different lengths then it's a substring match not a full
86    // word match
87    let score = if score == 0 && len_diff > 0 && !big_len_diff {
88        1 // Exact substring match, but not a total word match so return non-zero
89    } else if !big_len_diff {
90        // Not a big difference in length, discount cost of length difference
91        score + len_diff.div_ceil(2)
92    } else {
93        // A big difference in length, add back the difference in length to the score
94        score + len_diff
95    };
96
97    if score <= limit { Some(score) } else { None }
98}
99
100/// Finds the best match for given word in the given iterator where substrings are meaningful.
101///
102/// A version of [`find_best_match_for_name`] that uses [`lev_distance_with_substrings`] as the score
103/// for word similarity. This takes an optional distance limit which defaults to one-third of the
104/// given word.
105///
106/// Besides the modified Levenshtein, we use case insensitive comparison to improve accuracy
107/// on an edge case with a lower(upper)case letters mismatch.
108pub fn find_best_match_for_name_with_substrings<'c>(
109    candidates: &[&'c str],
110    lookup: &str,
111    dist: Option<usize>,
112) -> Option<&'c str> {
113    find_best_match_for_name_impl(true, candidates, lookup, dist)
114}
115
116/// Finds the best match for a given word in the given iterator.
117///
118/// As a loose rule to avoid the obviously incorrect suggestions, it takes
119/// an optional limit for the maximum allowable edit distance, which defaults
120/// to one-third of the given word.
121///
122/// Besides Levenshtein, we use case insensitive comparison to improve accuracy
123/// on an edge case with a lower(upper)case letters mismatch.
124#[allow(dead_code)]
125pub fn find_best_match_for_name<'c>(
126    candidates: &[&'c str],
127    lookup: &str,
128    dist: Option<usize>,
129) -> Option<&'c str> {
130    find_best_match_for_name_impl(false, candidates, lookup, dist)
131}
132
133#[cold]
134fn find_best_match_for_name_impl<'c>(
135    use_substring_score: bool,
136    candidates: &[&'c str],
137    lookup: &str,
138    dist: Option<usize>,
139) -> Option<&'c str> {
140    let lookup_uppercase = lookup.to_uppercase();
141
142    // Priority of matches:
143    // 1. Exact case insensitive match
144    // 2. Levenshtein distance match
145    // 3. Sorted word match
146    if let Some(c) = candidates
147        .iter()
148        .find(|c| c.to_uppercase() == lookup_uppercase)
149    {
150        return Some(*c);
151    }
152
153    let mut dist = dist.unwrap_or_else(|| cmp::max(lookup.len(), 3) / 3);
154    let mut best = None;
155    for c in candidates {
156        let lev_dist = if use_substring_score {
157            lev_distance_with_substrings(lookup, c, dist)
158        } else {
159            lev_distance(lookup, c, dist)
160        };
161        match lev_dist {
162            Some(0) => return Some(*c),
163            Some(d) => {
164                dist = d - 1;
165                best = Some(*c);
166            }
167            None => {}
168        }
169    }
170    if best.is_some() {
171        return best;
172    }
173
174    find_match_by_sorted_words(candidates, lookup)
175}
176
177fn find_match_by_sorted_words<'c>(iter_names: &[&'c str], lookup: &str) -> Option<&'c str> {
178    iter_names.iter().fold(None, |result, candidate| {
179        if sort_by_words(candidate) == sort_by_words(lookup) {
180            Some(*candidate)
181        } else {
182            result
183        }
184    })
185}
186
187fn sort_by_words(name: &str) -> String {
188    let mut split_words: Vec<&str> = name.split('_').collect();
189    // We are sorting primitive &strs and can use unstable sort here.
190    split_words.sort_unstable();
191    split_words.join("_")
192}
193
194// This file is copied from the rust compiler project:
195// https://github.com/rust-lang/rust/blob/cf9ed0dd5836201843d28bbad50abfbe1913af2a/compiler/rustc_span/src/lev_distance.rs#L1
196// https://github.com/rust-lang/rust/blob/cf9ed0dd5836201843d28bbad50abfbe1913af2a/LICENSE-MIT
197
198// Permission is hereby granted, free of charge, to any
199// person obtaining a copy of this software and associated
200// documentation files (the "Software"), to deal in the
201// Software without restriction, including without
202// limitation the rights to use, copy, modify, merge,
203// publish, distribute, sublicense, and/or sell copies of
204// the Software, and to permit persons to whom the Software
205// is furnished to do so, subject to the following
206// conditions:
207
208// The above copyright notice and this permission notice
209// shall be included in all copies or substantial portions
210// of the Software.
211
212// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
213// ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
214// TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
215// PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
216// SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
217// CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
218// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
219// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
220// DEALINGS IN THE SOFTWARE.
221// Footer