rustc_edit_distance/
lib.rs

1//! This implementation is derived from the Rust compiler ("rustc"),
2//! specifically from:
3//! https://github.com/rust-lang/rust/blob/3b022d8ceea570db9730be34d964f0cc663a567f/compiler/rustc_span/src/edit_distance.rs
4//!
5//! The Rust compiler is dual-licensed under the Apache License, Version 2.0
6//! and the MIT license. See the LICENSE-* files and COPYRIGHT file in this
7//! repository for details.
8//!
9//! This project is likewise dual-licensed under Apache-2.0 OR MIT, at your option.
10
11use std::{cmp, mem};
12
13/// Finds the [edit distance] between two strings.
14///
15/// Returns `None` if the distance exceeds the limit.
16///
17/// [edit distance]: https://en.wikipedia.org/wiki/Edit_distance
18pub fn edit_distance(a: &str, b: &str, limit: usize) -> Option<usize> {
19    let mut a = &a.chars().collect::<Vec<_>>()[..];
20    let mut b = &b.chars().collect::<Vec<_>>()[..];
21
22    // Ensure that `b` is the shorter string, minimizing memory use.
23    if a.len() < b.len() {
24        mem::swap(&mut a, &mut b);
25    }
26
27    let min_dist = a.len() - b.len();
28    // If we know the limit will be exceeded, we can return early.
29    if min_dist > limit {
30        return None;
31    }
32
33    // Strip common prefix.
34    while let Some(((b_char, b_rest), (a_char, a_rest))) = b.split_first().zip(a.split_first()) {
35        if a_char != b_char {
36            break;
37        }
38        a = a_rest;
39        b = b_rest;
40    }
41    // Strip common suffix.
42    while let Some(((b_char, b_rest), (a_char, a_rest))) = b.split_last().zip(a.split_last()) {
43        if a_char != b_char {
44            break;
45        }
46        a = a_rest;
47        b = b_rest;
48    }
49
50    // If either string is empty, the distance is the length of the other.
51    // We know that `b` is the shorter string, so we don't need to check `a`.
52    if b.is_empty() {
53        return Some(min_dist);
54    }
55
56    let mut prev_prev = vec![usize::MAX; b.len() + 1];
57    let mut prev = (0..=b.len()).collect::<Vec<_>>();
58    let mut current = vec![0; b.len() + 1];
59
60    // row by row
61    for i in 1..=a.len() {
62        current[0] = i;
63        let a_idx = i - 1;
64
65        // column by column
66        for j in 1..=b.len() {
67            let b_idx = j - 1;
68
69            // There is no cost to substitute a character with itself.
70            let substitution_cost = if a[a_idx] == b[b_idx] { 0 } else { 1 };
71
72            current[j] = cmp::min(
73                // deletion
74                prev[j] + 1,
75                cmp::min(
76                    // insertion
77                    current[j - 1] + 1,
78                    // substitution
79                    prev[j - 1] + substitution_cost,
80                ),
81            );
82
83            if (i > 1) && (j > 1) && (a[a_idx] == b[b_idx - 1]) && (a[a_idx - 1] == b[b_idx]) {
84                // transposition
85                current[j] = cmp::min(current[j], prev_prev[j - 2] + 1);
86            }
87        }
88
89        // Rotate the buffers, reusing the memory.
90        [prev_prev, prev, current] = [prev, current, prev_prev];
91    }
92
93    // `prev` because we already rotated the buffers.
94    let distance = prev[b.len()];
95    (distance <= limit).then_some(distance)
96}
97
98pub fn find_best_match_for_name<'a>(
99    candidates: &[&'a str],
100    lookup: &'a str,
101    dist: Option<usize>,
102) -> Option<&'a str> {
103    find_best_match_for_name_impl(false, candidates, lookup, dist)
104}
105
106pub fn edit_distance_with_substrings(a: &str, b: &str, limit: usize) -> Option<usize> {
107    let n = a.chars().count();
108    let m = b.chars().count();
109
110    // Check one isn't less than half the length of the other. If this is true then there is a
111    // big difference in length.
112    let big_len_diff = (n * 2) < m || (m * 2) < n;
113    let len_diff = m.abs_diff(n);
114    let distance = edit_distance(a, b, limit + len_diff)?;
115
116    // This is the crux, subtracting length difference means exact substring matches will now be 0
117    let score = distance - len_diff;
118
119    // If the score is 0 but the words have different lengths then it's a substring match not a full
120    // word match
121    let score = if score == 0 && len_diff > 0 && !big_len_diff {
122        1 // Exact substring match, but not a total word match so return non-zero
123    } else if !big_len_diff {
124        // Not a big difference in length, discount cost of length difference
125        score + (len_diff + 1) / 2
126    } else {
127        // A big difference in length, add back the difference in length to the score
128        score + len_diff
129    };
130
131    (score <= limit).then_some(score)
132}
133
134pub fn find_best_match_for_name_impl<'a>(
135    use_substring_score: bool,
136    candidates: &[&'a str],
137    lookup: &'a str,
138    dist: Option<usize>,
139) -> Option<&'a str> {
140    let lookup_uppercase = lookup.to_uppercase();
141
142    // Priority of matches:
143    // 1. Exact case insensitive match or Substring insensitive match
144    // 2. Edit distance match
145    // 3. Sorted word match
146    if let Some(c) = candidates.iter().find(|c| {
147        c.to_uppercase() == lookup_uppercase
148            || c.to_uppercase().contains(&lookup_uppercase)
149            || lookup_uppercase.contains(&c.to_uppercase())
150    }) {
151        return Some(*c);
152    }
153
154    // `fn edit_distance()` use `chars()` to calculate edit distance, so we must
155    // also use `chars()` (and not `str::len()`) to calculate length here.
156    let lookup_len = lookup.chars().count();
157
158    let mut dist = dist.unwrap_or_else(|| cmp::max(lookup_len, 3) / 3);
159    let mut best = None;
160    // store the candidates with the same distance, only for `use_substring_score` current.
161    let mut next_candidates = vec![];
162    for c in candidates {
163        match if use_substring_score {
164            edit_distance_with_substrings(lookup, c, dist)
165        } else {
166            edit_distance(lookup, c, dist)
167        } {
168            Some(0) => return Some(*c),
169            Some(d) => {
170                if use_substring_score {
171                    if d < dist {
172                        dist = d;
173                        next_candidates.clear();
174                    } else {
175                        // `d == dist` here, we need to store the candidates with the same distance
176                        // so we won't decrease the distance in the next loop.
177                    }
178                    next_candidates.push(*c);
179                } else {
180                    dist = d - 1;
181                }
182                best = Some(*c);
183            }
184            None => {}
185        }
186    }
187
188    // We have a tie among several candidates, try to select the best among them ignoring substrings.
189    // For example, the candidates list `force_capture`, `capture`, and user inputted `forced_capture`,
190    // we select `force_capture` with a extra round of edit distance calculation.
191    if next_candidates.len() > 1 {
192        debug_assert!(use_substring_score);
193        best = find_best_match_for_name_impl(false, &next_candidates, lookup, Some(lookup.len()));
194    }
195    if best.is_some() {
196        return best;
197    }
198
199    find_match_by_sorted_words(candidates, lookup)
200}
201
202fn find_match_by_sorted_words<'a>(iter_names: &[&'a str], lookup: &str) -> Option<&'a str> {
203    let lookup_sorted_by_words = sort_by_words(lookup);
204    iter_names.iter().fold(None, |result, candidate| {
205        if sort_by_words(candidate) == lookup_sorted_by_words {
206            Some(*candidate)
207        } else {
208            result
209        }
210    })
211}
212
213fn sort_by_words(name: &str) -> Vec<&str> {
214    let mut split_words: Vec<&str> = name.split('_').collect();
215    // We are sorting primitive &strs and can use unstable sort here.
216    split_words.sort_unstable();
217    split_words
218}