code_fuzzy_match/
lib.rs

1//! Fuzzy string matching inspired by [Visual Studio Code](https://github.com/microsoft/vscode).
2//!
3//! The fuzzy matching algorithm used in this crate is optimized for use cases such as
4//! command palettes, quick file navigation, and code searching. It does not use Levenshtein
5//! distance, which is more suited to use cases like spell checking.
6//!
7//! The algorithm only allows matches where the characters in the query string are present and
8//! in the same order as the characters in the target string. All queries are substring queries,
9//! so it is not a major hit to the match score to search for a term in the middle of the target
10//! string. The algorithm prefers matches that are at the beginning of words in the target
11//! string, with words treated as they might appear in code (letters following a separator or
12//! in camel case are treated as a word). Sequential matches are also favored.
13//!
14//! This crate provides a [`FuzzyMatcher`] struct for batch processing in addition to a
15//! [`fuzzy_match`] function for matching a single item.
16//!
17//! # Example usage
18//!
19//! ```
20//! let mut matcher = code_fuzzy_match::FuzzyMatcher::new();
21//! let matches = matcher.fuzzy_match("the quick brown fox", "bro fox");
22//! assert!(matches.is_some());
23//! let no_match = matcher.fuzzy_match("the quick brown fox", "cat");
24//! assert!(no_match.is_none());
25//!
26//! let high_score = matcher.fuzzy_match("Example string", "example");
27//! let lower_score = matcher.fuzzy_match("Example string", "str");
28//! assert!(high_score.unwrap() > lower_score.unwrap());
29//! ```
30
31#![no_std]
32
33extern crate alloc;
34use alloc::vec::Vec;
35
36/// Fuzzy matcher instance. Holds memory for the state of the fuzzy matcher so that
37/// large batches of queries can be processed with minimal allocations. When performing a
38/// large batch of fuzzy match queries, use a common instance of this struct to improve
39/// performance by avoiding extra allocations.
40pub struct FuzzyMatcher {
41    target_chars: Vec<char>,
42    first_possible_match: Vec<usize>,
43    prev_seq_match_counts: Vec<usize>,
44    prev_score: Vec<usize>,
45    seq_match_counts: Vec<usize>,
46    score: Vec<usize>,
47}
48
49fn char_matches(query_char: char, target_char: char) -> bool {
50    // Treat slashes and backslashes as the same character to be able to use as a path
51    // matching function.
52    match query_char {
53        '/' => matches!(target_char, '/' | '\\'),
54        '\\' => matches!(target_char, '/' | '\\'),
55        _ => {
56            // The `eq_ignore_ascii_case` function is *much* faster than a full
57            // Unicode case-insensitive comparison, so if the target character is
58            // ASCII, optimize for performance.
59            if query_char.is_ascii() {
60                query_char.eq_ignore_ascii_case(&target_char)
61            } else {
62                query_char
63                    .to_lowercase()
64                    .zip(target_char.to_lowercase())
65                    .all(|(a, b)| a == b)
66            }
67        }
68    }
69}
70
71impl FuzzyMatcher {
72    /// Creates a new instance of a fuzzy matcher.
73    pub fn new() -> Self {
74        FuzzyMatcher {
75            target_chars: Vec::new(),
76            first_possible_match: Vec::new(),
77            prev_seq_match_counts: Vec::new(),
78            prev_score: Vec::new(),
79            seq_match_counts: Vec::new(),
80            score: Vec::new(),
81        }
82    }
83
84    /// Fuzzy match a string against a query string. Returns a score that is higher for
85    /// a more confident match, or `None` if the query does not match the target string.
86    ///
87    /// # Examples
88    ///
89    /// ```
90    /// let mut matcher = code_fuzzy_match::FuzzyMatcher::new();
91    /// let matches = matcher.fuzzy_match("the quick brown fox", "bro fox");
92    /// assert!(matches.is_some());
93    /// let no_match = matcher.fuzzy_match("the quick brown fox", "cat");
94    /// assert!(no_match.is_none());
95    ///
96    /// let high_score = matcher.fuzzy_match("Example string", "example");
97    /// let lower_score = matcher.fuzzy_match("Example string", "str");
98    /// assert!(high_score.unwrap() > lower_score.unwrap());
99    /// ```
100    pub fn fuzzy_match(&mut self, target: &str, query: &str) -> Option<usize> {
101        // Break the target string into a vector of characters, since we need to manage
102        // parallel vectors with information per character. Match query string characters
103        // along the way to perform an early exit if the query string definitely does not
104        // match, as well as computing the earliest possible index for each given query
105        // character.
106        self.target_chars.clear();
107        self.target_chars.extend(target.chars());
108        self.first_possible_match.clear();
109        let mut query_chars = query.chars();
110        let mut cur_query_char = query_chars.next();
111        for (target_idx, target_char) in target.chars().enumerate() {
112            if let Some(query_char) = cur_query_char {
113                if char_matches(query_char, target_char) {
114                    self.first_possible_match.push(target_idx);
115                    cur_query_char = query_chars.next();
116                }
117            }
118        }
119
120        // If we didn't consume all query characters, then the query is not a match.
121        if cur_query_char.is_some() {
122            return None;
123        }
124
125        debug_assert_eq!(target.chars().count(), self.target_chars.len());
126        debug_assert_eq!(query.chars().count(), self.first_possible_match.len());
127
128        // Create vectors holding the score and sequential counts for two query characters.
129        // This algorithm implements a matrix-based method of fuzzy matching, but we don't
130        // need to hold the entire matrix in memory, just the current and previous rows.
131        self.prev_seq_match_counts.clear();
132        self.prev_score.clear();
133        self.prev_seq_match_counts
134            .resize(self.target_chars.len(), 0);
135        self.prev_score.resize(self.target_chars.len(), 0);
136
137        self.seq_match_counts.clear();
138        self.score.clear();
139        self.seq_match_counts.resize(self.target_chars.len(), 0);
140        self.score.resize(self.target_chars.len(), 0);
141
142        let mut first_possible_target_idx: usize = 0;
143
144        // Compute match scores for each query character in sequence
145        let mut first_query_char = true;
146        for (query_char, first_possible_match) in
147            query.chars().zip(self.first_possible_match.iter())
148        {
149            // If the starting point of the search is beyond the end of the target string,
150            // we can't have a match.
151            if first_possible_target_idx >= self.target_chars.len() {
152                return None;
153            }
154
155            // If the initial scan saw that the first possible match for this query character
156            // is later in the string, use that instead.
157            first_possible_target_idx = first_possible_target_idx.max(*first_possible_match);
158
159            // Reset vector holding the score and sequential counts for this query character.
160            // This algorithm implements a matrix-based method of fuzzy matching, but we don't
161            // need to hold the entire matrix in memory, just the current and previous rows.
162            (&mut self.seq_match_counts[first_possible_target_idx..self.target_chars.len()])
163                .fill(0);
164            (&mut self.score[first_possible_target_idx..self.target_chars.len()]).fill(0);
165
166            let mut first_nonzero_score = None;
167
168            // Compute match scores for each target character in sequence, for this query character.
169            // Start at the character after the previous earliest character that had a score. Any
170            // character before that cannot have a score, so we don't need to check those.
171            for i in first_possible_target_idx..self.target_chars.len() {
172                // Get characters and the score for the previous character in the target
173                let target_char = self.target_chars[i];
174                let prev_target_score = if i == first_possible_target_idx {
175                    0
176                } else {
177                    self.score[i - 1]
178                };
179
180                // Previous score and sequential match count comes from the previous character
181                // in both the target and the query
182                let prev_query_score = if i == 0 { 0 } else { self.prev_score[i - 1] };
183                let seq_match_count = if i == 0 {
184                    0
185                } else {
186                    self.prev_seq_match_counts[i - 1]
187                };
188
189                if !first_query_char && prev_query_score == 0 {
190                    self.score[i] = prev_target_score;
191                    continue;
192                }
193
194                // Check to ensure the characters match at all.
195                if !char_matches(query_char, target_char) {
196                    // No match, use existing score and reset sequential count
197                    self.score[i] = prev_target_score;
198                    continue;
199                }
200
201                // Compute score for this character match. These bonuses are inspired by
202                // the algorithm used by Visual Studio Code.
203                let mut char_score = 1;
204
205                // Sequential match bonus
206                char_score += seq_match_count * 5;
207
208                if target_char == query_char {
209                    // Same case bonus
210                    char_score += 1;
211                }
212
213                if i == 0 {
214                    // Start of target bonus
215                    char_score += 8;
216                } else {
217                    if matches!(target_char, '/' | '\\') {
218                        // Path separator bonus
219                        char_score += 5;
220                    } else if matches!(target_char, '_' | '-' | '.' | ' ' | '\'' | '"' | ':') {
221                        // Separator bonus
222                        char_score += 4;
223                    } else if seq_match_count == 0 {
224                        if i > 0
225                            && matches!(
226                                self.target_chars[i - 1],
227                                '_' | '-' | '.' | ' ' | '\'' | '"' | ':'
228                            )
229                        {
230                            // Start of word after separator bonus
231                            char_score += 2;
232                        } else if target_char.is_ascii() {
233                            // It is faster to check for ASCII first and then use
234                            // `is_ascii_uppercase` than to always use `is_uppercase`.
235                            if target_char.is_ascii_uppercase() {
236                                // Start of word bonus
237                                char_score += 2;
238                            }
239                        } else if target_char.is_uppercase() {
240                            // Start of word bonus
241                            char_score += 2;
242                        }
243                    }
244                }
245
246                if i + 1 == self.target_chars.len() {
247                    // End of target bonus
248                    char_score += 2;
249                }
250
251                // Compute new score and check if it's improved
252                let new_score = prev_query_score + char_score;
253                if new_score >= prev_target_score {
254                    // Score is at least the previous score, keep sequential match going
255                    self.score[i] = new_score;
256                    self.seq_match_counts[i] = seq_match_count + 1;
257                    if first_nonzero_score.is_none() {
258                        first_nonzero_score = Some(i);
259                    }
260                } else {
261                    // Score is lower than the previous score, don't use this match
262                    self.score[i] = prev_target_score;
263                }
264            }
265
266            if let Some(first_nonzero_score) = first_nonzero_score {
267                // Start the next character's matching at the character following the one that
268                // first set a valid score.
269                first_possible_target_idx = first_nonzero_score + 1;
270
271                // Keep scores and sequential match information for this character in the query
272                // for lookup during the next character.
273                (&mut self.prev_score[first_nonzero_score..self.target_chars.len()])
274                    .copy_from_slice(&self.score[first_nonzero_score..self.target_chars.len()]);
275                (&mut self.prev_seq_match_counts[first_nonzero_score..self.target_chars.len()])
276                    .copy_from_slice(
277                        &self.seq_match_counts[first_nonzero_score..self.target_chars.len()],
278                    );
279                first_query_char = false;
280            } else {
281                // If the all scores are zero, we already know we don't have a match. Exit early
282                // in this case.
283                return None;
284            }
285        }
286
287        // Final score will always be in the last slot of the final score vector
288        let score = *self.prev_score.last().unwrap_or(&0);
289        if score == 0 {
290            // Score of zero is not a match
291            None
292        } else {
293            Some(score)
294        }
295    }
296}
297
298/// Fuzzy match a string against a query string. Returns a score that is higher for
299/// a more confident match, or `None` if the query does not match the target string.
300///
301/// When performing a large batch of fuzzy matches, use [`FuzzyMatcher`] instead.
302///
303/// # Examples
304///
305/// ```
306/// let matches = code_fuzzy_match::fuzzy_match("the quick brown fox", "bro fox");
307/// assert!(matches.is_some());
308/// let no_match = code_fuzzy_match::fuzzy_match("the quick brown fox", "cat");
309/// assert!(no_match.is_none());
310///
311/// let high_score = code_fuzzy_match::fuzzy_match("Example string", "example");
312/// let lower_score = code_fuzzy_match::fuzzy_match("Example string", "str");
313/// assert!(high_score.unwrap() > lower_score.unwrap());
314/// ```
315pub fn fuzzy_match(target: &str, query: &str) -> Option<usize> {
316    let mut matcher = FuzzyMatcher::new();
317    matcher.fuzzy_match(target, query)
318}
319
320#[cfg(test)]
321mod tests {
322    use alloc::vec::Vec;
323
324    #[test]
325    fn test_match() {
326        let result = crate::fuzzy_match("The quick brown fox jumps over the lazy dog.", "fox");
327        assert!(result.is_some());
328        let result = crate::fuzzy_match(
329            "The quick brown fox jumps over the lazy dog.",
330            "Quick fox jumps the dog",
331        );
332        assert!(result.is_some());
333    }
334
335    #[test]
336    fn test_no_match() {
337        let result = crate::fuzzy_match("The quick brown fox jumps over the lazy dog.", "cat");
338        assert!(result.is_none());
339        let result = crate::fuzzy_match(
340            "The quick brown fox jumps over the lazy dog.",
341            "Quick fox jumps the cat",
342        );
343        assert!(result.is_none());
344    }
345
346    #[test]
347    fn test_ranking() {
348        const TARGET: &str = "The quick brown fox jumps over the lazy dog.";
349        const QUERIES: &[&str] = &[
350            "fx",           // Match short word in the middle, omitted letter
351            "fox",          // Match short word in the middle
352            "jump",         // Match word in the middle
353            "JUMP",         // Match word but not a case match, lower than "jump"
354            "The",          // Short match on first word
355            "the",          // Matches first word but not a case match, lower than "The"
356            "fx over",      // Match with omitted letter
357            "quick cat",    // Not a match, last word not present
358            "The quick",    // Long case match at the start, this should be highest
359            "the quick",    // Long match at the start, this should be just below "The quick"
360            "jump the dog", // Long match, high because of three exact word matches
361            "jmp the do",   // Match, but not as high as "jump the dog"
362            "jmp the cat",  // Not a match, last word not present
363            "dog the fox",  // Not a match, out of order
364            "het",          // Matches part of "the" and then a later "t" but should be low rank
365            "xz",           // Letters are in order but not related, low rank
366            "xx",           // Not a match, letter is present but only once
367            "ee",           // Match, letter is present twice in the target
368        ];
369
370        // Gather results for each query
371        let mut results = QUERIES
372            .iter()
373            .map(|query| (query, crate::fuzzy_match(TARGET, query)))
374            .collect::<Vec<_>>();
375
376        // Get rid of anything that isn't a match
377        results.retain(|(_, result)| result.is_some());
378
379        // Sort by score
380        results.sort_by_key(|(_, result)| result.unwrap());
381
382        // Validate results
383        assert_eq!(
384            results.iter().map(|(query, _)| *query).collect::<Vec<_>>(),
385            &[
386                &"xz",
387                &"ee",
388                &"fx",
389                &"het",
390                &"fox",
391                &"the",
392                &"The",
393                &"JUMP",
394                &"jump",
395                &"fx over",
396                &"jmp the do",
397                &"jump the dog",
398                &"the quick",
399                &"The quick",
400            ]
401        );
402    }
403
404    #[test]
405    fn test_slash() {
406        let result = crate::fuzzy_match("/bin/ls", "/ls");
407        assert!(result.is_some());
408        let result = crate::fuzzy_match("/bin/ls", "\\ls");
409        assert!(result.is_some());
410        let result = crate::fuzzy_match("c:\\windows\\notepad.exe", "/windows");
411        assert!(result.is_some());
412        let result = crate::fuzzy_match("c:\\windows\\notepad.exe", "\\windows");
413        assert!(result.is_some());
414    }
415
416    #[test]
417    fn test_word_bonus() {
418        let higher = crate::fuzzy_match("words with spaces", "spa");
419        let lower = crate::fuzzy_match("words with spaces", "pac");
420        assert!(higher.is_some());
421        assert!(lower.is_some());
422        assert!(
423            higher.unwrap() > lower.unwrap(),
424            "higher = {:?}, lower = {:?}",
425            higher,
426            lower
427        );
428
429        let higher = crate::fuzzy_match("words_with_underscores", "und");
430        let lower = crate::fuzzy_match("words_with_underscores", "nde");
431        assert!(higher.is_some());
432        assert!(lower.is_some());
433        assert!(
434            higher.unwrap() > lower.unwrap(),
435            "higher = {:?}, lower = {:?}",
436            higher,
437            lower
438        );
439
440        let higher = crate::fuzzy_match("camelCaseWords", "Wor");
441        let lower = crate::fuzzy_match("camelCaseWords", "ord");
442        assert!(higher.is_some());
443        assert!(lower.is_some());
444        assert!(
445            higher.unwrap() > lower.unwrap(),
446            "higher = {:?}, lower = {:?}",
447            higher,
448            lower
449        );
450    }
451}