code_fuzzy_match/lib.rs
1//! Fuzzy string matching inspired by [Visual Studio Code](https://github.com/microsoft/vscode).
2//!
3//! The fuzzy matching algorithm used in this crate is optimized for use cases such as
4//! command palettes, quick file navigation, and code searching. It does not use Levenshtein
5//! distance, which is more suited to use cases like spell checking.
6//!
7//! The algorithm only allows matches where the characters in the query string are present and
8//! in the same order as the characters in the target string. All queries are substring queries,
9//! so it is not a major hit to the match score to search for a term in the middle of the target
10//! string. The algorithm prefers matches that are at the beginning of words in the target
11//! string, with words treated as they might appear in code (letters following a separator or
12//! in camel case are treated as a word). Sequential matches are also favored.
13//!
14//! This crate provides a [`FuzzyMatcher`] struct for batch processing in addition to a
15//! [`fuzzy_match`] function for matching a single item.
16//!
17//! # Example usage
18//!
19//! ```
20//! let mut matcher = code_fuzzy_match::FuzzyMatcher::new();
21//! let matches = matcher.fuzzy_match("the quick brown fox", "bro fox");
22//! assert!(matches.is_some());
23//! let no_match = matcher.fuzzy_match("the quick brown fox", "cat");
24//! assert!(no_match.is_none());
25//!
26//! let high_score = matcher.fuzzy_match("Example string", "example");
27//! let lower_score = matcher.fuzzy_match("Example string", "str");
28//! assert!(high_score.unwrap() > lower_score.unwrap());
29//! ```
30
31#![no_std]
32
33extern crate alloc;
34use alloc::vec::Vec;
35
36/// Fuzzy matcher instance. Holds memory for the state of the fuzzy matcher so that
37/// large batches of queries can be processed with minimal allocations. When performing a
38/// large batch of fuzzy match queries, use a common instance of this struct to improve
39/// performance by avoiding extra allocations.
40pub struct FuzzyMatcher {
41 target_chars: Vec<char>,
42 first_possible_match: Vec<usize>,
43 prev_seq_match_counts: Vec<usize>,
44 prev_score: Vec<usize>,
45 seq_match_counts: Vec<usize>,
46 score: Vec<usize>,
47}
48
49fn char_matches(query_char: char, target_char: char) -> bool {
50 // Treat slashes and backslashes as the same character to be able to use as a path
51 // matching function.
52 match query_char {
53 '/' => matches!(target_char, '/' | '\\'),
54 '\\' => matches!(target_char, '/' | '\\'),
55 _ => {
56 // The `eq_ignore_ascii_case` function is *much* faster than a full
57 // Unicode case-insensitive comparison, so if the target character is
58 // ASCII, optimize for performance.
59 if query_char.is_ascii() {
60 query_char.eq_ignore_ascii_case(&target_char)
61 } else {
62 query_char
63 .to_lowercase()
64 .zip(target_char.to_lowercase())
65 .all(|(a, b)| a == b)
66 }
67 }
68 }
69}
70
71impl FuzzyMatcher {
72 /// Creates a new instance of a fuzzy matcher.
73 pub fn new() -> Self {
74 FuzzyMatcher {
75 target_chars: Vec::new(),
76 first_possible_match: Vec::new(),
77 prev_seq_match_counts: Vec::new(),
78 prev_score: Vec::new(),
79 seq_match_counts: Vec::new(),
80 score: Vec::new(),
81 }
82 }
83
84 /// Fuzzy match a string against a query string. Returns a score that is higher for
85 /// a more confident match, or `None` if the query does not match the target string.
86 ///
87 /// # Examples
88 ///
89 /// ```
90 /// let mut matcher = code_fuzzy_match::FuzzyMatcher::new();
91 /// let matches = matcher.fuzzy_match("the quick brown fox", "bro fox");
92 /// assert!(matches.is_some());
93 /// let no_match = matcher.fuzzy_match("the quick brown fox", "cat");
94 /// assert!(no_match.is_none());
95 ///
96 /// let high_score = matcher.fuzzy_match("Example string", "example");
97 /// let lower_score = matcher.fuzzy_match("Example string", "str");
98 /// assert!(high_score.unwrap() > lower_score.unwrap());
99 /// ```
100 pub fn fuzzy_match(&mut self, target: &str, query: &str) -> Option<usize> {
101 // Break the target string into a vector of characters, since we need to manage
102 // parallel vectors with information per character. Match query string characters
103 // along the way to perform an early exit if the query string definitely does not
104 // match, as well as computing the earliest possible index for each given query
105 // character.
106 self.target_chars.clear();
107 self.target_chars.extend(target.chars());
108 self.first_possible_match.clear();
109 let mut query_chars = query.chars();
110 let mut cur_query_char = query_chars.next();
111 for (target_idx, target_char) in target.chars().enumerate() {
112 if let Some(query_char) = cur_query_char {
113 if char_matches(query_char, target_char) {
114 self.first_possible_match.push(target_idx);
115 cur_query_char = query_chars.next();
116 }
117 }
118 }
119
120 // If we didn't consume all query characters, then the query is not a match.
121 if cur_query_char.is_some() {
122 return None;
123 }
124
125 debug_assert_eq!(target.chars().count(), self.target_chars.len());
126 debug_assert_eq!(query.chars().count(), self.first_possible_match.len());
127
128 // Create vectors holding the score and sequential counts for two query characters.
129 // This algorithm implements a matrix-based method of fuzzy matching, but we don't
130 // need to hold the entire matrix in memory, just the current and previous rows.
131 self.prev_seq_match_counts.clear();
132 self.prev_score.clear();
133 self.prev_seq_match_counts
134 .resize(self.target_chars.len(), 0);
135 self.prev_score.resize(self.target_chars.len(), 0);
136
137 self.seq_match_counts.clear();
138 self.score.clear();
139 self.seq_match_counts.resize(self.target_chars.len(), 0);
140 self.score.resize(self.target_chars.len(), 0);
141
142 let mut first_possible_target_idx: usize = 0;
143
144 // Compute match scores for each query character in sequence
145 let mut first_query_char = true;
146 for (query_char, first_possible_match) in
147 query.chars().zip(self.first_possible_match.iter())
148 {
149 // If the starting point of the search is beyond the end of the target string,
150 // we can't have a match.
151 if first_possible_target_idx >= self.target_chars.len() {
152 return None;
153 }
154
155 // If the initial scan saw that the first possible match for this query character
156 // is later in the string, use that instead.
157 first_possible_target_idx = first_possible_target_idx.max(*first_possible_match);
158
159 // Reset vector holding the score and sequential counts for this query character.
160 // This algorithm implements a matrix-based method of fuzzy matching, but we don't
161 // need to hold the entire matrix in memory, just the current and previous rows.
162 (&mut self.seq_match_counts[first_possible_target_idx..self.target_chars.len()])
163 .fill(0);
164 (&mut self.score[first_possible_target_idx..self.target_chars.len()]).fill(0);
165
166 let mut first_nonzero_score = None;
167
168 // Compute match scores for each target character in sequence, for this query character.
169 // Start at the character after the previous earliest character that had a score. Any
170 // character before that cannot have a score, so we don't need to check those.
171 for i in first_possible_target_idx..self.target_chars.len() {
172 // Get characters and the score for the previous character in the target
173 let target_char = self.target_chars[i];
174 let prev_target_score = if i == first_possible_target_idx {
175 0
176 } else {
177 self.score[i - 1]
178 };
179
180 // Previous score and sequential match count comes from the previous character
181 // in both the target and the query
182 let prev_query_score = if i == 0 { 0 } else { self.prev_score[i - 1] };
183 let seq_match_count = if i == 0 {
184 0
185 } else {
186 self.prev_seq_match_counts[i - 1]
187 };
188
189 if !first_query_char && prev_query_score == 0 {
190 self.score[i] = prev_target_score;
191 continue;
192 }
193
194 // Check to ensure the characters match at all.
195 if !char_matches(query_char, target_char) {
196 // No match, use existing score and reset sequential count
197 self.score[i] = prev_target_score;
198 continue;
199 }
200
201 // Compute score for this character match. These bonuses are inspired by
202 // the algorithm used by Visual Studio Code.
203 let mut char_score = 1;
204
205 // Sequential match bonus
206 char_score += seq_match_count * 5;
207
208 if target_char == query_char {
209 // Same case bonus
210 char_score += 1;
211 }
212
213 if i == 0 {
214 // Start of target bonus
215 char_score += 8;
216 } else {
217 if matches!(target_char, '/' | '\\') {
218 // Path separator bonus
219 char_score += 5;
220 } else if matches!(target_char, '_' | '-' | '.' | ' ' | '\'' | '"' | ':') {
221 // Separator bonus
222 char_score += 4;
223 } else if seq_match_count == 0 {
224 if i > 0
225 && matches!(
226 self.target_chars[i - 1],
227 '_' | '-' | '.' | ' ' | '\'' | '"' | ':'
228 )
229 {
230 // Start of word after separator bonus
231 char_score += 2;
232 } else if target_char.is_ascii() {
233 // It is faster to check for ASCII first and then use
234 // `is_ascii_uppercase` than to always use `is_uppercase`.
235 if target_char.is_ascii_uppercase() {
236 // Start of word bonus
237 char_score += 2;
238 }
239 } else if target_char.is_uppercase() {
240 // Start of word bonus
241 char_score += 2;
242 }
243 }
244 }
245
246 if i + 1 == self.target_chars.len() {
247 // End of target bonus
248 char_score += 2;
249 }
250
251 // Compute new score and check if it's improved
252 let new_score = prev_query_score + char_score;
253 if new_score >= prev_target_score {
254 // Score is at least the previous score, keep sequential match going
255 self.score[i] = new_score;
256 self.seq_match_counts[i] = seq_match_count + 1;
257 if first_nonzero_score.is_none() {
258 first_nonzero_score = Some(i);
259 }
260 } else {
261 // Score is lower than the previous score, don't use this match
262 self.score[i] = prev_target_score;
263 }
264 }
265
266 if let Some(first_nonzero_score) = first_nonzero_score {
267 // Start the next character's matching at the character following the one that
268 // first set a valid score.
269 first_possible_target_idx = first_nonzero_score + 1;
270
271 // Keep scores and sequential match information for this character in the query
272 // for lookup during the next character.
273 (&mut self.prev_score[first_nonzero_score..self.target_chars.len()])
274 .copy_from_slice(&self.score[first_nonzero_score..self.target_chars.len()]);
275 (&mut self.prev_seq_match_counts[first_nonzero_score..self.target_chars.len()])
276 .copy_from_slice(
277 &self.seq_match_counts[first_nonzero_score..self.target_chars.len()],
278 );
279 first_query_char = false;
280 } else {
281 // If the all scores are zero, we already know we don't have a match. Exit early
282 // in this case.
283 return None;
284 }
285 }
286
287 // Final score will always be in the last slot of the final score vector
288 let score = *self.prev_score.last().unwrap_or(&0);
289 if score == 0 {
290 // Score of zero is not a match
291 None
292 } else {
293 Some(score)
294 }
295 }
296}
297
298/// Fuzzy match a string against a query string. Returns a score that is higher for
299/// a more confident match, or `None` if the query does not match the target string.
300///
301/// When performing a large batch of fuzzy matches, use [`FuzzyMatcher`] instead.
302///
303/// # Examples
304///
305/// ```
306/// let matches = code_fuzzy_match::fuzzy_match("the quick brown fox", "bro fox");
307/// assert!(matches.is_some());
308/// let no_match = code_fuzzy_match::fuzzy_match("the quick brown fox", "cat");
309/// assert!(no_match.is_none());
310///
311/// let high_score = code_fuzzy_match::fuzzy_match("Example string", "example");
312/// let lower_score = code_fuzzy_match::fuzzy_match("Example string", "str");
313/// assert!(high_score.unwrap() > lower_score.unwrap());
314/// ```
315pub fn fuzzy_match(target: &str, query: &str) -> Option<usize> {
316 let mut matcher = FuzzyMatcher::new();
317 matcher.fuzzy_match(target, query)
318}
319
320#[cfg(test)]
321mod tests {
322 use alloc::vec::Vec;
323
324 #[test]
325 fn test_match() {
326 let result = crate::fuzzy_match("The quick brown fox jumps over the lazy dog.", "fox");
327 assert!(result.is_some());
328 let result = crate::fuzzy_match(
329 "The quick brown fox jumps over the lazy dog.",
330 "Quick fox jumps the dog",
331 );
332 assert!(result.is_some());
333 }
334
335 #[test]
336 fn test_no_match() {
337 let result = crate::fuzzy_match("The quick brown fox jumps over the lazy dog.", "cat");
338 assert!(result.is_none());
339 let result = crate::fuzzy_match(
340 "The quick brown fox jumps over the lazy dog.",
341 "Quick fox jumps the cat",
342 );
343 assert!(result.is_none());
344 }
345
346 #[test]
347 fn test_ranking() {
348 const TARGET: &str = "The quick brown fox jumps over the lazy dog.";
349 const QUERIES: &[&str] = &[
350 "fx", // Match short word in the middle, omitted letter
351 "fox", // Match short word in the middle
352 "jump", // Match word in the middle
353 "JUMP", // Match word but not a case match, lower than "jump"
354 "The", // Short match on first word
355 "the", // Matches first word but not a case match, lower than "The"
356 "fx over", // Match with omitted letter
357 "quick cat", // Not a match, last word not present
358 "The quick", // Long case match at the start, this should be highest
359 "the quick", // Long match at the start, this should be just below "The quick"
360 "jump the dog", // Long match, high because of three exact word matches
361 "jmp the do", // Match, but not as high as "jump the dog"
362 "jmp the cat", // Not a match, last word not present
363 "dog the fox", // Not a match, out of order
364 "het", // Matches part of "the" and then a later "t" but should be low rank
365 "xz", // Letters are in order but not related, low rank
366 "xx", // Not a match, letter is present but only once
367 "ee", // Match, letter is present twice in the target
368 ];
369
370 // Gather results for each query
371 let mut results = QUERIES
372 .iter()
373 .map(|query| (query, crate::fuzzy_match(TARGET, query)))
374 .collect::<Vec<_>>();
375
376 // Get rid of anything that isn't a match
377 results.retain(|(_, result)| result.is_some());
378
379 // Sort by score
380 results.sort_by_key(|(_, result)| result.unwrap());
381
382 // Validate results
383 assert_eq!(
384 results.iter().map(|(query, _)| *query).collect::<Vec<_>>(),
385 &[
386 &"xz",
387 &"ee",
388 &"fx",
389 &"het",
390 &"fox",
391 &"the",
392 &"The",
393 &"JUMP",
394 &"jump",
395 &"fx over",
396 &"jmp the do",
397 &"jump the dog",
398 &"the quick",
399 &"The quick",
400 ]
401 );
402 }
403
404 #[test]
405 fn test_slash() {
406 let result = crate::fuzzy_match("/bin/ls", "/ls");
407 assert!(result.is_some());
408 let result = crate::fuzzy_match("/bin/ls", "\\ls");
409 assert!(result.is_some());
410 let result = crate::fuzzy_match("c:\\windows\\notepad.exe", "/windows");
411 assert!(result.is_some());
412 let result = crate::fuzzy_match("c:\\windows\\notepad.exe", "\\windows");
413 assert!(result.is_some());
414 }
415
416 #[test]
417 fn test_word_bonus() {
418 let higher = crate::fuzzy_match("words with spaces", "spa");
419 let lower = crate::fuzzy_match("words with spaces", "pac");
420 assert!(higher.is_some());
421 assert!(lower.is_some());
422 assert!(
423 higher.unwrap() > lower.unwrap(),
424 "higher = {:?}, lower = {:?}",
425 higher,
426 lower
427 );
428
429 let higher = crate::fuzzy_match("words_with_underscores", "und");
430 let lower = crate::fuzzy_match("words_with_underscores", "nde");
431 assert!(higher.is_some());
432 assert!(lower.is_some());
433 assert!(
434 higher.unwrap() > lower.unwrap(),
435 "higher = {:?}, lower = {:?}",
436 higher,
437 lower
438 );
439
440 let higher = crate::fuzzy_match("camelCaseWords", "Wor");
441 let lower = crate::fuzzy_match("camelCaseWords", "ord");
442 assert!(higher.is_some());
443 assert!(lower.is_some());
444 assert!(
445 higher.unwrap() > lower.unwrap(),
446 "higher = {:?}, lower = {:?}",
447 higher,
448 lower
449 );
450 }
451}