1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
//! Fuzzy string matching inspired by [Visual Studio Code](https://github.com/microsoft/vscode).
//!
//! The fuzzy matching algorithm used in this crate is optimized for use cases such as
//! command palettes, quick file navigation, and code searching. It does not use Levenshtein
//! distance, which is more suited to use cases like spell checking.
//!
//! The algorithm only allows matches where the characters in the query string are present and
//! in the same order as the characters in the target string. All queries are substring queries,
//! so it is not a major hit to the match score to search for a term in the middle of the target
//! string. The algorithm prefers matches that are at the beginning of words in the target
//! string, with words treated as they might appear in code (letters following a separator or
//! in camel case are treated as a word). Sequential matches are also favored.
//!
//! This crate provides a [`FuzzyMatcher`] struct for batch processing in addition to a
//! [`fuzzy_match`] function for matching a single item.
//!
//! # Example usage
//!
//! ```
//! let mut matcher = code_fuzzy_match::FuzzyMatcher::new();
//! let matches = matcher.fuzzy_match("the quick brown fox", "bro fox");
//! assert!(matches.is_some());
//! let no_match = matcher.fuzzy_match("the quick brown fox", "cat");
//! assert!(no_match.is_none());
//!
//! let high_score = matcher.fuzzy_match("Example string", "example");
//! let lower_score = matcher.fuzzy_match("Example string", "str");
//! assert!(high_score.unwrap() > lower_score.unwrap());
//! ```

#![no_std]

extern crate alloc;
use alloc::vec::Vec;

/// Fuzzy matcher instance. Holds memory for the state of the fuzzy matcher so that
/// large batches of queries can be processed with minimal allocations. When performing a
/// large batch of fuzzy match queries, use a common instance of this struct to improve
/// performance by avoiding extra allocations.
pub struct FuzzyMatcher {
    target_chars: Vec<char>,
    prev_seq_match_counts: Vec<usize>,
    prev_score: Vec<usize>,
    seq_match_counts: Vec<usize>,
    score: Vec<usize>,
}

impl FuzzyMatcher {
    /// Creates a new instance of a fuzzy matcher.
    pub fn new() -> Self {
        FuzzyMatcher {
            target_chars: Vec::new(),
            prev_seq_match_counts: Vec::new(),
            prev_score: Vec::new(),
            seq_match_counts: Vec::new(),
            score: Vec::new(),
        }
    }

    /// Fuzzy match a string against a query string. Returns a score that is higher for
    /// a more confident match, or `None` if the query does not match the target string.
    ///
    /// # Examples
    ///
    /// ```
    /// let mut matcher = code_fuzzy_match::FuzzyMatcher::new();
    /// let matches = matcher.fuzzy_match("the quick brown fox", "bro fox");
    /// assert!(matches.is_some());
    /// let no_match = matcher.fuzzy_match("the quick brown fox", "cat");
    /// assert!(no_match.is_none());
    ///
    /// let high_score = matcher.fuzzy_match("Example string", "example");
    /// let lower_score = matcher.fuzzy_match("Example string", "str");
    /// assert!(high_score.unwrap() > lower_score.unwrap());
    /// ```
    pub fn fuzzy_match(&mut self, target: &str, query: &str) -> Option<usize> {
        // Break the target string into a vector of characters, since we need to manage
        // parallel vectors with information per character.
        self.target_chars.clear();
        self.target_chars.extend(target.chars());

        // Create vectors holding the score and sequential counts for two query characters.
        // This algorithm implements a matrix-based method of fuzzy matching, but we don't
        // need to hold the entire matrix in memory, just the current and previous rows.
        self.prev_seq_match_counts.clear();
        self.prev_score.clear();
        self.prev_seq_match_counts
            .resize(self.target_chars.len(), 0);
        self.prev_score.resize(self.target_chars.len(), 0);

        self.seq_match_counts.clear();
        self.score.clear();
        self.seq_match_counts.resize(self.target_chars.len(), 0);
        self.score.resize(self.target_chars.len(), 0);

        let mut first_possible_target_idx: usize = 0;

        // Compute match scores for each query character in sequence
        let mut first_query_char = true;
        for query_char in query.chars() {
            // If the starting point of the search is beyond the end of the target string,
            // we can't have a match.
            if first_possible_target_idx >= self.target_chars.len() {
                return None;
            }

            // Reset vector holding the score and sequential counts for this query character.
            // This algorithm implements a matrix-based method of fuzzy matching, but we don't
            // need to hold the entire matrix in memory, just the current and previous rows.
            (&mut self.seq_match_counts[first_possible_target_idx..self.target_chars.len()])
                .fill(0);
            (&mut self.score[first_possible_target_idx..self.target_chars.len()]).fill(0);

            let mut first_nonzero_score = None;
            let mut prev_is_separator = false;

            // Compute match scores for each target character in sequence, for this query character.
            // Start at the character after the previous earliest character that had a score. Any
            // character before that cannot have a score, so we don't need to check those.
            for i in first_possible_target_idx..self.target_chars.len() {
                // Get characters and the score for the previous character in the target
                let target_char = self.target_chars[i];
                let target_separator =
                    matches!(target_char, '_' | '-' | '.' | ' ' | '\'' | '"' | ':');
                let prev_target_score = if i == first_possible_target_idx {
                    0
                } else {
                    self.score[i - 1]
                };

                // Previous score and sequential match count comes from the previous character
                // in both the target and the query
                let prev_query_score = if i == 0 { 0 } else { self.prev_score[i - 1] };
                let seq_match_count = if i == 0 {
                    0
                } else {
                    self.prev_seq_match_counts[i - 1]
                };

                if !first_query_char && prev_query_score == 0 {
                    self.score[i] = prev_target_score;
                    prev_is_separator = target_separator;
                    continue;
                }

                // Check to ensure the characters match at all. Treat slashes and backslashes
                // as the same character to be able to use as a path matching function.
                let char_matches = match target_char {
                    '/' => matches!(query_char, '/' | '\\'),
                    '\\' => matches!(query_char, '/' | '\\'),
                    _ => {
                        // The `eq_ignore_ascii_case` function is *much* faster than a full
                        // Unicode case-insensitive comparison, so if the target character is
                        // ASCII, optimize for performance.
                        if target_char.is_ascii() {
                            target_char.eq_ignore_ascii_case(&query_char)
                        } else {
                            target_char
                                .to_lowercase()
                                .zip(query_char.to_lowercase())
                                .all(|(a, b)| a == b)
                        }
                    }
                };
                if !char_matches {
                    // No match, use existing score and reset sequential count
                    self.score[i] = prev_target_score;
                    prev_is_separator = target_separator;
                    continue;
                }

                // Compute score for this character match. These bonuses are inspired by
                // the algorithm used by Visual Studio Code.
                let mut char_score = 1;

                // Sequential match bonus
                char_score += seq_match_count * 5;

                if target_char == query_char {
                    // Same case bonus
                    char_score += 1;
                }

                if i == 0 {
                    // Start of target bonus
                    char_score += 8;
                } else {
                    if matches!(target_char, '/' | '\\') {
                        // Path separator bonus
                        char_score += 5;
                    } else if target_separator {
                        // Separator bonus
                        char_score += 4;
                    } else if seq_match_count == 0 {
                        if prev_is_separator {
                            // Start of word after separator bonus
                            char_score += 2;
                        } else if target_char.is_ascii() {
                            // It is faster to check for ASCII first and then use
                            // `is_ascii_uppercase` than to always use `is_uppercase`.
                            if target_char.is_ascii_uppercase() {
                                // Start of word bonus
                                char_score += 2;
                            }
                        } else if target_char.is_uppercase() {
                            // Start of word bonus
                            char_score += 2;
                        }
                    }
                }

                if i + 1 == self.target_chars.len() {
                    // End of target bonus
                    char_score += 2;
                }

                prev_is_separator = target_separator;

                // Compute new score and check if it's improved
                let new_score = prev_query_score + char_score;
                if new_score >= prev_target_score {
                    // Score is at least the previous score, keep sequential match going
                    self.score[i] = new_score;
                    self.seq_match_counts[i] = seq_match_count + 1;
                    if first_nonzero_score.is_none() {
                        first_nonzero_score = Some(i);
                    }
                } else {
                    // Score is lower than the previous score, don't use this match
                    self.score[i] = prev_target_score;
                }
            }

            if let Some(first_nonzero_score) = first_nonzero_score {
                // Start the next character's matching at the character following the one that
                // first set a valid score.
                first_possible_target_idx = first_nonzero_score + 1;

                // Keep scores and sequential match information for this character in the query
                // for lookup during the next character.
                (&mut self.prev_score[first_nonzero_score..self.target_chars.len()])
                    .copy_from_slice(&self.score[first_nonzero_score..self.target_chars.len()]);
                (&mut self.prev_seq_match_counts[first_nonzero_score..self.target_chars.len()])
                    .copy_from_slice(
                        &self.seq_match_counts[first_nonzero_score..self.target_chars.len()],
                    );
                first_query_char = false;
            } else {
                // If the all scores are zero, we already know we don't have a match. Exit early
                // in this case.
                return None;
            }
        }

        // Final score will always be in the last slot of the final score vector
        let score = *self.prev_score.last().unwrap_or(&0);
        if score == 0 {
            // Score of zero is not a match
            None
        } else {
            Some(score)
        }
    }
}

/// Fuzzy match a string against a query string. Returns a score that is higher for
/// a more confident match, or `None` if the query does not match the target string.
///
/// When performing a large batch of fuzzy matches, use [`FuzzyMatcher`] instead.
///
/// # Examples
///
/// ```
/// let matches = code_fuzzy_match::fuzzy_match("the quick brown fox", "bro fox");
/// assert!(matches.is_some());
/// let no_match = code_fuzzy_match::fuzzy_match("the quick brown fox", "cat");
/// assert!(no_match.is_none());
///
/// let high_score = code_fuzzy_match::fuzzy_match("Example string", "example");
/// let lower_score = code_fuzzy_match::fuzzy_match("Example string", "str");
/// assert!(high_score.unwrap() > lower_score.unwrap());
/// ```
pub fn fuzzy_match(target: &str, query: &str) -> Option<usize> {
    let mut matcher = FuzzyMatcher::new();
    matcher.fuzzy_match(target, query)
}

#[cfg(test)]
mod tests {
    use alloc::vec::Vec;

    #[test]
    fn test_match() {
        let result = crate::fuzzy_match("The quick brown fox jumps over the lazy dog.", "fox");
        assert!(result.is_some());
        let result = crate::fuzzy_match(
            "The quick brown fox jumps over the lazy dog.",
            "Quick fox jumps the dog",
        );
        assert!(result.is_some());
    }

    #[test]
    fn test_no_match() {
        let result = crate::fuzzy_match("The quick brown fox jumps over the lazy dog.", "cat");
        assert!(result.is_none());
        let result = crate::fuzzy_match(
            "The quick brown fox jumps over the lazy dog.",
            "Quick fox jumps the cat",
        );
        assert!(result.is_none());
    }

    #[test]
    fn test_ranking() {
        const TARGET: &str = "The quick brown fox jumps over the lazy dog.";
        const QUERIES: &[&str] = &[
            "fx",           // Match short word in the middle, omitted letter
            "fox",          // Match short word in the middle
            "jump",         // Match word in the middle
            "JUMP",         // Match word but not a case match, lower than "jump"
            "The",          // Short match on first word
            "the",          // Matches first word but not a case match, lower than "The"
            "fx over",      // Match with omitted letter
            "quick cat",    // Not a match, last word not present
            "The quick",    // Long case match at the start, this should be highest
            "the quick",    // Long match at the start, this should be just below "The quick"
            "jump the dog", // Long match, high because of three exact word matches
            "jmp the do",   // Match, but not as high as "jump the dog"
            "jmp the cat",  // Not a match, last word not present
            "dog the fox",  // Not a match, out of order
            "het",          // Matches part of "the" and then a later "t" but should be low rank
            "xz",           // Letters are in order but not related, low rank
            "xx",           // Not a match, letter is present but only once
            "ee",           // Match, letter is present twice in the target
        ];

        // Gather results for each query
        let mut results = QUERIES
            .iter()
            .map(|query| (query, crate::fuzzy_match(TARGET, query)))
            .collect::<Vec<_>>();

        // Get rid of anything that isn't a match
        results.retain(|(_, result)| result.is_some());

        // Sort by score
        results.sort_by_key(|(_, result)| result.unwrap());

        // Validate results
        assert_eq!(
            results.iter().map(|(query, _)| *query).collect::<Vec<_>>(),
            &[
                &"xz",
                &"ee",
                &"fx",
                &"het",
                &"fox",
                &"the",
                &"The",
                &"JUMP",
                &"jump",
                &"fx over",
                &"jmp the do",
                &"jump the dog",
                &"the quick",
                &"The quick",
            ]
        );
    }

    #[test]
    fn test_slash() {
        let result = crate::fuzzy_match("/bin/ls", "/ls");
        assert!(result.is_some());
        let result = crate::fuzzy_match("/bin/ls", "\\ls");
        assert!(result.is_some());
        let result = crate::fuzzy_match("c:\\windows\\notepad.exe", "/windows");
        assert!(result.is_some());
        let result = crate::fuzzy_match("c:\\windows\\notepad.exe", "\\windows");
        assert!(result.is_some());
    }

    #[test]
    fn test_word_bonus() {
        let higher = crate::fuzzy_match("words with spaces", "spa");
        let lower = crate::fuzzy_match("words with spaces", "pac");
        assert!(higher.is_some());
        assert!(lower.is_some());
        assert!(
            higher.unwrap() > lower.unwrap(),
            "higher = {:?}, lower = {:?}",
            higher,
            lower
        );

        let higher = crate::fuzzy_match("words_with_underscores", "und");
        let lower = crate::fuzzy_match("words_with_underscores", "nde");
        assert!(higher.is_some());
        assert!(lower.is_some());
        assert!(
            higher.unwrap() > lower.unwrap(),
            "higher = {:?}, lower = {:?}",
            higher,
            lower
        );

        let higher = crate::fuzzy_match("camelCaseWords", "Wor");
        let lower = crate::fuzzy_match("camelCaseWords", "ord");
        assert!(higher.is_some());
        assert!(lower.is_some());
        assert!(
            higher.unwrap() > lower.unwrap(),
            "higher = {:?}, lower = {:?}",
            higher,
            lower
        );
    }
}