Skip to main content

fresh/input/fuzzy/
matcher.rs

1//! The core fuzzy-matching algorithm.
2//!
3//! Given a [`PreparedPattern`] and a target string, this module scores the
4//! match (or rejects it) using two complementary strategies:
5//!
6//! 1. A **DP pass** ([`find_best_match`]) that tracks the highest-scoring
7//!    way to interleave query characters into the target, rewarding
8//!    consecutive matches, word boundaries, camelCase transitions, etc.
9//!    The DP uses an arena of backpointer nodes instead of cloning
10//!    position vectors, keeping each state update O(1).
11//!
12//! 2. A **contiguous-substring pass** ([`find_contiguous_match`]) that
13//!    finds literal-substring occurrences of the query and scores them.
14//!    The DP sometimes prefers scattered matches with more word-boundary
15//!    bonuses, so we always compare with the substring result and take
16//!    whichever scores higher.
17//!
18//! Early rejection via [`super::pattern::is_subsequence_prepared`] happens
19//! before any target-side allocation, so the heavy lifting here only runs
20//! for candidates that could plausibly match.
21//!
22//! # Amortised hot path via [`FuzzyMatcher`]
23//!
24//! A [`FuzzyMatcher`] owns a [`PreparedPattern`] *and* two reusable
25//! `Vec<char>` scratch buffers for the target.  Callers construct one per
26//! keystroke and reuse it across all candidate targets:
27//!
28//! ```ignore
29//! let mut matcher = FuzzyMatcher::new(search_query);
30//! for file in files {
31//!     let m = matcher.match_target(&file.relative_path);
32//!     // ...
33//! }
34//! ```
35//!
36//! On the first call the scratch buffers grow to the size of the longest
37//! target seen; every subsequent call truncates and refills them in place,
38//! so the allocator is touched once per keystroke rather than once per
39//! candidate.  This is the explicit alternative to stashing the scratch in
40//! `thread_local!` state — lifetimes are visible in the signature and
41//! future parallelisation can give each worker its own matcher.
42
43use super::pattern::{is_subsequence_prepared, PreparedPattern, PreparedTerm};
44use super::{score, FuzzyMatch};
45
46/// A reusable fuzzy matcher that amortises both query preparation and
47/// target-side scratch allocation across many calls.
48///
49/// See the module docs for usage.
50#[derive(Debug, Clone)]
51pub struct FuzzyMatcher {
52    pattern: PreparedPattern,
53    /// Scratch buffer: `target.chars().collect()`, cleared and refilled per call.
54    target_chars: Vec<char>,
55    /// Scratch buffer: lowercased target chars, cleared and refilled per call.
56    target_lower: Vec<char>,
57}
58
59impl FuzzyMatcher {
60    /// Create a new matcher for the given query, with empty scratch buffers.
61    ///
62    /// The scratch buffers start empty and grow lazily on the first
63    /// [`match_target`](Self::match_target) call that gets past rejection.
64    pub fn new(query: &str) -> Self {
65        Self::from_pattern(PreparedPattern::new(query))
66    }
67
68    /// Construct a matcher from an already-built [`PreparedPattern`].
69    pub fn from_pattern(pattern: PreparedPattern) -> Self {
70        Self {
71            pattern,
72            target_chars: Vec::new(),
73            target_lower: Vec::new(),
74        }
75    }
76
77    /// Returns `true` if the prepared query has no terms (empty / whitespace).
78    pub fn is_empty(&self) -> bool {
79        self.pattern.is_empty()
80    }
81
82    /// Match a single target string against this matcher's query.
83    ///
84    /// Reuses the internal `Vec<char>` scratch buffers — no per-call
85    /// allocations on the common "passed rejection → scoring" path after
86    /// the first call has warmed them up.
87    pub fn match_target(&mut self, target: &str) -> FuzzyMatch {
88        if self.pattern.is_empty() {
89            return FuzzyMatch {
90                matched: true,
91                score: 0,
92                match_positions: Vec::new(),
93            };
94        }
95
96        // Fast rejection gate.  Runs entirely on `&str`, no allocation.
97        // In the multi-term case we must pass the gate for *every* term.
98        for term in &self.pattern.terms {
99            if !is_subsequence_prepared(term, target) {
100                return FuzzyMatch::no_match();
101            }
102        }
103
104        // Refill scratch buffers in place.  No `malloc`/`realloc` on
105        // steady-state — the existing capacity is reused after the first
106        // call that made them large enough for the corpus's worst case.
107        refill_target(&mut self.target_chars, &mut self.target_lower, target);
108
109        // Split the borrow so the scoring helpers can see both the pattern
110        // and the scratch buffers simultaneously without fighting the
111        // borrow checker.
112        let pattern = &self.pattern;
113        let target_chars: &[char] = &self.target_chars;
114        let target_lower: &[char] = &self.target_lower;
115
116        if pattern.terms.len() > 1 {
117            score_multi_term(pattern, target_chars, target_lower)
118        } else {
119            score_single_term(&pattern.terms[0], target_chars, target_lower)
120        }
121    }
122}
123
124/// Entry point used by the backward-compatible free functions.
125///
126/// Creates a temporary `FuzzyMatcher` (which allocates empty scratch Vecs
127/// but does not grow them until first use), so callers using the legacy
128/// `fuzzy_match_prepared` API still work, just without cross-call
129/// amortisation.  Hot-path callers should construct their own
130/// `FuzzyMatcher` and reuse it.
131pub(super) fn match_prepared(pattern: &PreparedPattern, target: &str) -> FuzzyMatch {
132    let mut matcher = FuzzyMatcher::from_pattern(pattern.clone());
133    matcher.match_target(target)
134}
135
136/// Refill the scratch buffers with the raw and lowercased characters of
137/// `target`.  Grows the buffers if needed; otherwise reuses existing
138/// capacity, so steady-state calls do zero allocations.
139///
140/// Note: for non-ASCII chars with expanding case mappings (e.g. Turkish
141/// `İ` → `i` + combining dot), `target_lower` can be longer than
142/// `target_chars`.  This matches the semantics of the previous
143/// `target.to_lowercase().chars().collect()` implementation, including
144/// its latent indexing inconsistency — preserving behaviour is out of
145/// scope for this change.
146fn refill_target(target_chars: &mut Vec<char>, target_lower: &mut Vec<char>, target: &str) {
147    target_chars.clear();
148    target_lower.clear();
149    // `target.len()` is the byte length, which is an upper bound on the
150    // character count (chars are 1-4 bytes in UTF-8).  Reserving once
151    // avoids any growth checks during the push loop.
152    target_chars.reserve(target.len());
153    target_lower.reserve(target.len());
154    for c in target.chars() {
155        target_chars.push(c);
156        for lc in c.to_lowercase() {
157            target_lower.push(lc);
158        }
159    }
160}
161
162/// Score a single prepared term against a target whose char buffers have
163/// already been refilled in the caller's scratch space.
164fn score_single_term(
165    term: &PreparedTerm,
166    target_chars: &[char],
167    target_lower: &[char],
168) -> FuzzyMatch {
169    let query_lower = &term.lower_chars;
170    let query_len = query_lower.len();
171    let target_len = target_lower.len();
172
173    // Try to find the best matching positions using a DP approach.
174    let dp_result = find_best_match(query_lower, target_chars, target_lower);
175
176    // Also check for a contiguous substring match.  The DP may miss this
177    // because it optimises per-character bonuses (word boundaries, etc.)
178    // which can favour scattered matches over a tight substring.
179    let substr_result = find_contiguous_match(query_lower, target_chars, target_lower);
180
181    // Pick the better result.
182    let (positions, mut final_score) = match (dp_result, substr_result) {
183        (Some(dp), Some(sub)) => {
184            if sub.1 >= dp.1 {
185                sub
186            } else {
187                dp
188            }
189        }
190        (Some(dp), None) => dp,
191        (None, Some(sub)) => sub,
192        (None, None) => return FuzzyMatch::no_match(),
193    };
194
195    // Check if all matched positions are consecutive (contiguous substring).
196    let is_contiguous =
197        positions.len() == query_len && positions.windows(2).all(|w| w[1] == w[0] + 1);
198
199    if is_contiguous {
200        final_score += score::CONTIGUOUS_SUBSTRING;
201    }
202
203    // Exact match bonus: query matches entire target.
204    let mut credited_full_path_prefix = false;
205    if query_len == target_len {
206        final_score += score::EXACT_MATCH;
207    } else if target_len > query_len && is_contiguous {
208        // Check if the query is a prefix match (all consecutive from start).
209        let is_prefix_match = positions.iter().enumerate().all(|(i, &pos)| pos == i);
210
211        if is_prefix_match && query_len < target_chars.len() {
212            let next_char = target_chars[query_len];
213            if next_char == '.' {
214                // Highest priority: exact basename match (before extension).
215                final_score += score::EXACT_MATCH;
216                credited_full_path_prefix = true;
217            } else if next_char == '-' || next_char == '_' || next_char == ' ' {
218                // Second priority: match before word separator.
219                final_score += score::EXACT_BASENAME_MATCH;
220                credited_full_path_prefix = true;
221            }
222        }
223    }
224
225    // Path-segment prefix bonus.  A contiguous match that begins at the start
226    // of a path segment (basename or interior directory) outranks an equally
227    // contiguous match that lands mid-segment.  This is what users expect
228    // when typing a short query like "ts": a basename starting with "ts"
229    // (e.g. `tsconfig.json`) should rank above unrelated files whose only
230    // `ts` happens to follow an extension dot (e.g. `pkg.ts`, `finder.ts`).
231    //
232    // Skipped when the existing `EXACT_*` block above already credited a
233    // full-path prefix that ends in a separator — those matches are already
234    // in the top tier and would be double-counted otherwise.
235    if is_contiguous
236        && !credited_full_path_prefix
237        && !positions.is_empty()
238        && positions.len() == query_len
239    {
240        let start = positions[0];
241        let starts_segment = start == 0 || (start > 0 && target_chars[start - 1] == '/');
242        if starts_segment {
243            let last_slash = target_chars.iter().rposition(|&c| c == '/');
244            let basename_start = last_slash.map(|i| i + 1).unwrap_or(0);
245            if start == basename_start {
246                final_score += score::BASENAME_PREFIX;
247            } else {
248                final_score += score::PATH_SEGMENT_PREFIX;
249            }
250        }
251    }
252
253    FuzzyMatch {
254        matched: true,
255        score: final_score,
256        match_positions: positions,
257    }
258}
259
260/// Score a multi-term pattern against a target whose char buffers have
261/// already been refilled in the caller's scratch space.
262fn score_multi_term(
263    pattern: &PreparedPattern,
264    target_chars: &[char],
265    target_lower: &[char],
266) -> FuzzyMatch {
267    let mut total_score = 0;
268    let mut all_positions = Vec::new();
269
270    for term in &pattern.terms {
271        let result = score_single_term(term, target_chars, target_lower);
272        if !result.matched {
273            return FuzzyMatch::no_match();
274        }
275        total_score += result.score;
276        all_positions.extend(result.match_positions);
277    }
278
279    // Sort and deduplicate positions (terms may have overlapping matches).
280    all_positions.sort_unstable();
281    all_positions.dedup();
282
283    // Tight-span bonus: reward targets where each term appears as a
284    // contiguous substring and all the term matches are packed close
285    // together (regardless of what sits between them).  This handles
286    // every realistic "the query reconstructs a single name" case
287    // uniformly — "/etc/hosts", "save_file.rs", "saveFile.rs",
288    // "etcmohosts", even "foo.bar.baz" — without hard-coding a list
289    // of separator characters that would miss camelCase, runs-together
290    // words, or unusual punctuation.
291    //
292    // The bonus fires iff:
293    //   1. Every term appears as a contiguous substring in `target_lower`,
294    //   2. In the order given by the query, and
295    //   3. The gap between consecutive term matches is small (we allow
296    //      up to 4 chars per gap — generous enough for "etcmohosts"
297    //      and "foo/bar/baz" but not for two terms at opposite ends
298    //      of a long path).
299    if let Some((span_start, span_end)) = find_sequential_term_span(target_lower, &pattern.terms) {
300        let term_len_sum: usize = pattern.terms.iter().map(|t| t.lower_chars.len()).sum();
301        let span_len = span_end - span_start;
302        let gap_excess = span_len.saturating_sub(term_len_sum);
303        let num_gaps = pattern.terms.len().saturating_sub(1);
304        // Tolerance scales with the number of gaps: 4 chars per gap.
305        let max_gap_excess = num_gaps.saturating_mul(4);
306
307        if gap_excess <= max_gap_excess {
308            total_score += score::EXACT_MATCH;
309
310            let target_char_count = target_chars.len();
311            all_positions = (span_start..span_end)
312                .filter(|&i| i < target_char_count)
313                .collect();
314        }
315    }
316
317    FuzzyMatch {
318        matched: true,
319        score: total_score,
320        match_positions: all_positions,
321    }
322}
323
324/// Greedily find the first sequential occurrence of each term as a
325/// contiguous substring in `target_lower`, advancing past each match
326/// before searching for the next term.  Returns the `(start, end)`
327/// char indices of the overall span covering the first term's start
328/// through the last term's end, or `None` if any term can't be found
329/// contiguously in the required order.
330///
331/// "Greedy" here means we take the *earliest* occurrence of each term,
332/// not the tightest span — tighter spans would require backtracking
333/// and are not worth the cost for a ranking signal.
334fn find_sequential_term_span(
335    target_lower: &[char],
336    terms: &[PreparedTerm],
337) -> Option<(usize, usize)> {
338    let mut search_from = 0;
339    let mut first_start: Option<usize> = None;
340    let mut last_end = 0;
341    for term in terms {
342        let needle = &term.lower_chars;
343        if needle.is_empty() {
344            continue;
345        }
346        if search_from + needle.len() > target_lower.len() {
347            return None;
348        }
349        let end_bound = target_lower.len() - needle.len();
350        let found =
351            (search_from..=end_bound).find(|&s| target_lower[s..s + needle.len()] == *needle)?;
352        if first_start.is_none() {
353            first_start = Some(found);
354        }
355        search_from = found + needle.len();
356        last_end = search_from;
357    }
358    first_start.map(|start| (start, last_end))
359}
360
361/// Find the best contiguous substring match of `query` in `target`.
362///
363/// Scans for all occurrences of the query as a substring and picks the
364/// one with the highest score (preferring word boundaries, basename, etc.).
365fn find_contiguous_match(
366    query: &[char],
367    target_chars: &[char],
368    target_lower: &[char],
369) -> Option<(Vec<usize>, i32)> {
370    let m = query.len();
371    let n = target_lower.len();
372    if m == 0 || m > n {
373        return None;
374    }
375
376    let mut best: Option<(Vec<usize>, i32)> = None;
377
378    for start in 0..=n - m {
379        // Check if query matches at this position.
380        if target_lower[start..start + m] != *query {
381            continue;
382        }
383
384        // Score this contiguous match.
385        let mut match_score = 0;
386
387        if start == 0 {
388            match_score += score::START_OF_STRING;
389        }
390
391        // Word boundary bonus for the first character.
392        if start > 0 && start < target_chars.len() {
393            let prev_char = target_chars[start - 1];
394            if prev_char == ' '
395                || prev_char == '_'
396                || prev_char == '-'
397                || prev_char == '/'
398                || prev_char == '.'
399            {
400                match_score += score::WORD_BOUNDARY;
401            } else if prev_char.is_lowercase() && target_chars[start].is_uppercase() {
402                match_score += score::CAMEL_CASE;
403            }
404        }
405
406        // Consecutive bonus for chars 1..m.
407        match_score += score::CONSECUTIVE * (m as i32 - 1);
408
409        let is_better = match &best {
410            None => true,
411            Some((_, s)) => match_score > *s,
412        };
413        if is_better {
414            let positions: Vec<usize> = (start..start + m).collect();
415            best = Some((positions, match_score));
416        }
417    }
418
419    best
420}
421
422/// A single node in the backpointer arena used by [`find_best_match`].
423///
424/// Each node records the target index matched for one query character and a
425/// link to the node that matched the previous query character (or `None` for
426/// the first match).  Walking back from the final node reconstructs the full
427/// list of match positions without ever cloning a `Vec<usize>`.
428#[derive(Clone, Copy)]
429struct ChainNode {
430    ti: usize,
431    prev: Option<u32>,
432}
433
434/// Find the best matching positions for query in target.
435///
436/// Same greedy DP as the original implementation, but replaces the
437/// `Vec<usize>` position-cloning with an arena of linked backpointer nodes.
438/// Per-state updates become O(1) (push one node) instead of O(m) (clone the
439/// full positions vector), turning the worst-case cost from O(n·m²) into
440/// O(n·m) with a single linear walk at the end to reconstruct positions.
441///
442/// Callers are expected to have already run subsequence rejection via
443/// [`super::pattern::is_subsequence_prepared`] before allocating the
444/// target buffers this function consumes — the duplicate guard that used
445/// to live here was pure overhead on the hot path and has been removed.
446fn find_best_match(
447    query: &[char],
448    target_chars: &[char],
449    target_lower: &[char],
450) -> Option<(Vec<usize>, i32)> {
451    if query.is_empty() {
452        return Some((Vec::new(), 0));
453    }
454
455    let n = target_lower.len();
456    let m = query.len();
457
458    if n < m {
459        return None;
460    }
461
462    // Arena of backpointer nodes.  `best_node_for_qi[qi]` indexes into the
463    // arena for the currently-best chain ending at query index `qi`, or
464    // `None` if no match for `query[0..qi]` has been seen yet.  `qi == 0`
465    // is always `None` (empty chain).
466    let mut arena: Vec<ChainNode> = Vec::with_capacity(m.saturating_mul(4));
467    let mut best_score: Vec<Option<i32>> = vec![None; m + 1];
468    let mut best_node_for_qi: Vec<Option<u32>> = vec![None; m + 1];
469    best_score[0] = Some(0); // empty query matches with score 0
470
471    for ti in 0..n {
472        // Process in reverse so we don't use values we just wrote this iteration.
473        for qi in (0..m).rev() {
474            if target_lower[ti] != query[qi] {
475                continue;
476            }
477
478            // Can we extend the best chain for query[0..qi]?
479            let prev_score = match best_score[qi] {
480                Some(s) => s,
481                None => continue,
482            };
483            // `last_match_pos` comes from the previous chain head, or None
484            // if `qi == 0` (first character of the query).
485            let prev_last_pos = best_node_for_qi[qi].map(|idx| arena[idx as usize].ti);
486
487            // Match positions must be strictly increasing.
488            if let Some(lp) = prev_last_pos {
489                if ti <= lp {
490                    continue;
491                }
492            }
493
494            // Score the (ti, prev_last_pos) transition.
495            let mut match_score = 0;
496
497            // Start of string bonus
498            if ti == 0 {
499                match_score += score::START_OF_STRING;
500            }
501
502            // Word boundary bonus
503            if ti > 0 && ti < target_chars.len() {
504                let prev_char = target_chars[ti - 1];
505                if prev_char == ' '
506                    || prev_char == '_'
507                    || prev_char == '-'
508                    || prev_char == '/'
509                    || prev_char == '.'
510                {
511                    match_score += score::WORD_BOUNDARY;
512                } else if prev_char.is_lowercase() && target_chars[ti].is_uppercase() {
513                    match_score += score::CAMEL_CASE;
514                }
515            }
516
517            // Consecutive / gap handling
518            if let Some(lp) = prev_last_pos {
519                if ti == lp + 1 {
520                    match_score += score::CONSECUTIVE;
521                } else {
522                    let gap_size = ti - lp - 1;
523                    match_score += score::GAP_START_PENALTY;
524                    match_score += score::GAP_PENALTY * (gap_size as i32 - 1).max(0);
525                }
526            }
527
528            let new_score = prev_score + match_score;
529
530            let should_update = match best_score[qi + 1] {
531                None => true,
532                Some(curr) => new_score > curr,
533            };
534
535            if should_update {
536                let new_idx = arena.len() as u32;
537                arena.push(ChainNode {
538                    ti,
539                    prev: best_node_for_qi[qi],
540                });
541                best_score[qi + 1] = Some(new_score);
542                best_node_for_qi[qi + 1] = Some(new_idx);
543            }
544        }
545    }
546
547    let final_score = best_score[m]?;
548    let final_node = best_node_for_qi[m]?;
549
550    // Walk backwards through the arena to recover positions.
551    let mut positions = vec![0usize; m];
552    let mut cursor = Some(final_node);
553    let mut idx = m;
554    while let Some(node_idx) = cursor {
555        debug_assert!(idx > 0);
556        idx -= 1;
557        let node = arena[node_idx as usize];
558        positions[idx] = node.ti;
559        cursor = node.prev;
560    }
561
562    Some((positions, final_score))
563}