Skip to main content

fresh/input/fuzzy/
matcher.rs

1//! The core fuzzy-matching algorithm.
2//!
3//! Given a [`PreparedPattern`] and a target string, this module scores the
4//! match (or rejects it) using two complementary strategies:
5//!
6//! 1. A **DP pass** ([`find_best_match`]) that tracks the highest-scoring
7//!    way to interleave query characters into the target, rewarding
8//!    consecutive matches, word boundaries, camelCase transitions, etc.
9//!    The DP uses an arena of backpointer nodes instead of cloning
10//!    position vectors, keeping each state update O(1).
11//!
12//! 2. A **contiguous-substring pass** ([`find_contiguous_match`]) that
13//!    finds literal-substring occurrences of the query and scores them.
14//!    The DP sometimes prefers scattered matches with more word-boundary
15//!    bonuses, so we always compare with the substring result and take
16//!    whichever scores higher.
17//!
18//! Early rejection via [`super::pattern::is_subsequence_prepared`] happens
19//! before any target-side allocation, so the heavy lifting here only runs
20//! for candidates that could plausibly match.
21//!
22//! # Amortised hot path via [`FuzzyMatcher`]
23//!
24//! A [`FuzzyMatcher`] owns a [`PreparedPattern`] *and* two reusable
25//! `Vec<char>` scratch buffers for the target.  Callers construct one per
26//! keystroke and reuse it across all candidate targets:
27//!
28//! ```ignore
29//! let mut matcher = FuzzyMatcher::new(search_query);
30//! for file in files {
31//!     let m = matcher.match_target(&file.relative_path);
32//!     // ...
33//! }
34//! ```
35//!
36//! On the first call the scratch buffers grow to the size of the longest
37//! target seen; every subsequent call truncates and refills them in place,
38//! so the allocator is touched once per keystroke rather than once per
39//! candidate.  This is the explicit alternative to stashing the scratch in
40//! `thread_local!` state — lifetimes are visible in the signature and
41//! future parallelisation can give each worker its own matcher.
42
43use super::pattern::{is_subsequence_prepared, PreparedPattern, PreparedTerm};
44use super::{score, FuzzyMatch};
45
46/// A reusable fuzzy matcher that amortises both query preparation and
47/// target-side scratch allocation across many calls.
48///
49/// See the module docs for usage.
50#[derive(Debug, Clone)]
51pub struct FuzzyMatcher {
52    pattern: PreparedPattern,
53    /// Scratch buffer: `target.chars().collect()`, cleared and refilled per call.
54    target_chars: Vec<char>,
55    /// Scratch buffer: lowercased target chars, cleared and refilled per call.
56    target_lower: Vec<char>,
57}
58
59impl FuzzyMatcher {
60    /// Create a new matcher for the given query, with empty scratch buffers.
61    ///
62    /// The scratch buffers start empty and grow lazily on the first
63    /// [`match_target`](Self::match_target) call that gets past rejection.
64    pub fn new(query: &str) -> Self {
65        Self::from_pattern(PreparedPattern::new(query))
66    }
67
68    /// Construct a matcher from an already-built [`PreparedPattern`].
69    pub fn from_pattern(pattern: PreparedPattern) -> Self {
70        Self {
71            pattern,
72            target_chars: Vec::new(),
73            target_lower: Vec::new(),
74        }
75    }
76
77    /// Returns `true` if the prepared query has no terms (empty / whitespace).
78    pub fn is_empty(&self) -> bool {
79        self.pattern.is_empty()
80    }
81
82    /// Match a single target string against this matcher's query.
83    ///
84    /// Reuses the internal `Vec<char>` scratch buffers — no per-call
85    /// allocations on the common "passed rejection → scoring" path after
86    /// the first call has warmed them up.
87    pub fn match_target(&mut self, target: &str) -> FuzzyMatch {
88        if self.pattern.is_empty() {
89            return FuzzyMatch {
90                matched: true,
91                score: 0,
92                match_positions: Vec::new(),
93            };
94        }
95
96        // Fast rejection gate.  Runs entirely on `&str`, no allocation.
97        // In the multi-term case we must pass the gate for *every* term.
98        for term in &self.pattern.terms {
99            if !is_subsequence_prepared(term, target) {
100                return FuzzyMatch::no_match();
101            }
102        }
103
104        // Refill scratch buffers in place.  No `malloc`/`realloc` on
105        // steady-state — the existing capacity is reused after the first
106        // call that made them large enough for the corpus's worst case.
107        refill_target(&mut self.target_chars, &mut self.target_lower, target);
108
109        // Split the borrow so the scoring helpers can see both the pattern
110        // and the scratch buffers simultaneously without fighting the
111        // borrow checker.
112        let pattern = &self.pattern;
113        let target_chars: &[char] = &self.target_chars;
114        let target_lower: &[char] = &self.target_lower;
115
116        if pattern.terms.len() > 1 {
117            score_multi_term(pattern, target_chars, target_lower)
118        } else {
119            score_single_term(&pattern.terms[0], target_chars, target_lower)
120        }
121    }
122}
123
124/// Entry point used by the backward-compatible free functions.
125///
126/// Creates a temporary `FuzzyMatcher` (which allocates empty scratch Vecs
127/// but does not grow them until first use), so callers using the legacy
128/// `fuzzy_match_prepared` API still work, just without cross-call
129/// amortisation.  Hot-path callers should construct their own
130/// `FuzzyMatcher` and reuse it.
131pub(super) fn match_prepared(pattern: &PreparedPattern, target: &str) -> FuzzyMatch {
132    let mut matcher = FuzzyMatcher::from_pattern(pattern.clone());
133    matcher.match_target(target)
134}
135
136/// Refill the scratch buffers with the raw and lowercased characters of
137/// `target`.  Grows the buffers if needed; otherwise reuses existing
138/// capacity, so steady-state calls do zero allocations.
139///
140/// Note: for non-ASCII chars with expanding case mappings (e.g. Turkish
141/// `İ` → `i` + combining dot), `target_lower` can be longer than
142/// `target_chars`.  This matches the semantics of the previous
143/// `target.to_lowercase().chars().collect()` implementation, including
144/// its latent indexing inconsistency — preserving behaviour is out of
145/// scope for this change.
146fn refill_target(target_chars: &mut Vec<char>, target_lower: &mut Vec<char>, target: &str) {
147    target_chars.clear();
148    target_lower.clear();
149    // `target.len()` is the byte length, which is an upper bound on the
150    // character count (chars are 1-4 bytes in UTF-8).  Reserving once
151    // avoids any growth checks during the push loop.
152    target_chars.reserve(target.len());
153    target_lower.reserve(target.len());
154    for c in target.chars() {
155        target_chars.push(c);
156        for lc in c.to_lowercase() {
157            target_lower.push(lc);
158        }
159    }
160}
161
162/// Score a single prepared term against a target whose char buffers have
163/// already been refilled in the caller's scratch space.
164fn score_single_term(
165    term: &PreparedTerm,
166    target_chars: &[char],
167    target_lower: &[char],
168) -> FuzzyMatch {
169    let query_lower = &term.lower_chars;
170    let query_len = query_lower.len();
171    let target_len = target_lower.len();
172
173    // Try to find the best matching positions using a DP approach.
174    let dp_result = find_best_match(query_lower, target_chars, target_lower);
175
176    // Also check for a contiguous substring match.  The DP may miss this
177    // because it optimises per-character bonuses (word boundaries, etc.)
178    // which can favour scattered matches over a tight substring.
179    let substr_result = find_contiguous_match(query_lower, target_chars, target_lower);
180
181    // Pick the better result.
182    let (positions, mut final_score) = match (dp_result, substr_result) {
183        (Some(dp), Some(sub)) => {
184            if sub.1 >= dp.1 {
185                sub
186            } else {
187                dp
188            }
189        }
190        (Some(dp), None) => dp,
191        (None, Some(sub)) => sub,
192        (None, None) => return FuzzyMatch::no_match(),
193    };
194
195    // Check if all matched positions are consecutive (contiguous substring).
196    let is_contiguous =
197        positions.len() == query_len && positions.windows(2).all(|w| w[1] == w[0] + 1);
198
199    if is_contiguous {
200        final_score += score::CONTIGUOUS_SUBSTRING;
201    }
202
203    // Exact match bonus: query matches entire target.
204    if query_len == target_len {
205        final_score += score::EXACT_MATCH;
206    } else if target_len > query_len && is_contiguous {
207        // Check if the query is a prefix match (all consecutive from start).
208        let is_prefix_match = positions.iter().enumerate().all(|(i, &pos)| pos == i);
209
210        if is_prefix_match && query_len < target_chars.len() {
211            let next_char = target_chars[query_len];
212            if next_char == '.' {
213                // Highest priority: exact basename match (before extension).
214                final_score += score::EXACT_MATCH;
215            } else if next_char == '-' || next_char == '_' || next_char == ' ' {
216                // Second priority: match before word separator.
217                final_score += score::EXACT_BASENAME_MATCH;
218            }
219        }
220    }
221
222    FuzzyMatch {
223        matched: true,
224        score: final_score,
225        match_positions: positions,
226    }
227}
228
229/// Score a multi-term pattern against a target whose char buffers have
230/// already been refilled in the caller's scratch space.
231fn score_multi_term(
232    pattern: &PreparedPattern,
233    target_chars: &[char],
234    target_lower: &[char],
235) -> FuzzyMatch {
236    let mut total_score = 0;
237    let mut all_positions = Vec::new();
238
239    for term in &pattern.terms {
240        let result = score_single_term(term, target_chars, target_lower);
241        if !result.matched {
242            return FuzzyMatch::no_match();
243        }
244        total_score += result.score;
245        all_positions.extend(result.match_positions);
246    }
247
248    // Sort and deduplicate positions (terms may have overlapping matches).
249    all_positions.sort_unstable();
250    all_positions.dedup();
251
252    // Tight-span bonus: reward targets where each term appears as a
253    // contiguous substring and all the term matches are packed close
254    // together (regardless of what sits between them).  This handles
255    // every realistic "the query reconstructs a single name" case
256    // uniformly — "/etc/hosts", "save_file.rs", "saveFile.rs",
257    // "etcmohosts", even "foo.bar.baz" — without hard-coding a list
258    // of separator characters that would miss camelCase, runs-together
259    // words, or unusual punctuation.
260    //
261    // The bonus fires iff:
262    //   1. Every term appears as a contiguous substring in `target_lower`,
263    //   2. In the order given by the query, and
264    //   3. The gap between consecutive term matches is small (we allow
265    //      up to 4 chars per gap — generous enough for "etcmohosts"
266    //      and "foo/bar/baz" but not for two terms at opposite ends
267    //      of a long path).
268    if let Some((span_start, span_end)) = find_sequential_term_span(target_lower, &pattern.terms) {
269        let term_len_sum: usize = pattern.terms.iter().map(|t| t.lower_chars.len()).sum();
270        let span_len = span_end - span_start;
271        let gap_excess = span_len.saturating_sub(term_len_sum);
272        let num_gaps = pattern.terms.len().saturating_sub(1);
273        // Tolerance scales with the number of gaps: 4 chars per gap.
274        let max_gap_excess = num_gaps.saturating_mul(4);
275
276        if gap_excess <= max_gap_excess {
277            total_score += score::EXACT_MATCH;
278
279            let target_char_count = target_chars.len();
280            all_positions = (span_start..span_end)
281                .filter(|&i| i < target_char_count)
282                .collect();
283        }
284    }
285
286    FuzzyMatch {
287        matched: true,
288        score: total_score,
289        match_positions: all_positions,
290    }
291}
292
293/// Greedily find the first sequential occurrence of each term as a
294/// contiguous substring in `target_lower`, advancing past each match
295/// before searching for the next term.  Returns the `(start, end)`
296/// char indices of the overall span covering the first term's start
297/// through the last term's end, or `None` if any term can't be found
298/// contiguously in the required order.
299///
300/// "Greedy" here means we take the *earliest* occurrence of each term,
301/// not the tightest span — tighter spans would require backtracking
302/// and are not worth the cost for a ranking signal.
303fn find_sequential_term_span(
304    target_lower: &[char],
305    terms: &[PreparedTerm],
306) -> Option<(usize, usize)> {
307    let mut search_from = 0;
308    let mut first_start: Option<usize> = None;
309    let mut last_end = 0;
310    for term in terms {
311        let needle = &term.lower_chars;
312        if needle.is_empty() {
313            continue;
314        }
315        if search_from + needle.len() > target_lower.len() {
316            return None;
317        }
318        let end_bound = target_lower.len() - needle.len();
319        let found =
320            (search_from..=end_bound).find(|&s| target_lower[s..s + needle.len()] == *needle)?;
321        if first_start.is_none() {
322            first_start = Some(found);
323        }
324        search_from = found + needle.len();
325        last_end = search_from;
326    }
327    first_start.map(|start| (start, last_end))
328}
329
330/// Find the best contiguous substring match of `query` in `target`.
331///
332/// Scans for all occurrences of the query as a substring and picks the
333/// one with the highest score (preferring word boundaries, basename, etc.).
334fn find_contiguous_match(
335    query: &[char],
336    target_chars: &[char],
337    target_lower: &[char],
338) -> Option<(Vec<usize>, i32)> {
339    let m = query.len();
340    let n = target_lower.len();
341    if m == 0 || m > n {
342        return None;
343    }
344
345    let mut best: Option<(Vec<usize>, i32)> = None;
346
347    for start in 0..=n - m {
348        // Check if query matches at this position.
349        if target_lower[start..start + m] != *query {
350            continue;
351        }
352
353        // Score this contiguous match.
354        let mut match_score = 0;
355
356        if start == 0 {
357            match_score += score::START_OF_STRING;
358        }
359
360        // Word boundary bonus for the first character.
361        if start > 0 && start < target_chars.len() {
362            let prev_char = target_chars[start - 1];
363            if prev_char == ' '
364                || prev_char == '_'
365                || prev_char == '-'
366                || prev_char == '/'
367                || prev_char == '.'
368            {
369                match_score += score::WORD_BOUNDARY;
370            } else if prev_char.is_lowercase() && target_chars[start].is_uppercase() {
371                match_score += score::CAMEL_CASE;
372            }
373        }
374
375        // Consecutive bonus for chars 1..m.
376        match_score += score::CONSECUTIVE * (m as i32 - 1);
377
378        let is_better = match &best {
379            None => true,
380            Some((_, s)) => match_score > *s,
381        };
382        if is_better {
383            let positions: Vec<usize> = (start..start + m).collect();
384            best = Some((positions, match_score));
385        }
386    }
387
388    best
389}
390
391/// A single node in the backpointer arena used by [`find_best_match`].
392///
393/// Each node records the target index matched for one query character and a
394/// link to the node that matched the previous query character (or `None` for
395/// the first match).  Walking back from the final node reconstructs the full
396/// list of match positions without ever cloning a `Vec<usize>`.
397#[derive(Clone, Copy)]
398struct ChainNode {
399    ti: usize,
400    prev: Option<u32>,
401}
402
403/// Find the best matching positions for query in target.
404///
405/// Same greedy DP as the original implementation, but replaces the
406/// `Vec<usize>` position-cloning with an arena of linked backpointer nodes.
407/// Per-state updates become O(1) (push one node) instead of O(m) (clone the
408/// full positions vector), turning the worst-case cost from O(n·m²) into
409/// O(n·m) with a single linear walk at the end to reconstruct positions.
410///
411/// Callers are expected to have already run subsequence rejection via
412/// [`super::pattern::is_subsequence_prepared`] before allocating the
413/// target buffers this function consumes — the duplicate guard that used
414/// to live here was pure overhead on the hot path and has been removed.
415fn find_best_match(
416    query: &[char],
417    target_chars: &[char],
418    target_lower: &[char],
419) -> Option<(Vec<usize>, i32)> {
420    if query.is_empty() {
421        return Some((Vec::new(), 0));
422    }
423
424    let n = target_lower.len();
425    let m = query.len();
426
427    if n < m {
428        return None;
429    }
430
431    // Arena of backpointer nodes.  `best_node_for_qi[qi]` indexes into the
432    // arena for the currently-best chain ending at query index `qi`, or
433    // `None` if no match for `query[0..qi]` has been seen yet.  `qi == 0`
434    // is always `None` (empty chain).
435    let mut arena: Vec<ChainNode> = Vec::with_capacity(m.saturating_mul(4));
436    let mut best_score: Vec<Option<i32>> = vec![None; m + 1];
437    let mut best_node_for_qi: Vec<Option<u32>> = vec![None; m + 1];
438    best_score[0] = Some(0); // empty query matches with score 0
439
440    for ti in 0..n {
441        // Process in reverse so we don't use values we just wrote this iteration.
442        for qi in (0..m).rev() {
443            if target_lower[ti] != query[qi] {
444                continue;
445            }
446
447            // Can we extend the best chain for query[0..qi]?
448            let prev_score = match best_score[qi] {
449                Some(s) => s,
450                None => continue,
451            };
452            // `last_match_pos` comes from the previous chain head, or None
453            // if `qi == 0` (first character of the query).
454            let prev_last_pos = best_node_for_qi[qi].map(|idx| arena[idx as usize].ti);
455
456            // Match positions must be strictly increasing.
457            if let Some(lp) = prev_last_pos {
458                if ti <= lp {
459                    continue;
460                }
461            }
462
463            // Score the (ti, prev_last_pos) transition.
464            let mut match_score = 0;
465
466            // Start of string bonus
467            if ti == 0 {
468                match_score += score::START_OF_STRING;
469            }
470
471            // Word boundary bonus
472            if ti > 0 && ti < target_chars.len() {
473                let prev_char = target_chars[ti - 1];
474                if prev_char == ' '
475                    || prev_char == '_'
476                    || prev_char == '-'
477                    || prev_char == '/'
478                    || prev_char == '.'
479                {
480                    match_score += score::WORD_BOUNDARY;
481                } else if prev_char.is_lowercase() && target_chars[ti].is_uppercase() {
482                    match_score += score::CAMEL_CASE;
483                }
484            }
485
486            // Consecutive / gap handling
487            if let Some(lp) = prev_last_pos {
488                if ti == lp + 1 {
489                    match_score += score::CONSECUTIVE;
490                } else {
491                    let gap_size = ti - lp - 1;
492                    match_score += score::GAP_START_PENALTY;
493                    match_score += score::GAP_PENALTY * (gap_size as i32 - 1).max(0);
494                }
495            }
496
497            let new_score = prev_score + match_score;
498
499            let should_update = match best_score[qi + 1] {
500                None => true,
501                Some(curr) => new_score > curr,
502            };
503
504            if should_update {
505                let new_idx = arena.len() as u32;
506                arena.push(ChainNode {
507                    ti,
508                    prev: best_node_for_qi[qi],
509                });
510                best_score[qi + 1] = Some(new_score);
511                best_node_for_qi[qi + 1] = Some(new_idx);
512            }
513        }
514    }
515
516    let final_score = best_score[m]?;
517    let final_node = best_node_for_qi[m]?;
518
519    // Walk backwards through the arena to recover positions.
520    let mut positions = vec![0usize; m];
521    let mut cursor = Some(final_node);
522    let mut idx = m;
523    while let Some(node_idx) = cursor {
524        debug_assert!(idx > 0);
525        idx -= 1;
526        let node = arena[node_idx as usize];
527        positions[idx] = node.ti;
528        cursor = node.prev;
529    }
530
531    Some((positions, final_score))
532}