fresh/input/fuzzy/matcher.rs
1//! The core fuzzy-matching algorithm.
2//!
3//! Given a [`PreparedPattern`] and a target string, this module scores the
4//! match (or rejects it) using two complementary strategies:
5//!
6//! 1. A **DP pass** ([`find_best_match`]) that tracks the highest-scoring
7//! way to interleave query characters into the target, rewarding
8//! consecutive matches, word boundaries, camelCase transitions, etc.
9//! The DP uses an arena of backpointer nodes instead of cloning
10//! position vectors, keeping each state update O(1).
11//!
12//! 2. A **contiguous-substring pass** ([`find_contiguous_match`]) that
13//! finds literal-substring occurrences of the query and scores them.
14//! The DP sometimes prefers scattered matches with more word-boundary
15//! bonuses, so we always compare with the substring result and take
16//! whichever scores higher.
17//!
18//! Early rejection via [`super::pattern::is_subsequence_prepared`] happens
19//! before any target-side allocation, so the heavy lifting here only runs
20//! for candidates that could plausibly match.
21//!
22//! # Amortised hot path via [`FuzzyMatcher`]
23//!
24//! A [`FuzzyMatcher`] owns a [`PreparedPattern`] *and* two reusable
25//! `Vec<char>` scratch buffers for the target. Callers construct one per
26//! keystroke and reuse it across all candidate targets:
27//!
28//! ```ignore
29//! let mut matcher = FuzzyMatcher::new(search_query);
30//! for file in files {
31//! let m = matcher.match_target(&file.relative_path);
32//! // ...
33//! }
34//! ```
35//!
36//! On the first call the scratch buffers grow to the size of the longest
37//! target seen; every subsequent call truncates and refills them in place,
38//! so the allocator is touched once per keystroke rather than once per
39//! candidate. This is the explicit alternative to stashing the scratch in
40//! `thread_local!` state — lifetimes are visible in the signature and
41//! future parallelisation can give each worker its own matcher.
42
43use super::pattern::{is_subsequence_prepared, PreparedPattern, PreparedTerm};
44use super::{score, FuzzyMatch};
45
46/// A reusable fuzzy matcher that amortises both query preparation and
47/// target-side scratch allocation across many calls.
48///
49/// See the module docs for usage.
50#[derive(Debug, Clone)]
51pub struct FuzzyMatcher {
52 pattern: PreparedPattern,
53 /// Scratch buffer: `target.chars().collect()`, cleared and refilled per call.
54 target_chars: Vec<char>,
55 /// Scratch buffer: lowercased target chars, cleared and refilled per call.
56 target_lower: Vec<char>,
57}
58
59impl FuzzyMatcher {
60 /// Create a new matcher for the given query, with empty scratch buffers.
61 ///
62 /// The scratch buffers start empty and grow lazily on the first
63 /// [`match_target`](Self::match_target) call that gets past rejection.
64 pub fn new(query: &str) -> Self {
65 Self::from_pattern(PreparedPattern::new(query))
66 }
67
68 /// Construct a matcher from an already-built [`PreparedPattern`].
69 pub fn from_pattern(pattern: PreparedPattern) -> Self {
70 Self {
71 pattern,
72 target_chars: Vec::new(),
73 target_lower: Vec::new(),
74 }
75 }
76
77 /// Returns `true` if the prepared query has no terms (empty / whitespace).
78 pub fn is_empty(&self) -> bool {
79 self.pattern.is_empty()
80 }
81
82 /// Match a single target string against this matcher's query.
83 ///
84 /// Reuses the internal `Vec<char>` scratch buffers — no per-call
85 /// allocations on the common "passed rejection → scoring" path after
86 /// the first call has warmed them up.
87 pub fn match_target(&mut self, target: &str) -> FuzzyMatch {
88 if self.pattern.is_empty() {
89 return FuzzyMatch {
90 matched: true,
91 score: 0,
92 match_positions: Vec::new(),
93 };
94 }
95
96 // Fast rejection gate. Runs entirely on `&str`, no allocation.
97 // In the multi-term case we must pass the gate for *every* term.
98 for term in &self.pattern.terms {
99 if !is_subsequence_prepared(term, target) {
100 return FuzzyMatch::no_match();
101 }
102 }
103
104 // Refill scratch buffers in place. No `malloc`/`realloc` on
105 // steady-state — the existing capacity is reused after the first
106 // call that made them large enough for the corpus's worst case.
107 refill_target(&mut self.target_chars, &mut self.target_lower, target);
108
109 // Split the borrow so the scoring helpers can see both the pattern
110 // and the scratch buffers simultaneously without fighting the
111 // borrow checker.
112 let pattern = &self.pattern;
113 let target_chars: &[char] = &self.target_chars;
114 let target_lower: &[char] = &self.target_lower;
115
116 if pattern.terms.len() > 1 {
117 score_multi_term(pattern, target_chars, target_lower)
118 } else {
119 score_single_term(&pattern.terms[0], target_chars, target_lower)
120 }
121 }
122}
123
124/// Entry point used by the backward-compatible free functions.
125///
126/// Creates a temporary `FuzzyMatcher` (which allocates empty scratch Vecs
127/// but does not grow them until first use), so callers using the legacy
128/// `fuzzy_match_prepared` API still work, just without cross-call
129/// amortisation. Hot-path callers should construct their own
130/// `FuzzyMatcher` and reuse it.
131pub(super) fn match_prepared(pattern: &PreparedPattern, target: &str) -> FuzzyMatch {
132 let mut matcher = FuzzyMatcher::from_pattern(pattern.clone());
133 matcher.match_target(target)
134}
135
136/// Refill the scratch buffers with the raw and lowercased characters of
137/// `target`. Grows the buffers if needed; otherwise reuses existing
138/// capacity, so steady-state calls do zero allocations.
139///
140/// Note: for non-ASCII chars with expanding case mappings (e.g. Turkish
141/// `İ` → `i` + combining dot), `target_lower` can be longer than
142/// `target_chars`. This matches the semantics of the previous
143/// `target.to_lowercase().chars().collect()` implementation, including
144/// its latent indexing inconsistency — preserving behaviour is out of
145/// scope for this change.
146fn refill_target(target_chars: &mut Vec<char>, target_lower: &mut Vec<char>, target: &str) {
147 target_chars.clear();
148 target_lower.clear();
149 // `target.len()` is the byte length, which is an upper bound on the
150 // character count (chars are 1-4 bytes in UTF-8). Reserving once
151 // avoids any growth checks during the push loop.
152 target_chars.reserve(target.len());
153 target_lower.reserve(target.len());
154 for c in target.chars() {
155 target_chars.push(c);
156 for lc in c.to_lowercase() {
157 target_lower.push(lc);
158 }
159 }
160}
161
162/// Score a single prepared term against a target whose char buffers have
163/// already been refilled in the caller's scratch space.
164fn score_single_term(
165 term: &PreparedTerm,
166 target_chars: &[char],
167 target_lower: &[char],
168) -> FuzzyMatch {
169 let query_lower = &term.lower_chars;
170 let query_len = query_lower.len();
171 let target_len = target_lower.len();
172
173 // Try to find the best matching positions using a DP approach.
174 let dp_result = find_best_match(query_lower, target_chars, target_lower);
175
176 // Also check for a contiguous substring match. The DP may miss this
177 // because it optimises per-character bonuses (word boundaries, etc.)
178 // which can favour scattered matches over a tight substring.
179 let substr_result = find_contiguous_match(query_lower, target_chars, target_lower);
180
181 // Pick the better result.
182 let (positions, mut final_score) = match (dp_result, substr_result) {
183 (Some(dp), Some(sub)) => {
184 if sub.1 >= dp.1 {
185 sub
186 } else {
187 dp
188 }
189 }
190 (Some(dp), None) => dp,
191 (None, Some(sub)) => sub,
192 (None, None) => return FuzzyMatch::no_match(),
193 };
194
195 // Check if all matched positions are consecutive (contiguous substring).
196 let is_contiguous =
197 positions.len() == query_len && positions.windows(2).all(|w| w[1] == w[0] + 1);
198
199 if is_contiguous {
200 final_score += score::CONTIGUOUS_SUBSTRING;
201 }
202
203 // Exact match bonus: query matches entire target.
204 if query_len == target_len {
205 final_score += score::EXACT_MATCH;
206 } else if target_len > query_len && is_contiguous {
207 // Check if the query is a prefix match (all consecutive from start).
208 let is_prefix_match = positions.iter().enumerate().all(|(i, &pos)| pos == i);
209
210 if is_prefix_match && query_len < target_chars.len() {
211 let next_char = target_chars[query_len];
212 if next_char == '.' {
213 // Highest priority: exact basename match (before extension).
214 final_score += score::EXACT_MATCH;
215 } else if next_char == '-' || next_char == '_' || next_char == ' ' {
216 // Second priority: match before word separator.
217 final_score += score::EXACT_BASENAME_MATCH;
218 }
219 }
220 }
221
222 FuzzyMatch {
223 matched: true,
224 score: final_score,
225 match_positions: positions,
226 }
227}
228
229/// Score a multi-term pattern against a target whose char buffers have
230/// already been refilled in the caller's scratch space.
231fn score_multi_term(
232 pattern: &PreparedPattern,
233 target_chars: &[char],
234 target_lower: &[char],
235) -> FuzzyMatch {
236 let mut total_score = 0;
237 let mut all_positions = Vec::new();
238
239 for term in &pattern.terms {
240 let result = score_single_term(term, target_chars, target_lower);
241 if !result.matched {
242 return FuzzyMatch::no_match();
243 }
244 total_score += result.score;
245 all_positions.extend(result.match_positions);
246 }
247
248 // Sort and deduplicate positions (terms may have overlapping matches).
249 all_positions.sort_unstable();
250 all_positions.dedup();
251
252 // Tight-span bonus: reward targets where each term appears as a
253 // contiguous substring and all the term matches are packed close
254 // together (regardless of what sits between them). This handles
255 // every realistic "the query reconstructs a single name" case
256 // uniformly — "/etc/hosts", "save_file.rs", "saveFile.rs",
257 // "etcmohosts", even "foo.bar.baz" — without hard-coding a list
258 // of separator characters that would miss camelCase, runs-together
259 // words, or unusual punctuation.
260 //
261 // The bonus fires iff:
262 // 1. Every term appears as a contiguous substring in `target_lower`,
263 // 2. In the order given by the query, and
264 // 3. The gap between consecutive term matches is small (we allow
265 // up to 4 chars per gap — generous enough for "etcmohosts"
266 // and "foo/bar/baz" but not for two terms at opposite ends
267 // of a long path).
268 if let Some((span_start, span_end)) = find_sequential_term_span(target_lower, &pattern.terms) {
269 let term_len_sum: usize = pattern.terms.iter().map(|t| t.lower_chars.len()).sum();
270 let span_len = span_end - span_start;
271 let gap_excess = span_len.saturating_sub(term_len_sum);
272 let num_gaps = pattern.terms.len().saturating_sub(1);
273 // Tolerance scales with the number of gaps: 4 chars per gap.
274 let max_gap_excess = num_gaps.saturating_mul(4);
275
276 if gap_excess <= max_gap_excess {
277 total_score += score::EXACT_MATCH;
278
279 let target_char_count = target_chars.len();
280 all_positions = (span_start..span_end)
281 .filter(|&i| i < target_char_count)
282 .collect();
283 }
284 }
285
286 FuzzyMatch {
287 matched: true,
288 score: total_score,
289 match_positions: all_positions,
290 }
291}
292
293/// Greedily find the first sequential occurrence of each term as a
294/// contiguous substring in `target_lower`, advancing past each match
295/// before searching for the next term. Returns the `(start, end)`
296/// char indices of the overall span covering the first term's start
297/// through the last term's end, or `None` if any term can't be found
298/// contiguously in the required order.
299///
300/// "Greedy" here means we take the *earliest* occurrence of each term,
301/// not the tightest span — tighter spans would require backtracking
302/// and are not worth the cost for a ranking signal.
303fn find_sequential_term_span(
304 target_lower: &[char],
305 terms: &[PreparedTerm],
306) -> Option<(usize, usize)> {
307 let mut search_from = 0;
308 let mut first_start: Option<usize> = None;
309 let mut last_end = 0;
310 for term in terms {
311 let needle = &term.lower_chars;
312 if needle.is_empty() {
313 continue;
314 }
315 if search_from + needle.len() > target_lower.len() {
316 return None;
317 }
318 let end_bound = target_lower.len() - needle.len();
319 let found =
320 (search_from..=end_bound).find(|&s| target_lower[s..s + needle.len()] == *needle)?;
321 if first_start.is_none() {
322 first_start = Some(found);
323 }
324 search_from = found + needle.len();
325 last_end = search_from;
326 }
327 first_start.map(|start| (start, last_end))
328}
329
330/// Find the best contiguous substring match of `query` in `target`.
331///
332/// Scans for all occurrences of the query as a substring and picks the
333/// one with the highest score (preferring word boundaries, basename, etc.).
334fn find_contiguous_match(
335 query: &[char],
336 target_chars: &[char],
337 target_lower: &[char],
338) -> Option<(Vec<usize>, i32)> {
339 let m = query.len();
340 let n = target_lower.len();
341 if m == 0 || m > n {
342 return None;
343 }
344
345 let mut best: Option<(Vec<usize>, i32)> = None;
346
347 for start in 0..=n - m {
348 // Check if query matches at this position.
349 if target_lower[start..start + m] != *query {
350 continue;
351 }
352
353 // Score this contiguous match.
354 let mut match_score = 0;
355
356 if start == 0 {
357 match_score += score::START_OF_STRING;
358 }
359
360 // Word boundary bonus for the first character.
361 if start > 0 && start < target_chars.len() {
362 let prev_char = target_chars[start - 1];
363 if prev_char == ' '
364 || prev_char == '_'
365 || prev_char == '-'
366 || prev_char == '/'
367 || prev_char == '.'
368 {
369 match_score += score::WORD_BOUNDARY;
370 } else if prev_char.is_lowercase() && target_chars[start].is_uppercase() {
371 match_score += score::CAMEL_CASE;
372 }
373 }
374
375 // Consecutive bonus for chars 1..m.
376 match_score += score::CONSECUTIVE * (m as i32 - 1);
377
378 let is_better = match &best {
379 None => true,
380 Some((_, s)) => match_score > *s,
381 };
382 if is_better {
383 let positions: Vec<usize> = (start..start + m).collect();
384 best = Some((positions, match_score));
385 }
386 }
387
388 best
389}
390
391/// A single node in the backpointer arena used by [`find_best_match`].
392///
393/// Each node records the target index matched for one query character and a
394/// link to the node that matched the previous query character (or `None` for
395/// the first match). Walking back from the final node reconstructs the full
396/// list of match positions without ever cloning a `Vec<usize>`.
397#[derive(Clone, Copy)]
398struct ChainNode {
399 ti: usize,
400 prev: Option<u32>,
401}
402
403/// Find the best matching positions for query in target.
404///
405/// Same greedy DP as the original implementation, but replaces the
406/// `Vec<usize>` position-cloning with an arena of linked backpointer nodes.
407/// Per-state updates become O(1) (push one node) instead of O(m) (clone the
408/// full positions vector), turning the worst-case cost from O(n·m²) into
409/// O(n·m) with a single linear walk at the end to reconstruct positions.
410///
411/// Callers are expected to have already run subsequence rejection via
412/// [`super::pattern::is_subsequence_prepared`] before allocating the
413/// target buffers this function consumes — the duplicate guard that used
414/// to live here was pure overhead on the hot path and has been removed.
415fn find_best_match(
416 query: &[char],
417 target_chars: &[char],
418 target_lower: &[char],
419) -> Option<(Vec<usize>, i32)> {
420 if query.is_empty() {
421 return Some((Vec::new(), 0));
422 }
423
424 let n = target_lower.len();
425 let m = query.len();
426
427 if n < m {
428 return None;
429 }
430
431 // Arena of backpointer nodes. `best_node_for_qi[qi]` indexes into the
432 // arena for the currently-best chain ending at query index `qi`, or
433 // `None` if no match for `query[0..qi]` has been seen yet. `qi == 0`
434 // is always `None` (empty chain).
435 let mut arena: Vec<ChainNode> = Vec::with_capacity(m.saturating_mul(4));
436 let mut best_score: Vec<Option<i32>> = vec![None; m + 1];
437 let mut best_node_for_qi: Vec<Option<u32>> = vec![None; m + 1];
438 best_score[0] = Some(0); // empty query matches with score 0
439
440 for ti in 0..n {
441 // Process in reverse so we don't use values we just wrote this iteration.
442 for qi in (0..m).rev() {
443 if target_lower[ti] != query[qi] {
444 continue;
445 }
446
447 // Can we extend the best chain for query[0..qi]?
448 let prev_score = match best_score[qi] {
449 Some(s) => s,
450 None => continue,
451 };
452 // `last_match_pos` comes from the previous chain head, or None
453 // if `qi == 0` (first character of the query).
454 let prev_last_pos = best_node_for_qi[qi].map(|idx| arena[idx as usize].ti);
455
456 // Match positions must be strictly increasing.
457 if let Some(lp) = prev_last_pos {
458 if ti <= lp {
459 continue;
460 }
461 }
462
463 // Score the (ti, prev_last_pos) transition.
464 let mut match_score = 0;
465
466 // Start of string bonus
467 if ti == 0 {
468 match_score += score::START_OF_STRING;
469 }
470
471 // Word boundary bonus
472 if ti > 0 && ti < target_chars.len() {
473 let prev_char = target_chars[ti - 1];
474 if prev_char == ' '
475 || prev_char == '_'
476 || prev_char == '-'
477 || prev_char == '/'
478 || prev_char == '.'
479 {
480 match_score += score::WORD_BOUNDARY;
481 } else if prev_char.is_lowercase() && target_chars[ti].is_uppercase() {
482 match_score += score::CAMEL_CASE;
483 }
484 }
485
486 // Consecutive / gap handling
487 if let Some(lp) = prev_last_pos {
488 if ti == lp + 1 {
489 match_score += score::CONSECUTIVE;
490 } else {
491 let gap_size = ti - lp - 1;
492 match_score += score::GAP_START_PENALTY;
493 match_score += score::GAP_PENALTY * (gap_size as i32 - 1).max(0);
494 }
495 }
496
497 let new_score = prev_score + match_score;
498
499 let should_update = match best_score[qi + 1] {
500 None => true,
501 Some(curr) => new_score > curr,
502 };
503
504 if should_update {
505 let new_idx = arena.len() as u32;
506 arena.push(ChainNode {
507 ti,
508 prev: best_node_for_qi[qi],
509 });
510 best_score[qi + 1] = Some(new_score);
511 best_node_for_qi[qi + 1] = Some(new_idx);
512 }
513 }
514 }
515
516 let final_score = best_score[m]?;
517 let final_node = best_node_for_qi[m]?;
518
519 // Walk backwards through the arena to recover positions.
520 let mut positions = vec![0usize; m];
521 let mut cursor = Some(final_node);
522 let mut idx = m;
523 while let Some(node_idx) = cursor {
524 debug_assert!(idx > 0);
525 idx -= 1;
526 let node = arena[node_idx as usize];
527 positions[idx] = node.ti;
528 cursor = node.prev;
529 }
530
531 Some((positions, final_score))
532}