fresh/input/fuzzy/matcher.rs
1//! The core fuzzy-matching algorithm.
2//!
3//! Given a [`PreparedPattern`] and a target string, this module scores the
4//! match (or rejects it) using two complementary strategies:
5//!
6//! 1. A **DP pass** ([`find_best_match`]) that tracks the highest-scoring
7//! way to interleave query characters into the target, rewarding
8//! consecutive matches, word boundaries, camelCase transitions, etc.
9//! The DP uses an arena of backpointer nodes instead of cloning
10//! position vectors, keeping each state update O(1).
11//!
12//! 2. A **contiguous-substring pass** ([`find_contiguous_match`]) that
13//! finds literal-substring occurrences of the query and scores them.
14//! The DP sometimes prefers scattered matches with more word-boundary
15//! bonuses, so we always compare with the substring result and take
16//! whichever scores higher.
17//!
18//! Early rejection via [`super::pattern::is_subsequence_prepared`] happens
19//! before any target-side allocation, so the heavy lifting here only runs
20//! for candidates that could plausibly match.
21//!
22//! # Amortised hot path via [`FuzzyMatcher`]
23//!
24//! A [`FuzzyMatcher`] owns a [`PreparedPattern`] *and* two reusable
25//! `Vec<char>` scratch buffers for the target. Callers construct one per
26//! keystroke and reuse it across all candidate targets:
27//!
28//! ```ignore
29//! let mut matcher = FuzzyMatcher::new(search_query);
30//! for file in files {
31//! let m = matcher.match_target(&file.relative_path);
32//! // ...
33//! }
34//! ```
35//!
36//! On the first call the scratch buffers grow to the size of the longest
37//! target seen; every subsequent call truncates and refills them in place,
38//! so the allocator is touched once per keystroke rather than once per
39//! candidate. This is the explicit alternative to stashing the scratch in
40//! `thread_local!` state — lifetimes are visible in the signature and
41//! future parallelisation can give each worker its own matcher.
42
43use super::pattern::{is_subsequence_prepared, PreparedPattern, PreparedTerm};
44use super::{score, FuzzyMatch};
45
46/// A reusable fuzzy matcher that amortises both query preparation and
47/// target-side scratch allocation across many calls.
48///
49/// See the module docs for usage.
50#[derive(Debug, Clone)]
51pub struct FuzzyMatcher {
52 pattern: PreparedPattern,
53 /// Scratch buffer: `target.chars().collect()`, cleared and refilled per call.
54 target_chars: Vec<char>,
55 /// Scratch buffer: lowercased target chars, cleared and refilled per call.
56 target_lower: Vec<char>,
57}
58
59impl FuzzyMatcher {
60 /// Create a new matcher for the given query, with empty scratch buffers.
61 ///
62 /// The scratch buffers start empty and grow lazily on the first
63 /// [`match_target`](Self::match_target) call that gets past rejection.
64 pub fn new(query: &str) -> Self {
65 Self::from_pattern(PreparedPattern::new(query))
66 }
67
68 /// Construct a matcher from an already-built [`PreparedPattern`].
69 pub fn from_pattern(pattern: PreparedPattern) -> Self {
70 Self {
71 pattern,
72 target_chars: Vec::new(),
73 target_lower: Vec::new(),
74 }
75 }
76
77 /// Returns `true` if the prepared query has no terms (empty / whitespace).
78 pub fn is_empty(&self) -> bool {
79 self.pattern.is_empty()
80 }
81
82 /// Match a single target string against this matcher's query.
83 ///
84 /// Reuses the internal `Vec<char>` scratch buffers — no per-call
85 /// allocations on the common "passed rejection → scoring" path after
86 /// the first call has warmed them up.
87 pub fn match_target(&mut self, target: &str) -> FuzzyMatch {
88 if self.pattern.is_empty() {
89 return FuzzyMatch {
90 matched: true,
91 score: 0,
92 match_positions: Vec::new(),
93 };
94 }
95
96 // Fast rejection gate. Runs entirely on `&str`, no allocation.
97 // In the multi-term case we must pass the gate for *every* term.
98 for term in &self.pattern.terms {
99 if !is_subsequence_prepared(term, target) {
100 return FuzzyMatch::no_match();
101 }
102 }
103
104 // Refill scratch buffers in place. No `malloc`/`realloc` on
105 // steady-state — the existing capacity is reused after the first
106 // call that made them large enough for the corpus's worst case.
107 refill_target(&mut self.target_chars, &mut self.target_lower, target);
108
109 // Split the borrow so the scoring helpers can see both the pattern
110 // and the scratch buffers simultaneously without fighting the
111 // borrow checker.
112 let pattern = &self.pattern;
113 let target_chars: &[char] = &self.target_chars;
114 let target_lower: &[char] = &self.target_lower;
115
116 if pattern.terms.len() > 1 {
117 score_multi_term(pattern, target_chars, target_lower)
118 } else {
119 score_single_term(&pattern.terms[0], target_chars, target_lower)
120 }
121 }
122}
123
124/// Entry point used by the backward-compatible free functions.
125///
126/// Creates a temporary `FuzzyMatcher` (which allocates empty scratch Vecs
127/// but does not grow them until first use), so callers using the legacy
128/// `fuzzy_match_prepared` API still work, just without cross-call
129/// amortisation. Hot-path callers should construct their own
130/// `FuzzyMatcher` and reuse it.
131pub(super) fn match_prepared(pattern: &PreparedPattern, target: &str) -> FuzzyMatch {
132 let mut matcher = FuzzyMatcher::from_pattern(pattern.clone());
133 matcher.match_target(target)
134}
135
136/// Refill the scratch buffers with the raw and lowercased characters of
137/// `target`. Grows the buffers if needed; otherwise reuses existing
138/// capacity, so steady-state calls do zero allocations.
139///
140/// Note: for non-ASCII chars with expanding case mappings (e.g. Turkish
141/// `İ` → `i` + combining dot), `target_lower` can be longer than
142/// `target_chars`. This matches the semantics of the previous
143/// `target.to_lowercase().chars().collect()` implementation, including
144/// its latent indexing inconsistency — preserving behaviour is out of
145/// scope for this change.
146fn refill_target(target_chars: &mut Vec<char>, target_lower: &mut Vec<char>, target: &str) {
147 target_chars.clear();
148 target_lower.clear();
149 // `target.len()` is the byte length, which is an upper bound on the
150 // character count (chars are 1-4 bytes in UTF-8). Reserving once
151 // avoids any growth checks during the push loop.
152 target_chars.reserve(target.len());
153 target_lower.reserve(target.len());
154 for c in target.chars() {
155 target_chars.push(c);
156 for lc in c.to_lowercase() {
157 target_lower.push(lc);
158 }
159 }
160}
161
162/// Score a single prepared term against a target whose char buffers have
163/// already been refilled in the caller's scratch space.
164fn score_single_term(
165 term: &PreparedTerm,
166 target_chars: &[char],
167 target_lower: &[char],
168) -> FuzzyMatch {
169 let query_lower = &term.lower_chars;
170 let query_len = query_lower.len();
171 let target_len = target_lower.len();
172
173 // Try to find the best matching positions using a DP approach.
174 let dp_result = find_best_match(query_lower, target_chars, target_lower);
175
176 // Also check for a contiguous substring match. The DP may miss this
177 // because it optimises per-character bonuses (word boundaries, etc.)
178 // which can favour scattered matches over a tight substring.
179 let substr_result = find_contiguous_match(query_lower, target_chars, target_lower);
180
181 // Pick the better result.
182 let (positions, mut final_score) = match (dp_result, substr_result) {
183 (Some(dp), Some(sub)) => {
184 if sub.1 >= dp.1 {
185 sub
186 } else {
187 dp
188 }
189 }
190 (Some(dp), None) => dp,
191 (None, Some(sub)) => sub,
192 (None, None) => return FuzzyMatch::no_match(),
193 };
194
195 // Check if all matched positions are consecutive (contiguous substring).
196 let is_contiguous =
197 positions.len() == query_len && positions.windows(2).all(|w| w[1] == w[0] + 1);
198
199 if is_contiguous {
200 final_score += score::CONTIGUOUS_SUBSTRING;
201 }
202
203 // Exact match bonus: query matches entire target.
204 let mut credited_full_path_prefix = false;
205 if query_len == target_len {
206 final_score += score::EXACT_MATCH;
207 } else if target_len > query_len && is_contiguous {
208 // Check if the query is a prefix match (all consecutive from start).
209 let is_prefix_match = positions.iter().enumerate().all(|(i, &pos)| pos == i);
210
211 if is_prefix_match && query_len < target_chars.len() {
212 let next_char = target_chars[query_len];
213 if next_char == '.' {
214 // Highest priority: exact basename match (before extension).
215 final_score += score::EXACT_MATCH;
216 credited_full_path_prefix = true;
217 } else if next_char == '-' || next_char == '_' || next_char == ' ' {
218 // Second priority: match before word separator.
219 final_score += score::EXACT_BASENAME_MATCH;
220 credited_full_path_prefix = true;
221 }
222 }
223 }
224
225 // Path-segment prefix bonus. A contiguous match that begins at the start
226 // of a path segment (basename or interior directory) outranks an equally
227 // contiguous match that lands mid-segment. This is what users expect
228 // when typing a short query like "ts": a basename starting with "ts"
229 // (e.g. `tsconfig.json`) should rank above unrelated files whose only
230 // `ts` happens to follow an extension dot (e.g. `pkg.ts`, `finder.ts`).
231 //
232 // Skipped when the existing `EXACT_*` block above already credited a
233 // full-path prefix that ends in a separator — those matches are already
234 // in the top tier and would be double-counted otherwise.
235 if is_contiguous
236 && !credited_full_path_prefix
237 && !positions.is_empty()
238 && positions.len() == query_len
239 {
240 let start = positions[0];
241 let starts_segment = start == 0 || (start > 0 && target_chars[start - 1] == '/');
242 if starts_segment {
243 let last_slash = target_chars.iter().rposition(|&c| c == '/');
244 let basename_start = last_slash.map(|i| i + 1).unwrap_or(0);
245 if start == basename_start {
246 final_score += score::BASENAME_PREFIX;
247 } else {
248 final_score += score::PATH_SEGMENT_PREFIX;
249 }
250 }
251 }
252
253 FuzzyMatch {
254 matched: true,
255 score: final_score,
256 match_positions: positions,
257 }
258}
259
260/// Score a multi-term pattern against a target whose char buffers have
261/// already been refilled in the caller's scratch space.
262fn score_multi_term(
263 pattern: &PreparedPattern,
264 target_chars: &[char],
265 target_lower: &[char],
266) -> FuzzyMatch {
267 let mut total_score = 0;
268 let mut all_positions = Vec::new();
269
270 for term in &pattern.terms {
271 let result = score_single_term(term, target_chars, target_lower);
272 if !result.matched {
273 return FuzzyMatch::no_match();
274 }
275 total_score += result.score;
276 all_positions.extend(result.match_positions);
277 }
278
279 // Sort and deduplicate positions (terms may have overlapping matches).
280 all_positions.sort_unstable();
281 all_positions.dedup();
282
283 // Tight-span bonus: reward targets where each term appears as a
284 // contiguous substring and all the term matches are packed close
285 // together (regardless of what sits between them). This handles
286 // every realistic "the query reconstructs a single name" case
287 // uniformly — "/etc/hosts", "save_file.rs", "saveFile.rs",
288 // "etcmohosts", even "foo.bar.baz" — without hard-coding a list
289 // of separator characters that would miss camelCase, runs-together
290 // words, or unusual punctuation.
291 //
292 // The bonus fires iff:
293 // 1. Every term appears as a contiguous substring in `target_lower`,
294 // 2. In the order given by the query, and
295 // 3. The gap between consecutive term matches is small (we allow
296 // up to 4 chars per gap — generous enough for "etcmohosts"
297 // and "foo/bar/baz" but not for two terms at opposite ends
298 // of a long path).
299 if let Some((span_start, span_end)) = find_sequential_term_span(target_lower, &pattern.terms) {
300 let term_len_sum: usize = pattern.terms.iter().map(|t| t.lower_chars.len()).sum();
301 let span_len = span_end - span_start;
302 let gap_excess = span_len.saturating_sub(term_len_sum);
303 let num_gaps = pattern.terms.len().saturating_sub(1);
304 // Tolerance scales with the number of gaps: 4 chars per gap.
305 let max_gap_excess = num_gaps.saturating_mul(4);
306
307 if gap_excess <= max_gap_excess {
308 total_score += score::EXACT_MATCH;
309
310 let target_char_count = target_chars.len();
311 all_positions = (span_start..span_end)
312 .filter(|&i| i < target_char_count)
313 .collect();
314 }
315 }
316
317 FuzzyMatch {
318 matched: true,
319 score: total_score,
320 match_positions: all_positions,
321 }
322}
323
324/// Greedily find the first sequential occurrence of each term as a
325/// contiguous substring in `target_lower`, advancing past each match
326/// before searching for the next term. Returns the `(start, end)`
327/// char indices of the overall span covering the first term's start
328/// through the last term's end, or `None` if any term can't be found
329/// contiguously in the required order.
330///
331/// "Greedy" here means we take the *earliest* occurrence of each term,
332/// not the tightest span — tighter spans would require backtracking
333/// and are not worth the cost for a ranking signal.
334fn find_sequential_term_span(
335 target_lower: &[char],
336 terms: &[PreparedTerm],
337) -> Option<(usize, usize)> {
338 let mut search_from = 0;
339 let mut first_start: Option<usize> = None;
340 let mut last_end = 0;
341 for term in terms {
342 let needle = &term.lower_chars;
343 if needle.is_empty() {
344 continue;
345 }
346 if search_from + needle.len() > target_lower.len() {
347 return None;
348 }
349 let end_bound = target_lower.len() - needle.len();
350 let found =
351 (search_from..=end_bound).find(|&s| target_lower[s..s + needle.len()] == *needle)?;
352 if first_start.is_none() {
353 first_start = Some(found);
354 }
355 search_from = found + needle.len();
356 last_end = search_from;
357 }
358 first_start.map(|start| (start, last_end))
359}
360
361/// Find the best contiguous substring match of `query` in `target`.
362///
363/// Scans for all occurrences of the query as a substring and picks the
364/// one with the highest score (preferring word boundaries, basename, etc.).
365fn find_contiguous_match(
366 query: &[char],
367 target_chars: &[char],
368 target_lower: &[char],
369) -> Option<(Vec<usize>, i32)> {
370 let m = query.len();
371 let n = target_lower.len();
372 if m == 0 || m > n {
373 return None;
374 }
375
376 let mut best: Option<(Vec<usize>, i32)> = None;
377
378 for start in 0..=n - m {
379 // Check if query matches at this position.
380 if target_lower[start..start + m] != *query {
381 continue;
382 }
383
384 // Score this contiguous match.
385 let mut match_score = 0;
386
387 if start == 0 {
388 match_score += score::START_OF_STRING;
389 }
390
391 // Word boundary bonus for the first character.
392 if start > 0 && start < target_chars.len() {
393 let prev_char = target_chars[start - 1];
394 if prev_char == ' '
395 || prev_char == '_'
396 || prev_char == '-'
397 || prev_char == '/'
398 || prev_char == '.'
399 {
400 match_score += score::WORD_BOUNDARY;
401 } else if prev_char.is_lowercase() && target_chars[start].is_uppercase() {
402 match_score += score::CAMEL_CASE;
403 }
404 }
405
406 // Consecutive bonus for chars 1..m.
407 match_score += score::CONSECUTIVE * (m as i32 - 1);
408
409 let is_better = match &best {
410 None => true,
411 Some((_, s)) => match_score > *s,
412 };
413 if is_better {
414 let positions: Vec<usize> = (start..start + m).collect();
415 best = Some((positions, match_score));
416 }
417 }
418
419 best
420}
421
422/// A single node in the backpointer arena used by [`find_best_match`].
423///
424/// Each node records the target index matched for one query character and a
425/// link to the node that matched the previous query character (or `None` for
426/// the first match). Walking back from the final node reconstructs the full
427/// list of match positions without ever cloning a `Vec<usize>`.
428#[derive(Clone, Copy)]
429struct ChainNode {
430 ti: usize,
431 prev: Option<u32>,
432}
433
434/// Find the best matching positions for query in target.
435///
436/// Same greedy DP as the original implementation, but replaces the
437/// `Vec<usize>` position-cloning with an arena of linked backpointer nodes.
438/// Per-state updates become O(1) (push one node) instead of O(m) (clone the
439/// full positions vector), turning the worst-case cost from O(n·m²) into
440/// O(n·m) with a single linear walk at the end to reconstruct positions.
441///
442/// Callers are expected to have already run subsequence rejection via
443/// [`super::pattern::is_subsequence_prepared`] before allocating the
444/// target buffers this function consumes — the duplicate guard that used
445/// to live here was pure overhead on the hot path and has been removed.
446fn find_best_match(
447 query: &[char],
448 target_chars: &[char],
449 target_lower: &[char],
450) -> Option<(Vec<usize>, i32)> {
451 if query.is_empty() {
452 return Some((Vec::new(), 0));
453 }
454
455 let n = target_lower.len();
456 let m = query.len();
457
458 if n < m {
459 return None;
460 }
461
462 // Arena of backpointer nodes. `best_node_for_qi[qi]` indexes into the
463 // arena for the currently-best chain ending at query index `qi`, or
464 // `None` if no match for `query[0..qi]` has been seen yet. `qi == 0`
465 // is always `None` (empty chain).
466 let mut arena: Vec<ChainNode> = Vec::with_capacity(m.saturating_mul(4));
467 let mut best_score: Vec<Option<i32>> = vec![None; m + 1];
468 let mut best_node_for_qi: Vec<Option<u32>> = vec![None; m + 1];
469 best_score[0] = Some(0); // empty query matches with score 0
470
471 for ti in 0..n {
472 // Process in reverse so we don't use values we just wrote this iteration.
473 for qi in (0..m).rev() {
474 if target_lower[ti] != query[qi] {
475 continue;
476 }
477
478 // Can we extend the best chain for query[0..qi]?
479 let prev_score = match best_score[qi] {
480 Some(s) => s,
481 None => continue,
482 };
483 // `last_match_pos` comes from the previous chain head, or None
484 // if `qi == 0` (first character of the query).
485 let prev_last_pos = best_node_for_qi[qi].map(|idx| arena[idx as usize].ti);
486
487 // Match positions must be strictly increasing.
488 if let Some(lp) = prev_last_pos {
489 if ti <= lp {
490 continue;
491 }
492 }
493
494 // Score the (ti, prev_last_pos) transition.
495 let mut match_score = 0;
496
497 // Start of string bonus
498 if ti == 0 {
499 match_score += score::START_OF_STRING;
500 }
501
502 // Word boundary bonus
503 if ti > 0 && ti < target_chars.len() {
504 let prev_char = target_chars[ti - 1];
505 if prev_char == ' '
506 || prev_char == '_'
507 || prev_char == '-'
508 || prev_char == '/'
509 || prev_char == '.'
510 {
511 match_score += score::WORD_BOUNDARY;
512 } else if prev_char.is_lowercase() && target_chars[ti].is_uppercase() {
513 match_score += score::CAMEL_CASE;
514 }
515 }
516
517 // Consecutive / gap handling
518 if let Some(lp) = prev_last_pos {
519 if ti == lp + 1 {
520 match_score += score::CONSECUTIVE;
521 } else {
522 let gap_size = ti - lp - 1;
523 match_score += score::GAP_START_PENALTY;
524 match_score += score::GAP_PENALTY * (gap_size as i32 - 1).max(0);
525 }
526 }
527
528 let new_score = prev_score + match_score;
529
530 let should_update = match best_score[qi + 1] {
531 None => true,
532 Some(curr) => new_score > curr,
533 };
534
535 if should_update {
536 let new_idx = arena.len() as u32;
537 arena.push(ChainNode {
538 ti,
539 prev: best_node_for_qi[qi],
540 });
541 best_score[qi + 1] = Some(new_score);
542 best_node_for_qi[qi + 1] = Some(new_idx);
543 }
544 }
545 }
546
547 let final_score = best_score[m]?;
548 let final_node = best_node_for_qi[m]?;
549
550 // Walk backwards through the arena to recover positions.
551 let mut positions = vec![0usize; m];
552 let mut cursor = Some(final_node);
553 let mut idx = m;
554 while let Some(node_idx) = cursor {
555 debug_assert!(idx > 0);
556 idx -= 1;
557 let node = arena[node_idx as usize];
558 positions[idx] = node.ti;
559 cursor = node.prev;
560 }
561
562 Some((positions, final_score))
563}