nucleo_matcher/pattern.rs
1//! This module provides a slightly higher level API for matching strings.
2
3use std::cmp::Reverse;
4
5use crate::{chars, Matcher, Utf32Str};
6
7#[cfg(test)]
8mod tests;
9
10use crate::Utf32String;
11
12#[derive(Clone, Copy, Debug, PartialEq, Eq, Default)]
13#[non_exhaustive]
14/// How to treat a case mismatch between two characters.
15pub enum CaseMatching {
16 /// Characters never match their case folded version (`a != A`).
17 Respect,
18 /// Characters always match their case folded version (`a == A`).
19 #[cfg(feature = "unicode-casefold")]
20 Ignore,
21 /// Acts like [`Ignore`](CaseMatching::Ignore) if all characters in a pattern atom are
22 /// lowercase and like [`Respect`](CaseMatching::Respect) otherwise.
23 #[default]
24 #[cfg(feature = "unicode-casefold")]
25 Smart,
26}
27
28#[derive(Clone, Copy, Debug, PartialEq, Eq, Default)]
29#[non_exhaustive]
30/// How to handle unicode normalization,
31pub enum Normalization {
32 /// Characters never match their normalized version (`a != ä`).
33 Never,
34 /// Acts like [`Never`](Normalization::Never) if any character in a pattern atom
35 /// would need to be normalized. Otherwise normalization occurs (`a == ä` but `ä != a`).
36 #[default]
37 #[cfg(feature = "unicode-normalization")]
38 Smart,
39}
40
41#[derive(Debug, PartialEq, Eq, Clone, Copy)]
42#[non_exhaustive]
43/// The kind of matching algorithm to run for an atom.
44pub enum AtomKind {
45 /// Fuzzy matching where the needle must match any haystack characters
46 /// (match can contain gaps). This atom kind is used by default if no
47 /// special syntax is used. There is no negated fuzzy matching (too
48 /// many false positives).
49 ///
50 /// See also [`Matcher::fuzzy_match`](crate::Matcher::fuzzy_match).
51 Fuzzy,
52 /// The needle must match a contiguous sequence of haystack characters
53 /// without gaps. This atom kind is parsed from the following syntax:
54 /// `'foo` and `!foo` (negated).
55 ///
56 /// See also [`Matcher::substring_match`](crate::Matcher::substring_match).
57 Substring,
58 /// The needle must match all leading haystack characters without gaps or
59 /// prefix. This atom kind is parsed from the following syntax: `^foo` and
60 /// `!^foo` (negated).
61 ///
62 /// See also [`Matcher::prefix_match`](crate::Matcher::prefix_match).
63 Prefix,
64 /// The needle must match all trailing haystack characters without gaps or
65 /// postfix. This atom kind is parsed from the following syntax: `foo$` and
66 /// `!foo$` (negated).
67 ///
68 /// See also [`Matcher::postfix_match`](crate::Matcher::postfix_match).
69 Postfix,
70 /// The needle must match all haystack characters without gaps or prefix.
71 /// This atom kind is parsed from the following syntax: `^foo$` and `!^foo$`
72 /// (negated).
73 ///
74 /// See also [`Matcher::exact_match`](crate::Matcher::exact_match).
75 Exact,
76}
77
78/// A single pattern component that is matched with a single [`Matcher`] function
79#[derive(Debug, PartialEq, Eq, Clone)]
80pub struct Atom {
81 /// Whether this pattern atom is a negative match.
82 /// A negative pattern atom will prevent haystacks matching it from
83 /// being matchend. It does not contribute to scoring/indices
84 pub negative: bool,
85 /// The kind of match that this pattern performs
86 pub kind: AtomKind,
87 needle: Utf32String,
88 ignore_case: bool,
89 normalize: bool,
90}
91
92impl Atom {
93 /// Creates a single [`Atom`] from a string by performing unicode
94 /// normalization and case folding (if necessary). Optionally `\ ` can
95 /// be escaped to ` `.
96 pub fn new(
97 needle: &str,
98 case: CaseMatching,
99 normalize: Normalization,
100 kind: AtomKind,
101 escape_whitespace: bool,
102 ) -> Atom {
103 Atom::new_inner(needle, case, normalize, kind, escape_whitespace, false)
104 }
105
106 fn new_inner(
107 needle: &str,
108 case: CaseMatching,
109 normalization: Normalization,
110 kind: AtomKind,
111 escape_whitespace: bool,
112 append_dollar: bool,
113 ) -> Atom {
114 let mut ignore_case;
115 let mut normalize;
116 #[cfg(feature = "unicode-normalization")]
117 {
118 normalize = matches!(normalization, Normalization::Smart);
119 }
120 #[cfg(not(feature = "unicode-normalization"))]
121 {
122 normalize = false;
123 }
124 let needle = if needle.is_ascii() {
125 let mut needle = if escape_whitespace {
126 if let Some((start, rem)) = needle.split_once("\\ ") {
127 let mut needle = start.to_owned();
128 for rem in rem.split("\\ ") {
129 needle.push(' ');
130 needle.push_str(rem);
131 }
132 needle
133 } else {
134 needle.to_owned()
135 }
136 } else {
137 needle.to_owned()
138 };
139
140 match case {
141 #[cfg(feature = "unicode-casefold")]
142 CaseMatching::Ignore => {
143 ignore_case = true;
144 needle.make_ascii_lowercase()
145 }
146 #[cfg(feature = "unicode-casefold")]
147 CaseMatching::Smart => {
148 ignore_case = !needle.bytes().any(|b| b.is_ascii_uppercase())
149 }
150 CaseMatching::Respect => ignore_case = false,
151 }
152 if append_dollar {
153 needle.push('$');
154 }
155 Utf32String::Ascii(needle.into_boxed_str())
156 } else {
157 let mut needle_ = Vec::with_capacity(needle.len());
158 #[cfg(feature = "unicode-casefold")]
159 {
160 ignore_case = matches!(case, CaseMatching::Ignore | CaseMatching::Smart);
161 }
162 #[cfg(not(feature = "unicode-casefold"))]
163 {
164 ignore_case = false;
165 }
166 #[cfg(feature = "unicode-normalization")]
167 {
168 normalize = matches!(normalization, Normalization::Smart);
169 }
170 if escape_whitespace {
171 let mut saw_backslash = false;
172 for mut c in chars::graphemes(needle) {
173 if saw_backslash {
174 if c == ' ' {
175 needle_.push(' ');
176 saw_backslash = false;
177 continue;
178 } else {
179 needle_.push('\\');
180 }
181 }
182 saw_backslash = c == '\\';
183 match case {
184 #[cfg(feature = "unicode-casefold")]
185 CaseMatching::Ignore => c = chars::to_lower_case(c),
186 #[cfg(feature = "unicode-casefold")]
187 CaseMatching::Smart => {
188 ignore_case = ignore_case && !chars::is_upper_case(c)
189 }
190 CaseMatching::Respect => (),
191 }
192 match normalization {
193 #[cfg(feature = "unicode-normalization")]
194 Normalization::Smart => {
195 normalize = normalize && chars::normalize(c) == c;
196 }
197 Normalization::Never => (),
198 }
199 needle_.push(c);
200 }
201 } else {
202 let chars = chars::graphemes(needle).map(|mut c| {
203 match case {
204 #[cfg(feature = "unicode-casefold")]
205 CaseMatching::Ignore => c = chars::to_lower_case(c),
206 #[cfg(feature = "unicode-casefold")]
207 CaseMatching::Smart => {
208 ignore_case = ignore_case && !chars::is_upper_case(c);
209 }
210 CaseMatching::Respect => (),
211 }
212 match normalization {
213 #[cfg(feature = "unicode-normalization")]
214 Normalization::Smart => {
215 normalize = normalize && chars::normalize(c) == c;
216 }
217 Normalization::Never => (),
218 }
219 c
220 });
221 needle_.extend(chars);
222 };
223 if append_dollar {
224 needle_.push('$');
225 }
226 Utf32String::Unicode(needle_.into_boxed_slice())
227 };
228 Atom {
229 kind,
230 needle,
231 negative: false,
232 ignore_case,
233 normalize,
234 }
235 }
236
237 /// Parse a pattern atom from a string. Some special trailing and leading
238 /// characters can be used to control the atom kind. See [`AtomKind`] for
239 /// details.
240 pub fn parse(raw: &str, case: CaseMatching, normalize: Normalization) -> Atom {
241 let mut atom = raw;
242 let invert = match atom.as_bytes() {
243 [b'!', ..] => {
244 atom = &atom[1..];
245 true
246 }
247 [b'\\', b'!', ..] => {
248 atom = &atom[1..];
249 false
250 }
251 _ => false,
252 };
253
254 let mut kind = match atom.as_bytes() {
255 [b'^', ..] => {
256 atom = &atom[1..];
257 AtomKind::Prefix
258 }
259 [b'\'', ..] => {
260 atom = &atom[1..];
261 AtomKind::Substring
262 }
263 [b'\\', b'^' | b'\'', ..] => {
264 atom = &atom[1..];
265 AtomKind::Fuzzy
266 }
267 _ => AtomKind::Fuzzy,
268 };
269
270 let mut append_dollar = false;
271 match atom.as_bytes() {
272 [.., b'\\', b'$'] => {
273 append_dollar = true;
274 atom = &atom[..atom.len() - 2]
275 }
276 [.., b'$'] => {
277 kind = if kind == AtomKind::Fuzzy {
278 AtomKind::Postfix
279 } else {
280 AtomKind::Exact
281 };
282 atom = &atom[..atom.len() - 1]
283 }
284 _ => (),
285 }
286
287 if invert && kind == AtomKind::Fuzzy {
288 kind = AtomKind::Substring
289 }
290
291 let mut pattern = Atom::new_inner(atom, case, normalize, kind, true, append_dollar);
292 pattern.negative = invert;
293 pattern
294 }
295
296 /// Matches this pattern against `haystack` (using the allocation and configuration
297 /// from `matcher`) and calculates a ranking score. See the [`Matcher`].
298 /// Documentation for more details.
299 ///
300 /// *Note:* The `ignore_case` setting is overwritten to match the casing of
301 /// each pattern atom.
302 pub fn score(&self, haystack: Utf32Str<'_>, matcher: &mut Matcher) -> Option<u16> {
303 matcher.config.ignore_case = self.ignore_case;
304 matcher.config.normalize = self.normalize;
305 let pattern_score = match self.kind {
306 AtomKind::Exact => matcher.exact_match(haystack, self.needle.slice(..)),
307 AtomKind::Fuzzy => matcher.fuzzy_match(haystack, self.needle.slice(..)),
308 AtomKind::Substring => matcher.substring_match(haystack, self.needle.slice(..)),
309 AtomKind::Prefix => matcher.prefix_match(haystack, self.needle.slice(..)),
310 AtomKind::Postfix => matcher.postfix_match(haystack, self.needle.slice(..)),
311 };
312 if self.negative {
313 if pattern_score.is_some() {
314 return None;
315 }
316 Some(0)
317 } else {
318 pattern_score
319 }
320 }
321
322 /// Matches this pattern against `haystack` (using the allocation and
323 /// configuration from `matcher`), calculates a ranking score and the match
324 /// indices. See the [`Matcher`]. Documentation for more
325 /// details.
326 ///
327 /// *Note:* The `ignore_case` setting is overwritten to match the casing of
328 /// each pattern atom.
329 ///
330 /// *Note:* The `indices` vector is not cleared by this function.
331 pub fn indices(
332 &self,
333 haystack: Utf32Str<'_>,
334 matcher: &mut Matcher,
335 indices: &mut Vec<u32>,
336 ) -> Option<u16> {
337 matcher.config.ignore_case = self.ignore_case;
338 matcher.config.normalize = self.normalize;
339 if self.negative {
340 let pattern_score = match self.kind {
341 AtomKind::Exact => matcher.exact_match(haystack, self.needle.slice(..)),
342 AtomKind::Fuzzy => matcher.fuzzy_match(haystack, self.needle.slice(..)),
343 AtomKind::Substring => matcher.substring_match(haystack, self.needle.slice(..)),
344 AtomKind::Prefix => matcher.prefix_match(haystack, self.needle.slice(..)),
345 AtomKind::Postfix => matcher.postfix_match(haystack, self.needle.slice(..)),
346 };
347 pattern_score.is_none().then_some(0)
348 } else {
349 match self.kind {
350 AtomKind::Exact => matcher.exact_indices(haystack, self.needle.slice(..), indices),
351 AtomKind::Fuzzy => matcher.fuzzy_indices(haystack, self.needle.slice(..), indices),
352 AtomKind::Substring => {
353 matcher.substring_indices(haystack, self.needle.slice(..), indices)
354 }
355 AtomKind::Prefix => {
356 matcher.prefix_indices(haystack, self.needle.slice(..), indices)
357 }
358 AtomKind::Postfix => {
359 matcher.postfix_indices(haystack, self.needle.slice(..), indices)
360 }
361 }
362 }
363 }
364
365 /// Returns the needle text that is passed to the matcher. All indices
366 /// produced by the `indices` functions produce char indices used to index
367 /// this text
368 pub fn needle_text(&self) -> Utf32Str<'_> {
369 self.needle.slice(..)
370 }
371 /// Convenience function to easily match (and sort) a (relatively small)
372 /// list of inputs.
373 ///
374 /// *Note* This function is not recommended for building a full fuzzy
375 /// matching application that can match large numbers of matches (like all
376 /// files in a directory) as all matching is done on the current thread,
377 /// effectively blocking the UI. For such applications the high level
378 /// `nucleo` crate can be used instead.
379 pub fn match_list<T: AsRef<str>>(
380 &self,
381 items: impl IntoIterator<Item = T>,
382 matcher: &mut Matcher,
383 ) -> Vec<(T, u16)> {
384 if self.needle.is_empty() {
385 return items.into_iter().map(|item| (item, 0)).collect();
386 }
387 let mut buf = Vec::new();
388 let mut items: Vec<_> = items
389 .into_iter()
390 .filter_map(|item| {
391 self.score(Utf32Str::new(item.as_ref(), &mut buf), matcher)
392 .map(|score| (item, score))
393 })
394 .collect();
395 items.sort_by_key(|(_, score)| Reverse(*score));
396 items
397 }
398}
399
400fn pattern_atoms(pattern: &str) -> impl Iterator<Item = &str> + '_ {
401 let mut saw_backslash = false;
402 pattern.split(move |c| {
403 saw_backslash = match c {
404 ' ' if !saw_backslash => return true,
405 '\\' => true,
406 _ => false,
407 };
408 false
409 })
410}
411
412#[derive(Debug, Default)]
413/// A text pattern made up of (potentially multiple) [atoms](crate::pattern::Atom).
414#[non_exhaustive]
415pub struct Pattern {
416 /// The individual pattern (words) in this pattern
417 pub atoms: Vec<Atom>,
418}
419
420impl Pattern {
421 /// Creates a pattern where each word is matched individually (whitespaces
422 /// can be escaped with `\`). Otherwise no parsing is performed (so $, !, '
423 /// and ^ don't receive special treatment). If you want to match the entire
424 /// pattern as a single needle use a single [`Atom`] instead.
425 pub fn new(
426 pattern: &str,
427 case_matching: CaseMatching,
428 normalize: Normalization,
429 kind: AtomKind,
430 ) -> Pattern {
431 let atoms = pattern_atoms(pattern)
432 .filter_map(|pat| {
433 let pat = Atom::new(pat, case_matching, normalize, kind, true);
434 (!pat.needle.is_empty()).then_some(pat)
435 })
436 .collect();
437 Pattern { atoms }
438 }
439 /// Creates a pattern where each word is matched individually (whitespaces
440 /// can be escaped with `\`). And $, !, ' and ^ at word boundaries will
441 /// cause different matching behaviour (see [`AtomKind`]). These can be
442 /// escaped with backslash.
443 pub fn parse(pattern: &str, case_matching: CaseMatching, normalize: Normalization) -> Pattern {
444 let atoms = pattern_atoms(pattern)
445 .filter_map(|pat| {
446 let pat = Atom::parse(pat, case_matching, normalize);
447 (!pat.needle.is_empty()).then_some(pat)
448 })
449 .collect();
450 Pattern { atoms }
451 }
452
453 /// Convenience function to easily match (and sort) a (relatively small)
454 /// list of inputs.
455 ///
456 /// *Note* This function is not recommended for building a full fuzzy
457 /// matching application that can match large numbers of matches (like all
458 /// files in a directory) as all matching is done on the current thread,
459 /// effectively blocking the UI. For such applications the high level
460 /// `nucleo` crate can be used instead.
461 pub fn match_list<T: AsRef<str>>(
462 &self,
463 items: impl IntoIterator<Item = T>,
464 matcher: &mut Matcher,
465 ) -> Vec<(T, u32)> {
466 if self.atoms.is_empty() {
467 return items.into_iter().map(|item| (item, 0)).collect();
468 }
469 let mut buf = Vec::new();
470 let mut items: Vec<_> = items
471 .into_iter()
472 .filter_map(|item| {
473 self.score(Utf32Str::new(item.as_ref(), &mut buf), matcher)
474 .map(|score| (item, score))
475 })
476 .collect();
477 items.sort_by_key(|(_, score)| Reverse(*score));
478 items
479 }
480
481 /// Matches this pattern against `haystack` (using the allocation and configuration
482 /// from `matcher`) and calculates a ranking score. See the [`Matcher`].
483 /// Documentation for more details.
484 ///
485 /// *Note:* The `ignore_case` setting is overwritten to match the casing of
486 /// each pattern atom.
487 pub fn score(&self, haystack: Utf32Str<'_>, matcher: &mut Matcher) -> Option<u32> {
488 if self.atoms.is_empty() {
489 return Some(0);
490 }
491 let mut score = 0;
492 for pattern in &self.atoms {
493 score += pattern.score(haystack, matcher)? as u32;
494 }
495 Some(score)
496 }
497
498 /// Matches this pattern against `haystack` (using the allocation and
499 /// configuration from `matcher`), calculates a ranking score and the match
500 /// indices. See the [`Matcher`]. Documentation for more
501 /// details.
502 ///
503 /// *Note:* The `ignore_case` setting is overwritten to match the casing of
504 /// each pattern atom.
505 ///
506 /// *Note:* The indices for each pattern are calculated individually
507 /// and simply appended to the `indices` vector and not deduplicated/sorted.
508 /// This allows associating the match indices to their source pattern. If
509 /// required (like for highlighting) unique/sorted indices can be obtained
510 /// as follows:
511 ///
512 /// ```
513 /// # let mut indices: Vec<u32> = Vec::new();
514 /// indices.sort_unstable();
515 /// indices.dedup();
516 /// ```
517 pub fn indices(
518 &self,
519 haystack: Utf32Str<'_>,
520 matcher: &mut Matcher,
521 indices: &mut Vec<u32>,
522 ) -> Option<u32> {
523 if self.atoms.is_empty() {
524 return Some(0);
525 }
526 let mut score = 0;
527 for pattern in &self.atoms {
528 score += pattern.indices(haystack, matcher, indices)? as u32;
529 }
530 Some(score)
531 }
532
533 /// Refreshes this pattern by reparsing it from a string. This is mostly
534 /// equivalent to just constructing a new pattern using [`Pattern::parse`]
535 /// but is slightly more efficient by reusing some allocations
536 pub fn reparse(
537 &mut self,
538 pattern: &str,
539 case_matching: CaseMatching,
540 normalize: Normalization,
541 ) {
542 self.atoms.clear();
543 let atoms = pattern_atoms(pattern).filter_map(|atom| {
544 let atom = Atom::parse(atom, case_matching, normalize);
545 if atom.needle.is_empty() {
546 return None;
547 }
548 Some(atom)
549 });
550 self.atoms.extend(atoms);
551 }
552}
553
554impl Clone for Pattern {
555 fn clone(&self) -> Self {
556 Self {
557 atoms: self.atoms.clone(),
558 }
559 }
560
561 fn clone_from(&mut self, source: &Self) {
562 self.atoms.clone_from(&source.atoms);
563 }
564}