ncp_matcher/pattern.rs
1//! This module provides a slightly higher level API for matching strings.
2
3use std::cmp::Reverse;
4
5use crate::{Matcher, Utf32Str, chars};
6
7#[cfg(test)]
8mod tests;
9
10use crate::Utf32String;
11
12#[derive(Clone, Copy, Debug, PartialEq, Eq, Default)]
13#[non_exhaustive]
14/// How to treat a case mismatch between two characters.
15pub enum CaseMatching {
16 /// Characters never match their case folded version (`a != A`).
17 #[cfg_attr(not(feature = "unicode-casefold"), default)]
18 Respect,
19 /// Characters always match their case folded version (`a == A`).
20 #[cfg(feature = "unicode-casefold")]
21 Ignore,
22 /// Acts like [`Ignore`](CaseMatching::Ignore) if all characters in a pattern atom are
23 /// lowercase and like [`Respect`](CaseMatching::Respect) otherwise.
24 #[default]
25 #[cfg(feature = "unicode-casefold")]
26 Smart,
27}
28
29#[derive(Clone, Copy, Debug, PartialEq, Eq, Default)]
30#[non_exhaustive]
31/// How to handle unicode normalization.
32pub enum Normalization {
33 /// Characters never match their normalized version (`a != ä`).
34 #[cfg_attr(not(feature = "unicode-normalization"), default)]
35 Never,
36 /// Acts like [`Never`](Normalization::Never) if any character in a pattern atom
37 /// would need to be normalized. Otherwise normalization occurs (`a == ä` but `ä != a`).
38 #[default]
39 #[cfg(feature = "unicode-normalization")]
40 Smart,
41}
42
43#[derive(Debug, PartialEq, Eq, Clone, Copy)]
44#[non_exhaustive]
45/// The kind of matching algorithm to run for an atom.
46pub enum AtomKind {
47 /// Fuzzy matching where the needle must match any haystack characters
48 /// (match can contain gaps). This atom kind is used by default if no
49 /// special syntax is used. There is no negated fuzzy matching (too
50 /// many false positives).
51 ///
52 /// See also [`Matcher::fuzzy_match`](crate::Matcher::fuzzy_match).
53 Fuzzy,
54 /// The needle must match a contiguous sequence of haystack characters
55 /// without gaps. This atom kind is parsed from the following syntax:
56 /// `'foo` and `!foo` (negated).
57 ///
58 /// See also [`Matcher::substring_match`](crate::Matcher::substring_match).
59 Substring,
60 /// The needle must match all leading haystack characters without gaps or
61 /// prefix. This atom kind is parsed from the following syntax: `^foo` and
62 /// `!^foo` (negated).
63 ///
64 /// See also [`Matcher::prefix_match`](crate::Matcher::prefix_match).
65 Prefix,
66 /// The needle must match all trailing haystack characters without gaps or
67 /// postfix. This atom kind is parsed from the following syntax: `foo$` and
68 /// `!foo$` (negated).
69 ///
70 /// See also [`Matcher::postfix_match`](crate::Matcher::postfix_match).
71 Postfix,
72 /// The needle must match all haystack characters without gaps or prefix.
73 /// This atom kind is parsed from the following syntax: `^foo$` and `!^foo$`
74 /// (negated).
75 ///
76 /// See also [`Matcher::exact_match`](crate::Matcher::exact_match).
77 Exact,
78}
79
80/// A single pattern component that is matched with a single [`Matcher`] function.
81#[derive(Debug, PartialEq, Eq, Clone)]
82pub struct Atom {
83 /// Whether this pattern atom is a negative match.
84 /// A negative pattern atom will prevent haystacks matching it from
85 /// being matchend. It does not contribute to scoring/indices
86 pub negative: bool,
87 /// The kind of match that this pattern performs
88 pub kind: AtomKind,
89 needle: Utf32String,
90 ignore_case: bool,
91 normalize: bool,
92}
93
94impl Atom {
95 /// Creates a single [`Atom`] from a string by performing unicode
96 /// normalization and case folding (if necessary). Optionally `\ ` can
97 /// be escaped to ` `.
98 pub fn new(
99 needle: &str,
100 case: CaseMatching,
101 normalize: Normalization,
102 kind: AtomKind,
103 escape_whitespace: bool,
104 ) -> Atom {
105 Atom::new_inner(needle, case, normalize, kind, escape_whitespace, false)
106 }
107
108 fn new_inner(
109 needle: &str,
110 case: CaseMatching,
111 normalization: Normalization,
112 kind: AtomKind,
113 escape_whitespace: bool,
114 append_dollar: bool,
115 ) -> Atom {
116 let mut ignore_case;
117 let mut normalize;
118 #[cfg(feature = "unicode-normalization")]
119 {
120 normalize = matches!(normalization, Normalization::Smart);
121 }
122 #[cfg(not(feature = "unicode-normalization"))]
123 {
124 normalize = false;
125 }
126 let needle = if needle.is_ascii() {
127 let mut needle_string = if escape_whitespace {
128 let mut needle_bytes = Vec::with_capacity(needle.len());
129 let mut saw_backslash = false;
130 for c in needle.bytes() {
131 if saw_backslash {
132 if c.is_ascii_whitespace() {
133 needle_bytes.push(c);
134 saw_backslash = false;
135 continue;
136 } else {
137 needle_bytes.push(b'\\');
138 }
139 }
140 saw_backslash = c == b'\\';
141 if !saw_backslash {
142 needle_bytes.push(c);
143 }
144 }
145 // push the potentially trailing backslash
146 if saw_backslash {
147 needle_bytes.push(b'\\');
148 }
149 // SAFETY: we just checked that needle is ascii, so each `c` is a valid ASCII byte
150 unsafe { String::from_utf8_unchecked(needle_bytes) }
151 } else {
152 needle.to_owned()
153 };
154
155 match case {
156 #[cfg(feature = "unicode-casefold")]
157 CaseMatching::Ignore => {
158 ignore_case = true;
159 needle_string.make_ascii_lowercase()
160 }
161 #[cfg(feature = "unicode-casefold")]
162 CaseMatching::Smart => {
163 ignore_case = !needle_string.bytes().any(|b| b.is_ascii_uppercase())
164 }
165 CaseMatching::Respect => ignore_case = false,
166 }
167
168 if append_dollar {
169 needle_string.push('$');
170 }
171 Utf32String::Ascii(needle_string.into_boxed_str())
172 } else {
173 let mut needle_ = Vec::with_capacity(needle.len());
174 #[cfg(feature = "unicode-casefold")]
175 {
176 ignore_case = matches!(case, CaseMatching::Ignore | CaseMatching::Smart);
177 }
178 #[cfg(not(feature = "unicode-casefold"))]
179 {
180 ignore_case = false;
181 }
182 #[cfg(feature = "unicode-normalization")]
183 {
184 normalize = matches!(normalization, Normalization::Smart);
185 }
186 if escape_whitespace {
187 let mut saw_backslash = false;
188 for mut c in chars::graphemes(needle) {
189 if saw_backslash {
190 if c.is_whitespace() {
191 needle_.push(c);
192 saw_backslash = false;
193 continue;
194 } else {
195 needle_.push('\\');
196 }
197 }
198 saw_backslash = c == '\\';
199 if !saw_backslash {
200 match case {
201 #[cfg(feature = "unicode-casefold")]
202 CaseMatching::Ignore => c = chars::to_lower_case(c),
203 #[cfg(feature = "unicode-casefold")]
204 CaseMatching::Smart => {
205 ignore_case = ignore_case && !chars::is_upper_case(c)
206 }
207 CaseMatching::Respect => (),
208 }
209 match normalization {
210 #[cfg(feature = "unicode-normalization")]
211 Normalization::Smart => {
212 normalize = normalize && chars::normalize(c) == c;
213 }
214 Normalization::Never => (),
215 }
216 needle_.push(c);
217 }
218 }
219 // push the potentially trailing backslash
220 if saw_backslash {
221 needle_.push('\\');
222 }
223 } else {
224 let chars = chars::graphemes(needle).map(|mut c| {
225 match case {
226 #[cfg(feature = "unicode-casefold")]
227 CaseMatching::Ignore => c = chars::to_lower_case(c),
228 #[cfg(feature = "unicode-casefold")]
229 CaseMatching::Smart => {
230 ignore_case = ignore_case && !chars::is_upper_case(c);
231 }
232 CaseMatching::Respect => (),
233 }
234 match normalization {
235 #[cfg(feature = "unicode-normalization")]
236 Normalization::Smart => {
237 normalize = normalize && chars::normalize(c) == c;
238 }
239 Normalization::Never => (),
240 }
241 c
242 });
243 needle_.extend(chars);
244 };
245 if append_dollar {
246 needle_.push('$');
247 }
248 Utf32String::Unicode(needle_.into_boxed_slice())
249 };
250 Atom {
251 kind,
252 needle,
253 negative: false,
254 ignore_case,
255 normalize,
256 }
257 }
258
259 /// Parse a pattern atom from a string. Some special trailing and leading
260 /// characters can be used to control the atom kind. See [`AtomKind`] for
261 /// details.
262 pub fn parse(raw: &str, case: CaseMatching, normalize: Normalization) -> Atom {
263 let mut atom = raw;
264 let invert = match atom.as_bytes() {
265 [b'!', ..] => {
266 atom = &atom[1..];
267 true
268 }
269 [b'\\', b'!', ..] => {
270 atom = &atom[1..];
271 false
272 }
273 _ => false,
274 };
275
276 let mut kind = match atom.as_bytes() {
277 [b'^', ..] => {
278 atom = &atom[1..];
279 AtomKind::Prefix
280 }
281 [b'\'', ..] => {
282 atom = &atom[1..];
283 AtomKind::Substring
284 }
285 [b'\\', b'^' | b'\'', ..] => {
286 atom = &atom[1..];
287 AtomKind::Fuzzy
288 }
289 _ => AtomKind::Fuzzy,
290 };
291
292 let mut append_dollar = false;
293 match atom.as_bytes() {
294 [.., b'\\', b'$'] => {
295 append_dollar = true;
296 atom = &atom[..atom.len() - 2]
297 }
298 [.., b'$'] => {
299 kind = if kind == AtomKind::Fuzzy {
300 AtomKind::Postfix
301 } else {
302 AtomKind::Exact
303 };
304 atom = &atom[..atom.len() - 1]
305 }
306 _ => (),
307 }
308
309 if invert && kind == AtomKind::Fuzzy {
310 kind = AtomKind::Substring
311 }
312
313 let mut pattern = Atom::new_inner(atom, case, normalize, kind, true, append_dollar);
314 pattern.negative = invert;
315 pattern
316 }
317
318 /// Matches this pattern against `haystack` (using the allocation and configuration
319 /// from `matcher`) and calculates a ranking score. See the [`Matcher`].
320 /// Documentation for more details.
321 ///
322 /// *Note:* The `ignore_case` setting is overwritten to match the casing of
323 /// each pattern atom.
324 pub fn score(&self, haystack: Utf32Str<'_>, matcher: &mut Matcher) -> Option<u16> {
325 matcher.config.ignore_case = self.ignore_case;
326 matcher.config.normalize = self.normalize;
327 let pattern_score = match self.kind {
328 AtomKind::Exact => matcher.exact_match(haystack, self.needle.slice(..)),
329 AtomKind::Fuzzy => matcher.fuzzy_match(haystack, self.needle.slice(..)),
330 AtomKind::Substring => matcher.substring_match(haystack, self.needle.slice(..)),
331 AtomKind::Prefix => matcher.prefix_match(haystack, self.needle.slice(..)),
332 AtomKind::Postfix => matcher.postfix_match(haystack, self.needle.slice(..)),
333 };
334 if self.negative {
335 if pattern_score.is_some() {
336 return None;
337 }
338 Some(0)
339 } else {
340 pattern_score
341 }
342 }
343
344 /// Matches this pattern against `haystack` (using the allocation and
345 /// configuration from `matcher`), calculates a ranking score and the match
346 /// indices. See the [`Matcher`]. Documentation for more
347 /// details.
348 ///
349 /// *Note:* The `ignore_case` setting is overwritten to match the casing of
350 /// each pattern atom.
351 ///
352 /// *Note:* The `indices` vector is not cleared by this function.
353 pub fn indices(
354 &self,
355 haystack: Utf32Str<'_>,
356 matcher: &mut Matcher,
357 indices: &mut Vec<u32>,
358 ) -> Option<u16> {
359 matcher.config.ignore_case = self.ignore_case;
360 matcher.config.normalize = self.normalize;
361 if self.negative {
362 let pattern_score = match self.kind {
363 AtomKind::Exact => matcher.exact_match(haystack, self.needle.slice(..)),
364 AtomKind::Fuzzy => matcher.fuzzy_match(haystack, self.needle.slice(..)),
365 AtomKind::Substring => matcher.substring_match(haystack, self.needle.slice(..)),
366 AtomKind::Prefix => matcher.prefix_match(haystack, self.needle.slice(..)),
367 AtomKind::Postfix => matcher.postfix_match(haystack, self.needle.slice(..)),
368 };
369 pattern_score.is_none().then_some(0)
370 } else {
371 match self.kind {
372 AtomKind::Exact => matcher.exact_indices(haystack, self.needle.slice(..), indices),
373 AtomKind::Fuzzy => matcher.fuzzy_indices(haystack, self.needle.slice(..), indices),
374 AtomKind::Substring => {
375 matcher.substring_indices(haystack, self.needle.slice(..), indices)
376 }
377 AtomKind::Prefix => {
378 matcher.prefix_indices(haystack, self.needle.slice(..), indices)
379 }
380 AtomKind::Postfix => {
381 matcher.postfix_indices(haystack, self.needle.slice(..), indices)
382 }
383 }
384 }
385 }
386
387 /// Returns the needle text that is passed to the matcher. All indices
388 /// produced by the `indices` functions produce char indices used to index
389 /// this text
390 pub fn needle_text(&self) -> Utf32Str<'_> {
391 self.needle.slice(..)
392 }
393 /// Convenience function to easily match (and sort) a (relatively small)
394 /// list of inputs.
395 ///
396 /// *Note* This function is not recommended for building a full fuzzy
397 /// matching application that can match large numbers of matches (like all
398 /// files in a directory) as all matching is done on the current thread,
399 /// effectively blocking the UI. For such applications the high level
400 /// `nucleo` crate can be used instead.
401 pub fn match_list<T: AsRef<str>>(
402 &self,
403 items: impl IntoIterator<Item = T>,
404 matcher: &mut Matcher,
405 ) -> Vec<(T, u16)> {
406 if self.needle.is_empty() {
407 return items.into_iter().map(|item| (item, 0)).collect();
408 }
409 let mut buf = Vec::new();
410 let mut items: Vec<_> = items
411 .into_iter()
412 .filter_map(|item| {
413 self.score(Utf32Str::new(item.as_ref(), &mut buf), matcher)
414 .map(|score| (item, score))
415 })
416 .collect();
417 items.sort_by_key(|(_, score)| Reverse(*score));
418 items
419 }
420}
421
422fn pattern_atoms(pattern: &str) -> impl Iterator<Item = &str> + '_ {
423 let mut saw_backslash = false;
424 pattern.split(move |c| {
425 saw_backslash = match c {
426 c if c.is_whitespace() && !saw_backslash => return true,
427 '\\' => true,
428 _ => false,
429 };
430 false
431 })
432}
433
434#[derive(Debug, Default)]
435/// A text pattern made up of (potentially multiple) [atoms](crate::pattern::Atom).
436#[non_exhaustive]
437pub struct Pattern {
438 /// The individual pattern (words) in this pattern
439 pub atoms: Vec<Atom>,
440}
441
442impl Pattern {
443 /// Creates a pattern where each word is matched individually (whitespaces
444 /// can be escaped with `\`). Otherwise no parsing is performed (so `$`, `!`,
445 /// `'` and `^` don't receive special treatment). If you want to match the entire
446 /// pattern as a single needle use a single [`Atom`] instead.
447 pub fn new(
448 pattern: &str,
449 case_matching: CaseMatching,
450 normalize: Normalization,
451 kind: AtomKind,
452 ) -> Pattern {
453 let atoms = pattern_atoms(pattern)
454 .filter_map(|pat| {
455 let pat = Atom::new(pat, case_matching, normalize, kind, true);
456 (!pat.needle.is_empty()).then_some(pat)
457 })
458 .collect();
459 Pattern { atoms }
460 }
461 /// Creates a pattern where each word is matched individually (whitespaces
462 /// can be escaped with `\`). And `$`, `!`, `'` and `^` at word boundaries will
463 /// cause different matching behaviour (see [`AtomKind`]). These can be
464 /// escaped with backslash.
465 pub fn parse(pattern: &str, case_matching: CaseMatching, normalize: Normalization) -> Pattern {
466 let atoms = pattern_atoms(pattern)
467 .filter_map(|pat| {
468 let pat = Atom::parse(pat, case_matching, normalize);
469 (!pat.needle.is_empty()).then_some(pat)
470 })
471 .collect();
472 Pattern { atoms }
473 }
474
475 /// Convenience function to easily match (and sort) a (relatively small)
476 /// list of inputs.
477 ///
478 /// *Note* This function is not recommended for building a full fuzzy
479 /// matching application that can match large numbers of matches (like all
480 /// files in a directory) as all matching is done on the current thread,
481 /// effectively blocking the UI. For such applications the high level
482 /// `nucleo` crate can be used instead.
483 pub fn match_list<T: AsRef<str>>(
484 &self,
485 items: impl IntoIterator<Item = T>,
486 matcher: &mut Matcher,
487 ) -> Vec<(T, u32)> {
488 if self.atoms.is_empty() {
489 return items.into_iter().map(|item| (item, 0)).collect();
490 }
491 let mut buf = Vec::new();
492 let mut items: Vec<_> = items
493 .into_iter()
494 .filter_map(|item| {
495 self.score(Utf32Str::new(item.as_ref(), &mut buf), matcher)
496 .map(|score| (item, score))
497 })
498 .collect();
499 items.sort_by_key(|(_, score)| Reverse(*score));
500 items
501 }
502
503 /// Matches this pattern against `haystack` (using the allocation and configuration
504 /// from `matcher`) and calculates a ranking score. See the [`Matcher`]
505 /// documentation for more details.
506 ///
507 /// *Note:* The `ignore_case` setting is overwritten to match the casing of
508 /// each pattern atom.
509 pub fn score(&self, haystack: Utf32Str<'_>, matcher: &mut Matcher) -> Option<u32> {
510 if self.atoms.is_empty() {
511 return Some(0);
512 }
513 let mut score = 0;
514 for pattern in &self.atoms {
515 score += pattern.score(haystack, matcher)? as u32;
516 }
517 Some(score)
518 }
519
520 /// Matches this pattern against `haystack` (using the allocation and
521 /// configuration from `matcher`), calculates a ranking score and the match
522 /// indices. See the [`Matcher`] documentation for more
523 /// details.
524 ///
525 /// *Note:* The `ignore_case` setting is overwritten to match the casing of
526 /// each pattern atom.
527 ///
528 /// *Note:* The indices for each pattern are calculated individually
529 /// and simply appended to the `indices` vector and not deduplicated/sorted.
530 /// This allows associating the match indices to their source pattern. If
531 /// required (like for highlighting) unique/sorted indices can be obtained
532 /// as follows:
533 ///
534 /// ```
535 /// # let mut indices: Vec<u32> = Vec::new();
536 /// indices.sort_unstable();
537 /// indices.dedup();
538 /// ```
539 pub fn indices(
540 &self,
541 haystack: Utf32Str<'_>,
542 matcher: &mut Matcher,
543 indices: &mut Vec<u32>,
544 ) -> Option<u32> {
545 if self.atoms.is_empty() {
546 return Some(0);
547 }
548 let mut score = 0;
549 for pattern in &self.atoms {
550 score += pattern.indices(haystack, matcher, indices)? as u32;
551 }
552 Some(score)
553 }
554
555 /// Refreshes this pattern by reparsing it from a string. This is mostly
556 /// equivalent to just constructing a new pattern using [`Pattern::parse`]
557 /// but is slightly more efficient by reusing some allocations
558 pub fn reparse(
559 &mut self,
560 pattern: &str,
561 case_matching: CaseMatching,
562 normalize: Normalization,
563 ) {
564 self.atoms.clear();
565 let atoms = pattern_atoms(pattern).filter_map(|atom| {
566 let atom = Atom::parse(atom, case_matching, normalize);
567 if atom.needle.is_empty() {
568 return None;
569 }
570 Some(atom)
571 });
572 self.atoms.extend(atoms);
573 }
574}
575
576impl Clone for Pattern {
577 fn clone(&self) -> Self {
578 Self {
579 atoms: self.atoms.clone(),
580 }
581 }
582
583 fn clone_from(&mut self, source: &Self) {
584 self.atoms.clone_from(&source.atoms);
585 }
586}