ncp_matcher/
pattern.rs

1//! This module provides a slightly higher level API for matching strings.
2
3use std::cmp::Reverse;
4
5use crate::{Matcher, Utf32Str, chars};
6
7#[cfg(test)]
8mod tests;
9
10use crate::Utf32String;
11
12#[derive(Clone, Copy, Debug, PartialEq, Eq, Default)]
13#[non_exhaustive]
14/// How to treat a case mismatch between two characters.
15pub enum CaseMatching {
16    /// Characters never match their case folded version (`a != A`).
17    #[cfg_attr(not(feature = "unicode-casefold"), default)]
18    Respect,
19    /// Characters always match their case folded version (`a == A`).
20    #[cfg(feature = "unicode-casefold")]
21    Ignore,
22    /// Acts like [`Ignore`](CaseMatching::Ignore) if all characters in a pattern atom are
23    /// lowercase and like [`Respect`](CaseMatching::Respect) otherwise.
24    #[default]
25    #[cfg(feature = "unicode-casefold")]
26    Smart,
27}
28
29#[derive(Clone, Copy, Debug, PartialEq, Eq, Default)]
30#[non_exhaustive]
31/// How to handle unicode normalization.
32pub enum Normalization {
33    /// Characters never match their normalized version (`a != ä`).
34    #[cfg_attr(not(feature = "unicode-normalization"), default)]
35    Never,
36    /// Acts like [`Never`](Normalization::Never) if any character in a pattern atom
37    /// would need to be normalized. Otherwise normalization occurs (`a == ä` but `ä != a`).
38    #[default]
39    #[cfg(feature = "unicode-normalization")]
40    Smart,
41}
42
43#[derive(Debug, PartialEq, Eq, Clone, Copy)]
44#[non_exhaustive]
45/// The kind of matching algorithm to run for an atom.
46pub enum AtomKind {
47    /// Fuzzy matching where the needle must match any haystack characters
48    /// (match can contain gaps). This atom kind is used by default if no
49    /// special syntax is used. There is no negated fuzzy matching (too
50    /// many false positives).
51    ///
52    /// See also [`Matcher::fuzzy_match`](crate::Matcher::fuzzy_match).
53    Fuzzy,
54    /// The needle must match a contiguous sequence of haystack characters
55    /// without gaps.  This atom kind is parsed from the following syntax:
56    /// `'foo` and `!foo` (negated).
57    ///
58    /// See also [`Matcher::substring_match`](crate::Matcher::substring_match).
59    Substring,
60    /// The needle must match all leading haystack characters without gaps or
61    /// prefix. This atom kind is parsed from the following syntax: `^foo` and
62    /// `!^foo` (negated).
63    ///
64    /// See also [`Matcher::prefix_match`](crate::Matcher::prefix_match).
65    Prefix,
66    /// The needle must match all trailing haystack characters without gaps or
67    /// postfix. This atom kind is parsed from the following syntax: `foo$` and
68    /// `!foo$` (negated).
69    ///
70    /// See also [`Matcher::postfix_match`](crate::Matcher::postfix_match).
71    Postfix,
72    /// The needle must match all haystack characters without gaps or prefix.
73    /// This atom kind is parsed from the following syntax: `^foo$` and `!^foo$`
74    /// (negated).
75    ///
76    /// See also [`Matcher::exact_match`](crate::Matcher::exact_match).
77    Exact,
78}
79
80/// A single pattern component that is matched with a single [`Matcher`] function.
81#[derive(Debug, PartialEq, Eq, Clone)]
82pub struct Atom {
83    /// Whether this pattern atom is a negative match.
84    /// A negative pattern atom will prevent haystacks matching it from
85    /// being matchend. It does not contribute to scoring/indices
86    pub negative: bool,
87    /// The kind of match that this pattern performs
88    pub kind: AtomKind,
89    needle: Utf32String,
90    ignore_case: bool,
91    normalize: bool,
92}
93
94impl Atom {
95    /// Creates a single [`Atom`] from a string by performing unicode
96    /// normalization and case folding (if necessary). Optionally `\ ` can
97    /// be escaped to ` `.
98    pub fn new(
99        needle: &str,
100        case: CaseMatching,
101        normalize: Normalization,
102        kind: AtomKind,
103        escape_whitespace: bool,
104    ) -> Atom {
105        Atom::new_inner(needle, case, normalize, kind, escape_whitespace, false)
106    }
107
108    fn new_inner(
109        needle: &str,
110        case: CaseMatching,
111        normalization: Normalization,
112        kind: AtomKind,
113        escape_whitespace: bool,
114        append_dollar: bool,
115    ) -> Atom {
116        let mut ignore_case;
117        let mut normalize;
118        #[cfg(feature = "unicode-normalization")]
119        {
120            normalize = matches!(normalization, Normalization::Smart);
121        }
122        #[cfg(not(feature = "unicode-normalization"))]
123        {
124            normalize = false;
125        }
126        let needle = if needle.is_ascii() {
127            let mut needle_string = if escape_whitespace {
128                let mut needle_bytes = Vec::with_capacity(needle.len());
129                let mut saw_backslash = false;
130                for c in needle.bytes() {
131                    if saw_backslash {
132                        if c.is_ascii_whitespace() {
133                            needle_bytes.push(c);
134                            saw_backslash = false;
135                            continue;
136                        } else {
137                            needle_bytes.push(b'\\');
138                        }
139                    }
140                    saw_backslash = c == b'\\';
141                    if !saw_backslash {
142                        needle_bytes.push(c);
143                    }
144                }
145                // push the potentially trailing backslash
146                if saw_backslash {
147                    needle_bytes.push(b'\\');
148                }
149                // SAFETY: we just checked that needle is ascii, so each `c` is a valid ASCII byte
150                unsafe { String::from_utf8_unchecked(needle_bytes) }
151            } else {
152                needle.to_owned()
153            };
154
155            match case {
156                #[cfg(feature = "unicode-casefold")]
157                CaseMatching::Ignore => {
158                    ignore_case = true;
159                    needle_string.make_ascii_lowercase()
160                }
161                #[cfg(feature = "unicode-casefold")]
162                CaseMatching::Smart => {
163                    ignore_case = !needle_string.bytes().any(|b| b.is_ascii_uppercase())
164                }
165                CaseMatching::Respect => ignore_case = false,
166            }
167
168            if append_dollar {
169                needle_string.push('$');
170            }
171            Utf32String::Ascii(needle_string.into_boxed_str())
172        } else {
173            let mut needle_ = Vec::with_capacity(needle.len());
174            #[cfg(feature = "unicode-casefold")]
175            {
176                ignore_case = matches!(case, CaseMatching::Ignore | CaseMatching::Smart);
177            }
178            #[cfg(not(feature = "unicode-casefold"))]
179            {
180                ignore_case = false;
181            }
182            #[cfg(feature = "unicode-normalization")]
183            {
184                normalize = matches!(normalization, Normalization::Smart);
185            }
186            if escape_whitespace {
187                let mut saw_backslash = false;
188                for mut c in chars::graphemes(needle) {
189                    if saw_backslash {
190                        if c.is_whitespace() {
191                            needle_.push(c);
192                            saw_backslash = false;
193                            continue;
194                        } else {
195                            needle_.push('\\');
196                        }
197                    }
198                    saw_backslash = c == '\\';
199                    if !saw_backslash {
200                        match case {
201                            #[cfg(feature = "unicode-casefold")]
202                            CaseMatching::Ignore => c = chars::to_lower_case(c),
203                            #[cfg(feature = "unicode-casefold")]
204                            CaseMatching::Smart => {
205                                ignore_case = ignore_case && !chars::is_upper_case(c)
206                            }
207                            CaseMatching::Respect => (),
208                        }
209                        match normalization {
210                            #[cfg(feature = "unicode-normalization")]
211                            Normalization::Smart => {
212                                normalize = normalize && chars::normalize(c) == c;
213                            }
214                            Normalization::Never => (),
215                        }
216                        needle_.push(c);
217                    }
218                }
219                // push the potentially trailing backslash
220                if saw_backslash {
221                    needle_.push('\\');
222                }
223            } else {
224                let chars = chars::graphemes(needle).map(|mut c| {
225                    match case {
226                        #[cfg(feature = "unicode-casefold")]
227                        CaseMatching::Ignore => c = chars::to_lower_case(c),
228                        #[cfg(feature = "unicode-casefold")]
229                        CaseMatching::Smart => {
230                            ignore_case = ignore_case && !chars::is_upper_case(c);
231                        }
232                        CaseMatching::Respect => (),
233                    }
234                    match normalization {
235                        #[cfg(feature = "unicode-normalization")]
236                        Normalization::Smart => {
237                            normalize = normalize && chars::normalize(c) == c;
238                        }
239                        Normalization::Never => (),
240                    }
241                    c
242                });
243                needle_.extend(chars);
244            };
245            if append_dollar {
246                needle_.push('$');
247            }
248            Utf32String::Unicode(needle_.into_boxed_slice())
249        };
250        Atom {
251            kind,
252            needle,
253            negative: false,
254            ignore_case,
255            normalize,
256        }
257    }
258
259    /// Parse a pattern atom from a string. Some special trailing and leading
260    /// characters can be used to control the atom kind. See [`AtomKind`] for
261    /// details.
262    pub fn parse(raw: &str, case: CaseMatching, normalize: Normalization) -> Atom {
263        let mut atom = raw;
264        let invert = match atom.as_bytes() {
265            [b'!', ..] => {
266                atom = &atom[1..];
267                true
268            }
269            [b'\\', b'!', ..] => {
270                atom = &atom[1..];
271                false
272            }
273            _ => false,
274        };
275
276        let mut kind = match atom.as_bytes() {
277            [b'^', ..] => {
278                atom = &atom[1..];
279                AtomKind::Prefix
280            }
281            [b'\'', ..] => {
282                atom = &atom[1..];
283                AtomKind::Substring
284            }
285            [b'\\', b'^' | b'\'', ..] => {
286                atom = &atom[1..];
287                AtomKind::Fuzzy
288            }
289            _ => AtomKind::Fuzzy,
290        };
291
292        let mut append_dollar = false;
293        match atom.as_bytes() {
294            [.., b'\\', b'$'] => {
295                append_dollar = true;
296                atom = &atom[..atom.len() - 2]
297            }
298            [.., b'$'] => {
299                kind = if kind == AtomKind::Fuzzy {
300                    AtomKind::Postfix
301                } else {
302                    AtomKind::Exact
303                };
304                atom = &atom[..atom.len() - 1]
305            }
306            _ => (),
307        }
308
309        if invert && kind == AtomKind::Fuzzy {
310            kind = AtomKind::Substring
311        }
312
313        let mut pattern = Atom::new_inner(atom, case, normalize, kind, true, append_dollar);
314        pattern.negative = invert;
315        pattern
316    }
317
318    /// Matches this pattern against `haystack` (using the allocation and configuration
319    /// from `matcher`) and calculates a ranking score. See the [`Matcher`].
320    /// Documentation for more details.
321    ///
322    /// *Note:*  The `ignore_case` setting is overwritten to match the casing of
323    /// each pattern atom.
324    pub fn score(&self, haystack: Utf32Str<'_>, matcher: &mut Matcher) -> Option<u16> {
325        matcher.config.ignore_case = self.ignore_case;
326        matcher.config.normalize = self.normalize;
327        let pattern_score = match self.kind {
328            AtomKind::Exact => matcher.exact_match(haystack, self.needle.slice(..)),
329            AtomKind::Fuzzy => matcher.fuzzy_match(haystack, self.needle.slice(..)),
330            AtomKind::Substring => matcher.substring_match(haystack, self.needle.slice(..)),
331            AtomKind::Prefix => matcher.prefix_match(haystack, self.needle.slice(..)),
332            AtomKind::Postfix => matcher.postfix_match(haystack, self.needle.slice(..)),
333        };
334        if self.negative {
335            if pattern_score.is_some() {
336                return None;
337            }
338            Some(0)
339        } else {
340            pattern_score
341        }
342    }
343
344    /// Matches this pattern against `haystack` (using the allocation and
345    /// configuration from `matcher`), calculates a ranking score and the match
346    /// indices. See the [`Matcher`]. Documentation for more
347    /// details.
348    ///
349    /// *Note:*  The `ignore_case` setting is overwritten to match the casing of
350    /// each pattern atom.
351    ///
352    /// *Note:*  The `indices` vector is not cleared by this function.
353    pub fn indices(
354        &self,
355        haystack: Utf32Str<'_>,
356        matcher: &mut Matcher,
357        indices: &mut Vec<u32>,
358    ) -> Option<u16> {
359        matcher.config.ignore_case = self.ignore_case;
360        matcher.config.normalize = self.normalize;
361        if self.negative {
362            let pattern_score = match self.kind {
363                AtomKind::Exact => matcher.exact_match(haystack, self.needle.slice(..)),
364                AtomKind::Fuzzy => matcher.fuzzy_match(haystack, self.needle.slice(..)),
365                AtomKind::Substring => matcher.substring_match(haystack, self.needle.slice(..)),
366                AtomKind::Prefix => matcher.prefix_match(haystack, self.needle.slice(..)),
367                AtomKind::Postfix => matcher.postfix_match(haystack, self.needle.slice(..)),
368            };
369            pattern_score.is_none().then_some(0)
370        } else {
371            match self.kind {
372                AtomKind::Exact => matcher.exact_indices(haystack, self.needle.slice(..), indices),
373                AtomKind::Fuzzy => matcher.fuzzy_indices(haystack, self.needle.slice(..), indices),
374                AtomKind::Substring => {
375                    matcher.substring_indices(haystack, self.needle.slice(..), indices)
376                }
377                AtomKind::Prefix => {
378                    matcher.prefix_indices(haystack, self.needle.slice(..), indices)
379                }
380                AtomKind::Postfix => {
381                    matcher.postfix_indices(haystack, self.needle.slice(..), indices)
382                }
383            }
384        }
385    }
386
387    /// Returns the needle text that is passed to the matcher. All indices
388    /// produced by the `indices` functions produce char indices used to index
389    /// this text
390    pub fn needle_text(&self) -> Utf32Str<'_> {
391        self.needle.slice(..)
392    }
393    /// Convenience function to easily match (and sort) a (relatively small)
394    /// list of inputs.
395    ///
396    /// *Note* This function is not recommended for building a full fuzzy
397    /// matching application that can match large numbers of matches (like all
398    /// files in a directory) as all matching is done on the current thread,
399    /// effectively blocking the UI. For such applications the high level
400    /// `nucleo` crate can be used instead.
401    pub fn match_list<T: AsRef<str>>(
402        &self,
403        items: impl IntoIterator<Item = T>,
404        matcher: &mut Matcher,
405    ) -> Vec<(T, u16)> {
406        if self.needle.is_empty() {
407            return items.into_iter().map(|item| (item, 0)).collect();
408        }
409        let mut buf = Vec::new();
410        let mut items: Vec<_> = items
411            .into_iter()
412            .filter_map(|item| {
413                self.score(Utf32Str::new(item.as_ref(), &mut buf), matcher)
414                    .map(|score| (item, score))
415            })
416            .collect();
417        items.sort_by_key(|(_, score)| Reverse(*score));
418        items
419    }
420}
421
422fn pattern_atoms(pattern: &str) -> impl Iterator<Item = &str> + '_ {
423    let mut saw_backslash = false;
424    pattern.split(move |c| {
425        saw_backslash = match c {
426            c if c.is_whitespace() && !saw_backslash => return true,
427            '\\' => true,
428            _ => false,
429        };
430        false
431    })
432}
433
434#[derive(Debug, Default)]
435/// A text pattern made up of (potentially multiple) [atoms](crate::pattern::Atom).
436#[non_exhaustive]
437pub struct Pattern {
438    /// The individual pattern (words) in this pattern
439    pub atoms: Vec<Atom>,
440}
441
442impl Pattern {
443    /// Creates a pattern where each word is matched individually (whitespaces
444    /// can be escaped with `\`). Otherwise no parsing is performed (so `$`, `!`,
445    /// `'` and `^` don't receive special treatment). If you want to match the entire
446    /// pattern as a single needle use a single [`Atom`] instead.
447    pub fn new(
448        pattern: &str,
449        case_matching: CaseMatching,
450        normalize: Normalization,
451        kind: AtomKind,
452    ) -> Pattern {
453        let atoms = pattern_atoms(pattern)
454            .filter_map(|pat| {
455                let pat = Atom::new(pat, case_matching, normalize, kind, true);
456                (!pat.needle.is_empty()).then_some(pat)
457            })
458            .collect();
459        Pattern { atoms }
460    }
461    /// Creates a pattern where each word is matched individually (whitespaces
462    /// can be escaped with `\`). And `$`, `!`, `'` and `^` at word boundaries will
463    /// cause different matching behaviour (see [`AtomKind`]). These can be
464    /// escaped with backslash.
465    pub fn parse(pattern: &str, case_matching: CaseMatching, normalize: Normalization) -> Pattern {
466        let atoms = pattern_atoms(pattern)
467            .filter_map(|pat| {
468                let pat = Atom::parse(pat, case_matching, normalize);
469                (!pat.needle.is_empty()).then_some(pat)
470            })
471            .collect();
472        Pattern { atoms }
473    }
474
475    /// Convenience function to easily match (and sort) a (relatively small)
476    /// list of inputs.
477    ///
478    /// *Note* This function is not recommended for building a full fuzzy
479    /// matching application that can match large numbers of matches (like all
480    /// files in a directory) as all matching is done on the current thread,
481    /// effectively blocking the UI. For such applications the high level
482    /// `nucleo` crate can be used instead.
483    pub fn match_list<T: AsRef<str>>(
484        &self,
485        items: impl IntoIterator<Item = T>,
486        matcher: &mut Matcher,
487    ) -> Vec<(T, u32)> {
488        if self.atoms.is_empty() {
489            return items.into_iter().map(|item| (item, 0)).collect();
490        }
491        let mut buf = Vec::new();
492        let mut items: Vec<_> = items
493            .into_iter()
494            .filter_map(|item| {
495                self.score(Utf32Str::new(item.as_ref(), &mut buf), matcher)
496                    .map(|score| (item, score))
497            })
498            .collect();
499        items.sort_by_key(|(_, score)| Reverse(*score));
500        items
501    }
502
503    /// Matches this pattern against `haystack` (using the allocation and configuration
504    /// from `matcher`) and calculates a ranking score. See the [`Matcher`]
505    /// documentation for more details.
506    ///
507    /// *Note:*  The `ignore_case` setting is overwritten to match the casing of
508    /// each pattern atom.
509    pub fn score(&self, haystack: Utf32Str<'_>, matcher: &mut Matcher) -> Option<u32> {
510        if self.atoms.is_empty() {
511            return Some(0);
512        }
513        let mut score = 0;
514        for pattern in &self.atoms {
515            score += pattern.score(haystack, matcher)? as u32;
516        }
517        Some(score)
518    }
519
520    /// Matches this pattern against `haystack` (using the allocation and
521    /// configuration from `matcher`), calculates a ranking score and the match
522    /// indices. See the [`Matcher`] documentation for more
523    /// details.
524    ///
525    /// *Note:*  The `ignore_case` setting is overwritten to match the casing of
526    /// each pattern atom.
527    ///
528    /// *Note:*  The indices for each pattern are calculated individually
529    /// and simply appended to the `indices` vector and not deduplicated/sorted.
530    /// This allows associating the match indices to their source pattern. If
531    /// required (like for highlighting) unique/sorted indices can be obtained
532    /// as follows:
533    ///
534    /// ```
535    /// # let mut indices: Vec<u32> = Vec::new();
536    /// indices.sort_unstable();
537    /// indices.dedup();
538    /// ```
539    pub fn indices(
540        &self,
541        haystack: Utf32Str<'_>,
542        matcher: &mut Matcher,
543        indices: &mut Vec<u32>,
544    ) -> Option<u32> {
545        if self.atoms.is_empty() {
546            return Some(0);
547        }
548        let mut score = 0;
549        for pattern in &self.atoms {
550            score += pattern.indices(haystack, matcher, indices)? as u32;
551        }
552        Some(score)
553    }
554
555    /// Refreshes this pattern by reparsing it from a string. This is mostly
556    /// equivalent to just constructing a new pattern using [`Pattern::parse`]
557    /// but is slightly more efficient by reusing some allocations
558    pub fn reparse(
559        &mut self,
560        pattern: &str,
561        case_matching: CaseMatching,
562        normalize: Normalization,
563    ) {
564        self.atoms.clear();
565        let atoms = pattern_atoms(pattern).filter_map(|atom| {
566            let atom = Atom::parse(atom, case_matching, normalize);
567            if atom.needle.is_empty() {
568                return None;
569            }
570            Some(atom)
571        });
572        self.atoms.extend(atoms);
573    }
574}
575
576impl Clone for Pattern {
577    fn clone(&self) -> Self {
578        Self {
579            atoms: self.atoms.clone(),
580        }
581    }
582
583    fn clone_from(&mut self, source: &Self) {
584        self.atoms.clone_from(&source.atoms);
585    }
586}