nucleo_matcher/
pattern.rs

1//! This module provides a slightly higher level API for matching strings.
2
3use std::cmp::Reverse;
4
5use crate::{chars, Matcher, Utf32Str};
6
7#[cfg(test)]
8mod tests;
9
10use crate::Utf32String;
11
12#[derive(Clone, Copy, Debug, PartialEq, Eq, Default)]
13#[non_exhaustive]
14/// How to treat a case mismatch between two characters.
15pub enum CaseMatching {
16    /// Characters never match their case folded version (`a != A`).
17    Respect,
18    /// Characters always match their case folded version (`a == A`).
19    #[cfg(feature = "unicode-casefold")]
20    Ignore,
21    /// Acts like [`Ignore`](CaseMatching::Ignore) if all characters in a pattern atom are
22    /// lowercase and like [`Respect`](CaseMatching::Respect) otherwise.
23    #[default]
24    #[cfg(feature = "unicode-casefold")]
25    Smart,
26}
27
28#[derive(Clone, Copy, Debug, PartialEq, Eq, Default)]
29#[non_exhaustive]
30/// How to handle unicode normalization,
31pub enum Normalization {
32    /// Characters never match their normalized version (`a != ä`).
33    Never,
34    /// Acts like [`Never`](Normalization::Never) if any character in a pattern atom
35    /// would need to be normalized. Otherwise normalization occurs (`a == ä` but `ä != a`).
36    #[default]
37    #[cfg(feature = "unicode-normalization")]
38    Smart,
39}
40
41#[derive(Debug, PartialEq, Eq, Clone, Copy)]
42#[non_exhaustive]
43/// The kind of matching algorithm to run for an atom.
44pub enum AtomKind {
45    /// Fuzzy matching where the needle must match any haystack characters
46    /// (match can contain gaps). This atom kind is used by default if no
47    /// special syntax is used. There is no negated fuzzy matching (too
48    /// many false positives).
49    ///
50    /// See also [`Matcher::fuzzy_match`](crate::Matcher::fuzzy_match).
51    Fuzzy,
52    /// The needle must match a contiguous sequence of haystack characters
53    /// without gaps.  This atom kind is parsed from the following syntax:
54    /// `'foo` and `!foo` (negated).
55    ///
56    /// See also [`Matcher::substring_match`](crate::Matcher::substring_match).
57    Substring,
58    /// The needle must match all leading haystack characters without gaps or
59    /// prefix. This atom kind is parsed from the following syntax: `^foo` and
60    /// `!^foo` (negated).
61    ///
62    /// See also [`Matcher::prefix_match`](crate::Matcher::prefix_match).
63    Prefix,
64    /// The needle must match all trailing haystack characters without gaps or
65    /// postfix. This atom kind is parsed from the following syntax: `foo$` and
66    /// `!foo$` (negated).
67    ///
68    /// See also [`Matcher::postfix_match`](crate::Matcher::postfix_match).
69    Postfix,
70    /// The needle must match all haystack characters without gaps or prefix.
71    /// This atom kind is parsed from the following syntax: `^foo$` and `!^foo$`
72    /// (negated).
73    ///
74    /// See also [`Matcher::exact_match`](crate::Matcher::exact_match).
75    Exact,
76}
77
78/// A single pattern component that is matched with a single [`Matcher`] function
79#[derive(Debug, PartialEq, Eq, Clone)]
80pub struct Atom {
81    /// Whether this pattern atom is a negative match.
82    /// A negative pattern atom will prevent haystacks matching it from
83    /// being matchend. It does not contribute to scoring/indices
84    pub negative: bool,
85    /// The kind of match that this pattern performs
86    pub kind: AtomKind,
87    needle: Utf32String,
88    ignore_case: bool,
89    normalize: bool,
90}
91
92impl Atom {
93    /// Creates a single [`Atom`] from a string by performing unicode
94    /// normalization and case folding (if necessary). Optionally `\ ` can
95    /// be escaped to ` `.
96    pub fn new(
97        needle: &str,
98        case: CaseMatching,
99        normalize: Normalization,
100        kind: AtomKind,
101        escape_whitespace: bool,
102    ) -> Atom {
103        Atom::new_inner(needle, case, normalize, kind, escape_whitespace, false)
104    }
105
106    fn new_inner(
107        needle: &str,
108        case: CaseMatching,
109        normalization: Normalization,
110        kind: AtomKind,
111        escape_whitespace: bool,
112        append_dollar: bool,
113    ) -> Atom {
114        let mut ignore_case;
115        let mut normalize;
116        #[cfg(feature = "unicode-normalization")]
117        {
118            normalize = matches!(normalization, Normalization::Smart);
119        }
120        #[cfg(not(feature = "unicode-normalization"))]
121        {
122            normalize = false;
123        }
124        let needle = if needle.is_ascii() {
125            let mut needle = if escape_whitespace {
126                if let Some((start, rem)) = needle.split_once("\\ ") {
127                    let mut needle = start.to_owned();
128                    for rem in rem.split("\\ ") {
129                        needle.push(' ');
130                        needle.push_str(rem);
131                    }
132                    needle
133                } else {
134                    needle.to_owned()
135                }
136            } else {
137                needle.to_owned()
138            };
139
140            match case {
141                #[cfg(feature = "unicode-casefold")]
142                CaseMatching::Ignore => {
143                    ignore_case = true;
144                    needle.make_ascii_lowercase()
145                }
146                #[cfg(feature = "unicode-casefold")]
147                CaseMatching::Smart => {
148                    ignore_case = !needle.bytes().any(|b| b.is_ascii_uppercase())
149                }
150                CaseMatching::Respect => ignore_case = false,
151            }
152            if append_dollar {
153                needle.push('$');
154            }
155            Utf32String::Ascii(needle.into_boxed_str())
156        } else {
157            let mut needle_ = Vec::with_capacity(needle.len());
158            #[cfg(feature = "unicode-casefold")]
159            {
160                ignore_case = matches!(case, CaseMatching::Ignore | CaseMatching::Smart);
161            }
162            #[cfg(not(feature = "unicode-casefold"))]
163            {
164                ignore_case = false;
165            }
166            #[cfg(feature = "unicode-normalization")]
167            {
168                normalize = matches!(normalization, Normalization::Smart);
169            }
170            if escape_whitespace {
171                let mut saw_backslash = false;
172                for mut c in chars::graphemes(needle) {
173                    if saw_backslash {
174                        if c == ' ' {
175                            needle_.push(' ');
176                            saw_backslash = false;
177                            continue;
178                        } else {
179                            needle_.push('\\');
180                        }
181                    }
182                    saw_backslash = c == '\\';
183                    match case {
184                        #[cfg(feature = "unicode-casefold")]
185                        CaseMatching::Ignore => c = chars::to_lower_case(c),
186                        #[cfg(feature = "unicode-casefold")]
187                        CaseMatching::Smart => {
188                            ignore_case = ignore_case && !chars::is_upper_case(c)
189                        }
190                        CaseMatching::Respect => (),
191                    }
192                    match normalization {
193                        #[cfg(feature = "unicode-normalization")]
194                        Normalization::Smart => {
195                            normalize = normalize && chars::normalize(c) == c;
196                        }
197                        Normalization::Never => (),
198                    }
199                    needle_.push(c);
200                }
201            } else {
202                let chars = chars::graphemes(needle).map(|mut c| {
203                    match case {
204                        #[cfg(feature = "unicode-casefold")]
205                        CaseMatching::Ignore => c = chars::to_lower_case(c),
206                        #[cfg(feature = "unicode-casefold")]
207                        CaseMatching::Smart => {
208                            ignore_case = ignore_case && !chars::is_upper_case(c);
209                        }
210                        CaseMatching::Respect => (),
211                    }
212                    match normalization {
213                        #[cfg(feature = "unicode-normalization")]
214                        Normalization::Smart => {
215                            normalize = normalize && chars::normalize(c) == c;
216                        }
217                        Normalization::Never => (),
218                    }
219                    c
220                });
221                needle_.extend(chars);
222            };
223            if append_dollar {
224                needle_.push('$');
225            }
226            Utf32String::Unicode(needle_.into_boxed_slice())
227        };
228        Atom {
229            kind,
230            needle,
231            negative: false,
232            ignore_case,
233            normalize,
234        }
235    }
236
237    /// Parse a pattern atom from a string. Some special trailing and leading
238    /// characters can be used to control the atom kind. See [`AtomKind`] for
239    /// details.
240    pub fn parse(raw: &str, case: CaseMatching, normalize: Normalization) -> Atom {
241        let mut atom = raw;
242        let invert = match atom.as_bytes() {
243            [b'!', ..] => {
244                atom = &atom[1..];
245                true
246            }
247            [b'\\', b'!', ..] => {
248                atom = &atom[1..];
249                false
250            }
251            _ => false,
252        };
253
254        let mut kind = match atom.as_bytes() {
255            [b'^', ..] => {
256                atom = &atom[1..];
257                AtomKind::Prefix
258            }
259            [b'\'', ..] => {
260                atom = &atom[1..];
261                AtomKind::Substring
262            }
263            [b'\\', b'^' | b'\'', ..] => {
264                atom = &atom[1..];
265                AtomKind::Fuzzy
266            }
267            _ => AtomKind::Fuzzy,
268        };
269
270        let mut append_dollar = false;
271        match atom.as_bytes() {
272            [.., b'\\', b'$'] => {
273                append_dollar = true;
274                atom = &atom[..atom.len() - 2]
275            }
276            [.., b'$'] => {
277                kind = if kind == AtomKind::Fuzzy {
278                    AtomKind::Postfix
279                } else {
280                    AtomKind::Exact
281                };
282                atom = &atom[..atom.len() - 1]
283            }
284            _ => (),
285        }
286
287        if invert && kind == AtomKind::Fuzzy {
288            kind = AtomKind::Substring
289        }
290
291        let mut pattern = Atom::new_inner(atom, case, normalize, kind, true, append_dollar);
292        pattern.negative = invert;
293        pattern
294    }
295
296    /// Matches this pattern against `haystack` (using the allocation and configuration
297    /// from `matcher`) and calculates a ranking score. See the [`Matcher`].
298    /// Documentation for more details.
299    ///
300    /// *Note:*  The `ignore_case` setting is overwritten to match the casing of
301    /// each pattern atom.
302    pub fn score(&self, haystack: Utf32Str<'_>, matcher: &mut Matcher) -> Option<u16> {
303        matcher.config.ignore_case = self.ignore_case;
304        matcher.config.normalize = self.normalize;
305        let pattern_score = match self.kind {
306            AtomKind::Exact => matcher.exact_match(haystack, self.needle.slice(..)),
307            AtomKind::Fuzzy => matcher.fuzzy_match(haystack, self.needle.slice(..)),
308            AtomKind::Substring => matcher.substring_match(haystack, self.needle.slice(..)),
309            AtomKind::Prefix => matcher.prefix_match(haystack, self.needle.slice(..)),
310            AtomKind::Postfix => matcher.postfix_match(haystack, self.needle.slice(..)),
311        };
312        if self.negative {
313            if pattern_score.is_some() {
314                return None;
315            }
316            Some(0)
317        } else {
318            pattern_score
319        }
320    }
321
322    /// Matches this pattern against `haystack` (using the allocation and
323    /// configuration from `matcher`), calculates a ranking score and the match
324    /// indices. See the [`Matcher`]. Documentation for more
325    /// details.
326    ///
327    /// *Note:*  The `ignore_case` setting is overwritten to match the casing of
328    /// each pattern atom.
329    ///
330    /// *Note:*  The `indices` vector is not cleared by this function.
331    pub fn indices(
332        &self,
333        haystack: Utf32Str<'_>,
334        matcher: &mut Matcher,
335        indices: &mut Vec<u32>,
336    ) -> Option<u16> {
337        matcher.config.ignore_case = self.ignore_case;
338        matcher.config.normalize = self.normalize;
339        if self.negative {
340            let pattern_score = match self.kind {
341                AtomKind::Exact => matcher.exact_match(haystack, self.needle.slice(..)),
342                AtomKind::Fuzzy => matcher.fuzzy_match(haystack, self.needle.slice(..)),
343                AtomKind::Substring => matcher.substring_match(haystack, self.needle.slice(..)),
344                AtomKind::Prefix => matcher.prefix_match(haystack, self.needle.slice(..)),
345                AtomKind::Postfix => matcher.postfix_match(haystack, self.needle.slice(..)),
346            };
347            pattern_score.is_none().then_some(0)
348        } else {
349            match self.kind {
350                AtomKind::Exact => matcher.exact_indices(haystack, self.needle.slice(..), indices),
351                AtomKind::Fuzzy => matcher.fuzzy_indices(haystack, self.needle.slice(..), indices),
352                AtomKind::Substring => {
353                    matcher.substring_indices(haystack, self.needle.slice(..), indices)
354                }
355                AtomKind::Prefix => {
356                    matcher.prefix_indices(haystack, self.needle.slice(..), indices)
357                }
358                AtomKind::Postfix => {
359                    matcher.postfix_indices(haystack, self.needle.slice(..), indices)
360                }
361            }
362        }
363    }
364
365    /// Returns the needle text that is passed to the matcher. All indices
366    /// produced by the `indices` functions produce char indices used to index
367    /// this text
368    pub fn needle_text(&self) -> Utf32Str<'_> {
369        self.needle.slice(..)
370    }
371    /// Convenience function to easily match (and sort) a (relatively small)
372    /// list of inputs.
373    ///
374    /// *Note* This function is not recommended for building a full fuzzy
375    /// matching application that can match large numbers of matches (like all
376    /// files in a directory) as all matching is done on the current thread,
377    /// effectively blocking the UI. For such applications the high level
378    /// `nucleo` crate can be used instead.
379    pub fn match_list<T: AsRef<str>>(
380        &self,
381        items: impl IntoIterator<Item = T>,
382        matcher: &mut Matcher,
383    ) -> Vec<(T, u16)> {
384        if self.needle.is_empty() {
385            return items.into_iter().map(|item| (item, 0)).collect();
386        }
387        let mut buf = Vec::new();
388        let mut items: Vec<_> = items
389            .into_iter()
390            .filter_map(|item| {
391                self.score(Utf32Str::new(item.as_ref(), &mut buf), matcher)
392                    .map(|score| (item, score))
393            })
394            .collect();
395        items.sort_by_key(|(_, score)| Reverse(*score));
396        items
397    }
398}
399
400fn pattern_atoms(pattern: &str) -> impl Iterator<Item = &str> + '_ {
401    let mut saw_backslash = false;
402    pattern.split(move |c| {
403        saw_backslash = match c {
404            ' ' if !saw_backslash => return true,
405            '\\' => true,
406            _ => false,
407        };
408        false
409    })
410}
411
412#[derive(Debug, Default)]
413/// A text pattern made up of (potentially multiple) [atoms](crate::pattern::Atom).
414#[non_exhaustive]
415pub struct Pattern {
416    /// The individual pattern (words) in this pattern
417    pub atoms: Vec<Atom>,
418}
419
420impl Pattern {
421    /// Creates a pattern where each word is matched individually (whitespaces
422    /// can be escaped with `\`). Otherwise no parsing is performed (so $, !, '
423    /// and ^ don't receive special treatment). If you want to match the entire
424    /// pattern as a single needle use a single [`Atom`] instead.
425    pub fn new(
426        pattern: &str,
427        case_matching: CaseMatching,
428        normalize: Normalization,
429        kind: AtomKind,
430    ) -> Pattern {
431        let atoms = pattern_atoms(pattern)
432            .filter_map(|pat| {
433                let pat = Atom::new(pat, case_matching, normalize, kind, true);
434                (!pat.needle.is_empty()).then_some(pat)
435            })
436            .collect();
437        Pattern { atoms }
438    }
439    /// Creates a pattern where each word is matched individually (whitespaces
440    /// can be escaped with `\`). And $, !, ' and ^ at word boundaries will
441    /// cause different matching behaviour (see [`AtomKind`]). These can be
442    /// escaped with backslash.
443    pub fn parse(pattern: &str, case_matching: CaseMatching, normalize: Normalization) -> Pattern {
444        let atoms = pattern_atoms(pattern)
445            .filter_map(|pat| {
446                let pat = Atom::parse(pat, case_matching, normalize);
447                (!pat.needle.is_empty()).then_some(pat)
448            })
449            .collect();
450        Pattern { atoms }
451    }
452
453    /// Convenience function to easily match (and sort) a (relatively small)
454    /// list of inputs.
455    ///
456    /// *Note* This function is not recommended for building a full fuzzy
457    /// matching application that can match large numbers of matches (like all
458    /// files in a directory) as all matching is done on the current thread,
459    /// effectively blocking the UI. For such applications the high level
460    /// `nucleo` crate can be used instead.
461    pub fn match_list<T: AsRef<str>>(
462        &self,
463        items: impl IntoIterator<Item = T>,
464        matcher: &mut Matcher,
465    ) -> Vec<(T, u32)> {
466        if self.atoms.is_empty() {
467            return items.into_iter().map(|item| (item, 0)).collect();
468        }
469        let mut buf = Vec::new();
470        let mut items: Vec<_> = items
471            .into_iter()
472            .filter_map(|item| {
473                self.score(Utf32Str::new(item.as_ref(), &mut buf), matcher)
474                    .map(|score| (item, score))
475            })
476            .collect();
477        items.sort_by_key(|(_, score)| Reverse(*score));
478        items
479    }
480
481    /// Matches this pattern against `haystack` (using the allocation and configuration
482    /// from `matcher`) and calculates a ranking score. See the [`Matcher`].
483    /// Documentation for more details.
484    ///
485    /// *Note:*  The `ignore_case` setting is overwritten to match the casing of
486    /// each pattern atom.
487    pub fn score(&self, haystack: Utf32Str<'_>, matcher: &mut Matcher) -> Option<u32> {
488        if self.atoms.is_empty() {
489            return Some(0);
490        }
491        let mut score = 0;
492        for pattern in &self.atoms {
493            score += pattern.score(haystack, matcher)? as u32;
494        }
495        Some(score)
496    }
497
498    /// Matches this pattern against `haystack` (using the allocation and
499    /// configuration from `matcher`), calculates a ranking score and the match
500    /// indices. See the [`Matcher`]. Documentation for more
501    /// details.
502    ///
503    /// *Note:*  The `ignore_case` setting is overwritten to match the casing of
504    /// each pattern atom.
505    ///
506    /// *Note:*  The indices for each pattern are calculated individually
507    /// and simply appended to the `indices` vector and not deduplicated/sorted.
508    /// This allows associating the match indices to their source pattern. If
509    /// required (like for highlighting) unique/sorted indices can be obtained
510    /// as follows:
511    ///
512    /// ```
513    /// # let mut indices: Vec<u32> = Vec::new();
514    /// indices.sort_unstable();
515    /// indices.dedup();
516    /// ```
517    pub fn indices(
518        &self,
519        haystack: Utf32Str<'_>,
520        matcher: &mut Matcher,
521        indices: &mut Vec<u32>,
522    ) -> Option<u32> {
523        if self.atoms.is_empty() {
524            return Some(0);
525        }
526        let mut score = 0;
527        for pattern in &self.atoms {
528            score += pattern.indices(haystack, matcher, indices)? as u32;
529        }
530        Some(score)
531    }
532
533    /// Refreshes this pattern by reparsing it from a string. This is mostly
534    /// equivalent to just constructing a new pattern using [`Pattern::parse`]
535    /// but is slightly more efficient by reusing some allocations
536    pub fn reparse(
537        &mut self,
538        pattern: &str,
539        case_matching: CaseMatching,
540        normalize: Normalization,
541    ) {
542        self.atoms.clear();
543        let atoms = pattern_atoms(pattern).filter_map(|atom| {
544            let atom = Atom::parse(atom, case_matching, normalize);
545            if atom.needle.is_empty() {
546                return None;
547            }
548            Some(atom)
549        });
550        self.atoms.extend(atoms);
551    }
552}
553
554impl Clone for Pattern {
555    fn clone(&self) -> Self {
556        Self {
557            atoms: self.atoms.clone(),
558        }
559    }
560
561    fn clone_from(&mut self, source: &Self) {
562        self.atoms.clone_from(&source.atoms);
563    }
564}