grep_pcre2/
matcher.rs

1use std::collections::HashMap;
2
3use {
4    grep_matcher::{Captures, Match, Matcher},
5    pcre2::bytes::{CaptureLocations, Regex, RegexBuilder},
6};
7
8use crate::error::Error;
9
10/// A builder for configuring the compilation of a PCRE2 regex.
11#[derive(Clone, Debug)]
12pub struct RegexMatcherBuilder {
13    builder: RegexBuilder,
14    case_smart: bool,
15    word: bool,
16    fixed_strings: bool,
17    whole_line: bool,
18}
19
20impl RegexMatcherBuilder {
21    /// Create a new matcher builder with a default configuration.
22    pub fn new() -> RegexMatcherBuilder {
23        RegexMatcherBuilder {
24            builder: RegexBuilder::new(),
25            case_smart: false,
26            word: false,
27            fixed_strings: false,
28            whole_line: false,
29        }
30    }
31
32    /// Compile the given pattern into a PCRE matcher using the current
33    /// configuration.
34    ///
35    /// If there was a problem compiling the pattern, then an error is
36    /// returned.
37    pub fn build(&self, pattern: &str) -> Result<RegexMatcher, Error> {
38        self.build_many(&[pattern])
39    }
40
41    /// Compile all of the given patterns into a single regex that matches when
42    /// at least one of the patterns matches.
43    ///
44    /// If there was a problem building the regex, then an error is returned.
45    pub fn build_many<P: AsRef<str>>(
46        &self,
47        patterns: &[P],
48    ) -> Result<RegexMatcher, Error> {
49        let mut builder = self.builder.clone();
50        let mut pats = Vec::with_capacity(patterns.len());
51        for p in patterns.iter() {
52            pats.push(if self.fixed_strings {
53                format!("(?:{})", pcre2::escape(p.as_ref()))
54            } else {
55                format!("(?:{})", p.as_ref())
56            });
57        }
58        let mut singlepat = if patterns.is_empty() {
59            // A way to spell a pattern that can never match anything.
60            r"[^\S\s]".to_string()
61        } else {
62            pats.join("|")
63        };
64        if self.case_smart && !has_uppercase_literal(&singlepat) {
65            builder.caseless(true);
66        }
67        if self.whole_line {
68            singlepat = format!(r"(?m:^)(?:{})(?m:$)", singlepat);
69        } else if self.word {
70            // We make this option exclusive with whole_line because when
71            // whole_line is enabled, all matches necessary fall on word
72            // boundaries. So this extra goop is strictly redundant.
73            singlepat = format!(r"(?<!\w)(?:{})(?!\w)", singlepat);
74        }
75        log::trace!("final regex: {:?}", singlepat);
76        builder.build(&singlepat).map_err(Error::regex).map(|regex| {
77            let mut names = HashMap::new();
78            for (i, name) in regex.capture_names().iter().enumerate() {
79                if let Some(ref name) = *name {
80                    names.insert(name.to_string(), i);
81                }
82            }
83            RegexMatcher { regex, names }
84        })
85    }
86
87    /// Enables case insensitive matching.
88    ///
89    /// If the `utf` option is also set, then Unicode case folding is used
90    /// to determine case insensitivity. When the `utf` option is not set,
91    /// then only standard ASCII case insensitivity is considered.
92    ///
93    /// This option corresponds to the `i` flag.
94    pub fn caseless(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
95        self.builder.caseless(yes);
96        self
97    }
98
99    /// Whether to enable "smart case" or not.
100    ///
101    /// When smart case is enabled, the builder will automatically enable
102    /// case insensitive matching based on how the pattern is written. Namely,
103    /// case insensitive mode is enabled when both of the following things
104    /// are believed to be true:
105    ///
106    /// 1. The pattern contains at least one literal character. For example,
107    ///    `a\w` contains a literal (`a`) but `\w` does not.
108    /// 2. Of the literals in the pattern, none of them are considered to be
109    ///    uppercase according to Unicode. For example, `foo\pL` has no
110    ///    uppercase literals but `Foo\pL` does.
111    ///
112    /// Note that the implementation of this is not perfect. Namely, `\p{Ll}`
113    /// will prevent case insensitive matching even though it is part of a meta
114    /// sequence. This bug will probably never be fixed.
115    pub fn case_smart(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
116        self.case_smart = yes;
117        self
118    }
119
120    /// Enables "dot all" matching.
121    ///
122    /// When enabled, the `.` metacharacter in the pattern matches any
123    /// character, include `\n`. When disabled (the default), `.` will match
124    /// any character except for `\n`.
125    ///
126    /// This option corresponds to the `s` flag.
127    pub fn dotall(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
128        self.builder.dotall(yes);
129        self
130    }
131
132    /// Enable "extended" mode in the pattern, where whitespace is ignored.
133    ///
134    /// This option corresponds to the `x` flag.
135    pub fn extended(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
136        self.builder.extended(yes);
137        self
138    }
139
140    /// Enable multiline matching mode.
141    ///
142    /// When enabled, the `^` and `$` anchors will match both at the beginning
143    /// and end of a subject string, in addition to matching at the start of
144    /// a line and the end of a line. When disabled, the `^` and `$` anchors
145    /// will only match at the beginning and end of a subject string.
146    ///
147    /// This option corresponds to the `m` flag.
148    pub fn multi_line(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
149        self.builder.multi_line(yes);
150        self
151    }
152
153    /// Enable matching of CRLF as a line terminator.
154    ///
155    /// When enabled, anchors such as `^` and `$` will match any of the
156    /// following as a line terminator: `\r`, `\n` or `\r\n`.
157    ///
158    /// This is disabled by default, in which case, only `\n` is recognized as
159    /// a line terminator.
160    pub fn crlf(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
161        self.builder.crlf(yes);
162        self
163    }
164
165    /// Require that all matches occur on word boundaries.
166    ///
167    /// Enabling this option is subtly different than putting `\b` assertions
168    /// on both sides of your pattern. In particular, a `\b` assertion requires
169    /// that one side of it match a word character while the other match a
170    /// non-word character. This option, in contrast, merely requires that
171    /// one side match a non-word character.
172    ///
173    /// For example, `\b-2\b` will not match `foo -2 bar` since `-` is not a
174    /// word character. However, `-2` with this `word` option enabled will
175    /// match the `-2` in `foo -2 bar`.
176    pub fn word(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
177        self.word = yes;
178        self
179    }
180
181    /// Whether the patterns should be treated as literal strings or not. When
182    /// this is active, all characters, including ones that would normally be
183    /// special regex meta characters, are matched literally.
184    pub fn fixed_strings(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
185        self.fixed_strings = yes;
186        self
187    }
188
189    /// Whether each pattern should match the entire line or not. This is
190    /// equivalent to surrounding the pattern with `(?m:^)` and `(?m:$)`.
191    pub fn whole_line(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
192        self.whole_line = yes;
193        self
194    }
195
196    /// Enable Unicode matching mode.
197    ///
198    /// When enabled, the following patterns become Unicode aware: `\b`, `\B`,
199    /// `\d`, `\D`, `\s`, `\S`, `\w`, `\W`.
200    ///
201    /// When set, this implies UTF matching mode. It is not possible to enable
202    /// Unicode matching mode without enabling UTF matching mode.
203    ///
204    /// This is disabled by default.
205    pub fn ucp(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
206        self.builder.ucp(yes);
207        self
208    }
209
210    /// Enable UTF matching mode.
211    ///
212    /// When enabled, characters are treated as sequences of code units that
213    /// make up a single codepoint instead of as single bytes. For example,
214    /// this will cause `.` to match any single UTF-8 encoded codepoint, where
215    /// as when this is disabled, `.` will any single byte (except for `\n` in
216    /// both cases, unless "dot all" mode is enabled).
217    ///
218    /// Note that when UTF matching mode is enabled, every search performed
219    /// will do a UTF-8 validation check, which can impact performance. The
220    /// UTF-8 check can be disabled via the `disable_utf_check` option, but it
221    /// is undefined behavior to enable UTF matching mode and search invalid
222    /// UTF-8.
223    ///
224    /// This is disabled by default.
225    pub fn utf(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
226        self.builder.utf(yes);
227        self
228    }
229
230    /// This is now deprecated and is a no-op.
231    ///
232    /// Previously, this option permitted disabling PCRE2's UTF-8 validity
233    /// check, which could result in undefined behavior if the haystack was
234    /// not valid UTF-8. But PCRE2 introduced a new option, `PCRE2_MATCH_INVALID_UTF`,
235    /// in 10.34 which this crate always sets. When this option is enabled,
236    /// PCRE2 claims to not have undefined behavior when the haystack is
237    /// invalid UTF-8.
238    ///
239    /// Therefore, disabling the UTF-8 check is not something that is exposed
240    /// by this crate.
241    #[deprecated(
242        since = "0.2.4",
243        note = "now a no-op due to new PCRE2 features"
244    )]
245    pub fn disable_utf_check(&mut self) -> &mut RegexMatcherBuilder {
246        self
247    }
248
249    /// Enable PCRE2's JIT and return an error if it's not available.
250    ///
251    /// This generally speeds up matching quite a bit. The downside is that it
252    /// can increase the time it takes to compile a pattern.
253    ///
254    /// If the JIT isn't available or if JIT compilation returns an error, then
255    /// regex compilation will fail with the corresponding error.
256    ///
257    /// This is disabled by default, and always overrides `jit_if_available`.
258    pub fn jit(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
259        self.builder.jit(yes);
260        self
261    }
262
263    /// Enable PCRE2's JIT if it's available.
264    ///
265    /// This generally speeds up matching quite a bit. The downside is that it
266    /// can increase the time it takes to compile a pattern.
267    ///
268    /// If the JIT isn't available or if JIT compilation returns an error,
269    /// then a debug message with the error will be emitted and the regex will
270    /// otherwise silently fall back to non-JIT matching.
271    ///
272    /// This is disabled by default, and always overrides `jit`.
273    pub fn jit_if_available(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
274        self.builder.jit_if_available(yes);
275        self
276    }
277
278    /// Set the maximum size of PCRE2's JIT stack, in bytes. If the JIT is
279    /// not enabled, then this has no effect.
280    ///
281    /// When `None` is given, no custom JIT stack will be created, and instead,
282    /// the default JIT stack is used. When the default is used, its maximum
283    /// size is 32 KB.
284    ///
285    /// When this is set, then a new JIT stack will be created with the given
286    /// maximum size as its limit.
287    ///
288    /// Increasing the stack size can be useful for larger regular expressions.
289    ///
290    /// By default, this is set to `None`.
291    pub fn max_jit_stack_size(
292        &mut self,
293        bytes: Option<usize>,
294    ) -> &mut RegexMatcherBuilder {
295        self.builder.max_jit_stack_size(bytes);
296        self
297    }
298}
299
300/// An implementation of the `Matcher` trait using PCRE2.
301#[derive(Clone, Debug)]
302pub struct RegexMatcher {
303    regex: Regex,
304    names: HashMap<String, usize>,
305}
306
307impl RegexMatcher {
308    /// Create a new matcher from the given pattern using the default
309    /// configuration.
310    pub fn new(pattern: &str) -> Result<RegexMatcher, Error> {
311        RegexMatcherBuilder::new().build(pattern)
312    }
313}
314
315impl Matcher for RegexMatcher {
316    type Captures = RegexCaptures;
317    type Error = Error;
318
319    fn find_at(
320        &self,
321        haystack: &[u8],
322        at: usize,
323    ) -> Result<Option<Match>, Error> {
324        Ok(self
325            .regex
326            .find_at(haystack, at)
327            .map_err(Error::regex)?
328            .map(|m| Match::new(m.start(), m.end())))
329    }
330
331    fn new_captures(&self) -> Result<RegexCaptures, Error> {
332        Ok(RegexCaptures::new(self.regex.capture_locations()))
333    }
334
335    fn capture_count(&self) -> usize {
336        self.regex.captures_len()
337    }
338
339    fn capture_index(&self, name: &str) -> Option<usize> {
340        self.names.get(name).map(|i| *i)
341    }
342
343    fn try_find_iter<F, E>(
344        &self,
345        haystack: &[u8],
346        mut matched: F,
347    ) -> Result<Result<(), E>, Error>
348    where
349        F: FnMut(Match) -> Result<bool, E>,
350    {
351        for result in self.regex.find_iter(haystack) {
352            let m = result.map_err(Error::regex)?;
353            match matched(Match::new(m.start(), m.end())) {
354                Ok(true) => continue,
355                Ok(false) => return Ok(Ok(())),
356                Err(err) => return Ok(Err(err)),
357            }
358        }
359        Ok(Ok(()))
360    }
361
362    fn captures_at(
363        &self,
364        haystack: &[u8],
365        at: usize,
366        caps: &mut RegexCaptures,
367    ) -> Result<bool, Error> {
368        Ok(self
369            .regex
370            .captures_read_at(&mut caps.locs, haystack, at)
371            .map_err(Error::regex)?
372            .is_some())
373    }
374}
375
376/// Represents the match offsets of each capturing group in a match.
377///
378/// The first, or `0`th capture group, always corresponds to the entire match
379/// and is guaranteed to be present when a match occurs. The next capture
380/// group, at index `1`, corresponds to the first capturing group in the regex,
381/// ordered by the position at which the left opening parenthesis occurs.
382///
383/// Note that not all capturing groups are guaranteed to be present in a match.
384/// For example, in the regex, `(?P<foo>\w)|(?P<bar>\W)`, only one of `foo`
385/// or `bar` will ever be set in any given match.
386///
387/// In order to access a capture group by name, you'll need to first find the
388/// index of the group using the corresponding matcher's `capture_index`
389/// method, and then use that index with `RegexCaptures::get`.
390#[derive(Clone, Debug)]
391pub struct RegexCaptures {
392    /// Where the locations are stored.
393    locs: CaptureLocations,
394}
395
396impl Captures for RegexCaptures {
397    fn len(&self) -> usize {
398        self.locs.len()
399    }
400
401    fn get(&self, i: usize) -> Option<Match> {
402        self.locs.get(i).map(|(s, e)| Match::new(s, e))
403    }
404}
405
406impl RegexCaptures {
407    pub(crate) fn new(locs: CaptureLocations) -> RegexCaptures {
408        RegexCaptures { locs }
409    }
410}
411
412/// Determine whether the pattern contains an uppercase character which should
413/// negate the effect of the smart-case option.
414///
415/// Ideally we would be able to check the AST in order to correctly handle
416/// things like '\p{Ll}' and '\p{Lu}' (which should be treated as explicitly
417/// cased), but PCRE doesn't expose enough details for that kind of analysis.
418/// For now, our 'good enough' solution is to simply perform a semi-naïve
419/// scan of the input pattern and ignore all characters following a '\'. The
420/// This at least lets us support the most common cases, like 'foo\w' and
421/// 'foo\S', in an intuitive manner.
422fn has_uppercase_literal(pattern: &str) -> bool {
423    let mut chars = pattern.chars();
424    while let Some(c) = chars.next() {
425        if c == '\\' {
426            chars.next();
427        } else if c.is_uppercase() {
428            return true;
429        }
430    }
431    false
432}
433
#[cfg(test)]
mod tests {
    use grep_matcher::LineMatchKind;

    use super::*;

    // The word option accepts a match that is merely surrounded by non-word
    // characters, which is not the same thing as wrapping the regex in `\b`.
    #[test]
    fn word() {
        let haystack = b"abc -2 foo";

        let with_word_option =
            RegexMatcherBuilder::new().word(true).build(r"-2").unwrap();
        assert!(with_word_option.is_match(haystack).unwrap());

        let with_assertions =
            RegexMatcherBuilder::new().word(false).build(r"\b-2\b").unwrap();
        assert!(!with_assertions.is_match(haystack).unwrap());
    }

    // Enabling CRLF lets `$` match at the end of a `\r\n` terminated line.
    #[test]
    fn line_terminator_crlf() {
        let mut builder = RegexMatcherBuilder::new();
        builder.multi_line(true);

        // Without CRLF mode, `$` matches before `\n` but not before `\r\n`.
        let lf_only = builder.build(r"abc$").unwrap();
        assert!(lf_only.is_match(b"abc\n").unwrap());
        assert!(!lf_only.is_match(b"abc\r\n").unwrap());

        // With CRLF mode, the `\r\n` boundary is recognized.
        let crlf_aware = builder.crlf(true).build(r"abc$").unwrap();
        assert!(crlf_aware.is_match(b"abc\r\n").unwrap());
    }

    // Smart case is case insensitive for an all-lowercase pattern and case
    // sensitive as soon as the pattern has an uppercase literal.
    #[test]
    fn case_smart() {
        let haystack = b"ABC";

        let lowercase =
            RegexMatcherBuilder::new().case_smart(true).build(r"abc").unwrap();
        assert!(lowercase.is_match(haystack).unwrap());

        let mixed_case =
            RegexMatcherBuilder::new().case_smart(true).build(r"aBc").unwrap();
        assert!(!mixed_case.is_match(haystack).unwrap());
    }

    // Finding a candidate line reports a confirmed match.
    #[test]
    fn candidate_lines() {
        let matcher = RegexMatcherBuilder::new().build(r"\wfoo\s").unwrap();
        let kind = matcher.find_candidate_line(b"afoo ").unwrap().unwrap();
        assert!(matches!(kind, LineMatchKind::Confirmed(_)));
    }
}