grep_pcre2/
matcher.rs

1use std::collections::HashMap;
2
3use {
4    grep_matcher::{Captures, Match, Matcher},
5    pcre2::bytes::{CaptureLocations, Regex, RegexBuilder},
6};
7
8use crate::error::Error;
9
10/// A builder for configuring the compilation of a PCRE2 regex.
11#[derive(Clone, Debug)]
12pub struct RegexMatcherBuilder {
13    builder: RegexBuilder,
14    case_smart: bool,
15    word: bool,
16    fixed_strings: bool,
17    whole_line: bool,
18}
19
20impl RegexMatcherBuilder {
21    /// Create a new matcher builder with a default configuration.
22    pub fn new() -> RegexMatcherBuilder {
23        RegexMatcherBuilder {
24            builder: RegexBuilder::new(),
25            case_smart: false,
26            word: false,
27            fixed_strings: false,
28            whole_line: false,
29        }
30    }
31
32    /// Compile the given pattern into a PCRE matcher using the current
33    /// configuration.
34    ///
35    /// If there was a problem compiling the pattern, then an error is
36    /// returned.
37    pub fn build(&self, pattern: &str) -> Result<RegexMatcher, Error> {
38        self.build_many(&[pattern])
39    }
40
41    /// Compile all of the given patterns into a single regex that matches when
42    /// at least one of the patterns matches.
43    ///
44    /// If there was a problem building the regex, then an error is returned.
45    pub fn build_many<P: AsRef<str>>(
46        &self,
47        patterns: &[P],
48    ) -> Result<RegexMatcher, Error> {
49        let mut builder = self.builder.clone();
50        let mut pats = Vec::with_capacity(patterns.len());
51        for p in patterns.iter() {
52            pats.push(if self.fixed_strings {
53                format!("(?:{})", pcre2::escape(p.as_ref()))
54            } else {
55                format!("(?:{})", p.as_ref())
56            });
57        }
58        let mut singlepat = pats.join("|");
59        if self.case_smart && !has_uppercase_literal(&singlepat) {
60            builder.caseless(true);
61        }
62        if self.whole_line {
63            singlepat = format!(r"(?m:^)(?:{})(?m:$)", singlepat);
64        } else if self.word {
65            // We make this option exclusive with whole_line because when
66            // whole_line is enabled, all matches necessary fall on word
67            // boundaries. So this extra goop is strictly redundant.
68            singlepat = format!(r"(?<!\w)(?:{})(?!\w)", singlepat);
69        }
70        log::trace!("final regex: {:?}", singlepat);
71        builder.build(&singlepat).map_err(Error::regex).map(|regex| {
72            let mut names = HashMap::new();
73            for (i, name) in regex.capture_names().iter().enumerate() {
74                if let Some(ref name) = *name {
75                    names.insert(name.to_string(), i);
76                }
77            }
78            RegexMatcher { regex, names }
79        })
80    }
81
82    /// Enables case insensitive matching.
83    ///
84    /// If the `utf` option is also set, then Unicode case folding is used
85    /// to determine case insensitivity. When the `utf` option is not set,
86    /// then only standard ASCII case insensitivity is considered.
87    ///
88    /// This option corresponds to the `i` flag.
89    pub fn caseless(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
90        self.builder.caseless(yes);
91        self
92    }
93
94    /// Whether to enable "smart case" or not.
95    ///
96    /// When smart case is enabled, the builder will automatically enable
97    /// case insensitive matching based on how the pattern is written. Namely,
98    /// case insensitive mode is enabled when both of the following things
99    /// are believed to be true:
100    ///
101    /// 1. The pattern contains at least one literal character. For example,
102    ///    `a\w` contains a literal (`a`) but `\w` does not.
103    /// 2. Of the literals in the pattern, none of them are considered to be
104    ///    uppercase according to Unicode. For example, `foo\pL` has no
105    ///    uppercase literals but `Foo\pL` does.
106    ///
107    /// Note that the implementation of this is not perfect. Namely, `\p{Ll}`
108    /// will prevent case insensitive matching even though it is part of a meta
109    /// sequence. This bug will probably never be fixed.
110    pub fn case_smart(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
111        self.case_smart = yes;
112        self
113    }
114
115    /// Enables "dot all" matching.
116    ///
117    /// When enabled, the `.` metacharacter in the pattern matches any
118    /// character, include `\n`. When disabled (the default), `.` will match
119    /// any character except for `\n`.
120    ///
121    /// This option corresponds to the `s` flag.
122    pub fn dotall(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
123        self.builder.dotall(yes);
124        self
125    }
126
127    /// Enable "extended" mode in the pattern, where whitespace is ignored.
128    ///
129    /// This option corresponds to the `x` flag.
130    pub fn extended(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
131        self.builder.extended(yes);
132        self
133    }
134
135    /// Enable multiline matching mode.
136    ///
137    /// When enabled, the `^` and `$` anchors will match both at the beginning
138    /// and end of a subject string, in addition to matching at the start of
139    /// a line and the end of a line. When disabled, the `^` and `$` anchors
140    /// will only match at the beginning and end of a subject string.
141    ///
142    /// This option corresponds to the `m` flag.
143    pub fn multi_line(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
144        self.builder.multi_line(yes);
145        self
146    }
147
148    /// Enable matching of CRLF as a line terminator.
149    ///
150    /// When enabled, anchors such as `^` and `$` will match any of the
151    /// following as a line terminator: `\r`, `\n` or `\r\n`.
152    ///
153    /// This is disabled by default, in which case, only `\n` is recognized as
154    /// a line terminator.
155    pub fn crlf(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
156        self.builder.crlf(yes);
157        self
158    }
159
160    /// Require that all matches occur on word boundaries.
161    ///
162    /// Enabling this option is subtly different than putting `\b` assertions
163    /// on both sides of your pattern. In particular, a `\b` assertion requires
164    /// that one side of it match a word character while the other match a
165    /// non-word character. This option, in contrast, merely requires that
166    /// one side match a non-word character.
167    ///
168    /// For example, `\b-2\b` will not match `foo -2 bar` since `-` is not a
169    /// word character. However, `-2` with this `word` option enabled will
170    /// match the `-2` in `foo -2 bar`.
171    pub fn word(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
172        self.word = yes;
173        self
174    }
175
176    /// Whether the patterns should be treated as literal strings or not. When
177    /// this is active, all characters, including ones that would normally be
178    /// special regex meta characters, are matched literally.
179    pub fn fixed_strings(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
180        self.fixed_strings = yes;
181        self
182    }
183
184    /// Whether each pattern should match the entire line or not. This is
185    /// equivalent to surrounding the pattern with `(?m:^)` and `(?m:$)`.
186    pub fn whole_line(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
187        self.whole_line = yes;
188        self
189    }
190
191    /// Enable Unicode matching mode.
192    ///
193    /// When enabled, the following patterns become Unicode aware: `\b`, `\B`,
194    /// `\d`, `\D`, `\s`, `\S`, `\w`, `\W`.
195    ///
196    /// When set, this implies UTF matching mode. It is not possible to enable
197    /// Unicode matching mode without enabling UTF matching mode.
198    ///
199    /// This is disabled by default.
200    pub fn ucp(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
201        self.builder.ucp(yes);
202        self
203    }
204
205    /// Enable UTF matching mode.
206    ///
207    /// When enabled, characters are treated as sequences of code units that
208    /// make up a single codepoint instead of as single bytes. For example,
209    /// this will cause `.` to match any single UTF-8 encoded codepoint, where
210    /// as when this is disabled, `.` will any single byte (except for `\n` in
211    /// both cases, unless "dot all" mode is enabled).
212    ///
213    /// Note that when UTF matching mode is enabled, every search performed
214    /// will do a UTF-8 validation check, which can impact performance. The
215    /// UTF-8 check can be disabled via the `disable_utf_check` option, but it
216    /// is undefined behavior to enable UTF matching mode and search invalid
217    /// UTF-8.
218    ///
219    /// This is disabled by default.
220    pub fn utf(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
221        self.builder.utf(yes);
222        self
223    }
224
225    /// This is now deprecated and is a no-op.
226    ///
227    /// Previously, this option permitted disabling PCRE2's UTF-8 validity
228    /// check, which could result in undefined behavior if the haystack was
229    /// not valid UTF-8. But PCRE2 introduced a new option, `PCRE2_MATCH_INVALID_UTF`,
230    /// in 10.34 which this crate always sets. When this option is enabled,
231    /// PCRE2 claims to not have undefined behavior when the haystack is
232    /// invalid UTF-8.
233    ///
234    /// Therefore, disabling the UTF-8 check is not something that is exposed
235    /// by this crate.
236    #[deprecated(
237        since = "0.2.4",
238        note = "now a no-op due to new PCRE2 features"
239    )]
240    pub fn disable_utf_check(&mut self) -> &mut RegexMatcherBuilder {
241        self
242    }
243
244    /// Enable PCRE2's JIT and return an error if it's not available.
245    ///
246    /// This generally speeds up matching quite a bit. The downside is that it
247    /// can increase the time it takes to compile a pattern.
248    ///
249    /// If the JIT isn't available or if JIT compilation returns an error, then
250    /// regex compilation will fail with the corresponding error.
251    ///
252    /// This is disabled by default, and always overrides `jit_if_available`.
253    pub fn jit(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
254        self.builder.jit(yes);
255        self
256    }
257
258    /// Enable PCRE2's JIT if it's available.
259    ///
260    /// This generally speeds up matching quite a bit. The downside is that it
261    /// can increase the time it takes to compile a pattern.
262    ///
263    /// If the JIT isn't available or if JIT compilation returns an error,
264    /// then a debug message with the error will be emitted and the regex will
265    /// otherwise silently fall back to non-JIT matching.
266    ///
267    /// This is disabled by default, and always overrides `jit`.
268    pub fn jit_if_available(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
269        self.builder.jit_if_available(yes);
270        self
271    }
272
273    /// Set the maximum size of PCRE2's JIT stack, in bytes. If the JIT is
274    /// not enabled, then this has no effect.
275    ///
276    /// When `None` is given, no custom JIT stack will be created, and instead,
277    /// the default JIT stack is used. When the default is used, its maximum
278    /// size is 32 KB.
279    ///
280    /// When this is set, then a new JIT stack will be created with the given
281    /// maximum size as its limit.
282    ///
283    /// Increasing the stack size can be useful for larger regular expressions.
284    ///
285    /// By default, this is set to `None`.
286    pub fn max_jit_stack_size(
287        &mut self,
288        bytes: Option<usize>,
289    ) -> &mut RegexMatcherBuilder {
290        self.builder.max_jit_stack_size(bytes);
291        self
292    }
293}
294
295/// An implementation of the `Matcher` trait using PCRE2.
296#[derive(Clone, Debug)]
297pub struct RegexMatcher {
298    regex: Regex,
299    names: HashMap<String, usize>,
300}
301
302impl RegexMatcher {
303    /// Create a new matcher from the given pattern using the default
304    /// configuration.
305    pub fn new(pattern: &str) -> Result<RegexMatcher, Error> {
306        RegexMatcherBuilder::new().build(pattern)
307    }
308}
309
310impl Matcher for RegexMatcher {
311    type Captures = RegexCaptures;
312    type Error = Error;
313
314    fn find_at(
315        &self,
316        haystack: &[u8],
317        at: usize,
318    ) -> Result<Option<Match>, Error> {
319        Ok(self
320            .regex
321            .find_at(haystack, at)
322            .map_err(Error::regex)?
323            .map(|m| Match::new(m.start(), m.end())))
324    }
325
326    fn new_captures(&self) -> Result<RegexCaptures, Error> {
327        Ok(RegexCaptures::new(self.regex.capture_locations()))
328    }
329
330    fn capture_count(&self) -> usize {
331        self.regex.captures_len()
332    }
333
334    fn capture_index(&self, name: &str) -> Option<usize> {
335        self.names.get(name).map(|i| *i)
336    }
337
338    fn try_find_iter<F, E>(
339        &self,
340        haystack: &[u8],
341        mut matched: F,
342    ) -> Result<Result<(), E>, Error>
343    where
344        F: FnMut(Match) -> Result<bool, E>,
345    {
346        for result in self.regex.find_iter(haystack) {
347            let m = result.map_err(Error::regex)?;
348            match matched(Match::new(m.start(), m.end())) {
349                Ok(true) => continue,
350                Ok(false) => return Ok(Ok(())),
351                Err(err) => return Ok(Err(err)),
352            }
353        }
354        Ok(Ok(()))
355    }
356
357    fn captures_at(
358        &self,
359        haystack: &[u8],
360        at: usize,
361        caps: &mut RegexCaptures,
362    ) -> Result<bool, Error> {
363        Ok(self
364            .regex
365            .captures_read_at(&mut caps.locs, haystack, at)
366            .map_err(Error::regex)?
367            .is_some())
368    }
369}
370
371/// Represents the match offsets of each capturing group in a match.
372///
373/// The first, or `0`th capture group, always corresponds to the entire match
374/// and is guaranteed to be present when a match occurs. The next capture
375/// group, at index `1`, corresponds to the first capturing group in the regex,
376/// ordered by the position at which the left opening parenthesis occurs.
377///
378/// Note that not all capturing groups are guaranteed to be present in a match.
379/// For example, in the regex, `(?P<foo>\w)|(?P<bar>\W)`, only one of `foo`
380/// or `bar` will ever be set in any given match.
381///
382/// In order to access a capture group by name, you'll need to first find the
383/// index of the group using the corresponding matcher's `capture_index`
384/// method, and then use that index with `RegexCaptures::get`.
385#[derive(Clone, Debug)]
386pub struct RegexCaptures {
387    /// Where the locations are stored.
388    locs: CaptureLocations,
389}
390
391impl Captures for RegexCaptures {
392    fn len(&self) -> usize {
393        self.locs.len()
394    }
395
396    fn get(&self, i: usize) -> Option<Match> {
397        self.locs.get(i).map(|(s, e)| Match::new(s, e))
398    }
399}
400
401impl RegexCaptures {
402    pub(crate) fn new(locs: CaptureLocations) -> RegexCaptures {
403        RegexCaptures { locs }
404    }
405}
406
/// Determine whether the pattern contains an uppercase character which
/// should negate the effect of the smart-case option.
///
/// Ideally we would be able to check the AST in order to correctly handle
/// things like '\p{Ll}' and '\p{Lu}' (which should be treated as explicitly
/// cased), but PCRE doesn't expose enough details for that kind of analysis.
/// For now, our 'good enough' solution is to perform a semi-naïve scan of
/// the input pattern that ignores the single character following each '\'.
/// This at least lets us support the most common cases, like 'foo\w' and
/// 'foo\S', in an intuitive manner.
///
/// Every other uppercase character counts, including ones that are part of
/// the regex syntax, such as the 'P' in '(?P<name>...)'.
fn has_uppercase_literal(pattern: &str) -> bool {
    // Set when the previous character was an unescaped backslash, in which
    // case the current character is skipped regardless of what it is.
    let mut escaped = false;
    for ch in pattern.chars() {
        if escaped {
            escaped = false;
        } else if ch == '\\' {
            escaped = true;
        } else if ch.is_uppercase() {
            return true;
        }
    }
    false
}
428
#[cfg(test)]
mod tests {
    use grep_matcher::LineMatchKind;

    use super::*;

    /// Build a matcher for `pattern` from a fresh builder after letting
    /// `configure` set options on it. Panics if the pattern fails to
    /// compile.
    fn compile<F>(pattern: &str, configure: F) -> RegexMatcher
    where
        F: FnOnce(&mut RegexMatcherBuilder),
    {
        let mut builder = RegexMatcherBuilder::new();
        configure(&mut builder);
        builder.build(pattern).unwrap()
    }

    // The word option accepts matches that surrounding the regex in `\b`
    // would reject.
    #[test]
    fn word() {
        let with_option = compile(r"-2", |b| {
            b.word(true);
        });
        assert!(with_option.is_match(b"abc -2 foo").unwrap());

        let with_assertions = compile(r"\b-2\b", |b| {
            b.word(false);
        });
        assert!(!with_assertions.is_match(b"abc -2 foo").unwrap());
    }

    // Enabling CRLF permits `$` to match before a `\r\n` line terminator.
    #[test]
    fn line_terminator_crlf() {
        // By default, `$` matches before a `\n` line terminator...
        let lf_only = compile(r"abc$", |b| {
            b.multi_line(true);
        });
        assert!(lf_only.is_match(b"abc\n").unwrap());

        // ...but not before a `\r\n` line terminator.
        let lf_only = compile(r"abc$", |b| {
            b.multi_line(true);
        });
        assert!(!lf_only.is_match(b"abc\r\n").unwrap());

        // With CRLF handling enabled, `\r\n` is recognized too.
        let crlf_aware = compile(r"abc$", |b| {
            b.multi_line(true).crlf(true);
        });
        assert!(crlf_aware.is_match(b"abc\r\n").unwrap());
    }

    // Smart case only ignores case when the pattern has no uppercase.
    #[test]
    fn case_smart() {
        let all_lowercase = compile(r"abc", |b| {
            b.case_smart(true);
        });
        assert!(all_lowercase.is_match(b"ABC").unwrap());

        let mixed_case = compile(r"aBc", |b| {
            b.case_smart(true);
        });
        assert!(!mixed_case.is_match(b"ABC").unwrap());
    }

    // Finding candidate lines reports a confirmed match.
    #[test]
    fn candidate_lines() {
        let matcher = compile(r"\wfoo\s", |_| {});
        let kind = matcher.find_candidate_line(b"afoo ").unwrap().unwrap();
        assert!(matches!(kind, LineMatchKind::Confirmed(_)));
    }
}