grep_pcre2/matcher.rs
1use std::collections::HashMap;
2
3use {
4 grep_matcher::{Captures, Match, Matcher},
5 pcre2::bytes::{CaptureLocations, Regex, RegexBuilder},
6};
7
8use crate::error::Error;
9
10/// A builder for configuring the compilation of a PCRE2 regex.
11#[derive(Clone, Debug)]
12pub struct RegexMatcherBuilder {
13 builder: RegexBuilder,
14 case_smart: bool,
15 word: bool,
16 fixed_strings: bool,
17 whole_line: bool,
18}
19
20impl RegexMatcherBuilder {
21 /// Create a new matcher builder with a default configuration.
22 pub fn new() -> RegexMatcherBuilder {
23 RegexMatcherBuilder {
24 builder: RegexBuilder::new(),
25 case_smart: false,
26 word: false,
27 fixed_strings: false,
28 whole_line: false,
29 }
30 }
31
32 /// Compile the given pattern into a PCRE matcher using the current
33 /// configuration.
34 ///
35 /// If there was a problem compiling the pattern, then an error is
36 /// returned.
37 pub fn build(&self, pattern: &str) -> Result<RegexMatcher, Error> {
38 self.build_many(&[pattern])
39 }
40
41 /// Compile all of the given patterns into a single regex that matches when
42 /// at least one of the patterns matches.
43 ///
44 /// If there was a problem building the regex, then an error is returned.
45 pub fn build_many<P: AsRef<str>>(
46 &self,
47 patterns: &[P],
48 ) -> Result<RegexMatcher, Error> {
49 let mut builder = self.builder.clone();
50 let mut pats = Vec::with_capacity(patterns.len());
51 for p in patterns.iter() {
52 pats.push(if self.fixed_strings {
53 format!("(?:{})", pcre2::escape(p.as_ref()))
54 } else {
55 format!("(?:{})", p.as_ref())
56 });
57 }
58 let mut singlepat = if patterns.is_empty() {
59 // A way to spell a pattern that can never match anything.
60 r"[^\S\s]".to_string()
61 } else {
62 pats.join("|")
63 };
64 if self.case_smart && !has_uppercase_literal(&singlepat) {
65 builder.caseless(true);
66 }
67 if self.whole_line {
68 singlepat = format!(r"(?m:^)(?:{})(?m:$)", singlepat);
69 } else if self.word {
70 // We make this option exclusive with whole_line because when
71 // whole_line is enabled, all matches necessary fall on word
72 // boundaries. So this extra goop is strictly redundant.
73 singlepat = format!(r"(?<!\w)(?:{})(?!\w)", singlepat);
74 }
75 log::trace!("final regex: {:?}", singlepat);
76 builder.build(&singlepat).map_err(Error::regex).map(|regex| {
77 let mut names = HashMap::new();
78 for (i, name) in regex.capture_names().iter().enumerate() {
79 if let Some(ref name) = *name {
80 names.insert(name.to_string(), i);
81 }
82 }
83 RegexMatcher { regex, names }
84 })
85 }
86
87 /// Enables case insensitive matching.
88 ///
89 /// If the `utf` option is also set, then Unicode case folding is used
90 /// to determine case insensitivity. When the `utf` option is not set,
91 /// then only standard ASCII case insensitivity is considered.
92 ///
93 /// This option corresponds to the `i` flag.
94 pub fn caseless(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
95 self.builder.caseless(yes);
96 self
97 }
98
99 /// Whether to enable "smart case" or not.
100 ///
101 /// When smart case is enabled, the builder will automatically enable
102 /// case insensitive matching based on how the pattern is written. Namely,
103 /// case insensitive mode is enabled when both of the following things
104 /// are believed to be true:
105 ///
106 /// 1. The pattern contains at least one literal character. For example,
107 /// `a\w` contains a literal (`a`) but `\w` does not.
108 /// 2. Of the literals in the pattern, none of them are considered to be
109 /// uppercase according to Unicode. For example, `foo\pL` has no
110 /// uppercase literals but `Foo\pL` does.
111 ///
112 /// Note that the implementation of this is not perfect. Namely, `\p{Ll}`
113 /// will prevent case insensitive matching even though it is part of a meta
114 /// sequence. This bug will probably never be fixed.
115 pub fn case_smart(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
116 self.case_smart = yes;
117 self
118 }
119
120 /// Enables "dot all" matching.
121 ///
122 /// When enabled, the `.` metacharacter in the pattern matches any
123 /// character, include `\n`. When disabled (the default), `.` will match
124 /// any character except for `\n`.
125 ///
126 /// This option corresponds to the `s` flag.
127 pub fn dotall(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
128 self.builder.dotall(yes);
129 self
130 }
131
132 /// Enable "extended" mode in the pattern, where whitespace is ignored.
133 ///
134 /// This option corresponds to the `x` flag.
135 pub fn extended(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
136 self.builder.extended(yes);
137 self
138 }
139
140 /// Enable multiline matching mode.
141 ///
142 /// When enabled, the `^` and `$` anchors will match both at the beginning
143 /// and end of a subject string, in addition to matching at the start of
144 /// a line and the end of a line. When disabled, the `^` and `$` anchors
145 /// will only match at the beginning and end of a subject string.
146 ///
147 /// This option corresponds to the `m` flag.
148 pub fn multi_line(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
149 self.builder.multi_line(yes);
150 self
151 }
152
153 /// Enable matching of CRLF as a line terminator.
154 ///
155 /// When enabled, anchors such as `^` and `$` will match any of the
156 /// following as a line terminator: `\r`, `\n` or `\r\n`.
157 ///
158 /// This is disabled by default, in which case, only `\n` is recognized as
159 /// a line terminator.
160 pub fn crlf(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
161 self.builder.crlf(yes);
162 self
163 }
164
165 /// Require that all matches occur on word boundaries.
166 ///
167 /// Enabling this option is subtly different than putting `\b` assertions
168 /// on both sides of your pattern. In particular, a `\b` assertion requires
169 /// that one side of it match a word character while the other match a
170 /// non-word character. This option, in contrast, merely requires that
171 /// one side match a non-word character.
172 ///
173 /// For example, `\b-2\b` will not match `foo -2 bar` since `-` is not a
174 /// word character. However, `-2` with this `word` option enabled will
175 /// match the `-2` in `foo -2 bar`.
176 pub fn word(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
177 self.word = yes;
178 self
179 }
180
181 /// Whether the patterns should be treated as literal strings or not. When
182 /// this is active, all characters, including ones that would normally be
183 /// special regex meta characters, are matched literally.
184 pub fn fixed_strings(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
185 self.fixed_strings = yes;
186 self
187 }
188
189 /// Whether each pattern should match the entire line or not. This is
190 /// equivalent to surrounding the pattern with `(?m:^)` and `(?m:$)`.
191 pub fn whole_line(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
192 self.whole_line = yes;
193 self
194 }
195
196 /// Enable Unicode matching mode.
197 ///
198 /// When enabled, the following patterns become Unicode aware: `\b`, `\B`,
199 /// `\d`, `\D`, `\s`, `\S`, `\w`, `\W`.
200 ///
201 /// When set, this implies UTF matching mode. It is not possible to enable
202 /// Unicode matching mode without enabling UTF matching mode.
203 ///
204 /// This is disabled by default.
205 pub fn ucp(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
206 self.builder.ucp(yes);
207 self
208 }
209
210 /// Enable UTF matching mode.
211 ///
212 /// When enabled, characters are treated as sequences of code units that
213 /// make up a single codepoint instead of as single bytes. For example,
214 /// this will cause `.` to match any single UTF-8 encoded codepoint, where
215 /// as when this is disabled, `.` will any single byte (except for `\n` in
216 /// both cases, unless "dot all" mode is enabled).
217 ///
218 /// Note that when UTF matching mode is enabled, every search performed
219 /// will do a UTF-8 validation check, which can impact performance. The
220 /// UTF-8 check can be disabled via the `disable_utf_check` option, but it
221 /// is undefined behavior to enable UTF matching mode and search invalid
222 /// UTF-8.
223 ///
224 /// This is disabled by default.
225 pub fn utf(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
226 self.builder.utf(yes);
227 self
228 }
229
230 /// This is now deprecated and is a no-op.
231 ///
232 /// Previously, this option permitted disabling PCRE2's UTF-8 validity
233 /// check, which could result in undefined behavior if the haystack was
234 /// not valid UTF-8. But PCRE2 introduced a new option, `PCRE2_MATCH_INVALID_UTF`,
235 /// in 10.34 which this crate always sets. When this option is enabled,
236 /// PCRE2 claims to not have undefined behavior when the haystack is
237 /// invalid UTF-8.
238 ///
239 /// Therefore, disabling the UTF-8 check is not something that is exposed
240 /// by this crate.
241 #[deprecated(
242 since = "0.2.4",
243 note = "now a no-op due to new PCRE2 features"
244 )]
245 pub fn disable_utf_check(&mut self) -> &mut RegexMatcherBuilder {
246 self
247 }
248
249 /// Enable PCRE2's JIT and return an error if it's not available.
250 ///
251 /// This generally speeds up matching quite a bit. The downside is that it
252 /// can increase the time it takes to compile a pattern.
253 ///
254 /// If the JIT isn't available or if JIT compilation returns an error, then
255 /// regex compilation will fail with the corresponding error.
256 ///
257 /// This is disabled by default, and always overrides `jit_if_available`.
258 pub fn jit(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
259 self.builder.jit(yes);
260 self
261 }
262
263 /// Enable PCRE2's JIT if it's available.
264 ///
265 /// This generally speeds up matching quite a bit. The downside is that it
266 /// can increase the time it takes to compile a pattern.
267 ///
268 /// If the JIT isn't available or if JIT compilation returns an error,
269 /// then a debug message with the error will be emitted and the regex will
270 /// otherwise silently fall back to non-JIT matching.
271 ///
272 /// This is disabled by default, and always overrides `jit`.
273 pub fn jit_if_available(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
274 self.builder.jit_if_available(yes);
275 self
276 }
277
278 /// Set the maximum size of PCRE2's JIT stack, in bytes. If the JIT is
279 /// not enabled, then this has no effect.
280 ///
281 /// When `None` is given, no custom JIT stack will be created, and instead,
282 /// the default JIT stack is used. When the default is used, its maximum
283 /// size is 32 KB.
284 ///
285 /// When this is set, then a new JIT stack will be created with the given
286 /// maximum size as its limit.
287 ///
288 /// Increasing the stack size can be useful for larger regular expressions.
289 ///
290 /// By default, this is set to `None`.
291 pub fn max_jit_stack_size(
292 &mut self,
293 bytes: Option<usize>,
294 ) -> &mut RegexMatcherBuilder {
295 self.builder.max_jit_stack_size(bytes);
296 self
297 }
298}
299
300/// An implementation of the `Matcher` trait using PCRE2.
301#[derive(Clone, Debug)]
302pub struct RegexMatcher {
303 regex: Regex,
304 names: HashMap<String, usize>,
305}
306
307impl RegexMatcher {
308 /// Create a new matcher from the given pattern using the default
309 /// configuration.
310 pub fn new(pattern: &str) -> Result<RegexMatcher, Error> {
311 RegexMatcherBuilder::new().build(pattern)
312 }
313}
314
315impl Matcher for RegexMatcher {
316 type Captures = RegexCaptures;
317 type Error = Error;
318
319 fn find_at(
320 &self,
321 haystack: &[u8],
322 at: usize,
323 ) -> Result<Option<Match>, Error> {
324 Ok(self
325 .regex
326 .find_at(haystack, at)
327 .map_err(Error::regex)?
328 .map(|m| Match::new(m.start(), m.end())))
329 }
330
331 fn new_captures(&self) -> Result<RegexCaptures, Error> {
332 Ok(RegexCaptures::new(self.regex.capture_locations()))
333 }
334
335 fn capture_count(&self) -> usize {
336 self.regex.captures_len()
337 }
338
339 fn capture_index(&self, name: &str) -> Option<usize> {
340 self.names.get(name).map(|i| *i)
341 }
342
343 fn try_find_iter<F, E>(
344 &self,
345 haystack: &[u8],
346 mut matched: F,
347 ) -> Result<Result<(), E>, Error>
348 where
349 F: FnMut(Match) -> Result<bool, E>,
350 {
351 for result in self.regex.find_iter(haystack) {
352 let m = result.map_err(Error::regex)?;
353 match matched(Match::new(m.start(), m.end())) {
354 Ok(true) => continue,
355 Ok(false) => return Ok(Ok(())),
356 Err(err) => return Ok(Err(err)),
357 }
358 }
359 Ok(Ok(()))
360 }
361
362 fn captures_at(
363 &self,
364 haystack: &[u8],
365 at: usize,
366 caps: &mut RegexCaptures,
367 ) -> Result<bool, Error> {
368 Ok(self
369 .regex
370 .captures_read_at(&mut caps.locs, haystack, at)
371 .map_err(Error::regex)?
372 .is_some())
373 }
374}
375
376/// Represents the match offsets of each capturing group in a match.
377///
378/// The first, or `0`th capture group, always corresponds to the entire match
379/// and is guaranteed to be present when a match occurs. The next capture
380/// group, at index `1`, corresponds to the first capturing group in the regex,
381/// ordered by the position at which the left opening parenthesis occurs.
382///
383/// Note that not all capturing groups are guaranteed to be present in a match.
384/// For example, in the regex, `(?P<foo>\w)|(?P<bar>\W)`, only one of `foo`
385/// or `bar` will ever be set in any given match.
386///
387/// In order to access a capture group by name, you'll need to first find the
388/// index of the group using the corresponding matcher's `capture_index`
389/// method, and then use that index with `RegexCaptures::get`.
390#[derive(Clone, Debug)]
391pub struct RegexCaptures {
392 /// Where the locations are stored.
393 locs: CaptureLocations,
394}
395
396impl Captures for RegexCaptures {
397 fn len(&self) -> usize {
398 self.locs.len()
399 }
400
401 fn get(&self, i: usize) -> Option<Match> {
402 self.locs.get(i).map(|(s, e)| Match::new(s, e))
403 }
404}
405
406impl RegexCaptures {
407 pub(crate) fn new(locs: CaptureLocations) -> RegexCaptures {
408 RegexCaptures { locs }
409 }
410}
411
/// Reports whether the pattern contains an uppercase character that should
/// negate the effect of the smart-case option.
///
/// Ideally we would be able to check the AST in order to correctly handle
/// things like '\p{Ll}' and '\p{Lu}' (which should be treated as explicitly
/// cased), but PCRE doesn't expose enough details for that kind of analysis.
/// For now, our 'good enough' solution is to perform a semi-naïve scan of
/// the input pattern that ignores the single character following each '\'.
/// This at least lets us support the most common cases, like 'foo\w' and
/// 'foo\S', in an intuitive manner.
fn has_uppercase_literal(pattern: &str) -> bool {
    // Set when the previous character was an unescaped backslash, meaning
    // the current character is part of an escape and must not be examined.
    let mut escaped = false;
    for c in pattern.chars() {
        if escaped {
            escaped = false;
        } else if c == '\\' {
            escaped = true;
        } else if c.is_uppercase() {
            return true;
        }
    }
    false
}
433
#[cfg(test)]
mod tests {
    use grep_matcher::LineMatchKind;

    use super::*;

    // Word mode accepts a match whose neighbors are non-word characters,
    // which is not the same thing as surrounding the regex with `\b`.
    #[test]
    fn word() {
        let word_mode =
            RegexMatcherBuilder::new().word(true).build(r"-2").unwrap();
        assert!(word_mode.is_match(b"abc -2 foo").unwrap());

        let explicit_boundaries =
            RegexMatcherBuilder::new().word(false).build(r"\b-2\b").unwrap();
        assert!(!explicit_boundaries.is_match(b"abc -2 foo").unwrap());
    }

    // Enabling CRLF permits `$` to match at the end of a `\r\n` line.
    #[test]
    fn line_terminator_crlf() {
        // Without CRLF mode, `$` matches before `\n` but not before
        // `\r\n`.
        let lf_only =
            RegexMatcherBuilder::new().multi_line(true).build(r"abc$").unwrap();
        assert!(lf_only.is_match(b"abc\n").unwrap());
        assert!(!lf_only.is_match(b"abc\r\n").unwrap());

        // With CRLF mode, `$` also matches before `\r\n`.
        let crlf_aware = RegexMatcherBuilder::new()
            .multi_line(true)
            .crlf(true)
            .build(r"abc$")
            .unwrap();
        assert!(crlf_aware.is_match(b"abc\r\n").unwrap());
    }

    // Smart case is insensitive for an all-lowercase pattern and sensitive
    // as soon as the pattern contains an uppercase literal.
    #[test]
    fn case_smart() {
        let all_lowercase =
            RegexMatcherBuilder::new().case_smart(true).build(r"abc").unwrap();
        assert!(all_lowercase.is_match(b"ABC").unwrap());

        let mixed_case =
            RegexMatcherBuilder::new().case_smart(true).build(r"aBc").unwrap();
        assert!(!mixed_case.is_match(b"ABC").unwrap());
    }

    // Finding a candidate line reports a confirmed match for this matcher.
    #[test]
    fn candidate_lines() {
        let matcher = RegexMatcherBuilder::new().build(r"\wfoo\s").unwrap();
        let kind = matcher.find_candidate_line(b"afoo ").unwrap().unwrap();
        assert!(matches!(kind, LineMatchKind::Confirmed(_)));
    }
}
505}