grep_pcre2/matcher.rs
1use std::collections::HashMap;
2
3use {
4 grep_matcher::{Captures, Match, Matcher},
5 pcre2::bytes::{CaptureLocations, Regex, RegexBuilder},
6};
7
8use crate::error::Error;
9
10/// A builder for configuring the compilation of a PCRE2 regex.
11#[derive(Clone, Debug)]
12pub struct RegexMatcherBuilder {
13 builder: RegexBuilder,
14 case_smart: bool,
15 word: bool,
16 fixed_strings: bool,
17 whole_line: bool,
18}
19
20impl RegexMatcherBuilder {
21 /// Create a new matcher builder with a default configuration.
22 pub fn new() -> RegexMatcherBuilder {
23 RegexMatcherBuilder {
24 builder: RegexBuilder::new(),
25 case_smart: false,
26 word: false,
27 fixed_strings: false,
28 whole_line: false,
29 }
30 }
31
32 /// Compile the given pattern into a PCRE matcher using the current
33 /// configuration.
34 ///
35 /// If there was a problem compiling the pattern, then an error is
36 /// returned.
37 pub fn build(&self, pattern: &str) -> Result<RegexMatcher, Error> {
38 self.build_many(&[pattern])
39 }
40
41 /// Compile all of the given patterns into a single regex that matches when
42 /// at least one of the patterns matches.
43 ///
44 /// If there was a problem building the regex, then an error is returned.
45 pub fn build_many<P: AsRef<str>>(
46 &self,
47 patterns: &[P],
48 ) -> Result<RegexMatcher, Error> {
49 let mut builder = self.builder.clone();
50 let mut pats = Vec::with_capacity(patterns.len());
51 for p in patterns.iter() {
52 pats.push(if self.fixed_strings {
53 format!("(?:{})", pcre2::escape(p.as_ref()))
54 } else {
55 format!("(?:{})", p.as_ref())
56 });
57 }
58 let mut singlepat = pats.join("|");
59 if self.case_smart && !has_uppercase_literal(&singlepat) {
60 builder.caseless(true);
61 }
62 if self.whole_line {
63 singlepat = format!(r"(?m:^)(?:{})(?m:$)", singlepat);
64 } else if self.word {
65 // We make this option exclusive with whole_line because when
66 // whole_line is enabled, all matches necessary fall on word
67 // boundaries. So this extra goop is strictly redundant.
68 singlepat = format!(r"(?<!\w)(?:{})(?!\w)", singlepat);
69 }
70 log::trace!("final regex: {:?}", singlepat);
71 builder.build(&singlepat).map_err(Error::regex).map(|regex| {
72 let mut names = HashMap::new();
73 for (i, name) in regex.capture_names().iter().enumerate() {
74 if let Some(ref name) = *name {
75 names.insert(name.to_string(), i);
76 }
77 }
78 RegexMatcher { regex, names }
79 })
80 }
81
82 /// Enables case insensitive matching.
83 ///
84 /// If the `utf` option is also set, then Unicode case folding is used
85 /// to determine case insensitivity. When the `utf` option is not set,
86 /// then only standard ASCII case insensitivity is considered.
87 ///
88 /// This option corresponds to the `i` flag.
89 pub fn caseless(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
90 self.builder.caseless(yes);
91 self
92 }
93
94 /// Whether to enable "smart case" or not.
95 ///
96 /// When smart case is enabled, the builder will automatically enable
97 /// case insensitive matching based on how the pattern is written. Namely,
98 /// case insensitive mode is enabled when both of the following things
99 /// are believed to be true:
100 ///
101 /// 1. The pattern contains at least one literal character. For example,
102 /// `a\w` contains a literal (`a`) but `\w` does not.
103 /// 2. Of the literals in the pattern, none of them are considered to be
104 /// uppercase according to Unicode. For example, `foo\pL` has no
105 /// uppercase literals but `Foo\pL` does.
106 ///
107 /// Note that the implementation of this is not perfect. Namely, `\p{Ll}`
108 /// will prevent case insensitive matching even though it is part of a meta
109 /// sequence. This bug will probably never be fixed.
110 pub fn case_smart(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
111 self.case_smart = yes;
112 self
113 }
114
115 /// Enables "dot all" matching.
116 ///
117 /// When enabled, the `.` metacharacter in the pattern matches any
118 /// character, include `\n`. When disabled (the default), `.` will match
119 /// any character except for `\n`.
120 ///
121 /// This option corresponds to the `s` flag.
122 pub fn dotall(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
123 self.builder.dotall(yes);
124 self
125 }
126
127 /// Enable "extended" mode in the pattern, where whitespace is ignored.
128 ///
129 /// This option corresponds to the `x` flag.
130 pub fn extended(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
131 self.builder.extended(yes);
132 self
133 }
134
135 /// Enable multiline matching mode.
136 ///
137 /// When enabled, the `^` and `$` anchors will match both at the beginning
138 /// and end of a subject string, in addition to matching at the start of
139 /// a line and the end of a line. When disabled, the `^` and `$` anchors
140 /// will only match at the beginning and end of a subject string.
141 ///
142 /// This option corresponds to the `m` flag.
143 pub fn multi_line(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
144 self.builder.multi_line(yes);
145 self
146 }
147
148 /// Enable matching of CRLF as a line terminator.
149 ///
150 /// When enabled, anchors such as `^` and `$` will match any of the
151 /// following as a line terminator: `\r`, `\n` or `\r\n`.
152 ///
153 /// This is disabled by default, in which case, only `\n` is recognized as
154 /// a line terminator.
155 pub fn crlf(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
156 self.builder.crlf(yes);
157 self
158 }
159
160 /// Require that all matches occur on word boundaries.
161 ///
162 /// Enabling this option is subtly different than putting `\b` assertions
163 /// on both sides of your pattern. In particular, a `\b` assertion requires
164 /// that one side of it match a word character while the other match a
165 /// non-word character. This option, in contrast, merely requires that
166 /// one side match a non-word character.
167 ///
168 /// For example, `\b-2\b` will not match `foo -2 bar` since `-` is not a
169 /// word character. However, `-2` with this `word` option enabled will
170 /// match the `-2` in `foo -2 bar`.
171 pub fn word(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
172 self.word = yes;
173 self
174 }
175
176 /// Whether the patterns should be treated as literal strings or not. When
177 /// this is active, all characters, including ones that would normally be
178 /// special regex meta characters, are matched literally.
179 pub fn fixed_strings(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
180 self.fixed_strings = yes;
181 self
182 }
183
184 /// Whether each pattern should match the entire line or not. This is
185 /// equivalent to surrounding the pattern with `(?m:^)` and `(?m:$)`.
186 pub fn whole_line(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
187 self.whole_line = yes;
188 self
189 }
190
191 /// Enable Unicode matching mode.
192 ///
193 /// When enabled, the following patterns become Unicode aware: `\b`, `\B`,
194 /// `\d`, `\D`, `\s`, `\S`, `\w`, `\W`.
195 ///
196 /// When set, this implies UTF matching mode. It is not possible to enable
197 /// Unicode matching mode without enabling UTF matching mode.
198 ///
199 /// This is disabled by default.
200 pub fn ucp(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
201 self.builder.ucp(yes);
202 self
203 }
204
205 /// Enable UTF matching mode.
206 ///
207 /// When enabled, characters are treated as sequences of code units that
208 /// make up a single codepoint instead of as single bytes. For example,
209 /// this will cause `.` to match any single UTF-8 encoded codepoint, where
210 /// as when this is disabled, `.` will any single byte (except for `\n` in
211 /// both cases, unless "dot all" mode is enabled).
212 ///
213 /// Note that when UTF matching mode is enabled, every search performed
214 /// will do a UTF-8 validation check, which can impact performance. The
215 /// UTF-8 check can be disabled via the `disable_utf_check` option, but it
216 /// is undefined behavior to enable UTF matching mode and search invalid
217 /// UTF-8.
218 ///
219 /// This is disabled by default.
220 pub fn utf(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
221 self.builder.utf(yes);
222 self
223 }
224
225 /// This is now deprecated and is a no-op.
226 ///
227 /// Previously, this option permitted disabling PCRE2's UTF-8 validity
228 /// check, which could result in undefined behavior if the haystack was
229 /// not valid UTF-8. But PCRE2 introduced a new option, `PCRE2_MATCH_INVALID_UTF`,
230 /// in 10.34 which this crate always sets. When this option is enabled,
231 /// PCRE2 claims to not have undefined behavior when the haystack is
232 /// invalid UTF-8.
233 ///
234 /// Therefore, disabling the UTF-8 check is not something that is exposed
235 /// by this crate.
236 #[deprecated(
237 since = "0.2.4",
238 note = "now a no-op due to new PCRE2 features"
239 )]
240 pub fn disable_utf_check(&mut self) -> &mut RegexMatcherBuilder {
241 self
242 }
243
244 /// Enable PCRE2's JIT and return an error if it's not available.
245 ///
246 /// This generally speeds up matching quite a bit. The downside is that it
247 /// can increase the time it takes to compile a pattern.
248 ///
249 /// If the JIT isn't available or if JIT compilation returns an error, then
250 /// regex compilation will fail with the corresponding error.
251 ///
252 /// This is disabled by default, and always overrides `jit_if_available`.
253 pub fn jit(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
254 self.builder.jit(yes);
255 self
256 }
257
258 /// Enable PCRE2's JIT if it's available.
259 ///
260 /// This generally speeds up matching quite a bit. The downside is that it
261 /// can increase the time it takes to compile a pattern.
262 ///
263 /// If the JIT isn't available or if JIT compilation returns an error,
264 /// then a debug message with the error will be emitted and the regex will
265 /// otherwise silently fall back to non-JIT matching.
266 ///
267 /// This is disabled by default, and always overrides `jit`.
268 pub fn jit_if_available(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
269 self.builder.jit_if_available(yes);
270 self
271 }
272
273 /// Set the maximum size of PCRE2's JIT stack, in bytes. If the JIT is
274 /// not enabled, then this has no effect.
275 ///
276 /// When `None` is given, no custom JIT stack will be created, and instead,
277 /// the default JIT stack is used. When the default is used, its maximum
278 /// size is 32 KB.
279 ///
280 /// When this is set, then a new JIT stack will be created with the given
281 /// maximum size as its limit.
282 ///
283 /// Increasing the stack size can be useful for larger regular expressions.
284 ///
285 /// By default, this is set to `None`.
286 pub fn max_jit_stack_size(
287 &mut self,
288 bytes: Option<usize>,
289 ) -> &mut RegexMatcherBuilder {
290 self.builder.max_jit_stack_size(bytes);
291 self
292 }
293}
294
295/// An implementation of the `Matcher` trait using PCRE2.
296#[derive(Clone, Debug)]
297pub struct RegexMatcher {
298 regex: Regex,
299 names: HashMap<String, usize>,
300}
301
302impl RegexMatcher {
303 /// Create a new matcher from the given pattern using the default
304 /// configuration.
305 pub fn new(pattern: &str) -> Result<RegexMatcher, Error> {
306 RegexMatcherBuilder::new().build(pattern)
307 }
308}
309
310impl Matcher for RegexMatcher {
311 type Captures = RegexCaptures;
312 type Error = Error;
313
314 fn find_at(
315 &self,
316 haystack: &[u8],
317 at: usize,
318 ) -> Result<Option<Match>, Error> {
319 Ok(self
320 .regex
321 .find_at(haystack, at)
322 .map_err(Error::regex)?
323 .map(|m| Match::new(m.start(), m.end())))
324 }
325
326 fn new_captures(&self) -> Result<RegexCaptures, Error> {
327 Ok(RegexCaptures::new(self.regex.capture_locations()))
328 }
329
330 fn capture_count(&self) -> usize {
331 self.regex.captures_len()
332 }
333
334 fn capture_index(&self, name: &str) -> Option<usize> {
335 self.names.get(name).map(|i| *i)
336 }
337
338 fn try_find_iter<F, E>(
339 &self,
340 haystack: &[u8],
341 mut matched: F,
342 ) -> Result<Result<(), E>, Error>
343 where
344 F: FnMut(Match) -> Result<bool, E>,
345 {
346 for result in self.regex.find_iter(haystack) {
347 let m = result.map_err(Error::regex)?;
348 match matched(Match::new(m.start(), m.end())) {
349 Ok(true) => continue,
350 Ok(false) => return Ok(Ok(())),
351 Err(err) => return Ok(Err(err)),
352 }
353 }
354 Ok(Ok(()))
355 }
356
357 fn captures_at(
358 &self,
359 haystack: &[u8],
360 at: usize,
361 caps: &mut RegexCaptures,
362 ) -> Result<bool, Error> {
363 Ok(self
364 .regex
365 .captures_read_at(&mut caps.locs, haystack, at)
366 .map_err(Error::regex)?
367 .is_some())
368 }
369}
370
371/// Represents the match offsets of each capturing group in a match.
372///
373/// The first, or `0`th capture group, always corresponds to the entire match
374/// and is guaranteed to be present when a match occurs. The next capture
375/// group, at index `1`, corresponds to the first capturing group in the regex,
376/// ordered by the position at which the left opening parenthesis occurs.
377///
378/// Note that not all capturing groups are guaranteed to be present in a match.
379/// For example, in the regex, `(?P<foo>\w)|(?P<bar>\W)`, only one of `foo`
380/// or `bar` will ever be set in any given match.
381///
382/// In order to access a capture group by name, you'll need to first find the
383/// index of the group using the corresponding matcher's `capture_index`
384/// method, and then use that index with `RegexCaptures::get`.
385#[derive(Clone, Debug)]
386pub struct RegexCaptures {
387 /// Where the locations are stored.
388 locs: CaptureLocations,
389}
390
391impl Captures for RegexCaptures {
392 fn len(&self) -> usize {
393 self.locs.len()
394 }
395
396 fn get(&self, i: usize) -> Option<Match> {
397 self.locs.get(i).map(|(s, e)| Match::new(s, e))
398 }
399}
400
401impl RegexCaptures {
402 pub(crate) fn new(locs: CaptureLocations) -> RegexCaptures {
403 RegexCaptures { locs }
404 }
405}
406
/// Determine whether the pattern contains an uppercase character which should
/// negate the effect of the smart-case option.
///
/// Ideally we would be able to check the AST in order to correctly handle
/// things like '\p{Ll}' and '\p{Lu}' (which should be treated as explicitly
/// cased), but PCRE doesn't expose enough details for that kind of analysis.
/// For now, our 'good enough' solution is to simply perform a semi-naïve
/// scan of the input pattern with two exceptions:
///
/// * The character immediately following a '\' is ignored. This at least
///   lets us support the most common cases, like 'foo\w' and 'foo\S', in an
///   intuitive manner.
/// * The 'P' in the Python-style group syntax '(?P<', '(?P=' and '(?P>' is
///   ignored, since it is part of the group syntax and not a literal.
///   Without this, any pattern using '(?P<name>...)' would never be treated
///   as lowercase.
///
/// Everything else, including the names of capture groups, is still scanned
/// as if it were a literal.
fn has_uppercase_literal(pattern: &str) -> bool {
    let mut chars = pattern.chars();
    while let Some(c) = chars.next() {
        match c {
            '\\' => {
                // Skip whatever is escaped, be it a meta sequence like `\W`
                // or an escaped literal.
                chars.next();
            }
            '(' => {
                // Only skip `?P` when it is followed by one of the three
                // characters that make it group syntax. The scan resumes at
                // that following character.
                if let Some(tail) = chars.as_str().strip_prefix("?P") {
                    let is_group_syntax = tail.starts_with(|next: char| {
                        matches!(next, '<' | '=' | '>')
                    });
                    if is_group_syntax {
                        chars = tail.chars();
                    }
                }
            }
            _ if c.is_uppercase() => return true,
            _ => {}
        }
    }
    false
}
428
#[cfg(test)]
mod tests {
    use grep_matcher::LineMatchKind;

    use super::*;

    // The `word` option accepts a match that is merely not adjacent to word
    // characters, whereas wrapping the same regex in `\b` does not.
    #[test]
    fn word() {
        let haystack: &[u8] = b"abc -2 foo";

        let mut with_option = RegexMatcherBuilder::new();
        with_option.word(true);
        let matcher = with_option.build(r"-2").unwrap();
        assert!(matcher.is_match(haystack).unwrap());

        let mut with_assertions = RegexMatcherBuilder::new();
        with_assertions.word(false);
        let matcher = with_assertions.build(r"\b-2\b").unwrap();
        assert!(!matcher.is_match(haystack).unwrap());
    }

    // Enabling CRLF permits `$` to match at the end of a `\r\n` terminated
    // line.
    #[test]
    fn line_terminator_crlf() {
        // Builds a multi-line matcher for `abc$`, turning on CRLF handling
        // only when asked to.
        fn multi_line_matcher(crlf: bool) -> RegexMatcher {
            let mut builder = RegexMatcherBuilder::new();
            builder.multi_line(true);
            if crlf {
                builder.crlf(true);
            }
            builder.build(r"abc$").unwrap()
        }

        // Normal use of `$` with a `\n` line terminator.
        let lf_only = multi_line_matcher(false);
        assert!(lf_only.is_match(b"abc\n").unwrap());

        // `$` doesn't match at a `\r\n` boundary normally.
        let lf_only = multi_line_matcher(false);
        assert!(!lf_only.is_match(b"abc\r\n").unwrap());

        // With CRLF handling, it does.
        let with_crlf = multi_line_matcher(true);
        assert!(with_crlf.is_match(b"abc\r\n").unwrap());
    }

    // Smart case is insensitive for an all-lowercase pattern and sensitive
    // once the pattern contains an uppercase literal.
    #[test]
    fn case_smart() {
        fn smart_matcher(pattern: &str) -> RegexMatcher {
            let mut builder = RegexMatcherBuilder::new();
            builder.case_smart(true);
            builder.build(pattern).unwrap()
        }

        let lowercase = smart_matcher(r"abc");
        assert!(lowercase.is_match(b"ABC").unwrap());

        let mixed_case = smart_matcher(r"aBc");
        assert!(!mixed_case.is_match(b"ABC").unwrap());
    }

    // Finding candidate lines reports a confirmed match.
    #[test]
    fn candidate_lines() {
        let matcher = RegexMatcherBuilder::new().build(r"\wfoo\s").unwrap();
        let kind = matcher.find_candidate_line(b"afoo ").unwrap().unwrap();
        assert!(matches!(kind, LineMatchKind::Confirmed(_)));
    }
}