grep_matcher/lib.rs
1/*!
2This crate provides an interface for regular expressions, with a focus on line
3oriented search. The purpose of this crate is to provide a low level matching
4interface that permits any kind of substring or regex implementation to power
5the search routines provided by the
6[`grep-searcher`](https://docs.rs/grep-searcher)
7crate.
8
9The primary thing provided by this crate is the [`Matcher`] trait. The trait
10defines an abstract interface for text search. It is robust enough to support
11everything from basic substring search all the way to arbitrarily complex
12regular expression implementations without sacrificing performance.
13
14A key design decision made in this crate is the use of *internal iteration*,
15or otherwise known as the "push" model of searching. In this paradigm,
16implementations of the `Matcher` trait will drive search and execute callbacks
17provided by the caller when a match is found. This is in contrast to the
18usual style of *external iteration* (the "pull" model) found throughout the
19Rust ecosystem. There are two primary reasons why internal iteration was
20chosen:
21
22* Some search implementations may themselves require internal iteration.
23 Converting an internal iterator to an external iterator can be non-trivial
24 and sometimes even practically impossible.
25* Rust's type system isn't quite expressive enough to write a generic interface
26 using external iteration without giving something else up (namely, ease of
27 use and/or performance).
28
29In other words, internal iteration was chosen because it is the lowest common
30denominator and because it is probably the least bad way of expressing the
31interface in today's Rust. As a result, this trait isn't specifically intended
32for everyday use, although, you might find it to be a happy price to pay if you
33want to write code that is generic over multiple different regex
34implementations.
35*/
36
37#![deny(missing_docs)]
38
39use crate::interpolate::interpolate;
40
41mod interpolate;
42
43/// The type of a match.
44///
45/// The type of a match is a possibly empty range pointing to a contiguous
46/// block of addressable memory.
47///
48/// Every `Match` is guaranteed to satisfy the invariant that `start <= end`.
49///
50/// # Indexing
51///
52/// This type is structurally identical to `std::ops::Range<usize>`, but
53/// is a bit more ergonomic for dealing with match indices. In particular,
54/// this type implements `Copy` and provides methods for building new `Match`
55/// values based on old `Match` values. Finally, the invariant that `start`
56/// is always less than or equal to `end` is enforced.
57///
58/// A `Match` can be used to slice a `&[u8]`, `&mut [u8]` or `&str` using
59/// range notation. e.g.,
60///
61/// ```
62/// use grep_matcher::Match;
63///
64/// let m = Match::new(2, 5);
65/// let bytes = b"abcdefghi";
66/// assert_eq!(b"cde", &bytes[m]);
67/// ```
68#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
69pub struct Match {
70 start: usize,
71 end: usize,
72}
73
74impl Match {
75 /// Create a new match.
76 ///
77 /// # Panics
78 ///
79 /// This function panics if `start > end`.
80 #[inline]
81 pub fn new(start: usize, end: usize) -> Match {
82 assert!(start <= end);
83 Match { start, end }
84 }
85
86 /// Creates a zero width match at the given offset.
87 #[inline]
88 pub fn zero(offset: usize) -> Match {
89 Match { start: offset, end: offset }
90 }
91
92 /// Return the start offset of this match.
93 #[inline]
94 pub fn start(&self) -> usize {
95 self.start
96 }
97
98 /// Return the end offset of this match.
99 #[inline]
100 pub fn end(&self) -> usize {
101 self.end
102 }
103
104 /// Return a new match with the start offset replaced with the given
105 /// value.
106 ///
107 /// # Panics
108 ///
109 /// This method panics if `start > self.end`.
110 #[inline]
111 pub fn with_start(&self, start: usize) -> Match {
112 assert!(start <= self.end, "{} is not <= {}", start, self.end);
113 Match { start, ..*self }
114 }
115
116 /// Return a new match with the end offset replaced with the given
117 /// value.
118 ///
119 /// # Panics
120 ///
121 /// This method panics if `self.start > end`.
122 #[inline]
123 pub fn with_end(&self, end: usize) -> Match {
124 assert!(self.start <= end, "{} is not <= {}", self.start, end);
125 Match { end, ..*self }
126 }
127
128 /// Offset this match by the given amount and return a new match.
129 ///
130 /// This adds the given offset to the start and end of this match, and
131 /// returns the resulting match.
132 ///
133 /// # Panics
134 ///
135 /// This panics if adding the given amount to either the start or end
136 /// offset would result in an overflow.
137 #[inline]
138 pub fn offset(&self, amount: usize) -> Match {
139 Match {
140 start: self.start.checked_add(amount).unwrap(),
141 end: self.end.checked_add(amount).unwrap(),
142 }
143 }
144
145 /// Returns the number of bytes in this match.
146 #[inline]
147 pub fn len(&self) -> usize {
148 self.end - self.start
149 }
150
151 /// Returns true if and only if this match is empty.
152 #[inline]
153 pub fn is_empty(&self) -> bool {
154 self.len() == 0
155 }
156}
157
158impl std::ops::Index<Match> for [u8] {
159 type Output = [u8];
160
161 #[inline]
162 fn index(&self, index: Match) -> &[u8] {
163 &self[index.start..index.end]
164 }
165}
166
167impl std::ops::IndexMut<Match> for [u8] {
168 #[inline]
169 fn index_mut(&mut self, index: Match) -> &mut [u8] {
170 &mut self[index.start..index.end]
171 }
172}
173
174impl std::ops::Index<Match> for str {
175 type Output = str;
176
177 #[inline]
178 fn index(&self, index: Match) -> &str {
179 &self[index.start..index.end]
180 }
181}
182
183/// A line terminator.
184///
185/// A line terminator represents the end of a line. Generally, every line is
186/// either "terminated" by the end of a stream or a specific byte (or sequence
187/// of bytes).
188///
189/// Generally, a line terminator is a single byte, specifically, `\n`, on
190/// Unix-like systems. On Windows, a line terminator is `\r\n` (referred to
191/// as `CRLF` for `Carriage Return; Line Feed`).
192///
193/// The default line terminator is `\n` on all platforms.
194#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
195pub struct LineTerminator(LineTerminatorImp);
196
197#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
198enum LineTerminatorImp {
199 /// Any single byte representing a line terminator.
200 Byte(u8),
201 /// A line terminator represented by `\r\n`.
202 ///
203 /// When this option is used, consumers may generally treat a lone `\n` as
204 /// a line terminator in addition to `\r\n`.
205 CRLF,
206}
207
208impl LineTerminator {
209 /// Return a new single-byte line terminator. Any byte is valid.
210 #[inline]
211 pub fn byte(byte: u8) -> LineTerminator {
212 LineTerminator(LineTerminatorImp::Byte(byte))
213 }
214
215 /// Return a new line terminator represented by `\r\n`.
216 ///
217 /// When this option is used, consumers may generally treat a lone `\n` as
218 /// a line terminator in addition to `\r\n`.
219 #[inline]
220 pub fn crlf() -> LineTerminator {
221 LineTerminator(LineTerminatorImp::CRLF)
222 }
223
224 /// Returns true if and only if this line terminator is CRLF.
225 #[inline]
226 pub fn is_crlf(&self) -> bool {
227 self.0 == LineTerminatorImp::CRLF
228 }
229
230 /// Returns this line terminator as a single byte.
231 ///
232 /// If the line terminator is CRLF, then this returns `\n`. This is
233 /// useful for routines that, for example, find line boundaries by treating
234 /// `\n` as a line terminator even when it isn't preceded by `\r`.
235 #[inline]
236 pub fn as_byte(&self) -> u8 {
237 match self.0 {
238 LineTerminatorImp::Byte(byte) => byte,
239 LineTerminatorImp::CRLF => b'\n',
240 }
241 }
242
243 /// Returns this line terminator as a sequence of bytes.
244 ///
245 /// This returns a singleton sequence for all line terminators except for
246 /// `CRLF`, in which case, it returns `\r\n`.
247 ///
248 /// The slice returned is guaranteed to have length at least `1`.
249 #[inline]
250 pub fn as_bytes(&self) -> &[u8] {
251 match self.0 {
252 LineTerminatorImp::Byte(ref byte) => std::slice::from_ref(byte),
253 LineTerminatorImp::CRLF => &[b'\r', b'\n'],
254 }
255 }
256
257 /// Returns true if and only if the given slice ends with this line
258 /// terminator.
259 ///
260 /// If this line terminator is `CRLF`, then this only checks whether the
261 /// last byte is `\n`.
262 #[inline]
263 pub fn is_suffix(&self, slice: &[u8]) -> bool {
264 slice.last().map_or(false, |&b| b == self.as_byte())
265 }
266}
267
268impl Default for LineTerminator {
269 #[inline]
270 fn default() -> LineTerminator {
271 LineTerminator::byte(b'\n')
272 }
273}
274
275/// A set of bytes.
276///
277/// In this crate, byte sets are used to express bytes that can never appear
278/// anywhere in a match for a particular implementation of the `Matcher` trait.
279/// Specifically, if such a set can be determined, then it's possible for
280/// callers to perform additional operations on the basis that certain bytes
281/// may never match.
282///
283/// For example, if a search is configured to possibly produce results that
284/// span multiple lines but a caller provided pattern can never match across
285/// multiple lines, then it may make sense to divert to more optimized line
286/// oriented routines that don't need to handle the multi-line match case.
287#[derive(Clone, Debug)]
288pub struct ByteSet(BitSet);
289
290#[derive(Clone, Copy)]
291struct BitSet([u64; 4]);
292
293impl std::fmt::Debug for BitSet {
294 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
295 let mut fmtd = f.debug_set();
296 for b in 0..=255 {
297 if ByteSet(*self).contains(b) {
298 fmtd.entry(&b);
299 }
300 }
301 fmtd.finish()
302 }
303}
304
305impl ByteSet {
306 /// Create an empty set of bytes.
307 #[inline]
308 pub fn empty() -> ByteSet {
309 ByteSet(BitSet([0; 4]))
310 }
311
312 /// Create a full set of bytes such that every possible byte is in the set
313 /// returned.
314 #[inline]
315 pub fn full() -> ByteSet {
316 ByteSet(BitSet([u64::MAX; 4]))
317 }
318
319 /// Add a byte to this set.
320 ///
321 /// If the given byte already belongs to this set, then this is a no-op.
322 #[inline]
323 pub fn add(&mut self, byte: u8) {
324 let bucket = byte / 64;
325 let bit = byte % 64;
326 (self.0).0[usize::from(bucket)] |= 1 << bit;
327 }
328
329 /// Add an inclusive range of bytes.
330 #[inline]
331 pub fn add_all(&mut self, start: u8, end: u8) {
332 for b in start..=end {
333 self.add(b);
334 }
335 }
336
337 /// Remove a byte from this set.
338 ///
339 /// If the given byte is not in this set, then this is a no-op.
340 #[inline]
341 pub fn remove(&mut self, byte: u8) {
342 let bucket = byte / 64;
343 let bit = byte % 64;
344 (self.0).0[usize::from(bucket)] &= !(1 << bit);
345 }
346
347 /// Remove an inclusive range of bytes.
348 #[inline]
349 pub fn remove_all(&mut self, start: u8, end: u8) {
350 for b in start..=end {
351 self.remove(b);
352 }
353 }
354
355 /// Return true if and only if the given byte is in this set.
356 #[inline]
357 pub fn contains(&self, byte: u8) -> bool {
358 let bucket = byte / 64;
359 let bit = byte % 64;
360 (self.0).0[usize::from(bucket)] & (1 << bit) > 0
361 }
362}
363
364/// A trait that describes implementations of capturing groups.
365///
366/// When a matcher supports capturing group extraction, then it is the
367/// matcher's responsibility to provide an implementation of this trait.
368///
369/// Principally, this trait provides a way to access capturing groups
370/// in a uniform way that does not require any specific representation.
371/// Namely, different matcher implementations may require different in-memory
372/// representations of capturing groups. This trait permits matchers to
373/// maintain their specific in-memory representation.
374///
375/// Note that this trait explicitly does not provide a way to construct a new
376/// capture value. Instead, it is the responsibility of a `Matcher` to build
377/// one, which might require knowledge of the matcher's internal implementation
378/// details.
379pub trait Captures {
380 /// Return the total number of capturing groups. This includes capturing
381 /// groups that have not matched anything.
382 fn len(&self) -> usize;
383
384 /// Return the capturing group match at the given index. If no match of
385 /// that capturing group exists, then this returns `None`.
386 ///
387 /// When a matcher reports a match with capturing groups, then the first
388 /// capturing group (at index `0`) must always correspond to the offsets
389 /// for the overall match.
390 fn get(&self, i: usize) -> Option<Match>;
391
392 /// Return the overall match for the capture.
393 ///
394 /// This returns the match for index `0`. That is it is equivalent to
395 /// `get(0).unwrap()`
396 #[inline]
397 fn as_match(&self) -> Match {
398 self.get(0).unwrap()
399 }
400
401 /// Returns true if and only if these captures are empty. This occurs
402 /// when `len` is `0`.
403 ///
404 /// Note that capturing groups that have non-zero length but otherwise
405 /// contain no matching groups are *not* empty.
406 #[inline]
407 fn is_empty(&self) -> bool {
408 self.len() == 0
409 }
410
411 /// Expands all instances of `$name` in `replacement` to the corresponding
412 /// capture group `name`, and writes them to the `dst` buffer given.
413 ///
414 /// (Note: If you're looking for a convenient way to perform replacements
415 /// with interpolation, then you'll want to use the `replace_with_captures`
416 /// method on the `Matcher` trait.)
417 ///
418 /// `name` may be an integer corresponding to the index of the
419 /// capture group (counted by order of opening parenthesis where `0` is the
420 /// entire match) or it can be a name (consisting of letters, digits or
421 /// underscores) corresponding to a named capture group.
422 ///
423 /// A `name` is translated to a capture group index via the given
424 /// `name_to_index` function. If `name` isn't a valid capture group
425 /// (whether the name doesn't exist or isn't a valid index), then it is
426 /// replaced with the empty string.
427 ///
428 /// The longest possible name is used. e.g., `$1a` looks up the capture
429 /// group named `1a` and not the capture group at index `1`. To exert
430 /// more precise control over the name, use braces, e.g., `${1}a`. In all
431 /// cases, capture group names are limited to ASCII letters, numbers and
432 /// underscores.
433 ///
434 /// To write a literal `$` use `$$`.
435 ///
436 /// Note that the capture group match indices are resolved by slicing
437 /// the given `haystack`. Generally, this means that `haystack` should be
438 /// the same slice that was searched to get the current capture group
439 /// matches.
440 #[inline]
441 fn interpolate<F>(
442 &self,
443 name_to_index: F,
444 haystack: &[u8],
445 replacement: &[u8],
446 dst: &mut Vec<u8>,
447 ) where
448 F: FnMut(&str) -> Option<usize>,
449 {
450 interpolate(
451 replacement,
452 |i, dst| {
453 if let Some(range) = self.get(i) {
454 dst.extend(&haystack[range]);
455 }
456 },
457 name_to_index,
458 dst,
459 )
460 }
461}
462
463/// NoCaptures provides an always-empty implementation of the `Captures` trait.
464///
465/// This type is useful for implementations of `Matcher` that don't support
466/// capturing groups.
467#[derive(Clone, Debug)]
468pub struct NoCaptures(());
469
470impl NoCaptures {
471 /// Create an empty set of capturing groups.
472 #[inline]
473 pub fn new() -> NoCaptures {
474 NoCaptures(())
475 }
476}
477
478impl Captures for NoCaptures {
479 #[inline]
480 fn len(&self) -> usize {
481 0
482 }
483
484 #[inline]
485 fn get(&self, _: usize) -> Option<Match> {
486 None
487 }
488}
489
490/// NoError provides an error type for matchers that never produce errors.
491///
492/// This error type implements the `std::error::Error` and `std::fmt::Display`
493/// traits for use in matcher implementations that can never produce errors.
494///
495/// The `std::fmt::Debug` and `std::fmt::Display` impls for this type panics.
496#[derive(Debug, Eq, PartialEq)]
497pub struct NoError(());
498
499impl std::error::Error for NoError {
500 fn description(&self) -> &str {
501 "no error"
502 }
503}
504
505impl std::fmt::Display for NoError {
506 fn fmt(&self, _: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
507 panic!("BUG for NoError: an impossible error occurred")
508 }
509}
510
511impl From<NoError> for std::io::Error {
512 fn from(_: NoError) -> std::io::Error {
513 panic!("BUG for NoError: an impossible error occurred")
514 }
515}
516
517/// The type of match for a line oriented matcher.
518#[derive(Clone, Copy, Debug)]
519pub enum LineMatchKind {
520 /// A position inside a line that is known to contain a match.
521 ///
522 /// This position can be anywhere in the line. It does not need to point
523 /// at the location of the match.
524 Confirmed(usize),
525 /// A position inside a line that may contain a match, and must be searched
526 /// for verification.
527 ///
528 /// This position can be anywhere in the line. It does not need to point
529 /// at the location of the match.
530 Candidate(usize),
531}
532
533/// A matcher defines an interface for regular expression implementations.
534///
535/// While this trait is large, there are only two required methods that
536/// implementors must provide: `find_at` and `new_captures`. If captures aren't
537/// supported by your implementation, then `new_captures` can be implemented
538/// with [`NoCaptures`]. If your implementation does support capture groups,
539/// then you should also implement the other capture related methods, as
540/// dictated by the documentation. Crucially, this includes `captures_at`.
541///
542/// The rest of the methods on this trait provide default implementations on
543/// top of `find_at` and `new_captures`. It is not uncommon for implementations
544/// to be able to provide faster variants of some methods; in those cases,
545/// simply override the default implementation.
546pub trait Matcher {
547 /// The concrete type of capturing groups used for this matcher.
548 ///
549 /// If this implementation does not support capturing groups, then set
550 /// this to `NoCaptures`.
551 type Captures: Captures;
552
553 /// The error type used by this matcher.
554 ///
555 /// For matchers in which an error is not possible, they are encouraged to
556 /// use the `NoError` type in this crate. In the future, when the "never"
557 /// (spelled `!`) type is stabilized, then it should probably be used
558 /// instead.
559 type Error: std::fmt::Display;
560
561 /// Returns the start and end byte range of the first match in `haystack`
562 /// after `at`, where the byte offsets are relative to that start of
563 /// `haystack` (and not `at`). If no match exists, then `None` is returned.
564 ///
565 /// The text encoding of `haystack` is not strictly specified. Matchers are
566 /// advised to assume UTF-8, or at worst, some ASCII compatible encoding.
567 ///
568 /// The significance of the starting point is that it takes the surrounding
569 /// context into consideration. For example, the `\A` anchor can only
570 /// match when `at == 0`.
571 fn find_at(
572 &self,
573 haystack: &[u8],
574 at: usize,
575 ) -> Result<Option<Match>, Self::Error>;
576
577 /// Creates an empty group of captures suitable for use with the capturing
578 /// APIs of this trait.
579 ///
580 /// Implementations that don't support capturing groups should use
581 /// the `NoCaptures` type and implement this method by calling
582 /// `NoCaptures::new()`.
583 fn new_captures(&self) -> Result<Self::Captures, Self::Error>;
584
585 /// Returns the total number of capturing groups in this matcher.
586 ///
587 /// If a matcher supports capturing groups, then this value must always be
588 /// at least 1, where the first capturing group always corresponds to the
589 /// overall match.
590 ///
591 /// If a matcher does not support capturing groups, then this should
592 /// always return 0.
593 ///
594 /// By default, capturing groups are not supported, so this always
595 /// returns 0.
596 #[inline]
597 fn capture_count(&self) -> usize {
598 0
599 }
600
601 /// Maps the given capture group name to its corresponding capture group
602 /// index, if one exists. If one does not exist, then `None` is returned.
603 ///
604 /// If the given capture group name maps to multiple indices, then it is
605 /// not specified which one is returned. However, it is guaranteed that
606 /// one of them is returned.
607 ///
608 /// By default, capturing groups are not supported, so this always returns
609 /// `None`.
610 #[inline]
611 fn capture_index(&self, _name: &str) -> Option<usize> {
612 None
613 }
614
615 /// Returns the start and end byte range of the first match in `haystack`.
616 /// If no match exists, then `None` is returned.
617 ///
618 /// The text encoding of `haystack` is not strictly specified. Matchers are
619 /// advised to assume UTF-8, or at worst, some ASCII compatible encoding.
620 #[inline]
621 fn find(&self, haystack: &[u8]) -> Result<Option<Match>, Self::Error> {
622 self.find_at(haystack, 0)
623 }
624
625 /// Executes the given function over successive non-overlapping matches
626 /// in `haystack`. If no match exists, then the given function is never
627 /// called. If the function returns `false`, then iteration stops.
628 #[inline]
629 fn find_iter<F>(
630 &self,
631 haystack: &[u8],
632 matched: F,
633 ) -> Result<(), Self::Error>
634 where
635 F: FnMut(Match) -> bool,
636 {
637 self.find_iter_at(haystack, 0, matched)
638 }
639
640 /// Executes the given function over successive non-overlapping matches
641 /// in `haystack`. If no match exists, then the given function is never
642 /// called. If the function returns `false`, then iteration stops.
643 ///
644 /// The significance of the starting point is that it takes the surrounding
645 /// context into consideration. For example, the `\A` anchor can only
646 /// match when `at == 0`.
647 #[inline]
648 fn find_iter_at<F>(
649 &self,
650 haystack: &[u8],
651 at: usize,
652 mut matched: F,
653 ) -> Result<(), Self::Error>
654 where
655 F: FnMut(Match) -> bool,
656 {
657 self.try_find_iter_at(haystack, at, |m| Ok(matched(m)))
658 .map(|r: Result<(), ()>| r.unwrap())
659 }
660
661 /// Executes the given function over successive non-overlapping matches
662 /// in `haystack`. If no match exists, then the given function is never
663 /// called. If the function returns `false`, then iteration stops.
664 /// Similarly, if the function returns an error then iteration stops and
665 /// the error is yielded. If an error occurs while executing the search,
666 /// then it is converted to
667 /// `E`.
668 #[inline]
669 fn try_find_iter<F, E>(
670 &self,
671 haystack: &[u8],
672 matched: F,
673 ) -> Result<Result<(), E>, Self::Error>
674 where
675 F: FnMut(Match) -> Result<bool, E>,
676 {
677 self.try_find_iter_at(haystack, 0, matched)
678 }
679
680 /// Executes the given function over successive non-overlapping matches
681 /// in `haystack`. If no match exists, then the given function is never
682 /// called. If the function returns `false`, then iteration stops.
683 /// Similarly, if the function returns an error then iteration stops and
684 /// the error is yielded. If an error occurs while executing the search,
685 /// then it is converted to
686 /// `E`.
687 ///
688 /// The significance of the starting point is that it takes the surrounding
689 /// context into consideration. For example, the `\A` anchor can only
690 /// match when `at == 0`.
691 #[inline]
692 fn try_find_iter_at<F, E>(
693 &self,
694 haystack: &[u8],
695 at: usize,
696 mut matched: F,
697 ) -> Result<Result<(), E>, Self::Error>
698 where
699 F: FnMut(Match) -> Result<bool, E>,
700 {
701 let mut last_end = at;
702 let mut last_match = None;
703
704 loop {
705 if last_end > haystack.len() {
706 return Ok(Ok(()));
707 }
708 let m = match self.find_at(haystack, last_end)? {
709 None => return Ok(Ok(())),
710 Some(m) => m,
711 };
712 if m.start == m.end {
713 // This is an empty match. To ensure we make progress, start
714 // the next search at the smallest possible starting position
715 // of the next match following this one.
716 last_end = m.end + 1;
717 // Don't accept empty matches immediately following a match.
718 // Just move on to the next match.
719 if Some(m.end) == last_match {
720 continue;
721 }
722 } else {
723 last_end = m.end;
724 }
725 last_match = Some(m.end);
726 match matched(m) {
727 Ok(true) => continue,
728 Ok(false) => return Ok(Ok(())),
729 Err(err) => return Ok(Err(err)),
730 }
731 }
732 }
733
734 /// Populates the first set of capture group matches from `haystack` into
735 /// `caps`. If no match exists, then `false` is returned.
736 ///
737 /// The text encoding of `haystack` is not strictly specified. Matchers are
738 /// advised to assume UTF-8, or at worst, some ASCII compatible encoding.
739 #[inline]
740 fn captures(
741 &self,
742 haystack: &[u8],
743 caps: &mut Self::Captures,
744 ) -> Result<bool, Self::Error> {
745 self.captures_at(haystack, 0, caps)
746 }
747
748 /// Executes the given function over successive non-overlapping matches
749 /// in `haystack` with capture groups extracted from each match. If no
750 /// match exists, then the given function is never called. If the function
751 /// returns `false`, then iteration stops.
752 #[inline]
753 fn captures_iter<F>(
754 &self,
755 haystack: &[u8],
756 caps: &mut Self::Captures,
757 matched: F,
758 ) -> Result<(), Self::Error>
759 where
760 F: FnMut(&Self::Captures) -> bool,
761 {
762 self.captures_iter_at(haystack, 0, caps, matched)
763 }
764
765 /// Executes the given function over successive non-overlapping matches
766 /// in `haystack` with capture groups extracted from each match. If no
767 /// match exists, then the given function is never called. If the function
768 /// returns `false`, then iteration stops.
769 ///
770 /// The significance of the starting point is that it takes the surrounding
771 /// context into consideration. For example, the `\A` anchor can only
772 /// match when `at == 0`.
773 #[inline]
774 fn captures_iter_at<F>(
775 &self,
776 haystack: &[u8],
777 at: usize,
778 caps: &mut Self::Captures,
779 mut matched: F,
780 ) -> Result<(), Self::Error>
781 where
782 F: FnMut(&Self::Captures) -> bool,
783 {
784 self.try_captures_iter_at(haystack, at, caps, |caps| Ok(matched(caps)))
785 .map(|r: Result<(), ()>| r.unwrap())
786 }
787
788 /// Executes the given function over successive non-overlapping matches
789 /// in `haystack` with capture groups extracted from each match. If no
790 /// match exists, then the given function is never called. If the function
791 /// returns `false`, then iteration stops. Similarly, if the function
792 /// returns an error then iteration stops and the error is yielded. If
793 /// an error occurs while executing the search, then it is converted to
794 /// `E`.
795 #[inline]
796 fn try_captures_iter<F, E>(
797 &self,
798 haystack: &[u8],
799 caps: &mut Self::Captures,
800 matched: F,
801 ) -> Result<Result<(), E>, Self::Error>
802 where
803 F: FnMut(&Self::Captures) -> Result<bool, E>,
804 {
805 self.try_captures_iter_at(haystack, 0, caps, matched)
806 }
807
808 /// Executes the given function over successive non-overlapping matches
809 /// in `haystack` with capture groups extracted from each match. If no
810 /// match exists, then the given function is never called. If the function
811 /// returns `false`, then iteration stops. Similarly, if the function
812 /// returns an error then iteration stops and the error is yielded. If
813 /// an error occurs while executing the search, then it is converted to
814 /// `E`.
815 ///
816 /// The significance of the starting point is that it takes the surrounding
817 /// context into consideration. For example, the `\A` anchor can only
818 /// match when `at == 0`.
819 #[inline]
820 fn try_captures_iter_at<F, E>(
821 &self,
822 haystack: &[u8],
823 at: usize,
824 caps: &mut Self::Captures,
825 mut matched: F,
826 ) -> Result<Result<(), E>, Self::Error>
827 where
828 F: FnMut(&Self::Captures) -> Result<bool, E>,
829 {
830 let mut last_end = at;
831 let mut last_match = None;
832
833 loop {
834 if last_end > haystack.len() {
835 return Ok(Ok(()));
836 }
837 if !self.captures_at(haystack, last_end, caps)? {
838 return Ok(Ok(()));
839 }
840 let m = caps.get(0).unwrap();
841 if m.start == m.end {
842 // This is an empty match. To ensure we make progress, start
843 // the next search at the smallest possible starting position
844 // of the next match following this one.
845 last_end = m.end + 1;
846 // Don't accept empty matches immediately following a match.
847 // Just move on to the next match.
848 if Some(m.end) == last_match {
849 continue;
850 }
851 } else {
852 last_end = m.end;
853 }
854 last_match = Some(m.end);
855 match matched(caps) {
856 Ok(true) => continue,
857 Ok(false) => return Ok(Ok(())),
858 Err(err) => return Ok(Err(err)),
859 }
860 }
861 }
862
863 /// Populates the first set of capture group matches from `haystack`
864 /// into `matches` after `at`, where the byte offsets in each capturing
865 /// group are relative to the start of `haystack` (and not `at`). If no
866 /// match exists, then `false` is returned and the contents of the given
867 /// capturing groups are unspecified.
868 ///
869 /// The text encoding of `haystack` is not strictly specified. Matchers are
870 /// advised to assume UTF-8, or at worst, some ASCII compatible encoding.
871 ///
872 /// The significance of the starting point is that it takes the surrounding
873 /// context into consideration. For example, the `\A` anchor can only
874 /// match when `at == 0`.
875 ///
876 /// By default, capturing groups aren't supported, and this implementation
877 /// will always behave as if a match were impossible.
878 ///
879 /// Implementors that provide support for capturing groups must guarantee
880 /// that when a match occurs, the first capture match (at index `0`) is
881 /// always set to the overall match offsets.
882 ///
883 /// Note that if implementors seek to support capturing groups, then they
884 /// should implement this method. Other methods that match based on
885 /// captures will then work automatically.
886 #[inline]
887 fn captures_at(
888 &self,
889 _haystack: &[u8],
890 _at: usize,
891 _caps: &mut Self::Captures,
892 ) -> Result<bool, Self::Error> {
893 Ok(false)
894 }
895
896 /// Replaces every match in the given haystack with the result of calling
897 /// `append`. `append` is given the start and end of a match, along with
898 /// a handle to the `dst` buffer provided.
899 ///
900 /// If the given `append` function returns `false`, then replacement stops.
901 #[inline]
902 fn replace<F>(
903 &self,
904 haystack: &[u8],
905 dst: &mut Vec<u8>,
906 mut append: F,
907 ) -> Result<(), Self::Error>
908 where
909 F: FnMut(Match, &mut Vec<u8>) -> bool,
910 {
911 let mut last_match = 0;
912 self.find_iter(haystack, |m| {
913 dst.extend(&haystack[last_match..m.start]);
914 last_match = m.end;
915 append(m, dst)
916 })?;
917 dst.extend(&haystack[last_match..]);
918 Ok(())
919 }
920
921 /// Replaces every match in the given haystack with the result of calling
922 /// `append` with the matching capture groups.
923 ///
924 /// If the given `append` function returns `false`, then replacement stops.
925 #[inline]
926 fn replace_with_captures<F>(
927 &self,
928 haystack: &[u8],
929 caps: &mut Self::Captures,
930 dst: &mut Vec<u8>,
931 append: F,
932 ) -> Result<(), Self::Error>
933 where
934 F: FnMut(&Self::Captures, &mut Vec<u8>) -> bool,
935 {
936 self.replace_with_captures_at(haystack, 0, caps, dst, append)
937 }
938
939 /// Replaces every match in the given haystack with the result of calling
940 /// `append` with the matching capture groups.
941 ///
942 /// If the given `append` function returns `false`, then replacement stops.
943 ///
944 /// The significance of the starting point is that it takes the surrounding
945 /// context into consideration. For example, the `\A` anchor can only
946 /// match when `at == 0`.
947 #[inline]
948 fn replace_with_captures_at<F>(
949 &self,
950 haystack: &[u8],
951 at: usize,
952 caps: &mut Self::Captures,
953 dst: &mut Vec<u8>,
954 mut append: F,
955 ) -> Result<(), Self::Error>
956 where
957 F: FnMut(&Self::Captures, &mut Vec<u8>) -> bool,
958 {
959 let mut last_match = at;
960 self.captures_iter_at(haystack, at, caps, |caps| {
961 let m = caps.get(0).unwrap();
962 dst.extend(&haystack[last_match..m.start]);
963 last_match = m.end;
964 append(caps, dst)
965 })?;
966 dst.extend(&haystack[last_match..]);
967 Ok(())
968 }
969
970 /// Returns true if and only if the matcher matches the given haystack.
971 ///
972 /// By default, this method is implemented by calling `shortest_match`.
973 #[inline]
974 fn is_match(&self, haystack: &[u8]) -> Result<bool, Self::Error> {
975 self.is_match_at(haystack, 0)
976 }
977
978 /// Returns true if and only if the matcher matches the given haystack
979 /// starting at the given position.
980 ///
981 /// By default, this method is implemented by calling `shortest_match_at`.
982 ///
983 /// The significance of the starting point is that it takes the surrounding
984 /// context into consideration. For example, the `\A` anchor can only
985 /// match when `at == 0`.
986 #[inline]
987 fn is_match_at(
988 &self,
989 haystack: &[u8],
990 at: usize,
991 ) -> Result<bool, Self::Error> {
992 Ok(self.shortest_match_at(haystack, at)?.is_some())
993 }
994
995 /// Returns an end location of the first match in `haystack`. If no match
996 /// exists, then `None` is returned.
997 ///
998 /// Note that the end location reported by this method may be less than the
999 /// same end location reported by `find`. For example, running `find` with
1000 /// the pattern `a+` on the haystack `aaa` should report a range of `[0,
1001 /// 3)`, but `shortest_match` may report `1` as the ending location since
1002 /// that is the place at which a match is guaranteed to occur.
1003 ///
1004 /// This method should never report false positives or false negatives. The
1005 /// point of this method is that some implementors may be able to provide
1006 /// a faster implementation of this than what `find` does.
1007 ///
1008 /// By default, this method is implemented by calling `find`.
1009 #[inline]
1010 fn shortest_match(
1011 &self,
1012 haystack: &[u8],
1013 ) -> Result<Option<usize>, Self::Error> {
1014 self.shortest_match_at(haystack, 0)
1015 }
1016
1017 /// Returns an end location of the first match in `haystack` starting at
1018 /// the given position. If no match exists, then `None` is returned.
1019 ///
1020 /// Note that the end location reported by this method may be less than the
1021 /// same end location reported by `find`. For example, running `find` with
1022 /// the pattern `a+` on the haystack `aaa` should report a range of `[0,
1023 /// 3)`, but `shortest_match` may report `1` as the ending location since
1024 /// that is the place at which a match is guaranteed to occur.
1025 ///
1026 /// This method should never report false positives or false negatives. The
1027 /// point of this method is that some implementors may be able to provide
1028 /// a faster implementation of this than what `find` does.
1029 ///
1030 /// By default, this method is implemented by calling `find_at`.
1031 ///
1032 /// The significance of the starting point is that it takes the surrounding
1033 /// context into consideration. For example, the `\A` anchor can only
1034 /// match when `at == 0`.
1035 #[inline]
1036 fn shortest_match_at(
1037 &self,
1038 haystack: &[u8],
1039 at: usize,
1040 ) -> Result<Option<usize>, Self::Error> {
1041 Ok(self.find_at(haystack, at)?.map(|m| m.end))
1042 }
1043
1044 /// If available, return a set of bytes that will never appear in a match
1045 /// produced by an implementation.
1046 ///
1047 /// Specifically, if such a set can be determined, then it's possible for
1048 /// callers to perform additional operations on the basis that certain
1049 /// bytes may never match.
1050 ///
1051 /// For example, if a search is configured to possibly produce results
1052 /// that span multiple lines but a caller provided pattern can never
1053 /// match across multiple lines, then it may make sense to divert to
1054 /// more optimized line oriented routines that don't need to handle the
1055 /// multi-line match case.
1056 ///
1057 /// Implementations that produce this set must never report false
1058 /// positives, but may produce false negatives. That is, is a byte is in
1059 /// this set then it must be guaranteed that it is never in a match. But,
1060 /// if a byte is not in this set, then callers cannot assume that a match
1061 /// exists with that byte.
1062 ///
1063 /// By default, this returns `None`.
1064 #[inline]
1065 fn non_matching_bytes(&self) -> Option<&ByteSet> {
1066 None
1067 }
1068
1069 /// If this matcher was compiled as a line oriented matcher, then this
1070 /// method returns the line terminator if and only if the line terminator
1071 /// never appears in any match produced by this matcher. If this wasn't
1072 /// compiled as a line oriented matcher, or if the aforementioned guarantee
1073 /// cannot be made, then this must return `None`, which is the default.
1074 /// It is **never wrong** to return `None`, but returning a line terminator
1075 /// when it can appear in a match results in unspecified behavior.
1076 ///
1077 /// The line terminator is typically `b'\n'`, but can be any single byte or
1078 /// `CRLF`.
1079 ///
1080 /// By default, this returns `None`.
1081 #[inline]
1082 fn line_terminator(&self) -> Option<LineTerminator> {
1083 None
1084 }
1085
1086 /// Return one of the following: a confirmed line match, a candidate line
1087 /// match (which may be a false positive) or no match at all (which **must
1088 /// not** be a false negative). When reporting a confirmed or candidate
1089 /// match, the position returned can be any position in the line.
1090 ///
1091 /// By default, this never returns a candidate match, and always either
1092 /// returns a confirmed match or no match at all.
1093 ///
1094 /// When a matcher can match spans over multiple lines, then the behavior
1095 /// of this method is unspecified. Namely, use of this method only
1096 /// makes sense in a context where the caller is looking for the next
1097 /// matching line. That is, callers should only use this method when
1098 /// `line_terminator` does not return `None`.
1099 ///
1100 /// # Design rationale
1101 ///
1102 /// A line matcher is, fundamentally, a normal matcher with the addition
1103 /// of one optional method: finding a line. By default, this routine
1104 /// is implemented via the matcher's `shortest_match` method, which
1105 /// always yields either no match or a `LineMatchKind::Confirmed`. However,
1106 /// implementors may provide a routine for this that can return candidate
1107 /// lines that need subsequent verification to be confirmed as a match.
1108 /// This can be useful in cases where it may be quicker to find candidate
1109 /// lines via some other means instead of relying on the more general
1110 /// implementations for `find` and `shortest_match`.
1111 ///
1112 /// For example, consider the regex `\w+foo\s+`. Both `find` and
1113 /// `shortest_match` must consider the entire regex, including the `\w+`
1114 /// and `\s+`, while searching. However, this method could look for lines
1115 /// containing `foo` and return them as candidates. Finding `foo` might
1116 /// be implemented as a highly optimized substring search routine (like
1117 /// `memmem`), which is likely to be faster than whatever more generalized
1118 /// routine is required for resolving `\w+foo\s+`. The caller is then
1119 /// responsible for confirming whether a match exists or not.
1120 ///
1121 /// Note that while this method may report false positives, it must never
1122 /// report false negatives. That is, it can never skip over lines that
1123 /// contain a match.
1124 #[inline]
1125 fn find_candidate_line(
1126 &self,
1127 haystack: &[u8],
1128 ) -> Result<Option<LineMatchKind>, Self::Error> {
1129 Ok(self.shortest_match(haystack)?.map(LineMatchKind::Confirmed))
1130 }
1131}
1132
1133impl<'a, M: Matcher> Matcher for &'a M {
1134 type Captures = M::Captures;
1135 type Error = M::Error;
1136
1137 #[inline]
1138 fn find_at(
1139 &self,
1140 haystack: &[u8],
1141 at: usize,
1142 ) -> Result<Option<Match>, Self::Error> {
1143 (*self).find_at(haystack, at)
1144 }
1145
1146 #[inline]
1147 fn new_captures(&self) -> Result<Self::Captures, Self::Error> {
1148 (*self).new_captures()
1149 }
1150
1151 #[inline]
1152 fn captures_at(
1153 &self,
1154 haystack: &[u8],
1155 at: usize,
1156 caps: &mut Self::Captures,
1157 ) -> Result<bool, Self::Error> {
1158 (*self).captures_at(haystack, at, caps)
1159 }
1160
1161 #[inline]
1162 fn capture_index(&self, name: &str) -> Option<usize> {
1163 (*self).capture_index(name)
1164 }
1165
1166 #[inline]
1167 fn capture_count(&self) -> usize {
1168 (*self).capture_count()
1169 }
1170
1171 #[inline]
1172 fn find(&self, haystack: &[u8]) -> Result<Option<Match>, Self::Error> {
1173 (*self).find(haystack)
1174 }
1175
1176 #[inline]
1177 fn find_iter<F>(
1178 &self,
1179 haystack: &[u8],
1180 matched: F,
1181 ) -> Result<(), Self::Error>
1182 where
1183 F: FnMut(Match) -> bool,
1184 {
1185 (*self).find_iter(haystack, matched)
1186 }
1187
1188 #[inline]
1189 fn find_iter_at<F>(
1190 &self,
1191 haystack: &[u8],
1192 at: usize,
1193 matched: F,
1194 ) -> Result<(), Self::Error>
1195 where
1196 F: FnMut(Match) -> bool,
1197 {
1198 (*self).find_iter_at(haystack, at, matched)
1199 }
1200
1201 #[inline]
1202 fn try_find_iter<F, E>(
1203 &self,
1204 haystack: &[u8],
1205 matched: F,
1206 ) -> Result<Result<(), E>, Self::Error>
1207 where
1208 F: FnMut(Match) -> Result<bool, E>,
1209 {
1210 (*self).try_find_iter(haystack, matched)
1211 }
1212
1213 #[inline]
1214 fn try_find_iter_at<F, E>(
1215 &self,
1216 haystack: &[u8],
1217 at: usize,
1218 matched: F,
1219 ) -> Result<Result<(), E>, Self::Error>
1220 where
1221 F: FnMut(Match) -> Result<bool, E>,
1222 {
1223 (*self).try_find_iter_at(haystack, at, matched)
1224 }
1225
1226 #[inline]
1227 fn captures(
1228 &self,
1229 haystack: &[u8],
1230 caps: &mut Self::Captures,
1231 ) -> Result<bool, Self::Error> {
1232 (*self).captures(haystack, caps)
1233 }
1234
1235 #[inline]
1236 fn captures_iter<F>(
1237 &self,
1238 haystack: &[u8],
1239 caps: &mut Self::Captures,
1240 matched: F,
1241 ) -> Result<(), Self::Error>
1242 where
1243 F: FnMut(&Self::Captures) -> bool,
1244 {
1245 (*self).captures_iter(haystack, caps, matched)
1246 }
1247
1248 #[inline]
1249 fn captures_iter_at<F>(
1250 &self,
1251 haystack: &[u8],
1252 at: usize,
1253 caps: &mut Self::Captures,
1254 matched: F,
1255 ) -> Result<(), Self::Error>
1256 where
1257 F: FnMut(&Self::Captures) -> bool,
1258 {
1259 (*self).captures_iter_at(haystack, at, caps, matched)
1260 }
1261
1262 #[inline]
1263 fn try_captures_iter<F, E>(
1264 &self,
1265 haystack: &[u8],
1266 caps: &mut Self::Captures,
1267 matched: F,
1268 ) -> Result<Result<(), E>, Self::Error>
1269 where
1270 F: FnMut(&Self::Captures) -> Result<bool, E>,
1271 {
1272 (*self).try_captures_iter(haystack, caps, matched)
1273 }
1274
1275 #[inline]
1276 fn try_captures_iter_at<F, E>(
1277 &self,
1278 haystack: &[u8],
1279 at: usize,
1280 caps: &mut Self::Captures,
1281 matched: F,
1282 ) -> Result<Result<(), E>, Self::Error>
1283 where
1284 F: FnMut(&Self::Captures) -> Result<bool, E>,
1285 {
1286 (*self).try_captures_iter_at(haystack, at, caps, matched)
1287 }
1288
1289 #[inline]
1290 fn replace<F>(
1291 &self,
1292 haystack: &[u8],
1293 dst: &mut Vec<u8>,
1294 append: F,
1295 ) -> Result<(), Self::Error>
1296 where
1297 F: FnMut(Match, &mut Vec<u8>) -> bool,
1298 {
1299 (*self).replace(haystack, dst, append)
1300 }
1301
1302 #[inline]
1303 fn replace_with_captures<F>(
1304 &self,
1305 haystack: &[u8],
1306 caps: &mut Self::Captures,
1307 dst: &mut Vec<u8>,
1308 append: F,
1309 ) -> Result<(), Self::Error>
1310 where
1311 F: FnMut(&Self::Captures, &mut Vec<u8>) -> bool,
1312 {
1313 (*self).replace_with_captures(haystack, caps, dst, append)
1314 }
1315
1316 #[inline]
1317 fn replace_with_captures_at<F>(
1318 &self,
1319 haystack: &[u8],
1320 at: usize,
1321 caps: &mut Self::Captures,
1322 dst: &mut Vec<u8>,
1323 append: F,
1324 ) -> Result<(), Self::Error>
1325 where
1326 F: FnMut(&Self::Captures, &mut Vec<u8>) -> bool,
1327 {
1328 (*self).replace_with_captures_at(haystack, at, caps, dst, append)
1329 }
1330
1331 #[inline]
1332 fn is_match(&self, haystack: &[u8]) -> Result<bool, Self::Error> {
1333 (*self).is_match(haystack)
1334 }
1335
1336 #[inline]
1337 fn is_match_at(
1338 &self,
1339 haystack: &[u8],
1340 at: usize,
1341 ) -> Result<bool, Self::Error> {
1342 (*self).is_match_at(haystack, at)
1343 }
1344
1345 #[inline]
1346 fn shortest_match(
1347 &self,
1348 haystack: &[u8],
1349 ) -> Result<Option<usize>, Self::Error> {
1350 (*self).shortest_match(haystack)
1351 }
1352
1353 #[inline]
1354 fn shortest_match_at(
1355 &self,
1356 haystack: &[u8],
1357 at: usize,
1358 ) -> Result<Option<usize>, Self::Error> {
1359 (*self).shortest_match_at(haystack, at)
1360 }
1361
1362 #[inline]
1363 fn non_matching_bytes(&self) -> Option<&ByteSet> {
1364 (*self).non_matching_bytes()
1365 }
1366
1367 #[inline]
1368 fn line_terminator(&self) -> Option<LineTerminator> {
1369 (*self).line_terminator()
1370 }
1371
1372 #[inline]
1373 fn find_candidate_line(
1374 &self,
1375 haystack: &[u8],
1376 ) -> Result<Option<LineMatchKind>, Self::Error> {
1377 (*self).find_candidate_line(haystack)
1378 }
1379}