grep_matcher/
lib.rs

1/*!
2This crate provides an interface for regular expressions, with a focus on line
3oriented search. The purpose of this crate is to provide a low level matching
4interface that permits any kind of substring or regex implementation to power
5the search routines provided by the
6[`grep-searcher`](https://docs.rs/grep-searcher)
7crate.
8
9The primary thing provided by this crate is the [`Matcher`] trait. The trait
10defines an abstract interface for text search. It is robust enough to support
11everything from basic substring search all the way to arbitrarily complex
12regular expression implementations without sacrificing performance.
13
14A key design decision made in this crate is the use of *internal iteration*,
15or otherwise known as the "push" model of searching. In this paradigm,
16implementations of the `Matcher` trait will drive search and execute callbacks
17provided by the caller when a match is found. This is in contrast to the
18usual style of *external iteration* (the "pull" model) found throughout the
19Rust ecosystem. There are two primary reasons why internal iteration was
20chosen:
21
22* Some search implementations may themselves require internal iteration.
23  Converting an internal iterator to an external iterator can be non-trivial
24  and sometimes even practically impossible.
25* Rust's type system isn't quite expressive enough to write a generic interface
26  using external iteration without giving something else up (namely, ease of
27  use and/or performance).
28
29In other words, internal iteration was chosen because it is the lowest common
30denominator and because it is probably the least bad way of expressing the
31interface in today's Rust. As a result, this trait isn't specifically intended
32for everyday use, although, you might find it to be a happy price to pay if you
33want to write code that is generic over multiple different regex
34implementations.
35*/
36
37#![deny(missing_docs)]
38
39use crate::interpolate::interpolate;
40
41mod interpolate;
42
43/// The type of a match.
44///
45/// The type of a match is a possibly empty range pointing to a contiguous
46/// block of addressable memory.
47///
48/// Every `Match` is guaranteed to satisfy the invariant that `start <= end`.
49///
50/// # Indexing
51///
52/// This type is structurally identical to `std::ops::Range<usize>`, but
53/// is a bit more ergonomic for dealing with match indices. In particular,
54/// this type implements `Copy` and provides methods for building new `Match`
55/// values based on old `Match` values. Finally, the invariant that `start`
56/// is always less than or equal to `end` is enforced.
57///
58/// A `Match` can be used to slice a `&[u8]`, `&mut [u8]` or `&str` using
59/// range notation. e.g.,
60///
61/// ```
62/// use grep_matcher::Match;
63///
64/// let m = Match::new(2, 5);
65/// let bytes = b"abcdefghi";
66/// assert_eq!(b"cde", &bytes[m]);
67/// ```
68#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
69pub struct Match {
70    start: usize,
71    end: usize,
72}
73
74impl Match {
75    /// Create a new match.
76    ///
77    /// # Panics
78    ///
79    /// This function panics if `start > end`.
80    #[inline]
81    pub fn new(start: usize, end: usize) -> Match {
82        assert!(start <= end);
83        Match { start, end }
84    }
85
86    /// Creates a zero width match at the given offset.
87    #[inline]
88    pub fn zero(offset: usize) -> Match {
89        Match { start: offset, end: offset }
90    }
91
92    /// Return the start offset of this match.
93    #[inline]
94    pub fn start(&self) -> usize {
95        self.start
96    }
97
98    /// Return the end offset of this match.
99    #[inline]
100    pub fn end(&self) -> usize {
101        self.end
102    }
103
104    /// Return a new match with the start offset replaced with the given
105    /// value.
106    ///
107    /// # Panics
108    ///
109    /// This method panics if `start > self.end`.
110    #[inline]
111    pub fn with_start(&self, start: usize) -> Match {
112        assert!(start <= self.end, "{} is not <= {}", start, self.end);
113        Match { start, ..*self }
114    }
115
116    /// Return a new match with the end offset replaced with the given
117    /// value.
118    ///
119    /// # Panics
120    ///
121    /// This method panics if `self.start > end`.
122    #[inline]
123    pub fn with_end(&self, end: usize) -> Match {
124        assert!(self.start <= end, "{} is not <= {}", self.start, end);
125        Match { end, ..*self }
126    }
127
128    /// Offset this match by the given amount and return a new match.
129    ///
130    /// This adds the given offset to the start and end of this match, and
131    /// returns the resulting match.
132    ///
133    /// # Panics
134    ///
135    /// This panics if adding the given amount to either the start or end
136    /// offset would result in an overflow.
137    #[inline]
138    pub fn offset(&self, amount: usize) -> Match {
139        Match {
140            start: self.start.checked_add(amount).unwrap(),
141            end: self.end.checked_add(amount).unwrap(),
142        }
143    }
144
145    /// Returns the number of bytes in this match.
146    #[inline]
147    pub fn len(&self) -> usize {
148        self.end - self.start
149    }
150
151    /// Returns true if and only if this match is empty.
152    #[inline]
153    pub fn is_empty(&self) -> bool {
154        self.len() == 0
155    }
156}
157
158impl std::ops::Index<Match> for [u8] {
159    type Output = [u8];
160
161    #[inline]
162    fn index(&self, index: Match) -> &[u8] {
163        &self[index.start..index.end]
164    }
165}
166
167impl std::ops::IndexMut<Match> for [u8] {
168    #[inline]
169    fn index_mut(&mut self, index: Match) -> &mut [u8] {
170        &mut self[index.start..index.end]
171    }
172}
173
174impl std::ops::Index<Match> for str {
175    type Output = str;
176
177    #[inline]
178    fn index(&self, index: Match) -> &str {
179        &self[index.start..index.end]
180    }
181}
182
183/// A line terminator.
184///
185/// A line terminator represents the end of a line. Generally, every line is
186/// either "terminated" by the end of a stream or a specific byte (or sequence
187/// of bytes).
188///
189/// Generally, a line terminator is a single byte, specifically, `\n`, on
190/// Unix-like systems. On Windows, a line terminator is `\r\n` (referred to
191/// as `CRLF` for `Carriage Return; Line Feed`).
192///
193/// The default line terminator is `\n` on all platforms.
194#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
195pub struct LineTerminator(LineTerminatorImp);
196
197#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
198enum LineTerminatorImp {
199    /// Any single byte representing a line terminator.
200    Byte(u8),
201    /// A line terminator represented by `\r\n`.
202    ///
203    /// When this option is used, consumers may generally treat a lone `\n` as
204    /// a line terminator in addition to `\r\n`.
205    CRLF,
206}
207
208impl LineTerminator {
209    /// Return a new single-byte line terminator. Any byte is valid.
210    #[inline]
211    pub fn byte(byte: u8) -> LineTerminator {
212        LineTerminator(LineTerminatorImp::Byte(byte))
213    }
214
215    /// Return a new line terminator represented by `\r\n`.
216    ///
217    /// When this option is used, consumers may generally treat a lone `\n` as
218    /// a line terminator in addition to `\r\n`.
219    #[inline]
220    pub fn crlf() -> LineTerminator {
221        LineTerminator(LineTerminatorImp::CRLF)
222    }
223
224    /// Returns true if and only if this line terminator is CRLF.
225    #[inline]
226    pub fn is_crlf(&self) -> bool {
227        self.0 == LineTerminatorImp::CRLF
228    }
229
230    /// Returns this line terminator as a single byte.
231    ///
232    /// If the line terminator is CRLF, then this returns `\n`. This is
233    /// useful for routines that, for example, find line boundaries by treating
234    /// `\n` as a line terminator even when it isn't preceded by `\r`.
235    #[inline]
236    pub fn as_byte(&self) -> u8 {
237        match self.0 {
238            LineTerminatorImp::Byte(byte) => byte,
239            LineTerminatorImp::CRLF => b'\n',
240        }
241    }
242
243    /// Returns this line terminator as a sequence of bytes.
244    ///
245    /// This returns a singleton sequence for all line terminators except for
246    /// `CRLF`, in which case, it returns `\r\n`.
247    ///
248    /// The slice returned is guaranteed to have length at least `1`.
249    #[inline]
250    pub fn as_bytes(&self) -> &[u8] {
251        match self.0 {
252            LineTerminatorImp::Byte(ref byte) => std::slice::from_ref(byte),
253            LineTerminatorImp::CRLF => &[b'\r', b'\n'],
254        }
255    }
256
257    /// Returns true if and only if the given slice ends with this line
258    /// terminator.
259    ///
260    /// If this line terminator is `CRLF`, then this only checks whether the
261    /// last byte is `\n`.
262    #[inline]
263    pub fn is_suffix(&self, slice: &[u8]) -> bool {
264        slice.last().map_or(false, |&b| b == self.as_byte())
265    }
266}
267
268impl Default for LineTerminator {
269    #[inline]
270    fn default() -> LineTerminator {
271        LineTerminator::byte(b'\n')
272    }
273}
274
275/// A set of bytes.
276///
277/// In this crate, byte sets are used to express bytes that can never appear
278/// anywhere in a match for a particular implementation of the `Matcher` trait.
279/// Specifically, if such a set can be determined, then it's possible for
280/// callers to perform additional operations on the basis that certain bytes
281/// may never match.
282///
283/// For example, if a search is configured to possibly produce results that
284/// span multiple lines but a caller provided pattern can never match across
285/// multiple lines, then it may make sense to divert to more optimized line
286/// oriented routines that don't need to handle the multi-line match case.
287#[derive(Clone, Debug)]
288pub struct ByteSet(BitSet);
289
290#[derive(Clone, Copy)]
291struct BitSet([u64; 4]);
292
293impl std::fmt::Debug for BitSet {
294    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
295        let mut fmtd = f.debug_set();
296        for b in 0..=255 {
297            if ByteSet(*self).contains(b) {
298                fmtd.entry(&b);
299            }
300        }
301        fmtd.finish()
302    }
303}
304
305impl ByteSet {
306    /// Create an empty set of bytes.
307    #[inline]
308    pub fn empty() -> ByteSet {
309        ByteSet(BitSet([0; 4]))
310    }
311
312    /// Create a full set of bytes such that every possible byte is in the set
313    /// returned.
314    #[inline]
315    pub fn full() -> ByteSet {
316        ByteSet(BitSet([u64::MAX; 4]))
317    }
318
319    /// Add a byte to this set.
320    ///
321    /// If the given byte already belongs to this set, then this is a no-op.
322    #[inline]
323    pub fn add(&mut self, byte: u8) {
324        let bucket = byte / 64;
325        let bit = byte % 64;
326        (self.0).0[usize::from(bucket)] |= 1 << bit;
327    }
328
329    /// Add an inclusive range of bytes.
330    #[inline]
331    pub fn add_all(&mut self, start: u8, end: u8) {
332        for b in start..=end {
333            self.add(b);
334        }
335    }
336
337    /// Remove a byte from this set.
338    ///
339    /// If the given byte is not in this set, then this is a no-op.
340    #[inline]
341    pub fn remove(&mut self, byte: u8) {
342        let bucket = byte / 64;
343        let bit = byte % 64;
344        (self.0).0[usize::from(bucket)] &= !(1 << bit);
345    }
346
347    /// Remove an inclusive range of bytes.
348    #[inline]
349    pub fn remove_all(&mut self, start: u8, end: u8) {
350        for b in start..=end {
351            self.remove(b);
352        }
353    }
354
355    /// Return true if and only if the given byte is in this set.
356    #[inline]
357    pub fn contains(&self, byte: u8) -> bool {
358        let bucket = byte / 64;
359        let bit = byte % 64;
360        (self.0).0[usize::from(bucket)] & (1 << bit) > 0
361    }
362}
363
364/// A trait that describes implementations of capturing groups.
365///
366/// When a matcher supports capturing group extraction, then it is the
367/// matcher's responsibility to provide an implementation of this trait.
368///
369/// Principally, this trait provides a way to access capturing groups
370/// in a uniform way that does not require any specific representation.
371/// Namely, different matcher implementations may require different in-memory
372/// representations of capturing groups. This trait permits matchers to
373/// maintain their specific in-memory representation.
374///
375/// Note that this trait explicitly does not provide a way to construct a new
376/// capture value. Instead, it is the responsibility of a `Matcher` to build
377/// one, which might require knowledge of the matcher's internal implementation
378/// details.
379pub trait Captures {
380    /// Return the total number of capturing groups. This includes capturing
381    /// groups that have not matched anything.
382    fn len(&self) -> usize;
383
384    /// Return the capturing group match at the given index. If no match of
385    /// that capturing group exists, then this returns `None`.
386    ///
387    /// When a matcher reports a match with capturing groups, then the first
388    /// capturing group (at index `0`) must always correspond to the offsets
389    /// for the overall match.
390    fn get(&self, i: usize) -> Option<Match>;
391
392    /// Return the overall match for the capture.
393    ///
394    /// This returns the match for index `0`. That is it is equivalent to
395    /// `get(0).unwrap()`
396    #[inline]
397    fn as_match(&self) -> Match {
398        self.get(0).unwrap()
399    }
400
401    /// Returns true if and only if these captures are empty. This occurs
402    /// when `len` is `0`.
403    ///
404    /// Note that capturing groups that have non-zero length but otherwise
405    /// contain no matching groups are *not* empty.
406    #[inline]
407    fn is_empty(&self) -> bool {
408        self.len() == 0
409    }
410
411    /// Expands all instances of `$name` in `replacement` to the corresponding
412    /// capture group `name`, and writes them to the `dst` buffer given.
413    ///
414    /// (Note: If you're looking for a convenient way to perform replacements
415    /// with interpolation, then you'll want to use the `replace_with_captures`
416    /// method on the `Matcher` trait.)
417    ///
418    /// `name` may be an integer corresponding to the index of the
419    /// capture group (counted by order of opening parenthesis where `0` is the
420    /// entire match) or it can be a name (consisting of letters, digits or
421    /// underscores) corresponding to a named capture group.
422    ///
423    /// A `name` is translated to a capture group index via the given
424    /// `name_to_index` function. If `name` isn't a valid capture group
425    /// (whether the name doesn't exist or isn't a valid index), then it is
426    /// replaced with the empty string.
427    ///
428    /// The longest possible name is used. e.g., `$1a` looks up the capture
429    /// group named `1a` and not the capture group at index `1`. To exert
430    /// more precise control over the name, use braces, e.g., `${1}a`. In all
431    /// cases, capture group names are limited to ASCII letters, numbers and
432    /// underscores.
433    ///
434    /// To write a literal `$` use `$$`.
435    ///
436    /// Note that the capture group match indices are resolved by slicing
437    /// the given `haystack`. Generally, this means that `haystack` should be
438    /// the same slice that was searched to get the current capture group
439    /// matches.
440    #[inline]
441    fn interpolate<F>(
442        &self,
443        name_to_index: F,
444        haystack: &[u8],
445        replacement: &[u8],
446        dst: &mut Vec<u8>,
447    ) where
448        F: FnMut(&str) -> Option<usize>,
449    {
450        interpolate(
451            replacement,
452            |i, dst| {
453                if let Some(range) = self.get(i) {
454                    dst.extend(&haystack[range]);
455                }
456            },
457            name_to_index,
458            dst,
459        )
460    }
461}
462
463/// NoCaptures provides an always-empty implementation of the `Captures` trait.
464///
465/// This type is useful for implementations of `Matcher` that don't support
466/// capturing groups.
467#[derive(Clone, Debug)]
468pub struct NoCaptures(());
469
470impl NoCaptures {
471    /// Create an empty set of capturing groups.
472    #[inline]
473    pub fn new() -> NoCaptures {
474        NoCaptures(())
475    }
476}
477
478impl Captures for NoCaptures {
479    #[inline]
480    fn len(&self) -> usize {
481        0
482    }
483
484    #[inline]
485    fn get(&self, _: usize) -> Option<Match> {
486        None
487    }
488}
489
490/// NoError provides an error type for matchers that never produce errors.
491///
492/// This error type implements the `std::error::Error` and `std::fmt::Display`
493/// traits for use in matcher implementations that can never produce errors.
494///
495/// The `std::fmt::Debug` and `std::fmt::Display` impls for this type panics.
496#[derive(Debug, Eq, PartialEq)]
497pub struct NoError(());
498
499impl std::error::Error for NoError {
500    fn description(&self) -> &str {
501        "no error"
502    }
503}
504
505impl std::fmt::Display for NoError {
506    fn fmt(&self, _: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
507        panic!("BUG for NoError: an impossible error occurred")
508    }
509}
510
511impl From<NoError> for std::io::Error {
512    fn from(_: NoError) -> std::io::Error {
513        panic!("BUG for NoError: an impossible error occurred")
514    }
515}
516
517/// The type of match for a line oriented matcher.
518#[derive(Clone, Copy, Debug)]
519pub enum LineMatchKind {
520    /// A position inside a line that is known to contain a match.
521    ///
522    /// This position can be anywhere in the line. It does not need to point
523    /// at the location of the match.
524    Confirmed(usize),
525    /// A position inside a line that may contain a match, and must be searched
526    /// for verification.
527    ///
528    /// This position can be anywhere in the line. It does not need to point
529    /// at the location of the match.
530    Candidate(usize),
531}
532
533/// A matcher defines an interface for regular expression implementations.
534///
535/// While this trait is large, there are only two required methods that
536/// implementors must provide: `find_at` and `new_captures`. If captures aren't
537/// supported by your implementation, then `new_captures` can be implemented
538/// with [`NoCaptures`]. If your implementation does support capture groups,
539/// then you should also implement the other capture related methods, as
540/// dictated by the documentation. Crucially, this includes `captures_at`.
541///
542/// The rest of the methods on this trait provide default implementations on
543/// top of `find_at` and `new_captures`. It is not uncommon for implementations
544/// to be able to provide faster variants of some methods; in those cases,
545/// simply override the default implementation.
546pub trait Matcher {
547    /// The concrete type of capturing groups used for this matcher.
548    ///
549    /// If this implementation does not support capturing groups, then set
550    /// this to `NoCaptures`.
551    type Captures: Captures;
552
553    /// The error type used by this matcher.
554    ///
555    /// For matchers in which an error is not possible, they are encouraged to
556    /// use the `NoError` type in this crate. In the future, when the "never"
557    /// (spelled `!`) type is stabilized, then it should probably be used
558    /// instead.
559    type Error: std::fmt::Display;
560
561    /// Returns the start and end byte range of the first match in `haystack`
562    /// after `at`, where the byte offsets are relative to that start of
563    /// `haystack` (and not `at`). If no match exists, then `None` is returned.
564    ///
565    /// The text encoding of `haystack` is not strictly specified. Matchers are
566    /// advised to assume UTF-8, or at worst, some ASCII compatible encoding.
567    ///
568    /// The significance of the starting point is that it takes the surrounding
569    /// context into consideration. For example, the `\A` anchor can only
570    /// match when `at == 0`.
571    fn find_at(
572        &self,
573        haystack: &[u8],
574        at: usize,
575    ) -> Result<Option<Match>, Self::Error>;
576
577    /// Creates an empty group of captures suitable for use with the capturing
578    /// APIs of this trait.
579    ///
580    /// Implementations that don't support capturing groups should use
581    /// the `NoCaptures` type and implement this method by calling
582    /// `NoCaptures::new()`.
583    fn new_captures(&self) -> Result<Self::Captures, Self::Error>;
584
585    /// Returns the total number of capturing groups in this matcher.
586    ///
587    /// If a matcher supports capturing groups, then this value must always be
588    /// at least 1, where the first capturing group always corresponds to the
589    /// overall match.
590    ///
591    /// If a matcher does not support capturing groups, then this should
592    /// always return 0.
593    ///
594    /// By default, capturing groups are not supported, so this always
595    /// returns 0.
596    #[inline]
597    fn capture_count(&self) -> usize {
598        0
599    }
600
601    /// Maps the given capture group name to its corresponding capture group
602    /// index, if one exists. If one does not exist, then `None` is returned.
603    ///
604    /// If the given capture group name maps to multiple indices, then it is
605    /// not specified which one is returned. However, it is guaranteed that
606    /// one of them is returned.
607    ///
608    /// By default, capturing groups are not supported, so this always returns
609    /// `None`.
610    #[inline]
611    fn capture_index(&self, _name: &str) -> Option<usize> {
612        None
613    }
614
615    /// Returns the start and end byte range of the first match in `haystack`.
616    /// If no match exists, then `None` is returned.
617    ///
618    /// The text encoding of `haystack` is not strictly specified. Matchers are
619    /// advised to assume UTF-8, or at worst, some ASCII compatible encoding.
620    #[inline]
621    fn find(&self, haystack: &[u8]) -> Result<Option<Match>, Self::Error> {
622        self.find_at(haystack, 0)
623    }
624
625    /// Executes the given function over successive non-overlapping matches
626    /// in `haystack`. If no match exists, then the given function is never
627    /// called. If the function returns `false`, then iteration stops.
628    #[inline]
629    fn find_iter<F>(
630        &self,
631        haystack: &[u8],
632        matched: F,
633    ) -> Result<(), Self::Error>
634    where
635        F: FnMut(Match) -> bool,
636    {
637        self.find_iter_at(haystack, 0, matched)
638    }
639
640    /// Executes the given function over successive non-overlapping matches
641    /// in `haystack`. If no match exists, then the given function is never
642    /// called. If the function returns `false`, then iteration stops.
643    ///
644    /// The significance of the starting point is that it takes the surrounding
645    /// context into consideration. For example, the `\A` anchor can only
646    /// match when `at == 0`.
647    #[inline]
648    fn find_iter_at<F>(
649        &self,
650        haystack: &[u8],
651        at: usize,
652        mut matched: F,
653    ) -> Result<(), Self::Error>
654    where
655        F: FnMut(Match) -> bool,
656    {
657        self.try_find_iter_at(haystack, at, |m| Ok(matched(m)))
658            .map(|r: Result<(), ()>| r.unwrap())
659    }
660
661    /// Executes the given function over successive non-overlapping matches
662    /// in `haystack`. If no match exists, then the given function is never
663    /// called. If the function returns `false`, then iteration stops.
664    /// Similarly, if the function returns an error then iteration stops and
665    /// the error is yielded. If an error occurs while executing the search,
666    /// then it is converted to
667    /// `E`.
668    #[inline]
669    fn try_find_iter<F, E>(
670        &self,
671        haystack: &[u8],
672        matched: F,
673    ) -> Result<Result<(), E>, Self::Error>
674    where
675        F: FnMut(Match) -> Result<bool, E>,
676    {
677        self.try_find_iter_at(haystack, 0, matched)
678    }
679
680    /// Executes the given function over successive non-overlapping matches
681    /// in `haystack`. If no match exists, then the given function is never
682    /// called. If the function returns `false`, then iteration stops.
683    /// Similarly, if the function returns an error then iteration stops and
684    /// the error is yielded. If an error occurs while executing the search,
685    /// then it is converted to
686    /// `E`.
687    ///
688    /// The significance of the starting point is that it takes the surrounding
689    /// context into consideration. For example, the `\A` anchor can only
690    /// match when `at == 0`.
691    #[inline]
692    fn try_find_iter_at<F, E>(
693        &self,
694        haystack: &[u8],
695        at: usize,
696        mut matched: F,
697    ) -> Result<Result<(), E>, Self::Error>
698    where
699        F: FnMut(Match) -> Result<bool, E>,
700    {
701        let mut last_end = at;
702        let mut last_match = None;
703
704        loop {
705            if last_end > haystack.len() {
706                return Ok(Ok(()));
707            }
708            let m = match self.find_at(haystack, last_end)? {
709                None => return Ok(Ok(())),
710                Some(m) => m,
711            };
712            if m.start == m.end {
713                // This is an empty match. To ensure we make progress, start
714                // the next search at the smallest possible starting position
715                // of the next match following this one.
716                last_end = m.end + 1;
717                // Don't accept empty matches immediately following a match.
718                // Just move on to the next match.
719                if Some(m.end) == last_match {
720                    continue;
721                }
722            } else {
723                last_end = m.end;
724            }
725            last_match = Some(m.end);
726            match matched(m) {
727                Ok(true) => continue,
728                Ok(false) => return Ok(Ok(())),
729                Err(err) => return Ok(Err(err)),
730            }
731        }
732    }
733
734    /// Populates the first set of capture group matches from `haystack` into
735    /// `caps`. If no match exists, then `false` is returned.
736    ///
737    /// The text encoding of `haystack` is not strictly specified. Matchers are
738    /// advised to assume UTF-8, or at worst, some ASCII compatible encoding.
739    #[inline]
740    fn captures(
741        &self,
742        haystack: &[u8],
743        caps: &mut Self::Captures,
744    ) -> Result<bool, Self::Error> {
745        self.captures_at(haystack, 0, caps)
746    }
747
748    /// Executes the given function over successive non-overlapping matches
749    /// in `haystack` with capture groups extracted from each match. If no
750    /// match exists, then the given function is never called. If the function
751    /// returns `false`, then iteration stops.
752    #[inline]
753    fn captures_iter<F>(
754        &self,
755        haystack: &[u8],
756        caps: &mut Self::Captures,
757        matched: F,
758    ) -> Result<(), Self::Error>
759    where
760        F: FnMut(&Self::Captures) -> bool,
761    {
762        self.captures_iter_at(haystack, 0, caps, matched)
763    }
764
765    /// Executes the given function over successive non-overlapping matches
766    /// in `haystack` with capture groups extracted from each match. If no
767    /// match exists, then the given function is never called. If the function
768    /// returns `false`, then iteration stops.
769    ///
770    /// The significance of the starting point is that it takes the surrounding
771    /// context into consideration. For example, the `\A` anchor can only
772    /// match when `at == 0`.
773    #[inline]
774    fn captures_iter_at<F>(
775        &self,
776        haystack: &[u8],
777        at: usize,
778        caps: &mut Self::Captures,
779        mut matched: F,
780    ) -> Result<(), Self::Error>
781    where
782        F: FnMut(&Self::Captures) -> bool,
783    {
784        self.try_captures_iter_at(haystack, at, caps, |caps| Ok(matched(caps)))
785            .map(|r: Result<(), ()>| r.unwrap())
786    }
787
788    /// Executes the given function over successive non-overlapping matches
789    /// in `haystack` with capture groups extracted from each match. If no
790    /// match exists, then the given function is never called. If the function
791    /// returns `false`, then iteration stops. Similarly, if the function
792    /// returns an error then iteration stops and the error is yielded. If
793    /// an error occurs while executing the search, then it is converted to
794    /// `E`.
795    #[inline]
796    fn try_captures_iter<F, E>(
797        &self,
798        haystack: &[u8],
799        caps: &mut Self::Captures,
800        matched: F,
801    ) -> Result<Result<(), E>, Self::Error>
802    where
803        F: FnMut(&Self::Captures) -> Result<bool, E>,
804    {
805        self.try_captures_iter_at(haystack, 0, caps, matched)
806    }
807
808    /// Executes the given function over successive non-overlapping matches
809    /// in `haystack` with capture groups extracted from each match. If no
810    /// match exists, then the given function is never called. If the function
811    /// returns `false`, then iteration stops. Similarly, if the function
812    /// returns an error then iteration stops and the error is yielded. If
813    /// an error occurs while executing the search, then it is converted to
814    /// `E`.
815    ///
816    /// The significance of the starting point is that it takes the surrounding
817    /// context into consideration. For example, the `\A` anchor can only
818    /// match when `at == 0`.
819    #[inline]
820    fn try_captures_iter_at<F, E>(
821        &self,
822        haystack: &[u8],
823        at: usize,
824        caps: &mut Self::Captures,
825        mut matched: F,
826    ) -> Result<Result<(), E>, Self::Error>
827    where
828        F: FnMut(&Self::Captures) -> Result<bool, E>,
829    {
830        let mut last_end = at;
831        let mut last_match = None;
832
833        loop {
834            if last_end > haystack.len() {
835                return Ok(Ok(()));
836            }
837            if !self.captures_at(haystack, last_end, caps)? {
838                return Ok(Ok(()));
839            }
840            let m = caps.get(0).unwrap();
841            if m.start == m.end {
842                // This is an empty match. To ensure we make progress, start
843                // the next search at the smallest possible starting position
844                // of the next match following this one.
845                last_end = m.end + 1;
846                // Don't accept empty matches immediately following a match.
847                // Just move on to the next match.
848                if Some(m.end) == last_match {
849                    continue;
850                }
851            } else {
852                last_end = m.end;
853            }
854            last_match = Some(m.end);
855            match matched(caps) {
856                Ok(true) => continue,
857                Ok(false) => return Ok(Ok(())),
858                Err(err) => return Ok(Err(err)),
859            }
860        }
861    }
862
863    /// Populates the first set of capture group matches from `haystack`
864    /// into `matches` after `at`, where the byte offsets in each capturing
865    /// group are relative to the start of `haystack` (and not `at`). If no
866    /// match exists, then `false` is returned and the contents of the given
867    /// capturing groups are unspecified.
868    ///
869    /// The text encoding of `haystack` is not strictly specified. Matchers are
870    /// advised to assume UTF-8, or at worst, some ASCII compatible encoding.
871    ///
872    /// The significance of the starting point is that it takes the surrounding
873    /// context into consideration. For example, the `\A` anchor can only
874    /// match when `at == 0`.
875    ///
876    /// By default, capturing groups aren't supported, and this implementation
877    /// will always behave as if a match were impossible.
878    ///
879    /// Implementors that provide support for capturing groups must guarantee
880    /// that when a match occurs, the first capture match (at index `0`) is
881    /// always set to the overall match offsets.
882    ///
883    /// Note that if implementors seek to support capturing groups, then they
884    /// should implement this method. Other methods that match based on
885    /// captures will then work automatically.
886    #[inline]
887    fn captures_at(
888        &self,
889        _haystack: &[u8],
890        _at: usize,
891        _caps: &mut Self::Captures,
892    ) -> Result<bool, Self::Error> {
893        Ok(false)
894    }
895
896    /// Replaces every match in the given haystack with the result of calling
897    /// `append`. `append` is given the start and end of a match, along with
898    /// a handle to the `dst` buffer provided.
899    ///
900    /// If the given `append` function returns `false`, then replacement stops.
901    #[inline]
902    fn replace<F>(
903        &self,
904        haystack: &[u8],
905        dst: &mut Vec<u8>,
906        mut append: F,
907    ) -> Result<(), Self::Error>
908    where
909        F: FnMut(Match, &mut Vec<u8>) -> bool,
910    {
911        let mut last_match = 0;
912        self.find_iter(haystack, |m| {
913            dst.extend(&haystack[last_match..m.start]);
914            last_match = m.end;
915            append(m, dst)
916        })?;
917        dst.extend(&haystack[last_match..]);
918        Ok(())
919    }
920
921    /// Replaces every match in the given haystack with the result of calling
922    /// `append` with the matching capture groups.
923    ///
924    /// If the given `append` function returns `false`, then replacement stops.
925    #[inline]
926    fn replace_with_captures<F>(
927        &self,
928        haystack: &[u8],
929        caps: &mut Self::Captures,
930        dst: &mut Vec<u8>,
931        append: F,
932    ) -> Result<(), Self::Error>
933    where
934        F: FnMut(&Self::Captures, &mut Vec<u8>) -> bool,
935    {
936        self.replace_with_captures_at(haystack, 0, caps, dst, append)
937    }
938
939    /// Replaces every match in the given haystack with the result of calling
940    /// `append` with the matching capture groups.
941    ///
942    /// If the given `append` function returns `false`, then replacement stops.
943    ///
944    /// The significance of the starting point is that it takes the surrounding
945    /// context into consideration. For example, the `\A` anchor can only
946    /// match when `at == 0`.
947    #[inline]
948    fn replace_with_captures_at<F>(
949        &self,
950        haystack: &[u8],
951        at: usize,
952        caps: &mut Self::Captures,
953        dst: &mut Vec<u8>,
954        mut append: F,
955    ) -> Result<(), Self::Error>
956    where
957        F: FnMut(&Self::Captures, &mut Vec<u8>) -> bool,
958    {
959        let mut last_match = at;
960        self.captures_iter_at(haystack, at, caps, |caps| {
961            let m = caps.get(0).unwrap();
962            dst.extend(&haystack[last_match..m.start]);
963            last_match = m.end;
964            append(caps, dst)
965        })?;
966        dst.extend(&haystack[last_match..]);
967        Ok(())
968    }
969
970    /// Returns true if and only if the matcher matches the given haystack.
971    ///
972    /// By default, this method is implemented by calling `shortest_match`.
973    #[inline]
974    fn is_match(&self, haystack: &[u8]) -> Result<bool, Self::Error> {
975        self.is_match_at(haystack, 0)
976    }
977
978    /// Returns true if and only if the matcher matches the given haystack
979    /// starting at the given position.
980    ///
981    /// By default, this method is implemented by calling `shortest_match_at`.
982    ///
983    /// The significance of the starting point is that it takes the surrounding
984    /// context into consideration. For example, the `\A` anchor can only
985    /// match when `at == 0`.
986    #[inline]
987    fn is_match_at(
988        &self,
989        haystack: &[u8],
990        at: usize,
991    ) -> Result<bool, Self::Error> {
992        Ok(self.shortest_match_at(haystack, at)?.is_some())
993    }
994
995    /// Returns an end location of the first match in `haystack`. If no match
996    /// exists, then `None` is returned.
997    ///
998    /// Note that the end location reported by this method may be less than the
999    /// same end location reported by `find`. For example, running `find` with
1000    /// the pattern `a+` on the haystack `aaa` should report a range of `[0,
1001    /// 3)`, but `shortest_match` may report `1` as the ending location since
1002    /// that is the place at which a match is guaranteed to occur.
1003    ///
1004    /// This method should never report false positives or false negatives. The
1005    /// point of this method is that some implementors may be able to provide
1006    /// a faster implementation of this than what `find` does.
1007    ///
1008    /// By default, this method is implemented by calling `find`.
1009    #[inline]
1010    fn shortest_match(
1011        &self,
1012        haystack: &[u8],
1013    ) -> Result<Option<usize>, Self::Error> {
1014        self.shortest_match_at(haystack, 0)
1015    }
1016
1017    /// Returns an end location of the first match in `haystack` starting at
1018    /// the given position. If no match exists, then `None` is returned.
1019    ///
1020    /// Note that the end location reported by this method may be less than the
1021    /// same end location reported by `find`. For example, running `find` with
1022    /// the pattern `a+` on the haystack `aaa` should report a range of `[0,
1023    /// 3)`, but `shortest_match` may report `1` as the ending location since
1024    /// that is the place at which a match is guaranteed to occur.
1025    ///
1026    /// This method should never report false positives or false negatives. The
1027    /// point of this method is that some implementors may be able to provide
1028    /// a faster implementation of this than what `find` does.
1029    ///
1030    /// By default, this method is implemented by calling `find_at`.
1031    ///
1032    /// The significance of the starting point is that it takes the surrounding
1033    /// context into consideration. For example, the `\A` anchor can only
1034    /// match when `at == 0`.
1035    #[inline]
1036    fn shortest_match_at(
1037        &self,
1038        haystack: &[u8],
1039        at: usize,
1040    ) -> Result<Option<usize>, Self::Error> {
1041        Ok(self.find_at(haystack, at)?.map(|m| m.end))
1042    }
1043
1044    /// If available, return a set of bytes that will never appear in a match
1045    /// produced by an implementation.
1046    ///
1047    /// Specifically, if such a set can be determined, then it's possible for
1048    /// callers to perform additional operations on the basis that certain
1049    /// bytes may never match.
1050    ///
1051    /// For example, if a search is configured to possibly produce results
1052    /// that span multiple lines but a caller provided pattern can never
1053    /// match across multiple lines, then it may make sense to divert to
1054    /// more optimized line oriented routines that don't need to handle the
1055    /// multi-line match case.
1056    ///
1057    /// Implementations that produce this set must never report false
1058    /// positives, but may produce false negatives. That is, is a byte is in
1059    /// this set then it must be guaranteed that it is never in a match. But,
1060    /// if a byte is not in this set, then callers cannot assume that a match
1061    /// exists with that byte.
1062    ///
1063    /// By default, this returns `None`.
1064    #[inline]
1065    fn non_matching_bytes(&self) -> Option<&ByteSet> {
1066        None
1067    }
1068
1069    /// If this matcher was compiled as a line oriented matcher, then this
1070    /// method returns the line terminator if and only if the line terminator
1071    /// never appears in any match produced by this matcher. If this wasn't
1072    /// compiled as a line oriented matcher, or if the aforementioned guarantee
1073    /// cannot be made, then this must return `None`, which is the default.
1074    /// It is **never wrong** to return `None`, but returning a line terminator
1075    /// when it can appear in a match results in unspecified behavior.
1076    ///
1077    /// The line terminator is typically `b'\n'`, but can be any single byte or
1078    /// `CRLF`.
1079    ///
1080    /// By default, this returns `None`.
1081    #[inline]
1082    fn line_terminator(&self) -> Option<LineTerminator> {
1083        None
1084    }
1085
1086    /// Return one of the following: a confirmed line match, a candidate line
1087    /// match (which may be a false positive) or no match at all (which **must
1088    /// not** be a false negative). When reporting a confirmed or candidate
1089    /// match, the position returned can be any position in the line.
1090    ///
1091    /// By default, this never returns a candidate match, and always either
1092    /// returns a confirmed match or no match at all.
1093    ///
1094    /// When a matcher can match spans over multiple lines, then the behavior
1095    /// of this method is unspecified. Namely, use of this method only
1096    /// makes sense in a context where the caller is looking for the next
1097    /// matching line. That is, callers should only use this method when
1098    /// `line_terminator` does not return `None`.
1099    ///
1100    /// # Design rationale
1101    ///
1102    /// A line matcher is, fundamentally, a normal matcher with the addition
1103    /// of one optional method: finding a line. By default, this routine
1104    /// is implemented via the matcher's `shortest_match` method, which
1105    /// always yields either no match or a `LineMatchKind::Confirmed`. However,
1106    /// implementors may provide a routine for this that can return candidate
1107    /// lines that need subsequent verification to be confirmed as a match.
1108    /// This can be useful in cases where it may be quicker to find candidate
1109    /// lines via some other means instead of relying on the more general
1110    /// implementations for `find` and `shortest_match`.
1111    ///
1112    /// For example, consider the regex `\w+foo\s+`. Both `find` and
1113    /// `shortest_match` must consider the entire regex, including the `\w+`
1114    /// and `\s+`, while searching. However, this method could look for lines
1115    /// containing `foo` and return them as candidates. Finding `foo` might
1116    /// be implemented as a highly optimized substring search routine (like
1117    /// `memmem`), which is likely to be faster than whatever more generalized
1118    /// routine is required for resolving `\w+foo\s+`. The caller is then
1119    /// responsible for confirming whether a match exists or not.
1120    ///
1121    /// Note that while this method may report false positives, it must never
1122    /// report false negatives. That is, it can never skip over lines that
1123    /// contain a match.
1124    #[inline]
1125    fn find_candidate_line(
1126        &self,
1127        haystack: &[u8],
1128    ) -> Result<Option<LineMatchKind>, Self::Error> {
1129        Ok(self.shortest_match(haystack)?.map(LineMatchKind::Confirmed))
1130    }
1131}
1132
1133impl<'a, M: Matcher> Matcher for &'a M {
1134    type Captures = M::Captures;
1135    type Error = M::Error;
1136
1137    #[inline]
1138    fn find_at(
1139        &self,
1140        haystack: &[u8],
1141        at: usize,
1142    ) -> Result<Option<Match>, Self::Error> {
1143        (*self).find_at(haystack, at)
1144    }
1145
1146    #[inline]
1147    fn new_captures(&self) -> Result<Self::Captures, Self::Error> {
1148        (*self).new_captures()
1149    }
1150
1151    #[inline]
1152    fn captures_at(
1153        &self,
1154        haystack: &[u8],
1155        at: usize,
1156        caps: &mut Self::Captures,
1157    ) -> Result<bool, Self::Error> {
1158        (*self).captures_at(haystack, at, caps)
1159    }
1160
1161    #[inline]
1162    fn capture_index(&self, name: &str) -> Option<usize> {
1163        (*self).capture_index(name)
1164    }
1165
1166    #[inline]
1167    fn capture_count(&self) -> usize {
1168        (*self).capture_count()
1169    }
1170
1171    #[inline]
1172    fn find(&self, haystack: &[u8]) -> Result<Option<Match>, Self::Error> {
1173        (*self).find(haystack)
1174    }
1175
1176    #[inline]
1177    fn find_iter<F>(
1178        &self,
1179        haystack: &[u8],
1180        matched: F,
1181    ) -> Result<(), Self::Error>
1182    where
1183        F: FnMut(Match) -> bool,
1184    {
1185        (*self).find_iter(haystack, matched)
1186    }
1187
1188    #[inline]
1189    fn find_iter_at<F>(
1190        &self,
1191        haystack: &[u8],
1192        at: usize,
1193        matched: F,
1194    ) -> Result<(), Self::Error>
1195    where
1196        F: FnMut(Match) -> bool,
1197    {
1198        (*self).find_iter_at(haystack, at, matched)
1199    }
1200
1201    #[inline]
1202    fn try_find_iter<F, E>(
1203        &self,
1204        haystack: &[u8],
1205        matched: F,
1206    ) -> Result<Result<(), E>, Self::Error>
1207    where
1208        F: FnMut(Match) -> Result<bool, E>,
1209    {
1210        (*self).try_find_iter(haystack, matched)
1211    }
1212
1213    #[inline]
1214    fn try_find_iter_at<F, E>(
1215        &self,
1216        haystack: &[u8],
1217        at: usize,
1218        matched: F,
1219    ) -> Result<Result<(), E>, Self::Error>
1220    where
1221        F: FnMut(Match) -> Result<bool, E>,
1222    {
1223        (*self).try_find_iter_at(haystack, at, matched)
1224    }
1225
1226    #[inline]
1227    fn captures(
1228        &self,
1229        haystack: &[u8],
1230        caps: &mut Self::Captures,
1231    ) -> Result<bool, Self::Error> {
1232        (*self).captures(haystack, caps)
1233    }
1234
1235    #[inline]
1236    fn captures_iter<F>(
1237        &self,
1238        haystack: &[u8],
1239        caps: &mut Self::Captures,
1240        matched: F,
1241    ) -> Result<(), Self::Error>
1242    where
1243        F: FnMut(&Self::Captures) -> bool,
1244    {
1245        (*self).captures_iter(haystack, caps, matched)
1246    }
1247
1248    #[inline]
1249    fn captures_iter_at<F>(
1250        &self,
1251        haystack: &[u8],
1252        at: usize,
1253        caps: &mut Self::Captures,
1254        matched: F,
1255    ) -> Result<(), Self::Error>
1256    where
1257        F: FnMut(&Self::Captures) -> bool,
1258    {
1259        (*self).captures_iter_at(haystack, at, caps, matched)
1260    }
1261
1262    #[inline]
1263    fn try_captures_iter<F, E>(
1264        &self,
1265        haystack: &[u8],
1266        caps: &mut Self::Captures,
1267        matched: F,
1268    ) -> Result<Result<(), E>, Self::Error>
1269    where
1270        F: FnMut(&Self::Captures) -> Result<bool, E>,
1271    {
1272        (*self).try_captures_iter(haystack, caps, matched)
1273    }
1274
1275    #[inline]
1276    fn try_captures_iter_at<F, E>(
1277        &self,
1278        haystack: &[u8],
1279        at: usize,
1280        caps: &mut Self::Captures,
1281        matched: F,
1282    ) -> Result<Result<(), E>, Self::Error>
1283    where
1284        F: FnMut(&Self::Captures) -> Result<bool, E>,
1285    {
1286        (*self).try_captures_iter_at(haystack, at, caps, matched)
1287    }
1288
1289    #[inline]
1290    fn replace<F>(
1291        &self,
1292        haystack: &[u8],
1293        dst: &mut Vec<u8>,
1294        append: F,
1295    ) -> Result<(), Self::Error>
1296    where
1297        F: FnMut(Match, &mut Vec<u8>) -> bool,
1298    {
1299        (*self).replace(haystack, dst, append)
1300    }
1301
1302    #[inline]
1303    fn replace_with_captures<F>(
1304        &self,
1305        haystack: &[u8],
1306        caps: &mut Self::Captures,
1307        dst: &mut Vec<u8>,
1308        append: F,
1309    ) -> Result<(), Self::Error>
1310    where
1311        F: FnMut(&Self::Captures, &mut Vec<u8>) -> bool,
1312    {
1313        (*self).replace_with_captures(haystack, caps, dst, append)
1314    }
1315
1316    #[inline]
1317    fn replace_with_captures_at<F>(
1318        &self,
1319        haystack: &[u8],
1320        at: usize,
1321        caps: &mut Self::Captures,
1322        dst: &mut Vec<u8>,
1323        append: F,
1324    ) -> Result<(), Self::Error>
1325    where
1326        F: FnMut(&Self::Captures, &mut Vec<u8>) -> bool,
1327    {
1328        (*self).replace_with_captures_at(haystack, at, caps, dst, append)
1329    }
1330
1331    #[inline]
1332    fn is_match(&self, haystack: &[u8]) -> Result<bool, Self::Error> {
1333        (*self).is_match(haystack)
1334    }
1335
1336    #[inline]
1337    fn is_match_at(
1338        &self,
1339        haystack: &[u8],
1340        at: usize,
1341    ) -> Result<bool, Self::Error> {
1342        (*self).is_match_at(haystack, at)
1343    }
1344
1345    #[inline]
1346    fn shortest_match(
1347        &self,
1348        haystack: &[u8],
1349    ) -> Result<Option<usize>, Self::Error> {
1350        (*self).shortest_match(haystack)
1351    }
1352
1353    #[inline]
1354    fn shortest_match_at(
1355        &self,
1356        haystack: &[u8],
1357        at: usize,
1358    ) -> Result<Option<usize>, Self::Error> {
1359        (*self).shortest_match_at(haystack, at)
1360    }
1361
1362    #[inline]
1363    fn non_matching_bytes(&self) -> Option<&ByteSet> {
1364        (*self).non_matching_bytes()
1365    }
1366
1367    #[inline]
1368    fn line_terminator(&self) -> Option<LineTerminator> {
1369        (*self).line_terminator()
1370    }
1371
1372    #[inline]
1373    fn find_candidate_line(
1374        &self,
1375        haystack: &[u8],
1376    ) -> Result<Option<LineMatchKind>, Self::Error> {
1377        (*self).find_candidate_line(haystack)
1378    }
1379}