regex_cursor/
input.rs

1/*!
2Types and routines that support the search APIs of most regex engines.
3
4This sub-module isn't exposed directly, but rather, its contents are exported
5at the crate root due to the universality of most of the types and routines in
6this module.
7*/
8
9use std::ops::RangeBounds;
10
11use regex_automata::{Anchored, Span};
12
13use crate::cursor::{Cursor, IntoCursor};
14use crate::util::utf8::is_boundary;
15
/// Maximum number of bytes in a single UTF-8 encoded codepoint.
///
/// This sizes the look-around buffer of [`Input`], which has room for up to
/// this many bytes of look-behind followed by up to this many bytes of
/// look-ahead.
const MAX_CODEPOINT_LEN: usize = 4;
17
/// Search configuration and cursor state for a search over a chunked
/// haystack.
#[derive(Clone)]
pub struct Input<C: Cursor> {
    /// Anchor mode of the search (see `set_anchored` / `get_anchored`).
    anchored: Anchored,
    /// Whether the search runs in "earliest" mode (see `set_earliest`).
    earliest: bool,
    /// Position within the current chunk
    pub(crate) chunk_pos: usize,
    /// Span of the search, in absolute haystack offsets.
    span: Span,
    /// Span set through `slice_span`/`slice`; `ensure_look_behind` and
    /// `look_around` do not look outside of it. Defaults to `0..usize::MAX`.
    pub(crate) slice_span: Span,
    /// Number of valid look-behind bytes stored at the front of
    /// `look_around`.
    look_behind_len: usize,
    /// Look-around scratch buffer: the first `look_behind_len` bytes are the
    /// tail of a previously visited chunk (written by `set_look_behind`);
    /// `look_around()` appends up to `MAX_CODEPOINT_LEN` bytes from the start
    /// of the current chunk after them.
    look_around: [u8; MAX_CODEPOINT_LEN * 2],
    /// The cursor that yields the haystack chunk by chunk.
    cursor: C,
}
32
33impl<C: Cursor> Input<C> {
34    /// Create a new search configuration for the given cursor.
35    #[inline]
36    pub fn new<T: IntoCursor<Cursor = C>>(cursor: T) -> Self {
37        let cursor = cursor.into_cursor();
38        let end = cursor.total_bytes().unwrap_or(usize::MAX);
39        let start = cursor.offset();
40        Input {
41            anchored: Anchored::No,
42            earliest: false,
43            chunk_pos: 0,
44            cursor: cursor.into_cursor(),
45            // init with invalid utf8. We don't need to track
46            // which of these have been filed since we only look
47            // behind more than one byte in utf8 mode
48            look_around: [255; 8],
49            span: Span { start, end },
50            slice_span: Span { start: 0, end: usize::MAX },
51            look_behind_len: 0,
52        }
53    }
54
    /// Return a borrow of the current underlying chunk as a slice of bytes.
    ///
    /// Only the chunk the cursor currently points at is returned, not the
    /// whole haystack.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_cursor::Input;
    ///
    /// let input = Input::new("foobar");
    /// assert_eq!(b"foobar", input.chunk());
    /// ```
    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub fn chunk(&self) -> &[u8] {
        self.cursor.chunk()
    }
69
    /// Return the offset reported by the underlying cursor.
    ///
    /// The rest of this type treats this value as the absolute haystack
    /// offset at which the current chunk starts: the absolute position of the
    /// search is this offset plus the position within the current chunk.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub fn chunk_offset(&self) -> usize {
        self.cursor.offset()
    }
84
85    /// Return the start position of this search.
86    ///
87    /// This is a convenience routine for `search.get_span().start()`.
88    ///
89    /// When [`Input::is_done`] is `false`, this is guaranteed to return
90    /// an offset that is less than or equal to [`Input::end`]. Otherwise,
91    /// the offset is one greater than [`Input::end`].
92    ///
93    /// # Example
94    ///
95    /// ```
96    /// use regex_automata::Input;
97    ///
98    /// let input = Input::new("foobar");
99    /// assert_eq!(0, input.start());
100    ///
101    /// let input = Input::new("foobar").span(2..4);
102    /// assert_eq!(2, input.start());
103    /// ```
104    #[inline]
105    pub fn start(&self) -> usize {
106        self.get_span().start
107    }
108
109    #[inline]
110    pub fn clear_look_behind(&mut self) {
111        self.look_around = [255; 8];
112    }
113
    /// Return the end position of this search.
    ///
    /// This is a convenience routine for `input.get_span().end`.
    ///
    /// This is guaranteed to return an offset that is a valid exclusive end
    /// bound for this input's haystack.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_cursor::Input;
    ///
    /// let mut input = Input::new("foobar");
    /// assert_eq!(6, input.end());
    ///
    /// input.span(2..4);
    /// assert_eq!(4, input.end());
    /// ```
    #[inline]
    pub fn end(&self) -> usize {
        self.span.end
    }
136
137    #[inline(always)]
138    pub fn get_chunk_end(&self) -> usize {
139        let end = self.span.end - self.cursor.offset();
140        end.min(self.chunk().len())
141    }
142
    /// Return the span for this search configuration.
    ///
    /// If one was not explicitly set, then the span corresponds to the entire
    /// range of the haystack.
    ///
    /// When [`Input::is_done`] is `false`, the span returned is guaranteed
    /// to correspond to valid bounds for this input's haystack.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::Span;
    /// use regex_cursor::Input;
    ///
    /// let input = Input::new("foobar");
    /// assert_eq!(Span { start: 0, end: 6 }, input.get_span());
    /// ```
    #[inline]
    pub fn get_span(&self) -> Span {
        self.span
    }
163
164    #[cfg_attr(feature = "perf-inline", inline(always))]
165    pub(crate) fn set_look_behind(&mut self) {
166        #[cold]
167        fn copy_partial_look_behind(look_behind: &mut [u8; MAX_CODEPOINT_LEN * 2], chunk: &[u8]) {
168            look_behind[..chunk.len()].copy_from_slice(chunk)
169        }
170
171        let chunk = self.cursor.chunk();
172        let len = chunk.len();
173        if len < MAX_CODEPOINT_LEN {
174            copy_partial_look_behind(&mut self.look_around, chunk);
175            self.look_behind_len = chunk.len();
176        } else {
177            self.look_behind_len = MAX_CODEPOINT_LEN;
178            self.look_around[..MAX_CODEPOINT_LEN].copy_from_slice(&chunk[len - MAX_CODEPOINT_LEN..])
179        }
180    }
181
182    #[cfg_attr(feature = "perf-inline", inline(always))]
183    pub(crate) fn advance(&mut self) -> bool {
184        let old_len = self.cursor.chunk().len();
185        let advanced = self.cursor.advance();
186        if advanced {
187            self.chunk_pos = 0;
188        } else if self.span.end > self.cursor.offset() + old_len {
189            self.span.end = self.cursor.offset() + old_len;
190        }
191        advanced
192    }
193
    /// Like `advance`, but first records the tail of the current chunk as
    /// look-behind context via `set_look_behind`.
    ///
    /// Returns whatever `advance` returns.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub(crate) fn advance_with_look_behind(&mut self) -> bool {
        self.set_look_behind();
        self.advance()
    }
199
200    #[cfg_attr(feature = "perf-inline", inline(always))]
201    pub(crate) fn backtrack(&mut self) -> bool {
202        let backtracked = self.cursor.backtrack();
203        if backtracked {
204            self.chunk_pos = self.chunk().len();
205        } else if self.cursor.offset() != 0 {
206            unreachable!("cursor does not support backtracking {}", self.cursor.offset())
207        }
208        backtracked
209    }
210
211    #[cfg_attr(feature = "perf-inline", inline(always))]
212    pub(crate) fn ensure_look_behind(&mut self) -> Option<u8> {
213        let look_behind = if self.chunk_pos == 0 {
214            // move back to the last chunk to read the look behind
215            if self.slice_span.start != self.chunk_offset() && self.backtrack() {
216                self.advance_with_look_behind();
217                Some(self.look_around[self.look_behind_len - 1])
218            } else {
219                self.look_behind_len = 0;
220                None
221            }
222        } else if self.slice_span.start == self.chunk_offset() + self.chunk_pos {
223            None
224        } else {
225            self.chunk().get(self.chunk_pos - 1).copied()
226        };
227        look_behind
228    }
229
230    pub fn look_around(&mut self) -> (&[u8], usize) {
231        // TODO: cache look_ahead?
232
233        let mut chunk = self.cursor.chunk();
234        let end = chunk.len().min(self.slice_span.end - self.chunk_offset());
235        chunk = &chunk[..end];
236        if self.chunk_pos == 0 {
237            #[cold]
238            fn copy_partial_look_ahead(look_behind: &mut [u8], chunk: &[u8]) {
239                look_behind[..chunk.len()].copy_from_slice(chunk)
240            }
241
242            let look_around_len;
243            if chunk.len() < MAX_CODEPOINT_LEN {
244                look_around_len = self.look_behind_len + chunk.len();
245                copy_partial_look_ahead(&mut self.look_around[self.look_behind_len..], chunk);
246            } else {
247                look_around_len = self.look_behind_len + MAX_CODEPOINT_LEN;
248                self.look_around[self.look_behind_len..look_around_len]
249                    .copy_from_slice(&chunk[..MAX_CODEPOINT_LEN])
250            }
251            (&self.look_around[..look_around_len], self.look_behind_len)
252        } else {
253            (chunk, self.chunk_pos)
254        }
255    }
256
    /// Return the current position within the current chunk.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub(crate) fn chunk_pos(&self) -> usize {
        self.chunk_pos
    }
261
    /// Set the current position within the current chunk to `at`.
    ///
    /// The value is stored as given; no bounds check is performed here.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub(crate) fn set_chunk_pos(&mut self, at: usize) {
        self.chunk_pos = at;
    }
266
267    /// Sets the anchor mode of a search.
268    ///
269    /// When a search is anchored (so that's [`Anchored::Yes`] or
270    /// [`Anchored::Pattern`]), a match must begin at the start of a search.
271    /// When a search is not anchored (that's [`Anchored::No`]), regex engines
    /// will behave as if the pattern started with a `(?s-u:.)*?`. This prefix
273    /// permits a match to appear anywhere.
274    ///
275    /// By default, the anchored mode is [`Anchored::No`].
276    ///
277    /// **WARNING:** this is subtly different than using a `^` at the start of
278    /// your regex. A `^` forces a regex to match exclusively at the start of
279    /// a chunk, regardless of where you begin your search. In contrast,
280    /// anchoring a search will allow your regex to match anywhere in your
281    /// chunk, but the match must start at the beginning of a search.
282    ///
283    /// For example, consider the chunk `aba` and the following searches:
284    ///
285    /// 1. The regex `^a` is compiled with `Anchored::No` and searches `aba`
286    ///    starting at position `2`. Since `^` requires the match to start at
287    ///    the beginning of the chunk and `2 > 0`, no match is found.
288    /// 2. The regex `a` is compiled with `Anchored::Yes` and searches `aba`
289    ///    starting at position `2`. This reports a match at `[2, 3]` since
290    ///    the match starts where the search started. Since there is no `^`,
291    ///    there is no requirement for the match to start at the beginning of
292    ///    the chunk.
293    /// 3. The regex `a` is compiled with `Anchored::Yes` and searches `aba`
294    ///    starting at position `1`. Since `b` corresponds to position `1` and
295    ///    since the search is anchored, it finds no match. While the regex
296    ///    matches at other positions, configuring the search to be anchored
297    ///    requires that it only report a match that begins at the same offset
298    ///    as the beginning of the search.
299    /// 4. The regex `a` is compiled with `Anchored::No` and searches `aba`
    ///    starting at position `1`. Since the search is not anchored and
301    ///    the regex does not start with `^`, the search executes as if there
302    ///    is a `(?s:.)*?` prefix that permits it to match anywhere. Thus, it
303    ///    reports a match at `[2, 3]`.
304    ///
305    /// Note that the [`Anchored::Pattern`] mode is like `Anchored::Yes`,
306    /// except it only reports matches for a particular pattern.
307    ///
308    /// # Example
309    ///
310    /// This demonstrates the differences between an anchored search and
311    /// a pattern that begins with `^` (as described in the above warning
312    /// message).
313    ///
314    /// ```
315    /// use regex_automata::{
316    ///     nfa::thompson::pikevm::PikeVM,
317    ///     Anchored, Match, Input,
318    /// };
319    ///
320    /// let chunk = "aba";
321    ///
322    /// let re = PikeVM::new(r"^a")?;
323    /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
324    /// let input = Input::new(chunk).span(2..3).anchored(Anchored::No);
325    /// re.search(&mut cache, &input, &mut caps);
326    /// // No match is found because 2 is not the beginning of the chunk,
327    /// // which is what ^ requires.
328    /// assert_eq!(None, caps.get_match());
329    ///
330    /// let re = PikeVM::new(r"a")?;
331    /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
332    /// let input = Input::new(chunk).span(2..3).anchored(Anchored::Yes);
333    /// re.search(&mut cache, &input, &mut caps);
334    /// // An anchored search can still match anywhere in the chunk, it just
335    /// // must begin at the start of the search which is '2' in this case.
336    /// assert_eq!(Some(Match::must(0, 2..3)), caps.get_match());
337    ///
338    /// let re = PikeVM::new(r"a")?;
339    /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
340    /// let input = Input::new(chunk).span(1..3).anchored(Anchored::Yes);
341    /// re.search(&mut cache, &input, &mut caps);
342    /// // No match is found since we start searching at offset 1 which
343    /// // corresponds to 'b'. Since there is no '(?s:.)*?' prefix, no match
344    /// // is found.
345    /// assert_eq!(None, caps.get_match());
346    ///
347    /// let re = PikeVM::new(r"a")?;
348    /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
349    /// let input = Input::new(chunk).span(1..3).anchored(Anchored::No);
350    /// re.search(&mut cache, &input, &mut caps);
351    /// // Since anchored=no, an implicit '(?s:.)*?' prefix was added to the
352    /// // pattern. Even though the search starts at 'b', the 'match anything'
353    /// // prefix allows the search to match 'a'.
354    /// let expected = Some(Match::must(0, 2..3));
355    /// assert_eq!(expected, caps.get_match());
356    ///
357    /// # Ok::<(), Box<dyn std::error::Error>>(())
358    /// ```
359    #[inline]
360    pub fn anchored(&mut self, mode: Anchored) -> &mut Self {
361        self.set_anchored(mode);
362        self
363    }
364
365    /// Whether to execute an "earliest" search or not.
366    ///
367    /// When running a non-overlapping search, an "earliest" search will return
368    /// the match location as early as possible. For example, given a pattern
369    /// of `foo[0-9]+` and a chunk of `foo12345`, a normal leftmost search
370    /// will return `foo12345` as a match. But an "earliest" search for regex
371    /// engines that support "earliest" semantics will return `foo1` as a
372    /// match, since as soon as the first digit following `foo` is seen, it is
373    /// known to have found a match.
374    ///
375    /// Note that "earliest" semantics generally depend on the regex engine.
376    /// Different regex engines may determine there is a match at different
377    /// points. So there is no guarantee that "earliest" matches will always
378    /// return the same offsets for all regex engines. The "earliest" notion
379    /// is really about when the particular regex engine determines there is
380    /// a match rather than a consistent semantic unto itself. This is often
381    /// useful for implementing "did a match occur or not" predicates, but
382    /// sometimes the offset is useful as well.
383    ///
384    /// This is disabled by default.
385    ///
386    /// # Example
387    ///
388    /// This example shows the difference between "earliest" searching and
389    /// normal searching.
390    ///
391    /// ```
392    /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match, Input};
393    ///
394    /// let re = PikeVM::new(r"foo[0-9]+")?;
395    /// let mut cache = re.create_cache();
396    /// let mut caps = re.create_captures();
397    ///
398    /// // A normal search implements greediness like you expect.
399    /// let input = Input::new("foo12345");
400    /// re.search(&mut cache, &input, &mut caps);
401    /// assert_eq!(Some(Match::must(0, 0..8)), caps.get_match());
402    ///
403    /// // When 'earliest' is enabled and the regex engine supports
404    /// // it, the search will bail once it knows a match has been
405    /// // found.
406    /// let input = Input::new("foo12345").earliest(true);
407    /// re.search(&mut cache, &input, &mut caps);
408    /// assert_eq!(Some(Match::must(0, 0..4)), caps.get_match());
409    /// # Ok::<(), Box<dyn std::error::Error>>(())
410    /// ```
411    #[inline]
412    pub fn earliest(&mut self, yes: bool) -> &mut Self {
413        self.set_earliest(yes);
414        self
415    }
416
417    /// Set the anchor mode of a search.
418    ///
419    /// This is like [`Input::anchored`], except it mutates the search
420    /// configuration in place.
421    ///
422    /// # Example
423    ///
424    /// ```
425    /// use regex_automata::{Anchored, Input, PatternID};
426    ///
427    /// let mut input = Input::new("foobar");
428    /// assert_eq!(Anchored::No, input.get_anchored());
429    ///
430    /// let pid = PatternID::must(5);
431    /// input.set_anchored(Anchored::Pattern(pid));
432    /// assert_eq!(Anchored::Pattern(pid), input.get_anchored());
433    /// ```
    #[inline]
    pub fn set_anchored(&mut self, mode: Anchored) {
        // Plain field update; nothing else depends on the anchor mode here.
        self.anchored = mode;
    }
438
439    /// Set whether the search should execute in "earliest" mode or not.
440    ///
441    /// This is like [`Input::earliest`], except it mutates the search
442    /// configuration in place.
443    ///
444    /// # Example
445    ///
446    /// ```
447    /// use regex_automata::Input;
448    ///
449    /// let mut input = Input::new("foobar");
450    /// assert!(!input.get_earliest());
451    /// input.set_earliest(true);
452    /// assert!(input.get_earliest());
453    /// ```
    #[inline]
    pub fn set_earliest(&mut self, yes: bool) {
        // Plain field update; the flag is only read back by `get_earliest`.
        self.earliest = yes;
    }
458
459    /// Set the span for this search.
460    ///
461    /// This routine does not panic if the span given is not a valid range for
462    /// this search's haystack. If this search is run with an invalid range,
463    /// then the most likely outcome is that the actual search execution will
464    /// panic.
465    ///
466    /// This routine is generic over how a span is provided. While
467    /// a [`Span`] may be given directly, one may also provide a
468    /// `std::ops::Range<usize>`. To provide anything supported by range
469    /// syntax, use the [`Input::range`] method.
470    ///
471    /// The default span is the entire haystack.
472    ///
473    /// Note that [`Input::range`] overrides this method and vice versa.
474    ///
475    /// # Panics
476    ///
477    /// This panics if the given span does not correspond to valid bounds in
478    /// the haystack or the termination of a search.
479    ///
480    /// # Example
481    ///
482    /// This example shows how the span of the search can impact whether a
483    /// match is reported or not. This is particularly relevant for look-around
484    /// operators, which might take things outside of the span into account
485    /// when determining whether they match.
486    ///
487    /// ```
488    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
489    /// use regex_automata::{
490    ///     nfa::thompson::pikevm::PikeVM,
491    ///     Match, Input,
492    /// };
493    ///
494    /// // Look for 'at', but as a distinct word.
495    /// let re = PikeVM::new(r"\bat\b")?;
496    /// let mut cache = re.create_cache();
497    /// let mut caps = re.create_captures();
498    ///
499    /// // Our haystack contains 'at', but not as a distinct word.
500    /// let haystack = "batter";
501    ///
502    /// // A standard search finds nothing, as expected.
503    /// let input = Input::new(haystack);
504    /// re.search(&mut cache, &input, &mut caps);
505    /// assert_eq!(None, caps.get_match());
506    ///
507    /// // But if we wanted to search starting at position '1', we might
508    /// // slice the haystack. If we do this, it's impossible for the \b
509    /// // anchors to take the surrounding context into account! And thus,
510    /// // a match is produced.
511    /// let input = Input::new(&haystack[1..3]);
512    /// re.search(&mut cache, &input, &mut caps);
513    /// assert_eq!(Some(Match::must(0, 0..2)), caps.get_match());
514    ///
515    /// // But if we specify the span of the search instead of slicing the
516    /// // haystack, then the regex engine can "see" outside of the span
517    /// // and resolve the anchors correctly.
518    /// let input = Input::new(haystack).span(1..3);
519    /// re.search(&mut cache, &input, &mut caps);
520    /// assert_eq!(None, caps.get_match());
521    ///
522    /// # Ok::<(), Box<dyn std::error::Error>>(())
523    /// ```
524    ///
525    /// This may seem a little ham-fisted, but this scenario tends to come up
526    /// if some other regex engine found the match span and now you need to
527    /// re-process that span to look for capturing groups. (e.g., Run a faster
528    /// DFA first, find a match, then run the PikeVM on just the match span to
529    /// resolve capturing groups.) In order to implement that sort of logic
530    /// correctly, you need to set the span on the search instead of slicing
531    /// the haystack directly.
532    ///
533    /// The other advantage of using this routine to specify the bounds of the
534    /// search is that the match offsets are still reported in terms of the
535    /// original haystack. For example, the second search in the example above
536    /// reported a match at position `0`, even though `at` starts at offset
537    /// `1` because we sliced the haystack.
538    #[inline]
539    pub fn span<S: Into<Span>>(&mut self, span: S) -> &mut Input<C> {
540        self.set_span(span);
541        self
542    }
543
544    /// Set the starting offset for the span for this search configuration.
545    ///
546    /// This is a convenience routine for only mutating the start of a span
547    /// without having to set the entire span.
548    ///
549    /// # Panics
550    ///
551    /// This panics if the span resulting from the new start position does not
552    /// correspond to valid bounds in the haystack or the termination of a
553    /// search.
554    ///
555    #[inline]
556    pub fn set_start(&mut self, start: usize) {
557        self.set_span(Span { start, ..self.get_span() });
558    }
559
560    /// Set the ending offset for the span for this search configuration.
561    ///
562    /// This is a convenience routine for only mutating the end of a span
563    /// without having to set the entire span.
564    ///
565    /// # Panics
566    ///
567    /// This panics if the span resulting from the new end position does not
568    /// correspond to valid bounds in the haystack or the termination of a
569    /// search.
570    #[inline]
571    pub fn set_end(&mut self, end: usize) {
572        self.set_span(Span { end, ..self.get_span() });
573    }
574
575    /// Like `Input::span`, but accepts any range instead.
576    ///
577    /// This routine does not panic if the range given is not a valid range for
578    /// this search's haystack. If this search is run with an invalid range,
579    /// then the most likely outcome is that the actual search execution will
580    /// panic.
581    ///
582    /// The default range is the entire haystack.
583    ///
584    /// Note that [`Input::span`] overrides this method and vice versa.
585    ///
586    /// # Panics
587    ///
588    /// This routine will panic if the given range could not be converted
589    /// to a valid [`Range`]. For example, this would panic when given
590    /// `0..=usize::MAX` since it cannot be represented using a half-open
591    /// interval in terms of `usize`.
592    ///
593    /// This also panics if the given range does not correspond to valid bounds
594    /// in the haystack or the termination of a search.
595    ///
596    /// # Example
597    ///
598    /// ```
599    /// use regex_automata::Input;
600    ///
601    /// let input = Input::new("foobar");
602    /// assert_eq!(0..6, input.get_range());
603    ///
604    /// let input = Input::new("foobar").range(2..=4);
605    /// assert_eq!(2..5, input.get_range());
606    /// ```
    #[inline]
    pub fn range<R: RangeBounds<usize>>(mut self, range: R) -> Input<C> {
        // Unlike the `&mut self` builders such as `span`, this one takes the
        // input by value and returns it after applying `set_range`.
        self.set_range(range);
        self
    }
612
613    /// Set the span for this search configuration.
614    ///
615    /// This is like the [`Input::span`] method, except this mutates the
616    /// span in place.
617    ///
618    /// This routine is generic over how a span is provided. While
619    /// a [`Span`] may be given directly, one may also provide a
620    /// `std::ops::Range<usize>`.
621    ///
622    /// # Panics
623    ///
624    /// This panics if the given span does not correspond to valid bounds in
625    /// the haystack or the termination of a search.
626    ///
627    /// # Example
628    ///
629    /// ```
630    /// use regex_automata::Input;
631    ///
632    /// let mut input = Input::new("foobar");
633    /// assert_eq!(0..6, input.get_range());
634    /// input.set_span(2..4);
635    /// assert_eq!(2..4, input.get_range());
636    /// ```
637    #[inline]
638    pub fn set_span<S: Into<Span>>(&mut self, span: S) {
639        let span = span.into();
640        assert!(span.start <= span.end.saturating_add(1), "invalid span {:?}", span,);
641        if self.at() < span.start {
642            self.move_to(span.start);
643        } else if !self.is_done() && self.at() > span.end {
644            self.move_to(span.end);
645        }
646        self.span = span;
647    }
648
649    #[inline]
650    pub fn slice_span<S: Into<Span>>(&mut self, span: S) -> &mut Input<C> {
651        let span = span.into();
652        assert!(span.start <= span.end.saturating_add(1), "invalid span {:?}", span,);
653        if self.at() < span.start {
654            self.move_to(span.start);
655        } else if !self.is_done() && self.at() > span.end {
656            self.move_to(span.end);
657        }
658        self.slice_span = span;
659        self.span = span;
660        self
661    }
662
663    #[inline]
664    pub fn slice<R: RangeBounds<usize>>(&mut self, range: R) -> &mut Input<C> {
665        use core::ops::Bound;
666
667        // It's a little weird to convert ranges into spans, and then spans
668        // back into ranges when we actually slice the haystack. Because
669        // of that process, we always represent everything as a half-open
670        // internal. Therefore, handling things like m..=n is a little awkward.
671        let start = match range.start_bound() {
672            Bound::Included(&i) => i,
673            // Can this case ever happen? Range syntax doesn't support it...
674            Bound::Excluded(&i) => i.checked_add(1).unwrap(),
675            Bound::Unbounded => 0,
676        };
677        let end = match range.end_bound() {
678            Bound::Included(&i) => i.checked_add(1).unwrap(),
679            Bound::Excluded(&i) => i,
680            Bound::Unbounded => self.cursor.total_bytes().unwrap_or(usize::MAX),
681        };
682        self.slice_span(Span { start, end })
683    }
684
    /// Position this input at the absolute haystack offset `at`.
    ///
    /// The cursor is moved chunk by chunk, first backwards and then
    /// forwards, and the position within the resulting chunk is stored. If
    /// the cursor runs out of chunks while moving forwards, the position is
    /// clamped to the end of the last chunk.
    ///
    /// In debug builds this asserts that `at` is at most one past the end of
    /// the current span. Moving backwards goes through `backtrack`, which
    /// panics when the cursor cannot backtrack at a non-zero offset.
    #[inline]
    pub(crate) fn move_to(&mut self, at: usize) {
        debug_assert!(at <= self.span.end.saturating_add(1));
        // TODO: fastpath for O(log N) chunk jumping
        // Step back while the target lies before the current chunk.
        while at < self.cursor.offset() {
            self.backtrack();
        }
        // A target exactly at the start of the current chunk needs no
        // forward movement, even if that chunk is empty.
        if at != self.cursor.offset() {
            // Step forward while the target lies at or past the chunk's end.
            while at >= self.cursor.offset() + self.chunk().len() {
                let advanced = self.advance();
                if !advanced {
                    // Out of chunks: stay in the last chunk and clamp the
                    // position to its length.
                    let chunk_pos = (at - self.cursor.offset()).min(self.chunk().len());
                    self.set_chunk_pos(chunk_pos);
                    return;
                }
            }
        }
        // The target is inside the current chunk (or at its start).
        self.set_chunk_pos(at - self.cursor.offset());
    }
704
    /// Return the current absolute position: the cursor's offset plus the
    /// position within the current chunk.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub(crate) fn at(&self) -> usize {
        self.cursor.offset() + self.chunk_pos()
    }
709
710    #[cfg_attr(feature = "perf-inline", inline(always))]
711    pub(crate) fn with<T>(&mut self, f: impl FnOnce(&mut Self) -> T) -> T {
712        let anchored = self.anchored;
713        let earliest = self.earliest;
714        let span = self.span;
715        let res = f(self);
716        self.set_span(span);
717        self.set_earliest(earliest);
718        self.set_anchored(anchored);
719        res
720    }
721
722    // #[cfg_attr(feature = "perf-inline", inline(always))]
723    // pub(crate) fn try_clone(&self) -> Option<Input<C>> {
724    //     let res = Input {
725    //         cursor: self.cursor.try_clone()?,
726    //         anchored: self.anchored,
727    //         earliest: self.earliest,
728    //         offset: self.offset,
729    //         chunk_pos: self.chunk_pos,
730    //         span: self.span,
731    //         look_behind: self.look_behind,
732    //     };
733    //     Some(res)
734    // }
735
736    /// Set the span for this search configuration given any range.
737    ///
738    /// This is like the [`Input::range`] method, except this mutates the
739    /// span in place.
740    ///
741    /// This routine does not panic if the range given is not a valid range for
742    /// this search's haystack. If this search is run with an invalid range,
743    /// then the most likely outcome is that the actual search execution will
744    /// panic.
745    ///
746    /// # Panics
747    ///
748    /// This routine will panic if the given range could not be converted
749    /// to a valid [`Range`]. For example, this would panic when given
750    /// `0..=usize::MAX` since it cannot be represented using a half-open
751    /// interval in terms of `usize`.
752    ///
753    /// This also panics if the given span does not correspond to valid bounds
754    /// in the haystack or the termination of a search.
755    ///
756    /// # Example
757    ///
758    /// ```
759    /// use regex_automata::Input;
760    ///
761    /// let mut input = Input::new("foobar");
762    /// assert_eq!(0..6, input.get_range());
763    /// input.set_range(2..=4);
764    /// assert_eq!(2..5, input.get_range());
765    /// ```
766    #[inline]
767    pub fn set_range<R: RangeBounds<usize>>(&mut self, range: R) {
768        use core::ops::Bound;
769
770        // It's a little weird to convert ranges into spans, and then spans
771        // back into ranges when we actually slice the haystack. Because
772        // of that process, we always represent everything as a half-open
773        // internal. Therefore, handling things like m..=n is a little awkward.
774        let start = match range.start_bound() {
775            Bound::Included(&i) => i,
776            // Can this case ever happen? Range syntax doesn't support it...
777            Bound::Excluded(&i) => i.checked_add(1).unwrap(),
778            Bound::Unbounded => 0,
779        };
780        let end = match range.end_bound() {
781            Bound::Included(&i) => i.checked_add(1).unwrap(),
782            Bound::Excluded(&i) => i,
783            Bound::Unbounded => self.cursor.total_bytes().unwrap_or(usize::MAX),
784        };
785        self.set_span(Span { start, end });
786    }
787
788    /// Return the anchored mode for this search configuration.
789    ///
790    /// If no anchored mode was set, then it defaults to [`Anchored::No`].
791    ///
792    /// # Example
793    ///
794    /// ```
795    /// use regex_automata::{Anchored, Input, PatternID};
796    ///
797    /// let mut input = Input::new("foobar");
798    /// assert_eq!(Anchored::No, input.get_anchored());
799    ///
800    /// let pid = PatternID::must(5);
801    /// input.set_anchored(Anchored::Pattern(pid));
802    /// assert_eq!(Anchored::Pattern(pid), input.get_anchored());
803    /// ```
    #[inline]
    pub fn get_anchored(&self) -> Anchored {
        // Returned by value; `new` initialises this to `Anchored::No`.
        self.anchored
    }
808
809    /// Return whether this search should execute in "earliest" mode.
810    ///
811    /// # Example
812    ///
813    /// ```
814    /// use regex_automata::Input;
815    ///
816    /// let input = Input::new("foobar");
817    /// assert!(!input.get_earliest());
818    /// ```
    #[inline]
    pub fn get_earliest(&self) -> bool {
        // `new` initialises this to `false`.
        self.earliest
    }
823
824    /// Return true if and only if this search can never return any other
825    /// matches.
826    ///
827    /// This occurs when the start position of this search is greater than the
828    /// end position of the search.
829    ///
830    /// # Example
831    ///
832    /// ```
833    /// use regex_automata::Input;
834    ///
835    /// let mut input = Input::new("foobar");
836    /// assert!(!input.is_done());
837    /// input.set_start(6);
838    /// assert!(!input.is_done());
839    /// input.set_start(7);
840    /// assert!(input.is_done());
841    /// ```
842    #[inline]
843    pub fn is_done(&self) -> bool {
844        self.get_span().start > self.get_span().end
845    }
846
    /// Returns true if and only if the current position within this
    /// search's current chunk falls on a valid UTF-8 encoded codepoint
    /// boundary.
    ///
    /// The check is delegated to `is_boundary`, which is given the current
    /// chunk and the position within it; no offset is taken as an argument.
    ///
    /// If the chunk is not valid UTF-8, then the behavior of this routine
    /// is unspecified.
    ///
    /// NOTE(review): the example previously shown here called this method
    /// with an explicit offset, which does not match this signature, so it
    /// was removed. Only the current chunk is inspected; confirm whether a
    /// position at the very start of a chunk needs the look-behind bytes.
    /// The body does not mutate `self`, so `&mut self` looks stricter than
    /// necessary.
    #[inline]
    pub fn is_char_boundary(&mut self) -> bool {
        is_boundary(self.chunk(), self.chunk_pos)
    }
872}
873
874impl<C: Cursor> core::fmt::Debug for Input<C> {
875    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
876        use regex_automata::util::escape::DebugHaystack;
877
878        f.debug_struct("Input")
879            .field("chunk", &DebugHaystack(self.chunk()))
880            .field("anchored", &self.anchored)
881            .field("earliest", &self.earliest)
882            .field("chunk_pos", &self.chunk_pos)
883            .field("chunk_offset", &self.cursor.offset())
884            .field("span", &self.span)
885            .finish()
886    }
887}
regex_cursor/input.rs

regex_cursor/
input.rs