regex_cursor/input.rs
1/*!
2Types and routines that support the search APIs of most regex engines.
3
4This sub-module isn't exposed directly, but rather, its contents are exported
5at the crate root due to the universality of most of the types and routines in
6this module.
7*/
8
9use std::ops::RangeBounds;
10
11use regex_automata::{Anchored, Span};
12
13use crate::cursor::{Cursor, IntoCursor};
14use crate::util::utf8::is_boundary;
15
16const MAX_CODEPOINT_LEN: usize = 4;
17
18#[derive(Clone)]
19pub struct Input<C: Cursor> {
20 // span: Span,
21 anchored: Anchored,
22 earliest: bool,
23 /// Position within the current chunk
24 pub(crate) chunk_pos: usize,
25 span: Span,
26 pub(crate) slice_span: Span,
27 look_behind_len: usize,
28 /// the last 4 bytes before the current chunk
29 look_around: [u8; MAX_CODEPOINT_LEN * 2],
30 cursor: C,
31}
32
33impl<C: Cursor> Input<C> {
34 /// Create a new search configuration for the given cursor.
35 #[inline]
36 pub fn new<T: IntoCursor<Cursor = C>>(cursor: T) -> Self {
37 let cursor = cursor.into_cursor();
38 let end = cursor.total_bytes().unwrap_or(usize::MAX);
39 let start = cursor.offset();
40 Input {
41 anchored: Anchored::No,
42 earliest: false,
43 chunk_pos: 0,
44 cursor: cursor.into_cursor(),
45 // init with invalid utf8. We don't need to track
46 // which of these have been filed since we only look
47 // behind more than one byte in utf8 mode
48 look_around: [255; 8],
49 span: Span { start, end },
50 slice_span: Span { start: 0, end: usize::MAX },
51 look_behind_len: 0,
52 }
53 }
54
55 /// Return a borrow of the current underlying chunk as a slice of bytes.
56 ///
57 /// # Example
58 ///
59 /// ```
60 /// use regex_cursor::Input;
61 ///
62 /// let input = Input::new("foobar");
63 /// assert_eq!(b"foobar", input.chunk());
64 /// ```
65 #[cfg_attr(feature = "perf-inline", inline(always))]
66 pub fn chunk(&self) -> &[u8] {
67 self.cursor.chunk()
68 }
69
70 /// Return a borrow of the current underlying chunk as a slice of bytes.
71 ///
72 /// # Example
73 ///
74 /// ```
75 /// use regex_cursor::Input;
76 ///
77 /// let input = Input::new("foobar");
78 /// assert_eq!(b"foobar", input.chunk());
79 /// ```
80 #[cfg_attr(feature = "perf-inline", inline(always))]
81 pub fn chunk_offset(&self) -> usize {
82 self.cursor.offset()
83 }
84
85 /// Return the start position of this search.
86 ///
87 /// This is a convenience routine for `search.get_span().start()`.
88 ///
89 /// When [`Input::is_done`] is `false`, this is guaranteed to return
90 /// an offset that is less than or equal to [`Input::end`]. Otherwise,
91 /// the offset is one greater than [`Input::end`].
92 ///
93 /// # Example
94 ///
95 /// ```
96 /// use regex_automata::Input;
97 ///
98 /// let input = Input::new("foobar");
99 /// assert_eq!(0, input.start());
100 ///
101 /// let input = Input::new("foobar").span(2..4);
102 /// assert_eq!(2, input.start());
103 /// ```
104 #[inline]
105 pub fn start(&self) -> usize {
106 self.get_span().start
107 }
108
109 #[inline]
110 pub fn clear_look_behind(&mut self) {
111 self.look_around = [255; 8];
112 }
113
114 /// Return the end position of this search.
115 ///
116 /// This is a convenience routine for `search.get_span().end()`.
117 ///
118 /// This is guaranteed to return an offset that is a valid exclusive end
119 /// bound for this input's haystack.
120 ///
121 /// # Example
122 ///
123 /// ```
124 /// use regex_automata::Input;
125 ///
126 /// let input = Input::new("foobar");
127 /// assert_eq!(6, input.end());
128 ///
129 /// let input = Input::new("foobar").span(2..4);
130 /// assert_eq!(4, input.end());
131 /// ```
132 #[inline]
133 pub fn end(&self) -> usize {
134 self.span.end
135 }
136
137 #[inline(always)]
138 pub fn get_chunk_end(&self) -> usize {
139 let end = self.span.end - self.cursor.offset();
140 end.min(self.chunk().len())
141 }
142
143 /// Return the span for this search configuration.
144 ///
145 /// If one was not explicitly set, then the span corresponds to the entire
146 /// range of the haystack.
147 ///
148 /// When [`Input::is_done`] is `false`, the span returned is guaranteed
149 /// to correspond to valid bounds for this input's haystack.
150 ///
151 /// # Example
152 ///
153 /// ```
154 /// use regex_automata::{Input, Span};
155 ///
156 /// let input = Input::new("foobar");
157 /// assert_eq!(Span { start: 0, end: 6 }, input.get_span());
158 /// ```
159 #[inline]
160 pub fn get_span(&self) -> Span {
161 self.span
162 }
163
164 #[cfg_attr(feature = "perf-inline", inline(always))]
165 pub(crate) fn set_look_behind(&mut self) {
166 #[cold]
167 fn copy_partial_look_behind(look_behind: &mut [u8; MAX_CODEPOINT_LEN * 2], chunk: &[u8]) {
168 look_behind[..chunk.len()].copy_from_slice(chunk)
169 }
170
171 let chunk = self.cursor.chunk();
172 let len = chunk.len();
173 if len < MAX_CODEPOINT_LEN {
174 copy_partial_look_behind(&mut self.look_around, chunk);
175 self.look_behind_len = chunk.len();
176 } else {
177 self.look_behind_len = MAX_CODEPOINT_LEN;
178 self.look_around[..MAX_CODEPOINT_LEN].copy_from_slice(&chunk[len - MAX_CODEPOINT_LEN..])
179 }
180 }
181
182 #[cfg_attr(feature = "perf-inline", inline(always))]
183 pub(crate) fn advance(&mut self) -> bool {
184 let old_len = self.cursor.chunk().len();
185 let advanced = self.cursor.advance();
186 if advanced {
187 self.chunk_pos = 0;
188 } else if self.span.end > self.cursor.offset() + old_len {
189 self.span.end = self.cursor.offset() + old_len;
190 }
191 advanced
192 }
193
194 #[cfg_attr(feature = "perf-inline", inline(always))]
195 pub(crate) fn advance_with_look_behind(&mut self) -> bool {
196 self.set_look_behind();
197 self.advance()
198 }
199
200 #[cfg_attr(feature = "perf-inline", inline(always))]
201 pub(crate) fn backtrack(&mut self) -> bool {
202 let backtracked = self.cursor.backtrack();
203 if backtracked {
204 self.chunk_pos = self.chunk().len();
205 } else if self.cursor.offset() != 0 {
206 unreachable!("cursor does not support backtracking {}", self.cursor.offset())
207 }
208 backtracked
209 }
210
211 #[cfg_attr(feature = "perf-inline", inline(always))]
212 pub(crate) fn ensure_look_behind(&mut self) -> Option<u8> {
213 let look_behind = if self.chunk_pos == 0 {
214 // move back to the last chunk to read the look behind
215 if self.slice_span.start != self.chunk_offset() && self.backtrack() {
216 self.advance_with_look_behind();
217 Some(self.look_around[self.look_behind_len - 1])
218 } else {
219 self.look_behind_len = 0;
220 None
221 }
222 } else if self.slice_span.start == self.chunk_offset() + self.chunk_pos {
223 None
224 } else {
225 self.chunk().get(self.chunk_pos - 1).copied()
226 };
227 look_behind
228 }
229
230 pub fn look_around(&mut self) -> (&[u8], usize) {
231 // TODO: cache look_ahead?
232
233 let mut chunk = self.cursor.chunk();
234 let end = chunk.len().min(self.slice_span.end - self.chunk_offset());
235 chunk = &chunk[..end];
236 if self.chunk_pos == 0 {
237 #[cold]
238 fn copy_partial_look_ahead(look_behind: &mut [u8], chunk: &[u8]) {
239 look_behind[..chunk.len()].copy_from_slice(chunk)
240 }
241
242 let look_around_len;
243 if chunk.len() < MAX_CODEPOINT_LEN {
244 look_around_len = self.look_behind_len + chunk.len();
245 copy_partial_look_ahead(&mut self.look_around[self.look_behind_len..], chunk);
246 } else {
247 look_around_len = self.look_behind_len + MAX_CODEPOINT_LEN;
248 self.look_around[self.look_behind_len..look_around_len]
249 .copy_from_slice(&chunk[..MAX_CODEPOINT_LEN])
250 }
251 (&self.look_around[..look_around_len], self.look_behind_len)
252 } else {
253 (chunk, self.chunk_pos)
254 }
255 }
256
257 #[cfg_attr(feature = "perf-inline", inline(always))]
258 pub(crate) fn chunk_pos(&self) -> usize {
259 self.chunk_pos
260 }
261
262 #[cfg_attr(feature = "perf-inline", inline(always))]
263 pub(crate) fn set_chunk_pos(&mut self, at: usize) {
264 self.chunk_pos = at;
265 }
266
267 /// Sets the anchor mode of a search.
268 ///
269 /// When a search is anchored (so that's [`Anchored::Yes`] or
270 /// [`Anchored::Pattern`]), a match must begin at the start of a search.
271 /// When a search is not anchored (that's [`Anchored::No`]), regex engines
272 /// will behave as if the pattern started with a `(?:s-u.)*?`. This prefix
273 /// permits a match to appear anywhere.
274 ///
275 /// By default, the anchored mode is [`Anchored::No`].
276 ///
277 /// **WARNING:** this is subtly different than using a `^` at the start of
278 /// your regex. A `^` forces a regex to match exclusively at the start of
279 /// a chunk, regardless of where you begin your search. In contrast,
280 /// anchoring a search will allow your regex to match anywhere in your
281 /// chunk, but the match must start at the beginning of a search.
282 ///
283 /// For example, consider the chunk `aba` and the following searches:
284 ///
285 /// 1. The regex `^a` is compiled with `Anchored::No` and searches `aba`
286 /// starting at position `2`. Since `^` requires the match to start at
287 /// the beginning of the chunk and `2 > 0`, no match is found.
288 /// 2. The regex `a` is compiled with `Anchored::Yes` and searches `aba`
289 /// starting at position `2`. This reports a match at `[2, 3]` since
290 /// the match starts where the search started. Since there is no `^`,
291 /// there is no requirement for the match to start at the beginning of
292 /// the chunk.
293 /// 3. The regex `a` is compiled with `Anchored::Yes` and searches `aba`
294 /// starting at position `1`. Since `b` corresponds to position `1` and
295 /// since the search is anchored, it finds no match. While the regex
296 /// matches at other positions, configuring the search to be anchored
297 /// requires that it only report a match that begins at the same offset
298 /// as the beginning of the search.
299 /// 4. The regex `a` is compiled with `Anchored::No` and searches `aba`
300 /// startting at position `1`. Since the search is not anchored and
301 /// the regex does not start with `^`, the search executes as if there
302 /// is a `(?s:.)*?` prefix that permits it to match anywhere. Thus, it
303 /// reports a match at `[2, 3]`.
304 ///
305 /// Note that the [`Anchored::Pattern`] mode is like `Anchored::Yes`,
306 /// except it only reports matches for a particular pattern.
307 ///
308 /// # Example
309 ///
310 /// This demonstrates the differences between an anchored search and
311 /// a pattern that begins with `^` (as described in the above warning
312 /// message).
313 ///
314 /// ```
315 /// use regex_automata::{
316 /// nfa::thompson::pikevm::PikeVM,
317 /// Anchored, Match, Input,
318 /// };
319 ///
320 /// let chunk = "aba";
321 ///
322 /// let re = PikeVM::new(r"^a")?;
323 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
324 /// let input = Input::new(chunk).span(2..3).anchored(Anchored::No);
325 /// re.search(&mut cache, &input, &mut caps);
326 /// // No match is found because 2 is not the beginning of the chunk,
327 /// // which is what ^ requires.
328 /// assert_eq!(None, caps.get_match());
329 ///
330 /// let re = PikeVM::new(r"a")?;
331 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
332 /// let input = Input::new(chunk).span(2..3).anchored(Anchored::Yes);
333 /// re.search(&mut cache, &input, &mut caps);
334 /// // An anchored search can still match anywhere in the chunk, it just
335 /// // must begin at the start of the search which is '2' in this case.
336 /// assert_eq!(Some(Match::must(0, 2..3)), caps.get_match());
337 ///
338 /// let re = PikeVM::new(r"a")?;
339 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
340 /// let input = Input::new(chunk).span(1..3).anchored(Anchored::Yes);
341 /// re.search(&mut cache, &input, &mut caps);
342 /// // No match is found since we start searching at offset 1 which
343 /// // corresponds to 'b'. Since there is no '(?s:.)*?' prefix, no match
344 /// // is found.
345 /// assert_eq!(None, caps.get_match());
346 ///
347 /// let re = PikeVM::new(r"a")?;
348 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
349 /// let input = Input::new(chunk).span(1..3).anchored(Anchored::No);
350 /// re.search(&mut cache, &input, &mut caps);
351 /// // Since anchored=no, an implicit '(?s:.)*?' prefix was added to the
352 /// // pattern. Even though the search starts at 'b', the 'match anything'
353 /// // prefix allows the search to match 'a'.
354 /// let expected = Some(Match::must(0, 2..3));
355 /// assert_eq!(expected, caps.get_match());
356 ///
357 /// # Ok::<(), Box<dyn std::error::Error>>(())
358 /// ```
359 #[inline]
360 pub fn anchored(&mut self, mode: Anchored) -> &mut Self {
361 self.set_anchored(mode);
362 self
363 }
364
365 /// Whether to execute an "earliest" search or not.
366 ///
367 /// When running a non-overlapping search, an "earliest" search will return
368 /// the match location as early as possible. For example, given a pattern
369 /// of `foo[0-9]+` and a chunk of `foo12345`, a normal leftmost search
370 /// will return `foo12345` as a match. But an "earliest" search for regex
371 /// engines that support "earliest" semantics will return `foo1` as a
372 /// match, since as soon as the first digit following `foo` is seen, it is
373 /// known to have found a match.
374 ///
375 /// Note that "earliest" semantics generally depend on the regex engine.
376 /// Different regex engines may determine there is a match at different
377 /// points. So there is no guarantee that "earliest" matches will always
378 /// return the same offsets for all regex engines. The "earliest" notion
379 /// is really about when the particular regex engine determines there is
380 /// a match rather than a consistent semantic unto itself. This is often
381 /// useful for implementing "did a match occur or not" predicates, but
382 /// sometimes the offset is useful as well.
383 ///
384 /// This is disabled by default.
385 ///
386 /// # Example
387 ///
388 /// This example shows the difference between "earliest" searching and
389 /// normal searching.
390 ///
391 /// ```
392 /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match, Input};
393 ///
394 /// let re = PikeVM::new(r"foo[0-9]+")?;
395 /// let mut cache = re.create_cache();
396 /// let mut caps = re.create_captures();
397 ///
398 /// // A normal search implements greediness like you expect.
399 /// let input = Input::new("foo12345");
400 /// re.search(&mut cache, &input, &mut caps);
401 /// assert_eq!(Some(Match::must(0, 0..8)), caps.get_match());
402 ///
403 /// // When 'earliest' is enabled and the regex engine supports
404 /// // it, the search will bail once it knows a match has been
405 /// // found.
406 /// let input = Input::new("foo12345").earliest(true);
407 /// re.search(&mut cache, &input, &mut caps);
408 /// assert_eq!(Some(Match::must(0, 0..4)), caps.get_match());
409 /// # Ok::<(), Box<dyn std::error::Error>>(())
410 /// ```
411 #[inline]
412 pub fn earliest(&mut self, yes: bool) -> &mut Self {
413 self.set_earliest(yes);
414 self
415 }
416
417 /// Set the anchor mode of a search.
418 ///
419 /// This is like [`Input::anchored`], except it mutates the search
420 /// configuration in place.
421 ///
422 /// # Example
423 ///
424 /// ```
425 /// use regex_automata::{Anchored, Input, PatternID};
426 ///
427 /// let mut input = Input::new("foobar");
428 /// assert_eq!(Anchored::No, input.get_anchored());
429 ///
430 /// let pid = PatternID::must(5);
431 /// input.set_anchored(Anchored::Pattern(pid));
432 /// assert_eq!(Anchored::Pattern(pid), input.get_anchored());
433 /// ```
434 #[inline]
435 pub fn set_anchored(&mut self, mode: Anchored) {
436 self.anchored = mode;
437 }
438
439 /// Set whether the search should execute in "earliest" mode or not.
440 ///
441 /// This is like [`Input::earliest`], except it mutates the search
442 /// configuration in place.
443 ///
444 /// # Example
445 ///
446 /// ```
447 /// use regex_automata::Input;
448 ///
449 /// let mut input = Input::new("foobar");
450 /// assert!(!input.get_earliest());
451 /// input.set_earliest(true);
452 /// assert!(input.get_earliest());
453 /// ```
454 #[inline]
455 pub fn set_earliest(&mut self, yes: bool) {
456 self.earliest = yes;
457 }
458
459 /// Set the span for this search.
460 ///
461 /// This routine does not panic if the span given is not a valid range for
462 /// this search's haystack. If this search is run with an invalid range,
463 /// then the most likely outcome is that the actual search execution will
464 /// panic.
465 ///
466 /// This routine is generic over how a span is provided. While
467 /// a [`Span`] may be given directly, one may also provide a
468 /// `std::ops::Range<usize>`. To provide anything supported by range
469 /// syntax, use the [`Input::range`] method.
470 ///
471 /// The default span is the entire haystack.
472 ///
473 /// Note that [`Input::range`] overrides this method and vice versa.
474 ///
475 /// # Panics
476 ///
477 /// This panics if the given span does not correspond to valid bounds in
478 /// the haystack or the termination of a search.
479 ///
480 /// # Example
481 ///
482 /// This example shows how the span of the search can impact whether a
483 /// match is reported or not. This is particularly relevant for look-around
484 /// operators, which might take things outside of the span into account
485 /// when determining whether they match.
486 ///
487 /// ```
488 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
489 /// use regex_automata::{
490 /// nfa::thompson::pikevm::PikeVM,
491 /// Match, Input,
492 /// };
493 ///
494 /// // Look for 'at', but as a distinct word.
495 /// let re = PikeVM::new(r"\bat\b")?;
496 /// let mut cache = re.create_cache();
497 /// let mut caps = re.create_captures();
498 ///
499 /// // Our haystack contains 'at', but not as a distinct word.
500 /// let haystack = "batter";
501 ///
502 /// // A standard search finds nothing, as expected.
503 /// let input = Input::new(haystack);
504 /// re.search(&mut cache, &input, &mut caps);
505 /// assert_eq!(None, caps.get_match());
506 ///
507 /// // But if we wanted to search starting at position '1', we might
508 /// // slice the haystack. If we do this, it's impossible for the \b
509 /// // anchors to take the surrounding context into account! And thus,
510 /// // a match is produced.
511 /// let input = Input::new(&haystack[1..3]);
512 /// re.search(&mut cache, &input, &mut caps);
513 /// assert_eq!(Some(Match::must(0, 0..2)), caps.get_match());
514 ///
515 /// // But if we specify the span of the search instead of slicing the
516 /// // haystack, then the regex engine can "see" outside of the span
517 /// // and resolve the anchors correctly.
518 /// let input = Input::new(haystack).span(1..3);
519 /// re.search(&mut cache, &input, &mut caps);
520 /// assert_eq!(None, caps.get_match());
521 ///
522 /// # Ok::<(), Box<dyn std::error::Error>>(())
523 /// ```
524 ///
525 /// This may seem a little ham-fisted, but this scenario tends to come up
526 /// if some other regex engine found the match span and now you need to
527 /// re-process that span to look for capturing groups. (e.g., Run a faster
528 /// DFA first, find a match, then run the PikeVM on just the match span to
529 /// resolve capturing groups.) In order to implement that sort of logic
530 /// correctly, you need to set the span on the search instead of slicing
531 /// the haystack directly.
532 ///
533 /// The other advantage of using this routine to specify the bounds of the
534 /// search is that the match offsets are still reported in terms of the
535 /// original haystack. For example, the second search in the example above
536 /// reported a match at position `0`, even though `at` starts at offset
537 /// `1` because we sliced the haystack.
538 #[inline]
539 pub fn span<S: Into<Span>>(&mut self, span: S) -> &mut Input<C> {
540 self.set_span(span);
541 self
542 }
543
544 /// Set the starting offset for the span for this search configuration.
545 ///
546 /// This is a convenience routine for only mutating the start of a span
547 /// without having to set the entire span.
548 ///
549 /// # Panics
550 ///
551 /// This panics if the span resulting from the new start position does not
552 /// correspond to valid bounds in the haystack or the termination of a
553 /// search.
554 ///
555 #[inline]
556 pub fn set_start(&mut self, start: usize) {
557 self.set_span(Span { start, ..self.get_span() });
558 }
559
560 /// Set the ending offset for the span for this search configuration.
561 ///
562 /// This is a convenience routine for only mutating the end of a span
563 /// without having to set the entire span.
564 ///
565 /// # Panics
566 ///
567 /// This panics if the span resulting from the new end position does not
568 /// correspond to valid bounds in the haystack or the termination of a
569 /// search.
570 #[inline]
571 pub fn set_end(&mut self, end: usize) {
572 self.set_span(Span { end, ..self.get_span() });
573 }
574
575 /// Like `Input::span`, but accepts any range instead.
576 ///
577 /// This routine does not panic if the range given is not a valid range for
578 /// this search's haystack. If this search is run with an invalid range,
579 /// then the most likely outcome is that the actual search execution will
580 /// panic.
581 ///
582 /// The default range is the entire haystack.
583 ///
584 /// Note that [`Input::span`] overrides this method and vice versa.
585 ///
586 /// # Panics
587 ///
588 /// This routine will panic if the given range could not be converted
589 /// to a valid [`Range`]. For example, this would panic when given
590 /// `0..=usize::MAX` since it cannot be represented using a half-open
591 /// interval in terms of `usize`.
592 ///
593 /// This also panics if the given range does not correspond to valid bounds
594 /// in the haystack or the termination of a search.
595 ///
596 /// # Example
597 ///
598 /// ```
599 /// use regex_automata::Input;
600 ///
601 /// let input = Input::new("foobar");
602 /// assert_eq!(0..6, input.get_range());
603 ///
604 /// let input = Input::new("foobar").range(2..=4);
605 /// assert_eq!(2..5, input.get_range());
606 /// ```
607 #[inline]
608 pub fn range<R: RangeBounds<usize>>(mut self, range: R) -> Input<C> {
609 self.set_range(range);
610 self
611 }
612
613 /// Set the span for this search configuration.
614 ///
615 /// This is like the [`Input::span`] method, except this mutates the
616 /// span in place.
617 ///
618 /// This routine is generic over how a span is provided. While
619 /// a [`Span`] may be given directly, one may also provide a
620 /// `std::ops::Range<usize>`.
621 ///
622 /// # Panics
623 ///
624 /// This panics if the given span does not correspond to valid bounds in
625 /// the haystack or the termination of a search.
626 ///
627 /// # Example
628 ///
629 /// ```
630 /// use regex_automata::Input;
631 ///
632 /// let mut input = Input::new("foobar");
633 /// assert_eq!(0..6, input.get_range());
634 /// input.set_span(2..4);
635 /// assert_eq!(2..4, input.get_range());
636 /// ```
637 #[inline]
638 pub fn set_span<S: Into<Span>>(&mut self, span: S) {
639 let span = span.into();
640 assert!(span.start <= span.end.saturating_add(1), "invalid span {:?}", span,);
641 if self.at() < span.start {
642 self.move_to(span.start);
643 } else if !self.is_done() && self.at() > span.end {
644 self.move_to(span.end);
645 }
646 self.span = span;
647 }
648
649 #[inline]
650 pub fn slice_span<S: Into<Span>>(&mut self, span: S) -> &mut Input<C> {
651 let span = span.into();
652 assert!(span.start <= span.end.saturating_add(1), "invalid span {:?}", span,);
653 if self.at() < span.start {
654 self.move_to(span.start);
655 } else if !self.is_done() && self.at() > span.end {
656 self.move_to(span.end);
657 }
658 self.slice_span = span;
659 self.span = span;
660 self
661 }
662
663 #[inline]
664 pub fn slice<R: RangeBounds<usize>>(&mut self, range: R) -> &mut Input<C> {
665 use core::ops::Bound;
666
667 // It's a little weird to convert ranges into spans, and then spans
668 // back into ranges when we actually slice the haystack. Because
669 // of that process, we always represent everything as a half-open
670 // internal. Therefore, handling things like m..=n is a little awkward.
671 let start = match range.start_bound() {
672 Bound::Included(&i) => i,
673 // Can this case ever happen? Range syntax doesn't support it...
674 Bound::Excluded(&i) => i.checked_add(1).unwrap(),
675 Bound::Unbounded => 0,
676 };
677 let end = match range.end_bound() {
678 Bound::Included(&i) => i.checked_add(1).unwrap(),
679 Bound::Excluded(&i) => i,
680 Bound::Unbounded => self.cursor.total_bytes().unwrap_or(usize::MAX),
681 };
682 self.slice_span(Span { start, end })
683 }
684
685 #[inline]
686 pub(crate) fn move_to(&mut self, at: usize) {
687 debug_assert!(at <= self.span.end.saturating_add(1));
688 // TODO: fastpath for O(log N) chunk jumping
689 while at < self.cursor.offset() {
690 self.backtrack();
691 }
692 if at != self.cursor.offset() {
693 while at >= self.cursor.offset() + self.chunk().len() {
694 let advanced = self.advance();
695 if !advanced {
696 let chunk_pos = (at - self.cursor.offset()).min(self.chunk().len());
697 self.set_chunk_pos(chunk_pos);
698 return;
699 }
700 }
701 }
702 self.set_chunk_pos(at - self.cursor.offset());
703 }
704
705 #[cfg_attr(feature = "perf-inline", inline(always))]
706 pub(crate) fn at(&self) -> usize {
707 self.cursor.offset() + self.chunk_pos()
708 }
709
710 #[cfg_attr(feature = "perf-inline", inline(always))]
711 pub(crate) fn with<T>(&mut self, f: impl FnOnce(&mut Self) -> T) -> T {
712 let anchored = self.anchored;
713 let earliest = self.earliest;
714 let span = self.span;
715 let res = f(self);
716 self.set_span(span);
717 self.set_earliest(earliest);
718 self.set_anchored(anchored);
719 res
720 }
721
722 // #[cfg_attr(feature = "perf-inline", inline(always))]
723 // pub(crate) fn try_clone(&self) -> Option<Input<C>> {
724 // let res = Input {
725 // cursor: self.cursor.try_clone()?,
726 // anchored: self.anchored,
727 // earliest: self.earliest,
728 // offset: self.offset,
729 // chunk_pos: self.chunk_pos,
730 // span: self.span,
731 // look_behind: self.look_behind,
732 // };
733 // Some(res)
734 // }
735
736 /// Set the span for this search configuration given any range.
737 ///
738 /// This is like the [`Input::range`] method, except this mutates the
739 /// span in place.
740 ///
741 /// This routine does not panic if the range given is not a valid range for
742 /// this search's haystack. If this search is run with an invalid range,
743 /// then the most likely outcome is that the actual search execution will
744 /// panic.
745 ///
746 /// # Panics
747 ///
748 /// This routine will panic if the given range could not be converted
749 /// to a valid [`Range`]. For example, this would panic when given
750 /// `0..=usize::MAX` since it cannot be represented using a half-open
751 /// interval in terms of `usize`.
752 ///
753 /// This also panics if the given span does not correspond to valid bounds
754 /// in the haystack or the termination of a search.
755 ///
756 /// # Example
757 ///
758 /// ```
759 /// use regex_automata::Input;
760 ///
761 /// let mut input = Input::new("foobar");
762 /// assert_eq!(0..6, input.get_range());
763 /// input.set_range(2..=4);
764 /// assert_eq!(2..5, input.get_range());
765 /// ```
766 #[inline]
767 pub fn set_range<R: RangeBounds<usize>>(&mut self, range: R) {
768 use core::ops::Bound;
769
770 // It's a little weird to convert ranges into spans, and then spans
771 // back into ranges when we actually slice the haystack. Because
772 // of that process, we always represent everything as a half-open
773 // internal. Therefore, handling things like m..=n is a little awkward.
774 let start = match range.start_bound() {
775 Bound::Included(&i) => i,
776 // Can this case ever happen? Range syntax doesn't support it...
777 Bound::Excluded(&i) => i.checked_add(1).unwrap(),
778 Bound::Unbounded => 0,
779 };
780 let end = match range.end_bound() {
781 Bound::Included(&i) => i.checked_add(1).unwrap(),
782 Bound::Excluded(&i) => i,
783 Bound::Unbounded => self.cursor.total_bytes().unwrap_or(usize::MAX),
784 };
785 self.set_span(Span { start, end });
786 }
787
788 /// Return the anchored mode for this search configuration.
789 ///
790 /// If no anchored mode was set, then it defaults to [`Anchored::No`].
791 ///
792 /// # Example
793 ///
794 /// ```
795 /// use regex_automata::{Anchored, Input, PatternID};
796 ///
797 /// let mut input = Input::new("foobar");
798 /// assert_eq!(Anchored::No, input.get_anchored());
799 ///
800 /// let pid = PatternID::must(5);
801 /// input.set_anchored(Anchored::Pattern(pid));
802 /// assert_eq!(Anchored::Pattern(pid), input.get_anchored());
803 /// ```
804 #[inline]
805 pub fn get_anchored(&self) -> Anchored {
806 self.anchored
807 }
808
809 /// Return whether this search should execute in "earliest" mode.
810 ///
811 /// # Example
812 ///
813 /// ```
814 /// use regex_automata::Input;
815 ///
816 /// let input = Input::new("foobar");
817 /// assert!(!input.get_earliest());
818 /// ```
819 #[inline]
820 pub fn get_earliest(&self) -> bool {
821 self.earliest
822 }
823
824 /// Return true if and only if this search can never return any other
825 /// matches.
826 ///
827 /// This occurs when the start position of this search is greater than the
828 /// end position of the search.
829 ///
830 /// # Example
831 ///
832 /// ```
833 /// use regex_automata::Input;
834 ///
835 /// let mut input = Input::new("foobar");
836 /// assert!(!input.is_done());
837 /// input.set_start(6);
838 /// assert!(!input.is_done());
839 /// input.set_start(7);
840 /// assert!(input.is_done());
841 /// ```
842 #[inline]
843 pub fn is_done(&self) -> bool {
844 self.get_span().start > self.get_span().end
845 }
846
847 /// Returns true if and only if the given offset in this search's chunk
848 /// falls on a valid UTF-8 encoded codepoint boundary.
849 ///
850 /// If the chunk is not valid UTF-8, then the behavior of this routine
851 /// is unspecified.
852 ///
853 /// # Example
854 ///
855 /// This shows where codepoint bounardies do and don't exist in valid
856 /// UTF-8.
857 ///
858 /// ```
859 /// use regex_automata::Input;
860 ///
861 /// let input = Input::new("☃");
862 /// assert!(input.is_char_boundary(0));
863 /// assert!(!input.is_char_boundary(1));
864 /// assert!(!input.is_char_boundary(2));
865 /// assert!(input.is_char_boundary(3));
866 /// assert!(!input.is_char_boundary(4));
867 /// ```
868 #[inline]
869 pub fn is_char_boundary(&mut self) -> bool {
870 is_boundary(self.chunk(), self.chunk_pos)
871 }
872}
873
874impl<C: Cursor> core::fmt::Debug for Input<C> {
875 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
876 use regex_automata::util::escape::DebugHaystack;
877
878 f.debug_struct("Input")
879 .field("chunk", &DebugHaystack(self.chunk()))
880 .field("anchored", &self.anchored)
881 .field("earliest", &self.earliest)
882 .field("chunk_pos", &self.chunk_pos)
883 .field("chunk_offset", &self.cursor.offset())
884 .field("span", &self.span)
885 .finish()
886 }
887}