regex_cursor/engines/hybrid.rs
1pub use regex_automata::hybrid::regex::{Cache, Regex};
2use regex_automata::{Anchored, Match, MatchError};
3
4use crate::cursor::Cursor;
5use crate::input::Input;
6use crate::util::iter;
7
8pub use crate::engines::hybrid::search::{try_search_fwd, try_search_rev};
9
10mod search;
11#[cfg(test)]
12mod test;
13
14/// Returns true if either the given input specifies an anchored search
15/// or if the underlying NFA is always anchored.
16fn is_anchored(regex: &Regex, input: &Input<impl Cursor>) -> bool {
17 match input.get_anchored() {
18 Anchored::No => regex.forward().get_nfa().is_always_start_anchored(),
19 Anchored::Yes | Anchored::Pattern(_) => true,
20 }
21}
22
23/// Returns an iterator over all non-overlapping leftmost matches in the
24/// given bytes. If no match exists, then the iterator yields no elements.
25///
26/// # Panics
27///
28/// This routine panics if the search could not complete. This can occur
29/// in a number of circumstances:
30///
31/// * The configuration of the lazy DFA may permit it to "quit" the search.
32/// For example, setting quit bytes or enabling heuristic support for
33/// Unicode word boundaries. The default configuration does not enable any
34/// option that could result in the lazy DFA quitting.
35/// * The configuration of the lazy DFA may also permit it to "give up"
36/// on a search if it makes ineffective use of its transition table
37/// cache. The default configuration does not enable this by default,
38/// although it is typically a good idea to.
39/// * When the provided `Input` configuration is not supported. For
40/// example, by providing an unsupported anchor mode.
41///
42/// When a search panics, callers cannot know whether a match exists or
43/// not.
44///
45/// The above conditions also apply to the iterator returned as well. For
46/// example, if the lazy DFA gives up or quits during a search using this
47/// method, then a panic will occur during iteration.
48///
49/// Use [`Regex::try_search`] with [`util::iter::Searcher`](iter::Searcher)
50/// if you want to handle these error conditions.
51///
52/// # Example
53///
54/// ```
55/// use regex_automata::{hybrid::regex::Regex, Match};
56///
57/// let re = Regex::new("foo[0-9]+")?;
58/// let mut cache = re.create_cache();
59///
60/// let text = "foo1 foo12 foo123";
61/// let matches: Vec<Match> = re.find_iter(&mut cache, text).collect();
62/// assert_eq!(matches, vec![
63/// Match::must(0, 0..4),
64/// Match::must(0, 5..10),
65/// Match::must(0, 11..17),
66/// ]);
67/// # Ok::<(), Box<dyn std::error::Error>>(())
68/// ```
69#[inline]
70pub fn find_iter<'r, 'c, C: Cursor>(
71 regex: &'r Regex,
72 cache: &'c mut Cache,
73 input: Input<C>,
74) -> FindMatches<'r, 'c, C> {
75 let it = iter::Searcher::new(input);
76 FindMatches { re: regex, cache, it }
77}
78
79/// Returns the start and end offset of the leftmost match. If no match
80/// exists, then `None` is returned.
81///
82/// # Panics
83///
84/// This routine panics if the search could not complete. This can occur
85/// in a number of circumstances:
86///
87/// * The configuration of the lazy DFA may permit it to "quit" the search.
88/// For example, setting quit bytes or enabling heuristic support for
89/// Unicode word boundaries. The default configuration does not enable any
90/// option that could result in the lazy DFA quitting.
91/// * The configuration of the lazy DFA may also permit it to "give up"
92/// on a search if it makes ineffective use of its transition table
93/// cache. The default configuration does not enable this by default,
94/// although it is typically a good idea to.
95/// * When the provided `Input` configuration is not supported. For
96/// example, by providing an unsupported anchor mode.
97///
98/// When a search panics, callers cannot know whether a match exists or
99/// not.
100///
101/// Use [`Regex::try_search`] if you want to handle these error conditions.
102///
103/// # Example
104///
105/// ```
106/// use regex_automata::{Match, hybrid::regex::Regex};
107///
108/// let re = Regex::new("foo[0-9]+")?;
109/// let mut cache = re.create_cache();
110/// assert_eq!(
111/// Some(Match::must(0, 3..11)),
112/// re.find(&mut cache, "zzzfoo12345zzz"),
113/// );
114///
115/// // Even though a match is found after reading the first byte (`a`),
116/// // the default leftmost-first match semantics demand that we find the
117/// // earliest match that prefers earlier parts of the pattern over latter
118/// // parts.
119/// let re = Regex::new("abc|a")?;
120/// let mut cache = re.create_cache();
121/// assert_eq!(Some(Match::must(0, 0..3)), re.find(&mut cache, "abc"));
122/// # Ok::<(), Box<dyn std::error::Error>>(())
123/// ```
124pub fn find<C: Cursor>(regex: &Regex, cache: &mut Cache, input: &mut Input<C>) -> Option<Match> {
125 try_search(regex, cache, input).unwrap()
126}
127
128/// Returns the start and end offset of the leftmost match. If no match
129/// exists, then `None` is returned.
130///
131/// This is like [`Regex::find`] but with two differences:
132///
133/// 1. It is not generic over `Into<Input>` and instead accepts a
134/// `&Input`. This permits reusing the same `Input` for multiple searches
135/// without needing to create a new one. This _may_ help with latency.
136/// 2. It returns an error if the search could not complete where as
137/// [`Regex::find`] will panic.
138///
139/// # Errors
140///
141/// This routine errors if the search could not complete. This can occur
142/// in a number of circumstances:
143///
144/// * The configuration of the lazy DFA may permit it to "quit" the search.
145/// For example, setting quit bytes or enabling heuristic support for
146/// Unicode word boundaries. The default configuration does not enable any
147/// option that could result in the lazy DFA quitting.
148/// * The configuration of the lazy DFA may also permit it to "give up"
149/// on a search if it makes ineffective use of its transition table
150/// cache. The default configuration does not enable this by default,
151/// although it is typically a good idea to.
152/// * When the provided `Input` configuration is not supported. For
153/// example, by providing an unsupported anchor mode.
154///
155/// When a search returns an error, callers cannot know whether a match
156/// exists or not.
157pub fn try_search<C: Cursor>(
158 regex: &Regex,
159 cache: &mut Cache,
160 input: &mut Input<C>,
161) -> Result<Option<Match>, MatchError> {
162 let (fcache, rcache) = cache.as_parts_mut();
163 let end = match try_search_fwd(regex.forward(), fcache, input)? {
164 None => return Ok(None),
165 Some(end) => end,
166 };
167 // This special cases an empty match at the beginning of the search. If
168 // our end matches our start, then since a reverse DFA can't match past
169 // the start, it must follow that our starting position is also our end
170 // position. So short circuit and skip the reverse search.
171 if input.start() == end.offset() {
172 return Ok(Some(Match::new(end.pattern(), end.offset()..end.offset())));
173 }
174 // We can also skip the reverse search if we know our search was
175 // anchored. This occurs either when the input config is anchored or
176 // when we know the regex itself is anchored. In this case, we know the
177 // start of the match, if one is found, must be the start of the
178 // search.
179 if is_anchored(regex, input) {
180 return Ok(Some(Match::new(end.pattern(), input.start()..end.offset())));
181 }
182 // N.B. I have tentatively convinced myself that it isn't necessary
183 // to specify the specific pattern for the reverse search since the
184 // reverse search will always find the same pattern to match as the
185 // forward search. But I lack a rigorous proof. Why not just provide
186 // the pattern anyway? Well, if it is needed, then leaving it out
187 // gives us a chance to find a witness. (Also, if we don't need to
188 // specify the pattern, then we don't need to build the reverse DFA
189 // with 'starts_for_each_pattern' enabled. It doesn't matter too much
190 // for the lazy DFA, but does make the overall DFA bigger.)
191 //
192 // We also need to be careful to disable 'earliest' for the reverse
193 // search, since it could be enabled for the forward search. In the
194 // reverse case, to satisfy "leftmost" criteria, we need to match as
195 // much as we can. We also need to be careful to make the search
196 // anchored. We don't want the reverse search to report any matches
197 // other than the one beginning at the end of our forward search.
198
199 let match_range = input.start()..end.offset();
200 let start = input.with(|mut revsearch| {
201 revsearch = revsearch.span(match_range).anchored(Anchored::Yes).earliest(false);
202 try_search_rev(regex.reverse(), rcache, revsearch)
203 });
204 let start = start?.expect("reverse search must match if forward search does");
205 debug_assert_eq!(
206 start.pattern(),
207 end.pattern(),
208 "forward and reverse search must match same pattern",
209 );
210 debug_assert!(start.offset() <= end.offset());
211 debug_assert!(end.offset() <= input.end());
212 debug_assert!(input.start() <= start.offset());
213 Ok(Some(Match::new(end.pattern(), start.offset()..end.offset())))
214}
215
216/// An iterator over all non-overlapping matches for an infallible search.
217///
218/// The iterator yields a [`Match`] value until no more matches could be found.
219/// If the underlying regex engine returns an error, then a panic occurs.
220///
221/// The lifetime parameters are as follows:
222///
223/// * `'r` represents the lifetime of the regex object.
224/// * `'h` represents the lifetime of the haystack being searched.
225/// * `'c` represents the lifetime of the regex cache.
226///
227/// This iterator can be created with the [`Regex::find_iter`] method.
228#[derive(Debug)]
229pub struct FindMatches<'r, 'c, C: Cursor> {
230 re: &'r Regex,
231 cache: &'c mut Cache,
232 it: iter::Searcher<C>,
233}
234
235impl<'r, 'c, C: Cursor> Iterator for FindMatches<'r, 'c, C> {
236 type Item = Match;
237
238 #[inline]
239 fn next(&mut self) -> Option<Match> {
240 let FindMatches { re, ref mut cache, ref mut it } = *self;
241 it.advance(|input| try_search(re, cache, input))
242 }
243}