regex_cursor/engines/dfa.rs
1pub use regex_automata::dfa::regex::Regex;
2use regex_automata::dfa::Automaton;
3use regex_automata::{Anchored, Match, MatchError};
4
5use crate::cursor::Cursor;
6use crate::util::iter;
7use crate::Input;
8
9pub use crate::engines::dfa::search::{try_search_fwd, try_search_rev};
10
11mod accel;
12mod search;
13#[cfg(test)]
14mod test;
15
16/// Returns true if either the given input specifies an anchored search
17/// or if the underlying NFA is always anchored.
18fn is_anchored(regex: &Regex, input: &Input<impl Cursor>) -> bool {
19 match input.get_anchored() {
20 Anchored::No => regex.forward().is_always_start_anchored(),
21 Anchored::Yes | Anchored::Pattern(_) => true,
22 }
23}
24
25/// Returns an iterator over all non-overlapping leftmost matches in the
26/// given bytes. If no match exists, then the iterator yields no elements.
27///
28/// # Panics
29///
30/// This routine panics if the search could not complete. This can occur
31/// in a number of circumstances:
32///
33/// * The configuration of the lazy DFA may permit it to "quit" the search.
34/// For example, setting quit bytes or enabling heuristic support for
35/// Unicode word boundaries. The default configuration does not enable any
36/// option that could result in the lazy DFA quitting.
37/// * The configuration of the lazy DFA may also permit it to "give up"
38/// on a search if it makes ineffective use of its transition table
39/// cache. The default configuration does not enable this by default,
40/// although it is typically a good idea to.
41/// * When the provided `Input` configuration is not supported. For
42/// example, by providing an unsupported anchor mode.
43///
44/// When a search panics, callers cannot know whether a match exists or
45/// not.
46///
47/// The above conditions also apply to the iterator returned as well. For
48/// example, if the lazy DFA gives up or quits during a search using this
49/// method, then a panic will occur during iteration.
50///
51/// Use [`Regex::try_search`] with [`util::iter::Searcher`](iter::Searcher)
52/// if you want to handle these error conditions.
53///
54/// # Example
55///
56/// ```
57/// use regex_automata::{hybrid::regex::Regex, Match};
58///
59/// let re = Regex::new("foo[0-9]+")?;
60/// let mut cache = re.create_cache();
61///
62/// let text = "foo1 foo12 foo123";
63/// let matches: Vec<Match> = re.find_iter(&mut cache, text).collect();
64/// assert_eq!(matches, vec![
65/// Match::must(0, 0..4),
66/// Match::must(0, 5..10),
67/// Match::must(0, 11..17),
68/// ]);
69/// # Ok::<(), Box<dyn std::error::Error>>(())
70/// ```
71#[inline]
72pub fn find_iter<C: Cursor>(regex: &Regex, input: Input<C>) -> FindMatches<'_, C> {
73 let it = iter::Searcher::new(input);
74 FindMatches { re: regex, it }
75}
76
77/// Returns the start and end offset of the leftmost match. If no match
78/// exists, then `None` is returned.
79///
80/// # Panics
81///
82/// This routine panics if the search could not complete. This can occur
83/// in a number of circumstances:
84///
85/// * The configuration of the lazy DFA may permit it to "quit" the search.
86/// For example, setting quit bytes or enabling heuristic support for
87/// Unicode word boundaries. The default configuration does not enable any
88/// option that could result in the lazy DFA quitting.
89/// * The configuration of the lazy DFA may also permit it to "give up"
90/// on a search if it makes ineffective use of its transition table
91/// cache. The default configuration does not enable this by default,
92/// although it is typically a good idea to.
93/// * When the provided `Input` configuration is not supported. For
94/// example, by providing an unsupported anchor mode.
95///
96/// When a search panics, callers cannot know whether a match exists or
97/// not.
98///
99/// Use [`Regex::try_search`] if you want to handle these error conditions.
100///
101/// # Example
102///
103/// ```
104/// use regex_automata::{Match, hybrid::regex::Regex};
105///
106/// let re = Regex::new("foo[0-9]+")?;
107/// let mut cache = re.create_cache();
108/// assert_eq!(
109/// Some(Match::must(0, 3..11)),
110/// re.find(&mut cache, "zzzfoo12345zzz"),
111/// );
112///
113/// // Even though a match is found after reading the first byte (`a`),
114/// // the default leftmost-first match semantics demand that we find the
115/// // earliest match that prefers earlier parts of the pattern over latter
116/// // parts.
117/// let re = Regex::new("abc|a")?;
118/// let mut cache = re.create_cache();
119/// assert_eq!(Some(Match::must(0, 0..3)), re.find(&mut cache, "abc"));
120/// # Ok::<(), Box<dyn std::error::Error>>(())
121/// ```
122pub fn find<C: Cursor>(regex: &Regex, input: &mut Input<C>) -> Option<Match> {
123 try_search(regex, input).unwrap()
124}
125
126/// Returns the start and end offset of the leftmost match. If no match
127/// exists, then `None` is returned.
128///
129/// This is like [`Regex::find`] but with two differences:
130///
131/// 1. It is not generic over `Into<Input>` and instead accepts a
132/// `&Input`. This permits reusing the same `Input` for multiple searches
133/// without needing to create a new one. This _may_ help with latency.
134/// 2. It returns an error if the search could not complete where as
135/// [`Regex::find`] will panic.
136///
137/// # Errors
138///
139/// This routine errors if the search could not complete. This can occur
140/// in a number of circumstances:
141///
142/// * The configuration of the lazy DFA may permit it to "quit" the search.
143/// For example, setting quit bytes or enabling heuristic support for
144/// Unicode word boundaries. The default configuration does not enable any
145/// option that could result in the lazy DFA quitting.
146/// * The configuration of the lazy DFA may also permit it to "give up"
147/// on a search if it makes ineffective use of its transition table
148/// cache. The default configuration does not enable this by default,
149/// although it is typically a good idea to.
150/// * When the provided `Input` configuration is not supported. For
151/// example, by providing an unsupported anchor mode.
152///
153/// When a search returns an error, callers cannot know whether a match
154/// exists or not.
155pub fn try_search<C: Cursor>(
156 regex: &Regex,
157 input: &mut Input<C>,
158) -> Result<Option<Match>, MatchError> {
159 let fwd = regex.forward();
160 let end = match try_search_fwd(fwd, input)? {
161 None => return Ok(None),
162 Some(end) => end,
163 };
164 // This special cases an empty match at the beginning of the search. If
165 // our end matches our start, then since a reverse DFA can't match past
166 // the start, it must follow that our starting position is also our end
167 // position. So short circuit and skip the reverse search.
168 if input.start() == end.offset() {
169 return Ok(Some(Match::new(end.pattern(), end.offset()..end.offset())));
170 }
171 // We can also skip the reverse search if we know our search was
172 // anchored. This occurs either when the input config is anchored or
173 // when we know the regex itself is anchored. In this case, we know the
174 // start of the match, if one is found, must be the start of the
175 // search.
176 if is_anchored(regex, input) {
177 return Ok(Some(Match::new(end.pattern(), input.start()..end.offset())));
178 }
179 // N.B. I have tentatively convinced myself that it isn't necessary
180 // to specify the specific pattern for the reverse search since the
181 // reverse search will always find the same pattern to match as the
182 // forward search. But I lack a rigorous proof. Why not just provide
183 // the pattern anyway? Well, if it is needed, then leaving it out
184 // gives us a chance to find a witness. (Also, if we don't need to
185 // specify the pattern, then we don't need to build the reverse DFA
186 // with 'starts_for_each_pattern' enabled. It doesn't matter too much
187 // for the lazy DFA, but does make the overall DFA bigger.)
188 //
189 // We also need to be careful to disable 'earliest' for the reverse
190 // search, since it could be enabled for the forward search. In the
191 // reverse case, to satisfy "leftmost" criteria, we need to match as
192 // much as we can. We also need to be careful to make the search
193 // anchored. We don't want the reverse search to report any matches
194 // other than the one beginning at the end of our forward search.
195
196 let match_range = input.start()..end.offset();
197 let start = input.with(|mut revsearch| {
198 revsearch = revsearch.span(match_range).anchored(Anchored::Yes).earliest(false);
199 try_search_rev(regex.reverse(), revsearch)
200 });
201 let start = start?.expect("reverse search must match if forward search does");
202 debug_assert_eq!(
203 start.pattern(),
204 end.pattern(),
205 "forward and reverse search must match same pattern",
206 );
207 debug_assert!(start.offset() <= end.offset());
208 debug_assert!(end.offset() <= input.end());
209 debug_assert!(input.start() <= start.offset());
210 Ok(Some(Match::new(end.pattern(), start.offset()..end.offset())))
211}
212
213/// An iterator over all non-overlapping matches for an infallible search.
214///
215/// The iterator yields a [`Match`] value until no more matches could be found.
216/// If the underlying regex engine returns an error, then a panic occurs.
217///
218/// This iterator can be created with the [`Regex::find_iter`] method.
219#[derive(Debug)]
220pub struct FindMatches<'r, C: Cursor> {
221 re: &'r Regex,
222 it: iter::Searcher<C>,
223}
224
225impl<'r, C: Cursor> Iterator for FindMatches<'r, C> {
226 type Item = Match;
227
228 #[inline]
229 fn next(&mut self) -> Option<Match> {
230 let FindMatches { re, ref mut it } = *self;
231 it.advance(|input| try_search(re, input))
232 }
233}