regex_cursor/engines/
dfa.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
pub use regex_automata::dfa::regex::Regex;
use regex_automata::dfa::Automaton;
use regex_automata::{Anchored, Match, MatchError};

use crate::cursor::Cursor;
use crate::util::iter;
use crate::Input;

pub use crate::engines::dfa::search::{try_search_fwd, try_search_rev};

mod accel;
mod search;
#[cfg(test)]
mod test;

/// Returns true if either the given input specifies an anchored search
/// or if the underlying NFA is always anchored.
fn is_anchored(regex: &Regex, input: &Input<impl Cursor>) -> bool {
    match input.get_anchored() {
        Anchored::No => regex.forward().is_always_start_anchored(),
        Anchored::Yes | Anchored::Pattern(_) => true,
    }
}

/// Returns an iterator over all non-overlapping leftmost matches in the
/// given bytes. If no match exists, then the iterator yields no elements.
///
/// # Panics
///
/// This routine panics if the search could not complete. This can occur
/// in a number of circumstances:
///
/// * The configuration of the lazy DFA may permit it to "quit" the search.
/// For example, setting quit bytes or enabling heuristic support for
/// Unicode word boundaries. The default configuration does not enable any
/// option that could result in the lazy DFA quitting.
/// * The configuration of the lazy DFA may also permit it to "give up"
/// on a search if it makes ineffective use of its transition table
/// cache. The default configuration does not enable this by default,
/// although it is typically a good idea to.
/// * When the provided `Input` configuration is not supported. For
/// example, by providing an unsupported anchor mode.
///
/// When a search panics, callers cannot know whether a match exists or
/// not.
///
/// The above conditions also apply to the iterator returned as well. For
/// example, if the lazy DFA gives up or quits during a search using this
/// method, then a panic will occur during iteration.
///
/// Use [`Regex::try_search`] with [`util::iter::Searcher`](iter::Searcher)
/// if you want to handle these error conditions.
///
/// # Example
///
/// ```
/// use regex_automata::{hybrid::regex::Regex, Match};
///
/// let re = Regex::new("foo[0-9]+")?;
/// let mut cache = re.create_cache();
///
/// let text = "foo1 foo12 foo123";
/// let matches: Vec<Match> = re.find_iter(&mut cache, text).collect();
/// assert_eq!(matches, vec![
///     Match::must(0, 0..4),
///     Match::must(0, 5..10),
///     Match::must(0, 11..17),
/// ]);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[inline]
pub fn find_iter<C: Cursor>(regex: &Regex, input: Input<C>) -> FindMatches<'_, C> {
    let it = iter::Searcher::new(input);
    FindMatches { re: regex, it }
}

/// Returns the start and end offset of the leftmost match. If no match
/// exists, then `None` is returned.
///
/// # Panics
///
/// This routine panics if the search could not complete. This can occur
/// in a number of circumstances:
///
/// * The configuration of the lazy DFA may permit it to "quit" the search.
/// For example, setting quit bytes or enabling heuristic support for
/// Unicode word boundaries. The default configuration does not enable any
/// option that could result in the lazy DFA quitting.
/// * The configuration of the lazy DFA may also permit it to "give up"
/// on a search if it makes ineffective use of its transition table
/// cache. The default configuration does not enable this by default,
/// although it is typically a good idea to.
/// * When the provided `Input` configuration is not supported. For
/// example, by providing an unsupported anchor mode.
///
/// When a search panics, callers cannot know whether a match exists or
/// not.
///
/// Use [`Regex::try_search`] if you want to handle these error conditions.
///
/// # Example
///
/// ```
/// use regex_automata::{Match, hybrid::regex::Regex};
///
/// let re = Regex::new("foo[0-9]+")?;
/// let mut cache = re.create_cache();
/// assert_eq!(
///     Some(Match::must(0, 3..11)),
///     re.find(&mut cache, "zzzfoo12345zzz"),
/// );
///
/// // Even though a match is found after reading the first byte (`a`),
/// // the default leftmost-first match semantics demand that we find the
/// // earliest match that prefers earlier parts of the pattern over latter
/// // parts.
/// let re = Regex::new("abc|a")?;
/// let mut cache = re.create_cache();
/// assert_eq!(Some(Match::must(0, 0..3)), re.find(&mut cache, "abc"));
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn find<C: Cursor>(regex: &Regex, input: &mut Input<C>) -> Option<Match> {
    try_search(regex, input).unwrap()
}

/// Returns the start and end offset of the leftmost match. If no match
/// exists, then `None` is returned.
///
/// This is like [`Regex::find`] but with two differences:
///
/// 1. It is not generic over `Into<Input>` and instead accepts a
/// `&Input`. This permits reusing the same `Input` for multiple searches
/// without needing to create a new one. This _may_ help with latency.
/// 2. It returns an error if the search could not complete where as
/// [`Regex::find`] will panic.
///
/// # Errors
///
/// This routine errors if the search could not complete. This can occur
/// in a number of circumstances:
///
/// * The configuration of the lazy DFA may permit it to "quit" the search.
/// For example, setting quit bytes or enabling heuristic support for
/// Unicode word boundaries. The default configuration does not enable any
/// option that could result in the lazy DFA quitting.
/// * The configuration of the lazy DFA may also permit it to "give up"
/// on a search if it makes ineffective use of its transition table
/// cache. The default configuration does not enable this by default,
/// although it is typically a good idea to.
/// * When the provided `Input` configuration is not supported. For
/// example, by providing an unsupported anchor mode.
///
/// When a search returns an error, callers cannot know whether a match
/// exists or not.
pub fn try_search<C: Cursor>(
    regex: &Regex,
    input: &mut Input<C>,
) -> Result<Option<Match>, MatchError> {
    let fwd = regex.forward();
    let end = match try_search_fwd(fwd, input)? {
        None => return Ok(None),
        Some(end) => end,
    };
    // This special cases an empty match at the beginning of the search. If
    // our end matches our start, then since a reverse DFA can't match past
    // the start, it must follow that our starting position is also our end
    // position. So short circuit and skip the reverse search.
    if input.start() == end.offset() {
        return Ok(Some(Match::new(end.pattern(), end.offset()..end.offset())));
    }
    // We can also skip the reverse search if we know our search was
    // anchored. This occurs either when the input config is anchored or
    // when we know the regex itself is anchored. In this case, we know the
    // start of the match, if one is found, must be the start of the
    // search.
    if is_anchored(regex, input) {
        return Ok(Some(Match::new(end.pattern(), input.start()..end.offset())));
    }
    // N.B. I have tentatively convinced myself that it isn't necessary
    // to specify the specific pattern for the reverse search since the
    // reverse search will always find the same pattern to match as the
    // forward search. But I lack a rigorous proof. Why not just provide
    // the pattern anyway? Well, if it is needed, then leaving it out
    // gives us a chance to find a witness. (Also, if we don't need to
    // specify the pattern, then we don't need to build the reverse DFA
    // with 'starts_for_each_pattern' enabled. It doesn't matter too much
    // for the lazy DFA, but does make the overall DFA bigger.)
    //
    // We also need to be careful to disable 'earliest' for the reverse
    // search, since it could be enabled for the forward search. In the
    // reverse case, to satisfy "leftmost" criteria, we need to match as
    // much as we can. We also need to be careful to make the search
    // anchored. We don't want the reverse search to report any matches
    // other than the one beginning at the end of our forward search.

    let match_range = input.start()..end.offset();
    let start = input.with(|mut revsearch| {
        revsearch = revsearch.span(match_range).anchored(Anchored::Yes).earliest(false);
        try_search_rev(regex.reverse(), revsearch)
    });
    let start = start?.expect("reverse search must match if forward search does");
    debug_assert_eq!(
        start.pattern(),
        end.pattern(),
        "forward and reverse search must match same pattern",
    );
    debug_assert!(start.offset() <= end.offset());
    debug_assert!(end.offset() <= input.end());
    debug_assert!(input.start() <= start.offset());
    Ok(Some(Match::new(end.pattern(), start.offset()..end.offset())))
}

/// An iterator over all non-overlapping matches for an infallible search.
///
/// The iterator yields a [`Match`] value until no more matches could be found.
/// If the underlying regex engine returns an error, then a panic occurs.
///
/// This iterator can be created with the [`Regex::find_iter`] method.
#[derive(Debug)]
pub struct FindMatches<'r, C: Cursor> {
    re: &'r Regex,
    it: iter::Searcher<C>,
}

impl<'r, C: Cursor> Iterator for FindMatches<'r, C> {
    type Item = Match;

    #[inline]
    fn next(&mut self) -> Option<Match> {
        let FindMatches { re, ref mut it } = *self;
        it.advance(|input| try_search(re, input))
    }
}