regex_utils/
dfa.rs

1#![allow(clippy::result_large_err)]
2
3use regex_automata::{
4    dfa::{dense, sparse, Automaton},
5    util::primitives::StateID,
6    Input,
7};
8
9/// A [`DfaIter`] using [`dense::DFA`] representation
10pub type DenseDfaIter<T> = DfaIter<dense::DFA<T>>;
11/// A [`DfaIter`] using [`sparse::DFA`] representation
12pub type SparseDfaIter<T> = DfaIter<sparse::DFA<T>>;
13
14/// `DfaIter` will produce every possible string value that will match with the given regex.
15///
16/// # Note
17///
18/// Regexes can be infinite (eg `a*`). Either use this iterator lazily, or limit the number
19/// of iterations.
20///
21/// Because this uses a DFA, search space memory can be quite large and unbounded.
22///
23/// # Implementation Details
24///
25/// Given a `DFA` (Deterministic Finite Automaton), the iterator walks the graph
26/// of states using [`IDDFS`](https://en.wikipedia.org/wiki/Iterative_deepening_depth-first_search)
27/// to traverse through every possible state path. At each depth, if we find a match, it is returned.
28///
29/// The order of matches is not guaranteed, but it currently returns all strings in lexicographical byte ordering.
30pub struct DfaIter<A> {
31    // the graph to search
32    pub(crate) regex: A,
33    // the start node of the graph
34    start: StateID,
35    // the max depth we currently want to search
36    depth: usize,
37    // the max depth observed in the graph
38    max_depth: usize,
39    // (state, edge, depth)
40    stack: Vec<(StateID, u8, usize)>,
41    // the current path
42    str: Vec<u8>,
43}
44
45impl<A: Automaton> From<A> for DfaIter<A> {
46    fn from(dfa: A) -> Self {
47        // anchored because if we didn't anchor our search we would have an infinite amount of prefixes that were valid
48        // and that isn't very interesting
49        let start = dfa
50            .start_state_forward(&Input::new("").anchored(regex_automata::Anchored::Yes))
51            .unwrap();
52
53        Self {
54            regex: dfa,
55            start,
56            depth: 0,
57            max_depth: 0,
58            stack: vec![(start, 0, 0)],
59            str: vec![],
60        }
61    }
62}
63
64impl DenseDfaIter<Vec<u32>> {
65    /// Parse the given regular expression using a default configuration and
66    /// return the corresponding dense `DfaIter`.
67    ///
68    /// If you want a non-default configuration, then use the
69    /// [`dense::Builder`](dense::Builder) to set your own configuration.
70    ///
71    /// See [`DFA`] for details
72    pub fn new(pattern: &str) -> Result<Self, dense::BuildError> {
73        dense::DFA::builder()
74            .configure(dense::Config::new().accelerate(false))
75            .build(pattern)
76            .map(Self::from)
77    }
78
79    /// Parse the given regular expressions using a default configuration and
80    /// return the corresponding dense multi-`DfaIter`.
81    ///
82    /// If you want a non-default configuration, then use the
83    /// [`dense::Builder`](dense::Builder) to set your own configuration.
84    ///
85    /// See [`DFA`] for details
86    pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<Self, dense::BuildError> {
87        dense::DFA::builder()
88            .configure(dense::Config::new().accelerate(false))
89            .build_many(patterns)
90            .map(Self::from)
91    }
92}
93
94impl SparseDfaIter<Vec<u8>> {
95    /// Parse the given regular expression using a default configuration and
96    /// return the corresponding sparse `DfaIter`.
97    ///
98    /// If you want a non-default configuration, then use
99    /// the [`dense::Builder`] to set your own configuration, and then call
100    /// [`dense::DFA::to_sparse`] to create a sparse DFA.
101    ///
102    /// See [`DFA`] for details
103    pub fn new(pattern: &str) -> Result<Self, dense::BuildError> {
104        dense::DFA::builder()
105            .configure(dense::Config::new().accelerate(false))
106            .build(pattern)
107            .and_then(|dense| dense.to_sparse())
108            .map(Self::from)
109    }
110
111    /// Parse the given regular expressions using a default configuration and
112    /// return the corresponding sparse multi-`DfaIter`.
113    ///
114    /// If you want a non-default configuration, then use
115    /// the [`dense::Builder`] to set your own configuration, and then call
116    /// [`dense::DFA::to_sparse`] to create a sparse DFA.
117    ///
118    /// See [`DFA`] for details
119    pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<Self, dense::BuildError> {
120        dense::DFA::builder()
121            .configure(dense::Config::new().accelerate(false))
122            .build_many(patterns)
123            .and_then(|dense| dense.to_sparse())
124            .map(Self::from)
125    }
126}
127
128impl<A: Automaton> DfaIter<A> {
129    /// Get the next matching string ref from this regex iterator
130    pub fn borrow_next(&mut self) -> Option<&[u8]> {
131        loop {
132            let Some((current, b, depth)) = self.stack.pop() else {
133                // we didn't get any deeper. no more search space
134                if self.max_depth < self.depth {
135                    break None;
136                }
137
138                self.depth += 1;
139                self.stack.clear();
140                self.stack.push((self.start, 0, 0));
141                continue;
142            };
143
144            // update recorded max depth
145            self.max_depth = usize::max(self.max_depth, depth);
146            self.str.truncate(depth);
147            self.str.push(b);
148
149            // check we can explore deeper
150            if depth < self.depth {
151                for b in (0..=255).rev() {
152                    let next_state = self.regex.next_state(current, b);
153                    // check if the next state is valid
154                    if !self.regex.is_dead_state(next_state) {
155                        self.stack.push((next_state, b, depth + 1));
156                    }
157                }
158            } else {
159                // test that this state is final
160                let eoi_state = self.regex.next_eoi_state(current);
161                if self.regex.is_match_state(eoi_state) {
162                    break Some(&self.str[1..]);
163                }
164            }
165        }
166    }
167}
168
169impl<A: Automaton> Iterator for DfaIter<A> {
170    type Item = Vec<u8>;
171
172    fn next(&mut self) -> Option<Self::Item> {
173        self.borrow_next().map(ToOwned::to_owned)
174    }
175}
176
177#[cfg(test)]
178mod tests {
179    use std::collections::HashSet;
180
181    use regex_automata::dfa::dense::DFA;
182
183    use super::*;
184
185    #[test]
186    fn set() {
187        let iter = DenseDfaIter::new(r"^b|(a)?|cc").unwrap();
188
189        let x: Vec<Vec<u8>> = iter.collect();
190        assert_eq!(
191            x,
192            [
193                b"".to_vec(),
194                b"a".to_vec(),
195                b"b".to_vec(),
196                // not sure what's happening here
197                // b"cc".to_vec(),
198            ]
199        );
200    }
201
202    #[test]
203    fn finite() {
204        let dfa = DFA::new(r"[0-1]{4}-[0-1]{2}-[0-1]{2}").unwrap();
205
206        // finite regex has finite iteration depth
207        // and no repeats
208        let x: HashSet<Vec<u8>> = DfaIter::from(&dfa).collect();
209        assert_eq!(x.len(), 256);
210        for y in x {
211            assert_eq!(y.len(), 10);
212        }
213    }
214
215    #[test]
216    fn repeated() {
217        let dfa = DFA::new(r"a+(0|1)").unwrap();
218
219        // infinite regex iterates over all cases
220        let x: Vec<Vec<u8>> = DfaIter::from(&dfa).take(20).collect();
221        let y = [
222            b"a0".to_vec(),
223            b"a1".to_vec(),
224            b"aa0".to_vec(),
225            b"aa1".to_vec(),
226            b"aaa0".to_vec(),
227            b"aaa1".to_vec(),
228            b"aaaa0".to_vec(),
229            b"aaaa1".to_vec(),
230            b"aaaaa0".to_vec(),
231            b"aaaaa1".to_vec(),
232            b"aaaaaa0".to_vec(),
233            b"aaaaaa1".to_vec(),
234            b"aaaaaaa0".to_vec(),
235            b"aaaaaaa1".to_vec(),
236            b"aaaaaaaa0".to_vec(),
237            b"aaaaaaaa1".to_vec(),
238            b"aaaaaaaaa0".to_vec(),
239            b"aaaaaaaaa1".to_vec(),
240            b"aaaaaaaaaa0".to_vec(),
241            b"aaaaaaaaaa1".to_vec(),
242        ];
243        assert_eq!(x, y);
244    }
245
246    #[test]
247    fn complex() {
248        let dfa = DFA::new(r"(a+|b+)*").unwrap();
249
250        // infinite regex iterates over all cases
251        let x: Vec<Vec<u8>> = DfaIter::from(&dfa).take(8).collect();
252        let y = [
253            b"".to_vec(),
254            b"a".to_vec(),
255            b"b".to_vec(),
256            b"aa".to_vec(),
257            b"ab".to_vec(),
258            b"ba".to_vec(),
259            b"bb".to_vec(),
260            b"aaa".to_vec(),
261        ];
262        assert_eq!(x, y);
263    }
264
265    #[test]
266    fn email() {
267        let dfa = DFA::new(r"[a-zA-Z0-9.!#$%&’*+/=?^_`{|}~-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*")
268            .unwrap();
269        let mut search = DfaIter::from(&dfa);
270
271        // skip a ~few
272        for _ in 0..100_000 {
273            search.borrow_next();
274        }
275
276        let x = search.borrow_next().unwrap();
277        assert_eq!(String::from_utf8_lossy(x), "0@hI");
278    }
279
280    #[test]
281    fn many() {
282        let search = SparseDfaIter::new_many(&["[0-1]+", "^[a-b]+"]).unwrap();
283        let x: Vec<Vec<u8>> = search.take(12).collect();
284        let y = [
285            b"0".to_vec(),
286            b"1".to_vec(),
287            b"a".to_vec(),
288            b"b".to_vec(),
289            b"00".to_vec(),
290            b"01".to_vec(),
291            b"10".to_vec(),
292            b"11".to_vec(),
293            b"aa".to_vec(),
294            b"ab".to_vec(),
295            b"ba".to_vec(),
296            b"bb".to_vec(),
297        ];
298        assert_eq!(x, y);
299    }
300}