regex_utils/
lib.rs

1//! Utilitise for working with regexes
2//!
3//! # Regex Iteration
4//!
5//! Given a regex, generate all (potentially infinite) possible inputs that match the pattern.
6//!
7//! ```
8//! use regex_utils::{DenseDfaIter, NfaIter, Utf8Iter};
9//!
10//! // parse a regex as an NFA
11//! let iter = NfaIter::new(r"a+(0|1)").unwrap();
12//! // and expect it to be utf8
13//! let iter = Utf8Iter::try_from(iter).unwrap();
14//!
15//! // Because the regex has an infinite pattern (`+`)
16//! // the iterator will be infinite. Let's take a subset
17//! let x: Vec<String> = iter.take(10).collect();
18//! assert_eq!(x, [
19//!     "a0".to_owned(),
20//!     "a1".to_owned(),
21//!     "aa0".to_owned(),
22//!     "aa1".to_owned(),
23//!     "aaa0".to_owned(),
24//!     "aaa1".to_owned(),
25//!     "aaaa0".to_owned(),
26//!     "aaaa1".to_owned(),
27//!     "aaaaa0".to_owned(),
28//!     "aaaaa1".to_owned(),
29//! ]);
30//!
31//! // parse a regex as a dense DFA
32//! let iter = DenseDfaIter::new(r"foo|(bar){1,2}|quux").unwrap();
33//!
34//! // The regex has a finite number of states, so we can collect all of them
35//! let x: Vec<Vec<u8>> = iter.collect();
36//! assert_eq!(x, [
37//!     b"bar".to_vec(),
38//!     b"foo".to_vec(),
39//!     b"quux".to_vec(),
40//!     b"barbar".to_vec(),
41//! ]);
42//! ```
43//!
44//! ## NFA (Nondeterministic Finite Automaton)
45//!
46//! Using [`NfaIter`] you can traverse the regex using an [`NFA`](regex_automata::nfa). NFAs are low memory
47//! representations of regular expressions, at the cost of being slower.
48//!
49//! These do not guarantee that output strings are unique (given that the graph is non-deterministic)
50//! but the search space memory will be much smaller.
51//!
52//! ## DFA (Deterministic Finite Automaton)
53//!
54//! Using [`DfaIter`] you can traverse the regex using a [`DFA`](regex_automata::dfa). DFAs are high memory
55//! representations of regular expressions, at the cost of using much more memory.
56//!
57//! These guarantee that output strings are unique, but the search space will likely use more memory.
58//!
59//! ## Utf8
60//!
61//! Using [`Utf8Iter`] you can get the outputs of the NFA or DFA iterators as [`String`]
62//! representations of regular expressions, at the cost of using much more memory.
63//!
64//! These guarantee that output strings are unique, but the search space will likely use more memory.
65
66use core::fmt;
67use std::error;
68
69pub use dfa::{DenseDfaIter, DfaIter, SparseDfaIter};
70pub use nfa::NfaIter;
71use regex_automata::dfa::Automaton;
72
73mod dfa;
74mod nfa;
75
76/// [`NfaIter`] or [`DfaIter`] iterator with UTF8 [`String`]s as output
77pub struct Utf8Iter<I>(I);
78
79#[derive(Debug)]
80/// Regex provided to [`Utf8Iter`] was not valid for generating UTF8 strings
81pub struct RegexNotUtf8;
82
83impl fmt::Display for RegexNotUtf8 {
84    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
85        f.write_str("regex is not utf8")
86    }
87}
88
89impl error::Error for RegexNotUtf8 {}
90
91impl TryFrom<NfaIter> for Utf8Iter<NfaIter> {
92    type Error = RegexNotUtf8;
93    fn try_from(value: NfaIter) -> Result<Self, Self::Error> {
94        if value.regex.is_utf8() {
95            Ok(Self(value))
96        } else {
97            Err(RegexNotUtf8)
98        }
99    }
100}
101
102impl<A: Automaton> TryFrom<DfaIter<A>> for Utf8Iter<DfaIter<A>> {
103    type Error = RegexNotUtf8;
104    fn try_from(value: DfaIter<A>) -> Result<Self, Self::Error> {
105        if value.regex.is_utf8() {
106            Ok(Self(value))
107        } else {
108            Err(RegexNotUtf8)
109        }
110    }
111}
112
113impl<A: Automaton> Utf8Iter<DfaIter<A>> {
114    /// Get the next matching string ref from this regex iterator
115    pub fn borrow_next(&mut self) -> Option<&str> {
116        let next = self.0.borrow_next()?;
117        Some(std::str::from_utf8(next).expect("Regex should only match utf8"))
118    }
119}
120impl Utf8Iter<NfaIter> {
121    /// Get the next matching string ref from this regex iterator
122    pub fn borrow_next(&mut self) -> Option<&str> {
123        let next = self.0.borrow_next()?;
124        Some(std::str::from_utf8(next).expect("Regex should only match utf8"))
125    }
126}
127
128impl<I: Iterator<Item = Vec<u8>>> Iterator for Utf8Iter<I> {
129    type Item = String;
130
131    fn next(&mut self) -> Option<Self::Item> {
132        let next = self.0.next()?;
133        Some(String::from_utf8(next).expect("Regex should only match utf8"))
134    }
135}
136
137#[cfg(test)]
138mod tests {
139    use std::collections::HashSet;
140
141    use regex_automata::nfa::thompson::NFA;
142
143    use super::*;
144
145    #[test]
146    fn finite() {
147        let nfa = NFA::new(r"[0-1]{4}-[0-1]{2}-[0-1]{2}").unwrap();
148        let iter = Utf8Iter::try_from(NfaIter::from(nfa)).unwrap();
149
150        // finite regex has finite iteration depth
151        // and no repeats
152        let x: HashSet<String> = iter.collect();
153        assert_eq!(x.len(), 256);
154        for y in x {
155            assert_eq!(y.len(), 10);
156        }
157    }
158
159    #[test]
160    fn repeated() {
161        let nfa = NFA::new(r"a+(0|1)").unwrap();
162        let iter = Utf8Iter::try_from(NfaIter::from(nfa)).unwrap();
163
164        // infinite regex iterates over all cases
165        let x: Vec<String> = iter.take(20).collect();
166        let y = [
167            "a0".to_owned(),
168            "a1".to_owned(),
169            "aa0".to_owned(),
170            "aa1".to_owned(),
171            "aaa0".to_owned(),
172            "aaa1".to_owned(),
173            "aaaa0".to_owned(),
174            "aaaa1".to_owned(),
175            "aaaaa0".to_owned(),
176            "aaaaa1".to_owned(),
177            "aaaaaa0".to_owned(),
178            "aaaaaa1".to_owned(),
179            "aaaaaaa0".to_owned(),
180            "aaaaaaa1".to_owned(),
181            "aaaaaaaa0".to_owned(),
182            "aaaaaaaa1".to_owned(),
183            "aaaaaaaaa0".to_owned(),
184            "aaaaaaaaa1".to_owned(),
185            "aaaaaaaaaa0".to_owned(),
186            "aaaaaaaaaa1".to_owned(),
187        ];
188        assert_eq!(x, y);
189    }
190
191    #[test]
192    fn complex() {
193        let nfa = NFA::new(r"(a+|b+)*").unwrap();
194        let iter = Utf8Iter::try_from(NfaIter::from(nfa)).unwrap();
195
196        // infinite regex iterates over all cases
197        let x: Vec<String> = iter.take(13).collect();
198        let y = [
199            "".to_owned(),
200            "a".to_owned(),
201            "b".to_owned(),
202            "aa".to_owned(),
203            "bb".to_owned(),
204            "aaa".to_owned(),
205            "bbb".to_owned(),
206            "aaaa".to_owned(),
207            // technically a different path
208            "aa".to_owned(),
209            "ab".to_owned(),
210            "bbbb".to_owned(),
211            "ba".to_owned(),
212            // technically a different path
213            "bb".to_owned(),
214        ];
215        assert_eq!(x, y);
216    }
217
218    #[test]
219    fn many() {
220        let search = NfaIter::new_many(&["[0-1]+", "^[a-b]+"]).unwrap();
221        let iter = Utf8Iter::try_from(search).unwrap();
222        let x: Vec<String> = iter.take(12).collect();
223        let y = [
224            "0".to_owned(),
225            "1".to_owned(),
226            "a".to_owned(),
227            "b".to_owned(),
228            "00".to_owned(),
229            "01".to_owned(),
230            "10".to_owned(),
231            "11".to_owned(),
232            "aa".to_owned(),
233            "ab".to_owned(),
234            "ba".to_owned(),
235            "bb".to_owned(),
236        ];
237        assert_eq!(x, y);
238    }
239}