regex_utils/lib.rs
1//! Utilitise for working with regexes
2//!
3//! # Regex Iteration
4//!
5//! Given a regex, generate all (potentially infinite) possible inputs that match the pattern.
6//!
7//! ```
8//! use regex_utils::{DenseDfaIter, NfaIter, Utf8Iter};
9//!
10//! // parse a regex as an NFA
11//! let iter = NfaIter::new(r"a+(0|1)").unwrap();
12//! // and expect it to be utf8
13//! let iter = Utf8Iter::try_from(iter).unwrap();
14//!
15//! // Because the regex has an infinite pattern (`+`)
16//! // the iterator will be infinite. Let's take a subset
17//! let x: Vec<String> = iter.take(10).collect();
18//! assert_eq!(x, [
19//! "a0".to_owned(),
20//! "a1".to_owned(),
21//! "aa0".to_owned(),
22//! "aa1".to_owned(),
23//! "aaa0".to_owned(),
24//! "aaa1".to_owned(),
25//! "aaaa0".to_owned(),
26//! "aaaa1".to_owned(),
27//! "aaaaa0".to_owned(),
28//! "aaaaa1".to_owned(),
29//! ]);
30//!
31//! // parse a regex as a dense DFA
32//! let iter = DenseDfaIter::new(r"foo|(bar){1,2}|quux").unwrap();
33//!
34//! // The regex has a finite number of states, so we can collect all of them
35//! let x: Vec<Vec<u8>> = iter.collect();
36//! assert_eq!(x, [
37//! b"bar".to_vec(),
38//! b"foo".to_vec(),
39//! b"quux".to_vec(),
40//! b"barbar".to_vec(),
41//! ]);
42//! ```
43//!
44//! ## NFA (Nondeterministic Finite Automaton)
45//!
46//! Using [`NfaIter`] you can traverse the regex using an [`NFA`](regex_automata::nfa). NFAs are low memory
47//! representations of regular expressions, at the cost of being slower.
48//!
49//! These do not guarantee that output strings are unique (given that the graph is non-deterministic)
50//! but the search space memory will be much smaller.
51//!
52//! ## DFA (Deterministic Finite Automaton)
53//!
54//! Using [`DfaIter`] you can traverse the regex using a [`DFA`](regex_automata::dfa). DFAs are high memory
55//! representations of regular expressions, at the cost of using much more memory.
56//!
57//! These guarantee that output strings are unique, but the search space will likely use more memory.
58//!
59//! ## Utf8
60//!
61//! Using [`Utf8Iter`] you can get the outputs of the NFA or DFA iterators as [`String`]
62//! representations of regular expressions, at the cost of using much more memory.
63//!
64//! These guarantee that output strings are unique, but the search space will likely use more memory.
65
66use core::fmt;
67use std::error;
68
69pub use dfa::{DenseDfaIter, DfaIter, SparseDfaIter};
70pub use nfa::NfaIter;
71use regex_automata::dfa::Automaton;
72
73mod dfa;
74mod nfa;
75
76/// [`NfaIter`] or [`DfaIter`] iterator with UTF8 [`String`]s as output
77pub struct Utf8Iter<I>(I);
78
79#[derive(Debug)]
80/// Regex provided to [`Utf8Iter`] was not valid for generating UTF8 strings
81pub struct RegexNotUtf8;
82
83impl fmt::Display for RegexNotUtf8 {
84 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
85 f.write_str("regex is not utf8")
86 }
87}
88
89impl error::Error for RegexNotUtf8 {}
90
91impl TryFrom<NfaIter> for Utf8Iter<NfaIter> {
92 type Error = RegexNotUtf8;
93 fn try_from(value: NfaIter) -> Result<Self, Self::Error> {
94 if value.regex.is_utf8() {
95 Ok(Self(value))
96 } else {
97 Err(RegexNotUtf8)
98 }
99 }
100}
101
102impl<A: Automaton> TryFrom<DfaIter<A>> for Utf8Iter<DfaIter<A>> {
103 type Error = RegexNotUtf8;
104 fn try_from(value: DfaIter<A>) -> Result<Self, Self::Error> {
105 if value.regex.is_utf8() {
106 Ok(Self(value))
107 } else {
108 Err(RegexNotUtf8)
109 }
110 }
111}
112
113impl<A: Automaton> Utf8Iter<DfaIter<A>> {
114 /// Get the next matching string ref from this regex iterator
115 pub fn borrow_next(&mut self) -> Option<&str> {
116 let next = self.0.borrow_next()?;
117 Some(std::str::from_utf8(next).expect("Regex should only match utf8"))
118 }
119}
120impl Utf8Iter<NfaIter> {
121 /// Get the next matching string ref from this regex iterator
122 pub fn borrow_next(&mut self) -> Option<&str> {
123 let next = self.0.borrow_next()?;
124 Some(std::str::from_utf8(next).expect("Regex should only match utf8"))
125 }
126}
127
128impl<I: Iterator<Item = Vec<u8>>> Iterator for Utf8Iter<I> {
129 type Item = String;
130
131 fn next(&mut self) -> Option<Self::Item> {
132 let next = self.0.next()?;
133 Some(String::from_utf8(next).expect("Regex should only match utf8"))
134 }
135}
136
137#[cfg(test)]
138mod tests {
139 use std::collections::HashSet;
140
141 use regex_automata::nfa::thompson::NFA;
142
143 use super::*;
144
145 #[test]
146 fn finite() {
147 let nfa = NFA::new(r"[0-1]{4}-[0-1]{2}-[0-1]{2}").unwrap();
148 let iter = Utf8Iter::try_from(NfaIter::from(nfa)).unwrap();
149
150 // finite regex has finite iteration depth
151 // and no repeats
152 let x: HashSet<String> = iter.collect();
153 assert_eq!(x.len(), 256);
154 for y in x {
155 assert_eq!(y.len(), 10);
156 }
157 }
158
159 #[test]
160 fn repeated() {
161 let nfa = NFA::new(r"a+(0|1)").unwrap();
162 let iter = Utf8Iter::try_from(NfaIter::from(nfa)).unwrap();
163
164 // infinite regex iterates over all cases
165 let x: Vec<String> = iter.take(20).collect();
166 let y = [
167 "a0".to_owned(),
168 "a1".to_owned(),
169 "aa0".to_owned(),
170 "aa1".to_owned(),
171 "aaa0".to_owned(),
172 "aaa1".to_owned(),
173 "aaaa0".to_owned(),
174 "aaaa1".to_owned(),
175 "aaaaa0".to_owned(),
176 "aaaaa1".to_owned(),
177 "aaaaaa0".to_owned(),
178 "aaaaaa1".to_owned(),
179 "aaaaaaa0".to_owned(),
180 "aaaaaaa1".to_owned(),
181 "aaaaaaaa0".to_owned(),
182 "aaaaaaaa1".to_owned(),
183 "aaaaaaaaa0".to_owned(),
184 "aaaaaaaaa1".to_owned(),
185 "aaaaaaaaaa0".to_owned(),
186 "aaaaaaaaaa1".to_owned(),
187 ];
188 assert_eq!(x, y);
189 }
190
191 #[test]
192 fn complex() {
193 let nfa = NFA::new(r"(a+|b+)*").unwrap();
194 let iter = Utf8Iter::try_from(NfaIter::from(nfa)).unwrap();
195
196 // infinite regex iterates over all cases
197 let x: Vec<String> = iter.take(13).collect();
198 let y = [
199 "".to_owned(),
200 "a".to_owned(),
201 "b".to_owned(),
202 "aa".to_owned(),
203 "bb".to_owned(),
204 "aaa".to_owned(),
205 "bbb".to_owned(),
206 "aaaa".to_owned(),
207 // technically a different path
208 "aa".to_owned(),
209 "ab".to_owned(),
210 "bbbb".to_owned(),
211 "ba".to_owned(),
212 // technically a different path
213 "bb".to_owned(),
214 ];
215 assert_eq!(x, y);
216 }
217
218 #[test]
219 fn many() {
220 let search = NfaIter::new_many(&["[0-1]+", "^[a-b]+"]).unwrap();
221 let iter = Utf8Iter::try_from(search).unwrap();
222 let x: Vec<String> = iter.take(12).collect();
223 let y = [
224 "0".to_owned(),
225 "1".to_owned(),
226 "a".to_owned(),
227 "b".to_owned(),
228 "00".to_owned(),
229 "01".to_owned(),
230 "10".to_owned(),
231 "11".to_owned(),
232 "aa".to_owned(),
233 "ab".to_owned(),
234 "ba".to_owned(),
235 "bb".to_owned(),
236 ];
237 assert_eq!(x, y);
238 }
239}