1use std::collections::HashMap;
15
16use super::types::{Dfa, Nfa, NfaStateId, PatternError, INVALID_STATE};
17
18pub const MAX_DFA_STATES: u32 = 65_535;
23
24#[inline]
46pub fn nfa_to_dfa(nfa: &Nfa) -> Result<Dfa, PatternError> {
47 let n_states = nfa.state_count as usize;
55 let mut eps_slice: Vec<(u32, u32)> = vec![(0, 0); n_states];
56 let mut byte_slice: Vec<[(u32, u32); 256]> = Vec::with_capacity(n_states);
57 byte_slice.resize(n_states, [(0, 0); 256]);
58 let mut eps_targets: Vec<NfaStateId> = Vec::with_capacity(nfa.edges.len());
59 let mut byte_targets: Vec<NfaStateId> = Vec::with_capacity(nfa.edges.len());
60 {
61 let mut eps_count = vec![0u32; n_states];
64 let mut byte_count = vec![[0u32; 256]; n_states];
65 for edge in &nfa.edges {
66 let from = edge.from as usize;
67 match edge.byte {
68 None => eps_count[from] += 1,
69 Some(b) => byte_count[from][b as usize] += 1,
70 }
71 }
72 let mut eps_cursor = 0u32;
73 let mut byte_cursor = 0u32;
74 for s in 0..n_states {
75 eps_slice[s] = (eps_cursor, eps_count[s]);
76 eps_cursor += eps_count[s];
77 for b in 0..256usize {
78 byte_slice[s][b] = (byte_cursor, byte_count[s][b]);
79 byte_cursor += byte_count[s][b];
80 }
81 }
82 eps_targets.resize(eps_cursor as usize, 0);
83 byte_targets.resize(byte_cursor as usize, 0);
84 let mut eps_fill = vec![0u32; n_states];
85 let mut byte_fill = vec![[0u32; 256]; n_states];
86 for edge in &nfa.edges {
87 let from = edge.from as usize;
88 match edge.byte {
89 None => {
90 let off = eps_slice[from].0 + eps_fill[from];
91 eps_targets[off as usize] = edge.to;
92 eps_fill[from] += 1;
93 }
94 Some(b) => {
95 let off = byte_slice[from][b as usize].0 + byte_fill[from][b as usize];
96 byte_targets[off as usize] = edge.to;
97 byte_fill[from][b as usize] += 1;
98 }
99 }
100 }
101 }
102
103 let mut closure_scratch = ClosureScratch::with_capacity(n_states);
109 let start_closure = epsilon_closure(
110 &eps_slice,
111 &eps_targets,
112 &[nfa.start],
113 n_states,
114 &mut closure_scratch,
115 );
116 let mut subsets: Vec<Vec<NfaStateId>> = vec![start_closure.clone()];
117 let mut subset_index: HashMap<Vec<NfaStateId>, u32> = HashMap::new();
118 subset_index.insert(start_closure.clone(), 0);
119
120 let mut transitions: Vec<u32> = Vec::with_capacity(256);
121 let mut accept: Vec<bool> = Vec::with_capacity(1);
122 extend_row(&mut transitions, &mut accept, &start_closure, &nfa.accept);
123
124 let mut worklist: Vec<usize> = vec![0];
125 let mut reachable_scratch: Vec<NfaStateId> = Vec::with_capacity(n_states);
126 while let Some(idx) = worklist.pop() {
127 for byte in 0u16..256 {
128 reachable_scratch.clear();
129 for &state in subsets[idx].iter() {
130 let (off, len) = byte_slice[state as usize][byte as usize];
131 for t in &byte_targets[off as usize..(off + len) as usize] {
132 reachable_scratch.push(*t);
133 }
134 }
135 if reachable_scratch.is_empty() {
136 transitions[idx * 256 + byte as usize] = INVALID_STATE;
137 continue;
138 }
139 let closure = epsilon_closure(
140 &eps_slice,
141 &eps_targets,
142 &reachable_scratch,
143 n_states,
144 &mut closure_scratch,
145 );
146 let next_idx = match subset_index.get(&closure) {
147 Some(existing) => *existing,
148 None => {
149 if subsets.len() as u64 >= u64::from(MAX_DFA_STATES) {
154 return Err(PatternError::StateOverflow {
155 limit: MAX_DFA_STATES,
156 });
157 }
158 let new_id = subsets.len() as u32;
159 subsets.push(closure.clone());
160 subset_index.insert(closure.clone(), new_id);
161 extend_row(&mut transitions, &mut accept, &closure, &nfa.accept);
162 worklist.push(new_id as usize);
163 new_id
164 }
165 };
166 transitions[idx * 256 + byte as usize] = next_idx;
167 }
168 }
169
170 Ok(Dfa {
171 state_count: subsets.len() as u32,
172 transitions,
173 start: 0,
174 accept,
175 })
176}
177
178fn extend_row(
179 transitions: &mut Vec<u32>,
180 accept: &mut Vec<bool>,
181 closure: &[NfaStateId],
182 nfa_accept: &[bool],
183) {
184 transitions.extend(std::iter::repeat_n(INVALID_STATE, 256));
185 accept.push(closure.iter().any(|s| nfa_accept[*s as usize]));
186}
187
188pub(crate) struct ClosureScratch {
202 in_closure: Vec<bool>,
203 stack: Vec<NfaStateId>,
204 collected: Vec<NfaStateId>,
205}
206
207impl ClosureScratch {
208 pub(crate) fn with_capacity(n_states: usize) -> Self {
209 Self {
210 in_closure: vec![false; n_states],
211 stack: Vec::with_capacity(n_states.min(64)),
212 collected: Vec::with_capacity(n_states.min(64)),
213 }
214 }
215}
216
217fn epsilon_closure(
218 eps_slice: &[(u32, u32)],
219 eps_targets: &[NfaStateId],
220 seeds: &[NfaStateId],
221 n_states: usize,
222 scratch: &mut ClosureScratch,
223) -> Vec<NfaStateId> {
224 if scratch.in_closure.len() < n_states {
229 scratch.in_closure.resize(n_states, false);
230 }
231 for state in scratch.collected.drain(..) {
235 scratch.in_closure[state as usize] = false;
236 }
237 scratch.stack.clear();
238 scratch.stack.extend_from_slice(seeds);
239 while let Some(state) = scratch.stack.pop() {
240 let idx = state as usize;
241 if scratch.in_closure[idx] {
242 continue;
243 }
244 scratch.in_closure[idx] = true;
245 scratch.collected.push(state);
246 let (off, len) = eps_slice[idx];
247 for t in &eps_targets[off as usize..(off + len) as usize] {
248 scratch.stack.push(*t);
249 }
250 }
251 let mut out = scratch.collected.clone();
254 out.sort_unstable();
255 out
256}
257
258#[cfg(test)]
259mod tests {
260 use super::*;
261 use crate::pattern::regex_to_nfa::regex_to_nfa;
262
263 fn run(dfa: &Dfa, input: &[u8]) -> bool {
264 let mut state = dfa.start;
265 for &b in input {
266 let next = dfa.go(state, b);
267 if next == INVALID_STATE {
268 return false;
269 }
270 state = next;
271 }
272 dfa.accept[state as usize]
273 }
274
275 #[test]
276 fn literal_pattern_accepts_literal_input() {
277 let nfa = regex_to_nfa("abc").unwrap();
278 let dfa = nfa_to_dfa(&nfa).unwrap();
279 assert!(run(&dfa, b"abc"));
280 assert!(!run(&dfa, b"abd"));
281 assert!(!run(&dfa, b"ab"));
282 }
283
284 #[test]
285 fn alternation_accepts_either() {
286 let nfa = regex_to_nfa("foo|bar").unwrap();
287 let dfa = nfa_to_dfa(&nfa).unwrap();
288 assert!(run(&dfa, b"foo"));
289 assert!(run(&dfa, b"bar"));
290 assert!(!run(&dfa, b"baz"));
291 }
292
293 #[test]
294 fn kleene_star_accepts_repeats() {
295 let nfa = regex_to_nfa("a*").unwrap();
296 let dfa = nfa_to_dfa(&nfa).unwrap();
297 assert!(run(&dfa, b""));
298 assert!(run(&dfa, b"a"));
299 assert!(run(&dfa, b"aaaaa"));
300 assert!(!run(&dfa, b"b"));
301 }
302
303 #[test]
304 fn plus_requires_one() {
305 let nfa = regex_to_nfa("a+").unwrap();
306 let dfa = nfa_to_dfa(&nfa).unwrap();
307 assert!(!run(&dfa, b""));
308 assert!(run(&dfa, b"a"));
309 assert!(run(&dfa, b"aaa"));
310 }
311
312 #[test]
313 fn optional_allows_absence() {
314 let nfa = regex_to_nfa("ab?c").unwrap();
315 let dfa = nfa_to_dfa(&nfa).unwrap();
316 assert!(run(&dfa, b"ac"));
317 assert!(run(&dfa, b"abc"));
318 assert!(!run(&dfa, b"abbc"));
319 }
320
321 #[test]
322 fn character_class_accepts_members() {
323 let nfa = regex_to_nfa("[a-c]").unwrap();
324 let dfa = nfa_to_dfa(&nfa).unwrap();
325 assert!(run(&dfa, b"a"));
326 assert!(run(&dfa, b"b"));
327 assert!(run(&dfa, b"c"));
328 assert!(!run(&dfa, b"d"));
329 }
330
331 #[test]
332 fn group_with_star() {
333 let nfa = regex_to_nfa("(ab)*c").unwrap();
334 let dfa = nfa_to_dfa(&nfa).unwrap();
335 assert!(run(&dfa, b"c"));
336 assert!(run(&dfa, b"abc"));
337 assert!(run(&dfa, b"ababababc"));
338 assert!(!run(&dfa, b"ac"));
339 }
340
341 #[test]
342 fn empty_regex_only_accepts_empty() {
343 let nfa = regex_to_nfa("").unwrap();
344 let dfa = nfa_to_dfa(&nfa).unwrap();
345 assert!(run(&dfa, b""));
346 assert!(!run(&dfa, b"a"));
347 }
348}