vyre_std/pattern/
dfa_minimize.rs1use std::collections::VecDeque;
31
32use super::types::{Dfa, INVALID_STATE};
33
34#[must_use]
50#[inline]
51pub fn dfa_minimize(dfa: &Dfa) -> Dfa {
52 let state_count = dfa.state_count as usize;
53 if state_count == 0 {
54 return dfa.clone();
55 }
56
57 let dead = state_count as u32;
61 let total_states = state_count + 1;
62
63 let mut trans = vec![dead; total_states * 256];
67 for s in 0..state_count {
68 for b in 0..256usize {
69 let t = dfa.transitions[s * 256 + b];
70 trans[s * 256 + b] = if t == INVALID_STATE { dead } else { t };
71 }
72 }
73 let mut pred_head: Vec<(u32, u32)> = vec![(0, 0); 256 * total_states];
84 let mut pred_flat: Vec<u32>;
85 {
86 let mut count = vec![0u32; 256 * total_states];
87 for p in 0..total_states as u32 {
88 for b in 0..256usize {
89 let t = trans[p as usize * 256 + b];
90 count[b * total_states + t as usize] += 1;
91 }
92 }
93 let mut cursor = 0u32;
94 for i in 0..(256 * total_states) {
95 pred_head[i] = (cursor, count[i]);
96 cursor += count[i];
97 }
98 pred_flat = vec![0u32; cursor as usize];
99 let mut fill = vec![0u32; 256 * total_states];
100 for p in 0..total_states as u32 {
101 for b in 0..256usize {
102 let t = trans[p as usize * 256 + b];
103 let slot = b * total_states + t as usize;
104 let off = pred_head[slot].0 + fill[slot];
105 pred_flat[off as usize] = p;
106 fill[slot] += 1;
107 }
108 }
109 }
110 let mut accept_class: Vec<u32> = Vec::with_capacity(state_count);
113 let mut non_accept_class: Vec<u32> = Vec::with_capacity(total_states);
114 for s in 0..state_count {
115 if dfa.accept[s] {
116 accept_class.push(s as u32);
117 } else {
118 non_accept_class.push(s as u32);
119 }
120 }
121 non_accept_class.push(dead);
122
123 let mut classes: Vec<Vec<u32>> = Vec::with_capacity(total_states);
124 let mut class_of: Vec<u32> = vec![0; total_states];
125 if !accept_class.is_empty() {
126 let id = classes.len() as u32;
127 for &s in &accept_class {
128 class_of[s as usize] = id;
129 }
130 classes.push(accept_class);
131 }
132 if !non_accept_class.is_empty() {
133 let id = classes.len() as u32;
134 for &s in &non_accept_class {
135 class_of[s as usize] = id;
136 }
137 classes.push(non_accept_class);
138 }
139
140 let mut worklist: VecDeque<u32> = VecDeque::new();
141 let mut on_worklist = vec![false; classes.len()];
142 if classes.len() == 2 {
144 let smaller = if classes[0].len() <= classes[1].len() {
145 0
146 } else {
147 1
148 };
149 worklist.push_back(smaller as u32);
150 on_worklist[smaller] = true;
151 } else if classes.len() == 1 {
152 worklist.push_back(0);
153 on_worklist[0] = true;
154 }
155
156 let mut hit = vec![false; classes.len()];
159 let mut x_set = vec![false; total_states];
160 let mut intersect_buf: Vec<u32> = Vec::with_capacity(total_states);
161 let mut x_list: Vec<u32> = Vec::with_capacity(total_states);
162
163 while let Some(splitter_id) = worklist.pop_front() {
164 if (splitter_id as usize) >= on_worklist.len() {
165 continue;
166 }
167 on_worklist[splitter_id as usize] = false;
168 for byte in 0u16..256 {
169 let b_idx = byte as usize;
170 for &s in &classes[splitter_id as usize] {
173 let (off, len) = pred_head[b_idx * total_states + s as usize];
174 for p in &pred_flat[off as usize..(off + len) as usize] {
175 if !x_set[*p as usize] {
176 x_set[*p as usize] = true;
177 }
178 }
179 }
180 x_list.clear();
181 x_list.extend((0..total_states as u32).filter(|p| x_set[*p as usize]));
182
183 hit.resize(classes.len(), false);
186 hit.iter_mut().for_each(|h| *h = false);
187 for &p in &x_list {
188 let c = class_of[p as usize] as usize;
189 if !hit[c] {
190 hit[c] = true;
191 }
192 }
193
194 for c in 0..classes.len() {
195 if !hit[c] {
196 continue;
197 }
198 intersect_buf.clear();
199 classes[c].retain(|&s| {
200 if x_set[s as usize] {
201 intersect_buf.push(s);
202 false
203 } else {
204 true
205 }
206 });
207 if intersect_buf.is_empty() || classes[c].is_empty() {
208 if classes[c].is_empty() {
211 classes[c] = std::mem::take(&mut intersect_buf);
212 } else {
213 classes[c].append(&mut intersect_buf);
214 }
215 continue;
216 }
217
218 let new_id = classes.len() as u32;
219 let intersect = std::mem::take(&mut intersect_buf);
220 for &s in &intersect {
221 class_of[s as usize] = new_id;
222 }
223 classes.push(intersect);
224 on_worklist.push(false);
225 hit.push(false);
226
227 if on_worklist[c] {
233 worklist.push_back(new_id);
234 on_worklist[new_id as usize] = true;
235 } else {
236 let enqueue = if classes[c].len() <= classes[new_id as usize].len() {
237 c as u32
238 } else {
239 new_id
240 };
241 worklist.push_back(enqueue);
242 on_worklist[enqueue as usize] = true;
243 }
244 }
245
246 for &p in &x_list {
249 x_set[p as usize] = false;
250 }
251 }
252 }
253
254 let dead_class = class_of[dead as usize];
257 let start_class = class_of[dfa.start as usize];
258
259 let mut canonical: Vec<i64> = vec![-1; classes.len()];
260 canonical[start_class as usize] = 0;
261 let mut queue: VecDeque<u32> = VecDeque::new();
262 queue.push_back(start_class);
263 let mut next_id: u32 = 1;
264 while let Some(class_id) = queue.pop_front() {
265 let representative = match classes[class_id as usize].iter().find(|s| **s != dead) {
266 Some(&s) => s,
267 None => continue,
268 };
269 for byte in 0u8..=255 {
270 let target = trans[representative as usize * 256 + byte as usize];
271 if target == dead {
272 continue;
273 }
274 let target_class = class_of[target as usize];
275 if canonical[target_class as usize] < 0 {
276 canonical[target_class as usize] = i64::from(next_id);
277 queue.push_back(target_class);
278 next_id += 1;
279 }
280 }
281 }
282
283 let final_state_count = next_id;
284 let mut transitions = vec![INVALID_STATE; (final_state_count as usize) * 256];
285 let mut accept = vec![false; final_state_count as usize];
286
287 for (class_id, &canon) in canonical.iter().enumerate() {
288 if canon < 0 {
289 continue;
290 }
291 let canon = canon as u32;
292 let representative = match classes[class_id].iter().find(|s| **s != dead) {
293 Some(&s) => s,
294 None => continue,
295 };
296 if dfa.accept[representative as usize] {
297 accept[canon as usize] = true;
298 }
299 for byte in 0u8..=255 {
300 let target = trans[representative as usize * 256 + byte as usize];
301 if target == dead {
302 continue;
303 }
304 let target_class = class_of[target as usize];
305 if target_class == dead_class {
306 continue;
307 }
308 transitions[(canon as usize) * 256 + byte as usize] =
309 canonical[target_class as usize] as u32;
310 }
311 }
312
313 Dfa {
314 state_count: final_state_count,
315 transitions,
316 start: 0,
317 accept,
318 }
319}
320
321#[cfg(test)]
322mod tests {
323 use super::*;
324 use crate::pattern::{nfa_to_dfa::nfa_to_dfa, regex_to_nfa::regex_to_nfa};
325
326 fn run(dfa: &Dfa, input: &[u8]) -> bool {
327 let mut state = dfa.start;
328 for &b in input {
329 let next = dfa.go(state, b);
330 if next == INVALID_STATE {
331 return false;
332 }
333 state = next;
334 }
335 dfa.accept[state as usize]
336 }
337
338 #[test]
339 fn minimize_preserves_language_literal() {
340 let nfa = regex_to_nfa("hello").unwrap();
341 let dfa = nfa_to_dfa(&nfa).unwrap();
342 let min = dfa_minimize(&dfa);
343 assert!(run(&min, b"hello"));
344 assert!(!run(&min, b"world"));
345 assert!(min.state_count <= dfa.state_count);
346 }
347
348 #[test]
349 fn minimize_is_idempotent() {
350 let nfa = regex_to_nfa("a|aa|aaa").unwrap();
351 let dfa = nfa_to_dfa(&nfa).unwrap();
352 let once = dfa_minimize(&dfa);
353 let twice = dfa_minimize(&once);
354 assert_eq!(once, twice, "Hopcroft output must be canonical");
355 }
356
357 #[test]
358 fn minimize_alternation() {
359 let nfa = regex_to_nfa("foo|bar|baz").unwrap();
360 let dfa = nfa_to_dfa(&nfa).unwrap();
361 let min = dfa_minimize(&dfa);
362 assert!(run(&min, b"foo"));
363 assert!(run(&min, b"bar"));
364 assert!(run(&min, b"baz"));
365 assert!(!run(&min, b"qux"));
366 }
367
368 #[test]
369 fn minimize_collapses_equivalent_states() {
370 let nfa = regex_to_nfa("(a|b)(c|d)").unwrap();
371 let dfa = nfa_to_dfa(&nfa).unwrap();
372 let min = dfa_minimize(&dfa);
373 assert!(run(&min, b"ac"));
374 assert!(run(&min, b"bd"));
375 assert!(!run(&min, b"ab"));
376 assert!(min.state_count <= dfa.state_count);
377 }
378
379 #[test]
380 fn minimize_empty_language() {
381 let nfa = regex_to_nfa("").unwrap();
382 let dfa = nfa_to_dfa(&nfa).unwrap();
383 let min = dfa_minimize(&dfa);
384 assert!(run(&min, b""));
385 assert!(!run(&min, b"x"));
386 }
387
388 #[test]
389 fn minimize_kleene_star() {
390 let nfa = regex_to_nfa("a*").unwrap();
391 let dfa = nfa_to_dfa(&nfa).unwrap();
392 let min = dfa_minimize(&dfa);
393 assert!(run(&min, b""));
394 assert!(run(&min, b"aaaaa"));
395 assert!(!run(&min, b"b"));
396 }
397}