1use std::{
2 collections::{BTreeMap, HashMap, HashSet, VecDeque},
3 hash::{BuildHasher, Hash, RandomState},
4 marker::PhantomData,
5 sync::atomic::AtomicUsize,
6};
7
8use crate::{Atom, ast::ASTNode};
9
10#[derive(Debug, PartialEq, Eq, Default)]
11struct FAFragment<A: Atom> {
12 is_accepted: bool,
13 rule_map: Vec<(A, A, usize)>,
15}
16
17impl<A: Atom> PartialOrd for FAFragment<A> {
18 fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
19 Some(self.cmp(other))
20 }
21}
22
23impl<A: Atom> Ord for FAFragment<A> {
24 fn cmp(&self, other: &Self) -> std::cmp::Ordering {
25 if self.is_accepted != other.is_accepted {
26 self.is_accepted.cmp(&other.is_accepted)
27 } else {
28 self.rule_map
29 .iter()
30 .map(|(start, end, to)| (start, end, to))
31 .cmp(
32 other
33 .rule_map
34 .iter()
35 .map(|(start, end, to)| (start, end, to)),
36 )
37 }
38 }
39}
40
41static AUTOMATON_ID: AtomicUsize = AtomicUsize::new(0);
42
43#[derive(Debug, Clone, Copy, PartialEq, Eq)]
44pub struct State<A: Atom> {
45 index: usize,
46 fa_id: usize,
48 _phantom: PhantomData<fn() -> A>,
49}
50
51impl<A: Atom> State<A> {
52 pub fn generated_by(&self, dfa: &DFA<A>) -> bool {
54 self.fa_id == dfa.id
55 }
56}
57
58impl<A: Atom> std::hash::Hash for State<A> {
59 fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
60 self.index.hash(state);
61 self.fa_id.hash(state);
62 }
63}
64
65#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
66pub enum TransitionError {
67 InvalidState,
69 InputIsRejected,
71}
72
73impl std::fmt::Display for TransitionError {
74 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
75 write!(f, "{self:?}")
76 }
77}
78
79impl std::error::Error for TransitionError {}
80
81#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
82pub enum FAAssembleError {
83 InvalidQuantifier,
84}
85
86impl std::fmt::Display for FAAssembleError {
87 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
88 write!(f, "{self:?}")
89 }
90}
91
92impl std::error::Error for FAAssembleError {}
93
94#[derive(Debug)]
95pub struct NFA<A: Atom> {
96 id: usize,
97 initial_state: usize,
98 states: Vec<FAFragment<A>>,
99}
100
101impl<A: Atom> NFA<A> {
102 fn empty() -> Self {
103 let id = AUTOMATON_ID.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
104 Self {
105 id,
106 initial_state: 0,
107 states: vec![FAFragment {
108 is_accepted: true,
109 rule_map: vec![],
110 }],
111 }
112 }
113
114 pub fn assemble(ast: Option<&ASTNode<A>>) -> Result<NFA<A>, FAAssembleError> {
115 let Some(ast) = ast else {
116 return Ok(Self::empty());
117 };
118
119 fn collect_character_ranges<A: Atom>(ast: &ASTNode<A>, ranges: &mut Vec<(A, bool)>) {
121 match ast {
122 ASTNode::Charcters {
123 start,
124 end,
125 negation: _,
126 } => {
127 ranges.push((*start, false));
128 ranges.push((*end, true));
129 }
130 ASTNode::Catenation(front, back) => {
131 collect_character_ranges(front, ranges);
132 collect_character_ranges(back, ranges);
133 }
134 ASTNode::Alternation(left, right) => {
135 collect_character_ranges(left, ranges);
136 collect_character_ranges(right, ranges);
137 }
138 ASTNode::ZeroOrOne(node) => {
139 collect_character_ranges(node, ranges);
140 }
141 ASTNode::ZeroOrMore(node) => {
142 collect_character_ranges(node, ranges);
143 }
144 ASTNode::OneOrMore(node) => {
145 collect_character_ranges(node, ranges);
146 }
147 ASTNode::Repeat {
148 node,
149 at_least: _,
150 at_most: _,
151 } => {
152 collect_character_ranges(node, ranges);
153 }
154 ASTNode::RepeatExact(node, _) => {
155 collect_character_ranges(node, ranges);
156 }
157 }
158 }
159
160 let mut ranges = vec![(A::MIN, false), (A::MAX, true)];
161 collect_character_ranges(ast, &mut ranges);
162 ranges.sort_unstable();
163 ranges.dedup();
164 let ranges = ranges
165 .windows(2)
166 .filter_map(|v| {
167 let (mut start, f) = v[0];
168 let (mut end, g) = v[1];
169 if start == end {
170 Some((start, end))
171 } else {
172 if f {
173 start = start.next()?;
174 }
175 if !g {
176 end = end.previous()?;
177 }
178 (start <= end).then_some((start, end))
179 }
180 })
181 .collect::<Vec<_>>();
182
183 fn build_states<A: Atom>(
184 ast: &ASTNode<A>,
185 ranges: &[(A, A)],
186 states: &mut Vec<FAFragment<A>>,
187 ) -> Result<(usize, usize), FAAssembleError> {
188 match ast {
189 &ASTNode::Charcters {
190 start,
191 end,
192 negation,
193 } => {
194 let mut pos = ranges.partition_point(|p| p.0 < start);
195 let mut from = FAFragment::default();
196 let to = FAFragment {
197 is_accepted: true,
198 ..Default::default()
199 };
200 if negation {
201 for &(f, t) in ranges.iter().take(pos) {
202 from.rule_map.push((f, t, states.len() + 1));
203 }
204 while pos < ranges.len() && ranges[pos].1 <= end {
205 pos += 1;
206 }
207 for &(f, t) in ranges.iter().skip(pos) {
208 from.rule_map.push((f, t, states.len() + 1));
209 }
210 } else {
211 while pos < ranges.len() && ranges[pos].1 <= end {
212 let (s, e) = ranges[pos];
213 from.rule_map.push((s, e, states.len() + 1));
214 pos += 1;
215 }
216 }
217 states.push(from);
218 states.push(to);
219 Ok((states.len() - 2, states.len() - 1))
220 }
221 ASTNode::Catenation(front, back) => {
222 let (sf, ef) = build_states(front, ranges, states)?;
224 let (sb, eb) = build_states(back, ranges, states)?;
225 states[ef].rule_map.push((A::EPSILON, A::EPSILON, sb));
226 states[ef].is_accepted = false;
227 Ok((sf, eb))
228 }
229 ASTNode::Alternation(left, right) => {
230 let (sl, el) = build_states(left, ranges, states)?;
234 let (sr, er) = build_states(right, ranges, states)?;
235 let mut start = FAFragment::default();
236 start.rule_map.push((A::EPSILON, A::EPSILON, sl));
237 start.rule_map.push((A::EPSILON, A::EPSILON, sr));
238 states.push(start);
239 let mut end = FAFragment::default();
240 let len = states.len();
241 states[el].rule_map.push((A::EPSILON, A::EPSILON, len));
242 states[el].is_accepted = false;
243 states[er].rule_map.push((A::EPSILON, A::EPSILON, len));
244 states[er].is_accepted = false;
245 end.is_accepted = true;
246 states.push(end);
247 Ok((states.len() - 2, states.len() - 1))
248 }
249 ASTNode::ZeroOrOne(node) => {
250 let (start, end) = build_states(node, ranges, states)?;
254 states[start].rule_map.push((A::EPSILON, A::EPSILON, end));
255 Ok((start, end))
256 }
257 ASTNode::ZeroOrMore(node) => {
258 let (start, end) = build_states(node, ranges, states)?;
263 let mut new_end = FAFragment::default();
264 states[end].is_accepted = false;
265 new_end.is_accepted = true;
266 let len = states.len();
267 states[start].rule_map.push((A::EPSILON, A::EPSILON, len));
268 states[end].rule_map.push((A::EPSILON, A::EPSILON, len));
269 states[end].rule_map.push((A::EPSILON, A::EPSILON, start));
270 states.push(new_end);
271 Ok((start, states.len() - 1))
272 }
273 ASTNode::OneOrMore(node) => {
274 let (start, end) = build_states(node, ranges, states)?;
279 let (start2, end2) = build_states(node, ranges, states)?;
280 states[end].is_accepted = false;
281 states[end2].is_accepted = false;
282 states[end].rule_map.push((A::EPSILON, A::EPSILON, start2));
283 let new_end = FAFragment {
284 is_accepted: true,
285 ..Default::default()
286 };
287 let len = states.len();
288 states[start2].rule_map.push((A::EPSILON, A::EPSILON, len));
289 states[end2].rule_map.push((A::EPSILON, A::EPSILON, len));
290 states[end2].rule_map.push((A::EPSILON, A::EPSILON, start2));
291 states.push(new_end);
292 Ok((start, states.len() - 1))
293 }
294 ASTNode::Repeat {
295 node,
296 at_least,
297 at_most,
298 } => {
299 let new_start = FAFragment::default();
300 states.push(new_start);
301 let new_start = states.len() - 1;
302 let new_end = FAFragment {
303 is_accepted: true,
304 ..Default::default()
305 };
306 states.push(new_end);
307 let new_end = states.len() - 1;
308
309 let mut start = new_start;
310 for _ in 0..*at_least {
311 let (new_start, new_end) = build_states(node, ranges, states)?;
312 states[start]
313 .rule_map
314 .push((A::EPSILON, A::EPSILON, new_start));
315 states[new_end].is_accepted = false;
316 start = new_end;
317 }
318 if let &Some(at_most) = at_most {
319 if *at_least > at_most {
320 return Err(FAAssembleError::InvalidQuantifier);
321 }
322 for _ in *at_least..at_most {
323 let (new_start, end) = build_states(node, ranges, states)?;
324 states[start]
325 .rule_map
326 .push((A::EPSILON, A::EPSILON, new_start));
327 states[start]
328 .rule_map
329 .push((A::EPSILON, A::EPSILON, new_end));
330 states[end].is_accepted = false;
331 start = end;
332 }
333 states[start]
334 .rule_map
335 .push((A::EPSILON, A::EPSILON, new_end));
336 } else {
337 let (new_start, end) = build_states(node, ranges, states)?;
338 states[start]
339 .rule_map
340 .push((A::EPSILON, A::EPSILON, new_start));
341 states[start]
342 .rule_map
343 .push((A::EPSILON, A::EPSILON, new_end));
344 states[end].rule_map.push((A::EPSILON, A::EPSILON, new_end));
345 states[end]
346 .rule_map
347 .push((A::EPSILON, A::EPSILON, new_start));
348 states[end].is_accepted = false;
349 }
350
351 Ok((new_start, new_end))
352 }
353 ASTNode::RepeatExact(node, n) => {
354 let new_start = FAFragment::default();
355 states.push(new_start);
356 let new_start = states.len() - 1;
357 let new_end = FAFragment {
358 is_accepted: true,
359 ..Default::default()
360 };
361 states.push(new_end);
362 let new_end = states.len() - 1;
363
364 let mut start = new_start;
365 for _ in 0..*n {
366 let (new_start, end) = build_states(node, ranges, states)?;
367 states[start]
368 .rule_map
369 .push((A::EPSILON, A::EPSILON, new_start));
370 states[end].is_accepted = false;
371 start = end;
372 }
373
374 states[start]
375 .rule_map
376 .push((A::EPSILON, A::EPSILON, new_end));
377
378 Ok((new_start, new_end))
379 }
380 }
381 }
382
383 let mut states = vec![];
384 let (initial_state, _) = build_states(ast, &ranges, &mut states)?;
385 states.iter_mut().for_each(|state| {
386 state.rule_map.sort_unstable();
387 state.rule_map.dedup();
388 });
389
390 Ok(NFA {
391 id: AUTOMATON_ID.fetch_add(1, std::sync::atomic::Ordering::Relaxed),
392 initial_state,
393 states,
394 })
395 }
396
397 pub fn is_deterministic(&self) -> bool {
403 let mut cache = BTreeMap::new();
404 let mut set = HashSet::new();
405 for i in 0..self.states.len() {
406 cache.clear();
407 set.insert(i);
408 let mut nt = vec![i];
409 while let Some(now) = nt.pop() {
410 let state = &self.states[now];
411 let pos = state
412 .rule_map
413 .partition_point(|rule| (rule.0, rule.1) < (A::EPSILON, A::EPSILON));
414 for rule in state.rule_map[pos..]
415 .iter()
416 .take_while(|rule| (rule.0, rule.1) == (A::EPSILON, A::EPSILON))
417 {
418 if set.insert(rule.2) {
419 nt.push(rule.2);
420 }
421 }
422 }
423
424 for state in set.drain() {
425 for rule in self.states[state]
426 .rule_map
427 .iter()
428 .filter(|rule| (rule.0, rule.1) != (A::EPSILON, A::EPSILON))
429 {
430 if let Some(to) = cache.insert((rule.0, rule.1), rule.2)
431 && to != rule.2
432 {
433 return false;
434 }
435 }
436 }
437 }
438
439 true
440 }
441
442 pub fn initial_state(&self) -> State<A> {
444 State {
445 index: self.initial_state,
446 fa_id: self.id,
447 _phantom: PhantomData,
448 }
449 }
450
451 pub fn transition(
458 &self,
459 initial_states: impl Iterator<Item = State<A>>,
460 input: impl Iterator<Item = A>,
461 ) -> Result<Vec<State<A>>, TransitionError> {
462 let mut current = HashSet::new();
463 let mut nt = VecDeque::new();
464 for state in initial_states {
465 if state.fa_id != self.id || self.states.len() <= state.index {
466 return Err(TransitionError::InvalidState);
467 }
468 if current.insert(state) {
469 nt.push_back(state.index);
470 }
471 }
472
473 let resolve_epsilon = |set: &mut HashSet<State<A>>, nt: &mut VecDeque<usize>| {
474 while let Some(now) = nt.pop_front() {
475 for rule in &self.states[now].rule_map {
476 if (rule.0, rule.1) == (A::EPSILON, A::EPSILON)
477 && set.insert(State {
478 index: rule.2,
479 fa_id: self.id,
480 _phantom: PhantomData,
481 })
482 {
483 nt.push_back(rule.2);
484 }
485 }
486 }
487 };
488 resolve_epsilon(&mut current, &mut nt);
489
490 for c in input {
491 let mut next = HashSet::new();
492 for cur in current {
493 for rule in &self.states[cur.index].rule_map {
494 if (rule.0..=rule.1).contains(&c) {
495 next.insert(State {
496 index: rule.2,
497 fa_id: self.id,
498 _phantom: PhantomData,
499 });
500 }
501 }
502 }
503
504 if next.is_empty() {
505 return Err(TransitionError::InputIsRejected);
506 }
507
508 current = {
509 nt.extend(next.iter().map(|state| state.index));
510 resolve_epsilon(&mut next, &mut nt);
511 next
512 };
513 }
514
515 Ok(current.into_iter().collect())
516 }
517
518 pub fn is_accepted(&self, state: State<A>) -> bool {
522 state.fa_id == self.id && self.states[state.index].is_accepted
523 }
524
525 pub fn build_dfa(&self) -> DFA<A> {
527 let mut states = vec![];
528 let mut cur = HashSet::from([self.initial_state]);
529 let mut stack = vec![self.initial_state];
530 while let Some(now) = stack.pop() {
531 for &(_, _, to) in self.states[now]
532 .rule_map
533 .iter()
534 .take_while(|v| v.0 == A::EPSILON)
535 {
536 if cur.insert(to) {
537 stack.push(to);
538 }
539 }
540 }
541 let hash_state = RandomState::new();
542
543 fn generate_hash(set: &HashSet<usize>, state: &RandomState) -> u64 {
545 set.iter()
546 .copied()
547 .map(|id| state.hash_one(id))
548 .fold(0, |s, v| s ^ v)
549 }
550
551 fn build_states<A: Atom>(
552 hash: u64,
553 cur: &HashSet<usize>,
554 hash_state: &RandomState,
555 old_states: &[FAFragment<A>],
556 states: &mut Vec<FAFragment<A>>,
557 memo: &mut HashMap<u64, usize>,
558 ) -> usize {
559 let state_index = states.len();
560 memo.insert(hash, state_index);
561 states.push(FAFragment::default());
562 if cur.iter().any(|index| old_states[*index].is_accepted) {
563 states[state_index].is_accepted = true;
564 }
565
566 let mut rules = vec![];
568 for &c in cur.iter() {
569 rules.extend(
570 old_states[c]
571 .rule_map
572 .iter()
573 .filter(|&v| v.0 != A::EPSILON)
574 .cloned(),
575 );
576 }
577 rules.sort_unstable_by_key(|v| (v.0, v.2));
578 rules.dedup();
579
580 if rules.is_empty() {
581 return state_index;
582 }
583 let mut rule = (rules[0].0, rules[0].1);
584 let mut buf = vec![];
585 let mut set = HashSet::new();
586 let mut push_state = |rule: (A, A), buf: &mut Vec<usize>| {
587 set.clear();
588 set.extend(buf.iter().copied());
589 while let Some(now) = buf.pop() {
592 for to in old_states[now]
593 .rule_map
594 .iter()
595 .take_while(|v| v.0 == A::EPSILON)
596 {
597 if set.insert(to.2) {
598 buf.push(to.2);
599 }
600 }
601 }
602 let hash = generate_hash(&set, hash_state);
603 if let Some(to) = memo.get(&hash) {
604 states[state_index].rule_map.push((rule.0, rule.1, *to));
606 } else {
607 let index = build_states(hash, &set, hash_state, old_states, states, memo);
609 states[state_index].rule_map.push((rule.0, rule.1, index));
610 }
611 };
612 for (start, end, to) in rules {
613 if rule == (start, end) {
614 buf.push(to);
615 continue;
616 }
617
618 if !buf.is_empty() {
619 push_state(rule, &mut buf);
620 debug_assert!(buf.is_empty());
623 }
624
625 rule = (start, end);
626 buf.push(to);
627 }
628
629 if !buf.is_empty() {
630 push_state(rule, &mut buf);
631 }
632 state_index
633 }
634
635 build_states(
636 generate_hash(&cur, &hash_state),
637 &cur,
638 &hash_state,
639 &self.states,
640 &mut states,
641 &mut HashMap::new(),
642 );
643
644 let mut dfa = DFA {
645 id: self.id,
646 states,
647 };
648 dfa.minimize();
649 dfa
650 }
651}
652
653#[derive(Debug)]
654pub struct DFA<A: Atom> {
655 id: usize,
656 states: Vec<FAFragment<A>>,
659}
660
661impl<A: Atom> DFA<A> {
662 fn empty() -> Self {
663 let id = AUTOMATON_ID.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
664 Self {
665 id,
666 states: vec![FAFragment {
667 is_accepted: true,
668 rule_map: vec![],
669 }],
670 }
671 }
672
673 pub fn assemble(ast: Option<&ASTNode<A>>) -> Result<DFA<A>, FAAssembleError> {
679 let Some(ast) = ast else {
680 return Ok(DFA::empty());
681 };
682
683 Ok(NFA::assemble(Some(ast))?.build_dfa())
684 }
685
686 pub fn initial_state(&self) -> State<A> {
688 State {
689 index: 0,
690 fa_id: self.id,
691 _phantom: PhantomData,
692 }
693 }
694
695 pub fn transition(
700 &self,
701 initial_state: State<A>,
702 input: impl Iterator<Item = A>,
703 ) -> Result<State<A>, TransitionError> {
704 if initial_state.fa_id != self.id {
705 return Err(TransitionError::InvalidState);
706 }
707
708 let mut state = initial_state.index;
709 if self.states.len() <= state {
710 return Err(TransitionError::InvalidState);
711 }
712
713 for c in input {
714 debug_assert!(
715 self.states[state]
716 .rule_map
717 .windows(2)
718 .all(|v| v[0].1 < v[1].0)
719 );
720 state = self.states[state]
721 .rule_map
722 .binary_search_by(|(start, end, _)| {
723 if &c < start {
724 std::cmp::Ordering::Greater
725 } else if end < &c {
726 std::cmp::Ordering::Less
727 } else {
728 std::cmp::Ordering::Equal
729 }
730 })
731 .map(|index| self.states[state].rule_map[index].2)
732 .map_err(|_| TransitionError::InputIsRejected)?;
733 }
734
735 Ok(State {
736 index: state,
737 fa_id: self.id,
738 _phantom: PhantomData,
739 })
740 }
741
742 pub fn is_match(&self, input: impl Iterator<Item = A>) -> bool {
744 self.transition(self.initial_state(), input)
745 .is_ok_and(|state| self.is_accepted(state))
746 }
747
748 pub fn is_accepted(&self, state: State<A>) -> bool {
752 state.fa_id == self.id && self.states[state.index].is_accepted
753 }
754
755 fn minimize(&mut self) {
756 let mut replace_to = (0..self.states.len()).collect::<Vec<_>>();
757 let mut index = (0..self.states.len()).collect::<Vec<_>>();
758 loop {
759 index.sort_unstable_by_key(|&index| (&self.states[index], index));
760 let mut update = false;
761 for v in index.windows(2) {
762 let l = v[0];
763 let r = v[1];
764
765 if self.states[l] == self.states[r] {
766 replace_to[r] = replace_to[l];
769 update = true;
770 }
771 }
772
773 if !update {
774 break;
775 }
776
777 for &index in &index {
778 if index == replace_to[index] {
779 for (_, _, to) in self.states[index].rule_map.iter_mut() {
780 *to = replace_to[*to];
781 }
782 }
783 }
784 index.retain(|&index| index == replace_to[index]);
785 }
786
787 index.sort_unstable();
788 for (to, &from) in index.iter().enumerate() {
789 self.states.swap(to, from);
790 replace_to[from] = to;
791 }
792 self.states.truncate(index.len());
793 for state in self.states.iter_mut() {
794 for (_, _, to) in state.rule_map.iter_mut() {
795 *to = replace_to[*to];
796 }
797 }
798 }
799}