ere_core/
working_nfa.rs

1use quote::{quote, ToTokens};
2
3/// This recognizes "true" regular expressions, meaning implicit anchors and no capture groups.
4/// Capture groups are treated as simple concatenation.
5/// However, it also includes additional stuff like special start/end anchors
6use crate::parse_tree::Atom;
7use crate::simplified_tree::SimplifiedTreeNode;
8
9#[derive(Debug, Clone, Copy, PartialEq, Eq)]
10pub enum EpsilonType {
11    None,
12    StartAnchor,
13    EndAnchor,
14    StartCapture(usize),
15    EndCapture(usize),
16}
17impl ToTokens for EpsilonType {
18    fn to_tokens(&self, tokens: &mut proc_macro2::TokenStream) {
19        match self {
20            EpsilonType::None => {
21                tokens.extend(quote! { ::ere_core::working_nfa::EpsilonType::None })
22            }
23            EpsilonType::StartAnchor => {
24                tokens.extend(quote! { ::ere_core::working_nfa::EpsilonType::StartAnchor })
25            }
26            EpsilonType::EndAnchor => {
27                tokens.extend(quote! { ::ere_core::working_nfa::EpsilonType::EndAnchor })
28            }
29            EpsilonType::StartCapture(group_num) => tokens.extend(quote! {
30                ::ere_core::working_nfa::EpsilonType::StartCapture(#group_num)
31            }),
32            EpsilonType::EndCapture(group_num) => tokens.extend(quote! {
33                ::ere_core::working_nfa::EpsilonType::EndCapture(#group_num)
34            }),
35        };
36    }
37}
38
39/// An epsilon transition for the [`WorkingNFA`]
40#[derive(Debug, Clone, Copy)]
41pub struct EpsilonTransition {
42    pub(crate) from: usize,
43    pub(crate) to: usize,
44    pub(crate) special: EpsilonType,
45}
46impl EpsilonTransition {
47    pub(crate) const fn new(from: usize, to: usize) -> EpsilonTransition {
48        return EpsilonTransition {
49            from,
50            to,
51            special: EpsilonType::None,
52        };
53    }
54    pub(crate) const fn with_offset(self, offset: usize) -> EpsilonTransition {
55        return EpsilonTransition {
56            from: self.from + offset,
57            to: self.to + offset,
58            special: self.special,
59        };
60    }
61    pub(crate) const fn add_offset(&self, offset: usize) -> EpsilonTransition {
62        return EpsilonTransition {
63            from: self.from + offset,
64            to: self.to + offset,
65            special: self.special,
66        };
67    }
68    /// Only intended for internal use by macros.
69    pub const fn __load(from: usize, to: usize, special: EpsilonType) -> EpsilonTransition {
70        return EpsilonTransition { from, to, special };
71    }
72}
73impl std::fmt::Display for EpsilonTransition {
74    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
75        return write!(f, "{} -> {}", self.from, self.to);
76    }
77}
78impl ToTokens for EpsilonTransition {
79    fn to_tokens(&self, tokens: &mut proc_macro2::TokenStream) {
80        let EpsilonTransition { from, to, special } = self;
81        tokens.extend(quote! {
82            ere_core::working_nfa::EpsilonTransition::__load(
83                #from,
84                #to,
85                #special,
86            )
87        });
88    }
89}
90
91#[derive(Debug, Clone)]
92pub(crate) struct WorkingTransition {
93    pub(crate) from: usize,
94    pub(crate) to: usize,
95    pub(crate) symbol: Atom,
96}
97impl WorkingTransition {
98    pub fn new(from: usize, to: usize, symbol: Atom) -> WorkingTransition {
99        return WorkingTransition { from, to, symbol };
100    }
101    pub fn with_offset(self, offset: usize) -> WorkingTransition {
102        return WorkingTransition {
103            from: self.from + offset,
104            to: self.to + offset,
105            symbol: self.symbol,
106        };
107    }
108    pub fn add_offset(&self, offset: usize) -> WorkingTransition {
109        return WorkingTransition {
110            from: self.from + offset,
111            to: self.to + offset,
112            symbol: self.symbol.clone(),
113        };
114    }
115}
116impl std::fmt::Display for WorkingTransition {
117    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
118        return write!(f, "{} -({})> {}", self.from, self.symbol, self.to);
119    }
120}
121
122/// Each NFA has one start state (`0`) and one accept state (`states - 1`)
123#[derive(Debug)]
124pub struct WorkingNFA {
125    pub(crate) transitions: Vec<WorkingTransition>,
126    pub(crate) epsilons: Vec<EpsilonTransition>,
127    pub(crate) states: usize,
128}
129impl WorkingNFA {
130    const fn build_empty() -> WorkingNFA {
131        return WorkingNFA {
132            transitions: Vec::new(),
133            epsilons: Vec::new(),
134            states: 1,
135        };
136    }
137    fn build_symbol(c: &Atom) -> WorkingNFA {
138        return WorkingNFA {
139            transitions: vec![WorkingTransition::new(0, 1, c.clone())],
140            epsilons: Vec::new(),
141            states: 2,
142        };
143    }
144    fn build_union(nodes: &[SimplifiedTreeNode]) -> WorkingNFA {
145        let sub_nfas: Vec<WorkingNFA> = nodes.iter().map(WorkingNFA::build).collect();
146        let states = 2 + sub_nfas.iter().map(|n| n.states).sum::<usize>();
147        let mut transitions = Vec::new();
148        let mut epsilons = Vec::new();
149        let mut used_states = 1usize;
150        for nfa in sub_nfas {
151            // Epsilon transition in
152            epsilons.push(EpsilonTransition::new(0, used_states));
153            // Copy internal transitions
154            transitions.extend(
155                nfa.transitions
156                    .into_iter()
157                    .map(|t| t.with_offset(used_states)),
158            );
159            epsilons.extend(nfa.epsilons.into_iter().map(|t| t.with_offset(used_states)));
160            // Epsilon transition out
161            epsilons.push(EpsilonTransition::new(
162                used_states + nfa.states - 1,
163                states - 1,
164            ));
165            used_states += nfa.states;
166        }
167        assert_eq!(used_states + 1, states);
168
169        return WorkingNFA {
170            transitions,
171            epsilons,
172            states,
173        };
174    }
175    fn build_capture(tree: &SimplifiedTreeNode, group_num: usize) -> WorkingNFA {
176        let nfa = WorkingNFA::build(tree);
177        let states = nfa.states + 2;
178        let mut epsilons: Vec<_> = nfa.epsilons.into_iter().map(|t| t.with_offset(1)).collect();
179        let transitions: Vec<_> = nfa
180            .transitions
181            .into_iter()
182            .map(|t| t.with_offset(1))
183            .collect();
184        epsilons.push(EpsilonTransition {
185            from: 0,
186            to: 1,
187            special: EpsilonType::StartCapture(group_num),
188        });
189        epsilons.push(EpsilonTransition {
190            from: states - 2,
191            to: states - 1,
192            special: EpsilonType::EndCapture(group_num),
193        });
194        return WorkingNFA {
195            transitions,
196            epsilons,
197            states,
198        };
199    }
200    fn build_concat(nodes: &[SimplifiedTreeNode]) -> WorkingNFA {
201        if nodes.is_empty() {
202            return WorkingNFA {
203                transitions: Vec::new(),
204                epsilons: Vec::new(),
205                states: 1,
206            };
207        }
208
209        let mut states = 1usize;
210        let mut epsilons = vec![EpsilonTransition::new(0, 1)];
211        let mut transitions = vec![];
212
213        for sub in nodes {
214            let nfa = WorkingNFA::build(sub);
215            transitions.extend(nfa.transitions.into_iter().map(|t| t.with_offset(states)));
216            epsilons.extend(nfa.epsilons.into_iter().map(|t| t.with_offset(states)));
217            states += nfa.states;
218            epsilons.push(EpsilonTransition::new(states - 1, states));
219        }
220
221        return WorkingNFA {
222            transitions,
223            epsilons,
224            states: states + 1,
225        };
226    }
227    fn build_repeat(tree: &SimplifiedTreeNode, times: usize) -> WorkingNFA {
228        let mut states = 1usize;
229        let mut epsilons = vec![EpsilonTransition::new(0, 1)];
230        let mut transitions = vec![];
231        let nfa = WorkingNFA::build(tree);
232
233        for _ in 0..times {
234            transitions.extend(nfa.transitions.iter().map(|t| t.add_offset(states)));
235            epsilons.extend(nfa.epsilons.iter().map(|t| t.add_offset(states)));
236            states += nfa.states;
237            epsilons.push(EpsilonTransition::new(states - 1, states));
238        }
239
240        return WorkingNFA {
241            transitions,
242            epsilons,
243            states: states + 1,
244        };
245    }
246    fn build_upto(tree: &SimplifiedTreeNode, times: usize) -> WorkingNFA {
247        let nfa = WorkingNFA::build(tree);
248        let states = 2 + nfa.states * times;
249        let mut used_states = 1usize;
250        let mut transitions = Vec::new();
251        let mut epsilons = vec![
252            EpsilonTransition::new(0, 1),
253            EpsilonTransition::new(0, states - 1),
254        ];
255
256        for _ in 0..times {
257            transitions.extend(nfa.transitions.iter().map(|t| t.add_offset(states)));
258            epsilons.extend(nfa.epsilons.iter().map(|t| t.add_offset(states)));
259            used_states += nfa.states;
260            epsilons.push(EpsilonTransition::new(used_states - 1, used_states));
261            if used_states != states - 1 {
262                epsilons.push(EpsilonTransition::new(used_states - 1, states - 1));
263            }
264        }
265
266        return WorkingNFA {
267            transitions,
268            epsilons,
269            states,
270        };
271    }
272    fn build_star(tree: &SimplifiedTreeNode) -> WorkingNFA {
273        let nfa = WorkingNFA::build(tree);
274        let states = 2 + nfa.states;
275        let transitions: Vec<_> = nfa
276            .transitions
277            .into_iter()
278            .map(|t| t.with_offset(1))
279            .collect();
280        let mut epsilons: Vec<_> = nfa.epsilons.into_iter().map(|t| t.with_offset(1)).collect();
281        epsilons.push(EpsilonTransition::new(0, 1));
282        epsilons.push(EpsilonTransition::new(states - 2, 0));
283        epsilons.push(EpsilonTransition::new(0, states - 1));
284        return WorkingNFA {
285            transitions,
286            epsilons,
287            states,
288        };
289    }
290    fn build_start() -> WorkingNFA {
291        return WorkingNFA {
292            transitions: Vec::new(),
293            epsilons: vec![EpsilonTransition {
294                from: 0,
295                to: 1,
296                special: EpsilonType::StartAnchor,
297            }],
298            states: 2,
299        };
300    }
301    fn build_end() -> WorkingNFA {
302        return WorkingNFA {
303            transitions: Vec::new(),
304            epsilons: vec![EpsilonTransition {
305                from: 0,
306                to: 1,
307                special: EpsilonType::EndAnchor,
308            }],
309            states: 2,
310        };
311    }
312    const fn build_never() -> WorkingNFA {
313        return WorkingNFA {
314            transitions: Vec::new(),
315            epsilons: Vec::new(),
316            states: 2,
317        };
318    }
319    /// Recursively builds an inefficient but valid NFA based loosely on Thompson's Algorithm.
320    ///
321    /// Should be optimized using [`NFA::optimize_pass`]
322    fn build(tree: &SimplifiedTreeNode) -> WorkingNFA {
323        return match tree {
324            SimplifiedTreeNode::Empty => WorkingNFA::build_empty(),
325            SimplifiedTreeNode::Symbol(c) => WorkingNFA::build_symbol(c),
326            SimplifiedTreeNode::Union(nodes) => WorkingNFA::build_union(nodes),
327            SimplifiedTreeNode::Capture(tree, group_num) => {
328                WorkingNFA::build_capture(&tree, *group_num)
329            }
330            SimplifiedTreeNode::Concat(nodes) => WorkingNFA::build_concat(nodes),
331            SimplifiedTreeNode::Repeat(tree, times) => WorkingNFA::build_repeat(tree, *times),
332            SimplifiedTreeNode::UpTo(tree, times) => WorkingNFA::build_upto(tree, *times),
333            SimplifiedTreeNode::Star(tree) => WorkingNFA::build_star(tree),
334            SimplifiedTreeNode::Start => WorkingNFA::build_start(),
335            SimplifiedTreeNode::End => WorkingNFA::build_end(),
336            SimplifiedTreeNode::Never => WorkingNFA::build_never(),
337        };
338    }
339    pub fn new(tree: &SimplifiedTreeNode) -> WorkingNFA {
340        let mut nfa = WorkingNFA::build(tree);
341
342        nfa.clean_start_anchors();
343        nfa.clean_end_anchors();
344
345        // add loops at start and end in case we lack anchors
346        nfa.transitions = nfa.transitions.iter().map(|t| t.add_offset(1)).collect();
347        nfa.epsilons = nfa.epsilons.iter().map(|e| e.add_offset(1)).collect();
348        nfa.states += 2;
349        nfa.epsilons.push(EpsilonTransition::new(0, 1));
350        nfa.transitions.push(WorkingTransition::new(
351            0,
352            0,
353            Atom::NonmatchingList(Vec::new()),
354        ));
355        nfa.epsilons
356            .push(EpsilonTransition::new(nfa.states - 2, nfa.states - 1));
357        nfa.transitions.push(WorkingTransition::new(
358            nfa.states - 1,
359            nfa.states - 1,
360            Atom::NonmatchingList(Vec::new()),
361        ));
362
363        // Then remove redundant transitions from nodes before/after anchors
364        // May include the loops we just added
365        let zero_symbol_states: Vec<bool> =
366            std::iter::zip(nfa.nodes_after_end(), nfa.nodes_before_start())
367                .map(|(a, b)| a || b)
368                .collect();
369        nfa.transitions.retain(|t| !zero_symbol_states[t.from]);
370
371        // Finally, do normal optimization passes
372        while nfa.optimize_pass() {}
373        nfa.remove_unreachable();
374        return nfa;
375    }
376
377    /// Removes start anchors that will never be satisfied
378    /// (basically turning them into a `Never` to allow further optimization)
379    fn clean_start_anchors(&mut self) {
380        let mut zero_len_reachable = vec![false; self.states];
381        zero_len_reachable[0] = true;
382        let mut changed = false;
383        loop {
384            for e in self.epsilons.iter() {
385                if zero_len_reachable[e.from] && !zero_len_reachable[e.to] {
386                    zero_len_reachable[e.to] = true;
387                    changed = true;
388                }
389            }
390            if !changed {
391                break;
392            }
393            changed = false;
394        }
395        self.epsilons = self
396            .epsilons
397            .iter()
398            .filter(|t| t.special != EpsilonType::StartAnchor || zero_len_reachable[t.from])
399            .cloned()
400            .collect();
401    }
402
403    /// Removes end anchors that will never be satisfied
404    /// (basically turning them into a `Never` to allow further optimization)    
405    fn clean_end_anchors(&mut self) {
406        let mut zero_len_reachable = vec![false; self.states];
407        zero_len_reachable[self.states - 1] = true;
408        let mut changed = false;
409        loop {
410            for e in self.epsilons.iter() {
411                if !zero_len_reachable[e.from] && zero_len_reachable[e.to] {
412                    zero_len_reachable[e.from] = true;
413                    changed = true;
414                }
415            }
416            if !changed {
417                break;
418            }
419            changed = false;
420        }
421        self.epsilons = self
422            .epsilons
423            .iter()
424            .filter(|t| t.special != EpsilonType::EndAnchor || zero_len_reachable[t.to])
425            .cloned()
426            .collect();
427    }
428    /// Finds all nodes that are only ever visited after a `$`.
429    fn nodes_after_end(&self) -> Vec<bool> {
430        let mut nodes = vec![true; self.states];
431        nodes[0] = false;
432        let mut changed = false;
433        loop {
434            for e in self.epsilons.iter() {
435                if !nodes[e.from] && nodes[e.to] && e.special != EpsilonType::EndAnchor {
436                    nodes[e.to] = false;
437                    changed = true;
438                }
439            }
440            for t in self.transitions.iter() {
441                if !nodes[t.from] && nodes[t.to] {
442                    nodes[t.to] = false;
443                    changed = true;
444                }
445            }
446            if !changed {
447                break;
448            }
449            changed = false;
450        }
451        return nodes;
452    }
453    /// Finds all nodes that are only ever visited before a `^`.
454    fn nodes_before_start(&self) -> Vec<bool> {
455        let mut nodes = vec![true; self.states];
456        nodes[self.states - 1] = false;
457        let mut changed = false;
458        loop {
459            for e in self.epsilons.iter() {
460                if nodes[e.from] && !nodes[e.to] && e.special != EpsilonType::StartAnchor {
461                    nodes[e.from] = false;
462                    changed = true;
463                }
464            }
465            for t in self.transitions.iter() {
466                if nodes[t.from] && !nodes[t.to] {
467                    nodes[t.from] = false;
468                    changed = true;
469                }
470            }
471            if !changed {
472                break;
473            }
474            changed = false;
475        }
476        return nodes;
477    }
478
479    /// Optimizes the NFA graph
480    ///
481    /// Returns `true` if changes were made (meaning another pass should be tried).
482    fn optimize_pass(&mut self) -> bool {
483        let mut changed = false;
484
485        let mut dead_states = vec![false; self.states];
486
487        // Skip redundant states
488        // Special transitions (anchors + capture groups) are treated similar to non-epsilon transitions
489        for state in 1..self.states - 1 {
490            let incoming: Vec<usize> = self
491                .transitions
492                .iter()
493                .enumerate()
494                .filter(|(_, t)| t.to == state)
495                .map(|(i, _)| i)
496                .collect();
497            let outgoing: Vec<usize> = self
498                .transitions
499                .iter()
500                .enumerate()
501                .filter(|(_, t)| t.from == state)
502                .map(|(i, _)| i)
503                .collect();
504            let incoming_eps: Vec<usize> = self
505                .epsilons
506                .iter()
507                .enumerate()
508                .filter(|(_, t)| t.to == state)
509                .map(|(i, _)| i)
510                .collect();
511            let outgoing_eps: Vec<usize> = self
512                .epsilons
513                .iter()
514                .enumerate()
515                .filter(|(_, t)| t.from == state)
516                .map(|(i, _)| i)
517                .collect();
518
519            match (
520                incoming.as_slice(),
521                incoming_eps.as_slice(),
522                outgoing.as_slice(),
523                outgoing_eps.as_slice(),
524            ) {
525                // `as -xes> b -e> c` can become `as -xes> c` (assuming no other transitions)
526                (incoming, incoming_eps, &[], &[outgoing_eps])
527                    if self.epsilons[outgoing_eps].special == EpsilonType::None =>
528                {
529                    for idx in incoming {
530                        self.transitions[*idx].to = self.epsilons[outgoing_eps].to;
531                    }
532                    for idx in incoming_eps {
533                        self.epsilons[*idx].to = self.epsilons[outgoing_eps].to;
534                    }
535                    dead_states[state] = true;
536                    self.epsilons.swap_remove(outgoing_eps);
537                    changed = true;
538                    continue;
539                }
540                // `a -e> b -xes> cs` can become `a -xes> cs` (assuming no other transitions)
541                (&[], &[incoming_eps], outgoing, outgoing_eps)
542                    if self.epsilons[incoming_eps].special == EpsilonType::None =>
543                {
544                    for idx in outgoing {
545                        self.transitions[*idx].from = self.epsilons[incoming_eps].from;
546                    }
547                    for idx in outgoing_eps {
548                        self.epsilons[*idx].from = self.epsilons[incoming_eps].from;
549                    }
550                    dead_states[state] = true;
551                    self.epsilons.swap_remove(incoming_eps);
552                    changed = true;
553                    continue;
554                }
555                _ => {}
556            }
557
558            // TODO:
559            // `a -e> b -e> a` can combine `a` and `b` (including other transitions)
560            // TODO: might cause additional overhead in some cases, should we do
561            // ??? `a -x> b -es> cs` can become `a -xs> cs`
562            // ??? `as -es> b -x> c` can become `as -xs> c`
563        }
564        if !changed {
565            return changed;
566        }
567
568        let mut new_states = 0;
569        let state_map: Vec<usize> = dead_states
570            .into_iter()
571            .scan(0, |s, dead| {
572                if dead {
573                    return Some(usize::MAX);
574                } else {
575                    let out = *s;
576                    *s += 1;
577                    new_states = *s;
578                    return Some(out);
579                }
580            })
581            .collect();
582        self.states = new_states;
583
584        for t in &mut self.transitions {
585            t.from = state_map[t.from];
586            t.to = state_map[t.to];
587        }
588        for t in &mut self.epsilons {
589            t.from = state_map[t.from];
590            t.to = state_map[t.to];
591        }
592
593        return changed;
594    }
595
596    /// Removes all nodes that cannot be reached or cannot reach the end.
597    ///
598    /// Ignores special epsilon types (so should be called after they have been resolved)
599    fn remove_unreachable(&mut self) {
600        let mut reach_start = vec![false; self.states];
601        reach_start[0] = true;
602        let mut reach_end = vec![false; self.states];
603        reach_end[self.states - 1] = true;
604
605        let mut changed = false;
606        loop {
607            for e in self.epsilons.iter() {
608                if reach_start[e.from] && !reach_start[e.to] {
609                    reach_start[e.to] = true;
610                    changed = true;
611                }
612                if !reach_end[e.from] && reach_end[e.to] {
613                    reach_end[e.from] = true;
614                    changed = true;
615                }
616            }
617            for t in self.transitions.iter() {
618                if reach_start[t.from] && !reach_start[t.to] {
619                    reach_start[t.to] = true;
620                    changed = true;
621                }
622                if !reach_end[t.from] && reach_end[t.to] {
623                    reach_end[t.from] = true;
624                    changed = true;
625                }
626            }
627            if !changed {
628                break;
629            }
630            changed = false;
631        }
632
633        // Remove transitions that involve redundant states
634        self.transitions = self
635            .transitions
636            .iter()
637            .filter(|t| reach_start[t.from] && reach_end[t.to])
638            .cloned()
639            .collect();
640        self.epsilons = self
641            .epsilons
642            .iter()
643            .filter(|e| reach_start[e.from] && reach_end[e.to])
644            .cloned()
645            .collect();
646
647        // Then remove the states
648        let mut new_states = 0;
649        let state_map: Vec<usize> = std::iter::zip(reach_start.into_iter(), reach_end.into_iter())
650            .map(|(a, b)| !a || !b)
651            .scan(0, |s, dead| {
652                if dead {
653                    return Some(usize::MAX);
654                } else {
655                    let out = *s;
656                    *s += 1;
657                    new_states = *s;
658                    return Some(out);
659                }
660            })
661            .collect();
662        self.states = new_states;
663
664        for t in &mut self.transitions {
665            t.from = state_map[t.from];
666            t.to = state_map[t.to];
667        }
668        for t in &mut self.epsilons {
669            t.from = state_map[t.from];
670            t.to = state_map[t.to];
671        }
672    }
673
674    /// Writes a LaTeX TikZ representation to visualize the graph.
675    ///
676    /// If `include_doc` is `true`, will include the headers.
677    /// Otherwise, you should include `\usepackage{tikz}` and `\usetikzlibrary{automata, positioning}`.
678    pub fn to_tikz(&self, include_doc: bool) -> String {
679        // TODO: make the layout better
680        fn escape_latex(text: String) -> String {
681            return text
682                .chars()
683                .map(|c| match c {
684                    '\\' => r"{\textbackslash}".to_string(),
685                    '&' => r"\&".to_string(),
686                    '%' => r"\%".to_string(),
687                    '$' => r"\$".to_string(),
688                    '#' => r"\#".to_string(),
689                    '_' => r"\_".to_string(),
690                    '{' => r"\{".to_string(),
691                    '}' => r"\}".to_string(),
692                    '~' => r"{\textasciitilde}".to_string(),
693                    '^' => r"{\textasciicircum}".to_string(),
694                    c => c.to_string(),
695                })
696                .collect();
697        }
698
699        let mut text_parts: Vec<String> = Vec::new();
700        if include_doc {
701            text_parts.push(
702                "\\documentclass{standalone}\n\\usepackage{tikz}\n\\usetikzlibrary{automata, positioning}\n\\begin{document}\n"
703                .into(),
704            );
705        }
706        text_parts.push("\\begin{tikzpicture}[node distance=2cm, auto]\n".into());
707
708        text_parts.push("\\node[state, initial](q0){$q_0$};\n".into());
709        for i in 1..self.states - 1 {
710            text_parts.push(format!(
711                "\\node[state, right of=q{}](q{}){{$q_{{{}}}$}};\n",
712                i - 1,
713                i,
714                i,
715            ));
716        }
717        text_parts.push(format!(
718            "\\node[state, accepting, right of=q{}](q{}){{$q_{{{}}}$}};\n",
719            self.states - 2,
720            self.states - 1,
721            self.states - 1,
722        ));
723
724        for WorkingTransition { from, to, symbol } in &self.transitions {
725            let bend = match to.cmp(from) {
726                std::cmp::Ordering::Less => "[bend left] ",
727                std::cmp::Ordering::Equal => "[loop below]",
728                std::cmp::Ordering::Greater => "[bend left] ",
729            };
730            text_parts.push(format!(
731                "\\path[->] (q{from}) edge {bend} node {{{}}} (q{to});\n",
732                escape_latex(symbol.to_string()),
733            ));
734        }
735        for EpsilonTransition { from, to, special } in &self.epsilons {
736            let bend = match to.cmp(from) {
737                std::cmp::Ordering::Less => "[bend left] ",
738                std::cmp::Ordering::Equal => "[loop below]",
739                std::cmp::Ordering::Greater => "[bend left] ",
740            };
741            let label = match special {
742                EpsilonType::None => r"$\epsilon$".to_string(),
743                EpsilonType::StartAnchor => r"{\textasciicircum}".to_string(),
744                EpsilonType::EndAnchor => r"\$".to_string(),
745                EpsilonType::StartCapture(group) => format!("{group}("),
746                EpsilonType::EndCapture(group) => format!("){group}"),
747            };
748            text_parts.push(format!(
749                "\\path[->] (q{from}) edge {bend} node {{{label}}} (q{to});\n"
750            ));
751        }
752
753        text_parts.push("\\end{tikzpicture}\n".into());
754        if include_doc {
755            text_parts.push("\\end{document}\n".into());
756        }
757        return text_parts.into_iter().collect();
758    }
759
760    /// Using the classical NFA algorithm to do a simple boolean test on a string.
761    pub fn test(&self, text: &str) -> bool {
762        let mut list = vec![false; self.states];
763        let mut new_list = vec![false; self.states];
764        list[0] = true;
765
766        // Adds all states reachable by epsilon transitions
767        let propogate_epsilon = |list: &mut Vec<bool>, idx: usize| loop {
768            let mut has_new = false;
769            for EpsilonTransition { from, to, special } in &self.epsilons {
770                if list[*from]
771                    && !list[*to]
772                    && (match special {
773                        EpsilonType::StartAnchor => idx == 0,
774                        EpsilonType::EndAnchor => idx == text.len(),
775                        _ => true,
776                    })
777                {
778                    list[*to] = true;
779                    has_new = true;
780                }
781            }
782            if !has_new {
783                break;
784            }
785        };
786
787        for (i, c) in text.char_indices() {
788            propogate_epsilon(&mut list, i);
789            for WorkingTransition { from, to, symbol } in &self.transitions {
790                if list[*from] && symbol.check(c) {
791                    new_list[*to] = true;
792                }
793            }
794            let tmp = list;
795            list = new_list;
796            new_list = tmp;
797            new_list.fill(false);
798        }
799        propogate_epsilon(&mut list, text.len());
800        return *list.last().unwrap_or(&false);
801    }
802}
803impl std::fmt::Display for WorkingNFA {
804    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
805        writeln!(f, "{} states:", self.states)?;
806        for t in &self.transitions {
807            writeln!(f, "  {t}")?;
808        }
809        return Ok(());
810    }
811}
812
813#[cfg(test)]
814mod tests {
815    use super::*;
816    use crate::parse_tree::ERE;
817
818    #[test]
819    fn abbc_raw() {
820        let nfa = WorkingNFA {
821            transitions: vec![
822                WorkingTransition::new(0, 1, 'a'.into()),
823                WorkingTransition::new(1, 2, 'b'.into()),
824                WorkingTransition::new(2, 3, 'c'.into()),
825            ],
826            epsilons: vec![EpsilonTransition::new(2, 1)],
827            states: 4,
828        };
829        // println!("{}", nfa.to_tikz(true));
830
831        assert!(nfa.test("abc"));
832        assert!(nfa.test("abbc"));
833        assert!(nfa.test("abbbc"));
834        assert!(nfa.test("abbbbc"));
835
836        assert!(!nfa.test("ac"));
837        assert!(!nfa.test("abcc"));
838        assert!(!nfa.test("bac"));
839        assert!(!nfa.test("acb"));
840    }
841
842    #[test]
843    fn phone_number() {
844        let ere = ERE::parse_str(r"^(\+1 )?[0-9]{3}-[0-9]{3}-[0-9]{4}$").unwrap();
845        let (tree, capture_groups) = SimplifiedTreeNode::from_ere(&ere);
846        assert_eq!(capture_groups, 2);
847        let nfa = WorkingNFA::new(&tree);
848        // println!("{}", nfa.to_tikz(true));
849
850        assert!(nfa.test("012-345-6789"));
851        assert!(nfa.test("987-654-3210"));
852        assert!(nfa.test("+1 555-555-5555"));
853        assert!(nfa.test("123-555-9876"));
854
855        assert!(!nfa.test("abcd"));
856        assert!(!nfa.test("0123456789"));
857        assert!(!nfa.test("012--345-6789"));
858        assert!(!nfa.test("(555) 555-5555"));
859        assert!(!nfa.test("1 555-555-5555"));
860    }
861
862    #[test]
863    fn double_loop() {
864        let ere = ERE::parse_str(r"^.*(.*)*$").unwrap();
865        let (tree, capture_groups) = SimplifiedTreeNode::from_ere(&ere);
866        assert_eq!(capture_groups, 2);
867        let nfa = WorkingNFA::new(&tree);
868        // println!("{}", nfa.to_tikz(true));
869
870        assert!(nfa.test(""));
871        assert!(nfa.test("asdf"));
872        assert!(nfa.test("1234567"));
873        assert!(nfa.test("0"));
874
875        assert!(!nfa.test("\0"));
876    }
877
878    #[test]
879    fn good_anchored_start() {
880        let ere = ERE::parse_str(r"^a|b*^c|d^|n").unwrap();
881        let (tree, capture_groups) = SimplifiedTreeNode::from_ere(&ere);
882        assert_eq!(capture_groups, 1);
883        let nfa = WorkingNFA::new(&tree);
884        // println!("{}", nfa.to_tikz(true));
885
886        assert!(nfa.test("a"));
887        assert!(nfa.test("c"));
888        assert!(nfa.test("cq"));
889        assert!(nfa.test("wwwnwww"));
890
891        assert!(!nfa.test(""));
892        assert!(!nfa.test("qb"));
893        assert!(!nfa.test("qc"));
894        assert!(!nfa.test("b"));
895        assert!(!nfa.test("bc"));
896        assert!(!nfa.test("bbbbbbc"));
897        assert!(!nfa.test("d"));
898    }
899
900    #[test]
901    fn good_anchored_end() {
902        let ere = ERE::parse_str(r"a$|b$c*|$d|n").unwrap();
903        let (tree, capture_groups) = SimplifiedTreeNode::from_ere(&ere);
904        assert_eq!(capture_groups, 1);
905        let nfa = WorkingNFA::new(&tree);
906        println!("{}", nfa.to_tikz(true));
907
908        assert!(nfa.test("a"));
909        assert!(nfa.test("b"));
910        assert!(nfa.test("qb"));
911        assert!(nfa.test("wwwnwww"));
912
913        assert!(!nfa.test(""));
914        assert!(!nfa.test("bq"));
915        assert!(!nfa.test("qc"));
916        assert!(!nfa.test("c"));
917        assert!(!nfa.test("bc"));
918        assert!(!nfa.test("bcccccc"));
919        assert!(!nfa.test("d"));
920    }
921}
ere_core/working_nfa.rs

ere_core/
working_nfa.rs