scopegraphs_regular_expressions/
compile.rs

1use crate::regex::Symbol;
2use crate::Regex;
3#[cfg(feature = "dynamic")]
4use quote::quote;
5use std::collections::{HashMap, HashSet, VecDeque};
6use std::rc::Rc;
7
8pub type StateID = usize;
9
10#[derive(Clone)]
11pub(crate) struct MatchState {
12    pub is_final: bool,
13    nullable: bool,
14    pub transition_table: HashMap<Rc<Symbol>, StateID>,
15    #[cfg(feature = "dynamic")]
16    pub string_transition_table: HashMap<String, StateID>,
17    pub default_transition: StateID,
18    pub regex: Rc<Regex>,
19}
20
21impl MatchState {
22    pub fn is_accepting(&self) -> bool {
23        self.nullable
24    }
25}
26
27/// A regex automaton is a compiled regular expression.
28///
29/// It represents the state machine corresponding to the [`Regex`].
30/// This struct can either be turned into a
31/// * [`DynamicMatcher`](crate::dynamic::DynamicMatcher) to match on a regex that was compiled at runtime.
32/// * An implementation of [`RegexMatcher`](crate::RegexMatcher) generated using [emit](Automaton::emit).
33///
34/// This function can be called at compile time (through the `compile_regex!` macro) and it
35/// emits the Rust code that can match the regular expression.
36///
37/// You can create an automaton using [`Regex::compile`]
38#[derive(Clone)]
39pub struct Automaton {
40    pub(crate) states: Vec<MatchState>,
41    pub(crate) initial: StateID,
42}
43
44pub struct AlphabetOrder {
45    order: HashMap<Rc<Symbol>, i64>,
46}
47
48impl AlphabetOrder {
49    pub fn new(alphabet: &HashSet<Rc<Symbol>>) -> Self {
50        Self {
51            order: alphabet.iter().cloned().zip(0..).collect(),
52        }
53    }
54
55    pub fn get(&self, symbol: &Rc<Symbol>) -> i64 {
56        *self.order.get(symbol).expect("not in alphabet")
57    }
58}
59
60type State = Rc<Regex>;
61
62struct RegexCompiler {
63    alphabet: HashSet<Rc<Symbol>>,
64    regex: Rc<Regex>,
65    state_transitions: HashMap<State, HashMap<Rc<Symbol>, State>>,
66    default_transitions: HashMap<State, State>,
67    reverse_transitions: HashMap<State, HashSet<State>>,
68    alphabet_order: AlphabetOrder,
69}
70
71impl RegexCompiler {
72    fn new(regex: Regex) -> Self {
73        let alphabet = regex.alphabet();
74
75        Self {
76            alphabet_order: AlphabetOrder::new(&alphabet),
77            alphabet,
78            regex: Rc::new(regex),
79            state_transitions: Default::default(),
80            default_transitions: Default::default(),
81            reverse_transitions: Default::default(),
82        }
83    }
84
85    fn create_transitions_step(&mut self, state: State, work_list: &mut VecDeque<State>) {
86        // if we've already seen this state, we are done instantly
87        if self.state_transitions.contains_key(&state) {
88            return;
89        }
90
91        let mut transitions = HashMap::new();
92
93        // find all reachable states from this state
94        for symbol in &self.alphabet {
95            let next_state = state.derive(Some(symbol)).normalize(&self.alphabet_order);
96
97            // put it in reverse states
98            self.reverse_transitions
99                .entry(next_state.clone())
100                .or_default()
101                .insert(state.clone());
102            // add the transition
103            transitions.insert(symbol.clone(), next_state.clone());
104
105            // add to the work list
106            work_list.push_back(next_state);
107        }
108
109        // now do exactly the same we did but for the default
110        self.step_default_state(&state, work_list);
111
112        self.state_transitions.insert(state, transitions);
113    }
114
115    fn step_default_state(&mut self, state: &State, work_list: &mut VecDeque<State>) {
116        let default_state = state.derive(None).normalize(&self.alphabet_order);
117
118        // put it in reverse states
119        self.reverse_transitions
120            .entry(default_state.clone())
121            .or_default()
122            .insert(state.clone());
123
124        // add the transition (default)
125        self.default_transitions
126            .insert(state.clone(), default_state.clone());
127
128        // add to the work list
129        work_list.push_back(default_state);
130    }
131
132    fn create_transitions(&mut self) {
133        let mut work_list = VecDeque::new();
134
135        work_list.push_back(self.regex.clone());
136        work_list.push_back(Rc::new(Regex::EmptySet));
137
138        while let Some(state) = work_list.pop_front() {
139            self.create_transitions_step(state, &mut work_list);
140        }
141    }
142
143    fn find_non_final(&self) -> HashSet<&State> {
144        let mut work_list = VecDeque::new();
145        for state in self.state_transitions.keys() {
146            if state.is_nullable() {
147                work_list.push_back(state)
148            }
149        }
150
151        let mut visited = HashSet::new();
152        let mut non_final = HashSet::new();
153        while let Some(state) = work_list.pop_front() {
154            if visited.insert(state) {
155                for next_state in self.reverse_transitions.get(state).into_iter().flatten() {
156                    non_final.insert(next_state);
157                    work_list.push_back(next_state);
158                }
159            }
160        }
161        non_final
162    }
163
164    fn find_state_ids(&self) -> HashMap<&State, StateID> {
165        let mut state_ids = HashMap::new();
166        for (state_id_counter, state) in self.state_transitions.keys().enumerate() {
167            state_ids.insert(state, state_id_counter);
168        }
169        state_ids
170    }
171
172    fn compile(mut self) -> Automaton {
173        self.create_transitions();
174
175        let non_final = self.find_non_final();
176        let state_ids = self.find_state_ids();
177
178        let mut match_states = HashMap::new();
179        for (state, edges) in &self.state_transitions {
180            let transition_table: HashMap<_, _> = edges
181                .iter()
182                .map(|(k, v)| (k.clone(), *state_ids.get(v).unwrap()))
183                .collect();
184
185            let non_final = non_final.contains(state);
186            let nullable = state.is_nullable();
187
188            match_states.insert(
189                *state_ids.get(state).unwrap(),
190                MatchState {
191                    is_final: !non_final,
192                    nullable,
193                    #[cfg(feature = "dynamic")]
194                    string_transition_table: transition_table
195                        .iter()
196                        .map(|(label, dst)| {
197                            let path = &label.name;
198                            (format!("{}", quote!(#path)), *dst)
199                        })
200                        .collect(),
201                    transition_table,
202                    default_transition: *state_ids
203                        .get(self.default_transitions.get(state).unwrap())
204                        .unwrap(),
205                    regex: state.clone(),
206                },
207            );
208        }
209
210        let mut match_states = match_states.into_iter().collect::<Vec<_>>();
211        match_states.sort_unstable_by_key(|x| x.0);
212
213        #[cfg(debug_assertions)]
214        if !match_states.is_empty() {
215            let mut curr = match_states[0].0;
216            assert_eq!(curr, 0);
217            for &(i, _) in match_states.iter().skip(1) {
218                assert_eq!(i, curr + 1);
219                curr = i;
220            }
221        }
222
223        let compiled = Automaton {
224            initial: *state_ids.get(&self.regex).unwrap(),
225            states: match_states.into_iter().map(|i| i.1).collect(),
226        };
227
228        compiled
229    }
230}
231
232impl Regex {
233    /// Turn a [`Regex`] into an [`Automaton`].
234    ///
235    /// A `Regex` represents only the syntax of a regular expression,
236    /// while a compiled regex, an `Automaton` is more like a state machine.
237    /// It can directly be used to actually match on the regular expression.
238    pub fn compile(self) -> Automaton {
239        let compiler = RegexCompiler::new(self);
240        compiler.compile()
241    }
242}