scopegraphs_regular_expressions/
compile.rs1use crate::regex::Symbol;
2use crate::Regex;
3#[cfg(feature = "dynamic")]
4use quote::quote;
5use std::collections::{HashMap, HashSet, VecDeque};
6use std::rc::Rc;
7
8pub type StateID = usize;
9
10#[derive(Clone)]
11pub(crate) struct MatchState {
12 pub is_final: bool,
13 nullable: bool,
14 pub transition_table: HashMap<Rc<Symbol>, StateID>,
15 #[cfg(feature = "dynamic")]
16 pub string_transition_table: HashMap<String, StateID>,
17 pub default_transition: StateID,
18 pub regex: Rc<Regex>,
19}
20
21impl MatchState {
22 pub fn is_accepting(&self) -> bool {
23 self.nullable
24 }
25}
26
27#[derive(Clone)]
39pub struct Automaton {
40 pub(crate) states: Vec<MatchState>,
41 pub(crate) initial: StateID,
42}
43
44pub struct AlphabetOrder {
45 order: HashMap<Rc<Symbol>, i64>,
46}
47
48impl AlphabetOrder {
49 pub fn new(alphabet: &HashSet<Rc<Symbol>>) -> Self {
50 Self {
51 order: alphabet.iter().cloned().zip(0..).collect(),
52 }
53 }
54
55 pub fn get(&self, symbol: &Rc<Symbol>) -> i64 {
56 *self.order.get(symbol).expect("not in alphabet")
57 }
58}
59
60type State = Rc<Regex>;
61
62struct RegexCompiler {
63 alphabet: HashSet<Rc<Symbol>>,
64 regex: Rc<Regex>,
65 state_transitions: HashMap<State, HashMap<Rc<Symbol>, State>>,
66 default_transitions: HashMap<State, State>,
67 reverse_transitions: HashMap<State, HashSet<State>>,
68 alphabet_order: AlphabetOrder,
69}
70
71impl RegexCompiler {
72 fn new(regex: Regex) -> Self {
73 let alphabet = regex.alphabet();
74
75 Self {
76 alphabet_order: AlphabetOrder::new(&alphabet),
77 alphabet,
78 regex: Rc::new(regex),
79 state_transitions: Default::default(),
80 default_transitions: Default::default(),
81 reverse_transitions: Default::default(),
82 }
83 }
84
85 fn create_transitions_step(&mut self, state: State, work_list: &mut VecDeque<State>) {
86 if self.state_transitions.contains_key(&state) {
88 return;
89 }
90
91 let mut transitions = HashMap::new();
92
93 for symbol in &self.alphabet {
95 let next_state = state.derive(Some(symbol)).normalize(&self.alphabet_order);
96
97 self.reverse_transitions
99 .entry(next_state.clone())
100 .or_default()
101 .insert(state.clone());
102 transitions.insert(symbol.clone(), next_state.clone());
104
105 work_list.push_back(next_state);
107 }
108
109 self.step_default_state(&state, work_list);
111
112 self.state_transitions.insert(state, transitions);
113 }
114
115 fn step_default_state(&mut self, state: &State, work_list: &mut VecDeque<State>) {
116 let default_state = state.derive(None).normalize(&self.alphabet_order);
117
118 self.reverse_transitions
120 .entry(default_state.clone())
121 .or_default()
122 .insert(state.clone());
123
124 self.default_transitions
126 .insert(state.clone(), default_state.clone());
127
128 work_list.push_back(default_state);
130 }
131
132 fn create_transitions(&mut self) {
133 let mut work_list = VecDeque::new();
134
135 work_list.push_back(self.regex.clone());
136 work_list.push_back(Rc::new(Regex::EmptySet));
137
138 while let Some(state) = work_list.pop_front() {
139 self.create_transitions_step(state, &mut work_list);
140 }
141 }
142
143 fn find_non_final(&self) -> HashSet<&State> {
144 let mut work_list = VecDeque::new();
145 for state in self.state_transitions.keys() {
146 if state.is_nullable() {
147 work_list.push_back(state)
148 }
149 }
150
151 let mut visited = HashSet::new();
152 let mut non_final = HashSet::new();
153 while let Some(state) = work_list.pop_front() {
154 if visited.insert(state) {
155 for next_state in self.reverse_transitions.get(state).into_iter().flatten() {
156 non_final.insert(next_state);
157 work_list.push_back(next_state);
158 }
159 }
160 }
161 non_final
162 }
163
164 fn find_state_ids(&self) -> HashMap<&State, StateID> {
165 let mut state_ids = HashMap::new();
166 for (state_id_counter, state) in self.state_transitions.keys().enumerate() {
167 state_ids.insert(state, state_id_counter);
168 }
169 state_ids
170 }
171
172 fn compile(mut self) -> Automaton {
173 self.create_transitions();
174
175 let non_final = self.find_non_final();
176 let state_ids = self.find_state_ids();
177
178 let mut match_states = HashMap::new();
179 for (state, edges) in &self.state_transitions {
180 let transition_table: HashMap<_, _> = edges
181 .iter()
182 .map(|(k, v)| (k.clone(), *state_ids.get(v).unwrap()))
183 .collect();
184
185 let non_final = non_final.contains(state);
186 let nullable = state.is_nullable();
187
188 match_states.insert(
189 *state_ids.get(state).unwrap(),
190 MatchState {
191 is_final: !non_final,
192 nullable,
193 #[cfg(feature = "dynamic")]
194 string_transition_table: transition_table
195 .iter()
196 .map(|(label, dst)| {
197 let path = &label.name;
198 (format!("{}", quote!(#path)), *dst)
199 })
200 .collect(),
201 transition_table,
202 default_transition: *state_ids
203 .get(self.default_transitions.get(state).unwrap())
204 .unwrap(),
205 regex: state.clone(),
206 },
207 );
208 }
209
210 let mut match_states = match_states.into_iter().collect::<Vec<_>>();
211 match_states.sort_unstable_by_key(|x| x.0);
212
213 #[cfg(debug_assertions)]
214 if !match_states.is_empty() {
215 let mut curr = match_states[0].0;
216 assert_eq!(curr, 0);
217 for &(i, _) in match_states.iter().skip(1) {
218 assert_eq!(i, curr + 1);
219 curr = i;
220 }
221 }
222
223 let compiled = Automaton {
224 initial: *state_ids.get(&self.regex).unwrap(),
225 states: match_states.into_iter().map(|i| i.1).collect(),
226 };
227
228 compiled
229 }
230}
231
232impl Regex {
233 pub fn compile(self) -> Automaton {
239 let compiler = RegexCompiler::new(self);
240 compiler.compile()
241 }
242}