brzozowski_regex/
automaton.rs

1// Copyright 2024 Hendrik van Antwerpen
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! Finite automaton based matching from regular expression.
16
17use std::borrow::Borrow;
18use std::borrow::Cow;
19use std::collections::HashMap;
20use std::collections::HashSet;
21use std::collections::VecDeque;
22
23use crate::builder::ApproximatelySimilarCanonical;
24use crate::builder::Regex;
25use crate::derivation::Symbols;
26use crate::Alphabet;
27
28/// Finite automaton over the given alphabet.
29#[derive(Clone)]
30pub struct FiniteAutomaton<S: Alphabet> {
31    states: Vec<State<S>>,
32}
33
34#[derive(Clone)]
35struct State<S: Alphabet> {
36    regex: Regex<ApproximatelySimilarCanonical<S>>,
37    transitions: HashMap<S, u16>,
38    default_transition: u16,
39    acceptance: Acceptance,
40}
41
42/// Type describing acceptance of a matched sequence of symbols.
43#[derive(Clone, Copy, Debug, Eq, PartialEq)]
44pub enum Acceptance {
45    /// The sequence is rejected, regardless of any subsequent symbols.
46    Reject,
47    /// The sequence is not accepted, but subsequent symbols may result in accept.
48    MayAccept,
49    /// The current sequence is accepted.
50    Accept,
51}
52
53impl Acceptance {
54    fn from_nullable(is_nullable: bool) -> Self {
55        if is_nullable {
56            Self::Accept
57        } else {
58            Self::Reject
59        }
60    }
61}
62
63impl<S: Alphabet> Regex<ApproximatelySimilarCanonical<S>> {
64    /// Returns a finite automaton for this regular expression.
65    pub fn to_automaton(&self) -> FiniteAutomaton<S> {
66        let mut symbols = HashSet::new();
67        self.collect_symbols(&mut symbols);
68        let default_symbols = Symbols::Exclude(symbols.clone());
69
70        let mut regexes: HashMap<Self, u16> = HashMap::new();
71        let mut states = Vec::new();
72        let mut reverse_transitions = Vec::new();
73
74        let mut queue = VecDeque::new();
75        fn get_or_insert<S: Alphabet>(
76            regex: Regex<ApproximatelySimilarCanonical<S>>,
77            queue: &mut VecDeque<Regex<ApproximatelySimilarCanonical<S>>>,
78            regexes: &mut HashMap<Regex<ApproximatelySimilarCanonical<S>>, u16>,
79            reverse_transitions: &mut Vec<Vec<u16>>,
80        ) -> u16 {
81            if let Some(idx) = regexes.get(&regex) {
82                *idx
83            } else {
84                let idx = regexes.len() as u16;
85                regexes.insert(regex.clone(), idx);
86                queue.push_back(regex);
87                reverse_transitions.push(Vec::new());
88                idx
89            }
90        }
91
92        get_or_insert(
93            self.clone(),
94            &mut queue,
95            &mut regexes,
96            &mut reverse_transitions,
97        );
98        let mut reverse_queue = VecDeque::new();
99        while let Some(regex) = queue.pop_front() {
100            let current_idx = states.len() as u16;
101            let accepting = Acceptance::from_nullable(regex.is_nullable());
102            if accepting == Acceptance::Accept {
103                reverse_queue.push_front(states.len() as u16);
104            }
105            let mut transitions = HashMap::default();
106            for symbol in &symbols {
107                let next = regex.derive_symbols(&Symbols::include([symbol.clone()]));
108                let next_idx =
109                    get_or_insert(next, &mut queue, &mut regexes, &mut reverse_transitions);
110                reverse_transitions[next_idx as usize].push(current_idx);
111                transitions.insert(symbol.clone(), next_idx);
112            }
113            let default_transition = {
114                let next = regex.derive_symbols(&default_symbols);
115                let next_idx =
116                    get_or_insert(next, &mut queue, &mut regexes, &mut reverse_transitions);
117                reverse_transitions[next_idx as usize].push(current_idx);
118                next_idx
119            };
120            states.push(State {
121                regex,
122                transitions,
123                default_transition,
124                acceptance: accepting,
125            });
126        }
127
128        while let Some(next_idx) = reverse_queue.pop_front() {
129            for prev_idx in &reverse_transitions[next_idx as usize] {
130                let prev = &mut states[*prev_idx as usize];
131                if prev.acceptance == Acceptance::Reject {
132                    prev.acceptance = Acceptance::MayAccept;
133                    reverse_queue.push_front(*prev_idx);
134                }
135            }
136        }
137
138        FiniteAutomaton { states }
139    }
140
141    fn collect_symbols(&self, symbols: &mut HashSet<S>) {
142        match self {
143            Regex::EmptySet => {}
144            Regex::EmptyString => {}
145            Regex::Symbol(symbol) => {
146                symbols.insert(symbol.clone());
147            }
148            Regex::Concat(left, right) => {
149                left.collect_symbols(symbols);
150                right.collect_symbols(symbols);
151            }
152            Regex::Closure(inner) => inner.collect_symbols(symbols),
153            Regex::Or(left, right) => {
154                left.collect_symbols(symbols);
155                right.collect_symbols(symbols);
156            }
157            Regex::And(left, right) => {
158                left.collect_symbols(symbols);
159                right.collect_symbols(symbols);
160            }
161            Regex::Complement(inner) => inner.collect_symbols(symbols),
162        }
163    }
164}
165
166impl<S: Alphabet> FiniteAutomaton<S> {
167    pub fn to_matcher(&self) -> Matcher<'_, S> {
168        Matcher {
169            fa: Cow::Borrowed(self),
170            state: 0,
171        }
172    }
173
174    pub fn into_matcher(self) -> Matcher<'static, S> {
175        Matcher {
176            fa: Cow::Owned(self),
177            state: 0,
178        }
179    }
180
181    pub fn matches<I>(&self, symbols: impl IntoIterator<Item = I>) -> bool
182    where
183        I: Borrow<S>,
184    {
185        self.to_matcher().next_iter(symbols) == Acceptance::Accept
186    }
187
188    #[inline]
189    fn state(&self, idx: u16) -> &State<S> {
190        &self.states[idx as usize]
191    }
192}
193
194impl<S: Alphabet> State<S> {
195    fn next(&self, symbol: &S) -> u16 {
196        self.transitions
197            .get(symbol)
198            .copied()
199            .unwrap_or(self.default_transition)
200    }
201}
202
203/// Type implementing incremental matching against a finite automaton.
204pub struct Matcher<'a, S: Alphabet> {
205    fa: Cow<'a, FiniteAutomaton<S>>,
206    state: u16,
207}
208
209impl<'a, S: Alphabet> Matcher<'a, S> {
210    pub fn next(&mut self, symbol: &S) -> Acceptance {
211        self.state = self.state().next(symbol);
212        self.fa.state(self.state).acceptance
213    }
214
215    pub fn next_iter<I>(&mut self, symbols: impl IntoIterator<Item = I>) -> Acceptance
216    where
217        I: Borrow<S>,
218    {
219        for symbol in symbols {
220            self.state = self.state().next(symbol.borrow());
221        }
222        self.state().acceptance
223    }
224
225    #[inline]
226    pub fn regex(&self) -> &Regex<ApproximatelySimilarCanonical<S>> {
227        &self.state().regex
228    }
229
230    #[inline]
231    fn state(&self) -> &State<S> {
232        self.fa.state(self.state)
233    }
234}
235
236#[cfg(test)]
237mod tests {
238    use itertools::Itertools;
239
240    use crate::builder::ApproximatelySimilarCanonical;
241    use crate::builder::Regex;
242    use crate::ops::*;
243
244    use super::*;
245
246    #[test]
247    fn test_matches() {
248        let tests: Vec<(
249            Regex<ApproximatelySimilarCanonical<usize>>,
250            Vec<_>,
251            Acceptance,
252        )> = vec![
253            ((().r()), vec![], Acceptance::Reject),
254            (([].r()), vec![], Acceptance::Accept),
255            (42.s(), vec![42], Acceptance::Accept),
256            (42.s(), vec![42, 42], Acceptance::Reject), // FIXME
257            (42.s(), vec![11], Acceptance::Reject),     // FIXME
258            ((![42.s()].r()), vec![], Acceptance::Accept),
259            (([42.s()].r()), vec![42], Acceptance::Accept),
260            (
261                ([42.s(), (11.s() | 7.s())].r()),
262                vec![42],
263                Acceptance::MayAccept,
264            ),
265            (
266                ([42.s(), (11.s() | 7.s())].r()),
267                vec![42, 11],
268                Acceptance::Accept,
269            ),
270            (
271                ([42.s(), (11.s() | 7.s())].r()),
272                vec![42, 7],
273                Acceptance::Accept,
274            ),
275            ((42.s().c()), vec![], Acceptance::Accept),
276            ((42.s().c()), vec![42], Acceptance::Accept),
277            ((42.s().c()), vec![42, 42, 42], Acceptance::Accept),
278            ((42.s().c()), vec![42, 11], Acceptance::Reject),
279            ((42.s() & [].r()), vec![42], Acceptance::Reject),
280            ((42.s() & 42.s().c()), vec![42], Acceptance::Accept),
281            ((42.s() & 42.s().c()), vec![42, 42], Acceptance::Reject),
282            ((42.s() | 11.s()), vec![42], Acceptance::Accept),
283            ((42.s() | 11.s()), vec![11], Acceptance::Accept),
284            ((42.s() | 11.s()), vec![11, 42], Acceptance::Reject),
285            ((42.s() | 11.s().c()), vec![42], Acceptance::Accept),
286            ((42.s() | 11.s().c()), vec![11], Acceptance::Accept),
287            ((42.s() | 11.s().c()), vec![11, 11], Acceptance::Accept),
288            ((42.s() | 11.s().c()), vec![42, 11], Acceptance::Reject),
289            ((!().r()), vec![11], Acceptance::Accept),
290            ((!11.s()), vec![42], Acceptance::Accept),
291            ((!11.s()), vec![11], Acceptance::MayAccept),
292        ];
293        for test in tests {
294            assert_eq!(
295                test.2,
296                test.0.to_automaton().into_matcher().next_iter(&test.1),
297                "expected {:?} matching {} with [{}]",
298                test.2,
299                test.0,
300                test.1.iter().join(", ")
301            );
302        }
303    }
304}