brzozowski_regex/builder/
similarity.rs

1// Copyright 2024 Hendrik van Antwerpen
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! A builder implementation that produces regular expressions in approximately-similar canonical form.
16
17use std::cmp::Ordering;
18use std::marker::PhantomData;
19
20use itertools::Itertools;
21
22use crate::builder::Builder;
23use crate::builder::Regex;
24use crate::Alphabet;
25
26/// A regular expression builder that keeps the structure in approximately-similar
27/// canonical form.
28#[derive(Clone, Debug, Hash, PartialEq, Eq)]
29pub struct ApproximatelySimilarCanonical<S: Alphabet> {
30    _phantom: PhantomData<S>,
31}
32
33impl<S: Alphabet> Builder for ApproximatelySimilarCanonical<S> {
34    type Symbol = S;
35
36    #[inline]
37    fn empty_set() -> Regex<Self> {
38        Regex::EmptySet
39    }
40
41    #[inline]
42    fn empty_string() -> Regex<Self> {
43        Regex::EmptyString
44    }
45
46    #[inline]
47    fn symbol(value: Self::Symbol) -> Regex<Self> {
48        Regex::Symbol(value)
49    }
50
51    fn closure(inner: Regex<Self>) -> Regex<Self> {
52        match inner {
53            // ()* --> e
54            Regex::EmptySet => Self::empty_string(),
55            // e* --> e
56            Regex::EmptyString => Self::empty_string(),
57            // e** --> e*
58            Regex::Closure(inner) => Regex::Closure(inner),
59            // (build)
60            inner => Regex::Closure(inner.into()),
61        }
62    }
63
64    fn concat(left: Regex<Self>, right: Regex<Self>) -> Regex<Self> {
65        match (left, right) {
66            // 0 R --> 0
67            (Regex::EmptySet, _) | (_, Regex::EmptySet) => Self::empty_set(),
68            // e R --> R
69            (Regex::EmptyString, inner) | (inner, Regex::EmptyString) => inner,
70            // R (S T) --> (R S) T
71            // (build)
72            (left, right) => right
73                .into_reverse_concat_iter()
74                .chain(left.into_reverse_concat_iter())
75                .collect_vec()
76                .into_iter()
77                .rev()
78                .reduce(|l, r| Regex::Concat(l.into(), r.into()))
79                .expect("at least two items"),
80        }
81    }
82
83    fn or(left: Regex<Self>, right: Regex<Self>) -> Regex<Self> {
84        match (left, right) {
85            // 0 | R --> R
86            (Regex::EmptySet, inner) | (inner, Regex::EmptySet) => inner,
87            // !0 | R --> !0
88            (any, _) | (_, any) if any.is_empty_set_complement() => {
89                Self::complement(Self::empty_set())
90            }
91            // R | R --> R
92            // R | (S | T) --> (R | S) | T
93            // S | R --> R | S
94            // (build)
95            (left, right) => right
96                .into_reverse_or_iter()
97                .chain(left.into_reverse_or_iter())
98                .sorted_by(cmp)
99                .dedup()
100                .reduce(|l, r| Regex::Or(l.into(), r.into()))
101                .expect("at least two items"),
102        }
103    }
104
105    fn and(left: Regex<Self>, right: Regex<Self>) -> Regex<Self> {
106        match (left, right) {
107            // 0 & R --> 0
108            (Regex::EmptySet, _) | (_, Regex::EmptySet) => Self::empty_set(),
109            // !0 & R --> R
110            (any, inner) | (inner, any) if any.is_empty_set_complement() => inner,
111            // R & R --> R
112            // R & (S & T) --> (R & S) & T
113            // S | R --> R | S
114            // (build)
115            (left, right) => right
116                .into_reverse_and_iter()
117                .chain(left.into_reverse_and_iter())
118                .sorted_by(cmp)
119                .dedup()
120                .reduce(|l, r| Regex::And(l.into(), r.into()))
121                .expect("at least two items"),
122        }
123    }
124
125    fn complement(inner: Regex<Self>) -> Regex<Self> {
126        match inner {
127            // !!R --> R
128            Regex::Complement(inner) => *inner,
129            // (build)
130            inner => Regex::Complement(inner.into()),
131        }
132    }
133}
134
135impl<S: Alphabet> Regex<ApproximatelySimilarCanonical<S>> {
136    /// Iterate in reverse over nested "concat" regular expressions.
137    fn into_reverse_concat_iter(self) -> impl Iterator<Item = Self> {
138        ReverseIter(Some(self), |r| {
139            if let Regex::Concat(next, value) = r {
140                (*value, Some(*next))
141            } else {
142                (r, None)
143            }
144        })
145    }
146
147    /// Iterate in reverse over nested "or" regular expressions.
148    fn into_reverse_or_iter(self) -> impl Iterator<Item = Self> {
149        ReverseIter(Some(self), |r| {
150            if let Regex::Or(next, value) = r {
151                (*value, Some(*next))
152            } else {
153                (r, None)
154            }
155        })
156    }
157
158    /// Iterate in reverse over nested "and" regular expressions.
159    fn into_reverse_and_iter(self) -> impl Iterator<Item = Self> {
160        ReverseIter(Some(self), |r| {
161            if let Regex::And(next, value) = r {
162                (*value, Some(*next))
163            } else {
164                (r, None)
165            }
166        })
167    }
168
169    // Returns whether this regular expression is the complement of the empty set.
170    fn is_empty_set_complement(&self) -> bool {
171        if let Regex::Complement(inner) = self {
172            matches!(inner.as_ref(), Regex::EmptySet)
173        } else {
174            false
175        }
176    }
177}
178
179struct ReverseIter<S, F>(Option<Regex<ApproximatelySimilarCanonical<S>>>, F)
180where
181    S: Alphabet,
182    F: Fn(
183        Regex<ApproximatelySimilarCanonical<S>>,
184    ) -> (
185        Regex<ApproximatelySimilarCanonical<S>>,
186        Option<Regex<ApproximatelySimilarCanonical<S>>>,
187    );
188
189impl<S, F> Iterator for ReverseIter<S, F>
190where
191    S: Alphabet,
192    F: Fn(
193        Regex<ApproximatelySimilarCanonical<S>>,
194    ) -> (
195        Regex<ApproximatelySimilarCanonical<S>>,
196        Option<Regex<ApproximatelySimilarCanonical<S>>>,
197    ),
198{
199    type Item = Regex<ApproximatelySimilarCanonical<S>>;
200    fn next(&mut self) -> Option<Self::Item> {
201        let mut r = None;
202        std::mem::swap(&mut r, &mut self.0);
203        if let Some(r) = r {
204            let (value, next) = self.1(r);
205            self.0 = next;
206            Some(value)
207        } else {
208            None
209        }
210    }
211}
212
213fn cmp<B: Builder>(left: &Regex<B>, right: &Regex<B>) -> Ordering {
214    match (left, right) {
215        (Regex::Symbol(left_value), Regex::Symbol(right_value)) => left_value.cmp(right_value),
216        (Regex::Concat(left_left, left_right), Regex::Concat(right_left, right_right)) => {
217            cmp(left_left, right_left).then(cmp(left_right, right_right))
218        }
219        (Regex::Closure(left_inner), Regex::Closure(right_inner)) => cmp(left_inner, right_inner),
220        (Regex::Or(left_left, left_right), Regex::Or(right_left, right_right)) => {
221            cmp(left_left, right_left).then(cmp(left_right, right_right))
222        }
223        (Regex::And(left_left, left_right), Regex::And(right_left, right_right)) => {
224            cmp(left_left, right_left).then(cmp(left_right, right_right))
225        }
226        (Regex::Complement(left_inner), Regex::Complement(right_inner)) => {
227            cmp(left_inner, right_inner)
228        }
229        (left, right) => rank(left).cmp(&rank(right)),
230    }
231}
232
233fn rank<B: Builder>(re: &Regex<B>) -> usize {
234    match re {
235        Regex::EmptySet => 1,
236        Regex::EmptyString => 2,
237        Regex::Symbol(_) => 3,
238        Regex::Concat(_, _) => 4,
239        Regex::Closure(_) => 5,
240        Regex::Or(_, _) => 6,
241        Regex::And(_, _) => 7,
242        Regex::Complement(_) => 8,
243    }
244}
245
246#[cfg(test)]
247mod tests {
248    use crate::builder::Pure;
249    use crate::ops::*;
250
251    use super::*;
252
253    #[test]
254    fn test_canonical_forms() {
255        let tests: Vec<(
256            Regex<ApproximatelySimilarCanonical<usize>>,
257            Regex<Pure<usize>>,
258        )> = vec![
259            (().r(), ().r()),
260            (().r().c(), [].r()),
261            ([].r().c(), [].r()),
262            (11.s() & [42.s(), ().r()].r(), ().r()),
263            (42.s() & 11.s() & 17.s(), 11.s() & 17.s() & 42.s()),
264            (42.s() & !11.s() & 17.s(), 17.s() & 42.s() & !11.s()),
265            (42.s() | 11.s() | 17.s(), 11.s() | 17.s() | 42.s()),
266            (42.s() | !11.s() | 17.s(), 17.s() | 42.s() | !11.s()),
267            (!42.s() & !11.s(), !11.s() & !42.s()),
268        ];
269        for test in tests {
270            assert_eq!(test.1, test.0.rebuild());
271        }
272    }
273
274    #[test]
275    fn test_equivalent_forms() {
276        let tests: Vec<(
277            Regex<ApproximatelySimilarCanonical<usize>>,
278            Regex<ApproximatelySimilarCanonical<usize>>,
279        )> = vec![
280            (11.s() & 42.s() & 7.s(), 7.s() & 42.s() & 11.s()),
281            (11.s() & 7.s() & 42.s(), 42.s() & 7.s() & 11.s()),
282            (11.s() | 42.s() | 7.s(), 7.s() | 42.s() | 11.s()),
283            (11.s() | 7.s() | 42.s(), 42.s() | 7.s() | 11.s()),
284            (42.s().c().c(), 42.s().c()),
285            (11.s() | 11.s(), 11.s()),
286            (11.s() & !().r(), 11.s()),
287            (11.s() | !().r(), !().r()),
288            (11.s() & (42.s() & 7.s()), 7.s() & 11.s() & 42.s()),
289            (11.s() | (42.s() | 7.s()), 7.s() | 11.s() | 42.s()),
290        ];
291        for test in tests {
292            assert_eq!(test.1, test.0.rebuild());
293        }
294    }
295}