brzozowski_regex/
derivation.rs

1// Copyright 2024 Hendrik van Antwerpen
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! Derivation and derivation-based matching for regular expressions.
16
17use std::borrow::Borrow;
18use std::collections::HashSet;
19
20use itertools::Itertools;
21
22use crate::builder::Builder;
23use crate::builder::Regex;
24use crate::Alphabet;
25
26impl<B: Builder> Regex<B> {
27    /// Returns the derivative of this regular expression w.r.t. the given symbols.
28    pub fn derive_iter<I>(&self, symbols: impl IntoIterator<Item = I>) -> Regex<B>
29    where
30        I: Borrow<B::Symbol>,
31    {
32        let mut d = self.clone();
33        for symbol in symbols {
34            d = d.derive(symbol.borrow());
35        }
36        d
37    }
38
39    /// Returns the derivative of this regular expression w.r.t. to the given symbol.
40    #[inline]
41    pub fn derive(&self, symbol: &B::Symbol) -> Regex<B> {
42        self.derive_symbols(&Symbols::include([symbol.clone()]))
43    }
44
45    pub(crate) fn derive_symbols(&self, symbol: &Symbols<B::Symbol>) -> Regex<B> {
46        match self {
47            Self::EmptySet => B::empty_set(),
48            Self::EmptyString => B::empty_set(),
49            Self::Symbol(inner) => {
50                if symbol.matches(inner) {
51                    B::empty_string()
52                } else {
53                    B::empty_set()
54                }
55            }
56            Self::Concat(left, right) => B::or(
57                B::concat(left.derive_symbols(symbol), *right.clone()),
58                B::concat(left.nullable(), right.derive_symbols(symbol)),
59            ),
60            Self::Closure(inner) => {
61                B::concat(inner.derive_symbols(symbol), B::closure(*inner.clone()))
62            }
63            Self::Or(left, right) => {
64                B::or(left.derive_symbols(symbol), right.derive_symbols(symbol))
65            }
66            Self::And(left, right) => {
67                B::and(left.derive_symbols(symbol), right.derive_symbols(symbol))
68            }
69            Self::Complement(inner) => B::complement(inner.derive_symbols(symbol)),
70        }
71    }
72
73    /// Returns whether the string of symbols is in the language of this regular expression.
74    pub fn matches<I>(&self, symbols: impl IntoIterator<Item = I>) -> bool
75    where
76        I: Borrow<B::Symbol>,
77    {
78        self.derive_iter(symbols).is_nullable()
79    }
80}
81
82#[derive(Clone, Debug, Eq, PartialEq)]
83pub(crate) enum Symbols<S: Alphabet> {
84    /// Only the given symbols.
85    Include(HashSet<S>),
86    /// All except the given symbols.
87    Exclude(HashSet<S>),
88}
89
90impl<S: Alphabet> std::fmt::Display for Symbols<S>
91where
92    S: std::fmt::Display,
93{
94    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
95        match self {
96            Symbols::Include(symbols) => write!(f, "{{{}}}", symbols.iter().join(", ")),
97            Symbols::Exclude(symbols) => write!(f, "Σ∖{{{}}}", symbols.iter().join(", ")),
98        }
99    }
100}
101
102impl<S: Alphabet> Symbols<S> {
103    #[inline]
104    pub(crate) fn include<const N: usize>(symbols: [S; N]) -> Self {
105        Self::Include(HashSet::from(symbols))
106    }
107
108    #[cfg(test)]
109    #[inline]
110    pub(crate) fn exclude<const N: usize>(symbols: [S; N]) -> Self {
111        Self::Exclude(HashSet::from(symbols))
112    }
113
114    pub(crate) fn matches(&self, symbol: &S) -> bool {
115        match self {
116            Self::Include(included) => included.contains(symbol),
117            Self::Exclude(excluded) => !excluded.contains(symbol),
118        }
119    }
120}
121
122impl<S: Alphabet> std::ops::BitOr for Symbols<S> {
123    type Output = Self;
124
125    fn bitor(self, other: Self) -> Self::Output {
126        // either this or the other regular expressions must match
127        match (self, other) {
128            // include all included symbols
129            (Self::Include(left), Self::Include(right)) => {
130                Self::Include(HashSet::union(&left, &right).cloned().collect())
131            }
132            // exclude shared excluded symbols
133            (Self::Exclude(left), Self::Exclude(right)) => {
134                Self::Exclude(HashSet::intersection(&left, &right).cloned().collect())
135            }
136            // exclude the excluded symbols except the included symbols
137            (Self::Include(included), Self::Exclude(excluded))
138            | (Self::Exclude(excluded), Self::Include(included)) => {
139                Self::Exclude(excluded.difference(&included).cloned().collect())
140            }
141        }
142    }
143}
144
145impl<S: Alphabet> std::ops::BitAnd for Symbols<S> {
146    type Output = Self;
147
148    fn bitand(self, other: Self) -> Self::Output {
149        // both this and the other regular expression must match
150        match (self, other) {
151            // include shared included symbols
152            (Self::Include(left), Self::Include(right)) => {
153                Self::Include(HashSet::intersection(&left, &right).cloned().collect())
154            }
155            // exclude all excluded symbols
156            (Self::Exclude(left), Self::Exclude(right)) => {
157                Self::Exclude(HashSet::union(&left, &right).cloned().collect())
158            }
159            // include the included symbols except the excluded symbols
160            (Self::Include(included), Self::Exclude(excluded))
161            | (Self::Exclude(excluded), Self::Include(included)) => {
162                Self::Include(included.difference(&excluded).cloned().collect())
163            }
164        }
165    }
166}
167
168impl<S: Alphabet> std::ops::Not for Symbols<S> {
169    type Output = Self;
170
171    fn not(self) -> Self::Output {
172        match self {
173            Self::Include(symbols) => Self::Exclude(symbols),
174            Self::Exclude(symbols) => Self::Include(symbols),
175        }
176    }
177}
178
179#[cfg(test)]
180mod tests {
181    use crate::builder::ApproximatelySimilarCanonical;
182    use crate::builder::Pure;
183    use crate::ops::*;
184
185    use super::*;
186
187    #[test]
188    fn test_derive_symbols() {
189        let tests: Vec<(Regex<Pure<usize>>, Symbols<usize>, Regex<Pure<usize>>)> = vec![
190            (().r(), Symbols::include([42]), ().r()),
191            (().r(), Symbols::exclude([42]), ().r()),
192            ([].r(), Symbols::include([42]), ().r()),
193            ([].r(), Symbols::exclude([42]), ().r()),
194            ([42.s()].r(), Symbols::include([42]), [].r()),
195            ([42.s()].r(), Symbols::exclude([42]), ().r()),
196            (!().r(), Symbols::include([42]), !().r()),
197            (!().r(), Symbols::exclude([42]), !().r()),
198            (!42.s(), Symbols::include([42]), ![].r()),
199            (!42.s(), Symbols::exclude([42]), !().r()),
200        ];
201        for (r, symbols, expected) in tests {
202            let actual = r.derive_symbols(&symbols);
203            assert_eq!(
204                expected, actual,
205                "derive {} for {}, expected {}, got {}",
206                r, symbols, expected, actual
207            );
208        }
209    }
210
211    #[test]
212    fn test_matches_pure() {
213        test_matches::<Pure<_>>();
214    }
215
216    #[test]
217    fn test_matches_asc() {
218        test_matches::<ApproximatelySimilarCanonical<_>>();
219    }
220
221    fn test_matches<B: Builder<Symbol = usize> + Clone>() {
222        let tests: Vec<(Regex<B>, Vec<_>, bool)> = vec![
223            ((().r()), vec![], false),
224            (([].r()), vec![], true),
225            (42.s(), vec![42], true),
226            (42.s(), vec![42, 42], false),
227            (42.s(), vec![11], false),
228            ((![42.s()].r()), vec![], true),
229            (([42.s()].r()), vec![42], true),
230            (([42.s(), (11.s() | 7.s())].r()), vec![42], false),
231            (([42.s(), (11.s() | 7.s())].r()), vec![42, 11], true),
232            (([42.s(), (11.s() | 7.s())].r()), vec![42, 7], true),
233            ((42.s().c()), vec![], true),
234            ((42.s().c()), vec![42], true),
235            ((42.s().c()), vec![42, 42, 42], true),
236            ((42.s().c()), vec![42, 11], false),
237            ((42.s() & [].r()), vec![42], false),
238            ((42.s() & 42.s().c()), vec![42], true),
239            ((42.s() & 42.s().c()), vec![42, 42], false),
240            ((42.s() | 11.s()), vec![42], true),
241            ((42.s() | 11.s()), vec![11], true),
242            ((42.s() | 11.s()), vec![11, 42], false),
243            ((42.s() | 11.s().c()), vec![42], true),
244            ((42.s() | 11.s().c()), vec![11], true),
245            ((42.s() | 11.s().c()), vec![11, 11], true),
246            ((42.s() | 11.s().c()), vec![42, 11], false),
247            ((!().r()), vec![11], true),
248            ((!11.s()), vec![42], true),
249            ((!11.s()), vec![11], false),
250        ];
251        for test in tests {
252            assert_eq!(test.2, test.0.matches(test.1));
253        }
254    }
255}