scopegraphs_regular_expressions/
regex.rs

1use crate::compile::AlphabetOrder;
2use proc_macro2::TokenStream;
3use std::collections::HashSet;
4use std::fmt::Debug;
5use std::fmt::Display;
6use std::fmt::Formatter;
7use std::hash::{Hash, Hasher};
8use std::ops::Deref;
9use std::rc::Rc;
10use syn::Path;
11
12pub struct Symbol {
13    pub(super) name: Path,
14}
15
16impl Eq for Symbol {}
17
18impl PartialEq for Symbol {
19    fn eq(&self, other: &Self) -> bool {
20        self.to_string() == other.to_string()
21    }
22}
23
24impl Debug for Symbol {
25    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
26        Display::fmt(self, f)
27    }
28}
29
30impl Hash for Symbol {
31    fn hash<H: Hasher>(&self, state: &mut H) {
32        self.to_string().hash(state)
33    }
34}
35
36impl Display for Symbol {
37    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
38        let name = &self.name;
39        write!(f, "{}", quote::quote!(#name))
40    }
41}
42
43impl From<&str> for Symbol {
44    fn from(value: &str) -> Self {
45        Self {
46            name: syn::parse2(value.parse::<TokenStream>().unwrap()).unwrap(),
47        }
48    }
49}
50
51/// A regular expression that can specify a path through a scope graph.
52///
53/// Can be created using [`parse_regex`](crate::parse_regex)
54#[derive(Hash, Clone, PartialEq, Eq)]
55pub enum Regex {
56    /// `e`
57    EmptyString,
58    /// `0`
59    EmptySet,
60    /// A symbol like `a`
61    Symbol(Rc<Symbol>),
62    /// x*
63    Repeat(Rc<Regex>),
64    /// ~x
65    Complement(Rc<Regex>),
66    /// x | y
67    Or(Rc<Regex>, Rc<Regex>),
68    /// x & y
69    And(Rc<Regex>, Rc<Regex>),
70    /// x y
71    Concat(Rc<Regex>, Rc<Regex>),
72}
73
74impl Debug for Regex {
75    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
76        match self {
77            Self::EmptyString => write!(f, "e"),
78            Self::EmptySet => write!(f, "0"),
79            Self::Symbol(sym) => write!(f, "{:?}", sym.name.get_ident()),
80            Self::Repeat(re) => write!(f, "{:?}*", *re),
81            Self::Complement(re) => write!(f, "~{:?}", *re),
82            Self::Or(l, r) => write!(f, "{:?} | {:?}", *l, *r),
83            Self::And(l, r) => write!(f, "{:?} & {:?}", *l, *r),
84            Self::Concat(l, r) => write!(f, "{:?} {:?}", *l, *r),
85        }
86    }
87}
88
89impl Regex {
90    /// Applies a symbol to a regex. When a symbol is applied to a regex, a new regex
91    /// is returned that matches everything after this symbol.
92    /// Essentially this "strips" one symbol off of the regex, like from
93    /// state regex(abab), if we see a symbol a, we go to state regex(bab)
94    ///
95    /// symbol can be None, to see what happens when a token outside of the alphabet
96    /// is given to see what state we go to then.
97    ///
98    /// This is called the Brzozowski derivative
99    /// Janusz A. Brzozowski (1964). "Derivatives of Regular Expressions". J ACM. 11 (4): 481–494. doi:10.1145/321239.321249
100    pub(crate) fn derive(&self, symbol: Option<&Rc<Symbol>>) -> Rc<Regex> {
101        match self {
102            // a: 0 => 0
103            Regex::EmptySet => Regex::EmptySet.into(),
104            // a: e => 0
105            Regex::EmptyString => Regex::EmptySet.into(),
106            // a: a => e
107            // a: b => 0
108            Regex::Symbol(s) => {
109                if Some(s) == symbol {
110                    Regex::EmptyString.into()
111                } else {
112                    Regex::EmptySet.into()
113                }
114            }
115            // a: (ab)* => b(ab)*
116            Regex::Repeat(r) => {
117                Regex::Concat(r.derive(symbol), Regex::Repeat(r.clone()).into()).into()
118            }
119            // a: ~(ab) => ~(b)
120            Regex::Complement(c) => Regex::Complement(c.derive(symbol)).into(),
121            // a: (ab | ac) => b | c
122            Regex::Or(l, r) => Regex::Or(l.derive(symbol), r.derive(symbol)).into(),
123            // a: (ab & ac) => b & c
124            Regex::And(l, r) => Regex::And(l.derive(symbol), r.derive(symbol)).into(),
125            // a: ab => b
126            Regex::Concat(l, r) => {
127                let new = Regex::Concat(l.derive(symbol), r.clone()).into();
128
129                if l.is_nullable() {
130                    Regex::Or(new, r.derive(symbol)).into()
131                } else {
132                    new
133                }
134            }
135        }
136    }
137
138    /// Normalizes a regex to a standard form.
139    ///
140    /// For example, `a e` is equivalent to `a`, and this transformation is made here.
141    pub(crate) fn normalize(self: &Rc<Self>, ab: &AlphabetOrder) -> Rc<Regex> {
142        match self.deref() {
143            Regex::EmptyString => self.clone(),
144            Regex::EmptySet => self.clone(),
145            Regex::Symbol(_) => self.clone(),
146            Regex::Repeat(r) => {
147                let r = r.normalize(ab);
148                match r.deref() {
149                    // 0* => e
150                    Regex::EmptySet => Regex::EmptyString.into(),
151                    // e* => e
152                    Regex::EmptyString => r,
153                    // a** => a*
154                    Regex::Repeat(_) => r,
155                    _ => Regex::Repeat(r).into(),
156                }
157            }
158            Regex::Complement(c) => {
159                let c = c.normalize(ab);
160                match c.deref() {
161                    // ~~a => a
162                    Regex::Complement(c) => c.clone(),
163                    _ => Regex::Complement(c).into(),
164                }
165            }
166            Regex::Or(l, r) => normalize_or(l, r, ab),
167            Regex::And(l, r) => normalize_and(l, r, ab),
168            Regex::Concat(l, r) => {
169                let l = l.normalize(ab);
170                let r = r.normalize(ab);
171
172                match (l.deref(), r.deref()) {
173                    // 0 a => 0
174                    (Regex::EmptySet, _) => l,
175                    // e a => e
176                    (Regex::EmptyString, _) => r,
177                    // (a b) c => a (b c)
178                    (Regex::Concat(il, ir), _) => {
179                        Regex::Concat(il.clone(), Regex::Concat(ir.clone(), r).into()).into()
180                    }
181                    // a 0 => 0
182                    (_, Regex::EmptySet) => r,
183                    // a e => a
184                    (_, Regex::EmptyString) => l,
185                    _ => Regex::Concat(l, r).into(),
186                }
187            }
188        }
189    }
190
191    /// Returns whether this regex is nullable (i.e., accepts the empty string `e`).
192    ///
193    /// Examples of this include the `e` itself, any 0-ary repeat `A*`, `e | A`, etc.
194    ///
195    /// ```
196    /// # use scopegraphs_regular_expressions::Regex;
197    /// assert!(Regex::EmptyString.is_nullable())
198    /// ```
199    pub fn is_nullable(&self) -> bool {
200        match self {
201            Regex::EmptySet => false,
202            Regex::EmptyString => true,
203            Regex::Symbol(_) => false,
204            Regex::Concat(l, r) => l.is_nullable() && r.is_nullable(),
205            Regex::Repeat(_) => true,
206            Regex::Or(l, r) => l.is_nullable() || r.is_nullable(),
207            Regex::And(l, r) => l.is_nullable() && r.is_nullable(),
208            Regex::Complement(i) => !i.is_nullable(),
209        }
210    }
211
212    /// Searches for the alphabet used in this regular expression.
213    ///
214    /// Uses depth-first search to traverse the regex to get the alphabet in use.
215    pub(crate) fn alphabet(&self) -> HashSet<Rc<Symbol>> {
216        let mut alphabet = HashSet::new();
217        self.search_alphabet(&mut alphabet);
218        alphabet
219    }
220
221    fn search_alphabet(&self, alphabet: &mut HashSet<Rc<Symbol>>) {
222        match self {
223            Regex::EmptyString => {}
224            Regex::EmptySet => {}
225            Regex::Symbol(s) => {
226                alphabet.insert(s.clone());
227            }
228            Regex::Repeat(i) | Regex::Complement(i) => i.search_alphabet(alphabet),
229            Regex::Or(l, r) | Regex::And(l, r) | Regex::Concat(l, r) => {
230                l.search_alphabet(alphabet);
231                r.search_alphabet(alphabet);
232            }
233        }
234    }
235
236    fn order(&self, ab: &AlphabetOrder) -> i64 {
237        // ordering of these variants is the same as the one in the java implementation of spoofax
238        // at https://github.com/metaborg/nabl/blob/802559782da2216b66d290f90179c2ac8f21ba3f/scopegraph/src/main/java/mb/scopegraph/regexp/impl/RegExpNormalizingBuilder.java#L164.
239        // The exact order is likely not necessary for correctness, and it's unclear if it's a good
240        // order for speed, but we decided to use the same order regardless, just to be sure.
241        match self {
242            Regex::EmptySet => 1,
243            Regex::EmptyString => 2,
244            Regex::Concat(_, _) => 3,
245            Regex::Repeat(_) => 4,
246            Regex::Or(_, _) => 5,
247            Regex::And(_, _) => 6,
248            Regex::Complement(_) => 7,
249            Regex::Symbol(s) => ab.get(s) + 8,
250        }
251    }
252
253    fn compare(&self, other: &Regex, ab: &AlphabetOrder) -> i64 {
254        match (self, other) {
255            (Regex::Concat(l1, r1), Regex::Concat(l2, r2)) => {
256                let ans = l1.compare(l2, ab);
257                if ans == 0 {
258                    r1.compare(r2, ab)
259                } else {
260                    ans
261                }
262            }
263            (Regex::Repeat(l), Regex::Repeat(r)) => l.compare(r, ab),
264            (Regex::Or(l1, r1), Regex::Or(l2, r2)) => {
265                let ans = l1.compare(l2, ab);
266                if ans == 0 {
267                    r1.compare(r2, ab)
268                } else {
269                    ans
270                }
271            }
272            (Regex::And(l1, r1), Regex::And(l2, r2)) => {
273                let ans = l1.compare(l2, ab);
274                if ans == 0 {
275                    r1.compare(r2, ab)
276                } else {
277                    ans
278                }
279            }
280            (Regex::Complement(l), Regex::Complement(r)) => l.compare(r, ab),
281            _ => self.order(ab) - other.order(ab),
282        }
283    }
284}
285
286fn normalize_or(l: &Rc<Regex>, r: &Rc<Regex>, ab: &AlphabetOrder) -> Rc<Regex> {
287    let l = l.normalize(ab);
288    let r = r.normalize(ab);
289
290    match (l.deref(), r.deref()) {
291        // a | a => a
292        (a, b) if a == b => l,
293        // 0 | a => a
294        (Regex::EmptySet, _) => r,
295        // e | a => a if a.is_nullable()
296        (Regex::EmptyString, rr) if rr.is_nullable() => r,
297        // (a | b) | c => a | (b | c)
298        (Regex::Or(il, ir), _) => normalize_or(il, &normalize_or(ir, &r, ab), ab),
299        // ~0 | a => ~0
300        (Regex::Complement(c), _) if c.deref() == &Regex::EmptySet => l,
301        // (a | (b | c)) => (b | (a | c)) if b < a
302        (_, Regex::Or(il, ir)) => {
303            if l.compare(il, ab) > 0 {
304                normalize_or(il, &normalize_or(&l, ir, ab), ab)
305            } else {
306                Regex::Or(l, r).into()
307            }
308        }
309        // a | b => b | a if b < a
310        _ if l.compare(&r, ab) > 0 => normalize_or(&r, &l, ab),
311        _ => Regex::Or(l, r).into(),
312    }
313}
314
315fn normalize_and(l: &Rc<Regex>, r: &Rc<Regex>, ab: &AlphabetOrder) -> Rc<Regex> {
316    let l = l.normalize(ab);
317    let r = r.normalize(ab);
318
319    match (l.deref(), r.deref()) {
320        // a & a => a
321        (a, b) if a == b => l,
322        // 0 & a => 0
323        (Regex::EmptySet, _) => l,
324        // e & a => e if a.is_nullable()
325        (Regex::EmptyString, r) if r.is_nullable() => l,
326        // e & a => 0 if !a.is_nullable()
327        (Regex::EmptyString, _) => Regex::EmptySet.into(),
328        // (a & b) & c => a & (b & c)
329        (Regex::And(il, ir), _) => normalize_and(il, &normalize_and(ir, &r, ab), ab),
330        // ~e & a => a
331        (Regex::Complement(c), _) if c.normalize(ab).deref() == &Regex::EmptySet => r,
332        // (a & (b & c)) => (b & (a & c)) if b < a
333        (_, Regex::And(il, ir)) if l.compare(ir, ab) > 0 => {
334            normalize_and(il, &normalize_and(&l, ir, ab), ab)
335        }
336        // a & b => b & a if b < a
337        _ if l.compare(&r, ab) > 0 => normalize_and(&r, &l, ab),
338        _ => Regex::And(l, r).into(),
339    }
340}
341
342impl Display for Regex {
343    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
344        match self {
345            Regex::EmptyString => write!(f, "e"),
346            Regex::EmptySet => write!(f, "0"),
347            Regex::Symbol(s) => write!(f, "{s}"),
348            Regex::Repeat(r) => write!(f, "({r})*"),
349            Regex::Complement(c) => write!(f, "~({c})"),
350            Regex::Or(a, b) => write!(f, "{a} | {b}"),
351            Regex::And(a, b) => write!(f, "{a} | {b}"),
352            Regex::Concat(a, b) => write!(f, "{a} {b}"),
353        }
354    }
355}
356
357#[cfg(test)]
358mod tests {
359    use crate::compile::AlphabetOrder;
360    use crate::parse_regex;
361    use crate::regex::Symbol;
362    use std::rc::Rc;
363
364    #[test]
365    fn nullable() {
366        assert!(parse_regex("e").unwrap().is_nullable());
367        assert!(!parse_regex("0").unwrap().is_nullable());
368
369        assert!(parse_regex("e | e").unwrap().is_nullable());
370        assert!(parse_regex("e | A").unwrap().is_nullable());
371        assert!(parse_regex("A | e").unwrap().is_nullable());
372        assert!(!parse_regex("A | B").unwrap().is_nullable());
373
374        assert!(parse_regex("e | e").unwrap().is_nullable());
375        assert!(!parse_regex("e & A").unwrap().is_nullable());
376        assert!(!parse_regex("e & A").unwrap().is_nullable());
377        assert!(!parse_regex("A & B").unwrap().is_nullable());
378
379        assert!(parse_regex("e | e").unwrap().is_nullable());
380        assert!(!parse_regex("A e").unwrap().is_nullable());
381        assert!(!parse_regex("e B").unwrap().is_nullable());
382        assert!(!parse_regex("A B").unwrap().is_nullable());
383
384        assert!(parse_regex("A*").unwrap().is_nullable());
385        assert!(!parse_regex("A+").unwrap().is_nullable());
386        assert!(parse_regex("A?").unwrap().is_nullable());
387    }
388
389    #[test]
390    fn apply_symbol() {
391        let a = Rc::new(Symbol::from("a"));
392        let b = Rc::new(Symbol::from("b"));
393        let c = Rc::new(Symbol::from("c"));
394        let ab = AlphabetOrder::new(&[a.clone(), b.clone(), c.clone()].into_iter().collect());
395
396        assert_eq!(
397            parse_regex("a b").unwrap().derive(Some(&a)).normalize(&ab),
398            parse_regex("b").unwrap().into()
399        );
400        assert_eq!(
401            parse_regex("a b").unwrap().derive(Some(&b)).normalize(&ab),
402            parse_regex("0").unwrap().into()
403        );
404    }
405}