scopegraphs_regular_expressions/
parse.rs

1//! This module contains a bottom-up (LR(1)) parser for our regular expression data type.
2//!
3//!
4//! ### Grammar
5//!
6//! The parser is implemented based on a LR parse table generated from [an online generator](https://jsmachines.sourceforge.net/machines/lr1.html), using the following grammar.
7//!
8//! ```bnf
9//!  0: S -> E
10//!  1: E -> 0
11//!  2: E -> e
12//!  3: E -> l
13//!  4: E -> ~ E
14//!  5: E -> E *
15//!  6: E -> E +
16//!  7: E -> E ?
17//!  8: E -> E E
18//!  9: E -> E | E
19//! 10: E -> E & E
20//! ```
21//!
22//! Here, `l` should be interpreted as label literals and parenthesized expressions.
23//! We do not spell out the parenthesized expressions because the Rust lexer can automatically handle those for us.
24//!
25//!
26//! ### Disambiguation
27//!
28//! To disambiguate this grammar, we used the same operator priorities as [Statix](https://github.com/metaborg/nabl/blob/802559782da2216b66d290f90179c2ac8f21ba3f/statix.lang/syntax/statix/lang/Core.sdf3#L218-L221):
29//!
30//! | Operator            | Precedence Level | Associativity |
31//! | :-----------------: | :--------------- | ------------- |
32//! | `~`                 | 4                |               |
33//! | `*`                 | 4                |               |
34//! | `+`                 | 4                |               |
35//! | `?`                 | 4                |               |
36//! | (concat)            | 3                | (right)       |
37//! | `&`                 | 2                | (left)        |
38//! | <code>\|</code>     | 1                | (left)        |
39//!
40//!
41//! ### Parse Table
42//!
43//! The LR parse table generated from this grammar is as follows (closure sets on the documentation of [`ParseState`]):
44//!
45//! | State | `0`       | `e`        | `l`        | `~`        | `*`        | `+`        | `?`        | <code>\|</code> | `&`        | `$`        |   | `S` | `E`   |
46//! | ----- | --------- | ---------- | ---------- | ---------- | ---------- | ---------- | ---------- | --------------- | ---------- | ---------- | - | --- | ----- |
47//! | 0     | `s2`      | `s3`       | `s4`       | `s5`       |            |            |            |                 |            |            |   |     | `1`   |
48//! | 1     | `s2`      | `s3`       | `s4`       | `s5`       | `s6`       | `s7`       | `s8`       | `s10`           | `s11`      | `ACC`      |   |     | `9`   |
49//! | 2     | `r1`      | `r1`       | `r1`       | `r1`       | `r1`       | `r1`       | `r1`       | `r1`            | `r1`       | `r1`       |   |     |       |
50//! | 3     | `r2`      | `r2`       | `r2`       | `r2`       | `r2`       | `r2`       | `r2`       | `r2`            | `r2`       | `r2`       |   |     |       |
51//! | 4     | `r3`      | `r3`       | `r3`       | `r3`       | `r3`       | `r3`       | `r3`       | `r3`            | `r3`       | `r3`       |   |     |       |
52//! | 5     | `s2`      | `s3`       | `s4`       | `s5`       |            |            |            |                 |            |            |   |     | `12`  |
53//! | 6     | `r5`      | `r5`       | `r5`       | `r5`       | `r5`       | `r5`       | `r5`       | `r5`            | `r5`       | `r5`       |   |     |       |
54//! | 7     | `r6`      | `r6`       | `r6`       | `r6`       | `r6`       | `r6`       | `r6`       | `r6`            | `r6`       | `r6`       |   |     |       |
55//! | 8     | `r7`      | `r7`       | `r7`       | `r7`       | `r7`       | `r7`       | `r7`       | `r7`            | `r7`       | `r7`       |   |     |       |
56//! | 9     | `s2`[^9l] | `s3`[^9l]  | `s4`[^9l]  | `s5`[^9n]  | `s6`[^9n]  | `s7`[^9n]  | `s8`[^9n]  | `r8`[^9o]       | `r8`[^9a]  | `r8`       |   |     | `9`   |
57//! | 10    | `s2`      | `s3`       | `s4`       | `s5`       |            |            |            |                 |            |            |   |     | `13`  |
58//! | 11    | `s2`      | `s3`       | `s4`       | `s5`       |            |            |            |                 |            |            |   |     | `14`  |
59//! | 12    | `r4`[^120]| `r4`[^12e] | `r4`[^12l] | `r4`[^12n] | `AMB`[^2b] | `AMB`[^2b] | `AMB`[^2b] | `r4`[^12o]      | `r4`[^12a] | `r4`       |   |     | `9`   |
60//! | 13    | `s2`[^3l] | `s3`[^3l]  | `s4`[^3l]  | `s5`[^3l]  | `s6`[^3l]  | `s7`[^3l]  | `s8`[^3l]  | `r9`[^3o]       | `s11`[^3l] | `r9`       |   |     | `9`   |
61//! | 14    | `s2`[^4l] | `s3`[^4l]  | `s4`[^4l]  | `s5`[^4l]  | `s6`[^4l]  | `s7`[^4l]  | `s8`[^4l]  | `r10`[^4o]      | `r10`[^4a] | `r10`      |   |     | `9`   |
62//!
63//! The first segment being the ACTION-table, the last two columns being the GOTO-table.
64//! In the action table, actions as `sX` means: shift the symbol onto the stack, transition to state `X`.
65//! Actions like `rY` mean: reduce the top of the stack using production `Y` (from original grammar).
66//! The `ACC` action means accepting the current input.
67//! The `AMB` action is emitting an error that the current expression is ambiguous.
68//! Empty boxes are parse errors: the tokens are not expected at that position.
69//! The resolution of ambiguities (positions in the table where both a shift and a reduce action are possible (shift/reduce conflicts)) are annotated and their resolution motivated in the footnotes.
70//! For more explanation on constructing the parse table, see [these slides](https://www.eecis.udel.edu/~cavazos/cisc672-fall08/lectures/Lecture-10.pdf).
71//!
72//!
73//! ### Implementation
74//!
75//! This module contains a lexer ([`Lexer`]), which is a wrapper around a [`ParseStream`] that emits [`Token`]s (the tokens of our language).
76//! Using [`parenthesized`], we recursively parse regular expressions inside parentheses.
77//!
78//! The parser implementation ([`Parser`]) corresponds to the parse table in the following way:
79//! - The [`Parser::shift`] function implements shifting and transitioning to some state.
80//! - The [`Parser::goto`] function implements the GOTO-table.
81//! - The [`Parser::accept`] function accepts the input, leaving the resulting regular expression on the stack.
82//! - The [`Parser::step`] function performs a parsing step per invocation (i.e., it implements the ACTION-table).
83//!
84//! The reduction rules each have their own function as well:
85//! - `r1`: [`Parser::reduce_zero`]
86//! - `r2`: [`Parser::reduce_epsilon`]
87//! - `r3`: [`Parser::reduce_symbol`] (also parses parenthesized expressions)
88//! - `r4`: [`Parser::reduce_neg`]
89//! - `r5`: [`Parser::reduce_repeat`]
90//! - `r6`: [`Parser::reduce_plus`]
91//! - `r7`: [`Parser::reduce_optional`]
92//! - `r8`: [`Parser::reduce_concat`]
93//! - `r9`: [`Parser::reduce_or`]
94//! - `r10`: [`Parser::reduce_and`]
95//!
96//! The parser does not have error recovery.
97//!
98//!
99//! [^9l]: shift/reduce conflict with `r8`: resolved this way because concatenation is right-associative, so we need to shift more before we can reduce.
100//!
101//! [^9n]: shift/reduce conflict with `r8`: resolved this way because `~`/`*`/`+`/`?` have priority over concatenation.
102//!
103//! [^9o]: shift/reduce conflict with `s8`: resolved as `r8` because concatenation has priority over `|`.
104//!
105//! [^9a]: shift/reduce conflict with `s10`: resolved as `r8` because concatenation has priority over `&`.
106//!
107//!
108//!
109//! [^120]: shift/reduce conflict with `s2`: resolved as `r4` because negation has priority over concatenation.
110//!
111//! [^12e]: shift/reduce conflict with `s3`: resolved as `r4` because negation has priority over concatenation.
112//!
113//! [^12l]: shift/reduce conflict with `s4`: resolved as `r4` because negation has priority over concatenation.
114//!
115//! [^12n]: shift/reduce conflict with `s2`: resolved as `r4` because negation has priority over concatenation.
116//!
117//! [^2b]:  shift/reduce conflict with `s2`: resolved as an _ambiguity error_ because there is no priority between `~` (prefix operator) and `*`/`+`/`?` (postfix operators).
118//!
119//! [^12o]: shift/reduce conflict with `s2`: resolved as `r4` because negation has priority over `|`.
120//!
121//! [^12a]: shift/reduce conflict with `s2`: resolved as `r4` because negation has priority over `&`.
122//!
123//!
124//!
125//! [^3l]: shift/reduce conflict with `r9`: resolved as a shift because `|` has lowest priority.
126//!
127//! [^3o]: shift/reduce conflict with `s10`: resolved as a `r9` because `|` is left-associative.
128//!
129//!
130//!
131//! [^4l]: shift/reduce conflict with `r10`: resolved as a shift because `&` has lower priority than all operators, except for `|`.
132//!
133//! [^4o]: shift/reduce conflict with `s10`: resolved as a `r10` because `&` is left-associative.
134//!
135//! [^4a]: shift/reduce conflict with `s11`: resolved as a `r10` because `&` has priority over `|`.
136
137use crate::regex::Symbol;
138use crate::Regex;
139use std::fmt::Debug;
140use std::mem;
141use std::ops::Deref;
142use std::rc::Rc;
143use syn::parse::{Parse, ParseStream};
144use syn::{parenthesized, Path, Token};
145
146#[derive(Clone, PartialEq, Eq)]
147enum Token {
148    Zero,
149    Epsilon,
150    Neg,
151    Repeat,
152    Plus,
153    Optional,
154    Or,
155    And,
156    Regex(Rc<Regex>), // used for labels and parenthesized expressions
157    End,
158}
159
160impl Debug for Token {
161    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
162        match self {
163            Self::Zero => write!(f, "'0'"),
164            Self::Epsilon => write!(f, "'e'"),
165            Self::Neg => write!(f, "'~'"),
166            Self::Repeat => write!(f, "'*'"),
167            Self::Plus => write!(f, "'+'"),
168            Self::Optional => write!(f, "'?'"),
169            Self::Or => write!(f, "'|'"),
170            Self::And => write!(f, "'&'"),
171            Self::Regex(regex) => write!(f, "{:?}", *regex),
172            Self::End => write!(f, "'$'"),
173        }
174    }
175}
176
177struct Lexer<'a> {
178    input: ParseStream<'a>,
179    top: Token,
180}
181
182impl<'a> Lexer<'a> {
183    /// Returns the next [`Token`] from the `input`.
184    fn scan(input: ParseStream<'a>) -> syn::Result<Token> {
185        if input.is_empty() {
186            return Ok(Token::End);
187        }
188
189        // Rust performs parenthesis matching: leverage that here.
190        if input.peek(syn::token::Paren) {
191            let inner;
192            parenthesized!(inner in input);
193            return Regex::parse(&inner).map(|re| Token::Regex(Rc::new(re)));
194        }
195
196        // Scan '0' token
197        if let Ok(val) = input.parse::<syn::LitInt>() {
198            if let Ok(0) = val.base10_parse() {
199                return Ok(Token::Zero);
200            }
201        }
202
203        // Scan 'e' and names
204        if let Ok(name) = input.parse::<Path>() {
205            if name.is_ident("e") {
206                return Ok(Token::Epsilon);
207            } else {
208                let regex = Regex::Symbol(Symbol { name }.into());
209                return Ok(Token::Regex(Rc::new(regex)));
210            }
211        }
212
213        // Scan '~' token
214        if input.parse::<Token![~]>().is_ok() {
215            return Ok(Token::Neg);
216        }
217
218        // Scan '*' token
219        if input.parse::<Token![*]>().is_ok() {
220            return Ok(Token::Repeat);
221        }
222
223        // Scan '+' token
224        if input.parse::<Token![+]>().is_ok() {
225            return Ok(Token::Plus);
226        }
227
228        // Scan '?' token
229        if input.parse::<Token![?]>().is_ok() {
230            return Ok(Token::Optional);
231        }
232
233        // Scan '|' token
234        if input.parse::<Token![|]>().is_ok() {
235            return Ok(Token::Or);
236        }
237
238        // Scan '&' token
239        if input.parse::<Token![&]>().is_ok() {
240            return Ok(Token::And);
241        }
242
243        Err(syn::Error::new(
244            input.span(),
245            "expected '0', 'e', '~', '*', '+', '?', '|', '&', '(' or label here.",
246        ))
247    }
248
249    /// Peeks the first token in the stream.
250    fn peek(&self) -> &Token {
251        &self.top
252    }
253
254    /// Advances lexer to the next token.
255    fn next(&mut self) -> syn::Result<Token> {
256        Ok(mem::replace(&mut self.top, Self::scan(self.input)?))
257    }
258
259    /// Returns the current position in the stream (mainly for emitting errors).
260    fn span(&self) -> proc_macro2::Span {
261        self.input.span()
262    }
263}
264
265/// Enumeration of the states of the parser.
266///
267/// The state names are chosen to represent the items currently on top of the stack.
268/// For example `FooBar` means that a `Bar` symbol is on top of the stack, and a `Foo` right below.
269/// Ordinal numbers correspond to the state numbers in top-level docs.
270///
271/// The documentation of each item contains its closure set.
272/// If not documented, the lookahead is `$/*/+/?/0/e/l/~/|/&` (i.e., all tokens).
273#[derive(Clone, Copy, Debug)]
274enum ParseState {
275    /// Initial state of the parser.
276    ///
277    /// Closure set:
278    /// - [S -> .E, $]
279    /// - [E -> .0]
280    /// - [E -> .e]
281    /// - [E -> .l]
282    /// - [E -> .~ E]
283    /// - [E -> .E *]
284    /// - [E -> .E +]
285    /// - [E -> .E ?]
286    /// - [E -> .E E]
287    /// - [E -> .E | E]
288    /// - [E -> .E & E]
289    Initial = 0,
290    /// State after reducing a top-level regular expression.
291    /// Might accept when the end of the stream is reached, or continue parsing a infix/post-fix operation.
292    ///
293    /// Closure set:    
294    /// - [S -> E., $]
295    /// - [E -> E.*]
296    /// - [E -> E.+]
297    /// - [E -> E.?]
298    /// - [E -> E.E]
299    /// - [E -> E.| E]
300    /// - [E -> E.& E]
301    /// - [E -> .0]
302    /// - [E -> .e]
303    /// - [E -> .l]
304    /// - [E -> .~ E]
305    /// - [E -> .E *]
306    /// - [E -> .E +]
307    /// - [E -> .E ?]
308    /// - [E -> .E E]
309    /// - [E -> .E | E]
310    /// - [E -> .E & E]
311    Regex = 1,
312    /// State after shifting a `0`: will always reduce using `E -> 0`.
313    ///
314    /// Closure set:
315    /// - [E -> 0.]
316    Zero = 2,
317    /// State after shifting a `e`: will always reduce using `E -> e`.
318    ///
319    /// Closure set:
320    /// - [E -> e.]
321    Epsilon = 3,
322    /// State after shifting a `l` or parenthesized expression: will always reduce using `E -> l` or `E -> ( E )`.
323    ///
324    /// Closure set:
325    /// - [E -> l.]
326    /// - [E -> ( E ).] (manually added)
327    Symbol = 4,
328    /// State after shifting a `~`.
329    ///
330    /// Closure set:
331    /// - [E -> ~.E]
332    /// - [E -> .0]
333    /// - [E -> .e]
334    /// - [E -> .l]
335    /// - [E -> .~ E]
336    /// - [E -> .E *]
337    /// - [E -> .E +]
338    /// - [E -> .E ?]
339    /// - [E -> .E E]
340    /// - [E -> .E | E]
341    /// - [E -> .E & E]
342    Neg = 5,
343    /// State after shifting a `*`.
344    /// Will always reduce using `E -> E *`.
345    ///
346    /// Closure set:
347    /// - [E -> E *.]
348    RegexRepeat = 6,
349    /// State after shifting a `+`.
350    /// Will always reduce using `E -> E +`.
351    ///
352    /// Closure set:
353    /// - [E -> E +.]
354    RegexPlus = 7,
355    /// State after shifting a `?`.
356    /// Will always reduce using `E -> E ?`.
357    ///
358    /// Closure set:
359    /// - [E -> E ?.]
360    RegexOptional = 8,
361    /// State two fully reduced regular expressions are at the top of the stack.
362    /// Can further reduce concat operations, or shift inputs with higher priority.
363    ///
364    /// Closure set:
365    /// - [E -> E E.]
366    /// - [E -> E.*]
367    /// - [E -> E.+]
368    /// - [E -> E.?]
369    /// - [E -> E.E]
370    /// - [E -> E.| E]
371    /// - [E -> E.& E]
372    /// - [E -> .0]
373    /// - [E -> .e]
374    /// - [E -> .l]
375    /// - [E -> .~ E]
376    /// - [E -> .E *]
377    /// - [E -> .E +]
378    /// - [E -> .E ?]
379    /// - [E -> .E E]
380    /// - [E -> .E | E]
381    /// - [E -> .E & E]
382    RegexRegex = 9,
383    /// State after shifting a `|`.
384    /// Can reduce using `E -> E | E`, or shift inputs with higher priority.
385    ///
386    /// Closure set:
387    /// - [E -> E |.E]
388    /// - [E -> .0]
389    /// - [E -> .e]
390    /// - [E -> .l]
391    /// - [E -> .~ E]
392    /// - [E -> .E *]
393    /// - [E -> .E +]
394    /// - [E -> .E ?]
395    /// - [E -> .E E]
396    /// - [E -> .E | E]
397    /// - [E -> .E & E]
398    RegexOr = 10,
399    /// State after shifting a `&`.
400    /// Can reduce using `E -> E & E`, or shift inputs with higher priority.
401    ///
402    /// Closure set:
403    /// - [E -> E &.E]
404    /// - [E -> .0]
405    /// - [E -> .e]
406    /// - [E -> .l]
407    /// - [E -> .~ E]
408    /// - [E -> .E *]
409    /// - [E -> .E +]
410    /// - [E -> .E ?]
411    /// - [E -> .E E]
412    /// - [E -> .E | E]
413    /// - [E -> .E & E]
414    RegexAnd = 11,
415    /// State when a negation operator and a RE are at the top of the stack.
416    /// Can reduce using the `E -> ~ E` production, or give an ambiguity error with other unary operators.
417    ///
418    /// Closure set:
419    /// - [E -> ~ E.]
420    /// - [E -> E.*]
421    /// - [E -> E.+]
422    /// - [E -> E.?]
423    /// - [E -> E.E]
424    /// - [E -> E.| E]
425    /// - [E -> E.& E]
426    /// - [E -> .0]
427    /// - [E -> .e]
428    /// - [E -> .l]
429    /// - [E -> .~ E]
430    /// - [E -> .E *]
431    /// - [E -> .E +]
432    /// - [E -> .E ?]
433    /// - [E -> .E E]
434    /// - [E -> .E | E]
435    /// - [E -> .E & E]
436    NegRegex = 12,
437    /// State when an RE, an or-operator and another RE are at the top of the stack.
438    /// Can reduce using the `E -> E | E` production, or shift higher priority operators.
439    ///
440    /// Closure set:
441    /// - [E -> E | E.]
442    /// - [E -> E.*]
443    /// - [E -> E.+]
444    /// - [E -> E.?]
445    /// - [E -> E.E]
446    /// - [E -> E.| E]
447    /// - [E -> E.& E]
448    /// - [E -> .0]
449    /// - [E -> .e]
450    /// - [E -> .l]
451    /// - [E -> .~ E]
452    /// - [E -> .E *]
453    /// - [E -> .E +]
454    /// - [E -> .E ?]
455    /// - [E -> .E E]
456    /// - [E -> .E | E]
457    /// - [E -> .E & E]
458    RegexOrRegex = 13,
459    /// State when an RE, an and-operator and another RE are at the top of the stack.
460    /// Can reduce using the `E -> E & E` production, or shift higher priority operators.
461    ///
462    /// Closure set:
463    /// - [E -> E & E.]
464    /// - [E -> E.*]
465    /// - [E -> E.+]
466    /// - [E -> E.?]
467    /// - [E -> E.E]
468    /// - [E -> E.| E]
469    /// - [E -> E.& E]
470    /// - [E -> .0]
471    /// - [E -> .e]
472    /// - [E -> .l]
473    /// - [E -> .~ E]
474    /// - [E -> .E *]
475    /// - [E -> .E +]
476    /// - [E -> .E ?]
477    /// - [E -> .E E]
478    /// - [E -> .E | E]
479    /// - [E -> .E & E]
480    RegexAndRegex = 14,
481}
482
483/// Segment of the parse stack.
484///
485/// In regular LR parsing literature, the parse stack is an alternating sequence of states and symbols.
486/// It starts with the initial state [`ParseState::Initial`], and always (except between a reduction and a goto) has a state on top.
487/// We differ from this representation in two ways:
488/// 1. We do not explicitly push the initial state to the stack, but implicitly assume it in [`Parser::state`].
489/// 2. We combine the reduce and goto actions.
490///    The reduce actions do not push the result to the stack, but pass it to [`Parser::goto`] directly.
491///    [`Parser::goto`] computes the next state based on the top of the stack and the result that was passed in, and pushes the result and the new state to the stack.
492///
493/// This allows us to combine the stack elements as 2-tuples, getting rid of a lot of runtime checks & pattern matching.
494#[derive(Clone)]
495struct StackSegment {
496    state: ParseState,
497    token: Token,
498}
499
500impl Debug for StackSegment {
501    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
502        write!(f, "<{:?}, {:?}>", self.state, self.token)
503    }
504}
505
506struct Parser<'a> {
507    lexer: Lexer<'a>,
508    parse_stack: Vec<StackSegment>,
509}
510
511impl<'a> Parser<'a> {
512    /// Initializer a new parser for the underlying token stream.
513    fn init(input: ParseStream<'a>) -> syn::Result<Self> {
514        let lexer = Lexer {
515            input,
516            top: Lexer::scan(input)?,
517        };
518        Ok(Self {
519            lexer,
520            parse_stack: vec![],
521        })
522    }
523
524    /// Shift action (standard in LR parsers).
525    ///
526    /// Pushes current token to the stack, and transitions to `new_state`.
527    fn shift(&mut self, new_state: ParseState) -> syn::Result<bool> {
528        let next_token = self.lexer.next()?;
529        self.parse_stack.push(StackSegment {
530            state: new_state,
531            token: next_token,
532        });
533        Ok(false) // not (yet) in accepting state
534    }
535
536    /// Get last [state](ParseState) shifted to the stack.
537    ///
538    /// Defaults to the (implicit) [initial state](ParseState::Initial) in case of an empty stack.
539    fn state(&self) -> ParseState {
540        self.parse_stack
541            .last()
542            .map(|ss| ss.state)
543            .unwrap_or(ParseState::Initial) // implicitly assume State 0 on bottom of stack.
544    }
545
546    /// Pop last state and token from the stack.
547    fn pop_stack(&mut self) -> syn::Result<StackSegment> {
548        if let Some(segment) = self.parse_stack.pop() {
549            Ok(segment)
550        } else {
551            self.internal_error("expected non-empty stack")
552        }
553    }
554
555    /// Get last [token](Token) shifted to the stack.
556    fn pop_stack_token(&mut self) -> syn::Result<Token> {
557        self.pop_stack().map(|ss| ss.token)
558    }
559
560    /// Get last token shifted to the stack, and assert it is a [`Token::Regex`] variant.
561    fn pop_stack_regex(&mut self) -> syn::Result<Rc<Regex>> {
562        if let Token::Regex(regex) = self.pop_stack_token()? {
563            Ok(regex)
564        } else {
565            self.internal_error("expected regex on top of stack")
566        }
567    }
568
569    /// Get last token shifted to the stack, and assert it is a particular `expected` [`Token`].
570    fn expect_token(&mut self, expected: Token) -> syn::Result<()> {
571        if expected == self.pop_stack_token()? {
572            Ok(())
573        } else {
574            self.internal_error("expected other token on top of stack")
575        }
576    }
577
578    // reduce atoms
579
580    fn reduce_atom(&mut self, expected: Token, result: Regex) -> syn::Result<bool> {
581        self.expect_token(expected)?;
582        self.goto(Rc::new(result))
583    }
584
585    /// Reduce `E -> 0` preduction
586    fn reduce_zero(&mut self) -> syn::Result<bool> {
587        self.reduce_atom(Token::Zero, Regex::EmptySet)
588    }
589
590    /// Reduce `E -> e` preduction.
591    fn reduce_epsilon(&mut self) -> syn::Result<bool> {
592        self.reduce_atom(Token::Epsilon, Regex::EmptyString)
593    }
594
595    /// Reduce `E -> l` and `E -> ( E )` productions (NO-OP: symbol is shared).
596    fn reduce_symbol(&mut self) -> syn::Result<bool> {
597        let re = self.pop_stack_regex()?;
598        self.goto(re)
599    }
600
601    // reduce unary operators
602
603    fn reduce_unary_prefix(
604        &mut self,
605        expected: Token,
606        build: impl Fn(Rc<Regex>) -> Regex,
607    ) -> syn::Result<bool> {
608        let re = self.pop_stack_regex()?;
609        self.reduce_atom(expected, build(re))
610    }
611
612    /// Reduce `E -> ~ E` production.
613    fn reduce_neg(&mut self) -> syn::Result<bool> {
614        self.reduce_unary_prefix(Token::Neg, Regex::Complement)
615    }
616
617    fn reduce_unary_postfix(
618        &mut self,
619        expected: Token,
620        build: impl Fn(Rc<Regex>) -> Regex,
621    ) -> syn::Result<bool> {
622        self.expect_token(expected)?;
623        let re = self.pop_stack_regex()?;
624        self.goto(Rc::new(build(re)))
625    }
626
627    /// Reduce `E -> E *` production.
628    fn reduce_repeat(&mut self) -> syn::Result<bool> {
629        self.reduce_unary_postfix(Token::Repeat, Regex::Repeat)
630    }
631
632    /// Reduce `E -> E +` production.
633    ///
634    /// Immediately desugars `E1 +` to `E1 E1*`.
635    fn reduce_plus(&mut self) -> syn::Result<bool> {
636        self.reduce_unary_postfix(Token::Plus, |re| {
637            Regex::Concat(re.clone(), Rc::new(Regex::Repeat(re)))
638        })
639    }
640
641    /// Reduce `E -> E ?` production.
642    ///
643    /// Immediately desugars `E1 ?` to `e | E1`.
644    fn reduce_optional(&mut self) -> syn::Result<bool> {
645        self.reduce_unary_postfix(Token::Optional, |re| {
646            Regex::Or(Rc::new(Regex::EmptyString), re)
647        })
648    }
649
650    // reduce binary operators
651
652    /// Reduce `E -> E E` production.
653    fn reduce_concat(&mut self) -> syn::Result<bool> {
654        // right-hand-side is on top of stack ...
655        let r = self.pop_stack_regex()?;
656        let l = self.pop_stack_regex()?;
657        self.goto(Rc::new(Regex::Concat(l, r)))
658    }
659
660    fn reduce_binary_infix(
661        &mut self,
662        expected: Token,
663        build: impl Fn(Rc<Regex>, Rc<Regex>) -> Regex,
664    ) -> syn::Result<bool> {
665        // right-hand-side is on top of stack ...
666        let r = self.pop_stack_regex()?;
667        self.expect_token(expected)?;
668        let l = self.pop_stack_regex()?;
669        self.goto(Rc::new(build(l, r)))
670    }
671
672    /// Reduce `E -> E | E` production.
673    fn reduce_or(&mut self) -> syn::Result<bool> {
674        self.reduce_binary_infix(Token::Or, Regex::Or)
675    }
676
677    /// Reduce `E -> E & E` production.
678    fn reduce_and(&mut self) -> syn::Result<bool> {
679        self.reduce_binary_infix(Token::And, Regex::And)
680    }
681
682    /// Implement GOTO-table.
683    ///
684    /// Called after a reduce operation.
685    /// The `result` argument is the result of the reduction (which is always of sort `E`).
686    ///
687    /// It peeks the state `S` on top of the stack (assuming `0` in case of an empty stack).
688    /// Then it finds the `GOTO[S, E]` entry, and pushes the result and the new state to the stack.
689    fn goto(&mut self, result: Rc<Regex>) -> syn::Result<bool> {
690        // compute state that has a new regex pushed to the stack
691        let state = match self.state() {
692            ParseState::Initial => ParseState::Regex,
693            ParseState::Regex => ParseState::RegexRegex,
694            ParseState::Neg => ParseState::NegRegex,
695            ParseState::RegexRegex => ParseState::RegexRegex,
696            ParseState::RegexOr => ParseState::RegexOrRegex,
697            ParseState::RegexAnd => ParseState::RegexAndRegex,
698            ParseState::NegRegex => ParseState::RegexRegex, // redundant: negations are always reduced eagerly
699            ParseState::RegexOrRegex => ParseState::RegexRegex,
700            ParseState::RegexAndRegex => ParseState::RegexRegex,
701            _ => return self.internal_error("cannot perform 'goto' action on current state"),
702        };
703        self.parse_stack.push(StackSegment {
704            state,
705            token: Token::Regex(result),
706        });
707        Ok(false)
708    }
709
710    /// Accepts the input.
711    fn accept() -> syn::Result<bool> {
712        Ok(true)
713    }
714
715    fn build_error(&self, msg: &str) -> syn::Error {
716        syn::Error::new(self.lexer.span(), msg)
717    }
718
719    fn error<T>(&self, msg: &str) -> syn::Result<T> {
720        Err(self.build_error(msg))
721    }
722
723    fn internal_error<T>(&self, msg: &str) -> syn::Result<T> {
724        Err(self.build_error(&format!("internal parsing error: {}", msg)))
725    }
726
727    /// Implementation of the ACTION-table.
728    fn step(&mut self) -> syn::Result<bool> {
729        match self.state() {
730            ParseState::Initial | ParseState::Neg | ParseState::RegexOr | ParseState::RegexAnd => {
731                match self.lexer.peek() {
732                    Token::Zero => self.shift(ParseState::Zero),
733                    Token::Epsilon => self.shift(ParseState::Epsilon),
734                    Token::Regex(_) => self.shift(ParseState::Symbol),
735                    Token::Neg => self.shift(ParseState::Neg),
736                    _ => self.error(
737                        "expected '0', 'e', '~', label or parenthesized regular expression here",
738                    ),
739                }
740            }
741            ParseState::Regex => match self.lexer.peek() {
742                Token::Zero => self.shift(ParseState::Zero),
743                Token::Epsilon => self.shift(ParseState::Epsilon),
744                Token::Regex(_) => self.shift(ParseState::Symbol),
745                Token::Neg => self.shift(ParseState::Neg),
746                Token::Repeat => self.shift(ParseState::RegexRepeat),
747                Token::Plus => self.shift(ParseState::RegexPlus),
748                Token::Optional => self.shift(ParseState::RegexOptional),
749                Token::Or => self.shift(ParseState::RegexOr),
750                Token::And => self.shift(ParseState::RegexAnd),
751                Token::End => Self::accept(),
752            },
753            ParseState::Zero => self.reduce_zero(),
754            ParseState::Epsilon => self.reduce_epsilon(),
755            ParseState::Symbol => self.reduce_symbol(),
756            ParseState::RegexRepeat => self.reduce_repeat(),
757            ParseState::RegexPlus => self.reduce_plus(),
758            ParseState::RegexOptional => self.reduce_optional(),
759            ParseState::RegexRegex => match self.lexer.peek() {
760                // concat is right-associative, so shift in case of a new literal/pre-fix operator
761                Token::Zero => self.shift(ParseState::Zero),
762                Token::Epsilon => self.shift(ParseState::Epsilon),
763                Token::Regex(_) => self.shift(ParseState::Symbol),
764                Token::Neg => self.shift(ParseState::Neg),
765                // post-fix operators have priority over concat, so shift here
766                Token::Repeat => self.shift(ParseState::RegexRepeat),
767                Token::Plus => self.shift(ParseState::RegexPlus),
768                Token::Optional => self.shift(ParseState::RegexOptional),
769                // concat has priority over '|' and  '&', so reduce here
770                Token::Or => self.reduce_concat(),
771                Token::And => self.reduce_concat(),
772                Token::End => self.reduce_concat(),
773            },
774            ParseState::NegRegex => match self.lexer.peek() {
775                // neg has top priority, but is ambiguous with posf-fix operators.
776                Token::Zero => self.reduce_neg(),
777                Token::Epsilon => self.reduce_neg(),
778                Token::Regex(_) => self.reduce_neg(),
779                Token::Neg => self.reduce_neg(),
780                Token::Repeat => self.error(
781                    "ambiguous regex: simultaneous use of '~' prefix and '*' postfix operator",
782                ),
783                Token::Plus => self.error(
784                    "ambiguous regex: simultaneous use of '~' prefix and '+' postfix operator",
785                ),
786                Token::Optional => self.error(
787                    "ambiguous regex: simultaneous use of '~' prefix and '?' postfix operator",
788                ),
789                Token::Or => self.reduce_neg(),
790                Token::And => self.reduce_neg(),
791                Token::End => self.reduce_neg(),
792            },
793            ParseState::RegexOrRegex => match self.lexer.peek() {
794                // or has lowest priority, so shift in any case
795                Token::Zero => self.shift(ParseState::Zero),
796                Token::Epsilon => self.shift(ParseState::Epsilon),
797                Token::Regex(_) => self.shift(ParseState::Symbol),
798                Token::Neg => self.shift(ParseState::Neg),
799                Token::Repeat => self.shift(ParseState::RegexRepeat),
800                Token::Plus => self.shift(ParseState::RegexPlus),
801                Token::Optional => self.shift(ParseState::RegexOptional),
802                // or is left-associative, so reduce eagerly
803                Token::Or => self.reduce_or(),
804                Token::And => self.shift(ParseState::RegexAnd),
805                Token::End => self.reduce_or(),
806            },
807            ParseState::RegexAndRegex => match self.lexer.peek() {
808                // '&' has priority over '|' only, so shift in any other case
809                Token::Zero => self.shift(ParseState::Zero),
810                Token::Epsilon => self.shift(ParseState::Epsilon),
811                Token::Regex(_) => self.shift(ParseState::Symbol),
812                Token::Neg => self.shift(ParseState::Neg),
813                Token::Repeat => self.shift(ParseState::RegexRepeat),
814                Token::Plus => self.shift(ParseState::RegexPlus),
815                Token::Optional => self.shift(ParseState::RegexOptional),
816                // has priority over '|'
817                Token::Or => self.reduce_and(),
818                // and is left-recursive, so reduce eagerly
819                Token::And => self.reduce_and(),
820                Token::End => self.reduce_and(),
821            },
822        }
823    }
824
825    /// Extracts parsing result from the stack.
826    fn finalize(mut self) -> syn::Result<Regex> {
827        let regex = self.pop_stack_regex()?;
828        if self.parse_stack.is_empty() {
829            Ok(regex.deref().clone())
830        } else {
831            self.internal_error("residual input after parsing finished.")
832        }
833    }
834
835    /// Entry point: parses the `input` to a [`Regex`].
836    pub fn parse_regex(input: ParseStream) -> syn::Result<Regex> {
837        let mut parser = Parser::init(input)?;
838        let mut accept = false;
839        while !accept {
840            accept = parser.step()?
841        }
842        parser.finalize()
843    }
844}
845
846impl Parse for Regex {
847    fn parse(input: ParseStream) -> syn::Result<Self> {
848        Parser::parse_regex(input)
849    }
850}
851
852#[cfg(test)]
853mod tests {
854    // Most test cases derived from
855    // https://github.com/metaborg/nabl/blob/master/statix.test/syntax/regex.spt
856
857    use crate::{parse_regex, Regex::*};
858    use std::rc::Rc;
859
860    #[test]
861    fn test_symbols() {
862        assert_eq!(parse_regex("A").unwrap(), Symbol(Rc::new("A".into())));
863        assert_eq!(parse_regex("a").unwrap(), Symbol(Rc::new("a".into())));
864        assert_eq!(
865            parse_regex("CamelCase").unwrap(),
866            Symbol(Rc::new("CamelCase".into()))
867        );
868        assert_eq!(parse_regex("0").unwrap(), EmptySet);
869        assert_eq!(parse_regex("e").unwrap(), EmptyString);
870    }
871    #[test]
872    fn test_operators() {
873        assert_eq!(
874            parse_regex("A*").unwrap(),
875            Repeat(Rc::new(Symbol(Rc::new("A".into()))))
876        );
877        assert_eq!(
878            parse_regex("A+").unwrap(),
879            Concat(
880                Rc::new(Symbol(Rc::new("A".into()))),
881                Rc::new(Repeat(Rc::new(Symbol(Rc::new("A".into())))))
882            )
883        );
884        assert_eq!(
885            parse_regex("A?").unwrap(),
886            Or(Rc::new(EmptyString), Rc::new(Symbol(Rc::new("A".into()))))
887        );
888        assert_eq!(
889            parse_regex("A B").unwrap(),
890            Concat(
891                Rc::new(Symbol(Rc::new("A".into()))),
892                Rc::new(Symbol(Rc::new("B".into())))
893            )
894        );
895        assert_eq!(
896            parse_regex("A | B").unwrap(),
897            Or(
898                Rc::new(Symbol(Rc::new("A".into()))),
899                Rc::new(Symbol(Rc::new("B".into())))
900            )
901        );
902        assert_eq!(
903            parse_regex("A & B").unwrap(),
904            And(
905                Rc::new(Symbol(Rc::new("A".into()))),
906                Rc::new(Symbol(Rc::new("B".into())))
907            )
908        );
909    }
910    #[test]
911    fn test_disambiguation() {
912        // or left-associative
913        assert_eq!(
914            parse_regex("A | B | C").unwrap(),
915            Or(
916                Rc::new(Or(
917                    Rc::new(Symbol(Rc::new("A".into()))),
918                    Rc::new(Symbol(Rc::new("B".into())))
919                )),
920                Rc::new(Symbol(Rc::new("C".into())))
921            )
922        );
923
924        // closure < concat
925        assert_eq!(
926            parse_regex("A B*").unwrap(),
927            Concat(
928                Rc::new(Symbol(Rc::new("A".into()))),
929                Rc::new(Repeat(Rc::new(Symbol(Rc::new("B".into()))))),
930            )
931        );
932
933        // nested post-fix operators
934        assert_eq!(
935            parse_regex("A*?+").unwrap(),
936            parse_regex("((A*)?)+").unwrap()
937        );
938
939        // not & closure < or
940        assert_eq!(
941            parse_regex("~A | B*").unwrap(),
942            Or(
943                Rc::new(Complement(Rc::new(Symbol(Rc::new("A".into()))))),
944                Rc::new(Repeat(Rc::new(Symbol(Rc::new("B".into())))))
945            )
946        );
947
948        // and < or
949        assert_eq!(
950            parse_regex("~A | B* & C?").unwrap(),
951            parse_regex("(~A) | ((B*) & C?)").unwrap()
952        );
953    }
954
955    #[test]
956    fn neg_should_have_argument() {
957        assert!(parse_regex("A ~").is_err());
958    }
959
960    #[test]
961    fn postfix_op_should_have_lhs() {
962        assert!(parse_regex("* A").is_err());
963        assert!(parse_regex("+ A").is_err());
964        assert!(parse_regex("? A").is_err());
965    }
966
967    #[test]
968    fn infix_op_should_have_lhs() {
969        assert!(parse_regex("& A").is_err());
970        assert!(parse_regex("| A").is_err());
971    }
972
973    #[test]
974    fn infix_op_should_have_rhs() {
975        assert!(parse_regex("A &").is_err());
976        assert!(parse_regex("A |").is_err());
977    }
978
979    #[test]
980    fn infix_op_no_double() {
981        assert!(parse_regex("A & & A A").is_err());
982        assert!(parse_regex("A | | A A").is_err());
983        assert!(parse_regex("A | & A A").is_err());
984        assert!(parse_regex("A & | A A").is_err());
985    }
986
987    #[test]
988    fn infix_postfix_mix() {
989        assert!(parse_regex("A & * A A").is_err());
990        assert!(parse_regex("A & + A A").is_err());
991        assert!(parse_regex("A & ? A A").is_err());
992        assert!(parse_regex("A | * A A").is_err());
993        assert!(parse_regex("A | + A A").is_err());
994        assert!(parse_regex("A | ? A A").is_err());
995    }
996}
scopegraphs_regular_expressions/parse.rs

scopegraphs_regular_expressions/
parse.rs