monster_regex/parser/
mod.rs

1use crate::flags::Flags;
2use std::fmt;
3
4/// Represents a node in the Abstract Syntax Tree (AST) of a regular expression.
5#[derive(Debug, Clone, PartialEq)]
6pub enum AstNode {
7    /// A literal character match.
8    Literal(char),
9
10    /// A character class (e.g., `\d`, `[a-z]`, `.`).
11    CharClass(CharClass),
12
13    /// Start of string (or line in multiline mode) anchor `^`.
14    StartAnchor,
15    /// End of string (or line in multiline mode) anchor `$`.
16    EndAnchor,
17    /// Word boundary anchor `\b`.
18    WordBoundary,
19    /// Start of word anchor `\<`.
20    StartWord,
21    /// End of word anchor `\>`.
22    EndWord,
23    /// Sets the start of the match `\zs`.
24    SetMatchStart,
25    /// Sets the end of the match `\ze`.
26    SetMatchEnd,
27
28    /// Zero or more repetitions `*`.
29    ZeroOrMore {
30        /// The node being repeated.
31        node: Box<AstNode>,
32        /// Whether the quantifier is greedy (default) or lazy (`?`).
33        greedy: bool,
34    },
35    /// One or more repetitions `+`.
36    OneOrMore {
37        /// The node being repeated.
38        node: Box<AstNode>,
39        /// Whether the quantifier is greedy (default) or lazy (`?`).
40        greedy: bool,
41    },
42    /// Zero or one repetition `?`.
43    Optional {
44        /// The node being repeated.
45        node: Box<AstNode>,
46        /// Whether the quantifier is greedy (default) or lazy (`?`).
47        greedy: bool,
48    },
49    /// Exact number of repetitions `{n}`.
50    Exact {
51        /// The node being repeated.
52        node: Box<AstNode>,
53        /// The exact count.
54        count: usize,
55    },
56    /// Range of repetitions `{n,m}` or `{n,}`.
57    Range {
58        /// The node being repeated.
59        node: Box<AstNode>,
60        /// The minimum count.
61        min: usize,
62        /// The maximum count (None means infinite).
63        max: Option<usize>,
64        /// Whether the quantifier is greedy (default) or lazy (`?`).
65        greedy: bool,
66    },
67
68    /// A capturing or non-capturing group `(...)`.
69    Group {
70        /// The sequence of nodes inside the group.
71        nodes: Vec<AstNode>,
72        /// The name of the group, if it is a named capture `(?<name>...)`.
73        name: Option<String>,
74        /// Whether this group captures text.
75        capture: bool,
76        /// The index of the capture group (1-based), if capturing.
77        index: Option<usize>,
78    },
79    /// Alternation `|`.
80    Alternation(Vec<Vec<AstNode>>),
81
82    /// Backreference to a captured group `\n`.
83    Backref(usize),
84
85    /// Lookahead assertion `(?>=...)` or `(?>!...)`.
86    LookAhead {
87        /// The sequence of nodes to check ahead.
88        nodes: Vec<AstNode>,
89        /// True for positive lookahead, false for negative.
90        positive: bool,
91    },
92    /// Lookbehind assertion `(?<=...)` or `(?<!...)`.
93    LookBehind {
94        /// The sequence of nodes to check behind.
95        nodes: Vec<AstNode>,
96        /// True for positive lookbehind, false for negative.
97        positive: bool,
98    },
99}
100
101/// Represents a class of characters.
102#[derive(Debug, Clone, PartialEq)]
103pub enum CharClass {
104    // Standard classes
105    /// Digit `\d` (`[0-9]`).
106    Digit,
107    /// Non-digit `\D`.
108    NonDigit,
109    /// Word character `\w` (`[a-zA-Z0-9_]`).
110    Word,
111    /// Non-word character `\W`.
112    NonWord,
113    /// Whitespace `\s` (`[ \t\r\n\f\v]`).
114    Whitespace,
115    /// Non-whitespace `\S`.
116    NonWhitespace,
117
118    // Extended classes
119    /// Lowercase character `\l`.
120    Lowercase,
121    /// Non-lowercase character `\L`.
122    NonLowercase,
123    /// Uppercase character `\u`.
124    Uppercase,
125    /// Non-uppercase character `\U`.
126    NonUppercase,
127    /// Hexadecimal digit `\x`.
128    Hex,
129    /// Non-hexadecimal digit `\X`.
130    NonHex,
131    /// Octal digit `\o`.
132    Octal,
133    /// Non-octal digit `\O`.
134    NonOctal,
135    /// Start of word character `\h`.
136    WordStart,
137    /// Non-start of word character `\H`.
138    NonWordStart,
139    /// Punctuation `\p`.
140    Punctuation,
141    /// Non-punctuation `\P`.
142    NonPunctuation,
143    /// Alphanumeric `\a`.
144    Alphanumeric,
145    /// Non-alphanumeric `\A`.
146    NonAlphanumeric,
147
148    // Custom sets
149    /// Custom character set `[...]`.
150    Set {
151        /// The ranges or characters included in the set.
152        chars: Vec<CharRange>,
153        /// Whether the set is negated `[^...]`.
154        negated: bool,
155    },
156
157    /// Dot `.` (matches any character except newline, or any character with `s` flag).
158    Dot,
159}
160
161/// A range of characters in a character set.
162#[derive(Debug, Clone, PartialEq)]
163pub struct CharRange {
164    /// Start of the range.
165    pub start: char,
166    /// End of the range.
167    pub end: char,
168}
169
170/// The recursive descent parser for the regex pattern.
171#[derive(Debug, Clone)]
172pub struct Parser {
173    input: Vec<char>,
174    pos: usize,
175    flags: Flags,
176    group_count: usize,
177}
178
179/// Errors that can occur during parsing.
180#[derive(Debug)]
181pub enum ParseError {
182    UnexpectedChar(char, usize),
183    UnexpectedEof,
184    InvalidQuantifier(String),
185    UnmatchedParen,
186    InvalidGroupName(String),
187    InvalidEscape(char),
188    InvalidCharClass,
189    DuplicateGroupName(String),
190    InvalidBackref(usize),
191    InvalidLineNumber(String),
192    InvalidGroup(String),
193}
194
195impl fmt::Display for ParseError {
196    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
197        match self {
198            ParseError::UnexpectedChar(c, pos) => {
199                write!(f, "Unexpected '{}' at position {}", c, pos)
200            }
201            ParseError::UnexpectedEof => write!(f, "Unexpected end of input"),
202            ParseError::InvalidQuantifier(s) => {
203                write!(f, "Invalid quantifier: {}", s)
204            }
205            ParseError::UnmatchedParen => write!(f, "Unmatched parenthesis"),
206            ParseError::InvalidGroupName(s) => {
207                write!(f, "Invalid group name: {}", s)
208            }
209            ParseError::InvalidEscape(c) => {
210                write!(f, "Invalid escape sequence: \\{}", c)
211            }
212            ParseError::InvalidCharClass => {
213                write!(f, "Invalid character class")
214            }
215            ParseError::DuplicateGroupName(s) => {
216                write!(f, "Duplicate group name: {}", s)
217            }
218            ParseError::InvalidBackref(n) => {
219                write!(f, "Invalid backreference: \\{}", n)
220            }
221            ParseError::InvalidLineNumber(s) => {
222                write!(f, "Invalid line number: {}", s)
223            }
224            ParseError::InvalidGroup(s) => {
225                write!(f, "Invalid group syntax: {}", s)
226            }
227        }
228    }
229}
230
231impl std::error::Error for ParseError {}
232
233impl Parser {
234    /// Creates a new parser for the given pattern.
235    pub fn new(pattern: &str, flags: Flags) -> Self {
236        Parser {
237            input: pattern.chars().collect(),
238            pos: 0,
239            flags,
240            group_count: 0,
241        }
242    }
243
244    /// Parses the pattern into an AST.
245    pub fn parse(&mut self) -> Result<Vec<AstNode>, ParseError> {
246        self.parse_alternation()
247    }
248
249    // Top level: handle |
250    fn parse_alternation(&mut self) -> Result<Vec<AstNode>, ParseError> {
251        let mut alternatives = vec![];
252        let mut current = self.parse_sequence()?;
253
254        while self.peek() == Some('|') {
255            self.consume()?;
256            alternatives.push(current);
257            current = self.parse_sequence()?;
258        }
259        alternatives.push(current);
260
261        if alternatives.len() == 1 {
262            Ok(alternatives.pop().unwrap())
263        } else {
264            Ok(vec![AstNode::Alternation(alternatives)])
265        }
266    }
267
268    fn skip_whitespace_and_comments(&mut self) {
269        if !self.flags.verbose {
270            return;
271        }
272        while self.pos < self.input.len() {
273            let ch = self.input[self.pos];
274            if ch.is_whitespace() {
275                self.pos += 1;
276            } else if ch == '#' {
277                self.pos += 1;
278                while self.pos < self.input.len() && self.input[self.pos] != '\n' {
279                    self.pos += 1;
280                }
281            } else {
282                break;
283            }
284        }
285    }
286
287    // Parse sequence of atoms with quantifiers
288    fn parse_sequence(&mut self) -> Result<Vec<AstNode>, ParseError> {
289        let mut nodes = vec![];
290
291        loop {
292            self.skip_whitespace_and_comments();
293            match self.current() {
294                Some(&'|') | Some(&')') | None => break,
295                _ => {
296                    let node = self.parse_atom()?;
297                    let node = self.apply_quantifier(node)?;
298                    nodes.push(node);
299                }
300            }
301        }
302
303        Ok(nodes)
304    }
305
306    // Parse a single atom (before quantifiers)
307    fn parse_atom(&mut self) -> Result<AstNode, ParseError> {
308        match self.current() {
309            None => Err(ParseError::UnexpectedEof),
310            Some(&'.') => {
311                self.consume()?;
312                Ok(AstNode::CharClass(CharClass::Dot))
313            }
314            Some(&'^') => {
315                self.consume()?;
316                Ok(AstNode::StartAnchor)
317            }
318            Some(&'$') => {
319                self.consume()?;
320                Ok(AstNode::EndAnchor)
321            }
322            Some(&'[') => self.parse_char_class(),
323            Some(&'(') => self.parse_group(),
324            Some(&'\\') => self.parse_escape(),
325            Some(&ch) => {
326                self.consume()?;
327                Ok(AstNode::Literal(ch))
328            }
329        }
330    }
331
332    // Parse \escape sequences
333    fn parse_escape(&mut self) -> Result<AstNode, ParseError> {
334        self.consume()?; // consume \
335
336        match self.current() {
337            None => Err(ParseError::UnexpectedEof),
338            Some(&'d') => {
339                self.consume()?;
340                Ok(AstNode::CharClass(CharClass::Digit))
341            }
342            Some(&'D') => {
343                self.consume()?;
344                Ok(AstNode::CharClass(CharClass::NonDigit))
345            }
346            Some(&'w') => {
347                self.consume()?;
348                Ok(AstNode::CharClass(CharClass::Word))
349            }
350            Some(&'W') => {
351                self.consume()?;
352                Ok(AstNode::CharClass(CharClass::NonWord))
353            }
354            Some(&'s') => {
355                self.consume()?;
356                Ok(AstNode::CharClass(CharClass::Whitespace))
357            }
358            Some(&'S') => {
359                self.consume()?;
360                Ok(AstNode::CharClass(CharClass::NonWhitespace))
361            }
362            Some(&'l') => {
363                self.consume()?;
364                Ok(AstNode::CharClass(CharClass::Lowercase))
365            }
366            Some(&'L') => {
367                self.consume()?;
368                Ok(AstNode::CharClass(CharClass::NonLowercase))
369            }
370            Some(&'u') => {
371                self.consume()?;
372                Ok(AstNode::CharClass(CharClass::Uppercase))
373            }
374            Some(&'U') => {
375                self.consume()?;
376                Ok(AstNode::CharClass(CharClass::NonUppercase))
377            }
378            Some(&'x') => {
379                self.consume()?;
380                Ok(AstNode::CharClass(CharClass::Hex))
381            }
382            Some(&'X') => {
383                self.consume()?;
384                Ok(AstNode::CharClass(CharClass::NonHex))
385            }
386            Some(&'o') => {
387                self.consume()?;
388                Ok(AstNode::CharClass(CharClass::Octal))
389            }
390            Some(&'O') => {
391                self.consume()?;
392                Ok(AstNode::CharClass(CharClass::NonOctal))
393            }
394            Some(&'h') => {
395                self.consume()?;
396                Ok(AstNode::CharClass(CharClass::WordStart))
397            }
398            Some(&'H') => {
399                self.consume()?;
400                Ok(AstNode::CharClass(CharClass::NonWordStart))
401            }
402            Some(&'p') => {
403                self.consume()?;
404                Ok(AstNode::CharClass(CharClass::Punctuation))
405            }
406            Some(&'P') => {
407                self.consume()?;
408                Ok(AstNode::CharClass(CharClass::NonPunctuation))
409            }
410            Some(&'a') => {
411                self.consume()?;
412                Ok(AstNode::CharClass(CharClass::Alphanumeric))
413            }
414            Some(&'A') => {
415                self.consume()?;
416                Ok(AstNode::CharClass(CharClass::NonAlphanumeric))
417            }
418            Some(&'b') => {
419                self.consume()?;
420                Ok(AstNode::WordBoundary)
421            }
422            Some(&'<') => {
423                self.consume()?;
424                Ok(AstNode::StartWord)
425            }
426            Some(&'>') => {
427                self.consume()?;
428                Ok(AstNode::EndWord)
429            }
430            Some(&'z') => {
431                self.consume()?;
432                match self.current() {
433                    Some(&'s') => {
434                        self.consume()?;
435                        Ok(AstNode::SetMatchStart)
436                    }
437                    Some(&'e') => {
438                        self.consume()?;
439                        Ok(AstNode::SetMatchEnd)
440                    }
441                    _ => Err(ParseError::InvalidEscape('z')),
442                }
443            }
444            Some(&c @ '0'..='9') => {
445                self.consume()?;
446                let digit = c.to_digit(10).unwrap() as usize;
447                Ok(AstNode::Backref(digit))
448            }
449            Some(&'n') => {
450                self.consume()?;
451                Ok(AstNode::Literal('\n'))
452            }
453            Some(&'t') => {
454                self.consume()?;
455                Ok(AstNode::Literal('\t'))
456            }
457            Some(&'r') => {
458                self.consume()?;
459                Ok(AstNode::Literal('\r'))
460            }
461            Some(&'f') => {
462                self.consume()?;
463                Ok(AstNode::Literal('\x0C'))
464            }
465            Some(&'v') => {
466                self.consume()?;
467                Ok(AstNode::Literal('\x0B'))
468            }
469            Some(&'\\') => {
470                self.consume()?;
471                Ok(AstNode::Literal('\\'))
472            }
473            Some(&ch) => {
474                self.consume()?;
475                // Literal escape (e.g. \*, \[)
476                Ok(AstNode::Literal(ch))
477            }
478        }
479    }
480
481    // Parse (group) or (?:non-capture) or (?<name>) or lookarounds
482    fn parse_group(&mut self) -> Result<AstNode, ParseError> {
483        self.consume()?; // consume (
484
485        if self.current() == Some(&'?') {
486            self.consume()?;
487            self.parse_extended_group()
488        } else {
489            // Capturing group
490            self.group_count += 1;
491            let index = self.group_count;
492            let nodes = self.parse_alternation()?;
493            self.expect_close_paren()?;
494            Ok(AstNode::Group {
495                nodes,
496                name: None,
497                capture: true,
498                index: Some(index),
499            })
500        }
501    }
502
503    fn parse_extended_group(&mut self) -> Result<AstNode, ParseError> {
504        match self.current() {
505            Some(&':') => {
506                self.consume()?;
507                let nodes = self.parse_alternation()?;
508                self.expect_close_paren()?;
509                Ok(AstNode::Group {
510                    nodes,
511                    name: None,
512                    capture: false,
513                    index: None,
514                })
515            }
516            Some(&'<') => {
517                self.consume()?;
518                // Check for lookbehind
519                match self.current() {
520                    Some(&'=') => {
521                        self.consume()?;
522                        let nodes = self.parse_alternation()?;
523                        self.expect_close_paren()?;
524                        Ok(AstNode::LookBehind {
525                            nodes,
526                            positive: true,
527                        })
528                    }
529                    Some(&'!') => {
530                        self.consume()?;
531                        let nodes = self.parse_alternation()?;
532                        self.expect_close_paren()?;
533                        Ok(AstNode::LookBehind {
534                            nodes,
535                            positive: false,
536                        })
537                    }
538                    _ => {
539                        // Named capture (?<name>...)
540                        let name = self.parse_group_name()?;
541                        if self.current() != Some(&'>') {
542                            return Err(ParseError::InvalidGroupName("expected '>'".to_string()));
543                        }
544                        self.consume()?;
545
546                        self.group_count += 1;
547                        let index = self.group_count;
548
549                        let nodes = self.parse_alternation()?;
550                        self.expect_close_paren()?;
551                        Ok(AstNode::Group {
552                            nodes,
553                            name: Some(name),
554                            capture: true,
555                            index: Some(index),
556                        })
557                    }
558                }
559            }
560            Some(&'>') => {
561                self.consume()?;
562                match self.current() {
563                    Some(&'=') => {
564                        self.consume()?;
565                        let nodes = self.parse_alternation()?;
566                        self.expect_close_paren()?;
567                        Ok(AstNode::LookAhead {
568                            nodes,
569                            positive: true,
570                        })
571                    }
572                    Some(&'!') => {
573                        self.consume()?;
574                        let nodes = self.parse_alternation()?;
575                        self.expect_close_paren()?;
576                        Ok(AstNode::LookAhead {
577                            nodes,
578                            positive: false,
579                        })
580                    }
581                    _ => Err(ParseError::InvalidGroup(
582                        "Expected = or ! after ?>".to_string(),
583                    )),
584                }
585            }
586            _ => Err(ParseError::InvalidGroup("Unknown extension ?".to_string())),
587        }
588    }
589
590    // Parse group name [a-zA-Z_][a-zA-Z0-9_]*
591    fn parse_group_name(&mut self) -> Result<String, ParseError> {
592        let mut name = String::new();
593
594        loop {
595            match self.current() {
596                Some(&c) if c.is_alphanumeric() || c == '_' => {
597                    name.push(c);
598                    self.consume()?;
599                }
600                _ => break,
601            }
602        }
603
604        if name.is_empty() {
605            return Err(ParseError::InvalidGroupName("empty name".to_string()));
606        }
607
608        Ok(name)
609    }
610
611    // Parse [char class]
612    fn parse_char_class(&mut self) -> Result<AstNode, ParseError> {
613        self.consume()?; // consume [
614
615        let negated = if self.current() == Some(&'^') {
616            self.consume()?;
617            true
618        } else {
619            false
620        };
621
622        let mut ranges = vec![];
623
624        loop {
625            match self.current() {
626                None => return Err(ParseError::UnexpectedEof),
627                Some(&']') => {
628                    self.consume()?;
629                    break;
630                }
631                Some(&'\\') => {
632                    // Escaped char in class
633                    self.consume()?;
634                    match self.current() {
635                        Some(&c) => {
636                            self.consume()?;
637                            ranges.push(CharRange { start: c, end: c });
638                        }
639                        None => return Err(ParseError::UnexpectedEof),
640                    }
641                }
642                Some(&c) => {
643                    self.consume()?;
644                    // Check for range
645                    if self.current() == Some(&'-')
646                        && self.peek_ahead(1).is_some()
647                        && self.peek_ahead(1) != Some(&']')
648                    {
649                        self.consume()?;
650                        match self.current() {
651                            Some(&end) => {
652                                self.consume()?;
653                                ranges.push(CharRange { start: c, end });
654                            }
655                            None => return Err(ParseError::UnexpectedEof),
656                        }
657                    } else {
658                        ranges.push(CharRange { start: c, end: c });
659                    }
660                }
661            }
662        }
663
664        Ok(AstNode::CharClass(CharClass::Set {
665            chars: ranges,
666            negated,
667        }))
668    }
669
670    // Apply quantifiers: *, +, ?, {n}, {n,m}, etc
671    fn apply_quantifier(&mut self, node: AstNode) -> Result<AstNode, ParseError> {
672        self.skip_whitespace_and_comments();
673        match self.current() {
674            Some(&'*') => {
675                self.consume()?;
676                let greedy = self.current() != Some(&'?');
677                if !greedy {
678                    self.consume()?;
679                }
680                Ok(AstNode::ZeroOrMore {
681                    node: Box::new(node),
682                    greedy,
683                })
684            }
685            Some(&'+') => {
686                self.consume()?;
687                let greedy = self.current() != Some(&'?');
688                if !greedy {
689                    self.consume()?;
690                }
691                Ok(AstNode::OneOrMore {
692                    node: Box::new(node),
693                    greedy,
694                })
695            }
696            Some(&'?') => {
697                self.consume()?;
698                let greedy = self.current() != Some(&'?');
699                if !greedy {
700                    self.consume()?;
701                }
702                Ok(AstNode::Optional {
703                    node: Box::new(node),
704                    greedy,
705                })
706            }
707            Some(&'{') => self.parse_bounded_quantifier(node),
708            _ => Ok(node),
709        }
710    }
711
712    // Parse {n}, {n,}, {n,m}, {,m}
713    fn parse_bounded_quantifier(&mut self, node: AstNode) -> Result<AstNode, ParseError> {
714        self.consume()?; // consume {
715
716        // Parse min
717        let min = if self.current() == Some(&',') {
718            0
719        } else {
720            self.parse_number()?
721        };
722
723        match self.current() {
724            Some(&',') => {
725                self.consume()?;
726                // Parse max (optional)
727                let max = if self.current() == Some(&'}') {
728                    None
729                } else {
730                    Some(self.parse_number()?)
731                };
732
733                if self.current() != Some(&'}') {
734                    return Err(ParseError::InvalidQuantifier("expected '}'".to_string()));
735                }
736                self.consume()?;
737
738                let greedy = self.current() != Some(&'?');
739                if !greedy {
740                    self.consume()?;
741                }
742
743                Ok(AstNode::Range {
744                    node: Box::new(node),
745                    min,
746                    max,
747                    greedy,
748                })
749            }
750            Some(&'}') => {
751                self.consume()?;
752                Ok(AstNode::Exact {
753                    node: Box::new(node),
754                    count: min,
755                })
756            }
757            _ => Err(ParseError::InvalidQuantifier(
758                "expected ',' or '}'".to_string(),
759            )),
760        }
761    }
762
763    // Helper: parse a decimal number
764    fn parse_number(&mut self) -> Result<usize, ParseError> {
765        let mut num = 0;
766        let mut found = false;
767
768        while let Some(&c @ '0'..='9') = self.current() {
769            found = true;
770            num = num * 10 + (c.to_digit(10).unwrap() as usize);
771            self.consume()?;
772        }
773
774        if !found {
775            return Err(ParseError::InvalidLineNumber("expected digits".to_string()));
776        }
777
778        Ok(num)
779    }
780
781    fn expect_close_paren(&mut self) -> Result<(), ParseError> {
782        if self.current() != Some(&')') {
783            return Err(ParseError::UnmatchedParen);
784        }
785        self.consume()?;
786        Ok(())
787    }
788
789    // Helper: get current char without advancing
790    fn current(&self) -> Option<&char> {
791        self.input.get(self.pos)
792    }
793
794    // Helper: peek ahead n positions
795    fn peek_ahead(&self, n: usize) -> Option<&char> {
796        self.input.get(self.pos + n)
797    }
798
799    // Helper: peek next char
800    fn peek(&self) -> Option<char> {
801        self.input.get(self.pos).copied()
802    }
803
804    // Helper: consume current char and advance
805    fn consume(&mut self) -> Result<char, ParseError> {
806        match self.current() {
807            Some(&ch) => {
808                self.pos += 1;
809                Ok(ch)
810            }
811            None => Err(ParseError::UnexpectedEof),
812        }
813    }
814}