resharp_parser/
lib.rs

1//! Parser for resharp regex patterns.
2//!
3//! Converts regex pattern strings into the node representation used by resharp-algebra.
4
5#![warn(dead_code)]
6pub mod ast;
7use std::cell::{Cell, RefCell};
8
9use ast::{Ast, Concat, ErrorKind, GroupKind, LookaroundKind};
10use regex_syntax::{
11    ast::{
12        ClassAscii, ClassBracketed, ClassPerl, ClassSet, ClassSetBinaryOpKind, ClassSetItem,
13        ClassSetRange, ClassSetUnion, ClassUnicode, ClassUnicodeKind, ClassUnicodeOpKind,
14        HexLiteralKind, Literal, LiteralKind, Position, Span, SpecialLiteralKind,
15    },
16    hir::{
17        self,
18        translate::{Translator, TranslatorBuilder},
19    },
20    utf8::Utf8Sequences,
21};
22use resharp_algebra::NodeId;
23
24type TB<'s> = resharp_algebra::RegexBuilder;
25
26/// global pattern-level flags, set from `EngineOptions`.
27pub struct PatternFlags {
28    /// `\w`/`\d`/`\s` match full Unicode (true) or ASCII only (false).
29    pub unicode: bool,
30    /// `\w` covers all Unicode word chars including 3- and 4-byte sequences.
31    pub full_unicode: bool,
32    /// global case-insensitive matching.
33    pub case_insensitive: bool,
34    /// `.` matches `\n` (behaves like `_`).
35    pub dot_matches_new_line: bool,
36    /// allow whitespace and `#` comments in the pattern.
37    pub ignore_whitespace: bool,
38    /// ASCII `\w`/`\d`/`\s` tables, but
39    /// negated perl classes (`\W`/`\D`/`\S`) and `.` match a full codepoint
40    pub ascii_perl_classes: bool,
41}
42
43// arbitrary safeguards, these will not prevent intentional DoS patterns
44// more to protect you from shooting yourself in the foot
45const REPETITION_COUNT_LIMIT: u32 = 2_000;
46const EXPANDED_AST_LIMIT: u64 = 50_000;
47const MAX_LIST_LEN: usize = 4_000;
48
49impl Default for PatternFlags {
50    fn default() -> Self {
51        Self {
52            unicode: true,
53            full_unicode: false,
54            case_insensitive: false,
55            dot_matches_new_line: false,
56            ignore_whitespace: false,
57            ascii_perl_classes: false,
58        }
59    }
60}
61
62#[derive(Clone, Copy, PartialEq, Debug)]
63enum WordCharKind {
64    Word,
65    NonWord,
66    MaybeWord,
67    MaybeNonWord,
68    Unknown,
69    Edge,
70}
71
72fn is_word_byte(b: u8) -> bool {
73    b.is_ascii_alphanumeric() || b == b'_'
74}
75
76#[derive(Clone, Debug, Eq, PartialEq)]
77enum Primitive {
78    Literal(Literal),
79    Assertion(ast::Assertion),
80    Dot(Span),
81    Top(Span),
82    Perl(ClassPerl),
83    Unicode(ClassUnicode),
84}
85
86impl Primitive {
87    fn span(&self) -> &Span {
88        match *self {
89            Primitive::Literal(ref x) => &x.span,
90            Primitive::Assertion(ref x) => &x.span,
91            Primitive::Dot(ref span) => span,
92            Primitive::Top(ref span) => span,
93            Primitive::Perl(ref x) => &x.span,
94            Primitive::Unicode(ref x) => &x.span,
95        }
96    }
97
98    fn into_ast(self) -> Ast {
99        match self {
100            Primitive::Literal(lit) => Ast::literal(lit),
101            Primitive::Assertion(assert) => Ast::assertion(assert),
102            Primitive::Dot(span) => Ast::dot(span),
103            Primitive::Top(span) => Ast::top(span),
104            Primitive::Perl(cls) => Ast::class_perl(cls),
105            Primitive::Unicode(cls) => Ast::class_unicode(cls),
106        }
107    }
108
109    fn into_class_set_item(self, p: &ResharpParser) -> Result<regex_syntax::ast::ClassSetItem> {
110        use self::Primitive::*;
111        use regex_syntax::ast::ClassSetItem;
112
113        match self {
114            Literal(lit) => Ok(ClassSetItem::Literal(lit)),
115            Perl(cls) => Ok(ClassSetItem::Perl(cls)),
116            Unicode(cls) => Ok(ClassSetItem::Unicode(cls)),
117            x => Err(p.error(*x.span(), ast::ErrorKind::ClassEscapeInvalid)),
118        }
119    }
120
121    fn into_class_literal(self, p: &ResharpParser) -> Result<Literal> {
122        use self::Primitive::*;
123
124        match self {
125            Literal(lit) => Ok(lit),
126            x => Err(p.error(*x.span(), ast::ErrorKind::ClassRangeLiteral)),
127        }
128    }
129}
130
131#[derive(Clone, Debug, Eq, PartialEq)]
132pub enum Either<Left, Right> {
133    Left(Left),
134    Right(Right),
135}
136
137#[derive(Clone, Debug, Eq, PartialEq)]
138pub struct ResharpError {
139    /// The kind of error.
140    pub kind: ErrorKind,
141    /// The original pattern that the parser generated the error from. Every
142    /// span in an error is a valid range into this string.
143    pattern: String,
144    /// The span of this error.
145    pub span: Span,
146}
147
148impl std::fmt::Display for ResharpError {
149    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
150        write!(f, "{:?}: {:?}", self.kind, self.span)
151    }
152}
153impl std::error::Error for ResharpError {}
154
155type Result<T> = core::result::Result<T, ResharpError>;
156
157#[derive(Clone, Debug)]
158enum GroupState {
159    /// This state is pushed whenever an opening group is found.
160    Group {
161        /// The concatenation immediately preceding the opening group.
162        concat: Concat,
163        /// The group that has been opened. Its sub-AST is always empty.
164        group: ast::Group,
165        /// Whether this group has the `x` flag enabled or not.
166        ignore_whitespace: bool,
167    },
168    /// This state is pushed whenever a new alternation branch is found. If
169    /// an alternation branch is found and this state is at the top of the
170    /// stack, then this state should be modified to include the new
171    /// alternation.
172    Alternation(ast::Alternation),
173    Intersection(ast::Intersection),
174}
175
176#[derive(Clone, Debug)]
177enum ClassState {
178    /// This state is pushed whenever an opening bracket is found.
179    Open {
180        /// The union of class items immediately preceding this class.
181        union: regex_syntax::ast::ClassSetUnion,
182        /// The class that has been opened. Typically this just corresponds
183        /// to the `[`, but it can also include `[^` since `^` indicates
184        /// negation of the class.
185        set: regex_syntax::ast::ClassBracketed,
186    },
187    /// This state is pushed when a operator is seen. When popped, the stored
188    /// set becomes the left hand side of the operator.
189    Op {
190        /// The type of the operation, i.e., &&, -- or ~~.
191        kind: regex_syntax::ast::ClassSetBinaryOpKind,
192        /// The left-hand side of the operator.
193        lhs: regex_syntax::ast::ClassSet,
194    },
195}
196
197/// RE# syntax parser based on the regex-syntax crate.
198pub struct ResharpParser<'s> {
199    perl_classes: Vec<(bool, regex_syntax::ast::ClassPerlKind, NodeId)>,
200    unicode_classes: resharp_algebra::UnicodeClassCache,
201    pub translator: regex_syntax::hir::translate::Translator,
202    pub pattern: &'s str,
203    pos: Cell<Position>,
204    capture_index: Cell<u32>,
205    octal: bool,
206    empty_min_range: bool,
207    ignore_whitespace: Cell<bool>,
208    dot_all: Cell<bool>,
209    global_unicode: bool,
210    global_full_unicode: bool,
211    global_ascii_perl: bool,
212    global_case_insensitive: bool,
213    comments: RefCell<Vec<ast::Comment>>,
214    stack_group: RefCell<Vec<GroupState>>,
215    stack_class: RefCell<Vec<ClassState>>,
216    capture_names: RefCell<Vec<ast::CaptureName>>,
217    scratch: RefCell<String>,
218}
219
220fn specialize_err<T>(result: Result<T>, from: ast::ErrorKind, to: ast::ErrorKind) -> Result<T> {
221    result.map_err(|e| {
222        if e.kind == from {
223            ResharpError {
224                kind: to,
225                pattern: e.pattern,
226                span: e.span,
227            }
228        } else {
229            e
230        }
231    })
232}
233
234fn is_capture_char(c: char, first: bool) -> bool {
235    if first {
236        c == '_' || c.is_alphabetic()
237    } else {
238        c == '_' || c == '.' || c == '[' || c == ']' || c.is_alphanumeric()
239    }
240}
241
242pub fn is_meta_character(c: char) -> bool {
243    matches!(
244        c,
245        '\\' | '.'
246            | '+'
247            | '*'
248            | '?'
249            | '('
250            | ')'
251            | '|'
252            | '['
253            | ']'
254            | '{'
255            | '}'
256            | '^'
257            | '$'
258            | '#'
259            | '&'
260            | '-'
261            | '~'
262            | '_'
263    )
264}
265
266/// escapes all resharp meta characters in `text`.
267pub fn escape(text: &str) -> String {
268    let mut buf = String::new();
269    escape_into(text, &mut buf);
270    buf
271}
272
273/// escapes all resharp meta characters in `text` and appends to `buf`.
274pub fn escape_into(text: &str, buf: &mut String) {
275    buf.reserve(text.len());
276    for c in text.chars() {
277        if is_meta_character(c) {
278            buf.push('\\');
279        }
280        buf.push(c);
281    }
282}
283
284pub fn is_escapeable_character(c: char) -> bool {
285    // Certainly escapeable if it's a meta character.
286    if is_meta_character(c) {
287        return true;
288    }
289    // Any character that isn't ASCII is definitely not escapeable. There's
290    // no real need to allow things like \☃ right?
291    if !c.is_ascii() {
292        return false;
293    }
294    // Otherwise, we basically say that everything is escapeable unless it's a
295    // letter or digit. Things like \3 are either octal (when enabled) or an
296    // error, and we should keep it that way. Otherwise, letters are reserved
297    // for adding new syntax in a backwards compatible way.
298    match c {
299        '0'..='9' | 'A'..='Z' | 'a'..='z' => false,
300        // While not currently supported, we keep these as not escapeable to
301        // give us some flexibility with respect to supporting the \< and
302        // \> word boundary assertions in the future. By rejecting them as
303        // escapeable, \< and \> will result in a parse error. Thus, we can
304        // turn them into something else in the future without it being a
305        // backwards incompatible change.
306        //
307        // OK, now we support \< and \>, and we need to retain them as *not*
308        // escapeable here since the escape sequence is significant.
309        '<' | '>' => false,
310        _ => true,
311    }
312}
313
314fn is_hex(c: char) -> bool {
315    c.is_ascii_digit() || ('a'..='f').contains(&c) || ('A'..='F').contains(&c)
316}
317
318impl<'s> ResharpParser<'s> {
319    fn default_translator_builder(&self) -> TranslatorBuilder {
320        let mut trb = TranslatorBuilder::new();
321        trb.unicode(self.global_unicode);
322        trb.utf8(false);
323        trb.case_insensitive(self.global_case_insensitive);
324        trb
325    }
326
327    pub fn new(pattern: &'s str) -> Self {
328        Self::with_flags(pattern, &PatternFlags::default())
329    }
330
331    pub fn with_flags(pattern: &'s str, flags: &PatternFlags) -> Self {
332        let mut trb = TranslatorBuilder::new();
333        trb.unicode(flags.unicode);
334        trb.utf8(false);
335        trb.case_insensitive(flags.case_insensitive);
336        Self {
337            translator: trb.build(),
338            pattern,
339            perl_classes: vec![],
340            unicode_classes: resharp_algebra::UnicodeClassCache::default(),
341            pos: Cell::new(Position::new(0, 0, 0)),
342            capture_index: Cell::new(0),
343            octal: false,
344            empty_min_range: false,
345            ignore_whitespace: Cell::new(flags.ignore_whitespace),
346            dot_all: Cell::new(flags.dot_matches_new_line),
347            global_unicode: flags.unicode || flags.full_unicode || flags.ascii_perl_classes,
348            global_full_unicode: flags.full_unicode,
349            global_ascii_perl: flags.ascii_perl_classes,
350            global_case_insensitive: flags.case_insensitive,
351            comments: RefCell::new(vec![]),
352            stack_group: RefCell::new(vec![]),
353            stack_class: RefCell::new(vec![]),
354            capture_names: RefCell::new(vec![]),
355            scratch: RefCell::new(String::new()),
356        }
357    }
358
359    /// Return a reference to the parser state.
360    fn parser(&'_ self) -> &'_ ResharpParser<'_> {
361        self
362    }
363
364    /// Return a reference to the pattern being parsed.
365    fn pattern(&self) -> &str {
366        self.pattern
367    }
368
369    /// Create a new error with the given span and error type.
370    fn error(&self, span: Span, kind: ast::ErrorKind) -> ResharpError {
371        ResharpError {
372            kind,
373            pattern: self.pattern().to_string(),
374            span,
375        }
376    }
377
378    fn unsupported_error(&self, _: regex_syntax::hir::Error) -> ResharpError {
379        self.error(
380            Span::splat(self.pos()),
381            ast::ErrorKind::UnsupportedResharpRegex,
382        )
383    }
384
385    /// Return the current offset of the parser.
386    ///
387    /// The offset starts at `0` from the beginning of the regular expression
388    /// pattern string.
389    fn offset(&self) -> usize {
390        self.parser().pos.get().offset
391    }
392
393    /// Return the current line number of the parser.
394    ///
395    /// The line number starts at `1`.
396    fn line(&self) -> usize {
397        self.parser().pos.get().line
398    }
399
400    /// Return the current column of the parser.
401    ///
402    /// The column number starts at `1` and is reset whenever a `\n` is seen.
403    fn column(&self) -> usize {
404        self.parser().pos.get().column
405    }
406
407    /// Return the next capturing index. Each subsequent call increments the
408    /// internal index.
409    ///
410    /// The span given should correspond to the location of the opening
411    /// parenthesis.
412    ///
413    /// If the capture limit is exceeded, then an error is returned.
414    fn next_capture_index(&self, span: Span) -> Result<u32> {
415        let current = self.parser().capture_index.get();
416        let i = current
417            .checked_add(1)
418            .ok_or_else(|| self.error(span, ast::ErrorKind::CaptureLimitExceeded))?;
419        self.parser().capture_index.set(i);
420        Ok(i)
421    }
422
423    fn add_capture_name(&self, cap: &ast::CaptureName) -> Result<()> {
424        let mut names = self.parser().capture_names.borrow_mut();
425        match names.binary_search_by_key(&cap.name.as_str(), |c| c.name.as_str()) {
426            Err(i) => {
427                names.insert(i, cap.clone());
428                Ok(())
429            }
430            Ok(i) => Err(self.error(
431                cap.span,
432                ast::ErrorKind::GroupNameDuplicate {
433                    original: names[i].span,
434                },
435            )),
436        }
437    }
438
439    fn ignore_whitespace(&self) -> bool {
440        self.parser().ignore_whitespace.get()
441    }
442
443    fn char(&self) -> char {
444        self.char_at(self.offset())
445    }
446
447    fn char_at(&self, i: usize) -> char {
448        self.pattern()[i..]
449            .chars()
450            .next()
451            .unwrap_or_else(|| panic!("expected char at offset {}", i))
452    }
453
454    fn bump(&self) -> bool {
455        if self.is_eof() {
456            return false;
457        }
458        let Position {
459            mut offset,
460            mut line,
461            mut column,
462        } = self.pos();
463        if self.char() == '\n' {
464            line = line.checked_add(1).unwrap();
465            column = 1;
466        } else {
467            column = column.checked_add(1).unwrap();
468        }
469        offset += self.char().len_utf8();
470        self.parser().pos.set(Position {
471            offset,
472            line,
473            column,
474        });
475        self.pattern()[self.offset()..].chars().next().is_some()
476    }
477
478    fn bump_if(&self, prefix: &str) -> bool {
479        if self.pattern()[self.offset()..].starts_with(prefix) {
480            for _ in 0..prefix.chars().count() {
481                self.bump();
482            }
483            true
484        } else {
485            false
486        }
487    }
488
489    fn is_lookaround_prefix(&self) -> Option<(bool, bool)> {
490        if self.bump_if("?=") {
491            return Some((true, true));
492        }
493        if self.bump_if("?!") {
494            return Some((true, false));
495        }
496        if self.bump_if("?<=") {
497            return Some((false, true));
498        }
499        if self.bump_if("?<!") {
500            return Some((false, false));
501        }
502        None
503    }
504
505    fn bump_and_bump_space(&self) -> bool {
506        if !self.bump() {
507            return false;
508        }
509        self.bump_space();
510        !self.is_eof()
511    }
512
513    fn bump_space(&self) {
514        if !self.ignore_whitespace() {
515            return;
516        }
517        while !self.is_eof() {
518            if self.char().is_whitespace() {
519                self.bump();
520            } else if self.char() == '#' {
521                let start = self.pos();
522                let mut comment_text = String::new();
523                self.bump();
524                while !self.is_eof() {
525                    let c = self.char();
526                    self.bump();
527                    if c == '\n' {
528                        break;
529                    }
530                    comment_text.push(c);
531                }
532                let comment = ast::Comment {
533                    span: Span::new(start, self.pos()),
534                    comment: comment_text,
535                };
536                self.parser().comments.borrow_mut().push(comment);
537            } else {
538                break;
539            }
540        }
541    }
542
543    /// Peek at the next character in the input without advancing the parser.
544    ///
545    /// If the input has been exhausted, then this returns `None`.
546    fn peek(&self) -> Option<char> {
547        if self.is_eof() {
548            return None;
549        }
550        self.pattern()[self.offset() + self.char().len_utf8()..]
551            .chars()
552            .next()
553    }
554
555    /// Like peek, but will ignore spaces when the parser is in whitespace
556    /// insensitive mode.
557    fn peek_space(&self) -> Option<char> {
558        if !self.ignore_whitespace() {
559            return self.peek();
560        }
561        if self.is_eof() {
562            return None;
563        }
564        let mut start = self.offset() + self.char().len_utf8();
565        let mut in_comment = false;
566        for (i, c) in self.pattern()[start..].char_indices() {
567            if c.is_whitespace() {
568                continue;
569            } else if !in_comment && c == '#' {
570                in_comment = true;
571            } else if in_comment && c == '\n' {
572                in_comment = false;
573            } else {
574                start += i;
575                break;
576            }
577        }
578        self.pattern()[start..].chars().next()
579    }
580
581    /// Returns true if the next call to `bump` would return false.
582    fn is_eof(&self) -> bool {
583        self.offset() == self.pattern().len()
584    }
585
586    /// Return the current position of the parser, which includes the offset,
587    /// line and column.
588    fn pos(&self) -> Position {
589        self.parser().pos.get()
590    }
591
592    /// Create a span at the current position of the parser. Both the start
593    /// and end of the span are set.
594    fn span(&self) -> Span {
595        Span::splat(self.pos())
596    }
597
598    /// Create a span that covers the current character.
599    fn span_char(&self) -> Span {
600        let mut next = Position {
601            offset: self.offset().checked_add(self.char().len_utf8()).unwrap(),
602            line: self.line(),
603            column: self.column().checked_add(1).unwrap(),
604        };
605        if self.char() == '\n' {
606            next.line += 1;
607            next.column = 1;
608        }
609        Span::new(self.pos(), next)
610    }
611
612    /// Parse and push a single alternation on to the parser's internal stack.
613    /// If the top of the stack already has an alternation, then add to that
614    /// instead of pushing a new one.
615    ///
616    /// The concatenation given corresponds to a single alternation branch.
617    /// The concatenation returned starts the next branch and is empty.
618    ///
619    /// This assumes the parser is currently positioned at `|` and will advance
620    /// the parser to the character following `|`.
621    #[inline(never)]
622    fn push_alternate(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
623        assert_eq!(self.char(), '|');
624        concat.span.end = self.pos();
625        self.push_or_add_alternation(concat);
626        self.bump();
627        Ok(ast::Concat {
628            span: self.span(),
629            asts: vec![],
630        })
631    }
632
633    /// Pushes or adds the given branch of an alternation to the parser's
634    /// internal stack of state.
635    fn push_or_add_alternation(&self, concat: Concat) {
636        use self::GroupState::*;
637
638        let mut stack = self.parser().stack_group.borrow_mut();
639        if let Some(&mut Alternation(ref mut alts)) = stack.last_mut() {
640            alts.asts.push(concat.into_ast());
641            return;
642        }
643        stack.push(Alternation(ast::Alternation {
644            span: Span::new(concat.span.start, self.pos()),
645            asts: vec![concat.into_ast()],
646        }));
647    }
648
649    #[inline(never)]
650    fn push_intersect(&self, mut concat: Concat) -> Result<Concat> {
651        assert_eq!(self.char(), '&');
652        concat.span.end = self.pos();
653        self.push_or_add_intersect(concat);
654        self.bump();
655        Ok(Concat {
656            span: self.span(),
657            asts: vec![],
658        })
659    }
660
661    /// Pushes or adds the given branch of an alternation to the parser's
662    /// internal stack of state.
663    fn push_or_add_intersect(&self, concat: Concat) {
664        use self::GroupState::*;
665
666        let mut stack = self.parser().stack_group.borrow_mut();
667        if let Some(&mut Intersection(ref mut alts)) = stack.last_mut() {
668            alts.asts.push(concat.into_ast());
669            return;
670        }
671        stack.push(Intersection(ast::Intersection {
672            span: Span::new(concat.span.start, self.pos()),
673            asts: vec![concat.into_ast()],
674        }));
675    }
676
677    /// Parse and push a group AST (and its parent concatenation) on to the
678    /// parser's internal stack. Return a fresh concatenation corresponding
679    /// to the group's sub-AST.
680    ///
681    /// If a set of flags was found (with no group), then the concatenation
682    /// is returned with that set of flags added.
683    ///
684    /// This assumes that the parser is currently positioned on the opening
685    /// parenthesis. It advances the parser to the character at the start
686    /// of the sub-expression (or adjoining expression).
687    ///
688    /// If there was a problem parsing the start of the group, then an error
689    /// is returned.
690    #[inline(never)]
691    fn push_group(&self, mut concat: Concat) -> Result<Concat> {
692        assert_eq!(self.char(), '(');
693        match self.parse_group()? {
694            Either::Left(set) => {
695                let ignore = set.flags.flag_state(ast::Flag::IgnoreWhitespace);
696                if let Some(v) = ignore {
697                    self.parser().ignore_whitespace.set(v);
698                }
699
700                concat.asts.push(Ast::flags(set));
701                Ok(concat)
702            }
703            Either::Right(group) => {
704                let old_ignore_whitespace = self.ignore_whitespace();
705                let new_ignore_whitespace = group
706                    .flags()
707                    .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
708                    .unwrap_or(old_ignore_whitespace);
709                self.parser()
710                    .stack_group
711                    .borrow_mut()
712                    .push(GroupState::Group {
713                        concat,
714                        group,
715                        ignore_whitespace: old_ignore_whitespace,
716                    });
717                self.parser().ignore_whitespace.set(new_ignore_whitespace);
718                Ok(Concat {
719                    span: self.span(),
720                    asts: vec![],
721                })
722            }
723        }
724    }
725
726    #[inline(never)]
727    fn push_compl_group(&self, concat: Concat) -> Result<Concat> {
728        assert_eq!(self.char(), '~');
729        self.bump();
730        if self.is_eof() || self.char() != '(' {
731            return Err(self.error(self.span(), ast::ErrorKind::ComplementGroupExpected));
732        }
733        let open_span = self.span_char();
734        self.bump();
735        let group = ast::Group {
736            span: open_span,
737            kind: ast::GroupKind::Complement,
738            ast: Box::new(Ast::empty(self.span())),
739        };
740
741        let old_ignore_whitespace = self.ignore_whitespace();
742        let new_ignore_whitespace = group
743            .flags()
744            .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
745            .unwrap_or(old_ignore_whitespace);
746        self.parser()
747            .stack_group
748            .borrow_mut()
749            .push(GroupState::Group {
750                concat,
751                group,
752                ignore_whitespace: old_ignore_whitespace,
753            });
754        self.parser().ignore_whitespace.set(new_ignore_whitespace);
755        Ok(Concat {
756            span: self.span(),
757            asts: vec![],
758        })
759    }
760
761    /// Pop a group AST from the parser's internal stack and set the group's
762    /// AST to the given concatenation. Return the concatenation containing
763    /// the group.
764    ///
765    /// This assumes that the parser is currently positioned on the closing
766    /// parenthesis and advances the parser to the character following the `)`.
767    ///
768    /// If no such group could be popped, then an unopened group error is
769    /// returned.
770    #[inline(never)]
771    fn pop_group(&self, mut group_concat: Concat) -> Result<Concat> {
772        use self::GroupState::*;
773        assert_eq!(self.char(), ')');
774        let mut stack = self.parser().stack_group.borrow_mut();
775        let topstack = stack.pop();
776
777        let (mut prior_concat, mut group, ignore_whitespace, alt) = match topstack {
778            Some(Group {
779                concat,
780                group,
781                ignore_whitespace,
782            }) => (concat, group, ignore_whitespace, None),
783            Some(Alternation(alt)) => match stack.pop() {
784                Some(Group {
785                    concat,
786                    group,
787                    ignore_whitespace,
788                }) => (
789                    concat,
790                    group,
791                    ignore_whitespace,
792                    Some(Either::Left::<ast::Alternation, ast::Intersection>(alt)),
793                ),
794                None | Some(Alternation(_)) | Some(Intersection(_)) => {
795                    return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
796                }
797            },
798            Some(Intersection(int)) => match stack.pop() {
799                Some(Group {
800                    concat,
801                    group,
802                    ignore_whitespace,
803                }) => (
804                    concat,
805                    group,
806                    ignore_whitespace,
807                    Some(Either::Right::<ast::Alternation, ast::Intersection>(int)),
808                ),
809                None | Some(Alternation(_)) | Some(Intersection(_)) => {
810                    return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
811                }
812            },
813
814            None => {
815                return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
816            }
817        };
818        self.parser().ignore_whitespace.set(ignore_whitespace);
819        group_concat.span.end = self.pos();
820        self.bump();
821        group.span.end = self.pos();
822        match alt {
823            Some(Either::Left(mut alt)) => {
824                alt.span.end = group_concat.span.end;
825                alt.asts.push(group_concat.into_ast());
826                group.ast = Box::new(alt.into_ast());
827            }
828            Some(Either::Right(mut int)) => {
829                int.span.end = group_concat.span.end;
830                int.asts.push(group_concat.into_ast());
831                group.ast = Box::new(int.into_ast());
832            }
833            None => {
834                group.ast = Box::new(group_concat.into_ast());
835            }
836        }
837
838        if group.kind == GroupKind::Complement {
839            let complement = ast::Complement {
840                span: self.span(),
841                ast: group.ast,
842            };
843            prior_concat.asts.push(Ast::complement(complement));
844        }
845        // ignore groups for now
846        else {
847            prior_concat.asts.push(Ast::group(group));
848        }
849        Ok(prior_concat)
850    }
851
852    /// Pop the last state from the parser's internal stack, if it exists, and
853    /// add the given concatenation to it. There either must be no state or a
854    /// single alternation item on the stack. Any other scenario produces an
855    /// error.
856    ///
857    /// This assumes that the parser has advanced to the end.
858    #[inline(never)]
859    fn pop_group_end(&self, mut concat: ast::Concat) -> Result<Ast> {
860        concat.span.end = self.pos();
861        let mut stack = self.parser().stack_group.borrow_mut();
862        let ast = match stack.pop() {
863            None => Ok(concat.into_ast()),
864            Some(GroupState::Alternation(mut alt)) => {
865                alt.span.end = self.pos();
866                alt.asts.push(concat.into_ast());
867                Ok(Ast::alternation(alt))
868            }
869            Some(GroupState::Intersection(mut int)) => {
870                int.span.end = self.pos();
871                int.asts.push(concat.into_ast());
872
873                Ok(Ast::intersection(int))
874            }
875            Some(GroupState::Group { group, .. }) => {
876                return Err(self.error(group.span, ast::ErrorKind::GroupUnclosed));
877            }
878        };
879        // If we try to pop again, there should be nothing.
880        match stack.pop() {
881            None => ast,
882            Some(GroupState::Alternation(alt)) => {
883                Err(self.error(alt.span, ast::ErrorKind::UnsupportedResharpRegex))
884            }
885            Some(GroupState::Intersection(int)) => {
886                Err(self.error(int.span, ast::ErrorKind::UnsupportedResharpRegex))
887            }
888            Some(GroupState::Group { group, .. }) => {
889                Err(self.error(group.span, ast::ErrorKind::GroupUnclosed))
890            }
891        }
892    }
893
894    /// Parse the opening of a character class and push the current class
895    /// parsing context onto the parser's stack. This assumes that the parser
896    /// is positioned at an opening `[`. The given union should correspond to
897    /// the union of set items built up before seeing the `[`.
898    ///
899    /// If there was a problem parsing the opening of the class, then an error
900    /// is returned. Otherwise, a new union of set items for the class is
901    /// returned (which may be populated with either a `]` or a `-`).
902    #[inline(never)]
903    fn push_class_open(
904        &self,
905        parent_union: regex_syntax::ast::ClassSetUnion,
906    ) -> Result<regex_syntax::ast::ClassSetUnion> {
907        assert_eq!(self.char(), '[');
908
909        let (nested_set, nested_union) = self.parse_set_class_open()?;
910        self.parser()
911            .stack_class
912            .borrow_mut()
913            .push(ClassState::Open {
914                union: parent_union,
915                set: nested_set,
916            });
917        Ok(nested_union)
918    }
919
920    /// Parse the end of a character class set and pop the character class
921    /// parser stack. The union given corresponds to the last union built
922    /// before seeing the closing `]`. The union returned corresponds to the
923    /// parent character class set with the nested class added to it.
924    ///
925    /// This assumes that the parser is positioned at a `]` and will advance
926    /// the parser to the byte immediately following the `]`.
927    ///
928    /// If the stack is empty after popping, then this returns the final
929    /// "top-level" character class AST (where a "top-level" character class
930    /// is one that is not nested inside any other character class).
931    ///
932    /// If there is no corresponding opening bracket on the parser's stack,
933    /// then an error is returned.
934    #[inline(never)]
935    fn pop_class(
936        &self,
937        nested_union: regex_syntax::ast::ClassSetUnion,
938    ) -> Result<Either<regex_syntax::ast::ClassSetUnion, regex_syntax::ast::ClassBracketed>> {
939        assert_eq!(self.char(), ']');
940
941        let item = regex_syntax::ast::ClassSet::Item(nested_union.into_item());
942        let prevset = self.pop_class_op(item);
943        let mut stack = self.parser().stack_class.borrow_mut();
944        match stack.pop() {
945            None => {
946                // We can never observe an empty stack:
947                //
948                // 1) We are guaranteed to start with a non-empty stack since
949                //    the character class parser is only initiated when it sees
950                //    a `[`.
951                // 2) If we ever observe an empty stack while popping after
952                //    seeing a `]`, then we signal the character class parser
953                //    to terminate.
954                panic!("unexpected empty character class stack")
955            }
956            Some(ClassState::Op { .. }) => {
957                // This panic is unfortunate, but this case is impossible
958                // since we already popped the Op state if one exists above.
959                // Namely, every push to the class parser stack is guarded by
960                // whether an existing Op is already on the top of the stack.
961                // If it is, the existing Op is modified. That is, the stack
962                // can never have consecutive Op states.
963                panic!("unexpected ClassState::Op")
964            }
965            Some(ClassState::Open { mut union, mut set }) => {
966                self.bump();
967                set.span.end = self.pos();
968                set.kind = prevset;
969                if stack.is_empty() {
970                    Ok(Either::Right(set))
971                } else {
972                    union.push(regex_syntax::ast::ClassSetItem::Bracketed(Box::new(set)));
973                    Ok(Either::Left(union))
974                }
975            }
976        }
977    }
978
979    /// Return an "unclosed class" error whose span points to the most
980    /// recently opened class.
981    ///
982    /// This should only be called while parsing a character class.
983    #[inline(never)]
984    fn unclosed_class_error(&self) -> ResharpError {
985        for state in self.parser().stack_class.borrow().iter().rev() {
986            if let ClassState::Open { ref set, .. } = *state {
987                return self.error(set.span, ast::ErrorKind::ClassUnclosed);
988            }
989        }
990        // We are guaranteed to have a non-empty stack with at least
991        // one open bracket, so we should never get here.
992        panic!("no open character class found")
993    }
994
995    /// Push the current set of class items on to the class parser's stack as
996    /// the left hand side of the given operator.
997    ///
998    /// A fresh set union is returned, which should be used to build the right
999    /// hand side of this operator.
1000    #[inline(never)]
1001    fn push_class_op(
1002        &self,
1003        next_kind: regex_syntax::ast::ClassSetBinaryOpKind,
1004        next_union: regex_syntax::ast::ClassSetUnion,
1005    ) -> regex_syntax::ast::ClassSetUnion {
1006        let item = regex_syntax::ast::ClassSet::Item(next_union.into_item());
1007        let new_lhs = self.pop_class_op(item);
1008        self.parser().stack_class.borrow_mut().push(ClassState::Op {
1009            kind: next_kind,
1010            lhs: new_lhs,
1011        });
1012        regex_syntax::ast::ClassSetUnion {
1013            span: self.span(),
1014            items: vec![],
1015        }
1016    }
1017
1018    /// Pop a character class set from the character class parser stack. If the
1019    /// top of the stack is just an item (not an operation), then return the
1020    /// given set unchanged. If the top of the stack is an operation, then the
1021    /// given set will be used as the rhs of the operation on the top of the
1022    /// stack. In that case, the binary operation is returned as a set.
1023    #[inline(never)]
1024    fn pop_class_op(&self, rhs: regex_syntax::ast::ClassSet) -> regex_syntax::ast::ClassSet {
1025        let mut stack = self.parser().stack_class.borrow_mut();
1026        let (kind, lhs) = match stack.pop() {
1027            Some(ClassState::Op { kind, lhs }) => (kind, lhs),
1028            Some(state @ ClassState::Open { .. }) => {
1029                stack.push(state);
1030                return rhs;
1031            }
1032            None => unreachable!(),
1033        };
1034        let span = Span::new(lhs.span().start, rhs.span().end);
1035        regex_syntax::ast::ClassSet::BinaryOp(regex_syntax::ast::ClassSetBinaryOp {
1036            span,
1037            kind,
1038            lhs: Box::new(lhs),
1039            rhs: Box::new(rhs),
1040        })
1041    }
1042
1043    fn hir_to_node_id(&self, hir: &hir::Hir, tb: &mut TB<'s>) -> Result<NodeId> {
1044        match hir.kind() {
1045            hir::HirKind::Empty => Ok(NodeId::EPS),
1046            hir::HirKind::Literal(l) => {
1047                if l.0.len() == 1 {
1048                    let node = tb.mk_u8(l.0[0]);
1049                    Ok(node)
1050                } else {
1051                    let ws: Vec<_> = l.0.iter().map(|l| tb.mk_u8(*l)).collect();
1052                    let conc = tb.mk_concats(ws.iter().copied());
1053                    Ok(conc)
1054                }
1055            }
1056            hir::HirKind::Class(class) => match class {
1057                hir::Class::Unicode(class_unicode) => {
1058                    let ranges = class_unicode.ranges();
1059                    if ranges.len() == 1
1060                        && ranges[0].start() == '\u{0}'
1061                        && ranges[0].end() == '\u{10FFFF}'
1062                    {
1063                        return Ok(tb.mk_range_u8(0, 255));
1064                    }
1065                    let mut nodes = Vec::new();
1066                    for range in ranges {
1067                        for seq in Utf8Sequences::new(range.start(), range.end()) {
1068                            let sl = seq.as_slice();
1069                            let bytes: Vec<_> = sl.iter().map(|s| (s.start, s.end)).collect();
1070                            let node = match bytes.len() {
1071                                1 => tb.mk_range_u8(bytes[0].0, bytes[0].1),
1072                                n => {
1073                                    let last = tb.mk_range_u8(bytes[n - 1].0, bytes[n - 1].1);
1074                                    let mut conc = last;
1075                                    for i in (0..n - 1).rev() {
1076                                        let b = tb.mk_range_u8(bytes[i].0, bytes[i].1);
1077                                        conc = tb.mk_concat(b, conc);
1078                                    }
1079                                    conc
1080                                }
1081                            };
1082                            nodes.push(node);
1083                        }
1084                    }
1085                    let merged = tb.mk_unions(nodes.into_iter());
1086                    Ok(merged)
1087                }
1088                hir::Class::Bytes(class_bytes) => {
1089                    let ranges = class_bytes.ranges();
1090                    let mut result = NodeId::BOT;
1091                    for range in ranges {
1092                        let start = range.start();
1093                        let end = range.end();
1094                        let node = tb.mk_range_u8(start, end);
1095                        result = tb.mk_union(result, node);
1096                    }
1097                    Ok(result)
1098                }
1099            },
1100            hir::HirKind::Look(_) => Err(self.error(
1101                Span::splat(self.pos()),
1102                ast::ErrorKind::UnsupportedResharpRegex,
1103            )),
1104            hir::HirKind::Repetition(_) => Err(self.error(
1105                Span::splat(self.pos()),
1106                ast::ErrorKind::UnsupportedResharpRegex,
1107            )),
1108            hir::HirKind::Capture(_) => Err(self.error(
1109                Span::splat(self.pos()),
1110                ast::ErrorKind::UnsupportedResharpRegex,
1111            )),
1112            hir::HirKind::Concat(body) => {
1113                let mut result = NodeId::EPS;
1114                for child in body {
1115                    let node = self.hir_to_node_id(child, tb)?;
1116                    result = tb.mk_concat(result, node);
1117                }
1118                Ok(result)
1119            }
1120            hir::HirKind::Alternation(_) => Err(self.error(
1121                Span::splat(self.pos()),
1122                ast::ErrorKind::UnsupportedResharpRegex,
1123            )),
1124        }
1125    }
1126
1127    fn translate_ast_to_hir(
1128        &mut self,
1129        orig_ast: &regex_syntax::ast::Ast,
1130        tb: &mut TB<'s>,
1131    ) -> Result<NodeId> {
1132        match self.translator.translate("", orig_ast) {
1133            Err(_) => Err(self.error(self.span(), ast::ErrorKind::UnicodeClassInvalid)),
1134            Ok(hir) => self.hir_to_node_id(&hir, tb),
1135        }
1136    }
1137
1138    fn translator_to_node_id(
1139        &mut self,
1140        orig_ast: &regex_syntax::ast::Ast,
1141        translator: &mut Option<Translator>,
1142        tb: &mut TB<'s>,
1143    ) -> Result<NodeId> {
1144        match translator {
1145            Some(tr) => {
1146                let hir = tr
1147                    .translate("", orig_ast)
1148                    .map_err(|e| self.unsupported_error(e))?;
1149                self.hir_to_node_id(&hir, tb)
1150            }
1151            None => self.translate_ast_to_hir(orig_ast, tb),
1152        }
1153    }
1154
1155    fn get_class(
1156        &mut self,
1157        negated: bool,
1158        kind: regex_syntax::ast::ClassPerlKind,
1159        tb: &mut TB<'s>,
1160    ) -> Result<NodeId> {
1161        let w = self
1162            .perl_classes
1163            .iter()
1164            .find(|(c_neg, c_kind, _)| *c_kind == kind && *c_neg == negated);
1165        match w {
1166            Some((_, _, value)) => Ok(*value),
1167            None => {
1168                let translated = if self.global_ascii_perl {
1169                    let pos = match kind {
1170                        regex_syntax::ast::ClassPerlKind::Word => {
1171                            let az = tb.mk_range_u8(b'a', b'z');
1172                            let big = tb.mk_range_u8(b'A', b'Z');
1173                            let dig = tb.mk_range_u8(b'0', b'9');
1174                            let us = tb.mk_u8(b'_');
1175                            tb.mk_unions([az, big, dig, us].into_iter())
1176                        }
1177                        regex_syntax::ast::ClassPerlKind::Digit => tb.mk_range_u8(b'0', b'9'),
1178                        regex_syntax::ast::ClassPerlKind::Space => {
1179                            let sp = tb.mk_u8(b' ');
1180                            let tab = tb.mk_u8(b'\t');
1181                            let nl = tb.mk_u8(b'\n');
1182                            let cr = tb.mk_u8(b'\r');
1183                            let ff = tb.mk_u8(0x0C);
1184                            let vt = tb.mk_u8(0x0B);
1185                            tb.mk_unions([sp, tab, nl, cr, ff, vt].into_iter())
1186                        }
1187                    };
1188                    if negated {
1189                        resharp_algebra::neg_class(tb, pos)
1190                    } else {
1191                        pos
1192                    }
1193                } else if self.global_unicode {
1194                    match kind {
1195                        regex_syntax::ast::ClassPerlKind::Word => {
1196                            if self.global_full_unicode {
1197                                self.unicode_classes.ensure_word_full(tb);
1198                            } else {
1199                                self.unicode_classes.ensure_word(tb);
1200                            }
1201                            if negated {
1202                                self.unicode_classes.non_word
1203                            } else {
1204                                self.unicode_classes.word
1205                            }
1206                        }
1207                        regex_syntax::ast::ClassPerlKind::Digit => {
1208                            if self.global_full_unicode {
1209                                self.unicode_classes.ensure_digit_full(tb);
1210                            } else {
1211                                self.unicode_classes.ensure_digit(tb);
1212                            }
1213                            if negated {
1214                                self.unicode_classes.non_digit
1215                            } else {
1216                                self.unicode_classes.digit
1217                            }
1218                        }
1219                        regex_syntax::ast::ClassPerlKind::Space => {
1220                            if self.global_full_unicode {
1221                                self.unicode_classes.ensure_space_full(tb);
1222                            } else {
1223                                self.unicode_classes.ensure_space(tb);
1224                            }
1225                            if negated {
1226                                self.unicode_classes.non_space
1227                            } else {
1228                                self.unicode_classes.space
1229                            }
1230                        }
1231                    }
1232                } else {
1233                    let pos = match kind {
1234                        regex_syntax::ast::ClassPerlKind::Word => {
1235                            let az = tb.mk_range_u8(b'a', b'z');
1236                            let big = tb.mk_range_u8(b'A', b'Z');
1237                            let dig = tb.mk_range_u8(b'0', b'9');
1238                            let us = tb.mk_u8(b'_');
1239                            tb.mk_unions([az, big, dig, us].into_iter())
1240                        }
1241                        regex_syntax::ast::ClassPerlKind::Digit => tb.mk_range_u8(b'0', b'9'),
1242                        regex_syntax::ast::ClassPerlKind::Space => {
1243                            let sp = tb.mk_u8(b' ');
1244                            let tab = tb.mk_u8(b'\t');
1245                            let nl = tb.mk_u8(b'\n');
1246                            let cr = tb.mk_u8(b'\r');
1247                            let ff = tb.mk_u8(0x0C);
1248                            let vt = tb.mk_u8(0x0B);
1249                            tb.mk_unions([sp, tab, nl, cr, ff, vt].into_iter())
1250                        }
1251                    };
1252                    if negated {
1253                        tb.mk_compl(pos)
1254                    } else {
1255                        pos
1256                    }
1257                };
1258                self.perl_classes.push((negated, kind, translated));
1259                Ok(translated)
1260            }
1261        }
1262    }
1263
1264    fn word_char_kind(ast: &Ast, left: bool) -> WordCharKind {
1265        use WordCharKind::*;
1266        match ast {
1267            Ast::Literal(lit) => {
1268                if is_word_byte(lit.c as u8) {
1269                    Word
1270                } else {
1271                    NonWord
1272                }
1273            }
1274            Ast::ClassPerl(c) => match (&c.kind, c.negated) {
1275                (&regex_syntax::ast::ClassPerlKind::Word, false) => Word,
1276                (&regex_syntax::ast::ClassPerlKind::Word, true) => NonWord,
1277                (&regex_syntax::ast::ClassPerlKind::Space, false) => NonWord,
1278                (&regex_syntax::ast::ClassPerlKind::Space, true) => Unknown,
1279                (&regex_syntax::ast::ClassPerlKind::Digit, false) => Word,
1280                (&regex_syntax::ast::ClassPerlKind::Digit, true) => Unknown,
1281            },
1282            Ast::Dot(_) | Ast::Top(_) => Unknown,
1283            Ast::Group(g) => Self::word_char_kind(&g.ast, left),
1284            Ast::Concat(c) if !c.asts.is_empty() => {
1285                let edge = if left { c.asts.len() - 1 } else { 0 };
1286                let kind = Self::word_char_kind(&c.asts[edge], left);
1287                match kind {
1288                    MaybeWord => {
1289                        let dir: isize = if left { -1 } else { 1 };
1290                        match Self::concat_neighbor_kind(&c.asts, edge, dir) {
1291                            Word => Word,
1292                            _ => MaybeWord,
1293                        }
1294                    }
1295                    MaybeNonWord => {
1296                        let dir: isize = if left { -1 } else { 1 };
1297                        match Self::concat_neighbor_kind(&c.asts, edge, dir) {
1298                            NonWord => NonWord,
1299                            _ => MaybeNonWord,
1300                        }
1301                    }
1302                    other => other,
1303                }
1304            }
1305            Ast::Alternation(alt) if !alt.asts.is_empty() => {
1306                let first = Self::word_char_kind(&alt.asts[0], left);
1307                if alt.asts[1..]
1308                    .iter()
1309                    .all(|a| Self::word_char_kind(a, left) == first)
1310                {
1311                    first
1312                } else {
1313                    Unknown
1314                }
1315            }
1316            Ast::Repetition(r) => {
1317                let inner = Self::word_char_kind(&r.ast, left);
1318                let nullable = matches!(
1319                    &r.op.kind,
1320                    ast::RepetitionKind::ZeroOrMore
1321                        | ast::RepetitionKind::ZeroOrOne
1322                        | ast::RepetitionKind::Range(ast::RepetitionRange::Bounded(0, _))
1323                );
1324                if nullable {
1325                    match inner {
1326                        Word => MaybeWord,
1327                        NonWord => MaybeNonWord,
1328                        _ => Unknown,
1329                    }
1330                } else {
1331                    inner
1332                }
1333            }
1334            Ast::Lookaround(la) => Self::word_char_kind(&la.ast, left),
1335            _ => Unknown,
1336        }
1337    }
1338
1339    // ok to return None here, it's only an optimization
1340    fn edge_class_ast(ast: &Ast, left: bool) -> Option<&Ast> {
1341        match ast {
1342            Ast::Literal(_)
1343            | Ast::ClassPerl(_)
1344            | Ast::ClassBracketed(_)
1345            | Ast::ClassUnicode(_)
1346            | Ast::Dot(_)
1347            | Ast::Top(_) => Some(ast),
1348            Ast::Group(g) => Self::edge_class_ast(&g.ast, left),
1349            Ast::Concat(c) if !c.asts.is_empty() => {
1350                Self::edge_class_ast(&c.asts[if left { c.asts.len() - 1 } else { 0 }], left)
1351            }
1352            Ast::Repetition(r) => {
1353                let nullable = matches!(
1354                    &r.op.kind,
1355                    ast::RepetitionKind::ZeroOrMore
1356                        | ast::RepetitionKind::ZeroOrOne
1357                        | ast::RepetitionKind::Range(ast::RepetitionRange::Bounded(0, _))
1358                );
1359                if nullable {
1360                    None
1361                } else {
1362                    Self::edge_class_ast(&r.ast, left)
1363                }
1364            }
1365            _ => None,
1366        }
1367    }
1368
1369    fn resolve_word_kind(
1370        &mut self,
1371        asts: &[Ast],
1372        idx: usize,
1373        dir: isize,
1374        translator: &mut Option<Translator>,
1375        tb: &mut TB<'s>,
1376        word_id: NodeId,
1377        not_word_id: NodeId,
1378    ) -> Result<WordCharKind> {
1379        use WordCharKind::*;
1380        let fast = Self::concat_neighbor_kind(asts, idx, dir);
1381        if fast != Unknown {
1382            return Ok(fast);
1383        }
1384        let neighbor_idx = (idx as isize + dir) as usize;
1385        let node = if let Some(edge) = Self::edge_class_ast(&asts[neighbor_idx], dir < 0) {
1386            self.ast_to_node_id(edge, translator, tb)?
1387        } else {
1388            // check if \w_* (starts-with-word) or \W_* (starts-with-non-word) subsumes it.
1389            let neighbor_node = self.ast_to_node_id(&asts[neighbor_idx], translator, tb)?;
1390            let mut neighbor_node = tb
1391                .try_elim_lookarounds(neighbor_node)
1392                .ok_or_else(|| self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))?;
1393            if dir < 0 {
1394                neighbor_node = tb.reverse(neighbor_node).or_else(|_| {
1395                    Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))
1396                })?;
1397            }
1398            let word_prefix = tb.mk_concat(word_id, NodeId::TS);
1399            let non_word_prefix = tb.mk_concat(not_word_id, NodeId::TS);
1400            return if tb.subsumes(word_prefix, neighbor_node) == Some(true) {
1401                Ok(Word)
1402            } else if tb.subsumes(non_word_prefix, neighbor_node) == Some(true) {
1403                Ok(NonWord)
1404            } else {
1405                Ok(Unknown)
1406            };
1407        };
1408        if tb.subsumes(word_id, node) == Some(true) {
1409            Ok(Word)
1410        } else if tb.subsumes(not_word_id, node) == Some(true) {
1411            Ok(NonWord)
1412        } else {
1413            Ok(Unknown)
1414        }
1415    }
1416
1417    fn concat_neighbor_kind(asts: &[Ast], idx: usize, dir: isize) -> WordCharKind {
1418        use WordCharKind::*;
1419        let next = idx as isize + dir;
1420        if next < 0 || next >= asts.len() as isize {
1421            return Edge;
1422        }
1423        let kind = Self::word_char_kind(&asts[next as usize], dir < 0);
1424        match kind {
1425            MaybeWord => match Self::concat_neighbor_kind(asts, next as usize, dir) {
1426                Word => Word,
1427                _ => Unknown,
1428            },
1429            MaybeNonWord => match Self::concat_neighbor_kind(asts, next as usize, dir) {
1430                NonWord => NonWord,
1431                _ => Unknown,
1432            },
1433            other => other,
1434        }
1435    }
1436
1437    fn rewrite_word_boundary_in_concat(
1438        &mut self,
1439        asts: &[Ast],
1440        idx: usize,
1441        translator: &mut Option<Translator>,
1442        tb: &mut TB<'s>,
1443    ) -> Result<(NodeId, usize)> {
1444        use WordCharKind::*;
1445        let (word_id, not_word_id) = if self.global_full_unicode {
1446            self.unicode_classes.ensure_word_full(tb);
1447            (self.unicode_classes.word, self.unicode_classes.non_word)
1448        } else if self.global_unicode && !self.global_ascii_perl {
1449            self.unicode_classes.ensure_word(tb);
1450            (self.unicode_classes.word, self.unicode_classes.non_word)
1451        } else {
1452            let az = tb.mk_range_u8(b'a', b'z');
1453            let big = tb.mk_range_u8(b'A', b'Z');
1454            let dig = tb.mk_range_u8(b'0', b'9');
1455            let us = tb.mk_u8(b'_');
1456            let w = tb.mk_unions([az, big, dig, us].into_iter());
1457            (w, tb.mk_compl(w))
1458        };
1459        let left = self.resolve_word_kind(asts, idx, -1, translator, tb, word_id, not_word_id)?;
1460        let right = self.resolve_word_kind(asts, idx, 1, translator, tb, word_id, not_word_id)?;
1461        match (left, right) {
1462            (NonWord, Word) | (Word, NonWord) => Ok((NodeId::EPS, idx + 1)),
1463            (Word, _) => {
1464                let neg = tb.mk_neg_lookahead(word_id, 0);
1465                Ok((neg, idx + 1))
1466            }
1467            (NonWord, _) => {
1468                let tail = tb.mk_concat(word_id, NodeId::TS);
1469                self.merge_boundary_with_following_lookaheads(asts, idx, tail, translator, tb)
1470            }
1471            (_, Word) => Ok((tb.mk_neg_lookbehind(word_id), idx + 1)),
1472            (_, NonWord) => Ok((tb.mk_lookbehind(word_id, NodeId::MISSING), idx + 1)),
1473            // TODO: (Unknown, Unknown) is possible via make_full_word_boundary but
1474            // the full expansion (lb(\w)·la(\W) | lb(\W)·la(\w)) is too expensive
1475            // reimplement once/if the builder is more optimized
1476            _ => Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex)),
1477        }
1478    }
1479
1480    fn merge_boundary_with_following_lookaheads(
1481        &mut self,
1482        asts: &[Ast],
1483        wb_idx: usize,
1484        boundary_tail: NodeId,
1485        translator: &mut Option<Translator>,
1486        tb: &mut TB<'s>,
1487    ) -> Result<(NodeId, usize)> {
1488        let mut next = wb_idx + 1;
1489        let mut la_bodies = vec![boundary_tail];
1490        while next < asts.len() {
1491            match &asts[next] {
1492                Ast::Lookaround(la) if la.kind == ast::LookaroundKind::PositiveLookahead => {
1493                    let body = self.ast_to_node_id(&la.ast, translator, tb)?;
1494                    la_bodies.push(tb.mk_concat(body, NodeId::TS));
1495                    next += 1;
1496                }
1497                _ => break,
1498            }
1499        }
1500        let merged = tb.mk_inters(la_bodies.into_iter());
1501        Ok((tb.mk_lookahead(merged, NodeId::MISSING, 0), next))
1502    }
1503
1504    fn ast_to_node_id(
1505        &mut self,
1506        ast: &Ast,
1507        translator: &mut Option<Translator>,
1508        tb: &mut TB<'s>,
1509    ) -> Result<NodeId> {
1510        match ast {
1511            Ast::Empty(_) => Ok(NodeId::EPS),
1512            Ast::Flags(f) => {
1513                if f.flags.flag_state(ast::Flag::SwapGreed).is_some() {
1514                    return Err(self.error(f.span, ast::ErrorKind::UnsupportedResharpRegex));
1515                }
1516                let mut translator_builder = self.default_translator_builder();
1517                if let Some(state) = f.flags.flag_state(ast::Flag::CaseInsensitive) {
1518                    translator_builder.case_insensitive(state);
1519                }
1520                if let Some(state) = f.flags.flag_state(ast::Flag::Unicode) {
1521                    translator_builder.unicode(state);
1522                }
1523                if let Some(state) = f.flags.flag_state(ast::Flag::DotMatchesNewLine) {
1524                    self.dot_all.set(state);
1525                }
1526                let concat_translator = Some(translator_builder.build());
1527                *translator = concat_translator;
1528                Ok(NodeId::EPS)
1529            }
1530            Ast::Literal(l) => {
1531                let ast_lit = regex_syntax::ast::Ast::literal(*l.to_owned());
1532                self.translator_to_node_id(&ast_lit, translator, tb)
1533            }
1534            Ast::Top(_) => Ok(NodeId::TOP),
1535            Ast::Dot(_) => {
1536                let hirv = match (self.global_ascii_perl, self.dot_all.get()) {
1537                    (true, true) => hir::Hir::dot(hir::Dot::AnyChar),
1538                    (true, false) => hir::Hir::dot(hir::Dot::AnyCharExceptLF),
1539                    (false, true) => return Ok(NodeId::TOP),
1540                    (false, false) => hir::Hir::dot(hir::Dot::AnyByteExceptLF),
1541                };
1542                self.hir_to_node_id(&hirv, tb)
1543            }
1544            Ast::Assertion(a) => match &a.kind {
1545                ast::AssertionKind::StartText => Ok(NodeId::BEGIN),
1546                ast::AssertionKind::EndText => Ok(NodeId::END),
1547                ast::AssertionKind::WordBoundary => {
1548                    Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))
1549                }
1550                ast::AssertionKind::NotWordBoundary => {
1551                    Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))
1552                }
1553                ast::AssertionKind::StartLine => {
1554                    let left = NodeId::BEGIN;
1555                    let right = tb.mk_u8(b'\n');
1556                    let union = tb.mk_union(left, right);
1557                    Ok(tb.mk_lookbehind(union, NodeId::MISSING))
1558                }
1559                ast::AssertionKind::EndLine => {
1560                    let left = NodeId::END;
1561                    let right = tb.mk_u8(b'\n');
1562                    let union = tb.mk_union(left, right);
1563                    Ok(tb.mk_lookahead(union, NodeId::MISSING, 0))
1564                }
1565                ast::AssertionKind::WordBoundaryStart => {
1566                    Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
1567                }
1568                ast::AssertionKind::WordBoundaryEnd => {
1569                    Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
1570                }
1571                ast::AssertionKind::WordBoundaryStartAngle => {
1572                    Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
1573                }
1574                ast::AssertionKind::WordBoundaryEndAngle => {
1575                    Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
1576                }
1577                ast::AssertionKind::WordBoundaryStartHalf => {
1578                    Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
1579                }
1580                ast::AssertionKind::WordBoundaryEndHalf => {
1581                    Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
1582                }
1583            },
1584            Ast::ClassUnicode(c) => {
1585                let tmp = regex_syntax::ast::ClassUnicode {
1586                    span: c.span,
1587                    negated: c.negated,
1588                    kind: c.kind.clone(),
1589                };
1590                if !c.negated {
1591                    if let regex_syntax::ast::ClassUnicodeKind::Named(s) = &c.kind {
1592                        match s.as_str() {
1593                            // \p{ascii} for ascii, \p{ascii}&\p{Letter} => [A-Za-z]
1594                            "ascii" => return Ok(tb.mk_range_u8(0, 127)),
1595                            // restricts matches to valid utf8, \p{utf8}*&~(a) => non a, but valid utf8
1596                            "utf8" => {
1597                                let ascii = tb.mk_range_u8(0, 127);
1598                                let beta = tb.mk_range_u8(128, 0xBF);
1599                                let c0 = tb.mk_range_u8(0xC0, 0xDF);
1600                                let c0s = tb.mk_concats([c0, beta].into_iter());
1601                                let e0 = tb.mk_range_u8(0xE0, 0xEF);
1602                                let e0s = tb.mk_concats([e0, beta, beta].into_iter());
1603                                let f0 = tb.mk_range_u8(0xF0, 0xF7);
1604                                let f0s = tb.mk_concats([f0, beta, beta, beta].into_iter());
1605                                let merged = tb.mk_unions([ascii, c0s, e0s, f0s].into_iter());
1606                                return Ok(tb.mk_star(merged));
1607                            }
1608                            "hex" => {
1609                                let nums = tb.mk_range_u8(b'0', b'9');
1610                                let lets = tb.mk_range_u8(b'a', b'f');
1611                                let lets2 = tb.mk_range_u8(b'A', b'F');
1612                                let merged = tb.mk_unions([nums, lets, lets2].into_iter());
1613                                return Ok(merged);
1614                            }
1615                            _ => {}
1616                        }
1617                    };
1618                }
1619
1620                let orig_ast = regex_syntax::ast::Ast::class_unicode(tmp);
1621                self.translator_to_node_id(&orig_ast, translator, tb)
1622            }
1623            Ast::ClassPerl(c) => self.get_class(c.negated, c.kind.clone(), tb),
1624            Ast::ClassBracketed(c) => match &c.kind {
1625                regex_syntax::ast::ClassSet::Item(item) => {
1626                    if !c.negated && is_universal_perl_pair(item) {
1627                        return Ok(NodeId::TOP);
1628                    }
1629                    let tmp = regex_syntax::ast::ClassBracketed {
1630                        span: c.span,
1631                        negated: c.negated,
1632                        kind: c.kind.clone(),
1633                    };
1634                    let orig_ast = regex_syntax::ast::Ast::class_bracketed(tmp);
1635                    self.translator_to_node_id(&orig_ast, translator, tb)
1636                }
1637                regex_syntax::ast::ClassSet::BinaryOp(_) => {
1638                    Err(self.error(c.span, ast::ErrorKind::UnsupportedResharpRegex))
1639                }
1640            },
1641            Ast::Repetition(r) => {
1642                let body = self.ast_to_node_id(&r.ast, translator, tb);
1643                match body {
1644                    Ok(body) => match &r.op.kind {
1645                        ast::RepetitionKind::ZeroOrOne => Ok(tb.mk_opt(body)),
1646                        ast::RepetitionKind::ZeroOrMore => Ok(tb.mk_star(body)),
1647                        ast::RepetitionKind::OneOrMore => Ok(tb.mk_plus(body)),
1648                        ast::RepetitionKind::Range(r) => match r {
1649                            ast::RepetitionRange::Exactly(n) => Ok(tb.mk_repeat(body, *n, *n)),
1650                            ast::RepetitionRange::AtLeast(n) => {
1651                                let rep = tb.mk_repeat(body, *n, *n);
1652                                let st = tb.mk_star(body);
1653                                Ok(tb.mk_concat(rep, st))
1654                            }
1655
1656                            ast::RepetitionRange::Bounded(n, m) => Ok(tb.mk_repeat(body, *n, *m)),
1657                        },
1658                    },
1659                    Err(_) => body,
1660                }
1661            }
1662            Ast::Lookaround(g) => {
1663                let body = self.ast_to_node_id(&g.ast, translator, tb)?;
1664                match g.kind {
1665                    ast::LookaroundKind::PositiveLookahead => {
1666                        Ok(tb.mk_lookahead(body, NodeId::MISSING, 0))
1667                    }
1668                    ast::LookaroundKind::PositiveLookbehind => {
1669                        Ok(tb.mk_lookbehind(body, NodeId::MISSING))
1670                    }
1671                    ast::LookaroundKind::NegativeLookahead => Ok(tb.mk_neg_lookahead(body, 0)),
1672                    ast::LookaroundKind::NegativeLookbehind => Ok(tb.mk_neg_lookbehind(body)),
1673                }
1674            }
1675            Ast::Group(g) => {
1676                if let ast::GroupKind::NonCapturing(ref flags) = g.kind {
1677                    if !flags.items.is_empty() {
1678                        let mut translator_builder = self.default_translator_builder();
1679                        if let Some(state) = flags.flag_state(ast::Flag::CaseInsensitive) {
1680                            translator_builder.case_insensitive(state);
1681                        }
1682                        if let Some(state) = flags.flag_state(ast::Flag::Unicode) {
1683                            translator_builder.unicode(state);
1684                        }
1685                        let saved_dot_all = self.dot_all.get();
1686                        if let Some(state) = flags.flag_state(ast::Flag::DotMatchesNewLine) {
1687                            self.dot_all.set(state);
1688                        }
1689                        let mut scoped = Some(translator_builder.build());
1690                        let result = self.ast_to_node_id(&g.ast, &mut scoped, tb);
1691                        self.dot_all.set(saved_dot_all);
1692                        return result;
1693                    }
1694                }
1695                self.ast_to_node_id(&g.ast, translator, tb)
1696            }
1697            Ast::Alternation(a) => {
1698                let mut children = vec![];
1699                for ast in &a.asts {
1700                    match self.ast_to_node_id(ast, translator, tb) {
1701                        Ok(node_id) => children.push(node_id),
1702                        Err(err) => return Err(err),
1703                    }
1704                }
1705                Ok(tb.mk_unions(children.iter().copied()))
1706            }
1707            Ast::Concat(c) => {
1708                let mut concat_translator: Option<Translator> = None;
1709                let mut children = vec![];
1710                let mut i = 0;
1711                while i < c.asts.len() {
1712                    let ast = &c.asts[i];
1713                    match ast {
1714                        Ast::Flags(f) => {
1715                            if f.flags.flag_state(ast::Flag::SwapGreed).is_some() {
1716                                return Err(
1717                                    self.error(f.span, ast::ErrorKind::UnsupportedResharpRegex)
1718                                );
1719                            }
1720                            let mut translator_builder = self.default_translator_builder();
1721                            if let Some(state) = f.flags.flag_state(ast::Flag::CaseInsensitive) {
1722                                translator_builder.case_insensitive(state);
1723                            }
1724                            if let Some(state) = f.flags.flag_state(ast::Flag::Unicode) {
1725                                translator_builder.unicode(state);
1726                            }
1727                            if let Some(state) = f.flags.flag_state(ast::Flag::DotMatchesNewLine) {
1728                                self.dot_all.set(state);
1729                            }
1730                            concat_translator = Some(translator_builder.build());
1731                            *translator = concat_translator.clone();
1732                            i += 1;
1733                            continue;
1734                        }
1735                        Ast::Assertion(a) if a.kind == ast::AssertionKind::WordBoundary => {
1736                            let node =
1737                                self.rewrite_word_boundary_in_concat(&c.asts, i, translator, tb)?;
1738                            children.push(node.0);
1739                            i = node.1; // skip consumed lookaheads
1740                            continue;
1741                        }
1742                        _ => {}
1743                    }
1744                    match concat_translator {
1745                        Some(_) => match self.ast_to_node_id(ast, &mut concat_translator, tb) {
1746                            Ok(node_id) => children.push(node_id),
1747                            Err(err) => return Err(err),
1748                        },
1749                        None => match self.ast_to_node_id(ast, translator, tb) {
1750                            Ok(node_id) => children.push(node_id),
1751                            Err(err) => return Err(err),
1752                        },
1753                    }
1754                    i += 1;
1755                }
1756                Ok(tb.mk_concats(children.iter().cloned()))
1757            }
1758            Ast::Intersection(intersection) => {
1759                let mut children = vec![];
1760                for ast in &intersection.asts {
1761                    match self.ast_to_node_id(ast, translator, tb) {
1762                        Ok(node_id) => children.push(node_id),
1763                        Err(err) => return Err(err),
1764                    }
1765                }
1766                Ok(tb.mk_inters(children.into_iter()))
1767            }
1768            Ast::Complement(complement) => {
1769                let body = self.ast_to_node_id(&complement.ast, translator, tb);
1770                body.map(|x| tb.mk_compl(x))
1771            }
1772        }
1773    }
1774
1775    fn parse_inner(&mut self) -> Result<Ast> {
1776        let mut concat = Concat {
1777            span: self.span(),
1778            asts: vec![],
1779        };
1780        loop {
1781            self.bump_space();
1782            if self.is_eof() {
1783                break;
1784            }
1785            match self.char() {
1786                '(' => concat = self.push_group(concat)?,
1787                ')' => concat = self.pop_group(concat)?,
1788                '|' => concat = self.push_alternate(concat)?,
1789                '&' => concat = self.push_intersect(concat)?,
1790                '~' => concat = self.push_compl_group(concat)?,
1791                '[' => {
1792                    let class = self.parse_set_class()?;
1793                    concat.asts.push(Ast::class_bracketed(class));
1794                }
1795                '?' => {
1796                    concat =
1797                        self.parse_uncounted_repetition(concat, ast::RepetitionKind::ZeroOrOne)?;
1798                }
1799                '*' => {
1800                    concat =
1801                        self.parse_uncounted_repetition(concat, ast::RepetitionKind::ZeroOrMore)?;
1802                }
1803                '+' => {
1804                    concat =
1805                        self.parse_uncounted_repetition(concat, ast::RepetitionKind::OneOrMore)?;
1806                }
1807                '{' => {
1808                    concat = self.parse_counted_repetition(concat)?;
1809                }
1810                _ => concat.asts.push(self.parse_primitive()?.into_ast()),
1811            }
1812        }
1813        let ast = self.pop_group_end(concat)?;
1814        if expanded_ast_size(&ast, EXPANDED_AST_LIMIT) >= EXPANDED_AST_LIMIT
1815            || max_list_length(&ast) >= MAX_LIST_LEN
1816        {
1817            return Err(self.error(*ast.span(), ast::ErrorKind::UnsupportedResharpRegex));
1818        }
1819        Ok(ast)
1820    }
1821
1822    /// Parse the regular expression and return an abstract syntax tree with
1823    /// all of the comments found in the pattern.
1824    fn parse(&mut self, tb: &mut TB<'s>) -> Result<NodeId> {
1825        let ast = self.parse_inner()?;
1826        self.ast_to_node_id(&ast, &mut None, tb)
1827    }
1828
1829    #[inline(never)]
1830    fn parse_uncounted_repetition(
1831        &self,
1832        mut concat: ast::Concat,
1833        kind: ast::RepetitionKind,
1834    ) -> Result<ast::Concat> {
1835        // assert!(self.char() == '?' || self.char() == '*' || self.char() == '+');
1836        let op_start = self.pos();
1837        let ast = match concat.asts.pop() {
1838            Some(ast) => ast,
1839            None => return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing)),
1840        };
1841        match ast {
1842            Ast::Empty(_) | Ast::Flags(_) => {
1843                return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing))
1844            }
1845            _ => {}
1846        }
1847        if self.bump() && self.char() == '?' {
1848            return Err(self.error(
1849                Span::new(op_start, self.pos()),
1850                ast::ErrorKind::UnsupportedLazyQuantifier,
1851            ));
1852        }
1853        concat.asts.push(Ast::repetition(ast::Repetition {
1854            span: ast.span().with_end(self.pos()),
1855            op: ast::RepetitionOp {
1856                span: Span::new(op_start, self.pos()),
1857                kind,
1858            },
1859            greedy: true,
1860            ast: Box::new(ast),
1861        }));
1862        Ok(concat)
1863    }
1864
1865    #[inline(never)]
1866    fn parse_counted_repetition(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
1867        assert!(self.char() == '{');
1868        let start = self.pos();
1869        let ast = match concat.asts.pop() {
1870            Some(ast) => ast,
1871            None => return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing)),
1872        };
1873        match ast {
1874            Ast::Empty(_) | Ast::Flags(_) => {
1875                return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing))
1876            }
1877            _ => {}
1878        }
1879        if !self.bump_and_bump_space() {
1880            return Err(self.error(
1881                Span::new(start, self.pos()),
1882                ast::ErrorKind::RepetitionCountUnclosed,
1883            ));
1884        }
1885        let count_start = specialize_err(
1886            self.parse_decimal(),
1887            ast::ErrorKind::DecimalEmpty,
1888            ast::ErrorKind::RepetitionCountDecimalEmpty,
1889        );
1890        if self.is_eof() {
1891            return Err(self.error(
1892                Span::new(start, self.pos()),
1893                ast::ErrorKind::RepetitionCountUnclosed,
1894            ));
1895        }
1896        let range = if self.char() == ',' {
1897            if !self.bump_and_bump_space() {
1898                return Err(self.error(
1899                    Span::new(start, self.pos()),
1900                    ast::ErrorKind::RepetitionCountUnclosed,
1901                ));
1902            }
1903            if self.char() != '}' {
1904                let count_start = match count_start {
1905                    Ok(c) => c,
1906                    Err(err) if err.kind == ast::ErrorKind::RepetitionCountDecimalEmpty => {
1907                        if self.parser().empty_min_range {
1908                            0
1909                        } else {
1910                            return Err(err);
1911                        }
1912                    }
1913                    err => err?,
1914                };
1915                let count_end = specialize_err(
1916                    self.parse_decimal(),
1917                    ast::ErrorKind::DecimalEmpty,
1918                    ast::ErrorKind::RepetitionCountDecimalEmpty,
1919                )?;
1920                ast::RepetitionRange::Bounded(count_start, count_end)
1921            } else {
1922                ast::RepetitionRange::AtLeast(count_start?)
1923            }
1924        } else {
1925            ast::RepetitionRange::Exactly(count_start?)
1926        };
1927
1928        if self.is_eof() || self.char() != '}' {
1929            return Err(self.error(
1930                Span::new(start, self.pos()),
1931                ast::ErrorKind::RepetitionCountUnclosed,
1932            ));
1933        }
1934
1935        if self.bump_and_bump_space() && self.char() == '?' {
1936            return Err(self.error(
1937                Span::new(start, self.pos()),
1938                ast::ErrorKind::UnsupportedLazyQuantifier,
1939            ));
1940        }
1941
1942        let op_span = Span::new(start, self.pos());
1943        if !range.is_valid() {
1944            return Err(self.error(op_span, ast::ErrorKind::RepetitionCountInvalid));
1945        }
1946
1947        let over_limit = match &range {
1948            ast::RepetitionRange::Exactly(n) => *n > REPETITION_COUNT_LIMIT,
1949            ast::RepetitionRange::AtLeast(n) => *n > REPETITION_COUNT_LIMIT,
1950            ast::RepetitionRange::Bounded(n, m) => {
1951                *n > REPETITION_COUNT_LIMIT || *m > REPETITION_COUNT_LIMIT
1952            }
1953        };
1954        if over_limit {
1955            return Err(self.error(op_span, ast::ErrorKind::UnsupportedResharpRegex));
1956        }
1957        concat.asts.push(Ast::repetition(ast::Repetition {
1958            span: ast.span().with_end(self.pos()),
1959            op: ast::RepetitionOp {
1960                span: op_span,
1961                kind: ast::RepetitionKind::Range(range),
1962            },
1963            greedy: true,
1964            ast: Box::new(ast),
1965        }));
1966        Ok(concat)
1967    }
1968
1969    #[inline(never)]
1970    fn parse_group(&self) -> Result<Either<ast::SetFlags, ast::Group>> {
1971        assert_eq!(self.char(), '(');
1972        let open_span = self.span_char();
1973        self.bump();
1974        self.bump_space();
1975        if let Some((ahead, pos)) = self.is_lookaround_prefix() {
1976            let kind = match (pos, ahead) {
1977                (true, true) => LookaroundKind::PositiveLookahead,
1978                (true, false) => LookaroundKind::PositiveLookbehind,
1979                (false, true) => LookaroundKind::NegativeLookahead,
1980                (false, false) => LookaroundKind::NegativeLookbehind,
1981            };
1982            return Ok(Either::Right(ast::Group {
1983                span: open_span,
1984                kind: ast::GroupKind::Lookaround(kind),
1985                ast: Box::new(Ast::empty(self.span())),
1986            }));
1987        }
1988        let inner_span = self.span();
1989        let mut starts_with_p = true;
1990        if self.bump_if("?P<") || {
1991            starts_with_p = false;
1992            self.bump_if("?<")
1993        } {
1994            let capture_index = self.next_capture_index(open_span)?;
1995            let name = self.parse_capture_name(capture_index)?;
1996            Ok(Either::Right(ast::Group {
1997                span: open_span,
1998                kind: ast::GroupKind::CaptureName {
1999                    starts_with_p,
2000                    name,
2001                },
2002                ast: Box::new(Ast::empty(self.span())),
2003            }))
2004        } else if self.bump_if("?") {
2005            if self.is_eof() {
2006                return Err(self.error(open_span, ast::ErrorKind::GroupUnclosed));
2007            }
2008            let flags = self.parse_flags()?;
2009            let char_end = self.char();
2010            self.bump();
2011            if char_end == ')' {
2012                // We don't allow empty flags, e.g., `(?)`. We instead
2013                // interpret it as a repetition operator missing its argument.
2014                if flags.items.is_empty() {
2015                    return Err(self.error(inner_span, ast::ErrorKind::RepetitionMissing));
2016                }
2017                Ok(Either::Left(ast::SetFlags {
2018                    span: Span {
2019                        end: self.pos(),
2020                        ..open_span
2021                    },
2022                    flags,
2023                }))
2024            } else {
2025                assert_eq!(char_end, ':');
2026                Ok(Either::Right(ast::Group {
2027                    span: open_span,
2028                    kind: ast::GroupKind::NonCapturing(flags),
2029                    ast: Box::new(Ast::empty(self.span())),
2030                }))
2031            }
2032        } else {
2033            let capture_index = self.next_capture_index(open_span)?;
2034            Ok(Either::Right(ast::Group {
2035                span: open_span,
2036                kind: ast::GroupKind::CaptureIndex(capture_index),
2037                ast: Box::new(Ast::empty(self.span())),
2038            }))
2039        }
2040    }
2041
2042    #[inline(never)]
2043    fn parse_capture_name(&self, capture_index: u32) -> Result<ast::CaptureName> {
2044        if self.is_eof() {
2045            return Err(self.error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
2046        }
2047        let start = self.pos();
2048        loop {
2049            if self.char() == '>' {
2050                break;
2051            }
2052            if !is_capture_char(self.char(), self.pos() == start) {
2053                return Err(self.error(self.span_char(), ast::ErrorKind::GroupNameInvalid));
2054            }
2055            if !self.bump() {
2056                break;
2057            }
2058        }
2059        let end = self.pos();
2060        if self.is_eof() {
2061            return Err(self.error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
2062        }
2063        assert_eq!(self.char(), '>');
2064        self.bump();
2065        let name = &self.pattern()[start.offset..end.offset];
2066        if name.is_empty() {
2067            return Err(self.error(Span::new(start, start), ast::ErrorKind::GroupNameEmpty));
2068        }
2069        let capname = ast::CaptureName {
2070            span: Span::new(start, end),
2071            name: name.to_string(),
2072            index: capture_index,
2073        };
2074        self.add_capture_name(&capname)?;
2075        Ok(capname)
2076    }
2077
2078    #[inline(never)]
2079    fn parse_flags(&self) -> Result<ast::Flags> {
2080        let mut flags = ast::Flags {
2081            span: self.span(),
2082            items: vec![],
2083        };
2084        let mut last_was_negation = None;
2085        while self.char() != ':' && self.char() != ')' {
2086            if self.char() == '-' {
2087                last_was_negation = Some(self.span_char());
2088                let item = ast::FlagsItem {
2089                    span: self.span_char(),
2090                    kind: ast::FlagsItemKind::Negation,
2091                };
2092                if let Some(i) = flags.add_item(item) {
2093                    return Err(self.error(
2094                        self.span_char(),
2095                        ast::ErrorKind::FlagRepeatedNegation {
2096                            original: flags.items[i].span,
2097                        },
2098                    ));
2099                }
2100            } else {
2101                last_was_negation = None;
2102                let item = ast::FlagsItem {
2103                    span: self.span_char(),
2104                    kind: ast::FlagsItemKind::Flag(self.parse_flag()?),
2105                };
2106                if let Some(i) = flags.add_item(item) {
2107                    return Err(self.error(
2108                        self.span_char(),
2109                        ast::ErrorKind::FlagDuplicate {
2110                            original: flags.items[i].span,
2111                        },
2112                    ));
2113                }
2114            }
2115            if !self.bump() {
2116                return Err(self.error(self.span(), ast::ErrorKind::FlagUnexpectedEof));
2117            }
2118        }
2119        if let Some(span) = last_was_negation {
2120            return Err(self.error(span, ast::ErrorKind::FlagDanglingNegation));
2121        }
2122        flags.span.end = self.pos();
2123        Ok(flags)
2124    }
2125
2126    #[inline(never)]
2127    fn parse_flag(&self) -> Result<ast::Flag> {
2128        match self.char() {
2129            'i' => Ok(ast::Flag::CaseInsensitive),
2130            'm' => Ok(ast::Flag::MultiLine),
2131            's' => Ok(ast::Flag::DotMatchesNewLine),
2132            'U' => Ok(ast::Flag::SwapGreed),
2133            'u' => Ok(ast::Flag::Unicode),
2134            'R' => Ok(ast::Flag::CRLF),
2135            'x' => Ok(ast::Flag::IgnoreWhitespace),
2136            _ => Err(self.error(self.span_char(), ast::ErrorKind::FlagUnrecognized)),
2137        }
2138    }
2139
2140    fn parse_primitive(&self) -> Result<Primitive> {
2141        match self.char() {
2142            '\\' => self.parse_escape(),
2143            '_' => {
2144                let ast = Primitive::Top(self.span_char());
2145                self.bump();
2146                Ok(ast)
2147            }
2148            '.' => {
2149                let ast = Primitive::Dot(self.span_char());
2150                self.bump();
2151                Ok(ast)
2152            }
2153            '^' => {
2154                let ast = Primitive::Assertion(ast::Assertion {
2155                    span: self.span_char(),
2156                    kind: ast::AssertionKind::StartLine,
2157                });
2158                self.bump();
2159                Ok(ast)
2160            }
2161            '$' => {
2162                let ast = Primitive::Assertion(ast::Assertion {
2163                    span: self.span_char(),
2164                    kind: ast::AssertionKind::EndLine,
2165                });
2166                self.bump();
2167                Ok(ast)
2168            }
2169            c => {
2170                let ast = Primitive::Literal(Literal {
2171                    span: self.span_char(),
2172                    kind: LiteralKind::Verbatim,
2173                    c,
2174                });
2175                self.bump();
2176                Ok(ast)
2177            }
2178        }
2179    }
2180
2181    #[inline(never)]
2182    fn parse_escape(&self) -> Result<Primitive> {
2183        assert_eq!(self.char(), '\\');
2184        let start = self.pos();
2185        if !self.bump() {
2186            return Err(self.error(
2187                Span::new(start, self.pos()),
2188                ast::ErrorKind::EscapeUnexpectedEof,
2189            ));
2190        }
2191        let c = self.char();
2192        // Put some of the more complicated routines into helpers.
2193        match c {
2194            '0'..='9' => {
2195                if !self.parser().octal {
2196                    return Err(self.error(
2197                        Span::new(start, self.span_char().end),
2198                        ast::ErrorKind::UnsupportedBackreference,
2199                    ));
2200                }
2201                let mut lit = self.parse_octal();
2202                lit.span.start = start;
2203                return Ok(Primitive::Literal(lit));
2204            }
2205            // '8'..='9' if !self.parser().octal => {
2206            //     return Err(self.error(
2207            //         Span::new(start, self.span_char().end),
2208            //         ast::ErrorKind::UnsupportedBackreference,
2209            //     ));
2210            // }
2211            'x' | 'u' | 'U' => {
2212                let mut lit = self.parse_hex()?;
2213                lit.span.start = start;
2214                return Ok(Primitive::Literal(lit));
2215            }
2216            'p' | 'P' => {
2217                let mut cls = self.parse_unicode_class()?;
2218                cls.span.start = start;
2219                return Ok(Primitive::Unicode(cls));
2220            }
2221            'd' | 's' | 'w' | 'D' | 'S' | 'W' => {
2222                let mut cls = self.parse_perl_class();
2223                cls.span.start = start;
2224                return Ok(Primitive::Perl(cls));
2225            }
2226            _ => {}
2227        }
2228
2229        // Handle all of the one letter sequences inline.
2230        self.bump();
2231        let span = Span::new(start, self.pos());
2232        if is_meta_character(c) {
2233            return Ok(Primitive::Literal(Literal {
2234                span,
2235                kind: LiteralKind::Meta,
2236                c,
2237            }));
2238        }
2239        if is_escapeable_character(c) {
2240            return Ok(Primitive::Literal(Literal {
2241                span,
2242                kind: LiteralKind::Superfluous,
2243                c,
2244            }));
2245        }
2246        let special = |kind, c| {
2247            Ok(Primitive::Literal(Literal {
2248                span,
2249                kind: LiteralKind::Special(kind),
2250                c,
2251            }))
2252        };
2253        match c {
2254            'a' => special(SpecialLiteralKind::Bell, '\x07'),
2255            'f' => special(SpecialLiteralKind::FormFeed, '\x0C'),
2256            't' => special(SpecialLiteralKind::Tab, '\t'),
2257            'n' => special(SpecialLiteralKind::LineFeed, '\n'),
2258            'r' => special(SpecialLiteralKind::CarriageReturn, '\r'),
2259            'v' => special(SpecialLiteralKind::VerticalTab, '\x0B'),
2260            'A' => Ok(Primitive::Assertion(ast::Assertion {
2261                span,
2262                kind: ast::AssertionKind::StartText,
2263            })),
2264            'z' => Ok(Primitive::Assertion(ast::Assertion {
2265                span,
2266                kind: ast::AssertionKind::EndText,
2267            })),
2268            'b' => {
2269                let mut wb = ast::Assertion {
2270                    span,
2271                    kind: ast::AssertionKind::WordBoundary,
2272                };
2273                // After a \b, we "try" to parse things like \b{start} for
2274                // special word boundary assertions.
2275                if !self.is_eof() && self.char() == '{' {
2276                    if let Some(kind) = self.maybe_parse_special_word_boundary(start)? {
2277                        wb.kind = kind;
2278                        wb.span.end = self.pos();
2279                    }
2280                }
2281                Ok(Primitive::Assertion(wb))
2282            }
2283            'B' => Ok(Primitive::Assertion(ast::Assertion {
2284                span,
2285                kind: ast::AssertionKind::NotWordBoundary,
2286            })),
2287            '<' => Ok(Primitive::Assertion(ast::Assertion {
2288                span,
2289                kind: ast::AssertionKind::WordBoundaryStartAngle,
2290            })),
2291            '>' => Ok(Primitive::Assertion(ast::Assertion {
2292                span,
2293                kind: ast::AssertionKind::WordBoundaryEndAngle,
2294            })),
2295            _ => Err(self.error(span, ast::ErrorKind::EscapeUnrecognized)),
2296        }
2297    }
2298
2299    fn maybe_parse_special_word_boundary(
2300        &self,
2301        wb_start: Position,
2302    ) -> Result<Option<ast::AssertionKind>> {
2303        assert_eq!(self.char(), '{');
2304
2305        let is_valid_char = |c| matches!(c, 'A'..='Z' | 'a'..='z' | '-');
2306        let start = self.pos();
2307        if !self.bump_and_bump_space() {
2308            return Err(self.error(
2309                Span::new(wb_start, self.pos()),
2310                ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof,
2311            ));
2312        }
2313        let start_contents = self.pos();
2314        // This is one of the critical bits: if the first non-whitespace
2315        // character isn't in [-A-Za-z] (i.e., this can't be a special word
2316        // boundary), then we bail and let the counted repetition parser deal
2317        // with this.
2318        if !is_valid_char(self.char()) {
2319            self.parser().pos.set(start);
2320            return Ok(None);
2321        }
2322
2323        // Now collect up our chars until we see a '}'.
2324        let mut scratch = self.parser().scratch.borrow_mut();
2325        scratch.clear();
2326        while !self.is_eof() && is_valid_char(self.char()) {
2327            scratch.push(self.char());
2328            self.bump_and_bump_space();
2329        }
2330        if self.is_eof() || self.char() != '}' {
2331            return Err(self.error(
2332                Span::new(start, self.pos()),
2333                ast::ErrorKind::SpecialWordBoundaryUnclosed,
2334            ));
2335        }
2336        let end = self.pos();
2337        self.bump();
2338        let kind = match scratch.as_str() {
2339            "start" => ast::AssertionKind::WordBoundaryStart,
2340            "end" => ast::AssertionKind::WordBoundaryEnd,
2341            "start-half" => ast::AssertionKind::WordBoundaryStartHalf,
2342            "end-half" => ast::AssertionKind::WordBoundaryEndHalf,
2343            _ => {
2344                return Err(self.error(
2345                    Span::new(start_contents, end),
2346                    ast::ErrorKind::SpecialWordBoundaryUnrecognized,
2347                ))
2348            }
2349        };
2350        Ok(Some(kind))
2351    }
2352
2353    #[inline(never)]
2354    fn parse_octal(&self) -> Literal {
2355        assert!(self.parser().octal);
2356        assert!('0' <= self.char() && self.char() <= '7');
2357        let start = self.pos();
2358        // Parse up to two more digits.
2359        while self.bump()
2360            && '0' <= self.char()
2361            && self.char() <= '7'
2362            && self.pos().offset - start.offset <= 2
2363        {}
2364        let end = self.pos();
2365        let octal = &self.pattern()[start.offset..end.offset];
2366        // Parsing the octal should never fail since the above guarantees a
2367        // valid number.
2368        let codepoint = u32::from_str_radix(octal, 8).expect("valid octal number");
2369        // The max value for 3 digit octal is 0777 = 511 and [0, 511] has no
2370        // invalid Unicode scalar values.
2371        let c = char::from_u32(codepoint).expect("Unicode scalar value");
2372        Literal {
2373            span: Span::new(start, end),
2374            kind: LiteralKind::Octal,
2375            c,
2376        }
2377    }
2378
2379    #[inline(never)]
2380    fn parse_hex(&self) -> Result<Literal> {
2381        assert!(self.char() == 'x' || self.char() == 'u' || self.char() == 'U');
2382
2383        let hex_kind = match self.char() {
2384            'x' => HexLiteralKind::X,
2385            'u' => HexLiteralKind::UnicodeShort,
2386            _ => HexLiteralKind::UnicodeLong,
2387        };
2388        if !self.bump_and_bump_space() {
2389            return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2390        }
2391        if self.char() == '{' {
2392            self.parse_hex_brace(hex_kind)
2393        } else {
2394            self.parse_hex_digits(hex_kind)
2395        }
2396    }
2397
2398    #[inline(never)]
2399    fn parse_hex_digits(&self, kind: HexLiteralKind) -> Result<Literal> {
2400        let mut scratch = self.parser().scratch.borrow_mut();
2401        scratch.clear();
2402
2403        let start = self.pos();
2404        for i in 0..kind.digits() {
2405            if i > 0 && !self.bump_and_bump_space() {
2406                return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2407            }
2408            if !is_hex(self.char()) {
2409                return Err(self.error(self.span_char(), ast::ErrorKind::EscapeHexInvalidDigit));
2410            }
2411            scratch.push(self.char());
2412        }
2413        // The final bump just moves the parser past the literal, which may
2414        // be EOF.
2415        self.bump_and_bump_space();
2416        let end = self.pos();
2417        let hex = scratch.as_str();
2418        match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
2419            None => Err(self.error(Span::new(start, end), ast::ErrorKind::EscapeHexInvalid)),
2420            Some(c) => Ok(Literal {
2421                span: Span::new(start, end),
2422                kind: LiteralKind::HexFixed(kind),
2423                c,
2424            }),
2425        }
2426    }
2427
2428    #[inline(never)]
2429    fn parse_hex_brace(&self, kind: HexLiteralKind) -> Result<Literal> {
2430        let mut scratch = self.parser().scratch.borrow_mut();
2431        scratch.clear();
2432
2433        let brace_pos = self.pos();
2434        let start = self.span_char().end;
2435        while self.bump_and_bump_space() && self.char() != '}' {
2436            if !is_hex(self.char()) {
2437                return Err(self.error(self.span_char(), ast::ErrorKind::EscapeHexInvalidDigit));
2438            }
2439            scratch.push(self.char());
2440        }
2441        if self.is_eof() {
2442            return Err(self.error(
2443                Span::new(brace_pos, self.pos()),
2444                ast::ErrorKind::EscapeUnexpectedEof,
2445            ));
2446        }
2447        let end = self.pos();
2448        let hex = scratch.as_str();
2449        assert_eq!(self.char(), '}');
2450        self.bump_and_bump_space();
2451
2452        if hex.is_empty() {
2453            return Err(self.error(
2454                Span::new(brace_pos, self.pos()),
2455                ast::ErrorKind::EscapeHexEmpty,
2456            ));
2457        }
2458        match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
2459            None => Err(self.error(Span::new(start, end), ast::ErrorKind::EscapeHexInvalid)),
2460            Some(c) => Ok(Literal {
2461                span: Span::new(start, self.pos()),
2462                kind: LiteralKind::HexBrace(kind),
2463                c,
2464            }),
2465        }
2466    }
2467
2468    fn parse_decimal(&self) -> Result<u32> {
2469        let mut scratch = self.parser().scratch.borrow_mut();
2470        scratch.clear();
2471
2472        while !self.is_eof() && self.char().is_whitespace() {
2473            self.bump();
2474        }
2475        let start = self.pos();
2476        while !self.is_eof() && '0' <= self.char() && self.char() <= '9' {
2477            scratch.push(self.char());
2478            self.bump_and_bump_space();
2479        }
2480        let span = Span::new(start, self.pos());
2481        while !self.is_eof() && self.char().is_whitespace() {
2482            self.bump_and_bump_space();
2483        }
2484        let digits = scratch.as_str();
2485        if digits.is_empty() {
2486            return Err(self.error(span, ast::ErrorKind::DecimalEmpty));
2487        }
2488        match digits.parse::<u32>().ok() {
2489            Some(n) => Ok(n),
2490            None => Err(self.error(span, ast::ErrorKind::DecimalInvalid)),
2491        }
2492    }
2493
2494    #[inline(never)]
2495    fn parse_set_class(&self) -> Result<ClassBracketed> {
2496        assert_eq!(self.char(), '[');
2497
2498        let mut union = ClassSetUnion {
2499            span: self.span(),
2500            items: vec![],
2501        };
2502        loop {
2503            self.bump_space();
2504            if self.is_eof() {
2505                return Err(self.unclosed_class_error());
2506            }
2507            match self.char() {
2508                '[' => {
2509                    // If we've already parsed the opening bracket, then
2510                    // attempt to treat this as the beginning of an ASCII
2511                    // class. If ASCII class parsing fails, then the parser
2512                    // backs up to `[`.
2513                    if !self.parser().stack_class.borrow().is_empty() {
2514                        if let Some(cls) = self.maybe_parse_ascii_class() {
2515                            union.push(ClassSetItem::Ascii(cls));
2516                            continue;
2517                        }
2518                    }
2519                    union = self.push_class_open(union)?;
2520                }
2521                ']' => match self.pop_class(union)? {
2522                    Either::Left(nested_union) => {
2523                        union = nested_union;
2524                    }
2525                    Either::Right(class) => return Ok(class),
2526                },
2527                '&' if self.peek() == Some('&') => {
2528                    assert!(self.bump_if("&&"));
2529                    union = self.push_class_op(ClassSetBinaryOpKind::Intersection, union);
2530                }
2531                '-' if self.peek() == Some('-') => {
2532                    assert!(self.bump_if("--"));
2533                    union = self.push_class_op(ClassSetBinaryOpKind::Difference, union);
2534                }
2535                '~' if self.peek() == Some('~') => {
2536                    assert!(self.bump_if("~~"));
2537                    union = self.push_class_op(ClassSetBinaryOpKind::SymmetricDifference, union);
2538                }
2539                _ => {
2540                    union.push(self.parse_set_class_range()?);
2541                }
2542            }
2543        }
2544    }
2545
2546    #[inline(never)]
2547    fn parse_set_class_range(&self) -> Result<ClassSetItem> {
2548        let prim1 = self.parse_set_class_item()?;
2549        self.bump_space();
2550        if self.is_eof() {
2551            return Err(self.unclosed_class_error());
2552        }
2553        if self.char() != '-' || self.peek_space() == Some(']') || self.peek_space() == Some('-') {
2554            return prim1.into_class_set_item(self);
2555        }
2556        if !self.bump_and_bump_space() {
2557            return Err(self.unclosed_class_error());
2558        }
2559        let prim2 = self.parse_set_class_item()?;
2560        let range = ClassSetRange {
2561            span: Span::new(prim1.span().start, prim2.span().end),
2562            start: prim1.into_class_literal(self)?,
2563            end: prim2.into_class_literal(self)?,
2564        };
2565        if !range.is_valid() {
2566            return Err(self.error(range.span, ast::ErrorKind::ClassRangeInvalid));
2567        }
2568        Ok(ClassSetItem::Range(range))
2569    }
2570
2571    #[inline(never)]
2572    fn parse_set_class_item(&self) -> Result<Primitive> {
2573        if self.char() == '\\' {
2574            self.parse_escape()
2575        } else {
2576            let x = Primitive::Literal(Literal {
2577                span: self.span_char(),
2578                kind: LiteralKind::Verbatim,
2579                c: self.char(),
2580            });
2581            self.bump();
2582            Ok(x)
2583        }
2584    }
2585
2586    #[inline(never)]
2587    fn parse_set_class_open(&self) -> Result<(ClassBracketed, ClassSetUnion)> {
2588        assert_eq!(self.char(), '[');
2589        let start = self.pos();
2590        if !self.bump_and_bump_space() {
2591            return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2592        }
2593
2594        let negated = if self.char() != '^' {
2595            false
2596        } else {
2597            if !self.bump_and_bump_space() {
2598                return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2599            }
2600            true
2601        };
2602        // Accept any number of `-` as literal `-`.
2603        let mut union = ClassSetUnion {
2604            span: self.span(),
2605            items: vec![],
2606        };
2607        while self.char() == '-' {
2608            union.push(ClassSetItem::Literal(Literal {
2609                span: self.span_char(),
2610                kind: LiteralKind::Verbatim,
2611                c: '-',
2612            }));
2613            if !self.bump_and_bump_space() {
2614                return Err(self.error(Span::new(start, start), ast::ErrorKind::ClassUnclosed));
2615            }
2616        }
2617        // If `]` is the *first* char in a set, then interpret it as a literal
2618        // `]`. That is, an empty class is impossible to write.
2619        if union.items.is_empty() && self.char() == ']' {
2620            union.push(ClassSetItem::Literal(Literal {
2621                span: self.span_char(),
2622                kind: LiteralKind::Verbatim,
2623                c: ']',
2624            }));
2625            if !self.bump_and_bump_space() {
2626                return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2627            }
2628        }
2629        let set = ClassBracketed {
2630            span: Span::new(start, self.pos()),
2631            negated,
2632            kind: ClassSet::union(ClassSetUnion {
2633                span: Span::new(union.span.start, union.span.start),
2634                items: vec![],
2635            }),
2636        };
2637        Ok((set, union))
2638    }
2639
2640    #[inline(never)]
2641    fn maybe_parse_ascii_class(&self) -> Option<ClassAscii> {
2642        assert_eq!(self.char(), '[');
2643        // If parsing fails, then we back up the parser to this starting point.
2644        let start = self.pos();
2645        let mut negated = false;
2646        if !self.bump() || self.char() != ':' {
2647            self.parser().pos.set(start);
2648            return None;
2649        }
2650        if !self.bump() {
2651            self.parser().pos.set(start);
2652            return None;
2653        }
2654        if self.char() == '^' {
2655            negated = true;
2656            if !self.bump() {
2657                self.parser().pos.set(start);
2658                return None;
2659            }
2660        }
2661        let name_start = self.offset();
2662        while self.char() != ':' && self.bump() {}
2663        if self.is_eof() {
2664            self.parser().pos.set(start);
2665            return None;
2666        }
2667        let name = &self.pattern()[name_start..self.offset()];
2668        if !self.bump_if(":]") {
2669            self.parser().pos.set(start);
2670            return None;
2671        }
2672        let kind = match regex_syntax::ast::ClassAsciiKind::from_name(name) {
2673            Some(kind) => kind,
2674            None => {
2675                self.parser().pos.set(start);
2676                return None;
2677            }
2678        };
2679        Some(ClassAscii {
2680            span: Span::new(start, self.pos()),
2681            kind,
2682            negated,
2683        })
2684    }
2685
2686    #[inline(never)]
2687    fn parse_unicode_class(&self) -> Result<ClassUnicode> {
2688        assert!(self.char() == 'p' || self.char() == 'P');
2689
2690        let mut scratch = self.parser().scratch.borrow_mut();
2691        scratch.clear();
2692
2693        let negated = self.char() == 'P';
2694        if !self.bump_and_bump_space() {
2695            return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2696        }
2697        let (start, kind) = if self.char() == '{' {
2698            let start = self.span_char().end;
2699            while self.bump_and_bump_space() && self.char() != '}' {
2700                scratch.push(self.char());
2701            }
2702            if self.is_eof() {
2703                return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2704            }
2705            assert_eq!(self.char(), '}');
2706            self.bump();
2707
2708            let name = scratch.as_str();
2709            if let Some(i) = name.find("!=") {
2710                (
2711                    start,
2712                    ClassUnicodeKind::NamedValue {
2713                        op: ClassUnicodeOpKind::NotEqual,
2714                        name: name[..i].to_string(),
2715                        value: name[i + 2..].to_string(),
2716                    },
2717                )
2718            } else if let Some(i) = name.find(':') {
2719                (
2720                    start,
2721                    ClassUnicodeKind::NamedValue {
2722                        op: ClassUnicodeOpKind::Colon,
2723                        name: name[..i].to_string(),
2724                        value: name[i + 1..].to_string(),
2725                    },
2726                )
2727            } else if let Some(i) = name.find('=') {
2728                (
2729                    start,
2730                    ClassUnicodeKind::NamedValue {
2731                        op: ClassUnicodeOpKind::Equal,
2732                        name: name[..i].to_string(),
2733                        value: name[i + 1..].to_string(),
2734                    },
2735                )
2736            } else {
2737                (start, ClassUnicodeKind::Named(name.to_string()))
2738            }
2739        } else {
2740            let start = self.pos();
2741            let c = self.char();
2742            if c == '\\' {
2743                return Err(self.error(self.span_char(), ast::ErrorKind::UnicodeClassInvalid));
2744            }
2745            self.bump_and_bump_space();
2746            let kind = ClassUnicodeKind::OneLetter(c);
2747            (start, kind)
2748        };
2749        Ok(ClassUnicode {
2750            span: Span::new(start, self.pos()),
2751            negated,
2752            kind,
2753        })
2754    }
2755
2756    #[inline(never)]
2757    fn parse_perl_class(&self) -> ClassPerl {
2758        let c = self.char();
2759        let span = self.span_char();
2760        self.bump();
2761        let (negated, kind) = match c {
2762            'd' => (false, regex_syntax::ast::ClassPerlKind::Digit),
2763            'D' => (true, regex_syntax::ast::ClassPerlKind::Digit),
2764            's' => (false, regex_syntax::ast::ClassPerlKind::Space),
2765            'S' => (true, regex_syntax::ast::ClassPerlKind::Space),
2766            'w' => (false, regex_syntax::ast::ClassPerlKind::Word),
2767            'W' => (true, regex_syntax::ast::ClassPerlKind::Word),
2768            c => panic!("expected valid Perl class but got '{}'", c),
2769        };
2770        ClassPerl {
2771            span,
2772            kind,
2773            negated,
2774        }
2775    }
2776}
2777
2778/// `[\s\S]`, `[\w\W]`, `[\d\D]` etc into `_` canonicalization.
2779/// should really be done as a u32 BDD but good enough for now
2780fn is_universal_perl_pair(item: &regex_syntax::ast::ClassSetItem) -> bool {
2781    use regex_syntax::ast::ClassSetItem;
2782    let items = match item {
2783        ClassSetItem::Union(u) => &u.items,
2784        _ => return false,
2785    };
2786    if items.len() != 2 {
2787        return false;
2788    }
2789    match (&items[0], &items[1]) {
2790        (ClassSetItem::Perl(a), ClassSetItem::Perl(b)) => {
2791            let is_all = a.kind == b.kind && a.negated != b.negated;
2792            is_all
2793        }
2794        _ => false,
2795    }
2796}
2797
2798pub fn max_list_length(ast: &ast::Ast) -> usize {
2799    match ast {
2800        ast::Ast::Empty(_)
2801        | ast::Ast::Flags(_)
2802        | ast::Ast::Literal(_)
2803        | ast::Ast::Dot(_)
2804        | ast::Ast::Top(_)
2805        | ast::Ast::Assertion(_)
2806        | ast::Ast::ClassUnicode(_)
2807        | ast::Ast::ClassPerl(_)
2808        | ast::Ast::ClassBracketed(_) => 0,
2809        ast::Ast::Group(g) => max_list_length(&g.ast),
2810        ast::Ast::Complement(c) => max_list_length(&c.ast),
2811        ast::Ast::Lookaround(l) => max_list_length(&l.ast),
2812        ast::Ast::Repetition(r) => max_list_length(&r.ast),
2813        ast::Ast::Concat(c) => c
2814            .asts
2815            .len()
2816            .max(c.asts.iter().map(max_list_length).max().unwrap_or(0)),
2817        ast::Ast::Alternation(a) => a
2818            .asts
2819            .len()
2820            .max(a.asts.iter().map(max_list_length).max().unwrap_or(0)),
2821        ast::Ast::Intersection(i) => i
2822            .asts
2823            .len()
2824            .max(i.asts.iter().map(max_list_length).max().unwrap_or(0)),
2825    }
2826}
2827
2828pub fn expanded_ast_size(ast: &ast::Ast, limit: u64) -> u64 {
2829    fn go(ast: &ast::Ast, limit: u64) -> u64 {
2830        match ast {
2831            ast::Ast::Empty(_) | ast::Ast::Flags(_) => 1,
2832            ast::Ast::Literal(_) | ast::Ast::Dot(_) | ast::Ast::Top(_) => 1,
2833            ast::Ast::Assertion(_) => 1,
2834            ast::Ast::ClassUnicode(_) | ast::Ast::ClassPerl(_) | ast::Ast::ClassBracketed(_) => 1,
2835            ast::Ast::Group(g) => go(&g.ast, limit).saturating_add(1).min(limit),
2836            ast::Ast::Complement(c) => go(&c.ast, limit).saturating_add(1).min(limit),
2837            ast::Ast::Lookaround(l) => go(&l.ast, limit).saturating_add(1).min(limit),
2838            ast::Ast::Concat(c) => sum_children(&c.asts, limit),
2839            ast::Ast::Alternation(a) => sum_children(&a.asts, limit),
2840            ast::Ast::Intersection(i) => sum_children(&i.asts, limit),
2841            ast::Ast::Repetition(r) => {
2842                let body = go(&r.ast, limit);
2843                let factor: u64 = match &r.op.kind {
2844                    ast::RepetitionKind::ZeroOrOne => 2,
2845                    ast::RepetitionKind::ZeroOrMore | ast::RepetitionKind::OneOrMore => 2,
2846                    ast::RepetitionKind::Range(ast::RepetitionRange::Exactly(n)) => {
2847                        (*n as u64).max(1)
2848                    }
2849                    ast::RepetitionKind::Range(ast::RepetitionRange::AtLeast(n)) => {
2850                        (*n as u64).max(1).saturating_add(1)
2851                    }
2852                    ast::RepetitionKind::Range(ast::RepetitionRange::Bounded(_, m)) => {
2853                        (*m as u64).max(1)
2854                    }
2855                };
2856                body.saturating_mul(factor).min(limit)
2857            }
2858        }
2859    }
2860    fn sum_children(children: &[ast::Ast], limit: u64) -> u64 {
2861        let mut total: u64 = 0;
2862        for c in children {
2863            total = total.saturating_add(go(c, limit));
2864            if total >= limit {
2865                return limit;
2866            }
2867        }
2868        total
2869    }
2870    go(ast, limit)
2871}
2872
2873pub fn parse_ast<'s>(
2874    tb: &mut TB<'s>,
2875    pattern: &'s str,
2876) -> std::result::Result<NodeId, ResharpError> {
2877    let mut p: ResharpParser<'s> = ResharpParser::new(pattern);
2878    p.parse(tb)
2879}
2880
2881pub fn parse_ast_with<'s>(
2882    tb: &mut TB<'s>,
2883    pattern: &'s str,
2884    flags: &PatternFlags,
2885) -> std::result::Result<NodeId, ResharpError> {
2886    let mut p: ResharpParser<'s> = ResharpParser::with_flags(pattern, flags);
2887    p.parse(tb)
2888}
2889
2890/// Parse a pattern into the raw AST without converting to algebra nodes.
2891pub fn parse_to_ast(pattern: &str) -> std::result::Result<ast::Ast, ResharpError> {
2892    let mut p: ResharpParser = ResharpParser::new(pattern);
2893    p.parse_inner()
2894}
resharp_parser/lib.rs

resharp_parser/
lib.rs