resharp_parser/
lib.rs

1//! Parser for resharp regex patterns.
2//!
3//! Converts regex pattern strings into the node representation used by resharp-algebra.
4
5#![warn(dead_code)]
6pub mod ast;
7use std::cell::{Cell, RefCell};
8
9use ast::{Ast, Concat, ErrorKind, GroupKind, LookaroundKind};
10use regex_syntax::{
11    ast::{
12        ClassAscii, ClassBracketed, ClassPerl, ClassSet, ClassSetBinaryOpKind, ClassSetItem,
13        ClassSetRange, ClassSetUnion, ClassUnicode, ClassUnicodeKind, ClassUnicodeOpKind,
14        HexLiteralKind, Literal, LiteralKind, Position, Span, SpecialLiteralKind,
15    },
16    hir::{
17        self,
18        translate::{Translator, TranslatorBuilder},
19    },
20    utf8::Utf8Sequences,
21};
22use resharp_algebra::NodeId;
23
24type TB<'s> = resharp_algebra::RegexBuilder;
25
26/// global pattern-level flags, set from `EngineOptions`.
27pub struct PatternFlags {
28    /// `\w`/`\d`/`\s` match full Unicode (true) or ASCII only (false).
29    pub unicode: bool,
30    /// `\w` covers all Unicode word chars including 3- and 4-byte sequences.
31    pub full_unicode: bool,
32    /// global case-insensitive matching.
33    pub case_insensitive: bool,
34    /// `.` matches `\n` (behaves like `_`).
35    pub dot_matches_new_line: bool,
36    /// allow whitespace and `#` comments in the pattern.
37    pub ignore_whitespace: bool,
38    /// ASCII `\w`/`\d`/`\s` tables, but
39    /// negated perl classes (`\W`/`\D`/`\S`) and `.` match a full codepoint
40    pub ascii_perl_classes: bool,
41}
42
43// arbitrary safeguards, these will not prevent intentional DoS patterns
44// more to protect you from shooting yourself in the foot
45const REPETITION_COUNT_LIMIT: u32 = 2_000;
46const EXPANDED_AST_LIMIT: u64 = 50_000;
47const MAX_LIST_LEN: usize = 4_000;
48
49impl Default for PatternFlags {
50    fn default() -> Self {
51        Self {
52            unicode: true,
53            full_unicode: false,
54            case_insensitive: false,
55            dot_matches_new_line: false,
56            ignore_whitespace: false,
57            ascii_perl_classes: false,
58        }
59    }
60}
61
62#[derive(Clone, Copy, PartialEq, Debug)]
63enum WordCharKind {
64    Word,
65    NonWord,
66    MaybeWord,
67    MaybeNonWord,
68    Unknown,
69    Edge,
70}
71
72fn is_word_byte(b: u8) -> bool {
73    b.is_ascii_alphanumeric() || b == b'_'
74}
75
76#[derive(Clone, Debug, Eq, PartialEq)]
77enum Primitive {
78    Literal(Literal),
79    Assertion(ast::Assertion),
80    Dot(Span),
81    Top(Span),
82    Perl(ClassPerl),
83    Unicode(ClassUnicode),
84}
85
86impl Primitive {
87    fn span(&self) -> &Span {
88        match *self {
89            Primitive::Literal(ref x) => &x.span,
90            Primitive::Assertion(ref x) => &x.span,
91            Primitive::Dot(ref span) => span,
92            Primitive::Top(ref span) => span,
93            Primitive::Perl(ref x) => &x.span,
94            Primitive::Unicode(ref x) => &x.span,
95        }
96    }
97
98    fn into_ast(self) -> Ast {
99        match self {
100            Primitive::Literal(lit) => Ast::literal(lit),
101            Primitive::Assertion(assert) => Ast::assertion(assert),
102            Primitive::Dot(span) => Ast::dot(span),
103            Primitive::Top(span) => Ast::top(span),
104            Primitive::Perl(cls) => Ast::class_perl(cls),
105            Primitive::Unicode(cls) => Ast::class_unicode(cls),
106        }
107    }
108
109    fn into_class_set_item(self, p: &ResharpParser) -> Result<regex_syntax::ast::ClassSetItem> {
110        use self::Primitive::*;
111        use regex_syntax::ast::ClassSetItem;
112
113        match self {
114            Literal(lit) => Ok(ClassSetItem::Literal(lit)),
115            Perl(cls) => Ok(ClassSetItem::Perl(cls)),
116            Unicode(cls) => Ok(ClassSetItem::Unicode(cls)),
117            x => Err(p.error(*x.span(), ast::ErrorKind::ClassEscapeInvalid)),
118        }
119    }
120
121    fn into_class_literal(self, p: &ResharpParser) -> Result<Literal> {
122        use self::Primitive::*;
123
124        match self {
125            Literal(lit) => Ok(lit),
126            x => Err(p.error(*x.span(), ast::ErrorKind::ClassRangeLiteral)),
127        }
128    }
129}
130
131#[derive(Clone, Debug, Eq, PartialEq)]
132pub enum Either<Left, Right> {
133    Left(Left),
134    Right(Right),
135}
136
137#[derive(Clone, Debug, Eq, PartialEq)]
138pub struct ResharpError {
139    /// The kind of error.
140    pub kind: ErrorKind,
141    /// The original pattern that the parser generated the error from. Every
142    /// span in an error is a valid range into this string.
143    pattern: String,
144    /// The span of this error.
145    pub span: Span,
146}
147
148impl std::fmt::Display for ResharpError {
149    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
150        write!(f, "{:?}: {:?}", self.kind, self.span)
151    }
152}
153impl std::error::Error for ResharpError {}
154
155type Result<T> = core::result::Result<T, ResharpError>;
156
157#[derive(Clone, Debug)]
158enum GroupState {
159    /// This state is pushed whenever an opening group is found.
160    Group {
161        /// The concatenation immediately preceding the opening group.
162        concat: Concat,
163        /// The group that has been opened. Its sub-AST is always empty.
164        group: ast::Group,
165        /// Whether this group has the `x` flag enabled or not.
166        ignore_whitespace: bool,
167    },
168    /// This state is pushed whenever a new alternation branch is found. If
169    /// an alternation branch is found and this state is at the top of the
170    /// stack, then this state should be modified to include the new
171    /// alternation.
172    Alternation(ast::Alternation),
173    Intersection(ast::Intersection),
174}
175
176#[derive(Clone, Debug)]
177enum ClassState {
178    /// This state is pushed whenever an opening bracket is found.
179    Open {
180        /// The union of class items immediately preceding this class.
181        union: regex_syntax::ast::ClassSetUnion,
182        /// The class that has been opened. Typically this just corresponds
183        /// to the `[`, but it can also include `[^` since `^` indicates
184        /// negation of the class.
185        set: regex_syntax::ast::ClassBracketed,
186    },
187    /// This state is pushed when a operator is seen. When popped, the stored
188    /// set becomes the left hand side of the operator.
189    Op {
190        /// The type of the operation, i.e., &&, -- or ~~.
191        kind: regex_syntax::ast::ClassSetBinaryOpKind,
192        /// The left-hand side of the operator.
193        lhs: regex_syntax::ast::ClassSet,
194    },
195}
196
197/// RE# syntax parser based on the regex-syntax crate.
198pub struct ResharpParser<'s> {
199    perl_classes: Vec<(bool, regex_syntax::ast::ClassPerlKind, NodeId)>,
200    unicode_classes: resharp_algebra::UnicodeClassCache,
201    pub translator: regex_syntax::hir::translate::Translator,
202    pub pattern: &'s str,
203    pos: Cell<Position>,
204    capture_index: Cell<u32>,
205    octal: bool,
206    empty_min_range: bool,
207    ignore_whitespace: Cell<bool>,
208    dot_all: Cell<bool>,
209    global_unicode: bool,
210    global_full_unicode: bool,
211    global_ascii_perl: bool,
212    global_case_insensitive: bool,
213    comments: RefCell<Vec<ast::Comment>>,
214    stack_group: RefCell<Vec<GroupState>>,
215    stack_class: RefCell<Vec<ClassState>>,
216    capture_names: RefCell<Vec<ast::CaptureName>>,
217    scratch: RefCell<String>,
218}
219
220fn specialize_err<T>(result: Result<T>, from: ast::ErrorKind, to: ast::ErrorKind) -> Result<T> {
221    result.map_err(|e| {
222        if e.kind == from {
223            ResharpError {
224                kind: to,
225                pattern: e.pattern,
226                span: e.span,
227            }
228        } else {
229            e
230        }
231    })
232}
233
234fn is_capture_char(c: char, first: bool) -> bool {
235    if first {
236        c == '_' || c.is_alphabetic()
237    } else {
238        c == '_' || c == '.' || c == '[' || c == ']' || c.is_alphanumeric()
239    }
240}
241
242pub fn is_meta_character(c: char) -> bool {
243    matches!(
244        c,
245        '\\' | '.'
246            | '+'
247            | '*'
248            | '?'
249            | '('
250            | ')'
251            | '|'
252            | '['
253            | ']'
254            | '{'
255            | '}'
256            | '^'
257            | '$'
258            | '#'
259            | '&'
260            | '-'
261            | '~'
262            | '_'
263    )
264}
265
266/// escapes all resharp meta characters in `text`.
267pub fn escape(text: &str) -> String {
268    let mut buf = String::new();
269    escape_into(text, &mut buf);
270    buf
271}
272
273/// escapes all resharp meta characters in `text` and appends to `buf`.
274pub fn escape_into(text: &str, buf: &mut String) {
275    buf.reserve(text.len());
276    for c in text.chars() {
277        if is_meta_character(c) {
278            buf.push('\\');
279        }
280        buf.push(c);
281    }
282}
283
284pub fn is_escapeable_character(c: char) -> bool {
285    // Certainly escapeable if it's a meta character.
286    if is_meta_character(c) {
287        return true;
288    }
289    // Any character that isn't ASCII is definitely not escapeable. There's
290    // no real need to allow things like \☃ right?
291    if !c.is_ascii() {
292        return false;
293    }
294    // Otherwise, we basically say that everything is escapeable unless it's a
295    // letter or digit. Things like \3 are either octal (when enabled) or an
296    // error, and we should keep it that way. Otherwise, letters are reserved
297    // for adding new syntax in a backwards compatible way.
298    match c {
299        '0'..='9' | 'A'..='Z' | 'a'..='z' => false,
300        // While not currently supported, we keep these as not escapeable to
301        // give us some flexibility with respect to supporting the \< and
302        // \> word boundary assertions in the future. By rejecting them as
303        // escapeable, \< and \> will result in a parse error. Thus, we can
304        // turn them into something else in the future without it being a
305        // backwards incompatible change.
306        //
307        // OK, now we support \< and \>, and we need to retain them as *not*
308        // escapeable here since the escape sequence is significant.
309        '<' | '>' => false,
310        _ => true,
311    }
312}
313
314fn is_hex(c: char) -> bool {
315    c.is_ascii_digit() || ('a'..='f').contains(&c) || ('A'..='F').contains(&c)
316}
317
318impl<'s> ResharpParser<'s> {
319    fn default_translator_builder(&self) -> TranslatorBuilder {
320        let mut trb = TranslatorBuilder::new();
321        trb.unicode(self.global_unicode);
322        trb.utf8(false);
323        trb.case_insensitive(self.global_case_insensitive);
324        trb
325    }
326
327    pub fn new(pattern: &'s str) -> Self {
328        Self::with_flags(pattern, &PatternFlags::default())
329    }
330
331    pub fn with_flags(pattern: &'s str, flags: &PatternFlags) -> Self {
332        let mut trb = TranslatorBuilder::new();
333        trb.unicode(flags.unicode);
334        trb.utf8(false);
335        trb.case_insensitive(flags.case_insensitive);
336        Self {
337            translator: trb.build(),
338            pattern,
339            perl_classes: vec![],
340            unicode_classes: resharp_algebra::UnicodeClassCache::default(),
341            pos: Cell::new(Position::new(0, 0, 0)),
342            capture_index: Cell::new(0),
343            octal: false,
344            empty_min_range: false,
345            ignore_whitespace: Cell::new(flags.ignore_whitespace),
346            dot_all: Cell::new(flags.dot_matches_new_line),
347            global_unicode: flags.unicode || flags.full_unicode || flags.ascii_perl_classes,
348            global_full_unicode: flags.full_unicode,
349            global_ascii_perl: flags.ascii_perl_classes,
350            global_case_insensitive: flags.case_insensitive,
351            comments: RefCell::new(vec![]),
352            stack_group: RefCell::new(vec![]),
353            stack_class: RefCell::new(vec![]),
354            capture_names: RefCell::new(vec![]),
355            scratch: RefCell::new(String::new()),
356        }
357    }
358
359    /// Return a reference to the parser state.
360    fn parser(&'_ self) -> &'_ ResharpParser<'_> {
361        self
362    }
363
364    /// Return a reference to the pattern being parsed.
365    fn pattern(&self) -> &str {
366        self.pattern
367    }
368
369    /// Create a new error with the given span and error type.
370    fn error(&self, span: Span, kind: ast::ErrorKind) -> ResharpError {
371        ResharpError {
372            kind,
373            pattern: self.pattern().to_string(),
374            span,
375        }
376    }
377
378    fn unsupported_error(&self, _: regex_syntax::hir::Error) -> ResharpError {
379        self.error(
380            Span::splat(self.pos()),
381            ast::ErrorKind::UnsupportedResharpRegex,
382        )
383    }
384
385    /// Return the current offset of the parser.
386    ///
387    /// The offset starts at `0` from the beginning of the regular expression
388    /// pattern string.
389    fn offset(&self) -> usize {
390        self.parser().pos.get().offset
391    }
392
393    /// Return the current line number of the parser.
394    ///
395    /// The line number starts at `1`.
396    fn line(&self) -> usize {
397        self.parser().pos.get().line
398    }
399
400    /// Return the current column of the parser.
401    ///
402    /// The column number starts at `1` and is reset whenever a `\n` is seen.
403    fn column(&self) -> usize {
404        self.parser().pos.get().column
405    }
406
407    /// Return the next capturing index. Each subsequent call increments the
408    /// internal index.
409    ///
410    /// The span given should correspond to the location of the opening
411    /// parenthesis.
412    ///
413    /// If the capture limit is exceeded, then an error is returned.
414    fn next_capture_index(&self, span: Span) -> Result<u32> {
415        let current = self.parser().capture_index.get();
416        let i = current
417            .checked_add(1)
418            .ok_or_else(|| self.error(span, ast::ErrorKind::CaptureLimitExceeded))?;
419        self.parser().capture_index.set(i);
420        Ok(i)
421    }
422
423    fn add_capture_name(&self, cap: &ast::CaptureName) -> Result<()> {
424        let mut names = self.parser().capture_names.borrow_mut();
425        match names.binary_search_by_key(&cap.name.as_str(), |c| c.name.as_str()) {
426            Err(i) => {
427                names.insert(i, cap.clone());
428                Ok(())
429            }
430            Ok(i) => Err(self.error(
431                cap.span,
432                ast::ErrorKind::GroupNameDuplicate {
433                    original: names[i].span,
434                },
435            )),
436        }
437    }
438
439    fn ignore_whitespace(&self) -> bool {
440        self.parser().ignore_whitespace.get()
441    }
442
443    fn char(&self) -> char {
444        self.char_at(self.offset())
445    }
446
447    fn char_at(&self, i: usize) -> char {
448        self.pattern()[i..]
449            .chars()
450            .next()
451            .unwrap_or_else(|| panic!("expected char at offset {}", i))
452    }
453
454    fn bump(&self) -> bool {
455        if self.is_eof() {
456            return false;
457        }
458        let Position {
459            mut offset,
460            mut line,
461            mut column,
462        } = self.pos();
463        if self.char() == '\n' {
464            line = line.checked_add(1).unwrap();
465            column = 1;
466        } else {
467            column = column.checked_add(1).unwrap();
468        }
469        offset += self.char().len_utf8();
470        self.parser().pos.set(Position {
471            offset,
472            line,
473            column,
474        });
475        self.pattern()[self.offset()..].chars().next().is_some()
476    }
477
478    fn bump_if(&self, prefix: &str) -> bool {
479        if self.pattern()[self.offset()..].starts_with(prefix) {
480            for _ in 0..prefix.chars().count() {
481                self.bump();
482            }
483            true
484        } else {
485            false
486        }
487    }
488
489    fn is_lookaround_prefix(&self) -> Option<(bool, bool)> {
490        if self.bump_if("?=") {
491            return Some((true, true));
492        }
493        if self.bump_if("?!") {
494            return Some((true, false));
495        }
496        if self.bump_if("?<=") {
497            return Some((false, true));
498        }
499        if self.bump_if("?<!") {
500            return Some((false, false));
501        }
502        None
503    }
504
505    fn bump_and_bump_space(&self) -> bool {
506        if !self.bump() {
507            return false;
508        }
509        self.bump_space();
510        !self.is_eof()
511    }
512
513    fn bump_space(&self) {
514        if !self.ignore_whitespace() {
515            return;
516        }
517        while !self.is_eof() {
518            if self.char().is_whitespace() {
519                self.bump();
520            } else if self.char() == '#' {
521                let start = self.pos();
522                let mut comment_text = String::new();
523                self.bump();
524                while !self.is_eof() {
525                    let c = self.char();
526                    self.bump();
527                    if c == '\n' {
528                        break;
529                    }
530                    comment_text.push(c);
531                }
532                let comment = ast::Comment {
533                    span: Span::new(start, self.pos()),
534                    comment: comment_text,
535                };
536                self.parser().comments.borrow_mut().push(comment);
537            } else {
538                break;
539            }
540        }
541    }
542
543    /// Peek at the next character in the input without advancing the parser.
544    ///
545    /// If the input has been exhausted, then this returns `None`.
546    fn peek(&self) -> Option<char> {
547        if self.is_eof() {
548            return None;
549        }
550        self.pattern()[self.offset() + self.char().len_utf8()..]
551            .chars()
552            .next()
553    }
554
555    /// Like peek, but will ignore spaces when the parser is in whitespace
556    /// insensitive mode.
557    fn peek_space(&self) -> Option<char> {
558        if !self.ignore_whitespace() {
559            return self.peek();
560        }
561        if self.is_eof() {
562            return None;
563        }
564        let mut start = self.offset() + self.char().len_utf8();
565        let mut in_comment = false;
566        for (i, c) in self.pattern()[start..].char_indices() {
567            if c.is_whitespace() {
568                continue;
569            } else if !in_comment && c == '#' {
570                in_comment = true;
571            } else if in_comment && c == '\n' {
572                in_comment = false;
573            } else {
574                start += i;
575                break;
576            }
577        }
578        self.pattern()[start..].chars().next()
579    }
580
581    /// Returns true if the next call to `bump` would return false.
582    fn is_eof(&self) -> bool {
583        self.offset() == self.pattern().len()
584    }
585
586    /// Return the current position of the parser, which includes the offset,
587    /// line and column.
588    fn pos(&self) -> Position {
589        self.parser().pos.get()
590    }
591
592    /// Create a span at the current position of the parser. Both the start
593    /// and end of the span are set.
594    fn span(&self) -> Span {
595        Span::splat(self.pos())
596    }
597
598    /// Create a span that covers the current character.
599    fn span_char(&self) -> Span {
600        let mut next = Position {
601            offset: self.offset().checked_add(self.char().len_utf8()).unwrap(),
602            line: self.line(),
603            column: self.column().checked_add(1).unwrap(),
604        };
605        if self.char() == '\n' {
606            next.line += 1;
607            next.column = 1;
608        }
609        Span::new(self.pos(), next)
610    }
611
612    /// Parse and push a single alternation on to the parser's internal stack.
613    /// If the top of the stack already has an alternation, then add to that
614    /// instead of pushing a new one.
615    ///
616    /// The concatenation given corresponds to a single alternation branch.
617    /// The concatenation returned starts the next branch and is empty.
618    ///
619    /// This assumes the parser is currently positioned at `|` and will advance
620    /// the parser to the character following `|`.
621    #[inline(never)]
622    fn push_alternate(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
623        assert_eq!(self.char(), '|');
624        concat.span.end = self.pos();
625        self.push_or_add_alternation(concat);
626        self.bump();
627        Ok(ast::Concat {
628            span: self.span(),
629            asts: vec![],
630        })
631    }
632
633    /// Pushes or adds the given branch of an alternation to the parser's
634    /// internal stack of state.
635    fn push_or_add_alternation(&self, concat: Concat) {
636        use self::GroupState::*;
637
638        let mut stack = self.parser().stack_group.borrow_mut();
639        if let Some(&mut Alternation(ref mut alts)) = stack.last_mut() {
640            alts.asts.push(concat.into_ast());
641            return;
642        }
643        stack.push(Alternation(ast::Alternation {
644            span: Span::new(concat.span.start, self.pos()),
645            asts: vec![concat.into_ast()],
646        }));
647    }
648
649    #[inline(never)]
650    fn push_intersect(&self, mut concat: Concat) -> Result<Concat> {
651        assert_eq!(self.char(), '&');
652        concat.span.end = self.pos();
653        self.push_or_add_intersect(concat);
654        self.bump();
655        Ok(Concat {
656            span: self.span(),
657            asts: vec![],
658        })
659    }
660
661    /// Pushes or adds the given branch of an alternation to the parser's
662    /// internal stack of state.
663    fn push_or_add_intersect(&self, concat: Concat) {
664        use self::GroupState::*;
665
666        let mut stack = self.parser().stack_group.borrow_mut();
667        if let Some(&mut Intersection(ref mut alts)) = stack.last_mut() {
668            alts.asts.push(concat.into_ast());
669            return;
670        }
671        stack.push(Intersection(ast::Intersection {
672            span: Span::new(concat.span.start, self.pos()),
673            asts: vec![concat.into_ast()],
674        }));
675    }
676
677    /// Parse and push a group AST (and its parent concatenation) on to the
678    /// parser's internal stack. Return a fresh concatenation corresponding
679    /// to the group's sub-AST.
680    ///
681    /// If a set of flags was found (with no group), then the concatenation
682    /// is returned with that set of flags added.
683    ///
684    /// This assumes that the parser is currently positioned on the opening
685    /// parenthesis. It advances the parser to the character at the start
686    /// of the sub-expression (or adjoining expression).
687    ///
688    /// If there was a problem parsing the start of the group, then an error
689    /// is returned.
690    #[inline(never)]
691    fn push_group(&self, mut concat: Concat) -> Result<Concat> {
692        assert_eq!(self.char(), '(');
693        match self.parse_group()? {
694            Either::Left(set) => {
695                let ignore = set.flags.flag_state(ast::Flag::IgnoreWhitespace);
696                if let Some(v) = ignore {
697                    self.parser().ignore_whitespace.set(v);
698                }
699
700                concat.asts.push(Ast::flags(set));
701                Ok(concat)
702            }
703            Either::Right(group) => {
704                let old_ignore_whitespace = self.ignore_whitespace();
705                let new_ignore_whitespace = group
706                    .flags()
707                    .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
708                    .unwrap_or(old_ignore_whitespace);
709                self.parser()
710                    .stack_group
711                    .borrow_mut()
712                    .push(GroupState::Group {
713                        concat,
714                        group,
715                        ignore_whitespace: old_ignore_whitespace,
716                    });
717                self.parser().ignore_whitespace.set(new_ignore_whitespace);
718                Ok(Concat {
719                    span: self.span(),
720                    asts: vec![],
721                })
722            }
723        }
724    }
725
726    #[inline(never)]
727    fn push_compl_group(&self, concat: Concat) -> Result<Concat> {
728        assert_eq!(self.char(), '~');
729        self.bump();
730        if self.is_eof() || self.char() != '(' {
731            return Err(self.error(self.span(), ast::ErrorKind::ComplementGroupExpected));
732        }
733        let open_span = self.span_char();
734        self.bump();
735        let group = ast::Group {
736            span: open_span,
737            kind: ast::GroupKind::Complement,
738            ast: Box::new(Ast::empty(self.span())),
739        };
740
741        let old_ignore_whitespace = self.ignore_whitespace();
742        let new_ignore_whitespace = group
743            .flags()
744            .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
745            .unwrap_or(old_ignore_whitespace);
746        self.parser()
747            .stack_group
748            .borrow_mut()
749            .push(GroupState::Group {
750                concat,
751                group,
752                ignore_whitespace: old_ignore_whitespace,
753            });
754        self.parser().ignore_whitespace.set(new_ignore_whitespace);
755        Ok(Concat {
756            span: self.span(),
757            asts: vec![],
758        })
759    }
760
761    /// Pop a group AST from the parser's internal stack and set the group's
762    /// AST to the given concatenation. Return the concatenation containing
763    /// the group.
764    ///
765    /// This assumes that the parser is currently positioned on the closing
766    /// parenthesis and advances the parser to the character following the `)`.
767    ///
768    /// If no such group could be popped, then an unopened group error is
769    /// returned.
770    #[inline(never)]
771    fn pop_group(&self, mut group_concat: Concat) -> Result<Concat> {
772        use self::GroupState::*;
773        assert_eq!(self.char(), ')');
774        let mut stack = self.parser().stack_group.borrow_mut();
775        let topstack = stack.pop();
776
777        let (mut prior_concat, mut group, ignore_whitespace, alt) = match topstack {
778            Some(Group {
779                concat,
780                group,
781                ignore_whitespace,
782            }) => (concat, group, ignore_whitespace, None),
783            Some(Alternation(alt)) => match stack.pop() {
784                Some(Group {
785                    concat,
786                    group,
787                    ignore_whitespace,
788                }) => (
789                    concat,
790                    group,
791                    ignore_whitespace,
792                    Some(Either::Left::<ast::Alternation, ast::Intersection>(alt)),
793                ),
794                None | Some(Alternation(_)) | Some(Intersection(_)) => {
795                    return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
796                }
797            },
798            Some(Intersection(int)) => match stack.pop() {
799                Some(Group {
800                    concat,
801                    group,
802                    ignore_whitespace,
803                }) => (
804                    concat,
805                    group,
806                    ignore_whitespace,
807                    Some(Either::Right::<ast::Alternation, ast::Intersection>(int)),
808                ),
809                None | Some(Alternation(_)) | Some(Intersection(_)) => {
810                    return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
811                }
812            },
813
814            None => {
815                return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
816            }
817        };
818        self.parser().ignore_whitespace.set(ignore_whitespace);
819        group_concat.span.end = self.pos();
820        self.bump();
821        group.span.end = self.pos();
822        match alt {
823            Some(Either::Left(mut alt)) => {
824                alt.span.end = group_concat.span.end;
825                alt.asts.push(group_concat.into_ast());
826                group.ast = Box::new(alt.into_ast());
827            }
828            Some(Either::Right(mut int)) => {
829                int.span.end = group_concat.span.end;
830                int.asts.push(group_concat.into_ast());
831                group.ast = Box::new(int.into_ast());
832            }
833            None => {
834                group.ast = Box::new(group_concat.into_ast());
835            }
836        }
837
838        if group.kind == GroupKind::Complement {
839            let complement = ast::Complement {
840                span: self.span(),
841                ast: group.ast,
842            };
843            prior_concat.asts.push(Ast::complement(complement));
844        }
845        // ignore groups for now
846        else {
847            prior_concat.asts.push(Ast::group(group));
848        }
849        Ok(prior_concat)
850    }
851
852    /// Pop the last state from the parser's internal stack, if it exists, and
853    /// add the given concatenation to it. There either must be no state or a
854    /// single alternation item on the stack. Any other scenario produces an
855    /// error.
856    ///
857    /// This assumes that the parser has advanced to the end.
858    #[inline(never)]
859    fn pop_group_end(&self, mut concat: ast::Concat) -> Result<Ast> {
860        concat.span.end = self.pos();
861        let mut stack = self.parser().stack_group.borrow_mut();
862        let ast = match stack.pop() {
863            None => Ok(concat.into_ast()),
864            Some(GroupState::Alternation(mut alt)) => {
865                alt.span.end = self.pos();
866                alt.asts.push(concat.into_ast());
867                Ok(Ast::alternation(alt))
868            }
869            Some(GroupState::Intersection(mut int)) => {
870                int.span.end = self.pos();
871                int.asts.push(concat.into_ast());
872
873                Ok(Ast::intersection(int))
874            }
875            Some(GroupState::Group { group, .. }) => {
876                return Err(self.error(group.span, ast::ErrorKind::GroupUnclosed));
877            }
878        };
879        // If we try to pop again, there should be nothing.
880        match stack.pop() {
881            None => ast,
882            Some(GroupState::Alternation(alt)) => {
883                Err(self.error(alt.span, ast::ErrorKind::UnsupportedResharpRegex))
884            }
885            Some(GroupState::Intersection(int)) => {
886                Err(self.error(int.span, ast::ErrorKind::UnsupportedResharpRegex))
887            }
888            Some(GroupState::Group { group, .. }) => {
889                Err(self.error(group.span, ast::ErrorKind::GroupUnclosed))
890            }
891        }
892    }
893
894    /// Parse the opening of a character class and push the current class
895    /// parsing context onto the parser's stack. This assumes that the parser
896    /// is positioned at an opening `[`. The given union should correspond to
897    /// the union of set items built up before seeing the `[`.
898    ///
899    /// If there was a problem parsing the opening of the class, then an error
900    /// is returned. Otherwise, a new union of set items for the class is
901    /// returned (which may be populated with either a `]` or a `-`).
902    #[inline(never)]
903    fn push_class_open(
904        &self,
905        parent_union: regex_syntax::ast::ClassSetUnion,
906    ) -> Result<regex_syntax::ast::ClassSetUnion> {
907        assert_eq!(self.char(), '[');
908
909        let (nested_set, nested_union) = self.parse_set_class_open()?;
910        self.parser()
911            .stack_class
912            .borrow_mut()
913            .push(ClassState::Open {
914                union: parent_union,
915                set: nested_set,
916            });
917        Ok(nested_union)
918    }
919
920    /// Parse the end of a character class set and pop the character class
921    /// parser stack. The union given corresponds to the last union built
922    /// before seeing the closing `]`. The union returned corresponds to the
923    /// parent character class set with the nested class added to it.
924    ///
925    /// This assumes that the parser is positioned at a `]` and will advance
926    /// the parser to the byte immediately following the `]`.
927    ///
928    /// If the stack is empty after popping, then this returns the final
929    /// "top-level" character class AST (where a "top-level" character class
930    /// is one that is not nested inside any other character class).
931    ///
932    /// If there is no corresponding opening bracket on the parser's stack,
933    /// then an error is returned.
934    #[inline(never)]
935    fn pop_class(
936        &self,
937        nested_union: regex_syntax::ast::ClassSetUnion,
938    ) -> Result<Either<regex_syntax::ast::ClassSetUnion, regex_syntax::ast::ClassBracketed>> {
939        assert_eq!(self.char(), ']');
940
941        let item = regex_syntax::ast::ClassSet::Item(nested_union.into_item());
942        let prevset = self.pop_class_op(item);
943        let mut stack = self.parser().stack_class.borrow_mut();
944        match stack.pop() {
945            None => {
946                // We can never observe an empty stack:
947                //
948                // 1) We are guaranteed to start with a non-empty stack since
949                //    the character class parser is only initiated when it sees
950                //    a `[`.
951                // 2) If we ever observe an empty stack while popping after
952                //    seeing a `]`, then we signal the character class parser
953                //    to terminate.
954                panic!("unexpected empty character class stack")
955            }
956            Some(ClassState::Op { .. }) => {
957                // This panic is unfortunate, but this case is impossible
958                // since we already popped the Op state if one exists above.
959                // Namely, every push to the class parser stack is guarded by
960                // whether an existing Op is already on the top of the stack.
961                // If it is, the existing Op is modified. That is, the stack
962                // can never have consecutive Op states.
963                panic!("unexpected ClassState::Op")
964            }
965            Some(ClassState::Open { mut union, mut set }) => {
966                self.bump();
967                set.span.end = self.pos();
968                set.kind = prevset;
969                if stack.is_empty() {
970                    Ok(Either::Right(set))
971                } else {
972                    union.push(regex_syntax::ast::ClassSetItem::Bracketed(Box::new(set)));
973                    Ok(Either::Left(union))
974                }
975            }
976        }
977    }
978
979    /// Return an "unclosed class" error whose span points to the most
980    /// recently opened class.
981    ///
982    /// This should only be called while parsing a character class.
983    #[inline(never)]
984    fn unclosed_class_error(&self) -> ResharpError {
985        for state in self.parser().stack_class.borrow().iter().rev() {
986            if let ClassState::Open { ref set, .. } = *state {
987                return self.error(set.span, ast::ErrorKind::ClassUnclosed);
988            }
989        }
990        // We are guaranteed to have a non-empty stack with at least
991        // one open bracket, so we should never get here.
992        panic!("no open character class found")
993    }
994
995    /// Push the current set of class items on to the class parser's stack as
996    /// the left hand side of the given operator.
997    ///
998    /// A fresh set union is returned, which should be used to build the right
999    /// hand side of this operator.
1000    #[inline(never)]
1001    fn push_class_op(
1002        &self,
1003        next_kind: regex_syntax::ast::ClassSetBinaryOpKind,
1004        next_union: regex_syntax::ast::ClassSetUnion,
1005    ) -> regex_syntax::ast::ClassSetUnion {
1006        let item = regex_syntax::ast::ClassSet::Item(next_union.into_item());
1007        let new_lhs = self.pop_class_op(item);
1008        self.parser().stack_class.borrow_mut().push(ClassState::Op {
1009            kind: next_kind,
1010            lhs: new_lhs,
1011        });
1012        regex_syntax::ast::ClassSetUnion {
1013            span: self.span(),
1014            items: vec![],
1015        }
1016    }
1017
1018    /// Pop a character class set from the character class parser stack. If the
1019    /// top of the stack is just an item (not an operation), then return the
1020    /// given set unchanged. If the top of the stack is an operation, then the
1021    /// given set will be used as the rhs of the operation on the top of the
1022    /// stack. In that case, the binary operation is returned as a set.
1023    #[inline(never)]
1024    fn pop_class_op(&self, rhs: regex_syntax::ast::ClassSet) -> regex_syntax::ast::ClassSet {
1025        let mut stack = self.parser().stack_class.borrow_mut();
1026        let (kind, lhs) = match stack.pop() {
1027            Some(ClassState::Op { kind, lhs }) => (kind, lhs),
1028            Some(state @ ClassState::Open { .. }) => {
1029                stack.push(state);
1030                return rhs;
1031            }
1032            None => unreachable!(),
1033        };
1034        let span = Span::new(lhs.span().start, rhs.span().end);
1035        regex_syntax::ast::ClassSet::BinaryOp(regex_syntax::ast::ClassSetBinaryOp {
1036            span,
1037            kind,
1038            lhs: Box::new(lhs),
1039            rhs: Box::new(rhs),
1040        })
1041    }
1042
1043    fn hir_to_node_id(&self, hir: &hir::Hir, tb: &mut TB<'s>) -> Result<NodeId> {
1044        match hir.kind() {
1045            hir::HirKind::Empty => Ok(NodeId::EPS),
1046            hir::HirKind::Literal(l) => {
1047                if l.0.len() == 1 {
1048                    let node = tb.mk_u8(l.0[0]);
1049                    Ok(node)
1050                } else {
1051                    let ws: Vec<_> = l.0.iter().map(|l| tb.mk_u8(*l)).collect();
1052                    let conc = tb.mk_concats(ws.iter().copied());
1053                    Ok(conc)
1054                }
1055            }
1056            hir::HirKind::Class(class) => match class {
1057                hir::Class::Unicode(class_unicode) => {
1058                    let ranges = class_unicode.ranges();
1059                    if ranges.len() == 1
1060                        && ranges[0].start() == '\u{0}'
1061                        && ranges[0].end() == '\u{10FFFF}'
1062                    {
1063                        return Ok(tb.mk_range_u8(0, 255));
1064                    }
1065                    let mut nodes = Vec::new();
1066                    for range in ranges {
1067                        for seq in Utf8Sequences::new(range.start(), range.end()) {
1068                            let sl = seq.as_slice();
1069                            let bytes: Vec<_> = sl.iter().map(|s| (s.start, s.end)).collect();
1070                            let node = match bytes.len() {
1071                                1 => tb.mk_range_u8(bytes[0].0, bytes[0].1),
1072                                n => {
1073                                    let last = tb.mk_range_u8(bytes[n - 1].0, bytes[n - 1].1);
1074                                    let mut conc = last;
1075                                    for i in (0..n - 1).rev() {
1076                                        let b = tb.mk_range_u8(bytes[i].0, bytes[i].1);
1077                                        conc = tb.mk_concat(b, conc);
1078                                    }
1079                                    conc
1080                                }
1081                            };
1082                            nodes.push(node);
1083                        }
1084                    }
1085                    let merged = tb.mk_unions(nodes.into_iter());
1086                    Ok(merged)
1087                }
1088                hir::Class::Bytes(class_bytes) => {
1089                    let ranges = class_bytes.ranges();
1090                    let mut result = NodeId::BOT;
1091                    for range in ranges {
1092                        let start = range.start();
1093                        let end = range.end();
1094                        let node = tb.mk_range_u8(start, end);
1095                        result = tb.mk_union(result, node);
1096                    }
1097                    Ok(result)
1098                }
1099            },
1100            hir::HirKind::Look(_) => Err(self.error(
1101                Span::splat(self.pos()),
1102                ast::ErrorKind::UnsupportedResharpRegex,
1103            )),
1104            hir::HirKind::Repetition(_) => Err(self.error(
1105                Span::splat(self.pos()),
1106                ast::ErrorKind::UnsupportedResharpRegex,
1107            )),
1108            hir::HirKind::Capture(_) => Err(self.error(
1109                Span::splat(self.pos()),
1110                ast::ErrorKind::UnsupportedResharpRegex,
1111            )),
1112            hir::HirKind::Concat(body) => {
1113                let mut result = NodeId::EPS;
1114                for child in body {
1115                    let node = self.hir_to_node_id(child, tb)?;
1116                    result = tb.mk_concat(result, node);
1117                }
1118                Ok(result)
1119            }
1120            hir::HirKind::Alternation(_) => Err(self.error(
1121                Span::splat(self.pos()),
1122                ast::ErrorKind::UnsupportedResharpRegex,
1123            )),
1124        }
1125    }
1126
1127    fn translate_ast_to_hir(
1128        &mut self,
1129        orig_ast: &regex_syntax::ast::Ast,
1130        tb: &mut TB<'s>,
1131    ) -> Result<NodeId> {
1132        match self.translator.translate("", orig_ast) {
1133            Err(_) => Err(self.error(self.span(), ast::ErrorKind::UnicodeClassInvalid)),
1134            Ok(hir) => self.hir_to_node_id(&hir, tb),
1135        }
1136    }
1137
1138    fn translator_to_node_id(
1139        &mut self,
1140        orig_ast: &regex_syntax::ast::Ast,
1141        translator: &mut Option<Translator>,
1142        tb: &mut TB<'s>,
1143    ) -> Result<NodeId> {
1144        match translator {
1145            Some(tr) => {
1146                let hir = tr
1147                    .translate("", orig_ast)
1148                    .map_err(|e| self.unsupported_error(e))?;
1149                self.hir_to_node_id(&hir, tb)
1150            }
1151            None => self.translate_ast_to_hir(orig_ast, tb),
1152        }
1153    }
1154
1155    fn get_class(
1156        &mut self,
1157        negated: bool,
1158        kind: regex_syntax::ast::ClassPerlKind,
1159        tb: &mut TB<'s>,
1160    ) -> Result<NodeId> {
1161        let w = self
1162            .perl_classes
1163            .iter()
1164            .find(|(c_neg, c_kind, _)| *c_kind == kind && *c_neg == negated);
1165        match w {
1166            Some((_, _, value)) => Ok(*value),
1167            None => {
1168                let translated = if self.global_ascii_perl {
1169                    let pos = match kind {
1170                        regex_syntax::ast::ClassPerlKind::Word => {
1171                            let az = tb.mk_range_u8(b'a', b'z');
1172                            let big = tb.mk_range_u8(b'A', b'Z');
1173                            let dig = tb.mk_range_u8(b'0', b'9');
1174                            let us = tb.mk_u8(b'_');
1175                            tb.mk_unions([az, big, dig, us].into_iter())
1176                        }
1177                        regex_syntax::ast::ClassPerlKind::Digit => tb.mk_range_u8(b'0', b'9'),
1178                        regex_syntax::ast::ClassPerlKind::Space => {
1179                            let sp = tb.mk_u8(b' ');
1180                            let tab = tb.mk_u8(b'\t');
1181                            let nl = tb.mk_u8(b'\n');
1182                            let cr = tb.mk_u8(b'\r');
1183                            let ff = tb.mk_u8(0x0C);
1184                            let vt = tb.mk_u8(0x0B);
1185                            tb.mk_unions([sp, tab, nl, cr, ff, vt].into_iter())
1186                        }
1187                    };
1188                    if negated {
1189                        resharp_algebra::neg_class(tb, pos)
1190                    } else {
1191                        pos
1192                    }
1193                } else if self.global_unicode {
1194                    match kind {
1195                        regex_syntax::ast::ClassPerlKind::Word => {
1196                            if self.global_full_unicode {
1197                                self.unicode_classes.ensure_word_full(tb);
1198                            } else {
1199                                self.unicode_classes.ensure_word(tb);
1200                            }
1201                            if negated {
1202                                self.unicode_classes.non_word
1203                            } else {
1204                                self.unicode_classes.word
1205                            }
1206                        }
1207                        regex_syntax::ast::ClassPerlKind::Digit => {
1208                            if self.global_full_unicode {
1209                                self.unicode_classes.ensure_digit_full(tb);
1210                            } else {
1211                                self.unicode_classes.ensure_digit(tb);
1212                            }
1213                            if negated {
1214                                self.unicode_classes.non_digit
1215                            } else {
1216                                self.unicode_classes.digit
1217                            }
1218                        }
1219                        regex_syntax::ast::ClassPerlKind::Space => {
1220                            self.unicode_classes.ensure_space(tb);
1221                            if negated {
1222                                self.unicode_classes.non_space
1223                            } else {
1224                                self.unicode_classes.space
1225                            }
1226                        }
1227                    }
1228                } else {
1229                    let pos = match kind {
1230                        regex_syntax::ast::ClassPerlKind::Word => {
1231                            let az = tb.mk_range_u8(b'a', b'z');
1232                            let big = tb.mk_range_u8(b'A', b'Z');
1233                            let dig = tb.mk_range_u8(b'0', b'9');
1234                            let us = tb.mk_u8(b'_');
1235                            tb.mk_unions([az, big, dig, us].into_iter())
1236                        }
1237                        regex_syntax::ast::ClassPerlKind::Digit => tb.mk_range_u8(b'0', b'9'),
1238                        regex_syntax::ast::ClassPerlKind::Space => {
1239                            let sp = tb.mk_u8(b' ');
1240                            let tab = tb.mk_u8(b'\t');
1241                            let nl = tb.mk_u8(b'\n');
1242                            let cr = tb.mk_u8(b'\r');
1243                            let ff = tb.mk_u8(0x0C);
1244                            let vt = tb.mk_u8(0x0B);
1245                            tb.mk_unions([sp, tab, nl, cr, ff, vt].into_iter())
1246                        }
1247                    };
1248                    if negated {
1249                        tb.mk_compl(pos)
1250                    } else {
1251                        pos
1252                    }
1253                };
1254                self.perl_classes.push((negated, kind, translated));
1255                Ok(translated)
1256            }
1257        }
1258    }
1259
1260    fn word_char_kind(ast: &Ast, left: bool) -> WordCharKind {
1261        use WordCharKind::*;
1262        match ast {
1263            Ast::Literal(lit) => {
1264                if is_word_byte(lit.c as u8) {
1265                    Word
1266                } else {
1267                    NonWord
1268                }
1269            }
1270            Ast::ClassPerl(c) => match (&c.kind, c.negated) {
1271                (&regex_syntax::ast::ClassPerlKind::Word, false) => Word,
1272                (&regex_syntax::ast::ClassPerlKind::Word, true) => NonWord,
1273                (&regex_syntax::ast::ClassPerlKind::Space, false) => NonWord,
1274                (&regex_syntax::ast::ClassPerlKind::Space, true) => Unknown,
1275                (&regex_syntax::ast::ClassPerlKind::Digit, false) => Word,
1276                (&regex_syntax::ast::ClassPerlKind::Digit, true) => Unknown,
1277            },
1278            Ast::Dot(_) | Ast::Top(_) => Unknown,
1279            Ast::Group(g) => Self::word_char_kind(&g.ast, left),
1280            Ast::Concat(c) if !c.asts.is_empty() => {
1281                let edge = if left { c.asts.len() - 1 } else { 0 };
1282                let kind = Self::word_char_kind(&c.asts[edge], left);
1283                match kind {
1284                    MaybeWord => {
1285                        let dir: isize = if left { -1 } else { 1 };
1286                        match Self::concat_neighbor_kind(&c.asts, edge, dir) {
1287                            Word => Word,
1288                            _ => MaybeWord,
1289                        }
1290                    }
1291                    MaybeNonWord => {
1292                        let dir: isize = if left { -1 } else { 1 };
1293                        match Self::concat_neighbor_kind(&c.asts, edge, dir) {
1294                            NonWord => NonWord,
1295                            _ => MaybeNonWord,
1296                        }
1297                    }
1298                    other => other,
1299                }
1300            }
1301            Ast::Alternation(alt) if !alt.asts.is_empty() => {
1302                let first = Self::word_char_kind(&alt.asts[0], left);
1303                if alt.asts[1..]
1304                    .iter()
1305                    .all(|a| Self::word_char_kind(a, left) == first)
1306                {
1307                    first
1308                } else {
1309                    Unknown
1310                }
1311            }
1312            Ast::Repetition(r) => {
1313                let inner = Self::word_char_kind(&r.ast, left);
1314                let nullable = matches!(
1315                    &r.op.kind,
1316                    ast::RepetitionKind::ZeroOrMore
1317                        | ast::RepetitionKind::ZeroOrOne
1318                        | ast::RepetitionKind::Range(ast::RepetitionRange::Bounded(0, _))
1319                );
1320                if nullable {
1321                    match inner {
1322                        Word => MaybeWord,
1323                        NonWord => MaybeNonWord,
1324                        _ => Unknown,
1325                    }
1326                } else {
1327                    inner
1328                }
1329            }
1330            Ast::Lookaround(la) => Self::word_char_kind(&la.ast, left),
1331            _ => Unknown,
1332        }
1333    }
1334
1335    // ok to return None here, it's only an optimization
1336    fn edge_class_ast(ast: &Ast, left: bool) -> Option<&Ast> {
1337        match ast {
1338            Ast::Literal(_)
1339            | Ast::ClassPerl(_)
1340            | Ast::ClassBracketed(_)
1341            | Ast::ClassUnicode(_)
1342            | Ast::Dot(_)
1343            | Ast::Top(_) => Some(ast),
1344            Ast::Group(g) => Self::edge_class_ast(&g.ast, left),
1345            Ast::Concat(c) if !c.asts.is_empty() => {
1346                Self::edge_class_ast(&c.asts[if left { c.asts.len() - 1 } else { 0 }], left)
1347            }
1348            Ast::Repetition(r) => {
1349                let nullable = matches!(
1350                    &r.op.kind,
1351                    ast::RepetitionKind::ZeroOrMore
1352                        | ast::RepetitionKind::ZeroOrOne
1353                        | ast::RepetitionKind::Range(ast::RepetitionRange::Bounded(0, _))
1354                );
1355                if nullable {
1356                    None
1357                } else {
1358                    Self::edge_class_ast(&r.ast, left)
1359                }
1360            }
1361            _ => None,
1362        }
1363    }
1364
1365    fn resolve_word_kind(
1366        &mut self,
1367        asts: &[Ast],
1368        idx: usize,
1369        dir: isize,
1370        translator: &mut Option<Translator>,
1371        tb: &mut TB<'s>,
1372        word_id: NodeId,
1373        not_word_id: NodeId,
1374    ) -> Result<WordCharKind> {
1375        use WordCharKind::*;
1376        let fast = Self::concat_neighbor_kind(asts, idx, dir);
1377        if fast != Unknown {
1378            return Ok(fast);
1379        }
1380        let neighbor_idx = (idx as isize + dir) as usize;
1381        let node = if let Some(edge) = Self::edge_class_ast(&asts[neighbor_idx], dir < 0) {
1382            self.ast_to_node_id(edge, translator, tb)?
1383        } else {
1384            // check if \w_* (starts-with-word) or \W_* (starts-with-non-word) subsumes it.
1385            let neighbor_node = self.ast_to_node_id(&asts[neighbor_idx], translator, tb)?;
1386            let mut neighbor_node = tb
1387                .try_elim_lookarounds(neighbor_node)
1388                .ok_or_else(|| self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))?;
1389            if dir < 0 {
1390                neighbor_node = tb.reverse(neighbor_node).or_else(|_| {
1391                    Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))
1392                })?;
1393            }
1394            let word_prefix = tb.mk_concat(word_id, NodeId::TS);
1395            let non_word_prefix = tb.mk_concat(not_word_id, NodeId::TS);
1396            return if tb.subsumes(word_prefix, neighbor_node) == Some(true) {
1397                Ok(Word)
1398            } else if tb.subsumes(non_word_prefix, neighbor_node) == Some(true) {
1399                Ok(NonWord)
1400            } else {
1401                Ok(Unknown)
1402            };
1403        };
1404        if tb.subsumes(word_id, node) == Some(true) {
1405            Ok(Word)
1406        } else if tb.subsumes(not_word_id, node) == Some(true) {
1407            Ok(NonWord)
1408        } else {
1409            Ok(Unknown)
1410        }
1411    }
1412
1413    fn concat_neighbor_kind(asts: &[Ast], idx: usize, dir: isize) -> WordCharKind {
1414        use WordCharKind::*;
1415        let next = idx as isize + dir;
1416        if next < 0 || next >= asts.len() as isize {
1417            return Edge;
1418        }
1419        let kind = Self::word_char_kind(&asts[next as usize], dir < 0);
1420        match kind {
1421            MaybeWord => match Self::concat_neighbor_kind(asts, next as usize, dir) {
1422                Word => Word,
1423                _ => Unknown,
1424            },
1425            MaybeNonWord => match Self::concat_neighbor_kind(asts, next as usize, dir) {
1426                NonWord => NonWord,
1427                _ => Unknown,
1428            },
1429            other => other,
1430        }
1431    }
1432
1433    fn rewrite_word_boundary_in_concat(
1434        &mut self,
1435        asts: &[Ast],
1436        idx: usize,
1437        translator: &mut Option<Translator>,
1438        tb: &mut TB<'s>,
1439    ) -> Result<(NodeId, usize)> {
1440        use WordCharKind::*;
1441        let (word_id, not_word_id) = if self.global_full_unicode {
1442            self.unicode_classes.ensure_word_full(tb);
1443            (self.unicode_classes.word, self.unicode_classes.non_word)
1444        } else if self.global_unicode && !self.global_ascii_perl {
1445            self.unicode_classes.ensure_word(tb);
1446            (self.unicode_classes.word, self.unicode_classes.non_word)
1447        } else {
1448            let az = tb.mk_range_u8(b'a', b'z');
1449            let big = tb.mk_range_u8(b'A', b'Z');
1450            let dig = tb.mk_range_u8(b'0', b'9');
1451            let us = tb.mk_u8(b'_');
1452            let w = tb.mk_unions([az, big, dig, us].into_iter());
1453            (w, tb.mk_compl(w))
1454        };
1455        let left = self.resolve_word_kind(asts, idx, -1, translator, tb, word_id, not_word_id)?;
1456        let right = self.resolve_word_kind(asts, idx, 1, translator, tb, word_id, not_word_id)?;
1457        match (left, right) {
1458            (NonWord, Word) | (Word, NonWord) => Ok((NodeId::EPS, idx + 1)),
1459            (Word, _) => {
1460                let neg = tb.mk_neg_lookahead(word_id, 0);
1461                Ok((neg, idx + 1))
1462            }
1463            (NonWord, _) => {
1464                let tail = tb.mk_concat(word_id, NodeId::TS);
1465                self.merge_boundary_with_following_lookaheads(asts, idx, tail, translator, tb)
1466            }
1467            (_, Word) => Ok((tb.mk_neg_lookbehind(word_id), idx + 1)),
1468            (_, NonWord) => Ok((tb.mk_lookbehind(word_id, NodeId::MISSING), idx + 1)),
1469            // TODO: (Unknown, Unknown) is possible via make_full_word_boundary but
1470            // the full expansion (lb(\w)·la(\W) | lb(\W)·la(\w)) is too expensive
1471            // reimplement once/if the builder is more optimized
1472            _ => Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex)),
1473        }
1474    }
1475
1476    fn merge_boundary_with_following_lookaheads(
1477        &mut self,
1478        asts: &[Ast],
1479        wb_idx: usize,
1480        boundary_tail: NodeId,
1481        translator: &mut Option<Translator>,
1482        tb: &mut TB<'s>,
1483    ) -> Result<(NodeId, usize)> {
1484        let mut next = wb_idx + 1;
1485        let mut la_bodies = vec![boundary_tail];
1486        while next < asts.len() {
1487            match &asts[next] {
1488                Ast::Lookaround(la) if la.kind == ast::LookaroundKind::PositiveLookahead => {
1489                    let body = self.ast_to_node_id(&la.ast, translator, tb)?;
1490                    la_bodies.push(tb.mk_concat(body, NodeId::TS));
1491                    next += 1;
1492                }
1493                _ => break,
1494            }
1495        }
1496        let merged = tb.mk_inters(la_bodies.into_iter());
1497        Ok((tb.mk_lookahead(merged, NodeId::MISSING, 0), next))
1498    }
1499
1500    fn ast_to_node_id(
1501        &mut self,
1502        ast: &Ast,
1503        translator: &mut Option<Translator>,
1504        tb: &mut TB<'s>,
1505    ) -> Result<NodeId> {
1506        match ast {
1507            Ast::Empty(_) => Ok(NodeId::EPS),
1508            Ast::Flags(f) => {
1509                if f.flags.flag_state(ast::Flag::SwapGreed).is_some() {
1510                    return Err(self.error(f.span, ast::ErrorKind::UnsupportedResharpRegex));
1511                }
1512                let mut translator_builder = self.default_translator_builder();
1513                if let Some(state) = f.flags.flag_state(ast::Flag::CaseInsensitive) {
1514                    translator_builder.case_insensitive(state);
1515                }
1516                if let Some(state) = f.flags.flag_state(ast::Flag::Unicode) {
1517                    translator_builder.unicode(state);
1518                }
1519                if let Some(state) = f.flags.flag_state(ast::Flag::DotMatchesNewLine) {
1520                    self.dot_all.set(state);
1521                }
1522                let concat_translator = Some(translator_builder.build());
1523                *translator = concat_translator;
1524                Ok(NodeId::EPS)
1525            }
1526            Ast::Literal(l) => {
1527                let ast_lit = regex_syntax::ast::Ast::literal(*l.to_owned());
1528                self.translator_to_node_id(&ast_lit, translator, tb)
1529            }
1530            Ast::Top(_) => Ok(NodeId::TOP),
1531            Ast::Dot(_) => {
1532                let hirv = match (self.global_ascii_perl, self.dot_all.get()) {
1533                    (true, true) => hir::Hir::dot(hir::Dot::AnyChar),
1534                    (true, false) => hir::Hir::dot(hir::Dot::AnyCharExceptLF),
1535                    (false, true) => return Ok(NodeId::TOP),
1536                    (false, false) => hir::Hir::dot(hir::Dot::AnyByteExceptLF),
1537                };
1538                self.hir_to_node_id(&hirv, tb)
1539            }
1540            Ast::Assertion(a) => match &a.kind {
1541                ast::AssertionKind::StartText => Ok(NodeId::BEGIN),
1542                ast::AssertionKind::EndText => Ok(NodeId::END),
1543                ast::AssertionKind::WordBoundary => {
1544                    Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))
1545                }
1546                ast::AssertionKind::NotWordBoundary => {
1547                    Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))
1548                }
1549                ast::AssertionKind::StartLine => {
1550                    let left = NodeId::BEGIN;
1551                    let right = tb.mk_u8(b'\n');
1552                    let union = tb.mk_union(left, right);
1553                    Ok(tb.mk_lookbehind(union, NodeId::MISSING))
1554                }
1555                ast::AssertionKind::EndLine => {
1556                    let left = NodeId::END;
1557                    let right = tb.mk_u8(b'\n');
1558                    let union = tb.mk_union(left, right);
1559                    Ok(tb.mk_lookahead(union, NodeId::MISSING, 0))
1560                }
1561                ast::AssertionKind::WordBoundaryStart => {
1562                    Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
1563                }
1564                ast::AssertionKind::WordBoundaryEnd => {
1565                    Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
1566                }
1567                ast::AssertionKind::WordBoundaryStartAngle => {
1568                    Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
1569                }
1570                ast::AssertionKind::WordBoundaryEndAngle => {
1571                    Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
1572                }
1573                ast::AssertionKind::WordBoundaryStartHalf => {
1574                    Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
1575                }
1576                ast::AssertionKind::WordBoundaryEndHalf => {
1577                    Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
1578                }
1579            },
1580            Ast::ClassUnicode(c) => {
1581                let tmp = regex_syntax::ast::ClassUnicode {
1582                    span: c.span,
1583                    negated: c.negated,
1584                    kind: c.kind.clone(),
1585                };
1586                if !c.negated {
1587                    if let regex_syntax::ast::ClassUnicodeKind::Named(s) = &c.kind {
1588                        match s.as_str() {
1589                            // \p{ascii} for ascii, \p{ascii}&\p{Letter} => [A-Za-z]
1590                            "ascii" => return Ok(tb.mk_range_u8(0, 127)),
1591                            // restricts matches to valid utf8, \p{utf8}*&~(a) => non a, but valid utf8
1592                            "utf8" => {
1593                                let ascii = tb.mk_range_u8(0, 127);
1594                                let beta = tb.mk_range_u8(128, 0xBF);
1595                                let c0 = tb.mk_range_u8(0xC0, 0xDF);
1596                                let c0s = tb.mk_concats([c0, beta].into_iter());
1597                                let e0 = tb.mk_range_u8(0xE0, 0xEF);
1598                                let e0s = tb.mk_concats([e0, beta, beta].into_iter());
1599                                let f0 = tb.mk_range_u8(0xF0, 0xF7);
1600                                let f0s = tb.mk_concats([f0, beta, beta, beta].into_iter());
1601                                let merged = tb.mk_unions([ascii, c0s, e0s, f0s].into_iter());
1602                                return Ok(tb.mk_star(merged));
1603                            }
1604                            "hex" => {
1605                                let nums = tb.mk_range_u8(b'0', b'9');
1606                                let lets = tb.mk_range_u8(b'a', b'f');
1607                                let lets2 = tb.mk_range_u8(b'A', b'F');
1608                                let merged = tb.mk_unions([nums, lets, lets2].into_iter());
1609                                return Ok(merged);
1610                            }
1611                            _ => {}
1612                        }
1613                    };
1614                }
1615
1616                let orig_ast = regex_syntax::ast::Ast::class_unicode(tmp);
1617                self.translator_to_node_id(&orig_ast, translator, tb)
1618            }
1619            Ast::ClassPerl(c) => self.get_class(c.negated, c.kind.clone(), tb),
1620            Ast::ClassBracketed(c) => match &c.kind {
1621                regex_syntax::ast::ClassSet::Item(item) => {
1622                    if !c.negated && is_universal_perl_pair(item) {
1623                        return Ok(NodeId::TOP);
1624                    }
1625                    let tmp = regex_syntax::ast::ClassBracketed {
1626                        span: c.span,
1627                        negated: c.negated,
1628                        kind: c.kind.clone(),
1629                    };
1630                    let orig_ast = regex_syntax::ast::Ast::class_bracketed(tmp);
1631                    self.translator_to_node_id(&orig_ast, translator, tb)
1632                }
1633                regex_syntax::ast::ClassSet::BinaryOp(_) => {
1634                    Err(self.error(c.span, ast::ErrorKind::UnsupportedResharpRegex))
1635                }
1636            },
1637            Ast::Repetition(r) => {
1638                let body = self.ast_to_node_id(&r.ast, translator, tb);
1639                match body {
1640                    Ok(body) => match &r.op.kind {
1641                        ast::RepetitionKind::ZeroOrOne => Ok(tb.mk_opt(body)),
1642                        ast::RepetitionKind::ZeroOrMore => Ok(tb.mk_star(body)),
1643                        ast::RepetitionKind::OneOrMore => Ok(tb.mk_plus(body)),
1644                        ast::RepetitionKind::Range(r) => match r {
1645                            ast::RepetitionRange::Exactly(n) => Ok(tb.mk_repeat(body, *n, *n)),
1646                            ast::RepetitionRange::AtLeast(n) => {
1647                                let rep = tb.mk_repeat(body, *n, *n);
1648                                let st = tb.mk_star(body);
1649                                Ok(tb.mk_concat(rep, st))
1650                            }
1651
1652                            ast::RepetitionRange::Bounded(n, m) => Ok(tb.mk_repeat(body, *n, *m)),
1653                        },
1654                    },
1655                    Err(_) => body,
1656                }
1657            }
1658            Ast::Lookaround(g) => {
1659                let body = self.ast_to_node_id(&g.ast, translator, tb)?;
1660                match g.kind {
1661                    ast::LookaroundKind::PositiveLookahead => {
1662                        Ok(tb.mk_lookahead(body, NodeId::MISSING, 0))
1663                    }
1664                    ast::LookaroundKind::PositiveLookbehind => {
1665                        Ok(tb.mk_lookbehind(body, NodeId::MISSING))
1666                    }
1667                    ast::LookaroundKind::NegativeLookahead => Ok(tb.mk_neg_lookahead(body, 0)),
1668                    ast::LookaroundKind::NegativeLookbehind => Ok(tb.mk_neg_lookbehind(body)),
1669                }
1670            }
1671            Ast::Group(g) => {
1672                if let ast::GroupKind::NonCapturing(ref flags) = g.kind {
1673                    if !flags.items.is_empty() {
1674                        let mut translator_builder = self.default_translator_builder();
1675                        if let Some(state) = flags.flag_state(ast::Flag::CaseInsensitive) {
1676                            translator_builder.case_insensitive(state);
1677                        }
1678                        if let Some(state) = flags.flag_state(ast::Flag::Unicode) {
1679                            translator_builder.unicode(state);
1680                        }
1681                        let saved_dot_all = self.dot_all.get();
1682                        if let Some(state) = flags.flag_state(ast::Flag::DotMatchesNewLine) {
1683                            self.dot_all.set(state);
1684                        }
1685                        let mut scoped = Some(translator_builder.build());
1686                        let result = self.ast_to_node_id(&g.ast, &mut scoped, tb);
1687                        self.dot_all.set(saved_dot_all);
1688                        return result;
1689                    }
1690                }
1691                self.ast_to_node_id(&g.ast, translator, tb)
1692            }
1693            Ast::Alternation(a) => {
1694                let mut children = vec![];
1695                for ast in &a.asts {
1696                    match self.ast_to_node_id(ast, translator, tb) {
1697                        Ok(node_id) => children.push(node_id),
1698                        Err(err) => return Err(err),
1699                    }
1700                }
1701                Ok(tb.mk_unions(children.iter().copied()))
1702            }
1703            Ast::Concat(c) => {
1704                let mut concat_translator: Option<Translator> = None;
1705                let mut children = vec![];
1706                let mut i = 0;
1707                while i < c.asts.len() {
1708                    let ast = &c.asts[i];
1709                    match ast {
1710                        Ast::Flags(f) => {
1711                            if f.flags.flag_state(ast::Flag::SwapGreed).is_some() {
1712                                return Err(
1713                                    self.error(f.span, ast::ErrorKind::UnsupportedResharpRegex)
1714                                );
1715                            }
1716                            let mut translator_builder = self.default_translator_builder();
1717                            if let Some(state) = f.flags.flag_state(ast::Flag::CaseInsensitive) {
1718                                translator_builder.case_insensitive(state);
1719                            }
1720                            if let Some(state) = f.flags.flag_state(ast::Flag::Unicode) {
1721                                translator_builder.unicode(state);
1722                            }
1723                            if let Some(state) = f.flags.flag_state(ast::Flag::DotMatchesNewLine) {
1724                                self.dot_all.set(state);
1725                            }
1726                            concat_translator = Some(translator_builder.build());
1727                            *translator = concat_translator.clone();
1728                            i += 1;
1729                            continue;
1730                        }
1731                        Ast::Assertion(a) if a.kind == ast::AssertionKind::WordBoundary => {
1732                            let node =
1733                                self.rewrite_word_boundary_in_concat(&c.asts, i, translator, tb)?;
1734                            children.push(node.0);
1735                            i = node.1; // skip consumed lookaheads
1736                            continue;
1737                        }
1738                        _ => {}
1739                    }
1740                    match concat_translator {
1741                        Some(_) => match self.ast_to_node_id(ast, &mut concat_translator, tb) {
1742                            Ok(node_id) => children.push(node_id),
1743                            Err(err) => return Err(err),
1744                        },
1745                        None => match self.ast_to_node_id(ast, translator, tb) {
1746                            Ok(node_id) => children.push(node_id),
1747                            Err(err) => return Err(err),
1748                        },
1749                    }
1750                    i += 1;
1751                }
1752                Ok(tb.mk_concats(children.iter().cloned()))
1753            }
1754            Ast::Intersection(intersection) => {
1755                let mut children = vec![];
1756                for ast in &intersection.asts {
1757                    match self.ast_to_node_id(ast, translator, tb) {
1758                        Ok(node_id) => children.push(node_id),
1759                        Err(err) => return Err(err),
1760                    }
1761                }
1762                Ok(tb.mk_inters(children.into_iter()))
1763            }
1764            Ast::Complement(complement) => {
1765                let body = self.ast_to_node_id(&complement.ast, translator, tb);
1766                body.map(|x| tb.mk_compl(x))
1767            }
1768        }
1769    }
1770
1771    fn parse_inner(&mut self) -> Result<Ast> {
1772        let mut concat = Concat {
1773            span: self.span(),
1774            asts: vec![],
1775        };
1776        loop {
1777            self.bump_space();
1778            if self.is_eof() {
1779                break;
1780            }
1781            match self.char() {
1782                '(' => concat = self.push_group(concat)?,
1783                ')' => concat = self.pop_group(concat)?,
1784                '|' => concat = self.push_alternate(concat)?,
1785                '&' => concat = self.push_intersect(concat)?,
1786                '~' => concat = self.push_compl_group(concat)?,
1787                '[' => {
1788                    let class = self.parse_set_class()?;
1789                    concat.asts.push(Ast::class_bracketed(class));
1790                }
1791                '?' => {
1792                    concat =
1793                        self.parse_uncounted_repetition(concat, ast::RepetitionKind::ZeroOrOne)?;
1794                }
1795                '*' => {
1796                    concat =
1797                        self.parse_uncounted_repetition(concat, ast::RepetitionKind::ZeroOrMore)?;
1798                }
1799                '+' => {
1800                    concat =
1801                        self.parse_uncounted_repetition(concat, ast::RepetitionKind::OneOrMore)?;
1802                }
1803                '{' => {
1804                    concat = self.parse_counted_repetition(concat)?;
1805                }
1806                _ => concat.asts.push(self.parse_primitive()?.into_ast()),
1807            }
1808        }
1809        let ast = self.pop_group_end(concat)?;
1810        if expanded_ast_size(&ast, EXPANDED_AST_LIMIT) >= EXPANDED_AST_LIMIT
1811            || max_list_length(&ast) >= MAX_LIST_LEN
1812        {
1813            return Err(self.error(*ast.span(), ast::ErrorKind::UnsupportedResharpRegex));
1814        }
1815        Ok(ast)
1816    }
1817
1818    /// Parse the regular expression and return an abstract syntax tree with
1819    /// all of the comments found in the pattern.
1820    fn parse(&mut self, tb: &mut TB<'s>) -> Result<NodeId> {
1821        let ast = self.parse_inner()?;
1822        self.ast_to_node_id(&ast, &mut None, tb)
1823    }
1824
1825    #[inline(never)]
1826    fn parse_uncounted_repetition(
1827        &self,
1828        mut concat: ast::Concat,
1829        kind: ast::RepetitionKind,
1830    ) -> Result<ast::Concat> {
1831        // assert!(self.char() == '?' || self.char() == '*' || self.char() == '+');
1832        let op_start = self.pos();
1833        let ast = match concat.asts.pop() {
1834            Some(ast) => ast,
1835            None => return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing)),
1836        };
1837        match ast {
1838            Ast::Empty(_) | Ast::Flags(_) => {
1839                return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing))
1840            }
1841            _ => {}
1842        }
1843        if self.bump() && self.char() == '?' {
1844            return Err(self.error(
1845                Span::new(op_start, self.pos()),
1846                ast::ErrorKind::UnsupportedLazyQuantifier,
1847            ));
1848        }
1849        concat.asts.push(Ast::repetition(ast::Repetition {
1850            span: ast.span().with_end(self.pos()),
1851            op: ast::RepetitionOp {
1852                span: Span::new(op_start, self.pos()),
1853                kind,
1854            },
1855            greedy: true,
1856            ast: Box::new(ast),
1857        }));
1858        Ok(concat)
1859    }
1860
1861    #[inline(never)]
1862    fn parse_counted_repetition(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
1863        assert!(self.char() == '{');
1864        let start = self.pos();
1865        let ast = match concat.asts.pop() {
1866            Some(ast) => ast,
1867            None => return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing)),
1868        };
1869        match ast {
1870            Ast::Empty(_) | Ast::Flags(_) => {
1871                return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing))
1872            }
1873            _ => {}
1874        }
1875        if !self.bump_and_bump_space() {
1876            return Err(self.error(
1877                Span::new(start, self.pos()),
1878                ast::ErrorKind::RepetitionCountUnclosed,
1879            ));
1880        }
1881        let count_start = specialize_err(
1882            self.parse_decimal(),
1883            ast::ErrorKind::DecimalEmpty,
1884            ast::ErrorKind::RepetitionCountDecimalEmpty,
1885        );
1886        if self.is_eof() {
1887            return Err(self.error(
1888                Span::new(start, self.pos()),
1889                ast::ErrorKind::RepetitionCountUnclosed,
1890            ));
1891        }
1892        let range = if self.char() == ',' {
1893            if !self.bump_and_bump_space() {
1894                return Err(self.error(
1895                    Span::new(start, self.pos()),
1896                    ast::ErrorKind::RepetitionCountUnclosed,
1897                ));
1898            }
1899            if self.char() != '}' {
1900                let count_start = match count_start {
1901                    Ok(c) => c,
1902                    Err(err) if err.kind == ast::ErrorKind::RepetitionCountDecimalEmpty => {
1903                        if self.parser().empty_min_range {
1904                            0
1905                        } else {
1906                            return Err(err);
1907                        }
1908                    }
1909                    err => err?,
1910                };
1911                let count_end = specialize_err(
1912                    self.parse_decimal(),
1913                    ast::ErrorKind::DecimalEmpty,
1914                    ast::ErrorKind::RepetitionCountDecimalEmpty,
1915                )?;
1916                ast::RepetitionRange::Bounded(count_start, count_end)
1917            } else {
1918                ast::RepetitionRange::AtLeast(count_start?)
1919            }
1920        } else {
1921            ast::RepetitionRange::Exactly(count_start?)
1922        };
1923
1924        if self.is_eof() || self.char() != '}' {
1925            return Err(self.error(
1926                Span::new(start, self.pos()),
1927                ast::ErrorKind::RepetitionCountUnclosed,
1928            ));
1929        }
1930
1931        if self.bump_and_bump_space() && self.char() == '?' {
1932            return Err(self.error(
1933                Span::new(start, self.pos()),
1934                ast::ErrorKind::UnsupportedLazyQuantifier,
1935            ));
1936        }
1937
1938        let op_span = Span::new(start, self.pos());
1939        if !range.is_valid() {
1940            return Err(self.error(op_span, ast::ErrorKind::RepetitionCountInvalid));
1941        }
1942
1943        let over_limit = match &range {
1944            ast::RepetitionRange::Exactly(n) => *n > REPETITION_COUNT_LIMIT,
1945            ast::RepetitionRange::AtLeast(n) => *n > REPETITION_COUNT_LIMIT,
1946            ast::RepetitionRange::Bounded(n, m) => {
1947                *n > REPETITION_COUNT_LIMIT || *m > REPETITION_COUNT_LIMIT
1948            }
1949        };
1950        if over_limit {
1951            return Err(self.error(op_span, ast::ErrorKind::UnsupportedResharpRegex));
1952        }
1953        concat.asts.push(Ast::repetition(ast::Repetition {
1954            span: ast.span().with_end(self.pos()),
1955            op: ast::RepetitionOp {
1956                span: op_span,
1957                kind: ast::RepetitionKind::Range(range),
1958            },
1959            greedy: true,
1960            ast: Box::new(ast),
1961        }));
1962        Ok(concat)
1963    }
1964
1965    #[inline(never)]
1966    fn parse_group(&self) -> Result<Either<ast::SetFlags, ast::Group>> {
1967        assert_eq!(self.char(), '(');
1968        let open_span = self.span_char();
1969        self.bump();
1970        self.bump_space();
1971        if let Some((ahead, pos)) = self.is_lookaround_prefix() {
1972            let kind = match (pos, ahead) {
1973                (true, true) => LookaroundKind::PositiveLookahead,
1974                (true, false) => LookaroundKind::PositiveLookbehind,
1975                (false, true) => LookaroundKind::NegativeLookahead,
1976                (false, false) => LookaroundKind::NegativeLookbehind,
1977            };
1978            return Ok(Either::Right(ast::Group {
1979                span: open_span,
1980                kind: ast::GroupKind::Lookaround(kind),
1981                ast: Box::new(Ast::empty(self.span())),
1982            }));
1983        }
1984        let inner_span = self.span();
1985        let mut starts_with_p = true;
1986        if self.bump_if("?P<") || {
1987            starts_with_p = false;
1988            self.bump_if("?<")
1989        } {
1990            let capture_index = self.next_capture_index(open_span)?;
1991            let name = self.parse_capture_name(capture_index)?;
1992            Ok(Either::Right(ast::Group {
1993                span: open_span,
1994                kind: ast::GroupKind::CaptureName {
1995                    starts_with_p,
1996                    name,
1997                },
1998                ast: Box::new(Ast::empty(self.span())),
1999            }))
2000        } else if self.bump_if("?") {
2001            if self.is_eof() {
2002                return Err(self.error(open_span, ast::ErrorKind::GroupUnclosed));
2003            }
2004            let flags = self.parse_flags()?;
2005            let char_end = self.char();
2006            self.bump();
2007            if char_end == ')' {
2008                // We don't allow empty flags, e.g., `(?)`. We instead
2009                // interpret it as a repetition operator missing its argument.
2010                if flags.items.is_empty() {
2011                    return Err(self.error(inner_span, ast::ErrorKind::RepetitionMissing));
2012                }
2013                Ok(Either::Left(ast::SetFlags {
2014                    span: Span {
2015                        end: self.pos(),
2016                        ..open_span
2017                    },
2018                    flags,
2019                }))
2020            } else {
2021                assert_eq!(char_end, ':');
2022                Ok(Either::Right(ast::Group {
2023                    span: open_span,
2024                    kind: ast::GroupKind::NonCapturing(flags),
2025                    ast: Box::new(Ast::empty(self.span())),
2026                }))
2027            }
2028        } else {
2029            let capture_index = self.next_capture_index(open_span)?;
2030            Ok(Either::Right(ast::Group {
2031                span: open_span,
2032                kind: ast::GroupKind::CaptureIndex(capture_index),
2033                ast: Box::new(Ast::empty(self.span())),
2034            }))
2035        }
2036    }
2037
2038    #[inline(never)]
2039    fn parse_capture_name(&self, capture_index: u32) -> Result<ast::CaptureName> {
2040        if self.is_eof() {
2041            return Err(self.error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
2042        }
2043        let start = self.pos();
2044        loop {
2045            if self.char() == '>' {
2046                break;
2047            }
2048            if !is_capture_char(self.char(), self.pos() == start) {
2049                return Err(self.error(self.span_char(), ast::ErrorKind::GroupNameInvalid));
2050            }
2051            if !self.bump() {
2052                break;
2053            }
2054        }
2055        let end = self.pos();
2056        if self.is_eof() {
2057            return Err(self.error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
2058        }
2059        assert_eq!(self.char(), '>');
2060        self.bump();
2061        let name = &self.pattern()[start.offset..end.offset];
2062        if name.is_empty() {
2063            return Err(self.error(Span::new(start, start), ast::ErrorKind::GroupNameEmpty));
2064        }
2065        let capname = ast::CaptureName {
2066            span: Span::new(start, end),
2067            name: name.to_string(),
2068            index: capture_index,
2069        };
2070        self.add_capture_name(&capname)?;
2071        Ok(capname)
2072    }
2073
2074    #[inline(never)]
2075    fn parse_flags(&self) -> Result<ast::Flags> {
2076        let mut flags = ast::Flags {
2077            span: self.span(),
2078            items: vec![],
2079        };
2080        let mut last_was_negation = None;
2081        while self.char() != ':' && self.char() != ')' {
2082            if self.char() == '-' {
2083                last_was_negation = Some(self.span_char());
2084                let item = ast::FlagsItem {
2085                    span: self.span_char(),
2086                    kind: ast::FlagsItemKind::Negation,
2087                };
2088                if let Some(i) = flags.add_item(item) {
2089                    return Err(self.error(
2090                        self.span_char(),
2091                        ast::ErrorKind::FlagRepeatedNegation {
2092                            original: flags.items[i].span,
2093                        },
2094                    ));
2095                }
2096            } else {
2097                last_was_negation = None;
2098                let item = ast::FlagsItem {
2099                    span: self.span_char(),
2100                    kind: ast::FlagsItemKind::Flag(self.parse_flag()?),
2101                };
2102                if let Some(i) = flags.add_item(item) {
2103                    return Err(self.error(
2104                        self.span_char(),
2105                        ast::ErrorKind::FlagDuplicate {
2106                            original: flags.items[i].span,
2107                        },
2108                    ));
2109                }
2110            }
2111            if !self.bump() {
2112                return Err(self.error(self.span(), ast::ErrorKind::FlagUnexpectedEof));
2113            }
2114        }
2115        if let Some(span) = last_was_negation {
2116            return Err(self.error(span, ast::ErrorKind::FlagDanglingNegation));
2117        }
2118        flags.span.end = self.pos();
2119        Ok(flags)
2120    }
2121
2122    #[inline(never)]
2123    fn parse_flag(&self) -> Result<ast::Flag> {
2124        match self.char() {
2125            'i' => Ok(ast::Flag::CaseInsensitive),
2126            'm' => Ok(ast::Flag::MultiLine),
2127            's' => Ok(ast::Flag::DotMatchesNewLine),
2128            'U' => Ok(ast::Flag::SwapGreed),
2129            'u' => Ok(ast::Flag::Unicode),
2130            'R' => Ok(ast::Flag::CRLF),
2131            'x' => Ok(ast::Flag::IgnoreWhitespace),
2132            _ => Err(self.error(self.span_char(), ast::ErrorKind::FlagUnrecognized)),
2133        }
2134    }
2135
2136    fn parse_primitive(&self) -> Result<Primitive> {
2137        match self.char() {
2138            '\\' => self.parse_escape(),
2139            '_' => {
2140                let ast = Primitive::Top(self.span_char());
2141                self.bump();
2142                Ok(ast)
2143            }
2144            '.' => {
2145                let ast = Primitive::Dot(self.span_char());
2146                self.bump();
2147                Ok(ast)
2148            }
2149            '^' => {
2150                let ast = Primitive::Assertion(ast::Assertion {
2151                    span: self.span_char(),
2152                    kind: ast::AssertionKind::StartLine,
2153                });
2154                self.bump();
2155                Ok(ast)
2156            }
2157            '$' => {
2158                let ast = Primitive::Assertion(ast::Assertion {
2159                    span: self.span_char(),
2160                    kind: ast::AssertionKind::EndLine,
2161                });
2162                self.bump();
2163                Ok(ast)
2164            }
2165            c => {
2166                let ast = Primitive::Literal(Literal {
2167                    span: self.span_char(),
2168                    kind: LiteralKind::Verbatim,
2169                    c,
2170                });
2171                self.bump();
2172                Ok(ast)
2173            }
2174        }
2175    }
2176
2177    #[inline(never)]
2178    fn parse_escape(&self) -> Result<Primitive> {
2179        assert_eq!(self.char(), '\\');
2180        let start = self.pos();
2181        if !self.bump() {
2182            return Err(self.error(
2183                Span::new(start, self.pos()),
2184                ast::ErrorKind::EscapeUnexpectedEof,
2185            ));
2186        }
2187        let c = self.char();
2188        // Put some of the more complicated routines into helpers.
2189        match c {
2190            '0'..='9' => {
2191                if !self.parser().octal {
2192                    return Err(self.error(
2193                        Span::new(start, self.span_char().end),
2194                        ast::ErrorKind::UnsupportedBackreference,
2195                    ));
2196                }
2197                let mut lit = self.parse_octal();
2198                lit.span.start = start;
2199                return Ok(Primitive::Literal(lit));
2200            }
2201            // '8'..='9' if !self.parser().octal => {
2202            //     return Err(self.error(
2203            //         Span::new(start, self.span_char().end),
2204            //         ast::ErrorKind::UnsupportedBackreference,
2205            //     ));
2206            // }
2207            'x' | 'u' | 'U' => {
2208                let mut lit = self.parse_hex()?;
2209                lit.span.start = start;
2210                return Ok(Primitive::Literal(lit));
2211            }
2212            'p' | 'P' => {
2213                let mut cls = self.parse_unicode_class()?;
2214                cls.span.start = start;
2215                return Ok(Primitive::Unicode(cls));
2216            }
2217            'd' | 's' | 'w' | 'D' | 'S' | 'W' => {
2218                let mut cls = self.parse_perl_class();
2219                cls.span.start = start;
2220                return Ok(Primitive::Perl(cls));
2221            }
2222            _ => {}
2223        }
2224
2225        // Handle all of the one letter sequences inline.
2226        self.bump();
2227        let span = Span::new(start, self.pos());
2228        if is_meta_character(c) {
2229            return Ok(Primitive::Literal(Literal {
2230                span,
2231                kind: LiteralKind::Meta,
2232                c,
2233            }));
2234        }
2235        if is_escapeable_character(c) {
2236            return Ok(Primitive::Literal(Literal {
2237                span,
2238                kind: LiteralKind::Superfluous,
2239                c,
2240            }));
2241        }
2242        let special = |kind, c| {
2243            Ok(Primitive::Literal(Literal {
2244                span,
2245                kind: LiteralKind::Special(kind),
2246                c,
2247            }))
2248        };
2249        match c {
2250            'a' => special(SpecialLiteralKind::Bell, '\x07'),
2251            'f' => special(SpecialLiteralKind::FormFeed, '\x0C'),
2252            't' => special(SpecialLiteralKind::Tab, '\t'),
2253            'n' => special(SpecialLiteralKind::LineFeed, '\n'),
2254            'r' => special(SpecialLiteralKind::CarriageReturn, '\r'),
2255            'v' => special(SpecialLiteralKind::VerticalTab, '\x0B'),
2256            'A' => Ok(Primitive::Assertion(ast::Assertion {
2257                span,
2258                kind: ast::AssertionKind::StartText,
2259            })),
2260            'z' => Ok(Primitive::Assertion(ast::Assertion {
2261                span,
2262                kind: ast::AssertionKind::EndText,
2263            })),
2264            'b' => {
2265                let mut wb = ast::Assertion {
2266                    span,
2267                    kind: ast::AssertionKind::WordBoundary,
2268                };
2269                // After a \b, we "try" to parse things like \b{start} for
2270                // special word boundary assertions.
2271                if !self.is_eof() && self.char() == '{' {
2272                    if let Some(kind) = self.maybe_parse_special_word_boundary(start)? {
2273                        wb.kind = kind;
2274                        wb.span.end = self.pos();
2275                    }
2276                }
2277                Ok(Primitive::Assertion(wb))
2278            }
2279            'B' => Ok(Primitive::Assertion(ast::Assertion {
2280                span,
2281                kind: ast::AssertionKind::NotWordBoundary,
2282            })),
2283            '<' => Ok(Primitive::Assertion(ast::Assertion {
2284                span,
2285                kind: ast::AssertionKind::WordBoundaryStartAngle,
2286            })),
2287            '>' => Ok(Primitive::Assertion(ast::Assertion {
2288                span,
2289                kind: ast::AssertionKind::WordBoundaryEndAngle,
2290            })),
2291            _ => Err(self.error(span, ast::ErrorKind::EscapeUnrecognized)),
2292        }
2293    }
2294
2295    fn maybe_parse_special_word_boundary(
2296        &self,
2297        wb_start: Position,
2298    ) -> Result<Option<ast::AssertionKind>> {
2299        assert_eq!(self.char(), '{');
2300
2301        let is_valid_char = |c| matches!(c, 'A'..='Z' | 'a'..='z' | '-');
2302        let start = self.pos();
2303        if !self.bump_and_bump_space() {
2304            return Err(self.error(
2305                Span::new(wb_start, self.pos()),
2306                ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof,
2307            ));
2308        }
2309        let start_contents = self.pos();
2310        // This is one of the critical bits: if the first non-whitespace
2311        // character isn't in [-A-Za-z] (i.e., this can't be a special word
2312        // boundary), then we bail and let the counted repetition parser deal
2313        // with this.
2314        if !is_valid_char(self.char()) {
2315            self.parser().pos.set(start);
2316            return Ok(None);
2317        }
2318
2319        // Now collect up our chars until we see a '}'.
2320        let mut scratch = self.parser().scratch.borrow_mut();
2321        scratch.clear();
2322        while !self.is_eof() && is_valid_char(self.char()) {
2323            scratch.push(self.char());
2324            self.bump_and_bump_space();
2325        }
2326        if self.is_eof() || self.char() != '}' {
2327            return Err(self.error(
2328                Span::new(start, self.pos()),
2329                ast::ErrorKind::SpecialWordBoundaryUnclosed,
2330            ));
2331        }
2332        let end = self.pos();
2333        self.bump();
2334        let kind = match scratch.as_str() {
2335            "start" => ast::AssertionKind::WordBoundaryStart,
2336            "end" => ast::AssertionKind::WordBoundaryEnd,
2337            "start-half" => ast::AssertionKind::WordBoundaryStartHalf,
2338            "end-half" => ast::AssertionKind::WordBoundaryEndHalf,
2339            _ => {
2340                return Err(self.error(
2341                    Span::new(start_contents, end),
2342                    ast::ErrorKind::SpecialWordBoundaryUnrecognized,
2343                ))
2344            }
2345        };
2346        Ok(Some(kind))
2347    }
2348
2349    #[inline(never)]
2350    fn parse_octal(&self) -> Literal {
2351        assert!(self.parser().octal);
2352        assert!('0' <= self.char() && self.char() <= '7');
2353        let start = self.pos();
2354        // Parse up to two more digits.
2355        while self.bump()
2356            && '0' <= self.char()
2357            && self.char() <= '7'
2358            && self.pos().offset - start.offset <= 2
2359        {}
2360        let end = self.pos();
2361        let octal = &self.pattern()[start.offset..end.offset];
2362        // Parsing the octal should never fail since the above guarantees a
2363        // valid number.
2364        let codepoint = u32::from_str_radix(octal, 8).expect("valid octal number");
2365        // The max value for 3 digit octal is 0777 = 511 and [0, 511] has no
2366        // invalid Unicode scalar values.
2367        let c = char::from_u32(codepoint).expect("Unicode scalar value");
2368        Literal {
2369            span: Span::new(start, end),
2370            kind: LiteralKind::Octal,
2371            c,
2372        }
2373    }
2374
2375    #[inline(never)]
2376    fn parse_hex(&self) -> Result<Literal> {
2377        assert!(self.char() == 'x' || self.char() == 'u' || self.char() == 'U');
2378
2379        let hex_kind = match self.char() {
2380            'x' => HexLiteralKind::X,
2381            'u' => HexLiteralKind::UnicodeShort,
2382            _ => HexLiteralKind::UnicodeLong,
2383        };
2384        if !self.bump_and_bump_space() {
2385            return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2386        }
2387        if self.char() == '{' {
2388            self.parse_hex_brace(hex_kind)
2389        } else {
2390            self.parse_hex_digits(hex_kind)
2391        }
2392    }
2393
2394    #[inline(never)]
2395    fn parse_hex_digits(&self, kind: HexLiteralKind) -> Result<Literal> {
2396        let mut scratch = self.parser().scratch.borrow_mut();
2397        scratch.clear();
2398
2399        let start = self.pos();
2400        for i in 0..kind.digits() {
2401            if i > 0 && !self.bump_and_bump_space() {
2402                return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2403            }
2404            if !is_hex(self.char()) {
2405                return Err(self.error(self.span_char(), ast::ErrorKind::EscapeHexInvalidDigit));
2406            }
2407            scratch.push(self.char());
2408        }
2409        // The final bump just moves the parser past the literal, which may
2410        // be EOF.
2411        self.bump_and_bump_space();
2412        let end = self.pos();
2413        let hex = scratch.as_str();
2414        match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
2415            None => Err(self.error(Span::new(start, end), ast::ErrorKind::EscapeHexInvalid)),
2416            Some(c) => Ok(Literal {
2417                span: Span::new(start, end),
2418                kind: LiteralKind::HexFixed(kind),
2419                c,
2420            }),
2421        }
2422    }
2423
2424    #[inline(never)]
2425    fn parse_hex_brace(&self, kind: HexLiteralKind) -> Result<Literal> {
2426        let mut scratch = self.parser().scratch.borrow_mut();
2427        scratch.clear();
2428
2429        let brace_pos = self.pos();
2430        let start = self.span_char().end;
2431        while self.bump_and_bump_space() && self.char() != '}' {
2432            if !is_hex(self.char()) {
2433                return Err(self.error(self.span_char(), ast::ErrorKind::EscapeHexInvalidDigit));
2434            }
2435            scratch.push(self.char());
2436        }
2437        if self.is_eof() {
2438            return Err(self.error(
2439                Span::new(brace_pos, self.pos()),
2440                ast::ErrorKind::EscapeUnexpectedEof,
2441            ));
2442        }
2443        let end = self.pos();
2444        let hex = scratch.as_str();
2445        assert_eq!(self.char(), '}');
2446        self.bump_and_bump_space();
2447
2448        if hex.is_empty() {
2449            return Err(self.error(
2450                Span::new(brace_pos, self.pos()),
2451                ast::ErrorKind::EscapeHexEmpty,
2452            ));
2453        }
2454        match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
2455            None => Err(self.error(Span::new(start, end), ast::ErrorKind::EscapeHexInvalid)),
2456            Some(c) => Ok(Literal {
2457                span: Span::new(start, self.pos()),
2458                kind: LiteralKind::HexBrace(kind),
2459                c,
2460            }),
2461        }
2462    }
2463
2464    fn parse_decimal(&self) -> Result<u32> {
2465        let mut scratch = self.parser().scratch.borrow_mut();
2466        scratch.clear();
2467
2468        while !self.is_eof() && self.char().is_whitespace() {
2469            self.bump();
2470        }
2471        let start = self.pos();
2472        while !self.is_eof() && '0' <= self.char() && self.char() <= '9' {
2473            scratch.push(self.char());
2474            self.bump_and_bump_space();
2475        }
2476        let span = Span::new(start, self.pos());
2477        while !self.is_eof() && self.char().is_whitespace() {
2478            self.bump_and_bump_space();
2479        }
2480        let digits = scratch.as_str();
2481        if digits.is_empty() {
2482            return Err(self.error(span, ast::ErrorKind::DecimalEmpty));
2483        }
2484        match digits.parse::<u32>().ok() {
2485            Some(n) => Ok(n),
2486            None => Err(self.error(span, ast::ErrorKind::DecimalInvalid)),
2487        }
2488    }
2489
2490    #[inline(never)]
2491    fn parse_set_class(&self) -> Result<ClassBracketed> {
2492        assert_eq!(self.char(), '[');
2493
2494        let mut union = ClassSetUnion {
2495            span: self.span(),
2496            items: vec![],
2497        };
2498        loop {
2499            self.bump_space();
2500            if self.is_eof() {
2501                return Err(self.unclosed_class_error());
2502            }
2503            match self.char() {
2504                '[' => {
2505                    // If we've already parsed the opening bracket, then
2506                    // attempt to treat this as the beginning of an ASCII
2507                    // class. If ASCII class parsing fails, then the parser
2508                    // backs up to `[`.
2509                    if !self.parser().stack_class.borrow().is_empty() {
2510                        if let Some(cls) = self.maybe_parse_ascii_class() {
2511                            union.push(ClassSetItem::Ascii(cls));
2512                            continue;
2513                        }
2514                    }
2515                    union = self.push_class_open(union)?;
2516                }
2517                ']' => match self.pop_class(union)? {
2518                    Either::Left(nested_union) => {
2519                        union = nested_union;
2520                    }
2521                    Either::Right(class) => return Ok(class),
2522                },
2523                '&' if self.peek() == Some('&') => {
2524                    assert!(self.bump_if("&&"));
2525                    union = self.push_class_op(ClassSetBinaryOpKind::Intersection, union);
2526                }
2527                '-' if self.peek() == Some('-') => {
2528                    assert!(self.bump_if("--"));
2529                    union = self.push_class_op(ClassSetBinaryOpKind::Difference, union);
2530                }
2531                '~' if self.peek() == Some('~') => {
2532                    assert!(self.bump_if("~~"));
2533                    union = self.push_class_op(ClassSetBinaryOpKind::SymmetricDifference, union);
2534                }
2535                _ => {
2536                    union.push(self.parse_set_class_range()?);
2537                }
2538            }
2539        }
2540    }
2541
2542    #[inline(never)]
2543    fn parse_set_class_range(&self) -> Result<ClassSetItem> {
2544        let prim1 = self.parse_set_class_item()?;
2545        self.bump_space();
2546        if self.is_eof() {
2547            return Err(self.unclosed_class_error());
2548        }
2549        if self.char() != '-' || self.peek_space() == Some(']') || self.peek_space() == Some('-') {
2550            return prim1.into_class_set_item(self);
2551        }
2552        if !self.bump_and_bump_space() {
2553            return Err(self.unclosed_class_error());
2554        }
2555        let prim2 = self.parse_set_class_item()?;
2556        let range = ClassSetRange {
2557            span: Span::new(prim1.span().start, prim2.span().end),
2558            start: prim1.into_class_literal(self)?,
2559            end: prim2.into_class_literal(self)?,
2560        };
2561        if !range.is_valid() {
2562            return Err(self.error(range.span, ast::ErrorKind::ClassRangeInvalid));
2563        }
2564        Ok(ClassSetItem::Range(range))
2565    }
2566
2567    #[inline(never)]
2568    fn parse_set_class_item(&self) -> Result<Primitive> {
2569        if self.char() == '\\' {
2570            self.parse_escape()
2571        } else {
2572            let x = Primitive::Literal(Literal {
2573                span: self.span_char(),
2574                kind: LiteralKind::Verbatim,
2575                c: self.char(),
2576            });
2577            self.bump();
2578            Ok(x)
2579        }
2580    }
2581
2582    #[inline(never)]
2583    fn parse_set_class_open(&self) -> Result<(ClassBracketed, ClassSetUnion)> {
2584        assert_eq!(self.char(), '[');
2585        let start = self.pos();
2586        if !self.bump_and_bump_space() {
2587            return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2588        }
2589
2590        let negated = if self.char() != '^' {
2591            false
2592        } else {
2593            if !self.bump_and_bump_space() {
2594                return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2595            }
2596            true
2597        };
2598        // Accept any number of `-` as literal `-`.
2599        let mut union = ClassSetUnion {
2600            span: self.span(),
2601            items: vec![],
2602        };
2603        while self.char() == '-' {
2604            union.push(ClassSetItem::Literal(Literal {
2605                span: self.span_char(),
2606                kind: LiteralKind::Verbatim,
2607                c: '-',
2608            }));
2609            if !self.bump_and_bump_space() {
2610                return Err(self.error(Span::new(start, start), ast::ErrorKind::ClassUnclosed));
2611            }
2612        }
2613        // If `]` is the *first* char in a set, then interpret it as a literal
2614        // `]`. That is, an empty class is impossible to write.
2615        if union.items.is_empty() && self.char() == ']' {
2616            union.push(ClassSetItem::Literal(Literal {
2617                span: self.span_char(),
2618                kind: LiteralKind::Verbatim,
2619                c: ']',
2620            }));
2621            if !self.bump_and_bump_space() {
2622                return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2623            }
2624        }
2625        let set = ClassBracketed {
2626            span: Span::new(start, self.pos()),
2627            negated,
2628            kind: ClassSet::union(ClassSetUnion {
2629                span: Span::new(union.span.start, union.span.start),
2630                items: vec![],
2631            }),
2632        };
2633        Ok((set, union))
2634    }
2635
2636    #[inline(never)]
2637    fn maybe_parse_ascii_class(&self) -> Option<ClassAscii> {
2638        assert_eq!(self.char(), '[');
2639        // If parsing fails, then we back up the parser to this starting point.
2640        let start = self.pos();
2641        let mut negated = false;
2642        if !self.bump() || self.char() != ':' {
2643            self.parser().pos.set(start);
2644            return None;
2645        }
2646        if !self.bump() {
2647            self.parser().pos.set(start);
2648            return None;
2649        }
2650        if self.char() == '^' {
2651            negated = true;
2652            if !self.bump() {
2653                self.parser().pos.set(start);
2654                return None;
2655            }
2656        }
2657        let name_start = self.offset();
2658        while self.char() != ':' && self.bump() {}
2659        if self.is_eof() {
2660            self.parser().pos.set(start);
2661            return None;
2662        }
2663        let name = &self.pattern()[name_start..self.offset()];
2664        if !self.bump_if(":]") {
2665            self.parser().pos.set(start);
2666            return None;
2667        }
2668        let kind = match regex_syntax::ast::ClassAsciiKind::from_name(name) {
2669            Some(kind) => kind,
2670            None => {
2671                self.parser().pos.set(start);
2672                return None;
2673            }
2674        };
2675        Some(ClassAscii {
2676            span: Span::new(start, self.pos()),
2677            kind,
2678            negated,
2679        })
2680    }
2681
2682    #[inline(never)]
2683    fn parse_unicode_class(&self) -> Result<ClassUnicode> {
2684        assert!(self.char() == 'p' || self.char() == 'P');
2685
2686        let mut scratch = self.parser().scratch.borrow_mut();
2687        scratch.clear();
2688
2689        let negated = self.char() == 'P';
2690        if !self.bump_and_bump_space() {
2691            return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2692        }
2693        let (start, kind) = if self.char() == '{' {
2694            let start = self.span_char().end;
2695            while self.bump_and_bump_space() && self.char() != '}' {
2696                scratch.push(self.char());
2697            }
2698            if self.is_eof() {
2699                return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2700            }
2701            assert_eq!(self.char(), '}');
2702            self.bump();
2703
2704            let name = scratch.as_str();
2705            if let Some(i) = name.find("!=") {
2706                (
2707                    start,
2708                    ClassUnicodeKind::NamedValue {
2709                        op: ClassUnicodeOpKind::NotEqual,
2710                        name: name[..i].to_string(),
2711                        value: name[i + 2..].to_string(),
2712                    },
2713                )
2714            } else if let Some(i) = name.find(':') {
2715                (
2716                    start,
2717                    ClassUnicodeKind::NamedValue {
2718                        op: ClassUnicodeOpKind::Colon,
2719                        name: name[..i].to_string(),
2720                        value: name[i + 1..].to_string(),
2721                    },
2722                )
2723            } else if let Some(i) = name.find('=') {
2724                (
2725                    start,
2726                    ClassUnicodeKind::NamedValue {
2727                        op: ClassUnicodeOpKind::Equal,
2728                        name: name[..i].to_string(),
2729                        value: name[i + 1..].to_string(),
2730                    },
2731                )
2732            } else {
2733                (start, ClassUnicodeKind::Named(name.to_string()))
2734            }
2735        } else {
2736            let start = self.pos();
2737            let c = self.char();
2738            if c == '\\' {
2739                return Err(self.error(self.span_char(), ast::ErrorKind::UnicodeClassInvalid));
2740            }
2741            self.bump_and_bump_space();
2742            let kind = ClassUnicodeKind::OneLetter(c);
2743            (start, kind)
2744        };
2745        Ok(ClassUnicode {
2746            span: Span::new(start, self.pos()),
2747            negated,
2748            kind,
2749        })
2750    }
2751
2752    #[inline(never)]
2753    fn parse_perl_class(&self) -> ClassPerl {
2754        let c = self.char();
2755        let span = self.span_char();
2756        self.bump();
2757        let (negated, kind) = match c {
2758            'd' => (false, regex_syntax::ast::ClassPerlKind::Digit),
2759            'D' => (true, regex_syntax::ast::ClassPerlKind::Digit),
2760            's' => (false, regex_syntax::ast::ClassPerlKind::Space),
2761            'S' => (true, regex_syntax::ast::ClassPerlKind::Space),
2762            'w' => (false, regex_syntax::ast::ClassPerlKind::Word),
2763            'W' => (true, regex_syntax::ast::ClassPerlKind::Word),
2764            c => panic!("expected valid Perl class but got '{}'", c),
2765        };
2766        ClassPerl {
2767            span,
2768            kind,
2769            negated,
2770        }
2771    }
2772}
2773
2774/// `[\s\S]`, `[\w\W]`, `[\d\D]` etc into `_` canonicalization.
2775/// should really be done as a u32 BDD but good enough for now
2776fn is_universal_perl_pair(item: &regex_syntax::ast::ClassSetItem) -> bool {
2777    use regex_syntax::ast::ClassSetItem;
2778    let items = match item {
2779        ClassSetItem::Union(u) => &u.items,
2780        _ => return false,
2781    };
2782    if items.len() != 2 {
2783        return false;
2784    }
2785    match (&items[0], &items[1]) {
2786        (ClassSetItem::Perl(a), ClassSetItem::Perl(b)) => {
2787            let is_all = a.kind == b.kind && a.negated != b.negated;
2788            is_all
2789        }
2790        _ => false,
2791    }
2792}
2793
2794pub fn max_list_length(ast: &ast::Ast) -> usize {
2795    match ast {
2796        ast::Ast::Empty(_)
2797        | ast::Ast::Flags(_)
2798        | ast::Ast::Literal(_)
2799        | ast::Ast::Dot(_)
2800        | ast::Ast::Top(_)
2801        | ast::Ast::Assertion(_)
2802        | ast::Ast::ClassUnicode(_)
2803        | ast::Ast::ClassPerl(_)
2804        | ast::Ast::ClassBracketed(_) => 0,
2805        ast::Ast::Group(g) => max_list_length(&g.ast),
2806        ast::Ast::Complement(c) => max_list_length(&c.ast),
2807        ast::Ast::Lookaround(l) => max_list_length(&l.ast),
2808        ast::Ast::Repetition(r) => max_list_length(&r.ast),
2809        ast::Ast::Concat(c) => c
2810            .asts
2811            .len()
2812            .max(c.asts.iter().map(max_list_length).max().unwrap_or(0)),
2813        ast::Ast::Alternation(a) => a
2814            .asts
2815            .len()
2816            .max(a.asts.iter().map(max_list_length).max().unwrap_or(0)),
2817        ast::Ast::Intersection(i) => i
2818            .asts
2819            .len()
2820            .max(i.asts.iter().map(max_list_length).max().unwrap_or(0)),
2821    }
2822}
2823
2824pub fn expanded_ast_size(ast: &ast::Ast, limit: u64) -> u64 {
2825    fn go(ast: &ast::Ast, limit: u64) -> u64 {
2826        match ast {
2827            ast::Ast::Empty(_) | ast::Ast::Flags(_) => 1,
2828            ast::Ast::Literal(_) | ast::Ast::Dot(_) | ast::Ast::Top(_) => 1,
2829            ast::Ast::Assertion(_) => 1,
2830            ast::Ast::ClassUnicode(_) | ast::Ast::ClassPerl(_) | ast::Ast::ClassBracketed(_) => 1,
2831            ast::Ast::Group(g) => go(&g.ast, limit).saturating_add(1).min(limit),
2832            ast::Ast::Complement(c) => go(&c.ast, limit).saturating_add(1).min(limit),
2833            ast::Ast::Lookaround(l) => go(&l.ast, limit).saturating_add(1).min(limit),
2834            ast::Ast::Concat(c) => sum_children(&c.asts, limit),
2835            ast::Ast::Alternation(a) => sum_children(&a.asts, limit),
2836            ast::Ast::Intersection(i) => sum_children(&i.asts, limit),
2837            ast::Ast::Repetition(r) => {
2838                let body = go(&r.ast, limit);
2839                let factor: u64 = match &r.op.kind {
2840                    ast::RepetitionKind::ZeroOrOne => 2,
2841                    ast::RepetitionKind::ZeroOrMore | ast::RepetitionKind::OneOrMore => 2,
2842                    ast::RepetitionKind::Range(ast::RepetitionRange::Exactly(n)) => {
2843                        (*n as u64).max(1)
2844                    }
2845                    ast::RepetitionKind::Range(ast::RepetitionRange::AtLeast(n)) => {
2846                        (*n as u64).max(1).saturating_add(1)
2847                    }
2848                    ast::RepetitionKind::Range(ast::RepetitionRange::Bounded(_, m)) => {
2849                        (*m as u64).max(1)
2850                    }
2851                };
2852                body.saturating_mul(factor).min(limit)
2853            }
2854        }
2855    }
2856    fn sum_children(children: &[ast::Ast], limit: u64) -> u64 {
2857        let mut total: u64 = 0;
2858        for c in children {
2859            total = total.saturating_add(go(c, limit));
2860            if total >= limit {
2861                return limit;
2862            }
2863        }
2864        total
2865    }
2866    go(ast, limit)
2867}
2868
2869pub fn parse_ast<'s>(
2870    tb: &mut TB<'s>,
2871    pattern: &'s str,
2872) -> std::result::Result<NodeId, ResharpError> {
2873    let mut p: ResharpParser<'s> = ResharpParser::new(pattern);
2874    p.parse(tb)
2875}
2876
2877pub fn parse_ast_with<'s>(
2878    tb: &mut TB<'s>,
2879    pattern: &'s str,
2880    flags: &PatternFlags,
2881) -> std::result::Result<NodeId, ResharpError> {
2882    let mut p: ResharpParser<'s> = ResharpParser::with_flags(pattern, flags);
2883    p.parse(tb)
2884}
2885
2886/// Parse a pattern into the raw AST without converting to algebra nodes.
2887pub fn parse_to_ast(pattern: &str) -> std::result::Result<ast::Ast, ResharpError> {
2888    let mut p: ResharpParser = ResharpParser::new(pattern);
2889    p.parse_inner()
2890}
resharp_parser/lib.rs

resharp_parser/
lib.rs