Skip to main content

resharp_parser/
lib.rs

1//! Parser for resharp regex patterns.
2//!
3//! Converts regex pattern strings into the node representation used by resharp-algebra.
4
5#![warn(dead_code)]
6pub mod ast;
7use std::cell::{Cell, RefCell};
8
9use ast::{Ast, Concat, ErrorKind, GroupKind, LookaroundKind, RepetitionKind};
10use regex_syntax::{
11    ast::{
12        ClassAscii, ClassBracketed, ClassPerl, ClassSet, ClassSetBinaryOpKind, ClassSetItem,
13        ClassSetRange, ClassSetUnion, ClassUnicode, ClassUnicodeKind, ClassUnicodeOpKind,
14        HexLiteralKind, Literal, LiteralKind, Position, Span, SpecialLiteralKind,
15    },
16    hir::{
17        self,
18        translate::{Translator, TranslatorBuilder},
19    },
20    utf8::Utf8Sequences,
21};
22use resharp_algebra::{Kind, NodeId};
23
24type TB<'s> = resharp_algebra::RegexBuilder;
25
26/// global pattern-level flags, set from `RegexOptions`.
27pub struct PatternFlags {
28    /// `\w`/`\d`/`\s` match full Unicode (true) or ASCII only (false).
29    pub unicode: bool,
30    /// `\w` covers all Unicode word chars including 3- and 4-byte sequences.
31    pub full_unicode: bool,
32    /// global case-insensitive matching.
33    pub case_insensitive: bool,
34    /// `.` matches `\n` (behaves like `_`).
35    pub dot_matches_new_line: bool,
36    /// `^` and `$` match at line boundaries (`\n`) in addition to text boundaries.
37    pub multiline: bool,
38    /// allow whitespace and `#` comments in the pattern.
39    pub ignore_whitespace: bool,
40    /// ASCII `\w`/`\d`/`\s` tables, but
41    /// negated perl classes (`\W`/`\D`/`\S`) and `.` match a full codepoint
42    pub ascii_perl_classes: bool,
43    /// max upper bound on `expanded_ast_size` before parser rejects the
44    /// pattern as too complex. default 50_000.
45    pub expanded_ast_limit: u64,
46    /// max children allowed in any single `Concat`/`Alternation`/
47    /// `Intersection` node. default 4_000.
48    pub max_list_len: usize,
49    /// max upper bound on bounded repetition `{n,m}`. default 500.
50    pub max_repeat: u32,
51    pub max_depth: usize,
52}
53
54// arbitrary safeguards, these will not prevent intentional DoS patterns
55// more to protect you from shooting yourself in the foot
56pub const DEFAULT_MAX_REPEAT: u32 = 500;
57pub const DEFAULT_EXPANDED_AST_LIMIT: u64 = 50_000;
58pub const DEFAULT_MAX_LIST_LEN: usize = 4_000;
59pub const DEFAULT_MAX_DEPTH: usize = 1_000;
60
61impl Default for PatternFlags {
62    fn default() -> Self {
63        Self {
64            unicode: true,
65            full_unicode: false,
66            case_insensitive: false,
67            dot_matches_new_line: false,
68            multiline: true,
69            ignore_whitespace: false,
70            ascii_perl_classes: false,
71            expanded_ast_limit: DEFAULT_EXPANDED_AST_LIMIT,
72            max_list_len: DEFAULT_MAX_LIST_LEN,
73            max_repeat: DEFAULT_MAX_REPEAT,
74            max_depth: DEFAULT_MAX_DEPTH,
75        }
76    }
77}
78
79#[derive(Clone, Copy, PartialEq, Debug)]
80enum WordCharKind {
81    Word,
82    NonWord,
83    MaybeWord,
84    MaybeNonWord,
85    Unknown,
86    Edge,
87}
88
89fn is_word_byte(b: u8) -> bool {
90    b.is_ascii_alphanumeric() || b == b'_'
91}
92
93fn class_set_item_word_kind(item: &regex_syntax::ast::ClassSetItem) -> WordCharKind {
94    use regex_syntax::ast::{ClassPerlKind, ClassSetItem};
95    use WordCharKind::*;
96    match item {
97        ClassSetItem::Empty(_) => Unknown,
98        ClassSetItem::Literal(l) => {
99            if is_word_byte(l.c as u8) {
100                Word
101            } else {
102                NonWord
103            }
104        }
105        ClassSetItem::Range(r) => {
106            let all_word = (r.start.c as u8..=r.end.c as u8).all(is_word_byte);
107            let all_non = (r.start.c as u8..=r.end.c as u8).all(|b| !is_word_byte(b));
108            if all_word {
109                Word
110            } else if all_non {
111                NonWord
112            } else {
113                Unknown
114            }
115        }
116        ClassSetItem::Perl(p) => match (&p.kind, p.negated) {
117            (ClassPerlKind::Word, false) => Word,
118            (ClassPerlKind::Word, true) => NonWord,
119            (ClassPerlKind::Space, false) => NonWord,
120            (ClassPerlKind::Digit, false) => Word,
121            _ => Unknown,
122        },
123        ClassSetItem::Bracketed(b) => class_bracketed_word_kind(b),
124        ClassSetItem::Union(u) => {
125            let mut kind = Unknown;
126            for item in &u.items {
127                let k = class_set_item_word_kind(item);
128                kind = match (kind, k) {
129                    (_, Unknown) => return Unknown,
130                    (Unknown, _) => k,
131                    (Word, Word) => Word,
132                    (NonWord, NonWord) => NonWord,
133                    _ => return Unknown,
134                };
135            }
136            kind
137        }
138        _ => Unknown,
139    }
140}
141
142fn class_bracketed_word_kind(c: &regex_syntax::ast::ClassBracketed) -> WordCharKind {
143    use regex_syntax::ast::{ClassPerlKind, ClassSet, ClassSetItem};
144    use WordCharKind::*;
145    if c.negated {
146        return match &c.kind {
147            ClassSet::Item(ClassSetItem::Perl(p)) if p.kind == ClassPerlKind::Word => {
148                if p.negated {
149                    Word
150                } else {
151                    NonWord
152                }
153            }
154            _ => Unknown,
155        };
156    }
157    match &c.kind {
158        ClassSet::Item(item) => class_set_item_word_kind(item),
159        ClassSet::BinaryOp(_) => Unknown,
160    }
161}
162
163fn ascii_class_lit(span: Span, c: char) -> regex_syntax::ast::Literal {
164    regex_syntax::ast::Literal {
165        span,
166        kind: regex_syntax::ast::LiteralKind::Verbatim,
167        c,
168    }
169}
170
171fn ascii_class_range(span: Span, a: char, b: char) -> regex_syntax::ast::ClassSetItem {
172    regex_syntax::ast::ClassSetItem::Range(regex_syntax::ast::ClassSetRange {
173        span,
174        start: ascii_class_lit(span, a),
175        end: ascii_class_lit(span, b),
176    })
177}
178
179fn ascii_perl_positive(
180    span: Span,
181    kind: &regex_syntax::ast::ClassPerlKind,
182) -> regex_syntax::ast::ClassSetItem {
183    use regex_syntax::ast::{ClassPerlKind, ClassSetItem, ClassSetUnion};
184    match kind {
185        ClassPerlKind::Digit => ascii_class_range(span, '0', '9'),
186        ClassPerlKind::Word => ClassSetItem::Union(ClassSetUnion {
187            span,
188            items: vec![
189                ascii_class_range(span, 'a', 'z'),
190                ascii_class_range(span, 'A', 'Z'),
191                ascii_class_range(span, '0', '9'),
192                ClassSetItem::Literal(ascii_class_lit(span, '_')),
193            ],
194        }),
195        ClassPerlKind::Space => ClassSetItem::Union(ClassSetUnion {
196            span,
197            items: ['\t', '\n', '\x0B', '\x0C', '\r', ' ']
198                .into_iter()
199                .map(|c| ClassSetItem::Literal(ascii_class_lit(span, c)))
200                .collect(),
201        }),
202    }
203}
204
205fn ascii_perl_set_item(
206    span: Span,
207    kind: &regex_syntax::ast::ClassPerlKind,
208    negated: bool,
209) -> regex_syntax::ast::ClassSetItem {
210    use regex_syntax::ast::{ClassBracketed, ClassSet, ClassSetItem};
211    let positive = ascii_perl_positive(span, kind);
212    if negated {
213        ClassSetItem::Bracketed(Box::new(ClassBracketed {
214            span,
215            negated: true,
216            kind: ClassSet::Item(positive),
217        }))
218    } else {
219        positive
220    }
221}
222
223fn rewrite_ascii_perl_set(set: &regex_syntax::ast::ClassSet) -> regex_syntax::ast::ClassSet {
224    use regex_syntax::ast::{ClassSet, ClassSetBinaryOp};
225    match set {
226        ClassSet::Item(item) => ClassSet::Item(rewrite_ascii_perl_item(item)),
227        ClassSet::BinaryOp(op) => ClassSet::BinaryOp(ClassSetBinaryOp {
228            span: op.span,
229            kind: op.kind.clone(),
230            lhs: Box::new(rewrite_ascii_perl_set(&op.lhs)),
231            rhs: Box::new(rewrite_ascii_perl_set(&op.rhs)),
232        }),
233    }
234}
235
236fn rewrite_ascii_perl_item(
237    item: &regex_syntax::ast::ClassSetItem,
238) -> regex_syntax::ast::ClassSetItem {
239    use regex_syntax::ast::{ClassBracketed, ClassSetItem, ClassSetUnion};
240    match item {
241        ClassSetItem::Perl(p) => ascii_perl_set_item(p.span, &p.kind, p.negated),
242        ClassSetItem::Union(u) => ClassSetItem::Union(ClassSetUnion {
243            span: u.span,
244            items: u.items.iter().map(rewrite_ascii_perl_item).collect(),
245        }),
246        ClassSetItem::Bracketed(b) => ClassSetItem::Bracketed(Box::new(ClassBracketed {
247            span: b.span,
248            negated: b.negated,
249            kind: rewrite_ascii_perl_set(&b.kind),
250        })),
251        other => other.clone(),
252    }
253}
254
255#[derive(Clone, Debug, Eq, PartialEq)]
256enum Primitive {
257    Literal(Literal),
258    Assertion(ast::Assertion),
259    Dot(Span),
260    Top(Span),
261    Perl(ClassPerl),
262    Unicode(ClassUnicode),
263}
264
265impl Primitive {
266    fn span(&self) -> &Span {
267        match *self {
268            Primitive::Literal(ref x) => &x.span,
269            Primitive::Assertion(ref x) => &x.span,
270            Primitive::Dot(ref span) => span,
271            Primitive::Top(ref span) => span,
272            Primitive::Perl(ref x) => &x.span,
273            Primitive::Unicode(ref x) => &x.span,
274        }
275    }
276
277    fn into_ast(self) -> Ast {
278        match self {
279            Primitive::Literal(lit) => Ast::literal(lit),
280            Primitive::Assertion(assert) => Ast::assertion(assert),
281            Primitive::Dot(span) => Ast::dot(span),
282            Primitive::Top(span) => Ast::top(span),
283            Primitive::Perl(cls) => Ast::class_perl(cls),
284            Primitive::Unicode(cls) => Ast::class_unicode(cls),
285        }
286    }
287
288    fn into_class_set_item(self, p: &ResharpParser) -> Result<regex_syntax::ast::ClassSetItem> {
289        use self::Primitive::*;
290        use regex_syntax::ast::ClassSetItem;
291
292        match self {
293            Literal(lit) => Ok(ClassSetItem::Literal(lit)),
294            Perl(cls) => Ok(ClassSetItem::Perl(cls)),
295            Unicode(cls) => Ok(ClassSetItem::Unicode(cls)),
296            x => Err(p.error(*x.span(), ast::ErrorKind::ClassEscapeInvalid)),
297        }
298    }
299
300    fn into_class_literal(self, p: &ResharpParser) -> Result<Literal> {
301        use self::Primitive::*;
302
303        match self {
304            Literal(lit) => Ok(lit),
305            x => Err(p.error(*x.span(), ast::ErrorKind::ClassRangeLiteral)),
306        }
307    }
308}
309
310#[derive(Clone, Debug, Eq, PartialEq)]
311pub enum Either<Left, Right> {
312    Left(Left),
313    Right(Right),
314}
315
316#[derive(Clone, Debug, Eq, PartialEq)]
317pub struct ParseError {
318    /// The kind of error.
319    pub kind: ErrorKind,
320    /// The original pattern that the parser generated the error from. Every
321    /// span in an error is a valid range into this string.
322    pattern: String,
323    /// The span of this error.
324    pub span: Span,
325}
326
327impl std::fmt::Display for ParseError {
328    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
329        write!(f, "{:?}: {:?}", self.kind, self.span)
330    }
331}
332impl std::error::Error for ParseError {}
333
334type Result<T> = core::result::Result<T, ParseError>;
335
336#[derive(Clone, Debug)]
337enum GroupState {
338    /// This state is pushed whenever an opening group is found.
339    Group {
340        /// The concatenation immediately preceding the opening group.
341        concat: Concat,
342        /// The group that has been opened. Its sub-AST is always empty.
343        group: ast::Group,
344        /// Whether this group has the `x` flag enabled or not.
345        ignore_whitespace: bool,
346    },
347    Alternation(ast::Alternation),
348    Intersection(ast::Intersection),
349}
350
351#[derive(Clone, Debug)]
352enum ClassState {
353    /// This state is pushed whenever an opening bracket is found.
354    Open {
355        /// The union of class items immediately preceding this class.
356        union: regex_syntax::ast::ClassSetUnion,
357        set: regex_syntax::ast::ClassBracketed,
358    },
359    /// This state is pushed when a operator is seen. When popped, the stored
360    /// set becomes the left hand side of the operator.
361    Op {
362        /// The type of the operation, i.e., &&, -- or ~~.
363        kind: regex_syntax::ast::ClassSetBinaryOpKind,
364        /// The left-hand side of the operator.
365        lhs: regex_syntax::ast::ClassSet,
366    },
367}
368
369/// RE# syntax parser based on the regex-syntax crate.
370pub struct ResharpParser<'s> {
371    perl_classes: Vec<(bool, regex_syntax::ast::ClassPerlKind, NodeId)>,
372    unicode_classes: resharp_algebra::UnicodeClassCache,
373    pub translator: regex_syntax::hir::translate::Translator,
374    pub pattern: &'s str,
375    pos: Cell<Position>,
376    capture_index: Cell<u32>,
377    octal: bool,
378    empty_min_range: bool,
379    ignore_whitespace: Cell<bool>,
380    dot_all: Cell<bool>,
381    multiline: Cell<bool>,
382    global_unicode: bool,
383    global_full_unicode: bool,
384    global_ascii_perl: bool,
385    global_case_insensitive: bool,
386    expanded_ast_limit: u64,
387    max_list_len: usize,
388    max_repeat: u32,
389    max_depth: usize,
390    comments: RefCell<Vec<ast::Comment>>,
391    stack_group: RefCell<Vec<GroupState>>,
392    stack_class: RefCell<Vec<ClassState>>,
393    capture_names: RefCell<Vec<ast::CaptureName>>,
394    scratch: RefCell<String>,
395}
396
397fn specialize_err<T>(result: Result<T>, from: ast::ErrorKind, to: ast::ErrorKind) -> Result<T> {
398    result.map_err(|e| {
399        if e.kind == from {
400            ParseError {
401                kind: to,
402                pattern: e.pattern,
403                span: e.span,
404            }
405        } else {
406            e
407        }
408    })
409}
410
411fn is_capture_char(c: char, first: bool) -> bool {
412    if first {
413        c == '_' || c.is_alphabetic()
414    } else {
415        c == '_' || c == '.' || c == '[' || c == ']' || c.is_alphanumeric()
416    }
417}
418
419pub fn is_meta_character(c: char) -> bool {
420    matches!(
421        c,
422        '\\' | '.'
423            | '+'
424            | '*'
425            | '?'
426            | '('
427            | ')'
428            | '|'
429            | '['
430            | ']'
431            | '{'
432            | '}'
433            | '^'
434            | '$'
435            | '#'
436            | '&'
437            | '-'
438            | '~'
439            | '_'
440    )
441}
442
443/// escapes all resharp meta characters in `text`.
444pub fn escape(text: &str) -> String {
445    let mut buf = String::new();
446    escape_into(text, &mut buf);
447    buf
448}
449
450/// escapes all resharp meta characters in `text` and appends to `buf`.
451pub fn escape_into(text: &str, buf: &mut String) {
452    buf.reserve(text.len());
453    for c in text.chars() {
454        if is_meta_character(c) {
455            buf.push('\\');
456        }
457        buf.push(c);
458    }
459}
460
461pub fn is_escapeable_character(c: char) -> bool {
462    if is_meta_character(c) {
463        return true;
464    }
465    if !c.is_ascii() {
466        return false;
467    }
468    match c {
469        '0'..='9' | 'A'..='Z' | 'a'..='z' => false,
470        '<' | '>' => false,
471        _ => true,
472    }
473}
474
475fn is_hex(c: char) -> bool {
476    c.is_ascii_digit() || ('a'..='f').contains(&c) || ('A'..='F').contains(&c)
477}
478
479fn ensure_lookbehind_at_start(ast: &Ast, at_start: bool) -> core::result::Result<(), Span> {
480    match ast {
481        Ast::Concat(c) => {
482            let mut child_at_start = at_start;
483            for child in &c.asts {
484                ensure_lookbehind_at_start(child, child_at_start)?;
485                if ast_may_consume(child) {
486                    child_at_start = false;
487                }
488            }
489            Ok(())
490        }
491        Ast::Alternation(a) => {
492            for child in &a.asts {
493                ensure_lookbehind_at_start(child, at_start)?;
494            }
495            Ok(())
496        }
497        Ast::Intersection(i) => {
498            for child in &i.asts {
499                ensure_lookbehind_at_start(child, at_start)?;
500            }
501            Ok(())
502        }
503        Ast::Complement(c) => ensure_lookbehind_at_start(&c.ast, at_start),
504        Ast::Group(g) => ensure_lookbehind_at_start(&g.ast, at_start),
505        Ast::Repetition(r) => ensure_lookbehind_at_start(&r.ast, at_start),
506        Ast::Lookaround(g) => {
507            match g.kind {
508                LookaroundKind::PositiveLookbehind | LookaroundKind::NegativeLookbehind => {
509                    if !at_start {
510                        return Err(g.span);
511                    }
512                }
513                LookaroundKind::PositiveLookahead | LookaroundKind::NegativeLookahead => {}
514            }
515            ensure_lookbehind_at_start(&g.ast, true)
516        }
517        Ast::Empty(_)
518        | Ast::Flags(_)
519        | Ast::Literal(_)
520        | Ast::Dot(_)
521        | Ast::Top(_)
522        | Ast::Assertion(_)
523        | Ast::ClassUnicode(_)
524        | Ast::ClassPerl(_)
525        | Ast::ClassBracketed(_) => Ok(()),
526    }
527}
528
529fn ast_may_consume(ast: &Ast) -> bool {
530    match ast {
531        Ast::Empty(_) | Ast::Flags(_) | Ast::Assertion(_) | Ast::Lookaround(_) => false,
532        Ast::Literal(_)
533        | Ast::Dot(_)
534        | Ast::Top(_)
535        | Ast::ClassUnicode(_)
536        | Ast::ClassPerl(_)
537        | Ast::ClassBracketed(_) => true,
538        Ast::Group(g) => ast_may_consume(&g.ast),
539        Ast::Repetition(r) => {
540            if !ast_may_consume(&r.ast) {
541                return false;
542            }
543            match r.op.kind {
544                RepetitionKind::ZeroOrOne
545                | RepetitionKind::ZeroOrMore
546                | RepetitionKind::OneOrMore => true,
547                RepetitionKind::Range(ast::RepetitionRange::Exactly(0)) => false,
548                RepetitionKind::Range(ast::RepetitionRange::Bounded(_, 0)) => false,
549                RepetitionKind::Range(_) => true,
550            }
551        }
552        Ast::Alternation(a) => a.asts.iter().any(ast_may_consume),
553        Ast::Intersection(i) => i.asts.iter().any(ast_may_consume),
554        Ast::Complement(_) => true,
555        Ast::Concat(c) => c.asts.iter().any(ast_may_consume),
556    }
557}
558
559impl<'s> ResharpParser<'s> {
560    fn default_translator_builder(&self) -> TranslatorBuilder {
561        let mut trb = TranslatorBuilder::new();
562        trb.unicode(self.global_unicode);
563        trb.utf8(false);
564        trb.case_insensitive(self.global_case_insensitive);
565        trb
566    }
567
568    pub fn new(pattern: &'s str) -> Self {
569        Self::with_flags(pattern, &PatternFlags::default())
570    }
571
572    pub fn with_flags(pattern: &'s str, flags: &PatternFlags) -> Self {
573        let mut trb = TranslatorBuilder::new();
574        trb.unicode(flags.unicode);
575        trb.utf8(false);
576        trb.case_insensitive(flags.case_insensitive);
577        Self {
578            translator: trb.build(),
579            pattern,
580            perl_classes: vec![],
581            unicode_classes: resharp_algebra::UnicodeClassCache::default(),
582            pos: Cell::new(Position::new(0, 0, 0)),
583            capture_index: Cell::new(0),
584            octal: false,
585            empty_min_range: false,
586            ignore_whitespace: Cell::new(flags.ignore_whitespace),
587            dot_all: Cell::new(flags.dot_matches_new_line),
588            multiline: Cell::new(flags.multiline),
589            global_unicode: flags.unicode || flags.full_unicode || flags.ascii_perl_classes,
590            global_full_unicode: flags.full_unicode,
591            global_ascii_perl: flags.ascii_perl_classes,
592            global_case_insensitive: flags.case_insensitive,
593            expanded_ast_limit: flags.expanded_ast_limit,
594            max_list_len: flags.max_list_len,
595            max_repeat: flags.max_repeat,
596            max_depth: flags.max_depth,
597            comments: RefCell::new(vec![]),
598            stack_group: RefCell::new(vec![]),
599            stack_class: RefCell::new(vec![]),
600            capture_names: RefCell::new(vec![]),
601            scratch: RefCell::new(String::new()),
602        }
603    }
604
605    fn parser(&'_ self) -> &'_ ResharpParser<'_> {
606        self
607    }
608
609    fn pattern(&self) -> &str {
610        self.pattern
611    }
612
613    fn error(&self, span: Span, kind: ast::ErrorKind) -> ParseError {
614        ParseError {
615            kind,
616            pattern: self.pattern().to_string(),
617            span,
618        }
619    }
620
621    fn unsupported_error(&self, _: regex_syntax::hir::Error) -> ParseError {
622        self.error(
623            Span::splat(self.pos()),
624            ast::ErrorKind::UnsupportedResharpRegex,
625        )
626    }
627
628    fn offset(&self) -> usize {
629        self.parser().pos.get().offset
630    }
631
632    fn line(&self) -> usize {
633        self.parser().pos.get().line
634    }
635
636    fn column(&self) -> usize {
637        self.parser().pos.get().column
638    }
639
640    fn next_capture_index(&self, span: Span) -> Result<u32> {
641        let current = self.parser().capture_index.get();
642        let i = current
643            .checked_add(1)
644            .ok_or_else(|| self.error(span, ast::ErrorKind::CaptureLimitExceeded))?;
645        self.parser().capture_index.set(i);
646        Ok(i)
647    }
648
649    fn add_capture_name(&self, cap: &ast::CaptureName) -> Result<()> {
650        let mut names = self.parser().capture_names.borrow_mut();
651        match names.binary_search_by_key(&cap.name.as_str(), |c| c.name.as_str()) {
652            Err(i) => {
653                names.insert(i, cap.clone());
654                Ok(())
655            }
656            Ok(i) => Err(self.error(
657                cap.span,
658                ast::ErrorKind::GroupNameDuplicate {
659                    original: names[i].span,
660                },
661            )),
662        }
663    }
664
665    fn ignore_whitespace(&self) -> bool {
666        self.parser().ignore_whitespace.get()
667    }
668
669    fn char(&self) -> char {
670        self.char_at(self.offset())
671    }
672
673    fn char_at(&self, i: usize) -> char {
674        self.pattern()[i..]
675            .chars()
676            .next()
677            .unwrap_or_else(|| panic!("expected char at offset {}", i))
678    }
679
680    fn bump(&self) -> bool {
681        if self.is_eof() {
682            return false;
683        }
684        let Position {
685            mut offset,
686            mut line,
687            mut column,
688        } = self.pos();
689        if self.char() == '\n' {
690            line = line.checked_add(1).unwrap();
691            column = 1;
692        } else {
693            column = column.checked_add(1).unwrap();
694        }
695        offset += self.char().len_utf8();
696        self.parser().pos.set(Position {
697            offset,
698            line,
699            column,
700        });
701        self.pattern()[self.offset()..].chars().next().is_some()
702    }
703
704    fn bump_if(&self, prefix: &str) -> bool {
705        if self.pattern()[self.offset()..].starts_with(prefix) {
706            for _ in 0..prefix.chars().count() {
707                self.bump();
708            }
709            true
710        } else {
711            false
712        }
713    }
714
715    fn is_lookaround_prefix(&self) -> Option<(bool, bool)> {
716        if self.bump_if("?=") {
717            return Some((true, true));
718        }
719        if self.bump_if("?!") {
720            return Some((true, false));
721        }
722        if self.bump_if("?<=") {
723            return Some((false, true));
724        }
725        if self.bump_if("?<!") {
726            return Some((false, false));
727        }
728        None
729    }
730
731    fn bump_and_bump_space(&self) -> bool {
732        if !self.bump() {
733            return false;
734        }
735        self.bump_space();
736        !self.is_eof()
737    }
738
739    fn bump_space(&self) {
740        if !self.ignore_whitespace() {
741            return;
742        }
743        while !self.is_eof() {
744            if self.char().is_whitespace() {
745                self.bump();
746            } else if self.char() == '#' {
747                let start = self.pos();
748                let mut comment_text = String::new();
749                self.bump();
750                while !self.is_eof() {
751                    let c = self.char();
752                    self.bump();
753                    if c == '\n' {
754                        break;
755                    }
756                    comment_text.push(c);
757                }
758                let comment = ast::Comment {
759                    span: Span::new(start, self.pos()),
760                    comment: comment_text,
761                };
762                self.parser().comments.borrow_mut().push(comment);
763            } else {
764                break;
765            }
766        }
767    }
768
769    fn peek(&self) -> Option<char> {
770        if self.is_eof() {
771            return None;
772        }
773        self.pattern()[self.offset() + self.char().len_utf8()..]
774            .chars()
775            .next()
776    }
777
778    /// Like peek, but will ignore spaces when the parser is in whitespace
779    /// insensitive mode.
780    fn peek_space(&self) -> Option<char> {
781        if !self.ignore_whitespace() {
782            return self.peek();
783        }
784        if self.is_eof() {
785            return None;
786        }
787        let mut start = self.offset() + self.char().len_utf8();
788        let mut in_comment = false;
789        for (i, c) in self.pattern()[start..].char_indices() {
790            if c.is_whitespace() {
791                continue;
792            } else if !in_comment && c == '#' {
793                in_comment = true;
794            } else if in_comment && c == '\n' {
795                in_comment = false;
796            } else {
797                start += i;
798                break;
799            }
800        }
801        self.pattern()[start..].chars().next()
802    }
803
804    fn is_eof(&self) -> bool {
805        self.offset() == self.pattern().len()
806    }
807
808    fn pos(&self) -> Position {
809        self.parser().pos.get()
810    }
811
812    fn span(&self) -> Span {
813        Span::splat(self.pos())
814    }
815
816    fn span_char(&self) -> Span {
817        let mut next = Position {
818            offset: self.offset().checked_add(self.char().len_utf8()).unwrap(),
819            line: self.line(),
820            column: self.column().checked_add(1).unwrap(),
821        };
822        if self.char() == '\n' {
823            next.line += 1;
824            next.column = 1;
825        }
826        Span::new(self.pos(), next)
827    }
828
829    #[inline(never)]
830    fn push_alternate(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
831        assert_eq!(self.char(), '|');
832        concat.span.end = self.pos();
833        self.push_or_add_alternation(concat);
834        self.bump();
835        Ok(ast::Concat {
836            span: self.span(),
837            asts: vec![],
838        })
839    }
840
841    fn push_or_add_alternation(&self, concat: Concat) {
842        use self::GroupState::*;
843
844        let mut stack = self.parser().stack_group.borrow_mut();
845        if let Some(&mut Alternation(ref mut alts)) = stack.last_mut() {
846            alts.asts.push(concat.into_ast());
847            return;
848        }
849        stack.push(Alternation(ast::Alternation {
850            span: Span::new(concat.span.start, self.pos()),
851            asts: vec![concat.into_ast()],
852        }));
853    }
854
855    #[inline(never)]
856    fn push_intersect(&self, mut concat: Concat) -> Result<Concat> {
857        assert_eq!(self.char(), '&');
858        concat.span.end = self.pos();
859        self.push_or_add_intersect(concat);
860        self.bump();
861        Ok(Concat {
862            span: self.span(),
863            asts: vec![],
864        })
865    }
866
867    fn push_or_add_intersect(&self, concat: Concat) {
868        use self::GroupState::*;
869
870        let mut stack = self.parser().stack_group.borrow_mut();
871        if let Some(&mut Intersection(ref mut alts)) = stack.last_mut() {
872            alts.asts.push(concat.into_ast());
873            return;
874        }
875        stack.push(Intersection(ast::Intersection {
876            span: Span::new(concat.span.start, self.pos()),
877            asts: vec![concat.into_ast()],
878        }));
879    }
880
881    #[inline(never)]
882    fn push_group(&self, mut concat: Concat) -> Result<Concat> {
883        assert_eq!(self.char(), '(');
884        match self.parse_group()? {
885            Either::Left(set) => {
886                let ignore = set.flags.flag_state(ast::Flag::IgnoreWhitespace);
887                if let Some(v) = ignore {
888                    self.parser().ignore_whitespace.set(v);
889                }
890
891                concat.asts.push(Ast::flags(set));
892                Ok(concat)
893            }
894            Either::Right(group) => {
895                let old_ignore_whitespace = self.ignore_whitespace();
896                let new_ignore_whitespace = group
897                    .flags()
898                    .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
899                    .unwrap_or(old_ignore_whitespace);
900                self.parser()
901                    .stack_group
902                    .borrow_mut()
903                    .push(GroupState::Group {
904                        concat,
905                        group,
906                        ignore_whitespace: old_ignore_whitespace,
907                    });
908                self.parser().ignore_whitespace.set(new_ignore_whitespace);
909                Ok(Concat {
910                    span: self.span(),
911                    asts: vec![],
912                })
913            }
914        }
915    }
916
917    #[inline(never)]
918    fn push_compl_group(&self, concat: Concat) -> Result<Concat> {
919        assert_eq!(self.char(), '~');
920        self.bump();
921        if self.is_eof() || self.char() != '(' {
922            return Err(self.error(self.span(), ast::ErrorKind::ComplementGroupExpected));
923        }
924        let open_span = self.span_char();
925        self.bump();
926        let group = ast::Group {
927            span: open_span,
928            kind: ast::GroupKind::Complement,
929            ast: Box::new(Ast::empty(self.span())),
930        };
931
932        let old_ignore_whitespace = self.ignore_whitespace();
933        let new_ignore_whitespace = group
934            .flags()
935            .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
936            .unwrap_or(old_ignore_whitespace);
937        self.parser()
938            .stack_group
939            .borrow_mut()
940            .push(GroupState::Group {
941                concat,
942                group,
943                ignore_whitespace: old_ignore_whitespace,
944            });
945        self.parser().ignore_whitespace.set(new_ignore_whitespace);
946        Ok(Concat {
947            span: self.span(),
948            asts: vec![],
949        })
950    }
951
952    #[inline(never)]
953    fn pop_group(&self, mut group_concat: Concat) -> Result<Concat> {
954        use self::GroupState::*;
955        assert_eq!(self.char(), ')');
956        let mut stack = self.parser().stack_group.borrow_mut();
957        let topstack = stack.pop();
958
959        let (mut prior_concat, mut group, ignore_whitespace, alt) = match topstack {
960            Some(Group {
961                concat,
962                group,
963                ignore_whitespace,
964            }) => (concat, group, ignore_whitespace, None),
965            Some(Alternation(alt)) => match stack.pop() {
966                Some(Group {
967                    concat,
968                    group,
969                    ignore_whitespace,
970                }) => (
971                    concat,
972                    group,
973                    ignore_whitespace,
974                    Some(Either::Left::<ast::Alternation, ast::Intersection>(alt)),
975                ),
976                None | Some(Alternation(_)) | Some(Intersection(_)) => {
977                    return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
978                }
979            },
980            Some(Intersection(int)) => match stack.pop() {
981                Some(Group {
982                    concat,
983                    group,
984                    ignore_whitespace,
985                }) => (
986                    concat,
987                    group,
988                    ignore_whitespace,
989                    Some(Either::Right::<ast::Alternation, ast::Intersection>(int)),
990                ),
991                None | Some(Alternation(_)) | Some(Intersection(_)) => {
992                    return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
993                }
994            },
995
996            None => {
997                return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
998            }
999        };
1000        self.parser().ignore_whitespace.set(ignore_whitespace);
1001        group_concat.span.end = self.pos();
1002        self.bump();
1003        group.span.end = self.pos();
1004        match alt {
1005            Some(Either::Left(mut alt)) => {
1006                alt.span.end = group_concat.span.end;
1007                alt.asts.push(group_concat.into_ast());
1008                group.ast = Box::new(alt.into_ast());
1009            }
1010            Some(Either::Right(mut int)) => {
1011                int.span.end = group_concat.span.end;
1012                int.asts.push(group_concat.into_ast());
1013                group.ast = Box::new(int.into_ast());
1014            }
1015            None => {
1016                group.ast = Box::new(group_concat.into_ast());
1017            }
1018        }
1019
1020        if group.kind == GroupKind::Complement {
1021            let complement = ast::Complement {
1022                span: self.span(),
1023                ast: group.ast,
1024            };
1025            prior_concat.asts.push(Ast::complement(complement));
1026        }
1027        // ignore groups for now
1028        else {
1029            prior_concat.asts.push(Ast::group(group));
1030        }
1031        Ok(prior_concat)
1032    }
1033
1034    #[inline(never)]
1035    fn pop_group_end(&self, mut concat: ast::Concat) -> Result<Ast> {
1036        concat.span.end = self.pos();
1037        let mut stack = self.parser().stack_group.borrow_mut();
1038        let ast = match stack.pop() {
1039            None => Ok(concat.into_ast()),
1040            Some(GroupState::Alternation(mut alt)) => {
1041                alt.span.end = self.pos();
1042                alt.asts.push(concat.into_ast());
1043                Ok(Ast::alternation(alt))
1044            }
1045            Some(GroupState::Intersection(mut int)) => {
1046                int.span.end = self.pos();
1047                int.asts.push(concat.into_ast());
1048
1049                Ok(Ast::intersection(int))
1050            }
1051            Some(GroupState::Group { group, .. }) => {
1052                return Err(self.error(group.span, ast::ErrorKind::GroupUnclosed));
1053            }
1054        };
1055        // If we try to pop again, there should be nothing.
1056        match stack.pop() {
1057            None => ast,
1058            Some(GroupState::Alternation(alt)) => {
1059                Err(self.error(alt.span, ast::ErrorKind::UnsupportedResharpRegex))
1060            }
1061            Some(GroupState::Intersection(int)) => {
1062                Err(self.error(int.span, ast::ErrorKind::UnsupportedResharpRegex))
1063            }
1064            Some(GroupState::Group { group, .. }) => {
1065                Err(self.error(group.span, ast::ErrorKind::GroupUnclosed))
1066            }
1067        }
1068    }
1069
1070    #[inline(never)]
1071    fn push_class_open(
1072        &self,
1073        parent_union: regex_syntax::ast::ClassSetUnion,
1074    ) -> Result<regex_syntax::ast::ClassSetUnion> {
1075        assert_eq!(self.char(), '[');
1076
1077        let (nested_set, nested_union) = self.parse_set_class_open()?;
1078        self.parser()
1079            .stack_class
1080            .borrow_mut()
1081            .push(ClassState::Open {
1082                union: parent_union,
1083                set: nested_set,
1084            });
1085        Ok(nested_union)
1086    }
1087
1088    #[inline(never)]
1089    fn pop_class(
1090        &self,
1091        nested_union: regex_syntax::ast::ClassSetUnion,
1092    ) -> Result<Either<regex_syntax::ast::ClassSetUnion, regex_syntax::ast::ClassBracketed>> {
1093        assert_eq!(self.char(), ']');
1094
1095        let item = regex_syntax::ast::ClassSet::Item(nested_union.into_item());
1096        let prevset = self.pop_class_op(item);
1097        let mut stack = self.parser().stack_class.borrow_mut();
1098        match stack.pop() {
1099            None => panic!("unexpected empty character class stack"),
1100            Some(ClassState::Op { .. }) => panic!("unexpected ClassState::Op"),
1101            Some(ClassState::Open { mut union, mut set }) => {
1102                self.bump();
1103                set.span.end = self.pos();
1104                set.kind = prevset;
1105                if stack.is_empty() {
1106                    Ok(Either::Right(set))
1107                } else {
1108                    union.push(regex_syntax::ast::ClassSetItem::Bracketed(Box::new(set)));
1109                    Ok(Either::Left(union))
1110                }
1111            }
1112        }
1113    }
1114
1115    #[inline(never)]
1116    fn unclosed_class_error(&self) -> ParseError {
1117        for state in self.parser().stack_class.borrow().iter().rev() {
1118            if let ClassState::Open { ref set, .. } = *state {
1119                return self.error(set.span, ast::ErrorKind::ClassUnclosed);
1120            }
1121        }
1122        panic!("no open character class found")
1123    }
1124
1125    #[inline(never)]
1126    fn push_class_op(
1127        &self,
1128        next_kind: regex_syntax::ast::ClassSetBinaryOpKind,
1129        next_union: regex_syntax::ast::ClassSetUnion,
1130    ) -> regex_syntax::ast::ClassSetUnion {
1131        let item = regex_syntax::ast::ClassSet::Item(next_union.into_item());
1132        let new_lhs = self.pop_class_op(item);
1133        self.parser().stack_class.borrow_mut().push(ClassState::Op {
1134            kind: next_kind,
1135            lhs: new_lhs,
1136        });
1137        regex_syntax::ast::ClassSetUnion {
1138            span: self.span(),
1139            items: vec![],
1140        }
1141    }
1142
1143    #[inline(never)]
1144    fn pop_class_op(&self, rhs: regex_syntax::ast::ClassSet) -> regex_syntax::ast::ClassSet {
1145        let mut stack = self.parser().stack_class.borrow_mut();
1146        let (kind, lhs) = match stack.pop() {
1147            Some(ClassState::Op { kind, lhs }) => (kind, lhs),
1148            Some(state @ ClassState::Open { .. }) => {
1149                stack.push(state);
1150                return rhs;
1151            }
1152            None => unreachable!(),
1153        };
1154        let span = Span::new(lhs.span().start, rhs.span().end);
1155        regex_syntax::ast::ClassSet::BinaryOp(regex_syntax::ast::ClassSetBinaryOp {
1156            span,
1157            kind,
1158            lhs: Box::new(lhs),
1159            rhs: Box::new(rhs),
1160        })
1161    }
1162
1163    fn hir_to_node_id(&self, hir: &hir::Hir, tb: &mut TB<'s>) -> Result<NodeId> {
1164        match hir.kind() {
1165            hir::HirKind::Empty => Ok(NodeId::EPS),
1166            hir::HirKind::Literal(l) => {
1167                if l.0.len() == 1 {
1168                    let node = tb.mk_u8(l.0[0]);
1169                    Ok(node)
1170                } else {
1171                    let ws: Vec<_> = l.0.iter().map(|l| tb.mk_u8(*l)).collect();
1172                    let conc = tb.mk_concats(ws.iter().copied());
1173                    Ok(conc)
1174                }
1175            }
1176            hir::HirKind::Class(class) => match class {
1177                hir::Class::Unicode(class_unicode) => {
1178                    let ranges = class_unicode.ranges();
1179                    if ranges.len() == 1
1180                        && ranges[0].start() == '\u{0}'
1181                        && ranges[0].end() == '\u{10FFFF}'
1182                    {
1183                        return Ok(tb.mk_range_u8(0, 255));
1184                    }
1185                    let mut nodes = Vec::new();
1186                    for range in ranges {
1187                        for seq in Utf8Sequences::new(range.start(), range.end()) {
1188                            let sl = seq.as_slice();
1189                            let bytes: Vec<_> = sl.iter().map(|s| (s.start, s.end)).collect();
1190                            let node = match bytes.len() {
1191                                1 => tb.mk_range_u8(bytes[0].0, bytes[0].1),
1192                                n => {
1193                                    let last = tb.mk_range_u8(bytes[n - 1].0, bytes[n - 1].1);
1194                                    let mut conc = last;
1195                                    for i in (0..n - 1).rev() {
1196                                        let b = tb.mk_range_u8(bytes[i].0, bytes[i].1);
1197                                        conc = tb.mk_concat(b, conc);
1198                                    }
1199                                    conc
1200                                }
1201                            };
1202                            nodes.push(node);
1203                        }
1204                    }
1205                    let merged = tb.mk_unions(nodes.into_iter());
1206                    Ok(merged)
1207                }
1208                hir::Class::Bytes(class_bytes) => {
1209                    let ranges = class_bytes.ranges();
1210                    let mut result = NodeId::BOT;
1211                    for range in ranges {
1212                        let start = range.start();
1213                        let end = range.end();
1214                        let node = tb.mk_range_u8(start, end);
1215                        result = tb.mk_union(result, node);
1216                    }
1217                    Ok(result)
1218                }
1219            },
1220            hir::HirKind::Look(_) => Err(self.error(
1221                Span::splat(self.pos()),
1222                ast::ErrorKind::UnsupportedResharpRegex,
1223            )),
1224            hir::HirKind::Repetition(_) => Err(self.error(
1225                Span::splat(self.pos()),
1226                ast::ErrorKind::UnsupportedResharpRegex,
1227            )),
1228            hir::HirKind::Capture(_) => Err(self.error(
1229                Span::splat(self.pos()),
1230                ast::ErrorKind::UnsupportedResharpRegex,
1231            )),
1232            hir::HirKind::Concat(body) => {
1233                let mut result = NodeId::EPS;
1234                for child in body {
1235                    let node = self.hir_to_node_id(child, tb)?;
1236                    result = tb.mk_concat(result, node);
1237                }
1238                Ok(result)
1239            }
1240            hir::HirKind::Alternation(_) => Err(self.error(
1241                Span::splat(self.pos()),
1242                ast::ErrorKind::UnsupportedResharpRegex,
1243            )),
1244        }
1245    }
1246
1247    fn translate_ast_to_hir(
1248        &mut self,
1249        orig_ast: &regex_syntax::ast::Ast,
1250        tb: &mut TB<'s>,
1251    ) -> Result<NodeId> {
1252        match self.translator.translate("", orig_ast) {
1253            Err(_) => Err(self.error(self.span(), ast::ErrorKind::UnicodeClassInvalid)),
1254            Ok(hir) => self.hir_to_node_id(&hir, tb),
1255        }
1256    }
1257
1258    fn translator_to_node_id(
1259        &mut self,
1260        orig_ast: &regex_syntax::ast::Ast,
1261        translator: &mut Option<Translator>,
1262        tb: &mut TB<'s>,
1263    ) -> Result<NodeId> {
1264        match translator {
1265            Some(tr) => {
1266                let hir = tr
1267                    .translate("", orig_ast)
1268                    .map_err(|e| self.unsupported_error(e))?;
1269                self.hir_to_node_id(&hir, tb)
1270            }
1271            None => self.translate_ast_to_hir(orig_ast, tb),
1272        }
1273    }
1274
1275    fn get_class(
1276        &mut self,
1277        negated: bool,
1278        kind: regex_syntax::ast::ClassPerlKind,
1279        tb: &mut TB<'s>,
1280    ) -> Result<NodeId> {
1281        let w = self
1282            .perl_classes
1283            .iter()
1284            .find(|(c_neg, c_kind, _)| *c_kind == kind && *c_neg == negated);
1285        match w {
1286            Some((_, _, value)) => Ok(*value),
1287            None => {
1288                let translated = if self.global_ascii_perl {
1289                    let pos = match kind {
1290                        regex_syntax::ast::ClassPerlKind::Word => {
1291                            let az = tb.mk_range_u8(b'a', b'z');
1292                            let big = tb.mk_range_u8(b'A', b'Z');
1293                            let dig = tb.mk_range_u8(b'0', b'9');
1294                            let us = tb.mk_u8(b'_');
1295                            tb.mk_unions([az, big, dig, us].into_iter())
1296                        }
1297                        regex_syntax::ast::ClassPerlKind::Digit => tb.mk_range_u8(b'0', b'9'),
1298                        regex_syntax::ast::ClassPerlKind::Space => {
1299                            let sp = tb.mk_u8(b' ');
1300                            let tab = tb.mk_u8(b'\t');
1301                            let nl = tb.mk_u8(b'\n');
1302                            let cr = tb.mk_u8(b'\r');
1303                            let ff = tb.mk_u8(0x0C);
1304                            let vt = tb.mk_u8(0x0B);
1305                            tb.mk_unions([sp, tab, nl, cr, ff, vt].into_iter())
1306                        }
1307                    };
1308                    if negated {
1309                        resharp_algebra::neg_class(tb, pos)
1310                    } else {
1311                        pos
1312                    }
1313                } else if self.global_unicode {
1314                    match kind {
1315                        regex_syntax::ast::ClassPerlKind::Word => {
1316                            if self.global_full_unicode {
1317                                self.unicode_classes.ensure_word_full(tb);
1318                            } else {
1319                                self.unicode_classes.ensure_word(tb);
1320                            }
1321                            if negated {
1322                                self.unicode_classes.non_word
1323                            } else {
1324                                self.unicode_classes.word
1325                            }
1326                        }
1327                        regex_syntax::ast::ClassPerlKind::Digit => {
1328                            if self.global_full_unicode {
1329                                self.unicode_classes.ensure_digit_full(tb);
1330                            } else {
1331                                self.unicode_classes.ensure_digit(tb);
1332                            }
1333                            if negated {
1334                                self.unicode_classes.non_digit
1335                            } else {
1336                                self.unicode_classes.digit
1337                            }
1338                        }
1339                        regex_syntax::ast::ClassPerlKind::Space => {
1340                            if self.global_full_unicode {
1341                                self.unicode_classes.ensure_space_full(tb);
1342                            } else {
1343                                self.unicode_classes.ensure_space(tb);
1344                            }
1345                            if negated {
1346                                self.unicode_classes.non_space
1347                            } else {
1348                                self.unicode_classes.space
1349                            }
1350                        }
1351                    }
1352                } else {
1353                    let pos = match kind {
1354                        regex_syntax::ast::ClassPerlKind::Word => {
1355                            let az = tb.mk_range_u8(b'a', b'z');
1356                            let big = tb.mk_range_u8(b'A', b'Z');
1357                            let dig = tb.mk_range_u8(b'0', b'9');
1358                            let us = tb.mk_u8(b'_');
1359                            tb.mk_unions([az, big, dig, us].into_iter())
1360                        }
1361                        regex_syntax::ast::ClassPerlKind::Digit => tb.mk_range_u8(b'0', b'9'),
1362                        regex_syntax::ast::ClassPerlKind::Space => {
1363                            let sp = tb.mk_u8(b' ');
1364                            let tab = tb.mk_u8(b'\t');
1365                            let nl = tb.mk_u8(b'\n');
1366                            let cr = tb.mk_u8(b'\r');
1367                            let ff = tb.mk_u8(0x0C);
1368                            let vt = tb.mk_u8(0x0B);
1369                            tb.mk_unions([sp, tab, nl, cr, ff, vt].into_iter())
1370                        }
1371                    };
1372                    if negated {
1373                        // negate within the byte, not the language, or `\D`/`\S`/`\W` wrongly match "".
1374                        resharp_algebra::neg_class(tb, pos)
1375                    } else {
1376                        pos
1377                    }
1378                };
1379                self.perl_classes.push((negated, kind, translated));
1380                Ok(translated)
1381            }
1382        }
1383    }
1384
1385    fn word_char_kind(ast: &Ast, left: bool) -> WordCharKind {
1386        use WordCharKind::*;
1387        match ast {
1388            Ast::Literal(lit) => {
1389                if is_word_byte(lit.c as u8) {
1390                    Word
1391                } else {
1392                    NonWord
1393                }
1394            }
1395            Ast::ClassPerl(c) => match (&c.kind, c.negated) {
1396                (&regex_syntax::ast::ClassPerlKind::Word, false) => Word,
1397                (&regex_syntax::ast::ClassPerlKind::Word, true) => NonWord,
1398                (&regex_syntax::ast::ClassPerlKind::Space, false) => NonWord,
1399                (&regex_syntax::ast::ClassPerlKind::Space, true) => Unknown,
1400                (&regex_syntax::ast::ClassPerlKind::Digit, false) => Word,
1401                (&regex_syntax::ast::ClassPerlKind::Digit, true) => Unknown,
1402            },
1403            Ast::ClassBracketed(c) => class_bracketed_word_kind(c),
1404            Ast::Dot(_) | Ast::Top(_) => Unknown,
1405            Ast::Group(g) => Self::word_char_kind(&g.ast, left),
1406            Ast::Concat(c) if !c.asts.is_empty() => {
1407                let dir: isize = if left { -1 } else { 1 };
1408                let edge = match Self::concat_edge_index(&c.asts, left) {
1409                    Some(e) => e,
1410                    None => return Unknown,
1411                };
1412                let kind = Self::word_char_kind(&c.asts[edge], left);
1413                match kind {
1414                    MaybeWord => {
1415                        match Self::concat_neighbor_kind(&c.asts, edge, dir) {
1416                            Word => Word,
1417                            Edge => MaybeWord,
1418                            _ => Unknown,
1419                        }
1420                    }
1421                    MaybeNonWord => {
1422                        match Self::concat_neighbor_kind(&c.asts, edge, dir) {
1423                            NonWord => NonWord,
1424                            Edge => MaybeNonWord,
1425                            _ => Unknown,
1426                        }
1427                    }
1428                    other => other,
1429                }
1430            }
1431            Ast::Alternation(alt) if !alt.asts.is_empty() => {
1432                let first = Self::word_char_kind(&alt.asts[0], left);
1433                if alt.asts[1..]
1434                    .iter()
1435                    .all(|a| Self::word_char_kind(a, left) == first)
1436                {
1437                    first
1438                } else {
1439                    Unknown
1440                }
1441            }
1442            Ast::Repetition(r) => {
1443                let inner = Self::word_char_kind(&r.ast, left);
1444                let nullable = matches!(
1445                    &r.op.kind,
1446                    ast::RepetitionKind::ZeroOrMore
1447                        | ast::RepetitionKind::ZeroOrOne
1448                        | ast::RepetitionKind::Range(
1449                            ast::RepetitionRange::Bounded(0, _)
1450                                | ast::RepetitionRange::Exactly(0)
1451                        )
1452                );
1453                if nullable {
1454                    match inner {
1455                        Word => MaybeWord,
1456                        NonWord => MaybeNonWord,
1457                        _ => Unknown,
1458                    }
1459                } else {
1460                    inner
1461                }
1462            }
1463            Ast::Lookaround(la) => match la.kind {
1464                ast::LookaroundKind::PositiveLookahead
1465                | ast::LookaroundKind::PositiveLookbehind => Self::word_char_kind(&la.ast, left),
1466                _ => Unknown,
1467            },
1468            Ast::Assertion(a) => match (&a.kind, left) {
1469                (ast::AssertionKind::EndText, false) => NonWord,
1470                (ast::AssertionKind::StartText, true) => NonWord,
1471                _ => Unknown,
1472            },
1473            _ => Unknown,
1474        }
1475    }
1476
1477    // ok to return None here, it's only an optimization
1478    fn edge_class_ast(ast: &Ast, left: bool) -> Option<&Ast> {
1479        match ast {
1480            Ast::Literal(_)
1481            | Ast::ClassPerl(_)
1482            | Ast::ClassBracketed(_)
1483            | Ast::ClassUnicode(_)
1484            | Ast::Dot(_)
1485            | Ast::Top(_) => Some(ast),
1486            Ast::Group(g) => Self::edge_class_ast(&g.ast, left),
1487            Ast::Concat(c) if !c.asts.is_empty() => {
1488                Self::concat_edge_index(&c.asts, left)
1489                    .and_then(|e| Self::edge_class_ast(&c.asts[e], left))
1490            }
1491            Ast::Repetition(r) => {
1492                let nullable = matches!(
1493                    &r.op.kind,
1494                    ast::RepetitionKind::ZeroOrMore
1495                        | ast::RepetitionKind::ZeroOrOne
1496                        | ast::RepetitionKind::Range(
1497                            ast::RepetitionRange::Bounded(0, _)
1498                                | ast::RepetitionRange::Exactly(0)
1499                        )
1500                );
1501                if nullable {
1502                    None
1503                } else {
1504                    Self::edge_class_ast(&r.ast, left)
1505                }
1506            }
1507            _ => None,
1508        }
1509    }
1510
1511    fn resolve_word_kind(
1512        &mut self,
1513        asts: &[Ast],
1514        idx: usize,
1515        dir: isize,
1516        translator: &mut Option<Translator>,
1517        tb: &mut TB<'s>,
1518        word_id: NodeId,
1519        not_word_id: NodeId,
1520    ) -> Result<WordCharKind> {
1521        use WordCharKind::*;
1522        let fast = Self::concat_neighbor_kind(asts, idx, dir);
1523        if fast != Unknown {
1524            return Ok(fast);
1525        }
1526        let neighbor_idx = (idx as isize + dir) as usize;
1527        let node = if let Some(edge) = Self::edge_class_ast(&asts[neighbor_idx], dir < 0) {
1528            self.ast_to_node_id(edge, translator, tb)?
1529        } else if dir > 0 {
1530            let mut bodies: Vec<NodeId> = vec![];
1531            let mut j = neighbor_idx;
1532            while j < asts.len() {
1533                match &asts[j] {
1534                    Ast::Lookaround(la) => {
1535                        let kind = la.kind.clone();
1536                        let lookbehind = matches!(
1537                            kind,
1538                            ast::LookaroundKind::PositiveLookbehind
1539                                | ast::LookaroundKind::NegativeLookbehind
1540                        );
1541                        if lookbehind {
1542                            j += 1;
1543                            continue;
1544                        }
1545                        let body = self.ast_to_node_id(&la.ast, translator, tb)?;
1546                        let body = tb.try_elim_lookarounds(body).ok_or_else(|| {
1547                            self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex)
1548                        })?;
1549                        let body_ts = tb.mk_concat(body, NodeId::TS);
1550                        let constraint = match kind {
1551                            ast::LookaroundKind::PositiveLookahead => body_ts,
1552                            ast::LookaroundKind::NegativeLookahead => tb.mk_compl(body_ts),
1553                            _ => unreachable!(),
1554                        };
1555                        bodies.push(constraint);
1556                        j += 1;
1557                    }
1558                    other => {
1559                        let n = self.ast_to_node_id(other, translator, tb)?;
1560                        let n = tb.try_elim_lookarounds(n).ok_or_else(|| {
1561                            self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex)
1562                        })?;
1563                        bodies.push(tb.mk_concat(n, NodeId::TS));
1564                        break;
1565                    }
1566                }
1567            }
1568            if bodies.is_empty() {
1569                return Ok(Unknown);
1570            }
1571            let combined = tb.mk_inters(bodies.into_iter());
1572            let word_prefix = tb.mk_concat(word_id, NodeId::TS);
1573            let non_word_prefix = tb.mk_concat(not_word_id, NodeId::TS);
1574            return if tb.subsumes(word_prefix, combined) == Some(true) {
1575                Ok(Word)
1576            } else if tb.subsumes(non_word_prefix, combined) == Some(true) {
1577                Ok(NonWord)
1578            } else {
1579                Ok(Unknown)
1580            };
1581        } else {
1582            let neighbor_node = self.ast_to_node_id(&asts[neighbor_idx], translator, tb)?;
1583            let neighbor_node = Self::strip_trailing_lookahead(tb, neighbor_node);
1584            let mut neighbor_node = tb
1585                .try_elim_lookarounds(neighbor_node)
1586                .ok_or_else(|| self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))?;
1587            neighbor_node = tb.reverse(neighbor_node).or_else(|_| {
1588                Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))
1589            })?;
1590            let word_prefix = tb.mk_concat(word_id, NodeId::TS);
1591            let non_word_prefix = tb.mk_concat(not_word_id, NodeId::TS);
1592            return if tb.subsumes(word_prefix, neighbor_node) == Some(true) {
1593                Ok(Word)
1594            } else if tb.subsumes(non_word_prefix, neighbor_node) == Some(true) {
1595                Ok(NonWord)
1596            } else {
1597                Ok(Unknown)
1598            };
1599        };
1600        if tb.subsumes(word_id, node) == Some(true) {
1601            Ok(Word)
1602        } else if tb.subsumes(not_word_id, node) == Some(true) {
1603            Ok(NonWord)
1604        } else {
1605            Ok(Unknown)
1606        }
1607    }
1608
1609    fn strip_trailing_lookahead(tb: &mut TB<'s>, node: NodeId) -> NodeId {
1610        match tb.get_kind(node) {
1611            Kind::Lookahead if tb.get_min_max_length(node).1 == 0 => NodeId::EPS,
1612            Kind::Concat => {
1613                let l = node.left(tb);
1614                let r = node.right(tb);
1615                let stripped_r = Self::strip_trailing_lookahead(tb, r);
1616                if stripped_r == NodeId::EPS {
1617                    Self::strip_trailing_lookahead(tb, l)
1618                } else if stripped_r == r {
1619                    node
1620                } else {
1621                    tb.mk_concat(l, stripped_r)
1622                }
1623            }
1624            _ => node,
1625        }
1626    }
1627
1628    fn concat_edge_index(asts: &[Ast], left: bool) -> Option<usize> {
1629        let dir: isize = if left { -1 } else { 1 };
1630        let mut e = if left { asts.len() as isize - 1 } else { 0 };
1631        while e >= 0
1632            && (e as usize) < asts.len()
1633            && Self::is_transparent_for_dir(&asts[e as usize], dir)
1634        {
1635            e += dir;
1636        }
1637        if e < 0 || e as usize >= asts.len() {
1638            None
1639        } else {
1640            Some(e as usize)
1641        }
1642    }
1643
1644    fn is_transparent_for_dir(ast: &Ast, dir: isize) -> bool {
1645        match ast {
1646            Ast::Lookaround(la) => match la.kind {
1647                ast::LookaroundKind::PositiveLookahead | ast::LookaroundKind::NegativeLookahead => {
1648                    dir < 0
1649                }
1650                ast::LookaroundKind::PositiveLookbehind
1651                | ast::LookaroundKind::NegativeLookbehind => dir > 0,
1652            },
1653            Ast::Repetition(r) => matches!(
1654                &r.op.kind,
1655                ast::RepetitionKind::Range(ast::RepetitionRange::Exactly(0))
1656            ),
1657            _ => false,
1658        }
1659    }
1660
1661    fn concat_neighbor_kind(asts: &[Ast], idx: usize, dir: isize) -> WordCharKind {
1662        use WordCharKind::*;
1663        let next = idx as isize + dir;
1664        if next < 0 || next >= asts.len() as isize {
1665            return Edge;
1666        }
1667        if Self::is_transparent_for_dir(&asts[next as usize], dir) {
1668            return Self::concat_neighbor_kind(asts, next as usize, dir);
1669        }
1670        let kind = Self::word_char_kind(&asts[next as usize], dir < 0);
1671        match kind {
1672            MaybeWord => match Self::concat_neighbor_kind(asts, next as usize, dir) {
1673                Word => Word,
1674                _ => Unknown,
1675            },
1676            MaybeNonWord => match Self::concat_neighbor_kind(asts, next as usize, dir) {
1677                NonWord => NonWord,
1678                _ => Unknown,
1679            },
1680            other => other,
1681        }
1682    }
1683
1684    fn specialize_word_boundaries(
1685        &mut self,
1686        children: &mut [NodeId],
1687        tb: &mut TB<'s>,
1688    ) -> Result<()> {
1689        let wb = self.unicode_classes.wb;
1690        let non_wb = self.unicode_classes.non_wb;
1691        if wb == NodeId::MISSING {
1692            return Ok(());
1693        }
1694        let word = self.unicode_classes.word;
1695        let non_word = self.unicode_classes.non_word;
1696        if word == NodeId::MISSING {
1697            return Ok(());
1698        }
1699        // narrow check, could be done over the whole regex instead of one node
1700        // but i need to make sure the cost isnt too large
1701        let word_pref = tb.mk_concat(word, NodeId::TS);
1702        let non_word_pref = tb.mk_concat(non_word, NodeId::TS);
1703        let word_suf = tb.mk_concat(NodeId::TS, word);
1704        let non_word_suf = tb.mk_concat(NodeId::TS, non_word);
1705        let len = children.len();
1706        for k in 0..len {
1707            let l = if k == 0 {
1708                WordCharKind::Edge
1709            } else {
1710                use resharp_algebra::Kind;
1711                if tb.get_kind(children[k - 1]) == Kind::End
1712                    && (children[k] == wb || children[k] == non_wb)
1713                {
1714                    return Err(
1715                        self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex)
1716                    );
1717                }
1718                Self::classify(tb, children[k - 1], word_suf, non_word_suf)
1719            };
1720            let r = if k + 1 >= len {
1721                WordCharKind::Edge
1722            } else {
1723                Self::classify(tb, children[k + 1], word_pref, non_word_pref)
1724            };
1725            children[k] = Self::rewrite_wb_in_node(tb, children[k], wb, non_wb, word, l, r)
1726                .ok_or_else(|| self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))?;
1727        }
1728        Ok(())
1729    }
1730
1731    fn rewrite_wb_in_node(
1732        b: &mut TB<'s>,
1733        node: NodeId,
1734        wb: NodeId,
1735        non_wb: NodeId,
1736        word: NodeId,
1737        left: WordCharKind,
1738        right: WordCharKind,
1739    ) -> Option<NodeId> {
1740        let boundary_match = if node == wb {
1741            true
1742        } else if node == non_wb {
1743            false
1744        } else if b.get_kind(node) == Kind::Union {
1745            let l = Self::rewrite_wb_in_node(b, node.left(b), wb, non_wb, word, left, right)?;
1746            let r = Self::rewrite_wb_in_node(b, node.right(b), wb, non_wb, word, left, right)?;
1747            return Some(b.mk_union(l, r));
1748        } else {
1749            return Some(node);
1750        };
1751        use WordCharKind::*;
1752        let result = match (left, right) {
1753            (NonWord, Word) | (Word, NonWord) => {
1754                if boundary_match {
1755                    NodeId::EPS
1756                } else {
1757                    NodeId::BOT
1758                }
1759            }
1760            (Word, Word) | (NonWord, NonWord) => {
1761                if boundary_match {
1762                    NodeId::BOT
1763                } else {
1764                    NodeId::EPS
1765                }
1766            }
1767            (Word, _) => {
1768                if boundary_match {
1769                    b.mk_neg_lookahead(word, 0)
1770                } else {
1771                    let tail = b.mk_concat(word, NodeId::TS);
1772                    b.mk_lookahead(tail, NodeId::MISSING, 0)
1773                }
1774            }
1775            (NonWord, _) => {
1776                if boundary_match {
1777                    let tail = b.mk_concat(word, NodeId::TS);
1778                    b.mk_lookahead(tail, NodeId::MISSING, 0)
1779                } else {
1780                    b.mk_neg_lookahead(word, 0)
1781                }
1782            }
1783            (_, Word) => {
1784                if boundary_match {
1785                    b.mk_neg_lookbehind(word)
1786                } else {
1787                    b.mk_lookbehind(word, NodeId::MISSING)
1788                }
1789            }
1790            (_, NonWord) => {
1791                if boundary_match {
1792                    b.mk_lookbehind(word, NodeId::MISSING)
1793                } else {
1794                    b.mk_neg_lookbehind(word)
1795                }
1796            }
1797            _ => return Some(node),
1798        };
1799        Some(result)
1800    }
1801
1802    fn classify(
1803        b: &mut TB<'s>,
1804        node: NodeId,
1805        word_dir: NodeId,
1806        non_word_dir: NodeId,
1807    ) -> WordCharKind {
1808        if b.contains_look(node) || b.contains_anchors(node) {
1809            return WordCharKind::Unknown;
1810        }
1811        if b.subsumes(word_dir, node) == Some(true) {
1812            WordCharKind::Word
1813        } else if b.subsumes(non_word_dir, node) == Some(true) {
1814            WordCharKind::NonWord
1815        } else {
1816            WordCharKind::Unknown
1817        }
1818    }
1819
1820    fn rewrite_word_boundary_in_concat(
1821        &mut self,
1822        asts: &[Ast],
1823        idx: usize,
1824        translator: &mut Option<Translator>,
1825        tb: &mut TB<'s>,
1826        negated: bool,
1827    ) -> Result<(NodeId, usize)> {
1828        use WordCharKind::*;
1829        if self.global_full_unicode {
1830            self.unicode_classes.ensure_word_full(tb);
1831        } else if self.global_unicode && !self.global_ascii_perl {
1832            self.unicode_classes.ensure_word(tb);
1833        } else {
1834            self.unicode_classes.ensure_word_ascii(tb);
1835        }
1836        let word_id = self.unicode_classes.word;
1837        let not_word_id = self.unicode_classes.non_word;
1838        let left = self.resolve_word_kind(asts, idx, -1, translator, tb, word_id, not_word_id)?;
1839        let right = self.resolve_word_kind(asts, idx, 1, translator, tb, word_id, not_word_id)?;
1840        let boundary_match = !negated;
1841        match (left, right) {
1842            (NonWord, Word) | (Word, NonWord) => Ok((
1843                if boundary_match {
1844                    NodeId::EPS
1845                } else {
1846                    NodeId::BOT
1847                },
1848                idx + 1,
1849            )),
1850            (Word, Word) | (NonWord, NonWord) => Ok((
1851                if boundary_match {
1852                    NodeId::BOT
1853                } else {
1854                    NodeId::EPS
1855                },
1856                idx + 1,
1857            )),
1858            (Word, _) => {
1859                if boundary_match {
1860                    Ok((tb.mk_neg_lookahead(word_id, 0), idx + 1))
1861                } else {
1862                    let tail = tb.mk_concat(word_id, NodeId::TS);
1863                    self.merge_boundary_with_following_lookaheads(asts, idx, tail, translator, tb)
1864                }
1865            }
1866            (NonWord, _) => {
1867                if boundary_match {
1868                    let tail = tb.mk_concat(word_id, NodeId::TS);
1869                    self.merge_boundary_with_following_lookaheads(asts, idx, tail, translator, tb)
1870                } else {
1871                    Ok((tb.mk_neg_lookahead(word_id, 0), idx + 1))
1872                }
1873            }
1874            (_, Word) => {
1875                if boundary_match {
1876                    Ok((tb.mk_neg_lookbehind(word_id), idx + 1))
1877                } else {
1878                    Ok((tb.mk_lookbehind(word_id, NodeId::MISSING), idx + 1))
1879                }
1880            }
1881            (_, NonWord) => {
1882                if boundary_match {
1883                    Ok((tb.mk_lookbehind(word_id, NodeId::MISSING), idx + 1))
1884                } else {
1885                    Ok((tb.mk_neg_lookbehind(word_id), idx + 1))
1886                }
1887            }
1888            //   \b = (?<=\w)(?!\w) | (?<!\w)(?=\w)
1889            //   \B = (?<=\w)(?=\w)  | (?<!\w)(?!\w)
1890            //   TODO: expensive and the cost could be reduced further
1891            _ => {
1892                self.unicode_classes.ensure_wb(tb);
1893                let node = if boundary_match {
1894                    self.unicode_classes.wb
1895                } else {
1896                    self.unicode_classes.non_wb
1897                };
1898                Ok((node, idx + 1))
1899            }
1900        }
1901    }
1902
1903    fn merge_boundary_with_following_lookaheads(
1904        &mut self,
1905        asts: &[Ast],
1906        wb_idx: usize,
1907        boundary_tail: NodeId,
1908        translator: &mut Option<Translator>,
1909        tb: &mut TB<'s>,
1910    ) -> Result<(NodeId, usize)> {
1911        let mut next = wb_idx + 1;
1912        let mut la_bodies = vec![boundary_tail];
1913        while next < asts.len() {
1914            match &asts[next] {
1915                Ast::Lookaround(la) if la.kind == ast::LookaroundKind::PositiveLookahead => {
1916                    let body = self.ast_to_node_id(&la.ast, translator, tb)?;
1917                    la_bodies.push(tb.mk_concat(body, NodeId::TS));
1918                    next += 1;
1919                }
1920                _ => break,
1921            }
1922        }
1923        let merged = tb.mk_inters(la_bodies.into_iter());
1924        Ok((tb.mk_lookahead(merged, NodeId::MISSING, 0), next))
1925    }
1926
1927    fn ast_to_node_id(
1928        &mut self,
1929        ast: &Ast,
1930        translator: &mut Option<Translator>,
1931        tb: &mut TB<'s>,
1932    ) -> Result<NodeId> {
1933        match ast {
1934            Ast::Empty(_) => Ok(NodeId::EPS),
1935            Ast::Flags(f) => {
1936                if f.flags.flag_state(ast::Flag::SwapGreed).is_some() {
1937                    return Err(self.error(f.span, ast::ErrorKind::UnsupportedResharpRegex));
1938                }
1939                let mut translator_builder = self.default_translator_builder();
1940                if let Some(state) = f.flags.flag_state(ast::Flag::CaseInsensitive) {
1941                    translator_builder.case_insensitive(state);
1942                }
1943                if let Some(state) = f.flags.flag_state(ast::Flag::Unicode) {
1944                    translator_builder.unicode(state);
1945                }
1946                if let Some(state) = f.flags.flag_state(ast::Flag::DotMatchesNewLine) {
1947                    self.dot_all.set(state);
1948                }
1949                if let Some(state) = f.flags.flag_state(ast::Flag::MultiLine) {
1950                    self.multiline.set(state);
1951                }
1952                let concat_translator = Some(translator_builder.build());
1953                *translator = concat_translator;
1954                Ok(NodeId::EPS)
1955            }
1956            Ast::Literal(l) => {
1957                let ast_lit = regex_syntax::ast::Ast::literal(*l.to_owned());
1958                self.translator_to_node_id(&ast_lit, translator, tb)
1959            }
1960            Ast::Top(_) => Ok(NodeId::TOP),
1961            Ast::Dot(_) => {
1962                let codepoint_dot = self.global_ascii_perl || self.global_full_unicode;
1963                let hirv = match (codepoint_dot, self.dot_all.get()) {
1964                    (true, true) => hir::Hir::dot(hir::Dot::AnyChar),
1965                    (true, false) => hir::Hir::dot(hir::Dot::AnyCharExceptLF),
1966                    (false, true) => return Ok(NodeId::TOP),
1967                    (false, false) => hir::Hir::dot(hir::Dot::AnyByteExceptLF),
1968                };
1969                self.hir_to_node_id(&hirv, tb)
1970            }
1971            Ast::Assertion(a) => match &a.kind {
1972                ast::AssertionKind::StartText => Ok(NodeId::BEGIN),
1973                ast::AssertionKind::EndText => Ok(NodeId::END),
1974                ast::AssertionKind::WordBoundary => {
1975                    let only = Ast::Assertion(a.clone());
1976                    let asts = std::slice::from_ref(&only);
1977                    let (node, _) =
1978                        self.rewrite_word_boundary_in_concat(asts, 0, translator, tb, false)?;
1979                    Ok(node)
1980                }
1981                ast::AssertionKind::NotWordBoundary => {
1982                    // bare \B with no surrounding concat: treat as singleton.
1983                    let only = Ast::Assertion(a.clone());
1984                    let asts = std::slice::from_ref(&only);
1985                    let (node, _) =
1986                        self.rewrite_word_boundary_in_concat(asts, 0, translator, tb, true)?;
1987                    Ok(node)
1988                }
1989                ast::AssertionKind::StartLine => {
1990                    if !self.multiline.get() {
1991                        return Ok(NodeId::BEGIN);
1992                    }
1993                    let left = NodeId::BEGIN;
1994                    let right = tb.mk_u8(b'\n');
1995                    let union = tb.mk_union(left, right);
1996                    Ok(tb.mk_lookbehind(union, NodeId::MISSING))
1997                }
1998                ast::AssertionKind::EndLine => {
1999                    if !self.multiline.get() {
2000                        return Ok(NodeId::END);
2001                    }
2002                    let left = NodeId::END;
2003                    let right = tb.mk_u8(b'\n');
2004                    let union = tb.mk_union(left, right);
2005                    Ok(tb.mk_lookahead(union, NodeId::MISSING, 0))
2006                }
2007                ast::AssertionKind::WordBoundaryStart => {
2008                    Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
2009                }
2010                ast::AssertionKind::WordBoundaryEnd => {
2011                    Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
2012                }
2013                ast::AssertionKind::WordBoundaryStartAngle => {
2014                    Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
2015                }
2016                ast::AssertionKind::WordBoundaryEndAngle => {
2017                    Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
2018                }
2019                ast::AssertionKind::WordBoundaryStartHalf => {
2020                    Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
2021                }
2022                ast::AssertionKind::WordBoundaryEndHalf => {
2023                    Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
2024                }
2025            },
2026            Ast::ClassUnicode(c) => {
2027                let tmp = regex_syntax::ast::ClassUnicode {
2028                    span: c.span,
2029                    negated: c.negated,
2030                    kind: c.kind.clone(),
2031                };
2032                if !c.negated {
2033                    if let regex_syntax::ast::ClassUnicodeKind::Named(s) = &c.kind {
2034                        match s.as_str() {
2035                            // \p{ascii} for ascii, \p{ascii}&\p{Letter} => [A-Za-z]
2036                            "ascii" => return Ok(tb.mk_range_u8(0, 127)),
2037                            // a single valid utf8 codepoint; \p{utf8}*&~(a) => non a, but valid utf8
2038                            "utf8" => {
2039                                let ascii = tb.mk_range_u8(0, 127);
2040                                let beta = tb.mk_range_u8(128, 0xBF);
2041                                let c0 = tb.mk_range_u8(0xC0, 0xDF);
2042                                let c0s = tb.mk_concats([c0, beta].into_iter());
2043                                let e0 = tb.mk_range_u8(0xE0, 0xEF);
2044                                let e0s = tb.mk_concats([e0, beta, beta].into_iter());
2045                                let f0 = tb.mk_range_u8(0xF0, 0xF7);
2046                                let f0s = tb.mk_concats([f0, beta, beta, beta].into_iter());
2047                                return Ok(tb.mk_unions([ascii, c0s, e0s, f0s].into_iter()));
2048                            }
2049                            "hex" => {
2050                                let nums = tb.mk_range_u8(b'0', b'9');
2051                                let lets = tb.mk_range_u8(b'a', b'f');
2052                                let lets2 = tb.mk_range_u8(b'A', b'F');
2053                                let merged = tb.mk_unions([nums, lets, lets2].into_iter());
2054                                return Ok(merged);
2055                            }
2056                            _ => {}
2057                        }
2058                    };
2059                }
2060
2061                let orig_ast = regex_syntax::ast::Ast::class_unicode(tmp);
2062                self.translator_to_node_id(&orig_ast, translator, tb)
2063            }
2064            Ast::ClassPerl(c) => self.get_class(c.negated, c.kind.clone(), tb),
2065            Ast::ClassBracketed(c) => match &c.kind {
2066                regex_syntax::ast::ClassSet::Item(item) => {
2067                    if !c.negated && is_universal_perl_pair(item) {
2068                        return Ok(NodeId::TOP);
2069                    }
2070                    if let regex_syntax::ast::ClassSetItem::Perl(p) = item {
2071                        return self.get_class(c.negated ^ p.negated, p.kind.clone(), tb);
2072                    }
2073                    let kind = if self.global_ascii_perl {
2074                        rewrite_ascii_perl_set(&c.kind)
2075                    } else {
2076                        c.kind.clone()
2077                    };
2078                    let tmp = regex_syntax::ast::ClassBracketed {
2079                        span: c.span,
2080                        negated: c.negated,
2081                        kind,
2082                    };
2083                    let orig_ast = regex_syntax::ast::Ast::class_bracketed(tmp);
2084                    self.translator_to_node_id(&orig_ast, translator, tb)
2085                }
2086                regex_syntax::ast::ClassSet::BinaryOp(_) => {
2087                    Err(self.error(c.span, ast::ErrorKind::UnsupportedResharpRegex))
2088                }
2089            },
2090            Ast::Repetition(r) => {
2091                let body = self.ast_to_node_id(&r.ast, translator, tb);
2092                match body {
2093                    Ok(body) => match &r.op.kind {
2094                        ast::RepetitionKind::ZeroOrOne => Ok(tb.mk_opt(body)),
2095                        ast::RepetitionKind::ZeroOrMore => Ok(tb.mk_star(body)),
2096                        ast::RepetitionKind::OneOrMore => Ok(tb.mk_plus(body)),
2097                        ast::RepetitionKind::Range(r) => match r {
2098                            ast::RepetitionRange::Exactly(n) => Ok(tb.mk_repeat(body, *n, *n)),
2099                            ast::RepetitionRange::AtLeast(n) => {
2100                                let rep = tb.mk_repeat(body, *n, *n);
2101                                let st = tb.mk_star(body);
2102                                Ok(tb.mk_concat(rep, st))
2103                            }
2104
2105                            ast::RepetitionRange::Bounded(n, m) => Ok(tb.mk_repeat(body, *n, *m)),
2106                        },
2107                    },
2108                    Err(_) => body,
2109                }
2110            }
2111            Ast::Lookaround(g) => {
2112                let body = self.ast_to_node_id(&g.ast, translator, tb)?;
2113                match g.kind {
2114                    ast::LookaroundKind::PositiveLookahead if body.contains_lookbehind(tb) => {
2115                        let mut prefix = NodeId::EPS;
2116                        let mut rest = body;
2117                        while tb.get_kind(rest) == Kind::Concat
2118                            && tb.get_kind(rest.left(tb)) == Kind::Lookbehind
2119                        {
2120                            prefix = tb.mk_concat(prefix, rest.left(tb));
2121                            rest = rest.right(tb);
2122                        }
2123                        if prefix == NodeId::EPS || rest.contains_lookbehind(tb) {
2124                            return Err(self.error(g.span, ast::ErrorKind::UnsupportedResharpRegex));
2125                        }
2126                        let la = tb.mk_lookahead(rest, NodeId::MISSING, 0);
2127                        Ok(tb.mk_concat(prefix, la))
2128                    }
2129                    ast::LookaroundKind::NegativeLookahead if body.contains_lookbehind(tb) => {
2130                        Err(self.error(g.span, ast::ErrorKind::UnsupportedResharpRegex))
2131                    }
2132                    ast::LookaroundKind::PositiveLookahead => {
2133                        Ok(tb.mk_lookahead(body, NodeId::MISSING, 0))
2134                    }
2135                    ast::LookaroundKind::PositiveLookbehind
2136                    | ast::LookaroundKind::NegativeLookbehind
2137                        if body.contains_lookahead(tb) =>
2138                    {
2139                        Err(self.error(g.span, ast::ErrorKind::UnsupportedResharpRegex))
2140                    }
2141                    ast::LookaroundKind::PositiveLookbehind => {
2142                        Ok(tb.mk_lookbehind(body, NodeId::MISSING))
2143                    }
2144                    ast::LookaroundKind::NegativeLookahead => Ok(tb.mk_neg_lookahead(body, 0)),
2145                    ast::LookaroundKind::NegativeLookbehind => Ok(tb.mk_neg_lookbehind(body)),
2146                }
2147            }
2148            Ast::Group(g) => {
2149                if let ast::GroupKind::NonCapturing(ref flags) = g.kind {
2150                    if !flags.items.is_empty() {
2151                        let mut translator_builder = self.default_translator_builder();
2152                        if let Some(state) = flags.flag_state(ast::Flag::CaseInsensitive) {
2153                            translator_builder.case_insensitive(state);
2154                        }
2155                        if let Some(state) = flags.flag_state(ast::Flag::Unicode) {
2156                            translator_builder.unicode(state);
2157                        }
2158                        let saved_dot_all = self.dot_all.get();
2159                        if let Some(state) = flags.flag_state(ast::Flag::DotMatchesNewLine) {
2160                            self.dot_all.set(state);
2161                        }
2162                        let saved_multiline = self.multiline.get();
2163                        if let Some(state) = flags.flag_state(ast::Flag::MultiLine) {
2164                            self.multiline.set(state);
2165                        }
2166                        let mut scoped = Some(translator_builder.build());
2167                        let result = self.ast_to_node_id(&g.ast, &mut scoped, tb);
2168                        self.dot_all.set(saved_dot_all);
2169                        self.multiline.set(saved_multiline);
2170                        return result;
2171                    }
2172                }
2173                self.ast_to_node_id(&g.ast, translator, tb)
2174            }
2175            Ast::Alternation(a) => {
2176                let mut children = vec![];
2177                for ast in &a.asts {
2178                    match self.ast_to_node_id(ast, translator, tb) {
2179                        Ok(node_id) => children.push(node_id),
2180                        Err(err) => return Err(err),
2181                    }
2182                }
2183                Ok(tb.mk_unions(children.iter().copied()))
2184            }
2185            Ast::Concat(c) => {
2186                let mut concat_translator: Option<Translator> = None;
2187                let mut children = vec![];
2188                let mut prev_boundary_child: Option<usize> = None;
2189                let mut i = 0;
2190                while i < c.asts.len() {
2191                    let ast = &c.asts[i];
2192                    match ast {
2193                        Ast::Flags(f) => {
2194                            if f.flags.flag_state(ast::Flag::SwapGreed).is_some() {
2195                                return Err(
2196                                    self.error(f.span, ast::ErrorKind::UnsupportedResharpRegex)
2197                                );
2198                            }
2199                            let mut translator_builder = self.default_translator_builder();
2200                            if let Some(state) = f.flags.flag_state(ast::Flag::CaseInsensitive) {
2201                                translator_builder.case_insensitive(state);
2202                            }
2203                            if let Some(state) = f.flags.flag_state(ast::Flag::Unicode) {
2204                                translator_builder.unicode(state);
2205                            }
2206                            if let Some(state) = f.flags.flag_state(ast::Flag::DotMatchesNewLine) {
2207                                self.dot_all.set(state);
2208                            }
2209                            if let Some(state) = f.flags.flag_state(ast::Flag::MultiLine) {
2210                                self.multiline.set(state);
2211                            }
2212                            concat_translator = Some(translator_builder.build());
2213                            *translator = concat_translator.clone();
2214                            i += 1;
2215                            continue;
2216                        }
2217                        Ast::Assertion(a)
2218                            if a.kind == ast::AssertionKind::WordBoundary
2219                                || a.kind == ast::AssertionKind::NotWordBoundary =>
2220                        {
2221                            let negated = a.kind == ast::AssertionKind::NotWordBoundary;
2222                            let node = self.rewrite_word_boundary_in_concat(
2223                                &c.asts, i, translator, tb, negated,
2224                            )?;
2225                            match prev_boundary_child {
2226                                Some(idx) => children[idx] = tb.mk_inter(children[idx], node.0),
2227                                None => {
2228                                    children.push(node.0);
2229                                    prev_boundary_child = Some(children.len() - 1);
2230                                }
2231                            }
2232                            i = node.1; // skip consumed lookaheads
2233                            continue;
2234                        }
2235                        _ => {}
2236                    }
2237                    match concat_translator {
2238                        Some(_) => match self.ast_to_node_id(ast, &mut concat_translator, tb) {
2239                            Ok(node_id) => {
2240                                if node_id != resharp_algebra::NodeId::EPS {
2241                                    prev_boundary_child = None;
2242                                    children.push(node_id);
2243                                }
2244                            }
2245                            Err(err) => return Err(err),
2246                        },
2247                        None => match self.ast_to_node_id(ast, translator, tb) {
2248                            Ok(node_id) => {
2249                                if node_id != resharp_algebra::NodeId::EPS {
2250                                    prev_boundary_child = None;
2251                                    children.push(node_id);
2252                                }
2253                            }
2254                            Err(err) => return Err(err),
2255                        },
2256                    }
2257                    i += 1;
2258                }
2259                self.specialize_word_boundaries(&mut children, tb)?;
2260                Ok(tb.mk_concats(children.iter().cloned()))
2261            }
2262            Ast::Intersection(intersection) => {
2263                let mut children = vec![];
2264                for ast in &intersection.asts {
2265                    match self.ast_to_node_id(ast, translator, tb) {
2266                        Ok(node_id) => children.push(node_id),
2267                        Err(err) => return Err(err),
2268                    }
2269                }
2270                Ok(tb.mk_inters(children.into_iter()))
2271            }
2272            Ast::Complement(complement) => {
2273                let body = self.ast_to_node_id(&complement.ast, translator, tb);
2274                body.map(|x| tb.mk_compl(x))
2275            }
2276        }
2277    }
2278
2279    fn parse_inner(&mut self) -> Result<Ast> {
2280        let mut concat = Concat {
2281            span: self.span(),
2282            asts: vec![],
2283        };
2284        loop {
2285            self.bump_space();
2286            if self.is_eof() {
2287                break;
2288            }
2289            match self.char() {
2290                '(' => concat = self.push_group(concat)?,
2291                ')' => concat = self.pop_group(concat)?,
2292                '|' => concat = self.push_alternate(concat)?,
2293                '&' => concat = self.push_intersect(concat)?,
2294                '~' => concat = self.push_compl_group(concat)?,
2295                '[' => {
2296                    let class = self.parse_set_class()?;
2297                    concat.asts.push(Ast::class_bracketed(class));
2298                }
2299                '?' => {
2300                    concat =
2301                        self.parse_uncounted_repetition(concat, ast::RepetitionKind::ZeroOrOne)?;
2302                }
2303                '*' => {
2304                    concat =
2305                        self.parse_uncounted_repetition(concat, ast::RepetitionKind::ZeroOrMore)?;
2306                }
2307                '+' => {
2308                    concat =
2309                        self.parse_uncounted_repetition(concat, ast::RepetitionKind::OneOrMore)?;
2310                }
2311                '{' => {
2312                    concat = self.parse_counted_repetition(concat)?;
2313                }
2314                _ => concat.asts.push(self.parse_primitive()?.into_ast()),
2315            }
2316            if self.stack_group.borrow().len() > self.max_depth {
2317                return Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex));
2318            }
2319        }
2320        let ast = self.pop_group_end(concat)?;
2321        if expanded_ast_size(&ast, self.expanded_ast_limit) >= self.expanded_ast_limit
2322            || max_concat_length(&ast) >= self.max_list_len
2323        {
2324            return Err(self.error(*ast.span(), ast::ErrorKind::UnsupportedResharpRegex));
2325        }
2326        Ok(ast)
2327    }
2328
2329    fn parse(&mut self, tb: &mut TB<'s>) -> Result<NodeId> {
2330        let ast = self.parse_inner()?;
2331        if let Err(span) = ensure_lookbehind_at_start(&ast, true) {
2332            return Err(self.error(span, ast::ErrorKind::UnsupportedResharpRegex));
2333        }
2334        self.ast_to_node_id(&ast, &mut None, tb)
2335    }
2336
2337    #[inline(never)]
2338    fn parse_uncounted_repetition(
2339        &self,
2340        mut concat: ast::Concat,
2341        kind: ast::RepetitionKind,
2342    ) -> Result<ast::Concat> {
2343        // assert!(self.char() == '?' || self.char() == '*' || self.char() == '+');
2344        let op_start = self.pos();
2345        let ast = match concat.asts.pop() {
2346            Some(ast) => ast,
2347            None => return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing)),
2348        };
2349        match ast {
2350            Ast::Empty(_) | Ast::Flags(_) => {
2351                return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing))
2352            }
2353            _ => {}
2354        }
2355        if self.bump() && self.char() == '?' {
2356            return Err(self.error(
2357                Span::new(op_start, self.pos()),
2358                ast::ErrorKind::UnsupportedLazyQuantifier,
2359            ));
2360        }
2361        concat.asts.push(Ast::repetition(ast::Repetition {
2362            span: ast.span().with_end(self.pos()),
2363            op: ast::RepetitionOp {
2364                span: Span::new(op_start, self.pos()),
2365                kind,
2366            },
2367            greedy: true,
2368            ast: Box::new(ast),
2369        }));
2370        Ok(concat)
2371    }
2372
2373    #[inline(never)]
2374    fn parse_counted_repetition(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
2375        assert!(self.char() == '{');
2376        let start = self.pos();
2377        let ast = match concat.asts.pop() {
2378            Some(ast) => ast,
2379            None => return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing)),
2380        };
2381        match ast {
2382            Ast::Empty(_) | Ast::Flags(_) => {
2383                return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing))
2384            }
2385            _ => {}
2386        }
2387        if !self.bump_and_bump_space() {
2388            return Err(self.error(
2389                Span::new(start, self.pos()),
2390                ast::ErrorKind::RepetitionCountUnclosed,
2391            ));
2392        }
2393        let count_start = specialize_err(
2394            self.parse_decimal(),
2395            ast::ErrorKind::DecimalEmpty,
2396            ast::ErrorKind::RepetitionCountDecimalEmpty,
2397        );
2398        if self.is_eof() {
2399            return Err(self.error(
2400                Span::new(start, self.pos()),
2401                ast::ErrorKind::RepetitionCountUnclosed,
2402            ));
2403        }
2404        let range = if self.char() == ',' {
2405            if !self.bump_and_bump_space() {
2406                return Err(self.error(
2407                    Span::new(start, self.pos()),
2408                    ast::ErrorKind::RepetitionCountUnclosed,
2409                ));
2410            }
2411            if self.char() != '}' {
2412                let count_start = match count_start {
2413                    Ok(c) => c,
2414                    Err(err) if err.kind == ast::ErrorKind::RepetitionCountDecimalEmpty => {
2415                        if self.parser().empty_min_range {
2416                            0
2417                        } else {
2418                            return Err(err);
2419                        }
2420                    }
2421                    err => err?,
2422                };
2423                let count_end = specialize_err(
2424                    self.parse_decimal(),
2425                    ast::ErrorKind::DecimalEmpty,
2426                    ast::ErrorKind::RepetitionCountDecimalEmpty,
2427                )?;
2428                ast::RepetitionRange::Bounded(count_start, count_end)
2429            } else {
2430                ast::RepetitionRange::AtLeast(count_start?)
2431            }
2432        } else {
2433            ast::RepetitionRange::Exactly(count_start?)
2434        };
2435
2436        if self.is_eof() || self.char() != '}' {
2437            return Err(self.error(
2438                Span::new(start, self.pos()),
2439                ast::ErrorKind::RepetitionCountUnclosed,
2440            ));
2441        }
2442
2443        if self.bump_and_bump_space() && self.char() == '?' {
2444            return Err(self.error(
2445                Span::new(start, self.pos()),
2446                ast::ErrorKind::UnsupportedLazyQuantifier,
2447            ));
2448        }
2449
2450        let op_span = Span::new(start, self.pos());
2451        if !range.is_valid() {
2452            return Err(self.error(op_span, ast::ErrorKind::RepetitionCountInvalid));
2453        }
2454
2455        let over_limit = match &range {
2456            ast::RepetitionRange::Exactly(n) => *n > self.max_repeat,
2457            ast::RepetitionRange::AtLeast(n) => *n > self.max_repeat,
2458            ast::RepetitionRange::Bounded(n, m) => *n > self.max_repeat || *m > self.max_repeat,
2459        };
2460        if over_limit {
2461            return Err(self.error(op_span, ast::ErrorKind::UnsupportedResharpRegex));
2462        }
2463        concat.asts.push(Ast::repetition(ast::Repetition {
2464            span: ast.span().with_end(self.pos()),
2465            op: ast::RepetitionOp {
2466                span: op_span,
2467                kind: ast::RepetitionKind::Range(range),
2468            },
2469            greedy: true,
2470            ast: Box::new(ast),
2471        }));
2472        Ok(concat)
2473    }
2474
2475    #[inline(never)]
2476    fn parse_group(&self) -> Result<Either<ast::SetFlags, ast::Group>> {
2477        assert_eq!(self.char(), '(');
2478        let open_span = self.span_char();
2479        self.bump();
2480        self.bump_space();
2481        if let Some((ahead, pos)) = self.is_lookaround_prefix() {
2482            let kind = match (pos, ahead) {
2483                (true, true) => LookaroundKind::PositiveLookahead,
2484                (true, false) => LookaroundKind::PositiveLookbehind,
2485                (false, true) => LookaroundKind::NegativeLookahead,
2486                (false, false) => LookaroundKind::NegativeLookbehind,
2487            };
2488            return Ok(Either::Right(ast::Group {
2489                span: open_span,
2490                kind: ast::GroupKind::Lookaround(kind),
2491                ast: Box::new(Ast::empty(self.span())),
2492            }));
2493        }
2494        let inner_span = self.span();
2495        let mut starts_with_p = true;
2496        if self.bump_if("?P<") || {
2497            starts_with_p = false;
2498            self.bump_if("?<")
2499        } {
2500            let capture_index = self.next_capture_index(open_span)?;
2501            let name = self.parse_capture_name(capture_index)?;
2502            Ok(Either::Right(ast::Group {
2503                span: open_span,
2504                kind: ast::GroupKind::CaptureName {
2505                    starts_with_p,
2506                    name,
2507                },
2508                ast: Box::new(Ast::empty(self.span())),
2509            }))
2510        } else if self.bump_if("?") {
2511            if self.is_eof() {
2512                return Err(self.error(open_span, ast::ErrorKind::GroupUnclosed));
2513            }
2514            let flags = self.parse_flags()?;
2515            let char_end = self.char();
2516            self.bump();
2517            if char_end == ')' {
2518                // We don't allow empty flags, e.g., `(?)`. We instead
2519                // interpret it as a repetition operator missing its argument.
2520                if flags.items.is_empty() {
2521                    return Err(self.error(inner_span, ast::ErrorKind::RepetitionMissing));
2522                }
2523                Ok(Either::Left(ast::SetFlags {
2524                    span: Span {
2525                        end: self.pos(),
2526                        ..open_span
2527                    },
2528                    flags,
2529                }))
2530            } else {
2531                assert_eq!(char_end, ':');
2532                Ok(Either::Right(ast::Group {
2533                    span: open_span,
2534                    kind: ast::GroupKind::NonCapturing(flags),
2535                    ast: Box::new(Ast::empty(self.span())),
2536                }))
2537            }
2538        } else {
2539            let capture_index = self.next_capture_index(open_span)?;
2540            Ok(Either::Right(ast::Group {
2541                span: open_span,
2542                kind: ast::GroupKind::CaptureIndex(capture_index),
2543                ast: Box::new(Ast::empty(self.span())),
2544            }))
2545        }
2546    }
2547
2548    #[inline(never)]
2549    fn parse_capture_name(&self, capture_index: u32) -> Result<ast::CaptureName> {
2550        if self.is_eof() {
2551            return Err(self.error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
2552        }
2553        let start = self.pos();
2554        loop {
2555            if self.char() == '>' {
2556                break;
2557            }
2558            if !is_capture_char(self.char(), self.pos() == start) {
2559                return Err(self.error(self.span_char(), ast::ErrorKind::GroupNameInvalid));
2560            }
2561            if !self.bump() {
2562                break;
2563            }
2564        }
2565        let end = self.pos();
2566        if self.is_eof() {
2567            return Err(self.error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
2568        }
2569        assert_eq!(self.char(), '>');
2570        self.bump();
2571        let name = &self.pattern()[start.offset..end.offset];
2572        if name.is_empty() {
2573            return Err(self.error(Span::new(start, start), ast::ErrorKind::GroupNameEmpty));
2574        }
2575        let capname = ast::CaptureName {
2576            span: Span::new(start, end),
2577            name: name.to_string(),
2578            index: capture_index,
2579        };
2580        self.add_capture_name(&capname)?;
2581        Ok(capname)
2582    }
2583
2584    #[inline(never)]
2585    fn parse_flags(&self) -> Result<ast::Flags> {
2586        let mut flags = ast::Flags {
2587            span: self.span(),
2588            items: vec![],
2589        };
2590        let mut last_was_negation = None;
2591        while self.char() != ':' && self.char() != ')' {
2592            if self.char() == '-' {
2593                last_was_negation = Some(self.span_char());
2594                let item = ast::FlagsItem {
2595                    span: self.span_char(),
2596                    kind: ast::FlagsItemKind::Negation,
2597                };
2598                if let Some(i) = flags.add_item(item) {
2599                    return Err(self.error(
2600                        self.span_char(),
2601                        ast::ErrorKind::FlagRepeatedNegation {
2602                            original: flags.items[i].span,
2603                        },
2604                    ));
2605                }
2606            } else {
2607                last_was_negation = None;
2608                let item = ast::FlagsItem {
2609                    span: self.span_char(),
2610                    kind: ast::FlagsItemKind::Flag(self.parse_flag()?),
2611                };
2612                if let Some(i) = flags.add_item(item) {
2613                    return Err(self.error(
2614                        self.span_char(),
2615                        ast::ErrorKind::FlagDuplicate {
2616                            original: flags.items[i].span,
2617                        },
2618                    ));
2619                }
2620            }
2621            if !self.bump() {
2622                return Err(self.error(self.span(), ast::ErrorKind::FlagUnexpectedEof));
2623            }
2624        }
2625        if let Some(span) = last_was_negation {
2626            return Err(self.error(span, ast::ErrorKind::FlagDanglingNegation));
2627        }
2628        flags.span.end = self.pos();
2629        Ok(flags)
2630    }
2631
2632    #[inline(never)]
2633    fn parse_flag(&self) -> Result<ast::Flag> {
2634        match self.char() {
2635            'i' => Ok(ast::Flag::CaseInsensitive),
2636            'm' => Ok(ast::Flag::MultiLine),
2637            's' => Ok(ast::Flag::DotMatchesNewLine),
2638            'U' => Ok(ast::Flag::SwapGreed),
2639            'u' => Ok(ast::Flag::Unicode),
2640            'R' => Ok(ast::Flag::CRLF),
2641            'x' => Ok(ast::Flag::IgnoreWhitespace),
2642            _ => Err(self.error(self.span_char(), ast::ErrorKind::FlagUnrecognized)),
2643        }
2644    }
2645
2646    fn parse_primitive(&self) -> Result<Primitive> {
2647        match self.char() {
2648            '\\' => self.parse_escape(),
2649            '_' => {
2650                let ast = Primitive::Top(self.span_char());
2651                self.bump();
2652                Ok(ast)
2653            }
2654            '.' => {
2655                let ast = Primitive::Dot(self.span_char());
2656                self.bump();
2657                Ok(ast)
2658            }
2659            '^' => {
2660                let ast = Primitive::Assertion(ast::Assertion {
2661                    span: self.span_char(),
2662                    kind: ast::AssertionKind::StartLine,
2663                });
2664                self.bump();
2665                Ok(ast)
2666            }
2667            '$' => {
2668                let ast = Primitive::Assertion(ast::Assertion {
2669                    span: self.span_char(),
2670                    kind: ast::AssertionKind::EndLine,
2671                });
2672                self.bump();
2673                Ok(ast)
2674            }
2675            c => {
2676                let ast = Primitive::Literal(Literal {
2677                    span: self.span_char(),
2678                    kind: LiteralKind::Verbatim,
2679                    c,
2680                });
2681                self.bump();
2682                Ok(ast)
2683            }
2684        }
2685    }
2686
2687    #[inline(never)]
2688    fn parse_escape(&self) -> Result<Primitive> {
2689        assert_eq!(self.char(), '\\');
2690        let start = self.pos();
2691        if !self.bump() {
2692            return Err(self.error(
2693                Span::new(start, self.pos()),
2694                ast::ErrorKind::EscapeUnexpectedEof,
2695            ));
2696        }
2697        let c = self.char();
2698        // Put some of the more complicated routines into helpers.
2699        match c {
2700            '0'..='9' => {
2701                if !self.parser().octal {
2702                    return Err(self.error(
2703                        Span::new(start, self.span_char().end),
2704                        ast::ErrorKind::UnsupportedBackreference,
2705                    ));
2706                }
2707                let mut lit = self.parse_octal();
2708                lit.span.start = start;
2709                return Ok(Primitive::Literal(lit));
2710            }
2711            'x' | 'u' | 'U' => {
2712                let mut lit = self.parse_hex()?;
2713                lit.span.start = start;
2714                return Ok(Primitive::Literal(lit));
2715            }
2716            'p' | 'P' => {
2717                let mut cls = self.parse_unicode_class()?;
2718                cls.span.start = start;
2719                return Ok(Primitive::Unicode(cls));
2720            }
2721            'd' | 's' | 'w' | 'D' | 'S' | 'W' => {
2722                let mut cls = self.parse_perl_class();
2723                cls.span.start = start;
2724                return Ok(Primitive::Perl(cls));
2725            }
2726            _ => {}
2727        }
2728
2729        // Handle all of the one letter sequences inline.
2730        self.bump();
2731        let span = Span::new(start, self.pos());
2732        if is_meta_character(c) {
2733            return Ok(Primitive::Literal(Literal {
2734                span,
2735                kind: LiteralKind::Meta,
2736                c,
2737            }));
2738        }
2739        if is_escapeable_character(c) {
2740            return Ok(Primitive::Literal(Literal {
2741                span,
2742                kind: LiteralKind::Superfluous,
2743                c,
2744            }));
2745        }
2746        let special = |kind, c| {
2747            Ok(Primitive::Literal(Literal {
2748                span,
2749                kind: LiteralKind::Special(kind),
2750                c,
2751            }))
2752        };
2753        match c {
2754            'a' => special(SpecialLiteralKind::Bell, '\x07'),
2755            'f' => special(SpecialLiteralKind::FormFeed, '\x0C'),
2756            't' => special(SpecialLiteralKind::Tab, '\t'),
2757            'n' => special(SpecialLiteralKind::LineFeed, '\n'),
2758            'r' => special(SpecialLiteralKind::CarriageReturn, '\r'),
2759            'v' => special(SpecialLiteralKind::VerticalTab, '\x0B'),
2760            'A' => Ok(Primitive::Assertion(ast::Assertion {
2761                span,
2762                kind: ast::AssertionKind::StartText,
2763            })),
2764            'z' => Ok(Primitive::Assertion(ast::Assertion {
2765                span,
2766                kind: ast::AssertionKind::EndText,
2767            })),
2768            'b' => {
2769                let mut wb = ast::Assertion {
2770                    span,
2771                    kind: ast::AssertionKind::WordBoundary,
2772                };
2773                // After a \b, we "try" to parse things like \b{start} for
2774                // special word boundary assertions.
2775                if !self.is_eof() && self.char() == '{' {
2776                    if let Some(kind) = self.maybe_parse_special_word_boundary(start)? {
2777                        wb.kind = kind;
2778                        wb.span.end = self.pos();
2779                    }
2780                }
2781                Ok(Primitive::Assertion(wb))
2782            }
2783            'B' => Ok(Primitive::Assertion(ast::Assertion {
2784                span,
2785                kind: ast::AssertionKind::NotWordBoundary,
2786            })),
2787            '<' => Ok(Primitive::Assertion(ast::Assertion {
2788                span,
2789                kind: ast::AssertionKind::WordBoundaryStartAngle,
2790            })),
2791            '>' => Ok(Primitive::Assertion(ast::Assertion {
2792                span,
2793                kind: ast::AssertionKind::WordBoundaryEndAngle,
2794            })),
2795            _ => Err(self.error(span, ast::ErrorKind::EscapeUnrecognized)),
2796        }
2797    }
2798
2799    fn maybe_parse_special_word_boundary(
2800        &self,
2801        wb_start: Position,
2802    ) -> Result<Option<ast::AssertionKind>> {
2803        assert_eq!(self.char(), '{');
2804
2805        let is_valid_char = |c| matches!(c, 'A'..='Z' | 'a'..='z' | '-');
2806        let start = self.pos();
2807        if !self.bump_and_bump_space() {
2808            return Err(self.error(
2809                Span::new(wb_start, self.pos()),
2810                ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof,
2811            ));
2812        }
2813        let start_contents = self.pos();
2814        if !is_valid_char(self.char()) {
2815            self.parser().pos.set(start);
2816            return Ok(None);
2817        }
2818
2819        // Now collect up our chars until we see a '}'.
2820        let mut scratch = self.parser().scratch.borrow_mut();
2821        scratch.clear();
2822        while !self.is_eof() && is_valid_char(self.char()) {
2823            scratch.push(self.char());
2824            self.bump_and_bump_space();
2825        }
2826        if self.is_eof() || self.char() != '}' {
2827            return Err(self.error(
2828                Span::new(start, self.pos()),
2829                ast::ErrorKind::SpecialWordBoundaryUnclosed,
2830            ));
2831        }
2832        let end = self.pos();
2833        self.bump();
2834        let kind = match scratch.as_str() {
2835            "start" => ast::AssertionKind::WordBoundaryStart,
2836            "end" => ast::AssertionKind::WordBoundaryEnd,
2837            "start-half" => ast::AssertionKind::WordBoundaryStartHalf,
2838            "end-half" => ast::AssertionKind::WordBoundaryEndHalf,
2839            _ => {
2840                return Err(self.error(
2841                    Span::new(start_contents, end),
2842                    ast::ErrorKind::SpecialWordBoundaryUnrecognized,
2843                ))
2844            }
2845        };
2846        Ok(Some(kind))
2847    }
2848
2849    #[inline(never)]
2850    fn parse_octal(&self) -> Literal {
2851        assert!(self.parser().octal);
2852        assert!('0' <= self.char() && self.char() <= '7');
2853        let start = self.pos();
2854        // Parse up to two more digits.
2855        while self.bump()
2856            && '0' <= self.char()
2857            && self.char() <= '7'
2858            && self.pos().offset - start.offset <= 2
2859        {}
2860        let end = self.pos();
2861        let octal = &self.pattern()[start.offset..end.offset];
2862        // Parsing the octal should never fail since the above guarantees a
2863        // valid number.
2864        let codepoint = u32::from_str_radix(octal, 8).expect("valid octal number");
2865        // The max value for 3 digit octal is 0777 = 511 and [0, 511] has no
2866        // invalid Unicode scalar values.
2867        let c = char::from_u32(codepoint).expect("Unicode scalar value");
2868        Literal {
2869            span: Span::new(start, end),
2870            kind: LiteralKind::Octal,
2871            c,
2872        }
2873    }
2874
2875    #[inline(never)]
2876    fn parse_hex(&self) -> Result<Literal> {
2877        assert!(self.char() == 'x' || self.char() == 'u' || self.char() == 'U');
2878
2879        let hex_kind = match self.char() {
2880            'x' => HexLiteralKind::X,
2881            'u' => HexLiteralKind::UnicodeShort,
2882            _ => HexLiteralKind::UnicodeLong,
2883        };
2884        if !self.bump_and_bump_space() {
2885            return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2886        }
2887        if self.char() == '{' {
2888            self.parse_hex_brace(hex_kind)
2889        } else {
2890            self.parse_hex_digits(hex_kind)
2891        }
2892    }
2893
2894    #[inline(never)]
2895    fn parse_hex_digits(&self, kind: HexLiteralKind) -> Result<Literal> {
2896        let mut scratch = self.parser().scratch.borrow_mut();
2897        scratch.clear();
2898
2899        let start = self.pos();
2900        for i in 0..kind.digits() {
2901            if i > 0 && !self.bump_and_bump_space() {
2902                return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2903            }
2904            if !is_hex(self.char()) {
2905                return Err(self.error(self.span_char(), ast::ErrorKind::EscapeHexInvalidDigit));
2906            }
2907            scratch.push(self.char());
2908        }
2909        self.bump_and_bump_space();
2910        let end = self.pos();
2911        let hex = scratch.as_str();
2912        match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
2913            None => Err(self.error(Span::new(start, end), ast::ErrorKind::EscapeHexInvalid)),
2914            Some(c) => Ok(Literal {
2915                span: Span::new(start, end),
2916                kind: LiteralKind::HexFixed(kind),
2917                c,
2918            }),
2919        }
2920    }
2921
2922    #[inline(never)]
2923    fn parse_hex_brace(&self, kind: HexLiteralKind) -> Result<Literal> {
2924        let mut scratch = self.parser().scratch.borrow_mut();
2925        scratch.clear();
2926
2927        let brace_pos = self.pos();
2928        let start = self.span_char().end;
2929        while self.bump_and_bump_space() && self.char() != '}' {
2930            if !is_hex(self.char()) {
2931                return Err(self.error(self.span_char(), ast::ErrorKind::EscapeHexInvalidDigit));
2932            }
2933            scratch.push(self.char());
2934        }
2935        if self.is_eof() {
2936            return Err(self.error(
2937                Span::new(brace_pos, self.pos()),
2938                ast::ErrorKind::EscapeUnexpectedEof,
2939            ));
2940        }
2941        let end = self.pos();
2942        let hex = scratch.as_str();
2943        assert_eq!(self.char(), '}');
2944        self.bump_and_bump_space();
2945
2946        if hex.is_empty() {
2947            return Err(self.error(
2948                Span::new(brace_pos, self.pos()),
2949                ast::ErrorKind::EscapeHexEmpty,
2950            ));
2951        }
2952        match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
2953            None => Err(self.error(Span::new(start, end), ast::ErrorKind::EscapeHexInvalid)),
2954            Some(c) => Ok(Literal {
2955                span: Span::new(start, self.pos()),
2956                kind: LiteralKind::HexBrace(kind),
2957                c,
2958            }),
2959        }
2960    }
2961
2962    fn parse_decimal(&self) -> Result<u32> {
2963        let mut scratch = self.parser().scratch.borrow_mut();
2964        scratch.clear();
2965
2966        while !self.is_eof() && self.char().is_whitespace() {
2967            self.bump();
2968        }
2969        let start = self.pos();
2970        while !self.is_eof() && '0' <= self.char() && self.char() <= '9' {
2971            scratch.push(self.char());
2972            self.bump_and_bump_space();
2973        }
2974        let span = Span::new(start, self.pos());
2975        while !self.is_eof() && self.char().is_whitespace() {
2976            self.bump_and_bump_space();
2977        }
2978        let digits = scratch.as_str();
2979        if digits.is_empty() {
2980            return Err(self.error(span, ast::ErrorKind::DecimalEmpty));
2981        }
2982        match digits.parse::<u32>().ok() {
2983            Some(n) => Ok(n),
2984            None => Err(self.error(span, ast::ErrorKind::DecimalInvalid)),
2985        }
2986    }
2987
2988    #[inline(never)]
2989    fn parse_set_class(&self) -> Result<ClassBracketed> {
2990        assert_eq!(self.char(), '[');
2991
2992        let mut union = ClassSetUnion {
2993            span: self.span(),
2994            items: vec![],
2995        };
2996        loop {
2997            self.bump_space();
2998            if self.is_eof() {
2999                return Err(self.unclosed_class_error());
3000            }
3001            match self.char() {
3002                '[' => {
3003                    if !self.parser().stack_class.borrow().is_empty() {
3004                        if let Some(cls) = self.maybe_parse_ascii_class() {
3005                            union.push(ClassSetItem::Ascii(cls));
3006                            continue;
3007                        }
3008                    }
3009                    union = self.push_class_open(union)?;
3010                }
3011                ']' => match self.pop_class(union)? {
3012                    Either::Left(nested_union) => {
3013                        union = nested_union;
3014                    }
3015                    Either::Right(class) => return Ok(class),
3016                },
3017                '&' if self.peek() == Some('&') => {
3018                    assert!(self.bump_if("&&"));
3019                    union = self.push_class_op(ClassSetBinaryOpKind::Intersection, union);
3020                }
3021                '-' if self.peek() == Some('-') => {
3022                    assert!(self.bump_if("--"));
3023                    union = self.push_class_op(ClassSetBinaryOpKind::Difference, union);
3024                }
3025                '~' if self.peek() == Some('~') => {
3026                    assert!(self.bump_if("~~"));
3027                    union = self.push_class_op(ClassSetBinaryOpKind::SymmetricDifference, union);
3028                }
3029                _ => {
3030                    union.push(self.parse_set_class_range()?);
3031                }
3032            }
3033        }
3034    }
3035
3036    #[inline(never)]
3037    fn parse_set_class_range(&self) -> Result<ClassSetItem> {
3038        let prim1 = self.parse_set_class_item()?;
3039        self.bump_space();
3040        if self.is_eof() {
3041            return Err(self.unclosed_class_error());
3042        }
3043        if self.char() != '-' || self.peek_space() == Some(']') || self.peek_space() == Some('-') {
3044            return prim1.into_class_set_item(self);
3045        }
3046        if !self.bump_and_bump_space() {
3047            return Err(self.unclosed_class_error());
3048        }
3049        let prim2 = self.parse_set_class_item()?;
3050        let range = ClassSetRange {
3051            span: Span::new(prim1.span().start, prim2.span().end),
3052            start: prim1.into_class_literal(self)?,
3053            end: prim2.into_class_literal(self)?,
3054        };
3055        if !range.is_valid() {
3056            return Err(self.error(range.span, ast::ErrorKind::ClassRangeInvalid));
3057        }
3058        Ok(ClassSetItem::Range(range))
3059    }
3060
3061    #[inline(never)]
3062    fn parse_set_class_item(&self) -> Result<Primitive> {
3063        if self.char() == '\\' {
3064            self.parse_escape()
3065        } else {
3066            let x = Primitive::Literal(Literal {
3067                span: self.span_char(),
3068                kind: LiteralKind::Verbatim,
3069                c: self.char(),
3070            });
3071            self.bump();
3072            Ok(x)
3073        }
3074    }
3075
3076    #[inline(never)]
3077    fn parse_set_class_open(&self) -> Result<(ClassBracketed, ClassSetUnion)> {
3078        assert_eq!(self.char(), '[');
3079        let start = self.pos();
3080        if !self.bump_and_bump_space() {
3081            return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
3082        }
3083
3084        let negated = if self.char() != '^' {
3085            false
3086        } else {
3087            if !self.bump_and_bump_space() {
3088                return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
3089            }
3090            true
3091        };
3092        // Accept any number of `-` as literal `-`.
3093        let mut union = ClassSetUnion {
3094            span: self.span(),
3095            items: vec![],
3096        };
3097        while self.char() == '-' {
3098            union.push(ClassSetItem::Literal(Literal {
3099                span: self.span_char(),
3100                kind: LiteralKind::Verbatim,
3101                c: '-',
3102            }));
3103            if !self.bump_and_bump_space() {
3104                return Err(self.error(Span::new(start, start), ast::ErrorKind::ClassUnclosed));
3105            }
3106        }
3107        // If `]` is the *first* char in a set, then interpret it as a literal
3108        // `]`. That is, an empty class is impossible to write.
3109        if union.items.is_empty() && self.char() == ']' {
3110            union.push(ClassSetItem::Literal(Literal {
3111                span: self.span_char(),
3112                kind: LiteralKind::Verbatim,
3113                c: ']',
3114            }));
3115            if !self.bump_and_bump_space() {
3116                return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
3117            }
3118        }
3119        let set = ClassBracketed {
3120            span: Span::new(start, self.pos()),
3121            negated,
3122            kind: ClassSet::union(ClassSetUnion {
3123                span: Span::new(union.span.start, union.span.start),
3124                items: vec![],
3125            }),
3126        };
3127        Ok((set, union))
3128    }
3129
3130    #[inline(never)]
3131    fn maybe_parse_ascii_class(&self) -> Option<ClassAscii> {
3132        assert_eq!(self.char(), '[');
3133        // If parsing fails, then we back up the parser to this starting point.
3134        let start = self.pos();
3135        let mut negated = false;
3136        if !self.bump() || self.char() != ':' {
3137            self.parser().pos.set(start);
3138            return None;
3139        }
3140        if !self.bump() {
3141            self.parser().pos.set(start);
3142            return None;
3143        }
3144        if self.char() == '^' {
3145            negated = true;
3146            if !self.bump() {
3147                self.parser().pos.set(start);
3148                return None;
3149            }
3150        }
3151        let name_start = self.offset();
3152        while self.char() != ':' && self.bump() {}
3153        if self.is_eof() {
3154            self.parser().pos.set(start);
3155            return None;
3156        }
3157        let name = &self.pattern()[name_start..self.offset()];
3158        if !self.bump_if(":]") {
3159            self.parser().pos.set(start);
3160            return None;
3161        }
3162        let kind = match regex_syntax::ast::ClassAsciiKind::from_name(name) {
3163            Some(kind) => kind,
3164            None => {
3165                self.parser().pos.set(start);
3166                return None;
3167            }
3168        };
3169        Some(ClassAscii {
3170            span: Span::new(start, self.pos()),
3171            kind,
3172            negated,
3173        })
3174    }
3175
3176    #[inline(never)]
3177    fn parse_unicode_class(&self) -> Result<ClassUnicode> {
3178        assert!(self.char() == 'p' || self.char() == 'P');
3179
3180        let mut scratch = self.parser().scratch.borrow_mut();
3181        scratch.clear();
3182
3183        let negated = self.char() == 'P';
3184        if !self.bump_and_bump_space() {
3185            return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
3186        }
3187        let (start, kind) = if self.char() == '{' {
3188            let start = self.span_char().end;
3189            while self.bump_and_bump_space() && self.char() != '}' {
3190                scratch.push(self.char());
3191            }
3192            if self.is_eof() {
3193                return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
3194            }
3195            assert_eq!(self.char(), '}');
3196            self.bump();
3197
3198            let name = scratch.as_str();
3199            if let Some(i) = name.find("!=") {
3200                (
3201                    start,
3202                    ClassUnicodeKind::NamedValue {
3203                        op: ClassUnicodeOpKind::NotEqual,
3204                        name: name[..i].to_string(),
3205                        value: name[i + 2..].to_string(),
3206                    },
3207                )
3208            } else if let Some(i) = name.find(':') {
3209                (
3210                    start,
3211                    ClassUnicodeKind::NamedValue {
3212                        op: ClassUnicodeOpKind::Colon,
3213                        name: name[..i].to_string(),
3214                        value: name[i + 1..].to_string(),
3215                    },
3216                )
3217            } else if let Some(i) = name.find('=') {
3218                (
3219                    start,
3220                    ClassUnicodeKind::NamedValue {
3221                        op: ClassUnicodeOpKind::Equal,
3222                        name: name[..i].to_string(),
3223                        value: name[i + 1..].to_string(),
3224                    },
3225                )
3226            } else {
3227                (start, ClassUnicodeKind::Named(name.to_string()))
3228            }
3229        } else {
3230            let start = self.pos();
3231            let c = self.char();
3232            if c == '\\' {
3233                return Err(self.error(self.span_char(), ast::ErrorKind::UnicodeClassInvalid));
3234            }
3235            self.bump_and_bump_space();
3236            let kind = ClassUnicodeKind::OneLetter(c);
3237            (start, kind)
3238        };
3239        Ok(ClassUnicode {
3240            span: Span::new(start, self.pos()),
3241            negated,
3242            kind,
3243        })
3244    }
3245
3246    #[inline(never)]
3247    fn parse_perl_class(&self) -> ClassPerl {
3248        let c = self.char();
3249        let span = self.span_char();
3250        self.bump();
3251        let (negated, kind) = match c {
3252            'd' => (false, regex_syntax::ast::ClassPerlKind::Digit),
3253            'D' => (true, regex_syntax::ast::ClassPerlKind::Digit),
3254            's' => (false, regex_syntax::ast::ClassPerlKind::Space),
3255            'S' => (true, regex_syntax::ast::ClassPerlKind::Space),
3256            'w' => (false, regex_syntax::ast::ClassPerlKind::Word),
3257            'W' => (true, regex_syntax::ast::ClassPerlKind::Word),
3258            c => panic!("expected valid Perl class but got '{}'", c),
3259        };
3260        ClassPerl {
3261            span,
3262            kind,
3263            negated,
3264        }
3265    }
3266}
3267
3268/// `[\s\S]`, `[\w\W]`, `[\d\D]` etc into `_` canonicalization.
3269/// should really be done as a u32 BDD but good enough for now
3270fn is_universal_perl_pair(item: &regex_syntax::ast::ClassSetItem) -> bool {
3271    use regex_syntax::ast::ClassSetItem;
3272    let items = match item {
3273        ClassSetItem::Union(u) => &u.items,
3274        _ => return false,
3275    };
3276    if items.len() != 2 {
3277        return false;
3278    }
3279    match (&items[0], &items[1]) {
3280        (ClassSetItem::Perl(a), ClassSetItem::Perl(b)) => {
3281            let is_all = a.kind == b.kind && a.negated != b.negated;
3282            is_all
3283        }
3284        _ => false,
3285    }
3286}
3287
3288pub fn max_concat_length(ast: &ast::Ast) -> usize {
3289    match ast {
3290        ast::Ast::Empty(_)
3291        | ast::Ast::Flags(_)
3292        | ast::Ast::Literal(_)
3293        | ast::Ast::Dot(_)
3294        | ast::Ast::Top(_)
3295        | ast::Ast::Assertion(_)
3296        | ast::Ast::ClassUnicode(_)
3297        | ast::Ast::ClassPerl(_)
3298        | ast::Ast::ClassBracketed(_) => 0,
3299        ast::Ast::Group(g) => max_concat_length(&g.ast),
3300        ast::Ast::Complement(c) => max_concat_length(&c.ast),
3301        ast::Ast::Lookaround(l) => max_concat_length(&l.ast),
3302        ast::Ast::Repetition(r) => max_concat_length(&r.ast),
3303        ast::Ast::Concat(c) => c
3304            .asts
3305            .len()
3306            .max(c.asts.iter().map(max_concat_length).max().unwrap_or(0)),
3307        ast::Ast::Alternation(a) => a.asts.iter().map(max_concat_length).max().unwrap_or(0),
3308        ast::Ast::Intersection(i) => i.asts.iter().map(max_concat_length).max().unwrap_or(0),
3309    }
3310}
3311
3312pub fn expanded_ast_size(ast: &ast::Ast, limit: u64) -> u64 {
3313    fn go(ast: &ast::Ast, limit: u64) -> u64 {
3314        match ast {
3315            ast::Ast::Empty(_) | ast::Ast::Flags(_) => 1,
3316            ast::Ast::Literal(_) | ast::Ast::Dot(_) | ast::Ast::Top(_) => 1,
3317            ast::Ast::Assertion(_) => 1,
3318            ast::Ast::ClassUnicode(_) | ast::Ast::ClassPerl(_) | ast::Ast::ClassBracketed(_) => 1,
3319            ast::Ast::Group(g) => go(&g.ast, limit).saturating_add(1).min(limit),
3320            ast::Ast::Complement(c) => go(&c.ast, limit).saturating_add(1).min(limit),
3321            ast::Ast::Lookaround(l) => go(&l.ast, limit).saturating_add(1).min(limit),
3322            ast::Ast::Concat(c) => sum_children(&c.asts, limit),
3323            ast::Ast::Alternation(a) => sum_children(&a.asts, limit),
3324            ast::Ast::Intersection(i) => sum_children(&i.asts, limit),
3325            ast::Ast::Repetition(r) => {
3326                let body = go(&r.ast, limit);
3327                let factor: u64 = match &r.op.kind {
3328                    ast::RepetitionKind::ZeroOrOne => 2,
3329                    ast::RepetitionKind::ZeroOrMore | ast::RepetitionKind::OneOrMore => 2,
3330                    ast::RepetitionKind::Range(ast::RepetitionRange::Exactly(n)) => {
3331                        (*n as u64).max(1)
3332                    }
3333                    ast::RepetitionKind::Range(ast::RepetitionRange::AtLeast(n)) => {
3334                        (*n as u64).max(1).saturating_add(1)
3335                    }
3336                    ast::RepetitionKind::Range(ast::RepetitionRange::Bounded(_, m)) => {
3337                        (*m as u64).max(1)
3338                    }
3339                };
3340                body.saturating_mul(factor).min(limit)
3341            }
3342        }
3343    }
3344    fn sum_children(children: &[ast::Ast], limit: u64) -> u64 {
3345        let mut total: u64 = 0;
3346        for c in children {
3347            total = total.saturating_add(go(c, limit));
3348            if total >= limit {
3349                return limit;
3350            }
3351        }
3352        total
3353    }
3354    go(ast, limit)
3355}
3356
3357pub fn parse_ast<'s>(tb: &mut TB<'s>, pattern: &'s str) -> std::result::Result<NodeId, ParseError> {
3358    let mut p: ResharpParser<'s> = ResharpParser::new(pattern);
3359    p.parse(tb)
3360}
3361
3362pub fn parse_ast_with<'s>(
3363    tb: &mut TB<'s>,
3364    pattern: &'s str,
3365    flags: &PatternFlags,
3366) -> std::result::Result<NodeId, ParseError> {
3367    let mut p: ResharpParser<'s> = ResharpParser::with_flags(pattern, flags);
3368    p.parse(tb)
3369}
3370
3371/// Parse a pattern into the raw AST without converting to algebra nodes.
3372pub fn parse_to_ast(pattern: &str) -> std::result::Result<ast::Ast, ParseError> {
3373    let mut p: ResharpParser = ResharpParser::new(pattern);
3374    p.parse_inner()
3375}