resharp_parser/
lib.rs

1//! Parser for resharp regex patterns.
2//!
3//! Converts regex pattern strings into the node representation used by resharp-algebra.
4
5#![warn(dead_code)]
6pub mod ast;
7use std::cell::{Cell, RefCell};
8
9use ast::{Ast, Concat, ErrorKind, GroupKind, LookaroundKind, RepetitionKind};
10use regex_syntax::{
11    ast::{
12        ClassAscii, ClassBracketed, ClassPerl, ClassSet, ClassSetBinaryOpKind, ClassSetItem,
13        ClassSetRange, ClassSetUnion, ClassUnicode, ClassUnicodeKind, ClassUnicodeOpKind,
14        HexLiteralKind, Literal, LiteralKind, Position, Span, SpecialLiteralKind,
15    },
16    hir::{
17        self,
18        translate::{Translator, TranslatorBuilder},
19    },
20    utf8::Utf8Sequences,
21};
22use resharp_algebra::{Kind, NodeId};
23
24type TB<'s> = resharp_algebra::RegexBuilder;
25
26/// global pattern-level flags, set from `RegexOptions`.
27pub struct PatternFlags {
28    /// `\w`/`\d`/`\s` match full Unicode (true) or ASCII only (false).
29    pub unicode: bool,
30    /// `\w` covers all Unicode word chars including 3- and 4-byte sequences.
31    pub full_unicode: bool,
32    /// global case-insensitive matching.
33    pub case_insensitive: bool,
34    /// `.` matches `\n` (behaves like `_`).
35    pub dot_matches_new_line: bool,
36    /// `^` and `$` match at line boundaries (`\n`) in addition to text boundaries.
37    pub multiline: bool,
38    /// allow whitespace and `#` comments in the pattern.
39    pub ignore_whitespace: bool,
40    /// ASCII `\w`/`\d`/`\s` tables, but
41    /// negated perl classes (`\W`/`\D`/`\S`) and `.` match a full codepoint
42    pub ascii_perl_classes: bool,
43    /// max upper bound on `expanded_ast_size` before parser rejects the
44    /// pattern as too complex. default 50_000.
45    pub expanded_ast_limit: u64,
46    /// max children allowed in any single `Concat`/`Alternation`/
47    /// `Intersection` node. default 4_000.
48    pub max_list_len: usize,
49    /// max upper bound on bounded repetition `{n,m}`. default 500.
50    pub max_repeat: u32,
51    pub max_depth: usize,
52}
53
54// arbitrary safeguards, these will not prevent intentional DoS patterns
55// more to protect you from shooting yourself in the foot
56pub const DEFAULT_MAX_REPEAT: u32 = 500;
57pub const DEFAULT_EXPANDED_AST_LIMIT: u64 = 50_000;
58pub const DEFAULT_MAX_LIST_LEN: usize = 4_000;
59pub const DEFAULT_MAX_DEPTH: usize = 1_000;
60
61impl Default for PatternFlags {
62    fn default() -> Self {
63        Self {
64            unicode: true,
65            full_unicode: false,
66            case_insensitive: false,
67            dot_matches_new_line: false,
68            multiline: true,
69            ignore_whitespace: false,
70            ascii_perl_classes: false,
71            expanded_ast_limit: DEFAULT_EXPANDED_AST_LIMIT,
72            max_list_len: DEFAULT_MAX_LIST_LEN,
73            max_repeat: DEFAULT_MAX_REPEAT,
74            max_depth: DEFAULT_MAX_DEPTH,
75        }
76    }
77}
78
79#[derive(Clone, Copy, PartialEq, Debug)]
80enum WordCharKind {
81    Word,
82    NonWord,
83    MaybeWord,
84    MaybeNonWord,
85    Unknown,
86    Edge,
87}
88
89fn is_word_byte(b: u8) -> bool {
90    b.is_ascii_alphanumeric() || b == b'_'
91}
92
93fn class_set_item_word_kind(item: &regex_syntax::ast::ClassSetItem) -> WordCharKind {
94    use regex_syntax::ast::{ClassPerlKind, ClassSetItem};
95    use WordCharKind::*;
96    match item {
97        ClassSetItem::Empty(_) => Unknown,
98        ClassSetItem::Literal(l) => {
99            if is_word_byte(l.c as u8) {
100                Word
101            } else {
102                NonWord
103            }
104        }
105        ClassSetItem::Range(r) => {
106            let all_word = (r.start.c as u8..=r.end.c as u8).all(is_word_byte);
107            let all_non = (r.start.c as u8..=r.end.c as u8).all(|b| !is_word_byte(b));
108            if all_word {
109                Word
110            } else if all_non {
111                NonWord
112            } else {
113                Unknown
114            }
115        }
116        ClassSetItem::Perl(p) => match (&p.kind, p.negated) {
117            (ClassPerlKind::Word, false) => Word,
118            (ClassPerlKind::Word, true) => NonWord,
119            (ClassPerlKind::Space, false) => NonWord,
120            (ClassPerlKind::Digit, false) => Word,
121            _ => Unknown,
122        },
123        ClassSetItem::Bracketed(b) => class_bracketed_word_kind(b),
124        ClassSetItem::Union(u) => {
125            let mut kind = Unknown;
126            for item in &u.items {
127                let k = class_set_item_word_kind(item);
128                kind = match (kind, k) {
129                    (_, Unknown) => return Unknown,
130                    (Unknown, _) => k,
131                    (Word, Word) => Word,
132                    (NonWord, NonWord) => NonWord,
133                    _ => return Unknown,
134                };
135            }
136            kind
137        }
138        _ => Unknown,
139    }
140}
141
142fn class_bracketed_word_kind(c: &regex_syntax::ast::ClassBracketed) -> WordCharKind {
143    use regex_syntax::ast::{ClassPerlKind, ClassSet, ClassSetItem};
144    use WordCharKind::*;
145    if c.negated {
146        return match &c.kind {
147            ClassSet::Item(ClassSetItem::Perl(p)) if p.kind == ClassPerlKind::Word => {
148                if p.negated {
149                    Word
150                } else {
151                    NonWord
152                }
153            }
154            _ => Unknown,
155        };
156    }
157    match &c.kind {
158        ClassSet::Item(item) => class_set_item_word_kind(item),
159        ClassSet::BinaryOp(_) => Unknown,
160    }
161}
162
163fn ascii_class_lit(span: Span, c: char) -> regex_syntax::ast::Literal {
164    regex_syntax::ast::Literal {
165        span,
166        kind: regex_syntax::ast::LiteralKind::Verbatim,
167        c,
168    }
169}
170
171fn ascii_class_range(span: Span, a: char, b: char) -> regex_syntax::ast::ClassSetItem {
172    regex_syntax::ast::ClassSetItem::Range(regex_syntax::ast::ClassSetRange {
173        span,
174        start: ascii_class_lit(span, a),
175        end: ascii_class_lit(span, b),
176    })
177}
178
179fn ascii_perl_positive(
180    span: Span,
181    kind: &regex_syntax::ast::ClassPerlKind,
182) -> regex_syntax::ast::ClassSetItem {
183    use regex_syntax::ast::{ClassPerlKind, ClassSetItem, ClassSetUnion};
184    match kind {
185        ClassPerlKind::Digit => ascii_class_range(span, '0', '9'),
186        ClassPerlKind::Word => ClassSetItem::Union(ClassSetUnion {
187            span,
188            items: vec![
189                ascii_class_range(span, 'a', 'z'),
190                ascii_class_range(span, 'A', 'Z'),
191                ascii_class_range(span, '0', '9'),
192                ClassSetItem::Literal(ascii_class_lit(span, '_')),
193            ],
194        }),
195        ClassPerlKind::Space => ClassSetItem::Union(ClassSetUnion {
196            span,
197            items: ['\t', '\n', '\x0B', '\x0C', '\r', ' ']
198                .into_iter()
199                .map(|c| ClassSetItem::Literal(ascii_class_lit(span, c)))
200                .collect(),
201        }),
202    }
203}
204
205fn ascii_perl_set_item(
206    span: Span,
207    kind: &regex_syntax::ast::ClassPerlKind,
208    negated: bool,
209) -> regex_syntax::ast::ClassSetItem {
210    use regex_syntax::ast::{ClassBracketed, ClassSet, ClassSetItem};
211    let positive = ascii_perl_positive(span, kind);
212    if negated {
213        ClassSetItem::Bracketed(Box::new(ClassBracketed {
214            span,
215            negated: true,
216            kind: ClassSet::Item(positive),
217        }))
218    } else {
219        positive
220    }
221}
222
223fn rewrite_ascii_perl_set(set: &regex_syntax::ast::ClassSet) -> regex_syntax::ast::ClassSet {
224    use regex_syntax::ast::{ClassSet, ClassSetBinaryOp};
225    match set {
226        ClassSet::Item(item) => ClassSet::Item(rewrite_ascii_perl_item(item)),
227        ClassSet::BinaryOp(op) => ClassSet::BinaryOp(ClassSetBinaryOp {
228            span: op.span,
229            kind: op.kind.clone(),
230            lhs: Box::new(rewrite_ascii_perl_set(&op.lhs)),
231            rhs: Box::new(rewrite_ascii_perl_set(&op.rhs)),
232        }),
233    }
234}
235
236fn rewrite_ascii_perl_item(
237    item: &regex_syntax::ast::ClassSetItem,
238) -> regex_syntax::ast::ClassSetItem {
239    use regex_syntax::ast::{ClassBracketed, ClassSetItem, ClassSetUnion};
240    match item {
241        ClassSetItem::Perl(p) => ascii_perl_set_item(p.span, &p.kind, p.negated),
242        ClassSetItem::Union(u) => ClassSetItem::Union(ClassSetUnion {
243            span: u.span,
244            items: u.items.iter().map(rewrite_ascii_perl_item).collect(),
245        }),
246        ClassSetItem::Bracketed(b) => ClassSetItem::Bracketed(Box::new(ClassBracketed {
247            span: b.span,
248            negated: b.negated,
249            kind: rewrite_ascii_perl_set(&b.kind),
250        })),
251        other => other.clone(),
252    }
253}
254
255#[derive(Clone, Debug, Eq, PartialEq)]
256enum Primitive {
257    Literal(Literal),
258    Assertion(ast::Assertion),
259    Dot(Span),
260    Top(Span),
261    Perl(ClassPerl),
262    Unicode(ClassUnicode),
263}
264
265impl Primitive {
266    fn span(&self) -> &Span {
267        match *self {
268            Primitive::Literal(ref x) => &x.span,
269            Primitive::Assertion(ref x) => &x.span,
270            Primitive::Dot(ref span) => span,
271            Primitive::Top(ref span) => span,
272            Primitive::Perl(ref x) => &x.span,
273            Primitive::Unicode(ref x) => &x.span,
274        }
275    }
276
277    fn into_ast(self) -> Ast {
278        match self {
279            Primitive::Literal(lit) => Ast::literal(lit),
280            Primitive::Assertion(assert) => Ast::assertion(assert),
281            Primitive::Dot(span) => Ast::dot(span),
282            Primitive::Top(span) => Ast::top(span),
283            Primitive::Perl(cls) => Ast::class_perl(cls),
284            Primitive::Unicode(cls) => Ast::class_unicode(cls),
285        }
286    }
287
288    fn into_class_set_item(self, p: &ResharpParser) -> Result<regex_syntax::ast::ClassSetItem> {
289        use self::Primitive::*;
290        use regex_syntax::ast::ClassSetItem;
291
292        match self {
293            Literal(lit) => Ok(ClassSetItem::Literal(lit)),
294            Perl(cls) => Ok(ClassSetItem::Perl(cls)),
295            Unicode(cls) => Ok(ClassSetItem::Unicode(cls)),
296            x => Err(p.error(*x.span(), ast::ErrorKind::ClassEscapeInvalid)),
297        }
298    }
299
300    fn into_class_literal(self, p: &ResharpParser) -> Result<Literal> {
301        use self::Primitive::*;
302
303        match self {
304            Literal(lit) => Ok(lit),
305            x => Err(p.error(*x.span(), ast::ErrorKind::ClassRangeLiteral)),
306        }
307    }
308}
309
310#[derive(Clone, Debug, Eq, PartialEq)]
311pub enum Either<Left, Right> {
312    Left(Left),
313    Right(Right),
314}
315
316#[derive(Clone, Debug, Eq, PartialEq)]
317pub struct ParseError {
318    /// The kind of error.
319    pub kind: ErrorKind,
320    /// The original pattern that the parser generated the error from. Every
321    /// span in an error is a valid range into this string.
322    pattern: String,
323    /// The span of this error.
324    pub span: Span,
325}
326
327impl std::fmt::Display for ParseError {
328    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
329        write!(f, "{:?}: {:?}", self.kind, self.span)
330    }
331}
332impl std::error::Error for ParseError {}
333
334type Result<T> = core::result::Result<T, ParseError>;
335
336#[derive(Clone, Debug)]
337enum GroupState {
338    /// This state is pushed whenever an opening group is found.
339    Group {
340        /// The concatenation immediately preceding the opening group.
341        concat: Concat,
342        /// The group that has been opened. Its sub-AST is always empty.
343        group: ast::Group,
344        /// Whether this group has the `x` flag enabled or not.
345        ignore_whitespace: bool,
346    },
347    Alternation(ast::Alternation),
348    Intersection(ast::Intersection),
349}
350
351#[derive(Clone, Debug)]
352enum ClassState {
353    /// This state is pushed whenever an opening bracket is found.
354    Open {
355        /// The union of class items immediately preceding this class.
356        union: regex_syntax::ast::ClassSetUnion,
357        set: regex_syntax::ast::ClassBracketed,
358    },
359    /// This state is pushed when a operator is seen. When popped, the stored
360    /// set becomes the left hand side of the operator.
361    Op {
362        /// The type of the operation, i.e., &&, -- or ~~.
363        kind: regex_syntax::ast::ClassSetBinaryOpKind,
364        /// The left-hand side of the operator.
365        lhs: regex_syntax::ast::ClassSet,
366    },
367}
368
369/// RE# syntax parser based on the regex-syntax crate.
370pub struct ResharpParser<'s> {
371    perl_classes: Vec<(bool, regex_syntax::ast::ClassPerlKind, NodeId)>,
372    unicode_classes: resharp_algebra::UnicodeClassCache,
373    pub translator: regex_syntax::hir::translate::Translator,
374    pub pattern: &'s str,
375    pos: Cell<Position>,
376    capture_index: Cell<u32>,
377    octal: bool,
378    empty_min_range: bool,
379    ignore_whitespace: Cell<bool>,
380    dot_all: Cell<bool>,
381    multiline: Cell<bool>,
382    global_unicode: bool,
383    global_full_unicode: bool,
384    global_ascii_perl: bool,
385    global_case_insensitive: bool,
386    expanded_ast_limit: u64,
387    max_list_len: usize,
388    max_repeat: u32,
389    max_depth: usize,
390    comments: RefCell<Vec<ast::Comment>>,
391    stack_group: RefCell<Vec<GroupState>>,
392    stack_class: RefCell<Vec<ClassState>>,
393    capture_names: RefCell<Vec<ast::CaptureName>>,
394    scratch: RefCell<String>,
395}
396
397fn specialize_err<T>(result: Result<T>, from: ast::ErrorKind, to: ast::ErrorKind) -> Result<T> {
398    result.map_err(|e| {
399        if e.kind == from {
400            ParseError {
401                kind: to,
402                pattern: e.pattern,
403                span: e.span,
404            }
405        } else {
406            e
407        }
408    })
409}
410
411fn is_capture_char(c: char, first: bool) -> bool {
412    if first {
413        c == '_' || c.is_alphabetic()
414    } else {
415        c == '_' || c == '.' || c == '[' || c == ']' || c.is_alphanumeric()
416    }
417}
418
419pub fn is_meta_character(c: char) -> bool {
420    matches!(
421        c,
422        '\\' | '.'
423            | '+'
424            | '*'
425            | '?'
426            | '('
427            | ')'
428            | '|'
429            | '['
430            | ']'
431            | '{'
432            | '}'
433            | '^'
434            | '$'
435            | '#'
436            | '&'
437            | '-'
438            | '~'
439            | '_'
440    )
441}
442
443/// escapes all resharp meta characters in `text`.
444pub fn escape(text: &str) -> String {
445    let mut buf = String::new();
446    escape_into(text, &mut buf);
447    buf
448}
449
450/// escapes all resharp meta characters in `text` and appends to `buf`.
451pub fn escape_into(text: &str, buf: &mut String) {
452    buf.reserve(text.len());
453    for c in text.chars() {
454        if is_meta_character(c) {
455            buf.push('\\');
456        }
457        buf.push(c);
458    }
459}
460
461pub fn is_escapeable_character(c: char) -> bool {
462    if is_meta_character(c) {
463        return true;
464    }
465    if !c.is_ascii() {
466        return false;
467    }
468    match c {
469        '0'..='9' | 'A'..='Z' | 'a'..='z' => false,
470        '<' | '>' => false,
471        _ => true,
472    }
473}
474
475fn is_hex(c: char) -> bool {
476    c.is_ascii_digit() || ('a'..='f').contains(&c) || ('A'..='F').contains(&c)
477}
478
479fn ensure_lookbehind_at_start(ast: &Ast, at_start: bool) -> core::result::Result<(), Span> {
480    match ast {
481        Ast::Concat(c) => {
482            let mut child_at_start = at_start;
483            for child in &c.asts {
484                ensure_lookbehind_at_start(child, child_at_start)?;
485                if ast_may_consume(child) {
486                    child_at_start = false;
487                }
488            }
489            Ok(())
490        }
491        Ast::Alternation(a) => {
492            for child in &a.asts {
493                ensure_lookbehind_at_start(child, at_start)?;
494            }
495            Ok(())
496        }
497        Ast::Intersection(i) => {
498            for child in &i.asts {
499                ensure_lookbehind_at_start(child, at_start)?;
500            }
501            Ok(())
502        }
503        Ast::Complement(c) => ensure_lookbehind_at_start(&c.ast, at_start),
504        Ast::Group(g) => ensure_lookbehind_at_start(&g.ast, at_start),
505        Ast::Repetition(r) => ensure_lookbehind_at_start(&r.ast, at_start),
506        Ast::Lookaround(g) => {
507            match g.kind {
508                LookaroundKind::PositiveLookbehind | LookaroundKind::NegativeLookbehind => {
509                    if !at_start {
510                        return Err(g.span);
511                    }
512                }
513                LookaroundKind::PositiveLookahead | LookaroundKind::NegativeLookahead => {}
514            }
515            ensure_lookbehind_at_start(&g.ast, true)
516        }
517        Ast::Empty(_)
518        | Ast::Flags(_)
519        | Ast::Literal(_)
520        | Ast::Dot(_)
521        | Ast::Top(_)
522        | Ast::Assertion(_)
523        | Ast::ClassUnicode(_)
524        | Ast::ClassPerl(_)
525        | Ast::ClassBracketed(_) => Ok(()),
526    }
527}
528
529fn ast_may_consume(ast: &Ast) -> bool {
530    match ast {
531        Ast::Empty(_) | Ast::Flags(_) | Ast::Assertion(_) | Ast::Lookaround(_) => false,
532        Ast::Literal(_)
533        | Ast::Dot(_)
534        | Ast::Top(_)
535        | Ast::ClassUnicode(_)
536        | Ast::ClassPerl(_)
537        | Ast::ClassBracketed(_) => true,
538        Ast::Group(g) => ast_may_consume(&g.ast),
539        Ast::Repetition(r) => {
540            if !ast_may_consume(&r.ast) {
541                return false;
542            }
543            match r.op.kind {
544                RepetitionKind::ZeroOrOne
545                | RepetitionKind::ZeroOrMore
546                | RepetitionKind::OneOrMore => true,
547                RepetitionKind::Range(ast::RepetitionRange::Exactly(0)) => false,
548                RepetitionKind::Range(ast::RepetitionRange::Bounded(_, 0)) => false,
549                RepetitionKind::Range(_) => true,
550            }
551        }
552        Ast::Alternation(a) => a.asts.iter().any(ast_may_consume),
553        Ast::Intersection(i) => i.asts.iter().any(ast_may_consume),
554        Ast::Complement(_) => true,
555        Ast::Concat(c) => c.asts.iter().any(ast_may_consume),
556    }
557}
558
559impl<'s> ResharpParser<'s> {
560    fn default_translator_builder(&self) -> TranslatorBuilder {
561        let mut trb = TranslatorBuilder::new();
562        trb.unicode(self.global_unicode);
563        trb.utf8(false);
564        trb.case_insensitive(self.global_case_insensitive);
565        trb
566    }
567
568    pub fn new(pattern: &'s str) -> Self {
569        Self::with_flags(pattern, &PatternFlags::default())
570    }
571
572    pub fn with_flags(pattern: &'s str, flags: &PatternFlags) -> Self {
573        let mut trb = TranslatorBuilder::new();
574        trb.unicode(flags.unicode);
575        trb.utf8(false);
576        trb.case_insensitive(flags.case_insensitive);
577        Self {
578            translator: trb.build(),
579            pattern,
580            perl_classes: vec![],
581            unicode_classes: resharp_algebra::UnicodeClassCache::default(),
582            pos: Cell::new(Position::new(0, 0, 0)),
583            capture_index: Cell::new(0),
584            octal: false,
585            empty_min_range: false,
586            ignore_whitespace: Cell::new(flags.ignore_whitespace),
587            dot_all: Cell::new(flags.dot_matches_new_line),
588            multiline: Cell::new(flags.multiline),
589            global_unicode: flags.unicode || flags.full_unicode || flags.ascii_perl_classes,
590            global_full_unicode: flags.full_unicode,
591            global_ascii_perl: flags.ascii_perl_classes,
592            global_case_insensitive: flags.case_insensitive,
593            expanded_ast_limit: flags.expanded_ast_limit,
594            max_list_len: flags.max_list_len,
595            max_repeat: flags.max_repeat,
596            max_depth: flags.max_depth,
597            comments: RefCell::new(vec![]),
598            stack_group: RefCell::new(vec![]),
599            stack_class: RefCell::new(vec![]),
600            capture_names: RefCell::new(vec![]),
601            scratch: RefCell::new(String::new()),
602        }
603    }
604
605    fn parser(&'_ self) -> &'_ ResharpParser<'_> {
606        self
607    }
608
609    fn pattern(&self) -> &str {
610        self.pattern
611    }
612
613    fn error(&self, span: Span, kind: ast::ErrorKind) -> ParseError {
614        ParseError {
615            kind,
616            pattern: self.pattern().to_string(),
617            span,
618        }
619    }
620
621    fn unsupported_error(&self, _: regex_syntax::hir::Error) -> ParseError {
622        self.error(
623            Span::splat(self.pos()),
624            ast::ErrorKind::UnsupportedResharpRegex,
625        )
626    }
627
628    fn offset(&self) -> usize {
629        self.parser().pos.get().offset
630    }
631
632    fn line(&self) -> usize {
633        self.parser().pos.get().line
634    }
635
636    fn column(&self) -> usize {
637        self.parser().pos.get().column
638    }
639
640    fn next_capture_index(&self, span: Span) -> Result<u32> {
641        let current = self.parser().capture_index.get();
642        let i = current
643            .checked_add(1)
644            .ok_or_else(|| self.error(span, ast::ErrorKind::CaptureLimitExceeded))?;
645        self.parser().capture_index.set(i);
646        Ok(i)
647    }
648
649    fn add_capture_name(&self, cap: &ast::CaptureName) -> Result<()> {
650        let mut names = self.parser().capture_names.borrow_mut();
651        match names.binary_search_by_key(&cap.name.as_str(), |c| c.name.as_str()) {
652            Err(i) => {
653                names.insert(i, cap.clone());
654                Ok(())
655            }
656            Ok(i) => Err(self.error(
657                cap.span,
658                ast::ErrorKind::GroupNameDuplicate {
659                    original: names[i].span,
660                },
661            )),
662        }
663    }
664
665    fn ignore_whitespace(&self) -> bool {
666        self.parser().ignore_whitespace.get()
667    }
668
669    fn char(&self) -> char {
670        self.char_at(self.offset())
671    }
672
673    fn char_at(&self, i: usize) -> char {
674        self.pattern()[i..]
675            .chars()
676            .next()
677            .unwrap_or_else(|| panic!("expected char at offset {}", i))
678    }
679
680    fn bump(&self) -> bool {
681        if self.is_eof() {
682            return false;
683        }
684        let Position {
685            mut offset,
686            mut line,
687            mut column,
688        } = self.pos();
689        if self.char() == '\n' {
690            line = line.checked_add(1).unwrap();
691            column = 1;
692        } else {
693            column = column.checked_add(1).unwrap();
694        }
695        offset += self.char().len_utf8();
696        self.parser().pos.set(Position {
697            offset,
698            line,
699            column,
700        });
701        self.pattern()[self.offset()..].chars().next().is_some()
702    }
703
704    fn bump_if(&self, prefix: &str) -> bool {
705        if self.pattern()[self.offset()..].starts_with(prefix) {
706            for _ in 0..prefix.chars().count() {
707                self.bump();
708            }
709            true
710        } else {
711            false
712        }
713    }
714
715    fn is_lookaround_prefix(&self) -> Option<(bool, bool)> {
716        if self.bump_if("?=") {
717            return Some((true, true));
718        }
719        if self.bump_if("?!") {
720            return Some((true, false));
721        }
722        if self.bump_if("?<=") {
723            return Some((false, true));
724        }
725        if self.bump_if("?<!") {
726            return Some((false, false));
727        }
728        None
729    }
730
731    fn bump_and_bump_space(&self) -> bool {
732        if !self.bump() {
733            return false;
734        }
735        self.bump_space();
736        !self.is_eof()
737    }
738
739    fn bump_space(&self) {
740        if !self.ignore_whitespace() {
741            return;
742        }
743        while !self.is_eof() {
744            if self.char().is_whitespace() {
745                self.bump();
746            } else if self.char() == '#' {
747                let start = self.pos();
748                let mut comment_text = String::new();
749                self.bump();
750                while !self.is_eof() {
751                    let c = self.char();
752                    self.bump();
753                    if c == '\n' {
754                        break;
755                    }
756                    comment_text.push(c);
757                }
758                let comment = ast::Comment {
759                    span: Span::new(start, self.pos()),
760                    comment: comment_text,
761                };
762                self.parser().comments.borrow_mut().push(comment);
763            } else {
764                break;
765            }
766        }
767    }
768
769    fn peek(&self) -> Option<char> {
770        if self.is_eof() {
771            return None;
772        }
773        self.pattern()[self.offset() + self.char().len_utf8()..]
774            .chars()
775            .next()
776    }
777
778    /// Like peek, but will ignore spaces when the parser is in whitespace
779    /// insensitive mode.
780    fn peek_space(&self) -> Option<char> {
781        if !self.ignore_whitespace() {
782            return self.peek();
783        }
784        if self.is_eof() {
785            return None;
786        }
787        let mut start = self.offset() + self.char().len_utf8();
788        let mut in_comment = false;
789        for (i, c) in self.pattern()[start..].char_indices() {
790            if c.is_whitespace() {
791                continue;
792            } else if !in_comment && c == '#' {
793                in_comment = true;
794            } else if in_comment && c == '\n' {
795                in_comment = false;
796            } else {
797                start += i;
798                break;
799            }
800        }
801        self.pattern()[start..].chars().next()
802    }
803
804    fn is_eof(&self) -> bool {
805        self.offset() == self.pattern().len()
806    }
807
808    fn pos(&self) -> Position {
809        self.parser().pos.get()
810    }
811
812    fn span(&self) -> Span {
813        Span::splat(self.pos())
814    }
815
816    fn span_char(&self) -> Span {
817        let mut next = Position {
818            offset: self.offset().checked_add(self.char().len_utf8()).unwrap(),
819            line: self.line(),
820            column: self.column().checked_add(1).unwrap(),
821        };
822        if self.char() == '\n' {
823            next.line += 1;
824            next.column = 1;
825        }
826        Span::new(self.pos(), next)
827    }
828
829    #[inline(never)]
830    fn push_alternate(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
831        assert_eq!(self.char(), '|');
832        concat.span.end = self.pos();
833        self.push_or_add_alternation(concat);
834        self.bump();
835        Ok(ast::Concat {
836            span: self.span(),
837            asts: vec![],
838        })
839    }
840
841    fn push_or_add_alternation(&self, concat: Concat) {
842        use self::GroupState::*;
843
844        let mut stack = self.parser().stack_group.borrow_mut();
845        if let Some(&mut Alternation(ref mut alts)) = stack.last_mut() {
846            alts.asts.push(concat.into_ast());
847            return;
848        }
849        stack.push(Alternation(ast::Alternation {
850            span: Span::new(concat.span.start, self.pos()),
851            asts: vec![concat.into_ast()],
852        }));
853    }
854
855    #[inline(never)]
856    fn push_intersect(&self, mut concat: Concat) -> Result<Concat> {
857        assert_eq!(self.char(), '&');
858        concat.span.end = self.pos();
859        self.push_or_add_intersect(concat);
860        self.bump();
861        Ok(Concat {
862            span: self.span(),
863            asts: vec![],
864        })
865    }
866
867    fn push_or_add_intersect(&self, concat: Concat) {
868        use self::GroupState::*;
869
870        let mut stack = self.parser().stack_group.borrow_mut();
871        if let Some(&mut Intersection(ref mut alts)) = stack.last_mut() {
872            alts.asts.push(concat.into_ast());
873            return;
874        }
875        stack.push(Intersection(ast::Intersection {
876            span: Span::new(concat.span.start, self.pos()),
877            asts: vec![concat.into_ast()],
878        }));
879    }
880
881    #[inline(never)]
882    fn push_group(&self, mut concat: Concat) -> Result<Concat> {
883        assert_eq!(self.char(), '(');
884        match self.parse_group()? {
885            Either::Left(set) => {
886                let ignore = set.flags.flag_state(ast::Flag::IgnoreWhitespace);
887                if let Some(v) = ignore {
888                    self.parser().ignore_whitespace.set(v);
889                }
890
891                concat.asts.push(Ast::flags(set));
892                Ok(concat)
893            }
894            Either::Right(group) => {
895                let old_ignore_whitespace = self.ignore_whitespace();
896                let new_ignore_whitespace = group
897                    .flags()
898                    .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
899                    .unwrap_or(old_ignore_whitespace);
900                self.parser()
901                    .stack_group
902                    .borrow_mut()
903                    .push(GroupState::Group {
904                        concat,
905                        group,
906                        ignore_whitespace: old_ignore_whitespace,
907                    });
908                self.parser().ignore_whitespace.set(new_ignore_whitespace);
909                Ok(Concat {
910                    span: self.span(),
911                    asts: vec![],
912                })
913            }
914        }
915    }
916
917    #[inline(never)]
918    fn push_compl_group(&self, concat: Concat) -> Result<Concat> {
919        assert_eq!(self.char(), '~');
920        self.bump();
921        if self.is_eof() || self.char() != '(' {
922            return Err(self.error(self.span(), ast::ErrorKind::ComplementGroupExpected));
923        }
924        let open_span = self.span_char();
925        self.bump();
926        let group = ast::Group {
927            span: open_span,
928            kind: ast::GroupKind::Complement,
929            ast: Box::new(Ast::empty(self.span())),
930        };
931
932        let old_ignore_whitespace = self.ignore_whitespace();
933        let new_ignore_whitespace = group
934            .flags()
935            .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
936            .unwrap_or(old_ignore_whitespace);
937        self.parser()
938            .stack_group
939            .borrow_mut()
940            .push(GroupState::Group {
941                concat,
942                group,
943                ignore_whitespace: old_ignore_whitespace,
944            });
945        self.parser().ignore_whitespace.set(new_ignore_whitespace);
946        Ok(Concat {
947            span: self.span(),
948            asts: vec![],
949        })
950    }
951
952    #[inline(never)]
953    fn pop_group(&self, mut group_concat: Concat) -> Result<Concat> {
954        use self::GroupState::*;
955        assert_eq!(self.char(), ')');
956        let mut stack = self.parser().stack_group.borrow_mut();
957        let topstack = stack.pop();
958
959        let (mut prior_concat, mut group, ignore_whitespace, alt) = match topstack {
960            Some(Group {
961                concat,
962                group,
963                ignore_whitespace,
964            }) => (concat, group, ignore_whitespace, None),
965            Some(Alternation(alt)) => match stack.pop() {
966                Some(Group {
967                    concat,
968                    group,
969                    ignore_whitespace,
970                }) => (
971                    concat,
972                    group,
973                    ignore_whitespace,
974                    Some(Either::Left::<ast::Alternation, ast::Intersection>(alt)),
975                ),
976                None | Some(Alternation(_)) | Some(Intersection(_)) => {
977                    return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
978                }
979            },
980            Some(Intersection(int)) => match stack.pop() {
981                Some(Group {
982                    concat,
983                    group,
984                    ignore_whitespace,
985                }) => (
986                    concat,
987                    group,
988                    ignore_whitespace,
989                    Some(Either::Right::<ast::Alternation, ast::Intersection>(int)),
990                ),
991                None | Some(Alternation(_)) | Some(Intersection(_)) => {
992                    return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
993                }
994            },
995
996            None => {
997                return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
998            }
999        };
1000        self.parser().ignore_whitespace.set(ignore_whitespace);
1001        group_concat.span.end = self.pos();
1002        self.bump();
1003        group.span.end = self.pos();
1004        match alt {
1005            Some(Either::Left(mut alt)) => {
1006                alt.span.end = group_concat.span.end;
1007                alt.asts.push(group_concat.into_ast());
1008                group.ast = Box::new(alt.into_ast());
1009            }
1010            Some(Either::Right(mut int)) => {
1011                int.span.end = group_concat.span.end;
1012                int.asts.push(group_concat.into_ast());
1013                group.ast = Box::new(int.into_ast());
1014            }
1015            None => {
1016                group.ast = Box::new(group_concat.into_ast());
1017            }
1018        }
1019
1020        if group.kind == GroupKind::Complement {
1021            let complement = ast::Complement {
1022                span: self.span(),
1023                ast: group.ast,
1024            };
1025            prior_concat.asts.push(Ast::complement(complement));
1026        }
1027        // ignore groups for now
1028        else {
1029            prior_concat.asts.push(Ast::group(group));
1030        }
1031        Ok(prior_concat)
1032    }
1033
1034    #[inline(never)]
1035    fn pop_group_end(&self, mut concat: ast::Concat) -> Result<Ast> {
1036        concat.span.end = self.pos();
1037        let mut stack = self.parser().stack_group.borrow_mut();
1038        let ast = match stack.pop() {
1039            None => Ok(concat.into_ast()),
1040            Some(GroupState::Alternation(mut alt)) => {
1041                alt.span.end = self.pos();
1042                alt.asts.push(concat.into_ast());
1043                Ok(Ast::alternation(alt))
1044            }
1045            Some(GroupState::Intersection(mut int)) => {
1046                int.span.end = self.pos();
1047                int.asts.push(concat.into_ast());
1048
1049                Ok(Ast::intersection(int))
1050            }
1051            Some(GroupState::Group { group, .. }) => {
1052                return Err(self.error(group.span, ast::ErrorKind::GroupUnclosed));
1053            }
1054        };
1055        // If we try to pop again, there should be nothing.
1056        match stack.pop() {
1057            None => ast,
1058            Some(GroupState::Alternation(alt)) => {
1059                Err(self.error(alt.span, ast::ErrorKind::UnsupportedResharpRegex))
1060            }
1061            Some(GroupState::Intersection(int)) => {
1062                Err(self.error(int.span, ast::ErrorKind::UnsupportedResharpRegex))
1063            }
1064            Some(GroupState::Group { group, .. }) => {
1065                Err(self.error(group.span, ast::ErrorKind::GroupUnclosed))
1066            }
1067        }
1068    }
1069
1070    #[inline(never)]
1071    fn push_class_open(
1072        &self,
1073        parent_union: regex_syntax::ast::ClassSetUnion,
1074    ) -> Result<regex_syntax::ast::ClassSetUnion> {
1075        assert_eq!(self.char(), '[');
1076
1077        let (nested_set, nested_union) = self.parse_set_class_open()?;
1078        self.parser()
1079            .stack_class
1080            .borrow_mut()
1081            .push(ClassState::Open {
1082                union: parent_union,
1083                set: nested_set,
1084            });
1085        Ok(nested_union)
1086    }
1087
1088    #[inline(never)]
1089    fn pop_class(
1090        &self,
1091        nested_union: regex_syntax::ast::ClassSetUnion,
1092    ) -> Result<Either<regex_syntax::ast::ClassSetUnion, regex_syntax::ast::ClassBracketed>> {
1093        assert_eq!(self.char(), ']');
1094
1095        let item = regex_syntax::ast::ClassSet::Item(nested_union.into_item());
1096        let prevset = self.pop_class_op(item);
1097        let mut stack = self.parser().stack_class.borrow_mut();
1098        match stack.pop() {
1099            None => panic!("unexpected empty character class stack"),
1100            Some(ClassState::Op { .. }) => panic!("unexpected ClassState::Op"),
1101            Some(ClassState::Open { mut union, mut set }) => {
1102                self.bump();
1103                set.span.end = self.pos();
1104                set.kind = prevset;
1105                if stack.is_empty() {
1106                    Ok(Either::Right(set))
1107                } else {
1108                    union.push(regex_syntax::ast::ClassSetItem::Bracketed(Box::new(set)));
1109                    Ok(Either::Left(union))
1110                }
1111            }
1112        }
1113    }
1114
1115    #[inline(never)]
1116    fn unclosed_class_error(&self) -> ParseError {
1117        for state in self.parser().stack_class.borrow().iter().rev() {
1118            if let ClassState::Open { ref set, .. } = *state {
1119                return self.error(set.span, ast::ErrorKind::ClassUnclosed);
1120            }
1121        }
1122        panic!("no open character class found")
1123    }
1124
1125    #[inline(never)]
1126    fn push_class_op(
1127        &self,
1128        next_kind: regex_syntax::ast::ClassSetBinaryOpKind,
1129        next_union: regex_syntax::ast::ClassSetUnion,
1130    ) -> regex_syntax::ast::ClassSetUnion {
1131        let item = regex_syntax::ast::ClassSet::Item(next_union.into_item());
1132        let new_lhs = self.pop_class_op(item);
1133        self.parser().stack_class.borrow_mut().push(ClassState::Op {
1134            kind: next_kind,
1135            lhs: new_lhs,
1136        });
1137        regex_syntax::ast::ClassSetUnion {
1138            span: self.span(),
1139            items: vec![],
1140        }
1141    }
1142
1143    #[inline(never)]
1144    fn pop_class_op(&self, rhs: regex_syntax::ast::ClassSet) -> regex_syntax::ast::ClassSet {
1145        let mut stack = self.parser().stack_class.borrow_mut();
1146        let (kind, lhs) = match stack.pop() {
1147            Some(ClassState::Op { kind, lhs }) => (kind, lhs),
1148            Some(state @ ClassState::Open { .. }) => {
1149                stack.push(state);
1150                return rhs;
1151            }
1152            None => unreachable!(),
1153        };
1154        let span = Span::new(lhs.span().start, rhs.span().end);
1155        regex_syntax::ast::ClassSet::BinaryOp(regex_syntax::ast::ClassSetBinaryOp {
1156            span,
1157            kind,
1158            lhs: Box::new(lhs),
1159            rhs: Box::new(rhs),
1160        })
1161    }
1162
1163    fn hir_to_node_id(&self, hir: &hir::Hir, tb: &mut TB<'s>) -> Result<NodeId> {
1164        match hir.kind() {
1165            hir::HirKind::Empty => Ok(NodeId::EPS),
1166            hir::HirKind::Literal(l) => {
1167                if l.0.len() == 1 {
1168                    let node = tb.mk_u8(l.0[0]);
1169                    Ok(node)
1170                } else {
1171                    let ws: Vec<_> = l.0.iter().map(|l| tb.mk_u8(*l)).collect();
1172                    let conc = tb.mk_concats(ws.iter().copied());
1173                    Ok(conc)
1174                }
1175            }
1176            hir::HirKind::Class(class) => match class {
1177                hir::Class::Unicode(class_unicode) => {
1178                    let ranges = class_unicode.ranges();
1179                    if ranges.len() == 1
1180                        && ranges[0].start() == '\u{0}'
1181                        && ranges[0].end() == '\u{10FFFF}'
1182                    {
1183                        return Ok(tb.mk_range_u8(0, 255));
1184                    }
1185                    let mut nodes = Vec::new();
1186                    for range in ranges {
1187                        for seq in Utf8Sequences::new(range.start(), range.end()) {
1188                            let sl = seq.as_slice();
1189                            let bytes: Vec<_> = sl.iter().map(|s| (s.start, s.end)).collect();
1190                            let node = match bytes.len() {
1191                                1 => tb.mk_range_u8(bytes[0].0, bytes[0].1),
1192                                n => {
1193                                    let last = tb.mk_range_u8(bytes[n - 1].0, bytes[n - 1].1);
1194                                    let mut conc = last;
1195                                    for i in (0..n - 1).rev() {
1196                                        let b = tb.mk_range_u8(bytes[i].0, bytes[i].1);
1197                                        conc = tb.mk_concat(b, conc);
1198                                    }
1199                                    conc
1200                                }
1201                            };
1202                            nodes.push(node);
1203                        }
1204                    }
1205                    let merged = tb.mk_unions(nodes.into_iter());
1206                    Ok(merged)
1207                }
1208                hir::Class::Bytes(class_bytes) => {
1209                    let ranges = class_bytes.ranges();
1210                    let mut result = NodeId::BOT;
1211                    for range in ranges {
1212                        let start = range.start();
1213                        let end = range.end();
1214                        let node = tb.mk_range_u8(start, end);
1215                        result = tb.mk_union(result, node);
1216                    }
1217                    Ok(result)
1218                }
1219            },
1220            hir::HirKind::Look(_) => Err(self.error(
1221                Span::splat(self.pos()),
1222                ast::ErrorKind::UnsupportedResharpRegex,
1223            )),
1224            hir::HirKind::Repetition(_) => Err(self.error(
1225                Span::splat(self.pos()),
1226                ast::ErrorKind::UnsupportedResharpRegex,
1227            )),
1228            hir::HirKind::Capture(_) => Err(self.error(
1229                Span::splat(self.pos()),
1230                ast::ErrorKind::UnsupportedResharpRegex,
1231            )),
1232            hir::HirKind::Concat(body) => {
1233                let mut result = NodeId::EPS;
1234                for child in body {
1235                    let node = self.hir_to_node_id(child, tb)?;
1236                    result = tb.mk_concat(result, node);
1237                }
1238                Ok(result)
1239            }
1240            hir::HirKind::Alternation(_) => Err(self.error(
1241                Span::splat(self.pos()),
1242                ast::ErrorKind::UnsupportedResharpRegex,
1243            )),
1244        }
1245    }
1246
1247    fn translate_ast_to_hir(
1248        &mut self,
1249        orig_ast: &regex_syntax::ast::Ast,
1250        tb: &mut TB<'s>,
1251    ) -> Result<NodeId> {
1252        match self.translator.translate("", orig_ast) {
1253            Err(_) => Err(self.error(self.span(), ast::ErrorKind::UnicodeClassInvalid)),
1254            Ok(hir) => self.hir_to_node_id(&hir, tb),
1255        }
1256    }
1257
1258    fn translator_to_node_id(
1259        &mut self,
1260        orig_ast: &regex_syntax::ast::Ast,
1261        translator: &mut Option<Translator>,
1262        tb: &mut TB<'s>,
1263    ) -> Result<NodeId> {
1264        match translator {
1265            Some(tr) => {
1266                let hir = tr
1267                    .translate("", orig_ast)
1268                    .map_err(|e| self.unsupported_error(e))?;
1269                self.hir_to_node_id(&hir, tb)
1270            }
1271            None => self.translate_ast_to_hir(orig_ast, tb),
1272        }
1273    }
1274
1275    fn get_class(
1276        &mut self,
1277        negated: bool,
1278        kind: regex_syntax::ast::ClassPerlKind,
1279        tb: &mut TB<'s>,
1280    ) -> Result<NodeId> {
1281        let w = self
1282            .perl_classes
1283            .iter()
1284            .find(|(c_neg, c_kind, _)| *c_kind == kind && *c_neg == negated);
1285        match w {
1286            Some((_, _, value)) => Ok(*value),
1287            None => {
1288                let translated = if self.global_ascii_perl {
1289                    let pos = match kind {
1290                        regex_syntax::ast::ClassPerlKind::Word => {
1291                            let az = tb.mk_range_u8(b'a', b'z');
1292                            let big = tb.mk_range_u8(b'A', b'Z');
1293                            let dig = tb.mk_range_u8(b'0', b'9');
1294                            let us = tb.mk_u8(b'_');
1295                            tb.mk_unions([az, big, dig, us].into_iter())
1296                        }
1297                        regex_syntax::ast::ClassPerlKind::Digit => tb.mk_range_u8(b'0', b'9'),
1298                        regex_syntax::ast::ClassPerlKind::Space => {
1299                            let sp = tb.mk_u8(b' ');
1300                            let tab = tb.mk_u8(b'\t');
1301                            let nl = tb.mk_u8(b'\n');
1302                            let cr = tb.mk_u8(b'\r');
1303                            let ff = tb.mk_u8(0x0C);
1304                            let vt = tb.mk_u8(0x0B);
1305                            tb.mk_unions([sp, tab, nl, cr, ff, vt].into_iter())
1306                        }
1307                    };
1308                    if negated {
1309                        resharp_algebra::neg_class(tb, pos)
1310                    } else {
1311                        pos
1312                    }
1313                } else if self.global_unicode {
1314                    match kind {
1315                        regex_syntax::ast::ClassPerlKind::Word => {
1316                            if self.global_full_unicode {
1317                                self.unicode_classes.ensure_word_full(tb);
1318                            } else {
1319                                self.unicode_classes.ensure_word(tb);
1320                            }
1321                            if negated {
1322                                self.unicode_classes.non_word
1323                            } else {
1324                                self.unicode_classes.word
1325                            }
1326                        }
1327                        regex_syntax::ast::ClassPerlKind::Digit => {
1328                            if self.global_full_unicode {
1329                                self.unicode_classes.ensure_digit_full(tb);
1330                            } else {
1331                                self.unicode_classes.ensure_digit(tb);
1332                            }
1333                            if negated {
1334                                self.unicode_classes.non_digit
1335                            } else {
1336                                self.unicode_classes.digit
1337                            }
1338                        }
1339                        regex_syntax::ast::ClassPerlKind::Space => {
1340                            if self.global_full_unicode {
1341                                self.unicode_classes.ensure_space_full(tb);
1342                            } else {
1343                                self.unicode_classes.ensure_space(tb);
1344                            }
1345                            if negated {
1346                                self.unicode_classes.non_space
1347                            } else {
1348                                self.unicode_classes.space
1349                            }
1350                        }
1351                    }
1352                } else {
1353                    let pos = match kind {
1354                        regex_syntax::ast::ClassPerlKind::Word => {
1355                            let az = tb.mk_range_u8(b'a', b'z');
1356                            let big = tb.mk_range_u8(b'A', b'Z');
1357                            let dig = tb.mk_range_u8(b'0', b'9');
1358                            let us = tb.mk_u8(b'_');
1359                            tb.mk_unions([az, big, dig, us].into_iter())
1360                        }
1361                        regex_syntax::ast::ClassPerlKind::Digit => tb.mk_range_u8(b'0', b'9'),
1362                        regex_syntax::ast::ClassPerlKind::Space => {
1363                            let sp = tb.mk_u8(b' ');
1364                            let tab = tb.mk_u8(b'\t');
1365                            let nl = tb.mk_u8(b'\n');
1366                            let cr = tb.mk_u8(b'\r');
1367                            let ff = tb.mk_u8(0x0C);
1368                            let vt = tb.mk_u8(0x0B);
1369                            tb.mk_unions([sp, tab, nl, cr, ff, vt].into_iter())
1370                        }
1371                    };
1372                    if negated {
1373                        // Negate the byte class, not the whole language: `~(pos)`
1374                        // (mk_compl) includes the empty string and every non-class
1375                        // substring, so `\D`/`\S`/`\W` wrongly become nullable and
1376                        // match "". neg_class negates within the single byte, as
1377                        // the js branch above does.
1378                        resharp_algebra::neg_class(tb, pos)
1379                    } else {
1380                        pos
1381                    }
1382                };
1383                self.perl_classes.push((negated, kind, translated));
1384                Ok(translated)
1385            }
1386        }
1387    }
1388
1389    fn word_char_kind(ast: &Ast, left: bool) -> WordCharKind {
1390        use WordCharKind::*;
1391        match ast {
1392            Ast::Literal(lit) => {
1393                if is_word_byte(lit.c as u8) {
1394                    Word
1395                } else {
1396                    NonWord
1397                }
1398            }
1399            Ast::ClassPerl(c) => match (&c.kind, c.negated) {
1400                (&regex_syntax::ast::ClassPerlKind::Word, false) => Word,
1401                (&regex_syntax::ast::ClassPerlKind::Word, true) => NonWord,
1402                (&regex_syntax::ast::ClassPerlKind::Space, false) => NonWord,
1403                (&regex_syntax::ast::ClassPerlKind::Space, true) => Unknown,
1404                (&regex_syntax::ast::ClassPerlKind::Digit, false) => Word,
1405                (&regex_syntax::ast::ClassPerlKind::Digit, true) => Unknown,
1406            },
1407            Ast::ClassBracketed(c) => class_bracketed_word_kind(c),
1408            Ast::Dot(_) | Ast::Top(_) => Unknown,
1409            Ast::Group(g) => Self::word_char_kind(&g.ast, left),
1410            Ast::Concat(c) if !c.asts.is_empty() => {
1411                let edge = if left { c.asts.len() - 1 } else { 0 };
1412                let kind = Self::word_char_kind(&c.asts[edge], left);
1413                match kind {
1414                    MaybeWord => {
1415                        let dir: isize = if left { -1 } else { 1 };
1416                        match Self::concat_neighbor_kind(&c.asts, edge, dir) {
1417                            Word => Word,
1418                            _ => MaybeWord,
1419                        }
1420                    }
1421                    MaybeNonWord => {
1422                        let dir: isize = if left { -1 } else { 1 };
1423                        match Self::concat_neighbor_kind(&c.asts, edge, dir) {
1424                            NonWord => NonWord,
1425                            _ => MaybeNonWord,
1426                        }
1427                    }
1428                    other => other,
1429                }
1430            }
1431            Ast::Alternation(alt) if !alt.asts.is_empty() => {
1432                let first = Self::word_char_kind(&alt.asts[0], left);
1433                if alt.asts[1..]
1434                    .iter()
1435                    .all(|a| Self::word_char_kind(a, left) == first)
1436                {
1437                    first
1438                } else {
1439                    Unknown
1440                }
1441            }
1442            Ast::Repetition(r) => {
1443                let inner = Self::word_char_kind(&r.ast, left);
1444                let nullable = matches!(
1445                    &r.op.kind,
1446                    ast::RepetitionKind::ZeroOrMore
1447                        | ast::RepetitionKind::ZeroOrOne
1448                        | ast::RepetitionKind::Range(
1449                            ast::RepetitionRange::Bounded(0, _)
1450                                | ast::RepetitionRange::Exactly(0)
1451                        )
1452                );
1453                if nullable {
1454                    match inner {
1455                        Word => MaybeWord,
1456                        NonWord => MaybeNonWord,
1457                        _ => Unknown,
1458                    }
1459                } else {
1460                    inner
1461                }
1462            }
1463            Ast::Lookaround(la) => match la.kind {
1464                ast::LookaroundKind::PositiveLookahead
1465                | ast::LookaroundKind::PositiveLookbehind => Self::word_char_kind(&la.ast, left),
1466                _ => Unknown,
1467            },
1468            Ast::Assertion(a) => match (&a.kind, left) {
1469                (ast::AssertionKind::EndText, false) => NonWord,
1470                (ast::AssertionKind::StartText, true) => NonWord,
1471                _ => Unknown,
1472            },
1473            _ => Unknown,
1474        }
1475    }
1476
1477    // ok to return None here, it's only an optimization
1478    fn edge_class_ast(ast: &Ast, left: bool) -> Option<&Ast> {
1479        match ast {
1480            Ast::Literal(_)
1481            | Ast::ClassPerl(_)
1482            | Ast::ClassBracketed(_)
1483            | Ast::ClassUnicode(_)
1484            | Ast::Dot(_)
1485            | Ast::Top(_) => Some(ast),
1486            Ast::Group(g) => Self::edge_class_ast(&g.ast, left),
1487            Ast::Concat(c) if !c.asts.is_empty() => {
1488                Self::edge_class_ast(&c.asts[if left { c.asts.len() - 1 } else { 0 }], left)
1489            }
1490            Ast::Repetition(r) => {
1491                let nullable = matches!(
1492                    &r.op.kind,
1493                    ast::RepetitionKind::ZeroOrMore
1494                        | ast::RepetitionKind::ZeroOrOne
1495                        | ast::RepetitionKind::Range(
1496                            ast::RepetitionRange::Bounded(0, _)
1497                                | ast::RepetitionRange::Exactly(0)
1498                        )
1499                );
1500                if nullable {
1501                    None
1502                } else {
1503                    Self::edge_class_ast(&r.ast, left)
1504                }
1505            }
1506            _ => None,
1507        }
1508    }
1509
1510    fn resolve_word_kind(
1511        &mut self,
1512        asts: &[Ast],
1513        idx: usize,
1514        dir: isize,
1515        translator: &mut Option<Translator>,
1516        tb: &mut TB<'s>,
1517        word_id: NodeId,
1518        not_word_id: NodeId,
1519    ) -> Result<WordCharKind> {
1520        use WordCharKind::*;
1521        let fast = Self::concat_neighbor_kind(asts, idx, dir);
1522        if fast != Unknown {
1523            return Ok(fast);
1524        }
1525        let neighbor_idx = (idx as isize + dir) as usize;
1526        let node = if let Some(edge) = Self::edge_class_ast(&asts[neighbor_idx], dir < 0) {
1527            self.ast_to_node_id(edge, translator, tb)?
1528        } else if dir > 0 {
1529            let mut bodies: Vec<NodeId> = vec![];
1530            let mut j = neighbor_idx;
1531            while j < asts.len() {
1532                match &asts[j] {
1533                    Ast::Lookaround(la) => {
1534                        let kind = la.kind.clone();
1535                        let lookbehind = matches!(
1536                            kind,
1537                            ast::LookaroundKind::PositiveLookbehind
1538                                | ast::LookaroundKind::NegativeLookbehind
1539                        );
1540                        if lookbehind {
1541                            j += 1;
1542                            continue;
1543                        }
1544                        let body = self.ast_to_node_id(&la.ast, translator, tb)?;
1545                        let body = tb.try_elim_lookarounds(body).ok_or_else(|| {
1546                            self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex)
1547                        })?;
1548                        let body_ts = tb.mk_concat(body, NodeId::TS);
1549                        let constraint = match kind {
1550                            ast::LookaroundKind::PositiveLookahead => body_ts,
1551                            ast::LookaroundKind::NegativeLookahead => tb.mk_compl(body_ts),
1552                            _ => unreachable!(),
1553                        };
1554                        bodies.push(constraint);
1555                        j += 1;
1556                    }
1557                    other => {
1558                        let n = self.ast_to_node_id(other, translator, tb)?;
1559                        let n = tb.try_elim_lookarounds(n).ok_or_else(|| {
1560                            self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex)
1561                        })?;
1562                        bodies.push(tb.mk_concat(n, NodeId::TS));
1563                        break;
1564                    }
1565                }
1566            }
1567            if bodies.is_empty() {
1568                return Ok(Unknown);
1569            }
1570            let combined = tb.mk_inters(bodies.into_iter());
1571            let word_prefix = tb.mk_concat(word_id, NodeId::TS);
1572            let non_word_prefix = tb.mk_concat(not_word_id, NodeId::TS);
1573            return if tb.subsumes(word_prefix, combined) == Some(true) {
1574                Ok(Word)
1575            } else if tb.subsumes(non_word_prefix, combined) == Some(true) {
1576                Ok(NonWord)
1577            } else {
1578                Ok(Unknown)
1579            };
1580        } else {
1581            let neighbor_node = self.ast_to_node_id(&asts[neighbor_idx], translator, tb)?;
1582            let mut neighbor_node = tb
1583                .try_elim_lookarounds(neighbor_node)
1584                .ok_or_else(|| self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))?;
1585            neighbor_node = tb.reverse(neighbor_node).or_else(|_| {
1586                Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))
1587            })?;
1588            let word_prefix = tb.mk_concat(word_id, NodeId::TS);
1589            let non_word_prefix = tb.mk_concat(not_word_id, NodeId::TS);
1590            return if tb.subsumes(word_prefix, neighbor_node) == Some(true) {
1591                Ok(Word)
1592            } else if tb.subsumes(non_word_prefix, neighbor_node) == Some(true) {
1593                Ok(NonWord)
1594            } else {
1595                Ok(Unknown)
1596            };
1597        };
1598        if tb.subsumes(word_id, node) == Some(true) {
1599            Ok(Word)
1600        } else if tb.subsumes(not_word_id, node) == Some(true) {
1601            Ok(NonWord)
1602        } else {
1603            Ok(Unknown)
1604        }
1605    }
1606
1607    fn is_transparent_for_dir(ast: &Ast, dir: isize) -> bool {
1608        match ast {
1609            Ast::Lookaround(la) => match la.kind {
1610                ast::LookaroundKind::PositiveLookahead | ast::LookaroundKind::NegativeLookahead => {
1611                    dir < 0
1612                }
1613                ast::LookaroundKind::PositiveLookbehind
1614                | ast::LookaroundKind::NegativeLookbehind => dir > 0,
1615            },
1616            Ast::Repetition(r) => matches!(
1617                &r.op.kind,
1618                ast::RepetitionKind::Range(ast::RepetitionRange::Exactly(0))
1619            ),
1620            _ => false,
1621        }
1622    }
1623
1624    fn concat_neighbor_kind(asts: &[Ast], idx: usize, dir: isize) -> WordCharKind {
1625        use WordCharKind::*;
1626        let next = idx as isize + dir;
1627        if next < 0 || next >= asts.len() as isize {
1628            return Edge;
1629        }
1630        if Self::is_transparent_for_dir(&asts[next as usize], dir) {
1631            return Self::concat_neighbor_kind(asts, next as usize, dir);
1632        }
1633        let kind = Self::word_char_kind(&asts[next as usize], dir < 0);
1634        match kind {
1635            MaybeWord => match Self::concat_neighbor_kind(asts, next as usize, dir) {
1636                Word => Word,
1637                _ => Unknown,
1638            },
1639            MaybeNonWord => match Self::concat_neighbor_kind(asts, next as usize, dir) {
1640                NonWord => NonWord,
1641                _ => Unknown,
1642            },
1643            other => other,
1644        }
1645    }
1646
1647    fn specialize_word_boundaries(
1648        &mut self,
1649        children: &mut [NodeId],
1650        tb: &mut TB<'s>,
1651    ) -> Result<()> {
1652        let wb = self.unicode_classes.wb;
1653        let non_wb = self.unicode_classes.non_wb;
1654        if wb == NodeId::MISSING {
1655            return Ok(());
1656        }
1657        let word = self.unicode_classes.word;
1658        let non_word = self.unicode_classes.non_word;
1659        if word == NodeId::MISSING {
1660            return Ok(());
1661        }
1662        // narrow check, could be done over the whole regex instead of one node
1663        // but i need to make sure the cost isnt too large
1664        let word_pref = tb.mk_concat(word, NodeId::TS);
1665        let non_word_pref = tb.mk_concat(non_word, NodeId::TS);
1666        let word_suf = tb.mk_concat(NodeId::TS, word);
1667        let non_word_suf = tb.mk_concat(NodeId::TS, non_word);
1668        let len = children.len();
1669        for k in 0..len {
1670            let l = if k == 0 {
1671                WordCharKind::Edge
1672            } else {
1673                use resharp_algebra::Kind;
1674                if tb.get_kind(children[k - 1]) == Kind::End
1675                    && (children[k] == wb || children[k] == non_wb)
1676                {
1677                    return Err(
1678                        self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex)
1679                    );
1680                }
1681                Self::classify(tb, children[k - 1], word_suf, non_word_suf)
1682            };
1683            let r = if k + 1 >= len {
1684                WordCharKind::Edge
1685            } else {
1686                Self::classify(tb, children[k + 1], word_pref, non_word_pref)
1687            };
1688            children[k] = Self::rewrite_wb_in_node(tb, children[k], wb, non_wb, word, l, r)
1689                .ok_or_else(|| self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))?;
1690        }
1691        Ok(())
1692    }
1693
1694    fn rewrite_wb_in_node(
1695        b: &mut TB<'s>,
1696        node: NodeId,
1697        wb: NodeId,
1698        non_wb: NodeId,
1699        word: NodeId,
1700        left: WordCharKind,
1701        right: WordCharKind,
1702    ) -> Option<NodeId> {
1703        let boundary_match = if node == wb {
1704            true
1705        } else if node == non_wb {
1706            false
1707        } else if b.get_kind(node) == Kind::Union {
1708            let l = Self::rewrite_wb_in_node(b, node.left(b), wb, non_wb, word, left, right)?;
1709            let r = Self::rewrite_wb_in_node(b, node.right(b), wb, non_wb, word, left, right)?;
1710            return Some(b.mk_union(l, r));
1711        } else {
1712            return Some(node);
1713        };
1714        use WordCharKind::*;
1715        let result = match (left, right) {
1716            (NonWord, Word) | (Word, NonWord) => {
1717                if boundary_match {
1718                    NodeId::EPS
1719                } else {
1720                    NodeId::BOT
1721                }
1722            }
1723            (Word, Word) | (NonWord, NonWord) => {
1724                if boundary_match {
1725                    NodeId::BOT
1726                } else {
1727                    NodeId::EPS
1728                }
1729            }
1730            (Word, _) => {
1731                if boundary_match {
1732                    b.mk_neg_lookahead(word, 0)
1733                } else {
1734                    let tail = b.mk_concat(word, NodeId::TS);
1735                    b.mk_lookahead(tail, NodeId::MISSING, 0)
1736                }
1737            }
1738            (NonWord, _) => {
1739                if boundary_match {
1740                    let tail = b.mk_concat(word, NodeId::TS);
1741                    b.mk_lookahead(tail, NodeId::MISSING, 0)
1742                } else {
1743                    b.mk_neg_lookahead(word, 0)
1744                }
1745            }
1746            (_, Word) => {
1747                if boundary_match {
1748                    b.mk_neg_lookbehind(word)
1749                } else {
1750                    b.mk_lookbehind(word, NodeId::MISSING)
1751                }
1752            }
1753            (_, NonWord) => {
1754                if boundary_match {
1755                    b.mk_lookbehind(word, NodeId::MISSING)
1756                } else {
1757                    b.mk_neg_lookbehind(word)
1758                }
1759            }
1760            _ => return Some(node),
1761        };
1762        Some(result)
1763    }
1764
1765    fn classify(
1766        b: &mut TB<'s>,
1767        node: NodeId,
1768        word_dir: NodeId,
1769        non_word_dir: NodeId,
1770    ) -> WordCharKind {
1771        if b.contains_look(node) || b.contains_anchors(node) {
1772            return WordCharKind::Unknown;
1773        }
1774        if b.subsumes(word_dir, node) == Some(true) {
1775            WordCharKind::Word
1776        } else if b.subsumes(non_word_dir, node) == Some(true) {
1777            WordCharKind::NonWord
1778        } else {
1779            WordCharKind::Unknown
1780        }
1781    }
1782
1783    fn rewrite_word_boundary_in_concat(
1784        &mut self,
1785        asts: &[Ast],
1786        idx: usize,
1787        translator: &mut Option<Translator>,
1788        tb: &mut TB<'s>,
1789        negated: bool,
1790    ) -> Result<(NodeId, usize)> {
1791        use WordCharKind::*;
1792        if self.global_full_unicode {
1793            self.unicode_classes.ensure_word_full(tb);
1794        } else if self.global_unicode && !self.global_ascii_perl {
1795            self.unicode_classes.ensure_word(tb);
1796        } else {
1797            self.unicode_classes.ensure_word_ascii(tb);
1798        }
1799        let word_id = self.unicode_classes.word;
1800        let not_word_id = self.unicode_classes.non_word;
1801        let left = self.resolve_word_kind(asts, idx, -1, translator, tb, word_id, not_word_id)?;
1802        let right = self.resolve_word_kind(asts, idx, 1, translator, tb, word_id, not_word_id)?;
1803        let boundary_match = !negated;
1804        match (left, right) {
1805            (NonWord, Word) | (Word, NonWord) => Ok((
1806                if boundary_match {
1807                    NodeId::EPS
1808                } else {
1809                    NodeId::BOT
1810                },
1811                idx + 1,
1812            )),
1813            (Word, Word) | (NonWord, NonWord) => Ok((
1814                if boundary_match {
1815                    NodeId::BOT
1816                } else {
1817                    NodeId::EPS
1818                },
1819                idx + 1,
1820            )),
1821            (Word, _) => {
1822                if boundary_match {
1823                    Ok((tb.mk_neg_lookahead(word_id, 0), idx + 1))
1824                } else {
1825                    let tail = tb.mk_concat(word_id, NodeId::TS);
1826                    self.merge_boundary_with_following_lookaheads(asts, idx, tail, translator, tb)
1827                }
1828            }
1829            (NonWord, _) => {
1830                if boundary_match {
1831                    let tail = tb.mk_concat(word_id, NodeId::TS);
1832                    self.merge_boundary_with_following_lookaheads(asts, idx, tail, translator, tb)
1833                } else {
1834                    Ok((tb.mk_neg_lookahead(word_id, 0), idx + 1))
1835                }
1836            }
1837            (_, Word) => {
1838                if boundary_match {
1839                    Ok((tb.mk_neg_lookbehind(word_id), idx + 1))
1840                } else {
1841                    Ok((tb.mk_lookbehind(word_id, NodeId::MISSING), idx + 1))
1842                }
1843            }
1844            (_, NonWord) => {
1845                if boundary_match {
1846                    Ok((tb.mk_lookbehind(word_id, NodeId::MISSING), idx + 1))
1847                } else {
1848                    Ok((tb.mk_neg_lookbehind(word_id), idx + 1))
1849                }
1850            }
1851            //   \b = (?<=\w)(?!\w) | (?<!\w)(?=\w)
1852            //   \B = (?<=\w)(?=\w)  | (?<!\w)(?!\w)
1853            //   TODO: expensive and the cost could be reduced further
1854            _ => {
1855                self.unicode_classes.ensure_wb(tb);
1856                let node = if boundary_match {
1857                    self.unicode_classes.wb
1858                } else {
1859                    self.unicode_classes.non_wb
1860                };
1861                Ok((node, idx + 1))
1862            }
1863        }
1864    }
1865
1866    fn merge_boundary_with_following_lookaheads(
1867        &mut self,
1868        asts: &[Ast],
1869        wb_idx: usize,
1870        boundary_tail: NodeId,
1871        translator: &mut Option<Translator>,
1872        tb: &mut TB<'s>,
1873    ) -> Result<(NodeId, usize)> {
1874        let mut next = wb_idx + 1;
1875        let mut la_bodies = vec![boundary_tail];
1876        while next < asts.len() {
1877            match &asts[next] {
1878                Ast::Lookaround(la) if la.kind == ast::LookaroundKind::PositiveLookahead => {
1879                    let body = self.ast_to_node_id(&la.ast, translator, tb)?;
1880                    la_bodies.push(tb.mk_concat(body, NodeId::TS));
1881                    next += 1;
1882                }
1883                _ => break,
1884            }
1885        }
1886        let merged = tb.mk_inters(la_bodies.into_iter());
1887        Ok((tb.mk_lookahead(merged, NodeId::MISSING, 0), next))
1888    }
1889
1890    fn ast_to_node_id(
1891        &mut self,
1892        ast: &Ast,
1893        translator: &mut Option<Translator>,
1894        tb: &mut TB<'s>,
1895    ) -> Result<NodeId> {
1896        match ast {
1897            Ast::Empty(_) => Ok(NodeId::EPS),
1898            Ast::Flags(f) => {
1899                if f.flags.flag_state(ast::Flag::SwapGreed).is_some() {
1900                    return Err(self.error(f.span, ast::ErrorKind::UnsupportedResharpRegex));
1901                }
1902                let mut translator_builder = self.default_translator_builder();
1903                if let Some(state) = f.flags.flag_state(ast::Flag::CaseInsensitive) {
1904                    translator_builder.case_insensitive(state);
1905                }
1906                if let Some(state) = f.flags.flag_state(ast::Flag::Unicode) {
1907                    translator_builder.unicode(state);
1908                }
1909                if let Some(state) = f.flags.flag_state(ast::Flag::DotMatchesNewLine) {
1910                    self.dot_all.set(state);
1911                }
1912                if let Some(state) = f.flags.flag_state(ast::Flag::MultiLine) {
1913                    self.multiline.set(state);
1914                }
1915                let concat_translator = Some(translator_builder.build());
1916                *translator = concat_translator;
1917                Ok(NodeId::EPS)
1918            }
1919            Ast::Literal(l) => {
1920                let ast_lit = regex_syntax::ast::Ast::literal(*l.to_owned());
1921                self.translator_to_node_id(&ast_lit, translator, tb)
1922            }
1923            Ast::Top(_) => Ok(NodeId::TOP),
1924            Ast::Dot(_) => {
1925                let codepoint_dot = self.global_ascii_perl || self.global_full_unicode;
1926                let hirv = match (codepoint_dot, self.dot_all.get()) {
1927                    (true, true) => hir::Hir::dot(hir::Dot::AnyChar),
1928                    (true, false) => hir::Hir::dot(hir::Dot::AnyCharExceptLF),
1929                    (false, true) => return Ok(NodeId::TOP),
1930                    (false, false) => hir::Hir::dot(hir::Dot::AnyByteExceptLF),
1931                };
1932                self.hir_to_node_id(&hirv, tb)
1933            }
1934            Ast::Assertion(a) => match &a.kind {
1935                ast::AssertionKind::StartText => Ok(NodeId::BEGIN),
1936                ast::AssertionKind::EndText => Ok(NodeId::END),
1937                ast::AssertionKind::WordBoundary => {
1938                    let only = Ast::Assertion(a.clone());
1939                    let asts = std::slice::from_ref(&only);
1940                    let (node, _) =
1941                        self.rewrite_word_boundary_in_concat(asts, 0, translator, tb, false)?;
1942                    Ok(node)
1943                }
1944                ast::AssertionKind::NotWordBoundary => {
1945                    // bare \B with no surrounding concat: treat as singleton.
1946                    let only = Ast::Assertion(a.clone());
1947                    let asts = std::slice::from_ref(&only);
1948                    let (node, _) =
1949                        self.rewrite_word_boundary_in_concat(asts, 0, translator, tb, true)?;
1950                    Ok(node)
1951                }
1952                ast::AssertionKind::StartLine => {
1953                    if !self.multiline.get() {
1954                        return Ok(NodeId::BEGIN);
1955                    }
1956                    let left = NodeId::BEGIN;
1957                    let right = tb.mk_u8(b'\n');
1958                    let union = tb.mk_union(left, right);
1959                    Ok(tb.mk_lookbehind(union, NodeId::MISSING))
1960                }
1961                ast::AssertionKind::EndLine => {
1962                    if !self.multiline.get() {
1963                        return Ok(NodeId::END);
1964                    }
1965                    let left = NodeId::END;
1966                    let right = tb.mk_u8(b'\n');
1967                    let union = tb.mk_union(left, right);
1968                    Ok(tb.mk_lookahead(union, NodeId::MISSING, 0))
1969                }
1970                ast::AssertionKind::WordBoundaryStart => {
1971                    Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
1972                }
1973                ast::AssertionKind::WordBoundaryEnd => {
1974                    Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
1975                }
1976                ast::AssertionKind::WordBoundaryStartAngle => {
1977                    Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
1978                }
1979                ast::AssertionKind::WordBoundaryEndAngle => {
1980                    Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
1981                }
1982                ast::AssertionKind::WordBoundaryStartHalf => {
1983                    Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
1984                }
1985                ast::AssertionKind::WordBoundaryEndHalf => {
1986                    Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
1987                }
1988            },
1989            Ast::ClassUnicode(c) => {
1990                let tmp = regex_syntax::ast::ClassUnicode {
1991                    span: c.span,
1992                    negated: c.negated,
1993                    kind: c.kind.clone(),
1994                };
1995                if !c.negated {
1996                    if let regex_syntax::ast::ClassUnicodeKind::Named(s) = &c.kind {
1997                        match s.as_str() {
1998                            // \p{ascii} for ascii, \p{ascii}&\p{Letter} => [A-Za-z]
1999                            "ascii" => return Ok(tb.mk_range_u8(0, 127)),
2000                            // a single valid utf8 codepoint; \p{utf8}*&~(a) => non a, but valid utf8
2001                            "utf8" => {
2002                                let ascii = tb.mk_range_u8(0, 127);
2003                                let beta = tb.mk_range_u8(128, 0xBF);
2004                                let c0 = tb.mk_range_u8(0xC0, 0xDF);
2005                                let c0s = tb.mk_concats([c0, beta].into_iter());
2006                                let e0 = tb.mk_range_u8(0xE0, 0xEF);
2007                                let e0s = tb.mk_concats([e0, beta, beta].into_iter());
2008                                let f0 = tb.mk_range_u8(0xF0, 0xF7);
2009                                let f0s = tb.mk_concats([f0, beta, beta, beta].into_iter());
2010                                return Ok(tb.mk_unions([ascii, c0s, e0s, f0s].into_iter()));
2011                            }
2012                            "hex" => {
2013                                let nums = tb.mk_range_u8(b'0', b'9');
2014                                let lets = tb.mk_range_u8(b'a', b'f');
2015                                let lets2 = tb.mk_range_u8(b'A', b'F');
2016                                let merged = tb.mk_unions([nums, lets, lets2].into_iter());
2017                                return Ok(merged);
2018                            }
2019                            _ => {}
2020                        }
2021                    };
2022                }
2023
2024                let orig_ast = regex_syntax::ast::Ast::class_unicode(tmp);
2025                self.translator_to_node_id(&orig_ast, translator, tb)
2026            }
2027            Ast::ClassPerl(c) => self.get_class(c.negated, c.kind.clone(), tb),
2028            Ast::ClassBracketed(c) => match &c.kind {
2029                regex_syntax::ast::ClassSet::Item(item) => {
2030                    if !c.negated && is_universal_perl_pair(item) {
2031                        return Ok(NodeId::TOP);
2032                    }
2033                    if let regex_syntax::ast::ClassSetItem::Perl(p) = item {
2034                        return self.get_class(c.negated ^ p.negated, p.kind.clone(), tb);
2035                    }
2036                    let kind = if self.global_ascii_perl {
2037                        rewrite_ascii_perl_set(&c.kind)
2038                    } else {
2039                        c.kind.clone()
2040                    };
2041                    let tmp = regex_syntax::ast::ClassBracketed {
2042                        span: c.span,
2043                        negated: c.negated,
2044                        kind,
2045                    };
2046                    let orig_ast = regex_syntax::ast::Ast::class_bracketed(tmp);
2047                    self.translator_to_node_id(&orig_ast, translator, tb)
2048                }
2049                regex_syntax::ast::ClassSet::BinaryOp(_) => {
2050                    Err(self.error(c.span, ast::ErrorKind::UnsupportedResharpRegex))
2051                }
2052            },
2053            Ast::Repetition(r) => {
2054                let body = self.ast_to_node_id(&r.ast, translator, tb);
2055                match body {
2056                    Ok(body) => match &r.op.kind {
2057                        ast::RepetitionKind::ZeroOrOne => Ok(tb.mk_opt(body)),
2058                        ast::RepetitionKind::ZeroOrMore => Ok(tb.mk_star(body)),
2059                        ast::RepetitionKind::OneOrMore => Ok(tb.mk_plus(body)),
2060                        ast::RepetitionKind::Range(r) => match r {
2061                            ast::RepetitionRange::Exactly(n) => Ok(tb.mk_repeat(body, *n, *n)),
2062                            ast::RepetitionRange::AtLeast(n) => {
2063                                let rep = tb.mk_repeat(body, *n, *n);
2064                                let st = tb.mk_star(body);
2065                                Ok(tb.mk_concat(rep, st))
2066                            }
2067
2068                            ast::RepetitionRange::Bounded(n, m) => Ok(tb.mk_repeat(body, *n, *m)),
2069                        },
2070                    },
2071                    Err(_) => body,
2072                }
2073            }
2074            Ast::Lookaround(g) => {
2075                let body = self.ast_to_node_id(&g.ast, translator, tb)?;
2076                match g.kind {
2077                    ast::LookaroundKind::PositiveLookahead if body.contains_lookbehind(tb) => {
2078                        let mut prefix = NodeId::EPS;
2079                        let mut rest = body;
2080                        while tb.get_kind(rest) == Kind::Concat
2081                            && tb.get_kind(rest.left(tb)) == Kind::Lookbehind
2082                        {
2083                            prefix = tb.mk_concat(prefix, rest.left(tb));
2084                            rest = rest.right(tb);
2085                        }
2086                        if prefix == NodeId::EPS || rest.contains_lookbehind(tb) {
2087                            return Err(self.error(g.span, ast::ErrorKind::UnsupportedResharpRegex));
2088                        }
2089                        let la = tb.mk_lookahead(rest, NodeId::MISSING, 0);
2090                        Ok(tb.mk_concat(prefix, la))
2091                    }
2092                    ast::LookaroundKind::NegativeLookahead if body.contains_lookbehind(tb) => {
2093                        Err(self.error(g.span, ast::ErrorKind::UnsupportedResharpRegex))
2094                    }
2095                    ast::LookaroundKind::PositiveLookahead => {
2096                        Ok(tb.mk_lookahead(body, NodeId::MISSING, 0))
2097                    }
2098                    ast::LookaroundKind::PositiveLookbehind
2099                    | ast::LookaroundKind::NegativeLookbehind
2100                        if body.contains_lookahead(tb) =>
2101                    {
2102                        Err(self.error(g.span, ast::ErrorKind::UnsupportedResharpRegex))
2103                    }
2104                    ast::LookaroundKind::PositiveLookbehind => {
2105                        Ok(tb.mk_lookbehind(body, NodeId::MISSING))
2106                    }
2107                    ast::LookaroundKind::NegativeLookahead => Ok(tb.mk_neg_lookahead(body, 0)),
2108                    ast::LookaroundKind::NegativeLookbehind => Ok(tb.mk_neg_lookbehind(body)),
2109                }
2110            }
2111            Ast::Group(g) => {
2112                if let ast::GroupKind::NonCapturing(ref flags) = g.kind {
2113                    if !flags.items.is_empty() {
2114                        let mut translator_builder = self.default_translator_builder();
2115                        if let Some(state) = flags.flag_state(ast::Flag::CaseInsensitive) {
2116                            translator_builder.case_insensitive(state);
2117                        }
2118                        if let Some(state) = flags.flag_state(ast::Flag::Unicode) {
2119                            translator_builder.unicode(state);
2120                        }
2121                        let saved_dot_all = self.dot_all.get();
2122                        if let Some(state) = flags.flag_state(ast::Flag::DotMatchesNewLine) {
2123                            self.dot_all.set(state);
2124                        }
2125                        let saved_multiline = self.multiline.get();
2126                        if let Some(state) = flags.flag_state(ast::Flag::MultiLine) {
2127                            self.multiline.set(state);
2128                        }
2129                        let mut scoped = Some(translator_builder.build());
2130                        let result = self.ast_to_node_id(&g.ast, &mut scoped, tb);
2131                        self.dot_all.set(saved_dot_all);
2132                        self.multiline.set(saved_multiline);
2133                        return result;
2134                    }
2135                }
2136                self.ast_to_node_id(&g.ast, translator, tb)
2137            }
2138            Ast::Alternation(a) => {
2139                let mut children = vec![];
2140                for ast in &a.asts {
2141                    match self.ast_to_node_id(ast, translator, tb) {
2142                        Ok(node_id) => children.push(node_id),
2143                        Err(err) => return Err(err),
2144                    }
2145                }
2146                Ok(tb.mk_unions(children.iter().copied()))
2147            }
2148            Ast::Concat(c) => {
2149                let mut concat_translator: Option<Translator> = None;
2150                let mut children = vec![];
2151                let mut prev_boundary_child: Option<usize> = None;
2152                let mut i = 0;
2153                while i < c.asts.len() {
2154                    let ast = &c.asts[i];
2155                    match ast {
2156                        Ast::Flags(f) => {
2157                            if f.flags.flag_state(ast::Flag::SwapGreed).is_some() {
2158                                return Err(
2159                                    self.error(f.span, ast::ErrorKind::UnsupportedResharpRegex)
2160                                );
2161                            }
2162                            let mut translator_builder = self.default_translator_builder();
2163                            if let Some(state) = f.flags.flag_state(ast::Flag::CaseInsensitive) {
2164                                translator_builder.case_insensitive(state);
2165                            }
2166                            if let Some(state) = f.flags.flag_state(ast::Flag::Unicode) {
2167                                translator_builder.unicode(state);
2168                            }
2169                            if let Some(state) = f.flags.flag_state(ast::Flag::DotMatchesNewLine) {
2170                                self.dot_all.set(state);
2171                            }
2172                            if let Some(state) = f.flags.flag_state(ast::Flag::MultiLine) {
2173                                self.multiline.set(state);
2174                            }
2175                            concat_translator = Some(translator_builder.build());
2176                            *translator = concat_translator.clone();
2177                            i += 1;
2178                            continue;
2179                        }
2180                        Ast::Assertion(a)
2181                            if a.kind == ast::AssertionKind::WordBoundary
2182                                || a.kind == ast::AssertionKind::NotWordBoundary =>
2183                        {
2184                            let negated = a.kind == ast::AssertionKind::NotWordBoundary;
2185                            let node = self.rewrite_word_boundary_in_concat(
2186                                &c.asts, i, translator, tb, negated,
2187                            )?;
2188                            match prev_boundary_child {
2189                                Some(idx) => children[idx] = tb.mk_inter(children[idx], node.0),
2190                                None => {
2191                                    children.push(node.0);
2192                                    prev_boundary_child = Some(children.len() - 1);
2193                                }
2194                            }
2195                            i = node.1; // skip consumed lookaheads
2196                            continue;
2197                        }
2198                        _ => {}
2199                    }
2200                    match concat_translator {
2201                        Some(_) => match self.ast_to_node_id(ast, &mut concat_translator, tb) {
2202                            Ok(node_id) => {
2203                                if node_id != resharp_algebra::NodeId::EPS {
2204                                    prev_boundary_child = None;
2205                                    children.push(node_id);
2206                                }
2207                            }
2208                            Err(err) => return Err(err),
2209                        },
2210                        None => match self.ast_to_node_id(ast, translator, tb) {
2211                            Ok(node_id) => {
2212                                if node_id != resharp_algebra::NodeId::EPS {
2213                                    prev_boundary_child = None;
2214                                    children.push(node_id);
2215                                }
2216                            }
2217                            Err(err) => return Err(err),
2218                        },
2219                    }
2220                    i += 1;
2221                }
2222                self.specialize_word_boundaries(&mut children, tb)?;
2223                Ok(tb.mk_concats(children.iter().cloned()))
2224            }
2225            Ast::Intersection(intersection) => {
2226                let mut children = vec![];
2227                for ast in &intersection.asts {
2228                    match self.ast_to_node_id(ast, translator, tb) {
2229                        Ok(node_id) => children.push(node_id),
2230                        Err(err) => return Err(err),
2231                    }
2232                }
2233                Ok(tb.mk_inters(children.into_iter()))
2234            }
2235            Ast::Complement(complement) => {
2236                let body = self.ast_to_node_id(&complement.ast, translator, tb);
2237                body.map(|x| tb.mk_compl(x))
2238            }
2239        }
2240    }
2241
2242    fn parse_inner(&mut self) -> Result<Ast> {
2243        let mut concat = Concat {
2244            span: self.span(),
2245            asts: vec![],
2246        };
2247        loop {
2248            self.bump_space();
2249            if self.is_eof() {
2250                break;
2251            }
2252            match self.char() {
2253                '(' => concat = self.push_group(concat)?,
2254                ')' => concat = self.pop_group(concat)?,
2255                '|' => concat = self.push_alternate(concat)?,
2256                '&' => concat = self.push_intersect(concat)?,
2257                '~' => concat = self.push_compl_group(concat)?,
2258                '[' => {
2259                    let class = self.parse_set_class()?;
2260                    concat.asts.push(Ast::class_bracketed(class));
2261                }
2262                '?' => {
2263                    concat =
2264                        self.parse_uncounted_repetition(concat, ast::RepetitionKind::ZeroOrOne)?;
2265                }
2266                '*' => {
2267                    concat =
2268                        self.parse_uncounted_repetition(concat, ast::RepetitionKind::ZeroOrMore)?;
2269                }
2270                '+' => {
2271                    concat =
2272                        self.parse_uncounted_repetition(concat, ast::RepetitionKind::OneOrMore)?;
2273                }
2274                '{' => {
2275                    concat = self.parse_counted_repetition(concat)?;
2276                }
2277                _ => concat.asts.push(self.parse_primitive()?.into_ast()),
2278            }
2279            if self.stack_group.borrow().len() > self.max_depth {
2280                return Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex));
2281            }
2282        }
2283        let ast = self.pop_group_end(concat)?;
2284        if expanded_ast_size(&ast, self.expanded_ast_limit) >= self.expanded_ast_limit
2285            || max_concat_length(&ast) >= self.max_list_len
2286        {
2287            return Err(self.error(*ast.span(), ast::ErrorKind::UnsupportedResharpRegex));
2288        }
2289        Ok(ast)
2290    }
2291
2292    fn parse(&mut self, tb: &mut TB<'s>) -> Result<NodeId> {
2293        let ast = self.parse_inner()?;
2294        if let Err(span) = ensure_lookbehind_at_start(&ast, true) {
2295            return Err(self.error(span, ast::ErrorKind::UnsupportedResharpRegex));
2296        }
2297        self.ast_to_node_id(&ast, &mut None, tb)
2298    }
2299
2300    #[inline(never)]
2301    fn parse_uncounted_repetition(
2302        &self,
2303        mut concat: ast::Concat,
2304        kind: ast::RepetitionKind,
2305    ) -> Result<ast::Concat> {
2306        // assert!(self.char() == '?' || self.char() == '*' || self.char() == '+');
2307        let op_start = self.pos();
2308        let ast = match concat.asts.pop() {
2309            Some(ast) => ast,
2310            None => return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing)),
2311        };
2312        match ast {
2313            Ast::Empty(_) | Ast::Flags(_) => {
2314                return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing))
2315            }
2316            _ => {}
2317        }
2318        if self.bump() && self.char() == '?' {
2319            return Err(self.error(
2320                Span::new(op_start, self.pos()),
2321                ast::ErrorKind::UnsupportedLazyQuantifier,
2322            ));
2323        }
2324        concat.asts.push(Ast::repetition(ast::Repetition {
2325            span: ast.span().with_end(self.pos()),
2326            op: ast::RepetitionOp {
2327                span: Span::new(op_start, self.pos()),
2328                kind,
2329            },
2330            greedy: true,
2331            ast: Box::new(ast),
2332        }));
2333        Ok(concat)
2334    }
2335
2336    #[inline(never)]
2337    fn parse_counted_repetition(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
2338        assert!(self.char() == '{');
2339        let start = self.pos();
2340        let ast = match concat.asts.pop() {
2341            Some(ast) => ast,
2342            None => return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing)),
2343        };
2344        match ast {
2345            Ast::Empty(_) | Ast::Flags(_) => {
2346                return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing))
2347            }
2348            _ => {}
2349        }
2350        if !self.bump_and_bump_space() {
2351            return Err(self.error(
2352                Span::new(start, self.pos()),
2353                ast::ErrorKind::RepetitionCountUnclosed,
2354            ));
2355        }
2356        let count_start = specialize_err(
2357            self.parse_decimal(),
2358            ast::ErrorKind::DecimalEmpty,
2359            ast::ErrorKind::RepetitionCountDecimalEmpty,
2360        );
2361        if self.is_eof() {
2362            return Err(self.error(
2363                Span::new(start, self.pos()),
2364                ast::ErrorKind::RepetitionCountUnclosed,
2365            ));
2366        }
2367        let range = if self.char() == ',' {
2368            if !self.bump_and_bump_space() {
2369                return Err(self.error(
2370                    Span::new(start, self.pos()),
2371                    ast::ErrorKind::RepetitionCountUnclosed,
2372                ));
2373            }
2374            if self.char() != '}' {
2375                let count_start = match count_start {
2376                    Ok(c) => c,
2377                    Err(err) if err.kind == ast::ErrorKind::RepetitionCountDecimalEmpty => {
2378                        if self.parser().empty_min_range {
2379                            0
2380                        } else {
2381                            return Err(err);
2382                        }
2383                    }
2384                    err => err?,
2385                };
2386                let count_end = specialize_err(
2387                    self.parse_decimal(),
2388                    ast::ErrorKind::DecimalEmpty,
2389                    ast::ErrorKind::RepetitionCountDecimalEmpty,
2390                )?;
2391                ast::RepetitionRange::Bounded(count_start, count_end)
2392            } else {
2393                ast::RepetitionRange::AtLeast(count_start?)
2394            }
2395        } else {
2396            ast::RepetitionRange::Exactly(count_start?)
2397        };
2398
2399        if self.is_eof() || self.char() != '}' {
2400            return Err(self.error(
2401                Span::new(start, self.pos()),
2402                ast::ErrorKind::RepetitionCountUnclosed,
2403            ));
2404        }
2405
2406        if self.bump_and_bump_space() && self.char() == '?' {
2407            return Err(self.error(
2408                Span::new(start, self.pos()),
2409                ast::ErrorKind::UnsupportedLazyQuantifier,
2410            ));
2411        }
2412
2413        let op_span = Span::new(start, self.pos());
2414        if !range.is_valid() {
2415            return Err(self.error(op_span, ast::ErrorKind::RepetitionCountInvalid));
2416        }
2417
2418        let over_limit = match &range {
2419            ast::RepetitionRange::Exactly(n) => *n > self.max_repeat,
2420            ast::RepetitionRange::AtLeast(n) => *n > self.max_repeat,
2421            ast::RepetitionRange::Bounded(n, m) => *n > self.max_repeat || *m > self.max_repeat,
2422        };
2423        if over_limit {
2424            return Err(self.error(op_span, ast::ErrorKind::UnsupportedResharpRegex));
2425        }
2426        concat.asts.push(Ast::repetition(ast::Repetition {
2427            span: ast.span().with_end(self.pos()),
2428            op: ast::RepetitionOp {
2429                span: op_span,
2430                kind: ast::RepetitionKind::Range(range),
2431            },
2432            greedy: true,
2433            ast: Box::new(ast),
2434        }));
2435        Ok(concat)
2436    }
2437
2438    #[inline(never)]
2439    fn parse_group(&self) -> Result<Either<ast::SetFlags, ast::Group>> {
2440        assert_eq!(self.char(), '(');
2441        let open_span = self.span_char();
2442        self.bump();
2443        self.bump_space();
2444        if let Some((ahead, pos)) = self.is_lookaround_prefix() {
2445            let kind = match (pos, ahead) {
2446                (true, true) => LookaroundKind::PositiveLookahead,
2447                (true, false) => LookaroundKind::PositiveLookbehind,
2448                (false, true) => LookaroundKind::NegativeLookahead,
2449                (false, false) => LookaroundKind::NegativeLookbehind,
2450            };
2451            return Ok(Either::Right(ast::Group {
2452                span: open_span,
2453                kind: ast::GroupKind::Lookaround(kind),
2454                ast: Box::new(Ast::empty(self.span())),
2455            }));
2456        }
2457        let inner_span = self.span();
2458        let mut starts_with_p = true;
2459        if self.bump_if("?P<") || {
2460            starts_with_p = false;
2461            self.bump_if("?<")
2462        } {
2463            let capture_index = self.next_capture_index(open_span)?;
2464            let name = self.parse_capture_name(capture_index)?;
2465            Ok(Either::Right(ast::Group {
2466                span: open_span,
2467                kind: ast::GroupKind::CaptureName {
2468                    starts_with_p,
2469                    name,
2470                },
2471                ast: Box::new(Ast::empty(self.span())),
2472            }))
2473        } else if self.bump_if("?") {
2474            if self.is_eof() {
2475                return Err(self.error(open_span, ast::ErrorKind::GroupUnclosed));
2476            }
2477            let flags = self.parse_flags()?;
2478            let char_end = self.char();
2479            self.bump();
2480            if char_end == ')' {
2481                // We don't allow empty flags, e.g., `(?)`. We instead
2482                // interpret it as a repetition operator missing its argument.
2483                if flags.items.is_empty() {
2484                    return Err(self.error(inner_span, ast::ErrorKind::RepetitionMissing));
2485                }
2486                Ok(Either::Left(ast::SetFlags {
2487                    span: Span {
2488                        end: self.pos(),
2489                        ..open_span
2490                    },
2491                    flags,
2492                }))
2493            } else {
2494                assert_eq!(char_end, ':');
2495                Ok(Either::Right(ast::Group {
2496                    span: open_span,
2497                    kind: ast::GroupKind::NonCapturing(flags),
2498                    ast: Box::new(Ast::empty(self.span())),
2499                }))
2500            }
2501        } else {
2502            let capture_index = self.next_capture_index(open_span)?;
2503            Ok(Either::Right(ast::Group {
2504                span: open_span,
2505                kind: ast::GroupKind::CaptureIndex(capture_index),
2506                ast: Box::new(Ast::empty(self.span())),
2507            }))
2508        }
2509    }
2510
2511    #[inline(never)]
2512    fn parse_capture_name(&self, capture_index: u32) -> Result<ast::CaptureName> {
2513        if self.is_eof() {
2514            return Err(self.error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
2515        }
2516        let start = self.pos();
2517        loop {
2518            if self.char() == '>' {
2519                break;
2520            }
2521            if !is_capture_char(self.char(), self.pos() == start) {
2522                return Err(self.error(self.span_char(), ast::ErrorKind::GroupNameInvalid));
2523            }
2524            if !self.bump() {
2525                break;
2526            }
2527        }
2528        let end = self.pos();
2529        if self.is_eof() {
2530            return Err(self.error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
2531        }
2532        assert_eq!(self.char(), '>');
2533        self.bump();
2534        let name = &self.pattern()[start.offset..end.offset];
2535        if name.is_empty() {
2536            return Err(self.error(Span::new(start, start), ast::ErrorKind::GroupNameEmpty));
2537        }
2538        let capname = ast::CaptureName {
2539            span: Span::new(start, end),
2540            name: name.to_string(),
2541            index: capture_index,
2542        };
2543        self.add_capture_name(&capname)?;
2544        Ok(capname)
2545    }
2546
2547    #[inline(never)]
2548    fn parse_flags(&self) -> Result<ast::Flags> {
2549        let mut flags = ast::Flags {
2550            span: self.span(),
2551            items: vec![],
2552        };
2553        let mut last_was_negation = None;
2554        while self.char() != ':' && self.char() != ')' {
2555            if self.char() == '-' {
2556                last_was_negation = Some(self.span_char());
2557                let item = ast::FlagsItem {
2558                    span: self.span_char(),
2559                    kind: ast::FlagsItemKind::Negation,
2560                };
2561                if let Some(i) = flags.add_item(item) {
2562                    return Err(self.error(
2563                        self.span_char(),
2564                        ast::ErrorKind::FlagRepeatedNegation {
2565                            original: flags.items[i].span,
2566                        },
2567                    ));
2568                }
2569            } else {
2570                last_was_negation = None;
2571                let item = ast::FlagsItem {
2572                    span: self.span_char(),
2573                    kind: ast::FlagsItemKind::Flag(self.parse_flag()?),
2574                };
2575                if let Some(i) = flags.add_item(item) {
2576                    return Err(self.error(
2577                        self.span_char(),
2578                        ast::ErrorKind::FlagDuplicate {
2579                            original: flags.items[i].span,
2580                        },
2581                    ));
2582                }
2583            }
2584            if !self.bump() {
2585                return Err(self.error(self.span(), ast::ErrorKind::FlagUnexpectedEof));
2586            }
2587        }
2588        if let Some(span) = last_was_negation {
2589            return Err(self.error(span, ast::ErrorKind::FlagDanglingNegation));
2590        }
2591        flags.span.end = self.pos();
2592        Ok(flags)
2593    }
2594
2595    #[inline(never)]
2596    fn parse_flag(&self) -> Result<ast::Flag> {
2597        match self.char() {
2598            'i' => Ok(ast::Flag::CaseInsensitive),
2599            'm' => Ok(ast::Flag::MultiLine),
2600            's' => Ok(ast::Flag::DotMatchesNewLine),
2601            'U' => Ok(ast::Flag::SwapGreed),
2602            'u' => Ok(ast::Flag::Unicode),
2603            'R' => Ok(ast::Flag::CRLF),
2604            'x' => Ok(ast::Flag::IgnoreWhitespace),
2605            _ => Err(self.error(self.span_char(), ast::ErrorKind::FlagUnrecognized)),
2606        }
2607    }
2608
2609    fn parse_primitive(&self) -> Result<Primitive> {
2610        match self.char() {
2611            '\\' => self.parse_escape(),
2612            '_' => {
2613                let ast = Primitive::Top(self.span_char());
2614                self.bump();
2615                Ok(ast)
2616            }
2617            '.' => {
2618                let ast = Primitive::Dot(self.span_char());
2619                self.bump();
2620                Ok(ast)
2621            }
2622            '^' => {
2623                let ast = Primitive::Assertion(ast::Assertion {
2624                    span: self.span_char(),
2625                    kind: ast::AssertionKind::StartLine,
2626                });
2627                self.bump();
2628                Ok(ast)
2629            }
2630            '$' => {
2631                let ast = Primitive::Assertion(ast::Assertion {
2632                    span: self.span_char(),
2633                    kind: ast::AssertionKind::EndLine,
2634                });
2635                self.bump();
2636                Ok(ast)
2637            }
2638            c => {
2639                let ast = Primitive::Literal(Literal {
2640                    span: self.span_char(),
2641                    kind: LiteralKind::Verbatim,
2642                    c,
2643                });
2644                self.bump();
2645                Ok(ast)
2646            }
2647        }
2648    }
2649
2650    #[inline(never)]
2651    fn parse_escape(&self) -> Result<Primitive> {
2652        assert_eq!(self.char(), '\\');
2653        let start = self.pos();
2654        if !self.bump() {
2655            return Err(self.error(
2656                Span::new(start, self.pos()),
2657                ast::ErrorKind::EscapeUnexpectedEof,
2658            ));
2659        }
2660        let c = self.char();
2661        // Put some of the more complicated routines into helpers.
2662        match c {
2663            '0'..='9' => {
2664                if !self.parser().octal {
2665                    return Err(self.error(
2666                        Span::new(start, self.span_char().end),
2667                        ast::ErrorKind::UnsupportedBackreference,
2668                    ));
2669                }
2670                let mut lit = self.parse_octal();
2671                lit.span.start = start;
2672                return Ok(Primitive::Literal(lit));
2673            }
2674            'x' | 'u' | 'U' => {
2675                let mut lit = self.parse_hex()?;
2676                lit.span.start = start;
2677                return Ok(Primitive::Literal(lit));
2678            }
2679            'p' | 'P' => {
2680                let mut cls = self.parse_unicode_class()?;
2681                cls.span.start = start;
2682                return Ok(Primitive::Unicode(cls));
2683            }
2684            'd' | 's' | 'w' | 'D' | 'S' | 'W' => {
2685                let mut cls = self.parse_perl_class();
2686                cls.span.start = start;
2687                return Ok(Primitive::Perl(cls));
2688            }
2689            _ => {}
2690        }
2691
2692        // Handle all of the one letter sequences inline.
2693        self.bump();
2694        let span = Span::new(start, self.pos());
2695        if is_meta_character(c) {
2696            return Ok(Primitive::Literal(Literal {
2697                span,
2698                kind: LiteralKind::Meta,
2699                c,
2700            }));
2701        }
2702        if is_escapeable_character(c) {
2703            return Ok(Primitive::Literal(Literal {
2704                span,
2705                kind: LiteralKind::Superfluous,
2706                c,
2707            }));
2708        }
2709        let special = |kind, c| {
2710            Ok(Primitive::Literal(Literal {
2711                span,
2712                kind: LiteralKind::Special(kind),
2713                c,
2714            }))
2715        };
2716        match c {
2717            'a' => special(SpecialLiteralKind::Bell, '\x07'),
2718            'f' => special(SpecialLiteralKind::FormFeed, '\x0C'),
2719            't' => special(SpecialLiteralKind::Tab, '\t'),
2720            'n' => special(SpecialLiteralKind::LineFeed, '\n'),
2721            'r' => special(SpecialLiteralKind::CarriageReturn, '\r'),
2722            'v' => special(SpecialLiteralKind::VerticalTab, '\x0B'),
2723            'A' => Ok(Primitive::Assertion(ast::Assertion {
2724                span,
2725                kind: ast::AssertionKind::StartText,
2726            })),
2727            'z' => Ok(Primitive::Assertion(ast::Assertion {
2728                span,
2729                kind: ast::AssertionKind::EndText,
2730            })),
2731            'b' => {
2732                let mut wb = ast::Assertion {
2733                    span,
2734                    kind: ast::AssertionKind::WordBoundary,
2735                };
2736                // After a \b, we "try" to parse things like \b{start} for
2737                // special word boundary assertions.
2738                if !self.is_eof() && self.char() == '{' {
2739                    if let Some(kind) = self.maybe_parse_special_word_boundary(start)? {
2740                        wb.kind = kind;
2741                        wb.span.end = self.pos();
2742                    }
2743                }
2744                Ok(Primitive::Assertion(wb))
2745            }
2746            'B' => Ok(Primitive::Assertion(ast::Assertion {
2747                span,
2748                kind: ast::AssertionKind::NotWordBoundary,
2749            })),
2750            '<' => Ok(Primitive::Assertion(ast::Assertion {
2751                span,
2752                kind: ast::AssertionKind::WordBoundaryStartAngle,
2753            })),
2754            '>' => Ok(Primitive::Assertion(ast::Assertion {
2755                span,
2756                kind: ast::AssertionKind::WordBoundaryEndAngle,
2757            })),
2758            _ => Err(self.error(span, ast::ErrorKind::EscapeUnrecognized)),
2759        }
2760    }
2761
2762    fn maybe_parse_special_word_boundary(
2763        &self,
2764        wb_start: Position,
2765    ) -> Result<Option<ast::AssertionKind>> {
2766        assert_eq!(self.char(), '{');
2767
2768        let is_valid_char = |c| matches!(c, 'A'..='Z' | 'a'..='z' | '-');
2769        let start = self.pos();
2770        if !self.bump_and_bump_space() {
2771            return Err(self.error(
2772                Span::new(wb_start, self.pos()),
2773                ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof,
2774            ));
2775        }
2776        let start_contents = self.pos();
2777        if !is_valid_char(self.char()) {
2778            self.parser().pos.set(start);
2779            return Ok(None);
2780        }
2781
2782        // Now collect up our chars until we see a '}'.
2783        let mut scratch = self.parser().scratch.borrow_mut();
2784        scratch.clear();
2785        while !self.is_eof() && is_valid_char(self.char()) {
2786            scratch.push(self.char());
2787            self.bump_and_bump_space();
2788        }
2789        if self.is_eof() || self.char() != '}' {
2790            return Err(self.error(
2791                Span::new(start, self.pos()),
2792                ast::ErrorKind::SpecialWordBoundaryUnclosed,
2793            ));
2794        }
2795        let end = self.pos();
2796        self.bump();
2797        let kind = match scratch.as_str() {
2798            "start" => ast::AssertionKind::WordBoundaryStart,
2799            "end" => ast::AssertionKind::WordBoundaryEnd,
2800            "start-half" => ast::AssertionKind::WordBoundaryStartHalf,
2801            "end-half" => ast::AssertionKind::WordBoundaryEndHalf,
2802            _ => {
2803                return Err(self.error(
2804                    Span::new(start_contents, end),
2805                    ast::ErrorKind::SpecialWordBoundaryUnrecognized,
2806                ))
2807            }
2808        };
2809        Ok(Some(kind))
2810    }
2811
2812    #[inline(never)]
2813    fn parse_octal(&self) -> Literal {
2814        assert!(self.parser().octal);
2815        assert!('0' <= self.char() && self.char() <= '7');
2816        let start = self.pos();
2817        // Parse up to two more digits.
2818        while self.bump()
2819            && '0' <= self.char()
2820            && self.char() <= '7'
2821            && self.pos().offset - start.offset <= 2
2822        {}
2823        let end = self.pos();
2824        let octal = &self.pattern()[start.offset..end.offset];
2825        // Parsing the octal should never fail since the above guarantees a
2826        // valid number.
2827        let codepoint = u32::from_str_radix(octal, 8).expect("valid octal number");
2828        // The max value for 3 digit octal is 0777 = 511 and [0, 511] has no
2829        // invalid Unicode scalar values.
2830        let c = char::from_u32(codepoint).expect("Unicode scalar value");
2831        Literal {
2832            span: Span::new(start, end),
2833            kind: LiteralKind::Octal,
2834            c,
2835        }
2836    }
2837
2838    #[inline(never)]
2839    fn parse_hex(&self) -> Result<Literal> {
2840        assert!(self.char() == 'x' || self.char() == 'u' || self.char() == 'U');
2841
2842        let hex_kind = match self.char() {
2843            'x' => HexLiteralKind::X,
2844            'u' => HexLiteralKind::UnicodeShort,
2845            _ => HexLiteralKind::UnicodeLong,
2846        };
2847        if !self.bump_and_bump_space() {
2848            return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2849        }
2850        if self.char() == '{' {
2851            self.parse_hex_brace(hex_kind)
2852        } else {
2853            self.parse_hex_digits(hex_kind)
2854        }
2855    }
2856
2857    #[inline(never)]
2858    fn parse_hex_digits(&self, kind: HexLiteralKind) -> Result<Literal> {
2859        let mut scratch = self.parser().scratch.borrow_mut();
2860        scratch.clear();
2861
2862        let start = self.pos();
2863        for i in 0..kind.digits() {
2864            if i > 0 && !self.bump_and_bump_space() {
2865                return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2866            }
2867            if !is_hex(self.char()) {
2868                return Err(self.error(self.span_char(), ast::ErrorKind::EscapeHexInvalidDigit));
2869            }
2870            scratch.push(self.char());
2871        }
2872        self.bump_and_bump_space();
2873        let end = self.pos();
2874        let hex = scratch.as_str();
2875        match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
2876            None => Err(self.error(Span::new(start, end), ast::ErrorKind::EscapeHexInvalid)),
2877            Some(c) => Ok(Literal {
2878                span: Span::new(start, end),
2879                kind: LiteralKind::HexFixed(kind),
2880                c,
2881            }),
2882        }
2883    }
2884
2885    #[inline(never)]
2886    fn parse_hex_brace(&self, kind: HexLiteralKind) -> Result<Literal> {
2887        let mut scratch = self.parser().scratch.borrow_mut();
2888        scratch.clear();
2889
2890        let brace_pos = self.pos();
2891        let start = self.span_char().end;
2892        while self.bump_and_bump_space() && self.char() != '}' {
2893            if !is_hex(self.char()) {
2894                return Err(self.error(self.span_char(), ast::ErrorKind::EscapeHexInvalidDigit));
2895            }
2896            scratch.push(self.char());
2897        }
2898        if self.is_eof() {
2899            return Err(self.error(
2900                Span::new(brace_pos, self.pos()),
2901                ast::ErrorKind::EscapeUnexpectedEof,
2902            ));
2903        }
2904        let end = self.pos();
2905        let hex = scratch.as_str();
2906        assert_eq!(self.char(), '}');
2907        self.bump_and_bump_space();
2908
2909        if hex.is_empty() {
2910            return Err(self.error(
2911                Span::new(brace_pos, self.pos()),
2912                ast::ErrorKind::EscapeHexEmpty,
2913            ));
2914        }
2915        match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
2916            None => Err(self.error(Span::new(start, end), ast::ErrorKind::EscapeHexInvalid)),
2917            Some(c) => Ok(Literal {
2918                span: Span::new(start, self.pos()),
2919                kind: LiteralKind::HexBrace(kind),
2920                c,
2921            }),
2922        }
2923    }
2924
2925    fn parse_decimal(&self) -> Result<u32> {
2926        let mut scratch = self.parser().scratch.borrow_mut();
2927        scratch.clear();
2928
2929        while !self.is_eof() && self.char().is_whitespace() {
2930            self.bump();
2931        }
2932        let start = self.pos();
2933        while !self.is_eof() && '0' <= self.char() && self.char() <= '9' {
2934            scratch.push(self.char());
2935            self.bump_and_bump_space();
2936        }
2937        let span = Span::new(start, self.pos());
2938        while !self.is_eof() && self.char().is_whitespace() {
2939            self.bump_and_bump_space();
2940        }
2941        let digits = scratch.as_str();
2942        if digits.is_empty() {
2943            return Err(self.error(span, ast::ErrorKind::DecimalEmpty));
2944        }
2945        match digits.parse::<u32>().ok() {
2946            Some(n) => Ok(n),
2947            None => Err(self.error(span, ast::ErrorKind::DecimalInvalid)),
2948        }
2949    }
2950
2951    #[inline(never)]
2952    fn parse_set_class(&self) -> Result<ClassBracketed> {
2953        assert_eq!(self.char(), '[');
2954
2955        let mut union = ClassSetUnion {
2956            span: self.span(),
2957            items: vec![],
2958        };
2959        loop {
2960            self.bump_space();
2961            if self.is_eof() {
2962                return Err(self.unclosed_class_error());
2963            }
2964            match self.char() {
2965                '[' => {
2966                    if !self.parser().stack_class.borrow().is_empty() {
2967                        if let Some(cls) = self.maybe_parse_ascii_class() {
2968                            union.push(ClassSetItem::Ascii(cls));
2969                            continue;
2970                        }
2971                    }
2972                    union = self.push_class_open(union)?;
2973                }
2974                ']' => match self.pop_class(union)? {
2975                    Either::Left(nested_union) => {
2976                        union = nested_union;
2977                    }
2978                    Either::Right(class) => return Ok(class),
2979                },
2980                '&' if self.peek() == Some('&') => {
2981                    assert!(self.bump_if("&&"));
2982                    union = self.push_class_op(ClassSetBinaryOpKind::Intersection, union);
2983                }
2984                '-' if self.peek() == Some('-') => {
2985                    assert!(self.bump_if("--"));
2986                    union = self.push_class_op(ClassSetBinaryOpKind::Difference, union);
2987                }
2988                '~' if self.peek() == Some('~') => {
2989                    assert!(self.bump_if("~~"));
2990                    union = self.push_class_op(ClassSetBinaryOpKind::SymmetricDifference, union);
2991                }
2992                _ => {
2993                    union.push(self.parse_set_class_range()?);
2994                }
2995            }
2996        }
2997    }
2998
2999    #[inline(never)]
3000    fn parse_set_class_range(&self) -> Result<ClassSetItem> {
3001        let prim1 = self.parse_set_class_item()?;
3002        self.bump_space();
3003        if self.is_eof() {
3004            return Err(self.unclosed_class_error());
3005        }
3006        if self.char() != '-' || self.peek_space() == Some(']') || self.peek_space() == Some('-') {
3007            return prim1.into_class_set_item(self);
3008        }
3009        if !self.bump_and_bump_space() {
3010            return Err(self.unclosed_class_error());
3011        }
3012        let prim2 = self.parse_set_class_item()?;
3013        let range = ClassSetRange {
3014            span: Span::new(prim1.span().start, prim2.span().end),
3015            start: prim1.into_class_literal(self)?,
3016            end: prim2.into_class_literal(self)?,
3017        };
3018        if !range.is_valid() {
3019            return Err(self.error(range.span, ast::ErrorKind::ClassRangeInvalid));
3020        }
3021        Ok(ClassSetItem::Range(range))
3022    }
3023
3024    #[inline(never)]
3025    fn parse_set_class_item(&self) -> Result<Primitive> {
3026        if self.char() == '\\' {
3027            self.parse_escape()
3028        } else {
3029            let x = Primitive::Literal(Literal {
3030                span: self.span_char(),
3031                kind: LiteralKind::Verbatim,
3032                c: self.char(),
3033            });
3034            self.bump();
3035            Ok(x)
3036        }
3037    }
3038
3039    #[inline(never)]
3040    fn parse_set_class_open(&self) -> Result<(ClassBracketed, ClassSetUnion)> {
3041        assert_eq!(self.char(), '[');
3042        let start = self.pos();
3043        if !self.bump_and_bump_space() {
3044            return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
3045        }
3046
3047        let negated = if self.char() != '^' {
3048            false
3049        } else {
3050            if !self.bump_and_bump_space() {
3051                return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
3052            }
3053            true
3054        };
3055        // Accept any number of `-` as literal `-`.
3056        let mut union = ClassSetUnion {
3057            span: self.span(),
3058            items: vec![],
3059        };
3060        while self.char() == '-' {
3061            union.push(ClassSetItem::Literal(Literal {
3062                span: self.span_char(),
3063                kind: LiteralKind::Verbatim,
3064                c: '-',
3065            }));
3066            if !self.bump_and_bump_space() {
3067                return Err(self.error(Span::new(start, start), ast::ErrorKind::ClassUnclosed));
3068            }
3069        }
3070        // If `]` is the *first* char in a set, then interpret it as a literal
3071        // `]`. That is, an empty class is impossible to write.
3072        if union.items.is_empty() && self.char() == ']' {
3073            union.push(ClassSetItem::Literal(Literal {
3074                span: self.span_char(),
3075                kind: LiteralKind::Verbatim,
3076                c: ']',
3077            }));
3078            if !self.bump_and_bump_space() {
3079                return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
3080            }
3081        }
3082        let set = ClassBracketed {
3083            span: Span::new(start, self.pos()),
3084            negated,
3085            kind: ClassSet::union(ClassSetUnion {
3086                span: Span::new(union.span.start, union.span.start),
3087                items: vec![],
3088            }),
3089        };
3090        Ok((set, union))
3091    }
3092
3093    #[inline(never)]
3094    fn maybe_parse_ascii_class(&self) -> Option<ClassAscii> {
3095        assert_eq!(self.char(), '[');
3096        // If parsing fails, then we back up the parser to this starting point.
3097        let start = self.pos();
3098        let mut negated = false;
3099        if !self.bump() || self.char() != ':' {
3100            self.parser().pos.set(start);
3101            return None;
3102        }
3103        if !self.bump() {
3104            self.parser().pos.set(start);
3105            return None;
3106        }
3107        if self.char() == '^' {
3108            negated = true;
3109            if !self.bump() {
3110                self.parser().pos.set(start);
3111                return None;
3112            }
3113        }
3114        let name_start = self.offset();
3115        while self.char() != ':' && self.bump() {}
3116        if self.is_eof() {
3117            self.parser().pos.set(start);
3118            return None;
3119        }
3120        let name = &self.pattern()[name_start..self.offset()];
3121        if !self.bump_if(":]") {
3122            self.parser().pos.set(start);
3123            return None;
3124        }
3125        let kind = match regex_syntax::ast::ClassAsciiKind::from_name(name) {
3126            Some(kind) => kind,
3127            None => {
3128                self.parser().pos.set(start);
3129                return None;
3130            }
3131        };
3132        Some(ClassAscii {
3133            span: Span::new(start, self.pos()),
3134            kind,
3135            negated,
3136        })
3137    }
3138
3139    #[inline(never)]
3140    fn parse_unicode_class(&self) -> Result<ClassUnicode> {
3141        assert!(self.char() == 'p' || self.char() == 'P');
3142
3143        let mut scratch = self.parser().scratch.borrow_mut();
3144        scratch.clear();
3145
3146        let negated = self.char() == 'P';
3147        if !self.bump_and_bump_space() {
3148            return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
3149        }
3150        let (start, kind) = if self.char() == '{' {
3151            let start = self.span_char().end;
3152            while self.bump_and_bump_space() && self.char() != '}' {
3153                scratch.push(self.char());
3154            }
3155            if self.is_eof() {
3156                return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
3157            }
3158            assert_eq!(self.char(), '}');
3159            self.bump();
3160
3161            let name = scratch.as_str();
3162            if let Some(i) = name.find("!=") {
3163                (
3164                    start,
3165                    ClassUnicodeKind::NamedValue {
3166                        op: ClassUnicodeOpKind::NotEqual,
3167                        name: name[..i].to_string(),
3168                        value: name[i + 2..].to_string(),
3169                    },
3170                )
3171            } else if let Some(i) = name.find(':') {
3172                (
3173                    start,
3174                    ClassUnicodeKind::NamedValue {
3175                        op: ClassUnicodeOpKind::Colon,
3176                        name: name[..i].to_string(),
3177                        value: name[i + 1..].to_string(),
3178                    },
3179                )
3180            } else if let Some(i) = name.find('=') {
3181                (
3182                    start,
3183                    ClassUnicodeKind::NamedValue {
3184                        op: ClassUnicodeOpKind::Equal,
3185                        name: name[..i].to_string(),
3186                        value: name[i + 1..].to_string(),
3187                    },
3188                )
3189            } else {
3190                (start, ClassUnicodeKind::Named(name.to_string()))
3191            }
3192        } else {
3193            let start = self.pos();
3194            let c = self.char();
3195            if c == '\\' {
3196                return Err(self.error(self.span_char(), ast::ErrorKind::UnicodeClassInvalid));
3197            }
3198            self.bump_and_bump_space();
3199            let kind = ClassUnicodeKind::OneLetter(c);
3200            (start, kind)
3201        };
3202        Ok(ClassUnicode {
3203            span: Span::new(start, self.pos()),
3204            negated,
3205            kind,
3206        })
3207    }
3208
3209    #[inline(never)]
3210    fn parse_perl_class(&self) -> ClassPerl {
3211        let c = self.char();
3212        let span = self.span_char();
3213        self.bump();
3214        let (negated, kind) = match c {
3215            'd' => (false, regex_syntax::ast::ClassPerlKind::Digit),
3216            'D' => (true, regex_syntax::ast::ClassPerlKind::Digit),
3217            's' => (false, regex_syntax::ast::ClassPerlKind::Space),
3218            'S' => (true, regex_syntax::ast::ClassPerlKind::Space),
3219            'w' => (false, regex_syntax::ast::ClassPerlKind::Word),
3220            'W' => (true, regex_syntax::ast::ClassPerlKind::Word),
3221            c => panic!("expected valid Perl class but got '{}'", c),
3222        };
3223        ClassPerl {
3224            span,
3225            kind,
3226            negated,
3227        }
3228    }
3229}
3230
3231/// `[\s\S]`, `[\w\W]`, `[\d\D]` etc into `_` canonicalization.
3232/// should really be done as a u32 BDD but good enough for now
3233fn is_universal_perl_pair(item: &regex_syntax::ast::ClassSetItem) -> bool {
3234    use regex_syntax::ast::ClassSetItem;
3235    let items = match item {
3236        ClassSetItem::Union(u) => &u.items,
3237        _ => return false,
3238    };
3239    if items.len() != 2 {
3240        return false;
3241    }
3242    match (&items[0], &items[1]) {
3243        (ClassSetItem::Perl(a), ClassSetItem::Perl(b)) => {
3244            let is_all = a.kind == b.kind && a.negated != b.negated;
3245            is_all
3246        }
3247        _ => false,
3248    }
3249}
3250
3251pub fn max_concat_length(ast: &ast::Ast) -> usize {
3252    match ast {
3253        ast::Ast::Empty(_)
3254        | ast::Ast::Flags(_)
3255        | ast::Ast::Literal(_)
3256        | ast::Ast::Dot(_)
3257        | ast::Ast::Top(_)
3258        | ast::Ast::Assertion(_)
3259        | ast::Ast::ClassUnicode(_)
3260        | ast::Ast::ClassPerl(_)
3261        | ast::Ast::ClassBracketed(_) => 0,
3262        ast::Ast::Group(g) => max_concat_length(&g.ast),
3263        ast::Ast::Complement(c) => max_concat_length(&c.ast),
3264        ast::Ast::Lookaround(l) => max_concat_length(&l.ast),
3265        ast::Ast::Repetition(r) => max_concat_length(&r.ast),
3266        ast::Ast::Concat(c) => c
3267            .asts
3268            .len()
3269            .max(c.asts.iter().map(max_concat_length).max().unwrap_or(0)),
3270        ast::Ast::Alternation(a) => a.asts.iter().map(max_concat_length).max().unwrap_or(0),
3271        ast::Ast::Intersection(i) => i.asts.iter().map(max_concat_length).max().unwrap_or(0),
3272    }
3273}
3274
3275pub fn expanded_ast_size(ast: &ast::Ast, limit: u64) -> u64 {
3276    fn go(ast: &ast::Ast, limit: u64) -> u64 {
3277        match ast {
3278            ast::Ast::Empty(_) | ast::Ast::Flags(_) => 1,
3279            ast::Ast::Literal(_) | ast::Ast::Dot(_) | ast::Ast::Top(_) => 1,
3280            ast::Ast::Assertion(_) => 1,
3281            ast::Ast::ClassUnicode(_) | ast::Ast::ClassPerl(_) | ast::Ast::ClassBracketed(_) => 1,
3282            ast::Ast::Group(g) => go(&g.ast, limit).saturating_add(1).min(limit),
3283            ast::Ast::Complement(c) => go(&c.ast, limit).saturating_add(1).min(limit),
3284            ast::Ast::Lookaround(l) => go(&l.ast, limit).saturating_add(1).min(limit),
3285            ast::Ast::Concat(c) => sum_children(&c.asts, limit),
3286            ast::Ast::Alternation(a) => sum_children(&a.asts, limit),
3287            ast::Ast::Intersection(i) => sum_children(&i.asts, limit),
3288            ast::Ast::Repetition(r) => {
3289                let body = go(&r.ast, limit);
3290                let factor: u64 = match &r.op.kind {
3291                    ast::RepetitionKind::ZeroOrOne => 2,
3292                    ast::RepetitionKind::ZeroOrMore | ast::RepetitionKind::OneOrMore => 2,
3293                    ast::RepetitionKind::Range(ast::RepetitionRange::Exactly(n)) => {
3294                        (*n as u64).max(1)
3295                    }
3296                    ast::RepetitionKind::Range(ast::RepetitionRange::AtLeast(n)) => {
3297                        (*n as u64).max(1).saturating_add(1)
3298                    }
3299                    ast::RepetitionKind::Range(ast::RepetitionRange::Bounded(_, m)) => {
3300                        (*m as u64).max(1)
3301                    }
3302                };
3303                body.saturating_mul(factor).min(limit)
3304            }
3305        }
3306    }
3307    fn sum_children(children: &[ast::Ast], limit: u64) -> u64 {
3308        let mut total: u64 = 0;
3309        for c in children {
3310            total = total.saturating_add(go(c, limit));
3311            if total >= limit {
3312                return limit;
3313            }
3314        }
3315        total
3316    }
3317    go(ast, limit)
3318}
3319
3320pub fn parse_ast<'s>(tb: &mut TB<'s>, pattern: &'s str) -> std::result::Result<NodeId, ParseError> {
3321    let mut p: ResharpParser<'s> = ResharpParser::new(pattern);
3322    p.parse(tb)
3323}
3324
3325pub fn parse_ast_with<'s>(
3326    tb: &mut TB<'s>,
3327    pattern: &'s str,
3328    flags: &PatternFlags,
3329) -> std::result::Result<NodeId, ParseError> {
3330    let mut p: ResharpParser<'s> = ResharpParser::with_flags(pattern, flags);
3331    p.parse(tb)
3332}
3333
3334/// Parse a pattern into the raw AST without converting to algebra nodes.
3335pub fn parse_to_ast(pattern: &str) -> std::result::Result<ast::Ast, ParseError> {
3336    let mut p: ResharpParser = ResharpParser::new(pattern);
3337    p.parse_inner()
3338}
resharp_parser/lib.rs

resharp_parser/
lib.rs