resharp_parser/
lib.rs

1//! Parser for resharp regex patterns.
2//!
3//! Converts regex pattern strings into the node representation used by resharp-algebra.
4
5#![warn(dead_code)]
6pub mod ast;
7use std::cell::{Cell, RefCell};
8
9use ast::{Ast, Concat, ErrorKind, GroupKind, LookaroundKind, RepetitionKind};
10use regex_syntax::{
11    ast::{
12        ClassAscii, ClassBracketed, ClassPerl, ClassSet, ClassSetBinaryOpKind, ClassSetItem,
13        ClassSetRange, ClassSetUnion, ClassUnicode, ClassUnicodeKind, ClassUnicodeOpKind,
14        HexLiteralKind, Literal, LiteralKind, Position, Span, SpecialLiteralKind,
15    },
16    hir::{
17        self,
18        translate::{Translator, TranslatorBuilder},
19    },
20    utf8::Utf8Sequences,
21};
22use resharp_algebra::{Kind, NodeId};
23
24type TB<'s> = resharp_algebra::RegexBuilder;
25
26/// global pattern-level flags, set from `RegexOptions`.
27pub struct PatternFlags {
28    /// `\w`/`\d`/`\s` match full Unicode (true) or ASCII only (false).
29    pub unicode: bool,
30    /// `\w` covers all Unicode word chars including 3- and 4-byte sequences.
31    pub full_unicode: bool,
32    /// global case-insensitive matching.
33    pub case_insensitive: bool,
34    /// `.` matches `\n` (behaves like `_`).
35    pub dot_matches_new_line: bool,
36    /// `^` and `$` match at line boundaries (`\n`) in addition to text boundaries.
37    pub multiline: bool,
38    /// allow whitespace and `#` comments in the pattern.
39    pub ignore_whitespace: bool,
40    /// ASCII `\w`/`\d`/`\s` tables, but
41    /// negated perl classes (`\W`/`\D`/`\S`) and `.` match a full codepoint
42    pub ascii_perl_classes: bool,
43    /// max upper bound on `expanded_ast_size` before parser rejects the
44    /// pattern as too complex. default 50_000.
45    pub expanded_ast_limit: u64,
46    /// max children allowed in any single `Concat`/`Alternation`/
47    /// `Intersection` node. default 4_000.
48    pub max_list_len: usize,
49    /// max upper bound on bounded repetition `{n,m}`. default 500.
50    pub max_repeat: u32,
51    pub max_depth: usize,
52}
53
54// arbitrary safeguards, these will not prevent intentional DoS patterns
55// more to protect you from shooting yourself in the foot
56pub const DEFAULT_MAX_REPEAT: u32 = 500;
57pub const DEFAULT_EXPANDED_AST_LIMIT: u64 = 50_000;
58pub const DEFAULT_MAX_LIST_LEN: usize = 4_000;
59pub const DEFAULT_MAX_DEPTH: usize = 1_000;
60
61impl Default for PatternFlags {
62    fn default() -> Self {
63        Self {
64            unicode: true,
65            full_unicode: false,
66            case_insensitive: false,
67            dot_matches_new_line: false,
68            multiline: true,
69            ignore_whitespace: false,
70            ascii_perl_classes: false,
71            expanded_ast_limit: DEFAULT_EXPANDED_AST_LIMIT,
72            max_list_len: DEFAULT_MAX_LIST_LEN,
73            max_repeat: DEFAULT_MAX_REPEAT,
74            max_depth: DEFAULT_MAX_DEPTH,
75        }
76    }
77}
78
79#[derive(Clone, Copy, PartialEq, Debug)]
80enum WordCharKind {
81    Word,
82    NonWord,
83    MaybeWord,
84    MaybeNonWord,
85    Unknown,
86    Edge,
87}
88
89fn is_word_byte(b: u8) -> bool {
90    b.is_ascii_alphanumeric() || b == b'_'
91}
92
93fn class_set_item_word_kind(item: &regex_syntax::ast::ClassSetItem) -> WordCharKind {
94    use regex_syntax::ast::{ClassPerlKind, ClassSetItem};
95    use WordCharKind::*;
96    match item {
97        ClassSetItem::Empty(_) => Unknown,
98        ClassSetItem::Literal(l) => {
99            if is_word_byte(l.c as u8) {
100                Word
101            } else {
102                NonWord
103            }
104        }
105        ClassSetItem::Range(r) => {
106            let all_word = (r.start.c as u8..=r.end.c as u8).all(is_word_byte);
107            let all_non = (r.start.c as u8..=r.end.c as u8).all(|b| !is_word_byte(b));
108            if all_word {
109                Word
110            } else if all_non {
111                NonWord
112            } else {
113                Unknown
114            }
115        }
116        ClassSetItem::Perl(p) => match (&p.kind, p.negated) {
117            (ClassPerlKind::Word, false) => Word,
118            (ClassPerlKind::Word, true) => NonWord,
119            (ClassPerlKind::Space, false) => NonWord,
120            (ClassPerlKind::Digit, false) => Word,
121            _ => Unknown,
122        },
123        ClassSetItem::Bracketed(b) => class_bracketed_word_kind(b),
124        ClassSetItem::Union(u) => {
125            let mut kind = Unknown;
126            for item in &u.items {
127                let k = class_set_item_word_kind(item);
128                kind = match (kind, k) {
129                    (_, Unknown) => return Unknown,
130                    (Unknown, _) => k,
131                    (Word, Word) => Word,
132                    (NonWord, NonWord) => NonWord,
133                    _ => return Unknown,
134                };
135            }
136            kind
137        }
138        _ => Unknown,
139    }
140}
141
142fn class_bracketed_word_kind(c: &regex_syntax::ast::ClassBracketed) -> WordCharKind {
143    use regex_syntax::ast::{ClassPerlKind, ClassSet, ClassSetItem};
144    use WordCharKind::*;
145    if c.negated {
146        return match &c.kind {
147            ClassSet::Item(ClassSetItem::Perl(p)) if p.kind == ClassPerlKind::Word => {
148                if p.negated {
149                    Word
150                } else {
151                    NonWord
152                }
153            }
154            _ => Unknown,
155        };
156    }
157    match &c.kind {
158        ClassSet::Item(item) => class_set_item_word_kind(item),
159        ClassSet::BinaryOp(_) => Unknown,
160    }
161}
162
163fn ascii_class_lit(span: Span, c: char) -> regex_syntax::ast::Literal {
164    regex_syntax::ast::Literal {
165        span,
166        kind: regex_syntax::ast::LiteralKind::Verbatim,
167        c,
168    }
169}
170
171fn ascii_class_range(span: Span, a: char, b: char) -> regex_syntax::ast::ClassSetItem {
172    regex_syntax::ast::ClassSetItem::Range(regex_syntax::ast::ClassSetRange {
173        span,
174        start: ascii_class_lit(span, a),
175        end: ascii_class_lit(span, b),
176    })
177}
178
179fn ascii_perl_positive(
180    span: Span,
181    kind: &regex_syntax::ast::ClassPerlKind,
182) -> regex_syntax::ast::ClassSetItem {
183    use regex_syntax::ast::{ClassPerlKind, ClassSetItem, ClassSetUnion};
184    match kind {
185        ClassPerlKind::Digit => ascii_class_range(span, '0', '9'),
186        ClassPerlKind::Word => ClassSetItem::Union(ClassSetUnion {
187            span,
188            items: vec![
189                ascii_class_range(span, 'a', 'z'),
190                ascii_class_range(span, 'A', 'Z'),
191                ascii_class_range(span, '0', '9'),
192                ClassSetItem::Literal(ascii_class_lit(span, '_')),
193            ],
194        }),
195        ClassPerlKind::Space => ClassSetItem::Union(ClassSetUnion {
196            span,
197            items: ['\t', '\n', '\x0B', '\x0C', '\r', ' ']
198                .into_iter()
199                .map(|c| ClassSetItem::Literal(ascii_class_lit(span, c)))
200                .collect(),
201        }),
202    }
203}
204
205fn ascii_perl_set_item(
206    span: Span,
207    kind: &regex_syntax::ast::ClassPerlKind,
208    negated: bool,
209) -> regex_syntax::ast::ClassSetItem {
210    use regex_syntax::ast::{ClassBracketed, ClassSet, ClassSetItem};
211    let positive = ascii_perl_positive(span, kind);
212    if negated {
213        ClassSetItem::Bracketed(Box::new(ClassBracketed {
214            span,
215            negated: true,
216            kind: ClassSet::Item(positive),
217        }))
218    } else {
219        positive
220    }
221}
222
223fn rewrite_ascii_perl_set(set: &regex_syntax::ast::ClassSet) -> regex_syntax::ast::ClassSet {
224    use regex_syntax::ast::{ClassSet, ClassSetBinaryOp};
225    match set {
226        ClassSet::Item(item) => ClassSet::Item(rewrite_ascii_perl_item(item)),
227        ClassSet::BinaryOp(op) => ClassSet::BinaryOp(ClassSetBinaryOp {
228            span: op.span,
229            kind: op.kind.clone(),
230            lhs: Box::new(rewrite_ascii_perl_set(&op.lhs)),
231            rhs: Box::new(rewrite_ascii_perl_set(&op.rhs)),
232        }),
233    }
234}
235
236fn rewrite_ascii_perl_item(
237    item: &regex_syntax::ast::ClassSetItem,
238) -> regex_syntax::ast::ClassSetItem {
239    use regex_syntax::ast::{ClassBracketed, ClassSetItem, ClassSetUnion};
240    match item {
241        ClassSetItem::Perl(p) => ascii_perl_set_item(p.span, &p.kind, p.negated),
242        ClassSetItem::Union(u) => ClassSetItem::Union(ClassSetUnion {
243            span: u.span,
244            items: u.items.iter().map(rewrite_ascii_perl_item).collect(),
245        }),
246        ClassSetItem::Bracketed(b) => ClassSetItem::Bracketed(Box::new(ClassBracketed {
247            span: b.span,
248            negated: b.negated,
249            kind: rewrite_ascii_perl_set(&b.kind),
250        })),
251        other => other.clone(),
252    }
253}
254
255#[derive(Clone, Debug, Eq, PartialEq)]
256enum Primitive {
257    Literal(Literal),
258    Assertion(ast::Assertion),
259    Dot(Span),
260    Top(Span),
261    Perl(ClassPerl),
262    Unicode(ClassUnicode),
263}
264
265impl Primitive {
266    fn span(&self) -> &Span {
267        match *self {
268            Primitive::Literal(ref x) => &x.span,
269            Primitive::Assertion(ref x) => &x.span,
270            Primitive::Dot(ref span) => span,
271            Primitive::Top(ref span) => span,
272            Primitive::Perl(ref x) => &x.span,
273            Primitive::Unicode(ref x) => &x.span,
274        }
275    }
276
277    fn into_ast(self) -> Ast {
278        match self {
279            Primitive::Literal(lit) => Ast::literal(lit),
280            Primitive::Assertion(assert) => Ast::assertion(assert),
281            Primitive::Dot(span) => Ast::dot(span),
282            Primitive::Top(span) => Ast::top(span),
283            Primitive::Perl(cls) => Ast::class_perl(cls),
284            Primitive::Unicode(cls) => Ast::class_unicode(cls),
285        }
286    }
287
288    fn into_class_set_item(self, p: &ResharpParser) -> Result<regex_syntax::ast::ClassSetItem> {
289        use self::Primitive::*;
290        use regex_syntax::ast::ClassSetItem;
291
292        match self {
293            Literal(lit) => Ok(ClassSetItem::Literal(lit)),
294            Perl(cls) => Ok(ClassSetItem::Perl(cls)),
295            Unicode(cls) => Ok(ClassSetItem::Unicode(cls)),
296            x => Err(p.error(*x.span(), ast::ErrorKind::ClassEscapeInvalid)),
297        }
298    }
299
300    fn into_class_literal(self, p: &ResharpParser) -> Result<Literal> {
301        use self::Primitive::*;
302
303        match self {
304            Literal(lit) => Ok(lit),
305            x => Err(p.error(*x.span(), ast::ErrorKind::ClassRangeLiteral)),
306        }
307    }
308}
309
310#[derive(Clone, Debug, Eq, PartialEq)]
311pub enum Either<Left, Right> {
312    Left(Left),
313    Right(Right),
314}
315
316#[derive(Clone, Debug, Eq, PartialEq)]
317pub struct ParseError {
318    /// The kind of error.
319    pub kind: ErrorKind,
320    /// The original pattern that the parser generated the error from. Every
321    /// span in an error is a valid range into this string.
322    pattern: String,
323    /// The span of this error.
324    pub span: Span,
325}
326
327impl std::fmt::Display for ParseError {
328    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
329        write!(f, "{:?}: {:?}", self.kind, self.span)
330    }
331}
332impl std::error::Error for ParseError {}
333
334type Result<T> = core::result::Result<T, ParseError>;
335
336#[derive(Clone, Debug)]
337enum GroupState {
338    /// This state is pushed whenever an opening group is found.
339    Group {
340        /// The concatenation immediately preceding the opening group.
341        concat: Concat,
342        /// The group that has been opened. Its sub-AST is always empty.
343        group: ast::Group,
344        /// Whether this group has the `x` flag enabled or not.
345        ignore_whitespace: bool,
346    },
347    Alternation(ast::Alternation),
348    Intersection(ast::Intersection),
349}
350
351#[derive(Clone, Debug)]
352enum ClassState {
353    /// This state is pushed whenever an opening bracket is found.
354    Open {
355        /// The union of class items immediately preceding this class.
356        union: regex_syntax::ast::ClassSetUnion,
357        set: regex_syntax::ast::ClassBracketed,
358    },
359    /// This state is pushed when a operator is seen. When popped, the stored
360    /// set becomes the left hand side of the operator.
361    Op {
362        /// The type of the operation, i.e., &&, -- or ~~.
363        kind: regex_syntax::ast::ClassSetBinaryOpKind,
364        /// The left-hand side of the operator.
365        lhs: regex_syntax::ast::ClassSet,
366    },
367}
368
369/// RE# syntax parser based on the regex-syntax crate.
370pub struct ResharpParser<'s> {
371    perl_classes: Vec<(bool, regex_syntax::ast::ClassPerlKind, NodeId)>,
372    unicode_classes: resharp_algebra::UnicodeClassCache,
373    pub translator: regex_syntax::hir::translate::Translator,
374    pub pattern: &'s str,
375    pos: Cell<Position>,
376    capture_index: Cell<u32>,
377    octal: bool,
378    empty_min_range: bool,
379    ignore_whitespace: Cell<bool>,
380    dot_all: Cell<bool>,
381    multiline: Cell<bool>,
382    global_unicode: bool,
383    global_full_unicode: bool,
384    global_ascii_perl: bool,
385    global_case_insensitive: bool,
386    expanded_ast_limit: u64,
387    max_list_len: usize,
388    max_repeat: u32,
389    max_depth: usize,
390    comments: RefCell<Vec<ast::Comment>>,
391    stack_group: RefCell<Vec<GroupState>>,
392    stack_class: RefCell<Vec<ClassState>>,
393    capture_names: RefCell<Vec<ast::CaptureName>>,
394    scratch: RefCell<String>,
395}
396
397fn specialize_err<T>(result: Result<T>, from: ast::ErrorKind, to: ast::ErrorKind) -> Result<T> {
398    result.map_err(|e| {
399        if e.kind == from {
400            ParseError {
401                kind: to,
402                pattern: e.pattern,
403                span: e.span,
404            }
405        } else {
406            e
407        }
408    })
409}
410
411fn is_capture_char(c: char, first: bool) -> bool {
412    if first {
413        c == '_' || c.is_alphabetic()
414    } else {
415        c == '_' || c == '.' || c == '[' || c == ']' || c.is_alphanumeric()
416    }
417}
418
419pub fn is_meta_character(c: char) -> bool {
420    matches!(
421        c,
422        '\\' | '.'
423            | '+'
424            | '*'
425            | '?'
426            | '('
427            | ')'
428            | '|'
429            | '['
430            | ']'
431            | '{'
432            | '}'
433            | '^'
434            | '$'
435            | '#'
436            | '&'
437            | '-'
438            | '~'
439            | '_'
440    )
441}
442
443/// escapes all resharp meta characters in `text`.
444pub fn escape(text: &str) -> String {
445    let mut buf = String::new();
446    escape_into(text, &mut buf);
447    buf
448}
449
450/// escapes all resharp meta characters in `text` and appends to `buf`.
451pub fn escape_into(text: &str, buf: &mut String) {
452    buf.reserve(text.len());
453    for c in text.chars() {
454        if is_meta_character(c) {
455            buf.push('\\');
456        }
457        buf.push(c);
458    }
459}
460
461pub fn is_escapeable_character(c: char) -> bool {
462    if is_meta_character(c) {
463        return true;
464    }
465    if !c.is_ascii() {
466        return false;
467    }
468    match c {
469        '0'..='9' | 'A'..='Z' | 'a'..='z' => false,
470        '<' | '>' => false,
471        _ => true,
472    }
473}
474
475fn is_hex(c: char) -> bool {
476    c.is_ascii_digit() || ('a'..='f').contains(&c) || ('A'..='F').contains(&c)
477}
478
479fn ensure_lookbehind_at_start(ast: &Ast, at_start: bool) -> core::result::Result<(), Span> {
480    match ast {
481        Ast::Concat(c) => {
482            let mut child_at_start = at_start;
483            for child in &c.asts {
484                ensure_lookbehind_at_start(child, child_at_start)?;
485                if ast_may_consume(child) {
486                    child_at_start = false;
487                }
488            }
489            Ok(())
490        }
491        Ast::Alternation(a) => {
492            for child in &a.asts {
493                ensure_lookbehind_at_start(child, at_start)?;
494            }
495            Ok(())
496        }
497        Ast::Intersection(i) => {
498            for child in &i.asts {
499                ensure_lookbehind_at_start(child, at_start)?;
500            }
501            Ok(())
502        }
503        Ast::Complement(c) => ensure_lookbehind_at_start(&c.ast, at_start),
504        Ast::Group(g) => ensure_lookbehind_at_start(&g.ast, at_start),
505        Ast::Repetition(r) => ensure_lookbehind_at_start(&r.ast, at_start),
506        Ast::Lookaround(g) => {
507            match g.kind {
508                LookaroundKind::PositiveLookbehind | LookaroundKind::NegativeLookbehind => {
509                    if !at_start {
510                        return Err(g.span);
511                    }
512                }
513                LookaroundKind::PositiveLookahead | LookaroundKind::NegativeLookahead => {}
514            }
515            ensure_lookbehind_at_start(&g.ast, true)
516        }
517        Ast::Empty(_)
518        | Ast::Flags(_)
519        | Ast::Literal(_)
520        | Ast::Dot(_)
521        | Ast::Top(_)
522        | Ast::Assertion(_)
523        | Ast::ClassUnicode(_)
524        | Ast::ClassPerl(_)
525        | Ast::ClassBracketed(_) => Ok(()),
526    }
527}
528
529fn ast_may_consume(ast: &Ast) -> bool {
530    match ast {
531        Ast::Empty(_) | Ast::Flags(_) | Ast::Assertion(_) | Ast::Lookaround(_) => false,
532        Ast::Literal(_)
533        | Ast::Dot(_)
534        | Ast::Top(_)
535        | Ast::ClassUnicode(_)
536        | Ast::ClassPerl(_)
537        | Ast::ClassBracketed(_) => true,
538        Ast::Group(g) => ast_may_consume(&g.ast),
539        Ast::Repetition(r) => {
540            if !ast_may_consume(&r.ast) {
541                return false;
542            }
543            match r.op.kind {
544                RepetitionKind::ZeroOrOne
545                | RepetitionKind::ZeroOrMore
546                | RepetitionKind::OneOrMore => true,
547                RepetitionKind::Range(ast::RepetitionRange::Exactly(0)) => false,
548                RepetitionKind::Range(ast::RepetitionRange::Bounded(_, 0)) => false,
549                RepetitionKind::Range(_) => true,
550            }
551        }
552        Ast::Alternation(a) => a.asts.iter().any(ast_may_consume),
553        Ast::Intersection(i) => i.asts.iter().any(ast_may_consume),
554        Ast::Complement(_) => true,
555        Ast::Concat(c) => c.asts.iter().any(ast_may_consume),
556    }
557}
558
559impl<'s> ResharpParser<'s> {
560    fn default_translator_builder(&self) -> TranslatorBuilder {
561        let mut trb = TranslatorBuilder::new();
562        trb.unicode(self.global_unicode);
563        trb.utf8(false);
564        trb.case_insensitive(self.global_case_insensitive);
565        trb
566    }
567
568    pub fn new(pattern: &'s str) -> Self {
569        Self::with_flags(pattern, &PatternFlags::default())
570    }
571
572    pub fn with_flags(pattern: &'s str, flags: &PatternFlags) -> Self {
573        let mut trb = TranslatorBuilder::new();
574        trb.unicode(flags.unicode);
575        trb.utf8(false);
576        trb.case_insensitive(flags.case_insensitive);
577        Self {
578            translator: trb.build(),
579            pattern,
580            perl_classes: vec![],
581            unicode_classes: resharp_algebra::UnicodeClassCache::default(),
582            pos: Cell::new(Position::new(0, 0, 0)),
583            capture_index: Cell::new(0),
584            octal: false,
585            empty_min_range: false,
586            ignore_whitespace: Cell::new(flags.ignore_whitespace),
587            dot_all: Cell::new(flags.dot_matches_new_line),
588            multiline: Cell::new(flags.multiline),
589            global_unicode: flags.unicode || flags.full_unicode || flags.ascii_perl_classes,
590            global_full_unicode: flags.full_unicode,
591            global_ascii_perl: flags.ascii_perl_classes,
592            global_case_insensitive: flags.case_insensitive,
593            expanded_ast_limit: flags.expanded_ast_limit,
594            max_list_len: flags.max_list_len,
595            max_repeat: flags.max_repeat,
596            max_depth: flags.max_depth,
597            comments: RefCell::new(vec![]),
598            stack_group: RefCell::new(vec![]),
599            stack_class: RefCell::new(vec![]),
600            capture_names: RefCell::new(vec![]),
601            scratch: RefCell::new(String::new()),
602        }
603    }
604
605    fn parser(&'_ self) -> &'_ ResharpParser<'_> {
606        self
607    }
608
609    fn pattern(&self) -> &str {
610        self.pattern
611    }
612
613    fn error(&self, span: Span, kind: ast::ErrorKind) -> ParseError {
614        ParseError {
615            kind,
616            pattern: self.pattern().to_string(),
617            span,
618        }
619    }
620
621    fn unsupported_error(&self, _: regex_syntax::hir::Error) -> ParseError {
622        self.error(
623            Span::splat(self.pos()),
624            ast::ErrorKind::UnsupportedResharpRegex,
625        )
626    }
627
628    fn offset(&self) -> usize {
629        self.parser().pos.get().offset
630    }
631
632    fn line(&self) -> usize {
633        self.parser().pos.get().line
634    }
635
636    fn column(&self) -> usize {
637        self.parser().pos.get().column
638    }
639
640    fn next_capture_index(&self, span: Span) -> Result<u32> {
641        let current = self.parser().capture_index.get();
642        let i = current
643            .checked_add(1)
644            .ok_or_else(|| self.error(span, ast::ErrorKind::CaptureLimitExceeded))?;
645        self.parser().capture_index.set(i);
646        Ok(i)
647    }
648
649    fn add_capture_name(&self, cap: &ast::CaptureName) -> Result<()> {
650        let mut names = self.parser().capture_names.borrow_mut();
651        match names.binary_search_by_key(&cap.name.as_str(), |c| c.name.as_str()) {
652            Err(i) => {
653                names.insert(i, cap.clone());
654                Ok(())
655            }
656            Ok(i) => Err(self.error(
657                cap.span,
658                ast::ErrorKind::GroupNameDuplicate {
659                    original: names[i].span,
660                },
661            )),
662        }
663    }
664
665    fn ignore_whitespace(&self) -> bool {
666        self.parser().ignore_whitespace.get()
667    }
668
669    fn char(&self) -> char {
670        self.char_at(self.offset())
671    }
672
673    fn char_at(&self, i: usize) -> char {
674        self.pattern()[i..]
675            .chars()
676            .next()
677            .unwrap_or_else(|| panic!("expected char at offset {}", i))
678    }
679
680    fn bump(&self) -> bool {
681        if self.is_eof() {
682            return false;
683        }
684        let Position {
685            mut offset,
686            mut line,
687            mut column,
688        } = self.pos();
689        if self.char() == '\n' {
690            line = line.checked_add(1).unwrap();
691            column = 1;
692        } else {
693            column = column.checked_add(1).unwrap();
694        }
695        offset += self.char().len_utf8();
696        self.parser().pos.set(Position {
697            offset,
698            line,
699            column,
700        });
701        self.pattern()[self.offset()..].chars().next().is_some()
702    }
703
704    fn bump_if(&self, prefix: &str) -> bool {
705        if self.pattern()[self.offset()..].starts_with(prefix) {
706            for _ in 0..prefix.chars().count() {
707                self.bump();
708            }
709            true
710        } else {
711            false
712        }
713    }
714
715    fn is_lookaround_prefix(&self) -> Option<(bool, bool)> {
716        if self.bump_if("?=") {
717            return Some((true, true));
718        }
719        if self.bump_if("?!") {
720            return Some((true, false));
721        }
722        if self.bump_if("?<=") {
723            return Some((false, true));
724        }
725        if self.bump_if("?<!") {
726            return Some((false, false));
727        }
728        None
729    }
730
731    fn bump_and_bump_space(&self) -> bool {
732        if !self.bump() {
733            return false;
734        }
735        self.bump_space();
736        !self.is_eof()
737    }
738
739    fn bump_space(&self) {
740        if !self.ignore_whitespace() {
741            return;
742        }
743        while !self.is_eof() {
744            if self.char().is_whitespace() {
745                self.bump();
746            } else if self.char() == '#' {
747                let start = self.pos();
748                let mut comment_text = String::new();
749                self.bump();
750                while !self.is_eof() {
751                    let c = self.char();
752                    self.bump();
753                    if c == '\n' {
754                        break;
755                    }
756                    comment_text.push(c);
757                }
758                let comment = ast::Comment {
759                    span: Span::new(start, self.pos()),
760                    comment: comment_text,
761                };
762                self.parser().comments.borrow_mut().push(comment);
763            } else {
764                break;
765            }
766        }
767    }
768
769    fn peek(&self) -> Option<char> {
770        if self.is_eof() {
771            return None;
772        }
773        self.pattern()[self.offset() + self.char().len_utf8()..]
774            .chars()
775            .next()
776    }
777
778    /// Like peek, but will ignore spaces when the parser is in whitespace
779    /// insensitive mode.
780    fn peek_space(&self) -> Option<char> {
781        if !self.ignore_whitespace() {
782            return self.peek();
783        }
784        if self.is_eof() {
785            return None;
786        }
787        let mut start = self.offset() + self.char().len_utf8();
788        let mut in_comment = false;
789        for (i, c) in self.pattern()[start..].char_indices() {
790            if c.is_whitespace() {
791                continue;
792            } else if !in_comment && c == '#' {
793                in_comment = true;
794            } else if in_comment && c == '\n' {
795                in_comment = false;
796            } else {
797                start += i;
798                break;
799            }
800        }
801        self.pattern()[start..].chars().next()
802    }
803
804    fn is_eof(&self) -> bool {
805        self.offset() == self.pattern().len()
806    }
807
808    fn pos(&self) -> Position {
809        self.parser().pos.get()
810    }
811
812    fn span(&self) -> Span {
813        Span::splat(self.pos())
814    }
815
816    fn span_char(&self) -> Span {
817        let mut next = Position {
818            offset: self.offset().checked_add(self.char().len_utf8()).unwrap(),
819            line: self.line(),
820            column: self.column().checked_add(1).unwrap(),
821        };
822        if self.char() == '\n' {
823            next.line += 1;
824            next.column = 1;
825        }
826        Span::new(self.pos(), next)
827    }
828
829    #[inline(never)]
830    fn push_alternate(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
831        assert_eq!(self.char(), '|');
832        concat.span.end = self.pos();
833        self.push_or_add_alternation(concat);
834        self.bump();
835        Ok(ast::Concat {
836            span: self.span(),
837            asts: vec![],
838        })
839    }
840
841    fn push_or_add_alternation(&self, concat: Concat) {
842        use self::GroupState::*;
843
844        let mut stack = self.parser().stack_group.borrow_mut();
845        if let Some(&mut Alternation(ref mut alts)) = stack.last_mut() {
846            alts.asts.push(concat.into_ast());
847            return;
848        }
849        stack.push(Alternation(ast::Alternation {
850            span: Span::new(concat.span.start, self.pos()),
851            asts: vec![concat.into_ast()],
852        }));
853    }
854
855    #[inline(never)]
856    fn push_intersect(&self, mut concat: Concat) -> Result<Concat> {
857        assert_eq!(self.char(), '&');
858        concat.span.end = self.pos();
859        self.push_or_add_intersect(concat);
860        self.bump();
861        Ok(Concat {
862            span: self.span(),
863            asts: vec![],
864        })
865    }
866
867    fn push_or_add_intersect(&self, concat: Concat) {
868        use self::GroupState::*;
869
870        let mut stack = self.parser().stack_group.borrow_mut();
871        if let Some(&mut Intersection(ref mut alts)) = stack.last_mut() {
872            alts.asts.push(concat.into_ast());
873            return;
874        }
875        stack.push(Intersection(ast::Intersection {
876            span: Span::new(concat.span.start, self.pos()),
877            asts: vec![concat.into_ast()],
878        }));
879    }
880
881    #[inline(never)]
882    fn push_group(&self, mut concat: Concat) -> Result<Concat> {
883        assert_eq!(self.char(), '(');
884        match self.parse_group()? {
885            Either::Left(set) => {
886                let ignore = set.flags.flag_state(ast::Flag::IgnoreWhitespace);
887                if let Some(v) = ignore {
888                    self.parser().ignore_whitespace.set(v);
889                }
890
891                concat.asts.push(Ast::flags(set));
892                Ok(concat)
893            }
894            Either::Right(group) => {
895                let old_ignore_whitespace = self.ignore_whitespace();
896                let new_ignore_whitespace = group
897                    .flags()
898                    .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
899                    .unwrap_or(old_ignore_whitespace);
900                self.parser()
901                    .stack_group
902                    .borrow_mut()
903                    .push(GroupState::Group {
904                        concat,
905                        group,
906                        ignore_whitespace: old_ignore_whitespace,
907                    });
908                self.parser().ignore_whitespace.set(new_ignore_whitespace);
909                Ok(Concat {
910                    span: self.span(),
911                    asts: vec![],
912                })
913            }
914        }
915    }
916
917    #[inline(never)]
918    fn push_compl_group(&self, concat: Concat) -> Result<Concat> {
919        assert_eq!(self.char(), '~');
920        self.bump();
921        if self.is_eof() || self.char() != '(' {
922            return Err(self.error(self.span(), ast::ErrorKind::ComplementGroupExpected));
923        }
924        let open_span = self.span_char();
925        self.bump();
926        let group = ast::Group {
927            span: open_span,
928            kind: ast::GroupKind::Complement,
929            ast: Box::new(Ast::empty(self.span())),
930        };
931
932        let old_ignore_whitespace = self.ignore_whitespace();
933        let new_ignore_whitespace = group
934            .flags()
935            .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
936            .unwrap_or(old_ignore_whitespace);
937        self.parser()
938            .stack_group
939            .borrow_mut()
940            .push(GroupState::Group {
941                concat,
942                group,
943                ignore_whitespace: old_ignore_whitespace,
944            });
945        self.parser().ignore_whitespace.set(new_ignore_whitespace);
946        Ok(Concat {
947            span: self.span(),
948            asts: vec![],
949        })
950    }
951
952    #[inline(never)]
953    fn pop_group(&self, mut group_concat: Concat) -> Result<Concat> {
954        use self::GroupState::*;
955        assert_eq!(self.char(), ')');
956        let mut stack = self.parser().stack_group.borrow_mut();
957        let topstack = stack.pop();
958
959        let (mut prior_concat, mut group, ignore_whitespace, alt) = match topstack {
960            Some(Group {
961                concat,
962                group,
963                ignore_whitespace,
964            }) => (concat, group, ignore_whitespace, None),
965            Some(Alternation(alt)) => match stack.pop() {
966                Some(Group {
967                    concat,
968                    group,
969                    ignore_whitespace,
970                }) => (
971                    concat,
972                    group,
973                    ignore_whitespace,
974                    Some(Either::Left::<ast::Alternation, ast::Intersection>(alt)),
975                ),
976                None | Some(Alternation(_)) | Some(Intersection(_)) => {
977                    return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
978                }
979            },
980            Some(Intersection(int)) => match stack.pop() {
981                Some(Group {
982                    concat,
983                    group,
984                    ignore_whitespace,
985                }) => (
986                    concat,
987                    group,
988                    ignore_whitespace,
989                    Some(Either::Right::<ast::Alternation, ast::Intersection>(int)),
990                ),
991                None | Some(Alternation(_)) | Some(Intersection(_)) => {
992                    return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
993                }
994            },
995
996            None => {
997                return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
998            }
999        };
1000        self.parser().ignore_whitespace.set(ignore_whitespace);
1001        group_concat.span.end = self.pos();
1002        self.bump();
1003        group.span.end = self.pos();
1004        match alt {
1005            Some(Either::Left(mut alt)) => {
1006                alt.span.end = group_concat.span.end;
1007                alt.asts.push(group_concat.into_ast());
1008                group.ast = Box::new(alt.into_ast());
1009            }
1010            Some(Either::Right(mut int)) => {
1011                int.span.end = group_concat.span.end;
1012                int.asts.push(group_concat.into_ast());
1013                group.ast = Box::new(int.into_ast());
1014            }
1015            None => {
1016                group.ast = Box::new(group_concat.into_ast());
1017            }
1018        }
1019
1020        if group.kind == GroupKind::Complement {
1021            let complement = ast::Complement {
1022                span: self.span(),
1023                ast: group.ast,
1024            };
1025            prior_concat.asts.push(Ast::complement(complement));
1026        }
1027        // ignore groups for now
1028        else {
1029            prior_concat.asts.push(Ast::group(group));
1030        }
1031        Ok(prior_concat)
1032    }
1033
1034    #[inline(never)]
1035    fn pop_group_end(&self, mut concat: ast::Concat) -> Result<Ast> {
1036        concat.span.end = self.pos();
1037        let mut stack = self.parser().stack_group.borrow_mut();
1038        let ast = match stack.pop() {
1039            None => Ok(concat.into_ast()),
1040            Some(GroupState::Alternation(mut alt)) => {
1041                alt.span.end = self.pos();
1042                alt.asts.push(concat.into_ast());
1043                Ok(Ast::alternation(alt))
1044            }
1045            Some(GroupState::Intersection(mut int)) => {
1046                int.span.end = self.pos();
1047                int.asts.push(concat.into_ast());
1048
1049                Ok(Ast::intersection(int))
1050            }
1051            Some(GroupState::Group { group, .. }) => {
1052                return Err(self.error(group.span, ast::ErrorKind::GroupUnclosed));
1053            }
1054        };
1055        // If we try to pop again, there should be nothing.
1056        match stack.pop() {
1057            None => ast,
1058            Some(GroupState::Alternation(alt)) => {
1059                Err(self.error(alt.span, ast::ErrorKind::UnsupportedResharpRegex))
1060            }
1061            Some(GroupState::Intersection(int)) => {
1062                Err(self.error(int.span, ast::ErrorKind::UnsupportedResharpRegex))
1063            }
1064            Some(GroupState::Group { group, .. }) => {
1065                Err(self.error(group.span, ast::ErrorKind::GroupUnclosed))
1066            }
1067        }
1068    }
1069
1070    #[inline(never)]
1071    fn push_class_open(
1072        &self,
1073        parent_union: regex_syntax::ast::ClassSetUnion,
1074    ) -> Result<regex_syntax::ast::ClassSetUnion> {
1075        assert_eq!(self.char(), '[');
1076
1077        let (nested_set, nested_union) = self.parse_set_class_open()?;
1078        self.parser()
1079            .stack_class
1080            .borrow_mut()
1081            .push(ClassState::Open {
1082                union: parent_union,
1083                set: nested_set,
1084            });
1085        Ok(nested_union)
1086    }
1087
1088    #[inline(never)]
1089    fn pop_class(
1090        &self,
1091        nested_union: regex_syntax::ast::ClassSetUnion,
1092    ) -> Result<Either<regex_syntax::ast::ClassSetUnion, regex_syntax::ast::ClassBracketed>> {
1093        assert_eq!(self.char(), ']');
1094
1095        let item = regex_syntax::ast::ClassSet::Item(nested_union.into_item());
1096        let prevset = self.pop_class_op(item);
1097        let mut stack = self.parser().stack_class.borrow_mut();
1098        match stack.pop() {
1099            None => panic!("unexpected empty character class stack"),
1100            Some(ClassState::Op { .. }) => panic!("unexpected ClassState::Op"),
1101            Some(ClassState::Open { mut union, mut set }) => {
1102                self.bump();
1103                set.span.end = self.pos();
1104                set.kind = prevset;
1105                if stack.is_empty() {
1106                    Ok(Either::Right(set))
1107                } else {
1108                    union.push(regex_syntax::ast::ClassSetItem::Bracketed(Box::new(set)));
1109                    Ok(Either::Left(union))
1110                }
1111            }
1112        }
1113    }
1114
1115    #[inline(never)]
1116    fn unclosed_class_error(&self) -> ParseError {
1117        for state in self.parser().stack_class.borrow().iter().rev() {
1118            if let ClassState::Open { ref set, .. } = *state {
1119                return self.error(set.span, ast::ErrorKind::ClassUnclosed);
1120            }
1121        }
1122        panic!("no open character class found")
1123    }
1124
1125    #[inline(never)]
1126    fn push_class_op(
1127        &self,
1128        next_kind: regex_syntax::ast::ClassSetBinaryOpKind,
1129        next_union: regex_syntax::ast::ClassSetUnion,
1130    ) -> regex_syntax::ast::ClassSetUnion {
1131        let item = regex_syntax::ast::ClassSet::Item(next_union.into_item());
1132        let new_lhs = self.pop_class_op(item);
1133        self.parser().stack_class.borrow_mut().push(ClassState::Op {
1134            kind: next_kind,
1135            lhs: new_lhs,
1136        });
1137        regex_syntax::ast::ClassSetUnion {
1138            span: self.span(),
1139            items: vec![],
1140        }
1141    }
1142
1143    #[inline(never)]
1144    fn pop_class_op(&self, rhs: regex_syntax::ast::ClassSet) -> regex_syntax::ast::ClassSet {
1145        let mut stack = self.parser().stack_class.borrow_mut();
1146        let (kind, lhs) = match stack.pop() {
1147            Some(ClassState::Op { kind, lhs }) => (kind, lhs),
1148            Some(state @ ClassState::Open { .. }) => {
1149                stack.push(state);
1150                return rhs;
1151            }
1152            None => unreachable!(),
1153        };
1154        let span = Span::new(lhs.span().start, rhs.span().end);
1155        regex_syntax::ast::ClassSet::BinaryOp(regex_syntax::ast::ClassSetBinaryOp {
1156            span,
1157            kind,
1158            lhs: Box::new(lhs),
1159            rhs: Box::new(rhs),
1160        })
1161    }
1162
1163    fn hir_to_node_id(&self, hir: &hir::Hir, tb: &mut TB<'s>) -> Result<NodeId> {
1164        match hir.kind() {
1165            hir::HirKind::Empty => Ok(NodeId::EPS),
1166            hir::HirKind::Literal(l) => {
1167                if l.0.len() == 1 {
1168                    let node = tb.mk_u8(l.0[0]);
1169                    Ok(node)
1170                } else {
1171                    let ws: Vec<_> = l.0.iter().map(|l| tb.mk_u8(*l)).collect();
1172                    let conc = tb.mk_concats(ws.iter().copied());
1173                    Ok(conc)
1174                }
1175            }
1176            hir::HirKind::Class(class) => match class {
1177                hir::Class::Unicode(class_unicode) => {
1178                    let ranges = class_unicode.ranges();
1179                    if ranges.len() == 1
1180                        && ranges[0].start() == '\u{0}'
1181                        && ranges[0].end() == '\u{10FFFF}'
1182                    {
1183                        return Ok(tb.mk_range_u8(0, 255));
1184                    }
1185                    let mut nodes = Vec::new();
1186                    for range in ranges {
1187                        for seq in Utf8Sequences::new(range.start(), range.end()) {
1188                            let sl = seq.as_slice();
1189                            let bytes: Vec<_> = sl.iter().map(|s| (s.start, s.end)).collect();
1190                            let node = match bytes.len() {
1191                                1 => tb.mk_range_u8(bytes[0].0, bytes[0].1),
1192                                n => {
1193                                    let last = tb.mk_range_u8(bytes[n - 1].0, bytes[n - 1].1);
1194                                    let mut conc = last;
1195                                    for i in (0..n - 1).rev() {
1196                                        let b = tb.mk_range_u8(bytes[i].0, bytes[i].1);
1197                                        conc = tb.mk_concat(b, conc);
1198                                    }
1199                                    conc
1200                                }
1201                            };
1202                            nodes.push(node);
1203                        }
1204                    }
1205                    let merged = tb.mk_unions(nodes.into_iter());
1206                    Ok(merged)
1207                }
1208                hir::Class::Bytes(class_bytes) => {
1209                    let ranges = class_bytes.ranges();
1210                    let mut result = NodeId::BOT;
1211                    for range in ranges {
1212                        let start = range.start();
1213                        let end = range.end();
1214                        let node = tb.mk_range_u8(start, end);
1215                        result = tb.mk_union(result, node);
1216                    }
1217                    Ok(result)
1218                }
1219            },
1220            hir::HirKind::Look(_) => Err(self.error(
1221                Span::splat(self.pos()),
1222                ast::ErrorKind::UnsupportedResharpRegex,
1223            )),
1224            hir::HirKind::Repetition(_) => Err(self.error(
1225                Span::splat(self.pos()),
1226                ast::ErrorKind::UnsupportedResharpRegex,
1227            )),
1228            hir::HirKind::Capture(_) => Err(self.error(
1229                Span::splat(self.pos()),
1230                ast::ErrorKind::UnsupportedResharpRegex,
1231            )),
1232            hir::HirKind::Concat(body) => {
1233                let mut result = NodeId::EPS;
1234                for child in body {
1235                    let node = self.hir_to_node_id(child, tb)?;
1236                    result = tb.mk_concat(result, node);
1237                }
1238                Ok(result)
1239            }
1240            hir::HirKind::Alternation(_) => Err(self.error(
1241                Span::splat(self.pos()),
1242                ast::ErrorKind::UnsupportedResharpRegex,
1243            )),
1244        }
1245    }
1246
1247    fn translate_ast_to_hir(
1248        &mut self,
1249        orig_ast: &regex_syntax::ast::Ast,
1250        tb: &mut TB<'s>,
1251    ) -> Result<NodeId> {
1252        match self.translator.translate("", orig_ast) {
1253            Err(_) => Err(self.error(self.span(), ast::ErrorKind::UnicodeClassInvalid)),
1254            Ok(hir) => self.hir_to_node_id(&hir, tb),
1255        }
1256    }
1257
1258    fn translator_to_node_id(
1259        &mut self,
1260        orig_ast: &regex_syntax::ast::Ast,
1261        translator: &mut Option<Translator>,
1262        tb: &mut TB<'s>,
1263    ) -> Result<NodeId> {
1264        match translator {
1265            Some(tr) => {
1266                let hir = tr
1267                    .translate("", orig_ast)
1268                    .map_err(|e| self.unsupported_error(e))?;
1269                self.hir_to_node_id(&hir, tb)
1270            }
1271            None => self.translate_ast_to_hir(orig_ast, tb),
1272        }
1273    }
1274
1275    fn get_class(
1276        &mut self,
1277        negated: bool,
1278        kind: regex_syntax::ast::ClassPerlKind,
1279        tb: &mut TB<'s>,
1280    ) -> Result<NodeId> {
1281        let w = self
1282            .perl_classes
1283            .iter()
1284            .find(|(c_neg, c_kind, _)| *c_kind == kind && *c_neg == negated);
1285        match w {
1286            Some((_, _, value)) => Ok(*value),
1287            None => {
1288                let translated = if self.global_ascii_perl {
1289                    let pos = match kind {
1290                        regex_syntax::ast::ClassPerlKind::Word => {
1291                            let az = tb.mk_range_u8(b'a', b'z');
1292                            let big = tb.mk_range_u8(b'A', b'Z');
1293                            let dig = tb.mk_range_u8(b'0', b'9');
1294                            let us = tb.mk_u8(b'_');
1295                            tb.mk_unions([az, big, dig, us].into_iter())
1296                        }
1297                        regex_syntax::ast::ClassPerlKind::Digit => tb.mk_range_u8(b'0', b'9'),
1298                        regex_syntax::ast::ClassPerlKind::Space => {
1299                            let sp = tb.mk_u8(b' ');
1300                            let tab = tb.mk_u8(b'\t');
1301                            let nl = tb.mk_u8(b'\n');
1302                            let cr = tb.mk_u8(b'\r');
1303                            let ff = tb.mk_u8(0x0C);
1304                            let vt = tb.mk_u8(0x0B);
1305                            tb.mk_unions([sp, tab, nl, cr, ff, vt].into_iter())
1306                        }
1307                    };
1308                    if negated {
1309                        resharp_algebra::neg_class(tb, pos)
1310                    } else {
1311                        pos
1312                    }
1313                } else if self.global_unicode {
1314                    match kind {
1315                        regex_syntax::ast::ClassPerlKind::Word => {
1316                            if self.global_full_unicode {
1317                                self.unicode_classes.ensure_word_full(tb);
1318                            } else {
1319                                self.unicode_classes.ensure_word(tb);
1320                            }
1321                            if negated {
1322                                self.unicode_classes.non_word
1323                            } else {
1324                                self.unicode_classes.word
1325                            }
1326                        }
1327                        regex_syntax::ast::ClassPerlKind::Digit => {
1328                            if self.global_full_unicode {
1329                                self.unicode_classes.ensure_digit_full(tb);
1330                            } else {
1331                                self.unicode_classes.ensure_digit(tb);
1332                            }
1333                            if negated {
1334                                self.unicode_classes.non_digit
1335                            } else {
1336                                self.unicode_classes.digit
1337                            }
1338                        }
1339                        regex_syntax::ast::ClassPerlKind::Space => {
1340                            if self.global_full_unicode {
1341                                self.unicode_classes.ensure_space_full(tb);
1342                            } else {
1343                                self.unicode_classes.ensure_space(tb);
1344                            }
1345                            if negated {
1346                                self.unicode_classes.non_space
1347                            } else {
1348                                self.unicode_classes.space
1349                            }
1350                        }
1351                    }
1352                } else {
1353                    let pos = match kind {
1354                        regex_syntax::ast::ClassPerlKind::Word => {
1355                            let az = tb.mk_range_u8(b'a', b'z');
1356                            let big = tb.mk_range_u8(b'A', b'Z');
1357                            let dig = tb.mk_range_u8(b'0', b'9');
1358                            let us = tb.mk_u8(b'_');
1359                            tb.mk_unions([az, big, dig, us].into_iter())
1360                        }
1361                        regex_syntax::ast::ClassPerlKind::Digit => tb.mk_range_u8(b'0', b'9'),
1362                        regex_syntax::ast::ClassPerlKind::Space => {
1363                            let sp = tb.mk_u8(b' ');
1364                            let tab = tb.mk_u8(b'\t');
1365                            let nl = tb.mk_u8(b'\n');
1366                            let cr = tb.mk_u8(b'\r');
1367                            let ff = tb.mk_u8(0x0C);
1368                            let vt = tb.mk_u8(0x0B);
1369                            tb.mk_unions([sp, tab, nl, cr, ff, vt].into_iter())
1370                        }
1371                    };
1372                    if negated {
1373                        // Negate the byte class, not the whole language: `~(pos)`
1374                        // (mk_compl) includes the empty string and every non-class
1375                        // substring, so `\D`/`\S`/`\W` wrongly become nullable and
1376                        // match "". neg_class negates within the single byte, as
1377                        // the js branch above does.
1378                        resharp_algebra::neg_class(tb, pos)
1379                    } else {
1380                        pos
1381                    }
1382                };
1383                self.perl_classes.push((negated, kind, translated));
1384                Ok(translated)
1385            }
1386        }
1387    }
1388
1389    fn word_char_kind(ast: &Ast, left: bool) -> WordCharKind {
1390        use WordCharKind::*;
1391        match ast {
1392            Ast::Literal(lit) => {
1393                if is_word_byte(lit.c as u8) {
1394                    Word
1395                } else {
1396                    NonWord
1397                }
1398            }
1399            Ast::ClassPerl(c) => match (&c.kind, c.negated) {
1400                (&regex_syntax::ast::ClassPerlKind::Word, false) => Word,
1401                (&regex_syntax::ast::ClassPerlKind::Word, true) => NonWord,
1402                (&regex_syntax::ast::ClassPerlKind::Space, false) => NonWord,
1403                (&regex_syntax::ast::ClassPerlKind::Space, true) => Unknown,
1404                (&regex_syntax::ast::ClassPerlKind::Digit, false) => Word,
1405                (&regex_syntax::ast::ClassPerlKind::Digit, true) => Unknown,
1406            },
1407            Ast::ClassBracketed(c) => class_bracketed_word_kind(c),
1408            Ast::Dot(_) | Ast::Top(_) => Unknown,
1409            Ast::Group(g) => Self::word_char_kind(&g.ast, left),
1410            Ast::Concat(c) if !c.asts.is_empty() => {
1411                let dir: isize = if left { -1 } else { 1 };
1412                let edge = match Self::concat_edge_index(&c.asts, left) {
1413                    Some(e) => e,
1414                    None => return Unknown,
1415                };
1416                let kind = Self::word_char_kind(&c.asts[edge], left);
1417                match kind {
1418                    MaybeWord => {
1419                        match Self::concat_neighbor_kind(&c.asts, edge, dir) {
1420                            Word => Word,
1421                            _ => MaybeWord,
1422                        }
1423                    }
1424                    MaybeNonWord => {
1425                        match Self::concat_neighbor_kind(&c.asts, edge, dir) {
1426                            NonWord => NonWord,
1427                            _ => MaybeNonWord,
1428                        }
1429                    }
1430                    other => other,
1431                }
1432            }
1433            Ast::Alternation(alt) if !alt.asts.is_empty() => {
1434                let first = Self::word_char_kind(&alt.asts[0], left);
1435                if alt.asts[1..]
1436                    .iter()
1437                    .all(|a| Self::word_char_kind(a, left) == first)
1438                {
1439                    first
1440                } else {
1441                    Unknown
1442                }
1443            }
1444            Ast::Repetition(r) => {
1445                let inner = Self::word_char_kind(&r.ast, left);
1446                let nullable = matches!(
1447                    &r.op.kind,
1448                    ast::RepetitionKind::ZeroOrMore
1449                        | ast::RepetitionKind::ZeroOrOne
1450                        | ast::RepetitionKind::Range(
1451                            ast::RepetitionRange::Bounded(0, _)
1452                                | ast::RepetitionRange::Exactly(0)
1453                        )
1454                );
1455                if nullable {
1456                    match inner {
1457                        Word => MaybeWord,
1458                        NonWord => MaybeNonWord,
1459                        _ => Unknown,
1460                    }
1461                } else {
1462                    inner
1463                }
1464            }
1465            Ast::Lookaround(la) => match la.kind {
1466                ast::LookaroundKind::PositiveLookahead
1467                | ast::LookaroundKind::PositiveLookbehind => Self::word_char_kind(&la.ast, left),
1468                _ => Unknown,
1469            },
1470            Ast::Assertion(a) => match (&a.kind, left) {
1471                (ast::AssertionKind::EndText, false) => NonWord,
1472                (ast::AssertionKind::StartText, true) => NonWord,
1473                _ => Unknown,
1474            },
1475            _ => Unknown,
1476        }
1477    }
1478
1479    // ok to return None here, it's only an optimization
1480    fn edge_class_ast(ast: &Ast, left: bool) -> Option<&Ast> {
1481        match ast {
1482            Ast::Literal(_)
1483            | Ast::ClassPerl(_)
1484            | Ast::ClassBracketed(_)
1485            | Ast::ClassUnicode(_)
1486            | Ast::Dot(_)
1487            | Ast::Top(_) => Some(ast),
1488            Ast::Group(g) => Self::edge_class_ast(&g.ast, left),
1489            Ast::Concat(c) if !c.asts.is_empty() => {
1490                Self::concat_edge_index(&c.asts, left)
1491                    .and_then(|e| Self::edge_class_ast(&c.asts[e], left))
1492            }
1493            Ast::Repetition(r) => {
1494                let nullable = matches!(
1495                    &r.op.kind,
1496                    ast::RepetitionKind::ZeroOrMore
1497                        | ast::RepetitionKind::ZeroOrOne
1498                        | ast::RepetitionKind::Range(
1499                            ast::RepetitionRange::Bounded(0, _)
1500                                | ast::RepetitionRange::Exactly(0)
1501                        )
1502                );
1503                if nullable {
1504                    None
1505                } else {
1506                    Self::edge_class_ast(&r.ast, left)
1507                }
1508            }
1509            _ => None,
1510        }
1511    }
1512
1513    fn resolve_word_kind(
1514        &mut self,
1515        asts: &[Ast],
1516        idx: usize,
1517        dir: isize,
1518        translator: &mut Option<Translator>,
1519        tb: &mut TB<'s>,
1520        word_id: NodeId,
1521        not_word_id: NodeId,
1522    ) -> Result<WordCharKind> {
1523        use WordCharKind::*;
1524        let fast = Self::concat_neighbor_kind(asts, idx, dir);
1525        if fast != Unknown {
1526            return Ok(fast);
1527        }
1528        let neighbor_idx = (idx as isize + dir) as usize;
1529        let node = if let Some(edge) = Self::edge_class_ast(&asts[neighbor_idx], dir < 0) {
1530            self.ast_to_node_id(edge, translator, tb)?
1531        } else if dir > 0 {
1532            let mut bodies: Vec<NodeId> = vec![];
1533            let mut j = neighbor_idx;
1534            while j < asts.len() {
1535                match &asts[j] {
1536                    Ast::Lookaround(la) => {
1537                        let kind = la.kind.clone();
1538                        let lookbehind = matches!(
1539                            kind,
1540                            ast::LookaroundKind::PositiveLookbehind
1541                                | ast::LookaroundKind::NegativeLookbehind
1542                        );
1543                        if lookbehind {
1544                            j += 1;
1545                            continue;
1546                        }
1547                        let body = self.ast_to_node_id(&la.ast, translator, tb)?;
1548                        let body = tb.try_elim_lookarounds(body).ok_or_else(|| {
1549                            self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex)
1550                        })?;
1551                        let body_ts = tb.mk_concat(body, NodeId::TS);
1552                        let constraint = match kind {
1553                            ast::LookaroundKind::PositiveLookahead => body_ts,
1554                            ast::LookaroundKind::NegativeLookahead => tb.mk_compl(body_ts),
1555                            _ => unreachable!(),
1556                        };
1557                        bodies.push(constraint);
1558                        j += 1;
1559                    }
1560                    other => {
1561                        let n = self.ast_to_node_id(other, translator, tb)?;
1562                        let n = tb.try_elim_lookarounds(n).ok_or_else(|| {
1563                            self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex)
1564                        })?;
1565                        bodies.push(tb.mk_concat(n, NodeId::TS));
1566                        break;
1567                    }
1568                }
1569            }
1570            if bodies.is_empty() {
1571                return Ok(Unknown);
1572            }
1573            let combined = tb.mk_inters(bodies.into_iter());
1574            let word_prefix = tb.mk_concat(word_id, NodeId::TS);
1575            let non_word_prefix = tb.mk_concat(not_word_id, NodeId::TS);
1576            return if tb.subsumes(word_prefix, combined) == Some(true) {
1577                Ok(Word)
1578            } else if tb.subsumes(non_word_prefix, combined) == Some(true) {
1579                Ok(NonWord)
1580            } else {
1581                Ok(Unknown)
1582            };
1583        } else {
1584            let neighbor_node = self.ast_to_node_id(&asts[neighbor_idx], translator, tb)?;
1585            let neighbor_node = Self::strip_trailing_lookahead(tb, neighbor_node);
1586            let mut neighbor_node = tb
1587                .try_elim_lookarounds(neighbor_node)
1588                .ok_or_else(|| self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))?;
1589            neighbor_node = tb.reverse(neighbor_node).or_else(|_| {
1590                Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))
1591            })?;
1592            let word_prefix = tb.mk_concat(word_id, NodeId::TS);
1593            let non_word_prefix = tb.mk_concat(not_word_id, NodeId::TS);
1594            return if tb.subsumes(word_prefix, neighbor_node) == Some(true) {
1595                Ok(Word)
1596            } else if tb.subsumes(non_word_prefix, neighbor_node) == Some(true) {
1597                Ok(NonWord)
1598            } else {
1599                Ok(Unknown)
1600            };
1601        };
1602        if tb.subsumes(word_id, node) == Some(true) {
1603            Ok(Word)
1604        } else if tb.subsumes(not_word_id, node) == Some(true) {
1605            Ok(NonWord)
1606        } else {
1607            Ok(Unknown)
1608        }
1609    }
1610
1611    fn strip_trailing_lookahead(tb: &mut TB<'s>, node: NodeId) -> NodeId {
1612        match tb.get_kind(node) {
1613            Kind::Lookahead if tb.get_min_max_length(node).1 == 0 => NodeId::EPS,
1614            Kind::Concat => {
1615                let l = node.left(tb);
1616                let r = node.right(tb);
1617                let stripped_r = Self::strip_trailing_lookahead(tb, r);
1618                if stripped_r == NodeId::EPS {
1619                    Self::strip_trailing_lookahead(tb, l)
1620                } else if stripped_r == r {
1621                    node
1622                } else {
1623                    tb.mk_concat(l, stripped_r)
1624                }
1625            }
1626            _ => node,
1627        }
1628    }
1629
1630    fn concat_edge_index(asts: &[Ast], left: bool) -> Option<usize> {
1631        let dir: isize = if left { -1 } else { 1 };
1632        let mut e = if left { asts.len() as isize - 1 } else { 0 };
1633        while e >= 0
1634            && (e as usize) < asts.len()
1635            && Self::is_transparent_for_dir(&asts[e as usize], dir)
1636        {
1637            e += dir;
1638        }
1639        if e < 0 || e as usize >= asts.len() {
1640            None
1641        } else {
1642            Some(e as usize)
1643        }
1644    }
1645
1646    fn is_transparent_for_dir(ast: &Ast, dir: isize) -> bool {
1647        match ast {
1648            Ast::Lookaround(la) => match la.kind {
1649                ast::LookaroundKind::PositiveLookahead | ast::LookaroundKind::NegativeLookahead => {
1650                    dir < 0
1651                }
1652                ast::LookaroundKind::PositiveLookbehind
1653                | ast::LookaroundKind::NegativeLookbehind => dir > 0,
1654            },
1655            Ast::Repetition(r) => matches!(
1656                &r.op.kind,
1657                ast::RepetitionKind::Range(ast::RepetitionRange::Exactly(0))
1658            ),
1659            _ => false,
1660        }
1661    }
1662
1663    fn concat_neighbor_kind(asts: &[Ast], idx: usize, dir: isize) -> WordCharKind {
1664        use WordCharKind::*;
1665        let next = idx as isize + dir;
1666        if next < 0 || next >= asts.len() as isize {
1667            return Edge;
1668        }
1669        if Self::is_transparent_for_dir(&asts[next as usize], dir) {
1670            return Self::concat_neighbor_kind(asts, next as usize, dir);
1671        }
1672        let kind = Self::word_char_kind(&asts[next as usize], dir < 0);
1673        match kind {
1674            MaybeWord => match Self::concat_neighbor_kind(asts, next as usize, dir) {
1675                Word => Word,
1676                _ => Unknown,
1677            },
1678            MaybeNonWord => match Self::concat_neighbor_kind(asts, next as usize, dir) {
1679                NonWord => NonWord,
1680                _ => Unknown,
1681            },
1682            other => other,
1683        }
1684    }
1685
1686    fn specialize_word_boundaries(
1687        &mut self,
1688        children: &mut [NodeId],
1689        tb: &mut TB<'s>,
1690    ) -> Result<()> {
1691        let wb = self.unicode_classes.wb;
1692        let non_wb = self.unicode_classes.non_wb;
1693        if wb == NodeId::MISSING {
1694            return Ok(());
1695        }
1696        let word = self.unicode_classes.word;
1697        let non_word = self.unicode_classes.non_word;
1698        if word == NodeId::MISSING {
1699            return Ok(());
1700        }
1701        // narrow check, could be done over the whole regex instead of one node
1702        // but i need to make sure the cost isnt too large
1703        let word_pref = tb.mk_concat(word, NodeId::TS);
1704        let non_word_pref = tb.mk_concat(non_word, NodeId::TS);
1705        let word_suf = tb.mk_concat(NodeId::TS, word);
1706        let non_word_suf = tb.mk_concat(NodeId::TS, non_word);
1707        let len = children.len();
1708        for k in 0..len {
1709            let l = if k == 0 {
1710                WordCharKind::Edge
1711            } else {
1712                use resharp_algebra::Kind;
1713                if tb.get_kind(children[k - 1]) == Kind::End
1714                    && (children[k] == wb || children[k] == non_wb)
1715                {
1716                    return Err(
1717                        self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex)
1718                    );
1719                }
1720                Self::classify(tb, children[k - 1], word_suf, non_word_suf)
1721            };
1722            let r = if k + 1 >= len {
1723                WordCharKind::Edge
1724            } else {
1725                Self::classify(tb, children[k + 1], word_pref, non_word_pref)
1726            };
1727            children[k] = Self::rewrite_wb_in_node(tb, children[k], wb, non_wb, word, l, r)
1728                .ok_or_else(|| self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))?;
1729        }
1730        Ok(())
1731    }
1732
1733    fn rewrite_wb_in_node(
1734        b: &mut TB<'s>,
1735        node: NodeId,
1736        wb: NodeId,
1737        non_wb: NodeId,
1738        word: NodeId,
1739        left: WordCharKind,
1740        right: WordCharKind,
1741    ) -> Option<NodeId> {
1742        let boundary_match = if node == wb {
1743            true
1744        } else if node == non_wb {
1745            false
1746        } else if b.get_kind(node) == Kind::Union {
1747            let l = Self::rewrite_wb_in_node(b, node.left(b), wb, non_wb, word, left, right)?;
1748            let r = Self::rewrite_wb_in_node(b, node.right(b), wb, non_wb, word, left, right)?;
1749            return Some(b.mk_union(l, r));
1750        } else {
1751            return Some(node);
1752        };
1753        use WordCharKind::*;
1754        let result = match (left, right) {
1755            (NonWord, Word) | (Word, NonWord) => {
1756                if boundary_match {
1757                    NodeId::EPS
1758                } else {
1759                    NodeId::BOT
1760                }
1761            }
1762            (Word, Word) | (NonWord, NonWord) => {
1763                if boundary_match {
1764                    NodeId::BOT
1765                } else {
1766                    NodeId::EPS
1767                }
1768            }
1769            (Word, _) => {
1770                if boundary_match {
1771                    b.mk_neg_lookahead(word, 0)
1772                } else {
1773                    let tail = b.mk_concat(word, NodeId::TS);
1774                    b.mk_lookahead(tail, NodeId::MISSING, 0)
1775                }
1776            }
1777            (NonWord, _) => {
1778                if boundary_match {
1779                    let tail = b.mk_concat(word, NodeId::TS);
1780                    b.mk_lookahead(tail, NodeId::MISSING, 0)
1781                } else {
1782                    b.mk_neg_lookahead(word, 0)
1783                }
1784            }
1785            (_, Word) => {
1786                if boundary_match {
1787                    b.mk_neg_lookbehind(word)
1788                } else {
1789                    b.mk_lookbehind(word, NodeId::MISSING)
1790                }
1791            }
1792            (_, NonWord) => {
1793                if boundary_match {
1794                    b.mk_lookbehind(word, NodeId::MISSING)
1795                } else {
1796                    b.mk_neg_lookbehind(word)
1797                }
1798            }
1799            _ => return Some(node),
1800        };
1801        Some(result)
1802    }
1803
1804    fn classify(
1805        b: &mut TB<'s>,
1806        node: NodeId,
1807        word_dir: NodeId,
1808        non_word_dir: NodeId,
1809    ) -> WordCharKind {
1810        if b.contains_look(node) || b.contains_anchors(node) {
1811            return WordCharKind::Unknown;
1812        }
1813        if b.subsumes(word_dir, node) == Some(true) {
1814            WordCharKind::Word
1815        } else if b.subsumes(non_word_dir, node) == Some(true) {
1816            WordCharKind::NonWord
1817        } else {
1818            WordCharKind::Unknown
1819        }
1820    }
1821
1822    fn rewrite_word_boundary_in_concat(
1823        &mut self,
1824        asts: &[Ast],
1825        idx: usize,
1826        translator: &mut Option<Translator>,
1827        tb: &mut TB<'s>,
1828        negated: bool,
1829    ) -> Result<(NodeId, usize)> {
1830        use WordCharKind::*;
1831        if self.global_full_unicode {
1832            self.unicode_classes.ensure_word_full(tb);
1833        } else if self.global_unicode && !self.global_ascii_perl {
1834            self.unicode_classes.ensure_word(tb);
1835        } else {
1836            self.unicode_classes.ensure_word_ascii(tb);
1837        }
1838        let word_id = self.unicode_classes.word;
1839        let not_word_id = self.unicode_classes.non_word;
1840        let left = self.resolve_word_kind(asts, idx, -1, translator, tb, word_id, not_word_id)?;
1841        let right = self.resolve_word_kind(asts, idx, 1, translator, tb, word_id, not_word_id)?;
1842        let boundary_match = !negated;
1843        match (left, right) {
1844            (NonWord, Word) | (Word, NonWord) => Ok((
1845                if boundary_match {
1846                    NodeId::EPS
1847                } else {
1848                    NodeId::BOT
1849                },
1850                idx + 1,
1851            )),
1852            (Word, Word) | (NonWord, NonWord) => Ok((
1853                if boundary_match {
1854                    NodeId::BOT
1855                } else {
1856                    NodeId::EPS
1857                },
1858                idx + 1,
1859            )),
1860            (Word, _) => {
1861                if boundary_match {
1862                    Ok((tb.mk_neg_lookahead(word_id, 0), idx + 1))
1863                } else {
1864                    let tail = tb.mk_concat(word_id, NodeId::TS);
1865                    self.merge_boundary_with_following_lookaheads(asts, idx, tail, translator, tb)
1866                }
1867            }
1868            (NonWord, _) => {
1869                if boundary_match {
1870                    let tail = tb.mk_concat(word_id, NodeId::TS);
1871                    self.merge_boundary_with_following_lookaheads(asts, idx, tail, translator, tb)
1872                } else {
1873                    Ok((tb.mk_neg_lookahead(word_id, 0), idx + 1))
1874                }
1875            }
1876            (_, Word) => {
1877                if boundary_match {
1878                    Ok((tb.mk_neg_lookbehind(word_id), idx + 1))
1879                } else {
1880                    Ok((tb.mk_lookbehind(word_id, NodeId::MISSING), idx + 1))
1881                }
1882            }
1883            (_, NonWord) => {
1884                if boundary_match {
1885                    Ok((tb.mk_lookbehind(word_id, NodeId::MISSING), idx + 1))
1886                } else {
1887                    Ok((tb.mk_neg_lookbehind(word_id), idx + 1))
1888                }
1889            }
1890            //   \b = (?<=\w)(?!\w) | (?<!\w)(?=\w)
1891            //   \B = (?<=\w)(?=\w)  | (?<!\w)(?!\w)
1892            //   TODO: expensive and the cost could be reduced further
1893            _ => {
1894                self.unicode_classes.ensure_wb(tb);
1895                let node = if boundary_match {
1896                    self.unicode_classes.wb
1897                } else {
1898                    self.unicode_classes.non_wb
1899                };
1900                Ok((node, idx + 1))
1901            }
1902        }
1903    }
1904
1905    fn merge_boundary_with_following_lookaheads(
1906        &mut self,
1907        asts: &[Ast],
1908        wb_idx: usize,
1909        boundary_tail: NodeId,
1910        translator: &mut Option<Translator>,
1911        tb: &mut TB<'s>,
1912    ) -> Result<(NodeId, usize)> {
1913        let mut next = wb_idx + 1;
1914        let mut la_bodies = vec![boundary_tail];
1915        while next < asts.len() {
1916            match &asts[next] {
1917                Ast::Lookaround(la) if la.kind == ast::LookaroundKind::PositiveLookahead => {
1918                    let body = self.ast_to_node_id(&la.ast, translator, tb)?;
1919                    la_bodies.push(tb.mk_concat(body, NodeId::TS));
1920                    next += 1;
1921                }
1922                _ => break,
1923            }
1924        }
1925        let merged = tb.mk_inters(la_bodies.into_iter());
1926        Ok((tb.mk_lookahead(merged, NodeId::MISSING, 0), next))
1927    }
1928
1929    fn ast_to_node_id(
1930        &mut self,
1931        ast: &Ast,
1932        translator: &mut Option<Translator>,
1933        tb: &mut TB<'s>,
1934    ) -> Result<NodeId> {
1935        match ast {
1936            Ast::Empty(_) => Ok(NodeId::EPS),
1937            Ast::Flags(f) => {
1938                if f.flags.flag_state(ast::Flag::SwapGreed).is_some() {
1939                    return Err(self.error(f.span, ast::ErrorKind::UnsupportedResharpRegex));
1940                }
1941                let mut translator_builder = self.default_translator_builder();
1942                if let Some(state) = f.flags.flag_state(ast::Flag::CaseInsensitive) {
1943                    translator_builder.case_insensitive(state);
1944                }
1945                if let Some(state) = f.flags.flag_state(ast::Flag::Unicode) {
1946                    translator_builder.unicode(state);
1947                }
1948                if let Some(state) = f.flags.flag_state(ast::Flag::DotMatchesNewLine) {
1949                    self.dot_all.set(state);
1950                }
1951                if let Some(state) = f.flags.flag_state(ast::Flag::MultiLine) {
1952                    self.multiline.set(state);
1953                }
1954                let concat_translator = Some(translator_builder.build());
1955                *translator = concat_translator;
1956                Ok(NodeId::EPS)
1957            }
1958            Ast::Literal(l) => {
1959                let ast_lit = regex_syntax::ast::Ast::literal(*l.to_owned());
1960                self.translator_to_node_id(&ast_lit, translator, tb)
1961            }
1962            Ast::Top(_) => Ok(NodeId::TOP),
1963            Ast::Dot(_) => {
1964                let codepoint_dot = self.global_ascii_perl || self.global_full_unicode;
1965                let hirv = match (codepoint_dot, self.dot_all.get()) {
1966                    (true, true) => hir::Hir::dot(hir::Dot::AnyChar),
1967                    (true, false) => hir::Hir::dot(hir::Dot::AnyCharExceptLF),
1968                    (false, true) => return Ok(NodeId::TOP),
1969                    (false, false) => hir::Hir::dot(hir::Dot::AnyByteExceptLF),
1970                };
1971                self.hir_to_node_id(&hirv, tb)
1972            }
1973            Ast::Assertion(a) => match &a.kind {
1974                ast::AssertionKind::StartText => Ok(NodeId::BEGIN),
1975                ast::AssertionKind::EndText => Ok(NodeId::END),
1976                ast::AssertionKind::WordBoundary => {
1977                    let only = Ast::Assertion(a.clone());
1978                    let asts = std::slice::from_ref(&only);
1979                    let (node, _) =
1980                        self.rewrite_word_boundary_in_concat(asts, 0, translator, tb, false)?;
1981                    Ok(node)
1982                }
1983                ast::AssertionKind::NotWordBoundary => {
1984                    // bare \B with no surrounding concat: treat as singleton.
1985                    let only = Ast::Assertion(a.clone());
1986                    let asts = std::slice::from_ref(&only);
1987                    let (node, _) =
1988                        self.rewrite_word_boundary_in_concat(asts, 0, translator, tb, true)?;
1989                    Ok(node)
1990                }
1991                ast::AssertionKind::StartLine => {
1992                    if !self.multiline.get() {
1993                        return Ok(NodeId::BEGIN);
1994                    }
1995                    let left = NodeId::BEGIN;
1996                    let right = tb.mk_u8(b'\n');
1997                    let union = tb.mk_union(left, right);
1998                    Ok(tb.mk_lookbehind(union, NodeId::MISSING))
1999                }
2000                ast::AssertionKind::EndLine => {
2001                    if !self.multiline.get() {
2002                        return Ok(NodeId::END);
2003                    }
2004                    let left = NodeId::END;
2005                    let right = tb.mk_u8(b'\n');
2006                    let union = tb.mk_union(left, right);
2007                    Ok(tb.mk_lookahead(union, NodeId::MISSING, 0))
2008                }
2009                ast::AssertionKind::WordBoundaryStart => {
2010                    Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
2011                }
2012                ast::AssertionKind::WordBoundaryEnd => {
2013                    Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
2014                }
2015                ast::AssertionKind::WordBoundaryStartAngle => {
2016                    Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
2017                }
2018                ast::AssertionKind::WordBoundaryEndAngle => {
2019                    Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
2020                }
2021                ast::AssertionKind::WordBoundaryStartHalf => {
2022                    Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
2023                }
2024                ast::AssertionKind::WordBoundaryEndHalf => {
2025                    Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
2026                }
2027            },
2028            Ast::ClassUnicode(c) => {
2029                let tmp = regex_syntax::ast::ClassUnicode {
2030                    span: c.span,
2031                    negated: c.negated,
2032                    kind: c.kind.clone(),
2033                };
2034                if !c.negated {
2035                    if let regex_syntax::ast::ClassUnicodeKind::Named(s) = &c.kind {
2036                        match s.as_str() {
2037                            // \p{ascii} for ascii, \p{ascii}&\p{Letter} => [A-Za-z]
2038                            "ascii" => return Ok(tb.mk_range_u8(0, 127)),
2039                            // a single valid utf8 codepoint; \p{utf8}*&~(a) => non a, but valid utf8
2040                            "utf8" => {
2041                                let ascii = tb.mk_range_u8(0, 127);
2042                                let beta = tb.mk_range_u8(128, 0xBF);
2043                                let c0 = tb.mk_range_u8(0xC0, 0xDF);
2044                                let c0s = tb.mk_concats([c0, beta].into_iter());
2045                                let e0 = tb.mk_range_u8(0xE0, 0xEF);
2046                                let e0s = tb.mk_concats([e0, beta, beta].into_iter());
2047                                let f0 = tb.mk_range_u8(0xF0, 0xF7);
2048                                let f0s = tb.mk_concats([f0, beta, beta, beta].into_iter());
2049                                return Ok(tb.mk_unions([ascii, c0s, e0s, f0s].into_iter()));
2050                            }
2051                            "hex" => {
2052                                let nums = tb.mk_range_u8(b'0', b'9');
2053                                let lets = tb.mk_range_u8(b'a', b'f');
2054                                let lets2 = tb.mk_range_u8(b'A', b'F');
2055                                let merged = tb.mk_unions([nums, lets, lets2].into_iter());
2056                                return Ok(merged);
2057                            }
2058                            _ => {}
2059                        }
2060                    };
2061                }
2062
2063                let orig_ast = regex_syntax::ast::Ast::class_unicode(tmp);
2064                self.translator_to_node_id(&orig_ast, translator, tb)
2065            }
2066            Ast::ClassPerl(c) => self.get_class(c.negated, c.kind.clone(), tb),
2067            Ast::ClassBracketed(c) => match &c.kind {
2068                regex_syntax::ast::ClassSet::Item(item) => {
2069                    if !c.negated && is_universal_perl_pair(item) {
2070                        return Ok(NodeId::TOP);
2071                    }
2072                    if let regex_syntax::ast::ClassSetItem::Perl(p) = item {
2073                        return self.get_class(c.negated ^ p.negated, p.kind.clone(), tb);
2074                    }
2075                    let kind = if self.global_ascii_perl {
2076                        rewrite_ascii_perl_set(&c.kind)
2077                    } else {
2078                        c.kind.clone()
2079                    };
2080                    let tmp = regex_syntax::ast::ClassBracketed {
2081                        span: c.span,
2082                        negated: c.negated,
2083                        kind,
2084                    };
2085                    let orig_ast = regex_syntax::ast::Ast::class_bracketed(tmp);
2086                    self.translator_to_node_id(&orig_ast, translator, tb)
2087                }
2088                regex_syntax::ast::ClassSet::BinaryOp(_) => {
2089                    Err(self.error(c.span, ast::ErrorKind::UnsupportedResharpRegex))
2090                }
2091            },
2092            Ast::Repetition(r) => {
2093                let body = self.ast_to_node_id(&r.ast, translator, tb);
2094                match body {
2095                    Ok(body) => match &r.op.kind {
2096                        ast::RepetitionKind::ZeroOrOne => Ok(tb.mk_opt(body)),
2097                        ast::RepetitionKind::ZeroOrMore => Ok(tb.mk_star(body)),
2098                        ast::RepetitionKind::OneOrMore => Ok(tb.mk_plus(body)),
2099                        ast::RepetitionKind::Range(r) => match r {
2100                            ast::RepetitionRange::Exactly(n) => Ok(tb.mk_repeat(body, *n, *n)),
2101                            ast::RepetitionRange::AtLeast(n) => {
2102                                let rep = tb.mk_repeat(body, *n, *n);
2103                                let st = tb.mk_star(body);
2104                                Ok(tb.mk_concat(rep, st))
2105                            }
2106
2107                            ast::RepetitionRange::Bounded(n, m) => Ok(tb.mk_repeat(body, *n, *m)),
2108                        },
2109                    },
2110                    Err(_) => body,
2111                }
2112            }
2113            Ast::Lookaround(g) => {
2114                let body = self.ast_to_node_id(&g.ast, translator, tb)?;
2115                match g.kind {
2116                    ast::LookaroundKind::PositiveLookahead if body.contains_lookbehind(tb) => {
2117                        let mut prefix = NodeId::EPS;
2118                        let mut rest = body;
2119                        while tb.get_kind(rest) == Kind::Concat
2120                            && tb.get_kind(rest.left(tb)) == Kind::Lookbehind
2121                        {
2122                            prefix = tb.mk_concat(prefix, rest.left(tb));
2123                            rest = rest.right(tb);
2124                        }
2125                        if prefix == NodeId::EPS || rest.contains_lookbehind(tb) {
2126                            return Err(self.error(g.span, ast::ErrorKind::UnsupportedResharpRegex));
2127                        }
2128                        let la = tb.mk_lookahead(rest, NodeId::MISSING, 0);
2129                        Ok(tb.mk_concat(prefix, la))
2130                    }
2131                    ast::LookaroundKind::NegativeLookahead if body.contains_lookbehind(tb) => {
2132                        Err(self.error(g.span, ast::ErrorKind::UnsupportedResharpRegex))
2133                    }
2134                    ast::LookaroundKind::PositiveLookahead => {
2135                        Ok(tb.mk_lookahead(body, NodeId::MISSING, 0))
2136                    }
2137                    ast::LookaroundKind::PositiveLookbehind
2138                    | ast::LookaroundKind::NegativeLookbehind
2139                        if body.contains_lookahead(tb) =>
2140                    {
2141                        Err(self.error(g.span, ast::ErrorKind::UnsupportedResharpRegex))
2142                    }
2143                    ast::LookaroundKind::PositiveLookbehind => {
2144                        Ok(tb.mk_lookbehind(body, NodeId::MISSING))
2145                    }
2146                    ast::LookaroundKind::NegativeLookahead => Ok(tb.mk_neg_lookahead(body, 0)),
2147                    ast::LookaroundKind::NegativeLookbehind => Ok(tb.mk_neg_lookbehind(body)),
2148                }
2149            }
2150            Ast::Group(g) => {
2151                if let ast::GroupKind::NonCapturing(ref flags) = g.kind {
2152                    if !flags.items.is_empty() {
2153                        let mut translator_builder = self.default_translator_builder();
2154                        if let Some(state) = flags.flag_state(ast::Flag::CaseInsensitive) {
2155                            translator_builder.case_insensitive(state);
2156                        }
2157                        if let Some(state) = flags.flag_state(ast::Flag::Unicode) {
2158                            translator_builder.unicode(state);
2159                        }
2160                        let saved_dot_all = self.dot_all.get();
2161                        if let Some(state) = flags.flag_state(ast::Flag::DotMatchesNewLine) {
2162                            self.dot_all.set(state);
2163                        }
2164                        let saved_multiline = self.multiline.get();
2165                        if let Some(state) = flags.flag_state(ast::Flag::MultiLine) {
2166                            self.multiline.set(state);
2167                        }
2168                        let mut scoped = Some(translator_builder.build());
2169                        let result = self.ast_to_node_id(&g.ast, &mut scoped, tb);
2170                        self.dot_all.set(saved_dot_all);
2171                        self.multiline.set(saved_multiline);
2172                        return result;
2173                    }
2174                }
2175                self.ast_to_node_id(&g.ast, translator, tb)
2176            }
2177            Ast::Alternation(a) => {
2178                let mut children = vec![];
2179                for ast in &a.asts {
2180                    match self.ast_to_node_id(ast, translator, tb) {
2181                        Ok(node_id) => children.push(node_id),
2182                        Err(err) => return Err(err),
2183                    }
2184                }
2185                Ok(tb.mk_unions(children.iter().copied()))
2186            }
2187            Ast::Concat(c) => {
2188                let mut concat_translator: Option<Translator> = None;
2189                let mut children = vec![];
2190                let mut prev_boundary_child: Option<usize> = None;
2191                let mut i = 0;
2192                while i < c.asts.len() {
2193                    let ast = &c.asts[i];
2194                    match ast {
2195                        Ast::Flags(f) => {
2196                            if f.flags.flag_state(ast::Flag::SwapGreed).is_some() {
2197                                return Err(
2198                                    self.error(f.span, ast::ErrorKind::UnsupportedResharpRegex)
2199                                );
2200                            }
2201                            let mut translator_builder = self.default_translator_builder();
2202                            if let Some(state) = f.flags.flag_state(ast::Flag::CaseInsensitive) {
2203                                translator_builder.case_insensitive(state);
2204                            }
2205                            if let Some(state) = f.flags.flag_state(ast::Flag::Unicode) {
2206                                translator_builder.unicode(state);
2207                            }
2208                            if let Some(state) = f.flags.flag_state(ast::Flag::DotMatchesNewLine) {
2209                                self.dot_all.set(state);
2210                            }
2211                            if let Some(state) = f.flags.flag_state(ast::Flag::MultiLine) {
2212                                self.multiline.set(state);
2213                            }
2214                            concat_translator = Some(translator_builder.build());
2215                            *translator = concat_translator.clone();
2216                            i += 1;
2217                            continue;
2218                        }
2219                        Ast::Assertion(a)
2220                            if a.kind == ast::AssertionKind::WordBoundary
2221                                || a.kind == ast::AssertionKind::NotWordBoundary =>
2222                        {
2223                            let negated = a.kind == ast::AssertionKind::NotWordBoundary;
2224                            let node = self.rewrite_word_boundary_in_concat(
2225                                &c.asts, i, translator, tb, negated,
2226                            )?;
2227                            match prev_boundary_child {
2228                                Some(idx) => children[idx] = tb.mk_inter(children[idx], node.0),
2229                                None => {
2230                                    children.push(node.0);
2231                                    prev_boundary_child = Some(children.len() - 1);
2232                                }
2233                            }
2234                            i = node.1; // skip consumed lookaheads
2235                            continue;
2236                        }
2237                        _ => {}
2238                    }
2239                    match concat_translator {
2240                        Some(_) => match self.ast_to_node_id(ast, &mut concat_translator, tb) {
2241                            Ok(node_id) => {
2242                                if node_id != resharp_algebra::NodeId::EPS {
2243                                    prev_boundary_child = None;
2244                                    children.push(node_id);
2245                                }
2246                            }
2247                            Err(err) => return Err(err),
2248                        },
2249                        None => match self.ast_to_node_id(ast, translator, tb) {
2250                            Ok(node_id) => {
2251                                if node_id != resharp_algebra::NodeId::EPS {
2252                                    prev_boundary_child = None;
2253                                    children.push(node_id);
2254                                }
2255                            }
2256                            Err(err) => return Err(err),
2257                        },
2258                    }
2259                    i += 1;
2260                }
2261                self.specialize_word_boundaries(&mut children, tb)?;
2262                Ok(tb.mk_concats(children.iter().cloned()))
2263            }
2264            Ast::Intersection(intersection) => {
2265                let mut children = vec![];
2266                for ast in &intersection.asts {
2267                    match self.ast_to_node_id(ast, translator, tb) {
2268                        Ok(node_id) => children.push(node_id),
2269                        Err(err) => return Err(err),
2270                    }
2271                }
2272                Ok(tb.mk_inters(children.into_iter()))
2273            }
2274            Ast::Complement(complement) => {
2275                let body = self.ast_to_node_id(&complement.ast, translator, tb);
2276                body.map(|x| tb.mk_compl(x))
2277            }
2278        }
2279    }
2280
2281    fn parse_inner(&mut self) -> Result<Ast> {
2282        let mut concat = Concat {
2283            span: self.span(),
2284            asts: vec![],
2285        };
2286        loop {
2287            self.bump_space();
2288            if self.is_eof() {
2289                break;
2290            }
2291            match self.char() {
2292                '(' => concat = self.push_group(concat)?,
2293                ')' => concat = self.pop_group(concat)?,
2294                '|' => concat = self.push_alternate(concat)?,
2295                '&' => concat = self.push_intersect(concat)?,
2296                '~' => concat = self.push_compl_group(concat)?,
2297                '[' => {
2298                    let class = self.parse_set_class()?;
2299                    concat.asts.push(Ast::class_bracketed(class));
2300                }
2301                '?' => {
2302                    concat =
2303                        self.parse_uncounted_repetition(concat, ast::RepetitionKind::ZeroOrOne)?;
2304                }
2305                '*' => {
2306                    concat =
2307                        self.parse_uncounted_repetition(concat, ast::RepetitionKind::ZeroOrMore)?;
2308                }
2309                '+' => {
2310                    concat =
2311                        self.parse_uncounted_repetition(concat, ast::RepetitionKind::OneOrMore)?;
2312                }
2313                '{' => {
2314                    concat = self.parse_counted_repetition(concat)?;
2315                }
2316                _ => concat.asts.push(self.parse_primitive()?.into_ast()),
2317            }
2318            if self.stack_group.borrow().len() > self.max_depth {
2319                return Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex));
2320            }
2321        }
2322        let ast = self.pop_group_end(concat)?;
2323        if expanded_ast_size(&ast, self.expanded_ast_limit) >= self.expanded_ast_limit
2324            || max_concat_length(&ast) >= self.max_list_len
2325        {
2326            return Err(self.error(*ast.span(), ast::ErrorKind::UnsupportedResharpRegex));
2327        }
2328        Ok(ast)
2329    }
2330
2331    fn parse(&mut self, tb: &mut TB<'s>) -> Result<NodeId> {
2332        let ast = self.parse_inner()?;
2333        if let Err(span) = ensure_lookbehind_at_start(&ast, true) {
2334            return Err(self.error(span, ast::ErrorKind::UnsupportedResharpRegex));
2335        }
2336        self.ast_to_node_id(&ast, &mut None, tb)
2337    }
2338
2339    #[inline(never)]
2340    fn parse_uncounted_repetition(
2341        &self,
2342        mut concat: ast::Concat,
2343        kind: ast::RepetitionKind,
2344    ) -> Result<ast::Concat> {
2345        // assert!(self.char() == '?' || self.char() == '*' || self.char() == '+');
2346        let op_start = self.pos();
2347        let ast = match concat.asts.pop() {
2348            Some(ast) => ast,
2349            None => return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing)),
2350        };
2351        match ast {
2352            Ast::Empty(_) | Ast::Flags(_) => {
2353                return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing))
2354            }
2355            _ => {}
2356        }
2357        if self.bump() && self.char() == '?' {
2358            return Err(self.error(
2359                Span::new(op_start, self.pos()),
2360                ast::ErrorKind::UnsupportedLazyQuantifier,
2361            ));
2362        }
2363        concat.asts.push(Ast::repetition(ast::Repetition {
2364            span: ast.span().with_end(self.pos()),
2365            op: ast::RepetitionOp {
2366                span: Span::new(op_start, self.pos()),
2367                kind,
2368            },
2369            greedy: true,
2370            ast: Box::new(ast),
2371        }));
2372        Ok(concat)
2373    }
2374
2375    #[inline(never)]
2376    fn parse_counted_repetition(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
2377        assert!(self.char() == '{');
2378        let start = self.pos();
2379        let ast = match concat.asts.pop() {
2380            Some(ast) => ast,
2381            None => return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing)),
2382        };
2383        match ast {
2384            Ast::Empty(_) | Ast::Flags(_) => {
2385                return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing))
2386            }
2387            _ => {}
2388        }
2389        if !self.bump_and_bump_space() {
2390            return Err(self.error(
2391                Span::new(start, self.pos()),
2392                ast::ErrorKind::RepetitionCountUnclosed,
2393            ));
2394        }
2395        let count_start = specialize_err(
2396            self.parse_decimal(),
2397            ast::ErrorKind::DecimalEmpty,
2398            ast::ErrorKind::RepetitionCountDecimalEmpty,
2399        );
2400        if self.is_eof() {
2401            return Err(self.error(
2402                Span::new(start, self.pos()),
2403                ast::ErrorKind::RepetitionCountUnclosed,
2404            ));
2405        }
2406        let range = if self.char() == ',' {
2407            if !self.bump_and_bump_space() {
2408                return Err(self.error(
2409                    Span::new(start, self.pos()),
2410                    ast::ErrorKind::RepetitionCountUnclosed,
2411                ));
2412            }
2413            if self.char() != '}' {
2414                let count_start = match count_start {
2415                    Ok(c) => c,
2416                    Err(err) if err.kind == ast::ErrorKind::RepetitionCountDecimalEmpty => {
2417                        if self.parser().empty_min_range {
2418                            0
2419                        } else {
2420                            return Err(err);
2421                        }
2422                    }
2423                    err => err?,
2424                };
2425                let count_end = specialize_err(
2426                    self.parse_decimal(),
2427                    ast::ErrorKind::DecimalEmpty,
2428                    ast::ErrorKind::RepetitionCountDecimalEmpty,
2429                )?;
2430                ast::RepetitionRange::Bounded(count_start, count_end)
2431            } else {
2432                ast::RepetitionRange::AtLeast(count_start?)
2433            }
2434        } else {
2435            ast::RepetitionRange::Exactly(count_start?)
2436        };
2437
2438        if self.is_eof() || self.char() != '}' {
2439            return Err(self.error(
2440                Span::new(start, self.pos()),
2441                ast::ErrorKind::RepetitionCountUnclosed,
2442            ));
2443        }
2444
2445        if self.bump_and_bump_space() && self.char() == '?' {
2446            return Err(self.error(
2447                Span::new(start, self.pos()),
2448                ast::ErrorKind::UnsupportedLazyQuantifier,
2449            ));
2450        }
2451
2452        let op_span = Span::new(start, self.pos());
2453        if !range.is_valid() {
2454            return Err(self.error(op_span, ast::ErrorKind::RepetitionCountInvalid));
2455        }
2456
2457        let over_limit = match &range {
2458            ast::RepetitionRange::Exactly(n) => *n > self.max_repeat,
2459            ast::RepetitionRange::AtLeast(n) => *n > self.max_repeat,
2460            ast::RepetitionRange::Bounded(n, m) => *n > self.max_repeat || *m > self.max_repeat,
2461        };
2462        if over_limit {
2463            return Err(self.error(op_span, ast::ErrorKind::UnsupportedResharpRegex));
2464        }
2465        concat.asts.push(Ast::repetition(ast::Repetition {
2466            span: ast.span().with_end(self.pos()),
2467            op: ast::RepetitionOp {
2468                span: op_span,
2469                kind: ast::RepetitionKind::Range(range),
2470            },
2471            greedy: true,
2472            ast: Box::new(ast),
2473        }));
2474        Ok(concat)
2475    }
2476
2477    #[inline(never)]
2478    fn parse_group(&self) -> Result<Either<ast::SetFlags, ast::Group>> {
2479        assert_eq!(self.char(), '(');
2480        let open_span = self.span_char();
2481        self.bump();
2482        self.bump_space();
2483        if let Some((ahead, pos)) = self.is_lookaround_prefix() {
2484            let kind = match (pos, ahead) {
2485                (true, true) => LookaroundKind::PositiveLookahead,
2486                (true, false) => LookaroundKind::PositiveLookbehind,
2487                (false, true) => LookaroundKind::NegativeLookahead,
2488                (false, false) => LookaroundKind::NegativeLookbehind,
2489            };
2490            return Ok(Either::Right(ast::Group {
2491                span: open_span,
2492                kind: ast::GroupKind::Lookaround(kind),
2493                ast: Box::new(Ast::empty(self.span())),
2494            }));
2495        }
2496        let inner_span = self.span();
2497        let mut starts_with_p = true;
2498        if self.bump_if("?P<") || {
2499            starts_with_p = false;
2500            self.bump_if("?<")
2501        } {
2502            let capture_index = self.next_capture_index(open_span)?;
2503            let name = self.parse_capture_name(capture_index)?;
2504            Ok(Either::Right(ast::Group {
2505                span: open_span,
2506                kind: ast::GroupKind::CaptureName {
2507                    starts_with_p,
2508                    name,
2509                },
2510                ast: Box::new(Ast::empty(self.span())),
2511            }))
2512        } else if self.bump_if("?") {
2513            if self.is_eof() {
2514                return Err(self.error(open_span, ast::ErrorKind::GroupUnclosed));
2515            }
2516            let flags = self.parse_flags()?;
2517            let char_end = self.char();
2518            self.bump();
2519            if char_end == ')' {
2520                // We don't allow empty flags, e.g., `(?)`. We instead
2521                // interpret it as a repetition operator missing its argument.
2522                if flags.items.is_empty() {
2523                    return Err(self.error(inner_span, ast::ErrorKind::RepetitionMissing));
2524                }
2525                Ok(Either::Left(ast::SetFlags {
2526                    span: Span {
2527                        end: self.pos(),
2528                        ..open_span
2529                    },
2530                    flags,
2531                }))
2532            } else {
2533                assert_eq!(char_end, ':');
2534                Ok(Either::Right(ast::Group {
2535                    span: open_span,
2536                    kind: ast::GroupKind::NonCapturing(flags),
2537                    ast: Box::new(Ast::empty(self.span())),
2538                }))
2539            }
2540        } else {
2541            let capture_index = self.next_capture_index(open_span)?;
2542            Ok(Either::Right(ast::Group {
2543                span: open_span,
2544                kind: ast::GroupKind::CaptureIndex(capture_index),
2545                ast: Box::new(Ast::empty(self.span())),
2546            }))
2547        }
2548    }
2549
2550    #[inline(never)]
2551    fn parse_capture_name(&self, capture_index: u32) -> Result<ast::CaptureName> {
2552        if self.is_eof() {
2553            return Err(self.error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
2554        }
2555        let start = self.pos();
2556        loop {
2557            if self.char() == '>' {
2558                break;
2559            }
2560            if !is_capture_char(self.char(), self.pos() == start) {
2561                return Err(self.error(self.span_char(), ast::ErrorKind::GroupNameInvalid));
2562            }
2563            if !self.bump() {
2564                break;
2565            }
2566        }
2567        let end = self.pos();
2568        if self.is_eof() {
2569            return Err(self.error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
2570        }
2571        assert_eq!(self.char(), '>');
2572        self.bump();
2573        let name = &self.pattern()[start.offset..end.offset];
2574        if name.is_empty() {
2575            return Err(self.error(Span::new(start, start), ast::ErrorKind::GroupNameEmpty));
2576        }
2577        let capname = ast::CaptureName {
2578            span: Span::new(start, end),
2579            name: name.to_string(),
2580            index: capture_index,
2581        };
2582        self.add_capture_name(&capname)?;
2583        Ok(capname)
2584    }
2585
2586    #[inline(never)]
2587    fn parse_flags(&self) -> Result<ast::Flags> {
2588        let mut flags = ast::Flags {
2589            span: self.span(),
2590            items: vec![],
2591        };
2592        let mut last_was_negation = None;
2593        while self.char() != ':' && self.char() != ')' {
2594            if self.char() == '-' {
2595                last_was_negation = Some(self.span_char());
2596                let item = ast::FlagsItem {
2597                    span: self.span_char(),
2598                    kind: ast::FlagsItemKind::Negation,
2599                };
2600                if let Some(i) = flags.add_item(item) {
2601                    return Err(self.error(
2602                        self.span_char(),
2603                        ast::ErrorKind::FlagRepeatedNegation {
2604                            original: flags.items[i].span,
2605                        },
2606                    ));
2607                }
2608            } else {
2609                last_was_negation = None;
2610                let item = ast::FlagsItem {
2611                    span: self.span_char(),
2612                    kind: ast::FlagsItemKind::Flag(self.parse_flag()?),
2613                };
2614                if let Some(i) = flags.add_item(item) {
2615                    return Err(self.error(
2616                        self.span_char(),
2617                        ast::ErrorKind::FlagDuplicate {
2618                            original: flags.items[i].span,
2619                        },
2620                    ));
2621                }
2622            }
2623            if !self.bump() {
2624                return Err(self.error(self.span(), ast::ErrorKind::FlagUnexpectedEof));
2625            }
2626        }
2627        if let Some(span) = last_was_negation {
2628            return Err(self.error(span, ast::ErrorKind::FlagDanglingNegation));
2629        }
2630        flags.span.end = self.pos();
2631        Ok(flags)
2632    }
2633
2634    #[inline(never)]
2635    fn parse_flag(&self) -> Result<ast::Flag> {
2636        match self.char() {
2637            'i' => Ok(ast::Flag::CaseInsensitive),
2638            'm' => Ok(ast::Flag::MultiLine),
2639            's' => Ok(ast::Flag::DotMatchesNewLine),
2640            'U' => Ok(ast::Flag::SwapGreed),
2641            'u' => Ok(ast::Flag::Unicode),
2642            'R' => Ok(ast::Flag::CRLF),
2643            'x' => Ok(ast::Flag::IgnoreWhitespace),
2644            _ => Err(self.error(self.span_char(), ast::ErrorKind::FlagUnrecognized)),
2645        }
2646    }
2647
2648    fn parse_primitive(&self) -> Result<Primitive> {
2649        match self.char() {
2650            '\\' => self.parse_escape(),
2651            '_' => {
2652                let ast = Primitive::Top(self.span_char());
2653                self.bump();
2654                Ok(ast)
2655            }
2656            '.' => {
2657                let ast = Primitive::Dot(self.span_char());
2658                self.bump();
2659                Ok(ast)
2660            }
2661            '^' => {
2662                let ast = Primitive::Assertion(ast::Assertion {
2663                    span: self.span_char(),
2664                    kind: ast::AssertionKind::StartLine,
2665                });
2666                self.bump();
2667                Ok(ast)
2668            }
2669            '$' => {
2670                let ast = Primitive::Assertion(ast::Assertion {
2671                    span: self.span_char(),
2672                    kind: ast::AssertionKind::EndLine,
2673                });
2674                self.bump();
2675                Ok(ast)
2676            }
2677            c => {
2678                let ast = Primitive::Literal(Literal {
2679                    span: self.span_char(),
2680                    kind: LiteralKind::Verbatim,
2681                    c,
2682                });
2683                self.bump();
2684                Ok(ast)
2685            }
2686        }
2687    }
2688
2689    #[inline(never)]
2690    fn parse_escape(&self) -> Result<Primitive> {
2691        assert_eq!(self.char(), '\\');
2692        let start = self.pos();
2693        if !self.bump() {
2694            return Err(self.error(
2695                Span::new(start, self.pos()),
2696                ast::ErrorKind::EscapeUnexpectedEof,
2697            ));
2698        }
2699        let c = self.char();
2700        // Put some of the more complicated routines into helpers.
2701        match c {
2702            '0'..='9' => {
2703                if !self.parser().octal {
2704                    return Err(self.error(
2705                        Span::new(start, self.span_char().end),
2706                        ast::ErrorKind::UnsupportedBackreference,
2707                    ));
2708                }
2709                let mut lit = self.parse_octal();
2710                lit.span.start = start;
2711                return Ok(Primitive::Literal(lit));
2712            }
2713            'x' | 'u' | 'U' => {
2714                let mut lit = self.parse_hex()?;
2715                lit.span.start = start;
2716                return Ok(Primitive::Literal(lit));
2717            }
2718            'p' | 'P' => {
2719                let mut cls = self.parse_unicode_class()?;
2720                cls.span.start = start;
2721                return Ok(Primitive::Unicode(cls));
2722            }
2723            'd' | 's' | 'w' | 'D' | 'S' | 'W' => {
2724                let mut cls = self.parse_perl_class();
2725                cls.span.start = start;
2726                return Ok(Primitive::Perl(cls));
2727            }
2728            _ => {}
2729        }
2730
2731        // Handle all of the one letter sequences inline.
2732        self.bump();
2733        let span = Span::new(start, self.pos());
2734        if is_meta_character(c) {
2735            return Ok(Primitive::Literal(Literal {
2736                span,
2737                kind: LiteralKind::Meta,
2738                c,
2739            }));
2740        }
2741        if is_escapeable_character(c) {
2742            return Ok(Primitive::Literal(Literal {
2743                span,
2744                kind: LiteralKind::Superfluous,
2745                c,
2746            }));
2747        }
2748        let special = |kind, c| {
2749            Ok(Primitive::Literal(Literal {
2750                span,
2751                kind: LiteralKind::Special(kind),
2752                c,
2753            }))
2754        };
2755        match c {
2756            'a' => special(SpecialLiteralKind::Bell, '\x07'),
2757            'f' => special(SpecialLiteralKind::FormFeed, '\x0C'),
2758            't' => special(SpecialLiteralKind::Tab, '\t'),
2759            'n' => special(SpecialLiteralKind::LineFeed, '\n'),
2760            'r' => special(SpecialLiteralKind::CarriageReturn, '\r'),
2761            'v' => special(SpecialLiteralKind::VerticalTab, '\x0B'),
2762            'A' => Ok(Primitive::Assertion(ast::Assertion {
2763                span,
2764                kind: ast::AssertionKind::StartText,
2765            })),
2766            'z' => Ok(Primitive::Assertion(ast::Assertion {
2767                span,
2768                kind: ast::AssertionKind::EndText,
2769            })),
2770            'b' => {
2771                let mut wb = ast::Assertion {
2772                    span,
2773                    kind: ast::AssertionKind::WordBoundary,
2774                };
2775                // After a \b, we "try" to parse things like \b{start} for
2776                // special word boundary assertions.
2777                if !self.is_eof() && self.char() == '{' {
2778                    if let Some(kind) = self.maybe_parse_special_word_boundary(start)? {
2779                        wb.kind = kind;
2780                        wb.span.end = self.pos();
2781                    }
2782                }
2783                Ok(Primitive::Assertion(wb))
2784            }
2785            'B' => Ok(Primitive::Assertion(ast::Assertion {
2786                span,
2787                kind: ast::AssertionKind::NotWordBoundary,
2788            })),
2789            '<' => Ok(Primitive::Assertion(ast::Assertion {
2790                span,
2791                kind: ast::AssertionKind::WordBoundaryStartAngle,
2792            })),
2793            '>' => Ok(Primitive::Assertion(ast::Assertion {
2794                span,
2795                kind: ast::AssertionKind::WordBoundaryEndAngle,
2796            })),
2797            _ => Err(self.error(span, ast::ErrorKind::EscapeUnrecognized)),
2798        }
2799    }
2800
2801    fn maybe_parse_special_word_boundary(
2802        &self,
2803        wb_start: Position,
2804    ) -> Result<Option<ast::AssertionKind>> {
2805        assert_eq!(self.char(), '{');
2806
2807        let is_valid_char = |c| matches!(c, 'A'..='Z' | 'a'..='z' | '-');
2808        let start = self.pos();
2809        if !self.bump_and_bump_space() {
2810            return Err(self.error(
2811                Span::new(wb_start, self.pos()),
2812                ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof,
2813            ));
2814        }
2815        let start_contents = self.pos();
2816        if !is_valid_char(self.char()) {
2817            self.parser().pos.set(start);
2818            return Ok(None);
2819        }
2820
2821        // Now collect up our chars until we see a '}'.
2822        let mut scratch = self.parser().scratch.borrow_mut();
2823        scratch.clear();
2824        while !self.is_eof() && is_valid_char(self.char()) {
2825            scratch.push(self.char());
2826            self.bump_and_bump_space();
2827        }
2828        if self.is_eof() || self.char() != '}' {
2829            return Err(self.error(
2830                Span::new(start, self.pos()),
2831                ast::ErrorKind::SpecialWordBoundaryUnclosed,
2832            ));
2833        }
2834        let end = self.pos();
2835        self.bump();
2836        let kind = match scratch.as_str() {
2837            "start" => ast::AssertionKind::WordBoundaryStart,
2838            "end" => ast::AssertionKind::WordBoundaryEnd,
2839            "start-half" => ast::AssertionKind::WordBoundaryStartHalf,
2840            "end-half" => ast::AssertionKind::WordBoundaryEndHalf,
2841            _ => {
2842                return Err(self.error(
2843                    Span::new(start_contents, end),
2844                    ast::ErrorKind::SpecialWordBoundaryUnrecognized,
2845                ))
2846            }
2847        };
2848        Ok(Some(kind))
2849    }
2850
2851    #[inline(never)]
2852    fn parse_octal(&self) -> Literal {
2853        assert!(self.parser().octal);
2854        assert!('0' <= self.char() && self.char() <= '7');
2855        let start = self.pos();
2856        // Parse up to two more digits.
2857        while self.bump()
2858            && '0' <= self.char()
2859            && self.char() <= '7'
2860            && self.pos().offset - start.offset <= 2
2861        {}
2862        let end = self.pos();
2863        let octal = &self.pattern()[start.offset..end.offset];
2864        // Parsing the octal should never fail since the above guarantees a
2865        // valid number.
2866        let codepoint = u32::from_str_radix(octal, 8).expect("valid octal number");
2867        // The max value for 3 digit octal is 0777 = 511 and [0, 511] has no
2868        // invalid Unicode scalar values.
2869        let c = char::from_u32(codepoint).expect("Unicode scalar value");
2870        Literal {
2871            span: Span::new(start, end),
2872            kind: LiteralKind::Octal,
2873            c,
2874        }
2875    }
2876
2877    #[inline(never)]
2878    fn parse_hex(&self) -> Result<Literal> {
2879        assert!(self.char() == 'x' || self.char() == 'u' || self.char() == 'U');
2880
2881        let hex_kind = match self.char() {
2882            'x' => HexLiteralKind::X,
2883            'u' => HexLiteralKind::UnicodeShort,
2884            _ => HexLiteralKind::UnicodeLong,
2885        };
2886        if !self.bump_and_bump_space() {
2887            return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2888        }
2889        if self.char() == '{' {
2890            self.parse_hex_brace(hex_kind)
2891        } else {
2892            self.parse_hex_digits(hex_kind)
2893        }
2894    }
2895
2896    #[inline(never)]
2897    fn parse_hex_digits(&self, kind: HexLiteralKind) -> Result<Literal> {
2898        let mut scratch = self.parser().scratch.borrow_mut();
2899        scratch.clear();
2900
2901        let start = self.pos();
2902        for i in 0..kind.digits() {
2903            if i > 0 && !self.bump_and_bump_space() {
2904                return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2905            }
2906            if !is_hex(self.char()) {
2907                return Err(self.error(self.span_char(), ast::ErrorKind::EscapeHexInvalidDigit));
2908            }
2909            scratch.push(self.char());
2910        }
2911        self.bump_and_bump_space();
2912        let end = self.pos();
2913        let hex = scratch.as_str();
2914        match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
2915            None => Err(self.error(Span::new(start, end), ast::ErrorKind::EscapeHexInvalid)),
2916            Some(c) => Ok(Literal {
2917                span: Span::new(start, end),
2918                kind: LiteralKind::HexFixed(kind),
2919                c,
2920            }),
2921        }
2922    }
2923
2924    #[inline(never)]
2925    fn parse_hex_brace(&self, kind: HexLiteralKind) -> Result<Literal> {
2926        let mut scratch = self.parser().scratch.borrow_mut();
2927        scratch.clear();
2928
2929        let brace_pos = self.pos();
2930        let start = self.span_char().end;
2931        while self.bump_and_bump_space() && self.char() != '}' {
2932            if !is_hex(self.char()) {
2933                return Err(self.error(self.span_char(), ast::ErrorKind::EscapeHexInvalidDigit));
2934            }
2935            scratch.push(self.char());
2936        }
2937        if self.is_eof() {
2938            return Err(self.error(
2939                Span::new(brace_pos, self.pos()),
2940                ast::ErrorKind::EscapeUnexpectedEof,
2941            ));
2942        }
2943        let end = self.pos();
2944        let hex = scratch.as_str();
2945        assert_eq!(self.char(), '}');
2946        self.bump_and_bump_space();
2947
2948        if hex.is_empty() {
2949            return Err(self.error(
2950                Span::new(brace_pos, self.pos()),
2951                ast::ErrorKind::EscapeHexEmpty,
2952            ));
2953        }
2954        match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
2955            None => Err(self.error(Span::new(start, end), ast::ErrorKind::EscapeHexInvalid)),
2956            Some(c) => Ok(Literal {
2957                span: Span::new(start, self.pos()),
2958                kind: LiteralKind::HexBrace(kind),
2959                c,
2960            }),
2961        }
2962    }
2963
2964    fn parse_decimal(&self) -> Result<u32> {
2965        let mut scratch = self.parser().scratch.borrow_mut();
2966        scratch.clear();
2967
2968        while !self.is_eof() && self.char().is_whitespace() {
2969            self.bump();
2970        }
2971        let start = self.pos();
2972        while !self.is_eof() && '0' <= self.char() && self.char() <= '9' {
2973            scratch.push(self.char());
2974            self.bump_and_bump_space();
2975        }
2976        let span = Span::new(start, self.pos());
2977        while !self.is_eof() && self.char().is_whitespace() {
2978            self.bump_and_bump_space();
2979        }
2980        let digits = scratch.as_str();
2981        if digits.is_empty() {
2982            return Err(self.error(span, ast::ErrorKind::DecimalEmpty));
2983        }
2984        match digits.parse::<u32>().ok() {
2985            Some(n) => Ok(n),
2986            None => Err(self.error(span, ast::ErrorKind::DecimalInvalid)),
2987        }
2988    }
2989
2990    #[inline(never)]
2991    fn parse_set_class(&self) -> Result<ClassBracketed> {
2992        assert_eq!(self.char(), '[');
2993
2994        let mut union = ClassSetUnion {
2995            span: self.span(),
2996            items: vec![],
2997        };
2998        loop {
2999            self.bump_space();
3000            if self.is_eof() {
3001                return Err(self.unclosed_class_error());
3002            }
3003            match self.char() {
3004                '[' => {
3005                    if !self.parser().stack_class.borrow().is_empty() {
3006                        if let Some(cls) = self.maybe_parse_ascii_class() {
3007                            union.push(ClassSetItem::Ascii(cls));
3008                            continue;
3009                        }
3010                    }
3011                    union = self.push_class_open(union)?;
3012                }
3013                ']' => match self.pop_class(union)? {
3014                    Either::Left(nested_union) => {
3015                        union = nested_union;
3016                    }
3017                    Either::Right(class) => return Ok(class),
3018                },
3019                '&' if self.peek() == Some('&') => {
3020                    assert!(self.bump_if("&&"));
3021                    union = self.push_class_op(ClassSetBinaryOpKind::Intersection, union);
3022                }
3023                '-' if self.peek() == Some('-') => {
3024                    assert!(self.bump_if("--"));
3025                    union = self.push_class_op(ClassSetBinaryOpKind::Difference, union);
3026                }
3027                '~' if self.peek() == Some('~') => {
3028                    assert!(self.bump_if("~~"));
3029                    union = self.push_class_op(ClassSetBinaryOpKind::SymmetricDifference, union);
3030                }
3031                _ => {
3032                    union.push(self.parse_set_class_range()?);
3033                }
3034            }
3035        }
3036    }
3037
3038    #[inline(never)]
3039    fn parse_set_class_range(&self) -> Result<ClassSetItem> {
3040        let prim1 = self.parse_set_class_item()?;
3041        self.bump_space();
3042        if self.is_eof() {
3043            return Err(self.unclosed_class_error());
3044        }
3045        if self.char() != '-' || self.peek_space() == Some(']') || self.peek_space() == Some('-') {
3046            return prim1.into_class_set_item(self);
3047        }
3048        if !self.bump_and_bump_space() {
3049            return Err(self.unclosed_class_error());
3050        }
3051        let prim2 = self.parse_set_class_item()?;
3052        let range = ClassSetRange {
3053            span: Span::new(prim1.span().start, prim2.span().end),
3054            start: prim1.into_class_literal(self)?,
3055            end: prim2.into_class_literal(self)?,
3056        };
3057        if !range.is_valid() {
3058            return Err(self.error(range.span, ast::ErrorKind::ClassRangeInvalid));
3059        }
3060        Ok(ClassSetItem::Range(range))
3061    }
3062
3063    #[inline(never)]
3064    fn parse_set_class_item(&self) -> Result<Primitive> {
3065        if self.char() == '\\' {
3066            self.parse_escape()
3067        } else {
3068            let x = Primitive::Literal(Literal {
3069                span: self.span_char(),
3070                kind: LiteralKind::Verbatim,
3071                c: self.char(),
3072            });
3073            self.bump();
3074            Ok(x)
3075        }
3076    }
3077
3078    #[inline(never)]
3079    fn parse_set_class_open(&self) -> Result<(ClassBracketed, ClassSetUnion)> {
3080        assert_eq!(self.char(), '[');
3081        let start = self.pos();
3082        if !self.bump_and_bump_space() {
3083            return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
3084        }
3085
3086        let negated = if self.char() != '^' {
3087            false
3088        } else {
3089            if !self.bump_and_bump_space() {
3090                return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
3091            }
3092            true
3093        };
3094        // Accept any number of `-` as literal `-`.
3095        let mut union = ClassSetUnion {
3096            span: self.span(),
3097            items: vec![],
3098        };
3099        while self.char() == '-' {
3100            union.push(ClassSetItem::Literal(Literal {
3101                span: self.span_char(),
3102                kind: LiteralKind::Verbatim,
3103                c: '-',
3104            }));
3105            if !self.bump_and_bump_space() {
3106                return Err(self.error(Span::new(start, start), ast::ErrorKind::ClassUnclosed));
3107            }
3108        }
3109        // If `]` is the *first* char in a set, then interpret it as a literal
3110        // `]`. That is, an empty class is impossible to write.
3111        if union.items.is_empty() && self.char() == ']' {
3112            union.push(ClassSetItem::Literal(Literal {
3113                span: self.span_char(),
3114                kind: LiteralKind::Verbatim,
3115                c: ']',
3116            }));
3117            if !self.bump_and_bump_space() {
3118                return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
3119            }
3120        }
3121        let set = ClassBracketed {
3122            span: Span::new(start, self.pos()),
3123            negated,
3124            kind: ClassSet::union(ClassSetUnion {
3125                span: Span::new(union.span.start, union.span.start),
3126                items: vec![],
3127            }),
3128        };
3129        Ok((set, union))
3130    }
3131
3132    #[inline(never)]
3133    fn maybe_parse_ascii_class(&self) -> Option<ClassAscii> {
3134        assert_eq!(self.char(), '[');
3135        // If parsing fails, then we back up the parser to this starting point.
3136        let start = self.pos();
3137        let mut negated = false;
3138        if !self.bump() || self.char() != ':' {
3139            self.parser().pos.set(start);
3140            return None;
3141        }
3142        if !self.bump() {
3143            self.parser().pos.set(start);
3144            return None;
3145        }
3146        if self.char() == '^' {
3147            negated = true;
3148            if !self.bump() {
3149                self.parser().pos.set(start);
3150                return None;
3151            }
3152        }
3153        let name_start = self.offset();
3154        while self.char() != ':' && self.bump() {}
3155        if self.is_eof() {
3156            self.parser().pos.set(start);
3157            return None;
3158        }
3159        let name = &self.pattern()[name_start..self.offset()];
3160        if !self.bump_if(":]") {
3161            self.parser().pos.set(start);
3162            return None;
3163        }
3164        let kind = match regex_syntax::ast::ClassAsciiKind::from_name(name) {
3165            Some(kind) => kind,
3166            None => {
3167                self.parser().pos.set(start);
3168                return None;
3169            }
3170        };
3171        Some(ClassAscii {
3172            span: Span::new(start, self.pos()),
3173            kind,
3174            negated,
3175        })
3176    }
3177
3178    #[inline(never)]
3179    fn parse_unicode_class(&self) -> Result<ClassUnicode> {
3180        assert!(self.char() == 'p' || self.char() == 'P');
3181
3182        let mut scratch = self.parser().scratch.borrow_mut();
3183        scratch.clear();
3184
3185        let negated = self.char() == 'P';
3186        if !self.bump_and_bump_space() {
3187            return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
3188        }
3189        let (start, kind) = if self.char() == '{' {
3190            let start = self.span_char().end;
3191            while self.bump_and_bump_space() && self.char() != '}' {
3192                scratch.push(self.char());
3193            }
3194            if self.is_eof() {
3195                return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
3196            }
3197            assert_eq!(self.char(), '}');
3198            self.bump();
3199
3200            let name = scratch.as_str();
3201            if let Some(i) = name.find("!=") {
3202                (
3203                    start,
3204                    ClassUnicodeKind::NamedValue {
3205                        op: ClassUnicodeOpKind::NotEqual,
3206                        name: name[..i].to_string(),
3207                        value: name[i + 2..].to_string(),
3208                    },
3209                )
3210            } else if let Some(i) = name.find(':') {
3211                (
3212                    start,
3213                    ClassUnicodeKind::NamedValue {
3214                        op: ClassUnicodeOpKind::Colon,
3215                        name: name[..i].to_string(),
3216                        value: name[i + 1..].to_string(),
3217                    },
3218                )
3219            } else if let Some(i) = name.find('=') {
3220                (
3221                    start,
3222                    ClassUnicodeKind::NamedValue {
3223                        op: ClassUnicodeOpKind::Equal,
3224                        name: name[..i].to_string(),
3225                        value: name[i + 1..].to_string(),
3226                    },
3227                )
3228            } else {
3229                (start, ClassUnicodeKind::Named(name.to_string()))
3230            }
3231        } else {
3232            let start = self.pos();
3233            let c = self.char();
3234            if c == '\\' {
3235                return Err(self.error(self.span_char(), ast::ErrorKind::UnicodeClassInvalid));
3236            }
3237            self.bump_and_bump_space();
3238            let kind = ClassUnicodeKind::OneLetter(c);
3239            (start, kind)
3240        };
3241        Ok(ClassUnicode {
3242            span: Span::new(start, self.pos()),
3243            negated,
3244            kind,
3245        })
3246    }
3247
3248    #[inline(never)]
3249    fn parse_perl_class(&self) -> ClassPerl {
3250        let c = self.char();
3251        let span = self.span_char();
3252        self.bump();
3253        let (negated, kind) = match c {
3254            'd' => (false, regex_syntax::ast::ClassPerlKind::Digit),
3255            'D' => (true, regex_syntax::ast::ClassPerlKind::Digit),
3256            's' => (false, regex_syntax::ast::ClassPerlKind::Space),
3257            'S' => (true, regex_syntax::ast::ClassPerlKind::Space),
3258            'w' => (false, regex_syntax::ast::ClassPerlKind::Word),
3259            'W' => (true, regex_syntax::ast::ClassPerlKind::Word),
3260            c => panic!("expected valid Perl class but got '{}'", c),
3261        };
3262        ClassPerl {
3263            span,
3264            kind,
3265            negated,
3266        }
3267    }
3268}
3269
3270/// `[\s\S]`, `[\w\W]`, `[\d\D]` etc into `_` canonicalization.
3271/// should really be done as a u32 BDD but good enough for now
3272fn is_universal_perl_pair(item: &regex_syntax::ast::ClassSetItem) -> bool {
3273    use regex_syntax::ast::ClassSetItem;
3274    let items = match item {
3275        ClassSetItem::Union(u) => &u.items,
3276        _ => return false,
3277    };
3278    if items.len() != 2 {
3279        return false;
3280    }
3281    match (&items[0], &items[1]) {
3282        (ClassSetItem::Perl(a), ClassSetItem::Perl(b)) => {
3283            let is_all = a.kind == b.kind && a.negated != b.negated;
3284            is_all
3285        }
3286        _ => false,
3287    }
3288}
3289
3290pub fn max_concat_length(ast: &ast::Ast) -> usize {
3291    match ast {
3292        ast::Ast::Empty(_)
3293        | ast::Ast::Flags(_)
3294        | ast::Ast::Literal(_)
3295        | ast::Ast::Dot(_)
3296        | ast::Ast::Top(_)
3297        | ast::Ast::Assertion(_)
3298        | ast::Ast::ClassUnicode(_)
3299        | ast::Ast::ClassPerl(_)
3300        | ast::Ast::ClassBracketed(_) => 0,
3301        ast::Ast::Group(g) => max_concat_length(&g.ast),
3302        ast::Ast::Complement(c) => max_concat_length(&c.ast),
3303        ast::Ast::Lookaround(l) => max_concat_length(&l.ast),
3304        ast::Ast::Repetition(r) => max_concat_length(&r.ast),
3305        ast::Ast::Concat(c) => c
3306            .asts
3307            .len()
3308            .max(c.asts.iter().map(max_concat_length).max().unwrap_or(0)),
3309        ast::Ast::Alternation(a) => a.asts.iter().map(max_concat_length).max().unwrap_or(0),
3310        ast::Ast::Intersection(i) => i.asts.iter().map(max_concat_length).max().unwrap_or(0),
3311    }
3312}
3313
3314pub fn expanded_ast_size(ast: &ast::Ast, limit: u64) -> u64 {
3315    fn go(ast: &ast::Ast, limit: u64) -> u64 {
3316        match ast {
3317            ast::Ast::Empty(_) | ast::Ast::Flags(_) => 1,
3318            ast::Ast::Literal(_) | ast::Ast::Dot(_) | ast::Ast::Top(_) => 1,
3319            ast::Ast::Assertion(_) => 1,
3320            ast::Ast::ClassUnicode(_) | ast::Ast::ClassPerl(_) | ast::Ast::ClassBracketed(_) => 1,
3321            ast::Ast::Group(g) => go(&g.ast, limit).saturating_add(1).min(limit),
3322            ast::Ast::Complement(c) => go(&c.ast, limit).saturating_add(1).min(limit),
3323            ast::Ast::Lookaround(l) => go(&l.ast, limit).saturating_add(1).min(limit),
3324            ast::Ast::Concat(c) => sum_children(&c.asts, limit),
3325            ast::Ast::Alternation(a) => sum_children(&a.asts, limit),
3326            ast::Ast::Intersection(i) => sum_children(&i.asts, limit),
3327            ast::Ast::Repetition(r) => {
3328                let body = go(&r.ast, limit);
3329                let factor: u64 = match &r.op.kind {
3330                    ast::RepetitionKind::ZeroOrOne => 2,
3331                    ast::RepetitionKind::ZeroOrMore | ast::RepetitionKind::OneOrMore => 2,
3332                    ast::RepetitionKind::Range(ast::RepetitionRange::Exactly(n)) => {
3333                        (*n as u64).max(1)
3334                    }
3335                    ast::RepetitionKind::Range(ast::RepetitionRange::AtLeast(n)) => {
3336                        (*n as u64).max(1).saturating_add(1)
3337                    }
3338                    ast::RepetitionKind::Range(ast::RepetitionRange::Bounded(_, m)) => {
3339                        (*m as u64).max(1)
3340                    }
3341                };
3342                body.saturating_mul(factor).min(limit)
3343            }
3344        }
3345    }
3346    fn sum_children(children: &[ast::Ast], limit: u64) -> u64 {
3347        let mut total: u64 = 0;
3348        for c in children {
3349            total = total.saturating_add(go(c, limit));
3350            if total >= limit {
3351                return limit;
3352            }
3353        }
3354        total
3355    }
3356    go(ast, limit)
3357}
3358
3359pub fn parse_ast<'s>(tb: &mut TB<'s>, pattern: &'s str) -> std::result::Result<NodeId, ParseError> {
3360    let mut p: ResharpParser<'s> = ResharpParser::new(pattern);
3361    p.parse(tb)
3362}
3363
3364pub fn parse_ast_with<'s>(
3365    tb: &mut TB<'s>,
3366    pattern: &'s str,
3367    flags: &PatternFlags,
3368) -> std::result::Result<NodeId, ParseError> {
3369    let mut p: ResharpParser<'s> = ResharpParser::with_flags(pattern, flags);
3370    p.parse(tb)
3371}
3372
3373/// Parse a pattern into the raw AST without converting to algebra nodes.
3374pub fn parse_to_ast(pattern: &str) -> std::result::Result<ast::Ast, ParseError> {
3375    let mut p: ResharpParser = ResharpParser::new(pattern);
3376    p.parse_inner()
3377}
resharp_parser/lib.rs

resharp_parser/
lib.rs