resharp_parser/
lib.rs

1//! Parser for resharp regex patterns.
2//!
3//! Converts regex pattern strings into the node representation used by resharp-algebra.
4
5#![warn(dead_code)]
6pub mod ast;
7use std::cell::{Cell, RefCell};
8
9use ast::{Ast, Concat, ErrorKind, GroupKind, LookaroundKind, RepetitionKind};
10use regex_syntax::{
11    ast::{
12        ClassAscii, ClassBracketed, ClassPerl, ClassSet, ClassSetBinaryOpKind, ClassSetItem,
13        ClassSetRange, ClassSetUnion, ClassUnicode, ClassUnicodeKind, ClassUnicodeOpKind,
14        HexLiteralKind, Literal, LiteralKind, Position, Span, SpecialLiteralKind,
15    },
16    hir::{
17        self,
18        translate::{Translator, TranslatorBuilder},
19    },
20    utf8::Utf8Sequences,
21};
22use resharp_algebra::{Kind, NodeId};
23
24type TB<'s> = resharp_algebra::RegexBuilder;
25
26/// global pattern-level flags, set from `RegexOptions`.
27pub struct PatternFlags {
28    /// `\w`/`\d`/`\s` match full Unicode (true) or ASCII only (false).
29    pub unicode: bool,
30    /// `\w` covers all Unicode word chars including 3- and 4-byte sequences.
31    pub full_unicode: bool,
32    /// global case-insensitive matching.
33    pub case_insensitive: bool,
34    /// `.` matches `\n` (behaves like `_`).
35    pub dot_matches_new_line: bool,
36    /// `^` and `$` match at line boundaries (`\n`) in addition to text boundaries.
37    pub multiline: bool,
38    /// allow whitespace and `#` comments in the pattern.
39    pub ignore_whitespace: bool,
40    /// ASCII `\w`/`\d`/`\s` tables, but
41    /// negated perl classes (`\W`/`\D`/`\S`) and `.` match a full codepoint
42    pub ascii_perl_classes: bool,
43    /// max upper bound on `expanded_ast_size` before parser rejects the
44    /// pattern as too complex. default 50_000.
45    pub expanded_ast_limit: u64,
46    /// max children allowed in any single `Concat`/`Alternation`/
47    /// `Intersection` node. default 4_000.
48    pub max_list_len: usize,
49    /// max upper bound on bounded repetition `{n,m}`. default 500.
50    pub max_repeat: u32,
51    pub max_depth: usize,
52}
53
54// arbitrary safeguards, these will not prevent intentional DoS patterns
55// more to protect you from shooting yourself in the foot
56pub const DEFAULT_MAX_REPEAT: u32 = 500;
57pub const DEFAULT_EXPANDED_AST_LIMIT: u64 = 50_000;
58pub const DEFAULT_MAX_LIST_LEN: usize = 4_000;
59pub const DEFAULT_MAX_DEPTH: usize = 1_000;
60
61impl Default for PatternFlags {
62    fn default() -> Self {
63        Self {
64            unicode: true,
65            full_unicode: false,
66            case_insensitive: false,
67            dot_matches_new_line: false,
68            multiline: true,
69            ignore_whitespace: false,
70            ascii_perl_classes: false,
71            expanded_ast_limit: DEFAULT_EXPANDED_AST_LIMIT,
72            max_list_len: DEFAULT_MAX_LIST_LEN,
73            max_repeat: DEFAULT_MAX_REPEAT,
74            max_depth: DEFAULT_MAX_DEPTH,
75        }
76    }
77}
78
79#[derive(Clone, Copy, PartialEq, Debug)]
80enum WordCharKind {
81    Word,
82    NonWord,
83    MaybeWord,
84    MaybeNonWord,
85    Unknown,
86    Edge,
87}
88
89fn is_word_byte(b: u8) -> bool {
90    b.is_ascii_alphanumeric() || b == b'_'
91}
92
93fn class_set_item_word_kind(item: &regex_syntax::ast::ClassSetItem) -> WordCharKind {
94    use regex_syntax::ast::{ClassPerlKind, ClassSetItem};
95    use WordCharKind::*;
96    match item {
97        ClassSetItem::Empty(_) => Unknown,
98        ClassSetItem::Literal(l) => {
99            if is_word_byte(l.c as u8) {
100                Word
101            } else {
102                NonWord
103            }
104        }
105        ClassSetItem::Range(r) => {
106            let all_word = (r.start.c as u8..=r.end.c as u8).all(is_word_byte);
107            let all_non = (r.start.c as u8..=r.end.c as u8).all(|b| !is_word_byte(b));
108            if all_word {
109                Word
110            } else if all_non {
111                NonWord
112            } else {
113                Unknown
114            }
115        }
116        ClassSetItem::Perl(p) => match (&p.kind, p.negated) {
117            (ClassPerlKind::Word, false) => Word,
118            (ClassPerlKind::Word, true) => NonWord,
119            (ClassPerlKind::Space, false) => NonWord,
120            (ClassPerlKind::Digit, false) => Word,
121            _ => Unknown,
122        },
123        ClassSetItem::Bracketed(b) => class_bracketed_word_kind(b),
124        ClassSetItem::Union(u) => {
125            let mut kind = Unknown;
126            for item in &u.items {
127                let k = class_set_item_word_kind(item);
128                kind = match (kind, k) {
129                    (_, Unknown) => return Unknown,
130                    (Unknown, _) => k,
131                    (Word, Word) => Word,
132                    (NonWord, NonWord) => NonWord,
133                    _ => return Unknown,
134                };
135            }
136            kind
137        }
138        _ => Unknown,
139    }
140}
141
142fn utf8_codepoint_node(tb: &mut TB<'_>) -> NodeId {
143    let ascii = tb.mk_range_u8(0, 127);
144    let beta = tb.mk_range_u8(0x80, 0xBF);
145    let c0 = tb.mk_range_u8(0xC0, 0xDF);
146    let c0s = tb.mk_concats([c0, beta].into_iter());
147    let e0 = tb.mk_range_u8(0xE0, 0xEF);
148    let e0s = tb.mk_concats([e0, beta, beta].into_iter());
149    let f0 = tb.mk_range_u8(0xF0, 0xF7);
150    let f0s = tb.mk_concats([f0, beta, beta, beta].into_iter());
151    tb.mk_unions([ascii, c0s, e0s, f0s].into_iter())
152}
153
154fn unicode_ranges_to_node(ranges: &[hir::ClassUnicodeRange], tb: &mut TB<'_>) -> NodeId {
155    let mut nodes = Vec::new();
156    for range in ranges {
157        for seq in Utf8Sequences::new(range.start(), range.end()) {
158            let sl = seq.as_slice();
159            let bytes: Vec<_> = sl.iter().map(|s| (s.start, s.end)).collect();
160            let node = match bytes.len() {
161                1 => tb.mk_range_u8(bytes[0].0, bytes[0].1),
162                n => {
163                    let last = tb.mk_range_u8(bytes[n - 1].0, bytes[n - 1].1);
164                    let mut conc = last;
165                    for i in (0..n - 1).rev() {
166                        let b = tb.mk_range_u8(bytes[i].0, bytes[i].1);
167                        conc = tb.mk_concat(b, conc);
168                    }
169                    conc
170                }
171            };
172            nodes.push(node);
173        }
174    }
175    tb.mk_unions(nodes.into_iter())
176}
177
178fn class_bracketed_word_kind(c: &regex_syntax::ast::ClassBracketed) -> WordCharKind {
179    use regex_syntax::ast::{ClassPerlKind, ClassSet, ClassSetItem};
180    use WordCharKind::*;
181    if c.negated {
182        return match &c.kind {
183            ClassSet::Item(ClassSetItem::Perl(p)) if p.kind == ClassPerlKind::Word => {
184                if p.negated {
185                    Word
186                } else {
187                    NonWord
188                }
189            }
190            _ => Unknown,
191        };
192    }
193    match &c.kind {
194        ClassSet::Item(item) => class_set_item_word_kind(item),
195        ClassSet::BinaryOp(_) => Unknown,
196    }
197}
198
199fn ascii_class_lit(span: Span, c: char) -> regex_syntax::ast::Literal {
200    regex_syntax::ast::Literal {
201        span,
202        kind: regex_syntax::ast::LiteralKind::Verbatim,
203        c,
204    }
205}
206
207fn ascii_class_range(span: Span, a: char, b: char) -> regex_syntax::ast::ClassSetItem {
208    regex_syntax::ast::ClassSetItem::Range(regex_syntax::ast::ClassSetRange {
209        span,
210        start: ascii_class_lit(span, a),
211        end: ascii_class_lit(span, b),
212    })
213}
214
215fn ascii_perl_positive(
216    span: Span,
217    kind: &regex_syntax::ast::ClassPerlKind,
218) -> regex_syntax::ast::ClassSetItem {
219    use regex_syntax::ast::{ClassPerlKind, ClassSetItem, ClassSetUnion};
220    match kind {
221        ClassPerlKind::Digit => ascii_class_range(span, '0', '9'),
222        ClassPerlKind::Word => ClassSetItem::Union(ClassSetUnion {
223            span,
224            items: vec![
225                ascii_class_range(span, 'a', 'z'),
226                ascii_class_range(span, 'A', 'Z'),
227                ascii_class_range(span, '0', '9'),
228                ClassSetItem::Literal(ascii_class_lit(span, '_')),
229            ],
230        }),
231        ClassPerlKind::Space => ClassSetItem::Union(ClassSetUnion {
232            span,
233            items: ['\t', '\n', '\x0B', '\x0C', '\r', ' ']
234                .into_iter()
235                .map(|c| ClassSetItem::Literal(ascii_class_lit(span, c)))
236                .collect(),
237        }),
238    }
239}
240
241fn ascii_perl_set_item(
242    span: Span,
243    kind: &regex_syntax::ast::ClassPerlKind,
244    negated: bool,
245) -> regex_syntax::ast::ClassSetItem {
246    use regex_syntax::ast::{ClassBracketed, ClassSet, ClassSetItem};
247    let positive = ascii_perl_positive(span, kind);
248    if negated {
249        ClassSetItem::Bracketed(Box::new(ClassBracketed {
250            span,
251            negated: true,
252            kind: ClassSet::Item(positive),
253        }))
254    } else {
255        positive
256    }
257}
258
259fn rewrite_ascii_perl_set(set: &regex_syntax::ast::ClassSet) -> regex_syntax::ast::ClassSet {
260    use regex_syntax::ast::{ClassSet, ClassSetBinaryOp};
261    match set {
262        ClassSet::Item(item) => ClassSet::Item(rewrite_ascii_perl_item(item)),
263        ClassSet::BinaryOp(op) => ClassSet::BinaryOp(ClassSetBinaryOp {
264            span: op.span,
265            kind: op.kind.clone(),
266            lhs: Box::new(rewrite_ascii_perl_set(&op.lhs)),
267            rhs: Box::new(rewrite_ascii_perl_set(&op.rhs)),
268        }),
269    }
270}
271
272fn rewrite_ascii_perl_item(
273    item: &regex_syntax::ast::ClassSetItem,
274) -> regex_syntax::ast::ClassSetItem {
275    use regex_syntax::ast::{ClassBracketed, ClassSetItem, ClassSetUnion};
276    match item {
277        ClassSetItem::Perl(p) => ascii_perl_set_item(p.span, &p.kind, p.negated),
278        ClassSetItem::Union(u) => ClassSetItem::Union(ClassSetUnion {
279            span: u.span,
280            items: u.items.iter().map(rewrite_ascii_perl_item).collect(),
281        }),
282        ClassSetItem::Bracketed(b) => ClassSetItem::Bracketed(Box::new(ClassBracketed {
283            span: b.span,
284            negated: b.negated,
285            kind: rewrite_ascii_perl_set(&b.kind),
286        })),
287        other => other.clone(),
288    }
289}
290
291#[derive(Clone, Debug, Eq, PartialEq)]
292enum Primitive {
293    Literal(Literal),
294    Assertion(ast::Assertion),
295    Dot(Span),
296    Top(Span),
297    Perl(ClassPerl),
298    Unicode(ClassUnicode),
299}
300
301impl Primitive {
302    fn span(&self) -> &Span {
303        match *self {
304            Primitive::Literal(ref x) => &x.span,
305            Primitive::Assertion(ref x) => &x.span,
306            Primitive::Dot(ref span) => span,
307            Primitive::Top(ref span) => span,
308            Primitive::Perl(ref x) => &x.span,
309            Primitive::Unicode(ref x) => &x.span,
310        }
311    }
312
313    fn into_ast(self) -> Ast {
314        match self {
315            Primitive::Literal(lit) => Ast::literal(lit),
316            Primitive::Assertion(assert) => Ast::assertion(assert),
317            Primitive::Dot(span) => Ast::dot(span),
318            Primitive::Top(span) => Ast::top(span),
319            Primitive::Perl(cls) => Ast::class_perl(cls),
320            Primitive::Unicode(cls) => Ast::class_unicode(cls),
321        }
322    }
323
324    fn into_class_set_item(self, p: &ResharpParser) -> Result<regex_syntax::ast::ClassSetItem> {
325        use self::Primitive::*;
326        use regex_syntax::ast::ClassSetItem;
327
328        match self {
329            Literal(lit) => Ok(ClassSetItem::Literal(lit)),
330            Perl(cls) => Ok(ClassSetItem::Perl(cls)),
331            Unicode(cls) => Ok(ClassSetItem::Unicode(cls)),
332            x => Err(p.error(*x.span(), ast::ErrorKind::ClassEscapeInvalid)),
333        }
334    }
335
336    fn into_class_literal(self, p: &ResharpParser) -> Result<Literal> {
337        use self::Primitive::*;
338
339        match self {
340            Literal(lit) => Ok(lit),
341            x => Err(p.error(*x.span(), ast::ErrorKind::ClassRangeLiteral)),
342        }
343    }
344}
345
346#[derive(Clone, Debug, Eq, PartialEq)]
347pub enum Either<Left, Right> {
348    Left(Left),
349    Right(Right),
350}
351
352#[derive(Clone, Debug, Eq, PartialEq)]
353pub struct ParseError {
354    /// The kind of error.
355    pub kind: ErrorKind,
356    /// The original pattern that the parser generated the error from. Every
357    /// span in an error is a valid range into this string.
358    pattern: String,
359    /// The span of this error.
360    pub span: Span,
361}
362
363impl std::fmt::Display for ParseError {
364    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
365        write!(f, "{:?}: {:?}", self.kind, self.span)
366    }
367}
368impl std::error::Error for ParseError {}
369
370type Result<T> = core::result::Result<T, ParseError>;
371
372#[derive(Clone, Debug)]
373enum GroupState {
374    /// This state is pushed whenever an opening group is found.
375    Group {
376        /// The concatenation immediately preceding the opening group.
377        concat: Concat,
378        /// The group that has been opened. Its sub-AST is always empty.
379        group: ast::Group,
380        /// Whether this group has the `x` flag enabled or not.
381        ignore_whitespace: bool,
382    },
383    Alternation(ast::Alternation),
384    Intersection(ast::Intersection),
385}
386
387#[derive(Clone, Debug)]
388enum ClassState {
389    /// This state is pushed whenever an opening bracket is found.
390    Open {
391        /// The union of class items immediately preceding this class.
392        union: regex_syntax::ast::ClassSetUnion,
393        set: regex_syntax::ast::ClassBracketed,
394    },
395    /// This state is pushed when a operator is seen. When popped, the stored
396    /// set becomes the left hand side of the operator.
397    Op {
398        /// The type of the operation, i.e., &&, -- or ~~.
399        kind: regex_syntax::ast::ClassSetBinaryOpKind,
400        /// The left-hand side of the operator.
401        lhs: regex_syntax::ast::ClassSet,
402    },
403}
404
405/// RE# syntax parser based on the regex-syntax crate.
406pub struct ResharpParser<'s> {
407    perl_classes: Vec<(bool, regex_syntax::ast::ClassPerlKind, NodeId)>,
408    unicode_classes: resharp_algebra::UnicodeClassCache,
409    pub translator: regex_syntax::hir::translate::Translator,
410    pub pattern: &'s str,
411    pos: Cell<Position>,
412    capture_index: Cell<u32>,
413    octal: bool,
414    empty_min_range: bool,
415    ignore_whitespace: Cell<bool>,
416    dot_all: Cell<bool>,
417    multiline: Cell<bool>,
418    global_unicode: bool,
419    global_full_unicode: bool,
420    global_ascii_perl: bool,
421    global_case_insensitive: bool,
422    expanded_ast_limit: u64,
423    max_list_len: usize,
424    max_repeat: u32,
425    max_depth: usize,
426    comments: RefCell<Vec<ast::Comment>>,
427    stack_group: RefCell<Vec<GroupState>>,
428    stack_class: RefCell<Vec<ClassState>>,
429    capture_names: RefCell<Vec<ast::CaptureName>>,
430    scratch: RefCell<String>,
431}
432
433fn specialize_err<T>(result: Result<T>, from: ast::ErrorKind, to: ast::ErrorKind) -> Result<T> {
434    result.map_err(|e| {
435        if e.kind == from {
436            ParseError {
437                kind: to,
438                pattern: e.pattern,
439                span: e.span,
440            }
441        } else {
442            e
443        }
444    })
445}
446
447fn is_capture_char(c: char, first: bool) -> bool {
448    if first {
449        c == '_' || c.is_alphabetic()
450    } else {
451        c == '_' || c == '.' || c == '[' || c == ']' || c.is_alphanumeric()
452    }
453}
454
455pub fn is_meta_character(c: char) -> bool {
456    matches!(
457        c,
458        '\\' | '.'
459            | '+'
460            | '*'
461            | '?'
462            | '('
463            | ')'
464            | '|'
465            | '['
466            | ']'
467            | '{'
468            | '}'
469            | '^'
470            | '$'
471            | '#'
472            | '&'
473            | '-'
474            | '~'
475            | '_'
476    )
477}
478
479/// escapes all resharp meta characters in `text`.
480pub fn escape(text: &str) -> String {
481    let mut buf = String::new();
482    escape_into(text, &mut buf);
483    buf
484}
485
486/// escapes all resharp meta characters in `text` and appends to `buf`.
487pub fn escape_into(text: &str, buf: &mut String) {
488    buf.reserve(text.len());
489    for c in text.chars() {
490        if is_meta_character(c) {
491            buf.push('\\');
492        }
493        buf.push(c);
494    }
495}
496
497pub fn is_escapeable_character(c: char) -> bool {
498    if is_meta_character(c) {
499        return true;
500    }
501    if !c.is_ascii() {
502        return false;
503    }
504    match c {
505        '0'..='9' | 'A'..='Z' | 'a'..='z' => false,
506        '<' | '>' => false,
507        _ => true,
508    }
509}
510
511fn is_hex(c: char) -> bool {
512    c.is_ascii_digit() || ('a'..='f').contains(&c) || ('A'..='F').contains(&c)
513}
514
515fn ensure_lookbehind_at_start(ast: &Ast, at_start: bool) -> core::result::Result<(), Span> {
516    match ast {
517        Ast::Concat(c) => {
518            let mut child_at_start = at_start;
519            for child in &c.asts {
520                ensure_lookbehind_at_start(child, child_at_start)?;
521                if ast_may_consume(child) {
522                    child_at_start = false;
523                }
524            }
525            Ok(())
526        }
527        Ast::Alternation(a) => {
528            for child in &a.asts {
529                ensure_lookbehind_at_start(child, at_start)?;
530            }
531            Ok(())
532        }
533        Ast::Intersection(i) => {
534            for child in &i.asts {
535                ensure_lookbehind_at_start(child, at_start)?;
536            }
537            Ok(())
538        }
539        Ast::Complement(c) => ensure_lookbehind_at_start(&c.ast, at_start),
540        Ast::Group(g) => ensure_lookbehind_at_start(&g.ast, at_start),
541        Ast::Repetition(r) => ensure_lookbehind_at_start(&r.ast, at_start),
542        Ast::Lookaround(g) => {
543            match g.kind {
544                LookaroundKind::PositiveLookbehind | LookaroundKind::NegativeLookbehind => {
545                    if !at_start {
546                        return Err(g.span);
547                    }
548                }
549                LookaroundKind::PositiveLookahead | LookaroundKind::NegativeLookahead => {}
550            }
551            ensure_lookbehind_at_start(&g.ast, true)
552        }
553        Ast::Empty(_)
554        | Ast::Flags(_)
555        | Ast::Literal(_)
556        | Ast::Dot(_)
557        | Ast::Top(_)
558        | Ast::Assertion(_)
559        | Ast::ClassUnicode(_)
560        | Ast::ClassPerl(_)
561        | Ast::ClassBracketed(_) => Ok(()),
562    }
563}
564
565fn ast_may_consume(ast: &Ast) -> bool {
566    match ast {
567        Ast::Empty(_) | Ast::Flags(_) | Ast::Assertion(_) | Ast::Lookaround(_) => false,
568        Ast::Literal(_)
569        | Ast::Dot(_)
570        | Ast::Top(_)
571        | Ast::ClassUnicode(_)
572        | Ast::ClassPerl(_)
573        | Ast::ClassBracketed(_) => true,
574        Ast::Group(g) => ast_may_consume(&g.ast),
575        Ast::Repetition(r) => {
576            if !ast_may_consume(&r.ast) {
577                return false;
578            }
579            match r.op.kind {
580                RepetitionKind::ZeroOrOne
581                | RepetitionKind::ZeroOrMore
582                | RepetitionKind::OneOrMore => true,
583                RepetitionKind::Range(ast::RepetitionRange::Exactly(0)) => false,
584                RepetitionKind::Range(ast::RepetitionRange::Bounded(_, 0)) => false,
585                RepetitionKind::Range(_) => true,
586            }
587        }
588        Ast::Alternation(a) => a.asts.iter().any(ast_may_consume),
589        Ast::Intersection(i) => i.asts.iter().any(ast_may_consume),
590        Ast::Complement(_) => true,
591        Ast::Concat(c) => c.asts.iter().any(ast_may_consume),
592    }
593}
594
595impl<'s> ResharpParser<'s> {
596    fn default_translator_builder(&self) -> TranslatorBuilder {
597        let mut trb = TranslatorBuilder::new();
598        trb.unicode(self.global_unicode);
599        trb.utf8(false);
600        trb.case_insensitive(self.global_case_insensitive);
601        trb
602    }
603
604    pub fn new(pattern: &'s str) -> Self {
605        Self::with_flags(pattern, &PatternFlags::default())
606    }
607
608    pub fn with_flags(pattern: &'s str, flags: &PatternFlags) -> Self {
609        let mut trb = TranslatorBuilder::new();
610        trb.unicode(flags.unicode);
611        trb.utf8(false);
612        trb.case_insensitive(flags.case_insensitive);
613        Self {
614            translator: trb.build(),
615            pattern,
616            perl_classes: vec![],
617            unicode_classes: resharp_algebra::UnicodeClassCache::default(),
618            pos: Cell::new(Position::new(0, 0, 0)),
619            capture_index: Cell::new(0),
620            octal: false,
621            empty_min_range: false,
622            ignore_whitespace: Cell::new(flags.ignore_whitespace),
623            dot_all: Cell::new(flags.dot_matches_new_line),
624            multiline: Cell::new(flags.multiline),
625            global_unicode: flags.unicode || flags.full_unicode || flags.ascii_perl_classes,
626            global_full_unicode: flags.full_unicode,
627            global_ascii_perl: flags.ascii_perl_classes,
628            global_case_insensitive: flags.case_insensitive,
629            expanded_ast_limit: flags.expanded_ast_limit,
630            max_list_len: flags.max_list_len,
631            max_repeat: flags.max_repeat,
632            max_depth: flags.max_depth,
633            comments: RefCell::new(vec![]),
634            stack_group: RefCell::new(vec![]),
635            stack_class: RefCell::new(vec![]),
636            capture_names: RefCell::new(vec![]),
637            scratch: RefCell::new(String::new()),
638        }
639    }
640
641    fn parser(&'_ self) -> &'_ ResharpParser<'_> {
642        self
643    }
644
645    fn pattern(&self) -> &str {
646        self.pattern
647    }
648
649    fn error(&self, span: Span, kind: ast::ErrorKind) -> ParseError {
650        ParseError {
651            kind,
652            pattern: self.pattern().to_string(),
653            span,
654        }
655    }
656
657    fn unsupported_error(&self, _: regex_syntax::hir::Error) -> ParseError {
658        self.error(
659            Span::splat(self.pos()),
660            ast::ErrorKind::UnsupportedResharpRegex,
661        )
662    }
663
664    fn offset(&self) -> usize {
665        self.parser().pos.get().offset
666    }
667
668    fn line(&self) -> usize {
669        self.parser().pos.get().line
670    }
671
672    fn column(&self) -> usize {
673        self.parser().pos.get().column
674    }
675
676    fn next_capture_index(&self, span: Span) -> Result<u32> {
677        let current = self.parser().capture_index.get();
678        let i = current
679            .checked_add(1)
680            .ok_or_else(|| self.error(span, ast::ErrorKind::CaptureLimitExceeded))?;
681        self.parser().capture_index.set(i);
682        Ok(i)
683    }
684
685    fn add_capture_name(&self, cap: &ast::CaptureName) -> Result<()> {
686        let mut names = self.parser().capture_names.borrow_mut();
687        match names.binary_search_by_key(&cap.name.as_str(), |c| c.name.as_str()) {
688            Err(i) => {
689                names.insert(i, cap.clone());
690                Ok(())
691            }
692            Ok(i) => Err(self.error(
693                cap.span,
694                ast::ErrorKind::GroupNameDuplicate {
695                    original: names[i].span,
696                },
697            )),
698        }
699    }
700
701    fn ignore_whitespace(&self) -> bool {
702        self.parser().ignore_whitespace.get()
703    }
704
705    fn char(&self) -> char {
706        self.char_at(self.offset())
707    }
708
709    fn char_at(&self, i: usize) -> char {
710        self.pattern()[i..]
711            .chars()
712            .next()
713            .unwrap_or_else(|| panic!("expected char at offset {}", i))
714    }
715
716    fn bump(&self) -> bool {
717        if self.is_eof() {
718            return false;
719        }
720        let Position {
721            mut offset,
722            mut line,
723            mut column,
724        } = self.pos();
725        if self.char() == '\n' {
726            line = line.checked_add(1).unwrap();
727            column = 1;
728        } else {
729            column = column.checked_add(1).unwrap();
730        }
731        offset += self.char().len_utf8();
732        self.parser().pos.set(Position {
733            offset,
734            line,
735            column,
736        });
737        self.pattern()[self.offset()..].chars().next().is_some()
738    }
739
740    fn bump_if(&self, prefix: &str) -> bool {
741        if self.pattern()[self.offset()..].starts_with(prefix) {
742            for _ in 0..prefix.chars().count() {
743                self.bump();
744            }
745            true
746        } else {
747            false
748        }
749    }
750
751    fn is_lookaround_prefix(&self) -> Option<(bool, bool)> {
752        if self.bump_if("?=") {
753            return Some((true, true));
754        }
755        if self.bump_if("?!") {
756            return Some((true, false));
757        }
758        if self.bump_if("?<=") {
759            return Some((false, true));
760        }
761        if self.bump_if("?<!") {
762            return Some((false, false));
763        }
764        None
765    }
766
767    fn bump_and_bump_space(&self) -> bool {
768        if !self.bump() {
769            return false;
770        }
771        self.bump_space();
772        !self.is_eof()
773    }
774
775    fn bump_space(&self) {
776        if !self.ignore_whitespace() {
777            return;
778        }
779        while !self.is_eof() {
780            if self.char().is_whitespace() {
781                self.bump();
782            } else if self.char() == '#' {
783                let start = self.pos();
784                let mut comment_text = String::new();
785                self.bump();
786                while !self.is_eof() {
787                    let c = self.char();
788                    self.bump();
789                    if c == '\n' {
790                        break;
791                    }
792                    comment_text.push(c);
793                }
794                let comment = ast::Comment {
795                    span: Span::new(start, self.pos()),
796                    comment: comment_text,
797                };
798                self.parser().comments.borrow_mut().push(comment);
799            } else {
800                break;
801            }
802        }
803    }
804
805    fn peek(&self) -> Option<char> {
806        if self.is_eof() {
807            return None;
808        }
809        self.pattern()[self.offset() + self.char().len_utf8()..]
810            .chars()
811            .next()
812    }
813
814    /// Like peek, but will ignore spaces when the parser is in whitespace
815    /// insensitive mode.
816    fn peek_space(&self) -> Option<char> {
817        if !self.ignore_whitespace() {
818            return self.peek();
819        }
820        if self.is_eof() {
821            return None;
822        }
823        let mut start = self.offset() + self.char().len_utf8();
824        let mut in_comment = false;
825        for (i, c) in self.pattern()[start..].char_indices() {
826            if c.is_whitespace() {
827                continue;
828            } else if !in_comment && c == '#' {
829                in_comment = true;
830            } else if in_comment && c == '\n' {
831                in_comment = false;
832            } else {
833                start += i;
834                break;
835            }
836        }
837        self.pattern()[start..].chars().next()
838    }
839
840    fn is_eof(&self) -> bool {
841        self.offset() == self.pattern().len()
842    }
843
844    fn pos(&self) -> Position {
845        self.parser().pos.get()
846    }
847
848    fn span(&self) -> Span {
849        Span::splat(self.pos())
850    }
851
852    fn span_char(&self) -> Span {
853        let mut next = Position {
854            offset: self.offset().checked_add(self.char().len_utf8()).unwrap(),
855            line: self.line(),
856            column: self.column().checked_add(1).unwrap(),
857        };
858        if self.char() == '\n' {
859            next.line += 1;
860            next.column = 1;
861        }
862        Span::new(self.pos(), next)
863    }
864
865    #[inline(never)]
866    fn push_alternate(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
867        assert_eq!(self.char(), '|');
868        concat.span.end = self.pos();
869        self.push_or_add_alternation(concat);
870        self.bump();
871        Ok(ast::Concat {
872            span: self.span(),
873            asts: vec![],
874        })
875    }
876
877    fn push_or_add_alternation(&self, concat: Concat) {
878        use self::GroupState::*;
879
880        let mut stack = self.parser().stack_group.borrow_mut();
881        if let Some(&mut Alternation(ref mut alts)) = stack.last_mut() {
882            alts.asts.push(concat.into_ast());
883            return;
884        }
885        stack.push(Alternation(ast::Alternation {
886            span: Span::new(concat.span.start, self.pos()),
887            asts: vec![concat.into_ast()],
888        }));
889    }
890
891    #[inline(never)]
892    fn push_intersect(&self, mut concat: Concat) -> Result<Concat> {
893        assert_eq!(self.char(), '&');
894        concat.span.end = self.pos();
895        self.push_or_add_intersect(concat);
896        self.bump();
897        Ok(Concat {
898            span: self.span(),
899            asts: vec![],
900        })
901    }
902
903    fn push_or_add_intersect(&self, concat: Concat) {
904        use self::GroupState::*;
905
906        let mut stack = self.parser().stack_group.borrow_mut();
907        if let Some(&mut Intersection(ref mut alts)) = stack.last_mut() {
908            alts.asts.push(concat.into_ast());
909            return;
910        }
911        stack.push(Intersection(ast::Intersection {
912            span: Span::new(concat.span.start, self.pos()),
913            asts: vec![concat.into_ast()],
914        }));
915    }
916
917    #[inline(never)]
918    fn push_group(&self, mut concat: Concat) -> Result<Concat> {
919        assert_eq!(self.char(), '(');
920        match self.parse_group()? {
921            Either::Left(set) => {
922                let ignore = set.flags.flag_state(ast::Flag::IgnoreWhitespace);
923                if let Some(v) = ignore {
924                    self.parser().ignore_whitespace.set(v);
925                }
926
927                concat.asts.push(Ast::flags(set));
928                Ok(concat)
929            }
930            Either::Right(group) => {
931                let old_ignore_whitespace = self.ignore_whitespace();
932                let new_ignore_whitespace = group
933                    .flags()
934                    .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
935                    .unwrap_or(old_ignore_whitespace);
936                self.parser()
937                    .stack_group
938                    .borrow_mut()
939                    .push(GroupState::Group {
940                        concat,
941                        group,
942                        ignore_whitespace: old_ignore_whitespace,
943                    });
944                self.parser().ignore_whitespace.set(new_ignore_whitespace);
945                Ok(Concat {
946                    span: self.span(),
947                    asts: vec![],
948                })
949            }
950        }
951    }
952
953    #[inline(never)]
954    fn push_compl_group(&self, concat: Concat) -> Result<Concat> {
955        assert_eq!(self.char(), '~');
956        self.bump();
957        if self.is_eof() || self.char() != '(' {
958            return Err(self.error(self.span(), ast::ErrorKind::ComplementGroupExpected));
959        }
960        let open_span = self.span_char();
961        self.bump();
962        let group = ast::Group {
963            span: open_span,
964            kind: ast::GroupKind::Complement,
965            ast: Box::new(Ast::empty(self.span())),
966        };
967
968        let old_ignore_whitespace = self.ignore_whitespace();
969        let new_ignore_whitespace = group
970            .flags()
971            .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
972            .unwrap_or(old_ignore_whitespace);
973        self.parser()
974            .stack_group
975            .borrow_mut()
976            .push(GroupState::Group {
977                concat,
978                group,
979                ignore_whitespace: old_ignore_whitespace,
980            });
981        self.parser().ignore_whitespace.set(new_ignore_whitespace);
982        Ok(Concat {
983            span: self.span(),
984            asts: vec![],
985        })
986    }
987
988    #[inline(never)]
989    fn pop_group(&self, mut group_concat: Concat) -> Result<Concat> {
990        use self::GroupState::*;
991        assert_eq!(self.char(), ')');
992        let mut stack = self.parser().stack_group.borrow_mut();
993        let topstack = stack.pop();
994
995        let (mut prior_concat, mut group, ignore_whitespace, alt) = match topstack {
996            Some(Group {
997                concat,
998                group,
999                ignore_whitespace,
1000            }) => (concat, group, ignore_whitespace, None),
1001            Some(Alternation(alt)) => match stack.pop() {
1002                Some(Group {
1003                    concat,
1004                    group,
1005                    ignore_whitespace,
1006                }) => (
1007                    concat,
1008                    group,
1009                    ignore_whitespace,
1010                    Some(Either::Left::<ast::Alternation, ast::Intersection>(alt)),
1011                ),
1012                None | Some(Alternation(_)) | Some(Intersection(_)) => {
1013                    return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
1014                }
1015            },
1016            Some(Intersection(int)) => match stack.pop() {
1017                Some(Group {
1018                    concat,
1019                    group,
1020                    ignore_whitespace,
1021                }) => (
1022                    concat,
1023                    group,
1024                    ignore_whitespace,
1025                    Some(Either::Right::<ast::Alternation, ast::Intersection>(int)),
1026                ),
1027                None | Some(Alternation(_)) | Some(Intersection(_)) => {
1028                    return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
1029                }
1030            },
1031
1032            None => {
1033                return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
1034            }
1035        };
1036        self.parser().ignore_whitespace.set(ignore_whitespace);
1037        group_concat.span.end = self.pos();
1038        self.bump();
1039        group.span.end = self.pos();
1040        match alt {
1041            Some(Either::Left(mut alt)) => {
1042                alt.span.end = group_concat.span.end;
1043                alt.asts.push(group_concat.into_ast());
1044                group.ast = Box::new(alt.into_ast());
1045            }
1046            Some(Either::Right(mut int)) => {
1047                int.span.end = group_concat.span.end;
1048                int.asts.push(group_concat.into_ast());
1049                group.ast = Box::new(int.into_ast());
1050            }
1051            None => {
1052                group.ast = Box::new(group_concat.into_ast());
1053            }
1054        }
1055
1056        if group.kind == GroupKind::Complement {
1057            let complement = ast::Complement {
1058                span: self.span(),
1059                ast: group.ast,
1060            };
1061            prior_concat.asts.push(Ast::complement(complement));
1062        }
1063        // ignore groups for now
1064        else {
1065            prior_concat.asts.push(Ast::group(group));
1066        }
1067        Ok(prior_concat)
1068    }
1069
1070    #[inline(never)]
1071    fn pop_group_end(&self, mut concat: ast::Concat) -> Result<Ast> {
1072        concat.span.end = self.pos();
1073        let mut stack = self.parser().stack_group.borrow_mut();
1074        let ast = match stack.pop() {
1075            None => Ok(concat.into_ast()),
1076            Some(GroupState::Alternation(mut alt)) => {
1077                alt.span.end = self.pos();
1078                alt.asts.push(concat.into_ast());
1079                Ok(Ast::alternation(alt))
1080            }
1081            Some(GroupState::Intersection(mut int)) => {
1082                int.span.end = self.pos();
1083                int.asts.push(concat.into_ast());
1084
1085                Ok(Ast::intersection(int))
1086            }
1087            Some(GroupState::Group { group, .. }) => {
1088                return Err(self.error(group.span, ast::ErrorKind::GroupUnclosed));
1089            }
1090        };
1091        // If we try to pop again, there should be nothing.
1092        match stack.pop() {
1093            None => ast,
1094            Some(GroupState::Alternation(alt)) => {
1095                Err(self.error(alt.span, ast::ErrorKind::UnsupportedResharpRegex))
1096            }
1097            Some(GroupState::Intersection(int)) => {
1098                Err(self.error(int.span, ast::ErrorKind::UnsupportedResharpRegex))
1099            }
1100            Some(GroupState::Group { group, .. }) => {
1101                Err(self.error(group.span, ast::ErrorKind::GroupUnclosed))
1102            }
1103        }
1104    }
1105
1106    #[inline(never)]
1107    fn push_class_open(
1108        &self,
1109        parent_union: regex_syntax::ast::ClassSetUnion,
1110    ) -> Result<regex_syntax::ast::ClassSetUnion> {
1111        assert_eq!(self.char(), '[');
1112
1113        let (nested_set, nested_union) = self.parse_set_class_open()?;
1114        self.parser()
1115            .stack_class
1116            .borrow_mut()
1117            .push(ClassState::Open {
1118                union: parent_union,
1119                set: nested_set,
1120            });
1121        Ok(nested_union)
1122    }
1123
1124    #[inline(never)]
1125    fn pop_class(
1126        &self,
1127        nested_union: regex_syntax::ast::ClassSetUnion,
1128    ) -> Result<Either<regex_syntax::ast::ClassSetUnion, regex_syntax::ast::ClassBracketed>> {
1129        assert_eq!(self.char(), ']');
1130
1131        let item = regex_syntax::ast::ClassSet::Item(nested_union.into_item());
1132        let prevset = self.pop_class_op(item);
1133        let mut stack = self.parser().stack_class.borrow_mut();
1134        match stack.pop() {
1135            None => panic!("unexpected empty character class stack"),
1136            Some(ClassState::Op { .. }) => panic!("unexpected ClassState::Op"),
1137            Some(ClassState::Open { mut union, mut set }) => {
1138                self.bump();
1139                set.span.end = self.pos();
1140                set.kind = prevset;
1141                if stack.is_empty() {
1142                    Ok(Either::Right(set))
1143                } else {
1144                    union.push(regex_syntax::ast::ClassSetItem::Bracketed(Box::new(set)));
1145                    Ok(Either::Left(union))
1146                }
1147            }
1148        }
1149    }
1150
1151    #[inline(never)]
1152    fn unclosed_class_error(&self) -> ParseError {
1153        for state in self.parser().stack_class.borrow().iter().rev() {
1154            if let ClassState::Open { ref set, .. } = *state {
1155                return self.error(set.span, ast::ErrorKind::ClassUnclosed);
1156            }
1157        }
1158        panic!("no open character class found")
1159    }
1160
1161    #[inline(never)]
1162    fn push_class_op(
1163        &self,
1164        next_kind: regex_syntax::ast::ClassSetBinaryOpKind,
1165        next_union: regex_syntax::ast::ClassSetUnion,
1166    ) -> regex_syntax::ast::ClassSetUnion {
1167        let item = regex_syntax::ast::ClassSet::Item(next_union.into_item());
1168        let new_lhs = self.pop_class_op(item);
1169        self.parser().stack_class.borrow_mut().push(ClassState::Op {
1170            kind: next_kind,
1171            lhs: new_lhs,
1172        });
1173        regex_syntax::ast::ClassSetUnion {
1174            span: self.span(),
1175            items: vec![],
1176        }
1177    }
1178
1179    #[inline(never)]
1180    fn pop_class_op(&self, rhs: regex_syntax::ast::ClassSet) -> regex_syntax::ast::ClassSet {
1181        let mut stack = self.parser().stack_class.borrow_mut();
1182        let (kind, lhs) = match stack.pop() {
1183            Some(ClassState::Op { kind, lhs }) => (kind, lhs),
1184            Some(state @ ClassState::Open { .. }) => {
1185                stack.push(state);
1186                return rhs;
1187            }
1188            None => unreachable!(),
1189        };
1190        let span = Span::new(lhs.span().start, rhs.span().end);
1191        regex_syntax::ast::ClassSet::BinaryOp(regex_syntax::ast::ClassSetBinaryOp {
1192            span,
1193            kind,
1194            lhs: Box::new(lhs),
1195            rhs: Box::new(rhs),
1196        })
1197    }
1198
1199    fn any_codepoint_node(&self, tb: &mut TB<'_>) -> NodeId {
1200        utf8_codepoint_node(tb)
1201    }
1202
1203    fn hir_to_node_id(&self, hir: &hir::Hir, tb: &mut TB<'s>) -> Result<NodeId> {
1204        match hir.kind() {
1205            hir::HirKind::Empty => Ok(NodeId::EPS),
1206            hir::HirKind::Literal(l) => {
1207                if l.0.len() == 1 {
1208                    let node = tb.mk_u8(l.0[0]);
1209                    Ok(node)
1210                } else {
1211                    let ws: Vec<_> = l.0.iter().map(|l| tb.mk_u8(*l)).collect();
1212                    let conc = tb.mk_concats(ws.iter().copied());
1213                    Ok(conc)
1214                }
1215            }
1216            hir::HirKind::Class(class) => match class {
1217                hir::Class::Unicode(class_unicode) => {
1218                    Ok(unicode_ranges_to_node(class_unicode.ranges(), tb))
1219                }
1220                hir::Class::Bytes(class_bytes) => {
1221                    let ranges = class_bytes.ranges();
1222                    let mut result = NodeId::BOT;
1223                    for range in ranges {
1224                        let start = range.start();
1225                        let end = range.end();
1226                        let node = tb.mk_range_u8(start, end);
1227                        result = tb.mk_union(result, node);
1228                    }
1229                    Ok(result)
1230                }
1231            },
1232            hir::HirKind::Look(_) => Err(self.error(
1233                Span::splat(self.pos()),
1234                ast::ErrorKind::UnsupportedResharpRegex,
1235            )),
1236            hir::HirKind::Repetition(_) => Err(self.error(
1237                Span::splat(self.pos()),
1238                ast::ErrorKind::UnsupportedResharpRegex,
1239            )),
1240            hir::HirKind::Capture(_) => Err(self.error(
1241                Span::splat(self.pos()),
1242                ast::ErrorKind::UnsupportedResharpRegex,
1243            )),
1244            hir::HirKind::Concat(body) => {
1245                let mut result = NodeId::EPS;
1246                for child in body {
1247                    let node = self.hir_to_node_id(child, tb)?;
1248                    result = tb.mk_concat(result, node);
1249                }
1250                Ok(result)
1251            }
1252            hir::HirKind::Alternation(_) => Err(self.error(
1253                Span::splat(self.pos()),
1254                ast::ErrorKind::UnsupportedResharpRegex,
1255            )),
1256        }
1257    }
1258
1259    fn translate_ast_to_hir(
1260        &mut self,
1261        orig_ast: &regex_syntax::ast::Ast,
1262        tb: &mut TB<'s>,
1263    ) -> Result<NodeId> {
1264        match self.translator.translate("", orig_ast) {
1265            Err(_) => Err(self.error(self.span(), ast::ErrorKind::UnicodeClassInvalid)),
1266            Ok(hir) => self.hir_to_node_id(&hir, tb),
1267        }
1268    }
1269
1270    fn translator_to_node_id(
1271        &mut self,
1272        orig_ast: &regex_syntax::ast::Ast,
1273        translator: &mut Option<Translator>,
1274        tb: &mut TB<'s>,
1275    ) -> Result<NodeId> {
1276        match translator {
1277            Some(tr) => {
1278                let hir = tr
1279                    .translate("", orig_ast)
1280                    .map_err(|e| self.unsupported_error(e))?;
1281                self.hir_to_node_id(&hir, tb)
1282            }
1283            None => self.translate_ast_to_hir(orig_ast, tb),
1284        }
1285    }
1286
1287    fn get_class(
1288        &mut self,
1289        negated: bool,
1290        kind: regex_syntax::ast::ClassPerlKind,
1291        tb: &mut TB<'s>,
1292    ) -> Result<NodeId> {
1293        let w = self
1294            .perl_classes
1295            .iter()
1296            .find(|(c_neg, c_kind, _)| *c_kind == kind && *c_neg == negated);
1297        match w {
1298            Some((_, _, value)) => Ok(*value),
1299            None => {
1300                let translated = if self.global_ascii_perl {
1301                    let pos = match kind {
1302                        regex_syntax::ast::ClassPerlKind::Word => {
1303                            let az = tb.mk_range_u8(b'a', b'z');
1304                            let big = tb.mk_range_u8(b'A', b'Z');
1305                            let dig = tb.mk_range_u8(b'0', b'9');
1306                            let us = tb.mk_u8(b'_');
1307                            tb.mk_unions([az, big, dig, us].into_iter())
1308                        }
1309                        regex_syntax::ast::ClassPerlKind::Digit => tb.mk_range_u8(b'0', b'9'),
1310                        regex_syntax::ast::ClassPerlKind::Space => {
1311                            let sp = tb.mk_u8(b' ');
1312                            let tab = tb.mk_u8(b'\t');
1313                            let nl = tb.mk_u8(b'\n');
1314                            let cr = tb.mk_u8(b'\r');
1315                            let ff = tb.mk_u8(0x0C);
1316                            let vt = tb.mk_u8(0x0B);
1317                            tb.mk_unions([sp, tab, nl, cr, ff, vt].into_iter())
1318                        }
1319                    };
1320                    if negated {
1321                        resharp_algebra::neg_class(tb, pos)
1322                    } else {
1323                        pos
1324                    }
1325                } else if self.global_unicode {
1326                    match kind {
1327                        regex_syntax::ast::ClassPerlKind::Word => {
1328                            if self.global_full_unicode {
1329                                self.unicode_classes.ensure_word_full(tb);
1330                            } else {
1331                                self.unicode_classes.ensure_word(tb);
1332                            }
1333                            if negated {
1334                                self.unicode_classes.non_word
1335                            } else {
1336                                self.unicode_classes.word
1337                            }
1338                        }
1339                        regex_syntax::ast::ClassPerlKind::Digit => {
1340                            if self.global_full_unicode {
1341                                self.unicode_classes.ensure_digit_full(tb);
1342                            } else {
1343                                self.unicode_classes.ensure_digit(tb);
1344                            }
1345                            if negated {
1346                                self.unicode_classes.non_digit
1347                            } else {
1348                                self.unicode_classes.digit
1349                            }
1350                        }
1351                        regex_syntax::ast::ClassPerlKind::Space => {
1352                            if self.global_full_unicode {
1353                                self.unicode_classes.ensure_space_full(tb);
1354                            } else {
1355                                self.unicode_classes.ensure_space(tb);
1356                            }
1357                            if negated {
1358                                self.unicode_classes.non_space
1359                            } else {
1360                                self.unicode_classes.space
1361                            }
1362                        }
1363                    }
1364                } else {
1365                    let pos = match kind {
1366                        regex_syntax::ast::ClassPerlKind::Word => {
1367                            let az = tb.mk_range_u8(b'a', b'z');
1368                            let big = tb.mk_range_u8(b'A', b'Z');
1369                            let dig = tb.mk_range_u8(b'0', b'9');
1370                            let us = tb.mk_u8(b'_');
1371                            tb.mk_unions([az, big, dig, us].into_iter())
1372                        }
1373                        regex_syntax::ast::ClassPerlKind::Digit => tb.mk_range_u8(b'0', b'9'),
1374                        regex_syntax::ast::ClassPerlKind::Space => {
1375                            let sp = tb.mk_u8(b' ');
1376                            let tab = tb.mk_u8(b'\t');
1377                            let nl = tb.mk_u8(b'\n');
1378                            let cr = tb.mk_u8(b'\r');
1379                            let ff = tb.mk_u8(0x0C);
1380                            let vt = tb.mk_u8(0x0B);
1381                            tb.mk_unions([sp, tab, nl, cr, ff, vt].into_iter())
1382                        }
1383                    };
1384                    if negated {
1385                        // negate within the byte, not the language, or `\D`/`\S`/`\W` wrongly match "".
1386                        resharp_algebra::neg_class(tb, pos)
1387                    } else {
1388                        pos
1389                    }
1390                };
1391                self.perl_classes.push((negated, kind, translated));
1392                Ok(translated)
1393            }
1394        }
1395    }
1396
1397    fn word_char_kind(ast: &Ast, left: bool) -> WordCharKind {
1398        use WordCharKind::*;
1399        match ast {
1400            Ast::Literal(lit) => {
1401                if is_word_byte(lit.c as u8) {
1402                    Word
1403                } else {
1404                    NonWord
1405                }
1406            }
1407            Ast::ClassPerl(c) => match (&c.kind, c.negated) {
1408                (&regex_syntax::ast::ClassPerlKind::Word, false) => Word,
1409                (&regex_syntax::ast::ClassPerlKind::Word, true) => NonWord,
1410                (&regex_syntax::ast::ClassPerlKind::Space, false) => NonWord,
1411                (&regex_syntax::ast::ClassPerlKind::Space, true) => Unknown,
1412                (&regex_syntax::ast::ClassPerlKind::Digit, false) => Word,
1413                (&regex_syntax::ast::ClassPerlKind::Digit, true) => Unknown,
1414            },
1415            Ast::ClassBracketed(c) => class_bracketed_word_kind(c),
1416            Ast::Dot(_) | Ast::Top(_) => Unknown,
1417            Ast::Group(g) => Self::word_char_kind(&g.ast, left),
1418            Ast::Concat(c) if !c.asts.is_empty() => {
1419                let dir: isize = if left { -1 } else { 1 };
1420                let edge = match Self::concat_edge_index(&c.asts, left) {
1421                    Some(e) => e,
1422                    None => return Unknown,
1423                };
1424                let kind = Self::word_char_kind(&c.asts[edge], left);
1425                match kind {
1426                    MaybeWord => {
1427                        match Self::concat_neighbor_kind(&c.asts, edge, dir) {
1428                            Word => Word,
1429                            Edge => MaybeWord,
1430                            _ => Unknown,
1431                        }
1432                    }
1433                    MaybeNonWord => {
1434                        match Self::concat_neighbor_kind(&c.asts, edge, dir) {
1435                            NonWord => NonWord,
1436                            Edge => MaybeNonWord,
1437                            _ => Unknown,
1438                        }
1439                    }
1440                    other => other,
1441                }
1442            }
1443            Ast::Alternation(alt) if !alt.asts.is_empty() => {
1444                let first = Self::word_char_kind(&alt.asts[0], left);
1445                if alt.asts[1..]
1446                    .iter()
1447                    .all(|a| Self::word_char_kind(a, left) == first)
1448                {
1449                    first
1450                } else {
1451                    Unknown
1452                }
1453            }
1454            Ast::Repetition(r) => {
1455                let inner = Self::word_char_kind(&r.ast, left);
1456                let nullable = matches!(
1457                    &r.op.kind,
1458                    ast::RepetitionKind::ZeroOrMore
1459                        | ast::RepetitionKind::ZeroOrOne
1460                        | ast::RepetitionKind::Range(
1461                            ast::RepetitionRange::Bounded(0, _)
1462                                | ast::RepetitionRange::Exactly(0)
1463                        )
1464                );
1465                if nullable {
1466                    match inner {
1467                        Word => MaybeWord,
1468                        NonWord => MaybeNonWord,
1469                        _ => Unknown,
1470                    }
1471                } else {
1472                    inner
1473                }
1474            }
1475            Ast::Lookaround(la) => match la.kind {
1476                ast::LookaroundKind::PositiveLookahead
1477                | ast::LookaroundKind::PositiveLookbehind => Self::word_char_kind(&la.ast, left),
1478                _ => Unknown,
1479            },
1480            Ast::Assertion(a) => match (&a.kind, left) {
1481                (ast::AssertionKind::EndText, false) => NonWord,
1482                (ast::AssertionKind::StartText, true) => NonWord,
1483                _ => Unknown,
1484            },
1485            _ => Unknown,
1486        }
1487    }
1488
1489    // ok to return None here, it's only an optimization
1490    fn edge_class_ast(ast: &Ast, left: bool) -> Option<&Ast> {
1491        match ast {
1492            Ast::Literal(_)
1493            | Ast::ClassPerl(_)
1494            | Ast::ClassBracketed(_)
1495            | Ast::ClassUnicode(_)
1496            | Ast::Dot(_)
1497            | Ast::Top(_) => Some(ast),
1498            Ast::Group(g) => Self::edge_class_ast(&g.ast, left),
1499            Ast::Concat(c) if !c.asts.is_empty() => {
1500                Self::concat_edge_index(&c.asts, left)
1501                    .and_then(|e| Self::edge_class_ast(&c.asts[e], left))
1502            }
1503            Ast::Repetition(r) => {
1504                let nullable = matches!(
1505                    &r.op.kind,
1506                    ast::RepetitionKind::ZeroOrMore
1507                        | ast::RepetitionKind::ZeroOrOne
1508                        | ast::RepetitionKind::Range(
1509                            ast::RepetitionRange::Bounded(0, _)
1510                                | ast::RepetitionRange::Exactly(0)
1511                        )
1512                );
1513                if nullable {
1514                    None
1515                } else {
1516                    Self::edge_class_ast(&r.ast, left)
1517                }
1518            }
1519            _ => None,
1520        }
1521    }
1522
1523    fn resolve_word_kind(
1524        &mut self,
1525        asts: &[Ast],
1526        idx: usize,
1527        dir: isize,
1528        translator: &mut Option<Translator>,
1529        tb: &mut TB<'s>,
1530        word_id: NodeId,
1531        not_word_id: NodeId,
1532    ) -> Result<WordCharKind> {
1533        use WordCharKind::*;
1534        let fast = Self::concat_neighbor_kind(asts, idx, dir);
1535        if fast != Unknown {
1536            return Ok(fast);
1537        }
1538        let neighbor_idx = (idx as isize + dir) as usize;
1539        let node = if let Some(edge) = Self::edge_class_ast(&asts[neighbor_idx], dir < 0) {
1540            self.ast_to_node_id(edge, translator, tb)?
1541        } else if dir > 0 {
1542            let mut bodies: Vec<NodeId> = vec![];
1543            let mut j = neighbor_idx;
1544            while j < asts.len() {
1545                match &asts[j] {
1546                    Ast::Lookaround(la) => {
1547                        let kind = la.kind.clone();
1548                        let lookbehind = matches!(
1549                            kind,
1550                            ast::LookaroundKind::PositiveLookbehind
1551                                | ast::LookaroundKind::NegativeLookbehind
1552                        );
1553                        if lookbehind {
1554                            j += 1;
1555                            continue;
1556                        }
1557                        let body = self.ast_to_node_id(&la.ast, translator, tb)?;
1558                        let body = tb.try_elim_lookarounds(body).ok_or_else(|| {
1559                            self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex)
1560                        })?;
1561                        let body_ts = tb.mk_concat(body, NodeId::TS);
1562                        let constraint = match kind {
1563                            ast::LookaroundKind::PositiveLookahead => body_ts,
1564                            ast::LookaroundKind::NegativeLookahead => tb.mk_compl(body_ts),
1565                            _ => unreachable!(),
1566                        };
1567                        bodies.push(constraint);
1568                        j += 1;
1569                    }
1570                    other => {
1571                        let n = self.ast_to_node_id(other, translator, tb)?;
1572                        let n = tb.try_elim_lookarounds(n).ok_or_else(|| {
1573                            self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex)
1574                        })?;
1575                        bodies.push(tb.mk_concat(n, NodeId::TS));
1576                        break;
1577                    }
1578                }
1579            }
1580            if bodies.is_empty() {
1581                return Ok(Unknown);
1582            }
1583            let combined = tb.mk_inters(bodies.into_iter());
1584            let word_prefix = tb.mk_concat(word_id, NodeId::TS);
1585            let non_word_prefix = tb.mk_concat(not_word_id, NodeId::TS);
1586            return if tb.subsumes(word_prefix, combined) == Some(true) {
1587                Ok(Word)
1588            } else if tb.subsumes(non_word_prefix, combined) == Some(true) {
1589                Ok(NonWord)
1590            } else {
1591                Ok(Unknown)
1592            };
1593        } else {
1594            let neighbor_node = self.ast_to_node_id(&asts[neighbor_idx], translator, tb)?;
1595            let neighbor_node = Self::strip_trailing_lookahead(tb, neighbor_node);
1596            let mut neighbor_node = tb
1597                .try_elim_lookarounds(neighbor_node)
1598                .ok_or_else(|| self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))?;
1599            neighbor_node = tb.reverse(neighbor_node).or_else(|_| {
1600                Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))
1601            })?;
1602            let word_prefix = tb.mk_concat(word_id, NodeId::TS);
1603            let non_word_prefix = tb.mk_concat(not_word_id, NodeId::TS);
1604            return if tb.subsumes(word_prefix, neighbor_node) == Some(true) {
1605                Ok(Word)
1606            } else if tb.subsumes(non_word_prefix, neighbor_node) == Some(true) {
1607                Ok(NonWord)
1608            } else {
1609                Ok(Unknown)
1610            };
1611        };
1612        if tb.subsumes(word_id, node) == Some(true) {
1613            Ok(Word)
1614        } else if tb.subsumes(not_word_id, node) == Some(true) {
1615            Ok(NonWord)
1616        } else {
1617            Ok(Unknown)
1618        }
1619    }
1620
1621    fn strip_trailing_lookahead(tb: &mut TB<'s>, node: NodeId) -> NodeId {
1622        match tb.get_kind(node) {
1623            Kind::Lookahead if tb.get_min_max_length(node).1 == 0 => NodeId::EPS,
1624            Kind::Concat => {
1625                let l = node.left(tb);
1626                let r = node.right(tb);
1627                let stripped_r = Self::strip_trailing_lookahead(tb, r);
1628                if stripped_r == NodeId::EPS {
1629                    Self::strip_trailing_lookahead(tb, l)
1630                } else if stripped_r == r {
1631                    node
1632                } else {
1633                    tb.mk_concat(l, stripped_r)
1634                }
1635            }
1636            _ => node,
1637        }
1638    }
1639
1640    fn concat_edge_index(asts: &[Ast], left: bool) -> Option<usize> {
1641        let dir: isize = if left { -1 } else { 1 };
1642        let mut e = if left { asts.len() as isize - 1 } else { 0 };
1643        while e >= 0
1644            && (e as usize) < asts.len()
1645            && Self::is_transparent_for_dir(&asts[e as usize], dir)
1646        {
1647            e += dir;
1648        }
1649        if e < 0 || e as usize >= asts.len() {
1650            None
1651        } else {
1652            Some(e as usize)
1653        }
1654    }
1655
1656    fn is_transparent_for_dir(ast: &Ast, dir: isize) -> bool {
1657        match ast {
1658            Ast::Lookaround(la) => match la.kind {
1659                ast::LookaroundKind::PositiveLookahead | ast::LookaroundKind::NegativeLookahead => {
1660                    dir < 0
1661                }
1662                ast::LookaroundKind::PositiveLookbehind
1663                | ast::LookaroundKind::NegativeLookbehind => dir > 0,
1664            },
1665            Ast::Repetition(r) => matches!(
1666                &r.op.kind,
1667                ast::RepetitionKind::Range(ast::RepetitionRange::Exactly(0))
1668            ),
1669            _ => false,
1670        }
1671    }
1672
1673    fn concat_neighbor_kind(asts: &[Ast], idx: usize, dir: isize) -> WordCharKind {
1674        use WordCharKind::*;
1675        let next = idx as isize + dir;
1676        if next < 0 || next >= asts.len() as isize {
1677            return Edge;
1678        }
1679        if Self::is_transparent_for_dir(&asts[next as usize], dir) {
1680            return Self::concat_neighbor_kind(asts, next as usize, dir);
1681        }
1682        let kind = Self::word_char_kind(&asts[next as usize], dir < 0);
1683        match kind {
1684            MaybeWord => match Self::concat_neighbor_kind(asts, next as usize, dir) {
1685                Word => Word,
1686                _ => Unknown,
1687            },
1688            MaybeNonWord => match Self::concat_neighbor_kind(asts, next as usize, dir) {
1689                NonWord => NonWord,
1690                _ => Unknown,
1691            },
1692            other => other,
1693        }
1694    }
1695
1696    fn specialize_word_boundaries(
1697        &mut self,
1698        children: &mut [NodeId],
1699        tb: &mut TB<'s>,
1700    ) -> Result<()> {
1701        let wb = self.unicode_classes.wb;
1702        let non_wb = self.unicode_classes.non_wb;
1703        if wb == NodeId::MISSING {
1704            return Ok(());
1705        }
1706        let word = self.unicode_classes.word;
1707        let non_word = self.unicode_classes.non_word;
1708        if word == NodeId::MISSING {
1709            return Ok(());
1710        }
1711        // narrow check, could be done over the whole regex instead of one node
1712        // but i need to make sure the cost isnt too large
1713        let word_pref = tb.mk_concat(word, NodeId::TS);
1714        let non_word_pref = tb.mk_concat(non_word, NodeId::TS);
1715        let word_suf = tb.mk_concat(NodeId::TS, word);
1716        let non_word_suf = tb.mk_concat(NodeId::TS, non_word);
1717        let len = children.len();
1718        for k in 0..len {
1719            let l = if k == 0 {
1720                WordCharKind::Edge
1721            } else {
1722                use resharp_algebra::Kind;
1723                if tb.get_kind(children[k - 1]) == Kind::End
1724                    && (children[k] == wb || children[k] == non_wb)
1725                {
1726                    return Err(
1727                        self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex)
1728                    );
1729                }
1730                Self::classify(tb, children[k - 1], word_suf, non_word_suf)
1731            };
1732            let r = if k + 1 >= len {
1733                WordCharKind::Edge
1734            } else {
1735                Self::classify(tb, children[k + 1], word_pref, non_word_pref)
1736            };
1737            children[k] = Self::rewrite_wb_in_node(tb, children[k], wb, non_wb, word, l, r)
1738                .ok_or_else(|| self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))?;
1739        }
1740        Ok(())
1741    }
1742
1743    fn rewrite_wb_in_node(
1744        b: &mut TB<'s>,
1745        node: NodeId,
1746        wb: NodeId,
1747        non_wb: NodeId,
1748        word: NodeId,
1749        left: WordCharKind,
1750        right: WordCharKind,
1751    ) -> Option<NodeId> {
1752        let boundary_match = if node == wb {
1753            true
1754        } else if node == non_wb {
1755            false
1756        } else if b.get_kind(node) == Kind::Union {
1757            let l = Self::rewrite_wb_in_node(b, node.left(b), wb, non_wb, word, left, right)?;
1758            let r = Self::rewrite_wb_in_node(b, node.right(b), wb, non_wb, word, left, right)?;
1759            return Some(b.mk_union(l, r));
1760        } else {
1761            return Some(node);
1762        };
1763        use WordCharKind::*;
1764        let result = match (left, right) {
1765            (NonWord, Word) | (Word, NonWord) => {
1766                if boundary_match {
1767                    NodeId::EPS
1768                } else {
1769                    NodeId::BOT
1770                }
1771            }
1772            (Word, Word) | (NonWord, NonWord) => {
1773                if boundary_match {
1774                    NodeId::BOT
1775                } else {
1776                    NodeId::EPS
1777                }
1778            }
1779            (Word, _) => {
1780                if boundary_match {
1781                    b.mk_neg_lookahead(word, 0)
1782                } else {
1783                    let tail = b.mk_concat(word, NodeId::TS);
1784                    b.mk_lookahead(tail, NodeId::MISSING, 0)
1785                }
1786            }
1787            (NonWord, _) => {
1788                if boundary_match {
1789                    let tail = b.mk_concat(word, NodeId::TS);
1790                    b.mk_lookahead(tail, NodeId::MISSING, 0)
1791                } else {
1792                    b.mk_neg_lookahead(word, 0)
1793                }
1794            }
1795            (_, Word) => {
1796                if boundary_match {
1797                    b.mk_neg_lookbehind(word)
1798                } else {
1799                    b.mk_lookbehind(word, NodeId::MISSING)
1800                }
1801            }
1802            (_, NonWord) => {
1803                if boundary_match {
1804                    b.mk_lookbehind(word, NodeId::MISSING)
1805                } else {
1806                    b.mk_neg_lookbehind(word)
1807                }
1808            }
1809            _ => return Some(node),
1810        };
1811        Some(result)
1812    }
1813
1814    fn classify(
1815        b: &mut TB<'s>,
1816        node: NodeId,
1817        word_dir: NodeId,
1818        non_word_dir: NodeId,
1819    ) -> WordCharKind {
1820        if b.contains_look(node) || b.contains_anchors(node) {
1821            return WordCharKind::Unknown;
1822        }
1823        if b.subsumes(word_dir, node) == Some(true) {
1824            WordCharKind::Word
1825        } else if b.subsumes(non_word_dir, node) == Some(true) {
1826            WordCharKind::NonWord
1827        } else {
1828            WordCharKind::Unknown
1829        }
1830    }
1831
1832    fn rewrite_word_boundary_in_concat(
1833        &mut self,
1834        asts: &[Ast],
1835        idx: usize,
1836        translator: &mut Option<Translator>,
1837        tb: &mut TB<'s>,
1838        negated: bool,
1839    ) -> Result<(NodeId, usize)> {
1840        use WordCharKind::*;
1841        if self.global_full_unicode {
1842            self.unicode_classes.ensure_word_full(tb);
1843        } else if self.global_unicode && !self.global_ascii_perl {
1844            self.unicode_classes.ensure_word(tb);
1845        } else {
1846            self.unicode_classes.ensure_word_ascii(tb);
1847        }
1848        let word_id = self.unicode_classes.word;
1849        let not_word_id = self.unicode_classes.non_word;
1850        let left = self.resolve_word_kind(asts, idx, -1, translator, tb, word_id, not_word_id)?;
1851        let right = self.resolve_word_kind(asts, idx, 1, translator, tb, word_id, not_word_id)?;
1852        let boundary_match = !negated;
1853        match (left, right) {
1854            (NonWord, Word) | (Word, NonWord) => Ok((
1855                if boundary_match {
1856                    NodeId::EPS
1857                } else {
1858                    NodeId::BOT
1859                },
1860                idx + 1,
1861            )),
1862            (Word, Word) | (NonWord, NonWord) => Ok((
1863                if boundary_match {
1864                    NodeId::BOT
1865                } else {
1866                    NodeId::EPS
1867                },
1868                idx + 1,
1869            )),
1870            (Word, _) => {
1871                if boundary_match {
1872                    Ok((tb.mk_neg_lookahead(word_id, 0), idx + 1))
1873                } else {
1874                    let tail = tb.mk_concat(word_id, NodeId::TS);
1875                    self.merge_boundary_with_following_lookaheads(asts, idx, tail, translator, tb)
1876                }
1877            }
1878            (NonWord, _) => {
1879                if boundary_match {
1880                    let tail = tb.mk_concat(word_id, NodeId::TS);
1881                    self.merge_boundary_with_following_lookaheads(asts, idx, tail, translator, tb)
1882                } else {
1883                    Ok((tb.mk_neg_lookahead(word_id, 0), idx + 1))
1884                }
1885            }
1886            (_, Word) => {
1887                if boundary_match {
1888                    Ok((tb.mk_neg_lookbehind(word_id), idx + 1))
1889                } else {
1890                    Ok((tb.mk_lookbehind(word_id, NodeId::MISSING), idx + 1))
1891                }
1892            }
1893            (_, NonWord) => {
1894                if boundary_match {
1895                    Ok((tb.mk_lookbehind(word_id, NodeId::MISSING), idx + 1))
1896                } else {
1897                    Ok((tb.mk_neg_lookbehind(word_id), idx + 1))
1898                }
1899            }
1900            //   \b = (?<=\w)(?!\w) | (?<!\w)(?=\w)
1901            //   \B = (?<=\w)(?=\w)  | (?<!\w)(?!\w)
1902            //   TODO: expensive and the cost could be reduced further
1903            _ => {
1904                self.unicode_classes.ensure_wb(tb);
1905                let node = if boundary_match {
1906                    self.unicode_classes.wb
1907                } else {
1908                    self.unicode_classes.non_wb
1909                };
1910                Ok((node, idx + 1))
1911            }
1912        }
1913    }
1914
1915    fn merge_boundary_with_following_lookaheads(
1916        &mut self,
1917        asts: &[Ast],
1918        wb_idx: usize,
1919        boundary_tail: NodeId,
1920        translator: &mut Option<Translator>,
1921        tb: &mut TB<'s>,
1922    ) -> Result<(NodeId, usize)> {
1923        let mut next = wb_idx + 1;
1924        let mut la_bodies = vec![boundary_tail];
1925        while next < asts.len() {
1926            match &asts[next] {
1927                Ast::Lookaround(la) if la.kind == ast::LookaroundKind::PositiveLookahead => {
1928                    let body = self.ast_to_node_id(&la.ast, translator, tb)?;
1929                    la_bodies.push(tb.mk_concat(body, NodeId::TS));
1930                    next += 1;
1931                }
1932                _ => break,
1933            }
1934        }
1935        let merged = tb.mk_inters(la_bodies.into_iter());
1936        Ok((tb.mk_lookahead(merged, NodeId::MISSING, 0), next))
1937    }
1938
1939    fn ast_to_node_id(
1940        &mut self,
1941        ast: &Ast,
1942        translator: &mut Option<Translator>,
1943        tb: &mut TB<'s>,
1944    ) -> Result<NodeId> {
1945        match ast {
1946            Ast::Empty(_) => Ok(NodeId::EPS),
1947            Ast::Flags(f) => {
1948                if f.flags.flag_state(ast::Flag::SwapGreed).is_some() {
1949                    return Err(self.error(f.span, ast::ErrorKind::UnsupportedResharpRegex));
1950                }
1951                let mut translator_builder = self.default_translator_builder();
1952                if let Some(state) = f.flags.flag_state(ast::Flag::CaseInsensitive) {
1953                    translator_builder.case_insensitive(state);
1954                }
1955                if let Some(state) = f.flags.flag_state(ast::Flag::Unicode) {
1956                    translator_builder.unicode(state);
1957                }
1958                if let Some(state) = f.flags.flag_state(ast::Flag::DotMatchesNewLine) {
1959                    self.dot_all.set(state);
1960                }
1961                if let Some(state) = f.flags.flag_state(ast::Flag::MultiLine) {
1962                    self.multiline.set(state);
1963                }
1964                let concat_translator = Some(translator_builder.build());
1965                *translator = concat_translator;
1966                Ok(NodeId::EPS)
1967            }
1968            Ast::Literal(l) => {
1969                let ast_lit = regex_syntax::ast::Ast::literal(*l.to_owned());
1970                self.translator_to_node_id(&ast_lit, translator, tb)
1971            }
1972            Ast::Top(_) => Ok(NodeId::TOP),
1973            Ast::Dot(_) => {
1974                let codepoint_dot = self.global_ascii_perl || self.global_full_unicode;
1975                let hirv = match (codepoint_dot, self.dot_all.get()) {
1976                    (true, true) => hir::Hir::dot(hir::Dot::AnyChar),
1977                    (true, false) => hir::Hir::dot(hir::Dot::AnyCharExceptLF),
1978                    (false, true) => return Ok(NodeId::TOP),
1979                    (false, false) => hir::Hir::dot(hir::Dot::AnyByteExceptLF),
1980                };
1981                self.hir_to_node_id(&hirv, tb)
1982            }
1983            Ast::Assertion(a) => match &a.kind {
1984                ast::AssertionKind::StartText => Ok(NodeId::BEGIN),
1985                ast::AssertionKind::EndText => Ok(NodeId::END),
1986                ast::AssertionKind::WordBoundary => {
1987                    let only = Ast::Assertion(a.clone());
1988                    let asts = std::slice::from_ref(&only);
1989                    let (node, _) =
1990                        self.rewrite_word_boundary_in_concat(asts, 0, translator, tb, false)?;
1991                    Ok(node)
1992                }
1993                ast::AssertionKind::NotWordBoundary => {
1994                    // bare \B with no surrounding concat: treat as singleton.
1995                    let only = Ast::Assertion(a.clone());
1996                    let asts = std::slice::from_ref(&only);
1997                    let (node, _) =
1998                        self.rewrite_word_boundary_in_concat(asts, 0, translator, tb, true)?;
1999                    Ok(node)
2000                }
2001                ast::AssertionKind::StartLine => {
2002                    if !self.multiline.get() {
2003                        return Ok(NodeId::BEGIN);
2004                    }
2005                    let left = NodeId::BEGIN;
2006                    let right = tb.mk_u8(b'\n');
2007                    let union = tb.mk_union(left, right);
2008                    Ok(tb.mk_lookbehind(union, NodeId::MISSING))
2009                }
2010                ast::AssertionKind::EndLine => {
2011                    if !self.multiline.get() {
2012                        return Ok(NodeId::END);
2013                    }
2014                    let left = NodeId::END;
2015                    let right = tb.mk_u8(b'\n');
2016                    let union = tb.mk_union(left, right);
2017                    Ok(tb.mk_lookahead(union, NodeId::MISSING, 0))
2018                }
2019                ast::AssertionKind::WordBoundaryStart => {
2020                    Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
2021                }
2022                ast::AssertionKind::WordBoundaryEnd => {
2023                    Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
2024                }
2025                ast::AssertionKind::WordBoundaryStartAngle => {
2026                    Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
2027                }
2028                ast::AssertionKind::WordBoundaryEndAngle => {
2029                    Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
2030                }
2031                ast::AssertionKind::WordBoundaryStartHalf => {
2032                    Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
2033                }
2034                ast::AssertionKind::WordBoundaryEndHalf => {
2035                    Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
2036                }
2037            },
2038            Ast::ClassUnicode(c) => {
2039                let tmp = regex_syntax::ast::ClassUnicode {
2040                    span: c.span,
2041                    negated: c.negated,
2042                    kind: c.kind.clone(),
2043                };
2044                if !c.negated {
2045                    if let regex_syntax::ast::ClassUnicodeKind::Named(s) = &c.kind {
2046                        match s.as_str() {
2047                            // \p{ascii} for ascii, \p{ascii}&\p{Letter} => [A-Za-z]
2048                            "ascii" => return Ok(tb.mk_range_u8(0, 127)),
2049                            // a single valid utf8 codepoint; \p{utf8}*&~(a) => non a, but valid utf8
2050                            "utf8" => return Ok(utf8_codepoint_node(tb)),
2051                            "hex" => {
2052                                let nums = tb.mk_range_u8(b'0', b'9');
2053                                let lets = tb.mk_range_u8(b'a', b'f');
2054                                let lets2 = tb.mk_range_u8(b'A', b'F');
2055                                let merged = tb.mk_unions([nums, lets, lets2].into_iter());
2056                                return Ok(merged);
2057                            }
2058                            _ => {}
2059                        }
2060                    };
2061                }
2062
2063                let orig_ast = regex_syntax::ast::Ast::class_unicode(tmp);
2064                self.translator_to_node_id(&orig_ast, translator, tb)
2065            }
2066            Ast::ClassPerl(c) => self.get_class(c.negated, c.kind.clone(), tb),
2067            Ast::ClassBracketed(c) => match &c.kind {
2068                regex_syntax::ast::ClassSet::Item(item) => {
2069                    if !c.negated && is_universal_perl_pair(item) {
2070                        if self.global_ascii_perl || self.global_full_unicode {
2071                            return Ok(self.any_codepoint_node(tb));
2072                        }
2073                        return Ok(NodeId::TOP);
2074                    }
2075                    if let regex_syntax::ast::ClassSetItem::Perl(p) = item {
2076                        return self.get_class(c.negated ^ p.negated, p.kind.clone(), tb);
2077                    }
2078                    let kind = if self.global_ascii_perl {
2079                        rewrite_ascii_perl_set(&c.kind)
2080                    } else {
2081                        c.kind.clone()
2082                    };
2083                    let tmp = regex_syntax::ast::ClassBracketed {
2084                        span: c.span,
2085                        negated: c.negated,
2086                        kind,
2087                    };
2088                    let orig_ast = regex_syntax::ast::Ast::class_bracketed(tmp);
2089                    self.translator_to_node_id(&orig_ast, translator, tb)
2090                }
2091                regex_syntax::ast::ClassSet::BinaryOp(_) => {
2092                    Err(self.error(c.span, ast::ErrorKind::UnsupportedResharpRegex))
2093                }
2094            },
2095            Ast::Repetition(r) => {
2096                let body = self.ast_to_node_id(&r.ast, translator, tb);
2097                match body {
2098                    Ok(body) => match &r.op.kind {
2099                        ast::RepetitionKind::ZeroOrOne => Ok(tb.mk_opt(body)),
2100                        ast::RepetitionKind::ZeroOrMore => Ok(tb.mk_star(body)),
2101                        ast::RepetitionKind::OneOrMore => Ok(tb.mk_plus(body)),
2102                        ast::RepetitionKind::Range(r) => match r {
2103                            ast::RepetitionRange::Exactly(n) => Ok(tb.mk_repeat(body, *n, *n)),
2104                            ast::RepetitionRange::AtLeast(n) => {
2105                                let rep = tb.mk_repeat(body, *n, *n);
2106                                let st = tb.mk_star(body);
2107                                Ok(tb.mk_concat(rep, st))
2108                            }
2109
2110                            ast::RepetitionRange::Bounded(n, m) => Ok(tb.mk_repeat(body, *n, *m)),
2111                        },
2112                    },
2113                    Err(_) => body,
2114                }
2115            }
2116            Ast::Lookaround(g) => {
2117                let body = self.ast_to_node_id(&g.ast, translator, tb)?;
2118                match g.kind {
2119                    ast::LookaroundKind::PositiveLookahead if body.contains_lookbehind(tb) => {
2120                        let mut prefix = NodeId::EPS;
2121                        let mut rest = body;
2122                        while tb.get_kind(rest) == Kind::Concat
2123                            && tb.get_kind(rest.left(tb)) == Kind::Lookbehind
2124                        {
2125                            prefix = tb.mk_concat(prefix, rest.left(tb));
2126                            rest = rest.right(tb);
2127                        }
2128                        if prefix == NodeId::EPS || rest.contains_lookbehind(tb) {
2129                            return Err(self.error(g.span, ast::ErrorKind::UnsupportedResharpRegex));
2130                        }
2131                        let la = tb.mk_lookahead(rest, NodeId::MISSING, 0);
2132                        Ok(tb.mk_concat(prefix, la))
2133                    }
2134                    ast::LookaroundKind::NegativeLookahead if body.contains_lookbehind(tb) => {
2135                        Err(self.error(g.span, ast::ErrorKind::UnsupportedResharpRegex))
2136                    }
2137                    ast::LookaroundKind::PositiveLookahead => {
2138                        Ok(tb.mk_lookahead(body, NodeId::MISSING, 0))
2139                    }
2140                    ast::LookaroundKind::PositiveLookbehind
2141                    | ast::LookaroundKind::NegativeLookbehind
2142                        if body.contains_lookahead(tb) =>
2143                    {
2144                        Err(self.error(g.span, ast::ErrorKind::UnsupportedResharpRegex))
2145                    }
2146                    ast::LookaroundKind::PositiveLookbehind => {
2147                        Ok(tb.mk_lookbehind(body, NodeId::MISSING))
2148                    }
2149                    ast::LookaroundKind::NegativeLookahead => Ok(tb.mk_neg_lookahead(body, 0)),
2150                    ast::LookaroundKind::NegativeLookbehind => Ok(tb.mk_neg_lookbehind(body)),
2151                }
2152            }
2153            Ast::Group(g) => {
2154                if let ast::GroupKind::NonCapturing(ref flags) = g.kind {
2155                    if !flags.items.is_empty() {
2156                        let mut translator_builder = self.default_translator_builder();
2157                        if let Some(state) = flags.flag_state(ast::Flag::CaseInsensitive) {
2158                            translator_builder.case_insensitive(state);
2159                        }
2160                        if let Some(state) = flags.flag_state(ast::Flag::Unicode) {
2161                            translator_builder.unicode(state);
2162                        }
2163                        let saved_dot_all = self.dot_all.get();
2164                        if let Some(state) = flags.flag_state(ast::Flag::DotMatchesNewLine) {
2165                            self.dot_all.set(state);
2166                        }
2167                        let saved_multiline = self.multiline.get();
2168                        if let Some(state) = flags.flag_state(ast::Flag::MultiLine) {
2169                            self.multiline.set(state);
2170                        }
2171                        let mut scoped = Some(translator_builder.build());
2172                        let result = self.ast_to_node_id(&g.ast, &mut scoped, tb);
2173                        self.dot_all.set(saved_dot_all);
2174                        self.multiline.set(saved_multiline);
2175                        return result;
2176                    }
2177                }
2178                self.ast_to_node_id(&g.ast, translator, tb)
2179            }
2180            Ast::Alternation(a) => {
2181                let mut children = vec![];
2182                for ast in &a.asts {
2183                    match self.ast_to_node_id(ast, translator, tb) {
2184                        Ok(node_id) => children.push(node_id),
2185                        Err(err) => return Err(err),
2186                    }
2187                }
2188                Ok(tb.mk_unions(children.iter().copied()))
2189            }
2190            Ast::Concat(c) => {
2191                let mut concat_translator: Option<Translator> = None;
2192                let mut children = vec![];
2193                let mut prev_boundary_child: Option<usize> = None;
2194                let mut i = 0;
2195                while i < c.asts.len() {
2196                    let ast = &c.asts[i];
2197                    match ast {
2198                        Ast::Flags(f) => {
2199                            if f.flags.flag_state(ast::Flag::SwapGreed).is_some() {
2200                                return Err(
2201                                    self.error(f.span, ast::ErrorKind::UnsupportedResharpRegex)
2202                                );
2203                            }
2204                            let mut translator_builder = self.default_translator_builder();
2205                            if let Some(state) = f.flags.flag_state(ast::Flag::CaseInsensitive) {
2206                                translator_builder.case_insensitive(state);
2207                            }
2208                            if let Some(state) = f.flags.flag_state(ast::Flag::Unicode) {
2209                                translator_builder.unicode(state);
2210                            }
2211                            if let Some(state) = f.flags.flag_state(ast::Flag::DotMatchesNewLine) {
2212                                self.dot_all.set(state);
2213                            }
2214                            if let Some(state) = f.flags.flag_state(ast::Flag::MultiLine) {
2215                                self.multiline.set(state);
2216                            }
2217                            concat_translator = Some(translator_builder.build());
2218                            *translator = concat_translator.clone();
2219                            i += 1;
2220                            continue;
2221                        }
2222                        Ast::Assertion(a)
2223                            if a.kind == ast::AssertionKind::WordBoundary
2224                                || a.kind == ast::AssertionKind::NotWordBoundary =>
2225                        {
2226                            let negated = a.kind == ast::AssertionKind::NotWordBoundary;
2227                            let node = self.rewrite_word_boundary_in_concat(
2228                                &c.asts, i, translator, tb, negated,
2229                            )?;
2230                            match prev_boundary_child {
2231                                Some(idx) => children[idx] = tb.mk_inter(children[idx], node.0),
2232                                None => {
2233                                    children.push(node.0);
2234                                    prev_boundary_child = Some(children.len() - 1);
2235                                }
2236                            }
2237                            i = node.1; // skip consumed lookaheads
2238                            continue;
2239                        }
2240                        _ => {}
2241                    }
2242                    match concat_translator {
2243                        Some(_) => match self.ast_to_node_id(ast, &mut concat_translator, tb) {
2244                            Ok(node_id) => {
2245                                if node_id != resharp_algebra::NodeId::EPS {
2246                                    prev_boundary_child = None;
2247                                    children.push(node_id);
2248                                }
2249                            }
2250                            Err(err) => return Err(err),
2251                        },
2252                        None => match self.ast_to_node_id(ast, translator, tb) {
2253                            Ok(node_id) => {
2254                                if node_id != resharp_algebra::NodeId::EPS {
2255                                    prev_boundary_child = None;
2256                                    children.push(node_id);
2257                                }
2258                            }
2259                            Err(err) => return Err(err),
2260                        },
2261                    }
2262                    i += 1;
2263                }
2264                self.specialize_word_boundaries(&mut children, tb)?;
2265                Ok(tb.mk_concats(children.iter().cloned()))
2266            }
2267            Ast::Intersection(intersection) => {
2268                let mut children = vec![];
2269                for ast in &intersection.asts {
2270                    match self.ast_to_node_id(ast, translator, tb) {
2271                        Ok(node_id) => children.push(node_id),
2272                        Err(err) => return Err(err),
2273                    }
2274                }
2275                Ok(tb.mk_inters(children.into_iter()))
2276            }
2277            Ast::Complement(complement) => {
2278                let body = self.ast_to_node_id(&complement.ast, translator, tb);
2279                body.map(|x| tb.mk_compl(x))
2280            }
2281        }
2282    }
2283
2284    fn parse_inner(&mut self) -> Result<Ast> {
2285        let mut concat = Concat {
2286            span: self.span(),
2287            asts: vec![],
2288        };
2289        loop {
2290            self.bump_space();
2291            if self.is_eof() {
2292                break;
2293            }
2294            match self.char() {
2295                '(' => concat = self.push_group(concat)?,
2296                ')' => concat = self.pop_group(concat)?,
2297                '|' => concat = self.push_alternate(concat)?,
2298                '&' => concat = self.push_intersect(concat)?,
2299                '~' => concat = self.push_compl_group(concat)?,
2300                '[' => {
2301                    let class = self.parse_set_class()?;
2302                    concat.asts.push(Ast::class_bracketed(class));
2303                }
2304                '?' => {
2305                    concat =
2306                        self.parse_uncounted_repetition(concat, ast::RepetitionKind::ZeroOrOne)?;
2307                }
2308                '*' => {
2309                    concat =
2310                        self.parse_uncounted_repetition(concat, ast::RepetitionKind::ZeroOrMore)?;
2311                }
2312                '+' => {
2313                    concat =
2314                        self.parse_uncounted_repetition(concat, ast::RepetitionKind::OneOrMore)?;
2315                }
2316                '{' => {
2317                    concat = self.parse_counted_repetition(concat)?;
2318                }
2319                _ => concat.asts.push(self.parse_primitive()?.into_ast()),
2320            }
2321            if self.stack_group.borrow().len() > self.max_depth {
2322                return Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex));
2323            }
2324        }
2325        let ast = self.pop_group_end(concat)?;
2326        if expanded_ast_size(&ast, self.expanded_ast_limit) >= self.expanded_ast_limit
2327            || max_concat_length(&ast) >= self.max_list_len
2328        {
2329            return Err(self.error(*ast.span(), ast::ErrorKind::UnsupportedResharpRegex));
2330        }
2331        Ok(ast)
2332    }
2333
2334    fn parse(&mut self, tb: &mut TB<'s>) -> Result<NodeId> {
2335        let ast = self.parse_inner()?;
2336        if let Err(span) = ensure_lookbehind_at_start(&ast, true) {
2337            return Err(self.error(span, ast::ErrorKind::UnsupportedResharpRegex));
2338        }
2339        self.ast_to_node_id(&ast, &mut None, tb)
2340    }
2341
2342    #[inline(never)]
2343    fn parse_uncounted_repetition(
2344        &self,
2345        mut concat: ast::Concat,
2346        kind: ast::RepetitionKind,
2347    ) -> Result<ast::Concat> {
2348        // assert!(self.char() == '?' || self.char() == '*' || self.char() == '+');
2349        let op_start = self.pos();
2350        let ast = match concat.asts.pop() {
2351            Some(ast) => ast,
2352            None => return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing)),
2353        };
2354        match ast {
2355            Ast::Empty(_) | Ast::Flags(_) => {
2356                return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing))
2357            }
2358            _ => {}
2359        }
2360        if self.bump() && self.char() == '?' {
2361            return Err(self.error(
2362                Span::new(op_start, self.pos()),
2363                ast::ErrorKind::UnsupportedLazyQuantifier,
2364            ));
2365        }
2366        concat.asts.push(Ast::repetition(ast::Repetition {
2367            span: ast.span().with_end(self.pos()),
2368            op: ast::RepetitionOp {
2369                span: Span::new(op_start, self.pos()),
2370                kind,
2371            },
2372            greedy: true,
2373            ast: Box::new(ast),
2374        }));
2375        Ok(concat)
2376    }
2377
2378    #[inline(never)]
2379    fn parse_counted_repetition(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
2380        assert!(self.char() == '{');
2381        let start = self.pos();
2382        let ast = match concat.asts.pop() {
2383            Some(ast) => ast,
2384            None => return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing)),
2385        };
2386        match ast {
2387            Ast::Empty(_) | Ast::Flags(_) => {
2388                return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing))
2389            }
2390            _ => {}
2391        }
2392        if !self.bump_and_bump_space() {
2393            return Err(self.error(
2394                Span::new(start, self.pos()),
2395                ast::ErrorKind::RepetitionCountUnclosed,
2396            ));
2397        }
2398        let count_start = specialize_err(
2399            self.parse_decimal(),
2400            ast::ErrorKind::DecimalEmpty,
2401            ast::ErrorKind::RepetitionCountDecimalEmpty,
2402        );
2403        if self.is_eof() {
2404            return Err(self.error(
2405                Span::new(start, self.pos()),
2406                ast::ErrorKind::RepetitionCountUnclosed,
2407            ));
2408        }
2409        let range = if self.char() == ',' {
2410            if !self.bump_and_bump_space() {
2411                return Err(self.error(
2412                    Span::new(start, self.pos()),
2413                    ast::ErrorKind::RepetitionCountUnclosed,
2414                ));
2415            }
2416            if self.char() != '}' {
2417                let count_start = match count_start {
2418                    Ok(c) => c,
2419                    Err(err) if err.kind == ast::ErrorKind::RepetitionCountDecimalEmpty => {
2420                        if self.parser().empty_min_range {
2421                            0
2422                        } else {
2423                            return Err(err);
2424                        }
2425                    }
2426                    err => err?,
2427                };
2428                let count_end = specialize_err(
2429                    self.parse_decimal(),
2430                    ast::ErrorKind::DecimalEmpty,
2431                    ast::ErrorKind::RepetitionCountDecimalEmpty,
2432                )?;
2433                ast::RepetitionRange::Bounded(count_start, count_end)
2434            } else {
2435                ast::RepetitionRange::AtLeast(count_start?)
2436            }
2437        } else {
2438            ast::RepetitionRange::Exactly(count_start?)
2439        };
2440
2441        if self.is_eof() || self.char() != '}' {
2442            return Err(self.error(
2443                Span::new(start, self.pos()),
2444                ast::ErrorKind::RepetitionCountUnclosed,
2445            ));
2446        }
2447
2448        if self.bump_and_bump_space() && self.char() == '?' {
2449            return Err(self.error(
2450                Span::new(start, self.pos()),
2451                ast::ErrorKind::UnsupportedLazyQuantifier,
2452            ));
2453        }
2454
2455        let op_span = Span::new(start, self.pos());
2456        if !range.is_valid() {
2457            return Err(self.error(op_span, ast::ErrorKind::RepetitionCountInvalid));
2458        }
2459
2460        let over_limit = match &range {
2461            ast::RepetitionRange::Exactly(n) => *n > self.max_repeat,
2462            ast::RepetitionRange::AtLeast(n) => *n > self.max_repeat,
2463            ast::RepetitionRange::Bounded(n, m) => *n > self.max_repeat || *m > self.max_repeat,
2464        };
2465        if over_limit {
2466            return Err(self.error(op_span, ast::ErrorKind::UnsupportedResharpRegex));
2467        }
2468        concat.asts.push(Ast::repetition(ast::Repetition {
2469            span: ast.span().with_end(self.pos()),
2470            op: ast::RepetitionOp {
2471                span: op_span,
2472                kind: ast::RepetitionKind::Range(range),
2473            },
2474            greedy: true,
2475            ast: Box::new(ast),
2476        }));
2477        Ok(concat)
2478    }
2479
2480    #[inline(never)]
2481    fn parse_group(&self) -> Result<Either<ast::SetFlags, ast::Group>> {
2482        assert_eq!(self.char(), '(');
2483        let open_span = self.span_char();
2484        self.bump();
2485        self.bump_space();
2486        if let Some((ahead, pos)) = self.is_lookaround_prefix() {
2487            let kind = match (pos, ahead) {
2488                (true, true) => LookaroundKind::PositiveLookahead,
2489                (true, false) => LookaroundKind::PositiveLookbehind,
2490                (false, true) => LookaroundKind::NegativeLookahead,
2491                (false, false) => LookaroundKind::NegativeLookbehind,
2492            };
2493            return Ok(Either::Right(ast::Group {
2494                span: open_span,
2495                kind: ast::GroupKind::Lookaround(kind),
2496                ast: Box::new(Ast::empty(self.span())),
2497            }));
2498        }
2499        let inner_span = self.span();
2500        let mut starts_with_p = true;
2501        if self.bump_if("?P<") || {
2502            starts_with_p = false;
2503            self.bump_if("?<")
2504        } {
2505            let capture_index = self.next_capture_index(open_span)?;
2506            let name = self.parse_capture_name(capture_index)?;
2507            Ok(Either::Right(ast::Group {
2508                span: open_span,
2509                kind: ast::GroupKind::CaptureName {
2510                    starts_with_p,
2511                    name,
2512                },
2513                ast: Box::new(Ast::empty(self.span())),
2514            }))
2515        } else if self.bump_if("?") {
2516            if self.is_eof() {
2517                return Err(self.error(open_span, ast::ErrorKind::GroupUnclosed));
2518            }
2519            let flags = self.parse_flags()?;
2520            let char_end = self.char();
2521            self.bump();
2522            if char_end == ')' {
2523                // We don't allow empty flags, e.g., `(?)`. We instead
2524                // interpret it as a repetition operator missing its argument.
2525                if flags.items.is_empty() {
2526                    return Err(self.error(inner_span, ast::ErrorKind::RepetitionMissing));
2527                }
2528                Ok(Either::Left(ast::SetFlags {
2529                    span: Span {
2530                        end: self.pos(),
2531                        ..open_span
2532                    },
2533                    flags,
2534                }))
2535            } else {
2536                assert_eq!(char_end, ':');
2537                Ok(Either::Right(ast::Group {
2538                    span: open_span,
2539                    kind: ast::GroupKind::NonCapturing(flags),
2540                    ast: Box::new(Ast::empty(self.span())),
2541                }))
2542            }
2543        } else {
2544            let capture_index = self.next_capture_index(open_span)?;
2545            Ok(Either::Right(ast::Group {
2546                span: open_span,
2547                kind: ast::GroupKind::CaptureIndex(capture_index),
2548                ast: Box::new(Ast::empty(self.span())),
2549            }))
2550        }
2551    }
2552
2553    #[inline(never)]
2554    fn parse_capture_name(&self, capture_index: u32) -> Result<ast::CaptureName> {
2555        if self.is_eof() {
2556            return Err(self.error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
2557        }
2558        let start = self.pos();
2559        loop {
2560            if self.char() == '>' {
2561                break;
2562            }
2563            if !is_capture_char(self.char(), self.pos() == start) {
2564                return Err(self.error(self.span_char(), ast::ErrorKind::GroupNameInvalid));
2565            }
2566            if !self.bump() {
2567                break;
2568            }
2569        }
2570        let end = self.pos();
2571        if self.is_eof() {
2572            return Err(self.error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
2573        }
2574        assert_eq!(self.char(), '>');
2575        self.bump();
2576        let name = &self.pattern()[start.offset..end.offset];
2577        if name.is_empty() {
2578            return Err(self.error(Span::new(start, start), ast::ErrorKind::GroupNameEmpty));
2579        }
2580        let capname = ast::CaptureName {
2581            span: Span::new(start, end),
2582            name: name.to_string(),
2583            index: capture_index,
2584        };
2585        self.add_capture_name(&capname)?;
2586        Ok(capname)
2587    }
2588
2589    #[inline(never)]
2590    fn parse_flags(&self) -> Result<ast::Flags> {
2591        let mut flags = ast::Flags {
2592            span: self.span(),
2593            items: vec![],
2594        };
2595        let mut last_was_negation = None;
2596        while self.char() != ':' && self.char() != ')' {
2597            if self.char() == '-' {
2598                last_was_negation = Some(self.span_char());
2599                let item = ast::FlagsItem {
2600                    span: self.span_char(),
2601                    kind: ast::FlagsItemKind::Negation,
2602                };
2603                if let Some(i) = flags.add_item(item) {
2604                    return Err(self.error(
2605                        self.span_char(),
2606                        ast::ErrorKind::FlagRepeatedNegation {
2607                            original: flags.items[i].span,
2608                        },
2609                    ));
2610                }
2611            } else {
2612                last_was_negation = None;
2613                let item = ast::FlagsItem {
2614                    span: self.span_char(),
2615                    kind: ast::FlagsItemKind::Flag(self.parse_flag()?),
2616                };
2617                if let Some(i) = flags.add_item(item) {
2618                    return Err(self.error(
2619                        self.span_char(),
2620                        ast::ErrorKind::FlagDuplicate {
2621                            original: flags.items[i].span,
2622                        },
2623                    ));
2624                }
2625            }
2626            if !self.bump() {
2627                return Err(self.error(self.span(), ast::ErrorKind::FlagUnexpectedEof));
2628            }
2629        }
2630        if let Some(span) = last_was_negation {
2631            return Err(self.error(span, ast::ErrorKind::FlagDanglingNegation));
2632        }
2633        flags.span.end = self.pos();
2634        Ok(flags)
2635    }
2636
2637    #[inline(never)]
2638    fn parse_flag(&self) -> Result<ast::Flag> {
2639        match self.char() {
2640            'i' => Ok(ast::Flag::CaseInsensitive),
2641            'm' => Ok(ast::Flag::MultiLine),
2642            's' => Ok(ast::Flag::DotMatchesNewLine),
2643            'U' => Ok(ast::Flag::SwapGreed),
2644            'u' => Ok(ast::Flag::Unicode),
2645            'R' => Ok(ast::Flag::CRLF),
2646            'x' => Ok(ast::Flag::IgnoreWhitespace),
2647            _ => Err(self.error(self.span_char(), ast::ErrorKind::FlagUnrecognized)),
2648        }
2649    }
2650
2651    fn parse_primitive(&self) -> Result<Primitive> {
2652        match self.char() {
2653            '\\' => self.parse_escape(),
2654            '_' => {
2655                let ast = Primitive::Top(self.span_char());
2656                self.bump();
2657                Ok(ast)
2658            }
2659            '.' => {
2660                let ast = Primitive::Dot(self.span_char());
2661                self.bump();
2662                Ok(ast)
2663            }
2664            '^' => {
2665                let ast = Primitive::Assertion(ast::Assertion {
2666                    span: self.span_char(),
2667                    kind: ast::AssertionKind::StartLine,
2668                });
2669                self.bump();
2670                Ok(ast)
2671            }
2672            '$' => {
2673                let ast = Primitive::Assertion(ast::Assertion {
2674                    span: self.span_char(),
2675                    kind: ast::AssertionKind::EndLine,
2676                });
2677                self.bump();
2678                Ok(ast)
2679            }
2680            c => {
2681                let ast = Primitive::Literal(Literal {
2682                    span: self.span_char(),
2683                    kind: LiteralKind::Verbatim,
2684                    c,
2685                });
2686                self.bump();
2687                Ok(ast)
2688            }
2689        }
2690    }
2691
2692    #[inline(never)]
2693    fn parse_escape(&self) -> Result<Primitive> {
2694        assert_eq!(self.char(), '\\');
2695        let start = self.pos();
2696        if !self.bump() {
2697            return Err(self.error(
2698                Span::new(start, self.pos()),
2699                ast::ErrorKind::EscapeUnexpectedEof,
2700            ));
2701        }
2702        let c = self.char();
2703        // Put some of the more complicated routines into helpers.
2704        match c {
2705            '0'..='9' => {
2706                if !self.parser().octal {
2707                    return Err(self.error(
2708                        Span::new(start, self.span_char().end),
2709                        ast::ErrorKind::UnsupportedBackreference,
2710                    ));
2711                }
2712                let mut lit = self.parse_octal();
2713                lit.span.start = start;
2714                return Ok(Primitive::Literal(lit));
2715            }
2716            'x' | 'u' | 'U' => {
2717                let mut lit = self.parse_hex()?;
2718                lit.span.start = start;
2719                return Ok(Primitive::Literal(lit));
2720            }
2721            'p' | 'P' => {
2722                let mut cls = self.parse_unicode_class()?;
2723                cls.span.start = start;
2724                return Ok(Primitive::Unicode(cls));
2725            }
2726            'd' | 's' | 'w' | 'D' | 'S' | 'W' => {
2727                let mut cls = self.parse_perl_class();
2728                cls.span.start = start;
2729                return Ok(Primitive::Perl(cls));
2730            }
2731            _ => {}
2732        }
2733
2734        // Handle all of the one letter sequences inline.
2735        self.bump();
2736        let span = Span::new(start, self.pos());
2737        if is_meta_character(c) {
2738            return Ok(Primitive::Literal(Literal {
2739                span,
2740                kind: LiteralKind::Meta,
2741                c,
2742            }));
2743        }
2744        if is_escapeable_character(c) {
2745            return Ok(Primitive::Literal(Literal {
2746                span,
2747                kind: LiteralKind::Superfluous,
2748                c,
2749            }));
2750        }
2751        let special = |kind, c| {
2752            Ok(Primitive::Literal(Literal {
2753                span,
2754                kind: LiteralKind::Special(kind),
2755                c,
2756            }))
2757        };
2758        match c {
2759            'a' => special(SpecialLiteralKind::Bell, '\x07'),
2760            'f' => special(SpecialLiteralKind::FormFeed, '\x0C'),
2761            't' => special(SpecialLiteralKind::Tab, '\t'),
2762            'n' => special(SpecialLiteralKind::LineFeed, '\n'),
2763            'r' => special(SpecialLiteralKind::CarriageReturn, '\r'),
2764            'v' => special(SpecialLiteralKind::VerticalTab, '\x0B'),
2765            'A' => Ok(Primitive::Assertion(ast::Assertion {
2766                span,
2767                kind: ast::AssertionKind::StartText,
2768            })),
2769            'z' => Ok(Primitive::Assertion(ast::Assertion {
2770                span,
2771                kind: ast::AssertionKind::EndText,
2772            })),
2773            'b' => {
2774                let mut wb = ast::Assertion {
2775                    span,
2776                    kind: ast::AssertionKind::WordBoundary,
2777                };
2778                // After a \b, we "try" to parse things like \b{start} for
2779                // special word boundary assertions.
2780                if !self.is_eof() && self.char() == '{' {
2781                    if let Some(kind) = self.maybe_parse_special_word_boundary(start)? {
2782                        wb.kind = kind;
2783                        wb.span.end = self.pos();
2784                    }
2785                }
2786                Ok(Primitive::Assertion(wb))
2787            }
2788            'B' => Ok(Primitive::Assertion(ast::Assertion {
2789                span,
2790                kind: ast::AssertionKind::NotWordBoundary,
2791            })),
2792            '<' => Ok(Primitive::Assertion(ast::Assertion {
2793                span,
2794                kind: ast::AssertionKind::WordBoundaryStartAngle,
2795            })),
2796            '>' => Ok(Primitive::Assertion(ast::Assertion {
2797                span,
2798                kind: ast::AssertionKind::WordBoundaryEndAngle,
2799            })),
2800            _ => Err(self.error(span, ast::ErrorKind::EscapeUnrecognized)),
2801        }
2802    }
2803
2804    fn maybe_parse_special_word_boundary(
2805        &self,
2806        wb_start: Position,
2807    ) -> Result<Option<ast::AssertionKind>> {
2808        assert_eq!(self.char(), '{');
2809
2810        let is_valid_char = |c| matches!(c, 'A'..='Z' | 'a'..='z' | '-');
2811        let start = self.pos();
2812        if !self.bump_and_bump_space() {
2813            return Err(self.error(
2814                Span::new(wb_start, self.pos()),
2815                ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof,
2816            ));
2817        }
2818        let start_contents = self.pos();
2819        if !is_valid_char(self.char()) {
2820            self.parser().pos.set(start);
2821            return Ok(None);
2822        }
2823
2824        // Now collect up our chars until we see a '}'.
2825        let mut scratch = self.parser().scratch.borrow_mut();
2826        scratch.clear();
2827        while !self.is_eof() && is_valid_char(self.char()) {
2828            scratch.push(self.char());
2829            self.bump_and_bump_space();
2830        }
2831        if self.is_eof() || self.char() != '}' {
2832            return Err(self.error(
2833                Span::new(start, self.pos()),
2834                ast::ErrorKind::SpecialWordBoundaryUnclosed,
2835            ));
2836        }
2837        let end = self.pos();
2838        self.bump();
2839        let kind = match scratch.as_str() {
2840            "start" => ast::AssertionKind::WordBoundaryStart,
2841            "end" => ast::AssertionKind::WordBoundaryEnd,
2842            "start-half" => ast::AssertionKind::WordBoundaryStartHalf,
2843            "end-half" => ast::AssertionKind::WordBoundaryEndHalf,
2844            _ => {
2845                return Err(self.error(
2846                    Span::new(start_contents, end),
2847                    ast::ErrorKind::SpecialWordBoundaryUnrecognized,
2848                ))
2849            }
2850        };
2851        Ok(Some(kind))
2852    }
2853
2854    #[inline(never)]
2855    fn parse_octal(&self) -> Literal {
2856        assert!(self.parser().octal);
2857        assert!('0' <= self.char() && self.char() <= '7');
2858        let start = self.pos();
2859        // Parse up to two more digits.
2860        while self.bump()
2861            && '0' <= self.char()
2862            && self.char() <= '7'
2863            && self.pos().offset - start.offset <= 2
2864        {}
2865        let end = self.pos();
2866        let octal = &self.pattern()[start.offset..end.offset];
2867        // Parsing the octal should never fail since the above guarantees a
2868        // valid number.
2869        let codepoint = u32::from_str_radix(octal, 8).expect("valid octal number");
2870        // The max value for 3 digit octal is 0777 = 511 and [0, 511] has no
2871        // invalid Unicode scalar values.
2872        let c = char::from_u32(codepoint).expect("Unicode scalar value");
2873        Literal {
2874            span: Span::new(start, end),
2875            kind: LiteralKind::Octal,
2876            c,
2877        }
2878    }
2879
2880    #[inline(never)]
2881    fn parse_hex(&self) -> Result<Literal> {
2882        assert!(self.char() == 'x' || self.char() == 'u' || self.char() == 'U');
2883
2884        let hex_kind = match self.char() {
2885            'x' => HexLiteralKind::X,
2886            'u' => HexLiteralKind::UnicodeShort,
2887            _ => HexLiteralKind::UnicodeLong,
2888        };
2889        if !self.bump_and_bump_space() {
2890            return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2891        }
2892        if self.char() == '{' {
2893            self.parse_hex_brace(hex_kind)
2894        } else {
2895            self.parse_hex_digits(hex_kind)
2896        }
2897    }
2898
2899    #[inline(never)]
2900    fn parse_hex_digits(&self, kind: HexLiteralKind) -> Result<Literal> {
2901        let mut scratch = self.parser().scratch.borrow_mut();
2902        scratch.clear();
2903
2904        let start = self.pos();
2905        for i in 0..kind.digits() {
2906            if i > 0 && !self.bump_and_bump_space() {
2907                return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2908            }
2909            if !is_hex(self.char()) {
2910                return Err(self.error(self.span_char(), ast::ErrorKind::EscapeHexInvalidDigit));
2911            }
2912            scratch.push(self.char());
2913        }
2914        self.bump_and_bump_space();
2915        let end = self.pos();
2916        let hex = scratch.as_str();
2917        match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
2918            None => Err(self.error(Span::new(start, end), ast::ErrorKind::EscapeHexInvalid)),
2919            Some(c) => Ok(Literal {
2920                span: Span::new(start, end),
2921                kind: LiteralKind::HexFixed(kind),
2922                c,
2923            }),
2924        }
2925    }
2926
2927    #[inline(never)]
2928    fn parse_hex_brace(&self, kind: HexLiteralKind) -> Result<Literal> {
2929        let mut scratch = self.parser().scratch.borrow_mut();
2930        scratch.clear();
2931
2932        let brace_pos = self.pos();
2933        let start = self.span_char().end;
2934        while self.bump_and_bump_space() && self.char() != '}' {
2935            if !is_hex(self.char()) {
2936                return Err(self.error(self.span_char(), ast::ErrorKind::EscapeHexInvalidDigit));
2937            }
2938            scratch.push(self.char());
2939        }
2940        if self.is_eof() {
2941            return Err(self.error(
2942                Span::new(brace_pos, self.pos()),
2943                ast::ErrorKind::EscapeUnexpectedEof,
2944            ));
2945        }
2946        let end = self.pos();
2947        let hex = scratch.as_str();
2948        assert_eq!(self.char(), '}');
2949        self.bump_and_bump_space();
2950
2951        if hex.is_empty() {
2952            return Err(self.error(
2953                Span::new(brace_pos, self.pos()),
2954                ast::ErrorKind::EscapeHexEmpty,
2955            ));
2956        }
2957        match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
2958            None => Err(self.error(Span::new(start, end), ast::ErrorKind::EscapeHexInvalid)),
2959            Some(c) => Ok(Literal {
2960                span: Span::new(start, self.pos()),
2961                kind: LiteralKind::HexBrace(kind),
2962                c,
2963            }),
2964        }
2965    }
2966
2967    fn parse_decimal(&self) -> Result<u32> {
2968        let mut scratch = self.parser().scratch.borrow_mut();
2969        scratch.clear();
2970
2971        while !self.is_eof() && self.char().is_whitespace() {
2972            self.bump();
2973        }
2974        let start = self.pos();
2975        while !self.is_eof() && '0' <= self.char() && self.char() <= '9' {
2976            scratch.push(self.char());
2977            self.bump_and_bump_space();
2978        }
2979        let span = Span::new(start, self.pos());
2980        while !self.is_eof() && self.char().is_whitespace() {
2981            self.bump_and_bump_space();
2982        }
2983        let digits = scratch.as_str();
2984        if digits.is_empty() {
2985            return Err(self.error(span, ast::ErrorKind::DecimalEmpty));
2986        }
2987        match digits.parse::<u32>().ok() {
2988            Some(n) => Ok(n),
2989            None => Err(self.error(span, ast::ErrorKind::DecimalInvalid)),
2990        }
2991    }
2992
2993    #[inline(never)]
2994    fn parse_set_class(&self) -> Result<ClassBracketed> {
2995        assert_eq!(self.char(), '[');
2996
2997        let mut union = ClassSetUnion {
2998            span: self.span(),
2999            items: vec![],
3000        };
3001        loop {
3002            self.bump_space();
3003            if self.is_eof() {
3004                return Err(self.unclosed_class_error());
3005            }
3006            match self.char() {
3007                '[' => {
3008                    if !self.parser().stack_class.borrow().is_empty() {
3009                        if let Some(cls) = self.maybe_parse_ascii_class() {
3010                            union.push(ClassSetItem::Ascii(cls));
3011                            continue;
3012                        }
3013                    }
3014                    union = self.push_class_open(union)?;
3015                }
3016                ']' => match self.pop_class(union)? {
3017                    Either::Left(nested_union) => {
3018                        union = nested_union;
3019                    }
3020                    Either::Right(class) => return Ok(class),
3021                },
3022                '&' if self.peek() == Some('&') => {
3023                    assert!(self.bump_if("&&"));
3024                    union = self.push_class_op(ClassSetBinaryOpKind::Intersection, union);
3025                }
3026                '-' if self.peek() == Some('-') => {
3027                    assert!(self.bump_if("--"));
3028                    union = self.push_class_op(ClassSetBinaryOpKind::Difference, union);
3029                }
3030                '~' if self.peek() == Some('~') => {
3031                    assert!(self.bump_if("~~"));
3032                    union = self.push_class_op(ClassSetBinaryOpKind::SymmetricDifference, union);
3033                }
3034                _ => {
3035                    union.push(self.parse_set_class_range()?);
3036                }
3037            }
3038        }
3039    }
3040
3041    #[inline(never)]
3042    fn parse_set_class_range(&self) -> Result<ClassSetItem> {
3043        let prim1 = self.parse_set_class_item()?;
3044        self.bump_space();
3045        if self.is_eof() {
3046            return Err(self.unclosed_class_error());
3047        }
3048        if self.char() != '-' || self.peek_space() == Some(']') || self.peek_space() == Some('-') {
3049            return prim1.into_class_set_item(self);
3050        }
3051        if !self.bump_and_bump_space() {
3052            return Err(self.unclosed_class_error());
3053        }
3054        let prim2 = self.parse_set_class_item()?;
3055        let range = ClassSetRange {
3056            span: Span::new(prim1.span().start, prim2.span().end),
3057            start: prim1.into_class_literal(self)?,
3058            end: prim2.into_class_literal(self)?,
3059        };
3060        if !range.is_valid() {
3061            return Err(self.error(range.span, ast::ErrorKind::ClassRangeInvalid));
3062        }
3063        Ok(ClassSetItem::Range(range))
3064    }
3065
3066    #[inline(never)]
3067    fn parse_set_class_item(&self) -> Result<Primitive> {
3068        if self.char() == '\\' {
3069            self.parse_escape()
3070        } else {
3071            let x = Primitive::Literal(Literal {
3072                span: self.span_char(),
3073                kind: LiteralKind::Verbatim,
3074                c: self.char(),
3075            });
3076            self.bump();
3077            Ok(x)
3078        }
3079    }
3080
3081    #[inline(never)]
3082    fn parse_set_class_open(&self) -> Result<(ClassBracketed, ClassSetUnion)> {
3083        assert_eq!(self.char(), '[');
3084        let start = self.pos();
3085        if !self.bump_and_bump_space() {
3086            return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
3087        }
3088
3089        let negated = if self.char() != '^' {
3090            false
3091        } else {
3092            if !self.bump_and_bump_space() {
3093                return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
3094            }
3095            true
3096        };
3097        // Accept any number of `-` as literal `-`.
3098        let mut union = ClassSetUnion {
3099            span: self.span(),
3100            items: vec![],
3101        };
3102        while self.char() == '-' {
3103            union.push(ClassSetItem::Literal(Literal {
3104                span: self.span_char(),
3105                kind: LiteralKind::Verbatim,
3106                c: '-',
3107            }));
3108            if !self.bump_and_bump_space() {
3109                return Err(self.error(Span::new(start, start), ast::ErrorKind::ClassUnclosed));
3110            }
3111        }
3112        // If `]` is the *first* char in a set, then interpret it as a literal
3113        // `]`. That is, an empty class is impossible to write.
3114        if union.items.is_empty() && self.char() == ']' {
3115            union.push(ClassSetItem::Literal(Literal {
3116                span: self.span_char(),
3117                kind: LiteralKind::Verbatim,
3118                c: ']',
3119            }));
3120            if !self.bump_and_bump_space() {
3121                return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
3122            }
3123        }
3124        let set = ClassBracketed {
3125            span: Span::new(start, self.pos()),
3126            negated,
3127            kind: ClassSet::union(ClassSetUnion {
3128                span: Span::new(union.span.start, union.span.start),
3129                items: vec![],
3130            }),
3131        };
3132        Ok((set, union))
3133    }
3134
3135    #[inline(never)]
3136    fn maybe_parse_ascii_class(&self) -> Option<ClassAscii> {
3137        assert_eq!(self.char(), '[');
3138        // If parsing fails, then we back up the parser to this starting point.
3139        let start = self.pos();
3140        let mut negated = false;
3141        if !self.bump() || self.char() != ':' {
3142            self.parser().pos.set(start);
3143            return None;
3144        }
3145        if !self.bump() {
3146            self.parser().pos.set(start);
3147            return None;
3148        }
3149        if self.char() == '^' {
3150            negated = true;
3151            if !self.bump() {
3152                self.parser().pos.set(start);
3153                return None;
3154            }
3155        }
3156        let name_start = self.offset();
3157        while self.char() != ':' && self.bump() {}
3158        if self.is_eof() {
3159            self.parser().pos.set(start);
3160            return None;
3161        }
3162        let name = &self.pattern()[name_start..self.offset()];
3163        if !self.bump_if(":]") {
3164            self.parser().pos.set(start);
3165            return None;
3166        }
3167        let kind = match regex_syntax::ast::ClassAsciiKind::from_name(name) {
3168            Some(kind) => kind,
3169            None => {
3170                self.parser().pos.set(start);
3171                return None;
3172            }
3173        };
3174        Some(ClassAscii {
3175            span: Span::new(start, self.pos()),
3176            kind,
3177            negated,
3178        })
3179    }
3180
3181    #[inline(never)]
3182    fn parse_unicode_class(&self) -> Result<ClassUnicode> {
3183        assert!(self.char() == 'p' || self.char() == 'P');
3184
3185        let mut scratch = self.parser().scratch.borrow_mut();
3186        scratch.clear();
3187
3188        let negated = self.char() == 'P';
3189        if !self.bump_and_bump_space() {
3190            return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
3191        }
3192        let (start, kind) = if self.char() == '{' {
3193            let start = self.span_char().end;
3194            while self.bump_and_bump_space() && self.char() != '}' {
3195                scratch.push(self.char());
3196            }
3197            if self.is_eof() {
3198                return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
3199            }
3200            assert_eq!(self.char(), '}');
3201            self.bump();
3202
3203            let name = scratch.as_str();
3204            if let Some(i) = name.find("!=") {
3205                (
3206                    start,
3207                    ClassUnicodeKind::NamedValue {
3208                        op: ClassUnicodeOpKind::NotEqual,
3209                        name: name[..i].to_string(),
3210                        value: name[i + 2..].to_string(),
3211                    },
3212                )
3213            } else if let Some(i) = name.find(':') {
3214                (
3215                    start,
3216                    ClassUnicodeKind::NamedValue {
3217                        op: ClassUnicodeOpKind::Colon,
3218                        name: name[..i].to_string(),
3219                        value: name[i + 1..].to_string(),
3220                    },
3221                )
3222            } else if let Some(i) = name.find('=') {
3223                (
3224                    start,
3225                    ClassUnicodeKind::NamedValue {
3226                        op: ClassUnicodeOpKind::Equal,
3227                        name: name[..i].to_string(),
3228                        value: name[i + 1..].to_string(),
3229                    },
3230                )
3231            } else {
3232                (start, ClassUnicodeKind::Named(name.to_string()))
3233            }
3234        } else {
3235            let start = self.pos();
3236            let c = self.char();
3237            if c == '\\' {
3238                return Err(self.error(self.span_char(), ast::ErrorKind::UnicodeClassInvalid));
3239            }
3240            self.bump_and_bump_space();
3241            let kind = ClassUnicodeKind::OneLetter(c);
3242            (start, kind)
3243        };
3244        Ok(ClassUnicode {
3245            span: Span::new(start, self.pos()),
3246            negated,
3247            kind,
3248        })
3249    }
3250
3251    #[inline(never)]
3252    fn parse_perl_class(&self) -> ClassPerl {
3253        let c = self.char();
3254        let span = self.span_char();
3255        self.bump();
3256        let (negated, kind) = match c {
3257            'd' => (false, regex_syntax::ast::ClassPerlKind::Digit),
3258            'D' => (true, regex_syntax::ast::ClassPerlKind::Digit),
3259            's' => (false, regex_syntax::ast::ClassPerlKind::Space),
3260            'S' => (true, regex_syntax::ast::ClassPerlKind::Space),
3261            'w' => (false, regex_syntax::ast::ClassPerlKind::Word),
3262            'W' => (true, regex_syntax::ast::ClassPerlKind::Word),
3263            c => panic!("expected valid Perl class but got '{}'", c),
3264        };
3265        ClassPerl {
3266            span,
3267            kind,
3268            negated,
3269        }
3270    }
3271}
3272
3273fn is_universal_perl_pair(item: &regex_syntax::ast::ClassSetItem) -> bool {
3274    use regex_syntax::ast::ClassSetItem;
3275    let items = match item {
3276        ClassSetItem::Union(u) => &u.items,
3277        _ => return false,
3278    };
3279    if items.len() != 2 {
3280        return false;
3281    }
3282    match (&items[0], &items[1]) {
3283        (ClassSetItem::Perl(a), ClassSetItem::Perl(b)) => {
3284            let is_all = a.kind == b.kind && a.negated != b.negated;
3285            is_all
3286        }
3287        _ => false,
3288    }
3289}
3290
3291pub fn max_concat_length(ast: &ast::Ast) -> usize {
3292    match ast {
3293        ast::Ast::Empty(_)
3294        | ast::Ast::Flags(_)
3295        | ast::Ast::Literal(_)
3296        | ast::Ast::Dot(_)
3297        | ast::Ast::Top(_)
3298        | ast::Ast::Assertion(_)
3299        | ast::Ast::ClassUnicode(_)
3300        | ast::Ast::ClassPerl(_)
3301        | ast::Ast::ClassBracketed(_) => 0,
3302        ast::Ast::Group(g) => max_concat_length(&g.ast),
3303        ast::Ast::Complement(c) => max_concat_length(&c.ast),
3304        ast::Ast::Lookaround(l) => max_concat_length(&l.ast),
3305        ast::Ast::Repetition(r) => max_concat_length(&r.ast),
3306        ast::Ast::Concat(c) => c
3307            .asts
3308            .len()
3309            .max(c.asts.iter().map(max_concat_length).max().unwrap_or(0)),
3310        ast::Ast::Alternation(a) => a.asts.iter().map(max_concat_length).max().unwrap_or(0),
3311        ast::Ast::Intersection(i) => i.asts.iter().map(max_concat_length).max().unwrap_or(0),
3312    }
3313}
3314
3315pub fn expanded_ast_size(ast: &ast::Ast, limit: u64) -> u64 {
3316    fn go(ast: &ast::Ast, limit: u64) -> u64 {
3317        match ast {
3318            ast::Ast::Empty(_) | ast::Ast::Flags(_) => 1,
3319            ast::Ast::Literal(_) | ast::Ast::Dot(_) | ast::Ast::Top(_) => 1,
3320            ast::Ast::Assertion(_) => 1,
3321            ast::Ast::ClassUnicode(_) | ast::Ast::ClassPerl(_) | ast::Ast::ClassBracketed(_) => 1,
3322            ast::Ast::Group(g) => go(&g.ast, limit).saturating_add(1).min(limit),
3323            ast::Ast::Complement(c) => go(&c.ast, limit).saturating_add(1).min(limit),
3324            ast::Ast::Lookaround(l) => go(&l.ast, limit).saturating_add(1).min(limit),
3325            ast::Ast::Concat(c) => sum_children(&c.asts, limit),
3326            ast::Ast::Alternation(a) => sum_children(&a.asts, limit),
3327            ast::Ast::Intersection(i) => sum_children(&i.asts, limit),
3328            ast::Ast::Repetition(r) => {
3329                let body = go(&r.ast, limit);
3330                let factor: u64 = match &r.op.kind {
3331                    ast::RepetitionKind::ZeroOrOne => 2,
3332                    ast::RepetitionKind::ZeroOrMore | ast::RepetitionKind::OneOrMore => 2,
3333                    ast::RepetitionKind::Range(ast::RepetitionRange::Exactly(n)) => {
3334                        (*n as u64).max(1)
3335                    }
3336                    ast::RepetitionKind::Range(ast::RepetitionRange::AtLeast(n)) => {
3337                        (*n as u64).max(1).saturating_add(1)
3338                    }
3339                    ast::RepetitionKind::Range(ast::RepetitionRange::Bounded(_, m)) => {
3340                        (*m as u64).max(1)
3341                    }
3342                };
3343                body.saturating_mul(factor).min(limit)
3344            }
3345        }
3346    }
3347    fn sum_children(children: &[ast::Ast], limit: u64) -> u64 {
3348        let mut total: u64 = 0;
3349        for c in children {
3350            total = total.saturating_add(go(c, limit));
3351            if total >= limit {
3352                return limit;
3353            }
3354        }
3355        total
3356    }
3357    go(ast, limit)
3358}
3359
3360pub fn parse_ast<'s>(tb: &mut TB<'s>, pattern: &'s str) -> std::result::Result<NodeId, ParseError> {
3361    let mut p: ResharpParser<'s> = ResharpParser::new(pattern);
3362    p.parse(tb)
3363}
3364
3365pub fn parse_ast_with<'s>(
3366    tb: &mut TB<'s>,
3367    pattern: &'s str,
3368    flags: &PatternFlags,
3369) -> std::result::Result<NodeId, ParseError> {
3370    let mut p: ResharpParser<'s> = ResharpParser::with_flags(pattern, flags);
3371    p.parse(tb)
3372}
3373
3374/// Parse a pattern into the raw AST without converting to algebra nodes.
3375pub fn parse_to_ast(pattern: &str) -> std::result::Result<ast::Ast, ParseError> {
3376    let mut p: ResharpParser = ResharpParser::new(pattern);
3377    p.parse_inner()
3378}
resharp_parser/lib.rs

resharp_parser/
lib.rs