resharp_parser/
lib.rs

1//! Parser for resharp regex patterns.
2//!
3//! Converts regex pattern strings into the node representation used by resharp-algebra.
4
5#![warn(dead_code)]
6mod ast;
7use std::cell::{Cell, RefCell};
8
9use ast::{Ast, Concat, ErrorKind, GroupKind, LookaroundKind};
10use regex_syntax::{
11    ast::{
12        ClassAscii, ClassBracketed, ClassPerl, ClassSet, ClassSetBinaryOpKind, ClassSetItem,
13        ClassSetRange, ClassSetUnion, ClassUnicode, ClassUnicodeKind, ClassUnicodeOpKind,
14        HexLiteralKind, Literal, LiteralKind, Position, Span, SpecialLiteralKind,
15    },
16    hir::{
17        self,
18        translate::{Translator, TranslatorBuilder},
19    },
20    utf8::Utf8Sequences,
21};
22use resharp_algebra::NodeId;
23
24type TB<'s> = resharp_algebra::RegexBuilder;
25
26/// global pattern-level flags, set from `EngineOptions`.
27pub struct PatternFlags {
28    /// `\w`/`\d`/`\s` match full Unicode (true) or ASCII only (false).
29    pub unicode: bool,
30    /// global case-insensitive matching.
31    pub case_insensitive: bool,
32    /// `.` matches `\n` (behaves like `_`).
33    pub dot_matches_new_line: bool,
34    /// allow whitespace and `#` comments in the pattern.
35    pub ignore_whitespace: bool,
36}
37
38impl Default for PatternFlags {
39    fn default() -> Self {
40        Self {
41            unicode: true,
42            case_insensitive: false,
43            dot_matches_new_line: false,
44            ignore_whitespace: false,
45        }
46    }
47}
48
49#[derive(Clone, Copy, PartialEq)]
50enum WordCharKind {
51    Word,
52    NonWord,
53    MaybeWord,
54    MaybeNonWord,
55    Unknown,
56    Edge,
57}
58
59fn is_word_byte(b: u8) -> bool {
60    b.is_ascii_alphanumeric() || b == b'_'
61}
62
63
64#[derive(Clone, Debug, Eq, PartialEq)]
65enum Primitive {
66    Literal(Literal),
67    Assertion(ast::Assertion),
68    Dot(Span),
69    Top(Span),
70    Perl(ClassPerl),
71    Unicode(ClassUnicode),
72}
73
74impl Primitive {
75    fn span(&self) -> &Span {
76        match *self {
77            Primitive::Literal(ref x) => &x.span,
78            Primitive::Assertion(ref x) => &x.span,
79            Primitive::Dot(ref span) => span,
80            Primitive::Top(ref span) => span,
81            Primitive::Perl(ref x) => &x.span,
82            Primitive::Unicode(ref x) => &x.span,
83        }
84    }
85
86    fn into_ast(self) -> Ast {
87        match self {
88            Primitive::Literal(lit) => Ast::literal(lit),
89            Primitive::Assertion(assert) => Ast::assertion(assert),
90            Primitive::Dot(span) => Ast::dot(span),
91            Primitive::Top(span) => Ast::top(span),
92            Primitive::Perl(cls) => Ast::class_perl(cls),
93            Primitive::Unicode(cls) => Ast::class_unicode(cls),
94        }
95    }
96
97    fn into_class_set_item(self, p: &ResharpParser) -> Result<regex_syntax::ast::ClassSetItem> {
98        use self::Primitive::*;
99        use regex_syntax::ast::ClassSetItem;
100
101        match self {
102            Literal(lit) => Ok(ClassSetItem::Literal(lit)),
103            Perl(cls) => Ok(ClassSetItem::Perl(cls)),
104            Unicode(cls) => Ok(ClassSetItem::Unicode(cls)),
105            x => Err(p.error(*x.span(), ast::ErrorKind::ClassEscapeInvalid)),
106        }
107    }
108
109    fn into_class_literal(self, p: &ResharpParser) -> Result<Literal> {
110        use self::Primitive::*;
111
112        match self {
113            Literal(lit) => Ok(lit),
114            x => Err(p.error(*x.span(), ast::ErrorKind::ClassRangeLiteral)),
115        }
116    }
117}
118
119#[derive(Clone, Debug, Eq, PartialEq)]
120pub enum Either<Left, Right> {
121    Left(Left),
122    Right(Right),
123}
124
125#[derive(Clone, Debug, Eq, PartialEq)]
126pub struct ResharpError {
127    /// The kind of error.
128    pub kind: ErrorKind,
129    /// The original pattern that the parser generated the error from. Every
130    /// span in an error is a valid range into this string.
131    pattern: String,
132    /// The span of this error.
133    pub span: Span,
134}
135
136impl std::fmt::Display for ResharpError {
137    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
138        write!(f, "{:?}: {:?}", self.kind, self.span)
139    }
140}
141impl std::error::Error for ResharpError {}
142
143type Result<T> = core::result::Result<T, ResharpError>;
144
145#[derive(Clone, Debug)]
146enum GroupState {
147    /// This state is pushed whenever an opening group is found.
148    Group {
149        /// The concatenation immediately preceding the opening group.
150        concat: Concat,
151        /// The group that has been opened. Its sub-AST is always empty.
152        group: ast::Group,
153        /// Whether this group has the `x` flag enabled or not.
154        ignore_whitespace: bool,
155    },
156    /// This state is pushed whenever a new alternation branch is found. If
157    /// an alternation branch is found and this state is at the top of the
158    /// stack, then this state should be modified to include the new
159    /// alternation.
160    Alternation(ast::Alternation),
161    Intersection(ast::Intersection),
162}
163
164#[derive(Clone, Debug)]
165enum ClassState {
166    /// This state is pushed whenever an opening bracket is found.
167    Open {
168        /// The union of class items immediately preceding this class.
169        union: regex_syntax::ast::ClassSetUnion,
170        /// The class that has been opened. Typically this just corresponds
171        /// to the `[`, but it can also include `[^` since `^` indicates
172        /// negation of the class.
173        set: regex_syntax::ast::ClassBracketed,
174    },
175    /// This state is pushed when a operator is seen. When popped, the stored
176    /// set becomes the left hand side of the operator.
177    Op {
178        /// The type of the operation, i.e., &&, -- or ~~.
179        kind: regex_syntax::ast::ClassSetBinaryOpKind,
180        /// The left-hand side of the operator.
181        lhs: regex_syntax::ast::ClassSet,
182    },
183}
184
185/// RE# syntax parser based on the regex-syntax crate.
186pub struct ResharpParser<'s> {
187    perl_classes: Vec<(bool, regex_syntax::ast::ClassPerlKind, NodeId)>,
188    unicode_classes: resharp_algebra::UnicodeClassCache,
189    pub translator: regex_syntax::hir::translate::Translator,
190    pub pattern: &'s str,
191    pos: Cell<Position>,
192    capture_index: Cell<u32>,
193    octal: bool,
194    empty_min_range: bool,
195    ignore_whitespace: Cell<bool>,
196    dot_all: Cell<bool>,
197    global_unicode: bool,
198    global_case_insensitive: bool,
199    comments: RefCell<Vec<ast::Comment>>,
200    stack_group: RefCell<Vec<GroupState>>,
201    stack_class: RefCell<Vec<ClassState>>,
202    capture_names: RefCell<Vec<ast::CaptureName>>,
203    scratch: RefCell<String>,
204}
205
206fn specialize_err<T>(result: Result<T>, from: ast::ErrorKind, to: ast::ErrorKind) -> Result<T> {
207    result.map_err(|e| {
208        if e.kind == from {
209            ResharpError { kind: to, pattern: e.pattern, span: e.span }
210        } else {
211            e
212        }
213    })
214}
215
216fn is_capture_char(c: char, first: bool) -> bool {
217    if first {
218        c == '_' || c.is_alphabetic()
219    } else {
220        c == '_' || c == '.' || c == '[' || c == ']' || c.is_alphanumeric()
221    }
222}
223
224pub fn is_meta_character(c: char) -> bool {
225    matches!(c, '\\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' | '[' | ']' | '{' | '}' | '^' | '$'
226        | '#' | '&' | '-' | '~' | '_')
227}
228
229/// escapes all resharp meta characters in `text`.
230pub fn escape(text: &str) -> String {
231    let mut buf = String::new();
232    escape_into(text, &mut buf);
233    buf
234}
235
236/// escapes all resharp meta characters in `text` and appends to `buf`.
237pub fn escape_into(text: &str, buf: &mut String) {
238    buf.reserve(text.len());
239    for c in text.chars() {
240        if is_meta_character(c) {
241            buf.push('\\');
242        }
243        buf.push(c);
244    }
245}
246
247pub fn is_escapeable_character(c: char) -> bool {
248    // Certainly escapeable if it's a meta character.
249    if is_meta_character(c) {
250        return true;
251    }
252    // Any character that isn't ASCII is definitely not escapeable. There's
253    // no real need to allow things like \☃ right?
254    if !c.is_ascii() {
255        return false;
256    }
257    // Otherwise, we basically say that everything is escapeable unless it's a
258    // letter or digit. Things like \3 are either octal (when enabled) or an
259    // error, and we should keep it that way. Otherwise, letters are reserved
260    // for adding new syntax in a backwards compatible way.
261    match c {
262        '0'..='9' | 'A'..='Z' | 'a'..='z' => false,
263        // While not currently supported, we keep these as not escapeable to
264        // give us some flexibility with respect to supporting the \< and
265        // \> word boundary assertions in the future. By rejecting them as
266        // escapeable, \< and \> will result in a parse error. Thus, we can
267        // turn them into something else in the future without it being a
268        // backwards incompatible change.
269        //
270        // OK, now we support \< and \>, and we need to retain them as *not*
271        // escapeable here since the escape sequence is significant.
272        '<' | '>' => false,
273        _ => true,
274    }
275}
276
277fn is_hex(c: char) -> bool {
278    c.is_ascii_digit() || ('a'..='f').contains(&c) || ('A'..='F').contains(&c)
279}
280
281impl<'s> ResharpParser<'s> {
282    fn default_translator_builder(&self) -> TranslatorBuilder {
283        let mut trb = TranslatorBuilder::new();
284        trb.unicode(self.global_unicode);
285        trb.utf8(false);
286        trb.case_insensitive(self.global_case_insensitive);
287        trb
288    }
289
290    pub fn new(pattern: &'s str) -> Self {
291        Self::with_flags(pattern, &PatternFlags::default())
292    }
293
294    pub fn with_flags(pattern: &'s str, flags: &PatternFlags) -> Self {
295        let mut trb = TranslatorBuilder::new();
296        trb.unicode(flags.unicode);
297        trb.utf8(false);
298        trb.case_insensitive(flags.case_insensitive);
299        Self {
300            translator: trb.build(),
301            pattern,
302            perl_classes: vec![],
303            unicode_classes: resharp_algebra::UnicodeClassCache::default(),
304            pos: Cell::new(Position::new(0, 0, 0)),
305            capture_index: Cell::new(0),
306            octal: false,
307            empty_min_range: false,
308            ignore_whitespace: Cell::new(flags.ignore_whitespace),
309            dot_all: Cell::new(flags.dot_matches_new_line),
310            global_unicode: flags.unicode,
311            global_case_insensitive: flags.case_insensitive,
312            comments: RefCell::new(vec![]),
313            stack_group: RefCell::new(vec![]),
314            stack_class: RefCell::new(vec![]),
315            capture_names: RefCell::new(vec![]),
316            scratch: RefCell::new(String::new()),
317        }
318    }
319
320    /// Return a reference to the parser state.
321    fn parser(&'_ self) -> &'_ ResharpParser<'_> {
322        self
323    }
324
325    /// Return a reference to the pattern being parsed.
326    fn pattern(&self) -> &str {
327        self.pattern
328    }
329
330    /// Create a new error with the given span and error type.
331    fn error(&self, span: Span, kind: ast::ErrorKind) -> ResharpError {
332        ResharpError {
333            kind,
334            pattern: self.pattern().to_string(),
335            span,
336        }
337    }
338
339    fn unsupported_error(&self, _: regex_syntax::hir::Error) -> ResharpError {
340        self.error(Span::splat(self.pos()), ast::ErrorKind::UnsupportedResharpRegex)
341    }
342
343    /// Return the current offset of the parser.
344    ///
345    /// The offset starts at `0` from the beginning of the regular expression
346    /// pattern string.
347    fn offset(&self) -> usize {
348        self.parser().pos.get().offset
349    }
350
351    /// Return the current line number of the parser.
352    ///
353    /// The line number starts at `1`.
354    fn line(&self) -> usize {
355        self.parser().pos.get().line
356    }
357
358    /// Return the current column of the parser.
359    ///
360    /// The column number starts at `1` and is reset whenever a `\n` is seen.
361    fn column(&self) -> usize {
362        self.parser().pos.get().column
363    }
364
365    /// Return the next capturing index. Each subsequent call increments the
366    /// internal index.
367    ///
368    /// The span given should correspond to the location of the opening
369    /// parenthesis.
370    ///
371    /// If the capture limit is exceeded, then an error is returned.
372    fn next_capture_index(&self, span: Span) -> Result<u32> {
373        let current = self.parser().capture_index.get();
374        let i = current
375            .checked_add(1)
376            .ok_or_else(|| self.error(span, ast::ErrorKind::CaptureLimitExceeded))?;
377        self.parser().capture_index.set(i);
378        Ok(i)
379    }
380
381    fn add_capture_name(&self, cap: &ast::CaptureName) -> Result<()> {
382        let mut names = self.parser().capture_names.borrow_mut();
383        match names.binary_search_by_key(&cap.name.as_str(), |c| c.name.as_str()) {
384            Err(i) => {
385                names.insert(i, cap.clone());
386                Ok(())
387            }
388            Ok(i) => Err(self.error(
389                cap.span,
390                ast::ErrorKind::GroupNameDuplicate {
391                    original: names[i].span,
392                },
393            )),
394        }
395    }
396
397    fn ignore_whitespace(&self) -> bool {
398        self.parser().ignore_whitespace.get()
399    }
400
401    fn char(&self) -> char {
402        self.char_at(self.offset())
403    }
404
405    fn char_at(&self, i: usize) -> char {
406        self.pattern()[i..]
407            .chars()
408            .next()
409            .unwrap_or_else(|| panic!("expected char at offset {}", i))
410    }
411
412    fn bump(&self) -> bool {
413        if self.is_eof() {
414            return false;
415        }
416        let Position {
417            mut offset,
418            mut line,
419            mut column,
420        } = self.pos();
421        if self.char() == '\n' {
422            line = line.checked_add(1).unwrap();
423            column = 1;
424        } else {
425            column = column.checked_add(1).unwrap();
426        }
427        offset += self.char().len_utf8();
428        self.parser().pos.set(Position {
429            offset,
430            line,
431            column,
432        });
433        self.pattern()[self.offset()..].chars().next().is_some()
434    }
435
436    fn bump_if(&self, prefix: &str) -> bool {
437        if self.pattern()[self.offset()..].starts_with(prefix) {
438            for _ in 0..prefix.chars().count() {
439                self.bump();
440            }
441            true
442        } else {
443            false
444        }
445    }
446
447    fn is_lookaround_prefix(&self) -> Option<(bool, bool)> {
448        if self.bump_if("?=") {
449            return Some((true, true));
450        }
451        if self.bump_if("?!") {
452            return Some((true, false));
453        }
454        if self.bump_if("?<=") {
455            return Some((false, true));
456        }
457        if self.bump_if("?<!") {
458            return Some((false, false));
459        }
460        None
461    }
462
463    fn bump_and_bump_space(&self) -> bool {
464        if !self.bump() {
465            return false;
466        }
467        self.bump_space();
468        !self.is_eof()
469    }
470
471    fn bump_space(&self) {
472        if !self.ignore_whitespace() {
473            return;
474        }
475        while !self.is_eof() {
476            if self.char().is_whitespace() {
477                self.bump();
478            } else if self.char() == '#' {
479                let start = self.pos();
480                let mut comment_text = String::new();
481                self.bump();
482                while !self.is_eof() {
483                    let c = self.char();
484                    self.bump();
485                    if c == '\n' {
486                        break;
487                    }
488                    comment_text.push(c);
489                }
490                let comment = ast::Comment {
491                    span: Span::new(start, self.pos()),
492                    comment: comment_text,
493                };
494                self.parser().comments.borrow_mut().push(comment);
495            } else {
496                break;
497            }
498        }
499    }
500
501    /// Peek at the next character in the input without advancing the parser.
502    ///
503    /// If the input has been exhausted, then this returns `None`.
504    fn peek(&self) -> Option<char> {
505        if self.is_eof() {
506            return None;
507        }
508        self.pattern()[self.offset() + self.char().len_utf8()..]
509            .chars()
510            .next()
511    }
512
513    /// Like peek, but will ignore spaces when the parser is in whitespace
514    /// insensitive mode.
515    fn peek_space(&self) -> Option<char> {
516        if !self.ignore_whitespace() {
517            return self.peek();
518        }
519        if self.is_eof() {
520            return None;
521        }
522        let mut start = self.offset() + self.char().len_utf8();
523        let mut in_comment = false;
524        for (i, c) in self.pattern()[start..].char_indices() {
525            if c.is_whitespace() {
526                continue;
527            } else if !in_comment && c == '#' {
528                in_comment = true;
529            } else if in_comment && c == '\n' {
530                in_comment = false;
531            } else {
532                start += i;
533                break;
534            }
535        }
536        self.pattern()[start..].chars().next()
537    }
538
539    /// Returns true if the next call to `bump` would return false.
540    fn is_eof(&self) -> bool {
541        self.offset() == self.pattern().len()
542    }
543
544    /// Return the current position of the parser, which includes the offset,
545    /// line and column.
546    fn pos(&self) -> Position {
547        self.parser().pos.get()
548    }
549
550    /// Create a span at the current position of the parser. Both the start
551    /// and end of the span are set.
552    fn span(&self) -> Span {
553        Span::splat(self.pos())
554    }
555
556    /// Create a span that covers the current character.
557    fn span_char(&self) -> Span {
558        let mut next = Position {
559            offset: self.offset().checked_add(self.char().len_utf8()).unwrap(),
560            line: self.line(),
561            column: self.column().checked_add(1).unwrap(),
562        };
563        if self.char() == '\n' {
564            next.line += 1;
565            next.column = 1;
566        }
567        Span::new(self.pos(), next)
568    }
569
570    /// Parse and push a single alternation on to the parser's internal stack.
571    /// If the top of the stack already has an alternation, then add to that
572    /// instead of pushing a new one.
573    ///
574    /// The concatenation given corresponds to a single alternation branch.
575    /// The concatenation returned starts the next branch and is empty.
576    ///
577    /// This assumes the parser is currently positioned at `|` and will advance
578    /// the parser to the character following `|`.
579    #[inline(never)]
580    fn push_alternate(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
581        assert_eq!(self.char(), '|');
582        concat.span.end = self.pos();
583        self.push_or_add_alternation(concat);
584        self.bump();
585        Ok(ast::Concat {
586            span: self.span(),
587            asts: vec![],
588        })
589    }
590
591    /// Pushes or adds the given branch of an alternation to the parser's
592    /// internal stack of state.
593    fn push_or_add_alternation(&self, concat: Concat) {
594        use self::GroupState::*;
595
596        let mut stack = self.parser().stack_group.borrow_mut();
597        if let Some(&mut Alternation(ref mut alts)) = stack.last_mut() {
598            alts.asts.push(concat.into_ast());
599            return;
600        }
601        stack.push(Alternation(ast::Alternation {
602            span: Span::new(concat.span.start, self.pos()),
603            asts: vec![concat.into_ast()],
604        }));
605    }
606
607    #[inline(never)]
608    fn push_intersect(&self, mut concat: Concat) -> Result<Concat> {
609        assert_eq!(self.char(), '&');
610        concat.span.end = self.pos();
611        self.push_or_add_intersect(concat);
612        self.bump();
613        Ok(Concat {
614            span: self.span(),
615            asts: vec![],
616        })
617    }
618
619    /// Pushes or adds the given branch of an alternation to the parser's
620    /// internal stack of state.
621    fn push_or_add_intersect(&self, concat: Concat) {
622        use self::GroupState::*;
623
624        let mut stack = self.parser().stack_group.borrow_mut();
625        if let Some(&mut Intersection(ref mut alts)) = stack.last_mut() {
626            alts.asts.push(concat.into_ast());
627            return;
628        }
629        stack.push(Intersection(ast::Intersection {
630            span: Span::new(concat.span.start, self.pos()),
631            asts: vec![concat.into_ast()],
632        }));
633    }
634
635    /// Parse and push a group AST (and its parent concatenation) on to the
636    /// parser's internal stack. Return a fresh concatenation corresponding
637    /// to the group's sub-AST.
638    ///
639    /// If a set of flags was found (with no group), then the concatenation
640    /// is returned with that set of flags added.
641    ///
642    /// This assumes that the parser is currently positioned on the opening
643    /// parenthesis. It advances the parser to the character at the start
644    /// of the sub-expression (or adjoining expression).
645    ///
646    /// If there was a problem parsing the start of the group, then an error
647    /// is returned.
648    #[inline(never)]
649    fn push_group(&self, mut concat: Concat) -> Result<Concat> {
650        assert_eq!(self.char(), '(');
651        match self.parse_group()? {
652            Either::Left(set) => {
653                let ignore = set.flags.flag_state(ast::Flag::IgnoreWhitespace);
654                if let Some(v) = ignore {
655                    self.parser().ignore_whitespace.set(v);
656                }
657
658                concat.asts.push(Ast::flags(set));
659                Ok(concat)
660            }
661            Either::Right(group) => {
662                let old_ignore_whitespace = self.ignore_whitespace();
663                let new_ignore_whitespace = group
664                    .flags()
665                    .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
666                    .unwrap_or(old_ignore_whitespace);
667                self.parser()
668                    .stack_group
669                    .borrow_mut()
670                    .push(GroupState::Group {
671                        concat,
672                        group,
673                        ignore_whitespace: old_ignore_whitespace,
674                    });
675                self.parser().ignore_whitespace.set(new_ignore_whitespace);
676                Ok(Concat {
677                    span: self.span(),
678                    asts: vec![],
679                })
680            }
681        }
682    }
683
684    #[inline(never)]
685    fn push_compl_group(&self, concat: Concat) -> Result<Concat> {
686        assert_eq!(self.char(), '~');
687        self.bump();
688        if self.is_eof() || self.char() != '(' {
689            return Err(self.error(self.span(), ast::ErrorKind::ComplementGroupExpected));
690        }
691        let open_span = self.span_char();
692        self.bump();
693        let group = ast::Group {
694            span: open_span,
695            kind: ast::GroupKind::Complement,
696            ast: Box::new(Ast::empty(self.span())),
697        };
698
699        let old_ignore_whitespace = self.ignore_whitespace();
700        let new_ignore_whitespace = group
701            .flags()
702            .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
703            .unwrap_or(old_ignore_whitespace);
704        self.parser()
705            .stack_group
706            .borrow_mut()
707            .push(GroupState::Group {
708                concat,
709                group,
710                ignore_whitespace: old_ignore_whitespace,
711            });
712        self.parser().ignore_whitespace.set(new_ignore_whitespace);
713        Ok(Concat {
714            span: self.span(),
715            asts: vec![],
716        })
717    }
718
719    /// Pop a group AST from the parser's internal stack and set the group's
720    /// AST to the given concatenation. Return the concatenation containing
721    /// the group.
722    ///
723    /// This assumes that the parser is currently positioned on the closing
724    /// parenthesis and advances the parser to the character following the `)`.
725    ///
726    /// If no such group could be popped, then an unopened group error is
727    /// returned.
728    #[inline(never)]
729    fn pop_group(&self, mut group_concat: Concat) -> Result<Concat> {
730        use self::GroupState::*;
731        assert_eq!(self.char(), ')');
732        let mut stack = self.parser().stack_group.borrow_mut();
733        let topstack = stack.pop();
734
735        let (mut prior_concat, mut group, ignore_whitespace, alt) = match topstack {
736            Some(Group {
737                concat,
738                group,
739                ignore_whitespace,
740            }) => (concat, group, ignore_whitespace, None),
741            Some(Alternation(alt)) => match stack.pop() {
742                Some(Group {
743                    concat,
744                    group,
745                    ignore_whitespace,
746                }) => (
747                    concat,
748                    group,
749                    ignore_whitespace,
750                    Some(Either::Left::<ast::Alternation, ast::Intersection>(alt)),
751                ),
752                None | Some(Alternation(_)) | Some(Intersection(_)) => {
753                    return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
754                }
755            },
756            Some(Intersection(int)) => match stack.pop() {
757                Some(Group {
758                    concat,
759                    group,
760                    ignore_whitespace,
761                }) => (
762                    concat,
763                    group,
764                    ignore_whitespace,
765                    Some(Either::Right::<ast::Alternation, ast::Intersection>(int)),
766                ),
767                None | Some(Alternation(_)) | Some(Intersection(_)) => {
768                    return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
769                }
770            },
771
772            None => {
773                return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
774            }
775        };
776        self.parser().ignore_whitespace.set(ignore_whitespace);
777        group_concat.span.end = self.pos();
778        self.bump();
779        group.span.end = self.pos();
780        match alt {
781            Some(Either::Left(mut alt)) => {
782                alt.span.end = group_concat.span.end;
783                alt.asts.push(group_concat.into_ast());
784                group.ast = Box::new(alt.into_ast());
785            }
786            Some(Either::Right(mut int)) => {
787                int.span.end = group_concat.span.end;
788                int.asts.push(group_concat.into_ast());
789                group.ast = Box::new(int.into_ast());
790            }
791            None => {
792                group.ast = Box::new(group_concat.into_ast());
793            }
794        }
795
796        if group.kind == GroupKind::Complement {
797            let complement = ast::Complement {
798                span: self.span(),
799                ast: group.ast,
800            };
801            prior_concat.asts.push(Ast::complement(complement));
802        }
803        // ignore groups for now
804        else {
805            prior_concat.asts.push(Ast::group(group));
806        }
807        Ok(prior_concat)
808    }
809
810    /// Pop the last state from the parser's internal stack, if it exists, and
811    /// add the given concatenation to it. There either must be no state or a
812    /// single alternation item on the stack. Any other scenario produces an
813    /// error.
814    ///
815    /// This assumes that the parser has advanced to the end.
816    #[inline(never)]
817    fn pop_group_end(&self, mut concat: ast::Concat) -> Result<Ast> {
818        concat.span.end = self.pos();
819        let mut stack = self.parser().stack_group.borrow_mut();
820        let ast = match stack.pop() {
821            None => Ok(concat.into_ast()),
822            Some(GroupState::Alternation(mut alt)) => {
823                alt.span.end = self.pos();
824                alt.asts.push(concat.into_ast());
825                Ok(Ast::alternation(alt))
826            }
827            Some(GroupState::Intersection(mut int)) => {
828                int.span.end = self.pos();
829                int.asts.push(concat.into_ast());
830
831                Ok(Ast::intersection(int))
832            }
833            Some(GroupState::Group { group, .. }) => {
834                return Err(self.error(group.span, ast::ErrorKind::GroupUnclosed));
835            }
836        };
837        // If we try to pop again, there should be nothing.
838        match stack.pop() {
839            None => ast,
840            Some(GroupState::Alternation(_)) => {
841                // This unreachable is unfortunate. This case can't happen
842                // because the only way we can be here is if there were two
843                // `GroupState::Alternation`s adjacent in the parser's stack,
844                // which we guarantee to never happen because we never push a
845                // `GroupState::Alternation` if one is already at the top of
846                // the stack.
847                unreachable!()
848            }
849            Some(GroupState::Intersection(_)) => {
850                unreachable!()
851            }
852            Some(GroupState::Group { group, .. }) => {
853                Err(self.error(group.span, ast::ErrorKind::GroupUnclosed))
854            }
855        }
856    }
857
858    /// Parse the opening of a character class and push the current class
859    /// parsing context onto the parser's stack. This assumes that the parser
860    /// is positioned at an opening `[`. The given union should correspond to
861    /// the union of set items built up before seeing the `[`.
862    ///
863    /// If there was a problem parsing the opening of the class, then an error
864    /// is returned. Otherwise, a new union of set items for the class is
865    /// returned (which may be populated with either a `]` or a `-`).
866    #[inline(never)]
867    fn push_class_open(
868        &self,
869        parent_union: regex_syntax::ast::ClassSetUnion,
870    ) -> Result<regex_syntax::ast::ClassSetUnion> {
871        assert_eq!(self.char(), '[');
872
873        let (nested_set, nested_union) = self.parse_set_class_open()?;
874        self.parser()
875            .stack_class
876            .borrow_mut()
877            .push(ClassState::Open {
878                union: parent_union,
879                set: nested_set,
880            });
881        Ok(nested_union)
882    }
883
884    /// Parse the end of a character class set and pop the character class
885    /// parser stack. The union given corresponds to the last union built
886    /// before seeing the closing `]`. The union returned corresponds to the
887    /// parent character class set with the nested class added to it.
888    ///
889    /// This assumes that the parser is positioned at a `]` and will advance
890    /// the parser to the byte immediately following the `]`.
891    ///
892    /// If the stack is empty after popping, then this returns the final
893    /// "top-level" character class AST (where a "top-level" character class
894    /// is one that is not nested inside any other character class).
895    ///
896    /// If there is no corresponding opening bracket on the parser's stack,
897    /// then an error is returned.
898    #[inline(never)]
899    fn pop_class(
900        &self,
901        nested_union: regex_syntax::ast::ClassSetUnion,
902    ) -> Result<Either<regex_syntax::ast::ClassSetUnion, regex_syntax::ast::ClassBracketed>> {
903        assert_eq!(self.char(), ']');
904
905        let item = regex_syntax::ast::ClassSet::Item(nested_union.into_item());
906        let prevset = self.pop_class_op(item);
907        let mut stack = self.parser().stack_class.borrow_mut();
908        match stack.pop() {
909            None => {
910                // We can never observe an empty stack:
911                //
912                // 1) We are guaranteed to start with a non-empty stack since
913                //    the character class parser is only initiated when it sees
914                //    a `[`.
915                // 2) If we ever observe an empty stack while popping after
916                //    seeing a `]`, then we signal the character class parser
917                //    to terminate.
918                panic!("unexpected empty character class stack")
919            }
920            Some(ClassState::Op { .. }) => {
921                // This panic is unfortunate, but this case is impossible
922                // since we already popped the Op state if one exists above.
923                // Namely, every push to the class parser stack is guarded by
924                // whether an existing Op is already on the top of the stack.
925                // If it is, the existing Op is modified. That is, the stack
926                // can never have consecutive Op states.
927                panic!("unexpected ClassState::Op")
928            }
929            Some(ClassState::Open { mut union, mut set }) => {
930                self.bump();
931                set.span.end = self.pos();
932                set.kind = prevset;
933                if stack.is_empty() {
934                    Ok(Either::Right(set))
935                } else {
936                    union.push(regex_syntax::ast::ClassSetItem::Bracketed(Box::new(set)));
937                    Ok(Either::Left(union))
938                }
939            }
940        }
941    }
942
943    /// Return an "unclosed class" error whose span points to the most
944    /// recently opened class.
945    ///
946    /// This should only be called while parsing a character class.
947    #[inline(never)]
948    fn unclosed_class_error(&self) -> ResharpError {
949        for state in self.parser().stack_class.borrow().iter().rev() {
950            if let ClassState::Open { ref set, .. } = *state {
951                return self.error(set.span, ast::ErrorKind::ClassUnclosed);
952            }
953        }
954        // We are guaranteed to have a non-empty stack with at least
955        // one open bracket, so we should never get here.
956        panic!("no open character class found")
957    }
958
959    /// Push the current set of class items on to the class parser's stack as
960    /// the left hand side of the given operator.
961    ///
962    /// A fresh set union is returned, which should be used to build the right
963    /// hand side of this operator.
964    #[inline(never)]
965    fn push_class_op(
966        &self,
967        next_kind: regex_syntax::ast::ClassSetBinaryOpKind,
968        next_union: regex_syntax::ast::ClassSetUnion,
969    ) -> regex_syntax::ast::ClassSetUnion {
970        let item = regex_syntax::ast::ClassSet::Item(next_union.into_item());
971        let new_lhs = self.pop_class_op(item);
972        self.parser().stack_class.borrow_mut().push(ClassState::Op {
973            kind: next_kind,
974            lhs: new_lhs,
975        });
976        regex_syntax::ast::ClassSetUnion {
977            span: self.span(),
978            items: vec![],
979        }
980    }
981
982    /// Pop a character class set from the character class parser stack. If the
983    /// top of the stack is just an item (not an operation), then return the
984    /// given set unchanged. If the top of the stack is an operation, then the
985    /// given set will be used as the rhs of the operation on the top of the
986    /// stack. In that case, the binary operation is returned as a set.
987    #[inline(never)]
988    fn pop_class_op(&self, rhs: regex_syntax::ast::ClassSet) -> regex_syntax::ast::ClassSet {
989        let mut stack = self.parser().stack_class.borrow_mut();
990        let (kind, lhs) = match stack.pop() {
991            Some(ClassState::Op { kind, lhs }) => (kind, lhs),
992            Some(state @ ClassState::Open { .. }) => {
993                stack.push(state);
994                return rhs;
995            }
996            None => unreachable!(),
997        };
998        let span = Span::new(lhs.span().start, rhs.span().end);
999        regex_syntax::ast::ClassSet::BinaryOp(regex_syntax::ast::ClassSetBinaryOp {
1000            span,
1001            kind,
1002            lhs: Box::new(lhs),
1003            rhs: Box::new(rhs),
1004        })
1005    }
1006
1007    fn hir_to_node_id(&self, hir: &hir::Hir, tb: &mut TB<'s>) -> Result<NodeId> {
1008        match hir.kind() {
1009            hir::HirKind::Empty => Ok(NodeId::EPS),
1010            hir::HirKind::Literal(l) => {
1011                if l.0.len() == 1 {
1012                    let node = tb.mk_u8(l.0[0]);
1013                    Ok(node)
1014                } else {
1015                    let ws: Vec<_> = l.0.iter().map(|l| tb.mk_u8(*l)).collect();
1016                    let conc = tb.mk_concats(ws.iter().copied());
1017                    Ok(conc)
1018                }
1019            }
1020            hir::HirKind::Class(class) => {
1021                match class {
1022                    hir::Class::Unicode(class_unicode) => {
1023                        let ranges = class_unicode.ranges();
1024                        let mut nodes = Vec::new();
1025                        for range in ranges {
1026                            for seq in Utf8Sequences::new(range.start(), range.end()) {
1027                                let sl = seq.as_slice();
1028                                let bytes: Vec<_> = sl.iter().map(|s| (s.start, s.end)).collect();
1029                                let node = match bytes.len() {
1030                                    1 => tb.mk_range_u8(bytes[0].0, bytes[0].1),
1031                                    n => {
1032                                        let last = tb.mk_range_u8(bytes[n - 1].0, bytes[n - 1].1);
1033                                        let mut conc = last;
1034                                        for i in (0..n - 1).rev() {
1035                                            let b = tb.mk_range_u8(bytes[i].0, bytes[i].1);
1036                                            conc = tb.mk_concat(b, conc);
1037                                        }
1038                                        conc
1039                                    }
1040                                };
1041                                nodes.push(node);
1042                            }
1043                        }
1044                        let merged = tb.mk_unions(nodes.into_iter());
1045                        Ok(merged)
1046                    }
1047                    hir::Class::Bytes(class_bytes) => {
1048                        let ranges = class_bytes.ranges();
1049                        let mut result = NodeId::BOT;
1050                        for range in ranges {
1051                            let start = range.start();
1052                            let end = range.end();
1053                            let node = tb.mk_range_u8(start, end);
1054                            result = tb.mk_union(result, node);
1055                        }
1056                        Ok(result)
1057                    }
1058                }
1059            }
1060            hir::HirKind::Look(_) => todo!(),
1061            hir::HirKind::Repetition(_) => todo!(),
1062            hir::HirKind::Capture(_) => todo!(),
1063            hir::HirKind::Concat(body) => {
1064                let mut result = NodeId::EPS;
1065                for child in body {
1066                    let node = self.hir_to_node_id(child, tb)?;
1067                    result = tb.mk_concat(result, node);
1068                }
1069                Ok(result)
1070            }
1071            hir::HirKind::Alternation(_) => todo!(),
1072        }
1073    }
1074
1075    fn translate_ast_to_hir(
1076        &mut self,
1077        orig_ast: &regex_syntax::ast::Ast,
1078        tb: &mut TB<'s>,
1079    ) -> Result<NodeId> {
1080        match self.translator.translate("", orig_ast) {
1081            Err(_) => Err(self.error(self.span(), ast::ErrorKind::UnicodeClassInvalid)),
1082            Ok(hir) => self.hir_to_node_id(&hir, tb),
1083        }
1084    }
1085
1086    fn translator_to_node_id(
1087        &mut self,
1088        orig_ast: &regex_syntax::ast::Ast,
1089        translator: &mut Option<Translator>,
1090        tb: &mut TB<'s>,
1091    ) -> Result<NodeId> {
1092        match translator {
1093            Some(tr) => {
1094                let hir = tr
1095                    .translate("", orig_ast)
1096                    .map_err(|e| self.unsupported_error(e))?;
1097                self.hir_to_node_id(&hir, tb)
1098            }
1099            None => self.translate_ast_to_hir(orig_ast, tb),
1100        }
1101    }
1102
1103    fn get_class(
1104        &mut self,
1105        negated: bool,
1106        kind: regex_syntax::ast::ClassPerlKind,
1107        tb: &mut TB<'s>,
1108    ) -> Result<NodeId> {
1109        let w = self
1110            .perl_classes
1111            .iter()
1112            .find(|(c_neg, c_kind, _)| *c_kind == kind && *c_neg == negated);
1113        match w {
1114            Some((_, _, value)) => Ok(*value),
1115            None => {
1116                let translated = if self.global_unicode {
1117                    match kind {
1118                        regex_syntax::ast::ClassPerlKind::Word => {
1119                            self.unicode_classes.ensure_word(tb);
1120                            if negated { self.unicode_classes.non_word } else { self.unicode_classes.word }
1121                        }
1122                        regex_syntax::ast::ClassPerlKind::Digit => {
1123                            self.unicode_classes.ensure_digit(tb);
1124                            if negated { self.unicode_classes.non_digit } else { self.unicode_classes.digit }
1125                        }
1126                        regex_syntax::ast::ClassPerlKind::Space => {
1127                            self.unicode_classes.ensure_space(tb);
1128                            if negated { self.unicode_classes.non_space } else { self.unicode_classes.space }
1129                        }
1130                    }
1131                } else {
1132                    let pos = match kind {
1133                        regex_syntax::ast::ClassPerlKind::Word => {
1134                            let az = tb.mk_range_u8(b'a', b'z');
1135                            let big = tb.mk_range_u8(b'A', b'Z');
1136                            let dig = tb.mk_range_u8(b'0', b'9');
1137                            let us = tb.mk_u8(b'_');
1138                            tb.mk_unions([az, big, dig, us].into_iter())
1139                        }
1140                        regex_syntax::ast::ClassPerlKind::Digit => {
1141                            tb.mk_range_u8(b'0', b'9')
1142                        }
1143                        regex_syntax::ast::ClassPerlKind::Space => {
1144                            let sp = tb.mk_u8(b' ');
1145                            let tab = tb.mk_u8(b'\t');
1146                            let nl = tb.mk_u8(b'\n');
1147                            let cr = tb.mk_u8(b'\r');
1148                            let ff = tb.mk_u8(0x0C);
1149                            let vt = tb.mk_u8(0x0B);
1150                            tb.mk_unions([sp, tab, nl, cr, ff, vt].into_iter())
1151                        }
1152                    };
1153                    if negated { tb.mk_compl(pos) } else { pos }
1154                };
1155                self.perl_classes.push((negated, kind, translated));
1156                Ok(translated)
1157            }
1158        }
1159    }
1160
1161    fn word_char_kind(ast: &Ast, left: bool) -> WordCharKind {
1162        use WordCharKind::*;
1163        match ast {
1164            Ast::Literal(lit) => {
1165                if is_word_byte(lit.c as u8) {
1166                    Word
1167                } else {
1168                    NonWord
1169                }
1170            }
1171            Ast::ClassPerl(c) => match (&c.kind, c.negated) {
1172                (&regex_syntax::ast::ClassPerlKind::Word, false) => Word,
1173                (&regex_syntax::ast::ClassPerlKind::Word, true) => NonWord,
1174                (&regex_syntax::ast::ClassPerlKind::Space, false) => NonWord,
1175                (&regex_syntax::ast::ClassPerlKind::Space, true) => Unknown,
1176                (&regex_syntax::ast::ClassPerlKind::Digit, false) => Word,
1177                (&regex_syntax::ast::ClassPerlKind::Digit, true) => Unknown,
1178            },
1179            Ast::Dot(_) | Ast::Top(_) => Unknown,
1180            Ast::Group(g) => Self::word_char_kind(&g.ast, left),
1181            Ast::Concat(c) if !c.asts.is_empty() => {
1182                Self::word_char_kind(&c.asts[if left { c.asts.len() - 1 } else { 0 }], left)
1183            }
1184            Ast::Repetition(r) => {
1185                let inner = Self::word_char_kind(&r.ast, left);
1186                let nullable = matches!(
1187                    &r.op.kind,
1188                    ast::RepetitionKind::ZeroOrMore
1189                        | ast::RepetitionKind::ZeroOrOne
1190                        | ast::RepetitionKind::Range(ast::RepetitionRange::Bounded(0, _))
1191                );
1192                if nullable {
1193                    match inner {
1194                        Word => MaybeWord,
1195                        NonWord => MaybeNonWord,
1196                        _ => Unknown,
1197                    }
1198                } else {
1199                    inner
1200                }
1201            }
1202            Ast::Lookaround(la) => Self::word_char_kind(&la.ast, left),
1203            _ => Unknown,
1204        }
1205    }
1206
1207    fn edge_class_ast(ast: &Ast, left: bool) -> Option<&Ast> {
1208        match ast {
1209            Ast::Literal(_) | Ast::ClassPerl(_) | Ast::ClassBracketed(_)
1210            | Ast::ClassUnicode(_) | Ast::Dot(_) | Ast::Top(_) => Some(ast),
1211            Ast::Group(g) => Self::edge_class_ast(&g.ast, left),
1212            Ast::Concat(c) if !c.asts.is_empty() => {
1213                Self::edge_class_ast(&c.asts[if left { c.asts.len() - 1 } else { 0 }], left)
1214            }
1215            Ast::Repetition(r) => Self::edge_class_ast(&r.ast, left),
1216            Ast::Lookaround(la) => Self::edge_class_ast(&la.ast, left),
1217            _ => None,
1218        }
1219    }
1220
1221    fn resolve_word_kind(
1222        &mut self,
1223        asts: &[Ast],
1224        idx: usize,
1225        dir: isize,
1226        translator: &mut Option<Translator>,
1227        tb: &mut TB<'s>,
1228        word_id: NodeId,
1229        not_word_id: NodeId,
1230    ) -> Result<WordCharKind> {
1231        use WordCharKind::*;
1232        let fast = Self::concat_neighbor_kind(asts, idx, dir);
1233        if fast != Unknown {
1234            return Ok(fast);
1235        }
1236        let neighbor_idx = (idx as isize + dir) as usize;
1237        let edge = match Self::edge_class_ast(&asts[neighbor_idx], dir < 0) {
1238            Some(e) => e,
1239            None => return Ok(Unknown),
1240        };
1241        let node = self.ast_to_node_id(edge, translator, tb)?;
1242        if tb.subsumes(word_id, node) == Some(true) {
1243            Ok(Word)
1244        } else if tb.subsumes(not_word_id, node) == Some(true) {
1245            Ok(NonWord)
1246        } else {
1247            Ok(Unknown)
1248        }
1249    }
1250
1251    fn concat_neighbor_kind(asts: &[Ast], idx: usize, dir: isize) -> WordCharKind {
1252        use WordCharKind::*;
1253        let next = idx as isize + dir;
1254        if next < 0 || next >= asts.len() as isize {
1255            return Edge;
1256        }
1257        let kind = Self::word_char_kind(&asts[next as usize], dir < 0);
1258        match kind {
1259            MaybeWord => match Self::concat_neighbor_kind(asts, next as usize, dir) {
1260                Word => Word,
1261                _ => Unknown,
1262            },
1263            MaybeNonWord => match Self::concat_neighbor_kind(asts, next as usize, dir) {
1264                NonWord => NonWord,
1265                _ => Unknown,
1266            },
1267            other => other,
1268        }
1269    }
1270
1271    fn rewrite_word_boundary_in_concat(
1272        &mut self,
1273        asts: &[Ast],
1274        idx: usize,
1275        translator: &mut Option<Translator>,
1276        tb: &mut TB<'s>,
1277    ) -> Result<(NodeId, usize)> {
1278        use WordCharKind::*;
1279        let (word_id, not_word_id) = if self.global_unicode {
1280            self.unicode_classes.ensure_word(tb);
1281            (self.unicode_classes.word, self.unicode_classes.non_word)
1282        } else {
1283            let az = tb.mk_range_u8(b'a', b'z');
1284            let big = tb.mk_range_u8(b'A', b'Z');
1285            let dig = tb.mk_range_u8(b'0', b'9');
1286            let us = tb.mk_u8(b'_');
1287            let w = tb.mk_unions([az, big, dig, us].into_iter());
1288            (w, tb.mk_compl(w))
1289        };
1290        let left = self.resolve_word_kind(asts, idx, -1, translator, tb, word_id, not_word_id)?;
1291        let right =
1292            self.resolve_word_kind(asts, idx, 1, translator, tb, word_id, not_word_id)?;
1293
1294        match (left, right) {
1295            (NonWord, Word) | (Word, NonWord) => Ok((NodeId::EPS, idx + 1)),
1296            (Word, _) => {
1297                let neg = tb.mk_neg_lookahead(word_id, 0);
1298                Ok((neg, idx + 1))
1299            }
1300            (NonWord, _) => {
1301                let set = tb.mk_union(NodeId::END, word_id);
1302                let tail = tb.mk_concat(set, NodeId::TS);
1303                self.merge_boundary_with_following_lookaheads(asts, idx, tail, translator, tb)
1304            }
1305            (_, Word) => {
1306                Ok((tb.mk_neg_lookbehind(word_id), idx + 1))
1307            }
1308            (_, NonWord) => {
1309                let body = tb.mk_union(NodeId::BEGIN, word_id);
1310                Ok((tb.mk_lookbehind(body, NodeId::MISSING), idx + 1))
1311            }
1312            // TODO: (Unknown, Unknown) is possible via make_full_word_boundary but
1313            // the full expansion (lb(\w)·la(\W) | lb(\W)·la(\w)) is too expensive
1314            _ => Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex)),
1315        }
1316    }
1317
1318    fn merge_boundary_with_following_lookaheads(
1319        &mut self,
1320        asts: &[Ast],
1321        wb_idx: usize,
1322        boundary_tail: NodeId,
1323        translator: &mut Option<Translator>,
1324        tb: &mut TB<'s>,
1325    ) -> Result<(NodeId, usize)> {
1326        let mut next = wb_idx + 1;
1327        let mut la_bodies = vec![boundary_tail];
1328        while next < asts.len() {
1329            match &asts[next] {
1330                Ast::Lookaround(la) if la.kind == ast::LookaroundKind::PositiveLookahead => {
1331                    let body = self.ast_to_node_id(&la.ast, translator, tb)?;
1332                    la_bodies.push(tb.mk_concat(body, NodeId::TS));
1333                    next += 1;
1334                }
1335                _ => break,
1336            }
1337        }
1338        let merged = tb.mk_inters(la_bodies.into_iter());
1339        Ok((tb.mk_lookahead(merged, NodeId::MISSING, 0), next))
1340    }
1341
1342    fn ast_to_node_id(
1343        &mut self,
1344        ast: &Ast,
1345        translator: &mut Option<Translator>,
1346        tb: &mut TB<'s>,
1347    ) -> Result<NodeId> {
1348        match ast {
1349            Ast::Empty(_) => Ok(NodeId::EPS),
1350            Ast::Flags(f) => {
1351                let mut translator_builder = self.default_translator_builder();
1352                if let Some(state) = f.flags.flag_state(ast::Flag::CaseInsensitive) {
1353                    translator_builder.case_insensitive(state);
1354                }
1355                if let Some(state) = f.flags.flag_state(ast::Flag::Unicode) {
1356                    translator_builder.unicode(state);
1357                }
1358                if let Some(state) = f.flags.flag_state(ast::Flag::DotMatchesNewLine) {
1359                    self.dot_all.set(state);
1360                }
1361                let concat_translator = Some(translator_builder.build());
1362                *translator = concat_translator;
1363                Ok(NodeId::EPS)
1364            }
1365            Ast::Literal(l) => {
1366                let ast_lit = regex_syntax::ast::Ast::literal(*l.to_owned());
1367                self.translator_to_node_id(&ast_lit, translator, tb)
1368            }
1369            Ast::Top(_) => Ok(NodeId::TOP),
1370            Ast::Dot(_) => {
1371                if self.dot_all.get() {
1372                    Ok(NodeId::TOP)
1373                } else {
1374                    let hirv = hir::Hir::dot(hir::Dot::AnyByteExceptLF);
1375                    self.hir_to_node_id(&hirv, tb)
1376                }
1377            }
1378            Ast::Assertion(a) => match &a.kind {
1379                ast::AssertionKind::StartText => Ok(NodeId::BEGIN),
1380                ast::AssertionKind::EndText => Ok(NodeId::END),
1381                ast::AssertionKind::WordBoundary => {
1382                    Err(
1383                        self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex),
1384                    )
1385                }
1386                ast::AssertionKind::NotWordBoundary => {
1387                    Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))
1388                }
1389                ast::AssertionKind::StartLine => {
1390                    let left = NodeId::BEGIN;
1391                    let right = tb.mk_u8(b'\n');
1392                    let union = tb.mk_union(left, right);
1393                    Ok(tb.mk_lookbehind(union, NodeId::MISSING))
1394                }
1395                ast::AssertionKind::EndLine => {
1396                    let left = NodeId::END;
1397                    let right = tb.mk_u8(b'\n');
1398                    let union = tb.mk_union(left, right);
1399                    Ok(tb.mk_lookahead(union, NodeId::MISSING, 0))
1400                }
1401                ast::AssertionKind::WordBoundaryStart => todo!(),
1402                ast::AssertionKind::WordBoundaryEnd => todo!(),
1403                ast::AssertionKind::WordBoundaryStartAngle => todo!(),
1404                ast::AssertionKind::WordBoundaryEndAngle => Ok(tb.mk_string(">")),
1405                ast::AssertionKind::WordBoundaryStartHalf => todo!(),
1406                ast::AssertionKind::WordBoundaryEndHalf => todo!(),
1407            },
1408            Ast::ClassUnicode(c) => {
1409                let tmp = regex_syntax::ast::ClassUnicode {
1410                    span: c.span,
1411                    negated: c.negated,
1412                    kind: c.kind.clone(),
1413                };
1414                if !c.negated {
1415                    if let regex_syntax::ast::ClassUnicodeKind::Named(s) = &c.kind { match s.as_str() {
1416                        // \p{ascii} for ascii, \p{ascii}&\p{Letter} => [A-Za-z]
1417                        "ascii" => return Ok(tb.mk_range_u8(0, 127)),
1418                        // restricts matches to valid utf8, \p{utf8}*&~(a) => non a, but valid utf8
1419                        "utf8" => {
1420                            let ascii = tb.mk_range_u8(0, 127);
1421                            let beta = tb.mk_range_u8(128, 0xBF);
1422                            let c0 = tb.mk_range_u8(0xC0, 0xDF);
1423                            let c0s = tb.mk_concats([c0, beta].into_iter());
1424                            let e0 = tb.mk_range_u8(0xE0, 0xEF);
1425                            let e0s = tb.mk_concats([e0, beta, beta].into_iter());
1426                            let f0 = tb.mk_range_u8(0xF0, 0xF7);
1427                            let f0s = tb.mk_concats([f0, beta, beta, beta].into_iter());
1428                            let merged = tb.mk_unions([ascii, c0s, e0s, f0s].into_iter());
1429                            return Ok(tb.mk_star(merged));
1430                        }
1431                        "hex" => {
1432                            let nums = tb.mk_range_u8(b'0', b'9');
1433                            let lets = tb.mk_range_u8(b'a', b'f');
1434                            let lets2 = tb.mk_range_u8(b'A', b'F');
1435                            let merged = tb.mk_unions([nums, lets, lets2].into_iter());
1436                            return Ok(merged);
1437                        }
1438                        _ => {}
1439                    } };
1440                }
1441
1442                let orig_ast = regex_syntax::ast::Ast::class_unicode(tmp);
1443                self.translator_to_node_id(&orig_ast, translator, tb)
1444            }
1445            Ast::ClassPerl(c) => {
1446                self.get_class(c.negated, c.kind.clone(), tb)
1447            }
1448            Ast::ClassBracketed(c) => match &c.kind {
1449                regex_syntax::ast::ClassSet::Item(_) => {
1450                    let tmp = regex_syntax::ast::ClassBracketed {
1451                        span: c.span,
1452                        negated: c.negated,
1453                        kind: c.kind.clone(),
1454                    };
1455                    let orig_ast = regex_syntax::ast::Ast::class_bracketed(tmp);
1456                    self.translator_to_node_id(&orig_ast, translator, tb)
1457                }
1458                regex_syntax::ast::ClassSet::BinaryOp(_) => todo!(),
1459            },
1460            Ast::Repetition(r) => {
1461                let body = self.ast_to_node_id(&r.ast, translator, tb);
1462                match body {
1463                    Ok(body) => match &r.op.kind {
1464                        ast::RepetitionKind::ZeroOrOne => Ok(tb.mk_opt(body)),
1465                        ast::RepetitionKind::ZeroOrMore => Ok(tb.mk_star(body)),
1466                        ast::RepetitionKind::OneOrMore => Ok(tb.mk_plus(body)),
1467                        ast::RepetitionKind::Range(r) => match r {
1468                            ast::RepetitionRange::Exactly(n) => Ok(tb.mk_repeat(body, *n, *n)),
1469                            ast::RepetitionRange::AtLeast(n) => {
1470                                let rep = tb.mk_repeat(body, *n, *n);
1471                                let st = tb.mk_star(body);
1472                                Ok(tb.mk_concat(rep, st))
1473                            }
1474
1475                            ast::RepetitionRange::Bounded(n, m) => Ok(tb.mk_repeat(body, *n, *m)),
1476                        },
1477                    },
1478                    Err(_) => body,
1479                }
1480            }
1481            Ast::Lookaround(g) => {
1482                let body = self.ast_to_node_id(&g.ast, translator, tb)?;
1483                match g.kind {
1484                    ast::LookaroundKind::PositiveLookahead => {
1485                        Ok(tb.mk_lookahead(body, NodeId::MISSING, 0))
1486                    }
1487                    ast::LookaroundKind::PositiveLookbehind => {
1488                        Ok(tb.mk_lookbehind(body, NodeId::MISSING))
1489                    }
1490                    ast::LookaroundKind::NegativeLookahead => Ok(tb.mk_neg_lookahead(body, 0)),
1491                    ast::LookaroundKind::NegativeLookbehind => Ok(tb.mk_neg_lookbehind(body)),
1492                }
1493            }
1494            Ast::Group(g) => {
1495                if let ast::GroupKind::NonCapturing(ref flags) = g.kind {
1496                    if !flags.items.is_empty() {
1497                        let mut translator_builder = self.default_translator_builder();
1498                        if let Some(state) = flags.flag_state(ast::Flag::CaseInsensitive) {
1499                            translator_builder.case_insensitive(state);
1500                        }
1501                        if let Some(state) = flags.flag_state(ast::Flag::Unicode) {
1502                            translator_builder.unicode(state);
1503                        }
1504                        let saved_dot_all = self.dot_all.get();
1505                        if let Some(state) = flags.flag_state(ast::Flag::DotMatchesNewLine) {
1506                            self.dot_all.set(state);
1507                        }
1508                        let mut scoped = Some(translator_builder.build());
1509                        let result = self.ast_to_node_id(&g.ast, &mut scoped, tb);
1510                        self.dot_all.set(saved_dot_all);
1511                        return result;
1512                    }
1513                }
1514                self.ast_to_node_id(&g.ast, translator, tb)
1515            }
1516            Ast::Alternation(a) => {
1517                let mut children = vec![];
1518                for ast in &a.asts {
1519                    match self.ast_to_node_id(ast, translator, tb) {
1520                        Ok(node_id) => children.push(node_id),
1521                        Err(err) => return Err(err),
1522                    }
1523                }
1524                Ok(tb.mk_unions(children.iter().copied()))
1525            }
1526            Ast::Concat(c) => {
1527                let mut concat_translator: Option<Translator> = None;
1528                let mut children = vec![];
1529                let mut i = 0;
1530                while i < c.asts.len() {
1531                    let ast = &c.asts[i];
1532                    match ast {
1533                        Ast::Flags(f) => {
1534                            let mut translator_builder = self.default_translator_builder();
1535                            if let Some(state) = f.flags.flag_state(ast::Flag::CaseInsensitive) {
1536                                translator_builder.case_insensitive(state);
1537                            }
1538                            if let Some(state) = f.flags.flag_state(ast::Flag::Unicode) {
1539                                translator_builder.unicode(state);
1540                            }
1541                            if let Some(state) = f.flags.flag_state(ast::Flag::DotMatchesNewLine) {
1542                                self.dot_all.set(state);
1543                            }
1544                            concat_translator = Some(translator_builder.build());
1545                            i += 1;
1546                            continue;
1547                        }
1548                        Ast::Assertion(a) if a.kind == ast::AssertionKind::WordBoundary => {
1549                            let node =
1550                                self.rewrite_word_boundary_in_concat(&c.asts, i, translator, tb)?;
1551                            children.push(node.0);
1552                            i = node.1; // skip consumed lookaheads
1553                            continue;
1554                        }
1555                        _ => {}
1556                    }
1557                    match concat_translator {
1558                        Some(_) => match self.ast_to_node_id(ast, &mut concat_translator, tb) {
1559                            Ok(node_id) => children.push(node_id),
1560                            Err(err) => return Err(err),
1561                        },
1562                        None => match self.ast_to_node_id(ast, translator, tb) {
1563                            Ok(node_id) => children.push(node_id),
1564                            Err(err) => return Err(err),
1565                        },
1566                    }
1567                    i += 1;
1568                }
1569                Ok(tb.mk_concats(children.iter().cloned()))
1570            }
1571            Ast::Intersection(intersection) => {
1572                let mut children = vec![];
1573                for ast in &intersection.asts {
1574                    match self.ast_to_node_id(ast, translator, tb) {
1575                        Ok(node_id) => children.push(node_id),
1576                        Err(err) => return Err(err),
1577                    }
1578                }
1579                Ok(tb.mk_inters(children.into_iter()))
1580            }
1581            Ast::Complement(complement) => {
1582                let body = self.ast_to_node_id(&complement.ast, translator, tb);
1583                body.map(|x| tb.mk_compl(x))
1584            }
1585        }
1586    }
1587
1588    /// Parse the regular expression and return an abstract syntax tree with
1589    /// all of the comments found in the pattern.
1590    fn parse(&mut self, tb: &mut TB<'s>) -> Result<NodeId> {
1591        let mut concat = Concat {
1592            span: self.span(),
1593            asts: vec![],
1594        };
1595        loop {
1596            self.bump_space();
1597            if self.is_eof() {
1598                break;
1599            }
1600            match self.char() {
1601                '(' => concat = self.push_group(concat)?,
1602                ')' => concat = self.pop_group(concat)?,
1603                '|' => concat = self.push_alternate(concat)?,
1604                '&' => concat = self.push_intersect(concat)?,
1605                '~' => concat = self.push_compl_group(concat)?,
1606                '[' => {
1607                    let class = self.parse_set_class()?;
1608                    concat.asts.push(Ast::class_bracketed(class));
1609                }
1610                '?' => {
1611                    concat =
1612                        self.parse_uncounted_repetition(concat, ast::RepetitionKind::ZeroOrOne)?;
1613                }
1614                '*' => {
1615                    concat =
1616                        self.parse_uncounted_repetition(concat, ast::RepetitionKind::ZeroOrMore)?;
1617                }
1618                '+' => {
1619                    concat =
1620                        self.parse_uncounted_repetition(concat, ast::RepetitionKind::OneOrMore)?;
1621                }
1622                '{' => {
1623                    concat = self.parse_counted_repetition(concat)?;
1624                }
1625                _ => concat.asts.push(self.parse_primitive()?.into_ast()),
1626            }
1627        }
1628        let ast = self.pop_group_end(concat)?;
1629        self.ast_to_node_id(&ast, &mut None, tb)
1630    }
1631
1632    #[inline(never)]
1633    fn parse_uncounted_repetition(
1634        &self,
1635        mut concat: ast::Concat,
1636        kind: ast::RepetitionKind,
1637    ) -> Result<ast::Concat> {
1638        // assert!(self.char() == '?' || self.char() == '*' || self.char() == '+');
1639        let op_start = self.pos();
1640        let ast = match concat.asts.pop() {
1641            Some(ast) => ast,
1642            None => return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing)),
1643        };
1644        match ast {
1645            Ast::Empty(_) | Ast::Flags(_) => {
1646                return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing))
1647            }
1648            _ => {}
1649        }
1650        if self.bump() && self.char() == '?' {
1651            return Err(self.error(
1652                Span::new(op_start, self.pos()),
1653                ast::ErrorKind::UnsupportedLazyQuantifier,
1654            ));
1655        }
1656        concat.asts.push(Ast::repetition(ast::Repetition {
1657            span: ast.span().with_end(self.pos()),
1658            op: ast::RepetitionOp {
1659                span: Span::new(op_start, self.pos()),
1660                kind,
1661            },
1662            greedy: true,
1663            ast: Box::new(ast),
1664        }));
1665        Ok(concat)
1666    }
1667
1668    #[inline(never)]
1669    fn parse_counted_repetition(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
1670        assert!(self.char() == '{');
1671        let start = self.pos();
1672        let ast = match concat.asts.pop() {
1673            Some(ast) => ast,
1674            None => return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing)),
1675        };
1676        match ast {
1677            Ast::Empty(_) | Ast::Flags(_) => {
1678                return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing))
1679            }
1680            _ => {}
1681        }
1682        if !self.bump_and_bump_space() {
1683            return Err(self.error(
1684                Span::new(start, self.pos()),
1685                ast::ErrorKind::RepetitionCountUnclosed,
1686            ));
1687        }
1688        let count_start = specialize_err(
1689            self.parse_decimal(),
1690            ast::ErrorKind::DecimalEmpty,
1691            ast::ErrorKind::RepetitionCountDecimalEmpty,
1692        );
1693        if self.is_eof() {
1694            return Err(self.error(
1695                Span::new(start, self.pos()),
1696                ast::ErrorKind::RepetitionCountUnclosed,
1697            ));
1698        }
1699        let range = if self.char() == ',' {
1700            if !self.bump_and_bump_space() {
1701                return Err(self.error(
1702                    Span::new(start, self.pos()),
1703                    ast::ErrorKind::RepetitionCountUnclosed,
1704                ));
1705            }
1706            if self.char() != '}' {
1707                let count_start = match count_start {
1708                    Ok(c) => c,
1709                    Err(err) if err.kind == ast::ErrorKind::RepetitionCountDecimalEmpty => {
1710                        if self.parser().empty_min_range {
1711                            0
1712                        } else {
1713                            return Err(err);
1714                        }
1715                    }
1716                    err => err?,
1717                };
1718                let count_end = specialize_err(
1719                    self.parse_decimal(),
1720                    ast::ErrorKind::DecimalEmpty,
1721                    ast::ErrorKind::RepetitionCountDecimalEmpty,
1722                )?;
1723                ast::RepetitionRange::Bounded(count_start, count_end)
1724            } else {
1725                ast::RepetitionRange::AtLeast(count_start?)
1726            }
1727        } else {
1728            ast::RepetitionRange::Exactly(count_start?)
1729        };
1730
1731        if self.is_eof() || self.char() != '}' {
1732            return Err(self.error(
1733                Span::new(start, self.pos()),
1734                ast::ErrorKind::RepetitionCountUnclosed,
1735            ));
1736        }
1737
1738        if self.bump_and_bump_space() && self.char() == '?' {
1739            return Err(self.error(
1740                Span::new(start, self.pos()),
1741                ast::ErrorKind::UnsupportedLazyQuantifier,
1742            ));
1743        }
1744
1745        let op_span = Span::new(start, self.pos());
1746        if !range.is_valid() {
1747            return Err(self.error(op_span, ast::ErrorKind::RepetitionCountInvalid));
1748        }
1749        concat.asts.push(Ast::repetition(ast::Repetition {
1750            span: ast.span().with_end(self.pos()),
1751            op: ast::RepetitionOp {
1752                span: op_span,
1753                kind: ast::RepetitionKind::Range(range),
1754            },
1755            greedy: true,
1756            ast: Box::new(ast),
1757        }));
1758        Ok(concat)
1759    }
1760
1761    #[inline(never)]
1762    fn parse_group(&self) -> Result<Either<ast::SetFlags, ast::Group>> {
1763        assert_eq!(self.char(), '(');
1764        let open_span = self.span_char();
1765        self.bump();
1766        self.bump_space();
1767        if let Some((ahead, pos)) = self.is_lookaround_prefix() {
1768            let kind = match (pos, ahead) {
1769                (true, true) => LookaroundKind::PositiveLookahead,
1770                (true, false) => LookaroundKind::PositiveLookbehind,
1771                (false, true) => LookaroundKind::NegativeLookahead,
1772                (false, false) => LookaroundKind::NegativeLookbehind,
1773            };
1774            return Ok(Either::Right(ast::Group {
1775                span: open_span,
1776                kind: ast::GroupKind::Lookaround(kind),
1777                ast: Box::new(Ast::empty(self.span())),
1778            }));
1779        }
1780        let inner_span = self.span();
1781        let mut starts_with_p = true;
1782        if self.bump_if("?P<") || {
1783            starts_with_p = false;
1784            self.bump_if("?<")
1785        } {
1786            let capture_index = self.next_capture_index(open_span)?;
1787            let name = self.parse_capture_name(capture_index)?;
1788            Ok(Either::Right(ast::Group {
1789                span: open_span,
1790                kind: ast::GroupKind::CaptureName {
1791                    starts_with_p,
1792                    name,
1793                },
1794                ast: Box::new(Ast::empty(self.span())),
1795            }))
1796        } else if self.bump_if("?") {
1797            if self.is_eof() {
1798                return Err(self.error(open_span, ast::ErrorKind::GroupUnclosed));
1799            }
1800            let flags = self.parse_flags()?;
1801            let char_end = self.char();
1802            self.bump();
1803            if char_end == ')' {
1804                // We don't allow empty flags, e.g., `(?)`. We instead
1805                // interpret it as a repetition operator missing its argument.
1806                if flags.items.is_empty() {
1807                    return Err(self.error(inner_span, ast::ErrorKind::RepetitionMissing));
1808                }
1809                Ok(Either::Left(ast::SetFlags {
1810                    span: Span {
1811                        end: self.pos(),
1812                        ..open_span
1813                    },
1814                    flags,
1815                }))
1816            } else {
1817                assert_eq!(char_end, ':');
1818                Ok(Either::Right(ast::Group {
1819                    span: open_span,
1820                    kind: ast::GroupKind::NonCapturing(flags),
1821                    ast: Box::new(Ast::empty(self.span())),
1822                }))
1823            }
1824        } else {
1825            let capture_index = self.next_capture_index(open_span)?;
1826            Ok(Either::Right(ast::Group {
1827                span: open_span,
1828                kind: ast::GroupKind::CaptureIndex(capture_index),
1829                ast: Box::new(Ast::empty(self.span())),
1830            }))
1831        }
1832    }
1833
1834    #[inline(never)]
1835    fn parse_capture_name(&self, capture_index: u32) -> Result<ast::CaptureName> {
1836        if self.is_eof() {
1837            return Err(self.error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
1838        }
1839        let start = self.pos();
1840        loop {
1841            if self.char() == '>' {
1842                break;
1843            }
1844            if !is_capture_char(self.char(), self.pos() == start) {
1845                return Err(self.error(self.span_char(), ast::ErrorKind::GroupNameInvalid));
1846            }
1847            if !self.bump() {
1848                break;
1849            }
1850        }
1851        let end = self.pos();
1852        if self.is_eof() {
1853            return Err(self.error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
1854        }
1855        assert_eq!(self.char(), '>');
1856        self.bump();
1857        let name = &self.pattern()[start.offset..end.offset];
1858        if name.is_empty() {
1859            return Err(self.error(Span::new(start, start), ast::ErrorKind::GroupNameEmpty));
1860        }
1861        let capname = ast::CaptureName {
1862            span: Span::new(start, end),
1863            name: name.to_string(),
1864            index: capture_index,
1865        };
1866        self.add_capture_name(&capname)?;
1867        Ok(capname)
1868    }
1869
1870    #[inline(never)]
1871    fn parse_flags(&self) -> Result<ast::Flags> {
1872        let mut flags = ast::Flags {
1873            span: self.span(),
1874            items: vec![],
1875        };
1876        let mut last_was_negation = None;
1877        while self.char() != ':' && self.char() != ')' {
1878            if self.char() == '-' {
1879                last_was_negation = Some(self.span_char());
1880                let item = ast::FlagsItem {
1881                    span: self.span_char(),
1882                    kind: ast::FlagsItemKind::Negation,
1883                };
1884                if let Some(i) = flags.add_item(item) {
1885                    return Err(self.error(
1886                        self.span_char(),
1887                        ast::ErrorKind::FlagRepeatedNegation {
1888                            original: flags.items[i].span,
1889                        },
1890                    ));
1891                }
1892            } else {
1893                last_was_negation = None;
1894                let item = ast::FlagsItem {
1895                    span: self.span_char(),
1896                    kind: ast::FlagsItemKind::Flag(self.parse_flag()?),
1897                };
1898                if let Some(i) = flags.add_item(item) {
1899                    return Err(self.error(
1900                        self.span_char(),
1901                        ast::ErrorKind::FlagDuplicate {
1902                            original: flags.items[i].span,
1903                        },
1904                    ));
1905                }
1906            }
1907            if !self.bump() {
1908                return Err(self.error(self.span(), ast::ErrorKind::FlagUnexpectedEof));
1909            }
1910        }
1911        if let Some(span) = last_was_negation {
1912            return Err(self.error(span, ast::ErrorKind::FlagDanglingNegation));
1913        }
1914        flags.span.end = self.pos();
1915        Ok(flags)
1916    }
1917
1918    #[inline(never)]
1919    fn parse_flag(&self) -> Result<ast::Flag> {
1920        match self.char() {
1921            'i' => Ok(ast::Flag::CaseInsensitive),
1922            'm' => Ok(ast::Flag::MultiLine),
1923            's' => Ok(ast::Flag::DotMatchesNewLine),
1924            'U' => Ok(ast::Flag::SwapGreed),
1925            'u' => Ok(ast::Flag::Unicode),
1926            'R' => Ok(ast::Flag::CRLF),
1927            'x' => Ok(ast::Flag::IgnoreWhitespace),
1928            _ => Err(self.error(self.span_char(), ast::ErrorKind::FlagUnrecognized)),
1929        }
1930    }
1931
1932    fn parse_primitive(&self) -> Result<Primitive> {
1933        match self.char() {
1934            '\\' => self.parse_escape(),
1935            '_' => {
1936                let ast = Primitive::Top(self.span_char());
1937                self.bump();
1938                Ok(ast)
1939            }
1940            '.' => {
1941                let ast = Primitive::Dot(self.span_char());
1942                self.bump();
1943                Ok(ast)
1944            }
1945            '^' => {
1946                let ast = Primitive::Assertion(ast::Assertion {
1947                    span: self.span_char(),
1948                    kind: ast::AssertionKind::StartLine,
1949                });
1950                self.bump();
1951                Ok(ast)
1952            }
1953            '$' => {
1954                let ast = Primitive::Assertion(ast::Assertion {
1955                    span: self.span_char(),
1956                    kind: ast::AssertionKind::EndLine,
1957                });
1958                self.bump();
1959                Ok(ast)
1960            }
1961            c => {
1962                let ast = Primitive::Literal(Literal {
1963                    span: self.span_char(),
1964                    kind: LiteralKind::Verbatim,
1965                    c,
1966                });
1967                self.bump();
1968                Ok(ast)
1969            }
1970        }
1971    }
1972
1973    #[inline(never)]
1974    fn parse_escape(&self) -> Result<Primitive> {
1975        assert_eq!(self.char(), '\\');
1976        let start = self.pos();
1977        if !self.bump() {
1978            return Err(self.error(
1979                Span::new(start, self.pos()),
1980                ast::ErrorKind::EscapeUnexpectedEof,
1981            ));
1982        }
1983        let c = self.char();
1984        // Put some of the more complicated routines into helpers.
1985        match c {
1986            '0'..='9' => {
1987                if !self.parser().octal {
1988                    return Err(self.error(
1989                        Span::new(start, self.span_char().end),
1990                        ast::ErrorKind::UnsupportedBackreference,
1991                    ));
1992                }
1993                let mut lit = self.parse_octal();
1994                lit.span.start = start;
1995                return Ok(Primitive::Literal(lit));
1996            }
1997            // '8'..='9' if !self.parser().octal => {
1998            //     return Err(self.error(
1999            //         Span::new(start, self.span_char().end),
2000            //         ast::ErrorKind::UnsupportedBackreference,
2001            //     ));
2002            // }
2003            'x' | 'u' | 'U' => {
2004                let mut lit = self.parse_hex()?;
2005                lit.span.start = start;
2006                return Ok(Primitive::Literal(lit));
2007            }
2008            'p' | 'P' => {
2009                let mut cls = self.parse_unicode_class()?;
2010                cls.span.start = start;
2011                return Ok(Primitive::Unicode(cls));
2012            }
2013            'd' | 's' | 'w' | 'D' | 'S' | 'W' => {
2014                let mut cls = self.parse_perl_class();
2015                cls.span.start = start;
2016                return Ok(Primitive::Perl(cls));
2017            }
2018            _ => {}
2019        }
2020
2021        // Handle all of the one letter sequences inline.
2022        self.bump();
2023        let span = Span::new(start, self.pos());
2024        if is_meta_character(c) {
2025            return Ok(Primitive::Literal(Literal {
2026                span,
2027                kind: LiteralKind::Meta,
2028                c,
2029            }));
2030        }
2031        if is_escapeable_character(c) {
2032            return Ok(Primitive::Literal(Literal {
2033                span,
2034                kind: LiteralKind::Superfluous,
2035                c,
2036            }));
2037        }
2038        let special = |kind, c| {
2039            Ok(Primitive::Literal(Literal {
2040                span,
2041                kind: LiteralKind::Special(kind),
2042                c,
2043            }))
2044        };
2045        match c {
2046            'a' => special(SpecialLiteralKind::Bell, '\x07'),
2047            'f' => special(SpecialLiteralKind::FormFeed, '\x0C'),
2048            't' => special(SpecialLiteralKind::Tab, '\t'),
2049            'n' => special(SpecialLiteralKind::LineFeed, '\n'),
2050            'r' => special(SpecialLiteralKind::CarriageReturn, '\r'),
2051            'v' => special(SpecialLiteralKind::VerticalTab, '\x0B'),
2052            'A' => Ok(Primitive::Assertion(ast::Assertion {
2053                span,
2054                kind: ast::AssertionKind::StartText,
2055            })),
2056            'z' => Ok(Primitive::Assertion(ast::Assertion {
2057                span,
2058                kind: ast::AssertionKind::EndText,
2059            })),
2060            'b' => {
2061                let mut wb = ast::Assertion {
2062                    span,
2063                    kind: ast::AssertionKind::WordBoundary,
2064                };
2065                // After a \b, we "try" to parse things like \b{start} for
2066                // special word boundary assertions.
2067                if !self.is_eof() && self.char() == '{' {
2068                    if let Some(kind) = self.maybe_parse_special_word_boundary(start)? {
2069                        wb.kind = kind;
2070                        wb.span.end = self.pos();
2071                    }
2072                }
2073                Ok(Primitive::Assertion(wb))
2074            }
2075            'B' => Ok(Primitive::Assertion(ast::Assertion {
2076                span,
2077                kind: ast::AssertionKind::NotWordBoundary,
2078            })),
2079            '<' => Ok(Primitive::Assertion(ast::Assertion {
2080                span,
2081                kind: ast::AssertionKind::WordBoundaryStartAngle,
2082            })),
2083            '>' => Ok(Primitive::Assertion(ast::Assertion {
2084                span,
2085                kind: ast::AssertionKind::WordBoundaryEndAngle,
2086            })),
2087            _ => Err(self.error(span, ast::ErrorKind::EscapeUnrecognized)),
2088        }
2089    }
2090
2091    fn maybe_parse_special_word_boundary(
2092        &self,
2093        wb_start: Position,
2094    ) -> Result<Option<ast::AssertionKind>> {
2095        assert_eq!(self.char(), '{');
2096
2097        let is_valid_char = |c| matches!(c, 'A'..='Z' | 'a'..='z' | '-');
2098        let start = self.pos();
2099        if !self.bump_and_bump_space() {
2100            return Err(self.error(
2101                Span::new(wb_start, self.pos()),
2102                ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof,
2103            ));
2104        }
2105        let start_contents = self.pos();
2106        // This is one of the critical bits: if the first non-whitespace
2107        // character isn't in [-A-Za-z] (i.e., this can't be a special word
2108        // boundary), then we bail and let the counted repetition parser deal
2109        // with this.
2110        if !is_valid_char(self.char()) {
2111            self.parser().pos.set(start);
2112            return Ok(None);
2113        }
2114
2115        // Now collect up our chars until we see a '}'.
2116        let mut scratch = self.parser().scratch.borrow_mut();
2117        scratch.clear();
2118        while !self.is_eof() && is_valid_char(self.char()) {
2119            scratch.push(self.char());
2120            self.bump_and_bump_space();
2121        }
2122        if self.is_eof() || self.char() != '}' {
2123            return Err(self.error(
2124                Span::new(start, self.pos()),
2125                ast::ErrorKind::SpecialWordBoundaryUnclosed,
2126            ));
2127        }
2128        let end = self.pos();
2129        self.bump();
2130        let kind = match scratch.as_str() {
2131            "start" => ast::AssertionKind::WordBoundaryStart,
2132            "end" => ast::AssertionKind::WordBoundaryEnd,
2133            "start-half" => ast::AssertionKind::WordBoundaryStartHalf,
2134            "end-half" => ast::AssertionKind::WordBoundaryEndHalf,
2135            _ => {
2136                return Err(self.error(
2137                    Span::new(start_contents, end),
2138                    ast::ErrorKind::SpecialWordBoundaryUnrecognized,
2139                ))
2140            }
2141        };
2142        Ok(Some(kind))
2143    }
2144
2145    #[inline(never)]
2146    fn parse_octal(&self) -> Literal {
2147        assert!(self.parser().octal);
2148        assert!('0' <= self.char() && self.char() <= '7');
2149        let start = self.pos();
2150        // Parse up to two more digits.
2151        while self.bump()
2152            && '0' <= self.char()
2153            && self.char() <= '7'
2154            && self.pos().offset - start.offset <= 2
2155        {}
2156        let end = self.pos();
2157        let octal = &self.pattern()[start.offset..end.offset];
2158        // Parsing the octal should never fail since the above guarantees a
2159        // valid number.
2160        let codepoint = u32::from_str_radix(octal, 8).expect("valid octal number");
2161        // The max value for 3 digit octal is 0777 = 511 and [0, 511] has no
2162        // invalid Unicode scalar values.
2163        let c = char::from_u32(codepoint).expect("Unicode scalar value");
2164        Literal {
2165            span: Span::new(start, end),
2166            kind: LiteralKind::Octal,
2167            c,
2168        }
2169    }
2170
2171    #[inline(never)]
2172    fn parse_hex(&self) -> Result<Literal> {
2173        assert!(self.char() == 'x' || self.char() == 'u' || self.char() == 'U');
2174
2175        let hex_kind = match self.char() {
2176            'x' => HexLiteralKind::X,
2177            'u' => HexLiteralKind::UnicodeShort,
2178            _ => HexLiteralKind::UnicodeLong,
2179        };
2180        if !self.bump_and_bump_space() {
2181            return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2182        }
2183        if self.char() == '{' {
2184            self.parse_hex_brace(hex_kind)
2185        } else {
2186            self.parse_hex_digits(hex_kind)
2187        }
2188    }
2189
2190    #[inline(never)]
2191    fn parse_hex_digits(&self, kind: HexLiteralKind) -> Result<Literal> {
2192        let mut scratch = self.parser().scratch.borrow_mut();
2193        scratch.clear();
2194
2195        let start = self.pos();
2196        for i in 0..kind.digits() {
2197            if i > 0 && !self.bump_and_bump_space() {
2198                return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2199            }
2200            if !is_hex(self.char()) {
2201                return Err(self.error(self.span_char(), ast::ErrorKind::EscapeHexInvalidDigit));
2202            }
2203            scratch.push(self.char());
2204        }
2205        // The final bump just moves the parser past the literal, which may
2206        // be EOF.
2207        self.bump_and_bump_space();
2208        let end = self.pos();
2209        let hex = scratch.as_str();
2210        match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
2211            None => Err(self.error(Span::new(start, end), ast::ErrorKind::EscapeHexInvalid)),
2212            Some(c) => Ok(Literal {
2213                span: Span::new(start, end),
2214                kind: LiteralKind::HexFixed(kind),
2215                c,
2216            }),
2217        }
2218    }
2219
2220    #[inline(never)]
2221    fn parse_hex_brace(&self, kind: HexLiteralKind) -> Result<Literal> {
2222        let mut scratch = self.parser().scratch.borrow_mut();
2223        scratch.clear();
2224
2225        let brace_pos = self.pos();
2226        let start = self.span_char().end;
2227        while self.bump_and_bump_space() && self.char() != '}' {
2228            if !is_hex(self.char()) {
2229                return Err(self.error(self.span_char(), ast::ErrorKind::EscapeHexInvalidDigit));
2230            }
2231            scratch.push(self.char());
2232        }
2233        if self.is_eof() {
2234            return Err(self.error(
2235                Span::new(brace_pos, self.pos()),
2236                ast::ErrorKind::EscapeUnexpectedEof,
2237            ));
2238        }
2239        let end = self.pos();
2240        let hex = scratch.as_str();
2241        assert_eq!(self.char(), '}');
2242        self.bump_and_bump_space();
2243
2244        if hex.is_empty() {
2245            return Err(self.error(
2246                Span::new(brace_pos, self.pos()),
2247                ast::ErrorKind::EscapeHexEmpty,
2248            ));
2249        }
2250        match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
2251            None => Err(self.error(Span::new(start, end), ast::ErrorKind::EscapeHexInvalid)),
2252            Some(c) => Ok(Literal {
2253                span: Span::new(start, self.pos()),
2254                kind: LiteralKind::HexBrace(kind),
2255                c,
2256            }),
2257        }
2258    }
2259
2260    fn parse_decimal(&self) -> Result<u32> {
2261        let mut scratch = self.parser().scratch.borrow_mut();
2262        scratch.clear();
2263
2264        while !self.is_eof() && self.char().is_whitespace() {
2265            self.bump();
2266        }
2267        let start = self.pos();
2268        while !self.is_eof() && '0' <= self.char() && self.char() <= '9' {
2269            scratch.push(self.char());
2270            self.bump_and_bump_space();
2271        }
2272        let span = Span::new(start, self.pos());
2273        while !self.is_eof() && self.char().is_whitespace() {
2274            self.bump_and_bump_space();
2275        }
2276        let digits = scratch.as_str();
2277        if digits.is_empty() {
2278            return Err(self.error(span, ast::ErrorKind::DecimalEmpty));
2279        }
2280        match digits.parse::<u32>().ok() {
2281            Some(n) => Ok(n),
2282            None => Err(self.error(span, ast::ErrorKind::DecimalInvalid)),
2283        }
2284    }
2285
2286    #[inline(never)]
2287    fn parse_set_class(&self) -> Result<ClassBracketed> {
2288        assert_eq!(self.char(), '[');
2289
2290        let mut union = ClassSetUnion {
2291            span: self.span(),
2292            items: vec![],
2293        };
2294        loop {
2295            self.bump_space();
2296            if self.is_eof() {
2297                return Err(self.unclosed_class_error());
2298            }
2299            match self.char() {
2300                '[' => {
2301                    // If we've already parsed the opening bracket, then
2302                    // attempt to treat this as the beginning of an ASCII
2303                    // class. If ASCII class parsing fails, then the parser
2304                    // backs up to `[`.
2305                    if !self.parser().stack_class.borrow().is_empty() {
2306                        if let Some(cls) = self.maybe_parse_ascii_class() {
2307                            union.push(ClassSetItem::Ascii(cls));
2308                            continue;
2309                        }
2310                    }
2311                    union = self.push_class_open(union)?;
2312                }
2313                ']' => match self.pop_class(union)? {
2314                    Either::Left(nested_union) => {
2315                        union = nested_union;
2316                    }
2317                    Either::Right(class) => return Ok(class),
2318                },
2319                '&' if self.peek() == Some('&') => {
2320                    assert!(self.bump_if("&&"));
2321                    union = self.push_class_op(ClassSetBinaryOpKind::Intersection, union);
2322                }
2323                '-' if self.peek() == Some('-') => {
2324                    assert!(self.bump_if("--"));
2325                    union = self.push_class_op(ClassSetBinaryOpKind::Difference, union);
2326                }
2327                '~' if self.peek() == Some('~') => {
2328                    assert!(self.bump_if("~~"));
2329                    union = self.push_class_op(ClassSetBinaryOpKind::SymmetricDifference, union);
2330                }
2331                _ => {
2332                    union.push(self.parse_set_class_range()?);
2333                }
2334            }
2335        }
2336    }
2337
2338    #[inline(never)]
2339    fn parse_set_class_range(&self) -> Result<ClassSetItem> {
2340        let prim1 = self.parse_set_class_item()?;
2341        self.bump_space();
2342        if self.is_eof() {
2343            return Err(self.unclosed_class_error());
2344        }
2345        if self.char() != '-' || self.peek_space() == Some(']') || self.peek_space() == Some('-') {
2346            return prim1.into_class_set_item(self);
2347        }
2348        if !self.bump_and_bump_space() {
2349            return Err(self.unclosed_class_error());
2350        }
2351        let prim2 = self.parse_set_class_item()?;
2352        let range = ClassSetRange {
2353            span: Span::new(prim1.span().start, prim2.span().end),
2354            start: prim1.into_class_literal(self)?,
2355            end: prim2.into_class_literal(self)?,
2356        };
2357        if !range.is_valid() {
2358            return Err(self.error(range.span, ast::ErrorKind::ClassRangeInvalid));
2359        }
2360        Ok(ClassSetItem::Range(range))
2361    }
2362
2363    #[inline(never)]
2364    fn parse_set_class_item(&self) -> Result<Primitive> {
2365        if self.char() == '\\' {
2366            self.parse_escape()
2367        } else {
2368            let x = Primitive::Literal(Literal {
2369                span: self.span_char(),
2370                kind: LiteralKind::Verbatim,
2371                c: self.char(),
2372            });
2373            self.bump();
2374            Ok(x)
2375        }
2376    }
2377
2378    #[inline(never)]
2379    fn parse_set_class_open(&self) -> Result<(ClassBracketed, ClassSetUnion)> {
2380        assert_eq!(self.char(), '[');
2381        let start = self.pos();
2382        if !self.bump_and_bump_space() {
2383            return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2384        }
2385
2386        let negated = if self.char() != '^' {
2387            false
2388        } else {
2389            if !self.bump_and_bump_space() {
2390                return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2391            }
2392            true
2393        };
2394        // Accept any number of `-` as literal `-`.
2395        let mut union = ClassSetUnion {
2396            span: self.span(),
2397            items: vec![],
2398        };
2399        while self.char() == '-' {
2400            union.push(ClassSetItem::Literal(Literal {
2401                span: self.span_char(),
2402                kind: LiteralKind::Verbatim,
2403                c: '-',
2404            }));
2405            if !self.bump_and_bump_space() {
2406                return Err(self.error(Span::new(start, start), ast::ErrorKind::ClassUnclosed));
2407            }
2408        }
2409        // If `]` is the *first* char in a set, then interpret it as a literal
2410        // `]`. That is, an empty class is impossible to write.
2411        if union.items.is_empty() && self.char() == ']' {
2412            union.push(ClassSetItem::Literal(Literal {
2413                span: self.span_char(),
2414                kind: LiteralKind::Verbatim,
2415                c: ']',
2416            }));
2417            if !self.bump_and_bump_space() {
2418                return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2419            }
2420        }
2421        let set = ClassBracketed {
2422            span: Span::new(start, self.pos()),
2423            negated,
2424            kind: ClassSet::union(ClassSetUnion {
2425                span: Span::new(union.span.start, union.span.start),
2426                items: vec![],
2427            }),
2428        };
2429        Ok((set, union))
2430    }
2431
2432    #[inline(never)]
2433    fn maybe_parse_ascii_class(&self) -> Option<ClassAscii> {
2434        assert_eq!(self.char(), '[');
2435        // If parsing fails, then we back up the parser to this starting point.
2436        let start = self.pos();
2437        let mut negated = false;
2438        if !self.bump() || self.char() != ':' {
2439            self.parser().pos.set(start);
2440            return None;
2441        }
2442        if !self.bump() {
2443            self.parser().pos.set(start);
2444            return None;
2445        }
2446        if self.char() == '^' {
2447            negated = true;
2448            if !self.bump() {
2449                self.parser().pos.set(start);
2450                return None;
2451            }
2452        }
2453        let name_start = self.offset();
2454        while self.char() != ':' && self.bump() {}
2455        if self.is_eof() {
2456            self.parser().pos.set(start);
2457            return None;
2458        }
2459        let name = &self.pattern()[name_start..self.offset()];
2460        if !self.bump_if(":]") {
2461            self.parser().pos.set(start);
2462            return None;
2463        }
2464        let kind = match regex_syntax::ast::ClassAsciiKind::from_name(name) {
2465            Some(kind) => kind,
2466            None => {
2467                self.parser().pos.set(start);
2468                return None;
2469            }
2470        };
2471        Some(ClassAscii {
2472            span: Span::new(start, self.pos()),
2473            kind,
2474            negated,
2475        })
2476    }
2477
2478    #[inline(never)]
2479    fn parse_unicode_class(&self) -> Result<ClassUnicode> {
2480        assert!(self.char() == 'p' || self.char() == 'P');
2481
2482        let mut scratch = self.parser().scratch.borrow_mut();
2483        scratch.clear();
2484
2485        let negated = self.char() == 'P';
2486        if !self.bump_and_bump_space() {
2487            return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2488        }
2489        let (start, kind) = if self.char() == '{' {
2490            let start = self.span_char().end;
2491            while self.bump_and_bump_space() && self.char() != '}' {
2492                scratch.push(self.char());
2493            }
2494            if self.is_eof() {
2495                return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2496            }
2497            assert_eq!(self.char(), '}');
2498            self.bump();
2499
2500            let name = scratch.as_str();
2501            if let Some(i) = name.find("!=") {
2502                (
2503                    start,
2504                    ClassUnicodeKind::NamedValue {
2505                        op: ClassUnicodeOpKind::NotEqual,
2506                        name: name[..i].to_string(),
2507                        value: name[i + 2..].to_string(),
2508                    },
2509                )
2510            } else if let Some(i) = name.find(':') {
2511                (
2512                    start,
2513                    ClassUnicodeKind::NamedValue {
2514                        op: ClassUnicodeOpKind::Colon,
2515                        name: name[..i].to_string(),
2516                        value: name[i + 1..].to_string(),
2517                    },
2518                )
2519            } else if let Some(i) = name.find('=') {
2520                (
2521                    start,
2522                    ClassUnicodeKind::NamedValue {
2523                        op: ClassUnicodeOpKind::Equal,
2524                        name: name[..i].to_string(),
2525                        value: name[i + 1..].to_string(),
2526                    },
2527                )
2528            } else {
2529                (start, ClassUnicodeKind::Named(name.to_string()))
2530            }
2531        } else {
2532            let start = self.pos();
2533            let c = self.char();
2534            if c == '\\' {
2535                return Err(self.error(self.span_char(), ast::ErrorKind::UnicodeClassInvalid));
2536            }
2537            self.bump_and_bump_space();
2538            let kind = ClassUnicodeKind::OneLetter(c);
2539            (start, kind)
2540        };
2541        Ok(ClassUnicode {
2542            span: Span::new(start, self.pos()),
2543            negated,
2544            kind,
2545        })
2546    }
2547
2548    #[inline(never)]
2549    fn parse_perl_class(&self) -> ClassPerl {
2550        let c = self.char();
2551        let span = self.span_char();
2552        self.bump();
2553        let (negated, kind) = match c {
2554            'd' => (false, regex_syntax::ast::ClassPerlKind::Digit),
2555            'D' => (true, regex_syntax::ast::ClassPerlKind::Digit),
2556            's' => (false, regex_syntax::ast::ClassPerlKind::Space),
2557            'S' => (true, regex_syntax::ast::ClassPerlKind::Space),
2558            'w' => (false, regex_syntax::ast::ClassPerlKind::Word),
2559            'W' => (true, regex_syntax::ast::ClassPerlKind::Word),
2560            c => panic!("expected valid Perl class but got '{}'", c),
2561        };
2562        ClassPerl {
2563            span,
2564            kind,
2565            negated,
2566        }
2567    }
2568}
2569
2570pub fn parse_ast<'s>(
2571    tb: &mut TB<'s>,
2572    pattern: &'s str,
2573) -> std::result::Result<NodeId, ResharpError> {
2574    let mut p: ResharpParser<'s> = ResharpParser::new(pattern);
2575    p.parse(tb)
2576}
2577
2578pub fn parse_ast_with<'s>(
2579    tb: &mut TB<'s>,
2580    pattern: &'s str,
2581    flags: &PatternFlags,
2582) -> std::result::Result<NodeId, ResharpError> {
2583    let mut p: ResharpParser<'s> = ResharpParser::with_flags(pattern, flags);
2584    p.parse(tb)
2585}
resharp_parser/lib.rs

resharp_parser/
lib.rs