resharp_parser/
lib.rs

1//! Parser for resharp regex patterns.
2//!
3//! Converts regex pattern strings into the node representation used by resharp-algebra.
4
5#![warn(dead_code)]
6mod ast;
7use std::cell::{Cell, RefCell};
8
9use ast::{Ast, Concat, ErrorKind, GroupKind, LookaroundKind};
10use regex_syntax::{
11    ast::{
12        ClassAscii, ClassBracketed, ClassPerl, ClassSet, ClassSetBinaryOpKind, ClassSetItem,
13        ClassSetRange, ClassSetUnion, ClassUnicode, ClassUnicodeKind, ClassUnicodeOpKind,
14        HexLiteralKind, Literal, LiteralKind, Position, Span, SpecialLiteralKind,
15    },
16    hir::{
17        self,
18        translate::{Translator, TranslatorBuilder},
19    },
20    utf8::Utf8Sequences,
21};
22use resharp_algebra::NodeId;
23
24type TB<'s> = resharp_algebra::RegexBuilder;
25
26/// global pattern-level flags, set from `EngineOptions`.
27pub struct PatternFlags {
28    /// `\w`/`\d`/`\s` match full Unicode (true) or ASCII only (false).
29    pub unicode: bool,
30    /// global case-insensitive matching.
31    pub case_insensitive: bool,
32    /// `.` matches `\n` (behaves like `_`).
33    pub dot_matches_new_line: bool,
34    /// allow whitespace and `#` comments in the pattern.
35    pub ignore_whitespace: bool,
36}
37
38impl Default for PatternFlags {
39    fn default() -> Self {
40        Self {
41            unicode: true,
42            case_insensitive: false,
43            dot_matches_new_line: false,
44            ignore_whitespace: false,
45        }
46    }
47}
48
49#[derive(Clone, Copy, PartialEq)]
50enum WordCharKind {
51    Word,
52    NonWord,
53    MaybeWord,
54    MaybeNonWord,
55    Unknown,
56    Edge,
57}
58
59fn is_word_byte(b: u8) -> bool {
60    b.is_ascii_alphanumeric() || b == b'_'
61}
62
63
64#[derive(Clone, Debug, Eq, PartialEq)]
65enum Primitive {
66    Literal(Literal),
67    Assertion(ast::Assertion),
68    Dot(Span),
69    Top(Span),
70    Perl(ClassPerl),
71    Unicode(ClassUnicode),
72}
73
74impl Primitive {
75    fn span(&self) -> &Span {
76        match *self {
77            Primitive::Literal(ref x) => &x.span,
78            Primitive::Assertion(ref x) => &x.span,
79            Primitive::Dot(ref span) => span,
80            Primitive::Top(ref span) => span,
81            Primitive::Perl(ref x) => &x.span,
82            Primitive::Unicode(ref x) => &x.span,
83        }
84    }
85
86    fn into_ast(self) -> Ast {
87        match self {
88            Primitive::Literal(lit) => Ast::literal(lit),
89            Primitive::Assertion(assert) => Ast::assertion(assert),
90            Primitive::Dot(span) => Ast::dot(span),
91            Primitive::Top(span) => Ast::top(span),
92            Primitive::Perl(cls) => Ast::class_perl(cls),
93            Primitive::Unicode(cls) => Ast::class_unicode(cls),
94        }
95    }
96
97    fn into_class_set_item(self, p: &ResharpParser) -> Result<regex_syntax::ast::ClassSetItem> {
98        use self::Primitive::*;
99        use regex_syntax::ast::ClassSetItem;
100
101        match self {
102            Literal(lit) => Ok(ClassSetItem::Literal(lit)),
103            Perl(cls) => Ok(ClassSetItem::Perl(cls)),
104            Unicode(cls) => Ok(ClassSetItem::Unicode(cls)),
105            x => Err(p.error(*x.span(), ast::ErrorKind::ClassEscapeInvalid)),
106        }
107    }
108
109    fn into_class_literal(self, p: &ResharpParser) -> Result<Literal> {
110        use self::Primitive::*;
111
112        match self {
113            Literal(lit) => Ok(lit),
114            x => Err(p.error(*x.span(), ast::ErrorKind::ClassRangeLiteral)),
115        }
116    }
117}
118
119#[derive(Clone, Debug, Eq, PartialEq)]
120pub enum Either<Left, Right> {
121    Left(Left),
122    Right(Right),
123}
124
125#[derive(Clone, Debug, Eq, PartialEq)]
126pub struct ResharpError {
127    /// The kind of error.
128    pub kind: ErrorKind,
129    /// The original pattern that the parser generated the error from. Every
130    /// span in an error is a valid range into this string.
131    pattern: String,
132    /// The span of this error.
133    pub span: Span,
134}
135
136impl std::fmt::Display for ResharpError {
137    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
138        write!(f, "{:?}: {:?}", self.kind, self.span)
139    }
140}
141impl std::error::Error for ResharpError {}
142
143type Result<T> = core::result::Result<T, ResharpError>;
144
145#[derive(Clone, Debug)]
146enum GroupState {
147    /// This state is pushed whenever an opening group is found.
148    Group {
149        /// The concatenation immediately preceding the opening group.
150        concat: Concat,
151        /// The group that has been opened. Its sub-AST is always empty.
152        group: ast::Group,
153        /// Whether this group has the `x` flag enabled or not.
154        ignore_whitespace: bool,
155    },
156    /// This state is pushed whenever a new alternation branch is found. If
157    /// an alternation branch is found and this state is at the top of the
158    /// stack, then this state should be modified to include the new
159    /// alternation.
160    Alternation(ast::Alternation),
161    Intersection(ast::Intersection),
162}
163
164#[derive(Clone, Debug)]
165enum ClassState {
166    /// This state is pushed whenever an opening bracket is found.
167    Open {
168        /// The union of class items immediately preceding this class.
169        union: regex_syntax::ast::ClassSetUnion,
170        /// The class that has been opened. Typically this just corresponds
171        /// to the `[`, but it can also include `[^` since `^` indicates
172        /// negation of the class.
173        set: regex_syntax::ast::ClassBracketed,
174    },
175    /// This state is pushed when a operator is seen. When popped, the stored
176    /// set becomes the left hand side of the operator.
177    Op {
178        /// The type of the operation, i.e., &&, -- or ~~.
179        kind: regex_syntax::ast::ClassSetBinaryOpKind,
180        /// The left-hand side of the operator.
181        lhs: regex_syntax::ast::ClassSet,
182    },
183}
184
185/// RE# syntax parser based on the regex-syntax crate.
186pub struct ResharpParser<'s> {
187    perl_classes: Vec<(bool, regex_syntax::ast::ClassPerlKind, NodeId)>,
188    unicode_classes: resharp_algebra::UnicodeClassCache,
189    pub translator: regex_syntax::hir::translate::Translator,
190    pub pattern: &'s str,
191    pos: Cell<Position>,
192    capture_index: Cell<u32>,
193    octal: bool,
194    empty_min_range: bool,
195    ignore_whitespace: Cell<bool>,
196    dot_all: Cell<bool>,
197    global_unicode: bool,
198    global_case_insensitive: bool,
199    comments: RefCell<Vec<ast::Comment>>,
200    stack_group: RefCell<Vec<GroupState>>,
201    stack_class: RefCell<Vec<ClassState>>,
202    capture_names: RefCell<Vec<ast::CaptureName>>,
203    scratch: RefCell<String>,
204}
205
206fn specialize_err<T>(result: Result<T>, from: ast::ErrorKind, to: ast::ErrorKind) -> Result<T> {
207    if let Err(e) = result {
208        if e.kind == from {
209            Err(ResharpError {
210                kind: to,
211                pattern: e.pattern,
212                span: e.span,
213            })
214        } else {
215            Err(e)
216        }
217    } else {
218        result
219    }
220}
221
222fn is_capture_char(c: char, first: bool) -> bool {
223    if first {
224        c == '_' || c.is_alphabetic()
225    } else {
226        c == '_' || c == '.' || c == '[' || c == ']' || c.is_alphanumeric()
227    }
228}
229
230pub fn is_meta_character(c: char) -> bool {
231    match c {
232        '\\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' | '[' | ']' | '{' | '}' | '^' | '$'
233        | '#' | '&' | '-' | '~' | '_' => true,
234        _ => false,
235    }
236}
237
238/// escapes all resharp meta characters in `text`.
239pub fn escape(text: &str) -> String {
240    let mut buf = String::new();
241    escape_into(text, &mut buf);
242    buf
243}
244
245/// escapes all resharp meta characters in `text` and appends to `buf`.
246pub fn escape_into(text: &str, buf: &mut String) {
247    buf.reserve(text.len());
248    for c in text.chars() {
249        if is_meta_character(c) {
250            buf.push('\\');
251        }
252        buf.push(c);
253    }
254}
255
256pub fn is_escapeable_character(c: char) -> bool {
257    // Certainly escapeable if it's a meta character.
258    if is_meta_character(c) {
259        return true;
260    }
261    // Any character that isn't ASCII is definitely not escapeable. There's
262    // no real need to allow things like \☃ right?
263    if !c.is_ascii() {
264        return false;
265    }
266    // Otherwise, we basically say that everything is escapeable unless it's a
267    // letter or digit. Things like \3 are either octal (when enabled) or an
268    // error, and we should keep it that way. Otherwise, letters are reserved
269    // for adding new syntax in a backwards compatible way.
270    match c {
271        '0'..='9' | 'A'..='Z' | 'a'..='z' => false,
272        // While not currently supported, we keep these as not escapeable to
273        // give us some flexibility with respect to supporting the \< and
274        // \> word boundary assertions in the future. By rejecting them as
275        // escapeable, \< and \> will result in a parse error. Thus, we can
276        // turn them into something else in the future without it being a
277        // backwards incompatible change.
278        //
279        // OK, now we support \< and \>, and we need to retain them as *not*
280        // escapeable here since the escape sequence is significant.
281        '<' | '>' => false,
282        _ => true,
283    }
284}
285
286fn is_hex(c: char) -> bool {
287    ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F')
288}
289
290impl<'s> ResharpParser<'s> {
291    fn default_translator_builder(&self) -> TranslatorBuilder {
292        let mut trb = TranslatorBuilder::new();
293        trb.unicode(self.global_unicode);
294        trb.utf8(false);
295        trb.case_insensitive(self.global_case_insensitive);
296        trb
297    }
298
299    pub fn new(pattern: &'s str) -> Self {
300        Self::with_flags(pattern, &PatternFlags::default())
301    }
302
303    pub fn with_flags(pattern: &'s str, flags: &PatternFlags) -> Self {
304        let mut trb = TranslatorBuilder::new();
305        trb.unicode(flags.unicode);
306        trb.utf8(false);
307        trb.case_insensitive(flags.case_insensitive);
308        Self {
309            translator: trb.build(),
310            pattern,
311            perl_classes: vec![],
312            unicode_classes: resharp_algebra::UnicodeClassCache::default(),
313            pos: Cell::new(Position::new(0, 0, 0)),
314            capture_index: Cell::new(0),
315            octal: false,
316            empty_min_range: false,
317            ignore_whitespace: Cell::new(flags.ignore_whitespace),
318            dot_all: Cell::new(flags.dot_matches_new_line),
319            global_unicode: flags.unicode,
320            global_case_insensitive: flags.case_insensitive,
321            comments: RefCell::new(vec![]),
322            stack_group: RefCell::new(vec![]),
323            stack_class: RefCell::new(vec![]),
324            capture_names: RefCell::new(vec![]),
325            scratch: RefCell::new(String::new()),
326        }
327    }
328
329    /// Return a reference to the parser state.
330    fn parser(&'_ self) -> &'_ ResharpParser<'_> {
331        self
332    }
333
334    /// Return a reference to the pattern being parsed.
335    fn pattern(&self) -> &str {
336        self.pattern
337    }
338
339    /// Create a new error with the given span and error type.
340    fn error(&self, span: Span, kind: ast::ErrorKind) -> ResharpError {
341        ResharpError {
342            kind,
343            pattern: self.pattern().to_string(),
344            span,
345        }
346    }
347
348    fn unsupported_error(&self, _: regex_syntax::hir::Error) -> ResharpError {
349        let emptyspan = Span::splat(self.pos());
350        let inner = self.error(emptyspan, ast::ErrorKind::UnsupportedResharpRegex);
351        inner
352    }
353
354    /// Return the current offset of the parser.
355    ///
356    /// The offset starts at `0` from the beginning of the regular expression
357    /// pattern string.
358    fn offset(&self) -> usize {
359        self.parser().pos.get().offset
360    }
361
362    /// Return the current line number of the parser.
363    ///
364    /// The line number starts at `1`.
365    fn line(&self) -> usize {
366        self.parser().pos.get().line
367    }
368
369    /// Return the current column of the parser.
370    ///
371    /// The column number starts at `1` and is reset whenever a `\n` is seen.
372    fn column(&self) -> usize {
373        self.parser().pos.get().column
374    }
375
376    /// Return the next capturing index. Each subsequent call increments the
377    /// internal index.
378    ///
379    /// The span given should correspond to the location of the opening
380    /// parenthesis.
381    ///
382    /// If the capture limit is exceeded, then an error is returned.
383    fn next_capture_index(&self, span: Span) -> Result<u32> {
384        let current = self.parser().capture_index.get();
385        let i = current
386            .checked_add(1)
387            .ok_or_else(|| self.error(span, ast::ErrorKind::CaptureLimitExceeded))?;
388        self.parser().capture_index.set(i);
389        Ok(i)
390    }
391
392    fn add_capture_name(&self, cap: &ast::CaptureName) -> Result<()> {
393        let mut names = self.parser().capture_names.borrow_mut();
394        match names.binary_search_by_key(&cap.name.as_str(), |c| c.name.as_str()) {
395            Err(i) => {
396                names.insert(i, cap.clone());
397                Ok(())
398            }
399            Ok(i) => Err(self.error(
400                cap.span,
401                ast::ErrorKind::GroupNameDuplicate {
402                    original: names[i].span,
403                },
404            )),
405        }
406    }
407
408    fn ignore_whitespace(&self) -> bool {
409        self.parser().ignore_whitespace.get()
410    }
411
412    fn char(&self) -> char {
413        self.char_at(self.offset())
414    }
415
416    fn char_at(&self, i: usize) -> char {
417        self.pattern()[i..]
418            .chars()
419            .next()
420            .unwrap_or_else(|| panic!("expected char at offset {}", i))
421    }
422
423    fn bump(&self) -> bool {
424        if self.is_eof() {
425            return false;
426        }
427        let Position {
428            mut offset,
429            mut line,
430            mut column,
431        } = self.pos();
432        if self.char() == '\n' {
433            line = line.checked_add(1).unwrap();
434            column = 1;
435        } else {
436            column = column.checked_add(1).unwrap();
437        }
438        offset += self.char().len_utf8();
439        self.parser().pos.set(Position {
440            offset,
441            line,
442            column,
443        });
444        self.pattern()[self.offset()..].chars().next().is_some()
445    }
446
447    fn bump_if(&self, prefix: &str) -> bool {
448        if self.pattern()[self.offset()..].starts_with(prefix) {
449            for _ in 0..prefix.chars().count() {
450                self.bump();
451            }
452            true
453        } else {
454            false
455        }
456    }
457
458    fn is_lookaround_prefix(&self) -> Option<(bool, bool)> {
459        if self.bump_if("?=") {
460            return Some((true, true));
461        }
462        if self.bump_if("?!") {
463            return Some((true, false));
464        }
465        if self.bump_if("?<=") {
466            return Some((false, true));
467        }
468        if self.bump_if("?<!") {
469            return Some((false, false));
470        }
471        return None;
472        // self.bump_if("?=") || self.bump_if("?!") || self.bump_if("?<=") || self.bump_if("?<!")
473    }
474
475    fn bump_and_bump_space(&self) -> bool {
476        if !self.bump() {
477            return false;
478        }
479        self.bump_space();
480        !self.is_eof()
481    }
482
483    fn bump_space(&self) {
484        if !self.ignore_whitespace() {
485            return;
486        }
487        while !self.is_eof() {
488            if self.char().is_whitespace() {
489                self.bump();
490            } else if self.char() == '#' {
491                let start = self.pos();
492                let mut comment_text = String::new();
493                self.bump();
494                while !self.is_eof() {
495                    let c = self.char();
496                    self.bump();
497                    if c == '\n' {
498                        break;
499                    }
500                    comment_text.push(c);
501                }
502                let comment = ast::Comment {
503                    span: Span::new(start, self.pos()),
504                    comment: comment_text,
505                };
506                self.parser().comments.borrow_mut().push(comment);
507            } else {
508                break;
509            }
510        }
511    }
512
513    /// Peek at the next character in the input without advancing the parser.
514    ///
515    /// If the input has been exhausted, then this returns `None`.
516    fn peek(&self) -> Option<char> {
517        if self.is_eof() {
518            return None;
519        }
520        self.pattern()[self.offset() + self.char().len_utf8()..]
521            .chars()
522            .next()
523    }
524
525    /// Like peek, but will ignore spaces when the parser is in whitespace
526    /// insensitive mode.
527    fn peek_space(&self) -> Option<char> {
528        if !self.ignore_whitespace() {
529            return self.peek();
530        }
531        if self.is_eof() {
532            return None;
533        }
534        let mut start = self.offset() + self.char().len_utf8();
535        let mut in_comment = false;
536        for (i, c) in self.pattern()[start..].char_indices() {
537            if c.is_whitespace() {
538                continue;
539            } else if !in_comment && c == '#' {
540                in_comment = true;
541            } else if in_comment && c == '\n' {
542                in_comment = false;
543            } else {
544                start += i;
545                break;
546            }
547        }
548        self.pattern()[start..].chars().next()
549    }
550
551    /// Returns true if the next call to `bump` would return false.
552    fn is_eof(&self) -> bool {
553        self.offset() == self.pattern().len()
554    }
555
556    /// Return the current position of the parser, which includes the offset,
557    /// line and column.
558    fn pos(&self) -> Position {
559        self.parser().pos.get()
560    }
561
562    /// Create a span at the current position of the parser. Both the start
563    /// and end of the span are set.
564    fn span(&self) -> Span {
565        Span::splat(self.pos())
566    }
567
568    /// Create a span that covers the current character.
569    fn span_char(&self) -> Span {
570        let mut next = Position {
571            offset: self.offset().checked_add(self.char().len_utf8()).unwrap(),
572            line: self.line(),
573            column: self.column().checked_add(1).unwrap(),
574        };
575        if self.char() == '\n' {
576            next.line += 1;
577            next.column = 1;
578        }
579        Span::new(self.pos(), next)
580    }
581
582    /// Parse and push a single alternation on to the parser's internal stack.
583    /// If the top of the stack already has an alternation, then add to that
584    /// instead of pushing a new one.
585    ///
586    /// The concatenation given corresponds to a single alternation branch.
587    /// The concatenation returned starts the next branch and is empty.
588    ///
589    /// This assumes the parser is currently positioned at `|` and will advance
590    /// the parser to the character following `|`.
591    #[inline(never)]
592    fn push_alternate(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
593        assert_eq!(self.char(), '|');
594        concat.span.end = self.pos();
595        self.push_or_add_alternation(concat);
596        self.bump();
597        Ok(ast::Concat {
598            span: self.span(),
599            asts: vec![],
600        })
601    }
602
603    /// Pushes or adds the given branch of an alternation to the parser's
604    /// internal stack of state.
605    fn push_or_add_alternation(&self, concat: Concat) {
606        use self::GroupState::*;
607
608        let mut stack = self.parser().stack_group.borrow_mut();
609        if let Some(&mut Alternation(ref mut alts)) = stack.last_mut() {
610            alts.asts.push(concat.into_ast());
611            return;
612        }
613        stack.push(Alternation(ast::Alternation {
614            span: Span::new(concat.span.start, self.pos()),
615            asts: vec![concat.into_ast()],
616        }));
617    }
618
619    #[inline(never)]
620    fn push_intersect(&self, mut concat: Concat) -> Result<Concat> {
621        assert_eq!(self.char(), '&');
622        concat.span.end = self.pos();
623        self.push_or_add_intersect(concat);
624        self.bump();
625        Ok(Concat {
626            span: self.span(),
627            asts: vec![],
628        })
629    }
630
631    /// Pushes or adds the given branch of an alternation to the parser's
632    /// internal stack of state.
633    fn push_or_add_intersect(&self, concat: Concat) {
634        use self::GroupState::*;
635
636        let mut stack = self.parser().stack_group.borrow_mut();
637        if let Some(&mut Intersection(ref mut alts)) = stack.last_mut() {
638            alts.asts.push(concat.into_ast());
639            return;
640        }
641        stack.push(Intersection(ast::Intersection {
642            span: Span::new(concat.span.start, self.pos()),
643            asts: vec![concat.into_ast()],
644        }));
645    }
646
647    /// Parse and push a group AST (and its parent concatenation) on to the
648    /// parser's internal stack. Return a fresh concatenation corresponding
649    /// to the group's sub-AST.
650    ///
651    /// If a set of flags was found (with no group), then the concatenation
652    /// is returned with that set of flags added.
653    ///
654    /// This assumes that the parser is currently positioned on the opening
655    /// parenthesis. It advances the parser to the character at the start
656    /// of the sub-expression (or adjoining expression).
657    ///
658    /// If there was a problem parsing the start of the group, then an error
659    /// is returned.
660    #[inline(never)]
661    fn push_group(&self, mut concat: Concat) -> Result<Concat> {
662        assert_eq!(self.char(), '(');
663        match self.parse_group()? {
664            Either::Left(set) => {
665                let ignore = set.flags.flag_state(ast::Flag::IgnoreWhitespace);
666                if let Some(v) = ignore {
667                    self.parser().ignore_whitespace.set(v);
668                }
669
670                concat.asts.push(Ast::flags(set));
671                Ok(concat)
672            }
673            Either::Right(group) => {
674                let old_ignore_whitespace = self.ignore_whitespace();
675                let new_ignore_whitespace = group
676                    .flags()
677                    .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
678                    .unwrap_or(old_ignore_whitespace);
679                self.parser()
680                    .stack_group
681                    .borrow_mut()
682                    .push(GroupState::Group {
683                        concat,
684                        group,
685                        ignore_whitespace: old_ignore_whitespace,
686                    });
687                self.parser().ignore_whitespace.set(new_ignore_whitespace);
688                Ok(Concat {
689                    span: self.span(),
690                    asts: vec![],
691                })
692            }
693        }
694    }
695
696    #[inline(never)]
697    fn push_compl_group(&self, concat: Concat) -> Result<Concat> {
698        assert_eq!(self.char(), '~');
699        self.bump();
700        if self.is_eof() || self.char() != '(' {
701            return Err(self.error(self.span(), ast::ErrorKind::ComplementGroupExpected));
702        }
703        let open_span = self.span_char();
704        self.bump();
705        let group = ast::Group {
706            span: open_span,
707            kind: ast::GroupKind::Complement,
708            ast: Box::new(Ast::empty(self.span())),
709        };
710
711        let old_ignore_whitespace = self.ignore_whitespace();
712        let new_ignore_whitespace = group
713            .flags()
714            .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
715            .unwrap_or(old_ignore_whitespace);
716        self.parser()
717            .stack_group
718            .borrow_mut()
719            .push(GroupState::Group {
720                concat,
721                group,
722                ignore_whitespace: old_ignore_whitespace,
723            });
724        self.parser().ignore_whitespace.set(new_ignore_whitespace);
725        Ok(Concat {
726            span: self.span(),
727            asts: vec![],
728        })
729    }
730
731    /// Pop a group AST from the parser's internal stack and set the group's
732    /// AST to the given concatenation. Return the concatenation containing
733    /// the group.
734    ///
735    /// This assumes that the parser is currently positioned on the closing
736    /// parenthesis and advances the parser to the character following the `)`.
737    ///
738    /// If no such group could be popped, then an unopened group error is
739    /// returned.
740    #[inline(never)]
741    fn pop_group(&self, mut group_concat: Concat) -> Result<Concat> {
742        use self::GroupState::*;
743        assert_eq!(self.char(), ')');
744        let mut stack = self.parser().stack_group.borrow_mut();
745        let topstack = stack.pop();
746
747        let (mut prior_concat, mut group, ignore_whitespace, alt) = match topstack {
748            Some(Group {
749                concat,
750                group,
751                ignore_whitespace,
752            }) => (concat, group, ignore_whitespace, None),
753            Some(Alternation(alt)) => match stack.pop() {
754                Some(Group {
755                    concat,
756                    group,
757                    ignore_whitespace,
758                }) => (
759                    concat,
760                    group,
761                    ignore_whitespace,
762                    Some(Either::Left::<ast::Alternation, ast::Intersection>(alt)),
763                ),
764                None | Some(Alternation(_)) | Some(Intersection(_)) => {
765                    return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
766                }
767            },
768            Some(Intersection(int)) => match stack.pop() {
769                Some(Group {
770                    concat,
771                    group,
772                    ignore_whitespace,
773                }) => (
774                    concat,
775                    group,
776                    ignore_whitespace,
777                    Some(Either::Right::<ast::Alternation, ast::Intersection>(int)),
778                ),
779                None | Some(Alternation(_)) | Some(Intersection(_)) => {
780                    return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
781                }
782            },
783
784            None => {
785                return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
786            }
787        };
788        self.parser().ignore_whitespace.set(ignore_whitespace);
789        group_concat.span.end = self.pos();
790        self.bump();
791        group.span.end = self.pos();
792        match alt {
793            Some(Either::Left(mut alt)) => {
794                alt.span.end = group_concat.span.end;
795                alt.asts.push(group_concat.into_ast());
796                group.ast = Box::new(alt.into_ast());
797            }
798            Some(Either::Right(mut int)) => {
799                int.span.end = group_concat.span.end;
800                int.asts.push(group_concat.into_ast());
801                group.ast = Box::new(int.into_ast());
802            }
803            None => {
804                group.ast = Box::new(group_concat.into_ast());
805            }
806        }
807
808        if group.kind == GroupKind::Complement {
809            let complement = ast::Complement {
810                span: self.span(),
811                ast: group.ast,
812            };
813            prior_concat.asts.push(Ast::complement(complement));
814        }
815        // ignore groups for now
816        else {
817            // dbg!(&group);
818            prior_concat.asts.push(Ast::group(group));
819        }
820        Ok(prior_concat)
821    }
822
823    /// Pop the last state from the parser's internal stack, if it exists, and
824    /// add the given concatenation to it. There either must be no state or a
825    /// single alternation item on the stack. Any other scenario produces an
826    /// error.
827    ///
828    /// This assumes that the parser has advanced to the end.
829    #[inline(never)]
830    fn pop_group_end(&self, mut concat: ast::Concat) -> Result<Ast> {
831        concat.span.end = self.pos();
832        // dbg!(&concat);
833        let mut stack = self.parser().stack_group.borrow_mut();
834        let ast = match stack.pop() {
835            None => Ok(concat.into_ast()),
836            Some(GroupState::Alternation(mut alt)) => {
837                alt.span.end = self.pos();
838                alt.asts.push(concat.into_ast());
839                Ok(Ast::alternation(alt))
840            }
841            Some(GroupState::Intersection(mut int)) => {
842                int.span.end = self.pos();
843                int.asts.push(concat.into_ast());
844
845                // dbg!("end intersection2");
846                Ok(Ast::intersection(int))
847            }
848            Some(GroupState::Group { group, .. }) => {
849                return Err(self.error(group.span, ast::ErrorKind::GroupUnclosed));
850            }
851        };
852        // If we try to pop again, there should be nothing.
853        match stack.pop() {
854            None => ast,
855            Some(GroupState::Alternation(_)) => {
856                // This unreachable is unfortunate. This case can't happen
857                // because the only way we can be here is if there were two
858                // `GroupState::Alternation`s adjacent in the parser's stack,
859                // which we guarantee to never happen because we never push a
860                // `GroupState::Alternation` if one is already at the top of
861                // the stack.
862                unreachable!()
863            }
864            Some(GroupState::Intersection(_)) => {
865                unreachable!()
866            }
867            Some(GroupState::Group { group, .. }) => {
868                Err(self.error(group.span, ast::ErrorKind::GroupUnclosed))
869            }
870        }
871    }
872
873    /// Parse the opening of a character class and push the current class
874    /// parsing context onto the parser's stack. This assumes that the parser
875    /// is positioned at an opening `[`. The given union should correspond to
876    /// the union of set items built up before seeing the `[`.
877    ///
878    /// If there was a problem parsing the opening of the class, then an error
879    /// is returned. Otherwise, a new union of set items for the class is
880    /// returned (which may be populated with either a `]` or a `-`).
881    #[inline(never)]
882    fn push_class_open(
883        &self,
884        parent_union: regex_syntax::ast::ClassSetUnion,
885    ) -> Result<regex_syntax::ast::ClassSetUnion> {
886        assert_eq!(self.char(), '[');
887
888        let (nested_set, nested_union) = self.parse_set_class_open()?;
889        self.parser()
890            .stack_class
891            .borrow_mut()
892            .push(ClassState::Open {
893                union: parent_union,
894                set: nested_set,
895            });
896        Ok(nested_union)
897    }
898
899    /// Parse the end of a character class set and pop the character class
900    /// parser stack. The union given corresponds to the last union built
901    /// before seeing the closing `]`. The union returned corresponds to the
902    /// parent character class set with the nested class added to it.
903    ///
904    /// This assumes that the parser is positioned at a `]` and will advance
905    /// the parser to the byte immediately following the `]`.
906    ///
907    /// If the stack is empty after popping, then this returns the final
908    /// "top-level" character class AST (where a "top-level" character class
909    /// is one that is not nested inside any other character class).
910    ///
911    /// If there is no corresponding opening bracket on the parser's stack,
912    /// then an error is returned.
913    #[inline(never)]
914    fn pop_class(
915        &self,
916        nested_union: regex_syntax::ast::ClassSetUnion,
917    ) -> Result<Either<regex_syntax::ast::ClassSetUnion, regex_syntax::ast::ClassBracketed>> {
918        assert_eq!(self.char(), ']');
919
920        let item = regex_syntax::ast::ClassSet::Item(nested_union.into_item());
921        let prevset = self.pop_class_op(item);
922        let mut stack = self.parser().stack_class.borrow_mut();
923        match stack.pop() {
924            None => {
925                // We can never observe an empty stack:
926                //
927                // 1) We are guaranteed to start with a non-empty stack since
928                //    the character class parser is only initiated when it sees
929                //    a `[`.
930                // 2) If we ever observe an empty stack while popping after
931                //    seeing a `]`, then we signal the character class parser
932                //    to terminate.
933                panic!("unexpected empty character class stack")
934            }
935            Some(ClassState::Op { .. }) => {
936                // This panic is unfortunate, but this case is impossible
937                // since we already popped the Op state if one exists above.
938                // Namely, every push to the class parser stack is guarded by
939                // whether an existing Op is already on the top of the stack.
940                // If it is, the existing Op is modified. That is, the stack
941                // can never have consecutive Op states.
942                panic!("unexpected ClassState::Op")
943            }
944            Some(ClassState::Open { mut union, mut set }) => {
945                self.bump();
946                set.span.end = self.pos();
947                set.kind = prevset;
948                if stack.is_empty() {
949                    Ok(Either::Right(set))
950                } else {
951                    union.push(regex_syntax::ast::ClassSetItem::Bracketed(Box::new(set)));
952                    Ok(Either::Left(union))
953                }
954            }
955        }
956    }
957
958    /// Return an "unclosed class" error whose span points to the most
959    /// recently opened class.
960    ///
961    /// This should only be called while parsing a character class.
962    #[inline(never)]
963    fn unclosed_class_error(&self) -> ResharpError {
964        for state in self.parser().stack_class.borrow().iter().rev() {
965            if let ClassState::Open { ref set, .. } = *state {
966                return self.error(set.span, ast::ErrorKind::ClassUnclosed);
967            }
968        }
969        // We are guaranteed to have a non-empty stack with at least
970        // one open bracket, so we should never get here.
971        panic!("no open character class found")
972    }
973
974    /// Push the current set of class items on to the class parser's stack as
975    /// the left hand side of the given operator.
976    ///
977    /// A fresh set union is returned, which should be used to build the right
978    /// hand side of this operator.
979    #[inline(never)]
980    fn push_class_op(
981        &self,
982        next_kind: regex_syntax::ast::ClassSetBinaryOpKind,
983        next_union: regex_syntax::ast::ClassSetUnion,
984    ) -> regex_syntax::ast::ClassSetUnion {
985        let item = regex_syntax::ast::ClassSet::Item(next_union.into_item());
986        let new_lhs = self.pop_class_op(item);
987        self.parser().stack_class.borrow_mut().push(ClassState::Op {
988            kind: next_kind,
989            lhs: new_lhs,
990        });
991        regex_syntax::ast::ClassSetUnion {
992            span: self.span(),
993            items: vec![],
994        }
995    }
996
997    /// Pop a character class set from the character class parser stack. If the
998    /// top of the stack is just an item (not an operation), then return the
999    /// given set unchanged. If the top of the stack is an operation, then the
1000    /// given set will be used as the rhs of the operation on the top of the
1001    /// stack. In that case, the binary operation is returned as a set.
1002    #[inline(never)]
1003    fn pop_class_op(&self, rhs: regex_syntax::ast::ClassSet) -> regex_syntax::ast::ClassSet {
1004        let mut stack = self.parser().stack_class.borrow_mut();
1005        let (kind, lhs) = match stack.pop() {
1006            Some(ClassState::Op { kind, lhs }) => (kind, lhs),
1007            Some(state @ ClassState::Open { .. }) => {
1008                stack.push(state);
1009                return rhs;
1010            }
1011            None => unreachable!(),
1012        };
1013        let span = Span::new(lhs.span().start, rhs.span().end);
1014        regex_syntax::ast::ClassSet::BinaryOp(regex_syntax::ast::ClassSetBinaryOp {
1015            span,
1016            kind,
1017            lhs: Box::new(lhs),
1018            rhs: Box::new(rhs),
1019        })
1020    }
1021
1022    fn hir_to_node_id(&self, hir: &hir::Hir, tb: &mut TB<'s>) -> Result<NodeId> {
1023        match hir.kind() {
1024            hir::HirKind::Empty => Ok(NodeId::EPS),
1025            hir::HirKind::Literal(l) => {
1026                if l.0.len() == 1 {
1027                    let node = tb.mk_u8(l.0[0]);
1028                    Ok(node)
1029                } else {
1030                    let ws: Vec<_> = l.0.iter().map(|l| tb.mk_u8(*l)).collect();
1031                    let conc = tb.mk_concats(ws.iter().copied());
1032                    Ok(conc)
1033                }
1034            }
1035            hir::HirKind::Class(class) => {
1036                // dbg!(class);
1037                match class {
1038                    hir::Class::Unicode(class_unicode) => {
1039                        let ranges = class_unicode.ranges();
1040                        let mut nodes = Vec::new();
1041                        for range in ranges {
1042                            for seq in Utf8Sequences::new(range.start(), range.end()) {
1043                                let sl = seq.as_slice();
1044                                let bytes: Vec<_> = sl.iter().map(|s| (s.start, s.end)).collect();
1045                                let node = match bytes.len() {
1046                                    1 => tb.mk_range_u8(bytes[0].0, bytes[0].1),
1047                                    n => {
1048                                        let last = tb.mk_range_u8(bytes[n - 1].0, bytes[n - 1].1);
1049                                        let mut conc = last;
1050                                        for i in (0..n - 1).rev() {
1051                                            let b = tb.mk_range_u8(bytes[i].0, bytes[i].1);
1052                                            conc = tb.mk_concat(b, conc);
1053                                        }
1054                                        conc
1055                                    }
1056                                };
1057                                nodes.push(node);
1058                            }
1059                        }
1060                        let merged = tb.mk_unions(nodes.into_iter());
1061                        Ok(merged)
1062                    }
1063                    hir::Class::Bytes(class_bytes) => {
1064                        let ranges = class_bytes.ranges();
1065                        let mut result = NodeId::BOT;
1066                        for range in ranges {
1067                            let start = range.start();
1068                            let end = range.end();
1069                            let node = tb.mk_range_u8(start, end);
1070                            result = tb.mk_union(result, node);
1071                        }
1072                        Ok(result)
1073                    }
1074                }
1075            }
1076            hir::HirKind::Look(_) => todo!(),
1077            hir::HirKind::Repetition(_) => todo!(),
1078            hir::HirKind::Capture(_) => todo!(),
1079            hir::HirKind::Concat(body) => {
1080                let mut result = NodeId::EPS;
1081                for child in body {
1082                    let node = self.hir_to_node_id(child, tb)?;
1083                    result = tb.mk_concat(result, node);
1084                }
1085                Ok(result)
1086            }
1087            hir::HirKind::Alternation(_) => todo!(),
1088        }
1089    }
1090
1091    fn translate_ast_to_hir(
1092        &mut self,
1093        orig_ast: &regex_syntax::ast::Ast,
1094        tb: &mut TB<'s>,
1095    ) -> Result<NodeId> {
1096        // let unboxed = class_perl.deref();
1097        // let orig_ast = regex_syntax::ast::Ast::class_perl(unboxed.clone());
1098        match self.translator.translate("", orig_ast) {
1099            Err(_) => return Err(self.error(self.span(), ast::ErrorKind::UnicodeClassInvalid)),
1100            Ok(hir) => {
1101                let mapped = self.hir_to_node_id(&hir, tb);
1102                mapped
1103            }
1104        }
1105    }
1106
1107    fn translator_to_node_id(
1108        &mut self,
1109        orig_ast: &regex_syntax::ast::Ast,
1110        translator: &mut Option<Translator>,
1111        tb: &mut TB<'s>,
1112    ) -> Result<NodeId> {
1113        match translator {
1114            Some(tr) => {
1115                let hir = tr
1116                    .translate("", &orig_ast)
1117                    .map_err(|e| self.unsupported_error(e))?;
1118                self.hir_to_node_id(&hir, tb)
1119            }
1120            None => self.translate_ast_to_hir(&orig_ast, tb),
1121        }
1122    }
1123
1124    fn get_class(
1125        &mut self,
1126        negated: bool,
1127        kind: regex_syntax::ast::ClassPerlKind,
1128        tb: &mut TB<'s>,
1129    ) -> Result<NodeId> {
1130        let w = self
1131            .perl_classes
1132            .iter()
1133            .find(|(c_neg, c_kind, _)| *c_kind == kind && *c_neg == negated);
1134        match w {
1135            Some((_, _, value)) => return Ok(*value),
1136            None => {
1137                let translated = if self.global_unicode {
1138                    match kind {
1139                        regex_syntax::ast::ClassPerlKind::Word => {
1140                            self.unicode_classes.ensure_word(tb);
1141                            if negated { self.unicode_classes.non_word } else { self.unicode_classes.word }
1142                        }
1143                        regex_syntax::ast::ClassPerlKind::Digit => {
1144                            self.unicode_classes.ensure_digit(tb);
1145                            if negated { self.unicode_classes.non_digit } else { self.unicode_classes.digit }
1146                        }
1147                        regex_syntax::ast::ClassPerlKind::Space => {
1148                            self.unicode_classes.ensure_space(tb);
1149                            if negated { self.unicode_classes.non_space } else { self.unicode_classes.space }
1150                        }
1151                    }
1152                } else {
1153                    let pos = match kind {
1154                        regex_syntax::ast::ClassPerlKind::Word => {
1155                            let az = tb.mk_range_u8(b'a', b'z');
1156                            let big = tb.mk_range_u8(b'A', b'Z');
1157                            let dig = tb.mk_range_u8(b'0', b'9');
1158                            let us = tb.mk_u8(b'_');
1159                            tb.mk_unions([az, big, dig, us].into_iter())
1160                        }
1161                        regex_syntax::ast::ClassPerlKind::Digit => {
1162                            tb.mk_range_u8(b'0', b'9')
1163                        }
1164                        regex_syntax::ast::ClassPerlKind::Space => {
1165                            let sp = tb.mk_u8(b' ');
1166                            let tab = tb.mk_u8(b'\t');
1167                            let nl = tb.mk_u8(b'\n');
1168                            let cr = tb.mk_u8(b'\r');
1169                            let ff = tb.mk_u8(0x0C);
1170                            let vt = tb.mk_u8(0x0B);
1171                            tb.mk_unions([sp, tab, nl, cr, ff, vt].into_iter())
1172                        }
1173                    };
1174                    if negated { tb.mk_compl(pos) } else { pos }
1175                };
1176                self.perl_classes.push((negated, kind, translated));
1177                Ok(translated)
1178            }
1179        }
1180    }
1181
1182    fn word_char_kind(ast: &Ast, left: bool) -> WordCharKind {
1183        use WordCharKind::*;
1184        match ast {
1185            Ast::Literal(lit) => {
1186                if is_word_byte(lit.c as u8) {
1187                    Word
1188                } else {
1189                    NonWord
1190                }
1191            }
1192            Ast::ClassPerl(c) => match (&c.kind, c.negated) {
1193                (&regex_syntax::ast::ClassPerlKind::Word, false) => Word,
1194                (&regex_syntax::ast::ClassPerlKind::Word, true) => NonWord,
1195                (&regex_syntax::ast::ClassPerlKind::Space, false) => NonWord,
1196                (&regex_syntax::ast::ClassPerlKind::Space, true) => Unknown,
1197                (&regex_syntax::ast::ClassPerlKind::Digit, false) => Word,
1198                (&regex_syntax::ast::ClassPerlKind::Digit, true) => Unknown,
1199            },
1200            Ast::Dot(_) | Ast::Top(_) => Unknown,
1201            Ast::Group(g) => Self::word_char_kind(&g.ast, left),
1202            Ast::Concat(c) if !c.asts.is_empty() => {
1203                Self::word_char_kind(&c.asts[if left { c.asts.len() - 1 } else { 0 }], left)
1204            }
1205            Ast::Repetition(r) => {
1206                let inner = Self::word_char_kind(&r.ast, left);
1207                let nullable = matches!(
1208                    &r.op.kind,
1209                    ast::RepetitionKind::ZeroOrMore
1210                        | ast::RepetitionKind::ZeroOrOne
1211                        | ast::RepetitionKind::Range(ast::RepetitionRange::Bounded(0, _))
1212                );
1213                if nullable {
1214                    match inner {
1215                        Word => MaybeWord,
1216                        NonWord => MaybeNonWord,
1217                        _ => Unknown,
1218                    }
1219                } else {
1220                    inner
1221                }
1222            }
1223            Ast::Lookaround(la) => Self::word_char_kind(&la.ast, left),
1224            _ => Unknown,
1225        }
1226    }
1227
1228    fn edge_class_ast<'a>(ast: &'a Ast, left: bool) -> Option<&'a Ast> {
1229        match ast {
1230            Ast::Literal(_) | Ast::ClassPerl(_) | Ast::ClassBracketed(_)
1231            | Ast::ClassUnicode(_) | Ast::Dot(_) | Ast::Top(_) => Some(ast),
1232            Ast::Group(g) => Self::edge_class_ast(&g.ast, left),
1233            Ast::Concat(c) if !c.asts.is_empty() => {
1234                Self::edge_class_ast(&c.asts[if left { c.asts.len() - 1 } else { 0 }], left)
1235            }
1236            Ast::Repetition(r) => Self::edge_class_ast(&r.ast, left),
1237            Ast::Lookaround(la) => Self::edge_class_ast(&la.ast, left),
1238            _ => None,
1239        }
1240    }
1241
1242    fn resolve_word_kind(
1243        &mut self,
1244        asts: &[Ast],
1245        idx: usize,
1246        dir: isize,
1247        translator: &mut Option<Translator>,
1248        tb: &mut TB<'s>,
1249        word_id: NodeId,
1250        not_word_id: NodeId,
1251    ) -> Result<WordCharKind> {
1252        use WordCharKind::*;
1253        let fast = Self::concat_neighbor_kind(asts, idx, dir);
1254        if fast != Unknown {
1255            return Ok(fast);
1256        }
1257        let neighbor_idx = (idx as isize + dir) as usize;
1258        let edge = match Self::edge_class_ast(&asts[neighbor_idx], dir < 0) {
1259            Some(e) => e,
1260            None => return Ok(Unknown),
1261        };
1262        let node = self.ast_to_node_id(edge, translator, tb)?;
1263        if tb.subsumes(word_id, node) == Some(true) {
1264            Ok(Word)
1265        } else if tb.subsumes(not_word_id, node) == Some(true) {
1266            Ok(NonWord)
1267        } else {
1268            Ok(Unknown)
1269        }
1270    }
1271
1272    fn concat_neighbor_kind(asts: &[Ast], idx: usize, dir: isize) -> WordCharKind {
1273        use WordCharKind::*;
1274        let next = idx as isize + dir;
1275        if next < 0 || next >= asts.len() as isize {
1276            return Edge;
1277        }
1278        let kind = Self::word_char_kind(&asts[next as usize], dir < 0);
1279        match kind {
1280            MaybeWord => match Self::concat_neighbor_kind(asts, next as usize, dir) {
1281                Word => Word,
1282                _ => Unknown,
1283            },
1284            MaybeNonWord => match Self::concat_neighbor_kind(asts, next as usize, dir) {
1285                NonWord => NonWord,
1286                _ => Unknown,
1287            },
1288            other => other,
1289        }
1290    }
1291
1292    fn rewrite_word_boundary_in_concat(
1293        &mut self,
1294        asts: &[Ast],
1295        idx: usize,
1296        translator: &mut Option<Translator>,
1297        tb: &mut TB<'s>,
1298    ) -> Result<(NodeId, usize)> {
1299        use WordCharKind::*;
1300        self.unicode_classes.ensure_word(tb);
1301        let word_id = self.unicode_classes.word;
1302        let not_word_id = self.unicode_classes.non_word;
1303        let left = self.resolve_word_kind(asts, idx, -1, translator, tb, word_id, not_word_id)?;
1304        let right =
1305            self.resolve_word_kind(asts, idx, 1, translator, tb, word_id, not_word_id)?;
1306
1307        match (left, right) {
1308            (NonWord, Word) | (Word, NonWord) => Ok((NodeId::EPS, idx + 1)),
1309            (Word, _) => {
1310                let neg = tb.mk_neg_lookahead(word_id, 0);
1311                Ok((neg, idx + 1))
1312            }
1313            (NonWord, _) => {
1314                let set = tb.mk_union(NodeId::END, word_id);
1315                let tail = tb.mk_concat(set, NodeId::TS);
1316                self.merge_boundary_with_following_lookaheads(asts, idx, tail, translator, tb)
1317            }
1318            (_, Word) => {
1319                Ok((tb.mk_neg_lookbehind(word_id), idx + 1))
1320            }
1321            (_, NonWord) => {
1322                let body = tb.mk_union(NodeId::BEGIN, word_id);
1323                Ok((tb.mk_lookbehind(body, NodeId::MISSING), idx + 1))
1324            }
1325            // TODO: (Unknown, Unknown) is possible via make_full_word_boundary but
1326            // the full expansion (lb(\w)·la(\W) | lb(\W)·la(\w)) is too expensive
1327            _ => return Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex)),
1328        }
1329    }
1330
1331    fn merge_boundary_with_following_lookaheads(
1332        &mut self,
1333        asts: &[Ast],
1334        wb_idx: usize,
1335        boundary_tail: NodeId,
1336        translator: &mut Option<Translator>,
1337        tb: &mut TB<'s>,
1338    ) -> Result<(NodeId, usize)> {
1339        let mut next = wb_idx + 1;
1340        let mut la_bodies = vec![boundary_tail];
1341        while next < asts.len() {
1342            match &asts[next] {
1343                Ast::Lookaround(la) if la.kind == ast::LookaroundKind::PositiveLookahead => {
1344                    let body = self.ast_to_node_id(&la.ast, translator, tb)?;
1345                    la_bodies.push(tb.mk_concat(body, NodeId::TS));
1346                    next += 1;
1347                }
1348                _ => break,
1349            }
1350        }
1351        let merged = tb.mk_inters(la_bodies.into_iter());
1352        Ok((tb.mk_lookahead(merged, NodeId::MISSING, 0), next))
1353    }
1354
1355    fn ast_to_node_id(
1356        &mut self,
1357        ast: &Ast,
1358        translator: &mut Option<Translator>,
1359        tb: &mut TB<'s>,
1360    ) -> Result<NodeId> {
1361        match ast {
1362            Ast::Empty(_) => Ok(NodeId::EPS),
1363            Ast::Flags(f) => {
1364                let mut translator_builder = self.default_translator_builder();
1365                if let Some(state) = f.flags.flag_state(ast::Flag::CaseInsensitive) {
1366                    translator_builder.case_insensitive(state);
1367                }
1368                if let Some(state) = f.flags.flag_state(ast::Flag::Unicode) {
1369                    translator_builder.unicode(state);
1370                }
1371                if let Some(state) = f.flags.flag_state(ast::Flag::DotMatchesNewLine) {
1372                    self.dot_all.set(state);
1373                }
1374                let concat_translator = Some(translator_builder.build());
1375                *translator = concat_translator;
1376                Ok(NodeId::EPS)
1377            }
1378            Ast::Literal(l) => {
1379                let ast_lit = regex_syntax::ast::Ast::literal(*l.to_owned());
1380                self.translator_to_node_id(&ast_lit, translator, tb)
1381            }
1382            Ast::Top(_) => Ok(NodeId::TOP),
1383            Ast::Dot(_) => {
1384                if self.dot_all.get() {
1385                    Ok(NodeId::TOP)
1386                } else {
1387                    let hirv = hir::Hir::dot(hir::Dot::AnyByteExceptLF);
1388                    self.hir_to_node_id(&hirv, tb)
1389                }
1390            }
1391            Ast::Assertion(a) => match &a.kind {
1392                ast::AssertionKind::StartText => Ok(NodeId::BEGIN),
1393                ast::AssertionKind::EndText => Ok(NodeId::END),
1394                ast::AssertionKind::WordBoundary => {
1395                    return Err(
1396                        self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex),
1397                    )
1398                }
1399                ast::AssertionKind::NotWordBoundary => {
1400                    return Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))
1401                }
1402                ast::AssertionKind::StartLine => {
1403                    let left = NodeId::BEGIN;
1404                    let right = tb.mk_u8('\n' as u8);
1405                    let union = tb.mk_union(left, right);
1406                    Ok(tb.mk_lookbehind(union, NodeId::MISSING))
1407                }
1408                ast::AssertionKind::EndLine => {
1409                    let left = NodeId::END;
1410                    let right = tb.mk_u8('\n' as u8);
1411                    let union = tb.mk_union(left, right);
1412                    Ok(tb.mk_lookahead(union, NodeId::MISSING, 0))
1413                }
1414                ast::AssertionKind::WordBoundaryStart => todo!(),
1415                ast::AssertionKind::WordBoundaryEnd => todo!(),
1416                ast::AssertionKind::WordBoundaryStartAngle => todo!(),
1417                ast::AssertionKind::WordBoundaryEndAngle => Ok(tb.mk_string(">")),
1418                ast::AssertionKind::WordBoundaryStartHalf => todo!(),
1419                ast::AssertionKind::WordBoundaryEndHalf => todo!(),
1420            },
1421            Ast::ClassUnicode(c) => {
1422                let tmp = regex_syntax::ast::ClassUnicode {
1423                    span: c.span,
1424                    negated: c.negated,
1425                    kind: c.kind.clone(),
1426                };
1427                if !c.negated {
1428                    match &c.kind {
1429                        regex_syntax::ast::ClassUnicodeKind::Named(s) => match s.as_str() {
1430                            // \p{ascii} for ascii, \p{ascii}&\p{Letter} => [A-Za-z]
1431                            "ascii" => return Ok(tb.mk_range_u8(0, 127)),
1432                            // restricts matches to valid utf8, \p{utf8}*&~(a) => non a, but valid utf8
1433                            "utf8" => {
1434                                let ascii = tb.mk_range_u8(0, 127);
1435                                let beta = tb.mk_range_u8(128, 0xBF);
1436                                let c0 = tb.mk_range_u8(0xC0, 0xDF);
1437                                let c0s = tb.mk_concats([c0, beta].into_iter());
1438                                let e0 = tb.mk_range_u8(0xE0, 0xEF);
1439                                let e0s = tb.mk_concats([e0, beta, beta].into_iter());
1440                                let f0 = tb.mk_range_u8(0xF0, 0xF7);
1441                                let f0s = tb.mk_concats([f0, beta, beta, beta].into_iter());
1442                                let merged = tb.mk_unions([ascii, c0s, e0s, f0s].into_iter());
1443                                return Ok(tb.mk_star(merged));
1444                            }
1445                            "hex" => {
1446                                let nums = tb.mk_range_u8(b'0', b'9');
1447                                let lets = tb.mk_range_u8(b'a', b'f');
1448                                let lets2 = tb.mk_range_u8(b'A', b'F');
1449                                let merged = tb.mk_unions([nums, lets, lets2].into_iter());
1450                                return Ok(merged);
1451                            }
1452                            _ => {}
1453                        },
1454                        _ => {}
1455                    };
1456                }
1457
1458                let orig_ast = regex_syntax::ast::Ast::class_unicode(tmp);
1459                self.translator_to_node_id(&orig_ast, translator, tb)
1460            }
1461            Ast::ClassPerl(c) => {
1462                self.get_class(c.negated, c.kind.clone(), tb)
1463            }
1464            Ast::ClassBracketed(c) => match &c.kind {
1465                regex_syntax::ast::ClassSet::Item(_) => {
1466                    let tmp = regex_syntax::ast::ClassBracketed {
1467                        span: c.span,
1468                        negated: c.negated,
1469                        kind: c.kind.clone(),
1470                    };
1471                    let orig_ast = regex_syntax::ast::Ast::class_bracketed(tmp);
1472                    self.translator_to_node_id(&orig_ast, translator, tb)
1473                }
1474                regex_syntax::ast::ClassSet::BinaryOp(_) => todo!(),
1475            },
1476            Ast::Repetition(r) => {
1477                let body = self.ast_to_node_id(&r.ast, translator, tb);
1478                match body {
1479                    Ok(body) => match &r.op.kind {
1480                        ast::RepetitionKind::ZeroOrOne => Ok(tb.mk_opt(body)),
1481                        ast::RepetitionKind::ZeroOrMore => Ok(tb.mk_star(body)),
1482                        ast::RepetitionKind::OneOrMore => Ok(tb.mk_plus(body)),
1483                        ast::RepetitionKind::Range(r) => match r {
1484                            ast::RepetitionRange::Exactly(n) => Ok(tb.mk_repeat(body, *n, *n)),
1485                            ast::RepetitionRange::AtLeast(n) => {
1486                                let rep = tb.mk_repeat(body, *n, *n);
1487                                let st = tb.mk_star(body);
1488                                Ok(tb.mk_concat(rep, st))
1489                            }
1490
1491                            ast::RepetitionRange::Bounded(n, m) => Ok(tb.mk_repeat(body, *n, *m)),
1492                        },
1493                    },
1494                    Err(_) => body,
1495                }
1496            }
1497            Ast::Lookaround(g) => {
1498                let body = self.ast_to_node_id(&g.ast, translator, tb)?;
1499                match g.kind {
1500                    ast::LookaroundKind::PositiveLookahead => {
1501                        Ok(tb.mk_lookahead(body, NodeId::MISSING, 0))
1502                    }
1503                    ast::LookaroundKind::PositiveLookbehind => {
1504                        Ok(tb.mk_lookbehind(body, NodeId::MISSING))
1505                    }
1506                    ast::LookaroundKind::NegativeLookahead => Ok(tb.mk_neg_lookahead(body, 0)),
1507                    ast::LookaroundKind::NegativeLookbehind => Ok(tb.mk_neg_lookbehind(body)),
1508                }
1509            }
1510            Ast::Group(g) => {
1511                if let ast::GroupKind::NonCapturing(ref flags) = g.kind {
1512                    if !flags.items.is_empty() {
1513                        let mut translator_builder = self.default_translator_builder();
1514                        if let Some(state) = flags.flag_state(ast::Flag::CaseInsensitive) {
1515                            translator_builder.case_insensitive(state);
1516                        }
1517                        if let Some(state) = flags.flag_state(ast::Flag::Unicode) {
1518                            translator_builder.unicode(state);
1519                        }
1520                        let saved_dot_all = self.dot_all.get();
1521                        if let Some(state) = flags.flag_state(ast::Flag::DotMatchesNewLine) {
1522                            self.dot_all.set(state);
1523                        }
1524                        let mut scoped = Some(translator_builder.build());
1525                        let result = self.ast_to_node_id(&g.ast, &mut scoped, tb);
1526                        self.dot_all.set(saved_dot_all);
1527                        return result;
1528                    }
1529                }
1530                self.ast_to_node_id(&g.ast, translator, tb)
1531            }
1532            Ast::Alternation(a) => {
1533                let mut children = vec![];
1534                for ast in &a.asts {
1535                    match self.ast_to_node_id(ast, translator, tb) {
1536                        Ok(node_id) => children.push(node_id),
1537                        Err(err) => return Err(err),
1538                    }
1539                }
1540                Ok(tb.mk_unions(children.iter().copied()))
1541            }
1542            Ast::Concat(c) => {
1543                let mut concat_translator: Option<Translator> = None;
1544                let mut children = vec![];
1545                let mut i = 0;
1546                while i < c.asts.len() {
1547                    let ast = &c.asts[i];
1548                    match ast {
1549                        Ast::Flags(f) => {
1550                            let mut translator_builder = self.default_translator_builder();
1551                            if let Some(state) = f.flags.flag_state(ast::Flag::CaseInsensitive) {
1552                                translator_builder.case_insensitive(state);
1553                            }
1554                            if let Some(state) = f.flags.flag_state(ast::Flag::Unicode) {
1555                                translator_builder.unicode(state);
1556                            }
1557                            if let Some(state) = f.flags.flag_state(ast::Flag::DotMatchesNewLine) {
1558                                self.dot_all.set(state);
1559                            }
1560                            concat_translator = Some(translator_builder.build());
1561                            i += 1;
1562                            continue;
1563                        }
1564                        Ast::Assertion(a) if a.kind == ast::AssertionKind::WordBoundary => {
1565                            let node =
1566                                self.rewrite_word_boundary_in_concat(&c.asts, i, translator, tb)?;
1567                            children.push(node.0);
1568                            i = node.1; // skip consumed lookaheads
1569                            continue;
1570                        }
1571                        _ => {}
1572                    }
1573                    match concat_translator {
1574                        Some(_) => match self.ast_to_node_id(ast, &mut concat_translator, tb) {
1575                            Ok(node_id) => children.push(node_id),
1576                            Err(err) => return Err(err),
1577                        },
1578                        None => match self.ast_to_node_id(ast, translator, tb) {
1579                            Ok(node_id) => children.push(node_id),
1580                            Err(err) => return Err(err),
1581                        },
1582                    }
1583                    i += 1;
1584                }
1585                Ok(tb.mk_concats(children.iter().cloned()))
1586            }
1587            Ast::Intersection(intersection) => {
1588                let mut children = vec![];
1589                for ast in &intersection.asts {
1590                    match self.ast_to_node_id(ast, translator, tb) {
1591                        Ok(node_id) => children.push(node_id),
1592                        Err(err) => return Err(err),
1593                    }
1594                }
1595                Ok(tb.mk_inters(children.into_iter()))
1596            }
1597            Ast::Complement(complement) => {
1598                let body = self.ast_to_node_id(&complement.ast, translator, tb);
1599                body.map(|x| tb.mk_compl(x))
1600            }
1601        }
1602    }
1603
1604    /// Parse the regular expression and return an abstract syntax tree with
1605    /// all of the comments found in the pattern.
1606    fn parse(&mut self, tb: &mut TB<'s>) -> Result<NodeId> {
1607        let mut concat = Concat {
1608            span: self.span(),
1609            asts: vec![],
1610        };
1611        loop {
1612            self.bump_space();
1613            if self.is_eof() {
1614                break;
1615            }
1616            match self.char() {
1617                '(' => concat = self.push_group(concat)?,
1618                ')' => concat = self.pop_group(concat)?,
1619                '|' => concat = self.push_alternate(concat)?,
1620                '&' => concat = self.push_intersect(concat)?,
1621                '~' => concat = self.push_compl_group(concat)?,
1622                '[' => {
1623                    let class = self.parse_set_class()?;
1624                    concat.asts.push(Ast::class_bracketed(class));
1625                }
1626                '?' => {
1627                    concat =
1628                        self.parse_uncounted_repetition(concat, ast::RepetitionKind::ZeroOrOne)?;
1629                }
1630                '*' => {
1631                    concat =
1632                        self.parse_uncounted_repetition(concat, ast::RepetitionKind::ZeroOrMore)?;
1633                }
1634                '+' => {
1635                    concat =
1636                        self.parse_uncounted_repetition(concat, ast::RepetitionKind::OneOrMore)?;
1637                }
1638                '{' => {
1639                    concat = self.parse_counted_repetition(concat)?;
1640                }
1641                _ => concat.asts.push(self.parse_primitive()?.into_ast()),
1642            }
1643        }
1644        let ast = self.pop_group_end(concat)?;
1645        // dbg!(&ast);
1646        let node_id = self.ast_to_node_id(&ast, &mut None, tb);
1647        node_id
1648    }
1649
1650    #[inline(never)]
1651    fn parse_uncounted_repetition(
1652        &self,
1653        mut concat: ast::Concat,
1654        kind: ast::RepetitionKind,
1655    ) -> Result<ast::Concat> {
1656        // assert!(self.char() == '?' || self.char() == '*' || self.char() == '+');
1657        let op_start = self.pos();
1658        let ast = match concat.asts.pop() {
1659            Some(ast) => ast,
1660            None => return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing)),
1661        };
1662        match ast {
1663            Ast::Empty(_) | Ast::Flags(_) => {
1664                return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing))
1665            }
1666            _ => {}
1667        }
1668        if self.bump() && self.char() == '?' {
1669            return Err(self.error(
1670                Span::new(op_start, self.pos()),
1671                ast::ErrorKind::UnsupportedLazyQuantifier,
1672            ));
1673        }
1674        concat.asts.push(Ast::repetition(ast::Repetition {
1675            span: ast.span().with_end(self.pos()),
1676            op: ast::RepetitionOp {
1677                span: Span::new(op_start, self.pos()),
1678                kind,
1679            },
1680            greedy: true,
1681            ast: Box::new(ast),
1682        }));
1683        Ok(concat)
1684    }
1685
1686    #[inline(never)]
1687    fn parse_counted_repetition(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
1688        assert!(self.char() == '{');
1689        let start = self.pos();
1690        let ast = match concat.asts.pop() {
1691            Some(ast) => ast,
1692            None => return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing)),
1693        };
1694        match ast {
1695            Ast::Empty(_) | Ast::Flags(_) => {
1696                return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing))
1697            }
1698            _ => {}
1699        }
1700        if !self.bump_and_bump_space() {
1701            return Err(self.error(
1702                Span::new(start, self.pos()),
1703                ast::ErrorKind::RepetitionCountUnclosed,
1704            ));
1705        }
1706        let count_start = specialize_err(
1707            self.parse_decimal(),
1708            ast::ErrorKind::DecimalEmpty,
1709            ast::ErrorKind::RepetitionCountDecimalEmpty,
1710        );
1711        if self.is_eof() {
1712            return Err(self.error(
1713                Span::new(start, self.pos()),
1714                ast::ErrorKind::RepetitionCountUnclosed,
1715            ));
1716        }
1717        let range = if self.char() == ',' {
1718            if !self.bump_and_bump_space() {
1719                return Err(self.error(
1720                    Span::new(start, self.pos()),
1721                    ast::ErrorKind::RepetitionCountUnclosed,
1722                ));
1723            }
1724            if self.char() != '}' {
1725                let count_start = match count_start {
1726                    Ok(c) => c,
1727                    Err(err) if err.kind == ast::ErrorKind::RepetitionCountDecimalEmpty => {
1728                        if self.parser().empty_min_range {
1729                            0
1730                        } else {
1731                            return Err(err);
1732                        }
1733                    }
1734                    err => err?,
1735                };
1736                let count_end = specialize_err(
1737                    self.parse_decimal(),
1738                    ast::ErrorKind::DecimalEmpty,
1739                    ast::ErrorKind::RepetitionCountDecimalEmpty,
1740                )?;
1741                ast::RepetitionRange::Bounded(count_start, count_end)
1742            } else {
1743                ast::RepetitionRange::AtLeast(count_start?)
1744            }
1745        } else {
1746            ast::RepetitionRange::Exactly(count_start?)
1747        };
1748
1749        if self.is_eof() || self.char() != '}' {
1750            return Err(self.error(
1751                Span::new(start, self.pos()),
1752                ast::ErrorKind::RepetitionCountUnclosed,
1753            ));
1754        }
1755
1756        if self.bump_and_bump_space() && self.char() == '?' {
1757            return Err(self.error(
1758                Span::new(start, self.pos()),
1759                ast::ErrorKind::UnsupportedLazyQuantifier,
1760            ));
1761        }
1762
1763        let op_span = Span::new(start, self.pos());
1764        if !range.is_valid() {
1765            return Err(self.error(op_span, ast::ErrorKind::RepetitionCountInvalid));
1766        }
1767        concat.asts.push(Ast::repetition(ast::Repetition {
1768            span: ast.span().with_end(self.pos()),
1769            op: ast::RepetitionOp {
1770                span: op_span,
1771                kind: ast::RepetitionKind::Range(range),
1772            },
1773            greedy: true,
1774            ast: Box::new(ast),
1775        }));
1776        Ok(concat)
1777    }
1778
1779    #[inline(never)]
1780    fn parse_group(&self) -> Result<Either<ast::SetFlags, ast::Group>> {
1781        assert_eq!(self.char(), '(');
1782        let open_span = self.span_char();
1783        self.bump();
1784        self.bump_space();
1785        if let Some((ahead, pos)) = self.is_lookaround_prefix() {
1786            let kind = match (pos, ahead) {
1787                (true, true) => LookaroundKind::PositiveLookahead,
1788                (true, false) => LookaroundKind::PositiveLookbehind,
1789                (false, true) => LookaroundKind::NegativeLookahead,
1790                (false, false) => LookaroundKind::NegativeLookbehind,
1791            };
1792            return Ok(Either::Right(ast::Group {
1793                span: open_span,
1794                kind: ast::GroupKind::Lookaround(kind),
1795                ast: Box::new(Ast::empty(self.span())),
1796            }));
1797        }
1798        let inner_span = self.span();
1799        let mut starts_with_p = true;
1800        if self.bump_if("?P<") || {
1801            starts_with_p = false;
1802            self.bump_if("?<")
1803        } {
1804            let capture_index = self.next_capture_index(open_span)?;
1805            let name = self.parse_capture_name(capture_index)?;
1806            Ok(Either::Right(ast::Group {
1807                span: open_span,
1808                kind: ast::GroupKind::CaptureName {
1809                    starts_with_p,
1810                    name,
1811                },
1812                ast: Box::new(Ast::empty(self.span())),
1813            }))
1814        } else if self.bump_if("?") {
1815            if self.is_eof() {
1816                return Err(self.error(open_span, ast::ErrorKind::GroupUnclosed));
1817            }
1818            let flags = self.parse_flags()?;
1819            let char_end = self.char();
1820            self.bump();
1821            if char_end == ')' {
1822                // We don't allow empty flags, e.g., `(?)`. We instead
1823                // interpret it as a repetition operator missing its argument.
1824                if flags.items.is_empty() {
1825                    return Err(self.error(inner_span, ast::ErrorKind::RepetitionMissing));
1826                }
1827                Ok(Either::Left(ast::SetFlags {
1828                    span: Span {
1829                        end: self.pos(),
1830                        ..open_span
1831                    },
1832                    flags,
1833                }))
1834            } else {
1835                assert_eq!(char_end, ':');
1836                Ok(Either::Right(ast::Group {
1837                    span: open_span,
1838                    kind: ast::GroupKind::NonCapturing(flags),
1839                    ast: Box::new(Ast::empty(self.span())),
1840                }))
1841            }
1842        } else {
1843            let capture_index = self.next_capture_index(open_span)?;
1844            Ok(Either::Right(ast::Group {
1845                span: open_span,
1846                kind: ast::GroupKind::CaptureIndex(capture_index),
1847                ast: Box::new(Ast::empty(self.span())),
1848            }))
1849        }
1850    }
1851
1852    #[inline(never)]
1853    fn parse_capture_name(&self, capture_index: u32) -> Result<ast::CaptureName> {
1854        if self.is_eof() {
1855            return Err(self.error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
1856        }
1857        let start = self.pos();
1858        loop {
1859            if self.char() == '>' {
1860                break;
1861            }
1862            if !is_capture_char(self.char(), self.pos() == start) {
1863                return Err(self.error(self.span_char(), ast::ErrorKind::GroupNameInvalid));
1864            }
1865            if !self.bump() {
1866                break;
1867            }
1868        }
1869        let end = self.pos();
1870        if self.is_eof() {
1871            return Err(self.error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
1872        }
1873        assert_eq!(self.char(), '>');
1874        self.bump();
1875        let name = &self.pattern()[start.offset..end.offset];
1876        if name.is_empty() {
1877            return Err(self.error(Span::new(start, start), ast::ErrorKind::GroupNameEmpty));
1878        }
1879        let capname = ast::CaptureName {
1880            span: Span::new(start, end),
1881            name: name.to_string(),
1882            index: capture_index,
1883        };
1884        self.add_capture_name(&capname)?;
1885        Ok(capname)
1886    }
1887
1888    #[inline(never)]
1889    fn parse_flags(&self) -> Result<ast::Flags> {
1890        let mut flags = ast::Flags {
1891            span: self.span(),
1892            items: vec![],
1893        };
1894        let mut last_was_negation = None;
1895        while self.char() != ':' && self.char() != ')' {
1896            if self.char() == '-' {
1897                last_was_negation = Some(self.span_char());
1898                let item = ast::FlagsItem {
1899                    span: self.span_char(),
1900                    kind: ast::FlagsItemKind::Negation,
1901                };
1902                if let Some(i) = flags.add_item(item) {
1903                    return Err(self.error(
1904                        self.span_char(),
1905                        ast::ErrorKind::FlagRepeatedNegation {
1906                            original: flags.items[i].span,
1907                        },
1908                    ));
1909                }
1910            } else {
1911                last_was_negation = None;
1912                let item = ast::FlagsItem {
1913                    span: self.span_char(),
1914                    kind: ast::FlagsItemKind::Flag(self.parse_flag()?),
1915                };
1916                if let Some(i) = flags.add_item(item) {
1917                    return Err(self.error(
1918                        self.span_char(),
1919                        ast::ErrorKind::FlagDuplicate {
1920                            original: flags.items[i].span,
1921                        },
1922                    ));
1923                }
1924            }
1925            if !self.bump() {
1926                return Err(self.error(self.span(), ast::ErrorKind::FlagUnexpectedEof));
1927            }
1928        }
1929        if let Some(span) = last_was_negation {
1930            return Err(self.error(span, ast::ErrorKind::FlagDanglingNegation));
1931        }
1932        flags.span.end = self.pos();
1933        Ok(flags)
1934    }
1935
1936    #[inline(never)]
1937    fn parse_flag(&self) -> Result<ast::Flag> {
1938        match self.char() {
1939            'i' => Ok(ast::Flag::CaseInsensitive),
1940            'm' => Ok(ast::Flag::MultiLine),
1941            's' => Ok(ast::Flag::DotMatchesNewLine),
1942            'U' => Ok(ast::Flag::SwapGreed),
1943            'u' => Ok(ast::Flag::Unicode),
1944            'R' => Ok(ast::Flag::CRLF),
1945            'x' => Ok(ast::Flag::IgnoreWhitespace),
1946            _ => Err(self.error(self.span_char(), ast::ErrorKind::FlagUnrecognized)),
1947        }
1948    }
1949
1950    fn parse_primitive(&self) -> Result<Primitive> {
1951        match self.char() {
1952            '\\' => self.parse_escape(),
1953            '_' => {
1954                let ast = Primitive::Top(self.span_char());
1955                self.bump();
1956                Ok(ast)
1957            }
1958            '.' => {
1959                let ast = Primitive::Dot(self.span_char());
1960                self.bump();
1961                Ok(ast)
1962            }
1963            '^' => {
1964                let ast = Primitive::Assertion(ast::Assertion {
1965                    span: self.span_char(),
1966                    kind: ast::AssertionKind::StartLine,
1967                });
1968                self.bump();
1969                Ok(ast)
1970            }
1971            '$' => {
1972                let ast = Primitive::Assertion(ast::Assertion {
1973                    span: self.span_char(),
1974                    kind: ast::AssertionKind::EndLine,
1975                });
1976                self.bump();
1977                Ok(ast)
1978            }
1979            c => {
1980                let ast = Primitive::Literal(Literal {
1981                    span: self.span_char(),
1982                    kind: LiteralKind::Verbatim,
1983                    c,
1984                });
1985                self.bump();
1986                Ok(ast)
1987            }
1988        }
1989    }
1990
1991    #[inline(never)]
1992    fn parse_escape(&self) -> Result<Primitive> {
1993        assert_eq!(self.char(), '\\');
1994        let start = self.pos();
1995        if !self.bump() {
1996            return Err(self.error(
1997                Span::new(start, self.pos()),
1998                ast::ErrorKind::EscapeUnexpectedEof,
1999            ));
2000        }
2001        let c = self.char();
2002        // Put some of the more complicated routines into helpers.
2003        match c {
2004            '0'..='9' => {
2005                if !self.parser().octal {
2006                    return Err(self.error(
2007                        Span::new(start, self.span_char().end),
2008                        ast::ErrorKind::UnsupportedBackreference,
2009                    ));
2010                }
2011                let mut lit = self.parse_octal();
2012                lit.span.start = start;
2013                return Ok(Primitive::Literal(lit));
2014            }
2015            // '8'..='9' if !self.parser().octal => {
2016            //     return Err(self.error(
2017            //         Span::new(start, self.span_char().end),
2018            //         ast::ErrorKind::UnsupportedBackreference,
2019            //     ));
2020            // }
2021            'x' | 'u' | 'U' => {
2022                let mut lit = self.parse_hex()?;
2023                lit.span.start = start;
2024                return Ok(Primitive::Literal(lit));
2025            }
2026            'p' | 'P' => {
2027                let mut cls = self.parse_unicode_class()?;
2028                cls.span.start = start;
2029                return Ok(Primitive::Unicode(cls));
2030            }
2031            'd' | 's' | 'w' | 'D' | 'S' | 'W' => {
2032                let mut cls = self.parse_perl_class();
2033                cls.span.start = start;
2034                return Ok(Primitive::Perl(cls));
2035            }
2036            _ => {}
2037        }
2038
2039        // Handle all of the one letter sequences inline.
2040        self.bump();
2041        let span = Span::new(start, self.pos());
2042        if is_meta_character(c) {
2043            return Ok(Primitive::Literal(Literal {
2044                span,
2045                kind: LiteralKind::Meta,
2046                c,
2047            }));
2048        }
2049        if is_escapeable_character(c) {
2050            return Ok(Primitive::Literal(Literal {
2051                span,
2052                kind: LiteralKind::Superfluous,
2053                c,
2054            }));
2055        }
2056        let special = |kind, c| {
2057            Ok(Primitive::Literal(Literal {
2058                span,
2059                kind: LiteralKind::Special(kind),
2060                c,
2061            }))
2062        };
2063        match c {
2064            'a' => special(SpecialLiteralKind::Bell, '\x07'),
2065            'f' => special(SpecialLiteralKind::FormFeed, '\x0C'),
2066            't' => special(SpecialLiteralKind::Tab, '\t'),
2067            'n' => special(SpecialLiteralKind::LineFeed, '\n'),
2068            'r' => special(SpecialLiteralKind::CarriageReturn, '\r'),
2069            'v' => special(SpecialLiteralKind::VerticalTab, '\x0B'),
2070            'A' => Ok(Primitive::Assertion(ast::Assertion {
2071                span,
2072                kind: ast::AssertionKind::StartText,
2073            })),
2074            'z' => Ok(Primitive::Assertion(ast::Assertion {
2075                span,
2076                kind: ast::AssertionKind::EndText,
2077            })),
2078            'b' => {
2079                let mut wb = ast::Assertion {
2080                    span,
2081                    kind: ast::AssertionKind::WordBoundary,
2082                };
2083                // After a \b, we "try" to parse things like \b{start} for
2084                // special word boundary assertions.
2085                if !self.is_eof() && self.char() == '{' {
2086                    if let Some(kind) = self.maybe_parse_special_word_boundary(start)? {
2087                        wb.kind = kind;
2088                        wb.span.end = self.pos();
2089                    }
2090                }
2091                Ok(Primitive::Assertion(wb))
2092            }
2093            'B' => Ok(Primitive::Assertion(ast::Assertion {
2094                span,
2095                kind: ast::AssertionKind::NotWordBoundary,
2096            })),
2097            '<' => Ok(Primitive::Assertion(ast::Assertion {
2098                span,
2099                kind: ast::AssertionKind::WordBoundaryStartAngle,
2100            })),
2101            '>' => Ok(Primitive::Assertion(ast::Assertion {
2102                span,
2103                kind: ast::AssertionKind::WordBoundaryEndAngle,
2104            })),
2105            _ => Err(self.error(span, ast::ErrorKind::EscapeUnrecognized)),
2106        }
2107    }
2108
2109    fn maybe_parse_special_word_boundary(
2110        &self,
2111        wb_start: Position,
2112    ) -> Result<Option<ast::AssertionKind>> {
2113        assert_eq!(self.char(), '{');
2114
2115        let is_valid_char = |c| match c {
2116            'A'..='Z' | 'a'..='z' | '-' => true,
2117            _ => false,
2118        };
2119        let start = self.pos();
2120        if !self.bump_and_bump_space() {
2121            return Err(self.error(
2122                Span::new(wb_start, self.pos()),
2123                ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof,
2124            ));
2125        }
2126        let start_contents = self.pos();
2127        // This is one of the critical bits: if the first non-whitespace
2128        // character isn't in [-A-Za-z] (i.e., this can't be a special word
2129        // boundary), then we bail and let the counted repetition parser deal
2130        // with this.
2131        if !is_valid_char(self.char()) {
2132            self.parser().pos.set(start);
2133            return Ok(None);
2134        }
2135
2136        // Now collect up our chars until we see a '}'.
2137        let mut scratch = self.parser().scratch.borrow_mut();
2138        scratch.clear();
2139        while !self.is_eof() && is_valid_char(self.char()) {
2140            scratch.push(self.char());
2141            self.bump_and_bump_space();
2142        }
2143        if self.is_eof() || self.char() != '}' {
2144            return Err(self.error(
2145                Span::new(start, self.pos()),
2146                ast::ErrorKind::SpecialWordBoundaryUnclosed,
2147            ));
2148        }
2149        let end = self.pos();
2150        self.bump();
2151        let kind = match scratch.as_str() {
2152            "start" => ast::AssertionKind::WordBoundaryStart,
2153            "end" => ast::AssertionKind::WordBoundaryEnd,
2154            "start-half" => ast::AssertionKind::WordBoundaryStartHalf,
2155            "end-half" => ast::AssertionKind::WordBoundaryEndHalf,
2156            _ => {
2157                return Err(self.error(
2158                    Span::new(start_contents, end),
2159                    ast::ErrorKind::SpecialWordBoundaryUnrecognized,
2160                ))
2161            }
2162        };
2163        Ok(Some(kind))
2164    }
2165
2166    #[inline(never)]
2167    fn parse_octal(&self) -> Literal {
2168        assert!(self.parser().octal);
2169        assert!('0' <= self.char() && self.char() <= '7');
2170        let start = self.pos();
2171        // Parse up to two more digits.
2172        while self.bump()
2173            && '0' <= self.char()
2174            && self.char() <= '7'
2175            && self.pos().offset - start.offset <= 2
2176        {}
2177        let end = self.pos();
2178        let octal = &self.pattern()[start.offset..end.offset];
2179        // Parsing the octal should never fail since the above guarantees a
2180        // valid number.
2181        let codepoint = u32::from_str_radix(octal, 8).expect("valid octal number");
2182        // The max value for 3 digit octal is 0777 = 511 and [0, 511] has no
2183        // invalid Unicode scalar values.
2184        let c = char::from_u32(codepoint).expect("Unicode scalar value");
2185        Literal {
2186            span: Span::new(start, end),
2187            kind: LiteralKind::Octal,
2188            c,
2189        }
2190    }
2191
2192    #[inline(never)]
2193    fn parse_hex(&self) -> Result<Literal> {
2194        assert!(self.char() == 'x' || self.char() == 'u' || self.char() == 'U');
2195
2196        let hex_kind = match self.char() {
2197            'x' => HexLiteralKind::X,
2198            'u' => HexLiteralKind::UnicodeShort,
2199            _ => HexLiteralKind::UnicodeLong,
2200        };
2201        if !self.bump_and_bump_space() {
2202            return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2203        }
2204        if self.char() == '{' {
2205            self.parse_hex_brace(hex_kind)
2206        } else {
2207            self.parse_hex_digits(hex_kind)
2208        }
2209    }
2210
2211    #[inline(never)]
2212    fn parse_hex_digits(&self, kind: HexLiteralKind) -> Result<Literal> {
2213        let mut scratch = self.parser().scratch.borrow_mut();
2214        scratch.clear();
2215
2216        let start = self.pos();
2217        for i in 0..kind.digits() {
2218            if i > 0 && !self.bump_and_bump_space() {
2219                return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2220            }
2221            if !is_hex(self.char()) {
2222                return Err(self.error(self.span_char(), ast::ErrorKind::EscapeHexInvalidDigit));
2223            }
2224            scratch.push(self.char());
2225        }
2226        // The final bump just moves the parser past the literal, which may
2227        // be EOF.
2228        self.bump_and_bump_space();
2229        let end = self.pos();
2230        let hex = scratch.as_str();
2231        match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
2232            None => Err(self.error(Span::new(start, end), ast::ErrorKind::EscapeHexInvalid)),
2233            Some(c) => Ok(Literal {
2234                span: Span::new(start, end),
2235                kind: LiteralKind::HexFixed(kind),
2236                c,
2237            }),
2238        }
2239    }
2240
2241    #[inline(never)]
2242    fn parse_hex_brace(&self, kind: HexLiteralKind) -> Result<Literal> {
2243        let mut scratch = self.parser().scratch.borrow_mut();
2244        scratch.clear();
2245
2246        let brace_pos = self.pos();
2247        let start = self.span_char().end;
2248        while self.bump_and_bump_space() && self.char() != '}' {
2249            if !is_hex(self.char()) {
2250                return Err(self.error(self.span_char(), ast::ErrorKind::EscapeHexInvalidDigit));
2251            }
2252            scratch.push(self.char());
2253        }
2254        if self.is_eof() {
2255            return Err(self.error(
2256                Span::new(brace_pos, self.pos()),
2257                ast::ErrorKind::EscapeUnexpectedEof,
2258            ));
2259        }
2260        let end = self.pos();
2261        let hex = scratch.as_str();
2262        assert_eq!(self.char(), '}');
2263        self.bump_and_bump_space();
2264
2265        if hex.is_empty() {
2266            return Err(self.error(
2267                Span::new(brace_pos, self.pos()),
2268                ast::ErrorKind::EscapeHexEmpty,
2269            ));
2270        }
2271        match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
2272            None => Err(self.error(Span::new(start, end), ast::ErrorKind::EscapeHexInvalid)),
2273            Some(c) => Ok(Literal {
2274                span: Span::new(start, self.pos()),
2275                kind: LiteralKind::HexBrace(kind),
2276                c,
2277            }),
2278        }
2279    }
2280
2281    fn parse_decimal(&self) -> Result<u32> {
2282        let mut scratch = self.parser().scratch.borrow_mut();
2283        scratch.clear();
2284
2285        while !self.is_eof() && self.char().is_whitespace() {
2286            self.bump();
2287        }
2288        let start = self.pos();
2289        while !self.is_eof() && '0' <= self.char() && self.char() <= '9' {
2290            scratch.push(self.char());
2291            self.bump_and_bump_space();
2292        }
2293        let span = Span::new(start, self.pos());
2294        while !self.is_eof() && self.char().is_whitespace() {
2295            self.bump_and_bump_space();
2296        }
2297        let digits = scratch.as_str();
2298        if digits.is_empty() {
2299            return Err(self.error(span, ast::ErrorKind::DecimalEmpty));
2300        }
2301        match u32::from_str_radix(digits, 10).ok() {
2302            Some(n) => Ok(n),
2303            None => Err(self.error(span, ast::ErrorKind::DecimalInvalid)),
2304        }
2305    }
2306
2307    #[inline(never)]
2308    fn parse_set_class(&self) -> Result<ClassBracketed> {
2309        assert_eq!(self.char(), '[');
2310
2311        let mut union = ClassSetUnion {
2312            span: self.span(),
2313            items: vec![],
2314        };
2315        loop {
2316            self.bump_space();
2317            if self.is_eof() {
2318                return Err(self.unclosed_class_error());
2319            }
2320            match self.char() {
2321                '[' => {
2322                    // If we've already parsed the opening bracket, then
2323                    // attempt to treat this as the beginning of an ASCII
2324                    // class. If ASCII class parsing fails, then the parser
2325                    // backs up to `[`.
2326                    if !self.parser().stack_class.borrow().is_empty() {
2327                        if let Some(cls) = self.maybe_parse_ascii_class() {
2328                            union.push(ClassSetItem::Ascii(cls));
2329                            continue;
2330                        }
2331                    }
2332                    union = self.push_class_open(union)?;
2333                }
2334                ']' => match self.pop_class(union)? {
2335                    Either::Left(nested_union) => {
2336                        union = nested_union;
2337                    }
2338                    Either::Right(class) => return Ok(class),
2339                },
2340                '&' if self.peek() == Some('&') => {
2341                    assert!(self.bump_if("&&"));
2342                    union = self.push_class_op(ClassSetBinaryOpKind::Intersection, union);
2343                }
2344                '-' if self.peek() == Some('-') => {
2345                    assert!(self.bump_if("--"));
2346                    union = self.push_class_op(ClassSetBinaryOpKind::Difference, union);
2347                }
2348                '~' if self.peek() == Some('~') => {
2349                    assert!(self.bump_if("~~"));
2350                    union = self.push_class_op(ClassSetBinaryOpKind::SymmetricDifference, union);
2351                }
2352                _ => {
2353                    union.push(self.parse_set_class_range()?);
2354                }
2355            }
2356        }
2357    }
2358
2359    #[inline(never)]
2360    fn parse_set_class_range(&self) -> Result<ClassSetItem> {
2361        let prim1 = self.parse_set_class_item()?;
2362        self.bump_space();
2363        if self.is_eof() {
2364            return Err(self.unclosed_class_error());
2365        }
2366        if self.char() != '-' || self.peek_space() == Some(']') || self.peek_space() == Some('-') {
2367            return prim1.into_class_set_item(self);
2368        }
2369        if !self.bump_and_bump_space() {
2370            return Err(self.unclosed_class_error());
2371        }
2372        let prim2 = self.parse_set_class_item()?;
2373        let range = ClassSetRange {
2374            span: Span::new(prim1.span().start, prim2.span().end),
2375            start: prim1.into_class_literal(self)?,
2376            end: prim2.into_class_literal(self)?,
2377        };
2378        if !range.is_valid() {
2379            return Err(self.error(range.span, ast::ErrorKind::ClassRangeInvalid));
2380        }
2381        Ok(ClassSetItem::Range(range))
2382    }
2383
2384    #[inline(never)]
2385    fn parse_set_class_item(&self) -> Result<Primitive> {
2386        if self.char() == '\\' {
2387            self.parse_escape()
2388        } else {
2389            let x = Primitive::Literal(Literal {
2390                span: self.span_char(),
2391                kind: LiteralKind::Verbatim,
2392                c: self.char(),
2393            });
2394            self.bump();
2395            Ok(x)
2396        }
2397    }
2398
2399    #[inline(never)]
2400    fn parse_set_class_open(&self) -> Result<(ClassBracketed, ClassSetUnion)> {
2401        assert_eq!(self.char(), '[');
2402        let start = self.pos();
2403        if !self.bump_and_bump_space() {
2404            return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2405        }
2406
2407        let negated = if self.char() != '^' {
2408            false
2409        } else {
2410            if !self.bump_and_bump_space() {
2411                return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2412            }
2413            true
2414        };
2415        // Accept any number of `-` as literal `-`.
2416        let mut union = ClassSetUnion {
2417            span: self.span(),
2418            items: vec![],
2419        };
2420        while self.char() == '-' {
2421            union.push(ClassSetItem::Literal(Literal {
2422                span: self.span_char(),
2423                kind: LiteralKind::Verbatim,
2424                c: '-',
2425            }));
2426            if !self.bump_and_bump_space() {
2427                return Err(self.error(Span::new(start, start), ast::ErrorKind::ClassUnclosed));
2428            }
2429        }
2430        // If `]` is the *first* char in a set, then interpret it as a literal
2431        // `]`. That is, an empty class is impossible to write.
2432        if union.items.is_empty() && self.char() == ']' {
2433            union.push(ClassSetItem::Literal(Literal {
2434                span: self.span_char(),
2435                kind: LiteralKind::Verbatim,
2436                c: ']',
2437            }));
2438            if !self.bump_and_bump_space() {
2439                return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2440            }
2441        }
2442        let set = ClassBracketed {
2443            span: Span::new(start, self.pos()),
2444            negated,
2445            kind: ClassSet::union(ClassSetUnion {
2446                span: Span::new(union.span.start, union.span.start),
2447                items: vec![],
2448            }),
2449        };
2450        Ok((set, union))
2451    }
2452
2453    #[inline(never)]
2454    fn maybe_parse_ascii_class(&self) -> Option<ClassAscii> {
2455        assert_eq!(self.char(), '[');
2456        // If parsing fails, then we back up the parser to this starting point.
2457        let start = self.pos();
2458        let mut negated = false;
2459        if !self.bump() || self.char() != ':' {
2460            self.parser().pos.set(start);
2461            return None;
2462        }
2463        if !self.bump() {
2464            self.parser().pos.set(start);
2465            return None;
2466        }
2467        if self.char() == '^' {
2468            negated = true;
2469            if !self.bump() {
2470                self.parser().pos.set(start);
2471                return None;
2472            }
2473        }
2474        let name_start = self.offset();
2475        while self.char() != ':' && self.bump() {}
2476        if self.is_eof() {
2477            self.parser().pos.set(start);
2478            return None;
2479        }
2480        let name = &self.pattern()[name_start..self.offset()];
2481        if !self.bump_if(":]") {
2482            self.parser().pos.set(start);
2483            return None;
2484        }
2485        let kind = match regex_syntax::ast::ClassAsciiKind::from_name(name) {
2486            Some(kind) => kind,
2487            None => {
2488                self.parser().pos.set(start);
2489                return None;
2490            }
2491        };
2492        Some(ClassAscii {
2493            span: Span::new(start, self.pos()),
2494            kind,
2495            negated,
2496        })
2497    }
2498
2499    #[inline(never)]
2500    fn parse_unicode_class(&self) -> Result<ClassUnicode> {
2501        assert!(self.char() == 'p' || self.char() == 'P');
2502
2503        let mut scratch = self.parser().scratch.borrow_mut();
2504        scratch.clear();
2505
2506        let negated = self.char() == 'P';
2507        if !self.bump_and_bump_space() {
2508            return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2509        }
2510        let (start, kind) = if self.char() == '{' {
2511            let start = self.span_char().end;
2512            while self.bump_and_bump_space() && self.char() != '}' {
2513                scratch.push(self.char());
2514            }
2515            if self.is_eof() {
2516                return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2517            }
2518            assert_eq!(self.char(), '}');
2519            self.bump();
2520
2521            let name = scratch.as_str();
2522            if let Some(i) = name.find("!=") {
2523                (
2524                    start,
2525                    ClassUnicodeKind::NamedValue {
2526                        op: ClassUnicodeOpKind::NotEqual,
2527                        name: name[..i].to_string(),
2528                        value: name[i + 2..].to_string(),
2529                    },
2530                )
2531            } else if let Some(i) = name.find(':') {
2532                (
2533                    start,
2534                    ClassUnicodeKind::NamedValue {
2535                        op: ClassUnicodeOpKind::Colon,
2536                        name: name[..i].to_string(),
2537                        value: name[i + 1..].to_string(),
2538                    },
2539                )
2540            } else if let Some(i) = name.find('=') {
2541                (
2542                    start,
2543                    ClassUnicodeKind::NamedValue {
2544                        op: ClassUnicodeOpKind::Equal,
2545                        name: name[..i].to_string(),
2546                        value: name[i + 1..].to_string(),
2547                    },
2548                )
2549            } else {
2550                (start, ClassUnicodeKind::Named(name.to_string()))
2551            }
2552        } else {
2553            let start = self.pos();
2554            let c = self.char();
2555            if c == '\\' {
2556                return Err(self.error(self.span_char(), ast::ErrorKind::UnicodeClassInvalid));
2557            }
2558            self.bump_and_bump_space();
2559            let kind = ClassUnicodeKind::OneLetter(c);
2560            (start, kind)
2561        };
2562        Ok(ClassUnicode {
2563            span: Span::new(start, self.pos()),
2564            negated,
2565            kind,
2566        })
2567    }
2568
2569    #[inline(never)]
2570    fn parse_perl_class(&self) -> ClassPerl {
2571        let c = self.char();
2572        let span = self.span_char();
2573        self.bump();
2574        let (negated, kind) = match c {
2575            'd' => (false, regex_syntax::ast::ClassPerlKind::Digit),
2576            'D' => (true, regex_syntax::ast::ClassPerlKind::Digit),
2577            's' => (false, regex_syntax::ast::ClassPerlKind::Space),
2578            'S' => (true, regex_syntax::ast::ClassPerlKind::Space),
2579            'w' => (false, regex_syntax::ast::ClassPerlKind::Word),
2580            'W' => (true, regex_syntax::ast::ClassPerlKind::Word),
2581            c => panic!("expected valid Perl class but got '{}'", c),
2582        };
2583        ClassPerl {
2584            span,
2585            kind,
2586            negated,
2587        }
2588    }
2589}
2590
2591pub fn parse_ast<'s>(
2592    tb: &mut TB<'s>,
2593    pattern: &'s str,
2594) -> std::result::Result<NodeId, ResharpError> {
2595    let mut p: ResharpParser<'s> = ResharpParser::new(pattern);
2596    p.parse(tb)
2597}
2598
2599pub fn parse_ast_with<'s>(
2600    tb: &mut TB<'s>,
2601    pattern: &'s str,
2602    flags: &PatternFlags,
2603) -> std::result::Result<NodeId, ResharpError> {
2604    let mut p: ResharpParser<'s> = ResharpParser::with_flags(pattern, flags);
2605    p.parse(tb)
2606}
resharp_parser/lib.rs

resharp_parser/
lib.rs