resharp_parser/
lib.rs

1//! Parser for resharp regex patterns.
2//!
3//! Converts regex pattern strings into the node representation used by resharp-algebra.
4
5#![warn(dead_code)]
6mod ast;
7use std::cell::{Cell, RefCell};
8
9use ast::{Ast, Concat, ErrorKind, GroupKind, LookaroundKind};
10use regex_syntax::{
11    ast::{
12        ClassAscii, ClassBracketed, ClassPerl, ClassSet, ClassSetBinaryOpKind, ClassSetItem,
13        ClassSetRange, ClassSetUnion, ClassUnicode, ClassUnicodeKind, ClassUnicodeOpKind,
14        HexLiteralKind, Literal, LiteralKind, Position, Span, SpecialLiteralKind,
15    },
16    hir::{
17        self,
18        translate::{Translator, TranslatorBuilder},
19    },
20    utf8::Utf8Sequences,
21};
22use resharp_algebra::NodeId;
23
24type TB<'s> = resharp_algebra::RegexBuilder;
25
26/// global pattern-level flags, set from `EngineOptions`.
27pub struct PatternFlags {
28    /// `\w`/`\d`/`\s` match full Unicode (true) or ASCII only (false).
29    pub unicode: bool,
30    /// global case-insensitive matching.
31    pub case_insensitive: bool,
32    /// `.` matches `\n` (behaves like `_`).
33    pub dot_matches_new_line: bool,
34    /// allow whitespace and `#` comments in the pattern.
35    pub ignore_whitespace: bool,
36}
37
38impl Default for PatternFlags {
39    fn default() -> Self {
40        Self {
41            unicode: true,
42            case_insensitive: false,
43            dot_matches_new_line: false,
44            ignore_whitespace: false,
45        }
46    }
47}
48
49#[derive(Clone, Copy, PartialEq)]
50enum WordCharKind {
51    Word,
52    NonWord,
53    MaybeWord,
54    MaybeNonWord,
55    Unknown,
56    Edge,
57}
58
59fn is_word_byte(b: u8) -> bool {
60    b.is_ascii_alphanumeric() || b == b'_'
61}
62
63
64#[derive(Clone, Debug, Eq, PartialEq)]
65enum Primitive {
66    Literal(Literal),
67    Assertion(ast::Assertion),
68    Dot(Span),
69    Top(Span),
70    Perl(ClassPerl),
71    Unicode(ClassUnicode),
72}
73
74impl Primitive {
75    fn span(&self) -> &Span {
76        match *self {
77            Primitive::Literal(ref x) => &x.span,
78            Primitive::Assertion(ref x) => &x.span,
79            Primitive::Dot(ref span) => span,
80            Primitive::Top(ref span) => span,
81            Primitive::Perl(ref x) => &x.span,
82            Primitive::Unicode(ref x) => &x.span,
83        }
84    }
85
86    fn into_ast(self) -> Ast {
87        match self {
88            Primitive::Literal(lit) => Ast::literal(lit),
89            Primitive::Assertion(assert) => Ast::assertion(assert),
90            Primitive::Dot(span) => Ast::dot(span),
91            Primitive::Top(span) => Ast::top(span),
92            Primitive::Perl(cls) => Ast::class_perl(cls),
93            Primitive::Unicode(cls) => Ast::class_unicode(cls),
94        }
95    }
96
97    fn into_class_set_item(self, p: &ResharpParser) -> Result<regex_syntax::ast::ClassSetItem> {
98        use self::Primitive::*;
99        use regex_syntax::ast::ClassSetItem;
100
101        match self {
102            Literal(lit) => Ok(ClassSetItem::Literal(lit)),
103            Perl(cls) => Ok(ClassSetItem::Perl(cls)),
104            Unicode(cls) => Ok(ClassSetItem::Unicode(cls)),
105            x => Err(p.error(*x.span(), ast::ErrorKind::ClassEscapeInvalid)),
106        }
107    }
108
109    fn into_class_literal(self, p: &ResharpParser) -> Result<Literal> {
110        use self::Primitive::*;
111
112        match self {
113            Literal(lit) => Ok(lit),
114            x => Err(p.error(*x.span(), ast::ErrorKind::ClassRangeLiteral)),
115        }
116    }
117}
118
119#[derive(Clone, Debug, Eq, PartialEq)]
120pub enum Either<Left, Right> {
121    Left(Left),
122    Right(Right),
123}
124
125#[derive(Clone, Debug, Eq, PartialEq)]
126pub struct ResharpError {
127    /// The kind of error.
128    pub kind: ErrorKind,
129    /// The original pattern that the parser generated the error from. Every
130    /// span in an error is a valid range into this string.
131    pattern: String,
132    /// The span of this error.
133    pub span: Span,
134}
135
136impl std::fmt::Display for ResharpError {
137    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
138        write!(f, "{:?}: {:?}", self.kind, self.span)
139    }
140}
141impl std::error::Error for ResharpError {}
142
143type Result<T> = core::result::Result<T, ResharpError>;
144
145#[derive(Clone, Debug)]
146enum GroupState {
147    /// This state is pushed whenever an opening group is found.
148    Group {
149        /// The concatenation immediately preceding the opening group.
150        concat: Concat,
151        /// The group that has been opened. Its sub-AST is always empty.
152        group: ast::Group,
153        /// Whether this group has the `x` flag enabled or not.
154        ignore_whitespace: bool,
155    },
156    /// This state is pushed whenever a new alternation branch is found. If
157    /// an alternation branch is found and this state is at the top of the
158    /// stack, then this state should be modified to include the new
159    /// alternation.
160    Alternation(ast::Alternation),
161    Intersection(ast::Intersection),
162}
163
164#[derive(Clone, Debug)]
165enum ClassState {
166    /// This state is pushed whenever an opening bracket is found.
167    Open {
168        /// The union of class items immediately preceding this class.
169        union: regex_syntax::ast::ClassSetUnion,
170        /// The class that has been opened. Typically this just corresponds
171        /// to the `[`, but it can also include `[^` since `^` indicates
172        /// negation of the class.
173        set: regex_syntax::ast::ClassBracketed,
174    },
175    /// This state is pushed when a operator is seen. When popped, the stored
176    /// set becomes the left hand side of the operator.
177    Op {
178        /// The type of the operation, i.e., &&, -- or ~~.
179        kind: regex_syntax::ast::ClassSetBinaryOpKind,
180        /// The left-hand side of the operator.
181        lhs: regex_syntax::ast::ClassSet,
182    },
183}
184
185/// RE# syntax parser based on the regex-syntax crate.
186pub struct ResharpParser<'s> {
187    perl_classes: Vec<(bool, regex_syntax::ast::ClassPerlKind, NodeId)>,
188    unicode_classes: resharp_algebra::UnicodeClassCache,
189    pub translator: regex_syntax::hir::translate::Translator,
190    pub pattern: &'s str,
191    pos: Cell<Position>,
192    capture_index: Cell<u32>,
193    octal: bool,
194    empty_min_range: bool,
195    ignore_whitespace: Cell<bool>,
196    dot_all: Cell<bool>,
197    global_unicode: bool,
198    global_case_insensitive: bool,
199    comments: RefCell<Vec<ast::Comment>>,
200    stack_group: RefCell<Vec<GroupState>>,
201    stack_class: RefCell<Vec<ClassState>>,
202    capture_names: RefCell<Vec<ast::CaptureName>>,
203    scratch: RefCell<String>,
204}
205
206fn specialize_err<T>(result: Result<T>, from: ast::ErrorKind, to: ast::ErrorKind) -> Result<T> {
207    if let Err(e) = result {
208        if e.kind == from {
209            Err(ResharpError {
210                kind: to,
211                pattern: e.pattern,
212                span: e.span,
213            })
214        } else {
215            Err(e)
216        }
217    } else {
218        result
219    }
220}
221
222fn is_capture_char(c: char, first: bool) -> bool {
223    if first {
224        c == '_' || c.is_alphabetic()
225    } else {
226        c == '_' || c == '.' || c == '[' || c == ']' || c.is_alphanumeric()
227    }
228}
229
230pub fn is_meta_character(c: char) -> bool {
231    match c {
232        '\\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' | '[' | ']' | '{' | '}' | '^' | '$'
233        | '#' | '&' | '-' | '~' | '_' => true,
234        _ => false,
235    }
236}
237
238/// escapes all resharp meta characters in `text`.
239pub fn escape(text: &str) -> String {
240    let mut buf = String::new();
241    escape_into(text, &mut buf);
242    buf
243}
244
245/// escapes all resharp meta characters in `text` and appends to `buf`.
246pub fn escape_into(text: &str, buf: &mut String) {
247    buf.reserve(text.len());
248    for c in text.chars() {
249        if is_meta_character(c) {
250            buf.push('\\');
251        }
252        buf.push(c);
253    }
254}
255
256pub fn is_escapeable_character(c: char) -> bool {
257    // Certainly escapeable if it's a meta character.
258    if is_meta_character(c) {
259        return true;
260    }
261    // Any character that isn't ASCII is definitely not escapeable. There's
262    // no real need to allow things like \☃ right?
263    if !c.is_ascii() {
264        return false;
265    }
266    // Otherwise, we basically say that everything is escapeable unless it's a
267    // letter or digit. Things like \3 are either octal (when enabled) or an
268    // error, and we should keep it that way. Otherwise, letters are reserved
269    // for adding new syntax in a backwards compatible way.
270    match c {
271        '0'..='9' | 'A'..='Z' | 'a'..='z' => false,
272        // While not currently supported, we keep these as not escapeable to
273        // give us some flexibility with respect to supporting the \< and
274        // \> word boundary assertions in the future. By rejecting them as
275        // escapeable, \< and \> will result in a parse error. Thus, we can
276        // turn them into something else in the future without it being a
277        // backwards incompatible change.
278        //
279        // OK, now we support \< and \>, and we need to retain them as *not*
280        // escapeable here since the escape sequence is significant.
281        '<' | '>' => false,
282        _ => true,
283    }
284}
285
286fn is_hex(c: char) -> bool {
287    ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F')
288}
289
290impl<'s> ResharpParser<'s> {
291    fn default_translator_builder(&self) -> TranslatorBuilder {
292        let mut trb = TranslatorBuilder::new();
293        trb.unicode(self.global_unicode);
294        trb.utf8(false);
295        trb.case_insensitive(self.global_case_insensitive);
296        trb
297    }
298
299    pub fn new(pattern: &'s str) -> Self {
300        Self::with_flags(pattern, &PatternFlags::default())
301    }
302
303    pub fn with_flags(pattern: &'s str, flags: &PatternFlags) -> Self {
304        let mut trb = TranslatorBuilder::new();
305        trb.unicode(flags.unicode);
306        trb.utf8(false);
307        trb.case_insensitive(flags.case_insensitive);
308        Self {
309            translator: trb.build(),
310            pattern,
311            perl_classes: vec![],
312            unicode_classes: resharp_algebra::UnicodeClassCache::default(),
313            pos: Cell::new(Position::new(0, 0, 0)),
314            capture_index: Cell::new(0),
315            octal: false,
316            empty_min_range: false,
317            ignore_whitespace: Cell::new(flags.ignore_whitespace),
318            dot_all: Cell::new(flags.dot_matches_new_line),
319            global_unicode: flags.unicode,
320            global_case_insensitive: flags.case_insensitive,
321            comments: RefCell::new(vec![]),
322            stack_group: RefCell::new(vec![]),
323            stack_class: RefCell::new(vec![]),
324            capture_names: RefCell::new(vec![]),
325            scratch: RefCell::new(String::new()),
326        }
327    }
328
329    /// Return a reference to the parser state.
330    fn parser(&'_ self) -> &'_ ResharpParser<'_> {
331        self
332    }
333
334    /// Return a reference to the pattern being parsed.
335    fn pattern(&self) -> &str {
336        self.pattern
337    }
338
339    /// Create a new error with the given span and error type.
340    fn error(&self, span: Span, kind: ast::ErrorKind) -> ResharpError {
341        ResharpError {
342            kind,
343            pattern: self.pattern().to_string(),
344            span,
345        }
346    }
347
348    fn unsupported_error(&self, _: regex_syntax::hir::Error) -> ResharpError {
349        let emptyspan = Span::splat(self.pos());
350        let inner = self.error(emptyspan, ast::ErrorKind::UnsupportedResharpRegex);
351        inner
352    }
353
354    /// Return the current offset of the parser.
355    ///
356    /// The offset starts at `0` from the beginning of the regular expression
357    /// pattern string.
358    fn offset(&self) -> usize {
359        self.parser().pos.get().offset
360    }
361
362    /// Return the current line number of the parser.
363    ///
364    /// The line number starts at `1`.
365    fn line(&self) -> usize {
366        self.parser().pos.get().line
367    }
368
369    /// Return the current column of the parser.
370    ///
371    /// The column number starts at `1` and is reset whenever a `\n` is seen.
372    fn column(&self) -> usize {
373        self.parser().pos.get().column
374    }
375
376    /// Return the next capturing index. Each subsequent call increments the
377    /// internal index.
378    ///
379    /// The span given should correspond to the location of the opening
380    /// parenthesis.
381    ///
382    /// If the capture limit is exceeded, then an error is returned.
383    fn next_capture_index(&self, span: Span) -> Result<u32> {
384        let current = self.parser().capture_index.get();
385        let i = current
386            .checked_add(1)
387            .ok_or_else(|| self.error(span, ast::ErrorKind::CaptureLimitExceeded))?;
388        self.parser().capture_index.set(i);
389        Ok(i)
390    }
391
392    fn add_capture_name(&self, cap: &ast::CaptureName) -> Result<()> {
393        let mut names = self.parser().capture_names.borrow_mut();
394        match names.binary_search_by_key(&cap.name.as_str(), |c| c.name.as_str()) {
395            Err(i) => {
396                names.insert(i, cap.clone());
397                Ok(())
398            }
399            Ok(i) => Err(self.error(
400                cap.span,
401                ast::ErrorKind::GroupNameDuplicate {
402                    original: names[i].span,
403                },
404            )),
405        }
406    }
407
408    fn ignore_whitespace(&self) -> bool {
409        self.parser().ignore_whitespace.get()
410    }
411
412    fn char(&self) -> char {
413        self.char_at(self.offset())
414    }
415
416    fn char_at(&self, i: usize) -> char {
417        self.pattern()[i..]
418            .chars()
419            .next()
420            .unwrap_or_else(|| panic!("expected char at offset {}", i))
421    }
422
423    fn bump(&self) -> bool {
424        if self.is_eof() {
425            return false;
426        }
427        let Position {
428            mut offset,
429            mut line,
430            mut column,
431        } = self.pos();
432        if self.char() == '\n' {
433            line = line.checked_add(1).unwrap();
434            column = 1;
435        } else {
436            column = column.checked_add(1).unwrap();
437        }
438        offset += self.char().len_utf8();
439        self.parser().pos.set(Position {
440            offset,
441            line,
442            column,
443        });
444        self.pattern()[self.offset()..].chars().next().is_some()
445    }
446
447    fn bump_if(&self, prefix: &str) -> bool {
448        if self.pattern()[self.offset()..].starts_with(prefix) {
449            for _ in 0..prefix.chars().count() {
450                self.bump();
451            }
452            true
453        } else {
454            false
455        }
456    }
457
458    fn is_lookaround_prefix(&self) -> Option<(bool, bool)> {
459        if self.bump_if("?=") {
460            return Some((true, true));
461        }
462        if self.bump_if("?!") {
463            return Some((true, false));
464        }
465        if self.bump_if("?<=") {
466            return Some((false, true));
467        }
468        if self.bump_if("?<!") {
469            return Some((false, false));
470        }
471        return None;
472        // self.bump_if("?=") || self.bump_if("?!") || self.bump_if("?<=") || self.bump_if("?<!")
473    }
474
475    fn bump_and_bump_space(&self) -> bool {
476        if !self.bump() {
477            return false;
478        }
479        self.bump_space();
480        !self.is_eof()
481    }
482
483    fn bump_space(&self) {
484        if !self.ignore_whitespace() {
485            return;
486        }
487        while !self.is_eof() {
488            if self.char().is_whitespace() {
489                self.bump();
490            } else if self.char() == '#' {
491                let start = self.pos();
492                let mut comment_text = String::new();
493                self.bump();
494                while !self.is_eof() {
495                    let c = self.char();
496                    self.bump();
497                    if c == '\n' {
498                        break;
499                    }
500                    comment_text.push(c);
501                }
502                let comment = ast::Comment {
503                    span: Span::new(start, self.pos()),
504                    comment: comment_text,
505                };
506                self.parser().comments.borrow_mut().push(comment);
507            } else {
508                break;
509            }
510        }
511    }
512
513    /// Peek at the next character in the input without advancing the parser.
514    ///
515    /// If the input has been exhausted, then this returns `None`.
516    fn peek(&self) -> Option<char> {
517        if self.is_eof() {
518            return None;
519        }
520        self.pattern()[self.offset() + self.char().len_utf8()..]
521            .chars()
522            .next()
523    }
524
525    /// Like peek, but will ignore spaces when the parser is in whitespace
526    /// insensitive mode.
527    fn peek_space(&self) -> Option<char> {
528        if !self.ignore_whitespace() {
529            return self.peek();
530        }
531        if self.is_eof() {
532            return None;
533        }
534        let mut start = self.offset() + self.char().len_utf8();
535        let mut in_comment = false;
536        for (i, c) in self.pattern()[start..].char_indices() {
537            if c.is_whitespace() {
538                continue;
539            } else if !in_comment && c == '#' {
540                in_comment = true;
541            } else if in_comment && c == '\n' {
542                in_comment = false;
543            } else {
544                start += i;
545                break;
546            }
547        }
548        self.pattern()[start..].chars().next()
549    }
550
551    /// Returns true if the next call to `bump` would return false.
552    fn is_eof(&self) -> bool {
553        self.offset() == self.pattern().len()
554    }
555
556    /// Return the current position of the parser, which includes the offset,
557    /// line and column.
558    fn pos(&self) -> Position {
559        self.parser().pos.get()
560    }
561
562    /// Create a span at the current position of the parser. Both the start
563    /// and end of the span are set.
564    fn span(&self) -> Span {
565        Span::splat(self.pos())
566    }
567
568    /// Create a span that covers the current character.
569    fn span_char(&self) -> Span {
570        let mut next = Position {
571            offset: self.offset().checked_add(self.char().len_utf8()).unwrap(),
572            line: self.line(),
573            column: self.column().checked_add(1).unwrap(),
574        };
575        if self.char() == '\n' {
576            next.line += 1;
577            next.column = 1;
578        }
579        Span::new(self.pos(), next)
580    }
581
582    /// Parse and push a single alternation on to the parser's internal stack.
583    /// If the top of the stack already has an alternation, then add to that
584    /// instead of pushing a new one.
585    ///
586    /// The concatenation given corresponds to a single alternation branch.
587    /// The concatenation returned starts the next branch and is empty.
588    ///
589    /// This assumes the parser is currently positioned at `|` and will advance
590    /// the parser to the character following `|`.
591    #[inline(never)]
592    fn push_alternate(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
593        assert_eq!(self.char(), '|');
594        concat.span.end = self.pos();
595        self.push_or_add_alternation(concat);
596        self.bump();
597        Ok(ast::Concat {
598            span: self.span(),
599            asts: vec![],
600        })
601    }
602
603    /// Pushes or adds the given branch of an alternation to the parser's
604    /// internal stack of state.
605    fn push_or_add_alternation(&self, concat: Concat) {
606        use self::GroupState::*;
607
608        let mut stack = self.parser().stack_group.borrow_mut();
609        if let Some(&mut Alternation(ref mut alts)) = stack.last_mut() {
610            alts.asts.push(concat.into_ast());
611            return;
612        }
613        stack.push(Alternation(ast::Alternation {
614            span: Span::new(concat.span.start, self.pos()),
615            asts: vec![concat.into_ast()],
616        }));
617    }
618
619    #[inline(never)]
620    fn push_intersect(&self, mut concat: Concat) -> Result<Concat> {
621        assert_eq!(self.char(), '&');
622        concat.span.end = self.pos();
623        self.push_or_add_intersect(concat);
624        self.bump();
625        Ok(Concat {
626            span: self.span(),
627            asts: vec![],
628        })
629    }
630
631    /// Pushes or adds the given branch of an alternation to the parser's
632    /// internal stack of state.
633    fn push_or_add_intersect(&self, concat: Concat) {
634        use self::GroupState::*;
635
636        let mut stack = self.parser().stack_group.borrow_mut();
637        if let Some(&mut Intersection(ref mut alts)) = stack.last_mut() {
638            alts.asts.push(concat.into_ast());
639            return;
640        }
641        stack.push(Intersection(ast::Intersection {
642            span: Span::new(concat.span.start, self.pos()),
643            asts: vec![concat.into_ast()],
644        }));
645    }
646
647    /// Parse and push a group AST (and its parent concatenation) on to the
648    /// parser's internal stack. Return a fresh concatenation corresponding
649    /// to the group's sub-AST.
650    ///
651    /// If a set of flags was found (with no group), then the concatenation
652    /// is returned with that set of flags added.
653    ///
654    /// This assumes that the parser is currently positioned on the opening
655    /// parenthesis. It advances the parser to the character at the start
656    /// of the sub-expression (or adjoining expression).
657    ///
658    /// If there was a problem parsing the start of the group, then an error
659    /// is returned.
660    #[inline(never)]
661    fn push_group(&self, mut concat: Concat) -> Result<Concat> {
662        assert_eq!(self.char(), '(');
663        match self.parse_group()? {
664            Either::Left(set) => {
665                let ignore = set.flags.flag_state(ast::Flag::IgnoreWhitespace);
666                if let Some(v) = ignore {
667                    self.parser().ignore_whitespace.set(v);
668                }
669
670                concat.asts.push(Ast::flags(set));
671                Ok(concat)
672            }
673            Either::Right(group) => {
674                let old_ignore_whitespace = self.ignore_whitespace();
675                let new_ignore_whitespace = group
676                    .flags()
677                    .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
678                    .unwrap_or(old_ignore_whitespace);
679                self.parser()
680                    .stack_group
681                    .borrow_mut()
682                    .push(GroupState::Group {
683                        concat,
684                        group,
685                        ignore_whitespace: old_ignore_whitespace,
686                    });
687                self.parser().ignore_whitespace.set(new_ignore_whitespace);
688                Ok(Concat {
689                    span: self.span(),
690                    asts: vec![],
691                })
692            }
693        }
694    }
695
696    #[inline(never)]
697    fn push_compl_group(&self, concat: Concat) -> Result<Concat> {
698        assert_eq!(self.char(), '~');
699        self.bump();
700        if self.is_eof() || self.char() != '(' {
701            return Err(self.error(self.span(), ast::ErrorKind::ComplementGroupExpected));
702        }
703        let open_span = self.span_char();
704        self.bump();
705        let group = ast::Group {
706            span: open_span,
707            kind: ast::GroupKind::Complement,
708            ast: Box::new(Ast::empty(self.span())),
709        };
710
711        let old_ignore_whitespace = self.ignore_whitespace();
712        let new_ignore_whitespace = group
713            .flags()
714            .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
715            .unwrap_or(old_ignore_whitespace);
716        self.parser()
717            .stack_group
718            .borrow_mut()
719            .push(GroupState::Group {
720                concat,
721                group,
722                ignore_whitespace: old_ignore_whitespace,
723            });
724        self.parser().ignore_whitespace.set(new_ignore_whitespace);
725        Ok(Concat {
726            span: self.span(),
727            asts: vec![],
728        })
729    }
730
731    /// Pop a group AST from the parser's internal stack and set the group's
732    /// AST to the given concatenation. Return the concatenation containing
733    /// the group.
734    ///
735    /// This assumes that the parser is currently positioned on the closing
736    /// parenthesis and advances the parser to the character following the `)`.
737    ///
738    /// If no such group could be popped, then an unopened group error is
739    /// returned.
740    #[inline(never)]
741    fn pop_group(&self, mut group_concat: Concat) -> Result<Concat> {
742        use self::GroupState::*;
743        assert_eq!(self.char(), ')');
744        let mut stack = self.parser().stack_group.borrow_mut();
745        let topstack = stack.pop();
746
747        let (mut prior_concat, mut group, ignore_whitespace, alt) = match topstack {
748            Some(Group {
749                concat,
750                group,
751                ignore_whitespace,
752            }) => (concat, group, ignore_whitespace, None),
753            Some(Alternation(alt)) => match stack.pop() {
754                Some(Group {
755                    concat,
756                    group,
757                    ignore_whitespace,
758                }) => (
759                    concat,
760                    group,
761                    ignore_whitespace,
762                    Some(Either::Left::<ast::Alternation, ast::Intersection>(alt)),
763                ),
764                None | Some(Alternation(_)) | Some(Intersection(_)) => {
765                    return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
766                }
767            },
768            Some(Intersection(int)) => match stack.pop() {
769                Some(Group {
770                    concat,
771                    group,
772                    ignore_whitespace,
773                }) => (
774                    concat,
775                    group,
776                    ignore_whitespace,
777                    Some(Either::Right::<ast::Alternation, ast::Intersection>(int)),
778                ),
779                None | Some(Alternation(_)) | Some(Intersection(_)) => {
780                    return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
781                }
782            },
783
784            None => {
785                return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
786            }
787        };
788        self.parser().ignore_whitespace.set(ignore_whitespace);
789        group_concat.span.end = self.pos();
790        self.bump();
791        group.span.end = self.pos();
792        match alt {
793            Some(Either::Left(mut alt)) => {
794                alt.span.end = group_concat.span.end;
795                alt.asts.push(group_concat.into_ast());
796                group.ast = Box::new(alt.into_ast());
797            }
798            Some(Either::Right(mut int)) => {
799                int.span.end = group_concat.span.end;
800                int.asts.push(group_concat.into_ast());
801                group.ast = Box::new(int.into_ast());
802            }
803            None => {
804                group.ast = Box::new(group_concat.into_ast());
805            }
806        }
807
808        if group.kind == GroupKind::Complement {
809            let complement = ast::Complement {
810                span: self.span(),
811                ast: group.ast,
812            };
813            prior_concat.asts.push(Ast::complement(complement));
814        }
815        // ignore groups for now
816        else {
817            // dbg!(&group);
818            prior_concat.asts.push(Ast::group(group));
819        }
820        Ok(prior_concat)
821    }
822
823    /// Pop the last state from the parser's internal stack, if it exists, and
824    /// add the given concatenation to it. There either must be no state or a
825    /// single alternation item on the stack. Any other scenario produces an
826    /// error.
827    ///
828    /// This assumes that the parser has advanced to the end.
829    #[inline(never)]
830    fn pop_group_end(&self, mut concat: ast::Concat) -> Result<Ast> {
831        concat.span.end = self.pos();
832        // dbg!(&concat);
833        let mut stack = self.parser().stack_group.borrow_mut();
834        let ast = match stack.pop() {
835            None => Ok(concat.into_ast()),
836            Some(GroupState::Alternation(mut alt)) => {
837                alt.span.end = self.pos();
838                alt.asts.push(concat.into_ast());
839                Ok(Ast::alternation(alt))
840            }
841            Some(GroupState::Intersection(mut int)) => {
842                int.span.end = self.pos();
843                int.asts.push(concat.into_ast());
844
845                // dbg!("end intersection2");
846                Ok(Ast::intersection(int))
847            }
848            Some(GroupState::Group { group, .. }) => {
849                return Err(self.error(group.span, ast::ErrorKind::GroupUnclosed));
850            }
851        };
852        // If we try to pop again, there should be nothing.
853        match stack.pop() {
854            None => ast,
855            Some(GroupState::Alternation(_)) => {
856                // This unreachable is unfortunate. This case can't happen
857                // because the only way we can be here is if there were two
858                // `GroupState::Alternation`s adjacent in the parser's stack,
859                // which we guarantee to never happen because we never push a
860                // `GroupState::Alternation` if one is already at the top of
861                // the stack.
862                unreachable!()
863            }
864            Some(GroupState::Intersection(_)) => {
865                unreachable!()
866            }
867            Some(GroupState::Group { group, .. }) => {
868                Err(self.error(group.span, ast::ErrorKind::GroupUnclosed))
869            }
870        }
871    }
872
873    /// Parse the opening of a character class and push the current class
874    /// parsing context onto the parser's stack. This assumes that the parser
875    /// is positioned at an opening `[`. The given union should correspond to
876    /// the union of set items built up before seeing the `[`.
877    ///
878    /// If there was a problem parsing the opening of the class, then an error
879    /// is returned. Otherwise, a new union of set items for the class is
880    /// returned (which may be populated with either a `]` or a `-`).
881    #[inline(never)]
882    fn push_class_open(
883        &self,
884        parent_union: regex_syntax::ast::ClassSetUnion,
885    ) -> Result<regex_syntax::ast::ClassSetUnion> {
886        assert_eq!(self.char(), '[');
887
888        let (nested_set, nested_union) = self.parse_set_class_open()?;
889        self.parser()
890            .stack_class
891            .borrow_mut()
892            .push(ClassState::Open {
893                union: parent_union,
894                set: nested_set,
895            });
896        Ok(nested_union)
897    }
898
899    /// Parse the end of a character class set and pop the character class
900    /// parser stack. The union given corresponds to the last union built
901    /// before seeing the closing `]`. The union returned corresponds to the
902    /// parent character class set with the nested class added to it.
903    ///
904    /// This assumes that the parser is positioned at a `]` and will advance
905    /// the parser to the byte immediately following the `]`.
906    ///
907    /// If the stack is empty after popping, then this returns the final
908    /// "top-level" character class AST (where a "top-level" character class
909    /// is one that is not nested inside any other character class).
910    ///
911    /// If there is no corresponding opening bracket on the parser's stack,
912    /// then an error is returned.
913    #[inline(never)]
914    fn pop_class(
915        &self,
916        nested_union: regex_syntax::ast::ClassSetUnion,
917    ) -> Result<Either<regex_syntax::ast::ClassSetUnion, regex_syntax::ast::ClassBracketed>> {
918        assert_eq!(self.char(), ']');
919
920        let item = regex_syntax::ast::ClassSet::Item(nested_union.into_item());
921        let prevset = self.pop_class_op(item);
922        let mut stack = self.parser().stack_class.borrow_mut();
923        match stack.pop() {
924            None => {
925                // We can never observe an empty stack:
926                //
927                // 1) We are guaranteed to start with a non-empty stack since
928                //    the character class parser is only initiated when it sees
929                //    a `[`.
930                // 2) If we ever observe an empty stack while popping after
931                //    seeing a `]`, then we signal the character class parser
932                //    to terminate.
933                panic!("unexpected empty character class stack")
934            }
935            Some(ClassState::Op { .. }) => {
936                // This panic is unfortunate, but this case is impossible
937                // since we already popped the Op state if one exists above.
938                // Namely, every push to the class parser stack is guarded by
939                // whether an existing Op is already on the top of the stack.
940                // If it is, the existing Op is modified. That is, the stack
941                // can never have consecutive Op states.
942                panic!("unexpected ClassState::Op")
943            }
944            Some(ClassState::Open { mut union, mut set }) => {
945                self.bump();
946                set.span.end = self.pos();
947                set.kind = prevset;
948                if stack.is_empty() {
949                    Ok(Either::Right(set))
950                } else {
951                    union.push(regex_syntax::ast::ClassSetItem::Bracketed(Box::new(set)));
952                    Ok(Either::Left(union))
953                }
954            }
955        }
956    }
957
958    /// Return an "unclosed class" error whose span points to the most
959    /// recently opened class.
960    ///
961    /// This should only be called while parsing a character class.
962    #[inline(never)]
963    fn unclosed_class_error(&self) -> ResharpError {
964        for state in self.parser().stack_class.borrow().iter().rev() {
965            if let ClassState::Open { ref set, .. } = *state {
966                return self.error(set.span, ast::ErrorKind::ClassUnclosed);
967            }
968        }
969        // We are guaranteed to have a non-empty stack with at least
970        // one open bracket, so we should never get here.
971        panic!("no open character class found")
972    }
973
974    /// Push the current set of class items on to the class parser's stack as
975    /// the left hand side of the given operator.
976    ///
977    /// A fresh set union is returned, which should be used to build the right
978    /// hand side of this operator.
979    #[inline(never)]
980    fn push_class_op(
981        &self,
982        next_kind: regex_syntax::ast::ClassSetBinaryOpKind,
983        next_union: regex_syntax::ast::ClassSetUnion,
984    ) -> regex_syntax::ast::ClassSetUnion {
985        let item = regex_syntax::ast::ClassSet::Item(next_union.into_item());
986        let new_lhs = self.pop_class_op(item);
987        self.parser().stack_class.borrow_mut().push(ClassState::Op {
988            kind: next_kind,
989            lhs: new_lhs,
990        });
991        regex_syntax::ast::ClassSetUnion {
992            span: self.span(),
993            items: vec![],
994        }
995    }
996
997    /// Pop a character class set from the character class parser stack. If the
998    /// top of the stack is just an item (not an operation), then return the
999    /// given set unchanged. If the top of the stack is an operation, then the
1000    /// given set will be used as the rhs of the operation on the top of the
1001    /// stack. In that case, the binary operation is returned as a set.
1002    #[inline(never)]
1003    fn pop_class_op(&self, rhs: regex_syntax::ast::ClassSet) -> regex_syntax::ast::ClassSet {
1004        let mut stack = self.parser().stack_class.borrow_mut();
1005        let (kind, lhs) = match stack.pop() {
1006            Some(ClassState::Op { kind, lhs }) => (kind, lhs),
1007            Some(state @ ClassState::Open { .. }) => {
1008                stack.push(state);
1009                return rhs;
1010            }
1011            None => unreachable!(),
1012        };
1013        let span = Span::new(lhs.span().start, rhs.span().end);
1014        regex_syntax::ast::ClassSet::BinaryOp(regex_syntax::ast::ClassSetBinaryOp {
1015            span,
1016            kind,
1017            lhs: Box::new(lhs),
1018            rhs: Box::new(rhs),
1019        })
1020    }
1021
1022    fn hir_to_node_id(&self, hir: &hir::Hir, tb: &mut TB<'s>) -> Result<NodeId> {
1023        match hir.kind() {
1024            hir::HirKind::Empty => Ok(NodeId::EPS),
1025            hir::HirKind::Literal(l) => {
1026                if l.0.len() == 1 {
1027                    let node = tb.mk_u8(l.0[0]);
1028                    Ok(node)
1029                } else {
1030                    let ws: Vec<_> = l.0.iter().map(|l| tb.mk_u8(*l)).collect();
1031                    let conc = tb.mk_concats(ws.iter().copied());
1032                    Ok(conc)
1033                }
1034            }
1035            hir::HirKind::Class(class) => {
1036                // dbg!(class);
1037                match class {
1038                    hir::Class::Unicode(class_unicode) => {
1039                        let ranges = class_unicode.ranges();
1040                        let mut nodes = Vec::new();
1041                        for range in ranges {
1042                            for seq in Utf8Sequences::new(range.start(), range.end()) {
1043                                let sl = seq.as_slice();
1044                                let bytes: Vec<_> = sl.iter().map(|s| (s.start, s.end)).collect();
1045                                let node = match bytes.len() {
1046                                    1 => tb.mk_range_u8(bytes[0].0, bytes[0].1),
1047                                    n => {
1048                                        let last = tb.mk_range_u8(bytes[n - 1].0, bytes[n - 1].1);
1049                                        let mut conc = last;
1050                                        for i in (0..n - 1).rev() {
1051                                            let b = tb.mk_range_u8(bytes[i].0, bytes[i].1);
1052                                            conc = tb.mk_concat(b, conc);
1053                                        }
1054                                        conc
1055                                    }
1056                                };
1057                                nodes.push(node);
1058                            }
1059                        }
1060                        let merged = tb.mk_unions(nodes.into_iter());
1061                        Ok(merged)
1062                    }
1063                    hir::Class::Bytes(class_bytes) => {
1064                        let ranges = class_bytes.ranges();
1065                        let mut result = NodeId::BOT;
1066                        for range in ranges {
1067                            let start = range.start();
1068                            let end = range.end();
1069                            let node = tb.mk_range_u8(start, end);
1070                            result = tb.mk_union(result, node);
1071                        }
1072                        Ok(result)
1073                    }
1074                }
1075            }
1076            hir::HirKind::Look(_) => todo!(),
1077            hir::HirKind::Repetition(_) => todo!(),
1078            hir::HirKind::Capture(_) => todo!(),
1079            hir::HirKind::Concat(body) => {
1080                let mut result = NodeId::EPS;
1081                for child in body {
1082                    let node = self.hir_to_node_id(child, tb)?;
1083                    result = tb.mk_concat(result, node);
1084                }
1085                Ok(result)
1086            }
1087            hir::HirKind::Alternation(_) => todo!(),
1088        }
1089    }
1090
1091    fn translate_ast_to_hir(
1092        &mut self,
1093        orig_ast: &regex_syntax::ast::Ast,
1094        tb: &mut TB<'s>,
1095    ) -> Result<NodeId> {
1096        // let unboxed = class_perl.deref();
1097        // let orig_ast = regex_syntax::ast::Ast::class_perl(unboxed.clone());
1098        match self.translator.translate("", orig_ast) {
1099            Err(_) => return Err(self.error(self.span(), ast::ErrorKind::UnicodeClassInvalid)),
1100            Ok(hir) => {
1101                let mapped = self.hir_to_node_id(&hir, tb);
1102                mapped
1103            }
1104        }
1105    }
1106
1107    fn translator_to_node_id(
1108        &mut self,
1109        orig_ast: &regex_syntax::ast::Ast,
1110        translator: &mut Option<Translator>,
1111        tb: &mut TB<'s>,
1112    ) -> Result<NodeId> {
1113        match translator {
1114            Some(tr) => {
1115                let hir = tr
1116                    .translate("", &orig_ast)
1117                    .map_err(|e| self.unsupported_error(e))?;
1118                self.hir_to_node_id(&hir, tb)
1119            }
1120            None => self.translate_ast_to_hir(&orig_ast, tb),
1121        }
1122    }
1123
1124    fn get_class(
1125        &mut self,
1126        negated: bool,
1127        kind: regex_syntax::ast::ClassPerlKind,
1128        tb: &mut TB<'s>,
1129    ) -> Result<NodeId> {
1130        let w = self
1131            .perl_classes
1132            .iter()
1133            .find(|(c_neg, c_kind, _)| *c_kind == kind && *c_neg == negated);
1134        match w {
1135            Some((_, _, value)) => return Ok(*value),
1136            None => {
1137                let translated = if self.global_unicode {
1138                    match kind {
1139                        regex_syntax::ast::ClassPerlKind::Word => {
1140                            self.unicode_classes.ensure_word(tb);
1141                            if negated { self.unicode_classes.non_word } else { self.unicode_classes.word }
1142                        }
1143                        regex_syntax::ast::ClassPerlKind::Digit => {
1144                            self.unicode_classes.ensure_digit(tb);
1145                            if negated { self.unicode_classes.non_digit } else { self.unicode_classes.digit }
1146                        }
1147                        regex_syntax::ast::ClassPerlKind::Space => {
1148                            self.unicode_classes.ensure_space(tb);
1149                            if negated { self.unicode_classes.non_space } else { self.unicode_classes.space }
1150                        }
1151                    }
1152                } else {
1153                    let pos = match kind {
1154                        regex_syntax::ast::ClassPerlKind::Word => {
1155                            let az = tb.mk_range_u8(b'a', b'z');
1156                            let big = tb.mk_range_u8(b'A', b'Z');
1157                            let dig = tb.mk_range_u8(b'0', b'9');
1158                            let us = tb.mk_u8(b'_');
1159                            tb.mk_unions([az, big, dig, us].into_iter())
1160                        }
1161                        regex_syntax::ast::ClassPerlKind::Digit => {
1162                            tb.mk_range_u8(b'0', b'9')
1163                        }
1164                        regex_syntax::ast::ClassPerlKind::Space => {
1165                            let sp = tb.mk_u8(b' ');
1166                            let tab = tb.mk_u8(b'\t');
1167                            let nl = tb.mk_u8(b'\n');
1168                            let cr = tb.mk_u8(b'\r');
1169                            let ff = tb.mk_u8(0x0C);
1170                            let vt = tb.mk_u8(0x0B);
1171                            tb.mk_unions([sp, tab, nl, cr, ff, vt].into_iter())
1172                        }
1173                    };
1174                    if negated { tb.mk_compl(pos) } else { pos }
1175                };
1176                self.perl_classes.push((negated, kind, translated));
1177                Ok(translated)
1178            }
1179        }
1180    }
1181
1182    fn word_char_kind(ast: &Ast, left: bool) -> WordCharKind {
1183        use WordCharKind::*;
1184        match ast {
1185            Ast::Literal(lit) => {
1186                if is_word_byte(lit.c as u8) {
1187                    Word
1188                } else {
1189                    NonWord
1190                }
1191            }
1192            Ast::ClassPerl(c) => match (&c.kind, c.negated) {
1193                (&regex_syntax::ast::ClassPerlKind::Word, false) => Word,
1194                (&regex_syntax::ast::ClassPerlKind::Word, true) => NonWord,
1195                (&regex_syntax::ast::ClassPerlKind::Space, false) => NonWord,
1196                (&regex_syntax::ast::ClassPerlKind::Space, true) => Unknown,
1197                (&regex_syntax::ast::ClassPerlKind::Digit, false) => Word,
1198                (&regex_syntax::ast::ClassPerlKind::Digit, true) => Unknown,
1199            },
1200            Ast::Dot(_) | Ast::Top(_) => Unknown,
1201            Ast::Group(g) => Self::word_char_kind(&g.ast, left),
1202            Ast::Concat(c) if !c.asts.is_empty() => {
1203                Self::word_char_kind(&c.asts[if left { c.asts.len() - 1 } else { 0 }], left)
1204            }
1205            Ast::Repetition(r) => {
1206                let inner = Self::word_char_kind(&r.ast, left);
1207                let nullable = matches!(
1208                    &r.op.kind,
1209                    ast::RepetitionKind::ZeroOrMore
1210                        | ast::RepetitionKind::ZeroOrOne
1211                        | ast::RepetitionKind::Range(ast::RepetitionRange::Bounded(0, _))
1212                );
1213                if nullable {
1214                    match inner {
1215                        Word => MaybeWord,
1216                        NonWord => MaybeNonWord,
1217                        _ => Unknown,
1218                    }
1219                } else {
1220                    inner
1221                }
1222            }
1223            Ast::Lookaround(la) => Self::word_char_kind(&la.ast, left),
1224            _ => Unknown,
1225        }
1226    }
1227
1228    fn edge_class_ast<'a>(ast: &'a Ast, left: bool) -> Option<&'a Ast> {
1229        match ast {
1230            Ast::Literal(_) | Ast::ClassPerl(_) | Ast::ClassBracketed(_)
1231            | Ast::ClassUnicode(_) | Ast::Dot(_) | Ast::Top(_) => Some(ast),
1232            Ast::Group(g) => Self::edge_class_ast(&g.ast, left),
1233            Ast::Concat(c) if !c.asts.is_empty() => {
1234                Self::edge_class_ast(&c.asts[if left { c.asts.len() - 1 } else { 0 }], left)
1235            }
1236            Ast::Repetition(r) => Self::edge_class_ast(&r.ast, left),
1237            Ast::Lookaround(la) => Self::edge_class_ast(&la.ast, left),
1238            _ => None,
1239        }
1240    }
1241
1242    fn resolve_word_kind(
1243        &mut self,
1244        asts: &[Ast],
1245        idx: usize,
1246        dir: isize,
1247        translator: &mut Option<Translator>,
1248        tb: &mut TB<'s>,
1249        word_id: NodeId,
1250        not_word_id: NodeId,
1251    ) -> Result<WordCharKind> {
1252        use WordCharKind::*;
1253        let fast = Self::concat_neighbor_kind(asts, idx, dir);
1254        if fast != Unknown {
1255            return Ok(fast);
1256        }
1257        let neighbor_idx = (idx as isize + dir) as usize;
1258        let edge = match Self::edge_class_ast(&asts[neighbor_idx], dir < 0) {
1259            Some(e) => e,
1260            None => return Ok(Unknown),
1261        };
1262        let node = self.ast_to_node_id(edge, translator, tb)?;
1263        if tb.subsumes(word_id, node) == Some(true) {
1264            Ok(Word)
1265        } else if tb.subsumes(not_word_id, node) == Some(true) {
1266            Ok(NonWord)
1267        } else {
1268            Ok(Unknown)
1269        }
1270    }
1271
1272    fn concat_neighbor_kind(asts: &[Ast], idx: usize, dir: isize) -> WordCharKind {
1273        use WordCharKind::*;
1274        let next = idx as isize + dir;
1275        if next < 0 || next >= asts.len() as isize {
1276            return Edge;
1277        }
1278        let kind = Self::word_char_kind(&asts[next as usize], dir < 0);
1279        match kind {
1280            MaybeWord => match Self::concat_neighbor_kind(asts, next as usize, dir) {
1281                Word => Word,
1282                _ => Unknown,
1283            },
1284            MaybeNonWord => match Self::concat_neighbor_kind(asts, next as usize, dir) {
1285                NonWord => NonWord,
1286                _ => Unknown,
1287            },
1288            other => other,
1289        }
1290    }
1291
1292    fn rewrite_word_boundary_in_concat(
1293        &mut self,
1294        asts: &[Ast],
1295        idx: usize,
1296        translator: &mut Option<Translator>,
1297        tb: &mut TB<'s>,
1298    ) -> Result<(NodeId, usize)> {
1299        use WordCharKind::*;
1300        let (word_id, not_word_id) = if self.global_unicode {
1301            self.unicode_classes.ensure_word(tb);
1302            (self.unicode_classes.word, self.unicode_classes.non_word)
1303        } else {
1304            let az = tb.mk_range_u8(b'a', b'z');
1305            let big = tb.mk_range_u8(b'A', b'Z');
1306            let dig = tb.mk_range_u8(b'0', b'9');
1307            let us = tb.mk_u8(b'_');
1308            let w = tb.mk_unions([az, big, dig, us].into_iter());
1309            (w, tb.mk_compl(w))
1310        };
1311        let left = self.resolve_word_kind(asts, idx, -1, translator, tb, word_id, not_word_id)?;
1312        let right =
1313            self.resolve_word_kind(asts, idx, 1, translator, tb, word_id, not_word_id)?;
1314
1315        match (left, right) {
1316            (NonWord, Word) | (Word, NonWord) => Ok((NodeId::EPS, idx + 1)),
1317            (Word, _) => {
1318                let neg = tb.mk_neg_lookahead(word_id, 0);
1319                Ok((neg, idx + 1))
1320            }
1321            (NonWord, _) => {
1322                let set = tb.mk_union(NodeId::END, word_id);
1323                let tail = tb.mk_concat(set, NodeId::TS);
1324                self.merge_boundary_with_following_lookaheads(asts, idx, tail, translator, tb)
1325            }
1326            (_, Word) => {
1327                Ok((tb.mk_neg_lookbehind(word_id), idx + 1))
1328            }
1329            (_, NonWord) => {
1330                let body = tb.mk_union(NodeId::BEGIN, word_id);
1331                Ok((tb.mk_lookbehind(body, NodeId::MISSING), idx + 1))
1332            }
1333            // TODO: (Unknown, Unknown) is possible via make_full_word_boundary but
1334            // the full expansion (lb(\w)·la(\W) | lb(\W)·la(\w)) is too expensive
1335            _ => return Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex)),
1336        }
1337    }
1338
1339    fn merge_boundary_with_following_lookaheads(
1340        &mut self,
1341        asts: &[Ast],
1342        wb_idx: usize,
1343        boundary_tail: NodeId,
1344        translator: &mut Option<Translator>,
1345        tb: &mut TB<'s>,
1346    ) -> Result<(NodeId, usize)> {
1347        let mut next = wb_idx + 1;
1348        let mut la_bodies = vec![boundary_tail];
1349        while next < asts.len() {
1350            match &asts[next] {
1351                Ast::Lookaround(la) if la.kind == ast::LookaroundKind::PositiveLookahead => {
1352                    let body = self.ast_to_node_id(&la.ast, translator, tb)?;
1353                    la_bodies.push(tb.mk_concat(body, NodeId::TS));
1354                    next += 1;
1355                }
1356                _ => break,
1357            }
1358        }
1359        let merged = tb.mk_inters(la_bodies.into_iter());
1360        Ok((tb.mk_lookahead(merged, NodeId::MISSING, 0), next))
1361    }
1362
1363    fn ast_to_node_id(
1364        &mut self,
1365        ast: &Ast,
1366        translator: &mut Option<Translator>,
1367        tb: &mut TB<'s>,
1368    ) -> Result<NodeId> {
1369        match ast {
1370            Ast::Empty(_) => Ok(NodeId::EPS),
1371            Ast::Flags(f) => {
1372                let mut translator_builder = self.default_translator_builder();
1373                if let Some(state) = f.flags.flag_state(ast::Flag::CaseInsensitive) {
1374                    translator_builder.case_insensitive(state);
1375                }
1376                if let Some(state) = f.flags.flag_state(ast::Flag::Unicode) {
1377                    translator_builder.unicode(state);
1378                }
1379                if let Some(state) = f.flags.flag_state(ast::Flag::DotMatchesNewLine) {
1380                    self.dot_all.set(state);
1381                }
1382                let concat_translator = Some(translator_builder.build());
1383                *translator = concat_translator;
1384                Ok(NodeId::EPS)
1385            }
1386            Ast::Literal(l) => {
1387                let ast_lit = regex_syntax::ast::Ast::literal(*l.to_owned());
1388                self.translator_to_node_id(&ast_lit, translator, tb)
1389            }
1390            Ast::Top(_) => Ok(NodeId::TOP),
1391            Ast::Dot(_) => {
1392                if self.dot_all.get() {
1393                    Ok(NodeId::TOP)
1394                } else {
1395                    let hirv = hir::Hir::dot(hir::Dot::AnyByteExceptLF);
1396                    self.hir_to_node_id(&hirv, tb)
1397                }
1398            }
1399            Ast::Assertion(a) => match &a.kind {
1400                ast::AssertionKind::StartText => Ok(NodeId::BEGIN),
1401                ast::AssertionKind::EndText => Ok(NodeId::END),
1402                ast::AssertionKind::WordBoundary => {
1403                    return Err(
1404                        self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex),
1405                    )
1406                }
1407                ast::AssertionKind::NotWordBoundary => {
1408                    return Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))
1409                }
1410                ast::AssertionKind::StartLine => {
1411                    let left = NodeId::BEGIN;
1412                    let right = tb.mk_u8('\n' as u8);
1413                    let union = tb.mk_union(left, right);
1414                    Ok(tb.mk_lookbehind(union, NodeId::MISSING))
1415                }
1416                ast::AssertionKind::EndLine => {
1417                    let left = NodeId::END;
1418                    let right = tb.mk_u8('\n' as u8);
1419                    let union = tb.mk_union(left, right);
1420                    Ok(tb.mk_lookahead(union, NodeId::MISSING, 0))
1421                }
1422                ast::AssertionKind::WordBoundaryStart => todo!(),
1423                ast::AssertionKind::WordBoundaryEnd => todo!(),
1424                ast::AssertionKind::WordBoundaryStartAngle => todo!(),
1425                ast::AssertionKind::WordBoundaryEndAngle => Ok(tb.mk_string(">")),
1426                ast::AssertionKind::WordBoundaryStartHalf => todo!(),
1427                ast::AssertionKind::WordBoundaryEndHalf => todo!(),
1428            },
1429            Ast::ClassUnicode(c) => {
1430                let tmp = regex_syntax::ast::ClassUnicode {
1431                    span: c.span,
1432                    negated: c.negated,
1433                    kind: c.kind.clone(),
1434                };
1435                if !c.negated {
1436                    match &c.kind {
1437                        regex_syntax::ast::ClassUnicodeKind::Named(s) => match s.as_str() {
1438                            // \p{ascii} for ascii, \p{ascii}&\p{Letter} => [A-Za-z]
1439                            "ascii" => return Ok(tb.mk_range_u8(0, 127)),
1440                            // restricts matches to valid utf8, \p{utf8}*&~(a) => non a, but valid utf8
1441                            "utf8" => {
1442                                let ascii = tb.mk_range_u8(0, 127);
1443                                let beta = tb.mk_range_u8(128, 0xBF);
1444                                let c0 = tb.mk_range_u8(0xC0, 0xDF);
1445                                let c0s = tb.mk_concats([c0, beta].into_iter());
1446                                let e0 = tb.mk_range_u8(0xE0, 0xEF);
1447                                let e0s = tb.mk_concats([e0, beta, beta].into_iter());
1448                                let f0 = tb.mk_range_u8(0xF0, 0xF7);
1449                                let f0s = tb.mk_concats([f0, beta, beta, beta].into_iter());
1450                                let merged = tb.mk_unions([ascii, c0s, e0s, f0s].into_iter());
1451                                return Ok(tb.mk_star(merged));
1452                            }
1453                            "hex" => {
1454                                let nums = tb.mk_range_u8(b'0', b'9');
1455                                let lets = tb.mk_range_u8(b'a', b'f');
1456                                let lets2 = tb.mk_range_u8(b'A', b'F');
1457                                let merged = tb.mk_unions([nums, lets, lets2].into_iter());
1458                                return Ok(merged);
1459                            }
1460                            _ => {}
1461                        },
1462                        _ => {}
1463                    };
1464                }
1465
1466                let orig_ast = regex_syntax::ast::Ast::class_unicode(tmp);
1467                self.translator_to_node_id(&orig_ast, translator, tb)
1468            }
1469            Ast::ClassPerl(c) => {
1470                self.get_class(c.negated, c.kind.clone(), tb)
1471            }
1472            Ast::ClassBracketed(c) => match &c.kind {
1473                regex_syntax::ast::ClassSet::Item(_) => {
1474                    let tmp = regex_syntax::ast::ClassBracketed {
1475                        span: c.span,
1476                        negated: c.negated,
1477                        kind: c.kind.clone(),
1478                    };
1479                    let orig_ast = regex_syntax::ast::Ast::class_bracketed(tmp);
1480                    self.translator_to_node_id(&orig_ast, translator, tb)
1481                }
1482                regex_syntax::ast::ClassSet::BinaryOp(_) => todo!(),
1483            },
1484            Ast::Repetition(r) => {
1485                let body = self.ast_to_node_id(&r.ast, translator, tb);
1486                match body {
1487                    Ok(body) => match &r.op.kind {
1488                        ast::RepetitionKind::ZeroOrOne => Ok(tb.mk_opt(body)),
1489                        ast::RepetitionKind::ZeroOrMore => Ok(tb.mk_star(body)),
1490                        ast::RepetitionKind::OneOrMore => Ok(tb.mk_plus(body)),
1491                        ast::RepetitionKind::Range(r) => match r {
1492                            ast::RepetitionRange::Exactly(n) => Ok(tb.mk_repeat(body, *n, *n)),
1493                            ast::RepetitionRange::AtLeast(n) => {
1494                                let rep = tb.mk_repeat(body, *n, *n);
1495                                let st = tb.mk_star(body);
1496                                Ok(tb.mk_concat(rep, st))
1497                            }
1498
1499                            ast::RepetitionRange::Bounded(n, m) => Ok(tb.mk_repeat(body, *n, *m)),
1500                        },
1501                    },
1502                    Err(_) => body,
1503                }
1504            }
1505            Ast::Lookaround(g) => {
1506                let body = self.ast_to_node_id(&g.ast, translator, tb)?;
1507                match g.kind {
1508                    ast::LookaroundKind::PositiveLookahead => {
1509                        Ok(tb.mk_lookahead(body, NodeId::MISSING, 0))
1510                    }
1511                    ast::LookaroundKind::PositiveLookbehind => {
1512                        Ok(tb.mk_lookbehind(body, NodeId::MISSING))
1513                    }
1514                    ast::LookaroundKind::NegativeLookahead => Ok(tb.mk_neg_lookahead(body, 0)),
1515                    ast::LookaroundKind::NegativeLookbehind => Ok(tb.mk_neg_lookbehind(body)),
1516                }
1517            }
1518            Ast::Group(g) => {
1519                if let ast::GroupKind::NonCapturing(ref flags) = g.kind {
1520                    if !flags.items.is_empty() {
1521                        let mut translator_builder = self.default_translator_builder();
1522                        if let Some(state) = flags.flag_state(ast::Flag::CaseInsensitive) {
1523                            translator_builder.case_insensitive(state);
1524                        }
1525                        if let Some(state) = flags.flag_state(ast::Flag::Unicode) {
1526                            translator_builder.unicode(state);
1527                        }
1528                        let saved_dot_all = self.dot_all.get();
1529                        if let Some(state) = flags.flag_state(ast::Flag::DotMatchesNewLine) {
1530                            self.dot_all.set(state);
1531                        }
1532                        let mut scoped = Some(translator_builder.build());
1533                        let result = self.ast_to_node_id(&g.ast, &mut scoped, tb);
1534                        self.dot_all.set(saved_dot_all);
1535                        return result;
1536                    }
1537                }
1538                self.ast_to_node_id(&g.ast, translator, tb)
1539            }
1540            Ast::Alternation(a) => {
1541                let mut children = vec![];
1542                for ast in &a.asts {
1543                    match self.ast_to_node_id(ast, translator, tb) {
1544                        Ok(node_id) => children.push(node_id),
1545                        Err(err) => return Err(err),
1546                    }
1547                }
1548                Ok(tb.mk_unions(children.iter().copied()))
1549            }
1550            Ast::Concat(c) => {
1551                let mut concat_translator: Option<Translator> = None;
1552                let mut children = vec![];
1553                let mut i = 0;
1554                while i < c.asts.len() {
1555                    let ast = &c.asts[i];
1556                    match ast {
1557                        Ast::Flags(f) => {
1558                            let mut translator_builder = self.default_translator_builder();
1559                            if let Some(state) = f.flags.flag_state(ast::Flag::CaseInsensitive) {
1560                                translator_builder.case_insensitive(state);
1561                            }
1562                            if let Some(state) = f.flags.flag_state(ast::Flag::Unicode) {
1563                                translator_builder.unicode(state);
1564                            }
1565                            if let Some(state) = f.flags.flag_state(ast::Flag::DotMatchesNewLine) {
1566                                self.dot_all.set(state);
1567                            }
1568                            concat_translator = Some(translator_builder.build());
1569                            i += 1;
1570                            continue;
1571                        }
1572                        Ast::Assertion(a) if a.kind == ast::AssertionKind::WordBoundary => {
1573                            let node =
1574                                self.rewrite_word_boundary_in_concat(&c.asts, i, translator, tb)?;
1575                            children.push(node.0);
1576                            i = node.1; // skip consumed lookaheads
1577                            continue;
1578                        }
1579                        _ => {}
1580                    }
1581                    match concat_translator {
1582                        Some(_) => match self.ast_to_node_id(ast, &mut concat_translator, tb) {
1583                            Ok(node_id) => children.push(node_id),
1584                            Err(err) => return Err(err),
1585                        },
1586                        None => match self.ast_to_node_id(ast, translator, tb) {
1587                            Ok(node_id) => children.push(node_id),
1588                            Err(err) => return Err(err),
1589                        },
1590                    }
1591                    i += 1;
1592                }
1593                Ok(tb.mk_concats(children.iter().cloned()))
1594            }
1595            Ast::Intersection(intersection) => {
1596                let mut children = vec![];
1597                for ast in &intersection.asts {
1598                    match self.ast_to_node_id(ast, translator, tb) {
1599                        Ok(node_id) => children.push(node_id),
1600                        Err(err) => return Err(err),
1601                    }
1602                }
1603                Ok(tb.mk_inters(children.into_iter()))
1604            }
1605            Ast::Complement(complement) => {
1606                let body = self.ast_to_node_id(&complement.ast, translator, tb);
1607                body.map(|x| tb.mk_compl(x))
1608            }
1609        }
1610    }
1611
1612    /// Parse the regular expression and return an abstract syntax tree with
1613    /// all of the comments found in the pattern.
1614    fn parse(&mut self, tb: &mut TB<'s>) -> Result<NodeId> {
1615        let mut concat = Concat {
1616            span: self.span(),
1617            asts: vec![],
1618        };
1619        loop {
1620            self.bump_space();
1621            if self.is_eof() {
1622                break;
1623            }
1624            match self.char() {
1625                '(' => concat = self.push_group(concat)?,
1626                ')' => concat = self.pop_group(concat)?,
1627                '|' => concat = self.push_alternate(concat)?,
1628                '&' => concat = self.push_intersect(concat)?,
1629                '~' => concat = self.push_compl_group(concat)?,
1630                '[' => {
1631                    let class = self.parse_set_class()?;
1632                    concat.asts.push(Ast::class_bracketed(class));
1633                }
1634                '?' => {
1635                    concat =
1636                        self.parse_uncounted_repetition(concat, ast::RepetitionKind::ZeroOrOne)?;
1637                }
1638                '*' => {
1639                    concat =
1640                        self.parse_uncounted_repetition(concat, ast::RepetitionKind::ZeroOrMore)?;
1641                }
1642                '+' => {
1643                    concat =
1644                        self.parse_uncounted_repetition(concat, ast::RepetitionKind::OneOrMore)?;
1645                }
1646                '{' => {
1647                    concat = self.parse_counted_repetition(concat)?;
1648                }
1649                _ => concat.asts.push(self.parse_primitive()?.into_ast()),
1650            }
1651        }
1652        let ast = self.pop_group_end(concat)?;
1653        // dbg!(&ast);
1654        let node_id = self.ast_to_node_id(&ast, &mut None, tb);
1655        node_id
1656    }
1657
1658    #[inline(never)]
1659    fn parse_uncounted_repetition(
1660        &self,
1661        mut concat: ast::Concat,
1662        kind: ast::RepetitionKind,
1663    ) -> Result<ast::Concat> {
1664        // assert!(self.char() == '?' || self.char() == '*' || self.char() == '+');
1665        let op_start = self.pos();
1666        let ast = match concat.asts.pop() {
1667            Some(ast) => ast,
1668            None => return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing)),
1669        };
1670        match ast {
1671            Ast::Empty(_) | Ast::Flags(_) => {
1672                return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing))
1673            }
1674            _ => {}
1675        }
1676        if self.bump() && self.char() == '?' {
1677            return Err(self.error(
1678                Span::new(op_start, self.pos()),
1679                ast::ErrorKind::UnsupportedLazyQuantifier,
1680            ));
1681        }
1682        concat.asts.push(Ast::repetition(ast::Repetition {
1683            span: ast.span().with_end(self.pos()),
1684            op: ast::RepetitionOp {
1685                span: Span::new(op_start, self.pos()),
1686                kind,
1687            },
1688            greedy: true,
1689            ast: Box::new(ast),
1690        }));
1691        Ok(concat)
1692    }
1693
1694    #[inline(never)]
1695    fn parse_counted_repetition(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
1696        assert!(self.char() == '{');
1697        let start = self.pos();
1698        let ast = match concat.asts.pop() {
1699            Some(ast) => ast,
1700            None => return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing)),
1701        };
1702        match ast {
1703            Ast::Empty(_) | Ast::Flags(_) => {
1704                return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing))
1705            }
1706            _ => {}
1707        }
1708        if !self.bump_and_bump_space() {
1709            return Err(self.error(
1710                Span::new(start, self.pos()),
1711                ast::ErrorKind::RepetitionCountUnclosed,
1712            ));
1713        }
1714        let count_start = specialize_err(
1715            self.parse_decimal(),
1716            ast::ErrorKind::DecimalEmpty,
1717            ast::ErrorKind::RepetitionCountDecimalEmpty,
1718        );
1719        if self.is_eof() {
1720            return Err(self.error(
1721                Span::new(start, self.pos()),
1722                ast::ErrorKind::RepetitionCountUnclosed,
1723            ));
1724        }
1725        let range = if self.char() == ',' {
1726            if !self.bump_and_bump_space() {
1727                return Err(self.error(
1728                    Span::new(start, self.pos()),
1729                    ast::ErrorKind::RepetitionCountUnclosed,
1730                ));
1731            }
1732            if self.char() != '}' {
1733                let count_start = match count_start {
1734                    Ok(c) => c,
1735                    Err(err) if err.kind == ast::ErrorKind::RepetitionCountDecimalEmpty => {
1736                        if self.parser().empty_min_range {
1737                            0
1738                        } else {
1739                            return Err(err);
1740                        }
1741                    }
1742                    err => err?,
1743                };
1744                let count_end = specialize_err(
1745                    self.parse_decimal(),
1746                    ast::ErrorKind::DecimalEmpty,
1747                    ast::ErrorKind::RepetitionCountDecimalEmpty,
1748                )?;
1749                ast::RepetitionRange::Bounded(count_start, count_end)
1750            } else {
1751                ast::RepetitionRange::AtLeast(count_start?)
1752            }
1753        } else {
1754            ast::RepetitionRange::Exactly(count_start?)
1755        };
1756
1757        if self.is_eof() || self.char() != '}' {
1758            return Err(self.error(
1759                Span::new(start, self.pos()),
1760                ast::ErrorKind::RepetitionCountUnclosed,
1761            ));
1762        }
1763
1764        if self.bump_and_bump_space() && self.char() == '?' {
1765            return Err(self.error(
1766                Span::new(start, self.pos()),
1767                ast::ErrorKind::UnsupportedLazyQuantifier,
1768            ));
1769        }
1770
1771        let op_span = Span::new(start, self.pos());
1772        if !range.is_valid() {
1773            return Err(self.error(op_span, ast::ErrorKind::RepetitionCountInvalid));
1774        }
1775        concat.asts.push(Ast::repetition(ast::Repetition {
1776            span: ast.span().with_end(self.pos()),
1777            op: ast::RepetitionOp {
1778                span: op_span,
1779                kind: ast::RepetitionKind::Range(range),
1780            },
1781            greedy: true,
1782            ast: Box::new(ast),
1783        }));
1784        Ok(concat)
1785    }
1786
1787    #[inline(never)]
1788    fn parse_group(&self) -> Result<Either<ast::SetFlags, ast::Group>> {
1789        assert_eq!(self.char(), '(');
1790        let open_span = self.span_char();
1791        self.bump();
1792        self.bump_space();
1793        if let Some((ahead, pos)) = self.is_lookaround_prefix() {
1794            let kind = match (pos, ahead) {
1795                (true, true) => LookaroundKind::PositiveLookahead,
1796                (true, false) => LookaroundKind::PositiveLookbehind,
1797                (false, true) => LookaroundKind::NegativeLookahead,
1798                (false, false) => LookaroundKind::NegativeLookbehind,
1799            };
1800            return Ok(Either::Right(ast::Group {
1801                span: open_span,
1802                kind: ast::GroupKind::Lookaround(kind),
1803                ast: Box::new(Ast::empty(self.span())),
1804            }));
1805        }
1806        let inner_span = self.span();
1807        let mut starts_with_p = true;
1808        if self.bump_if("?P<") || {
1809            starts_with_p = false;
1810            self.bump_if("?<")
1811        } {
1812            let capture_index = self.next_capture_index(open_span)?;
1813            let name = self.parse_capture_name(capture_index)?;
1814            Ok(Either::Right(ast::Group {
1815                span: open_span,
1816                kind: ast::GroupKind::CaptureName {
1817                    starts_with_p,
1818                    name,
1819                },
1820                ast: Box::new(Ast::empty(self.span())),
1821            }))
1822        } else if self.bump_if("?") {
1823            if self.is_eof() {
1824                return Err(self.error(open_span, ast::ErrorKind::GroupUnclosed));
1825            }
1826            let flags = self.parse_flags()?;
1827            let char_end = self.char();
1828            self.bump();
1829            if char_end == ')' {
1830                // We don't allow empty flags, e.g., `(?)`. We instead
1831                // interpret it as a repetition operator missing its argument.
1832                if flags.items.is_empty() {
1833                    return Err(self.error(inner_span, ast::ErrorKind::RepetitionMissing));
1834                }
1835                Ok(Either::Left(ast::SetFlags {
1836                    span: Span {
1837                        end: self.pos(),
1838                        ..open_span
1839                    },
1840                    flags,
1841                }))
1842            } else {
1843                assert_eq!(char_end, ':');
1844                Ok(Either::Right(ast::Group {
1845                    span: open_span,
1846                    kind: ast::GroupKind::NonCapturing(flags),
1847                    ast: Box::new(Ast::empty(self.span())),
1848                }))
1849            }
1850        } else {
1851            let capture_index = self.next_capture_index(open_span)?;
1852            Ok(Either::Right(ast::Group {
1853                span: open_span,
1854                kind: ast::GroupKind::CaptureIndex(capture_index),
1855                ast: Box::new(Ast::empty(self.span())),
1856            }))
1857        }
1858    }
1859
1860    #[inline(never)]
1861    fn parse_capture_name(&self, capture_index: u32) -> Result<ast::CaptureName> {
1862        if self.is_eof() {
1863            return Err(self.error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
1864        }
1865        let start = self.pos();
1866        loop {
1867            if self.char() == '>' {
1868                break;
1869            }
1870            if !is_capture_char(self.char(), self.pos() == start) {
1871                return Err(self.error(self.span_char(), ast::ErrorKind::GroupNameInvalid));
1872            }
1873            if !self.bump() {
1874                break;
1875            }
1876        }
1877        let end = self.pos();
1878        if self.is_eof() {
1879            return Err(self.error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
1880        }
1881        assert_eq!(self.char(), '>');
1882        self.bump();
1883        let name = &self.pattern()[start.offset..end.offset];
1884        if name.is_empty() {
1885            return Err(self.error(Span::new(start, start), ast::ErrorKind::GroupNameEmpty));
1886        }
1887        let capname = ast::CaptureName {
1888            span: Span::new(start, end),
1889            name: name.to_string(),
1890            index: capture_index,
1891        };
1892        self.add_capture_name(&capname)?;
1893        Ok(capname)
1894    }
1895
1896    #[inline(never)]
1897    fn parse_flags(&self) -> Result<ast::Flags> {
1898        let mut flags = ast::Flags {
1899            span: self.span(),
1900            items: vec![],
1901        };
1902        let mut last_was_negation = None;
1903        while self.char() != ':' && self.char() != ')' {
1904            if self.char() == '-' {
1905                last_was_negation = Some(self.span_char());
1906                let item = ast::FlagsItem {
1907                    span: self.span_char(),
1908                    kind: ast::FlagsItemKind::Negation,
1909                };
1910                if let Some(i) = flags.add_item(item) {
1911                    return Err(self.error(
1912                        self.span_char(),
1913                        ast::ErrorKind::FlagRepeatedNegation {
1914                            original: flags.items[i].span,
1915                        },
1916                    ));
1917                }
1918            } else {
1919                last_was_negation = None;
1920                let item = ast::FlagsItem {
1921                    span: self.span_char(),
1922                    kind: ast::FlagsItemKind::Flag(self.parse_flag()?),
1923                };
1924                if let Some(i) = flags.add_item(item) {
1925                    return Err(self.error(
1926                        self.span_char(),
1927                        ast::ErrorKind::FlagDuplicate {
1928                            original: flags.items[i].span,
1929                        },
1930                    ));
1931                }
1932            }
1933            if !self.bump() {
1934                return Err(self.error(self.span(), ast::ErrorKind::FlagUnexpectedEof));
1935            }
1936        }
1937        if let Some(span) = last_was_negation {
1938            return Err(self.error(span, ast::ErrorKind::FlagDanglingNegation));
1939        }
1940        flags.span.end = self.pos();
1941        Ok(flags)
1942    }
1943
1944    #[inline(never)]
1945    fn parse_flag(&self) -> Result<ast::Flag> {
1946        match self.char() {
1947            'i' => Ok(ast::Flag::CaseInsensitive),
1948            'm' => Ok(ast::Flag::MultiLine),
1949            's' => Ok(ast::Flag::DotMatchesNewLine),
1950            'U' => Ok(ast::Flag::SwapGreed),
1951            'u' => Ok(ast::Flag::Unicode),
1952            'R' => Ok(ast::Flag::CRLF),
1953            'x' => Ok(ast::Flag::IgnoreWhitespace),
1954            _ => Err(self.error(self.span_char(), ast::ErrorKind::FlagUnrecognized)),
1955        }
1956    }
1957
1958    fn parse_primitive(&self) -> Result<Primitive> {
1959        match self.char() {
1960            '\\' => self.parse_escape(),
1961            '_' => {
1962                let ast = Primitive::Top(self.span_char());
1963                self.bump();
1964                Ok(ast)
1965            }
1966            '.' => {
1967                let ast = Primitive::Dot(self.span_char());
1968                self.bump();
1969                Ok(ast)
1970            }
1971            '^' => {
1972                let ast = Primitive::Assertion(ast::Assertion {
1973                    span: self.span_char(),
1974                    kind: ast::AssertionKind::StartLine,
1975                });
1976                self.bump();
1977                Ok(ast)
1978            }
1979            '$' => {
1980                let ast = Primitive::Assertion(ast::Assertion {
1981                    span: self.span_char(),
1982                    kind: ast::AssertionKind::EndLine,
1983                });
1984                self.bump();
1985                Ok(ast)
1986            }
1987            c => {
1988                let ast = Primitive::Literal(Literal {
1989                    span: self.span_char(),
1990                    kind: LiteralKind::Verbatim,
1991                    c,
1992                });
1993                self.bump();
1994                Ok(ast)
1995            }
1996        }
1997    }
1998
1999    #[inline(never)]
2000    fn parse_escape(&self) -> Result<Primitive> {
2001        assert_eq!(self.char(), '\\');
2002        let start = self.pos();
2003        if !self.bump() {
2004            return Err(self.error(
2005                Span::new(start, self.pos()),
2006                ast::ErrorKind::EscapeUnexpectedEof,
2007            ));
2008        }
2009        let c = self.char();
2010        // Put some of the more complicated routines into helpers.
2011        match c {
2012            '0'..='9' => {
2013                if !self.parser().octal {
2014                    return Err(self.error(
2015                        Span::new(start, self.span_char().end),
2016                        ast::ErrorKind::UnsupportedBackreference,
2017                    ));
2018                }
2019                let mut lit = self.parse_octal();
2020                lit.span.start = start;
2021                return Ok(Primitive::Literal(lit));
2022            }
2023            // '8'..='9' if !self.parser().octal => {
2024            //     return Err(self.error(
2025            //         Span::new(start, self.span_char().end),
2026            //         ast::ErrorKind::UnsupportedBackreference,
2027            //     ));
2028            // }
2029            'x' | 'u' | 'U' => {
2030                let mut lit = self.parse_hex()?;
2031                lit.span.start = start;
2032                return Ok(Primitive::Literal(lit));
2033            }
2034            'p' | 'P' => {
2035                let mut cls = self.parse_unicode_class()?;
2036                cls.span.start = start;
2037                return Ok(Primitive::Unicode(cls));
2038            }
2039            'd' | 's' | 'w' | 'D' | 'S' | 'W' => {
2040                let mut cls = self.parse_perl_class();
2041                cls.span.start = start;
2042                return Ok(Primitive::Perl(cls));
2043            }
2044            _ => {}
2045        }
2046
2047        // Handle all of the one letter sequences inline.
2048        self.bump();
2049        let span = Span::new(start, self.pos());
2050        if is_meta_character(c) {
2051            return Ok(Primitive::Literal(Literal {
2052                span,
2053                kind: LiteralKind::Meta,
2054                c,
2055            }));
2056        }
2057        if is_escapeable_character(c) {
2058            return Ok(Primitive::Literal(Literal {
2059                span,
2060                kind: LiteralKind::Superfluous,
2061                c,
2062            }));
2063        }
2064        let special = |kind, c| {
2065            Ok(Primitive::Literal(Literal {
2066                span,
2067                kind: LiteralKind::Special(kind),
2068                c,
2069            }))
2070        };
2071        match c {
2072            'a' => special(SpecialLiteralKind::Bell, '\x07'),
2073            'f' => special(SpecialLiteralKind::FormFeed, '\x0C'),
2074            't' => special(SpecialLiteralKind::Tab, '\t'),
2075            'n' => special(SpecialLiteralKind::LineFeed, '\n'),
2076            'r' => special(SpecialLiteralKind::CarriageReturn, '\r'),
2077            'v' => special(SpecialLiteralKind::VerticalTab, '\x0B'),
2078            'A' => Ok(Primitive::Assertion(ast::Assertion {
2079                span,
2080                kind: ast::AssertionKind::StartText,
2081            })),
2082            'z' => Ok(Primitive::Assertion(ast::Assertion {
2083                span,
2084                kind: ast::AssertionKind::EndText,
2085            })),
2086            'b' => {
2087                let mut wb = ast::Assertion {
2088                    span,
2089                    kind: ast::AssertionKind::WordBoundary,
2090                };
2091                // After a \b, we "try" to parse things like \b{start} for
2092                // special word boundary assertions.
2093                if !self.is_eof() && self.char() == '{' {
2094                    if let Some(kind) = self.maybe_parse_special_word_boundary(start)? {
2095                        wb.kind = kind;
2096                        wb.span.end = self.pos();
2097                    }
2098                }
2099                Ok(Primitive::Assertion(wb))
2100            }
2101            'B' => Ok(Primitive::Assertion(ast::Assertion {
2102                span,
2103                kind: ast::AssertionKind::NotWordBoundary,
2104            })),
2105            '<' => Ok(Primitive::Assertion(ast::Assertion {
2106                span,
2107                kind: ast::AssertionKind::WordBoundaryStartAngle,
2108            })),
2109            '>' => Ok(Primitive::Assertion(ast::Assertion {
2110                span,
2111                kind: ast::AssertionKind::WordBoundaryEndAngle,
2112            })),
2113            _ => Err(self.error(span, ast::ErrorKind::EscapeUnrecognized)),
2114        }
2115    }
2116
2117    fn maybe_parse_special_word_boundary(
2118        &self,
2119        wb_start: Position,
2120    ) -> Result<Option<ast::AssertionKind>> {
2121        assert_eq!(self.char(), '{');
2122
2123        let is_valid_char = |c| match c {
2124            'A'..='Z' | 'a'..='z' | '-' => true,
2125            _ => false,
2126        };
2127        let start = self.pos();
2128        if !self.bump_and_bump_space() {
2129            return Err(self.error(
2130                Span::new(wb_start, self.pos()),
2131                ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof,
2132            ));
2133        }
2134        let start_contents = self.pos();
2135        // This is one of the critical bits: if the first non-whitespace
2136        // character isn't in [-A-Za-z] (i.e., this can't be a special word
2137        // boundary), then we bail and let the counted repetition parser deal
2138        // with this.
2139        if !is_valid_char(self.char()) {
2140            self.parser().pos.set(start);
2141            return Ok(None);
2142        }
2143
2144        // Now collect up our chars until we see a '}'.
2145        let mut scratch = self.parser().scratch.borrow_mut();
2146        scratch.clear();
2147        while !self.is_eof() && is_valid_char(self.char()) {
2148            scratch.push(self.char());
2149            self.bump_and_bump_space();
2150        }
2151        if self.is_eof() || self.char() != '}' {
2152            return Err(self.error(
2153                Span::new(start, self.pos()),
2154                ast::ErrorKind::SpecialWordBoundaryUnclosed,
2155            ));
2156        }
2157        let end = self.pos();
2158        self.bump();
2159        let kind = match scratch.as_str() {
2160            "start" => ast::AssertionKind::WordBoundaryStart,
2161            "end" => ast::AssertionKind::WordBoundaryEnd,
2162            "start-half" => ast::AssertionKind::WordBoundaryStartHalf,
2163            "end-half" => ast::AssertionKind::WordBoundaryEndHalf,
2164            _ => {
2165                return Err(self.error(
2166                    Span::new(start_contents, end),
2167                    ast::ErrorKind::SpecialWordBoundaryUnrecognized,
2168                ))
2169            }
2170        };
2171        Ok(Some(kind))
2172    }
2173
2174    #[inline(never)]
2175    fn parse_octal(&self) -> Literal {
2176        assert!(self.parser().octal);
2177        assert!('0' <= self.char() && self.char() <= '7');
2178        let start = self.pos();
2179        // Parse up to two more digits.
2180        while self.bump()
2181            && '0' <= self.char()
2182            && self.char() <= '7'
2183            && self.pos().offset - start.offset <= 2
2184        {}
2185        let end = self.pos();
2186        let octal = &self.pattern()[start.offset..end.offset];
2187        // Parsing the octal should never fail since the above guarantees a
2188        // valid number.
2189        let codepoint = u32::from_str_radix(octal, 8).expect("valid octal number");
2190        // The max value for 3 digit octal is 0777 = 511 and [0, 511] has no
2191        // invalid Unicode scalar values.
2192        let c = char::from_u32(codepoint).expect("Unicode scalar value");
2193        Literal {
2194            span: Span::new(start, end),
2195            kind: LiteralKind::Octal,
2196            c,
2197        }
2198    }
2199
2200    #[inline(never)]
2201    fn parse_hex(&self) -> Result<Literal> {
2202        assert!(self.char() == 'x' || self.char() == 'u' || self.char() == 'U');
2203
2204        let hex_kind = match self.char() {
2205            'x' => HexLiteralKind::X,
2206            'u' => HexLiteralKind::UnicodeShort,
2207            _ => HexLiteralKind::UnicodeLong,
2208        };
2209        if !self.bump_and_bump_space() {
2210            return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2211        }
2212        if self.char() == '{' {
2213            self.parse_hex_brace(hex_kind)
2214        } else {
2215            self.parse_hex_digits(hex_kind)
2216        }
2217    }
2218
2219    #[inline(never)]
2220    fn parse_hex_digits(&self, kind: HexLiteralKind) -> Result<Literal> {
2221        let mut scratch = self.parser().scratch.borrow_mut();
2222        scratch.clear();
2223
2224        let start = self.pos();
2225        for i in 0..kind.digits() {
2226            if i > 0 && !self.bump_and_bump_space() {
2227                return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2228            }
2229            if !is_hex(self.char()) {
2230                return Err(self.error(self.span_char(), ast::ErrorKind::EscapeHexInvalidDigit));
2231            }
2232            scratch.push(self.char());
2233        }
2234        // The final bump just moves the parser past the literal, which may
2235        // be EOF.
2236        self.bump_and_bump_space();
2237        let end = self.pos();
2238        let hex = scratch.as_str();
2239        match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
2240            None => Err(self.error(Span::new(start, end), ast::ErrorKind::EscapeHexInvalid)),
2241            Some(c) => Ok(Literal {
2242                span: Span::new(start, end),
2243                kind: LiteralKind::HexFixed(kind),
2244                c,
2245            }),
2246        }
2247    }
2248
2249    #[inline(never)]
2250    fn parse_hex_brace(&self, kind: HexLiteralKind) -> Result<Literal> {
2251        let mut scratch = self.parser().scratch.borrow_mut();
2252        scratch.clear();
2253
2254        let brace_pos = self.pos();
2255        let start = self.span_char().end;
2256        while self.bump_and_bump_space() && self.char() != '}' {
2257            if !is_hex(self.char()) {
2258                return Err(self.error(self.span_char(), ast::ErrorKind::EscapeHexInvalidDigit));
2259            }
2260            scratch.push(self.char());
2261        }
2262        if self.is_eof() {
2263            return Err(self.error(
2264                Span::new(brace_pos, self.pos()),
2265                ast::ErrorKind::EscapeUnexpectedEof,
2266            ));
2267        }
2268        let end = self.pos();
2269        let hex = scratch.as_str();
2270        assert_eq!(self.char(), '}');
2271        self.bump_and_bump_space();
2272
2273        if hex.is_empty() {
2274            return Err(self.error(
2275                Span::new(brace_pos, self.pos()),
2276                ast::ErrorKind::EscapeHexEmpty,
2277            ));
2278        }
2279        match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
2280            None => Err(self.error(Span::new(start, end), ast::ErrorKind::EscapeHexInvalid)),
2281            Some(c) => Ok(Literal {
2282                span: Span::new(start, self.pos()),
2283                kind: LiteralKind::HexBrace(kind),
2284                c,
2285            }),
2286        }
2287    }
2288
2289    fn parse_decimal(&self) -> Result<u32> {
2290        let mut scratch = self.parser().scratch.borrow_mut();
2291        scratch.clear();
2292
2293        while !self.is_eof() && self.char().is_whitespace() {
2294            self.bump();
2295        }
2296        let start = self.pos();
2297        while !self.is_eof() && '0' <= self.char() && self.char() <= '9' {
2298            scratch.push(self.char());
2299            self.bump_and_bump_space();
2300        }
2301        let span = Span::new(start, self.pos());
2302        while !self.is_eof() && self.char().is_whitespace() {
2303            self.bump_and_bump_space();
2304        }
2305        let digits = scratch.as_str();
2306        if digits.is_empty() {
2307            return Err(self.error(span, ast::ErrorKind::DecimalEmpty));
2308        }
2309        match u32::from_str_radix(digits, 10).ok() {
2310            Some(n) => Ok(n),
2311            None => Err(self.error(span, ast::ErrorKind::DecimalInvalid)),
2312        }
2313    }
2314
2315    #[inline(never)]
2316    fn parse_set_class(&self) -> Result<ClassBracketed> {
2317        assert_eq!(self.char(), '[');
2318
2319        let mut union = ClassSetUnion {
2320            span: self.span(),
2321            items: vec![],
2322        };
2323        loop {
2324            self.bump_space();
2325            if self.is_eof() {
2326                return Err(self.unclosed_class_error());
2327            }
2328            match self.char() {
2329                '[' => {
2330                    // If we've already parsed the opening bracket, then
2331                    // attempt to treat this as the beginning of an ASCII
2332                    // class. If ASCII class parsing fails, then the parser
2333                    // backs up to `[`.
2334                    if !self.parser().stack_class.borrow().is_empty() {
2335                        if let Some(cls) = self.maybe_parse_ascii_class() {
2336                            union.push(ClassSetItem::Ascii(cls));
2337                            continue;
2338                        }
2339                    }
2340                    union = self.push_class_open(union)?;
2341                }
2342                ']' => match self.pop_class(union)? {
2343                    Either::Left(nested_union) => {
2344                        union = nested_union;
2345                    }
2346                    Either::Right(class) => return Ok(class),
2347                },
2348                '&' if self.peek() == Some('&') => {
2349                    assert!(self.bump_if("&&"));
2350                    union = self.push_class_op(ClassSetBinaryOpKind::Intersection, union);
2351                }
2352                '-' if self.peek() == Some('-') => {
2353                    assert!(self.bump_if("--"));
2354                    union = self.push_class_op(ClassSetBinaryOpKind::Difference, union);
2355                }
2356                '~' if self.peek() == Some('~') => {
2357                    assert!(self.bump_if("~~"));
2358                    union = self.push_class_op(ClassSetBinaryOpKind::SymmetricDifference, union);
2359                }
2360                _ => {
2361                    union.push(self.parse_set_class_range()?);
2362                }
2363            }
2364        }
2365    }
2366
2367    #[inline(never)]
2368    fn parse_set_class_range(&self) -> Result<ClassSetItem> {
2369        let prim1 = self.parse_set_class_item()?;
2370        self.bump_space();
2371        if self.is_eof() {
2372            return Err(self.unclosed_class_error());
2373        }
2374        if self.char() != '-' || self.peek_space() == Some(']') || self.peek_space() == Some('-') {
2375            return prim1.into_class_set_item(self);
2376        }
2377        if !self.bump_and_bump_space() {
2378            return Err(self.unclosed_class_error());
2379        }
2380        let prim2 = self.parse_set_class_item()?;
2381        let range = ClassSetRange {
2382            span: Span::new(prim1.span().start, prim2.span().end),
2383            start: prim1.into_class_literal(self)?,
2384            end: prim2.into_class_literal(self)?,
2385        };
2386        if !range.is_valid() {
2387            return Err(self.error(range.span, ast::ErrorKind::ClassRangeInvalid));
2388        }
2389        Ok(ClassSetItem::Range(range))
2390    }
2391
2392    #[inline(never)]
2393    fn parse_set_class_item(&self) -> Result<Primitive> {
2394        if self.char() == '\\' {
2395            self.parse_escape()
2396        } else {
2397            let x = Primitive::Literal(Literal {
2398                span: self.span_char(),
2399                kind: LiteralKind::Verbatim,
2400                c: self.char(),
2401            });
2402            self.bump();
2403            Ok(x)
2404        }
2405    }
2406
2407    #[inline(never)]
2408    fn parse_set_class_open(&self) -> Result<(ClassBracketed, ClassSetUnion)> {
2409        assert_eq!(self.char(), '[');
2410        let start = self.pos();
2411        if !self.bump_and_bump_space() {
2412            return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2413        }
2414
2415        let negated = if self.char() != '^' {
2416            false
2417        } else {
2418            if !self.bump_and_bump_space() {
2419                return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2420            }
2421            true
2422        };
2423        // Accept any number of `-` as literal `-`.
2424        let mut union = ClassSetUnion {
2425            span: self.span(),
2426            items: vec![],
2427        };
2428        while self.char() == '-' {
2429            union.push(ClassSetItem::Literal(Literal {
2430                span: self.span_char(),
2431                kind: LiteralKind::Verbatim,
2432                c: '-',
2433            }));
2434            if !self.bump_and_bump_space() {
2435                return Err(self.error(Span::new(start, start), ast::ErrorKind::ClassUnclosed));
2436            }
2437        }
2438        // If `]` is the *first* char in a set, then interpret it as a literal
2439        // `]`. That is, an empty class is impossible to write.
2440        if union.items.is_empty() && self.char() == ']' {
2441            union.push(ClassSetItem::Literal(Literal {
2442                span: self.span_char(),
2443                kind: LiteralKind::Verbatim,
2444                c: ']',
2445            }));
2446            if !self.bump_and_bump_space() {
2447                return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2448            }
2449        }
2450        let set = ClassBracketed {
2451            span: Span::new(start, self.pos()),
2452            negated,
2453            kind: ClassSet::union(ClassSetUnion {
2454                span: Span::new(union.span.start, union.span.start),
2455                items: vec![],
2456            }),
2457        };
2458        Ok((set, union))
2459    }
2460
2461    #[inline(never)]
2462    fn maybe_parse_ascii_class(&self) -> Option<ClassAscii> {
2463        assert_eq!(self.char(), '[');
2464        // If parsing fails, then we back up the parser to this starting point.
2465        let start = self.pos();
2466        let mut negated = false;
2467        if !self.bump() || self.char() != ':' {
2468            self.parser().pos.set(start);
2469            return None;
2470        }
2471        if !self.bump() {
2472            self.parser().pos.set(start);
2473            return None;
2474        }
2475        if self.char() == '^' {
2476            negated = true;
2477            if !self.bump() {
2478                self.parser().pos.set(start);
2479                return None;
2480            }
2481        }
2482        let name_start = self.offset();
2483        while self.char() != ':' && self.bump() {}
2484        if self.is_eof() {
2485            self.parser().pos.set(start);
2486            return None;
2487        }
2488        let name = &self.pattern()[name_start..self.offset()];
2489        if !self.bump_if(":]") {
2490            self.parser().pos.set(start);
2491            return None;
2492        }
2493        let kind = match regex_syntax::ast::ClassAsciiKind::from_name(name) {
2494            Some(kind) => kind,
2495            None => {
2496                self.parser().pos.set(start);
2497                return None;
2498            }
2499        };
2500        Some(ClassAscii {
2501            span: Span::new(start, self.pos()),
2502            kind,
2503            negated,
2504        })
2505    }
2506
2507    #[inline(never)]
2508    fn parse_unicode_class(&self) -> Result<ClassUnicode> {
2509        assert!(self.char() == 'p' || self.char() == 'P');
2510
2511        let mut scratch = self.parser().scratch.borrow_mut();
2512        scratch.clear();
2513
2514        let negated = self.char() == 'P';
2515        if !self.bump_and_bump_space() {
2516            return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2517        }
2518        let (start, kind) = if self.char() == '{' {
2519            let start = self.span_char().end;
2520            while self.bump_and_bump_space() && self.char() != '}' {
2521                scratch.push(self.char());
2522            }
2523            if self.is_eof() {
2524                return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2525            }
2526            assert_eq!(self.char(), '}');
2527            self.bump();
2528
2529            let name = scratch.as_str();
2530            if let Some(i) = name.find("!=") {
2531                (
2532                    start,
2533                    ClassUnicodeKind::NamedValue {
2534                        op: ClassUnicodeOpKind::NotEqual,
2535                        name: name[..i].to_string(),
2536                        value: name[i + 2..].to_string(),
2537                    },
2538                )
2539            } else if let Some(i) = name.find(':') {
2540                (
2541                    start,
2542                    ClassUnicodeKind::NamedValue {
2543                        op: ClassUnicodeOpKind::Colon,
2544                        name: name[..i].to_string(),
2545                        value: name[i + 1..].to_string(),
2546                    },
2547                )
2548            } else if let Some(i) = name.find('=') {
2549                (
2550                    start,
2551                    ClassUnicodeKind::NamedValue {
2552                        op: ClassUnicodeOpKind::Equal,
2553                        name: name[..i].to_string(),
2554                        value: name[i + 1..].to_string(),
2555                    },
2556                )
2557            } else {
2558                (start, ClassUnicodeKind::Named(name.to_string()))
2559            }
2560        } else {
2561            let start = self.pos();
2562            let c = self.char();
2563            if c == '\\' {
2564                return Err(self.error(self.span_char(), ast::ErrorKind::UnicodeClassInvalid));
2565            }
2566            self.bump_and_bump_space();
2567            let kind = ClassUnicodeKind::OneLetter(c);
2568            (start, kind)
2569        };
2570        Ok(ClassUnicode {
2571            span: Span::new(start, self.pos()),
2572            negated,
2573            kind,
2574        })
2575    }
2576
2577    #[inline(never)]
2578    fn parse_perl_class(&self) -> ClassPerl {
2579        let c = self.char();
2580        let span = self.span_char();
2581        self.bump();
2582        let (negated, kind) = match c {
2583            'd' => (false, regex_syntax::ast::ClassPerlKind::Digit),
2584            'D' => (true, regex_syntax::ast::ClassPerlKind::Digit),
2585            's' => (false, regex_syntax::ast::ClassPerlKind::Space),
2586            'S' => (true, regex_syntax::ast::ClassPerlKind::Space),
2587            'w' => (false, regex_syntax::ast::ClassPerlKind::Word),
2588            'W' => (true, regex_syntax::ast::ClassPerlKind::Word),
2589            c => panic!("expected valid Perl class but got '{}'", c),
2590        };
2591        ClassPerl {
2592            span,
2593            kind,
2594            negated,
2595        }
2596    }
2597}
2598
2599pub fn parse_ast<'s>(
2600    tb: &mut TB<'s>,
2601    pattern: &'s str,
2602) -> std::result::Result<NodeId, ResharpError> {
2603    let mut p: ResharpParser<'s> = ResharpParser::new(pattern);
2604    p.parse(tb)
2605}
2606
2607pub fn parse_ast_with<'s>(
2608    tb: &mut TB<'s>,
2609    pattern: &'s str,
2610    flags: &PatternFlags,
2611) -> std::result::Result<NodeId, ResharpError> {
2612    let mut p: ResharpParser<'s> = ResharpParser::with_flags(pattern, flags);
2613    p.parse(tb)
2614}
resharp_parser/lib.rs

resharp_parser/
lib.rs