resharp_parser/
lib.rs

1//! Parser for resharp regex patterns.
2//!
3//! Converts regex pattern strings into the node representation used by resharp-algebra.
4
5#![warn(dead_code)]
6mod ast;
7use std::cell::{Cell, RefCell};
8
9use ast::{Ast, Concat, ErrorKind, GroupKind, LookaroundKind};
10use regex_syntax::{
11    ast::{
12        ClassAscii, ClassBracketed, ClassPerl, ClassSet, ClassSetBinaryOpKind, ClassSetItem,
13        ClassSetRange, ClassSetUnion, ClassUnicode, ClassUnicodeKind, ClassUnicodeOpKind,
14        HexLiteralKind, Literal, LiteralKind, Position, Span, SpecialLiteralKind,
15    },
16    hir::{
17        self,
18        translate::{Translator, TranslatorBuilder},
19    },
20    utf8::Utf8Sequences,
21};
22use resharp_algebra::NodeId;
23
24type TB<'s> = resharp_algebra::RegexBuilder;
25
26#[derive(Clone, Copy, PartialEq)]
27enum WordCharKind {
28    Word,
29    NonWord,
30    MaybeWord,
31    MaybeNonWord,
32    Unknown,
33    Edge,
34}
35
36fn is_word_byte(b: u8) -> bool {
37    b.is_ascii_alphanumeric() || b == b'_'
38}
39
40
41#[derive(Clone, Debug, Eq, PartialEq)]
42enum Primitive {
43    Literal(Literal),
44    Assertion(ast::Assertion),
45    Dot(Span),
46    Top(Span),
47    Perl(ClassPerl),
48    Unicode(ClassUnicode),
49}
50
51impl Primitive {
52    fn span(&self) -> &Span {
53        match *self {
54            Primitive::Literal(ref x) => &x.span,
55            Primitive::Assertion(ref x) => &x.span,
56            Primitive::Dot(ref span) => span,
57            Primitive::Top(ref span) => span,
58            Primitive::Perl(ref x) => &x.span,
59            Primitive::Unicode(ref x) => &x.span,
60        }
61    }
62
63    fn into_ast(self) -> Ast {
64        match self {
65            Primitive::Literal(lit) => Ast::literal(lit),
66            Primitive::Assertion(assert) => Ast::assertion(assert),
67            Primitive::Dot(span) => Ast::dot(span),
68            Primitive::Top(span) => Ast::top(span),
69            Primitive::Perl(cls) => Ast::class_perl(cls),
70            Primitive::Unicode(cls) => Ast::class_unicode(cls),
71        }
72    }
73
74    fn into_class_set_item(self, p: &ResharpParser) -> Result<regex_syntax::ast::ClassSetItem> {
75        use self::Primitive::*;
76        use regex_syntax::ast::ClassSetItem;
77
78        match self {
79            Literal(lit) => Ok(ClassSetItem::Literal(lit)),
80            Perl(cls) => Ok(ClassSetItem::Perl(cls)),
81            Unicode(cls) => Ok(ClassSetItem::Unicode(cls)),
82            x => Err(p.error(*x.span(), ast::ErrorKind::ClassEscapeInvalid)),
83        }
84    }
85
86    fn into_class_literal(self, p: &ResharpParser) -> Result<Literal> {
87        use self::Primitive::*;
88
89        match self {
90            Literal(lit) => Ok(lit),
91            x => Err(p.error(*x.span(), ast::ErrorKind::ClassRangeLiteral)),
92        }
93    }
94}
95
96#[derive(Clone, Debug, Eq, PartialEq)]
97pub enum Either<Left, Right> {
98    Left(Left),
99    Right(Right),
100}
101
102#[derive(Clone, Debug, Eq, PartialEq)]
103pub struct ResharpError {
104    /// The kind of error.
105    pub kind: ErrorKind,
106    /// The original pattern that the parser generated the error from. Every
107    /// span in an error is a valid range into this string.
108    pattern: String,
109    /// The span of this error.
110    pub span: Span,
111}
112
113impl std::fmt::Display for ResharpError {
114    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
115        write!(f, "{:?}: {:?}", self.kind, self.span)
116    }
117}
118impl std::error::Error for ResharpError {}
119
120type Result<T> = core::result::Result<T, ResharpError>;
121
122#[derive(Clone, Debug)]
123enum GroupState {
124    /// This state is pushed whenever an opening group is found.
125    Group {
126        /// The concatenation immediately preceding the opening group.
127        concat: Concat,
128        /// The group that has been opened. Its sub-AST is always empty.
129        group: ast::Group,
130        /// Whether this group has the `x` flag enabled or not.
131        ignore_whitespace: bool,
132    },
133    /// This state is pushed whenever a new alternation branch is found. If
134    /// an alternation branch is found and this state is at the top of the
135    /// stack, then this state should be modified to include the new
136    /// alternation.
137    Alternation(ast::Alternation),
138    Intersection(ast::Intersection),
139}
140
141#[derive(Clone, Debug)]
142enum ClassState {
143    /// This state is pushed whenever an opening bracket is found.
144    Open {
145        /// The union of class items immediately preceding this class.
146        union: regex_syntax::ast::ClassSetUnion,
147        /// The class that has been opened. Typically this just corresponds
148        /// to the `[`, but it can also include `[^` since `^` indicates
149        /// negation of the class.
150        set: regex_syntax::ast::ClassBracketed,
151    },
152    /// This state is pushed when a operator is seen. When popped, the stored
153    /// set becomes the left hand side of the operator.
154    Op {
155        /// The type of the operation, i.e., &&, -- or ~~.
156        kind: regex_syntax::ast::ClassSetBinaryOpKind,
157        /// The left-hand side of the operator.
158        lhs: regex_syntax::ast::ClassSet,
159    },
160}
161
162/// RE# syntax parser based on the regex-syntax crate.
163pub struct ResharpParser<'s> {
164    perl_classes: Vec<(bool, regex_syntax::ast::ClassPerlKind, NodeId)>,
165    unicode_classes: resharp_algebra::UnicodeClassCache,
166    pub translator: regex_syntax::hir::translate::Translator,
167    pub pattern: &'s str,
168    pos: Cell<Position>,
169    capture_index: Cell<u32>,
170    octal: bool,
171    empty_min_range: bool,
172    ignore_whitespace: Cell<bool>,
173    comments: RefCell<Vec<ast::Comment>>,
174    stack_group: RefCell<Vec<GroupState>>,
175    stack_class: RefCell<Vec<ClassState>>,
176    capture_names: RefCell<Vec<ast::CaptureName>>,
177    scratch: RefCell<String>,
178}
179
180fn specialize_err<T>(result: Result<T>, from: ast::ErrorKind, to: ast::ErrorKind) -> Result<T> {
181    if let Err(e) = result {
182        if e.kind == from {
183            Err(ResharpError {
184                kind: to,
185                pattern: e.pattern,
186                span: e.span,
187            })
188        } else {
189            Err(e)
190        }
191    } else {
192        result
193    }
194}
195
196fn is_capture_char(c: char, first: bool) -> bool {
197    if first {
198        c == '_' || c.is_alphabetic()
199    } else {
200        c == '_' || c == '.' || c == '[' || c == ']' || c.is_alphanumeric()
201    }
202}
203
204pub fn is_meta_character(c: char) -> bool {
205    match c {
206        '\\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' | '[' | ']' | '{' | '}' | '^' | '$'
207        | '#' | '&' | '-' | '~' | '_' => true,
208        _ => false,
209    }
210}
211
212/// escapes all resharp meta characters in `text`.
213pub fn escape(text: &str) -> String {
214    let mut buf = String::new();
215    escape_into(text, &mut buf);
216    buf
217}
218
219/// escapes all resharp meta characters in `text` and appends to `buf`.
220pub fn escape_into(text: &str, buf: &mut String) {
221    buf.reserve(text.len());
222    for c in text.chars() {
223        if is_meta_character(c) {
224            buf.push('\\');
225        }
226        buf.push(c);
227    }
228}
229
230pub fn is_escapeable_character(c: char) -> bool {
231    // Certainly escapeable if it's a meta character.
232    if is_meta_character(c) {
233        return true;
234    }
235    // Any character that isn't ASCII is definitely not escapeable. There's
236    // no real need to allow things like \☃ right?
237    if !c.is_ascii() {
238        return false;
239    }
240    // Otherwise, we basically say that everything is escapeable unless it's a
241    // letter or digit. Things like \3 are either octal (when enabled) or an
242    // error, and we should keep it that way. Otherwise, letters are reserved
243    // for adding new syntax in a backwards compatible way.
244    match c {
245        '0'..='9' | 'A'..='Z' | 'a'..='z' => false,
246        // While not currently supported, we keep these as not escapeable to
247        // give us some flexibility with respect to supporting the \< and
248        // \> word boundary assertions in the future. By rejecting them as
249        // escapeable, \< and \> will result in a parse error. Thus, we can
250        // turn them into something else in the future without it being a
251        // backwards incompatible change.
252        //
253        // OK, now we support \< and \>, and we need to retain them as *not*
254        // escapeable here since the escape sequence is significant.
255        '<' | '>' => false,
256        _ => true,
257    }
258}
259
260fn is_hex(c: char) -> bool {
261    ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F')
262}
263
264impl<'s> ResharpParser<'s> {
265    fn default_translator_builder() -> TranslatorBuilder {
266        let mut trb = TranslatorBuilder::new();
267        trb.unicode(true);
268        trb.utf8(false);
269        trb
270    }
271
272    pub fn new(pattern: &'s str) -> Self {
273        let trb = Self::default_translator_builder();
274        Self {
275            translator: trb.build(),
276            pattern,
277            perl_classes: vec![],
278            unicode_classes: resharp_algebra::UnicodeClassCache::default(),
279            pos: Cell::new(Position::new(0, 0, 0)),
280            capture_index: Cell::new(0),
281            octal: false,
282            empty_min_range: false,
283            ignore_whitespace: Cell::new(false),
284            comments: RefCell::new(vec![]),
285            stack_group: RefCell::new(vec![]),
286            stack_class: RefCell::new(vec![]),
287            capture_names: RefCell::new(vec![]),
288            scratch: RefCell::new(String::new()),
289        }
290    }
291
292    /// Return a reference to the parser state.
293    fn parser(&'_ self) -> &'_ ResharpParser<'_> {
294        self
295    }
296
297    /// Return a reference to the pattern being parsed.
298    fn pattern(&self) -> &str {
299        self.pattern
300    }
301
302    /// Create a new error with the given span and error type.
303    fn error(&self, span: Span, kind: ast::ErrorKind) -> ResharpError {
304        ResharpError {
305            kind,
306            pattern: self.pattern().to_string(),
307            span,
308        }
309    }
310
311    fn unsupported_error(&self, _: regex_syntax::hir::Error) -> ResharpError {
312        let emptyspan = Span::splat(self.pos());
313        let inner = self.error(emptyspan, ast::ErrorKind::UnsupportedResharpRegex);
314        inner
315    }
316
317    /// Return the current offset of the parser.
318    ///
319    /// The offset starts at `0` from the beginning of the regular expression
320    /// pattern string.
321    fn offset(&self) -> usize {
322        self.parser().pos.get().offset
323    }
324
325    /// Return the current line number of the parser.
326    ///
327    /// The line number starts at `1`.
328    fn line(&self) -> usize {
329        self.parser().pos.get().line
330    }
331
332    /// Return the current column of the parser.
333    ///
334    /// The column number starts at `1` and is reset whenever a `\n` is seen.
335    fn column(&self) -> usize {
336        self.parser().pos.get().column
337    }
338
339    /// Return the next capturing index. Each subsequent call increments the
340    /// internal index.
341    ///
342    /// The span given should correspond to the location of the opening
343    /// parenthesis.
344    ///
345    /// If the capture limit is exceeded, then an error is returned.
346    fn next_capture_index(&self, span: Span) -> Result<u32> {
347        let current = self.parser().capture_index.get();
348        let i = current
349            .checked_add(1)
350            .ok_or_else(|| self.error(span, ast::ErrorKind::CaptureLimitExceeded))?;
351        self.parser().capture_index.set(i);
352        Ok(i)
353    }
354
355    fn add_capture_name(&self, cap: &ast::CaptureName) -> Result<()> {
356        let mut names = self.parser().capture_names.borrow_mut();
357        match names.binary_search_by_key(&cap.name.as_str(), |c| c.name.as_str()) {
358            Err(i) => {
359                names.insert(i, cap.clone());
360                Ok(())
361            }
362            Ok(i) => Err(self.error(
363                cap.span,
364                ast::ErrorKind::GroupNameDuplicate {
365                    original: names[i].span,
366                },
367            )),
368        }
369    }
370
371    fn ignore_whitespace(&self) -> bool {
372        self.parser().ignore_whitespace.get()
373    }
374
375    fn char(&self) -> char {
376        self.char_at(self.offset())
377    }
378
379    fn char_at(&self, i: usize) -> char {
380        self.pattern()[i..]
381            .chars()
382            .next()
383            .unwrap_or_else(|| panic!("expected char at offset {}", i))
384    }
385
386    fn bump(&self) -> bool {
387        if self.is_eof() {
388            return false;
389        }
390        let Position {
391            mut offset,
392            mut line,
393            mut column,
394        } = self.pos();
395        if self.char() == '\n' {
396            line = line.checked_add(1).unwrap();
397            column = 1;
398        } else {
399            column = column.checked_add(1).unwrap();
400        }
401        offset += self.char().len_utf8();
402        self.parser().pos.set(Position {
403            offset,
404            line,
405            column,
406        });
407        self.pattern()[self.offset()..].chars().next().is_some()
408    }
409
410    fn bump_if(&self, prefix: &str) -> bool {
411        if self.pattern()[self.offset()..].starts_with(prefix) {
412            for _ in 0..prefix.chars().count() {
413                self.bump();
414            }
415            true
416        } else {
417            false
418        }
419    }
420
421    fn is_lookaround_prefix(&self) -> Option<(bool, bool)> {
422        if self.bump_if("?=") {
423            return Some((true, true));
424        }
425        if self.bump_if("?!") {
426            return Some((true, false));
427        }
428        if self.bump_if("?<=") {
429            return Some((false, true));
430        }
431        if self.bump_if("?<!") {
432            return Some((false, false));
433        }
434        return None;
435        // self.bump_if("?=") || self.bump_if("?!") || self.bump_if("?<=") || self.bump_if("?<!")
436    }
437
438    fn bump_and_bump_space(&self) -> bool {
439        if !self.bump() {
440            return false;
441        }
442        self.bump_space();
443        !self.is_eof()
444    }
445
446    fn bump_space(&self) {
447        if !self.ignore_whitespace() {
448            return;
449        }
450        while !self.is_eof() {
451            if self.char().is_whitespace() {
452                self.bump();
453            } else if self.char() == '#' {
454                let start = self.pos();
455                let mut comment_text = String::new();
456                self.bump();
457                while !self.is_eof() {
458                    let c = self.char();
459                    self.bump();
460                    if c == '\n' {
461                        break;
462                    }
463                    comment_text.push(c);
464                }
465                let comment = ast::Comment {
466                    span: Span::new(start, self.pos()),
467                    comment: comment_text,
468                };
469                self.parser().comments.borrow_mut().push(comment);
470            } else {
471                break;
472            }
473        }
474    }
475
476    /// Peek at the next character in the input without advancing the parser.
477    ///
478    /// If the input has been exhausted, then this returns `None`.
479    fn peek(&self) -> Option<char> {
480        if self.is_eof() {
481            return None;
482        }
483        self.pattern()[self.offset() + self.char().len_utf8()..]
484            .chars()
485            .next()
486    }
487
488    /// Like peek, but will ignore spaces when the parser is in whitespace
489    /// insensitive mode.
490    fn peek_space(&self) -> Option<char> {
491        if !self.ignore_whitespace() {
492            return self.peek();
493        }
494        if self.is_eof() {
495            return None;
496        }
497        let mut start = self.offset() + self.char().len_utf8();
498        let mut in_comment = false;
499        for (i, c) in self.pattern()[start..].char_indices() {
500            if c.is_whitespace() {
501                continue;
502            } else if !in_comment && c == '#' {
503                in_comment = true;
504            } else if in_comment && c == '\n' {
505                in_comment = false;
506            } else {
507                start += i;
508                break;
509            }
510        }
511        self.pattern()[start..].chars().next()
512    }
513
514    /// Returns true if the next call to `bump` would return false.
515    fn is_eof(&self) -> bool {
516        self.offset() == self.pattern().len()
517    }
518
519    /// Return the current position of the parser, which includes the offset,
520    /// line and column.
521    fn pos(&self) -> Position {
522        self.parser().pos.get()
523    }
524
525    /// Create a span at the current position of the parser. Both the start
526    /// and end of the span are set.
527    fn span(&self) -> Span {
528        Span::splat(self.pos())
529    }
530
531    /// Create a span that covers the current character.
532    fn span_char(&self) -> Span {
533        let mut next = Position {
534            offset: self.offset().checked_add(self.char().len_utf8()).unwrap(),
535            line: self.line(),
536            column: self.column().checked_add(1).unwrap(),
537        };
538        if self.char() == '\n' {
539            next.line += 1;
540            next.column = 1;
541        }
542        Span::new(self.pos(), next)
543    }
544
545    /// Parse and push a single alternation on to the parser's internal stack.
546    /// If the top of the stack already has an alternation, then add to that
547    /// instead of pushing a new one.
548    ///
549    /// The concatenation given corresponds to a single alternation branch.
550    /// The concatenation returned starts the next branch and is empty.
551    ///
552    /// This assumes the parser is currently positioned at `|` and will advance
553    /// the parser to the character following `|`.
554    #[inline(never)]
555    fn push_alternate(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
556        assert_eq!(self.char(), '|');
557        concat.span.end = self.pos();
558        self.push_or_add_alternation(concat);
559        self.bump();
560        Ok(ast::Concat {
561            span: self.span(),
562            asts: vec![],
563        })
564    }
565
566    /// Pushes or adds the given branch of an alternation to the parser's
567    /// internal stack of state.
568    fn push_or_add_alternation(&self, concat: Concat) {
569        use self::GroupState::*;
570
571        let mut stack = self.parser().stack_group.borrow_mut();
572        if let Some(&mut Alternation(ref mut alts)) = stack.last_mut() {
573            alts.asts.push(concat.into_ast());
574            return;
575        }
576        stack.push(Alternation(ast::Alternation {
577            span: Span::new(concat.span.start, self.pos()),
578            asts: vec![concat.into_ast()],
579        }));
580    }
581
582    #[inline(never)]
583    fn push_intersect(&self, mut concat: Concat) -> Result<Concat> {
584        assert_eq!(self.char(), '&');
585        concat.span.end = self.pos();
586        self.push_or_add_intersect(concat);
587        self.bump();
588        Ok(Concat {
589            span: self.span(),
590            asts: vec![],
591        })
592    }
593
594    /// Pushes or adds the given branch of an alternation to the parser's
595    /// internal stack of state.
596    fn push_or_add_intersect(&self, concat: Concat) {
597        use self::GroupState::*;
598
599        let mut stack = self.parser().stack_group.borrow_mut();
600        if let Some(&mut Intersection(ref mut alts)) = stack.last_mut() {
601            alts.asts.push(concat.into_ast());
602            return;
603        }
604        stack.push(Intersection(ast::Intersection {
605            span: Span::new(concat.span.start, self.pos()),
606            asts: vec![concat.into_ast()],
607        }));
608    }
609
610    /// Parse and push a group AST (and its parent concatenation) on to the
611    /// parser's internal stack. Return a fresh concatenation corresponding
612    /// to the group's sub-AST.
613    ///
614    /// If a set of flags was found (with no group), then the concatenation
615    /// is returned with that set of flags added.
616    ///
617    /// This assumes that the parser is currently positioned on the opening
618    /// parenthesis. It advances the parser to the character at the start
619    /// of the sub-expression (or adjoining expression).
620    ///
621    /// If there was a problem parsing the start of the group, then an error
622    /// is returned.
623    #[inline(never)]
624    fn push_group(&self, mut concat: Concat) -> Result<Concat> {
625        assert_eq!(self.char(), '(');
626        match self.parse_group()? {
627            Either::Left(set) => {
628                let ignore = set.flags.flag_state(ast::Flag::IgnoreWhitespace);
629                if let Some(v) = ignore {
630                    self.parser().ignore_whitespace.set(v);
631                }
632
633                concat.asts.push(Ast::flags(set));
634                Ok(concat)
635            }
636            Either::Right(group) => {
637                let old_ignore_whitespace = self.ignore_whitespace();
638                let new_ignore_whitespace = group
639                    .flags()
640                    .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
641                    .unwrap_or(old_ignore_whitespace);
642                self.parser()
643                    .stack_group
644                    .borrow_mut()
645                    .push(GroupState::Group {
646                        concat,
647                        group,
648                        ignore_whitespace: old_ignore_whitespace,
649                    });
650                self.parser().ignore_whitespace.set(new_ignore_whitespace);
651                Ok(Concat {
652                    span: self.span(),
653                    asts: vec![],
654                })
655            }
656        }
657    }
658
659    #[inline(never)]
660    fn push_compl_group(&self, concat: Concat) -> Result<Concat> {
661        assert_eq!(self.char(), '~');
662        self.bump();
663        if self.is_eof() || self.char() != '(' {
664            return Err(self.error(self.span(), ast::ErrorKind::ComplementGroupExpected));
665        }
666        let open_span = self.span_char();
667        self.bump();
668        let group = ast::Group {
669            span: open_span,
670            kind: ast::GroupKind::Complement,
671            ast: Box::new(Ast::empty(self.span())),
672        };
673
674        let old_ignore_whitespace = self.ignore_whitespace();
675        let new_ignore_whitespace = group
676            .flags()
677            .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
678            .unwrap_or(old_ignore_whitespace);
679        self.parser()
680            .stack_group
681            .borrow_mut()
682            .push(GroupState::Group {
683                concat,
684                group,
685                ignore_whitespace: old_ignore_whitespace,
686            });
687        self.parser().ignore_whitespace.set(new_ignore_whitespace);
688        Ok(Concat {
689            span: self.span(),
690            asts: vec![],
691        })
692    }
693
694    /// Pop a group AST from the parser's internal stack and set the group's
695    /// AST to the given concatenation. Return the concatenation containing
696    /// the group.
697    ///
698    /// This assumes that the parser is currently positioned on the closing
699    /// parenthesis and advances the parser to the character following the `)`.
700    ///
701    /// If no such group could be popped, then an unopened group error is
702    /// returned.
703    #[inline(never)]
704    fn pop_group(&self, mut group_concat: Concat) -> Result<Concat> {
705        use self::GroupState::*;
706        assert_eq!(self.char(), ')');
707        let mut stack = self.parser().stack_group.borrow_mut();
708        let topstack = stack.pop();
709
710        let (mut prior_concat, mut group, ignore_whitespace, alt) = match topstack {
711            Some(Group {
712                concat,
713                group,
714                ignore_whitespace,
715            }) => (concat, group, ignore_whitespace, None),
716            Some(Alternation(alt)) => match stack.pop() {
717                Some(Group {
718                    concat,
719                    group,
720                    ignore_whitespace,
721                }) => (
722                    concat,
723                    group,
724                    ignore_whitespace,
725                    Some(Either::Left::<ast::Alternation, ast::Intersection>(alt)),
726                ),
727                None | Some(Alternation(_)) | Some(Intersection(_)) => {
728                    return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
729                }
730            },
731            Some(Intersection(int)) => match stack.pop() {
732                Some(Group {
733                    concat,
734                    group,
735                    ignore_whitespace,
736                }) => (
737                    concat,
738                    group,
739                    ignore_whitespace,
740                    Some(Either::Right::<ast::Alternation, ast::Intersection>(int)),
741                ),
742                None | Some(Alternation(_)) | Some(Intersection(_)) => {
743                    return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
744                }
745            },
746
747            None => {
748                return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
749            }
750        };
751        self.parser().ignore_whitespace.set(ignore_whitespace);
752        group_concat.span.end = self.pos();
753        self.bump();
754        group.span.end = self.pos();
755        match alt {
756            Some(Either::Left(mut alt)) => {
757                alt.span.end = group_concat.span.end;
758                alt.asts.push(group_concat.into_ast());
759                group.ast = Box::new(alt.into_ast());
760            }
761            Some(Either::Right(mut int)) => {
762                int.span.end = group_concat.span.end;
763                int.asts.push(group_concat.into_ast());
764                group.ast = Box::new(int.into_ast());
765            }
766            None => {
767                group.ast = Box::new(group_concat.into_ast());
768            }
769        }
770
771        if group.kind == GroupKind::Complement {
772            let complement = ast::Complement {
773                span: self.span(),
774                ast: group.ast,
775            };
776            prior_concat.asts.push(Ast::complement(complement));
777        }
778        // ignore groups for now
779        else {
780            // dbg!(&group);
781            prior_concat.asts.push(Ast::group(group));
782        }
783        Ok(prior_concat)
784    }
785
786    /// Pop the last state from the parser's internal stack, if it exists, and
787    /// add the given concatenation to it. There either must be no state or a
788    /// single alternation item on the stack. Any other scenario produces an
789    /// error.
790    ///
791    /// This assumes that the parser has advanced to the end.
792    #[inline(never)]
793    fn pop_group_end(&self, mut concat: ast::Concat) -> Result<Ast> {
794        concat.span.end = self.pos();
795        // dbg!(&concat);
796        let mut stack = self.parser().stack_group.borrow_mut();
797        let ast = match stack.pop() {
798            None => Ok(concat.into_ast()),
799            Some(GroupState::Alternation(mut alt)) => {
800                alt.span.end = self.pos();
801                alt.asts.push(concat.into_ast());
802                Ok(Ast::alternation(alt))
803            }
804            Some(GroupState::Intersection(mut int)) => {
805                int.span.end = self.pos();
806                int.asts.push(concat.into_ast());
807
808                // dbg!("end intersection2");
809                Ok(Ast::intersection(int))
810            }
811            Some(GroupState::Group { group, .. }) => {
812                return Err(self.error(group.span, ast::ErrorKind::GroupUnclosed));
813            }
814        };
815        // If we try to pop again, there should be nothing.
816        match stack.pop() {
817            None => ast,
818            Some(GroupState::Alternation(_)) => {
819                // This unreachable is unfortunate. This case can't happen
820                // because the only way we can be here is if there were two
821                // `GroupState::Alternation`s adjacent in the parser's stack,
822                // which we guarantee to never happen because we never push a
823                // `GroupState::Alternation` if one is already at the top of
824                // the stack.
825                unreachable!()
826            }
827            Some(GroupState::Intersection(_)) => {
828                unreachable!()
829            }
830            Some(GroupState::Group { group, .. }) => {
831                Err(self.error(group.span, ast::ErrorKind::GroupUnclosed))
832            }
833        }
834    }
835
836    /// Parse the opening of a character class and push the current class
837    /// parsing context onto the parser's stack. This assumes that the parser
838    /// is positioned at an opening `[`. The given union should correspond to
839    /// the union of set items built up before seeing the `[`.
840    ///
841    /// If there was a problem parsing the opening of the class, then an error
842    /// is returned. Otherwise, a new union of set items for the class is
843    /// returned (which may be populated with either a `]` or a `-`).
844    #[inline(never)]
845    fn push_class_open(
846        &self,
847        parent_union: regex_syntax::ast::ClassSetUnion,
848    ) -> Result<regex_syntax::ast::ClassSetUnion> {
849        assert_eq!(self.char(), '[');
850
851        let (nested_set, nested_union) = self.parse_set_class_open()?;
852        self.parser()
853            .stack_class
854            .borrow_mut()
855            .push(ClassState::Open {
856                union: parent_union,
857                set: nested_set,
858            });
859        Ok(nested_union)
860    }
861
862    /// Parse the end of a character class set and pop the character class
863    /// parser stack. The union given corresponds to the last union built
864    /// before seeing the closing `]`. The union returned corresponds to the
865    /// parent character class set with the nested class added to it.
866    ///
867    /// This assumes that the parser is positioned at a `]` and will advance
868    /// the parser to the byte immediately following the `]`.
869    ///
870    /// If the stack is empty after popping, then this returns the final
871    /// "top-level" character class AST (where a "top-level" character class
872    /// is one that is not nested inside any other character class).
873    ///
874    /// If there is no corresponding opening bracket on the parser's stack,
875    /// then an error is returned.
876    #[inline(never)]
877    fn pop_class(
878        &self,
879        nested_union: regex_syntax::ast::ClassSetUnion,
880    ) -> Result<Either<regex_syntax::ast::ClassSetUnion, regex_syntax::ast::ClassBracketed>> {
881        assert_eq!(self.char(), ']');
882
883        let item = regex_syntax::ast::ClassSet::Item(nested_union.into_item());
884        let prevset = self.pop_class_op(item);
885        let mut stack = self.parser().stack_class.borrow_mut();
886        match stack.pop() {
887            None => {
888                // We can never observe an empty stack:
889                //
890                // 1) We are guaranteed to start with a non-empty stack since
891                //    the character class parser is only initiated when it sees
892                //    a `[`.
893                // 2) If we ever observe an empty stack while popping after
894                //    seeing a `]`, then we signal the character class parser
895                //    to terminate.
896                panic!("unexpected empty character class stack")
897            }
898            Some(ClassState::Op { .. }) => {
899                // This panic is unfortunate, but this case is impossible
900                // since we already popped the Op state if one exists above.
901                // Namely, every push to the class parser stack is guarded by
902                // whether an existing Op is already on the top of the stack.
903                // If it is, the existing Op is modified. That is, the stack
904                // can never have consecutive Op states.
905                panic!("unexpected ClassState::Op")
906            }
907            Some(ClassState::Open { mut union, mut set }) => {
908                self.bump();
909                set.span.end = self.pos();
910                set.kind = prevset;
911                if stack.is_empty() {
912                    Ok(Either::Right(set))
913                } else {
914                    union.push(regex_syntax::ast::ClassSetItem::Bracketed(Box::new(set)));
915                    Ok(Either::Left(union))
916                }
917            }
918        }
919    }
920
921    /// Return an "unclosed class" error whose span points to the most
922    /// recently opened class.
923    ///
924    /// This should only be called while parsing a character class.
925    #[inline(never)]
926    fn unclosed_class_error(&self) -> ResharpError {
927        for state in self.parser().stack_class.borrow().iter().rev() {
928            if let ClassState::Open { ref set, .. } = *state {
929                return self.error(set.span, ast::ErrorKind::ClassUnclosed);
930            }
931        }
932        // We are guaranteed to have a non-empty stack with at least
933        // one open bracket, so we should never get here.
934        panic!("no open character class found")
935    }
936
937    /// Push the current set of class items on to the class parser's stack as
938    /// the left hand side of the given operator.
939    ///
940    /// A fresh set union is returned, which should be used to build the right
941    /// hand side of this operator.
942    #[inline(never)]
943    fn push_class_op(
944        &self,
945        next_kind: regex_syntax::ast::ClassSetBinaryOpKind,
946        next_union: regex_syntax::ast::ClassSetUnion,
947    ) -> regex_syntax::ast::ClassSetUnion {
948        let item = regex_syntax::ast::ClassSet::Item(next_union.into_item());
949        let new_lhs = self.pop_class_op(item);
950        self.parser().stack_class.borrow_mut().push(ClassState::Op {
951            kind: next_kind,
952            lhs: new_lhs,
953        });
954        regex_syntax::ast::ClassSetUnion {
955            span: self.span(),
956            items: vec![],
957        }
958    }
959
960    /// Pop a character class set from the character class parser stack. If the
961    /// top of the stack is just an item (not an operation), then return the
962    /// given set unchanged. If the top of the stack is an operation, then the
963    /// given set will be used as the rhs of the operation on the top of the
964    /// stack. In that case, the binary operation is returned as a set.
965    #[inline(never)]
966    fn pop_class_op(&self, rhs: regex_syntax::ast::ClassSet) -> regex_syntax::ast::ClassSet {
967        let mut stack = self.parser().stack_class.borrow_mut();
968        let (kind, lhs) = match stack.pop() {
969            Some(ClassState::Op { kind, lhs }) => (kind, lhs),
970            Some(state @ ClassState::Open { .. }) => {
971                stack.push(state);
972                return rhs;
973            }
974            None => unreachable!(),
975        };
976        let span = Span::new(lhs.span().start, rhs.span().end);
977        regex_syntax::ast::ClassSet::BinaryOp(regex_syntax::ast::ClassSetBinaryOp {
978            span,
979            kind,
980            lhs: Box::new(lhs),
981            rhs: Box::new(rhs),
982        })
983    }
984
985    fn hir_to_node_id(&self, hir: &hir::Hir, tb: &mut TB<'s>) -> Result<NodeId> {
986        match hir.kind() {
987            hir::HirKind::Empty => Ok(NodeId::EPS),
988            hir::HirKind::Literal(l) => {
989                if l.0.len() == 1 {
990                    let node = tb.mk_u8(l.0[0]);
991                    Ok(node)
992                } else {
993                    let ws: Vec<_> = l.0.iter().map(|l| tb.mk_u8(*l)).collect();
994                    let conc = tb.mk_concats(ws.iter().copied());
995                    Ok(conc)
996                }
997            }
998            hir::HirKind::Class(class) => {
999                // dbg!(class);
1000                match class {
1001                    hir::Class::Unicode(class_unicode) => {
1002                        let ranges = class_unicode.ranges();
1003                        let mut nodes = Vec::new();
1004                        for range in ranges {
1005                            for seq in Utf8Sequences::new(range.start(), range.end()) {
1006                                let sl = seq.as_slice();
1007                                let bytes: Vec<_> = sl.iter().map(|s| (s.start, s.end)).collect();
1008                                let node = match bytes.len() {
1009                                    1 => tb.mk_range_u8(bytes[0].0, bytes[0].1),
1010                                    n => {
1011                                        let last = tb.mk_range_u8(bytes[n - 1].0, bytes[n - 1].1);
1012                                        let mut conc = last;
1013                                        for i in (0..n - 1).rev() {
1014                                            let b = tb.mk_range_u8(bytes[i].0, bytes[i].1);
1015                                            conc = tb.mk_concat(b, conc);
1016                                        }
1017                                        conc
1018                                    }
1019                                };
1020                                nodes.push(node);
1021                            }
1022                        }
1023                        let merged = tb.mk_unions(nodes.into_iter());
1024                        Ok(merged)
1025                    }
1026                    hir::Class::Bytes(class_bytes) => {
1027                        let ranges = class_bytes.ranges();
1028                        let mut result = NodeId::BOT;
1029                        for range in ranges {
1030                            let start = range.start();
1031                            let end = range.end();
1032                            let node = tb.mk_range_u8(start, end);
1033                            result = tb.mk_union(result, node);
1034                        }
1035                        Ok(result)
1036                    }
1037                }
1038            }
1039            hir::HirKind::Look(_) => todo!(),
1040            hir::HirKind::Repetition(_) => todo!(),
1041            hir::HirKind::Capture(_) => todo!(),
1042            hir::HirKind::Concat(body) => {
1043                let mut result = NodeId::EPS;
1044                for child in body {
1045                    let node = self.hir_to_node_id(child, tb)?;
1046                    result = tb.mk_concat(result, node);
1047                }
1048                Ok(result)
1049            }
1050            hir::HirKind::Alternation(_) => todo!(),
1051        }
1052    }
1053
1054    fn translate_ast_to_hir(
1055        &mut self,
1056        orig_ast: &regex_syntax::ast::Ast,
1057        tb: &mut TB<'s>,
1058    ) -> Result<NodeId> {
1059        // let unboxed = class_perl.deref();
1060        // let orig_ast = regex_syntax::ast::Ast::class_perl(unboxed.clone());
1061        match self.translator.translate("", orig_ast) {
1062            Err(_) => return Err(self.error(self.span(), ast::ErrorKind::UnicodeClassInvalid)),
1063            Ok(hir) => {
1064                let mapped = self.hir_to_node_id(&hir, tb);
1065                mapped
1066            }
1067        }
1068    }
1069
1070    fn translator_to_node_id(
1071        &mut self,
1072        orig_ast: &regex_syntax::ast::Ast,
1073        translator: &mut Option<Translator>,
1074        tb: &mut TB<'s>,
1075    ) -> Result<NodeId> {
1076        match translator {
1077            Some(tr) => {
1078                let hir = tr
1079                    .translate("", &orig_ast)
1080                    .map_err(|e| self.unsupported_error(e))?;
1081                self.hir_to_node_id(&hir, tb)
1082            }
1083            None => self.translate_ast_to_hir(&orig_ast, tb),
1084        }
1085    }
1086
1087    fn get_class(
1088        &mut self,
1089        negated: bool,
1090        kind: regex_syntax::ast::ClassPerlKind,
1091        tb: &mut TB<'s>,
1092    ) -> Result<NodeId> {
1093        // find in array
1094        let w = self
1095            .perl_classes
1096            .iter()
1097            .find(|(c_neg, c_kind, _)| *c_kind == kind && *c_neg == negated);
1098        match w {
1099            Some((_, _, value)) => return Ok(*value),
1100            None => {
1101                let translated = match kind {
1102                    regex_syntax::ast::ClassPerlKind::Word => {
1103                        self.unicode_classes.ensure_word(tb);
1104                        if negated { self.unicode_classes.non_word } else { self.unicode_classes.word }
1105                    }
1106                    regex_syntax::ast::ClassPerlKind::Digit => {
1107                        self.unicode_classes.ensure_digit(tb);
1108                        if negated { self.unicode_classes.non_digit } else { self.unicode_classes.digit }
1109                    }
1110                    regex_syntax::ast::ClassPerlKind::Space => {
1111                        self.unicode_classes.ensure_space(tb);
1112                        if negated { self.unicode_classes.non_space } else { self.unicode_classes.space }
1113                    }
1114                };
1115                self.perl_classes.push((negated, kind, translated));
1116                Ok(translated)
1117            }
1118        }
1119    }
1120
1121    fn word_char_kind(ast: &Ast, left: bool) -> WordCharKind {
1122        use WordCharKind::*;
1123        match ast {
1124            Ast::Literal(lit) => {
1125                if is_word_byte(lit.c as u8) {
1126                    Word
1127                } else {
1128                    NonWord
1129                }
1130            }
1131            Ast::ClassPerl(c) => match (&c.kind, c.negated) {
1132                (&regex_syntax::ast::ClassPerlKind::Word, false) => Word,
1133                (&regex_syntax::ast::ClassPerlKind::Word, true) => NonWord,
1134                (&regex_syntax::ast::ClassPerlKind::Space, false) => NonWord,
1135                (&regex_syntax::ast::ClassPerlKind::Space, true) => Unknown,
1136                (&regex_syntax::ast::ClassPerlKind::Digit, false) => Word,
1137                (&regex_syntax::ast::ClassPerlKind::Digit, true) => Unknown,
1138            },
1139            Ast::Dot(_) | Ast::Top(_) => Unknown,
1140            Ast::Group(g) => Self::word_char_kind(&g.ast, left),
1141            Ast::Concat(c) if !c.asts.is_empty() => {
1142                Self::word_char_kind(&c.asts[if left { c.asts.len() - 1 } else { 0 }], left)
1143            }
1144            Ast::Repetition(r) => {
1145                let inner = Self::word_char_kind(&r.ast, left);
1146                let nullable = matches!(
1147                    &r.op.kind,
1148                    ast::RepetitionKind::ZeroOrMore
1149                        | ast::RepetitionKind::ZeroOrOne
1150                        | ast::RepetitionKind::Range(ast::RepetitionRange::Bounded(0, _))
1151                );
1152                if nullable {
1153                    match inner {
1154                        Word => MaybeWord,
1155                        NonWord => MaybeNonWord,
1156                        _ => Unknown,
1157                    }
1158                } else {
1159                    inner
1160                }
1161            }
1162            Ast::Lookaround(la) => Self::word_char_kind(&la.ast, left),
1163            _ => Unknown,
1164        }
1165    }
1166
1167    fn edge_class_ast<'a>(ast: &'a Ast, left: bool) -> Option<&'a Ast> {
1168        match ast {
1169            Ast::Literal(_) | Ast::ClassPerl(_) | Ast::ClassBracketed(_)
1170            | Ast::ClassUnicode(_) | Ast::Dot(_) | Ast::Top(_) => Some(ast),
1171            Ast::Group(g) => Self::edge_class_ast(&g.ast, left),
1172            Ast::Concat(c) if !c.asts.is_empty() => {
1173                Self::edge_class_ast(&c.asts[if left { c.asts.len() - 1 } else { 0 }], left)
1174            }
1175            Ast::Repetition(r) => Self::edge_class_ast(&r.ast, left),
1176            Ast::Lookaround(la) => Self::edge_class_ast(&la.ast, left),
1177            _ => None,
1178        }
1179    }
1180
1181    fn resolve_word_kind(
1182        &mut self,
1183        asts: &[Ast],
1184        idx: usize,
1185        dir: isize,
1186        translator: &mut Option<Translator>,
1187        tb: &mut TB<'s>,
1188        word_id: NodeId,
1189        not_word_id: NodeId,
1190    ) -> Result<WordCharKind> {
1191        use WordCharKind::*;
1192        let fast = Self::concat_neighbor_kind(asts, idx, dir);
1193        if fast != Unknown {
1194            return Ok(fast);
1195        }
1196        let neighbor_idx = (idx as isize + dir) as usize;
1197        let edge = match Self::edge_class_ast(&asts[neighbor_idx], dir < 0) {
1198            Some(e) => e,
1199            None => return Ok(Unknown),
1200        };
1201        let node = self.ast_to_node_id(edge, translator, tb)?;
1202        if tb.subsumes(word_id, node) == Some(true) {
1203            Ok(Word)
1204        } else if tb.subsumes(not_word_id, node) == Some(true) {
1205            Ok(NonWord)
1206        } else {
1207            Ok(Unknown)
1208        }
1209    }
1210
1211    fn concat_neighbor_kind(asts: &[Ast], idx: usize, dir: isize) -> WordCharKind {
1212        use WordCharKind::*;
1213        let next = idx as isize + dir;
1214        if next < 0 || next >= asts.len() as isize {
1215            return Edge;
1216        }
1217        let kind = Self::word_char_kind(&asts[next as usize], dir < 0);
1218        match kind {
1219            MaybeWord => match Self::concat_neighbor_kind(asts, next as usize, dir) {
1220                Word => Word,
1221                _ => Unknown,
1222            },
1223            MaybeNonWord => match Self::concat_neighbor_kind(asts, next as usize, dir) {
1224                NonWord => NonWord,
1225                _ => Unknown,
1226            },
1227            other => other,
1228        }
1229    }
1230
1231    fn rewrite_word_boundary_in_concat(
1232        &mut self,
1233        asts: &[Ast],
1234        idx: usize,
1235        translator: &mut Option<Translator>,
1236        tb: &mut TB<'s>,
1237    ) -> Result<(NodeId, usize)> {
1238        use WordCharKind::*;
1239        self.unicode_classes.ensure_word(tb);
1240        let word_id = self.unicode_classes.word;
1241        let not_word_id = self.unicode_classes.non_word;
1242        let left = self.resolve_word_kind(asts, idx, -1, translator, tb, word_id, not_word_id)?;
1243        let right =
1244            self.resolve_word_kind(asts, idx, 1, translator, tb, word_id, not_word_id)?;
1245
1246        match (left, right) {
1247            (NonWord, Word) | (Word, NonWord) => Ok((NodeId::EPS, idx + 1)),
1248            (Word, _) => {
1249                let neg = tb.mk_neg_lookahead(word_id, 0);
1250                Ok((neg, idx + 1))
1251            }
1252            (NonWord, _) => {
1253                let set = tb.mk_union(NodeId::END, word_id);
1254                let tail = tb.mk_concat(set, NodeId::TS);
1255                self.merge_boundary_with_following_lookaheads(asts, idx, tail, translator, tb)
1256            }
1257            (_, Word) => {
1258                Ok((tb.mk_neg_lookbehind(word_id), idx + 1))
1259            }
1260            (_, NonWord) => {
1261                let body = tb.mk_union(NodeId::BEGIN, word_id);
1262                Ok((tb.mk_lookbehind(body, NodeId::MISSING), idx + 1))
1263            }
1264            // TODO: (Unknown, Unknown) is possible via make_full_word_boundary but
1265            // the full expansion (lb(\w)·la(\W) | lb(\W)·la(\w)) is too expensive
1266            _ => return Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex)),
1267        }
1268    }
1269
1270    fn merge_boundary_with_following_lookaheads(
1271        &mut self,
1272        asts: &[Ast],
1273        wb_idx: usize,
1274        boundary_tail: NodeId,
1275        translator: &mut Option<Translator>,
1276        tb: &mut TB<'s>,
1277    ) -> Result<(NodeId, usize)> {
1278        let mut next = wb_idx + 1;
1279        let mut la_bodies = vec![boundary_tail];
1280        while next < asts.len() {
1281            match &asts[next] {
1282                Ast::Lookaround(la) if la.kind == ast::LookaroundKind::PositiveLookahead => {
1283                    let body = self.ast_to_node_id(&la.ast, translator, tb)?;
1284                    la_bodies.push(tb.mk_concat(body, NodeId::TS));
1285                    next += 1;
1286                }
1287                _ => break,
1288            }
1289        }
1290        let merged = tb.mk_inters(la_bodies.into_iter());
1291        Ok((tb.mk_lookahead(merged, NodeId::MISSING, 0), next))
1292    }
1293
1294    fn ast_to_node_id(
1295        &mut self,
1296        ast: &Ast,
1297        translator: &mut Option<Translator>,
1298        tb: &mut TB<'s>,
1299    ) -> Result<NodeId> {
1300        match ast {
1301            Ast::Empty(_) => Ok(NodeId::EPS),
1302            Ast::Flags(f) => {
1303                let mut translator_builder = Self::default_translator_builder();
1304                if let Some(state) = f.flags.flag_state(ast::Flag::CaseInsensitive) {
1305                    translator_builder.case_insensitive(state);
1306                }
1307                if let Some(state) = f.flags.flag_state(ast::Flag::Unicode) {
1308                    translator_builder.unicode(state);
1309                }
1310                let concat_translator = Some(translator_builder.build());
1311                *translator = concat_translator;
1312                Ok(NodeId::EPS)
1313            }
1314            Ast::Literal(l) => {
1315                let ast_lit = regex_syntax::ast::Ast::literal(*l.to_owned());
1316                self.translator_to_node_id(&ast_lit, translator, tb)
1317            }
1318            Ast::Top(_) => Ok(NodeId::TOP),
1319            Ast::Dot(_) => {
1320                let hirv = hir::Hir::dot(hir::Dot::AnyByteExceptLF);
1321                self.hir_to_node_id(&hirv, tb)
1322            }
1323            Ast::Assertion(a) => match &a.kind {
1324                ast::AssertionKind::StartText => Ok(NodeId::BEGIN),
1325                ast::AssertionKind::EndText => Ok(NodeId::END),
1326                ast::AssertionKind::WordBoundary => {
1327                    return Err(
1328                        self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex),
1329                    )
1330                }
1331                ast::AssertionKind::NotWordBoundary => {
1332                    return Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))
1333                }
1334                ast::AssertionKind::StartLine => {
1335                    let left = NodeId::BEGIN;
1336                    let right = tb.mk_u8('\n' as u8);
1337                    let union = tb.mk_union(left, right);
1338                    Ok(tb.mk_lookbehind(union, NodeId::MISSING))
1339                }
1340                ast::AssertionKind::EndLine => {
1341                    let left = NodeId::END;
1342                    let right = tb.mk_u8('\n' as u8);
1343                    let union = tb.mk_union(left, right);
1344                    Ok(tb.mk_lookahead(union, NodeId::MISSING, 0))
1345                }
1346                ast::AssertionKind::WordBoundaryStart => todo!(),
1347                ast::AssertionKind::WordBoundaryEnd => todo!(),
1348                ast::AssertionKind::WordBoundaryStartAngle => todo!(),
1349                ast::AssertionKind::WordBoundaryEndAngle => Ok(tb.mk_string(">")),
1350                ast::AssertionKind::WordBoundaryStartHalf => todo!(),
1351                ast::AssertionKind::WordBoundaryEndHalf => todo!(),
1352            },
1353            Ast::ClassUnicode(c) => {
1354                let tmp = regex_syntax::ast::ClassUnicode {
1355                    span: c.span,
1356                    negated: c.negated,
1357                    kind: c.kind.clone(),
1358                };
1359                if !c.negated {
1360                    match &c.kind {
1361                        regex_syntax::ast::ClassUnicodeKind::Named(s) => match s.as_str() {
1362                            // \p{ascii} for ascii, \p{ascii}&\p{Letter} => [A-Za-z]
1363                            "ascii" => return Ok(tb.mk_range_u8(0, 127)),
1364                            // restricts matches to valid utf8, \p{utf8}*&~(a) => non a, but valid utf8
1365                            "utf8" => {
1366                                let ascii = tb.mk_range_u8(0, 127);
1367                                let beta = tb.mk_range_u8(128, 0xBF);
1368                                let c0 = tb.mk_range_u8(0xC0, 0xDF);
1369                                let c0s = tb.mk_concats([c0, beta].into_iter());
1370                                let e0 = tb.mk_range_u8(0xE0, 0xEF);
1371                                let e0s = tb.mk_concats([e0, beta, beta].into_iter());
1372                                let f0 = tb.mk_range_u8(0xF0, 0xF7);
1373                                let f0s = tb.mk_concats([f0, beta, beta, beta].into_iter());
1374                                let merged = tb.mk_unions([ascii, c0s, e0s, f0s].into_iter());
1375                                return Ok(tb.mk_star(merged));
1376                            }
1377                            "hex" => {
1378                                let nums = tb.mk_range_u8(b'0', b'9');
1379                                let lets = tb.mk_range_u8(b'a', b'f');
1380                                let lets2 = tb.mk_range_u8(b'A', b'F');
1381                                let merged = tb.mk_unions([nums, lets, lets2].into_iter());
1382                                return Ok(merged);
1383                            }
1384                            _ => {}
1385                        },
1386                        _ => {}
1387                    };
1388                }
1389
1390                let orig_ast = regex_syntax::ast::Ast::class_unicode(tmp);
1391                self.translator_to_node_id(&orig_ast, translator, tb)
1392            }
1393            Ast::ClassPerl(c) => {
1394                self.get_class(c.negated, c.kind.clone(), tb)
1395            }
1396            Ast::ClassBracketed(c) => match &c.kind {
1397                regex_syntax::ast::ClassSet::Item(_) => {
1398                    let tmp = regex_syntax::ast::ClassBracketed {
1399                        span: c.span,
1400                        negated: c.negated,
1401                        kind: c.kind.clone(),
1402                    };
1403                    let orig_ast = regex_syntax::ast::Ast::class_bracketed(tmp);
1404                    self.translator_to_node_id(&orig_ast, translator, tb)
1405                }
1406                regex_syntax::ast::ClassSet::BinaryOp(_) => todo!(),
1407            },
1408            Ast::Repetition(r) => {
1409                let body = self.ast_to_node_id(&r.ast, translator, tb);
1410                match body {
1411                    Ok(body) => match &r.op.kind {
1412                        ast::RepetitionKind::ZeroOrOne => Ok(tb.mk_opt(body)),
1413                        ast::RepetitionKind::ZeroOrMore => Ok(tb.mk_star(body)),
1414                        ast::RepetitionKind::OneOrMore => Ok(tb.mk_plus(body)),
1415                        ast::RepetitionKind::Range(r) => match r {
1416                            ast::RepetitionRange::Exactly(n) => Ok(tb.mk_repeat(body, *n, *n)),
1417                            ast::RepetitionRange::AtLeast(n) => {
1418                                let rep = tb.mk_repeat(body, *n, *n);
1419                                let st = tb.mk_star(body);
1420                                Ok(tb.mk_concat(rep, st))
1421                            }
1422
1423                            ast::RepetitionRange::Bounded(n, m) => Ok(tb.mk_repeat(body, *n, *m)),
1424                        },
1425                    },
1426                    Err(_) => body,
1427                }
1428            }
1429            Ast::Lookaround(g) => {
1430                let body = self.ast_to_node_id(&g.ast, translator, tb)?;
1431                match g.kind {
1432                    ast::LookaroundKind::PositiveLookahead => {
1433                        Ok(tb.mk_lookahead(body, NodeId::MISSING, 0))
1434                    }
1435                    ast::LookaroundKind::PositiveLookbehind => {
1436                        Ok(tb.mk_lookbehind(body, NodeId::MISSING))
1437                    }
1438                    ast::LookaroundKind::NegativeLookahead => Ok(tb.mk_neg_lookahead(body, 0)),
1439                    ast::LookaroundKind::NegativeLookbehind => Ok(tb.mk_neg_lookbehind(body)),
1440                }
1441            }
1442            Ast::Group(g) => {
1443                if let ast::GroupKind::NonCapturing(ref flags) = g.kind {
1444                    if !flags.items.is_empty() {
1445                        let mut translator_builder = Self::default_translator_builder();
1446                        translator_builder.utf8(false);
1447                        if let Some(state) = flags.flag_state(ast::Flag::CaseInsensitive) {
1448                            translator_builder.case_insensitive(state);
1449                        }
1450                        if let Some(state) = flags.flag_state(ast::Flag::Unicode) {
1451                            translator_builder.unicode(state);
1452                        }
1453                        let mut scoped = Some(translator_builder.build());
1454                        return self.ast_to_node_id(&g.ast, &mut scoped, tb);
1455                    }
1456                }
1457                self.ast_to_node_id(&g.ast, translator, tb)
1458            }
1459            Ast::Alternation(a) => {
1460                let mut children = vec![];
1461                for ast in &a.asts {
1462                    match self.ast_to_node_id(ast, translator, tb) {
1463                        Ok(node_id) => children.push(node_id),
1464                        Err(err) => return Err(err),
1465                    }
1466                }
1467                Ok(tb.mk_unions(children.iter().copied()))
1468            }
1469            Ast::Concat(c) => {
1470                let mut concat_translator: Option<Translator> = None;
1471                let mut children = vec![];
1472                let mut i = 0;
1473                while i < c.asts.len() {
1474                    let ast = &c.asts[i];
1475                    match ast {
1476                        Ast::Flags(f) => {
1477                            let mut translator_builder = Self::default_translator_builder();
1478                            translator_builder.utf8(false);
1479                            if let Some(state) = f.flags.flag_state(ast::Flag::CaseInsensitive) {
1480                                translator_builder.case_insensitive(state);
1481                            }
1482                            if let Some(state) = f.flags.flag_state(ast::Flag::Unicode) {
1483                                translator_builder.unicode(state);
1484                            }
1485                            concat_translator = Some(translator_builder.build());
1486                            i += 1;
1487                            continue;
1488                        }
1489                        Ast::Assertion(a) if a.kind == ast::AssertionKind::WordBoundary => {
1490                            let node =
1491                                self.rewrite_word_boundary_in_concat(&c.asts, i, translator, tb)?;
1492                            children.push(node.0);
1493                            i = node.1; // skip consumed lookaheads
1494                            continue;
1495                        }
1496                        _ => {}
1497                    }
1498                    match concat_translator {
1499                        Some(_) => match self.ast_to_node_id(ast, &mut concat_translator, tb) {
1500                            Ok(node_id) => children.push(node_id),
1501                            Err(err) => return Err(err),
1502                        },
1503                        None => match self.ast_to_node_id(ast, translator, tb) {
1504                            Ok(node_id) => children.push(node_id),
1505                            Err(err) => return Err(err),
1506                        },
1507                    }
1508                    i += 1;
1509                }
1510                Ok(tb.mk_concats(children.iter().cloned()))
1511            }
1512            Ast::Intersection(intersection) => {
1513                let mut children = vec![];
1514                for ast in &intersection.asts {
1515                    match self.ast_to_node_id(ast, translator, tb) {
1516                        Ok(node_id) => children.push(node_id),
1517                        Err(err) => return Err(err),
1518                    }
1519                }
1520                Ok(tb.mk_inters(children.into_iter()))
1521            }
1522            Ast::Complement(complement) => {
1523                let body = self.ast_to_node_id(&complement.ast, translator, tb);
1524                body.map(|x| tb.mk_compl(x))
1525            }
1526        }
1527    }
1528
1529    /// Parse the regular expression and return an abstract syntax tree with
1530    /// all of the comments found in the pattern.
1531    fn parse(&mut self, tb: &mut TB<'s>) -> Result<NodeId> {
1532        let mut concat = Concat {
1533            span: self.span(),
1534            asts: vec![],
1535        };
1536        loop {
1537            self.bump_space();
1538            if self.is_eof() {
1539                break;
1540            }
1541            match self.char() {
1542                '(' => concat = self.push_group(concat)?,
1543                ')' => concat = self.pop_group(concat)?,
1544                '|' => concat = self.push_alternate(concat)?,
1545                '&' => concat = self.push_intersect(concat)?,
1546                '~' => concat = self.push_compl_group(concat)?,
1547                '[' => {
1548                    let class = self.parse_set_class()?;
1549                    concat.asts.push(Ast::class_bracketed(class));
1550                }
1551                '?' => {
1552                    concat =
1553                        self.parse_uncounted_repetition(concat, ast::RepetitionKind::ZeroOrOne)?;
1554                }
1555                '*' => {
1556                    concat =
1557                        self.parse_uncounted_repetition(concat, ast::RepetitionKind::ZeroOrMore)?;
1558                }
1559                '+' => {
1560                    concat =
1561                        self.parse_uncounted_repetition(concat, ast::RepetitionKind::OneOrMore)?;
1562                }
1563                '{' => {
1564                    concat = self.parse_counted_repetition(concat)?;
1565                }
1566                _ => concat.asts.push(self.parse_primitive()?.into_ast()),
1567            }
1568        }
1569        let ast = self.pop_group_end(concat)?;
1570        // dbg!(&ast);
1571        let node_id = self.ast_to_node_id(&ast, &mut None, tb);
1572        node_id
1573    }
1574
1575    #[inline(never)]
1576    fn parse_uncounted_repetition(
1577        &self,
1578        mut concat: ast::Concat,
1579        kind: ast::RepetitionKind,
1580    ) -> Result<ast::Concat> {
1581        // assert!(self.char() == '?' || self.char() == '*' || self.char() == '+');
1582        let op_start = self.pos();
1583        let ast = match concat.asts.pop() {
1584            Some(ast) => ast,
1585            None => return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing)),
1586        };
1587        match ast {
1588            Ast::Empty(_) | Ast::Flags(_) => {
1589                return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing))
1590            }
1591            _ => {}
1592        }
1593        if self.bump() && self.char() == '?' {
1594            return Err(self.error(
1595                Span::new(op_start, self.pos()),
1596                ast::ErrorKind::UnsupportedLazyQuantifier,
1597            ));
1598        }
1599        concat.asts.push(Ast::repetition(ast::Repetition {
1600            span: ast.span().with_end(self.pos()),
1601            op: ast::RepetitionOp {
1602                span: Span::new(op_start, self.pos()),
1603                kind,
1604            },
1605            greedy: true,
1606            ast: Box::new(ast),
1607        }));
1608        Ok(concat)
1609    }
1610
1611    #[inline(never)]
1612    fn parse_counted_repetition(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
1613        assert!(self.char() == '{');
1614        let start = self.pos();
1615        let ast = match concat.asts.pop() {
1616            Some(ast) => ast,
1617            None => return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing)),
1618        };
1619        match ast {
1620            Ast::Empty(_) | Ast::Flags(_) => {
1621                return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing))
1622            }
1623            _ => {}
1624        }
1625        if !self.bump_and_bump_space() {
1626            return Err(self.error(
1627                Span::new(start, self.pos()),
1628                ast::ErrorKind::RepetitionCountUnclosed,
1629            ));
1630        }
1631        let count_start = specialize_err(
1632            self.parse_decimal(),
1633            ast::ErrorKind::DecimalEmpty,
1634            ast::ErrorKind::RepetitionCountDecimalEmpty,
1635        );
1636        if self.is_eof() {
1637            return Err(self.error(
1638                Span::new(start, self.pos()),
1639                ast::ErrorKind::RepetitionCountUnclosed,
1640            ));
1641        }
1642        let range = if self.char() == ',' {
1643            if !self.bump_and_bump_space() {
1644                return Err(self.error(
1645                    Span::new(start, self.pos()),
1646                    ast::ErrorKind::RepetitionCountUnclosed,
1647                ));
1648            }
1649            if self.char() != '}' {
1650                let count_start = match count_start {
1651                    Ok(c) => c,
1652                    Err(err) if err.kind == ast::ErrorKind::RepetitionCountDecimalEmpty => {
1653                        if self.parser().empty_min_range {
1654                            0
1655                        } else {
1656                            return Err(err);
1657                        }
1658                    }
1659                    err => err?,
1660                };
1661                let count_end = specialize_err(
1662                    self.parse_decimal(),
1663                    ast::ErrorKind::DecimalEmpty,
1664                    ast::ErrorKind::RepetitionCountDecimalEmpty,
1665                )?;
1666                ast::RepetitionRange::Bounded(count_start, count_end)
1667            } else {
1668                ast::RepetitionRange::AtLeast(count_start?)
1669            }
1670        } else {
1671            ast::RepetitionRange::Exactly(count_start?)
1672        };
1673
1674        if self.is_eof() || self.char() != '}' {
1675            return Err(self.error(
1676                Span::new(start, self.pos()),
1677                ast::ErrorKind::RepetitionCountUnclosed,
1678            ));
1679        }
1680
1681        if self.bump_and_bump_space() && self.char() == '?' {
1682            return Err(self.error(
1683                Span::new(start, self.pos()),
1684                ast::ErrorKind::UnsupportedLazyQuantifier,
1685            ));
1686        }
1687
1688        let op_span = Span::new(start, self.pos());
1689        if !range.is_valid() {
1690            return Err(self.error(op_span, ast::ErrorKind::RepetitionCountInvalid));
1691        }
1692        concat.asts.push(Ast::repetition(ast::Repetition {
1693            span: ast.span().with_end(self.pos()),
1694            op: ast::RepetitionOp {
1695                span: op_span,
1696                kind: ast::RepetitionKind::Range(range),
1697            },
1698            greedy: true,
1699            ast: Box::new(ast),
1700        }));
1701        Ok(concat)
1702    }
1703
1704    #[inline(never)]
1705    fn parse_group(&self) -> Result<Either<ast::SetFlags, ast::Group>> {
1706        assert_eq!(self.char(), '(');
1707        let open_span = self.span_char();
1708        self.bump();
1709        self.bump_space();
1710        if let Some((ahead, pos)) = self.is_lookaround_prefix() {
1711            let kind = match (pos, ahead) {
1712                (true, true) => LookaroundKind::PositiveLookahead,
1713                (true, false) => LookaroundKind::PositiveLookbehind,
1714                (false, true) => LookaroundKind::NegativeLookahead,
1715                (false, false) => LookaroundKind::NegativeLookbehind,
1716            };
1717            return Ok(Either::Right(ast::Group {
1718                span: open_span,
1719                kind: ast::GroupKind::Lookaround(kind),
1720                ast: Box::new(Ast::empty(self.span())),
1721            }));
1722        }
1723        let inner_span = self.span();
1724        let mut starts_with_p = true;
1725        if self.bump_if("?P<") || {
1726            starts_with_p = false;
1727            self.bump_if("?<")
1728        } {
1729            let capture_index = self.next_capture_index(open_span)?;
1730            let name = self.parse_capture_name(capture_index)?;
1731            Ok(Either::Right(ast::Group {
1732                span: open_span,
1733                kind: ast::GroupKind::CaptureName {
1734                    starts_with_p,
1735                    name,
1736                },
1737                ast: Box::new(Ast::empty(self.span())),
1738            }))
1739        } else if self.bump_if("?") {
1740            if self.is_eof() {
1741                return Err(self.error(open_span, ast::ErrorKind::GroupUnclosed));
1742            }
1743            let flags = self.parse_flags()?;
1744            let char_end = self.char();
1745            self.bump();
1746            if char_end == ')' {
1747                // We don't allow empty flags, e.g., `(?)`. We instead
1748                // interpret it as a repetition operator missing its argument.
1749                if flags.items.is_empty() {
1750                    return Err(self.error(inner_span, ast::ErrorKind::RepetitionMissing));
1751                }
1752                Ok(Either::Left(ast::SetFlags {
1753                    span: Span {
1754                        end: self.pos(),
1755                        ..open_span
1756                    },
1757                    flags,
1758                }))
1759            } else {
1760                assert_eq!(char_end, ':');
1761                Ok(Either::Right(ast::Group {
1762                    span: open_span,
1763                    kind: ast::GroupKind::NonCapturing(flags),
1764                    ast: Box::new(Ast::empty(self.span())),
1765                }))
1766            }
1767        } else {
1768            let capture_index = self.next_capture_index(open_span)?;
1769            Ok(Either::Right(ast::Group {
1770                span: open_span,
1771                kind: ast::GroupKind::CaptureIndex(capture_index),
1772                ast: Box::new(Ast::empty(self.span())),
1773            }))
1774        }
1775    }
1776
1777    #[inline(never)]
1778    fn parse_capture_name(&self, capture_index: u32) -> Result<ast::CaptureName> {
1779        if self.is_eof() {
1780            return Err(self.error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
1781        }
1782        let start = self.pos();
1783        loop {
1784            if self.char() == '>' {
1785                break;
1786            }
1787            if !is_capture_char(self.char(), self.pos() == start) {
1788                return Err(self.error(self.span_char(), ast::ErrorKind::GroupNameInvalid));
1789            }
1790            if !self.bump() {
1791                break;
1792            }
1793        }
1794        let end = self.pos();
1795        if self.is_eof() {
1796            return Err(self.error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
1797        }
1798        assert_eq!(self.char(), '>');
1799        self.bump();
1800        let name = &self.pattern()[start.offset..end.offset];
1801        if name.is_empty() {
1802            return Err(self.error(Span::new(start, start), ast::ErrorKind::GroupNameEmpty));
1803        }
1804        let capname = ast::CaptureName {
1805            span: Span::new(start, end),
1806            name: name.to_string(),
1807            index: capture_index,
1808        };
1809        self.add_capture_name(&capname)?;
1810        Ok(capname)
1811    }
1812
1813    #[inline(never)]
1814    fn parse_flags(&self) -> Result<ast::Flags> {
1815        let mut flags = ast::Flags {
1816            span: self.span(),
1817            items: vec![],
1818        };
1819        let mut last_was_negation = None;
1820        while self.char() != ':' && self.char() != ')' {
1821            if self.char() == '-' {
1822                last_was_negation = Some(self.span_char());
1823                let item = ast::FlagsItem {
1824                    span: self.span_char(),
1825                    kind: ast::FlagsItemKind::Negation,
1826                };
1827                if let Some(i) = flags.add_item(item) {
1828                    return Err(self.error(
1829                        self.span_char(),
1830                        ast::ErrorKind::FlagRepeatedNegation {
1831                            original: flags.items[i].span,
1832                        },
1833                    ));
1834                }
1835            } else {
1836                last_was_negation = None;
1837                let item = ast::FlagsItem {
1838                    span: self.span_char(),
1839                    kind: ast::FlagsItemKind::Flag(self.parse_flag()?),
1840                };
1841                if let Some(i) = flags.add_item(item) {
1842                    return Err(self.error(
1843                        self.span_char(),
1844                        ast::ErrorKind::FlagDuplicate {
1845                            original: flags.items[i].span,
1846                        },
1847                    ));
1848                }
1849            }
1850            if !self.bump() {
1851                return Err(self.error(self.span(), ast::ErrorKind::FlagUnexpectedEof));
1852            }
1853        }
1854        if let Some(span) = last_was_negation {
1855            return Err(self.error(span, ast::ErrorKind::FlagDanglingNegation));
1856        }
1857        flags.span.end = self.pos();
1858        Ok(flags)
1859    }
1860
1861    #[inline(never)]
1862    fn parse_flag(&self) -> Result<ast::Flag> {
1863        match self.char() {
1864            'i' => Ok(ast::Flag::CaseInsensitive),
1865            'm' => Ok(ast::Flag::MultiLine),
1866            's' => Ok(ast::Flag::DotMatchesNewLine),
1867            'U' => Ok(ast::Flag::SwapGreed),
1868            'u' => Ok(ast::Flag::Unicode),
1869            'R' => Ok(ast::Flag::CRLF),
1870            'x' => Ok(ast::Flag::IgnoreWhitespace),
1871            _ => Err(self.error(self.span_char(), ast::ErrorKind::FlagUnrecognized)),
1872        }
1873    }
1874
1875    fn parse_primitive(&self) -> Result<Primitive> {
1876        match self.char() {
1877            '\\' => self.parse_escape(),
1878            '_' => {
1879                let ast = Primitive::Top(self.span_char());
1880                self.bump();
1881                Ok(ast)
1882            }
1883            '.' => {
1884                let ast = Primitive::Dot(self.span_char());
1885                self.bump();
1886                Ok(ast)
1887            }
1888            '^' => {
1889                let ast = Primitive::Assertion(ast::Assertion {
1890                    span: self.span_char(),
1891                    kind: ast::AssertionKind::StartLine,
1892                });
1893                self.bump();
1894                Ok(ast)
1895            }
1896            '$' => {
1897                let ast = Primitive::Assertion(ast::Assertion {
1898                    span: self.span_char(),
1899                    kind: ast::AssertionKind::EndLine,
1900                });
1901                self.bump();
1902                Ok(ast)
1903            }
1904            c => {
1905                let ast = Primitive::Literal(Literal {
1906                    span: self.span_char(),
1907                    kind: LiteralKind::Verbatim,
1908                    c,
1909                });
1910                self.bump();
1911                Ok(ast)
1912            }
1913        }
1914    }
1915
1916    #[inline(never)]
1917    fn parse_escape(&self) -> Result<Primitive> {
1918        assert_eq!(self.char(), '\\');
1919        let start = self.pos();
1920        if !self.bump() {
1921            return Err(self.error(
1922                Span::new(start, self.pos()),
1923                ast::ErrorKind::EscapeUnexpectedEof,
1924            ));
1925        }
1926        let c = self.char();
1927        // Put some of the more complicated routines into helpers.
1928        match c {
1929            '0'..='9' => {
1930                if !self.parser().octal {
1931                    return Err(self.error(
1932                        Span::new(start, self.span_char().end),
1933                        ast::ErrorKind::UnsupportedBackreference,
1934                    ));
1935                }
1936                let mut lit = self.parse_octal();
1937                lit.span.start = start;
1938                return Ok(Primitive::Literal(lit));
1939            }
1940            // '8'..='9' if !self.parser().octal => {
1941            //     return Err(self.error(
1942            //         Span::new(start, self.span_char().end),
1943            //         ast::ErrorKind::UnsupportedBackreference,
1944            //     ));
1945            // }
1946            'x' | 'u' | 'U' => {
1947                let mut lit = self.parse_hex()?;
1948                lit.span.start = start;
1949                return Ok(Primitive::Literal(lit));
1950            }
1951            'p' | 'P' => {
1952                let mut cls = self.parse_unicode_class()?;
1953                cls.span.start = start;
1954                return Ok(Primitive::Unicode(cls));
1955            }
1956            'd' | 's' | 'w' | 'D' | 'S' | 'W' => {
1957                let mut cls = self.parse_perl_class();
1958                cls.span.start = start;
1959                return Ok(Primitive::Perl(cls));
1960            }
1961            _ => {}
1962        }
1963
1964        // Handle all of the one letter sequences inline.
1965        self.bump();
1966        let span = Span::new(start, self.pos());
1967        if is_meta_character(c) {
1968            return Ok(Primitive::Literal(Literal {
1969                span,
1970                kind: LiteralKind::Meta,
1971                c,
1972            }));
1973        }
1974        if is_escapeable_character(c) {
1975            return Ok(Primitive::Literal(Literal {
1976                span,
1977                kind: LiteralKind::Superfluous,
1978                c,
1979            }));
1980        }
1981        let special = |kind, c| {
1982            Ok(Primitive::Literal(Literal {
1983                span,
1984                kind: LiteralKind::Special(kind),
1985                c,
1986            }))
1987        };
1988        match c {
1989            'a' => special(SpecialLiteralKind::Bell, '\x07'),
1990            'f' => special(SpecialLiteralKind::FormFeed, '\x0C'),
1991            't' => special(SpecialLiteralKind::Tab, '\t'),
1992            'n' => special(SpecialLiteralKind::LineFeed, '\n'),
1993            'r' => special(SpecialLiteralKind::CarriageReturn, '\r'),
1994            'v' => special(SpecialLiteralKind::VerticalTab, '\x0B'),
1995            'A' => Ok(Primitive::Assertion(ast::Assertion {
1996                span,
1997                kind: ast::AssertionKind::StartText,
1998            })),
1999            'z' => Ok(Primitive::Assertion(ast::Assertion {
2000                span,
2001                kind: ast::AssertionKind::EndText,
2002            })),
2003            'b' => {
2004                let mut wb = ast::Assertion {
2005                    span,
2006                    kind: ast::AssertionKind::WordBoundary,
2007                };
2008                // After a \b, we "try" to parse things like \b{start} for
2009                // special word boundary assertions.
2010                if !self.is_eof() && self.char() == '{' {
2011                    if let Some(kind) = self.maybe_parse_special_word_boundary(start)? {
2012                        wb.kind = kind;
2013                        wb.span.end = self.pos();
2014                    }
2015                }
2016                Ok(Primitive::Assertion(wb))
2017            }
2018            'B' => Ok(Primitive::Assertion(ast::Assertion {
2019                span,
2020                kind: ast::AssertionKind::NotWordBoundary,
2021            })),
2022            '<' => Ok(Primitive::Assertion(ast::Assertion {
2023                span,
2024                kind: ast::AssertionKind::WordBoundaryStartAngle,
2025            })),
2026            '>' => Ok(Primitive::Assertion(ast::Assertion {
2027                span,
2028                kind: ast::AssertionKind::WordBoundaryEndAngle,
2029            })),
2030            _ => Err(self.error(span, ast::ErrorKind::EscapeUnrecognized)),
2031        }
2032    }
2033
2034    fn maybe_parse_special_word_boundary(
2035        &self,
2036        wb_start: Position,
2037    ) -> Result<Option<ast::AssertionKind>> {
2038        assert_eq!(self.char(), '{');
2039
2040        let is_valid_char = |c| match c {
2041            'A'..='Z' | 'a'..='z' | '-' => true,
2042            _ => false,
2043        };
2044        let start = self.pos();
2045        if !self.bump_and_bump_space() {
2046            return Err(self.error(
2047                Span::new(wb_start, self.pos()),
2048                ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof,
2049            ));
2050        }
2051        let start_contents = self.pos();
2052        // This is one of the critical bits: if the first non-whitespace
2053        // character isn't in [-A-Za-z] (i.e., this can't be a special word
2054        // boundary), then we bail and let the counted repetition parser deal
2055        // with this.
2056        if !is_valid_char(self.char()) {
2057            self.parser().pos.set(start);
2058            return Ok(None);
2059        }
2060
2061        // Now collect up our chars until we see a '}'.
2062        let mut scratch = self.parser().scratch.borrow_mut();
2063        scratch.clear();
2064        while !self.is_eof() && is_valid_char(self.char()) {
2065            scratch.push(self.char());
2066            self.bump_and_bump_space();
2067        }
2068        if self.is_eof() || self.char() != '}' {
2069            return Err(self.error(
2070                Span::new(start, self.pos()),
2071                ast::ErrorKind::SpecialWordBoundaryUnclosed,
2072            ));
2073        }
2074        let end = self.pos();
2075        self.bump();
2076        let kind = match scratch.as_str() {
2077            "start" => ast::AssertionKind::WordBoundaryStart,
2078            "end" => ast::AssertionKind::WordBoundaryEnd,
2079            "start-half" => ast::AssertionKind::WordBoundaryStartHalf,
2080            "end-half" => ast::AssertionKind::WordBoundaryEndHalf,
2081            _ => {
2082                return Err(self.error(
2083                    Span::new(start_contents, end),
2084                    ast::ErrorKind::SpecialWordBoundaryUnrecognized,
2085                ))
2086            }
2087        };
2088        Ok(Some(kind))
2089    }
2090
2091    #[inline(never)]
2092    fn parse_octal(&self) -> Literal {
2093        assert!(self.parser().octal);
2094        assert!('0' <= self.char() && self.char() <= '7');
2095        let start = self.pos();
2096        // Parse up to two more digits.
2097        while self.bump()
2098            && '0' <= self.char()
2099            && self.char() <= '7'
2100            && self.pos().offset - start.offset <= 2
2101        {}
2102        let end = self.pos();
2103        let octal = &self.pattern()[start.offset..end.offset];
2104        // Parsing the octal should never fail since the above guarantees a
2105        // valid number.
2106        let codepoint = u32::from_str_radix(octal, 8).expect("valid octal number");
2107        // The max value for 3 digit octal is 0777 = 511 and [0, 511] has no
2108        // invalid Unicode scalar values.
2109        let c = char::from_u32(codepoint).expect("Unicode scalar value");
2110        Literal {
2111            span: Span::new(start, end),
2112            kind: LiteralKind::Octal,
2113            c,
2114        }
2115    }
2116
2117    #[inline(never)]
2118    fn parse_hex(&self) -> Result<Literal> {
2119        assert!(self.char() == 'x' || self.char() == 'u' || self.char() == 'U');
2120
2121        let hex_kind = match self.char() {
2122            'x' => HexLiteralKind::X,
2123            'u' => HexLiteralKind::UnicodeShort,
2124            _ => HexLiteralKind::UnicodeLong,
2125        };
2126        if !self.bump_and_bump_space() {
2127            return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2128        }
2129        if self.char() == '{' {
2130            self.parse_hex_brace(hex_kind)
2131        } else {
2132            self.parse_hex_digits(hex_kind)
2133        }
2134    }
2135
2136    #[inline(never)]
2137    fn parse_hex_digits(&self, kind: HexLiteralKind) -> Result<Literal> {
2138        let mut scratch = self.parser().scratch.borrow_mut();
2139        scratch.clear();
2140
2141        let start = self.pos();
2142        for i in 0..kind.digits() {
2143            if i > 0 && !self.bump_and_bump_space() {
2144                return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2145            }
2146            if !is_hex(self.char()) {
2147                return Err(self.error(self.span_char(), ast::ErrorKind::EscapeHexInvalidDigit));
2148            }
2149            scratch.push(self.char());
2150        }
2151        // The final bump just moves the parser past the literal, which may
2152        // be EOF.
2153        self.bump_and_bump_space();
2154        let end = self.pos();
2155        let hex = scratch.as_str();
2156        match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
2157            None => Err(self.error(Span::new(start, end), ast::ErrorKind::EscapeHexInvalid)),
2158            Some(c) => Ok(Literal {
2159                span: Span::new(start, end),
2160                kind: LiteralKind::HexFixed(kind),
2161                c,
2162            }),
2163        }
2164    }
2165
2166    #[inline(never)]
2167    fn parse_hex_brace(&self, kind: HexLiteralKind) -> Result<Literal> {
2168        let mut scratch = self.parser().scratch.borrow_mut();
2169        scratch.clear();
2170
2171        let brace_pos = self.pos();
2172        let start = self.span_char().end;
2173        while self.bump_and_bump_space() && self.char() != '}' {
2174            if !is_hex(self.char()) {
2175                return Err(self.error(self.span_char(), ast::ErrorKind::EscapeHexInvalidDigit));
2176            }
2177            scratch.push(self.char());
2178        }
2179        if self.is_eof() {
2180            return Err(self.error(
2181                Span::new(brace_pos, self.pos()),
2182                ast::ErrorKind::EscapeUnexpectedEof,
2183            ));
2184        }
2185        let end = self.pos();
2186        let hex = scratch.as_str();
2187        assert_eq!(self.char(), '}');
2188        self.bump_and_bump_space();
2189
2190        if hex.is_empty() {
2191            return Err(self.error(
2192                Span::new(brace_pos, self.pos()),
2193                ast::ErrorKind::EscapeHexEmpty,
2194            ));
2195        }
2196        match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
2197            None => Err(self.error(Span::new(start, end), ast::ErrorKind::EscapeHexInvalid)),
2198            Some(c) => Ok(Literal {
2199                span: Span::new(start, self.pos()),
2200                kind: LiteralKind::HexBrace(kind),
2201                c,
2202            }),
2203        }
2204    }
2205
2206    fn parse_decimal(&self) -> Result<u32> {
2207        let mut scratch = self.parser().scratch.borrow_mut();
2208        scratch.clear();
2209
2210        while !self.is_eof() && self.char().is_whitespace() {
2211            self.bump();
2212        }
2213        let start = self.pos();
2214        while !self.is_eof() && '0' <= self.char() && self.char() <= '9' {
2215            scratch.push(self.char());
2216            self.bump_and_bump_space();
2217        }
2218        let span = Span::new(start, self.pos());
2219        while !self.is_eof() && self.char().is_whitespace() {
2220            self.bump_and_bump_space();
2221        }
2222        let digits = scratch.as_str();
2223        if digits.is_empty() {
2224            return Err(self.error(span, ast::ErrorKind::DecimalEmpty));
2225        }
2226        match u32::from_str_radix(digits, 10).ok() {
2227            Some(n) => Ok(n),
2228            None => Err(self.error(span, ast::ErrorKind::DecimalInvalid)),
2229        }
2230    }
2231
2232    #[inline(never)]
2233    fn parse_set_class(&self) -> Result<ClassBracketed> {
2234        assert_eq!(self.char(), '[');
2235
2236        let mut union = ClassSetUnion {
2237            span: self.span(),
2238            items: vec![],
2239        };
2240        loop {
2241            self.bump_space();
2242            if self.is_eof() {
2243                return Err(self.unclosed_class_error());
2244            }
2245            match self.char() {
2246                '[' => {
2247                    // If we've already parsed the opening bracket, then
2248                    // attempt to treat this as the beginning of an ASCII
2249                    // class. If ASCII class parsing fails, then the parser
2250                    // backs up to `[`.
2251                    if !self.parser().stack_class.borrow().is_empty() {
2252                        if let Some(cls) = self.maybe_parse_ascii_class() {
2253                            union.push(ClassSetItem::Ascii(cls));
2254                            continue;
2255                        }
2256                    }
2257                    union = self.push_class_open(union)?;
2258                }
2259                ']' => match self.pop_class(union)? {
2260                    Either::Left(nested_union) => {
2261                        union = nested_union;
2262                    }
2263                    Either::Right(class) => return Ok(class),
2264                },
2265                '&' if self.peek() == Some('&') => {
2266                    assert!(self.bump_if("&&"));
2267                    union = self.push_class_op(ClassSetBinaryOpKind::Intersection, union);
2268                }
2269                '-' if self.peek() == Some('-') => {
2270                    assert!(self.bump_if("--"));
2271                    union = self.push_class_op(ClassSetBinaryOpKind::Difference, union);
2272                }
2273                '~' if self.peek() == Some('~') => {
2274                    assert!(self.bump_if("~~"));
2275                    union = self.push_class_op(ClassSetBinaryOpKind::SymmetricDifference, union);
2276                }
2277                _ => {
2278                    union.push(self.parse_set_class_range()?);
2279                }
2280            }
2281        }
2282    }
2283
2284    #[inline(never)]
2285    fn parse_set_class_range(&self) -> Result<ClassSetItem> {
2286        let prim1 = self.parse_set_class_item()?;
2287        self.bump_space();
2288        if self.is_eof() {
2289            return Err(self.unclosed_class_error());
2290        }
2291        if self.char() != '-' || self.peek_space() == Some(']') || self.peek_space() == Some('-') {
2292            return prim1.into_class_set_item(self);
2293        }
2294        if !self.bump_and_bump_space() {
2295            return Err(self.unclosed_class_error());
2296        }
2297        let prim2 = self.parse_set_class_item()?;
2298        let range = ClassSetRange {
2299            span: Span::new(prim1.span().start, prim2.span().end),
2300            start: prim1.into_class_literal(self)?,
2301            end: prim2.into_class_literal(self)?,
2302        };
2303        if !range.is_valid() {
2304            return Err(self.error(range.span, ast::ErrorKind::ClassRangeInvalid));
2305        }
2306        Ok(ClassSetItem::Range(range))
2307    }
2308
2309    #[inline(never)]
2310    fn parse_set_class_item(&self) -> Result<Primitive> {
2311        if self.char() == '\\' {
2312            self.parse_escape()
2313        } else {
2314            let x = Primitive::Literal(Literal {
2315                span: self.span_char(),
2316                kind: LiteralKind::Verbatim,
2317                c: self.char(),
2318            });
2319            self.bump();
2320            Ok(x)
2321        }
2322    }
2323
2324    #[inline(never)]
2325    fn parse_set_class_open(&self) -> Result<(ClassBracketed, ClassSetUnion)> {
2326        assert_eq!(self.char(), '[');
2327        let start = self.pos();
2328        if !self.bump_and_bump_space() {
2329            return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2330        }
2331
2332        let negated = if self.char() != '^' {
2333            false
2334        } else {
2335            if !self.bump_and_bump_space() {
2336                return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2337            }
2338            true
2339        };
2340        // Accept any number of `-` as literal `-`.
2341        let mut union = ClassSetUnion {
2342            span: self.span(),
2343            items: vec![],
2344        };
2345        while self.char() == '-' {
2346            union.push(ClassSetItem::Literal(Literal {
2347                span: self.span_char(),
2348                kind: LiteralKind::Verbatim,
2349                c: '-',
2350            }));
2351            if !self.bump_and_bump_space() {
2352                return Err(self.error(Span::new(start, start), ast::ErrorKind::ClassUnclosed));
2353            }
2354        }
2355        // If `]` is the *first* char in a set, then interpret it as a literal
2356        // `]`. That is, an empty class is impossible to write.
2357        if union.items.is_empty() && self.char() == ']' {
2358            union.push(ClassSetItem::Literal(Literal {
2359                span: self.span_char(),
2360                kind: LiteralKind::Verbatim,
2361                c: ']',
2362            }));
2363            if !self.bump_and_bump_space() {
2364                return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2365            }
2366        }
2367        let set = ClassBracketed {
2368            span: Span::new(start, self.pos()),
2369            negated,
2370            kind: ClassSet::union(ClassSetUnion {
2371                span: Span::new(union.span.start, union.span.start),
2372                items: vec![],
2373            }),
2374        };
2375        Ok((set, union))
2376    }
2377
2378    #[inline(never)]
2379    fn maybe_parse_ascii_class(&self) -> Option<ClassAscii> {
2380        assert_eq!(self.char(), '[');
2381        // If parsing fails, then we back up the parser to this starting point.
2382        let start = self.pos();
2383        let mut negated = false;
2384        if !self.bump() || self.char() != ':' {
2385            self.parser().pos.set(start);
2386            return None;
2387        }
2388        if !self.bump() {
2389            self.parser().pos.set(start);
2390            return None;
2391        }
2392        if self.char() == '^' {
2393            negated = true;
2394            if !self.bump() {
2395                self.parser().pos.set(start);
2396                return None;
2397            }
2398        }
2399        let name_start = self.offset();
2400        while self.char() != ':' && self.bump() {}
2401        if self.is_eof() {
2402            self.parser().pos.set(start);
2403            return None;
2404        }
2405        let name = &self.pattern()[name_start..self.offset()];
2406        if !self.bump_if(":]") {
2407            self.parser().pos.set(start);
2408            return None;
2409        }
2410        let kind = match regex_syntax::ast::ClassAsciiKind::from_name(name) {
2411            Some(kind) => kind,
2412            None => {
2413                self.parser().pos.set(start);
2414                return None;
2415            }
2416        };
2417        Some(ClassAscii {
2418            span: Span::new(start, self.pos()),
2419            kind,
2420            negated,
2421        })
2422    }
2423
2424    #[inline(never)]
2425    fn parse_unicode_class(&self) -> Result<ClassUnicode> {
2426        assert!(self.char() == 'p' || self.char() == 'P');
2427
2428        let mut scratch = self.parser().scratch.borrow_mut();
2429        scratch.clear();
2430
2431        let negated = self.char() == 'P';
2432        if !self.bump_and_bump_space() {
2433            return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2434        }
2435        let (start, kind) = if self.char() == '{' {
2436            let start = self.span_char().end;
2437            while self.bump_and_bump_space() && self.char() != '}' {
2438                scratch.push(self.char());
2439            }
2440            if self.is_eof() {
2441                return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2442            }
2443            assert_eq!(self.char(), '}');
2444            self.bump();
2445
2446            let name = scratch.as_str();
2447            if let Some(i) = name.find("!=") {
2448                (
2449                    start,
2450                    ClassUnicodeKind::NamedValue {
2451                        op: ClassUnicodeOpKind::NotEqual,
2452                        name: name[..i].to_string(),
2453                        value: name[i + 2..].to_string(),
2454                    },
2455                )
2456            } else if let Some(i) = name.find(':') {
2457                (
2458                    start,
2459                    ClassUnicodeKind::NamedValue {
2460                        op: ClassUnicodeOpKind::Colon,
2461                        name: name[..i].to_string(),
2462                        value: name[i + 1..].to_string(),
2463                    },
2464                )
2465            } else if let Some(i) = name.find('=') {
2466                (
2467                    start,
2468                    ClassUnicodeKind::NamedValue {
2469                        op: ClassUnicodeOpKind::Equal,
2470                        name: name[..i].to_string(),
2471                        value: name[i + 1..].to_string(),
2472                    },
2473                )
2474            } else {
2475                (start, ClassUnicodeKind::Named(name.to_string()))
2476            }
2477        } else {
2478            let start = self.pos();
2479            let c = self.char();
2480            if c == '\\' {
2481                return Err(self.error(self.span_char(), ast::ErrorKind::UnicodeClassInvalid));
2482            }
2483            self.bump_and_bump_space();
2484            let kind = ClassUnicodeKind::OneLetter(c);
2485            (start, kind)
2486        };
2487        Ok(ClassUnicode {
2488            span: Span::new(start, self.pos()),
2489            negated,
2490            kind,
2491        })
2492    }
2493
2494    #[inline(never)]
2495    fn parse_perl_class(&self) -> ClassPerl {
2496        let c = self.char();
2497        let span = self.span_char();
2498        self.bump();
2499        let (negated, kind) = match c {
2500            'd' => (false, regex_syntax::ast::ClassPerlKind::Digit),
2501            'D' => (true, regex_syntax::ast::ClassPerlKind::Digit),
2502            's' => (false, regex_syntax::ast::ClassPerlKind::Space),
2503            'S' => (true, regex_syntax::ast::ClassPerlKind::Space),
2504            'w' => (false, regex_syntax::ast::ClassPerlKind::Word),
2505            'W' => (true, regex_syntax::ast::ClassPerlKind::Word),
2506            c => panic!("expected valid Perl class but got '{}'", c),
2507        };
2508        ClassPerl {
2509            span,
2510            kind,
2511            negated,
2512        }
2513    }
2514}
2515
2516pub fn parse_ast<'s>(
2517    tb: &mut TB<'s>,
2518    pattern: &'s str,
2519) -> std::result::Result<NodeId, ResharpError> {
2520    let mut p: ResharpParser<'s> = ResharpParser::new(pattern);
2521    let result = p.parse(tb);
2522    result
2523}
resharp_parser/lib.rs

resharp_parser/
lib.rs