resharp_parser/
lib.rs

1//! Parser for resharp regex patterns.
2//!
3//! Converts regex pattern strings into the node representation used by resharp-algebra.
4
5#![warn(dead_code)]
6mod ast;
7use std::cell::{Cell, RefCell};
8
9use ast::{Ast, Concat, ErrorKind, GroupKind, LookaroundKind, TaggedEps};
10use regex_syntax::{
11    ast::{
12        ClassAscii, ClassBracketed, ClassPerl, ClassSet, ClassSetBinaryOpKind, ClassSetItem,
13        ClassSetRange, ClassSetUnion, ClassUnicode, ClassUnicodeKind, ClassUnicodeOpKind,
14        HexLiteralKind, Literal, LiteralKind, Position, Span, SpecialLiteralKind,
15    },
16    hir::{
17        self,
18        translate::{Translator, TranslatorBuilder},
19    },
20    utf8::Utf8Sequences,
21};
22use resharp_algebra::NodeId;
23
24type TB<'s> = resharp_algebra::RegexBuilder;
25
26#[derive(Clone, Copy, PartialEq)]
27enum WordCharKind {
28    Word,
29    NonWord,
30    MaybeWord,
31    MaybeNonWord,
32    Unknown,
33    Edge,
34}
35
36fn is_word_byte(b: u8) -> bool {
37    b.is_ascii_alphanumeric() || b == b'_'
38}
39
40#[derive(Clone, Debug, Eq, PartialEq)]
41enum Primitive {
42    Literal(Literal),
43    Assertion(ast::Assertion),
44    Dot(Span),
45    Top(Span),
46    Perl(ClassPerl),
47    Unicode(ClassUnicode),
48    TaggedEps(ast::TaggedEps),
49}
50
51impl Primitive {
52    fn span(&self) -> &Span {
53        match *self {
54            Primitive::Literal(ref x) => &x.span,
55            Primitive::Assertion(ref x) => &x.span,
56            Primitive::Dot(ref span) => span,
57            Primitive::Top(ref span) => span,
58            Primitive::Perl(ref x) => &x.span,
59            Primitive::Unicode(ref x) => &x.span,
60            Primitive::TaggedEps(ref x) => &x.span,
61        }
62    }
63
64    fn into_ast(self) -> Ast {
65        match self {
66            Primitive::Literal(lit) => Ast::literal(lit),
67            Primitive::Assertion(assert) => Ast::assertion(assert),
68            Primitive::Dot(span) => Ast::dot(span),
69            Primitive::Top(span) => Ast::top(span),
70            Primitive::Perl(cls) => Ast::class_perl(cls),
71            Primitive::Unicode(cls) => Ast::class_unicode(cls),
72            Primitive::TaggedEps(te) => Ast::tagged_eps(te),
73        }
74    }
75
76    fn into_class_set_item(self, p: &ResharpParser) -> Result<regex_syntax::ast::ClassSetItem> {
77        use self::Primitive::*;
78        use regex_syntax::ast::ClassSetItem;
79
80        match self {
81            Literal(lit) => Ok(ClassSetItem::Literal(lit)),
82            Perl(cls) => Ok(ClassSetItem::Perl(cls)),
83            Unicode(cls) => Ok(ClassSetItem::Unicode(cls)),
84            x => Err(p.error(*x.span(), ast::ErrorKind::ClassEscapeInvalid)),
85        }
86    }
87
88    fn into_class_literal(self, p: &ResharpParser) -> Result<Literal> {
89        use self::Primitive::*;
90
91        match self {
92            Literal(lit) => Ok(lit),
93            x => Err(p.error(*x.span(), ast::ErrorKind::ClassRangeLiteral)),
94        }
95    }
96}
97
98#[derive(Clone, Debug, Eq, PartialEq)]
99pub enum Either<Left, Right> {
100    Left(Left),
101    Right(Right),
102}
103
104#[derive(Clone, Debug, Eq, PartialEq)]
105pub struct ResharpError {
106    /// The kind of error.
107    pub kind: ErrorKind,
108    /// The original pattern that the parser generated the error from. Every
109    /// span in an error is a valid range into this string.
110    pattern: String,
111    /// The span of this error.
112    pub span: Span,
113}
114
115impl std::fmt::Display for ResharpError {
116    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
117        write!(f, "{:?}: {:?}", self.kind, self.span)
118    }
119}
120impl std::error::Error for ResharpError {}
121
122type Result<T> = core::result::Result<T, ResharpError>;
123
124
125#[derive(Clone, Debug)]
126enum GroupState {
127    /// This state is pushed whenever an opening group is found.
128    Group {
129        /// The concatenation immediately preceding the opening group.
130        concat: Concat,
131        /// The group that has been opened. Its sub-AST is always empty.
132        group: ast::Group,
133        /// Whether this group has the `x` flag enabled or not.
134        ignore_whitespace: bool,
135    },
136    /// This state is pushed whenever a new alternation branch is found. If
137    /// an alternation branch is found and this state is at the top of the
138    /// stack, then this state should be modified to include the new
139    /// alternation.
140    Alternation(ast::Alternation),
141    Intersection(ast::Intersection),
142}
143
144#[derive(Clone, Debug)]
145enum ClassState {
146    /// This state is pushed whenever an opening bracket is found.
147    Open {
148        /// The union of class items immediately preceding this class.
149        union: regex_syntax::ast::ClassSetUnion,
150        /// The class that has been opened. Typically this just corresponds
151        /// to the `[`, but it can also include `[^` since `^` indicates
152        /// negation of the class.
153        set: regex_syntax::ast::ClassBracketed,
154    },
155    /// This state is pushed when a operator is seen. When popped, the stored
156    /// set becomes the left hand side of the operator.
157    Op {
158        /// The type of the operation, i.e., &&, -- or ~~.
159        kind: regex_syntax::ast::ClassSetBinaryOpKind,
160        /// The left-hand side of the operator.
161        lhs: regex_syntax::ast::ClassSet,
162    },
163}
164
165/// RE# syntax parser based on the regex-syntax crate.
166pub struct ResharpParser<'s> {
167    perl_classes: Vec<(bool, regex_syntax::ast::ClassPerlKind, NodeId)>,
168    pub translator: regex_syntax::hir::translate::Translator,
169    pub pattern: &'s str,
170    pos: Cell<Position>,
171    capture_index: Cell<u32>,
172    octal: bool,
173    empty_min_range: bool,
174    ignore_whitespace: Cell<bool>,
175    comments: RefCell<Vec<ast::Comment>>,
176    stack_group: RefCell<Vec<GroupState>>,
177    stack_class: RefCell<Vec<ClassState>>,
178    capture_names: RefCell<Vec<ast::CaptureName>>,
179    scratch: RefCell<String>,
180}
181
182fn specialize_err<T>(result: Result<T>, from: ast::ErrorKind, to: ast::ErrorKind) -> Result<T> {
183    if let Err(e) = result {
184        if e.kind == from {
185            Err(ResharpError {
186                kind: to,
187                pattern: e.pattern,
188                span: e.span,
189            })
190        } else {
191            Err(e)
192        }
193    } else {
194        result
195    }
196}
197
198fn is_capture_char(c: char, first: bool) -> bool {
199    if first {
200        c == '_' || c.is_alphabetic()
201    } else {
202        c == '_' || c == '.' || c == '[' || c == ']' || c.is_alphanumeric()
203    }
204}
205
206pub fn is_meta_character(c: char) -> bool {
207    match c {
208        '\\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' | '[' | ']' | '{' | '}' | '^' | '$'
209        | '#' | '&' | '-' | '~' => true,
210        _ => false,
211    }
212}
213
214pub fn is_escapeable_character(c: char) -> bool {
215    // Certainly escapeable if it's a meta character.
216    if is_meta_character(c) {
217        return true;
218    }
219    // Any character that isn't ASCII is definitely not escapeable. There's
220    // no real need to allow things like \☃ right?
221    if !c.is_ascii() {
222        return false;
223    }
224    // Otherwise, we basically say that everything is escapeable unless it's a
225    // letter or digit. Things like \3 are either octal (when enabled) or an
226    // error, and we should keep it that way. Otherwise, letters are reserved
227    // for adding new syntax in a backwards compatible way.
228    match c {
229        '0'..='9' | 'A'..='Z' | 'a'..='z' => false,
230        // While not currently supported, we keep these as not escapeable to
231        // give us some flexibility with respect to supporting the \< and
232        // \> word boundary assertions in the future. By rejecting them as
233        // escapeable, \< and \> will result in a parse error. Thus, we can
234        // turn them into something else in the future without it being a
235        // backwards incompatible change.
236        //
237        // OK, now we support \< and \>, and we need to retain them as *not*
238        // escapeable here since the escape sequence is significant.
239        '<' | '>' => false,
240        _ => true,
241    }
242}
243
244fn is_hex(c: char) -> bool {
245    ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F')
246}
247
248impl<'s> ResharpParser<'s> {
249    fn default_translator_builder() -> TranslatorBuilder {
250        let mut trb = TranslatorBuilder::new();
251        trb.unicode(true);
252        trb.utf8(false);
253        trb
254    }
255
256    pub fn new(pattern: &'s str) -> Self {
257        let trb = Self::default_translator_builder();
258        Self {
259            translator: trb.build(),
260            pattern,
261            perl_classes: vec![],
262            pos: Cell::new(Position::new(0, 0, 0)),
263            capture_index: Cell::new(0),
264            octal: false,
265            empty_min_range: false,
266            ignore_whitespace: Cell::new(false),
267            comments: RefCell::new(vec![]),
268            stack_group: RefCell::new(vec![]),
269            stack_class: RefCell::new(vec![]),
270            capture_names: RefCell::new(vec![]),
271            scratch: RefCell::new(String::new()),
272        }
273    }
274
275    /// Return a reference to the parser state.
276    fn parser(&'_ self) -> &'_ ResharpParser<'_> {
277        self
278    }
279
280    /// Return a reference to the pattern being parsed.
281    fn pattern(&self) -> &str {
282        self.pattern
283    }
284
285    /// Create a new error with the given span and error type.
286    fn error(&self, span: Span, kind: ast::ErrorKind) -> ResharpError {
287        ResharpError {
288            kind,
289            pattern: self.pattern().to_string(),
290            span,
291        }
292    }
293
294    fn unsupported_error(&self, _: regex_syntax::hir::Error) -> ResharpError {
295        let emptyspan = Span::splat(self.pos());
296        let inner = self.error(emptyspan, ast::ErrorKind::UnsupportedResharpRegex);
297        inner
298    }
299
300    /// Return the current offset of the parser.
301    ///
302    /// The offset starts at `0` from the beginning of the regular expression
303    /// pattern string.
304    fn offset(&self) -> usize {
305        self.parser().pos.get().offset
306    }
307
308    /// Return the current line number of the parser.
309    ///
310    /// The line number starts at `1`.
311    fn line(&self) -> usize {
312        self.parser().pos.get().line
313    }
314
315    /// Return the current column of the parser.
316    ///
317    /// The column number starts at `1` and is reset whenever a `\n` is seen.
318    fn column(&self) -> usize {
319        self.parser().pos.get().column
320    }
321
322    /// Return the next capturing index. Each subsequent call increments the
323    /// internal index.
324    ///
325    /// The span given should correspond to the location of the opening
326    /// parenthesis.
327    ///
328    /// If the capture limit is exceeded, then an error is returned.
329    fn next_capture_index(&self, span: Span) -> Result<u32> {
330        let current = self.parser().capture_index.get();
331        let i = current
332            .checked_add(1)
333            .ok_or_else(|| self.error(span, ast::ErrorKind::CaptureLimitExceeded))?;
334        self.parser().capture_index.set(i);
335        Ok(i)
336    }
337
338    fn add_capture_name(&self, cap: &ast::CaptureName) -> Result<()> {
339        let mut names = self.parser().capture_names.borrow_mut();
340        match names.binary_search_by_key(&cap.name.as_str(), |c| c.name.as_str()) {
341            Err(i) => {
342                names.insert(i, cap.clone());
343                Ok(())
344            }
345            Ok(i) => Err(self.error(
346                cap.span,
347                ast::ErrorKind::GroupNameDuplicate {
348                    original: names[i].span,
349                },
350            )),
351        }
352    }
353
354    fn ignore_whitespace(&self) -> bool {
355        self.parser().ignore_whitespace.get()
356    }
357
358    fn char(&self) -> char {
359        self.char_at(self.offset())
360    }
361
362    fn char_at(&self, i: usize) -> char {
363        self.pattern()[i..]
364            .chars()
365            .next()
366            .unwrap_or_else(|| panic!("expected char at offset {}", i))
367    }
368
369    fn bump(&self) -> bool {
370        if self.is_eof() {
371            return false;
372        }
373        let Position {
374            mut offset,
375            mut line,
376            mut column,
377        } = self.pos();
378        if self.char() == '\n' {
379            line = line.checked_add(1).unwrap();
380            column = 1;
381        } else {
382            column = column.checked_add(1).unwrap();
383        }
384        offset += self.char().len_utf8();
385        self.parser().pos.set(Position {
386            offset,
387            line,
388            column,
389        });
390        self.pattern()[self.offset()..].chars().next().is_some()
391    }
392
393    fn bump_if(&self, prefix: &str) -> bool {
394        if self.pattern()[self.offset()..].starts_with(prefix) {
395            for _ in 0..prefix.chars().count() {
396                self.bump();
397            }
398            true
399        } else {
400            false
401        }
402    }
403
404    fn is_lookaround_prefix(&self) -> Option<(bool, bool)> {
405        if self.bump_if("?=") {
406            return Some((true, true));
407        }
408        if self.bump_if("?!") {
409            return Some((true, false));
410        }
411        if self.bump_if("?<=") {
412            return Some((false, true));
413        }
414        if self.bump_if("?<!") {
415            return Some((false, false));
416        }
417        return None;
418        // self.bump_if("?=") || self.bump_if("?!") || self.bump_if("?<=") || self.bump_if("?<!")
419    }
420
421    fn bump_and_bump_space(&self) -> bool {
422        if !self.bump() {
423            return false;
424        }
425        self.bump_space();
426        !self.is_eof()
427    }
428
429    fn bump_space(&self) {
430        if !self.ignore_whitespace() {
431            return;
432        }
433        while !self.is_eof() {
434            if self.char().is_whitespace() {
435                self.bump();
436            } else if self.char() == '#' {
437                let start = self.pos();
438                let mut comment_text = String::new();
439                self.bump();
440                while !self.is_eof() {
441                    let c = self.char();
442                    self.bump();
443                    if c == '\n' {
444                        break;
445                    }
446                    comment_text.push(c);
447                }
448                let comment = ast::Comment {
449                    span: Span::new(start, self.pos()),
450                    comment: comment_text,
451                };
452                self.parser().comments.borrow_mut().push(comment);
453            } else {
454                break;
455            }
456        }
457    }
458
459    /// Peek at the next character in the input without advancing the parser.
460    ///
461    /// If the input has been exhausted, then this returns `None`.
462    fn peek(&self) -> Option<char> {
463        if self.is_eof() {
464            return None;
465        }
466        self.pattern()[self.offset() + self.char().len_utf8()..]
467            .chars()
468            .next()
469    }
470
471    /// Like peek, but will ignore spaces when the parser is in whitespace
472    /// insensitive mode.
473    fn peek_space(&self) -> Option<char> {
474        if !self.ignore_whitespace() {
475            return self.peek();
476        }
477        if self.is_eof() {
478            return None;
479        }
480        let mut start = self.offset() + self.char().len_utf8();
481        let mut in_comment = false;
482        for (i, c) in self.pattern()[start..].char_indices() {
483            if c.is_whitespace() {
484                continue;
485            } else if !in_comment && c == '#' {
486                in_comment = true;
487            } else if in_comment && c == '\n' {
488                in_comment = false;
489            } else {
490                start += i;
491                break;
492            }
493        }
494        self.pattern()[start..].chars().next()
495    }
496
497    /// Returns true if the next call to `bump` would return false.
498    fn is_eof(&self) -> bool {
499        self.offset() == self.pattern().len()
500    }
501
502    /// Return the current position of the parser, which includes the offset,
503    /// line and column.
504    fn pos(&self) -> Position {
505        self.parser().pos.get()
506    }
507
508    /// Create a span at the current position of the parser. Both the start
509    /// and end of the span are set.
510    fn span(&self) -> Span {
511        Span::splat(self.pos())
512    }
513
514    /// Create a span that covers the current character.
515    fn span_char(&self) -> Span {
516        let mut next = Position {
517            offset: self.offset().checked_add(self.char().len_utf8()).unwrap(),
518            line: self.line(),
519            column: self.column().checked_add(1).unwrap(),
520        };
521        if self.char() == '\n' {
522            next.line += 1;
523            next.column = 1;
524        }
525        Span::new(self.pos(), next)
526    }
527
528    /// Parse and push a single alternation on to the parser's internal stack.
529    /// If the top of the stack already has an alternation, then add to that
530    /// instead of pushing a new one.
531    ///
532    /// The concatenation given corresponds to a single alternation branch.
533    /// The concatenation returned starts the next branch and is empty.
534    ///
535    /// This assumes the parser is currently positioned at `|` and will advance
536    /// the parser to the character following `|`.
537    #[inline(never)]
538    fn push_alternate(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
539        assert_eq!(self.char(), '|');
540        concat.span.end = self.pos();
541        self.push_or_add_alternation(concat);
542        self.bump();
543        Ok(ast::Concat {
544            span: self.span(),
545            asts: vec![],
546        })
547    }
548
549    /// Pushes or adds the given branch of an alternation to the parser's
550    /// internal stack of state.
551    fn push_or_add_alternation(&self, concat: Concat) {
552        use self::GroupState::*;
553
554        let mut stack = self.parser().stack_group.borrow_mut();
555        if let Some(&mut Alternation(ref mut alts)) = stack.last_mut() {
556            alts.asts.push(concat.into_ast());
557            return;
558        }
559        stack.push(Alternation(ast::Alternation {
560            span: Span::new(concat.span.start, self.pos()),
561            asts: vec![concat.into_ast()],
562        }));
563    }
564
565    #[inline(never)]
566    fn push_intersect(&self, mut concat: Concat) -> Result<Concat> {
567        assert_eq!(self.char(), '&');
568        concat.span.end = self.pos();
569        self.push_or_add_intersect(concat);
570        self.bump();
571        Ok(Concat {
572            span: self.span(),
573            asts: vec![],
574        })
575    }
576
577    /// Pushes or adds the given branch of an alternation to the parser's
578    /// internal stack of state.
579    fn push_or_add_intersect(&self, concat: Concat) {
580        use self::GroupState::*;
581
582        let mut stack = self.parser().stack_group.borrow_mut();
583        if let Some(&mut Intersection(ref mut alts)) = stack.last_mut() {
584            alts.asts.push(concat.into_ast());
585            return;
586        }
587        stack.push(Intersection(ast::Intersection {
588            span: Span::new(concat.span.start, self.pos()),
589            asts: vec![concat.into_ast()],
590        }));
591    }
592
593    /// Parse and push a group AST (and its parent concatenation) on to the
594    /// parser's internal stack. Return a fresh concatenation corresponding
595    /// to the group's sub-AST.
596    ///
597    /// If a set of flags was found (with no group), then the concatenation
598    /// is returned with that set of flags added.
599    ///
600    /// This assumes that the parser is currently positioned on the opening
601    /// parenthesis. It advances the parser to the character at the start
602    /// of the sub-expression (or adjoining expression).
603    ///
604    /// If there was a problem parsing the start of the group, then an error
605    /// is returned.
606    #[inline(never)]
607    fn push_group(&self, mut concat: Concat) -> Result<Concat> {
608        assert_eq!(self.char(), '(');
609        match self.parse_group()? {
610            Either::Left(set) => {
611                let ignore = set.flags.flag_state(ast::Flag::IgnoreWhitespace);
612                if let Some(v) = ignore {
613                    self.parser().ignore_whitespace.set(v);
614                }
615
616                concat.asts.push(Ast::flags(set));
617                Ok(concat)
618            }
619            Either::Right(group) => {
620                let old_ignore_whitespace = self.ignore_whitespace();
621                let new_ignore_whitespace = group
622                    .flags()
623                    .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
624                    .unwrap_or(old_ignore_whitespace);
625                self.parser()
626                    .stack_group
627                    .borrow_mut()
628                    .push(GroupState::Group {
629                        concat,
630                        group,
631                        ignore_whitespace: old_ignore_whitespace,
632                    });
633                self.parser().ignore_whitespace.set(new_ignore_whitespace);
634                Ok(Concat {
635                    span: self.span(),
636                    asts: vec![],
637                })
638            }
639        }
640    }
641
642    #[inline(never)]
643    fn push_compl_group(&self, concat: Concat) -> Result<Concat> {
644        assert_eq!(self.char(), '~');
645        self.bump();
646        if self.is_eof() || self.char() != '(' {
647            return Err(self.error(self.span(), ast::ErrorKind::ComplementGroupExpected));
648        }
649        let open_span = self.span_char();
650        self.bump();
651        let group = ast::Group {
652            span: open_span,
653            kind: ast::GroupKind::Complement,
654            ast: Box::new(Ast::empty(self.span())),
655        };
656
657        let old_ignore_whitespace = self.ignore_whitespace();
658        let new_ignore_whitespace = group
659            .flags()
660            .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
661            .unwrap_or(old_ignore_whitespace);
662        self.parser()
663            .stack_group
664            .borrow_mut()
665            .push(GroupState::Group {
666                concat,
667                group,
668                ignore_whitespace: old_ignore_whitespace,
669            });
670        self.parser().ignore_whitespace.set(new_ignore_whitespace);
671        Ok(Concat {
672            span: self.span(),
673            asts: vec![],
674        })
675    }
676
677    /// Pop a group AST from the parser's internal stack and set the group's
678    /// AST to the given concatenation. Return the concatenation containing
679    /// the group.
680    ///
681    /// This assumes that the parser is currently positioned on the closing
682    /// parenthesis and advances the parser to the character following the `)`.
683    ///
684    /// If no such group could be popped, then an unopened group error is
685    /// returned.
686    #[inline(never)]
687    fn pop_group(&self, mut group_concat: Concat) -> Result<Concat> {
688        use self::GroupState::*;
689        assert_eq!(self.char(), ')');
690        let mut stack = self.parser().stack_group.borrow_mut();
691        let topstack = stack.pop();
692
693        let (mut prior_concat, mut group, ignore_whitespace, alt) = match topstack {
694            Some(Group {
695                concat,
696                group,
697                ignore_whitespace,
698            }) => (concat, group, ignore_whitespace, None),
699            Some(Alternation(alt)) => match stack.pop() {
700                Some(Group {
701                    concat,
702                    group,
703                    ignore_whitespace,
704                }) => (
705                    concat,
706                    group,
707                    ignore_whitespace,
708                    Some(Either::Left::<ast::Alternation, ast::Intersection>(alt)),
709                ),
710                None | Some(Alternation(_)) | Some(Intersection(_)) => {
711                    return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
712                }
713            },
714            Some(Intersection(int)) => match stack.pop() {
715                Some(Group {
716                    concat,
717                    group,
718                    ignore_whitespace,
719                }) => (
720                    concat,
721                    group,
722                    ignore_whitespace,
723                    Some(Either::Right::<ast::Alternation, ast::Intersection>(int)),
724                ),
725                None | Some(Alternation(_)) | Some(Intersection(_)) => {
726                    return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
727                }
728            },
729
730            None => {
731                return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
732            }
733        };
734        self.parser().ignore_whitespace.set(ignore_whitespace);
735        group_concat.span.end = self.pos();
736        self.bump();
737        group.span.end = self.pos();
738        match alt {
739            Some(Either::Left(mut alt)) => {
740                alt.span.end = group_concat.span.end;
741                alt.asts.push(group_concat.into_ast());
742                group.ast = Box::new(alt.into_ast());
743            }
744            Some(Either::Right(mut int)) => {
745                int.span.end = group_concat.span.end;
746                int.asts.push(group_concat.into_ast());
747                group.ast = Box::new(int.into_ast());
748            }
749            None => {
750                group.ast = Box::new(group_concat.into_ast());
751            }
752        }
753
754        if group.kind == GroupKind::Complement {
755            let complement = ast::Complement {
756                span: self.span(),
757                ast: group.ast,
758            };
759            prior_concat.asts.push(Ast::complement(complement));
760        }
761        // ignore groups for now
762        else {
763            // dbg!(&group);
764            prior_concat.asts.push(Ast::group(group));
765        }
766        Ok(prior_concat)
767    }
768
769    /// Pop the last state from the parser's internal stack, if it exists, and
770    /// add the given concatenation to it. There either must be no state or a
771    /// single alternation item on the stack. Any other scenario produces an
772    /// error.
773    ///
774    /// This assumes that the parser has advanced to the end.
775    #[inline(never)]
776    fn pop_group_end(&self, mut concat: ast::Concat) -> Result<Ast> {
777        concat.span.end = self.pos();
778        // dbg!(&concat);
779        let mut stack = self.parser().stack_group.borrow_mut();
780        let ast = match stack.pop() {
781            None => Ok(concat.into_ast()),
782            Some(GroupState::Alternation(mut alt)) => {
783                alt.span.end = self.pos();
784                alt.asts.push(concat.into_ast());
785                Ok(Ast::alternation(alt))
786            }
787            Some(GroupState::Intersection(mut int)) => {
788                int.span.end = self.pos();
789                int.asts.push(concat.into_ast());
790
791                // dbg!("end intersection2");
792                Ok(Ast::intersection(int))
793            }
794            Some(GroupState::Group { group, .. }) => {
795                return Err(self.error(group.span, ast::ErrorKind::GroupUnclosed));
796            }
797        };
798        // If we try to pop again, there should be nothing.
799        match stack.pop() {
800            None => ast,
801            Some(GroupState::Alternation(_)) => {
802                // This unreachable is unfortunate. This case can't happen
803                // because the only way we can be here is if there were two
804                // `GroupState::Alternation`s adjacent in the parser's stack,
805                // which we guarantee to never happen because we never push a
806                // `GroupState::Alternation` if one is already at the top of
807                // the stack.
808                unreachable!()
809            }
810            Some(GroupState::Intersection(_)) => {
811                unreachable!()
812            }
813            Some(GroupState::Group { group, .. }) => {
814                Err(self.error(group.span, ast::ErrorKind::GroupUnclosed))
815            }
816        }
817    }
818
819    /// Parse the opening of a character class and push the current class
820    /// parsing context onto the parser's stack. This assumes that the parser
821    /// is positioned at an opening `[`. The given union should correspond to
822    /// the union of set items built up before seeing the `[`.
823    ///
824    /// If there was a problem parsing the opening of the class, then an error
825    /// is returned. Otherwise, a new union of set items for the class is
826    /// returned (which may be populated with either a `]` or a `-`).
827    #[inline(never)]
828    fn push_class_open(
829        &self,
830        parent_union: regex_syntax::ast::ClassSetUnion,
831    ) -> Result<regex_syntax::ast::ClassSetUnion> {
832        assert_eq!(self.char(), '[');
833
834        let (nested_set, nested_union) = self.parse_set_class_open()?;
835        self.parser()
836            .stack_class
837            .borrow_mut()
838            .push(ClassState::Open {
839                union: parent_union,
840                set: nested_set,
841            });
842        Ok(nested_union)
843    }
844
845    /// Parse the end of a character class set and pop the character class
846    /// parser stack. The union given corresponds to the last union built
847    /// before seeing the closing `]`. The union returned corresponds to the
848    /// parent character class set with the nested class added to it.
849    ///
850    /// This assumes that the parser is positioned at a `]` and will advance
851    /// the parser to the byte immediately following the `]`.
852    ///
853    /// If the stack is empty after popping, then this returns the final
854    /// "top-level" character class AST (where a "top-level" character class
855    /// is one that is not nested inside any other character class).
856    ///
857    /// If there is no corresponding opening bracket on the parser's stack,
858    /// then an error is returned.
859    #[inline(never)]
860    fn pop_class(
861        &self,
862        nested_union: regex_syntax::ast::ClassSetUnion,
863    ) -> Result<Either<regex_syntax::ast::ClassSetUnion, regex_syntax::ast::ClassBracketed>> {
864        assert_eq!(self.char(), ']');
865
866        let item = regex_syntax::ast::ClassSet::Item(nested_union.into_item());
867        let prevset = self.pop_class_op(item);
868        let mut stack = self.parser().stack_class.borrow_mut();
869        match stack.pop() {
870            None => {
871                // We can never observe an empty stack:
872                //
873                // 1) We are guaranteed to start with a non-empty stack since
874                //    the character class parser is only initiated when it sees
875                //    a `[`.
876                // 2) If we ever observe an empty stack while popping after
877                //    seeing a `]`, then we signal the character class parser
878                //    to terminate.
879                panic!("unexpected empty character class stack")
880            }
881            Some(ClassState::Op { .. }) => {
882                // This panic is unfortunate, but this case is impossible
883                // since we already popped the Op state if one exists above.
884                // Namely, every push to the class parser stack is guarded by
885                // whether an existing Op is already on the top of the stack.
886                // If it is, the existing Op is modified. That is, the stack
887                // can never have consecutive Op states.
888                panic!("unexpected ClassState::Op")
889            }
890            Some(ClassState::Open { mut union, mut set }) => {
891                self.bump();
892                set.span.end = self.pos();
893                set.kind = prevset;
894                if stack.is_empty() {
895                    Ok(Either::Right(set))
896                } else {
897                    union.push(regex_syntax::ast::ClassSetItem::Bracketed(Box::new(set)));
898                    Ok(Either::Left(union))
899                }
900            }
901        }
902    }
903
904    /// Return an "unclosed class" error whose span points to the most
905    /// recently opened class.
906    ///
907    /// This should only be called while parsing a character class.
908    #[inline(never)]
909    fn unclosed_class_error(&self) -> ResharpError {
910        for state in self.parser().stack_class.borrow().iter().rev() {
911            if let ClassState::Open { ref set, .. } = *state {
912                return self.error(set.span, ast::ErrorKind::ClassUnclosed);
913            }
914        }
915        // We are guaranteed to have a non-empty stack with at least
916        // one open bracket, so we should never get here.
917        panic!("no open character class found")
918    }
919
920    /// Push the current set of class items on to the class parser's stack as
921    /// the left hand side of the given operator.
922    ///
923    /// A fresh set union is returned, which should be used to build the right
924    /// hand side of this operator.
925    #[inline(never)]
926    fn push_class_op(
927        &self,
928        next_kind: regex_syntax::ast::ClassSetBinaryOpKind,
929        next_union: regex_syntax::ast::ClassSetUnion,
930    ) -> regex_syntax::ast::ClassSetUnion {
931        let item = regex_syntax::ast::ClassSet::Item(next_union.into_item());
932        let new_lhs = self.pop_class_op(item);
933        self.parser().stack_class.borrow_mut().push(ClassState::Op {
934            kind: next_kind,
935            lhs: new_lhs,
936        });
937        regex_syntax::ast::ClassSetUnion {
938            span: self.span(),
939            items: vec![],
940        }
941    }
942
943    /// Pop a character class set from the character class parser stack. If the
944    /// top of the stack is just an item (not an operation), then return the
945    /// given set unchanged. If the top of the stack is an operation, then the
946    /// given set will be used as the rhs of the operation on the top of the
947    /// stack. In that case, the binary operation is returned as a set.
948    #[inline(never)]
949    fn pop_class_op(&self, rhs: regex_syntax::ast::ClassSet) -> regex_syntax::ast::ClassSet {
950        let mut stack = self.parser().stack_class.borrow_mut();
951        let (kind, lhs) = match stack.pop() {
952            Some(ClassState::Op { kind, lhs }) => (kind, lhs),
953            Some(state @ ClassState::Open { .. }) => {
954                stack.push(state);
955                return rhs;
956            }
957            None => unreachable!(),
958        };
959        let span = Span::new(lhs.span().start, rhs.span().end);
960        regex_syntax::ast::ClassSet::BinaryOp(regex_syntax::ast::ClassSetBinaryOp {
961            span,
962            kind,
963            lhs: Box::new(lhs),
964            rhs: Box::new(rhs),
965        })
966    }
967
968    fn hir_to_node_id(&self, hir: &hir::Hir, tb: &mut TB<'s>) -> Result<NodeId> {
969        match hir.kind() {
970            hir::HirKind::Empty => Ok(NodeId::EPS),
971            hir::HirKind::Literal(l) => {
972                if l.0.len() == 1 {
973                    let node = tb.mk_u8(l.0[0]);
974                    Ok(node)
975                } else {
976                    let ws: Vec<_> = l.0.iter().map(|l| tb.mk_u8(*l)).collect();
977                    let conc = tb.mk_concats(ws.iter().copied());
978                    Ok(conc)
979                }
980            }
981            hir::HirKind::Class(class) => {
982                // dbg!(class);
983                match class {
984                    hir::Class::Unicode(class_unicode) => {
985                        let ranges = class_unicode.ranges();
986                        let mut s1 = NodeId::BOT;
987                        let mut s2 = NodeId::BOT;
988                        let mut s3 = NodeId::BOT;
989                        let mut s4 = NodeId::BOT;
990                        for range in ranges {
991                            for seq in Utf8Sequences::new(range.start(), range.end()) {
992                                // cut len > 2 out for now
993                                if seq.len() > 2 {
994                                    continue;
995                                }
996
997                                let v: Vec<_> = seq
998                                    .as_slice()
999                                    .iter()
1000                                    .map(|s|
1001                                                            // self.mk_byte_set(&byteset_from_range(s.start, s.end))
1002                                                            (s.start, s.end))
1003                                    .collect();
1004                                if v.len() == 1 {
1005                                    let (start, end) = v[0];
1006                                    let node = tb.mk_range_u8(start, end);
1007                                    s1 = tb.mk_union(s1, node);
1008                                } else if v.len() == 2 {
1009                                    let node1 = tb.mk_range_u8(v[0].0, v[0].1);
1010                                    let node2 = tb.mk_range_u8(v[1].0, v[1].1);
1011                                    let conc = tb.mk_concat(node1, node2);
1012                                    s2 = tb.mk_union(s2, conc);
1013                                } else if v.len() == 3 {
1014                                    let node1 = tb.mk_range_u8(v[0].0, v[0].1);
1015                                    let node2 = tb.mk_range_u8(v[1].0, v[1].1);
1016                                    let node3 = tb.mk_range_u8(v[2].0, v[2].1);
1017                                    let conc2 = tb.mk_concat(node2, node3);
1018                                    let conc1 = tb.mk_concat(node1, conc2);
1019                                    s3 = tb.mk_union(s3, conc1);
1020                                } else {
1021                                    let mut conc = NodeId::EPS;
1022                                    for i in (0..4).rev() {
1023                                        let node = tb.mk_range_u8(v[i].0, v[i].1);
1024                                        conc = tb.mk_concat(conc, node);
1025                                    }
1026                                    s4 = tb.mk_union(s4, conc);
1027                                }
1028                            }
1029                        }
1030                        let merged = tb.mk_union(s2, s1);
1031                        let merged = tb.mk_union(s3, merged);
1032                        let merged = tb.mk_union(s4, merged);
1033                        Ok(merged)
1034                    }
1035                    hir::Class::Bytes(class_bytes) => {
1036                        let ranges = class_bytes.ranges();
1037                        let mut result = NodeId::BOT;
1038                        for range in ranges {
1039                            let start = range.start();
1040                            let end = range.end();
1041                            let node = tb.mk_range_u8(start, end);
1042                            result = tb.mk_union(result, node);
1043                        }
1044                        Ok(result)
1045                    }
1046                }
1047            }
1048            hir::HirKind::Look(_) => todo!(),
1049            hir::HirKind::Repetition(_) => todo!(),
1050            hir::HirKind::Capture(_) => todo!(),
1051            hir::HirKind::Concat(body) => {
1052                let mut result = NodeId::EPS;
1053                for child in body {
1054                    let node = self.hir_to_node_id(child, tb)?;
1055                    result = tb.mk_concat(result, node);
1056                }
1057                Ok(result)
1058            }
1059            hir::HirKind::Alternation(_) => todo!(),
1060        }
1061    }
1062
1063    fn translate_ast_to_hir(
1064        &mut self,
1065        orig_ast: &regex_syntax::ast::Ast,
1066        tb: &mut TB<'s>,
1067    ) -> Result<NodeId> {
1068        // let unboxed = class_perl.deref();
1069        // let orig_ast = regex_syntax::ast::Ast::class_perl(unboxed.clone());
1070        match self.translator.translate("", orig_ast) {
1071            Err(_) => return Err(self.error(self.span(), ast::ErrorKind::UnicodeClassInvalid)),
1072            Ok(hir) => {
1073                let mapped = self.hir_to_node_id(&hir, tb);
1074                mapped
1075            }
1076        }
1077    }
1078
1079    fn translator_to_node_id(
1080        &mut self,
1081        orig_ast: &regex_syntax::ast::Ast,
1082        translator: &mut Option<Translator>,
1083        tb: &mut TB<'s>,
1084    ) -> Result<NodeId> {
1085        match translator {
1086            Some(tr) => {
1087                let hir = tr
1088                    .translate("", &orig_ast)
1089                    .map_err(|e| self.unsupported_error(e))?;
1090                self.hir_to_node_id(&hir, tb)
1091            }
1092            None => self.translate_ast_to_hir(&orig_ast, tb),
1093        }
1094    }
1095
1096    fn get_class(
1097        &mut self,
1098        negated: bool,
1099        kind: regex_syntax::ast::ClassPerlKind,
1100        tb: &mut TB<'s>,
1101    ) -> Result<NodeId> {
1102        // find in array
1103        let w = self
1104            .perl_classes
1105            .iter()
1106            .find(|(c_neg, c_kind, _)| *c_kind == kind && *c_neg == negated);
1107        match w {
1108            Some((_, _, value)) => return Ok(*value),
1109            None => {
1110                let tmp = regex_syntax::ast::ClassPerl {
1111                    span: Span::splat(Position::new(0, 0, 0)),
1112                    negated,
1113                    kind: kind.clone(),
1114                };
1115                let word_ast = regex_syntax::ast::Ast::class_perl(tmp);
1116                let translated = self.translate_ast_to_hir(&word_ast, tb)?;
1117                self.perl_classes.push((negated, kind, translated));
1118                Ok(translated)
1119            }
1120        }
1121    }
1122
1123    fn word_char_kind(ast: &Ast, left: bool) -> WordCharKind {
1124        use WordCharKind::*;
1125        match ast {
1126            Ast::Literal(lit) => {
1127                if is_word_byte(lit.c as u8) {
1128                    Word
1129                } else {
1130                    NonWord
1131                }
1132            }
1133            Ast::ClassPerl(c) => match (&c.kind, c.negated) {
1134                (&regex_syntax::ast::ClassPerlKind::Word, false) => Word,
1135                (&regex_syntax::ast::ClassPerlKind::Word, true) => NonWord,
1136                (&regex_syntax::ast::ClassPerlKind::Space, false) => NonWord,
1137                (&regex_syntax::ast::ClassPerlKind::Digit, false) => Word,
1138                _ => Unknown,
1139            },
1140            Ast::Dot(_) | Ast::Top(_) => Unknown,
1141            Ast::Group(g) => Self::word_char_kind(&g.ast, left),
1142            Ast::Concat(c) if !c.asts.is_empty() => {
1143                Self::word_char_kind(&c.asts[if left { c.asts.len() - 1 } else { 0 }], left)
1144            }
1145            Ast::Repetition(r) => {
1146                let inner = Self::word_char_kind(&r.ast, left);
1147                let nullable = matches!(
1148                    &r.op.kind,
1149                    ast::RepetitionKind::ZeroOrMore
1150                        | ast::RepetitionKind::ZeroOrOne
1151                        | ast::RepetitionKind::Range(ast::RepetitionRange::Bounded(0, _))
1152                );
1153                if nullable {
1154                    match inner {
1155                        Word => MaybeWord,
1156                        NonWord => MaybeNonWord,
1157                        _ => Unknown,
1158                    }
1159                } else {
1160                    inner
1161                }
1162            }
1163            Ast::Lookaround(la) => Self::word_char_kind(&la.ast, left),
1164            _ => Unknown,
1165        }
1166    }
1167
1168    fn concat_neighbor_kind(asts: &[Ast], idx: usize, dir: isize) -> WordCharKind {
1169        use WordCharKind::*;
1170        let next = idx as isize + dir;
1171        if next < 0 || next >= asts.len() as isize {
1172            return Edge;
1173        }
1174        let kind = Self::word_char_kind(&asts[next as usize], dir < 0);
1175        match kind {
1176            MaybeWord => match Self::concat_neighbor_kind(asts, next as usize, dir) {
1177                Word => Word,
1178                _ => Unknown,
1179            },
1180            MaybeNonWord => match Self::concat_neighbor_kind(asts, next as usize, dir) {
1181                NonWord => NonWord,
1182                _ => Unknown,
1183            },
1184            other => other,
1185        }
1186    }
1187
1188    fn rewrite_word_boundary_in_concat(
1189        &mut self,
1190        asts: &[Ast],
1191        idx: usize,
1192        translator: &mut Option<Translator>,
1193        tb: &mut TB<'s>,
1194    ) -> Result<(NodeId, usize)> {
1195        use WordCharKind::*;
1196        let left = Self::concat_neighbor_kind(asts, idx, -1);
1197        let right = Self::concat_neighbor_kind(asts, idx, 1);
1198        let word_id = self.get_class(false, regex_syntax::ast::ClassPerlKind::Word, tb)?;
1199        let not_word_id = self.get_class(true, regex_syntax::ast::ClassPerlKind::Word, tb)?;
1200
1201        match (left, right) {
1202            (NonWord, Word) | (Word, NonWord) => Ok((NodeId::EPS, idx + 1)),
1203            (Word, _) => {
1204                let set = tb.mk_union(NodeId::END, not_word_id);
1205                let tail = tb.mk_concat(set, NodeId::TS);
1206                self.merge_boundary_with_following_lookaheads(asts, idx, tail, translator, tb)
1207            }
1208            (NonWord, _) => {
1209                let set = tb.mk_union(NodeId::END, word_id);
1210                let tail = tb.mk_concat(set, NodeId::TS);
1211                self.merge_boundary_with_following_lookaheads(asts, idx, tail, translator, tb)
1212            }
1213            (_, Word) => {
1214                let body = tb.mk_union(NodeId::BEGIN, not_word_id);
1215                Ok((tb.mk_lookbehind(body, NodeId::MISSING), idx + 1))
1216            }
1217            (_, NonWord) => {
1218                let body = tb.mk_union(NodeId::BEGIN, word_id);
1219                Ok((tb.mk_lookbehind(body, NodeId::MISSING), idx + 1))
1220            }
1221            _ => Ok((self.make_full_word_boundary(tb)?, idx + 1)),
1222        }
1223    }
1224
1225    fn merge_boundary_with_following_lookaheads(
1226        &mut self,
1227        asts: &[Ast],
1228        wb_idx: usize,
1229        boundary_tail: NodeId,
1230        translator: &mut Option<Translator>,
1231        tb: &mut TB<'s>,
1232    ) -> Result<(NodeId, usize)> {
1233        let mut next = wb_idx + 1;
1234        let mut la_bodies = vec![boundary_tail];
1235        while next < asts.len() {
1236            match &asts[next] {
1237                Ast::Lookaround(la) if la.kind == ast::LookaroundKind::PositiveLookahead => {
1238                    let body = self.ast_to_node_id(&la.ast, translator, tb)?;
1239                    la_bodies.push(tb.mk_concat(body, NodeId::TS));
1240                    next += 1;
1241                }
1242                _ => break,
1243            }
1244        }
1245        let merged = tb.mk_inters(la_bodies.into_iter());
1246        Ok((tb.mk_lookahead(merged, NodeId::MISSING, 0), next))
1247    }
1248
1249    fn make_full_word_boundary(&mut self, tb: &mut TB<'s>) -> Result<NodeId> {
1250        let w = self.get_class(false, regex_syntax::ast::ClassPerlKind::Word, tb)?;
1251        let nw = self.get_class(true, regex_syntax::ast::ClassPerlKind::Word, tb)?;
1252        let lb_w = tb.mk_lookbehind(w, NodeId::MISSING);
1253        let la_nw = tb.mk_lookahead(nw, NodeId::MISSING, 0);
1254        let c1 = tb.mk_concat(lb_w, la_nw);
1255        let lb_nw = tb.mk_lookbehind(nw, NodeId::MISSING);
1256        let la_w = tb.mk_lookahead(w, NodeId::MISSING, 0);
1257        let c2 = tb.mk_concat(lb_nw, la_w);
1258        Ok(tb.mk_union(c1, c2))
1259    }
1260
1261    fn ast_to_node_id(
1262        &mut self,
1263        ast: &Ast,
1264        translator: &mut Option<Translator>,
1265        tb: &mut TB<'s>,
1266    ) -> Result<NodeId> {
1267        match ast {
1268            Ast::Empty(_) => Ok(NodeId::EPS),
1269            Ast::Flags(f) => {
1270                let mut translator_builder = Self::default_translator_builder();
1271                if let Some(state) = f.flags.flag_state(ast::Flag::CaseInsensitive) {
1272                    translator_builder.case_insensitive(state);
1273                }
1274                if let Some(state) = f.flags.flag_state(ast::Flag::Unicode) {
1275                    translator_builder.unicode(state);
1276                }
1277                let concat_translator = Some(translator_builder.build());
1278                *translator = concat_translator;
1279                Ok(NodeId::EPS)
1280            }
1281            Ast::Literal(l) => {
1282                let ast_lit = regex_syntax::ast::Ast::literal(*l.to_owned());
1283                self.translator_to_node_id(&ast_lit, translator, tb)
1284            }
1285            Ast::Top(_) => Ok(NodeId::TOP),
1286            Ast::Dot(_) => {
1287                let hirv = hir::Hir::dot(hir::Dot::AnyByteExceptLF);
1288                self.hir_to_node_id(&hirv, tb)
1289            }
1290            Ast::Assertion(a) => match &a.kind {
1291                ast::AssertionKind::StartText => Ok(NodeId::BEGIN),
1292                ast::AssertionKind::EndText => Ok(NodeId::END),
1293                ast::AssertionKind::WordBoundary => {
1294                    let word_id =
1295                        self.get_class(false, regex_syntax::ast::ClassPerlKind::Word, tb)?;
1296                    let not_word_id =
1297                        self.get_class(true, regex_syntax::ast::ClassPerlKind::Word, tb)?;
1298                    // case1 : (?<=word)(?=not_word)
1299                    let case1_1 = tb.mk_lookbehind(word_id, NodeId::MISSING);
1300                    let case1_2 = tb.mk_lookahead(not_word_id, NodeId::MISSING, 0);
1301                    let case1 = tb.mk_concat(case1_1, case1_2);
1302                    // case2 : (?<=not_word)(?=word)
1303                    let case2_1 = tb.mk_lookbehind(not_word_id, NodeId::MISSING);
1304                    let case2_2 = tb.mk_lookahead(word_id, NodeId::MISSING, 0);
1305                    let case2 = tb.mk_concat(case2_1, case2_2);
1306                    Ok(tb.mk_union(case1, case2))
1307                }
1308                ast::AssertionKind::NotWordBoundary => {
1309                    return Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))
1310                }
1311                ast::AssertionKind::StartLine => {
1312                    let left = NodeId::BEGIN;
1313                    let right = tb.mk_u8('\n' as u8);
1314                    let union = tb.mk_union(left, right);
1315                    Ok(tb.mk_lookbehind(union, NodeId::MISSING))
1316                }
1317                ast::AssertionKind::EndLine => {
1318                    let left = NodeId::END;
1319                    let right = tb.mk_u8('\n' as u8);
1320                    let union = tb.mk_union(left, right);
1321                    Ok(tb.mk_lookahead(union, NodeId::MISSING, 0))
1322                }
1323                ast::AssertionKind::WordBoundaryStart => todo!(),
1324                ast::AssertionKind::WordBoundaryEnd => todo!(),
1325                ast::AssertionKind::WordBoundaryStartAngle => todo!(),
1326                ast::AssertionKind::WordBoundaryEndAngle => Ok(tb.mk_string(">")),
1327                ast::AssertionKind::WordBoundaryStartHalf => todo!(),
1328                ast::AssertionKind::WordBoundaryEndHalf => todo!(),
1329            },
1330            Ast::ClassUnicode(c) => {
1331                let tmp = regex_syntax::ast::ClassUnicode {
1332                    span: c.span,
1333                    negated: c.negated,
1334                    kind: c.kind.clone(),
1335                };
1336                if !c.negated {
1337                    match &c.kind {
1338                        regex_syntax::ast::ClassUnicodeKind::Named(s) => match s.as_str() {
1339                            // \p{ascii} for ascii, \p{ascii}&\p{Letter} => [A-Za-z]
1340                            "ascii" => return Ok(tb.mk_range_u8(0, 127)),
1341                            // restricts matches to valid utf8, \p{utf8}*&~(a) => non a, but valid utf8
1342                            "utf8" => {
1343                                let ascii = tb.mk_range_u8(0, 127);
1344                                let beta = tb.mk_range_u8(128, 0xBF);
1345                                let c0 = tb.mk_range_u8(0xC0, 0xDF);
1346                                let c0s = tb.mk_concats([c0, beta].into_iter());
1347                                let e0 = tb.mk_range_u8(0xE0, 0xEF);
1348                                let e0s = tb.mk_concats([e0, beta, beta].into_iter());
1349                                let f0 = tb.mk_range_u8(0xF0, 0xF7);
1350                                let f0s = tb.mk_concats([f0, beta, beta, beta].into_iter());
1351                                let merged = tb.mk_unions([ascii, c0s, e0s, f0s].into_iter());
1352                                return Ok(tb.mk_star(merged));
1353                            }
1354                            "hex" => {
1355                                let nums = tb.mk_range_u8(b'0', b'9');
1356                                let lets = tb.mk_range_u8(b'a', b'f');
1357                                let lets2 = tb.mk_range_u8(b'A', b'F');
1358                                let merged = tb.mk_unions([nums, lets, lets2].into_iter());
1359                                return Ok(merged);
1360                            }
1361                            _ => {}
1362                        },
1363                        _ => {}
1364                    };
1365                }
1366
1367                let orig_ast = regex_syntax::ast::Ast::class_unicode(tmp);
1368                self.translator_to_node_id(&orig_ast, translator, tb)
1369            }
1370            Ast::ClassPerl(c) => {
1371                let tmp = regex_syntax::ast::ClassPerl {
1372                    span: c.span,
1373                    negated: c.negated,
1374                    kind: c.kind.clone(),
1375                };
1376                let orig_ast = regex_syntax::ast::Ast::class_perl(tmp);
1377                self.translator_to_node_id(&orig_ast, translator, tb)
1378            }
1379            Ast::ClassBracketed(c) => match &c.kind {
1380                regex_syntax::ast::ClassSet::Item(_) => {
1381                    let tmp = regex_syntax::ast::ClassBracketed {
1382                        span: c.span,
1383                        negated: c.negated,
1384                        kind: c.kind.clone(),
1385                    };
1386                    let orig_ast = regex_syntax::ast::Ast::class_bracketed(tmp);
1387                    self.translator_to_node_id(&orig_ast, translator, tb)
1388                }
1389                regex_syntax::ast::ClassSet::BinaryOp(_) => todo!(),
1390            },
1391            Ast::Repetition(r) => {
1392                let body = self.ast_to_node_id(&r.ast, translator, tb);
1393                match body {
1394                    Ok(body) => match &r.op.kind {
1395                        ast::RepetitionKind::ZeroOrOne => Ok(tb.mk_opt(body)),
1396                        ast::RepetitionKind::ZeroOrMore => Ok(tb.mk_star(body)),
1397                        ast::RepetitionKind::OneOrMore => Ok(tb.mk_plus(body)),
1398                        ast::RepetitionKind::Range(r) => match r {
1399                            ast::RepetitionRange::Exactly(n) => Ok(tb.mk_repeat(body, *n, *n)),
1400                            ast::RepetitionRange::AtLeast(n) => {
1401                                let rep = tb.mk_repeat(body, *n, *n);
1402                                let st = tb.mk_star(body);
1403                                Ok(tb.mk_concat(rep, st))
1404                            }
1405
1406                            ast::RepetitionRange::Bounded(n, m) => Ok(tb.mk_repeat(body, *n, *m)),
1407                        },
1408                    },
1409                    Err(_) => body,
1410                }
1411            }
1412            Ast::Lookaround(g) => {
1413                let body = self.ast_to_node_id(&g.ast, translator, tb)?;
1414                match g.kind {
1415                    ast::LookaroundKind::PositiveLookahead => {
1416                        Ok(tb.mk_lookahead(body, NodeId::MISSING, 0))
1417                    }
1418                    ast::LookaroundKind::PositiveLookbehind => {
1419                        Ok(tb.mk_lookbehind(body, NodeId::MISSING))
1420                    }
1421                    ast::LookaroundKind::NegativeLookahead => Ok(tb.mk_neg_lookahead(body, 0)),
1422                    ast::LookaroundKind::NegativeLookbehind => Ok(tb.mk_neg_lookbehind(body)),
1423                }
1424            }
1425            Ast::Group(g) => {
1426                let child = self.ast_to_node_id(&g.ast, translator, tb);
1427                child
1428            }
1429            Ast::Alternation(a) => {
1430                let mut children = vec![];
1431                for ast in &a.asts {
1432                    match self.ast_to_node_id(ast, translator, tb) {
1433                        Ok(node_id) => children.push(node_id),
1434                        Err(err) => return Err(err),
1435                    }
1436                }
1437                Ok(tb.mk_unions(children.iter().copied()))
1438            }
1439            Ast::Concat(c) => {
1440                let mut concat_translator: Option<Translator> = None;
1441                let mut children = vec![];
1442                let mut i = 0;
1443                while i < c.asts.len() {
1444                    let ast = &c.asts[i];
1445                    match ast {
1446                        Ast::Flags(f) => {
1447                            let mut translator_builder = Self::default_translator_builder();
1448                            translator_builder.utf8(false);
1449                            if let Some(state) = f.flags.flag_state(ast::Flag::CaseInsensitive) {
1450                                translator_builder.case_insensitive(state);
1451                            }
1452                            if let Some(state) = f.flags.flag_state(ast::Flag::Unicode) {
1453                                translator_builder.unicode(state);
1454                            }
1455                            concat_translator = Some(translator_builder.build());
1456                            i += 1;
1457                            continue;
1458                        }
1459                        Ast::Assertion(a) if a.kind == ast::AssertionKind::WordBoundary => {
1460                            let node =
1461                                self.rewrite_word_boundary_in_concat(&c.asts, i, translator, tb)?;
1462                            children.push(node.0);
1463                            i = node.1; // skip consumed lookaheads
1464                            continue;
1465                        }
1466                        _ => {}
1467                    }
1468                    match concat_translator {
1469                        Some(_) => match self.ast_to_node_id(ast, &mut concat_translator, tb) {
1470                            Ok(node_id) => children.push(node_id),
1471                            Err(err) => return Err(err),
1472                        },
1473                        None => match self.ast_to_node_id(ast, translator, tb) {
1474                            Ok(node_id) => children.push(node_id),
1475                            Err(err) => return Err(err),
1476                        },
1477                    }
1478                    i += 1;
1479                }
1480                Ok(tb.mk_concats(children.iter().cloned()))
1481            }
1482            Ast::Intersection(intersection) => {
1483                let mut children = vec![];
1484                for ast in &intersection.asts {
1485                    match self.ast_to_node_id(ast, translator, tb) {
1486                        Ok(node_id) => children.push(node_id),
1487                        Err(err) => return Err(err),
1488                    }
1489                }
1490                Ok(tb.mk_inters(children.into_iter()))
1491            }
1492            Ast::Complement(complement) => {
1493                let body = self.ast_to_node_id(&complement.ast, translator, tb);
1494                body.map(|x| tb.mk_compl(x))
1495            }
1496            Ast::TaggedEps(_) => Err(self.error(self.span(), ErrorKind::UnsupportedBackreference)),
1497        }
1498    }
1499
1500    /// Parse the regular expression and return an abstract syntax tree with
1501    /// all of the comments found in the pattern.
1502    fn parse(&mut self, tb: &mut TB<'s>) -> Result<NodeId> {
1503        let mut concat = Concat {
1504            span: self.span(),
1505            asts: vec![],
1506        };
1507        loop {
1508            self.bump_space();
1509            if self.is_eof() {
1510                break;
1511            }
1512            match self.char() {
1513                '(' => concat = self.push_group(concat)?,
1514                ')' => concat = self.pop_group(concat)?,
1515                '|' => concat = self.push_alternate(concat)?,
1516                '&' => concat = self.push_intersect(concat)?,
1517                '~' => concat = self.push_compl_group(concat)?,
1518                '[' => {
1519                    let class = self.parse_set_class()?;
1520                    concat.asts.push(Ast::class_bracketed(class));
1521                }
1522                '?' => {
1523                    concat =
1524                        self.parse_uncounted_repetition(concat, ast::RepetitionKind::ZeroOrOne)?;
1525                }
1526                '*' => {
1527                    concat =
1528                        self.parse_uncounted_repetition(concat, ast::RepetitionKind::ZeroOrMore)?;
1529                }
1530                '+' => {
1531                    concat =
1532                        self.parse_uncounted_repetition(concat, ast::RepetitionKind::OneOrMore)?;
1533                }
1534                '{' => {
1535                    concat = self.parse_counted_repetition(concat)?;
1536                }
1537                _ => concat.asts.push(self.parse_primitive()?.into_ast()),
1538            }
1539        }
1540        let ast = self.pop_group_end(concat)?;
1541        // dbg!(&ast);
1542        let node_id = self.ast_to_node_id(&ast, &mut None, tb);
1543        node_id
1544    }
1545
1546    #[inline(never)]
1547    fn parse_uncounted_repetition(
1548        &self,
1549        mut concat: ast::Concat,
1550        kind: ast::RepetitionKind,
1551    ) -> Result<ast::Concat> {
1552        // assert!(self.char() == '?' || self.char() == '*' || self.char() == '+');
1553        let op_start = self.pos();
1554        let ast = match concat.asts.pop() {
1555            Some(ast) => ast,
1556            None => return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing)),
1557        };
1558        match ast {
1559            Ast::Empty(_) | Ast::Flags(_) => {
1560                return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing))
1561            }
1562            _ => {}
1563        }
1564        let mut greedy = true;
1565        if self.bump() && self.char() == '?' {
1566            greedy = false;
1567            self.bump();
1568        }
1569        concat.asts.push(Ast::repetition(ast::Repetition {
1570            span: ast.span().with_end(self.pos()),
1571            op: ast::RepetitionOp {
1572                span: Span::new(op_start, self.pos()),
1573                kind,
1574            },
1575            greedy,
1576            ast: Box::new(ast),
1577        }));
1578        Ok(concat)
1579    }
1580
1581    #[inline(never)]
1582    fn parse_counted_repetition(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
1583        assert!(self.char() == '{');
1584        let start = self.pos();
1585        let ast = match concat.asts.pop() {
1586            Some(ast) => ast,
1587            None => return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing)),
1588        };
1589        match ast {
1590            Ast::Empty(_) | Ast::Flags(_) => {
1591                return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing))
1592            }
1593            _ => {}
1594        }
1595        if !self.bump_and_bump_space() {
1596            return Err(self.error(
1597                Span::new(start, self.pos()),
1598                ast::ErrorKind::RepetitionCountUnclosed,
1599            ));
1600        }
1601        let count_start = specialize_err(
1602            self.parse_decimal(),
1603            ast::ErrorKind::DecimalEmpty,
1604            ast::ErrorKind::RepetitionCountDecimalEmpty,
1605        );
1606        if self.is_eof() {
1607            return Err(self.error(
1608                Span::new(start, self.pos()),
1609                ast::ErrorKind::RepetitionCountUnclosed,
1610            ));
1611        }
1612        let range = if self.char() == ',' {
1613            if !self.bump_and_bump_space() {
1614                return Err(self.error(
1615                    Span::new(start, self.pos()),
1616                    ast::ErrorKind::RepetitionCountUnclosed,
1617                ));
1618            }
1619            if self.char() != '}' {
1620                let count_start = match count_start {
1621                    Ok(c) => c,
1622                    Err(err) if err.kind == ast::ErrorKind::RepetitionCountDecimalEmpty => {
1623                        if self.parser().empty_min_range {
1624                            0
1625                        } else {
1626                            return Err(err);
1627                        }
1628                    }
1629                    err => err?,
1630                };
1631                let count_end = specialize_err(
1632                    self.parse_decimal(),
1633                    ast::ErrorKind::DecimalEmpty,
1634                    ast::ErrorKind::RepetitionCountDecimalEmpty,
1635                )?;
1636                ast::RepetitionRange::Bounded(count_start, count_end)
1637            } else {
1638                ast::RepetitionRange::AtLeast(count_start?)
1639            }
1640        } else {
1641            ast::RepetitionRange::Exactly(count_start?)
1642        };
1643
1644        if self.is_eof() || self.char() != '}' {
1645            return Err(self.error(
1646                Span::new(start, self.pos()),
1647                ast::ErrorKind::RepetitionCountUnclosed,
1648            ));
1649        }
1650
1651        let mut greedy = true;
1652        if self.bump_and_bump_space() && self.char() == '?' {
1653            greedy = false;
1654            self.bump();
1655        }
1656
1657        let op_span = Span::new(start, self.pos());
1658        if !range.is_valid() {
1659            return Err(self.error(op_span, ast::ErrorKind::RepetitionCountInvalid));
1660        }
1661        concat.asts.push(Ast::repetition(ast::Repetition {
1662            span: ast.span().with_end(self.pos()),
1663            op: ast::RepetitionOp {
1664                span: op_span,
1665                kind: ast::RepetitionKind::Range(range),
1666            },
1667            greedy,
1668            ast: Box::new(ast),
1669        }));
1670        Ok(concat)
1671    }
1672
1673    #[inline(never)]
1674    fn parse_group(&self) -> Result<Either<ast::SetFlags, ast::Group>> {
1675        assert_eq!(self.char(), '(');
1676        let open_span = self.span_char();
1677        self.bump();
1678        self.bump_space();
1679        if let Some((ahead, pos)) = self.is_lookaround_prefix() {
1680            let kind = match (pos, ahead) {
1681                (true, true) => LookaroundKind::PositiveLookahead,
1682                (true, false) => LookaroundKind::PositiveLookbehind,
1683                (false, true) => LookaroundKind::NegativeLookahead,
1684                (false, false) => LookaroundKind::NegativeLookbehind,
1685            };
1686            return Ok(Either::Right(ast::Group {
1687                span: open_span,
1688                kind: ast::GroupKind::Lookaround(kind),
1689                ast: Box::new(Ast::empty(self.span())),
1690            }));
1691        }
1692        let inner_span = self.span();
1693        let mut starts_with_p = true;
1694        if self.bump_if("?P<") || {
1695            starts_with_p = false;
1696            self.bump_if("?<")
1697        } {
1698            let capture_index = self.next_capture_index(open_span)?;
1699            let name = self.parse_capture_name(capture_index)?;
1700            Ok(Either::Right(ast::Group {
1701                span: open_span,
1702                kind: ast::GroupKind::CaptureName {
1703                    starts_with_p,
1704                    name,
1705                },
1706                ast: Box::new(Ast::empty(self.span())),
1707            }))
1708        } else if self.bump_if("?") {
1709            if self.is_eof() {
1710                return Err(self.error(open_span, ast::ErrorKind::GroupUnclosed));
1711            }
1712            let flags = self.parse_flags()?;
1713            let char_end = self.char();
1714            self.bump();
1715            if char_end == ')' {
1716                // We don't allow empty flags, e.g., `(?)`. We instead
1717                // interpret it as a repetition operator missing its argument.
1718                if flags.items.is_empty() {
1719                    return Err(self.error(inner_span, ast::ErrorKind::RepetitionMissing));
1720                }
1721                Ok(Either::Left(ast::SetFlags {
1722                    span: Span {
1723                        end: self.pos(),
1724                        ..open_span
1725                    },
1726                    flags,
1727                }))
1728            } else {
1729                assert_eq!(char_end, ':');
1730                Ok(Either::Right(ast::Group {
1731                    span: open_span,
1732                    kind: ast::GroupKind::NonCapturing(flags),
1733                    ast: Box::new(Ast::empty(self.span())),
1734                }))
1735            }
1736        } else {
1737            let capture_index = self.next_capture_index(open_span)?;
1738            Ok(Either::Right(ast::Group {
1739                span: open_span,
1740                kind: ast::GroupKind::CaptureIndex(capture_index),
1741                ast: Box::new(Ast::empty(self.span())),
1742            }))
1743        }
1744    }
1745
1746    #[inline(never)]
1747    fn parse_capture_name(&self, capture_index: u32) -> Result<ast::CaptureName> {
1748        if self.is_eof() {
1749            return Err(self.error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
1750        }
1751        let start = self.pos();
1752        loop {
1753            if self.char() == '>' {
1754                break;
1755            }
1756            if !is_capture_char(self.char(), self.pos() == start) {
1757                return Err(self.error(self.span_char(), ast::ErrorKind::GroupNameInvalid));
1758            }
1759            if !self.bump() {
1760                break;
1761            }
1762        }
1763        let end = self.pos();
1764        if self.is_eof() {
1765            return Err(self.error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
1766        }
1767        assert_eq!(self.char(), '>');
1768        self.bump();
1769        let name = &self.pattern()[start.offset..end.offset];
1770        if name.is_empty() {
1771            return Err(self.error(Span::new(start, start), ast::ErrorKind::GroupNameEmpty));
1772        }
1773        let capname = ast::CaptureName {
1774            span: Span::new(start, end),
1775            name: name.to_string(),
1776            index: capture_index,
1777        };
1778        self.add_capture_name(&capname)?;
1779        Ok(capname)
1780    }
1781
1782    #[inline(never)]
1783    fn parse_flags(&self) -> Result<ast::Flags> {
1784        let mut flags = ast::Flags {
1785            span: self.span(),
1786            items: vec![],
1787        };
1788        let mut last_was_negation = None;
1789        while self.char() != ':' && self.char() != ')' {
1790            if self.char() == '-' {
1791                last_was_negation = Some(self.span_char());
1792                let item = ast::FlagsItem {
1793                    span: self.span_char(),
1794                    kind: ast::FlagsItemKind::Negation,
1795                };
1796                if let Some(i) = flags.add_item(item) {
1797                    return Err(self.error(
1798                        self.span_char(),
1799                        ast::ErrorKind::FlagRepeatedNegation {
1800                            original: flags.items[i].span,
1801                        },
1802                    ));
1803                }
1804            } else {
1805                last_was_negation = None;
1806                let item = ast::FlagsItem {
1807                    span: self.span_char(),
1808                    kind: ast::FlagsItemKind::Flag(self.parse_flag()?),
1809                };
1810                if let Some(i) = flags.add_item(item) {
1811                    return Err(self.error(
1812                        self.span_char(),
1813                        ast::ErrorKind::FlagDuplicate {
1814                            original: flags.items[i].span,
1815                        },
1816                    ));
1817                }
1818            }
1819            if !self.bump() {
1820                return Err(self.error(self.span(), ast::ErrorKind::FlagUnexpectedEof));
1821            }
1822        }
1823        if let Some(span) = last_was_negation {
1824            return Err(self.error(span, ast::ErrorKind::FlagDanglingNegation));
1825        }
1826        flags.span.end = self.pos();
1827        Ok(flags)
1828    }
1829
1830    #[inline(never)]
1831    fn parse_flag(&self) -> Result<ast::Flag> {
1832        match self.char() {
1833            'i' => Ok(ast::Flag::CaseInsensitive),
1834            'm' => Ok(ast::Flag::MultiLine),
1835            's' => Ok(ast::Flag::DotMatchesNewLine),
1836            'U' => Ok(ast::Flag::SwapGreed),
1837            'u' => Ok(ast::Flag::Unicode),
1838            'R' => Ok(ast::Flag::CRLF),
1839            'x' => Ok(ast::Flag::IgnoreWhitespace),
1840            _ => Err(self.error(self.span_char(), ast::ErrorKind::FlagUnrecognized)),
1841        }
1842    }
1843
1844    fn parse_primitive(&self) -> Result<Primitive> {
1845        match self.char() {
1846            '\\' => self.parse_escape(),
1847            '_' => {
1848                let ast = Primitive::Top(self.span_char());
1849                self.bump();
1850                Ok(ast)
1851            }
1852            '.' => {
1853                let ast = Primitive::Dot(self.span_char());
1854                self.bump();
1855                Ok(ast)
1856            }
1857            '^' => {
1858                let ast = Primitive::Assertion(ast::Assertion {
1859                    span: self.span_char(),
1860                    kind: ast::AssertionKind::StartLine,
1861                });
1862                self.bump();
1863                Ok(ast)
1864            }
1865            '$' => {
1866                let ast = Primitive::Assertion(ast::Assertion {
1867                    span: self.span_char(),
1868                    kind: ast::AssertionKind::EndLine,
1869                });
1870                self.bump();
1871                Ok(ast)
1872            }
1873            c => {
1874                let ast = Primitive::Literal(Literal {
1875                    span: self.span_char(),
1876                    kind: LiteralKind::Verbatim,
1877                    c,
1878                });
1879                self.bump();
1880                Ok(ast)
1881            }
1882        }
1883    }
1884
1885    #[inline(never)]
1886    fn parse_escape(&self) -> Result<Primitive> {
1887        assert_eq!(self.char(), '\\');
1888        let start = self.pos();
1889        if !self.bump() {
1890            return Err(self.error(
1891                Span::new(start, self.pos()),
1892                ast::ErrorKind::EscapeUnexpectedEof,
1893            ));
1894        }
1895        let c = self.char();
1896        // Put some of the more complicated routines into helpers.
1897        match c {
1898            '0'..='9' => {
1899                if !self.parser().octal {
1900                    let contents = self.parse_decimal()?;
1901                    let ast = TaggedEps {
1902                        span: self.span(),
1903                        num: contents,
1904                    };
1905                    return Ok(Primitive::TaggedEps(ast));
1906                }
1907                let mut lit = self.parse_octal();
1908                lit.span.start = start;
1909                return Ok(Primitive::Literal(lit));
1910            }
1911            // '8'..='9' if !self.parser().octal => {
1912            //     return Err(self.error(
1913            //         Span::new(start, self.span_char().end),
1914            //         ast::ErrorKind::UnsupportedBackreference,
1915            //     ));
1916            // }
1917            'x' | 'u' | 'U' => {
1918                let mut lit = self.parse_hex()?;
1919                lit.span.start = start;
1920                return Ok(Primitive::Literal(lit));
1921            }
1922            'p' | 'P' => {
1923                let mut cls = self.parse_unicode_class()?;
1924                cls.span.start = start;
1925                return Ok(Primitive::Unicode(cls));
1926            }
1927            'd' | 's' | 'w' | 'D' | 'S' | 'W' => {
1928                let mut cls = self.parse_perl_class();
1929                cls.span.start = start;
1930                return Ok(Primitive::Perl(cls));
1931            }
1932            _ => {}
1933        }
1934
1935        // Handle all of the one letter sequences inline.
1936        self.bump();
1937        let span = Span::new(start, self.pos());
1938        if is_meta_character(c) {
1939            return Ok(Primitive::Literal(Literal {
1940                span,
1941                kind: LiteralKind::Meta,
1942                c,
1943            }));
1944        }
1945        if is_escapeable_character(c) {
1946            return Ok(Primitive::Literal(Literal {
1947                span,
1948                kind: LiteralKind::Superfluous,
1949                c,
1950            }));
1951        }
1952        let special = |kind, c| {
1953            Ok(Primitive::Literal(Literal {
1954                span,
1955                kind: LiteralKind::Special(kind),
1956                c,
1957            }))
1958        };
1959        match c {
1960            'a' => special(SpecialLiteralKind::Bell, '\x07'),
1961            'f' => special(SpecialLiteralKind::FormFeed, '\x0C'),
1962            't' => special(SpecialLiteralKind::Tab, '\t'),
1963            'n' => special(SpecialLiteralKind::LineFeed, '\n'),
1964            'r' => special(SpecialLiteralKind::CarriageReturn, '\r'),
1965            'v' => special(SpecialLiteralKind::VerticalTab, '\x0B'),
1966            'A' => Ok(Primitive::Assertion(ast::Assertion {
1967                span,
1968                kind: ast::AssertionKind::StartText,
1969            })),
1970            'z' => Ok(Primitive::Assertion(ast::Assertion {
1971                span,
1972                kind: ast::AssertionKind::EndText,
1973            })),
1974            'b' => {
1975                let mut wb = ast::Assertion {
1976                    span,
1977                    kind: ast::AssertionKind::WordBoundary,
1978                };
1979                // After a \b, we "try" to parse things like \b{start} for
1980                // special word boundary assertions.
1981                if !self.is_eof() && self.char() == '{' {
1982                    if let Some(kind) = self.maybe_parse_special_word_boundary(start)? {
1983                        wb.kind = kind;
1984                        wb.span.end = self.pos();
1985                    }
1986                }
1987                Ok(Primitive::Assertion(wb))
1988            }
1989            'B' => Ok(Primitive::Assertion(ast::Assertion {
1990                span,
1991                kind: ast::AssertionKind::NotWordBoundary,
1992            })),
1993            '<' => Ok(Primitive::Assertion(ast::Assertion {
1994                span,
1995                kind: ast::AssertionKind::WordBoundaryStartAngle,
1996            })),
1997            '>' => Ok(Primitive::Assertion(ast::Assertion {
1998                span,
1999                kind: ast::AssertionKind::WordBoundaryEndAngle,
2000            })),
2001            _ => Err(self.error(span, ast::ErrorKind::EscapeUnrecognized)),
2002        }
2003    }
2004
2005    fn maybe_parse_special_word_boundary(
2006        &self,
2007        wb_start: Position,
2008    ) -> Result<Option<ast::AssertionKind>> {
2009        assert_eq!(self.char(), '{');
2010
2011        let is_valid_char = |c| match c {
2012            'A'..='Z' | 'a'..='z' | '-' => true,
2013            _ => false,
2014        };
2015        let start = self.pos();
2016        if !self.bump_and_bump_space() {
2017            return Err(self.error(
2018                Span::new(wb_start, self.pos()),
2019                ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof,
2020            ));
2021        }
2022        let start_contents = self.pos();
2023        // This is one of the critical bits: if the first non-whitespace
2024        // character isn't in [-A-Za-z] (i.e., this can't be a special word
2025        // boundary), then we bail and let the counted repetition parser deal
2026        // with this.
2027        if !is_valid_char(self.char()) {
2028            self.parser().pos.set(start);
2029            return Ok(None);
2030        }
2031
2032        // Now collect up our chars until we see a '}'.
2033        let mut scratch = self.parser().scratch.borrow_mut();
2034        scratch.clear();
2035        while !self.is_eof() && is_valid_char(self.char()) {
2036            scratch.push(self.char());
2037            self.bump_and_bump_space();
2038        }
2039        if self.is_eof() || self.char() != '}' {
2040            return Err(self.error(
2041                Span::new(start, self.pos()),
2042                ast::ErrorKind::SpecialWordBoundaryUnclosed,
2043            ));
2044        }
2045        let end = self.pos();
2046        self.bump();
2047        let kind = match scratch.as_str() {
2048            "start" => ast::AssertionKind::WordBoundaryStart,
2049            "end" => ast::AssertionKind::WordBoundaryEnd,
2050            "start-half" => ast::AssertionKind::WordBoundaryStartHalf,
2051            "end-half" => ast::AssertionKind::WordBoundaryEndHalf,
2052            _ => {
2053                return Err(self.error(
2054                    Span::new(start_contents, end),
2055                    ast::ErrorKind::SpecialWordBoundaryUnrecognized,
2056                ))
2057            }
2058        };
2059        Ok(Some(kind))
2060    }
2061
2062    #[inline(never)]
2063    fn parse_octal(&self) -> Literal {
2064        assert!(self.parser().octal);
2065        assert!('0' <= self.char() && self.char() <= '7');
2066        let start = self.pos();
2067        // Parse up to two more digits.
2068        while self.bump()
2069            && '0' <= self.char()
2070            && self.char() <= '7'
2071            && self.pos().offset - start.offset <= 2
2072        {}
2073        let end = self.pos();
2074        let octal = &self.pattern()[start.offset..end.offset];
2075        // Parsing the octal should never fail since the above guarantees a
2076        // valid number.
2077        let codepoint = u32::from_str_radix(octal, 8).expect("valid octal number");
2078        // The max value for 3 digit octal is 0777 = 511 and [0, 511] has no
2079        // invalid Unicode scalar values.
2080        let c = char::from_u32(codepoint).expect("Unicode scalar value");
2081        Literal {
2082            span: Span::new(start, end),
2083            kind: LiteralKind::Octal,
2084            c,
2085        }
2086    }
2087
2088    #[inline(never)]
2089    fn parse_hex(&self) -> Result<Literal> {
2090        assert!(self.char() == 'x' || self.char() == 'u' || self.char() == 'U');
2091
2092        let hex_kind = match self.char() {
2093            'x' => HexLiteralKind::X,
2094            'u' => HexLiteralKind::UnicodeShort,
2095            _ => HexLiteralKind::UnicodeLong,
2096        };
2097        if !self.bump_and_bump_space() {
2098            return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2099        }
2100        if self.char() == '{' {
2101            self.parse_hex_brace(hex_kind)
2102        } else {
2103            self.parse_hex_digits(hex_kind)
2104        }
2105    }
2106
2107    #[inline(never)]
2108    fn parse_hex_digits(&self, kind: HexLiteralKind) -> Result<Literal> {
2109        let mut scratch = self.parser().scratch.borrow_mut();
2110        scratch.clear();
2111
2112        let start = self.pos();
2113        for i in 0..kind.digits() {
2114            if i > 0 && !self.bump_and_bump_space() {
2115                return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2116            }
2117            if !is_hex(self.char()) {
2118                return Err(self.error(self.span_char(), ast::ErrorKind::EscapeHexInvalidDigit));
2119            }
2120            scratch.push(self.char());
2121        }
2122        // The final bump just moves the parser past the literal, which may
2123        // be EOF.
2124        self.bump_and_bump_space();
2125        let end = self.pos();
2126        let hex = scratch.as_str();
2127        match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
2128            None => Err(self.error(Span::new(start, end), ast::ErrorKind::EscapeHexInvalid)),
2129            Some(c) => Ok(Literal {
2130                span: Span::new(start, end),
2131                kind: LiteralKind::HexFixed(kind),
2132                c,
2133            }),
2134        }
2135    }
2136
2137    #[inline(never)]
2138    fn parse_hex_brace(&self, kind: HexLiteralKind) -> Result<Literal> {
2139        let mut scratch = self.parser().scratch.borrow_mut();
2140        scratch.clear();
2141
2142        let brace_pos = self.pos();
2143        let start = self.span_char().end;
2144        while self.bump_and_bump_space() && self.char() != '}' {
2145            if !is_hex(self.char()) {
2146                return Err(self.error(self.span_char(), ast::ErrorKind::EscapeHexInvalidDigit));
2147            }
2148            scratch.push(self.char());
2149        }
2150        if self.is_eof() {
2151            return Err(self.error(
2152                Span::new(brace_pos, self.pos()),
2153                ast::ErrorKind::EscapeUnexpectedEof,
2154            ));
2155        }
2156        let end = self.pos();
2157        let hex = scratch.as_str();
2158        assert_eq!(self.char(), '}');
2159        self.bump_and_bump_space();
2160
2161        if hex.is_empty() {
2162            return Err(self.error(
2163                Span::new(brace_pos, self.pos()),
2164                ast::ErrorKind::EscapeHexEmpty,
2165            ));
2166        }
2167        match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
2168            None => Err(self.error(Span::new(start, end), ast::ErrorKind::EscapeHexInvalid)),
2169            Some(c) => Ok(Literal {
2170                span: Span::new(start, self.pos()),
2171                kind: LiteralKind::HexBrace(kind),
2172                c,
2173            }),
2174        }
2175    }
2176
2177    fn parse_decimal(&self) -> Result<u32> {
2178        let mut scratch = self.parser().scratch.borrow_mut();
2179        scratch.clear();
2180
2181        while !self.is_eof() && self.char().is_whitespace() {
2182            self.bump();
2183        }
2184        let start = self.pos();
2185        while !self.is_eof() && '0' <= self.char() && self.char() <= '9' {
2186            scratch.push(self.char());
2187            self.bump_and_bump_space();
2188        }
2189        let span = Span::new(start, self.pos());
2190        while !self.is_eof() && self.char().is_whitespace() {
2191            self.bump_and_bump_space();
2192        }
2193        let digits = scratch.as_str();
2194        if digits.is_empty() {
2195            return Err(self.error(span, ast::ErrorKind::DecimalEmpty));
2196        }
2197        match u32::from_str_radix(digits, 10).ok() {
2198            Some(n) => Ok(n),
2199            None => Err(self.error(span, ast::ErrorKind::DecimalInvalid)),
2200        }
2201    }
2202
2203    #[inline(never)]
2204    fn parse_set_class(&self) -> Result<ClassBracketed> {
2205        assert_eq!(self.char(), '[');
2206
2207        let mut union = ClassSetUnion {
2208            span: self.span(),
2209            items: vec![],
2210        };
2211        loop {
2212            self.bump_space();
2213            if self.is_eof() {
2214                return Err(self.unclosed_class_error());
2215            }
2216            match self.char() {
2217                '[' => {
2218                    // If we've already parsed the opening bracket, then
2219                    // attempt to treat this as the beginning of an ASCII
2220                    // class. If ASCII class parsing fails, then the parser
2221                    // backs up to `[`.
2222                    if !self.parser().stack_class.borrow().is_empty() {
2223                        if let Some(cls) = self.maybe_parse_ascii_class() {
2224                            union.push(ClassSetItem::Ascii(cls));
2225                            continue;
2226                        }
2227                    }
2228                    union = self.push_class_open(union)?;
2229                }
2230                ']' => match self.pop_class(union)? {
2231                    Either::Left(nested_union) => {
2232                        union = nested_union;
2233                    }
2234                    Either::Right(class) => return Ok(class),
2235                },
2236                '&' if self.peek() == Some('&') => {
2237                    assert!(self.bump_if("&&"));
2238                    union = self.push_class_op(ClassSetBinaryOpKind::Intersection, union);
2239                }
2240                '-' if self.peek() == Some('-') => {
2241                    assert!(self.bump_if("--"));
2242                    union = self.push_class_op(ClassSetBinaryOpKind::Difference, union);
2243                }
2244                '~' if self.peek() == Some('~') => {
2245                    assert!(self.bump_if("~~"));
2246                    union = self.push_class_op(ClassSetBinaryOpKind::SymmetricDifference, union);
2247                }
2248                _ => {
2249                    union.push(self.parse_set_class_range()?);
2250                }
2251            }
2252        }
2253    }
2254
2255    #[inline(never)]
2256    fn parse_set_class_range(&self) -> Result<ClassSetItem> {
2257        let prim1 = self.parse_set_class_item()?;
2258        self.bump_space();
2259        if self.is_eof() {
2260            return Err(self.unclosed_class_error());
2261        }
2262        if self.char() != '-' || self.peek_space() == Some(']') || self.peek_space() == Some('-') {
2263            return prim1.into_class_set_item(self);
2264        }
2265        if !self.bump_and_bump_space() {
2266            return Err(self.unclosed_class_error());
2267        }
2268        let prim2 = self.parse_set_class_item()?;
2269        let range = ClassSetRange {
2270            span: Span::new(prim1.span().start, prim2.span().end),
2271            start: prim1.into_class_literal(self)?,
2272            end: prim2.into_class_literal(self)?,
2273        };
2274        if !range.is_valid() {
2275            return Err(self.error(range.span, ast::ErrorKind::ClassRangeInvalid));
2276        }
2277        Ok(ClassSetItem::Range(range))
2278    }
2279
2280    #[inline(never)]
2281    fn parse_set_class_item(&self) -> Result<Primitive> {
2282        if self.char() == '\\' {
2283            self.parse_escape()
2284        } else {
2285            let x = Primitive::Literal(Literal {
2286                span: self.span_char(),
2287                kind: LiteralKind::Verbatim,
2288                c: self.char(),
2289            });
2290            self.bump();
2291            Ok(x)
2292        }
2293    }
2294
2295    #[inline(never)]
2296    fn parse_set_class_open(&self) -> Result<(ClassBracketed, ClassSetUnion)> {
2297        assert_eq!(self.char(), '[');
2298        let start = self.pos();
2299        if !self.bump_and_bump_space() {
2300            return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2301        }
2302
2303        let negated = if self.char() != '^' {
2304            false
2305        } else {
2306            if !self.bump_and_bump_space() {
2307                return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2308            }
2309            true
2310        };
2311        // Accept any number of `-` as literal `-`.
2312        let mut union = ClassSetUnion {
2313            span: self.span(),
2314            items: vec![],
2315        };
2316        while self.char() == '-' {
2317            union.push(ClassSetItem::Literal(Literal {
2318                span: self.span_char(),
2319                kind: LiteralKind::Verbatim,
2320                c: '-',
2321            }));
2322            if !self.bump_and_bump_space() {
2323                return Err(self.error(Span::new(start, start), ast::ErrorKind::ClassUnclosed));
2324            }
2325        }
2326        // If `]` is the *first* char in a set, then interpret it as a literal
2327        // `]`. That is, an empty class is impossible to write.
2328        if union.items.is_empty() && self.char() == ']' {
2329            union.push(ClassSetItem::Literal(Literal {
2330                span: self.span_char(),
2331                kind: LiteralKind::Verbatim,
2332                c: ']',
2333            }));
2334            if !self.bump_and_bump_space() {
2335                return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2336            }
2337        }
2338        let set = ClassBracketed {
2339            span: Span::new(start, self.pos()),
2340            negated,
2341            kind: ClassSet::union(ClassSetUnion {
2342                span: Span::new(union.span.start, union.span.start),
2343                items: vec![],
2344            }),
2345        };
2346        Ok((set, union))
2347    }
2348
2349    #[inline(never)]
2350    fn maybe_parse_ascii_class(&self) -> Option<ClassAscii> {
2351        assert_eq!(self.char(), '[');
2352        // If parsing fails, then we back up the parser to this starting point.
2353        let start = self.pos();
2354        let mut negated = false;
2355        if !self.bump() || self.char() != ':' {
2356            self.parser().pos.set(start);
2357            return None;
2358        }
2359        if !self.bump() {
2360            self.parser().pos.set(start);
2361            return None;
2362        }
2363        if self.char() == '^' {
2364            negated = true;
2365            if !self.bump() {
2366                self.parser().pos.set(start);
2367                return None;
2368            }
2369        }
2370        let name_start = self.offset();
2371        while self.char() != ':' && self.bump() {}
2372        if self.is_eof() {
2373            self.parser().pos.set(start);
2374            return None;
2375        }
2376        let name = &self.pattern()[name_start..self.offset()];
2377        if !self.bump_if(":]") {
2378            self.parser().pos.set(start);
2379            return None;
2380        }
2381        let kind = match regex_syntax::ast::ClassAsciiKind::from_name(name) {
2382            Some(kind) => kind,
2383            None => {
2384                self.parser().pos.set(start);
2385                return None;
2386            }
2387        };
2388        Some(ClassAscii {
2389            span: Span::new(start, self.pos()),
2390            kind,
2391            negated,
2392        })
2393    }
2394
2395    #[inline(never)]
2396    fn parse_unicode_class(&self) -> Result<ClassUnicode> {
2397        assert!(self.char() == 'p' || self.char() == 'P');
2398
2399        let mut scratch = self.parser().scratch.borrow_mut();
2400        scratch.clear();
2401
2402        let negated = self.char() == 'P';
2403        if !self.bump_and_bump_space() {
2404            return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2405        }
2406        let (start, kind) = if self.char() == '{' {
2407            let start = self.span_char().end;
2408            while self.bump_and_bump_space() && self.char() != '}' {
2409                scratch.push(self.char());
2410            }
2411            if self.is_eof() {
2412                return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2413            }
2414            assert_eq!(self.char(), '}');
2415            self.bump();
2416
2417            let name = scratch.as_str();
2418            if let Some(i) = name.find("!=") {
2419                (
2420                    start,
2421                    ClassUnicodeKind::NamedValue {
2422                        op: ClassUnicodeOpKind::NotEqual,
2423                        name: name[..i].to_string(),
2424                        value: name[i + 2..].to_string(),
2425                    },
2426                )
2427            } else if let Some(i) = name.find(':') {
2428                (
2429                    start,
2430                    ClassUnicodeKind::NamedValue {
2431                        op: ClassUnicodeOpKind::Colon,
2432                        name: name[..i].to_string(),
2433                        value: name[i + 1..].to_string(),
2434                    },
2435                )
2436            } else if let Some(i) = name.find('=') {
2437                (
2438                    start,
2439                    ClassUnicodeKind::NamedValue {
2440                        op: ClassUnicodeOpKind::Equal,
2441                        name: name[..i].to_string(),
2442                        value: name[i + 1..].to_string(),
2443                    },
2444                )
2445            } else {
2446                (start, ClassUnicodeKind::Named(name.to_string()))
2447            }
2448        } else {
2449            let start = self.pos();
2450            let c = self.char();
2451            if c == '\\' {
2452                return Err(self.error(self.span_char(), ast::ErrorKind::UnicodeClassInvalid));
2453            }
2454            self.bump_and_bump_space();
2455            let kind = ClassUnicodeKind::OneLetter(c);
2456            (start, kind)
2457        };
2458        Ok(ClassUnicode {
2459            span: Span::new(start, self.pos()),
2460            negated,
2461            kind,
2462        })
2463    }
2464
2465    #[inline(never)]
2466    fn parse_perl_class(&self) -> ClassPerl {
2467        let c = self.char();
2468        let span = self.span_char();
2469        self.bump();
2470        let (negated, kind) = match c {
2471            'd' => (false, regex_syntax::ast::ClassPerlKind::Digit),
2472            'D' => (true, regex_syntax::ast::ClassPerlKind::Digit),
2473            's' => (false, regex_syntax::ast::ClassPerlKind::Space),
2474            'S' => (true, regex_syntax::ast::ClassPerlKind::Space),
2475            'w' => (false, regex_syntax::ast::ClassPerlKind::Word),
2476            'W' => (true, regex_syntax::ast::ClassPerlKind::Word),
2477            c => panic!("expected valid Perl class but got '{}'", c),
2478        };
2479        ClassPerl {
2480            span,
2481            kind,
2482            negated,
2483        }
2484    }
2485}
2486
2487pub fn parse_ast<'s>(
2488    tb: &mut TB<'s>,
2489    pattern: &'s str,
2490) -> std::result::Result<NodeId, ResharpError> {
2491    let mut p: ResharpParser<'s> = ResharpParser::new(pattern);
2492    let result = p.parse(tb);
2493    result
2494}
resharp_parser/lib.rs

resharp_parser/
lib.rs