resharp_parser/
lib.rs

1//! Parser for resharp regex patterns.
2//!
3//! Converts regex pattern strings into the node representation used by resharp-algebra.
4
5#![warn(dead_code)]
6mod ast;
7use std::cell::{Cell, RefCell};
8
9use ast::{Ast, Concat, ErrorKind, GroupKind, LookaroundKind};
10use regex_syntax::{
11    ast::{
12        ClassAscii, ClassBracketed, ClassPerl, ClassSet, ClassSetBinaryOpKind, ClassSetItem,
13        ClassSetRange, ClassSetUnion, ClassUnicode, ClassUnicodeKind, ClassUnicodeOpKind,
14        HexLiteralKind, Literal, LiteralKind, Position, Span, SpecialLiteralKind,
15    },
16    hir::{
17        self,
18        translate::{Translator, TranslatorBuilder},
19    },
20    utf8::Utf8Sequences,
21};
22use resharp_algebra::NodeId;
23
24type TB<'s> = resharp_algebra::RegexBuilder;
25
26#[derive(Clone, Copy, PartialEq)]
27enum WordCharKind {
28    Word,
29    NonWord,
30    MaybeWord,
31    MaybeNonWord,
32    Unknown,
33    Edge,
34}
35
36fn is_word_byte(b: u8) -> bool {
37    b.is_ascii_alphanumeric() || b == b'_'
38}
39
40#[derive(Clone, Debug, Eq, PartialEq)]
41enum Primitive {
42    Literal(Literal),
43    Assertion(ast::Assertion),
44    Dot(Span),
45    Top(Span),
46    Perl(ClassPerl),
47    Unicode(ClassUnicode),
48}
49
50impl Primitive {
51    fn span(&self) -> &Span {
52        match *self {
53            Primitive::Literal(ref x) => &x.span,
54            Primitive::Assertion(ref x) => &x.span,
55            Primitive::Dot(ref span) => span,
56            Primitive::Top(ref span) => span,
57            Primitive::Perl(ref x) => &x.span,
58            Primitive::Unicode(ref x) => &x.span,
59        }
60    }
61
62    fn into_ast(self) -> Ast {
63        match self {
64            Primitive::Literal(lit) => Ast::literal(lit),
65            Primitive::Assertion(assert) => Ast::assertion(assert),
66            Primitive::Dot(span) => Ast::dot(span),
67            Primitive::Top(span) => Ast::top(span),
68            Primitive::Perl(cls) => Ast::class_perl(cls),
69            Primitive::Unicode(cls) => Ast::class_unicode(cls),
70        }
71    }
72
73    fn into_class_set_item(self, p: &ResharpParser) -> Result<regex_syntax::ast::ClassSetItem> {
74        use self::Primitive::*;
75        use regex_syntax::ast::ClassSetItem;
76
77        match self {
78            Literal(lit) => Ok(ClassSetItem::Literal(lit)),
79            Perl(cls) => Ok(ClassSetItem::Perl(cls)),
80            Unicode(cls) => Ok(ClassSetItem::Unicode(cls)),
81            x => Err(p.error(*x.span(), ast::ErrorKind::ClassEscapeInvalid)),
82        }
83    }
84
85    fn into_class_literal(self, p: &ResharpParser) -> Result<Literal> {
86        use self::Primitive::*;
87
88        match self {
89            Literal(lit) => Ok(lit),
90            x => Err(p.error(*x.span(), ast::ErrorKind::ClassRangeLiteral)),
91        }
92    }
93}
94
95#[derive(Clone, Debug, Eq, PartialEq)]
96pub enum Either<Left, Right> {
97    Left(Left),
98    Right(Right),
99}
100
101#[derive(Clone, Debug, Eq, PartialEq)]
102pub struct ResharpError {
103    /// The kind of error.
104    pub kind: ErrorKind,
105    /// The original pattern that the parser generated the error from. Every
106    /// span in an error is a valid range into this string.
107    pattern: String,
108    /// The span of this error.
109    pub span: Span,
110}
111
112impl std::fmt::Display for ResharpError {
113    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
114        write!(f, "{:?}: {:?}", self.kind, self.span)
115    }
116}
117impl std::error::Error for ResharpError {}
118
119type Result<T> = core::result::Result<T, ResharpError>;
120
121#[derive(Clone, Debug)]
122enum GroupState {
123    /// This state is pushed whenever an opening group is found.
124    Group {
125        /// The concatenation immediately preceding the opening group.
126        concat: Concat,
127        /// The group that has been opened. Its sub-AST is always empty.
128        group: ast::Group,
129        /// Whether this group has the `x` flag enabled or not.
130        ignore_whitespace: bool,
131    },
132    /// This state is pushed whenever a new alternation branch is found. If
133    /// an alternation branch is found and this state is at the top of the
134    /// stack, then this state should be modified to include the new
135    /// alternation.
136    Alternation(ast::Alternation),
137    Intersection(ast::Intersection),
138}
139
140#[derive(Clone, Debug)]
141enum ClassState {
142    /// This state is pushed whenever an opening bracket is found.
143    Open {
144        /// The union of class items immediately preceding this class.
145        union: regex_syntax::ast::ClassSetUnion,
146        /// The class that has been opened. Typically this just corresponds
147        /// to the `[`, but it can also include `[^` since `^` indicates
148        /// negation of the class.
149        set: regex_syntax::ast::ClassBracketed,
150    },
151    /// This state is pushed when a operator is seen. When popped, the stored
152    /// set becomes the left hand side of the operator.
153    Op {
154        /// The type of the operation, i.e., &&, -- or ~~.
155        kind: regex_syntax::ast::ClassSetBinaryOpKind,
156        /// The left-hand side of the operator.
157        lhs: regex_syntax::ast::ClassSet,
158    },
159}
160
161/// RE# syntax parser based on the regex-syntax crate.
162pub struct ResharpParser<'s> {
163    perl_classes: Vec<(bool, regex_syntax::ast::ClassPerlKind, NodeId)>,
164    pub translator: regex_syntax::hir::translate::Translator,
165    pub pattern: &'s str,
166    pos: Cell<Position>,
167    capture_index: Cell<u32>,
168    octal: bool,
169    empty_min_range: bool,
170    ignore_whitespace: Cell<bool>,
171    comments: RefCell<Vec<ast::Comment>>,
172    stack_group: RefCell<Vec<GroupState>>,
173    stack_class: RefCell<Vec<ClassState>>,
174    capture_names: RefCell<Vec<ast::CaptureName>>,
175    scratch: RefCell<String>,
176}
177
178fn specialize_err<T>(result: Result<T>, from: ast::ErrorKind, to: ast::ErrorKind) -> Result<T> {
179    if let Err(e) = result {
180        if e.kind == from {
181            Err(ResharpError {
182                kind: to,
183                pattern: e.pattern,
184                span: e.span,
185            })
186        } else {
187            Err(e)
188        }
189    } else {
190        result
191    }
192}
193
194fn is_capture_char(c: char, first: bool) -> bool {
195    if first {
196        c == '_' || c.is_alphabetic()
197    } else {
198        c == '_' || c == '.' || c == '[' || c == ']' || c.is_alphanumeric()
199    }
200}
201
202pub fn is_meta_character(c: char) -> bool {
203    match c {
204        '\\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' | '[' | ']' | '{' | '}' | '^' | '$'
205        | '#' | '&' | '-' | '~' => true,
206        _ => false,
207    }
208}
209
210pub fn is_escapeable_character(c: char) -> bool {
211    // Certainly escapeable if it's a meta character.
212    if is_meta_character(c) {
213        return true;
214    }
215    // Any character that isn't ASCII is definitely not escapeable. There's
216    // no real need to allow things like \☃ right?
217    if !c.is_ascii() {
218        return false;
219    }
220    // Otherwise, we basically say that everything is escapeable unless it's a
221    // letter or digit. Things like \3 are either octal (when enabled) or an
222    // error, and we should keep it that way. Otherwise, letters are reserved
223    // for adding new syntax in a backwards compatible way.
224    match c {
225        '0'..='9' | 'A'..='Z' | 'a'..='z' => false,
226        // While not currently supported, we keep these as not escapeable to
227        // give us some flexibility with respect to supporting the \< and
228        // \> word boundary assertions in the future. By rejecting them as
229        // escapeable, \< and \> will result in a parse error. Thus, we can
230        // turn them into something else in the future without it being a
231        // backwards incompatible change.
232        //
233        // OK, now we support \< and \>, and we need to retain them as *not*
234        // escapeable here since the escape sequence is significant.
235        '<' | '>' => false,
236        _ => true,
237    }
238}
239
240fn is_hex(c: char) -> bool {
241    ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F')
242}
243
244impl<'s> ResharpParser<'s> {
245    fn default_translator_builder() -> TranslatorBuilder {
246        let mut trb = TranslatorBuilder::new();
247        trb.unicode(true);
248        trb.utf8(false);
249        trb
250    }
251
252    pub fn new(pattern: &'s str) -> Self {
253        let trb = Self::default_translator_builder();
254        Self {
255            translator: trb.build(),
256            pattern,
257            perl_classes: vec![],
258            pos: Cell::new(Position::new(0, 0, 0)),
259            capture_index: Cell::new(0),
260            octal: false,
261            empty_min_range: false,
262            ignore_whitespace: Cell::new(false),
263            comments: RefCell::new(vec![]),
264            stack_group: RefCell::new(vec![]),
265            stack_class: RefCell::new(vec![]),
266            capture_names: RefCell::new(vec![]),
267            scratch: RefCell::new(String::new()),
268        }
269    }
270
271    /// Return a reference to the parser state.
272    fn parser(&'_ self) -> &'_ ResharpParser<'_> {
273        self
274    }
275
276    /// Return a reference to the pattern being parsed.
277    fn pattern(&self) -> &str {
278        self.pattern
279    }
280
281    /// Create a new error with the given span and error type.
282    fn error(&self, span: Span, kind: ast::ErrorKind) -> ResharpError {
283        ResharpError {
284            kind,
285            pattern: self.pattern().to_string(),
286            span,
287        }
288    }
289
290    fn unsupported_error(&self, _: regex_syntax::hir::Error) -> ResharpError {
291        let emptyspan = Span::splat(self.pos());
292        let inner = self.error(emptyspan, ast::ErrorKind::UnsupportedResharpRegex);
293        inner
294    }
295
296    /// Return the current offset of the parser.
297    ///
298    /// The offset starts at `0` from the beginning of the regular expression
299    /// pattern string.
300    fn offset(&self) -> usize {
301        self.parser().pos.get().offset
302    }
303
304    /// Return the current line number of the parser.
305    ///
306    /// The line number starts at `1`.
307    fn line(&self) -> usize {
308        self.parser().pos.get().line
309    }
310
311    /// Return the current column of the parser.
312    ///
313    /// The column number starts at `1` and is reset whenever a `\n` is seen.
314    fn column(&self) -> usize {
315        self.parser().pos.get().column
316    }
317
318    /// Return the next capturing index. Each subsequent call increments the
319    /// internal index.
320    ///
321    /// The span given should correspond to the location of the opening
322    /// parenthesis.
323    ///
324    /// If the capture limit is exceeded, then an error is returned.
325    fn next_capture_index(&self, span: Span) -> Result<u32> {
326        let current = self.parser().capture_index.get();
327        let i = current
328            .checked_add(1)
329            .ok_or_else(|| self.error(span, ast::ErrorKind::CaptureLimitExceeded))?;
330        self.parser().capture_index.set(i);
331        Ok(i)
332    }
333
334    fn add_capture_name(&self, cap: &ast::CaptureName) -> Result<()> {
335        let mut names = self.parser().capture_names.borrow_mut();
336        match names.binary_search_by_key(&cap.name.as_str(), |c| c.name.as_str()) {
337            Err(i) => {
338                names.insert(i, cap.clone());
339                Ok(())
340            }
341            Ok(i) => Err(self.error(
342                cap.span,
343                ast::ErrorKind::GroupNameDuplicate {
344                    original: names[i].span,
345                },
346            )),
347        }
348    }
349
350    fn ignore_whitespace(&self) -> bool {
351        self.parser().ignore_whitespace.get()
352    }
353
354    fn char(&self) -> char {
355        self.char_at(self.offset())
356    }
357
358    fn char_at(&self, i: usize) -> char {
359        self.pattern()[i..]
360            .chars()
361            .next()
362            .unwrap_or_else(|| panic!("expected char at offset {}", i))
363    }
364
365    fn bump(&self) -> bool {
366        if self.is_eof() {
367            return false;
368        }
369        let Position {
370            mut offset,
371            mut line,
372            mut column,
373        } = self.pos();
374        if self.char() == '\n' {
375            line = line.checked_add(1).unwrap();
376            column = 1;
377        } else {
378            column = column.checked_add(1).unwrap();
379        }
380        offset += self.char().len_utf8();
381        self.parser().pos.set(Position {
382            offset,
383            line,
384            column,
385        });
386        self.pattern()[self.offset()..].chars().next().is_some()
387    }
388
389    fn bump_if(&self, prefix: &str) -> bool {
390        if self.pattern()[self.offset()..].starts_with(prefix) {
391            for _ in 0..prefix.chars().count() {
392                self.bump();
393            }
394            true
395        } else {
396            false
397        }
398    }
399
400    fn is_lookaround_prefix(&self) -> Option<(bool, bool)> {
401        if self.bump_if("?=") {
402            return Some((true, true));
403        }
404        if self.bump_if("?!") {
405            return Some((true, false));
406        }
407        if self.bump_if("?<=") {
408            return Some((false, true));
409        }
410        if self.bump_if("?<!") {
411            return Some((false, false));
412        }
413        return None;
414        // self.bump_if("?=") || self.bump_if("?!") || self.bump_if("?<=") || self.bump_if("?<!")
415    }
416
417    fn bump_and_bump_space(&self) -> bool {
418        if !self.bump() {
419            return false;
420        }
421        self.bump_space();
422        !self.is_eof()
423    }
424
425    fn bump_space(&self) {
426        if !self.ignore_whitespace() {
427            return;
428        }
429        while !self.is_eof() {
430            if self.char().is_whitespace() {
431                self.bump();
432            } else if self.char() == '#' {
433                let start = self.pos();
434                let mut comment_text = String::new();
435                self.bump();
436                while !self.is_eof() {
437                    let c = self.char();
438                    self.bump();
439                    if c == '\n' {
440                        break;
441                    }
442                    comment_text.push(c);
443                }
444                let comment = ast::Comment {
445                    span: Span::new(start, self.pos()),
446                    comment: comment_text,
447                };
448                self.parser().comments.borrow_mut().push(comment);
449            } else {
450                break;
451            }
452        }
453    }
454
455    /// Peek at the next character in the input without advancing the parser.
456    ///
457    /// If the input has been exhausted, then this returns `None`.
458    fn peek(&self) -> Option<char> {
459        if self.is_eof() {
460            return None;
461        }
462        self.pattern()[self.offset() + self.char().len_utf8()..]
463            .chars()
464            .next()
465    }
466
467    /// Like peek, but will ignore spaces when the parser is in whitespace
468    /// insensitive mode.
469    fn peek_space(&self) -> Option<char> {
470        if !self.ignore_whitespace() {
471            return self.peek();
472        }
473        if self.is_eof() {
474            return None;
475        }
476        let mut start = self.offset() + self.char().len_utf8();
477        let mut in_comment = false;
478        for (i, c) in self.pattern()[start..].char_indices() {
479            if c.is_whitespace() {
480                continue;
481            } else if !in_comment && c == '#' {
482                in_comment = true;
483            } else if in_comment && c == '\n' {
484                in_comment = false;
485            } else {
486                start += i;
487                break;
488            }
489        }
490        self.pattern()[start..].chars().next()
491    }
492
493    /// Returns true if the next call to `bump` would return false.
494    fn is_eof(&self) -> bool {
495        self.offset() == self.pattern().len()
496    }
497
498    /// Return the current position of the parser, which includes the offset,
499    /// line and column.
500    fn pos(&self) -> Position {
501        self.parser().pos.get()
502    }
503
504    /// Create a span at the current position of the parser. Both the start
505    /// and end of the span are set.
506    fn span(&self) -> Span {
507        Span::splat(self.pos())
508    }
509
510    /// Create a span that covers the current character.
511    fn span_char(&self) -> Span {
512        let mut next = Position {
513            offset: self.offset().checked_add(self.char().len_utf8()).unwrap(),
514            line: self.line(),
515            column: self.column().checked_add(1).unwrap(),
516        };
517        if self.char() == '\n' {
518            next.line += 1;
519            next.column = 1;
520        }
521        Span::new(self.pos(), next)
522    }
523
524    /// Parse and push a single alternation on to the parser's internal stack.
525    /// If the top of the stack already has an alternation, then add to that
526    /// instead of pushing a new one.
527    ///
528    /// The concatenation given corresponds to a single alternation branch.
529    /// The concatenation returned starts the next branch and is empty.
530    ///
531    /// This assumes the parser is currently positioned at `|` and will advance
532    /// the parser to the character following `|`.
533    #[inline(never)]
534    fn push_alternate(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
535        assert_eq!(self.char(), '|');
536        concat.span.end = self.pos();
537        self.push_or_add_alternation(concat);
538        self.bump();
539        Ok(ast::Concat {
540            span: self.span(),
541            asts: vec![],
542        })
543    }
544
545    /// Pushes or adds the given branch of an alternation to the parser's
546    /// internal stack of state.
547    fn push_or_add_alternation(&self, concat: Concat) {
548        use self::GroupState::*;
549
550        let mut stack = self.parser().stack_group.borrow_mut();
551        if let Some(&mut Alternation(ref mut alts)) = stack.last_mut() {
552            alts.asts.push(concat.into_ast());
553            return;
554        }
555        stack.push(Alternation(ast::Alternation {
556            span: Span::new(concat.span.start, self.pos()),
557            asts: vec![concat.into_ast()],
558        }));
559    }
560
561    #[inline(never)]
562    fn push_intersect(&self, mut concat: Concat) -> Result<Concat> {
563        assert_eq!(self.char(), '&');
564        concat.span.end = self.pos();
565        self.push_or_add_intersect(concat);
566        self.bump();
567        Ok(Concat {
568            span: self.span(),
569            asts: vec![],
570        })
571    }
572
573    /// Pushes or adds the given branch of an alternation to the parser's
574    /// internal stack of state.
575    fn push_or_add_intersect(&self, concat: Concat) {
576        use self::GroupState::*;
577
578        let mut stack = self.parser().stack_group.borrow_mut();
579        if let Some(&mut Intersection(ref mut alts)) = stack.last_mut() {
580            alts.asts.push(concat.into_ast());
581            return;
582        }
583        stack.push(Intersection(ast::Intersection {
584            span: Span::new(concat.span.start, self.pos()),
585            asts: vec![concat.into_ast()],
586        }));
587    }
588
589    /// Parse and push a group AST (and its parent concatenation) on to the
590    /// parser's internal stack. Return a fresh concatenation corresponding
591    /// to the group's sub-AST.
592    ///
593    /// If a set of flags was found (with no group), then the concatenation
594    /// is returned with that set of flags added.
595    ///
596    /// This assumes that the parser is currently positioned on the opening
597    /// parenthesis. It advances the parser to the character at the start
598    /// of the sub-expression (or adjoining expression).
599    ///
600    /// If there was a problem parsing the start of the group, then an error
601    /// is returned.
602    #[inline(never)]
603    fn push_group(&self, mut concat: Concat) -> Result<Concat> {
604        assert_eq!(self.char(), '(');
605        match self.parse_group()? {
606            Either::Left(set) => {
607                let ignore = set.flags.flag_state(ast::Flag::IgnoreWhitespace);
608                if let Some(v) = ignore {
609                    self.parser().ignore_whitespace.set(v);
610                }
611
612                concat.asts.push(Ast::flags(set));
613                Ok(concat)
614            }
615            Either::Right(group) => {
616                let old_ignore_whitespace = self.ignore_whitespace();
617                let new_ignore_whitespace = group
618                    .flags()
619                    .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
620                    .unwrap_or(old_ignore_whitespace);
621                self.parser()
622                    .stack_group
623                    .borrow_mut()
624                    .push(GroupState::Group {
625                        concat,
626                        group,
627                        ignore_whitespace: old_ignore_whitespace,
628                    });
629                self.parser().ignore_whitespace.set(new_ignore_whitespace);
630                Ok(Concat {
631                    span: self.span(),
632                    asts: vec![],
633                })
634            }
635        }
636    }
637
638    #[inline(never)]
639    fn push_compl_group(&self, concat: Concat) -> Result<Concat> {
640        assert_eq!(self.char(), '~');
641        self.bump();
642        if self.is_eof() || self.char() != '(' {
643            return Err(self.error(self.span(), ast::ErrorKind::ComplementGroupExpected));
644        }
645        let open_span = self.span_char();
646        self.bump();
647        let group = ast::Group {
648            span: open_span,
649            kind: ast::GroupKind::Complement,
650            ast: Box::new(Ast::empty(self.span())),
651        };
652
653        let old_ignore_whitespace = self.ignore_whitespace();
654        let new_ignore_whitespace = group
655            .flags()
656            .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
657            .unwrap_or(old_ignore_whitespace);
658        self.parser()
659            .stack_group
660            .borrow_mut()
661            .push(GroupState::Group {
662                concat,
663                group,
664                ignore_whitespace: old_ignore_whitespace,
665            });
666        self.parser().ignore_whitespace.set(new_ignore_whitespace);
667        Ok(Concat {
668            span: self.span(),
669            asts: vec![],
670        })
671    }
672
673    /// Pop a group AST from the parser's internal stack and set the group's
674    /// AST to the given concatenation. Return the concatenation containing
675    /// the group.
676    ///
677    /// This assumes that the parser is currently positioned on the closing
678    /// parenthesis and advances the parser to the character following the `)`.
679    ///
680    /// If no such group could be popped, then an unopened group error is
681    /// returned.
682    #[inline(never)]
683    fn pop_group(&self, mut group_concat: Concat) -> Result<Concat> {
684        use self::GroupState::*;
685        assert_eq!(self.char(), ')');
686        let mut stack = self.parser().stack_group.borrow_mut();
687        let topstack = stack.pop();
688
689        let (mut prior_concat, mut group, ignore_whitespace, alt) = match topstack {
690            Some(Group {
691                concat,
692                group,
693                ignore_whitespace,
694            }) => (concat, group, ignore_whitespace, None),
695            Some(Alternation(alt)) => match stack.pop() {
696                Some(Group {
697                    concat,
698                    group,
699                    ignore_whitespace,
700                }) => (
701                    concat,
702                    group,
703                    ignore_whitespace,
704                    Some(Either::Left::<ast::Alternation, ast::Intersection>(alt)),
705                ),
706                None | Some(Alternation(_)) | Some(Intersection(_)) => {
707                    return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
708                }
709            },
710            Some(Intersection(int)) => match stack.pop() {
711                Some(Group {
712                    concat,
713                    group,
714                    ignore_whitespace,
715                }) => (
716                    concat,
717                    group,
718                    ignore_whitespace,
719                    Some(Either::Right::<ast::Alternation, ast::Intersection>(int)),
720                ),
721                None | Some(Alternation(_)) | Some(Intersection(_)) => {
722                    return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
723                }
724            },
725
726            None => {
727                return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
728            }
729        };
730        self.parser().ignore_whitespace.set(ignore_whitespace);
731        group_concat.span.end = self.pos();
732        self.bump();
733        group.span.end = self.pos();
734        match alt {
735            Some(Either::Left(mut alt)) => {
736                alt.span.end = group_concat.span.end;
737                alt.asts.push(group_concat.into_ast());
738                group.ast = Box::new(alt.into_ast());
739            }
740            Some(Either::Right(mut int)) => {
741                int.span.end = group_concat.span.end;
742                int.asts.push(group_concat.into_ast());
743                group.ast = Box::new(int.into_ast());
744            }
745            None => {
746                group.ast = Box::new(group_concat.into_ast());
747            }
748        }
749
750        if group.kind == GroupKind::Complement {
751            let complement = ast::Complement {
752                span: self.span(),
753                ast: group.ast,
754            };
755            prior_concat.asts.push(Ast::complement(complement));
756        }
757        // ignore groups for now
758        else {
759            // dbg!(&group);
760            prior_concat.asts.push(Ast::group(group));
761        }
762        Ok(prior_concat)
763    }
764
765    /// Pop the last state from the parser's internal stack, if it exists, and
766    /// add the given concatenation to it. There either must be no state or a
767    /// single alternation item on the stack. Any other scenario produces an
768    /// error.
769    ///
770    /// This assumes that the parser has advanced to the end.
771    #[inline(never)]
772    fn pop_group_end(&self, mut concat: ast::Concat) -> Result<Ast> {
773        concat.span.end = self.pos();
774        // dbg!(&concat);
775        let mut stack = self.parser().stack_group.borrow_mut();
776        let ast = match stack.pop() {
777            None => Ok(concat.into_ast()),
778            Some(GroupState::Alternation(mut alt)) => {
779                alt.span.end = self.pos();
780                alt.asts.push(concat.into_ast());
781                Ok(Ast::alternation(alt))
782            }
783            Some(GroupState::Intersection(mut int)) => {
784                int.span.end = self.pos();
785                int.asts.push(concat.into_ast());
786
787                // dbg!("end intersection2");
788                Ok(Ast::intersection(int))
789            }
790            Some(GroupState::Group { group, .. }) => {
791                return Err(self.error(group.span, ast::ErrorKind::GroupUnclosed));
792            }
793        };
794        // If we try to pop again, there should be nothing.
795        match stack.pop() {
796            None => ast,
797            Some(GroupState::Alternation(_)) => {
798                // This unreachable is unfortunate. This case can't happen
799                // because the only way we can be here is if there were two
800                // `GroupState::Alternation`s adjacent in the parser's stack,
801                // which we guarantee to never happen because we never push a
802                // `GroupState::Alternation` if one is already at the top of
803                // the stack.
804                unreachable!()
805            }
806            Some(GroupState::Intersection(_)) => {
807                unreachable!()
808            }
809            Some(GroupState::Group { group, .. }) => {
810                Err(self.error(group.span, ast::ErrorKind::GroupUnclosed))
811            }
812        }
813    }
814
815    /// Parse the opening of a character class and push the current class
816    /// parsing context onto the parser's stack. This assumes that the parser
817    /// is positioned at an opening `[`. The given union should correspond to
818    /// the union of set items built up before seeing the `[`.
819    ///
820    /// If there was a problem parsing the opening of the class, then an error
821    /// is returned. Otherwise, a new union of set items for the class is
822    /// returned (which may be populated with either a `]` or a `-`).
823    #[inline(never)]
824    fn push_class_open(
825        &self,
826        parent_union: regex_syntax::ast::ClassSetUnion,
827    ) -> Result<regex_syntax::ast::ClassSetUnion> {
828        assert_eq!(self.char(), '[');
829
830        let (nested_set, nested_union) = self.parse_set_class_open()?;
831        self.parser()
832            .stack_class
833            .borrow_mut()
834            .push(ClassState::Open {
835                union: parent_union,
836                set: nested_set,
837            });
838        Ok(nested_union)
839    }
840
841    /// Parse the end of a character class set and pop the character class
842    /// parser stack. The union given corresponds to the last union built
843    /// before seeing the closing `]`. The union returned corresponds to the
844    /// parent character class set with the nested class added to it.
845    ///
846    /// This assumes that the parser is positioned at a `]` and will advance
847    /// the parser to the byte immediately following the `]`.
848    ///
849    /// If the stack is empty after popping, then this returns the final
850    /// "top-level" character class AST (where a "top-level" character class
851    /// is one that is not nested inside any other character class).
852    ///
853    /// If there is no corresponding opening bracket on the parser's stack,
854    /// then an error is returned.
855    #[inline(never)]
856    fn pop_class(
857        &self,
858        nested_union: regex_syntax::ast::ClassSetUnion,
859    ) -> Result<Either<regex_syntax::ast::ClassSetUnion, regex_syntax::ast::ClassBracketed>> {
860        assert_eq!(self.char(), ']');
861
862        let item = regex_syntax::ast::ClassSet::Item(nested_union.into_item());
863        let prevset = self.pop_class_op(item);
864        let mut stack = self.parser().stack_class.borrow_mut();
865        match stack.pop() {
866            None => {
867                // We can never observe an empty stack:
868                //
869                // 1) We are guaranteed to start with a non-empty stack since
870                //    the character class parser is only initiated when it sees
871                //    a `[`.
872                // 2) If we ever observe an empty stack while popping after
873                //    seeing a `]`, then we signal the character class parser
874                //    to terminate.
875                panic!("unexpected empty character class stack")
876            }
877            Some(ClassState::Op { .. }) => {
878                // This panic is unfortunate, but this case is impossible
879                // since we already popped the Op state if one exists above.
880                // Namely, every push to the class parser stack is guarded by
881                // whether an existing Op is already on the top of the stack.
882                // If it is, the existing Op is modified. That is, the stack
883                // can never have consecutive Op states.
884                panic!("unexpected ClassState::Op")
885            }
886            Some(ClassState::Open { mut union, mut set }) => {
887                self.bump();
888                set.span.end = self.pos();
889                set.kind = prevset;
890                if stack.is_empty() {
891                    Ok(Either::Right(set))
892                } else {
893                    union.push(regex_syntax::ast::ClassSetItem::Bracketed(Box::new(set)));
894                    Ok(Either::Left(union))
895                }
896            }
897        }
898    }
899
900    /// Return an "unclosed class" error whose span points to the most
901    /// recently opened class.
902    ///
903    /// This should only be called while parsing a character class.
904    #[inline(never)]
905    fn unclosed_class_error(&self) -> ResharpError {
906        for state in self.parser().stack_class.borrow().iter().rev() {
907            if let ClassState::Open { ref set, .. } = *state {
908                return self.error(set.span, ast::ErrorKind::ClassUnclosed);
909            }
910        }
911        // We are guaranteed to have a non-empty stack with at least
912        // one open bracket, so we should never get here.
913        panic!("no open character class found")
914    }
915
916    /// Push the current set of class items on to the class parser's stack as
917    /// the left hand side of the given operator.
918    ///
919    /// A fresh set union is returned, which should be used to build the right
920    /// hand side of this operator.
921    #[inline(never)]
922    fn push_class_op(
923        &self,
924        next_kind: regex_syntax::ast::ClassSetBinaryOpKind,
925        next_union: regex_syntax::ast::ClassSetUnion,
926    ) -> regex_syntax::ast::ClassSetUnion {
927        let item = regex_syntax::ast::ClassSet::Item(next_union.into_item());
928        let new_lhs = self.pop_class_op(item);
929        self.parser().stack_class.borrow_mut().push(ClassState::Op {
930            kind: next_kind,
931            lhs: new_lhs,
932        });
933        regex_syntax::ast::ClassSetUnion {
934            span: self.span(),
935            items: vec![],
936        }
937    }
938
939    /// Pop a character class set from the character class parser stack. If the
940    /// top of the stack is just an item (not an operation), then return the
941    /// given set unchanged. If the top of the stack is an operation, then the
942    /// given set will be used as the rhs of the operation on the top of the
943    /// stack. In that case, the binary operation is returned as a set.
944    #[inline(never)]
945    fn pop_class_op(&self, rhs: regex_syntax::ast::ClassSet) -> regex_syntax::ast::ClassSet {
946        let mut stack = self.parser().stack_class.borrow_mut();
947        let (kind, lhs) = match stack.pop() {
948            Some(ClassState::Op { kind, lhs }) => (kind, lhs),
949            Some(state @ ClassState::Open { .. }) => {
950                stack.push(state);
951                return rhs;
952            }
953            None => unreachable!(),
954        };
955        let span = Span::new(lhs.span().start, rhs.span().end);
956        regex_syntax::ast::ClassSet::BinaryOp(regex_syntax::ast::ClassSetBinaryOp {
957            span,
958            kind,
959            lhs: Box::new(lhs),
960            rhs: Box::new(rhs),
961        })
962    }
963
964    fn hir_to_node_id(&self, hir: &hir::Hir, tb: &mut TB<'s>) -> Result<NodeId> {
965        match hir.kind() {
966            hir::HirKind::Empty => Ok(NodeId::EPS),
967            hir::HirKind::Literal(l) => {
968                if l.0.len() == 1 {
969                    let node = tb.mk_u8(l.0[0]);
970                    Ok(node)
971                } else {
972                    let ws: Vec<_> = l.0.iter().map(|l| tb.mk_u8(*l)).collect();
973                    let conc = tb.mk_concats(ws.iter().copied());
974                    Ok(conc)
975                }
976            }
977            hir::HirKind::Class(class) => {
978                // dbg!(class);
979                match class {
980                    hir::Class::Unicode(class_unicode) => {
981                        let ranges = class_unicode.ranges();
982                        let mut s1 = NodeId::BOT;
983                        let mut s2 = NodeId::BOT;
984                        let mut s3 = NodeId::BOT;
985                        let mut s4 = NodeId::BOT;
986                        for range in ranges {
987                            for seq in Utf8Sequences::new(range.start(), range.end()) {
988                                // cut len > 2 out for now
989                                if seq.len() > 2 {
990                                    continue;
991                                }
992
993                                let v: Vec<_> = seq
994                                    .as_slice()
995                                    .iter()
996                                    .map(|s|
997                                                            // self.mk_byte_set(&byteset_from_range(s.start, s.end))
998                                                            (s.start, s.end))
999                                    .collect();
1000                                if v.len() == 1 {
1001                                    let (start, end) = v[0];
1002                                    let node = tb.mk_range_u8(start, end);
1003                                    s1 = tb.mk_union(s1, node);
1004                                } else if v.len() == 2 {
1005                                    let node1 = tb.mk_range_u8(v[0].0, v[0].1);
1006                                    let node2 = tb.mk_range_u8(v[1].0, v[1].1);
1007                                    let conc = tb.mk_concat(node1, node2);
1008                                    s2 = tb.mk_union(s2, conc);
1009                                } else if v.len() == 3 {
1010                                    let node1 = tb.mk_range_u8(v[0].0, v[0].1);
1011                                    let node2 = tb.mk_range_u8(v[1].0, v[1].1);
1012                                    let node3 = tb.mk_range_u8(v[2].0, v[2].1);
1013                                    let conc2 = tb.mk_concat(node2, node3);
1014                                    let conc1 = tb.mk_concat(node1, conc2);
1015                                    s3 = tb.mk_union(s3, conc1);
1016                                } else {
1017                                    let mut conc = NodeId::EPS;
1018                                    for i in (0..4).rev() {
1019                                        let node = tb.mk_range_u8(v[i].0, v[i].1);
1020                                        conc = tb.mk_concat(conc, node);
1021                                    }
1022                                    s4 = tb.mk_union(s4, conc);
1023                                }
1024                            }
1025                        }
1026                        let merged = tb.mk_union(s2, s1);
1027                        let merged = tb.mk_union(s3, merged);
1028                        let merged = tb.mk_union(s4, merged);
1029                        Ok(merged)
1030                    }
1031                    hir::Class::Bytes(class_bytes) => {
1032                        let ranges = class_bytes.ranges();
1033                        let mut result = NodeId::BOT;
1034                        for range in ranges {
1035                            let start = range.start();
1036                            let end = range.end();
1037                            let node = tb.mk_range_u8(start, end);
1038                            result = tb.mk_union(result, node);
1039                        }
1040                        Ok(result)
1041                    }
1042                }
1043            }
1044            hir::HirKind::Look(_) => todo!(),
1045            hir::HirKind::Repetition(_) => todo!(),
1046            hir::HirKind::Capture(_) => todo!(),
1047            hir::HirKind::Concat(body) => {
1048                let mut result = NodeId::EPS;
1049                for child in body {
1050                    let node = self.hir_to_node_id(child, tb)?;
1051                    result = tb.mk_concat(result, node);
1052                }
1053                Ok(result)
1054            }
1055            hir::HirKind::Alternation(_) => todo!(),
1056        }
1057    }
1058
1059    fn translate_ast_to_hir(
1060        &mut self,
1061        orig_ast: &regex_syntax::ast::Ast,
1062        tb: &mut TB<'s>,
1063    ) -> Result<NodeId> {
1064        // let unboxed = class_perl.deref();
1065        // let orig_ast = regex_syntax::ast::Ast::class_perl(unboxed.clone());
1066        match self.translator.translate("", orig_ast) {
1067            Err(_) => return Err(self.error(self.span(), ast::ErrorKind::UnicodeClassInvalid)),
1068            Ok(hir) => {
1069                let mapped = self.hir_to_node_id(&hir, tb);
1070                mapped
1071            }
1072        }
1073    }
1074
1075    fn translator_to_node_id(
1076        &mut self,
1077        orig_ast: &regex_syntax::ast::Ast,
1078        translator: &mut Option<Translator>,
1079        tb: &mut TB<'s>,
1080    ) -> Result<NodeId> {
1081        match translator {
1082            Some(tr) => {
1083                let hir = tr
1084                    .translate("", &orig_ast)
1085                    .map_err(|e| self.unsupported_error(e))?;
1086                self.hir_to_node_id(&hir, tb)
1087            }
1088            None => self.translate_ast_to_hir(&orig_ast, tb),
1089        }
1090    }
1091
1092    fn get_class(
1093        &mut self,
1094        negated: bool,
1095        kind: regex_syntax::ast::ClassPerlKind,
1096        tb: &mut TB<'s>,
1097    ) -> Result<NodeId> {
1098        // find in array
1099        let w = self
1100            .perl_classes
1101            .iter()
1102            .find(|(c_neg, c_kind, _)| *c_kind == kind && *c_neg == negated);
1103        match w {
1104            Some((_, _, value)) => return Ok(*value),
1105            None => {
1106                let tmp = regex_syntax::ast::ClassPerl {
1107                    span: Span::splat(Position::new(0, 0, 0)),
1108                    negated,
1109                    kind: kind.clone(),
1110                };
1111                let word_ast = regex_syntax::ast::Ast::class_perl(tmp);
1112                let translated = self.translate_ast_to_hir(&word_ast, tb)?;
1113                self.perl_classes.push((negated, kind, translated));
1114                Ok(translated)
1115            }
1116        }
1117    }
1118
1119    fn word_char_kind(ast: &Ast, left: bool) -> WordCharKind {
1120        use WordCharKind::*;
1121        match ast {
1122            Ast::Literal(lit) => {
1123                if is_word_byte(lit.c as u8) {
1124                    Word
1125                } else {
1126                    NonWord
1127                }
1128            }
1129            Ast::ClassPerl(c) => match (&c.kind, c.negated) {
1130                (&regex_syntax::ast::ClassPerlKind::Word, false) => Word,
1131                (&regex_syntax::ast::ClassPerlKind::Word, true) => NonWord,
1132                (&regex_syntax::ast::ClassPerlKind::Space, false) => NonWord,
1133                (&regex_syntax::ast::ClassPerlKind::Digit, false) => Word,
1134                _ => Unknown,
1135            },
1136            Ast::Dot(_) | Ast::Top(_) => Unknown,
1137            Ast::Group(g) => Self::word_char_kind(&g.ast, left),
1138            Ast::Concat(c) if !c.asts.is_empty() => {
1139                Self::word_char_kind(&c.asts[if left { c.asts.len() - 1 } else { 0 }], left)
1140            }
1141            Ast::Repetition(r) => {
1142                let inner = Self::word_char_kind(&r.ast, left);
1143                let nullable = matches!(
1144                    &r.op.kind,
1145                    ast::RepetitionKind::ZeroOrMore
1146                        | ast::RepetitionKind::ZeroOrOne
1147                        | ast::RepetitionKind::Range(ast::RepetitionRange::Bounded(0, _))
1148                );
1149                if nullable {
1150                    match inner {
1151                        Word => MaybeWord,
1152                        NonWord => MaybeNonWord,
1153                        _ => Unknown,
1154                    }
1155                } else {
1156                    inner
1157                }
1158            }
1159            Ast::Lookaround(la) => Self::word_char_kind(&la.ast, left),
1160            _ => Unknown,
1161        }
1162    }
1163
1164    fn concat_neighbor_kind(asts: &[Ast], idx: usize, dir: isize) -> WordCharKind {
1165        use WordCharKind::*;
1166        let next = idx as isize + dir;
1167        if next < 0 || next >= asts.len() as isize {
1168            return Edge;
1169        }
1170        let kind = Self::word_char_kind(&asts[next as usize], dir < 0);
1171        match kind {
1172            MaybeWord => match Self::concat_neighbor_kind(asts, next as usize, dir) {
1173                Word => Word,
1174                _ => Unknown,
1175            },
1176            MaybeNonWord => match Self::concat_neighbor_kind(asts, next as usize, dir) {
1177                NonWord => NonWord,
1178                _ => Unknown,
1179            },
1180            other => other,
1181        }
1182    }
1183
1184    fn rewrite_word_boundary_in_concat(
1185        &mut self,
1186        asts: &[Ast],
1187        idx: usize,
1188        translator: &mut Option<Translator>,
1189        tb: &mut TB<'s>,
1190    ) -> Result<(NodeId, usize)> {
1191        use WordCharKind::*;
1192        let left = Self::concat_neighbor_kind(asts, idx, -1);
1193        let right = Self::concat_neighbor_kind(asts, idx, 1);
1194        let word_id = self.get_class(false, regex_syntax::ast::ClassPerlKind::Word, tb)?;
1195        let not_word_id = self.get_class(true, regex_syntax::ast::ClassPerlKind::Word, tb)?;
1196
1197        match (left, right) {
1198            (NonWord, Word) | (Word, NonWord) => Ok((NodeId::EPS, idx + 1)),
1199            (Word, _) => {
1200                let set = tb.mk_union(NodeId::END, not_word_id);
1201                let tail = tb.mk_concat(set, NodeId::TS);
1202                self.merge_boundary_with_following_lookaheads(asts, idx, tail, translator, tb)
1203            }
1204            (NonWord, _) => {
1205                let set = tb.mk_union(NodeId::END, word_id);
1206                let tail = tb.mk_concat(set, NodeId::TS);
1207                self.merge_boundary_with_following_lookaheads(asts, idx, tail, translator, tb)
1208            }
1209            (_, Word) => {
1210                let body = tb.mk_union(NodeId::BEGIN, not_word_id);
1211                Ok((tb.mk_lookbehind(body, NodeId::MISSING), idx + 1))
1212            }
1213            (_, NonWord) => {
1214                let body = tb.mk_union(NodeId::BEGIN, word_id);
1215                Ok((tb.mk_lookbehind(body, NodeId::MISSING), idx + 1))
1216            }
1217            _ => Ok((self.make_full_word_boundary(tb)?, idx + 1)),
1218        }
1219    }
1220
1221    fn merge_boundary_with_following_lookaheads(
1222        &mut self,
1223        asts: &[Ast],
1224        wb_idx: usize,
1225        boundary_tail: NodeId,
1226        translator: &mut Option<Translator>,
1227        tb: &mut TB<'s>,
1228    ) -> Result<(NodeId, usize)> {
1229        let mut next = wb_idx + 1;
1230        let mut la_bodies = vec![boundary_tail];
1231        while next < asts.len() {
1232            match &asts[next] {
1233                Ast::Lookaround(la) if la.kind == ast::LookaroundKind::PositiveLookahead => {
1234                    let body = self.ast_to_node_id(&la.ast, translator, tb)?;
1235                    la_bodies.push(tb.mk_concat(body, NodeId::TS));
1236                    next += 1;
1237                }
1238                _ => break,
1239            }
1240        }
1241        let merged = tb.mk_inters(la_bodies.into_iter());
1242        Ok((tb.mk_lookahead(merged, NodeId::MISSING, 0), next))
1243    }
1244
1245    fn make_full_word_boundary(&mut self, tb: &mut TB<'s>) -> Result<NodeId> {
1246        let w = self.get_class(false, regex_syntax::ast::ClassPerlKind::Word, tb)?;
1247        let nw = self.get_class(true, regex_syntax::ast::ClassPerlKind::Word, tb)?;
1248        let lb_w = tb.mk_lookbehind(w, NodeId::MISSING);
1249        let la_nw = tb.mk_lookahead(nw, NodeId::MISSING, 0);
1250        let c1 = tb.mk_concat(lb_w, la_nw);
1251        let lb_nw = tb.mk_lookbehind(nw, NodeId::MISSING);
1252        let la_w = tb.mk_lookahead(w, NodeId::MISSING, 0);
1253        let c2 = tb.mk_concat(lb_nw, la_w);
1254        Ok(tb.mk_union(c1, c2))
1255    }
1256
1257    fn ast_to_node_id(
1258        &mut self,
1259        ast: &Ast,
1260        translator: &mut Option<Translator>,
1261        tb: &mut TB<'s>,
1262    ) -> Result<NodeId> {
1263        match ast {
1264            Ast::Empty(_) => Ok(NodeId::EPS),
1265            Ast::Flags(f) => {
1266                let mut translator_builder = Self::default_translator_builder();
1267                if let Some(state) = f.flags.flag_state(ast::Flag::CaseInsensitive) {
1268                    translator_builder.case_insensitive(state);
1269                }
1270                if let Some(state) = f.flags.flag_state(ast::Flag::Unicode) {
1271                    translator_builder.unicode(state);
1272                }
1273                let concat_translator = Some(translator_builder.build());
1274                *translator = concat_translator;
1275                Ok(NodeId::EPS)
1276            }
1277            Ast::Literal(l) => {
1278                let ast_lit = regex_syntax::ast::Ast::literal(*l.to_owned());
1279                self.translator_to_node_id(&ast_lit, translator, tb)
1280            }
1281            Ast::Top(_) => Ok(NodeId::TOP),
1282            Ast::Dot(_) => {
1283                let hirv = hir::Hir::dot(hir::Dot::AnyByteExceptLF);
1284                self.hir_to_node_id(&hirv, tb)
1285            }
1286            Ast::Assertion(a) => match &a.kind {
1287                ast::AssertionKind::StartText => Ok(NodeId::BEGIN),
1288                ast::AssertionKind::EndText => Ok(NodeId::END),
1289                ast::AssertionKind::WordBoundary => {
1290                    let word_id =
1291                        self.get_class(false, regex_syntax::ast::ClassPerlKind::Word, tb)?;
1292                    let not_word_id =
1293                        self.get_class(true, regex_syntax::ast::ClassPerlKind::Word, tb)?;
1294                    // case1 : (?<=word)(?=not_word)
1295                    let case1_1 = tb.mk_lookbehind(word_id, NodeId::MISSING);
1296                    let case1_2 = tb.mk_lookahead(not_word_id, NodeId::MISSING, 0);
1297                    let case1 = tb.mk_concat(case1_1, case1_2);
1298                    // case2 : (?<=not_word)(?=word)
1299                    let case2_1 = tb.mk_lookbehind(not_word_id, NodeId::MISSING);
1300                    let case2_2 = tb.mk_lookahead(word_id, NodeId::MISSING, 0);
1301                    let case2 = tb.mk_concat(case2_1, case2_2);
1302                    Ok(tb.mk_union(case1, case2))
1303                }
1304                ast::AssertionKind::NotWordBoundary => {
1305                    return Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))
1306                }
1307                ast::AssertionKind::StartLine => {
1308                    let left = NodeId::BEGIN;
1309                    let right = tb.mk_u8('\n' as u8);
1310                    let union = tb.mk_union(left, right);
1311                    Ok(tb.mk_lookbehind(union, NodeId::MISSING))
1312                }
1313                ast::AssertionKind::EndLine => {
1314                    let left = NodeId::END;
1315                    let right = tb.mk_u8('\n' as u8);
1316                    let union = tb.mk_union(left, right);
1317                    Ok(tb.mk_lookahead(union, NodeId::MISSING, 0))
1318                }
1319                ast::AssertionKind::WordBoundaryStart => todo!(),
1320                ast::AssertionKind::WordBoundaryEnd => todo!(),
1321                ast::AssertionKind::WordBoundaryStartAngle => todo!(),
1322                ast::AssertionKind::WordBoundaryEndAngle => Ok(tb.mk_string(">")),
1323                ast::AssertionKind::WordBoundaryStartHalf => todo!(),
1324                ast::AssertionKind::WordBoundaryEndHalf => todo!(),
1325            },
1326            Ast::ClassUnicode(c) => {
1327                let tmp = regex_syntax::ast::ClassUnicode {
1328                    span: c.span,
1329                    negated: c.negated,
1330                    kind: c.kind.clone(),
1331                };
1332                if !c.negated {
1333                    match &c.kind {
1334                        regex_syntax::ast::ClassUnicodeKind::Named(s) => match s.as_str() {
1335                            // \p{ascii} for ascii, \p{ascii}&\p{Letter} => [A-Za-z]
1336                            "ascii" => return Ok(tb.mk_range_u8(0, 127)),
1337                            // restricts matches to valid utf8, \p{utf8}*&~(a) => non a, but valid utf8
1338                            "utf8" => {
1339                                let ascii = tb.mk_range_u8(0, 127);
1340                                let beta = tb.mk_range_u8(128, 0xBF);
1341                                let c0 = tb.mk_range_u8(0xC0, 0xDF);
1342                                let c0s = tb.mk_concats([c0, beta].into_iter());
1343                                let e0 = tb.mk_range_u8(0xE0, 0xEF);
1344                                let e0s = tb.mk_concats([e0, beta, beta].into_iter());
1345                                let f0 = tb.mk_range_u8(0xF0, 0xF7);
1346                                let f0s = tb.mk_concats([f0, beta, beta, beta].into_iter());
1347                                let merged = tb.mk_unions([ascii, c0s, e0s, f0s].into_iter());
1348                                return Ok(tb.mk_star(merged));
1349                            }
1350                            "hex" => {
1351                                let nums = tb.mk_range_u8(b'0', b'9');
1352                                let lets = tb.mk_range_u8(b'a', b'f');
1353                                let lets2 = tb.mk_range_u8(b'A', b'F');
1354                                let merged = tb.mk_unions([nums, lets, lets2].into_iter());
1355                                return Ok(merged);
1356                            }
1357                            _ => {}
1358                        },
1359                        _ => {}
1360                    };
1361                }
1362
1363                let orig_ast = regex_syntax::ast::Ast::class_unicode(tmp);
1364                self.translator_to_node_id(&orig_ast, translator, tb)
1365            }
1366            Ast::ClassPerl(c) => {
1367                let tmp = regex_syntax::ast::ClassPerl {
1368                    span: c.span,
1369                    negated: c.negated,
1370                    kind: c.kind.clone(),
1371                };
1372                let orig_ast = regex_syntax::ast::Ast::class_perl(tmp);
1373                self.translator_to_node_id(&orig_ast, translator, tb)
1374            }
1375            Ast::ClassBracketed(c) => match &c.kind {
1376                regex_syntax::ast::ClassSet::Item(_) => {
1377                    let tmp = regex_syntax::ast::ClassBracketed {
1378                        span: c.span,
1379                        negated: c.negated,
1380                        kind: c.kind.clone(),
1381                    };
1382                    let orig_ast = regex_syntax::ast::Ast::class_bracketed(tmp);
1383                    self.translator_to_node_id(&orig_ast, translator, tb)
1384                }
1385                regex_syntax::ast::ClassSet::BinaryOp(_) => todo!(),
1386            },
1387            Ast::Repetition(r) => {
1388                let body = self.ast_to_node_id(&r.ast, translator, tb);
1389                match body {
1390                    Ok(body) => match &r.op.kind {
1391                        ast::RepetitionKind::ZeroOrOne => Ok(tb.mk_opt(body)),
1392                        ast::RepetitionKind::ZeroOrMore => Ok(tb.mk_star(body)),
1393                        ast::RepetitionKind::OneOrMore => Ok(tb.mk_plus(body)),
1394                        ast::RepetitionKind::Range(r) => match r {
1395                            ast::RepetitionRange::Exactly(n) => Ok(tb.mk_repeat(body, *n, *n)),
1396                            ast::RepetitionRange::AtLeast(n) => {
1397                                let rep = tb.mk_repeat(body, *n, *n);
1398                                let st = tb.mk_star(body);
1399                                Ok(tb.mk_concat(rep, st))
1400                            }
1401
1402                            ast::RepetitionRange::Bounded(n, m) => Ok(tb.mk_repeat(body, *n, *m)),
1403                        },
1404                    },
1405                    Err(_) => body,
1406                }
1407            }
1408            Ast::Lookaround(g) => {
1409                let body = self.ast_to_node_id(&g.ast, translator, tb)?;
1410                match g.kind {
1411                    ast::LookaroundKind::PositiveLookahead => {
1412                        Ok(tb.mk_lookahead(body, NodeId::MISSING, 0))
1413                    }
1414                    ast::LookaroundKind::PositiveLookbehind => {
1415                        Ok(tb.mk_lookbehind(body, NodeId::MISSING))
1416                    }
1417                    ast::LookaroundKind::NegativeLookahead => Ok(tb.mk_neg_lookahead(body, 0)),
1418                    ast::LookaroundKind::NegativeLookbehind => Ok(tb.mk_neg_lookbehind(body)),
1419                }
1420            }
1421            Ast::Group(g) => {
1422                let child = self.ast_to_node_id(&g.ast, translator, tb);
1423                child
1424            }
1425            Ast::Alternation(a) => {
1426                let mut children = vec![];
1427                for ast in &a.asts {
1428                    match self.ast_to_node_id(ast, translator, tb) {
1429                        Ok(node_id) => children.push(node_id),
1430                        Err(err) => return Err(err),
1431                    }
1432                }
1433                Ok(tb.mk_unions(children.iter().copied()))
1434            }
1435            Ast::Concat(c) => {
1436                let mut concat_translator: Option<Translator> = None;
1437                let mut children = vec![];
1438                let mut i = 0;
1439                while i < c.asts.len() {
1440                    let ast = &c.asts[i];
1441                    match ast {
1442                        Ast::Flags(f) => {
1443                            let mut translator_builder = Self::default_translator_builder();
1444                            translator_builder.utf8(false);
1445                            if let Some(state) = f.flags.flag_state(ast::Flag::CaseInsensitive) {
1446                                translator_builder.case_insensitive(state);
1447                            }
1448                            if let Some(state) = f.flags.flag_state(ast::Flag::Unicode) {
1449                                translator_builder.unicode(state);
1450                            }
1451                            concat_translator = Some(translator_builder.build());
1452                            i += 1;
1453                            continue;
1454                        }
1455                        Ast::Assertion(a) if a.kind == ast::AssertionKind::WordBoundary => {
1456                            let node =
1457                                self.rewrite_word_boundary_in_concat(&c.asts, i, translator, tb)?;
1458                            children.push(node.0);
1459                            i = node.1; // skip consumed lookaheads
1460                            continue;
1461                        }
1462                        _ => {}
1463                    }
1464                    match concat_translator {
1465                        Some(_) => match self.ast_to_node_id(ast, &mut concat_translator, tb) {
1466                            Ok(node_id) => children.push(node_id),
1467                            Err(err) => return Err(err),
1468                        },
1469                        None => match self.ast_to_node_id(ast, translator, tb) {
1470                            Ok(node_id) => children.push(node_id),
1471                            Err(err) => return Err(err),
1472                        },
1473                    }
1474                    i += 1;
1475                }
1476                Ok(tb.mk_concats(children.iter().cloned()))
1477            }
1478            Ast::Intersection(intersection) => {
1479                let mut children = vec![];
1480                for ast in &intersection.asts {
1481                    match self.ast_to_node_id(ast, translator, tb) {
1482                        Ok(node_id) => children.push(node_id),
1483                        Err(err) => return Err(err),
1484                    }
1485                }
1486                Ok(tb.mk_inters(children.into_iter()))
1487            }
1488            Ast::Complement(complement) => {
1489                let body = self.ast_to_node_id(&complement.ast, translator, tb);
1490                body.map(|x| tb.mk_compl(x))
1491            }
1492        }
1493    }
1494
1495    /// Parse the regular expression and return an abstract syntax tree with
1496    /// all of the comments found in the pattern.
1497    fn parse(&mut self, tb: &mut TB<'s>) -> Result<NodeId> {
1498        let mut concat = Concat {
1499            span: self.span(),
1500            asts: vec![],
1501        };
1502        loop {
1503            self.bump_space();
1504            if self.is_eof() {
1505                break;
1506            }
1507            match self.char() {
1508                '(' => concat = self.push_group(concat)?,
1509                ')' => concat = self.pop_group(concat)?,
1510                '|' => concat = self.push_alternate(concat)?,
1511                '&' => concat = self.push_intersect(concat)?,
1512                '~' => concat = self.push_compl_group(concat)?,
1513                '[' => {
1514                    let class = self.parse_set_class()?;
1515                    concat.asts.push(Ast::class_bracketed(class));
1516                }
1517                '?' => {
1518                    concat =
1519                        self.parse_uncounted_repetition(concat, ast::RepetitionKind::ZeroOrOne)?;
1520                }
1521                '*' => {
1522                    concat =
1523                        self.parse_uncounted_repetition(concat, ast::RepetitionKind::ZeroOrMore)?;
1524                }
1525                '+' => {
1526                    concat =
1527                        self.parse_uncounted_repetition(concat, ast::RepetitionKind::OneOrMore)?;
1528                }
1529                '{' => {
1530                    concat = self.parse_counted_repetition(concat)?;
1531                }
1532                _ => concat.asts.push(self.parse_primitive()?.into_ast()),
1533            }
1534        }
1535        let ast = self.pop_group_end(concat)?;
1536        // dbg!(&ast);
1537        let node_id = self.ast_to_node_id(&ast, &mut None, tb);
1538        node_id
1539    }
1540
1541    #[inline(never)]
1542    fn parse_uncounted_repetition(
1543        &self,
1544        mut concat: ast::Concat,
1545        kind: ast::RepetitionKind,
1546    ) -> Result<ast::Concat> {
1547        // assert!(self.char() == '?' || self.char() == '*' || self.char() == '+');
1548        let op_start = self.pos();
1549        let ast = match concat.asts.pop() {
1550            Some(ast) => ast,
1551            None => return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing)),
1552        };
1553        match ast {
1554            Ast::Empty(_) | Ast::Flags(_) => {
1555                return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing))
1556            }
1557            _ => {}
1558        }
1559        if self.bump() && self.char() == '?' {
1560            return Err(self.error(
1561                Span::new(op_start, self.pos()),
1562                ast::ErrorKind::UnsupportedLazyQuantifier,
1563            ));
1564        }
1565        concat.asts.push(Ast::repetition(ast::Repetition {
1566            span: ast.span().with_end(self.pos()),
1567            op: ast::RepetitionOp {
1568                span: Span::new(op_start, self.pos()),
1569                kind,
1570            },
1571            greedy: true,
1572            ast: Box::new(ast),
1573        }));
1574        Ok(concat)
1575    }
1576
1577    #[inline(never)]
1578    fn parse_counted_repetition(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
1579        assert!(self.char() == '{');
1580        let start = self.pos();
1581        let ast = match concat.asts.pop() {
1582            Some(ast) => ast,
1583            None => return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing)),
1584        };
1585        match ast {
1586            Ast::Empty(_) | Ast::Flags(_) => {
1587                return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing))
1588            }
1589            _ => {}
1590        }
1591        if !self.bump_and_bump_space() {
1592            return Err(self.error(
1593                Span::new(start, self.pos()),
1594                ast::ErrorKind::RepetitionCountUnclosed,
1595            ));
1596        }
1597        let count_start = specialize_err(
1598            self.parse_decimal(),
1599            ast::ErrorKind::DecimalEmpty,
1600            ast::ErrorKind::RepetitionCountDecimalEmpty,
1601        );
1602        if self.is_eof() {
1603            return Err(self.error(
1604                Span::new(start, self.pos()),
1605                ast::ErrorKind::RepetitionCountUnclosed,
1606            ));
1607        }
1608        let range = if self.char() == ',' {
1609            if !self.bump_and_bump_space() {
1610                return Err(self.error(
1611                    Span::new(start, self.pos()),
1612                    ast::ErrorKind::RepetitionCountUnclosed,
1613                ));
1614            }
1615            if self.char() != '}' {
1616                let count_start = match count_start {
1617                    Ok(c) => c,
1618                    Err(err) if err.kind == ast::ErrorKind::RepetitionCountDecimalEmpty => {
1619                        if self.parser().empty_min_range {
1620                            0
1621                        } else {
1622                            return Err(err);
1623                        }
1624                    }
1625                    err => err?,
1626                };
1627                let count_end = specialize_err(
1628                    self.parse_decimal(),
1629                    ast::ErrorKind::DecimalEmpty,
1630                    ast::ErrorKind::RepetitionCountDecimalEmpty,
1631                )?;
1632                ast::RepetitionRange::Bounded(count_start, count_end)
1633            } else {
1634                ast::RepetitionRange::AtLeast(count_start?)
1635            }
1636        } else {
1637            ast::RepetitionRange::Exactly(count_start?)
1638        };
1639
1640        if self.is_eof() || self.char() != '}' {
1641            return Err(self.error(
1642                Span::new(start, self.pos()),
1643                ast::ErrorKind::RepetitionCountUnclosed,
1644            ));
1645        }
1646
1647        if self.bump_and_bump_space() && self.char() == '?' {
1648            return Err(self.error(
1649                Span::new(start, self.pos()),
1650                ast::ErrorKind::UnsupportedLazyQuantifier,
1651            ));
1652        }
1653
1654        let op_span = Span::new(start, self.pos());
1655        if !range.is_valid() {
1656            return Err(self.error(op_span, ast::ErrorKind::RepetitionCountInvalid));
1657        }
1658        concat.asts.push(Ast::repetition(ast::Repetition {
1659            span: ast.span().with_end(self.pos()),
1660            op: ast::RepetitionOp {
1661                span: op_span,
1662                kind: ast::RepetitionKind::Range(range),
1663            },
1664            greedy: true,
1665            ast: Box::new(ast),
1666        }));
1667        Ok(concat)
1668    }
1669
1670    #[inline(never)]
1671    fn parse_group(&self) -> Result<Either<ast::SetFlags, ast::Group>> {
1672        assert_eq!(self.char(), '(');
1673        let open_span = self.span_char();
1674        self.bump();
1675        self.bump_space();
1676        if let Some((ahead, pos)) = self.is_lookaround_prefix() {
1677            let kind = match (pos, ahead) {
1678                (true, true) => LookaroundKind::PositiveLookahead,
1679                (true, false) => LookaroundKind::PositiveLookbehind,
1680                (false, true) => LookaroundKind::NegativeLookahead,
1681                (false, false) => LookaroundKind::NegativeLookbehind,
1682            };
1683            return Ok(Either::Right(ast::Group {
1684                span: open_span,
1685                kind: ast::GroupKind::Lookaround(kind),
1686                ast: Box::new(Ast::empty(self.span())),
1687            }));
1688        }
1689        let inner_span = self.span();
1690        let mut starts_with_p = true;
1691        if self.bump_if("?P<") || {
1692            starts_with_p = false;
1693            self.bump_if("?<")
1694        } {
1695            let capture_index = self.next_capture_index(open_span)?;
1696            let name = self.parse_capture_name(capture_index)?;
1697            Ok(Either::Right(ast::Group {
1698                span: open_span,
1699                kind: ast::GroupKind::CaptureName {
1700                    starts_with_p,
1701                    name,
1702                },
1703                ast: Box::new(Ast::empty(self.span())),
1704            }))
1705        } else if self.bump_if("?") {
1706            if self.is_eof() {
1707                return Err(self.error(open_span, ast::ErrorKind::GroupUnclosed));
1708            }
1709            let flags = self.parse_flags()?;
1710            let char_end = self.char();
1711            self.bump();
1712            if char_end == ')' {
1713                // We don't allow empty flags, e.g., `(?)`. We instead
1714                // interpret it as a repetition operator missing its argument.
1715                if flags.items.is_empty() {
1716                    return Err(self.error(inner_span, ast::ErrorKind::RepetitionMissing));
1717                }
1718                Ok(Either::Left(ast::SetFlags {
1719                    span: Span {
1720                        end: self.pos(),
1721                        ..open_span
1722                    },
1723                    flags,
1724                }))
1725            } else {
1726                assert_eq!(char_end, ':');
1727                Ok(Either::Right(ast::Group {
1728                    span: open_span,
1729                    kind: ast::GroupKind::NonCapturing(flags),
1730                    ast: Box::new(Ast::empty(self.span())),
1731                }))
1732            }
1733        } else {
1734            let capture_index = self.next_capture_index(open_span)?;
1735            Ok(Either::Right(ast::Group {
1736                span: open_span,
1737                kind: ast::GroupKind::CaptureIndex(capture_index),
1738                ast: Box::new(Ast::empty(self.span())),
1739            }))
1740        }
1741    }
1742
1743    #[inline(never)]
1744    fn parse_capture_name(&self, capture_index: u32) -> Result<ast::CaptureName> {
1745        if self.is_eof() {
1746            return Err(self.error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
1747        }
1748        let start = self.pos();
1749        loop {
1750            if self.char() == '>' {
1751                break;
1752            }
1753            if !is_capture_char(self.char(), self.pos() == start) {
1754                return Err(self.error(self.span_char(), ast::ErrorKind::GroupNameInvalid));
1755            }
1756            if !self.bump() {
1757                break;
1758            }
1759        }
1760        let end = self.pos();
1761        if self.is_eof() {
1762            return Err(self.error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
1763        }
1764        assert_eq!(self.char(), '>');
1765        self.bump();
1766        let name = &self.pattern()[start.offset..end.offset];
1767        if name.is_empty() {
1768            return Err(self.error(Span::new(start, start), ast::ErrorKind::GroupNameEmpty));
1769        }
1770        let capname = ast::CaptureName {
1771            span: Span::new(start, end),
1772            name: name.to_string(),
1773            index: capture_index,
1774        };
1775        self.add_capture_name(&capname)?;
1776        Ok(capname)
1777    }
1778
1779    #[inline(never)]
1780    fn parse_flags(&self) -> Result<ast::Flags> {
1781        let mut flags = ast::Flags {
1782            span: self.span(),
1783            items: vec![],
1784        };
1785        let mut last_was_negation = None;
1786        while self.char() != ':' && self.char() != ')' {
1787            if self.char() == '-' {
1788                last_was_negation = Some(self.span_char());
1789                let item = ast::FlagsItem {
1790                    span: self.span_char(),
1791                    kind: ast::FlagsItemKind::Negation,
1792                };
1793                if let Some(i) = flags.add_item(item) {
1794                    return Err(self.error(
1795                        self.span_char(),
1796                        ast::ErrorKind::FlagRepeatedNegation {
1797                            original: flags.items[i].span,
1798                        },
1799                    ));
1800                }
1801            } else {
1802                last_was_negation = None;
1803                let item = ast::FlagsItem {
1804                    span: self.span_char(),
1805                    kind: ast::FlagsItemKind::Flag(self.parse_flag()?),
1806                };
1807                if let Some(i) = flags.add_item(item) {
1808                    return Err(self.error(
1809                        self.span_char(),
1810                        ast::ErrorKind::FlagDuplicate {
1811                            original: flags.items[i].span,
1812                        },
1813                    ));
1814                }
1815            }
1816            if !self.bump() {
1817                return Err(self.error(self.span(), ast::ErrorKind::FlagUnexpectedEof));
1818            }
1819        }
1820        if let Some(span) = last_was_negation {
1821            return Err(self.error(span, ast::ErrorKind::FlagDanglingNegation));
1822        }
1823        flags.span.end = self.pos();
1824        Ok(flags)
1825    }
1826
1827    #[inline(never)]
1828    fn parse_flag(&self) -> Result<ast::Flag> {
1829        match self.char() {
1830            'i' => Ok(ast::Flag::CaseInsensitive),
1831            'm' => Ok(ast::Flag::MultiLine),
1832            's' => Ok(ast::Flag::DotMatchesNewLine),
1833            'U' => Ok(ast::Flag::SwapGreed),
1834            'u' => Ok(ast::Flag::Unicode),
1835            'R' => Ok(ast::Flag::CRLF),
1836            'x' => Ok(ast::Flag::IgnoreWhitespace),
1837            _ => Err(self.error(self.span_char(), ast::ErrorKind::FlagUnrecognized)),
1838        }
1839    }
1840
1841    fn parse_primitive(&self) -> Result<Primitive> {
1842        match self.char() {
1843            '\\' => self.parse_escape(),
1844            '_' => {
1845                let ast = Primitive::Top(self.span_char());
1846                self.bump();
1847                Ok(ast)
1848            }
1849            '.' => {
1850                let ast = Primitive::Dot(self.span_char());
1851                self.bump();
1852                Ok(ast)
1853            }
1854            '^' => {
1855                let ast = Primitive::Assertion(ast::Assertion {
1856                    span: self.span_char(),
1857                    kind: ast::AssertionKind::StartLine,
1858                });
1859                self.bump();
1860                Ok(ast)
1861            }
1862            '$' => {
1863                let ast = Primitive::Assertion(ast::Assertion {
1864                    span: self.span_char(),
1865                    kind: ast::AssertionKind::EndLine,
1866                });
1867                self.bump();
1868                Ok(ast)
1869            }
1870            c => {
1871                let ast = Primitive::Literal(Literal {
1872                    span: self.span_char(),
1873                    kind: LiteralKind::Verbatim,
1874                    c,
1875                });
1876                self.bump();
1877                Ok(ast)
1878            }
1879        }
1880    }
1881
1882    #[inline(never)]
1883    fn parse_escape(&self) -> Result<Primitive> {
1884        assert_eq!(self.char(), '\\');
1885        let start = self.pos();
1886        if !self.bump() {
1887            return Err(self.error(
1888                Span::new(start, self.pos()),
1889                ast::ErrorKind::EscapeUnexpectedEof,
1890            ));
1891        }
1892        let c = self.char();
1893        // Put some of the more complicated routines into helpers.
1894        match c {
1895            '0'..='9' => {
1896                if !self.parser().octal {
1897                    return Err(self.error(
1898                        Span::new(start, self.span_char().end),
1899                        ast::ErrorKind::UnsupportedBackreference,
1900                    ));
1901                }
1902                let mut lit = self.parse_octal();
1903                lit.span.start = start;
1904                return Ok(Primitive::Literal(lit));
1905            }
1906            // '8'..='9' if !self.parser().octal => {
1907            //     return Err(self.error(
1908            //         Span::new(start, self.span_char().end),
1909            //         ast::ErrorKind::UnsupportedBackreference,
1910            //     ));
1911            // }
1912            'x' | 'u' | 'U' => {
1913                let mut lit = self.parse_hex()?;
1914                lit.span.start = start;
1915                return Ok(Primitive::Literal(lit));
1916            }
1917            'p' | 'P' => {
1918                let mut cls = self.parse_unicode_class()?;
1919                cls.span.start = start;
1920                return Ok(Primitive::Unicode(cls));
1921            }
1922            'd' | 's' | 'w' | 'D' | 'S' | 'W' => {
1923                let mut cls = self.parse_perl_class();
1924                cls.span.start = start;
1925                return Ok(Primitive::Perl(cls));
1926            }
1927            _ => {}
1928        }
1929
1930        // Handle all of the one letter sequences inline.
1931        self.bump();
1932        let span = Span::new(start, self.pos());
1933        if is_meta_character(c) {
1934            return Ok(Primitive::Literal(Literal {
1935                span,
1936                kind: LiteralKind::Meta,
1937                c,
1938            }));
1939        }
1940        if is_escapeable_character(c) {
1941            return Ok(Primitive::Literal(Literal {
1942                span,
1943                kind: LiteralKind::Superfluous,
1944                c,
1945            }));
1946        }
1947        let special = |kind, c| {
1948            Ok(Primitive::Literal(Literal {
1949                span,
1950                kind: LiteralKind::Special(kind),
1951                c,
1952            }))
1953        };
1954        match c {
1955            'a' => special(SpecialLiteralKind::Bell, '\x07'),
1956            'f' => special(SpecialLiteralKind::FormFeed, '\x0C'),
1957            't' => special(SpecialLiteralKind::Tab, '\t'),
1958            'n' => special(SpecialLiteralKind::LineFeed, '\n'),
1959            'r' => special(SpecialLiteralKind::CarriageReturn, '\r'),
1960            'v' => special(SpecialLiteralKind::VerticalTab, '\x0B'),
1961            'A' => Ok(Primitive::Assertion(ast::Assertion {
1962                span,
1963                kind: ast::AssertionKind::StartText,
1964            })),
1965            'z' => Ok(Primitive::Assertion(ast::Assertion {
1966                span,
1967                kind: ast::AssertionKind::EndText,
1968            })),
1969            'b' => {
1970                let mut wb = ast::Assertion {
1971                    span,
1972                    kind: ast::AssertionKind::WordBoundary,
1973                };
1974                // After a \b, we "try" to parse things like \b{start} for
1975                // special word boundary assertions.
1976                if !self.is_eof() && self.char() == '{' {
1977                    if let Some(kind) = self.maybe_parse_special_word_boundary(start)? {
1978                        wb.kind = kind;
1979                        wb.span.end = self.pos();
1980                    }
1981                }
1982                Ok(Primitive::Assertion(wb))
1983            }
1984            'B' => Ok(Primitive::Assertion(ast::Assertion {
1985                span,
1986                kind: ast::AssertionKind::NotWordBoundary,
1987            })),
1988            '<' => Ok(Primitive::Assertion(ast::Assertion {
1989                span,
1990                kind: ast::AssertionKind::WordBoundaryStartAngle,
1991            })),
1992            '>' => Ok(Primitive::Assertion(ast::Assertion {
1993                span,
1994                kind: ast::AssertionKind::WordBoundaryEndAngle,
1995            })),
1996            _ => Err(self.error(span, ast::ErrorKind::EscapeUnrecognized)),
1997        }
1998    }
1999
2000    fn maybe_parse_special_word_boundary(
2001        &self,
2002        wb_start: Position,
2003    ) -> Result<Option<ast::AssertionKind>> {
2004        assert_eq!(self.char(), '{');
2005
2006        let is_valid_char = |c| match c {
2007            'A'..='Z' | 'a'..='z' | '-' => true,
2008            _ => false,
2009        };
2010        let start = self.pos();
2011        if !self.bump_and_bump_space() {
2012            return Err(self.error(
2013                Span::new(wb_start, self.pos()),
2014                ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof,
2015            ));
2016        }
2017        let start_contents = self.pos();
2018        // This is one of the critical bits: if the first non-whitespace
2019        // character isn't in [-A-Za-z] (i.e., this can't be a special word
2020        // boundary), then we bail and let the counted repetition parser deal
2021        // with this.
2022        if !is_valid_char(self.char()) {
2023            self.parser().pos.set(start);
2024            return Ok(None);
2025        }
2026
2027        // Now collect up our chars until we see a '}'.
2028        let mut scratch = self.parser().scratch.borrow_mut();
2029        scratch.clear();
2030        while !self.is_eof() && is_valid_char(self.char()) {
2031            scratch.push(self.char());
2032            self.bump_and_bump_space();
2033        }
2034        if self.is_eof() || self.char() != '}' {
2035            return Err(self.error(
2036                Span::new(start, self.pos()),
2037                ast::ErrorKind::SpecialWordBoundaryUnclosed,
2038            ));
2039        }
2040        let end = self.pos();
2041        self.bump();
2042        let kind = match scratch.as_str() {
2043            "start" => ast::AssertionKind::WordBoundaryStart,
2044            "end" => ast::AssertionKind::WordBoundaryEnd,
2045            "start-half" => ast::AssertionKind::WordBoundaryStartHalf,
2046            "end-half" => ast::AssertionKind::WordBoundaryEndHalf,
2047            _ => {
2048                return Err(self.error(
2049                    Span::new(start_contents, end),
2050                    ast::ErrorKind::SpecialWordBoundaryUnrecognized,
2051                ))
2052            }
2053        };
2054        Ok(Some(kind))
2055    }
2056
2057    #[inline(never)]
2058    fn parse_octal(&self) -> Literal {
2059        assert!(self.parser().octal);
2060        assert!('0' <= self.char() && self.char() <= '7');
2061        let start = self.pos();
2062        // Parse up to two more digits.
2063        while self.bump()
2064            && '0' <= self.char()
2065            && self.char() <= '7'
2066            && self.pos().offset - start.offset <= 2
2067        {}
2068        let end = self.pos();
2069        let octal = &self.pattern()[start.offset..end.offset];
2070        // Parsing the octal should never fail since the above guarantees a
2071        // valid number.
2072        let codepoint = u32::from_str_radix(octal, 8).expect("valid octal number");
2073        // The max value for 3 digit octal is 0777 = 511 and [0, 511] has no
2074        // invalid Unicode scalar values.
2075        let c = char::from_u32(codepoint).expect("Unicode scalar value");
2076        Literal {
2077            span: Span::new(start, end),
2078            kind: LiteralKind::Octal,
2079            c,
2080        }
2081    }
2082
2083    #[inline(never)]
2084    fn parse_hex(&self) -> Result<Literal> {
2085        assert!(self.char() == 'x' || self.char() == 'u' || self.char() == 'U');
2086
2087        let hex_kind = match self.char() {
2088            'x' => HexLiteralKind::X,
2089            'u' => HexLiteralKind::UnicodeShort,
2090            _ => HexLiteralKind::UnicodeLong,
2091        };
2092        if !self.bump_and_bump_space() {
2093            return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2094        }
2095        if self.char() == '{' {
2096            self.parse_hex_brace(hex_kind)
2097        } else {
2098            self.parse_hex_digits(hex_kind)
2099        }
2100    }
2101
2102    #[inline(never)]
2103    fn parse_hex_digits(&self, kind: HexLiteralKind) -> Result<Literal> {
2104        let mut scratch = self.parser().scratch.borrow_mut();
2105        scratch.clear();
2106
2107        let start = self.pos();
2108        for i in 0..kind.digits() {
2109            if i > 0 && !self.bump_and_bump_space() {
2110                return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2111            }
2112            if !is_hex(self.char()) {
2113                return Err(self.error(self.span_char(), ast::ErrorKind::EscapeHexInvalidDigit));
2114            }
2115            scratch.push(self.char());
2116        }
2117        // The final bump just moves the parser past the literal, which may
2118        // be EOF.
2119        self.bump_and_bump_space();
2120        let end = self.pos();
2121        let hex = scratch.as_str();
2122        match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
2123            None => Err(self.error(Span::new(start, end), ast::ErrorKind::EscapeHexInvalid)),
2124            Some(c) => Ok(Literal {
2125                span: Span::new(start, end),
2126                kind: LiteralKind::HexFixed(kind),
2127                c,
2128            }),
2129        }
2130    }
2131
2132    #[inline(never)]
2133    fn parse_hex_brace(&self, kind: HexLiteralKind) -> Result<Literal> {
2134        let mut scratch = self.parser().scratch.borrow_mut();
2135        scratch.clear();
2136
2137        let brace_pos = self.pos();
2138        let start = self.span_char().end;
2139        while self.bump_and_bump_space() && self.char() != '}' {
2140            if !is_hex(self.char()) {
2141                return Err(self.error(self.span_char(), ast::ErrorKind::EscapeHexInvalidDigit));
2142            }
2143            scratch.push(self.char());
2144        }
2145        if self.is_eof() {
2146            return Err(self.error(
2147                Span::new(brace_pos, self.pos()),
2148                ast::ErrorKind::EscapeUnexpectedEof,
2149            ));
2150        }
2151        let end = self.pos();
2152        let hex = scratch.as_str();
2153        assert_eq!(self.char(), '}');
2154        self.bump_and_bump_space();
2155
2156        if hex.is_empty() {
2157            return Err(self.error(
2158                Span::new(brace_pos, self.pos()),
2159                ast::ErrorKind::EscapeHexEmpty,
2160            ));
2161        }
2162        match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
2163            None => Err(self.error(Span::new(start, end), ast::ErrorKind::EscapeHexInvalid)),
2164            Some(c) => Ok(Literal {
2165                span: Span::new(start, self.pos()),
2166                kind: LiteralKind::HexBrace(kind),
2167                c,
2168            }),
2169        }
2170    }
2171
2172    fn parse_decimal(&self) -> Result<u32> {
2173        let mut scratch = self.parser().scratch.borrow_mut();
2174        scratch.clear();
2175
2176        while !self.is_eof() && self.char().is_whitespace() {
2177            self.bump();
2178        }
2179        let start = self.pos();
2180        while !self.is_eof() && '0' <= self.char() && self.char() <= '9' {
2181            scratch.push(self.char());
2182            self.bump_and_bump_space();
2183        }
2184        let span = Span::new(start, self.pos());
2185        while !self.is_eof() && self.char().is_whitespace() {
2186            self.bump_and_bump_space();
2187        }
2188        let digits = scratch.as_str();
2189        if digits.is_empty() {
2190            return Err(self.error(span, ast::ErrorKind::DecimalEmpty));
2191        }
2192        match u32::from_str_radix(digits, 10).ok() {
2193            Some(n) => Ok(n),
2194            None => Err(self.error(span, ast::ErrorKind::DecimalInvalid)),
2195        }
2196    }
2197
2198    #[inline(never)]
2199    fn parse_set_class(&self) -> Result<ClassBracketed> {
2200        assert_eq!(self.char(), '[');
2201
2202        let mut union = ClassSetUnion {
2203            span: self.span(),
2204            items: vec![],
2205        };
2206        loop {
2207            self.bump_space();
2208            if self.is_eof() {
2209                return Err(self.unclosed_class_error());
2210            }
2211            match self.char() {
2212                '[' => {
2213                    // If we've already parsed the opening bracket, then
2214                    // attempt to treat this as the beginning of an ASCII
2215                    // class. If ASCII class parsing fails, then the parser
2216                    // backs up to `[`.
2217                    if !self.parser().stack_class.borrow().is_empty() {
2218                        if let Some(cls) = self.maybe_parse_ascii_class() {
2219                            union.push(ClassSetItem::Ascii(cls));
2220                            continue;
2221                        }
2222                    }
2223                    union = self.push_class_open(union)?;
2224                }
2225                ']' => match self.pop_class(union)? {
2226                    Either::Left(nested_union) => {
2227                        union = nested_union;
2228                    }
2229                    Either::Right(class) => return Ok(class),
2230                },
2231                '&' if self.peek() == Some('&') => {
2232                    assert!(self.bump_if("&&"));
2233                    union = self.push_class_op(ClassSetBinaryOpKind::Intersection, union);
2234                }
2235                '-' if self.peek() == Some('-') => {
2236                    assert!(self.bump_if("--"));
2237                    union = self.push_class_op(ClassSetBinaryOpKind::Difference, union);
2238                }
2239                '~' if self.peek() == Some('~') => {
2240                    assert!(self.bump_if("~~"));
2241                    union = self.push_class_op(ClassSetBinaryOpKind::SymmetricDifference, union);
2242                }
2243                _ => {
2244                    union.push(self.parse_set_class_range()?);
2245                }
2246            }
2247        }
2248    }
2249
2250    #[inline(never)]
2251    fn parse_set_class_range(&self) -> Result<ClassSetItem> {
2252        let prim1 = self.parse_set_class_item()?;
2253        self.bump_space();
2254        if self.is_eof() {
2255            return Err(self.unclosed_class_error());
2256        }
2257        if self.char() != '-' || self.peek_space() == Some(']') || self.peek_space() == Some('-') {
2258            return prim1.into_class_set_item(self);
2259        }
2260        if !self.bump_and_bump_space() {
2261            return Err(self.unclosed_class_error());
2262        }
2263        let prim2 = self.parse_set_class_item()?;
2264        let range = ClassSetRange {
2265            span: Span::new(prim1.span().start, prim2.span().end),
2266            start: prim1.into_class_literal(self)?,
2267            end: prim2.into_class_literal(self)?,
2268        };
2269        if !range.is_valid() {
2270            return Err(self.error(range.span, ast::ErrorKind::ClassRangeInvalid));
2271        }
2272        Ok(ClassSetItem::Range(range))
2273    }
2274
2275    #[inline(never)]
2276    fn parse_set_class_item(&self) -> Result<Primitive> {
2277        if self.char() == '\\' {
2278            self.parse_escape()
2279        } else {
2280            let x = Primitive::Literal(Literal {
2281                span: self.span_char(),
2282                kind: LiteralKind::Verbatim,
2283                c: self.char(),
2284            });
2285            self.bump();
2286            Ok(x)
2287        }
2288    }
2289
2290    #[inline(never)]
2291    fn parse_set_class_open(&self) -> Result<(ClassBracketed, ClassSetUnion)> {
2292        assert_eq!(self.char(), '[');
2293        let start = self.pos();
2294        if !self.bump_and_bump_space() {
2295            return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2296        }
2297
2298        let negated = if self.char() != '^' {
2299            false
2300        } else {
2301            if !self.bump_and_bump_space() {
2302                return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2303            }
2304            true
2305        };
2306        // Accept any number of `-` as literal `-`.
2307        let mut union = ClassSetUnion {
2308            span: self.span(),
2309            items: vec![],
2310        };
2311        while self.char() == '-' {
2312            union.push(ClassSetItem::Literal(Literal {
2313                span: self.span_char(),
2314                kind: LiteralKind::Verbatim,
2315                c: '-',
2316            }));
2317            if !self.bump_and_bump_space() {
2318                return Err(self.error(Span::new(start, start), ast::ErrorKind::ClassUnclosed));
2319            }
2320        }
2321        // If `]` is the *first* char in a set, then interpret it as a literal
2322        // `]`. That is, an empty class is impossible to write.
2323        if union.items.is_empty() && self.char() == ']' {
2324            union.push(ClassSetItem::Literal(Literal {
2325                span: self.span_char(),
2326                kind: LiteralKind::Verbatim,
2327                c: ']',
2328            }));
2329            if !self.bump_and_bump_space() {
2330                return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2331            }
2332        }
2333        let set = ClassBracketed {
2334            span: Span::new(start, self.pos()),
2335            negated,
2336            kind: ClassSet::union(ClassSetUnion {
2337                span: Span::new(union.span.start, union.span.start),
2338                items: vec![],
2339            }),
2340        };
2341        Ok((set, union))
2342    }
2343
2344    #[inline(never)]
2345    fn maybe_parse_ascii_class(&self) -> Option<ClassAscii> {
2346        assert_eq!(self.char(), '[');
2347        // If parsing fails, then we back up the parser to this starting point.
2348        let start = self.pos();
2349        let mut negated = false;
2350        if !self.bump() || self.char() != ':' {
2351            self.parser().pos.set(start);
2352            return None;
2353        }
2354        if !self.bump() {
2355            self.parser().pos.set(start);
2356            return None;
2357        }
2358        if self.char() == '^' {
2359            negated = true;
2360            if !self.bump() {
2361                self.parser().pos.set(start);
2362                return None;
2363            }
2364        }
2365        let name_start = self.offset();
2366        while self.char() != ':' && self.bump() {}
2367        if self.is_eof() {
2368            self.parser().pos.set(start);
2369            return None;
2370        }
2371        let name = &self.pattern()[name_start..self.offset()];
2372        if !self.bump_if(":]") {
2373            self.parser().pos.set(start);
2374            return None;
2375        }
2376        let kind = match regex_syntax::ast::ClassAsciiKind::from_name(name) {
2377            Some(kind) => kind,
2378            None => {
2379                self.parser().pos.set(start);
2380                return None;
2381            }
2382        };
2383        Some(ClassAscii {
2384            span: Span::new(start, self.pos()),
2385            kind,
2386            negated,
2387        })
2388    }
2389
2390    #[inline(never)]
2391    fn parse_unicode_class(&self) -> Result<ClassUnicode> {
2392        assert!(self.char() == 'p' || self.char() == 'P');
2393
2394        let mut scratch = self.parser().scratch.borrow_mut();
2395        scratch.clear();
2396
2397        let negated = self.char() == 'P';
2398        if !self.bump_and_bump_space() {
2399            return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2400        }
2401        let (start, kind) = if self.char() == '{' {
2402            let start = self.span_char().end;
2403            while self.bump_and_bump_space() && self.char() != '}' {
2404                scratch.push(self.char());
2405            }
2406            if self.is_eof() {
2407                return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2408            }
2409            assert_eq!(self.char(), '}');
2410            self.bump();
2411
2412            let name = scratch.as_str();
2413            if let Some(i) = name.find("!=") {
2414                (
2415                    start,
2416                    ClassUnicodeKind::NamedValue {
2417                        op: ClassUnicodeOpKind::NotEqual,
2418                        name: name[..i].to_string(),
2419                        value: name[i + 2..].to_string(),
2420                    },
2421                )
2422            } else if let Some(i) = name.find(':') {
2423                (
2424                    start,
2425                    ClassUnicodeKind::NamedValue {
2426                        op: ClassUnicodeOpKind::Colon,
2427                        name: name[..i].to_string(),
2428                        value: name[i + 1..].to_string(),
2429                    },
2430                )
2431            } else if let Some(i) = name.find('=') {
2432                (
2433                    start,
2434                    ClassUnicodeKind::NamedValue {
2435                        op: ClassUnicodeOpKind::Equal,
2436                        name: name[..i].to_string(),
2437                        value: name[i + 1..].to_string(),
2438                    },
2439                )
2440            } else {
2441                (start, ClassUnicodeKind::Named(name.to_string()))
2442            }
2443        } else {
2444            let start = self.pos();
2445            let c = self.char();
2446            if c == '\\' {
2447                return Err(self.error(self.span_char(), ast::ErrorKind::UnicodeClassInvalid));
2448            }
2449            self.bump_and_bump_space();
2450            let kind = ClassUnicodeKind::OneLetter(c);
2451            (start, kind)
2452        };
2453        Ok(ClassUnicode {
2454            span: Span::new(start, self.pos()),
2455            negated,
2456            kind,
2457        })
2458    }
2459
2460    #[inline(never)]
2461    fn parse_perl_class(&self) -> ClassPerl {
2462        let c = self.char();
2463        let span = self.span_char();
2464        self.bump();
2465        let (negated, kind) = match c {
2466            'd' => (false, regex_syntax::ast::ClassPerlKind::Digit),
2467            'D' => (true, regex_syntax::ast::ClassPerlKind::Digit),
2468            's' => (false, regex_syntax::ast::ClassPerlKind::Space),
2469            'S' => (true, regex_syntax::ast::ClassPerlKind::Space),
2470            'w' => (false, regex_syntax::ast::ClassPerlKind::Word),
2471            'W' => (true, regex_syntax::ast::ClassPerlKind::Word),
2472            c => panic!("expected valid Perl class but got '{}'", c),
2473        };
2474        ClassPerl {
2475            span,
2476            kind,
2477            negated,
2478        }
2479    }
2480}
2481
2482pub fn parse_ast<'s>(
2483    tb: &mut TB<'s>,
2484    pattern: &'s str,
2485) -> std::result::Result<NodeId, ResharpError> {
2486    let mut p: ResharpParser<'s> = ResharpParser::new(pattern);
2487    let result = p.parse(tb);
2488    result
2489}
resharp_parser/lib.rs

resharp_parser/
lib.rs