resharp_parser/
lib.rs

1//! Parser for resharp regex patterns.
2//!
3//! Converts regex pattern strings into the node representation used by resharp-algebra.
4
5#![warn(dead_code)]
6mod ast;
7use std::cell::{Cell, RefCell};
8
9use ast::{Ast, Concat, ErrorKind, GroupKind, LookaroundKind};
10use regex_syntax::{
11    ast::{
12        ClassAscii, ClassBracketed, ClassPerl, ClassSet, ClassSetBinaryOpKind, ClassSetItem,
13        ClassSetRange, ClassSetUnion, ClassUnicode, ClassUnicodeKind, ClassUnicodeOpKind,
14        HexLiteralKind, Literal, LiteralKind, Position, Span, SpecialLiteralKind,
15    },
16    hir::{
17        self,
18        translate::{Translator, TranslatorBuilder},
19    },
20    utf8::Utf8Sequences,
21};
22use resharp_algebra::NodeId;
23
24type TB<'s> = resharp_algebra::RegexBuilder;
25
26#[derive(Clone, Copy, PartialEq)]
27enum WordCharKind {
28    Word,
29    NonWord,
30    MaybeWord,
31    MaybeNonWord,
32    Unknown,
33    Edge,
34}
35
36fn is_word_byte(b: u8) -> bool {
37    b.is_ascii_alphanumeric() || b == b'_'
38}
39
40#[derive(Clone, Debug, Eq, PartialEq)]
41enum Primitive {
42    Literal(Literal),
43    Assertion(ast::Assertion),
44    Dot(Span),
45    Top(Span),
46    Perl(ClassPerl),
47    Unicode(ClassUnicode),
48}
49
50impl Primitive {
51    fn span(&self) -> &Span {
52        match *self {
53            Primitive::Literal(ref x) => &x.span,
54            Primitive::Assertion(ref x) => &x.span,
55            Primitive::Dot(ref span) => span,
56            Primitive::Top(ref span) => span,
57            Primitive::Perl(ref x) => &x.span,
58            Primitive::Unicode(ref x) => &x.span,
59        }
60    }
61
62    fn into_ast(self) -> Ast {
63        match self {
64            Primitive::Literal(lit) => Ast::literal(lit),
65            Primitive::Assertion(assert) => Ast::assertion(assert),
66            Primitive::Dot(span) => Ast::dot(span),
67            Primitive::Top(span) => Ast::top(span),
68            Primitive::Perl(cls) => Ast::class_perl(cls),
69            Primitive::Unicode(cls) => Ast::class_unicode(cls),
70        }
71    }
72
73    fn into_class_set_item(self, p: &ResharpParser) -> Result<regex_syntax::ast::ClassSetItem> {
74        use self::Primitive::*;
75        use regex_syntax::ast::ClassSetItem;
76
77        match self {
78            Literal(lit) => Ok(ClassSetItem::Literal(lit)),
79            Perl(cls) => Ok(ClassSetItem::Perl(cls)),
80            Unicode(cls) => Ok(ClassSetItem::Unicode(cls)),
81            x => Err(p.error(*x.span(), ast::ErrorKind::ClassEscapeInvalid)),
82        }
83    }
84
85    fn into_class_literal(self, p: &ResharpParser) -> Result<Literal> {
86        use self::Primitive::*;
87
88        match self {
89            Literal(lit) => Ok(lit),
90            x => Err(p.error(*x.span(), ast::ErrorKind::ClassRangeLiteral)),
91        }
92    }
93}
94
95#[derive(Clone, Debug, Eq, PartialEq)]
96pub enum Either<Left, Right> {
97    Left(Left),
98    Right(Right),
99}
100
101#[derive(Clone, Debug, Eq, PartialEq)]
102pub struct ResharpError {
103    /// The kind of error.
104    pub kind: ErrorKind,
105    /// The original pattern that the parser generated the error from. Every
106    /// span in an error is a valid range into this string.
107    pattern: String,
108    /// The span of this error.
109    pub span: Span,
110}
111
112impl std::fmt::Display for ResharpError {
113    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
114        write!(f, "{:?}: {:?}", self.kind, self.span)
115    }
116}
117impl std::error::Error for ResharpError {}
118
119type Result<T> = core::result::Result<T, ResharpError>;
120
121#[derive(Clone, Debug)]
122enum GroupState {
123    /// This state is pushed whenever an opening group is found.
124    Group {
125        /// The concatenation immediately preceding the opening group.
126        concat: Concat,
127        /// The group that has been opened. Its sub-AST is always empty.
128        group: ast::Group,
129        /// Whether this group has the `x` flag enabled or not.
130        ignore_whitespace: bool,
131    },
132    /// This state is pushed whenever a new alternation branch is found. If
133    /// an alternation branch is found and this state is at the top of the
134    /// stack, then this state should be modified to include the new
135    /// alternation.
136    Alternation(ast::Alternation),
137    Intersection(ast::Intersection),
138}
139
140#[derive(Clone, Debug)]
141enum ClassState {
142    /// This state is pushed whenever an opening bracket is found.
143    Open {
144        /// The union of class items immediately preceding this class.
145        union: regex_syntax::ast::ClassSetUnion,
146        /// The class that has been opened. Typically this just corresponds
147        /// to the `[`, but it can also include `[^` since `^` indicates
148        /// negation of the class.
149        set: regex_syntax::ast::ClassBracketed,
150    },
151    /// This state is pushed when a operator is seen. When popped, the stored
152    /// set becomes the left hand side of the operator.
153    Op {
154        /// The type of the operation, i.e., &&, -- or ~~.
155        kind: regex_syntax::ast::ClassSetBinaryOpKind,
156        /// The left-hand side of the operator.
157        lhs: regex_syntax::ast::ClassSet,
158    },
159}
160
161/// RE# syntax parser based on the regex-syntax crate.
162pub struct ResharpParser<'s> {
163    perl_classes: Vec<(bool, regex_syntax::ast::ClassPerlKind, NodeId)>,
164    pub translator: regex_syntax::hir::translate::Translator,
165    pub pattern: &'s str,
166    pos: Cell<Position>,
167    capture_index: Cell<u32>,
168    octal: bool,
169    empty_min_range: bool,
170    ignore_whitespace: Cell<bool>,
171    comments: RefCell<Vec<ast::Comment>>,
172    stack_group: RefCell<Vec<GroupState>>,
173    stack_class: RefCell<Vec<ClassState>>,
174    capture_names: RefCell<Vec<ast::CaptureName>>,
175    scratch: RefCell<String>,
176}
177
178fn specialize_err<T>(result: Result<T>, from: ast::ErrorKind, to: ast::ErrorKind) -> Result<T> {
179    if let Err(e) = result {
180        if e.kind == from {
181            Err(ResharpError {
182                kind: to,
183                pattern: e.pattern,
184                span: e.span,
185            })
186        } else {
187            Err(e)
188        }
189    } else {
190        result
191    }
192}
193
194fn is_capture_char(c: char, first: bool) -> bool {
195    if first {
196        c == '_' || c.is_alphabetic()
197    } else {
198        c == '_' || c == '.' || c == '[' || c == ']' || c.is_alphanumeric()
199    }
200}
201
202pub fn is_meta_character(c: char) -> bool {
203    match c {
204        '\\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' | '[' | ']' | '{' | '}' | '^' | '$'
205        | '#' | '&' | '-' | '~' | '_' => true,
206        _ => false,
207    }
208}
209
210/// escapes all resharp meta characters in `text`.
211pub fn escape(text: &str) -> String {
212    let mut buf = String::new();
213    escape_into(text, &mut buf);
214    buf
215}
216
217/// escapes all resharp meta characters in `text` and appends to `buf`.
218pub fn escape_into(text: &str, buf: &mut String) {
219    buf.reserve(text.len());
220    for c in text.chars() {
221        if is_meta_character(c) {
222            buf.push('\\');
223        }
224        buf.push(c);
225    }
226}
227
228pub fn is_escapeable_character(c: char) -> bool {
229    // Certainly escapeable if it's a meta character.
230    if is_meta_character(c) {
231        return true;
232    }
233    // Any character that isn't ASCII is definitely not escapeable. There's
234    // no real need to allow things like \☃ right?
235    if !c.is_ascii() {
236        return false;
237    }
238    // Otherwise, we basically say that everything is escapeable unless it's a
239    // letter or digit. Things like \3 are either octal (when enabled) or an
240    // error, and we should keep it that way. Otherwise, letters are reserved
241    // for adding new syntax in a backwards compatible way.
242    match c {
243        '0'..='9' | 'A'..='Z' | 'a'..='z' => false,
244        // While not currently supported, we keep these as not escapeable to
245        // give us some flexibility with respect to supporting the \< and
246        // \> word boundary assertions in the future. By rejecting them as
247        // escapeable, \< and \> will result in a parse error. Thus, we can
248        // turn them into something else in the future without it being a
249        // backwards incompatible change.
250        //
251        // OK, now we support \< and \>, and we need to retain them as *not*
252        // escapeable here since the escape sequence is significant.
253        '<' | '>' => false,
254        _ => true,
255    }
256}
257
258fn is_hex(c: char) -> bool {
259    ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F')
260}
261
262impl<'s> ResharpParser<'s> {
263    fn default_translator_builder() -> TranslatorBuilder {
264        let mut trb = TranslatorBuilder::new();
265        trb.unicode(true);
266        trb.utf8(false);
267        trb
268    }
269
270    pub fn new(pattern: &'s str) -> Self {
271        let trb = Self::default_translator_builder();
272        Self {
273            translator: trb.build(),
274            pattern,
275            perl_classes: vec![],
276            pos: Cell::new(Position::new(0, 0, 0)),
277            capture_index: Cell::new(0),
278            octal: false,
279            empty_min_range: false,
280            ignore_whitespace: Cell::new(false),
281            comments: RefCell::new(vec![]),
282            stack_group: RefCell::new(vec![]),
283            stack_class: RefCell::new(vec![]),
284            capture_names: RefCell::new(vec![]),
285            scratch: RefCell::new(String::new()),
286        }
287    }
288
289    /// Return a reference to the parser state.
290    fn parser(&'_ self) -> &'_ ResharpParser<'_> {
291        self
292    }
293
294    /// Return a reference to the pattern being parsed.
295    fn pattern(&self) -> &str {
296        self.pattern
297    }
298
299    /// Create a new error with the given span and error type.
300    fn error(&self, span: Span, kind: ast::ErrorKind) -> ResharpError {
301        ResharpError {
302            kind,
303            pattern: self.pattern().to_string(),
304            span,
305        }
306    }
307
308    fn unsupported_error(&self, _: regex_syntax::hir::Error) -> ResharpError {
309        let emptyspan = Span::splat(self.pos());
310        let inner = self.error(emptyspan, ast::ErrorKind::UnsupportedResharpRegex);
311        inner
312    }
313
314    /// Return the current offset of the parser.
315    ///
316    /// The offset starts at `0` from the beginning of the regular expression
317    /// pattern string.
318    fn offset(&self) -> usize {
319        self.parser().pos.get().offset
320    }
321
322    /// Return the current line number of the parser.
323    ///
324    /// The line number starts at `1`.
325    fn line(&self) -> usize {
326        self.parser().pos.get().line
327    }
328
329    /// Return the current column of the parser.
330    ///
331    /// The column number starts at `1` and is reset whenever a `\n` is seen.
332    fn column(&self) -> usize {
333        self.parser().pos.get().column
334    }
335
336    /// Return the next capturing index. Each subsequent call increments the
337    /// internal index.
338    ///
339    /// The span given should correspond to the location of the opening
340    /// parenthesis.
341    ///
342    /// If the capture limit is exceeded, then an error is returned.
343    fn next_capture_index(&self, span: Span) -> Result<u32> {
344        let current = self.parser().capture_index.get();
345        let i = current
346            .checked_add(1)
347            .ok_or_else(|| self.error(span, ast::ErrorKind::CaptureLimitExceeded))?;
348        self.parser().capture_index.set(i);
349        Ok(i)
350    }
351
352    fn add_capture_name(&self, cap: &ast::CaptureName) -> Result<()> {
353        let mut names = self.parser().capture_names.borrow_mut();
354        match names.binary_search_by_key(&cap.name.as_str(), |c| c.name.as_str()) {
355            Err(i) => {
356                names.insert(i, cap.clone());
357                Ok(())
358            }
359            Ok(i) => Err(self.error(
360                cap.span,
361                ast::ErrorKind::GroupNameDuplicate {
362                    original: names[i].span,
363                },
364            )),
365        }
366    }
367
368    fn ignore_whitespace(&self) -> bool {
369        self.parser().ignore_whitespace.get()
370    }
371
372    fn char(&self) -> char {
373        self.char_at(self.offset())
374    }
375
376    fn char_at(&self, i: usize) -> char {
377        self.pattern()[i..]
378            .chars()
379            .next()
380            .unwrap_or_else(|| panic!("expected char at offset {}", i))
381    }
382
383    fn bump(&self) -> bool {
384        if self.is_eof() {
385            return false;
386        }
387        let Position {
388            mut offset,
389            mut line,
390            mut column,
391        } = self.pos();
392        if self.char() == '\n' {
393            line = line.checked_add(1).unwrap();
394            column = 1;
395        } else {
396            column = column.checked_add(1).unwrap();
397        }
398        offset += self.char().len_utf8();
399        self.parser().pos.set(Position {
400            offset,
401            line,
402            column,
403        });
404        self.pattern()[self.offset()..].chars().next().is_some()
405    }
406
407    fn bump_if(&self, prefix: &str) -> bool {
408        if self.pattern()[self.offset()..].starts_with(prefix) {
409            for _ in 0..prefix.chars().count() {
410                self.bump();
411            }
412            true
413        } else {
414            false
415        }
416    }
417
418    fn is_lookaround_prefix(&self) -> Option<(bool, bool)> {
419        if self.bump_if("?=") {
420            return Some((true, true));
421        }
422        if self.bump_if("?!") {
423            return Some((true, false));
424        }
425        if self.bump_if("?<=") {
426            return Some((false, true));
427        }
428        if self.bump_if("?<!") {
429            return Some((false, false));
430        }
431        return None;
432        // self.bump_if("?=") || self.bump_if("?!") || self.bump_if("?<=") || self.bump_if("?<!")
433    }
434
435    fn bump_and_bump_space(&self) -> bool {
436        if !self.bump() {
437            return false;
438        }
439        self.bump_space();
440        !self.is_eof()
441    }
442
443    fn bump_space(&self) {
444        if !self.ignore_whitespace() {
445            return;
446        }
447        while !self.is_eof() {
448            if self.char().is_whitespace() {
449                self.bump();
450            } else if self.char() == '#' {
451                let start = self.pos();
452                let mut comment_text = String::new();
453                self.bump();
454                while !self.is_eof() {
455                    let c = self.char();
456                    self.bump();
457                    if c == '\n' {
458                        break;
459                    }
460                    comment_text.push(c);
461                }
462                let comment = ast::Comment {
463                    span: Span::new(start, self.pos()),
464                    comment: comment_text,
465                };
466                self.parser().comments.borrow_mut().push(comment);
467            } else {
468                break;
469            }
470        }
471    }
472
473    /// Peek at the next character in the input without advancing the parser.
474    ///
475    /// If the input has been exhausted, then this returns `None`.
476    fn peek(&self) -> Option<char> {
477        if self.is_eof() {
478            return None;
479        }
480        self.pattern()[self.offset() + self.char().len_utf8()..]
481            .chars()
482            .next()
483    }
484
485    /// Like peek, but will ignore spaces when the parser is in whitespace
486    /// insensitive mode.
487    fn peek_space(&self) -> Option<char> {
488        if !self.ignore_whitespace() {
489            return self.peek();
490        }
491        if self.is_eof() {
492            return None;
493        }
494        let mut start = self.offset() + self.char().len_utf8();
495        let mut in_comment = false;
496        for (i, c) in self.pattern()[start..].char_indices() {
497            if c.is_whitespace() {
498                continue;
499            } else if !in_comment && c == '#' {
500                in_comment = true;
501            } else if in_comment && c == '\n' {
502                in_comment = false;
503            } else {
504                start += i;
505                break;
506            }
507        }
508        self.pattern()[start..].chars().next()
509    }
510
511    /// Returns true if the next call to `bump` would return false.
512    fn is_eof(&self) -> bool {
513        self.offset() == self.pattern().len()
514    }
515
516    /// Return the current position of the parser, which includes the offset,
517    /// line and column.
518    fn pos(&self) -> Position {
519        self.parser().pos.get()
520    }
521
522    /// Create a span at the current position of the parser. Both the start
523    /// and end of the span are set.
524    fn span(&self) -> Span {
525        Span::splat(self.pos())
526    }
527
528    /// Create a span that covers the current character.
529    fn span_char(&self) -> Span {
530        let mut next = Position {
531            offset: self.offset().checked_add(self.char().len_utf8()).unwrap(),
532            line: self.line(),
533            column: self.column().checked_add(1).unwrap(),
534        };
535        if self.char() == '\n' {
536            next.line += 1;
537            next.column = 1;
538        }
539        Span::new(self.pos(), next)
540    }
541
542    /// Parse and push a single alternation on to the parser's internal stack.
543    /// If the top of the stack already has an alternation, then add to that
544    /// instead of pushing a new one.
545    ///
546    /// The concatenation given corresponds to a single alternation branch.
547    /// The concatenation returned starts the next branch and is empty.
548    ///
549    /// This assumes the parser is currently positioned at `|` and will advance
550    /// the parser to the character following `|`.
551    #[inline(never)]
552    fn push_alternate(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
553        assert_eq!(self.char(), '|');
554        concat.span.end = self.pos();
555        self.push_or_add_alternation(concat);
556        self.bump();
557        Ok(ast::Concat {
558            span: self.span(),
559            asts: vec![],
560        })
561    }
562
563    /// Pushes or adds the given branch of an alternation to the parser's
564    /// internal stack of state.
565    fn push_or_add_alternation(&self, concat: Concat) {
566        use self::GroupState::*;
567
568        let mut stack = self.parser().stack_group.borrow_mut();
569        if let Some(&mut Alternation(ref mut alts)) = stack.last_mut() {
570            alts.asts.push(concat.into_ast());
571            return;
572        }
573        stack.push(Alternation(ast::Alternation {
574            span: Span::new(concat.span.start, self.pos()),
575            asts: vec![concat.into_ast()],
576        }));
577    }
578
579    #[inline(never)]
580    fn push_intersect(&self, mut concat: Concat) -> Result<Concat> {
581        assert_eq!(self.char(), '&');
582        concat.span.end = self.pos();
583        self.push_or_add_intersect(concat);
584        self.bump();
585        Ok(Concat {
586            span: self.span(),
587            asts: vec![],
588        })
589    }
590
591    /// Pushes or adds the given branch of an alternation to the parser's
592    /// internal stack of state.
593    fn push_or_add_intersect(&self, concat: Concat) {
594        use self::GroupState::*;
595
596        let mut stack = self.parser().stack_group.borrow_mut();
597        if let Some(&mut Intersection(ref mut alts)) = stack.last_mut() {
598            alts.asts.push(concat.into_ast());
599            return;
600        }
601        stack.push(Intersection(ast::Intersection {
602            span: Span::new(concat.span.start, self.pos()),
603            asts: vec![concat.into_ast()],
604        }));
605    }
606
607    /// Parse and push a group AST (and its parent concatenation) on to the
608    /// parser's internal stack. Return a fresh concatenation corresponding
609    /// to the group's sub-AST.
610    ///
611    /// If a set of flags was found (with no group), then the concatenation
612    /// is returned with that set of flags added.
613    ///
614    /// This assumes that the parser is currently positioned on the opening
615    /// parenthesis. It advances the parser to the character at the start
616    /// of the sub-expression (or adjoining expression).
617    ///
618    /// If there was a problem parsing the start of the group, then an error
619    /// is returned.
620    #[inline(never)]
621    fn push_group(&self, mut concat: Concat) -> Result<Concat> {
622        assert_eq!(self.char(), '(');
623        match self.parse_group()? {
624            Either::Left(set) => {
625                let ignore = set.flags.flag_state(ast::Flag::IgnoreWhitespace);
626                if let Some(v) = ignore {
627                    self.parser().ignore_whitespace.set(v);
628                }
629
630                concat.asts.push(Ast::flags(set));
631                Ok(concat)
632            }
633            Either::Right(group) => {
634                let old_ignore_whitespace = self.ignore_whitespace();
635                let new_ignore_whitespace = group
636                    .flags()
637                    .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
638                    .unwrap_or(old_ignore_whitespace);
639                self.parser()
640                    .stack_group
641                    .borrow_mut()
642                    .push(GroupState::Group {
643                        concat,
644                        group,
645                        ignore_whitespace: old_ignore_whitespace,
646                    });
647                self.parser().ignore_whitespace.set(new_ignore_whitespace);
648                Ok(Concat {
649                    span: self.span(),
650                    asts: vec![],
651                })
652            }
653        }
654    }
655
656    #[inline(never)]
657    fn push_compl_group(&self, concat: Concat) -> Result<Concat> {
658        assert_eq!(self.char(), '~');
659        self.bump();
660        if self.is_eof() || self.char() != '(' {
661            return Err(self.error(self.span(), ast::ErrorKind::ComplementGroupExpected));
662        }
663        let open_span = self.span_char();
664        self.bump();
665        let group = ast::Group {
666            span: open_span,
667            kind: ast::GroupKind::Complement,
668            ast: Box::new(Ast::empty(self.span())),
669        };
670
671        let old_ignore_whitespace = self.ignore_whitespace();
672        let new_ignore_whitespace = group
673            .flags()
674            .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
675            .unwrap_or(old_ignore_whitespace);
676        self.parser()
677            .stack_group
678            .borrow_mut()
679            .push(GroupState::Group {
680                concat,
681                group,
682                ignore_whitespace: old_ignore_whitespace,
683            });
684        self.parser().ignore_whitespace.set(new_ignore_whitespace);
685        Ok(Concat {
686            span: self.span(),
687            asts: vec![],
688        })
689    }
690
691    /// Pop a group AST from the parser's internal stack and set the group's
692    /// AST to the given concatenation. Return the concatenation containing
693    /// the group.
694    ///
695    /// This assumes that the parser is currently positioned on the closing
696    /// parenthesis and advances the parser to the character following the `)`.
697    ///
698    /// If no such group could be popped, then an unopened group error is
699    /// returned.
700    #[inline(never)]
701    fn pop_group(&self, mut group_concat: Concat) -> Result<Concat> {
702        use self::GroupState::*;
703        assert_eq!(self.char(), ')');
704        let mut stack = self.parser().stack_group.borrow_mut();
705        let topstack = stack.pop();
706
707        let (mut prior_concat, mut group, ignore_whitespace, alt) = match topstack {
708            Some(Group {
709                concat,
710                group,
711                ignore_whitespace,
712            }) => (concat, group, ignore_whitespace, None),
713            Some(Alternation(alt)) => match stack.pop() {
714                Some(Group {
715                    concat,
716                    group,
717                    ignore_whitespace,
718                }) => (
719                    concat,
720                    group,
721                    ignore_whitespace,
722                    Some(Either::Left::<ast::Alternation, ast::Intersection>(alt)),
723                ),
724                None | Some(Alternation(_)) | Some(Intersection(_)) => {
725                    return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
726                }
727            },
728            Some(Intersection(int)) => match stack.pop() {
729                Some(Group {
730                    concat,
731                    group,
732                    ignore_whitespace,
733                }) => (
734                    concat,
735                    group,
736                    ignore_whitespace,
737                    Some(Either::Right::<ast::Alternation, ast::Intersection>(int)),
738                ),
739                None | Some(Alternation(_)) | Some(Intersection(_)) => {
740                    return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
741                }
742            },
743
744            None => {
745                return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
746            }
747        };
748        self.parser().ignore_whitespace.set(ignore_whitespace);
749        group_concat.span.end = self.pos();
750        self.bump();
751        group.span.end = self.pos();
752        match alt {
753            Some(Either::Left(mut alt)) => {
754                alt.span.end = group_concat.span.end;
755                alt.asts.push(group_concat.into_ast());
756                group.ast = Box::new(alt.into_ast());
757            }
758            Some(Either::Right(mut int)) => {
759                int.span.end = group_concat.span.end;
760                int.asts.push(group_concat.into_ast());
761                group.ast = Box::new(int.into_ast());
762            }
763            None => {
764                group.ast = Box::new(group_concat.into_ast());
765            }
766        }
767
768        if group.kind == GroupKind::Complement {
769            let complement = ast::Complement {
770                span: self.span(),
771                ast: group.ast,
772            };
773            prior_concat.asts.push(Ast::complement(complement));
774        }
775        // ignore groups for now
776        else {
777            // dbg!(&group);
778            prior_concat.asts.push(Ast::group(group));
779        }
780        Ok(prior_concat)
781    }
782
783    /// Pop the last state from the parser's internal stack, if it exists, and
784    /// add the given concatenation to it. There either must be no state or a
785    /// single alternation item on the stack. Any other scenario produces an
786    /// error.
787    ///
788    /// This assumes that the parser has advanced to the end.
789    #[inline(never)]
790    fn pop_group_end(&self, mut concat: ast::Concat) -> Result<Ast> {
791        concat.span.end = self.pos();
792        // dbg!(&concat);
793        let mut stack = self.parser().stack_group.borrow_mut();
794        let ast = match stack.pop() {
795            None => Ok(concat.into_ast()),
796            Some(GroupState::Alternation(mut alt)) => {
797                alt.span.end = self.pos();
798                alt.asts.push(concat.into_ast());
799                Ok(Ast::alternation(alt))
800            }
801            Some(GroupState::Intersection(mut int)) => {
802                int.span.end = self.pos();
803                int.asts.push(concat.into_ast());
804
805                // dbg!("end intersection2");
806                Ok(Ast::intersection(int))
807            }
808            Some(GroupState::Group { group, .. }) => {
809                return Err(self.error(group.span, ast::ErrorKind::GroupUnclosed));
810            }
811        };
812        // If we try to pop again, there should be nothing.
813        match stack.pop() {
814            None => ast,
815            Some(GroupState::Alternation(_)) => {
816                // This unreachable is unfortunate. This case can't happen
817                // because the only way we can be here is if there were two
818                // `GroupState::Alternation`s adjacent in the parser's stack,
819                // which we guarantee to never happen because we never push a
820                // `GroupState::Alternation` if one is already at the top of
821                // the stack.
822                unreachable!()
823            }
824            Some(GroupState::Intersection(_)) => {
825                unreachable!()
826            }
827            Some(GroupState::Group { group, .. }) => {
828                Err(self.error(group.span, ast::ErrorKind::GroupUnclosed))
829            }
830        }
831    }
832
833    /// Parse the opening of a character class and push the current class
834    /// parsing context onto the parser's stack. This assumes that the parser
835    /// is positioned at an opening `[`. The given union should correspond to
836    /// the union of set items built up before seeing the `[`.
837    ///
838    /// If there was a problem parsing the opening of the class, then an error
839    /// is returned. Otherwise, a new union of set items for the class is
840    /// returned (which may be populated with either a `]` or a `-`).
841    #[inline(never)]
842    fn push_class_open(
843        &self,
844        parent_union: regex_syntax::ast::ClassSetUnion,
845    ) -> Result<regex_syntax::ast::ClassSetUnion> {
846        assert_eq!(self.char(), '[');
847
848        let (nested_set, nested_union) = self.parse_set_class_open()?;
849        self.parser()
850            .stack_class
851            .borrow_mut()
852            .push(ClassState::Open {
853                union: parent_union,
854                set: nested_set,
855            });
856        Ok(nested_union)
857    }
858
859    /// Parse the end of a character class set and pop the character class
860    /// parser stack. The union given corresponds to the last union built
861    /// before seeing the closing `]`. The union returned corresponds to the
862    /// parent character class set with the nested class added to it.
863    ///
864    /// This assumes that the parser is positioned at a `]` and will advance
865    /// the parser to the byte immediately following the `]`.
866    ///
867    /// If the stack is empty after popping, then this returns the final
868    /// "top-level" character class AST (where a "top-level" character class
869    /// is one that is not nested inside any other character class).
870    ///
871    /// If there is no corresponding opening bracket on the parser's stack,
872    /// then an error is returned.
873    #[inline(never)]
874    fn pop_class(
875        &self,
876        nested_union: regex_syntax::ast::ClassSetUnion,
877    ) -> Result<Either<regex_syntax::ast::ClassSetUnion, regex_syntax::ast::ClassBracketed>> {
878        assert_eq!(self.char(), ']');
879
880        let item = regex_syntax::ast::ClassSet::Item(nested_union.into_item());
881        let prevset = self.pop_class_op(item);
882        let mut stack = self.parser().stack_class.borrow_mut();
883        match stack.pop() {
884            None => {
885                // We can never observe an empty stack:
886                //
887                // 1) We are guaranteed to start with a non-empty stack since
888                //    the character class parser is only initiated when it sees
889                //    a `[`.
890                // 2) If we ever observe an empty stack while popping after
891                //    seeing a `]`, then we signal the character class parser
892                //    to terminate.
893                panic!("unexpected empty character class stack")
894            }
895            Some(ClassState::Op { .. }) => {
896                // This panic is unfortunate, but this case is impossible
897                // since we already popped the Op state if one exists above.
898                // Namely, every push to the class parser stack is guarded by
899                // whether an existing Op is already on the top of the stack.
900                // If it is, the existing Op is modified. That is, the stack
901                // can never have consecutive Op states.
902                panic!("unexpected ClassState::Op")
903            }
904            Some(ClassState::Open { mut union, mut set }) => {
905                self.bump();
906                set.span.end = self.pos();
907                set.kind = prevset;
908                if stack.is_empty() {
909                    Ok(Either::Right(set))
910                } else {
911                    union.push(regex_syntax::ast::ClassSetItem::Bracketed(Box::new(set)));
912                    Ok(Either::Left(union))
913                }
914            }
915        }
916    }
917
918    /// Return an "unclosed class" error whose span points to the most
919    /// recently opened class.
920    ///
921    /// This should only be called while parsing a character class.
922    #[inline(never)]
923    fn unclosed_class_error(&self) -> ResharpError {
924        for state in self.parser().stack_class.borrow().iter().rev() {
925            if let ClassState::Open { ref set, .. } = *state {
926                return self.error(set.span, ast::ErrorKind::ClassUnclosed);
927            }
928        }
929        // We are guaranteed to have a non-empty stack with at least
930        // one open bracket, so we should never get here.
931        panic!("no open character class found")
932    }
933
934    /// Push the current set of class items on to the class parser's stack as
935    /// the left hand side of the given operator.
936    ///
937    /// A fresh set union is returned, which should be used to build the right
938    /// hand side of this operator.
939    #[inline(never)]
940    fn push_class_op(
941        &self,
942        next_kind: regex_syntax::ast::ClassSetBinaryOpKind,
943        next_union: regex_syntax::ast::ClassSetUnion,
944    ) -> regex_syntax::ast::ClassSetUnion {
945        let item = regex_syntax::ast::ClassSet::Item(next_union.into_item());
946        let new_lhs = self.pop_class_op(item);
947        self.parser().stack_class.borrow_mut().push(ClassState::Op {
948            kind: next_kind,
949            lhs: new_lhs,
950        });
951        regex_syntax::ast::ClassSetUnion {
952            span: self.span(),
953            items: vec![],
954        }
955    }
956
957    /// Pop a character class set from the character class parser stack. If the
958    /// top of the stack is just an item (not an operation), then return the
959    /// given set unchanged. If the top of the stack is an operation, then the
960    /// given set will be used as the rhs of the operation on the top of the
961    /// stack. In that case, the binary operation is returned as a set.
962    #[inline(never)]
963    fn pop_class_op(&self, rhs: regex_syntax::ast::ClassSet) -> regex_syntax::ast::ClassSet {
964        let mut stack = self.parser().stack_class.borrow_mut();
965        let (kind, lhs) = match stack.pop() {
966            Some(ClassState::Op { kind, lhs }) => (kind, lhs),
967            Some(state @ ClassState::Open { .. }) => {
968                stack.push(state);
969                return rhs;
970            }
971            None => unreachable!(),
972        };
973        let span = Span::new(lhs.span().start, rhs.span().end);
974        regex_syntax::ast::ClassSet::BinaryOp(regex_syntax::ast::ClassSetBinaryOp {
975            span,
976            kind,
977            lhs: Box::new(lhs),
978            rhs: Box::new(rhs),
979        })
980    }
981
982    fn hir_to_node_id(&self, hir: &hir::Hir, tb: &mut TB<'s>) -> Result<NodeId> {
983        match hir.kind() {
984            hir::HirKind::Empty => Ok(NodeId::EPS),
985            hir::HirKind::Literal(l) => {
986                if l.0.len() == 1 {
987                    let node = tb.mk_u8(l.0[0]);
988                    Ok(node)
989                } else {
990                    let ws: Vec<_> = l.0.iter().map(|l| tb.mk_u8(*l)).collect();
991                    let conc = tb.mk_concats(ws.iter().copied());
992                    Ok(conc)
993                }
994            }
995            hir::HirKind::Class(class) => {
996                // dbg!(class);
997                match class {
998                    hir::Class::Unicode(class_unicode) => {
999                        let ranges = class_unicode.ranges();
1000                        let mut s1 = NodeId::BOT;
1001                        let mut s2 = NodeId::BOT;
1002                        let mut s3 = NodeId::BOT;
1003                        let mut s4 = NodeId::BOT;
1004                        for range in ranges {
1005                            for seq in Utf8Sequences::new(range.start(), range.end()) {
1006                                // cut len > 2 out for now
1007                                if seq.len() > 2 {
1008                                    continue;
1009                                }
1010
1011                                let v: Vec<_> = seq
1012                                    .as_slice()
1013                                    .iter()
1014                                    .map(|s|
1015                                                            // self.mk_byte_set(&byteset_from_range(s.start, s.end))
1016                                                            (s.start, s.end))
1017                                    .collect();
1018                                if v.len() == 1 {
1019                                    let (start, end) = v[0];
1020                                    let node = tb.mk_range_u8(start, end);
1021                                    s1 = tb.mk_union(s1, node);
1022                                } else if v.len() == 2 {
1023                                    let node1 = tb.mk_range_u8(v[0].0, v[0].1);
1024                                    let node2 = tb.mk_range_u8(v[1].0, v[1].1);
1025                                    let conc = tb.mk_concat(node1, node2);
1026                                    s2 = tb.mk_union(s2, conc);
1027                                } else if v.len() == 3 {
1028                                    let node1 = tb.mk_range_u8(v[0].0, v[0].1);
1029                                    let node2 = tb.mk_range_u8(v[1].0, v[1].1);
1030                                    let node3 = tb.mk_range_u8(v[2].0, v[2].1);
1031                                    let conc2 = tb.mk_concat(node2, node3);
1032                                    let conc1 = tb.mk_concat(node1, conc2);
1033                                    s3 = tb.mk_union(s3, conc1);
1034                                } else {
1035                                    let mut conc = NodeId::EPS;
1036                                    for i in (0..4).rev() {
1037                                        let node = tb.mk_range_u8(v[i].0, v[i].1);
1038                                        conc = tb.mk_concat(conc, node);
1039                                    }
1040                                    s4 = tb.mk_union(s4, conc);
1041                                }
1042                            }
1043                        }
1044                        let merged = tb.mk_union(s2, s1);
1045                        let merged = tb.mk_union(s3, merged);
1046                        let merged = tb.mk_union(s4, merged);
1047                        Ok(merged)
1048                    }
1049                    hir::Class::Bytes(class_bytes) => {
1050                        let ranges = class_bytes.ranges();
1051                        let mut result = NodeId::BOT;
1052                        for range in ranges {
1053                            let start = range.start();
1054                            let end = range.end();
1055                            let node = tb.mk_range_u8(start, end);
1056                            result = tb.mk_union(result, node);
1057                        }
1058                        Ok(result)
1059                    }
1060                }
1061            }
1062            hir::HirKind::Look(_) => todo!(),
1063            hir::HirKind::Repetition(_) => todo!(),
1064            hir::HirKind::Capture(_) => todo!(),
1065            hir::HirKind::Concat(body) => {
1066                let mut result = NodeId::EPS;
1067                for child in body {
1068                    let node = self.hir_to_node_id(child, tb)?;
1069                    result = tb.mk_concat(result, node);
1070                }
1071                Ok(result)
1072            }
1073            hir::HirKind::Alternation(_) => todo!(),
1074        }
1075    }
1076
1077    fn translate_ast_to_hir(
1078        &mut self,
1079        orig_ast: &regex_syntax::ast::Ast,
1080        tb: &mut TB<'s>,
1081    ) -> Result<NodeId> {
1082        // let unboxed = class_perl.deref();
1083        // let orig_ast = regex_syntax::ast::Ast::class_perl(unboxed.clone());
1084        match self.translator.translate("", orig_ast) {
1085            Err(_) => return Err(self.error(self.span(), ast::ErrorKind::UnicodeClassInvalid)),
1086            Ok(hir) => {
1087                let mapped = self.hir_to_node_id(&hir, tb);
1088                mapped
1089            }
1090        }
1091    }
1092
1093    fn translator_to_node_id(
1094        &mut self,
1095        orig_ast: &regex_syntax::ast::Ast,
1096        translator: &mut Option<Translator>,
1097        tb: &mut TB<'s>,
1098    ) -> Result<NodeId> {
1099        match translator {
1100            Some(tr) => {
1101                let hir = tr
1102                    .translate("", &orig_ast)
1103                    .map_err(|e| self.unsupported_error(e))?;
1104                self.hir_to_node_id(&hir, tb)
1105            }
1106            None => self.translate_ast_to_hir(&orig_ast, tb),
1107        }
1108    }
1109
1110    fn get_class(
1111        &mut self,
1112        negated: bool,
1113        kind: regex_syntax::ast::ClassPerlKind,
1114        tb: &mut TB<'s>,
1115    ) -> Result<NodeId> {
1116        // find in array
1117        let w = self
1118            .perl_classes
1119            .iter()
1120            .find(|(c_neg, c_kind, _)| *c_kind == kind && *c_neg == negated);
1121        match w {
1122            Some((_, _, value)) => return Ok(*value),
1123            None => {
1124                let tmp = regex_syntax::ast::ClassPerl {
1125                    span: Span::splat(Position::new(0, 0, 0)),
1126                    negated,
1127                    kind: kind.clone(),
1128                };
1129                let word_ast = regex_syntax::ast::Ast::class_perl(tmp);
1130                let translated = self.translate_ast_to_hir(&word_ast, tb)?;
1131                self.perl_classes.push((negated, kind, translated));
1132                Ok(translated)
1133            }
1134        }
1135    }
1136
1137    fn word_char_kind(ast: &Ast, left: bool) -> WordCharKind {
1138        use WordCharKind::*;
1139        match ast {
1140            Ast::Literal(lit) => {
1141                if is_word_byte(lit.c as u8) {
1142                    Word
1143                } else {
1144                    NonWord
1145                }
1146            }
1147            Ast::ClassPerl(c) => match (&c.kind, c.negated) {
1148                (&regex_syntax::ast::ClassPerlKind::Word, false) => Word,
1149                (&regex_syntax::ast::ClassPerlKind::Word, true) => NonWord,
1150                (&regex_syntax::ast::ClassPerlKind::Space, false) => NonWord,
1151                (&regex_syntax::ast::ClassPerlKind::Digit, false) => Word,
1152                _ => Unknown,
1153            },
1154            Ast::Dot(_) | Ast::Top(_) => Unknown,
1155            Ast::Group(g) => Self::word_char_kind(&g.ast, left),
1156            Ast::Concat(c) if !c.asts.is_empty() => {
1157                Self::word_char_kind(&c.asts[if left { c.asts.len() - 1 } else { 0 }], left)
1158            }
1159            Ast::Repetition(r) => {
1160                let inner = Self::word_char_kind(&r.ast, left);
1161                let nullable = matches!(
1162                    &r.op.kind,
1163                    ast::RepetitionKind::ZeroOrMore
1164                        | ast::RepetitionKind::ZeroOrOne
1165                        | ast::RepetitionKind::Range(ast::RepetitionRange::Bounded(0, _))
1166                );
1167                if nullable {
1168                    match inner {
1169                        Word => MaybeWord,
1170                        NonWord => MaybeNonWord,
1171                        _ => Unknown,
1172                    }
1173                } else {
1174                    inner
1175                }
1176            }
1177            Ast::Lookaround(la) => Self::word_char_kind(&la.ast, left),
1178            _ => Unknown,
1179        }
1180    }
1181
1182    fn concat_neighbor_kind(asts: &[Ast], idx: usize, dir: isize) -> WordCharKind {
1183        use WordCharKind::*;
1184        let next = idx as isize + dir;
1185        if next < 0 || next >= asts.len() as isize {
1186            return Edge;
1187        }
1188        let kind = Self::word_char_kind(&asts[next as usize], dir < 0);
1189        match kind {
1190            MaybeWord => match Self::concat_neighbor_kind(asts, next as usize, dir) {
1191                Word => Word,
1192                _ => Unknown,
1193            },
1194            MaybeNonWord => match Self::concat_neighbor_kind(asts, next as usize, dir) {
1195                NonWord => NonWord,
1196                _ => Unknown,
1197            },
1198            other => other,
1199        }
1200    }
1201
1202    fn rewrite_word_boundary_in_concat(
1203        &mut self,
1204        asts: &[Ast],
1205        idx: usize,
1206        translator: &mut Option<Translator>,
1207        tb: &mut TB<'s>,
1208    ) -> Result<(NodeId, usize)> {
1209        use WordCharKind::*;
1210        let left = Self::concat_neighbor_kind(asts, idx, -1);
1211        let right = Self::concat_neighbor_kind(asts, idx, 1);
1212        let word_id = self.get_class(false, regex_syntax::ast::ClassPerlKind::Word, tb)?;
1213        let not_word_id = self.get_class(true, regex_syntax::ast::ClassPerlKind::Word, tb)?;
1214
1215        match (left, right) {
1216            (NonWord, Word) | (Word, NonWord) => Ok((NodeId::EPS, idx + 1)),
1217            (Word, _) => {
1218                let set = tb.mk_union(NodeId::END, not_word_id);
1219                let tail = tb.mk_concat(set, NodeId::TS);
1220                self.merge_boundary_with_following_lookaheads(asts, idx, tail, translator, tb)
1221            }
1222            (NonWord, _) => {
1223                let set = tb.mk_union(NodeId::END, word_id);
1224                let tail = tb.mk_concat(set, NodeId::TS);
1225                self.merge_boundary_with_following_lookaheads(asts, idx, tail, translator, tb)
1226            }
1227            (_, Word) => {
1228                let body = tb.mk_union(NodeId::BEGIN, not_word_id);
1229                Ok((tb.mk_lookbehind(body, NodeId::MISSING), idx + 1))
1230            }
1231            (_, NonWord) => {
1232                let body = tb.mk_union(NodeId::BEGIN, word_id);
1233                Ok((tb.mk_lookbehind(body, NodeId::MISSING), idx + 1))
1234            }
1235            _ => Ok((self.make_full_word_boundary(tb)?, idx + 1)),
1236        }
1237    }
1238
1239    fn merge_boundary_with_following_lookaheads(
1240        &mut self,
1241        asts: &[Ast],
1242        wb_idx: usize,
1243        boundary_tail: NodeId,
1244        translator: &mut Option<Translator>,
1245        tb: &mut TB<'s>,
1246    ) -> Result<(NodeId, usize)> {
1247        let mut next = wb_idx + 1;
1248        let mut la_bodies = vec![boundary_tail];
1249        while next < asts.len() {
1250            match &asts[next] {
1251                Ast::Lookaround(la) if la.kind == ast::LookaroundKind::PositiveLookahead => {
1252                    let body = self.ast_to_node_id(&la.ast, translator, tb)?;
1253                    la_bodies.push(tb.mk_concat(body, NodeId::TS));
1254                    next += 1;
1255                }
1256                _ => break,
1257            }
1258        }
1259        let merged = tb.mk_inters(la_bodies.into_iter());
1260        Ok((tb.mk_lookahead(merged, NodeId::MISSING, 0), next))
1261    }
1262
1263    fn make_full_word_boundary(&mut self, tb: &mut TB<'s>) -> Result<NodeId> {
1264        let w = self.get_class(false, regex_syntax::ast::ClassPerlKind::Word, tb)?;
1265        let nw = self.get_class(true, regex_syntax::ast::ClassPerlKind::Word, tb)?;
1266        let lb_w = tb.mk_lookbehind(w, NodeId::MISSING);
1267        let la_nw = tb.mk_lookahead(nw, NodeId::MISSING, 0);
1268        let c1 = tb.mk_concat(lb_w, la_nw);
1269        let lb_nw = tb.mk_lookbehind(nw, NodeId::MISSING);
1270        let la_w = tb.mk_lookahead(w, NodeId::MISSING, 0);
1271        let c2 = tb.mk_concat(lb_nw, la_w);
1272        Ok(tb.mk_union(c1, c2))
1273    }
1274
1275    fn ast_to_node_id(
1276        &mut self,
1277        ast: &Ast,
1278        translator: &mut Option<Translator>,
1279        tb: &mut TB<'s>,
1280    ) -> Result<NodeId> {
1281        match ast {
1282            Ast::Empty(_) => Ok(NodeId::EPS),
1283            Ast::Flags(f) => {
1284                let mut translator_builder = Self::default_translator_builder();
1285                if let Some(state) = f.flags.flag_state(ast::Flag::CaseInsensitive) {
1286                    translator_builder.case_insensitive(state);
1287                }
1288                if let Some(state) = f.flags.flag_state(ast::Flag::Unicode) {
1289                    translator_builder.unicode(state);
1290                }
1291                let concat_translator = Some(translator_builder.build());
1292                *translator = concat_translator;
1293                Ok(NodeId::EPS)
1294            }
1295            Ast::Literal(l) => {
1296                let ast_lit = regex_syntax::ast::Ast::literal(*l.to_owned());
1297                self.translator_to_node_id(&ast_lit, translator, tb)
1298            }
1299            Ast::Top(_) => Ok(NodeId::TOP),
1300            Ast::Dot(_) => {
1301                let hirv = hir::Hir::dot(hir::Dot::AnyByteExceptLF);
1302                self.hir_to_node_id(&hirv, tb)
1303            }
1304            Ast::Assertion(a) => match &a.kind {
1305                ast::AssertionKind::StartText => Ok(NodeId::BEGIN),
1306                ast::AssertionKind::EndText => Ok(NodeId::END),
1307                ast::AssertionKind::WordBoundary => {
1308                    let word_id =
1309                        self.get_class(false, regex_syntax::ast::ClassPerlKind::Word, tb)?;
1310                    let not_word_id =
1311                        self.get_class(true, regex_syntax::ast::ClassPerlKind::Word, tb)?;
1312                    // case1 : (?<=word)(?=not_word)
1313                    let case1_1 = tb.mk_lookbehind(word_id, NodeId::MISSING);
1314                    let case1_2 = tb.mk_lookahead(not_word_id, NodeId::MISSING, 0);
1315                    let case1 = tb.mk_concat(case1_1, case1_2);
1316                    // case2 : (?<=not_word)(?=word)
1317                    let case2_1 = tb.mk_lookbehind(not_word_id, NodeId::MISSING);
1318                    let case2_2 = tb.mk_lookahead(word_id, NodeId::MISSING, 0);
1319                    let case2 = tb.mk_concat(case2_1, case2_2);
1320                    Ok(tb.mk_union(case1, case2))
1321                }
1322                ast::AssertionKind::NotWordBoundary => {
1323                    return Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))
1324                }
1325                ast::AssertionKind::StartLine => {
1326                    let left = NodeId::BEGIN;
1327                    let right = tb.mk_u8('\n' as u8);
1328                    let union = tb.mk_union(left, right);
1329                    Ok(tb.mk_lookbehind(union, NodeId::MISSING))
1330                }
1331                ast::AssertionKind::EndLine => {
1332                    let left = NodeId::END;
1333                    let right = tb.mk_u8('\n' as u8);
1334                    let union = tb.mk_union(left, right);
1335                    Ok(tb.mk_lookahead(union, NodeId::MISSING, 0))
1336                }
1337                ast::AssertionKind::WordBoundaryStart => todo!(),
1338                ast::AssertionKind::WordBoundaryEnd => todo!(),
1339                ast::AssertionKind::WordBoundaryStartAngle => todo!(),
1340                ast::AssertionKind::WordBoundaryEndAngle => Ok(tb.mk_string(">")),
1341                ast::AssertionKind::WordBoundaryStartHalf => todo!(),
1342                ast::AssertionKind::WordBoundaryEndHalf => todo!(),
1343            },
1344            Ast::ClassUnicode(c) => {
1345                let tmp = regex_syntax::ast::ClassUnicode {
1346                    span: c.span,
1347                    negated: c.negated,
1348                    kind: c.kind.clone(),
1349                };
1350                if !c.negated {
1351                    match &c.kind {
1352                        regex_syntax::ast::ClassUnicodeKind::Named(s) => match s.as_str() {
1353                            // \p{ascii} for ascii, \p{ascii}&\p{Letter} => [A-Za-z]
1354                            "ascii" => return Ok(tb.mk_range_u8(0, 127)),
1355                            // restricts matches to valid utf8, \p{utf8}*&~(a) => non a, but valid utf8
1356                            "utf8" => {
1357                                let ascii = tb.mk_range_u8(0, 127);
1358                                let beta = tb.mk_range_u8(128, 0xBF);
1359                                let c0 = tb.mk_range_u8(0xC0, 0xDF);
1360                                let c0s = tb.mk_concats([c0, beta].into_iter());
1361                                let e0 = tb.mk_range_u8(0xE0, 0xEF);
1362                                let e0s = tb.mk_concats([e0, beta, beta].into_iter());
1363                                let f0 = tb.mk_range_u8(0xF0, 0xF7);
1364                                let f0s = tb.mk_concats([f0, beta, beta, beta].into_iter());
1365                                let merged = tb.mk_unions([ascii, c0s, e0s, f0s].into_iter());
1366                                return Ok(tb.mk_star(merged));
1367                            }
1368                            "hex" => {
1369                                let nums = tb.mk_range_u8(b'0', b'9');
1370                                let lets = tb.mk_range_u8(b'a', b'f');
1371                                let lets2 = tb.mk_range_u8(b'A', b'F');
1372                                let merged = tb.mk_unions([nums, lets, lets2].into_iter());
1373                                return Ok(merged);
1374                            }
1375                            _ => {}
1376                        },
1377                        _ => {}
1378                    };
1379                }
1380
1381                let orig_ast = regex_syntax::ast::Ast::class_unicode(tmp);
1382                self.translator_to_node_id(&orig_ast, translator, tb)
1383            }
1384            Ast::ClassPerl(c) => {
1385                let tmp = regex_syntax::ast::ClassPerl {
1386                    span: c.span,
1387                    negated: c.negated,
1388                    kind: c.kind.clone(),
1389                };
1390                let orig_ast = regex_syntax::ast::Ast::class_perl(tmp);
1391                self.translator_to_node_id(&orig_ast, translator, tb)
1392            }
1393            Ast::ClassBracketed(c) => match &c.kind {
1394                regex_syntax::ast::ClassSet::Item(_) => {
1395                    let tmp = regex_syntax::ast::ClassBracketed {
1396                        span: c.span,
1397                        negated: c.negated,
1398                        kind: c.kind.clone(),
1399                    };
1400                    let orig_ast = regex_syntax::ast::Ast::class_bracketed(tmp);
1401                    self.translator_to_node_id(&orig_ast, translator, tb)
1402                }
1403                regex_syntax::ast::ClassSet::BinaryOp(_) => todo!(),
1404            },
1405            Ast::Repetition(r) => {
1406                let body = self.ast_to_node_id(&r.ast, translator, tb);
1407                match body {
1408                    Ok(body) => match &r.op.kind {
1409                        ast::RepetitionKind::ZeroOrOne => Ok(tb.mk_opt(body)),
1410                        ast::RepetitionKind::ZeroOrMore => Ok(tb.mk_star(body)),
1411                        ast::RepetitionKind::OneOrMore => Ok(tb.mk_plus(body)),
1412                        ast::RepetitionKind::Range(r) => match r {
1413                            ast::RepetitionRange::Exactly(n) => Ok(tb.mk_repeat(body, *n, *n)),
1414                            ast::RepetitionRange::AtLeast(n) => {
1415                                let rep = tb.mk_repeat(body, *n, *n);
1416                                let st = tb.mk_star(body);
1417                                Ok(tb.mk_concat(rep, st))
1418                            }
1419
1420                            ast::RepetitionRange::Bounded(n, m) => Ok(tb.mk_repeat(body, *n, *m)),
1421                        },
1422                    },
1423                    Err(_) => body,
1424                }
1425            }
1426            Ast::Lookaround(g) => {
1427                let body = self.ast_to_node_id(&g.ast, translator, tb)?;
1428                match g.kind {
1429                    ast::LookaroundKind::PositiveLookahead => {
1430                        Ok(tb.mk_lookahead(body, NodeId::MISSING, 0))
1431                    }
1432                    ast::LookaroundKind::PositiveLookbehind => {
1433                        Ok(tb.mk_lookbehind(body, NodeId::MISSING))
1434                    }
1435                    ast::LookaroundKind::NegativeLookahead => Ok(tb.mk_neg_lookahead(body, 0)),
1436                    ast::LookaroundKind::NegativeLookbehind => Ok(tb.mk_neg_lookbehind(body)),
1437                }
1438            }
1439            Ast::Group(g) => {
1440                let child = self.ast_to_node_id(&g.ast, translator, tb);
1441                child
1442            }
1443            Ast::Alternation(a) => {
1444                let mut children = vec![];
1445                for ast in &a.asts {
1446                    match self.ast_to_node_id(ast, translator, tb) {
1447                        Ok(node_id) => children.push(node_id),
1448                        Err(err) => return Err(err),
1449                    }
1450                }
1451                Ok(tb.mk_unions(children.iter().copied()))
1452            }
1453            Ast::Concat(c) => {
1454                let mut concat_translator: Option<Translator> = None;
1455                let mut children = vec![];
1456                let mut i = 0;
1457                while i < c.asts.len() {
1458                    let ast = &c.asts[i];
1459                    match ast {
1460                        Ast::Flags(f) => {
1461                            let mut translator_builder = Self::default_translator_builder();
1462                            translator_builder.utf8(false);
1463                            if let Some(state) = f.flags.flag_state(ast::Flag::CaseInsensitive) {
1464                                translator_builder.case_insensitive(state);
1465                            }
1466                            if let Some(state) = f.flags.flag_state(ast::Flag::Unicode) {
1467                                translator_builder.unicode(state);
1468                            }
1469                            concat_translator = Some(translator_builder.build());
1470                            i += 1;
1471                            continue;
1472                        }
1473                        Ast::Assertion(a) if a.kind == ast::AssertionKind::WordBoundary => {
1474                            let node =
1475                                self.rewrite_word_boundary_in_concat(&c.asts, i, translator, tb)?;
1476                            children.push(node.0);
1477                            i = node.1; // skip consumed lookaheads
1478                            continue;
1479                        }
1480                        _ => {}
1481                    }
1482                    match concat_translator {
1483                        Some(_) => match self.ast_to_node_id(ast, &mut concat_translator, tb) {
1484                            Ok(node_id) => children.push(node_id),
1485                            Err(err) => return Err(err),
1486                        },
1487                        None => match self.ast_to_node_id(ast, translator, tb) {
1488                            Ok(node_id) => children.push(node_id),
1489                            Err(err) => return Err(err),
1490                        },
1491                    }
1492                    i += 1;
1493                }
1494                Ok(tb.mk_concats(children.iter().cloned()))
1495            }
1496            Ast::Intersection(intersection) => {
1497                let mut children = vec![];
1498                for ast in &intersection.asts {
1499                    match self.ast_to_node_id(ast, translator, tb) {
1500                        Ok(node_id) => children.push(node_id),
1501                        Err(err) => return Err(err),
1502                    }
1503                }
1504                Ok(tb.mk_inters(children.into_iter()))
1505            }
1506            Ast::Complement(complement) => {
1507                let body = self.ast_to_node_id(&complement.ast, translator, tb);
1508                body.map(|x| tb.mk_compl(x))
1509            }
1510        }
1511    }
1512
1513    /// Parse the regular expression and return an abstract syntax tree with
1514    /// all of the comments found in the pattern.
1515    fn parse(&mut self, tb: &mut TB<'s>) -> Result<NodeId> {
1516        let mut concat = Concat {
1517            span: self.span(),
1518            asts: vec![],
1519        };
1520        loop {
1521            self.bump_space();
1522            if self.is_eof() {
1523                break;
1524            }
1525            match self.char() {
1526                '(' => concat = self.push_group(concat)?,
1527                ')' => concat = self.pop_group(concat)?,
1528                '|' => concat = self.push_alternate(concat)?,
1529                '&' => concat = self.push_intersect(concat)?,
1530                '~' => concat = self.push_compl_group(concat)?,
1531                '[' => {
1532                    let class = self.parse_set_class()?;
1533                    concat.asts.push(Ast::class_bracketed(class));
1534                }
1535                '?' => {
1536                    concat =
1537                        self.parse_uncounted_repetition(concat, ast::RepetitionKind::ZeroOrOne)?;
1538                }
1539                '*' => {
1540                    concat =
1541                        self.parse_uncounted_repetition(concat, ast::RepetitionKind::ZeroOrMore)?;
1542                }
1543                '+' => {
1544                    concat =
1545                        self.parse_uncounted_repetition(concat, ast::RepetitionKind::OneOrMore)?;
1546                }
1547                '{' => {
1548                    concat = self.parse_counted_repetition(concat)?;
1549                }
1550                _ => concat.asts.push(self.parse_primitive()?.into_ast()),
1551            }
1552        }
1553        let ast = self.pop_group_end(concat)?;
1554        // dbg!(&ast);
1555        let node_id = self.ast_to_node_id(&ast, &mut None, tb);
1556        node_id
1557    }
1558
1559    #[inline(never)]
1560    fn parse_uncounted_repetition(
1561        &self,
1562        mut concat: ast::Concat,
1563        kind: ast::RepetitionKind,
1564    ) -> Result<ast::Concat> {
1565        // assert!(self.char() == '?' || self.char() == '*' || self.char() == '+');
1566        let op_start = self.pos();
1567        let ast = match concat.asts.pop() {
1568            Some(ast) => ast,
1569            None => return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing)),
1570        };
1571        match ast {
1572            Ast::Empty(_) | Ast::Flags(_) => {
1573                return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing))
1574            }
1575            _ => {}
1576        }
1577        if self.bump() && self.char() == '?' {
1578            return Err(self.error(
1579                Span::new(op_start, self.pos()),
1580                ast::ErrorKind::UnsupportedLazyQuantifier,
1581            ));
1582        }
1583        concat.asts.push(Ast::repetition(ast::Repetition {
1584            span: ast.span().with_end(self.pos()),
1585            op: ast::RepetitionOp {
1586                span: Span::new(op_start, self.pos()),
1587                kind,
1588            },
1589            greedy: true,
1590            ast: Box::new(ast),
1591        }));
1592        Ok(concat)
1593    }
1594
1595    #[inline(never)]
1596    fn parse_counted_repetition(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
1597        assert!(self.char() == '{');
1598        let start = self.pos();
1599        let ast = match concat.asts.pop() {
1600            Some(ast) => ast,
1601            None => return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing)),
1602        };
1603        match ast {
1604            Ast::Empty(_) | Ast::Flags(_) => {
1605                return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing))
1606            }
1607            _ => {}
1608        }
1609        if !self.bump_and_bump_space() {
1610            return Err(self.error(
1611                Span::new(start, self.pos()),
1612                ast::ErrorKind::RepetitionCountUnclosed,
1613            ));
1614        }
1615        let count_start = specialize_err(
1616            self.parse_decimal(),
1617            ast::ErrorKind::DecimalEmpty,
1618            ast::ErrorKind::RepetitionCountDecimalEmpty,
1619        );
1620        if self.is_eof() {
1621            return Err(self.error(
1622                Span::new(start, self.pos()),
1623                ast::ErrorKind::RepetitionCountUnclosed,
1624            ));
1625        }
1626        let range = if self.char() == ',' {
1627            if !self.bump_and_bump_space() {
1628                return Err(self.error(
1629                    Span::new(start, self.pos()),
1630                    ast::ErrorKind::RepetitionCountUnclosed,
1631                ));
1632            }
1633            if self.char() != '}' {
1634                let count_start = match count_start {
1635                    Ok(c) => c,
1636                    Err(err) if err.kind == ast::ErrorKind::RepetitionCountDecimalEmpty => {
1637                        if self.parser().empty_min_range {
1638                            0
1639                        } else {
1640                            return Err(err);
1641                        }
1642                    }
1643                    err => err?,
1644                };
1645                let count_end = specialize_err(
1646                    self.parse_decimal(),
1647                    ast::ErrorKind::DecimalEmpty,
1648                    ast::ErrorKind::RepetitionCountDecimalEmpty,
1649                )?;
1650                ast::RepetitionRange::Bounded(count_start, count_end)
1651            } else {
1652                ast::RepetitionRange::AtLeast(count_start?)
1653            }
1654        } else {
1655            ast::RepetitionRange::Exactly(count_start?)
1656        };
1657
1658        if self.is_eof() || self.char() != '}' {
1659            return Err(self.error(
1660                Span::new(start, self.pos()),
1661                ast::ErrorKind::RepetitionCountUnclosed,
1662            ));
1663        }
1664
1665        if self.bump_and_bump_space() && self.char() == '?' {
1666            return Err(self.error(
1667                Span::new(start, self.pos()),
1668                ast::ErrorKind::UnsupportedLazyQuantifier,
1669            ));
1670        }
1671
1672        let op_span = Span::new(start, self.pos());
1673        if !range.is_valid() {
1674            return Err(self.error(op_span, ast::ErrorKind::RepetitionCountInvalid));
1675        }
1676        concat.asts.push(Ast::repetition(ast::Repetition {
1677            span: ast.span().with_end(self.pos()),
1678            op: ast::RepetitionOp {
1679                span: op_span,
1680                kind: ast::RepetitionKind::Range(range),
1681            },
1682            greedy: true,
1683            ast: Box::new(ast),
1684        }));
1685        Ok(concat)
1686    }
1687
1688    #[inline(never)]
1689    fn parse_group(&self) -> Result<Either<ast::SetFlags, ast::Group>> {
1690        assert_eq!(self.char(), '(');
1691        let open_span = self.span_char();
1692        self.bump();
1693        self.bump_space();
1694        if let Some((ahead, pos)) = self.is_lookaround_prefix() {
1695            let kind = match (pos, ahead) {
1696                (true, true) => LookaroundKind::PositiveLookahead,
1697                (true, false) => LookaroundKind::PositiveLookbehind,
1698                (false, true) => LookaroundKind::NegativeLookahead,
1699                (false, false) => LookaroundKind::NegativeLookbehind,
1700            };
1701            return Ok(Either::Right(ast::Group {
1702                span: open_span,
1703                kind: ast::GroupKind::Lookaround(kind),
1704                ast: Box::new(Ast::empty(self.span())),
1705            }));
1706        }
1707        let inner_span = self.span();
1708        let mut starts_with_p = true;
1709        if self.bump_if("?P<") || {
1710            starts_with_p = false;
1711            self.bump_if("?<")
1712        } {
1713            let capture_index = self.next_capture_index(open_span)?;
1714            let name = self.parse_capture_name(capture_index)?;
1715            Ok(Either::Right(ast::Group {
1716                span: open_span,
1717                kind: ast::GroupKind::CaptureName {
1718                    starts_with_p,
1719                    name,
1720                },
1721                ast: Box::new(Ast::empty(self.span())),
1722            }))
1723        } else if self.bump_if("?") {
1724            if self.is_eof() {
1725                return Err(self.error(open_span, ast::ErrorKind::GroupUnclosed));
1726            }
1727            let flags = self.parse_flags()?;
1728            let char_end = self.char();
1729            self.bump();
1730            if char_end == ')' {
1731                // We don't allow empty flags, e.g., `(?)`. We instead
1732                // interpret it as a repetition operator missing its argument.
1733                if flags.items.is_empty() {
1734                    return Err(self.error(inner_span, ast::ErrorKind::RepetitionMissing));
1735                }
1736                Ok(Either::Left(ast::SetFlags {
1737                    span: Span {
1738                        end: self.pos(),
1739                        ..open_span
1740                    },
1741                    flags,
1742                }))
1743            } else {
1744                assert_eq!(char_end, ':');
1745                Ok(Either::Right(ast::Group {
1746                    span: open_span,
1747                    kind: ast::GroupKind::NonCapturing(flags),
1748                    ast: Box::new(Ast::empty(self.span())),
1749                }))
1750            }
1751        } else {
1752            let capture_index = self.next_capture_index(open_span)?;
1753            Ok(Either::Right(ast::Group {
1754                span: open_span,
1755                kind: ast::GroupKind::CaptureIndex(capture_index),
1756                ast: Box::new(Ast::empty(self.span())),
1757            }))
1758        }
1759    }
1760
1761    #[inline(never)]
1762    fn parse_capture_name(&self, capture_index: u32) -> Result<ast::CaptureName> {
1763        if self.is_eof() {
1764            return Err(self.error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
1765        }
1766        let start = self.pos();
1767        loop {
1768            if self.char() == '>' {
1769                break;
1770            }
1771            if !is_capture_char(self.char(), self.pos() == start) {
1772                return Err(self.error(self.span_char(), ast::ErrorKind::GroupNameInvalid));
1773            }
1774            if !self.bump() {
1775                break;
1776            }
1777        }
1778        let end = self.pos();
1779        if self.is_eof() {
1780            return Err(self.error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
1781        }
1782        assert_eq!(self.char(), '>');
1783        self.bump();
1784        let name = &self.pattern()[start.offset..end.offset];
1785        if name.is_empty() {
1786            return Err(self.error(Span::new(start, start), ast::ErrorKind::GroupNameEmpty));
1787        }
1788        let capname = ast::CaptureName {
1789            span: Span::new(start, end),
1790            name: name.to_string(),
1791            index: capture_index,
1792        };
1793        self.add_capture_name(&capname)?;
1794        Ok(capname)
1795    }
1796
1797    #[inline(never)]
1798    fn parse_flags(&self) -> Result<ast::Flags> {
1799        let mut flags = ast::Flags {
1800            span: self.span(),
1801            items: vec![],
1802        };
1803        let mut last_was_negation = None;
1804        while self.char() != ':' && self.char() != ')' {
1805            if self.char() == '-' {
1806                last_was_negation = Some(self.span_char());
1807                let item = ast::FlagsItem {
1808                    span: self.span_char(),
1809                    kind: ast::FlagsItemKind::Negation,
1810                };
1811                if let Some(i) = flags.add_item(item) {
1812                    return Err(self.error(
1813                        self.span_char(),
1814                        ast::ErrorKind::FlagRepeatedNegation {
1815                            original: flags.items[i].span,
1816                        },
1817                    ));
1818                }
1819            } else {
1820                last_was_negation = None;
1821                let item = ast::FlagsItem {
1822                    span: self.span_char(),
1823                    kind: ast::FlagsItemKind::Flag(self.parse_flag()?),
1824                };
1825                if let Some(i) = flags.add_item(item) {
1826                    return Err(self.error(
1827                        self.span_char(),
1828                        ast::ErrorKind::FlagDuplicate {
1829                            original: flags.items[i].span,
1830                        },
1831                    ));
1832                }
1833            }
1834            if !self.bump() {
1835                return Err(self.error(self.span(), ast::ErrorKind::FlagUnexpectedEof));
1836            }
1837        }
1838        if let Some(span) = last_was_negation {
1839            return Err(self.error(span, ast::ErrorKind::FlagDanglingNegation));
1840        }
1841        flags.span.end = self.pos();
1842        Ok(flags)
1843    }
1844
1845    #[inline(never)]
1846    fn parse_flag(&self) -> Result<ast::Flag> {
1847        match self.char() {
1848            'i' => Ok(ast::Flag::CaseInsensitive),
1849            'm' => Ok(ast::Flag::MultiLine),
1850            's' => Ok(ast::Flag::DotMatchesNewLine),
1851            'U' => Ok(ast::Flag::SwapGreed),
1852            'u' => Ok(ast::Flag::Unicode),
1853            'R' => Ok(ast::Flag::CRLF),
1854            'x' => Ok(ast::Flag::IgnoreWhitespace),
1855            _ => Err(self.error(self.span_char(), ast::ErrorKind::FlagUnrecognized)),
1856        }
1857    }
1858
1859    fn parse_primitive(&self) -> Result<Primitive> {
1860        match self.char() {
1861            '\\' => self.parse_escape(),
1862            '_' => {
1863                let ast = Primitive::Top(self.span_char());
1864                self.bump();
1865                Ok(ast)
1866            }
1867            '.' => {
1868                let ast = Primitive::Dot(self.span_char());
1869                self.bump();
1870                Ok(ast)
1871            }
1872            '^' => {
1873                let ast = Primitive::Assertion(ast::Assertion {
1874                    span: self.span_char(),
1875                    kind: ast::AssertionKind::StartLine,
1876                });
1877                self.bump();
1878                Ok(ast)
1879            }
1880            '$' => {
1881                let ast = Primitive::Assertion(ast::Assertion {
1882                    span: self.span_char(),
1883                    kind: ast::AssertionKind::EndLine,
1884                });
1885                self.bump();
1886                Ok(ast)
1887            }
1888            c => {
1889                let ast = Primitive::Literal(Literal {
1890                    span: self.span_char(),
1891                    kind: LiteralKind::Verbatim,
1892                    c,
1893                });
1894                self.bump();
1895                Ok(ast)
1896            }
1897        }
1898    }
1899
1900    #[inline(never)]
1901    fn parse_escape(&self) -> Result<Primitive> {
1902        assert_eq!(self.char(), '\\');
1903        let start = self.pos();
1904        if !self.bump() {
1905            return Err(self.error(
1906                Span::new(start, self.pos()),
1907                ast::ErrorKind::EscapeUnexpectedEof,
1908            ));
1909        }
1910        let c = self.char();
1911        // Put some of the more complicated routines into helpers.
1912        match c {
1913            '0'..='9' => {
1914                if !self.parser().octal {
1915                    return Err(self.error(
1916                        Span::new(start, self.span_char().end),
1917                        ast::ErrorKind::UnsupportedBackreference,
1918                    ));
1919                }
1920                let mut lit = self.parse_octal();
1921                lit.span.start = start;
1922                return Ok(Primitive::Literal(lit));
1923            }
1924            // '8'..='9' if !self.parser().octal => {
1925            //     return Err(self.error(
1926            //         Span::new(start, self.span_char().end),
1927            //         ast::ErrorKind::UnsupportedBackreference,
1928            //     ));
1929            // }
1930            'x' | 'u' | 'U' => {
1931                let mut lit = self.parse_hex()?;
1932                lit.span.start = start;
1933                return Ok(Primitive::Literal(lit));
1934            }
1935            'p' | 'P' => {
1936                let mut cls = self.parse_unicode_class()?;
1937                cls.span.start = start;
1938                return Ok(Primitive::Unicode(cls));
1939            }
1940            'd' | 's' | 'w' | 'D' | 'S' | 'W' => {
1941                let mut cls = self.parse_perl_class();
1942                cls.span.start = start;
1943                return Ok(Primitive::Perl(cls));
1944            }
1945            _ => {}
1946        }
1947
1948        // Handle all of the one letter sequences inline.
1949        self.bump();
1950        let span = Span::new(start, self.pos());
1951        if is_meta_character(c) {
1952            return Ok(Primitive::Literal(Literal {
1953                span,
1954                kind: LiteralKind::Meta,
1955                c,
1956            }));
1957        }
1958        if is_escapeable_character(c) {
1959            return Ok(Primitive::Literal(Literal {
1960                span,
1961                kind: LiteralKind::Superfluous,
1962                c,
1963            }));
1964        }
1965        let special = |kind, c| {
1966            Ok(Primitive::Literal(Literal {
1967                span,
1968                kind: LiteralKind::Special(kind),
1969                c,
1970            }))
1971        };
1972        match c {
1973            'a' => special(SpecialLiteralKind::Bell, '\x07'),
1974            'f' => special(SpecialLiteralKind::FormFeed, '\x0C'),
1975            't' => special(SpecialLiteralKind::Tab, '\t'),
1976            'n' => special(SpecialLiteralKind::LineFeed, '\n'),
1977            'r' => special(SpecialLiteralKind::CarriageReturn, '\r'),
1978            'v' => special(SpecialLiteralKind::VerticalTab, '\x0B'),
1979            'A' => Ok(Primitive::Assertion(ast::Assertion {
1980                span,
1981                kind: ast::AssertionKind::StartText,
1982            })),
1983            'z' => Ok(Primitive::Assertion(ast::Assertion {
1984                span,
1985                kind: ast::AssertionKind::EndText,
1986            })),
1987            'b' => {
1988                let mut wb = ast::Assertion {
1989                    span,
1990                    kind: ast::AssertionKind::WordBoundary,
1991                };
1992                // After a \b, we "try" to parse things like \b{start} for
1993                // special word boundary assertions.
1994                if !self.is_eof() && self.char() == '{' {
1995                    if let Some(kind) = self.maybe_parse_special_word_boundary(start)? {
1996                        wb.kind = kind;
1997                        wb.span.end = self.pos();
1998                    }
1999                }
2000                Ok(Primitive::Assertion(wb))
2001            }
2002            'B' => Ok(Primitive::Assertion(ast::Assertion {
2003                span,
2004                kind: ast::AssertionKind::NotWordBoundary,
2005            })),
2006            '<' => Ok(Primitive::Assertion(ast::Assertion {
2007                span,
2008                kind: ast::AssertionKind::WordBoundaryStartAngle,
2009            })),
2010            '>' => Ok(Primitive::Assertion(ast::Assertion {
2011                span,
2012                kind: ast::AssertionKind::WordBoundaryEndAngle,
2013            })),
2014            _ => Err(self.error(span, ast::ErrorKind::EscapeUnrecognized)),
2015        }
2016    }
2017
2018    fn maybe_parse_special_word_boundary(
2019        &self,
2020        wb_start: Position,
2021    ) -> Result<Option<ast::AssertionKind>> {
2022        assert_eq!(self.char(), '{');
2023
2024        let is_valid_char = |c| match c {
2025            'A'..='Z' | 'a'..='z' | '-' => true,
2026            _ => false,
2027        };
2028        let start = self.pos();
2029        if !self.bump_and_bump_space() {
2030            return Err(self.error(
2031                Span::new(wb_start, self.pos()),
2032                ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof,
2033            ));
2034        }
2035        let start_contents = self.pos();
2036        // This is one of the critical bits: if the first non-whitespace
2037        // character isn't in [-A-Za-z] (i.e., this can't be a special word
2038        // boundary), then we bail and let the counted repetition parser deal
2039        // with this.
2040        if !is_valid_char(self.char()) {
2041            self.parser().pos.set(start);
2042            return Ok(None);
2043        }
2044
2045        // Now collect up our chars until we see a '}'.
2046        let mut scratch = self.parser().scratch.borrow_mut();
2047        scratch.clear();
2048        while !self.is_eof() && is_valid_char(self.char()) {
2049            scratch.push(self.char());
2050            self.bump_and_bump_space();
2051        }
2052        if self.is_eof() || self.char() != '}' {
2053            return Err(self.error(
2054                Span::new(start, self.pos()),
2055                ast::ErrorKind::SpecialWordBoundaryUnclosed,
2056            ));
2057        }
2058        let end = self.pos();
2059        self.bump();
2060        let kind = match scratch.as_str() {
2061            "start" => ast::AssertionKind::WordBoundaryStart,
2062            "end" => ast::AssertionKind::WordBoundaryEnd,
2063            "start-half" => ast::AssertionKind::WordBoundaryStartHalf,
2064            "end-half" => ast::AssertionKind::WordBoundaryEndHalf,
2065            _ => {
2066                return Err(self.error(
2067                    Span::new(start_contents, end),
2068                    ast::ErrorKind::SpecialWordBoundaryUnrecognized,
2069                ))
2070            }
2071        };
2072        Ok(Some(kind))
2073    }
2074
2075    #[inline(never)]
2076    fn parse_octal(&self) -> Literal {
2077        assert!(self.parser().octal);
2078        assert!('0' <= self.char() && self.char() <= '7');
2079        let start = self.pos();
2080        // Parse up to two more digits.
2081        while self.bump()
2082            && '0' <= self.char()
2083            && self.char() <= '7'
2084            && self.pos().offset - start.offset <= 2
2085        {}
2086        let end = self.pos();
2087        let octal = &self.pattern()[start.offset..end.offset];
2088        // Parsing the octal should never fail since the above guarantees a
2089        // valid number.
2090        let codepoint = u32::from_str_radix(octal, 8).expect("valid octal number");
2091        // The max value for 3 digit octal is 0777 = 511 and [0, 511] has no
2092        // invalid Unicode scalar values.
2093        let c = char::from_u32(codepoint).expect("Unicode scalar value");
2094        Literal {
2095            span: Span::new(start, end),
2096            kind: LiteralKind::Octal,
2097            c,
2098        }
2099    }
2100
2101    #[inline(never)]
2102    fn parse_hex(&self) -> Result<Literal> {
2103        assert!(self.char() == 'x' || self.char() == 'u' || self.char() == 'U');
2104
2105        let hex_kind = match self.char() {
2106            'x' => HexLiteralKind::X,
2107            'u' => HexLiteralKind::UnicodeShort,
2108            _ => HexLiteralKind::UnicodeLong,
2109        };
2110        if !self.bump_and_bump_space() {
2111            return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2112        }
2113        if self.char() == '{' {
2114            self.parse_hex_brace(hex_kind)
2115        } else {
2116            self.parse_hex_digits(hex_kind)
2117        }
2118    }
2119
2120    #[inline(never)]
2121    fn parse_hex_digits(&self, kind: HexLiteralKind) -> Result<Literal> {
2122        let mut scratch = self.parser().scratch.borrow_mut();
2123        scratch.clear();
2124
2125        let start = self.pos();
2126        for i in 0..kind.digits() {
2127            if i > 0 && !self.bump_and_bump_space() {
2128                return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2129            }
2130            if !is_hex(self.char()) {
2131                return Err(self.error(self.span_char(), ast::ErrorKind::EscapeHexInvalidDigit));
2132            }
2133            scratch.push(self.char());
2134        }
2135        // The final bump just moves the parser past the literal, which may
2136        // be EOF.
2137        self.bump_and_bump_space();
2138        let end = self.pos();
2139        let hex = scratch.as_str();
2140        match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
2141            None => Err(self.error(Span::new(start, end), ast::ErrorKind::EscapeHexInvalid)),
2142            Some(c) => Ok(Literal {
2143                span: Span::new(start, end),
2144                kind: LiteralKind::HexFixed(kind),
2145                c,
2146            }),
2147        }
2148    }
2149
2150    #[inline(never)]
2151    fn parse_hex_brace(&self, kind: HexLiteralKind) -> Result<Literal> {
2152        let mut scratch = self.parser().scratch.borrow_mut();
2153        scratch.clear();
2154
2155        let brace_pos = self.pos();
2156        let start = self.span_char().end;
2157        while self.bump_and_bump_space() && self.char() != '}' {
2158            if !is_hex(self.char()) {
2159                return Err(self.error(self.span_char(), ast::ErrorKind::EscapeHexInvalidDigit));
2160            }
2161            scratch.push(self.char());
2162        }
2163        if self.is_eof() {
2164            return Err(self.error(
2165                Span::new(brace_pos, self.pos()),
2166                ast::ErrorKind::EscapeUnexpectedEof,
2167            ));
2168        }
2169        let end = self.pos();
2170        let hex = scratch.as_str();
2171        assert_eq!(self.char(), '}');
2172        self.bump_and_bump_space();
2173
2174        if hex.is_empty() {
2175            return Err(self.error(
2176                Span::new(brace_pos, self.pos()),
2177                ast::ErrorKind::EscapeHexEmpty,
2178            ));
2179        }
2180        match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
2181            None => Err(self.error(Span::new(start, end), ast::ErrorKind::EscapeHexInvalid)),
2182            Some(c) => Ok(Literal {
2183                span: Span::new(start, self.pos()),
2184                kind: LiteralKind::HexBrace(kind),
2185                c,
2186            }),
2187        }
2188    }
2189
2190    fn parse_decimal(&self) -> Result<u32> {
2191        let mut scratch = self.parser().scratch.borrow_mut();
2192        scratch.clear();
2193
2194        while !self.is_eof() && self.char().is_whitespace() {
2195            self.bump();
2196        }
2197        let start = self.pos();
2198        while !self.is_eof() && '0' <= self.char() && self.char() <= '9' {
2199            scratch.push(self.char());
2200            self.bump_and_bump_space();
2201        }
2202        let span = Span::new(start, self.pos());
2203        while !self.is_eof() && self.char().is_whitespace() {
2204            self.bump_and_bump_space();
2205        }
2206        let digits = scratch.as_str();
2207        if digits.is_empty() {
2208            return Err(self.error(span, ast::ErrorKind::DecimalEmpty));
2209        }
2210        match u32::from_str_radix(digits, 10).ok() {
2211            Some(n) => Ok(n),
2212            None => Err(self.error(span, ast::ErrorKind::DecimalInvalid)),
2213        }
2214    }
2215
2216    #[inline(never)]
2217    fn parse_set_class(&self) -> Result<ClassBracketed> {
2218        assert_eq!(self.char(), '[');
2219
2220        let mut union = ClassSetUnion {
2221            span: self.span(),
2222            items: vec![],
2223        };
2224        loop {
2225            self.bump_space();
2226            if self.is_eof() {
2227                return Err(self.unclosed_class_error());
2228            }
2229            match self.char() {
2230                '[' => {
2231                    // If we've already parsed the opening bracket, then
2232                    // attempt to treat this as the beginning of an ASCII
2233                    // class. If ASCII class parsing fails, then the parser
2234                    // backs up to `[`.
2235                    if !self.parser().stack_class.borrow().is_empty() {
2236                        if let Some(cls) = self.maybe_parse_ascii_class() {
2237                            union.push(ClassSetItem::Ascii(cls));
2238                            continue;
2239                        }
2240                    }
2241                    union = self.push_class_open(union)?;
2242                }
2243                ']' => match self.pop_class(union)? {
2244                    Either::Left(nested_union) => {
2245                        union = nested_union;
2246                    }
2247                    Either::Right(class) => return Ok(class),
2248                },
2249                '&' if self.peek() == Some('&') => {
2250                    assert!(self.bump_if("&&"));
2251                    union = self.push_class_op(ClassSetBinaryOpKind::Intersection, union);
2252                }
2253                '-' if self.peek() == Some('-') => {
2254                    assert!(self.bump_if("--"));
2255                    union = self.push_class_op(ClassSetBinaryOpKind::Difference, union);
2256                }
2257                '~' if self.peek() == Some('~') => {
2258                    assert!(self.bump_if("~~"));
2259                    union = self.push_class_op(ClassSetBinaryOpKind::SymmetricDifference, union);
2260                }
2261                _ => {
2262                    union.push(self.parse_set_class_range()?);
2263                }
2264            }
2265        }
2266    }
2267
2268    #[inline(never)]
2269    fn parse_set_class_range(&self) -> Result<ClassSetItem> {
2270        let prim1 = self.parse_set_class_item()?;
2271        self.bump_space();
2272        if self.is_eof() {
2273            return Err(self.unclosed_class_error());
2274        }
2275        if self.char() != '-' || self.peek_space() == Some(']') || self.peek_space() == Some('-') {
2276            return prim1.into_class_set_item(self);
2277        }
2278        if !self.bump_and_bump_space() {
2279            return Err(self.unclosed_class_error());
2280        }
2281        let prim2 = self.parse_set_class_item()?;
2282        let range = ClassSetRange {
2283            span: Span::new(prim1.span().start, prim2.span().end),
2284            start: prim1.into_class_literal(self)?,
2285            end: prim2.into_class_literal(self)?,
2286        };
2287        if !range.is_valid() {
2288            return Err(self.error(range.span, ast::ErrorKind::ClassRangeInvalid));
2289        }
2290        Ok(ClassSetItem::Range(range))
2291    }
2292
2293    #[inline(never)]
2294    fn parse_set_class_item(&self) -> Result<Primitive> {
2295        if self.char() == '\\' {
2296            self.parse_escape()
2297        } else {
2298            let x = Primitive::Literal(Literal {
2299                span: self.span_char(),
2300                kind: LiteralKind::Verbatim,
2301                c: self.char(),
2302            });
2303            self.bump();
2304            Ok(x)
2305        }
2306    }
2307
2308    #[inline(never)]
2309    fn parse_set_class_open(&self) -> Result<(ClassBracketed, ClassSetUnion)> {
2310        assert_eq!(self.char(), '[');
2311        let start = self.pos();
2312        if !self.bump_and_bump_space() {
2313            return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2314        }
2315
2316        let negated = if self.char() != '^' {
2317            false
2318        } else {
2319            if !self.bump_and_bump_space() {
2320                return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2321            }
2322            true
2323        };
2324        // Accept any number of `-` as literal `-`.
2325        let mut union = ClassSetUnion {
2326            span: self.span(),
2327            items: vec![],
2328        };
2329        while self.char() == '-' {
2330            union.push(ClassSetItem::Literal(Literal {
2331                span: self.span_char(),
2332                kind: LiteralKind::Verbatim,
2333                c: '-',
2334            }));
2335            if !self.bump_and_bump_space() {
2336                return Err(self.error(Span::new(start, start), ast::ErrorKind::ClassUnclosed));
2337            }
2338        }
2339        // If `]` is the *first* char in a set, then interpret it as a literal
2340        // `]`. That is, an empty class is impossible to write.
2341        if union.items.is_empty() && self.char() == ']' {
2342            union.push(ClassSetItem::Literal(Literal {
2343                span: self.span_char(),
2344                kind: LiteralKind::Verbatim,
2345                c: ']',
2346            }));
2347            if !self.bump_and_bump_space() {
2348                return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2349            }
2350        }
2351        let set = ClassBracketed {
2352            span: Span::new(start, self.pos()),
2353            negated,
2354            kind: ClassSet::union(ClassSetUnion {
2355                span: Span::new(union.span.start, union.span.start),
2356                items: vec![],
2357            }),
2358        };
2359        Ok((set, union))
2360    }
2361
2362    #[inline(never)]
2363    fn maybe_parse_ascii_class(&self) -> Option<ClassAscii> {
2364        assert_eq!(self.char(), '[');
2365        // If parsing fails, then we back up the parser to this starting point.
2366        let start = self.pos();
2367        let mut negated = false;
2368        if !self.bump() || self.char() != ':' {
2369            self.parser().pos.set(start);
2370            return None;
2371        }
2372        if !self.bump() {
2373            self.parser().pos.set(start);
2374            return None;
2375        }
2376        if self.char() == '^' {
2377            negated = true;
2378            if !self.bump() {
2379                self.parser().pos.set(start);
2380                return None;
2381            }
2382        }
2383        let name_start = self.offset();
2384        while self.char() != ':' && self.bump() {}
2385        if self.is_eof() {
2386            self.parser().pos.set(start);
2387            return None;
2388        }
2389        let name = &self.pattern()[name_start..self.offset()];
2390        if !self.bump_if(":]") {
2391            self.parser().pos.set(start);
2392            return None;
2393        }
2394        let kind = match regex_syntax::ast::ClassAsciiKind::from_name(name) {
2395            Some(kind) => kind,
2396            None => {
2397                self.parser().pos.set(start);
2398                return None;
2399            }
2400        };
2401        Some(ClassAscii {
2402            span: Span::new(start, self.pos()),
2403            kind,
2404            negated,
2405        })
2406    }
2407
2408    #[inline(never)]
2409    fn parse_unicode_class(&self) -> Result<ClassUnicode> {
2410        assert!(self.char() == 'p' || self.char() == 'P');
2411
2412        let mut scratch = self.parser().scratch.borrow_mut();
2413        scratch.clear();
2414
2415        let negated = self.char() == 'P';
2416        if !self.bump_and_bump_space() {
2417            return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2418        }
2419        let (start, kind) = if self.char() == '{' {
2420            let start = self.span_char().end;
2421            while self.bump_and_bump_space() && self.char() != '}' {
2422                scratch.push(self.char());
2423            }
2424            if self.is_eof() {
2425                return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2426            }
2427            assert_eq!(self.char(), '}');
2428            self.bump();
2429
2430            let name = scratch.as_str();
2431            if let Some(i) = name.find("!=") {
2432                (
2433                    start,
2434                    ClassUnicodeKind::NamedValue {
2435                        op: ClassUnicodeOpKind::NotEqual,
2436                        name: name[..i].to_string(),
2437                        value: name[i + 2..].to_string(),
2438                    },
2439                )
2440            } else if let Some(i) = name.find(':') {
2441                (
2442                    start,
2443                    ClassUnicodeKind::NamedValue {
2444                        op: ClassUnicodeOpKind::Colon,
2445                        name: name[..i].to_string(),
2446                        value: name[i + 1..].to_string(),
2447                    },
2448                )
2449            } else if let Some(i) = name.find('=') {
2450                (
2451                    start,
2452                    ClassUnicodeKind::NamedValue {
2453                        op: ClassUnicodeOpKind::Equal,
2454                        name: name[..i].to_string(),
2455                        value: name[i + 1..].to_string(),
2456                    },
2457                )
2458            } else {
2459                (start, ClassUnicodeKind::Named(name.to_string()))
2460            }
2461        } else {
2462            let start = self.pos();
2463            let c = self.char();
2464            if c == '\\' {
2465                return Err(self.error(self.span_char(), ast::ErrorKind::UnicodeClassInvalid));
2466            }
2467            self.bump_and_bump_space();
2468            let kind = ClassUnicodeKind::OneLetter(c);
2469            (start, kind)
2470        };
2471        Ok(ClassUnicode {
2472            span: Span::new(start, self.pos()),
2473            negated,
2474            kind,
2475        })
2476    }
2477
2478    #[inline(never)]
2479    fn parse_perl_class(&self) -> ClassPerl {
2480        let c = self.char();
2481        let span = self.span_char();
2482        self.bump();
2483        let (negated, kind) = match c {
2484            'd' => (false, regex_syntax::ast::ClassPerlKind::Digit),
2485            'D' => (true, regex_syntax::ast::ClassPerlKind::Digit),
2486            's' => (false, regex_syntax::ast::ClassPerlKind::Space),
2487            'S' => (true, regex_syntax::ast::ClassPerlKind::Space),
2488            'w' => (false, regex_syntax::ast::ClassPerlKind::Word),
2489            'W' => (true, regex_syntax::ast::ClassPerlKind::Word),
2490            c => panic!("expected valid Perl class but got '{}'", c),
2491        };
2492        ClassPerl {
2493            span,
2494            kind,
2495            negated,
2496        }
2497    }
2498}
2499
2500pub fn parse_ast<'s>(
2501    tb: &mut TB<'s>,
2502    pattern: &'s str,
2503) -> std::result::Result<NodeId, ResharpError> {
2504    let mut p: ResharpParser<'s> = ResharpParser::new(pattern);
2505    let result = p.parse(tb);
2506    result
2507}
resharp_parser/lib.rs

resharp_parser/
lib.rs