1#![warn(dead_code)]
6mod ast;
7use std::cell::{Cell, RefCell};
8
9use ast::{Ast, Concat, ErrorKind, GroupKind, LookaroundKind};
10use regex_syntax::{
11 ast::{
12 ClassAscii, ClassBracketed, ClassPerl, ClassSet, ClassSetBinaryOpKind, ClassSetItem,
13 ClassSetRange, ClassSetUnion, ClassUnicode, ClassUnicodeKind, ClassUnicodeOpKind,
14 HexLiteralKind, Literal, LiteralKind, Position, Span, SpecialLiteralKind,
15 },
16 hir::{
17 self,
18 translate::{Translator, TranslatorBuilder},
19 },
20 utf8::Utf8Sequences,
21};
22use resharp_algebra::NodeId;
23
/// Shorthand for the `resharp_algebra` regex builder that parser methods use to construct nodes.
type TB<'s> = resharp_algebra::RegexBuilder;
25
/// Global flags that seed a [`ResharpParser`] (see `ResharpParser::with_flags`).
pub struct PatternFlags {
    /// Passed to the HIR translator's `unicode` option.
    pub unicode: bool,
    /// Passed to the HIR translator's `case_insensitive` option.
    pub case_insensitive: bool,
    /// Initial value of the parser's "dot matches everything" state.
    pub dot_matches_new_line: bool,
    /// Initial value of the parser's whitespace-insensitive state.
    pub ignore_whitespace: bool,
}
37
38impl Default for PatternFlags {
39 fn default() -> Self {
40 Self {
41 unicode: true,
42 case_insensitive: false,
43 dot_matches_new_line: false,
44 ignore_whitespace: false,
45 }
46 }
47}
48
/// Classification of what sits next to a word-boundary assertion inside a concatenation.
#[derive(Clone, Copy, PartialEq)]
enum WordCharKind {
    /// The neighbor is known to be a word character.
    Word,
    /// The neighbor is known to be a non-word character.
    NonWord,
    /// A nullable repetition whose body is a word character.
    MaybeWord,
    /// A nullable repetition whose body is a non-word character.
    MaybeNonWord,
    /// Nothing could be determined syntactically.
    Unknown,
    /// There is no neighbor: the index fell outside the concatenation.
    Edge,
}
58
/// Returns `true` for ASCII letters, ASCII digits and `_`.
fn is_word_byte(b: u8) -> bool {
    matches!(b, b'_' | b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z')
}
62
63
/// A single atomic pattern item, convertible either to an `Ast` node or to a
/// character-class item.
#[derive(Clone, Debug, Eq, PartialEq)]
enum Primitive {
    /// A literal character.
    Literal(Literal),
    /// An assertion; not valid inside a character class.
    Assertion(ast::Assertion),
    /// The dot item, identified by its span; not valid inside a character class.
    Dot(Span),
    /// The "top" item, identified by its span; not valid inside a character class.
    Top(Span),
    /// A Perl character class.
    Perl(ClassPerl),
    /// A Unicode character class.
    Unicode(ClassUnicode),
}
73
74impl Primitive {
75 fn span(&self) -> &Span {
76 match *self {
77 Primitive::Literal(ref x) => &x.span,
78 Primitive::Assertion(ref x) => &x.span,
79 Primitive::Dot(ref span) => span,
80 Primitive::Top(ref span) => span,
81 Primitive::Perl(ref x) => &x.span,
82 Primitive::Unicode(ref x) => &x.span,
83 }
84 }
85
86 fn into_ast(self) -> Ast {
87 match self {
88 Primitive::Literal(lit) => Ast::literal(lit),
89 Primitive::Assertion(assert) => Ast::assertion(assert),
90 Primitive::Dot(span) => Ast::dot(span),
91 Primitive::Top(span) => Ast::top(span),
92 Primitive::Perl(cls) => Ast::class_perl(cls),
93 Primitive::Unicode(cls) => Ast::class_unicode(cls),
94 }
95 }
96
97 fn into_class_set_item(self, p: &ResharpParser) -> Result<regex_syntax::ast::ClassSetItem> {
98 use self::Primitive::*;
99 use regex_syntax::ast::ClassSetItem;
100
101 match self {
102 Literal(lit) => Ok(ClassSetItem::Literal(lit)),
103 Perl(cls) => Ok(ClassSetItem::Perl(cls)),
104 Unicode(cls) => Ok(ClassSetItem::Unicode(cls)),
105 x => Err(p.error(*x.span(), ast::ErrorKind::ClassEscapeInvalid)),
106 }
107 }
108
109 fn into_class_literal(self, p: &ResharpParser) -> Result<Literal> {
110 use self::Primitive::*;
111
112 match self {
113 Literal(lit) => Ok(lit),
114 x => Err(p.error(*x.span(), ast::ErrorKind::ClassRangeLiteral)),
115 }
116 }
117}
118
/// A value that is one of two alternatives.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum Either<Left, Right> {
    /// The first alternative.
    Left(Left),
    /// The second alternative.
    Right(Right),
}
124
/// An error produced while parsing or translating a pattern.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct ResharpError {
    /// What went wrong.
    pub kind: ErrorKind,
    /// Owned copy of the pattern the parser was working on when the error was created.
    pattern: String,
    /// Location in the pattern the error refers to.
    pub span: Span,
}
135
136impl std::fmt::Display for ResharpError {
137 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
138 write!(f, "{:?}: {:?}", self.kind, self.span)
139 }
140}
// Marker impl: relies on the `Debug` and `Display` impls, adds no source/cause information.
impl std::error::Error for ResharpError {}
142
/// Result type used throughout this module; the error is always a [`ResharpError`].
type Result<T> = core::result::Result<T, ResharpError>;
144
/// One frame of the parser's group stack.
#[derive(Clone, Debug)]
enum GroupState {
    /// An opened group that has not been closed yet.
    Group {
        /// The concatenation that was being built when the group was opened.
        concat: Concat,
        /// The group itself; its sub-expression is filled in when the group is closed.
        group: ast::Group,
        /// Whitespace-insensitive state to restore when the group is closed.
        ignore_whitespace: bool,
    },
    /// Branches of an alternation collected so far.
    Alternation(ast::Alternation),
    /// Operands of an intersection collected so far.
    Intersection(ast::Intersection),
}
163
/// One frame of the parser's character-class stack.
#[derive(Clone, Debug)]
enum ClassState {
    /// A bracketed class that has been opened but not closed yet.
    Open {
        /// The union that was being built when this class was opened.
        union: regex_syntax::ast::ClassSetUnion,
        /// The bracketed class under construction; its span end and kind are set on close.
        set: regex_syntax::ast::ClassBracketed,
    },
    /// A pending binary set operation waiting for its right-hand side.
    Op {
        /// The operator.
        kind: regex_syntax::ast::ClassSetBinaryOpKind,
        /// The already-parsed left-hand side.
        lhs: regex_syntax::ast::ClassSet,
    },
}
184
/// Parser state for a single pattern.
///
/// Most fields use interior mutability so that parsing helpers can take `&self`.
pub struct ResharpParser<'s> {
    /// Cache of already-built Perl classes, keyed by `(negated, kind)`.
    perl_classes: Vec<(bool, regex_syntax::ast::ClassPerlKind, NodeId)>,
    /// Cache of Unicode word/digit/space class nodes, filled on demand.
    unicode_classes: resharp_algebra::UnicodeClassCache,
    /// Default AST-to-HIR translator, configured from the global flags.
    pub translator: regex_syntax::hir::translate::Translator,
    /// The pattern being parsed.
    pub pattern: &'s str,
    /// Current position in the pattern.
    pos: Cell<Position>,
    /// Most recently assigned capture index (starts at 0).
    capture_index: Cell<u32>,
    /// NOTE(review): presumably toggles octal escape support; always `false` in the constructor shown here.
    octal: bool,
    /// NOTE(review): presumably allows `{,n}` style ranges; always `false` in the constructor shown here.
    empty_min_range: bool,
    /// Whether whitespace and `#` comments are currently skipped.
    ignore_whitespace: Cell<bool>,
    /// Whether `.` currently matches everything (translated to the top node).
    dot_all: Cell<bool>,
    /// Global Unicode flag the parser was constructed with.
    global_unicode: bool,
    /// Global case-insensitivity flag the parser was constructed with.
    global_case_insensitive: bool,
    /// Comments collected while skipping whitespace.
    comments: RefCell<Vec<ast::Comment>>,
    /// Stack of open groups, alternations and intersections.
    stack_group: RefCell<Vec<GroupState>>,
    /// Stack of open character classes and pending set operations.
    stack_class: RefCell<Vec<ClassState>>,
    /// Capture names seen so far, kept sorted by name.
    capture_names: RefCell<Vec<ast::CaptureName>>,
    /// NOTE(review): scratch string buffer; its users are not visible in this part of the file.
    scratch: RefCell<String>,
}
205
206fn specialize_err<T>(result: Result<T>, from: ast::ErrorKind, to: ast::ErrorKind) -> Result<T> {
207 result.map_err(|e| {
208 if e.kind == from {
209 ResharpError { kind: to, pattern: e.pattern, span: e.span }
210 } else {
211 e
212 }
213 })
214}
215
/// Returns `true` if `c` may appear in a capture group name.
///
/// The first character must be `_` or alphabetic; later characters may also
/// be numeric or one of `.`, `[`, `]`.
fn is_capture_char(c: char, first: bool) -> bool {
    match c {
        '_' => true,
        '.' | '[' | ']' => !first,
        _ if first => c.is_alphabetic(),
        _ => c.is_alphanumeric(),
    }
}
223
/// Returns `true` if `c` has special meaning in a pattern and must be escaped to match literally.
pub fn is_meta_character(c: char) -> bool {
    const META: [char; 19] = [
        '\\', '.', '+', '*', '?', '(', ')', '|', '[', ']', '{', '}', '^', '$', '#', '&', '-',
        '~', '_',
    ];
    META.contains(&c)
}
228
229pub fn escape(text: &str) -> String {
231 let mut buf = String::new();
232 escape_into(text, &mut buf);
233 buf
234}
235
236pub fn escape_into(text: &str, buf: &mut String) {
238 buf.reserve(text.len());
239 for c in text.chars() {
240 if is_meta_character(c) {
241 buf.push('\\');
242 }
243 buf.push(c);
244 }
245}
246
247pub fn is_escapeable_character(c: char) -> bool {
248 if is_meta_character(c) {
250 return true;
251 }
252 if !c.is_ascii() {
255 return false;
256 }
257 match c {
262 '0'..='9' | 'A'..='Z' | 'a'..='z' => false,
263 '<' | '>' => false,
273 _ => true,
274 }
275}
276
/// Returns `true` if `c` is an ASCII hexadecimal digit (`0-9`, `a-f`, `A-F`).
fn is_hex(c: char) -> bool {
    c.is_ascii_hexdigit()
}
280
281impl<'s> ResharpParser<'s> {
282 fn default_translator_builder(&self) -> TranslatorBuilder {
283 let mut trb = TranslatorBuilder::new();
284 trb.unicode(self.global_unicode);
285 trb.utf8(false);
286 trb.case_insensitive(self.global_case_insensitive);
287 trb
288 }
289
290 pub fn new(pattern: &'s str) -> Self {
291 Self::with_flags(pattern, &PatternFlags::default())
292 }
293
294 pub fn with_flags(pattern: &'s str, flags: &PatternFlags) -> Self {
295 let mut trb = TranslatorBuilder::new();
296 trb.unicode(flags.unicode);
297 trb.utf8(false);
298 trb.case_insensitive(flags.case_insensitive);
299 Self {
300 translator: trb.build(),
301 pattern,
302 perl_classes: vec![],
303 unicode_classes: resharp_algebra::UnicodeClassCache::default(),
304 pos: Cell::new(Position::new(0, 0, 0)),
305 capture_index: Cell::new(0),
306 octal: false,
307 empty_min_range: false,
308 ignore_whitespace: Cell::new(flags.ignore_whitespace),
309 dot_all: Cell::new(flags.dot_matches_new_line),
310 global_unicode: flags.unicode,
311 global_case_insensitive: flags.case_insensitive,
312 comments: RefCell::new(vec![]),
313 stack_group: RefCell::new(vec![]),
314 stack_class: RefCell::new(vec![]),
315 capture_names: RefCell::new(vec![]),
316 scratch: RefCell::new(String::new()),
317 }
318 }
319
320 fn parser(&'_ self) -> &'_ ResharpParser<'_> {
322 self
323 }
324
325 fn pattern(&self) -> &str {
327 self.pattern
328 }
329
330 fn error(&self, span: Span, kind: ast::ErrorKind) -> ResharpError {
332 ResharpError {
333 kind,
334 pattern: self.pattern().to_string(),
335 span,
336 }
337 }
338
339 fn unsupported_error(&self, _: regex_syntax::hir::Error) -> ResharpError {
340 self.error(Span::splat(self.pos()), ast::ErrorKind::UnsupportedResharpRegex)
341 }
342
343 fn offset(&self) -> usize {
348 self.parser().pos.get().offset
349 }
350
351 fn line(&self) -> usize {
355 self.parser().pos.get().line
356 }
357
358 fn column(&self) -> usize {
362 self.parser().pos.get().column
363 }
364
365 fn next_capture_index(&self, span: Span) -> Result<u32> {
373 let current = self.parser().capture_index.get();
374 let i = current
375 .checked_add(1)
376 .ok_or_else(|| self.error(span, ast::ErrorKind::CaptureLimitExceeded))?;
377 self.parser().capture_index.set(i);
378 Ok(i)
379 }
380
381 fn add_capture_name(&self, cap: &ast::CaptureName) -> Result<()> {
382 let mut names = self.parser().capture_names.borrow_mut();
383 match names.binary_search_by_key(&cap.name.as_str(), |c| c.name.as_str()) {
384 Err(i) => {
385 names.insert(i, cap.clone());
386 Ok(())
387 }
388 Ok(i) => Err(self.error(
389 cap.span,
390 ast::ErrorKind::GroupNameDuplicate {
391 original: names[i].span,
392 },
393 )),
394 }
395 }
396
397 fn ignore_whitespace(&self) -> bool {
398 self.parser().ignore_whitespace.get()
399 }
400
401 fn char(&self) -> char {
402 self.char_at(self.offset())
403 }
404
405 fn char_at(&self, i: usize) -> char {
406 self.pattern()[i..]
407 .chars()
408 .next()
409 .unwrap_or_else(|| panic!("expected char at offset {}", i))
410 }
411
412 fn bump(&self) -> bool {
413 if self.is_eof() {
414 return false;
415 }
416 let Position {
417 mut offset,
418 mut line,
419 mut column,
420 } = self.pos();
421 if self.char() == '\n' {
422 line = line.checked_add(1).unwrap();
423 column = 1;
424 } else {
425 column = column.checked_add(1).unwrap();
426 }
427 offset += self.char().len_utf8();
428 self.parser().pos.set(Position {
429 offset,
430 line,
431 column,
432 });
433 self.pattern()[self.offset()..].chars().next().is_some()
434 }
435
436 fn bump_if(&self, prefix: &str) -> bool {
437 if self.pattern()[self.offset()..].starts_with(prefix) {
438 for _ in 0..prefix.chars().count() {
439 self.bump();
440 }
441 true
442 } else {
443 false
444 }
445 }
446
447 fn is_lookaround_prefix(&self) -> Option<(bool, bool)> {
448 if self.bump_if("?=") {
449 return Some((true, true));
450 }
451 if self.bump_if("?!") {
452 return Some((true, false));
453 }
454 if self.bump_if("?<=") {
455 return Some((false, true));
456 }
457 if self.bump_if("?<!") {
458 return Some((false, false));
459 }
460 None
461 }
462
463 fn bump_and_bump_space(&self) -> bool {
464 if !self.bump() {
465 return false;
466 }
467 self.bump_space();
468 !self.is_eof()
469 }
470
471 fn bump_space(&self) {
472 if !self.ignore_whitespace() {
473 return;
474 }
475 while !self.is_eof() {
476 if self.char().is_whitespace() {
477 self.bump();
478 } else if self.char() == '#' {
479 let start = self.pos();
480 let mut comment_text = String::new();
481 self.bump();
482 while !self.is_eof() {
483 let c = self.char();
484 self.bump();
485 if c == '\n' {
486 break;
487 }
488 comment_text.push(c);
489 }
490 let comment = ast::Comment {
491 span: Span::new(start, self.pos()),
492 comment: comment_text,
493 };
494 self.parser().comments.borrow_mut().push(comment);
495 } else {
496 break;
497 }
498 }
499 }
500
501 fn peek(&self) -> Option<char> {
505 if self.is_eof() {
506 return None;
507 }
508 self.pattern()[self.offset() + self.char().len_utf8()..]
509 .chars()
510 .next()
511 }
512
513 fn peek_space(&self) -> Option<char> {
516 if !self.ignore_whitespace() {
517 return self.peek();
518 }
519 if self.is_eof() {
520 return None;
521 }
522 let mut start = self.offset() + self.char().len_utf8();
523 let mut in_comment = false;
524 for (i, c) in self.pattern()[start..].char_indices() {
525 if c.is_whitespace() {
526 continue;
527 } else if !in_comment && c == '#' {
528 in_comment = true;
529 } else if in_comment && c == '\n' {
530 in_comment = false;
531 } else {
532 start += i;
533 break;
534 }
535 }
536 self.pattern()[start..].chars().next()
537 }
538
539 fn is_eof(&self) -> bool {
541 self.offset() == self.pattern().len()
542 }
543
544 fn pos(&self) -> Position {
547 self.parser().pos.get()
548 }
549
550 fn span(&self) -> Span {
553 Span::splat(self.pos())
554 }
555
556 fn span_char(&self) -> Span {
558 let mut next = Position {
559 offset: self.offset().checked_add(self.char().len_utf8()).unwrap(),
560 line: self.line(),
561 column: self.column().checked_add(1).unwrap(),
562 };
563 if self.char() == '\n' {
564 next.line += 1;
565 next.column = 1;
566 }
567 Span::new(self.pos(), next)
568 }
569
570 #[inline(never)]
580 fn push_alternate(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
581 assert_eq!(self.char(), '|');
582 concat.span.end = self.pos();
583 self.push_or_add_alternation(concat);
584 self.bump();
585 Ok(ast::Concat {
586 span: self.span(),
587 asts: vec![],
588 })
589 }
590
591 fn push_or_add_alternation(&self, concat: Concat) {
594 use self::GroupState::*;
595
596 let mut stack = self.parser().stack_group.borrow_mut();
597 if let Some(&mut Alternation(ref mut alts)) = stack.last_mut() {
598 alts.asts.push(concat.into_ast());
599 return;
600 }
601 stack.push(Alternation(ast::Alternation {
602 span: Span::new(concat.span.start, self.pos()),
603 asts: vec![concat.into_ast()],
604 }));
605 }
606
607 #[inline(never)]
608 fn push_intersect(&self, mut concat: Concat) -> Result<Concat> {
609 assert_eq!(self.char(), '&');
610 concat.span.end = self.pos();
611 self.push_or_add_intersect(concat);
612 self.bump();
613 Ok(Concat {
614 span: self.span(),
615 asts: vec![],
616 })
617 }
618
619 fn push_or_add_intersect(&self, concat: Concat) {
622 use self::GroupState::*;
623
624 let mut stack = self.parser().stack_group.borrow_mut();
625 if let Some(&mut Intersection(ref mut alts)) = stack.last_mut() {
626 alts.asts.push(concat.into_ast());
627 return;
628 }
629 stack.push(Intersection(ast::Intersection {
630 span: Span::new(concat.span.start, self.pos()),
631 asts: vec![concat.into_ast()],
632 }));
633 }
634
635 #[inline(never)]
649 fn push_group(&self, mut concat: Concat) -> Result<Concat> {
650 assert_eq!(self.char(), '(');
651 match self.parse_group()? {
652 Either::Left(set) => {
653 let ignore = set.flags.flag_state(ast::Flag::IgnoreWhitespace);
654 if let Some(v) = ignore {
655 self.parser().ignore_whitespace.set(v);
656 }
657
658 concat.asts.push(Ast::flags(set));
659 Ok(concat)
660 }
661 Either::Right(group) => {
662 let old_ignore_whitespace = self.ignore_whitespace();
663 let new_ignore_whitespace = group
664 .flags()
665 .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
666 .unwrap_or(old_ignore_whitespace);
667 self.parser()
668 .stack_group
669 .borrow_mut()
670 .push(GroupState::Group {
671 concat,
672 group,
673 ignore_whitespace: old_ignore_whitespace,
674 });
675 self.parser().ignore_whitespace.set(new_ignore_whitespace);
676 Ok(Concat {
677 span: self.span(),
678 asts: vec![],
679 })
680 }
681 }
682 }
683
684 #[inline(never)]
685 fn push_compl_group(&self, concat: Concat) -> Result<Concat> {
686 assert_eq!(self.char(), '~');
687 self.bump();
688 if self.is_eof() || self.char() != '(' {
689 return Err(self.error(self.span(), ast::ErrorKind::ComplementGroupExpected));
690 }
691 let open_span = self.span_char();
692 self.bump();
693 let group = ast::Group {
694 span: open_span,
695 kind: ast::GroupKind::Complement,
696 ast: Box::new(Ast::empty(self.span())),
697 };
698
699 let old_ignore_whitespace = self.ignore_whitespace();
700 let new_ignore_whitespace = group
701 .flags()
702 .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
703 .unwrap_or(old_ignore_whitespace);
704 self.parser()
705 .stack_group
706 .borrow_mut()
707 .push(GroupState::Group {
708 concat,
709 group,
710 ignore_whitespace: old_ignore_whitespace,
711 });
712 self.parser().ignore_whitespace.set(new_ignore_whitespace);
713 Ok(Concat {
714 span: self.span(),
715 asts: vec![],
716 })
717 }
718
719 #[inline(never)]
729 fn pop_group(&self, mut group_concat: Concat) -> Result<Concat> {
730 use self::GroupState::*;
731 assert_eq!(self.char(), ')');
732 let mut stack = self.parser().stack_group.borrow_mut();
733 let topstack = stack.pop();
734
735 let (mut prior_concat, mut group, ignore_whitespace, alt) = match topstack {
736 Some(Group {
737 concat,
738 group,
739 ignore_whitespace,
740 }) => (concat, group, ignore_whitespace, None),
741 Some(Alternation(alt)) => match stack.pop() {
742 Some(Group {
743 concat,
744 group,
745 ignore_whitespace,
746 }) => (
747 concat,
748 group,
749 ignore_whitespace,
750 Some(Either::Left::<ast::Alternation, ast::Intersection>(alt)),
751 ),
752 None | Some(Alternation(_)) | Some(Intersection(_)) => {
753 return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
754 }
755 },
756 Some(Intersection(int)) => match stack.pop() {
757 Some(Group {
758 concat,
759 group,
760 ignore_whitespace,
761 }) => (
762 concat,
763 group,
764 ignore_whitespace,
765 Some(Either::Right::<ast::Alternation, ast::Intersection>(int)),
766 ),
767 None | Some(Alternation(_)) | Some(Intersection(_)) => {
768 return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
769 }
770 },
771
772 None => {
773 return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
774 }
775 };
776 self.parser().ignore_whitespace.set(ignore_whitespace);
777 group_concat.span.end = self.pos();
778 self.bump();
779 group.span.end = self.pos();
780 match alt {
781 Some(Either::Left(mut alt)) => {
782 alt.span.end = group_concat.span.end;
783 alt.asts.push(group_concat.into_ast());
784 group.ast = Box::new(alt.into_ast());
785 }
786 Some(Either::Right(mut int)) => {
787 int.span.end = group_concat.span.end;
788 int.asts.push(group_concat.into_ast());
789 group.ast = Box::new(int.into_ast());
790 }
791 None => {
792 group.ast = Box::new(group_concat.into_ast());
793 }
794 }
795
796 if group.kind == GroupKind::Complement {
797 let complement = ast::Complement {
798 span: self.span(),
799 ast: group.ast,
800 };
801 prior_concat.asts.push(Ast::complement(complement));
802 }
803 else {
805 prior_concat.asts.push(Ast::group(group));
806 }
807 Ok(prior_concat)
808 }
809
810 #[inline(never)]
817 fn pop_group_end(&self, mut concat: ast::Concat) -> Result<Ast> {
818 concat.span.end = self.pos();
819 let mut stack = self.parser().stack_group.borrow_mut();
820 let ast = match stack.pop() {
821 None => Ok(concat.into_ast()),
822 Some(GroupState::Alternation(mut alt)) => {
823 alt.span.end = self.pos();
824 alt.asts.push(concat.into_ast());
825 Ok(Ast::alternation(alt))
826 }
827 Some(GroupState::Intersection(mut int)) => {
828 int.span.end = self.pos();
829 int.asts.push(concat.into_ast());
830
831 Ok(Ast::intersection(int))
832 }
833 Some(GroupState::Group { group, .. }) => {
834 return Err(self.error(group.span, ast::ErrorKind::GroupUnclosed));
835 }
836 };
837 match stack.pop() {
839 None => ast,
840 Some(GroupState::Alternation(_)) => {
841 unreachable!()
848 }
849 Some(GroupState::Intersection(_)) => {
850 unreachable!()
851 }
852 Some(GroupState::Group { group, .. }) => {
853 Err(self.error(group.span, ast::ErrorKind::GroupUnclosed))
854 }
855 }
856 }
857
858 #[inline(never)]
867 fn push_class_open(
868 &self,
869 parent_union: regex_syntax::ast::ClassSetUnion,
870 ) -> Result<regex_syntax::ast::ClassSetUnion> {
871 assert_eq!(self.char(), '[');
872
873 let (nested_set, nested_union) = self.parse_set_class_open()?;
874 self.parser()
875 .stack_class
876 .borrow_mut()
877 .push(ClassState::Open {
878 union: parent_union,
879 set: nested_set,
880 });
881 Ok(nested_union)
882 }
883
884 #[inline(never)]
899 fn pop_class(
900 &self,
901 nested_union: regex_syntax::ast::ClassSetUnion,
902 ) -> Result<Either<regex_syntax::ast::ClassSetUnion, regex_syntax::ast::ClassBracketed>> {
903 assert_eq!(self.char(), ']');
904
905 let item = regex_syntax::ast::ClassSet::Item(nested_union.into_item());
906 let prevset = self.pop_class_op(item);
907 let mut stack = self.parser().stack_class.borrow_mut();
908 match stack.pop() {
909 None => {
910 panic!("unexpected empty character class stack")
919 }
920 Some(ClassState::Op { .. }) => {
921 panic!("unexpected ClassState::Op")
928 }
929 Some(ClassState::Open { mut union, mut set }) => {
930 self.bump();
931 set.span.end = self.pos();
932 set.kind = prevset;
933 if stack.is_empty() {
934 Ok(Either::Right(set))
935 } else {
936 union.push(regex_syntax::ast::ClassSetItem::Bracketed(Box::new(set)));
937 Ok(Either::Left(union))
938 }
939 }
940 }
941 }
942
943 #[inline(never)]
948 fn unclosed_class_error(&self) -> ResharpError {
949 for state in self.parser().stack_class.borrow().iter().rev() {
950 if let ClassState::Open { ref set, .. } = *state {
951 return self.error(set.span, ast::ErrorKind::ClassUnclosed);
952 }
953 }
954 panic!("no open character class found")
957 }
958
959 #[inline(never)]
965 fn push_class_op(
966 &self,
967 next_kind: regex_syntax::ast::ClassSetBinaryOpKind,
968 next_union: regex_syntax::ast::ClassSetUnion,
969 ) -> regex_syntax::ast::ClassSetUnion {
970 let item = regex_syntax::ast::ClassSet::Item(next_union.into_item());
971 let new_lhs = self.pop_class_op(item);
972 self.parser().stack_class.borrow_mut().push(ClassState::Op {
973 kind: next_kind,
974 lhs: new_lhs,
975 });
976 regex_syntax::ast::ClassSetUnion {
977 span: self.span(),
978 items: vec![],
979 }
980 }
981
982 #[inline(never)]
988 fn pop_class_op(&self, rhs: regex_syntax::ast::ClassSet) -> regex_syntax::ast::ClassSet {
989 let mut stack = self.parser().stack_class.borrow_mut();
990 let (kind, lhs) = match stack.pop() {
991 Some(ClassState::Op { kind, lhs }) => (kind, lhs),
992 Some(state @ ClassState::Open { .. }) => {
993 stack.push(state);
994 return rhs;
995 }
996 None => unreachable!(),
997 };
998 let span = Span::new(lhs.span().start, rhs.span().end);
999 regex_syntax::ast::ClassSet::BinaryOp(regex_syntax::ast::ClassSetBinaryOp {
1000 span,
1001 kind,
1002 lhs: Box::new(lhs),
1003 rhs: Box::new(rhs),
1004 })
1005 }
1006
1007 fn hir_to_node_id(&self, hir: &hir::Hir, tb: &mut TB<'s>) -> Result<NodeId> {
1008 match hir.kind() {
1009 hir::HirKind::Empty => Ok(NodeId::EPS),
1010 hir::HirKind::Literal(l) => {
1011 if l.0.len() == 1 {
1012 let node = tb.mk_u8(l.0[0]);
1013 Ok(node)
1014 } else {
1015 let ws: Vec<_> = l.0.iter().map(|l| tb.mk_u8(*l)).collect();
1016 let conc = tb.mk_concats(ws.iter().copied());
1017 Ok(conc)
1018 }
1019 }
1020 hir::HirKind::Class(class) => {
1021 match class {
1022 hir::Class::Unicode(class_unicode) => {
1023 let ranges = class_unicode.ranges();
1024 let mut nodes = Vec::new();
1025 for range in ranges {
1026 for seq in Utf8Sequences::new(range.start(), range.end()) {
1027 let sl = seq.as_slice();
1028 let bytes: Vec<_> = sl.iter().map(|s| (s.start, s.end)).collect();
1029 let node = match bytes.len() {
1030 1 => tb.mk_range_u8(bytes[0].0, bytes[0].1),
1031 n => {
1032 let last = tb.mk_range_u8(bytes[n - 1].0, bytes[n - 1].1);
1033 let mut conc = last;
1034 for i in (0..n - 1).rev() {
1035 let b = tb.mk_range_u8(bytes[i].0, bytes[i].1);
1036 conc = tb.mk_concat(b, conc);
1037 }
1038 conc
1039 }
1040 };
1041 nodes.push(node);
1042 }
1043 }
1044 let merged = tb.mk_unions(nodes.into_iter());
1045 Ok(merged)
1046 }
1047 hir::Class::Bytes(class_bytes) => {
1048 let ranges = class_bytes.ranges();
1049 let mut result = NodeId::BOT;
1050 for range in ranges {
1051 let start = range.start();
1052 let end = range.end();
1053 let node = tb.mk_range_u8(start, end);
1054 result = tb.mk_union(result, node);
1055 }
1056 Ok(result)
1057 }
1058 }
1059 }
1060 hir::HirKind::Look(_) => todo!(),
1061 hir::HirKind::Repetition(_) => todo!(),
1062 hir::HirKind::Capture(_) => todo!(),
1063 hir::HirKind::Concat(body) => {
1064 let mut result = NodeId::EPS;
1065 for child in body {
1066 let node = self.hir_to_node_id(child, tb)?;
1067 result = tb.mk_concat(result, node);
1068 }
1069 Ok(result)
1070 }
1071 hir::HirKind::Alternation(_) => todo!(),
1072 }
1073 }
1074
1075 fn translate_ast_to_hir(
1076 &mut self,
1077 orig_ast: ®ex_syntax::ast::Ast,
1078 tb: &mut TB<'s>,
1079 ) -> Result<NodeId> {
1080 match self.translator.translate("", orig_ast) {
1081 Err(_) => Err(self.error(self.span(), ast::ErrorKind::UnicodeClassInvalid)),
1082 Ok(hir) => self.hir_to_node_id(&hir, tb),
1083 }
1084 }
1085
1086 fn translator_to_node_id(
1087 &mut self,
1088 orig_ast: ®ex_syntax::ast::Ast,
1089 translator: &mut Option<Translator>,
1090 tb: &mut TB<'s>,
1091 ) -> Result<NodeId> {
1092 match translator {
1093 Some(tr) => {
1094 let hir = tr
1095 .translate("", orig_ast)
1096 .map_err(|e| self.unsupported_error(e))?;
1097 self.hir_to_node_id(&hir, tb)
1098 }
1099 None => self.translate_ast_to_hir(orig_ast, tb),
1100 }
1101 }
1102
1103 fn get_class(
1104 &mut self,
1105 negated: bool,
1106 kind: regex_syntax::ast::ClassPerlKind,
1107 tb: &mut TB<'s>,
1108 ) -> Result<NodeId> {
1109 let w = self
1110 .perl_classes
1111 .iter()
1112 .find(|(c_neg, c_kind, _)| *c_kind == kind && *c_neg == negated);
1113 match w {
1114 Some((_, _, value)) => Ok(*value),
1115 None => {
1116 let translated = if self.global_unicode {
1117 match kind {
1118 regex_syntax::ast::ClassPerlKind::Word => {
1119 self.unicode_classes.ensure_word(tb);
1120 if negated { self.unicode_classes.non_word } else { self.unicode_classes.word }
1121 }
1122 regex_syntax::ast::ClassPerlKind::Digit => {
1123 self.unicode_classes.ensure_digit(tb);
1124 if negated { self.unicode_classes.non_digit } else { self.unicode_classes.digit }
1125 }
1126 regex_syntax::ast::ClassPerlKind::Space => {
1127 self.unicode_classes.ensure_space(tb);
1128 if negated { self.unicode_classes.non_space } else { self.unicode_classes.space }
1129 }
1130 }
1131 } else {
1132 let pos = match kind {
1133 regex_syntax::ast::ClassPerlKind::Word => {
1134 let az = tb.mk_range_u8(b'a', b'z');
1135 let big = tb.mk_range_u8(b'A', b'Z');
1136 let dig = tb.mk_range_u8(b'0', b'9');
1137 let us = tb.mk_u8(b'_');
1138 tb.mk_unions([az, big, dig, us].into_iter())
1139 }
1140 regex_syntax::ast::ClassPerlKind::Digit => {
1141 tb.mk_range_u8(b'0', b'9')
1142 }
1143 regex_syntax::ast::ClassPerlKind::Space => {
1144 let sp = tb.mk_u8(b' ');
1145 let tab = tb.mk_u8(b'\t');
1146 let nl = tb.mk_u8(b'\n');
1147 let cr = tb.mk_u8(b'\r');
1148 let ff = tb.mk_u8(0x0C);
1149 let vt = tb.mk_u8(0x0B);
1150 tb.mk_unions([sp, tab, nl, cr, ff, vt].into_iter())
1151 }
1152 };
1153 if negated { tb.mk_compl(pos) } else { pos }
1154 };
1155 self.perl_classes.push((negated, kind, translated));
1156 Ok(translated)
1157 }
1158 }
1159 }
1160
1161 fn word_char_kind(ast: &Ast, left: bool) -> WordCharKind {
1162 use WordCharKind::*;
1163 match ast {
1164 Ast::Literal(lit) => {
1165 if is_word_byte(lit.c as u8) {
1166 Word
1167 } else {
1168 NonWord
1169 }
1170 }
1171 Ast::ClassPerl(c) => match (&c.kind, c.negated) {
1172 (®ex_syntax::ast::ClassPerlKind::Word, false) => Word,
1173 (®ex_syntax::ast::ClassPerlKind::Word, true) => NonWord,
1174 (®ex_syntax::ast::ClassPerlKind::Space, false) => NonWord,
1175 (®ex_syntax::ast::ClassPerlKind::Space, true) => Unknown,
1176 (®ex_syntax::ast::ClassPerlKind::Digit, false) => Word,
1177 (®ex_syntax::ast::ClassPerlKind::Digit, true) => Unknown,
1178 },
1179 Ast::Dot(_) | Ast::Top(_) => Unknown,
1180 Ast::Group(g) => Self::word_char_kind(&g.ast, left),
1181 Ast::Concat(c) if !c.asts.is_empty() => {
1182 Self::word_char_kind(&c.asts[if left { c.asts.len() - 1 } else { 0 }], left)
1183 }
1184 Ast::Repetition(r) => {
1185 let inner = Self::word_char_kind(&r.ast, left);
1186 let nullable = matches!(
1187 &r.op.kind,
1188 ast::RepetitionKind::ZeroOrMore
1189 | ast::RepetitionKind::ZeroOrOne
1190 | ast::RepetitionKind::Range(ast::RepetitionRange::Bounded(0, _))
1191 );
1192 if nullable {
1193 match inner {
1194 Word => MaybeWord,
1195 NonWord => MaybeNonWord,
1196 _ => Unknown,
1197 }
1198 } else {
1199 inner
1200 }
1201 }
1202 Ast::Lookaround(la) => Self::word_char_kind(&la.ast, left),
1203 _ => Unknown,
1204 }
1205 }
1206
1207 fn edge_class_ast(ast: &Ast, left: bool) -> Option<&Ast> {
1208 match ast {
1209 Ast::Literal(_) | Ast::ClassPerl(_) | Ast::ClassBracketed(_)
1210 | Ast::ClassUnicode(_) | Ast::Dot(_) | Ast::Top(_) => Some(ast),
1211 Ast::Group(g) => Self::edge_class_ast(&g.ast, left),
1212 Ast::Concat(c) if !c.asts.is_empty() => {
1213 Self::edge_class_ast(&c.asts[if left { c.asts.len() - 1 } else { 0 }], left)
1214 }
1215 Ast::Repetition(r) => Self::edge_class_ast(&r.ast, left),
1216 Ast::Lookaround(la) => Self::edge_class_ast(&la.ast, left),
1217 _ => None,
1218 }
1219 }
1220
1221 fn resolve_word_kind(
1222 &mut self,
1223 asts: &[Ast],
1224 idx: usize,
1225 dir: isize,
1226 translator: &mut Option<Translator>,
1227 tb: &mut TB<'s>,
1228 word_id: NodeId,
1229 not_word_id: NodeId,
1230 ) -> Result<WordCharKind> {
1231 use WordCharKind::*;
1232 let fast = Self::concat_neighbor_kind(asts, idx, dir);
1233 if fast != Unknown {
1234 return Ok(fast);
1235 }
1236 let neighbor_idx = (idx as isize + dir) as usize;
1237 let edge = match Self::edge_class_ast(&asts[neighbor_idx], dir < 0) {
1238 Some(e) => e,
1239 None => return Ok(Unknown),
1240 };
1241 let node = self.ast_to_node_id(edge, translator, tb)?;
1242 if tb.subsumes(word_id, node) == Some(true) {
1243 Ok(Word)
1244 } else if tb.subsumes(not_word_id, node) == Some(true) {
1245 Ok(NonWord)
1246 } else {
1247 Ok(Unknown)
1248 }
1249 }
1250
1251 fn concat_neighbor_kind(asts: &[Ast], idx: usize, dir: isize) -> WordCharKind {
1252 use WordCharKind::*;
1253 let next = idx as isize + dir;
1254 if next < 0 || next >= asts.len() as isize {
1255 return Edge;
1256 }
1257 let kind = Self::word_char_kind(&asts[next as usize], dir < 0);
1258 match kind {
1259 MaybeWord => match Self::concat_neighbor_kind(asts, next as usize, dir) {
1260 Word => Word,
1261 _ => Unknown,
1262 },
1263 MaybeNonWord => match Self::concat_neighbor_kind(asts, next as usize, dir) {
1264 NonWord => NonWord,
1265 _ => Unknown,
1266 },
1267 other => other,
1268 }
1269 }
1270
1271 fn rewrite_word_boundary_in_concat(
1272 &mut self,
1273 asts: &[Ast],
1274 idx: usize,
1275 translator: &mut Option<Translator>,
1276 tb: &mut TB<'s>,
1277 ) -> Result<(NodeId, usize)> {
1278 use WordCharKind::*;
1279 let (word_id, not_word_id) = if self.global_unicode {
1280 self.unicode_classes.ensure_word(tb);
1281 (self.unicode_classes.word, self.unicode_classes.non_word)
1282 } else {
1283 let az = tb.mk_range_u8(b'a', b'z');
1284 let big = tb.mk_range_u8(b'A', b'Z');
1285 let dig = tb.mk_range_u8(b'0', b'9');
1286 let us = tb.mk_u8(b'_');
1287 let w = tb.mk_unions([az, big, dig, us].into_iter());
1288 (w, tb.mk_compl(w))
1289 };
1290 let left = self.resolve_word_kind(asts, idx, -1, translator, tb, word_id, not_word_id)?;
1291 let right =
1292 self.resolve_word_kind(asts, idx, 1, translator, tb, word_id, not_word_id)?;
1293
1294 match (left, right) {
1295 (NonWord, Word) | (Word, NonWord) => Ok((NodeId::EPS, idx + 1)),
1296 (Word, _) => {
1297 let neg = tb.mk_neg_lookahead(word_id, 0);
1298 Ok((neg, idx + 1))
1299 }
1300 (NonWord, _) => {
1301 let set = tb.mk_union(NodeId::END, word_id);
1302 let tail = tb.mk_concat(set, NodeId::TS);
1303 self.merge_boundary_with_following_lookaheads(asts, idx, tail, translator, tb)
1304 }
1305 (_, Word) => {
1306 Ok((tb.mk_neg_lookbehind(word_id), idx + 1))
1307 }
1308 (_, NonWord) => {
1309 let body = tb.mk_union(NodeId::BEGIN, word_id);
1310 Ok((tb.mk_lookbehind(body, NodeId::MISSING), idx + 1))
1311 }
1312 _ => Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex)),
1315 }
1316 }
1317
1318 fn merge_boundary_with_following_lookaheads(
1319 &mut self,
1320 asts: &[Ast],
1321 wb_idx: usize,
1322 boundary_tail: NodeId,
1323 translator: &mut Option<Translator>,
1324 tb: &mut TB<'s>,
1325 ) -> Result<(NodeId, usize)> {
1326 let mut next = wb_idx + 1;
1327 let mut la_bodies = vec![boundary_tail];
1328 while next < asts.len() {
1329 match &asts[next] {
1330 Ast::Lookaround(la) if la.kind == ast::LookaroundKind::PositiveLookahead => {
1331 let body = self.ast_to_node_id(&la.ast, translator, tb)?;
1332 la_bodies.push(tb.mk_concat(body, NodeId::TS));
1333 next += 1;
1334 }
1335 _ => break,
1336 }
1337 }
1338 let merged = tb.mk_inters(la_bodies.into_iter());
1339 Ok((tb.mk_lookahead(merged, NodeId::MISSING, 0), next))
1340 }
1341
1342 fn ast_to_node_id(
1343 &mut self,
1344 ast: &Ast,
1345 translator: &mut Option<Translator>,
1346 tb: &mut TB<'s>,
1347 ) -> Result<NodeId> {
1348 match ast {
1349 Ast::Empty(_) => Ok(NodeId::EPS),
1350 Ast::Flags(f) => {
1351 let mut translator_builder = self.default_translator_builder();
1352 if let Some(state) = f.flags.flag_state(ast::Flag::CaseInsensitive) {
1353 translator_builder.case_insensitive(state);
1354 }
1355 if let Some(state) = f.flags.flag_state(ast::Flag::Unicode) {
1356 translator_builder.unicode(state);
1357 }
1358 if let Some(state) = f.flags.flag_state(ast::Flag::DotMatchesNewLine) {
1359 self.dot_all.set(state);
1360 }
1361 let concat_translator = Some(translator_builder.build());
1362 *translator = concat_translator;
1363 Ok(NodeId::EPS)
1364 }
1365 Ast::Literal(l) => {
1366 let ast_lit = regex_syntax::ast::Ast::literal(*l.to_owned());
1367 self.translator_to_node_id(&ast_lit, translator, tb)
1368 }
1369 Ast::Top(_) => Ok(NodeId::TOP),
1370 Ast::Dot(_) => {
1371 if self.dot_all.get() {
1372 Ok(NodeId::TOP)
1373 } else {
1374 let hirv = hir::Hir::dot(hir::Dot::AnyByteExceptLF);
1375 self.hir_to_node_id(&hirv, tb)
1376 }
1377 }
1378 Ast::Assertion(a) => match &a.kind {
1379 ast::AssertionKind::StartText => Ok(NodeId::BEGIN),
1380 ast::AssertionKind::EndText => Ok(NodeId::END),
1381 ast::AssertionKind::WordBoundary => {
1382 Err(
1383 self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex),
1384 )
1385 }
1386 ast::AssertionKind::NotWordBoundary => {
1387 Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))
1388 }
1389 ast::AssertionKind::StartLine => {
1390 let left = NodeId::BEGIN;
1391 let right = tb.mk_u8(b'\n');
1392 let union = tb.mk_union(left, right);
1393 Ok(tb.mk_lookbehind(union, NodeId::MISSING))
1394 }
1395 ast::AssertionKind::EndLine => {
1396 let left = NodeId::END;
1397 let right = tb.mk_u8(b'\n');
1398 let union = tb.mk_union(left, right);
1399 Ok(tb.mk_lookahead(union, NodeId::MISSING, 0))
1400 }
1401 ast::AssertionKind::WordBoundaryStart => todo!(),
1402 ast::AssertionKind::WordBoundaryEnd => todo!(),
1403 ast::AssertionKind::WordBoundaryStartAngle => todo!(),
1404 ast::AssertionKind::WordBoundaryEndAngle => Ok(tb.mk_string(">")),
1405 ast::AssertionKind::WordBoundaryStartHalf => todo!(),
1406 ast::AssertionKind::WordBoundaryEndHalf => todo!(),
1407 },
1408 Ast::ClassUnicode(c) => {
1409 let tmp = regex_syntax::ast::ClassUnicode {
1410 span: c.span,
1411 negated: c.negated,
1412 kind: c.kind.clone(),
1413 };
1414 if !c.negated {
1415 if let regex_syntax::ast::ClassUnicodeKind::Named(s) = &c.kind { match s.as_str() {
1416 "ascii" => return Ok(tb.mk_range_u8(0, 127)),
1418 "utf8" => {
1420 let ascii = tb.mk_range_u8(0, 127);
1421 let beta = tb.mk_range_u8(128, 0xBF);
1422 let c0 = tb.mk_range_u8(0xC0, 0xDF);
1423 let c0s = tb.mk_concats([c0, beta].into_iter());
1424 let e0 = tb.mk_range_u8(0xE0, 0xEF);
1425 let e0s = tb.mk_concats([e0, beta, beta].into_iter());
1426 let f0 = tb.mk_range_u8(0xF0, 0xF7);
1427 let f0s = tb.mk_concats([f0, beta, beta, beta].into_iter());
1428 let merged = tb.mk_unions([ascii, c0s, e0s, f0s].into_iter());
1429 return Ok(tb.mk_star(merged));
1430 }
1431 "hex" => {
1432 let nums = tb.mk_range_u8(b'0', b'9');
1433 let lets = tb.mk_range_u8(b'a', b'f');
1434 let lets2 = tb.mk_range_u8(b'A', b'F');
1435 let merged = tb.mk_unions([nums, lets, lets2].into_iter());
1436 return Ok(merged);
1437 }
1438 _ => {}
1439 } };
1440 }
1441
1442 let orig_ast = regex_syntax::ast::Ast::class_unicode(tmp);
1443 self.translator_to_node_id(&orig_ast, translator, tb)
1444 }
1445 Ast::ClassPerl(c) => {
1446 self.get_class(c.negated, c.kind.clone(), tb)
1447 }
1448 Ast::ClassBracketed(c) => match &c.kind {
1449 regex_syntax::ast::ClassSet::Item(_) => {
1450 let tmp = regex_syntax::ast::ClassBracketed {
1451 span: c.span,
1452 negated: c.negated,
1453 kind: c.kind.clone(),
1454 };
1455 let orig_ast = regex_syntax::ast::Ast::class_bracketed(tmp);
1456 self.translator_to_node_id(&orig_ast, translator, tb)
1457 }
1458 regex_syntax::ast::ClassSet::BinaryOp(_) => todo!(),
1459 },
1460 Ast::Repetition(r) => {
1461 let body = self.ast_to_node_id(&r.ast, translator, tb);
1462 match body {
1463 Ok(body) => match &r.op.kind {
1464 ast::RepetitionKind::ZeroOrOne => Ok(tb.mk_opt(body)),
1465 ast::RepetitionKind::ZeroOrMore => Ok(tb.mk_star(body)),
1466 ast::RepetitionKind::OneOrMore => Ok(tb.mk_plus(body)),
1467 ast::RepetitionKind::Range(r) => match r {
1468 ast::RepetitionRange::Exactly(n) => Ok(tb.mk_repeat(body, *n, *n)),
1469 ast::RepetitionRange::AtLeast(n) => {
1470 let rep = tb.mk_repeat(body, *n, *n);
1471 let st = tb.mk_star(body);
1472 Ok(tb.mk_concat(rep, st))
1473 }
1474
1475 ast::RepetitionRange::Bounded(n, m) => Ok(tb.mk_repeat(body, *n, *m)),
1476 },
1477 },
1478 Err(_) => body,
1479 }
1480 }
1481 Ast::Lookaround(g) => {
1482 let body = self.ast_to_node_id(&g.ast, translator, tb)?;
1483 match g.kind {
1484 ast::LookaroundKind::PositiveLookahead => {
1485 Ok(tb.mk_lookahead(body, NodeId::MISSING, 0))
1486 }
1487 ast::LookaroundKind::PositiveLookbehind => {
1488 Ok(tb.mk_lookbehind(body, NodeId::MISSING))
1489 }
1490 ast::LookaroundKind::NegativeLookahead => Ok(tb.mk_neg_lookahead(body, 0)),
1491 ast::LookaroundKind::NegativeLookbehind => Ok(tb.mk_neg_lookbehind(body)),
1492 }
1493 }
1494 Ast::Group(g) => {
1495 if let ast::GroupKind::NonCapturing(ref flags) = g.kind {
1496 if !flags.items.is_empty() {
1497 let mut translator_builder = self.default_translator_builder();
1498 if let Some(state) = flags.flag_state(ast::Flag::CaseInsensitive) {
1499 translator_builder.case_insensitive(state);
1500 }
1501 if let Some(state) = flags.flag_state(ast::Flag::Unicode) {
1502 translator_builder.unicode(state);
1503 }
1504 let saved_dot_all = self.dot_all.get();
1505 if let Some(state) = flags.flag_state(ast::Flag::DotMatchesNewLine) {
1506 self.dot_all.set(state);
1507 }
1508 let mut scoped = Some(translator_builder.build());
1509 let result = self.ast_to_node_id(&g.ast, &mut scoped, tb);
1510 self.dot_all.set(saved_dot_all);
1511 return result;
1512 }
1513 }
1514 self.ast_to_node_id(&g.ast, translator, tb)
1515 }
1516 Ast::Alternation(a) => {
1517 let mut children = vec![];
1518 for ast in &a.asts {
1519 match self.ast_to_node_id(ast, translator, tb) {
1520 Ok(node_id) => children.push(node_id),
1521 Err(err) => return Err(err),
1522 }
1523 }
1524 Ok(tb.mk_unions(children.iter().copied()))
1525 }
1526 Ast::Concat(c) => {
1527 let mut concat_translator: Option<Translator> = None;
1528 let mut children = vec![];
1529 let mut i = 0;
1530 while i < c.asts.len() {
1531 let ast = &c.asts[i];
1532 match ast {
1533 Ast::Flags(f) => {
1534 let mut translator_builder = self.default_translator_builder();
1535 if let Some(state) = f.flags.flag_state(ast::Flag::CaseInsensitive) {
1536 translator_builder.case_insensitive(state);
1537 }
1538 if let Some(state) = f.flags.flag_state(ast::Flag::Unicode) {
1539 translator_builder.unicode(state);
1540 }
1541 if let Some(state) = f.flags.flag_state(ast::Flag::DotMatchesNewLine) {
1542 self.dot_all.set(state);
1543 }
1544 concat_translator = Some(translator_builder.build());
1545 i += 1;
1546 continue;
1547 }
1548 Ast::Assertion(a) if a.kind == ast::AssertionKind::WordBoundary => {
1549 let node =
1550 self.rewrite_word_boundary_in_concat(&c.asts, i, translator, tb)?;
1551 children.push(node.0);
1552 i = node.1; continue;
1554 }
1555 _ => {}
1556 }
1557 match concat_translator {
1558 Some(_) => match self.ast_to_node_id(ast, &mut concat_translator, tb) {
1559 Ok(node_id) => children.push(node_id),
1560 Err(err) => return Err(err),
1561 },
1562 None => match self.ast_to_node_id(ast, translator, tb) {
1563 Ok(node_id) => children.push(node_id),
1564 Err(err) => return Err(err),
1565 },
1566 }
1567 i += 1;
1568 }
1569 Ok(tb.mk_concats(children.iter().cloned()))
1570 }
1571 Ast::Intersection(intersection) => {
1572 let mut children = vec![];
1573 for ast in &intersection.asts {
1574 match self.ast_to_node_id(ast, translator, tb) {
1575 Ok(node_id) => children.push(node_id),
1576 Err(err) => return Err(err),
1577 }
1578 }
1579 Ok(tb.mk_inters(children.into_iter()))
1580 }
1581 Ast::Complement(complement) => {
1582 let body = self.ast_to_node_id(&complement.ast, translator, tb);
1583 body.map(|x| tb.mk_compl(x))
1584 }
1585 }
1586 }
1587
1588 fn parse(&mut self, tb: &mut TB<'s>) -> Result<NodeId> {
1591 let mut concat = Concat {
1592 span: self.span(),
1593 asts: vec![],
1594 };
1595 loop {
1596 self.bump_space();
1597 if self.is_eof() {
1598 break;
1599 }
1600 match self.char() {
1601 '(' => concat = self.push_group(concat)?,
1602 ')' => concat = self.pop_group(concat)?,
1603 '|' => concat = self.push_alternate(concat)?,
1604 '&' => concat = self.push_intersect(concat)?,
1605 '~' => concat = self.push_compl_group(concat)?,
1606 '[' => {
1607 let class = self.parse_set_class()?;
1608 concat.asts.push(Ast::class_bracketed(class));
1609 }
1610 '?' => {
1611 concat =
1612 self.parse_uncounted_repetition(concat, ast::RepetitionKind::ZeroOrOne)?;
1613 }
1614 '*' => {
1615 concat =
1616 self.parse_uncounted_repetition(concat, ast::RepetitionKind::ZeroOrMore)?;
1617 }
1618 '+' => {
1619 concat =
1620 self.parse_uncounted_repetition(concat, ast::RepetitionKind::OneOrMore)?;
1621 }
1622 '{' => {
1623 concat = self.parse_counted_repetition(concat)?;
1624 }
1625 _ => concat.asts.push(self.parse_primitive()?.into_ast()),
1626 }
1627 }
1628 let ast = self.pop_group_end(concat)?;
1629 self.ast_to_node_id(&ast, &mut None, tb)
1630 }
1631
1632 #[inline(never)]
1633 fn parse_uncounted_repetition(
1634 &self,
1635 mut concat: ast::Concat,
1636 kind: ast::RepetitionKind,
1637 ) -> Result<ast::Concat> {
1638 let op_start = self.pos();
1640 let ast = match concat.asts.pop() {
1641 Some(ast) => ast,
1642 None => return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing)),
1643 };
1644 match ast {
1645 Ast::Empty(_) | Ast::Flags(_) => {
1646 return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing))
1647 }
1648 _ => {}
1649 }
1650 if self.bump() && self.char() == '?' {
1651 return Err(self.error(
1652 Span::new(op_start, self.pos()),
1653 ast::ErrorKind::UnsupportedLazyQuantifier,
1654 ));
1655 }
1656 concat.asts.push(Ast::repetition(ast::Repetition {
1657 span: ast.span().with_end(self.pos()),
1658 op: ast::RepetitionOp {
1659 span: Span::new(op_start, self.pos()),
1660 kind,
1661 },
1662 greedy: true,
1663 ast: Box::new(ast),
1664 }));
1665 Ok(concat)
1666 }
1667
1668 #[inline(never)]
1669 fn parse_counted_repetition(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
1670 assert!(self.char() == '{');
1671 let start = self.pos();
1672 let ast = match concat.asts.pop() {
1673 Some(ast) => ast,
1674 None => return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing)),
1675 };
1676 match ast {
1677 Ast::Empty(_) | Ast::Flags(_) => {
1678 return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing))
1679 }
1680 _ => {}
1681 }
1682 if !self.bump_and_bump_space() {
1683 return Err(self.error(
1684 Span::new(start, self.pos()),
1685 ast::ErrorKind::RepetitionCountUnclosed,
1686 ));
1687 }
1688 let count_start = specialize_err(
1689 self.parse_decimal(),
1690 ast::ErrorKind::DecimalEmpty,
1691 ast::ErrorKind::RepetitionCountDecimalEmpty,
1692 );
1693 if self.is_eof() {
1694 return Err(self.error(
1695 Span::new(start, self.pos()),
1696 ast::ErrorKind::RepetitionCountUnclosed,
1697 ));
1698 }
1699 let range = if self.char() == ',' {
1700 if !self.bump_and_bump_space() {
1701 return Err(self.error(
1702 Span::new(start, self.pos()),
1703 ast::ErrorKind::RepetitionCountUnclosed,
1704 ));
1705 }
1706 if self.char() != '}' {
1707 let count_start = match count_start {
1708 Ok(c) => c,
1709 Err(err) if err.kind == ast::ErrorKind::RepetitionCountDecimalEmpty => {
1710 if self.parser().empty_min_range {
1711 0
1712 } else {
1713 return Err(err);
1714 }
1715 }
1716 err => err?,
1717 };
1718 let count_end = specialize_err(
1719 self.parse_decimal(),
1720 ast::ErrorKind::DecimalEmpty,
1721 ast::ErrorKind::RepetitionCountDecimalEmpty,
1722 )?;
1723 ast::RepetitionRange::Bounded(count_start, count_end)
1724 } else {
1725 ast::RepetitionRange::AtLeast(count_start?)
1726 }
1727 } else {
1728 ast::RepetitionRange::Exactly(count_start?)
1729 };
1730
1731 if self.is_eof() || self.char() != '}' {
1732 return Err(self.error(
1733 Span::new(start, self.pos()),
1734 ast::ErrorKind::RepetitionCountUnclosed,
1735 ));
1736 }
1737
1738 if self.bump_and_bump_space() && self.char() == '?' {
1739 return Err(self.error(
1740 Span::new(start, self.pos()),
1741 ast::ErrorKind::UnsupportedLazyQuantifier,
1742 ));
1743 }
1744
1745 let op_span = Span::new(start, self.pos());
1746 if !range.is_valid() {
1747 return Err(self.error(op_span, ast::ErrorKind::RepetitionCountInvalid));
1748 }
1749 concat.asts.push(Ast::repetition(ast::Repetition {
1750 span: ast.span().with_end(self.pos()),
1751 op: ast::RepetitionOp {
1752 span: op_span,
1753 kind: ast::RepetitionKind::Range(range),
1754 },
1755 greedy: true,
1756 ast: Box::new(ast),
1757 }));
1758 Ok(concat)
1759 }
1760
1761 #[inline(never)]
1762 fn parse_group(&self) -> Result<Either<ast::SetFlags, ast::Group>> {
1763 assert_eq!(self.char(), '(');
1764 let open_span = self.span_char();
1765 self.bump();
1766 self.bump_space();
1767 if let Some((ahead, pos)) = self.is_lookaround_prefix() {
1768 let kind = match (pos, ahead) {
1769 (true, true) => LookaroundKind::PositiveLookahead,
1770 (true, false) => LookaroundKind::PositiveLookbehind,
1771 (false, true) => LookaroundKind::NegativeLookahead,
1772 (false, false) => LookaroundKind::NegativeLookbehind,
1773 };
1774 return Ok(Either::Right(ast::Group {
1775 span: open_span,
1776 kind: ast::GroupKind::Lookaround(kind),
1777 ast: Box::new(Ast::empty(self.span())),
1778 }));
1779 }
1780 let inner_span = self.span();
1781 let mut starts_with_p = true;
1782 if self.bump_if("?P<") || {
1783 starts_with_p = false;
1784 self.bump_if("?<")
1785 } {
1786 let capture_index = self.next_capture_index(open_span)?;
1787 let name = self.parse_capture_name(capture_index)?;
1788 Ok(Either::Right(ast::Group {
1789 span: open_span,
1790 kind: ast::GroupKind::CaptureName {
1791 starts_with_p,
1792 name,
1793 },
1794 ast: Box::new(Ast::empty(self.span())),
1795 }))
1796 } else if self.bump_if("?") {
1797 if self.is_eof() {
1798 return Err(self.error(open_span, ast::ErrorKind::GroupUnclosed));
1799 }
1800 let flags = self.parse_flags()?;
1801 let char_end = self.char();
1802 self.bump();
1803 if char_end == ')' {
1804 if flags.items.is_empty() {
1807 return Err(self.error(inner_span, ast::ErrorKind::RepetitionMissing));
1808 }
1809 Ok(Either::Left(ast::SetFlags {
1810 span: Span {
1811 end: self.pos(),
1812 ..open_span
1813 },
1814 flags,
1815 }))
1816 } else {
1817 assert_eq!(char_end, ':');
1818 Ok(Either::Right(ast::Group {
1819 span: open_span,
1820 kind: ast::GroupKind::NonCapturing(flags),
1821 ast: Box::new(Ast::empty(self.span())),
1822 }))
1823 }
1824 } else {
1825 let capture_index = self.next_capture_index(open_span)?;
1826 Ok(Either::Right(ast::Group {
1827 span: open_span,
1828 kind: ast::GroupKind::CaptureIndex(capture_index),
1829 ast: Box::new(Ast::empty(self.span())),
1830 }))
1831 }
1832 }
1833
1834 #[inline(never)]
1835 fn parse_capture_name(&self, capture_index: u32) -> Result<ast::CaptureName> {
1836 if self.is_eof() {
1837 return Err(self.error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
1838 }
1839 let start = self.pos();
1840 loop {
1841 if self.char() == '>' {
1842 break;
1843 }
1844 if !is_capture_char(self.char(), self.pos() == start) {
1845 return Err(self.error(self.span_char(), ast::ErrorKind::GroupNameInvalid));
1846 }
1847 if !self.bump() {
1848 break;
1849 }
1850 }
1851 let end = self.pos();
1852 if self.is_eof() {
1853 return Err(self.error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
1854 }
1855 assert_eq!(self.char(), '>');
1856 self.bump();
1857 let name = &self.pattern()[start.offset..end.offset];
1858 if name.is_empty() {
1859 return Err(self.error(Span::new(start, start), ast::ErrorKind::GroupNameEmpty));
1860 }
1861 let capname = ast::CaptureName {
1862 span: Span::new(start, end),
1863 name: name.to_string(),
1864 index: capture_index,
1865 };
1866 self.add_capture_name(&capname)?;
1867 Ok(capname)
1868 }
1869
1870 #[inline(never)]
1871 fn parse_flags(&self) -> Result<ast::Flags> {
1872 let mut flags = ast::Flags {
1873 span: self.span(),
1874 items: vec![],
1875 };
1876 let mut last_was_negation = None;
1877 while self.char() != ':' && self.char() != ')' {
1878 if self.char() == '-' {
1879 last_was_negation = Some(self.span_char());
1880 let item = ast::FlagsItem {
1881 span: self.span_char(),
1882 kind: ast::FlagsItemKind::Negation,
1883 };
1884 if let Some(i) = flags.add_item(item) {
1885 return Err(self.error(
1886 self.span_char(),
1887 ast::ErrorKind::FlagRepeatedNegation {
1888 original: flags.items[i].span,
1889 },
1890 ));
1891 }
1892 } else {
1893 last_was_negation = None;
1894 let item = ast::FlagsItem {
1895 span: self.span_char(),
1896 kind: ast::FlagsItemKind::Flag(self.parse_flag()?),
1897 };
1898 if let Some(i) = flags.add_item(item) {
1899 return Err(self.error(
1900 self.span_char(),
1901 ast::ErrorKind::FlagDuplicate {
1902 original: flags.items[i].span,
1903 },
1904 ));
1905 }
1906 }
1907 if !self.bump() {
1908 return Err(self.error(self.span(), ast::ErrorKind::FlagUnexpectedEof));
1909 }
1910 }
1911 if let Some(span) = last_was_negation {
1912 return Err(self.error(span, ast::ErrorKind::FlagDanglingNegation));
1913 }
1914 flags.span.end = self.pos();
1915 Ok(flags)
1916 }
1917
1918 #[inline(never)]
1919 fn parse_flag(&self) -> Result<ast::Flag> {
1920 match self.char() {
1921 'i' => Ok(ast::Flag::CaseInsensitive),
1922 'm' => Ok(ast::Flag::MultiLine),
1923 's' => Ok(ast::Flag::DotMatchesNewLine),
1924 'U' => Ok(ast::Flag::SwapGreed),
1925 'u' => Ok(ast::Flag::Unicode),
1926 'R' => Ok(ast::Flag::CRLF),
1927 'x' => Ok(ast::Flag::IgnoreWhitespace),
1928 _ => Err(self.error(self.span_char(), ast::ErrorKind::FlagUnrecognized)),
1929 }
1930 }
1931
1932 fn parse_primitive(&self) -> Result<Primitive> {
1933 match self.char() {
1934 '\\' => self.parse_escape(),
1935 '_' => {
1936 let ast = Primitive::Top(self.span_char());
1937 self.bump();
1938 Ok(ast)
1939 }
1940 '.' => {
1941 let ast = Primitive::Dot(self.span_char());
1942 self.bump();
1943 Ok(ast)
1944 }
1945 '^' => {
1946 let ast = Primitive::Assertion(ast::Assertion {
1947 span: self.span_char(),
1948 kind: ast::AssertionKind::StartLine,
1949 });
1950 self.bump();
1951 Ok(ast)
1952 }
1953 '$' => {
1954 let ast = Primitive::Assertion(ast::Assertion {
1955 span: self.span_char(),
1956 kind: ast::AssertionKind::EndLine,
1957 });
1958 self.bump();
1959 Ok(ast)
1960 }
1961 c => {
1962 let ast = Primitive::Literal(Literal {
1963 span: self.span_char(),
1964 kind: LiteralKind::Verbatim,
1965 c,
1966 });
1967 self.bump();
1968 Ok(ast)
1969 }
1970 }
1971 }
1972
1973 #[inline(never)]
1974 fn parse_escape(&self) -> Result<Primitive> {
1975 assert_eq!(self.char(), '\\');
1976 let start = self.pos();
1977 if !self.bump() {
1978 return Err(self.error(
1979 Span::new(start, self.pos()),
1980 ast::ErrorKind::EscapeUnexpectedEof,
1981 ));
1982 }
1983 let c = self.char();
1984 match c {
1986 '0'..='9' => {
1987 if !self.parser().octal {
1988 return Err(self.error(
1989 Span::new(start, self.span_char().end),
1990 ast::ErrorKind::UnsupportedBackreference,
1991 ));
1992 }
1993 let mut lit = self.parse_octal();
1994 lit.span.start = start;
1995 return Ok(Primitive::Literal(lit));
1996 }
1997 'x' | 'u' | 'U' => {
2004 let mut lit = self.parse_hex()?;
2005 lit.span.start = start;
2006 return Ok(Primitive::Literal(lit));
2007 }
2008 'p' | 'P' => {
2009 let mut cls = self.parse_unicode_class()?;
2010 cls.span.start = start;
2011 return Ok(Primitive::Unicode(cls));
2012 }
2013 'd' | 's' | 'w' | 'D' | 'S' | 'W' => {
2014 let mut cls = self.parse_perl_class();
2015 cls.span.start = start;
2016 return Ok(Primitive::Perl(cls));
2017 }
2018 _ => {}
2019 }
2020
2021 self.bump();
2023 let span = Span::new(start, self.pos());
2024 if is_meta_character(c) {
2025 return Ok(Primitive::Literal(Literal {
2026 span,
2027 kind: LiteralKind::Meta,
2028 c,
2029 }));
2030 }
2031 if is_escapeable_character(c) {
2032 return Ok(Primitive::Literal(Literal {
2033 span,
2034 kind: LiteralKind::Superfluous,
2035 c,
2036 }));
2037 }
2038 let special = |kind, c| {
2039 Ok(Primitive::Literal(Literal {
2040 span,
2041 kind: LiteralKind::Special(kind),
2042 c,
2043 }))
2044 };
2045 match c {
2046 'a' => special(SpecialLiteralKind::Bell, '\x07'),
2047 'f' => special(SpecialLiteralKind::FormFeed, '\x0C'),
2048 't' => special(SpecialLiteralKind::Tab, '\t'),
2049 'n' => special(SpecialLiteralKind::LineFeed, '\n'),
2050 'r' => special(SpecialLiteralKind::CarriageReturn, '\r'),
2051 'v' => special(SpecialLiteralKind::VerticalTab, '\x0B'),
2052 'A' => Ok(Primitive::Assertion(ast::Assertion {
2053 span,
2054 kind: ast::AssertionKind::StartText,
2055 })),
2056 'z' => Ok(Primitive::Assertion(ast::Assertion {
2057 span,
2058 kind: ast::AssertionKind::EndText,
2059 })),
2060 'b' => {
2061 let mut wb = ast::Assertion {
2062 span,
2063 kind: ast::AssertionKind::WordBoundary,
2064 };
2065 if !self.is_eof() && self.char() == '{' {
2068 if let Some(kind) = self.maybe_parse_special_word_boundary(start)? {
2069 wb.kind = kind;
2070 wb.span.end = self.pos();
2071 }
2072 }
2073 Ok(Primitive::Assertion(wb))
2074 }
2075 'B' => Ok(Primitive::Assertion(ast::Assertion {
2076 span,
2077 kind: ast::AssertionKind::NotWordBoundary,
2078 })),
2079 '<' => Ok(Primitive::Assertion(ast::Assertion {
2080 span,
2081 kind: ast::AssertionKind::WordBoundaryStartAngle,
2082 })),
2083 '>' => Ok(Primitive::Assertion(ast::Assertion {
2084 span,
2085 kind: ast::AssertionKind::WordBoundaryEndAngle,
2086 })),
2087 _ => Err(self.error(span, ast::ErrorKind::EscapeUnrecognized)),
2088 }
2089 }
2090
2091 fn maybe_parse_special_word_boundary(
2092 &self,
2093 wb_start: Position,
2094 ) -> Result<Option<ast::AssertionKind>> {
2095 assert_eq!(self.char(), '{');
2096
2097 let is_valid_char = |c| matches!(c, 'A'..='Z' | 'a'..='z' | '-');
2098 let start = self.pos();
2099 if !self.bump_and_bump_space() {
2100 return Err(self.error(
2101 Span::new(wb_start, self.pos()),
2102 ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof,
2103 ));
2104 }
2105 let start_contents = self.pos();
2106 if !is_valid_char(self.char()) {
2111 self.parser().pos.set(start);
2112 return Ok(None);
2113 }
2114
2115 let mut scratch = self.parser().scratch.borrow_mut();
2117 scratch.clear();
2118 while !self.is_eof() && is_valid_char(self.char()) {
2119 scratch.push(self.char());
2120 self.bump_and_bump_space();
2121 }
2122 if self.is_eof() || self.char() != '}' {
2123 return Err(self.error(
2124 Span::new(start, self.pos()),
2125 ast::ErrorKind::SpecialWordBoundaryUnclosed,
2126 ));
2127 }
2128 let end = self.pos();
2129 self.bump();
2130 let kind = match scratch.as_str() {
2131 "start" => ast::AssertionKind::WordBoundaryStart,
2132 "end" => ast::AssertionKind::WordBoundaryEnd,
2133 "start-half" => ast::AssertionKind::WordBoundaryStartHalf,
2134 "end-half" => ast::AssertionKind::WordBoundaryEndHalf,
2135 _ => {
2136 return Err(self.error(
2137 Span::new(start_contents, end),
2138 ast::ErrorKind::SpecialWordBoundaryUnrecognized,
2139 ))
2140 }
2141 };
2142 Ok(Some(kind))
2143 }
2144
2145 #[inline(never)]
2146 fn parse_octal(&self) -> Literal {
2147 assert!(self.parser().octal);
2148 assert!('0' <= self.char() && self.char() <= '7');
2149 let start = self.pos();
2150 while self.bump()
2152 && '0' <= self.char()
2153 && self.char() <= '7'
2154 && self.pos().offset - start.offset <= 2
2155 {}
2156 let end = self.pos();
2157 let octal = &self.pattern()[start.offset..end.offset];
2158 let codepoint = u32::from_str_radix(octal, 8).expect("valid octal number");
2161 let c = char::from_u32(codepoint).expect("Unicode scalar value");
2164 Literal {
2165 span: Span::new(start, end),
2166 kind: LiteralKind::Octal,
2167 c,
2168 }
2169 }
2170
2171 #[inline(never)]
2172 fn parse_hex(&self) -> Result<Literal> {
2173 assert!(self.char() == 'x' || self.char() == 'u' || self.char() == 'U');
2174
2175 let hex_kind = match self.char() {
2176 'x' => HexLiteralKind::X,
2177 'u' => HexLiteralKind::UnicodeShort,
2178 _ => HexLiteralKind::UnicodeLong,
2179 };
2180 if !self.bump_and_bump_space() {
2181 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2182 }
2183 if self.char() == '{' {
2184 self.parse_hex_brace(hex_kind)
2185 } else {
2186 self.parse_hex_digits(hex_kind)
2187 }
2188 }
2189
2190 #[inline(never)]
2191 fn parse_hex_digits(&self, kind: HexLiteralKind) -> Result<Literal> {
2192 let mut scratch = self.parser().scratch.borrow_mut();
2193 scratch.clear();
2194
2195 let start = self.pos();
2196 for i in 0..kind.digits() {
2197 if i > 0 && !self.bump_and_bump_space() {
2198 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2199 }
2200 if !is_hex(self.char()) {
2201 return Err(self.error(self.span_char(), ast::ErrorKind::EscapeHexInvalidDigit));
2202 }
2203 scratch.push(self.char());
2204 }
2205 self.bump_and_bump_space();
2208 let end = self.pos();
2209 let hex = scratch.as_str();
2210 match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
2211 None => Err(self.error(Span::new(start, end), ast::ErrorKind::EscapeHexInvalid)),
2212 Some(c) => Ok(Literal {
2213 span: Span::new(start, end),
2214 kind: LiteralKind::HexFixed(kind),
2215 c,
2216 }),
2217 }
2218 }
2219
2220 #[inline(never)]
2221 fn parse_hex_brace(&self, kind: HexLiteralKind) -> Result<Literal> {
2222 let mut scratch = self.parser().scratch.borrow_mut();
2223 scratch.clear();
2224
2225 let brace_pos = self.pos();
2226 let start = self.span_char().end;
2227 while self.bump_and_bump_space() && self.char() != '}' {
2228 if !is_hex(self.char()) {
2229 return Err(self.error(self.span_char(), ast::ErrorKind::EscapeHexInvalidDigit));
2230 }
2231 scratch.push(self.char());
2232 }
2233 if self.is_eof() {
2234 return Err(self.error(
2235 Span::new(brace_pos, self.pos()),
2236 ast::ErrorKind::EscapeUnexpectedEof,
2237 ));
2238 }
2239 let end = self.pos();
2240 let hex = scratch.as_str();
2241 assert_eq!(self.char(), '}');
2242 self.bump_and_bump_space();
2243
2244 if hex.is_empty() {
2245 return Err(self.error(
2246 Span::new(brace_pos, self.pos()),
2247 ast::ErrorKind::EscapeHexEmpty,
2248 ));
2249 }
2250 match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
2251 None => Err(self.error(Span::new(start, end), ast::ErrorKind::EscapeHexInvalid)),
2252 Some(c) => Ok(Literal {
2253 span: Span::new(start, self.pos()),
2254 kind: LiteralKind::HexBrace(kind),
2255 c,
2256 }),
2257 }
2258 }
2259
2260 fn parse_decimal(&self) -> Result<u32> {
2261 let mut scratch = self.parser().scratch.borrow_mut();
2262 scratch.clear();
2263
2264 while !self.is_eof() && self.char().is_whitespace() {
2265 self.bump();
2266 }
2267 let start = self.pos();
2268 while !self.is_eof() && '0' <= self.char() && self.char() <= '9' {
2269 scratch.push(self.char());
2270 self.bump_and_bump_space();
2271 }
2272 let span = Span::new(start, self.pos());
2273 while !self.is_eof() && self.char().is_whitespace() {
2274 self.bump_and_bump_space();
2275 }
2276 let digits = scratch.as_str();
2277 if digits.is_empty() {
2278 return Err(self.error(span, ast::ErrorKind::DecimalEmpty));
2279 }
2280 match digits.parse::<u32>().ok() {
2281 Some(n) => Ok(n),
2282 None => Err(self.error(span, ast::ErrorKind::DecimalInvalid)),
2283 }
2284 }
2285
2286 #[inline(never)]
2287 fn parse_set_class(&self) -> Result<ClassBracketed> {
2288 assert_eq!(self.char(), '[');
2289
2290 let mut union = ClassSetUnion {
2291 span: self.span(),
2292 items: vec![],
2293 };
2294 loop {
2295 self.bump_space();
2296 if self.is_eof() {
2297 return Err(self.unclosed_class_error());
2298 }
2299 match self.char() {
2300 '[' => {
2301 if !self.parser().stack_class.borrow().is_empty() {
2306 if let Some(cls) = self.maybe_parse_ascii_class() {
2307 union.push(ClassSetItem::Ascii(cls));
2308 continue;
2309 }
2310 }
2311 union = self.push_class_open(union)?;
2312 }
2313 ']' => match self.pop_class(union)? {
2314 Either::Left(nested_union) => {
2315 union = nested_union;
2316 }
2317 Either::Right(class) => return Ok(class),
2318 },
2319 '&' if self.peek() == Some('&') => {
2320 assert!(self.bump_if("&&"));
2321 union = self.push_class_op(ClassSetBinaryOpKind::Intersection, union);
2322 }
2323 '-' if self.peek() == Some('-') => {
2324 assert!(self.bump_if("--"));
2325 union = self.push_class_op(ClassSetBinaryOpKind::Difference, union);
2326 }
2327 '~' if self.peek() == Some('~') => {
2328 assert!(self.bump_if("~~"));
2329 union = self.push_class_op(ClassSetBinaryOpKind::SymmetricDifference, union);
2330 }
2331 _ => {
2332 union.push(self.parse_set_class_range()?);
2333 }
2334 }
2335 }
2336 }
2337
2338 #[inline(never)]
2339 fn parse_set_class_range(&self) -> Result<ClassSetItem> {
2340 let prim1 = self.parse_set_class_item()?;
2341 self.bump_space();
2342 if self.is_eof() {
2343 return Err(self.unclosed_class_error());
2344 }
2345 if self.char() != '-' || self.peek_space() == Some(']') || self.peek_space() == Some('-') {
2346 return prim1.into_class_set_item(self);
2347 }
2348 if !self.bump_and_bump_space() {
2349 return Err(self.unclosed_class_error());
2350 }
2351 let prim2 = self.parse_set_class_item()?;
2352 let range = ClassSetRange {
2353 span: Span::new(prim1.span().start, prim2.span().end),
2354 start: prim1.into_class_literal(self)?,
2355 end: prim2.into_class_literal(self)?,
2356 };
2357 if !range.is_valid() {
2358 return Err(self.error(range.span, ast::ErrorKind::ClassRangeInvalid));
2359 }
2360 Ok(ClassSetItem::Range(range))
2361 }
2362
2363 #[inline(never)]
2364 fn parse_set_class_item(&self) -> Result<Primitive> {
2365 if self.char() == '\\' {
2366 self.parse_escape()
2367 } else {
2368 let x = Primitive::Literal(Literal {
2369 span: self.span_char(),
2370 kind: LiteralKind::Verbatim,
2371 c: self.char(),
2372 });
2373 self.bump();
2374 Ok(x)
2375 }
2376 }
2377
2378 #[inline(never)]
2379 fn parse_set_class_open(&self) -> Result<(ClassBracketed, ClassSetUnion)> {
2380 assert_eq!(self.char(), '[');
2381 let start = self.pos();
2382 if !self.bump_and_bump_space() {
2383 return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2384 }
2385
2386 let negated = if self.char() != '^' {
2387 false
2388 } else {
2389 if !self.bump_and_bump_space() {
2390 return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2391 }
2392 true
2393 };
2394 let mut union = ClassSetUnion {
2396 span: self.span(),
2397 items: vec![],
2398 };
2399 while self.char() == '-' {
2400 union.push(ClassSetItem::Literal(Literal {
2401 span: self.span_char(),
2402 kind: LiteralKind::Verbatim,
2403 c: '-',
2404 }));
2405 if !self.bump_and_bump_space() {
2406 return Err(self.error(Span::new(start, start), ast::ErrorKind::ClassUnclosed));
2407 }
2408 }
2409 if union.items.is_empty() && self.char() == ']' {
2412 union.push(ClassSetItem::Literal(Literal {
2413 span: self.span_char(),
2414 kind: LiteralKind::Verbatim,
2415 c: ']',
2416 }));
2417 if !self.bump_and_bump_space() {
2418 return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2419 }
2420 }
2421 let set = ClassBracketed {
2422 span: Span::new(start, self.pos()),
2423 negated,
2424 kind: ClassSet::union(ClassSetUnion {
2425 span: Span::new(union.span.start, union.span.start),
2426 items: vec![],
2427 }),
2428 };
2429 Ok((set, union))
2430 }
2431
2432 #[inline(never)]
2433 fn maybe_parse_ascii_class(&self) -> Option<ClassAscii> {
2434 assert_eq!(self.char(), '[');
2435 let start = self.pos();
2437 let mut negated = false;
2438 if !self.bump() || self.char() != ':' {
2439 self.parser().pos.set(start);
2440 return None;
2441 }
2442 if !self.bump() {
2443 self.parser().pos.set(start);
2444 return None;
2445 }
2446 if self.char() == '^' {
2447 negated = true;
2448 if !self.bump() {
2449 self.parser().pos.set(start);
2450 return None;
2451 }
2452 }
2453 let name_start = self.offset();
2454 while self.char() != ':' && self.bump() {}
2455 if self.is_eof() {
2456 self.parser().pos.set(start);
2457 return None;
2458 }
2459 let name = &self.pattern()[name_start..self.offset()];
2460 if !self.bump_if(":]") {
2461 self.parser().pos.set(start);
2462 return None;
2463 }
2464 let kind = match regex_syntax::ast::ClassAsciiKind::from_name(name) {
2465 Some(kind) => kind,
2466 None => {
2467 self.parser().pos.set(start);
2468 return None;
2469 }
2470 };
2471 Some(ClassAscii {
2472 span: Span::new(start, self.pos()),
2473 kind,
2474 negated,
2475 })
2476 }
2477
2478 #[inline(never)]
2479 fn parse_unicode_class(&self) -> Result<ClassUnicode> {
2480 assert!(self.char() == 'p' || self.char() == 'P');
2481
2482 let mut scratch = self.parser().scratch.borrow_mut();
2483 scratch.clear();
2484
2485 let negated = self.char() == 'P';
2486 if !self.bump_and_bump_space() {
2487 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2488 }
2489 let (start, kind) = if self.char() == '{' {
2490 let start = self.span_char().end;
2491 while self.bump_and_bump_space() && self.char() != '}' {
2492 scratch.push(self.char());
2493 }
2494 if self.is_eof() {
2495 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2496 }
2497 assert_eq!(self.char(), '}');
2498 self.bump();
2499
2500 let name = scratch.as_str();
2501 if let Some(i) = name.find("!=") {
2502 (
2503 start,
2504 ClassUnicodeKind::NamedValue {
2505 op: ClassUnicodeOpKind::NotEqual,
2506 name: name[..i].to_string(),
2507 value: name[i + 2..].to_string(),
2508 },
2509 )
2510 } else if let Some(i) = name.find(':') {
2511 (
2512 start,
2513 ClassUnicodeKind::NamedValue {
2514 op: ClassUnicodeOpKind::Colon,
2515 name: name[..i].to_string(),
2516 value: name[i + 1..].to_string(),
2517 },
2518 )
2519 } else if let Some(i) = name.find('=') {
2520 (
2521 start,
2522 ClassUnicodeKind::NamedValue {
2523 op: ClassUnicodeOpKind::Equal,
2524 name: name[..i].to_string(),
2525 value: name[i + 1..].to_string(),
2526 },
2527 )
2528 } else {
2529 (start, ClassUnicodeKind::Named(name.to_string()))
2530 }
2531 } else {
2532 let start = self.pos();
2533 let c = self.char();
2534 if c == '\\' {
2535 return Err(self.error(self.span_char(), ast::ErrorKind::UnicodeClassInvalid));
2536 }
2537 self.bump_and_bump_space();
2538 let kind = ClassUnicodeKind::OneLetter(c);
2539 (start, kind)
2540 };
2541 Ok(ClassUnicode {
2542 span: Span::new(start, self.pos()),
2543 negated,
2544 kind,
2545 })
2546 }
2547
2548 #[inline(never)]
2549 fn parse_perl_class(&self) -> ClassPerl {
2550 let c = self.char();
2551 let span = self.span_char();
2552 self.bump();
2553 let (negated, kind) = match c {
2554 'd' => (false, regex_syntax::ast::ClassPerlKind::Digit),
2555 'D' => (true, regex_syntax::ast::ClassPerlKind::Digit),
2556 's' => (false, regex_syntax::ast::ClassPerlKind::Space),
2557 'S' => (true, regex_syntax::ast::ClassPerlKind::Space),
2558 'w' => (false, regex_syntax::ast::ClassPerlKind::Word),
2559 'W' => (true, regex_syntax::ast::ClassPerlKind::Word),
2560 c => panic!("expected valid Perl class but got '{}'", c),
2561 };
2562 ClassPerl {
2563 span,
2564 kind,
2565 negated,
2566 }
2567 }
2568}
2569
2570pub fn parse_ast<'s>(
2571 tb: &mut TB<'s>,
2572 pattern: &'s str,
2573) -> std::result::Result<NodeId, ResharpError> {
2574 let mut p: ResharpParser<'s> = ResharpParser::new(pattern);
2575 p.parse(tb)
2576}
2577
2578pub fn parse_ast_with<'s>(
2579 tb: &mut TB<'s>,
2580 pattern: &'s str,
2581 flags: &PatternFlags,
2582) -> std::result::Result<NodeId, ResharpError> {
2583 let mut p: ResharpParser<'s> = ResharpParser::with_flags(pattern, flags);
2584 p.parse(tb)
2585}