1#![warn(dead_code)]
6mod ast;
7use std::cell::{Cell, RefCell};
8
9use ast::{Ast, Concat, ErrorKind, GroupKind, LookaroundKind};
10use regex_syntax::{
11 ast::{
12 ClassAscii, ClassBracketed, ClassPerl, ClassSet, ClassSetBinaryOpKind, ClassSetItem,
13 ClassSetRange, ClassSetUnion, ClassUnicode, ClassUnicodeKind, ClassUnicodeOpKind,
14 HexLiteralKind, Literal, LiteralKind, Position, Span, SpecialLiteralKind,
15 },
16 hir::{
17 self,
18 translate::{Translator, TranslatorBuilder},
19 },
20 utf8::Utf8Sequences,
21};
22use resharp_algebra::NodeId;
23
24type TB<'s> = resharp_algebra::RegexBuilder;
25
26pub struct PatternFlags {
28 pub unicode: bool,
30 pub case_insensitive: bool,
32 pub dot_matches_new_line: bool,
34 pub ignore_whitespace: bool,
36}
37
38impl Default for PatternFlags {
39 fn default() -> Self {
40 Self {
41 unicode: true,
42 case_insensitive: false,
43 dot_matches_new_line: false,
44 ignore_whitespace: false,
45 }
46 }
47}
48
49#[derive(Clone, Copy, PartialEq)]
50enum WordCharKind {
51 Word,
52 NonWord,
53 MaybeWord,
54 MaybeNonWord,
55 Unknown,
56 Edge,
57}
58
59fn is_word_byte(b: u8) -> bool {
60 b.is_ascii_alphanumeric() || b == b'_'
61}
62
63
64#[derive(Clone, Debug, Eq, PartialEq)]
65enum Primitive {
66 Literal(Literal),
67 Assertion(ast::Assertion),
68 Dot(Span),
69 Top(Span),
70 Perl(ClassPerl),
71 Unicode(ClassUnicode),
72}
73
74impl Primitive {
75 fn span(&self) -> &Span {
76 match *self {
77 Primitive::Literal(ref x) => &x.span,
78 Primitive::Assertion(ref x) => &x.span,
79 Primitive::Dot(ref span) => span,
80 Primitive::Top(ref span) => span,
81 Primitive::Perl(ref x) => &x.span,
82 Primitive::Unicode(ref x) => &x.span,
83 }
84 }
85
86 fn into_ast(self) -> Ast {
87 match self {
88 Primitive::Literal(lit) => Ast::literal(lit),
89 Primitive::Assertion(assert) => Ast::assertion(assert),
90 Primitive::Dot(span) => Ast::dot(span),
91 Primitive::Top(span) => Ast::top(span),
92 Primitive::Perl(cls) => Ast::class_perl(cls),
93 Primitive::Unicode(cls) => Ast::class_unicode(cls),
94 }
95 }
96
97 fn into_class_set_item(self, p: &ResharpParser) -> Result<regex_syntax::ast::ClassSetItem> {
98 use self::Primitive::*;
99 use regex_syntax::ast::ClassSetItem;
100
101 match self {
102 Literal(lit) => Ok(ClassSetItem::Literal(lit)),
103 Perl(cls) => Ok(ClassSetItem::Perl(cls)),
104 Unicode(cls) => Ok(ClassSetItem::Unicode(cls)),
105 x => Err(p.error(*x.span(), ast::ErrorKind::ClassEscapeInvalid)),
106 }
107 }
108
109 fn into_class_literal(self, p: &ResharpParser) -> Result<Literal> {
110 use self::Primitive::*;
111
112 match self {
113 Literal(lit) => Ok(lit),
114 x => Err(p.error(*x.span(), ast::ErrorKind::ClassRangeLiteral)),
115 }
116 }
117}
118
119#[derive(Clone, Debug, Eq, PartialEq)]
120pub enum Either<Left, Right> {
121 Left(Left),
122 Right(Right),
123}
124
125#[derive(Clone, Debug, Eq, PartialEq)]
126pub struct ResharpError {
127 pub kind: ErrorKind,
129 pattern: String,
132 pub span: Span,
134}
135
136impl std::fmt::Display for ResharpError {
137 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
138 write!(f, "{:?}: {:?}", self.kind, self.span)
139 }
140}
141impl std::error::Error for ResharpError {}
142
143type Result<T> = core::result::Result<T, ResharpError>;
144
145#[derive(Clone, Debug)]
146enum GroupState {
147 Group {
149 concat: Concat,
151 group: ast::Group,
153 ignore_whitespace: bool,
155 },
156 Alternation(ast::Alternation),
161 Intersection(ast::Intersection),
162}
163
164#[derive(Clone, Debug)]
165enum ClassState {
166 Open {
168 union: regex_syntax::ast::ClassSetUnion,
170 set: regex_syntax::ast::ClassBracketed,
174 },
175 Op {
178 kind: regex_syntax::ast::ClassSetBinaryOpKind,
180 lhs: regex_syntax::ast::ClassSet,
182 },
183}
184
185pub struct ResharpParser<'s> {
187 perl_classes: Vec<(bool, regex_syntax::ast::ClassPerlKind, NodeId)>,
188 unicode_classes: resharp_algebra::UnicodeClassCache,
189 pub translator: regex_syntax::hir::translate::Translator,
190 pub pattern: &'s str,
191 pos: Cell<Position>,
192 capture_index: Cell<u32>,
193 octal: bool,
194 empty_min_range: bool,
195 ignore_whitespace: Cell<bool>,
196 dot_all: Cell<bool>,
197 global_unicode: bool,
198 global_case_insensitive: bool,
199 comments: RefCell<Vec<ast::Comment>>,
200 stack_group: RefCell<Vec<GroupState>>,
201 stack_class: RefCell<Vec<ClassState>>,
202 capture_names: RefCell<Vec<ast::CaptureName>>,
203 scratch: RefCell<String>,
204}
205
206fn specialize_err<T>(result: Result<T>, from: ast::ErrorKind, to: ast::ErrorKind) -> Result<T> {
207 if let Err(e) = result {
208 if e.kind == from {
209 Err(ResharpError {
210 kind: to,
211 pattern: e.pattern,
212 span: e.span,
213 })
214 } else {
215 Err(e)
216 }
217 } else {
218 result
219 }
220}
221
222fn is_capture_char(c: char, first: bool) -> bool {
223 if first {
224 c == '_' || c.is_alphabetic()
225 } else {
226 c == '_' || c == '.' || c == '[' || c == ']' || c.is_alphanumeric()
227 }
228}
229
230pub fn is_meta_character(c: char) -> bool {
231 match c {
232 '\\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' | '[' | ']' | '{' | '}' | '^' | '$'
233 | '#' | '&' | '-' | '~' | '_' => true,
234 _ => false,
235 }
236}
237
238pub fn escape(text: &str) -> String {
240 let mut buf = String::new();
241 escape_into(text, &mut buf);
242 buf
243}
244
245pub fn escape_into(text: &str, buf: &mut String) {
247 buf.reserve(text.len());
248 for c in text.chars() {
249 if is_meta_character(c) {
250 buf.push('\\');
251 }
252 buf.push(c);
253 }
254}
255
256pub fn is_escapeable_character(c: char) -> bool {
257 if is_meta_character(c) {
259 return true;
260 }
261 if !c.is_ascii() {
264 return false;
265 }
266 match c {
271 '0'..='9' | 'A'..='Z' | 'a'..='z' => false,
272 '<' | '>' => false,
282 _ => true,
283 }
284}
285
286fn is_hex(c: char) -> bool {
287 ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F')
288}
289
290impl<'s> ResharpParser<'s> {
291 fn default_translator_builder(&self) -> TranslatorBuilder {
292 let mut trb = TranslatorBuilder::new();
293 trb.unicode(self.global_unicode);
294 trb.utf8(false);
295 trb.case_insensitive(self.global_case_insensitive);
296 trb
297 }
298
299 pub fn new(pattern: &'s str) -> Self {
300 Self::with_flags(pattern, &PatternFlags::default())
301 }
302
303 pub fn with_flags(pattern: &'s str, flags: &PatternFlags) -> Self {
304 let mut trb = TranslatorBuilder::new();
305 trb.unicode(flags.unicode);
306 trb.utf8(false);
307 trb.case_insensitive(flags.case_insensitive);
308 Self {
309 translator: trb.build(),
310 pattern,
311 perl_classes: vec![],
312 unicode_classes: resharp_algebra::UnicodeClassCache::default(),
313 pos: Cell::new(Position::new(0, 0, 0)),
314 capture_index: Cell::new(0),
315 octal: false,
316 empty_min_range: false,
317 ignore_whitespace: Cell::new(flags.ignore_whitespace),
318 dot_all: Cell::new(flags.dot_matches_new_line),
319 global_unicode: flags.unicode,
320 global_case_insensitive: flags.case_insensitive,
321 comments: RefCell::new(vec![]),
322 stack_group: RefCell::new(vec![]),
323 stack_class: RefCell::new(vec![]),
324 capture_names: RefCell::new(vec![]),
325 scratch: RefCell::new(String::new()),
326 }
327 }
328
329 fn parser(&'_ self) -> &'_ ResharpParser<'_> {
331 self
332 }
333
334 fn pattern(&self) -> &str {
336 self.pattern
337 }
338
339 fn error(&self, span: Span, kind: ast::ErrorKind) -> ResharpError {
341 ResharpError {
342 kind,
343 pattern: self.pattern().to_string(),
344 span,
345 }
346 }
347
348 fn unsupported_error(&self, _: regex_syntax::hir::Error) -> ResharpError {
349 let emptyspan = Span::splat(self.pos());
350 let inner = self.error(emptyspan, ast::ErrorKind::UnsupportedResharpRegex);
351 inner
352 }
353
354 fn offset(&self) -> usize {
359 self.parser().pos.get().offset
360 }
361
362 fn line(&self) -> usize {
366 self.parser().pos.get().line
367 }
368
369 fn column(&self) -> usize {
373 self.parser().pos.get().column
374 }
375
376 fn next_capture_index(&self, span: Span) -> Result<u32> {
384 let current = self.parser().capture_index.get();
385 let i = current
386 .checked_add(1)
387 .ok_or_else(|| self.error(span, ast::ErrorKind::CaptureLimitExceeded))?;
388 self.parser().capture_index.set(i);
389 Ok(i)
390 }
391
392 fn add_capture_name(&self, cap: &ast::CaptureName) -> Result<()> {
393 let mut names = self.parser().capture_names.borrow_mut();
394 match names.binary_search_by_key(&cap.name.as_str(), |c| c.name.as_str()) {
395 Err(i) => {
396 names.insert(i, cap.clone());
397 Ok(())
398 }
399 Ok(i) => Err(self.error(
400 cap.span,
401 ast::ErrorKind::GroupNameDuplicate {
402 original: names[i].span,
403 },
404 )),
405 }
406 }
407
408 fn ignore_whitespace(&self) -> bool {
409 self.parser().ignore_whitespace.get()
410 }
411
412 fn char(&self) -> char {
413 self.char_at(self.offset())
414 }
415
416 fn char_at(&self, i: usize) -> char {
417 self.pattern()[i..]
418 .chars()
419 .next()
420 .unwrap_or_else(|| panic!("expected char at offset {}", i))
421 }
422
423 fn bump(&self) -> bool {
424 if self.is_eof() {
425 return false;
426 }
427 let Position {
428 mut offset,
429 mut line,
430 mut column,
431 } = self.pos();
432 if self.char() == '\n' {
433 line = line.checked_add(1).unwrap();
434 column = 1;
435 } else {
436 column = column.checked_add(1).unwrap();
437 }
438 offset += self.char().len_utf8();
439 self.parser().pos.set(Position {
440 offset,
441 line,
442 column,
443 });
444 self.pattern()[self.offset()..].chars().next().is_some()
445 }
446
447 fn bump_if(&self, prefix: &str) -> bool {
448 if self.pattern()[self.offset()..].starts_with(prefix) {
449 for _ in 0..prefix.chars().count() {
450 self.bump();
451 }
452 true
453 } else {
454 false
455 }
456 }
457
458 fn is_lookaround_prefix(&self) -> Option<(bool, bool)> {
459 if self.bump_if("?=") {
460 return Some((true, true));
461 }
462 if self.bump_if("?!") {
463 return Some((true, false));
464 }
465 if self.bump_if("?<=") {
466 return Some((false, true));
467 }
468 if self.bump_if("?<!") {
469 return Some((false, false));
470 }
471 return None;
472 }
474
475 fn bump_and_bump_space(&self) -> bool {
476 if !self.bump() {
477 return false;
478 }
479 self.bump_space();
480 !self.is_eof()
481 }
482
483 fn bump_space(&self) {
484 if !self.ignore_whitespace() {
485 return;
486 }
487 while !self.is_eof() {
488 if self.char().is_whitespace() {
489 self.bump();
490 } else if self.char() == '#' {
491 let start = self.pos();
492 let mut comment_text = String::new();
493 self.bump();
494 while !self.is_eof() {
495 let c = self.char();
496 self.bump();
497 if c == '\n' {
498 break;
499 }
500 comment_text.push(c);
501 }
502 let comment = ast::Comment {
503 span: Span::new(start, self.pos()),
504 comment: comment_text,
505 };
506 self.parser().comments.borrow_mut().push(comment);
507 } else {
508 break;
509 }
510 }
511 }
512
513 fn peek(&self) -> Option<char> {
517 if self.is_eof() {
518 return None;
519 }
520 self.pattern()[self.offset() + self.char().len_utf8()..]
521 .chars()
522 .next()
523 }
524
525 fn peek_space(&self) -> Option<char> {
528 if !self.ignore_whitespace() {
529 return self.peek();
530 }
531 if self.is_eof() {
532 return None;
533 }
534 let mut start = self.offset() + self.char().len_utf8();
535 let mut in_comment = false;
536 for (i, c) in self.pattern()[start..].char_indices() {
537 if c.is_whitespace() {
538 continue;
539 } else if !in_comment && c == '#' {
540 in_comment = true;
541 } else if in_comment && c == '\n' {
542 in_comment = false;
543 } else {
544 start += i;
545 break;
546 }
547 }
548 self.pattern()[start..].chars().next()
549 }
550
551 fn is_eof(&self) -> bool {
553 self.offset() == self.pattern().len()
554 }
555
556 fn pos(&self) -> Position {
559 self.parser().pos.get()
560 }
561
562 fn span(&self) -> Span {
565 Span::splat(self.pos())
566 }
567
568 fn span_char(&self) -> Span {
570 let mut next = Position {
571 offset: self.offset().checked_add(self.char().len_utf8()).unwrap(),
572 line: self.line(),
573 column: self.column().checked_add(1).unwrap(),
574 };
575 if self.char() == '\n' {
576 next.line += 1;
577 next.column = 1;
578 }
579 Span::new(self.pos(), next)
580 }
581
582 #[inline(never)]
592 fn push_alternate(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
593 assert_eq!(self.char(), '|');
594 concat.span.end = self.pos();
595 self.push_or_add_alternation(concat);
596 self.bump();
597 Ok(ast::Concat {
598 span: self.span(),
599 asts: vec![],
600 })
601 }
602
603 fn push_or_add_alternation(&self, concat: Concat) {
606 use self::GroupState::*;
607
608 let mut stack = self.parser().stack_group.borrow_mut();
609 if let Some(&mut Alternation(ref mut alts)) = stack.last_mut() {
610 alts.asts.push(concat.into_ast());
611 return;
612 }
613 stack.push(Alternation(ast::Alternation {
614 span: Span::new(concat.span.start, self.pos()),
615 asts: vec![concat.into_ast()],
616 }));
617 }
618
619 #[inline(never)]
620 fn push_intersect(&self, mut concat: Concat) -> Result<Concat> {
621 assert_eq!(self.char(), '&');
622 concat.span.end = self.pos();
623 self.push_or_add_intersect(concat);
624 self.bump();
625 Ok(Concat {
626 span: self.span(),
627 asts: vec![],
628 })
629 }
630
631 fn push_or_add_intersect(&self, concat: Concat) {
634 use self::GroupState::*;
635
636 let mut stack = self.parser().stack_group.borrow_mut();
637 if let Some(&mut Intersection(ref mut alts)) = stack.last_mut() {
638 alts.asts.push(concat.into_ast());
639 return;
640 }
641 stack.push(Intersection(ast::Intersection {
642 span: Span::new(concat.span.start, self.pos()),
643 asts: vec![concat.into_ast()],
644 }));
645 }
646
647 #[inline(never)]
661 fn push_group(&self, mut concat: Concat) -> Result<Concat> {
662 assert_eq!(self.char(), '(');
663 match self.parse_group()? {
664 Either::Left(set) => {
665 let ignore = set.flags.flag_state(ast::Flag::IgnoreWhitespace);
666 if let Some(v) = ignore {
667 self.parser().ignore_whitespace.set(v);
668 }
669
670 concat.asts.push(Ast::flags(set));
671 Ok(concat)
672 }
673 Either::Right(group) => {
674 let old_ignore_whitespace = self.ignore_whitespace();
675 let new_ignore_whitespace = group
676 .flags()
677 .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
678 .unwrap_or(old_ignore_whitespace);
679 self.parser()
680 .stack_group
681 .borrow_mut()
682 .push(GroupState::Group {
683 concat,
684 group,
685 ignore_whitespace: old_ignore_whitespace,
686 });
687 self.parser().ignore_whitespace.set(new_ignore_whitespace);
688 Ok(Concat {
689 span: self.span(),
690 asts: vec![],
691 })
692 }
693 }
694 }
695
696 #[inline(never)]
697 fn push_compl_group(&self, concat: Concat) -> Result<Concat> {
698 assert_eq!(self.char(), '~');
699 self.bump();
700 if self.is_eof() || self.char() != '(' {
701 return Err(self.error(self.span(), ast::ErrorKind::ComplementGroupExpected));
702 }
703 let open_span = self.span_char();
704 self.bump();
705 let group = ast::Group {
706 span: open_span,
707 kind: ast::GroupKind::Complement,
708 ast: Box::new(Ast::empty(self.span())),
709 };
710
711 let old_ignore_whitespace = self.ignore_whitespace();
712 let new_ignore_whitespace = group
713 .flags()
714 .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
715 .unwrap_or(old_ignore_whitespace);
716 self.parser()
717 .stack_group
718 .borrow_mut()
719 .push(GroupState::Group {
720 concat,
721 group,
722 ignore_whitespace: old_ignore_whitespace,
723 });
724 self.parser().ignore_whitespace.set(new_ignore_whitespace);
725 Ok(Concat {
726 span: self.span(),
727 asts: vec![],
728 })
729 }
730
731 #[inline(never)]
741 fn pop_group(&self, mut group_concat: Concat) -> Result<Concat> {
742 use self::GroupState::*;
743 assert_eq!(self.char(), ')');
744 let mut stack = self.parser().stack_group.borrow_mut();
745 let topstack = stack.pop();
746
747 let (mut prior_concat, mut group, ignore_whitespace, alt) = match topstack {
748 Some(Group {
749 concat,
750 group,
751 ignore_whitespace,
752 }) => (concat, group, ignore_whitespace, None),
753 Some(Alternation(alt)) => match stack.pop() {
754 Some(Group {
755 concat,
756 group,
757 ignore_whitespace,
758 }) => (
759 concat,
760 group,
761 ignore_whitespace,
762 Some(Either::Left::<ast::Alternation, ast::Intersection>(alt)),
763 ),
764 None | Some(Alternation(_)) | Some(Intersection(_)) => {
765 return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
766 }
767 },
768 Some(Intersection(int)) => match stack.pop() {
769 Some(Group {
770 concat,
771 group,
772 ignore_whitespace,
773 }) => (
774 concat,
775 group,
776 ignore_whitespace,
777 Some(Either::Right::<ast::Alternation, ast::Intersection>(int)),
778 ),
779 None | Some(Alternation(_)) | Some(Intersection(_)) => {
780 return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
781 }
782 },
783
784 None => {
785 return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
786 }
787 };
788 self.parser().ignore_whitespace.set(ignore_whitespace);
789 group_concat.span.end = self.pos();
790 self.bump();
791 group.span.end = self.pos();
792 match alt {
793 Some(Either::Left(mut alt)) => {
794 alt.span.end = group_concat.span.end;
795 alt.asts.push(group_concat.into_ast());
796 group.ast = Box::new(alt.into_ast());
797 }
798 Some(Either::Right(mut int)) => {
799 int.span.end = group_concat.span.end;
800 int.asts.push(group_concat.into_ast());
801 group.ast = Box::new(int.into_ast());
802 }
803 None => {
804 group.ast = Box::new(group_concat.into_ast());
805 }
806 }
807
808 if group.kind == GroupKind::Complement {
809 let complement = ast::Complement {
810 span: self.span(),
811 ast: group.ast,
812 };
813 prior_concat.asts.push(Ast::complement(complement));
814 }
815 else {
817 prior_concat.asts.push(Ast::group(group));
819 }
820 Ok(prior_concat)
821 }
822
823 #[inline(never)]
830 fn pop_group_end(&self, mut concat: ast::Concat) -> Result<Ast> {
831 concat.span.end = self.pos();
832 let mut stack = self.parser().stack_group.borrow_mut();
834 let ast = match stack.pop() {
835 None => Ok(concat.into_ast()),
836 Some(GroupState::Alternation(mut alt)) => {
837 alt.span.end = self.pos();
838 alt.asts.push(concat.into_ast());
839 Ok(Ast::alternation(alt))
840 }
841 Some(GroupState::Intersection(mut int)) => {
842 int.span.end = self.pos();
843 int.asts.push(concat.into_ast());
844
845 Ok(Ast::intersection(int))
847 }
848 Some(GroupState::Group { group, .. }) => {
849 return Err(self.error(group.span, ast::ErrorKind::GroupUnclosed));
850 }
851 };
852 match stack.pop() {
854 None => ast,
855 Some(GroupState::Alternation(_)) => {
856 unreachable!()
863 }
864 Some(GroupState::Intersection(_)) => {
865 unreachable!()
866 }
867 Some(GroupState::Group { group, .. }) => {
868 Err(self.error(group.span, ast::ErrorKind::GroupUnclosed))
869 }
870 }
871 }
872
873 #[inline(never)]
882 fn push_class_open(
883 &self,
884 parent_union: regex_syntax::ast::ClassSetUnion,
885 ) -> Result<regex_syntax::ast::ClassSetUnion> {
886 assert_eq!(self.char(), '[');
887
888 let (nested_set, nested_union) = self.parse_set_class_open()?;
889 self.parser()
890 .stack_class
891 .borrow_mut()
892 .push(ClassState::Open {
893 union: parent_union,
894 set: nested_set,
895 });
896 Ok(nested_union)
897 }
898
899 #[inline(never)]
914 fn pop_class(
915 &self,
916 nested_union: regex_syntax::ast::ClassSetUnion,
917 ) -> Result<Either<regex_syntax::ast::ClassSetUnion, regex_syntax::ast::ClassBracketed>> {
918 assert_eq!(self.char(), ']');
919
920 let item = regex_syntax::ast::ClassSet::Item(nested_union.into_item());
921 let prevset = self.pop_class_op(item);
922 let mut stack = self.parser().stack_class.borrow_mut();
923 match stack.pop() {
924 None => {
925 panic!("unexpected empty character class stack")
934 }
935 Some(ClassState::Op { .. }) => {
936 panic!("unexpected ClassState::Op")
943 }
944 Some(ClassState::Open { mut union, mut set }) => {
945 self.bump();
946 set.span.end = self.pos();
947 set.kind = prevset;
948 if stack.is_empty() {
949 Ok(Either::Right(set))
950 } else {
951 union.push(regex_syntax::ast::ClassSetItem::Bracketed(Box::new(set)));
952 Ok(Either::Left(union))
953 }
954 }
955 }
956 }
957
958 #[inline(never)]
963 fn unclosed_class_error(&self) -> ResharpError {
964 for state in self.parser().stack_class.borrow().iter().rev() {
965 if let ClassState::Open { ref set, .. } = *state {
966 return self.error(set.span, ast::ErrorKind::ClassUnclosed);
967 }
968 }
969 panic!("no open character class found")
972 }
973
974 #[inline(never)]
980 fn push_class_op(
981 &self,
982 next_kind: regex_syntax::ast::ClassSetBinaryOpKind,
983 next_union: regex_syntax::ast::ClassSetUnion,
984 ) -> regex_syntax::ast::ClassSetUnion {
985 let item = regex_syntax::ast::ClassSet::Item(next_union.into_item());
986 let new_lhs = self.pop_class_op(item);
987 self.parser().stack_class.borrow_mut().push(ClassState::Op {
988 kind: next_kind,
989 lhs: new_lhs,
990 });
991 regex_syntax::ast::ClassSetUnion {
992 span: self.span(),
993 items: vec![],
994 }
995 }
996
997 #[inline(never)]
1003 fn pop_class_op(&self, rhs: regex_syntax::ast::ClassSet) -> regex_syntax::ast::ClassSet {
1004 let mut stack = self.parser().stack_class.borrow_mut();
1005 let (kind, lhs) = match stack.pop() {
1006 Some(ClassState::Op { kind, lhs }) => (kind, lhs),
1007 Some(state @ ClassState::Open { .. }) => {
1008 stack.push(state);
1009 return rhs;
1010 }
1011 None => unreachable!(),
1012 };
1013 let span = Span::new(lhs.span().start, rhs.span().end);
1014 regex_syntax::ast::ClassSet::BinaryOp(regex_syntax::ast::ClassSetBinaryOp {
1015 span,
1016 kind,
1017 lhs: Box::new(lhs),
1018 rhs: Box::new(rhs),
1019 })
1020 }
1021
1022 fn hir_to_node_id(&self, hir: &hir::Hir, tb: &mut TB<'s>) -> Result<NodeId> {
1023 match hir.kind() {
1024 hir::HirKind::Empty => Ok(NodeId::EPS),
1025 hir::HirKind::Literal(l) => {
1026 if l.0.len() == 1 {
1027 let node = tb.mk_u8(l.0[0]);
1028 Ok(node)
1029 } else {
1030 let ws: Vec<_> = l.0.iter().map(|l| tb.mk_u8(*l)).collect();
1031 let conc = tb.mk_concats(ws.iter().copied());
1032 Ok(conc)
1033 }
1034 }
1035 hir::HirKind::Class(class) => {
1036 match class {
1038 hir::Class::Unicode(class_unicode) => {
1039 let ranges = class_unicode.ranges();
1040 let mut nodes = Vec::new();
1041 for range in ranges {
1042 for seq in Utf8Sequences::new(range.start(), range.end()) {
1043 let sl = seq.as_slice();
1044 let bytes: Vec<_> = sl.iter().map(|s| (s.start, s.end)).collect();
1045 let node = match bytes.len() {
1046 1 => tb.mk_range_u8(bytes[0].0, bytes[0].1),
1047 n => {
1048 let last = tb.mk_range_u8(bytes[n - 1].0, bytes[n - 1].1);
1049 let mut conc = last;
1050 for i in (0..n - 1).rev() {
1051 let b = tb.mk_range_u8(bytes[i].0, bytes[i].1);
1052 conc = tb.mk_concat(b, conc);
1053 }
1054 conc
1055 }
1056 };
1057 nodes.push(node);
1058 }
1059 }
1060 let merged = tb.mk_unions(nodes.into_iter());
1061 Ok(merged)
1062 }
1063 hir::Class::Bytes(class_bytes) => {
1064 let ranges = class_bytes.ranges();
1065 let mut result = NodeId::BOT;
1066 for range in ranges {
1067 let start = range.start();
1068 let end = range.end();
1069 let node = tb.mk_range_u8(start, end);
1070 result = tb.mk_union(result, node);
1071 }
1072 Ok(result)
1073 }
1074 }
1075 }
1076 hir::HirKind::Look(_) => todo!(),
1077 hir::HirKind::Repetition(_) => todo!(),
1078 hir::HirKind::Capture(_) => todo!(),
1079 hir::HirKind::Concat(body) => {
1080 let mut result = NodeId::EPS;
1081 for child in body {
1082 let node = self.hir_to_node_id(child, tb)?;
1083 result = tb.mk_concat(result, node);
1084 }
1085 Ok(result)
1086 }
1087 hir::HirKind::Alternation(_) => todo!(),
1088 }
1089 }
1090
1091 fn translate_ast_to_hir(
1092 &mut self,
1093 orig_ast: ®ex_syntax::ast::Ast,
1094 tb: &mut TB<'s>,
1095 ) -> Result<NodeId> {
1096 match self.translator.translate("", orig_ast) {
1099 Err(_) => return Err(self.error(self.span(), ast::ErrorKind::UnicodeClassInvalid)),
1100 Ok(hir) => {
1101 let mapped = self.hir_to_node_id(&hir, tb);
1102 mapped
1103 }
1104 }
1105 }
1106
1107 fn translator_to_node_id(
1108 &mut self,
1109 orig_ast: ®ex_syntax::ast::Ast,
1110 translator: &mut Option<Translator>,
1111 tb: &mut TB<'s>,
1112 ) -> Result<NodeId> {
1113 match translator {
1114 Some(tr) => {
1115 let hir = tr
1116 .translate("", &orig_ast)
1117 .map_err(|e| self.unsupported_error(e))?;
1118 self.hir_to_node_id(&hir, tb)
1119 }
1120 None => self.translate_ast_to_hir(&orig_ast, tb),
1121 }
1122 }
1123
1124 fn get_class(
1125 &mut self,
1126 negated: bool,
1127 kind: regex_syntax::ast::ClassPerlKind,
1128 tb: &mut TB<'s>,
1129 ) -> Result<NodeId> {
1130 let w = self
1131 .perl_classes
1132 .iter()
1133 .find(|(c_neg, c_kind, _)| *c_kind == kind && *c_neg == negated);
1134 match w {
1135 Some((_, _, value)) => return Ok(*value),
1136 None => {
1137 let translated = if self.global_unicode {
1138 match kind {
1139 regex_syntax::ast::ClassPerlKind::Word => {
1140 self.unicode_classes.ensure_word(tb);
1141 if negated { self.unicode_classes.non_word } else { self.unicode_classes.word }
1142 }
1143 regex_syntax::ast::ClassPerlKind::Digit => {
1144 self.unicode_classes.ensure_digit(tb);
1145 if negated { self.unicode_classes.non_digit } else { self.unicode_classes.digit }
1146 }
1147 regex_syntax::ast::ClassPerlKind::Space => {
1148 self.unicode_classes.ensure_space(tb);
1149 if negated { self.unicode_classes.non_space } else { self.unicode_classes.space }
1150 }
1151 }
1152 } else {
1153 let pos = match kind {
1154 regex_syntax::ast::ClassPerlKind::Word => {
1155 let az = tb.mk_range_u8(b'a', b'z');
1156 let big = tb.mk_range_u8(b'A', b'Z');
1157 let dig = tb.mk_range_u8(b'0', b'9');
1158 let us = tb.mk_u8(b'_');
1159 tb.mk_unions([az, big, dig, us].into_iter())
1160 }
1161 regex_syntax::ast::ClassPerlKind::Digit => {
1162 tb.mk_range_u8(b'0', b'9')
1163 }
1164 regex_syntax::ast::ClassPerlKind::Space => {
1165 let sp = tb.mk_u8(b' ');
1166 let tab = tb.mk_u8(b'\t');
1167 let nl = tb.mk_u8(b'\n');
1168 let cr = tb.mk_u8(b'\r');
1169 let ff = tb.mk_u8(0x0C);
1170 let vt = tb.mk_u8(0x0B);
1171 tb.mk_unions([sp, tab, nl, cr, ff, vt].into_iter())
1172 }
1173 };
1174 if negated { tb.mk_compl(pos) } else { pos }
1175 };
1176 self.perl_classes.push((negated, kind, translated));
1177 Ok(translated)
1178 }
1179 }
1180 }
1181
1182 fn word_char_kind(ast: &Ast, left: bool) -> WordCharKind {
1183 use WordCharKind::*;
1184 match ast {
1185 Ast::Literal(lit) => {
1186 if is_word_byte(lit.c as u8) {
1187 Word
1188 } else {
1189 NonWord
1190 }
1191 }
1192 Ast::ClassPerl(c) => match (&c.kind, c.negated) {
1193 (®ex_syntax::ast::ClassPerlKind::Word, false) => Word,
1194 (®ex_syntax::ast::ClassPerlKind::Word, true) => NonWord,
1195 (®ex_syntax::ast::ClassPerlKind::Space, false) => NonWord,
1196 (®ex_syntax::ast::ClassPerlKind::Space, true) => Unknown,
1197 (®ex_syntax::ast::ClassPerlKind::Digit, false) => Word,
1198 (®ex_syntax::ast::ClassPerlKind::Digit, true) => Unknown,
1199 },
1200 Ast::Dot(_) | Ast::Top(_) => Unknown,
1201 Ast::Group(g) => Self::word_char_kind(&g.ast, left),
1202 Ast::Concat(c) if !c.asts.is_empty() => {
1203 Self::word_char_kind(&c.asts[if left { c.asts.len() - 1 } else { 0 }], left)
1204 }
1205 Ast::Repetition(r) => {
1206 let inner = Self::word_char_kind(&r.ast, left);
1207 let nullable = matches!(
1208 &r.op.kind,
1209 ast::RepetitionKind::ZeroOrMore
1210 | ast::RepetitionKind::ZeroOrOne
1211 | ast::RepetitionKind::Range(ast::RepetitionRange::Bounded(0, _))
1212 );
1213 if nullable {
1214 match inner {
1215 Word => MaybeWord,
1216 NonWord => MaybeNonWord,
1217 _ => Unknown,
1218 }
1219 } else {
1220 inner
1221 }
1222 }
1223 Ast::Lookaround(la) => Self::word_char_kind(&la.ast, left),
1224 _ => Unknown,
1225 }
1226 }
1227
1228 fn edge_class_ast<'a>(ast: &'a Ast, left: bool) -> Option<&'a Ast> {
1229 match ast {
1230 Ast::Literal(_) | Ast::ClassPerl(_) | Ast::ClassBracketed(_)
1231 | Ast::ClassUnicode(_) | Ast::Dot(_) | Ast::Top(_) => Some(ast),
1232 Ast::Group(g) => Self::edge_class_ast(&g.ast, left),
1233 Ast::Concat(c) if !c.asts.is_empty() => {
1234 Self::edge_class_ast(&c.asts[if left { c.asts.len() - 1 } else { 0 }], left)
1235 }
1236 Ast::Repetition(r) => Self::edge_class_ast(&r.ast, left),
1237 Ast::Lookaround(la) => Self::edge_class_ast(&la.ast, left),
1238 _ => None,
1239 }
1240 }
1241
1242 fn resolve_word_kind(
1243 &mut self,
1244 asts: &[Ast],
1245 idx: usize,
1246 dir: isize,
1247 translator: &mut Option<Translator>,
1248 tb: &mut TB<'s>,
1249 word_id: NodeId,
1250 not_word_id: NodeId,
1251 ) -> Result<WordCharKind> {
1252 use WordCharKind::*;
1253 let fast = Self::concat_neighbor_kind(asts, idx, dir);
1254 if fast != Unknown {
1255 return Ok(fast);
1256 }
1257 let neighbor_idx = (idx as isize + dir) as usize;
1258 let edge = match Self::edge_class_ast(&asts[neighbor_idx], dir < 0) {
1259 Some(e) => e,
1260 None => return Ok(Unknown),
1261 };
1262 let node = self.ast_to_node_id(edge, translator, tb)?;
1263 if tb.subsumes(word_id, node) == Some(true) {
1264 Ok(Word)
1265 } else if tb.subsumes(not_word_id, node) == Some(true) {
1266 Ok(NonWord)
1267 } else {
1268 Ok(Unknown)
1269 }
1270 }
1271
1272 fn concat_neighbor_kind(asts: &[Ast], idx: usize, dir: isize) -> WordCharKind {
1273 use WordCharKind::*;
1274 let next = idx as isize + dir;
1275 if next < 0 || next >= asts.len() as isize {
1276 return Edge;
1277 }
1278 let kind = Self::word_char_kind(&asts[next as usize], dir < 0);
1279 match kind {
1280 MaybeWord => match Self::concat_neighbor_kind(asts, next as usize, dir) {
1281 Word => Word,
1282 _ => Unknown,
1283 },
1284 MaybeNonWord => match Self::concat_neighbor_kind(asts, next as usize, dir) {
1285 NonWord => NonWord,
1286 _ => Unknown,
1287 },
1288 other => other,
1289 }
1290 }
1291
1292 fn rewrite_word_boundary_in_concat(
1293 &mut self,
1294 asts: &[Ast],
1295 idx: usize,
1296 translator: &mut Option<Translator>,
1297 tb: &mut TB<'s>,
1298 ) -> Result<(NodeId, usize)> {
1299 use WordCharKind::*;
1300 self.unicode_classes.ensure_word(tb);
1301 let word_id = self.unicode_classes.word;
1302 let not_word_id = self.unicode_classes.non_word;
1303 let left = self.resolve_word_kind(asts, idx, -1, translator, tb, word_id, not_word_id)?;
1304 let right =
1305 self.resolve_word_kind(asts, idx, 1, translator, tb, word_id, not_word_id)?;
1306
1307 match (left, right) {
1308 (NonWord, Word) | (Word, NonWord) => Ok((NodeId::EPS, idx + 1)),
1309 (Word, _) => {
1310 let neg = tb.mk_neg_lookahead(word_id, 0);
1311 Ok((neg, idx + 1))
1312 }
1313 (NonWord, _) => {
1314 let set = tb.mk_union(NodeId::END, word_id);
1315 let tail = tb.mk_concat(set, NodeId::TS);
1316 self.merge_boundary_with_following_lookaheads(asts, idx, tail, translator, tb)
1317 }
1318 (_, Word) => {
1319 Ok((tb.mk_neg_lookbehind(word_id), idx + 1))
1320 }
1321 (_, NonWord) => {
1322 let body = tb.mk_union(NodeId::BEGIN, word_id);
1323 Ok((tb.mk_lookbehind(body, NodeId::MISSING), idx + 1))
1324 }
1325 _ => return Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex)),
1328 }
1329 }
1330
1331 fn merge_boundary_with_following_lookaheads(
1332 &mut self,
1333 asts: &[Ast],
1334 wb_idx: usize,
1335 boundary_tail: NodeId,
1336 translator: &mut Option<Translator>,
1337 tb: &mut TB<'s>,
1338 ) -> Result<(NodeId, usize)> {
1339 let mut next = wb_idx + 1;
1340 let mut la_bodies = vec![boundary_tail];
1341 while next < asts.len() {
1342 match &asts[next] {
1343 Ast::Lookaround(la) if la.kind == ast::LookaroundKind::PositiveLookahead => {
1344 let body = self.ast_to_node_id(&la.ast, translator, tb)?;
1345 la_bodies.push(tb.mk_concat(body, NodeId::TS));
1346 next += 1;
1347 }
1348 _ => break,
1349 }
1350 }
1351 let merged = tb.mk_inters(la_bodies.into_iter());
1352 Ok((tb.mk_lookahead(merged, NodeId::MISSING, 0), next))
1353 }
1354
1355 fn ast_to_node_id(
1356 &mut self,
1357 ast: &Ast,
1358 translator: &mut Option<Translator>,
1359 tb: &mut TB<'s>,
1360 ) -> Result<NodeId> {
1361 match ast {
1362 Ast::Empty(_) => Ok(NodeId::EPS),
1363 Ast::Flags(f) => {
1364 let mut translator_builder = self.default_translator_builder();
1365 if let Some(state) = f.flags.flag_state(ast::Flag::CaseInsensitive) {
1366 translator_builder.case_insensitive(state);
1367 }
1368 if let Some(state) = f.flags.flag_state(ast::Flag::Unicode) {
1369 translator_builder.unicode(state);
1370 }
1371 if let Some(state) = f.flags.flag_state(ast::Flag::DotMatchesNewLine) {
1372 self.dot_all.set(state);
1373 }
1374 let concat_translator = Some(translator_builder.build());
1375 *translator = concat_translator;
1376 Ok(NodeId::EPS)
1377 }
1378 Ast::Literal(l) => {
1379 let ast_lit = regex_syntax::ast::Ast::literal(*l.to_owned());
1380 self.translator_to_node_id(&ast_lit, translator, tb)
1381 }
1382 Ast::Top(_) => Ok(NodeId::TOP),
1383 Ast::Dot(_) => {
1384 if self.dot_all.get() {
1385 Ok(NodeId::TOP)
1386 } else {
1387 let hirv = hir::Hir::dot(hir::Dot::AnyByteExceptLF);
1388 self.hir_to_node_id(&hirv, tb)
1389 }
1390 }
1391 Ast::Assertion(a) => match &a.kind {
1392 ast::AssertionKind::StartText => Ok(NodeId::BEGIN),
1393 ast::AssertionKind::EndText => Ok(NodeId::END),
1394 ast::AssertionKind::WordBoundary => {
1395 return Err(
1396 self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex),
1397 )
1398 }
1399 ast::AssertionKind::NotWordBoundary => {
1400 return Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))
1401 }
1402 ast::AssertionKind::StartLine => {
1403 let left = NodeId::BEGIN;
1404 let right = tb.mk_u8('\n' as u8);
1405 let union = tb.mk_union(left, right);
1406 Ok(tb.mk_lookbehind(union, NodeId::MISSING))
1407 }
1408 ast::AssertionKind::EndLine => {
1409 let left = NodeId::END;
1410 let right = tb.mk_u8('\n' as u8);
1411 let union = tb.mk_union(left, right);
1412 Ok(tb.mk_lookahead(union, NodeId::MISSING, 0))
1413 }
1414 ast::AssertionKind::WordBoundaryStart => todo!(),
1415 ast::AssertionKind::WordBoundaryEnd => todo!(),
1416 ast::AssertionKind::WordBoundaryStartAngle => todo!(),
1417 ast::AssertionKind::WordBoundaryEndAngle => Ok(tb.mk_string(">")),
1418 ast::AssertionKind::WordBoundaryStartHalf => todo!(),
1419 ast::AssertionKind::WordBoundaryEndHalf => todo!(),
1420 },
1421 Ast::ClassUnicode(c) => {
1422 let tmp = regex_syntax::ast::ClassUnicode {
1423 span: c.span,
1424 negated: c.negated,
1425 kind: c.kind.clone(),
1426 };
1427 if !c.negated {
1428 match &c.kind {
1429 regex_syntax::ast::ClassUnicodeKind::Named(s) => match s.as_str() {
1430 "ascii" => return Ok(tb.mk_range_u8(0, 127)),
1432 "utf8" => {
1434 let ascii = tb.mk_range_u8(0, 127);
1435 let beta = tb.mk_range_u8(128, 0xBF);
1436 let c0 = tb.mk_range_u8(0xC0, 0xDF);
1437 let c0s = tb.mk_concats([c0, beta].into_iter());
1438 let e0 = tb.mk_range_u8(0xE0, 0xEF);
1439 let e0s = tb.mk_concats([e0, beta, beta].into_iter());
1440 let f0 = tb.mk_range_u8(0xF0, 0xF7);
1441 let f0s = tb.mk_concats([f0, beta, beta, beta].into_iter());
1442 let merged = tb.mk_unions([ascii, c0s, e0s, f0s].into_iter());
1443 return Ok(tb.mk_star(merged));
1444 }
1445 "hex" => {
1446 let nums = tb.mk_range_u8(b'0', b'9');
1447 let lets = tb.mk_range_u8(b'a', b'f');
1448 let lets2 = tb.mk_range_u8(b'A', b'F');
1449 let merged = tb.mk_unions([nums, lets, lets2].into_iter());
1450 return Ok(merged);
1451 }
1452 _ => {}
1453 },
1454 _ => {}
1455 };
1456 }
1457
1458 let orig_ast = regex_syntax::ast::Ast::class_unicode(tmp);
1459 self.translator_to_node_id(&orig_ast, translator, tb)
1460 }
1461 Ast::ClassPerl(c) => {
1462 self.get_class(c.negated, c.kind.clone(), tb)
1463 }
1464 Ast::ClassBracketed(c) => match &c.kind {
1465 regex_syntax::ast::ClassSet::Item(_) => {
1466 let tmp = regex_syntax::ast::ClassBracketed {
1467 span: c.span,
1468 negated: c.negated,
1469 kind: c.kind.clone(),
1470 };
1471 let orig_ast = regex_syntax::ast::Ast::class_bracketed(tmp);
1472 self.translator_to_node_id(&orig_ast, translator, tb)
1473 }
1474 regex_syntax::ast::ClassSet::BinaryOp(_) => todo!(),
1475 },
1476 Ast::Repetition(r) => {
1477 let body = self.ast_to_node_id(&r.ast, translator, tb);
1478 match body {
1479 Ok(body) => match &r.op.kind {
1480 ast::RepetitionKind::ZeroOrOne => Ok(tb.mk_opt(body)),
1481 ast::RepetitionKind::ZeroOrMore => Ok(tb.mk_star(body)),
1482 ast::RepetitionKind::OneOrMore => Ok(tb.mk_plus(body)),
1483 ast::RepetitionKind::Range(r) => match r {
1484 ast::RepetitionRange::Exactly(n) => Ok(tb.mk_repeat(body, *n, *n)),
1485 ast::RepetitionRange::AtLeast(n) => {
1486 let rep = tb.mk_repeat(body, *n, *n);
1487 let st = tb.mk_star(body);
1488 Ok(tb.mk_concat(rep, st))
1489 }
1490
1491 ast::RepetitionRange::Bounded(n, m) => Ok(tb.mk_repeat(body, *n, *m)),
1492 },
1493 },
1494 Err(_) => body,
1495 }
1496 }
1497 Ast::Lookaround(g) => {
1498 let body = self.ast_to_node_id(&g.ast, translator, tb)?;
1499 match g.kind {
1500 ast::LookaroundKind::PositiveLookahead => {
1501 Ok(tb.mk_lookahead(body, NodeId::MISSING, 0))
1502 }
1503 ast::LookaroundKind::PositiveLookbehind => {
1504 Ok(tb.mk_lookbehind(body, NodeId::MISSING))
1505 }
1506 ast::LookaroundKind::NegativeLookahead => Ok(tb.mk_neg_lookahead(body, 0)),
1507 ast::LookaroundKind::NegativeLookbehind => Ok(tb.mk_neg_lookbehind(body)),
1508 }
1509 }
1510 Ast::Group(g) => {
1511 if let ast::GroupKind::NonCapturing(ref flags) = g.kind {
1512 if !flags.items.is_empty() {
1513 let mut translator_builder = self.default_translator_builder();
1514 if let Some(state) = flags.flag_state(ast::Flag::CaseInsensitive) {
1515 translator_builder.case_insensitive(state);
1516 }
1517 if let Some(state) = flags.flag_state(ast::Flag::Unicode) {
1518 translator_builder.unicode(state);
1519 }
1520 let saved_dot_all = self.dot_all.get();
1521 if let Some(state) = flags.flag_state(ast::Flag::DotMatchesNewLine) {
1522 self.dot_all.set(state);
1523 }
1524 let mut scoped = Some(translator_builder.build());
1525 let result = self.ast_to_node_id(&g.ast, &mut scoped, tb);
1526 self.dot_all.set(saved_dot_all);
1527 return result;
1528 }
1529 }
1530 self.ast_to_node_id(&g.ast, translator, tb)
1531 }
1532 Ast::Alternation(a) => {
1533 let mut children = vec![];
1534 for ast in &a.asts {
1535 match self.ast_to_node_id(ast, translator, tb) {
1536 Ok(node_id) => children.push(node_id),
1537 Err(err) => return Err(err),
1538 }
1539 }
1540 Ok(tb.mk_unions(children.iter().copied()))
1541 }
1542 Ast::Concat(c) => {
1543 let mut concat_translator: Option<Translator> = None;
1544 let mut children = vec![];
1545 let mut i = 0;
1546 while i < c.asts.len() {
1547 let ast = &c.asts[i];
1548 match ast {
1549 Ast::Flags(f) => {
1550 let mut translator_builder = self.default_translator_builder();
1551 if let Some(state) = f.flags.flag_state(ast::Flag::CaseInsensitive) {
1552 translator_builder.case_insensitive(state);
1553 }
1554 if let Some(state) = f.flags.flag_state(ast::Flag::Unicode) {
1555 translator_builder.unicode(state);
1556 }
1557 if let Some(state) = f.flags.flag_state(ast::Flag::DotMatchesNewLine) {
1558 self.dot_all.set(state);
1559 }
1560 concat_translator = Some(translator_builder.build());
1561 i += 1;
1562 continue;
1563 }
1564 Ast::Assertion(a) if a.kind == ast::AssertionKind::WordBoundary => {
1565 let node =
1566 self.rewrite_word_boundary_in_concat(&c.asts, i, translator, tb)?;
1567 children.push(node.0);
1568 i = node.1; continue;
1570 }
1571 _ => {}
1572 }
1573 match concat_translator {
1574 Some(_) => match self.ast_to_node_id(ast, &mut concat_translator, tb) {
1575 Ok(node_id) => children.push(node_id),
1576 Err(err) => return Err(err),
1577 },
1578 None => match self.ast_to_node_id(ast, translator, tb) {
1579 Ok(node_id) => children.push(node_id),
1580 Err(err) => return Err(err),
1581 },
1582 }
1583 i += 1;
1584 }
1585 Ok(tb.mk_concats(children.iter().cloned()))
1586 }
1587 Ast::Intersection(intersection) => {
1588 let mut children = vec![];
1589 for ast in &intersection.asts {
1590 match self.ast_to_node_id(ast, translator, tb) {
1591 Ok(node_id) => children.push(node_id),
1592 Err(err) => return Err(err),
1593 }
1594 }
1595 Ok(tb.mk_inters(children.into_iter()))
1596 }
1597 Ast::Complement(complement) => {
1598 let body = self.ast_to_node_id(&complement.ast, translator, tb);
1599 body.map(|x| tb.mk_compl(x))
1600 }
1601 }
1602 }
1603
1604 fn parse(&mut self, tb: &mut TB<'s>) -> Result<NodeId> {
1607 let mut concat = Concat {
1608 span: self.span(),
1609 asts: vec![],
1610 };
1611 loop {
1612 self.bump_space();
1613 if self.is_eof() {
1614 break;
1615 }
1616 match self.char() {
1617 '(' => concat = self.push_group(concat)?,
1618 ')' => concat = self.pop_group(concat)?,
1619 '|' => concat = self.push_alternate(concat)?,
1620 '&' => concat = self.push_intersect(concat)?,
1621 '~' => concat = self.push_compl_group(concat)?,
1622 '[' => {
1623 let class = self.parse_set_class()?;
1624 concat.asts.push(Ast::class_bracketed(class));
1625 }
1626 '?' => {
1627 concat =
1628 self.parse_uncounted_repetition(concat, ast::RepetitionKind::ZeroOrOne)?;
1629 }
1630 '*' => {
1631 concat =
1632 self.parse_uncounted_repetition(concat, ast::RepetitionKind::ZeroOrMore)?;
1633 }
1634 '+' => {
1635 concat =
1636 self.parse_uncounted_repetition(concat, ast::RepetitionKind::OneOrMore)?;
1637 }
1638 '{' => {
1639 concat = self.parse_counted_repetition(concat)?;
1640 }
1641 _ => concat.asts.push(self.parse_primitive()?.into_ast()),
1642 }
1643 }
1644 let ast = self.pop_group_end(concat)?;
1645 let node_id = self.ast_to_node_id(&ast, &mut None, tb);
1647 node_id
1648 }
1649
1650 #[inline(never)]
1651 fn parse_uncounted_repetition(
1652 &self,
1653 mut concat: ast::Concat,
1654 kind: ast::RepetitionKind,
1655 ) -> Result<ast::Concat> {
1656 let op_start = self.pos();
1658 let ast = match concat.asts.pop() {
1659 Some(ast) => ast,
1660 None => return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing)),
1661 };
1662 match ast {
1663 Ast::Empty(_) | Ast::Flags(_) => {
1664 return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing))
1665 }
1666 _ => {}
1667 }
1668 if self.bump() && self.char() == '?' {
1669 return Err(self.error(
1670 Span::new(op_start, self.pos()),
1671 ast::ErrorKind::UnsupportedLazyQuantifier,
1672 ));
1673 }
1674 concat.asts.push(Ast::repetition(ast::Repetition {
1675 span: ast.span().with_end(self.pos()),
1676 op: ast::RepetitionOp {
1677 span: Span::new(op_start, self.pos()),
1678 kind,
1679 },
1680 greedy: true,
1681 ast: Box::new(ast),
1682 }));
1683 Ok(concat)
1684 }
1685
1686 #[inline(never)]
1687 fn parse_counted_repetition(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
1688 assert!(self.char() == '{');
1689 let start = self.pos();
1690 let ast = match concat.asts.pop() {
1691 Some(ast) => ast,
1692 None => return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing)),
1693 };
1694 match ast {
1695 Ast::Empty(_) | Ast::Flags(_) => {
1696 return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing))
1697 }
1698 _ => {}
1699 }
1700 if !self.bump_and_bump_space() {
1701 return Err(self.error(
1702 Span::new(start, self.pos()),
1703 ast::ErrorKind::RepetitionCountUnclosed,
1704 ));
1705 }
1706 let count_start = specialize_err(
1707 self.parse_decimal(),
1708 ast::ErrorKind::DecimalEmpty,
1709 ast::ErrorKind::RepetitionCountDecimalEmpty,
1710 );
1711 if self.is_eof() {
1712 return Err(self.error(
1713 Span::new(start, self.pos()),
1714 ast::ErrorKind::RepetitionCountUnclosed,
1715 ));
1716 }
1717 let range = if self.char() == ',' {
1718 if !self.bump_and_bump_space() {
1719 return Err(self.error(
1720 Span::new(start, self.pos()),
1721 ast::ErrorKind::RepetitionCountUnclosed,
1722 ));
1723 }
1724 if self.char() != '}' {
1725 let count_start = match count_start {
1726 Ok(c) => c,
1727 Err(err) if err.kind == ast::ErrorKind::RepetitionCountDecimalEmpty => {
1728 if self.parser().empty_min_range {
1729 0
1730 } else {
1731 return Err(err);
1732 }
1733 }
1734 err => err?,
1735 };
1736 let count_end = specialize_err(
1737 self.parse_decimal(),
1738 ast::ErrorKind::DecimalEmpty,
1739 ast::ErrorKind::RepetitionCountDecimalEmpty,
1740 )?;
1741 ast::RepetitionRange::Bounded(count_start, count_end)
1742 } else {
1743 ast::RepetitionRange::AtLeast(count_start?)
1744 }
1745 } else {
1746 ast::RepetitionRange::Exactly(count_start?)
1747 };
1748
1749 if self.is_eof() || self.char() != '}' {
1750 return Err(self.error(
1751 Span::new(start, self.pos()),
1752 ast::ErrorKind::RepetitionCountUnclosed,
1753 ));
1754 }
1755
1756 if self.bump_and_bump_space() && self.char() == '?' {
1757 return Err(self.error(
1758 Span::new(start, self.pos()),
1759 ast::ErrorKind::UnsupportedLazyQuantifier,
1760 ));
1761 }
1762
1763 let op_span = Span::new(start, self.pos());
1764 if !range.is_valid() {
1765 return Err(self.error(op_span, ast::ErrorKind::RepetitionCountInvalid));
1766 }
1767 concat.asts.push(Ast::repetition(ast::Repetition {
1768 span: ast.span().with_end(self.pos()),
1769 op: ast::RepetitionOp {
1770 span: op_span,
1771 kind: ast::RepetitionKind::Range(range),
1772 },
1773 greedy: true,
1774 ast: Box::new(ast),
1775 }));
1776 Ok(concat)
1777 }
1778
1779 #[inline(never)]
1780 fn parse_group(&self) -> Result<Either<ast::SetFlags, ast::Group>> {
1781 assert_eq!(self.char(), '(');
1782 let open_span = self.span_char();
1783 self.bump();
1784 self.bump_space();
1785 if let Some((ahead, pos)) = self.is_lookaround_prefix() {
1786 let kind = match (pos, ahead) {
1787 (true, true) => LookaroundKind::PositiveLookahead,
1788 (true, false) => LookaroundKind::PositiveLookbehind,
1789 (false, true) => LookaroundKind::NegativeLookahead,
1790 (false, false) => LookaroundKind::NegativeLookbehind,
1791 };
1792 return Ok(Either::Right(ast::Group {
1793 span: open_span,
1794 kind: ast::GroupKind::Lookaround(kind),
1795 ast: Box::new(Ast::empty(self.span())),
1796 }));
1797 }
1798 let inner_span = self.span();
1799 let mut starts_with_p = true;
1800 if self.bump_if("?P<") || {
1801 starts_with_p = false;
1802 self.bump_if("?<")
1803 } {
1804 let capture_index = self.next_capture_index(open_span)?;
1805 let name = self.parse_capture_name(capture_index)?;
1806 Ok(Either::Right(ast::Group {
1807 span: open_span,
1808 kind: ast::GroupKind::CaptureName {
1809 starts_with_p,
1810 name,
1811 },
1812 ast: Box::new(Ast::empty(self.span())),
1813 }))
1814 } else if self.bump_if("?") {
1815 if self.is_eof() {
1816 return Err(self.error(open_span, ast::ErrorKind::GroupUnclosed));
1817 }
1818 let flags = self.parse_flags()?;
1819 let char_end = self.char();
1820 self.bump();
1821 if char_end == ')' {
1822 if flags.items.is_empty() {
1825 return Err(self.error(inner_span, ast::ErrorKind::RepetitionMissing));
1826 }
1827 Ok(Either::Left(ast::SetFlags {
1828 span: Span {
1829 end: self.pos(),
1830 ..open_span
1831 },
1832 flags,
1833 }))
1834 } else {
1835 assert_eq!(char_end, ':');
1836 Ok(Either::Right(ast::Group {
1837 span: open_span,
1838 kind: ast::GroupKind::NonCapturing(flags),
1839 ast: Box::new(Ast::empty(self.span())),
1840 }))
1841 }
1842 } else {
1843 let capture_index = self.next_capture_index(open_span)?;
1844 Ok(Either::Right(ast::Group {
1845 span: open_span,
1846 kind: ast::GroupKind::CaptureIndex(capture_index),
1847 ast: Box::new(Ast::empty(self.span())),
1848 }))
1849 }
1850 }
1851
1852 #[inline(never)]
1853 fn parse_capture_name(&self, capture_index: u32) -> Result<ast::CaptureName> {
1854 if self.is_eof() {
1855 return Err(self.error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
1856 }
1857 let start = self.pos();
1858 loop {
1859 if self.char() == '>' {
1860 break;
1861 }
1862 if !is_capture_char(self.char(), self.pos() == start) {
1863 return Err(self.error(self.span_char(), ast::ErrorKind::GroupNameInvalid));
1864 }
1865 if !self.bump() {
1866 break;
1867 }
1868 }
1869 let end = self.pos();
1870 if self.is_eof() {
1871 return Err(self.error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
1872 }
1873 assert_eq!(self.char(), '>');
1874 self.bump();
1875 let name = &self.pattern()[start.offset..end.offset];
1876 if name.is_empty() {
1877 return Err(self.error(Span::new(start, start), ast::ErrorKind::GroupNameEmpty));
1878 }
1879 let capname = ast::CaptureName {
1880 span: Span::new(start, end),
1881 name: name.to_string(),
1882 index: capture_index,
1883 };
1884 self.add_capture_name(&capname)?;
1885 Ok(capname)
1886 }
1887
1888 #[inline(never)]
1889 fn parse_flags(&self) -> Result<ast::Flags> {
1890 let mut flags = ast::Flags {
1891 span: self.span(),
1892 items: vec![],
1893 };
1894 let mut last_was_negation = None;
1895 while self.char() != ':' && self.char() != ')' {
1896 if self.char() == '-' {
1897 last_was_negation = Some(self.span_char());
1898 let item = ast::FlagsItem {
1899 span: self.span_char(),
1900 kind: ast::FlagsItemKind::Negation,
1901 };
1902 if let Some(i) = flags.add_item(item) {
1903 return Err(self.error(
1904 self.span_char(),
1905 ast::ErrorKind::FlagRepeatedNegation {
1906 original: flags.items[i].span,
1907 },
1908 ));
1909 }
1910 } else {
1911 last_was_negation = None;
1912 let item = ast::FlagsItem {
1913 span: self.span_char(),
1914 kind: ast::FlagsItemKind::Flag(self.parse_flag()?),
1915 };
1916 if let Some(i) = flags.add_item(item) {
1917 return Err(self.error(
1918 self.span_char(),
1919 ast::ErrorKind::FlagDuplicate {
1920 original: flags.items[i].span,
1921 },
1922 ));
1923 }
1924 }
1925 if !self.bump() {
1926 return Err(self.error(self.span(), ast::ErrorKind::FlagUnexpectedEof));
1927 }
1928 }
1929 if let Some(span) = last_was_negation {
1930 return Err(self.error(span, ast::ErrorKind::FlagDanglingNegation));
1931 }
1932 flags.span.end = self.pos();
1933 Ok(flags)
1934 }
1935
1936 #[inline(never)]
1937 fn parse_flag(&self) -> Result<ast::Flag> {
1938 match self.char() {
1939 'i' => Ok(ast::Flag::CaseInsensitive),
1940 'm' => Ok(ast::Flag::MultiLine),
1941 's' => Ok(ast::Flag::DotMatchesNewLine),
1942 'U' => Ok(ast::Flag::SwapGreed),
1943 'u' => Ok(ast::Flag::Unicode),
1944 'R' => Ok(ast::Flag::CRLF),
1945 'x' => Ok(ast::Flag::IgnoreWhitespace),
1946 _ => Err(self.error(self.span_char(), ast::ErrorKind::FlagUnrecognized)),
1947 }
1948 }
1949
1950 fn parse_primitive(&self) -> Result<Primitive> {
1951 match self.char() {
1952 '\\' => self.parse_escape(),
1953 '_' => {
1954 let ast = Primitive::Top(self.span_char());
1955 self.bump();
1956 Ok(ast)
1957 }
1958 '.' => {
1959 let ast = Primitive::Dot(self.span_char());
1960 self.bump();
1961 Ok(ast)
1962 }
1963 '^' => {
1964 let ast = Primitive::Assertion(ast::Assertion {
1965 span: self.span_char(),
1966 kind: ast::AssertionKind::StartLine,
1967 });
1968 self.bump();
1969 Ok(ast)
1970 }
1971 '$' => {
1972 let ast = Primitive::Assertion(ast::Assertion {
1973 span: self.span_char(),
1974 kind: ast::AssertionKind::EndLine,
1975 });
1976 self.bump();
1977 Ok(ast)
1978 }
1979 c => {
1980 let ast = Primitive::Literal(Literal {
1981 span: self.span_char(),
1982 kind: LiteralKind::Verbatim,
1983 c,
1984 });
1985 self.bump();
1986 Ok(ast)
1987 }
1988 }
1989 }
1990
1991 #[inline(never)]
1992 fn parse_escape(&self) -> Result<Primitive> {
1993 assert_eq!(self.char(), '\\');
1994 let start = self.pos();
1995 if !self.bump() {
1996 return Err(self.error(
1997 Span::new(start, self.pos()),
1998 ast::ErrorKind::EscapeUnexpectedEof,
1999 ));
2000 }
2001 let c = self.char();
2002 match c {
2004 '0'..='9' => {
2005 if !self.parser().octal {
2006 return Err(self.error(
2007 Span::new(start, self.span_char().end),
2008 ast::ErrorKind::UnsupportedBackreference,
2009 ));
2010 }
2011 let mut lit = self.parse_octal();
2012 lit.span.start = start;
2013 return Ok(Primitive::Literal(lit));
2014 }
2015 'x' | 'u' | 'U' => {
2022 let mut lit = self.parse_hex()?;
2023 lit.span.start = start;
2024 return Ok(Primitive::Literal(lit));
2025 }
2026 'p' | 'P' => {
2027 let mut cls = self.parse_unicode_class()?;
2028 cls.span.start = start;
2029 return Ok(Primitive::Unicode(cls));
2030 }
2031 'd' | 's' | 'w' | 'D' | 'S' | 'W' => {
2032 let mut cls = self.parse_perl_class();
2033 cls.span.start = start;
2034 return Ok(Primitive::Perl(cls));
2035 }
2036 _ => {}
2037 }
2038
2039 self.bump();
2041 let span = Span::new(start, self.pos());
2042 if is_meta_character(c) {
2043 return Ok(Primitive::Literal(Literal {
2044 span,
2045 kind: LiteralKind::Meta,
2046 c,
2047 }));
2048 }
2049 if is_escapeable_character(c) {
2050 return Ok(Primitive::Literal(Literal {
2051 span,
2052 kind: LiteralKind::Superfluous,
2053 c,
2054 }));
2055 }
2056 let special = |kind, c| {
2057 Ok(Primitive::Literal(Literal {
2058 span,
2059 kind: LiteralKind::Special(kind),
2060 c,
2061 }))
2062 };
2063 match c {
2064 'a' => special(SpecialLiteralKind::Bell, '\x07'),
2065 'f' => special(SpecialLiteralKind::FormFeed, '\x0C'),
2066 't' => special(SpecialLiteralKind::Tab, '\t'),
2067 'n' => special(SpecialLiteralKind::LineFeed, '\n'),
2068 'r' => special(SpecialLiteralKind::CarriageReturn, '\r'),
2069 'v' => special(SpecialLiteralKind::VerticalTab, '\x0B'),
2070 'A' => Ok(Primitive::Assertion(ast::Assertion {
2071 span,
2072 kind: ast::AssertionKind::StartText,
2073 })),
2074 'z' => Ok(Primitive::Assertion(ast::Assertion {
2075 span,
2076 kind: ast::AssertionKind::EndText,
2077 })),
2078 'b' => {
2079 let mut wb = ast::Assertion {
2080 span,
2081 kind: ast::AssertionKind::WordBoundary,
2082 };
2083 if !self.is_eof() && self.char() == '{' {
2086 if let Some(kind) = self.maybe_parse_special_word_boundary(start)? {
2087 wb.kind = kind;
2088 wb.span.end = self.pos();
2089 }
2090 }
2091 Ok(Primitive::Assertion(wb))
2092 }
2093 'B' => Ok(Primitive::Assertion(ast::Assertion {
2094 span,
2095 kind: ast::AssertionKind::NotWordBoundary,
2096 })),
2097 '<' => Ok(Primitive::Assertion(ast::Assertion {
2098 span,
2099 kind: ast::AssertionKind::WordBoundaryStartAngle,
2100 })),
2101 '>' => Ok(Primitive::Assertion(ast::Assertion {
2102 span,
2103 kind: ast::AssertionKind::WordBoundaryEndAngle,
2104 })),
2105 _ => Err(self.error(span, ast::ErrorKind::EscapeUnrecognized)),
2106 }
2107 }
2108
2109 fn maybe_parse_special_word_boundary(
2110 &self,
2111 wb_start: Position,
2112 ) -> Result<Option<ast::AssertionKind>> {
2113 assert_eq!(self.char(), '{');
2114
2115 let is_valid_char = |c| match c {
2116 'A'..='Z' | 'a'..='z' | '-' => true,
2117 _ => false,
2118 };
2119 let start = self.pos();
2120 if !self.bump_and_bump_space() {
2121 return Err(self.error(
2122 Span::new(wb_start, self.pos()),
2123 ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof,
2124 ));
2125 }
2126 let start_contents = self.pos();
2127 if !is_valid_char(self.char()) {
2132 self.parser().pos.set(start);
2133 return Ok(None);
2134 }
2135
2136 let mut scratch = self.parser().scratch.borrow_mut();
2138 scratch.clear();
2139 while !self.is_eof() && is_valid_char(self.char()) {
2140 scratch.push(self.char());
2141 self.bump_and_bump_space();
2142 }
2143 if self.is_eof() || self.char() != '}' {
2144 return Err(self.error(
2145 Span::new(start, self.pos()),
2146 ast::ErrorKind::SpecialWordBoundaryUnclosed,
2147 ));
2148 }
2149 let end = self.pos();
2150 self.bump();
2151 let kind = match scratch.as_str() {
2152 "start" => ast::AssertionKind::WordBoundaryStart,
2153 "end" => ast::AssertionKind::WordBoundaryEnd,
2154 "start-half" => ast::AssertionKind::WordBoundaryStartHalf,
2155 "end-half" => ast::AssertionKind::WordBoundaryEndHalf,
2156 _ => {
2157 return Err(self.error(
2158 Span::new(start_contents, end),
2159 ast::ErrorKind::SpecialWordBoundaryUnrecognized,
2160 ))
2161 }
2162 };
2163 Ok(Some(kind))
2164 }
2165
2166 #[inline(never)]
2167 fn parse_octal(&self) -> Literal {
2168 assert!(self.parser().octal);
2169 assert!('0' <= self.char() && self.char() <= '7');
2170 let start = self.pos();
2171 while self.bump()
2173 && '0' <= self.char()
2174 && self.char() <= '7'
2175 && self.pos().offset - start.offset <= 2
2176 {}
2177 let end = self.pos();
2178 let octal = &self.pattern()[start.offset..end.offset];
2179 let codepoint = u32::from_str_radix(octal, 8).expect("valid octal number");
2182 let c = char::from_u32(codepoint).expect("Unicode scalar value");
2185 Literal {
2186 span: Span::new(start, end),
2187 kind: LiteralKind::Octal,
2188 c,
2189 }
2190 }
2191
2192 #[inline(never)]
2193 fn parse_hex(&self) -> Result<Literal> {
2194 assert!(self.char() == 'x' || self.char() == 'u' || self.char() == 'U');
2195
2196 let hex_kind = match self.char() {
2197 'x' => HexLiteralKind::X,
2198 'u' => HexLiteralKind::UnicodeShort,
2199 _ => HexLiteralKind::UnicodeLong,
2200 };
2201 if !self.bump_and_bump_space() {
2202 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2203 }
2204 if self.char() == '{' {
2205 self.parse_hex_brace(hex_kind)
2206 } else {
2207 self.parse_hex_digits(hex_kind)
2208 }
2209 }
2210
2211 #[inline(never)]
2212 fn parse_hex_digits(&self, kind: HexLiteralKind) -> Result<Literal> {
2213 let mut scratch = self.parser().scratch.borrow_mut();
2214 scratch.clear();
2215
2216 let start = self.pos();
2217 for i in 0..kind.digits() {
2218 if i > 0 && !self.bump_and_bump_space() {
2219 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2220 }
2221 if !is_hex(self.char()) {
2222 return Err(self.error(self.span_char(), ast::ErrorKind::EscapeHexInvalidDigit));
2223 }
2224 scratch.push(self.char());
2225 }
2226 self.bump_and_bump_space();
2229 let end = self.pos();
2230 let hex = scratch.as_str();
2231 match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
2232 None => Err(self.error(Span::new(start, end), ast::ErrorKind::EscapeHexInvalid)),
2233 Some(c) => Ok(Literal {
2234 span: Span::new(start, end),
2235 kind: LiteralKind::HexFixed(kind),
2236 c,
2237 }),
2238 }
2239 }
2240
2241 #[inline(never)]
2242 fn parse_hex_brace(&self, kind: HexLiteralKind) -> Result<Literal> {
2243 let mut scratch = self.parser().scratch.borrow_mut();
2244 scratch.clear();
2245
2246 let brace_pos = self.pos();
2247 let start = self.span_char().end;
2248 while self.bump_and_bump_space() && self.char() != '}' {
2249 if !is_hex(self.char()) {
2250 return Err(self.error(self.span_char(), ast::ErrorKind::EscapeHexInvalidDigit));
2251 }
2252 scratch.push(self.char());
2253 }
2254 if self.is_eof() {
2255 return Err(self.error(
2256 Span::new(brace_pos, self.pos()),
2257 ast::ErrorKind::EscapeUnexpectedEof,
2258 ));
2259 }
2260 let end = self.pos();
2261 let hex = scratch.as_str();
2262 assert_eq!(self.char(), '}');
2263 self.bump_and_bump_space();
2264
2265 if hex.is_empty() {
2266 return Err(self.error(
2267 Span::new(brace_pos, self.pos()),
2268 ast::ErrorKind::EscapeHexEmpty,
2269 ));
2270 }
2271 match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
2272 None => Err(self.error(Span::new(start, end), ast::ErrorKind::EscapeHexInvalid)),
2273 Some(c) => Ok(Literal {
2274 span: Span::new(start, self.pos()),
2275 kind: LiteralKind::HexBrace(kind),
2276 c,
2277 }),
2278 }
2279 }
2280
2281 fn parse_decimal(&self) -> Result<u32> {
2282 let mut scratch = self.parser().scratch.borrow_mut();
2283 scratch.clear();
2284
2285 while !self.is_eof() && self.char().is_whitespace() {
2286 self.bump();
2287 }
2288 let start = self.pos();
2289 while !self.is_eof() && '0' <= self.char() && self.char() <= '9' {
2290 scratch.push(self.char());
2291 self.bump_and_bump_space();
2292 }
2293 let span = Span::new(start, self.pos());
2294 while !self.is_eof() && self.char().is_whitespace() {
2295 self.bump_and_bump_space();
2296 }
2297 let digits = scratch.as_str();
2298 if digits.is_empty() {
2299 return Err(self.error(span, ast::ErrorKind::DecimalEmpty));
2300 }
2301 match u32::from_str_radix(digits, 10).ok() {
2302 Some(n) => Ok(n),
2303 None => Err(self.error(span, ast::ErrorKind::DecimalInvalid)),
2304 }
2305 }
2306
2307 #[inline(never)]
2308 fn parse_set_class(&self) -> Result<ClassBracketed> {
2309 assert_eq!(self.char(), '[');
2310
2311 let mut union = ClassSetUnion {
2312 span: self.span(),
2313 items: vec![],
2314 };
2315 loop {
2316 self.bump_space();
2317 if self.is_eof() {
2318 return Err(self.unclosed_class_error());
2319 }
2320 match self.char() {
2321 '[' => {
2322 if !self.parser().stack_class.borrow().is_empty() {
2327 if let Some(cls) = self.maybe_parse_ascii_class() {
2328 union.push(ClassSetItem::Ascii(cls));
2329 continue;
2330 }
2331 }
2332 union = self.push_class_open(union)?;
2333 }
2334 ']' => match self.pop_class(union)? {
2335 Either::Left(nested_union) => {
2336 union = nested_union;
2337 }
2338 Either::Right(class) => return Ok(class),
2339 },
2340 '&' if self.peek() == Some('&') => {
2341 assert!(self.bump_if("&&"));
2342 union = self.push_class_op(ClassSetBinaryOpKind::Intersection, union);
2343 }
2344 '-' if self.peek() == Some('-') => {
2345 assert!(self.bump_if("--"));
2346 union = self.push_class_op(ClassSetBinaryOpKind::Difference, union);
2347 }
2348 '~' if self.peek() == Some('~') => {
2349 assert!(self.bump_if("~~"));
2350 union = self.push_class_op(ClassSetBinaryOpKind::SymmetricDifference, union);
2351 }
2352 _ => {
2353 union.push(self.parse_set_class_range()?);
2354 }
2355 }
2356 }
2357 }
2358
2359 #[inline(never)]
2360 fn parse_set_class_range(&self) -> Result<ClassSetItem> {
2361 let prim1 = self.parse_set_class_item()?;
2362 self.bump_space();
2363 if self.is_eof() {
2364 return Err(self.unclosed_class_error());
2365 }
2366 if self.char() != '-' || self.peek_space() == Some(']') || self.peek_space() == Some('-') {
2367 return prim1.into_class_set_item(self);
2368 }
2369 if !self.bump_and_bump_space() {
2370 return Err(self.unclosed_class_error());
2371 }
2372 let prim2 = self.parse_set_class_item()?;
2373 let range = ClassSetRange {
2374 span: Span::new(prim1.span().start, prim2.span().end),
2375 start: prim1.into_class_literal(self)?,
2376 end: prim2.into_class_literal(self)?,
2377 };
2378 if !range.is_valid() {
2379 return Err(self.error(range.span, ast::ErrorKind::ClassRangeInvalid));
2380 }
2381 Ok(ClassSetItem::Range(range))
2382 }
2383
2384 #[inline(never)]
2385 fn parse_set_class_item(&self) -> Result<Primitive> {
2386 if self.char() == '\\' {
2387 self.parse_escape()
2388 } else {
2389 let x = Primitive::Literal(Literal {
2390 span: self.span_char(),
2391 kind: LiteralKind::Verbatim,
2392 c: self.char(),
2393 });
2394 self.bump();
2395 Ok(x)
2396 }
2397 }
2398
2399 #[inline(never)]
2400 fn parse_set_class_open(&self) -> Result<(ClassBracketed, ClassSetUnion)> {
2401 assert_eq!(self.char(), '[');
2402 let start = self.pos();
2403 if !self.bump_and_bump_space() {
2404 return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2405 }
2406
2407 let negated = if self.char() != '^' {
2408 false
2409 } else {
2410 if !self.bump_and_bump_space() {
2411 return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2412 }
2413 true
2414 };
2415 let mut union = ClassSetUnion {
2417 span: self.span(),
2418 items: vec![],
2419 };
2420 while self.char() == '-' {
2421 union.push(ClassSetItem::Literal(Literal {
2422 span: self.span_char(),
2423 kind: LiteralKind::Verbatim,
2424 c: '-',
2425 }));
2426 if !self.bump_and_bump_space() {
2427 return Err(self.error(Span::new(start, start), ast::ErrorKind::ClassUnclosed));
2428 }
2429 }
2430 if union.items.is_empty() && self.char() == ']' {
2433 union.push(ClassSetItem::Literal(Literal {
2434 span: self.span_char(),
2435 kind: LiteralKind::Verbatim,
2436 c: ']',
2437 }));
2438 if !self.bump_and_bump_space() {
2439 return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2440 }
2441 }
2442 let set = ClassBracketed {
2443 span: Span::new(start, self.pos()),
2444 negated,
2445 kind: ClassSet::union(ClassSetUnion {
2446 span: Span::new(union.span.start, union.span.start),
2447 items: vec![],
2448 }),
2449 };
2450 Ok((set, union))
2451 }
2452
2453 #[inline(never)]
2454 fn maybe_parse_ascii_class(&self) -> Option<ClassAscii> {
2455 assert_eq!(self.char(), '[');
2456 let start = self.pos();
2458 let mut negated = false;
2459 if !self.bump() || self.char() != ':' {
2460 self.parser().pos.set(start);
2461 return None;
2462 }
2463 if !self.bump() {
2464 self.parser().pos.set(start);
2465 return None;
2466 }
2467 if self.char() == '^' {
2468 negated = true;
2469 if !self.bump() {
2470 self.parser().pos.set(start);
2471 return None;
2472 }
2473 }
2474 let name_start = self.offset();
2475 while self.char() != ':' && self.bump() {}
2476 if self.is_eof() {
2477 self.parser().pos.set(start);
2478 return None;
2479 }
2480 let name = &self.pattern()[name_start..self.offset()];
2481 if !self.bump_if(":]") {
2482 self.parser().pos.set(start);
2483 return None;
2484 }
2485 let kind = match regex_syntax::ast::ClassAsciiKind::from_name(name) {
2486 Some(kind) => kind,
2487 None => {
2488 self.parser().pos.set(start);
2489 return None;
2490 }
2491 };
2492 Some(ClassAscii {
2493 span: Span::new(start, self.pos()),
2494 kind,
2495 negated,
2496 })
2497 }
2498
2499 #[inline(never)]
2500 fn parse_unicode_class(&self) -> Result<ClassUnicode> {
2501 assert!(self.char() == 'p' || self.char() == 'P');
2502
2503 let mut scratch = self.parser().scratch.borrow_mut();
2504 scratch.clear();
2505
2506 let negated = self.char() == 'P';
2507 if !self.bump_and_bump_space() {
2508 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2509 }
2510 let (start, kind) = if self.char() == '{' {
2511 let start = self.span_char().end;
2512 while self.bump_and_bump_space() && self.char() != '}' {
2513 scratch.push(self.char());
2514 }
2515 if self.is_eof() {
2516 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2517 }
2518 assert_eq!(self.char(), '}');
2519 self.bump();
2520
2521 let name = scratch.as_str();
2522 if let Some(i) = name.find("!=") {
2523 (
2524 start,
2525 ClassUnicodeKind::NamedValue {
2526 op: ClassUnicodeOpKind::NotEqual,
2527 name: name[..i].to_string(),
2528 value: name[i + 2..].to_string(),
2529 },
2530 )
2531 } else if let Some(i) = name.find(':') {
2532 (
2533 start,
2534 ClassUnicodeKind::NamedValue {
2535 op: ClassUnicodeOpKind::Colon,
2536 name: name[..i].to_string(),
2537 value: name[i + 1..].to_string(),
2538 },
2539 )
2540 } else if let Some(i) = name.find('=') {
2541 (
2542 start,
2543 ClassUnicodeKind::NamedValue {
2544 op: ClassUnicodeOpKind::Equal,
2545 name: name[..i].to_string(),
2546 value: name[i + 1..].to_string(),
2547 },
2548 )
2549 } else {
2550 (start, ClassUnicodeKind::Named(name.to_string()))
2551 }
2552 } else {
2553 let start = self.pos();
2554 let c = self.char();
2555 if c == '\\' {
2556 return Err(self.error(self.span_char(), ast::ErrorKind::UnicodeClassInvalid));
2557 }
2558 self.bump_and_bump_space();
2559 let kind = ClassUnicodeKind::OneLetter(c);
2560 (start, kind)
2561 };
2562 Ok(ClassUnicode {
2563 span: Span::new(start, self.pos()),
2564 negated,
2565 kind,
2566 })
2567 }
2568
2569 #[inline(never)]
2570 fn parse_perl_class(&self) -> ClassPerl {
2571 let c = self.char();
2572 let span = self.span_char();
2573 self.bump();
2574 let (negated, kind) = match c {
2575 'd' => (false, regex_syntax::ast::ClassPerlKind::Digit),
2576 'D' => (true, regex_syntax::ast::ClassPerlKind::Digit),
2577 's' => (false, regex_syntax::ast::ClassPerlKind::Space),
2578 'S' => (true, regex_syntax::ast::ClassPerlKind::Space),
2579 'w' => (false, regex_syntax::ast::ClassPerlKind::Word),
2580 'W' => (true, regex_syntax::ast::ClassPerlKind::Word),
2581 c => panic!("expected valid Perl class but got '{}'", c),
2582 };
2583 ClassPerl {
2584 span,
2585 kind,
2586 negated,
2587 }
2588 }
2589}
2590
2591pub fn parse_ast<'s>(
2592 tb: &mut TB<'s>,
2593 pattern: &'s str,
2594) -> std::result::Result<NodeId, ResharpError> {
2595 let mut p: ResharpParser<'s> = ResharpParser::new(pattern);
2596 p.parse(tb)
2597}
2598
2599pub fn parse_ast_with<'s>(
2600 tb: &mut TB<'s>,
2601 pattern: &'s str,
2602 flags: &PatternFlags,
2603) -> std::result::Result<NodeId, ResharpError> {
2604 let mut p: ResharpParser<'s> = ResharpParser::with_flags(pattern, flags);
2605 p.parse(tb)
2606}