1#![warn(dead_code)]
6mod ast;
7use std::cell::{Cell, RefCell};
8
9use ast::{Ast, Concat, ErrorKind, GroupKind, LookaroundKind};
10use regex_syntax::{
11 ast::{
12 ClassAscii, ClassBracketed, ClassPerl, ClassSet, ClassSetBinaryOpKind, ClassSetItem,
13 ClassSetRange, ClassSetUnion, ClassUnicode, ClassUnicodeKind, ClassUnicodeOpKind,
14 HexLiteralKind, Literal, LiteralKind, Position, Span, SpecialLiteralKind,
15 },
16 hir::{
17 self,
18 translate::{Translator, TranslatorBuilder},
19 },
20 utf8::Utf8Sequences,
21};
22use resharp_algebra::NodeId;
23
24type TB<'s> = resharp_algebra::RegexBuilder;
25
26pub struct PatternFlags {
28 pub unicode: bool,
30 pub case_insensitive: bool,
32 pub dot_matches_new_line: bool,
34 pub ignore_whitespace: bool,
36}
37
38impl Default for PatternFlags {
39 fn default() -> Self {
40 Self {
41 unicode: true,
42 case_insensitive: false,
43 dot_matches_new_line: false,
44 ignore_whitespace: false,
45 }
46 }
47}
48
49#[derive(Clone, Copy, PartialEq)]
50enum WordCharKind {
51 Word,
52 NonWord,
53 MaybeWord,
54 MaybeNonWord,
55 Unknown,
56 Edge,
57}
58
59fn is_word_byte(b: u8) -> bool {
60 b.is_ascii_alphanumeric() || b == b'_'
61}
62
63
64#[derive(Clone, Debug, Eq, PartialEq)]
65enum Primitive {
66 Literal(Literal),
67 Assertion(ast::Assertion),
68 Dot(Span),
69 Top(Span),
70 Perl(ClassPerl),
71 Unicode(ClassUnicode),
72}
73
74impl Primitive {
75 fn span(&self) -> &Span {
76 match *self {
77 Primitive::Literal(ref x) => &x.span,
78 Primitive::Assertion(ref x) => &x.span,
79 Primitive::Dot(ref span) => span,
80 Primitive::Top(ref span) => span,
81 Primitive::Perl(ref x) => &x.span,
82 Primitive::Unicode(ref x) => &x.span,
83 }
84 }
85
86 fn into_ast(self) -> Ast {
87 match self {
88 Primitive::Literal(lit) => Ast::literal(lit),
89 Primitive::Assertion(assert) => Ast::assertion(assert),
90 Primitive::Dot(span) => Ast::dot(span),
91 Primitive::Top(span) => Ast::top(span),
92 Primitive::Perl(cls) => Ast::class_perl(cls),
93 Primitive::Unicode(cls) => Ast::class_unicode(cls),
94 }
95 }
96
97 fn into_class_set_item(self, p: &ResharpParser) -> Result<regex_syntax::ast::ClassSetItem> {
98 use self::Primitive::*;
99 use regex_syntax::ast::ClassSetItem;
100
101 match self {
102 Literal(lit) => Ok(ClassSetItem::Literal(lit)),
103 Perl(cls) => Ok(ClassSetItem::Perl(cls)),
104 Unicode(cls) => Ok(ClassSetItem::Unicode(cls)),
105 x => Err(p.error(*x.span(), ast::ErrorKind::ClassEscapeInvalid)),
106 }
107 }
108
109 fn into_class_literal(self, p: &ResharpParser) -> Result<Literal> {
110 use self::Primitive::*;
111
112 match self {
113 Literal(lit) => Ok(lit),
114 x => Err(p.error(*x.span(), ast::ErrorKind::ClassRangeLiteral)),
115 }
116 }
117}
118
119#[derive(Clone, Debug, Eq, PartialEq)]
120pub enum Either<Left, Right> {
121 Left(Left),
122 Right(Right),
123}
124
125#[derive(Clone, Debug, Eq, PartialEq)]
126pub struct ResharpError {
127 pub kind: ErrorKind,
129 pattern: String,
132 pub span: Span,
134}
135
136impl std::fmt::Display for ResharpError {
137 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
138 write!(f, "{:?}: {:?}", self.kind, self.span)
139 }
140}
141impl std::error::Error for ResharpError {}
142
143type Result<T> = core::result::Result<T, ResharpError>;
144
145#[derive(Clone, Debug)]
146enum GroupState {
147 Group {
149 concat: Concat,
151 group: ast::Group,
153 ignore_whitespace: bool,
155 },
156 Alternation(ast::Alternation),
161 Intersection(ast::Intersection),
162}
163
164#[derive(Clone, Debug)]
165enum ClassState {
166 Open {
168 union: regex_syntax::ast::ClassSetUnion,
170 set: regex_syntax::ast::ClassBracketed,
174 },
175 Op {
178 kind: regex_syntax::ast::ClassSetBinaryOpKind,
180 lhs: regex_syntax::ast::ClassSet,
182 },
183}
184
185pub struct ResharpParser<'s> {
187 perl_classes: Vec<(bool, regex_syntax::ast::ClassPerlKind, NodeId)>,
188 unicode_classes: resharp_algebra::UnicodeClassCache,
189 pub translator: regex_syntax::hir::translate::Translator,
190 pub pattern: &'s str,
191 pos: Cell<Position>,
192 capture_index: Cell<u32>,
193 octal: bool,
194 empty_min_range: bool,
195 ignore_whitespace: Cell<bool>,
196 dot_all: Cell<bool>,
197 global_unicode: bool,
198 global_case_insensitive: bool,
199 comments: RefCell<Vec<ast::Comment>>,
200 stack_group: RefCell<Vec<GroupState>>,
201 stack_class: RefCell<Vec<ClassState>>,
202 capture_names: RefCell<Vec<ast::CaptureName>>,
203 scratch: RefCell<String>,
204}
205
206fn specialize_err<T>(result: Result<T>, from: ast::ErrorKind, to: ast::ErrorKind) -> Result<T> {
207 if let Err(e) = result {
208 if e.kind == from {
209 Err(ResharpError {
210 kind: to,
211 pattern: e.pattern,
212 span: e.span,
213 })
214 } else {
215 Err(e)
216 }
217 } else {
218 result
219 }
220}
221
222fn is_capture_char(c: char, first: bool) -> bool {
223 if first {
224 c == '_' || c.is_alphabetic()
225 } else {
226 c == '_' || c == '.' || c == '[' || c == ']' || c.is_alphanumeric()
227 }
228}
229
230pub fn is_meta_character(c: char) -> bool {
231 match c {
232 '\\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' | '[' | ']' | '{' | '}' | '^' | '$'
233 | '#' | '&' | '-' | '~' | '_' => true,
234 _ => false,
235 }
236}
237
238pub fn escape(text: &str) -> String {
240 let mut buf = String::new();
241 escape_into(text, &mut buf);
242 buf
243}
244
245pub fn escape_into(text: &str, buf: &mut String) {
247 buf.reserve(text.len());
248 for c in text.chars() {
249 if is_meta_character(c) {
250 buf.push('\\');
251 }
252 buf.push(c);
253 }
254}
255
256pub fn is_escapeable_character(c: char) -> bool {
257 if is_meta_character(c) {
259 return true;
260 }
261 if !c.is_ascii() {
264 return false;
265 }
266 match c {
271 '0'..='9' | 'A'..='Z' | 'a'..='z' => false,
272 '<' | '>' => false,
282 _ => true,
283 }
284}
285
286fn is_hex(c: char) -> bool {
287 ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F')
288}
289
290impl<'s> ResharpParser<'s> {
291 fn default_translator_builder(&self) -> TranslatorBuilder {
292 let mut trb = TranslatorBuilder::new();
293 trb.unicode(self.global_unicode);
294 trb.utf8(false);
295 trb.case_insensitive(self.global_case_insensitive);
296 trb
297 }
298
299 pub fn new(pattern: &'s str) -> Self {
300 Self::with_flags(pattern, &PatternFlags::default())
301 }
302
303 pub fn with_flags(pattern: &'s str, flags: &PatternFlags) -> Self {
304 let mut trb = TranslatorBuilder::new();
305 trb.unicode(flags.unicode);
306 trb.utf8(false);
307 trb.case_insensitive(flags.case_insensitive);
308 Self {
309 translator: trb.build(),
310 pattern,
311 perl_classes: vec![],
312 unicode_classes: resharp_algebra::UnicodeClassCache::default(),
313 pos: Cell::new(Position::new(0, 0, 0)),
314 capture_index: Cell::new(0),
315 octal: false,
316 empty_min_range: false,
317 ignore_whitespace: Cell::new(flags.ignore_whitespace),
318 dot_all: Cell::new(flags.dot_matches_new_line),
319 global_unicode: flags.unicode,
320 global_case_insensitive: flags.case_insensitive,
321 comments: RefCell::new(vec![]),
322 stack_group: RefCell::new(vec![]),
323 stack_class: RefCell::new(vec![]),
324 capture_names: RefCell::new(vec![]),
325 scratch: RefCell::new(String::new()),
326 }
327 }
328
329 fn parser(&'_ self) -> &'_ ResharpParser<'_> {
331 self
332 }
333
334 fn pattern(&self) -> &str {
336 self.pattern
337 }
338
339 fn error(&self, span: Span, kind: ast::ErrorKind) -> ResharpError {
341 ResharpError {
342 kind,
343 pattern: self.pattern().to_string(),
344 span,
345 }
346 }
347
348 fn unsupported_error(&self, _: regex_syntax::hir::Error) -> ResharpError {
349 let emptyspan = Span::splat(self.pos());
350 let inner = self.error(emptyspan, ast::ErrorKind::UnsupportedResharpRegex);
351 inner
352 }
353
354 fn offset(&self) -> usize {
359 self.parser().pos.get().offset
360 }
361
362 fn line(&self) -> usize {
366 self.parser().pos.get().line
367 }
368
369 fn column(&self) -> usize {
373 self.parser().pos.get().column
374 }
375
376 fn next_capture_index(&self, span: Span) -> Result<u32> {
384 let current = self.parser().capture_index.get();
385 let i = current
386 .checked_add(1)
387 .ok_or_else(|| self.error(span, ast::ErrorKind::CaptureLimitExceeded))?;
388 self.parser().capture_index.set(i);
389 Ok(i)
390 }
391
392 fn add_capture_name(&self, cap: &ast::CaptureName) -> Result<()> {
393 let mut names = self.parser().capture_names.borrow_mut();
394 match names.binary_search_by_key(&cap.name.as_str(), |c| c.name.as_str()) {
395 Err(i) => {
396 names.insert(i, cap.clone());
397 Ok(())
398 }
399 Ok(i) => Err(self.error(
400 cap.span,
401 ast::ErrorKind::GroupNameDuplicate {
402 original: names[i].span,
403 },
404 )),
405 }
406 }
407
408 fn ignore_whitespace(&self) -> bool {
409 self.parser().ignore_whitespace.get()
410 }
411
412 fn char(&self) -> char {
413 self.char_at(self.offset())
414 }
415
416 fn char_at(&self, i: usize) -> char {
417 self.pattern()[i..]
418 .chars()
419 .next()
420 .unwrap_or_else(|| panic!("expected char at offset {}", i))
421 }
422
423 fn bump(&self) -> bool {
424 if self.is_eof() {
425 return false;
426 }
427 let Position {
428 mut offset,
429 mut line,
430 mut column,
431 } = self.pos();
432 if self.char() == '\n' {
433 line = line.checked_add(1).unwrap();
434 column = 1;
435 } else {
436 column = column.checked_add(1).unwrap();
437 }
438 offset += self.char().len_utf8();
439 self.parser().pos.set(Position {
440 offset,
441 line,
442 column,
443 });
444 self.pattern()[self.offset()..].chars().next().is_some()
445 }
446
447 fn bump_if(&self, prefix: &str) -> bool {
448 if self.pattern()[self.offset()..].starts_with(prefix) {
449 for _ in 0..prefix.chars().count() {
450 self.bump();
451 }
452 true
453 } else {
454 false
455 }
456 }
457
458 fn is_lookaround_prefix(&self) -> Option<(bool, bool)> {
459 if self.bump_if("?=") {
460 return Some((true, true));
461 }
462 if self.bump_if("?!") {
463 return Some((true, false));
464 }
465 if self.bump_if("?<=") {
466 return Some((false, true));
467 }
468 if self.bump_if("?<!") {
469 return Some((false, false));
470 }
471 return None;
472 }
474
475 fn bump_and_bump_space(&self) -> bool {
476 if !self.bump() {
477 return false;
478 }
479 self.bump_space();
480 !self.is_eof()
481 }
482
483 fn bump_space(&self) {
484 if !self.ignore_whitespace() {
485 return;
486 }
487 while !self.is_eof() {
488 if self.char().is_whitespace() {
489 self.bump();
490 } else if self.char() == '#' {
491 let start = self.pos();
492 let mut comment_text = String::new();
493 self.bump();
494 while !self.is_eof() {
495 let c = self.char();
496 self.bump();
497 if c == '\n' {
498 break;
499 }
500 comment_text.push(c);
501 }
502 let comment = ast::Comment {
503 span: Span::new(start, self.pos()),
504 comment: comment_text,
505 };
506 self.parser().comments.borrow_mut().push(comment);
507 } else {
508 break;
509 }
510 }
511 }
512
513 fn peek(&self) -> Option<char> {
517 if self.is_eof() {
518 return None;
519 }
520 self.pattern()[self.offset() + self.char().len_utf8()..]
521 .chars()
522 .next()
523 }
524
525 fn peek_space(&self) -> Option<char> {
528 if !self.ignore_whitespace() {
529 return self.peek();
530 }
531 if self.is_eof() {
532 return None;
533 }
534 let mut start = self.offset() + self.char().len_utf8();
535 let mut in_comment = false;
536 for (i, c) in self.pattern()[start..].char_indices() {
537 if c.is_whitespace() {
538 continue;
539 } else if !in_comment && c == '#' {
540 in_comment = true;
541 } else if in_comment && c == '\n' {
542 in_comment = false;
543 } else {
544 start += i;
545 break;
546 }
547 }
548 self.pattern()[start..].chars().next()
549 }
550
551 fn is_eof(&self) -> bool {
553 self.offset() == self.pattern().len()
554 }
555
556 fn pos(&self) -> Position {
559 self.parser().pos.get()
560 }
561
562 fn span(&self) -> Span {
565 Span::splat(self.pos())
566 }
567
568 fn span_char(&self) -> Span {
570 let mut next = Position {
571 offset: self.offset().checked_add(self.char().len_utf8()).unwrap(),
572 line: self.line(),
573 column: self.column().checked_add(1).unwrap(),
574 };
575 if self.char() == '\n' {
576 next.line += 1;
577 next.column = 1;
578 }
579 Span::new(self.pos(), next)
580 }
581
582 #[inline(never)]
592 fn push_alternate(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
593 assert_eq!(self.char(), '|');
594 concat.span.end = self.pos();
595 self.push_or_add_alternation(concat);
596 self.bump();
597 Ok(ast::Concat {
598 span: self.span(),
599 asts: vec![],
600 })
601 }
602
603 fn push_or_add_alternation(&self, concat: Concat) {
606 use self::GroupState::*;
607
608 let mut stack = self.parser().stack_group.borrow_mut();
609 if let Some(&mut Alternation(ref mut alts)) = stack.last_mut() {
610 alts.asts.push(concat.into_ast());
611 return;
612 }
613 stack.push(Alternation(ast::Alternation {
614 span: Span::new(concat.span.start, self.pos()),
615 asts: vec![concat.into_ast()],
616 }));
617 }
618
619 #[inline(never)]
620 fn push_intersect(&self, mut concat: Concat) -> Result<Concat> {
621 assert_eq!(self.char(), '&');
622 concat.span.end = self.pos();
623 self.push_or_add_intersect(concat);
624 self.bump();
625 Ok(Concat {
626 span: self.span(),
627 asts: vec![],
628 })
629 }
630
631 fn push_or_add_intersect(&self, concat: Concat) {
634 use self::GroupState::*;
635
636 let mut stack = self.parser().stack_group.borrow_mut();
637 if let Some(&mut Intersection(ref mut alts)) = stack.last_mut() {
638 alts.asts.push(concat.into_ast());
639 return;
640 }
641 stack.push(Intersection(ast::Intersection {
642 span: Span::new(concat.span.start, self.pos()),
643 asts: vec![concat.into_ast()],
644 }));
645 }
646
647 #[inline(never)]
661 fn push_group(&self, mut concat: Concat) -> Result<Concat> {
662 assert_eq!(self.char(), '(');
663 match self.parse_group()? {
664 Either::Left(set) => {
665 let ignore = set.flags.flag_state(ast::Flag::IgnoreWhitespace);
666 if let Some(v) = ignore {
667 self.parser().ignore_whitespace.set(v);
668 }
669
670 concat.asts.push(Ast::flags(set));
671 Ok(concat)
672 }
673 Either::Right(group) => {
674 let old_ignore_whitespace = self.ignore_whitespace();
675 let new_ignore_whitespace = group
676 .flags()
677 .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
678 .unwrap_or(old_ignore_whitespace);
679 self.parser()
680 .stack_group
681 .borrow_mut()
682 .push(GroupState::Group {
683 concat,
684 group,
685 ignore_whitespace: old_ignore_whitespace,
686 });
687 self.parser().ignore_whitespace.set(new_ignore_whitespace);
688 Ok(Concat {
689 span: self.span(),
690 asts: vec![],
691 })
692 }
693 }
694 }
695
696 #[inline(never)]
697 fn push_compl_group(&self, concat: Concat) -> Result<Concat> {
698 assert_eq!(self.char(), '~');
699 self.bump();
700 if self.is_eof() || self.char() != '(' {
701 return Err(self.error(self.span(), ast::ErrorKind::ComplementGroupExpected));
702 }
703 let open_span = self.span_char();
704 self.bump();
705 let group = ast::Group {
706 span: open_span,
707 kind: ast::GroupKind::Complement,
708 ast: Box::new(Ast::empty(self.span())),
709 };
710
711 let old_ignore_whitespace = self.ignore_whitespace();
712 let new_ignore_whitespace = group
713 .flags()
714 .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
715 .unwrap_or(old_ignore_whitespace);
716 self.parser()
717 .stack_group
718 .borrow_mut()
719 .push(GroupState::Group {
720 concat,
721 group,
722 ignore_whitespace: old_ignore_whitespace,
723 });
724 self.parser().ignore_whitespace.set(new_ignore_whitespace);
725 Ok(Concat {
726 span: self.span(),
727 asts: vec![],
728 })
729 }
730
731 #[inline(never)]
741 fn pop_group(&self, mut group_concat: Concat) -> Result<Concat> {
742 use self::GroupState::*;
743 assert_eq!(self.char(), ')');
744 let mut stack = self.parser().stack_group.borrow_mut();
745 let topstack = stack.pop();
746
747 let (mut prior_concat, mut group, ignore_whitespace, alt) = match topstack {
748 Some(Group {
749 concat,
750 group,
751 ignore_whitespace,
752 }) => (concat, group, ignore_whitespace, None),
753 Some(Alternation(alt)) => match stack.pop() {
754 Some(Group {
755 concat,
756 group,
757 ignore_whitespace,
758 }) => (
759 concat,
760 group,
761 ignore_whitespace,
762 Some(Either::Left::<ast::Alternation, ast::Intersection>(alt)),
763 ),
764 None | Some(Alternation(_)) | Some(Intersection(_)) => {
765 return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
766 }
767 },
768 Some(Intersection(int)) => match stack.pop() {
769 Some(Group {
770 concat,
771 group,
772 ignore_whitespace,
773 }) => (
774 concat,
775 group,
776 ignore_whitespace,
777 Some(Either::Right::<ast::Alternation, ast::Intersection>(int)),
778 ),
779 None | Some(Alternation(_)) | Some(Intersection(_)) => {
780 return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
781 }
782 },
783
784 None => {
785 return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
786 }
787 };
788 self.parser().ignore_whitespace.set(ignore_whitespace);
789 group_concat.span.end = self.pos();
790 self.bump();
791 group.span.end = self.pos();
792 match alt {
793 Some(Either::Left(mut alt)) => {
794 alt.span.end = group_concat.span.end;
795 alt.asts.push(group_concat.into_ast());
796 group.ast = Box::new(alt.into_ast());
797 }
798 Some(Either::Right(mut int)) => {
799 int.span.end = group_concat.span.end;
800 int.asts.push(group_concat.into_ast());
801 group.ast = Box::new(int.into_ast());
802 }
803 None => {
804 group.ast = Box::new(group_concat.into_ast());
805 }
806 }
807
808 if group.kind == GroupKind::Complement {
809 let complement = ast::Complement {
810 span: self.span(),
811 ast: group.ast,
812 };
813 prior_concat.asts.push(Ast::complement(complement));
814 }
815 else {
817 prior_concat.asts.push(Ast::group(group));
819 }
820 Ok(prior_concat)
821 }
822
823 #[inline(never)]
830 fn pop_group_end(&self, mut concat: ast::Concat) -> Result<Ast> {
831 concat.span.end = self.pos();
832 let mut stack = self.parser().stack_group.borrow_mut();
834 let ast = match stack.pop() {
835 None => Ok(concat.into_ast()),
836 Some(GroupState::Alternation(mut alt)) => {
837 alt.span.end = self.pos();
838 alt.asts.push(concat.into_ast());
839 Ok(Ast::alternation(alt))
840 }
841 Some(GroupState::Intersection(mut int)) => {
842 int.span.end = self.pos();
843 int.asts.push(concat.into_ast());
844
845 Ok(Ast::intersection(int))
847 }
848 Some(GroupState::Group { group, .. }) => {
849 return Err(self.error(group.span, ast::ErrorKind::GroupUnclosed));
850 }
851 };
852 match stack.pop() {
854 None => ast,
855 Some(GroupState::Alternation(_)) => {
856 unreachable!()
863 }
864 Some(GroupState::Intersection(_)) => {
865 unreachable!()
866 }
867 Some(GroupState::Group { group, .. }) => {
868 Err(self.error(group.span, ast::ErrorKind::GroupUnclosed))
869 }
870 }
871 }
872
873 #[inline(never)]
882 fn push_class_open(
883 &self,
884 parent_union: regex_syntax::ast::ClassSetUnion,
885 ) -> Result<regex_syntax::ast::ClassSetUnion> {
886 assert_eq!(self.char(), '[');
887
888 let (nested_set, nested_union) = self.parse_set_class_open()?;
889 self.parser()
890 .stack_class
891 .borrow_mut()
892 .push(ClassState::Open {
893 union: parent_union,
894 set: nested_set,
895 });
896 Ok(nested_union)
897 }
898
899 #[inline(never)]
914 fn pop_class(
915 &self,
916 nested_union: regex_syntax::ast::ClassSetUnion,
917 ) -> Result<Either<regex_syntax::ast::ClassSetUnion, regex_syntax::ast::ClassBracketed>> {
918 assert_eq!(self.char(), ']');
919
920 let item = regex_syntax::ast::ClassSet::Item(nested_union.into_item());
921 let prevset = self.pop_class_op(item);
922 let mut stack = self.parser().stack_class.borrow_mut();
923 match stack.pop() {
924 None => {
925 panic!("unexpected empty character class stack")
934 }
935 Some(ClassState::Op { .. }) => {
936 panic!("unexpected ClassState::Op")
943 }
944 Some(ClassState::Open { mut union, mut set }) => {
945 self.bump();
946 set.span.end = self.pos();
947 set.kind = prevset;
948 if stack.is_empty() {
949 Ok(Either::Right(set))
950 } else {
951 union.push(regex_syntax::ast::ClassSetItem::Bracketed(Box::new(set)));
952 Ok(Either::Left(union))
953 }
954 }
955 }
956 }
957
958 #[inline(never)]
963 fn unclosed_class_error(&self) -> ResharpError {
964 for state in self.parser().stack_class.borrow().iter().rev() {
965 if let ClassState::Open { ref set, .. } = *state {
966 return self.error(set.span, ast::ErrorKind::ClassUnclosed);
967 }
968 }
969 panic!("no open character class found")
972 }
973
974 #[inline(never)]
980 fn push_class_op(
981 &self,
982 next_kind: regex_syntax::ast::ClassSetBinaryOpKind,
983 next_union: regex_syntax::ast::ClassSetUnion,
984 ) -> regex_syntax::ast::ClassSetUnion {
985 let item = regex_syntax::ast::ClassSet::Item(next_union.into_item());
986 let new_lhs = self.pop_class_op(item);
987 self.parser().stack_class.borrow_mut().push(ClassState::Op {
988 kind: next_kind,
989 lhs: new_lhs,
990 });
991 regex_syntax::ast::ClassSetUnion {
992 span: self.span(),
993 items: vec![],
994 }
995 }
996
997 #[inline(never)]
1003 fn pop_class_op(&self, rhs: regex_syntax::ast::ClassSet) -> regex_syntax::ast::ClassSet {
1004 let mut stack = self.parser().stack_class.borrow_mut();
1005 let (kind, lhs) = match stack.pop() {
1006 Some(ClassState::Op { kind, lhs }) => (kind, lhs),
1007 Some(state @ ClassState::Open { .. }) => {
1008 stack.push(state);
1009 return rhs;
1010 }
1011 None => unreachable!(),
1012 };
1013 let span = Span::new(lhs.span().start, rhs.span().end);
1014 regex_syntax::ast::ClassSet::BinaryOp(regex_syntax::ast::ClassSetBinaryOp {
1015 span,
1016 kind,
1017 lhs: Box::new(lhs),
1018 rhs: Box::new(rhs),
1019 })
1020 }
1021
1022 fn hir_to_node_id(&self, hir: &hir::Hir, tb: &mut TB<'s>) -> Result<NodeId> {
1023 match hir.kind() {
1024 hir::HirKind::Empty => Ok(NodeId::EPS),
1025 hir::HirKind::Literal(l) => {
1026 if l.0.len() == 1 {
1027 let node = tb.mk_u8(l.0[0]);
1028 Ok(node)
1029 } else {
1030 let ws: Vec<_> = l.0.iter().map(|l| tb.mk_u8(*l)).collect();
1031 let conc = tb.mk_concats(ws.iter().copied());
1032 Ok(conc)
1033 }
1034 }
1035 hir::HirKind::Class(class) => {
1036 match class {
1038 hir::Class::Unicode(class_unicode) => {
1039 let ranges = class_unicode.ranges();
1040 let mut nodes = Vec::new();
1041 for range in ranges {
1042 for seq in Utf8Sequences::new(range.start(), range.end()) {
1043 let sl = seq.as_slice();
1044 let bytes: Vec<_> = sl.iter().map(|s| (s.start, s.end)).collect();
1045 let node = match bytes.len() {
1046 1 => tb.mk_range_u8(bytes[0].0, bytes[0].1),
1047 n => {
1048 let last = tb.mk_range_u8(bytes[n - 1].0, bytes[n - 1].1);
1049 let mut conc = last;
1050 for i in (0..n - 1).rev() {
1051 let b = tb.mk_range_u8(bytes[i].0, bytes[i].1);
1052 conc = tb.mk_concat(b, conc);
1053 }
1054 conc
1055 }
1056 };
1057 nodes.push(node);
1058 }
1059 }
1060 let merged = tb.mk_unions(nodes.into_iter());
1061 Ok(merged)
1062 }
1063 hir::Class::Bytes(class_bytes) => {
1064 let ranges = class_bytes.ranges();
1065 let mut result = NodeId::BOT;
1066 for range in ranges {
1067 let start = range.start();
1068 let end = range.end();
1069 let node = tb.mk_range_u8(start, end);
1070 result = tb.mk_union(result, node);
1071 }
1072 Ok(result)
1073 }
1074 }
1075 }
1076 hir::HirKind::Look(_) => todo!(),
1077 hir::HirKind::Repetition(_) => todo!(),
1078 hir::HirKind::Capture(_) => todo!(),
1079 hir::HirKind::Concat(body) => {
1080 let mut result = NodeId::EPS;
1081 for child in body {
1082 let node = self.hir_to_node_id(child, tb)?;
1083 result = tb.mk_concat(result, node);
1084 }
1085 Ok(result)
1086 }
1087 hir::HirKind::Alternation(_) => todo!(),
1088 }
1089 }
1090
1091 fn translate_ast_to_hir(
1092 &mut self,
1093 orig_ast: ®ex_syntax::ast::Ast,
1094 tb: &mut TB<'s>,
1095 ) -> Result<NodeId> {
1096 match self.translator.translate("", orig_ast) {
1099 Err(_) => return Err(self.error(self.span(), ast::ErrorKind::UnicodeClassInvalid)),
1100 Ok(hir) => {
1101 let mapped = self.hir_to_node_id(&hir, tb);
1102 mapped
1103 }
1104 }
1105 }
1106
1107 fn translator_to_node_id(
1108 &mut self,
1109 orig_ast: ®ex_syntax::ast::Ast,
1110 translator: &mut Option<Translator>,
1111 tb: &mut TB<'s>,
1112 ) -> Result<NodeId> {
1113 match translator {
1114 Some(tr) => {
1115 let hir = tr
1116 .translate("", &orig_ast)
1117 .map_err(|e| self.unsupported_error(e))?;
1118 self.hir_to_node_id(&hir, tb)
1119 }
1120 None => self.translate_ast_to_hir(&orig_ast, tb),
1121 }
1122 }
1123
1124 fn get_class(
1125 &mut self,
1126 negated: bool,
1127 kind: regex_syntax::ast::ClassPerlKind,
1128 tb: &mut TB<'s>,
1129 ) -> Result<NodeId> {
1130 let w = self
1131 .perl_classes
1132 .iter()
1133 .find(|(c_neg, c_kind, _)| *c_kind == kind && *c_neg == negated);
1134 match w {
1135 Some((_, _, value)) => return Ok(*value),
1136 None => {
1137 let translated = if self.global_unicode {
1138 match kind {
1139 regex_syntax::ast::ClassPerlKind::Word => {
1140 self.unicode_classes.ensure_word(tb);
1141 if negated { self.unicode_classes.non_word } else { self.unicode_classes.word }
1142 }
1143 regex_syntax::ast::ClassPerlKind::Digit => {
1144 self.unicode_classes.ensure_digit(tb);
1145 if negated { self.unicode_classes.non_digit } else { self.unicode_classes.digit }
1146 }
1147 regex_syntax::ast::ClassPerlKind::Space => {
1148 self.unicode_classes.ensure_space(tb);
1149 if negated { self.unicode_classes.non_space } else { self.unicode_classes.space }
1150 }
1151 }
1152 } else {
1153 let pos = match kind {
1154 regex_syntax::ast::ClassPerlKind::Word => {
1155 let az = tb.mk_range_u8(b'a', b'z');
1156 let big = tb.mk_range_u8(b'A', b'Z');
1157 let dig = tb.mk_range_u8(b'0', b'9');
1158 let us = tb.mk_u8(b'_');
1159 tb.mk_unions([az, big, dig, us].into_iter())
1160 }
1161 regex_syntax::ast::ClassPerlKind::Digit => {
1162 tb.mk_range_u8(b'0', b'9')
1163 }
1164 regex_syntax::ast::ClassPerlKind::Space => {
1165 let sp = tb.mk_u8(b' ');
1166 let tab = tb.mk_u8(b'\t');
1167 let nl = tb.mk_u8(b'\n');
1168 let cr = tb.mk_u8(b'\r');
1169 let ff = tb.mk_u8(0x0C);
1170 let vt = tb.mk_u8(0x0B);
1171 tb.mk_unions([sp, tab, nl, cr, ff, vt].into_iter())
1172 }
1173 };
1174 if negated { tb.mk_compl(pos) } else { pos }
1175 };
1176 self.perl_classes.push((negated, kind, translated));
1177 Ok(translated)
1178 }
1179 }
1180 }
1181
1182 fn word_char_kind(ast: &Ast, left: bool) -> WordCharKind {
1183 use WordCharKind::*;
1184 match ast {
1185 Ast::Literal(lit) => {
1186 if is_word_byte(lit.c as u8) {
1187 Word
1188 } else {
1189 NonWord
1190 }
1191 }
1192 Ast::ClassPerl(c) => match (&c.kind, c.negated) {
1193 (®ex_syntax::ast::ClassPerlKind::Word, false) => Word,
1194 (®ex_syntax::ast::ClassPerlKind::Word, true) => NonWord,
1195 (®ex_syntax::ast::ClassPerlKind::Space, false) => NonWord,
1196 (®ex_syntax::ast::ClassPerlKind::Space, true) => Unknown,
1197 (®ex_syntax::ast::ClassPerlKind::Digit, false) => Word,
1198 (®ex_syntax::ast::ClassPerlKind::Digit, true) => Unknown,
1199 },
1200 Ast::Dot(_) | Ast::Top(_) => Unknown,
1201 Ast::Group(g) => Self::word_char_kind(&g.ast, left),
1202 Ast::Concat(c) if !c.asts.is_empty() => {
1203 Self::word_char_kind(&c.asts[if left { c.asts.len() - 1 } else { 0 }], left)
1204 }
1205 Ast::Repetition(r) => {
1206 let inner = Self::word_char_kind(&r.ast, left);
1207 let nullable = matches!(
1208 &r.op.kind,
1209 ast::RepetitionKind::ZeroOrMore
1210 | ast::RepetitionKind::ZeroOrOne
1211 | ast::RepetitionKind::Range(ast::RepetitionRange::Bounded(0, _))
1212 );
1213 if nullable {
1214 match inner {
1215 Word => MaybeWord,
1216 NonWord => MaybeNonWord,
1217 _ => Unknown,
1218 }
1219 } else {
1220 inner
1221 }
1222 }
1223 Ast::Lookaround(la) => Self::word_char_kind(&la.ast, left),
1224 _ => Unknown,
1225 }
1226 }
1227
1228 fn edge_class_ast<'a>(ast: &'a Ast, left: bool) -> Option<&'a Ast> {
1229 match ast {
1230 Ast::Literal(_) | Ast::ClassPerl(_) | Ast::ClassBracketed(_)
1231 | Ast::ClassUnicode(_) | Ast::Dot(_) | Ast::Top(_) => Some(ast),
1232 Ast::Group(g) => Self::edge_class_ast(&g.ast, left),
1233 Ast::Concat(c) if !c.asts.is_empty() => {
1234 Self::edge_class_ast(&c.asts[if left { c.asts.len() - 1 } else { 0 }], left)
1235 }
1236 Ast::Repetition(r) => Self::edge_class_ast(&r.ast, left),
1237 Ast::Lookaround(la) => Self::edge_class_ast(&la.ast, left),
1238 _ => None,
1239 }
1240 }
1241
1242 fn resolve_word_kind(
1243 &mut self,
1244 asts: &[Ast],
1245 idx: usize,
1246 dir: isize,
1247 translator: &mut Option<Translator>,
1248 tb: &mut TB<'s>,
1249 word_id: NodeId,
1250 not_word_id: NodeId,
1251 ) -> Result<WordCharKind> {
1252 use WordCharKind::*;
1253 let fast = Self::concat_neighbor_kind(asts, idx, dir);
1254 if fast != Unknown {
1255 return Ok(fast);
1256 }
1257 let neighbor_idx = (idx as isize + dir) as usize;
1258 let edge = match Self::edge_class_ast(&asts[neighbor_idx], dir < 0) {
1259 Some(e) => e,
1260 None => return Ok(Unknown),
1261 };
1262 let node = self.ast_to_node_id(edge, translator, tb)?;
1263 if tb.subsumes(word_id, node) == Some(true) {
1264 Ok(Word)
1265 } else if tb.subsumes(not_word_id, node) == Some(true) {
1266 Ok(NonWord)
1267 } else {
1268 Ok(Unknown)
1269 }
1270 }
1271
1272 fn concat_neighbor_kind(asts: &[Ast], idx: usize, dir: isize) -> WordCharKind {
1273 use WordCharKind::*;
1274 let next = idx as isize + dir;
1275 if next < 0 || next >= asts.len() as isize {
1276 return Edge;
1277 }
1278 let kind = Self::word_char_kind(&asts[next as usize], dir < 0);
1279 match kind {
1280 MaybeWord => match Self::concat_neighbor_kind(asts, next as usize, dir) {
1281 Word => Word,
1282 _ => Unknown,
1283 },
1284 MaybeNonWord => match Self::concat_neighbor_kind(asts, next as usize, dir) {
1285 NonWord => NonWord,
1286 _ => Unknown,
1287 },
1288 other => other,
1289 }
1290 }
1291
1292 fn rewrite_word_boundary_in_concat(
1293 &mut self,
1294 asts: &[Ast],
1295 idx: usize,
1296 translator: &mut Option<Translator>,
1297 tb: &mut TB<'s>,
1298 ) -> Result<(NodeId, usize)> {
1299 use WordCharKind::*;
1300 let (word_id, not_word_id) = if self.global_unicode {
1301 self.unicode_classes.ensure_word(tb);
1302 (self.unicode_classes.word, self.unicode_classes.non_word)
1303 } else {
1304 let az = tb.mk_range_u8(b'a', b'z');
1305 let big = tb.mk_range_u8(b'A', b'Z');
1306 let dig = tb.mk_range_u8(b'0', b'9');
1307 let us = tb.mk_u8(b'_');
1308 let w = tb.mk_unions([az, big, dig, us].into_iter());
1309 (w, tb.mk_compl(w))
1310 };
1311 let left = self.resolve_word_kind(asts, idx, -1, translator, tb, word_id, not_word_id)?;
1312 let right =
1313 self.resolve_word_kind(asts, idx, 1, translator, tb, word_id, not_word_id)?;
1314
1315 match (left, right) {
1316 (NonWord, Word) | (Word, NonWord) => Ok((NodeId::EPS, idx + 1)),
1317 (Word, _) => {
1318 let neg = tb.mk_neg_lookahead(word_id, 0);
1319 Ok((neg, idx + 1))
1320 }
1321 (NonWord, _) => {
1322 let set = tb.mk_union(NodeId::END, word_id);
1323 let tail = tb.mk_concat(set, NodeId::TS);
1324 self.merge_boundary_with_following_lookaheads(asts, idx, tail, translator, tb)
1325 }
1326 (_, Word) => {
1327 Ok((tb.mk_neg_lookbehind(word_id), idx + 1))
1328 }
1329 (_, NonWord) => {
1330 let body = tb.mk_union(NodeId::BEGIN, word_id);
1331 Ok((tb.mk_lookbehind(body, NodeId::MISSING), idx + 1))
1332 }
1333 _ => return Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex)),
1336 }
1337 }
1338
1339 fn merge_boundary_with_following_lookaheads(
1340 &mut self,
1341 asts: &[Ast],
1342 wb_idx: usize,
1343 boundary_tail: NodeId,
1344 translator: &mut Option<Translator>,
1345 tb: &mut TB<'s>,
1346 ) -> Result<(NodeId, usize)> {
1347 let mut next = wb_idx + 1;
1348 let mut la_bodies = vec![boundary_tail];
1349 while next < asts.len() {
1350 match &asts[next] {
1351 Ast::Lookaround(la) if la.kind == ast::LookaroundKind::PositiveLookahead => {
1352 let body = self.ast_to_node_id(&la.ast, translator, tb)?;
1353 la_bodies.push(tb.mk_concat(body, NodeId::TS));
1354 next += 1;
1355 }
1356 _ => break,
1357 }
1358 }
1359 let merged = tb.mk_inters(la_bodies.into_iter());
1360 Ok((tb.mk_lookahead(merged, NodeId::MISSING, 0), next))
1361 }
1362
1363 fn ast_to_node_id(
1364 &mut self,
1365 ast: &Ast,
1366 translator: &mut Option<Translator>,
1367 tb: &mut TB<'s>,
1368 ) -> Result<NodeId> {
1369 match ast {
1370 Ast::Empty(_) => Ok(NodeId::EPS),
1371 Ast::Flags(f) => {
1372 let mut translator_builder = self.default_translator_builder();
1373 if let Some(state) = f.flags.flag_state(ast::Flag::CaseInsensitive) {
1374 translator_builder.case_insensitive(state);
1375 }
1376 if let Some(state) = f.flags.flag_state(ast::Flag::Unicode) {
1377 translator_builder.unicode(state);
1378 }
1379 if let Some(state) = f.flags.flag_state(ast::Flag::DotMatchesNewLine) {
1380 self.dot_all.set(state);
1381 }
1382 let concat_translator = Some(translator_builder.build());
1383 *translator = concat_translator;
1384 Ok(NodeId::EPS)
1385 }
1386 Ast::Literal(l) => {
1387 let ast_lit = regex_syntax::ast::Ast::literal(*l.to_owned());
1388 self.translator_to_node_id(&ast_lit, translator, tb)
1389 }
1390 Ast::Top(_) => Ok(NodeId::TOP),
1391 Ast::Dot(_) => {
1392 if self.dot_all.get() {
1393 Ok(NodeId::TOP)
1394 } else {
1395 let hirv = hir::Hir::dot(hir::Dot::AnyByteExceptLF);
1396 self.hir_to_node_id(&hirv, tb)
1397 }
1398 }
1399 Ast::Assertion(a) => match &a.kind {
1400 ast::AssertionKind::StartText => Ok(NodeId::BEGIN),
1401 ast::AssertionKind::EndText => Ok(NodeId::END),
1402 ast::AssertionKind::WordBoundary => {
1403 return Err(
1404 self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex),
1405 )
1406 }
1407 ast::AssertionKind::NotWordBoundary => {
1408 return Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))
1409 }
1410 ast::AssertionKind::StartLine => {
1411 let left = NodeId::BEGIN;
1412 let right = tb.mk_u8('\n' as u8);
1413 let union = tb.mk_union(left, right);
1414 Ok(tb.mk_lookbehind(union, NodeId::MISSING))
1415 }
1416 ast::AssertionKind::EndLine => {
1417 let left = NodeId::END;
1418 let right = tb.mk_u8('\n' as u8);
1419 let union = tb.mk_union(left, right);
1420 Ok(tb.mk_lookahead(union, NodeId::MISSING, 0))
1421 }
1422 ast::AssertionKind::WordBoundaryStart => todo!(),
1423 ast::AssertionKind::WordBoundaryEnd => todo!(),
1424 ast::AssertionKind::WordBoundaryStartAngle => todo!(),
1425 ast::AssertionKind::WordBoundaryEndAngle => Ok(tb.mk_string(">")),
1426 ast::AssertionKind::WordBoundaryStartHalf => todo!(),
1427 ast::AssertionKind::WordBoundaryEndHalf => todo!(),
1428 },
1429 Ast::ClassUnicode(c) => {
1430 let tmp = regex_syntax::ast::ClassUnicode {
1431 span: c.span,
1432 negated: c.negated,
1433 kind: c.kind.clone(),
1434 };
1435 if !c.negated {
1436 match &c.kind {
1437 regex_syntax::ast::ClassUnicodeKind::Named(s) => match s.as_str() {
1438 "ascii" => return Ok(tb.mk_range_u8(0, 127)),
1440 "utf8" => {
1442 let ascii = tb.mk_range_u8(0, 127);
1443 let beta = tb.mk_range_u8(128, 0xBF);
1444 let c0 = tb.mk_range_u8(0xC0, 0xDF);
1445 let c0s = tb.mk_concats([c0, beta].into_iter());
1446 let e0 = tb.mk_range_u8(0xE0, 0xEF);
1447 let e0s = tb.mk_concats([e0, beta, beta].into_iter());
1448 let f0 = tb.mk_range_u8(0xF0, 0xF7);
1449 let f0s = tb.mk_concats([f0, beta, beta, beta].into_iter());
1450 let merged = tb.mk_unions([ascii, c0s, e0s, f0s].into_iter());
1451 return Ok(tb.mk_star(merged));
1452 }
1453 "hex" => {
1454 let nums = tb.mk_range_u8(b'0', b'9');
1455 let lets = tb.mk_range_u8(b'a', b'f');
1456 let lets2 = tb.mk_range_u8(b'A', b'F');
1457 let merged = tb.mk_unions([nums, lets, lets2].into_iter());
1458 return Ok(merged);
1459 }
1460 _ => {}
1461 },
1462 _ => {}
1463 };
1464 }
1465
1466 let orig_ast = regex_syntax::ast::Ast::class_unicode(tmp);
1467 self.translator_to_node_id(&orig_ast, translator, tb)
1468 }
1469 Ast::ClassPerl(c) => {
1470 self.get_class(c.negated, c.kind.clone(), tb)
1471 }
1472 Ast::ClassBracketed(c) => match &c.kind {
1473 regex_syntax::ast::ClassSet::Item(_) => {
1474 let tmp = regex_syntax::ast::ClassBracketed {
1475 span: c.span,
1476 negated: c.negated,
1477 kind: c.kind.clone(),
1478 };
1479 let orig_ast = regex_syntax::ast::Ast::class_bracketed(tmp);
1480 self.translator_to_node_id(&orig_ast, translator, tb)
1481 }
1482 regex_syntax::ast::ClassSet::BinaryOp(_) => todo!(),
1483 },
1484 Ast::Repetition(r) => {
1485 let body = self.ast_to_node_id(&r.ast, translator, tb);
1486 match body {
1487 Ok(body) => match &r.op.kind {
1488 ast::RepetitionKind::ZeroOrOne => Ok(tb.mk_opt(body)),
1489 ast::RepetitionKind::ZeroOrMore => Ok(tb.mk_star(body)),
1490 ast::RepetitionKind::OneOrMore => Ok(tb.mk_plus(body)),
1491 ast::RepetitionKind::Range(r) => match r {
1492 ast::RepetitionRange::Exactly(n) => Ok(tb.mk_repeat(body, *n, *n)),
1493 ast::RepetitionRange::AtLeast(n) => {
1494 let rep = tb.mk_repeat(body, *n, *n);
1495 let st = tb.mk_star(body);
1496 Ok(tb.mk_concat(rep, st))
1497 }
1498
1499 ast::RepetitionRange::Bounded(n, m) => Ok(tb.mk_repeat(body, *n, *m)),
1500 },
1501 },
1502 Err(_) => body,
1503 }
1504 }
1505 Ast::Lookaround(g) => {
1506 let body = self.ast_to_node_id(&g.ast, translator, tb)?;
1507 match g.kind {
1508 ast::LookaroundKind::PositiveLookahead => {
1509 Ok(tb.mk_lookahead(body, NodeId::MISSING, 0))
1510 }
1511 ast::LookaroundKind::PositiveLookbehind => {
1512 Ok(tb.mk_lookbehind(body, NodeId::MISSING))
1513 }
1514 ast::LookaroundKind::NegativeLookahead => Ok(tb.mk_neg_lookahead(body, 0)),
1515 ast::LookaroundKind::NegativeLookbehind => Ok(tb.mk_neg_lookbehind(body)),
1516 }
1517 }
1518 Ast::Group(g) => {
1519 if let ast::GroupKind::NonCapturing(ref flags) = g.kind {
1520 if !flags.items.is_empty() {
1521 let mut translator_builder = self.default_translator_builder();
1522 if let Some(state) = flags.flag_state(ast::Flag::CaseInsensitive) {
1523 translator_builder.case_insensitive(state);
1524 }
1525 if let Some(state) = flags.flag_state(ast::Flag::Unicode) {
1526 translator_builder.unicode(state);
1527 }
1528 let saved_dot_all = self.dot_all.get();
1529 if let Some(state) = flags.flag_state(ast::Flag::DotMatchesNewLine) {
1530 self.dot_all.set(state);
1531 }
1532 let mut scoped = Some(translator_builder.build());
1533 let result = self.ast_to_node_id(&g.ast, &mut scoped, tb);
1534 self.dot_all.set(saved_dot_all);
1535 return result;
1536 }
1537 }
1538 self.ast_to_node_id(&g.ast, translator, tb)
1539 }
1540 Ast::Alternation(a) => {
1541 let mut children = vec![];
1542 for ast in &a.asts {
1543 match self.ast_to_node_id(ast, translator, tb) {
1544 Ok(node_id) => children.push(node_id),
1545 Err(err) => return Err(err),
1546 }
1547 }
1548 Ok(tb.mk_unions(children.iter().copied()))
1549 }
1550 Ast::Concat(c) => {
1551 let mut concat_translator: Option<Translator> = None;
1552 let mut children = vec![];
1553 let mut i = 0;
1554 while i < c.asts.len() {
1555 let ast = &c.asts[i];
1556 match ast {
1557 Ast::Flags(f) => {
1558 let mut translator_builder = self.default_translator_builder();
1559 if let Some(state) = f.flags.flag_state(ast::Flag::CaseInsensitive) {
1560 translator_builder.case_insensitive(state);
1561 }
1562 if let Some(state) = f.flags.flag_state(ast::Flag::Unicode) {
1563 translator_builder.unicode(state);
1564 }
1565 if let Some(state) = f.flags.flag_state(ast::Flag::DotMatchesNewLine) {
1566 self.dot_all.set(state);
1567 }
1568 concat_translator = Some(translator_builder.build());
1569 i += 1;
1570 continue;
1571 }
1572 Ast::Assertion(a) if a.kind == ast::AssertionKind::WordBoundary => {
1573 let node =
1574 self.rewrite_word_boundary_in_concat(&c.asts, i, translator, tb)?;
1575 children.push(node.0);
1576 i = node.1; continue;
1578 }
1579 _ => {}
1580 }
1581 match concat_translator {
1582 Some(_) => match self.ast_to_node_id(ast, &mut concat_translator, tb) {
1583 Ok(node_id) => children.push(node_id),
1584 Err(err) => return Err(err),
1585 },
1586 None => match self.ast_to_node_id(ast, translator, tb) {
1587 Ok(node_id) => children.push(node_id),
1588 Err(err) => return Err(err),
1589 },
1590 }
1591 i += 1;
1592 }
1593 Ok(tb.mk_concats(children.iter().cloned()))
1594 }
1595 Ast::Intersection(intersection) => {
1596 let mut children = vec![];
1597 for ast in &intersection.asts {
1598 match self.ast_to_node_id(ast, translator, tb) {
1599 Ok(node_id) => children.push(node_id),
1600 Err(err) => return Err(err),
1601 }
1602 }
1603 Ok(tb.mk_inters(children.into_iter()))
1604 }
1605 Ast::Complement(complement) => {
1606 let body = self.ast_to_node_id(&complement.ast, translator, tb);
1607 body.map(|x| tb.mk_compl(x))
1608 }
1609 }
1610 }
1611
1612 fn parse(&mut self, tb: &mut TB<'s>) -> Result<NodeId> {
1615 let mut concat = Concat {
1616 span: self.span(),
1617 asts: vec![],
1618 };
1619 loop {
1620 self.bump_space();
1621 if self.is_eof() {
1622 break;
1623 }
1624 match self.char() {
1625 '(' => concat = self.push_group(concat)?,
1626 ')' => concat = self.pop_group(concat)?,
1627 '|' => concat = self.push_alternate(concat)?,
1628 '&' => concat = self.push_intersect(concat)?,
1629 '~' => concat = self.push_compl_group(concat)?,
1630 '[' => {
1631 let class = self.parse_set_class()?;
1632 concat.asts.push(Ast::class_bracketed(class));
1633 }
1634 '?' => {
1635 concat =
1636 self.parse_uncounted_repetition(concat, ast::RepetitionKind::ZeroOrOne)?;
1637 }
1638 '*' => {
1639 concat =
1640 self.parse_uncounted_repetition(concat, ast::RepetitionKind::ZeroOrMore)?;
1641 }
1642 '+' => {
1643 concat =
1644 self.parse_uncounted_repetition(concat, ast::RepetitionKind::OneOrMore)?;
1645 }
1646 '{' => {
1647 concat = self.parse_counted_repetition(concat)?;
1648 }
1649 _ => concat.asts.push(self.parse_primitive()?.into_ast()),
1650 }
1651 }
1652 let ast = self.pop_group_end(concat)?;
1653 let node_id = self.ast_to_node_id(&ast, &mut None, tb);
1655 node_id
1656 }
1657
1658 #[inline(never)]
1659 fn parse_uncounted_repetition(
1660 &self,
1661 mut concat: ast::Concat,
1662 kind: ast::RepetitionKind,
1663 ) -> Result<ast::Concat> {
1664 let op_start = self.pos();
1666 let ast = match concat.asts.pop() {
1667 Some(ast) => ast,
1668 None => return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing)),
1669 };
1670 match ast {
1671 Ast::Empty(_) | Ast::Flags(_) => {
1672 return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing))
1673 }
1674 _ => {}
1675 }
1676 if self.bump() && self.char() == '?' {
1677 return Err(self.error(
1678 Span::new(op_start, self.pos()),
1679 ast::ErrorKind::UnsupportedLazyQuantifier,
1680 ));
1681 }
1682 concat.asts.push(Ast::repetition(ast::Repetition {
1683 span: ast.span().with_end(self.pos()),
1684 op: ast::RepetitionOp {
1685 span: Span::new(op_start, self.pos()),
1686 kind,
1687 },
1688 greedy: true,
1689 ast: Box::new(ast),
1690 }));
1691 Ok(concat)
1692 }
1693
1694 #[inline(never)]
1695 fn parse_counted_repetition(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
1696 assert!(self.char() == '{');
1697 let start = self.pos();
1698 let ast = match concat.asts.pop() {
1699 Some(ast) => ast,
1700 None => return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing)),
1701 };
1702 match ast {
1703 Ast::Empty(_) | Ast::Flags(_) => {
1704 return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing))
1705 }
1706 _ => {}
1707 }
1708 if !self.bump_and_bump_space() {
1709 return Err(self.error(
1710 Span::new(start, self.pos()),
1711 ast::ErrorKind::RepetitionCountUnclosed,
1712 ));
1713 }
1714 let count_start = specialize_err(
1715 self.parse_decimal(),
1716 ast::ErrorKind::DecimalEmpty,
1717 ast::ErrorKind::RepetitionCountDecimalEmpty,
1718 );
1719 if self.is_eof() {
1720 return Err(self.error(
1721 Span::new(start, self.pos()),
1722 ast::ErrorKind::RepetitionCountUnclosed,
1723 ));
1724 }
1725 let range = if self.char() == ',' {
1726 if !self.bump_and_bump_space() {
1727 return Err(self.error(
1728 Span::new(start, self.pos()),
1729 ast::ErrorKind::RepetitionCountUnclosed,
1730 ));
1731 }
1732 if self.char() != '}' {
1733 let count_start = match count_start {
1734 Ok(c) => c,
1735 Err(err) if err.kind == ast::ErrorKind::RepetitionCountDecimalEmpty => {
1736 if self.parser().empty_min_range {
1737 0
1738 } else {
1739 return Err(err);
1740 }
1741 }
1742 err => err?,
1743 };
1744 let count_end = specialize_err(
1745 self.parse_decimal(),
1746 ast::ErrorKind::DecimalEmpty,
1747 ast::ErrorKind::RepetitionCountDecimalEmpty,
1748 )?;
1749 ast::RepetitionRange::Bounded(count_start, count_end)
1750 } else {
1751 ast::RepetitionRange::AtLeast(count_start?)
1752 }
1753 } else {
1754 ast::RepetitionRange::Exactly(count_start?)
1755 };
1756
1757 if self.is_eof() || self.char() != '}' {
1758 return Err(self.error(
1759 Span::new(start, self.pos()),
1760 ast::ErrorKind::RepetitionCountUnclosed,
1761 ));
1762 }
1763
1764 if self.bump_and_bump_space() && self.char() == '?' {
1765 return Err(self.error(
1766 Span::new(start, self.pos()),
1767 ast::ErrorKind::UnsupportedLazyQuantifier,
1768 ));
1769 }
1770
1771 let op_span = Span::new(start, self.pos());
1772 if !range.is_valid() {
1773 return Err(self.error(op_span, ast::ErrorKind::RepetitionCountInvalid));
1774 }
1775 concat.asts.push(Ast::repetition(ast::Repetition {
1776 span: ast.span().with_end(self.pos()),
1777 op: ast::RepetitionOp {
1778 span: op_span,
1779 kind: ast::RepetitionKind::Range(range),
1780 },
1781 greedy: true,
1782 ast: Box::new(ast),
1783 }));
1784 Ok(concat)
1785 }
1786
1787 #[inline(never)]
1788 fn parse_group(&self) -> Result<Either<ast::SetFlags, ast::Group>> {
1789 assert_eq!(self.char(), '(');
1790 let open_span = self.span_char();
1791 self.bump();
1792 self.bump_space();
1793 if let Some((ahead, pos)) = self.is_lookaround_prefix() {
1794 let kind = match (pos, ahead) {
1795 (true, true) => LookaroundKind::PositiveLookahead,
1796 (true, false) => LookaroundKind::PositiveLookbehind,
1797 (false, true) => LookaroundKind::NegativeLookahead,
1798 (false, false) => LookaroundKind::NegativeLookbehind,
1799 };
1800 return Ok(Either::Right(ast::Group {
1801 span: open_span,
1802 kind: ast::GroupKind::Lookaround(kind),
1803 ast: Box::new(Ast::empty(self.span())),
1804 }));
1805 }
1806 let inner_span = self.span();
1807 let mut starts_with_p = true;
1808 if self.bump_if("?P<") || {
1809 starts_with_p = false;
1810 self.bump_if("?<")
1811 } {
1812 let capture_index = self.next_capture_index(open_span)?;
1813 let name = self.parse_capture_name(capture_index)?;
1814 Ok(Either::Right(ast::Group {
1815 span: open_span,
1816 kind: ast::GroupKind::CaptureName {
1817 starts_with_p,
1818 name,
1819 },
1820 ast: Box::new(Ast::empty(self.span())),
1821 }))
1822 } else if self.bump_if("?") {
1823 if self.is_eof() {
1824 return Err(self.error(open_span, ast::ErrorKind::GroupUnclosed));
1825 }
1826 let flags = self.parse_flags()?;
1827 let char_end = self.char();
1828 self.bump();
1829 if char_end == ')' {
1830 if flags.items.is_empty() {
1833 return Err(self.error(inner_span, ast::ErrorKind::RepetitionMissing));
1834 }
1835 Ok(Either::Left(ast::SetFlags {
1836 span: Span {
1837 end: self.pos(),
1838 ..open_span
1839 },
1840 flags,
1841 }))
1842 } else {
1843 assert_eq!(char_end, ':');
1844 Ok(Either::Right(ast::Group {
1845 span: open_span,
1846 kind: ast::GroupKind::NonCapturing(flags),
1847 ast: Box::new(Ast::empty(self.span())),
1848 }))
1849 }
1850 } else {
1851 let capture_index = self.next_capture_index(open_span)?;
1852 Ok(Either::Right(ast::Group {
1853 span: open_span,
1854 kind: ast::GroupKind::CaptureIndex(capture_index),
1855 ast: Box::new(Ast::empty(self.span())),
1856 }))
1857 }
1858 }
1859
1860 #[inline(never)]
1861 fn parse_capture_name(&self, capture_index: u32) -> Result<ast::CaptureName> {
1862 if self.is_eof() {
1863 return Err(self.error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
1864 }
1865 let start = self.pos();
1866 loop {
1867 if self.char() == '>' {
1868 break;
1869 }
1870 if !is_capture_char(self.char(), self.pos() == start) {
1871 return Err(self.error(self.span_char(), ast::ErrorKind::GroupNameInvalid));
1872 }
1873 if !self.bump() {
1874 break;
1875 }
1876 }
1877 let end = self.pos();
1878 if self.is_eof() {
1879 return Err(self.error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
1880 }
1881 assert_eq!(self.char(), '>');
1882 self.bump();
1883 let name = &self.pattern()[start.offset..end.offset];
1884 if name.is_empty() {
1885 return Err(self.error(Span::new(start, start), ast::ErrorKind::GroupNameEmpty));
1886 }
1887 let capname = ast::CaptureName {
1888 span: Span::new(start, end),
1889 name: name.to_string(),
1890 index: capture_index,
1891 };
1892 self.add_capture_name(&capname)?;
1893 Ok(capname)
1894 }
1895
1896 #[inline(never)]
1897 fn parse_flags(&self) -> Result<ast::Flags> {
1898 let mut flags = ast::Flags {
1899 span: self.span(),
1900 items: vec![],
1901 };
1902 let mut last_was_negation = None;
1903 while self.char() != ':' && self.char() != ')' {
1904 if self.char() == '-' {
1905 last_was_negation = Some(self.span_char());
1906 let item = ast::FlagsItem {
1907 span: self.span_char(),
1908 kind: ast::FlagsItemKind::Negation,
1909 };
1910 if let Some(i) = flags.add_item(item) {
1911 return Err(self.error(
1912 self.span_char(),
1913 ast::ErrorKind::FlagRepeatedNegation {
1914 original: flags.items[i].span,
1915 },
1916 ));
1917 }
1918 } else {
1919 last_was_negation = None;
1920 let item = ast::FlagsItem {
1921 span: self.span_char(),
1922 kind: ast::FlagsItemKind::Flag(self.parse_flag()?),
1923 };
1924 if let Some(i) = flags.add_item(item) {
1925 return Err(self.error(
1926 self.span_char(),
1927 ast::ErrorKind::FlagDuplicate {
1928 original: flags.items[i].span,
1929 },
1930 ));
1931 }
1932 }
1933 if !self.bump() {
1934 return Err(self.error(self.span(), ast::ErrorKind::FlagUnexpectedEof));
1935 }
1936 }
1937 if let Some(span) = last_was_negation {
1938 return Err(self.error(span, ast::ErrorKind::FlagDanglingNegation));
1939 }
1940 flags.span.end = self.pos();
1941 Ok(flags)
1942 }
1943
1944 #[inline(never)]
1945 fn parse_flag(&self) -> Result<ast::Flag> {
1946 match self.char() {
1947 'i' => Ok(ast::Flag::CaseInsensitive),
1948 'm' => Ok(ast::Flag::MultiLine),
1949 's' => Ok(ast::Flag::DotMatchesNewLine),
1950 'U' => Ok(ast::Flag::SwapGreed),
1951 'u' => Ok(ast::Flag::Unicode),
1952 'R' => Ok(ast::Flag::CRLF),
1953 'x' => Ok(ast::Flag::IgnoreWhitespace),
1954 _ => Err(self.error(self.span_char(), ast::ErrorKind::FlagUnrecognized)),
1955 }
1956 }
1957
1958 fn parse_primitive(&self) -> Result<Primitive> {
1959 match self.char() {
1960 '\\' => self.parse_escape(),
1961 '_' => {
1962 let ast = Primitive::Top(self.span_char());
1963 self.bump();
1964 Ok(ast)
1965 }
1966 '.' => {
1967 let ast = Primitive::Dot(self.span_char());
1968 self.bump();
1969 Ok(ast)
1970 }
1971 '^' => {
1972 let ast = Primitive::Assertion(ast::Assertion {
1973 span: self.span_char(),
1974 kind: ast::AssertionKind::StartLine,
1975 });
1976 self.bump();
1977 Ok(ast)
1978 }
1979 '$' => {
1980 let ast = Primitive::Assertion(ast::Assertion {
1981 span: self.span_char(),
1982 kind: ast::AssertionKind::EndLine,
1983 });
1984 self.bump();
1985 Ok(ast)
1986 }
1987 c => {
1988 let ast = Primitive::Literal(Literal {
1989 span: self.span_char(),
1990 kind: LiteralKind::Verbatim,
1991 c,
1992 });
1993 self.bump();
1994 Ok(ast)
1995 }
1996 }
1997 }
1998
1999 #[inline(never)]
2000 fn parse_escape(&self) -> Result<Primitive> {
2001 assert_eq!(self.char(), '\\');
2002 let start = self.pos();
2003 if !self.bump() {
2004 return Err(self.error(
2005 Span::new(start, self.pos()),
2006 ast::ErrorKind::EscapeUnexpectedEof,
2007 ));
2008 }
2009 let c = self.char();
2010 match c {
2012 '0'..='9' => {
2013 if !self.parser().octal {
2014 return Err(self.error(
2015 Span::new(start, self.span_char().end),
2016 ast::ErrorKind::UnsupportedBackreference,
2017 ));
2018 }
2019 let mut lit = self.parse_octal();
2020 lit.span.start = start;
2021 return Ok(Primitive::Literal(lit));
2022 }
2023 'x' | 'u' | 'U' => {
2030 let mut lit = self.parse_hex()?;
2031 lit.span.start = start;
2032 return Ok(Primitive::Literal(lit));
2033 }
2034 'p' | 'P' => {
2035 let mut cls = self.parse_unicode_class()?;
2036 cls.span.start = start;
2037 return Ok(Primitive::Unicode(cls));
2038 }
2039 'd' | 's' | 'w' | 'D' | 'S' | 'W' => {
2040 let mut cls = self.parse_perl_class();
2041 cls.span.start = start;
2042 return Ok(Primitive::Perl(cls));
2043 }
2044 _ => {}
2045 }
2046
2047 self.bump();
2049 let span = Span::new(start, self.pos());
2050 if is_meta_character(c) {
2051 return Ok(Primitive::Literal(Literal {
2052 span,
2053 kind: LiteralKind::Meta,
2054 c,
2055 }));
2056 }
2057 if is_escapeable_character(c) {
2058 return Ok(Primitive::Literal(Literal {
2059 span,
2060 kind: LiteralKind::Superfluous,
2061 c,
2062 }));
2063 }
2064 let special = |kind, c| {
2065 Ok(Primitive::Literal(Literal {
2066 span,
2067 kind: LiteralKind::Special(kind),
2068 c,
2069 }))
2070 };
2071 match c {
2072 'a' => special(SpecialLiteralKind::Bell, '\x07'),
2073 'f' => special(SpecialLiteralKind::FormFeed, '\x0C'),
2074 't' => special(SpecialLiteralKind::Tab, '\t'),
2075 'n' => special(SpecialLiteralKind::LineFeed, '\n'),
2076 'r' => special(SpecialLiteralKind::CarriageReturn, '\r'),
2077 'v' => special(SpecialLiteralKind::VerticalTab, '\x0B'),
2078 'A' => Ok(Primitive::Assertion(ast::Assertion {
2079 span,
2080 kind: ast::AssertionKind::StartText,
2081 })),
2082 'z' => Ok(Primitive::Assertion(ast::Assertion {
2083 span,
2084 kind: ast::AssertionKind::EndText,
2085 })),
2086 'b' => {
2087 let mut wb = ast::Assertion {
2088 span,
2089 kind: ast::AssertionKind::WordBoundary,
2090 };
2091 if !self.is_eof() && self.char() == '{' {
2094 if let Some(kind) = self.maybe_parse_special_word_boundary(start)? {
2095 wb.kind = kind;
2096 wb.span.end = self.pos();
2097 }
2098 }
2099 Ok(Primitive::Assertion(wb))
2100 }
2101 'B' => Ok(Primitive::Assertion(ast::Assertion {
2102 span,
2103 kind: ast::AssertionKind::NotWordBoundary,
2104 })),
2105 '<' => Ok(Primitive::Assertion(ast::Assertion {
2106 span,
2107 kind: ast::AssertionKind::WordBoundaryStartAngle,
2108 })),
2109 '>' => Ok(Primitive::Assertion(ast::Assertion {
2110 span,
2111 kind: ast::AssertionKind::WordBoundaryEndAngle,
2112 })),
2113 _ => Err(self.error(span, ast::ErrorKind::EscapeUnrecognized)),
2114 }
2115 }
2116
2117 fn maybe_parse_special_word_boundary(
2118 &self,
2119 wb_start: Position,
2120 ) -> Result<Option<ast::AssertionKind>> {
2121 assert_eq!(self.char(), '{');
2122
2123 let is_valid_char = |c| match c {
2124 'A'..='Z' | 'a'..='z' | '-' => true,
2125 _ => false,
2126 };
2127 let start = self.pos();
2128 if !self.bump_and_bump_space() {
2129 return Err(self.error(
2130 Span::new(wb_start, self.pos()),
2131 ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof,
2132 ));
2133 }
2134 let start_contents = self.pos();
2135 if !is_valid_char(self.char()) {
2140 self.parser().pos.set(start);
2141 return Ok(None);
2142 }
2143
2144 let mut scratch = self.parser().scratch.borrow_mut();
2146 scratch.clear();
2147 while !self.is_eof() && is_valid_char(self.char()) {
2148 scratch.push(self.char());
2149 self.bump_and_bump_space();
2150 }
2151 if self.is_eof() || self.char() != '}' {
2152 return Err(self.error(
2153 Span::new(start, self.pos()),
2154 ast::ErrorKind::SpecialWordBoundaryUnclosed,
2155 ));
2156 }
2157 let end = self.pos();
2158 self.bump();
2159 let kind = match scratch.as_str() {
2160 "start" => ast::AssertionKind::WordBoundaryStart,
2161 "end" => ast::AssertionKind::WordBoundaryEnd,
2162 "start-half" => ast::AssertionKind::WordBoundaryStartHalf,
2163 "end-half" => ast::AssertionKind::WordBoundaryEndHalf,
2164 _ => {
2165 return Err(self.error(
2166 Span::new(start_contents, end),
2167 ast::ErrorKind::SpecialWordBoundaryUnrecognized,
2168 ))
2169 }
2170 };
2171 Ok(Some(kind))
2172 }
2173
2174 #[inline(never)]
2175 fn parse_octal(&self) -> Literal {
2176 assert!(self.parser().octal);
2177 assert!('0' <= self.char() && self.char() <= '7');
2178 let start = self.pos();
2179 while self.bump()
2181 && '0' <= self.char()
2182 && self.char() <= '7'
2183 && self.pos().offset - start.offset <= 2
2184 {}
2185 let end = self.pos();
2186 let octal = &self.pattern()[start.offset..end.offset];
2187 let codepoint = u32::from_str_radix(octal, 8).expect("valid octal number");
2190 let c = char::from_u32(codepoint).expect("Unicode scalar value");
2193 Literal {
2194 span: Span::new(start, end),
2195 kind: LiteralKind::Octal,
2196 c,
2197 }
2198 }
2199
2200 #[inline(never)]
2201 fn parse_hex(&self) -> Result<Literal> {
2202 assert!(self.char() == 'x' || self.char() == 'u' || self.char() == 'U');
2203
2204 let hex_kind = match self.char() {
2205 'x' => HexLiteralKind::X,
2206 'u' => HexLiteralKind::UnicodeShort,
2207 _ => HexLiteralKind::UnicodeLong,
2208 };
2209 if !self.bump_and_bump_space() {
2210 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2211 }
2212 if self.char() == '{' {
2213 self.parse_hex_brace(hex_kind)
2214 } else {
2215 self.parse_hex_digits(hex_kind)
2216 }
2217 }
2218
2219 #[inline(never)]
2220 fn parse_hex_digits(&self, kind: HexLiteralKind) -> Result<Literal> {
2221 let mut scratch = self.parser().scratch.borrow_mut();
2222 scratch.clear();
2223
2224 let start = self.pos();
2225 for i in 0..kind.digits() {
2226 if i > 0 && !self.bump_and_bump_space() {
2227 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2228 }
2229 if !is_hex(self.char()) {
2230 return Err(self.error(self.span_char(), ast::ErrorKind::EscapeHexInvalidDigit));
2231 }
2232 scratch.push(self.char());
2233 }
2234 self.bump_and_bump_space();
2237 let end = self.pos();
2238 let hex = scratch.as_str();
2239 match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
2240 None => Err(self.error(Span::new(start, end), ast::ErrorKind::EscapeHexInvalid)),
2241 Some(c) => Ok(Literal {
2242 span: Span::new(start, end),
2243 kind: LiteralKind::HexFixed(kind),
2244 c,
2245 }),
2246 }
2247 }
2248
2249 #[inline(never)]
2250 fn parse_hex_brace(&self, kind: HexLiteralKind) -> Result<Literal> {
2251 let mut scratch = self.parser().scratch.borrow_mut();
2252 scratch.clear();
2253
2254 let brace_pos = self.pos();
2255 let start = self.span_char().end;
2256 while self.bump_and_bump_space() && self.char() != '}' {
2257 if !is_hex(self.char()) {
2258 return Err(self.error(self.span_char(), ast::ErrorKind::EscapeHexInvalidDigit));
2259 }
2260 scratch.push(self.char());
2261 }
2262 if self.is_eof() {
2263 return Err(self.error(
2264 Span::new(brace_pos, self.pos()),
2265 ast::ErrorKind::EscapeUnexpectedEof,
2266 ));
2267 }
2268 let end = self.pos();
2269 let hex = scratch.as_str();
2270 assert_eq!(self.char(), '}');
2271 self.bump_and_bump_space();
2272
2273 if hex.is_empty() {
2274 return Err(self.error(
2275 Span::new(brace_pos, self.pos()),
2276 ast::ErrorKind::EscapeHexEmpty,
2277 ));
2278 }
2279 match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
2280 None => Err(self.error(Span::new(start, end), ast::ErrorKind::EscapeHexInvalid)),
2281 Some(c) => Ok(Literal {
2282 span: Span::new(start, self.pos()),
2283 kind: LiteralKind::HexBrace(kind),
2284 c,
2285 }),
2286 }
2287 }
2288
2289 fn parse_decimal(&self) -> Result<u32> {
2290 let mut scratch = self.parser().scratch.borrow_mut();
2291 scratch.clear();
2292
2293 while !self.is_eof() && self.char().is_whitespace() {
2294 self.bump();
2295 }
2296 let start = self.pos();
2297 while !self.is_eof() && '0' <= self.char() && self.char() <= '9' {
2298 scratch.push(self.char());
2299 self.bump_and_bump_space();
2300 }
2301 let span = Span::new(start, self.pos());
2302 while !self.is_eof() && self.char().is_whitespace() {
2303 self.bump_and_bump_space();
2304 }
2305 let digits = scratch.as_str();
2306 if digits.is_empty() {
2307 return Err(self.error(span, ast::ErrorKind::DecimalEmpty));
2308 }
2309 match u32::from_str_radix(digits, 10).ok() {
2310 Some(n) => Ok(n),
2311 None => Err(self.error(span, ast::ErrorKind::DecimalInvalid)),
2312 }
2313 }
2314
2315 #[inline(never)]
2316 fn parse_set_class(&self) -> Result<ClassBracketed> {
2317 assert_eq!(self.char(), '[');
2318
2319 let mut union = ClassSetUnion {
2320 span: self.span(),
2321 items: vec![],
2322 };
2323 loop {
2324 self.bump_space();
2325 if self.is_eof() {
2326 return Err(self.unclosed_class_error());
2327 }
2328 match self.char() {
2329 '[' => {
2330 if !self.parser().stack_class.borrow().is_empty() {
2335 if let Some(cls) = self.maybe_parse_ascii_class() {
2336 union.push(ClassSetItem::Ascii(cls));
2337 continue;
2338 }
2339 }
2340 union = self.push_class_open(union)?;
2341 }
2342 ']' => match self.pop_class(union)? {
2343 Either::Left(nested_union) => {
2344 union = nested_union;
2345 }
2346 Either::Right(class) => return Ok(class),
2347 },
2348 '&' if self.peek() == Some('&') => {
2349 assert!(self.bump_if("&&"));
2350 union = self.push_class_op(ClassSetBinaryOpKind::Intersection, union);
2351 }
2352 '-' if self.peek() == Some('-') => {
2353 assert!(self.bump_if("--"));
2354 union = self.push_class_op(ClassSetBinaryOpKind::Difference, union);
2355 }
2356 '~' if self.peek() == Some('~') => {
2357 assert!(self.bump_if("~~"));
2358 union = self.push_class_op(ClassSetBinaryOpKind::SymmetricDifference, union);
2359 }
2360 _ => {
2361 union.push(self.parse_set_class_range()?);
2362 }
2363 }
2364 }
2365 }
2366
2367 #[inline(never)]
2368 fn parse_set_class_range(&self) -> Result<ClassSetItem> {
2369 let prim1 = self.parse_set_class_item()?;
2370 self.bump_space();
2371 if self.is_eof() {
2372 return Err(self.unclosed_class_error());
2373 }
2374 if self.char() != '-' || self.peek_space() == Some(']') || self.peek_space() == Some('-') {
2375 return prim1.into_class_set_item(self);
2376 }
2377 if !self.bump_and_bump_space() {
2378 return Err(self.unclosed_class_error());
2379 }
2380 let prim2 = self.parse_set_class_item()?;
2381 let range = ClassSetRange {
2382 span: Span::new(prim1.span().start, prim2.span().end),
2383 start: prim1.into_class_literal(self)?,
2384 end: prim2.into_class_literal(self)?,
2385 };
2386 if !range.is_valid() {
2387 return Err(self.error(range.span, ast::ErrorKind::ClassRangeInvalid));
2388 }
2389 Ok(ClassSetItem::Range(range))
2390 }
2391
2392 #[inline(never)]
2393 fn parse_set_class_item(&self) -> Result<Primitive> {
2394 if self.char() == '\\' {
2395 self.parse_escape()
2396 } else {
2397 let x = Primitive::Literal(Literal {
2398 span: self.span_char(),
2399 kind: LiteralKind::Verbatim,
2400 c: self.char(),
2401 });
2402 self.bump();
2403 Ok(x)
2404 }
2405 }
2406
2407 #[inline(never)]
2408 fn parse_set_class_open(&self) -> Result<(ClassBracketed, ClassSetUnion)> {
2409 assert_eq!(self.char(), '[');
2410 let start = self.pos();
2411 if !self.bump_and_bump_space() {
2412 return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2413 }
2414
2415 let negated = if self.char() != '^' {
2416 false
2417 } else {
2418 if !self.bump_and_bump_space() {
2419 return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2420 }
2421 true
2422 };
2423 let mut union = ClassSetUnion {
2425 span: self.span(),
2426 items: vec![],
2427 };
2428 while self.char() == '-' {
2429 union.push(ClassSetItem::Literal(Literal {
2430 span: self.span_char(),
2431 kind: LiteralKind::Verbatim,
2432 c: '-',
2433 }));
2434 if !self.bump_and_bump_space() {
2435 return Err(self.error(Span::new(start, start), ast::ErrorKind::ClassUnclosed));
2436 }
2437 }
2438 if union.items.is_empty() && self.char() == ']' {
2441 union.push(ClassSetItem::Literal(Literal {
2442 span: self.span_char(),
2443 kind: LiteralKind::Verbatim,
2444 c: ']',
2445 }));
2446 if !self.bump_and_bump_space() {
2447 return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2448 }
2449 }
2450 let set = ClassBracketed {
2451 span: Span::new(start, self.pos()),
2452 negated,
2453 kind: ClassSet::union(ClassSetUnion {
2454 span: Span::new(union.span.start, union.span.start),
2455 items: vec![],
2456 }),
2457 };
2458 Ok((set, union))
2459 }
2460
2461 #[inline(never)]
2462 fn maybe_parse_ascii_class(&self) -> Option<ClassAscii> {
2463 assert_eq!(self.char(), '[');
2464 let start = self.pos();
2466 let mut negated = false;
2467 if !self.bump() || self.char() != ':' {
2468 self.parser().pos.set(start);
2469 return None;
2470 }
2471 if !self.bump() {
2472 self.parser().pos.set(start);
2473 return None;
2474 }
2475 if self.char() == '^' {
2476 negated = true;
2477 if !self.bump() {
2478 self.parser().pos.set(start);
2479 return None;
2480 }
2481 }
2482 let name_start = self.offset();
2483 while self.char() != ':' && self.bump() {}
2484 if self.is_eof() {
2485 self.parser().pos.set(start);
2486 return None;
2487 }
2488 let name = &self.pattern()[name_start..self.offset()];
2489 if !self.bump_if(":]") {
2490 self.parser().pos.set(start);
2491 return None;
2492 }
2493 let kind = match regex_syntax::ast::ClassAsciiKind::from_name(name) {
2494 Some(kind) => kind,
2495 None => {
2496 self.parser().pos.set(start);
2497 return None;
2498 }
2499 };
2500 Some(ClassAscii {
2501 span: Span::new(start, self.pos()),
2502 kind,
2503 negated,
2504 })
2505 }
2506
2507 #[inline(never)]
2508 fn parse_unicode_class(&self) -> Result<ClassUnicode> {
2509 assert!(self.char() == 'p' || self.char() == 'P');
2510
2511 let mut scratch = self.parser().scratch.borrow_mut();
2512 scratch.clear();
2513
2514 let negated = self.char() == 'P';
2515 if !self.bump_and_bump_space() {
2516 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2517 }
2518 let (start, kind) = if self.char() == '{' {
2519 let start = self.span_char().end;
2520 while self.bump_and_bump_space() && self.char() != '}' {
2521 scratch.push(self.char());
2522 }
2523 if self.is_eof() {
2524 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2525 }
2526 assert_eq!(self.char(), '}');
2527 self.bump();
2528
2529 let name = scratch.as_str();
2530 if let Some(i) = name.find("!=") {
2531 (
2532 start,
2533 ClassUnicodeKind::NamedValue {
2534 op: ClassUnicodeOpKind::NotEqual,
2535 name: name[..i].to_string(),
2536 value: name[i + 2..].to_string(),
2537 },
2538 )
2539 } else if let Some(i) = name.find(':') {
2540 (
2541 start,
2542 ClassUnicodeKind::NamedValue {
2543 op: ClassUnicodeOpKind::Colon,
2544 name: name[..i].to_string(),
2545 value: name[i + 1..].to_string(),
2546 },
2547 )
2548 } else if let Some(i) = name.find('=') {
2549 (
2550 start,
2551 ClassUnicodeKind::NamedValue {
2552 op: ClassUnicodeOpKind::Equal,
2553 name: name[..i].to_string(),
2554 value: name[i + 1..].to_string(),
2555 },
2556 )
2557 } else {
2558 (start, ClassUnicodeKind::Named(name.to_string()))
2559 }
2560 } else {
2561 let start = self.pos();
2562 let c = self.char();
2563 if c == '\\' {
2564 return Err(self.error(self.span_char(), ast::ErrorKind::UnicodeClassInvalid));
2565 }
2566 self.bump_and_bump_space();
2567 let kind = ClassUnicodeKind::OneLetter(c);
2568 (start, kind)
2569 };
2570 Ok(ClassUnicode {
2571 span: Span::new(start, self.pos()),
2572 negated,
2573 kind,
2574 })
2575 }
2576
2577 #[inline(never)]
2578 fn parse_perl_class(&self) -> ClassPerl {
2579 let c = self.char();
2580 let span = self.span_char();
2581 self.bump();
2582 let (negated, kind) = match c {
2583 'd' => (false, regex_syntax::ast::ClassPerlKind::Digit),
2584 'D' => (true, regex_syntax::ast::ClassPerlKind::Digit),
2585 's' => (false, regex_syntax::ast::ClassPerlKind::Space),
2586 'S' => (true, regex_syntax::ast::ClassPerlKind::Space),
2587 'w' => (false, regex_syntax::ast::ClassPerlKind::Word),
2588 'W' => (true, regex_syntax::ast::ClassPerlKind::Word),
2589 c => panic!("expected valid Perl class but got '{}'", c),
2590 };
2591 ClassPerl {
2592 span,
2593 kind,
2594 negated,
2595 }
2596 }
2597}
2598
2599pub fn parse_ast<'s>(
2600 tb: &mut TB<'s>,
2601 pattern: &'s str,
2602) -> std::result::Result<NodeId, ResharpError> {
2603 let mut p: ResharpParser<'s> = ResharpParser::new(pattern);
2604 p.parse(tb)
2605}
2606
2607pub fn parse_ast_with<'s>(
2608 tb: &mut TB<'s>,
2609 pattern: &'s str,
2610 flags: &PatternFlags,
2611) -> std::result::Result<NodeId, ResharpError> {
2612 let mut p: ResharpParser<'s> = ResharpParser::with_flags(pattern, flags);
2613 p.parse(tb)
2614}