1#![warn(dead_code)]
6pub mod ast;
7use std::cell::{Cell, RefCell};
8
9use ast::{Ast, Concat, ErrorKind, GroupKind, LookaroundKind};
10use regex_syntax::{
11 ast::{
12 ClassAscii, ClassBracketed, ClassPerl, ClassSet, ClassSetBinaryOpKind, ClassSetItem,
13 ClassSetRange, ClassSetUnion, ClassUnicode, ClassUnicodeKind, ClassUnicodeOpKind,
14 HexLiteralKind, Literal, LiteralKind, Position, Span, SpecialLiteralKind,
15 },
16 hir::{
17 self,
18 translate::{Translator, TranslatorBuilder},
19 },
20 utf8::Utf8Sequences,
21};
22use resharp_algebra::NodeId;
23
/// Shorthand for the `resharp_algebra` regex builder that the parser's
/// translation methods create nodes in. The `'s` parameter is accepted for
/// call-site symmetry with `ResharpParser<'s>` but is not used by the aliased type.
type TB<'s> = resharp_algebra::RegexBuilder;
25
/// Options that control how a pattern is parsed and translated.
pub struct PatternFlags {
    /// Forwarded to the HIR translator's `unicode` setting; also enables the
    /// parser's Unicode handling of the Perl classes.
    pub unicode: bool,
    /// Makes the parser use the `ensure_*_full` Unicode class tables instead
    /// of the default ones.
    pub full_unicode: bool,
    /// Forwarded to the HIR translator's `case_insensitive` setting.
    pub case_insensitive: bool,
    /// Initial value of the parser's `dot_all` state.
    pub dot_matches_new_line: bool,
    /// Initial value of the parser's whitespace-insensitive mode, in which
    /// whitespace and `#` comments inside the pattern are skipped.
    pub ignore_whitespace: bool,
    /// Builds `\w`, `\d` and `\s` from ASCII byte ranges only.
    pub ascii_perl_classes: bool,
}
42
// NOTE(review): the uses of these limits are outside the visible part of the
// file; the descriptions below are inferred from the names only — confirm.

/// Presumably the largest repetition count (`{n}` / `{n,m}`) that is accepted.
const REPETITION_COUNT_LIMIT: u32 = 2_000;
/// Presumably an upper bound on the size of the AST after repetitions are expanded.
const EXPANDED_AST_LIMIT: u64 = 50_000;
/// Presumably the maximum number of items accepted in a single list-like construct.
const MAX_LIST_LEN: usize = 4_000;
48
49impl Default for PatternFlags {
50 fn default() -> Self {
51 Self {
52 unicode: true,
53 full_unicode: false,
54 case_insensitive: false,
55 dot_matches_new_line: false,
56 ignore_whitespace: false,
57 ascii_perl_classes: false,
58 }
59 }
60}
61
/// Classification of the character that sits at one edge of a sub-expression,
/// used when rewriting word boundaries.
#[derive(Clone, Copy, PartialEq, Debug)]
enum WordCharKind {
    /// The edge character is a word character.
    Word,
    /// The edge character is not a word character.
    NonWord,
    /// A nullable repetition of word characters: a word character if it
    /// matches anything at all.
    MaybeWord,
    /// A nullable repetition of non-word characters.
    MaybeNonWord,
    /// Nothing could be determined syntactically.
    Unknown,
    /// There is no neighbouring element (the index fell outside the concatenation).
    Edge,
}
71
/// Returns `true` when `b` is an ASCII letter, an ASCII digit or `_`.
fn is_word_byte(b: u8) -> bool {
    matches!(b, b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_')
}
75
/// A single atomic element produced while parsing, before it is turned into
/// either an [`Ast`] node or an item of a bracketed character class.
#[derive(Clone, Debug, Eq, PartialEq)]
enum Primitive {
    /// A literal character.
    Literal(Literal),
    /// An assertion from the local `ast` module.
    Assertion(ast::Assertion),
    /// A dot; only its span is recorded.
    Dot(Span),
    /// A "top" element (converted with `Ast::top`); only its span is recorded.
    Top(Span),
    /// A Perl character class such as `\w`, `\d` or `\s`.
    Perl(ClassPerl),
    /// A Unicode character class.
    Unicode(ClassUnicode),
}
85
86impl Primitive {
87 fn span(&self) -> &Span {
88 match *self {
89 Primitive::Literal(ref x) => &x.span,
90 Primitive::Assertion(ref x) => &x.span,
91 Primitive::Dot(ref span) => span,
92 Primitive::Top(ref span) => span,
93 Primitive::Perl(ref x) => &x.span,
94 Primitive::Unicode(ref x) => &x.span,
95 }
96 }
97
98 fn into_ast(self) -> Ast {
99 match self {
100 Primitive::Literal(lit) => Ast::literal(lit),
101 Primitive::Assertion(assert) => Ast::assertion(assert),
102 Primitive::Dot(span) => Ast::dot(span),
103 Primitive::Top(span) => Ast::top(span),
104 Primitive::Perl(cls) => Ast::class_perl(cls),
105 Primitive::Unicode(cls) => Ast::class_unicode(cls),
106 }
107 }
108
109 fn into_class_set_item(self, p: &ResharpParser) -> Result<regex_syntax::ast::ClassSetItem> {
110 use self::Primitive::*;
111 use regex_syntax::ast::ClassSetItem;
112
113 match self {
114 Literal(lit) => Ok(ClassSetItem::Literal(lit)),
115 Perl(cls) => Ok(ClassSetItem::Perl(cls)),
116 Unicode(cls) => Ok(ClassSetItem::Unicode(cls)),
117 x => Err(p.error(*x.span(), ast::ErrorKind::ClassEscapeInvalid)),
118 }
119 }
120
121 fn into_class_literal(self, p: &ResharpParser) -> Result<Literal> {
122 use self::Primitive::*;
123
124 match self {
125 Literal(lit) => Ok(lit),
126 x => Err(p.error(*x.span(), ast::ErrorKind::ClassRangeLiteral)),
127 }
128 }
129}
130
/// A value that is one of two alternatives. The parser uses it where an
/// operation can produce two differently-typed results (for example a flag
/// set versus a group).
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum Either<Left, Right> {
    /// The first alternative.
    Left(Left),
    /// The second alternative.
    Right(Right),
}
136
/// An error produced while parsing or translating a pattern.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct ResharpError {
    /// What went wrong.
    pub kind: ErrorKind,
    /// Owned copy of the pattern that was being parsed when the error was created.
    pattern: String,
    /// Location in the pattern the error refers to.
    pub span: Span,
}
147
148impl std::fmt::Display for ResharpError {
149 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
150 write!(f, "{:?}: {:?}", self.kind, self.span)
151 }
152}
// Marker impl so `ResharpError` can be used wherever `std::error::Error` is
// expected; all provided methods keep their defaults.
impl std::error::Error for ResharpError {}
154
/// Result type used throughout this module; the error is always [`ResharpError`].
type Result<T> = core::result::Result<T, ResharpError>;
156
/// One entry of the parser's group stack.
#[derive(Clone, Debug)]
enum GroupState {
    /// An opened group that has not been closed yet.
    Group {
        /// The concatenation that was being built before the group was opened.
        concat: Concat,
        /// The group itself; its body is filled in when the group is closed.
        group: ast::Group,
        /// The whitespace-insensitive setting in effect before the group was
        /// opened; it is restored when the group is closed.
        ignore_whitespace: bool,
    },
    /// Branches of an alternation collected so far.
    Alternation(ast::Alternation),
    /// Operands of an intersection collected so far.
    Intersection(ast::Intersection),
}
175
/// One entry of the parser's bracketed-character-class stack.
#[derive(Clone, Debug)]
enum ClassState {
    /// A bracketed class that has been opened but not closed.
    Open {
        /// The union of the enclosing class that was being built when this
        /// class was opened.
        union: regex_syntax::ast::ClassSetUnion,
        /// The class that was opened; its span end and kind are filled in on close.
        set: regex_syntax::ast::ClassBracketed,
    },
    /// A binary set operator whose right-hand side is still being parsed.
    Op {
        /// Which binary operator was seen.
        kind: regex_syntax::ast::ClassSetBinaryOpKind,
        /// The already-parsed left-hand side of the operator.
        lhs: regex_syntax::ast::ClassSet,
    },
}
196
/// Parser state for a single pattern.
///
/// Position and stacks live in `Cell`/`RefCell` so that the parsing methods
/// can advance through the pattern while only holding `&self`.
pub struct ResharpParser<'s> {
    /// Cache of already-built Perl classes, keyed by (negated, kind).
    perl_classes: Vec<(bool, regex_syntax::ast::ClassPerlKind, NodeId)>,
    /// Lazily populated Unicode word/digit/space class nodes.
    unicode_classes: resharp_algebra::UnicodeClassCache,
    /// `regex_syntax` translator used to turn AST fragments into HIR.
    pub translator: regex_syntax::hir::translate::Translator,
    /// The pattern being parsed.
    pub pattern: &'s str,
    /// Current position (byte offset, line, column) in the pattern.
    pos: Cell<Position>,
    /// Index handed out to the most recently opened capture group.
    capture_index: Cell<u32>,
    /// Always initialised to `false` here; its use is outside this view.
    /// NOTE(review): presumably toggles octal escape support — confirm.
    octal: bool,
    /// Always initialised to `false` here; its use is outside this view.
    /// NOTE(review): presumably allows `{,n}` style ranges — confirm.
    empty_min_range: bool,
    /// Whether whitespace-insensitive mode is currently active.
    ignore_whitespace: Cell<bool>,
    /// Initialised from `PatternFlags::dot_matches_new_line`.
    dot_all: Cell<bool>,
    /// True when any of the `unicode`, `full_unicode` or `ascii_perl_classes` flags was set.
    global_unicode: bool,
    /// Copy of `PatternFlags::full_unicode`.
    global_full_unicode: bool,
    /// Copy of `PatternFlags::ascii_perl_classes`.
    global_ascii_perl: bool,
    /// Copy of `PatternFlags::case_insensitive`.
    global_case_insensitive: bool,
    /// Comments collected while skipping whitespace in whitespace-insensitive mode.
    comments: RefCell<Vec<ast::Comment>>,
    /// Stack of open groups and pending alternations/intersections.
    stack_group: RefCell<Vec<GroupState>>,
    /// Stack of open bracketed classes and pending set operators.
    stack_class: RefCell<Vec<ClassState>>,
    /// Capture names seen so far, kept sorted by name for duplicate detection.
    capture_names: RefCell<Vec<ast::CaptureName>>,
    /// Reusable string buffer; its use is outside this view.
    scratch: RefCell<String>,
}
219
220fn specialize_err<T>(result: Result<T>, from: ast::ErrorKind, to: ast::ErrorKind) -> Result<T> {
221 result.map_err(|e| {
222 if e.kind == from {
223 ResharpError {
224 kind: to,
225 pattern: e.pattern,
226 span: e.span,
227 }
228 } else {
229 e
230 }
231 })
232}
233
/// Returns `true` when `c` may appear in a capture group name.
///
/// The first character must be `_` or alphabetic; later characters may also
/// be digits, `.`, `[` or `]`.
fn is_capture_char(c: char, first: bool) -> bool {
    match c {
        '_' => true,
        // Punctuation that is only allowed after the first character.
        '.' | '[' | ']' => !first,
        _ if first => c.is_alphabetic(),
        _ => c.is_alphanumeric(),
    }
}
241
/// Returns `true` when `c` has a special meaning in the pattern syntax and
/// therefore needs a backslash to be matched literally.
pub fn is_meta_character(c: char) -> bool {
    const META: &[char] = &[
        '\\', '.', '+', '*', '?', '(', ')', '|', '[', ']', '{', '}', '^', '$', '#', '&', '-',
        '~', '_',
    ];
    META.contains(&c)
}
265
266pub fn escape(text: &str) -> String {
268 let mut buf = String::new();
269 escape_into(text, &mut buf);
270 buf
271}
272
273pub fn escape_into(text: &str, buf: &mut String) {
275 buf.reserve(text.len());
276 for c in text.chars() {
277 if is_meta_character(c) {
278 buf.push('\\');
279 }
280 buf.push(c);
281 }
282}
283
284pub fn is_escapeable_character(c: char) -> bool {
285 if is_meta_character(c) {
287 return true;
288 }
289 if !c.is_ascii() {
292 return false;
293 }
294 match c {
299 '0'..='9' | 'A'..='Z' | 'a'..='z' => false,
300 '<' | '>' => false,
310 _ => true,
311 }
312}
313
/// Returns `true` when `c` is an ASCII hexadecimal digit (`0-9`, `a-f`, `A-F`).
fn is_hex(c: char) -> bool {
    c.is_ascii_hexdigit()
}
317
318impl<'s> ResharpParser<'s> {
319 fn default_translator_builder(&self) -> TranslatorBuilder {
320 let mut trb = TranslatorBuilder::new();
321 trb.unicode(self.global_unicode);
322 trb.utf8(false);
323 trb.case_insensitive(self.global_case_insensitive);
324 trb
325 }
326
327 pub fn new(pattern: &'s str) -> Self {
328 Self::with_flags(pattern, &PatternFlags::default())
329 }
330
331 pub fn with_flags(pattern: &'s str, flags: &PatternFlags) -> Self {
332 let mut trb = TranslatorBuilder::new();
333 trb.unicode(flags.unicode);
334 trb.utf8(false);
335 trb.case_insensitive(flags.case_insensitive);
336 Self {
337 translator: trb.build(),
338 pattern,
339 perl_classes: vec![],
340 unicode_classes: resharp_algebra::UnicodeClassCache::default(),
341 pos: Cell::new(Position::new(0, 0, 0)),
342 capture_index: Cell::new(0),
343 octal: false,
344 empty_min_range: false,
345 ignore_whitespace: Cell::new(flags.ignore_whitespace),
346 dot_all: Cell::new(flags.dot_matches_new_line),
347 global_unicode: flags.unicode || flags.full_unicode || flags.ascii_perl_classes,
348 global_full_unicode: flags.full_unicode,
349 global_ascii_perl: flags.ascii_perl_classes,
350 global_case_insensitive: flags.case_insensitive,
351 comments: RefCell::new(vec![]),
352 stack_group: RefCell::new(vec![]),
353 stack_class: RefCell::new(vec![]),
354 capture_names: RefCell::new(vec![]),
355 scratch: RefCell::new(String::new()),
356 }
357 }
358
    /// Returns the parser itself. The other methods reach the parser's state
    /// through this accessor rather than through `self` directly.
    fn parser(&'_ self) -> &'_ ResharpParser<'_> {
        self
    }
363
    /// Returns the pattern being parsed.
    fn pattern(&self) -> &str {
        self.pattern
    }
368
369 fn error(&self, span: Span, kind: ast::ErrorKind) -> ResharpError {
371 ResharpError {
372 kind,
373 pattern: self.pattern().to_string(),
374 span,
375 }
376 }
377
378 fn unsupported_error(&self, _: regex_syntax::hir::Error) -> ResharpError {
379 self.error(
380 Span::splat(self.pos()),
381 ast::ErrorKind::UnsupportedResharpRegex,
382 )
383 }
384
385 fn offset(&self) -> usize {
390 self.parser().pos.get().offset
391 }
392
393 fn line(&self) -> usize {
397 self.parser().pos.get().line
398 }
399
400 fn column(&self) -> usize {
404 self.parser().pos.get().column
405 }
406
407 fn next_capture_index(&self, span: Span) -> Result<u32> {
415 let current = self.parser().capture_index.get();
416 let i = current
417 .checked_add(1)
418 .ok_or_else(|| self.error(span, ast::ErrorKind::CaptureLimitExceeded))?;
419 self.parser().capture_index.set(i);
420 Ok(i)
421 }
422
423 fn add_capture_name(&self, cap: &ast::CaptureName) -> Result<()> {
424 let mut names = self.parser().capture_names.borrow_mut();
425 match names.binary_search_by_key(&cap.name.as_str(), |c| c.name.as_str()) {
426 Err(i) => {
427 names.insert(i, cap.clone());
428 Ok(())
429 }
430 Ok(i) => Err(self.error(
431 cap.span,
432 ast::ErrorKind::GroupNameDuplicate {
433 original: names[i].span,
434 },
435 )),
436 }
437 }
438
439 fn ignore_whitespace(&self) -> bool {
440 self.parser().ignore_whitespace.get()
441 }
442
443 fn char(&self) -> char {
444 self.char_at(self.offset())
445 }
446
447 fn char_at(&self, i: usize) -> char {
448 self.pattern()[i..]
449 .chars()
450 .next()
451 .unwrap_or_else(|| panic!("expected char at offset {}", i))
452 }
453
454 fn bump(&self) -> bool {
455 if self.is_eof() {
456 return false;
457 }
458 let Position {
459 mut offset,
460 mut line,
461 mut column,
462 } = self.pos();
463 if self.char() == '\n' {
464 line = line.checked_add(1).unwrap();
465 column = 1;
466 } else {
467 column = column.checked_add(1).unwrap();
468 }
469 offset += self.char().len_utf8();
470 self.parser().pos.set(Position {
471 offset,
472 line,
473 column,
474 });
475 self.pattern()[self.offset()..].chars().next().is_some()
476 }
477
478 fn bump_if(&self, prefix: &str) -> bool {
479 if self.pattern()[self.offset()..].starts_with(prefix) {
480 for _ in 0..prefix.chars().count() {
481 self.bump();
482 }
483 true
484 } else {
485 false
486 }
487 }
488
489 fn is_lookaround_prefix(&self) -> Option<(bool, bool)> {
490 if self.bump_if("?=") {
491 return Some((true, true));
492 }
493 if self.bump_if("?!") {
494 return Some((true, false));
495 }
496 if self.bump_if("?<=") {
497 return Some((false, true));
498 }
499 if self.bump_if("?<!") {
500 return Some((false, false));
501 }
502 None
503 }
504
505 fn bump_and_bump_space(&self) -> bool {
506 if !self.bump() {
507 return false;
508 }
509 self.bump_space();
510 !self.is_eof()
511 }
512
513 fn bump_space(&self) {
514 if !self.ignore_whitespace() {
515 return;
516 }
517 while !self.is_eof() {
518 if self.char().is_whitespace() {
519 self.bump();
520 } else if self.char() == '#' {
521 let start = self.pos();
522 let mut comment_text = String::new();
523 self.bump();
524 while !self.is_eof() {
525 let c = self.char();
526 self.bump();
527 if c == '\n' {
528 break;
529 }
530 comment_text.push(c);
531 }
532 let comment = ast::Comment {
533 span: Span::new(start, self.pos()),
534 comment: comment_text,
535 };
536 self.parser().comments.borrow_mut().push(comment);
537 } else {
538 break;
539 }
540 }
541 }
542
543 fn peek(&self) -> Option<char> {
547 if self.is_eof() {
548 return None;
549 }
550 self.pattern()[self.offset() + self.char().len_utf8()..]
551 .chars()
552 .next()
553 }
554
555 fn peek_space(&self) -> Option<char> {
558 if !self.ignore_whitespace() {
559 return self.peek();
560 }
561 if self.is_eof() {
562 return None;
563 }
564 let mut start = self.offset() + self.char().len_utf8();
565 let mut in_comment = false;
566 for (i, c) in self.pattern()[start..].char_indices() {
567 if c.is_whitespace() {
568 continue;
569 } else if !in_comment && c == '#' {
570 in_comment = true;
571 } else if in_comment && c == '\n' {
572 in_comment = false;
573 } else {
574 start += i;
575 break;
576 }
577 }
578 self.pattern()[start..].chars().next()
579 }
580
581 fn is_eof(&self) -> bool {
583 self.offset() == self.pattern().len()
584 }
585
586 fn pos(&self) -> Position {
589 self.parser().pos.get()
590 }
591
592 fn span(&self) -> Span {
595 Span::splat(self.pos())
596 }
597
598 fn span_char(&self) -> Span {
600 let mut next = Position {
601 offset: self.offset().checked_add(self.char().len_utf8()).unwrap(),
602 line: self.line(),
603 column: self.column().checked_add(1).unwrap(),
604 };
605 if self.char() == '\n' {
606 next.line += 1;
607 next.column = 1;
608 }
609 Span::new(self.pos(), next)
610 }
611
612 #[inline(never)]
622 fn push_alternate(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
623 assert_eq!(self.char(), '|');
624 concat.span.end = self.pos();
625 self.push_or_add_alternation(concat);
626 self.bump();
627 Ok(ast::Concat {
628 span: self.span(),
629 asts: vec![],
630 })
631 }
632
633 fn push_or_add_alternation(&self, concat: Concat) {
636 use self::GroupState::*;
637
638 let mut stack = self.parser().stack_group.borrow_mut();
639 if let Some(&mut Alternation(ref mut alts)) = stack.last_mut() {
640 alts.asts.push(concat.into_ast());
641 return;
642 }
643 stack.push(Alternation(ast::Alternation {
644 span: Span::new(concat.span.start, self.pos()),
645 asts: vec![concat.into_ast()],
646 }));
647 }
648
649 #[inline(never)]
650 fn push_intersect(&self, mut concat: Concat) -> Result<Concat> {
651 assert_eq!(self.char(), '&');
652 concat.span.end = self.pos();
653 self.push_or_add_intersect(concat);
654 self.bump();
655 Ok(Concat {
656 span: self.span(),
657 asts: vec![],
658 })
659 }
660
661 fn push_or_add_intersect(&self, concat: Concat) {
664 use self::GroupState::*;
665
666 let mut stack = self.parser().stack_group.borrow_mut();
667 if let Some(&mut Intersection(ref mut alts)) = stack.last_mut() {
668 alts.asts.push(concat.into_ast());
669 return;
670 }
671 stack.push(Intersection(ast::Intersection {
672 span: Span::new(concat.span.start, self.pos()),
673 asts: vec![concat.into_ast()],
674 }));
675 }
676
677 #[inline(never)]
691 fn push_group(&self, mut concat: Concat) -> Result<Concat> {
692 assert_eq!(self.char(), '(');
693 match self.parse_group()? {
694 Either::Left(set) => {
695 let ignore = set.flags.flag_state(ast::Flag::IgnoreWhitespace);
696 if let Some(v) = ignore {
697 self.parser().ignore_whitespace.set(v);
698 }
699
700 concat.asts.push(Ast::flags(set));
701 Ok(concat)
702 }
703 Either::Right(group) => {
704 let old_ignore_whitespace = self.ignore_whitespace();
705 let new_ignore_whitespace = group
706 .flags()
707 .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
708 .unwrap_or(old_ignore_whitespace);
709 self.parser()
710 .stack_group
711 .borrow_mut()
712 .push(GroupState::Group {
713 concat,
714 group,
715 ignore_whitespace: old_ignore_whitespace,
716 });
717 self.parser().ignore_whitespace.set(new_ignore_whitespace);
718 Ok(Concat {
719 span: self.span(),
720 asts: vec![],
721 })
722 }
723 }
724 }
725
726 #[inline(never)]
727 fn push_compl_group(&self, concat: Concat) -> Result<Concat> {
728 assert_eq!(self.char(), '~');
729 self.bump();
730 if self.is_eof() || self.char() != '(' {
731 return Err(self.error(self.span(), ast::ErrorKind::ComplementGroupExpected));
732 }
733 let open_span = self.span_char();
734 self.bump();
735 let group = ast::Group {
736 span: open_span,
737 kind: ast::GroupKind::Complement,
738 ast: Box::new(Ast::empty(self.span())),
739 };
740
741 let old_ignore_whitespace = self.ignore_whitespace();
742 let new_ignore_whitespace = group
743 .flags()
744 .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
745 .unwrap_or(old_ignore_whitespace);
746 self.parser()
747 .stack_group
748 .borrow_mut()
749 .push(GroupState::Group {
750 concat,
751 group,
752 ignore_whitespace: old_ignore_whitespace,
753 });
754 self.parser().ignore_whitespace.set(new_ignore_whitespace);
755 Ok(Concat {
756 span: self.span(),
757 asts: vec![],
758 })
759 }
760
761 #[inline(never)]
771 fn pop_group(&self, mut group_concat: Concat) -> Result<Concat> {
772 use self::GroupState::*;
773 assert_eq!(self.char(), ')');
774 let mut stack = self.parser().stack_group.borrow_mut();
775 let topstack = stack.pop();
776
777 let (mut prior_concat, mut group, ignore_whitespace, alt) = match topstack {
778 Some(Group {
779 concat,
780 group,
781 ignore_whitespace,
782 }) => (concat, group, ignore_whitespace, None),
783 Some(Alternation(alt)) => match stack.pop() {
784 Some(Group {
785 concat,
786 group,
787 ignore_whitespace,
788 }) => (
789 concat,
790 group,
791 ignore_whitespace,
792 Some(Either::Left::<ast::Alternation, ast::Intersection>(alt)),
793 ),
794 None | Some(Alternation(_)) | Some(Intersection(_)) => {
795 return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
796 }
797 },
798 Some(Intersection(int)) => match stack.pop() {
799 Some(Group {
800 concat,
801 group,
802 ignore_whitespace,
803 }) => (
804 concat,
805 group,
806 ignore_whitespace,
807 Some(Either::Right::<ast::Alternation, ast::Intersection>(int)),
808 ),
809 None | Some(Alternation(_)) | Some(Intersection(_)) => {
810 return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
811 }
812 },
813
814 None => {
815 return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
816 }
817 };
818 self.parser().ignore_whitespace.set(ignore_whitespace);
819 group_concat.span.end = self.pos();
820 self.bump();
821 group.span.end = self.pos();
822 match alt {
823 Some(Either::Left(mut alt)) => {
824 alt.span.end = group_concat.span.end;
825 alt.asts.push(group_concat.into_ast());
826 group.ast = Box::new(alt.into_ast());
827 }
828 Some(Either::Right(mut int)) => {
829 int.span.end = group_concat.span.end;
830 int.asts.push(group_concat.into_ast());
831 group.ast = Box::new(int.into_ast());
832 }
833 None => {
834 group.ast = Box::new(group_concat.into_ast());
835 }
836 }
837
838 if group.kind == GroupKind::Complement {
839 let complement = ast::Complement {
840 span: self.span(),
841 ast: group.ast,
842 };
843 prior_concat.asts.push(Ast::complement(complement));
844 }
845 else {
847 prior_concat.asts.push(Ast::group(group));
848 }
849 Ok(prior_concat)
850 }
851
852 #[inline(never)]
859 fn pop_group_end(&self, mut concat: ast::Concat) -> Result<Ast> {
860 concat.span.end = self.pos();
861 let mut stack = self.parser().stack_group.borrow_mut();
862 let ast = match stack.pop() {
863 None => Ok(concat.into_ast()),
864 Some(GroupState::Alternation(mut alt)) => {
865 alt.span.end = self.pos();
866 alt.asts.push(concat.into_ast());
867 Ok(Ast::alternation(alt))
868 }
869 Some(GroupState::Intersection(mut int)) => {
870 int.span.end = self.pos();
871 int.asts.push(concat.into_ast());
872
873 Ok(Ast::intersection(int))
874 }
875 Some(GroupState::Group { group, .. }) => {
876 return Err(self.error(group.span, ast::ErrorKind::GroupUnclosed));
877 }
878 };
879 match stack.pop() {
881 None => ast,
882 Some(GroupState::Alternation(alt)) => {
883 Err(self.error(alt.span, ast::ErrorKind::UnsupportedResharpRegex))
884 }
885 Some(GroupState::Intersection(int)) => {
886 Err(self.error(int.span, ast::ErrorKind::UnsupportedResharpRegex))
887 }
888 Some(GroupState::Group { group, .. }) => {
889 Err(self.error(group.span, ast::ErrorKind::GroupUnclosed))
890 }
891 }
892 }
893
894 #[inline(never)]
903 fn push_class_open(
904 &self,
905 parent_union: regex_syntax::ast::ClassSetUnion,
906 ) -> Result<regex_syntax::ast::ClassSetUnion> {
907 assert_eq!(self.char(), '[');
908
909 let (nested_set, nested_union) = self.parse_set_class_open()?;
910 self.parser()
911 .stack_class
912 .borrow_mut()
913 .push(ClassState::Open {
914 union: parent_union,
915 set: nested_set,
916 });
917 Ok(nested_union)
918 }
919
920 #[inline(never)]
935 fn pop_class(
936 &self,
937 nested_union: regex_syntax::ast::ClassSetUnion,
938 ) -> Result<Either<regex_syntax::ast::ClassSetUnion, regex_syntax::ast::ClassBracketed>> {
939 assert_eq!(self.char(), ']');
940
941 let item = regex_syntax::ast::ClassSet::Item(nested_union.into_item());
942 let prevset = self.pop_class_op(item);
943 let mut stack = self.parser().stack_class.borrow_mut();
944 match stack.pop() {
945 None => {
946 panic!("unexpected empty character class stack")
955 }
956 Some(ClassState::Op { .. }) => {
957 panic!("unexpected ClassState::Op")
964 }
965 Some(ClassState::Open { mut union, mut set }) => {
966 self.bump();
967 set.span.end = self.pos();
968 set.kind = prevset;
969 if stack.is_empty() {
970 Ok(Either::Right(set))
971 } else {
972 union.push(regex_syntax::ast::ClassSetItem::Bracketed(Box::new(set)));
973 Ok(Either::Left(union))
974 }
975 }
976 }
977 }
978
979 #[inline(never)]
984 fn unclosed_class_error(&self) -> ResharpError {
985 for state in self.parser().stack_class.borrow().iter().rev() {
986 if let ClassState::Open { ref set, .. } = *state {
987 return self.error(set.span, ast::ErrorKind::ClassUnclosed);
988 }
989 }
990 panic!("no open character class found")
993 }
994
995 #[inline(never)]
1001 fn push_class_op(
1002 &self,
1003 next_kind: regex_syntax::ast::ClassSetBinaryOpKind,
1004 next_union: regex_syntax::ast::ClassSetUnion,
1005 ) -> regex_syntax::ast::ClassSetUnion {
1006 let item = regex_syntax::ast::ClassSet::Item(next_union.into_item());
1007 let new_lhs = self.pop_class_op(item);
1008 self.parser().stack_class.borrow_mut().push(ClassState::Op {
1009 kind: next_kind,
1010 lhs: new_lhs,
1011 });
1012 regex_syntax::ast::ClassSetUnion {
1013 span: self.span(),
1014 items: vec![],
1015 }
1016 }
1017
1018 #[inline(never)]
1024 fn pop_class_op(&self, rhs: regex_syntax::ast::ClassSet) -> regex_syntax::ast::ClassSet {
1025 let mut stack = self.parser().stack_class.borrow_mut();
1026 let (kind, lhs) = match stack.pop() {
1027 Some(ClassState::Op { kind, lhs }) => (kind, lhs),
1028 Some(state @ ClassState::Open { .. }) => {
1029 stack.push(state);
1030 return rhs;
1031 }
1032 None => unreachable!(),
1033 };
1034 let span = Span::new(lhs.span().start, rhs.span().end);
1035 regex_syntax::ast::ClassSet::BinaryOp(regex_syntax::ast::ClassSetBinaryOp {
1036 span,
1037 kind,
1038 lhs: Box::new(lhs),
1039 rhs: Box::new(rhs),
1040 })
1041 }
1042
1043 fn hir_to_node_id(&self, hir: &hir::Hir, tb: &mut TB<'s>) -> Result<NodeId> {
1044 match hir.kind() {
1045 hir::HirKind::Empty => Ok(NodeId::EPS),
1046 hir::HirKind::Literal(l) => {
1047 if l.0.len() == 1 {
1048 let node = tb.mk_u8(l.0[0]);
1049 Ok(node)
1050 } else {
1051 let ws: Vec<_> = l.0.iter().map(|l| tb.mk_u8(*l)).collect();
1052 let conc = tb.mk_concats(ws.iter().copied());
1053 Ok(conc)
1054 }
1055 }
1056 hir::HirKind::Class(class) => match class {
1057 hir::Class::Unicode(class_unicode) => {
1058 let ranges = class_unicode.ranges();
1059 if ranges.len() == 1
1060 && ranges[0].start() == '\u{0}'
1061 && ranges[0].end() == '\u{10FFFF}'
1062 {
1063 return Ok(tb.mk_range_u8(0, 255));
1064 }
1065 let mut nodes = Vec::new();
1066 for range in ranges {
1067 for seq in Utf8Sequences::new(range.start(), range.end()) {
1068 let sl = seq.as_slice();
1069 let bytes: Vec<_> = sl.iter().map(|s| (s.start, s.end)).collect();
1070 let node = match bytes.len() {
1071 1 => tb.mk_range_u8(bytes[0].0, bytes[0].1),
1072 n => {
1073 let last = tb.mk_range_u8(bytes[n - 1].0, bytes[n - 1].1);
1074 let mut conc = last;
1075 for i in (0..n - 1).rev() {
1076 let b = tb.mk_range_u8(bytes[i].0, bytes[i].1);
1077 conc = tb.mk_concat(b, conc);
1078 }
1079 conc
1080 }
1081 };
1082 nodes.push(node);
1083 }
1084 }
1085 let merged = tb.mk_unions(nodes.into_iter());
1086 Ok(merged)
1087 }
1088 hir::Class::Bytes(class_bytes) => {
1089 let ranges = class_bytes.ranges();
1090 let mut result = NodeId::BOT;
1091 for range in ranges {
1092 let start = range.start();
1093 let end = range.end();
1094 let node = tb.mk_range_u8(start, end);
1095 result = tb.mk_union(result, node);
1096 }
1097 Ok(result)
1098 }
1099 },
1100 hir::HirKind::Look(_) => Err(self.error(
1101 Span::splat(self.pos()),
1102 ast::ErrorKind::UnsupportedResharpRegex,
1103 )),
1104 hir::HirKind::Repetition(_) => Err(self.error(
1105 Span::splat(self.pos()),
1106 ast::ErrorKind::UnsupportedResharpRegex,
1107 )),
1108 hir::HirKind::Capture(_) => Err(self.error(
1109 Span::splat(self.pos()),
1110 ast::ErrorKind::UnsupportedResharpRegex,
1111 )),
1112 hir::HirKind::Concat(body) => {
1113 let mut result = NodeId::EPS;
1114 for child in body {
1115 let node = self.hir_to_node_id(child, tb)?;
1116 result = tb.mk_concat(result, node);
1117 }
1118 Ok(result)
1119 }
1120 hir::HirKind::Alternation(_) => Err(self.error(
1121 Span::splat(self.pos()),
1122 ast::ErrorKind::UnsupportedResharpRegex,
1123 )),
1124 }
1125 }
1126
1127 fn translate_ast_to_hir(
1128 &mut self,
1129 orig_ast: ®ex_syntax::ast::Ast,
1130 tb: &mut TB<'s>,
1131 ) -> Result<NodeId> {
1132 match self.translator.translate("", orig_ast) {
1133 Err(_) => Err(self.error(self.span(), ast::ErrorKind::UnicodeClassInvalid)),
1134 Ok(hir) => self.hir_to_node_id(&hir, tb),
1135 }
1136 }
1137
1138 fn translator_to_node_id(
1139 &mut self,
1140 orig_ast: ®ex_syntax::ast::Ast,
1141 translator: &mut Option<Translator>,
1142 tb: &mut TB<'s>,
1143 ) -> Result<NodeId> {
1144 match translator {
1145 Some(tr) => {
1146 let hir = tr
1147 .translate("", orig_ast)
1148 .map_err(|e| self.unsupported_error(e))?;
1149 self.hir_to_node_id(&hir, tb)
1150 }
1151 None => self.translate_ast_to_hir(orig_ast, tb),
1152 }
1153 }
1154
1155 fn get_class(
1156 &mut self,
1157 negated: bool,
1158 kind: regex_syntax::ast::ClassPerlKind,
1159 tb: &mut TB<'s>,
1160 ) -> Result<NodeId> {
1161 let w = self
1162 .perl_classes
1163 .iter()
1164 .find(|(c_neg, c_kind, _)| *c_kind == kind && *c_neg == negated);
1165 match w {
1166 Some((_, _, value)) => Ok(*value),
1167 None => {
1168 let translated = if self.global_ascii_perl {
1169 let pos = match kind {
1170 regex_syntax::ast::ClassPerlKind::Word => {
1171 let az = tb.mk_range_u8(b'a', b'z');
1172 let big = tb.mk_range_u8(b'A', b'Z');
1173 let dig = tb.mk_range_u8(b'0', b'9');
1174 let us = tb.mk_u8(b'_');
1175 tb.mk_unions([az, big, dig, us].into_iter())
1176 }
1177 regex_syntax::ast::ClassPerlKind::Digit => tb.mk_range_u8(b'0', b'9'),
1178 regex_syntax::ast::ClassPerlKind::Space => {
1179 let sp = tb.mk_u8(b' ');
1180 let tab = tb.mk_u8(b'\t');
1181 let nl = tb.mk_u8(b'\n');
1182 let cr = tb.mk_u8(b'\r');
1183 let ff = tb.mk_u8(0x0C);
1184 let vt = tb.mk_u8(0x0B);
1185 tb.mk_unions([sp, tab, nl, cr, ff, vt].into_iter())
1186 }
1187 };
1188 if negated {
1189 resharp_algebra::neg_class(tb, pos)
1190 } else {
1191 pos
1192 }
1193 } else if self.global_unicode {
1194 match kind {
1195 regex_syntax::ast::ClassPerlKind::Word => {
1196 if self.global_full_unicode {
1197 self.unicode_classes.ensure_word_full(tb);
1198 } else {
1199 self.unicode_classes.ensure_word(tb);
1200 }
1201 if negated {
1202 self.unicode_classes.non_word
1203 } else {
1204 self.unicode_classes.word
1205 }
1206 }
1207 regex_syntax::ast::ClassPerlKind::Digit => {
1208 if self.global_full_unicode {
1209 self.unicode_classes.ensure_digit_full(tb);
1210 } else {
1211 self.unicode_classes.ensure_digit(tb);
1212 }
1213 if negated {
1214 self.unicode_classes.non_digit
1215 } else {
1216 self.unicode_classes.digit
1217 }
1218 }
1219 regex_syntax::ast::ClassPerlKind::Space => {
1220 if self.global_full_unicode {
1221 self.unicode_classes.ensure_space_full(tb);
1222 } else {
1223 self.unicode_classes.ensure_space(tb);
1224 }
1225 if negated {
1226 self.unicode_classes.non_space
1227 } else {
1228 self.unicode_classes.space
1229 }
1230 }
1231 }
1232 } else {
1233 let pos = match kind {
1234 regex_syntax::ast::ClassPerlKind::Word => {
1235 let az = tb.mk_range_u8(b'a', b'z');
1236 let big = tb.mk_range_u8(b'A', b'Z');
1237 let dig = tb.mk_range_u8(b'0', b'9');
1238 let us = tb.mk_u8(b'_');
1239 tb.mk_unions([az, big, dig, us].into_iter())
1240 }
1241 regex_syntax::ast::ClassPerlKind::Digit => tb.mk_range_u8(b'0', b'9'),
1242 regex_syntax::ast::ClassPerlKind::Space => {
1243 let sp = tb.mk_u8(b' ');
1244 let tab = tb.mk_u8(b'\t');
1245 let nl = tb.mk_u8(b'\n');
1246 let cr = tb.mk_u8(b'\r');
1247 let ff = tb.mk_u8(0x0C);
1248 let vt = tb.mk_u8(0x0B);
1249 tb.mk_unions([sp, tab, nl, cr, ff, vt].into_iter())
1250 }
1251 };
1252 if negated {
1253 tb.mk_compl(pos)
1254 } else {
1255 pos
1256 }
1257 };
1258 self.perl_classes.push((negated, kind, translated));
1259 Ok(translated)
1260 }
1261 }
1262 }
1263
1264 fn word_char_kind(ast: &Ast, left: bool) -> WordCharKind {
1265 use WordCharKind::*;
1266 match ast {
1267 Ast::Literal(lit) => {
1268 if is_word_byte(lit.c as u8) {
1269 Word
1270 } else {
1271 NonWord
1272 }
1273 }
1274 Ast::ClassPerl(c) => match (&c.kind, c.negated) {
1275 (®ex_syntax::ast::ClassPerlKind::Word, false) => Word,
1276 (®ex_syntax::ast::ClassPerlKind::Word, true) => NonWord,
1277 (®ex_syntax::ast::ClassPerlKind::Space, false) => NonWord,
1278 (®ex_syntax::ast::ClassPerlKind::Space, true) => Unknown,
1279 (®ex_syntax::ast::ClassPerlKind::Digit, false) => Word,
1280 (®ex_syntax::ast::ClassPerlKind::Digit, true) => Unknown,
1281 },
1282 Ast::Dot(_) | Ast::Top(_) => Unknown,
1283 Ast::Group(g) => Self::word_char_kind(&g.ast, left),
1284 Ast::Concat(c) if !c.asts.is_empty() => {
1285 let edge = if left { c.asts.len() - 1 } else { 0 };
1286 let kind = Self::word_char_kind(&c.asts[edge], left);
1287 match kind {
1288 MaybeWord => {
1289 let dir: isize = if left { -1 } else { 1 };
1290 match Self::concat_neighbor_kind(&c.asts, edge, dir) {
1291 Word => Word,
1292 _ => MaybeWord,
1293 }
1294 }
1295 MaybeNonWord => {
1296 let dir: isize = if left { -1 } else { 1 };
1297 match Self::concat_neighbor_kind(&c.asts, edge, dir) {
1298 NonWord => NonWord,
1299 _ => MaybeNonWord,
1300 }
1301 }
1302 other => other,
1303 }
1304 }
1305 Ast::Alternation(alt) if !alt.asts.is_empty() => {
1306 let first = Self::word_char_kind(&alt.asts[0], left);
1307 if alt.asts[1..]
1308 .iter()
1309 .all(|a| Self::word_char_kind(a, left) == first)
1310 {
1311 first
1312 } else {
1313 Unknown
1314 }
1315 }
1316 Ast::Repetition(r) => {
1317 let inner = Self::word_char_kind(&r.ast, left);
1318 let nullable = matches!(
1319 &r.op.kind,
1320 ast::RepetitionKind::ZeroOrMore
1321 | ast::RepetitionKind::ZeroOrOne
1322 | ast::RepetitionKind::Range(ast::RepetitionRange::Bounded(0, _))
1323 );
1324 if nullable {
1325 match inner {
1326 Word => MaybeWord,
1327 NonWord => MaybeNonWord,
1328 _ => Unknown,
1329 }
1330 } else {
1331 inner
1332 }
1333 }
1334 Ast::Lookaround(la) => Self::word_char_kind(&la.ast, left),
1335 _ => Unknown,
1336 }
1337 }
1338
1339 fn edge_class_ast(ast: &Ast, left: bool) -> Option<&Ast> {
1341 match ast {
1342 Ast::Literal(_)
1343 | Ast::ClassPerl(_)
1344 | Ast::ClassBracketed(_)
1345 | Ast::ClassUnicode(_)
1346 | Ast::Dot(_)
1347 | Ast::Top(_) => Some(ast),
1348 Ast::Group(g) => Self::edge_class_ast(&g.ast, left),
1349 Ast::Concat(c) if !c.asts.is_empty() => {
1350 Self::edge_class_ast(&c.asts[if left { c.asts.len() - 1 } else { 0 }], left)
1351 }
1352 Ast::Repetition(r) => {
1353 let nullable = matches!(
1354 &r.op.kind,
1355 ast::RepetitionKind::ZeroOrMore
1356 | ast::RepetitionKind::ZeroOrOne
1357 | ast::RepetitionKind::Range(ast::RepetitionRange::Bounded(0, _))
1358 );
1359 if nullable {
1360 None
1361 } else {
1362 Self::edge_class_ast(&r.ast, left)
1363 }
1364 }
1365 _ => None,
1366 }
1367 }
1368
1369 fn resolve_word_kind(
1370 &mut self,
1371 asts: &[Ast],
1372 idx: usize,
1373 dir: isize,
1374 translator: &mut Option<Translator>,
1375 tb: &mut TB<'s>,
1376 word_id: NodeId,
1377 not_word_id: NodeId,
1378 ) -> Result<WordCharKind> {
1379 use WordCharKind::*;
1380 let fast = Self::concat_neighbor_kind(asts, idx, dir);
1381 if fast != Unknown {
1382 return Ok(fast);
1383 }
1384 let neighbor_idx = (idx as isize + dir) as usize;
1385 let node = if let Some(edge) = Self::edge_class_ast(&asts[neighbor_idx], dir < 0) {
1386 self.ast_to_node_id(edge, translator, tb)?
1387 } else {
1388 let neighbor_node = self.ast_to_node_id(&asts[neighbor_idx], translator, tb)?;
1390 let mut neighbor_node = tb
1391 .try_elim_lookarounds(neighbor_node)
1392 .ok_or_else(|| self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))?;
1393 if dir < 0 {
1394 neighbor_node = tb.reverse(neighbor_node).or_else(|_| {
1395 Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))
1396 })?;
1397 }
1398 let word_prefix = tb.mk_concat(word_id, NodeId::TS);
1399 let non_word_prefix = tb.mk_concat(not_word_id, NodeId::TS);
1400 return if tb.subsumes(word_prefix, neighbor_node) == Some(true) {
1401 Ok(Word)
1402 } else if tb.subsumes(non_word_prefix, neighbor_node) == Some(true) {
1403 Ok(NonWord)
1404 } else {
1405 Ok(Unknown)
1406 };
1407 };
1408 if tb.subsumes(word_id, node) == Some(true) {
1409 Ok(Word)
1410 } else if tb.subsumes(not_word_id, node) == Some(true) {
1411 Ok(NonWord)
1412 } else {
1413 Ok(Unknown)
1414 }
1415 }
1416
1417 fn concat_neighbor_kind(asts: &[Ast], idx: usize, dir: isize) -> WordCharKind {
1418 use WordCharKind::*;
1419 let next = idx as isize + dir;
1420 if next < 0 || next >= asts.len() as isize {
1421 return Edge;
1422 }
1423 let kind = Self::word_char_kind(&asts[next as usize], dir < 0);
1424 match kind {
1425 MaybeWord => match Self::concat_neighbor_kind(asts, next as usize, dir) {
1426 Word => Word,
1427 _ => Unknown,
1428 },
1429 MaybeNonWord => match Self::concat_neighbor_kind(asts, next as usize, dir) {
1430 NonWord => NonWord,
1431 _ => Unknown,
1432 },
1433 other => other,
1434 }
1435 }
1436
1437 fn rewrite_word_boundary_in_concat(
1438 &mut self,
1439 asts: &[Ast],
1440 idx: usize,
1441 translator: &mut Option<Translator>,
1442 tb: &mut TB<'s>,
1443 ) -> Result<(NodeId, usize)> {
1444 use WordCharKind::*;
1445 let (word_id, not_word_id) = if self.global_full_unicode {
1446 self.unicode_classes.ensure_word_full(tb);
1447 (self.unicode_classes.word, self.unicode_classes.non_word)
1448 } else if self.global_unicode && !self.global_ascii_perl {
1449 self.unicode_classes.ensure_word(tb);
1450 (self.unicode_classes.word, self.unicode_classes.non_word)
1451 } else {
1452 let az = tb.mk_range_u8(b'a', b'z');
1453 let big = tb.mk_range_u8(b'A', b'Z');
1454 let dig = tb.mk_range_u8(b'0', b'9');
1455 let us = tb.mk_u8(b'_');
1456 let w = tb.mk_unions([az, big, dig, us].into_iter());
1457 (w, tb.mk_compl(w))
1458 };
1459 let left = self.resolve_word_kind(asts, idx, -1, translator, tb, word_id, not_word_id)?;
1460 let right = self.resolve_word_kind(asts, idx, 1, translator, tb, word_id, not_word_id)?;
1461 match (left, right) {
1462 (NonWord, Word) | (Word, NonWord) => Ok((NodeId::EPS, idx + 1)),
1463 (Word, _) => {
1464 let neg = tb.mk_neg_lookahead(word_id, 0);
1465 Ok((neg, idx + 1))
1466 }
1467 (NonWord, _) => {
1468 let tail = tb.mk_concat(word_id, NodeId::TS);
1469 self.merge_boundary_with_following_lookaheads(asts, idx, tail, translator, tb)
1470 }
1471 (_, Word) => Ok((tb.mk_neg_lookbehind(word_id), idx + 1)),
1472 (_, NonWord) => Ok((tb.mk_lookbehind(word_id, NodeId::MISSING), idx + 1)),
1473 _ => Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex)),
1477 }
1478 }
1479
1480 fn merge_boundary_with_following_lookaheads(
1481 &mut self,
1482 asts: &[Ast],
1483 wb_idx: usize,
1484 boundary_tail: NodeId,
1485 translator: &mut Option<Translator>,
1486 tb: &mut TB<'s>,
1487 ) -> Result<(NodeId, usize)> {
1488 let mut next = wb_idx + 1;
1489 let mut la_bodies = vec![boundary_tail];
1490 while next < asts.len() {
1491 match &asts[next] {
1492 Ast::Lookaround(la) if la.kind == ast::LookaroundKind::PositiveLookahead => {
1493 let body = self.ast_to_node_id(&la.ast, translator, tb)?;
1494 la_bodies.push(tb.mk_concat(body, NodeId::TS));
1495 next += 1;
1496 }
1497 _ => break,
1498 }
1499 }
1500 let merged = tb.mk_inters(la_bodies.into_iter());
1501 Ok((tb.mk_lookahead(merged, NodeId::MISSING, 0), next))
1502 }
1503
    /// Lowers a parsed [`Ast`] into a node of the builder `tb`.
    ///
    /// `translator` is handed to `translator_to_node_id` for literals and
    /// classes; flag nodes overwrite it with a freshly built translator.
    ///
    /// # Errors
    ///
    /// Returns `UnsupportedResharpRegex` for the `U` (swap-greed) flag, for
    /// word-boundary style assertions that reach this function directly, and
    /// for bracketed classes built from a binary set operation. Errors from
    /// nested lowering are propagated.
    fn ast_to_node_id(
        &mut self,
        ast: &Ast,
        translator: &mut Option<Translator>,
        tb: &mut TB<'s>,
    ) -> Result<NodeId> {
        match ast {
            Ast::Empty(_) => Ok(NodeId::EPS),
            Ast::Flags(f) => {
                if f.flags.flag_state(ast::Flag::SwapGreed).is_some() {
                    return Err(self.error(f.span, ast::ErrorKind::UnsupportedResharpRegex));
                }
                // Start from the default builder and apply only the flags that
                // are inspected here: `i`, `u` and `s`.
                let mut translator_builder = self.default_translator_builder();
                if let Some(state) = f.flags.flag_state(ast::Flag::CaseInsensitive) {
                    translator_builder.case_insensitive(state);
                }
                if let Some(state) = f.flags.flag_state(ast::Flag::Unicode) {
                    translator_builder.unicode(state);
                }
                if let Some(state) = f.flags.flag_state(ast::Flag::DotMatchesNewLine) {
                    self.dot_all.set(state);
                }
                let concat_translator = Some(translator_builder.build());
                *translator = concat_translator;
                // A flag directive itself matches the empty string.
                Ok(NodeId::EPS)
            }
            Ast::Literal(l) => {
                let ast_lit = regex_syntax::ast::Ast::literal(*l.to_owned());
                self.translator_to_node_id(&ast_lit, translator, tb)
            }
            Ast::Top(_) => Ok(NodeId::TOP),
            Ast::Dot(_) => {
                // The dot flavour depends on (ascii perl classes, dot-all).
                let hirv = match (self.global_ascii_perl, self.dot_all.get()) {
                    (true, true) => hir::Hir::dot(hir::Dot::AnyChar),
                    (true, false) => hir::Hir::dot(hir::Dot::AnyCharExceptLF),
                    (false, true) => return Ok(NodeId::TOP),
                    (false, false) => hir::Hir::dot(hir::Dot::AnyByteExceptLF),
                };
                self.hir_to_node_id(&hirv, tb)
            }
            Ast::Assertion(a) => match &a.kind {
                ast::AssertionKind::StartText => Ok(NodeId::BEGIN),
                ast::AssertionKind::EndText => Ok(NodeId::END),
                // `\b` is only rewritten by the `Ast::Concat` arm below; one
                // that reaches this arm is rejected.
                ast::AssertionKind::WordBoundary => {
                    Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))
                }
                ast::AssertionKind::NotWordBoundary => {
                    Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))
                }
                // `^`: lookbehind for (BEGIN | '\n').
                ast::AssertionKind::StartLine => {
                    let left = NodeId::BEGIN;
                    let right = tb.mk_u8(b'\n');
                    let union = tb.mk_union(left, right);
                    Ok(tb.mk_lookbehind(union, NodeId::MISSING))
                }
                // `$`: lookahead for (END | '\n').
                ast::AssertionKind::EndLine => {
                    let left = NodeId::END;
                    let right = tb.mk_u8(b'\n');
                    let union = tb.mk_union(left, right);
                    Ok(tb.mk_lookahead(union, NodeId::MISSING, 0))
                }
                // The remaining word-boundary variants are all rejected.
                ast::AssertionKind::WordBoundaryStart => {
                    Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
                }
                ast::AssertionKind::WordBoundaryEnd => {
                    Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
                }
                ast::AssertionKind::WordBoundaryStartAngle => {
                    Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
                }
                ast::AssertionKind::WordBoundaryEndAngle => {
                    Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
                }
                ast::AssertionKind::WordBoundaryStartHalf => {
                    Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
                }
                ast::AssertionKind::WordBoundaryEndHalf => {
                    Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
                }
            },
            Ast::ClassUnicode(c) => {
                let tmp = regex_syntax::ast::ClassUnicode {
                    span: c.span,
                    negated: c.negated,
                    kind: c.kind.clone(),
                };
                // A few non-negated named classes are built directly from byte
                // ranges instead of going through the translator.
                if !c.negated {
                    if let regex_syntax::ast::ClassUnicodeKind::Named(s) = &c.kind {
                        match s.as_str() {
                            "ascii" => return Ok(tb.mk_range_u8(0, 127)),
                            "utf8" => {
                                // Star over: one ASCII byte, or a lead byte in
                                // 0xC0-0xDF / 0xE0-0xEF / 0xF0-0xF7 followed by
                                // one / two / three bytes in 0x80-0xBF.
                                let ascii = tb.mk_range_u8(0, 127);
                                let beta = tb.mk_range_u8(128, 0xBF);
                                let c0 = tb.mk_range_u8(0xC0, 0xDF);
                                let c0s = tb.mk_concats([c0, beta].into_iter());
                                let e0 = tb.mk_range_u8(0xE0, 0xEF);
                                let e0s = tb.mk_concats([e0, beta, beta].into_iter());
                                let f0 = tb.mk_range_u8(0xF0, 0xF7);
                                let f0s = tb.mk_concats([f0, beta, beta, beta].into_iter());
                                let merged = tb.mk_unions([ascii, c0s, e0s, f0s].into_iter());
                                return Ok(tb.mk_star(merged));
                            }
                            "hex" => {
                                // [0-9a-fA-F]
                                let nums = tb.mk_range_u8(b'0', b'9');
                                let lets = tb.mk_range_u8(b'a', b'f');
                                let lets2 = tb.mk_range_u8(b'A', b'F');
                                let merged = tb.mk_unions([nums, lets, lets2].into_iter());
                                return Ok(merged);
                            }
                            _ => {}
                        }
                    };
                }

                let orig_ast = regex_syntax::ast::Ast::class_unicode(tmp);
                self.translator_to_node_id(&orig_ast, translator, tb)
            }
            Ast::ClassPerl(c) => self.get_class(c.negated, c.kind.clone(), tb),
            Ast::ClassBracketed(c) => match &c.kind {
                regex_syntax::ast::ClassSet::Item(item) => {
                    // Shortcut to TOP when `is_universal_perl_pair` accepts the item.
                    if !c.negated && is_universal_perl_pair(item) {
                        return Ok(NodeId::TOP);
                    }
                    let tmp = regex_syntax::ast::ClassBracketed {
                        span: c.span,
                        negated: c.negated,
                        kind: c.kind.clone(),
                    };
                    let orig_ast = regex_syntax::ast::Ast::class_bracketed(tmp);
                    self.translator_to_node_id(&orig_ast, translator, tb)
                }
                regex_syntax::ast::ClassSet::BinaryOp(_) => {
                    Err(self.error(c.span, ast::ErrorKind::UnsupportedResharpRegex))
                }
            },
            Ast::Repetition(r) => {
                let body = self.ast_to_node_id(&r.ast, translator, tb);
                match body {
                    Ok(body) => match &r.op.kind {
                        ast::RepetitionKind::ZeroOrOne => Ok(tb.mk_opt(body)),
                        ast::RepetitionKind::ZeroOrMore => Ok(tb.mk_star(body)),
                        ast::RepetitionKind::OneOrMore => Ok(tb.mk_plus(body)),
                        ast::RepetitionKind::Range(r) => match r {
                            ast::RepetitionRange::Exactly(n) => Ok(tb.mk_repeat(body, *n, *n)),
                            // `{n,}` is built as exactly-n followed by a star.
                            ast::RepetitionRange::AtLeast(n) => {
                                let rep = tb.mk_repeat(body, *n, *n);
                                let st = tb.mk_star(body);
                                Ok(tb.mk_concat(rep, st))
                            }

                            ast::RepetitionRange::Bounded(n, m) => Ok(tb.mk_repeat(body, *n, *m)),
                        },
                    },
                    Err(_) => body,
                }
            }
            Ast::Lookaround(g) => {
                let body = self.ast_to_node_id(&g.ast, translator, tb)?;
                match g.kind {
                    ast::LookaroundKind::PositiveLookahead => {
                        Ok(tb.mk_lookahead(body, NodeId::MISSING, 0))
                    }
                    ast::LookaroundKind::PositiveLookbehind => {
                        Ok(tb.mk_lookbehind(body, NodeId::MISSING))
                    }
                    ast::LookaroundKind::NegativeLookahead => Ok(tb.mk_neg_lookahead(body, 0)),
                    ast::LookaroundKind::NegativeLookbehind => Ok(tb.mk_neg_lookbehind(body)),
                }
            }
            Ast::Group(g) => {
                // A non-capturing group carrying flags, e.g. `(?i:...)`, gets its
                // own translator; `dot_all` is restored once the body is lowered.
                // NOTE(review): unlike `Ast::Flags`, SwapGreed is not checked
                // here. Confirm whether `(?U:...)` should be rejected too.
                if let ast::GroupKind::NonCapturing(ref flags) = g.kind {
                    if !flags.items.is_empty() {
                        let mut translator_builder = self.default_translator_builder();
                        if let Some(state) = flags.flag_state(ast::Flag::CaseInsensitive) {
                            translator_builder.case_insensitive(state);
                        }
                        if let Some(state) = flags.flag_state(ast::Flag::Unicode) {
                            translator_builder.unicode(state);
                        }
                        let saved_dot_all = self.dot_all.get();
                        if let Some(state) = flags.flag_state(ast::Flag::DotMatchesNewLine) {
                            self.dot_all.set(state);
                        }
                        let mut scoped = Some(translator_builder.build());
                        let result = self.ast_to_node_id(&g.ast, &mut scoped, tb);
                        self.dot_all.set(saved_dot_all);
                        return result;
                    }
                }
                // Every other group kind lowers to just its body.
                self.ast_to_node_id(&g.ast, translator, tb)
            }
            Ast::Alternation(a) => {
                let mut children = vec![];
                for ast in &a.asts {
                    match self.ast_to_node_id(ast, translator, tb) {
                        Ok(node_id) => children.push(node_id),
                        Err(err) => return Err(err),
                    }
                }
                Ok(tb.mk_unions(children.iter().copied()))
            }
            Ast::Concat(c) => {
                // Set once a flag directive is seen in this concat; from then
                // on later elements are lowered with it.
                let mut concat_translator: Option<Translator> = None;
                let mut children = vec![];
                let mut i = 0;
                while i < c.asts.len() {
                    let ast = &c.asts[i];
                    match ast {
                        Ast::Flags(f) => {
                            if f.flags.flag_state(ast::Flag::SwapGreed).is_some() {
                                return Err(
                                    self.error(f.span, ast::ErrorKind::UnsupportedResharpRegex)
                                );
                            }
                            let mut translator_builder = self.default_translator_builder();
                            if let Some(state) = f.flags.flag_state(ast::Flag::CaseInsensitive) {
                                translator_builder.case_insensitive(state);
                            }
                            if let Some(state) = f.flags.flag_state(ast::Flag::Unicode) {
                                translator_builder.unicode(state);
                            }
                            if let Some(state) = f.flags.flag_state(ast::Flag::DotMatchesNewLine) {
                                self.dot_all.set(state);
                            }
                            concat_translator = Some(translator_builder.build());
                            // The caller's translator is updated as well.
                            *translator = concat_translator.clone();
                            i += 1;
                            continue;
                        }
                        // `\b` is resolved from its neighbours; the rewrite
                        // returns the node and the index to resume from.
                        Ast::Assertion(a) if a.kind == ast::AssertionKind::WordBoundary => {
                            let node =
                                self.rewrite_word_boundary_in_concat(&c.asts, i, translator, tb)?;
                            children.push(node.0);
                            i = node.1; continue;
                        }
                        _ => {}
                    }
                    match concat_translator {
                        Some(_) => match self.ast_to_node_id(ast, &mut concat_translator, tb) {
                            Ok(node_id) => children.push(node_id),
                            Err(err) => return Err(err),
                        },
                        None => match self.ast_to_node_id(ast, translator, tb) {
                            Ok(node_id) => children.push(node_id),
                            Err(err) => return Err(err),
                        },
                    }
                    i += 1;
                }
                Ok(tb.mk_concats(children.iter().cloned()))
            }
            Ast::Intersection(intersection) => {
                let mut children = vec![];
                for ast in &intersection.asts {
                    match self.ast_to_node_id(ast, translator, tb) {
                        Ok(node_id) => children.push(node_id),
                        Err(err) => return Err(err),
                    }
                }
                Ok(tb.mk_inters(children.into_iter()))
            }
            Ast::Complement(complement) => {
                let body = self.ast_to_node_id(&complement.ast, translator, tb);
                body.map(|x| tb.mk_compl(x))
            }
        }
    }
1774
1775 fn parse_inner(&mut self) -> Result<Ast> {
1776 let mut concat = Concat {
1777 span: self.span(),
1778 asts: vec![],
1779 };
1780 loop {
1781 self.bump_space();
1782 if self.is_eof() {
1783 break;
1784 }
1785 match self.char() {
1786 '(' => concat = self.push_group(concat)?,
1787 ')' => concat = self.pop_group(concat)?,
1788 '|' => concat = self.push_alternate(concat)?,
1789 '&' => concat = self.push_intersect(concat)?,
1790 '~' => concat = self.push_compl_group(concat)?,
1791 '[' => {
1792 let class = self.parse_set_class()?;
1793 concat.asts.push(Ast::class_bracketed(class));
1794 }
1795 '?' => {
1796 concat =
1797 self.parse_uncounted_repetition(concat, ast::RepetitionKind::ZeroOrOne)?;
1798 }
1799 '*' => {
1800 concat =
1801 self.parse_uncounted_repetition(concat, ast::RepetitionKind::ZeroOrMore)?;
1802 }
1803 '+' => {
1804 concat =
1805 self.parse_uncounted_repetition(concat, ast::RepetitionKind::OneOrMore)?;
1806 }
1807 '{' => {
1808 concat = self.parse_counted_repetition(concat)?;
1809 }
1810 _ => concat.asts.push(self.parse_primitive()?.into_ast()),
1811 }
1812 }
1813 let ast = self.pop_group_end(concat)?;
1814 if expanded_ast_size(&ast, EXPANDED_AST_LIMIT) >= EXPANDED_AST_LIMIT
1815 || max_list_length(&ast) >= MAX_LIST_LEN
1816 {
1817 return Err(self.error(*ast.span(), ast::ErrorKind::UnsupportedResharpRegex));
1818 }
1819 Ok(ast)
1820 }
1821
1822 fn parse(&mut self, tb: &mut TB<'s>) -> Result<NodeId> {
1825 let ast = self.parse_inner()?;
1826 self.ast_to_node_id(&ast, &mut None, tb)
1827 }
1828
1829 #[inline(never)]
1830 fn parse_uncounted_repetition(
1831 &self,
1832 mut concat: ast::Concat,
1833 kind: ast::RepetitionKind,
1834 ) -> Result<ast::Concat> {
1835 let op_start = self.pos();
1837 let ast = match concat.asts.pop() {
1838 Some(ast) => ast,
1839 None => return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing)),
1840 };
1841 match ast {
1842 Ast::Empty(_) | Ast::Flags(_) => {
1843 return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing))
1844 }
1845 _ => {}
1846 }
1847 if self.bump() && self.char() == '?' {
1848 return Err(self.error(
1849 Span::new(op_start, self.pos()),
1850 ast::ErrorKind::UnsupportedLazyQuantifier,
1851 ));
1852 }
1853 concat.asts.push(Ast::repetition(ast::Repetition {
1854 span: ast.span().with_end(self.pos()),
1855 op: ast::RepetitionOp {
1856 span: Span::new(op_start, self.pos()),
1857 kind,
1858 },
1859 greedy: true,
1860 ast: Box::new(ast),
1861 }));
1862 Ok(concat)
1863 }
1864
    /// Parses a counted repetition (`{n}`, `{n,}` or `{n,m}`) starting at the
    /// current `{` and applies it to the last element of `concat`.
    ///
    /// When the parser's `empty_min_range` option is set, a missing minimum
    /// in the `{,m}` form is read as `0`.
    ///
    /// # Errors
    ///
    /// - `RepetitionMissing` when there is nothing to repeat.
    /// - `RepetitionCountUnclosed` when input ends or `}` is missing.
    /// - `RepetitionCountDecimalEmpty` when a required count is absent.
    /// - `UnsupportedLazyQuantifier` when the closing `}` is followed by `?`.
    /// - `RepetitionCountInvalid` when the range fails `is_valid`.
    /// - `UnsupportedResharpRegex` when a count exceeds
    ///   `REPETITION_COUNT_LIMIT`.
    ///
    /// # Panics
    ///
    /// Panics if the parser is not positioned at `{`.
    #[inline(never)]
    fn parse_counted_repetition(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
        assert!(self.char() == '{');
        let start = self.pos();
        let ast = match concat.asts.pop() {
            Some(ast) => ast,
            None => return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing)),
        };
        match ast {
            Ast::Empty(_) | Ast::Flags(_) => {
                return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing))
            }
            _ => {}
        }
        if !self.bump_and_bump_space() {
            return Err(self.error(
                Span::new(start, self.pos()),
                ast::ErrorKind::RepetitionCountUnclosed,
            ));
        }
        // Kept as a `Result`: an empty minimum is tolerated in the `{,m}` form
        // handled below.
        let count_start = specialize_err(
            self.parse_decimal(),
            ast::ErrorKind::DecimalEmpty,
            ast::ErrorKind::RepetitionCountDecimalEmpty,
        );
        if self.is_eof() {
            return Err(self.error(
                Span::new(start, self.pos()),
                ast::ErrorKind::RepetitionCountUnclosed,
            ));
        }
        let range = if self.char() == ',' {
            if !self.bump_and_bump_space() {
                return Err(self.error(
                    Span::new(start, self.pos()),
                    ast::ErrorKind::RepetitionCountUnclosed,
                ));
            }
            if self.char() != '}' {
                // `{n,m}`, or `{,m}` when `empty_min_range` allows it.
                let count_start = match count_start {
                    Ok(c) => c,
                    Err(err) if err.kind == ast::ErrorKind::RepetitionCountDecimalEmpty => {
                        if self.parser().empty_min_range {
                            0
                        } else {
                            return Err(err);
                        }
                    }
                    err => err?,
                };
                let count_end = specialize_err(
                    self.parse_decimal(),
                    ast::ErrorKind::DecimalEmpty,
                    ast::ErrorKind::RepetitionCountDecimalEmpty,
                )?;
                ast::RepetitionRange::Bounded(count_start, count_end)
            } else {
                // `{n,}`
                ast::RepetitionRange::AtLeast(count_start?)
            }
        } else {
            // `{n}`
            ast::RepetitionRange::Exactly(count_start?)
        };

        if self.is_eof() || self.char() != '}' {
            return Err(self.error(
                Span::new(start, self.pos()),
                ast::ErrorKind::RepetitionCountUnclosed,
            ));
        }

        // Step over `}`; a following `?` would request a lazy quantifier.
        if self.bump_and_bump_space() && self.char() == '?' {
            return Err(self.error(
                Span::new(start, self.pos()),
                ast::ErrorKind::UnsupportedLazyQuantifier,
            ));
        }

        let op_span = Span::new(start, self.pos());
        if !range.is_valid() {
            return Err(self.error(op_span, ast::ErrorKind::RepetitionCountInvalid));
        }

        // Reject counts above the configured limit.
        let over_limit = match &range {
            ast::RepetitionRange::Exactly(n) => *n > REPETITION_COUNT_LIMIT,
            ast::RepetitionRange::AtLeast(n) => *n > REPETITION_COUNT_LIMIT,
            ast::RepetitionRange::Bounded(n, m) => {
                *n > REPETITION_COUNT_LIMIT || *m > REPETITION_COUNT_LIMIT
            }
        };
        if over_limit {
            return Err(self.error(op_span, ast::ErrorKind::UnsupportedResharpRegex));
        }
        concat.asts.push(Ast::repetition(ast::Repetition {
            span: ast.span().with_end(self.pos()),
            op: ast::RepetitionOp {
                span: op_span,
                kind: ast::RepetitionKind::Range(range),
            },
            greedy: true,
            ast: Box::new(ast),
        }));
        Ok(concat)
    }
1968
    /// Parses the opening of a group, starting at `(`.
    ///
    /// Returns `Either::Left` with a flag directive for `(?flags)`, and
    /// `Either::Right` with a group header otherwise. The returned group's
    /// `ast` is an empty placeholder and its span is that of the opening
    /// parenthesis only.
    ///
    /// Recognised openings, checked in this order: a lookaround prefix (as
    /// reported by `is_lookaround_prefix`), a named capture (`?P<name>` or
    /// `?<name>`), flags (`?flags)` or `?flags:`), and finally a plain
    /// indexed capture group.
    ///
    /// # Errors
    ///
    /// `GroupUnclosed` when input ends right after `(?`;
    /// `RepetitionMissing` for `(?)`; plus errors from capture-index,
    /// capture-name and flag parsing.
    ///
    /// # Panics
    ///
    /// Panics if the parser is not positioned at `(`.
    #[inline(never)]
    fn parse_group(&self) -> Result<Either<ast::SetFlags, ast::Group>> {
        assert_eq!(self.char(), '(');
        let open_span = self.span_char();
        self.bump();
        self.bump_space();
        if let Some((ahead, pos)) = self.is_lookaround_prefix() {
            // Select the variant from the two booleans, matched as `(pos, ahead)`.
            let kind = match (pos, ahead) {
                (true, true) => LookaroundKind::PositiveLookahead,
                (true, false) => LookaroundKind::PositiveLookbehind,
                (false, true) => LookaroundKind::NegativeLookahead,
                (false, false) => LookaroundKind::NegativeLookbehind,
            };
            return Ok(Either::Right(ast::Group {
                span: open_span,
                kind: ast::GroupKind::Lookaround(kind),
                ast: Box::new(Ast::empty(self.span())),
            }));
        }
        let inner_span = self.span();
        // `starts_with_p` stays true only if the `?P<` form matched; the block
        // expression clears it before trying the `?<` form.
        let mut starts_with_p = true;
        if self.bump_if("?P<") || {
            starts_with_p = false;
            self.bump_if("?<")
        } {
            let capture_index = self.next_capture_index(open_span)?;
            let name = self.parse_capture_name(capture_index)?;
            Ok(Either::Right(ast::Group {
                span: open_span,
                kind: ast::GroupKind::CaptureName {
                    starts_with_p,
                    name,
                },
                ast: Box::new(Ast::empty(self.span())),
            }))
        } else if self.bump_if("?") {
            if self.is_eof() {
                return Err(self.error(open_span, ast::ErrorKind::GroupUnclosed));
            }
            let flags = self.parse_flags()?;
            // `parse_flags` only returns `Ok` when stopped at `:` or `)`.
            let char_end = self.char();
            self.bump();
            if char_end == ')' {
                // `(?)` carries no flags at all.
                if flags.items.is_empty() {
                    return Err(self.error(inner_span, ast::ErrorKind::RepetitionMissing));
                }
                Ok(Either::Left(ast::SetFlags {
                    span: Span {
                        end: self.pos(),
                        ..open_span
                    },
                    flags,
                }))
            } else {
                assert_eq!(char_end, ':');
                Ok(Either::Right(ast::Group {
                    span: open_span,
                    kind: ast::GroupKind::NonCapturing(flags),
                    ast: Box::new(Ast::empty(self.span())),
                }))
            }
        } else {
            let capture_index = self.next_capture_index(open_span)?;
            Ok(Either::Right(ast::Group {
                span: open_span,
                kind: ast::GroupKind::CaptureIndex(capture_index),
                ast: Box::new(Ast::empty(self.span())),
            }))
        }
    }
2041
    /// Parses a capture group name up to and including its closing `>`.
    ///
    /// The parser must be positioned at the first character of the name.
    /// `capture_index` is stored in the returned [`ast::CaptureName`], which
    /// is also registered through `add_capture_name`.
    ///
    /// # Errors
    ///
    /// `GroupNameUnexpectedEof` when input ends before `>`;
    /// `GroupNameInvalid` for a character rejected by `is_capture_char`;
    /// `GroupNameEmpty` for an empty name; plus any error from
    /// `add_capture_name`.
    #[inline(never)]
    fn parse_capture_name(&self, capture_index: u32) -> Result<ast::CaptureName> {
        if self.is_eof() {
            return Err(self.error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
        }
        let start = self.pos();
        loop {
            if self.char() == '>' {
                break;
            }
            // The second argument tells the check whether this is the first
            // character of the name.
            if !is_capture_char(self.char(), self.pos() == start) {
                return Err(self.error(self.span_char(), ast::ErrorKind::GroupNameInvalid));
            }
            if !self.bump() {
                break;
            }
        }
        let end = self.pos();
        if self.is_eof() {
            return Err(self.error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
        }
        assert_eq!(self.char(), '>');
        self.bump();
        let name = &self.pattern()[start.offset..end.offset];
        if name.is_empty() {
            return Err(self.error(Span::new(start, start), ast::ErrorKind::GroupNameEmpty));
        }
        let capname = ast::CaptureName {
            span: Span::new(start, end),
            name: name.to_string(),
            index: capture_index,
        };
        self.add_capture_name(&capname)?;
        Ok(capname)
    }
2077
    /// Parses a run of flags (e.g. `is-x`) up to, but not including, the
    /// terminating `:` or `)`.
    ///
    /// # Errors
    ///
    /// - `FlagRepeatedNegation` when `-` appears more than once.
    /// - `FlagDuplicate` when a flag is repeated.
    /// - `FlagUnexpectedEof` when input ends before a terminator.
    /// - `FlagDanglingNegation` when `-` is the last item.
    /// - `FlagUnrecognized` (via `parse_flag`) for an unknown flag letter.
    #[inline(never)]
    fn parse_flags(&self) -> Result<ast::Flags> {
        let mut flags = ast::Flags {
            span: self.span(),
            items: vec![],
        };
        // Span of a `-` that has not yet been followed by a flag.
        let mut last_was_negation = None;
        while self.char() != ':' && self.char() != ')' {
            if self.char() == '-' {
                last_was_negation = Some(self.span_char());
                let item = ast::FlagsItem {
                    span: self.span_char(),
                    kind: ast::FlagsItemKind::Negation,
                };
                // `add_item` returns the index of an earlier identical item.
                if let Some(i) = flags.add_item(item) {
                    return Err(self.error(
                        self.span_char(),
                        ast::ErrorKind::FlagRepeatedNegation {
                            original: flags.items[i].span,
                        },
                    ));
                }
            } else {
                last_was_negation = None;
                let item = ast::FlagsItem {
                    span: self.span_char(),
                    kind: ast::FlagsItemKind::Flag(self.parse_flag()?),
                };
                if let Some(i) = flags.add_item(item) {
                    return Err(self.error(
                        self.span_char(),
                        ast::ErrorKind::FlagDuplicate {
                            original: flags.items[i].span,
                        },
                    ));
                }
            }
            if !self.bump() {
                return Err(self.error(self.span(), ast::ErrorKind::FlagUnexpectedEof));
            }
        }
        if let Some(span) = last_was_negation {
            return Err(self.error(span, ast::ErrorKind::FlagDanglingNegation));
        }
        flags.span.end = self.pos();
        Ok(flags)
    }
2125
2126 #[inline(never)]
2127 fn parse_flag(&self) -> Result<ast::Flag> {
2128 match self.char() {
2129 'i' => Ok(ast::Flag::CaseInsensitive),
2130 'm' => Ok(ast::Flag::MultiLine),
2131 's' => Ok(ast::Flag::DotMatchesNewLine),
2132 'U' => Ok(ast::Flag::SwapGreed),
2133 'u' => Ok(ast::Flag::Unicode),
2134 'R' => Ok(ast::Flag::CRLF),
2135 'x' => Ok(ast::Flag::IgnoreWhitespace),
2136 _ => Err(self.error(self.span_char(), ast::ErrorKind::FlagUnrecognized)),
2137 }
2138 }
2139
2140 fn parse_primitive(&self) -> Result<Primitive> {
2141 match self.char() {
2142 '\\' => self.parse_escape(),
2143 '_' => {
2144 let ast = Primitive::Top(self.span_char());
2145 self.bump();
2146 Ok(ast)
2147 }
2148 '.' => {
2149 let ast = Primitive::Dot(self.span_char());
2150 self.bump();
2151 Ok(ast)
2152 }
2153 '^' => {
2154 let ast = Primitive::Assertion(ast::Assertion {
2155 span: self.span_char(),
2156 kind: ast::AssertionKind::StartLine,
2157 });
2158 self.bump();
2159 Ok(ast)
2160 }
2161 '$' => {
2162 let ast = Primitive::Assertion(ast::Assertion {
2163 span: self.span_char(),
2164 kind: ast::AssertionKind::EndLine,
2165 });
2166 self.bump();
2167 Ok(ast)
2168 }
2169 c => {
2170 let ast = Primitive::Literal(Literal {
2171 span: self.span_char(),
2172 kind: LiteralKind::Verbatim,
2173 c,
2174 });
2175 self.bump();
2176 Ok(ast)
2177 }
2178 }
2179 }
2180
2181 #[inline(never)]
2182 fn parse_escape(&self) -> Result<Primitive> {
2183 assert_eq!(self.char(), '\\');
2184 let start = self.pos();
2185 if !self.bump() {
2186 return Err(self.error(
2187 Span::new(start, self.pos()),
2188 ast::ErrorKind::EscapeUnexpectedEof,
2189 ));
2190 }
2191 let c = self.char();
2192 match c {
2194 '0'..='9' => {
2195 if !self.parser().octal {
2196 return Err(self.error(
2197 Span::new(start, self.span_char().end),
2198 ast::ErrorKind::UnsupportedBackreference,
2199 ));
2200 }
2201 let mut lit = self.parse_octal();
2202 lit.span.start = start;
2203 return Ok(Primitive::Literal(lit));
2204 }
2205 'x' | 'u' | 'U' => {
2212 let mut lit = self.parse_hex()?;
2213 lit.span.start = start;
2214 return Ok(Primitive::Literal(lit));
2215 }
2216 'p' | 'P' => {
2217 let mut cls = self.parse_unicode_class()?;
2218 cls.span.start = start;
2219 return Ok(Primitive::Unicode(cls));
2220 }
2221 'd' | 's' | 'w' | 'D' | 'S' | 'W' => {
2222 let mut cls = self.parse_perl_class();
2223 cls.span.start = start;
2224 return Ok(Primitive::Perl(cls));
2225 }
2226 _ => {}
2227 }
2228
2229 self.bump();
2231 let span = Span::new(start, self.pos());
2232 if is_meta_character(c) {
2233 return Ok(Primitive::Literal(Literal {
2234 span,
2235 kind: LiteralKind::Meta,
2236 c,
2237 }));
2238 }
2239 if is_escapeable_character(c) {
2240 return Ok(Primitive::Literal(Literal {
2241 span,
2242 kind: LiteralKind::Superfluous,
2243 c,
2244 }));
2245 }
2246 let special = |kind, c| {
2247 Ok(Primitive::Literal(Literal {
2248 span,
2249 kind: LiteralKind::Special(kind),
2250 c,
2251 }))
2252 };
2253 match c {
2254 'a' => special(SpecialLiteralKind::Bell, '\x07'),
2255 'f' => special(SpecialLiteralKind::FormFeed, '\x0C'),
2256 't' => special(SpecialLiteralKind::Tab, '\t'),
2257 'n' => special(SpecialLiteralKind::LineFeed, '\n'),
2258 'r' => special(SpecialLiteralKind::CarriageReturn, '\r'),
2259 'v' => special(SpecialLiteralKind::VerticalTab, '\x0B'),
2260 'A' => Ok(Primitive::Assertion(ast::Assertion {
2261 span,
2262 kind: ast::AssertionKind::StartText,
2263 })),
2264 'z' => Ok(Primitive::Assertion(ast::Assertion {
2265 span,
2266 kind: ast::AssertionKind::EndText,
2267 })),
2268 'b' => {
2269 let mut wb = ast::Assertion {
2270 span,
2271 kind: ast::AssertionKind::WordBoundary,
2272 };
2273 if !self.is_eof() && self.char() == '{' {
2276 if let Some(kind) = self.maybe_parse_special_word_boundary(start)? {
2277 wb.kind = kind;
2278 wb.span.end = self.pos();
2279 }
2280 }
2281 Ok(Primitive::Assertion(wb))
2282 }
2283 'B' => Ok(Primitive::Assertion(ast::Assertion {
2284 span,
2285 kind: ast::AssertionKind::NotWordBoundary,
2286 })),
2287 '<' => Ok(Primitive::Assertion(ast::Assertion {
2288 span,
2289 kind: ast::AssertionKind::WordBoundaryStartAngle,
2290 })),
2291 '>' => Ok(Primitive::Assertion(ast::Assertion {
2292 span,
2293 kind: ast::AssertionKind::WordBoundaryEndAngle,
2294 })),
2295 _ => Err(self.error(span, ast::ErrorKind::EscapeUnrecognized)),
2296 }
2297 }
2298
    /// Tries to parse the `{...}` part of a special word boundary such as
    /// `\b{start}`. `wb_start` is the position of the escape's backslash and
    /// is used only for error spans.
    ///
    /// Returns `Ok(None)`, with the parser rewound to the `{`, when the first
    /// character after `{` (and any skipped space) is not in `[A-Za-z-]`.
    /// Otherwise returns the assertion kind named inside the braces.
    ///
    /// # Errors
    ///
    /// - `SpecialWordOrRepetitionUnexpectedEof` when input ends after `{`.
    /// - `SpecialWordBoundaryUnclosed` when the name is not followed by `}`.
    /// - `SpecialWordBoundaryUnrecognized` for a name other than `start`,
    ///   `end`, `start-half` or `end-half`.
    ///
    /// # Panics
    ///
    /// Panics if the parser is not positioned at `{`.
    fn maybe_parse_special_word_boundary(
        &self,
        wb_start: Position,
    ) -> Result<Option<ast::AssertionKind>> {
        assert_eq!(self.char(), '{');

        let is_valid_char = |c| matches!(c, 'A'..='Z' | 'a'..='z' | '-');
        let start = self.pos();
        if !self.bump_and_bump_space() {
            return Err(self.error(
                Span::new(wb_start, self.pos()),
                ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof,
            ));
        }
        let start_contents = self.pos();
        // Not a special word boundary: rewind to `{` and report "none".
        if !is_valid_char(self.char()) {
            self.parser().pos.set(start);
            return Ok(None);
        }

        // Collect the boundary name into the parser's shared scratch buffer.
        let mut scratch = self.parser().scratch.borrow_mut();
        scratch.clear();
        while !self.is_eof() && is_valid_char(self.char()) {
            scratch.push(self.char());
            self.bump_and_bump_space();
        }
        if self.is_eof() || self.char() != '}' {
            return Err(self.error(
                Span::new(start, self.pos()),
                ast::ErrorKind::SpecialWordBoundaryUnclosed,
            ));
        }
        let end = self.pos();
        self.bump();
        let kind = match scratch.as_str() {
            "start" => ast::AssertionKind::WordBoundaryStart,
            "end" => ast::AssertionKind::WordBoundaryEnd,
            "start-half" => ast::AssertionKind::WordBoundaryStartHalf,
            "end-half" => ast::AssertionKind::WordBoundaryEndHalf,
            _ => {
                return Err(self.error(
                    Span::new(start_contents, end),
                    ast::ErrorKind::SpecialWordBoundaryUnrecognized,
                ))
            }
        };
        Ok(Some(kind))
    }
2352
    /// Parses an octal literal of one to three digits starting at the current
    /// character and returns it as a [`Literal`] of kind `Octal`.
    ///
    /// The returned span starts at the first digit; callers adjust it to
    /// include the backslash.
    ///
    /// # Panics
    ///
    /// Panics if the parser's `octal` option is off or if the current
    /// character is not in `0`-`7`.
    #[inline(never)]
    fn parse_octal(&self) -> Literal {
        assert!(self.parser().octal);
        assert!('0' <= self.char() && self.char() <= '7');
        let start = self.pos();
        // Advance while the next character is an octal digit and at most
        // three digits have been taken.
        while self.bump()
            && '0' <= self.char()
            && self.char() <= '7'
            && self.pos().offset - start.offset <= 2
        {}
        let end = self.pos();
        let octal = &self.pattern()[start.offset..end.offset];
        // The slice holds only the octal digits scanned above.
        let codepoint = u32::from_str_radix(octal, 8).expect("valid octal number");
        // At most three octal digits, so the value is at most 0o777.
        let c = char::from_u32(codepoint).expect("Unicode scalar value");
        Literal {
            span: Span::new(start, end),
            kind: LiteralKind::Octal,
            c,
        }
    }
2378
2379 #[inline(never)]
2380 fn parse_hex(&self) -> Result<Literal> {
2381 assert!(self.char() == 'x' || self.char() == 'u' || self.char() == 'U');
2382
2383 let hex_kind = match self.char() {
2384 'x' => HexLiteralKind::X,
2385 'u' => HexLiteralKind::UnicodeShort,
2386 _ => HexLiteralKind::UnicodeLong,
2387 };
2388 if !self.bump_and_bump_space() {
2389 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2390 }
2391 if self.char() == '{' {
2392 self.parse_hex_brace(hex_kind)
2393 } else {
2394 self.parse_hex_digits(hex_kind)
2395 }
2396 }
2397
    /// Parses a fixed-width hex escape: exactly `kind.digits()` hex digits
    /// starting at the current character.
    ///
    /// # Errors
    ///
    /// - `EscapeUnexpectedEof` when input ends before all digits are read.
    /// - `EscapeHexInvalidDigit` for a non-hex character.
    /// - `EscapeHexInvalid` when the value is not a Unicode scalar value.
    #[inline(never)]
    fn parse_hex_digits(&self, kind: HexLiteralKind) -> Result<Literal> {
        let mut scratch = self.parser().scratch.borrow_mut();
        scratch.clear();

        let start = self.pos();
        for i in 0..kind.digits() {
            // The first digit is already under the cursor; advance only
            // between digits.
            if i > 0 && !self.bump_and_bump_space() {
                return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
            }
            if !is_hex(self.char()) {
                return Err(self.error(self.span_char(), ast::ErrorKind::EscapeHexInvalidDigit));
            }
            scratch.push(self.char());
        }
        // Step past the last digit; reaching end of input here is fine.
        self.bump_and_bump_space();
        let end = self.pos();
        let hex = scratch.as_str();
        match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
            None => Err(self.error(Span::new(start, end), ast::ErrorKind::EscapeHexInvalid)),
            Some(c) => Ok(Literal {
                span: Span::new(start, end),
                kind: LiteralKind::HexFixed(kind),
                c,
            }),
        }
    }
2427
    /// Parses a brace-delimited hex escape (`{...}`) with the parser
    /// positioned at the opening `{`.
    ///
    /// # Errors
    ///
    /// - `EscapeHexInvalidDigit` for a non-hex character inside the braces.
    /// - `EscapeUnexpectedEof` when input ends before `}`.
    /// - `EscapeHexEmpty` for `{}`.
    /// - `EscapeHexInvalid` when the value is not a Unicode scalar value.
    #[inline(never)]
    fn parse_hex_brace(&self, kind: HexLiteralKind) -> Result<Literal> {
        let mut scratch = self.parser().scratch.borrow_mut();
        scratch.clear();

        let brace_pos = self.pos();
        // Position just after the `{`.
        let start = self.span_char().end;
        while self.bump_and_bump_space() && self.char() != '}' {
            if !is_hex(self.char()) {
                return Err(self.error(self.span_char(), ast::ErrorKind::EscapeHexInvalidDigit));
            }
            scratch.push(self.char());
        }
        if self.is_eof() {
            return Err(self.error(
                Span::new(brace_pos, self.pos()),
                ast::ErrorKind::EscapeUnexpectedEof,
            ));
        }
        let end = self.pos();
        let hex = scratch.as_str();
        assert_eq!(self.char(), '}');
        self.bump_and_bump_space();

        if hex.is_empty() {
            return Err(self.error(
                Span::new(brace_pos, self.pos()),
                ast::ErrorKind::EscapeHexEmpty,
            ));
        }
        match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
            None => Err(self.error(Span::new(start, end), ast::ErrorKind::EscapeHexInvalid)),
            Some(c) => Ok(Literal {
                span: Span::new(start, self.pos()),
                kind: LiteralKind::HexBrace(kind),
                c,
            }),
        }
    }
2467
2468 fn parse_decimal(&self) -> Result<u32> {
2469 let mut scratch = self.parser().scratch.borrow_mut();
2470 scratch.clear();
2471
2472 while !self.is_eof() && self.char().is_whitespace() {
2473 self.bump();
2474 }
2475 let start = self.pos();
2476 while !self.is_eof() && '0' <= self.char() && self.char() <= '9' {
2477 scratch.push(self.char());
2478 self.bump_and_bump_space();
2479 }
2480 let span = Span::new(start, self.pos());
2481 while !self.is_eof() && self.char().is_whitespace() {
2482 self.bump_and_bump_space();
2483 }
2484 let digits = scratch.as_str();
2485 if digits.is_empty() {
2486 return Err(self.error(span, ast::ErrorKind::DecimalEmpty));
2487 }
2488 match digits.parse::<u32>().ok() {
2489 Some(n) => Ok(n),
2490 None => Err(self.error(span, ast::ErrorKind::DecimalInvalid)),
2491 }
2492 }
2493
2494 #[inline(never)]
2495 fn parse_set_class(&self) -> Result<ClassBracketed> {
2496 assert_eq!(self.char(), '[');
2497
2498 let mut union = ClassSetUnion {
2499 span: self.span(),
2500 items: vec![],
2501 };
2502 loop {
2503 self.bump_space();
2504 if self.is_eof() {
2505 return Err(self.unclosed_class_error());
2506 }
2507 match self.char() {
2508 '[' => {
2509 if !self.parser().stack_class.borrow().is_empty() {
2514 if let Some(cls) = self.maybe_parse_ascii_class() {
2515 union.push(ClassSetItem::Ascii(cls));
2516 continue;
2517 }
2518 }
2519 union = self.push_class_open(union)?;
2520 }
2521 ']' => match self.pop_class(union)? {
2522 Either::Left(nested_union) => {
2523 union = nested_union;
2524 }
2525 Either::Right(class) => return Ok(class),
2526 },
2527 '&' if self.peek() == Some('&') => {
2528 assert!(self.bump_if("&&"));
2529 union = self.push_class_op(ClassSetBinaryOpKind::Intersection, union);
2530 }
2531 '-' if self.peek() == Some('-') => {
2532 assert!(self.bump_if("--"));
2533 union = self.push_class_op(ClassSetBinaryOpKind::Difference, union);
2534 }
2535 '~' if self.peek() == Some('~') => {
2536 assert!(self.bump_if("~~"));
2537 union = self.push_class_op(ClassSetBinaryOpKind::SymmetricDifference, union);
2538 }
2539 _ => {
2540 union.push(self.parse_set_class_range()?);
2541 }
2542 }
2543 }
2544 }
2545
2546 #[inline(never)]
2547 fn parse_set_class_range(&self) -> Result<ClassSetItem> {
2548 let prim1 = self.parse_set_class_item()?;
2549 self.bump_space();
2550 if self.is_eof() {
2551 return Err(self.unclosed_class_error());
2552 }
2553 if self.char() != '-' || self.peek_space() == Some(']') || self.peek_space() == Some('-') {
2554 return prim1.into_class_set_item(self);
2555 }
2556 if !self.bump_and_bump_space() {
2557 return Err(self.unclosed_class_error());
2558 }
2559 let prim2 = self.parse_set_class_item()?;
2560 let range = ClassSetRange {
2561 span: Span::new(prim1.span().start, prim2.span().end),
2562 start: prim1.into_class_literal(self)?,
2563 end: prim2.into_class_literal(self)?,
2564 };
2565 if !range.is_valid() {
2566 return Err(self.error(range.span, ast::ErrorKind::ClassRangeInvalid));
2567 }
2568 Ok(ClassSetItem::Range(range))
2569 }
2570
2571 #[inline(never)]
2572 fn parse_set_class_item(&self) -> Result<Primitive> {
2573 if self.char() == '\\' {
2574 self.parse_escape()
2575 } else {
2576 let x = Primitive::Literal(Literal {
2577 span: self.span_char(),
2578 kind: LiteralKind::Verbatim,
2579 c: self.char(),
2580 });
2581 self.bump();
2582 Ok(x)
2583 }
2584 }
2585
2586 #[inline(never)]
2587 fn parse_set_class_open(&self) -> Result<(ClassBracketed, ClassSetUnion)> {
2588 assert_eq!(self.char(), '[');
2589 let start = self.pos();
2590 if !self.bump_and_bump_space() {
2591 return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2592 }
2593
2594 let negated = if self.char() != '^' {
2595 false
2596 } else {
2597 if !self.bump_and_bump_space() {
2598 return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2599 }
2600 true
2601 };
2602 let mut union = ClassSetUnion {
2604 span: self.span(),
2605 items: vec![],
2606 };
2607 while self.char() == '-' {
2608 union.push(ClassSetItem::Literal(Literal {
2609 span: self.span_char(),
2610 kind: LiteralKind::Verbatim,
2611 c: '-',
2612 }));
2613 if !self.bump_and_bump_space() {
2614 return Err(self.error(Span::new(start, start), ast::ErrorKind::ClassUnclosed));
2615 }
2616 }
2617 if union.items.is_empty() && self.char() == ']' {
2620 union.push(ClassSetItem::Literal(Literal {
2621 span: self.span_char(),
2622 kind: LiteralKind::Verbatim,
2623 c: ']',
2624 }));
2625 if !self.bump_and_bump_space() {
2626 return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2627 }
2628 }
2629 let set = ClassBracketed {
2630 span: Span::new(start, self.pos()),
2631 negated,
2632 kind: ClassSet::union(ClassSetUnion {
2633 span: Span::new(union.span.start, union.span.start),
2634 items: vec![],
2635 }),
2636 };
2637 Ok((set, union))
2638 }
2639
2640 #[inline(never)]
2641 fn maybe_parse_ascii_class(&self) -> Option<ClassAscii> {
2642 assert_eq!(self.char(), '[');
2643 let start = self.pos();
2645 let mut negated = false;
2646 if !self.bump() || self.char() != ':' {
2647 self.parser().pos.set(start);
2648 return None;
2649 }
2650 if !self.bump() {
2651 self.parser().pos.set(start);
2652 return None;
2653 }
2654 if self.char() == '^' {
2655 negated = true;
2656 if !self.bump() {
2657 self.parser().pos.set(start);
2658 return None;
2659 }
2660 }
2661 let name_start = self.offset();
2662 while self.char() != ':' && self.bump() {}
2663 if self.is_eof() {
2664 self.parser().pos.set(start);
2665 return None;
2666 }
2667 let name = &self.pattern()[name_start..self.offset()];
2668 if !self.bump_if(":]") {
2669 self.parser().pos.set(start);
2670 return None;
2671 }
2672 let kind = match regex_syntax::ast::ClassAsciiKind::from_name(name) {
2673 Some(kind) => kind,
2674 None => {
2675 self.parser().pos.set(start);
2676 return None;
2677 }
2678 };
2679 Some(ClassAscii {
2680 span: Span::new(start, self.pos()),
2681 kind,
2682 negated,
2683 })
2684 }
2685
2686 #[inline(never)]
2687 fn parse_unicode_class(&self) -> Result<ClassUnicode> {
2688 assert!(self.char() == 'p' || self.char() == 'P');
2689
2690 let mut scratch = self.parser().scratch.borrow_mut();
2691 scratch.clear();
2692
2693 let negated = self.char() == 'P';
2694 if !self.bump_and_bump_space() {
2695 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2696 }
2697 let (start, kind) = if self.char() == '{' {
2698 let start = self.span_char().end;
2699 while self.bump_and_bump_space() && self.char() != '}' {
2700 scratch.push(self.char());
2701 }
2702 if self.is_eof() {
2703 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2704 }
2705 assert_eq!(self.char(), '}');
2706 self.bump();
2707
2708 let name = scratch.as_str();
2709 if let Some(i) = name.find("!=") {
2710 (
2711 start,
2712 ClassUnicodeKind::NamedValue {
2713 op: ClassUnicodeOpKind::NotEqual,
2714 name: name[..i].to_string(),
2715 value: name[i + 2..].to_string(),
2716 },
2717 )
2718 } else if let Some(i) = name.find(':') {
2719 (
2720 start,
2721 ClassUnicodeKind::NamedValue {
2722 op: ClassUnicodeOpKind::Colon,
2723 name: name[..i].to_string(),
2724 value: name[i + 1..].to_string(),
2725 },
2726 )
2727 } else if let Some(i) = name.find('=') {
2728 (
2729 start,
2730 ClassUnicodeKind::NamedValue {
2731 op: ClassUnicodeOpKind::Equal,
2732 name: name[..i].to_string(),
2733 value: name[i + 1..].to_string(),
2734 },
2735 )
2736 } else {
2737 (start, ClassUnicodeKind::Named(name.to_string()))
2738 }
2739 } else {
2740 let start = self.pos();
2741 let c = self.char();
2742 if c == '\\' {
2743 return Err(self.error(self.span_char(), ast::ErrorKind::UnicodeClassInvalid));
2744 }
2745 self.bump_and_bump_space();
2746 let kind = ClassUnicodeKind::OneLetter(c);
2747 (start, kind)
2748 };
2749 Ok(ClassUnicode {
2750 span: Span::new(start, self.pos()),
2751 negated,
2752 kind,
2753 })
2754 }
2755
2756 #[inline(never)]
2757 fn parse_perl_class(&self) -> ClassPerl {
2758 let c = self.char();
2759 let span = self.span_char();
2760 self.bump();
2761 let (negated, kind) = match c {
2762 'd' => (false, regex_syntax::ast::ClassPerlKind::Digit),
2763 'D' => (true, regex_syntax::ast::ClassPerlKind::Digit),
2764 's' => (false, regex_syntax::ast::ClassPerlKind::Space),
2765 'S' => (true, regex_syntax::ast::ClassPerlKind::Space),
2766 'w' => (false, regex_syntax::ast::ClassPerlKind::Word),
2767 'W' => (true, regex_syntax::ast::ClassPerlKind::Word),
2768 c => panic!("expected valid Perl class but got '{}'", c),
2769 };
2770 ClassPerl {
2771 span,
2772 kind,
2773 negated,
2774 }
2775 }
2776}
2777
2778fn is_universal_perl_pair(item: ®ex_syntax::ast::ClassSetItem) -> bool {
2781 use regex_syntax::ast::ClassSetItem;
2782 let items = match item {
2783 ClassSetItem::Union(u) => &u.items,
2784 _ => return false,
2785 };
2786 if items.len() != 2 {
2787 return false;
2788 }
2789 match (&items[0], &items[1]) {
2790 (ClassSetItem::Perl(a), ClassSetItem::Perl(b)) => {
2791 let is_all = a.kind == b.kind && a.negated != b.negated;
2792 is_all
2793 }
2794 _ => false,
2795 }
2796}
2797
2798pub fn max_list_length(ast: &ast::Ast) -> usize {
2799 match ast {
2800 ast::Ast::Empty(_)
2801 | ast::Ast::Flags(_)
2802 | ast::Ast::Literal(_)
2803 | ast::Ast::Dot(_)
2804 | ast::Ast::Top(_)
2805 | ast::Ast::Assertion(_)
2806 | ast::Ast::ClassUnicode(_)
2807 | ast::Ast::ClassPerl(_)
2808 | ast::Ast::ClassBracketed(_) => 0,
2809 ast::Ast::Group(g) => max_list_length(&g.ast),
2810 ast::Ast::Complement(c) => max_list_length(&c.ast),
2811 ast::Ast::Lookaround(l) => max_list_length(&l.ast),
2812 ast::Ast::Repetition(r) => max_list_length(&r.ast),
2813 ast::Ast::Concat(c) => c
2814 .asts
2815 .len()
2816 .max(c.asts.iter().map(max_list_length).max().unwrap_or(0)),
2817 ast::Ast::Alternation(a) => a
2818 .asts
2819 .len()
2820 .max(a.asts.iter().map(max_list_length).max().unwrap_or(0)),
2821 ast::Ast::Intersection(i) => i
2822 .asts
2823 .len()
2824 .max(i.asts.iter().map(max_list_length).max().unwrap_or(0)),
2825 }
2826}
2827
2828pub fn expanded_ast_size(ast: &ast::Ast, limit: u64) -> u64 {
2829 fn go(ast: &ast::Ast, limit: u64) -> u64 {
2830 match ast {
2831 ast::Ast::Empty(_) | ast::Ast::Flags(_) => 1,
2832 ast::Ast::Literal(_) | ast::Ast::Dot(_) | ast::Ast::Top(_) => 1,
2833 ast::Ast::Assertion(_) => 1,
2834 ast::Ast::ClassUnicode(_) | ast::Ast::ClassPerl(_) | ast::Ast::ClassBracketed(_) => 1,
2835 ast::Ast::Group(g) => go(&g.ast, limit).saturating_add(1).min(limit),
2836 ast::Ast::Complement(c) => go(&c.ast, limit).saturating_add(1).min(limit),
2837 ast::Ast::Lookaround(l) => go(&l.ast, limit).saturating_add(1).min(limit),
2838 ast::Ast::Concat(c) => sum_children(&c.asts, limit),
2839 ast::Ast::Alternation(a) => sum_children(&a.asts, limit),
2840 ast::Ast::Intersection(i) => sum_children(&i.asts, limit),
2841 ast::Ast::Repetition(r) => {
2842 let body = go(&r.ast, limit);
2843 let factor: u64 = match &r.op.kind {
2844 ast::RepetitionKind::ZeroOrOne => 2,
2845 ast::RepetitionKind::ZeroOrMore | ast::RepetitionKind::OneOrMore => 2,
2846 ast::RepetitionKind::Range(ast::RepetitionRange::Exactly(n)) => {
2847 (*n as u64).max(1)
2848 }
2849 ast::RepetitionKind::Range(ast::RepetitionRange::AtLeast(n)) => {
2850 (*n as u64).max(1).saturating_add(1)
2851 }
2852 ast::RepetitionKind::Range(ast::RepetitionRange::Bounded(_, m)) => {
2853 (*m as u64).max(1)
2854 }
2855 };
2856 body.saturating_mul(factor).min(limit)
2857 }
2858 }
2859 }
2860 fn sum_children(children: &[ast::Ast], limit: u64) -> u64 {
2861 let mut total: u64 = 0;
2862 for c in children {
2863 total = total.saturating_add(go(c, limit));
2864 if total >= limit {
2865 return limit;
2866 }
2867 }
2868 total
2869 }
2870 go(ast, limit)
2871}
2872
2873pub fn parse_ast<'s>(
2874 tb: &mut TB<'s>,
2875 pattern: &'s str,
2876) -> std::result::Result<NodeId, ResharpError> {
2877 let mut p: ResharpParser<'s> = ResharpParser::new(pattern);
2878 p.parse(tb)
2879}
2880
2881pub fn parse_ast_with<'s>(
2882 tb: &mut TB<'s>,
2883 pattern: &'s str,
2884 flags: &PatternFlags,
2885) -> std::result::Result<NodeId, ResharpError> {
2886 let mut p: ResharpParser<'s> = ResharpParser::with_flags(pattern, flags);
2887 p.parse(tb)
2888}
2889
2890pub fn parse_to_ast(pattern: &str) -> std::result::Result<ast::Ast, ResharpError> {
2892 let mut p: ResharpParser = ResharpParser::new(pattern);
2893 p.parse_inner()
2894}