1#![warn(dead_code)]
6pub mod ast;
7use std::cell::{Cell, RefCell};
8
9use ast::{Ast, Concat, ErrorKind, GroupKind, LookaroundKind};
10use regex_syntax::{
11 ast::{
12 ClassAscii, ClassBracketed, ClassPerl, ClassSet, ClassSetBinaryOpKind, ClassSetItem,
13 ClassSetRange, ClassSetUnion, ClassUnicode, ClassUnicodeKind, ClassUnicodeOpKind,
14 HexLiteralKind, Literal, LiteralKind, Position, Span, SpecialLiteralKind,
15 },
16 hir::{
17 self,
18 translate::{Translator, TranslatorBuilder},
19 },
20 utf8::Utf8Sequences,
21};
22use resharp_algebra::NodeId;
23
24type TB<'s> = resharp_algebra::RegexBuilder;
25
26pub struct PatternFlags {
28 pub unicode: bool,
30 pub full_unicode: bool,
32 pub case_insensitive: bool,
34 pub dot_matches_new_line: bool,
36 pub ignore_whitespace: bool,
38 pub ascii_perl_classes: bool,
41}
42
43const REPETITION_COUNT_LIMIT: u32 = 2_000;
46const EXPANDED_AST_LIMIT: u64 = 50_000;
47const MAX_LIST_LEN: usize = 4_000;
48
49impl Default for PatternFlags {
50 fn default() -> Self {
51 Self {
52 unicode: true,
53 full_unicode: false,
54 case_insensitive: false,
55 dot_matches_new_line: false,
56 ignore_whitespace: false,
57 ascii_perl_classes: false,
58 }
59 }
60}
61
62#[derive(Clone, Copy, PartialEq, Debug)]
63enum WordCharKind {
64 Word,
65 NonWord,
66 MaybeWord,
67 MaybeNonWord,
68 Unknown,
69 Edge,
70}
71
72fn is_word_byte(b: u8) -> bool {
73 b.is_ascii_alphanumeric() || b == b'_'
74}
75
76#[derive(Clone, Debug, Eq, PartialEq)]
77enum Primitive {
78 Literal(Literal),
79 Assertion(ast::Assertion),
80 Dot(Span),
81 Top(Span),
82 Perl(ClassPerl),
83 Unicode(ClassUnicode),
84}
85
86impl Primitive {
87 fn span(&self) -> &Span {
88 match *self {
89 Primitive::Literal(ref x) => &x.span,
90 Primitive::Assertion(ref x) => &x.span,
91 Primitive::Dot(ref span) => span,
92 Primitive::Top(ref span) => span,
93 Primitive::Perl(ref x) => &x.span,
94 Primitive::Unicode(ref x) => &x.span,
95 }
96 }
97
98 fn into_ast(self) -> Ast {
99 match self {
100 Primitive::Literal(lit) => Ast::literal(lit),
101 Primitive::Assertion(assert) => Ast::assertion(assert),
102 Primitive::Dot(span) => Ast::dot(span),
103 Primitive::Top(span) => Ast::top(span),
104 Primitive::Perl(cls) => Ast::class_perl(cls),
105 Primitive::Unicode(cls) => Ast::class_unicode(cls),
106 }
107 }
108
109 fn into_class_set_item(self, p: &ResharpParser) -> Result<regex_syntax::ast::ClassSetItem> {
110 use self::Primitive::*;
111 use regex_syntax::ast::ClassSetItem;
112
113 match self {
114 Literal(lit) => Ok(ClassSetItem::Literal(lit)),
115 Perl(cls) => Ok(ClassSetItem::Perl(cls)),
116 Unicode(cls) => Ok(ClassSetItem::Unicode(cls)),
117 x => Err(p.error(*x.span(), ast::ErrorKind::ClassEscapeInvalid)),
118 }
119 }
120
121 fn into_class_literal(self, p: &ResharpParser) -> Result<Literal> {
122 use self::Primitive::*;
123
124 match self {
125 Literal(lit) => Ok(lit),
126 x => Err(p.error(*x.span(), ast::ErrorKind::ClassRangeLiteral)),
127 }
128 }
129}
130
131#[derive(Clone, Debug, Eq, PartialEq)]
132pub enum Either<Left, Right> {
133 Left(Left),
134 Right(Right),
135}
136
137#[derive(Clone, Debug, Eq, PartialEq)]
138pub struct ParseError {
139 pub kind: ErrorKind,
141 pattern: String,
144 pub span: Span,
146}
147
148impl std::fmt::Display for ParseError {
149 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
150 write!(f, "{:?}: {:?}", self.kind, self.span)
151 }
152}
153impl std::error::Error for ParseError {}
154
155type Result<T> = core::result::Result<T, ParseError>;
156
157#[derive(Clone, Debug)]
158enum GroupState {
159 Group {
161 concat: Concat,
163 group: ast::Group,
165 ignore_whitespace: bool,
167 },
168 Alternation(ast::Alternation),
169 Intersection(ast::Intersection),
170}
171
172#[derive(Clone, Debug)]
173enum ClassState {
174 Open {
176 union: regex_syntax::ast::ClassSetUnion,
178 set: regex_syntax::ast::ClassBracketed,
179 },
180 Op {
183 kind: regex_syntax::ast::ClassSetBinaryOpKind,
185 lhs: regex_syntax::ast::ClassSet,
187 },
188}
189
190pub struct ResharpParser<'s> {
192 perl_classes: Vec<(bool, regex_syntax::ast::ClassPerlKind, NodeId)>,
193 unicode_classes: resharp_algebra::UnicodeClassCache,
194 pub translator: regex_syntax::hir::translate::Translator,
195 pub pattern: &'s str,
196 pos: Cell<Position>,
197 capture_index: Cell<u32>,
198 octal: bool,
199 empty_min_range: bool,
200 ignore_whitespace: Cell<bool>,
201 dot_all: Cell<bool>,
202 global_unicode: bool,
203 global_full_unicode: bool,
204 global_ascii_perl: bool,
205 global_case_insensitive: bool,
206 comments: RefCell<Vec<ast::Comment>>,
207 stack_group: RefCell<Vec<GroupState>>,
208 stack_class: RefCell<Vec<ClassState>>,
209 capture_names: RefCell<Vec<ast::CaptureName>>,
210 scratch: RefCell<String>,
211}
212
213fn specialize_err<T>(result: Result<T>, from: ast::ErrorKind, to: ast::ErrorKind) -> Result<T> {
214 result.map_err(|e| {
215 if e.kind == from {
216 ParseError {
217 kind: to,
218 pattern: e.pattern,
219 span: e.span,
220 }
221 } else {
222 e
223 }
224 })
225}
226
227fn is_capture_char(c: char, first: bool) -> bool {
228 if first {
229 c == '_' || c.is_alphabetic()
230 } else {
231 c == '_' || c == '.' || c == '[' || c == ']' || c.is_alphanumeric()
232 }
233}
234
235pub fn is_meta_character(c: char) -> bool {
236 matches!(
237 c,
238 '\\' | '.'
239 | '+'
240 | '*'
241 | '?'
242 | '('
243 | ')'
244 | '|'
245 | '['
246 | ']'
247 | '{'
248 | '}'
249 | '^'
250 | '$'
251 | '#'
252 | '&'
253 | '-'
254 | '~'
255 | '_'
256 )
257}
258
259pub fn escape(text: &str) -> String {
261 let mut buf = String::new();
262 escape_into(text, &mut buf);
263 buf
264}
265
266pub fn escape_into(text: &str, buf: &mut String) {
268 buf.reserve(text.len());
269 for c in text.chars() {
270 if is_meta_character(c) {
271 buf.push('\\');
272 }
273 buf.push(c);
274 }
275}
276
277pub fn is_escapeable_character(c: char) -> bool {
278 if is_meta_character(c) {
279 return true;
280 }
281 if !c.is_ascii() {
282 return false;
283 }
284 match c {
285 '0'..='9' | 'A'..='Z' | 'a'..='z' => false,
286 '<' | '>' => false,
287 _ => true,
288 }
289}
290
291fn is_hex(c: char) -> bool {
292 c.is_ascii_digit() || ('a'..='f').contains(&c) || ('A'..='F').contains(&c)
293}
294
295impl<'s> ResharpParser<'s> {
296 fn default_translator_builder(&self) -> TranslatorBuilder {
297 let mut trb = TranslatorBuilder::new();
298 trb.unicode(self.global_unicode);
299 trb.utf8(false);
300 trb.case_insensitive(self.global_case_insensitive);
301 trb
302 }
303
304 pub fn new(pattern: &'s str) -> Self {
305 Self::with_flags(pattern, &PatternFlags::default())
306 }
307
308 pub fn with_flags(pattern: &'s str, flags: &PatternFlags) -> Self {
309 let mut trb = TranslatorBuilder::new();
310 trb.unicode(flags.unicode);
311 trb.utf8(false);
312 trb.case_insensitive(flags.case_insensitive);
313 Self {
314 translator: trb.build(),
315 pattern,
316 perl_classes: vec![],
317 unicode_classes: resharp_algebra::UnicodeClassCache::default(),
318 pos: Cell::new(Position::new(0, 0, 0)),
319 capture_index: Cell::new(0),
320 octal: false,
321 empty_min_range: false,
322 ignore_whitespace: Cell::new(flags.ignore_whitespace),
323 dot_all: Cell::new(flags.dot_matches_new_line),
324 global_unicode: flags.unicode || flags.full_unicode || flags.ascii_perl_classes,
325 global_full_unicode: flags.full_unicode,
326 global_ascii_perl: flags.ascii_perl_classes,
327 global_case_insensitive: flags.case_insensitive,
328 comments: RefCell::new(vec![]),
329 stack_group: RefCell::new(vec![]),
330 stack_class: RefCell::new(vec![]),
331 capture_names: RefCell::new(vec![]),
332 scratch: RefCell::new(String::new()),
333 }
334 }
335
336 fn parser(&'_ self) -> &'_ ResharpParser<'_> {
337 self
338 }
339
340 fn pattern(&self) -> &str {
341 self.pattern
342 }
343
344 fn error(&self, span: Span, kind: ast::ErrorKind) -> ParseError {
345 ParseError {
346 kind,
347 pattern: self.pattern().to_string(),
348 span,
349 }
350 }
351
352 fn unsupported_error(&self, _: regex_syntax::hir::Error) -> ParseError {
353 self.error(
354 Span::splat(self.pos()),
355 ast::ErrorKind::UnsupportedResharpRegex,
356 )
357 }
358
359 fn offset(&self) -> usize {
360 self.parser().pos.get().offset
361 }
362
363 fn line(&self) -> usize {
364 self.parser().pos.get().line
365 }
366
367 fn column(&self) -> usize {
368 self.parser().pos.get().column
369 }
370
371 fn next_capture_index(&self, span: Span) -> Result<u32> {
372 let current = self.parser().capture_index.get();
373 let i = current
374 .checked_add(1)
375 .ok_or_else(|| self.error(span, ast::ErrorKind::CaptureLimitExceeded))?;
376 self.parser().capture_index.set(i);
377 Ok(i)
378 }
379
380 fn add_capture_name(&self, cap: &ast::CaptureName) -> Result<()> {
381 let mut names = self.parser().capture_names.borrow_mut();
382 match names.binary_search_by_key(&cap.name.as_str(), |c| c.name.as_str()) {
383 Err(i) => {
384 names.insert(i, cap.clone());
385 Ok(())
386 }
387 Ok(i) => Err(self.error(
388 cap.span,
389 ast::ErrorKind::GroupNameDuplicate {
390 original: names[i].span,
391 },
392 )),
393 }
394 }
395
396 fn ignore_whitespace(&self) -> bool {
397 self.parser().ignore_whitespace.get()
398 }
399
400 fn char(&self) -> char {
401 self.char_at(self.offset())
402 }
403
404 fn char_at(&self, i: usize) -> char {
405 self.pattern()[i..]
406 .chars()
407 .next()
408 .unwrap_or_else(|| panic!("expected char at offset {}", i))
409 }
410
411 fn bump(&self) -> bool {
412 if self.is_eof() {
413 return false;
414 }
415 let Position {
416 mut offset,
417 mut line,
418 mut column,
419 } = self.pos();
420 if self.char() == '\n' {
421 line = line.checked_add(1).unwrap();
422 column = 1;
423 } else {
424 column = column.checked_add(1).unwrap();
425 }
426 offset += self.char().len_utf8();
427 self.parser().pos.set(Position {
428 offset,
429 line,
430 column,
431 });
432 self.pattern()[self.offset()..].chars().next().is_some()
433 }
434
435 fn bump_if(&self, prefix: &str) -> bool {
436 if self.pattern()[self.offset()..].starts_with(prefix) {
437 for _ in 0..prefix.chars().count() {
438 self.bump();
439 }
440 true
441 } else {
442 false
443 }
444 }
445
446 fn is_lookaround_prefix(&self) -> Option<(bool, bool)> {
447 if self.bump_if("?=") {
448 return Some((true, true));
449 }
450 if self.bump_if("?!") {
451 return Some((true, false));
452 }
453 if self.bump_if("?<=") {
454 return Some((false, true));
455 }
456 if self.bump_if("?<!") {
457 return Some((false, false));
458 }
459 None
460 }
461
462 fn bump_and_bump_space(&self) -> bool {
463 if !self.bump() {
464 return false;
465 }
466 self.bump_space();
467 !self.is_eof()
468 }
469
470 fn bump_space(&self) {
471 if !self.ignore_whitespace() {
472 return;
473 }
474 while !self.is_eof() {
475 if self.char().is_whitespace() {
476 self.bump();
477 } else if self.char() == '#' {
478 let start = self.pos();
479 let mut comment_text = String::new();
480 self.bump();
481 while !self.is_eof() {
482 let c = self.char();
483 self.bump();
484 if c == '\n' {
485 break;
486 }
487 comment_text.push(c);
488 }
489 let comment = ast::Comment {
490 span: Span::new(start, self.pos()),
491 comment: comment_text,
492 };
493 self.parser().comments.borrow_mut().push(comment);
494 } else {
495 break;
496 }
497 }
498 }
499
500 fn peek(&self) -> Option<char> {
501 if self.is_eof() {
502 return None;
503 }
504 self.pattern()[self.offset() + self.char().len_utf8()..]
505 .chars()
506 .next()
507 }
508
509 fn peek_space(&self) -> Option<char> {
512 if !self.ignore_whitespace() {
513 return self.peek();
514 }
515 if self.is_eof() {
516 return None;
517 }
518 let mut start = self.offset() + self.char().len_utf8();
519 let mut in_comment = false;
520 for (i, c) in self.pattern()[start..].char_indices() {
521 if c.is_whitespace() {
522 continue;
523 } else if !in_comment && c == '#' {
524 in_comment = true;
525 } else if in_comment && c == '\n' {
526 in_comment = false;
527 } else {
528 start += i;
529 break;
530 }
531 }
532 self.pattern()[start..].chars().next()
533 }
534
535 fn is_eof(&self) -> bool {
536 self.offset() == self.pattern().len()
537 }
538
539 fn pos(&self) -> Position {
540 self.parser().pos.get()
541 }
542
543 fn span(&self) -> Span {
544 Span::splat(self.pos())
545 }
546
547 fn span_char(&self) -> Span {
548 let mut next = Position {
549 offset: self.offset().checked_add(self.char().len_utf8()).unwrap(),
550 line: self.line(),
551 column: self.column().checked_add(1).unwrap(),
552 };
553 if self.char() == '\n' {
554 next.line += 1;
555 next.column = 1;
556 }
557 Span::new(self.pos(), next)
558 }
559
560 #[inline(never)]
561 fn push_alternate(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
562 assert_eq!(self.char(), '|');
563 concat.span.end = self.pos();
564 self.push_or_add_alternation(concat);
565 self.bump();
566 Ok(ast::Concat {
567 span: self.span(),
568 asts: vec![],
569 })
570 }
571
572 fn push_or_add_alternation(&self, concat: Concat) {
573 use self::GroupState::*;
574
575 let mut stack = self.parser().stack_group.borrow_mut();
576 if let Some(&mut Alternation(ref mut alts)) = stack.last_mut() {
577 alts.asts.push(concat.into_ast());
578 return;
579 }
580 stack.push(Alternation(ast::Alternation {
581 span: Span::new(concat.span.start, self.pos()),
582 asts: vec![concat.into_ast()],
583 }));
584 }
585
586 #[inline(never)]
587 fn push_intersect(&self, mut concat: Concat) -> Result<Concat> {
588 assert_eq!(self.char(), '&');
589 concat.span.end = self.pos();
590 self.push_or_add_intersect(concat);
591 self.bump();
592 Ok(Concat {
593 span: self.span(),
594 asts: vec![],
595 })
596 }
597
598 fn push_or_add_intersect(&self, concat: Concat) {
599 use self::GroupState::*;
600
601 let mut stack = self.parser().stack_group.borrow_mut();
602 if let Some(&mut Intersection(ref mut alts)) = stack.last_mut() {
603 alts.asts.push(concat.into_ast());
604 return;
605 }
606 stack.push(Intersection(ast::Intersection {
607 span: Span::new(concat.span.start, self.pos()),
608 asts: vec![concat.into_ast()],
609 }));
610 }
611
612 #[inline(never)]
613 fn push_group(&self, mut concat: Concat) -> Result<Concat> {
614 assert_eq!(self.char(), '(');
615 match self.parse_group()? {
616 Either::Left(set) => {
617 let ignore = set.flags.flag_state(ast::Flag::IgnoreWhitespace);
618 if let Some(v) = ignore {
619 self.parser().ignore_whitespace.set(v);
620 }
621
622 concat.asts.push(Ast::flags(set));
623 Ok(concat)
624 }
625 Either::Right(group) => {
626 let old_ignore_whitespace = self.ignore_whitespace();
627 let new_ignore_whitespace = group
628 .flags()
629 .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
630 .unwrap_or(old_ignore_whitespace);
631 self.parser()
632 .stack_group
633 .borrow_mut()
634 .push(GroupState::Group {
635 concat,
636 group,
637 ignore_whitespace: old_ignore_whitespace,
638 });
639 self.parser().ignore_whitespace.set(new_ignore_whitespace);
640 Ok(Concat {
641 span: self.span(),
642 asts: vec![],
643 })
644 }
645 }
646 }
647
648 #[inline(never)]
649 fn push_compl_group(&self, concat: Concat) -> Result<Concat> {
650 assert_eq!(self.char(), '~');
651 self.bump();
652 if self.is_eof() || self.char() != '(' {
653 return Err(self.error(self.span(), ast::ErrorKind::ComplementGroupExpected));
654 }
655 let open_span = self.span_char();
656 self.bump();
657 let group = ast::Group {
658 span: open_span,
659 kind: ast::GroupKind::Complement,
660 ast: Box::new(Ast::empty(self.span())),
661 };
662
663 let old_ignore_whitespace = self.ignore_whitespace();
664 let new_ignore_whitespace = group
665 .flags()
666 .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
667 .unwrap_or(old_ignore_whitespace);
668 self.parser()
669 .stack_group
670 .borrow_mut()
671 .push(GroupState::Group {
672 concat,
673 group,
674 ignore_whitespace: old_ignore_whitespace,
675 });
676 self.parser().ignore_whitespace.set(new_ignore_whitespace);
677 Ok(Concat {
678 span: self.span(),
679 asts: vec![],
680 })
681 }
682
683 #[inline(never)]
684 fn pop_group(&self, mut group_concat: Concat) -> Result<Concat> {
685 use self::GroupState::*;
686 assert_eq!(self.char(), ')');
687 let mut stack = self.parser().stack_group.borrow_mut();
688 let topstack = stack.pop();
689
690 let (mut prior_concat, mut group, ignore_whitespace, alt) = match topstack {
691 Some(Group {
692 concat,
693 group,
694 ignore_whitespace,
695 }) => (concat, group, ignore_whitespace, None),
696 Some(Alternation(alt)) => match stack.pop() {
697 Some(Group {
698 concat,
699 group,
700 ignore_whitespace,
701 }) => (
702 concat,
703 group,
704 ignore_whitespace,
705 Some(Either::Left::<ast::Alternation, ast::Intersection>(alt)),
706 ),
707 None | Some(Alternation(_)) | Some(Intersection(_)) => {
708 return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
709 }
710 },
711 Some(Intersection(int)) => match stack.pop() {
712 Some(Group {
713 concat,
714 group,
715 ignore_whitespace,
716 }) => (
717 concat,
718 group,
719 ignore_whitespace,
720 Some(Either::Right::<ast::Alternation, ast::Intersection>(int)),
721 ),
722 None | Some(Alternation(_)) | Some(Intersection(_)) => {
723 return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
724 }
725 },
726
727 None => {
728 return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
729 }
730 };
731 self.parser().ignore_whitespace.set(ignore_whitespace);
732 group_concat.span.end = self.pos();
733 self.bump();
734 group.span.end = self.pos();
735 match alt {
736 Some(Either::Left(mut alt)) => {
737 alt.span.end = group_concat.span.end;
738 alt.asts.push(group_concat.into_ast());
739 group.ast = Box::new(alt.into_ast());
740 }
741 Some(Either::Right(mut int)) => {
742 int.span.end = group_concat.span.end;
743 int.asts.push(group_concat.into_ast());
744 group.ast = Box::new(int.into_ast());
745 }
746 None => {
747 group.ast = Box::new(group_concat.into_ast());
748 }
749 }
750
751 if group.kind == GroupKind::Complement {
752 let complement = ast::Complement {
753 span: self.span(),
754 ast: group.ast,
755 };
756 prior_concat.asts.push(Ast::complement(complement));
757 }
758 else {
760 prior_concat.asts.push(Ast::group(group));
761 }
762 Ok(prior_concat)
763 }
764
765 #[inline(never)]
766 fn pop_group_end(&self, mut concat: ast::Concat) -> Result<Ast> {
767 concat.span.end = self.pos();
768 let mut stack = self.parser().stack_group.borrow_mut();
769 let ast = match stack.pop() {
770 None => Ok(concat.into_ast()),
771 Some(GroupState::Alternation(mut alt)) => {
772 alt.span.end = self.pos();
773 alt.asts.push(concat.into_ast());
774 Ok(Ast::alternation(alt))
775 }
776 Some(GroupState::Intersection(mut int)) => {
777 int.span.end = self.pos();
778 int.asts.push(concat.into_ast());
779
780 Ok(Ast::intersection(int))
781 }
782 Some(GroupState::Group { group, .. }) => {
783 return Err(self.error(group.span, ast::ErrorKind::GroupUnclosed));
784 }
785 };
786 match stack.pop() {
788 None => ast,
789 Some(GroupState::Alternation(alt)) => {
790 Err(self.error(alt.span, ast::ErrorKind::UnsupportedResharpRegex))
791 }
792 Some(GroupState::Intersection(int)) => {
793 Err(self.error(int.span, ast::ErrorKind::UnsupportedResharpRegex))
794 }
795 Some(GroupState::Group { group, .. }) => {
796 Err(self.error(group.span, ast::ErrorKind::GroupUnclosed))
797 }
798 }
799 }
800
801 #[inline(never)]
802 fn push_class_open(
803 &self,
804 parent_union: regex_syntax::ast::ClassSetUnion,
805 ) -> Result<regex_syntax::ast::ClassSetUnion> {
806 assert_eq!(self.char(), '[');
807
808 let (nested_set, nested_union) = self.parse_set_class_open()?;
809 self.parser()
810 .stack_class
811 .borrow_mut()
812 .push(ClassState::Open {
813 union: parent_union,
814 set: nested_set,
815 });
816 Ok(nested_union)
817 }
818
819 #[inline(never)]
820 fn pop_class(
821 &self,
822 nested_union: regex_syntax::ast::ClassSetUnion,
823 ) -> Result<Either<regex_syntax::ast::ClassSetUnion, regex_syntax::ast::ClassBracketed>> {
824 assert_eq!(self.char(), ']');
825
826 let item = regex_syntax::ast::ClassSet::Item(nested_union.into_item());
827 let prevset = self.pop_class_op(item);
828 let mut stack = self.parser().stack_class.borrow_mut();
829 match stack.pop() {
830 None => panic!("unexpected empty character class stack"),
831 Some(ClassState::Op { .. }) => panic!("unexpected ClassState::Op"),
832 Some(ClassState::Open { mut union, mut set }) => {
833 self.bump();
834 set.span.end = self.pos();
835 set.kind = prevset;
836 if stack.is_empty() {
837 Ok(Either::Right(set))
838 } else {
839 union.push(regex_syntax::ast::ClassSetItem::Bracketed(Box::new(set)));
840 Ok(Either::Left(union))
841 }
842 }
843 }
844 }
845
846 #[inline(never)]
847 fn unclosed_class_error(&self) -> ParseError {
848 for state in self.parser().stack_class.borrow().iter().rev() {
849 if let ClassState::Open { ref set, .. } = *state {
850 return self.error(set.span, ast::ErrorKind::ClassUnclosed);
851 }
852 }
853 panic!("no open character class found")
854 }
855
856 #[inline(never)]
857 fn push_class_op(
858 &self,
859 next_kind: regex_syntax::ast::ClassSetBinaryOpKind,
860 next_union: regex_syntax::ast::ClassSetUnion,
861 ) -> regex_syntax::ast::ClassSetUnion {
862 let item = regex_syntax::ast::ClassSet::Item(next_union.into_item());
863 let new_lhs = self.pop_class_op(item);
864 self.parser().stack_class.borrow_mut().push(ClassState::Op {
865 kind: next_kind,
866 lhs: new_lhs,
867 });
868 regex_syntax::ast::ClassSetUnion {
869 span: self.span(),
870 items: vec![],
871 }
872 }
873
874 #[inline(never)]
875 fn pop_class_op(&self, rhs: regex_syntax::ast::ClassSet) -> regex_syntax::ast::ClassSet {
876 let mut stack = self.parser().stack_class.borrow_mut();
877 let (kind, lhs) = match stack.pop() {
878 Some(ClassState::Op { kind, lhs }) => (kind, lhs),
879 Some(state @ ClassState::Open { .. }) => {
880 stack.push(state);
881 return rhs;
882 }
883 None => unreachable!(),
884 };
885 let span = Span::new(lhs.span().start, rhs.span().end);
886 regex_syntax::ast::ClassSet::BinaryOp(regex_syntax::ast::ClassSetBinaryOp {
887 span,
888 kind,
889 lhs: Box::new(lhs),
890 rhs: Box::new(rhs),
891 })
892 }
893
894 fn hir_to_node_id(&self, hir: &hir::Hir, tb: &mut TB<'s>) -> Result<NodeId> {
895 match hir.kind() {
896 hir::HirKind::Empty => Ok(NodeId::EPS),
897 hir::HirKind::Literal(l) => {
898 if l.0.len() == 1 {
899 let node = tb.mk_u8(l.0[0]);
900 Ok(node)
901 } else {
902 let ws: Vec<_> = l.0.iter().map(|l| tb.mk_u8(*l)).collect();
903 let conc = tb.mk_concats(ws.iter().copied());
904 Ok(conc)
905 }
906 }
907 hir::HirKind::Class(class) => match class {
908 hir::Class::Unicode(class_unicode) => {
909 let ranges = class_unicode.ranges();
910 if ranges.len() == 1
911 && ranges[0].start() == '\u{0}'
912 && ranges[0].end() == '\u{10FFFF}'
913 {
914 return Ok(tb.mk_range_u8(0, 255));
915 }
916 let mut nodes = Vec::new();
917 for range in ranges {
918 for seq in Utf8Sequences::new(range.start(), range.end()) {
919 let sl = seq.as_slice();
920 let bytes: Vec<_> = sl.iter().map(|s| (s.start, s.end)).collect();
921 let node = match bytes.len() {
922 1 => tb.mk_range_u8(bytes[0].0, bytes[0].1),
923 n => {
924 let last = tb.mk_range_u8(bytes[n - 1].0, bytes[n - 1].1);
925 let mut conc = last;
926 for i in (0..n - 1).rev() {
927 let b = tb.mk_range_u8(bytes[i].0, bytes[i].1);
928 conc = tb.mk_concat(b, conc);
929 }
930 conc
931 }
932 };
933 nodes.push(node);
934 }
935 }
936 let merged = tb.mk_unions(nodes.into_iter());
937 Ok(merged)
938 }
939 hir::Class::Bytes(class_bytes) => {
940 let ranges = class_bytes.ranges();
941 let mut result = NodeId::BOT;
942 for range in ranges {
943 let start = range.start();
944 let end = range.end();
945 let node = tb.mk_range_u8(start, end);
946 result = tb.mk_union(result, node);
947 }
948 Ok(result)
949 }
950 },
951 hir::HirKind::Look(_) => Err(self.error(
952 Span::splat(self.pos()),
953 ast::ErrorKind::UnsupportedResharpRegex,
954 )),
955 hir::HirKind::Repetition(_) => Err(self.error(
956 Span::splat(self.pos()),
957 ast::ErrorKind::UnsupportedResharpRegex,
958 )),
959 hir::HirKind::Capture(_) => Err(self.error(
960 Span::splat(self.pos()),
961 ast::ErrorKind::UnsupportedResharpRegex,
962 )),
963 hir::HirKind::Concat(body) => {
964 let mut result = NodeId::EPS;
965 for child in body {
966 let node = self.hir_to_node_id(child, tb)?;
967 result = tb.mk_concat(result, node);
968 }
969 Ok(result)
970 }
971 hir::HirKind::Alternation(_) => Err(self.error(
972 Span::splat(self.pos()),
973 ast::ErrorKind::UnsupportedResharpRegex,
974 )),
975 }
976 }
977
978 fn translate_ast_to_hir(
979 &mut self,
980 orig_ast: ®ex_syntax::ast::Ast,
981 tb: &mut TB<'s>,
982 ) -> Result<NodeId> {
983 match self.translator.translate("", orig_ast) {
984 Err(_) => Err(self.error(self.span(), ast::ErrorKind::UnicodeClassInvalid)),
985 Ok(hir) => self.hir_to_node_id(&hir, tb),
986 }
987 }
988
989 fn translator_to_node_id(
990 &mut self,
991 orig_ast: ®ex_syntax::ast::Ast,
992 translator: &mut Option<Translator>,
993 tb: &mut TB<'s>,
994 ) -> Result<NodeId> {
995 match translator {
996 Some(tr) => {
997 let hir = tr
998 .translate("", orig_ast)
999 .map_err(|e| self.unsupported_error(e))?;
1000 self.hir_to_node_id(&hir, tb)
1001 }
1002 None => self.translate_ast_to_hir(orig_ast, tb),
1003 }
1004 }
1005
1006 fn get_class(
1007 &mut self,
1008 negated: bool,
1009 kind: regex_syntax::ast::ClassPerlKind,
1010 tb: &mut TB<'s>,
1011 ) -> Result<NodeId> {
1012 let w = self
1013 .perl_classes
1014 .iter()
1015 .find(|(c_neg, c_kind, _)| *c_kind == kind && *c_neg == negated);
1016 match w {
1017 Some((_, _, value)) => Ok(*value),
1018 None => {
1019 let translated = if self.global_ascii_perl {
1020 let pos = match kind {
1021 regex_syntax::ast::ClassPerlKind::Word => {
1022 let az = tb.mk_range_u8(b'a', b'z');
1023 let big = tb.mk_range_u8(b'A', b'Z');
1024 let dig = tb.mk_range_u8(b'0', b'9');
1025 let us = tb.mk_u8(b'_');
1026 tb.mk_unions([az, big, dig, us].into_iter())
1027 }
1028 regex_syntax::ast::ClassPerlKind::Digit => tb.mk_range_u8(b'0', b'9'),
1029 regex_syntax::ast::ClassPerlKind::Space => {
1030 let sp = tb.mk_u8(b' ');
1031 let tab = tb.mk_u8(b'\t');
1032 let nl = tb.mk_u8(b'\n');
1033 let cr = tb.mk_u8(b'\r');
1034 let ff = tb.mk_u8(0x0C);
1035 let vt = tb.mk_u8(0x0B);
1036 tb.mk_unions([sp, tab, nl, cr, ff, vt].into_iter())
1037 }
1038 };
1039 if negated {
1040 resharp_algebra::neg_class(tb, pos)
1041 } else {
1042 pos
1043 }
1044 } else if self.global_unicode {
1045 match kind {
1046 regex_syntax::ast::ClassPerlKind::Word => {
1047 if self.global_full_unicode {
1048 self.unicode_classes.ensure_word_full(tb);
1049 } else {
1050 self.unicode_classes.ensure_word(tb);
1051 }
1052 if negated {
1053 self.unicode_classes.non_word
1054 } else {
1055 self.unicode_classes.word
1056 }
1057 }
1058 regex_syntax::ast::ClassPerlKind::Digit => {
1059 if self.global_full_unicode {
1060 self.unicode_classes.ensure_digit_full(tb);
1061 } else {
1062 self.unicode_classes.ensure_digit(tb);
1063 }
1064 if negated {
1065 self.unicode_classes.non_digit
1066 } else {
1067 self.unicode_classes.digit
1068 }
1069 }
1070 regex_syntax::ast::ClassPerlKind::Space => {
1071 if self.global_full_unicode {
1072 self.unicode_classes.ensure_space_full(tb);
1073 } else {
1074 self.unicode_classes.ensure_space(tb);
1075 }
1076 if negated {
1077 self.unicode_classes.non_space
1078 } else {
1079 self.unicode_classes.space
1080 }
1081 }
1082 }
1083 } else {
1084 let pos = match kind {
1085 regex_syntax::ast::ClassPerlKind::Word => {
1086 let az = tb.mk_range_u8(b'a', b'z');
1087 let big = tb.mk_range_u8(b'A', b'Z');
1088 let dig = tb.mk_range_u8(b'0', b'9');
1089 let us = tb.mk_u8(b'_');
1090 tb.mk_unions([az, big, dig, us].into_iter())
1091 }
1092 regex_syntax::ast::ClassPerlKind::Digit => tb.mk_range_u8(b'0', b'9'),
1093 regex_syntax::ast::ClassPerlKind::Space => {
1094 let sp = tb.mk_u8(b' ');
1095 let tab = tb.mk_u8(b'\t');
1096 let nl = tb.mk_u8(b'\n');
1097 let cr = tb.mk_u8(b'\r');
1098 let ff = tb.mk_u8(0x0C);
1099 let vt = tb.mk_u8(0x0B);
1100 tb.mk_unions([sp, tab, nl, cr, ff, vt].into_iter())
1101 }
1102 };
1103 if negated {
1104 tb.mk_compl(pos)
1105 } else {
1106 pos
1107 }
1108 };
1109 self.perl_classes.push((negated, kind, translated));
1110 Ok(translated)
1111 }
1112 }
1113 }
1114
1115 fn word_char_kind(ast: &Ast, left: bool) -> WordCharKind {
1116 use WordCharKind::*;
1117 match ast {
1118 Ast::Literal(lit) => {
1119 if is_word_byte(lit.c as u8) {
1120 Word
1121 } else {
1122 NonWord
1123 }
1124 }
1125 Ast::ClassPerl(c) => match (&c.kind, c.negated) {
1126 (®ex_syntax::ast::ClassPerlKind::Word, false) => Word,
1127 (®ex_syntax::ast::ClassPerlKind::Word, true) => NonWord,
1128 (®ex_syntax::ast::ClassPerlKind::Space, false) => NonWord,
1129 (®ex_syntax::ast::ClassPerlKind::Space, true) => Unknown,
1130 (®ex_syntax::ast::ClassPerlKind::Digit, false) => Word,
1131 (®ex_syntax::ast::ClassPerlKind::Digit, true) => Unknown,
1132 },
1133 Ast::Dot(_) | Ast::Top(_) => Unknown,
1134 Ast::Group(g) => Self::word_char_kind(&g.ast, left),
1135 Ast::Concat(c) if !c.asts.is_empty() => {
1136 let edge = if left { c.asts.len() - 1 } else { 0 };
1137 let kind = Self::word_char_kind(&c.asts[edge], left);
1138 match kind {
1139 MaybeWord => {
1140 let dir: isize = if left { -1 } else { 1 };
1141 match Self::concat_neighbor_kind(&c.asts, edge, dir) {
1142 Word => Word,
1143 _ => MaybeWord,
1144 }
1145 }
1146 MaybeNonWord => {
1147 let dir: isize = if left { -1 } else { 1 };
1148 match Self::concat_neighbor_kind(&c.asts, edge, dir) {
1149 NonWord => NonWord,
1150 _ => MaybeNonWord,
1151 }
1152 }
1153 other => other,
1154 }
1155 }
1156 Ast::Alternation(alt) if !alt.asts.is_empty() => {
1157 let first = Self::word_char_kind(&alt.asts[0], left);
1158 if alt.asts[1..]
1159 .iter()
1160 .all(|a| Self::word_char_kind(a, left) == first)
1161 {
1162 first
1163 } else {
1164 Unknown
1165 }
1166 }
1167 Ast::Repetition(r) => {
1168 let inner = Self::word_char_kind(&r.ast, left);
1169 let nullable = matches!(
1170 &r.op.kind,
1171 ast::RepetitionKind::ZeroOrMore
1172 | ast::RepetitionKind::ZeroOrOne
1173 | ast::RepetitionKind::Range(ast::RepetitionRange::Bounded(0, _))
1174 );
1175 if nullable {
1176 match inner {
1177 Word => MaybeWord,
1178 NonWord => MaybeNonWord,
1179 _ => Unknown,
1180 }
1181 } else {
1182 inner
1183 }
1184 }
1185 Ast::Lookaround(la) => Self::word_char_kind(&la.ast, left),
1186 _ => Unknown,
1187 }
1188 }
1189
1190 fn edge_class_ast(ast: &Ast, left: bool) -> Option<&Ast> {
1192 match ast {
1193 Ast::Literal(_)
1194 | Ast::ClassPerl(_)
1195 | Ast::ClassBracketed(_)
1196 | Ast::ClassUnicode(_)
1197 | Ast::Dot(_)
1198 | Ast::Top(_) => Some(ast),
1199 Ast::Group(g) => Self::edge_class_ast(&g.ast, left),
1200 Ast::Concat(c) if !c.asts.is_empty() => {
1201 Self::edge_class_ast(&c.asts[if left { c.asts.len() - 1 } else { 0 }], left)
1202 }
1203 Ast::Repetition(r) => {
1204 let nullable = matches!(
1205 &r.op.kind,
1206 ast::RepetitionKind::ZeroOrMore
1207 | ast::RepetitionKind::ZeroOrOne
1208 | ast::RepetitionKind::Range(ast::RepetitionRange::Bounded(0, _))
1209 );
1210 if nullable {
1211 None
1212 } else {
1213 Self::edge_class_ast(&r.ast, left)
1214 }
1215 }
1216 _ => None,
1217 }
1218 }
1219
1220 fn resolve_word_kind(
1221 &mut self,
1222 asts: &[Ast],
1223 idx: usize,
1224 dir: isize,
1225 translator: &mut Option<Translator>,
1226 tb: &mut TB<'s>,
1227 word_id: NodeId,
1228 not_word_id: NodeId,
1229 ) -> Result<WordCharKind> {
1230 use WordCharKind::*;
1231 let fast = Self::concat_neighbor_kind(asts, idx, dir);
1232 if fast != Unknown {
1233 return Ok(fast);
1234 }
1235 let neighbor_idx = (idx as isize + dir) as usize;
1236 let node = if let Some(edge) = Self::edge_class_ast(&asts[neighbor_idx], dir < 0) {
1237 self.ast_to_node_id(edge, translator, tb)?
1238 } else {
1239 let neighbor_node = self.ast_to_node_id(&asts[neighbor_idx], translator, tb)?;
1241 let mut neighbor_node = tb
1242 .try_elim_lookarounds(neighbor_node)
1243 .ok_or_else(|| self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))?;
1244 if dir < 0 {
1245 neighbor_node = tb.reverse(neighbor_node).or_else(|_| {
1246 Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))
1247 })?;
1248 }
1249 let word_prefix = tb.mk_concat(word_id, NodeId::TS);
1250 let non_word_prefix = tb.mk_concat(not_word_id, NodeId::TS);
1251 return if tb.subsumes(word_prefix, neighbor_node) == Some(true) {
1252 Ok(Word)
1253 } else if tb.subsumes(non_word_prefix, neighbor_node) == Some(true) {
1254 Ok(NonWord)
1255 } else {
1256 Ok(Unknown)
1257 };
1258 };
1259 if tb.subsumes(word_id, node) == Some(true) {
1260 Ok(Word)
1261 } else if tb.subsumes(not_word_id, node) == Some(true) {
1262 Ok(NonWord)
1263 } else {
1264 Ok(Unknown)
1265 }
1266 }
1267
1268 fn concat_neighbor_kind(asts: &[Ast], idx: usize, dir: isize) -> WordCharKind {
1269 use WordCharKind::*;
1270 let next = idx as isize + dir;
1271 if next < 0 || next >= asts.len() as isize {
1272 return Edge;
1273 }
1274 let kind = Self::word_char_kind(&asts[next as usize], dir < 0);
1275 match kind {
1276 MaybeWord => match Self::concat_neighbor_kind(asts, next as usize, dir) {
1277 Word => Word,
1278 _ => Unknown,
1279 },
1280 MaybeNonWord => match Self::concat_neighbor_kind(asts, next as usize, dir) {
1281 NonWord => NonWord,
1282 _ => Unknown,
1283 },
1284 other => other,
1285 }
1286 }
1287
1288 fn rewrite_word_boundary_in_concat(
1289 &mut self,
1290 asts: &[Ast],
1291 idx: usize,
1292 translator: &mut Option<Translator>,
1293 tb: &mut TB<'s>,
1294 ) -> Result<(NodeId, usize)> {
1295 use WordCharKind::*;
1296 let (word_id, not_word_id) = if self.global_full_unicode {
1297 self.unicode_classes.ensure_word_full(tb);
1298 (self.unicode_classes.word, self.unicode_classes.non_word)
1299 } else if self.global_unicode && !self.global_ascii_perl {
1300 self.unicode_classes.ensure_word(tb);
1301 (self.unicode_classes.word, self.unicode_classes.non_word)
1302 } else {
1303 let az = tb.mk_range_u8(b'a', b'z');
1304 let big = tb.mk_range_u8(b'A', b'Z');
1305 let dig = tb.mk_range_u8(b'0', b'9');
1306 let us = tb.mk_u8(b'_');
1307 let w = tb.mk_unions([az, big, dig, us].into_iter());
1308 (w, tb.mk_compl(w))
1309 };
1310 let left = self.resolve_word_kind(asts, idx, -1, translator, tb, word_id, not_word_id)?;
1311 let right = self.resolve_word_kind(asts, idx, 1, translator, tb, word_id, not_word_id)?;
1312 match (left, right) {
1313 (NonWord, Word) | (Word, NonWord) => Ok((NodeId::EPS, idx + 1)),
1314 (Word, _) => {
1315 let neg = tb.mk_neg_lookahead(word_id, 0);
1316 Ok((neg, idx + 1))
1317 }
1318 (NonWord, _) => {
1319 let tail = tb.mk_concat(word_id, NodeId::TS);
1320 self.merge_boundary_with_following_lookaheads(asts, idx, tail, translator, tb)
1321 }
1322 (_, Word) => Ok((tb.mk_neg_lookbehind(word_id), idx + 1)),
1323 (_, NonWord) => Ok((tb.mk_lookbehind(word_id, NodeId::MISSING), idx + 1)),
1324 _ => Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex)),
1328 }
1329 }
1330
1331 fn merge_boundary_with_following_lookaheads(
1332 &mut self,
1333 asts: &[Ast],
1334 wb_idx: usize,
1335 boundary_tail: NodeId,
1336 translator: &mut Option<Translator>,
1337 tb: &mut TB<'s>,
1338 ) -> Result<(NodeId, usize)> {
1339 let mut next = wb_idx + 1;
1340 let mut la_bodies = vec![boundary_tail];
1341 while next < asts.len() {
1342 match &asts[next] {
1343 Ast::Lookaround(la) if la.kind == ast::LookaroundKind::PositiveLookahead => {
1344 let body = self.ast_to_node_id(&la.ast, translator, tb)?;
1345 la_bodies.push(tb.mk_concat(body, NodeId::TS));
1346 next += 1;
1347 }
1348 _ => break,
1349 }
1350 }
1351 let merged = tb.mk_inters(la_bodies.into_iter());
1352 Ok((tb.mk_lookahead(merged, NodeId::MISSING, 0), next))
1353 }
1354
1355 fn ast_to_node_id(
1356 &mut self,
1357 ast: &Ast,
1358 translator: &mut Option<Translator>,
1359 tb: &mut TB<'s>,
1360 ) -> Result<NodeId> {
1361 match ast {
1362 Ast::Empty(_) => Ok(NodeId::EPS),
1363 Ast::Flags(f) => {
1364 if f.flags.flag_state(ast::Flag::SwapGreed).is_some() {
1365 return Err(self.error(f.span, ast::ErrorKind::UnsupportedResharpRegex));
1366 }
1367 let mut translator_builder = self.default_translator_builder();
1368 if let Some(state) = f.flags.flag_state(ast::Flag::CaseInsensitive) {
1369 translator_builder.case_insensitive(state);
1370 }
1371 if let Some(state) = f.flags.flag_state(ast::Flag::Unicode) {
1372 translator_builder.unicode(state);
1373 }
1374 if let Some(state) = f.flags.flag_state(ast::Flag::DotMatchesNewLine) {
1375 self.dot_all.set(state);
1376 }
1377 let concat_translator = Some(translator_builder.build());
1378 *translator = concat_translator;
1379 Ok(NodeId::EPS)
1380 }
1381 Ast::Literal(l) => {
1382 let ast_lit = regex_syntax::ast::Ast::literal(*l.to_owned());
1383 self.translator_to_node_id(&ast_lit, translator, tb)
1384 }
1385 Ast::Top(_) => Ok(NodeId::TOP),
1386 Ast::Dot(_) => {
1387 let codepoint_dot = self.global_ascii_perl || self.global_full_unicode;
1388 let hirv = match (codepoint_dot, self.dot_all.get()) {
1389 (true, true) => hir::Hir::dot(hir::Dot::AnyChar),
1390 (true, false) => hir::Hir::dot(hir::Dot::AnyCharExceptLF),
1391 (false, true) => return Ok(NodeId::TOP),
1392 (false, false) => hir::Hir::dot(hir::Dot::AnyByteExceptLF),
1393 };
1394 self.hir_to_node_id(&hirv, tb)
1395 }
1396 Ast::Assertion(a) => match &a.kind {
1397 ast::AssertionKind::StartText => Ok(NodeId::BEGIN),
1398 ast::AssertionKind::EndText => Ok(NodeId::END),
1399 ast::AssertionKind::WordBoundary => {
1400 Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))
1401 }
1402 ast::AssertionKind::NotWordBoundary => {
1403 Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))
1404 }
1405 ast::AssertionKind::StartLine => {
1406 let left = NodeId::BEGIN;
1407 let right = tb.mk_u8(b'\n');
1408 let union = tb.mk_union(left, right);
1409 Ok(tb.mk_lookbehind(union, NodeId::MISSING))
1410 }
1411 ast::AssertionKind::EndLine => {
1412 let left = NodeId::END;
1413 let right = tb.mk_u8(b'\n');
1414 let union = tb.mk_union(left, right);
1415 Ok(tb.mk_lookahead(union, NodeId::MISSING, 0))
1416 }
1417 ast::AssertionKind::WordBoundaryStart => {
1418 Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
1419 }
1420 ast::AssertionKind::WordBoundaryEnd => {
1421 Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
1422 }
1423 ast::AssertionKind::WordBoundaryStartAngle => {
1424 Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
1425 }
1426 ast::AssertionKind::WordBoundaryEndAngle => {
1427 Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
1428 }
1429 ast::AssertionKind::WordBoundaryStartHalf => {
1430 Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
1431 }
1432 ast::AssertionKind::WordBoundaryEndHalf => {
1433 Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
1434 }
1435 },
1436 Ast::ClassUnicode(c) => {
1437 let tmp = regex_syntax::ast::ClassUnicode {
1438 span: c.span,
1439 negated: c.negated,
1440 kind: c.kind.clone(),
1441 };
1442 if !c.negated {
1443 if let regex_syntax::ast::ClassUnicodeKind::Named(s) = &c.kind {
1444 match s.as_str() {
1445 "ascii" => return Ok(tb.mk_range_u8(0, 127)),
1447 "utf8" => {
1449 let ascii = tb.mk_range_u8(0, 127);
1450 let beta = tb.mk_range_u8(128, 0xBF);
1451 let c0 = tb.mk_range_u8(0xC0, 0xDF);
1452 let c0s = tb.mk_concats([c0, beta].into_iter());
1453 let e0 = tb.mk_range_u8(0xE0, 0xEF);
1454 let e0s = tb.mk_concats([e0, beta, beta].into_iter());
1455 let f0 = tb.mk_range_u8(0xF0, 0xF7);
1456 let f0s = tb.mk_concats([f0, beta, beta, beta].into_iter());
1457 let merged = tb.mk_unions([ascii, c0s, e0s, f0s].into_iter());
1458 return Ok(tb.mk_star(merged));
1459 }
1460 "hex" => {
1461 let nums = tb.mk_range_u8(b'0', b'9');
1462 let lets = tb.mk_range_u8(b'a', b'f');
1463 let lets2 = tb.mk_range_u8(b'A', b'F');
1464 let merged = tb.mk_unions([nums, lets, lets2].into_iter());
1465 return Ok(merged);
1466 }
1467 _ => {}
1468 }
1469 };
1470 }
1471
1472 let orig_ast = regex_syntax::ast::Ast::class_unicode(tmp);
1473 self.translator_to_node_id(&orig_ast, translator, tb)
1474 }
1475 Ast::ClassPerl(c) => self.get_class(c.negated, c.kind.clone(), tb),
1476 Ast::ClassBracketed(c) => match &c.kind {
1477 regex_syntax::ast::ClassSet::Item(item) => {
1478 if !c.negated && is_universal_perl_pair(item) {
1479 return Ok(NodeId::TOP);
1480 }
1481 let tmp = regex_syntax::ast::ClassBracketed {
1482 span: c.span,
1483 negated: c.negated,
1484 kind: c.kind.clone(),
1485 };
1486 let orig_ast = regex_syntax::ast::Ast::class_bracketed(tmp);
1487 self.translator_to_node_id(&orig_ast, translator, tb)
1488 }
1489 regex_syntax::ast::ClassSet::BinaryOp(_) => {
1490 Err(self.error(c.span, ast::ErrorKind::UnsupportedResharpRegex))
1491 }
1492 },
1493 Ast::Repetition(r) => {
1494 let body = self.ast_to_node_id(&r.ast, translator, tb);
1495 match body {
1496 Ok(body) => match &r.op.kind {
1497 ast::RepetitionKind::ZeroOrOne => Ok(tb.mk_opt(body)),
1498 ast::RepetitionKind::ZeroOrMore => Ok(tb.mk_star(body)),
1499 ast::RepetitionKind::OneOrMore => Ok(tb.mk_plus(body)),
1500 ast::RepetitionKind::Range(r) => match r {
1501 ast::RepetitionRange::Exactly(n) => Ok(tb.mk_repeat(body, *n, *n)),
1502 ast::RepetitionRange::AtLeast(n) => {
1503 let rep = tb.mk_repeat(body, *n, *n);
1504 let st = tb.mk_star(body);
1505 Ok(tb.mk_concat(rep, st))
1506 }
1507
1508 ast::RepetitionRange::Bounded(n, m) => Ok(tb.mk_repeat(body, *n, *m)),
1509 },
1510 },
1511 Err(_) => body,
1512 }
1513 }
1514 Ast::Lookaround(g) => {
1515 let body = self.ast_to_node_id(&g.ast, translator, tb)?;
1516 match g.kind {
1517 ast::LookaroundKind::PositiveLookahead
1518 | ast::LookaroundKind::NegativeLookahead
1519 if body.contains_lookbehind(tb) =>
1520 {
1521 Err(self.error(g.span, ast::ErrorKind::UnsupportedResharpRegex))
1522 }
1523 ast::LookaroundKind::PositiveLookahead => {
1524 Ok(tb.mk_lookahead(body, NodeId::MISSING, 0))
1525 }
1526 ast::LookaroundKind::PositiveLookbehind => {
1527 Ok(tb.mk_lookbehind(body, NodeId::MISSING))
1528 }
1529 ast::LookaroundKind::NegativeLookahead => Ok(tb.mk_neg_lookahead(body, 0)),
1530 ast::LookaroundKind::NegativeLookbehind => Ok(tb.mk_neg_lookbehind(body)),
1531 }
1532 }
1533 Ast::Group(g) => {
1534 if let ast::GroupKind::NonCapturing(ref flags) = g.kind {
1535 if !flags.items.is_empty() {
1536 let mut translator_builder = self.default_translator_builder();
1537 if let Some(state) = flags.flag_state(ast::Flag::CaseInsensitive) {
1538 translator_builder.case_insensitive(state);
1539 }
1540 if let Some(state) = flags.flag_state(ast::Flag::Unicode) {
1541 translator_builder.unicode(state);
1542 }
1543 let saved_dot_all = self.dot_all.get();
1544 if let Some(state) = flags.flag_state(ast::Flag::DotMatchesNewLine) {
1545 self.dot_all.set(state);
1546 }
1547 let mut scoped = Some(translator_builder.build());
1548 let result = self.ast_to_node_id(&g.ast, &mut scoped, tb);
1549 self.dot_all.set(saved_dot_all);
1550 return result;
1551 }
1552 }
1553 self.ast_to_node_id(&g.ast, translator, tb)
1554 }
1555 Ast::Alternation(a) => {
1556 let mut children = vec![];
1557 for ast in &a.asts {
1558 match self.ast_to_node_id(ast, translator, tb) {
1559 Ok(node_id) => children.push(node_id),
1560 Err(err) => return Err(err),
1561 }
1562 }
1563 Ok(tb.mk_unions(children.iter().copied()))
1564 }
1565 Ast::Concat(c) => {
1566 let mut concat_translator: Option<Translator> = None;
1567 let mut children = vec![];
1568 let mut i = 0;
1569 while i < c.asts.len() {
1570 let ast = &c.asts[i];
1571 match ast {
1572 Ast::Flags(f) => {
1573 if f.flags.flag_state(ast::Flag::SwapGreed).is_some() {
1574 return Err(
1575 self.error(f.span, ast::ErrorKind::UnsupportedResharpRegex)
1576 );
1577 }
1578 let mut translator_builder = self.default_translator_builder();
1579 if let Some(state) = f.flags.flag_state(ast::Flag::CaseInsensitive) {
1580 translator_builder.case_insensitive(state);
1581 }
1582 if let Some(state) = f.flags.flag_state(ast::Flag::Unicode) {
1583 translator_builder.unicode(state);
1584 }
1585 if let Some(state) = f.flags.flag_state(ast::Flag::DotMatchesNewLine) {
1586 self.dot_all.set(state);
1587 }
1588 concat_translator = Some(translator_builder.build());
1589 *translator = concat_translator.clone();
1590 i += 1;
1591 continue;
1592 }
1593 Ast::Assertion(a) if a.kind == ast::AssertionKind::WordBoundary => {
1594 let node =
1595 self.rewrite_word_boundary_in_concat(&c.asts, i, translator, tb)?;
1596 children.push(node.0);
1597 i = node.1; continue;
1599 }
1600 _ => {}
1601 }
1602 match concat_translator {
1603 Some(_) => match self.ast_to_node_id(ast, &mut concat_translator, tb) {
1604 Ok(node_id) => children.push(node_id),
1605 Err(err) => return Err(err),
1606 },
1607 None => match self.ast_to_node_id(ast, translator, tb) {
1608 Ok(node_id) => children.push(node_id),
1609 Err(err) => return Err(err),
1610 },
1611 }
1612 i += 1;
1613 }
1614 Ok(tb.mk_concats(children.iter().cloned()))
1615 }
1616 Ast::Intersection(intersection) => {
1617 let mut children = vec![];
1618 for ast in &intersection.asts {
1619 match self.ast_to_node_id(ast, translator, tb) {
1620 Ok(node_id) => children.push(node_id),
1621 Err(err) => return Err(err),
1622 }
1623 }
1624 Ok(tb.mk_inters(children.into_iter()))
1625 }
1626 Ast::Complement(complement) => {
1627 let body = self.ast_to_node_id(&complement.ast, translator, tb);
1628 body.map(|x| tb.mk_compl(x))
1629 }
1630 }
1631 }
1632
1633 fn parse_inner(&mut self) -> Result<Ast> {
1634 let mut concat = Concat {
1635 span: self.span(),
1636 asts: vec![],
1637 };
1638 loop {
1639 self.bump_space();
1640 if self.is_eof() {
1641 break;
1642 }
1643 match self.char() {
1644 '(' => concat = self.push_group(concat)?,
1645 ')' => concat = self.pop_group(concat)?,
1646 '|' => concat = self.push_alternate(concat)?,
1647 '&' => concat = self.push_intersect(concat)?,
1648 '~' => concat = self.push_compl_group(concat)?,
1649 '[' => {
1650 let class = self.parse_set_class()?;
1651 concat.asts.push(Ast::class_bracketed(class));
1652 }
1653 '?' => {
1654 concat =
1655 self.parse_uncounted_repetition(concat, ast::RepetitionKind::ZeroOrOne)?;
1656 }
1657 '*' => {
1658 concat =
1659 self.parse_uncounted_repetition(concat, ast::RepetitionKind::ZeroOrMore)?;
1660 }
1661 '+' => {
1662 concat =
1663 self.parse_uncounted_repetition(concat, ast::RepetitionKind::OneOrMore)?;
1664 }
1665 '{' => {
1666 concat = self.parse_counted_repetition(concat)?;
1667 }
1668 _ => concat.asts.push(self.parse_primitive()?.into_ast()),
1669 }
1670 }
1671 let ast = self.pop_group_end(concat)?;
1672 if expanded_ast_size(&ast, EXPANDED_AST_LIMIT) >= EXPANDED_AST_LIMIT
1673 || max_list_length(&ast) >= MAX_LIST_LEN
1674 {
1675 return Err(self.error(*ast.span(), ast::ErrorKind::UnsupportedResharpRegex));
1676 }
1677 Ok(ast)
1678 }
1679
1680 fn parse(&mut self, tb: &mut TB<'s>) -> Result<NodeId> {
1681 let ast = self.parse_inner()?;
1682 self.ast_to_node_id(&ast, &mut None, tb)
1683 }
1684
1685 #[inline(never)]
1686 fn parse_uncounted_repetition(
1687 &self,
1688 mut concat: ast::Concat,
1689 kind: ast::RepetitionKind,
1690 ) -> Result<ast::Concat> {
1691 let op_start = self.pos();
1693 let ast = match concat.asts.pop() {
1694 Some(ast) => ast,
1695 None => return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing)),
1696 };
1697 match ast {
1698 Ast::Empty(_) | Ast::Flags(_) => {
1699 return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing))
1700 }
1701 _ => {}
1702 }
1703 if self.bump() && self.char() == '?' {
1704 return Err(self.error(
1705 Span::new(op_start, self.pos()),
1706 ast::ErrorKind::UnsupportedLazyQuantifier,
1707 ));
1708 }
1709 concat.asts.push(Ast::repetition(ast::Repetition {
1710 span: ast.span().with_end(self.pos()),
1711 op: ast::RepetitionOp {
1712 span: Span::new(op_start, self.pos()),
1713 kind,
1714 },
1715 greedy: true,
1716 ast: Box::new(ast),
1717 }));
1718 Ok(concat)
1719 }
1720
1721 #[inline(never)]
1722 fn parse_counted_repetition(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
1723 assert!(self.char() == '{');
1724 let start = self.pos();
1725 let ast = match concat.asts.pop() {
1726 Some(ast) => ast,
1727 None => return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing)),
1728 };
1729 match ast {
1730 Ast::Empty(_) | Ast::Flags(_) => {
1731 return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing))
1732 }
1733 _ => {}
1734 }
1735 if !self.bump_and_bump_space() {
1736 return Err(self.error(
1737 Span::new(start, self.pos()),
1738 ast::ErrorKind::RepetitionCountUnclosed,
1739 ));
1740 }
1741 let count_start = specialize_err(
1742 self.parse_decimal(),
1743 ast::ErrorKind::DecimalEmpty,
1744 ast::ErrorKind::RepetitionCountDecimalEmpty,
1745 );
1746 if self.is_eof() {
1747 return Err(self.error(
1748 Span::new(start, self.pos()),
1749 ast::ErrorKind::RepetitionCountUnclosed,
1750 ));
1751 }
1752 let range = if self.char() == ',' {
1753 if !self.bump_and_bump_space() {
1754 return Err(self.error(
1755 Span::new(start, self.pos()),
1756 ast::ErrorKind::RepetitionCountUnclosed,
1757 ));
1758 }
1759 if self.char() != '}' {
1760 let count_start = match count_start {
1761 Ok(c) => c,
1762 Err(err) if err.kind == ast::ErrorKind::RepetitionCountDecimalEmpty => {
1763 if self.parser().empty_min_range {
1764 0
1765 } else {
1766 return Err(err);
1767 }
1768 }
1769 err => err?,
1770 };
1771 let count_end = specialize_err(
1772 self.parse_decimal(),
1773 ast::ErrorKind::DecimalEmpty,
1774 ast::ErrorKind::RepetitionCountDecimalEmpty,
1775 )?;
1776 ast::RepetitionRange::Bounded(count_start, count_end)
1777 } else {
1778 ast::RepetitionRange::AtLeast(count_start?)
1779 }
1780 } else {
1781 ast::RepetitionRange::Exactly(count_start?)
1782 };
1783
1784 if self.is_eof() || self.char() != '}' {
1785 return Err(self.error(
1786 Span::new(start, self.pos()),
1787 ast::ErrorKind::RepetitionCountUnclosed,
1788 ));
1789 }
1790
1791 if self.bump_and_bump_space() && self.char() == '?' {
1792 return Err(self.error(
1793 Span::new(start, self.pos()),
1794 ast::ErrorKind::UnsupportedLazyQuantifier,
1795 ));
1796 }
1797
1798 let op_span = Span::new(start, self.pos());
1799 if !range.is_valid() {
1800 return Err(self.error(op_span, ast::ErrorKind::RepetitionCountInvalid));
1801 }
1802
1803 let over_limit = match &range {
1804 ast::RepetitionRange::Exactly(n) => *n > REPETITION_COUNT_LIMIT,
1805 ast::RepetitionRange::AtLeast(n) => *n > REPETITION_COUNT_LIMIT,
1806 ast::RepetitionRange::Bounded(n, m) => {
1807 *n > REPETITION_COUNT_LIMIT || *m > REPETITION_COUNT_LIMIT
1808 }
1809 };
1810 if over_limit {
1811 return Err(self.error(op_span, ast::ErrorKind::UnsupportedResharpRegex));
1812 }
1813 concat.asts.push(Ast::repetition(ast::Repetition {
1814 span: ast.span().with_end(self.pos()),
1815 op: ast::RepetitionOp {
1816 span: op_span,
1817 kind: ast::RepetitionKind::Range(range),
1818 },
1819 greedy: true,
1820 ast: Box::new(ast),
1821 }));
1822 Ok(concat)
1823 }
1824
1825 #[inline(never)]
1826 fn parse_group(&self) -> Result<Either<ast::SetFlags, ast::Group>> {
1827 assert_eq!(self.char(), '(');
1828 let open_span = self.span_char();
1829 self.bump();
1830 self.bump_space();
1831 if let Some((ahead, pos)) = self.is_lookaround_prefix() {
1832 let kind = match (pos, ahead) {
1833 (true, true) => LookaroundKind::PositiveLookahead,
1834 (true, false) => LookaroundKind::PositiveLookbehind,
1835 (false, true) => LookaroundKind::NegativeLookahead,
1836 (false, false) => LookaroundKind::NegativeLookbehind,
1837 };
1838 return Ok(Either::Right(ast::Group {
1839 span: open_span,
1840 kind: ast::GroupKind::Lookaround(kind),
1841 ast: Box::new(Ast::empty(self.span())),
1842 }));
1843 }
1844 let inner_span = self.span();
1845 let mut starts_with_p = true;
1846 if self.bump_if("?P<") || {
1847 starts_with_p = false;
1848 self.bump_if("?<")
1849 } {
1850 let capture_index = self.next_capture_index(open_span)?;
1851 let name = self.parse_capture_name(capture_index)?;
1852 Ok(Either::Right(ast::Group {
1853 span: open_span,
1854 kind: ast::GroupKind::CaptureName {
1855 starts_with_p,
1856 name,
1857 },
1858 ast: Box::new(Ast::empty(self.span())),
1859 }))
1860 } else if self.bump_if("?") {
1861 if self.is_eof() {
1862 return Err(self.error(open_span, ast::ErrorKind::GroupUnclosed));
1863 }
1864 let flags = self.parse_flags()?;
1865 let char_end = self.char();
1866 self.bump();
1867 if char_end == ')' {
1868 if flags.items.is_empty() {
1871 return Err(self.error(inner_span, ast::ErrorKind::RepetitionMissing));
1872 }
1873 Ok(Either::Left(ast::SetFlags {
1874 span: Span {
1875 end: self.pos(),
1876 ..open_span
1877 },
1878 flags,
1879 }))
1880 } else {
1881 assert_eq!(char_end, ':');
1882 Ok(Either::Right(ast::Group {
1883 span: open_span,
1884 kind: ast::GroupKind::NonCapturing(flags),
1885 ast: Box::new(Ast::empty(self.span())),
1886 }))
1887 }
1888 } else {
1889 let capture_index = self.next_capture_index(open_span)?;
1890 Ok(Either::Right(ast::Group {
1891 span: open_span,
1892 kind: ast::GroupKind::CaptureIndex(capture_index),
1893 ast: Box::new(Ast::empty(self.span())),
1894 }))
1895 }
1896 }
1897
1898 #[inline(never)]
1899 fn parse_capture_name(&self, capture_index: u32) -> Result<ast::CaptureName> {
1900 if self.is_eof() {
1901 return Err(self.error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
1902 }
1903 let start = self.pos();
1904 loop {
1905 if self.char() == '>' {
1906 break;
1907 }
1908 if !is_capture_char(self.char(), self.pos() == start) {
1909 return Err(self.error(self.span_char(), ast::ErrorKind::GroupNameInvalid));
1910 }
1911 if !self.bump() {
1912 break;
1913 }
1914 }
1915 let end = self.pos();
1916 if self.is_eof() {
1917 return Err(self.error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
1918 }
1919 assert_eq!(self.char(), '>');
1920 self.bump();
1921 let name = &self.pattern()[start.offset..end.offset];
1922 if name.is_empty() {
1923 return Err(self.error(Span::new(start, start), ast::ErrorKind::GroupNameEmpty));
1924 }
1925 let capname = ast::CaptureName {
1926 span: Span::new(start, end),
1927 name: name.to_string(),
1928 index: capture_index,
1929 };
1930 self.add_capture_name(&capname)?;
1931 Ok(capname)
1932 }
1933
1934 #[inline(never)]
1935 fn parse_flags(&self) -> Result<ast::Flags> {
1936 let mut flags = ast::Flags {
1937 span: self.span(),
1938 items: vec![],
1939 };
1940 let mut last_was_negation = None;
1941 while self.char() != ':' && self.char() != ')' {
1942 if self.char() == '-' {
1943 last_was_negation = Some(self.span_char());
1944 let item = ast::FlagsItem {
1945 span: self.span_char(),
1946 kind: ast::FlagsItemKind::Negation,
1947 };
1948 if let Some(i) = flags.add_item(item) {
1949 return Err(self.error(
1950 self.span_char(),
1951 ast::ErrorKind::FlagRepeatedNegation {
1952 original: flags.items[i].span,
1953 },
1954 ));
1955 }
1956 } else {
1957 last_was_negation = None;
1958 let item = ast::FlagsItem {
1959 span: self.span_char(),
1960 kind: ast::FlagsItemKind::Flag(self.parse_flag()?),
1961 };
1962 if let Some(i) = flags.add_item(item) {
1963 return Err(self.error(
1964 self.span_char(),
1965 ast::ErrorKind::FlagDuplicate {
1966 original: flags.items[i].span,
1967 },
1968 ));
1969 }
1970 }
1971 if !self.bump() {
1972 return Err(self.error(self.span(), ast::ErrorKind::FlagUnexpectedEof));
1973 }
1974 }
1975 if let Some(span) = last_was_negation {
1976 return Err(self.error(span, ast::ErrorKind::FlagDanglingNegation));
1977 }
1978 flags.span.end = self.pos();
1979 Ok(flags)
1980 }
1981
1982 #[inline(never)]
1983 fn parse_flag(&self) -> Result<ast::Flag> {
1984 match self.char() {
1985 'i' => Ok(ast::Flag::CaseInsensitive),
1986 'm' => Ok(ast::Flag::MultiLine),
1987 's' => Ok(ast::Flag::DotMatchesNewLine),
1988 'U' => Ok(ast::Flag::SwapGreed),
1989 'u' => Ok(ast::Flag::Unicode),
1990 'R' => Ok(ast::Flag::CRLF),
1991 'x' => Ok(ast::Flag::IgnoreWhitespace),
1992 _ => Err(self.error(self.span_char(), ast::ErrorKind::FlagUnrecognized)),
1993 }
1994 }
1995
1996 fn parse_primitive(&self) -> Result<Primitive> {
1997 match self.char() {
1998 '\\' => self.parse_escape(),
1999 '_' => {
2000 let ast = Primitive::Top(self.span_char());
2001 self.bump();
2002 Ok(ast)
2003 }
2004 '.' => {
2005 let ast = Primitive::Dot(self.span_char());
2006 self.bump();
2007 Ok(ast)
2008 }
2009 '^' => {
2010 let ast = Primitive::Assertion(ast::Assertion {
2011 span: self.span_char(),
2012 kind: ast::AssertionKind::StartLine,
2013 });
2014 self.bump();
2015 Ok(ast)
2016 }
2017 '$' => {
2018 let ast = Primitive::Assertion(ast::Assertion {
2019 span: self.span_char(),
2020 kind: ast::AssertionKind::EndLine,
2021 });
2022 self.bump();
2023 Ok(ast)
2024 }
2025 c => {
2026 let ast = Primitive::Literal(Literal {
2027 span: self.span_char(),
2028 kind: LiteralKind::Verbatim,
2029 c,
2030 });
2031 self.bump();
2032 Ok(ast)
2033 }
2034 }
2035 }
2036
2037 #[inline(never)]
2038 fn parse_escape(&self) -> Result<Primitive> {
2039 assert_eq!(self.char(), '\\');
2040 let start = self.pos();
2041 if !self.bump() {
2042 return Err(self.error(
2043 Span::new(start, self.pos()),
2044 ast::ErrorKind::EscapeUnexpectedEof,
2045 ));
2046 }
2047 let c = self.char();
2048 match c {
2050 '0'..='9' => {
2051 if !self.parser().octal {
2052 return Err(self.error(
2053 Span::new(start, self.span_char().end),
2054 ast::ErrorKind::UnsupportedBackreference,
2055 ));
2056 }
2057 let mut lit = self.parse_octal();
2058 lit.span.start = start;
2059 return Ok(Primitive::Literal(lit));
2060 }
2061 'x' | 'u' | 'U' => {
2068 let mut lit = self.parse_hex()?;
2069 lit.span.start = start;
2070 return Ok(Primitive::Literal(lit));
2071 }
2072 'p' | 'P' => {
2073 let mut cls = self.parse_unicode_class()?;
2074 cls.span.start = start;
2075 return Ok(Primitive::Unicode(cls));
2076 }
2077 'd' | 's' | 'w' | 'D' | 'S' | 'W' => {
2078 let mut cls = self.parse_perl_class();
2079 cls.span.start = start;
2080 return Ok(Primitive::Perl(cls));
2081 }
2082 _ => {}
2083 }
2084
2085 self.bump();
2087 let span = Span::new(start, self.pos());
2088 if is_meta_character(c) {
2089 return Ok(Primitive::Literal(Literal {
2090 span,
2091 kind: LiteralKind::Meta,
2092 c,
2093 }));
2094 }
2095 if is_escapeable_character(c) {
2096 return Ok(Primitive::Literal(Literal {
2097 span,
2098 kind: LiteralKind::Superfluous,
2099 c,
2100 }));
2101 }
2102 let special = |kind, c| {
2103 Ok(Primitive::Literal(Literal {
2104 span,
2105 kind: LiteralKind::Special(kind),
2106 c,
2107 }))
2108 };
2109 match c {
2110 'a' => special(SpecialLiteralKind::Bell, '\x07'),
2111 'f' => special(SpecialLiteralKind::FormFeed, '\x0C'),
2112 't' => special(SpecialLiteralKind::Tab, '\t'),
2113 'n' => special(SpecialLiteralKind::LineFeed, '\n'),
2114 'r' => special(SpecialLiteralKind::CarriageReturn, '\r'),
2115 'v' => special(SpecialLiteralKind::VerticalTab, '\x0B'),
2116 'A' => Ok(Primitive::Assertion(ast::Assertion {
2117 span,
2118 kind: ast::AssertionKind::StartText,
2119 })),
2120 'z' => Ok(Primitive::Assertion(ast::Assertion {
2121 span,
2122 kind: ast::AssertionKind::EndText,
2123 })),
2124 'b' => {
2125 let mut wb = ast::Assertion {
2126 span,
2127 kind: ast::AssertionKind::WordBoundary,
2128 };
2129 if !self.is_eof() && self.char() == '{' {
2132 if let Some(kind) = self.maybe_parse_special_word_boundary(start)? {
2133 wb.kind = kind;
2134 wb.span.end = self.pos();
2135 }
2136 }
2137 Ok(Primitive::Assertion(wb))
2138 }
2139 'B' => Ok(Primitive::Assertion(ast::Assertion {
2140 span,
2141 kind: ast::AssertionKind::NotWordBoundary,
2142 })),
2143 '<' => Ok(Primitive::Assertion(ast::Assertion {
2144 span,
2145 kind: ast::AssertionKind::WordBoundaryStartAngle,
2146 })),
2147 '>' => Ok(Primitive::Assertion(ast::Assertion {
2148 span,
2149 kind: ast::AssertionKind::WordBoundaryEndAngle,
2150 })),
2151 _ => Err(self.error(span, ast::ErrorKind::EscapeUnrecognized)),
2152 }
2153 }
2154
2155 fn maybe_parse_special_word_boundary(
2156 &self,
2157 wb_start: Position,
2158 ) -> Result<Option<ast::AssertionKind>> {
2159 assert_eq!(self.char(), '{');
2160
2161 let is_valid_char = |c| matches!(c, 'A'..='Z' | 'a'..='z' | '-');
2162 let start = self.pos();
2163 if !self.bump_and_bump_space() {
2164 return Err(self.error(
2165 Span::new(wb_start, self.pos()),
2166 ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof,
2167 ));
2168 }
2169 let start_contents = self.pos();
2170 if !is_valid_char(self.char()) {
2171 self.parser().pos.set(start);
2172 return Ok(None);
2173 }
2174
2175 let mut scratch = self.parser().scratch.borrow_mut();
2177 scratch.clear();
2178 while !self.is_eof() && is_valid_char(self.char()) {
2179 scratch.push(self.char());
2180 self.bump_and_bump_space();
2181 }
2182 if self.is_eof() || self.char() != '}' {
2183 return Err(self.error(
2184 Span::new(start, self.pos()),
2185 ast::ErrorKind::SpecialWordBoundaryUnclosed,
2186 ));
2187 }
2188 let end = self.pos();
2189 self.bump();
2190 let kind = match scratch.as_str() {
2191 "start" => ast::AssertionKind::WordBoundaryStart,
2192 "end" => ast::AssertionKind::WordBoundaryEnd,
2193 "start-half" => ast::AssertionKind::WordBoundaryStartHalf,
2194 "end-half" => ast::AssertionKind::WordBoundaryEndHalf,
2195 _ => {
2196 return Err(self.error(
2197 Span::new(start_contents, end),
2198 ast::ErrorKind::SpecialWordBoundaryUnrecognized,
2199 ))
2200 }
2201 };
2202 Ok(Some(kind))
2203 }
2204
2205 #[inline(never)]
2206 fn parse_octal(&self) -> Literal {
2207 assert!(self.parser().octal);
2208 assert!('0' <= self.char() && self.char() <= '7');
2209 let start = self.pos();
2210 while self.bump()
2212 && '0' <= self.char()
2213 && self.char() <= '7'
2214 && self.pos().offset - start.offset <= 2
2215 {}
2216 let end = self.pos();
2217 let octal = &self.pattern()[start.offset..end.offset];
2218 let codepoint = u32::from_str_radix(octal, 8).expect("valid octal number");
2221 let c = char::from_u32(codepoint).expect("Unicode scalar value");
2224 Literal {
2225 span: Span::new(start, end),
2226 kind: LiteralKind::Octal,
2227 c,
2228 }
2229 }
2230
2231 #[inline(never)]
2232 fn parse_hex(&self) -> Result<Literal> {
2233 assert!(self.char() == 'x' || self.char() == 'u' || self.char() == 'U');
2234
2235 let hex_kind = match self.char() {
2236 'x' => HexLiteralKind::X,
2237 'u' => HexLiteralKind::UnicodeShort,
2238 _ => HexLiteralKind::UnicodeLong,
2239 };
2240 if !self.bump_and_bump_space() {
2241 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2242 }
2243 if self.char() == '{' {
2244 self.parse_hex_brace(hex_kind)
2245 } else {
2246 self.parse_hex_digits(hex_kind)
2247 }
2248 }
2249
2250 #[inline(never)]
2251 fn parse_hex_digits(&self, kind: HexLiteralKind) -> Result<Literal> {
2252 let mut scratch = self.parser().scratch.borrow_mut();
2253 scratch.clear();
2254
2255 let start = self.pos();
2256 for i in 0..kind.digits() {
2257 if i > 0 && !self.bump_and_bump_space() {
2258 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2259 }
2260 if !is_hex(self.char()) {
2261 return Err(self.error(self.span_char(), ast::ErrorKind::EscapeHexInvalidDigit));
2262 }
2263 scratch.push(self.char());
2264 }
2265 self.bump_and_bump_space();
2266 let end = self.pos();
2267 let hex = scratch.as_str();
2268 match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
2269 None => Err(self.error(Span::new(start, end), ast::ErrorKind::EscapeHexInvalid)),
2270 Some(c) => Ok(Literal {
2271 span: Span::new(start, end),
2272 kind: LiteralKind::HexFixed(kind),
2273 c,
2274 }),
2275 }
2276 }
2277
2278 #[inline(never)]
2279 fn parse_hex_brace(&self, kind: HexLiteralKind) -> Result<Literal> {
2280 let mut scratch = self.parser().scratch.borrow_mut();
2281 scratch.clear();
2282
2283 let brace_pos = self.pos();
2284 let start = self.span_char().end;
2285 while self.bump_and_bump_space() && self.char() != '}' {
2286 if !is_hex(self.char()) {
2287 return Err(self.error(self.span_char(), ast::ErrorKind::EscapeHexInvalidDigit));
2288 }
2289 scratch.push(self.char());
2290 }
2291 if self.is_eof() {
2292 return Err(self.error(
2293 Span::new(brace_pos, self.pos()),
2294 ast::ErrorKind::EscapeUnexpectedEof,
2295 ));
2296 }
2297 let end = self.pos();
2298 let hex = scratch.as_str();
2299 assert_eq!(self.char(), '}');
2300 self.bump_and_bump_space();
2301
2302 if hex.is_empty() {
2303 return Err(self.error(
2304 Span::new(brace_pos, self.pos()),
2305 ast::ErrorKind::EscapeHexEmpty,
2306 ));
2307 }
2308 match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
2309 None => Err(self.error(Span::new(start, end), ast::ErrorKind::EscapeHexInvalid)),
2310 Some(c) => Ok(Literal {
2311 span: Span::new(start, self.pos()),
2312 kind: LiteralKind::HexBrace(kind),
2313 c,
2314 }),
2315 }
2316 }
2317
2318 fn parse_decimal(&self) -> Result<u32> {
2319 let mut scratch = self.parser().scratch.borrow_mut();
2320 scratch.clear();
2321
2322 while !self.is_eof() && self.char().is_whitespace() {
2323 self.bump();
2324 }
2325 let start = self.pos();
2326 while !self.is_eof() && '0' <= self.char() && self.char() <= '9' {
2327 scratch.push(self.char());
2328 self.bump_and_bump_space();
2329 }
2330 let span = Span::new(start, self.pos());
2331 while !self.is_eof() && self.char().is_whitespace() {
2332 self.bump_and_bump_space();
2333 }
2334 let digits = scratch.as_str();
2335 if digits.is_empty() {
2336 return Err(self.error(span, ast::ErrorKind::DecimalEmpty));
2337 }
2338 match digits.parse::<u32>().ok() {
2339 Some(n) => Ok(n),
2340 None => Err(self.error(span, ast::ErrorKind::DecimalInvalid)),
2341 }
2342 }
2343
2344 #[inline(never)]
2345 fn parse_set_class(&self) -> Result<ClassBracketed> {
2346 assert_eq!(self.char(), '[');
2347
2348 let mut union = ClassSetUnion {
2349 span: self.span(),
2350 items: vec![],
2351 };
2352 loop {
2353 self.bump_space();
2354 if self.is_eof() {
2355 return Err(self.unclosed_class_error());
2356 }
2357 match self.char() {
2358 '[' => {
2359 if !self.parser().stack_class.borrow().is_empty() {
2360 if let Some(cls) = self.maybe_parse_ascii_class() {
2361 union.push(ClassSetItem::Ascii(cls));
2362 continue;
2363 }
2364 }
2365 union = self.push_class_open(union)?;
2366 }
2367 ']' => match self.pop_class(union)? {
2368 Either::Left(nested_union) => {
2369 union = nested_union;
2370 }
2371 Either::Right(class) => return Ok(class),
2372 },
2373 '&' if self.peek() == Some('&') => {
2374 assert!(self.bump_if("&&"));
2375 union = self.push_class_op(ClassSetBinaryOpKind::Intersection, union);
2376 }
2377 '-' if self.peek() == Some('-') => {
2378 assert!(self.bump_if("--"));
2379 union = self.push_class_op(ClassSetBinaryOpKind::Difference, union);
2380 }
2381 '~' if self.peek() == Some('~') => {
2382 assert!(self.bump_if("~~"));
2383 union = self.push_class_op(ClassSetBinaryOpKind::SymmetricDifference, union);
2384 }
2385 _ => {
2386 union.push(self.parse_set_class_range()?);
2387 }
2388 }
2389 }
2390 }
2391
2392 #[inline(never)]
2393 fn parse_set_class_range(&self) -> Result<ClassSetItem> {
2394 let prim1 = self.parse_set_class_item()?;
2395 self.bump_space();
2396 if self.is_eof() {
2397 return Err(self.unclosed_class_error());
2398 }
2399 if self.char() != '-' || self.peek_space() == Some(']') || self.peek_space() == Some('-') {
2400 return prim1.into_class_set_item(self);
2401 }
2402 if !self.bump_and_bump_space() {
2403 return Err(self.unclosed_class_error());
2404 }
2405 let prim2 = self.parse_set_class_item()?;
2406 let range = ClassSetRange {
2407 span: Span::new(prim1.span().start, prim2.span().end),
2408 start: prim1.into_class_literal(self)?,
2409 end: prim2.into_class_literal(self)?,
2410 };
2411 if !range.is_valid() {
2412 return Err(self.error(range.span, ast::ErrorKind::ClassRangeInvalid));
2413 }
2414 Ok(ClassSetItem::Range(range))
2415 }
2416
2417 #[inline(never)]
2418 fn parse_set_class_item(&self) -> Result<Primitive> {
2419 if self.char() == '\\' {
2420 self.parse_escape()
2421 } else {
2422 let x = Primitive::Literal(Literal {
2423 span: self.span_char(),
2424 kind: LiteralKind::Verbatim,
2425 c: self.char(),
2426 });
2427 self.bump();
2428 Ok(x)
2429 }
2430 }
2431
2432 #[inline(never)]
2433 fn parse_set_class_open(&self) -> Result<(ClassBracketed, ClassSetUnion)> {
2434 assert_eq!(self.char(), '[');
2435 let start = self.pos();
2436 if !self.bump_and_bump_space() {
2437 return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2438 }
2439
2440 let negated = if self.char() != '^' {
2441 false
2442 } else {
2443 if !self.bump_and_bump_space() {
2444 return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2445 }
2446 true
2447 };
2448 let mut union = ClassSetUnion {
2450 span: self.span(),
2451 items: vec![],
2452 };
2453 while self.char() == '-' {
2454 union.push(ClassSetItem::Literal(Literal {
2455 span: self.span_char(),
2456 kind: LiteralKind::Verbatim,
2457 c: '-',
2458 }));
2459 if !self.bump_and_bump_space() {
2460 return Err(self.error(Span::new(start, start), ast::ErrorKind::ClassUnclosed));
2461 }
2462 }
2463 if union.items.is_empty() && self.char() == ']' {
2466 union.push(ClassSetItem::Literal(Literal {
2467 span: self.span_char(),
2468 kind: LiteralKind::Verbatim,
2469 c: ']',
2470 }));
2471 if !self.bump_and_bump_space() {
2472 return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2473 }
2474 }
2475 let set = ClassBracketed {
2476 span: Span::new(start, self.pos()),
2477 negated,
2478 kind: ClassSet::union(ClassSetUnion {
2479 span: Span::new(union.span.start, union.span.start),
2480 items: vec![],
2481 }),
2482 };
2483 Ok((set, union))
2484 }
2485
2486 #[inline(never)]
2487 fn maybe_parse_ascii_class(&self) -> Option<ClassAscii> {
2488 assert_eq!(self.char(), '[');
2489 let start = self.pos();
2491 let mut negated = false;
2492 if !self.bump() || self.char() != ':' {
2493 self.parser().pos.set(start);
2494 return None;
2495 }
2496 if !self.bump() {
2497 self.parser().pos.set(start);
2498 return None;
2499 }
2500 if self.char() == '^' {
2501 negated = true;
2502 if !self.bump() {
2503 self.parser().pos.set(start);
2504 return None;
2505 }
2506 }
2507 let name_start = self.offset();
2508 while self.char() != ':' && self.bump() {}
2509 if self.is_eof() {
2510 self.parser().pos.set(start);
2511 return None;
2512 }
2513 let name = &self.pattern()[name_start..self.offset()];
2514 if !self.bump_if(":]") {
2515 self.parser().pos.set(start);
2516 return None;
2517 }
2518 let kind = match regex_syntax::ast::ClassAsciiKind::from_name(name) {
2519 Some(kind) => kind,
2520 None => {
2521 self.parser().pos.set(start);
2522 return None;
2523 }
2524 };
2525 Some(ClassAscii {
2526 span: Span::new(start, self.pos()),
2527 kind,
2528 negated,
2529 })
2530 }
2531
2532 #[inline(never)]
2533 fn parse_unicode_class(&self) -> Result<ClassUnicode> {
2534 assert!(self.char() == 'p' || self.char() == 'P');
2535
2536 let mut scratch = self.parser().scratch.borrow_mut();
2537 scratch.clear();
2538
2539 let negated = self.char() == 'P';
2540 if !self.bump_and_bump_space() {
2541 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2542 }
2543 let (start, kind) = if self.char() == '{' {
2544 let start = self.span_char().end;
2545 while self.bump_and_bump_space() && self.char() != '}' {
2546 scratch.push(self.char());
2547 }
2548 if self.is_eof() {
2549 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2550 }
2551 assert_eq!(self.char(), '}');
2552 self.bump();
2553
2554 let name = scratch.as_str();
2555 if let Some(i) = name.find("!=") {
2556 (
2557 start,
2558 ClassUnicodeKind::NamedValue {
2559 op: ClassUnicodeOpKind::NotEqual,
2560 name: name[..i].to_string(),
2561 value: name[i + 2..].to_string(),
2562 },
2563 )
2564 } else if let Some(i) = name.find(':') {
2565 (
2566 start,
2567 ClassUnicodeKind::NamedValue {
2568 op: ClassUnicodeOpKind::Colon,
2569 name: name[..i].to_string(),
2570 value: name[i + 1..].to_string(),
2571 },
2572 )
2573 } else if let Some(i) = name.find('=') {
2574 (
2575 start,
2576 ClassUnicodeKind::NamedValue {
2577 op: ClassUnicodeOpKind::Equal,
2578 name: name[..i].to_string(),
2579 value: name[i + 1..].to_string(),
2580 },
2581 )
2582 } else {
2583 (start, ClassUnicodeKind::Named(name.to_string()))
2584 }
2585 } else {
2586 let start = self.pos();
2587 let c = self.char();
2588 if c == '\\' {
2589 return Err(self.error(self.span_char(), ast::ErrorKind::UnicodeClassInvalid));
2590 }
2591 self.bump_and_bump_space();
2592 let kind = ClassUnicodeKind::OneLetter(c);
2593 (start, kind)
2594 };
2595 Ok(ClassUnicode {
2596 span: Span::new(start, self.pos()),
2597 negated,
2598 kind,
2599 })
2600 }
2601
2602 #[inline(never)]
2603 fn parse_perl_class(&self) -> ClassPerl {
2604 let c = self.char();
2605 let span = self.span_char();
2606 self.bump();
2607 let (negated, kind) = match c {
2608 'd' => (false, regex_syntax::ast::ClassPerlKind::Digit),
2609 'D' => (true, regex_syntax::ast::ClassPerlKind::Digit),
2610 's' => (false, regex_syntax::ast::ClassPerlKind::Space),
2611 'S' => (true, regex_syntax::ast::ClassPerlKind::Space),
2612 'w' => (false, regex_syntax::ast::ClassPerlKind::Word),
2613 'W' => (true, regex_syntax::ast::ClassPerlKind::Word),
2614 c => panic!("expected valid Perl class but got '{}'", c),
2615 };
2616 ClassPerl {
2617 span,
2618 kind,
2619 negated,
2620 }
2621 }
2622}
2623
2624fn is_universal_perl_pair(item: ®ex_syntax::ast::ClassSetItem) -> bool {
2627 use regex_syntax::ast::ClassSetItem;
2628 let items = match item {
2629 ClassSetItem::Union(u) => &u.items,
2630 _ => return false,
2631 };
2632 if items.len() != 2 {
2633 return false;
2634 }
2635 match (&items[0], &items[1]) {
2636 (ClassSetItem::Perl(a), ClassSetItem::Perl(b)) => {
2637 let is_all = a.kind == b.kind && a.negated != b.negated;
2638 is_all
2639 }
2640 _ => false,
2641 }
2642}
2643
2644pub fn max_list_length(ast: &ast::Ast) -> usize {
2645 match ast {
2646 ast::Ast::Empty(_)
2647 | ast::Ast::Flags(_)
2648 | ast::Ast::Literal(_)
2649 | ast::Ast::Dot(_)
2650 | ast::Ast::Top(_)
2651 | ast::Ast::Assertion(_)
2652 | ast::Ast::ClassUnicode(_)
2653 | ast::Ast::ClassPerl(_)
2654 | ast::Ast::ClassBracketed(_) => 0,
2655 ast::Ast::Group(g) => max_list_length(&g.ast),
2656 ast::Ast::Complement(c) => max_list_length(&c.ast),
2657 ast::Ast::Lookaround(l) => max_list_length(&l.ast),
2658 ast::Ast::Repetition(r) => max_list_length(&r.ast),
2659 ast::Ast::Concat(c) => c
2660 .asts
2661 .len()
2662 .max(c.asts.iter().map(max_list_length).max().unwrap_or(0)),
2663 ast::Ast::Alternation(a) => a
2664 .asts
2665 .len()
2666 .max(a.asts.iter().map(max_list_length).max().unwrap_or(0)),
2667 ast::Ast::Intersection(i) => i
2668 .asts
2669 .len()
2670 .max(i.asts.iter().map(max_list_length).max().unwrap_or(0)),
2671 }
2672}
2673
2674pub fn expanded_ast_size(ast: &ast::Ast, limit: u64) -> u64 {
2675 fn go(ast: &ast::Ast, limit: u64) -> u64 {
2676 match ast {
2677 ast::Ast::Empty(_) | ast::Ast::Flags(_) => 1,
2678 ast::Ast::Literal(_) | ast::Ast::Dot(_) | ast::Ast::Top(_) => 1,
2679 ast::Ast::Assertion(_) => 1,
2680 ast::Ast::ClassUnicode(_) | ast::Ast::ClassPerl(_) | ast::Ast::ClassBracketed(_) => 1,
2681 ast::Ast::Group(g) => go(&g.ast, limit).saturating_add(1).min(limit),
2682 ast::Ast::Complement(c) => go(&c.ast, limit).saturating_add(1).min(limit),
2683 ast::Ast::Lookaround(l) => go(&l.ast, limit).saturating_add(1).min(limit),
2684 ast::Ast::Concat(c) => sum_children(&c.asts, limit),
2685 ast::Ast::Alternation(a) => sum_children(&a.asts, limit),
2686 ast::Ast::Intersection(i) => sum_children(&i.asts, limit),
2687 ast::Ast::Repetition(r) => {
2688 let body = go(&r.ast, limit);
2689 let factor: u64 = match &r.op.kind {
2690 ast::RepetitionKind::ZeroOrOne => 2,
2691 ast::RepetitionKind::ZeroOrMore | ast::RepetitionKind::OneOrMore => 2,
2692 ast::RepetitionKind::Range(ast::RepetitionRange::Exactly(n)) => {
2693 (*n as u64).max(1)
2694 }
2695 ast::RepetitionKind::Range(ast::RepetitionRange::AtLeast(n)) => {
2696 (*n as u64).max(1).saturating_add(1)
2697 }
2698 ast::RepetitionKind::Range(ast::RepetitionRange::Bounded(_, m)) => {
2699 (*m as u64).max(1)
2700 }
2701 };
2702 body.saturating_mul(factor).min(limit)
2703 }
2704 }
2705 }
2706 fn sum_children(children: &[ast::Ast], limit: u64) -> u64 {
2707 let mut total: u64 = 0;
2708 for c in children {
2709 total = total.saturating_add(go(c, limit));
2710 if total >= limit {
2711 return limit;
2712 }
2713 }
2714 total
2715 }
2716 go(ast, limit)
2717}
2718
2719pub fn parse_ast<'s>(tb: &mut TB<'s>, pattern: &'s str) -> std::result::Result<NodeId, ParseError> {
2720 let mut p: ResharpParser<'s> = ResharpParser::new(pattern);
2721 p.parse(tb)
2722}
2723
2724pub fn parse_ast_with<'s>(
2725 tb: &mut TB<'s>,
2726 pattern: &'s str,
2727 flags: &PatternFlags,
2728) -> std::result::Result<NodeId, ParseError> {
2729 let mut p: ResharpParser<'s> = ResharpParser::with_flags(pattern, flags);
2730 p.parse(tb)
2731}
2732
2733pub fn parse_to_ast(pattern: &str) -> std::result::Result<ast::Ast, ParseError> {
2735 let mut p: ResharpParser = ResharpParser::new(pattern);
2736 p.parse_inner()
2737}