1#![warn(dead_code)]
6pub mod ast;
7use std::cell::{Cell, RefCell};
8
9use ast::{Ast, Concat, ErrorKind, GroupKind, LookaroundKind};
10use regex_syntax::{
11 ast::{
12 ClassAscii, ClassBracketed, ClassPerl, ClassSet, ClassSetBinaryOpKind, ClassSetItem,
13 ClassSetRange, ClassSetUnion, ClassUnicode, ClassUnicodeKind, ClassUnicodeOpKind,
14 HexLiteralKind, Literal, LiteralKind, Position, Span, SpecialLiteralKind,
15 },
16 hir::{
17 self,
18 translate::{Translator, TranslatorBuilder},
19 },
20 utf8::Utf8Sequences,
21};
22use resharp_algebra::NodeId;
23
/// Shorthand for the algebra builder that parsed patterns are lowered into.
///
/// The `'s` parameter is not used by the aliased type; it only lets method
/// signatures in this file write `TB<'s>` next to `ResharpParser<'s>`.
type TB<'s> = resharp_algebra::RegexBuilder;
25
/// Global options that seed a [`ResharpParser`] (see `ResharpParser::with_flags`).
pub struct PatternFlags {
    /// Forwarded to the HIR translator's `unicode` switch; also enables the
    /// Unicode-aware perl classes (`\w`, `\d`, `\s`).
    pub unicode: bool,
    /// Selects the "full" Unicode word/digit tables; implies Unicode perl
    /// classes even when `unicode` is off.
    pub full_unicode: bool,
    /// Forwarded to the HIR translator's `case_insensitive` switch.
    pub case_insensitive: bool,
    /// Initial value of the parser's dot-all mode.
    pub dot_matches_new_line: bool,
    /// Initial value of the parser's whitespace-insensitive mode, in which
    /// whitespace and `#` comments in the pattern are skipped.
    pub ignore_whitespace: bool,
}

impl Default for PatternFlags {
    /// Unicode on, every other option off.
    fn default() -> Self {
        PatternFlags {
            // The only option that is enabled out of the box.
            unicode: true,
            full_unicode: false,
            case_insensitive: false,
            dot_matches_new_line: false,
            ignore_whitespace: false,
        }
    }
}
51
/// Static classification of the character that sits next to a word boundary.
#[derive(Clone, Copy, PartialEq, Debug)]
enum WordCharKind {
    /// The neighbouring character is always a word character.
    Word,
    /// The neighbouring character is never a word character.
    NonWord,
    /// A nullable repetition of word characters: it may match nothing, so the
    /// element after it decides.
    MaybeWord,
    /// A nullable repetition of non-word characters.
    MaybeNonWord,
    /// Nothing could be determined syntactically.
    Unknown,
    /// There is no neighbour: the index fell off either end of the concat.
    Edge,
}
61
/// Returns `true` for the ASCII word bytes `[0-9A-Za-z_]`.
fn is_word_byte(b: u8) -> bool {
    matches!(b, b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z' | b'_')
}
65
/// A single atom produced by the parser before it is placed either into the
/// AST (`into_ast`) or into a character class (`into_class_set_item`).
#[derive(Clone, Debug, Eq, PartialEq)]
enum Primitive {
    /// A literal character.
    Literal(Literal),
    /// A zero-width assertion.
    Assertion(ast::Assertion),
    /// The dot atom; carries only its span.
    Dot(Span),
    /// The "top" atom (lowered with `Ast::top`); carries only its span.
    Top(Span),
    /// A perl class.
    Perl(ClassPerl),
    /// A Unicode class.
    Unicode(ClassUnicode),
}
75
76impl Primitive {
77 fn span(&self) -> &Span {
78 match *self {
79 Primitive::Literal(ref x) => &x.span,
80 Primitive::Assertion(ref x) => &x.span,
81 Primitive::Dot(ref span) => span,
82 Primitive::Top(ref span) => span,
83 Primitive::Perl(ref x) => &x.span,
84 Primitive::Unicode(ref x) => &x.span,
85 }
86 }
87
88 fn into_ast(self) -> Ast {
89 match self {
90 Primitive::Literal(lit) => Ast::literal(lit),
91 Primitive::Assertion(assert) => Ast::assertion(assert),
92 Primitive::Dot(span) => Ast::dot(span),
93 Primitive::Top(span) => Ast::top(span),
94 Primitive::Perl(cls) => Ast::class_perl(cls),
95 Primitive::Unicode(cls) => Ast::class_unicode(cls),
96 }
97 }
98
99 fn into_class_set_item(self, p: &ResharpParser) -> Result<regex_syntax::ast::ClassSetItem> {
100 use self::Primitive::*;
101 use regex_syntax::ast::ClassSetItem;
102
103 match self {
104 Literal(lit) => Ok(ClassSetItem::Literal(lit)),
105 Perl(cls) => Ok(ClassSetItem::Perl(cls)),
106 Unicode(cls) => Ok(ClassSetItem::Unicode(cls)),
107 x => Err(p.error(*x.span(), ast::ErrorKind::ClassEscapeInvalid)),
108 }
109 }
110
111 fn into_class_literal(self, p: &ResharpParser) -> Result<Literal> {
112 use self::Primitive::*;
113
114 match self {
115 Literal(lit) => Ok(lit),
116 x => Err(p.error(*x.span(), ast::ErrorKind::ClassRangeLiteral)),
117 }
118 }
119}
120
/// A value that is one of two alternatives.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum Either<Left, Right> {
    /// The first alternative.
    Left(Left),
    /// The second alternative.
    Right(Right),
}
126
/// An error produced while parsing or lowering a pattern.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct ResharpError {
    /// What went wrong.
    pub kind: ErrorKind,
    /// An owned copy of the pattern that was being parsed when the error was created.
    pattern: String,
    /// Where in the pattern the error was detected.
    pub span: Span,
}
137
138impl std::fmt::Display for ResharpError {
139 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
140 write!(f, "{:?}: {:?}", self.kind, self.span)
141 }
142}
143impl std::error::Error for ResharpError {}
144
/// Result alias used throughout this file; the error type is always [`ResharpError`].
type Result<T> = core::result::Result<T, ResharpError>;
146
/// One frame of the parser's group stack.
#[derive(Clone, Debug)]
enum GroupState {
    /// An opened group that has not been closed yet.
    Group {
        /// The concatenation that was being built when the group was opened.
        concat: Concat,
        /// The group itself; its inner AST is filled in when the group closes.
        group: ast::Group,
        /// The whitespace-insensitive setting to restore when the group closes.
        ignore_whitespace: bool,
    },
    /// Branches of an alternation (`|`) collected so far.
    Alternation(ast::Alternation),
    /// Branches of an intersection (`&`) collected so far.
    Intersection(ast::Intersection),
}
165
166#[derive(Clone, Debug)]
167enum ClassState {
168 Open {
170 union: regex_syntax::ast::ClassSetUnion,
172 set: regex_syntax::ast::ClassBracketed,
176 },
177 Op {
180 kind: regex_syntax::ast::ClassSetBinaryOpKind,
182 lhs: regex_syntax::ast::ClassSet,
184 },
185}
186
/// Parser state for a single pattern.
///
/// Most cursor state lives in `Cell`/`RefCell` so that parsing helpers can
/// take `&self`.
pub struct ResharpParser<'s> {
    /// Cache of already-built perl classes, keyed by `(negated, kind)`.
    perl_classes: Vec<(bool, regex_syntax::ast::ClassPerlKind, NodeId)>,
    /// Cache of the Unicode word/digit/space nodes.
    unicode_classes: resharp_algebra::UnicodeClassCache,
    /// HIR translator configured from the global flags.
    pub translator: regex_syntax::hir::translate::Translator,
    /// The pattern being parsed.
    pub pattern: &'s str,
    /// Current cursor position (byte offset, line, column).
    pos: Cell<Position>,
    /// The most recently assigned capture index.
    capture_index: Cell<u32>,
    // NOTE(review): initialised to `false` in `with_flags`; not read in the
    // visible part of the file — confirm its role.
    octal: bool,
    // NOTE(review): initialised to `false` in `with_flags`; not read in the
    // visible part of the file — confirm its role.
    empty_min_range: bool,
    /// Whether whitespace-insensitive mode is currently active.
    ignore_whitespace: Cell<bool>,
    /// Dot-all mode, seeded from `PatternFlags::dot_matches_new_line`.
    dot_all: Cell<bool>,
    /// `flags.unicode || flags.full_unicode`.
    global_unicode: bool,
    /// Copy of `PatternFlags::full_unicode`.
    global_full_unicode: bool,
    /// Copy of `PatternFlags::case_insensitive`.
    global_case_insensitive: bool,
    /// Comments collected while skipping whitespace in whitespace-insensitive mode.
    comments: RefCell<Vec<ast::Comment>>,
    /// Stack of open groups and pending alternations/intersections.
    stack_group: RefCell<Vec<GroupState>>,
    /// Stack of open character classes and pending set operators.
    stack_class: RefCell<Vec<ClassState>>,
    /// Capture names seen so far, kept sorted by name.
    capture_names: RefCell<Vec<ast::CaptureName>>,
    // NOTE(review): presumably a reusable string buffer; not used in the
    // visible part of the file — confirm.
    scratch: RefCell<String>,
}
208
209fn specialize_err<T>(result: Result<T>, from: ast::ErrorKind, to: ast::ErrorKind) -> Result<T> {
210 result.map_err(|e| {
211 if e.kind == from {
212 ResharpError {
213 kind: to,
214 pattern: e.pattern,
215 span: e.span,
216 }
217 } else {
218 e
219 }
220 })
221}
222
/// Returns `true` if `c` may appear in a capture group name.
///
/// The first character must be `_` or alphabetic; later characters may also
/// be `.`, `[`, `]` or any alphanumeric character.
fn is_capture_char(c: char, first: bool) -> bool {
    match c {
        '_' => true,
        '.' | '[' | ']' => !first,
        _ if first => c.is_alphabetic(),
        _ => c.is_alphanumeric(),
    }
}
230
/// Returns `true` if `c` has special meaning in a pattern and therefore must
/// be escaped to match literally.
pub fn is_meta_character(c: char) -> bool {
    // Every character that `escape` prefixes with a backslash.
    const META: &str = "\\.+*?()|[]{}^$#&-~_";
    META.contains(c)
}
254
255pub fn escape(text: &str) -> String {
257 let mut buf = String::new();
258 escape_into(text, &mut buf);
259 buf
260}
261
262pub fn escape_into(text: &str, buf: &mut String) {
264 buf.reserve(text.len());
265 for c in text.chars() {
266 if is_meta_character(c) {
267 buf.push('\\');
268 }
269 buf.push(c);
270 }
271}
272
273pub fn is_escapeable_character(c: char) -> bool {
274 if is_meta_character(c) {
276 return true;
277 }
278 if !c.is_ascii() {
281 return false;
282 }
283 match c {
288 '0'..='9' | 'A'..='Z' | 'a'..='z' => false,
289 '<' | '>' => false,
299 _ => true,
300 }
301}
302
/// Returns `true` if `c` is an ASCII hexadecimal digit (`0-9`, `a-f`, `A-F`).
fn is_hex(c: char) -> bool {
    c.is_ascii_hexdigit()
}
306
307impl<'s> ResharpParser<'s> {
308 fn default_translator_builder(&self) -> TranslatorBuilder {
309 let mut trb = TranslatorBuilder::new();
310 trb.unicode(self.global_unicode);
311 trb.utf8(false);
312 trb.case_insensitive(self.global_case_insensitive);
313 trb
314 }
315
316 pub fn new(pattern: &'s str) -> Self {
317 Self::with_flags(pattern, &PatternFlags::default())
318 }
319
320 pub fn with_flags(pattern: &'s str, flags: &PatternFlags) -> Self {
321 let mut trb = TranslatorBuilder::new();
322 trb.unicode(flags.unicode);
323 trb.utf8(false);
324 trb.case_insensitive(flags.case_insensitive);
325 Self {
326 translator: trb.build(),
327 pattern,
328 perl_classes: vec![],
329 unicode_classes: resharp_algebra::UnicodeClassCache::default(),
330 pos: Cell::new(Position::new(0, 0, 0)),
331 capture_index: Cell::new(0),
332 octal: false,
333 empty_min_range: false,
334 ignore_whitespace: Cell::new(flags.ignore_whitespace),
335 dot_all: Cell::new(flags.dot_matches_new_line),
336 global_unicode: flags.unicode || flags.full_unicode,
337 global_full_unicode: flags.full_unicode,
338 global_case_insensitive: flags.case_insensitive,
339 comments: RefCell::new(vec![]),
340 stack_group: RefCell::new(vec![]),
341 stack_class: RefCell::new(vec![]),
342 capture_names: RefCell::new(vec![]),
343 scratch: RefCell::new(String::new()),
344 }
345 }
346
/// Returns a reference to the parser itself.
fn parser(&'_ self) -> &'_ ResharpParser<'_> {
    self
}
351
/// Returns the pattern being parsed.
fn pattern(&self) -> &str {
    self.pattern
}
356
357 fn error(&self, span: Span, kind: ast::ErrorKind) -> ResharpError {
359 ResharpError {
360 kind,
361 pattern: self.pattern().to_string(),
362 span,
363 }
364 }
365
366 fn unsupported_error(&self, _: regex_syntax::hir::Error) -> ResharpError {
367 self.error(
368 Span::splat(self.pos()),
369 ast::ErrorKind::UnsupportedResharpRegex,
370 )
371 }
372
373 fn offset(&self) -> usize {
378 self.parser().pos.get().offset
379 }
380
381 fn line(&self) -> usize {
385 self.parser().pos.get().line
386 }
387
388 fn column(&self) -> usize {
392 self.parser().pos.get().column
393 }
394
395 fn next_capture_index(&self, span: Span) -> Result<u32> {
403 let current = self.parser().capture_index.get();
404 let i = current
405 .checked_add(1)
406 .ok_or_else(|| self.error(span, ast::ErrorKind::CaptureLimitExceeded))?;
407 self.parser().capture_index.set(i);
408 Ok(i)
409 }
410
411 fn add_capture_name(&self, cap: &ast::CaptureName) -> Result<()> {
412 let mut names = self.parser().capture_names.borrow_mut();
413 match names.binary_search_by_key(&cap.name.as_str(), |c| c.name.as_str()) {
414 Err(i) => {
415 names.insert(i, cap.clone());
416 Ok(())
417 }
418 Ok(i) => Err(self.error(
419 cap.span,
420 ast::ErrorKind::GroupNameDuplicate {
421 original: names[i].span,
422 },
423 )),
424 }
425 }
426
/// Returns whether whitespace-insensitive mode is currently active.
fn ignore_whitespace(&self) -> bool {
    self.parser().ignore_whitespace.get()
}
430
431 fn char(&self) -> char {
432 self.char_at(self.offset())
433 }
434
435 fn char_at(&self, i: usize) -> char {
436 self.pattern()[i..]
437 .chars()
438 .next()
439 .unwrap_or_else(|| panic!("expected char at offset {}", i))
440 }
441
442 fn bump(&self) -> bool {
443 if self.is_eof() {
444 return false;
445 }
446 let Position {
447 mut offset,
448 mut line,
449 mut column,
450 } = self.pos();
451 if self.char() == '\n' {
452 line = line.checked_add(1).unwrap();
453 column = 1;
454 } else {
455 column = column.checked_add(1).unwrap();
456 }
457 offset += self.char().len_utf8();
458 self.parser().pos.set(Position {
459 offset,
460 line,
461 column,
462 });
463 self.pattern()[self.offset()..].chars().next().is_some()
464 }
465
466 fn bump_if(&self, prefix: &str) -> bool {
467 if self.pattern()[self.offset()..].starts_with(prefix) {
468 for _ in 0..prefix.chars().count() {
469 self.bump();
470 }
471 true
472 } else {
473 false
474 }
475 }
476
477 fn is_lookaround_prefix(&self) -> Option<(bool, bool)> {
478 if self.bump_if("?=") {
479 return Some((true, true));
480 }
481 if self.bump_if("?!") {
482 return Some((true, false));
483 }
484 if self.bump_if("?<=") {
485 return Some((false, true));
486 }
487 if self.bump_if("?<!") {
488 return Some((false, false));
489 }
490 None
491 }
492
493 fn bump_and_bump_space(&self) -> bool {
494 if !self.bump() {
495 return false;
496 }
497 self.bump_space();
498 !self.is_eof()
499 }
500
501 fn bump_space(&self) {
502 if !self.ignore_whitespace() {
503 return;
504 }
505 while !self.is_eof() {
506 if self.char().is_whitespace() {
507 self.bump();
508 } else if self.char() == '#' {
509 let start = self.pos();
510 let mut comment_text = String::new();
511 self.bump();
512 while !self.is_eof() {
513 let c = self.char();
514 self.bump();
515 if c == '\n' {
516 break;
517 }
518 comment_text.push(c);
519 }
520 let comment = ast::Comment {
521 span: Span::new(start, self.pos()),
522 comment: comment_text,
523 };
524 self.parser().comments.borrow_mut().push(comment);
525 } else {
526 break;
527 }
528 }
529 }
530
531 fn peek(&self) -> Option<char> {
535 if self.is_eof() {
536 return None;
537 }
538 self.pattern()[self.offset() + self.char().len_utf8()..]
539 .chars()
540 .next()
541 }
542
543 fn peek_space(&self) -> Option<char> {
546 if !self.ignore_whitespace() {
547 return self.peek();
548 }
549 if self.is_eof() {
550 return None;
551 }
552 let mut start = self.offset() + self.char().len_utf8();
553 let mut in_comment = false;
554 for (i, c) in self.pattern()[start..].char_indices() {
555 if c.is_whitespace() {
556 continue;
557 } else if !in_comment && c == '#' {
558 in_comment = true;
559 } else if in_comment && c == '\n' {
560 in_comment = false;
561 } else {
562 start += i;
563 break;
564 }
565 }
566 self.pattern()[start..].chars().next()
567 }
568
/// Returns `true` when the current offset equals the pattern length, i.e.
/// there is no more input.
fn is_eof(&self) -> bool {
    self.offset() == self.pattern().len()
}
573
/// Returns the current position (offset, line, column) of the parser.
fn pos(&self) -> Position {
    self.parser().pos.get()
}
579
580 fn span(&self) -> Span {
583 Span::splat(self.pos())
584 }
585
586 fn span_char(&self) -> Span {
588 let mut next = Position {
589 offset: self.offset().checked_add(self.char().len_utf8()).unwrap(),
590 line: self.line(),
591 column: self.column().checked_add(1).unwrap(),
592 };
593 if self.char() == '\n' {
594 next.line += 1;
595 next.column = 1;
596 }
597 Span::new(self.pos(), next)
598 }
599
600 #[inline(never)]
610 fn push_alternate(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
611 assert_eq!(self.char(), '|');
612 concat.span.end = self.pos();
613 self.push_or_add_alternation(concat);
614 self.bump();
615 Ok(ast::Concat {
616 span: self.span(),
617 asts: vec![],
618 })
619 }
620
621 fn push_or_add_alternation(&self, concat: Concat) {
624 use self::GroupState::*;
625
626 let mut stack = self.parser().stack_group.borrow_mut();
627 if let Some(&mut Alternation(ref mut alts)) = stack.last_mut() {
628 alts.asts.push(concat.into_ast());
629 return;
630 }
631 stack.push(Alternation(ast::Alternation {
632 span: Span::new(concat.span.start, self.pos()),
633 asts: vec![concat.into_ast()],
634 }));
635 }
636
637 #[inline(never)]
638 fn push_intersect(&self, mut concat: Concat) -> Result<Concat> {
639 assert_eq!(self.char(), '&');
640 concat.span.end = self.pos();
641 self.push_or_add_intersect(concat);
642 self.bump();
643 Ok(Concat {
644 span: self.span(),
645 asts: vec![],
646 })
647 }
648
649 fn push_or_add_intersect(&self, concat: Concat) {
652 use self::GroupState::*;
653
654 let mut stack = self.parser().stack_group.borrow_mut();
655 if let Some(&mut Intersection(ref mut alts)) = stack.last_mut() {
656 alts.asts.push(concat.into_ast());
657 return;
658 }
659 stack.push(Intersection(ast::Intersection {
660 span: Span::new(concat.span.start, self.pos()),
661 asts: vec![concat.into_ast()],
662 }));
663 }
664
665 #[inline(never)]
679 fn push_group(&self, mut concat: Concat) -> Result<Concat> {
680 assert_eq!(self.char(), '(');
681 match self.parse_group()? {
682 Either::Left(set) => {
683 let ignore = set.flags.flag_state(ast::Flag::IgnoreWhitespace);
684 if let Some(v) = ignore {
685 self.parser().ignore_whitespace.set(v);
686 }
687
688 concat.asts.push(Ast::flags(set));
689 Ok(concat)
690 }
691 Either::Right(group) => {
692 let old_ignore_whitespace = self.ignore_whitespace();
693 let new_ignore_whitespace = group
694 .flags()
695 .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
696 .unwrap_or(old_ignore_whitespace);
697 self.parser()
698 .stack_group
699 .borrow_mut()
700 .push(GroupState::Group {
701 concat,
702 group,
703 ignore_whitespace: old_ignore_whitespace,
704 });
705 self.parser().ignore_whitespace.set(new_ignore_whitespace);
706 Ok(Concat {
707 span: self.span(),
708 asts: vec![],
709 })
710 }
711 }
712 }
713
714 #[inline(never)]
715 fn push_compl_group(&self, concat: Concat) -> Result<Concat> {
716 assert_eq!(self.char(), '~');
717 self.bump();
718 if self.is_eof() || self.char() != '(' {
719 return Err(self.error(self.span(), ast::ErrorKind::ComplementGroupExpected));
720 }
721 let open_span = self.span_char();
722 self.bump();
723 let group = ast::Group {
724 span: open_span,
725 kind: ast::GroupKind::Complement,
726 ast: Box::new(Ast::empty(self.span())),
727 };
728
729 let old_ignore_whitespace = self.ignore_whitespace();
730 let new_ignore_whitespace = group
731 .flags()
732 .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
733 .unwrap_or(old_ignore_whitespace);
734 self.parser()
735 .stack_group
736 .borrow_mut()
737 .push(GroupState::Group {
738 concat,
739 group,
740 ignore_whitespace: old_ignore_whitespace,
741 });
742 self.parser().ignore_whitespace.set(new_ignore_whitespace);
743 Ok(Concat {
744 span: self.span(),
745 asts: vec![],
746 })
747 }
748
749 #[inline(never)]
759 fn pop_group(&self, mut group_concat: Concat) -> Result<Concat> {
760 use self::GroupState::*;
761 assert_eq!(self.char(), ')');
762 let mut stack = self.parser().stack_group.borrow_mut();
763 let topstack = stack.pop();
764
765 let (mut prior_concat, mut group, ignore_whitespace, alt) = match topstack {
766 Some(Group {
767 concat,
768 group,
769 ignore_whitespace,
770 }) => (concat, group, ignore_whitespace, None),
771 Some(Alternation(alt)) => match stack.pop() {
772 Some(Group {
773 concat,
774 group,
775 ignore_whitespace,
776 }) => (
777 concat,
778 group,
779 ignore_whitespace,
780 Some(Either::Left::<ast::Alternation, ast::Intersection>(alt)),
781 ),
782 None | Some(Alternation(_)) | Some(Intersection(_)) => {
783 return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
784 }
785 },
786 Some(Intersection(int)) => match stack.pop() {
787 Some(Group {
788 concat,
789 group,
790 ignore_whitespace,
791 }) => (
792 concat,
793 group,
794 ignore_whitespace,
795 Some(Either::Right::<ast::Alternation, ast::Intersection>(int)),
796 ),
797 None | Some(Alternation(_)) | Some(Intersection(_)) => {
798 return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
799 }
800 },
801
802 None => {
803 return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
804 }
805 };
806 self.parser().ignore_whitespace.set(ignore_whitespace);
807 group_concat.span.end = self.pos();
808 self.bump();
809 group.span.end = self.pos();
810 match alt {
811 Some(Either::Left(mut alt)) => {
812 alt.span.end = group_concat.span.end;
813 alt.asts.push(group_concat.into_ast());
814 group.ast = Box::new(alt.into_ast());
815 }
816 Some(Either::Right(mut int)) => {
817 int.span.end = group_concat.span.end;
818 int.asts.push(group_concat.into_ast());
819 group.ast = Box::new(int.into_ast());
820 }
821 None => {
822 group.ast = Box::new(group_concat.into_ast());
823 }
824 }
825
826 if group.kind == GroupKind::Complement {
827 let complement = ast::Complement {
828 span: self.span(),
829 ast: group.ast,
830 };
831 prior_concat.asts.push(Ast::complement(complement));
832 }
833 else {
835 prior_concat.asts.push(Ast::group(group));
836 }
837 Ok(prior_concat)
838 }
839
840 #[inline(never)]
847 fn pop_group_end(&self, mut concat: ast::Concat) -> Result<Ast> {
848 concat.span.end = self.pos();
849 let mut stack = self.parser().stack_group.borrow_mut();
850 let ast = match stack.pop() {
851 None => Ok(concat.into_ast()),
852 Some(GroupState::Alternation(mut alt)) => {
853 alt.span.end = self.pos();
854 alt.asts.push(concat.into_ast());
855 Ok(Ast::alternation(alt))
856 }
857 Some(GroupState::Intersection(mut int)) => {
858 int.span.end = self.pos();
859 int.asts.push(concat.into_ast());
860
861 Ok(Ast::intersection(int))
862 }
863 Some(GroupState::Group { group, .. }) => {
864 return Err(self.error(group.span, ast::ErrorKind::GroupUnclosed));
865 }
866 };
867 match stack.pop() {
869 None => ast,
870 Some(GroupState::Alternation(_)) => {
871 unreachable!()
878 }
879 Some(GroupState::Intersection(_)) => {
880 unreachable!()
881 }
882 Some(GroupState::Group { group, .. }) => {
883 Err(self.error(group.span, ast::ErrorKind::GroupUnclosed))
884 }
885 }
886 }
887
888 #[inline(never)]
897 fn push_class_open(
898 &self,
899 parent_union: regex_syntax::ast::ClassSetUnion,
900 ) -> Result<regex_syntax::ast::ClassSetUnion> {
901 assert_eq!(self.char(), '[');
902
903 let (nested_set, nested_union) = self.parse_set_class_open()?;
904 self.parser()
905 .stack_class
906 .borrow_mut()
907 .push(ClassState::Open {
908 union: parent_union,
909 set: nested_set,
910 });
911 Ok(nested_union)
912 }
913
914 #[inline(never)]
929 fn pop_class(
930 &self,
931 nested_union: regex_syntax::ast::ClassSetUnion,
932 ) -> Result<Either<regex_syntax::ast::ClassSetUnion, regex_syntax::ast::ClassBracketed>> {
933 assert_eq!(self.char(), ']');
934
935 let item = regex_syntax::ast::ClassSet::Item(nested_union.into_item());
936 let prevset = self.pop_class_op(item);
937 let mut stack = self.parser().stack_class.borrow_mut();
938 match stack.pop() {
939 None => {
940 panic!("unexpected empty character class stack")
949 }
950 Some(ClassState::Op { .. }) => {
951 panic!("unexpected ClassState::Op")
958 }
959 Some(ClassState::Open { mut union, mut set }) => {
960 self.bump();
961 set.span.end = self.pos();
962 set.kind = prevset;
963 if stack.is_empty() {
964 Ok(Either::Right(set))
965 } else {
966 union.push(regex_syntax::ast::ClassSetItem::Bracketed(Box::new(set)));
967 Ok(Either::Left(union))
968 }
969 }
970 }
971 }
972
973 #[inline(never)]
978 fn unclosed_class_error(&self) -> ResharpError {
979 for state in self.parser().stack_class.borrow().iter().rev() {
980 if let ClassState::Open { ref set, .. } = *state {
981 return self.error(set.span, ast::ErrorKind::ClassUnclosed);
982 }
983 }
984 panic!("no open character class found")
987 }
988
989 #[inline(never)]
995 fn push_class_op(
996 &self,
997 next_kind: regex_syntax::ast::ClassSetBinaryOpKind,
998 next_union: regex_syntax::ast::ClassSetUnion,
999 ) -> regex_syntax::ast::ClassSetUnion {
1000 let item = regex_syntax::ast::ClassSet::Item(next_union.into_item());
1001 let new_lhs = self.pop_class_op(item);
1002 self.parser().stack_class.borrow_mut().push(ClassState::Op {
1003 kind: next_kind,
1004 lhs: new_lhs,
1005 });
1006 regex_syntax::ast::ClassSetUnion {
1007 span: self.span(),
1008 items: vec![],
1009 }
1010 }
1011
1012 #[inline(never)]
1018 fn pop_class_op(&self, rhs: regex_syntax::ast::ClassSet) -> regex_syntax::ast::ClassSet {
1019 let mut stack = self.parser().stack_class.borrow_mut();
1020 let (kind, lhs) = match stack.pop() {
1021 Some(ClassState::Op { kind, lhs }) => (kind, lhs),
1022 Some(state @ ClassState::Open { .. }) => {
1023 stack.push(state);
1024 return rhs;
1025 }
1026 None => unreachable!(),
1027 };
1028 let span = Span::new(lhs.span().start, rhs.span().end);
1029 regex_syntax::ast::ClassSet::BinaryOp(regex_syntax::ast::ClassSetBinaryOp {
1030 span,
1031 kind,
1032 lhs: Box::new(lhs),
1033 rhs: Box::new(rhs),
1034 })
1035 }
1036
1037 fn hir_to_node_id(&self, hir: &hir::Hir, tb: &mut TB<'s>) -> Result<NodeId> {
1038 match hir.kind() {
1039 hir::HirKind::Empty => Ok(NodeId::EPS),
1040 hir::HirKind::Literal(l) => {
1041 if l.0.len() == 1 {
1042 let node = tb.mk_u8(l.0[0]);
1043 Ok(node)
1044 } else {
1045 let ws: Vec<_> = l.0.iter().map(|l| tb.mk_u8(*l)).collect();
1046 let conc = tb.mk_concats(ws.iter().copied());
1047 Ok(conc)
1048 }
1049 }
1050 hir::HirKind::Class(class) => match class {
1051 hir::Class::Unicode(class_unicode) => {
1052 let ranges = class_unicode.ranges();
1053 let mut nodes = Vec::new();
1054 for range in ranges {
1055 for seq in Utf8Sequences::new(range.start(), range.end()) {
1056 let sl = seq.as_slice();
1057 let bytes: Vec<_> = sl.iter().map(|s| (s.start, s.end)).collect();
1058 let node = match bytes.len() {
1059 1 => tb.mk_range_u8(bytes[0].0, bytes[0].1),
1060 n => {
1061 let last = tb.mk_range_u8(bytes[n - 1].0, bytes[n - 1].1);
1062 let mut conc = last;
1063 for i in (0..n - 1).rev() {
1064 let b = tb.mk_range_u8(bytes[i].0, bytes[i].1);
1065 conc = tb.mk_concat(b, conc);
1066 }
1067 conc
1068 }
1069 };
1070 nodes.push(node);
1071 }
1072 }
1073 let merged = tb.mk_unions(nodes.into_iter());
1074 Ok(merged)
1075 }
1076 hir::Class::Bytes(class_bytes) => {
1077 let ranges = class_bytes.ranges();
1078 let mut result = NodeId::BOT;
1079 for range in ranges {
1080 let start = range.start();
1081 let end = range.end();
1082 let node = tb.mk_range_u8(start, end);
1083 result = tb.mk_union(result, node);
1084 }
1085 Ok(result)
1086 }
1087 },
1088 hir::HirKind::Look(_) => Err(self.error(
1089 Span::splat(self.pos()),
1090 ast::ErrorKind::UnsupportedResharpRegex,
1091 )),
1092 hir::HirKind::Repetition(_) => Err(self.error(
1093 Span::splat(self.pos()),
1094 ast::ErrorKind::UnsupportedResharpRegex,
1095 )),
1096 hir::HirKind::Capture(_) => Err(self.error(
1097 Span::splat(self.pos()),
1098 ast::ErrorKind::UnsupportedResharpRegex,
1099 )),
1100 hir::HirKind::Concat(body) => {
1101 let mut result = NodeId::EPS;
1102 for child in body {
1103 let node = self.hir_to_node_id(child, tb)?;
1104 result = tb.mk_concat(result, node);
1105 }
1106 Ok(result)
1107 }
1108 hir::HirKind::Alternation(_) => Err(self.error(
1109 Span::splat(self.pos()),
1110 ast::ErrorKind::UnsupportedResharpRegex,
1111 )),
1112 }
1113 }
1114
1115 fn translate_ast_to_hir(
1116 &mut self,
1117 orig_ast: ®ex_syntax::ast::Ast,
1118 tb: &mut TB<'s>,
1119 ) -> Result<NodeId> {
1120 match self.translator.translate("", orig_ast) {
1121 Err(_) => Err(self.error(self.span(), ast::ErrorKind::UnicodeClassInvalid)),
1122 Ok(hir) => self.hir_to_node_id(&hir, tb),
1123 }
1124 }
1125
1126 fn translator_to_node_id(
1127 &mut self,
1128 orig_ast: ®ex_syntax::ast::Ast,
1129 translator: &mut Option<Translator>,
1130 tb: &mut TB<'s>,
1131 ) -> Result<NodeId> {
1132 match translator {
1133 Some(tr) => {
1134 let hir = tr
1135 .translate("", orig_ast)
1136 .map_err(|e| self.unsupported_error(e))?;
1137 self.hir_to_node_id(&hir, tb)
1138 }
1139 None => self.translate_ast_to_hir(orig_ast, tb),
1140 }
1141 }
1142
1143 fn get_class(
1144 &mut self,
1145 negated: bool,
1146 kind: regex_syntax::ast::ClassPerlKind,
1147 tb: &mut TB<'s>,
1148 ) -> Result<NodeId> {
1149 let w = self
1150 .perl_classes
1151 .iter()
1152 .find(|(c_neg, c_kind, _)| *c_kind == kind && *c_neg == negated);
1153 match w {
1154 Some((_, _, value)) => Ok(*value),
1155 None => {
1156 let translated = if self.global_unicode {
1157 match kind {
1158 regex_syntax::ast::ClassPerlKind::Word => {
1159 if self.global_full_unicode {
1160 self.unicode_classes.ensure_word_full(tb);
1161 } else {
1162 self.unicode_classes.ensure_word(tb);
1163 }
1164 if negated {
1165 self.unicode_classes.non_word
1166 } else {
1167 self.unicode_classes.word
1168 }
1169 }
1170 regex_syntax::ast::ClassPerlKind::Digit => {
1171 if self.global_full_unicode {
1172 self.unicode_classes.ensure_digit_full(tb);
1173 } else {
1174 self.unicode_classes.ensure_digit(tb);
1175 }
1176 if negated {
1177 self.unicode_classes.non_digit
1178 } else {
1179 self.unicode_classes.digit
1180 }
1181 }
1182 regex_syntax::ast::ClassPerlKind::Space => {
1183 self.unicode_classes.ensure_space(tb);
1184 if negated {
1185 self.unicode_classes.non_space
1186 } else {
1187 self.unicode_classes.space
1188 }
1189 }
1190 }
1191 } else {
1192 let pos = match kind {
1193 regex_syntax::ast::ClassPerlKind::Word => {
1194 let az = tb.mk_range_u8(b'a', b'z');
1195 let big = tb.mk_range_u8(b'A', b'Z');
1196 let dig = tb.mk_range_u8(b'0', b'9');
1197 let us = tb.mk_u8(b'_');
1198 tb.mk_unions([az, big, dig, us].into_iter())
1199 }
1200 regex_syntax::ast::ClassPerlKind::Digit => tb.mk_range_u8(b'0', b'9'),
1201 regex_syntax::ast::ClassPerlKind::Space => {
1202 let sp = tb.mk_u8(b' ');
1203 let tab = tb.mk_u8(b'\t');
1204 let nl = tb.mk_u8(b'\n');
1205 let cr = tb.mk_u8(b'\r');
1206 let ff = tb.mk_u8(0x0C);
1207 let vt = tb.mk_u8(0x0B);
1208 tb.mk_unions([sp, tab, nl, cr, ff, vt].into_iter())
1209 }
1210 };
1211 if negated {
1212 tb.mk_compl(pos)
1213 } else {
1214 pos
1215 }
1216 };
1217 self.perl_classes.push((negated, kind, translated));
1218 Ok(translated)
1219 }
1220 }
1221 }
1222
1223 fn word_char_kind(ast: &Ast, left: bool) -> WordCharKind {
1224 use WordCharKind::*;
1225 match ast {
1226 Ast::Literal(lit) => {
1227 if is_word_byte(lit.c as u8) {
1228 Word
1229 } else {
1230 NonWord
1231 }
1232 }
1233 Ast::ClassPerl(c) => match (&c.kind, c.negated) {
1234 (®ex_syntax::ast::ClassPerlKind::Word, false) => Word,
1235 (®ex_syntax::ast::ClassPerlKind::Word, true) => NonWord,
1236 (®ex_syntax::ast::ClassPerlKind::Space, false) => NonWord,
1237 (®ex_syntax::ast::ClassPerlKind::Space, true) => Unknown,
1238 (®ex_syntax::ast::ClassPerlKind::Digit, false) => Word,
1239 (®ex_syntax::ast::ClassPerlKind::Digit, true) => Unknown,
1240 },
1241 Ast::Dot(_) | Ast::Top(_) => Unknown,
1242 Ast::Group(g) => Self::word_char_kind(&g.ast, left),
1243 Ast::Concat(c) if !c.asts.is_empty() => {
1244 let edge = if left { c.asts.len() - 1 } else { 0 };
1245 let kind = Self::word_char_kind(&c.asts[edge], left);
1246 match kind {
1247 MaybeWord => {
1248 let dir: isize = if left { -1 } else { 1 };
1249 match Self::concat_neighbor_kind(&c.asts, edge, dir) {
1250 Word => Word,
1251 _ => MaybeWord,
1252 }
1253 }
1254 MaybeNonWord => {
1255 let dir: isize = if left { -1 } else { 1 };
1256 match Self::concat_neighbor_kind(&c.asts, edge, dir) {
1257 NonWord => NonWord,
1258 _ => MaybeNonWord,
1259 }
1260 }
1261 other => other,
1262 }
1263 }
1264 Ast::Alternation(alt) if !alt.asts.is_empty() => {
1265 let first = Self::word_char_kind(&alt.asts[0], left);
1266 if alt.asts[1..]
1267 .iter()
1268 .all(|a| Self::word_char_kind(a, left) == first)
1269 {
1270 first
1271 } else {
1272 Unknown
1273 }
1274 }
1275 Ast::Repetition(r) => {
1276 let inner = Self::word_char_kind(&r.ast, left);
1277 let nullable = matches!(
1278 &r.op.kind,
1279 ast::RepetitionKind::ZeroOrMore
1280 | ast::RepetitionKind::ZeroOrOne
1281 | ast::RepetitionKind::Range(ast::RepetitionRange::Bounded(0, _))
1282 );
1283 if nullable {
1284 match inner {
1285 Word => MaybeWord,
1286 NonWord => MaybeNonWord,
1287 _ => Unknown,
1288 }
1289 } else {
1290 inner
1291 }
1292 }
1293 Ast::Lookaround(la) => Self::word_char_kind(&la.ast, left),
1294 _ => Unknown,
1295 }
1296 }
1297
1298 fn edge_class_ast(ast: &Ast, left: bool) -> Option<&Ast> {
1300 match ast {
1301 Ast::Literal(_)
1302 | Ast::ClassPerl(_)
1303 | Ast::ClassBracketed(_)
1304 | Ast::ClassUnicode(_)
1305 | Ast::Dot(_)
1306 | Ast::Top(_) => Some(ast),
1307 Ast::Group(g) => Self::edge_class_ast(&g.ast, left),
1308 Ast::Concat(c) if !c.asts.is_empty() => {
1309 Self::edge_class_ast(&c.asts[if left { c.asts.len() - 1 } else { 0 }], left)
1310 }
1311 Ast::Repetition(r) => {
1312 let nullable = matches!(
1313 &r.op.kind,
1314 ast::RepetitionKind::ZeroOrMore
1315 | ast::RepetitionKind::ZeroOrOne
1316 | ast::RepetitionKind::Range(ast::RepetitionRange::Bounded(0, _))
1317 );
1318 if nullable {
1319 None
1320 } else {
1321 Self::edge_class_ast(&r.ast, left)
1322 }
1323 }
1324 _ => None,
1325 }
1326 }
1327
1328 fn resolve_word_kind(
1329 &mut self,
1330 asts: &[Ast],
1331 idx: usize,
1332 dir: isize,
1333 translator: &mut Option<Translator>,
1334 tb: &mut TB<'s>,
1335 word_id: NodeId,
1336 not_word_id: NodeId,
1337 ) -> Result<WordCharKind> {
1338 use WordCharKind::*;
1339 let fast = Self::concat_neighbor_kind(asts, idx, dir);
1340 if fast != Unknown {
1341 return Ok(fast);
1342 }
1343 let neighbor_idx = (idx as isize + dir) as usize;
1344 let node = if let Some(edge) = Self::edge_class_ast(&asts[neighbor_idx], dir < 0) {
1345 self.ast_to_node_id(edge, translator, tb)?
1346 } else {
1347 let neighbor_node = self.ast_to_node_id(&asts[neighbor_idx], translator, tb)?;
1349 let mut neighbor_node = tb
1350 .try_elim_lookarounds(neighbor_node)
1351 .ok_or_else(|| self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))?;
1352 if dir < 0 {
1353 neighbor_node = tb.reverse(neighbor_node).or_else(|_| {
1354 Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))
1355 })?;
1356 }
1357 let word_prefix = if dir > 0 {
1358 tb.mk_concat(word_id, NodeId::TS)
1359 } else {
1360 tb.mk_concat(NodeId::TS, word_id)
1361 };
1362 let non_word_prefix = if dir > 0 {
1363 tb.mk_concat(not_word_id, NodeId::TS)
1364 } else {
1365 tb.mk_concat(NodeId::TS, not_word_id)
1366 };
1367 return if tb.subsumes(word_prefix, neighbor_node) == Some(true) {
1368 Ok(Word)
1369 } else if tb.subsumes(non_word_prefix, neighbor_node) == Some(true) {
1370 Ok(NonWord)
1371 } else {
1372 Ok(Unknown)
1373 };
1374 };
1375 if tb.subsumes(word_id, node) == Some(true) {
1376 Ok(Word)
1377 } else if tb.subsumes(not_word_id, node) == Some(true) {
1378 Ok(NonWord)
1379 } else {
1380 Ok(Unknown)
1381 }
1382 }
1383
1384 fn concat_neighbor_kind(asts: &[Ast], idx: usize, dir: isize) -> WordCharKind {
1385 use WordCharKind::*;
1386 let next = idx as isize + dir;
1387 if next < 0 || next >= asts.len() as isize {
1388 return Edge;
1389 }
1390 let kind = Self::word_char_kind(&asts[next as usize], dir < 0);
1391 match kind {
1392 MaybeWord => match Self::concat_neighbor_kind(asts, next as usize, dir) {
1393 Word => Word,
1394 _ => Unknown,
1395 },
1396 MaybeNonWord => match Self::concat_neighbor_kind(asts, next as usize, dir) {
1397 NonWord => NonWord,
1398 _ => Unknown,
1399 },
1400 other => other,
1401 }
1402 }
1403
1404 fn rewrite_word_boundary_in_concat(
1405 &mut self,
1406 asts: &[Ast],
1407 idx: usize,
1408 translator: &mut Option<Translator>,
1409 tb: &mut TB<'s>,
1410 ) -> Result<(NodeId, usize)> {
1411 use WordCharKind::*;
1412 let (word_id, not_word_id) = if self.global_full_unicode {
1413 self.unicode_classes.ensure_word_full(tb);
1414 (self.unicode_classes.word, self.unicode_classes.non_word)
1415 } else if self.global_unicode {
1416 self.unicode_classes.ensure_word(tb);
1417 (self.unicode_classes.word, self.unicode_classes.non_word)
1418 } else {
1419 let az = tb.mk_range_u8(b'a', b'z');
1420 let big = tb.mk_range_u8(b'A', b'Z');
1421 let dig = tb.mk_range_u8(b'0', b'9');
1422 let us = tb.mk_u8(b'_');
1423 let w = tb.mk_unions([az, big, dig, us].into_iter());
1424 (w, tb.mk_compl(w))
1425 };
1426 let left = self.resolve_word_kind(asts, idx, -1, translator, tb, word_id, not_word_id)?;
1427 let right = self.resolve_word_kind(asts, idx, 1, translator, tb, word_id, not_word_id)?;
1428 match (left, right) {
1429 (NonWord, Word) | (Word, NonWord) => Ok((NodeId::EPS, idx + 1)),
1430 (Word, _) => {
1431 let neg = tb.mk_neg_lookahead(word_id, 0);
1432 Ok((neg, idx + 1))
1433 }
1434 (NonWord, _) => {
1435 let set = tb.mk_union(NodeId::END, word_id);
1436 let tail = tb.mk_concat(set, NodeId::TS);
1437 self.merge_boundary_with_following_lookaheads(asts, idx, tail, translator, tb)
1438 }
1439 (_, Word) => Ok((tb.mk_neg_lookbehind(word_id), idx + 1)),
1440 (_, NonWord) => {
1441 let body = tb.mk_union(NodeId::BEGIN, word_id);
1442 Ok((tb.mk_lookbehind(body, NodeId::MISSING), idx + 1))
1443 }
1444 _ => Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex)),
1448 }
1449 }
1450
1451 fn merge_boundary_with_following_lookaheads(
1452 &mut self,
1453 asts: &[Ast],
1454 wb_idx: usize,
1455 boundary_tail: NodeId,
1456 translator: &mut Option<Translator>,
1457 tb: &mut TB<'s>,
1458 ) -> Result<(NodeId, usize)> {
1459 let mut next = wb_idx + 1;
1460 let mut la_bodies = vec![boundary_tail];
1461 while next < asts.len() {
1462 match &asts[next] {
1463 Ast::Lookaround(la) if la.kind == ast::LookaroundKind::PositiveLookahead => {
1464 let body = self.ast_to_node_id(&la.ast, translator, tb)?;
1465 la_bodies.push(tb.mk_concat(body, NodeId::TS));
1466 next += 1;
1467 }
1468 _ => break,
1469 }
1470 }
1471 let merged = tb.mk_inters(la_bodies.into_iter());
1472 Ok((tb.mk_lookahead(merged, NodeId::MISSING, 0), next))
1473 }
1474
1475 fn ast_to_node_id(
1476 &mut self,
1477 ast: &Ast,
1478 translator: &mut Option<Translator>,
1479 tb: &mut TB<'s>,
1480 ) -> Result<NodeId> {
1481 match ast {
1482 Ast::Empty(_) => Ok(NodeId::EPS),
1483 Ast::Flags(f) => {
1484 let mut translator_builder = self.default_translator_builder();
1485 if let Some(state) = f.flags.flag_state(ast::Flag::CaseInsensitive) {
1486 translator_builder.case_insensitive(state);
1487 }
1488 if let Some(state) = f.flags.flag_state(ast::Flag::Unicode) {
1489 translator_builder.unicode(state);
1490 }
1491 if let Some(state) = f.flags.flag_state(ast::Flag::DotMatchesNewLine) {
1492 self.dot_all.set(state);
1493 }
1494 let concat_translator = Some(translator_builder.build());
1495 *translator = concat_translator;
1496 Ok(NodeId::EPS)
1497 }
1498 Ast::Literal(l) => {
1499 let ast_lit = regex_syntax::ast::Ast::literal(*l.to_owned());
1500 self.translator_to_node_id(&ast_lit, translator, tb)
1501 }
1502 Ast::Top(_) => Ok(NodeId::TOP),
1503 Ast::Dot(_) => {
1504 if self.dot_all.get() {
1505 Ok(NodeId::TOP)
1506 } else {
1507 let hirv = hir::Hir::dot(hir::Dot::AnyByteExceptLF);
1508 self.hir_to_node_id(&hirv, tb)
1509 }
1510 }
1511 Ast::Assertion(a) => match &a.kind {
1512 ast::AssertionKind::StartText => Ok(NodeId::BEGIN),
1513 ast::AssertionKind::EndText => Ok(NodeId::END),
1514 ast::AssertionKind::WordBoundary => {
1515 Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))
1516 }
1517 ast::AssertionKind::NotWordBoundary => {
1518 Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))
1519 }
1520 ast::AssertionKind::StartLine => {
1521 let left = NodeId::BEGIN;
1522 let right = tb.mk_u8(b'\n');
1523 let union = tb.mk_union(left, right);
1524 Ok(tb.mk_lookbehind(union, NodeId::MISSING))
1525 }
1526 ast::AssertionKind::EndLine => {
1527 let left = NodeId::END;
1528 let right = tb.mk_u8(b'\n');
1529 let union = tb.mk_union(left, right);
1530 Ok(tb.mk_lookahead(union, NodeId::MISSING, 0))
1531 }
1532 ast::AssertionKind::WordBoundaryStart => {
1533 Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
1534 }
1535 ast::AssertionKind::WordBoundaryEnd => {
1536 Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
1537 }
1538 ast::AssertionKind::WordBoundaryStartAngle => {
1539 Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
1540 }
1541 ast::AssertionKind::WordBoundaryEndAngle => {
1542 Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
1543 }
1544 ast::AssertionKind::WordBoundaryStartHalf => {
1545 Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
1546 }
1547 ast::AssertionKind::WordBoundaryEndHalf => {
1548 Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
1549 }
1550 },
1551 Ast::ClassUnicode(c) => {
1552 let tmp = regex_syntax::ast::ClassUnicode {
1553 span: c.span,
1554 negated: c.negated,
1555 kind: c.kind.clone(),
1556 };
1557 if !c.negated {
1558 if let regex_syntax::ast::ClassUnicodeKind::Named(s) = &c.kind {
1559 match s.as_str() {
1560 "ascii" => return Ok(tb.mk_range_u8(0, 127)),
1562 "utf8" => {
1564 let ascii = tb.mk_range_u8(0, 127);
1565 let beta = tb.mk_range_u8(128, 0xBF);
1566 let c0 = tb.mk_range_u8(0xC0, 0xDF);
1567 let c0s = tb.mk_concats([c0, beta].into_iter());
1568 let e0 = tb.mk_range_u8(0xE0, 0xEF);
1569 let e0s = tb.mk_concats([e0, beta, beta].into_iter());
1570 let f0 = tb.mk_range_u8(0xF0, 0xF7);
1571 let f0s = tb.mk_concats([f0, beta, beta, beta].into_iter());
1572 let merged = tb.mk_unions([ascii, c0s, e0s, f0s].into_iter());
1573 return Ok(tb.mk_star(merged));
1574 }
1575 "hex" => {
1576 let nums = tb.mk_range_u8(b'0', b'9');
1577 let lets = tb.mk_range_u8(b'a', b'f');
1578 let lets2 = tb.mk_range_u8(b'A', b'F');
1579 let merged = tb.mk_unions([nums, lets, lets2].into_iter());
1580 return Ok(merged);
1581 }
1582 _ => {}
1583 }
1584 };
1585 }
1586
1587 let orig_ast = regex_syntax::ast::Ast::class_unicode(tmp);
1588 self.translator_to_node_id(&orig_ast, translator, tb)
1589 }
1590 Ast::ClassPerl(c) => self.get_class(c.negated, c.kind.clone(), tb),
1591 Ast::ClassBracketed(c) => match &c.kind {
1592 regex_syntax::ast::ClassSet::Item(_) => {
1593 let tmp = regex_syntax::ast::ClassBracketed {
1594 span: c.span,
1595 negated: c.negated,
1596 kind: c.kind.clone(),
1597 };
1598 let orig_ast = regex_syntax::ast::Ast::class_bracketed(tmp);
1599 self.translator_to_node_id(&orig_ast, translator, tb)
1600 }
1601 regex_syntax::ast::ClassSet::BinaryOp(_) => {
1602 Err(self.error(c.span, ast::ErrorKind::UnsupportedResharpRegex))
1603 }
1604 },
1605 Ast::Repetition(r) => {
1606 let body = self.ast_to_node_id(&r.ast, translator, tb);
1607 match body {
1608 Ok(body) => match &r.op.kind {
1609 ast::RepetitionKind::ZeroOrOne => Ok(tb.mk_opt(body)),
1610 ast::RepetitionKind::ZeroOrMore => Ok(tb.mk_star(body)),
1611 ast::RepetitionKind::OneOrMore => Ok(tb.mk_plus(body)),
1612 ast::RepetitionKind::Range(r) => match r {
1613 ast::RepetitionRange::Exactly(n) => Ok(tb.mk_repeat(body, *n, *n)),
1614 ast::RepetitionRange::AtLeast(n) => {
1615 let rep = tb.mk_repeat(body, *n, *n);
1616 let st = tb.mk_star(body);
1617 Ok(tb.mk_concat(rep, st))
1618 }
1619
1620 ast::RepetitionRange::Bounded(n, m) => Ok(tb.mk_repeat(body, *n, *m)),
1621 },
1622 },
1623 Err(_) => body,
1624 }
1625 }
1626 Ast::Lookaround(g) => {
1627 let body = self.ast_to_node_id(&g.ast, translator, tb)?;
1628 match g.kind {
1629 ast::LookaroundKind::PositiveLookahead => {
1630 Ok(tb.mk_lookahead(body, NodeId::MISSING, 0))
1631 }
1632 ast::LookaroundKind::PositiveLookbehind => {
1633 Ok(tb.mk_lookbehind(body, NodeId::MISSING))
1634 }
1635 ast::LookaroundKind::NegativeLookahead => Ok(tb.mk_neg_lookahead(body, 0)),
1636 ast::LookaroundKind::NegativeLookbehind => Ok(tb.mk_neg_lookbehind(body)),
1637 }
1638 }
1639 Ast::Group(g) => {
1640 if let ast::GroupKind::NonCapturing(ref flags) = g.kind {
1641 if !flags.items.is_empty() {
1642 let mut translator_builder = self.default_translator_builder();
1643 if let Some(state) = flags.flag_state(ast::Flag::CaseInsensitive) {
1644 translator_builder.case_insensitive(state);
1645 }
1646 if let Some(state) = flags.flag_state(ast::Flag::Unicode) {
1647 translator_builder.unicode(state);
1648 }
1649 let saved_dot_all = self.dot_all.get();
1650 if let Some(state) = flags.flag_state(ast::Flag::DotMatchesNewLine) {
1651 self.dot_all.set(state);
1652 }
1653 let mut scoped = Some(translator_builder.build());
1654 let result = self.ast_to_node_id(&g.ast, &mut scoped, tb);
1655 self.dot_all.set(saved_dot_all);
1656 return result;
1657 }
1658 }
1659 self.ast_to_node_id(&g.ast, translator, tb)
1660 }
1661 Ast::Alternation(a) => {
1662 let mut children = vec![];
1663 for ast in &a.asts {
1664 match self.ast_to_node_id(ast, translator, tb) {
1665 Ok(node_id) => children.push(node_id),
1666 Err(err) => return Err(err),
1667 }
1668 }
1669 Ok(tb.mk_unions(children.iter().copied()))
1670 }
1671 Ast::Concat(c) => {
1672 let mut concat_translator: Option<Translator> = None;
1673 let mut children = vec![];
1674 let mut i = 0;
1675 while i < c.asts.len() {
1676 let ast = &c.asts[i];
1677 match ast {
1678 Ast::Flags(f) => {
1679 let mut translator_builder = self.default_translator_builder();
1680 if let Some(state) = f.flags.flag_state(ast::Flag::CaseInsensitive) {
1681 translator_builder.case_insensitive(state);
1682 }
1683 if let Some(state) = f.flags.flag_state(ast::Flag::Unicode) {
1684 translator_builder.unicode(state);
1685 }
1686 if let Some(state) = f.flags.flag_state(ast::Flag::DotMatchesNewLine) {
1687 self.dot_all.set(state);
1688 }
1689 concat_translator = Some(translator_builder.build());
1690 *translator = concat_translator.clone();
1691 i += 1;
1692 continue;
1693 }
1694 Ast::Assertion(a) if a.kind == ast::AssertionKind::WordBoundary => {
1695 let node =
1696 self.rewrite_word_boundary_in_concat(&c.asts, i, translator, tb)?;
1697 children.push(node.0);
1698 i = node.1; continue;
1700 }
1701 _ => {}
1702 }
1703 match concat_translator {
1704 Some(_) => match self.ast_to_node_id(ast, &mut concat_translator, tb) {
1705 Ok(node_id) => children.push(node_id),
1706 Err(err) => return Err(err),
1707 },
1708 None => match self.ast_to_node_id(ast, translator, tb) {
1709 Ok(node_id) => children.push(node_id),
1710 Err(err) => return Err(err),
1711 },
1712 }
1713 i += 1;
1714 }
1715 Ok(tb.mk_concats(children.iter().cloned()))
1716 }
1717 Ast::Intersection(intersection) => {
1718 let mut children = vec![];
1719 for ast in &intersection.asts {
1720 match self.ast_to_node_id(ast, translator, tb) {
1721 Ok(node_id) => children.push(node_id),
1722 Err(err) => return Err(err),
1723 }
1724 }
1725 Ok(tb.mk_inters(children.into_iter()))
1726 }
1727 Ast::Complement(complement) => {
1728 let body = self.ast_to_node_id(&complement.ast, translator, tb);
1729 body.map(|x| tb.mk_compl(x))
1730 }
1731 }
1732 }
1733
1734 fn parse_inner(&mut self) -> Result<Ast> {
1735 let mut concat = Concat {
1736 span: self.span(),
1737 asts: vec![],
1738 };
1739 loop {
1740 self.bump_space();
1741 if self.is_eof() {
1742 break;
1743 }
1744 match self.char() {
1745 '(' => concat = self.push_group(concat)?,
1746 ')' => concat = self.pop_group(concat)?,
1747 '|' => concat = self.push_alternate(concat)?,
1748 '&' => concat = self.push_intersect(concat)?,
1749 '~' => concat = self.push_compl_group(concat)?,
1750 '[' => {
1751 let class = self.parse_set_class()?;
1752 concat.asts.push(Ast::class_bracketed(class));
1753 }
1754 '?' => {
1755 concat =
1756 self.parse_uncounted_repetition(concat, ast::RepetitionKind::ZeroOrOne)?;
1757 }
1758 '*' => {
1759 concat =
1760 self.parse_uncounted_repetition(concat, ast::RepetitionKind::ZeroOrMore)?;
1761 }
1762 '+' => {
1763 concat =
1764 self.parse_uncounted_repetition(concat, ast::RepetitionKind::OneOrMore)?;
1765 }
1766 '{' => {
1767 concat = self.parse_counted_repetition(concat)?;
1768 }
1769 _ => concat.asts.push(self.parse_primitive()?.into_ast()),
1770 }
1771 }
1772 self.pop_group_end(concat)
1773 }
1774
1775 fn parse(&mut self, tb: &mut TB<'s>) -> Result<NodeId> {
1778 let ast = self.parse_inner()?;
1779 self.ast_to_node_id(&ast, &mut None, tb)
1780 }
1781
1782 #[inline(never)]
1783 fn parse_uncounted_repetition(
1784 &self,
1785 mut concat: ast::Concat,
1786 kind: ast::RepetitionKind,
1787 ) -> Result<ast::Concat> {
1788 let op_start = self.pos();
1790 let ast = match concat.asts.pop() {
1791 Some(ast) => ast,
1792 None => return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing)),
1793 };
1794 match ast {
1795 Ast::Empty(_) | Ast::Flags(_) => {
1796 return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing))
1797 }
1798 _ => {}
1799 }
1800 if self.bump() && self.char() == '?' {
1801 return Err(self.error(
1802 Span::new(op_start, self.pos()),
1803 ast::ErrorKind::UnsupportedLazyQuantifier,
1804 ));
1805 }
1806 concat.asts.push(Ast::repetition(ast::Repetition {
1807 span: ast.span().with_end(self.pos()),
1808 op: ast::RepetitionOp {
1809 span: Span::new(op_start, self.pos()),
1810 kind,
1811 },
1812 greedy: true,
1813 ast: Box::new(ast),
1814 }));
1815 Ok(concat)
1816 }
1817
1818 #[inline(never)]
1819 fn parse_counted_repetition(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
1820 assert!(self.char() == '{');
1821 let start = self.pos();
1822 let ast = match concat.asts.pop() {
1823 Some(ast) => ast,
1824 None => return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing)),
1825 };
1826 match ast {
1827 Ast::Empty(_) | Ast::Flags(_) => {
1828 return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing))
1829 }
1830 _ => {}
1831 }
1832 if !self.bump_and_bump_space() {
1833 return Err(self.error(
1834 Span::new(start, self.pos()),
1835 ast::ErrorKind::RepetitionCountUnclosed,
1836 ));
1837 }
1838 let count_start = specialize_err(
1839 self.parse_decimal(),
1840 ast::ErrorKind::DecimalEmpty,
1841 ast::ErrorKind::RepetitionCountDecimalEmpty,
1842 );
1843 if self.is_eof() {
1844 return Err(self.error(
1845 Span::new(start, self.pos()),
1846 ast::ErrorKind::RepetitionCountUnclosed,
1847 ));
1848 }
1849 let range = if self.char() == ',' {
1850 if !self.bump_and_bump_space() {
1851 return Err(self.error(
1852 Span::new(start, self.pos()),
1853 ast::ErrorKind::RepetitionCountUnclosed,
1854 ));
1855 }
1856 if self.char() != '}' {
1857 let count_start = match count_start {
1858 Ok(c) => c,
1859 Err(err) if err.kind == ast::ErrorKind::RepetitionCountDecimalEmpty => {
1860 if self.parser().empty_min_range {
1861 0
1862 } else {
1863 return Err(err);
1864 }
1865 }
1866 err => err?,
1867 };
1868 let count_end = specialize_err(
1869 self.parse_decimal(),
1870 ast::ErrorKind::DecimalEmpty,
1871 ast::ErrorKind::RepetitionCountDecimalEmpty,
1872 )?;
1873 ast::RepetitionRange::Bounded(count_start, count_end)
1874 } else {
1875 ast::RepetitionRange::AtLeast(count_start?)
1876 }
1877 } else {
1878 ast::RepetitionRange::Exactly(count_start?)
1879 };
1880
1881 if self.is_eof() || self.char() != '}' {
1882 return Err(self.error(
1883 Span::new(start, self.pos()),
1884 ast::ErrorKind::RepetitionCountUnclosed,
1885 ));
1886 }
1887
1888 if self.bump_and_bump_space() && self.char() == '?' {
1889 return Err(self.error(
1890 Span::new(start, self.pos()),
1891 ast::ErrorKind::UnsupportedLazyQuantifier,
1892 ));
1893 }
1894
1895 let op_span = Span::new(start, self.pos());
1896 if !range.is_valid() {
1897 return Err(self.error(op_span, ast::ErrorKind::RepetitionCountInvalid));
1898 }
1899 concat.asts.push(Ast::repetition(ast::Repetition {
1900 span: ast.span().with_end(self.pos()),
1901 op: ast::RepetitionOp {
1902 span: op_span,
1903 kind: ast::RepetitionKind::Range(range),
1904 },
1905 greedy: true,
1906 ast: Box::new(ast),
1907 }));
1908 Ok(concat)
1909 }
1910
1911 #[inline(never)]
1912 fn parse_group(&self) -> Result<Either<ast::SetFlags, ast::Group>> {
1913 assert_eq!(self.char(), '(');
1914 let open_span = self.span_char();
1915 self.bump();
1916 self.bump_space();
1917 if let Some((ahead, pos)) = self.is_lookaround_prefix() {
1918 let kind = match (pos, ahead) {
1919 (true, true) => LookaroundKind::PositiveLookahead,
1920 (true, false) => LookaroundKind::PositiveLookbehind,
1921 (false, true) => LookaroundKind::NegativeLookahead,
1922 (false, false) => LookaroundKind::NegativeLookbehind,
1923 };
1924 return Ok(Either::Right(ast::Group {
1925 span: open_span,
1926 kind: ast::GroupKind::Lookaround(kind),
1927 ast: Box::new(Ast::empty(self.span())),
1928 }));
1929 }
1930 let inner_span = self.span();
1931 let mut starts_with_p = true;
1932 if self.bump_if("?P<") || {
1933 starts_with_p = false;
1934 self.bump_if("?<")
1935 } {
1936 let capture_index = self.next_capture_index(open_span)?;
1937 let name = self.parse_capture_name(capture_index)?;
1938 Ok(Either::Right(ast::Group {
1939 span: open_span,
1940 kind: ast::GroupKind::CaptureName {
1941 starts_with_p,
1942 name,
1943 },
1944 ast: Box::new(Ast::empty(self.span())),
1945 }))
1946 } else if self.bump_if("?") {
1947 if self.is_eof() {
1948 return Err(self.error(open_span, ast::ErrorKind::GroupUnclosed));
1949 }
1950 let flags = self.parse_flags()?;
1951 let char_end = self.char();
1952 self.bump();
1953 if char_end == ')' {
1954 if flags.items.is_empty() {
1957 return Err(self.error(inner_span, ast::ErrorKind::RepetitionMissing));
1958 }
1959 Ok(Either::Left(ast::SetFlags {
1960 span: Span {
1961 end: self.pos(),
1962 ..open_span
1963 },
1964 flags,
1965 }))
1966 } else {
1967 assert_eq!(char_end, ':');
1968 Ok(Either::Right(ast::Group {
1969 span: open_span,
1970 kind: ast::GroupKind::NonCapturing(flags),
1971 ast: Box::new(Ast::empty(self.span())),
1972 }))
1973 }
1974 } else {
1975 let capture_index = self.next_capture_index(open_span)?;
1976 Ok(Either::Right(ast::Group {
1977 span: open_span,
1978 kind: ast::GroupKind::CaptureIndex(capture_index),
1979 ast: Box::new(Ast::empty(self.span())),
1980 }))
1981 }
1982 }
1983
1984 #[inline(never)]
1985 fn parse_capture_name(&self, capture_index: u32) -> Result<ast::CaptureName> {
1986 if self.is_eof() {
1987 return Err(self.error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
1988 }
1989 let start = self.pos();
1990 loop {
1991 if self.char() == '>' {
1992 break;
1993 }
1994 if !is_capture_char(self.char(), self.pos() == start) {
1995 return Err(self.error(self.span_char(), ast::ErrorKind::GroupNameInvalid));
1996 }
1997 if !self.bump() {
1998 break;
1999 }
2000 }
2001 let end = self.pos();
2002 if self.is_eof() {
2003 return Err(self.error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
2004 }
2005 assert_eq!(self.char(), '>');
2006 self.bump();
2007 let name = &self.pattern()[start.offset..end.offset];
2008 if name.is_empty() {
2009 return Err(self.error(Span::new(start, start), ast::ErrorKind::GroupNameEmpty));
2010 }
2011 let capname = ast::CaptureName {
2012 span: Span::new(start, end),
2013 name: name.to_string(),
2014 index: capture_index,
2015 };
2016 self.add_capture_name(&capname)?;
2017 Ok(capname)
2018 }
2019
2020 #[inline(never)]
2021 fn parse_flags(&self) -> Result<ast::Flags> {
2022 let mut flags = ast::Flags {
2023 span: self.span(),
2024 items: vec![],
2025 };
2026 let mut last_was_negation = None;
2027 while self.char() != ':' && self.char() != ')' {
2028 if self.char() == '-' {
2029 last_was_negation = Some(self.span_char());
2030 let item = ast::FlagsItem {
2031 span: self.span_char(),
2032 kind: ast::FlagsItemKind::Negation,
2033 };
2034 if let Some(i) = flags.add_item(item) {
2035 return Err(self.error(
2036 self.span_char(),
2037 ast::ErrorKind::FlagRepeatedNegation {
2038 original: flags.items[i].span,
2039 },
2040 ));
2041 }
2042 } else {
2043 last_was_negation = None;
2044 let item = ast::FlagsItem {
2045 span: self.span_char(),
2046 kind: ast::FlagsItemKind::Flag(self.parse_flag()?),
2047 };
2048 if let Some(i) = flags.add_item(item) {
2049 return Err(self.error(
2050 self.span_char(),
2051 ast::ErrorKind::FlagDuplicate {
2052 original: flags.items[i].span,
2053 },
2054 ));
2055 }
2056 }
2057 if !self.bump() {
2058 return Err(self.error(self.span(), ast::ErrorKind::FlagUnexpectedEof));
2059 }
2060 }
2061 if let Some(span) = last_was_negation {
2062 return Err(self.error(span, ast::ErrorKind::FlagDanglingNegation));
2063 }
2064 flags.span.end = self.pos();
2065 Ok(flags)
2066 }
2067
2068 #[inline(never)]
2069 fn parse_flag(&self) -> Result<ast::Flag> {
2070 match self.char() {
2071 'i' => Ok(ast::Flag::CaseInsensitive),
2072 'm' => Ok(ast::Flag::MultiLine),
2073 's' => Ok(ast::Flag::DotMatchesNewLine),
2074 'U' => Ok(ast::Flag::SwapGreed),
2075 'u' => Ok(ast::Flag::Unicode),
2076 'R' => Ok(ast::Flag::CRLF),
2077 'x' => Ok(ast::Flag::IgnoreWhitespace),
2078 _ => Err(self.error(self.span_char(), ast::ErrorKind::FlagUnrecognized)),
2079 }
2080 }
2081
2082 fn parse_primitive(&self) -> Result<Primitive> {
2083 match self.char() {
2084 '\\' => self.parse_escape(),
2085 '_' => {
2086 let ast = Primitive::Top(self.span_char());
2087 self.bump();
2088 Ok(ast)
2089 }
2090 '.' => {
2091 let ast = Primitive::Dot(self.span_char());
2092 self.bump();
2093 Ok(ast)
2094 }
2095 '^' => {
2096 let ast = Primitive::Assertion(ast::Assertion {
2097 span: self.span_char(),
2098 kind: ast::AssertionKind::StartLine,
2099 });
2100 self.bump();
2101 Ok(ast)
2102 }
2103 '$' => {
2104 let ast = Primitive::Assertion(ast::Assertion {
2105 span: self.span_char(),
2106 kind: ast::AssertionKind::EndLine,
2107 });
2108 self.bump();
2109 Ok(ast)
2110 }
2111 c => {
2112 let ast = Primitive::Literal(Literal {
2113 span: self.span_char(),
2114 kind: LiteralKind::Verbatim,
2115 c,
2116 });
2117 self.bump();
2118 Ok(ast)
2119 }
2120 }
2121 }
2122
2123 #[inline(never)]
2124 fn parse_escape(&self) -> Result<Primitive> {
2125 assert_eq!(self.char(), '\\');
2126 let start = self.pos();
2127 if !self.bump() {
2128 return Err(self.error(
2129 Span::new(start, self.pos()),
2130 ast::ErrorKind::EscapeUnexpectedEof,
2131 ));
2132 }
2133 let c = self.char();
2134 match c {
2136 '0'..='9' => {
2137 if !self.parser().octal {
2138 return Err(self.error(
2139 Span::new(start, self.span_char().end),
2140 ast::ErrorKind::UnsupportedBackreference,
2141 ));
2142 }
2143 let mut lit = self.parse_octal();
2144 lit.span.start = start;
2145 return Ok(Primitive::Literal(lit));
2146 }
2147 'x' | 'u' | 'U' => {
2154 let mut lit = self.parse_hex()?;
2155 lit.span.start = start;
2156 return Ok(Primitive::Literal(lit));
2157 }
2158 'p' | 'P' => {
2159 let mut cls = self.parse_unicode_class()?;
2160 cls.span.start = start;
2161 return Ok(Primitive::Unicode(cls));
2162 }
2163 'd' | 's' | 'w' | 'D' | 'S' | 'W' => {
2164 let mut cls = self.parse_perl_class();
2165 cls.span.start = start;
2166 return Ok(Primitive::Perl(cls));
2167 }
2168 _ => {}
2169 }
2170
2171 self.bump();
2173 let span = Span::new(start, self.pos());
2174 if is_meta_character(c) {
2175 return Ok(Primitive::Literal(Literal {
2176 span,
2177 kind: LiteralKind::Meta,
2178 c,
2179 }));
2180 }
2181 if is_escapeable_character(c) {
2182 return Ok(Primitive::Literal(Literal {
2183 span,
2184 kind: LiteralKind::Superfluous,
2185 c,
2186 }));
2187 }
2188 let special = |kind, c| {
2189 Ok(Primitive::Literal(Literal {
2190 span,
2191 kind: LiteralKind::Special(kind),
2192 c,
2193 }))
2194 };
2195 match c {
2196 'a' => special(SpecialLiteralKind::Bell, '\x07'),
2197 'f' => special(SpecialLiteralKind::FormFeed, '\x0C'),
2198 't' => special(SpecialLiteralKind::Tab, '\t'),
2199 'n' => special(SpecialLiteralKind::LineFeed, '\n'),
2200 'r' => special(SpecialLiteralKind::CarriageReturn, '\r'),
2201 'v' => special(SpecialLiteralKind::VerticalTab, '\x0B'),
2202 'A' => Ok(Primitive::Assertion(ast::Assertion {
2203 span,
2204 kind: ast::AssertionKind::StartText,
2205 })),
2206 'z' => Ok(Primitive::Assertion(ast::Assertion {
2207 span,
2208 kind: ast::AssertionKind::EndText,
2209 })),
2210 'b' => {
2211 let mut wb = ast::Assertion {
2212 span,
2213 kind: ast::AssertionKind::WordBoundary,
2214 };
2215 if !self.is_eof() && self.char() == '{' {
2218 if let Some(kind) = self.maybe_parse_special_word_boundary(start)? {
2219 wb.kind = kind;
2220 wb.span.end = self.pos();
2221 }
2222 }
2223 Ok(Primitive::Assertion(wb))
2224 }
2225 'B' => Ok(Primitive::Assertion(ast::Assertion {
2226 span,
2227 kind: ast::AssertionKind::NotWordBoundary,
2228 })),
2229 '<' => Ok(Primitive::Assertion(ast::Assertion {
2230 span,
2231 kind: ast::AssertionKind::WordBoundaryStartAngle,
2232 })),
2233 '>' => Ok(Primitive::Assertion(ast::Assertion {
2234 span,
2235 kind: ast::AssertionKind::WordBoundaryEndAngle,
2236 })),
2237 _ => Err(self.error(span, ast::ErrorKind::EscapeUnrecognized)),
2238 }
2239 }
2240
2241 fn maybe_parse_special_word_boundary(
2242 &self,
2243 wb_start: Position,
2244 ) -> Result<Option<ast::AssertionKind>> {
2245 assert_eq!(self.char(), '{');
2246
2247 let is_valid_char = |c| matches!(c, 'A'..='Z' | 'a'..='z' | '-');
2248 let start = self.pos();
2249 if !self.bump_and_bump_space() {
2250 return Err(self.error(
2251 Span::new(wb_start, self.pos()),
2252 ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof,
2253 ));
2254 }
2255 let start_contents = self.pos();
2256 if !is_valid_char(self.char()) {
2261 self.parser().pos.set(start);
2262 return Ok(None);
2263 }
2264
2265 let mut scratch = self.parser().scratch.borrow_mut();
2267 scratch.clear();
2268 while !self.is_eof() && is_valid_char(self.char()) {
2269 scratch.push(self.char());
2270 self.bump_and_bump_space();
2271 }
2272 if self.is_eof() || self.char() != '}' {
2273 return Err(self.error(
2274 Span::new(start, self.pos()),
2275 ast::ErrorKind::SpecialWordBoundaryUnclosed,
2276 ));
2277 }
2278 let end = self.pos();
2279 self.bump();
2280 let kind = match scratch.as_str() {
2281 "start" => ast::AssertionKind::WordBoundaryStart,
2282 "end" => ast::AssertionKind::WordBoundaryEnd,
2283 "start-half" => ast::AssertionKind::WordBoundaryStartHalf,
2284 "end-half" => ast::AssertionKind::WordBoundaryEndHalf,
2285 _ => {
2286 return Err(self.error(
2287 Span::new(start_contents, end),
2288 ast::ErrorKind::SpecialWordBoundaryUnrecognized,
2289 ))
2290 }
2291 };
2292 Ok(Some(kind))
2293 }
2294
2295 #[inline(never)]
2296 fn parse_octal(&self) -> Literal {
2297 assert!(self.parser().octal);
2298 assert!('0' <= self.char() && self.char() <= '7');
2299 let start = self.pos();
2300 while self.bump()
2302 && '0' <= self.char()
2303 && self.char() <= '7'
2304 && self.pos().offset - start.offset <= 2
2305 {}
2306 let end = self.pos();
2307 let octal = &self.pattern()[start.offset..end.offset];
2308 let codepoint = u32::from_str_radix(octal, 8).expect("valid octal number");
2311 let c = char::from_u32(codepoint).expect("Unicode scalar value");
2314 Literal {
2315 span: Span::new(start, end),
2316 kind: LiteralKind::Octal,
2317 c,
2318 }
2319 }
2320
2321 #[inline(never)]
2322 fn parse_hex(&self) -> Result<Literal> {
2323 assert!(self.char() == 'x' || self.char() == 'u' || self.char() == 'U');
2324
2325 let hex_kind = match self.char() {
2326 'x' => HexLiteralKind::X,
2327 'u' => HexLiteralKind::UnicodeShort,
2328 _ => HexLiteralKind::UnicodeLong,
2329 };
2330 if !self.bump_and_bump_space() {
2331 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2332 }
2333 if self.char() == '{' {
2334 self.parse_hex_brace(hex_kind)
2335 } else {
2336 self.parse_hex_digits(hex_kind)
2337 }
2338 }
2339
2340 #[inline(never)]
2341 fn parse_hex_digits(&self, kind: HexLiteralKind) -> Result<Literal> {
2342 let mut scratch = self.parser().scratch.borrow_mut();
2343 scratch.clear();
2344
2345 let start = self.pos();
2346 for i in 0..kind.digits() {
2347 if i > 0 && !self.bump_and_bump_space() {
2348 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2349 }
2350 if !is_hex(self.char()) {
2351 return Err(self.error(self.span_char(), ast::ErrorKind::EscapeHexInvalidDigit));
2352 }
2353 scratch.push(self.char());
2354 }
2355 self.bump_and_bump_space();
2358 let end = self.pos();
2359 let hex = scratch.as_str();
2360 match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
2361 None => Err(self.error(Span::new(start, end), ast::ErrorKind::EscapeHexInvalid)),
2362 Some(c) => Ok(Literal {
2363 span: Span::new(start, end),
2364 kind: LiteralKind::HexFixed(kind),
2365 c,
2366 }),
2367 }
2368 }
2369
2370 #[inline(never)]
2371 fn parse_hex_brace(&self, kind: HexLiteralKind) -> Result<Literal> {
2372 let mut scratch = self.parser().scratch.borrow_mut();
2373 scratch.clear();
2374
2375 let brace_pos = self.pos();
2376 let start = self.span_char().end;
2377 while self.bump_and_bump_space() && self.char() != '}' {
2378 if !is_hex(self.char()) {
2379 return Err(self.error(self.span_char(), ast::ErrorKind::EscapeHexInvalidDigit));
2380 }
2381 scratch.push(self.char());
2382 }
2383 if self.is_eof() {
2384 return Err(self.error(
2385 Span::new(brace_pos, self.pos()),
2386 ast::ErrorKind::EscapeUnexpectedEof,
2387 ));
2388 }
2389 let end = self.pos();
2390 let hex = scratch.as_str();
2391 assert_eq!(self.char(), '}');
2392 self.bump_and_bump_space();
2393
2394 if hex.is_empty() {
2395 return Err(self.error(
2396 Span::new(brace_pos, self.pos()),
2397 ast::ErrorKind::EscapeHexEmpty,
2398 ));
2399 }
2400 match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
2401 None => Err(self.error(Span::new(start, end), ast::ErrorKind::EscapeHexInvalid)),
2402 Some(c) => Ok(Literal {
2403 span: Span::new(start, self.pos()),
2404 kind: LiteralKind::HexBrace(kind),
2405 c,
2406 }),
2407 }
2408 }
2409
2410 fn parse_decimal(&self) -> Result<u32> {
2411 let mut scratch = self.parser().scratch.borrow_mut();
2412 scratch.clear();
2413
2414 while !self.is_eof() && self.char().is_whitespace() {
2415 self.bump();
2416 }
2417 let start = self.pos();
2418 while !self.is_eof() && '0' <= self.char() && self.char() <= '9' {
2419 scratch.push(self.char());
2420 self.bump_and_bump_space();
2421 }
2422 let span = Span::new(start, self.pos());
2423 while !self.is_eof() && self.char().is_whitespace() {
2424 self.bump_and_bump_space();
2425 }
2426 let digits = scratch.as_str();
2427 if digits.is_empty() {
2428 return Err(self.error(span, ast::ErrorKind::DecimalEmpty));
2429 }
2430 match digits.parse::<u32>().ok() {
2431 Some(n) => Ok(n),
2432 None => Err(self.error(span, ast::ErrorKind::DecimalInvalid)),
2433 }
2434 }
2435
2436 #[inline(never)]
2437 fn parse_set_class(&self) -> Result<ClassBracketed> {
2438 assert_eq!(self.char(), '[');
2439
2440 let mut union = ClassSetUnion {
2441 span: self.span(),
2442 items: vec![],
2443 };
2444 loop {
2445 self.bump_space();
2446 if self.is_eof() {
2447 return Err(self.unclosed_class_error());
2448 }
2449 match self.char() {
2450 '[' => {
2451 if !self.parser().stack_class.borrow().is_empty() {
2456 if let Some(cls) = self.maybe_parse_ascii_class() {
2457 union.push(ClassSetItem::Ascii(cls));
2458 continue;
2459 }
2460 }
2461 union = self.push_class_open(union)?;
2462 }
2463 ']' => match self.pop_class(union)? {
2464 Either::Left(nested_union) => {
2465 union = nested_union;
2466 }
2467 Either::Right(class) => return Ok(class),
2468 },
2469 '&' if self.peek() == Some('&') => {
2470 assert!(self.bump_if("&&"));
2471 union = self.push_class_op(ClassSetBinaryOpKind::Intersection, union);
2472 }
2473 '-' if self.peek() == Some('-') => {
2474 assert!(self.bump_if("--"));
2475 union = self.push_class_op(ClassSetBinaryOpKind::Difference, union);
2476 }
2477 '~' if self.peek() == Some('~') => {
2478 assert!(self.bump_if("~~"));
2479 union = self.push_class_op(ClassSetBinaryOpKind::SymmetricDifference, union);
2480 }
2481 _ => {
2482 union.push(self.parse_set_class_range()?);
2483 }
2484 }
2485 }
2486 }
2487
2488 #[inline(never)]
2489 fn parse_set_class_range(&self) -> Result<ClassSetItem> {
2490 let prim1 = self.parse_set_class_item()?;
2491 self.bump_space();
2492 if self.is_eof() {
2493 return Err(self.unclosed_class_error());
2494 }
2495 if self.char() != '-' || self.peek_space() == Some(']') || self.peek_space() == Some('-') {
2496 return prim1.into_class_set_item(self);
2497 }
2498 if !self.bump_and_bump_space() {
2499 return Err(self.unclosed_class_error());
2500 }
2501 let prim2 = self.parse_set_class_item()?;
2502 let range = ClassSetRange {
2503 span: Span::new(prim1.span().start, prim2.span().end),
2504 start: prim1.into_class_literal(self)?,
2505 end: prim2.into_class_literal(self)?,
2506 };
2507 if !range.is_valid() {
2508 return Err(self.error(range.span, ast::ErrorKind::ClassRangeInvalid));
2509 }
2510 Ok(ClassSetItem::Range(range))
2511 }
2512
2513 #[inline(never)]
2514 fn parse_set_class_item(&self) -> Result<Primitive> {
2515 if self.char() == '\\' {
2516 self.parse_escape()
2517 } else {
2518 let x = Primitive::Literal(Literal {
2519 span: self.span_char(),
2520 kind: LiteralKind::Verbatim,
2521 c: self.char(),
2522 });
2523 self.bump();
2524 Ok(x)
2525 }
2526 }
2527
2528 #[inline(never)]
2529 fn parse_set_class_open(&self) -> Result<(ClassBracketed, ClassSetUnion)> {
2530 assert_eq!(self.char(), '[');
2531 let start = self.pos();
2532 if !self.bump_and_bump_space() {
2533 return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2534 }
2535
2536 let negated = if self.char() != '^' {
2537 false
2538 } else {
2539 if !self.bump_and_bump_space() {
2540 return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2541 }
2542 true
2543 };
2544 let mut union = ClassSetUnion {
2546 span: self.span(),
2547 items: vec![],
2548 };
2549 while self.char() == '-' {
2550 union.push(ClassSetItem::Literal(Literal {
2551 span: self.span_char(),
2552 kind: LiteralKind::Verbatim,
2553 c: '-',
2554 }));
2555 if !self.bump_and_bump_space() {
2556 return Err(self.error(Span::new(start, start), ast::ErrorKind::ClassUnclosed));
2557 }
2558 }
2559 if union.items.is_empty() && self.char() == ']' {
2562 union.push(ClassSetItem::Literal(Literal {
2563 span: self.span_char(),
2564 kind: LiteralKind::Verbatim,
2565 c: ']',
2566 }));
2567 if !self.bump_and_bump_space() {
2568 return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2569 }
2570 }
2571 let set = ClassBracketed {
2572 span: Span::new(start, self.pos()),
2573 negated,
2574 kind: ClassSet::union(ClassSetUnion {
2575 span: Span::new(union.span.start, union.span.start),
2576 items: vec![],
2577 }),
2578 };
2579 Ok((set, union))
2580 }
2581
2582 #[inline(never)]
2583 fn maybe_parse_ascii_class(&self) -> Option<ClassAscii> {
2584 assert_eq!(self.char(), '[');
2585 let start = self.pos();
2587 let mut negated = false;
2588 if !self.bump() || self.char() != ':' {
2589 self.parser().pos.set(start);
2590 return None;
2591 }
2592 if !self.bump() {
2593 self.parser().pos.set(start);
2594 return None;
2595 }
2596 if self.char() == '^' {
2597 negated = true;
2598 if !self.bump() {
2599 self.parser().pos.set(start);
2600 return None;
2601 }
2602 }
2603 let name_start = self.offset();
2604 while self.char() != ':' && self.bump() {}
2605 if self.is_eof() {
2606 self.parser().pos.set(start);
2607 return None;
2608 }
2609 let name = &self.pattern()[name_start..self.offset()];
2610 if !self.bump_if(":]") {
2611 self.parser().pos.set(start);
2612 return None;
2613 }
2614 let kind = match regex_syntax::ast::ClassAsciiKind::from_name(name) {
2615 Some(kind) => kind,
2616 None => {
2617 self.parser().pos.set(start);
2618 return None;
2619 }
2620 };
2621 Some(ClassAscii {
2622 span: Span::new(start, self.pos()),
2623 kind,
2624 negated,
2625 })
2626 }
2627
2628 #[inline(never)]
2629 fn parse_unicode_class(&self) -> Result<ClassUnicode> {
2630 assert!(self.char() == 'p' || self.char() == 'P');
2631
2632 let mut scratch = self.parser().scratch.borrow_mut();
2633 scratch.clear();
2634
2635 let negated = self.char() == 'P';
2636 if !self.bump_and_bump_space() {
2637 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2638 }
2639 let (start, kind) = if self.char() == '{' {
2640 let start = self.span_char().end;
2641 while self.bump_and_bump_space() && self.char() != '}' {
2642 scratch.push(self.char());
2643 }
2644 if self.is_eof() {
2645 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2646 }
2647 assert_eq!(self.char(), '}');
2648 self.bump();
2649
2650 let name = scratch.as_str();
2651 if let Some(i) = name.find("!=") {
2652 (
2653 start,
2654 ClassUnicodeKind::NamedValue {
2655 op: ClassUnicodeOpKind::NotEqual,
2656 name: name[..i].to_string(),
2657 value: name[i + 2..].to_string(),
2658 },
2659 )
2660 } else if let Some(i) = name.find(':') {
2661 (
2662 start,
2663 ClassUnicodeKind::NamedValue {
2664 op: ClassUnicodeOpKind::Colon,
2665 name: name[..i].to_string(),
2666 value: name[i + 1..].to_string(),
2667 },
2668 )
2669 } else if let Some(i) = name.find('=') {
2670 (
2671 start,
2672 ClassUnicodeKind::NamedValue {
2673 op: ClassUnicodeOpKind::Equal,
2674 name: name[..i].to_string(),
2675 value: name[i + 1..].to_string(),
2676 },
2677 )
2678 } else {
2679 (start, ClassUnicodeKind::Named(name.to_string()))
2680 }
2681 } else {
2682 let start = self.pos();
2683 let c = self.char();
2684 if c == '\\' {
2685 return Err(self.error(self.span_char(), ast::ErrorKind::UnicodeClassInvalid));
2686 }
2687 self.bump_and_bump_space();
2688 let kind = ClassUnicodeKind::OneLetter(c);
2689 (start, kind)
2690 };
2691 Ok(ClassUnicode {
2692 span: Span::new(start, self.pos()),
2693 negated,
2694 kind,
2695 })
2696 }
2697
2698 #[inline(never)]
2699 fn parse_perl_class(&self) -> ClassPerl {
2700 let c = self.char();
2701 let span = self.span_char();
2702 self.bump();
2703 let (negated, kind) = match c {
2704 'd' => (false, regex_syntax::ast::ClassPerlKind::Digit),
2705 'D' => (true, regex_syntax::ast::ClassPerlKind::Digit),
2706 's' => (false, regex_syntax::ast::ClassPerlKind::Space),
2707 'S' => (true, regex_syntax::ast::ClassPerlKind::Space),
2708 'w' => (false, regex_syntax::ast::ClassPerlKind::Word),
2709 'W' => (true, regex_syntax::ast::ClassPerlKind::Word),
2710 c => panic!("expected valid Perl class but got '{}'", c),
2711 };
2712 ClassPerl {
2713 span,
2714 kind,
2715 negated,
2716 }
2717 }
2718}
2719
2720pub fn parse_ast<'s>(
2721 tb: &mut TB<'s>,
2722 pattern: &'s str,
2723) -> std::result::Result<NodeId, ResharpError> {
2724 let mut p: ResharpParser<'s> = ResharpParser::new(pattern);
2725 p.parse(tb)
2726}
2727
2728pub fn parse_ast_with<'s>(
2729 tb: &mut TB<'s>,
2730 pattern: &'s str,
2731 flags: &PatternFlags,
2732) -> std::result::Result<NodeId, ResharpError> {
2733 let mut p: ResharpParser<'s> = ResharpParser::with_flags(pattern, flags);
2734 p.parse(tb)
2735}
2736
2737pub fn parse_to_ast(pattern: &str) -> std::result::Result<ast::Ast, ResharpError> {
2739 let mut p: ResharpParser = ResharpParser::new(pattern);
2740 p.parse_inner()
2741}