1#![warn(dead_code)]
6mod ast;
7use std::cell::{Cell, RefCell};
8
9use ast::{Ast, Concat, ErrorKind, GroupKind, LookaroundKind};
10use regex_syntax::{
11 ast::{
12 ClassAscii, ClassBracketed, ClassPerl, ClassSet, ClassSetBinaryOpKind, ClassSetItem,
13 ClassSetRange, ClassSetUnion, ClassUnicode, ClassUnicodeKind, ClassUnicodeOpKind,
14 HexLiteralKind, Literal, LiteralKind, Position, Span, SpecialLiteralKind,
15 },
16 hir::{
17 self,
18 translate::{Translator, TranslatorBuilder},
19 },
20 utf8::Utf8Sequences,
21};
22use resharp_algebra::NodeId;
23
24type TB<'s> = resharp_algebra::RegexBuilder;
25
26#[derive(Clone, Copy, PartialEq)]
27enum WordCharKind {
28 Word,
29 NonWord,
30 MaybeWord,
31 MaybeNonWord,
32 Unknown,
33 Edge,
34}
35
36fn is_word_byte(b: u8) -> bool {
37 b.is_ascii_alphanumeric() || b == b'_'
38}
39
40#[derive(Clone, Debug, Eq, PartialEq)]
41enum Primitive {
42 Literal(Literal),
43 Assertion(ast::Assertion),
44 Dot(Span),
45 Top(Span),
46 Perl(ClassPerl),
47 Unicode(ClassUnicode),
48}
49
50impl Primitive {
51 fn span(&self) -> &Span {
52 match *self {
53 Primitive::Literal(ref x) => &x.span,
54 Primitive::Assertion(ref x) => &x.span,
55 Primitive::Dot(ref span) => span,
56 Primitive::Top(ref span) => span,
57 Primitive::Perl(ref x) => &x.span,
58 Primitive::Unicode(ref x) => &x.span,
59 }
60 }
61
62 fn into_ast(self) -> Ast {
63 match self {
64 Primitive::Literal(lit) => Ast::literal(lit),
65 Primitive::Assertion(assert) => Ast::assertion(assert),
66 Primitive::Dot(span) => Ast::dot(span),
67 Primitive::Top(span) => Ast::top(span),
68 Primitive::Perl(cls) => Ast::class_perl(cls),
69 Primitive::Unicode(cls) => Ast::class_unicode(cls),
70 }
71 }
72
73 fn into_class_set_item(self, p: &ResharpParser) -> Result<regex_syntax::ast::ClassSetItem> {
74 use self::Primitive::*;
75 use regex_syntax::ast::ClassSetItem;
76
77 match self {
78 Literal(lit) => Ok(ClassSetItem::Literal(lit)),
79 Perl(cls) => Ok(ClassSetItem::Perl(cls)),
80 Unicode(cls) => Ok(ClassSetItem::Unicode(cls)),
81 x => Err(p.error(*x.span(), ast::ErrorKind::ClassEscapeInvalid)),
82 }
83 }
84
85 fn into_class_literal(self, p: &ResharpParser) -> Result<Literal> {
86 use self::Primitive::*;
87
88 match self {
89 Literal(lit) => Ok(lit),
90 x => Err(p.error(*x.span(), ast::ErrorKind::ClassRangeLiteral)),
91 }
92 }
93}
94
95#[derive(Clone, Debug, Eq, PartialEq)]
96pub enum Either<Left, Right> {
97 Left(Left),
98 Right(Right),
99}
100
101#[derive(Clone, Debug, Eq, PartialEq)]
102pub struct ResharpError {
103 pub kind: ErrorKind,
105 pattern: String,
108 pub span: Span,
110}
111
112impl std::fmt::Display for ResharpError {
113 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
114 write!(f, "{:?}: {:?}", self.kind, self.span)
115 }
116}
117impl std::error::Error for ResharpError {}
118
119type Result<T> = core::result::Result<T, ResharpError>;
120
121#[derive(Clone, Debug)]
122enum GroupState {
123 Group {
125 concat: Concat,
127 group: ast::Group,
129 ignore_whitespace: bool,
131 },
132 Alternation(ast::Alternation),
137 Intersection(ast::Intersection),
138}
139
140#[derive(Clone, Debug)]
141enum ClassState {
142 Open {
144 union: regex_syntax::ast::ClassSetUnion,
146 set: regex_syntax::ast::ClassBracketed,
150 },
151 Op {
154 kind: regex_syntax::ast::ClassSetBinaryOpKind,
156 lhs: regex_syntax::ast::ClassSet,
158 },
159}
160
161pub struct ResharpParser<'s> {
163 perl_classes: Vec<(bool, regex_syntax::ast::ClassPerlKind, NodeId)>,
164 pub translator: regex_syntax::hir::translate::Translator,
165 pub pattern: &'s str,
166 pos: Cell<Position>,
167 capture_index: Cell<u32>,
168 octal: bool,
169 empty_min_range: bool,
170 ignore_whitespace: Cell<bool>,
171 comments: RefCell<Vec<ast::Comment>>,
172 stack_group: RefCell<Vec<GroupState>>,
173 stack_class: RefCell<Vec<ClassState>>,
174 capture_names: RefCell<Vec<ast::CaptureName>>,
175 scratch: RefCell<String>,
176}
177
178fn specialize_err<T>(result: Result<T>, from: ast::ErrorKind, to: ast::ErrorKind) -> Result<T> {
179 if let Err(e) = result {
180 if e.kind == from {
181 Err(ResharpError {
182 kind: to,
183 pattern: e.pattern,
184 span: e.span,
185 })
186 } else {
187 Err(e)
188 }
189 } else {
190 result
191 }
192}
193
194fn is_capture_char(c: char, first: bool) -> bool {
195 if first {
196 c == '_' || c.is_alphabetic()
197 } else {
198 c == '_' || c == '.' || c == '[' || c == ']' || c.is_alphanumeric()
199 }
200}
201
202pub fn is_meta_character(c: char) -> bool {
203 match c {
204 '\\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' | '[' | ']' | '{' | '}' | '^' | '$'
205 | '#' | '&' | '-' | '~' | '_' => true,
206 _ => false,
207 }
208}
209
210pub fn escape(text: &str) -> String {
212 let mut buf = String::new();
213 escape_into(text, &mut buf);
214 buf
215}
216
217pub fn escape_into(text: &str, buf: &mut String) {
219 buf.reserve(text.len());
220 for c in text.chars() {
221 if is_meta_character(c) {
222 buf.push('\\');
223 }
224 buf.push(c);
225 }
226}
227
228pub fn is_escapeable_character(c: char) -> bool {
229 if is_meta_character(c) {
231 return true;
232 }
233 if !c.is_ascii() {
236 return false;
237 }
238 match c {
243 '0'..='9' | 'A'..='Z' | 'a'..='z' => false,
244 '<' | '>' => false,
254 _ => true,
255 }
256}
257
258fn is_hex(c: char) -> bool {
259 ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F')
260}
261
262impl<'s> ResharpParser<'s> {
263 fn default_translator_builder() -> TranslatorBuilder {
264 let mut trb = TranslatorBuilder::new();
265 trb.unicode(true);
266 trb.utf8(false);
267 trb
268 }
269
270 pub fn new(pattern: &'s str) -> Self {
271 let trb = Self::default_translator_builder();
272 Self {
273 translator: trb.build(),
274 pattern,
275 perl_classes: vec![],
276 pos: Cell::new(Position::new(0, 0, 0)),
277 capture_index: Cell::new(0),
278 octal: false,
279 empty_min_range: false,
280 ignore_whitespace: Cell::new(false),
281 comments: RefCell::new(vec![]),
282 stack_group: RefCell::new(vec![]),
283 stack_class: RefCell::new(vec![]),
284 capture_names: RefCell::new(vec![]),
285 scratch: RefCell::new(String::new()),
286 }
287 }
288
289 fn parser(&'_ self) -> &'_ ResharpParser<'_> {
291 self
292 }
293
294 fn pattern(&self) -> &str {
296 self.pattern
297 }
298
299 fn error(&self, span: Span, kind: ast::ErrorKind) -> ResharpError {
301 ResharpError {
302 kind,
303 pattern: self.pattern().to_string(),
304 span,
305 }
306 }
307
308 fn unsupported_error(&self, _: regex_syntax::hir::Error) -> ResharpError {
309 let emptyspan = Span::splat(self.pos());
310 let inner = self.error(emptyspan, ast::ErrorKind::UnsupportedResharpRegex);
311 inner
312 }
313
314 fn offset(&self) -> usize {
319 self.parser().pos.get().offset
320 }
321
322 fn line(&self) -> usize {
326 self.parser().pos.get().line
327 }
328
329 fn column(&self) -> usize {
333 self.parser().pos.get().column
334 }
335
336 fn next_capture_index(&self, span: Span) -> Result<u32> {
344 let current = self.parser().capture_index.get();
345 let i = current
346 .checked_add(1)
347 .ok_or_else(|| self.error(span, ast::ErrorKind::CaptureLimitExceeded))?;
348 self.parser().capture_index.set(i);
349 Ok(i)
350 }
351
352 fn add_capture_name(&self, cap: &ast::CaptureName) -> Result<()> {
353 let mut names = self.parser().capture_names.borrow_mut();
354 match names.binary_search_by_key(&cap.name.as_str(), |c| c.name.as_str()) {
355 Err(i) => {
356 names.insert(i, cap.clone());
357 Ok(())
358 }
359 Ok(i) => Err(self.error(
360 cap.span,
361 ast::ErrorKind::GroupNameDuplicate {
362 original: names[i].span,
363 },
364 )),
365 }
366 }
367
368 fn ignore_whitespace(&self) -> bool {
369 self.parser().ignore_whitespace.get()
370 }
371
372 fn char(&self) -> char {
373 self.char_at(self.offset())
374 }
375
376 fn char_at(&self, i: usize) -> char {
377 self.pattern()[i..]
378 .chars()
379 .next()
380 .unwrap_or_else(|| panic!("expected char at offset {}", i))
381 }
382
383 fn bump(&self) -> bool {
384 if self.is_eof() {
385 return false;
386 }
387 let Position {
388 mut offset,
389 mut line,
390 mut column,
391 } = self.pos();
392 if self.char() == '\n' {
393 line = line.checked_add(1).unwrap();
394 column = 1;
395 } else {
396 column = column.checked_add(1).unwrap();
397 }
398 offset += self.char().len_utf8();
399 self.parser().pos.set(Position {
400 offset,
401 line,
402 column,
403 });
404 self.pattern()[self.offset()..].chars().next().is_some()
405 }
406
407 fn bump_if(&self, prefix: &str) -> bool {
408 if self.pattern()[self.offset()..].starts_with(prefix) {
409 for _ in 0..prefix.chars().count() {
410 self.bump();
411 }
412 true
413 } else {
414 false
415 }
416 }
417
418 fn is_lookaround_prefix(&self) -> Option<(bool, bool)> {
419 if self.bump_if("?=") {
420 return Some((true, true));
421 }
422 if self.bump_if("?!") {
423 return Some((true, false));
424 }
425 if self.bump_if("?<=") {
426 return Some((false, true));
427 }
428 if self.bump_if("?<!") {
429 return Some((false, false));
430 }
431 return None;
432 }
434
435 fn bump_and_bump_space(&self) -> bool {
436 if !self.bump() {
437 return false;
438 }
439 self.bump_space();
440 !self.is_eof()
441 }
442
443 fn bump_space(&self) {
444 if !self.ignore_whitespace() {
445 return;
446 }
447 while !self.is_eof() {
448 if self.char().is_whitespace() {
449 self.bump();
450 } else if self.char() == '#' {
451 let start = self.pos();
452 let mut comment_text = String::new();
453 self.bump();
454 while !self.is_eof() {
455 let c = self.char();
456 self.bump();
457 if c == '\n' {
458 break;
459 }
460 comment_text.push(c);
461 }
462 let comment = ast::Comment {
463 span: Span::new(start, self.pos()),
464 comment: comment_text,
465 };
466 self.parser().comments.borrow_mut().push(comment);
467 } else {
468 break;
469 }
470 }
471 }
472
473 fn peek(&self) -> Option<char> {
477 if self.is_eof() {
478 return None;
479 }
480 self.pattern()[self.offset() + self.char().len_utf8()..]
481 .chars()
482 .next()
483 }
484
485 fn peek_space(&self) -> Option<char> {
488 if !self.ignore_whitespace() {
489 return self.peek();
490 }
491 if self.is_eof() {
492 return None;
493 }
494 let mut start = self.offset() + self.char().len_utf8();
495 let mut in_comment = false;
496 for (i, c) in self.pattern()[start..].char_indices() {
497 if c.is_whitespace() {
498 continue;
499 } else if !in_comment && c == '#' {
500 in_comment = true;
501 } else if in_comment && c == '\n' {
502 in_comment = false;
503 } else {
504 start += i;
505 break;
506 }
507 }
508 self.pattern()[start..].chars().next()
509 }
510
511 fn is_eof(&self) -> bool {
513 self.offset() == self.pattern().len()
514 }
515
516 fn pos(&self) -> Position {
519 self.parser().pos.get()
520 }
521
522 fn span(&self) -> Span {
525 Span::splat(self.pos())
526 }
527
528 fn span_char(&self) -> Span {
530 let mut next = Position {
531 offset: self.offset().checked_add(self.char().len_utf8()).unwrap(),
532 line: self.line(),
533 column: self.column().checked_add(1).unwrap(),
534 };
535 if self.char() == '\n' {
536 next.line += 1;
537 next.column = 1;
538 }
539 Span::new(self.pos(), next)
540 }
541
542 #[inline(never)]
552 fn push_alternate(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
553 assert_eq!(self.char(), '|');
554 concat.span.end = self.pos();
555 self.push_or_add_alternation(concat);
556 self.bump();
557 Ok(ast::Concat {
558 span: self.span(),
559 asts: vec![],
560 })
561 }
562
563 fn push_or_add_alternation(&self, concat: Concat) {
566 use self::GroupState::*;
567
568 let mut stack = self.parser().stack_group.borrow_mut();
569 if let Some(&mut Alternation(ref mut alts)) = stack.last_mut() {
570 alts.asts.push(concat.into_ast());
571 return;
572 }
573 stack.push(Alternation(ast::Alternation {
574 span: Span::new(concat.span.start, self.pos()),
575 asts: vec![concat.into_ast()],
576 }));
577 }
578
579 #[inline(never)]
580 fn push_intersect(&self, mut concat: Concat) -> Result<Concat> {
581 assert_eq!(self.char(), '&');
582 concat.span.end = self.pos();
583 self.push_or_add_intersect(concat);
584 self.bump();
585 Ok(Concat {
586 span: self.span(),
587 asts: vec![],
588 })
589 }
590
591 fn push_or_add_intersect(&self, concat: Concat) {
594 use self::GroupState::*;
595
596 let mut stack = self.parser().stack_group.borrow_mut();
597 if let Some(&mut Intersection(ref mut alts)) = stack.last_mut() {
598 alts.asts.push(concat.into_ast());
599 return;
600 }
601 stack.push(Intersection(ast::Intersection {
602 span: Span::new(concat.span.start, self.pos()),
603 asts: vec![concat.into_ast()],
604 }));
605 }
606
607 #[inline(never)]
621 fn push_group(&self, mut concat: Concat) -> Result<Concat> {
622 assert_eq!(self.char(), '(');
623 match self.parse_group()? {
624 Either::Left(set) => {
625 let ignore = set.flags.flag_state(ast::Flag::IgnoreWhitespace);
626 if let Some(v) = ignore {
627 self.parser().ignore_whitespace.set(v);
628 }
629
630 concat.asts.push(Ast::flags(set));
631 Ok(concat)
632 }
633 Either::Right(group) => {
634 let old_ignore_whitespace = self.ignore_whitespace();
635 let new_ignore_whitespace = group
636 .flags()
637 .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
638 .unwrap_or(old_ignore_whitespace);
639 self.parser()
640 .stack_group
641 .borrow_mut()
642 .push(GroupState::Group {
643 concat,
644 group,
645 ignore_whitespace: old_ignore_whitespace,
646 });
647 self.parser().ignore_whitespace.set(new_ignore_whitespace);
648 Ok(Concat {
649 span: self.span(),
650 asts: vec![],
651 })
652 }
653 }
654 }
655
656 #[inline(never)]
657 fn push_compl_group(&self, concat: Concat) -> Result<Concat> {
658 assert_eq!(self.char(), '~');
659 self.bump();
660 if self.is_eof() || self.char() != '(' {
661 return Err(self.error(self.span(), ast::ErrorKind::ComplementGroupExpected));
662 }
663 let open_span = self.span_char();
664 self.bump();
665 let group = ast::Group {
666 span: open_span,
667 kind: ast::GroupKind::Complement,
668 ast: Box::new(Ast::empty(self.span())),
669 };
670
671 let old_ignore_whitespace = self.ignore_whitespace();
672 let new_ignore_whitespace = group
673 .flags()
674 .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
675 .unwrap_or(old_ignore_whitespace);
676 self.parser()
677 .stack_group
678 .borrow_mut()
679 .push(GroupState::Group {
680 concat,
681 group,
682 ignore_whitespace: old_ignore_whitespace,
683 });
684 self.parser().ignore_whitespace.set(new_ignore_whitespace);
685 Ok(Concat {
686 span: self.span(),
687 asts: vec![],
688 })
689 }
690
691 #[inline(never)]
701 fn pop_group(&self, mut group_concat: Concat) -> Result<Concat> {
702 use self::GroupState::*;
703 assert_eq!(self.char(), ')');
704 let mut stack = self.parser().stack_group.borrow_mut();
705 let topstack = stack.pop();
706
707 let (mut prior_concat, mut group, ignore_whitespace, alt) = match topstack {
708 Some(Group {
709 concat,
710 group,
711 ignore_whitespace,
712 }) => (concat, group, ignore_whitespace, None),
713 Some(Alternation(alt)) => match stack.pop() {
714 Some(Group {
715 concat,
716 group,
717 ignore_whitespace,
718 }) => (
719 concat,
720 group,
721 ignore_whitespace,
722 Some(Either::Left::<ast::Alternation, ast::Intersection>(alt)),
723 ),
724 None | Some(Alternation(_)) | Some(Intersection(_)) => {
725 return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
726 }
727 },
728 Some(Intersection(int)) => match stack.pop() {
729 Some(Group {
730 concat,
731 group,
732 ignore_whitespace,
733 }) => (
734 concat,
735 group,
736 ignore_whitespace,
737 Some(Either::Right::<ast::Alternation, ast::Intersection>(int)),
738 ),
739 None | Some(Alternation(_)) | Some(Intersection(_)) => {
740 return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
741 }
742 },
743
744 None => {
745 return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
746 }
747 };
748 self.parser().ignore_whitespace.set(ignore_whitespace);
749 group_concat.span.end = self.pos();
750 self.bump();
751 group.span.end = self.pos();
752 match alt {
753 Some(Either::Left(mut alt)) => {
754 alt.span.end = group_concat.span.end;
755 alt.asts.push(group_concat.into_ast());
756 group.ast = Box::new(alt.into_ast());
757 }
758 Some(Either::Right(mut int)) => {
759 int.span.end = group_concat.span.end;
760 int.asts.push(group_concat.into_ast());
761 group.ast = Box::new(int.into_ast());
762 }
763 None => {
764 group.ast = Box::new(group_concat.into_ast());
765 }
766 }
767
768 if group.kind == GroupKind::Complement {
769 let complement = ast::Complement {
770 span: self.span(),
771 ast: group.ast,
772 };
773 prior_concat.asts.push(Ast::complement(complement));
774 }
775 else {
777 prior_concat.asts.push(Ast::group(group));
779 }
780 Ok(prior_concat)
781 }
782
783 #[inline(never)]
790 fn pop_group_end(&self, mut concat: ast::Concat) -> Result<Ast> {
791 concat.span.end = self.pos();
792 let mut stack = self.parser().stack_group.borrow_mut();
794 let ast = match stack.pop() {
795 None => Ok(concat.into_ast()),
796 Some(GroupState::Alternation(mut alt)) => {
797 alt.span.end = self.pos();
798 alt.asts.push(concat.into_ast());
799 Ok(Ast::alternation(alt))
800 }
801 Some(GroupState::Intersection(mut int)) => {
802 int.span.end = self.pos();
803 int.asts.push(concat.into_ast());
804
805 Ok(Ast::intersection(int))
807 }
808 Some(GroupState::Group { group, .. }) => {
809 return Err(self.error(group.span, ast::ErrorKind::GroupUnclosed));
810 }
811 };
812 match stack.pop() {
814 None => ast,
815 Some(GroupState::Alternation(_)) => {
816 unreachable!()
823 }
824 Some(GroupState::Intersection(_)) => {
825 unreachable!()
826 }
827 Some(GroupState::Group { group, .. }) => {
828 Err(self.error(group.span, ast::ErrorKind::GroupUnclosed))
829 }
830 }
831 }
832
833 #[inline(never)]
842 fn push_class_open(
843 &self,
844 parent_union: regex_syntax::ast::ClassSetUnion,
845 ) -> Result<regex_syntax::ast::ClassSetUnion> {
846 assert_eq!(self.char(), '[');
847
848 let (nested_set, nested_union) = self.parse_set_class_open()?;
849 self.parser()
850 .stack_class
851 .borrow_mut()
852 .push(ClassState::Open {
853 union: parent_union,
854 set: nested_set,
855 });
856 Ok(nested_union)
857 }
858
859 #[inline(never)]
874 fn pop_class(
875 &self,
876 nested_union: regex_syntax::ast::ClassSetUnion,
877 ) -> Result<Either<regex_syntax::ast::ClassSetUnion, regex_syntax::ast::ClassBracketed>> {
878 assert_eq!(self.char(), ']');
879
880 let item = regex_syntax::ast::ClassSet::Item(nested_union.into_item());
881 let prevset = self.pop_class_op(item);
882 let mut stack = self.parser().stack_class.borrow_mut();
883 match stack.pop() {
884 None => {
885 panic!("unexpected empty character class stack")
894 }
895 Some(ClassState::Op { .. }) => {
896 panic!("unexpected ClassState::Op")
903 }
904 Some(ClassState::Open { mut union, mut set }) => {
905 self.bump();
906 set.span.end = self.pos();
907 set.kind = prevset;
908 if stack.is_empty() {
909 Ok(Either::Right(set))
910 } else {
911 union.push(regex_syntax::ast::ClassSetItem::Bracketed(Box::new(set)));
912 Ok(Either::Left(union))
913 }
914 }
915 }
916 }
917
918 #[inline(never)]
923 fn unclosed_class_error(&self) -> ResharpError {
924 for state in self.parser().stack_class.borrow().iter().rev() {
925 if let ClassState::Open { ref set, .. } = *state {
926 return self.error(set.span, ast::ErrorKind::ClassUnclosed);
927 }
928 }
929 panic!("no open character class found")
932 }
933
934 #[inline(never)]
940 fn push_class_op(
941 &self,
942 next_kind: regex_syntax::ast::ClassSetBinaryOpKind,
943 next_union: regex_syntax::ast::ClassSetUnion,
944 ) -> regex_syntax::ast::ClassSetUnion {
945 let item = regex_syntax::ast::ClassSet::Item(next_union.into_item());
946 let new_lhs = self.pop_class_op(item);
947 self.parser().stack_class.borrow_mut().push(ClassState::Op {
948 kind: next_kind,
949 lhs: new_lhs,
950 });
951 regex_syntax::ast::ClassSetUnion {
952 span: self.span(),
953 items: vec![],
954 }
955 }
956
957 #[inline(never)]
963 fn pop_class_op(&self, rhs: regex_syntax::ast::ClassSet) -> regex_syntax::ast::ClassSet {
964 let mut stack = self.parser().stack_class.borrow_mut();
965 let (kind, lhs) = match stack.pop() {
966 Some(ClassState::Op { kind, lhs }) => (kind, lhs),
967 Some(state @ ClassState::Open { .. }) => {
968 stack.push(state);
969 return rhs;
970 }
971 None => unreachable!(),
972 };
973 let span = Span::new(lhs.span().start, rhs.span().end);
974 regex_syntax::ast::ClassSet::BinaryOp(regex_syntax::ast::ClassSetBinaryOp {
975 span,
976 kind,
977 lhs: Box::new(lhs),
978 rhs: Box::new(rhs),
979 })
980 }
981
982 fn hir_to_node_id(&self, hir: &hir::Hir, tb: &mut TB<'s>) -> Result<NodeId> {
983 match hir.kind() {
984 hir::HirKind::Empty => Ok(NodeId::EPS),
985 hir::HirKind::Literal(l) => {
986 if l.0.len() == 1 {
987 let node = tb.mk_u8(l.0[0]);
988 Ok(node)
989 } else {
990 let ws: Vec<_> = l.0.iter().map(|l| tb.mk_u8(*l)).collect();
991 let conc = tb.mk_concats(ws.iter().copied());
992 Ok(conc)
993 }
994 }
995 hir::HirKind::Class(class) => {
996 match class {
998 hir::Class::Unicode(class_unicode) => {
999 let ranges = class_unicode.ranges();
1000 let mut s1 = NodeId::BOT;
1001 let mut s2 = NodeId::BOT;
1002 let mut s3 = NodeId::BOT;
1003 let mut s4 = NodeId::BOT;
1004 for range in ranges {
1005 for seq in Utf8Sequences::new(range.start(), range.end()) {
1006 if seq.len() > 2 {
1008 continue;
1009 }
1010
1011 let v: Vec<_> = seq
1012 .as_slice()
1013 .iter()
1014 .map(|s|
1015 (s.start, s.end))
1017 .collect();
1018 if v.len() == 1 {
1019 let (start, end) = v[0];
1020 let node = tb.mk_range_u8(start, end);
1021 s1 = tb.mk_union(s1, node);
1022 } else if v.len() == 2 {
1023 let node1 = tb.mk_range_u8(v[0].0, v[0].1);
1024 let node2 = tb.mk_range_u8(v[1].0, v[1].1);
1025 let conc = tb.mk_concat(node1, node2);
1026 s2 = tb.mk_union(s2, conc);
1027 } else if v.len() == 3 {
1028 let node1 = tb.mk_range_u8(v[0].0, v[0].1);
1029 let node2 = tb.mk_range_u8(v[1].0, v[1].1);
1030 let node3 = tb.mk_range_u8(v[2].0, v[2].1);
1031 let conc2 = tb.mk_concat(node2, node3);
1032 let conc1 = tb.mk_concat(node1, conc2);
1033 s3 = tb.mk_union(s3, conc1);
1034 } else {
1035 let mut conc = NodeId::EPS;
1036 for i in (0..4).rev() {
1037 let node = tb.mk_range_u8(v[i].0, v[i].1);
1038 conc = tb.mk_concat(conc, node);
1039 }
1040 s4 = tb.mk_union(s4, conc);
1041 }
1042 }
1043 }
1044 let merged = tb.mk_union(s2, s1);
1045 let merged = tb.mk_union(s3, merged);
1046 let merged = tb.mk_union(s4, merged);
1047 Ok(merged)
1048 }
1049 hir::Class::Bytes(class_bytes) => {
1050 let ranges = class_bytes.ranges();
1051 let mut result = NodeId::BOT;
1052 for range in ranges {
1053 let start = range.start();
1054 let end = range.end();
1055 let node = tb.mk_range_u8(start, end);
1056 result = tb.mk_union(result, node);
1057 }
1058 Ok(result)
1059 }
1060 }
1061 }
1062 hir::HirKind::Look(_) => todo!(),
1063 hir::HirKind::Repetition(_) => todo!(),
1064 hir::HirKind::Capture(_) => todo!(),
1065 hir::HirKind::Concat(body) => {
1066 let mut result = NodeId::EPS;
1067 for child in body {
1068 let node = self.hir_to_node_id(child, tb)?;
1069 result = tb.mk_concat(result, node);
1070 }
1071 Ok(result)
1072 }
1073 hir::HirKind::Alternation(_) => todo!(),
1074 }
1075 }
1076
1077 fn translate_ast_to_hir(
1078 &mut self,
1079 orig_ast: ®ex_syntax::ast::Ast,
1080 tb: &mut TB<'s>,
1081 ) -> Result<NodeId> {
1082 match self.translator.translate("", orig_ast) {
1085 Err(_) => return Err(self.error(self.span(), ast::ErrorKind::UnicodeClassInvalid)),
1086 Ok(hir) => {
1087 let mapped = self.hir_to_node_id(&hir, tb);
1088 mapped
1089 }
1090 }
1091 }
1092
1093 fn translator_to_node_id(
1094 &mut self,
1095 orig_ast: ®ex_syntax::ast::Ast,
1096 translator: &mut Option<Translator>,
1097 tb: &mut TB<'s>,
1098 ) -> Result<NodeId> {
1099 match translator {
1100 Some(tr) => {
1101 let hir = tr
1102 .translate("", &orig_ast)
1103 .map_err(|e| self.unsupported_error(e))?;
1104 self.hir_to_node_id(&hir, tb)
1105 }
1106 None => self.translate_ast_to_hir(&orig_ast, tb),
1107 }
1108 }
1109
1110 fn get_class(
1111 &mut self,
1112 negated: bool,
1113 kind: regex_syntax::ast::ClassPerlKind,
1114 tb: &mut TB<'s>,
1115 ) -> Result<NodeId> {
1116 let w = self
1118 .perl_classes
1119 .iter()
1120 .find(|(c_neg, c_kind, _)| *c_kind == kind && *c_neg == negated);
1121 match w {
1122 Some((_, _, value)) => return Ok(*value),
1123 None => {
1124 let tmp = regex_syntax::ast::ClassPerl {
1125 span: Span::splat(Position::new(0, 0, 0)),
1126 negated,
1127 kind: kind.clone(),
1128 };
1129 let word_ast = regex_syntax::ast::Ast::class_perl(tmp);
1130 let translated = self.translate_ast_to_hir(&word_ast, tb)?;
1131 self.perl_classes.push((negated, kind, translated));
1132 Ok(translated)
1133 }
1134 }
1135 }
1136
1137 fn word_char_kind(ast: &Ast, left: bool) -> WordCharKind {
1138 use WordCharKind::*;
1139 match ast {
1140 Ast::Literal(lit) => {
1141 if is_word_byte(lit.c as u8) {
1142 Word
1143 } else {
1144 NonWord
1145 }
1146 }
1147 Ast::ClassPerl(c) => match (&c.kind, c.negated) {
1148 (®ex_syntax::ast::ClassPerlKind::Word, false) => Word,
1149 (®ex_syntax::ast::ClassPerlKind::Word, true) => NonWord,
1150 (®ex_syntax::ast::ClassPerlKind::Space, false) => NonWord,
1151 (®ex_syntax::ast::ClassPerlKind::Digit, false) => Word,
1152 _ => Unknown,
1153 },
1154 Ast::Dot(_) | Ast::Top(_) => Unknown,
1155 Ast::Group(g) => Self::word_char_kind(&g.ast, left),
1156 Ast::Concat(c) if !c.asts.is_empty() => {
1157 Self::word_char_kind(&c.asts[if left { c.asts.len() - 1 } else { 0 }], left)
1158 }
1159 Ast::Repetition(r) => {
1160 let inner = Self::word_char_kind(&r.ast, left);
1161 let nullable = matches!(
1162 &r.op.kind,
1163 ast::RepetitionKind::ZeroOrMore
1164 | ast::RepetitionKind::ZeroOrOne
1165 | ast::RepetitionKind::Range(ast::RepetitionRange::Bounded(0, _))
1166 );
1167 if nullable {
1168 match inner {
1169 Word => MaybeWord,
1170 NonWord => MaybeNonWord,
1171 _ => Unknown,
1172 }
1173 } else {
1174 inner
1175 }
1176 }
1177 Ast::Lookaround(la) => Self::word_char_kind(&la.ast, left),
1178 _ => Unknown,
1179 }
1180 }
1181
1182 fn concat_neighbor_kind(asts: &[Ast], idx: usize, dir: isize) -> WordCharKind {
1183 use WordCharKind::*;
1184 let next = idx as isize + dir;
1185 if next < 0 || next >= asts.len() as isize {
1186 return Edge;
1187 }
1188 let kind = Self::word_char_kind(&asts[next as usize], dir < 0);
1189 match kind {
1190 MaybeWord => match Self::concat_neighbor_kind(asts, next as usize, dir) {
1191 Word => Word,
1192 _ => Unknown,
1193 },
1194 MaybeNonWord => match Self::concat_neighbor_kind(asts, next as usize, dir) {
1195 NonWord => NonWord,
1196 _ => Unknown,
1197 },
1198 other => other,
1199 }
1200 }
1201
1202 fn rewrite_word_boundary_in_concat(
1203 &mut self,
1204 asts: &[Ast],
1205 idx: usize,
1206 translator: &mut Option<Translator>,
1207 tb: &mut TB<'s>,
1208 ) -> Result<(NodeId, usize)> {
1209 use WordCharKind::*;
1210 let left = Self::concat_neighbor_kind(asts, idx, -1);
1211 let right = Self::concat_neighbor_kind(asts, idx, 1);
1212 let word_id = self.get_class(false, regex_syntax::ast::ClassPerlKind::Word, tb)?;
1213 let not_word_id = self.get_class(true, regex_syntax::ast::ClassPerlKind::Word, tb)?;
1214
1215 match (left, right) {
1216 (NonWord, Word) | (Word, NonWord) => Ok((NodeId::EPS, idx + 1)),
1217 (Word, _) => {
1218 let set = tb.mk_union(NodeId::END, not_word_id);
1219 let tail = tb.mk_concat(set, NodeId::TS);
1220 self.merge_boundary_with_following_lookaheads(asts, idx, tail, translator, tb)
1221 }
1222 (NonWord, _) => {
1223 let set = tb.mk_union(NodeId::END, word_id);
1224 let tail = tb.mk_concat(set, NodeId::TS);
1225 self.merge_boundary_with_following_lookaheads(asts, idx, tail, translator, tb)
1226 }
1227 (_, Word) => {
1228 let body = tb.mk_union(NodeId::BEGIN, not_word_id);
1229 Ok((tb.mk_lookbehind(body, NodeId::MISSING), idx + 1))
1230 }
1231 (_, NonWord) => {
1232 let body = tb.mk_union(NodeId::BEGIN, word_id);
1233 Ok((tb.mk_lookbehind(body, NodeId::MISSING), idx + 1))
1234 }
1235 _ => Ok((self.make_full_word_boundary(tb)?, idx + 1)),
1236 }
1237 }
1238
1239 fn merge_boundary_with_following_lookaheads(
1240 &mut self,
1241 asts: &[Ast],
1242 wb_idx: usize,
1243 boundary_tail: NodeId,
1244 translator: &mut Option<Translator>,
1245 tb: &mut TB<'s>,
1246 ) -> Result<(NodeId, usize)> {
1247 let mut next = wb_idx + 1;
1248 let mut la_bodies = vec![boundary_tail];
1249 while next < asts.len() {
1250 match &asts[next] {
1251 Ast::Lookaround(la) if la.kind == ast::LookaroundKind::PositiveLookahead => {
1252 let body = self.ast_to_node_id(&la.ast, translator, tb)?;
1253 la_bodies.push(tb.mk_concat(body, NodeId::TS));
1254 next += 1;
1255 }
1256 _ => break,
1257 }
1258 }
1259 let merged = tb.mk_inters(la_bodies.into_iter());
1260 Ok((tb.mk_lookahead(merged, NodeId::MISSING, 0), next))
1261 }
1262
1263 fn make_full_word_boundary(&mut self, tb: &mut TB<'s>) -> Result<NodeId> {
1264 let w = self.get_class(false, regex_syntax::ast::ClassPerlKind::Word, tb)?;
1265 let nw = self.get_class(true, regex_syntax::ast::ClassPerlKind::Word, tb)?;
1266 let lb_w = tb.mk_lookbehind(w, NodeId::MISSING);
1267 let la_nw = tb.mk_lookahead(nw, NodeId::MISSING, 0);
1268 let c1 = tb.mk_concat(lb_w, la_nw);
1269 let lb_nw = tb.mk_lookbehind(nw, NodeId::MISSING);
1270 let la_w = tb.mk_lookahead(w, NodeId::MISSING, 0);
1271 let c2 = tb.mk_concat(lb_nw, la_w);
1272 Ok(tb.mk_union(c1, c2))
1273 }
1274
1275 fn ast_to_node_id(
1276 &mut self,
1277 ast: &Ast,
1278 translator: &mut Option<Translator>,
1279 tb: &mut TB<'s>,
1280 ) -> Result<NodeId> {
1281 match ast {
1282 Ast::Empty(_) => Ok(NodeId::EPS),
1283 Ast::Flags(f) => {
1284 let mut translator_builder = Self::default_translator_builder();
1285 if let Some(state) = f.flags.flag_state(ast::Flag::CaseInsensitive) {
1286 translator_builder.case_insensitive(state);
1287 }
1288 if let Some(state) = f.flags.flag_state(ast::Flag::Unicode) {
1289 translator_builder.unicode(state);
1290 }
1291 let concat_translator = Some(translator_builder.build());
1292 *translator = concat_translator;
1293 Ok(NodeId::EPS)
1294 }
1295 Ast::Literal(l) => {
1296 let ast_lit = regex_syntax::ast::Ast::literal(*l.to_owned());
1297 self.translator_to_node_id(&ast_lit, translator, tb)
1298 }
1299 Ast::Top(_) => Ok(NodeId::TOP),
1300 Ast::Dot(_) => {
1301 let hirv = hir::Hir::dot(hir::Dot::AnyByteExceptLF);
1302 self.hir_to_node_id(&hirv, tb)
1303 }
1304 Ast::Assertion(a) => match &a.kind {
1305 ast::AssertionKind::StartText => Ok(NodeId::BEGIN),
1306 ast::AssertionKind::EndText => Ok(NodeId::END),
1307 ast::AssertionKind::WordBoundary => {
1308 let word_id =
1309 self.get_class(false, regex_syntax::ast::ClassPerlKind::Word, tb)?;
1310 let not_word_id =
1311 self.get_class(true, regex_syntax::ast::ClassPerlKind::Word, tb)?;
1312 let case1_1 = tb.mk_lookbehind(word_id, NodeId::MISSING);
1314 let case1_2 = tb.mk_lookahead(not_word_id, NodeId::MISSING, 0);
1315 let case1 = tb.mk_concat(case1_1, case1_2);
1316 let case2_1 = tb.mk_lookbehind(not_word_id, NodeId::MISSING);
1318 let case2_2 = tb.mk_lookahead(word_id, NodeId::MISSING, 0);
1319 let case2 = tb.mk_concat(case2_1, case2_2);
1320 Ok(tb.mk_union(case1, case2))
1321 }
1322 ast::AssertionKind::NotWordBoundary => {
1323 return Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))
1324 }
1325 ast::AssertionKind::StartLine => {
1326 let left = NodeId::BEGIN;
1327 let right = tb.mk_u8('\n' as u8);
1328 let union = tb.mk_union(left, right);
1329 Ok(tb.mk_lookbehind(union, NodeId::MISSING))
1330 }
1331 ast::AssertionKind::EndLine => {
1332 let left = NodeId::END;
1333 let right = tb.mk_u8('\n' as u8);
1334 let union = tb.mk_union(left, right);
1335 Ok(tb.mk_lookahead(union, NodeId::MISSING, 0))
1336 }
1337 ast::AssertionKind::WordBoundaryStart => todo!(),
1338 ast::AssertionKind::WordBoundaryEnd => todo!(),
1339 ast::AssertionKind::WordBoundaryStartAngle => todo!(),
1340 ast::AssertionKind::WordBoundaryEndAngle => Ok(tb.mk_string(">")),
1341 ast::AssertionKind::WordBoundaryStartHalf => todo!(),
1342 ast::AssertionKind::WordBoundaryEndHalf => todo!(),
1343 },
1344 Ast::ClassUnicode(c) => {
1345 let tmp = regex_syntax::ast::ClassUnicode {
1346 span: c.span,
1347 negated: c.negated,
1348 kind: c.kind.clone(),
1349 };
1350 if !c.negated {
1351 match &c.kind {
1352 regex_syntax::ast::ClassUnicodeKind::Named(s) => match s.as_str() {
1353 "ascii" => return Ok(tb.mk_range_u8(0, 127)),
1355 "utf8" => {
1357 let ascii = tb.mk_range_u8(0, 127);
1358 let beta = tb.mk_range_u8(128, 0xBF);
1359 let c0 = tb.mk_range_u8(0xC0, 0xDF);
1360 let c0s = tb.mk_concats([c0, beta].into_iter());
1361 let e0 = tb.mk_range_u8(0xE0, 0xEF);
1362 let e0s = tb.mk_concats([e0, beta, beta].into_iter());
1363 let f0 = tb.mk_range_u8(0xF0, 0xF7);
1364 let f0s = tb.mk_concats([f0, beta, beta, beta].into_iter());
1365 let merged = tb.mk_unions([ascii, c0s, e0s, f0s].into_iter());
1366 return Ok(tb.mk_star(merged));
1367 }
1368 "hex" => {
1369 let nums = tb.mk_range_u8(b'0', b'9');
1370 let lets = tb.mk_range_u8(b'a', b'f');
1371 let lets2 = tb.mk_range_u8(b'A', b'F');
1372 let merged = tb.mk_unions([nums, lets, lets2].into_iter());
1373 return Ok(merged);
1374 }
1375 _ => {}
1376 },
1377 _ => {}
1378 };
1379 }
1380
1381 let orig_ast = regex_syntax::ast::Ast::class_unicode(tmp);
1382 self.translator_to_node_id(&orig_ast, translator, tb)
1383 }
1384 Ast::ClassPerl(c) => {
1385 let tmp = regex_syntax::ast::ClassPerl {
1386 span: c.span,
1387 negated: c.negated,
1388 kind: c.kind.clone(),
1389 };
1390 let orig_ast = regex_syntax::ast::Ast::class_perl(tmp);
1391 self.translator_to_node_id(&orig_ast, translator, tb)
1392 }
1393 Ast::ClassBracketed(c) => match &c.kind {
1394 regex_syntax::ast::ClassSet::Item(_) => {
1395 let tmp = regex_syntax::ast::ClassBracketed {
1396 span: c.span,
1397 negated: c.negated,
1398 kind: c.kind.clone(),
1399 };
1400 let orig_ast = regex_syntax::ast::Ast::class_bracketed(tmp);
1401 self.translator_to_node_id(&orig_ast, translator, tb)
1402 }
1403 regex_syntax::ast::ClassSet::BinaryOp(_) => todo!(),
1404 },
1405 Ast::Repetition(r) => {
1406 let body = self.ast_to_node_id(&r.ast, translator, tb);
1407 match body {
1408 Ok(body) => match &r.op.kind {
1409 ast::RepetitionKind::ZeroOrOne => Ok(tb.mk_opt(body)),
1410 ast::RepetitionKind::ZeroOrMore => Ok(tb.mk_star(body)),
1411 ast::RepetitionKind::OneOrMore => Ok(tb.mk_plus(body)),
1412 ast::RepetitionKind::Range(r) => match r {
1413 ast::RepetitionRange::Exactly(n) => Ok(tb.mk_repeat(body, *n, *n)),
1414 ast::RepetitionRange::AtLeast(n) => {
1415 let rep = tb.mk_repeat(body, *n, *n);
1416 let st = tb.mk_star(body);
1417 Ok(tb.mk_concat(rep, st))
1418 }
1419
1420 ast::RepetitionRange::Bounded(n, m) => Ok(tb.mk_repeat(body, *n, *m)),
1421 },
1422 },
1423 Err(_) => body,
1424 }
1425 }
1426 Ast::Lookaround(g) => {
1427 let body = self.ast_to_node_id(&g.ast, translator, tb)?;
1428 match g.kind {
1429 ast::LookaroundKind::PositiveLookahead => {
1430 Ok(tb.mk_lookahead(body, NodeId::MISSING, 0))
1431 }
1432 ast::LookaroundKind::PositiveLookbehind => {
1433 Ok(tb.mk_lookbehind(body, NodeId::MISSING))
1434 }
1435 ast::LookaroundKind::NegativeLookahead => Ok(tb.mk_neg_lookahead(body, 0)),
1436 ast::LookaroundKind::NegativeLookbehind => Ok(tb.mk_neg_lookbehind(body)),
1437 }
1438 }
1439 Ast::Group(g) => {
1440 let child = self.ast_to_node_id(&g.ast, translator, tb);
1441 child
1442 }
1443 Ast::Alternation(a) => {
1444 let mut children = vec![];
1445 for ast in &a.asts {
1446 match self.ast_to_node_id(ast, translator, tb) {
1447 Ok(node_id) => children.push(node_id),
1448 Err(err) => return Err(err),
1449 }
1450 }
1451 Ok(tb.mk_unions(children.iter().copied()))
1452 }
1453 Ast::Concat(c) => {
1454 let mut concat_translator: Option<Translator> = None;
1455 let mut children = vec![];
1456 let mut i = 0;
1457 while i < c.asts.len() {
1458 let ast = &c.asts[i];
1459 match ast {
1460 Ast::Flags(f) => {
1461 let mut translator_builder = Self::default_translator_builder();
1462 translator_builder.utf8(false);
1463 if let Some(state) = f.flags.flag_state(ast::Flag::CaseInsensitive) {
1464 translator_builder.case_insensitive(state);
1465 }
1466 if let Some(state) = f.flags.flag_state(ast::Flag::Unicode) {
1467 translator_builder.unicode(state);
1468 }
1469 concat_translator = Some(translator_builder.build());
1470 i += 1;
1471 continue;
1472 }
1473 Ast::Assertion(a) if a.kind == ast::AssertionKind::WordBoundary => {
1474 let node =
1475 self.rewrite_word_boundary_in_concat(&c.asts, i, translator, tb)?;
1476 children.push(node.0);
1477 i = node.1; continue;
1479 }
1480 _ => {}
1481 }
1482 match concat_translator {
1483 Some(_) => match self.ast_to_node_id(ast, &mut concat_translator, tb) {
1484 Ok(node_id) => children.push(node_id),
1485 Err(err) => return Err(err),
1486 },
1487 None => match self.ast_to_node_id(ast, translator, tb) {
1488 Ok(node_id) => children.push(node_id),
1489 Err(err) => return Err(err),
1490 },
1491 }
1492 i += 1;
1493 }
1494 Ok(tb.mk_concats(children.iter().cloned()))
1495 }
1496 Ast::Intersection(intersection) => {
1497 let mut children = vec![];
1498 for ast in &intersection.asts {
1499 match self.ast_to_node_id(ast, translator, tb) {
1500 Ok(node_id) => children.push(node_id),
1501 Err(err) => return Err(err),
1502 }
1503 }
1504 Ok(tb.mk_inters(children.into_iter()))
1505 }
1506 Ast::Complement(complement) => {
1507 let body = self.ast_to_node_id(&complement.ast, translator, tb);
1508 body.map(|x| tb.mk_compl(x))
1509 }
1510 }
1511 }
1512
1513 fn parse(&mut self, tb: &mut TB<'s>) -> Result<NodeId> {
1516 let mut concat = Concat {
1517 span: self.span(),
1518 asts: vec![],
1519 };
1520 loop {
1521 self.bump_space();
1522 if self.is_eof() {
1523 break;
1524 }
1525 match self.char() {
1526 '(' => concat = self.push_group(concat)?,
1527 ')' => concat = self.pop_group(concat)?,
1528 '|' => concat = self.push_alternate(concat)?,
1529 '&' => concat = self.push_intersect(concat)?,
1530 '~' => concat = self.push_compl_group(concat)?,
1531 '[' => {
1532 let class = self.parse_set_class()?;
1533 concat.asts.push(Ast::class_bracketed(class));
1534 }
1535 '?' => {
1536 concat =
1537 self.parse_uncounted_repetition(concat, ast::RepetitionKind::ZeroOrOne)?;
1538 }
1539 '*' => {
1540 concat =
1541 self.parse_uncounted_repetition(concat, ast::RepetitionKind::ZeroOrMore)?;
1542 }
1543 '+' => {
1544 concat =
1545 self.parse_uncounted_repetition(concat, ast::RepetitionKind::OneOrMore)?;
1546 }
1547 '{' => {
1548 concat = self.parse_counted_repetition(concat)?;
1549 }
1550 _ => concat.asts.push(self.parse_primitive()?.into_ast()),
1551 }
1552 }
1553 let ast = self.pop_group_end(concat)?;
1554 let node_id = self.ast_to_node_id(&ast, &mut None, tb);
1556 node_id
1557 }
1558
1559 #[inline(never)]
1560 fn parse_uncounted_repetition(
1561 &self,
1562 mut concat: ast::Concat,
1563 kind: ast::RepetitionKind,
1564 ) -> Result<ast::Concat> {
1565 let op_start = self.pos();
1567 let ast = match concat.asts.pop() {
1568 Some(ast) => ast,
1569 None => return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing)),
1570 };
1571 match ast {
1572 Ast::Empty(_) | Ast::Flags(_) => {
1573 return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing))
1574 }
1575 _ => {}
1576 }
1577 if self.bump() && self.char() == '?' {
1578 return Err(self.error(
1579 Span::new(op_start, self.pos()),
1580 ast::ErrorKind::UnsupportedLazyQuantifier,
1581 ));
1582 }
1583 concat.asts.push(Ast::repetition(ast::Repetition {
1584 span: ast.span().with_end(self.pos()),
1585 op: ast::RepetitionOp {
1586 span: Span::new(op_start, self.pos()),
1587 kind,
1588 },
1589 greedy: true,
1590 ast: Box::new(ast),
1591 }));
1592 Ok(concat)
1593 }
1594
1595 #[inline(never)]
1596 fn parse_counted_repetition(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
1597 assert!(self.char() == '{');
1598 let start = self.pos();
1599 let ast = match concat.asts.pop() {
1600 Some(ast) => ast,
1601 None => return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing)),
1602 };
1603 match ast {
1604 Ast::Empty(_) | Ast::Flags(_) => {
1605 return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing))
1606 }
1607 _ => {}
1608 }
1609 if !self.bump_and_bump_space() {
1610 return Err(self.error(
1611 Span::new(start, self.pos()),
1612 ast::ErrorKind::RepetitionCountUnclosed,
1613 ));
1614 }
1615 let count_start = specialize_err(
1616 self.parse_decimal(),
1617 ast::ErrorKind::DecimalEmpty,
1618 ast::ErrorKind::RepetitionCountDecimalEmpty,
1619 );
1620 if self.is_eof() {
1621 return Err(self.error(
1622 Span::new(start, self.pos()),
1623 ast::ErrorKind::RepetitionCountUnclosed,
1624 ));
1625 }
1626 let range = if self.char() == ',' {
1627 if !self.bump_and_bump_space() {
1628 return Err(self.error(
1629 Span::new(start, self.pos()),
1630 ast::ErrorKind::RepetitionCountUnclosed,
1631 ));
1632 }
1633 if self.char() != '}' {
1634 let count_start = match count_start {
1635 Ok(c) => c,
1636 Err(err) if err.kind == ast::ErrorKind::RepetitionCountDecimalEmpty => {
1637 if self.parser().empty_min_range {
1638 0
1639 } else {
1640 return Err(err);
1641 }
1642 }
1643 err => err?,
1644 };
1645 let count_end = specialize_err(
1646 self.parse_decimal(),
1647 ast::ErrorKind::DecimalEmpty,
1648 ast::ErrorKind::RepetitionCountDecimalEmpty,
1649 )?;
1650 ast::RepetitionRange::Bounded(count_start, count_end)
1651 } else {
1652 ast::RepetitionRange::AtLeast(count_start?)
1653 }
1654 } else {
1655 ast::RepetitionRange::Exactly(count_start?)
1656 };
1657
1658 if self.is_eof() || self.char() != '}' {
1659 return Err(self.error(
1660 Span::new(start, self.pos()),
1661 ast::ErrorKind::RepetitionCountUnclosed,
1662 ));
1663 }
1664
1665 if self.bump_and_bump_space() && self.char() == '?' {
1666 return Err(self.error(
1667 Span::new(start, self.pos()),
1668 ast::ErrorKind::UnsupportedLazyQuantifier,
1669 ));
1670 }
1671
1672 let op_span = Span::new(start, self.pos());
1673 if !range.is_valid() {
1674 return Err(self.error(op_span, ast::ErrorKind::RepetitionCountInvalid));
1675 }
1676 concat.asts.push(Ast::repetition(ast::Repetition {
1677 span: ast.span().with_end(self.pos()),
1678 op: ast::RepetitionOp {
1679 span: op_span,
1680 kind: ast::RepetitionKind::Range(range),
1681 },
1682 greedy: true,
1683 ast: Box::new(ast),
1684 }));
1685 Ok(concat)
1686 }
1687
1688 #[inline(never)]
1689 fn parse_group(&self) -> Result<Either<ast::SetFlags, ast::Group>> {
1690 assert_eq!(self.char(), '(');
1691 let open_span = self.span_char();
1692 self.bump();
1693 self.bump_space();
1694 if let Some((ahead, pos)) = self.is_lookaround_prefix() {
1695 let kind = match (pos, ahead) {
1696 (true, true) => LookaroundKind::PositiveLookahead,
1697 (true, false) => LookaroundKind::PositiveLookbehind,
1698 (false, true) => LookaroundKind::NegativeLookahead,
1699 (false, false) => LookaroundKind::NegativeLookbehind,
1700 };
1701 return Ok(Either::Right(ast::Group {
1702 span: open_span,
1703 kind: ast::GroupKind::Lookaround(kind),
1704 ast: Box::new(Ast::empty(self.span())),
1705 }));
1706 }
1707 let inner_span = self.span();
1708 let mut starts_with_p = true;
1709 if self.bump_if("?P<") || {
1710 starts_with_p = false;
1711 self.bump_if("?<")
1712 } {
1713 let capture_index = self.next_capture_index(open_span)?;
1714 let name = self.parse_capture_name(capture_index)?;
1715 Ok(Either::Right(ast::Group {
1716 span: open_span,
1717 kind: ast::GroupKind::CaptureName {
1718 starts_with_p,
1719 name,
1720 },
1721 ast: Box::new(Ast::empty(self.span())),
1722 }))
1723 } else if self.bump_if("?") {
1724 if self.is_eof() {
1725 return Err(self.error(open_span, ast::ErrorKind::GroupUnclosed));
1726 }
1727 let flags = self.parse_flags()?;
1728 let char_end = self.char();
1729 self.bump();
1730 if char_end == ')' {
1731 if flags.items.is_empty() {
1734 return Err(self.error(inner_span, ast::ErrorKind::RepetitionMissing));
1735 }
1736 Ok(Either::Left(ast::SetFlags {
1737 span: Span {
1738 end: self.pos(),
1739 ..open_span
1740 },
1741 flags,
1742 }))
1743 } else {
1744 assert_eq!(char_end, ':');
1745 Ok(Either::Right(ast::Group {
1746 span: open_span,
1747 kind: ast::GroupKind::NonCapturing(flags),
1748 ast: Box::new(Ast::empty(self.span())),
1749 }))
1750 }
1751 } else {
1752 let capture_index = self.next_capture_index(open_span)?;
1753 Ok(Either::Right(ast::Group {
1754 span: open_span,
1755 kind: ast::GroupKind::CaptureIndex(capture_index),
1756 ast: Box::new(Ast::empty(self.span())),
1757 }))
1758 }
1759 }
1760
1761 #[inline(never)]
1762 fn parse_capture_name(&self, capture_index: u32) -> Result<ast::CaptureName> {
1763 if self.is_eof() {
1764 return Err(self.error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
1765 }
1766 let start = self.pos();
1767 loop {
1768 if self.char() == '>' {
1769 break;
1770 }
1771 if !is_capture_char(self.char(), self.pos() == start) {
1772 return Err(self.error(self.span_char(), ast::ErrorKind::GroupNameInvalid));
1773 }
1774 if !self.bump() {
1775 break;
1776 }
1777 }
1778 let end = self.pos();
1779 if self.is_eof() {
1780 return Err(self.error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
1781 }
1782 assert_eq!(self.char(), '>');
1783 self.bump();
1784 let name = &self.pattern()[start.offset..end.offset];
1785 if name.is_empty() {
1786 return Err(self.error(Span::new(start, start), ast::ErrorKind::GroupNameEmpty));
1787 }
1788 let capname = ast::CaptureName {
1789 span: Span::new(start, end),
1790 name: name.to_string(),
1791 index: capture_index,
1792 };
1793 self.add_capture_name(&capname)?;
1794 Ok(capname)
1795 }
1796
1797 #[inline(never)]
1798 fn parse_flags(&self) -> Result<ast::Flags> {
1799 let mut flags = ast::Flags {
1800 span: self.span(),
1801 items: vec![],
1802 };
1803 let mut last_was_negation = None;
1804 while self.char() != ':' && self.char() != ')' {
1805 if self.char() == '-' {
1806 last_was_negation = Some(self.span_char());
1807 let item = ast::FlagsItem {
1808 span: self.span_char(),
1809 kind: ast::FlagsItemKind::Negation,
1810 };
1811 if let Some(i) = flags.add_item(item) {
1812 return Err(self.error(
1813 self.span_char(),
1814 ast::ErrorKind::FlagRepeatedNegation {
1815 original: flags.items[i].span,
1816 },
1817 ));
1818 }
1819 } else {
1820 last_was_negation = None;
1821 let item = ast::FlagsItem {
1822 span: self.span_char(),
1823 kind: ast::FlagsItemKind::Flag(self.parse_flag()?),
1824 };
1825 if let Some(i) = flags.add_item(item) {
1826 return Err(self.error(
1827 self.span_char(),
1828 ast::ErrorKind::FlagDuplicate {
1829 original: flags.items[i].span,
1830 },
1831 ));
1832 }
1833 }
1834 if !self.bump() {
1835 return Err(self.error(self.span(), ast::ErrorKind::FlagUnexpectedEof));
1836 }
1837 }
1838 if let Some(span) = last_was_negation {
1839 return Err(self.error(span, ast::ErrorKind::FlagDanglingNegation));
1840 }
1841 flags.span.end = self.pos();
1842 Ok(flags)
1843 }
1844
1845 #[inline(never)]
1846 fn parse_flag(&self) -> Result<ast::Flag> {
1847 match self.char() {
1848 'i' => Ok(ast::Flag::CaseInsensitive),
1849 'm' => Ok(ast::Flag::MultiLine),
1850 's' => Ok(ast::Flag::DotMatchesNewLine),
1851 'U' => Ok(ast::Flag::SwapGreed),
1852 'u' => Ok(ast::Flag::Unicode),
1853 'R' => Ok(ast::Flag::CRLF),
1854 'x' => Ok(ast::Flag::IgnoreWhitespace),
1855 _ => Err(self.error(self.span_char(), ast::ErrorKind::FlagUnrecognized)),
1856 }
1857 }
1858
1859 fn parse_primitive(&self) -> Result<Primitive> {
1860 match self.char() {
1861 '\\' => self.parse_escape(),
1862 '_' => {
1863 let ast = Primitive::Top(self.span_char());
1864 self.bump();
1865 Ok(ast)
1866 }
1867 '.' => {
1868 let ast = Primitive::Dot(self.span_char());
1869 self.bump();
1870 Ok(ast)
1871 }
1872 '^' => {
1873 let ast = Primitive::Assertion(ast::Assertion {
1874 span: self.span_char(),
1875 kind: ast::AssertionKind::StartLine,
1876 });
1877 self.bump();
1878 Ok(ast)
1879 }
1880 '$' => {
1881 let ast = Primitive::Assertion(ast::Assertion {
1882 span: self.span_char(),
1883 kind: ast::AssertionKind::EndLine,
1884 });
1885 self.bump();
1886 Ok(ast)
1887 }
1888 c => {
1889 let ast = Primitive::Literal(Literal {
1890 span: self.span_char(),
1891 kind: LiteralKind::Verbatim,
1892 c,
1893 });
1894 self.bump();
1895 Ok(ast)
1896 }
1897 }
1898 }
1899
1900 #[inline(never)]
1901 fn parse_escape(&self) -> Result<Primitive> {
1902 assert_eq!(self.char(), '\\');
1903 let start = self.pos();
1904 if !self.bump() {
1905 return Err(self.error(
1906 Span::new(start, self.pos()),
1907 ast::ErrorKind::EscapeUnexpectedEof,
1908 ));
1909 }
1910 let c = self.char();
1911 match c {
1913 '0'..='9' => {
1914 if !self.parser().octal {
1915 return Err(self.error(
1916 Span::new(start, self.span_char().end),
1917 ast::ErrorKind::UnsupportedBackreference,
1918 ));
1919 }
1920 let mut lit = self.parse_octal();
1921 lit.span.start = start;
1922 return Ok(Primitive::Literal(lit));
1923 }
1924 'x' | 'u' | 'U' => {
1931 let mut lit = self.parse_hex()?;
1932 lit.span.start = start;
1933 return Ok(Primitive::Literal(lit));
1934 }
1935 'p' | 'P' => {
1936 let mut cls = self.parse_unicode_class()?;
1937 cls.span.start = start;
1938 return Ok(Primitive::Unicode(cls));
1939 }
1940 'd' | 's' | 'w' | 'D' | 'S' | 'W' => {
1941 let mut cls = self.parse_perl_class();
1942 cls.span.start = start;
1943 return Ok(Primitive::Perl(cls));
1944 }
1945 _ => {}
1946 }
1947
1948 self.bump();
1950 let span = Span::new(start, self.pos());
1951 if is_meta_character(c) {
1952 return Ok(Primitive::Literal(Literal {
1953 span,
1954 kind: LiteralKind::Meta,
1955 c,
1956 }));
1957 }
1958 if is_escapeable_character(c) {
1959 return Ok(Primitive::Literal(Literal {
1960 span,
1961 kind: LiteralKind::Superfluous,
1962 c,
1963 }));
1964 }
1965 let special = |kind, c| {
1966 Ok(Primitive::Literal(Literal {
1967 span,
1968 kind: LiteralKind::Special(kind),
1969 c,
1970 }))
1971 };
1972 match c {
1973 'a' => special(SpecialLiteralKind::Bell, '\x07'),
1974 'f' => special(SpecialLiteralKind::FormFeed, '\x0C'),
1975 't' => special(SpecialLiteralKind::Tab, '\t'),
1976 'n' => special(SpecialLiteralKind::LineFeed, '\n'),
1977 'r' => special(SpecialLiteralKind::CarriageReturn, '\r'),
1978 'v' => special(SpecialLiteralKind::VerticalTab, '\x0B'),
1979 'A' => Ok(Primitive::Assertion(ast::Assertion {
1980 span,
1981 kind: ast::AssertionKind::StartText,
1982 })),
1983 'z' => Ok(Primitive::Assertion(ast::Assertion {
1984 span,
1985 kind: ast::AssertionKind::EndText,
1986 })),
1987 'b' => {
1988 let mut wb = ast::Assertion {
1989 span,
1990 kind: ast::AssertionKind::WordBoundary,
1991 };
1992 if !self.is_eof() && self.char() == '{' {
1995 if let Some(kind) = self.maybe_parse_special_word_boundary(start)? {
1996 wb.kind = kind;
1997 wb.span.end = self.pos();
1998 }
1999 }
2000 Ok(Primitive::Assertion(wb))
2001 }
2002 'B' => Ok(Primitive::Assertion(ast::Assertion {
2003 span,
2004 kind: ast::AssertionKind::NotWordBoundary,
2005 })),
2006 '<' => Ok(Primitive::Assertion(ast::Assertion {
2007 span,
2008 kind: ast::AssertionKind::WordBoundaryStartAngle,
2009 })),
2010 '>' => Ok(Primitive::Assertion(ast::Assertion {
2011 span,
2012 kind: ast::AssertionKind::WordBoundaryEndAngle,
2013 })),
2014 _ => Err(self.error(span, ast::ErrorKind::EscapeUnrecognized)),
2015 }
2016 }
2017
2018 fn maybe_parse_special_word_boundary(
2019 &self,
2020 wb_start: Position,
2021 ) -> Result<Option<ast::AssertionKind>> {
2022 assert_eq!(self.char(), '{');
2023
2024 let is_valid_char = |c| match c {
2025 'A'..='Z' | 'a'..='z' | '-' => true,
2026 _ => false,
2027 };
2028 let start = self.pos();
2029 if !self.bump_and_bump_space() {
2030 return Err(self.error(
2031 Span::new(wb_start, self.pos()),
2032 ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof,
2033 ));
2034 }
2035 let start_contents = self.pos();
2036 if !is_valid_char(self.char()) {
2041 self.parser().pos.set(start);
2042 return Ok(None);
2043 }
2044
2045 let mut scratch = self.parser().scratch.borrow_mut();
2047 scratch.clear();
2048 while !self.is_eof() && is_valid_char(self.char()) {
2049 scratch.push(self.char());
2050 self.bump_and_bump_space();
2051 }
2052 if self.is_eof() || self.char() != '}' {
2053 return Err(self.error(
2054 Span::new(start, self.pos()),
2055 ast::ErrorKind::SpecialWordBoundaryUnclosed,
2056 ));
2057 }
2058 let end = self.pos();
2059 self.bump();
2060 let kind = match scratch.as_str() {
2061 "start" => ast::AssertionKind::WordBoundaryStart,
2062 "end" => ast::AssertionKind::WordBoundaryEnd,
2063 "start-half" => ast::AssertionKind::WordBoundaryStartHalf,
2064 "end-half" => ast::AssertionKind::WordBoundaryEndHalf,
2065 _ => {
2066 return Err(self.error(
2067 Span::new(start_contents, end),
2068 ast::ErrorKind::SpecialWordBoundaryUnrecognized,
2069 ))
2070 }
2071 };
2072 Ok(Some(kind))
2073 }
2074
2075 #[inline(never)]
2076 fn parse_octal(&self) -> Literal {
2077 assert!(self.parser().octal);
2078 assert!('0' <= self.char() && self.char() <= '7');
2079 let start = self.pos();
2080 while self.bump()
2082 && '0' <= self.char()
2083 && self.char() <= '7'
2084 && self.pos().offset - start.offset <= 2
2085 {}
2086 let end = self.pos();
2087 let octal = &self.pattern()[start.offset..end.offset];
2088 let codepoint = u32::from_str_radix(octal, 8).expect("valid octal number");
2091 let c = char::from_u32(codepoint).expect("Unicode scalar value");
2094 Literal {
2095 span: Span::new(start, end),
2096 kind: LiteralKind::Octal,
2097 c,
2098 }
2099 }
2100
2101 #[inline(never)]
2102 fn parse_hex(&self) -> Result<Literal> {
2103 assert!(self.char() == 'x' || self.char() == 'u' || self.char() == 'U');
2104
2105 let hex_kind = match self.char() {
2106 'x' => HexLiteralKind::X,
2107 'u' => HexLiteralKind::UnicodeShort,
2108 _ => HexLiteralKind::UnicodeLong,
2109 };
2110 if !self.bump_and_bump_space() {
2111 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2112 }
2113 if self.char() == '{' {
2114 self.parse_hex_brace(hex_kind)
2115 } else {
2116 self.parse_hex_digits(hex_kind)
2117 }
2118 }
2119
2120 #[inline(never)]
2121 fn parse_hex_digits(&self, kind: HexLiteralKind) -> Result<Literal> {
2122 let mut scratch = self.parser().scratch.borrow_mut();
2123 scratch.clear();
2124
2125 let start = self.pos();
2126 for i in 0..kind.digits() {
2127 if i > 0 && !self.bump_and_bump_space() {
2128 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2129 }
2130 if !is_hex(self.char()) {
2131 return Err(self.error(self.span_char(), ast::ErrorKind::EscapeHexInvalidDigit));
2132 }
2133 scratch.push(self.char());
2134 }
2135 self.bump_and_bump_space();
2138 let end = self.pos();
2139 let hex = scratch.as_str();
2140 match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
2141 None => Err(self.error(Span::new(start, end), ast::ErrorKind::EscapeHexInvalid)),
2142 Some(c) => Ok(Literal {
2143 span: Span::new(start, end),
2144 kind: LiteralKind::HexFixed(kind),
2145 c,
2146 }),
2147 }
2148 }
2149
2150 #[inline(never)]
2151 fn parse_hex_brace(&self, kind: HexLiteralKind) -> Result<Literal> {
2152 let mut scratch = self.parser().scratch.borrow_mut();
2153 scratch.clear();
2154
2155 let brace_pos = self.pos();
2156 let start = self.span_char().end;
2157 while self.bump_and_bump_space() && self.char() != '}' {
2158 if !is_hex(self.char()) {
2159 return Err(self.error(self.span_char(), ast::ErrorKind::EscapeHexInvalidDigit));
2160 }
2161 scratch.push(self.char());
2162 }
2163 if self.is_eof() {
2164 return Err(self.error(
2165 Span::new(brace_pos, self.pos()),
2166 ast::ErrorKind::EscapeUnexpectedEof,
2167 ));
2168 }
2169 let end = self.pos();
2170 let hex = scratch.as_str();
2171 assert_eq!(self.char(), '}');
2172 self.bump_and_bump_space();
2173
2174 if hex.is_empty() {
2175 return Err(self.error(
2176 Span::new(brace_pos, self.pos()),
2177 ast::ErrorKind::EscapeHexEmpty,
2178 ));
2179 }
2180 match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
2181 None => Err(self.error(Span::new(start, end), ast::ErrorKind::EscapeHexInvalid)),
2182 Some(c) => Ok(Literal {
2183 span: Span::new(start, self.pos()),
2184 kind: LiteralKind::HexBrace(kind),
2185 c,
2186 }),
2187 }
2188 }
2189
2190 fn parse_decimal(&self) -> Result<u32> {
2191 let mut scratch = self.parser().scratch.borrow_mut();
2192 scratch.clear();
2193
2194 while !self.is_eof() && self.char().is_whitespace() {
2195 self.bump();
2196 }
2197 let start = self.pos();
2198 while !self.is_eof() && '0' <= self.char() && self.char() <= '9' {
2199 scratch.push(self.char());
2200 self.bump_and_bump_space();
2201 }
2202 let span = Span::new(start, self.pos());
2203 while !self.is_eof() && self.char().is_whitespace() {
2204 self.bump_and_bump_space();
2205 }
2206 let digits = scratch.as_str();
2207 if digits.is_empty() {
2208 return Err(self.error(span, ast::ErrorKind::DecimalEmpty));
2209 }
2210 match u32::from_str_radix(digits, 10).ok() {
2211 Some(n) => Ok(n),
2212 None => Err(self.error(span, ast::ErrorKind::DecimalInvalid)),
2213 }
2214 }
2215
2216 #[inline(never)]
2217 fn parse_set_class(&self) -> Result<ClassBracketed> {
2218 assert_eq!(self.char(), '[');
2219
2220 let mut union = ClassSetUnion {
2221 span: self.span(),
2222 items: vec![],
2223 };
2224 loop {
2225 self.bump_space();
2226 if self.is_eof() {
2227 return Err(self.unclosed_class_error());
2228 }
2229 match self.char() {
2230 '[' => {
2231 if !self.parser().stack_class.borrow().is_empty() {
2236 if let Some(cls) = self.maybe_parse_ascii_class() {
2237 union.push(ClassSetItem::Ascii(cls));
2238 continue;
2239 }
2240 }
2241 union = self.push_class_open(union)?;
2242 }
2243 ']' => match self.pop_class(union)? {
2244 Either::Left(nested_union) => {
2245 union = nested_union;
2246 }
2247 Either::Right(class) => return Ok(class),
2248 },
2249 '&' if self.peek() == Some('&') => {
2250 assert!(self.bump_if("&&"));
2251 union = self.push_class_op(ClassSetBinaryOpKind::Intersection, union);
2252 }
2253 '-' if self.peek() == Some('-') => {
2254 assert!(self.bump_if("--"));
2255 union = self.push_class_op(ClassSetBinaryOpKind::Difference, union);
2256 }
2257 '~' if self.peek() == Some('~') => {
2258 assert!(self.bump_if("~~"));
2259 union = self.push_class_op(ClassSetBinaryOpKind::SymmetricDifference, union);
2260 }
2261 _ => {
2262 union.push(self.parse_set_class_range()?);
2263 }
2264 }
2265 }
2266 }
2267
2268 #[inline(never)]
2269 fn parse_set_class_range(&self) -> Result<ClassSetItem> {
2270 let prim1 = self.parse_set_class_item()?;
2271 self.bump_space();
2272 if self.is_eof() {
2273 return Err(self.unclosed_class_error());
2274 }
2275 if self.char() != '-' || self.peek_space() == Some(']') || self.peek_space() == Some('-') {
2276 return prim1.into_class_set_item(self);
2277 }
2278 if !self.bump_and_bump_space() {
2279 return Err(self.unclosed_class_error());
2280 }
2281 let prim2 = self.parse_set_class_item()?;
2282 let range = ClassSetRange {
2283 span: Span::new(prim1.span().start, prim2.span().end),
2284 start: prim1.into_class_literal(self)?,
2285 end: prim2.into_class_literal(self)?,
2286 };
2287 if !range.is_valid() {
2288 return Err(self.error(range.span, ast::ErrorKind::ClassRangeInvalid));
2289 }
2290 Ok(ClassSetItem::Range(range))
2291 }
2292
2293 #[inline(never)]
2294 fn parse_set_class_item(&self) -> Result<Primitive> {
2295 if self.char() == '\\' {
2296 self.parse_escape()
2297 } else {
2298 let x = Primitive::Literal(Literal {
2299 span: self.span_char(),
2300 kind: LiteralKind::Verbatim,
2301 c: self.char(),
2302 });
2303 self.bump();
2304 Ok(x)
2305 }
2306 }
2307
2308 #[inline(never)]
2309 fn parse_set_class_open(&self) -> Result<(ClassBracketed, ClassSetUnion)> {
2310 assert_eq!(self.char(), '[');
2311 let start = self.pos();
2312 if !self.bump_and_bump_space() {
2313 return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2314 }
2315
2316 let negated = if self.char() != '^' {
2317 false
2318 } else {
2319 if !self.bump_and_bump_space() {
2320 return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2321 }
2322 true
2323 };
2324 let mut union = ClassSetUnion {
2326 span: self.span(),
2327 items: vec![],
2328 };
2329 while self.char() == '-' {
2330 union.push(ClassSetItem::Literal(Literal {
2331 span: self.span_char(),
2332 kind: LiteralKind::Verbatim,
2333 c: '-',
2334 }));
2335 if !self.bump_and_bump_space() {
2336 return Err(self.error(Span::new(start, start), ast::ErrorKind::ClassUnclosed));
2337 }
2338 }
2339 if union.items.is_empty() && self.char() == ']' {
2342 union.push(ClassSetItem::Literal(Literal {
2343 span: self.span_char(),
2344 kind: LiteralKind::Verbatim,
2345 c: ']',
2346 }));
2347 if !self.bump_and_bump_space() {
2348 return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2349 }
2350 }
2351 let set = ClassBracketed {
2352 span: Span::new(start, self.pos()),
2353 negated,
2354 kind: ClassSet::union(ClassSetUnion {
2355 span: Span::new(union.span.start, union.span.start),
2356 items: vec![],
2357 }),
2358 };
2359 Ok((set, union))
2360 }
2361
2362 #[inline(never)]
2363 fn maybe_parse_ascii_class(&self) -> Option<ClassAscii> {
2364 assert_eq!(self.char(), '[');
2365 let start = self.pos();
2367 let mut negated = false;
2368 if !self.bump() || self.char() != ':' {
2369 self.parser().pos.set(start);
2370 return None;
2371 }
2372 if !self.bump() {
2373 self.parser().pos.set(start);
2374 return None;
2375 }
2376 if self.char() == '^' {
2377 negated = true;
2378 if !self.bump() {
2379 self.parser().pos.set(start);
2380 return None;
2381 }
2382 }
2383 let name_start = self.offset();
2384 while self.char() != ':' && self.bump() {}
2385 if self.is_eof() {
2386 self.parser().pos.set(start);
2387 return None;
2388 }
2389 let name = &self.pattern()[name_start..self.offset()];
2390 if !self.bump_if(":]") {
2391 self.parser().pos.set(start);
2392 return None;
2393 }
2394 let kind = match regex_syntax::ast::ClassAsciiKind::from_name(name) {
2395 Some(kind) => kind,
2396 None => {
2397 self.parser().pos.set(start);
2398 return None;
2399 }
2400 };
2401 Some(ClassAscii {
2402 span: Span::new(start, self.pos()),
2403 kind,
2404 negated,
2405 })
2406 }
2407
2408 #[inline(never)]
2409 fn parse_unicode_class(&self) -> Result<ClassUnicode> {
2410 assert!(self.char() == 'p' || self.char() == 'P');
2411
2412 let mut scratch = self.parser().scratch.borrow_mut();
2413 scratch.clear();
2414
2415 let negated = self.char() == 'P';
2416 if !self.bump_and_bump_space() {
2417 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2418 }
2419 let (start, kind) = if self.char() == '{' {
2420 let start = self.span_char().end;
2421 while self.bump_and_bump_space() && self.char() != '}' {
2422 scratch.push(self.char());
2423 }
2424 if self.is_eof() {
2425 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2426 }
2427 assert_eq!(self.char(), '}');
2428 self.bump();
2429
2430 let name = scratch.as_str();
2431 if let Some(i) = name.find("!=") {
2432 (
2433 start,
2434 ClassUnicodeKind::NamedValue {
2435 op: ClassUnicodeOpKind::NotEqual,
2436 name: name[..i].to_string(),
2437 value: name[i + 2..].to_string(),
2438 },
2439 )
2440 } else if let Some(i) = name.find(':') {
2441 (
2442 start,
2443 ClassUnicodeKind::NamedValue {
2444 op: ClassUnicodeOpKind::Colon,
2445 name: name[..i].to_string(),
2446 value: name[i + 1..].to_string(),
2447 },
2448 )
2449 } else if let Some(i) = name.find('=') {
2450 (
2451 start,
2452 ClassUnicodeKind::NamedValue {
2453 op: ClassUnicodeOpKind::Equal,
2454 name: name[..i].to_string(),
2455 value: name[i + 1..].to_string(),
2456 },
2457 )
2458 } else {
2459 (start, ClassUnicodeKind::Named(name.to_string()))
2460 }
2461 } else {
2462 let start = self.pos();
2463 let c = self.char();
2464 if c == '\\' {
2465 return Err(self.error(self.span_char(), ast::ErrorKind::UnicodeClassInvalid));
2466 }
2467 self.bump_and_bump_space();
2468 let kind = ClassUnicodeKind::OneLetter(c);
2469 (start, kind)
2470 };
2471 Ok(ClassUnicode {
2472 span: Span::new(start, self.pos()),
2473 negated,
2474 kind,
2475 })
2476 }
2477
2478 #[inline(never)]
2479 fn parse_perl_class(&self) -> ClassPerl {
2480 let c = self.char();
2481 let span = self.span_char();
2482 self.bump();
2483 let (negated, kind) = match c {
2484 'd' => (false, regex_syntax::ast::ClassPerlKind::Digit),
2485 'D' => (true, regex_syntax::ast::ClassPerlKind::Digit),
2486 's' => (false, regex_syntax::ast::ClassPerlKind::Space),
2487 'S' => (true, regex_syntax::ast::ClassPerlKind::Space),
2488 'w' => (false, regex_syntax::ast::ClassPerlKind::Word),
2489 'W' => (true, regex_syntax::ast::ClassPerlKind::Word),
2490 c => panic!("expected valid Perl class but got '{}'", c),
2491 };
2492 ClassPerl {
2493 span,
2494 kind,
2495 negated,
2496 }
2497 }
2498}
2499
2500pub fn parse_ast<'s>(
2501 tb: &mut TB<'s>,
2502 pattern: &'s str,
2503) -> std::result::Result<NodeId, ResharpError> {
2504 let mut p: ResharpParser<'s> = ResharpParser::new(pattern);
2505 let result = p.parse(tb);
2506 result
2507}