1#![warn(dead_code)]
6mod ast;
7use std::cell::{Cell, RefCell};
8
9use ast::{Ast, Concat, ErrorKind, GroupKind, LookaroundKind};
10use regex_syntax::{
11 ast::{
12 ClassAscii, ClassBracketed, ClassPerl, ClassSet, ClassSetBinaryOpKind, ClassSetItem,
13 ClassSetRange, ClassSetUnion, ClassUnicode, ClassUnicodeKind, ClassUnicodeOpKind,
14 HexLiteralKind, Literal, LiteralKind, Position, Span, SpecialLiteralKind,
15 },
16 hir::{
17 self,
18 translate::{Translator, TranslatorBuilder},
19 },
20 utf8::Utf8Sequences,
21};
22use resharp_algebra::NodeId;
23
/// Shorthand for the `resharp_algebra` regex builder that the translation
/// helpers in this file write nodes into.
type TB<'s> = resharp_algebra::RegexBuilder;
25
/// What is known about the character adjacent to a word-boundary assertion.
#[derive(Clone, Copy, PartialEq)]
enum WordCharKind {
    /// The neighbour is a word character.
    Word,
    /// The neighbour is not a word character.
    NonWord,
    /// The neighbour is a nullable repetition of word characters, so it may
    /// contribute nothing at all.
    MaybeWord,
    /// The neighbour is a nullable repetition of non-word characters.
    MaybeNonWord,
    /// Nothing could be determined syntactically.
    Unknown,
    /// There is no neighbour: the assertion sits at the end of its concat.
    Edge,
}
35
/// Returns `true` for the ASCII word bytes: letters, digits and `_`.
fn is_word_byte(b: u8) -> bool {
    matches!(b, b'_' | b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z')
}
39
40
/// A single-item piece of syntax that can appear either in the main
/// expression or (for some variants) inside a bracketed character class.
#[derive(Clone, Debug, Eq, PartialEq)]
enum Primitive {
    /// A literal character.
    Literal(Literal),
    /// A zero-width assertion.
    Assertion(ast::Assertion),
    /// The dot; only its span is recorded.
    Dot(Span),
    /// The "top" primitive; only its span is recorded.
    Top(Span),
    /// A Perl class.
    Perl(ClassPerl),
    /// A Unicode class.
    Unicode(ClassUnicode),
}
50
51impl Primitive {
52 fn span(&self) -> &Span {
53 match *self {
54 Primitive::Literal(ref x) => &x.span,
55 Primitive::Assertion(ref x) => &x.span,
56 Primitive::Dot(ref span) => span,
57 Primitive::Top(ref span) => span,
58 Primitive::Perl(ref x) => &x.span,
59 Primitive::Unicode(ref x) => &x.span,
60 }
61 }
62
63 fn into_ast(self) -> Ast {
64 match self {
65 Primitive::Literal(lit) => Ast::literal(lit),
66 Primitive::Assertion(assert) => Ast::assertion(assert),
67 Primitive::Dot(span) => Ast::dot(span),
68 Primitive::Top(span) => Ast::top(span),
69 Primitive::Perl(cls) => Ast::class_perl(cls),
70 Primitive::Unicode(cls) => Ast::class_unicode(cls),
71 }
72 }
73
74 fn into_class_set_item(self, p: &ResharpParser) -> Result<regex_syntax::ast::ClassSetItem> {
75 use self::Primitive::*;
76 use regex_syntax::ast::ClassSetItem;
77
78 match self {
79 Literal(lit) => Ok(ClassSetItem::Literal(lit)),
80 Perl(cls) => Ok(ClassSetItem::Perl(cls)),
81 Unicode(cls) => Ok(ClassSetItem::Unicode(cls)),
82 x => Err(p.error(*x.span(), ast::ErrorKind::ClassEscapeInvalid)),
83 }
84 }
85
86 fn into_class_literal(self, p: &ResharpParser) -> Result<Literal> {
87 use self::Primitive::*;
88
89 match self {
90 Literal(lit) => Ok(lit),
91 x => Err(p.error(*x.span(), ast::ErrorKind::ClassRangeLiteral)),
92 }
93 }
94}
95
/// A value that is one of two alternatives.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum Either<Left, Right> {
    /// The first alternative.
    Left(Left),
    /// The second alternative.
    Right(Right),
}
101
/// An error produced while parsing or translating a pattern.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct ResharpError {
    /// What went wrong.
    pub kind: ErrorKind,
    /// A copy of the pattern that was being processed when the error occurred.
    pattern: String,
    /// The location in the pattern the error is attributed to.
    pub span: Span,
}
112
113impl std::fmt::Display for ResharpError {
114 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
115 write!(f, "{:?}: {:?}", self.kind, self.span)
116 }
117}
// Uses only the trait's default methods; no source error is exposed.
impl std::error::Error for ResharpError {}
119
/// Result alias used throughout this file; the error is always [`ResharpError`].
type Result<T> = core::result::Result<T, ResharpError>;
121
/// One entry of the group stack kept while parsing nested groups,
/// alternations and intersections.
#[derive(Clone, Debug)]
enum GroupState {
    /// An opened group that has not been closed yet.
    Group {
        /// The concatenation that was being built before the group opened.
        concat: Concat,
        /// The group itself; its sub-expression is filled in when it closes.
        group: ast::Group,
        /// The whitespace-insensitivity setting in effect before the group
        /// opened; restored when the group is popped.
        ignore_whitespace: bool,
    },
    /// The branches of an alternation collected so far.
    Alternation(ast::Alternation),
    /// The operands of an intersection collected so far.
    Intersection(ast::Intersection),
}
140
/// One entry of the stack kept while parsing bracketed character classes.
#[derive(Clone, Debug)]
enum ClassState {
    /// A bracketed class that has been opened but not closed.
    Open {
        /// The union of the enclosing class, saved while the nested class is
        /// being parsed.
        union: regex_syntax::ast::ClassSetUnion,
        /// The class being parsed; its kind and end position are filled in
        /// when it is popped.
        set: regex_syntax::ast::ClassBracketed,
    },
    /// A pending binary set operation waiting for its right-hand side.
    Op {
        /// Which binary operation this is.
        kind: regex_syntax::ast::ClassSetBinaryOpKind,
        /// The already-parsed left-hand side.
        lhs: regex_syntax::ast::ClassSet,
    },
}
161
/// Parser and translator state for a single pattern.
///
/// Most mutable state sits behind `Cell`/`RefCell` so the parsing helpers can
/// take `&self`.
pub struct ResharpParser<'s> {
    /// Nodes already built for Perl classes, stored as `(negated, kind, node)`.
    perl_classes: Vec<(bool, regex_syntax::ast::ClassPerlKind, NodeId)>,
    /// Cache that lazily builds the word/digit/space class nodes.
    unicode_classes: resharp_algebra::UnicodeClassCache,
    /// Default translator used when no flag-specific translator is active.
    pub translator: regex_syntax::hir::translate::Translator,
    /// The pattern being parsed.
    pub pattern: &'s str,
    /// Current position in the pattern.
    pos: Cell<Position>,
    /// The most recently assigned capture index.
    capture_index: Cell<u32>,
    // NOTE(review): `octal` and `empty_min_range` are initialised to `false`
    // in `new`; their consumers are not visible in this part of the file.
    octal: bool,
    empty_min_range: bool,
    /// Whether whitespace-insensitive mode is currently in effect.
    ignore_whitespace: Cell<bool>,
    /// Comments collected while skipping whitespace in that mode.
    comments: RefCell<Vec<ast::Comment>>,
    /// Stack of open groups, alternations and intersections.
    stack_group: RefCell<Vec<GroupState>>,
    /// Stack of open bracketed classes and pending set operations.
    stack_class: RefCell<Vec<ClassState>>,
    /// Capture names seen so far, kept sorted by name for duplicate detection.
    capture_names: RefCell<Vec<ast::CaptureName>>,
    // NOTE(review): scratch buffer; its users are outside the visible range.
    scratch: RefCell<String>,
}
179
180fn specialize_err<T>(result: Result<T>, from: ast::ErrorKind, to: ast::ErrorKind) -> Result<T> {
181 if let Err(e) = result {
182 if e.kind == from {
183 Err(ResharpError {
184 kind: to,
185 pattern: e.pattern,
186 span: e.span,
187 })
188 } else {
189 Err(e)
190 }
191 } else {
192 result
193 }
194}
195
/// Reports whether `c` may appear in a capture group name.
///
/// The first character must be `_` or alphabetic; later characters may also
/// be `.`, `[`, `]` or any alphanumeric character.
fn is_capture_char(c: char, first: bool) -> bool {
    match c {
        '_' => true,
        '.' | '[' | ']' => !first,
        _ if first => c.is_alphabetic(),
        _ => c.is_alphanumeric(),
    }
}
203
/// Returns `true` if `c` is one of the characters this syntax treats as a
/// meta character (and which `escape_into` therefore prefixes with `\`).
pub fn is_meta_character(c: char) -> bool {
    matches!(
        c,
        '\\' | '.'
            | '+'
            | '*'
            | '?'
            | '('
            | ')'
            | '|'
            | '['
            | ']'
            | '{'
            | '}'
            | '^'
            | '$'
            | '#'
            | '&'
            | '-'
            | '~'
            | '_'
    )
}
211
212pub fn escape(text: &str) -> String {
214 let mut buf = String::new();
215 escape_into(text, &mut buf);
216 buf
217}
218
219pub fn escape_into(text: &str, buf: &mut String) {
221 buf.reserve(text.len());
222 for c in text.chars() {
223 if is_meta_character(c) {
224 buf.push('\\');
225 }
226 buf.push(c);
227 }
228}
229
230pub fn is_escapeable_character(c: char) -> bool {
231 if is_meta_character(c) {
233 return true;
234 }
235 if !c.is_ascii() {
238 return false;
239 }
240 match c {
245 '0'..='9' | 'A'..='Z' | 'a'..='z' => false,
246 '<' | '>' => false,
256 _ => true,
257 }
258}
259
/// Returns `true` if `c` is an ASCII hexadecimal digit (either case).
fn is_hex(c: char) -> bool {
    matches!(c, '0'..='9' | 'a'..='f' | 'A'..='F')
}
263
264impl<'s> ResharpParser<'s> {
265 fn default_translator_builder() -> TranslatorBuilder {
266 let mut trb = TranslatorBuilder::new();
267 trb.unicode(true);
268 trb.utf8(false);
269 trb
270 }
271
272 pub fn new(pattern: &'s str) -> Self {
273 let trb = Self::default_translator_builder();
274 Self {
275 translator: trb.build(),
276 pattern,
277 perl_classes: vec![],
278 unicode_classes: resharp_algebra::UnicodeClassCache::default(),
279 pos: Cell::new(Position::new(0, 0, 0)),
280 capture_index: Cell::new(0),
281 octal: false,
282 empty_min_range: false,
283 ignore_whitespace: Cell::new(false),
284 comments: RefCell::new(vec![]),
285 stack_group: RefCell::new(vec![]),
286 stack_class: RefCell::new(vec![]),
287 capture_names: RefCell::new(vec![]),
288 scratch: RefCell::new(String::new()),
289 }
290 }
291
    /// Returns `self`; the helpers below reach parser state through this
    /// accessor.
    fn parser(&'_ self) -> &'_ ResharpParser<'_> {
        self
    }
296
    /// Returns the pattern string being parsed.
    fn pattern(&self) -> &str {
        self.pattern
    }
301
302 fn error(&self, span: Span, kind: ast::ErrorKind) -> ResharpError {
304 ResharpError {
305 kind,
306 pattern: self.pattern().to_string(),
307 span,
308 }
309 }
310
311 fn unsupported_error(&self, _: regex_syntax::hir::Error) -> ResharpError {
312 let emptyspan = Span::splat(self.pos());
313 let inner = self.error(emptyspan, ast::ErrorKind::UnsupportedResharpRegex);
314 inner
315 }
316
317 fn offset(&self) -> usize {
322 self.parser().pos.get().offset
323 }
324
325 fn line(&self) -> usize {
329 self.parser().pos.get().line
330 }
331
332 fn column(&self) -> usize {
336 self.parser().pos.get().column
337 }
338
339 fn next_capture_index(&self, span: Span) -> Result<u32> {
347 let current = self.parser().capture_index.get();
348 let i = current
349 .checked_add(1)
350 .ok_or_else(|| self.error(span, ast::ErrorKind::CaptureLimitExceeded))?;
351 self.parser().capture_index.set(i);
352 Ok(i)
353 }
354
355 fn add_capture_name(&self, cap: &ast::CaptureName) -> Result<()> {
356 let mut names = self.parser().capture_names.borrow_mut();
357 match names.binary_search_by_key(&cap.name.as_str(), |c| c.name.as_str()) {
358 Err(i) => {
359 names.insert(i, cap.clone());
360 Ok(())
361 }
362 Ok(i) => Err(self.error(
363 cap.span,
364 ast::ErrorKind::GroupNameDuplicate {
365 original: names[i].span,
366 },
367 )),
368 }
369 }
370
371 fn ignore_whitespace(&self) -> bool {
372 self.parser().ignore_whitespace.get()
373 }
374
375 fn char(&self) -> char {
376 self.char_at(self.offset())
377 }
378
379 fn char_at(&self, i: usize) -> char {
380 self.pattern()[i..]
381 .chars()
382 .next()
383 .unwrap_or_else(|| panic!("expected char at offset {}", i))
384 }
385
386 fn bump(&self) -> bool {
387 if self.is_eof() {
388 return false;
389 }
390 let Position {
391 mut offset,
392 mut line,
393 mut column,
394 } = self.pos();
395 if self.char() == '\n' {
396 line = line.checked_add(1).unwrap();
397 column = 1;
398 } else {
399 column = column.checked_add(1).unwrap();
400 }
401 offset += self.char().len_utf8();
402 self.parser().pos.set(Position {
403 offset,
404 line,
405 column,
406 });
407 self.pattern()[self.offset()..].chars().next().is_some()
408 }
409
410 fn bump_if(&self, prefix: &str) -> bool {
411 if self.pattern()[self.offset()..].starts_with(prefix) {
412 for _ in 0..prefix.chars().count() {
413 self.bump();
414 }
415 true
416 } else {
417 false
418 }
419 }
420
421 fn is_lookaround_prefix(&self) -> Option<(bool, bool)> {
422 if self.bump_if("?=") {
423 return Some((true, true));
424 }
425 if self.bump_if("?!") {
426 return Some((true, false));
427 }
428 if self.bump_if("?<=") {
429 return Some((false, true));
430 }
431 if self.bump_if("?<!") {
432 return Some((false, false));
433 }
434 return None;
435 }
437
438 fn bump_and_bump_space(&self) -> bool {
439 if !self.bump() {
440 return false;
441 }
442 self.bump_space();
443 !self.is_eof()
444 }
445
446 fn bump_space(&self) {
447 if !self.ignore_whitespace() {
448 return;
449 }
450 while !self.is_eof() {
451 if self.char().is_whitespace() {
452 self.bump();
453 } else if self.char() == '#' {
454 let start = self.pos();
455 let mut comment_text = String::new();
456 self.bump();
457 while !self.is_eof() {
458 let c = self.char();
459 self.bump();
460 if c == '\n' {
461 break;
462 }
463 comment_text.push(c);
464 }
465 let comment = ast::Comment {
466 span: Span::new(start, self.pos()),
467 comment: comment_text,
468 };
469 self.parser().comments.borrow_mut().push(comment);
470 } else {
471 break;
472 }
473 }
474 }
475
476 fn peek(&self) -> Option<char> {
480 if self.is_eof() {
481 return None;
482 }
483 self.pattern()[self.offset() + self.char().len_utf8()..]
484 .chars()
485 .next()
486 }
487
488 fn peek_space(&self) -> Option<char> {
491 if !self.ignore_whitespace() {
492 return self.peek();
493 }
494 if self.is_eof() {
495 return None;
496 }
497 let mut start = self.offset() + self.char().len_utf8();
498 let mut in_comment = false;
499 for (i, c) in self.pattern()[start..].char_indices() {
500 if c.is_whitespace() {
501 continue;
502 } else if !in_comment && c == '#' {
503 in_comment = true;
504 } else if in_comment && c == '\n' {
505 in_comment = false;
506 } else {
507 start += i;
508 break;
509 }
510 }
511 self.pattern()[start..].chars().next()
512 }
513
514 fn is_eof(&self) -> bool {
516 self.offset() == self.pattern().len()
517 }
518
519 fn pos(&self) -> Position {
522 self.parser().pos.get()
523 }
524
525 fn span(&self) -> Span {
528 Span::splat(self.pos())
529 }
530
531 fn span_char(&self) -> Span {
533 let mut next = Position {
534 offset: self.offset().checked_add(self.char().len_utf8()).unwrap(),
535 line: self.line(),
536 column: self.column().checked_add(1).unwrap(),
537 };
538 if self.char() == '\n' {
539 next.line += 1;
540 next.column = 1;
541 }
542 Span::new(self.pos(), next)
543 }
544
545 #[inline(never)]
555 fn push_alternate(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
556 assert_eq!(self.char(), '|');
557 concat.span.end = self.pos();
558 self.push_or_add_alternation(concat);
559 self.bump();
560 Ok(ast::Concat {
561 span: self.span(),
562 asts: vec![],
563 })
564 }
565
566 fn push_or_add_alternation(&self, concat: Concat) {
569 use self::GroupState::*;
570
571 let mut stack = self.parser().stack_group.borrow_mut();
572 if let Some(&mut Alternation(ref mut alts)) = stack.last_mut() {
573 alts.asts.push(concat.into_ast());
574 return;
575 }
576 stack.push(Alternation(ast::Alternation {
577 span: Span::new(concat.span.start, self.pos()),
578 asts: vec![concat.into_ast()],
579 }));
580 }
581
582 #[inline(never)]
583 fn push_intersect(&self, mut concat: Concat) -> Result<Concat> {
584 assert_eq!(self.char(), '&');
585 concat.span.end = self.pos();
586 self.push_or_add_intersect(concat);
587 self.bump();
588 Ok(Concat {
589 span: self.span(),
590 asts: vec![],
591 })
592 }
593
594 fn push_or_add_intersect(&self, concat: Concat) {
597 use self::GroupState::*;
598
599 let mut stack = self.parser().stack_group.borrow_mut();
600 if let Some(&mut Intersection(ref mut alts)) = stack.last_mut() {
601 alts.asts.push(concat.into_ast());
602 return;
603 }
604 stack.push(Intersection(ast::Intersection {
605 span: Span::new(concat.span.start, self.pos()),
606 asts: vec![concat.into_ast()],
607 }));
608 }
609
610 #[inline(never)]
624 fn push_group(&self, mut concat: Concat) -> Result<Concat> {
625 assert_eq!(self.char(), '(');
626 match self.parse_group()? {
627 Either::Left(set) => {
628 let ignore = set.flags.flag_state(ast::Flag::IgnoreWhitespace);
629 if let Some(v) = ignore {
630 self.parser().ignore_whitespace.set(v);
631 }
632
633 concat.asts.push(Ast::flags(set));
634 Ok(concat)
635 }
636 Either::Right(group) => {
637 let old_ignore_whitespace = self.ignore_whitespace();
638 let new_ignore_whitespace = group
639 .flags()
640 .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
641 .unwrap_or(old_ignore_whitespace);
642 self.parser()
643 .stack_group
644 .borrow_mut()
645 .push(GroupState::Group {
646 concat,
647 group,
648 ignore_whitespace: old_ignore_whitespace,
649 });
650 self.parser().ignore_whitespace.set(new_ignore_whitespace);
651 Ok(Concat {
652 span: self.span(),
653 asts: vec![],
654 })
655 }
656 }
657 }
658
659 #[inline(never)]
660 fn push_compl_group(&self, concat: Concat) -> Result<Concat> {
661 assert_eq!(self.char(), '~');
662 self.bump();
663 if self.is_eof() || self.char() != '(' {
664 return Err(self.error(self.span(), ast::ErrorKind::ComplementGroupExpected));
665 }
666 let open_span = self.span_char();
667 self.bump();
668 let group = ast::Group {
669 span: open_span,
670 kind: ast::GroupKind::Complement,
671 ast: Box::new(Ast::empty(self.span())),
672 };
673
674 let old_ignore_whitespace = self.ignore_whitespace();
675 let new_ignore_whitespace = group
676 .flags()
677 .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
678 .unwrap_or(old_ignore_whitespace);
679 self.parser()
680 .stack_group
681 .borrow_mut()
682 .push(GroupState::Group {
683 concat,
684 group,
685 ignore_whitespace: old_ignore_whitespace,
686 });
687 self.parser().ignore_whitespace.set(new_ignore_whitespace);
688 Ok(Concat {
689 span: self.span(),
690 asts: vec![],
691 })
692 }
693
    /// Handles a `)`: closes the innermost open group.
    ///
    /// `group_concat` is the concatenation parsed since the group opened (or
    /// since the last `|` / `&` inside it). The finished group (or complement)
    /// is appended to the concatenation that was active before the group
    /// opened, which is returned. The whitespace setting saved at open time is
    /// restored.
    ///
    /// Returns `GroupUnopened` if no matching group is on the stack.
    /// Panics if the current character is not `)`.
    #[inline(never)]
    fn pop_group(&self, mut group_concat: Concat) -> Result<Concat> {
        use self::GroupState::*;
        assert_eq!(self.char(), ')');
        let mut stack = self.parser().stack_group.borrow_mut();
        let topstack = stack.pop();

        // The top of the stack is either the group itself, or a pending
        // alternation/intersection sitting directly on top of the group.
        let (mut prior_concat, mut group, ignore_whitespace, alt) = match topstack {
            Some(Group {
                concat,
                group,
                ignore_whitespace,
            }) => (concat, group, ignore_whitespace, None),
            Some(Alternation(alt)) => match stack.pop() {
                Some(Group {
                    concat,
                    group,
                    ignore_whitespace,
                }) => (
                    concat,
                    group,
                    ignore_whitespace,
                    Some(Either::Left::<ast::Alternation, ast::Intersection>(alt)),
                ),
                // NOTE(review): an alternation stacked on an intersection (or
                // vice versa) inside a group also lands here and is reported
                // as `GroupUnopened` — confirm that is the intended error.
                None | Some(Alternation(_)) | Some(Intersection(_)) => {
                    return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
                }
            },
            Some(Intersection(int)) => match stack.pop() {
                Some(Group {
                    concat,
                    group,
                    ignore_whitespace,
                }) => (
                    concat,
                    group,
                    ignore_whitespace,
                    Some(Either::Right::<ast::Alternation, ast::Intersection>(int)),
                ),
                None | Some(Alternation(_)) | Some(Intersection(_)) => {
                    return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
                }
            },

            None => {
                return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
            }
        };
        self.parser().ignore_whitespace.set(ignore_whitespace);
        // The inner concatenation ends before the `)`; the group ends after it.
        group_concat.span.end = self.pos();
        self.bump();
        group.span.end = self.pos();
        match alt {
            Some(Either::Left(mut alt)) => {
                alt.span.end = group_concat.span.end;
                alt.asts.push(group_concat.into_ast());
                group.ast = Box::new(alt.into_ast());
            }
            Some(Either::Right(mut int)) => {
                int.span.end = group_concat.span.end;
                int.asts.push(group_concat.into_ast());
                group.ast = Box::new(int.into_ast());
            }
            None => {
                group.ast = Box::new(group_concat.into_ast());
            }
        }

        if group.kind == GroupKind::Complement {
            // NOTE(review): the complement gets an empty span at the position
            // after `)` rather than the group's span — confirm intended.
            let complement = ast::Complement {
                span: self.span(),
                ast: group.ast,
            };
            prior_concat.asts.push(Ast::complement(complement));
        } else {
            prior_concat.asts.push(Ast::group(group));
        }
        Ok(prior_concat)
    }
785
786 #[inline(never)]
793 fn pop_group_end(&self, mut concat: ast::Concat) -> Result<Ast> {
794 concat.span.end = self.pos();
795 let mut stack = self.parser().stack_group.borrow_mut();
797 let ast = match stack.pop() {
798 None => Ok(concat.into_ast()),
799 Some(GroupState::Alternation(mut alt)) => {
800 alt.span.end = self.pos();
801 alt.asts.push(concat.into_ast());
802 Ok(Ast::alternation(alt))
803 }
804 Some(GroupState::Intersection(mut int)) => {
805 int.span.end = self.pos();
806 int.asts.push(concat.into_ast());
807
808 Ok(Ast::intersection(int))
810 }
811 Some(GroupState::Group { group, .. }) => {
812 return Err(self.error(group.span, ast::ErrorKind::GroupUnclosed));
813 }
814 };
815 match stack.pop() {
817 None => ast,
818 Some(GroupState::Alternation(_)) => {
819 unreachable!()
826 }
827 Some(GroupState::Intersection(_)) => {
828 unreachable!()
829 }
830 Some(GroupState::Group { group, .. }) => {
831 Err(self.error(group.span, ast::ErrorKind::GroupUnclosed))
832 }
833 }
834 }
835
836 #[inline(never)]
845 fn push_class_open(
846 &self,
847 parent_union: regex_syntax::ast::ClassSetUnion,
848 ) -> Result<regex_syntax::ast::ClassSetUnion> {
849 assert_eq!(self.char(), '[');
850
851 let (nested_set, nested_union) = self.parse_set_class_open()?;
852 self.parser()
853 .stack_class
854 .borrow_mut()
855 .push(ClassState::Open {
856 union: parent_union,
857 set: nested_set,
858 });
859 Ok(nested_union)
860 }
861
862 #[inline(never)]
877 fn pop_class(
878 &self,
879 nested_union: regex_syntax::ast::ClassSetUnion,
880 ) -> Result<Either<regex_syntax::ast::ClassSetUnion, regex_syntax::ast::ClassBracketed>> {
881 assert_eq!(self.char(), ']');
882
883 let item = regex_syntax::ast::ClassSet::Item(nested_union.into_item());
884 let prevset = self.pop_class_op(item);
885 let mut stack = self.parser().stack_class.borrow_mut();
886 match stack.pop() {
887 None => {
888 panic!("unexpected empty character class stack")
897 }
898 Some(ClassState::Op { .. }) => {
899 panic!("unexpected ClassState::Op")
906 }
907 Some(ClassState::Open { mut union, mut set }) => {
908 self.bump();
909 set.span.end = self.pos();
910 set.kind = prevset;
911 if stack.is_empty() {
912 Ok(Either::Right(set))
913 } else {
914 union.push(regex_syntax::ast::ClassSetItem::Bracketed(Box::new(set)));
915 Ok(Either::Left(union))
916 }
917 }
918 }
919 }
920
921 #[inline(never)]
926 fn unclosed_class_error(&self) -> ResharpError {
927 for state in self.parser().stack_class.borrow().iter().rev() {
928 if let ClassState::Open { ref set, .. } = *state {
929 return self.error(set.span, ast::ErrorKind::ClassUnclosed);
930 }
931 }
932 panic!("no open character class found")
935 }
936
937 #[inline(never)]
943 fn push_class_op(
944 &self,
945 next_kind: regex_syntax::ast::ClassSetBinaryOpKind,
946 next_union: regex_syntax::ast::ClassSetUnion,
947 ) -> regex_syntax::ast::ClassSetUnion {
948 let item = regex_syntax::ast::ClassSet::Item(next_union.into_item());
949 let new_lhs = self.pop_class_op(item);
950 self.parser().stack_class.borrow_mut().push(ClassState::Op {
951 kind: next_kind,
952 lhs: new_lhs,
953 });
954 regex_syntax::ast::ClassSetUnion {
955 span: self.span(),
956 items: vec![],
957 }
958 }
959
960 #[inline(never)]
966 fn pop_class_op(&self, rhs: regex_syntax::ast::ClassSet) -> regex_syntax::ast::ClassSet {
967 let mut stack = self.parser().stack_class.borrow_mut();
968 let (kind, lhs) = match stack.pop() {
969 Some(ClassState::Op { kind, lhs }) => (kind, lhs),
970 Some(state @ ClassState::Open { .. }) => {
971 stack.push(state);
972 return rhs;
973 }
974 None => unreachable!(),
975 };
976 let span = Span::new(lhs.span().start, rhs.span().end);
977 regex_syntax::ast::ClassSet::BinaryOp(regex_syntax::ast::ClassSetBinaryOp {
978 span,
979 kind,
980 lhs: Box::new(lhs),
981 rhs: Box::new(rhs),
982 })
983 }
984
985 fn hir_to_node_id(&self, hir: &hir::Hir, tb: &mut TB<'s>) -> Result<NodeId> {
986 match hir.kind() {
987 hir::HirKind::Empty => Ok(NodeId::EPS),
988 hir::HirKind::Literal(l) => {
989 if l.0.len() == 1 {
990 let node = tb.mk_u8(l.0[0]);
991 Ok(node)
992 } else {
993 let ws: Vec<_> = l.0.iter().map(|l| tb.mk_u8(*l)).collect();
994 let conc = tb.mk_concats(ws.iter().copied());
995 Ok(conc)
996 }
997 }
998 hir::HirKind::Class(class) => {
999 match class {
1001 hir::Class::Unicode(class_unicode) => {
1002 let ranges = class_unicode.ranges();
1003 let mut nodes = Vec::new();
1004 for range in ranges {
1005 for seq in Utf8Sequences::new(range.start(), range.end()) {
1006 let sl = seq.as_slice();
1007 let bytes: Vec<_> = sl.iter().map(|s| (s.start, s.end)).collect();
1008 let node = match bytes.len() {
1009 1 => tb.mk_range_u8(bytes[0].0, bytes[0].1),
1010 n => {
1011 let last = tb.mk_range_u8(bytes[n - 1].0, bytes[n - 1].1);
1012 let mut conc = last;
1013 for i in (0..n - 1).rev() {
1014 let b = tb.mk_range_u8(bytes[i].0, bytes[i].1);
1015 conc = tb.mk_concat(b, conc);
1016 }
1017 conc
1018 }
1019 };
1020 nodes.push(node);
1021 }
1022 }
1023 let merged = tb.mk_unions(nodes.into_iter());
1024 Ok(merged)
1025 }
1026 hir::Class::Bytes(class_bytes) => {
1027 let ranges = class_bytes.ranges();
1028 let mut result = NodeId::BOT;
1029 for range in ranges {
1030 let start = range.start();
1031 let end = range.end();
1032 let node = tb.mk_range_u8(start, end);
1033 result = tb.mk_union(result, node);
1034 }
1035 Ok(result)
1036 }
1037 }
1038 }
1039 hir::HirKind::Look(_) => todo!(),
1040 hir::HirKind::Repetition(_) => todo!(),
1041 hir::HirKind::Capture(_) => todo!(),
1042 hir::HirKind::Concat(body) => {
1043 let mut result = NodeId::EPS;
1044 for child in body {
1045 let node = self.hir_to_node_id(child, tb)?;
1046 result = tb.mk_concat(result, node);
1047 }
1048 Ok(result)
1049 }
1050 hir::HirKind::Alternation(_) => todo!(),
1051 }
1052 }
1053
1054 fn translate_ast_to_hir(
1055 &mut self,
1056 orig_ast: ®ex_syntax::ast::Ast,
1057 tb: &mut TB<'s>,
1058 ) -> Result<NodeId> {
1059 match self.translator.translate("", orig_ast) {
1062 Err(_) => return Err(self.error(self.span(), ast::ErrorKind::UnicodeClassInvalid)),
1063 Ok(hir) => {
1064 let mapped = self.hir_to_node_id(&hir, tb);
1065 mapped
1066 }
1067 }
1068 }
1069
1070 fn translator_to_node_id(
1071 &mut self,
1072 orig_ast: ®ex_syntax::ast::Ast,
1073 translator: &mut Option<Translator>,
1074 tb: &mut TB<'s>,
1075 ) -> Result<NodeId> {
1076 match translator {
1077 Some(tr) => {
1078 let hir = tr
1079 .translate("", &orig_ast)
1080 .map_err(|e| self.unsupported_error(e))?;
1081 self.hir_to_node_id(&hir, tb)
1082 }
1083 None => self.translate_ast_to_hir(&orig_ast, tb),
1084 }
1085 }
1086
1087 fn get_class(
1088 &mut self,
1089 negated: bool,
1090 kind: regex_syntax::ast::ClassPerlKind,
1091 tb: &mut TB<'s>,
1092 ) -> Result<NodeId> {
1093 let w = self
1095 .perl_classes
1096 .iter()
1097 .find(|(c_neg, c_kind, _)| *c_kind == kind && *c_neg == negated);
1098 match w {
1099 Some((_, _, value)) => return Ok(*value),
1100 None => {
1101 let translated = match kind {
1102 regex_syntax::ast::ClassPerlKind::Word => {
1103 self.unicode_classes.ensure_word(tb);
1104 if negated { self.unicode_classes.non_word } else { self.unicode_classes.word }
1105 }
1106 regex_syntax::ast::ClassPerlKind::Digit => {
1107 self.unicode_classes.ensure_digit(tb);
1108 if negated { self.unicode_classes.non_digit } else { self.unicode_classes.digit }
1109 }
1110 regex_syntax::ast::ClassPerlKind::Space => {
1111 self.unicode_classes.ensure_space(tb);
1112 if negated { self.unicode_classes.non_space } else { self.unicode_classes.space }
1113 }
1114 };
1115 self.perl_classes.push((negated, kind, translated));
1116 Ok(translated)
1117 }
1118 }
1119 }
1120
1121 fn word_char_kind(ast: &Ast, left: bool) -> WordCharKind {
1122 use WordCharKind::*;
1123 match ast {
1124 Ast::Literal(lit) => {
1125 if is_word_byte(lit.c as u8) {
1126 Word
1127 } else {
1128 NonWord
1129 }
1130 }
1131 Ast::ClassPerl(c) => match (&c.kind, c.negated) {
1132 (®ex_syntax::ast::ClassPerlKind::Word, false) => Word,
1133 (®ex_syntax::ast::ClassPerlKind::Word, true) => NonWord,
1134 (®ex_syntax::ast::ClassPerlKind::Space, false) => NonWord,
1135 (®ex_syntax::ast::ClassPerlKind::Space, true) => Unknown,
1136 (®ex_syntax::ast::ClassPerlKind::Digit, false) => Word,
1137 (®ex_syntax::ast::ClassPerlKind::Digit, true) => Unknown,
1138 },
1139 Ast::Dot(_) | Ast::Top(_) => Unknown,
1140 Ast::Group(g) => Self::word_char_kind(&g.ast, left),
1141 Ast::Concat(c) if !c.asts.is_empty() => {
1142 Self::word_char_kind(&c.asts[if left { c.asts.len() - 1 } else { 0 }], left)
1143 }
1144 Ast::Repetition(r) => {
1145 let inner = Self::word_char_kind(&r.ast, left);
1146 let nullable = matches!(
1147 &r.op.kind,
1148 ast::RepetitionKind::ZeroOrMore
1149 | ast::RepetitionKind::ZeroOrOne
1150 | ast::RepetitionKind::Range(ast::RepetitionRange::Bounded(0, _))
1151 );
1152 if nullable {
1153 match inner {
1154 Word => MaybeWord,
1155 NonWord => MaybeNonWord,
1156 _ => Unknown,
1157 }
1158 } else {
1159 inner
1160 }
1161 }
1162 Ast::Lookaround(la) => Self::word_char_kind(&la.ast, left),
1163 _ => Unknown,
1164 }
1165 }
1166
1167 fn edge_class_ast<'a>(ast: &'a Ast, left: bool) -> Option<&'a Ast> {
1168 match ast {
1169 Ast::Literal(_) | Ast::ClassPerl(_) | Ast::ClassBracketed(_)
1170 | Ast::ClassUnicode(_) | Ast::Dot(_) | Ast::Top(_) => Some(ast),
1171 Ast::Group(g) => Self::edge_class_ast(&g.ast, left),
1172 Ast::Concat(c) if !c.asts.is_empty() => {
1173 Self::edge_class_ast(&c.asts[if left { c.asts.len() - 1 } else { 0 }], left)
1174 }
1175 Ast::Repetition(r) => Self::edge_class_ast(&r.ast, left),
1176 Ast::Lookaround(la) => Self::edge_class_ast(&la.ast, left),
1177 _ => None,
1178 }
1179 }
1180
1181 fn resolve_word_kind(
1182 &mut self,
1183 asts: &[Ast],
1184 idx: usize,
1185 dir: isize,
1186 translator: &mut Option<Translator>,
1187 tb: &mut TB<'s>,
1188 word_id: NodeId,
1189 not_word_id: NodeId,
1190 ) -> Result<WordCharKind> {
1191 use WordCharKind::*;
1192 let fast = Self::concat_neighbor_kind(asts, idx, dir);
1193 if fast != Unknown {
1194 return Ok(fast);
1195 }
1196 let neighbor_idx = (idx as isize + dir) as usize;
1197 let edge = match Self::edge_class_ast(&asts[neighbor_idx], dir < 0) {
1198 Some(e) => e,
1199 None => return Ok(Unknown),
1200 };
1201 let node = self.ast_to_node_id(edge, translator, tb)?;
1202 if tb.subsumes(word_id, node) == Some(true) {
1203 Ok(Word)
1204 } else if tb.subsumes(not_word_id, node) == Some(true) {
1205 Ok(NonWord)
1206 } else {
1207 Ok(Unknown)
1208 }
1209 }
1210
1211 fn concat_neighbor_kind(asts: &[Ast], idx: usize, dir: isize) -> WordCharKind {
1212 use WordCharKind::*;
1213 let next = idx as isize + dir;
1214 if next < 0 || next >= asts.len() as isize {
1215 return Edge;
1216 }
1217 let kind = Self::word_char_kind(&asts[next as usize], dir < 0);
1218 match kind {
1219 MaybeWord => match Self::concat_neighbor_kind(asts, next as usize, dir) {
1220 Word => Word,
1221 _ => Unknown,
1222 },
1223 MaybeNonWord => match Self::concat_neighbor_kind(asts, next as usize, dir) {
1224 NonWord => NonWord,
1225 _ => Unknown,
1226 },
1227 other => other,
1228 }
1229 }
1230
1231 fn rewrite_word_boundary_in_concat(
1232 &mut self,
1233 asts: &[Ast],
1234 idx: usize,
1235 translator: &mut Option<Translator>,
1236 tb: &mut TB<'s>,
1237 ) -> Result<(NodeId, usize)> {
1238 use WordCharKind::*;
1239 self.unicode_classes.ensure_word(tb);
1240 let word_id = self.unicode_classes.word;
1241 let not_word_id = self.unicode_classes.non_word;
1242 let left = self.resolve_word_kind(asts, idx, -1, translator, tb, word_id, not_word_id)?;
1243 let right =
1244 self.resolve_word_kind(asts, idx, 1, translator, tb, word_id, not_word_id)?;
1245
1246 match (left, right) {
1247 (NonWord, Word) | (Word, NonWord) => Ok((NodeId::EPS, idx + 1)),
1248 (Word, _) => {
1249 let neg = tb.mk_neg_lookahead(word_id, 0);
1250 Ok((neg, idx + 1))
1251 }
1252 (NonWord, _) => {
1253 let set = tb.mk_union(NodeId::END, word_id);
1254 let tail = tb.mk_concat(set, NodeId::TS);
1255 self.merge_boundary_with_following_lookaheads(asts, idx, tail, translator, tb)
1256 }
1257 (_, Word) => {
1258 Ok((tb.mk_neg_lookbehind(word_id), idx + 1))
1259 }
1260 (_, NonWord) => {
1261 let body = tb.mk_union(NodeId::BEGIN, word_id);
1262 Ok((tb.mk_lookbehind(body, NodeId::MISSING), idx + 1))
1263 }
1264 _ => return Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex)),
1267 }
1268 }
1269
1270 fn merge_boundary_with_following_lookaheads(
1271 &mut self,
1272 asts: &[Ast],
1273 wb_idx: usize,
1274 boundary_tail: NodeId,
1275 translator: &mut Option<Translator>,
1276 tb: &mut TB<'s>,
1277 ) -> Result<(NodeId, usize)> {
1278 let mut next = wb_idx + 1;
1279 let mut la_bodies = vec![boundary_tail];
1280 while next < asts.len() {
1281 match &asts[next] {
1282 Ast::Lookaround(la) if la.kind == ast::LookaroundKind::PositiveLookahead => {
1283 let body = self.ast_to_node_id(&la.ast, translator, tb)?;
1284 la_bodies.push(tb.mk_concat(body, NodeId::TS));
1285 next += 1;
1286 }
1287 _ => break,
1288 }
1289 }
1290 let merged = tb.mk_inters(la_bodies.into_iter());
1291 Ok((tb.mk_lookahead(merged, NodeId::MISSING, 0), next))
1292 }
1293
1294 fn ast_to_node_id(
1295 &mut self,
1296 ast: &Ast,
1297 translator: &mut Option<Translator>,
1298 tb: &mut TB<'s>,
1299 ) -> Result<NodeId> {
1300 match ast {
1301 Ast::Empty(_) => Ok(NodeId::EPS),
1302 Ast::Flags(f) => {
1303 let mut translator_builder = Self::default_translator_builder();
1304 if let Some(state) = f.flags.flag_state(ast::Flag::CaseInsensitive) {
1305 translator_builder.case_insensitive(state);
1306 }
1307 if let Some(state) = f.flags.flag_state(ast::Flag::Unicode) {
1308 translator_builder.unicode(state);
1309 }
1310 let concat_translator = Some(translator_builder.build());
1311 *translator = concat_translator;
1312 Ok(NodeId::EPS)
1313 }
1314 Ast::Literal(l) => {
1315 let ast_lit = regex_syntax::ast::Ast::literal(*l.to_owned());
1316 self.translator_to_node_id(&ast_lit, translator, tb)
1317 }
1318 Ast::Top(_) => Ok(NodeId::TOP),
1319 Ast::Dot(_) => {
1320 let hirv = hir::Hir::dot(hir::Dot::AnyByteExceptLF);
1321 self.hir_to_node_id(&hirv, tb)
1322 }
1323 Ast::Assertion(a) => match &a.kind {
1324 ast::AssertionKind::StartText => Ok(NodeId::BEGIN),
1325 ast::AssertionKind::EndText => Ok(NodeId::END),
1326 ast::AssertionKind::WordBoundary => {
1327 return Err(
1328 self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex),
1329 )
1330 }
1331 ast::AssertionKind::NotWordBoundary => {
1332 return Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))
1333 }
1334 ast::AssertionKind::StartLine => {
1335 let left = NodeId::BEGIN;
1336 let right = tb.mk_u8('\n' as u8);
1337 let union = tb.mk_union(left, right);
1338 Ok(tb.mk_lookbehind(union, NodeId::MISSING))
1339 }
1340 ast::AssertionKind::EndLine => {
1341 let left = NodeId::END;
1342 let right = tb.mk_u8('\n' as u8);
1343 let union = tb.mk_union(left, right);
1344 Ok(tb.mk_lookahead(union, NodeId::MISSING, 0))
1345 }
1346 ast::AssertionKind::WordBoundaryStart => todo!(),
1347 ast::AssertionKind::WordBoundaryEnd => todo!(),
1348 ast::AssertionKind::WordBoundaryStartAngle => todo!(),
1349 ast::AssertionKind::WordBoundaryEndAngle => Ok(tb.mk_string(">")),
1350 ast::AssertionKind::WordBoundaryStartHalf => todo!(),
1351 ast::AssertionKind::WordBoundaryEndHalf => todo!(),
1352 },
1353 Ast::ClassUnicode(c) => {
1354 let tmp = regex_syntax::ast::ClassUnicode {
1355 span: c.span,
1356 negated: c.negated,
1357 kind: c.kind.clone(),
1358 };
1359 if !c.negated {
1360 match &c.kind {
1361 regex_syntax::ast::ClassUnicodeKind::Named(s) => match s.as_str() {
1362 "ascii" => return Ok(tb.mk_range_u8(0, 127)),
1364 "utf8" => {
1366 let ascii = tb.mk_range_u8(0, 127);
1367 let beta = tb.mk_range_u8(128, 0xBF);
1368 let c0 = tb.mk_range_u8(0xC0, 0xDF);
1369 let c0s = tb.mk_concats([c0, beta].into_iter());
1370 let e0 = tb.mk_range_u8(0xE0, 0xEF);
1371 let e0s = tb.mk_concats([e0, beta, beta].into_iter());
1372 let f0 = tb.mk_range_u8(0xF0, 0xF7);
1373 let f0s = tb.mk_concats([f0, beta, beta, beta].into_iter());
1374 let merged = tb.mk_unions([ascii, c0s, e0s, f0s].into_iter());
1375 return Ok(tb.mk_star(merged));
1376 }
1377 "hex" => {
1378 let nums = tb.mk_range_u8(b'0', b'9');
1379 let lets = tb.mk_range_u8(b'a', b'f');
1380 let lets2 = tb.mk_range_u8(b'A', b'F');
1381 let merged = tb.mk_unions([nums, lets, lets2].into_iter());
1382 return Ok(merged);
1383 }
1384 _ => {}
1385 },
1386 _ => {}
1387 };
1388 }
1389
1390 let orig_ast = regex_syntax::ast::Ast::class_unicode(tmp);
1391 self.translator_to_node_id(&orig_ast, translator, tb)
1392 }
1393 Ast::ClassPerl(c) => {
1394 self.get_class(c.negated, c.kind.clone(), tb)
1395 }
1396 Ast::ClassBracketed(c) => match &c.kind {
1397 regex_syntax::ast::ClassSet::Item(_) => {
1398 let tmp = regex_syntax::ast::ClassBracketed {
1399 span: c.span,
1400 negated: c.negated,
1401 kind: c.kind.clone(),
1402 };
1403 let orig_ast = regex_syntax::ast::Ast::class_bracketed(tmp);
1404 self.translator_to_node_id(&orig_ast, translator, tb)
1405 }
1406 regex_syntax::ast::ClassSet::BinaryOp(_) => todo!(),
1407 },
1408 Ast::Repetition(r) => {
1409 let body = self.ast_to_node_id(&r.ast, translator, tb);
1410 match body {
1411 Ok(body) => match &r.op.kind {
1412 ast::RepetitionKind::ZeroOrOne => Ok(tb.mk_opt(body)),
1413 ast::RepetitionKind::ZeroOrMore => Ok(tb.mk_star(body)),
1414 ast::RepetitionKind::OneOrMore => Ok(tb.mk_plus(body)),
1415 ast::RepetitionKind::Range(r) => match r {
1416 ast::RepetitionRange::Exactly(n) => Ok(tb.mk_repeat(body, *n, *n)),
1417 ast::RepetitionRange::AtLeast(n) => {
1418 let rep = tb.mk_repeat(body, *n, *n);
1419 let st = tb.mk_star(body);
1420 Ok(tb.mk_concat(rep, st))
1421 }
1422
1423 ast::RepetitionRange::Bounded(n, m) => Ok(tb.mk_repeat(body, *n, *m)),
1424 },
1425 },
1426 Err(_) => body,
1427 }
1428 }
1429 Ast::Lookaround(g) => {
1430 let body = self.ast_to_node_id(&g.ast, translator, tb)?;
1431 match g.kind {
1432 ast::LookaroundKind::PositiveLookahead => {
1433 Ok(tb.mk_lookahead(body, NodeId::MISSING, 0))
1434 }
1435 ast::LookaroundKind::PositiveLookbehind => {
1436 Ok(tb.mk_lookbehind(body, NodeId::MISSING))
1437 }
1438 ast::LookaroundKind::NegativeLookahead => Ok(tb.mk_neg_lookahead(body, 0)),
1439 ast::LookaroundKind::NegativeLookbehind => Ok(tb.mk_neg_lookbehind(body)),
1440 }
1441 }
1442 Ast::Group(g) => {
1443 if let ast::GroupKind::NonCapturing(ref flags) = g.kind {
1444 if !flags.items.is_empty() {
1445 let mut translator_builder = Self::default_translator_builder();
1446 translator_builder.utf8(false);
1447 if let Some(state) = flags.flag_state(ast::Flag::CaseInsensitive) {
1448 translator_builder.case_insensitive(state);
1449 }
1450 if let Some(state) = flags.flag_state(ast::Flag::Unicode) {
1451 translator_builder.unicode(state);
1452 }
1453 let mut scoped = Some(translator_builder.build());
1454 return self.ast_to_node_id(&g.ast, &mut scoped, tb);
1455 }
1456 }
1457 self.ast_to_node_id(&g.ast, translator, tb)
1458 }
1459 Ast::Alternation(a) => {
1460 let mut children = vec![];
1461 for ast in &a.asts {
1462 match self.ast_to_node_id(ast, translator, tb) {
1463 Ok(node_id) => children.push(node_id),
1464 Err(err) => return Err(err),
1465 }
1466 }
1467 Ok(tb.mk_unions(children.iter().copied()))
1468 }
1469 Ast::Concat(c) => {
1470 let mut concat_translator: Option<Translator> = None;
1471 let mut children = vec![];
1472 let mut i = 0;
1473 while i < c.asts.len() {
1474 let ast = &c.asts[i];
1475 match ast {
1476 Ast::Flags(f) => {
1477 let mut translator_builder = Self::default_translator_builder();
1478 translator_builder.utf8(false);
1479 if let Some(state) = f.flags.flag_state(ast::Flag::CaseInsensitive) {
1480 translator_builder.case_insensitive(state);
1481 }
1482 if let Some(state) = f.flags.flag_state(ast::Flag::Unicode) {
1483 translator_builder.unicode(state);
1484 }
1485 concat_translator = Some(translator_builder.build());
1486 i += 1;
1487 continue;
1488 }
1489 Ast::Assertion(a) if a.kind == ast::AssertionKind::WordBoundary => {
1490 let node =
1491 self.rewrite_word_boundary_in_concat(&c.asts, i, translator, tb)?;
1492 children.push(node.0);
1493 i = node.1; continue;
1495 }
1496 _ => {}
1497 }
1498 match concat_translator {
1499 Some(_) => match self.ast_to_node_id(ast, &mut concat_translator, tb) {
1500 Ok(node_id) => children.push(node_id),
1501 Err(err) => return Err(err),
1502 },
1503 None => match self.ast_to_node_id(ast, translator, tb) {
1504 Ok(node_id) => children.push(node_id),
1505 Err(err) => return Err(err),
1506 },
1507 }
1508 i += 1;
1509 }
1510 Ok(tb.mk_concats(children.iter().cloned()))
1511 }
1512 Ast::Intersection(intersection) => {
1513 let mut children = vec![];
1514 for ast in &intersection.asts {
1515 match self.ast_to_node_id(ast, translator, tb) {
1516 Ok(node_id) => children.push(node_id),
1517 Err(err) => return Err(err),
1518 }
1519 }
1520 Ok(tb.mk_inters(children.into_iter()))
1521 }
1522 Ast::Complement(complement) => {
1523 let body = self.ast_to_node_id(&complement.ast, translator, tb);
1524 body.map(|x| tb.mk_compl(x))
1525 }
1526 }
1527 }
1528
1529 fn parse(&mut self, tb: &mut TB<'s>) -> Result<NodeId> {
1532 let mut concat = Concat {
1533 span: self.span(),
1534 asts: vec![],
1535 };
1536 loop {
1537 self.bump_space();
1538 if self.is_eof() {
1539 break;
1540 }
1541 match self.char() {
1542 '(' => concat = self.push_group(concat)?,
1543 ')' => concat = self.pop_group(concat)?,
1544 '|' => concat = self.push_alternate(concat)?,
1545 '&' => concat = self.push_intersect(concat)?,
1546 '~' => concat = self.push_compl_group(concat)?,
1547 '[' => {
1548 let class = self.parse_set_class()?;
1549 concat.asts.push(Ast::class_bracketed(class));
1550 }
1551 '?' => {
1552 concat =
1553 self.parse_uncounted_repetition(concat, ast::RepetitionKind::ZeroOrOne)?;
1554 }
1555 '*' => {
1556 concat =
1557 self.parse_uncounted_repetition(concat, ast::RepetitionKind::ZeroOrMore)?;
1558 }
1559 '+' => {
1560 concat =
1561 self.parse_uncounted_repetition(concat, ast::RepetitionKind::OneOrMore)?;
1562 }
1563 '{' => {
1564 concat = self.parse_counted_repetition(concat)?;
1565 }
1566 _ => concat.asts.push(self.parse_primitive()?.into_ast()),
1567 }
1568 }
1569 let ast = self.pop_group_end(concat)?;
1570 let node_id = self.ast_to_node_id(&ast, &mut None, tb);
1572 node_id
1573 }
1574
1575 #[inline(never)]
1576 fn parse_uncounted_repetition(
1577 &self,
1578 mut concat: ast::Concat,
1579 kind: ast::RepetitionKind,
1580 ) -> Result<ast::Concat> {
1581 let op_start = self.pos();
1583 let ast = match concat.asts.pop() {
1584 Some(ast) => ast,
1585 None => return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing)),
1586 };
1587 match ast {
1588 Ast::Empty(_) | Ast::Flags(_) => {
1589 return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing))
1590 }
1591 _ => {}
1592 }
1593 if self.bump() && self.char() == '?' {
1594 return Err(self.error(
1595 Span::new(op_start, self.pos()),
1596 ast::ErrorKind::UnsupportedLazyQuantifier,
1597 ));
1598 }
1599 concat.asts.push(Ast::repetition(ast::Repetition {
1600 span: ast.span().with_end(self.pos()),
1601 op: ast::RepetitionOp {
1602 span: Span::new(op_start, self.pos()),
1603 kind,
1604 },
1605 greedy: true,
1606 ast: Box::new(ast),
1607 }));
1608 Ok(concat)
1609 }
1610
1611 #[inline(never)]
1612 fn parse_counted_repetition(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
1613 assert!(self.char() == '{');
1614 let start = self.pos();
1615 let ast = match concat.asts.pop() {
1616 Some(ast) => ast,
1617 None => return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing)),
1618 };
1619 match ast {
1620 Ast::Empty(_) | Ast::Flags(_) => {
1621 return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing))
1622 }
1623 _ => {}
1624 }
1625 if !self.bump_and_bump_space() {
1626 return Err(self.error(
1627 Span::new(start, self.pos()),
1628 ast::ErrorKind::RepetitionCountUnclosed,
1629 ));
1630 }
1631 let count_start = specialize_err(
1632 self.parse_decimal(),
1633 ast::ErrorKind::DecimalEmpty,
1634 ast::ErrorKind::RepetitionCountDecimalEmpty,
1635 );
1636 if self.is_eof() {
1637 return Err(self.error(
1638 Span::new(start, self.pos()),
1639 ast::ErrorKind::RepetitionCountUnclosed,
1640 ));
1641 }
1642 let range = if self.char() == ',' {
1643 if !self.bump_and_bump_space() {
1644 return Err(self.error(
1645 Span::new(start, self.pos()),
1646 ast::ErrorKind::RepetitionCountUnclosed,
1647 ));
1648 }
1649 if self.char() != '}' {
1650 let count_start = match count_start {
1651 Ok(c) => c,
1652 Err(err) if err.kind == ast::ErrorKind::RepetitionCountDecimalEmpty => {
1653 if self.parser().empty_min_range {
1654 0
1655 } else {
1656 return Err(err);
1657 }
1658 }
1659 err => err?,
1660 };
1661 let count_end = specialize_err(
1662 self.parse_decimal(),
1663 ast::ErrorKind::DecimalEmpty,
1664 ast::ErrorKind::RepetitionCountDecimalEmpty,
1665 )?;
1666 ast::RepetitionRange::Bounded(count_start, count_end)
1667 } else {
1668 ast::RepetitionRange::AtLeast(count_start?)
1669 }
1670 } else {
1671 ast::RepetitionRange::Exactly(count_start?)
1672 };
1673
1674 if self.is_eof() || self.char() != '}' {
1675 return Err(self.error(
1676 Span::new(start, self.pos()),
1677 ast::ErrorKind::RepetitionCountUnclosed,
1678 ));
1679 }
1680
1681 if self.bump_and_bump_space() && self.char() == '?' {
1682 return Err(self.error(
1683 Span::new(start, self.pos()),
1684 ast::ErrorKind::UnsupportedLazyQuantifier,
1685 ));
1686 }
1687
1688 let op_span = Span::new(start, self.pos());
1689 if !range.is_valid() {
1690 return Err(self.error(op_span, ast::ErrorKind::RepetitionCountInvalid));
1691 }
1692 concat.asts.push(Ast::repetition(ast::Repetition {
1693 span: ast.span().with_end(self.pos()),
1694 op: ast::RepetitionOp {
1695 span: op_span,
1696 kind: ast::RepetitionKind::Range(range),
1697 },
1698 greedy: true,
1699 ast: Box::new(ast),
1700 }));
1701 Ok(concat)
1702 }
1703
1704 #[inline(never)]
1705 fn parse_group(&self) -> Result<Either<ast::SetFlags, ast::Group>> {
1706 assert_eq!(self.char(), '(');
1707 let open_span = self.span_char();
1708 self.bump();
1709 self.bump_space();
1710 if let Some((ahead, pos)) = self.is_lookaround_prefix() {
1711 let kind = match (pos, ahead) {
1712 (true, true) => LookaroundKind::PositiveLookahead,
1713 (true, false) => LookaroundKind::PositiveLookbehind,
1714 (false, true) => LookaroundKind::NegativeLookahead,
1715 (false, false) => LookaroundKind::NegativeLookbehind,
1716 };
1717 return Ok(Either::Right(ast::Group {
1718 span: open_span,
1719 kind: ast::GroupKind::Lookaround(kind),
1720 ast: Box::new(Ast::empty(self.span())),
1721 }));
1722 }
1723 let inner_span = self.span();
1724 let mut starts_with_p = true;
1725 if self.bump_if("?P<") || {
1726 starts_with_p = false;
1727 self.bump_if("?<")
1728 } {
1729 let capture_index = self.next_capture_index(open_span)?;
1730 let name = self.parse_capture_name(capture_index)?;
1731 Ok(Either::Right(ast::Group {
1732 span: open_span,
1733 kind: ast::GroupKind::CaptureName {
1734 starts_with_p,
1735 name,
1736 },
1737 ast: Box::new(Ast::empty(self.span())),
1738 }))
1739 } else if self.bump_if("?") {
1740 if self.is_eof() {
1741 return Err(self.error(open_span, ast::ErrorKind::GroupUnclosed));
1742 }
1743 let flags = self.parse_flags()?;
1744 let char_end = self.char();
1745 self.bump();
1746 if char_end == ')' {
1747 if flags.items.is_empty() {
1750 return Err(self.error(inner_span, ast::ErrorKind::RepetitionMissing));
1751 }
1752 Ok(Either::Left(ast::SetFlags {
1753 span: Span {
1754 end: self.pos(),
1755 ..open_span
1756 },
1757 flags,
1758 }))
1759 } else {
1760 assert_eq!(char_end, ':');
1761 Ok(Either::Right(ast::Group {
1762 span: open_span,
1763 kind: ast::GroupKind::NonCapturing(flags),
1764 ast: Box::new(Ast::empty(self.span())),
1765 }))
1766 }
1767 } else {
1768 let capture_index = self.next_capture_index(open_span)?;
1769 Ok(Either::Right(ast::Group {
1770 span: open_span,
1771 kind: ast::GroupKind::CaptureIndex(capture_index),
1772 ast: Box::new(Ast::empty(self.span())),
1773 }))
1774 }
1775 }
1776
1777 #[inline(never)]
1778 fn parse_capture_name(&self, capture_index: u32) -> Result<ast::CaptureName> {
1779 if self.is_eof() {
1780 return Err(self.error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
1781 }
1782 let start = self.pos();
1783 loop {
1784 if self.char() == '>' {
1785 break;
1786 }
1787 if !is_capture_char(self.char(), self.pos() == start) {
1788 return Err(self.error(self.span_char(), ast::ErrorKind::GroupNameInvalid));
1789 }
1790 if !self.bump() {
1791 break;
1792 }
1793 }
1794 let end = self.pos();
1795 if self.is_eof() {
1796 return Err(self.error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
1797 }
1798 assert_eq!(self.char(), '>');
1799 self.bump();
1800 let name = &self.pattern()[start.offset..end.offset];
1801 if name.is_empty() {
1802 return Err(self.error(Span::new(start, start), ast::ErrorKind::GroupNameEmpty));
1803 }
1804 let capname = ast::CaptureName {
1805 span: Span::new(start, end),
1806 name: name.to_string(),
1807 index: capture_index,
1808 };
1809 self.add_capture_name(&capname)?;
1810 Ok(capname)
1811 }
1812
1813 #[inline(never)]
1814 fn parse_flags(&self) -> Result<ast::Flags> {
1815 let mut flags = ast::Flags {
1816 span: self.span(),
1817 items: vec![],
1818 };
1819 let mut last_was_negation = None;
1820 while self.char() != ':' && self.char() != ')' {
1821 if self.char() == '-' {
1822 last_was_negation = Some(self.span_char());
1823 let item = ast::FlagsItem {
1824 span: self.span_char(),
1825 kind: ast::FlagsItemKind::Negation,
1826 };
1827 if let Some(i) = flags.add_item(item) {
1828 return Err(self.error(
1829 self.span_char(),
1830 ast::ErrorKind::FlagRepeatedNegation {
1831 original: flags.items[i].span,
1832 },
1833 ));
1834 }
1835 } else {
1836 last_was_negation = None;
1837 let item = ast::FlagsItem {
1838 span: self.span_char(),
1839 kind: ast::FlagsItemKind::Flag(self.parse_flag()?),
1840 };
1841 if let Some(i) = flags.add_item(item) {
1842 return Err(self.error(
1843 self.span_char(),
1844 ast::ErrorKind::FlagDuplicate {
1845 original: flags.items[i].span,
1846 },
1847 ));
1848 }
1849 }
1850 if !self.bump() {
1851 return Err(self.error(self.span(), ast::ErrorKind::FlagUnexpectedEof));
1852 }
1853 }
1854 if let Some(span) = last_was_negation {
1855 return Err(self.error(span, ast::ErrorKind::FlagDanglingNegation));
1856 }
1857 flags.span.end = self.pos();
1858 Ok(flags)
1859 }
1860
1861 #[inline(never)]
1862 fn parse_flag(&self) -> Result<ast::Flag> {
1863 match self.char() {
1864 'i' => Ok(ast::Flag::CaseInsensitive),
1865 'm' => Ok(ast::Flag::MultiLine),
1866 's' => Ok(ast::Flag::DotMatchesNewLine),
1867 'U' => Ok(ast::Flag::SwapGreed),
1868 'u' => Ok(ast::Flag::Unicode),
1869 'R' => Ok(ast::Flag::CRLF),
1870 'x' => Ok(ast::Flag::IgnoreWhitespace),
1871 _ => Err(self.error(self.span_char(), ast::ErrorKind::FlagUnrecognized)),
1872 }
1873 }
1874
1875 fn parse_primitive(&self) -> Result<Primitive> {
1876 match self.char() {
1877 '\\' => self.parse_escape(),
1878 '_' => {
1879 let ast = Primitive::Top(self.span_char());
1880 self.bump();
1881 Ok(ast)
1882 }
1883 '.' => {
1884 let ast = Primitive::Dot(self.span_char());
1885 self.bump();
1886 Ok(ast)
1887 }
1888 '^' => {
1889 let ast = Primitive::Assertion(ast::Assertion {
1890 span: self.span_char(),
1891 kind: ast::AssertionKind::StartLine,
1892 });
1893 self.bump();
1894 Ok(ast)
1895 }
1896 '$' => {
1897 let ast = Primitive::Assertion(ast::Assertion {
1898 span: self.span_char(),
1899 kind: ast::AssertionKind::EndLine,
1900 });
1901 self.bump();
1902 Ok(ast)
1903 }
1904 c => {
1905 let ast = Primitive::Literal(Literal {
1906 span: self.span_char(),
1907 kind: LiteralKind::Verbatim,
1908 c,
1909 });
1910 self.bump();
1911 Ok(ast)
1912 }
1913 }
1914 }
1915
1916 #[inline(never)]
1917 fn parse_escape(&self) -> Result<Primitive> {
1918 assert_eq!(self.char(), '\\');
1919 let start = self.pos();
1920 if !self.bump() {
1921 return Err(self.error(
1922 Span::new(start, self.pos()),
1923 ast::ErrorKind::EscapeUnexpectedEof,
1924 ));
1925 }
1926 let c = self.char();
1927 match c {
1929 '0'..='9' => {
1930 if !self.parser().octal {
1931 return Err(self.error(
1932 Span::new(start, self.span_char().end),
1933 ast::ErrorKind::UnsupportedBackreference,
1934 ));
1935 }
1936 let mut lit = self.parse_octal();
1937 lit.span.start = start;
1938 return Ok(Primitive::Literal(lit));
1939 }
1940 'x' | 'u' | 'U' => {
1947 let mut lit = self.parse_hex()?;
1948 lit.span.start = start;
1949 return Ok(Primitive::Literal(lit));
1950 }
1951 'p' | 'P' => {
1952 let mut cls = self.parse_unicode_class()?;
1953 cls.span.start = start;
1954 return Ok(Primitive::Unicode(cls));
1955 }
1956 'd' | 's' | 'w' | 'D' | 'S' | 'W' => {
1957 let mut cls = self.parse_perl_class();
1958 cls.span.start = start;
1959 return Ok(Primitive::Perl(cls));
1960 }
1961 _ => {}
1962 }
1963
1964 self.bump();
1966 let span = Span::new(start, self.pos());
1967 if is_meta_character(c) {
1968 return Ok(Primitive::Literal(Literal {
1969 span,
1970 kind: LiteralKind::Meta,
1971 c,
1972 }));
1973 }
1974 if is_escapeable_character(c) {
1975 return Ok(Primitive::Literal(Literal {
1976 span,
1977 kind: LiteralKind::Superfluous,
1978 c,
1979 }));
1980 }
1981 let special = |kind, c| {
1982 Ok(Primitive::Literal(Literal {
1983 span,
1984 kind: LiteralKind::Special(kind),
1985 c,
1986 }))
1987 };
1988 match c {
1989 'a' => special(SpecialLiteralKind::Bell, '\x07'),
1990 'f' => special(SpecialLiteralKind::FormFeed, '\x0C'),
1991 't' => special(SpecialLiteralKind::Tab, '\t'),
1992 'n' => special(SpecialLiteralKind::LineFeed, '\n'),
1993 'r' => special(SpecialLiteralKind::CarriageReturn, '\r'),
1994 'v' => special(SpecialLiteralKind::VerticalTab, '\x0B'),
1995 'A' => Ok(Primitive::Assertion(ast::Assertion {
1996 span,
1997 kind: ast::AssertionKind::StartText,
1998 })),
1999 'z' => Ok(Primitive::Assertion(ast::Assertion {
2000 span,
2001 kind: ast::AssertionKind::EndText,
2002 })),
2003 'b' => {
2004 let mut wb = ast::Assertion {
2005 span,
2006 kind: ast::AssertionKind::WordBoundary,
2007 };
2008 if !self.is_eof() && self.char() == '{' {
2011 if let Some(kind) = self.maybe_parse_special_word_boundary(start)? {
2012 wb.kind = kind;
2013 wb.span.end = self.pos();
2014 }
2015 }
2016 Ok(Primitive::Assertion(wb))
2017 }
2018 'B' => Ok(Primitive::Assertion(ast::Assertion {
2019 span,
2020 kind: ast::AssertionKind::NotWordBoundary,
2021 })),
2022 '<' => Ok(Primitive::Assertion(ast::Assertion {
2023 span,
2024 kind: ast::AssertionKind::WordBoundaryStartAngle,
2025 })),
2026 '>' => Ok(Primitive::Assertion(ast::Assertion {
2027 span,
2028 kind: ast::AssertionKind::WordBoundaryEndAngle,
2029 })),
2030 _ => Err(self.error(span, ast::ErrorKind::EscapeUnrecognized)),
2031 }
2032 }
2033
2034 fn maybe_parse_special_word_boundary(
2035 &self,
2036 wb_start: Position,
2037 ) -> Result<Option<ast::AssertionKind>> {
2038 assert_eq!(self.char(), '{');
2039
2040 let is_valid_char = |c| match c {
2041 'A'..='Z' | 'a'..='z' | '-' => true,
2042 _ => false,
2043 };
2044 let start = self.pos();
2045 if !self.bump_and_bump_space() {
2046 return Err(self.error(
2047 Span::new(wb_start, self.pos()),
2048 ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof,
2049 ));
2050 }
2051 let start_contents = self.pos();
2052 if !is_valid_char(self.char()) {
2057 self.parser().pos.set(start);
2058 return Ok(None);
2059 }
2060
2061 let mut scratch = self.parser().scratch.borrow_mut();
2063 scratch.clear();
2064 while !self.is_eof() && is_valid_char(self.char()) {
2065 scratch.push(self.char());
2066 self.bump_and_bump_space();
2067 }
2068 if self.is_eof() || self.char() != '}' {
2069 return Err(self.error(
2070 Span::new(start, self.pos()),
2071 ast::ErrorKind::SpecialWordBoundaryUnclosed,
2072 ));
2073 }
2074 let end = self.pos();
2075 self.bump();
2076 let kind = match scratch.as_str() {
2077 "start" => ast::AssertionKind::WordBoundaryStart,
2078 "end" => ast::AssertionKind::WordBoundaryEnd,
2079 "start-half" => ast::AssertionKind::WordBoundaryStartHalf,
2080 "end-half" => ast::AssertionKind::WordBoundaryEndHalf,
2081 _ => {
2082 return Err(self.error(
2083 Span::new(start_contents, end),
2084 ast::ErrorKind::SpecialWordBoundaryUnrecognized,
2085 ))
2086 }
2087 };
2088 Ok(Some(kind))
2089 }
2090
2091 #[inline(never)]
2092 fn parse_octal(&self) -> Literal {
2093 assert!(self.parser().octal);
2094 assert!('0' <= self.char() && self.char() <= '7');
2095 let start = self.pos();
2096 while self.bump()
2098 && '0' <= self.char()
2099 && self.char() <= '7'
2100 && self.pos().offset - start.offset <= 2
2101 {}
2102 let end = self.pos();
2103 let octal = &self.pattern()[start.offset..end.offset];
2104 let codepoint = u32::from_str_radix(octal, 8).expect("valid octal number");
2107 let c = char::from_u32(codepoint).expect("Unicode scalar value");
2110 Literal {
2111 span: Span::new(start, end),
2112 kind: LiteralKind::Octal,
2113 c,
2114 }
2115 }
2116
2117 #[inline(never)]
2118 fn parse_hex(&self) -> Result<Literal> {
2119 assert!(self.char() == 'x' || self.char() == 'u' || self.char() == 'U');
2120
2121 let hex_kind = match self.char() {
2122 'x' => HexLiteralKind::X,
2123 'u' => HexLiteralKind::UnicodeShort,
2124 _ => HexLiteralKind::UnicodeLong,
2125 };
2126 if !self.bump_and_bump_space() {
2127 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2128 }
2129 if self.char() == '{' {
2130 self.parse_hex_brace(hex_kind)
2131 } else {
2132 self.parse_hex_digits(hex_kind)
2133 }
2134 }
2135
2136 #[inline(never)]
2137 fn parse_hex_digits(&self, kind: HexLiteralKind) -> Result<Literal> {
2138 let mut scratch = self.parser().scratch.borrow_mut();
2139 scratch.clear();
2140
2141 let start = self.pos();
2142 for i in 0..kind.digits() {
2143 if i > 0 && !self.bump_and_bump_space() {
2144 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2145 }
2146 if !is_hex(self.char()) {
2147 return Err(self.error(self.span_char(), ast::ErrorKind::EscapeHexInvalidDigit));
2148 }
2149 scratch.push(self.char());
2150 }
2151 self.bump_and_bump_space();
2154 let end = self.pos();
2155 let hex = scratch.as_str();
2156 match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
2157 None => Err(self.error(Span::new(start, end), ast::ErrorKind::EscapeHexInvalid)),
2158 Some(c) => Ok(Literal {
2159 span: Span::new(start, end),
2160 kind: LiteralKind::HexFixed(kind),
2161 c,
2162 }),
2163 }
2164 }
2165
2166 #[inline(never)]
2167 fn parse_hex_brace(&self, kind: HexLiteralKind) -> Result<Literal> {
2168 let mut scratch = self.parser().scratch.borrow_mut();
2169 scratch.clear();
2170
2171 let brace_pos = self.pos();
2172 let start = self.span_char().end;
2173 while self.bump_and_bump_space() && self.char() != '}' {
2174 if !is_hex(self.char()) {
2175 return Err(self.error(self.span_char(), ast::ErrorKind::EscapeHexInvalidDigit));
2176 }
2177 scratch.push(self.char());
2178 }
2179 if self.is_eof() {
2180 return Err(self.error(
2181 Span::new(brace_pos, self.pos()),
2182 ast::ErrorKind::EscapeUnexpectedEof,
2183 ));
2184 }
2185 let end = self.pos();
2186 let hex = scratch.as_str();
2187 assert_eq!(self.char(), '}');
2188 self.bump_and_bump_space();
2189
2190 if hex.is_empty() {
2191 return Err(self.error(
2192 Span::new(brace_pos, self.pos()),
2193 ast::ErrorKind::EscapeHexEmpty,
2194 ));
2195 }
2196 match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
2197 None => Err(self.error(Span::new(start, end), ast::ErrorKind::EscapeHexInvalid)),
2198 Some(c) => Ok(Literal {
2199 span: Span::new(start, self.pos()),
2200 kind: LiteralKind::HexBrace(kind),
2201 c,
2202 }),
2203 }
2204 }
2205
2206 fn parse_decimal(&self) -> Result<u32> {
2207 let mut scratch = self.parser().scratch.borrow_mut();
2208 scratch.clear();
2209
2210 while !self.is_eof() && self.char().is_whitespace() {
2211 self.bump();
2212 }
2213 let start = self.pos();
2214 while !self.is_eof() && '0' <= self.char() && self.char() <= '9' {
2215 scratch.push(self.char());
2216 self.bump_and_bump_space();
2217 }
2218 let span = Span::new(start, self.pos());
2219 while !self.is_eof() && self.char().is_whitespace() {
2220 self.bump_and_bump_space();
2221 }
2222 let digits = scratch.as_str();
2223 if digits.is_empty() {
2224 return Err(self.error(span, ast::ErrorKind::DecimalEmpty));
2225 }
2226 match u32::from_str_radix(digits, 10).ok() {
2227 Some(n) => Ok(n),
2228 None => Err(self.error(span, ast::ErrorKind::DecimalInvalid)),
2229 }
2230 }
2231
2232 #[inline(never)]
2233 fn parse_set_class(&self) -> Result<ClassBracketed> {
2234 assert_eq!(self.char(), '[');
2235
2236 let mut union = ClassSetUnion {
2237 span: self.span(),
2238 items: vec![],
2239 };
2240 loop {
2241 self.bump_space();
2242 if self.is_eof() {
2243 return Err(self.unclosed_class_error());
2244 }
2245 match self.char() {
2246 '[' => {
2247 if !self.parser().stack_class.borrow().is_empty() {
2252 if let Some(cls) = self.maybe_parse_ascii_class() {
2253 union.push(ClassSetItem::Ascii(cls));
2254 continue;
2255 }
2256 }
2257 union = self.push_class_open(union)?;
2258 }
2259 ']' => match self.pop_class(union)? {
2260 Either::Left(nested_union) => {
2261 union = nested_union;
2262 }
2263 Either::Right(class) => return Ok(class),
2264 },
2265 '&' if self.peek() == Some('&') => {
2266 assert!(self.bump_if("&&"));
2267 union = self.push_class_op(ClassSetBinaryOpKind::Intersection, union);
2268 }
2269 '-' if self.peek() == Some('-') => {
2270 assert!(self.bump_if("--"));
2271 union = self.push_class_op(ClassSetBinaryOpKind::Difference, union);
2272 }
2273 '~' if self.peek() == Some('~') => {
2274 assert!(self.bump_if("~~"));
2275 union = self.push_class_op(ClassSetBinaryOpKind::SymmetricDifference, union);
2276 }
2277 _ => {
2278 union.push(self.parse_set_class_range()?);
2279 }
2280 }
2281 }
2282 }
2283
2284 #[inline(never)]
2285 fn parse_set_class_range(&self) -> Result<ClassSetItem> {
2286 let prim1 = self.parse_set_class_item()?;
2287 self.bump_space();
2288 if self.is_eof() {
2289 return Err(self.unclosed_class_error());
2290 }
2291 if self.char() != '-' || self.peek_space() == Some(']') || self.peek_space() == Some('-') {
2292 return prim1.into_class_set_item(self);
2293 }
2294 if !self.bump_and_bump_space() {
2295 return Err(self.unclosed_class_error());
2296 }
2297 let prim2 = self.parse_set_class_item()?;
2298 let range = ClassSetRange {
2299 span: Span::new(prim1.span().start, prim2.span().end),
2300 start: prim1.into_class_literal(self)?,
2301 end: prim2.into_class_literal(self)?,
2302 };
2303 if !range.is_valid() {
2304 return Err(self.error(range.span, ast::ErrorKind::ClassRangeInvalid));
2305 }
2306 Ok(ClassSetItem::Range(range))
2307 }
2308
2309 #[inline(never)]
2310 fn parse_set_class_item(&self) -> Result<Primitive> {
2311 if self.char() == '\\' {
2312 self.parse_escape()
2313 } else {
2314 let x = Primitive::Literal(Literal {
2315 span: self.span_char(),
2316 kind: LiteralKind::Verbatim,
2317 c: self.char(),
2318 });
2319 self.bump();
2320 Ok(x)
2321 }
2322 }
2323
2324 #[inline(never)]
2325 fn parse_set_class_open(&self) -> Result<(ClassBracketed, ClassSetUnion)> {
2326 assert_eq!(self.char(), '[');
2327 let start = self.pos();
2328 if !self.bump_and_bump_space() {
2329 return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2330 }
2331
2332 let negated = if self.char() != '^' {
2333 false
2334 } else {
2335 if !self.bump_and_bump_space() {
2336 return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2337 }
2338 true
2339 };
2340 let mut union = ClassSetUnion {
2342 span: self.span(),
2343 items: vec![],
2344 };
2345 while self.char() == '-' {
2346 union.push(ClassSetItem::Literal(Literal {
2347 span: self.span_char(),
2348 kind: LiteralKind::Verbatim,
2349 c: '-',
2350 }));
2351 if !self.bump_and_bump_space() {
2352 return Err(self.error(Span::new(start, start), ast::ErrorKind::ClassUnclosed));
2353 }
2354 }
2355 if union.items.is_empty() && self.char() == ']' {
2358 union.push(ClassSetItem::Literal(Literal {
2359 span: self.span_char(),
2360 kind: LiteralKind::Verbatim,
2361 c: ']',
2362 }));
2363 if !self.bump_and_bump_space() {
2364 return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2365 }
2366 }
2367 let set = ClassBracketed {
2368 span: Span::new(start, self.pos()),
2369 negated,
2370 kind: ClassSet::union(ClassSetUnion {
2371 span: Span::new(union.span.start, union.span.start),
2372 items: vec![],
2373 }),
2374 };
2375 Ok((set, union))
2376 }
2377
2378 #[inline(never)]
2379 fn maybe_parse_ascii_class(&self) -> Option<ClassAscii> {
2380 assert_eq!(self.char(), '[');
2381 let start = self.pos();
2383 let mut negated = false;
2384 if !self.bump() || self.char() != ':' {
2385 self.parser().pos.set(start);
2386 return None;
2387 }
2388 if !self.bump() {
2389 self.parser().pos.set(start);
2390 return None;
2391 }
2392 if self.char() == '^' {
2393 negated = true;
2394 if !self.bump() {
2395 self.parser().pos.set(start);
2396 return None;
2397 }
2398 }
2399 let name_start = self.offset();
2400 while self.char() != ':' && self.bump() {}
2401 if self.is_eof() {
2402 self.parser().pos.set(start);
2403 return None;
2404 }
2405 let name = &self.pattern()[name_start..self.offset()];
2406 if !self.bump_if(":]") {
2407 self.parser().pos.set(start);
2408 return None;
2409 }
2410 let kind = match regex_syntax::ast::ClassAsciiKind::from_name(name) {
2411 Some(kind) => kind,
2412 None => {
2413 self.parser().pos.set(start);
2414 return None;
2415 }
2416 };
2417 Some(ClassAscii {
2418 span: Span::new(start, self.pos()),
2419 kind,
2420 negated,
2421 })
2422 }
2423
2424 #[inline(never)]
2425 fn parse_unicode_class(&self) -> Result<ClassUnicode> {
2426 assert!(self.char() == 'p' || self.char() == 'P');
2427
2428 let mut scratch = self.parser().scratch.borrow_mut();
2429 scratch.clear();
2430
2431 let negated = self.char() == 'P';
2432 if !self.bump_and_bump_space() {
2433 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2434 }
2435 let (start, kind) = if self.char() == '{' {
2436 let start = self.span_char().end;
2437 while self.bump_and_bump_space() && self.char() != '}' {
2438 scratch.push(self.char());
2439 }
2440 if self.is_eof() {
2441 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2442 }
2443 assert_eq!(self.char(), '}');
2444 self.bump();
2445
2446 let name = scratch.as_str();
2447 if let Some(i) = name.find("!=") {
2448 (
2449 start,
2450 ClassUnicodeKind::NamedValue {
2451 op: ClassUnicodeOpKind::NotEqual,
2452 name: name[..i].to_string(),
2453 value: name[i + 2..].to_string(),
2454 },
2455 )
2456 } else if let Some(i) = name.find(':') {
2457 (
2458 start,
2459 ClassUnicodeKind::NamedValue {
2460 op: ClassUnicodeOpKind::Colon,
2461 name: name[..i].to_string(),
2462 value: name[i + 1..].to_string(),
2463 },
2464 )
2465 } else if let Some(i) = name.find('=') {
2466 (
2467 start,
2468 ClassUnicodeKind::NamedValue {
2469 op: ClassUnicodeOpKind::Equal,
2470 name: name[..i].to_string(),
2471 value: name[i + 1..].to_string(),
2472 },
2473 )
2474 } else {
2475 (start, ClassUnicodeKind::Named(name.to_string()))
2476 }
2477 } else {
2478 let start = self.pos();
2479 let c = self.char();
2480 if c == '\\' {
2481 return Err(self.error(self.span_char(), ast::ErrorKind::UnicodeClassInvalid));
2482 }
2483 self.bump_and_bump_space();
2484 let kind = ClassUnicodeKind::OneLetter(c);
2485 (start, kind)
2486 };
2487 Ok(ClassUnicode {
2488 span: Span::new(start, self.pos()),
2489 negated,
2490 kind,
2491 })
2492 }
2493
2494 #[inline(never)]
2495 fn parse_perl_class(&self) -> ClassPerl {
2496 let c = self.char();
2497 let span = self.span_char();
2498 self.bump();
2499 let (negated, kind) = match c {
2500 'd' => (false, regex_syntax::ast::ClassPerlKind::Digit),
2501 'D' => (true, regex_syntax::ast::ClassPerlKind::Digit),
2502 's' => (false, regex_syntax::ast::ClassPerlKind::Space),
2503 'S' => (true, regex_syntax::ast::ClassPerlKind::Space),
2504 'w' => (false, regex_syntax::ast::ClassPerlKind::Word),
2505 'W' => (true, regex_syntax::ast::ClassPerlKind::Word),
2506 c => panic!("expected valid Perl class but got '{}'", c),
2507 };
2508 ClassPerl {
2509 span,
2510 kind,
2511 negated,
2512 }
2513 }
2514}
2515
2516pub fn parse_ast<'s>(
2517 tb: &mut TB<'s>,
2518 pattern: &'s str,
2519) -> std::result::Result<NodeId, ResharpError> {
2520 let mut p: ResharpParser<'s> = ResharpParser::new(pattern);
2521 let result = p.parse(tb);
2522 result
2523}