1#![warn(dead_code)]
6mod ast;
7use std::cell::{Cell, RefCell};
8
9use ast::{Ast, Concat, ErrorKind, GroupKind, LookaroundKind};
10use regex_syntax::{
11 ast::{
12 ClassAscii, ClassBracketed, ClassPerl, ClassSet, ClassSetBinaryOpKind, ClassSetItem,
13 ClassSetRange, ClassSetUnion, ClassUnicode, ClassUnicodeKind, ClassUnicodeOpKind,
14 HexLiteralKind, Literal, LiteralKind, Position, Span, SpecialLiteralKind,
15 },
16 hir::{
17 self,
18 translate::{Translator, TranslatorBuilder},
19 },
20 utf8::Utf8Sequences,
21};
22use resharp_algebra::NodeId;
23
/// Shorthand for the algebra builder that every translation routine in this
/// file receives as `tb`.
type TB<'s> = resharp_algebra::RegexBuilder;
25
/// Coarse classification of what sits next to a word-boundary assertion,
/// used to simplify `\b` when its neighbours are statically known.
#[derive(Clone, Copy, PartialEq)]
enum WordCharKind {
    /// The neighbour is classified as a word character.
    Word,
    /// The neighbour is classified as a non-word character.
    NonWord,
    /// A nullable repetition whose body is `Word`.
    MaybeWord,
    /// A nullable repetition whose body is `NonWord`.
    MaybeNonWord,
    /// Nothing can be said about the neighbour.
    Unknown,
    /// There is no neighbour: the assertion is at the edge of its concat.
    Edge,
}
35
/// Returns `true` when `b` is an ASCII letter, an ASCII digit or `_`.
fn is_word_byte(b: u8) -> bool {
    matches!(b, b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z' | b'_')
}
39
/// A single atomic syntax element, before it is turned into either an
/// [`Ast`] node or a character-class item.
#[derive(Clone, Debug, Eq, PartialEq)]
enum Primitive {
    /// A literal character.
    Literal(Literal),
    /// A zero-width assertion.
    Assertion(ast::Assertion),
    /// The `Ast::dot` primitive, carrying only its span.
    Dot(Span),
    /// The `Ast::top` primitive, carrying only its span.
    Top(Span),
    /// A Perl character class.
    Perl(ClassPerl),
    /// A Unicode character class.
    Unicode(ClassUnicode),
}
49
50impl Primitive {
51 fn span(&self) -> &Span {
52 match *self {
53 Primitive::Literal(ref x) => &x.span,
54 Primitive::Assertion(ref x) => &x.span,
55 Primitive::Dot(ref span) => span,
56 Primitive::Top(ref span) => span,
57 Primitive::Perl(ref x) => &x.span,
58 Primitive::Unicode(ref x) => &x.span,
59 }
60 }
61
62 fn into_ast(self) -> Ast {
63 match self {
64 Primitive::Literal(lit) => Ast::literal(lit),
65 Primitive::Assertion(assert) => Ast::assertion(assert),
66 Primitive::Dot(span) => Ast::dot(span),
67 Primitive::Top(span) => Ast::top(span),
68 Primitive::Perl(cls) => Ast::class_perl(cls),
69 Primitive::Unicode(cls) => Ast::class_unicode(cls),
70 }
71 }
72
73 fn into_class_set_item(self, p: &ResharpParser) -> Result<regex_syntax::ast::ClassSetItem> {
74 use self::Primitive::*;
75 use regex_syntax::ast::ClassSetItem;
76
77 match self {
78 Literal(lit) => Ok(ClassSetItem::Literal(lit)),
79 Perl(cls) => Ok(ClassSetItem::Perl(cls)),
80 Unicode(cls) => Ok(ClassSetItem::Unicode(cls)),
81 x => Err(p.error(*x.span(), ast::ErrorKind::ClassEscapeInvalid)),
82 }
83 }
84
85 fn into_class_literal(self, p: &ResharpParser) -> Result<Literal> {
86 use self::Primitive::*;
87
88 match self {
89 Literal(lit) => Ok(lit),
90 x => Err(p.error(*x.span(), ast::ErrorKind::ClassRangeLiteral)),
91 }
92 }
93}
94
/// A value that is one of two alternatives.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum Either<Left, Right> {
    /// The first alternative.
    Left(Left),
    /// The second alternative.
    Right(Right),
}
100
/// Error produced while parsing or translating a pattern.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct ResharpError {
    /// What went wrong.
    pub kind: ErrorKind,
    /// Owned copy of the pattern that was being parsed when the error arose.
    pattern: String,
    /// Location in the pattern the error refers to.
    pub span: Span,
}
111
112impl std::fmt::Display for ResharpError {
113 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
114 write!(f, "{:?}: {:?}", self.kind, self.span)
115 }
116}
117impl std::error::Error for ResharpError {}
118
/// Result alias used throughout this file; the error is always [`ResharpError`].
type Result<T> = core::result::Result<T, ResharpError>;
120
/// One entry of the parser's group stack (`stack_group`).
#[derive(Clone, Debug)]
enum GroupState {
    /// An opened, not yet closed group.
    Group {
        /// The concatenation that was being built before the group opened.
        concat: Concat,
        /// The group itself; its span end and inner AST are set when it closes.
        group: ast::Group,
        /// Value of the ignore-whitespace flag before the group opened,
        /// restored when the group is popped.
        ignore_whitespace: bool,
    },
    /// Branches of a pending `|` alternation collected so far.
    Alternation(ast::Alternation),
    /// Operands of a pending `&` intersection collected so far.
    Intersection(ast::Intersection),
}
139
/// One entry of the parser's character-class stack (`stack_class`).
#[derive(Clone, Debug)]
enum ClassState {
    /// An opened bracketed class.
    Open {
        /// The union of the enclosing class that was being built before this
        /// class opened.
        union: regex_syntax::ast::ClassSetUnion,
        /// The bracketed class being built; its span end and kind are set
        /// when it closes.
        set: regex_syntax::ast::ClassBracketed,
    },
    /// A pending binary set operation waiting for its right-hand side.
    Op {
        /// Which binary operation is pending.
        kind: regex_syntax::ast::ClassSetBinaryOpKind,
        /// The already-parsed left-hand side.
        lhs: regex_syntax::ast::ClassSet,
    },
}
160
/// Parser state for a single pattern.
///
/// Mutable parsing state lives in `Cell`/`RefCell` fields so the parsing
/// helpers can take `&self`.
pub struct ResharpParser<'s> {
    /// Cache of already-translated Perl classes, keyed by (negated, kind).
    perl_classes: Vec<(bool, regex_syntax::ast::ClassPerlKind, NodeId)>,
    /// Default translator used when no flag-specific translator is active.
    pub translator: regex_syntax::hir::translate::Translator,
    /// The pattern being parsed.
    pub pattern: &'s str,
    /// Current position (offset, line, column) in the pattern.
    pos: Cell<Position>,
    /// Index handed out to the most recent capture group.
    capture_index: Cell<u32>,
    /// NOTE(review): not read in the visible part of the file; presumably
    /// enables octal escapes — confirm.
    octal: bool,
    /// NOTE(review): not read in the visible part of the file; presumably
    /// allows `{,n}` repetitions — confirm.
    empty_min_range: bool,
    /// Whether whitespace and `#` comments in the pattern are skipped.
    ignore_whitespace: Cell<bool>,
    /// Comments collected while skipping whitespace.
    comments: RefCell<Vec<ast::Comment>>,
    /// Stack of open groups and pending alternations/intersections.
    stack_group: RefCell<Vec<GroupState>>,
    /// Stack of open character classes and pending set operations.
    stack_class: RefCell<Vec<ClassState>>,
    /// Capture names seen so far, kept sorted by name for duplicate detection.
    capture_names: RefCell<Vec<ast::CaptureName>>,
    /// NOTE(review): not used in the visible part of the file; presumably a
    /// reusable string buffer — confirm.
    scratch: RefCell<String>,
}
177
178fn specialize_err<T>(result: Result<T>, from: ast::ErrorKind, to: ast::ErrorKind) -> Result<T> {
179 if let Err(e) = result {
180 if e.kind == from {
181 Err(ResharpError {
182 kind: to,
183 pattern: e.pattern,
184 span: e.span,
185 })
186 } else {
187 Err(e)
188 }
189 } else {
190 result
191 }
192}
193
/// Returns `true` if `c` may appear in a capture-group name.
///
/// The first character must be `_` or alphabetic; later characters may also
/// be digits, `.`, `[` or `]`.
fn is_capture_char(c: char, first: bool) -> bool {
    match c {
        '_' => true,
        '.' | '[' | ']' => !first,
        _ if first => c.is_alphabetic(),
        _ => c.is_alphanumeric(),
    }
}
201
/// Returns `true` if `c` is one of the characters this syntax treats as a
/// meta character (including `&` and `~`).
pub fn is_meta_character(c: char) -> bool {
    matches!(
        c,
        '\\' | '.'
            | '+'
            | '*'
            | '?'
            | '('
            | ')'
            | '|'
            | '['
            | ']'
            | '{'
            | '}'
            | '^'
            | '$'
            | '#'
            | '&'
            | '-'
            | '~'
    )
}
209
210pub fn is_escapeable_character(c: char) -> bool {
211 if is_meta_character(c) {
213 return true;
214 }
215 if !c.is_ascii() {
218 return false;
219 }
220 match c {
225 '0'..='9' | 'A'..='Z' | 'a'..='z' => false,
226 '<' | '>' => false,
236 _ => true,
237 }
238}
239
/// Returns `true` if `c` is an ASCII hexadecimal digit (`0-9`, `a-f`, `A-F`).
fn is_hex(c: char) -> bool {
    c.is_ascii_hexdigit()
}
243
244impl<'s> ResharpParser<'s> {
245 fn default_translator_builder() -> TranslatorBuilder {
246 let mut trb = TranslatorBuilder::new();
247 trb.unicode(true);
248 trb.utf8(false);
249 trb
250 }
251
252 pub fn new(pattern: &'s str) -> Self {
253 let trb = Self::default_translator_builder();
254 Self {
255 translator: trb.build(),
256 pattern,
257 perl_classes: vec![],
258 pos: Cell::new(Position::new(0, 0, 0)),
259 capture_index: Cell::new(0),
260 octal: false,
261 empty_min_range: false,
262 ignore_whitespace: Cell::new(false),
263 comments: RefCell::new(vec![]),
264 stack_group: RefCell::new(vec![]),
265 stack_class: RefCell::new(vec![]),
266 capture_names: RefCell::new(vec![]),
267 scratch: RefCell::new(String::new()),
268 }
269 }
270
    /// Returns the parser itself; the other helpers go through this accessor
    /// to reach the parser's fields.
    fn parser(&'_ self) -> &'_ ResharpParser<'_> {
        self
    }
275
    /// Returns the full pattern string being parsed.
    fn pattern(&self) -> &str {
        self.pattern
    }
280
281 fn error(&self, span: Span, kind: ast::ErrorKind) -> ResharpError {
283 ResharpError {
284 kind,
285 pattern: self.pattern().to_string(),
286 span,
287 }
288 }
289
290 fn unsupported_error(&self, _: regex_syntax::hir::Error) -> ResharpError {
291 let emptyspan = Span::splat(self.pos());
292 let inner = self.error(emptyspan, ast::ErrorKind::UnsupportedResharpRegex);
293 inner
294 }
295
296 fn offset(&self) -> usize {
301 self.parser().pos.get().offset
302 }
303
304 fn line(&self) -> usize {
308 self.parser().pos.get().line
309 }
310
311 fn column(&self) -> usize {
315 self.parser().pos.get().column
316 }
317
318 fn next_capture_index(&self, span: Span) -> Result<u32> {
326 let current = self.parser().capture_index.get();
327 let i = current
328 .checked_add(1)
329 .ok_or_else(|| self.error(span, ast::ErrorKind::CaptureLimitExceeded))?;
330 self.parser().capture_index.set(i);
331 Ok(i)
332 }
333
334 fn add_capture_name(&self, cap: &ast::CaptureName) -> Result<()> {
335 let mut names = self.parser().capture_names.borrow_mut();
336 match names.binary_search_by_key(&cap.name.as_str(), |c| c.name.as_str()) {
337 Err(i) => {
338 names.insert(i, cap.clone());
339 Ok(())
340 }
341 Ok(i) => Err(self.error(
342 cap.span,
343 ast::ErrorKind::GroupNameDuplicate {
344 original: names[i].span,
345 },
346 )),
347 }
348 }
349
350 fn ignore_whitespace(&self) -> bool {
351 self.parser().ignore_whitespace.get()
352 }
353
354 fn char(&self) -> char {
355 self.char_at(self.offset())
356 }
357
358 fn char_at(&self, i: usize) -> char {
359 self.pattern()[i..]
360 .chars()
361 .next()
362 .unwrap_or_else(|| panic!("expected char at offset {}", i))
363 }
364
365 fn bump(&self) -> bool {
366 if self.is_eof() {
367 return false;
368 }
369 let Position {
370 mut offset,
371 mut line,
372 mut column,
373 } = self.pos();
374 if self.char() == '\n' {
375 line = line.checked_add(1).unwrap();
376 column = 1;
377 } else {
378 column = column.checked_add(1).unwrap();
379 }
380 offset += self.char().len_utf8();
381 self.parser().pos.set(Position {
382 offset,
383 line,
384 column,
385 });
386 self.pattern()[self.offset()..].chars().next().is_some()
387 }
388
389 fn bump_if(&self, prefix: &str) -> bool {
390 if self.pattern()[self.offset()..].starts_with(prefix) {
391 for _ in 0..prefix.chars().count() {
392 self.bump();
393 }
394 true
395 } else {
396 false
397 }
398 }
399
400 fn is_lookaround_prefix(&self) -> Option<(bool, bool)> {
401 if self.bump_if("?=") {
402 return Some((true, true));
403 }
404 if self.bump_if("?!") {
405 return Some((true, false));
406 }
407 if self.bump_if("?<=") {
408 return Some((false, true));
409 }
410 if self.bump_if("?<!") {
411 return Some((false, false));
412 }
413 return None;
414 }
416
417 fn bump_and_bump_space(&self) -> bool {
418 if !self.bump() {
419 return false;
420 }
421 self.bump_space();
422 !self.is_eof()
423 }
424
425 fn bump_space(&self) {
426 if !self.ignore_whitespace() {
427 return;
428 }
429 while !self.is_eof() {
430 if self.char().is_whitespace() {
431 self.bump();
432 } else if self.char() == '#' {
433 let start = self.pos();
434 let mut comment_text = String::new();
435 self.bump();
436 while !self.is_eof() {
437 let c = self.char();
438 self.bump();
439 if c == '\n' {
440 break;
441 }
442 comment_text.push(c);
443 }
444 let comment = ast::Comment {
445 span: Span::new(start, self.pos()),
446 comment: comment_text,
447 };
448 self.parser().comments.borrow_mut().push(comment);
449 } else {
450 break;
451 }
452 }
453 }
454
455 fn peek(&self) -> Option<char> {
459 if self.is_eof() {
460 return None;
461 }
462 self.pattern()[self.offset() + self.char().len_utf8()..]
463 .chars()
464 .next()
465 }
466
467 fn peek_space(&self) -> Option<char> {
470 if !self.ignore_whitespace() {
471 return self.peek();
472 }
473 if self.is_eof() {
474 return None;
475 }
476 let mut start = self.offset() + self.char().len_utf8();
477 let mut in_comment = false;
478 for (i, c) in self.pattern()[start..].char_indices() {
479 if c.is_whitespace() {
480 continue;
481 } else if !in_comment && c == '#' {
482 in_comment = true;
483 } else if in_comment && c == '\n' {
484 in_comment = false;
485 } else {
486 start += i;
487 break;
488 }
489 }
490 self.pattern()[start..].chars().next()
491 }
492
493 fn is_eof(&self) -> bool {
495 self.offset() == self.pattern().len()
496 }
497
498 fn pos(&self) -> Position {
501 self.parser().pos.get()
502 }
503
504 fn span(&self) -> Span {
507 Span::splat(self.pos())
508 }
509
510 fn span_char(&self) -> Span {
512 let mut next = Position {
513 offset: self.offset().checked_add(self.char().len_utf8()).unwrap(),
514 line: self.line(),
515 column: self.column().checked_add(1).unwrap(),
516 };
517 if self.char() == '\n' {
518 next.line += 1;
519 next.column = 1;
520 }
521 Span::new(self.pos(), next)
522 }
523
524 #[inline(never)]
534 fn push_alternate(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
535 assert_eq!(self.char(), '|');
536 concat.span.end = self.pos();
537 self.push_or_add_alternation(concat);
538 self.bump();
539 Ok(ast::Concat {
540 span: self.span(),
541 asts: vec![],
542 })
543 }
544
545 fn push_or_add_alternation(&self, concat: Concat) {
548 use self::GroupState::*;
549
550 let mut stack = self.parser().stack_group.borrow_mut();
551 if let Some(&mut Alternation(ref mut alts)) = stack.last_mut() {
552 alts.asts.push(concat.into_ast());
553 return;
554 }
555 stack.push(Alternation(ast::Alternation {
556 span: Span::new(concat.span.start, self.pos()),
557 asts: vec![concat.into_ast()],
558 }));
559 }
560
561 #[inline(never)]
562 fn push_intersect(&self, mut concat: Concat) -> Result<Concat> {
563 assert_eq!(self.char(), '&');
564 concat.span.end = self.pos();
565 self.push_or_add_intersect(concat);
566 self.bump();
567 Ok(Concat {
568 span: self.span(),
569 asts: vec![],
570 })
571 }
572
573 fn push_or_add_intersect(&self, concat: Concat) {
576 use self::GroupState::*;
577
578 let mut stack = self.parser().stack_group.borrow_mut();
579 if let Some(&mut Intersection(ref mut alts)) = stack.last_mut() {
580 alts.asts.push(concat.into_ast());
581 return;
582 }
583 stack.push(Intersection(ast::Intersection {
584 span: Span::new(concat.span.start, self.pos()),
585 asts: vec![concat.into_ast()],
586 }));
587 }
588
589 #[inline(never)]
603 fn push_group(&self, mut concat: Concat) -> Result<Concat> {
604 assert_eq!(self.char(), '(');
605 match self.parse_group()? {
606 Either::Left(set) => {
607 let ignore = set.flags.flag_state(ast::Flag::IgnoreWhitespace);
608 if let Some(v) = ignore {
609 self.parser().ignore_whitespace.set(v);
610 }
611
612 concat.asts.push(Ast::flags(set));
613 Ok(concat)
614 }
615 Either::Right(group) => {
616 let old_ignore_whitespace = self.ignore_whitespace();
617 let new_ignore_whitespace = group
618 .flags()
619 .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
620 .unwrap_or(old_ignore_whitespace);
621 self.parser()
622 .stack_group
623 .borrow_mut()
624 .push(GroupState::Group {
625 concat,
626 group,
627 ignore_whitespace: old_ignore_whitespace,
628 });
629 self.parser().ignore_whitespace.set(new_ignore_whitespace);
630 Ok(Concat {
631 span: self.span(),
632 asts: vec![],
633 })
634 }
635 }
636 }
637
638 #[inline(never)]
639 fn push_compl_group(&self, concat: Concat) -> Result<Concat> {
640 assert_eq!(self.char(), '~');
641 self.bump();
642 if self.is_eof() || self.char() != '(' {
643 return Err(self.error(self.span(), ast::ErrorKind::ComplementGroupExpected));
644 }
645 let open_span = self.span_char();
646 self.bump();
647 let group = ast::Group {
648 span: open_span,
649 kind: ast::GroupKind::Complement,
650 ast: Box::new(Ast::empty(self.span())),
651 };
652
653 let old_ignore_whitespace = self.ignore_whitespace();
654 let new_ignore_whitespace = group
655 .flags()
656 .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
657 .unwrap_or(old_ignore_whitespace);
658 self.parser()
659 .stack_group
660 .borrow_mut()
661 .push(GroupState::Group {
662 concat,
663 group,
664 ignore_whitespace: old_ignore_whitespace,
665 });
666 self.parser().ignore_whitespace.set(new_ignore_whitespace);
667 Ok(Concat {
668 span: self.span(),
669 asts: vec![],
670 })
671 }
672
673 #[inline(never)]
683 fn pop_group(&self, mut group_concat: Concat) -> Result<Concat> {
684 use self::GroupState::*;
685 assert_eq!(self.char(), ')');
686 let mut stack = self.parser().stack_group.borrow_mut();
687 let topstack = stack.pop();
688
689 let (mut prior_concat, mut group, ignore_whitespace, alt) = match topstack {
690 Some(Group {
691 concat,
692 group,
693 ignore_whitespace,
694 }) => (concat, group, ignore_whitespace, None),
695 Some(Alternation(alt)) => match stack.pop() {
696 Some(Group {
697 concat,
698 group,
699 ignore_whitespace,
700 }) => (
701 concat,
702 group,
703 ignore_whitespace,
704 Some(Either::Left::<ast::Alternation, ast::Intersection>(alt)),
705 ),
706 None | Some(Alternation(_)) | Some(Intersection(_)) => {
707 return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
708 }
709 },
710 Some(Intersection(int)) => match stack.pop() {
711 Some(Group {
712 concat,
713 group,
714 ignore_whitespace,
715 }) => (
716 concat,
717 group,
718 ignore_whitespace,
719 Some(Either::Right::<ast::Alternation, ast::Intersection>(int)),
720 ),
721 None | Some(Alternation(_)) | Some(Intersection(_)) => {
722 return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
723 }
724 },
725
726 None => {
727 return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
728 }
729 };
730 self.parser().ignore_whitespace.set(ignore_whitespace);
731 group_concat.span.end = self.pos();
732 self.bump();
733 group.span.end = self.pos();
734 match alt {
735 Some(Either::Left(mut alt)) => {
736 alt.span.end = group_concat.span.end;
737 alt.asts.push(group_concat.into_ast());
738 group.ast = Box::new(alt.into_ast());
739 }
740 Some(Either::Right(mut int)) => {
741 int.span.end = group_concat.span.end;
742 int.asts.push(group_concat.into_ast());
743 group.ast = Box::new(int.into_ast());
744 }
745 None => {
746 group.ast = Box::new(group_concat.into_ast());
747 }
748 }
749
750 if group.kind == GroupKind::Complement {
751 let complement = ast::Complement {
752 span: self.span(),
753 ast: group.ast,
754 };
755 prior_concat.asts.push(Ast::complement(complement));
756 }
757 else {
759 prior_concat.asts.push(Ast::group(group));
761 }
762 Ok(prior_concat)
763 }
764
765 #[inline(never)]
772 fn pop_group_end(&self, mut concat: ast::Concat) -> Result<Ast> {
773 concat.span.end = self.pos();
774 let mut stack = self.parser().stack_group.borrow_mut();
776 let ast = match stack.pop() {
777 None => Ok(concat.into_ast()),
778 Some(GroupState::Alternation(mut alt)) => {
779 alt.span.end = self.pos();
780 alt.asts.push(concat.into_ast());
781 Ok(Ast::alternation(alt))
782 }
783 Some(GroupState::Intersection(mut int)) => {
784 int.span.end = self.pos();
785 int.asts.push(concat.into_ast());
786
787 Ok(Ast::intersection(int))
789 }
790 Some(GroupState::Group { group, .. }) => {
791 return Err(self.error(group.span, ast::ErrorKind::GroupUnclosed));
792 }
793 };
794 match stack.pop() {
796 None => ast,
797 Some(GroupState::Alternation(_)) => {
798 unreachable!()
805 }
806 Some(GroupState::Intersection(_)) => {
807 unreachable!()
808 }
809 Some(GroupState::Group { group, .. }) => {
810 Err(self.error(group.span, ast::ErrorKind::GroupUnclosed))
811 }
812 }
813 }
814
815 #[inline(never)]
824 fn push_class_open(
825 &self,
826 parent_union: regex_syntax::ast::ClassSetUnion,
827 ) -> Result<regex_syntax::ast::ClassSetUnion> {
828 assert_eq!(self.char(), '[');
829
830 let (nested_set, nested_union) = self.parse_set_class_open()?;
831 self.parser()
832 .stack_class
833 .borrow_mut()
834 .push(ClassState::Open {
835 union: parent_union,
836 set: nested_set,
837 });
838 Ok(nested_union)
839 }
840
841 #[inline(never)]
856 fn pop_class(
857 &self,
858 nested_union: regex_syntax::ast::ClassSetUnion,
859 ) -> Result<Either<regex_syntax::ast::ClassSetUnion, regex_syntax::ast::ClassBracketed>> {
860 assert_eq!(self.char(), ']');
861
862 let item = regex_syntax::ast::ClassSet::Item(nested_union.into_item());
863 let prevset = self.pop_class_op(item);
864 let mut stack = self.parser().stack_class.borrow_mut();
865 match stack.pop() {
866 None => {
867 panic!("unexpected empty character class stack")
876 }
877 Some(ClassState::Op { .. }) => {
878 panic!("unexpected ClassState::Op")
885 }
886 Some(ClassState::Open { mut union, mut set }) => {
887 self.bump();
888 set.span.end = self.pos();
889 set.kind = prevset;
890 if stack.is_empty() {
891 Ok(Either::Right(set))
892 } else {
893 union.push(regex_syntax::ast::ClassSetItem::Bracketed(Box::new(set)));
894 Ok(Either::Left(union))
895 }
896 }
897 }
898 }
899
900 #[inline(never)]
905 fn unclosed_class_error(&self) -> ResharpError {
906 for state in self.parser().stack_class.borrow().iter().rev() {
907 if let ClassState::Open { ref set, .. } = *state {
908 return self.error(set.span, ast::ErrorKind::ClassUnclosed);
909 }
910 }
911 panic!("no open character class found")
914 }
915
916 #[inline(never)]
922 fn push_class_op(
923 &self,
924 next_kind: regex_syntax::ast::ClassSetBinaryOpKind,
925 next_union: regex_syntax::ast::ClassSetUnion,
926 ) -> regex_syntax::ast::ClassSetUnion {
927 let item = regex_syntax::ast::ClassSet::Item(next_union.into_item());
928 let new_lhs = self.pop_class_op(item);
929 self.parser().stack_class.borrow_mut().push(ClassState::Op {
930 kind: next_kind,
931 lhs: new_lhs,
932 });
933 regex_syntax::ast::ClassSetUnion {
934 span: self.span(),
935 items: vec![],
936 }
937 }
938
939 #[inline(never)]
945 fn pop_class_op(&self, rhs: regex_syntax::ast::ClassSet) -> regex_syntax::ast::ClassSet {
946 let mut stack = self.parser().stack_class.borrow_mut();
947 let (kind, lhs) = match stack.pop() {
948 Some(ClassState::Op { kind, lhs }) => (kind, lhs),
949 Some(state @ ClassState::Open { .. }) => {
950 stack.push(state);
951 return rhs;
952 }
953 None => unreachable!(),
954 };
955 let span = Span::new(lhs.span().start, rhs.span().end);
956 regex_syntax::ast::ClassSet::BinaryOp(regex_syntax::ast::ClassSetBinaryOp {
957 span,
958 kind,
959 lhs: Box::new(lhs),
960 rhs: Box::new(rhs),
961 })
962 }
963
964 fn hir_to_node_id(&self, hir: &hir::Hir, tb: &mut TB<'s>) -> Result<NodeId> {
965 match hir.kind() {
966 hir::HirKind::Empty => Ok(NodeId::EPS),
967 hir::HirKind::Literal(l) => {
968 if l.0.len() == 1 {
969 let node = tb.mk_u8(l.0[0]);
970 Ok(node)
971 } else {
972 let ws: Vec<_> = l.0.iter().map(|l| tb.mk_u8(*l)).collect();
973 let conc = tb.mk_concats(ws.iter().copied());
974 Ok(conc)
975 }
976 }
977 hir::HirKind::Class(class) => {
978 match class {
980 hir::Class::Unicode(class_unicode) => {
981 let ranges = class_unicode.ranges();
982 let mut s1 = NodeId::BOT;
983 let mut s2 = NodeId::BOT;
984 let mut s3 = NodeId::BOT;
985 let mut s4 = NodeId::BOT;
986 for range in ranges {
987 for seq in Utf8Sequences::new(range.start(), range.end()) {
988 if seq.len() > 2 {
990 continue;
991 }
992
993 let v: Vec<_> = seq
994 .as_slice()
995 .iter()
996 .map(|s|
997 (s.start, s.end))
999 .collect();
1000 if v.len() == 1 {
1001 let (start, end) = v[0];
1002 let node = tb.mk_range_u8(start, end);
1003 s1 = tb.mk_union(s1, node);
1004 } else if v.len() == 2 {
1005 let node1 = tb.mk_range_u8(v[0].0, v[0].1);
1006 let node2 = tb.mk_range_u8(v[1].0, v[1].1);
1007 let conc = tb.mk_concat(node1, node2);
1008 s2 = tb.mk_union(s2, conc);
1009 } else if v.len() == 3 {
1010 let node1 = tb.mk_range_u8(v[0].0, v[0].1);
1011 let node2 = tb.mk_range_u8(v[1].0, v[1].1);
1012 let node3 = tb.mk_range_u8(v[2].0, v[2].1);
1013 let conc2 = tb.mk_concat(node2, node3);
1014 let conc1 = tb.mk_concat(node1, conc2);
1015 s3 = tb.mk_union(s3, conc1);
1016 } else {
1017 let mut conc = NodeId::EPS;
1018 for i in (0..4).rev() {
1019 let node = tb.mk_range_u8(v[i].0, v[i].1);
1020 conc = tb.mk_concat(conc, node);
1021 }
1022 s4 = tb.mk_union(s4, conc);
1023 }
1024 }
1025 }
1026 let merged = tb.mk_union(s2, s1);
1027 let merged = tb.mk_union(s3, merged);
1028 let merged = tb.mk_union(s4, merged);
1029 Ok(merged)
1030 }
1031 hir::Class::Bytes(class_bytes) => {
1032 let ranges = class_bytes.ranges();
1033 let mut result = NodeId::BOT;
1034 for range in ranges {
1035 let start = range.start();
1036 let end = range.end();
1037 let node = tb.mk_range_u8(start, end);
1038 result = tb.mk_union(result, node);
1039 }
1040 Ok(result)
1041 }
1042 }
1043 }
1044 hir::HirKind::Look(_) => todo!(),
1045 hir::HirKind::Repetition(_) => todo!(),
1046 hir::HirKind::Capture(_) => todo!(),
1047 hir::HirKind::Concat(body) => {
1048 let mut result = NodeId::EPS;
1049 for child in body {
1050 let node = self.hir_to_node_id(child, tb)?;
1051 result = tb.mk_concat(result, node);
1052 }
1053 Ok(result)
1054 }
1055 hir::HirKind::Alternation(_) => todo!(),
1056 }
1057 }
1058
1059 fn translate_ast_to_hir(
1060 &mut self,
1061 orig_ast: ®ex_syntax::ast::Ast,
1062 tb: &mut TB<'s>,
1063 ) -> Result<NodeId> {
1064 match self.translator.translate("", orig_ast) {
1067 Err(_) => return Err(self.error(self.span(), ast::ErrorKind::UnicodeClassInvalid)),
1068 Ok(hir) => {
1069 let mapped = self.hir_to_node_id(&hir, tb);
1070 mapped
1071 }
1072 }
1073 }
1074
1075 fn translator_to_node_id(
1076 &mut self,
1077 orig_ast: ®ex_syntax::ast::Ast,
1078 translator: &mut Option<Translator>,
1079 tb: &mut TB<'s>,
1080 ) -> Result<NodeId> {
1081 match translator {
1082 Some(tr) => {
1083 let hir = tr
1084 .translate("", &orig_ast)
1085 .map_err(|e| self.unsupported_error(e))?;
1086 self.hir_to_node_id(&hir, tb)
1087 }
1088 None => self.translate_ast_to_hir(&orig_ast, tb),
1089 }
1090 }
1091
1092 fn get_class(
1093 &mut self,
1094 negated: bool,
1095 kind: regex_syntax::ast::ClassPerlKind,
1096 tb: &mut TB<'s>,
1097 ) -> Result<NodeId> {
1098 let w = self
1100 .perl_classes
1101 .iter()
1102 .find(|(c_neg, c_kind, _)| *c_kind == kind && *c_neg == negated);
1103 match w {
1104 Some((_, _, value)) => return Ok(*value),
1105 None => {
1106 let tmp = regex_syntax::ast::ClassPerl {
1107 span: Span::splat(Position::new(0, 0, 0)),
1108 negated,
1109 kind: kind.clone(),
1110 };
1111 let word_ast = regex_syntax::ast::Ast::class_perl(tmp);
1112 let translated = self.translate_ast_to_hir(&word_ast, tb)?;
1113 self.perl_classes.push((negated, kind, translated));
1114 Ok(translated)
1115 }
1116 }
1117 }
1118
1119 fn word_char_kind(ast: &Ast, left: bool) -> WordCharKind {
1120 use WordCharKind::*;
1121 match ast {
1122 Ast::Literal(lit) => {
1123 if is_word_byte(lit.c as u8) {
1124 Word
1125 } else {
1126 NonWord
1127 }
1128 }
1129 Ast::ClassPerl(c) => match (&c.kind, c.negated) {
1130 (®ex_syntax::ast::ClassPerlKind::Word, false) => Word,
1131 (®ex_syntax::ast::ClassPerlKind::Word, true) => NonWord,
1132 (®ex_syntax::ast::ClassPerlKind::Space, false) => NonWord,
1133 (®ex_syntax::ast::ClassPerlKind::Digit, false) => Word,
1134 _ => Unknown,
1135 },
1136 Ast::Dot(_) | Ast::Top(_) => Unknown,
1137 Ast::Group(g) => Self::word_char_kind(&g.ast, left),
1138 Ast::Concat(c) if !c.asts.is_empty() => {
1139 Self::word_char_kind(&c.asts[if left { c.asts.len() - 1 } else { 0 }], left)
1140 }
1141 Ast::Repetition(r) => {
1142 let inner = Self::word_char_kind(&r.ast, left);
1143 let nullable = matches!(
1144 &r.op.kind,
1145 ast::RepetitionKind::ZeroOrMore
1146 | ast::RepetitionKind::ZeroOrOne
1147 | ast::RepetitionKind::Range(ast::RepetitionRange::Bounded(0, _))
1148 );
1149 if nullable {
1150 match inner {
1151 Word => MaybeWord,
1152 NonWord => MaybeNonWord,
1153 _ => Unknown,
1154 }
1155 } else {
1156 inner
1157 }
1158 }
1159 Ast::Lookaround(la) => Self::word_char_kind(&la.ast, left),
1160 _ => Unknown,
1161 }
1162 }
1163
1164 fn concat_neighbor_kind(asts: &[Ast], idx: usize, dir: isize) -> WordCharKind {
1165 use WordCharKind::*;
1166 let next = idx as isize + dir;
1167 if next < 0 || next >= asts.len() as isize {
1168 return Edge;
1169 }
1170 let kind = Self::word_char_kind(&asts[next as usize], dir < 0);
1171 match kind {
1172 MaybeWord => match Self::concat_neighbor_kind(asts, next as usize, dir) {
1173 Word => Word,
1174 _ => Unknown,
1175 },
1176 MaybeNonWord => match Self::concat_neighbor_kind(asts, next as usize, dir) {
1177 NonWord => NonWord,
1178 _ => Unknown,
1179 },
1180 other => other,
1181 }
1182 }
1183
1184 fn rewrite_word_boundary_in_concat(
1185 &mut self,
1186 asts: &[Ast],
1187 idx: usize,
1188 translator: &mut Option<Translator>,
1189 tb: &mut TB<'s>,
1190 ) -> Result<(NodeId, usize)> {
1191 use WordCharKind::*;
1192 let left = Self::concat_neighbor_kind(asts, idx, -1);
1193 let right = Self::concat_neighbor_kind(asts, idx, 1);
1194 let word_id = self.get_class(false, regex_syntax::ast::ClassPerlKind::Word, tb)?;
1195 let not_word_id = self.get_class(true, regex_syntax::ast::ClassPerlKind::Word, tb)?;
1196
1197 match (left, right) {
1198 (NonWord, Word) | (Word, NonWord) => Ok((NodeId::EPS, idx + 1)),
1199 (Word, _) => {
1200 let set = tb.mk_union(NodeId::END, not_word_id);
1201 let tail = tb.mk_concat(set, NodeId::TS);
1202 self.merge_boundary_with_following_lookaheads(asts, idx, tail, translator, tb)
1203 }
1204 (NonWord, _) => {
1205 let set = tb.mk_union(NodeId::END, word_id);
1206 let tail = tb.mk_concat(set, NodeId::TS);
1207 self.merge_boundary_with_following_lookaheads(asts, idx, tail, translator, tb)
1208 }
1209 (_, Word) => {
1210 let body = tb.mk_union(NodeId::BEGIN, not_word_id);
1211 Ok((tb.mk_lookbehind(body, NodeId::MISSING), idx + 1))
1212 }
1213 (_, NonWord) => {
1214 let body = tb.mk_union(NodeId::BEGIN, word_id);
1215 Ok((tb.mk_lookbehind(body, NodeId::MISSING), idx + 1))
1216 }
1217 _ => Ok((self.make_full_word_boundary(tb)?, idx + 1)),
1218 }
1219 }
1220
1221 fn merge_boundary_with_following_lookaheads(
1222 &mut self,
1223 asts: &[Ast],
1224 wb_idx: usize,
1225 boundary_tail: NodeId,
1226 translator: &mut Option<Translator>,
1227 tb: &mut TB<'s>,
1228 ) -> Result<(NodeId, usize)> {
1229 let mut next = wb_idx + 1;
1230 let mut la_bodies = vec![boundary_tail];
1231 while next < asts.len() {
1232 match &asts[next] {
1233 Ast::Lookaround(la) if la.kind == ast::LookaroundKind::PositiveLookahead => {
1234 let body = self.ast_to_node_id(&la.ast, translator, tb)?;
1235 la_bodies.push(tb.mk_concat(body, NodeId::TS));
1236 next += 1;
1237 }
1238 _ => break,
1239 }
1240 }
1241 let merged = tb.mk_inters(la_bodies.into_iter());
1242 Ok((tb.mk_lookahead(merged, NodeId::MISSING, 0), next))
1243 }
1244
1245 fn make_full_word_boundary(&mut self, tb: &mut TB<'s>) -> Result<NodeId> {
1246 let w = self.get_class(false, regex_syntax::ast::ClassPerlKind::Word, tb)?;
1247 let nw = self.get_class(true, regex_syntax::ast::ClassPerlKind::Word, tb)?;
1248 let lb_w = tb.mk_lookbehind(w, NodeId::MISSING);
1249 let la_nw = tb.mk_lookahead(nw, NodeId::MISSING, 0);
1250 let c1 = tb.mk_concat(lb_w, la_nw);
1251 let lb_nw = tb.mk_lookbehind(nw, NodeId::MISSING);
1252 let la_w = tb.mk_lookahead(w, NodeId::MISSING, 0);
1253 let c2 = tb.mk_concat(lb_nw, la_w);
1254 Ok(tb.mk_union(c1, c2))
1255 }
1256
1257 fn ast_to_node_id(
1258 &mut self,
1259 ast: &Ast,
1260 translator: &mut Option<Translator>,
1261 tb: &mut TB<'s>,
1262 ) -> Result<NodeId> {
1263 match ast {
1264 Ast::Empty(_) => Ok(NodeId::EPS),
1265 Ast::Flags(f) => {
1266 let mut translator_builder = Self::default_translator_builder();
1267 if let Some(state) = f.flags.flag_state(ast::Flag::CaseInsensitive) {
1268 translator_builder.case_insensitive(state);
1269 }
1270 if let Some(state) = f.flags.flag_state(ast::Flag::Unicode) {
1271 translator_builder.unicode(state);
1272 }
1273 let concat_translator = Some(translator_builder.build());
1274 *translator = concat_translator;
1275 Ok(NodeId::EPS)
1276 }
1277 Ast::Literal(l) => {
1278 let ast_lit = regex_syntax::ast::Ast::literal(*l.to_owned());
1279 self.translator_to_node_id(&ast_lit, translator, tb)
1280 }
1281 Ast::Top(_) => Ok(NodeId::TOP),
1282 Ast::Dot(_) => {
1283 let hirv = hir::Hir::dot(hir::Dot::AnyByteExceptLF);
1284 self.hir_to_node_id(&hirv, tb)
1285 }
1286 Ast::Assertion(a) => match &a.kind {
1287 ast::AssertionKind::StartText => Ok(NodeId::BEGIN),
1288 ast::AssertionKind::EndText => Ok(NodeId::END),
1289 ast::AssertionKind::WordBoundary => {
1290 let word_id =
1291 self.get_class(false, regex_syntax::ast::ClassPerlKind::Word, tb)?;
1292 let not_word_id =
1293 self.get_class(true, regex_syntax::ast::ClassPerlKind::Word, tb)?;
1294 let case1_1 = tb.mk_lookbehind(word_id, NodeId::MISSING);
1296 let case1_2 = tb.mk_lookahead(not_word_id, NodeId::MISSING, 0);
1297 let case1 = tb.mk_concat(case1_1, case1_2);
1298 let case2_1 = tb.mk_lookbehind(not_word_id, NodeId::MISSING);
1300 let case2_2 = tb.mk_lookahead(word_id, NodeId::MISSING, 0);
1301 let case2 = tb.mk_concat(case2_1, case2_2);
1302 Ok(tb.mk_union(case1, case2))
1303 }
1304 ast::AssertionKind::NotWordBoundary => {
1305 return Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))
1306 }
1307 ast::AssertionKind::StartLine => {
1308 let left = NodeId::BEGIN;
1309 let right = tb.mk_u8('\n' as u8);
1310 let union = tb.mk_union(left, right);
1311 Ok(tb.mk_lookbehind(union, NodeId::MISSING))
1312 }
1313 ast::AssertionKind::EndLine => {
1314 let left = NodeId::END;
1315 let right = tb.mk_u8('\n' as u8);
1316 let union = tb.mk_union(left, right);
1317 Ok(tb.mk_lookahead(union, NodeId::MISSING, 0))
1318 }
1319 ast::AssertionKind::WordBoundaryStart => todo!(),
1320 ast::AssertionKind::WordBoundaryEnd => todo!(),
1321 ast::AssertionKind::WordBoundaryStartAngle => todo!(),
1322 ast::AssertionKind::WordBoundaryEndAngle => Ok(tb.mk_string(">")),
1323 ast::AssertionKind::WordBoundaryStartHalf => todo!(),
1324 ast::AssertionKind::WordBoundaryEndHalf => todo!(),
1325 },
1326 Ast::ClassUnicode(c) => {
1327 let tmp = regex_syntax::ast::ClassUnicode {
1328 span: c.span,
1329 negated: c.negated,
1330 kind: c.kind.clone(),
1331 };
1332 if !c.negated {
1333 match &c.kind {
1334 regex_syntax::ast::ClassUnicodeKind::Named(s) => match s.as_str() {
1335 "ascii" => return Ok(tb.mk_range_u8(0, 127)),
1337 "utf8" => {
1339 let ascii = tb.mk_range_u8(0, 127);
1340 let beta = tb.mk_range_u8(128, 0xBF);
1341 let c0 = tb.mk_range_u8(0xC0, 0xDF);
1342 let c0s = tb.mk_concats([c0, beta].into_iter());
1343 let e0 = tb.mk_range_u8(0xE0, 0xEF);
1344 let e0s = tb.mk_concats([e0, beta, beta].into_iter());
1345 let f0 = tb.mk_range_u8(0xF0, 0xF7);
1346 let f0s = tb.mk_concats([f0, beta, beta, beta].into_iter());
1347 let merged = tb.mk_unions([ascii, c0s, e0s, f0s].into_iter());
1348 return Ok(tb.mk_star(merged));
1349 }
1350 "hex" => {
1351 let nums = tb.mk_range_u8(b'0', b'9');
1352 let lets = tb.mk_range_u8(b'a', b'f');
1353 let lets2 = tb.mk_range_u8(b'A', b'F');
1354 let merged = tb.mk_unions([nums, lets, lets2].into_iter());
1355 return Ok(merged);
1356 }
1357 _ => {}
1358 },
1359 _ => {}
1360 };
1361 }
1362
1363 let orig_ast = regex_syntax::ast::Ast::class_unicode(tmp);
1364 self.translator_to_node_id(&orig_ast, translator, tb)
1365 }
1366 Ast::ClassPerl(c) => {
1367 let tmp = regex_syntax::ast::ClassPerl {
1368 span: c.span,
1369 negated: c.negated,
1370 kind: c.kind.clone(),
1371 };
1372 let orig_ast = regex_syntax::ast::Ast::class_perl(tmp);
1373 self.translator_to_node_id(&orig_ast, translator, tb)
1374 }
1375 Ast::ClassBracketed(c) => match &c.kind {
1376 regex_syntax::ast::ClassSet::Item(_) => {
1377 let tmp = regex_syntax::ast::ClassBracketed {
1378 span: c.span,
1379 negated: c.negated,
1380 kind: c.kind.clone(),
1381 };
1382 let orig_ast = regex_syntax::ast::Ast::class_bracketed(tmp);
1383 self.translator_to_node_id(&orig_ast, translator, tb)
1384 }
1385 regex_syntax::ast::ClassSet::BinaryOp(_) => todo!(),
1386 },
1387 Ast::Repetition(r) => {
1388 let body = self.ast_to_node_id(&r.ast, translator, tb);
1389 match body {
1390 Ok(body) => match &r.op.kind {
1391 ast::RepetitionKind::ZeroOrOne => Ok(tb.mk_opt(body)),
1392 ast::RepetitionKind::ZeroOrMore => Ok(tb.mk_star(body)),
1393 ast::RepetitionKind::OneOrMore => Ok(tb.mk_plus(body)),
1394 ast::RepetitionKind::Range(r) => match r {
1395 ast::RepetitionRange::Exactly(n) => Ok(tb.mk_repeat(body, *n, *n)),
1396 ast::RepetitionRange::AtLeast(n) => {
1397 let rep = tb.mk_repeat(body, *n, *n);
1398 let st = tb.mk_star(body);
1399 Ok(tb.mk_concat(rep, st))
1400 }
1401
1402 ast::RepetitionRange::Bounded(n, m) => Ok(tb.mk_repeat(body, *n, *m)),
1403 },
1404 },
1405 Err(_) => body,
1406 }
1407 }
1408 Ast::Lookaround(g) => {
1409 let body = self.ast_to_node_id(&g.ast, translator, tb)?;
1410 match g.kind {
1411 ast::LookaroundKind::PositiveLookahead => {
1412 Ok(tb.mk_lookahead(body, NodeId::MISSING, 0))
1413 }
1414 ast::LookaroundKind::PositiveLookbehind => {
1415 Ok(tb.mk_lookbehind(body, NodeId::MISSING))
1416 }
1417 ast::LookaroundKind::NegativeLookahead => Ok(tb.mk_neg_lookahead(body, 0)),
1418 ast::LookaroundKind::NegativeLookbehind => Ok(tb.mk_neg_lookbehind(body)),
1419 }
1420 }
1421 Ast::Group(g) => {
1422 let child = self.ast_to_node_id(&g.ast, translator, tb);
1423 child
1424 }
1425 Ast::Alternation(a) => {
1426 let mut children = vec![];
1427 for ast in &a.asts {
1428 match self.ast_to_node_id(ast, translator, tb) {
1429 Ok(node_id) => children.push(node_id),
1430 Err(err) => return Err(err),
1431 }
1432 }
1433 Ok(tb.mk_unions(children.iter().copied()))
1434 }
1435 Ast::Concat(c) => {
1436 let mut concat_translator: Option<Translator> = None;
1437 let mut children = vec![];
1438 let mut i = 0;
1439 while i < c.asts.len() {
1440 let ast = &c.asts[i];
1441 match ast {
1442 Ast::Flags(f) => {
1443 let mut translator_builder = Self::default_translator_builder();
1444 translator_builder.utf8(false);
1445 if let Some(state) = f.flags.flag_state(ast::Flag::CaseInsensitive) {
1446 translator_builder.case_insensitive(state);
1447 }
1448 if let Some(state) = f.flags.flag_state(ast::Flag::Unicode) {
1449 translator_builder.unicode(state);
1450 }
1451 concat_translator = Some(translator_builder.build());
1452 i += 1;
1453 continue;
1454 }
1455 Ast::Assertion(a) if a.kind == ast::AssertionKind::WordBoundary => {
1456 let node =
1457 self.rewrite_word_boundary_in_concat(&c.asts, i, translator, tb)?;
1458 children.push(node.0);
1459 i = node.1; continue;
1461 }
1462 _ => {}
1463 }
1464 match concat_translator {
1465 Some(_) => match self.ast_to_node_id(ast, &mut concat_translator, tb) {
1466 Ok(node_id) => children.push(node_id),
1467 Err(err) => return Err(err),
1468 },
1469 None => match self.ast_to_node_id(ast, translator, tb) {
1470 Ok(node_id) => children.push(node_id),
1471 Err(err) => return Err(err),
1472 },
1473 }
1474 i += 1;
1475 }
1476 Ok(tb.mk_concats(children.iter().cloned()))
1477 }
1478 Ast::Intersection(intersection) => {
1479 let mut children = vec![];
1480 for ast in &intersection.asts {
1481 match self.ast_to_node_id(ast, translator, tb) {
1482 Ok(node_id) => children.push(node_id),
1483 Err(err) => return Err(err),
1484 }
1485 }
1486 Ok(tb.mk_inters(children.into_iter()))
1487 }
1488 Ast::Complement(complement) => {
1489 let body = self.ast_to_node_id(&complement.ast, translator, tb);
1490 body.map(|x| tb.mk_compl(x))
1491 }
1492 }
1493 }
1494
1495 fn parse(&mut self, tb: &mut TB<'s>) -> Result<NodeId> {
1498 let mut concat = Concat {
1499 span: self.span(),
1500 asts: vec![],
1501 };
1502 loop {
1503 self.bump_space();
1504 if self.is_eof() {
1505 break;
1506 }
1507 match self.char() {
1508 '(' => concat = self.push_group(concat)?,
1509 ')' => concat = self.pop_group(concat)?,
1510 '|' => concat = self.push_alternate(concat)?,
1511 '&' => concat = self.push_intersect(concat)?,
1512 '~' => concat = self.push_compl_group(concat)?,
1513 '[' => {
1514 let class = self.parse_set_class()?;
1515 concat.asts.push(Ast::class_bracketed(class));
1516 }
1517 '?' => {
1518 concat =
1519 self.parse_uncounted_repetition(concat, ast::RepetitionKind::ZeroOrOne)?;
1520 }
1521 '*' => {
1522 concat =
1523 self.parse_uncounted_repetition(concat, ast::RepetitionKind::ZeroOrMore)?;
1524 }
1525 '+' => {
1526 concat =
1527 self.parse_uncounted_repetition(concat, ast::RepetitionKind::OneOrMore)?;
1528 }
1529 '{' => {
1530 concat = self.parse_counted_repetition(concat)?;
1531 }
1532 _ => concat.asts.push(self.parse_primitive()?.into_ast()),
1533 }
1534 }
1535 let ast = self.pop_group_end(concat)?;
1536 let node_id = self.ast_to_node_id(&ast, &mut None, tb);
1538 node_id
1539 }
1540
1541 #[inline(never)]
1542 fn parse_uncounted_repetition(
1543 &self,
1544 mut concat: ast::Concat,
1545 kind: ast::RepetitionKind,
1546 ) -> Result<ast::Concat> {
1547 let op_start = self.pos();
1549 let ast = match concat.asts.pop() {
1550 Some(ast) => ast,
1551 None => return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing)),
1552 };
1553 match ast {
1554 Ast::Empty(_) | Ast::Flags(_) => {
1555 return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing))
1556 }
1557 _ => {}
1558 }
1559 if self.bump() && self.char() == '?' {
1560 return Err(self.error(
1561 Span::new(op_start, self.pos()),
1562 ast::ErrorKind::UnsupportedLazyQuantifier,
1563 ));
1564 }
1565 concat.asts.push(Ast::repetition(ast::Repetition {
1566 span: ast.span().with_end(self.pos()),
1567 op: ast::RepetitionOp {
1568 span: Span::new(op_start, self.pos()),
1569 kind,
1570 },
1571 greedy: true,
1572 ast: Box::new(ast),
1573 }));
1574 Ok(concat)
1575 }
1576
1577 #[inline(never)]
1578 fn parse_counted_repetition(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
1579 assert!(self.char() == '{');
1580 let start = self.pos();
1581 let ast = match concat.asts.pop() {
1582 Some(ast) => ast,
1583 None => return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing)),
1584 };
1585 match ast {
1586 Ast::Empty(_) | Ast::Flags(_) => {
1587 return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing))
1588 }
1589 _ => {}
1590 }
1591 if !self.bump_and_bump_space() {
1592 return Err(self.error(
1593 Span::new(start, self.pos()),
1594 ast::ErrorKind::RepetitionCountUnclosed,
1595 ));
1596 }
1597 let count_start = specialize_err(
1598 self.parse_decimal(),
1599 ast::ErrorKind::DecimalEmpty,
1600 ast::ErrorKind::RepetitionCountDecimalEmpty,
1601 );
1602 if self.is_eof() {
1603 return Err(self.error(
1604 Span::new(start, self.pos()),
1605 ast::ErrorKind::RepetitionCountUnclosed,
1606 ));
1607 }
1608 let range = if self.char() == ',' {
1609 if !self.bump_and_bump_space() {
1610 return Err(self.error(
1611 Span::new(start, self.pos()),
1612 ast::ErrorKind::RepetitionCountUnclosed,
1613 ));
1614 }
1615 if self.char() != '}' {
1616 let count_start = match count_start {
1617 Ok(c) => c,
1618 Err(err) if err.kind == ast::ErrorKind::RepetitionCountDecimalEmpty => {
1619 if self.parser().empty_min_range {
1620 0
1621 } else {
1622 return Err(err);
1623 }
1624 }
1625 err => err?,
1626 };
1627 let count_end = specialize_err(
1628 self.parse_decimal(),
1629 ast::ErrorKind::DecimalEmpty,
1630 ast::ErrorKind::RepetitionCountDecimalEmpty,
1631 )?;
1632 ast::RepetitionRange::Bounded(count_start, count_end)
1633 } else {
1634 ast::RepetitionRange::AtLeast(count_start?)
1635 }
1636 } else {
1637 ast::RepetitionRange::Exactly(count_start?)
1638 };
1639
1640 if self.is_eof() || self.char() != '}' {
1641 return Err(self.error(
1642 Span::new(start, self.pos()),
1643 ast::ErrorKind::RepetitionCountUnclosed,
1644 ));
1645 }
1646
1647 if self.bump_and_bump_space() && self.char() == '?' {
1648 return Err(self.error(
1649 Span::new(start, self.pos()),
1650 ast::ErrorKind::UnsupportedLazyQuantifier,
1651 ));
1652 }
1653
1654 let op_span = Span::new(start, self.pos());
1655 if !range.is_valid() {
1656 return Err(self.error(op_span, ast::ErrorKind::RepetitionCountInvalid));
1657 }
1658 concat.asts.push(Ast::repetition(ast::Repetition {
1659 span: ast.span().with_end(self.pos()),
1660 op: ast::RepetitionOp {
1661 span: op_span,
1662 kind: ast::RepetitionKind::Range(range),
1663 },
1664 greedy: true,
1665 ast: Box::new(ast),
1666 }));
1667 Ok(concat)
1668 }
1669
1670 #[inline(never)]
1671 fn parse_group(&self) -> Result<Either<ast::SetFlags, ast::Group>> {
1672 assert_eq!(self.char(), '(');
1673 let open_span = self.span_char();
1674 self.bump();
1675 self.bump_space();
1676 if let Some((ahead, pos)) = self.is_lookaround_prefix() {
1677 let kind = match (pos, ahead) {
1678 (true, true) => LookaroundKind::PositiveLookahead,
1679 (true, false) => LookaroundKind::PositiveLookbehind,
1680 (false, true) => LookaroundKind::NegativeLookahead,
1681 (false, false) => LookaroundKind::NegativeLookbehind,
1682 };
1683 return Ok(Either::Right(ast::Group {
1684 span: open_span,
1685 kind: ast::GroupKind::Lookaround(kind),
1686 ast: Box::new(Ast::empty(self.span())),
1687 }));
1688 }
1689 let inner_span = self.span();
1690 let mut starts_with_p = true;
1691 if self.bump_if("?P<") || {
1692 starts_with_p = false;
1693 self.bump_if("?<")
1694 } {
1695 let capture_index = self.next_capture_index(open_span)?;
1696 let name = self.parse_capture_name(capture_index)?;
1697 Ok(Either::Right(ast::Group {
1698 span: open_span,
1699 kind: ast::GroupKind::CaptureName {
1700 starts_with_p,
1701 name,
1702 },
1703 ast: Box::new(Ast::empty(self.span())),
1704 }))
1705 } else if self.bump_if("?") {
1706 if self.is_eof() {
1707 return Err(self.error(open_span, ast::ErrorKind::GroupUnclosed));
1708 }
1709 let flags = self.parse_flags()?;
1710 let char_end = self.char();
1711 self.bump();
1712 if char_end == ')' {
1713 if flags.items.is_empty() {
1716 return Err(self.error(inner_span, ast::ErrorKind::RepetitionMissing));
1717 }
1718 Ok(Either::Left(ast::SetFlags {
1719 span: Span {
1720 end: self.pos(),
1721 ..open_span
1722 },
1723 flags,
1724 }))
1725 } else {
1726 assert_eq!(char_end, ':');
1727 Ok(Either::Right(ast::Group {
1728 span: open_span,
1729 kind: ast::GroupKind::NonCapturing(flags),
1730 ast: Box::new(Ast::empty(self.span())),
1731 }))
1732 }
1733 } else {
1734 let capture_index = self.next_capture_index(open_span)?;
1735 Ok(Either::Right(ast::Group {
1736 span: open_span,
1737 kind: ast::GroupKind::CaptureIndex(capture_index),
1738 ast: Box::new(Ast::empty(self.span())),
1739 }))
1740 }
1741 }
1742
1743 #[inline(never)]
1744 fn parse_capture_name(&self, capture_index: u32) -> Result<ast::CaptureName> {
1745 if self.is_eof() {
1746 return Err(self.error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
1747 }
1748 let start = self.pos();
1749 loop {
1750 if self.char() == '>' {
1751 break;
1752 }
1753 if !is_capture_char(self.char(), self.pos() == start) {
1754 return Err(self.error(self.span_char(), ast::ErrorKind::GroupNameInvalid));
1755 }
1756 if !self.bump() {
1757 break;
1758 }
1759 }
1760 let end = self.pos();
1761 if self.is_eof() {
1762 return Err(self.error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
1763 }
1764 assert_eq!(self.char(), '>');
1765 self.bump();
1766 let name = &self.pattern()[start.offset..end.offset];
1767 if name.is_empty() {
1768 return Err(self.error(Span::new(start, start), ast::ErrorKind::GroupNameEmpty));
1769 }
1770 let capname = ast::CaptureName {
1771 span: Span::new(start, end),
1772 name: name.to_string(),
1773 index: capture_index,
1774 };
1775 self.add_capture_name(&capname)?;
1776 Ok(capname)
1777 }
1778
1779 #[inline(never)]
1780 fn parse_flags(&self) -> Result<ast::Flags> {
1781 let mut flags = ast::Flags {
1782 span: self.span(),
1783 items: vec![],
1784 };
1785 let mut last_was_negation = None;
1786 while self.char() != ':' && self.char() != ')' {
1787 if self.char() == '-' {
1788 last_was_negation = Some(self.span_char());
1789 let item = ast::FlagsItem {
1790 span: self.span_char(),
1791 kind: ast::FlagsItemKind::Negation,
1792 };
1793 if let Some(i) = flags.add_item(item) {
1794 return Err(self.error(
1795 self.span_char(),
1796 ast::ErrorKind::FlagRepeatedNegation {
1797 original: flags.items[i].span,
1798 },
1799 ));
1800 }
1801 } else {
1802 last_was_negation = None;
1803 let item = ast::FlagsItem {
1804 span: self.span_char(),
1805 kind: ast::FlagsItemKind::Flag(self.parse_flag()?),
1806 };
1807 if let Some(i) = flags.add_item(item) {
1808 return Err(self.error(
1809 self.span_char(),
1810 ast::ErrorKind::FlagDuplicate {
1811 original: flags.items[i].span,
1812 },
1813 ));
1814 }
1815 }
1816 if !self.bump() {
1817 return Err(self.error(self.span(), ast::ErrorKind::FlagUnexpectedEof));
1818 }
1819 }
1820 if let Some(span) = last_was_negation {
1821 return Err(self.error(span, ast::ErrorKind::FlagDanglingNegation));
1822 }
1823 flags.span.end = self.pos();
1824 Ok(flags)
1825 }
1826
1827 #[inline(never)]
1828 fn parse_flag(&self) -> Result<ast::Flag> {
1829 match self.char() {
1830 'i' => Ok(ast::Flag::CaseInsensitive),
1831 'm' => Ok(ast::Flag::MultiLine),
1832 's' => Ok(ast::Flag::DotMatchesNewLine),
1833 'U' => Ok(ast::Flag::SwapGreed),
1834 'u' => Ok(ast::Flag::Unicode),
1835 'R' => Ok(ast::Flag::CRLF),
1836 'x' => Ok(ast::Flag::IgnoreWhitespace),
1837 _ => Err(self.error(self.span_char(), ast::ErrorKind::FlagUnrecognized)),
1838 }
1839 }
1840
1841 fn parse_primitive(&self) -> Result<Primitive> {
1842 match self.char() {
1843 '\\' => self.parse_escape(),
1844 '_' => {
1845 let ast = Primitive::Top(self.span_char());
1846 self.bump();
1847 Ok(ast)
1848 }
1849 '.' => {
1850 let ast = Primitive::Dot(self.span_char());
1851 self.bump();
1852 Ok(ast)
1853 }
1854 '^' => {
1855 let ast = Primitive::Assertion(ast::Assertion {
1856 span: self.span_char(),
1857 kind: ast::AssertionKind::StartLine,
1858 });
1859 self.bump();
1860 Ok(ast)
1861 }
1862 '$' => {
1863 let ast = Primitive::Assertion(ast::Assertion {
1864 span: self.span_char(),
1865 kind: ast::AssertionKind::EndLine,
1866 });
1867 self.bump();
1868 Ok(ast)
1869 }
1870 c => {
1871 let ast = Primitive::Literal(Literal {
1872 span: self.span_char(),
1873 kind: LiteralKind::Verbatim,
1874 c,
1875 });
1876 self.bump();
1877 Ok(ast)
1878 }
1879 }
1880 }
1881
1882 #[inline(never)]
1883 fn parse_escape(&self) -> Result<Primitive> {
1884 assert_eq!(self.char(), '\\');
1885 let start = self.pos();
1886 if !self.bump() {
1887 return Err(self.error(
1888 Span::new(start, self.pos()),
1889 ast::ErrorKind::EscapeUnexpectedEof,
1890 ));
1891 }
1892 let c = self.char();
1893 match c {
1895 '0'..='9' => {
1896 if !self.parser().octal {
1897 return Err(self.error(
1898 Span::new(start, self.span_char().end),
1899 ast::ErrorKind::UnsupportedBackreference,
1900 ));
1901 }
1902 let mut lit = self.parse_octal();
1903 lit.span.start = start;
1904 return Ok(Primitive::Literal(lit));
1905 }
1906 'x' | 'u' | 'U' => {
1913 let mut lit = self.parse_hex()?;
1914 lit.span.start = start;
1915 return Ok(Primitive::Literal(lit));
1916 }
1917 'p' | 'P' => {
1918 let mut cls = self.parse_unicode_class()?;
1919 cls.span.start = start;
1920 return Ok(Primitive::Unicode(cls));
1921 }
1922 'd' | 's' | 'w' | 'D' | 'S' | 'W' => {
1923 let mut cls = self.parse_perl_class();
1924 cls.span.start = start;
1925 return Ok(Primitive::Perl(cls));
1926 }
1927 _ => {}
1928 }
1929
1930 self.bump();
1932 let span = Span::new(start, self.pos());
1933 if is_meta_character(c) {
1934 return Ok(Primitive::Literal(Literal {
1935 span,
1936 kind: LiteralKind::Meta,
1937 c,
1938 }));
1939 }
1940 if is_escapeable_character(c) {
1941 return Ok(Primitive::Literal(Literal {
1942 span,
1943 kind: LiteralKind::Superfluous,
1944 c,
1945 }));
1946 }
1947 let special = |kind, c| {
1948 Ok(Primitive::Literal(Literal {
1949 span,
1950 kind: LiteralKind::Special(kind),
1951 c,
1952 }))
1953 };
1954 match c {
1955 'a' => special(SpecialLiteralKind::Bell, '\x07'),
1956 'f' => special(SpecialLiteralKind::FormFeed, '\x0C'),
1957 't' => special(SpecialLiteralKind::Tab, '\t'),
1958 'n' => special(SpecialLiteralKind::LineFeed, '\n'),
1959 'r' => special(SpecialLiteralKind::CarriageReturn, '\r'),
1960 'v' => special(SpecialLiteralKind::VerticalTab, '\x0B'),
1961 'A' => Ok(Primitive::Assertion(ast::Assertion {
1962 span,
1963 kind: ast::AssertionKind::StartText,
1964 })),
1965 'z' => Ok(Primitive::Assertion(ast::Assertion {
1966 span,
1967 kind: ast::AssertionKind::EndText,
1968 })),
1969 'b' => {
1970 let mut wb = ast::Assertion {
1971 span,
1972 kind: ast::AssertionKind::WordBoundary,
1973 };
1974 if !self.is_eof() && self.char() == '{' {
1977 if let Some(kind) = self.maybe_parse_special_word_boundary(start)? {
1978 wb.kind = kind;
1979 wb.span.end = self.pos();
1980 }
1981 }
1982 Ok(Primitive::Assertion(wb))
1983 }
1984 'B' => Ok(Primitive::Assertion(ast::Assertion {
1985 span,
1986 kind: ast::AssertionKind::NotWordBoundary,
1987 })),
1988 '<' => Ok(Primitive::Assertion(ast::Assertion {
1989 span,
1990 kind: ast::AssertionKind::WordBoundaryStartAngle,
1991 })),
1992 '>' => Ok(Primitive::Assertion(ast::Assertion {
1993 span,
1994 kind: ast::AssertionKind::WordBoundaryEndAngle,
1995 })),
1996 _ => Err(self.error(span, ast::ErrorKind::EscapeUnrecognized)),
1997 }
1998 }
1999
2000 fn maybe_parse_special_word_boundary(
2001 &self,
2002 wb_start: Position,
2003 ) -> Result<Option<ast::AssertionKind>> {
2004 assert_eq!(self.char(), '{');
2005
2006 let is_valid_char = |c| match c {
2007 'A'..='Z' | 'a'..='z' | '-' => true,
2008 _ => false,
2009 };
2010 let start = self.pos();
2011 if !self.bump_and_bump_space() {
2012 return Err(self.error(
2013 Span::new(wb_start, self.pos()),
2014 ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof,
2015 ));
2016 }
2017 let start_contents = self.pos();
2018 if !is_valid_char(self.char()) {
2023 self.parser().pos.set(start);
2024 return Ok(None);
2025 }
2026
2027 let mut scratch = self.parser().scratch.borrow_mut();
2029 scratch.clear();
2030 while !self.is_eof() && is_valid_char(self.char()) {
2031 scratch.push(self.char());
2032 self.bump_and_bump_space();
2033 }
2034 if self.is_eof() || self.char() != '}' {
2035 return Err(self.error(
2036 Span::new(start, self.pos()),
2037 ast::ErrorKind::SpecialWordBoundaryUnclosed,
2038 ));
2039 }
2040 let end = self.pos();
2041 self.bump();
2042 let kind = match scratch.as_str() {
2043 "start" => ast::AssertionKind::WordBoundaryStart,
2044 "end" => ast::AssertionKind::WordBoundaryEnd,
2045 "start-half" => ast::AssertionKind::WordBoundaryStartHalf,
2046 "end-half" => ast::AssertionKind::WordBoundaryEndHalf,
2047 _ => {
2048 return Err(self.error(
2049 Span::new(start_contents, end),
2050 ast::ErrorKind::SpecialWordBoundaryUnrecognized,
2051 ))
2052 }
2053 };
2054 Ok(Some(kind))
2055 }
2056
2057 #[inline(never)]
2058 fn parse_octal(&self) -> Literal {
2059 assert!(self.parser().octal);
2060 assert!('0' <= self.char() && self.char() <= '7');
2061 let start = self.pos();
2062 while self.bump()
2064 && '0' <= self.char()
2065 && self.char() <= '7'
2066 && self.pos().offset - start.offset <= 2
2067 {}
2068 let end = self.pos();
2069 let octal = &self.pattern()[start.offset..end.offset];
2070 let codepoint = u32::from_str_radix(octal, 8).expect("valid octal number");
2073 let c = char::from_u32(codepoint).expect("Unicode scalar value");
2076 Literal {
2077 span: Span::new(start, end),
2078 kind: LiteralKind::Octal,
2079 c,
2080 }
2081 }
2082
2083 #[inline(never)]
2084 fn parse_hex(&self) -> Result<Literal> {
2085 assert!(self.char() == 'x' || self.char() == 'u' || self.char() == 'U');
2086
2087 let hex_kind = match self.char() {
2088 'x' => HexLiteralKind::X,
2089 'u' => HexLiteralKind::UnicodeShort,
2090 _ => HexLiteralKind::UnicodeLong,
2091 };
2092 if !self.bump_and_bump_space() {
2093 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2094 }
2095 if self.char() == '{' {
2096 self.parse_hex_brace(hex_kind)
2097 } else {
2098 self.parse_hex_digits(hex_kind)
2099 }
2100 }
2101
2102 #[inline(never)]
2103 fn parse_hex_digits(&self, kind: HexLiteralKind) -> Result<Literal> {
2104 let mut scratch = self.parser().scratch.borrow_mut();
2105 scratch.clear();
2106
2107 let start = self.pos();
2108 for i in 0..kind.digits() {
2109 if i > 0 && !self.bump_and_bump_space() {
2110 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2111 }
2112 if !is_hex(self.char()) {
2113 return Err(self.error(self.span_char(), ast::ErrorKind::EscapeHexInvalidDigit));
2114 }
2115 scratch.push(self.char());
2116 }
2117 self.bump_and_bump_space();
2120 let end = self.pos();
2121 let hex = scratch.as_str();
2122 match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
2123 None => Err(self.error(Span::new(start, end), ast::ErrorKind::EscapeHexInvalid)),
2124 Some(c) => Ok(Literal {
2125 span: Span::new(start, end),
2126 kind: LiteralKind::HexFixed(kind),
2127 c,
2128 }),
2129 }
2130 }
2131
2132 #[inline(never)]
2133 fn parse_hex_brace(&self, kind: HexLiteralKind) -> Result<Literal> {
2134 let mut scratch = self.parser().scratch.borrow_mut();
2135 scratch.clear();
2136
2137 let brace_pos = self.pos();
2138 let start = self.span_char().end;
2139 while self.bump_and_bump_space() && self.char() != '}' {
2140 if !is_hex(self.char()) {
2141 return Err(self.error(self.span_char(), ast::ErrorKind::EscapeHexInvalidDigit));
2142 }
2143 scratch.push(self.char());
2144 }
2145 if self.is_eof() {
2146 return Err(self.error(
2147 Span::new(brace_pos, self.pos()),
2148 ast::ErrorKind::EscapeUnexpectedEof,
2149 ));
2150 }
2151 let end = self.pos();
2152 let hex = scratch.as_str();
2153 assert_eq!(self.char(), '}');
2154 self.bump_and_bump_space();
2155
2156 if hex.is_empty() {
2157 return Err(self.error(
2158 Span::new(brace_pos, self.pos()),
2159 ast::ErrorKind::EscapeHexEmpty,
2160 ));
2161 }
2162 match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
2163 None => Err(self.error(Span::new(start, end), ast::ErrorKind::EscapeHexInvalid)),
2164 Some(c) => Ok(Literal {
2165 span: Span::new(start, self.pos()),
2166 kind: LiteralKind::HexBrace(kind),
2167 c,
2168 }),
2169 }
2170 }
2171
2172 fn parse_decimal(&self) -> Result<u32> {
2173 let mut scratch = self.parser().scratch.borrow_mut();
2174 scratch.clear();
2175
2176 while !self.is_eof() && self.char().is_whitespace() {
2177 self.bump();
2178 }
2179 let start = self.pos();
2180 while !self.is_eof() && '0' <= self.char() && self.char() <= '9' {
2181 scratch.push(self.char());
2182 self.bump_and_bump_space();
2183 }
2184 let span = Span::new(start, self.pos());
2185 while !self.is_eof() && self.char().is_whitespace() {
2186 self.bump_and_bump_space();
2187 }
2188 let digits = scratch.as_str();
2189 if digits.is_empty() {
2190 return Err(self.error(span, ast::ErrorKind::DecimalEmpty));
2191 }
2192 match u32::from_str_radix(digits, 10).ok() {
2193 Some(n) => Ok(n),
2194 None => Err(self.error(span, ast::ErrorKind::DecimalInvalid)),
2195 }
2196 }
2197
2198 #[inline(never)]
2199 fn parse_set_class(&self) -> Result<ClassBracketed> {
2200 assert_eq!(self.char(), '[');
2201
2202 let mut union = ClassSetUnion {
2203 span: self.span(),
2204 items: vec![],
2205 };
2206 loop {
2207 self.bump_space();
2208 if self.is_eof() {
2209 return Err(self.unclosed_class_error());
2210 }
2211 match self.char() {
2212 '[' => {
2213 if !self.parser().stack_class.borrow().is_empty() {
2218 if let Some(cls) = self.maybe_parse_ascii_class() {
2219 union.push(ClassSetItem::Ascii(cls));
2220 continue;
2221 }
2222 }
2223 union = self.push_class_open(union)?;
2224 }
2225 ']' => match self.pop_class(union)? {
2226 Either::Left(nested_union) => {
2227 union = nested_union;
2228 }
2229 Either::Right(class) => return Ok(class),
2230 },
2231 '&' if self.peek() == Some('&') => {
2232 assert!(self.bump_if("&&"));
2233 union = self.push_class_op(ClassSetBinaryOpKind::Intersection, union);
2234 }
2235 '-' if self.peek() == Some('-') => {
2236 assert!(self.bump_if("--"));
2237 union = self.push_class_op(ClassSetBinaryOpKind::Difference, union);
2238 }
2239 '~' if self.peek() == Some('~') => {
2240 assert!(self.bump_if("~~"));
2241 union = self.push_class_op(ClassSetBinaryOpKind::SymmetricDifference, union);
2242 }
2243 _ => {
2244 union.push(self.parse_set_class_range()?);
2245 }
2246 }
2247 }
2248 }
2249
2250 #[inline(never)]
2251 fn parse_set_class_range(&self) -> Result<ClassSetItem> {
2252 let prim1 = self.parse_set_class_item()?;
2253 self.bump_space();
2254 if self.is_eof() {
2255 return Err(self.unclosed_class_error());
2256 }
2257 if self.char() != '-' || self.peek_space() == Some(']') || self.peek_space() == Some('-') {
2258 return prim1.into_class_set_item(self);
2259 }
2260 if !self.bump_and_bump_space() {
2261 return Err(self.unclosed_class_error());
2262 }
2263 let prim2 = self.parse_set_class_item()?;
2264 let range = ClassSetRange {
2265 span: Span::new(prim1.span().start, prim2.span().end),
2266 start: prim1.into_class_literal(self)?,
2267 end: prim2.into_class_literal(self)?,
2268 };
2269 if !range.is_valid() {
2270 return Err(self.error(range.span, ast::ErrorKind::ClassRangeInvalid));
2271 }
2272 Ok(ClassSetItem::Range(range))
2273 }
2274
2275 #[inline(never)]
2276 fn parse_set_class_item(&self) -> Result<Primitive> {
2277 if self.char() == '\\' {
2278 self.parse_escape()
2279 } else {
2280 let x = Primitive::Literal(Literal {
2281 span: self.span_char(),
2282 kind: LiteralKind::Verbatim,
2283 c: self.char(),
2284 });
2285 self.bump();
2286 Ok(x)
2287 }
2288 }
2289
2290 #[inline(never)]
2291 fn parse_set_class_open(&self) -> Result<(ClassBracketed, ClassSetUnion)> {
2292 assert_eq!(self.char(), '[');
2293 let start = self.pos();
2294 if !self.bump_and_bump_space() {
2295 return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2296 }
2297
2298 let negated = if self.char() != '^' {
2299 false
2300 } else {
2301 if !self.bump_and_bump_space() {
2302 return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2303 }
2304 true
2305 };
2306 let mut union = ClassSetUnion {
2308 span: self.span(),
2309 items: vec![],
2310 };
2311 while self.char() == '-' {
2312 union.push(ClassSetItem::Literal(Literal {
2313 span: self.span_char(),
2314 kind: LiteralKind::Verbatim,
2315 c: '-',
2316 }));
2317 if !self.bump_and_bump_space() {
2318 return Err(self.error(Span::new(start, start), ast::ErrorKind::ClassUnclosed));
2319 }
2320 }
2321 if union.items.is_empty() && self.char() == ']' {
2324 union.push(ClassSetItem::Literal(Literal {
2325 span: self.span_char(),
2326 kind: LiteralKind::Verbatim,
2327 c: ']',
2328 }));
2329 if !self.bump_and_bump_space() {
2330 return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2331 }
2332 }
2333 let set = ClassBracketed {
2334 span: Span::new(start, self.pos()),
2335 negated,
2336 kind: ClassSet::union(ClassSetUnion {
2337 span: Span::new(union.span.start, union.span.start),
2338 items: vec![],
2339 }),
2340 };
2341 Ok((set, union))
2342 }
2343
2344 #[inline(never)]
2345 fn maybe_parse_ascii_class(&self) -> Option<ClassAscii> {
2346 assert_eq!(self.char(), '[');
2347 let start = self.pos();
2349 let mut negated = false;
2350 if !self.bump() || self.char() != ':' {
2351 self.parser().pos.set(start);
2352 return None;
2353 }
2354 if !self.bump() {
2355 self.parser().pos.set(start);
2356 return None;
2357 }
2358 if self.char() == '^' {
2359 negated = true;
2360 if !self.bump() {
2361 self.parser().pos.set(start);
2362 return None;
2363 }
2364 }
2365 let name_start = self.offset();
2366 while self.char() != ':' && self.bump() {}
2367 if self.is_eof() {
2368 self.parser().pos.set(start);
2369 return None;
2370 }
2371 let name = &self.pattern()[name_start..self.offset()];
2372 if !self.bump_if(":]") {
2373 self.parser().pos.set(start);
2374 return None;
2375 }
2376 let kind = match regex_syntax::ast::ClassAsciiKind::from_name(name) {
2377 Some(kind) => kind,
2378 None => {
2379 self.parser().pos.set(start);
2380 return None;
2381 }
2382 };
2383 Some(ClassAscii {
2384 span: Span::new(start, self.pos()),
2385 kind,
2386 negated,
2387 })
2388 }
2389
2390 #[inline(never)]
2391 fn parse_unicode_class(&self) -> Result<ClassUnicode> {
2392 assert!(self.char() == 'p' || self.char() == 'P');
2393
2394 let mut scratch = self.parser().scratch.borrow_mut();
2395 scratch.clear();
2396
2397 let negated = self.char() == 'P';
2398 if !self.bump_and_bump_space() {
2399 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2400 }
2401 let (start, kind) = if self.char() == '{' {
2402 let start = self.span_char().end;
2403 while self.bump_and_bump_space() && self.char() != '}' {
2404 scratch.push(self.char());
2405 }
2406 if self.is_eof() {
2407 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2408 }
2409 assert_eq!(self.char(), '}');
2410 self.bump();
2411
2412 let name = scratch.as_str();
2413 if let Some(i) = name.find("!=") {
2414 (
2415 start,
2416 ClassUnicodeKind::NamedValue {
2417 op: ClassUnicodeOpKind::NotEqual,
2418 name: name[..i].to_string(),
2419 value: name[i + 2..].to_string(),
2420 },
2421 )
2422 } else if let Some(i) = name.find(':') {
2423 (
2424 start,
2425 ClassUnicodeKind::NamedValue {
2426 op: ClassUnicodeOpKind::Colon,
2427 name: name[..i].to_string(),
2428 value: name[i + 1..].to_string(),
2429 },
2430 )
2431 } else if let Some(i) = name.find('=') {
2432 (
2433 start,
2434 ClassUnicodeKind::NamedValue {
2435 op: ClassUnicodeOpKind::Equal,
2436 name: name[..i].to_string(),
2437 value: name[i + 1..].to_string(),
2438 },
2439 )
2440 } else {
2441 (start, ClassUnicodeKind::Named(name.to_string()))
2442 }
2443 } else {
2444 let start = self.pos();
2445 let c = self.char();
2446 if c == '\\' {
2447 return Err(self.error(self.span_char(), ast::ErrorKind::UnicodeClassInvalid));
2448 }
2449 self.bump_and_bump_space();
2450 let kind = ClassUnicodeKind::OneLetter(c);
2451 (start, kind)
2452 };
2453 Ok(ClassUnicode {
2454 span: Span::new(start, self.pos()),
2455 negated,
2456 kind,
2457 })
2458 }
2459
2460 #[inline(never)]
2461 fn parse_perl_class(&self) -> ClassPerl {
2462 let c = self.char();
2463 let span = self.span_char();
2464 self.bump();
2465 let (negated, kind) = match c {
2466 'd' => (false, regex_syntax::ast::ClassPerlKind::Digit),
2467 'D' => (true, regex_syntax::ast::ClassPerlKind::Digit),
2468 's' => (false, regex_syntax::ast::ClassPerlKind::Space),
2469 'S' => (true, regex_syntax::ast::ClassPerlKind::Space),
2470 'w' => (false, regex_syntax::ast::ClassPerlKind::Word),
2471 'W' => (true, regex_syntax::ast::ClassPerlKind::Word),
2472 c => panic!("expected valid Perl class but got '{}'", c),
2473 };
2474 ClassPerl {
2475 span,
2476 kind,
2477 negated,
2478 }
2479 }
2480}
2481
2482pub fn parse_ast<'s>(
2483 tb: &mut TB<'s>,
2484 pattern: &'s str,
2485) -> std::result::Result<NodeId, ResharpError> {
2486 let mut p: ResharpParser<'s> = ResharpParser::new(pattern);
2487 let result = p.parse(tb);
2488 result
2489}