1#![warn(dead_code)]
6pub mod ast;
7use std::cell::{Cell, RefCell};
8
9use ast::{Ast, Concat, ErrorKind, GroupKind, LookaroundKind};
10use regex_syntax::{
11 ast::{
12 ClassAscii, ClassBracketed, ClassPerl, ClassSet, ClassSetBinaryOpKind, ClassSetItem,
13 ClassSetRange, ClassSetUnion, ClassUnicode, ClassUnicodeKind, ClassUnicodeOpKind,
14 HexLiteralKind, Literal, LiteralKind, Position, Span, SpecialLiteralKind,
15 },
16 hir::{
17 self,
18 translate::{Translator, TranslatorBuilder},
19 },
20 utf8::Utf8Sequences,
21};
22use resharp_algebra::{Kind, NodeId};
23
24type TB<'s> = resharp_algebra::RegexBuilder;
25
26pub struct PatternFlags {
28 pub unicode: bool,
30 pub full_unicode: bool,
32 pub case_insensitive: bool,
34 pub dot_matches_new_line: bool,
36 pub multiline: bool,
38 pub ignore_whitespace: bool,
40 pub ascii_perl_classes: bool,
43 pub expanded_ast_limit: u64,
46 pub max_list_len: usize,
49 pub max_repeat: u32,
51}
52
53pub const DEFAULT_MAX_REPEAT: u32 = 500;
56pub const DEFAULT_EXPANDED_AST_LIMIT: u64 = 50_000;
57pub const DEFAULT_MAX_LIST_LEN: usize = 4_000;
58
59impl Default for PatternFlags {
60 fn default() -> Self {
61 Self {
62 unicode: true,
63 full_unicode: false,
64 case_insensitive: false,
65 dot_matches_new_line: false,
66 multiline: true,
67 ignore_whitespace: false,
68 ascii_perl_classes: false,
69 expanded_ast_limit: DEFAULT_EXPANDED_AST_LIMIT,
70 max_list_len: DEFAULT_MAX_LIST_LEN,
71 max_repeat: DEFAULT_MAX_REPEAT,
72 }
73 }
74}
75
76#[derive(Clone, Copy, PartialEq, Debug)]
77enum WordCharKind {
78 Word,
79 NonWord,
80 MaybeWord,
81 MaybeNonWord,
82 Unknown,
83 Edge,
84}
85
86fn is_word_byte(b: u8) -> bool {
87 b.is_ascii_alphanumeric() || b == b'_'
88}
89
90fn class_set_item_word_kind(item: ®ex_syntax::ast::ClassSetItem) -> WordCharKind {
91 use regex_syntax::ast::{ClassSetItem, ClassPerlKind};
92 use WordCharKind::*;
93 match item {
94 ClassSetItem::Empty(_) => Unknown,
95 ClassSetItem::Literal(l) => {
96 if is_word_byte(l.c as u8) { Word } else { NonWord }
97 }
98 ClassSetItem::Range(r) => {
99 let all_word = (r.start.c as u8..=r.end.c as u8).all(is_word_byte);
100 let all_non = (r.start.c as u8..=r.end.c as u8).all(|b| !is_word_byte(b));
101 if all_word { Word } else if all_non { NonWord } else { Unknown }
102 }
103 ClassSetItem::Perl(p) => match (&p.kind, p.negated) {
104 (ClassPerlKind::Word, false) => Word,
105 (ClassPerlKind::Word, true) => NonWord,
106 (ClassPerlKind::Space, false) => NonWord,
107 _ => Unknown,
108 },
109 ClassSetItem::Bracketed(b) => class_bracketed_word_kind(b),
110 ClassSetItem::Union(u) => {
111 let mut kind = Unknown;
112 for item in &u.items {
113 let k = class_set_item_word_kind(item);
114 kind = match (kind, k) {
115 (Unknown, _) => k,
116 (Word, Word) => Word,
117 (NonWord, NonWord) => NonWord,
118 _ => return Unknown,
119 };
120 }
121 kind
122 }
123 _ => Unknown,
124 }
125}
126
127fn class_bracketed_word_kind(c: ®ex_syntax::ast::ClassBracketed) -> WordCharKind {
128 use WordCharKind::*;
129 let inner = match &c.kind {
130 regex_syntax::ast::ClassSet::Item(item) => class_set_item_word_kind(item),
131 regex_syntax::ast::ClassSet::BinaryOp(_) => Unknown,
132 };
133 if c.negated {
134 match inner {
135 Word => NonWord,
136 NonWord => Word,
137 _ => Unknown,
138 }
139 } else {
140 inner
141 }
142}
143
144#[derive(Clone, Debug, Eq, PartialEq)]
145enum Primitive {
146 Literal(Literal),
147 Assertion(ast::Assertion),
148 Dot(Span),
149 Top(Span),
150 Perl(ClassPerl),
151 Unicode(ClassUnicode),
152}
153
154impl Primitive {
155 fn span(&self) -> &Span {
156 match *self {
157 Primitive::Literal(ref x) => &x.span,
158 Primitive::Assertion(ref x) => &x.span,
159 Primitive::Dot(ref span) => span,
160 Primitive::Top(ref span) => span,
161 Primitive::Perl(ref x) => &x.span,
162 Primitive::Unicode(ref x) => &x.span,
163 }
164 }
165
166 fn into_ast(self) -> Ast {
167 match self {
168 Primitive::Literal(lit) => Ast::literal(lit),
169 Primitive::Assertion(assert) => Ast::assertion(assert),
170 Primitive::Dot(span) => Ast::dot(span),
171 Primitive::Top(span) => Ast::top(span),
172 Primitive::Perl(cls) => Ast::class_perl(cls),
173 Primitive::Unicode(cls) => Ast::class_unicode(cls),
174 }
175 }
176
177 fn into_class_set_item(self, p: &ResharpParser) -> Result<regex_syntax::ast::ClassSetItem> {
178 use self::Primitive::*;
179 use regex_syntax::ast::ClassSetItem;
180
181 match self {
182 Literal(lit) => Ok(ClassSetItem::Literal(lit)),
183 Perl(cls) => Ok(ClassSetItem::Perl(cls)),
184 Unicode(cls) => Ok(ClassSetItem::Unicode(cls)),
185 x => Err(p.error(*x.span(), ast::ErrorKind::ClassEscapeInvalid)),
186 }
187 }
188
189 fn into_class_literal(self, p: &ResharpParser) -> Result<Literal> {
190 use self::Primitive::*;
191
192 match self {
193 Literal(lit) => Ok(lit),
194 x => Err(p.error(*x.span(), ast::ErrorKind::ClassRangeLiteral)),
195 }
196 }
197}
198
199#[derive(Clone, Debug, Eq, PartialEq)]
200pub enum Either<Left, Right> {
201 Left(Left),
202 Right(Right),
203}
204
205#[derive(Clone, Debug, Eq, PartialEq)]
206pub struct ParseError {
207 pub kind: ErrorKind,
209 pattern: String,
212 pub span: Span,
214}
215
216impl std::fmt::Display for ParseError {
217 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
218 write!(f, "{:?}: {:?}", self.kind, self.span)
219 }
220}
221impl std::error::Error for ParseError {}
222
223type Result<T> = core::result::Result<T, ParseError>;
224
225#[derive(Clone, Debug)]
226enum GroupState {
227 Group {
229 concat: Concat,
231 group: ast::Group,
233 ignore_whitespace: bool,
235 },
236 Alternation(ast::Alternation),
237 Intersection(ast::Intersection),
238}
239
240#[derive(Clone, Debug)]
241enum ClassState {
242 Open {
244 union: regex_syntax::ast::ClassSetUnion,
246 set: regex_syntax::ast::ClassBracketed,
247 },
248 Op {
251 kind: regex_syntax::ast::ClassSetBinaryOpKind,
253 lhs: regex_syntax::ast::ClassSet,
255 },
256}
257
258pub struct ResharpParser<'s> {
260 perl_classes: Vec<(bool, regex_syntax::ast::ClassPerlKind, NodeId)>,
261 unicode_classes: resharp_algebra::UnicodeClassCache,
262 pub translator: regex_syntax::hir::translate::Translator,
263 pub pattern: &'s str,
264 pos: Cell<Position>,
265 capture_index: Cell<u32>,
266 octal: bool,
267 empty_min_range: bool,
268 ignore_whitespace: Cell<bool>,
269 dot_all: Cell<bool>,
270 multiline: Cell<bool>,
271 global_unicode: bool,
272 global_full_unicode: bool,
273 global_ascii_perl: bool,
274 global_case_insensitive: bool,
275 expanded_ast_limit: u64,
276 max_list_len: usize,
277 max_repeat: u32,
278 comments: RefCell<Vec<ast::Comment>>,
279 stack_group: RefCell<Vec<GroupState>>,
280 stack_class: RefCell<Vec<ClassState>>,
281 capture_names: RefCell<Vec<ast::CaptureName>>,
282 scratch: RefCell<String>,
283}
284
285fn specialize_err<T>(result: Result<T>, from: ast::ErrorKind, to: ast::ErrorKind) -> Result<T> {
286 result.map_err(|e| {
287 if e.kind == from {
288 ParseError {
289 kind: to,
290 pattern: e.pattern,
291 span: e.span,
292 }
293 } else {
294 e
295 }
296 })
297}
298
299fn is_capture_char(c: char, first: bool) -> bool {
300 if first {
301 c == '_' || c.is_alphabetic()
302 } else {
303 c == '_' || c == '.' || c == '[' || c == ']' || c.is_alphanumeric()
304 }
305}
306
307pub fn is_meta_character(c: char) -> bool {
308 matches!(
309 c,
310 '\\' | '.'
311 | '+'
312 | '*'
313 | '?'
314 | '('
315 | ')'
316 | '|'
317 | '['
318 | ']'
319 | '{'
320 | '}'
321 | '^'
322 | '$'
323 | '#'
324 | '&'
325 | '-'
326 | '~'
327 | '_'
328 )
329}
330
331pub fn escape(text: &str) -> String {
333 let mut buf = String::new();
334 escape_into(text, &mut buf);
335 buf
336}
337
338pub fn escape_into(text: &str, buf: &mut String) {
340 buf.reserve(text.len());
341 for c in text.chars() {
342 if is_meta_character(c) {
343 buf.push('\\');
344 }
345 buf.push(c);
346 }
347}
348
349pub fn is_escapeable_character(c: char) -> bool {
350 if is_meta_character(c) {
351 return true;
352 }
353 if !c.is_ascii() {
354 return false;
355 }
356 match c {
357 '0'..='9' | 'A'..='Z' | 'a'..='z' => false,
358 '<' | '>' => false,
359 _ => true,
360 }
361}
362
363fn is_hex(c: char) -> bool {
364 c.is_ascii_digit() || ('a'..='f').contains(&c) || ('A'..='F').contains(&c)
365}
366
367impl<'s> ResharpParser<'s> {
368 fn default_translator_builder(&self) -> TranslatorBuilder {
369 let mut trb = TranslatorBuilder::new();
370 trb.unicode(self.global_unicode);
371 trb.utf8(false);
372 trb.case_insensitive(self.global_case_insensitive);
373 trb
374 }
375
376 pub fn new(pattern: &'s str) -> Self {
377 Self::with_flags(pattern, &PatternFlags::default())
378 }
379
380 pub fn with_flags(pattern: &'s str, flags: &PatternFlags) -> Self {
381 let mut trb = TranslatorBuilder::new();
382 trb.unicode(flags.unicode);
383 trb.utf8(false);
384 trb.case_insensitive(flags.case_insensitive);
385 Self {
386 translator: trb.build(),
387 pattern,
388 perl_classes: vec![],
389 unicode_classes: resharp_algebra::UnicodeClassCache::default(),
390 pos: Cell::new(Position::new(0, 0, 0)),
391 capture_index: Cell::new(0),
392 octal: false,
393 empty_min_range: false,
394 ignore_whitespace: Cell::new(flags.ignore_whitespace),
395 dot_all: Cell::new(flags.dot_matches_new_line),
396 multiline: Cell::new(flags.multiline),
397 global_unicode: flags.unicode || flags.full_unicode || flags.ascii_perl_classes,
398 global_full_unicode: flags.full_unicode,
399 global_ascii_perl: flags.ascii_perl_classes,
400 global_case_insensitive: flags.case_insensitive,
401 expanded_ast_limit: flags.expanded_ast_limit,
402 max_list_len: flags.max_list_len,
403 max_repeat: flags.max_repeat,
404 comments: RefCell::new(vec![]),
405 stack_group: RefCell::new(vec![]),
406 stack_class: RefCell::new(vec![]),
407 capture_names: RefCell::new(vec![]),
408 scratch: RefCell::new(String::new()),
409 }
410 }
411
412 fn parser(&'_ self) -> &'_ ResharpParser<'_> {
413 self
414 }
415
416 fn pattern(&self) -> &str {
417 self.pattern
418 }
419
420 fn error(&self, span: Span, kind: ast::ErrorKind) -> ParseError {
421 ParseError {
422 kind,
423 pattern: self.pattern().to_string(),
424 span,
425 }
426 }
427
428 fn unsupported_error(&self, _: regex_syntax::hir::Error) -> ParseError {
429 self.error(
430 Span::splat(self.pos()),
431 ast::ErrorKind::UnsupportedResharpRegex,
432 )
433 }
434
435 fn offset(&self) -> usize {
436 self.parser().pos.get().offset
437 }
438
439 fn line(&self) -> usize {
440 self.parser().pos.get().line
441 }
442
443 fn column(&self) -> usize {
444 self.parser().pos.get().column
445 }
446
447 fn next_capture_index(&self, span: Span) -> Result<u32> {
448 let current = self.parser().capture_index.get();
449 let i = current
450 .checked_add(1)
451 .ok_or_else(|| self.error(span, ast::ErrorKind::CaptureLimitExceeded))?;
452 self.parser().capture_index.set(i);
453 Ok(i)
454 }
455
456 fn add_capture_name(&self, cap: &ast::CaptureName) -> Result<()> {
457 let mut names = self.parser().capture_names.borrow_mut();
458 match names.binary_search_by_key(&cap.name.as_str(), |c| c.name.as_str()) {
459 Err(i) => {
460 names.insert(i, cap.clone());
461 Ok(())
462 }
463 Ok(i) => Err(self.error(
464 cap.span,
465 ast::ErrorKind::GroupNameDuplicate {
466 original: names[i].span,
467 },
468 )),
469 }
470 }
471
472 fn ignore_whitespace(&self) -> bool {
473 self.parser().ignore_whitespace.get()
474 }
475
476 fn char(&self) -> char {
477 self.char_at(self.offset())
478 }
479
480 fn char_at(&self, i: usize) -> char {
481 self.pattern()[i..]
482 .chars()
483 .next()
484 .unwrap_or_else(|| panic!("expected char at offset {}", i))
485 }
486
487 fn bump(&self) -> bool {
488 if self.is_eof() {
489 return false;
490 }
491 let Position {
492 mut offset,
493 mut line,
494 mut column,
495 } = self.pos();
496 if self.char() == '\n' {
497 line = line.checked_add(1).unwrap();
498 column = 1;
499 } else {
500 column = column.checked_add(1).unwrap();
501 }
502 offset += self.char().len_utf8();
503 self.parser().pos.set(Position {
504 offset,
505 line,
506 column,
507 });
508 self.pattern()[self.offset()..].chars().next().is_some()
509 }
510
511 fn bump_if(&self, prefix: &str) -> bool {
512 if self.pattern()[self.offset()..].starts_with(prefix) {
513 for _ in 0..prefix.chars().count() {
514 self.bump();
515 }
516 true
517 } else {
518 false
519 }
520 }
521
522 fn is_lookaround_prefix(&self) -> Option<(bool, bool)> {
523 if self.bump_if("?=") {
524 return Some((true, true));
525 }
526 if self.bump_if("?!") {
527 return Some((true, false));
528 }
529 if self.bump_if("?<=") {
530 return Some((false, true));
531 }
532 if self.bump_if("?<!") {
533 return Some((false, false));
534 }
535 None
536 }
537
538 fn bump_and_bump_space(&self) -> bool {
539 if !self.bump() {
540 return false;
541 }
542 self.bump_space();
543 !self.is_eof()
544 }
545
546 fn bump_space(&self) {
547 if !self.ignore_whitespace() {
548 return;
549 }
550 while !self.is_eof() {
551 if self.char().is_whitespace() {
552 self.bump();
553 } else if self.char() == '#' {
554 let start = self.pos();
555 let mut comment_text = String::new();
556 self.bump();
557 while !self.is_eof() {
558 let c = self.char();
559 self.bump();
560 if c == '\n' {
561 break;
562 }
563 comment_text.push(c);
564 }
565 let comment = ast::Comment {
566 span: Span::new(start, self.pos()),
567 comment: comment_text,
568 };
569 self.parser().comments.borrow_mut().push(comment);
570 } else {
571 break;
572 }
573 }
574 }
575
576 fn peek(&self) -> Option<char> {
577 if self.is_eof() {
578 return None;
579 }
580 self.pattern()[self.offset() + self.char().len_utf8()..]
581 .chars()
582 .next()
583 }
584
585 fn peek_space(&self) -> Option<char> {
588 if !self.ignore_whitespace() {
589 return self.peek();
590 }
591 if self.is_eof() {
592 return None;
593 }
594 let mut start = self.offset() + self.char().len_utf8();
595 let mut in_comment = false;
596 for (i, c) in self.pattern()[start..].char_indices() {
597 if c.is_whitespace() {
598 continue;
599 } else if !in_comment && c == '#' {
600 in_comment = true;
601 } else if in_comment && c == '\n' {
602 in_comment = false;
603 } else {
604 start += i;
605 break;
606 }
607 }
608 self.pattern()[start..].chars().next()
609 }
610
611 fn is_eof(&self) -> bool {
612 self.offset() == self.pattern().len()
613 }
614
615 fn pos(&self) -> Position {
616 self.parser().pos.get()
617 }
618
619 fn span(&self) -> Span {
620 Span::splat(self.pos())
621 }
622
623 fn span_char(&self) -> Span {
624 let mut next = Position {
625 offset: self.offset().checked_add(self.char().len_utf8()).unwrap(),
626 line: self.line(),
627 column: self.column().checked_add(1).unwrap(),
628 };
629 if self.char() == '\n' {
630 next.line += 1;
631 next.column = 1;
632 }
633 Span::new(self.pos(), next)
634 }
635
636 #[inline(never)]
637 fn push_alternate(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
638 assert_eq!(self.char(), '|');
639 concat.span.end = self.pos();
640 self.push_or_add_alternation(concat);
641 self.bump();
642 Ok(ast::Concat {
643 span: self.span(),
644 asts: vec![],
645 })
646 }
647
648 fn push_or_add_alternation(&self, concat: Concat) {
649 use self::GroupState::*;
650
651 let mut stack = self.parser().stack_group.borrow_mut();
652 if let Some(&mut Alternation(ref mut alts)) = stack.last_mut() {
653 alts.asts.push(concat.into_ast());
654 return;
655 }
656 stack.push(Alternation(ast::Alternation {
657 span: Span::new(concat.span.start, self.pos()),
658 asts: vec![concat.into_ast()],
659 }));
660 }
661
662 #[inline(never)]
663 fn push_intersect(&self, mut concat: Concat) -> Result<Concat> {
664 assert_eq!(self.char(), '&');
665 concat.span.end = self.pos();
666 self.push_or_add_intersect(concat);
667 self.bump();
668 Ok(Concat {
669 span: self.span(),
670 asts: vec![],
671 })
672 }
673
674 fn push_or_add_intersect(&self, concat: Concat) {
675 use self::GroupState::*;
676
677 let mut stack = self.parser().stack_group.borrow_mut();
678 if let Some(&mut Intersection(ref mut alts)) = stack.last_mut() {
679 alts.asts.push(concat.into_ast());
680 return;
681 }
682 stack.push(Intersection(ast::Intersection {
683 span: Span::new(concat.span.start, self.pos()),
684 asts: vec![concat.into_ast()],
685 }));
686 }
687
688 #[inline(never)]
689 fn push_group(&self, mut concat: Concat) -> Result<Concat> {
690 assert_eq!(self.char(), '(');
691 match self.parse_group()? {
692 Either::Left(set) => {
693 let ignore = set.flags.flag_state(ast::Flag::IgnoreWhitespace);
694 if let Some(v) = ignore {
695 self.parser().ignore_whitespace.set(v);
696 }
697
698 concat.asts.push(Ast::flags(set));
699 Ok(concat)
700 }
701 Either::Right(group) => {
702 let old_ignore_whitespace = self.ignore_whitespace();
703 let new_ignore_whitespace = group
704 .flags()
705 .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
706 .unwrap_or(old_ignore_whitespace);
707 self.parser()
708 .stack_group
709 .borrow_mut()
710 .push(GroupState::Group {
711 concat,
712 group,
713 ignore_whitespace: old_ignore_whitespace,
714 });
715 self.parser().ignore_whitespace.set(new_ignore_whitespace);
716 Ok(Concat {
717 span: self.span(),
718 asts: vec![],
719 })
720 }
721 }
722 }
723
724 #[inline(never)]
725 fn push_compl_group(&self, concat: Concat) -> Result<Concat> {
726 assert_eq!(self.char(), '~');
727 self.bump();
728 if self.is_eof() || self.char() != '(' {
729 return Err(self.error(self.span(), ast::ErrorKind::ComplementGroupExpected));
730 }
731 let open_span = self.span_char();
732 self.bump();
733 let group = ast::Group {
734 span: open_span,
735 kind: ast::GroupKind::Complement,
736 ast: Box::new(Ast::empty(self.span())),
737 };
738
739 let old_ignore_whitespace = self.ignore_whitespace();
740 let new_ignore_whitespace = group
741 .flags()
742 .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
743 .unwrap_or(old_ignore_whitespace);
744 self.parser()
745 .stack_group
746 .borrow_mut()
747 .push(GroupState::Group {
748 concat,
749 group,
750 ignore_whitespace: old_ignore_whitespace,
751 });
752 self.parser().ignore_whitespace.set(new_ignore_whitespace);
753 Ok(Concat {
754 span: self.span(),
755 asts: vec![],
756 })
757 }
758
759 #[inline(never)]
760 fn pop_group(&self, mut group_concat: Concat) -> Result<Concat> {
761 use self::GroupState::*;
762 assert_eq!(self.char(), ')');
763 let mut stack = self.parser().stack_group.borrow_mut();
764 let topstack = stack.pop();
765
766 let (mut prior_concat, mut group, ignore_whitespace, alt) = match topstack {
767 Some(Group {
768 concat,
769 group,
770 ignore_whitespace,
771 }) => (concat, group, ignore_whitespace, None),
772 Some(Alternation(alt)) => match stack.pop() {
773 Some(Group {
774 concat,
775 group,
776 ignore_whitespace,
777 }) => (
778 concat,
779 group,
780 ignore_whitespace,
781 Some(Either::Left::<ast::Alternation, ast::Intersection>(alt)),
782 ),
783 None | Some(Alternation(_)) | Some(Intersection(_)) => {
784 return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
785 }
786 },
787 Some(Intersection(int)) => match stack.pop() {
788 Some(Group {
789 concat,
790 group,
791 ignore_whitespace,
792 }) => (
793 concat,
794 group,
795 ignore_whitespace,
796 Some(Either::Right::<ast::Alternation, ast::Intersection>(int)),
797 ),
798 None | Some(Alternation(_)) | Some(Intersection(_)) => {
799 return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
800 }
801 },
802
803 None => {
804 return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
805 }
806 };
807 self.parser().ignore_whitespace.set(ignore_whitespace);
808 group_concat.span.end = self.pos();
809 self.bump();
810 group.span.end = self.pos();
811 match alt {
812 Some(Either::Left(mut alt)) => {
813 alt.span.end = group_concat.span.end;
814 alt.asts.push(group_concat.into_ast());
815 group.ast = Box::new(alt.into_ast());
816 }
817 Some(Either::Right(mut int)) => {
818 int.span.end = group_concat.span.end;
819 int.asts.push(group_concat.into_ast());
820 group.ast = Box::new(int.into_ast());
821 }
822 None => {
823 group.ast = Box::new(group_concat.into_ast());
824 }
825 }
826
827 if group.kind == GroupKind::Complement {
828 let complement = ast::Complement {
829 span: self.span(),
830 ast: group.ast,
831 };
832 prior_concat.asts.push(Ast::complement(complement));
833 }
834 else {
836 prior_concat.asts.push(Ast::group(group));
837 }
838 Ok(prior_concat)
839 }
840
841 #[inline(never)]
842 fn pop_group_end(&self, mut concat: ast::Concat) -> Result<Ast> {
843 concat.span.end = self.pos();
844 let mut stack = self.parser().stack_group.borrow_mut();
845 let ast = match stack.pop() {
846 None => Ok(concat.into_ast()),
847 Some(GroupState::Alternation(mut alt)) => {
848 alt.span.end = self.pos();
849 alt.asts.push(concat.into_ast());
850 Ok(Ast::alternation(alt))
851 }
852 Some(GroupState::Intersection(mut int)) => {
853 int.span.end = self.pos();
854 int.asts.push(concat.into_ast());
855
856 Ok(Ast::intersection(int))
857 }
858 Some(GroupState::Group { group, .. }) => {
859 return Err(self.error(group.span, ast::ErrorKind::GroupUnclosed));
860 }
861 };
862 match stack.pop() {
864 None => ast,
865 Some(GroupState::Alternation(alt)) => {
866 Err(self.error(alt.span, ast::ErrorKind::UnsupportedResharpRegex))
867 }
868 Some(GroupState::Intersection(int)) => {
869 Err(self.error(int.span, ast::ErrorKind::UnsupportedResharpRegex))
870 }
871 Some(GroupState::Group { group, .. }) => {
872 Err(self.error(group.span, ast::ErrorKind::GroupUnclosed))
873 }
874 }
875 }
876
877 #[inline(never)]
878 fn push_class_open(
879 &self,
880 parent_union: regex_syntax::ast::ClassSetUnion,
881 ) -> Result<regex_syntax::ast::ClassSetUnion> {
882 assert_eq!(self.char(), '[');
883
884 let (nested_set, nested_union) = self.parse_set_class_open()?;
885 self.parser()
886 .stack_class
887 .borrow_mut()
888 .push(ClassState::Open {
889 union: parent_union,
890 set: nested_set,
891 });
892 Ok(nested_union)
893 }
894
895 #[inline(never)]
896 fn pop_class(
897 &self,
898 nested_union: regex_syntax::ast::ClassSetUnion,
899 ) -> Result<Either<regex_syntax::ast::ClassSetUnion, regex_syntax::ast::ClassBracketed>> {
900 assert_eq!(self.char(), ']');
901
902 let item = regex_syntax::ast::ClassSet::Item(nested_union.into_item());
903 let prevset = self.pop_class_op(item);
904 let mut stack = self.parser().stack_class.borrow_mut();
905 match stack.pop() {
906 None => panic!("unexpected empty character class stack"),
907 Some(ClassState::Op { .. }) => panic!("unexpected ClassState::Op"),
908 Some(ClassState::Open { mut union, mut set }) => {
909 self.bump();
910 set.span.end = self.pos();
911 set.kind = prevset;
912 if stack.is_empty() {
913 Ok(Either::Right(set))
914 } else {
915 union.push(regex_syntax::ast::ClassSetItem::Bracketed(Box::new(set)));
916 Ok(Either::Left(union))
917 }
918 }
919 }
920 }
921
922 #[inline(never)]
923 fn unclosed_class_error(&self) -> ParseError {
924 for state in self.parser().stack_class.borrow().iter().rev() {
925 if let ClassState::Open { ref set, .. } = *state {
926 return self.error(set.span, ast::ErrorKind::ClassUnclosed);
927 }
928 }
929 panic!("no open character class found")
930 }
931
932 #[inline(never)]
933 fn push_class_op(
934 &self,
935 next_kind: regex_syntax::ast::ClassSetBinaryOpKind,
936 next_union: regex_syntax::ast::ClassSetUnion,
937 ) -> regex_syntax::ast::ClassSetUnion {
938 let item = regex_syntax::ast::ClassSet::Item(next_union.into_item());
939 let new_lhs = self.pop_class_op(item);
940 self.parser().stack_class.borrow_mut().push(ClassState::Op {
941 kind: next_kind,
942 lhs: new_lhs,
943 });
944 regex_syntax::ast::ClassSetUnion {
945 span: self.span(),
946 items: vec![],
947 }
948 }
949
950 #[inline(never)]
951 fn pop_class_op(&self, rhs: regex_syntax::ast::ClassSet) -> regex_syntax::ast::ClassSet {
952 let mut stack = self.parser().stack_class.borrow_mut();
953 let (kind, lhs) = match stack.pop() {
954 Some(ClassState::Op { kind, lhs }) => (kind, lhs),
955 Some(state @ ClassState::Open { .. }) => {
956 stack.push(state);
957 return rhs;
958 }
959 None => unreachable!(),
960 };
961 let span = Span::new(lhs.span().start, rhs.span().end);
962 regex_syntax::ast::ClassSet::BinaryOp(regex_syntax::ast::ClassSetBinaryOp {
963 span,
964 kind,
965 lhs: Box::new(lhs),
966 rhs: Box::new(rhs),
967 })
968 }
969
970 fn hir_to_node_id(&self, hir: &hir::Hir, tb: &mut TB<'s>) -> Result<NodeId> {
971 match hir.kind() {
972 hir::HirKind::Empty => Ok(NodeId::EPS),
973 hir::HirKind::Literal(l) => {
974 if l.0.len() == 1 {
975 let node = tb.mk_u8(l.0[0]);
976 Ok(node)
977 } else {
978 let ws: Vec<_> = l.0.iter().map(|l| tb.mk_u8(*l)).collect();
979 let conc = tb.mk_concats(ws.iter().copied());
980 Ok(conc)
981 }
982 }
983 hir::HirKind::Class(class) => match class {
984 hir::Class::Unicode(class_unicode) => {
985 let ranges = class_unicode.ranges();
986 if ranges.len() == 1
987 && ranges[0].start() == '\u{0}'
988 && ranges[0].end() == '\u{10FFFF}'
989 {
990 return Ok(tb.mk_range_u8(0, 255));
991 }
992 let mut nodes = Vec::new();
993 for range in ranges {
994 for seq in Utf8Sequences::new(range.start(), range.end()) {
995 let sl = seq.as_slice();
996 let bytes: Vec<_> = sl.iter().map(|s| (s.start, s.end)).collect();
997 let node = match bytes.len() {
998 1 => tb.mk_range_u8(bytes[0].0, bytes[0].1),
999 n => {
1000 let last = tb.mk_range_u8(bytes[n - 1].0, bytes[n - 1].1);
1001 let mut conc = last;
1002 for i in (0..n - 1).rev() {
1003 let b = tb.mk_range_u8(bytes[i].0, bytes[i].1);
1004 conc = tb.mk_concat(b, conc);
1005 }
1006 conc
1007 }
1008 };
1009 nodes.push(node);
1010 }
1011 }
1012 let merged = tb.mk_unions(nodes.into_iter());
1013 Ok(merged)
1014 }
1015 hir::Class::Bytes(class_bytes) => {
1016 let ranges = class_bytes.ranges();
1017 let mut result = NodeId::BOT;
1018 for range in ranges {
1019 let start = range.start();
1020 let end = range.end();
1021 let node = tb.mk_range_u8(start, end);
1022 result = tb.mk_union(result, node);
1023 }
1024 Ok(result)
1025 }
1026 },
1027 hir::HirKind::Look(_) => Err(self.error(
1028 Span::splat(self.pos()),
1029 ast::ErrorKind::UnsupportedResharpRegex,
1030 )),
1031 hir::HirKind::Repetition(_) => Err(self.error(
1032 Span::splat(self.pos()),
1033 ast::ErrorKind::UnsupportedResharpRegex,
1034 )),
1035 hir::HirKind::Capture(_) => Err(self.error(
1036 Span::splat(self.pos()),
1037 ast::ErrorKind::UnsupportedResharpRegex,
1038 )),
1039 hir::HirKind::Concat(body) => {
1040 let mut result = NodeId::EPS;
1041 for child in body {
1042 let node = self.hir_to_node_id(child, tb)?;
1043 result = tb.mk_concat(result, node);
1044 }
1045 Ok(result)
1046 }
1047 hir::HirKind::Alternation(_) => Err(self.error(
1048 Span::splat(self.pos()),
1049 ast::ErrorKind::UnsupportedResharpRegex,
1050 )),
1051 }
1052 }
1053
1054 fn translate_ast_to_hir(
1055 &mut self,
1056 orig_ast: ®ex_syntax::ast::Ast,
1057 tb: &mut TB<'s>,
1058 ) -> Result<NodeId> {
1059 match self.translator.translate("", orig_ast) {
1060 Err(_) => Err(self.error(self.span(), ast::ErrorKind::UnicodeClassInvalid)),
1061 Ok(hir) => self.hir_to_node_id(&hir, tb),
1062 }
1063 }
1064
1065 fn translator_to_node_id(
1066 &mut self,
1067 orig_ast: ®ex_syntax::ast::Ast,
1068 translator: &mut Option<Translator>,
1069 tb: &mut TB<'s>,
1070 ) -> Result<NodeId> {
1071 match translator {
1072 Some(tr) => {
1073 let hir = tr
1074 .translate("", orig_ast)
1075 .map_err(|e| self.unsupported_error(e))?;
1076 self.hir_to_node_id(&hir, tb)
1077 }
1078 None => self.translate_ast_to_hir(orig_ast, tb),
1079 }
1080 }
1081
1082 fn get_class(
1083 &mut self,
1084 negated: bool,
1085 kind: regex_syntax::ast::ClassPerlKind,
1086 tb: &mut TB<'s>,
1087 ) -> Result<NodeId> {
1088 let w = self
1089 .perl_classes
1090 .iter()
1091 .find(|(c_neg, c_kind, _)| *c_kind == kind && *c_neg == negated);
1092 match w {
1093 Some((_, _, value)) => Ok(*value),
1094 None => {
1095 let translated = if self.global_ascii_perl {
1096 let pos = match kind {
1097 regex_syntax::ast::ClassPerlKind::Word => {
1098 let az = tb.mk_range_u8(b'a', b'z');
1099 let big = tb.mk_range_u8(b'A', b'Z');
1100 let dig = tb.mk_range_u8(b'0', b'9');
1101 let us = tb.mk_u8(b'_');
1102 tb.mk_unions([az, big, dig, us].into_iter())
1103 }
1104 regex_syntax::ast::ClassPerlKind::Digit => tb.mk_range_u8(b'0', b'9'),
1105 regex_syntax::ast::ClassPerlKind::Space => {
1106 let sp = tb.mk_u8(b' ');
1107 let tab = tb.mk_u8(b'\t');
1108 let nl = tb.mk_u8(b'\n');
1109 let cr = tb.mk_u8(b'\r');
1110 let ff = tb.mk_u8(0x0C);
1111 let vt = tb.mk_u8(0x0B);
1112 tb.mk_unions([sp, tab, nl, cr, ff, vt].into_iter())
1113 }
1114 };
1115 if negated {
1116 resharp_algebra::neg_class(tb, pos)
1117 } else {
1118 pos
1119 }
1120 } else if self.global_unicode {
1121 match kind {
1122 regex_syntax::ast::ClassPerlKind::Word => {
1123 if self.global_full_unicode {
1124 self.unicode_classes.ensure_word_full(tb);
1125 } else {
1126 self.unicode_classes.ensure_word(tb);
1127 }
1128 if negated {
1129 self.unicode_classes.non_word
1130 } else {
1131 self.unicode_classes.word
1132 }
1133 }
1134 regex_syntax::ast::ClassPerlKind::Digit => {
1135 if self.global_full_unicode {
1136 self.unicode_classes.ensure_digit_full(tb);
1137 } else {
1138 self.unicode_classes.ensure_digit(tb);
1139 }
1140 if negated {
1141 self.unicode_classes.non_digit
1142 } else {
1143 self.unicode_classes.digit
1144 }
1145 }
1146 regex_syntax::ast::ClassPerlKind::Space => {
1147 if self.global_full_unicode {
1148 self.unicode_classes.ensure_space_full(tb);
1149 } else {
1150 self.unicode_classes.ensure_space(tb);
1151 }
1152 if negated {
1153 self.unicode_classes.non_space
1154 } else {
1155 self.unicode_classes.space
1156 }
1157 }
1158 }
1159 } else {
1160 let pos = match kind {
1161 regex_syntax::ast::ClassPerlKind::Word => {
1162 let az = tb.mk_range_u8(b'a', b'z');
1163 let big = tb.mk_range_u8(b'A', b'Z');
1164 let dig = tb.mk_range_u8(b'0', b'9');
1165 let us = tb.mk_u8(b'_');
1166 tb.mk_unions([az, big, dig, us].into_iter())
1167 }
1168 regex_syntax::ast::ClassPerlKind::Digit => tb.mk_range_u8(b'0', b'9'),
1169 regex_syntax::ast::ClassPerlKind::Space => {
1170 let sp = tb.mk_u8(b' ');
1171 let tab = tb.mk_u8(b'\t');
1172 let nl = tb.mk_u8(b'\n');
1173 let cr = tb.mk_u8(b'\r');
1174 let ff = tb.mk_u8(0x0C);
1175 let vt = tb.mk_u8(0x0B);
1176 tb.mk_unions([sp, tab, nl, cr, ff, vt].into_iter())
1177 }
1178 };
1179 if negated {
1180 tb.mk_compl(pos)
1181 } else {
1182 pos
1183 }
1184 };
1185 self.perl_classes.push((negated, kind, translated));
1186 Ok(translated)
1187 }
1188 }
1189 }
1190
1191 fn word_char_kind(ast: &Ast, left: bool) -> WordCharKind {
1192 use WordCharKind::*;
1193 match ast {
1194 Ast::Literal(lit) => {
1195 if is_word_byte(lit.c as u8) {
1196 Word
1197 } else {
1198 NonWord
1199 }
1200 }
1201 Ast::ClassPerl(c) => match (&c.kind, c.negated) {
1202 (®ex_syntax::ast::ClassPerlKind::Word, false) => Word,
1203 (®ex_syntax::ast::ClassPerlKind::Word, true) => NonWord,
1204 (®ex_syntax::ast::ClassPerlKind::Space, false) => NonWord,
1205 (®ex_syntax::ast::ClassPerlKind::Space, true) => Unknown,
1206 (®ex_syntax::ast::ClassPerlKind::Digit, false) => Word,
1207 (®ex_syntax::ast::ClassPerlKind::Digit, true) => Unknown,
1208 },
1209 Ast::ClassBracketed(c) => class_bracketed_word_kind(c),
1210 Ast::Dot(_) | Ast::Top(_) => Unknown,
1211 Ast::Group(g) => Self::word_char_kind(&g.ast, left),
1212 Ast::Concat(c) if !c.asts.is_empty() => {
1213 let edge = if left { c.asts.len() - 1 } else { 0 };
1214 let kind = Self::word_char_kind(&c.asts[edge], left);
1215 match kind {
1216 MaybeWord => {
1217 let dir: isize = if left { -1 } else { 1 };
1218 match Self::concat_neighbor_kind(&c.asts, edge, dir) {
1219 Word => Word,
1220 _ => MaybeWord,
1221 }
1222 }
1223 MaybeNonWord => {
1224 let dir: isize = if left { -1 } else { 1 };
1225 match Self::concat_neighbor_kind(&c.asts, edge, dir) {
1226 NonWord => NonWord,
1227 _ => MaybeNonWord,
1228 }
1229 }
1230 other => other,
1231 }
1232 }
1233 Ast::Alternation(alt) if !alt.asts.is_empty() => {
1234 let first = Self::word_char_kind(&alt.asts[0], left);
1235 if alt.asts[1..]
1236 .iter()
1237 .all(|a| Self::word_char_kind(a, left) == first)
1238 {
1239 first
1240 } else {
1241 Unknown
1242 }
1243 }
1244 Ast::Repetition(r) => {
1245 let inner = Self::word_char_kind(&r.ast, left);
1246 let nullable = matches!(
1247 &r.op.kind,
1248 ast::RepetitionKind::ZeroOrMore
1249 | ast::RepetitionKind::ZeroOrOne
1250 | ast::RepetitionKind::Range(ast::RepetitionRange::Bounded(0, _))
1251 );
1252 if nullable {
1253 match inner {
1254 Word => MaybeWord,
1255 NonWord => MaybeNonWord,
1256 _ => Unknown,
1257 }
1258 } else {
1259 inner
1260 }
1261 }
1262 Ast::Lookaround(la) => match la.kind {
1263 ast::LookaroundKind::PositiveLookahead | ast::LookaroundKind::PositiveLookbehind => {
1264 Self::word_char_kind(&la.ast, left)
1265 }
1266 _ => Unknown,
1267 },
1268 _ => Unknown,
1269 }
1270 }
1271
1272 fn edge_class_ast(ast: &Ast, left: bool) -> Option<&Ast> {
1274 match ast {
1275 Ast::Literal(_)
1276 | Ast::ClassPerl(_)
1277 | Ast::ClassBracketed(_)
1278 | Ast::ClassUnicode(_)
1279 | Ast::Dot(_)
1280 | Ast::Top(_) => Some(ast),
1281 Ast::Group(g) => Self::edge_class_ast(&g.ast, left),
1282 Ast::Concat(c) if !c.asts.is_empty() => {
1283 Self::edge_class_ast(&c.asts[if left { c.asts.len() - 1 } else { 0 }], left)
1284 }
1285 Ast::Repetition(r) => {
1286 let nullable = matches!(
1287 &r.op.kind,
1288 ast::RepetitionKind::ZeroOrMore
1289 | ast::RepetitionKind::ZeroOrOne
1290 | ast::RepetitionKind::Range(ast::RepetitionRange::Bounded(0, _))
1291 );
1292 if nullable {
1293 None
1294 } else {
1295 Self::edge_class_ast(&r.ast, left)
1296 }
1297 }
1298 _ => None,
1299 }
1300 }
1301
1302 fn resolve_word_kind(
1303 &mut self,
1304 asts: &[Ast],
1305 idx: usize,
1306 dir: isize,
1307 translator: &mut Option<Translator>,
1308 tb: &mut TB<'s>,
1309 word_id: NodeId,
1310 not_word_id: NodeId,
1311 ) -> Result<WordCharKind> {
1312 use WordCharKind::*;
1313 let fast = Self::concat_neighbor_kind(asts, idx, dir);
1314 if fast != Unknown {
1315 return Ok(fast);
1316 }
1317 let neighbor_idx = (idx as isize + dir) as usize;
1318 let node = if let Some(edge) = Self::edge_class_ast(&asts[neighbor_idx], dir < 0) {
1319 self.ast_to_node_id(edge, translator, tb)?
1320 } else if dir > 0 {
1321 let mut bodies: Vec<NodeId> = vec![];
1322 let mut j = neighbor_idx;
1323 while j < asts.len() {
1324 match &asts[j] {
1325 Ast::Lookaround(la) => {
1326 let kind = la.kind.clone();
1327 let lookbehind = matches!(
1328 kind,
1329 ast::LookaroundKind::PositiveLookbehind
1330 | ast::LookaroundKind::NegativeLookbehind
1331 );
1332 if lookbehind {
1333 j += 1;
1334 continue;
1335 }
1336 let body = self.ast_to_node_id(&la.ast, translator, tb)?;
1337 let body = tb.try_elim_lookarounds(body).ok_or_else(|| {
1338 self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex)
1339 })?;
1340 let body_ts = tb.mk_concat(body, NodeId::TS);
1341 let constraint = match kind {
1342 ast::LookaroundKind::PositiveLookahead => body_ts,
1343 ast::LookaroundKind::NegativeLookahead => tb.mk_compl(body_ts),
1344 _ => unreachable!(),
1345 };
1346 bodies.push(constraint);
1347 j += 1;
1348 }
1349 other => {
1350 let n = self.ast_to_node_id(other, translator, tb)?;
1351 let n = tb.try_elim_lookarounds(n).ok_or_else(|| {
1352 self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex)
1353 })?;
1354 bodies.push(tb.mk_concat(n, NodeId::TS));
1355 break;
1356 }
1357 }
1358 }
1359 if bodies.is_empty() {
1360 return Ok(Unknown);
1361 }
1362 let combined = tb.mk_inters(bodies.into_iter());
1363 let word_prefix = tb.mk_concat(word_id, NodeId::TS);
1364 let non_word_prefix = tb.mk_concat(not_word_id, NodeId::TS);
1365 return if tb.subsumes(word_prefix, combined) == Some(true) {
1366 Ok(Word)
1367 } else if tb.subsumes(non_word_prefix, combined) == Some(true) {
1368 Ok(NonWord)
1369 } else {
1370 Ok(Unknown)
1371 };
1372 } else {
1373 let neighbor_node = self.ast_to_node_id(&asts[neighbor_idx], translator, tb)?;
1374 let mut neighbor_node = tb
1375 .try_elim_lookarounds(neighbor_node)
1376 .ok_or_else(|| self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))?;
1377 neighbor_node = tb.reverse(neighbor_node).or_else(|_| {
1378 Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))
1379 })?;
1380 let word_prefix = tb.mk_concat(word_id, NodeId::TS);
1381 let non_word_prefix = tb.mk_concat(not_word_id, NodeId::TS);
1382 return if tb.subsumes(word_prefix, neighbor_node) == Some(true) {
1383 Ok(Word)
1384 } else if tb.subsumes(non_word_prefix, neighbor_node) == Some(true) {
1385 Ok(NonWord)
1386 } else {
1387 Ok(Unknown)
1388 };
1389 };
1390 if tb.subsumes(word_id, node) == Some(true) {
1391 Ok(Word)
1392 } else if tb.subsumes(not_word_id, node) == Some(true) {
1393 Ok(NonWord)
1394 } else {
1395 Ok(Unknown)
1396 }
1397 }
1398
1399 fn concat_neighbor_kind(asts: &[Ast], idx: usize, dir: isize) -> WordCharKind {
1400 use WordCharKind::*;
1401 let next = idx as isize + dir;
1402 if next < 0 || next >= asts.len() as isize {
1403 return Edge;
1404 }
1405 let kind = Self::word_char_kind(&asts[next as usize], dir < 0);
1406 match kind {
1407 MaybeWord => match Self::concat_neighbor_kind(asts, next as usize, dir) {
1408 Word => Word,
1409 _ => Unknown,
1410 },
1411 MaybeNonWord => match Self::concat_neighbor_kind(asts, next as usize, dir) {
1412 NonWord => NonWord,
1413 _ => Unknown,
1414 },
1415 other => other,
1416 }
1417 }
1418
1419 fn rewrite_word_boundary_in_concat(
1420 &mut self,
1421 asts: &[Ast],
1422 idx: usize,
1423 translator: &mut Option<Translator>,
1424 tb: &mut TB<'s>,
1425 negated: bool,
1426 ) -> Result<(NodeId, usize)> {
1427 use WordCharKind::*;
1428 let (word_id, not_word_id) = if self.global_full_unicode {
1429 self.unicode_classes.ensure_word_full(tb);
1430 (self.unicode_classes.word, self.unicode_classes.non_word)
1431 } else if self.global_unicode && !self.global_ascii_perl {
1432 self.unicode_classes.ensure_word(tb);
1433 (self.unicode_classes.word, self.unicode_classes.non_word)
1434 } else {
1435 let az = tb.mk_range_u8(b'a', b'z');
1436 let big = tb.mk_range_u8(b'A', b'Z');
1437 let dig = tb.mk_range_u8(b'0', b'9');
1438 let us = tb.mk_u8(b'_');
1439 let w = tb.mk_unions([az, big, dig, us].into_iter());
1440 (w, resharp_algebra::neg_class(tb, w))
1441 };
1442 let left = self.resolve_word_kind(asts, idx, -1, translator, tb, word_id, not_word_id)?;
1443 let right = self.resolve_word_kind(asts, idx, 1, translator, tb, word_id, not_word_id)?;
1444 let boundary_match = !negated;
1445 match (left, right) {
1446 (NonWord, Word) | (Word, NonWord) => Ok((
1447 if boundary_match { NodeId::EPS } else { NodeId::BOT },
1448 idx + 1,
1449 )),
1450 (Word, Word) | (NonWord, NonWord) => Ok((
1451 if boundary_match { NodeId::BOT } else { NodeId::EPS },
1452 idx + 1,
1453 )),
1454 (Word, _) => {
1455 if boundary_match {
1456 Ok((tb.mk_neg_lookahead(word_id, 0), idx + 1))
1457 } else {
1458 let tail = tb.mk_concat(word_id, NodeId::TS);
1459 self.merge_boundary_with_following_lookaheads(asts, idx, tail, translator, tb)
1460 }
1461 }
1462 (NonWord, _) => {
1463 if boundary_match {
1464 let tail = tb.mk_concat(word_id, NodeId::TS);
1465 self.merge_boundary_with_following_lookaheads(asts, idx, tail, translator, tb)
1466 } else {
1467 Ok((tb.mk_neg_lookahead(word_id, 0), idx + 1))
1468 }
1469 }
1470 (_, Word) => {
1471 if boundary_match {
1472 Ok((tb.mk_neg_lookbehind(word_id), idx + 1))
1473 } else {
1474 Ok((tb.mk_lookbehind(word_id, NodeId::MISSING), idx + 1))
1475 }
1476 }
1477 (_, NonWord) => {
1478 if boundary_match {
1479 Ok((tb.mk_lookbehind(word_id, NodeId::MISSING), idx + 1))
1480 } else {
1481 Ok((tb.mk_neg_lookbehind(word_id), idx + 1))
1482 }
1483 }
1484 _ => Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex)),
1488 }
1489 }
1490
1491 fn merge_boundary_with_following_lookaheads(
1492 &mut self,
1493 asts: &[Ast],
1494 wb_idx: usize,
1495 boundary_tail: NodeId,
1496 translator: &mut Option<Translator>,
1497 tb: &mut TB<'s>,
1498 ) -> Result<(NodeId, usize)> {
1499 let mut next = wb_idx + 1;
1500 let mut la_bodies = vec![boundary_tail];
1501 while next < asts.len() {
1502 match &asts[next] {
1503 Ast::Lookaround(la) if la.kind == ast::LookaroundKind::PositiveLookahead => {
1504 let body = self.ast_to_node_id(&la.ast, translator, tb)?;
1505 la_bodies.push(tb.mk_concat(body, NodeId::TS));
1506 next += 1;
1507 }
1508 _ => break,
1509 }
1510 }
1511 let merged = tb.mk_inters(la_bodies.into_iter());
1512 Ok((tb.mk_lookahead(merged, NodeId::MISSING, 0), next))
1513 }
1514
1515 fn ast_to_node_id(
1516 &mut self,
1517 ast: &Ast,
1518 translator: &mut Option<Translator>,
1519 tb: &mut TB<'s>,
1520 ) -> Result<NodeId> {
1521 match ast {
1522 Ast::Empty(_) => Ok(NodeId::EPS),
1523 Ast::Flags(f) => {
1524 if f.flags.flag_state(ast::Flag::SwapGreed).is_some() {
1525 return Err(self.error(f.span, ast::ErrorKind::UnsupportedResharpRegex));
1526 }
1527 let mut translator_builder = self.default_translator_builder();
1528 if let Some(state) = f.flags.flag_state(ast::Flag::CaseInsensitive) {
1529 translator_builder.case_insensitive(state);
1530 }
1531 if let Some(state) = f.flags.flag_state(ast::Flag::Unicode) {
1532 translator_builder.unicode(state);
1533 }
1534 if let Some(state) = f.flags.flag_state(ast::Flag::DotMatchesNewLine) {
1535 self.dot_all.set(state);
1536 }
1537 if let Some(state) = f.flags.flag_state(ast::Flag::MultiLine) {
1538 self.multiline.set(state);
1539 }
1540 let concat_translator = Some(translator_builder.build());
1541 *translator = concat_translator;
1542 Ok(NodeId::EPS)
1543 }
1544 Ast::Literal(l) => {
1545 let ast_lit = regex_syntax::ast::Ast::literal(*l.to_owned());
1546 self.translator_to_node_id(&ast_lit, translator, tb)
1547 }
1548 Ast::Top(_) => Ok(NodeId::TOP),
1549 Ast::Dot(_) => {
1550 let codepoint_dot = self.global_ascii_perl || self.global_full_unicode;
1551 let hirv = match (codepoint_dot, self.dot_all.get()) {
1552 (true, true) => hir::Hir::dot(hir::Dot::AnyChar),
1553 (true, false) => hir::Hir::dot(hir::Dot::AnyCharExceptLF),
1554 (false, true) => return Ok(NodeId::TOP),
1555 (false, false) => hir::Hir::dot(hir::Dot::AnyByteExceptLF),
1556 };
1557 self.hir_to_node_id(&hirv, tb)
1558 }
1559 Ast::Assertion(a) => match &a.kind {
1560 ast::AssertionKind::StartText => Ok(NodeId::BEGIN),
1561 ast::AssertionKind::EndText => Ok(NodeId::END),
1562 ast::AssertionKind::WordBoundary => {
1563 let only = Ast::Assertion(a.clone());
1564 let asts = std::slice::from_ref(&only);
1565 let (node, _) = self
1566 .rewrite_word_boundary_in_concat(asts, 0, translator, tb, false)?;
1567 Ok(node)
1568 }
1569 ast::AssertionKind::NotWordBoundary => {
1570 let only = Ast::Assertion(a.clone());
1572 let asts = std::slice::from_ref(&only);
1573 let (node, _) = self
1574 .rewrite_word_boundary_in_concat(asts, 0, translator, tb, true)?;
1575 Ok(node)
1576 }
1577 ast::AssertionKind::StartLine => {
1578 if !self.multiline.get() {
1579 return Ok(NodeId::BEGIN);
1580 }
1581 let left = NodeId::BEGIN;
1582 let right = tb.mk_u8(b'\n');
1583 let union = tb.mk_union(left, right);
1584 Ok(tb.mk_lookbehind(union, NodeId::MISSING))
1585 }
1586 ast::AssertionKind::EndLine => {
1587 if !self.multiline.get() {
1588 return Ok(NodeId::END);
1589 }
1590 let left = NodeId::END;
1591 let right = tb.mk_u8(b'\n');
1592 let union = tb.mk_union(left, right);
1593 Ok(tb.mk_lookahead(union, NodeId::MISSING, 0))
1594 }
1595 ast::AssertionKind::WordBoundaryStart => {
1596 Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
1597 }
1598 ast::AssertionKind::WordBoundaryEnd => {
1599 Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
1600 }
1601 ast::AssertionKind::WordBoundaryStartAngle => {
1602 Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
1603 }
1604 ast::AssertionKind::WordBoundaryEndAngle => {
1605 Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
1606 }
1607 ast::AssertionKind::WordBoundaryStartHalf => {
1608 Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
1609 }
1610 ast::AssertionKind::WordBoundaryEndHalf => {
1611 Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
1612 }
1613 },
1614 Ast::ClassUnicode(c) => {
1615 let tmp = regex_syntax::ast::ClassUnicode {
1616 span: c.span,
1617 negated: c.negated,
1618 kind: c.kind.clone(),
1619 };
1620 if !c.negated {
1621 if let regex_syntax::ast::ClassUnicodeKind::Named(s) = &c.kind {
1622 match s.as_str() {
1623 "ascii" => return Ok(tb.mk_range_u8(0, 127)),
1625 "utf8" => {
1627 let ascii = tb.mk_range_u8(0, 127);
1628 let beta = tb.mk_range_u8(128, 0xBF);
1629 let c0 = tb.mk_range_u8(0xC0, 0xDF);
1630 let c0s = tb.mk_concats([c0, beta].into_iter());
1631 let e0 = tb.mk_range_u8(0xE0, 0xEF);
1632 let e0s = tb.mk_concats([e0, beta, beta].into_iter());
1633 let f0 = tb.mk_range_u8(0xF0, 0xF7);
1634 let f0s = tb.mk_concats([f0, beta, beta, beta].into_iter());
1635 return Ok(tb.mk_unions([ascii, c0s, e0s, f0s].into_iter()));
1636 }
1637 "hex" => {
1638 let nums = tb.mk_range_u8(b'0', b'9');
1639 let lets = tb.mk_range_u8(b'a', b'f');
1640 let lets2 = tb.mk_range_u8(b'A', b'F');
1641 let merged = tb.mk_unions([nums, lets, lets2].into_iter());
1642 return Ok(merged);
1643 }
1644 _ => {}
1645 }
1646 };
1647 }
1648
1649 let orig_ast = regex_syntax::ast::Ast::class_unicode(tmp);
1650 self.translator_to_node_id(&orig_ast, translator, tb)
1651 }
1652 Ast::ClassPerl(c) => self.get_class(c.negated, c.kind.clone(), tb),
1653 Ast::ClassBracketed(c) => match &c.kind {
1654 regex_syntax::ast::ClassSet::Item(item) => {
1655 if !c.negated && is_universal_perl_pair(item) {
1656 return Ok(NodeId::TOP);
1657 }
1658 let tmp = regex_syntax::ast::ClassBracketed {
1659 span: c.span,
1660 negated: c.negated,
1661 kind: c.kind.clone(),
1662 };
1663 let orig_ast = regex_syntax::ast::Ast::class_bracketed(tmp);
1664 self.translator_to_node_id(&orig_ast, translator, tb)
1665 }
1666 regex_syntax::ast::ClassSet::BinaryOp(_) => {
1667 Err(self.error(c.span, ast::ErrorKind::UnsupportedResharpRegex))
1668 }
1669 },
1670 Ast::Repetition(r) => {
1671 let body = self.ast_to_node_id(&r.ast, translator, tb);
1672 match body {
1673 Ok(body) => match &r.op.kind {
1674 ast::RepetitionKind::ZeroOrOne => Ok(tb.mk_opt(body)),
1675 ast::RepetitionKind::ZeroOrMore => Ok(tb.mk_star(body)),
1676 ast::RepetitionKind::OneOrMore => Ok(tb.mk_plus(body)),
1677 ast::RepetitionKind::Range(r) => match r {
1678 ast::RepetitionRange::Exactly(n) => Ok(tb.mk_repeat(body, *n, *n)),
1679 ast::RepetitionRange::AtLeast(n) => {
1680 let rep = tb.mk_repeat(body, *n, *n);
1681 let st = tb.mk_star(body);
1682 Ok(tb.mk_concat(rep, st))
1683 }
1684
1685 ast::RepetitionRange::Bounded(n, m) => Ok(tb.mk_repeat(body, *n, *m)),
1686 },
1687 },
1688 Err(_) => body,
1689 }
1690 }
1691 Ast::Lookaround(g) => {
1692 let body = self.ast_to_node_id(&g.ast, translator, tb)?;
1693 match g.kind {
1694 ast::LookaroundKind::PositiveLookahead if body.contains_lookbehind(tb) => {
1695 let mut prefix = NodeId::EPS;
1696 let mut rest = body;
1697 while tb.get_kind(rest) == Kind::Concat
1698 && tb.get_kind(rest.left(tb)) == Kind::Lookbehind
1699 {
1700 prefix = tb.mk_concat(prefix, rest.left(tb));
1701 rest = rest.right(tb);
1702 }
1703 if prefix == NodeId::EPS || rest.contains_lookbehind(tb) {
1704 return Err(self.error(g.span, ast::ErrorKind::UnsupportedResharpRegex));
1705 }
1706 let la = tb.mk_lookahead(rest, NodeId::MISSING, 0);
1707 Ok(tb.mk_concat(prefix, la))
1708 }
1709 ast::LookaroundKind::NegativeLookahead if body.contains_lookbehind(tb) => {
1710 Err(self.error(g.span, ast::ErrorKind::UnsupportedResharpRegex))
1711 }
1712 ast::LookaroundKind::PositiveLookahead => {
1713 Ok(tb.mk_lookahead(body, NodeId::MISSING, 0))
1714 }
1715 ast::LookaroundKind::PositiveLookbehind => {
1716 Ok(tb.mk_lookbehind(body, NodeId::MISSING))
1717 }
1718 ast::LookaroundKind::NegativeLookahead => Ok(tb.mk_neg_lookahead(body, 0)),
1719 ast::LookaroundKind::NegativeLookbehind => Ok(tb.mk_neg_lookbehind(body)),
1720 }
1721 }
1722 Ast::Group(g) => {
1723 if let ast::GroupKind::NonCapturing(ref flags) = g.kind {
1724 if !flags.items.is_empty() {
1725 let mut translator_builder = self.default_translator_builder();
1726 if let Some(state) = flags.flag_state(ast::Flag::CaseInsensitive) {
1727 translator_builder.case_insensitive(state);
1728 }
1729 if let Some(state) = flags.flag_state(ast::Flag::Unicode) {
1730 translator_builder.unicode(state);
1731 }
1732 let saved_dot_all = self.dot_all.get();
1733 if let Some(state) = flags.flag_state(ast::Flag::DotMatchesNewLine) {
1734 self.dot_all.set(state);
1735 }
1736 let saved_multiline = self.multiline.get();
1737 if let Some(state) = flags.flag_state(ast::Flag::MultiLine) {
1738 self.multiline.set(state);
1739 }
1740 let mut scoped = Some(translator_builder.build());
1741 let result = self.ast_to_node_id(&g.ast, &mut scoped, tb);
1742 self.dot_all.set(saved_dot_all);
1743 self.multiline.set(saved_multiline);
1744 return result;
1745 }
1746 }
1747 self.ast_to_node_id(&g.ast, translator, tb)
1748 }
1749 Ast::Alternation(a) => {
1750 let mut children = vec![];
1751 for ast in &a.asts {
1752 match self.ast_to_node_id(ast, translator, tb) {
1753 Ok(node_id) => children.push(node_id),
1754 Err(err) => return Err(err),
1755 }
1756 }
1757 Ok(tb.mk_unions(children.iter().copied()))
1758 }
1759 Ast::Concat(c) => {
1760 let mut concat_translator: Option<Translator> = None;
1761 let mut children = vec![];
1762 let mut i = 0;
1763 while i < c.asts.len() {
1764 let ast = &c.asts[i];
1765 match ast {
1766 Ast::Flags(f) => {
1767 if f.flags.flag_state(ast::Flag::SwapGreed).is_some() {
1768 return Err(
1769 self.error(f.span, ast::ErrorKind::UnsupportedResharpRegex)
1770 );
1771 }
1772 let mut translator_builder = self.default_translator_builder();
1773 if let Some(state) = f.flags.flag_state(ast::Flag::CaseInsensitive) {
1774 translator_builder.case_insensitive(state);
1775 }
1776 if let Some(state) = f.flags.flag_state(ast::Flag::Unicode) {
1777 translator_builder.unicode(state);
1778 }
1779 if let Some(state) = f.flags.flag_state(ast::Flag::DotMatchesNewLine) {
1780 self.dot_all.set(state);
1781 }
1782 if let Some(state) = f.flags.flag_state(ast::Flag::MultiLine) {
1783 self.multiline.set(state);
1784 }
1785 concat_translator = Some(translator_builder.build());
1786 *translator = concat_translator.clone();
1787 i += 1;
1788 continue;
1789 }
1790 Ast::Assertion(a) if a.kind == ast::AssertionKind::WordBoundary => {
1791 let node = self
1792 .rewrite_word_boundary_in_concat(&c.asts, i, translator, tb, false)?;
1793 children.push(node.0);
1794 i = node.1; continue;
1796 }
1797 Ast::Assertion(a) if a.kind == ast::AssertionKind::NotWordBoundary => {
1798 let node = self
1799 .rewrite_word_boundary_in_concat(&c.asts, i, translator, tb, true)?;
1800 children.push(node.0);
1801 i = node.1;
1802 continue;
1803 }
1804 _ => {}
1805 }
1806 match concat_translator {
1807 Some(_) => match self.ast_to_node_id(ast, &mut concat_translator, tb) {
1808 Ok(node_id) => children.push(node_id),
1809 Err(err) => return Err(err),
1810 },
1811 None => match self.ast_to_node_id(ast, translator, tb) {
1812 Ok(node_id) => children.push(node_id),
1813 Err(err) => return Err(err),
1814 },
1815 }
1816 i += 1;
1817 }
1818 Ok(tb.mk_concats(children.iter().cloned()))
1819 }
1820 Ast::Intersection(intersection) => {
1821 let mut children = vec![];
1822 for ast in &intersection.asts {
1823 match self.ast_to_node_id(ast, translator, tb) {
1824 Ok(node_id) => children.push(node_id),
1825 Err(err) => return Err(err),
1826 }
1827 }
1828 Ok(tb.mk_inters(children.into_iter()))
1829 }
1830 Ast::Complement(complement) => {
1831 let body = self.ast_to_node_id(&complement.ast, translator, tb);
1832 body.map(|x| tb.mk_compl(x))
1833 }
1834 }
1835 }
1836
1837 fn parse_inner(&mut self) -> Result<Ast> {
1838 let mut concat = Concat {
1839 span: self.span(),
1840 asts: vec![],
1841 };
1842 loop {
1843 self.bump_space();
1844 if self.is_eof() {
1845 break;
1846 }
1847 match self.char() {
1848 '(' => concat = self.push_group(concat)?,
1849 ')' => concat = self.pop_group(concat)?,
1850 '|' => concat = self.push_alternate(concat)?,
1851 '&' => concat = self.push_intersect(concat)?,
1852 '~' => concat = self.push_compl_group(concat)?,
1853 '[' => {
1854 let class = self.parse_set_class()?;
1855 concat.asts.push(Ast::class_bracketed(class));
1856 }
1857 '?' => {
1858 concat =
1859 self.parse_uncounted_repetition(concat, ast::RepetitionKind::ZeroOrOne)?;
1860 }
1861 '*' => {
1862 concat =
1863 self.parse_uncounted_repetition(concat, ast::RepetitionKind::ZeroOrMore)?;
1864 }
1865 '+' => {
1866 concat =
1867 self.parse_uncounted_repetition(concat, ast::RepetitionKind::OneOrMore)?;
1868 }
1869 '{' => {
1870 concat = self.parse_counted_repetition(concat)?;
1871 }
1872 _ => concat.asts.push(self.parse_primitive()?.into_ast()),
1873 }
1874 }
1875 let ast = self.pop_group_end(concat)?;
1876 if expanded_ast_size(&ast, self.expanded_ast_limit) >= self.expanded_ast_limit
1877 || max_concat_length(&ast) >= self.max_list_len
1878 {
1879 return Err(self.error(*ast.span(), ast::ErrorKind::UnsupportedResharpRegex));
1880 }
1881 Ok(ast)
1882 }
1883
1884 fn parse(&mut self, tb: &mut TB<'s>) -> Result<NodeId> {
1885 let ast = self.parse_inner()?;
1886 self.ast_to_node_id(&ast, &mut None, tb)
1887 }
1888
1889 #[inline(never)]
1890 fn parse_uncounted_repetition(
1891 &self,
1892 mut concat: ast::Concat,
1893 kind: ast::RepetitionKind,
1894 ) -> Result<ast::Concat> {
1895 let op_start = self.pos();
1897 let ast = match concat.asts.pop() {
1898 Some(ast) => ast,
1899 None => return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing)),
1900 };
1901 match ast {
1902 Ast::Empty(_) | Ast::Flags(_) => {
1903 return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing))
1904 }
1905 _ => {}
1906 }
1907 if self.bump() && self.char() == '?' {
1908 return Err(self.error(
1909 Span::new(op_start, self.pos()),
1910 ast::ErrorKind::UnsupportedLazyQuantifier,
1911 ));
1912 }
1913 concat.asts.push(Ast::repetition(ast::Repetition {
1914 span: ast.span().with_end(self.pos()),
1915 op: ast::RepetitionOp {
1916 span: Span::new(op_start, self.pos()),
1917 kind,
1918 },
1919 greedy: true,
1920 ast: Box::new(ast),
1921 }));
1922 Ok(concat)
1923 }
1924
1925 #[inline(never)]
1926 fn parse_counted_repetition(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
1927 assert!(self.char() == '{');
1928 let start = self.pos();
1929 let ast = match concat.asts.pop() {
1930 Some(ast) => ast,
1931 None => return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing)),
1932 };
1933 match ast {
1934 Ast::Empty(_) | Ast::Flags(_) => {
1935 return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing))
1936 }
1937 _ => {}
1938 }
1939 if !self.bump_and_bump_space() {
1940 return Err(self.error(
1941 Span::new(start, self.pos()),
1942 ast::ErrorKind::RepetitionCountUnclosed,
1943 ));
1944 }
1945 let count_start = specialize_err(
1946 self.parse_decimal(),
1947 ast::ErrorKind::DecimalEmpty,
1948 ast::ErrorKind::RepetitionCountDecimalEmpty,
1949 );
1950 if self.is_eof() {
1951 return Err(self.error(
1952 Span::new(start, self.pos()),
1953 ast::ErrorKind::RepetitionCountUnclosed,
1954 ));
1955 }
1956 let range = if self.char() == ',' {
1957 if !self.bump_and_bump_space() {
1958 return Err(self.error(
1959 Span::new(start, self.pos()),
1960 ast::ErrorKind::RepetitionCountUnclosed,
1961 ));
1962 }
1963 if self.char() != '}' {
1964 let count_start = match count_start {
1965 Ok(c) => c,
1966 Err(err) if err.kind == ast::ErrorKind::RepetitionCountDecimalEmpty => {
1967 if self.parser().empty_min_range {
1968 0
1969 } else {
1970 return Err(err);
1971 }
1972 }
1973 err => err?,
1974 };
1975 let count_end = specialize_err(
1976 self.parse_decimal(),
1977 ast::ErrorKind::DecimalEmpty,
1978 ast::ErrorKind::RepetitionCountDecimalEmpty,
1979 )?;
1980 ast::RepetitionRange::Bounded(count_start, count_end)
1981 } else {
1982 ast::RepetitionRange::AtLeast(count_start?)
1983 }
1984 } else {
1985 ast::RepetitionRange::Exactly(count_start?)
1986 };
1987
1988 if self.is_eof() || self.char() != '}' {
1989 return Err(self.error(
1990 Span::new(start, self.pos()),
1991 ast::ErrorKind::RepetitionCountUnclosed,
1992 ));
1993 }
1994
1995 if self.bump_and_bump_space() && self.char() == '?' {
1996 return Err(self.error(
1997 Span::new(start, self.pos()),
1998 ast::ErrorKind::UnsupportedLazyQuantifier,
1999 ));
2000 }
2001
2002 let op_span = Span::new(start, self.pos());
2003 if !range.is_valid() {
2004 return Err(self.error(op_span, ast::ErrorKind::RepetitionCountInvalid));
2005 }
2006
2007 let over_limit = match &range {
2008 ast::RepetitionRange::Exactly(n) => *n > self.max_repeat,
2009 ast::RepetitionRange::AtLeast(n) => *n > self.max_repeat,
2010 ast::RepetitionRange::Bounded(n, m) => {
2011 *n > self.max_repeat || *m > self.max_repeat
2012 }
2013 };
2014 if over_limit {
2015 return Err(self.error(op_span, ast::ErrorKind::UnsupportedResharpRegex));
2016 }
2017 concat.asts.push(Ast::repetition(ast::Repetition {
2018 span: ast.span().with_end(self.pos()),
2019 op: ast::RepetitionOp {
2020 span: op_span,
2021 kind: ast::RepetitionKind::Range(range),
2022 },
2023 greedy: true,
2024 ast: Box::new(ast),
2025 }));
2026 Ok(concat)
2027 }
2028
2029 #[inline(never)]
2030 fn parse_group(&self) -> Result<Either<ast::SetFlags, ast::Group>> {
2031 assert_eq!(self.char(), '(');
2032 let open_span = self.span_char();
2033 self.bump();
2034 self.bump_space();
2035 if let Some((ahead, pos)) = self.is_lookaround_prefix() {
2036 let kind = match (pos, ahead) {
2037 (true, true) => LookaroundKind::PositiveLookahead,
2038 (true, false) => LookaroundKind::PositiveLookbehind,
2039 (false, true) => LookaroundKind::NegativeLookahead,
2040 (false, false) => LookaroundKind::NegativeLookbehind,
2041 };
2042 return Ok(Either::Right(ast::Group {
2043 span: open_span,
2044 kind: ast::GroupKind::Lookaround(kind),
2045 ast: Box::new(Ast::empty(self.span())),
2046 }));
2047 }
2048 let inner_span = self.span();
2049 let mut starts_with_p = true;
2050 if self.bump_if("?P<") || {
2051 starts_with_p = false;
2052 self.bump_if("?<")
2053 } {
2054 let capture_index = self.next_capture_index(open_span)?;
2055 let name = self.parse_capture_name(capture_index)?;
2056 Ok(Either::Right(ast::Group {
2057 span: open_span,
2058 kind: ast::GroupKind::CaptureName {
2059 starts_with_p,
2060 name,
2061 },
2062 ast: Box::new(Ast::empty(self.span())),
2063 }))
2064 } else if self.bump_if("?") {
2065 if self.is_eof() {
2066 return Err(self.error(open_span, ast::ErrorKind::GroupUnclosed));
2067 }
2068 let flags = self.parse_flags()?;
2069 let char_end = self.char();
2070 self.bump();
2071 if char_end == ')' {
2072 if flags.items.is_empty() {
2075 return Err(self.error(inner_span, ast::ErrorKind::RepetitionMissing));
2076 }
2077 Ok(Either::Left(ast::SetFlags {
2078 span: Span {
2079 end: self.pos(),
2080 ..open_span
2081 },
2082 flags,
2083 }))
2084 } else {
2085 assert_eq!(char_end, ':');
2086 Ok(Either::Right(ast::Group {
2087 span: open_span,
2088 kind: ast::GroupKind::NonCapturing(flags),
2089 ast: Box::new(Ast::empty(self.span())),
2090 }))
2091 }
2092 } else {
2093 let capture_index = self.next_capture_index(open_span)?;
2094 Ok(Either::Right(ast::Group {
2095 span: open_span,
2096 kind: ast::GroupKind::CaptureIndex(capture_index),
2097 ast: Box::new(Ast::empty(self.span())),
2098 }))
2099 }
2100 }
2101
2102 #[inline(never)]
2103 fn parse_capture_name(&self, capture_index: u32) -> Result<ast::CaptureName> {
2104 if self.is_eof() {
2105 return Err(self.error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
2106 }
2107 let start = self.pos();
2108 loop {
2109 if self.char() == '>' {
2110 break;
2111 }
2112 if !is_capture_char(self.char(), self.pos() == start) {
2113 return Err(self.error(self.span_char(), ast::ErrorKind::GroupNameInvalid));
2114 }
2115 if !self.bump() {
2116 break;
2117 }
2118 }
2119 let end = self.pos();
2120 if self.is_eof() {
2121 return Err(self.error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
2122 }
2123 assert_eq!(self.char(), '>');
2124 self.bump();
2125 let name = &self.pattern()[start.offset..end.offset];
2126 if name.is_empty() {
2127 return Err(self.error(Span::new(start, start), ast::ErrorKind::GroupNameEmpty));
2128 }
2129 let capname = ast::CaptureName {
2130 span: Span::new(start, end),
2131 name: name.to_string(),
2132 index: capture_index,
2133 };
2134 self.add_capture_name(&capname)?;
2135 Ok(capname)
2136 }
2137
2138 #[inline(never)]
2139 fn parse_flags(&self) -> Result<ast::Flags> {
2140 let mut flags = ast::Flags {
2141 span: self.span(),
2142 items: vec![],
2143 };
2144 let mut last_was_negation = None;
2145 while self.char() != ':' && self.char() != ')' {
2146 if self.char() == '-' {
2147 last_was_negation = Some(self.span_char());
2148 let item = ast::FlagsItem {
2149 span: self.span_char(),
2150 kind: ast::FlagsItemKind::Negation,
2151 };
2152 if let Some(i) = flags.add_item(item) {
2153 return Err(self.error(
2154 self.span_char(),
2155 ast::ErrorKind::FlagRepeatedNegation {
2156 original: flags.items[i].span,
2157 },
2158 ));
2159 }
2160 } else {
2161 last_was_negation = None;
2162 let item = ast::FlagsItem {
2163 span: self.span_char(),
2164 kind: ast::FlagsItemKind::Flag(self.parse_flag()?),
2165 };
2166 if let Some(i) = flags.add_item(item) {
2167 return Err(self.error(
2168 self.span_char(),
2169 ast::ErrorKind::FlagDuplicate {
2170 original: flags.items[i].span,
2171 },
2172 ));
2173 }
2174 }
2175 if !self.bump() {
2176 return Err(self.error(self.span(), ast::ErrorKind::FlagUnexpectedEof));
2177 }
2178 }
2179 if let Some(span) = last_was_negation {
2180 return Err(self.error(span, ast::ErrorKind::FlagDanglingNegation));
2181 }
2182 flags.span.end = self.pos();
2183 Ok(flags)
2184 }
2185
2186 #[inline(never)]
2187 fn parse_flag(&self) -> Result<ast::Flag> {
2188 match self.char() {
2189 'i' => Ok(ast::Flag::CaseInsensitive),
2190 'm' => Ok(ast::Flag::MultiLine),
2191 's' => Ok(ast::Flag::DotMatchesNewLine),
2192 'U' => Ok(ast::Flag::SwapGreed),
2193 'u' => Ok(ast::Flag::Unicode),
2194 'R' => Ok(ast::Flag::CRLF),
2195 'x' => Ok(ast::Flag::IgnoreWhitespace),
2196 _ => Err(self.error(self.span_char(), ast::ErrorKind::FlagUnrecognized)),
2197 }
2198 }
2199
2200 fn parse_primitive(&self) -> Result<Primitive> {
2201 match self.char() {
2202 '\\' => self.parse_escape(),
2203 '_' => {
2204 let ast = Primitive::Top(self.span_char());
2205 self.bump();
2206 Ok(ast)
2207 }
2208 '.' => {
2209 let ast = Primitive::Dot(self.span_char());
2210 self.bump();
2211 Ok(ast)
2212 }
2213 '^' => {
2214 let ast = Primitive::Assertion(ast::Assertion {
2215 span: self.span_char(),
2216 kind: ast::AssertionKind::StartLine,
2217 });
2218 self.bump();
2219 Ok(ast)
2220 }
2221 '$' => {
2222 let ast = Primitive::Assertion(ast::Assertion {
2223 span: self.span_char(),
2224 kind: ast::AssertionKind::EndLine,
2225 });
2226 self.bump();
2227 Ok(ast)
2228 }
2229 c => {
2230 let ast = Primitive::Literal(Literal {
2231 span: self.span_char(),
2232 kind: LiteralKind::Verbatim,
2233 c,
2234 });
2235 self.bump();
2236 Ok(ast)
2237 }
2238 }
2239 }
2240
2241 #[inline(never)]
2242 fn parse_escape(&self) -> Result<Primitive> {
2243 assert_eq!(self.char(), '\\');
2244 let start = self.pos();
2245 if !self.bump() {
2246 return Err(self.error(
2247 Span::new(start, self.pos()),
2248 ast::ErrorKind::EscapeUnexpectedEof,
2249 ));
2250 }
2251 let c = self.char();
2252 match c {
2254 '0'..='9' => {
2255 if !self.parser().octal {
2256 return Err(self.error(
2257 Span::new(start, self.span_char().end),
2258 ast::ErrorKind::UnsupportedBackreference,
2259 ));
2260 }
2261 let mut lit = self.parse_octal();
2262 lit.span.start = start;
2263 return Ok(Primitive::Literal(lit));
2264 }
2265 'x' | 'u' | 'U' => {
2272 let mut lit = self.parse_hex()?;
2273 lit.span.start = start;
2274 return Ok(Primitive::Literal(lit));
2275 }
2276 'p' | 'P' => {
2277 let mut cls = self.parse_unicode_class()?;
2278 cls.span.start = start;
2279 return Ok(Primitive::Unicode(cls));
2280 }
2281 'd' | 's' | 'w' | 'D' | 'S' | 'W' => {
2282 let mut cls = self.parse_perl_class();
2283 cls.span.start = start;
2284 return Ok(Primitive::Perl(cls));
2285 }
2286 _ => {}
2287 }
2288
2289 self.bump();
2291 let span = Span::new(start, self.pos());
2292 if is_meta_character(c) {
2293 return Ok(Primitive::Literal(Literal {
2294 span,
2295 kind: LiteralKind::Meta,
2296 c,
2297 }));
2298 }
2299 if is_escapeable_character(c) {
2300 return Ok(Primitive::Literal(Literal {
2301 span,
2302 kind: LiteralKind::Superfluous,
2303 c,
2304 }));
2305 }
2306 let special = |kind, c| {
2307 Ok(Primitive::Literal(Literal {
2308 span,
2309 kind: LiteralKind::Special(kind),
2310 c,
2311 }))
2312 };
2313 match c {
2314 'a' => special(SpecialLiteralKind::Bell, '\x07'),
2315 'f' => special(SpecialLiteralKind::FormFeed, '\x0C'),
2316 't' => special(SpecialLiteralKind::Tab, '\t'),
2317 'n' => special(SpecialLiteralKind::LineFeed, '\n'),
2318 'r' => special(SpecialLiteralKind::CarriageReturn, '\r'),
2319 'v' => special(SpecialLiteralKind::VerticalTab, '\x0B'),
2320 'A' => Ok(Primitive::Assertion(ast::Assertion {
2321 span,
2322 kind: ast::AssertionKind::StartText,
2323 })),
2324 'z' => Ok(Primitive::Assertion(ast::Assertion {
2325 span,
2326 kind: ast::AssertionKind::EndText,
2327 })),
2328 'b' => {
2329 let mut wb = ast::Assertion {
2330 span,
2331 kind: ast::AssertionKind::WordBoundary,
2332 };
2333 if !self.is_eof() && self.char() == '{' {
2336 if let Some(kind) = self.maybe_parse_special_word_boundary(start)? {
2337 wb.kind = kind;
2338 wb.span.end = self.pos();
2339 }
2340 }
2341 Ok(Primitive::Assertion(wb))
2342 }
2343 'B' => Ok(Primitive::Assertion(ast::Assertion {
2344 span,
2345 kind: ast::AssertionKind::NotWordBoundary,
2346 })),
2347 '<' => Ok(Primitive::Assertion(ast::Assertion {
2348 span,
2349 kind: ast::AssertionKind::WordBoundaryStartAngle,
2350 })),
2351 '>' => Ok(Primitive::Assertion(ast::Assertion {
2352 span,
2353 kind: ast::AssertionKind::WordBoundaryEndAngle,
2354 })),
2355 _ => Err(self.error(span, ast::ErrorKind::EscapeUnrecognized)),
2356 }
2357 }
2358
2359 fn maybe_parse_special_word_boundary(
2360 &self,
2361 wb_start: Position,
2362 ) -> Result<Option<ast::AssertionKind>> {
2363 assert_eq!(self.char(), '{');
2364
2365 let is_valid_char = |c| matches!(c, 'A'..='Z' | 'a'..='z' | '-');
2366 let start = self.pos();
2367 if !self.bump_and_bump_space() {
2368 return Err(self.error(
2369 Span::new(wb_start, self.pos()),
2370 ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof,
2371 ));
2372 }
2373 let start_contents = self.pos();
2374 if !is_valid_char(self.char()) {
2375 self.parser().pos.set(start);
2376 return Ok(None);
2377 }
2378
2379 let mut scratch = self.parser().scratch.borrow_mut();
2381 scratch.clear();
2382 while !self.is_eof() && is_valid_char(self.char()) {
2383 scratch.push(self.char());
2384 self.bump_and_bump_space();
2385 }
2386 if self.is_eof() || self.char() != '}' {
2387 return Err(self.error(
2388 Span::new(start, self.pos()),
2389 ast::ErrorKind::SpecialWordBoundaryUnclosed,
2390 ));
2391 }
2392 let end = self.pos();
2393 self.bump();
2394 let kind = match scratch.as_str() {
2395 "start" => ast::AssertionKind::WordBoundaryStart,
2396 "end" => ast::AssertionKind::WordBoundaryEnd,
2397 "start-half" => ast::AssertionKind::WordBoundaryStartHalf,
2398 "end-half" => ast::AssertionKind::WordBoundaryEndHalf,
2399 _ => {
2400 return Err(self.error(
2401 Span::new(start_contents, end),
2402 ast::ErrorKind::SpecialWordBoundaryUnrecognized,
2403 ))
2404 }
2405 };
2406 Ok(Some(kind))
2407 }
2408
2409 #[inline(never)]
2410 fn parse_octal(&self) -> Literal {
2411 assert!(self.parser().octal);
2412 assert!('0' <= self.char() && self.char() <= '7');
2413 let start = self.pos();
2414 while self.bump()
2416 && '0' <= self.char()
2417 && self.char() <= '7'
2418 && self.pos().offset - start.offset <= 2
2419 {}
2420 let end = self.pos();
2421 let octal = &self.pattern()[start.offset..end.offset];
2422 let codepoint = u32::from_str_radix(octal, 8).expect("valid octal number");
2425 let c = char::from_u32(codepoint).expect("Unicode scalar value");
2428 Literal {
2429 span: Span::new(start, end),
2430 kind: LiteralKind::Octal,
2431 c,
2432 }
2433 }
2434
2435 #[inline(never)]
2436 fn parse_hex(&self) -> Result<Literal> {
2437 assert!(self.char() == 'x' || self.char() == 'u' || self.char() == 'U');
2438
2439 let hex_kind = match self.char() {
2440 'x' => HexLiteralKind::X,
2441 'u' => HexLiteralKind::UnicodeShort,
2442 _ => HexLiteralKind::UnicodeLong,
2443 };
2444 if !self.bump_and_bump_space() {
2445 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2446 }
2447 if self.char() == '{' {
2448 self.parse_hex_brace(hex_kind)
2449 } else {
2450 self.parse_hex_digits(hex_kind)
2451 }
2452 }
2453
2454 #[inline(never)]
2455 fn parse_hex_digits(&self, kind: HexLiteralKind) -> Result<Literal> {
2456 let mut scratch = self.parser().scratch.borrow_mut();
2457 scratch.clear();
2458
2459 let start = self.pos();
2460 for i in 0..kind.digits() {
2461 if i > 0 && !self.bump_and_bump_space() {
2462 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2463 }
2464 if !is_hex(self.char()) {
2465 return Err(self.error(self.span_char(), ast::ErrorKind::EscapeHexInvalidDigit));
2466 }
2467 scratch.push(self.char());
2468 }
2469 self.bump_and_bump_space();
2470 let end = self.pos();
2471 let hex = scratch.as_str();
2472 match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
2473 None => Err(self.error(Span::new(start, end), ast::ErrorKind::EscapeHexInvalid)),
2474 Some(c) => Ok(Literal {
2475 span: Span::new(start, end),
2476 kind: LiteralKind::HexFixed(kind),
2477 c,
2478 }),
2479 }
2480 }
2481
2482 #[inline(never)]
2483 fn parse_hex_brace(&self, kind: HexLiteralKind) -> Result<Literal> {
2484 let mut scratch = self.parser().scratch.borrow_mut();
2485 scratch.clear();
2486
2487 let brace_pos = self.pos();
2488 let start = self.span_char().end;
2489 while self.bump_and_bump_space() && self.char() != '}' {
2490 if !is_hex(self.char()) {
2491 return Err(self.error(self.span_char(), ast::ErrorKind::EscapeHexInvalidDigit));
2492 }
2493 scratch.push(self.char());
2494 }
2495 if self.is_eof() {
2496 return Err(self.error(
2497 Span::new(brace_pos, self.pos()),
2498 ast::ErrorKind::EscapeUnexpectedEof,
2499 ));
2500 }
2501 let end = self.pos();
2502 let hex = scratch.as_str();
2503 assert_eq!(self.char(), '}');
2504 self.bump_and_bump_space();
2505
2506 if hex.is_empty() {
2507 return Err(self.error(
2508 Span::new(brace_pos, self.pos()),
2509 ast::ErrorKind::EscapeHexEmpty,
2510 ));
2511 }
2512 match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
2513 None => Err(self.error(Span::new(start, end), ast::ErrorKind::EscapeHexInvalid)),
2514 Some(c) => Ok(Literal {
2515 span: Span::new(start, self.pos()),
2516 kind: LiteralKind::HexBrace(kind),
2517 c,
2518 }),
2519 }
2520 }
2521
2522 fn parse_decimal(&self) -> Result<u32> {
2523 let mut scratch = self.parser().scratch.borrow_mut();
2524 scratch.clear();
2525
2526 while !self.is_eof() && self.char().is_whitespace() {
2527 self.bump();
2528 }
2529 let start = self.pos();
2530 while !self.is_eof() && '0' <= self.char() && self.char() <= '9' {
2531 scratch.push(self.char());
2532 self.bump_and_bump_space();
2533 }
2534 let span = Span::new(start, self.pos());
2535 while !self.is_eof() && self.char().is_whitespace() {
2536 self.bump_and_bump_space();
2537 }
2538 let digits = scratch.as_str();
2539 if digits.is_empty() {
2540 return Err(self.error(span, ast::ErrorKind::DecimalEmpty));
2541 }
2542 match digits.parse::<u32>().ok() {
2543 Some(n) => Ok(n),
2544 None => Err(self.error(span, ast::ErrorKind::DecimalInvalid)),
2545 }
2546 }
2547
2548 #[inline(never)]
2549 fn parse_set_class(&self) -> Result<ClassBracketed> {
2550 assert_eq!(self.char(), '[');
2551
2552 let mut union = ClassSetUnion {
2553 span: self.span(),
2554 items: vec![],
2555 };
2556 loop {
2557 self.bump_space();
2558 if self.is_eof() {
2559 return Err(self.unclosed_class_error());
2560 }
2561 match self.char() {
2562 '[' => {
2563 if !self.parser().stack_class.borrow().is_empty() {
2564 if let Some(cls) = self.maybe_parse_ascii_class() {
2565 union.push(ClassSetItem::Ascii(cls));
2566 continue;
2567 }
2568 }
2569 union = self.push_class_open(union)?;
2570 }
2571 ']' => match self.pop_class(union)? {
2572 Either::Left(nested_union) => {
2573 union = nested_union;
2574 }
2575 Either::Right(class) => return Ok(class),
2576 },
2577 '&' if self.peek() == Some('&') => {
2578 assert!(self.bump_if("&&"));
2579 union = self.push_class_op(ClassSetBinaryOpKind::Intersection, union);
2580 }
2581 '-' if self.peek() == Some('-') => {
2582 assert!(self.bump_if("--"));
2583 union = self.push_class_op(ClassSetBinaryOpKind::Difference, union);
2584 }
2585 '~' if self.peek() == Some('~') => {
2586 assert!(self.bump_if("~~"));
2587 union = self.push_class_op(ClassSetBinaryOpKind::SymmetricDifference, union);
2588 }
2589 _ => {
2590 union.push(self.parse_set_class_range()?);
2591 }
2592 }
2593 }
2594 }
2595
2596 #[inline(never)]
2597 fn parse_set_class_range(&self) -> Result<ClassSetItem> {
2598 let prim1 = self.parse_set_class_item()?;
2599 self.bump_space();
2600 if self.is_eof() {
2601 return Err(self.unclosed_class_error());
2602 }
2603 if self.char() != '-' || self.peek_space() == Some(']') || self.peek_space() == Some('-') {
2604 return prim1.into_class_set_item(self);
2605 }
2606 if !self.bump_and_bump_space() {
2607 return Err(self.unclosed_class_error());
2608 }
2609 let prim2 = self.parse_set_class_item()?;
2610 let range = ClassSetRange {
2611 span: Span::new(prim1.span().start, prim2.span().end),
2612 start: prim1.into_class_literal(self)?,
2613 end: prim2.into_class_literal(self)?,
2614 };
2615 if !range.is_valid() {
2616 return Err(self.error(range.span, ast::ErrorKind::ClassRangeInvalid));
2617 }
2618 Ok(ClassSetItem::Range(range))
2619 }
2620
2621 #[inline(never)]
2622 fn parse_set_class_item(&self) -> Result<Primitive> {
2623 if self.char() == '\\' {
2624 self.parse_escape()
2625 } else {
2626 let x = Primitive::Literal(Literal {
2627 span: self.span_char(),
2628 kind: LiteralKind::Verbatim,
2629 c: self.char(),
2630 });
2631 self.bump();
2632 Ok(x)
2633 }
2634 }
2635
2636 #[inline(never)]
2637 fn parse_set_class_open(&self) -> Result<(ClassBracketed, ClassSetUnion)> {
2638 assert_eq!(self.char(), '[');
2639 let start = self.pos();
2640 if !self.bump_and_bump_space() {
2641 return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2642 }
2643
2644 let negated = if self.char() != '^' {
2645 false
2646 } else {
2647 if !self.bump_and_bump_space() {
2648 return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2649 }
2650 true
2651 };
2652 let mut union = ClassSetUnion {
2654 span: self.span(),
2655 items: vec![],
2656 };
2657 while self.char() == '-' {
2658 union.push(ClassSetItem::Literal(Literal {
2659 span: self.span_char(),
2660 kind: LiteralKind::Verbatim,
2661 c: '-',
2662 }));
2663 if !self.bump_and_bump_space() {
2664 return Err(self.error(Span::new(start, start), ast::ErrorKind::ClassUnclosed));
2665 }
2666 }
2667 if union.items.is_empty() && self.char() == ']' {
2670 union.push(ClassSetItem::Literal(Literal {
2671 span: self.span_char(),
2672 kind: LiteralKind::Verbatim,
2673 c: ']',
2674 }));
2675 if !self.bump_and_bump_space() {
2676 return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2677 }
2678 }
2679 let set = ClassBracketed {
2680 span: Span::new(start, self.pos()),
2681 negated,
2682 kind: ClassSet::union(ClassSetUnion {
2683 span: Span::new(union.span.start, union.span.start),
2684 items: vec![],
2685 }),
2686 };
2687 Ok((set, union))
2688 }
2689
2690 #[inline(never)]
2691 fn maybe_parse_ascii_class(&self) -> Option<ClassAscii> {
2692 assert_eq!(self.char(), '[');
2693 let start = self.pos();
2695 let mut negated = false;
2696 if !self.bump() || self.char() != ':' {
2697 self.parser().pos.set(start);
2698 return None;
2699 }
2700 if !self.bump() {
2701 self.parser().pos.set(start);
2702 return None;
2703 }
2704 if self.char() == '^' {
2705 negated = true;
2706 if !self.bump() {
2707 self.parser().pos.set(start);
2708 return None;
2709 }
2710 }
2711 let name_start = self.offset();
2712 while self.char() != ':' && self.bump() {}
2713 if self.is_eof() {
2714 self.parser().pos.set(start);
2715 return None;
2716 }
2717 let name = &self.pattern()[name_start..self.offset()];
2718 if !self.bump_if(":]") {
2719 self.parser().pos.set(start);
2720 return None;
2721 }
2722 let kind = match regex_syntax::ast::ClassAsciiKind::from_name(name) {
2723 Some(kind) => kind,
2724 None => {
2725 self.parser().pos.set(start);
2726 return None;
2727 }
2728 };
2729 Some(ClassAscii {
2730 span: Span::new(start, self.pos()),
2731 kind,
2732 negated,
2733 })
2734 }
2735
2736 #[inline(never)]
2737 fn parse_unicode_class(&self) -> Result<ClassUnicode> {
2738 assert!(self.char() == 'p' || self.char() == 'P');
2739
2740 let mut scratch = self.parser().scratch.borrow_mut();
2741 scratch.clear();
2742
2743 let negated = self.char() == 'P';
2744 if !self.bump_and_bump_space() {
2745 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2746 }
2747 let (start, kind) = if self.char() == '{' {
2748 let start = self.span_char().end;
2749 while self.bump_and_bump_space() && self.char() != '}' {
2750 scratch.push(self.char());
2751 }
2752 if self.is_eof() {
2753 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2754 }
2755 assert_eq!(self.char(), '}');
2756 self.bump();
2757
2758 let name = scratch.as_str();
2759 if let Some(i) = name.find("!=") {
2760 (
2761 start,
2762 ClassUnicodeKind::NamedValue {
2763 op: ClassUnicodeOpKind::NotEqual,
2764 name: name[..i].to_string(),
2765 value: name[i + 2..].to_string(),
2766 },
2767 )
2768 } else if let Some(i) = name.find(':') {
2769 (
2770 start,
2771 ClassUnicodeKind::NamedValue {
2772 op: ClassUnicodeOpKind::Colon,
2773 name: name[..i].to_string(),
2774 value: name[i + 1..].to_string(),
2775 },
2776 )
2777 } else if let Some(i) = name.find('=') {
2778 (
2779 start,
2780 ClassUnicodeKind::NamedValue {
2781 op: ClassUnicodeOpKind::Equal,
2782 name: name[..i].to_string(),
2783 value: name[i + 1..].to_string(),
2784 },
2785 )
2786 } else {
2787 (start, ClassUnicodeKind::Named(name.to_string()))
2788 }
2789 } else {
2790 let start = self.pos();
2791 let c = self.char();
2792 if c == '\\' {
2793 return Err(self.error(self.span_char(), ast::ErrorKind::UnicodeClassInvalid));
2794 }
2795 self.bump_and_bump_space();
2796 let kind = ClassUnicodeKind::OneLetter(c);
2797 (start, kind)
2798 };
2799 Ok(ClassUnicode {
2800 span: Span::new(start, self.pos()),
2801 negated,
2802 kind,
2803 })
2804 }
2805
2806 #[inline(never)]
2807 fn parse_perl_class(&self) -> ClassPerl {
2808 let c = self.char();
2809 let span = self.span_char();
2810 self.bump();
2811 let (negated, kind) = match c {
2812 'd' => (false, regex_syntax::ast::ClassPerlKind::Digit),
2813 'D' => (true, regex_syntax::ast::ClassPerlKind::Digit),
2814 's' => (false, regex_syntax::ast::ClassPerlKind::Space),
2815 'S' => (true, regex_syntax::ast::ClassPerlKind::Space),
2816 'w' => (false, regex_syntax::ast::ClassPerlKind::Word),
2817 'W' => (true, regex_syntax::ast::ClassPerlKind::Word),
2818 c => panic!("expected valid Perl class but got '{}'", c),
2819 };
2820 ClassPerl {
2821 span,
2822 kind,
2823 negated,
2824 }
2825 }
2826}
2827
2828fn is_universal_perl_pair(item: ®ex_syntax::ast::ClassSetItem) -> bool {
2831 use regex_syntax::ast::ClassSetItem;
2832 let items = match item {
2833 ClassSetItem::Union(u) => &u.items,
2834 _ => return false,
2835 };
2836 if items.len() != 2 {
2837 return false;
2838 }
2839 match (&items[0], &items[1]) {
2840 (ClassSetItem::Perl(a), ClassSetItem::Perl(b)) => {
2841 let is_all = a.kind == b.kind && a.negated != b.negated;
2842 is_all
2843 }
2844 _ => false,
2845 }
2846}
2847
2848pub fn max_concat_length(ast: &ast::Ast) -> usize {
2849 match ast {
2850 ast::Ast::Empty(_)
2851 | ast::Ast::Flags(_)
2852 | ast::Ast::Literal(_)
2853 | ast::Ast::Dot(_)
2854 | ast::Ast::Top(_)
2855 | ast::Ast::Assertion(_)
2856 | ast::Ast::ClassUnicode(_)
2857 | ast::Ast::ClassPerl(_)
2858 | ast::Ast::ClassBracketed(_) => 0,
2859 ast::Ast::Group(g) => max_concat_length(&g.ast),
2860 ast::Ast::Complement(c) => max_concat_length(&c.ast),
2861 ast::Ast::Lookaround(l) => max_concat_length(&l.ast),
2862 ast::Ast::Repetition(r) => max_concat_length(&r.ast),
2863 ast::Ast::Concat(c) => c
2864 .asts
2865 .len()
2866 .max(c.asts.iter().map(max_concat_length).max().unwrap_or(0)),
2867 ast::Ast::Alternation(a) => a.asts.iter().map(max_concat_length).max().unwrap_or(0),
2868 ast::Ast::Intersection(i) => i.asts.iter().map(max_concat_length).max().unwrap_or(0),
2869 }
2870}
2871
2872pub fn expanded_ast_size(ast: &ast::Ast, limit: u64) -> u64 {
2873 fn go(ast: &ast::Ast, limit: u64) -> u64 {
2874 match ast {
2875 ast::Ast::Empty(_) | ast::Ast::Flags(_) => 1,
2876 ast::Ast::Literal(_) | ast::Ast::Dot(_) | ast::Ast::Top(_) => 1,
2877 ast::Ast::Assertion(_) => 1,
2878 ast::Ast::ClassUnicode(_) | ast::Ast::ClassPerl(_) | ast::Ast::ClassBracketed(_) => 1,
2879 ast::Ast::Group(g) => go(&g.ast, limit).saturating_add(1).min(limit),
2880 ast::Ast::Complement(c) => go(&c.ast, limit).saturating_add(1).min(limit),
2881 ast::Ast::Lookaround(l) => go(&l.ast, limit).saturating_add(1).min(limit),
2882 ast::Ast::Concat(c) => sum_children(&c.asts, limit),
2883 ast::Ast::Alternation(a) => sum_children(&a.asts, limit),
2884 ast::Ast::Intersection(i) => sum_children(&i.asts, limit),
2885 ast::Ast::Repetition(r) => {
2886 let body = go(&r.ast, limit);
2887 let factor: u64 = match &r.op.kind {
2888 ast::RepetitionKind::ZeroOrOne => 2,
2889 ast::RepetitionKind::ZeroOrMore | ast::RepetitionKind::OneOrMore => 2,
2890 ast::RepetitionKind::Range(ast::RepetitionRange::Exactly(n)) => {
2891 (*n as u64).max(1)
2892 }
2893 ast::RepetitionKind::Range(ast::RepetitionRange::AtLeast(n)) => {
2894 (*n as u64).max(1).saturating_add(1)
2895 }
2896 ast::RepetitionKind::Range(ast::RepetitionRange::Bounded(_, m)) => {
2897 (*m as u64).max(1)
2898 }
2899 };
2900 body.saturating_mul(factor).min(limit)
2901 }
2902 }
2903 }
2904 fn sum_children(children: &[ast::Ast], limit: u64) -> u64 {
2905 let mut total: u64 = 0;
2906 for c in children {
2907 total = total.saturating_add(go(c, limit));
2908 if total >= limit {
2909 return limit;
2910 }
2911 }
2912 total
2913 }
2914 go(ast, limit)
2915}
2916
2917pub fn parse_ast<'s>(tb: &mut TB<'s>, pattern: &'s str) -> std::result::Result<NodeId, ParseError> {
2918 let mut p: ResharpParser<'s> = ResharpParser::new(pattern);
2919 p.parse(tb)
2920}
2921
2922pub fn parse_ast_with<'s>(
2923 tb: &mut TB<'s>,
2924 pattern: &'s str,
2925 flags: &PatternFlags,
2926) -> std::result::Result<NodeId, ParseError> {
2927 let mut p: ResharpParser<'s> = ResharpParser::with_flags(pattern, flags);
2928 p.parse(tb)
2929}
2930
2931pub fn parse_to_ast(pattern: &str) -> std::result::Result<ast::Ast, ParseError> {
2933 let mut p: ResharpParser = ResharpParser::new(pattern);
2934 p.parse_inner()
2935}