1#![warn(dead_code)]
6pub mod ast;
7use std::cell::{Cell, RefCell};
8
9use ast::{Ast, Concat, ErrorKind, GroupKind, LookaroundKind};
10use regex_syntax::{
11 ast::{
12 ClassAscii, ClassBracketed, ClassPerl, ClassSet, ClassSetBinaryOpKind, ClassSetItem,
13 ClassSetRange, ClassSetUnion, ClassUnicode, ClassUnicodeKind, ClassUnicodeOpKind,
14 HexLiteralKind, Literal, LiteralKind, Position, Span, SpecialLiteralKind,
15 },
16 hir::{
17 self,
18 translate::{Translator, TranslatorBuilder},
19 },
20 utf8::Utf8Sequences,
21};
22use resharp_algebra::{Kind, NodeId};
23
24type TB<'s> = resharp_algebra::RegexBuilder;
25
26pub struct PatternFlags {
28 pub unicode: bool,
30 pub full_unicode: bool,
32 pub case_insensitive: bool,
34 pub dot_matches_new_line: bool,
36 pub multiline: bool,
38 pub ignore_whitespace: bool,
40 pub ascii_perl_classes: bool,
43 pub expanded_ast_limit: u64,
46 pub max_list_len: usize,
49 pub max_repeat: u32,
51}
52
53pub const DEFAULT_MAX_REPEAT: u32 = 500;
56pub const DEFAULT_EXPANDED_AST_LIMIT: u64 = 50_000;
57pub const DEFAULT_MAX_LIST_LEN: usize = 4_000;
58
59impl Default for PatternFlags {
60 fn default() -> Self {
61 Self {
62 unicode: true,
63 full_unicode: false,
64 case_insensitive: false,
65 dot_matches_new_line: false,
66 multiline: true,
67 ignore_whitespace: false,
68 ascii_perl_classes: false,
69 expanded_ast_limit: DEFAULT_EXPANDED_AST_LIMIT,
70 max_list_len: DEFAULT_MAX_LIST_LEN,
71 max_repeat: DEFAULT_MAX_REPEAT,
72 }
73 }
74}
75
76#[derive(Clone, Copy, PartialEq, Debug)]
77enum WordCharKind {
78 Word,
79 NonWord,
80 MaybeWord,
81 MaybeNonWord,
82 Unknown,
83 Edge,
84}
85
86fn is_word_byte(b: u8) -> bool {
87 b.is_ascii_alphanumeric() || b == b'_'
88}
89
90fn class_set_item_word_kind(item: ®ex_syntax::ast::ClassSetItem) -> WordCharKind {
91 use regex_syntax::ast::{ClassSetItem, ClassPerlKind};
92 use WordCharKind::*;
93 match item {
94 ClassSetItem::Empty(_) => Unknown,
95 ClassSetItem::Literal(l) => {
96 if is_word_byte(l.c as u8) { Word } else { NonWord }
97 }
98 ClassSetItem::Range(r) => {
99 let all_word = (r.start.c as u8..=r.end.c as u8).all(is_word_byte);
100 let all_non = (r.start.c as u8..=r.end.c as u8).all(|b| !is_word_byte(b));
101 if all_word { Word } else if all_non { NonWord } else { Unknown }
102 }
103 ClassSetItem::Perl(p) => match (&p.kind, p.negated) {
104 (ClassPerlKind::Word, false) => Word,
105 (ClassPerlKind::Word, true) => NonWord,
106 (ClassPerlKind::Space, false) => NonWord,
107 _ => Unknown,
108 },
109 ClassSetItem::Bracketed(b) => class_bracketed_word_kind(b),
110 ClassSetItem::Union(u) => {
111 let mut kind = Unknown;
112 for item in &u.items {
113 let k = class_set_item_word_kind(item);
114 kind = match (kind, k) {
115 (Unknown, _) => k,
116 (Word, Word) => Word,
117 (NonWord, NonWord) => NonWord,
118 _ => return Unknown,
119 };
120 }
121 kind
122 }
123 _ => Unknown,
124 }
125}
126
127fn class_bracketed_word_kind(c: ®ex_syntax::ast::ClassBracketed) -> WordCharKind {
128 use WordCharKind::*;
129 let inner = match &c.kind {
130 regex_syntax::ast::ClassSet::Item(item) => class_set_item_word_kind(item),
131 regex_syntax::ast::ClassSet::BinaryOp(_) => Unknown,
132 };
133 if c.negated {
134 match inner {
135 Word => NonWord,
136 NonWord => Word,
137 _ => Unknown,
138 }
139 } else {
140 inner
141 }
142}
143
144#[derive(Clone, Debug, Eq, PartialEq)]
145enum Primitive {
146 Literal(Literal),
147 Assertion(ast::Assertion),
148 Dot(Span),
149 Top(Span),
150 Perl(ClassPerl),
151 Unicode(ClassUnicode),
152}
153
154impl Primitive {
155 fn span(&self) -> &Span {
156 match *self {
157 Primitive::Literal(ref x) => &x.span,
158 Primitive::Assertion(ref x) => &x.span,
159 Primitive::Dot(ref span) => span,
160 Primitive::Top(ref span) => span,
161 Primitive::Perl(ref x) => &x.span,
162 Primitive::Unicode(ref x) => &x.span,
163 }
164 }
165
166 fn into_ast(self) -> Ast {
167 match self {
168 Primitive::Literal(lit) => Ast::literal(lit),
169 Primitive::Assertion(assert) => Ast::assertion(assert),
170 Primitive::Dot(span) => Ast::dot(span),
171 Primitive::Top(span) => Ast::top(span),
172 Primitive::Perl(cls) => Ast::class_perl(cls),
173 Primitive::Unicode(cls) => Ast::class_unicode(cls),
174 }
175 }
176
177 fn into_class_set_item(self, p: &ResharpParser) -> Result<regex_syntax::ast::ClassSetItem> {
178 use self::Primitive::*;
179 use regex_syntax::ast::ClassSetItem;
180
181 match self {
182 Literal(lit) => Ok(ClassSetItem::Literal(lit)),
183 Perl(cls) => Ok(ClassSetItem::Perl(cls)),
184 Unicode(cls) => Ok(ClassSetItem::Unicode(cls)),
185 x => Err(p.error(*x.span(), ast::ErrorKind::ClassEscapeInvalid)),
186 }
187 }
188
189 fn into_class_literal(self, p: &ResharpParser) -> Result<Literal> {
190 use self::Primitive::*;
191
192 match self {
193 Literal(lit) => Ok(lit),
194 x => Err(p.error(*x.span(), ast::ErrorKind::ClassRangeLiteral)),
195 }
196 }
197}
198
199#[derive(Clone, Debug, Eq, PartialEq)]
200pub enum Either<Left, Right> {
201 Left(Left),
202 Right(Right),
203}
204
205#[derive(Clone, Debug, Eq, PartialEq)]
206pub struct ParseError {
207 pub kind: ErrorKind,
209 pattern: String,
212 pub span: Span,
214}
215
216impl std::fmt::Display for ParseError {
217 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
218 write!(f, "{:?}: {:?}", self.kind, self.span)
219 }
220}
221impl std::error::Error for ParseError {}
222
223type Result<T> = core::result::Result<T, ParseError>;
224
225#[derive(Clone, Debug)]
226enum GroupState {
227 Group {
229 concat: Concat,
231 group: ast::Group,
233 ignore_whitespace: bool,
235 },
236 Alternation(ast::Alternation),
237 Intersection(ast::Intersection),
238}
239
240#[derive(Clone, Debug)]
241enum ClassState {
242 Open {
244 union: regex_syntax::ast::ClassSetUnion,
246 set: regex_syntax::ast::ClassBracketed,
247 },
248 Op {
251 kind: regex_syntax::ast::ClassSetBinaryOpKind,
253 lhs: regex_syntax::ast::ClassSet,
255 },
256}
257
258pub struct ResharpParser<'s> {
260 perl_classes: Vec<(bool, regex_syntax::ast::ClassPerlKind, NodeId)>,
261 unicode_classes: resharp_algebra::UnicodeClassCache,
262 pub translator: regex_syntax::hir::translate::Translator,
263 pub pattern: &'s str,
264 pos: Cell<Position>,
265 capture_index: Cell<u32>,
266 octal: bool,
267 empty_min_range: bool,
268 ignore_whitespace: Cell<bool>,
269 dot_all: Cell<bool>,
270 multiline: Cell<bool>,
271 global_unicode: bool,
272 global_full_unicode: bool,
273 global_ascii_perl: bool,
274 global_case_insensitive: bool,
275 expanded_ast_limit: u64,
276 max_list_len: usize,
277 max_repeat: u32,
278 comments: RefCell<Vec<ast::Comment>>,
279 stack_group: RefCell<Vec<GroupState>>,
280 stack_class: RefCell<Vec<ClassState>>,
281 capture_names: RefCell<Vec<ast::CaptureName>>,
282 scratch: RefCell<String>,
283}
284
285fn specialize_err<T>(result: Result<T>, from: ast::ErrorKind, to: ast::ErrorKind) -> Result<T> {
286 result.map_err(|e| {
287 if e.kind == from {
288 ParseError {
289 kind: to,
290 pattern: e.pattern,
291 span: e.span,
292 }
293 } else {
294 e
295 }
296 })
297}
298
299fn is_capture_char(c: char, first: bool) -> bool {
300 if first {
301 c == '_' || c.is_alphabetic()
302 } else {
303 c == '_' || c == '.' || c == '[' || c == ']' || c.is_alphanumeric()
304 }
305}
306
307pub fn is_meta_character(c: char) -> bool {
308 matches!(
309 c,
310 '\\' | '.'
311 | '+'
312 | '*'
313 | '?'
314 | '('
315 | ')'
316 | '|'
317 | '['
318 | ']'
319 | '{'
320 | '}'
321 | '^'
322 | '$'
323 | '#'
324 | '&'
325 | '-'
326 | '~'
327 | '_'
328 )
329}
330
331pub fn escape(text: &str) -> String {
333 let mut buf = String::new();
334 escape_into(text, &mut buf);
335 buf
336}
337
338pub fn escape_into(text: &str, buf: &mut String) {
340 buf.reserve(text.len());
341 for c in text.chars() {
342 if is_meta_character(c) {
343 buf.push('\\');
344 }
345 buf.push(c);
346 }
347}
348
349pub fn is_escapeable_character(c: char) -> bool {
350 if is_meta_character(c) {
351 return true;
352 }
353 if !c.is_ascii() {
354 return false;
355 }
356 match c {
357 '0'..='9' | 'A'..='Z' | 'a'..='z' => false,
358 '<' | '>' => false,
359 _ => true,
360 }
361}
362
363fn is_hex(c: char) -> bool {
364 c.is_ascii_digit() || ('a'..='f').contains(&c) || ('A'..='F').contains(&c)
365}
366
367impl<'s> ResharpParser<'s> {
368 fn default_translator_builder(&self) -> TranslatorBuilder {
369 let mut trb = TranslatorBuilder::new();
370 trb.unicode(self.global_unicode);
371 trb.utf8(false);
372 trb.case_insensitive(self.global_case_insensitive);
373 trb
374 }
375
376 pub fn new(pattern: &'s str) -> Self {
377 Self::with_flags(pattern, &PatternFlags::default())
378 }
379
380 pub fn with_flags(pattern: &'s str, flags: &PatternFlags) -> Self {
381 let mut trb = TranslatorBuilder::new();
382 trb.unicode(flags.unicode);
383 trb.utf8(false);
384 trb.case_insensitive(flags.case_insensitive);
385 Self {
386 translator: trb.build(),
387 pattern,
388 perl_classes: vec![],
389 unicode_classes: resharp_algebra::UnicodeClassCache::default(),
390 pos: Cell::new(Position::new(0, 0, 0)),
391 capture_index: Cell::new(0),
392 octal: false,
393 empty_min_range: false,
394 ignore_whitespace: Cell::new(flags.ignore_whitespace),
395 dot_all: Cell::new(flags.dot_matches_new_line),
396 multiline: Cell::new(flags.multiline),
397 global_unicode: flags.unicode || flags.full_unicode || flags.ascii_perl_classes,
398 global_full_unicode: flags.full_unicode,
399 global_ascii_perl: flags.ascii_perl_classes,
400 global_case_insensitive: flags.case_insensitive,
401 expanded_ast_limit: flags.expanded_ast_limit,
402 max_list_len: flags.max_list_len,
403 max_repeat: flags.max_repeat,
404 comments: RefCell::new(vec![]),
405 stack_group: RefCell::new(vec![]),
406 stack_class: RefCell::new(vec![]),
407 capture_names: RefCell::new(vec![]),
408 scratch: RefCell::new(String::new()),
409 }
410 }
411
412 fn parser(&'_ self) -> &'_ ResharpParser<'_> {
413 self
414 }
415
416 fn pattern(&self) -> &str {
417 self.pattern
418 }
419
420 fn error(&self, span: Span, kind: ast::ErrorKind) -> ParseError {
421 ParseError {
422 kind,
423 pattern: self.pattern().to_string(),
424 span,
425 }
426 }
427
428 fn unsupported_error(&self, _: regex_syntax::hir::Error) -> ParseError {
429 self.error(
430 Span::splat(self.pos()),
431 ast::ErrorKind::UnsupportedResharpRegex,
432 )
433 }
434
435 fn offset(&self) -> usize {
436 self.parser().pos.get().offset
437 }
438
439 fn line(&self) -> usize {
440 self.parser().pos.get().line
441 }
442
443 fn column(&self) -> usize {
444 self.parser().pos.get().column
445 }
446
447 fn next_capture_index(&self, span: Span) -> Result<u32> {
448 let current = self.parser().capture_index.get();
449 let i = current
450 .checked_add(1)
451 .ok_or_else(|| self.error(span, ast::ErrorKind::CaptureLimitExceeded))?;
452 self.parser().capture_index.set(i);
453 Ok(i)
454 }
455
456 fn add_capture_name(&self, cap: &ast::CaptureName) -> Result<()> {
457 let mut names = self.parser().capture_names.borrow_mut();
458 match names.binary_search_by_key(&cap.name.as_str(), |c| c.name.as_str()) {
459 Err(i) => {
460 names.insert(i, cap.clone());
461 Ok(())
462 }
463 Ok(i) => Err(self.error(
464 cap.span,
465 ast::ErrorKind::GroupNameDuplicate {
466 original: names[i].span,
467 },
468 )),
469 }
470 }
471
472 fn ignore_whitespace(&self) -> bool {
473 self.parser().ignore_whitespace.get()
474 }
475
476 fn char(&self) -> char {
477 self.char_at(self.offset())
478 }
479
480 fn char_at(&self, i: usize) -> char {
481 self.pattern()[i..]
482 .chars()
483 .next()
484 .unwrap_or_else(|| panic!("expected char at offset {}", i))
485 }
486
487 fn bump(&self) -> bool {
488 if self.is_eof() {
489 return false;
490 }
491 let Position {
492 mut offset,
493 mut line,
494 mut column,
495 } = self.pos();
496 if self.char() == '\n' {
497 line = line.checked_add(1).unwrap();
498 column = 1;
499 } else {
500 column = column.checked_add(1).unwrap();
501 }
502 offset += self.char().len_utf8();
503 self.parser().pos.set(Position {
504 offset,
505 line,
506 column,
507 });
508 self.pattern()[self.offset()..].chars().next().is_some()
509 }
510
511 fn bump_if(&self, prefix: &str) -> bool {
512 if self.pattern()[self.offset()..].starts_with(prefix) {
513 for _ in 0..prefix.chars().count() {
514 self.bump();
515 }
516 true
517 } else {
518 false
519 }
520 }
521
522 fn is_lookaround_prefix(&self) -> Option<(bool, bool)> {
523 if self.bump_if("?=") {
524 return Some((true, true));
525 }
526 if self.bump_if("?!") {
527 return Some((true, false));
528 }
529 if self.bump_if("?<=") {
530 return Some((false, true));
531 }
532 if self.bump_if("?<!") {
533 return Some((false, false));
534 }
535 None
536 }
537
538 fn bump_and_bump_space(&self) -> bool {
539 if !self.bump() {
540 return false;
541 }
542 self.bump_space();
543 !self.is_eof()
544 }
545
546 fn bump_space(&self) {
547 if !self.ignore_whitespace() {
548 return;
549 }
550 while !self.is_eof() {
551 if self.char().is_whitespace() {
552 self.bump();
553 } else if self.char() == '#' {
554 let start = self.pos();
555 let mut comment_text = String::new();
556 self.bump();
557 while !self.is_eof() {
558 let c = self.char();
559 self.bump();
560 if c == '\n' {
561 break;
562 }
563 comment_text.push(c);
564 }
565 let comment = ast::Comment {
566 span: Span::new(start, self.pos()),
567 comment: comment_text,
568 };
569 self.parser().comments.borrow_mut().push(comment);
570 } else {
571 break;
572 }
573 }
574 }
575
576 fn peek(&self) -> Option<char> {
577 if self.is_eof() {
578 return None;
579 }
580 self.pattern()[self.offset() + self.char().len_utf8()..]
581 .chars()
582 .next()
583 }
584
585 fn peek_space(&self) -> Option<char> {
588 if !self.ignore_whitespace() {
589 return self.peek();
590 }
591 if self.is_eof() {
592 return None;
593 }
594 let mut start = self.offset() + self.char().len_utf8();
595 let mut in_comment = false;
596 for (i, c) in self.pattern()[start..].char_indices() {
597 if c.is_whitespace() {
598 continue;
599 } else if !in_comment && c == '#' {
600 in_comment = true;
601 } else if in_comment && c == '\n' {
602 in_comment = false;
603 } else {
604 start += i;
605 break;
606 }
607 }
608 self.pattern()[start..].chars().next()
609 }
610
611 fn is_eof(&self) -> bool {
612 self.offset() == self.pattern().len()
613 }
614
615 fn pos(&self) -> Position {
616 self.parser().pos.get()
617 }
618
619 fn span(&self) -> Span {
620 Span::splat(self.pos())
621 }
622
623 fn span_char(&self) -> Span {
624 let mut next = Position {
625 offset: self.offset().checked_add(self.char().len_utf8()).unwrap(),
626 line: self.line(),
627 column: self.column().checked_add(1).unwrap(),
628 };
629 if self.char() == '\n' {
630 next.line += 1;
631 next.column = 1;
632 }
633 Span::new(self.pos(), next)
634 }
635
636 #[inline(never)]
637 fn push_alternate(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
638 assert_eq!(self.char(), '|');
639 concat.span.end = self.pos();
640 self.push_or_add_alternation(concat);
641 self.bump();
642 Ok(ast::Concat {
643 span: self.span(),
644 asts: vec![],
645 })
646 }
647
648 fn push_or_add_alternation(&self, concat: Concat) {
649 use self::GroupState::*;
650
651 let mut stack = self.parser().stack_group.borrow_mut();
652 if let Some(&mut Alternation(ref mut alts)) = stack.last_mut() {
653 alts.asts.push(concat.into_ast());
654 return;
655 }
656 stack.push(Alternation(ast::Alternation {
657 span: Span::new(concat.span.start, self.pos()),
658 asts: vec![concat.into_ast()],
659 }));
660 }
661
662 #[inline(never)]
663 fn push_intersect(&self, mut concat: Concat) -> Result<Concat> {
664 assert_eq!(self.char(), '&');
665 concat.span.end = self.pos();
666 self.push_or_add_intersect(concat);
667 self.bump();
668 Ok(Concat {
669 span: self.span(),
670 asts: vec![],
671 })
672 }
673
674 fn push_or_add_intersect(&self, concat: Concat) {
675 use self::GroupState::*;
676
677 let mut stack = self.parser().stack_group.borrow_mut();
678 if let Some(&mut Intersection(ref mut alts)) = stack.last_mut() {
679 alts.asts.push(concat.into_ast());
680 return;
681 }
682 stack.push(Intersection(ast::Intersection {
683 span: Span::new(concat.span.start, self.pos()),
684 asts: vec![concat.into_ast()],
685 }));
686 }
687
688 #[inline(never)]
689 fn push_group(&self, mut concat: Concat) -> Result<Concat> {
690 assert_eq!(self.char(), '(');
691 match self.parse_group()? {
692 Either::Left(set) => {
693 let ignore = set.flags.flag_state(ast::Flag::IgnoreWhitespace);
694 if let Some(v) = ignore {
695 self.parser().ignore_whitespace.set(v);
696 }
697
698 concat.asts.push(Ast::flags(set));
699 Ok(concat)
700 }
701 Either::Right(group) => {
702 let old_ignore_whitespace = self.ignore_whitespace();
703 let new_ignore_whitespace = group
704 .flags()
705 .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
706 .unwrap_or(old_ignore_whitespace);
707 self.parser()
708 .stack_group
709 .borrow_mut()
710 .push(GroupState::Group {
711 concat,
712 group,
713 ignore_whitespace: old_ignore_whitespace,
714 });
715 self.parser().ignore_whitespace.set(new_ignore_whitespace);
716 Ok(Concat {
717 span: self.span(),
718 asts: vec![],
719 })
720 }
721 }
722 }
723
724 #[inline(never)]
725 fn push_compl_group(&self, concat: Concat) -> Result<Concat> {
726 assert_eq!(self.char(), '~');
727 self.bump();
728 if self.is_eof() || self.char() != '(' {
729 return Err(self.error(self.span(), ast::ErrorKind::ComplementGroupExpected));
730 }
731 let open_span = self.span_char();
732 self.bump();
733 let group = ast::Group {
734 span: open_span,
735 kind: ast::GroupKind::Complement,
736 ast: Box::new(Ast::empty(self.span())),
737 };
738
739 let old_ignore_whitespace = self.ignore_whitespace();
740 let new_ignore_whitespace = group
741 .flags()
742 .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
743 .unwrap_or(old_ignore_whitespace);
744 self.parser()
745 .stack_group
746 .borrow_mut()
747 .push(GroupState::Group {
748 concat,
749 group,
750 ignore_whitespace: old_ignore_whitespace,
751 });
752 self.parser().ignore_whitespace.set(new_ignore_whitespace);
753 Ok(Concat {
754 span: self.span(),
755 asts: vec![],
756 })
757 }
758
759 #[inline(never)]
760 fn pop_group(&self, mut group_concat: Concat) -> Result<Concat> {
761 use self::GroupState::*;
762 assert_eq!(self.char(), ')');
763 let mut stack = self.parser().stack_group.borrow_mut();
764 let topstack = stack.pop();
765
766 let (mut prior_concat, mut group, ignore_whitespace, alt) = match topstack {
767 Some(Group {
768 concat,
769 group,
770 ignore_whitespace,
771 }) => (concat, group, ignore_whitespace, None),
772 Some(Alternation(alt)) => match stack.pop() {
773 Some(Group {
774 concat,
775 group,
776 ignore_whitespace,
777 }) => (
778 concat,
779 group,
780 ignore_whitespace,
781 Some(Either::Left::<ast::Alternation, ast::Intersection>(alt)),
782 ),
783 None | Some(Alternation(_)) | Some(Intersection(_)) => {
784 return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
785 }
786 },
787 Some(Intersection(int)) => match stack.pop() {
788 Some(Group {
789 concat,
790 group,
791 ignore_whitespace,
792 }) => (
793 concat,
794 group,
795 ignore_whitespace,
796 Some(Either::Right::<ast::Alternation, ast::Intersection>(int)),
797 ),
798 None | Some(Alternation(_)) | Some(Intersection(_)) => {
799 return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
800 }
801 },
802
803 None => {
804 return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
805 }
806 };
807 self.parser().ignore_whitespace.set(ignore_whitespace);
808 group_concat.span.end = self.pos();
809 self.bump();
810 group.span.end = self.pos();
811 match alt {
812 Some(Either::Left(mut alt)) => {
813 alt.span.end = group_concat.span.end;
814 alt.asts.push(group_concat.into_ast());
815 group.ast = Box::new(alt.into_ast());
816 }
817 Some(Either::Right(mut int)) => {
818 int.span.end = group_concat.span.end;
819 int.asts.push(group_concat.into_ast());
820 group.ast = Box::new(int.into_ast());
821 }
822 None => {
823 group.ast = Box::new(group_concat.into_ast());
824 }
825 }
826
827 if group.kind == GroupKind::Complement {
828 let complement = ast::Complement {
829 span: self.span(),
830 ast: group.ast,
831 };
832 prior_concat.asts.push(Ast::complement(complement));
833 }
834 else {
836 prior_concat.asts.push(Ast::group(group));
837 }
838 Ok(prior_concat)
839 }
840
841 #[inline(never)]
842 fn pop_group_end(&self, mut concat: ast::Concat) -> Result<Ast> {
843 concat.span.end = self.pos();
844 let mut stack = self.parser().stack_group.borrow_mut();
845 let ast = match stack.pop() {
846 None => Ok(concat.into_ast()),
847 Some(GroupState::Alternation(mut alt)) => {
848 alt.span.end = self.pos();
849 alt.asts.push(concat.into_ast());
850 Ok(Ast::alternation(alt))
851 }
852 Some(GroupState::Intersection(mut int)) => {
853 int.span.end = self.pos();
854 int.asts.push(concat.into_ast());
855
856 Ok(Ast::intersection(int))
857 }
858 Some(GroupState::Group { group, .. }) => {
859 return Err(self.error(group.span, ast::ErrorKind::GroupUnclosed));
860 }
861 };
862 match stack.pop() {
864 None => ast,
865 Some(GroupState::Alternation(alt)) => {
866 Err(self.error(alt.span, ast::ErrorKind::UnsupportedResharpRegex))
867 }
868 Some(GroupState::Intersection(int)) => {
869 Err(self.error(int.span, ast::ErrorKind::UnsupportedResharpRegex))
870 }
871 Some(GroupState::Group { group, .. }) => {
872 Err(self.error(group.span, ast::ErrorKind::GroupUnclosed))
873 }
874 }
875 }
876
877 #[inline(never)]
878 fn push_class_open(
879 &self,
880 parent_union: regex_syntax::ast::ClassSetUnion,
881 ) -> Result<regex_syntax::ast::ClassSetUnion> {
882 assert_eq!(self.char(), '[');
883
884 let (nested_set, nested_union) = self.parse_set_class_open()?;
885 self.parser()
886 .stack_class
887 .borrow_mut()
888 .push(ClassState::Open {
889 union: parent_union,
890 set: nested_set,
891 });
892 Ok(nested_union)
893 }
894
895 #[inline(never)]
896 fn pop_class(
897 &self,
898 nested_union: regex_syntax::ast::ClassSetUnion,
899 ) -> Result<Either<regex_syntax::ast::ClassSetUnion, regex_syntax::ast::ClassBracketed>> {
900 assert_eq!(self.char(), ']');
901
902 let item = regex_syntax::ast::ClassSet::Item(nested_union.into_item());
903 let prevset = self.pop_class_op(item);
904 let mut stack = self.parser().stack_class.borrow_mut();
905 match stack.pop() {
906 None => panic!("unexpected empty character class stack"),
907 Some(ClassState::Op { .. }) => panic!("unexpected ClassState::Op"),
908 Some(ClassState::Open { mut union, mut set }) => {
909 self.bump();
910 set.span.end = self.pos();
911 set.kind = prevset;
912 if stack.is_empty() {
913 Ok(Either::Right(set))
914 } else {
915 union.push(regex_syntax::ast::ClassSetItem::Bracketed(Box::new(set)));
916 Ok(Either::Left(union))
917 }
918 }
919 }
920 }
921
922 #[inline(never)]
923 fn unclosed_class_error(&self) -> ParseError {
924 for state in self.parser().stack_class.borrow().iter().rev() {
925 if let ClassState::Open { ref set, .. } = *state {
926 return self.error(set.span, ast::ErrorKind::ClassUnclosed);
927 }
928 }
929 panic!("no open character class found")
930 }
931
932 #[inline(never)]
933 fn push_class_op(
934 &self,
935 next_kind: regex_syntax::ast::ClassSetBinaryOpKind,
936 next_union: regex_syntax::ast::ClassSetUnion,
937 ) -> regex_syntax::ast::ClassSetUnion {
938 let item = regex_syntax::ast::ClassSet::Item(next_union.into_item());
939 let new_lhs = self.pop_class_op(item);
940 self.parser().stack_class.borrow_mut().push(ClassState::Op {
941 kind: next_kind,
942 lhs: new_lhs,
943 });
944 regex_syntax::ast::ClassSetUnion {
945 span: self.span(),
946 items: vec![],
947 }
948 }
949
950 #[inline(never)]
951 fn pop_class_op(&self, rhs: regex_syntax::ast::ClassSet) -> regex_syntax::ast::ClassSet {
952 let mut stack = self.parser().stack_class.borrow_mut();
953 let (kind, lhs) = match stack.pop() {
954 Some(ClassState::Op { kind, lhs }) => (kind, lhs),
955 Some(state @ ClassState::Open { .. }) => {
956 stack.push(state);
957 return rhs;
958 }
959 None => unreachable!(),
960 };
961 let span = Span::new(lhs.span().start, rhs.span().end);
962 regex_syntax::ast::ClassSet::BinaryOp(regex_syntax::ast::ClassSetBinaryOp {
963 span,
964 kind,
965 lhs: Box::new(lhs),
966 rhs: Box::new(rhs),
967 })
968 }
969
970 fn hir_to_node_id(&self, hir: &hir::Hir, tb: &mut TB<'s>) -> Result<NodeId> {
971 match hir.kind() {
972 hir::HirKind::Empty => Ok(NodeId::EPS),
973 hir::HirKind::Literal(l) => {
974 if l.0.len() == 1 {
975 let node = tb.mk_u8(l.0[0]);
976 Ok(node)
977 } else {
978 let ws: Vec<_> = l.0.iter().map(|l| tb.mk_u8(*l)).collect();
979 let conc = tb.mk_concats(ws.iter().copied());
980 Ok(conc)
981 }
982 }
983 hir::HirKind::Class(class) => match class {
984 hir::Class::Unicode(class_unicode) => {
985 let ranges = class_unicode.ranges();
986 if ranges.len() == 1
987 && ranges[0].start() == '\u{0}'
988 && ranges[0].end() == '\u{10FFFF}'
989 {
990 return Ok(tb.mk_range_u8(0, 255));
991 }
992 let mut nodes = Vec::new();
993 for range in ranges {
994 for seq in Utf8Sequences::new(range.start(), range.end()) {
995 let sl = seq.as_slice();
996 let bytes: Vec<_> = sl.iter().map(|s| (s.start, s.end)).collect();
997 let node = match bytes.len() {
998 1 => tb.mk_range_u8(bytes[0].0, bytes[0].1),
999 n => {
1000 let last = tb.mk_range_u8(bytes[n - 1].0, bytes[n - 1].1);
1001 let mut conc = last;
1002 for i in (0..n - 1).rev() {
1003 let b = tb.mk_range_u8(bytes[i].0, bytes[i].1);
1004 conc = tb.mk_concat(b, conc);
1005 }
1006 conc
1007 }
1008 };
1009 nodes.push(node);
1010 }
1011 }
1012 let merged = tb.mk_unions(nodes.into_iter());
1013 Ok(merged)
1014 }
1015 hir::Class::Bytes(class_bytes) => {
1016 let ranges = class_bytes.ranges();
1017 let mut result = NodeId::BOT;
1018 for range in ranges {
1019 let start = range.start();
1020 let end = range.end();
1021 let node = tb.mk_range_u8(start, end);
1022 result = tb.mk_union(result, node);
1023 }
1024 Ok(result)
1025 }
1026 },
1027 hir::HirKind::Look(_) => Err(self.error(
1028 Span::splat(self.pos()),
1029 ast::ErrorKind::UnsupportedResharpRegex,
1030 )),
1031 hir::HirKind::Repetition(_) => Err(self.error(
1032 Span::splat(self.pos()),
1033 ast::ErrorKind::UnsupportedResharpRegex,
1034 )),
1035 hir::HirKind::Capture(_) => Err(self.error(
1036 Span::splat(self.pos()),
1037 ast::ErrorKind::UnsupportedResharpRegex,
1038 )),
1039 hir::HirKind::Concat(body) => {
1040 let mut result = NodeId::EPS;
1041 for child in body {
1042 let node = self.hir_to_node_id(child, tb)?;
1043 result = tb.mk_concat(result, node);
1044 }
1045 Ok(result)
1046 }
1047 hir::HirKind::Alternation(_) => Err(self.error(
1048 Span::splat(self.pos()),
1049 ast::ErrorKind::UnsupportedResharpRegex,
1050 )),
1051 }
1052 }
1053
1054 fn translate_ast_to_hir(
1055 &mut self,
1056 orig_ast: ®ex_syntax::ast::Ast,
1057 tb: &mut TB<'s>,
1058 ) -> Result<NodeId> {
1059 match self.translator.translate("", orig_ast) {
1060 Err(_) => Err(self.error(self.span(), ast::ErrorKind::UnicodeClassInvalid)),
1061 Ok(hir) => self.hir_to_node_id(&hir, tb),
1062 }
1063 }
1064
1065 fn translator_to_node_id(
1066 &mut self,
1067 orig_ast: ®ex_syntax::ast::Ast,
1068 translator: &mut Option<Translator>,
1069 tb: &mut TB<'s>,
1070 ) -> Result<NodeId> {
1071 match translator {
1072 Some(tr) => {
1073 let hir = tr
1074 .translate("", orig_ast)
1075 .map_err(|e| self.unsupported_error(e))?;
1076 self.hir_to_node_id(&hir, tb)
1077 }
1078 None => self.translate_ast_to_hir(orig_ast, tb),
1079 }
1080 }
1081
1082 fn get_class(
1083 &mut self,
1084 negated: bool,
1085 kind: regex_syntax::ast::ClassPerlKind,
1086 tb: &mut TB<'s>,
1087 ) -> Result<NodeId> {
1088 let w = self
1089 .perl_classes
1090 .iter()
1091 .find(|(c_neg, c_kind, _)| *c_kind == kind && *c_neg == negated);
1092 match w {
1093 Some((_, _, value)) => Ok(*value),
1094 None => {
1095 let translated = if self.global_ascii_perl {
1096 let pos = match kind {
1097 regex_syntax::ast::ClassPerlKind::Word => {
1098 let az = tb.mk_range_u8(b'a', b'z');
1099 let big = tb.mk_range_u8(b'A', b'Z');
1100 let dig = tb.mk_range_u8(b'0', b'9');
1101 let us = tb.mk_u8(b'_');
1102 tb.mk_unions([az, big, dig, us].into_iter())
1103 }
1104 regex_syntax::ast::ClassPerlKind::Digit => tb.mk_range_u8(b'0', b'9'),
1105 regex_syntax::ast::ClassPerlKind::Space => {
1106 let sp = tb.mk_u8(b' ');
1107 let tab = tb.mk_u8(b'\t');
1108 let nl = tb.mk_u8(b'\n');
1109 let cr = tb.mk_u8(b'\r');
1110 let ff = tb.mk_u8(0x0C);
1111 let vt = tb.mk_u8(0x0B);
1112 tb.mk_unions([sp, tab, nl, cr, ff, vt].into_iter())
1113 }
1114 };
1115 if negated {
1116 resharp_algebra::neg_class(tb, pos)
1117 } else {
1118 pos
1119 }
1120 } else if self.global_unicode {
1121 match kind {
1122 regex_syntax::ast::ClassPerlKind::Word => {
1123 if self.global_full_unicode {
1124 self.unicode_classes.ensure_word_full(tb);
1125 } else {
1126 self.unicode_classes.ensure_word(tb);
1127 }
1128 if negated {
1129 self.unicode_classes.non_word
1130 } else {
1131 self.unicode_classes.word
1132 }
1133 }
1134 regex_syntax::ast::ClassPerlKind::Digit => {
1135 if self.global_full_unicode {
1136 self.unicode_classes.ensure_digit_full(tb);
1137 } else {
1138 self.unicode_classes.ensure_digit(tb);
1139 }
1140 if negated {
1141 self.unicode_classes.non_digit
1142 } else {
1143 self.unicode_classes.digit
1144 }
1145 }
1146 regex_syntax::ast::ClassPerlKind::Space => {
1147 if self.global_full_unicode {
1148 self.unicode_classes.ensure_space_full(tb);
1149 } else {
1150 self.unicode_classes.ensure_space(tb);
1151 }
1152 if negated {
1153 self.unicode_classes.non_space
1154 } else {
1155 self.unicode_classes.space
1156 }
1157 }
1158 }
1159 } else {
1160 let pos = match kind {
1161 regex_syntax::ast::ClassPerlKind::Word => {
1162 let az = tb.mk_range_u8(b'a', b'z');
1163 let big = tb.mk_range_u8(b'A', b'Z');
1164 let dig = tb.mk_range_u8(b'0', b'9');
1165 let us = tb.mk_u8(b'_');
1166 tb.mk_unions([az, big, dig, us].into_iter())
1167 }
1168 regex_syntax::ast::ClassPerlKind::Digit => tb.mk_range_u8(b'0', b'9'),
1169 regex_syntax::ast::ClassPerlKind::Space => {
1170 let sp = tb.mk_u8(b' ');
1171 let tab = tb.mk_u8(b'\t');
1172 let nl = tb.mk_u8(b'\n');
1173 let cr = tb.mk_u8(b'\r');
1174 let ff = tb.mk_u8(0x0C);
1175 let vt = tb.mk_u8(0x0B);
1176 tb.mk_unions([sp, tab, nl, cr, ff, vt].into_iter())
1177 }
1178 };
1179 if negated {
1180 tb.mk_compl(pos)
1181 } else {
1182 pos
1183 }
1184 };
1185 self.perl_classes.push((negated, kind, translated));
1186 Ok(translated)
1187 }
1188 }
1189 }
1190
1191 fn word_char_kind(ast: &Ast, left: bool) -> WordCharKind {
1192 use WordCharKind::*;
1193 match ast {
1194 Ast::Literal(lit) => {
1195 if is_word_byte(lit.c as u8) {
1196 Word
1197 } else {
1198 NonWord
1199 }
1200 }
1201 Ast::ClassPerl(c) => match (&c.kind, c.negated) {
1202 (®ex_syntax::ast::ClassPerlKind::Word, false) => Word,
1203 (®ex_syntax::ast::ClassPerlKind::Word, true) => NonWord,
1204 (®ex_syntax::ast::ClassPerlKind::Space, false) => NonWord,
1205 (®ex_syntax::ast::ClassPerlKind::Space, true) => Unknown,
1206 (®ex_syntax::ast::ClassPerlKind::Digit, false) => Word,
1207 (®ex_syntax::ast::ClassPerlKind::Digit, true) => Unknown,
1208 },
1209 Ast::ClassBracketed(c) => class_bracketed_word_kind(c),
1210 Ast::Dot(_) | Ast::Top(_) => Unknown,
1211 Ast::Group(g) => Self::word_char_kind(&g.ast, left),
1212 Ast::Concat(c) if !c.asts.is_empty() => {
1213 let edge = if left { c.asts.len() - 1 } else { 0 };
1214 let kind = Self::word_char_kind(&c.asts[edge], left);
1215 match kind {
1216 MaybeWord => {
1217 let dir: isize = if left { -1 } else { 1 };
1218 match Self::concat_neighbor_kind(&c.asts, edge, dir) {
1219 Word => Word,
1220 _ => MaybeWord,
1221 }
1222 }
1223 MaybeNonWord => {
1224 let dir: isize = if left { -1 } else { 1 };
1225 match Self::concat_neighbor_kind(&c.asts, edge, dir) {
1226 NonWord => NonWord,
1227 _ => MaybeNonWord,
1228 }
1229 }
1230 other => other,
1231 }
1232 }
1233 Ast::Alternation(alt) if !alt.asts.is_empty() => {
1234 let first = Self::word_char_kind(&alt.asts[0], left);
1235 if alt.asts[1..]
1236 .iter()
1237 .all(|a| Self::word_char_kind(a, left) == first)
1238 {
1239 first
1240 } else {
1241 Unknown
1242 }
1243 }
1244 Ast::Repetition(r) => {
1245 let inner = Self::word_char_kind(&r.ast, left);
1246 let nullable = matches!(
1247 &r.op.kind,
1248 ast::RepetitionKind::ZeroOrMore
1249 | ast::RepetitionKind::ZeroOrOne
1250 | ast::RepetitionKind::Range(ast::RepetitionRange::Bounded(0, _))
1251 );
1252 if nullable {
1253 match inner {
1254 Word => MaybeWord,
1255 NonWord => MaybeNonWord,
1256 _ => Unknown,
1257 }
1258 } else {
1259 inner
1260 }
1261 }
1262 Ast::Lookaround(la) => match la.kind {
1263 ast::LookaroundKind::PositiveLookahead | ast::LookaroundKind::PositiveLookbehind => {
1264 Self::word_char_kind(&la.ast, left)
1265 }
1266 _ => Unknown,
1267 },
1268 _ => Unknown,
1269 }
1270 }
1271
1272 fn edge_class_ast(ast: &Ast, left: bool) -> Option<&Ast> {
1274 match ast {
1275 Ast::Literal(_)
1276 | Ast::ClassPerl(_)
1277 | Ast::ClassBracketed(_)
1278 | Ast::ClassUnicode(_)
1279 | Ast::Dot(_)
1280 | Ast::Top(_) => Some(ast),
1281 Ast::Group(g) => Self::edge_class_ast(&g.ast, left),
1282 Ast::Concat(c) if !c.asts.is_empty() => {
1283 Self::edge_class_ast(&c.asts[if left { c.asts.len() - 1 } else { 0 }], left)
1284 }
1285 Ast::Repetition(r) => {
1286 let nullable = matches!(
1287 &r.op.kind,
1288 ast::RepetitionKind::ZeroOrMore
1289 | ast::RepetitionKind::ZeroOrOne
1290 | ast::RepetitionKind::Range(ast::RepetitionRange::Bounded(0, _))
1291 );
1292 if nullable {
1293 None
1294 } else {
1295 Self::edge_class_ast(&r.ast, left)
1296 }
1297 }
1298 _ => None,
1299 }
1300 }
1301
1302 fn resolve_word_kind(
1303 &mut self,
1304 asts: &[Ast],
1305 idx: usize,
1306 dir: isize,
1307 translator: &mut Option<Translator>,
1308 tb: &mut TB<'s>,
1309 word_id: NodeId,
1310 not_word_id: NodeId,
1311 ) -> Result<WordCharKind> {
1312 use WordCharKind::*;
1313 let fast = Self::concat_neighbor_kind(asts, idx, dir);
1314 if fast != Unknown {
1315 return Ok(fast);
1316 }
1317 let neighbor_idx = (idx as isize + dir) as usize;
1318 let node = if let Some(edge) = Self::edge_class_ast(&asts[neighbor_idx], dir < 0) {
1319 self.ast_to_node_id(edge, translator, tb)?
1320 } else if dir > 0 {
1321 let mut bodies: Vec<NodeId> = vec![];
1322 let mut j = neighbor_idx;
1323 while j < asts.len() {
1324 match &asts[j] {
1325 Ast::Lookaround(la) => {
1326 let kind = la.kind.clone();
1327 let lookbehind = matches!(
1328 kind,
1329 ast::LookaroundKind::PositiveLookbehind
1330 | ast::LookaroundKind::NegativeLookbehind
1331 );
1332 if lookbehind {
1333 j += 1;
1334 continue;
1335 }
1336 let body = self.ast_to_node_id(&la.ast, translator, tb)?;
1337 let body = tb.try_elim_lookarounds(body).ok_or_else(|| {
1338 self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex)
1339 })?;
1340 let body_ts = tb.mk_concat(body, NodeId::TS);
1341 let constraint = match kind {
1342 ast::LookaroundKind::PositiveLookahead => body_ts,
1343 ast::LookaroundKind::NegativeLookahead => tb.mk_compl(body_ts),
1344 _ => unreachable!(),
1345 };
1346 bodies.push(constraint);
1347 j += 1;
1348 }
1349 other => {
1350 let n = self.ast_to_node_id(other, translator, tb)?;
1351 let n = tb.try_elim_lookarounds(n).ok_or_else(|| {
1352 self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex)
1353 })?;
1354 bodies.push(tb.mk_concat(n, NodeId::TS));
1355 break;
1356 }
1357 }
1358 }
1359 if bodies.is_empty() {
1360 return Ok(Unknown);
1361 }
1362 let combined = tb.mk_inters(bodies.into_iter());
1363 let word_prefix = tb.mk_concat(word_id, NodeId::TS);
1364 let non_word_prefix = tb.mk_concat(not_word_id, NodeId::TS);
1365 return if tb.subsumes(word_prefix, combined) == Some(true) {
1366 Ok(Word)
1367 } else if tb.subsumes(non_word_prefix, combined) == Some(true) {
1368 Ok(NonWord)
1369 } else {
1370 Ok(Unknown)
1371 };
1372 } else {
1373 let neighbor_node = self.ast_to_node_id(&asts[neighbor_idx], translator, tb)?;
1374 let mut neighbor_node = tb
1375 .try_elim_lookarounds(neighbor_node)
1376 .ok_or_else(|| self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))?;
1377 neighbor_node = tb.reverse(neighbor_node).or_else(|_| {
1378 Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))
1379 })?;
1380 let word_prefix = tb.mk_concat(word_id, NodeId::TS);
1381 let non_word_prefix = tb.mk_concat(not_word_id, NodeId::TS);
1382 return if tb.subsumes(word_prefix, neighbor_node) == Some(true) {
1383 Ok(Word)
1384 } else if tb.subsumes(non_word_prefix, neighbor_node) == Some(true) {
1385 Ok(NonWord)
1386 } else {
1387 Ok(Unknown)
1388 };
1389 };
1390 if tb.subsumes(word_id, node) == Some(true) {
1391 Ok(Word)
1392 } else if tb.subsumes(not_word_id, node) == Some(true) {
1393 Ok(NonWord)
1394 } else {
1395 Ok(Unknown)
1396 }
1397 }
1398
1399 fn concat_neighbor_kind(asts: &[Ast], idx: usize, dir: isize) -> WordCharKind {
1400 use WordCharKind::*;
1401 let next = idx as isize + dir;
1402 if next < 0 || next >= asts.len() as isize {
1403 return Edge;
1404 }
1405 let kind = Self::word_char_kind(&asts[next as usize], dir < 0);
1406 match kind {
1407 MaybeWord => match Self::concat_neighbor_kind(asts, next as usize, dir) {
1408 Word => Word,
1409 _ => Unknown,
1410 },
1411 MaybeNonWord => match Self::concat_neighbor_kind(asts, next as usize, dir) {
1412 NonWord => NonWord,
1413 _ => Unknown,
1414 },
1415 other => other,
1416 }
1417 }
1418
1419
1420 fn specialize_word_boundaries(&mut self, children: &mut [NodeId], tb: &mut TB<'s>) {
1421 let wb = self.unicode_classes.wb;
1422 let non_wb = self.unicode_classes.non_wb;
1423 if wb == NodeId::MISSING {
1424 return;
1425 }
1426 let word = self.unicode_classes.word;
1427 let non_word = self.unicode_classes.non_word;
1428 if word == NodeId::MISSING {
1429 return;
1430 }
1431 let word_pref = tb.mk_concat(word, NodeId::TS);
1434 let non_word_pref = tb.mk_concat(non_word, NodeId::TS);
1435 let word_suf = tb.mk_concat(NodeId::TS, word);
1436 let non_word_suf = tb.mk_concat(NodeId::TS, non_word);
1437 let len = children.len();
1438 for k in 0..len {
1439 let l = if k == 0 {
1440 WordCharKind::Edge
1441 } else {
1442 Self::classify(tb, children[k - 1], word_suf, non_word_suf)
1443 };
1444 let r = if k + 1 >= len {
1445 WordCharKind::Edge
1446 } else {
1447 Self::classify(tb, children[k + 1], word_pref, non_word_pref)
1448 };
1449 children[k] = Self::rewrite_wb_in_node(tb, children[k], wb, non_wb, word, l, r);
1450 }
1451 }
1452
1453 fn rewrite_wb_in_node(
1454 b: &mut TB<'s>,
1455 node: NodeId,
1456 wb: NodeId,
1457 non_wb: NodeId,
1458 word: NodeId,
1459 left: WordCharKind,
1460 right: WordCharKind,
1461 ) -> NodeId {
1462 let boundary_match = if node == wb {
1463 true
1464 } else if node == non_wb {
1465 false
1466 } else if b.get_kind(node) == Kind::Union {
1467 let l = Self::rewrite_wb_in_node(b, node.left(b), wb, non_wb, word, left, right);
1468 let r = Self::rewrite_wb_in_node(b, node.right(b), wb, non_wb, word, left, right);
1469 return b.mk_union(l, r);
1470 } else {
1471 return node;
1472 };
1473 use WordCharKind::*;
1474 match (left, right) {
1475 (NonWord, Word) | (Word, NonWord) => {
1476 if boundary_match { NodeId::EPS } else { NodeId::BOT }
1477 }
1478 (Word, Word) | (NonWord, NonWord) => {
1479 if boundary_match { NodeId::BOT } else { NodeId::EPS }
1480 }
1481 (Word, _) => {
1482 if boundary_match { b.mk_neg_lookahead(word, 0) }
1483 else {
1484 let tail = b.mk_concat(word, NodeId::TS);
1485 b.mk_lookahead(tail, NodeId::MISSING, 0)
1486 }
1487 }
1488 (NonWord, _) => {
1489 if boundary_match {
1490 let tail = b.mk_concat(word, NodeId::TS);
1491 b.mk_lookahead(tail, NodeId::MISSING, 0)
1492 } else { b.mk_neg_lookahead(word, 0) }
1493 }
1494 (_, Word) => {
1495 if boundary_match { b.mk_neg_lookbehind(word) }
1496 else { b.mk_lookbehind(word, NodeId::MISSING) }
1497 }
1498 (_, NonWord) => {
1499 if boundary_match { b.mk_lookbehind(word, NodeId::MISSING) }
1500 else { b.mk_neg_lookbehind(word) }
1501 }
1502 _ => node,
1503 }
1504 }
1505
1506 fn classify(b: &mut TB<'s>, node: NodeId, word_dir: NodeId, non_word_dir: NodeId) -> WordCharKind {
1507 if b.subsumes(word_dir, node) == Some(true) {
1508 WordCharKind::Word
1509 } else if b.subsumes(non_word_dir, node) == Some(true) {
1510 WordCharKind::NonWord
1511 } else {
1512 WordCharKind::Unknown
1513 }
1514 }
1515
1516 fn rewrite_word_boundary_in_concat(
1517 &mut self,
1518 asts: &[Ast],
1519 idx: usize,
1520 translator: &mut Option<Translator>,
1521 tb: &mut TB<'s>,
1522 negated: bool,
1523 ) -> Result<(NodeId, usize)> {
1524 use WordCharKind::*;
1525 if self.global_full_unicode {
1526 self.unicode_classes.ensure_word_full(tb);
1527 } else if self.global_unicode && !self.global_ascii_perl {
1528 self.unicode_classes.ensure_word(tb);
1529 } else {
1530 self.unicode_classes.ensure_word_ascii(tb);
1531 }
1532 let word_id = self.unicode_classes.word;
1533 let not_word_id = self.unicode_classes.non_word;
1534 let left = self.resolve_word_kind(asts, idx, -1, translator, tb, word_id, not_word_id)?;
1535 let right = self.resolve_word_kind(asts, idx, 1, translator, tb, word_id, not_word_id)?;
1536 let boundary_match = !negated;
1537 match (left, right) {
1538 (NonWord, Word) | (Word, NonWord) => Ok((
1539 if boundary_match { NodeId::EPS } else { NodeId::BOT },
1540 idx + 1,
1541 )),
1542 (Word, Word) | (NonWord, NonWord) => Ok((
1543 if boundary_match { NodeId::BOT } else { NodeId::EPS },
1544 idx + 1,
1545 )),
1546 (Word, _) => {
1547 if boundary_match {
1548 Ok((tb.mk_neg_lookahead(word_id, 0), idx + 1))
1549 } else {
1550 let tail = tb.mk_concat(word_id, NodeId::TS);
1551 self.merge_boundary_with_following_lookaheads(asts, idx, tail, translator, tb)
1552 }
1553 }
1554 (NonWord, _) => {
1555 if boundary_match {
1556 let tail = tb.mk_concat(word_id, NodeId::TS);
1557 self.merge_boundary_with_following_lookaheads(asts, idx, tail, translator, tb)
1558 } else {
1559 Ok((tb.mk_neg_lookahead(word_id, 0), idx + 1))
1560 }
1561 }
1562 (_, Word) => {
1563 if boundary_match {
1564 Ok((tb.mk_neg_lookbehind(word_id), idx + 1))
1565 } else {
1566 Ok((tb.mk_lookbehind(word_id, NodeId::MISSING), idx + 1))
1567 }
1568 }
1569 (_, NonWord) => {
1570 if boundary_match {
1571 Ok((tb.mk_lookbehind(word_id, NodeId::MISSING), idx + 1))
1572 } else {
1573 Ok((tb.mk_neg_lookbehind(word_id), idx + 1))
1574 }
1575 }
1576 _ => {
1580 self.unicode_classes.ensure_wb(tb);
1581 let node = if boundary_match {
1582 self.unicode_classes.wb
1583 } else {
1584 self.unicode_classes.non_wb
1585 };
1586 Ok((node, idx + 1))
1587 }
1588 }
1589 }
1590
1591 fn merge_boundary_with_following_lookaheads(
1592 &mut self,
1593 asts: &[Ast],
1594 wb_idx: usize,
1595 boundary_tail: NodeId,
1596 translator: &mut Option<Translator>,
1597 tb: &mut TB<'s>,
1598 ) -> Result<(NodeId, usize)> {
1599 let mut next = wb_idx + 1;
1600 let mut la_bodies = vec![boundary_tail];
1601 while next < asts.len() {
1602 match &asts[next] {
1603 Ast::Lookaround(la) if la.kind == ast::LookaroundKind::PositiveLookahead => {
1604 let body = self.ast_to_node_id(&la.ast, translator, tb)?;
1605 la_bodies.push(tb.mk_concat(body, NodeId::TS));
1606 next += 1;
1607 }
1608 _ => break,
1609 }
1610 }
1611 let merged = tb.mk_inters(la_bodies.into_iter());
1612 Ok((tb.mk_lookahead(merged, NodeId::MISSING, 0), next))
1613 }
1614
1615 fn ast_to_node_id(
1616 &mut self,
1617 ast: &Ast,
1618 translator: &mut Option<Translator>,
1619 tb: &mut TB<'s>,
1620 ) -> Result<NodeId> {
1621 match ast {
1622 Ast::Empty(_) => Ok(NodeId::EPS),
1623 Ast::Flags(f) => {
1624 if f.flags.flag_state(ast::Flag::SwapGreed).is_some() {
1625 return Err(self.error(f.span, ast::ErrorKind::UnsupportedResharpRegex));
1626 }
1627 let mut translator_builder = self.default_translator_builder();
1628 if let Some(state) = f.flags.flag_state(ast::Flag::CaseInsensitive) {
1629 translator_builder.case_insensitive(state);
1630 }
1631 if let Some(state) = f.flags.flag_state(ast::Flag::Unicode) {
1632 translator_builder.unicode(state);
1633 }
1634 if let Some(state) = f.flags.flag_state(ast::Flag::DotMatchesNewLine) {
1635 self.dot_all.set(state);
1636 }
1637 if let Some(state) = f.flags.flag_state(ast::Flag::MultiLine) {
1638 self.multiline.set(state);
1639 }
1640 let concat_translator = Some(translator_builder.build());
1641 *translator = concat_translator;
1642 Ok(NodeId::EPS)
1643 }
1644 Ast::Literal(l) => {
1645 let ast_lit = regex_syntax::ast::Ast::literal(*l.to_owned());
1646 self.translator_to_node_id(&ast_lit, translator, tb)
1647 }
1648 Ast::Top(_) => Ok(NodeId::TOP),
1649 Ast::Dot(_) => {
1650 let codepoint_dot = self.global_ascii_perl || self.global_full_unicode;
1651 let hirv = match (codepoint_dot, self.dot_all.get()) {
1652 (true, true) => hir::Hir::dot(hir::Dot::AnyChar),
1653 (true, false) => hir::Hir::dot(hir::Dot::AnyCharExceptLF),
1654 (false, true) => return Ok(NodeId::TOP),
1655 (false, false) => hir::Hir::dot(hir::Dot::AnyByteExceptLF),
1656 };
1657 self.hir_to_node_id(&hirv, tb)
1658 }
1659 Ast::Assertion(a) => match &a.kind {
1660 ast::AssertionKind::StartText => Ok(NodeId::BEGIN),
1661 ast::AssertionKind::EndText => Ok(NodeId::END),
1662 ast::AssertionKind::WordBoundary => {
1663 let only = Ast::Assertion(a.clone());
1664 let asts = std::slice::from_ref(&only);
1665 let (node, _) = self
1666 .rewrite_word_boundary_in_concat(asts, 0, translator, tb, false)?;
1667 Ok(node)
1668 }
1669 ast::AssertionKind::NotWordBoundary => {
1670 let only = Ast::Assertion(a.clone());
1672 let asts = std::slice::from_ref(&only);
1673 let (node, _) = self
1674 .rewrite_word_boundary_in_concat(asts, 0, translator, tb, true)?;
1675 Ok(node)
1676 }
1677 ast::AssertionKind::StartLine => {
1678 if !self.multiline.get() {
1679 return Ok(NodeId::BEGIN);
1680 }
1681 let left = NodeId::BEGIN;
1682 let right = tb.mk_u8(b'\n');
1683 let union = tb.mk_union(left, right);
1684 Ok(tb.mk_lookbehind(union, NodeId::MISSING))
1685 }
1686 ast::AssertionKind::EndLine => {
1687 if !self.multiline.get() {
1688 return Ok(NodeId::END);
1689 }
1690 let left = NodeId::END;
1691 let right = tb.mk_u8(b'\n');
1692 let union = tb.mk_union(left, right);
1693 Ok(tb.mk_lookahead(union, NodeId::MISSING, 0))
1694 }
1695 ast::AssertionKind::WordBoundaryStart => {
1696 Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
1697 }
1698 ast::AssertionKind::WordBoundaryEnd => {
1699 Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
1700 }
1701 ast::AssertionKind::WordBoundaryStartAngle => {
1702 Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
1703 }
1704 ast::AssertionKind::WordBoundaryEndAngle => {
1705 Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
1706 }
1707 ast::AssertionKind::WordBoundaryStartHalf => {
1708 Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
1709 }
1710 ast::AssertionKind::WordBoundaryEndHalf => {
1711 Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
1712 }
1713 },
1714 Ast::ClassUnicode(c) => {
1715 let tmp = regex_syntax::ast::ClassUnicode {
1716 span: c.span,
1717 negated: c.negated,
1718 kind: c.kind.clone(),
1719 };
1720 if !c.negated {
1721 if let regex_syntax::ast::ClassUnicodeKind::Named(s) = &c.kind {
1722 match s.as_str() {
1723 "ascii" => return Ok(tb.mk_range_u8(0, 127)),
1725 "utf8" => {
1727 let ascii = tb.mk_range_u8(0, 127);
1728 let beta = tb.mk_range_u8(128, 0xBF);
1729 let c0 = tb.mk_range_u8(0xC0, 0xDF);
1730 let c0s = tb.mk_concats([c0, beta].into_iter());
1731 let e0 = tb.mk_range_u8(0xE0, 0xEF);
1732 let e0s = tb.mk_concats([e0, beta, beta].into_iter());
1733 let f0 = tb.mk_range_u8(0xF0, 0xF7);
1734 let f0s = tb.mk_concats([f0, beta, beta, beta].into_iter());
1735 return Ok(tb.mk_unions([ascii, c0s, e0s, f0s].into_iter()));
1736 }
1737 "hex" => {
1738 let nums = tb.mk_range_u8(b'0', b'9');
1739 let lets = tb.mk_range_u8(b'a', b'f');
1740 let lets2 = tb.mk_range_u8(b'A', b'F');
1741 let merged = tb.mk_unions([nums, lets, lets2].into_iter());
1742 return Ok(merged);
1743 }
1744 _ => {}
1745 }
1746 };
1747 }
1748
1749 let orig_ast = regex_syntax::ast::Ast::class_unicode(tmp);
1750 self.translator_to_node_id(&orig_ast, translator, tb)
1751 }
1752 Ast::ClassPerl(c) => self.get_class(c.negated, c.kind.clone(), tb),
1753 Ast::ClassBracketed(c) => match &c.kind {
1754 regex_syntax::ast::ClassSet::Item(item) => {
1755 if !c.negated && is_universal_perl_pair(item) {
1756 return Ok(NodeId::TOP);
1757 }
1758 let tmp = regex_syntax::ast::ClassBracketed {
1759 span: c.span,
1760 negated: c.negated,
1761 kind: c.kind.clone(),
1762 };
1763 let orig_ast = regex_syntax::ast::Ast::class_bracketed(tmp);
1764 self.translator_to_node_id(&orig_ast, translator, tb)
1765 }
1766 regex_syntax::ast::ClassSet::BinaryOp(_) => {
1767 Err(self.error(c.span, ast::ErrorKind::UnsupportedResharpRegex))
1768 }
1769 },
1770 Ast::Repetition(r) => {
1771 let body = self.ast_to_node_id(&r.ast, translator, tb);
1772 match body {
1773 Ok(body) => match &r.op.kind {
1774 ast::RepetitionKind::ZeroOrOne => Ok(tb.mk_opt(body)),
1775 ast::RepetitionKind::ZeroOrMore => Ok(tb.mk_star(body)),
1776 ast::RepetitionKind::OneOrMore => Ok(tb.mk_plus(body)),
1777 ast::RepetitionKind::Range(r) => match r {
1778 ast::RepetitionRange::Exactly(n) => Ok(tb.mk_repeat(body, *n, *n)),
1779 ast::RepetitionRange::AtLeast(n) => {
1780 let rep = tb.mk_repeat(body, *n, *n);
1781 let st = tb.mk_star(body);
1782 Ok(tb.mk_concat(rep, st))
1783 }
1784
1785 ast::RepetitionRange::Bounded(n, m) => Ok(tb.mk_repeat(body, *n, *m)),
1786 },
1787 },
1788 Err(_) => body,
1789 }
1790 }
1791 Ast::Lookaround(g) => {
1792 let body = self.ast_to_node_id(&g.ast, translator, tb)?;
1793 match g.kind {
1794 ast::LookaroundKind::PositiveLookahead if body.contains_lookbehind(tb) => {
1795 let mut prefix = NodeId::EPS;
1796 let mut rest = body;
1797 while tb.get_kind(rest) == Kind::Concat
1798 && tb.get_kind(rest.left(tb)) == Kind::Lookbehind
1799 {
1800 prefix = tb.mk_concat(prefix, rest.left(tb));
1801 rest = rest.right(tb);
1802 }
1803 if prefix == NodeId::EPS || rest.contains_lookbehind(tb) {
1804 return Err(self.error(g.span, ast::ErrorKind::UnsupportedResharpRegex));
1805 }
1806 let la = tb.mk_lookahead(rest, NodeId::MISSING, 0);
1807 Ok(tb.mk_concat(prefix, la))
1808 }
1809 ast::LookaroundKind::NegativeLookahead if body.contains_lookbehind(tb) => {
1810 Err(self.error(g.span, ast::ErrorKind::UnsupportedResharpRegex))
1811 }
1812 ast::LookaroundKind::PositiveLookahead => {
1813 Ok(tb.mk_lookahead(body, NodeId::MISSING, 0))
1814 }
1815 ast::LookaroundKind::PositiveLookbehind => {
1816 Ok(tb.mk_lookbehind(body, NodeId::MISSING))
1817 }
1818 ast::LookaroundKind::NegativeLookahead => Ok(tb.mk_neg_lookahead(body, 0)),
1819 ast::LookaroundKind::NegativeLookbehind => Ok(tb.mk_neg_lookbehind(body)),
1820 }
1821 }
1822 Ast::Group(g) => {
1823 if let ast::GroupKind::NonCapturing(ref flags) = g.kind {
1824 if !flags.items.is_empty() {
1825 let mut translator_builder = self.default_translator_builder();
1826 if let Some(state) = flags.flag_state(ast::Flag::CaseInsensitive) {
1827 translator_builder.case_insensitive(state);
1828 }
1829 if let Some(state) = flags.flag_state(ast::Flag::Unicode) {
1830 translator_builder.unicode(state);
1831 }
1832 let saved_dot_all = self.dot_all.get();
1833 if let Some(state) = flags.flag_state(ast::Flag::DotMatchesNewLine) {
1834 self.dot_all.set(state);
1835 }
1836 let saved_multiline = self.multiline.get();
1837 if let Some(state) = flags.flag_state(ast::Flag::MultiLine) {
1838 self.multiline.set(state);
1839 }
1840 let mut scoped = Some(translator_builder.build());
1841 let result = self.ast_to_node_id(&g.ast, &mut scoped, tb);
1842 self.dot_all.set(saved_dot_all);
1843 self.multiline.set(saved_multiline);
1844 return result;
1845 }
1846 }
1847 self.ast_to_node_id(&g.ast, translator, tb)
1848 }
1849 Ast::Alternation(a) => {
1850 let mut children = vec![];
1851 for ast in &a.asts {
1852 match self.ast_to_node_id(ast, translator, tb) {
1853 Ok(node_id) => children.push(node_id),
1854 Err(err) => return Err(err),
1855 }
1856 }
1857 Ok(tb.mk_unions(children.iter().copied()))
1858 }
1859 Ast::Concat(c) => {
1860 let mut concat_translator: Option<Translator> = None;
1861 let mut children = vec![];
1862 let mut i = 0;
1863 while i < c.asts.len() {
1864 let ast = &c.asts[i];
1865 match ast {
1866 Ast::Flags(f) => {
1867 if f.flags.flag_state(ast::Flag::SwapGreed).is_some() {
1868 return Err(
1869 self.error(f.span, ast::ErrorKind::UnsupportedResharpRegex)
1870 );
1871 }
1872 let mut translator_builder = self.default_translator_builder();
1873 if let Some(state) = f.flags.flag_state(ast::Flag::CaseInsensitive) {
1874 translator_builder.case_insensitive(state);
1875 }
1876 if let Some(state) = f.flags.flag_state(ast::Flag::Unicode) {
1877 translator_builder.unicode(state);
1878 }
1879 if let Some(state) = f.flags.flag_state(ast::Flag::DotMatchesNewLine) {
1880 self.dot_all.set(state);
1881 }
1882 if let Some(state) = f.flags.flag_state(ast::Flag::MultiLine) {
1883 self.multiline.set(state);
1884 }
1885 concat_translator = Some(translator_builder.build());
1886 *translator = concat_translator.clone();
1887 i += 1;
1888 continue;
1889 }
1890 Ast::Assertion(a) if a.kind == ast::AssertionKind::WordBoundary => {
1891 let node = self
1892 .rewrite_word_boundary_in_concat(&c.asts, i, translator, tb, false)?;
1893 children.push(node.0);
1894 i = node.1; continue;
1896 }
1897 Ast::Assertion(a) if a.kind == ast::AssertionKind::NotWordBoundary => {
1898 let node = self
1899 .rewrite_word_boundary_in_concat(&c.asts, i, translator, tb, true)?;
1900 children.push(node.0);
1901 i = node.1;
1902 continue;
1903 }
1904 _ => {}
1905 }
1906 match concat_translator {
1907 Some(_) => match self.ast_to_node_id(ast, &mut concat_translator, tb) {
1908 Ok(node_id) => children.push(node_id),
1909 Err(err) => return Err(err),
1910 },
1911 None => match self.ast_to_node_id(ast, translator, tb) {
1912 Ok(node_id) => children.push(node_id),
1913 Err(err) => return Err(err),
1914 },
1915 }
1916 i += 1;
1917 }
1918 self.specialize_word_boundaries(&mut children, tb);
1919 Ok(tb.mk_concats(children.iter().cloned()))
1920 }
1921 Ast::Intersection(intersection) => {
1922 let mut children = vec![];
1923 for ast in &intersection.asts {
1924 match self.ast_to_node_id(ast, translator, tb) {
1925 Ok(node_id) => children.push(node_id),
1926 Err(err) => return Err(err),
1927 }
1928 }
1929 Ok(tb.mk_inters(children.into_iter()))
1930 }
1931 Ast::Complement(complement) => {
1932 let body = self.ast_to_node_id(&complement.ast, translator, tb);
1933 body.map(|x| tb.mk_compl(x))
1934 }
1935 }
1936 }
1937
1938 fn parse_inner(&mut self) -> Result<Ast> {
1939 let mut concat = Concat {
1940 span: self.span(),
1941 asts: vec![],
1942 };
1943 loop {
1944 self.bump_space();
1945 if self.is_eof() {
1946 break;
1947 }
1948 match self.char() {
1949 '(' => concat = self.push_group(concat)?,
1950 ')' => concat = self.pop_group(concat)?,
1951 '|' => concat = self.push_alternate(concat)?,
1952 '&' => concat = self.push_intersect(concat)?,
1953 '~' => concat = self.push_compl_group(concat)?,
1954 '[' => {
1955 let class = self.parse_set_class()?;
1956 concat.asts.push(Ast::class_bracketed(class));
1957 }
1958 '?' => {
1959 concat =
1960 self.parse_uncounted_repetition(concat, ast::RepetitionKind::ZeroOrOne)?;
1961 }
1962 '*' => {
1963 concat =
1964 self.parse_uncounted_repetition(concat, ast::RepetitionKind::ZeroOrMore)?;
1965 }
1966 '+' => {
1967 concat =
1968 self.parse_uncounted_repetition(concat, ast::RepetitionKind::OneOrMore)?;
1969 }
1970 '{' => {
1971 concat = self.parse_counted_repetition(concat)?;
1972 }
1973 _ => concat.asts.push(self.parse_primitive()?.into_ast()),
1974 }
1975 }
1976 let ast = self.pop_group_end(concat)?;
1977 if expanded_ast_size(&ast, self.expanded_ast_limit) >= self.expanded_ast_limit
1978 || max_concat_length(&ast) >= self.max_list_len
1979 {
1980 return Err(self.error(*ast.span(), ast::ErrorKind::UnsupportedResharpRegex));
1981 }
1982 Ok(ast)
1983 }
1984
1985 fn parse(&mut self, tb: &mut TB<'s>) -> Result<NodeId> {
1986 let ast = self.parse_inner()?;
1987 self.ast_to_node_id(&ast, &mut None, tb)
1988 }
1989
1990 #[inline(never)]
1991 fn parse_uncounted_repetition(
1992 &self,
1993 mut concat: ast::Concat,
1994 kind: ast::RepetitionKind,
1995 ) -> Result<ast::Concat> {
1996 let op_start = self.pos();
1998 let ast = match concat.asts.pop() {
1999 Some(ast) => ast,
2000 None => return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing)),
2001 };
2002 match ast {
2003 Ast::Empty(_) | Ast::Flags(_) => {
2004 return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing))
2005 }
2006 _ => {}
2007 }
2008 if self.bump() && self.char() == '?' {
2009 return Err(self.error(
2010 Span::new(op_start, self.pos()),
2011 ast::ErrorKind::UnsupportedLazyQuantifier,
2012 ));
2013 }
2014 concat.asts.push(Ast::repetition(ast::Repetition {
2015 span: ast.span().with_end(self.pos()),
2016 op: ast::RepetitionOp {
2017 span: Span::new(op_start, self.pos()),
2018 kind,
2019 },
2020 greedy: true,
2021 ast: Box::new(ast),
2022 }));
2023 Ok(concat)
2024 }
2025
2026 #[inline(never)]
2027 fn parse_counted_repetition(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
2028 assert!(self.char() == '{');
2029 let start = self.pos();
2030 let ast = match concat.asts.pop() {
2031 Some(ast) => ast,
2032 None => return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing)),
2033 };
2034 match ast {
2035 Ast::Empty(_) | Ast::Flags(_) => {
2036 return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing))
2037 }
2038 _ => {}
2039 }
2040 if !self.bump_and_bump_space() {
2041 return Err(self.error(
2042 Span::new(start, self.pos()),
2043 ast::ErrorKind::RepetitionCountUnclosed,
2044 ));
2045 }
2046 let count_start = specialize_err(
2047 self.parse_decimal(),
2048 ast::ErrorKind::DecimalEmpty,
2049 ast::ErrorKind::RepetitionCountDecimalEmpty,
2050 );
2051 if self.is_eof() {
2052 return Err(self.error(
2053 Span::new(start, self.pos()),
2054 ast::ErrorKind::RepetitionCountUnclosed,
2055 ));
2056 }
2057 let range = if self.char() == ',' {
2058 if !self.bump_and_bump_space() {
2059 return Err(self.error(
2060 Span::new(start, self.pos()),
2061 ast::ErrorKind::RepetitionCountUnclosed,
2062 ));
2063 }
2064 if self.char() != '}' {
2065 let count_start = match count_start {
2066 Ok(c) => c,
2067 Err(err) if err.kind == ast::ErrorKind::RepetitionCountDecimalEmpty => {
2068 if self.parser().empty_min_range {
2069 0
2070 } else {
2071 return Err(err);
2072 }
2073 }
2074 err => err?,
2075 };
2076 let count_end = specialize_err(
2077 self.parse_decimal(),
2078 ast::ErrorKind::DecimalEmpty,
2079 ast::ErrorKind::RepetitionCountDecimalEmpty,
2080 )?;
2081 ast::RepetitionRange::Bounded(count_start, count_end)
2082 } else {
2083 ast::RepetitionRange::AtLeast(count_start?)
2084 }
2085 } else {
2086 ast::RepetitionRange::Exactly(count_start?)
2087 };
2088
2089 if self.is_eof() || self.char() != '}' {
2090 return Err(self.error(
2091 Span::new(start, self.pos()),
2092 ast::ErrorKind::RepetitionCountUnclosed,
2093 ));
2094 }
2095
2096 if self.bump_and_bump_space() && self.char() == '?' {
2097 return Err(self.error(
2098 Span::new(start, self.pos()),
2099 ast::ErrorKind::UnsupportedLazyQuantifier,
2100 ));
2101 }
2102
2103 let op_span = Span::new(start, self.pos());
2104 if !range.is_valid() {
2105 return Err(self.error(op_span, ast::ErrorKind::RepetitionCountInvalid));
2106 }
2107
2108 let over_limit = match &range {
2109 ast::RepetitionRange::Exactly(n) => *n > self.max_repeat,
2110 ast::RepetitionRange::AtLeast(n) => *n > self.max_repeat,
2111 ast::RepetitionRange::Bounded(n, m) => {
2112 *n > self.max_repeat || *m > self.max_repeat
2113 }
2114 };
2115 if over_limit {
2116 return Err(self.error(op_span, ast::ErrorKind::UnsupportedResharpRegex));
2117 }
2118 concat.asts.push(Ast::repetition(ast::Repetition {
2119 span: ast.span().with_end(self.pos()),
2120 op: ast::RepetitionOp {
2121 span: op_span,
2122 kind: ast::RepetitionKind::Range(range),
2123 },
2124 greedy: true,
2125 ast: Box::new(ast),
2126 }));
2127 Ok(concat)
2128 }
2129
2130 #[inline(never)]
2131 fn parse_group(&self) -> Result<Either<ast::SetFlags, ast::Group>> {
2132 assert_eq!(self.char(), '(');
2133 let open_span = self.span_char();
2134 self.bump();
2135 self.bump_space();
2136 if let Some((ahead, pos)) = self.is_lookaround_prefix() {
2137 let kind = match (pos, ahead) {
2138 (true, true) => LookaroundKind::PositiveLookahead,
2139 (true, false) => LookaroundKind::PositiveLookbehind,
2140 (false, true) => LookaroundKind::NegativeLookahead,
2141 (false, false) => LookaroundKind::NegativeLookbehind,
2142 };
2143 return Ok(Either::Right(ast::Group {
2144 span: open_span,
2145 kind: ast::GroupKind::Lookaround(kind),
2146 ast: Box::new(Ast::empty(self.span())),
2147 }));
2148 }
2149 let inner_span = self.span();
2150 let mut starts_with_p = true;
2151 if self.bump_if("?P<") || {
2152 starts_with_p = false;
2153 self.bump_if("?<")
2154 } {
2155 let capture_index = self.next_capture_index(open_span)?;
2156 let name = self.parse_capture_name(capture_index)?;
2157 Ok(Either::Right(ast::Group {
2158 span: open_span,
2159 kind: ast::GroupKind::CaptureName {
2160 starts_with_p,
2161 name,
2162 },
2163 ast: Box::new(Ast::empty(self.span())),
2164 }))
2165 } else if self.bump_if("?") {
2166 if self.is_eof() {
2167 return Err(self.error(open_span, ast::ErrorKind::GroupUnclosed));
2168 }
2169 let flags = self.parse_flags()?;
2170 let char_end = self.char();
2171 self.bump();
2172 if char_end == ')' {
2173 if flags.items.is_empty() {
2176 return Err(self.error(inner_span, ast::ErrorKind::RepetitionMissing));
2177 }
2178 Ok(Either::Left(ast::SetFlags {
2179 span: Span {
2180 end: self.pos(),
2181 ..open_span
2182 },
2183 flags,
2184 }))
2185 } else {
2186 assert_eq!(char_end, ':');
2187 Ok(Either::Right(ast::Group {
2188 span: open_span,
2189 kind: ast::GroupKind::NonCapturing(flags),
2190 ast: Box::new(Ast::empty(self.span())),
2191 }))
2192 }
2193 } else {
2194 let capture_index = self.next_capture_index(open_span)?;
2195 Ok(Either::Right(ast::Group {
2196 span: open_span,
2197 kind: ast::GroupKind::CaptureIndex(capture_index),
2198 ast: Box::new(Ast::empty(self.span())),
2199 }))
2200 }
2201 }
2202
2203 #[inline(never)]
2204 fn parse_capture_name(&self, capture_index: u32) -> Result<ast::CaptureName> {
2205 if self.is_eof() {
2206 return Err(self.error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
2207 }
2208 let start = self.pos();
2209 loop {
2210 if self.char() == '>' {
2211 break;
2212 }
2213 if !is_capture_char(self.char(), self.pos() == start) {
2214 return Err(self.error(self.span_char(), ast::ErrorKind::GroupNameInvalid));
2215 }
2216 if !self.bump() {
2217 break;
2218 }
2219 }
2220 let end = self.pos();
2221 if self.is_eof() {
2222 return Err(self.error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
2223 }
2224 assert_eq!(self.char(), '>');
2225 self.bump();
2226 let name = &self.pattern()[start.offset..end.offset];
2227 if name.is_empty() {
2228 return Err(self.error(Span::new(start, start), ast::ErrorKind::GroupNameEmpty));
2229 }
2230 let capname = ast::CaptureName {
2231 span: Span::new(start, end),
2232 name: name.to_string(),
2233 index: capture_index,
2234 };
2235 self.add_capture_name(&capname)?;
2236 Ok(capname)
2237 }
2238
2239 #[inline(never)]
2240 fn parse_flags(&self) -> Result<ast::Flags> {
2241 let mut flags = ast::Flags {
2242 span: self.span(),
2243 items: vec![],
2244 };
2245 let mut last_was_negation = None;
2246 while self.char() != ':' && self.char() != ')' {
2247 if self.char() == '-' {
2248 last_was_negation = Some(self.span_char());
2249 let item = ast::FlagsItem {
2250 span: self.span_char(),
2251 kind: ast::FlagsItemKind::Negation,
2252 };
2253 if let Some(i) = flags.add_item(item) {
2254 return Err(self.error(
2255 self.span_char(),
2256 ast::ErrorKind::FlagRepeatedNegation {
2257 original: flags.items[i].span,
2258 },
2259 ));
2260 }
2261 } else {
2262 last_was_negation = None;
2263 let item = ast::FlagsItem {
2264 span: self.span_char(),
2265 kind: ast::FlagsItemKind::Flag(self.parse_flag()?),
2266 };
2267 if let Some(i) = flags.add_item(item) {
2268 return Err(self.error(
2269 self.span_char(),
2270 ast::ErrorKind::FlagDuplicate {
2271 original: flags.items[i].span,
2272 },
2273 ));
2274 }
2275 }
2276 if !self.bump() {
2277 return Err(self.error(self.span(), ast::ErrorKind::FlagUnexpectedEof));
2278 }
2279 }
2280 if let Some(span) = last_was_negation {
2281 return Err(self.error(span, ast::ErrorKind::FlagDanglingNegation));
2282 }
2283 flags.span.end = self.pos();
2284 Ok(flags)
2285 }
2286
2287 #[inline(never)]
2288 fn parse_flag(&self) -> Result<ast::Flag> {
2289 match self.char() {
2290 'i' => Ok(ast::Flag::CaseInsensitive),
2291 'm' => Ok(ast::Flag::MultiLine),
2292 's' => Ok(ast::Flag::DotMatchesNewLine),
2293 'U' => Ok(ast::Flag::SwapGreed),
2294 'u' => Ok(ast::Flag::Unicode),
2295 'R' => Ok(ast::Flag::CRLF),
2296 'x' => Ok(ast::Flag::IgnoreWhitespace),
2297 _ => Err(self.error(self.span_char(), ast::ErrorKind::FlagUnrecognized)),
2298 }
2299 }
2300
2301 fn parse_primitive(&self) -> Result<Primitive> {
2302 match self.char() {
2303 '\\' => self.parse_escape(),
2304 '_' => {
2305 let ast = Primitive::Top(self.span_char());
2306 self.bump();
2307 Ok(ast)
2308 }
2309 '.' => {
2310 let ast = Primitive::Dot(self.span_char());
2311 self.bump();
2312 Ok(ast)
2313 }
2314 '^' => {
2315 let ast = Primitive::Assertion(ast::Assertion {
2316 span: self.span_char(),
2317 kind: ast::AssertionKind::StartLine,
2318 });
2319 self.bump();
2320 Ok(ast)
2321 }
2322 '$' => {
2323 let ast = Primitive::Assertion(ast::Assertion {
2324 span: self.span_char(),
2325 kind: ast::AssertionKind::EndLine,
2326 });
2327 self.bump();
2328 Ok(ast)
2329 }
2330 c => {
2331 let ast = Primitive::Literal(Literal {
2332 span: self.span_char(),
2333 kind: LiteralKind::Verbatim,
2334 c,
2335 });
2336 self.bump();
2337 Ok(ast)
2338 }
2339 }
2340 }
2341
2342 #[inline(never)]
2343 fn parse_escape(&self) -> Result<Primitive> {
2344 assert_eq!(self.char(), '\\');
2345 let start = self.pos();
2346 if !self.bump() {
2347 return Err(self.error(
2348 Span::new(start, self.pos()),
2349 ast::ErrorKind::EscapeUnexpectedEof,
2350 ));
2351 }
2352 let c = self.char();
2353 match c {
2355 '0'..='9' => {
2356 if !self.parser().octal {
2357 return Err(self.error(
2358 Span::new(start, self.span_char().end),
2359 ast::ErrorKind::UnsupportedBackreference,
2360 ));
2361 }
2362 let mut lit = self.parse_octal();
2363 lit.span.start = start;
2364 return Ok(Primitive::Literal(lit));
2365 }
2366 'x' | 'u' | 'U' => {
2373 let mut lit = self.parse_hex()?;
2374 lit.span.start = start;
2375 return Ok(Primitive::Literal(lit));
2376 }
2377 'p' | 'P' => {
2378 let mut cls = self.parse_unicode_class()?;
2379 cls.span.start = start;
2380 return Ok(Primitive::Unicode(cls));
2381 }
2382 'd' | 's' | 'w' | 'D' | 'S' | 'W' => {
2383 let mut cls = self.parse_perl_class();
2384 cls.span.start = start;
2385 return Ok(Primitive::Perl(cls));
2386 }
2387 _ => {}
2388 }
2389
2390 self.bump();
2392 let span = Span::new(start, self.pos());
2393 if is_meta_character(c) {
2394 return Ok(Primitive::Literal(Literal {
2395 span,
2396 kind: LiteralKind::Meta,
2397 c,
2398 }));
2399 }
2400 if is_escapeable_character(c) {
2401 return Ok(Primitive::Literal(Literal {
2402 span,
2403 kind: LiteralKind::Superfluous,
2404 c,
2405 }));
2406 }
2407 let special = |kind, c| {
2408 Ok(Primitive::Literal(Literal {
2409 span,
2410 kind: LiteralKind::Special(kind),
2411 c,
2412 }))
2413 };
2414 match c {
2415 'a' => special(SpecialLiteralKind::Bell, '\x07'),
2416 'f' => special(SpecialLiteralKind::FormFeed, '\x0C'),
2417 't' => special(SpecialLiteralKind::Tab, '\t'),
2418 'n' => special(SpecialLiteralKind::LineFeed, '\n'),
2419 'r' => special(SpecialLiteralKind::CarriageReturn, '\r'),
2420 'v' => special(SpecialLiteralKind::VerticalTab, '\x0B'),
2421 'A' => Ok(Primitive::Assertion(ast::Assertion {
2422 span,
2423 kind: ast::AssertionKind::StartText,
2424 })),
2425 'z' => Ok(Primitive::Assertion(ast::Assertion {
2426 span,
2427 kind: ast::AssertionKind::EndText,
2428 })),
2429 'b' => {
2430 let mut wb = ast::Assertion {
2431 span,
2432 kind: ast::AssertionKind::WordBoundary,
2433 };
2434 if !self.is_eof() && self.char() == '{' {
2437 if let Some(kind) = self.maybe_parse_special_word_boundary(start)? {
2438 wb.kind = kind;
2439 wb.span.end = self.pos();
2440 }
2441 }
2442 Ok(Primitive::Assertion(wb))
2443 }
2444 'B' => Ok(Primitive::Assertion(ast::Assertion {
2445 span,
2446 kind: ast::AssertionKind::NotWordBoundary,
2447 })),
2448 '<' => Ok(Primitive::Assertion(ast::Assertion {
2449 span,
2450 kind: ast::AssertionKind::WordBoundaryStartAngle,
2451 })),
2452 '>' => Ok(Primitive::Assertion(ast::Assertion {
2453 span,
2454 kind: ast::AssertionKind::WordBoundaryEndAngle,
2455 })),
2456 _ => Err(self.error(span, ast::ErrorKind::EscapeUnrecognized)),
2457 }
2458 }
2459
2460 fn maybe_parse_special_word_boundary(
2461 &self,
2462 wb_start: Position,
2463 ) -> Result<Option<ast::AssertionKind>> {
2464 assert_eq!(self.char(), '{');
2465
2466 let is_valid_char = |c| matches!(c, 'A'..='Z' | 'a'..='z' | '-');
2467 let start = self.pos();
2468 if !self.bump_and_bump_space() {
2469 return Err(self.error(
2470 Span::new(wb_start, self.pos()),
2471 ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof,
2472 ));
2473 }
2474 let start_contents = self.pos();
2475 if !is_valid_char(self.char()) {
2476 self.parser().pos.set(start);
2477 return Ok(None);
2478 }
2479
2480 let mut scratch = self.parser().scratch.borrow_mut();
2482 scratch.clear();
2483 while !self.is_eof() && is_valid_char(self.char()) {
2484 scratch.push(self.char());
2485 self.bump_and_bump_space();
2486 }
2487 if self.is_eof() || self.char() != '}' {
2488 return Err(self.error(
2489 Span::new(start, self.pos()),
2490 ast::ErrorKind::SpecialWordBoundaryUnclosed,
2491 ));
2492 }
2493 let end = self.pos();
2494 self.bump();
2495 let kind = match scratch.as_str() {
2496 "start" => ast::AssertionKind::WordBoundaryStart,
2497 "end" => ast::AssertionKind::WordBoundaryEnd,
2498 "start-half" => ast::AssertionKind::WordBoundaryStartHalf,
2499 "end-half" => ast::AssertionKind::WordBoundaryEndHalf,
2500 _ => {
2501 return Err(self.error(
2502 Span::new(start_contents, end),
2503 ast::ErrorKind::SpecialWordBoundaryUnrecognized,
2504 ))
2505 }
2506 };
2507 Ok(Some(kind))
2508 }
2509
2510 #[inline(never)]
2511 fn parse_octal(&self) -> Literal {
2512 assert!(self.parser().octal);
2513 assert!('0' <= self.char() && self.char() <= '7');
2514 let start = self.pos();
2515 while self.bump()
2517 && '0' <= self.char()
2518 && self.char() <= '7'
2519 && self.pos().offset - start.offset <= 2
2520 {}
2521 let end = self.pos();
2522 let octal = &self.pattern()[start.offset..end.offset];
2523 let codepoint = u32::from_str_radix(octal, 8).expect("valid octal number");
2526 let c = char::from_u32(codepoint).expect("Unicode scalar value");
2529 Literal {
2530 span: Span::new(start, end),
2531 kind: LiteralKind::Octal,
2532 c,
2533 }
2534 }
2535
2536 #[inline(never)]
2537 fn parse_hex(&self) -> Result<Literal> {
2538 assert!(self.char() == 'x' || self.char() == 'u' || self.char() == 'U');
2539
2540 let hex_kind = match self.char() {
2541 'x' => HexLiteralKind::X,
2542 'u' => HexLiteralKind::UnicodeShort,
2543 _ => HexLiteralKind::UnicodeLong,
2544 };
2545 if !self.bump_and_bump_space() {
2546 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2547 }
2548 if self.char() == '{' {
2549 self.parse_hex_brace(hex_kind)
2550 } else {
2551 self.parse_hex_digits(hex_kind)
2552 }
2553 }
2554
2555 #[inline(never)]
2556 fn parse_hex_digits(&self, kind: HexLiteralKind) -> Result<Literal> {
2557 let mut scratch = self.parser().scratch.borrow_mut();
2558 scratch.clear();
2559
2560 let start = self.pos();
2561 for i in 0..kind.digits() {
2562 if i > 0 && !self.bump_and_bump_space() {
2563 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2564 }
2565 if !is_hex(self.char()) {
2566 return Err(self.error(self.span_char(), ast::ErrorKind::EscapeHexInvalidDigit));
2567 }
2568 scratch.push(self.char());
2569 }
2570 self.bump_and_bump_space();
2571 let end = self.pos();
2572 let hex = scratch.as_str();
2573 match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
2574 None => Err(self.error(Span::new(start, end), ast::ErrorKind::EscapeHexInvalid)),
2575 Some(c) => Ok(Literal {
2576 span: Span::new(start, end),
2577 kind: LiteralKind::HexFixed(kind),
2578 c,
2579 }),
2580 }
2581 }
2582
2583 #[inline(never)]
2584 fn parse_hex_brace(&self, kind: HexLiteralKind) -> Result<Literal> {
2585 let mut scratch = self.parser().scratch.borrow_mut();
2586 scratch.clear();
2587
2588 let brace_pos = self.pos();
2589 let start = self.span_char().end;
2590 while self.bump_and_bump_space() && self.char() != '}' {
2591 if !is_hex(self.char()) {
2592 return Err(self.error(self.span_char(), ast::ErrorKind::EscapeHexInvalidDigit));
2593 }
2594 scratch.push(self.char());
2595 }
2596 if self.is_eof() {
2597 return Err(self.error(
2598 Span::new(brace_pos, self.pos()),
2599 ast::ErrorKind::EscapeUnexpectedEof,
2600 ));
2601 }
2602 let end = self.pos();
2603 let hex = scratch.as_str();
2604 assert_eq!(self.char(), '}');
2605 self.bump_and_bump_space();
2606
2607 if hex.is_empty() {
2608 return Err(self.error(
2609 Span::new(brace_pos, self.pos()),
2610 ast::ErrorKind::EscapeHexEmpty,
2611 ));
2612 }
2613 match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
2614 None => Err(self.error(Span::new(start, end), ast::ErrorKind::EscapeHexInvalid)),
2615 Some(c) => Ok(Literal {
2616 span: Span::new(start, self.pos()),
2617 kind: LiteralKind::HexBrace(kind),
2618 c,
2619 }),
2620 }
2621 }
2622
2623 fn parse_decimal(&self) -> Result<u32> {
2624 let mut scratch = self.parser().scratch.borrow_mut();
2625 scratch.clear();
2626
2627 while !self.is_eof() && self.char().is_whitespace() {
2628 self.bump();
2629 }
2630 let start = self.pos();
2631 while !self.is_eof() && '0' <= self.char() && self.char() <= '9' {
2632 scratch.push(self.char());
2633 self.bump_and_bump_space();
2634 }
2635 let span = Span::new(start, self.pos());
2636 while !self.is_eof() && self.char().is_whitespace() {
2637 self.bump_and_bump_space();
2638 }
2639 let digits = scratch.as_str();
2640 if digits.is_empty() {
2641 return Err(self.error(span, ast::ErrorKind::DecimalEmpty));
2642 }
2643 match digits.parse::<u32>().ok() {
2644 Some(n) => Ok(n),
2645 None => Err(self.error(span, ast::ErrorKind::DecimalInvalid)),
2646 }
2647 }
2648
2649 #[inline(never)]
2650 fn parse_set_class(&self) -> Result<ClassBracketed> {
2651 assert_eq!(self.char(), '[');
2652
2653 let mut union = ClassSetUnion {
2654 span: self.span(),
2655 items: vec![],
2656 };
2657 loop {
2658 self.bump_space();
2659 if self.is_eof() {
2660 return Err(self.unclosed_class_error());
2661 }
2662 match self.char() {
2663 '[' => {
2664 if !self.parser().stack_class.borrow().is_empty() {
2665 if let Some(cls) = self.maybe_parse_ascii_class() {
2666 union.push(ClassSetItem::Ascii(cls));
2667 continue;
2668 }
2669 }
2670 union = self.push_class_open(union)?;
2671 }
2672 ']' => match self.pop_class(union)? {
2673 Either::Left(nested_union) => {
2674 union = nested_union;
2675 }
2676 Either::Right(class) => return Ok(class),
2677 },
2678 '&' if self.peek() == Some('&') => {
2679 assert!(self.bump_if("&&"));
2680 union = self.push_class_op(ClassSetBinaryOpKind::Intersection, union);
2681 }
2682 '-' if self.peek() == Some('-') => {
2683 assert!(self.bump_if("--"));
2684 union = self.push_class_op(ClassSetBinaryOpKind::Difference, union);
2685 }
2686 '~' if self.peek() == Some('~') => {
2687 assert!(self.bump_if("~~"));
2688 union = self.push_class_op(ClassSetBinaryOpKind::SymmetricDifference, union);
2689 }
2690 _ => {
2691 union.push(self.parse_set_class_range()?);
2692 }
2693 }
2694 }
2695 }
2696
2697 #[inline(never)]
2698 fn parse_set_class_range(&self) -> Result<ClassSetItem> {
2699 let prim1 = self.parse_set_class_item()?;
2700 self.bump_space();
2701 if self.is_eof() {
2702 return Err(self.unclosed_class_error());
2703 }
2704 if self.char() != '-' || self.peek_space() == Some(']') || self.peek_space() == Some('-') {
2705 return prim1.into_class_set_item(self);
2706 }
2707 if !self.bump_and_bump_space() {
2708 return Err(self.unclosed_class_error());
2709 }
2710 let prim2 = self.parse_set_class_item()?;
2711 let range = ClassSetRange {
2712 span: Span::new(prim1.span().start, prim2.span().end),
2713 start: prim1.into_class_literal(self)?,
2714 end: prim2.into_class_literal(self)?,
2715 };
2716 if !range.is_valid() {
2717 return Err(self.error(range.span, ast::ErrorKind::ClassRangeInvalid));
2718 }
2719 Ok(ClassSetItem::Range(range))
2720 }
2721
2722 #[inline(never)]
2723 fn parse_set_class_item(&self) -> Result<Primitive> {
2724 if self.char() == '\\' {
2725 self.parse_escape()
2726 } else {
2727 let x = Primitive::Literal(Literal {
2728 span: self.span_char(),
2729 kind: LiteralKind::Verbatim,
2730 c: self.char(),
2731 });
2732 self.bump();
2733 Ok(x)
2734 }
2735 }
2736
2737 #[inline(never)]
2738 fn parse_set_class_open(&self) -> Result<(ClassBracketed, ClassSetUnion)> {
2739 assert_eq!(self.char(), '[');
2740 let start = self.pos();
2741 if !self.bump_and_bump_space() {
2742 return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2743 }
2744
2745 let negated = if self.char() != '^' {
2746 false
2747 } else {
2748 if !self.bump_and_bump_space() {
2749 return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2750 }
2751 true
2752 };
2753 let mut union = ClassSetUnion {
2755 span: self.span(),
2756 items: vec![],
2757 };
2758 while self.char() == '-' {
2759 union.push(ClassSetItem::Literal(Literal {
2760 span: self.span_char(),
2761 kind: LiteralKind::Verbatim,
2762 c: '-',
2763 }));
2764 if !self.bump_and_bump_space() {
2765 return Err(self.error(Span::new(start, start), ast::ErrorKind::ClassUnclosed));
2766 }
2767 }
2768 if union.items.is_empty() && self.char() == ']' {
2771 union.push(ClassSetItem::Literal(Literal {
2772 span: self.span_char(),
2773 kind: LiteralKind::Verbatim,
2774 c: ']',
2775 }));
2776 if !self.bump_and_bump_space() {
2777 return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2778 }
2779 }
2780 let set = ClassBracketed {
2781 span: Span::new(start, self.pos()),
2782 negated,
2783 kind: ClassSet::union(ClassSetUnion {
2784 span: Span::new(union.span.start, union.span.start),
2785 items: vec![],
2786 }),
2787 };
2788 Ok((set, union))
2789 }
2790
2791 #[inline(never)]
2792 fn maybe_parse_ascii_class(&self) -> Option<ClassAscii> {
2793 assert_eq!(self.char(), '[');
2794 let start = self.pos();
2796 let mut negated = false;
2797 if !self.bump() || self.char() != ':' {
2798 self.parser().pos.set(start);
2799 return None;
2800 }
2801 if !self.bump() {
2802 self.parser().pos.set(start);
2803 return None;
2804 }
2805 if self.char() == '^' {
2806 negated = true;
2807 if !self.bump() {
2808 self.parser().pos.set(start);
2809 return None;
2810 }
2811 }
2812 let name_start = self.offset();
2813 while self.char() != ':' && self.bump() {}
2814 if self.is_eof() {
2815 self.parser().pos.set(start);
2816 return None;
2817 }
2818 let name = &self.pattern()[name_start..self.offset()];
2819 if !self.bump_if(":]") {
2820 self.parser().pos.set(start);
2821 return None;
2822 }
2823 let kind = match regex_syntax::ast::ClassAsciiKind::from_name(name) {
2824 Some(kind) => kind,
2825 None => {
2826 self.parser().pos.set(start);
2827 return None;
2828 }
2829 };
2830 Some(ClassAscii {
2831 span: Span::new(start, self.pos()),
2832 kind,
2833 negated,
2834 })
2835 }
2836
2837 #[inline(never)]
2838 fn parse_unicode_class(&self) -> Result<ClassUnicode> {
2839 assert!(self.char() == 'p' || self.char() == 'P');
2840
2841 let mut scratch = self.parser().scratch.borrow_mut();
2842 scratch.clear();
2843
2844 let negated = self.char() == 'P';
2845 if !self.bump_and_bump_space() {
2846 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2847 }
2848 let (start, kind) = if self.char() == '{' {
2849 let start = self.span_char().end;
2850 while self.bump_and_bump_space() && self.char() != '}' {
2851 scratch.push(self.char());
2852 }
2853 if self.is_eof() {
2854 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2855 }
2856 assert_eq!(self.char(), '}');
2857 self.bump();
2858
2859 let name = scratch.as_str();
2860 if let Some(i) = name.find("!=") {
2861 (
2862 start,
2863 ClassUnicodeKind::NamedValue {
2864 op: ClassUnicodeOpKind::NotEqual,
2865 name: name[..i].to_string(),
2866 value: name[i + 2..].to_string(),
2867 },
2868 )
2869 } else if let Some(i) = name.find(':') {
2870 (
2871 start,
2872 ClassUnicodeKind::NamedValue {
2873 op: ClassUnicodeOpKind::Colon,
2874 name: name[..i].to_string(),
2875 value: name[i + 1..].to_string(),
2876 },
2877 )
2878 } else if let Some(i) = name.find('=') {
2879 (
2880 start,
2881 ClassUnicodeKind::NamedValue {
2882 op: ClassUnicodeOpKind::Equal,
2883 name: name[..i].to_string(),
2884 value: name[i + 1..].to_string(),
2885 },
2886 )
2887 } else {
2888 (start, ClassUnicodeKind::Named(name.to_string()))
2889 }
2890 } else {
2891 let start = self.pos();
2892 let c = self.char();
2893 if c == '\\' {
2894 return Err(self.error(self.span_char(), ast::ErrorKind::UnicodeClassInvalid));
2895 }
2896 self.bump_and_bump_space();
2897 let kind = ClassUnicodeKind::OneLetter(c);
2898 (start, kind)
2899 };
2900 Ok(ClassUnicode {
2901 span: Span::new(start, self.pos()),
2902 negated,
2903 kind,
2904 })
2905 }
2906
2907 #[inline(never)]
2908 fn parse_perl_class(&self) -> ClassPerl {
2909 let c = self.char();
2910 let span = self.span_char();
2911 self.bump();
2912 let (negated, kind) = match c {
2913 'd' => (false, regex_syntax::ast::ClassPerlKind::Digit),
2914 'D' => (true, regex_syntax::ast::ClassPerlKind::Digit),
2915 's' => (false, regex_syntax::ast::ClassPerlKind::Space),
2916 'S' => (true, regex_syntax::ast::ClassPerlKind::Space),
2917 'w' => (false, regex_syntax::ast::ClassPerlKind::Word),
2918 'W' => (true, regex_syntax::ast::ClassPerlKind::Word),
2919 c => panic!("expected valid Perl class but got '{}'", c),
2920 };
2921 ClassPerl {
2922 span,
2923 kind,
2924 negated,
2925 }
2926 }
2927}
2928
2929fn is_universal_perl_pair(item: ®ex_syntax::ast::ClassSetItem) -> bool {
2932 use regex_syntax::ast::ClassSetItem;
2933 let items = match item {
2934 ClassSetItem::Union(u) => &u.items,
2935 _ => return false,
2936 };
2937 if items.len() != 2 {
2938 return false;
2939 }
2940 match (&items[0], &items[1]) {
2941 (ClassSetItem::Perl(a), ClassSetItem::Perl(b)) => {
2942 let is_all = a.kind == b.kind && a.negated != b.negated;
2943 is_all
2944 }
2945 _ => false,
2946 }
2947}
2948
2949pub fn max_concat_length(ast: &ast::Ast) -> usize {
2950 match ast {
2951 ast::Ast::Empty(_)
2952 | ast::Ast::Flags(_)
2953 | ast::Ast::Literal(_)
2954 | ast::Ast::Dot(_)
2955 | ast::Ast::Top(_)
2956 | ast::Ast::Assertion(_)
2957 | ast::Ast::ClassUnicode(_)
2958 | ast::Ast::ClassPerl(_)
2959 | ast::Ast::ClassBracketed(_) => 0,
2960 ast::Ast::Group(g) => max_concat_length(&g.ast),
2961 ast::Ast::Complement(c) => max_concat_length(&c.ast),
2962 ast::Ast::Lookaround(l) => max_concat_length(&l.ast),
2963 ast::Ast::Repetition(r) => max_concat_length(&r.ast),
2964 ast::Ast::Concat(c) => c
2965 .asts
2966 .len()
2967 .max(c.asts.iter().map(max_concat_length).max().unwrap_or(0)),
2968 ast::Ast::Alternation(a) => a.asts.iter().map(max_concat_length).max().unwrap_or(0),
2969 ast::Ast::Intersection(i) => i.asts.iter().map(max_concat_length).max().unwrap_or(0),
2970 }
2971}
2972
2973pub fn expanded_ast_size(ast: &ast::Ast, limit: u64) -> u64 {
2974 fn go(ast: &ast::Ast, limit: u64) -> u64 {
2975 match ast {
2976 ast::Ast::Empty(_) | ast::Ast::Flags(_) => 1,
2977 ast::Ast::Literal(_) | ast::Ast::Dot(_) | ast::Ast::Top(_) => 1,
2978 ast::Ast::Assertion(_) => 1,
2979 ast::Ast::ClassUnicode(_) | ast::Ast::ClassPerl(_) | ast::Ast::ClassBracketed(_) => 1,
2980 ast::Ast::Group(g) => go(&g.ast, limit).saturating_add(1).min(limit),
2981 ast::Ast::Complement(c) => go(&c.ast, limit).saturating_add(1).min(limit),
2982 ast::Ast::Lookaround(l) => go(&l.ast, limit).saturating_add(1).min(limit),
2983 ast::Ast::Concat(c) => sum_children(&c.asts, limit),
2984 ast::Ast::Alternation(a) => sum_children(&a.asts, limit),
2985 ast::Ast::Intersection(i) => sum_children(&i.asts, limit),
2986 ast::Ast::Repetition(r) => {
2987 let body = go(&r.ast, limit);
2988 let factor: u64 = match &r.op.kind {
2989 ast::RepetitionKind::ZeroOrOne => 2,
2990 ast::RepetitionKind::ZeroOrMore | ast::RepetitionKind::OneOrMore => 2,
2991 ast::RepetitionKind::Range(ast::RepetitionRange::Exactly(n)) => {
2992 (*n as u64).max(1)
2993 }
2994 ast::RepetitionKind::Range(ast::RepetitionRange::AtLeast(n)) => {
2995 (*n as u64).max(1).saturating_add(1)
2996 }
2997 ast::RepetitionKind::Range(ast::RepetitionRange::Bounded(_, m)) => {
2998 (*m as u64).max(1)
2999 }
3000 };
3001 body.saturating_mul(factor).min(limit)
3002 }
3003 }
3004 }
3005 fn sum_children(children: &[ast::Ast], limit: u64) -> u64 {
3006 let mut total: u64 = 0;
3007 for c in children {
3008 total = total.saturating_add(go(c, limit));
3009 if total >= limit {
3010 return limit;
3011 }
3012 }
3013 total
3014 }
3015 go(ast, limit)
3016}
3017
3018pub fn parse_ast<'s>(tb: &mut TB<'s>, pattern: &'s str) -> std::result::Result<NodeId, ParseError> {
3019 let mut p: ResharpParser<'s> = ResharpParser::new(pattern);
3020 p.parse(tb)
3021}
3022
3023pub fn parse_ast_with<'s>(
3024 tb: &mut TB<'s>,
3025 pattern: &'s str,
3026 flags: &PatternFlags,
3027) -> std::result::Result<NodeId, ParseError> {
3028 let mut p: ResharpParser<'s> = ResharpParser::with_flags(pattern, flags);
3029 p.parse(tb)
3030}
3031
3032pub fn parse_to_ast(pattern: &str) -> std::result::Result<ast::Ast, ParseError> {
3034 let mut p: ResharpParser = ResharpParser::new(pattern);
3035 p.parse_inner()
3036}