1#![warn(dead_code)]
6pub mod ast;
7use std::cell::{Cell, RefCell};
8
9use ast::{Ast, Concat, ErrorKind, GroupKind, LookaroundKind, RepetitionKind};
10use regex_syntax::{
11 ast::{
12 ClassAscii, ClassBracketed, ClassPerl, ClassSet, ClassSetBinaryOpKind, ClassSetItem,
13 ClassSetRange, ClassSetUnion, ClassUnicode, ClassUnicodeKind, ClassUnicodeOpKind,
14 HexLiteralKind, Literal, LiteralKind, Position, Span, SpecialLiteralKind,
15 },
16 hir::{
17 self,
18 translate::{Translator, TranslatorBuilder},
19 },
20 utf8::Utf8Sequences,
21};
22use resharp_algebra::{Kind, NodeId};
23
24type TB<'s> = resharp_algebra::RegexBuilder;
25
26pub struct PatternFlags {
28 pub unicode: bool,
30 pub full_unicode: bool,
32 pub case_insensitive: bool,
34 pub dot_matches_new_line: bool,
36 pub multiline: bool,
38 pub ignore_whitespace: bool,
40 pub ascii_perl_classes: bool,
43 pub expanded_ast_limit: u64,
46 pub max_list_len: usize,
49 pub max_repeat: u32,
51 pub max_depth: usize,
52}
53
54pub const DEFAULT_MAX_REPEAT: u32 = 500;
57pub const DEFAULT_EXPANDED_AST_LIMIT: u64 = 50_000;
58pub const DEFAULT_MAX_LIST_LEN: usize = 4_000;
59pub const DEFAULT_MAX_DEPTH: usize = 1_000;
60
61impl Default for PatternFlags {
62 fn default() -> Self {
63 Self {
64 unicode: true,
65 full_unicode: false,
66 case_insensitive: false,
67 dot_matches_new_line: false,
68 multiline: true,
69 ignore_whitespace: false,
70 ascii_perl_classes: false,
71 expanded_ast_limit: DEFAULT_EXPANDED_AST_LIMIT,
72 max_list_len: DEFAULT_MAX_LIST_LEN,
73 max_repeat: DEFAULT_MAX_REPEAT,
74 max_depth: DEFAULT_MAX_DEPTH,
75 }
76 }
77}
78
79#[derive(Clone, Copy, PartialEq, Debug)]
80enum WordCharKind {
81 Word,
82 NonWord,
83 MaybeWord,
84 MaybeNonWord,
85 Unknown,
86 Edge,
87}
88
89fn is_word_byte(b: u8) -> bool {
90 b.is_ascii_alphanumeric() || b == b'_'
91}
92
93fn class_set_item_word_kind(item: ®ex_syntax::ast::ClassSetItem) -> WordCharKind {
94 use regex_syntax::ast::{ClassPerlKind, ClassSetItem};
95 use WordCharKind::*;
96 match item {
97 ClassSetItem::Empty(_) => Unknown,
98 ClassSetItem::Literal(l) => {
99 if is_word_byte(l.c as u8) {
100 Word
101 } else {
102 NonWord
103 }
104 }
105 ClassSetItem::Range(r) => {
106 let all_word = (r.start.c as u8..=r.end.c as u8).all(is_word_byte);
107 let all_non = (r.start.c as u8..=r.end.c as u8).all(|b| !is_word_byte(b));
108 if all_word {
109 Word
110 } else if all_non {
111 NonWord
112 } else {
113 Unknown
114 }
115 }
116 ClassSetItem::Perl(p) => match (&p.kind, p.negated) {
117 (ClassPerlKind::Word, false) => Word,
118 (ClassPerlKind::Word, true) => NonWord,
119 (ClassPerlKind::Space, false) => NonWord,
120 (ClassPerlKind::Digit, false) => Word,
121 _ => Unknown,
122 },
123 ClassSetItem::Bracketed(b) => class_bracketed_word_kind(b),
124 ClassSetItem::Union(u) => {
125 let mut kind = Unknown;
126 for item in &u.items {
127 let k = class_set_item_word_kind(item);
128 kind = match (kind, k) {
129 (_, Unknown) => return Unknown,
130 (Unknown, _) => k,
131 (Word, Word) => Word,
132 (NonWord, NonWord) => NonWord,
133 _ => return Unknown,
134 };
135 }
136 kind
137 }
138 _ => Unknown,
139 }
140}
141
142fn class_bracketed_word_kind(c: ®ex_syntax::ast::ClassBracketed) -> WordCharKind {
143 use regex_syntax::ast::{ClassPerlKind, ClassSet, ClassSetItem};
144 use WordCharKind::*;
145 if c.negated {
146 return match &c.kind {
147 ClassSet::Item(ClassSetItem::Perl(p)) if p.kind == ClassPerlKind::Word => {
148 if p.negated {
149 Word
150 } else {
151 NonWord
152 }
153 }
154 _ => Unknown,
155 };
156 }
157 match &c.kind {
158 ClassSet::Item(item) => class_set_item_word_kind(item),
159 ClassSet::BinaryOp(_) => Unknown,
160 }
161}
162
163fn ascii_class_lit(span: Span, c: char) -> regex_syntax::ast::Literal {
164 regex_syntax::ast::Literal {
165 span,
166 kind: regex_syntax::ast::LiteralKind::Verbatim,
167 c,
168 }
169}
170
171fn ascii_class_range(span: Span, a: char, b: char) -> regex_syntax::ast::ClassSetItem {
172 regex_syntax::ast::ClassSetItem::Range(regex_syntax::ast::ClassSetRange {
173 span,
174 start: ascii_class_lit(span, a),
175 end: ascii_class_lit(span, b),
176 })
177}
178
179fn ascii_perl_positive(
180 span: Span,
181 kind: ®ex_syntax::ast::ClassPerlKind,
182) -> regex_syntax::ast::ClassSetItem {
183 use regex_syntax::ast::{ClassPerlKind, ClassSetItem, ClassSetUnion};
184 match kind {
185 ClassPerlKind::Digit => ascii_class_range(span, '0', '9'),
186 ClassPerlKind::Word => ClassSetItem::Union(ClassSetUnion {
187 span,
188 items: vec![
189 ascii_class_range(span, 'a', 'z'),
190 ascii_class_range(span, 'A', 'Z'),
191 ascii_class_range(span, '0', '9'),
192 ClassSetItem::Literal(ascii_class_lit(span, '_')),
193 ],
194 }),
195 ClassPerlKind::Space => ClassSetItem::Union(ClassSetUnion {
196 span,
197 items: ['\t', '\n', '\x0B', '\x0C', '\r', ' ']
198 .into_iter()
199 .map(|c| ClassSetItem::Literal(ascii_class_lit(span, c)))
200 .collect(),
201 }),
202 }
203}
204
205fn ascii_perl_set_item(
206 span: Span,
207 kind: ®ex_syntax::ast::ClassPerlKind,
208 negated: bool,
209) -> regex_syntax::ast::ClassSetItem {
210 use regex_syntax::ast::{ClassBracketed, ClassSet, ClassSetItem};
211 let positive = ascii_perl_positive(span, kind);
212 if negated {
213 ClassSetItem::Bracketed(Box::new(ClassBracketed {
214 span,
215 negated: true,
216 kind: ClassSet::Item(positive),
217 }))
218 } else {
219 positive
220 }
221}
222
223fn rewrite_ascii_perl_set(set: ®ex_syntax::ast::ClassSet) -> regex_syntax::ast::ClassSet {
224 use regex_syntax::ast::{ClassSet, ClassSetBinaryOp};
225 match set {
226 ClassSet::Item(item) => ClassSet::Item(rewrite_ascii_perl_item(item)),
227 ClassSet::BinaryOp(op) => ClassSet::BinaryOp(ClassSetBinaryOp {
228 span: op.span,
229 kind: op.kind.clone(),
230 lhs: Box::new(rewrite_ascii_perl_set(&op.lhs)),
231 rhs: Box::new(rewrite_ascii_perl_set(&op.rhs)),
232 }),
233 }
234}
235
236fn rewrite_ascii_perl_item(
237 item: ®ex_syntax::ast::ClassSetItem,
238) -> regex_syntax::ast::ClassSetItem {
239 use regex_syntax::ast::{ClassBracketed, ClassSetItem, ClassSetUnion};
240 match item {
241 ClassSetItem::Perl(p) => ascii_perl_set_item(p.span, &p.kind, p.negated),
242 ClassSetItem::Union(u) => ClassSetItem::Union(ClassSetUnion {
243 span: u.span,
244 items: u.items.iter().map(rewrite_ascii_perl_item).collect(),
245 }),
246 ClassSetItem::Bracketed(b) => ClassSetItem::Bracketed(Box::new(ClassBracketed {
247 span: b.span,
248 negated: b.negated,
249 kind: rewrite_ascii_perl_set(&b.kind),
250 })),
251 other => other.clone(),
252 }
253}
254
255#[derive(Clone, Debug, Eq, PartialEq)]
256enum Primitive {
257 Literal(Literal),
258 Assertion(ast::Assertion),
259 Dot(Span),
260 Top(Span),
261 Perl(ClassPerl),
262 Unicode(ClassUnicode),
263}
264
265impl Primitive {
266 fn span(&self) -> &Span {
267 match *self {
268 Primitive::Literal(ref x) => &x.span,
269 Primitive::Assertion(ref x) => &x.span,
270 Primitive::Dot(ref span) => span,
271 Primitive::Top(ref span) => span,
272 Primitive::Perl(ref x) => &x.span,
273 Primitive::Unicode(ref x) => &x.span,
274 }
275 }
276
277 fn into_ast(self) -> Ast {
278 match self {
279 Primitive::Literal(lit) => Ast::literal(lit),
280 Primitive::Assertion(assert) => Ast::assertion(assert),
281 Primitive::Dot(span) => Ast::dot(span),
282 Primitive::Top(span) => Ast::top(span),
283 Primitive::Perl(cls) => Ast::class_perl(cls),
284 Primitive::Unicode(cls) => Ast::class_unicode(cls),
285 }
286 }
287
288 fn into_class_set_item(self, p: &ResharpParser) -> Result<regex_syntax::ast::ClassSetItem> {
289 use self::Primitive::*;
290 use regex_syntax::ast::ClassSetItem;
291
292 match self {
293 Literal(lit) => Ok(ClassSetItem::Literal(lit)),
294 Perl(cls) => Ok(ClassSetItem::Perl(cls)),
295 Unicode(cls) => Ok(ClassSetItem::Unicode(cls)),
296 x => Err(p.error(*x.span(), ast::ErrorKind::ClassEscapeInvalid)),
297 }
298 }
299
300 fn into_class_literal(self, p: &ResharpParser) -> Result<Literal> {
301 use self::Primitive::*;
302
303 match self {
304 Literal(lit) => Ok(lit),
305 x => Err(p.error(*x.span(), ast::ErrorKind::ClassRangeLiteral)),
306 }
307 }
308}
309
310#[derive(Clone, Debug, Eq, PartialEq)]
311pub enum Either<Left, Right> {
312 Left(Left),
313 Right(Right),
314}
315
316#[derive(Clone, Debug, Eq, PartialEq)]
317pub struct ParseError {
318 pub kind: ErrorKind,
320 pattern: String,
323 pub span: Span,
325}
326
327impl std::fmt::Display for ParseError {
328 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
329 write!(f, "{:?}: {:?}", self.kind, self.span)
330 }
331}
332impl std::error::Error for ParseError {}
333
334type Result<T> = core::result::Result<T, ParseError>;
335
336#[derive(Clone, Debug)]
337enum GroupState {
338 Group {
340 concat: Concat,
342 group: ast::Group,
344 ignore_whitespace: bool,
346 },
347 Alternation(ast::Alternation),
348 Intersection(ast::Intersection),
349}
350
351#[derive(Clone, Debug)]
352enum ClassState {
353 Open {
355 union: regex_syntax::ast::ClassSetUnion,
357 set: regex_syntax::ast::ClassBracketed,
358 },
359 Op {
362 kind: regex_syntax::ast::ClassSetBinaryOpKind,
364 lhs: regex_syntax::ast::ClassSet,
366 },
367}
368
369pub struct ResharpParser<'s> {
371 perl_classes: Vec<(bool, regex_syntax::ast::ClassPerlKind, NodeId)>,
372 unicode_classes: resharp_algebra::UnicodeClassCache,
373 pub translator: regex_syntax::hir::translate::Translator,
374 pub pattern: &'s str,
375 pos: Cell<Position>,
376 capture_index: Cell<u32>,
377 octal: bool,
378 empty_min_range: bool,
379 ignore_whitespace: Cell<bool>,
380 dot_all: Cell<bool>,
381 multiline: Cell<bool>,
382 global_unicode: bool,
383 global_full_unicode: bool,
384 global_ascii_perl: bool,
385 global_case_insensitive: bool,
386 expanded_ast_limit: u64,
387 max_list_len: usize,
388 max_repeat: u32,
389 max_depth: usize,
390 comments: RefCell<Vec<ast::Comment>>,
391 stack_group: RefCell<Vec<GroupState>>,
392 stack_class: RefCell<Vec<ClassState>>,
393 capture_names: RefCell<Vec<ast::CaptureName>>,
394 scratch: RefCell<String>,
395}
396
397fn specialize_err<T>(result: Result<T>, from: ast::ErrorKind, to: ast::ErrorKind) -> Result<T> {
398 result.map_err(|e| {
399 if e.kind == from {
400 ParseError {
401 kind: to,
402 pattern: e.pattern,
403 span: e.span,
404 }
405 } else {
406 e
407 }
408 })
409}
410
411fn is_capture_char(c: char, first: bool) -> bool {
412 if first {
413 c == '_' || c.is_alphabetic()
414 } else {
415 c == '_' || c == '.' || c == '[' || c == ']' || c.is_alphanumeric()
416 }
417}
418
419pub fn is_meta_character(c: char) -> bool {
420 matches!(
421 c,
422 '\\' | '.'
423 | '+'
424 | '*'
425 | '?'
426 | '('
427 | ')'
428 | '|'
429 | '['
430 | ']'
431 | '{'
432 | '}'
433 | '^'
434 | '$'
435 | '#'
436 | '&'
437 | '-'
438 | '~'
439 | '_'
440 )
441}
442
443pub fn escape(text: &str) -> String {
445 let mut buf = String::new();
446 escape_into(text, &mut buf);
447 buf
448}
449
450pub fn escape_into(text: &str, buf: &mut String) {
452 buf.reserve(text.len());
453 for c in text.chars() {
454 if is_meta_character(c) {
455 buf.push('\\');
456 }
457 buf.push(c);
458 }
459}
460
461pub fn is_escapeable_character(c: char) -> bool {
462 if is_meta_character(c) {
463 return true;
464 }
465 if !c.is_ascii() {
466 return false;
467 }
468 match c {
469 '0'..='9' | 'A'..='Z' | 'a'..='z' => false,
470 '<' | '>' => false,
471 _ => true,
472 }
473}
474
475fn is_hex(c: char) -> bool {
476 c.is_ascii_digit() || ('a'..='f').contains(&c) || ('A'..='F').contains(&c)
477}
478
479fn ensure_lookbehind_at_start(ast: &Ast, at_start: bool) -> core::result::Result<(), Span> {
480 match ast {
481 Ast::Concat(c) => {
482 let mut child_at_start = at_start;
483 for child in &c.asts {
484 ensure_lookbehind_at_start(child, child_at_start)?;
485 if ast_may_consume(child) {
486 child_at_start = false;
487 }
488 }
489 Ok(())
490 }
491 Ast::Alternation(a) => {
492 for child in &a.asts {
493 ensure_lookbehind_at_start(child, at_start)?;
494 }
495 Ok(())
496 }
497 Ast::Intersection(i) => {
498 for child in &i.asts {
499 ensure_lookbehind_at_start(child, at_start)?;
500 }
501 Ok(())
502 }
503 Ast::Complement(c) => ensure_lookbehind_at_start(&c.ast, at_start),
504 Ast::Group(g) => ensure_lookbehind_at_start(&g.ast, at_start),
505 Ast::Repetition(r) => ensure_lookbehind_at_start(&r.ast, at_start),
506 Ast::Lookaround(g) => {
507 match g.kind {
508 LookaroundKind::PositiveLookbehind | LookaroundKind::NegativeLookbehind => {
509 if !at_start {
510 return Err(g.span);
511 }
512 }
513 LookaroundKind::PositiveLookahead | LookaroundKind::NegativeLookahead => {}
514 }
515 ensure_lookbehind_at_start(&g.ast, true)
516 }
517 Ast::Empty(_)
518 | Ast::Flags(_)
519 | Ast::Literal(_)
520 | Ast::Dot(_)
521 | Ast::Top(_)
522 | Ast::Assertion(_)
523 | Ast::ClassUnicode(_)
524 | Ast::ClassPerl(_)
525 | Ast::ClassBracketed(_) => Ok(()),
526 }
527}
528
529fn ast_may_consume(ast: &Ast) -> bool {
530 match ast {
531 Ast::Empty(_) | Ast::Flags(_) | Ast::Assertion(_) | Ast::Lookaround(_) => false,
532 Ast::Literal(_)
533 | Ast::Dot(_)
534 | Ast::Top(_)
535 | Ast::ClassUnicode(_)
536 | Ast::ClassPerl(_)
537 | Ast::ClassBracketed(_) => true,
538 Ast::Group(g) => ast_may_consume(&g.ast),
539 Ast::Repetition(r) => {
540 if !ast_may_consume(&r.ast) {
541 return false;
542 }
543 match r.op.kind {
544 RepetitionKind::ZeroOrOne
545 | RepetitionKind::ZeroOrMore
546 | RepetitionKind::OneOrMore => true,
547 RepetitionKind::Range(ast::RepetitionRange::Exactly(0)) => false,
548 RepetitionKind::Range(ast::RepetitionRange::Bounded(_, 0)) => false,
549 RepetitionKind::Range(_) => true,
550 }
551 }
552 Ast::Alternation(a) => a.asts.iter().any(ast_may_consume),
553 Ast::Intersection(i) => i.asts.iter().any(ast_may_consume),
554 Ast::Complement(_) => true,
555 Ast::Concat(c) => c.asts.iter().any(ast_may_consume),
556 }
557}
558
559impl<'s> ResharpParser<'s> {
560 fn default_translator_builder(&self) -> TranslatorBuilder {
561 let mut trb = TranslatorBuilder::new();
562 trb.unicode(self.global_unicode);
563 trb.utf8(false);
564 trb.case_insensitive(self.global_case_insensitive);
565 trb
566 }
567
568 pub fn new(pattern: &'s str) -> Self {
569 Self::with_flags(pattern, &PatternFlags::default())
570 }
571
572 pub fn with_flags(pattern: &'s str, flags: &PatternFlags) -> Self {
573 let mut trb = TranslatorBuilder::new();
574 trb.unicode(flags.unicode);
575 trb.utf8(false);
576 trb.case_insensitive(flags.case_insensitive);
577 Self {
578 translator: trb.build(),
579 pattern,
580 perl_classes: vec![],
581 unicode_classes: resharp_algebra::UnicodeClassCache::default(),
582 pos: Cell::new(Position::new(0, 0, 0)),
583 capture_index: Cell::new(0),
584 octal: false,
585 empty_min_range: false,
586 ignore_whitespace: Cell::new(flags.ignore_whitespace),
587 dot_all: Cell::new(flags.dot_matches_new_line),
588 multiline: Cell::new(flags.multiline),
589 global_unicode: flags.unicode || flags.full_unicode || flags.ascii_perl_classes,
590 global_full_unicode: flags.full_unicode,
591 global_ascii_perl: flags.ascii_perl_classes,
592 global_case_insensitive: flags.case_insensitive,
593 expanded_ast_limit: flags.expanded_ast_limit,
594 max_list_len: flags.max_list_len,
595 max_repeat: flags.max_repeat,
596 max_depth: flags.max_depth,
597 comments: RefCell::new(vec![]),
598 stack_group: RefCell::new(vec![]),
599 stack_class: RefCell::new(vec![]),
600 capture_names: RefCell::new(vec![]),
601 scratch: RefCell::new(String::new()),
602 }
603 }
604
605 fn parser(&'_ self) -> &'_ ResharpParser<'_> {
606 self
607 }
608
609 fn pattern(&self) -> &str {
610 self.pattern
611 }
612
613 fn error(&self, span: Span, kind: ast::ErrorKind) -> ParseError {
614 ParseError {
615 kind,
616 pattern: self.pattern().to_string(),
617 span,
618 }
619 }
620
621 fn unsupported_error(&self, _: regex_syntax::hir::Error) -> ParseError {
622 self.error(
623 Span::splat(self.pos()),
624 ast::ErrorKind::UnsupportedResharpRegex,
625 )
626 }
627
628 fn offset(&self) -> usize {
629 self.parser().pos.get().offset
630 }
631
632 fn line(&self) -> usize {
633 self.parser().pos.get().line
634 }
635
636 fn column(&self) -> usize {
637 self.parser().pos.get().column
638 }
639
640 fn next_capture_index(&self, span: Span) -> Result<u32> {
641 let current = self.parser().capture_index.get();
642 let i = current
643 .checked_add(1)
644 .ok_or_else(|| self.error(span, ast::ErrorKind::CaptureLimitExceeded))?;
645 self.parser().capture_index.set(i);
646 Ok(i)
647 }
648
649 fn add_capture_name(&self, cap: &ast::CaptureName) -> Result<()> {
650 let mut names = self.parser().capture_names.borrow_mut();
651 match names.binary_search_by_key(&cap.name.as_str(), |c| c.name.as_str()) {
652 Err(i) => {
653 names.insert(i, cap.clone());
654 Ok(())
655 }
656 Ok(i) => Err(self.error(
657 cap.span,
658 ast::ErrorKind::GroupNameDuplicate {
659 original: names[i].span,
660 },
661 )),
662 }
663 }
664
665 fn ignore_whitespace(&self) -> bool {
666 self.parser().ignore_whitespace.get()
667 }
668
669 fn char(&self) -> char {
670 self.char_at(self.offset())
671 }
672
673 fn char_at(&self, i: usize) -> char {
674 self.pattern()[i..]
675 .chars()
676 .next()
677 .unwrap_or_else(|| panic!("expected char at offset {}", i))
678 }
679
680 fn bump(&self) -> bool {
681 if self.is_eof() {
682 return false;
683 }
684 let Position {
685 mut offset,
686 mut line,
687 mut column,
688 } = self.pos();
689 if self.char() == '\n' {
690 line = line.checked_add(1).unwrap();
691 column = 1;
692 } else {
693 column = column.checked_add(1).unwrap();
694 }
695 offset += self.char().len_utf8();
696 self.parser().pos.set(Position {
697 offset,
698 line,
699 column,
700 });
701 self.pattern()[self.offset()..].chars().next().is_some()
702 }
703
704 fn bump_if(&self, prefix: &str) -> bool {
705 if self.pattern()[self.offset()..].starts_with(prefix) {
706 for _ in 0..prefix.chars().count() {
707 self.bump();
708 }
709 true
710 } else {
711 false
712 }
713 }
714
715 fn is_lookaround_prefix(&self) -> Option<(bool, bool)> {
716 if self.bump_if("?=") {
717 return Some((true, true));
718 }
719 if self.bump_if("?!") {
720 return Some((true, false));
721 }
722 if self.bump_if("?<=") {
723 return Some((false, true));
724 }
725 if self.bump_if("?<!") {
726 return Some((false, false));
727 }
728 None
729 }
730
731 fn bump_and_bump_space(&self) -> bool {
732 if !self.bump() {
733 return false;
734 }
735 self.bump_space();
736 !self.is_eof()
737 }
738
739 fn bump_space(&self) {
740 if !self.ignore_whitespace() {
741 return;
742 }
743 while !self.is_eof() {
744 if self.char().is_whitespace() {
745 self.bump();
746 } else if self.char() == '#' {
747 let start = self.pos();
748 let mut comment_text = String::new();
749 self.bump();
750 while !self.is_eof() {
751 let c = self.char();
752 self.bump();
753 if c == '\n' {
754 break;
755 }
756 comment_text.push(c);
757 }
758 let comment = ast::Comment {
759 span: Span::new(start, self.pos()),
760 comment: comment_text,
761 };
762 self.parser().comments.borrow_mut().push(comment);
763 } else {
764 break;
765 }
766 }
767 }
768
769 fn peek(&self) -> Option<char> {
770 if self.is_eof() {
771 return None;
772 }
773 self.pattern()[self.offset() + self.char().len_utf8()..]
774 .chars()
775 .next()
776 }
777
778 fn peek_space(&self) -> Option<char> {
781 if !self.ignore_whitespace() {
782 return self.peek();
783 }
784 if self.is_eof() {
785 return None;
786 }
787 let mut start = self.offset() + self.char().len_utf8();
788 let mut in_comment = false;
789 for (i, c) in self.pattern()[start..].char_indices() {
790 if c.is_whitespace() {
791 continue;
792 } else if !in_comment && c == '#' {
793 in_comment = true;
794 } else if in_comment && c == '\n' {
795 in_comment = false;
796 } else {
797 start += i;
798 break;
799 }
800 }
801 self.pattern()[start..].chars().next()
802 }
803
804 fn is_eof(&self) -> bool {
805 self.offset() == self.pattern().len()
806 }
807
808 fn pos(&self) -> Position {
809 self.parser().pos.get()
810 }
811
812 fn span(&self) -> Span {
813 Span::splat(self.pos())
814 }
815
816 fn span_char(&self) -> Span {
817 let mut next = Position {
818 offset: self.offset().checked_add(self.char().len_utf8()).unwrap(),
819 line: self.line(),
820 column: self.column().checked_add(1).unwrap(),
821 };
822 if self.char() == '\n' {
823 next.line += 1;
824 next.column = 1;
825 }
826 Span::new(self.pos(), next)
827 }
828
829 #[inline(never)]
830 fn push_alternate(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
831 assert_eq!(self.char(), '|');
832 concat.span.end = self.pos();
833 self.push_or_add_alternation(concat);
834 self.bump();
835 Ok(ast::Concat {
836 span: self.span(),
837 asts: vec![],
838 })
839 }
840
841 fn push_or_add_alternation(&self, concat: Concat) {
842 use self::GroupState::*;
843
844 let mut stack = self.parser().stack_group.borrow_mut();
845 if let Some(&mut Alternation(ref mut alts)) = stack.last_mut() {
846 alts.asts.push(concat.into_ast());
847 return;
848 }
849 stack.push(Alternation(ast::Alternation {
850 span: Span::new(concat.span.start, self.pos()),
851 asts: vec![concat.into_ast()],
852 }));
853 }
854
855 #[inline(never)]
856 fn push_intersect(&self, mut concat: Concat) -> Result<Concat> {
857 assert_eq!(self.char(), '&');
858 concat.span.end = self.pos();
859 self.push_or_add_intersect(concat);
860 self.bump();
861 Ok(Concat {
862 span: self.span(),
863 asts: vec![],
864 })
865 }
866
867 fn push_or_add_intersect(&self, concat: Concat) {
868 use self::GroupState::*;
869
870 let mut stack = self.parser().stack_group.borrow_mut();
871 if let Some(&mut Intersection(ref mut alts)) = stack.last_mut() {
872 alts.asts.push(concat.into_ast());
873 return;
874 }
875 stack.push(Intersection(ast::Intersection {
876 span: Span::new(concat.span.start, self.pos()),
877 asts: vec![concat.into_ast()],
878 }));
879 }
880
881 #[inline(never)]
882 fn push_group(&self, mut concat: Concat) -> Result<Concat> {
883 assert_eq!(self.char(), '(');
884 match self.parse_group()? {
885 Either::Left(set) => {
886 let ignore = set.flags.flag_state(ast::Flag::IgnoreWhitespace);
887 if let Some(v) = ignore {
888 self.parser().ignore_whitespace.set(v);
889 }
890
891 concat.asts.push(Ast::flags(set));
892 Ok(concat)
893 }
894 Either::Right(group) => {
895 let old_ignore_whitespace = self.ignore_whitespace();
896 let new_ignore_whitespace = group
897 .flags()
898 .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
899 .unwrap_or(old_ignore_whitespace);
900 self.parser()
901 .stack_group
902 .borrow_mut()
903 .push(GroupState::Group {
904 concat,
905 group,
906 ignore_whitespace: old_ignore_whitespace,
907 });
908 self.parser().ignore_whitespace.set(new_ignore_whitespace);
909 Ok(Concat {
910 span: self.span(),
911 asts: vec![],
912 })
913 }
914 }
915 }
916
917 #[inline(never)]
918 fn push_compl_group(&self, concat: Concat) -> Result<Concat> {
919 assert_eq!(self.char(), '~');
920 self.bump();
921 if self.is_eof() || self.char() != '(' {
922 return Err(self.error(self.span(), ast::ErrorKind::ComplementGroupExpected));
923 }
924 let open_span = self.span_char();
925 self.bump();
926 let group = ast::Group {
927 span: open_span,
928 kind: ast::GroupKind::Complement,
929 ast: Box::new(Ast::empty(self.span())),
930 };
931
932 let old_ignore_whitespace = self.ignore_whitespace();
933 let new_ignore_whitespace = group
934 .flags()
935 .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
936 .unwrap_or(old_ignore_whitespace);
937 self.parser()
938 .stack_group
939 .borrow_mut()
940 .push(GroupState::Group {
941 concat,
942 group,
943 ignore_whitespace: old_ignore_whitespace,
944 });
945 self.parser().ignore_whitespace.set(new_ignore_whitespace);
946 Ok(Concat {
947 span: self.span(),
948 asts: vec![],
949 })
950 }
951
952 #[inline(never)]
953 fn pop_group(&self, mut group_concat: Concat) -> Result<Concat> {
954 use self::GroupState::*;
955 assert_eq!(self.char(), ')');
956 let mut stack = self.parser().stack_group.borrow_mut();
957 let topstack = stack.pop();
958
959 let (mut prior_concat, mut group, ignore_whitespace, alt) = match topstack {
960 Some(Group {
961 concat,
962 group,
963 ignore_whitespace,
964 }) => (concat, group, ignore_whitespace, None),
965 Some(Alternation(alt)) => match stack.pop() {
966 Some(Group {
967 concat,
968 group,
969 ignore_whitespace,
970 }) => (
971 concat,
972 group,
973 ignore_whitespace,
974 Some(Either::Left::<ast::Alternation, ast::Intersection>(alt)),
975 ),
976 None | Some(Alternation(_)) | Some(Intersection(_)) => {
977 return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
978 }
979 },
980 Some(Intersection(int)) => match stack.pop() {
981 Some(Group {
982 concat,
983 group,
984 ignore_whitespace,
985 }) => (
986 concat,
987 group,
988 ignore_whitespace,
989 Some(Either::Right::<ast::Alternation, ast::Intersection>(int)),
990 ),
991 None | Some(Alternation(_)) | Some(Intersection(_)) => {
992 return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
993 }
994 },
995
996 None => {
997 return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
998 }
999 };
1000 self.parser().ignore_whitespace.set(ignore_whitespace);
1001 group_concat.span.end = self.pos();
1002 self.bump();
1003 group.span.end = self.pos();
1004 match alt {
1005 Some(Either::Left(mut alt)) => {
1006 alt.span.end = group_concat.span.end;
1007 alt.asts.push(group_concat.into_ast());
1008 group.ast = Box::new(alt.into_ast());
1009 }
1010 Some(Either::Right(mut int)) => {
1011 int.span.end = group_concat.span.end;
1012 int.asts.push(group_concat.into_ast());
1013 group.ast = Box::new(int.into_ast());
1014 }
1015 None => {
1016 group.ast = Box::new(group_concat.into_ast());
1017 }
1018 }
1019
1020 if group.kind == GroupKind::Complement {
1021 let complement = ast::Complement {
1022 span: self.span(),
1023 ast: group.ast,
1024 };
1025 prior_concat.asts.push(Ast::complement(complement));
1026 }
1027 else {
1029 prior_concat.asts.push(Ast::group(group));
1030 }
1031 Ok(prior_concat)
1032 }
1033
1034 #[inline(never)]
1035 fn pop_group_end(&self, mut concat: ast::Concat) -> Result<Ast> {
1036 concat.span.end = self.pos();
1037 let mut stack = self.parser().stack_group.borrow_mut();
1038 let ast = match stack.pop() {
1039 None => Ok(concat.into_ast()),
1040 Some(GroupState::Alternation(mut alt)) => {
1041 alt.span.end = self.pos();
1042 alt.asts.push(concat.into_ast());
1043 Ok(Ast::alternation(alt))
1044 }
1045 Some(GroupState::Intersection(mut int)) => {
1046 int.span.end = self.pos();
1047 int.asts.push(concat.into_ast());
1048
1049 Ok(Ast::intersection(int))
1050 }
1051 Some(GroupState::Group { group, .. }) => {
1052 return Err(self.error(group.span, ast::ErrorKind::GroupUnclosed));
1053 }
1054 };
1055 match stack.pop() {
1057 None => ast,
1058 Some(GroupState::Alternation(alt)) => {
1059 Err(self.error(alt.span, ast::ErrorKind::UnsupportedResharpRegex))
1060 }
1061 Some(GroupState::Intersection(int)) => {
1062 Err(self.error(int.span, ast::ErrorKind::UnsupportedResharpRegex))
1063 }
1064 Some(GroupState::Group { group, .. }) => {
1065 Err(self.error(group.span, ast::ErrorKind::GroupUnclosed))
1066 }
1067 }
1068 }
1069
1070 #[inline(never)]
1071 fn push_class_open(
1072 &self,
1073 parent_union: regex_syntax::ast::ClassSetUnion,
1074 ) -> Result<regex_syntax::ast::ClassSetUnion> {
1075 assert_eq!(self.char(), '[');
1076
1077 let (nested_set, nested_union) = self.parse_set_class_open()?;
1078 self.parser()
1079 .stack_class
1080 .borrow_mut()
1081 .push(ClassState::Open {
1082 union: parent_union,
1083 set: nested_set,
1084 });
1085 Ok(nested_union)
1086 }
1087
1088 #[inline(never)]
1089 fn pop_class(
1090 &self,
1091 nested_union: regex_syntax::ast::ClassSetUnion,
1092 ) -> Result<Either<regex_syntax::ast::ClassSetUnion, regex_syntax::ast::ClassBracketed>> {
1093 assert_eq!(self.char(), ']');
1094
1095 let item = regex_syntax::ast::ClassSet::Item(nested_union.into_item());
1096 let prevset = self.pop_class_op(item);
1097 let mut stack = self.parser().stack_class.borrow_mut();
1098 match stack.pop() {
1099 None => panic!("unexpected empty character class stack"),
1100 Some(ClassState::Op { .. }) => panic!("unexpected ClassState::Op"),
1101 Some(ClassState::Open { mut union, mut set }) => {
1102 self.bump();
1103 set.span.end = self.pos();
1104 set.kind = prevset;
1105 if stack.is_empty() {
1106 Ok(Either::Right(set))
1107 } else {
1108 union.push(regex_syntax::ast::ClassSetItem::Bracketed(Box::new(set)));
1109 Ok(Either::Left(union))
1110 }
1111 }
1112 }
1113 }
1114
1115 #[inline(never)]
1116 fn unclosed_class_error(&self) -> ParseError {
1117 for state in self.parser().stack_class.borrow().iter().rev() {
1118 if let ClassState::Open { ref set, .. } = *state {
1119 return self.error(set.span, ast::ErrorKind::ClassUnclosed);
1120 }
1121 }
1122 panic!("no open character class found")
1123 }
1124
1125 #[inline(never)]
1126 fn push_class_op(
1127 &self,
1128 next_kind: regex_syntax::ast::ClassSetBinaryOpKind,
1129 next_union: regex_syntax::ast::ClassSetUnion,
1130 ) -> regex_syntax::ast::ClassSetUnion {
1131 let item = regex_syntax::ast::ClassSet::Item(next_union.into_item());
1132 let new_lhs = self.pop_class_op(item);
1133 self.parser().stack_class.borrow_mut().push(ClassState::Op {
1134 kind: next_kind,
1135 lhs: new_lhs,
1136 });
1137 regex_syntax::ast::ClassSetUnion {
1138 span: self.span(),
1139 items: vec![],
1140 }
1141 }
1142
1143 #[inline(never)]
1144 fn pop_class_op(&self, rhs: regex_syntax::ast::ClassSet) -> regex_syntax::ast::ClassSet {
1145 let mut stack = self.parser().stack_class.borrow_mut();
1146 let (kind, lhs) = match stack.pop() {
1147 Some(ClassState::Op { kind, lhs }) => (kind, lhs),
1148 Some(state @ ClassState::Open { .. }) => {
1149 stack.push(state);
1150 return rhs;
1151 }
1152 None => unreachable!(),
1153 };
1154 let span = Span::new(lhs.span().start, rhs.span().end);
1155 regex_syntax::ast::ClassSet::BinaryOp(regex_syntax::ast::ClassSetBinaryOp {
1156 span,
1157 kind,
1158 lhs: Box::new(lhs),
1159 rhs: Box::new(rhs),
1160 })
1161 }
1162
1163 fn hir_to_node_id(&self, hir: &hir::Hir, tb: &mut TB<'s>) -> Result<NodeId> {
1164 match hir.kind() {
1165 hir::HirKind::Empty => Ok(NodeId::EPS),
1166 hir::HirKind::Literal(l) => {
1167 if l.0.len() == 1 {
1168 let node = tb.mk_u8(l.0[0]);
1169 Ok(node)
1170 } else {
1171 let ws: Vec<_> = l.0.iter().map(|l| tb.mk_u8(*l)).collect();
1172 let conc = tb.mk_concats(ws.iter().copied());
1173 Ok(conc)
1174 }
1175 }
1176 hir::HirKind::Class(class) => match class {
1177 hir::Class::Unicode(class_unicode) => {
1178 let ranges = class_unicode.ranges();
1179 if ranges.len() == 1
1180 && ranges[0].start() == '\u{0}'
1181 && ranges[0].end() == '\u{10FFFF}'
1182 {
1183 return Ok(tb.mk_range_u8(0, 255));
1184 }
1185 let mut nodes = Vec::new();
1186 for range in ranges {
1187 for seq in Utf8Sequences::new(range.start(), range.end()) {
1188 let sl = seq.as_slice();
1189 let bytes: Vec<_> = sl.iter().map(|s| (s.start, s.end)).collect();
1190 let node = match bytes.len() {
1191 1 => tb.mk_range_u8(bytes[0].0, bytes[0].1),
1192 n => {
1193 let last = tb.mk_range_u8(bytes[n - 1].0, bytes[n - 1].1);
1194 let mut conc = last;
1195 for i in (0..n - 1).rev() {
1196 let b = tb.mk_range_u8(bytes[i].0, bytes[i].1);
1197 conc = tb.mk_concat(b, conc);
1198 }
1199 conc
1200 }
1201 };
1202 nodes.push(node);
1203 }
1204 }
1205 let merged = tb.mk_unions(nodes.into_iter());
1206 Ok(merged)
1207 }
1208 hir::Class::Bytes(class_bytes) => {
1209 let ranges = class_bytes.ranges();
1210 let mut result = NodeId::BOT;
1211 for range in ranges {
1212 let start = range.start();
1213 let end = range.end();
1214 let node = tb.mk_range_u8(start, end);
1215 result = tb.mk_union(result, node);
1216 }
1217 Ok(result)
1218 }
1219 },
1220 hir::HirKind::Look(_) => Err(self.error(
1221 Span::splat(self.pos()),
1222 ast::ErrorKind::UnsupportedResharpRegex,
1223 )),
1224 hir::HirKind::Repetition(_) => Err(self.error(
1225 Span::splat(self.pos()),
1226 ast::ErrorKind::UnsupportedResharpRegex,
1227 )),
1228 hir::HirKind::Capture(_) => Err(self.error(
1229 Span::splat(self.pos()),
1230 ast::ErrorKind::UnsupportedResharpRegex,
1231 )),
1232 hir::HirKind::Concat(body) => {
1233 let mut result = NodeId::EPS;
1234 for child in body {
1235 let node = self.hir_to_node_id(child, tb)?;
1236 result = tb.mk_concat(result, node);
1237 }
1238 Ok(result)
1239 }
1240 hir::HirKind::Alternation(_) => Err(self.error(
1241 Span::splat(self.pos()),
1242 ast::ErrorKind::UnsupportedResharpRegex,
1243 )),
1244 }
1245 }
1246
1247 fn translate_ast_to_hir(
1248 &mut self,
1249 orig_ast: ®ex_syntax::ast::Ast,
1250 tb: &mut TB<'s>,
1251 ) -> Result<NodeId> {
1252 match self.translator.translate("", orig_ast) {
1253 Err(_) => Err(self.error(self.span(), ast::ErrorKind::UnicodeClassInvalid)),
1254 Ok(hir) => self.hir_to_node_id(&hir, tb),
1255 }
1256 }
1257
1258 fn translator_to_node_id(
1259 &mut self,
1260 orig_ast: ®ex_syntax::ast::Ast,
1261 translator: &mut Option<Translator>,
1262 tb: &mut TB<'s>,
1263 ) -> Result<NodeId> {
1264 match translator {
1265 Some(tr) => {
1266 let hir = tr
1267 .translate("", orig_ast)
1268 .map_err(|e| self.unsupported_error(e))?;
1269 self.hir_to_node_id(&hir, tb)
1270 }
1271 None => self.translate_ast_to_hir(orig_ast, tb),
1272 }
1273 }
1274
1275 fn get_class(
1276 &mut self,
1277 negated: bool,
1278 kind: regex_syntax::ast::ClassPerlKind,
1279 tb: &mut TB<'s>,
1280 ) -> Result<NodeId> {
1281 let w = self
1282 .perl_classes
1283 .iter()
1284 .find(|(c_neg, c_kind, _)| *c_kind == kind && *c_neg == negated);
1285 match w {
1286 Some((_, _, value)) => Ok(*value),
1287 None => {
1288 let translated = if self.global_ascii_perl {
1289 let pos = match kind {
1290 regex_syntax::ast::ClassPerlKind::Word => {
1291 let az = tb.mk_range_u8(b'a', b'z');
1292 let big = tb.mk_range_u8(b'A', b'Z');
1293 let dig = tb.mk_range_u8(b'0', b'9');
1294 let us = tb.mk_u8(b'_');
1295 tb.mk_unions([az, big, dig, us].into_iter())
1296 }
1297 regex_syntax::ast::ClassPerlKind::Digit => tb.mk_range_u8(b'0', b'9'),
1298 regex_syntax::ast::ClassPerlKind::Space => {
1299 let sp = tb.mk_u8(b' ');
1300 let tab = tb.mk_u8(b'\t');
1301 let nl = tb.mk_u8(b'\n');
1302 let cr = tb.mk_u8(b'\r');
1303 let ff = tb.mk_u8(0x0C);
1304 let vt = tb.mk_u8(0x0B);
1305 tb.mk_unions([sp, tab, nl, cr, ff, vt].into_iter())
1306 }
1307 };
1308 if negated {
1309 resharp_algebra::neg_class(tb, pos)
1310 } else {
1311 pos
1312 }
1313 } else if self.global_unicode {
1314 match kind {
1315 regex_syntax::ast::ClassPerlKind::Word => {
1316 if self.global_full_unicode {
1317 self.unicode_classes.ensure_word_full(tb);
1318 } else {
1319 self.unicode_classes.ensure_word(tb);
1320 }
1321 if negated {
1322 self.unicode_classes.non_word
1323 } else {
1324 self.unicode_classes.word
1325 }
1326 }
1327 regex_syntax::ast::ClassPerlKind::Digit => {
1328 if self.global_full_unicode {
1329 self.unicode_classes.ensure_digit_full(tb);
1330 } else {
1331 self.unicode_classes.ensure_digit(tb);
1332 }
1333 if negated {
1334 self.unicode_classes.non_digit
1335 } else {
1336 self.unicode_classes.digit
1337 }
1338 }
1339 regex_syntax::ast::ClassPerlKind::Space => {
1340 if self.global_full_unicode {
1341 self.unicode_classes.ensure_space_full(tb);
1342 } else {
1343 self.unicode_classes.ensure_space(tb);
1344 }
1345 if negated {
1346 self.unicode_classes.non_space
1347 } else {
1348 self.unicode_classes.space
1349 }
1350 }
1351 }
1352 } else {
1353 let pos = match kind {
1354 regex_syntax::ast::ClassPerlKind::Word => {
1355 let az = tb.mk_range_u8(b'a', b'z');
1356 let big = tb.mk_range_u8(b'A', b'Z');
1357 let dig = tb.mk_range_u8(b'0', b'9');
1358 let us = tb.mk_u8(b'_');
1359 tb.mk_unions([az, big, dig, us].into_iter())
1360 }
1361 regex_syntax::ast::ClassPerlKind::Digit => tb.mk_range_u8(b'0', b'9'),
1362 regex_syntax::ast::ClassPerlKind::Space => {
1363 let sp = tb.mk_u8(b' ');
1364 let tab = tb.mk_u8(b'\t');
1365 let nl = tb.mk_u8(b'\n');
1366 let cr = tb.mk_u8(b'\r');
1367 let ff = tb.mk_u8(0x0C);
1368 let vt = tb.mk_u8(0x0B);
1369 tb.mk_unions([sp, tab, nl, cr, ff, vt].into_iter())
1370 }
1371 };
1372 if negated {
1373 tb.mk_compl(pos)
1374 } else {
1375 pos
1376 }
1377 };
1378 self.perl_classes.push((negated, kind, translated));
1379 Ok(translated)
1380 }
1381 }
1382 }
1383
1384 fn word_char_kind(ast: &Ast, left: bool) -> WordCharKind {
1385 use WordCharKind::*;
1386 match ast {
1387 Ast::Literal(lit) => {
1388 if is_word_byte(lit.c as u8) {
1389 Word
1390 } else {
1391 NonWord
1392 }
1393 }
1394 Ast::ClassPerl(c) => match (&c.kind, c.negated) {
1395 (®ex_syntax::ast::ClassPerlKind::Word, false) => Word,
1396 (®ex_syntax::ast::ClassPerlKind::Word, true) => NonWord,
1397 (®ex_syntax::ast::ClassPerlKind::Space, false) => NonWord,
1398 (®ex_syntax::ast::ClassPerlKind::Space, true) => Unknown,
1399 (®ex_syntax::ast::ClassPerlKind::Digit, false) => Word,
1400 (®ex_syntax::ast::ClassPerlKind::Digit, true) => Unknown,
1401 },
1402 Ast::ClassBracketed(c) => class_bracketed_word_kind(c),
1403 Ast::Dot(_) | Ast::Top(_) => Unknown,
1404 Ast::Group(g) => Self::word_char_kind(&g.ast, left),
1405 Ast::Concat(c) if !c.asts.is_empty() => {
1406 let edge = if left { c.asts.len() - 1 } else { 0 };
1407 let kind = Self::word_char_kind(&c.asts[edge], left);
1408 match kind {
1409 MaybeWord => {
1410 let dir: isize = if left { -1 } else { 1 };
1411 match Self::concat_neighbor_kind(&c.asts, edge, dir) {
1412 Word => Word,
1413 _ => MaybeWord,
1414 }
1415 }
1416 MaybeNonWord => {
1417 let dir: isize = if left { -1 } else { 1 };
1418 match Self::concat_neighbor_kind(&c.asts, edge, dir) {
1419 NonWord => NonWord,
1420 _ => MaybeNonWord,
1421 }
1422 }
1423 other => other,
1424 }
1425 }
1426 Ast::Alternation(alt) if !alt.asts.is_empty() => {
1427 let first = Self::word_char_kind(&alt.asts[0], left);
1428 if alt.asts[1..]
1429 .iter()
1430 .all(|a| Self::word_char_kind(a, left) == first)
1431 {
1432 first
1433 } else {
1434 Unknown
1435 }
1436 }
1437 Ast::Repetition(r) => {
1438 let inner = Self::word_char_kind(&r.ast, left);
1439 let nullable = matches!(
1440 &r.op.kind,
1441 ast::RepetitionKind::ZeroOrMore
1442 | ast::RepetitionKind::ZeroOrOne
1443 | ast::RepetitionKind::Range(ast::RepetitionRange::Bounded(0, _))
1444 );
1445 if nullable {
1446 match inner {
1447 Word => MaybeWord,
1448 NonWord => MaybeNonWord,
1449 _ => Unknown,
1450 }
1451 } else {
1452 inner
1453 }
1454 }
1455 Ast::Lookaround(la) => match la.kind {
1456 ast::LookaroundKind::PositiveLookahead
1457 | ast::LookaroundKind::PositiveLookbehind => Self::word_char_kind(&la.ast, left),
1458 _ => Unknown,
1459 },
1460 _ => Unknown,
1461 }
1462 }
1463
1464 fn edge_class_ast(ast: &Ast, left: bool) -> Option<&Ast> {
1466 match ast {
1467 Ast::Literal(_)
1468 | Ast::ClassPerl(_)
1469 | Ast::ClassBracketed(_)
1470 | Ast::ClassUnicode(_)
1471 | Ast::Dot(_)
1472 | Ast::Top(_) => Some(ast),
1473 Ast::Group(g) => Self::edge_class_ast(&g.ast, left),
1474 Ast::Concat(c) if !c.asts.is_empty() => {
1475 Self::edge_class_ast(&c.asts[if left { c.asts.len() - 1 } else { 0 }], left)
1476 }
1477 Ast::Repetition(r) => {
1478 let nullable = matches!(
1479 &r.op.kind,
1480 ast::RepetitionKind::ZeroOrMore
1481 | ast::RepetitionKind::ZeroOrOne
1482 | ast::RepetitionKind::Range(ast::RepetitionRange::Bounded(0, _))
1483 );
1484 if nullable {
1485 None
1486 } else {
1487 Self::edge_class_ast(&r.ast, left)
1488 }
1489 }
1490 _ => None,
1491 }
1492 }
1493
1494 fn resolve_word_kind(
1495 &mut self,
1496 asts: &[Ast],
1497 idx: usize,
1498 dir: isize,
1499 translator: &mut Option<Translator>,
1500 tb: &mut TB<'s>,
1501 word_id: NodeId,
1502 not_word_id: NodeId,
1503 ) -> Result<WordCharKind> {
1504 use WordCharKind::*;
1505 let fast = Self::concat_neighbor_kind(asts, idx, dir);
1506 if fast != Unknown {
1507 return Ok(fast);
1508 }
1509 let neighbor_idx = (idx as isize + dir) as usize;
1510 let node = if let Some(edge) = Self::edge_class_ast(&asts[neighbor_idx], dir < 0) {
1511 self.ast_to_node_id(edge, translator, tb)?
1512 } else if dir > 0 {
1513 let mut bodies: Vec<NodeId> = vec![];
1514 let mut j = neighbor_idx;
1515 while j < asts.len() {
1516 match &asts[j] {
1517 Ast::Lookaround(la) => {
1518 let kind = la.kind.clone();
1519 let lookbehind = matches!(
1520 kind,
1521 ast::LookaroundKind::PositiveLookbehind
1522 | ast::LookaroundKind::NegativeLookbehind
1523 );
1524 if lookbehind {
1525 j += 1;
1526 continue;
1527 }
1528 let body = self.ast_to_node_id(&la.ast, translator, tb)?;
1529 let body = tb.try_elim_lookarounds(body).ok_or_else(|| {
1530 self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex)
1531 })?;
1532 let body_ts = tb.mk_concat(body, NodeId::TS);
1533 let constraint = match kind {
1534 ast::LookaroundKind::PositiveLookahead => body_ts,
1535 ast::LookaroundKind::NegativeLookahead => tb.mk_compl(body_ts),
1536 _ => unreachable!(),
1537 };
1538 bodies.push(constraint);
1539 j += 1;
1540 }
1541 other => {
1542 let n = self.ast_to_node_id(other, translator, tb)?;
1543 let n = tb.try_elim_lookarounds(n).ok_or_else(|| {
1544 self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex)
1545 })?;
1546 bodies.push(tb.mk_concat(n, NodeId::TS));
1547 break;
1548 }
1549 }
1550 }
1551 if bodies.is_empty() {
1552 return Ok(Unknown);
1553 }
1554 let combined = tb.mk_inters(bodies.into_iter());
1555 let word_prefix = tb.mk_concat(word_id, NodeId::TS);
1556 let non_word_prefix = tb.mk_concat(not_word_id, NodeId::TS);
1557 return if tb.subsumes(word_prefix, combined) == Some(true) {
1558 Ok(Word)
1559 } else if tb.subsumes(non_word_prefix, combined) == Some(true) {
1560 Ok(NonWord)
1561 } else {
1562 Ok(Unknown)
1563 };
1564 } else {
1565 let neighbor_node = self.ast_to_node_id(&asts[neighbor_idx], translator, tb)?;
1566 let mut neighbor_node = tb
1567 .try_elim_lookarounds(neighbor_node)
1568 .ok_or_else(|| self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))?;
1569 neighbor_node = tb.reverse(neighbor_node).or_else(|_| {
1570 Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))
1571 })?;
1572 let word_prefix = tb.mk_concat(word_id, NodeId::TS);
1573 let non_word_prefix = tb.mk_concat(not_word_id, NodeId::TS);
1574 return if tb.subsumes(word_prefix, neighbor_node) == Some(true) {
1575 Ok(Word)
1576 } else if tb.subsumes(non_word_prefix, neighbor_node) == Some(true) {
1577 Ok(NonWord)
1578 } else {
1579 Ok(Unknown)
1580 };
1581 };
1582 if tb.subsumes(word_id, node) == Some(true) {
1583 Ok(Word)
1584 } else if tb.subsumes(not_word_id, node) == Some(true) {
1585 Ok(NonWord)
1586 } else {
1587 Ok(Unknown)
1588 }
1589 }
1590
1591 fn is_transparent_for_dir(ast: &Ast, dir: isize) -> bool {
1592 match ast {
1593 Ast::Lookaround(la) => match la.kind {
1594 ast::LookaroundKind::PositiveLookahead | ast::LookaroundKind::NegativeLookahead => {
1595 dir < 0
1596 }
1597 ast::LookaroundKind::PositiveLookbehind
1598 | ast::LookaroundKind::NegativeLookbehind => dir > 0,
1599 },
1600 _ => false,
1601 }
1602 }
1603
1604 fn concat_neighbor_kind(asts: &[Ast], idx: usize, dir: isize) -> WordCharKind {
1605 use WordCharKind::*;
1606 let next = idx as isize + dir;
1607 if next < 0 || next >= asts.len() as isize {
1608 return Edge;
1609 }
1610 if Self::is_transparent_for_dir(&asts[next as usize], dir) {
1611 return Self::concat_neighbor_kind(asts, next as usize, dir);
1612 }
1613 let kind = Self::word_char_kind(&asts[next as usize], dir < 0);
1614 match kind {
1615 MaybeWord => match Self::concat_neighbor_kind(asts, next as usize, dir) {
1616 Word => Word,
1617 _ => Unknown,
1618 },
1619 MaybeNonWord => match Self::concat_neighbor_kind(asts, next as usize, dir) {
1620 NonWord => NonWord,
1621 _ => Unknown,
1622 },
1623 other => other,
1624 }
1625 }
1626
1627 fn specialize_word_boundaries(
1628 &mut self,
1629 children: &mut [NodeId],
1630 tb: &mut TB<'s>,
1631 ) -> Result<()> {
1632 let wb = self.unicode_classes.wb;
1633 let non_wb = self.unicode_classes.non_wb;
1634 if wb == NodeId::MISSING {
1635 return Ok(());
1636 }
1637 let word = self.unicode_classes.word;
1638 let non_word = self.unicode_classes.non_word;
1639 if word == NodeId::MISSING {
1640 return Ok(());
1641 }
1642 let word_pref = tb.mk_concat(word, NodeId::TS);
1645 let non_word_pref = tb.mk_concat(non_word, NodeId::TS);
1646 let word_suf = tb.mk_concat(NodeId::TS, word);
1647 let non_word_suf = tb.mk_concat(NodeId::TS, non_word);
1648 let len = children.len();
1649 for k in 0..len {
1650 let l = if k == 0 {
1651 WordCharKind::Edge
1652 } else {
1653 Self::classify(tb, children[k - 1], word_suf, non_word_suf)
1654 };
1655 let r = if k + 1 >= len {
1656 WordCharKind::Edge
1657 } else {
1658 Self::classify(tb, children[k + 1], word_pref, non_word_pref)
1659 };
1660 children[k] = Self::rewrite_wb_in_node(tb, children[k], wb, non_wb, word, l, r)
1661 .ok_or_else(|| self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))?;
1662 }
1663 Ok(())
1664 }
1665
1666 fn rewrite_wb_in_node(
1667 b: &mut TB<'s>,
1668 node: NodeId,
1669 wb: NodeId,
1670 non_wb: NodeId,
1671 word: NodeId,
1672 left: WordCharKind,
1673 right: WordCharKind,
1674 ) -> Option<NodeId> {
1675 let boundary_match = if node == wb {
1676 true
1677 } else if node == non_wb {
1678 false
1679 } else if b.get_kind(node) == Kind::Union {
1680 let l = Self::rewrite_wb_in_node(b, node.left(b), wb, non_wb, word, left, right)?;
1681 let r = Self::rewrite_wb_in_node(b, node.right(b), wb, non_wb, word, left, right)?;
1682 return Some(b.mk_union(l, r));
1683 } else {
1684 return Some(node);
1685 };
1686 use WordCharKind::*;
1687 let result = match (left, right) {
1688 (NonWord, Word) | (Word, NonWord) => {
1689 if boundary_match {
1690 NodeId::EPS
1691 } else {
1692 NodeId::BOT
1693 }
1694 }
1695 (Word, Word) | (NonWord, NonWord) => {
1696 if boundary_match {
1697 NodeId::BOT
1698 } else {
1699 NodeId::EPS
1700 }
1701 }
1702 (Word, _) => {
1703 if boundary_match {
1704 b.mk_neg_lookahead(word, 0)
1705 } else {
1706 let tail = b.mk_concat(word, NodeId::TS);
1707 b.mk_lookahead(tail, NodeId::MISSING, 0)
1708 }
1709 }
1710 (NonWord, _) => {
1711 if boundary_match {
1712 let tail = b.mk_concat(word, NodeId::TS);
1713 b.mk_lookahead(tail, NodeId::MISSING, 0)
1714 } else {
1715 b.mk_neg_lookahead(word, 0)
1716 }
1717 }
1718 (_, Word) => {
1719 if boundary_match {
1720 b.mk_neg_lookbehind(word)
1721 } else {
1722 b.mk_lookbehind(word, NodeId::MISSING)
1723 }
1724 }
1725 (_, NonWord) => {
1726 if boundary_match {
1727 b.mk_lookbehind(word, NodeId::MISSING)
1728 } else {
1729 b.mk_neg_lookbehind(word)
1730 }
1731 }
1732 _ => return None,
1733 };
1734 Some(result)
1735 }
1736
1737 fn classify(
1738 b: &mut TB<'s>,
1739 node: NodeId,
1740 word_dir: NodeId,
1741 non_word_dir: NodeId,
1742 ) -> WordCharKind {
1743 if b.contains_look(node) || b.contains_anchors(node) {
1744 return WordCharKind::Unknown;
1745 }
1746 if b.subsumes(word_dir, node) == Some(true) {
1747 WordCharKind::Word
1748 } else if b.subsumes(non_word_dir, node) == Some(true) {
1749 WordCharKind::NonWord
1750 } else {
1751 WordCharKind::Unknown
1752 }
1753 }
1754
1755 fn rewrite_word_boundary_in_concat(
1756 &mut self,
1757 asts: &[Ast],
1758 idx: usize,
1759 translator: &mut Option<Translator>,
1760 tb: &mut TB<'s>,
1761 negated: bool,
1762 ) -> Result<(NodeId, usize)> {
1763 use WordCharKind::*;
1764 if self.global_full_unicode {
1765 self.unicode_classes.ensure_word_full(tb);
1766 } else if self.global_unicode && !self.global_ascii_perl {
1767 self.unicode_classes.ensure_word(tb);
1768 } else {
1769 self.unicode_classes.ensure_word_ascii(tb);
1770 }
1771 let word_id = self.unicode_classes.word;
1772 let not_word_id = self.unicode_classes.non_word;
1773 let left = self.resolve_word_kind(asts, idx, -1, translator, tb, word_id, not_word_id)?;
1774 let right = self.resolve_word_kind(asts, idx, 1, translator, tb, word_id, not_word_id)?;
1775 let boundary_match = !negated;
1776 match (left, right) {
1777 (NonWord, Word) | (Word, NonWord) => Ok((
1778 if boundary_match {
1779 NodeId::EPS
1780 } else {
1781 NodeId::BOT
1782 },
1783 idx + 1,
1784 )),
1785 (Word, Word) | (NonWord, NonWord) => Ok((
1786 if boundary_match {
1787 NodeId::BOT
1788 } else {
1789 NodeId::EPS
1790 },
1791 idx + 1,
1792 )),
1793 (Word, _) => {
1794 if boundary_match {
1795 Ok((tb.mk_neg_lookahead(word_id, 0), idx + 1))
1796 } else {
1797 let tail = tb.mk_concat(word_id, NodeId::TS);
1798 self.merge_boundary_with_following_lookaheads(asts, idx, tail, translator, tb)
1799 }
1800 }
1801 (NonWord, _) => {
1802 if boundary_match {
1803 let tail = tb.mk_concat(word_id, NodeId::TS);
1804 self.merge_boundary_with_following_lookaheads(asts, idx, tail, translator, tb)
1805 } else {
1806 Ok((tb.mk_neg_lookahead(word_id, 0), idx + 1))
1807 }
1808 }
1809 (_, Word) => {
1810 if boundary_match {
1811 Ok((tb.mk_neg_lookbehind(word_id), idx + 1))
1812 } else {
1813 Ok((tb.mk_lookbehind(word_id, NodeId::MISSING), idx + 1))
1814 }
1815 }
1816 (_, NonWord) => {
1817 if boundary_match {
1818 Ok((tb.mk_lookbehind(word_id, NodeId::MISSING), idx + 1))
1819 } else {
1820 Ok((tb.mk_neg_lookbehind(word_id), idx + 1))
1821 }
1822 }
1823 _ => {
1827 self.unicode_classes.ensure_wb(tb);
1828 let node = if boundary_match {
1829 self.unicode_classes.wb
1830 } else {
1831 self.unicode_classes.non_wb
1832 };
1833 Ok((node, idx + 1))
1834 }
1835 }
1836 }
1837
1838 fn merge_boundary_with_following_lookaheads(
1839 &mut self,
1840 asts: &[Ast],
1841 wb_idx: usize,
1842 boundary_tail: NodeId,
1843 translator: &mut Option<Translator>,
1844 tb: &mut TB<'s>,
1845 ) -> Result<(NodeId, usize)> {
1846 let mut next = wb_idx + 1;
1847 let mut la_bodies = vec![boundary_tail];
1848 while next < asts.len() {
1849 match &asts[next] {
1850 Ast::Lookaround(la) if la.kind == ast::LookaroundKind::PositiveLookahead => {
1851 let body = self.ast_to_node_id(&la.ast, translator, tb)?;
1852 la_bodies.push(tb.mk_concat(body, NodeId::TS));
1853 next += 1;
1854 }
1855 _ => break,
1856 }
1857 }
1858 let merged = tb.mk_inters(la_bodies.into_iter());
1859 Ok((tb.mk_lookahead(merged, NodeId::MISSING, 0), next))
1860 }
1861
1862 fn ast_to_node_id(
1863 &mut self,
1864 ast: &Ast,
1865 translator: &mut Option<Translator>,
1866 tb: &mut TB<'s>,
1867 ) -> Result<NodeId> {
1868 match ast {
1869 Ast::Empty(_) => Ok(NodeId::EPS),
1870 Ast::Flags(f) => {
1871 if f.flags.flag_state(ast::Flag::SwapGreed).is_some() {
1872 return Err(self.error(f.span, ast::ErrorKind::UnsupportedResharpRegex));
1873 }
1874 let mut translator_builder = self.default_translator_builder();
1875 if let Some(state) = f.flags.flag_state(ast::Flag::CaseInsensitive) {
1876 translator_builder.case_insensitive(state);
1877 }
1878 if let Some(state) = f.flags.flag_state(ast::Flag::Unicode) {
1879 translator_builder.unicode(state);
1880 }
1881 if let Some(state) = f.flags.flag_state(ast::Flag::DotMatchesNewLine) {
1882 self.dot_all.set(state);
1883 }
1884 if let Some(state) = f.flags.flag_state(ast::Flag::MultiLine) {
1885 self.multiline.set(state);
1886 }
1887 let concat_translator = Some(translator_builder.build());
1888 *translator = concat_translator;
1889 Ok(NodeId::EPS)
1890 }
1891 Ast::Literal(l) => {
1892 let ast_lit = regex_syntax::ast::Ast::literal(*l.to_owned());
1893 self.translator_to_node_id(&ast_lit, translator, tb)
1894 }
1895 Ast::Top(_) => Ok(NodeId::TOP),
1896 Ast::Dot(_) => {
1897 let codepoint_dot = self.global_ascii_perl || self.global_full_unicode;
1898 let hirv = match (codepoint_dot, self.dot_all.get()) {
1899 (true, true) => hir::Hir::dot(hir::Dot::AnyChar),
1900 (true, false) => hir::Hir::dot(hir::Dot::AnyCharExceptLF),
1901 (false, true) => return Ok(NodeId::TOP),
1902 (false, false) => hir::Hir::dot(hir::Dot::AnyByteExceptLF),
1903 };
1904 self.hir_to_node_id(&hirv, tb)
1905 }
1906 Ast::Assertion(a) => match &a.kind {
1907 ast::AssertionKind::StartText => Ok(NodeId::BEGIN),
1908 ast::AssertionKind::EndText => Ok(NodeId::END),
1909 ast::AssertionKind::WordBoundary => {
1910 let only = Ast::Assertion(a.clone());
1911 let asts = std::slice::from_ref(&only);
1912 let (node, _) =
1913 self.rewrite_word_boundary_in_concat(asts, 0, translator, tb, false)?;
1914 Ok(node)
1915 }
1916 ast::AssertionKind::NotWordBoundary => {
1917 let only = Ast::Assertion(a.clone());
1919 let asts = std::slice::from_ref(&only);
1920 let (node, _) =
1921 self.rewrite_word_boundary_in_concat(asts, 0, translator, tb, true)?;
1922 Ok(node)
1923 }
1924 ast::AssertionKind::StartLine => {
1925 if !self.multiline.get() {
1926 return Ok(NodeId::BEGIN);
1927 }
1928 let left = NodeId::BEGIN;
1929 let right = tb.mk_u8(b'\n');
1930 let union = tb.mk_union(left, right);
1931 Ok(tb.mk_lookbehind(union, NodeId::MISSING))
1932 }
1933 ast::AssertionKind::EndLine => {
1934 if !self.multiline.get() {
1935 return Ok(NodeId::END);
1936 }
1937 let left = NodeId::END;
1938 let right = tb.mk_u8(b'\n');
1939 let union = tb.mk_union(left, right);
1940 Ok(tb.mk_lookahead(union, NodeId::MISSING, 0))
1941 }
1942 ast::AssertionKind::WordBoundaryStart => {
1943 Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
1944 }
1945 ast::AssertionKind::WordBoundaryEnd => {
1946 Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
1947 }
1948 ast::AssertionKind::WordBoundaryStartAngle => {
1949 Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
1950 }
1951 ast::AssertionKind::WordBoundaryEndAngle => {
1952 Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
1953 }
1954 ast::AssertionKind::WordBoundaryStartHalf => {
1955 Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
1956 }
1957 ast::AssertionKind::WordBoundaryEndHalf => {
1958 Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
1959 }
1960 },
1961 Ast::ClassUnicode(c) => {
1962 let tmp = regex_syntax::ast::ClassUnicode {
1963 span: c.span,
1964 negated: c.negated,
1965 kind: c.kind.clone(),
1966 };
1967 if !c.negated {
1968 if let regex_syntax::ast::ClassUnicodeKind::Named(s) = &c.kind {
1969 match s.as_str() {
1970 "ascii" => return Ok(tb.mk_range_u8(0, 127)),
1972 "utf8" => {
1974 let ascii = tb.mk_range_u8(0, 127);
1975 let beta = tb.mk_range_u8(128, 0xBF);
1976 let c0 = tb.mk_range_u8(0xC0, 0xDF);
1977 let c0s = tb.mk_concats([c0, beta].into_iter());
1978 let e0 = tb.mk_range_u8(0xE0, 0xEF);
1979 let e0s = tb.mk_concats([e0, beta, beta].into_iter());
1980 let f0 = tb.mk_range_u8(0xF0, 0xF7);
1981 let f0s = tb.mk_concats([f0, beta, beta, beta].into_iter());
1982 return Ok(tb.mk_unions([ascii, c0s, e0s, f0s].into_iter()));
1983 }
1984 "hex" => {
1985 let nums = tb.mk_range_u8(b'0', b'9');
1986 let lets = tb.mk_range_u8(b'a', b'f');
1987 let lets2 = tb.mk_range_u8(b'A', b'F');
1988 let merged = tb.mk_unions([nums, lets, lets2].into_iter());
1989 return Ok(merged);
1990 }
1991 _ => {}
1992 }
1993 };
1994 }
1995
1996 let orig_ast = regex_syntax::ast::Ast::class_unicode(tmp);
1997 self.translator_to_node_id(&orig_ast, translator, tb)
1998 }
1999 Ast::ClassPerl(c) => self.get_class(c.negated, c.kind.clone(), tb),
2000 Ast::ClassBracketed(c) => match &c.kind {
2001 regex_syntax::ast::ClassSet::Item(item) => {
2002 if !c.negated && is_universal_perl_pair(item) {
2003 return Ok(NodeId::TOP);
2004 }
2005 let kind = if self.global_ascii_perl {
2006 rewrite_ascii_perl_set(&c.kind)
2007 } else {
2008 c.kind.clone()
2009 };
2010 let tmp = regex_syntax::ast::ClassBracketed {
2011 span: c.span,
2012 negated: c.negated,
2013 kind,
2014 };
2015 let orig_ast = regex_syntax::ast::Ast::class_bracketed(tmp);
2016 self.translator_to_node_id(&orig_ast, translator, tb)
2017 }
2018 regex_syntax::ast::ClassSet::BinaryOp(_) => {
2019 Err(self.error(c.span, ast::ErrorKind::UnsupportedResharpRegex))
2020 }
2021 },
2022 Ast::Repetition(r) => {
2023 let body = self.ast_to_node_id(&r.ast, translator, tb);
2024 match body {
2025 Ok(body) => match &r.op.kind {
2026 ast::RepetitionKind::ZeroOrOne => Ok(tb.mk_opt(body)),
2027 ast::RepetitionKind::ZeroOrMore => Ok(tb.mk_star(body)),
2028 ast::RepetitionKind::OneOrMore => Ok(tb.mk_plus(body)),
2029 ast::RepetitionKind::Range(r) => match r {
2030 ast::RepetitionRange::Exactly(n) => Ok(tb.mk_repeat(body, *n, *n)),
2031 ast::RepetitionRange::AtLeast(n) => {
2032 let rep = tb.mk_repeat(body, *n, *n);
2033 let st = tb.mk_star(body);
2034 Ok(tb.mk_concat(rep, st))
2035 }
2036
2037 ast::RepetitionRange::Bounded(n, m) => Ok(tb.mk_repeat(body, *n, *m)),
2038 },
2039 },
2040 Err(_) => body,
2041 }
2042 }
2043 Ast::Lookaround(g) => {
2044 let body = self.ast_to_node_id(&g.ast, translator, tb)?;
2045 match g.kind {
2046 ast::LookaroundKind::PositiveLookahead if body.contains_lookbehind(tb) => {
2047 let mut prefix = NodeId::EPS;
2048 let mut rest = body;
2049 while tb.get_kind(rest) == Kind::Concat
2050 && tb.get_kind(rest.left(tb)) == Kind::Lookbehind
2051 {
2052 prefix = tb.mk_concat(prefix, rest.left(tb));
2053 rest = rest.right(tb);
2054 }
2055 if prefix == NodeId::EPS || rest.contains_lookbehind(tb) {
2056 return Err(self.error(g.span, ast::ErrorKind::UnsupportedResharpRegex));
2057 }
2058 let la = tb.mk_lookahead(rest, NodeId::MISSING, 0);
2059 Ok(tb.mk_concat(prefix, la))
2060 }
2061 ast::LookaroundKind::NegativeLookahead if body.contains_lookbehind(tb) => {
2062 Err(self.error(g.span, ast::ErrorKind::UnsupportedResharpRegex))
2063 }
2064 ast::LookaroundKind::PositiveLookahead => {
2065 Ok(tb.mk_lookahead(body, NodeId::MISSING, 0))
2066 }
2067 ast::LookaroundKind::PositiveLookbehind => {
2068 Ok(tb.mk_lookbehind(body, NodeId::MISSING))
2069 }
2070 ast::LookaroundKind::NegativeLookahead => Ok(tb.mk_neg_lookahead(body, 0)),
2071 ast::LookaroundKind::NegativeLookbehind => Ok(tb.mk_neg_lookbehind(body)),
2072 }
2073 }
2074 Ast::Group(g) => {
2075 if let ast::GroupKind::NonCapturing(ref flags) = g.kind {
2076 if !flags.items.is_empty() {
2077 let mut translator_builder = self.default_translator_builder();
2078 if let Some(state) = flags.flag_state(ast::Flag::CaseInsensitive) {
2079 translator_builder.case_insensitive(state);
2080 }
2081 if let Some(state) = flags.flag_state(ast::Flag::Unicode) {
2082 translator_builder.unicode(state);
2083 }
2084 let saved_dot_all = self.dot_all.get();
2085 if let Some(state) = flags.flag_state(ast::Flag::DotMatchesNewLine) {
2086 self.dot_all.set(state);
2087 }
2088 let saved_multiline = self.multiline.get();
2089 if let Some(state) = flags.flag_state(ast::Flag::MultiLine) {
2090 self.multiline.set(state);
2091 }
2092 let mut scoped = Some(translator_builder.build());
2093 let result = self.ast_to_node_id(&g.ast, &mut scoped, tb);
2094 self.dot_all.set(saved_dot_all);
2095 self.multiline.set(saved_multiline);
2096 return result;
2097 }
2098 }
2099 self.ast_to_node_id(&g.ast, translator, tb)
2100 }
2101 Ast::Alternation(a) => {
2102 let mut children = vec![];
2103 for ast in &a.asts {
2104 match self.ast_to_node_id(ast, translator, tb) {
2105 Ok(node_id) => children.push(node_id),
2106 Err(err) => return Err(err),
2107 }
2108 }
2109 Ok(tb.mk_unions(children.iter().copied()))
2110 }
2111 Ast::Concat(c) => {
2112 let mut concat_translator: Option<Translator> = None;
2113 let mut children = vec![];
2114 let mut prev_boundary_child: Option<usize> = None;
2115 let mut i = 0;
2116 while i < c.asts.len() {
2117 let ast = &c.asts[i];
2118 match ast {
2119 Ast::Flags(f) => {
2120 if f.flags.flag_state(ast::Flag::SwapGreed).is_some() {
2121 return Err(
2122 self.error(f.span, ast::ErrorKind::UnsupportedResharpRegex)
2123 );
2124 }
2125 let mut translator_builder = self.default_translator_builder();
2126 if let Some(state) = f.flags.flag_state(ast::Flag::CaseInsensitive) {
2127 translator_builder.case_insensitive(state);
2128 }
2129 if let Some(state) = f.flags.flag_state(ast::Flag::Unicode) {
2130 translator_builder.unicode(state);
2131 }
2132 if let Some(state) = f.flags.flag_state(ast::Flag::DotMatchesNewLine) {
2133 self.dot_all.set(state);
2134 }
2135 if let Some(state) = f.flags.flag_state(ast::Flag::MultiLine) {
2136 self.multiline.set(state);
2137 }
2138 concat_translator = Some(translator_builder.build());
2139 *translator = concat_translator.clone();
2140 i += 1;
2141 continue;
2142 }
2143 Ast::Assertion(a)
2144 if a.kind == ast::AssertionKind::WordBoundary
2145 || a.kind == ast::AssertionKind::NotWordBoundary =>
2146 {
2147 let negated = a.kind == ast::AssertionKind::NotWordBoundary;
2148 let node = self.rewrite_word_boundary_in_concat(
2149 &c.asts, i, translator, tb, negated,
2150 )?;
2151 match prev_boundary_child {
2152 Some(idx) => children[idx] = tb.mk_inter(children[idx], node.0),
2153 None => {
2154 children.push(node.0);
2155 prev_boundary_child = Some(children.len() - 1);
2156 }
2157 }
2158 i = node.1; continue;
2160 }
2161 _ => {}
2162 }
2163 prev_boundary_child = None;
2164 match concat_translator {
2165 Some(_) => match self.ast_to_node_id(ast, &mut concat_translator, tb) {
2166 Ok(node_id) => children.push(node_id),
2167 Err(err) => return Err(err),
2168 },
2169 None => match self.ast_to_node_id(ast, translator, tb) {
2170 Ok(node_id) => children.push(node_id),
2171 Err(err) => return Err(err),
2172 },
2173 }
2174 i += 1;
2175 }
2176 self.specialize_word_boundaries(&mut children, tb)?;
2177 Ok(tb.mk_concats(children.iter().cloned()))
2178 }
2179 Ast::Intersection(intersection) => {
2180 let mut children = vec![];
2181 for ast in &intersection.asts {
2182 match self.ast_to_node_id(ast, translator, tb) {
2183 Ok(node_id) => children.push(node_id),
2184 Err(err) => return Err(err),
2185 }
2186 }
2187 Ok(tb.mk_inters(children.into_iter()))
2188 }
2189 Ast::Complement(complement) => {
2190 let body = self.ast_to_node_id(&complement.ast, translator, tb);
2191 body.map(|x| tb.mk_compl(x))
2192 }
2193 }
2194 }
2195
2196 fn parse_inner(&mut self) -> Result<Ast> {
2197 let mut concat = Concat {
2198 span: self.span(),
2199 asts: vec![],
2200 };
2201 loop {
2202 self.bump_space();
2203 if self.is_eof() {
2204 break;
2205 }
2206 match self.char() {
2207 '(' => concat = self.push_group(concat)?,
2208 ')' => concat = self.pop_group(concat)?,
2209 '|' => concat = self.push_alternate(concat)?,
2210 '&' => concat = self.push_intersect(concat)?,
2211 '~' => concat = self.push_compl_group(concat)?,
2212 '[' => {
2213 let class = self.parse_set_class()?;
2214 concat.asts.push(Ast::class_bracketed(class));
2215 }
2216 '?' => {
2217 concat =
2218 self.parse_uncounted_repetition(concat, ast::RepetitionKind::ZeroOrOne)?;
2219 }
2220 '*' => {
2221 concat =
2222 self.parse_uncounted_repetition(concat, ast::RepetitionKind::ZeroOrMore)?;
2223 }
2224 '+' => {
2225 concat =
2226 self.parse_uncounted_repetition(concat, ast::RepetitionKind::OneOrMore)?;
2227 }
2228 '{' => {
2229 concat = self.parse_counted_repetition(concat)?;
2230 }
2231 _ => concat.asts.push(self.parse_primitive()?.into_ast()),
2232 }
2233 if self.stack_group.borrow().len() > self.max_depth {
2234 return Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex));
2235 }
2236 }
2237 let ast = self.pop_group_end(concat)?;
2238 if expanded_ast_size(&ast, self.expanded_ast_limit) >= self.expanded_ast_limit
2239 || max_concat_length(&ast) >= self.max_list_len
2240 {
2241 return Err(self.error(*ast.span(), ast::ErrorKind::UnsupportedResharpRegex));
2242 }
2243 Ok(ast)
2244 }
2245
2246 fn parse(&mut self, tb: &mut TB<'s>) -> Result<NodeId> {
2247 let ast = self.parse_inner()?;
2248 if let Err(span) = ensure_lookbehind_at_start(&ast, true) {
2249 return Err(self.error(span, ast::ErrorKind::UnsupportedResharpRegex));
2250 }
2251 self.ast_to_node_id(&ast, &mut None, tb)
2252 }
2253
2254 #[inline(never)]
2255 fn parse_uncounted_repetition(
2256 &self,
2257 mut concat: ast::Concat,
2258 kind: ast::RepetitionKind,
2259 ) -> Result<ast::Concat> {
2260 let op_start = self.pos();
2262 let ast = match concat.asts.pop() {
2263 Some(ast) => ast,
2264 None => return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing)),
2265 };
2266 match ast {
2267 Ast::Empty(_) | Ast::Flags(_) => {
2268 return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing))
2269 }
2270 _ => {}
2271 }
2272 if self.bump() && self.char() == '?' {
2273 return Err(self.error(
2274 Span::new(op_start, self.pos()),
2275 ast::ErrorKind::UnsupportedLazyQuantifier,
2276 ));
2277 }
2278 concat.asts.push(Ast::repetition(ast::Repetition {
2279 span: ast.span().with_end(self.pos()),
2280 op: ast::RepetitionOp {
2281 span: Span::new(op_start, self.pos()),
2282 kind,
2283 },
2284 greedy: true,
2285 ast: Box::new(ast),
2286 }));
2287 Ok(concat)
2288 }
2289
2290 #[inline(never)]
2291 fn parse_counted_repetition(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
2292 assert!(self.char() == '{');
2293 let start = self.pos();
2294 let ast = match concat.asts.pop() {
2295 Some(ast) => ast,
2296 None => return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing)),
2297 };
2298 match ast {
2299 Ast::Empty(_) | Ast::Flags(_) => {
2300 return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing))
2301 }
2302 _ => {}
2303 }
2304 if !self.bump_and_bump_space() {
2305 return Err(self.error(
2306 Span::new(start, self.pos()),
2307 ast::ErrorKind::RepetitionCountUnclosed,
2308 ));
2309 }
2310 let count_start = specialize_err(
2311 self.parse_decimal(),
2312 ast::ErrorKind::DecimalEmpty,
2313 ast::ErrorKind::RepetitionCountDecimalEmpty,
2314 );
2315 if self.is_eof() {
2316 return Err(self.error(
2317 Span::new(start, self.pos()),
2318 ast::ErrorKind::RepetitionCountUnclosed,
2319 ));
2320 }
2321 let range = if self.char() == ',' {
2322 if !self.bump_and_bump_space() {
2323 return Err(self.error(
2324 Span::new(start, self.pos()),
2325 ast::ErrorKind::RepetitionCountUnclosed,
2326 ));
2327 }
2328 if self.char() != '}' {
2329 let count_start = match count_start {
2330 Ok(c) => c,
2331 Err(err) if err.kind == ast::ErrorKind::RepetitionCountDecimalEmpty => {
2332 if self.parser().empty_min_range {
2333 0
2334 } else {
2335 return Err(err);
2336 }
2337 }
2338 err => err?,
2339 };
2340 let count_end = specialize_err(
2341 self.parse_decimal(),
2342 ast::ErrorKind::DecimalEmpty,
2343 ast::ErrorKind::RepetitionCountDecimalEmpty,
2344 )?;
2345 ast::RepetitionRange::Bounded(count_start, count_end)
2346 } else {
2347 ast::RepetitionRange::AtLeast(count_start?)
2348 }
2349 } else {
2350 ast::RepetitionRange::Exactly(count_start?)
2351 };
2352
2353 if self.is_eof() || self.char() != '}' {
2354 return Err(self.error(
2355 Span::new(start, self.pos()),
2356 ast::ErrorKind::RepetitionCountUnclosed,
2357 ));
2358 }
2359
2360 if self.bump_and_bump_space() && self.char() == '?' {
2361 return Err(self.error(
2362 Span::new(start, self.pos()),
2363 ast::ErrorKind::UnsupportedLazyQuantifier,
2364 ));
2365 }
2366
2367 let op_span = Span::new(start, self.pos());
2368 if !range.is_valid() {
2369 return Err(self.error(op_span, ast::ErrorKind::RepetitionCountInvalid));
2370 }
2371
2372 let over_limit = match &range {
2373 ast::RepetitionRange::Exactly(n) => *n > self.max_repeat,
2374 ast::RepetitionRange::AtLeast(n) => *n > self.max_repeat,
2375 ast::RepetitionRange::Bounded(n, m) => *n > self.max_repeat || *m > self.max_repeat,
2376 };
2377 if over_limit {
2378 return Err(self.error(op_span, ast::ErrorKind::UnsupportedResharpRegex));
2379 }
2380 concat.asts.push(Ast::repetition(ast::Repetition {
2381 span: ast.span().with_end(self.pos()),
2382 op: ast::RepetitionOp {
2383 span: op_span,
2384 kind: ast::RepetitionKind::Range(range),
2385 },
2386 greedy: true,
2387 ast: Box::new(ast),
2388 }));
2389 Ok(concat)
2390 }
2391
2392 #[inline(never)]
2393 fn parse_group(&self) -> Result<Either<ast::SetFlags, ast::Group>> {
2394 assert_eq!(self.char(), '(');
2395 let open_span = self.span_char();
2396 self.bump();
2397 self.bump_space();
2398 if let Some((ahead, pos)) = self.is_lookaround_prefix() {
2399 let kind = match (pos, ahead) {
2400 (true, true) => LookaroundKind::PositiveLookahead,
2401 (true, false) => LookaroundKind::PositiveLookbehind,
2402 (false, true) => LookaroundKind::NegativeLookahead,
2403 (false, false) => LookaroundKind::NegativeLookbehind,
2404 };
2405 return Ok(Either::Right(ast::Group {
2406 span: open_span,
2407 kind: ast::GroupKind::Lookaround(kind),
2408 ast: Box::new(Ast::empty(self.span())),
2409 }));
2410 }
2411 let inner_span = self.span();
2412 let mut starts_with_p = true;
2413 if self.bump_if("?P<") || {
2414 starts_with_p = false;
2415 self.bump_if("?<")
2416 } {
2417 let capture_index = self.next_capture_index(open_span)?;
2418 let name = self.parse_capture_name(capture_index)?;
2419 Ok(Either::Right(ast::Group {
2420 span: open_span,
2421 kind: ast::GroupKind::CaptureName {
2422 starts_with_p,
2423 name,
2424 },
2425 ast: Box::new(Ast::empty(self.span())),
2426 }))
2427 } else if self.bump_if("?") {
2428 if self.is_eof() {
2429 return Err(self.error(open_span, ast::ErrorKind::GroupUnclosed));
2430 }
2431 let flags = self.parse_flags()?;
2432 let char_end = self.char();
2433 self.bump();
2434 if char_end == ')' {
2435 if flags.items.is_empty() {
2438 return Err(self.error(inner_span, ast::ErrorKind::RepetitionMissing));
2439 }
2440 Ok(Either::Left(ast::SetFlags {
2441 span: Span {
2442 end: self.pos(),
2443 ..open_span
2444 },
2445 flags,
2446 }))
2447 } else {
2448 assert_eq!(char_end, ':');
2449 Ok(Either::Right(ast::Group {
2450 span: open_span,
2451 kind: ast::GroupKind::NonCapturing(flags),
2452 ast: Box::new(Ast::empty(self.span())),
2453 }))
2454 }
2455 } else {
2456 let capture_index = self.next_capture_index(open_span)?;
2457 Ok(Either::Right(ast::Group {
2458 span: open_span,
2459 kind: ast::GroupKind::CaptureIndex(capture_index),
2460 ast: Box::new(Ast::empty(self.span())),
2461 }))
2462 }
2463 }
2464
2465 #[inline(never)]
2466 fn parse_capture_name(&self, capture_index: u32) -> Result<ast::CaptureName> {
2467 if self.is_eof() {
2468 return Err(self.error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
2469 }
2470 let start = self.pos();
2471 loop {
2472 if self.char() == '>' {
2473 break;
2474 }
2475 if !is_capture_char(self.char(), self.pos() == start) {
2476 return Err(self.error(self.span_char(), ast::ErrorKind::GroupNameInvalid));
2477 }
2478 if !self.bump() {
2479 break;
2480 }
2481 }
2482 let end = self.pos();
2483 if self.is_eof() {
2484 return Err(self.error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
2485 }
2486 assert_eq!(self.char(), '>');
2487 self.bump();
2488 let name = &self.pattern()[start.offset..end.offset];
2489 if name.is_empty() {
2490 return Err(self.error(Span::new(start, start), ast::ErrorKind::GroupNameEmpty));
2491 }
2492 let capname = ast::CaptureName {
2493 span: Span::new(start, end),
2494 name: name.to_string(),
2495 index: capture_index,
2496 };
2497 self.add_capture_name(&capname)?;
2498 Ok(capname)
2499 }
2500
2501 #[inline(never)]
2502 fn parse_flags(&self) -> Result<ast::Flags> {
2503 let mut flags = ast::Flags {
2504 span: self.span(),
2505 items: vec![],
2506 };
2507 let mut last_was_negation = None;
2508 while self.char() != ':' && self.char() != ')' {
2509 if self.char() == '-' {
2510 last_was_negation = Some(self.span_char());
2511 let item = ast::FlagsItem {
2512 span: self.span_char(),
2513 kind: ast::FlagsItemKind::Negation,
2514 };
2515 if let Some(i) = flags.add_item(item) {
2516 return Err(self.error(
2517 self.span_char(),
2518 ast::ErrorKind::FlagRepeatedNegation {
2519 original: flags.items[i].span,
2520 },
2521 ));
2522 }
2523 } else {
2524 last_was_negation = None;
2525 let item = ast::FlagsItem {
2526 span: self.span_char(),
2527 kind: ast::FlagsItemKind::Flag(self.parse_flag()?),
2528 };
2529 if let Some(i) = flags.add_item(item) {
2530 return Err(self.error(
2531 self.span_char(),
2532 ast::ErrorKind::FlagDuplicate {
2533 original: flags.items[i].span,
2534 },
2535 ));
2536 }
2537 }
2538 if !self.bump() {
2539 return Err(self.error(self.span(), ast::ErrorKind::FlagUnexpectedEof));
2540 }
2541 }
2542 if let Some(span) = last_was_negation {
2543 return Err(self.error(span, ast::ErrorKind::FlagDanglingNegation));
2544 }
2545 flags.span.end = self.pos();
2546 Ok(flags)
2547 }
2548
2549 #[inline(never)]
2550 fn parse_flag(&self) -> Result<ast::Flag> {
2551 match self.char() {
2552 'i' => Ok(ast::Flag::CaseInsensitive),
2553 'm' => Ok(ast::Flag::MultiLine),
2554 's' => Ok(ast::Flag::DotMatchesNewLine),
2555 'U' => Ok(ast::Flag::SwapGreed),
2556 'u' => Ok(ast::Flag::Unicode),
2557 'R' => Ok(ast::Flag::CRLF),
2558 'x' => Ok(ast::Flag::IgnoreWhitespace),
2559 _ => Err(self.error(self.span_char(), ast::ErrorKind::FlagUnrecognized)),
2560 }
2561 }
2562
2563 fn parse_primitive(&self) -> Result<Primitive> {
2564 match self.char() {
2565 '\\' => self.parse_escape(),
2566 '_' => {
2567 let ast = Primitive::Top(self.span_char());
2568 self.bump();
2569 Ok(ast)
2570 }
2571 '.' => {
2572 let ast = Primitive::Dot(self.span_char());
2573 self.bump();
2574 Ok(ast)
2575 }
2576 '^' => {
2577 let ast = Primitive::Assertion(ast::Assertion {
2578 span: self.span_char(),
2579 kind: ast::AssertionKind::StartLine,
2580 });
2581 self.bump();
2582 Ok(ast)
2583 }
2584 '$' => {
2585 let ast = Primitive::Assertion(ast::Assertion {
2586 span: self.span_char(),
2587 kind: ast::AssertionKind::EndLine,
2588 });
2589 self.bump();
2590 Ok(ast)
2591 }
2592 c => {
2593 let ast = Primitive::Literal(Literal {
2594 span: self.span_char(),
2595 kind: LiteralKind::Verbatim,
2596 c,
2597 });
2598 self.bump();
2599 Ok(ast)
2600 }
2601 }
2602 }
2603
2604 #[inline(never)]
2605 fn parse_escape(&self) -> Result<Primitive> {
2606 assert_eq!(self.char(), '\\');
2607 let start = self.pos();
2608 if !self.bump() {
2609 return Err(self.error(
2610 Span::new(start, self.pos()),
2611 ast::ErrorKind::EscapeUnexpectedEof,
2612 ));
2613 }
2614 let c = self.char();
2615 match c {
2617 '0'..='9' => {
2618 if !self.parser().octal {
2619 return Err(self.error(
2620 Span::new(start, self.span_char().end),
2621 ast::ErrorKind::UnsupportedBackreference,
2622 ));
2623 }
2624 let mut lit = self.parse_octal();
2625 lit.span.start = start;
2626 return Ok(Primitive::Literal(lit));
2627 }
2628 'x' | 'u' | 'U' => {
2629 let mut lit = self.parse_hex()?;
2630 lit.span.start = start;
2631 return Ok(Primitive::Literal(lit));
2632 }
2633 'p' | 'P' => {
2634 let mut cls = self.parse_unicode_class()?;
2635 cls.span.start = start;
2636 return Ok(Primitive::Unicode(cls));
2637 }
2638 'd' | 's' | 'w' | 'D' | 'S' | 'W' => {
2639 let mut cls = self.parse_perl_class();
2640 cls.span.start = start;
2641 return Ok(Primitive::Perl(cls));
2642 }
2643 _ => {}
2644 }
2645
2646 self.bump();
2648 let span = Span::new(start, self.pos());
2649 if is_meta_character(c) {
2650 return Ok(Primitive::Literal(Literal {
2651 span,
2652 kind: LiteralKind::Meta,
2653 c,
2654 }));
2655 }
2656 if is_escapeable_character(c) {
2657 return Ok(Primitive::Literal(Literal {
2658 span,
2659 kind: LiteralKind::Superfluous,
2660 c,
2661 }));
2662 }
2663 let special = |kind, c| {
2664 Ok(Primitive::Literal(Literal {
2665 span,
2666 kind: LiteralKind::Special(kind),
2667 c,
2668 }))
2669 };
2670 match c {
2671 'a' => special(SpecialLiteralKind::Bell, '\x07'),
2672 'f' => special(SpecialLiteralKind::FormFeed, '\x0C'),
2673 't' => special(SpecialLiteralKind::Tab, '\t'),
2674 'n' => special(SpecialLiteralKind::LineFeed, '\n'),
2675 'r' => special(SpecialLiteralKind::CarriageReturn, '\r'),
2676 'v' => special(SpecialLiteralKind::VerticalTab, '\x0B'),
2677 'A' => Ok(Primitive::Assertion(ast::Assertion {
2678 span,
2679 kind: ast::AssertionKind::StartText,
2680 })),
2681 'z' => Ok(Primitive::Assertion(ast::Assertion {
2682 span,
2683 kind: ast::AssertionKind::EndText,
2684 })),
2685 'b' => {
2686 let mut wb = ast::Assertion {
2687 span,
2688 kind: ast::AssertionKind::WordBoundary,
2689 };
2690 if !self.is_eof() && self.char() == '{' {
2693 if let Some(kind) = self.maybe_parse_special_word_boundary(start)? {
2694 wb.kind = kind;
2695 wb.span.end = self.pos();
2696 }
2697 }
2698 Ok(Primitive::Assertion(wb))
2699 }
2700 'B' => Ok(Primitive::Assertion(ast::Assertion {
2701 span,
2702 kind: ast::AssertionKind::NotWordBoundary,
2703 })),
2704 '<' => Ok(Primitive::Assertion(ast::Assertion {
2705 span,
2706 kind: ast::AssertionKind::WordBoundaryStartAngle,
2707 })),
2708 '>' => Ok(Primitive::Assertion(ast::Assertion {
2709 span,
2710 kind: ast::AssertionKind::WordBoundaryEndAngle,
2711 })),
2712 _ => Err(self.error(span, ast::ErrorKind::EscapeUnrecognized)),
2713 }
2714 }
2715
2716 fn maybe_parse_special_word_boundary(
2717 &self,
2718 wb_start: Position,
2719 ) -> Result<Option<ast::AssertionKind>> {
2720 assert_eq!(self.char(), '{');
2721
2722 let is_valid_char = |c| matches!(c, 'A'..='Z' | 'a'..='z' | '-');
2723 let start = self.pos();
2724 if !self.bump_and_bump_space() {
2725 return Err(self.error(
2726 Span::new(wb_start, self.pos()),
2727 ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof,
2728 ));
2729 }
2730 let start_contents = self.pos();
2731 if !is_valid_char(self.char()) {
2732 self.parser().pos.set(start);
2733 return Ok(None);
2734 }
2735
2736 let mut scratch = self.parser().scratch.borrow_mut();
2738 scratch.clear();
2739 while !self.is_eof() && is_valid_char(self.char()) {
2740 scratch.push(self.char());
2741 self.bump_and_bump_space();
2742 }
2743 if self.is_eof() || self.char() != '}' {
2744 return Err(self.error(
2745 Span::new(start, self.pos()),
2746 ast::ErrorKind::SpecialWordBoundaryUnclosed,
2747 ));
2748 }
2749 let end = self.pos();
2750 self.bump();
2751 let kind = match scratch.as_str() {
2752 "start" => ast::AssertionKind::WordBoundaryStart,
2753 "end" => ast::AssertionKind::WordBoundaryEnd,
2754 "start-half" => ast::AssertionKind::WordBoundaryStartHalf,
2755 "end-half" => ast::AssertionKind::WordBoundaryEndHalf,
2756 _ => {
2757 return Err(self.error(
2758 Span::new(start_contents, end),
2759 ast::ErrorKind::SpecialWordBoundaryUnrecognized,
2760 ))
2761 }
2762 };
2763 Ok(Some(kind))
2764 }
2765
2766 #[inline(never)]
2767 fn parse_octal(&self) -> Literal {
2768 assert!(self.parser().octal);
2769 assert!('0' <= self.char() && self.char() <= '7');
2770 let start = self.pos();
2771 while self.bump()
2773 && '0' <= self.char()
2774 && self.char() <= '7'
2775 && self.pos().offset - start.offset <= 2
2776 {}
2777 let end = self.pos();
2778 let octal = &self.pattern()[start.offset..end.offset];
2779 let codepoint = u32::from_str_radix(octal, 8).expect("valid octal number");
2782 let c = char::from_u32(codepoint).expect("Unicode scalar value");
2785 Literal {
2786 span: Span::new(start, end),
2787 kind: LiteralKind::Octal,
2788 c,
2789 }
2790 }
2791
2792 #[inline(never)]
2793 fn parse_hex(&self) -> Result<Literal> {
2794 assert!(self.char() == 'x' || self.char() == 'u' || self.char() == 'U');
2795
2796 let hex_kind = match self.char() {
2797 'x' => HexLiteralKind::X,
2798 'u' => HexLiteralKind::UnicodeShort,
2799 _ => HexLiteralKind::UnicodeLong,
2800 };
2801 if !self.bump_and_bump_space() {
2802 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2803 }
2804 if self.char() == '{' {
2805 self.parse_hex_brace(hex_kind)
2806 } else {
2807 self.parse_hex_digits(hex_kind)
2808 }
2809 }
2810
2811 #[inline(never)]
2812 fn parse_hex_digits(&self, kind: HexLiteralKind) -> Result<Literal> {
2813 let mut scratch = self.parser().scratch.borrow_mut();
2814 scratch.clear();
2815
2816 let start = self.pos();
2817 for i in 0..kind.digits() {
2818 if i > 0 && !self.bump_and_bump_space() {
2819 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2820 }
2821 if !is_hex(self.char()) {
2822 return Err(self.error(self.span_char(), ast::ErrorKind::EscapeHexInvalidDigit));
2823 }
2824 scratch.push(self.char());
2825 }
2826 self.bump_and_bump_space();
2827 let end = self.pos();
2828 let hex = scratch.as_str();
2829 match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
2830 None => Err(self.error(Span::new(start, end), ast::ErrorKind::EscapeHexInvalid)),
2831 Some(c) => Ok(Literal {
2832 span: Span::new(start, end),
2833 kind: LiteralKind::HexFixed(kind),
2834 c,
2835 }),
2836 }
2837 }
2838
2839 #[inline(never)]
2840 fn parse_hex_brace(&self, kind: HexLiteralKind) -> Result<Literal> {
2841 let mut scratch = self.parser().scratch.borrow_mut();
2842 scratch.clear();
2843
2844 let brace_pos = self.pos();
2845 let start = self.span_char().end;
2846 while self.bump_and_bump_space() && self.char() != '}' {
2847 if !is_hex(self.char()) {
2848 return Err(self.error(self.span_char(), ast::ErrorKind::EscapeHexInvalidDigit));
2849 }
2850 scratch.push(self.char());
2851 }
2852 if self.is_eof() {
2853 return Err(self.error(
2854 Span::new(brace_pos, self.pos()),
2855 ast::ErrorKind::EscapeUnexpectedEof,
2856 ));
2857 }
2858 let end = self.pos();
2859 let hex = scratch.as_str();
2860 assert_eq!(self.char(), '}');
2861 self.bump_and_bump_space();
2862
2863 if hex.is_empty() {
2864 return Err(self.error(
2865 Span::new(brace_pos, self.pos()),
2866 ast::ErrorKind::EscapeHexEmpty,
2867 ));
2868 }
2869 match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
2870 None => Err(self.error(Span::new(start, end), ast::ErrorKind::EscapeHexInvalid)),
2871 Some(c) => Ok(Literal {
2872 span: Span::new(start, self.pos()),
2873 kind: LiteralKind::HexBrace(kind),
2874 c,
2875 }),
2876 }
2877 }
2878
2879 fn parse_decimal(&self) -> Result<u32> {
2880 let mut scratch = self.parser().scratch.borrow_mut();
2881 scratch.clear();
2882
2883 while !self.is_eof() && self.char().is_whitespace() {
2884 self.bump();
2885 }
2886 let start = self.pos();
2887 while !self.is_eof() && '0' <= self.char() && self.char() <= '9' {
2888 scratch.push(self.char());
2889 self.bump_and_bump_space();
2890 }
2891 let span = Span::new(start, self.pos());
2892 while !self.is_eof() && self.char().is_whitespace() {
2893 self.bump_and_bump_space();
2894 }
2895 let digits = scratch.as_str();
2896 if digits.is_empty() {
2897 return Err(self.error(span, ast::ErrorKind::DecimalEmpty));
2898 }
2899 match digits.parse::<u32>().ok() {
2900 Some(n) => Ok(n),
2901 None => Err(self.error(span, ast::ErrorKind::DecimalInvalid)),
2902 }
2903 }
2904
2905 #[inline(never)]
2906 fn parse_set_class(&self) -> Result<ClassBracketed> {
2907 assert_eq!(self.char(), '[');
2908
2909 let mut union = ClassSetUnion {
2910 span: self.span(),
2911 items: vec![],
2912 };
2913 loop {
2914 self.bump_space();
2915 if self.is_eof() {
2916 return Err(self.unclosed_class_error());
2917 }
2918 match self.char() {
2919 '[' => {
2920 if !self.parser().stack_class.borrow().is_empty() {
2921 if let Some(cls) = self.maybe_parse_ascii_class() {
2922 union.push(ClassSetItem::Ascii(cls));
2923 continue;
2924 }
2925 }
2926 union = self.push_class_open(union)?;
2927 }
2928 ']' => match self.pop_class(union)? {
2929 Either::Left(nested_union) => {
2930 union = nested_union;
2931 }
2932 Either::Right(class) => return Ok(class),
2933 },
2934 '&' if self.peek() == Some('&') => {
2935 assert!(self.bump_if("&&"));
2936 union = self.push_class_op(ClassSetBinaryOpKind::Intersection, union);
2937 }
2938 '-' if self.peek() == Some('-') => {
2939 assert!(self.bump_if("--"));
2940 union = self.push_class_op(ClassSetBinaryOpKind::Difference, union);
2941 }
2942 '~' if self.peek() == Some('~') => {
2943 assert!(self.bump_if("~~"));
2944 union = self.push_class_op(ClassSetBinaryOpKind::SymmetricDifference, union);
2945 }
2946 _ => {
2947 union.push(self.parse_set_class_range()?);
2948 }
2949 }
2950 }
2951 }
2952
2953 #[inline(never)]
2954 fn parse_set_class_range(&self) -> Result<ClassSetItem> {
2955 let prim1 = self.parse_set_class_item()?;
2956 self.bump_space();
2957 if self.is_eof() {
2958 return Err(self.unclosed_class_error());
2959 }
2960 if self.char() != '-' || self.peek_space() == Some(']') || self.peek_space() == Some('-') {
2961 return prim1.into_class_set_item(self);
2962 }
2963 if !self.bump_and_bump_space() {
2964 return Err(self.unclosed_class_error());
2965 }
2966 let prim2 = self.parse_set_class_item()?;
2967 let range = ClassSetRange {
2968 span: Span::new(prim1.span().start, prim2.span().end),
2969 start: prim1.into_class_literal(self)?,
2970 end: prim2.into_class_literal(self)?,
2971 };
2972 if !range.is_valid() {
2973 return Err(self.error(range.span, ast::ErrorKind::ClassRangeInvalid));
2974 }
2975 Ok(ClassSetItem::Range(range))
2976 }
2977
2978 #[inline(never)]
2979 fn parse_set_class_item(&self) -> Result<Primitive> {
2980 if self.char() == '\\' {
2981 self.parse_escape()
2982 } else {
2983 let x = Primitive::Literal(Literal {
2984 span: self.span_char(),
2985 kind: LiteralKind::Verbatim,
2986 c: self.char(),
2987 });
2988 self.bump();
2989 Ok(x)
2990 }
2991 }
2992
2993 #[inline(never)]
2994 fn parse_set_class_open(&self) -> Result<(ClassBracketed, ClassSetUnion)> {
2995 assert_eq!(self.char(), '[');
2996 let start = self.pos();
2997 if !self.bump_and_bump_space() {
2998 return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2999 }
3000
3001 let negated = if self.char() != '^' {
3002 false
3003 } else {
3004 if !self.bump_and_bump_space() {
3005 return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
3006 }
3007 true
3008 };
3009 let mut union = ClassSetUnion {
3011 span: self.span(),
3012 items: vec![],
3013 };
3014 while self.char() == '-' {
3015 union.push(ClassSetItem::Literal(Literal {
3016 span: self.span_char(),
3017 kind: LiteralKind::Verbatim,
3018 c: '-',
3019 }));
3020 if !self.bump_and_bump_space() {
3021 return Err(self.error(Span::new(start, start), ast::ErrorKind::ClassUnclosed));
3022 }
3023 }
3024 if union.items.is_empty() && self.char() == ']' {
3027 union.push(ClassSetItem::Literal(Literal {
3028 span: self.span_char(),
3029 kind: LiteralKind::Verbatim,
3030 c: ']',
3031 }));
3032 if !self.bump_and_bump_space() {
3033 return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
3034 }
3035 }
3036 let set = ClassBracketed {
3037 span: Span::new(start, self.pos()),
3038 negated,
3039 kind: ClassSet::union(ClassSetUnion {
3040 span: Span::new(union.span.start, union.span.start),
3041 items: vec![],
3042 }),
3043 };
3044 Ok((set, union))
3045 }
3046
3047 #[inline(never)]
3048 fn maybe_parse_ascii_class(&self) -> Option<ClassAscii> {
3049 assert_eq!(self.char(), '[');
3050 let start = self.pos();
3052 let mut negated = false;
3053 if !self.bump() || self.char() != ':' {
3054 self.parser().pos.set(start);
3055 return None;
3056 }
3057 if !self.bump() {
3058 self.parser().pos.set(start);
3059 return None;
3060 }
3061 if self.char() == '^' {
3062 negated = true;
3063 if !self.bump() {
3064 self.parser().pos.set(start);
3065 return None;
3066 }
3067 }
3068 let name_start = self.offset();
3069 while self.char() != ':' && self.bump() {}
3070 if self.is_eof() {
3071 self.parser().pos.set(start);
3072 return None;
3073 }
3074 let name = &self.pattern()[name_start..self.offset()];
3075 if !self.bump_if(":]") {
3076 self.parser().pos.set(start);
3077 return None;
3078 }
3079 let kind = match regex_syntax::ast::ClassAsciiKind::from_name(name) {
3080 Some(kind) => kind,
3081 None => {
3082 self.parser().pos.set(start);
3083 return None;
3084 }
3085 };
3086 Some(ClassAscii {
3087 span: Span::new(start, self.pos()),
3088 kind,
3089 negated,
3090 })
3091 }
3092
3093 #[inline(never)]
3094 fn parse_unicode_class(&self) -> Result<ClassUnicode> {
3095 assert!(self.char() == 'p' || self.char() == 'P');
3096
3097 let mut scratch = self.parser().scratch.borrow_mut();
3098 scratch.clear();
3099
3100 let negated = self.char() == 'P';
3101 if !self.bump_and_bump_space() {
3102 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
3103 }
3104 let (start, kind) = if self.char() == '{' {
3105 let start = self.span_char().end;
3106 while self.bump_and_bump_space() && self.char() != '}' {
3107 scratch.push(self.char());
3108 }
3109 if self.is_eof() {
3110 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
3111 }
3112 assert_eq!(self.char(), '}');
3113 self.bump();
3114
3115 let name = scratch.as_str();
3116 if let Some(i) = name.find("!=") {
3117 (
3118 start,
3119 ClassUnicodeKind::NamedValue {
3120 op: ClassUnicodeOpKind::NotEqual,
3121 name: name[..i].to_string(),
3122 value: name[i + 2..].to_string(),
3123 },
3124 )
3125 } else if let Some(i) = name.find(':') {
3126 (
3127 start,
3128 ClassUnicodeKind::NamedValue {
3129 op: ClassUnicodeOpKind::Colon,
3130 name: name[..i].to_string(),
3131 value: name[i + 1..].to_string(),
3132 },
3133 )
3134 } else if let Some(i) = name.find('=') {
3135 (
3136 start,
3137 ClassUnicodeKind::NamedValue {
3138 op: ClassUnicodeOpKind::Equal,
3139 name: name[..i].to_string(),
3140 value: name[i + 1..].to_string(),
3141 },
3142 )
3143 } else {
3144 (start, ClassUnicodeKind::Named(name.to_string()))
3145 }
3146 } else {
3147 let start = self.pos();
3148 let c = self.char();
3149 if c == '\\' {
3150 return Err(self.error(self.span_char(), ast::ErrorKind::UnicodeClassInvalid));
3151 }
3152 self.bump_and_bump_space();
3153 let kind = ClassUnicodeKind::OneLetter(c);
3154 (start, kind)
3155 };
3156 Ok(ClassUnicode {
3157 span: Span::new(start, self.pos()),
3158 negated,
3159 kind,
3160 })
3161 }
3162
3163 #[inline(never)]
3164 fn parse_perl_class(&self) -> ClassPerl {
3165 let c = self.char();
3166 let span = self.span_char();
3167 self.bump();
3168 let (negated, kind) = match c {
3169 'd' => (false, regex_syntax::ast::ClassPerlKind::Digit),
3170 'D' => (true, regex_syntax::ast::ClassPerlKind::Digit),
3171 's' => (false, regex_syntax::ast::ClassPerlKind::Space),
3172 'S' => (true, regex_syntax::ast::ClassPerlKind::Space),
3173 'w' => (false, regex_syntax::ast::ClassPerlKind::Word),
3174 'W' => (true, regex_syntax::ast::ClassPerlKind::Word),
3175 c => panic!("expected valid Perl class but got '{}'", c),
3176 };
3177 ClassPerl {
3178 span,
3179 kind,
3180 negated,
3181 }
3182 }
3183}
3184
3185fn is_universal_perl_pair(item: ®ex_syntax::ast::ClassSetItem) -> bool {
3188 use regex_syntax::ast::ClassSetItem;
3189 let items = match item {
3190 ClassSetItem::Union(u) => &u.items,
3191 _ => return false,
3192 };
3193 if items.len() != 2 {
3194 return false;
3195 }
3196 match (&items[0], &items[1]) {
3197 (ClassSetItem::Perl(a), ClassSetItem::Perl(b)) => {
3198 let is_all = a.kind == b.kind && a.negated != b.negated;
3199 is_all
3200 }
3201 _ => false,
3202 }
3203}
3204
3205pub fn max_concat_length(ast: &ast::Ast) -> usize {
3206 match ast {
3207 ast::Ast::Empty(_)
3208 | ast::Ast::Flags(_)
3209 | ast::Ast::Literal(_)
3210 | ast::Ast::Dot(_)
3211 | ast::Ast::Top(_)
3212 | ast::Ast::Assertion(_)
3213 | ast::Ast::ClassUnicode(_)
3214 | ast::Ast::ClassPerl(_)
3215 | ast::Ast::ClassBracketed(_) => 0,
3216 ast::Ast::Group(g) => max_concat_length(&g.ast),
3217 ast::Ast::Complement(c) => max_concat_length(&c.ast),
3218 ast::Ast::Lookaround(l) => max_concat_length(&l.ast),
3219 ast::Ast::Repetition(r) => max_concat_length(&r.ast),
3220 ast::Ast::Concat(c) => c
3221 .asts
3222 .len()
3223 .max(c.asts.iter().map(max_concat_length).max().unwrap_or(0)),
3224 ast::Ast::Alternation(a) => a.asts.iter().map(max_concat_length).max().unwrap_or(0),
3225 ast::Ast::Intersection(i) => i.asts.iter().map(max_concat_length).max().unwrap_or(0),
3226 }
3227}
3228
3229pub fn expanded_ast_size(ast: &ast::Ast, limit: u64) -> u64 {
3230 fn go(ast: &ast::Ast, limit: u64) -> u64 {
3231 match ast {
3232 ast::Ast::Empty(_) | ast::Ast::Flags(_) => 1,
3233 ast::Ast::Literal(_) | ast::Ast::Dot(_) | ast::Ast::Top(_) => 1,
3234 ast::Ast::Assertion(_) => 1,
3235 ast::Ast::ClassUnicode(_) | ast::Ast::ClassPerl(_) | ast::Ast::ClassBracketed(_) => 1,
3236 ast::Ast::Group(g) => go(&g.ast, limit).saturating_add(1).min(limit),
3237 ast::Ast::Complement(c) => go(&c.ast, limit).saturating_add(1).min(limit),
3238 ast::Ast::Lookaround(l) => go(&l.ast, limit).saturating_add(1).min(limit),
3239 ast::Ast::Concat(c) => sum_children(&c.asts, limit),
3240 ast::Ast::Alternation(a) => sum_children(&a.asts, limit),
3241 ast::Ast::Intersection(i) => sum_children(&i.asts, limit),
3242 ast::Ast::Repetition(r) => {
3243 let body = go(&r.ast, limit);
3244 let factor: u64 = match &r.op.kind {
3245 ast::RepetitionKind::ZeroOrOne => 2,
3246 ast::RepetitionKind::ZeroOrMore | ast::RepetitionKind::OneOrMore => 2,
3247 ast::RepetitionKind::Range(ast::RepetitionRange::Exactly(n)) => {
3248 (*n as u64).max(1)
3249 }
3250 ast::RepetitionKind::Range(ast::RepetitionRange::AtLeast(n)) => {
3251 (*n as u64).max(1).saturating_add(1)
3252 }
3253 ast::RepetitionKind::Range(ast::RepetitionRange::Bounded(_, m)) => {
3254 (*m as u64).max(1)
3255 }
3256 };
3257 body.saturating_mul(factor).min(limit)
3258 }
3259 }
3260 }
3261 fn sum_children(children: &[ast::Ast], limit: u64) -> u64 {
3262 let mut total: u64 = 0;
3263 for c in children {
3264 total = total.saturating_add(go(c, limit));
3265 if total >= limit {
3266 return limit;
3267 }
3268 }
3269 total
3270 }
3271 go(ast, limit)
3272}
3273
3274pub fn parse_ast<'s>(tb: &mut TB<'s>, pattern: &'s str) -> std::result::Result<NodeId, ParseError> {
3275 let mut p: ResharpParser<'s> = ResharpParser::new(pattern);
3276 p.parse(tb)
3277}
3278
3279pub fn parse_ast_with<'s>(
3280 tb: &mut TB<'s>,
3281 pattern: &'s str,
3282 flags: &PatternFlags,
3283) -> std::result::Result<NodeId, ParseError> {
3284 let mut p: ResharpParser<'s> = ResharpParser::with_flags(pattern, flags);
3285 p.parse(tb)
3286}
3287
3288pub fn parse_to_ast(pattern: &str) -> std::result::Result<ast::Ast, ParseError> {
3290 let mut p: ResharpParser = ResharpParser::new(pattern);
3291 p.parse_inner()
3292}