1#![warn(dead_code)]
6pub mod ast;
7use std::cell::{Cell, RefCell};
8
9use ast::{Ast, Concat, ErrorKind, GroupKind, LookaroundKind, RepetitionKind};
10use regex_syntax::{
11 ast::{
12 ClassAscii, ClassBracketed, ClassPerl, ClassSet, ClassSetBinaryOpKind, ClassSetItem,
13 ClassSetRange, ClassSetUnion, ClassUnicode, ClassUnicodeKind, ClassUnicodeOpKind,
14 HexLiteralKind, Literal, LiteralKind, Position, Span, SpecialLiteralKind,
15 },
16 hir::{
17 self,
18 translate::{Translator, TranslatorBuilder},
19 },
20 utf8::Utf8Sequences,
21};
22use resharp_algebra::{Kind, NodeId};
23
/// Shorthand for the algebra crate's regex builder used throughout this file.
/// The `'s` parameter is not referenced by the aliased type.
type TB<'s> = resharp_algebra::RegexBuilder;
25
/// Options that configure a `ResharpParser` (see `ResharpParser::with_flags`).
pub struct PatternFlags {
    /// Forwarded to the HIR translator's `unicode` setting; also turns on the
    /// parser's global Unicode mode.
    pub unicode: bool,
    /// Stored as the parser's "full Unicode" switch; also turns on the
    /// parser's global Unicode mode.
    pub full_unicode: bool,
    /// Forwarded to the HIR translator's `case_insensitive` setting.
    pub case_insensitive: bool,
    /// Initial value of the parser's "dot matches newline" state.
    pub dot_matches_new_line: bool,
    /// Initial value of the parser's multiline state.
    pub multiline: bool,
    /// Initial value of the parser's whitespace-insensitive mode (whitespace
    /// and `#` comments in the pattern are skipped while it is on).
    pub ignore_whitespace: bool,
    /// Requests ASCII-only Perl classes (`\d`, `\w`, `\s`); also turns on the
    /// parser's global Unicode mode.
    pub ascii_perl_classes: bool,
    /// Copied into the parser as `expanded_ast_limit`.
    /// NOTE(review): presumably caps the size of the expanded AST - confirm at the use site.
    pub expanded_ast_limit: u64,
    /// Copied into the parser as `max_list_len`.
    /// NOTE(review): presumably caps list lengths while building nodes - confirm.
    pub max_list_len: usize,
    /// Copied into the parser as `max_repeat`.
    /// NOTE(review): presumably the largest accepted repetition bound - confirm.
    pub max_repeat: u32,
    /// Copied into the parser as `max_depth`.
    /// NOTE(review): presumably the nesting-depth limit - confirm.
    pub max_depth: usize,
}
53
/// Value used for `PatternFlags::max_repeat` by `PatternFlags::default()`.
pub const DEFAULT_MAX_REPEAT: u32 = 500;
/// Value used for `PatternFlags::expanded_ast_limit` by `PatternFlags::default()`.
pub const DEFAULT_EXPANDED_AST_LIMIT: u64 = 50_000;
/// Value used for `PatternFlags::max_list_len` by `PatternFlags::default()`.
pub const DEFAULT_MAX_LIST_LEN: usize = 4_000;
/// Value used for `PatternFlags::max_depth` by `PatternFlags::default()`.
pub const DEFAULT_MAX_DEPTH: usize = 1_000;
60
61impl Default for PatternFlags {
62 fn default() -> Self {
63 Self {
64 unicode: true,
65 full_unicode: false,
66 case_insensitive: false,
67 dot_matches_new_line: false,
68 multiline: true,
69 ignore_whitespace: false,
70 ascii_perl_classes: false,
71 expanded_ast_limit: DEFAULT_EXPANDED_AST_LIMIT,
72 max_list_len: DEFAULT_MAX_LIST_LEN,
73 max_repeat: DEFAULT_MAX_REPEAT,
74 max_depth: DEFAULT_MAX_DEPTH,
75 }
76 }
77}
78
/// Coarse classification of what a pattern fragment can match with respect to
/// word characters.
#[derive(Clone, Copy, PartialEq, Debug)]
enum WordCharKind {
    /// Every matched character is a word character.
    Word,
    /// No matched character is a word character.
    NonWord,
    // NOTE(review): `MaybeWord`, `MaybeNonWord` and `Edge` are not produced by
    // the classifiers visible in this part of the file - confirm their
    // meaning at the use sites.
    MaybeWord,
    MaybeNonWord,
    /// The classification could not be determined.
    Unknown,
    Edge,
}
88
/// Returns `true` when `b` is an ASCII letter, an ASCII digit, or `_`.
fn is_word_byte(b: u8) -> bool {
    matches!(b, b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z' | b'_')
}
92
93fn class_set_item_word_kind(item: ®ex_syntax::ast::ClassSetItem) -> WordCharKind {
94 use regex_syntax::ast::{ClassPerlKind, ClassSetItem};
95 use WordCharKind::*;
96 match item {
97 ClassSetItem::Empty(_) => Unknown,
98 ClassSetItem::Literal(l) => {
99 if is_word_byte(l.c as u8) {
100 Word
101 } else {
102 NonWord
103 }
104 }
105 ClassSetItem::Range(r) => {
106 let all_word = (r.start.c as u8..=r.end.c as u8).all(is_word_byte);
107 let all_non = (r.start.c as u8..=r.end.c as u8).all(|b| !is_word_byte(b));
108 if all_word {
109 Word
110 } else if all_non {
111 NonWord
112 } else {
113 Unknown
114 }
115 }
116 ClassSetItem::Perl(p) => match (&p.kind, p.negated) {
117 (ClassPerlKind::Word, false) => Word,
118 (ClassPerlKind::Word, true) => NonWord,
119 (ClassPerlKind::Space, false) => NonWord,
120 (ClassPerlKind::Digit, false) => Word,
121 _ => Unknown,
122 },
123 ClassSetItem::Bracketed(b) => class_bracketed_word_kind(b),
124 ClassSetItem::Union(u) => {
125 let mut kind = Unknown;
126 for item in &u.items {
127 let k = class_set_item_word_kind(item);
128 kind = match (kind, k) {
129 (_, Unknown) => return Unknown,
130 (Unknown, _) => k,
131 (Word, Word) => Word,
132 (NonWord, NonWord) => NonWord,
133 _ => return Unknown,
134 };
135 }
136 kind
137 }
138 _ => Unknown,
139 }
140}
141
142fn class_bracketed_word_kind(c: ®ex_syntax::ast::ClassBracketed) -> WordCharKind {
143 use regex_syntax::ast::{ClassPerlKind, ClassSet, ClassSetItem};
144 use WordCharKind::*;
145 if c.negated {
146 return match &c.kind {
147 ClassSet::Item(ClassSetItem::Perl(p)) if p.kind == ClassPerlKind::Word => {
148 if p.negated {
149 Word
150 } else {
151 NonWord
152 }
153 }
154 _ => Unknown,
155 };
156 }
157 match &c.kind {
158 ClassSet::Item(item) => class_set_item_word_kind(item),
159 ClassSet::BinaryOp(_) => Unknown,
160 }
161}
162
163fn ascii_class_lit(span: Span, c: char) -> regex_syntax::ast::Literal {
164 regex_syntax::ast::Literal {
165 span,
166 kind: regex_syntax::ast::LiteralKind::Verbatim,
167 c,
168 }
169}
170
171fn ascii_class_range(span: Span, a: char, b: char) -> regex_syntax::ast::ClassSetItem {
172 regex_syntax::ast::ClassSetItem::Range(regex_syntax::ast::ClassSetRange {
173 span,
174 start: ascii_class_lit(span, a),
175 end: ascii_class_lit(span, b),
176 })
177}
178
179fn ascii_perl_positive(
180 span: Span,
181 kind: ®ex_syntax::ast::ClassPerlKind,
182) -> regex_syntax::ast::ClassSetItem {
183 use regex_syntax::ast::{ClassPerlKind, ClassSetItem, ClassSetUnion};
184 match kind {
185 ClassPerlKind::Digit => ascii_class_range(span, '0', '9'),
186 ClassPerlKind::Word => ClassSetItem::Union(ClassSetUnion {
187 span,
188 items: vec![
189 ascii_class_range(span, 'a', 'z'),
190 ascii_class_range(span, 'A', 'Z'),
191 ascii_class_range(span, '0', '9'),
192 ClassSetItem::Literal(ascii_class_lit(span, '_')),
193 ],
194 }),
195 ClassPerlKind::Space => ClassSetItem::Union(ClassSetUnion {
196 span,
197 items: ['\t', '\n', '\x0B', '\x0C', '\r', ' ']
198 .into_iter()
199 .map(|c| ClassSetItem::Literal(ascii_class_lit(span, c)))
200 .collect(),
201 }),
202 }
203}
204
205fn ascii_perl_set_item(
206 span: Span,
207 kind: ®ex_syntax::ast::ClassPerlKind,
208 negated: bool,
209) -> regex_syntax::ast::ClassSetItem {
210 use regex_syntax::ast::{ClassBracketed, ClassSet, ClassSetItem};
211 let positive = ascii_perl_positive(span, kind);
212 if negated {
213 ClassSetItem::Bracketed(Box::new(ClassBracketed {
214 span,
215 negated: true,
216 kind: ClassSet::Item(positive),
217 }))
218 } else {
219 positive
220 }
221}
222
223fn rewrite_ascii_perl_set(set: ®ex_syntax::ast::ClassSet) -> regex_syntax::ast::ClassSet {
224 use regex_syntax::ast::{ClassSet, ClassSetBinaryOp};
225 match set {
226 ClassSet::Item(item) => ClassSet::Item(rewrite_ascii_perl_item(item)),
227 ClassSet::BinaryOp(op) => ClassSet::BinaryOp(ClassSetBinaryOp {
228 span: op.span,
229 kind: op.kind.clone(),
230 lhs: Box::new(rewrite_ascii_perl_set(&op.lhs)),
231 rhs: Box::new(rewrite_ascii_perl_set(&op.rhs)),
232 }),
233 }
234}
235
236fn rewrite_ascii_perl_item(
237 item: ®ex_syntax::ast::ClassSetItem,
238) -> regex_syntax::ast::ClassSetItem {
239 use regex_syntax::ast::{ClassBracketed, ClassSetItem, ClassSetUnion};
240 match item {
241 ClassSetItem::Perl(p) => ascii_perl_set_item(p.span, &p.kind, p.negated),
242 ClassSetItem::Union(u) => ClassSetItem::Union(ClassSetUnion {
243 span: u.span,
244 items: u.items.iter().map(rewrite_ascii_perl_item).collect(),
245 }),
246 ClassSetItem::Bracketed(b) => ClassSetItem::Bracketed(Box::new(ClassBracketed {
247 span: b.span,
248 negated: b.negated,
249 kind: rewrite_ascii_perl_set(&b.kind),
250 })),
251 other => other.clone(),
252 }
253}
254
/// A single atomic pattern element, convertible either into an `Ast` node or
/// (for some variants) into a character-class item.
#[derive(Clone, Debug, Eq, PartialEq)]
enum Primitive {
    /// A literal character.
    Literal(Literal),
    /// An assertion node.
    Assertion(ast::Assertion),
    /// Becomes `Ast::dot`; carries only its span.
    Dot(Span),
    /// Becomes `Ast::top`; carries only its span.
    Top(Span),
    /// A Perl character class.
    Perl(ClassPerl),
    /// A Unicode character class.
    Unicode(ClassUnicode),
}
264
265impl Primitive {
266 fn span(&self) -> &Span {
267 match *self {
268 Primitive::Literal(ref x) => &x.span,
269 Primitive::Assertion(ref x) => &x.span,
270 Primitive::Dot(ref span) => span,
271 Primitive::Top(ref span) => span,
272 Primitive::Perl(ref x) => &x.span,
273 Primitive::Unicode(ref x) => &x.span,
274 }
275 }
276
277 fn into_ast(self) -> Ast {
278 match self {
279 Primitive::Literal(lit) => Ast::literal(lit),
280 Primitive::Assertion(assert) => Ast::assertion(assert),
281 Primitive::Dot(span) => Ast::dot(span),
282 Primitive::Top(span) => Ast::top(span),
283 Primitive::Perl(cls) => Ast::class_perl(cls),
284 Primitive::Unicode(cls) => Ast::class_unicode(cls),
285 }
286 }
287
288 fn into_class_set_item(self, p: &ResharpParser) -> Result<regex_syntax::ast::ClassSetItem> {
289 use self::Primitive::*;
290 use regex_syntax::ast::ClassSetItem;
291
292 match self {
293 Literal(lit) => Ok(ClassSetItem::Literal(lit)),
294 Perl(cls) => Ok(ClassSetItem::Perl(cls)),
295 Unicode(cls) => Ok(ClassSetItem::Unicode(cls)),
296 x => Err(p.error(*x.span(), ast::ErrorKind::ClassEscapeInvalid)),
297 }
298 }
299
300 fn into_class_literal(self, p: &ResharpParser) -> Result<Literal> {
301 use self::Primitive::*;
302
303 match self {
304 Literal(lit) => Ok(lit),
305 x => Err(p.error(*x.span(), ast::ErrorKind::ClassRangeLiteral)),
306 }
307 }
308}
309
/// A value that is one of two alternatives.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum Either<Left, Right> {
    /// The first alternative.
    Left(Left),
    /// The second alternative.
    Right(Right),
}
315
/// An error produced while parsing a pattern.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct ParseError {
    /// What went wrong.
    pub kind: ErrorKind,
    /// Owned copy of the pattern that was being parsed (not included in the
    /// `Display` output).
    pattern: String,
    /// Where in the pattern the error was detected.
    pub span: Span,
}
326
327impl std::fmt::Display for ParseError {
328 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
329 write!(f, "{:?}: {:?}", self.kind, self.span)
330 }
331}
332impl std::error::Error for ParseError {}
333
/// Result type used by the parser; the error is always a `ParseError`.
type Result<T> = core::result::Result<T, ParseError>;
335
/// An entry on the parser's group stack.
#[derive(Clone, Debug)]
enum GroupState {
    /// An opened group whose closing `)` has not been seen yet.
    Group {
        /// The concatenation that was being built when the group was opened.
        concat: Concat,
        /// The group under construction.
        group: ast::Group,
        /// Whitespace-insensitive setting in effect before the group was
        /// opened; restored when the group is closed.
        ignore_whitespace: bool,
    },
    /// Branches of a `|` alternation collected so far.
    Alternation(ast::Alternation),
    /// Branches of a `&` intersection collected so far.
    Intersection(ast::Intersection),
}
350
/// An entry on the parser's character-class stack.
#[derive(Clone, Debug)]
enum ClassState {
    /// A bracketed class whose closing `]` has not been seen yet.
    Open {
        /// The union of the enclosing class, saved while the nested class is parsed.
        union: regex_syntax::ast::ClassSetUnion,
        /// The bracketed class under construction.
        set: regex_syntax::ast::ClassBracketed,
    },
    /// A pending binary set operation waiting for its right-hand side.
    Op {
        /// Which set operation is pending.
        kind: regex_syntax::ast::ClassSetBinaryOpKind,
        /// The already-parsed left-hand operand.
        lhs: regex_syntax::ast::ClassSet,
    },
}
368
/// Parser state for a single pattern string.
///
/// State that changes during parsing lives in `Cell` / `RefCell` so that the
/// parsing helpers can take `&self`.
pub struct ResharpParser<'s> {
    /// Cache of already-built Perl classes, keyed by `(negated, kind)`.
    perl_classes: Vec<(bool, regex_syntax::ast::ClassPerlKind, NodeId)>,
    /// NOTE(review): presumably a cache of built Unicode classes - confirm in `resharp_algebra`.
    unicode_classes: resharp_algebra::UnicodeClassCache,
    /// `regex_syntax` AST-to-HIR translator built from the parser flags.
    pub translator: regex_syntax::hir::translate::Translator,
    /// The pattern being parsed.
    pub pattern: &'s str,
    /// Current position in `pattern`.
    pos: Cell<Position>,
    /// Index most recently handed out to a capture group.
    capture_index: Cell<u32>,
    // NOTE(review): `octal` and `empty_min_range` are always initialised to
    // `false` in `with_flags`; their use is outside this part of the file.
    octal: bool,
    empty_min_range: bool,
    /// Whether whitespace and `#` comments are currently skipped.
    ignore_whitespace: Cell<bool>,
    /// Initialised from `PatternFlags::dot_matches_new_line`.
    dot_all: Cell<bool>,
    /// Initialised from `PatternFlags::multiline`.
    multiline: Cell<bool>,
    /// True if any of the `unicode`, `full_unicode` or `ascii_perl_classes` flags was set.
    global_unicode: bool,
    /// Copy of `PatternFlags::full_unicode`.
    global_full_unicode: bool,
    /// Copy of `PatternFlags::ascii_perl_classes`.
    global_ascii_perl: bool,
    /// Copy of `PatternFlags::case_insensitive`.
    global_case_insensitive: bool,
    // Resource limits copied verbatim from `PatternFlags`.
    expanded_ast_limit: u64,
    max_list_len: usize,
    max_repeat: u32,
    max_depth: usize,
    /// Comments collected while skipping whitespace in whitespace-insensitive mode.
    comments: RefCell<Vec<ast::Comment>>,
    /// Stack of open groups, alternations and intersections.
    stack_group: RefCell<Vec<GroupState>>,
    /// Stack of open character classes and pending class set operations.
    stack_class: RefCell<Vec<ClassState>>,
    /// Capture names seen so far, kept sorted by name for duplicate detection.
    capture_names: RefCell<Vec<ast::CaptureName>>,
    /// NOTE(review): presumably a reusable string buffer; not used in this part of the file.
    scratch: RefCell<String>,
}
396
397fn specialize_err<T>(result: Result<T>, from: ast::ErrorKind, to: ast::ErrorKind) -> Result<T> {
398 result.map_err(|e| {
399 if e.kind == from {
400 ParseError {
401 kind: to,
402 pattern: e.pattern,
403 span: e.span,
404 }
405 } else {
406 e
407 }
408 })
409}
410
/// Returns `true` if `c` may appear in a capture group name.
///
/// The first character must be `_` or alphabetic; later characters may also be
/// digits, `.`, `[` or `]`.
fn is_capture_char(c: char, first: bool) -> bool {
    match c {
        '_' => true,
        '.' | '[' | ']' => !first,
        _ if first => c.is_alphabetic(),
        _ => c.is_alphanumeric(),
    }
}
418
/// Returns `true` if `c` is one of the characters that `escape` and
/// `escape_into` prefix with a backslash.
pub fn is_meta_character(c: char) -> bool {
    const META: &str = "\\.+*?()|[]{}^$#&-~_";
    META.contains(c)
}
442
443pub fn escape(text: &str) -> String {
445 let mut buf = String::new();
446 escape_into(text, &mut buf);
447 buf
448}
449
450pub fn escape_into(text: &str, buf: &mut String) {
452 buf.reserve(text.len());
453 for c in text.chars() {
454 if is_meta_character(c) {
455 buf.push('\\');
456 }
457 buf.push(c);
458 }
459}
460
461pub fn is_escapeable_character(c: char) -> bool {
462 if is_meta_character(c) {
463 return true;
464 }
465 if !c.is_ascii() {
466 return false;
467 }
468 match c {
469 '0'..='9' | 'A'..='Z' | 'a'..='z' => false,
470 '<' | '>' => false,
471 _ => true,
472 }
473}
474
/// Returns `true` if `c` is an ASCII hexadecimal digit (`0-9`, `a-f`, `A-F`).
fn is_hex(c: char) -> bool {
    c.is_ascii_hexdigit()
}
478
479fn ensure_lookbehind_at_start(ast: &Ast, at_start: bool) -> core::result::Result<(), Span> {
480 match ast {
481 Ast::Concat(c) => {
482 let mut child_at_start = at_start;
483 for child in &c.asts {
484 ensure_lookbehind_at_start(child, child_at_start)?;
485 if ast_may_consume(child) {
486 child_at_start = false;
487 }
488 }
489 Ok(())
490 }
491 Ast::Alternation(a) => {
492 for child in &a.asts {
493 ensure_lookbehind_at_start(child, at_start)?;
494 }
495 Ok(())
496 }
497 Ast::Intersection(i) => {
498 for child in &i.asts {
499 ensure_lookbehind_at_start(child, at_start)?;
500 }
501 Ok(())
502 }
503 Ast::Complement(c) => ensure_lookbehind_at_start(&c.ast, at_start),
504 Ast::Group(g) => ensure_lookbehind_at_start(&g.ast, at_start),
505 Ast::Repetition(r) => ensure_lookbehind_at_start(&r.ast, at_start),
506 Ast::Lookaround(g) => {
507 match g.kind {
508 LookaroundKind::PositiveLookbehind | LookaroundKind::NegativeLookbehind => {
509 if !at_start {
510 return Err(g.span);
511 }
512 }
513 LookaroundKind::PositiveLookahead | LookaroundKind::NegativeLookahead => {}
514 }
515 ensure_lookbehind_at_start(&g.ast, true)
516 }
517 Ast::Empty(_)
518 | Ast::Flags(_)
519 | Ast::Literal(_)
520 | Ast::Dot(_)
521 | Ast::Top(_)
522 | Ast::Assertion(_)
523 | Ast::ClassUnicode(_)
524 | Ast::ClassPerl(_)
525 | Ast::ClassBracketed(_) => Ok(()),
526 }
527}
528
529fn ast_may_consume(ast: &Ast) -> bool {
530 match ast {
531 Ast::Empty(_) | Ast::Flags(_) | Ast::Assertion(_) | Ast::Lookaround(_) => false,
532 Ast::Literal(_)
533 | Ast::Dot(_)
534 | Ast::Top(_)
535 | Ast::ClassUnicode(_)
536 | Ast::ClassPerl(_)
537 | Ast::ClassBracketed(_) => true,
538 Ast::Group(g) => ast_may_consume(&g.ast),
539 Ast::Repetition(r) => {
540 if !ast_may_consume(&r.ast) {
541 return false;
542 }
543 match r.op.kind {
544 RepetitionKind::ZeroOrOne
545 | RepetitionKind::ZeroOrMore
546 | RepetitionKind::OneOrMore => true,
547 RepetitionKind::Range(ast::RepetitionRange::Exactly(0)) => false,
548 RepetitionKind::Range(ast::RepetitionRange::Bounded(_, 0)) => false,
549 RepetitionKind::Range(_) => true,
550 }
551 }
552 Ast::Alternation(a) => a.asts.iter().any(ast_may_consume),
553 Ast::Intersection(i) => i.asts.iter().any(ast_may_consume),
554 Ast::Complement(_) => true,
555 Ast::Concat(c) => c.asts.iter().any(ast_may_consume),
556 }
557}
558
559impl<'s> ResharpParser<'s> {
560 fn default_translator_builder(&self) -> TranslatorBuilder {
561 let mut trb = TranslatorBuilder::new();
562 trb.unicode(self.global_unicode);
563 trb.utf8(false);
564 trb.case_insensitive(self.global_case_insensitive);
565 trb
566 }
567
568 pub fn new(pattern: &'s str) -> Self {
569 Self::with_flags(pattern, &PatternFlags::default())
570 }
571
572 pub fn with_flags(pattern: &'s str, flags: &PatternFlags) -> Self {
573 let mut trb = TranslatorBuilder::new();
574 trb.unicode(flags.unicode);
575 trb.utf8(false);
576 trb.case_insensitive(flags.case_insensitive);
577 Self {
578 translator: trb.build(),
579 pattern,
580 perl_classes: vec![],
581 unicode_classes: resharp_algebra::UnicodeClassCache::default(),
582 pos: Cell::new(Position::new(0, 0, 0)),
583 capture_index: Cell::new(0),
584 octal: false,
585 empty_min_range: false,
586 ignore_whitespace: Cell::new(flags.ignore_whitespace),
587 dot_all: Cell::new(flags.dot_matches_new_line),
588 multiline: Cell::new(flags.multiline),
589 global_unicode: flags.unicode || flags.full_unicode || flags.ascii_perl_classes,
590 global_full_unicode: flags.full_unicode,
591 global_ascii_perl: flags.ascii_perl_classes,
592 global_case_insensitive: flags.case_insensitive,
593 expanded_ast_limit: flags.expanded_ast_limit,
594 max_list_len: flags.max_list_len,
595 max_repeat: flags.max_repeat,
596 max_depth: flags.max_depth,
597 comments: RefCell::new(vec![]),
598 stack_group: RefCell::new(vec![]),
599 stack_class: RefCell::new(vec![]),
600 capture_names: RefCell::new(vec![]),
601 scratch: RefCell::new(String::new()),
602 }
603 }
604
605 fn parser(&'_ self) -> &'_ ResharpParser<'_> {
606 self
607 }
608
609 fn pattern(&self) -> &str {
610 self.pattern
611 }
612
613 fn error(&self, span: Span, kind: ast::ErrorKind) -> ParseError {
614 ParseError {
615 kind,
616 pattern: self.pattern().to_string(),
617 span,
618 }
619 }
620
621 fn unsupported_error(&self, _: regex_syntax::hir::Error) -> ParseError {
622 self.error(
623 Span::splat(self.pos()),
624 ast::ErrorKind::UnsupportedResharpRegex,
625 )
626 }
627
628 fn offset(&self) -> usize {
629 self.parser().pos.get().offset
630 }
631
632 fn line(&self) -> usize {
633 self.parser().pos.get().line
634 }
635
636 fn column(&self) -> usize {
637 self.parser().pos.get().column
638 }
639
640 fn next_capture_index(&self, span: Span) -> Result<u32> {
641 let current = self.parser().capture_index.get();
642 let i = current
643 .checked_add(1)
644 .ok_or_else(|| self.error(span, ast::ErrorKind::CaptureLimitExceeded))?;
645 self.parser().capture_index.set(i);
646 Ok(i)
647 }
648
649 fn add_capture_name(&self, cap: &ast::CaptureName) -> Result<()> {
650 let mut names = self.parser().capture_names.borrow_mut();
651 match names.binary_search_by_key(&cap.name.as_str(), |c| c.name.as_str()) {
652 Err(i) => {
653 names.insert(i, cap.clone());
654 Ok(())
655 }
656 Ok(i) => Err(self.error(
657 cap.span,
658 ast::ErrorKind::GroupNameDuplicate {
659 original: names[i].span,
660 },
661 )),
662 }
663 }
664
665 fn ignore_whitespace(&self) -> bool {
666 self.parser().ignore_whitespace.get()
667 }
668
669 fn char(&self) -> char {
670 self.char_at(self.offset())
671 }
672
673 fn char_at(&self, i: usize) -> char {
674 self.pattern()[i..]
675 .chars()
676 .next()
677 .unwrap_or_else(|| panic!("expected char at offset {}", i))
678 }
679
680 fn bump(&self) -> bool {
681 if self.is_eof() {
682 return false;
683 }
684 let Position {
685 mut offset,
686 mut line,
687 mut column,
688 } = self.pos();
689 if self.char() == '\n' {
690 line = line.checked_add(1).unwrap();
691 column = 1;
692 } else {
693 column = column.checked_add(1).unwrap();
694 }
695 offset += self.char().len_utf8();
696 self.parser().pos.set(Position {
697 offset,
698 line,
699 column,
700 });
701 self.pattern()[self.offset()..].chars().next().is_some()
702 }
703
704 fn bump_if(&self, prefix: &str) -> bool {
705 if self.pattern()[self.offset()..].starts_with(prefix) {
706 for _ in 0..prefix.chars().count() {
707 self.bump();
708 }
709 true
710 } else {
711 false
712 }
713 }
714
715 fn is_lookaround_prefix(&self) -> Option<(bool, bool)> {
716 if self.bump_if("?=") {
717 return Some((true, true));
718 }
719 if self.bump_if("?!") {
720 return Some((true, false));
721 }
722 if self.bump_if("?<=") {
723 return Some((false, true));
724 }
725 if self.bump_if("?<!") {
726 return Some((false, false));
727 }
728 None
729 }
730
731 fn bump_and_bump_space(&self) -> bool {
732 if !self.bump() {
733 return false;
734 }
735 self.bump_space();
736 !self.is_eof()
737 }
738
739 fn bump_space(&self) {
740 if !self.ignore_whitespace() {
741 return;
742 }
743 while !self.is_eof() {
744 if self.char().is_whitespace() {
745 self.bump();
746 } else if self.char() == '#' {
747 let start = self.pos();
748 let mut comment_text = String::new();
749 self.bump();
750 while !self.is_eof() {
751 let c = self.char();
752 self.bump();
753 if c == '\n' {
754 break;
755 }
756 comment_text.push(c);
757 }
758 let comment = ast::Comment {
759 span: Span::new(start, self.pos()),
760 comment: comment_text,
761 };
762 self.parser().comments.borrow_mut().push(comment);
763 } else {
764 break;
765 }
766 }
767 }
768
769 fn peek(&self) -> Option<char> {
770 if self.is_eof() {
771 return None;
772 }
773 self.pattern()[self.offset() + self.char().len_utf8()..]
774 .chars()
775 .next()
776 }
777
778 fn peek_space(&self) -> Option<char> {
781 if !self.ignore_whitespace() {
782 return self.peek();
783 }
784 if self.is_eof() {
785 return None;
786 }
787 let mut start = self.offset() + self.char().len_utf8();
788 let mut in_comment = false;
789 for (i, c) in self.pattern()[start..].char_indices() {
790 if c.is_whitespace() {
791 continue;
792 } else if !in_comment && c == '#' {
793 in_comment = true;
794 } else if in_comment && c == '\n' {
795 in_comment = false;
796 } else {
797 start += i;
798 break;
799 }
800 }
801 self.pattern()[start..].chars().next()
802 }
803
804 fn is_eof(&self) -> bool {
805 self.offset() == self.pattern().len()
806 }
807
808 fn pos(&self) -> Position {
809 self.parser().pos.get()
810 }
811
812 fn span(&self) -> Span {
813 Span::splat(self.pos())
814 }
815
816 fn span_char(&self) -> Span {
817 let mut next = Position {
818 offset: self.offset().checked_add(self.char().len_utf8()).unwrap(),
819 line: self.line(),
820 column: self.column().checked_add(1).unwrap(),
821 };
822 if self.char() == '\n' {
823 next.line += 1;
824 next.column = 1;
825 }
826 Span::new(self.pos(), next)
827 }
828
829 #[inline(never)]
830 fn push_alternate(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
831 assert_eq!(self.char(), '|');
832 concat.span.end = self.pos();
833 self.push_or_add_alternation(concat);
834 self.bump();
835 Ok(ast::Concat {
836 span: self.span(),
837 asts: vec![],
838 })
839 }
840
841 fn push_or_add_alternation(&self, concat: Concat) {
842 use self::GroupState::*;
843
844 let mut stack = self.parser().stack_group.borrow_mut();
845 if let Some(&mut Alternation(ref mut alts)) = stack.last_mut() {
846 alts.asts.push(concat.into_ast());
847 return;
848 }
849 stack.push(Alternation(ast::Alternation {
850 span: Span::new(concat.span.start, self.pos()),
851 asts: vec![concat.into_ast()],
852 }));
853 }
854
855 #[inline(never)]
856 fn push_intersect(&self, mut concat: Concat) -> Result<Concat> {
857 assert_eq!(self.char(), '&');
858 concat.span.end = self.pos();
859 self.push_or_add_intersect(concat);
860 self.bump();
861 Ok(Concat {
862 span: self.span(),
863 asts: vec![],
864 })
865 }
866
867 fn push_or_add_intersect(&self, concat: Concat) {
868 use self::GroupState::*;
869
870 let mut stack = self.parser().stack_group.borrow_mut();
871 if let Some(&mut Intersection(ref mut alts)) = stack.last_mut() {
872 alts.asts.push(concat.into_ast());
873 return;
874 }
875 stack.push(Intersection(ast::Intersection {
876 span: Span::new(concat.span.start, self.pos()),
877 asts: vec![concat.into_ast()],
878 }));
879 }
880
881 #[inline(never)]
882 fn push_group(&self, mut concat: Concat) -> Result<Concat> {
883 assert_eq!(self.char(), '(');
884 match self.parse_group()? {
885 Either::Left(set) => {
886 let ignore = set.flags.flag_state(ast::Flag::IgnoreWhitespace);
887 if let Some(v) = ignore {
888 self.parser().ignore_whitespace.set(v);
889 }
890
891 concat.asts.push(Ast::flags(set));
892 Ok(concat)
893 }
894 Either::Right(group) => {
895 let old_ignore_whitespace = self.ignore_whitespace();
896 let new_ignore_whitespace = group
897 .flags()
898 .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
899 .unwrap_or(old_ignore_whitespace);
900 self.parser()
901 .stack_group
902 .borrow_mut()
903 .push(GroupState::Group {
904 concat,
905 group,
906 ignore_whitespace: old_ignore_whitespace,
907 });
908 self.parser().ignore_whitespace.set(new_ignore_whitespace);
909 Ok(Concat {
910 span: self.span(),
911 asts: vec![],
912 })
913 }
914 }
915 }
916
917 #[inline(never)]
918 fn push_compl_group(&self, concat: Concat) -> Result<Concat> {
919 assert_eq!(self.char(), '~');
920 self.bump();
921 if self.is_eof() || self.char() != '(' {
922 return Err(self.error(self.span(), ast::ErrorKind::ComplementGroupExpected));
923 }
924 let open_span = self.span_char();
925 self.bump();
926 let group = ast::Group {
927 span: open_span,
928 kind: ast::GroupKind::Complement,
929 ast: Box::new(Ast::empty(self.span())),
930 };
931
932 let old_ignore_whitespace = self.ignore_whitespace();
933 let new_ignore_whitespace = group
934 .flags()
935 .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
936 .unwrap_or(old_ignore_whitespace);
937 self.parser()
938 .stack_group
939 .borrow_mut()
940 .push(GroupState::Group {
941 concat,
942 group,
943 ignore_whitespace: old_ignore_whitespace,
944 });
945 self.parser().ignore_whitespace.set(new_ignore_whitespace);
946 Ok(Concat {
947 span: self.span(),
948 asts: vec![],
949 })
950 }
951
952 #[inline(never)]
953 fn pop_group(&self, mut group_concat: Concat) -> Result<Concat> {
954 use self::GroupState::*;
955 assert_eq!(self.char(), ')');
956 let mut stack = self.parser().stack_group.borrow_mut();
957 let topstack = stack.pop();
958
959 let (mut prior_concat, mut group, ignore_whitespace, alt) = match topstack {
960 Some(Group {
961 concat,
962 group,
963 ignore_whitespace,
964 }) => (concat, group, ignore_whitespace, None),
965 Some(Alternation(alt)) => match stack.pop() {
966 Some(Group {
967 concat,
968 group,
969 ignore_whitespace,
970 }) => (
971 concat,
972 group,
973 ignore_whitespace,
974 Some(Either::Left::<ast::Alternation, ast::Intersection>(alt)),
975 ),
976 None | Some(Alternation(_)) | Some(Intersection(_)) => {
977 return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
978 }
979 },
980 Some(Intersection(int)) => match stack.pop() {
981 Some(Group {
982 concat,
983 group,
984 ignore_whitespace,
985 }) => (
986 concat,
987 group,
988 ignore_whitespace,
989 Some(Either::Right::<ast::Alternation, ast::Intersection>(int)),
990 ),
991 None | Some(Alternation(_)) | Some(Intersection(_)) => {
992 return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
993 }
994 },
995
996 None => {
997 return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
998 }
999 };
1000 self.parser().ignore_whitespace.set(ignore_whitespace);
1001 group_concat.span.end = self.pos();
1002 self.bump();
1003 group.span.end = self.pos();
1004 match alt {
1005 Some(Either::Left(mut alt)) => {
1006 alt.span.end = group_concat.span.end;
1007 alt.asts.push(group_concat.into_ast());
1008 group.ast = Box::new(alt.into_ast());
1009 }
1010 Some(Either::Right(mut int)) => {
1011 int.span.end = group_concat.span.end;
1012 int.asts.push(group_concat.into_ast());
1013 group.ast = Box::new(int.into_ast());
1014 }
1015 None => {
1016 group.ast = Box::new(group_concat.into_ast());
1017 }
1018 }
1019
1020 if group.kind == GroupKind::Complement {
1021 let complement = ast::Complement {
1022 span: self.span(),
1023 ast: group.ast,
1024 };
1025 prior_concat.asts.push(Ast::complement(complement));
1026 }
1027 else {
1029 prior_concat.asts.push(Ast::group(group));
1030 }
1031 Ok(prior_concat)
1032 }
1033
1034 #[inline(never)]
1035 fn pop_group_end(&self, mut concat: ast::Concat) -> Result<Ast> {
1036 concat.span.end = self.pos();
1037 let mut stack = self.parser().stack_group.borrow_mut();
1038 let ast = match stack.pop() {
1039 None => Ok(concat.into_ast()),
1040 Some(GroupState::Alternation(mut alt)) => {
1041 alt.span.end = self.pos();
1042 alt.asts.push(concat.into_ast());
1043 Ok(Ast::alternation(alt))
1044 }
1045 Some(GroupState::Intersection(mut int)) => {
1046 int.span.end = self.pos();
1047 int.asts.push(concat.into_ast());
1048
1049 Ok(Ast::intersection(int))
1050 }
1051 Some(GroupState::Group { group, .. }) => {
1052 return Err(self.error(group.span, ast::ErrorKind::GroupUnclosed));
1053 }
1054 };
1055 match stack.pop() {
1057 None => ast,
1058 Some(GroupState::Alternation(alt)) => {
1059 Err(self.error(alt.span, ast::ErrorKind::UnsupportedResharpRegex))
1060 }
1061 Some(GroupState::Intersection(int)) => {
1062 Err(self.error(int.span, ast::ErrorKind::UnsupportedResharpRegex))
1063 }
1064 Some(GroupState::Group { group, .. }) => {
1065 Err(self.error(group.span, ast::ErrorKind::GroupUnclosed))
1066 }
1067 }
1068 }
1069
1070 #[inline(never)]
1071 fn push_class_open(
1072 &self,
1073 parent_union: regex_syntax::ast::ClassSetUnion,
1074 ) -> Result<regex_syntax::ast::ClassSetUnion> {
1075 assert_eq!(self.char(), '[');
1076
1077 let (nested_set, nested_union) = self.parse_set_class_open()?;
1078 self.parser()
1079 .stack_class
1080 .borrow_mut()
1081 .push(ClassState::Open {
1082 union: parent_union,
1083 set: nested_set,
1084 });
1085 Ok(nested_union)
1086 }
1087
1088 #[inline(never)]
1089 fn pop_class(
1090 &self,
1091 nested_union: regex_syntax::ast::ClassSetUnion,
1092 ) -> Result<Either<regex_syntax::ast::ClassSetUnion, regex_syntax::ast::ClassBracketed>> {
1093 assert_eq!(self.char(), ']');
1094
1095 let item = regex_syntax::ast::ClassSet::Item(nested_union.into_item());
1096 let prevset = self.pop_class_op(item);
1097 let mut stack = self.parser().stack_class.borrow_mut();
1098 match stack.pop() {
1099 None => panic!("unexpected empty character class stack"),
1100 Some(ClassState::Op { .. }) => panic!("unexpected ClassState::Op"),
1101 Some(ClassState::Open { mut union, mut set }) => {
1102 self.bump();
1103 set.span.end = self.pos();
1104 set.kind = prevset;
1105 if stack.is_empty() {
1106 Ok(Either::Right(set))
1107 } else {
1108 union.push(regex_syntax::ast::ClassSetItem::Bracketed(Box::new(set)));
1109 Ok(Either::Left(union))
1110 }
1111 }
1112 }
1113 }
1114
1115 #[inline(never)]
1116 fn unclosed_class_error(&self) -> ParseError {
1117 for state in self.parser().stack_class.borrow().iter().rev() {
1118 if let ClassState::Open { ref set, .. } = *state {
1119 return self.error(set.span, ast::ErrorKind::ClassUnclosed);
1120 }
1121 }
1122 panic!("no open character class found")
1123 }
1124
1125 #[inline(never)]
1126 fn push_class_op(
1127 &self,
1128 next_kind: regex_syntax::ast::ClassSetBinaryOpKind,
1129 next_union: regex_syntax::ast::ClassSetUnion,
1130 ) -> regex_syntax::ast::ClassSetUnion {
1131 let item = regex_syntax::ast::ClassSet::Item(next_union.into_item());
1132 let new_lhs = self.pop_class_op(item);
1133 self.parser().stack_class.borrow_mut().push(ClassState::Op {
1134 kind: next_kind,
1135 lhs: new_lhs,
1136 });
1137 regex_syntax::ast::ClassSetUnion {
1138 span: self.span(),
1139 items: vec![],
1140 }
1141 }
1142
1143 #[inline(never)]
1144 fn pop_class_op(&self, rhs: regex_syntax::ast::ClassSet) -> regex_syntax::ast::ClassSet {
1145 let mut stack = self.parser().stack_class.borrow_mut();
1146 let (kind, lhs) = match stack.pop() {
1147 Some(ClassState::Op { kind, lhs }) => (kind, lhs),
1148 Some(state @ ClassState::Open { .. }) => {
1149 stack.push(state);
1150 return rhs;
1151 }
1152 None => unreachable!(),
1153 };
1154 let span = Span::new(lhs.span().start, rhs.span().end);
1155 regex_syntax::ast::ClassSet::BinaryOp(regex_syntax::ast::ClassSetBinaryOp {
1156 span,
1157 kind,
1158 lhs: Box::new(lhs),
1159 rhs: Box::new(rhs),
1160 })
1161 }
1162
1163 fn hir_to_node_id(&self, hir: &hir::Hir, tb: &mut TB<'s>) -> Result<NodeId> {
1164 match hir.kind() {
1165 hir::HirKind::Empty => Ok(NodeId::EPS),
1166 hir::HirKind::Literal(l) => {
1167 if l.0.len() == 1 {
1168 let node = tb.mk_u8(l.0[0]);
1169 Ok(node)
1170 } else {
1171 let ws: Vec<_> = l.0.iter().map(|l| tb.mk_u8(*l)).collect();
1172 let conc = tb.mk_concats(ws.iter().copied());
1173 Ok(conc)
1174 }
1175 }
1176 hir::HirKind::Class(class) => match class {
1177 hir::Class::Unicode(class_unicode) => {
1178 let ranges = class_unicode.ranges();
1179 if ranges.len() == 1
1180 && ranges[0].start() == '\u{0}'
1181 && ranges[0].end() == '\u{10FFFF}'
1182 {
1183 return Ok(tb.mk_range_u8(0, 255));
1184 }
1185 let mut nodes = Vec::new();
1186 for range in ranges {
1187 for seq in Utf8Sequences::new(range.start(), range.end()) {
1188 let sl = seq.as_slice();
1189 let bytes: Vec<_> = sl.iter().map(|s| (s.start, s.end)).collect();
1190 let node = match bytes.len() {
1191 1 => tb.mk_range_u8(bytes[0].0, bytes[0].1),
1192 n => {
1193 let last = tb.mk_range_u8(bytes[n - 1].0, bytes[n - 1].1);
1194 let mut conc = last;
1195 for i in (0..n - 1).rev() {
1196 let b = tb.mk_range_u8(bytes[i].0, bytes[i].1);
1197 conc = tb.mk_concat(b, conc);
1198 }
1199 conc
1200 }
1201 };
1202 nodes.push(node);
1203 }
1204 }
1205 let merged = tb.mk_unions(nodes.into_iter());
1206 Ok(merged)
1207 }
1208 hir::Class::Bytes(class_bytes) => {
1209 let ranges = class_bytes.ranges();
1210 let mut result = NodeId::BOT;
1211 for range in ranges {
1212 let start = range.start();
1213 let end = range.end();
1214 let node = tb.mk_range_u8(start, end);
1215 result = tb.mk_union(result, node);
1216 }
1217 Ok(result)
1218 }
1219 },
1220 hir::HirKind::Look(_) => Err(self.error(
1221 Span::splat(self.pos()),
1222 ast::ErrorKind::UnsupportedResharpRegex,
1223 )),
1224 hir::HirKind::Repetition(_) => Err(self.error(
1225 Span::splat(self.pos()),
1226 ast::ErrorKind::UnsupportedResharpRegex,
1227 )),
1228 hir::HirKind::Capture(_) => Err(self.error(
1229 Span::splat(self.pos()),
1230 ast::ErrorKind::UnsupportedResharpRegex,
1231 )),
1232 hir::HirKind::Concat(body) => {
1233 let mut result = NodeId::EPS;
1234 for child in body {
1235 let node = self.hir_to_node_id(child, tb)?;
1236 result = tb.mk_concat(result, node);
1237 }
1238 Ok(result)
1239 }
1240 hir::HirKind::Alternation(_) => Err(self.error(
1241 Span::splat(self.pos()),
1242 ast::ErrorKind::UnsupportedResharpRegex,
1243 )),
1244 }
1245 }
1246
1247 fn translate_ast_to_hir(
1248 &mut self,
1249 orig_ast: ®ex_syntax::ast::Ast,
1250 tb: &mut TB<'s>,
1251 ) -> Result<NodeId> {
1252 match self.translator.translate("", orig_ast) {
1253 Err(_) => Err(self.error(self.span(), ast::ErrorKind::UnicodeClassInvalid)),
1254 Ok(hir) => self.hir_to_node_id(&hir, tb),
1255 }
1256 }
1257
1258 fn translator_to_node_id(
1259 &mut self,
1260 orig_ast: ®ex_syntax::ast::Ast,
1261 translator: &mut Option<Translator>,
1262 tb: &mut TB<'s>,
1263 ) -> Result<NodeId> {
1264 match translator {
1265 Some(tr) => {
1266 let hir = tr
1267 .translate("", orig_ast)
1268 .map_err(|e| self.unsupported_error(e))?;
1269 self.hir_to_node_id(&hir, tb)
1270 }
1271 None => self.translate_ast_to_hir(orig_ast, tb),
1272 }
1273 }
1274
1275 fn get_class(
1276 &mut self,
1277 negated: bool,
1278 kind: regex_syntax::ast::ClassPerlKind,
1279 tb: &mut TB<'s>,
1280 ) -> Result<NodeId> {
1281 let w = self
1282 .perl_classes
1283 .iter()
1284 .find(|(c_neg, c_kind, _)| *c_kind == kind && *c_neg == negated);
1285 match w {
1286 Some((_, _, value)) => Ok(*value),
1287 None => {
1288 let translated = if self.global_ascii_perl {
1289 let pos = match kind {
1290 regex_syntax::ast::ClassPerlKind::Word => {
1291 let az = tb.mk_range_u8(b'a', b'z');
1292 let big = tb.mk_range_u8(b'A', b'Z');
1293 let dig = tb.mk_range_u8(b'0', b'9');
1294 let us = tb.mk_u8(b'_');
1295 tb.mk_unions([az, big, dig, us].into_iter())
1296 }
1297 regex_syntax::ast::ClassPerlKind::Digit => tb.mk_range_u8(b'0', b'9'),
1298 regex_syntax::ast::ClassPerlKind::Space => {
1299 let sp = tb.mk_u8(b' ');
1300 let tab = tb.mk_u8(b'\t');
1301 let nl = tb.mk_u8(b'\n');
1302 let cr = tb.mk_u8(b'\r');
1303 let ff = tb.mk_u8(0x0C);
1304 let vt = tb.mk_u8(0x0B);
1305 tb.mk_unions([sp, tab, nl, cr, ff, vt].into_iter())
1306 }
1307 };
1308 if negated {
1309 resharp_algebra::neg_class(tb, pos)
1310 } else {
1311 pos
1312 }
1313 } else if self.global_unicode {
1314 match kind {
1315 regex_syntax::ast::ClassPerlKind::Word => {
1316 if self.global_full_unicode {
1317 self.unicode_classes.ensure_word_full(tb);
1318 } else {
1319 self.unicode_classes.ensure_word(tb);
1320 }
1321 if negated {
1322 self.unicode_classes.non_word
1323 } else {
1324 self.unicode_classes.word
1325 }
1326 }
1327 regex_syntax::ast::ClassPerlKind::Digit => {
1328 if self.global_full_unicode {
1329 self.unicode_classes.ensure_digit_full(tb);
1330 } else {
1331 self.unicode_classes.ensure_digit(tb);
1332 }
1333 if negated {
1334 self.unicode_classes.non_digit
1335 } else {
1336 self.unicode_classes.digit
1337 }
1338 }
1339 regex_syntax::ast::ClassPerlKind::Space => {
1340 if self.global_full_unicode {
1341 self.unicode_classes.ensure_space_full(tb);
1342 } else {
1343 self.unicode_classes.ensure_space(tb);
1344 }
1345 if negated {
1346 self.unicode_classes.non_space
1347 } else {
1348 self.unicode_classes.space
1349 }
1350 }
1351 }
1352 } else {
1353 let pos = match kind {
1354 regex_syntax::ast::ClassPerlKind::Word => {
1355 let az = tb.mk_range_u8(b'a', b'z');
1356 let big = tb.mk_range_u8(b'A', b'Z');
1357 let dig = tb.mk_range_u8(b'0', b'9');
1358 let us = tb.mk_u8(b'_');
1359 tb.mk_unions([az, big, dig, us].into_iter())
1360 }
1361 regex_syntax::ast::ClassPerlKind::Digit => tb.mk_range_u8(b'0', b'9'),
1362 regex_syntax::ast::ClassPerlKind::Space => {
1363 let sp = tb.mk_u8(b' ');
1364 let tab = tb.mk_u8(b'\t');
1365 let nl = tb.mk_u8(b'\n');
1366 let cr = tb.mk_u8(b'\r');
1367 let ff = tb.mk_u8(0x0C);
1368 let vt = tb.mk_u8(0x0B);
1369 tb.mk_unions([sp, tab, nl, cr, ff, vt].into_iter())
1370 }
1371 };
1372 if negated {
1373 resharp_algebra::neg_class(tb, pos)
1379 } else {
1380 pos
1381 }
1382 };
1383 self.perl_classes.push((negated, kind, translated));
1384 Ok(translated)
1385 }
1386 }
1387 }
1388
1389 fn word_char_kind(ast: &Ast, left: bool) -> WordCharKind {
1390 use WordCharKind::*;
1391 match ast {
1392 Ast::Literal(lit) => {
1393 if is_word_byte(lit.c as u8) {
1394 Word
1395 } else {
1396 NonWord
1397 }
1398 }
1399 Ast::ClassPerl(c) => match (&c.kind, c.negated) {
1400 (®ex_syntax::ast::ClassPerlKind::Word, false) => Word,
1401 (®ex_syntax::ast::ClassPerlKind::Word, true) => NonWord,
1402 (®ex_syntax::ast::ClassPerlKind::Space, false) => NonWord,
1403 (®ex_syntax::ast::ClassPerlKind::Space, true) => Unknown,
1404 (®ex_syntax::ast::ClassPerlKind::Digit, false) => Word,
1405 (®ex_syntax::ast::ClassPerlKind::Digit, true) => Unknown,
1406 },
1407 Ast::ClassBracketed(c) => class_bracketed_word_kind(c),
1408 Ast::Dot(_) | Ast::Top(_) => Unknown,
1409 Ast::Group(g) => Self::word_char_kind(&g.ast, left),
1410 Ast::Concat(c) if !c.asts.is_empty() => {
1411 let dir: isize = if left { -1 } else { 1 };
1412 let edge = match Self::concat_edge_index(&c.asts, left) {
1413 Some(e) => e,
1414 None => return Unknown,
1415 };
1416 let kind = Self::word_char_kind(&c.asts[edge], left);
1417 match kind {
1418 MaybeWord => {
1419 match Self::concat_neighbor_kind(&c.asts, edge, dir) {
1420 Word => Word,
1421 _ => MaybeWord,
1422 }
1423 }
1424 MaybeNonWord => {
1425 match Self::concat_neighbor_kind(&c.asts, edge, dir) {
1426 NonWord => NonWord,
1427 _ => MaybeNonWord,
1428 }
1429 }
1430 other => other,
1431 }
1432 }
1433 Ast::Alternation(alt) if !alt.asts.is_empty() => {
1434 let first = Self::word_char_kind(&alt.asts[0], left);
1435 if alt.asts[1..]
1436 .iter()
1437 .all(|a| Self::word_char_kind(a, left) == first)
1438 {
1439 first
1440 } else {
1441 Unknown
1442 }
1443 }
1444 Ast::Repetition(r) => {
1445 let inner = Self::word_char_kind(&r.ast, left);
1446 let nullable = matches!(
1447 &r.op.kind,
1448 ast::RepetitionKind::ZeroOrMore
1449 | ast::RepetitionKind::ZeroOrOne
1450 | ast::RepetitionKind::Range(
1451 ast::RepetitionRange::Bounded(0, _)
1452 | ast::RepetitionRange::Exactly(0)
1453 )
1454 );
1455 if nullable {
1456 match inner {
1457 Word => MaybeWord,
1458 NonWord => MaybeNonWord,
1459 _ => Unknown,
1460 }
1461 } else {
1462 inner
1463 }
1464 }
1465 Ast::Lookaround(la) => match la.kind {
1466 ast::LookaroundKind::PositiveLookahead
1467 | ast::LookaroundKind::PositiveLookbehind => Self::word_char_kind(&la.ast, left),
1468 _ => Unknown,
1469 },
1470 Ast::Assertion(a) => match (&a.kind, left) {
1471 (ast::AssertionKind::EndText, false) => NonWord,
1472 (ast::AssertionKind::StartText, true) => NonWord,
1473 _ => Unknown,
1474 },
1475 _ => Unknown,
1476 }
1477 }
1478
1479 fn edge_class_ast(ast: &Ast, left: bool) -> Option<&Ast> {
1481 match ast {
1482 Ast::Literal(_)
1483 | Ast::ClassPerl(_)
1484 | Ast::ClassBracketed(_)
1485 | Ast::ClassUnicode(_)
1486 | Ast::Dot(_)
1487 | Ast::Top(_) => Some(ast),
1488 Ast::Group(g) => Self::edge_class_ast(&g.ast, left),
1489 Ast::Concat(c) if !c.asts.is_empty() => {
1490 Self::concat_edge_index(&c.asts, left)
1491 .and_then(|e| Self::edge_class_ast(&c.asts[e], left))
1492 }
1493 Ast::Repetition(r) => {
1494 let nullable = matches!(
1495 &r.op.kind,
1496 ast::RepetitionKind::ZeroOrMore
1497 | ast::RepetitionKind::ZeroOrOne
1498 | ast::RepetitionKind::Range(
1499 ast::RepetitionRange::Bounded(0, _)
1500 | ast::RepetitionRange::Exactly(0)
1501 )
1502 );
1503 if nullable {
1504 None
1505 } else {
1506 Self::edge_class_ast(&r.ast, left)
1507 }
1508 }
1509 _ => None,
1510 }
1511 }
1512
1513 fn resolve_word_kind(
1514 &mut self,
1515 asts: &[Ast],
1516 idx: usize,
1517 dir: isize,
1518 translator: &mut Option<Translator>,
1519 tb: &mut TB<'s>,
1520 word_id: NodeId,
1521 not_word_id: NodeId,
1522 ) -> Result<WordCharKind> {
1523 use WordCharKind::*;
1524 let fast = Self::concat_neighbor_kind(asts, idx, dir);
1525 if fast != Unknown {
1526 return Ok(fast);
1527 }
1528 let neighbor_idx = (idx as isize + dir) as usize;
1529 let node = if let Some(edge) = Self::edge_class_ast(&asts[neighbor_idx], dir < 0) {
1530 self.ast_to_node_id(edge, translator, tb)?
1531 } else if dir > 0 {
1532 let mut bodies: Vec<NodeId> = vec![];
1533 let mut j = neighbor_idx;
1534 while j < asts.len() {
1535 match &asts[j] {
1536 Ast::Lookaround(la) => {
1537 let kind = la.kind.clone();
1538 let lookbehind = matches!(
1539 kind,
1540 ast::LookaroundKind::PositiveLookbehind
1541 | ast::LookaroundKind::NegativeLookbehind
1542 );
1543 if lookbehind {
1544 j += 1;
1545 continue;
1546 }
1547 let body = self.ast_to_node_id(&la.ast, translator, tb)?;
1548 let body = tb.try_elim_lookarounds(body).ok_or_else(|| {
1549 self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex)
1550 })?;
1551 let body_ts = tb.mk_concat(body, NodeId::TS);
1552 let constraint = match kind {
1553 ast::LookaroundKind::PositiveLookahead => body_ts,
1554 ast::LookaroundKind::NegativeLookahead => tb.mk_compl(body_ts),
1555 _ => unreachable!(),
1556 };
1557 bodies.push(constraint);
1558 j += 1;
1559 }
1560 other => {
1561 let n = self.ast_to_node_id(other, translator, tb)?;
1562 let n = tb.try_elim_lookarounds(n).ok_or_else(|| {
1563 self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex)
1564 })?;
1565 bodies.push(tb.mk_concat(n, NodeId::TS));
1566 break;
1567 }
1568 }
1569 }
1570 if bodies.is_empty() {
1571 return Ok(Unknown);
1572 }
1573 let combined = tb.mk_inters(bodies.into_iter());
1574 let word_prefix = tb.mk_concat(word_id, NodeId::TS);
1575 let non_word_prefix = tb.mk_concat(not_word_id, NodeId::TS);
1576 return if tb.subsumes(word_prefix, combined) == Some(true) {
1577 Ok(Word)
1578 } else if tb.subsumes(non_word_prefix, combined) == Some(true) {
1579 Ok(NonWord)
1580 } else {
1581 Ok(Unknown)
1582 };
1583 } else {
1584 let neighbor_node = self.ast_to_node_id(&asts[neighbor_idx], translator, tb)?;
1585 let neighbor_node = Self::strip_trailing_lookahead(tb, neighbor_node);
1586 let mut neighbor_node = tb
1587 .try_elim_lookarounds(neighbor_node)
1588 .ok_or_else(|| self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))?;
1589 neighbor_node = tb.reverse(neighbor_node).or_else(|_| {
1590 Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))
1591 })?;
1592 let word_prefix = tb.mk_concat(word_id, NodeId::TS);
1593 let non_word_prefix = tb.mk_concat(not_word_id, NodeId::TS);
1594 return if tb.subsumes(word_prefix, neighbor_node) == Some(true) {
1595 Ok(Word)
1596 } else if tb.subsumes(non_word_prefix, neighbor_node) == Some(true) {
1597 Ok(NonWord)
1598 } else {
1599 Ok(Unknown)
1600 };
1601 };
1602 if tb.subsumes(word_id, node) == Some(true) {
1603 Ok(Word)
1604 } else if tb.subsumes(not_word_id, node) == Some(true) {
1605 Ok(NonWord)
1606 } else {
1607 Ok(Unknown)
1608 }
1609 }
1610
1611 fn strip_trailing_lookahead(tb: &mut TB<'s>, node: NodeId) -> NodeId {
1612 match tb.get_kind(node) {
1613 Kind::Lookahead if tb.get_min_max_length(node).1 == 0 => NodeId::EPS,
1614 Kind::Concat => {
1615 let l = node.left(tb);
1616 let r = node.right(tb);
1617 let stripped_r = Self::strip_trailing_lookahead(tb, r);
1618 if stripped_r == NodeId::EPS {
1619 Self::strip_trailing_lookahead(tb, l)
1620 } else if stripped_r == r {
1621 node
1622 } else {
1623 tb.mk_concat(l, stripped_r)
1624 }
1625 }
1626 _ => node,
1627 }
1628 }
1629
1630 fn concat_edge_index(asts: &[Ast], left: bool) -> Option<usize> {
1631 let dir: isize = if left { -1 } else { 1 };
1632 let mut e = if left { asts.len() as isize - 1 } else { 0 };
1633 while e >= 0
1634 && (e as usize) < asts.len()
1635 && Self::is_transparent_for_dir(&asts[e as usize], dir)
1636 {
1637 e += dir;
1638 }
1639 if e < 0 || e as usize >= asts.len() {
1640 None
1641 } else {
1642 Some(e as usize)
1643 }
1644 }
1645
1646 fn is_transparent_for_dir(ast: &Ast, dir: isize) -> bool {
1647 match ast {
1648 Ast::Lookaround(la) => match la.kind {
1649 ast::LookaroundKind::PositiveLookahead | ast::LookaroundKind::NegativeLookahead => {
1650 dir < 0
1651 }
1652 ast::LookaroundKind::PositiveLookbehind
1653 | ast::LookaroundKind::NegativeLookbehind => dir > 0,
1654 },
1655 Ast::Repetition(r) => matches!(
1656 &r.op.kind,
1657 ast::RepetitionKind::Range(ast::RepetitionRange::Exactly(0))
1658 ),
1659 _ => false,
1660 }
1661 }
1662
1663 fn concat_neighbor_kind(asts: &[Ast], idx: usize, dir: isize) -> WordCharKind {
1664 use WordCharKind::*;
1665 let next = idx as isize + dir;
1666 if next < 0 || next >= asts.len() as isize {
1667 return Edge;
1668 }
1669 if Self::is_transparent_for_dir(&asts[next as usize], dir) {
1670 return Self::concat_neighbor_kind(asts, next as usize, dir);
1671 }
1672 let kind = Self::word_char_kind(&asts[next as usize], dir < 0);
1673 match kind {
1674 MaybeWord => match Self::concat_neighbor_kind(asts, next as usize, dir) {
1675 Word => Word,
1676 _ => Unknown,
1677 },
1678 MaybeNonWord => match Self::concat_neighbor_kind(asts, next as usize, dir) {
1679 NonWord => NonWord,
1680 _ => Unknown,
1681 },
1682 other => other,
1683 }
1684 }
1685
1686 fn specialize_word_boundaries(
1687 &mut self,
1688 children: &mut [NodeId],
1689 tb: &mut TB<'s>,
1690 ) -> Result<()> {
1691 let wb = self.unicode_classes.wb;
1692 let non_wb = self.unicode_classes.non_wb;
1693 if wb == NodeId::MISSING {
1694 return Ok(());
1695 }
1696 let word = self.unicode_classes.word;
1697 let non_word = self.unicode_classes.non_word;
1698 if word == NodeId::MISSING {
1699 return Ok(());
1700 }
1701 let word_pref = tb.mk_concat(word, NodeId::TS);
1704 let non_word_pref = tb.mk_concat(non_word, NodeId::TS);
1705 let word_suf = tb.mk_concat(NodeId::TS, word);
1706 let non_word_suf = tb.mk_concat(NodeId::TS, non_word);
1707 let len = children.len();
1708 for k in 0..len {
1709 let l = if k == 0 {
1710 WordCharKind::Edge
1711 } else {
1712 use resharp_algebra::Kind;
1713 if tb.get_kind(children[k - 1]) == Kind::End
1714 && (children[k] == wb || children[k] == non_wb)
1715 {
1716 return Err(
1717 self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex)
1718 );
1719 }
1720 Self::classify(tb, children[k - 1], word_suf, non_word_suf)
1721 };
1722 let r = if k + 1 >= len {
1723 WordCharKind::Edge
1724 } else {
1725 Self::classify(tb, children[k + 1], word_pref, non_word_pref)
1726 };
1727 children[k] = Self::rewrite_wb_in_node(tb, children[k], wb, non_wb, word, l, r)
1728 .ok_or_else(|| self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))?;
1729 }
1730 Ok(())
1731 }
1732
1733 fn rewrite_wb_in_node(
1734 b: &mut TB<'s>,
1735 node: NodeId,
1736 wb: NodeId,
1737 non_wb: NodeId,
1738 word: NodeId,
1739 left: WordCharKind,
1740 right: WordCharKind,
1741 ) -> Option<NodeId> {
1742 let boundary_match = if node == wb {
1743 true
1744 } else if node == non_wb {
1745 false
1746 } else if b.get_kind(node) == Kind::Union {
1747 let l = Self::rewrite_wb_in_node(b, node.left(b), wb, non_wb, word, left, right)?;
1748 let r = Self::rewrite_wb_in_node(b, node.right(b), wb, non_wb, word, left, right)?;
1749 return Some(b.mk_union(l, r));
1750 } else {
1751 return Some(node);
1752 };
1753 use WordCharKind::*;
1754 let result = match (left, right) {
1755 (NonWord, Word) | (Word, NonWord) => {
1756 if boundary_match {
1757 NodeId::EPS
1758 } else {
1759 NodeId::BOT
1760 }
1761 }
1762 (Word, Word) | (NonWord, NonWord) => {
1763 if boundary_match {
1764 NodeId::BOT
1765 } else {
1766 NodeId::EPS
1767 }
1768 }
1769 (Word, _) => {
1770 if boundary_match {
1771 b.mk_neg_lookahead(word, 0)
1772 } else {
1773 let tail = b.mk_concat(word, NodeId::TS);
1774 b.mk_lookahead(tail, NodeId::MISSING, 0)
1775 }
1776 }
1777 (NonWord, _) => {
1778 if boundary_match {
1779 let tail = b.mk_concat(word, NodeId::TS);
1780 b.mk_lookahead(tail, NodeId::MISSING, 0)
1781 } else {
1782 b.mk_neg_lookahead(word, 0)
1783 }
1784 }
1785 (_, Word) => {
1786 if boundary_match {
1787 b.mk_neg_lookbehind(word)
1788 } else {
1789 b.mk_lookbehind(word, NodeId::MISSING)
1790 }
1791 }
1792 (_, NonWord) => {
1793 if boundary_match {
1794 b.mk_lookbehind(word, NodeId::MISSING)
1795 } else {
1796 b.mk_neg_lookbehind(word)
1797 }
1798 }
1799 _ => return Some(node),
1800 };
1801 Some(result)
1802 }
1803
1804 fn classify(
1805 b: &mut TB<'s>,
1806 node: NodeId,
1807 word_dir: NodeId,
1808 non_word_dir: NodeId,
1809 ) -> WordCharKind {
1810 if b.contains_look(node) || b.contains_anchors(node) {
1811 return WordCharKind::Unknown;
1812 }
1813 if b.subsumes(word_dir, node) == Some(true) {
1814 WordCharKind::Word
1815 } else if b.subsumes(non_word_dir, node) == Some(true) {
1816 WordCharKind::NonWord
1817 } else {
1818 WordCharKind::Unknown
1819 }
1820 }
1821
1822 fn rewrite_word_boundary_in_concat(
1823 &mut self,
1824 asts: &[Ast],
1825 idx: usize,
1826 translator: &mut Option<Translator>,
1827 tb: &mut TB<'s>,
1828 negated: bool,
1829 ) -> Result<(NodeId, usize)> {
1830 use WordCharKind::*;
1831 if self.global_full_unicode {
1832 self.unicode_classes.ensure_word_full(tb);
1833 } else if self.global_unicode && !self.global_ascii_perl {
1834 self.unicode_classes.ensure_word(tb);
1835 } else {
1836 self.unicode_classes.ensure_word_ascii(tb);
1837 }
1838 let word_id = self.unicode_classes.word;
1839 let not_word_id = self.unicode_classes.non_word;
1840 let left = self.resolve_word_kind(asts, idx, -1, translator, tb, word_id, not_word_id)?;
1841 let right = self.resolve_word_kind(asts, idx, 1, translator, tb, word_id, not_word_id)?;
1842 let boundary_match = !negated;
1843 match (left, right) {
1844 (NonWord, Word) | (Word, NonWord) => Ok((
1845 if boundary_match {
1846 NodeId::EPS
1847 } else {
1848 NodeId::BOT
1849 },
1850 idx + 1,
1851 )),
1852 (Word, Word) | (NonWord, NonWord) => Ok((
1853 if boundary_match {
1854 NodeId::BOT
1855 } else {
1856 NodeId::EPS
1857 },
1858 idx + 1,
1859 )),
1860 (Word, _) => {
1861 if boundary_match {
1862 Ok((tb.mk_neg_lookahead(word_id, 0), idx + 1))
1863 } else {
1864 let tail = tb.mk_concat(word_id, NodeId::TS);
1865 self.merge_boundary_with_following_lookaheads(asts, idx, tail, translator, tb)
1866 }
1867 }
1868 (NonWord, _) => {
1869 if boundary_match {
1870 let tail = tb.mk_concat(word_id, NodeId::TS);
1871 self.merge_boundary_with_following_lookaheads(asts, idx, tail, translator, tb)
1872 } else {
1873 Ok((tb.mk_neg_lookahead(word_id, 0), idx + 1))
1874 }
1875 }
1876 (_, Word) => {
1877 if boundary_match {
1878 Ok((tb.mk_neg_lookbehind(word_id), idx + 1))
1879 } else {
1880 Ok((tb.mk_lookbehind(word_id, NodeId::MISSING), idx + 1))
1881 }
1882 }
1883 (_, NonWord) => {
1884 if boundary_match {
1885 Ok((tb.mk_lookbehind(word_id, NodeId::MISSING), idx + 1))
1886 } else {
1887 Ok((tb.mk_neg_lookbehind(word_id), idx + 1))
1888 }
1889 }
1890 _ => {
1894 self.unicode_classes.ensure_wb(tb);
1895 let node = if boundary_match {
1896 self.unicode_classes.wb
1897 } else {
1898 self.unicode_classes.non_wb
1899 };
1900 Ok((node, idx + 1))
1901 }
1902 }
1903 }
1904
1905 fn merge_boundary_with_following_lookaheads(
1906 &mut self,
1907 asts: &[Ast],
1908 wb_idx: usize,
1909 boundary_tail: NodeId,
1910 translator: &mut Option<Translator>,
1911 tb: &mut TB<'s>,
1912 ) -> Result<(NodeId, usize)> {
1913 let mut next = wb_idx + 1;
1914 let mut la_bodies = vec![boundary_tail];
1915 while next < asts.len() {
1916 match &asts[next] {
1917 Ast::Lookaround(la) if la.kind == ast::LookaroundKind::PositiveLookahead => {
1918 let body = self.ast_to_node_id(&la.ast, translator, tb)?;
1919 la_bodies.push(tb.mk_concat(body, NodeId::TS));
1920 next += 1;
1921 }
1922 _ => break,
1923 }
1924 }
1925 let merged = tb.mk_inters(la_bodies.into_iter());
1926 Ok((tb.mk_lookahead(merged, NodeId::MISSING, 0), next))
1927 }
1928
1929 fn ast_to_node_id(
1930 &mut self,
1931 ast: &Ast,
1932 translator: &mut Option<Translator>,
1933 tb: &mut TB<'s>,
1934 ) -> Result<NodeId> {
1935 match ast {
1936 Ast::Empty(_) => Ok(NodeId::EPS),
1937 Ast::Flags(f) => {
1938 if f.flags.flag_state(ast::Flag::SwapGreed).is_some() {
1939 return Err(self.error(f.span, ast::ErrorKind::UnsupportedResharpRegex));
1940 }
1941 let mut translator_builder = self.default_translator_builder();
1942 if let Some(state) = f.flags.flag_state(ast::Flag::CaseInsensitive) {
1943 translator_builder.case_insensitive(state);
1944 }
1945 if let Some(state) = f.flags.flag_state(ast::Flag::Unicode) {
1946 translator_builder.unicode(state);
1947 }
1948 if let Some(state) = f.flags.flag_state(ast::Flag::DotMatchesNewLine) {
1949 self.dot_all.set(state);
1950 }
1951 if let Some(state) = f.flags.flag_state(ast::Flag::MultiLine) {
1952 self.multiline.set(state);
1953 }
1954 let concat_translator = Some(translator_builder.build());
1955 *translator = concat_translator;
1956 Ok(NodeId::EPS)
1957 }
1958 Ast::Literal(l) => {
1959 let ast_lit = regex_syntax::ast::Ast::literal(*l.to_owned());
1960 self.translator_to_node_id(&ast_lit, translator, tb)
1961 }
1962 Ast::Top(_) => Ok(NodeId::TOP),
1963 Ast::Dot(_) => {
1964 let codepoint_dot = self.global_ascii_perl || self.global_full_unicode;
1965 let hirv = match (codepoint_dot, self.dot_all.get()) {
1966 (true, true) => hir::Hir::dot(hir::Dot::AnyChar),
1967 (true, false) => hir::Hir::dot(hir::Dot::AnyCharExceptLF),
1968 (false, true) => return Ok(NodeId::TOP),
1969 (false, false) => hir::Hir::dot(hir::Dot::AnyByteExceptLF),
1970 };
1971 self.hir_to_node_id(&hirv, tb)
1972 }
1973 Ast::Assertion(a) => match &a.kind {
1974 ast::AssertionKind::StartText => Ok(NodeId::BEGIN),
1975 ast::AssertionKind::EndText => Ok(NodeId::END),
1976 ast::AssertionKind::WordBoundary => {
1977 let only = Ast::Assertion(a.clone());
1978 let asts = std::slice::from_ref(&only);
1979 let (node, _) =
1980 self.rewrite_word_boundary_in_concat(asts, 0, translator, tb, false)?;
1981 Ok(node)
1982 }
1983 ast::AssertionKind::NotWordBoundary => {
1984 let only = Ast::Assertion(a.clone());
1986 let asts = std::slice::from_ref(&only);
1987 let (node, _) =
1988 self.rewrite_word_boundary_in_concat(asts, 0, translator, tb, true)?;
1989 Ok(node)
1990 }
1991 ast::AssertionKind::StartLine => {
1992 if !self.multiline.get() {
1993 return Ok(NodeId::BEGIN);
1994 }
1995 let left = NodeId::BEGIN;
1996 let right = tb.mk_u8(b'\n');
1997 let union = tb.mk_union(left, right);
1998 Ok(tb.mk_lookbehind(union, NodeId::MISSING))
1999 }
2000 ast::AssertionKind::EndLine => {
2001 if !self.multiline.get() {
2002 return Ok(NodeId::END);
2003 }
2004 let left = NodeId::END;
2005 let right = tb.mk_u8(b'\n');
2006 let union = tb.mk_union(left, right);
2007 Ok(tb.mk_lookahead(union, NodeId::MISSING, 0))
2008 }
2009 ast::AssertionKind::WordBoundaryStart => {
2010 Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
2011 }
2012 ast::AssertionKind::WordBoundaryEnd => {
2013 Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
2014 }
2015 ast::AssertionKind::WordBoundaryStartAngle => {
2016 Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
2017 }
2018 ast::AssertionKind::WordBoundaryEndAngle => {
2019 Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
2020 }
2021 ast::AssertionKind::WordBoundaryStartHalf => {
2022 Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
2023 }
2024 ast::AssertionKind::WordBoundaryEndHalf => {
2025 Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
2026 }
2027 },
2028 Ast::ClassUnicode(c) => {
2029 let tmp = regex_syntax::ast::ClassUnicode {
2030 span: c.span,
2031 negated: c.negated,
2032 kind: c.kind.clone(),
2033 };
2034 if !c.negated {
2035 if let regex_syntax::ast::ClassUnicodeKind::Named(s) = &c.kind {
2036 match s.as_str() {
2037 "ascii" => return Ok(tb.mk_range_u8(0, 127)),
2039 "utf8" => {
2041 let ascii = tb.mk_range_u8(0, 127);
2042 let beta = tb.mk_range_u8(128, 0xBF);
2043 let c0 = tb.mk_range_u8(0xC0, 0xDF);
2044 let c0s = tb.mk_concats([c0, beta].into_iter());
2045 let e0 = tb.mk_range_u8(0xE0, 0xEF);
2046 let e0s = tb.mk_concats([e0, beta, beta].into_iter());
2047 let f0 = tb.mk_range_u8(0xF0, 0xF7);
2048 let f0s = tb.mk_concats([f0, beta, beta, beta].into_iter());
2049 return Ok(tb.mk_unions([ascii, c0s, e0s, f0s].into_iter()));
2050 }
2051 "hex" => {
2052 let nums = tb.mk_range_u8(b'0', b'9');
2053 let lets = tb.mk_range_u8(b'a', b'f');
2054 let lets2 = tb.mk_range_u8(b'A', b'F');
2055 let merged = tb.mk_unions([nums, lets, lets2].into_iter());
2056 return Ok(merged);
2057 }
2058 _ => {}
2059 }
2060 };
2061 }
2062
2063 let orig_ast = regex_syntax::ast::Ast::class_unicode(tmp);
2064 self.translator_to_node_id(&orig_ast, translator, tb)
2065 }
2066 Ast::ClassPerl(c) => self.get_class(c.negated, c.kind.clone(), tb),
2067 Ast::ClassBracketed(c) => match &c.kind {
2068 regex_syntax::ast::ClassSet::Item(item) => {
2069 if !c.negated && is_universal_perl_pair(item) {
2070 return Ok(NodeId::TOP);
2071 }
2072 if let regex_syntax::ast::ClassSetItem::Perl(p) = item {
2073 return self.get_class(c.negated ^ p.negated, p.kind.clone(), tb);
2074 }
2075 let kind = if self.global_ascii_perl {
2076 rewrite_ascii_perl_set(&c.kind)
2077 } else {
2078 c.kind.clone()
2079 };
2080 let tmp = regex_syntax::ast::ClassBracketed {
2081 span: c.span,
2082 negated: c.negated,
2083 kind,
2084 };
2085 let orig_ast = regex_syntax::ast::Ast::class_bracketed(tmp);
2086 self.translator_to_node_id(&orig_ast, translator, tb)
2087 }
2088 regex_syntax::ast::ClassSet::BinaryOp(_) => {
2089 Err(self.error(c.span, ast::ErrorKind::UnsupportedResharpRegex))
2090 }
2091 },
2092 Ast::Repetition(r) => {
2093 let body = self.ast_to_node_id(&r.ast, translator, tb);
2094 match body {
2095 Ok(body) => match &r.op.kind {
2096 ast::RepetitionKind::ZeroOrOne => Ok(tb.mk_opt(body)),
2097 ast::RepetitionKind::ZeroOrMore => Ok(tb.mk_star(body)),
2098 ast::RepetitionKind::OneOrMore => Ok(tb.mk_plus(body)),
2099 ast::RepetitionKind::Range(r) => match r {
2100 ast::RepetitionRange::Exactly(n) => Ok(tb.mk_repeat(body, *n, *n)),
2101 ast::RepetitionRange::AtLeast(n) => {
2102 let rep = tb.mk_repeat(body, *n, *n);
2103 let st = tb.mk_star(body);
2104 Ok(tb.mk_concat(rep, st))
2105 }
2106
2107 ast::RepetitionRange::Bounded(n, m) => Ok(tb.mk_repeat(body, *n, *m)),
2108 },
2109 },
2110 Err(_) => body,
2111 }
2112 }
2113 Ast::Lookaround(g) => {
2114 let body = self.ast_to_node_id(&g.ast, translator, tb)?;
2115 match g.kind {
2116 ast::LookaroundKind::PositiveLookahead if body.contains_lookbehind(tb) => {
2117 let mut prefix = NodeId::EPS;
2118 let mut rest = body;
2119 while tb.get_kind(rest) == Kind::Concat
2120 && tb.get_kind(rest.left(tb)) == Kind::Lookbehind
2121 {
2122 prefix = tb.mk_concat(prefix, rest.left(tb));
2123 rest = rest.right(tb);
2124 }
2125 if prefix == NodeId::EPS || rest.contains_lookbehind(tb) {
2126 return Err(self.error(g.span, ast::ErrorKind::UnsupportedResharpRegex));
2127 }
2128 let la = tb.mk_lookahead(rest, NodeId::MISSING, 0);
2129 Ok(tb.mk_concat(prefix, la))
2130 }
2131 ast::LookaroundKind::NegativeLookahead if body.contains_lookbehind(tb) => {
2132 Err(self.error(g.span, ast::ErrorKind::UnsupportedResharpRegex))
2133 }
2134 ast::LookaroundKind::PositiveLookahead => {
2135 Ok(tb.mk_lookahead(body, NodeId::MISSING, 0))
2136 }
2137 ast::LookaroundKind::PositiveLookbehind
2138 | ast::LookaroundKind::NegativeLookbehind
2139 if body.contains_lookahead(tb) =>
2140 {
2141 Err(self.error(g.span, ast::ErrorKind::UnsupportedResharpRegex))
2142 }
2143 ast::LookaroundKind::PositiveLookbehind => {
2144 Ok(tb.mk_lookbehind(body, NodeId::MISSING))
2145 }
2146 ast::LookaroundKind::NegativeLookahead => Ok(tb.mk_neg_lookahead(body, 0)),
2147 ast::LookaroundKind::NegativeLookbehind => Ok(tb.mk_neg_lookbehind(body)),
2148 }
2149 }
2150 Ast::Group(g) => {
2151 if let ast::GroupKind::NonCapturing(ref flags) = g.kind {
2152 if !flags.items.is_empty() {
2153 let mut translator_builder = self.default_translator_builder();
2154 if let Some(state) = flags.flag_state(ast::Flag::CaseInsensitive) {
2155 translator_builder.case_insensitive(state);
2156 }
2157 if let Some(state) = flags.flag_state(ast::Flag::Unicode) {
2158 translator_builder.unicode(state);
2159 }
2160 let saved_dot_all = self.dot_all.get();
2161 if let Some(state) = flags.flag_state(ast::Flag::DotMatchesNewLine) {
2162 self.dot_all.set(state);
2163 }
2164 let saved_multiline = self.multiline.get();
2165 if let Some(state) = flags.flag_state(ast::Flag::MultiLine) {
2166 self.multiline.set(state);
2167 }
2168 let mut scoped = Some(translator_builder.build());
2169 let result = self.ast_to_node_id(&g.ast, &mut scoped, tb);
2170 self.dot_all.set(saved_dot_all);
2171 self.multiline.set(saved_multiline);
2172 return result;
2173 }
2174 }
2175 self.ast_to_node_id(&g.ast, translator, tb)
2176 }
2177 Ast::Alternation(a) => {
2178 let mut children = vec![];
2179 for ast in &a.asts {
2180 match self.ast_to_node_id(ast, translator, tb) {
2181 Ok(node_id) => children.push(node_id),
2182 Err(err) => return Err(err),
2183 }
2184 }
2185 Ok(tb.mk_unions(children.iter().copied()))
2186 }
2187 Ast::Concat(c) => {
2188 let mut concat_translator: Option<Translator> = None;
2189 let mut children = vec![];
2190 let mut prev_boundary_child: Option<usize> = None;
2191 let mut i = 0;
2192 while i < c.asts.len() {
2193 let ast = &c.asts[i];
2194 match ast {
2195 Ast::Flags(f) => {
2196 if f.flags.flag_state(ast::Flag::SwapGreed).is_some() {
2197 return Err(
2198 self.error(f.span, ast::ErrorKind::UnsupportedResharpRegex)
2199 );
2200 }
2201 let mut translator_builder = self.default_translator_builder();
2202 if let Some(state) = f.flags.flag_state(ast::Flag::CaseInsensitive) {
2203 translator_builder.case_insensitive(state);
2204 }
2205 if let Some(state) = f.flags.flag_state(ast::Flag::Unicode) {
2206 translator_builder.unicode(state);
2207 }
2208 if let Some(state) = f.flags.flag_state(ast::Flag::DotMatchesNewLine) {
2209 self.dot_all.set(state);
2210 }
2211 if let Some(state) = f.flags.flag_state(ast::Flag::MultiLine) {
2212 self.multiline.set(state);
2213 }
2214 concat_translator = Some(translator_builder.build());
2215 *translator = concat_translator.clone();
2216 i += 1;
2217 continue;
2218 }
2219 Ast::Assertion(a)
2220 if a.kind == ast::AssertionKind::WordBoundary
2221 || a.kind == ast::AssertionKind::NotWordBoundary =>
2222 {
2223 let negated = a.kind == ast::AssertionKind::NotWordBoundary;
2224 let node = self.rewrite_word_boundary_in_concat(
2225 &c.asts, i, translator, tb, negated,
2226 )?;
2227 match prev_boundary_child {
2228 Some(idx) => children[idx] = tb.mk_inter(children[idx], node.0),
2229 None => {
2230 children.push(node.0);
2231 prev_boundary_child = Some(children.len() - 1);
2232 }
2233 }
2234 i = node.1; continue;
2236 }
2237 _ => {}
2238 }
2239 match concat_translator {
2240 Some(_) => match self.ast_to_node_id(ast, &mut concat_translator, tb) {
2241 Ok(node_id) => {
2242 if node_id != resharp_algebra::NodeId::EPS {
2243 prev_boundary_child = None;
2244 children.push(node_id);
2245 }
2246 }
2247 Err(err) => return Err(err),
2248 },
2249 None => match self.ast_to_node_id(ast, translator, tb) {
2250 Ok(node_id) => {
2251 if node_id != resharp_algebra::NodeId::EPS {
2252 prev_boundary_child = None;
2253 children.push(node_id);
2254 }
2255 }
2256 Err(err) => return Err(err),
2257 },
2258 }
2259 i += 1;
2260 }
2261 self.specialize_word_boundaries(&mut children, tb)?;
2262 Ok(tb.mk_concats(children.iter().cloned()))
2263 }
2264 Ast::Intersection(intersection) => {
2265 let mut children = vec![];
2266 for ast in &intersection.asts {
2267 match self.ast_to_node_id(ast, translator, tb) {
2268 Ok(node_id) => children.push(node_id),
2269 Err(err) => return Err(err),
2270 }
2271 }
2272 Ok(tb.mk_inters(children.into_iter()))
2273 }
2274 Ast::Complement(complement) => {
2275 let body = self.ast_to_node_id(&complement.ast, translator, tb);
2276 body.map(|x| tb.mk_compl(x))
2277 }
2278 }
2279 }
2280
2281 fn parse_inner(&mut self) -> Result<Ast> {
2282 let mut concat = Concat {
2283 span: self.span(),
2284 asts: vec![],
2285 };
2286 loop {
2287 self.bump_space();
2288 if self.is_eof() {
2289 break;
2290 }
2291 match self.char() {
2292 '(' => concat = self.push_group(concat)?,
2293 ')' => concat = self.pop_group(concat)?,
2294 '|' => concat = self.push_alternate(concat)?,
2295 '&' => concat = self.push_intersect(concat)?,
2296 '~' => concat = self.push_compl_group(concat)?,
2297 '[' => {
2298 let class = self.parse_set_class()?;
2299 concat.asts.push(Ast::class_bracketed(class));
2300 }
2301 '?' => {
2302 concat =
2303 self.parse_uncounted_repetition(concat, ast::RepetitionKind::ZeroOrOne)?;
2304 }
2305 '*' => {
2306 concat =
2307 self.parse_uncounted_repetition(concat, ast::RepetitionKind::ZeroOrMore)?;
2308 }
2309 '+' => {
2310 concat =
2311 self.parse_uncounted_repetition(concat, ast::RepetitionKind::OneOrMore)?;
2312 }
2313 '{' => {
2314 concat = self.parse_counted_repetition(concat)?;
2315 }
2316 _ => concat.asts.push(self.parse_primitive()?.into_ast()),
2317 }
2318 if self.stack_group.borrow().len() > self.max_depth {
2319 return Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex));
2320 }
2321 }
2322 let ast = self.pop_group_end(concat)?;
2323 if expanded_ast_size(&ast, self.expanded_ast_limit) >= self.expanded_ast_limit
2324 || max_concat_length(&ast) >= self.max_list_len
2325 {
2326 return Err(self.error(*ast.span(), ast::ErrorKind::UnsupportedResharpRegex));
2327 }
2328 Ok(ast)
2329 }
2330
2331 fn parse(&mut self, tb: &mut TB<'s>) -> Result<NodeId> {
2332 let ast = self.parse_inner()?;
2333 if let Err(span) = ensure_lookbehind_at_start(&ast, true) {
2334 return Err(self.error(span, ast::ErrorKind::UnsupportedResharpRegex));
2335 }
2336 self.ast_to_node_id(&ast, &mut None, tb)
2337 }
2338
2339 #[inline(never)]
2340 fn parse_uncounted_repetition(
2341 &self,
2342 mut concat: ast::Concat,
2343 kind: ast::RepetitionKind,
2344 ) -> Result<ast::Concat> {
2345 let op_start = self.pos();
2347 let ast = match concat.asts.pop() {
2348 Some(ast) => ast,
2349 None => return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing)),
2350 };
2351 match ast {
2352 Ast::Empty(_) | Ast::Flags(_) => {
2353 return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing))
2354 }
2355 _ => {}
2356 }
2357 if self.bump() && self.char() == '?' {
2358 return Err(self.error(
2359 Span::new(op_start, self.pos()),
2360 ast::ErrorKind::UnsupportedLazyQuantifier,
2361 ));
2362 }
2363 concat.asts.push(Ast::repetition(ast::Repetition {
2364 span: ast.span().with_end(self.pos()),
2365 op: ast::RepetitionOp {
2366 span: Span::new(op_start, self.pos()),
2367 kind,
2368 },
2369 greedy: true,
2370 ast: Box::new(ast),
2371 }));
2372 Ok(concat)
2373 }
2374
    /// Parses a counted repetition (`{n}`, `{n,}` or `{n,m}`) and applies it
    /// to the last expression in `concat`.
    ///
    /// The parser must be positioned at the opening `{` (asserted). On
    /// success the operand is popped from `concat` and replaced by a greedy
    /// repetition wrapping it.
    ///
    /// # Errors
    ///
    /// * `RepetitionMissing` when there is no operand, or the operand is an
    ///   empty expression or a flag directive.
    /// * `RepetitionCountUnclosed` when input ends early or `}` is missing.
    /// * `RepetitionCountDecimalEmpty` when a required count is absent.
    /// * `RepetitionCountInvalid` when `range.is_valid()` is false.
    /// * `UnsupportedLazyQuantifier` when the closing `}` is followed by `?`.
    /// * `UnsupportedResharpRegex` when any bound exceeds `max_repeat`.
    #[inline(never)]
    fn parse_counted_repetition(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
        assert!(self.char() == '{');
        let start = self.pos();
        let ast = match concat.asts.pop() {
            Some(ast) => ast,
            None => return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing)),
        };
        // An empty expression or a flag directive cannot be repeated.
        match ast {
            Ast::Empty(_) | Ast::Flags(_) => {
                return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing))
            }
            _ => {}
        }
        if !self.bump_and_bump_space() {
            return Err(self.error(
                Span::new(start, self.pos()),
                ast::ErrorKind::RepetitionCountUnclosed,
            ));
        }
        // Keep the first count as a `Result`: whether an empty count is an
        // error depends on what follows (`{,m}` may be accepted below).
        let count_start = specialize_err(
            self.parse_decimal(),
            ast::ErrorKind::DecimalEmpty,
            ast::ErrorKind::RepetitionCountDecimalEmpty,
        );
        if self.is_eof() {
            return Err(self.error(
                Span::new(start, self.pos()),
                ast::ErrorKind::RepetitionCountUnclosed,
            ));
        }
        let range = if self.char() == ',' {
            if !self.bump_and_bump_space() {
                return Err(self.error(
                    Span::new(start, self.pos()),
                    ast::ErrorKind::RepetitionCountUnclosed,
                ));
            }
            if self.char() != '}' {
                // `{n,m}` form; a missing `n` defaults to 0 only when the
                // parser's `empty_min_range` option is set.
                let count_start = match count_start {
                    Ok(c) => c,
                    Err(err) if err.kind == ast::ErrorKind::RepetitionCountDecimalEmpty => {
                        if self.parser().empty_min_range {
                            0
                        } else {
                            return Err(err);
                        }
                    }
                    err => err?,
                };
                let count_end = specialize_err(
                    self.parse_decimal(),
                    ast::ErrorKind::DecimalEmpty,
                    ast::ErrorKind::RepetitionCountDecimalEmpty,
                )?;
                ast::RepetitionRange::Bounded(count_start, count_end)
            } else {
                // `{n,}` form.
                ast::RepetitionRange::AtLeast(count_start?)
            }
        } else {
            // `{n}` form.
            ast::RepetitionRange::Exactly(count_start?)
        };

        if self.is_eof() || self.char() != '}' {
            return Err(self.error(
                Span::new(start, self.pos()),
                ast::ErrorKind::RepetitionCountUnclosed,
            ));
        }

        // Step over `}`; a following `?` would request lazy matching.
        if self.bump_and_bump_space() && self.char() == '?' {
            return Err(self.error(
                Span::new(start, self.pos()),
                ast::ErrorKind::UnsupportedLazyQuantifier,
            ));
        }

        let op_span = Span::new(start, self.pos());
        if !range.is_valid() {
            return Err(self.error(op_span, ast::ErrorKind::RepetitionCountInvalid));
        }

        // Reject counts above the configured `max_repeat`.
        let over_limit = match &range {
            ast::RepetitionRange::Exactly(n) => *n > self.max_repeat,
            ast::RepetitionRange::AtLeast(n) => *n > self.max_repeat,
            ast::RepetitionRange::Bounded(n, m) => *n > self.max_repeat || *m > self.max_repeat,
        };
        if over_limit {
            return Err(self.error(op_span, ast::ErrorKind::UnsupportedResharpRegex));
        }
        concat.asts.push(Ast::repetition(ast::Repetition {
            span: ast.span().with_end(self.pos()),
            op: ast::RepetitionOp {
                span: op_span,
                kind: ast::RepetitionKind::Range(range),
            },
            greedy: true,
            ast: Box::new(ast),
        }));
        Ok(concat)
    }
2476
2477 #[inline(never)]
2478 fn parse_group(&self) -> Result<Either<ast::SetFlags, ast::Group>> {
2479 assert_eq!(self.char(), '(');
2480 let open_span = self.span_char();
2481 self.bump();
2482 self.bump_space();
2483 if let Some((ahead, pos)) = self.is_lookaround_prefix() {
2484 let kind = match (pos, ahead) {
2485 (true, true) => LookaroundKind::PositiveLookahead,
2486 (true, false) => LookaroundKind::PositiveLookbehind,
2487 (false, true) => LookaroundKind::NegativeLookahead,
2488 (false, false) => LookaroundKind::NegativeLookbehind,
2489 };
2490 return Ok(Either::Right(ast::Group {
2491 span: open_span,
2492 kind: ast::GroupKind::Lookaround(kind),
2493 ast: Box::new(Ast::empty(self.span())),
2494 }));
2495 }
2496 let inner_span = self.span();
2497 let mut starts_with_p = true;
2498 if self.bump_if("?P<") || {
2499 starts_with_p = false;
2500 self.bump_if("?<")
2501 } {
2502 let capture_index = self.next_capture_index(open_span)?;
2503 let name = self.parse_capture_name(capture_index)?;
2504 Ok(Either::Right(ast::Group {
2505 span: open_span,
2506 kind: ast::GroupKind::CaptureName {
2507 starts_with_p,
2508 name,
2509 },
2510 ast: Box::new(Ast::empty(self.span())),
2511 }))
2512 } else if self.bump_if("?") {
2513 if self.is_eof() {
2514 return Err(self.error(open_span, ast::ErrorKind::GroupUnclosed));
2515 }
2516 let flags = self.parse_flags()?;
2517 let char_end = self.char();
2518 self.bump();
2519 if char_end == ')' {
2520 if flags.items.is_empty() {
2523 return Err(self.error(inner_span, ast::ErrorKind::RepetitionMissing));
2524 }
2525 Ok(Either::Left(ast::SetFlags {
2526 span: Span {
2527 end: self.pos(),
2528 ..open_span
2529 },
2530 flags,
2531 }))
2532 } else {
2533 assert_eq!(char_end, ':');
2534 Ok(Either::Right(ast::Group {
2535 span: open_span,
2536 kind: ast::GroupKind::NonCapturing(flags),
2537 ast: Box::new(Ast::empty(self.span())),
2538 }))
2539 }
2540 } else {
2541 let capture_index = self.next_capture_index(open_span)?;
2542 Ok(Either::Right(ast::Group {
2543 span: open_span,
2544 kind: ast::GroupKind::CaptureIndex(capture_index),
2545 ast: Box::new(Ast::empty(self.span())),
2546 }))
2547 }
2548 }
2549
2550 #[inline(never)]
2551 fn parse_capture_name(&self, capture_index: u32) -> Result<ast::CaptureName> {
2552 if self.is_eof() {
2553 return Err(self.error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
2554 }
2555 let start = self.pos();
2556 loop {
2557 if self.char() == '>' {
2558 break;
2559 }
2560 if !is_capture_char(self.char(), self.pos() == start) {
2561 return Err(self.error(self.span_char(), ast::ErrorKind::GroupNameInvalid));
2562 }
2563 if !self.bump() {
2564 break;
2565 }
2566 }
2567 let end = self.pos();
2568 if self.is_eof() {
2569 return Err(self.error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
2570 }
2571 assert_eq!(self.char(), '>');
2572 self.bump();
2573 let name = &self.pattern()[start.offset..end.offset];
2574 if name.is_empty() {
2575 return Err(self.error(Span::new(start, start), ast::ErrorKind::GroupNameEmpty));
2576 }
2577 let capname = ast::CaptureName {
2578 span: Span::new(start, end),
2579 name: name.to_string(),
2580 index: capture_index,
2581 };
2582 self.add_capture_name(&capname)?;
2583 Ok(capname)
2584 }
2585
2586 #[inline(never)]
2587 fn parse_flags(&self) -> Result<ast::Flags> {
2588 let mut flags = ast::Flags {
2589 span: self.span(),
2590 items: vec![],
2591 };
2592 let mut last_was_negation = None;
2593 while self.char() != ':' && self.char() != ')' {
2594 if self.char() == '-' {
2595 last_was_negation = Some(self.span_char());
2596 let item = ast::FlagsItem {
2597 span: self.span_char(),
2598 kind: ast::FlagsItemKind::Negation,
2599 };
2600 if let Some(i) = flags.add_item(item) {
2601 return Err(self.error(
2602 self.span_char(),
2603 ast::ErrorKind::FlagRepeatedNegation {
2604 original: flags.items[i].span,
2605 },
2606 ));
2607 }
2608 } else {
2609 last_was_negation = None;
2610 let item = ast::FlagsItem {
2611 span: self.span_char(),
2612 kind: ast::FlagsItemKind::Flag(self.parse_flag()?),
2613 };
2614 if let Some(i) = flags.add_item(item) {
2615 return Err(self.error(
2616 self.span_char(),
2617 ast::ErrorKind::FlagDuplicate {
2618 original: flags.items[i].span,
2619 },
2620 ));
2621 }
2622 }
2623 if !self.bump() {
2624 return Err(self.error(self.span(), ast::ErrorKind::FlagUnexpectedEof));
2625 }
2626 }
2627 if let Some(span) = last_was_negation {
2628 return Err(self.error(span, ast::ErrorKind::FlagDanglingNegation));
2629 }
2630 flags.span.end = self.pos();
2631 Ok(flags)
2632 }
2633
2634 #[inline(never)]
2635 fn parse_flag(&self) -> Result<ast::Flag> {
2636 match self.char() {
2637 'i' => Ok(ast::Flag::CaseInsensitive),
2638 'm' => Ok(ast::Flag::MultiLine),
2639 's' => Ok(ast::Flag::DotMatchesNewLine),
2640 'U' => Ok(ast::Flag::SwapGreed),
2641 'u' => Ok(ast::Flag::Unicode),
2642 'R' => Ok(ast::Flag::CRLF),
2643 'x' => Ok(ast::Flag::IgnoreWhitespace),
2644 _ => Err(self.error(self.span_char(), ast::ErrorKind::FlagUnrecognized)),
2645 }
2646 }
2647
2648 fn parse_primitive(&self) -> Result<Primitive> {
2649 match self.char() {
2650 '\\' => self.parse_escape(),
2651 '_' => {
2652 let ast = Primitive::Top(self.span_char());
2653 self.bump();
2654 Ok(ast)
2655 }
2656 '.' => {
2657 let ast = Primitive::Dot(self.span_char());
2658 self.bump();
2659 Ok(ast)
2660 }
2661 '^' => {
2662 let ast = Primitive::Assertion(ast::Assertion {
2663 span: self.span_char(),
2664 kind: ast::AssertionKind::StartLine,
2665 });
2666 self.bump();
2667 Ok(ast)
2668 }
2669 '$' => {
2670 let ast = Primitive::Assertion(ast::Assertion {
2671 span: self.span_char(),
2672 kind: ast::AssertionKind::EndLine,
2673 });
2674 self.bump();
2675 Ok(ast)
2676 }
2677 c => {
2678 let ast = Primitive::Literal(Literal {
2679 span: self.span_char(),
2680 kind: LiteralKind::Verbatim,
2681 c,
2682 });
2683 self.bump();
2684 Ok(ast)
2685 }
2686 }
2687 }
2688
2689 #[inline(never)]
2690 fn parse_escape(&self) -> Result<Primitive> {
2691 assert_eq!(self.char(), '\\');
2692 let start = self.pos();
2693 if !self.bump() {
2694 return Err(self.error(
2695 Span::new(start, self.pos()),
2696 ast::ErrorKind::EscapeUnexpectedEof,
2697 ));
2698 }
2699 let c = self.char();
2700 match c {
2702 '0'..='9' => {
2703 if !self.parser().octal {
2704 return Err(self.error(
2705 Span::new(start, self.span_char().end),
2706 ast::ErrorKind::UnsupportedBackreference,
2707 ));
2708 }
2709 let mut lit = self.parse_octal();
2710 lit.span.start = start;
2711 return Ok(Primitive::Literal(lit));
2712 }
2713 'x' | 'u' | 'U' => {
2714 let mut lit = self.parse_hex()?;
2715 lit.span.start = start;
2716 return Ok(Primitive::Literal(lit));
2717 }
2718 'p' | 'P' => {
2719 let mut cls = self.parse_unicode_class()?;
2720 cls.span.start = start;
2721 return Ok(Primitive::Unicode(cls));
2722 }
2723 'd' | 's' | 'w' | 'D' | 'S' | 'W' => {
2724 let mut cls = self.parse_perl_class();
2725 cls.span.start = start;
2726 return Ok(Primitive::Perl(cls));
2727 }
2728 _ => {}
2729 }
2730
2731 self.bump();
2733 let span = Span::new(start, self.pos());
2734 if is_meta_character(c) {
2735 return Ok(Primitive::Literal(Literal {
2736 span,
2737 kind: LiteralKind::Meta,
2738 c,
2739 }));
2740 }
2741 if is_escapeable_character(c) {
2742 return Ok(Primitive::Literal(Literal {
2743 span,
2744 kind: LiteralKind::Superfluous,
2745 c,
2746 }));
2747 }
2748 let special = |kind, c| {
2749 Ok(Primitive::Literal(Literal {
2750 span,
2751 kind: LiteralKind::Special(kind),
2752 c,
2753 }))
2754 };
2755 match c {
2756 'a' => special(SpecialLiteralKind::Bell, '\x07'),
2757 'f' => special(SpecialLiteralKind::FormFeed, '\x0C'),
2758 't' => special(SpecialLiteralKind::Tab, '\t'),
2759 'n' => special(SpecialLiteralKind::LineFeed, '\n'),
2760 'r' => special(SpecialLiteralKind::CarriageReturn, '\r'),
2761 'v' => special(SpecialLiteralKind::VerticalTab, '\x0B'),
2762 'A' => Ok(Primitive::Assertion(ast::Assertion {
2763 span,
2764 kind: ast::AssertionKind::StartText,
2765 })),
2766 'z' => Ok(Primitive::Assertion(ast::Assertion {
2767 span,
2768 kind: ast::AssertionKind::EndText,
2769 })),
2770 'b' => {
2771 let mut wb = ast::Assertion {
2772 span,
2773 kind: ast::AssertionKind::WordBoundary,
2774 };
2775 if !self.is_eof() && self.char() == '{' {
2778 if let Some(kind) = self.maybe_parse_special_word_boundary(start)? {
2779 wb.kind = kind;
2780 wb.span.end = self.pos();
2781 }
2782 }
2783 Ok(Primitive::Assertion(wb))
2784 }
2785 'B' => Ok(Primitive::Assertion(ast::Assertion {
2786 span,
2787 kind: ast::AssertionKind::NotWordBoundary,
2788 })),
2789 '<' => Ok(Primitive::Assertion(ast::Assertion {
2790 span,
2791 kind: ast::AssertionKind::WordBoundaryStartAngle,
2792 })),
2793 '>' => Ok(Primitive::Assertion(ast::Assertion {
2794 span,
2795 kind: ast::AssertionKind::WordBoundaryEndAngle,
2796 })),
2797 _ => Err(self.error(span, ast::ErrorKind::EscapeUnrecognized)),
2798 }
2799 }
2800
    /// Tries to parse the `{...}` part of a special word boundary such as
    /// `\b{start}`. The parser must be positioned at `{` (asserted).
    ///
    /// Returns `Ok(None)`, with the position rewound to the `{`, when the
    /// first character after the brace is not an ASCII letter or `-`; the
    /// braces are then left for the caller to interpret. Returns
    /// `Ok(Some(kind))` with the parser just past `}` for `start`, `end`,
    /// `start-half` and `end-half`.
    ///
    /// `wb_start` is the position where the enclosing `\b` began; it is used
    /// only for the span of the unexpected-EOF error.
    ///
    /// # Errors
    ///
    /// * `SpecialWordOrRepetitionUnexpectedEof` when input ends after `{`.
    /// * `SpecialWordBoundaryUnclosed` when the name is not followed by `}`.
    /// * `SpecialWordBoundaryUnrecognized` when the name is not one of the
    ///   four known names.
    fn maybe_parse_special_word_boundary(
        &self,
        wb_start: Position,
    ) -> Result<Option<ast::AssertionKind>> {
        assert_eq!(self.char(), '{');

        let is_valid_char = |c| matches!(c, 'A'..='Z' | 'a'..='z' | '-');
        let start = self.pos();
        if !self.bump_and_bump_space() {
            return Err(self.error(
                Span::new(wb_start, self.pos()),
                ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof,
            ));
        }
        let start_contents = self.pos();
        // Not a name: rewind to the `{` and let the caller deal with it.
        if !is_valid_char(self.char()) {
            self.parser().pos.set(start);
            return Ok(None);
        }

        // Collect the name into the parser's shared scratch buffer.
        let mut scratch = self.parser().scratch.borrow_mut();
        scratch.clear();
        while !self.is_eof() && is_valid_char(self.char()) {
            scratch.push(self.char());
            self.bump_and_bump_space();
        }
        if self.is_eof() || self.char() != '}' {
            return Err(self.error(
                Span::new(start, self.pos()),
                ast::ErrorKind::SpecialWordBoundaryUnclosed,
            ));
        }
        let end = self.pos();
        self.bump();
        let kind = match scratch.as_str() {
            "start" => ast::AssertionKind::WordBoundaryStart,
            "end" => ast::AssertionKind::WordBoundaryEnd,
            "start-half" => ast::AssertionKind::WordBoundaryStartHalf,
            "end-half" => ast::AssertionKind::WordBoundaryEndHalf,
            _ => {
                return Err(self.error(
                    Span::new(start_contents, end),
                    ast::ErrorKind::SpecialWordBoundaryUnrecognized,
                ))
            }
        };
        Ok(Some(kind))
    }
2850
2851 #[inline(never)]
2852 fn parse_octal(&self) -> Literal {
2853 assert!(self.parser().octal);
2854 assert!('0' <= self.char() && self.char() <= '7');
2855 let start = self.pos();
2856 while self.bump()
2858 && '0' <= self.char()
2859 && self.char() <= '7'
2860 && self.pos().offset - start.offset <= 2
2861 {}
2862 let end = self.pos();
2863 let octal = &self.pattern()[start.offset..end.offset];
2864 let codepoint = u32::from_str_radix(octal, 8).expect("valid octal number");
2867 let c = char::from_u32(codepoint).expect("Unicode scalar value");
2870 Literal {
2871 span: Span::new(start, end),
2872 kind: LiteralKind::Octal,
2873 c,
2874 }
2875 }
2876
2877 #[inline(never)]
2878 fn parse_hex(&self) -> Result<Literal> {
2879 assert!(self.char() == 'x' || self.char() == 'u' || self.char() == 'U');
2880
2881 let hex_kind = match self.char() {
2882 'x' => HexLiteralKind::X,
2883 'u' => HexLiteralKind::UnicodeShort,
2884 _ => HexLiteralKind::UnicodeLong,
2885 };
2886 if !self.bump_and_bump_space() {
2887 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2888 }
2889 if self.char() == '{' {
2890 self.parse_hex_brace(hex_kind)
2891 } else {
2892 self.parse_hex_digits(hex_kind)
2893 }
2894 }
2895
2896 #[inline(never)]
2897 fn parse_hex_digits(&self, kind: HexLiteralKind) -> Result<Literal> {
2898 let mut scratch = self.parser().scratch.borrow_mut();
2899 scratch.clear();
2900
2901 let start = self.pos();
2902 for i in 0..kind.digits() {
2903 if i > 0 && !self.bump_and_bump_space() {
2904 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2905 }
2906 if !is_hex(self.char()) {
2907 return Err(self.error(self.span_char(), ast::ErrorKind::EscapeHexInvalidDigit));
2908 }
2909 scratch.push(self.char());
2910 }
2911 self.bump_and_bump_space();
2912 let end = self.pos();
2913 let hex = scratch.as_str();
2914 match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
2915 None => Err(self.error(Span::new(start, end), ast::ErrorKind::EscapeHexInvalid)),
2916 Some(c) => Ok(Literal {
2917 span: Span::new(start, end),
2918 kind: LiteralKind::HexFixed(kind),
2919 c,
2920 }),
2921 }
2922 }
2923
2924 #[inline(never)]
2925 fn parse_hex_brace(&self, kind: HexLiteralKind) -> Result<Literal> {
2926 let mut scratch = self.parser().scratch.borrow_mut();
2927 scratch.clear();
2928
2929 let brace_pos = self.pos();
2930 let start = self.span_char().end;
2931 while self.bump_and_bump_space() && self.char() != '}' {
2932 if !is_hex(self.char()) {
2933 return Err(self.error(self.span_char(), ast::ErrorKind::EscapeHexInvalidDigit));
2934 }
2935 scratch.push(self.char());
2936 }
2937 if self.is_eof() {
2938 return Err(self.error(
2939 Span::new(brace_pos, self.pos()),
2940 ast::ErrorKind::EscapeUnexpectedEof,
2941 ));
2942 }
2943 let end = self.pos();
2944 let hex = scratch.as_str();
2945 assert_eq!(self.char(), '}');
2946 self.bump_and_bump_space();
2947
2948 if hex.is_empty() {
2949 return Err(self.error(
2950 Span::new(brace_pos, self.pos()),
2951 ast::ErrorKind::EscapeHexEmpty,
2952 ));
2953 }
2954 match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
2955 None => Err(self.error(Span::new(start, end), ast::ErrorKind::EscapeHexInvalid)),
2956 Some(c) => Ok(Literal {
2957 span: Span::new(start, self.pos()),
2958 kind: LiteralKind::HexBrace(kind),
2959 c,
2960 }),
2961 }
2962 }
2963
2964 fn parse_decimal(&self) -> Result<u32> {
2965 let mut scratch = self.parser().scratch.borrow_mut();
2966 scratch.clear();
2967
2968 while !self.is_eof() && self.char().is_whitespace() {
2969 self.bump();
2970 }
2971 let start = self.pos();
2972 while !self.is_eof() && '0' <= self.char() && self.char() <= '9' {
2973 scratch.push(self.char());
2974 self.bump_and_bump_space();
2975 }
2976 let span = Span::new(start, self.pos());
2977 while !self.is_eof() && self.char().is_whitespace() {
2978 self.bump_and_bump_space();
2979 }
2980 let digits = scratch.as_str();
2981 if digits.is_empty() {
2982 return Err(self.error(span, ast::ErrorKind::DecimalEmpty));
2983 }
2984 match digits.parse::<u32>().ok() {
2985 Some(n) => Ok(n),
2986 None => Err(self.error(span, ast::ErrorKind::DecimalInvalid)),
2987 }
2988 }
2989
    /// Parses a complete bracketed character class, starting at `[`
    /// (asserted) and returning once the outermost class has been closed.
    ///
    /// Nested classes and the binary set operators `&&`, `--` and `~~` are
    /// handled through `push_class_open`, `pop_class` and `push_class_op`;
    /// everything else is parsed as an item or a range.
    ///
    /// # Errors
    ///
    /// Returns the unclosed-class error when the input ends before the
    /// class is closed, and propagates errors from the helpers it calls.
    #[inline(never)]
    fn parse_set_class(&self) -> Result<ClassBracketed> {
        assert_eq!(self.char(), '[');

        let mut union = ClassSetUnion {
            span: self.span(),
            items: vec![],
        };
        loop {
            self.bump_space();
            if self.is_eof() {
                return Err(self.unclosed_class_error());
            }
            match self.char() {
                '[' => {
                    // Only when a class is already open can `[` start an
                    // ASCII class such as `[:alpha:]`; otherwise (or if that
                    // attempt fails) it opens a new class.
                    if !self.parser().stack_class.borrow().is_empty() {
                        if let Some(cls) = self.maybe_parse_ascii_class() {
                            union.push(ClassSetItem::Ascii(cls));
                            continue;
                        }
                    }
                    union = self.push_class_open(union)?;
                }
                ']' => match self.pop_class(union)? {
                    // Closed a nested class: keep building the parent union.
                    Either::Left(nested_union) => {
                        union = nested_union;
                    }
                    // Closed the outermost class: done.
                    Either::Right(class) => return Ok(class),
                },
                '&' if self.peek() == Some('&') => {
                    assert!(self.bump_if("&&"));
                    union = self.push_class_op(ClassSetBinaryOpKind::Intersection, union);
                }
                '-' if self.peek() == Some('-') => {
                    assert!(self.bump_if("--"));
                    union = self.push_class_op(ClassSetBinaryOpKind::Difference, union);
                }
                '~' if self.peek() == Some('~') => {
                    assert!(self.bump_if("~~"));
                    union = self.push_class_op(ClassSetBinaryOpKind::SymmetricDifference, union);
                }
                _ => {
                    union.push(self.parse_set_class_range()?);
                }
            }
        }
    }
3037
3038 #[inline(never)]
3039 fn parse_set_class_range(&self) -> Result<ClassSetItem> {
3040 let prim1 = self.parse_set_class_item()?;
3041 self.bump_space();
3042 if self.is_eof() {
3043 return Err(self.unclosed_class_error());
3044 }
3045 if self.char() != '-' || self.peek_space() == Some(']') || self.peek_space() == Some('-') {
3046 return prim1.into_class_set_item(self);
3047 }
3048 if !self.bump_and_bump_space() {
3049 return Err(self.unclosed_class_error());
3050 }
3051 let prim2 = self.parse_set_class_item()?;
3052 let range = ClassSetRange {
3053 span: Span::new(prim1.span().start, prim2.span().end),
3054 start: prim1.into_class_literal(self)?,
3055 end: prim2.into_class_literal(self)?,
3056 };
3057 if !range.is_valid() {
3058 return Err(self.error(range.span, ast::ErrorKind::ClassRangeInvalid));
3059 }
3060 Ok(ClassSetItem::Range(range))
3061 }
3062
3063 #[inline(never)]
3064 fn parse_set_class_item(&self) -> Result<Primitive> {
3065 if self.char() == '\\' {
3066 self.parse_escape()
3067 } else {
3068 let x = Primitive::Literal(Literal {
3069 span: self.span_char(),
3070 kind: LiteralKind::Verbatim,
3071 c: self.char(),
3072 });
3073 self.bump();
3074 Ok(x)
3075 }
3076 }
3077
3078 #[inline(never)]
3079 fn parse_set_class_open(&self) -> Result<(ClassBracketed, ClassSetUnion)> {
3080 assert_eq!(self.char(), '[');
3081 let start = self.pos();
3082 if !self.bump_and_bump_space() {
3083 return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
3084 }
3085
3086 let negated = if self.char() != '^' {
3087 false
3088 } else {
3089 if !self.bump_and_bump_space() {
3090 return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
3091 }
3092 true
3093 };
3094 let mut union = ClassSetUnion {
3096 span: self.span(),
3097 items: vec![],
3098 };
3099 while self.char() == '-' {
3100 union.push(ClassSetItem::Literal(Literal {
3101 span: self.span_char(),
3102 kind: LiteralKind::Verbatim,
3103 c: '-',
3104 }));
3105 if !self.bump_and_bump_space() {
3106 return Err(self.error(Span::new(start, start), ast::ErrorKind::ClassUnclosed));
3107 }
3108 }
3109 if union.items.is_empty() && self.char() == ']' {
3112 union.push(ClassSetItem::Literal(Literal {
3113 span: self.span_char(),
3114 kind: LiteralKind::Verbatim,
3115 c: ']',
3116 }));
3117 if !self.bump_and_bump_space() {
3118 return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
3119 }
3120 }
3121 let set = ClassBracketed {
3122 span: Span::new(start, self.pos()),
3123 negated,
3124 kind: ClassSet::union(ClassSetUnion {
3125 span: Span::new(union.span.start, union.span.start),
3126 items: vec![],
3127 }),
3128 };
3129 Ok((set, union))
3130 }
3131
3132 #[inline(never)]
3133 fn maybe_parse_ascii_class(&self) -> Option<ClassAscii> {
3134 assert_eq!(self.char(), '[');
3135 let start = self.pos();
3137 let mut negated = false;
3138 if !self.bump() || self.char() != ':' {
3139 self.parser().pos.set(start);
3140 return None;
3141 }
3142 if !self.bump() {
3143 self.parser().pos.set(start);
3144 return None;
3145 }
3146 if self.char() == '^' {
3147 negated = true;
3148 if !self.bump() {
3149 self.parser().pos.set(start);
3150 return None;
3151 }
3152 }
3153 let name_start = self.offset();
3154 while self.char() != ':' && self.bump() {}
3155 if self.is_eof() {
3156 self.parser().pos.set(start);
3157 return None;
3158 }
3159 let name = &self.pattern()[name_start..self.offset()];
3160 if !self.bump_if(":]") {
3161 self.parser().pos.set(start);
3162 return None;
3163 }
3164 let kind = match regex_syntax::ast::ClassAsciiKind::from_name(name) {
3165 Some(kind) => kind,
3166 None => {
3167 self.parser().pos.set(start);
3168 return None;
3169 }
3170 };
3171 Some(ClassAscii {
3172 span: Span::new(start, self.pos()),
3173 kind,
3174 negated,
3175 })
3176 }
3177
3178 #[inline(never)]
3179 fn parse_unicode_class(&self) -> Result<ClassUnicode> {
3180 assert!(self.char() == 'p' || self.char() == 'P');
3181
3182 let mut scratch = self.parser().scratch.borrow_mut();
3183 scratch.clear();
3184
3185 let negated = self.char() == 'P';
3186 if !self.bump_and_bump_space() {
3187 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
3188 }
3189 let (start, kind) = if self.char() == '{' {
3190 let start = self.span_char().end;
3191 while self.bump_and_bump_space() && self.char() != '}' {
3192 scratch.push(self.char());
3193 }
3194 if self.is_eof() {
3195 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
3196 }
3197 assert_eq!(self.char(), '}');
3198 self.bump();
3199
3200 let name = scratch.as_str();
3201 if let Some(i) = name.find("!=") {
3202 (
3203 start,
3204 ClassUnicodeKind::NamedValue {
3205 op: ClassUnicodeOpKind::NotEqual,
3206 name: name[..i].to_string(),
3207 value: name[i + 2..].to_string(),
3208 },
3209 )
3210 } else if let Some(i) = name.find(':') {
3211 (
3212 start,
3213 ClassUnicodeKind::NamedValue {
3214 op: ClassUnicodeOpKind::Colon,
3215 name: name[..i].to_string(),
3216 value: name[i + 1..].to_string(),
3217 },
3218 )
3219 } else if let Some(i) = name.find('=') {
3220 (
3221 start,
3222 ClassUnicodeKind::NamedValue {
3223 op: ClassUnicodeOpKind::Equal,
3224 name: name[..i].to_string(),
3225 value: name[i + 1..].to_string(),
3226 },
3227 )
3228 } else {
3229 (start, ClassUnicodeKind::Named(name.to_string()))
3230 }
3231 } else {
3232 let start = self.pos();
3233 let c = self.char();
3234 if c == '\\' {
3235 return Err(self.error(self.span_char(), ast::ErrorKind::UnicodeClassInvalid));
3236 }
3237 self.bump_and_bump_space();
3238 let kind = ClassUnicodeKind::OneLetter(c);
3239 (start, kind)
3240 };
3241 Ok(ClassUnicode {
3242 span: Span::new(start, self.pos()),
3243 negated,
3244 kind,
3245 })
3246 }
3247
3248 #[inline(never)]
3249 fn parse_perl_class(&self) -> ClassPerl {
3250 let c = self.char();
3251 let span = self.span_char();
3252 self.bump();
3253 let (negated, kind) = match c {
3254 'd' => (false, regex_syntax::ast::ClassPerlKind::Digit),
3255 'D' => (true, regex_syntax::ast::ClassPerlKind::Digit),
3256 's' => (false, regex_syntax::ast::ClassPerlKind::Space),
3257 'S' => (true, regex_syntax::ast::ClassPerlKind::Space),
3258 'w' => (false, regex_syntax::ast::ClassPerlKind::Word),
3259 'W' => (true, regex_syntax::ast::ClassPerlKind::Word),
3260 c => panic!("expected valid Perl class but got '{}'", c),
3261 };
3262 ClassPerl {
3263 span,
3264 kind,
3265 negated,
3266 }
3267 }
3268}
3269
3270fn is_universal_perl_pair(item: ®ex_syntax::ast::ClassSetItem) -> bool {
3273 use regex_syntax::ast::ClassSetItem;
3274 let items = match item {
3275 ClassSetItem::Union(u) => &u.items,
3276 _ => return false,
3277 };
3278 if items.len() != 2 {
3279 return false;
3280 }
3281 match (&items[0], &items[1]) {
3282 (ClassSetItem::Perl(a), ClassSetItem::Perl(b)) => {
3283 let is_all = a.kind == b.kind && a.negated != b.negated;
3284 is_all
3285 }
3286 _ => false,
3287 }
3288}
3289
3290pub fn max_concat_length(ast: &ast::Ast) -> usize {
3291 match ast {
3292 ast::Ast::Empty(_)
3293 | ast::Ast::Flags(_)
3294 | ast::Ast::Literal(_)
3295 | ast::Ast::Dot(_)
3296 | ast::Ast::Top(_)
3297 | ast::Ast::Assertion(_)
3298 | ast::Ast::ClassUnicode(_)
3299 | ast::Ast::ClassPerl(_)
3300 | ast::Ast::ClassBracketed(_) => 0,
3301 ast::Ast::Group(g) => max_concat_length(&g.ast),
3302 ast::Ast::Complement(c) => max_concat_length(&c.ast),
3303 ast::Ast::Lookaround(l) => max_concat_length(&l.ast),
3304 ast::Ast::Repetition(r) => max_concat_length(&r.ast),
3305 ast::Ast::Concat(c) => c
3306 .asts
3307 .len()
3308 .max(c.asts.iter().map(max_concat_length).max().unwrap_or(0)),
3309 ast::Ast::Alternation(a) => a.asts.iter().map(max_concat_length).max().unwrap_or(0),
3310 ast::Ast::Intersection(i) => i.asts.iter().map(max_concat_length).max().unwrap_or(0),
3311 }
3312}
3313
3314pub fn expanded_ast_size(ast: &ast::Ast, limit: u64) -> u64 {
3315 fn go(ast: &ast::Ast, limit: u64) -> u64 {
3316 match ast {
3317 ast::Ast::Empty(_) | ast::Ast::Flags(_) => 1,
3318 ast::Ast::Literal(_) | ast::Ast::Dot(_) | ast::Ast::Top(_) => 1,
3319 ast::Ast::Assertion(_) => 1,
3320 ast::Ast::ClassUnicode(_) | ast::Ast::ClassPerl(_) | ast::Ast::ClassBracketed(_) => 1,
3321 ast::Ast::Group(g) => go(&g.ast, limit).saturating_add(1).min(limit),
3322 ast::Ast::Complement(c) => go(&c.ast, limit).saturating_add(1).min(limit),
3323 ast::Ast::Lookaround(l) => go(&l.ast, limit).saturating_add(1).min(limit),
3324 ast::Ast::Concat(c) => sum_children(&c.asts, limit),
3325 ast::Ast::Alternation(a) => sum_children(&a.asts, limit),
3326 ast::Ast::Intersection(i) => sum_children(&i.asts, limit),
3327 ast::Ast::Repetition(r) => {
3328 let body = go(&r.ast, limit);
3329 let factor: u64 = match &r.op.kind {
3330 ast::RepetitionKind::ZeroOrOne => 2,
3331 ast::RepetitionKind::ZeroOrMore | ast::RepetitionKind::OneOrMore => 2,
3332 ast::RepetitionKind::Range(ast::RepetitionRange::Exactly(n)) => {
3333 (*n as u64).max(1)
3334 }
3335 ast::RepetitionKind::Range(ast::RepetitionRange::AtLeast(n)) => {
3336 (*n as u64).max(1).saturating_add(1)
3337 }
3338 ast::RepetitionKind::Range(ast::RepetitionRange::Bounded(_, m)) => {
3339 (*m as u64).max(1)
3340 }
3341 };
3342 body.saturating_mul(factor).min(limit)
3343 }
3344 }
3345 }
3346 fn sum_children(children: &[ast::Ast], limit: u64) -> u64 {
3347 let mut total: u64 = 0;
3348 for c in children {
3349 total = total.saturating_add(go(c, limit));
3350 if total >= limit {
3351 return limit;
3352 }
3353 }
3354 total
3355 }
3356 go(ast, limit)
3357}
3358
3359pub fn parse_ast<'s>(tb: &mut TB<'s>, pattern: &'s str) -> std::result::Result<NodeId, ParseError> {
3360 let mut p: ResharpParser<'s> = ResharpParser::new(pattern);
3361 p.parse(tb)
3362}
3363
3364pub fn parse_ast_with<'s>(
3365 tb: &mut TB<'s>,
3366 pattern: &'s str,
3367 flags: &PatternFlags,
3368) -> std::result::Result<NodeId, ParseError> {
3369 let mut p: ResharpParser<'s> = ResharpParser::with_flags(pattern, flags);
3370 p.parse(tb)
3371}
3372
3373pub fn parse_to_ast(pattern: &str) -> std::result::Result<ast::Ast, ParseError> {
3375 let mut p: ResharpParser = ResharpParser::new(pattern);
3376 p.parse_inner()
3377}