1#![warn(dead_code)]
6pub mod ast;
7use std::cell::{Cell, RefCell};
8
9use ast::{Ast, Concat, ErrorKind, GroupKind, LookaroundKind, RepetitionKind};
10use regex_syntax::{
11 ast::{
12 ClassAscii, ClassBracketed, ClassPerl, ClassSet, ClassSetBinaryOpKind, ClassSetItem,
13 ClassSetRange, ClassSetUnion, ClassUnicode, ClassUnicodeKind, ClassUnicodeOpKind,
14 HexLiteralKind, Literal, LiteralKind, Position, Span, SpecialLiteralKind,
15 },
16 hir::{
17 self,
18 translate::{Translator, TranslatorBuilder},
19 },
20 utf8::Utf8Sequences,
21};
22use resharp_algebra::{Kind, NodeId};
23
24type TB<'s> = resharp_algebra::RegexBuilder;
25
26pub struct PatternFlags {
28 pub unicode: bool,
30 pub full_unicode: bool,
32 pub case_insensitive: bool,
34 pub dot_matches_new_line: bool,
36 pub multiline: bool,
38 pub ignore_whitespace: bool,
40 pub ascii_perl_classes: bool,
43 pub expanded_ast_limit: u64,
46 pub max_list_len: usize,
49 pub max_repeat: u32,
51 pub max_depth: usize,
52}
53
54pub const DEFAULT_MAX_REPEAT: u32 = 500;
57pub const DEFAULT_EXPANDED_AST_LIMIT: u64 = 50_000;
58pub const DEFAULT_MAX_LIST_LEN: usize = 4_000;
59pub const DEFAULT_MAX_DEPTH: usize = 1_000;
60
61impl Default for PatternFlags {
62 fn default() -> Self {
63 Self {
64 unicode: true,
65 full_unicode: false,
66 case_insensitive: false,
67 dot_matches_new_line: false,
68 multiline: true,
69 ignore_whitespace: false,
70 ascii_perl_classes: false,
71 expanded_ast_limit: DEFAULT_EXPANDED_AST_LIMIT,
72 max_list_len: DEFAULT_MAX_LIST_LEN,
73 max_repeat: DEFAULT_MAX_REPEAT,
74 max_depth: DEFAULT_MAX_DEPTH,
75 }
76 }
77}
78
79#[derive(Clone, Copy, PartialEq, Debug)]
80enum WordCharKind {
81 Word,
82 NonWord,
83 MaybeWord,
84 MaybeNonWord,
85 Unknown,
86 Edge,
87}
88
89fn is_word_byte(b: u8) -> bool {
90 b.is_ascii_alphanumeric() || b == b'_'
91}
92
93fn class_set_item_word_kind(item: ®ex_syntax::ast::ClassSetItem) -> WordCharKind {
94 use regex_syntax::ast::{ClassPerlKind, ClassSetItem};
95 use WordCharKind::*;
96 match item {
97 ClassSetItem::Empty(_) => Unknown,
98 ClassSetItem::Literal(l) => {
99 if is_word_byte(l.c as u8) {
100 Word
101 } else {
102 NonWord
103 }
104 }
105 ClassSetItem::Range(r) => {
106 let all_word = (r.start.c as u8..=r.end.c as u8).all(is_word_byte);
107 let all_non = (r.start.c as u8..=r.end.c as u8).all(|b| !is_word_byte(b));
108 if all_word {
109 Word
110 } else if all_non {
111 NonWord
112 } else {
113 Unknown
114 }
115 }
116 ClassSetItem::Perl(p) => match (&p.kind, p.negated) {
117 (ClassPerlKind::Word, false) => Word,
118 (ClassPerlKind::Word, true) => NonWord,
119 (ClassPerlKind::Space, false) => NonWord,
120 (ClassPerlKind::Digit, false) => Word,
121 _ => Unknown,
122 },
123 ClassSetItem::Bracketed(b) => class_bracketed_word_kind(b),
124 ClassSetItem::Union(u) => {
125 let mut kind = Unknown;
126 for item in &u.items {
127 let k = class_set_item_word_kind(item);
128 kind = match (kind, k) {
129 (_, Unknown) => return Unknown,
130 (Unknown, _) => k,
131 (Word, Word) => Word,
132 (NonWord, NonWord) => NonWord,
133 _ => return Unknown,
134 };
135 }
136 kind
137 }
138 _ => Unknown,
139 }
140}
141
142fn class_bracketed_word_kind(c: ®ex_syntax::ast::ClassBracketed) -> WordCharKind {
143 use regex_syntax::ast::{ClassPerlKind, ClassSet, ClassSetItem};
144 use WordCharKind::*;
145 if c.negated {
146 return match &c.kind {
147 ClassSet::Item(ClassSetItem::Perl(p)) if p.kind == ClassPerlKind::Word => {
148 if p.negated {
149 Word
150 } else {
151 NonWord
152 }
153 }
154 _ => Unknown,
155 };
156 }
157 match &c.kind {
158 ClassSet::Item(item) => class_set_item_word_kind(item),
159 ClassSet::BinaryOp(_) => Unknown,
160 }
161}
162
163fn ascii_class_lit(span: Span, c: char) -> regex_syntax::ast::Literal {
164 regex_syntax::ast::Literal {
165 span,
166 kind: regex_syntax::ast::LiteralKind::Verbatim,
167 c,
168 }
169}
170
171fn ascii_class_range(span: Span, a: char, b: char) -> regex_syntax::ast::ClassSetItem {
172 regex_syntax::ast::ClassSetItem::Range(regex_syntax::ast::ClassSetRange {
173 span,
174 start: ascii_class_lit(span, a),
175 end: ascii_class_lit(span, b),
176 })
177}
178
179fn ascii_perl_positive(
180 span: Span,
181 kind: ®ex_syntax::ast::ClassPerlKind,
182) -> regex_syntax::ast::ClassSetItem {
183 use regex_syntax::ast::{ClassPerlKind, ClassSetItem, ClassSetUnion};
184 match kind {
185 ClassPerlKind::Digit => ascii_class_range(span, '0', '9'),
186 ClassPerlKind::Word => ClassSetItem::Union(ClassSetUnion {
187 span,
188 items: vec![
189 ascii_class_range(span, 'a', 'z'),
190 ascii_class_range(span, 'A', 'Z'),
191 ascii_class_range(span, '0', '9'),
192 ClassSetItem::Literal(ascii_class_lit(span, '_')),
193 ],
194 }),
195 ClassPerlKind::Space => ClassSetItem::Union(ClassSetUnion {
196 span,
197 items: ['\t', '\n', '\x0B', '\x0C', '\r', ' ']
198 .into_iter()
199 .map(|c| ClassSetItem::Literal(ascii_class_lit(span, c)))
200 .collect(),
201 }),
202 }
203}
204
205fn ascii_perl_set_item(
206 span: Span,
207 kind: ®ex_syntax::ast::ClassPerlKind,
208 negated: bool,
209) -> regex_syntax::ast::ClassSetItem {
210 use regex_syntax::ast::{ClassBracketed, ClassSet, ClassSetItem};
211 let positive = ascii_perl_positive(span, kind);
212 if negated {
213 ClassSetItem::Bracketed(Box::new(ClassBracketed {
214 span,
215 negated: true,
216 kind: ClassSet::Item(positive),
217 }))
218 } else {
219 positive
220 }
221}
222
223fn rewrite_ascii_perl_set(set: ®ex_syntax::ast::ClassSet) -> regex_syntax::ast::ClassSet {
224 use regex_syntax::ast::{ClassSet, ClassSetBinaryOp};
225 match set {
226 ClassSet::Item(item) => ClassSet::Item(rewrite_ascii_perl_item(item)),
227 ClassSet::BinaryOp(op) => ClassSet::BinaryOp(ClassSetBinaryOp {
228 span: op.span,
229 kind: op.kind.clone(),
230 lhs: Box::new(rewrite_ascii_perl_set(&op.lhs)),
231 rhs: Box::new(rewrite_ascii_perl_set(&op.rhs)),
232 }),
233 }
234}
235
236fn rewrite_ascii_perl_item(
237 item: ®ex_syntax::ast::ClassSetItem,
238) -> regex_syntax::ast::ClassSetItem {
239 use regex_syntax::ast::{ClassBracketed, ClassSetItem, ClassSetUnion};
240 match item {
241 ClassSetItem::Perl(p) => ascii_perl_set_item(p.span, &p.kind, p.negated),
242 ClassSetItem::Union(u) => ClassSetItem::Union(ClassSetUnion {
243 span: u.span,
244 items: u.items.iter().map(rewrite_ascii_perl_item).collect(),
245 }),
246 ClassSetItem::Bracketed(b) => ClassSetItem::Bracketed(Box::new(ClassBracketed {
247 span: b.span,
248 negated: b.negated,
249 kind: rewrite_ascii_perl_set(&b.kind),
250 })),
251 other => other.clone(),
252 }
253}
254
255#[derive(Clone, Debug, Eq, PartialEq)]
256enum Primitive {
257 Literal(Literal),
258 Assertion(ast::Assertion),
259 Dot(Span),
260 Top(Span),
261 Perl(ClassPerl),
262 Unicode(ClassUnicode),
263}
264
265impl Primitive {
266 fn span(&self) -> &Span {
267 match *self {
268 Primitive::Literal(ref x) => &x.span,
269 Primitive::Assertion(ref x) => &x.span,
270 Primitive::Dot(ref span) => span,
271 Primitive::Top(ref span) => span,
272 Primitive::Perl(ref x) => &x.span,
273 Primitive::Unicode(ref x) => &x.span,
274 }
275 }
276
277 fn into_ast(self) -> Ast {
278 match self {
279 Primitive::Literal(lit) => Ast::literal(lit),
280 Primitive::Assertion(assert) => Ast::assertion(assert),
281 Primitive::Dot(span) => Ast::dot(span),
282 Primitive::Top(span) => Ast::top(span),
283 Primitive::Perl(cls) => Ast::class_perl(cls),
284 Primitive::Unicode(cls) => Ast::class_unicode(cls),
285 }
286 }
287
288 fn into_class_set_item(self, p: &ResharpParser) -> Result<regex_syntax::ast::ClassSetItem> {
289 use self::Primitive::*;
290 use regex_syntax::ast::ClassSetItem;
291
292 match self {
293 Literal(lit) => Ok(ClassSetItem::Literal(lit)),
294 Perl(cls) => Ok(ClassSetItem::Perl(cls)),
295 Unicode(cls) => Ok(ClassSetItem::Unicode(cls)),
296 x => Err(p.error(*x.span(), ast::ErrorKind::ClassEscapeInvalid)),
297 }
298 }
299
300 fn into_class_literal(self, p: &ResharpParser) -> Result<Literal> {
301 use self::Primitive::*;
302
303 match self {
304 Literal(lit) => Ok(lit),
305 x => Err(p.error(*x.span(), ast::ErrorKind::ClassRangeLiteral)),
306 }
307 }
308}
309
310#[derive(Clone, Debug, Eq, PartialEq)]
311pub enum Either<Left, Right> {
312 Left(Left),
313 Right(Right),
314}
315
316#[derive(Clone, Debug, Eq, PartialEq)]
317pub struct ParseError {
318 pub kind: ErrorKind,
320 pattern: String,
323 pub span: Span,
325}
326
327impl std::fmt::Display for ParseError {
328 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
329 write!(f, "{:?}: {:?}", self.kind, self.span)
330 }
331}
332impl std::error::Error for ParseError {}
333
334type Result<T> = core::result::Result<T, ParseError>;
335
336#[derive(Clone, Debug)]
337enum GroupState {
338 Group {
340 concat: Concat,
342 group: ast::Group,
344 ignore_whitespace: bool,
346 },
347 Alternation(ast::Alternation),
348 Intersection(ast::Intersection),
349}
350
351#[derive(Clone, Debug)]
352enum ClassState {
353 Open {
355 union: regex_syntax::ast::ClassSetUnion,
357 set: regex_syntax::ast::ClassBracketed,
358 },
359 Op {
362 kind: regex_syntax::ast::ClassSetBinaryOpKind,
364 lhs: regex_syntax::ast::ClassSet,
366 },
367}
368
369pub struct ResharpParser<'s> {
371 perl_classes: Vec<(bool, regex_syntax::ast::ClassPerlKind, NodeId)>,
372 unicode_classes: resharp_algebra::UnicodeClassCache,
373 pub translator: regex_syntax::hir::translate::Translator,
374 pub pattern: &'s str,
375 pos: Cell<Position>,
376 capture_index: Cell<u32>,
377 octal: bool,
378 empty_min_range: bool,
379 ignore_whitespace: Cell<bool>,
380 dot_all: Cell<bool>,
381 multiline: Cell<bool>,
382 global_unicode: bool,
383 global_full_unicode: bool,
384 global_ascii_perl: bool,
385 global_case_insensitive: bool,
386 expanded_ast_limit: u64,
387 max_list_len: usize,
388 max_repeat: u32,
389 max_depth: usize,
390 comments: RefCell<Vec<ast::Comment>>,
391 stack_group: RefCell<Vec<GroupState>>,
392 stack_class: RefCell<Vec<ClassState>>,
393 capture_names: RefCell<Vec<ast::CaptureName>>,
394 scratch: RefCell<String>,
395}
396
397fn specialize_err<T>(result: Result<T>, from: ast::ErrorKind, to: ast::ErrorKind) -> Result<T> {
398 result.map_err(|e| {
399 if e.kind == from {
400 ParseError {
401 kind: to,
402 pattern: e.pattern,
403 span: e.span,
404 }
405 } else {
406 e
407 }
408 })
409}
410
411fn is_capture_char(c: char, first: bool) -> bool {
412 if first {
413 c == '_' || c.is_alphabetic()
414 } else {
415 c == '_' || c == '.' || c == '[' || c == ']' || c.is_alphanumeric()
416 }
417}
418
419pub fn is_meta_character(c: char) -> bool {
420 matches!(
421 c,
422 '\\' | '.'
423 | '+'
424 | '*'
425 | '?'
426 | '('
427 | ')'
428 | '|'
429 | '['
430 | ']'
431 | '{'
432 | '}'
433 | '^'
434 | '$'
435 | '#'
436 | '&'
437 | '-'
438 | '~'
439 | '_'
440 )
441}
442
443pub fn escape(text: &str) -> String {
445 let mut buf = String::new();
446 escape_into(text, &mut buf);
447 buf
448}
449
450pub fn escape_into(text: &str, buf: &mut String) {
452 buf.reserve(text.len());
453 for c in text.chars() {
454 if is_meta_character(c) {
455 buf.push('\\');
456 }
457 buf.push(c);
458 }
459}
460
461pub fn is_escapeable_character(c: char) -> bool {
462 if is_meta_character(c) {
463 return true;
464 }
465 if !c.is_ascii() {
466 return false;
467 }
468 match c {
469 '0'..='9' | 'A'..='Z' | 'a'..='z' => false,
470 '<' | '>' => false,
471 _ => true,
472 }
473}
474
475fn is_hex(c: char) -> bool {
476 c.is_ascii_digit() || ('a'..='f').contains(&c) || ('A'..='F').contains(&c)
477}
478
479fn ensure_lookbehind_at_start(ast: &Ast, at_start: bool) -> core::result::Result<(), Span> {
480 match ast {
481 Ast::Concat(c) => {
482 let mut child_at_start = at_start;
483 for child in &c.asts {
484 ensure_lookbehind_at_start(child, child_at_start)?;
485 if ast_may_consume(child) {
486 child_at_start = false;
487 }
488 }
489 Ok(())
490 }
491 Ast::Alternation(a) => {
492 for child in &a.asts {
493 ensure_lookbehind_at_start(child, at_start)?;
494 }
495 Ok(())
496 }
497 Ast::Intersection(i) => {
498 for child in &i.asts {
499 ensure_lookbehind_at_start(child, at_start)?;
500 }
501 Ok(())
502 }
503 Ast::Complement(c) => ensure_lookbehind_at_start(&c.ast, at_start),
504 Ast::Group(g) => ensure_lookbehind_at_start(&g.ast, at_start),
505 Ast::Repetition(r) => ensure_lookbehind_at_start(&r.ast, at_start),
506 Ast::Lookaround(g) => {
507 match g.kind {
508 LookaroundKind::PositiveLookbehind | LookaroundKind::NegativeLookbehind => {
509 if !at_start {
510 return Err(g.span);
511 }
512 }
513 LookaroundKind::PositiveLookahead | LookaroundKind::NegativeLookahead => {}
514 }
515 ensure_lookbehind_at_start(&g.ast, true)
516 }
517 Ast::Empty(_)
518 | Ast::Flags(_)
519 | Ast::Literal(_)
520 | Ast::Dot(_)
521 | Ast::Top(_)
522 | Ast::Assertion(_)
523 | Ast::ClassUnicode(_)
524 | Ast::ClassPerl(_)
525 | Ast::ClassBracketed(_) => Ok(()),
526 }
527}
528
529fn ast_may_consume(ast: &Ast) -> bool {
530 match ast {
531 Ast::Empty(_) | Ast::Flags(_) | Ast::Assertion(_) | Ast::Lookaround(_) => false,
532 Ast::Literal(_)
533 | Ast::Dot(_)
534 | Ast::Top(_)
535 | Ast::ClassUnicode(_)
536 | Ast::ClassPerl(_)
537 | Ast::ClassBracketed(_) => true,
538 Ast::Group(g) => ast_may_consume(&g.ast),
539 Ast::Repetition(r) => {
540 if !ast_may_consume(&r.ast) {
541 return false;
542 }
543 match r.op.kind {
544 RepetitionKind::ZeroOrOne
545 | RepetitionKind::ZeroOrMore
546 | RepetitionKind::OneOrMore => true,
547 RepetitionKind::Range(ast::RepetitionRange::Exactly(0)) => false,
548 RepetitionKind::Range(ast::RepetitionRange::Bounded(_, 0)) => false,
549 RepetitionKind::Range(_) => true,
550 }
551 }
552 Ast::Alternation(a) => a.asts.iter().any(ast_may_consume),
553 Ast::Intersection(i) => i.asts.iter().any(ast_may_consume),
554 Ast::Complement(_) => true,
555 Ast::Concat(c) => c.asts.iter().any(ast_may_consume),
556 }
557}
558
559impl<'s> ResharpParser<'s> {
560 fn default_translator_builder(&self) -> TranslatorBuilder {
561 let mut trb = TranslatorBuilder::new();
562 trb.unicode(self.global_unicode);
563 trb.utf8(false);
564 trb.case_insensitive(self.global_case_insensitive);
565 trb
566 }
567
568 pub fn new(pattern: &'s str) -> Self {
569 Self::with_flags(pattern, &PatternFlags::default())
570 }
571
572 pub fn with_flags(pattern: &'s str, flags: &PatternFlags) -> Self {
573 let mut trb = TranslatorBuilder::new();
574 trb.unicode(flags.unicode);
575 trb.utf8(false);
576 trb.case_insensitive(flags.case_insensitive);
577 Self {
578 translator: trb.build(),
579 pattern,
580 perl_classes: vec![],
581 unicode_classes: resharp_algebra::UnicodeClassCache::default(),
582 pos: Cell::new(Position::new(0, 0, 0)),
583 capture_index: Cell::new(0),
584 octal: false,
585 empty_min_range: false,
586 ignore_whitespace: Cell::new(flags.ignore_whitespace),
587 dot_all: Cell::new(flags.dot_matches_new_line),
588 multiline: Cell::new(flags.multiline),
589 global_unicode: flags.unicode || flags.full_unicode || flags.ascii_perl_classes,
590 global_full_unicode: flags.full_unicode,
591 global_ascii_perl: flags.ascii_perl_classes,
592 global_case_insensitive: flags.case_insensitive,
593 expanded_ast_limit: flags.expanded_ast_limit,
594 max_list_len: flags.max_list_len,
595 max_repeat: flags.max_repeat,
596 max_depth: flags.max_depth,
597 comments: RefCell::new(vec![]),
598 stack_group: RefCell::new(vec![]),
599 stack_class: RefCell::new(vec![]),
600 capture_names: RefCell::new(vec![]),
601 scratch: RefCell::new(String::new()),
602 }
603 }
604
605 fn parser(&'_ self) -> &'_ ResharpParser<'_> {
606 self
607 }
608
609 fn pattern(&self) -> &str {
610 self.pattern
611 }
612
613 fn error(&self, span: Span, kind: ast::ErrorKind) -> ParseError {
614 ParseError {
615 kind,
616 pattern: self.pattern().to_string(),
617 span,
618 }
619 }
620
621 fn unsupported_error(&self, _: regex_syntax::hir::Error) -> ParseError {
622 self.error(
623 Span::splat(self.pos()),
624 ast::ErrorKind::UnsupportedResharpRegex,
625 )
626 }
627
628 fn offset(&self) -> usize {
629 self.parser().pos.get().offset
630 }
631
632 fn line(&self) -> usize {
633 self.parser().pos.get().line
634 }
635
636 fn column(&self) -> usize {
637 self.parser().pos.get().column
638 }
639
640 fn next_capture_index(&self, span: Span) -> Result<u32> {
641 let current = self.parser().capture_index.get();
642 let i = current
643 .checked_add(1)
644 .ok_or_else(|| self.error(span, ast::ErrorKind::CaptureLimitExceeded))?;
645 self.parser().capture_index.set(i);
646 Ok(i)
647 }
648
649 fn add_capture_name(&self, cap: &ast::CaptureName) -> Result<()> {
650 let mut names = self.parser().capture_names.borrow_mut();
651 match names.binary_search_by_key(&cap.name.as_str(), |c| c.name.as_str()) {
652 Err(i) => {
653 names.insert(i, cap.clone());
654 Ok(())
655 }
656 Ok(i) => Err(self.error(
657 cap.span,
658 ast::ErrorKind::GroupNameDuplicate {
659 original: names[i].span,
660 },
661 )),
662 }
663 }
664
665 fn ignore_whitespace(&self) -> bool {
666 self.parser().ignore_whitespace.get()
667 }
668
669 fn char(&self) -> char {
670 self.char_at(self.offset())
671 }
672
673 fn char_at(&self, i: usize) -> char {
674 self.pattern()[i..]
675 .chars()
676 .next()
677 .unwrap_or_else(|| panic!("expected char at offset {}", i))
678 }
679
680 fn bump(&self) -> bool {
681 if self.is_eof() {
682 return false;
683 }
684 let Position {
685 mut offset,
686 mut line,
687 mut column,
688 } = self.pos();
689 if self.char() == '\n' {
690 line = line.checked_add(1).unwrap();
691 column = 1;
692 } else {
693 column = column.checked_add(1).unwrap();
694 }
695 offset += self.char().len_utf8();
696 self.parser().pos.set(Position {
697 offset,
698 line,
699 column,
700 });
701 self.pattern()[self.offset()..].chars().next().is_some()
702 }
703
704 fn bump_if(&self, prefix: &str) -> bool {
705 if self.pattern()[self.offset()..].starts_with(prefix) {
706 for _ in 0..prefix.chars().count() {
707 self.bump();
708 }
709 true
710 } else {
711 false
712 }
713 }
714
715 fn is_lookaround_prefix(&self) -> Option<(bool, bool)> {
716 if self.bump_if("?=") {
717 return Some((true, true));
718 }
719 if self.bump_if("?!") {
720 return Some((true, false));
721 }
722 if self.bump_if("?<=") {
723 return Some((false, true));
724 }
725 if self.bump_if("?<!") {
726 return Some((false, false));
727 }
728 None
729 }
730
731 fn bump_and_bump_space(&self) -> bool {
732 if !self.bump() {
733 return false;
734 }
735 self.bump_space();
736 !self.is_eof()
737 }
738
739 fn bump_space(&self) {
740 if !self.ignore_whitespace() {
741 return;
742 }
743 while !self.is_eof() {
744 if self.char().is_whitespace() {
745 self.bump();
746 } else if self.char() == '#' {
747 let start = self.pos();
748 let mut comment_text = String::new();
749 self.bump();
750 while !self.is_eof() {
751 let c = self.char();
752 self.bump();
753 if c == '\n' {
754 break;
755 }
756 comment_text.push(c);
757 }
758 let comment = ast::Comment {
759 span: Span::new(start, self.pos()),
760 comment: comment_text,
761 };
762 self.parser().comments.borrow_mut().push(comment);
763 } else {
764 break;
765 }
766 }
767 }
768
769 fn peek(&self) -> Option<char> {
770 if self.is_eof() {
771 return None;
772 }
773 self.pattern()[self.offset() + self.char().len_utf8()..]
774 .chars()
775 .next()
776 }
777
778 fn peek_space(&self) -> Option<char> {
781 if !self.ignore_whitespace() {
782 return self.peek();
783 }
784 if self.is_eof() {
785 return None;
786 }
787 let mut start = self.offset() + self.char().len_utf8();
788 let mut in_comment = false;
789 for (i, c) in self.pattern()[start..].char_indices() {
790 if c.is_whitespace() {
791 continue;
792 } else if !in_comment && c == '#' {
793 in_comment = true;
794 } else if in_comment && c == '\n' {
795 in_comment = false;
796 } else {
797 start += i;
798 break;
799 }
800 }
801 self.pattern()[start..].chars().next()
802 }
803
804 fn is_eof(&self) -> bool {
805 self.offset() == self.pattern().len()
806 }
807
808 fn pos(&self) -> Position {
809 self.parser().pos.get()
810 }
811
812 fn span(&self) -> Span {
813 Span::splat(self.pos())
814 }
815
816 fn span_char(&self) -> Span {
817 let mut next = Position {
818 offset: self.offset().checked_add(self.char().len_utf8()).unwrap(),
819 line: self.line(),
820 column: self.column().checked_add(1).unwrap(),
821 };
822 if self.char() == '\n' {
823 next.line += 1;
824 next.column = 1;
825 }
826 Span::new(self.pos(), next)
827 }
828
829 #[inline(never)]
830 fn push_alternate(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
831 assert_eq!(self.char(), '|');
832 concat.span.end = self.pos();
833 self.push_or_add_alternation(concat);
834 self.bump();
835 Ok(ast::Concat {
836 span: self.span(),
837 asts: vec![],
838 })
839 }
840
841 fn push_or_add_alternation(&self, concat: Concat) {
842 use self::GroupState::*;
843
844 let mut stack = self.parser().stack_group.borrow_mut();
845 if let Some(&mut Alternation(ref mut alts)) = stack.last_mut() {
846 alts.asts.push(concat.into_ast());
847 return;
848 }
849 stack.push(Alternation(ast::Alternation {
850 span: Span::new(concat.span.start, self.pos()),
851 asts: vec![concat.into_ast()],
852 }));
853 }
854
855 #[inline(never)]
856 fn push_intersect(&self, mut concat: Concat) -> Result<Concat> {
857 assert_eq!(self.char(), '&');
858 concat.span.end = self.pos();
859 self.push_or_add_intersect(concat);
860 self.bump();
861 Ok(Concat {
862 span: self.span(),
863 asts: vec![],
864 })
865 }
866
867 fn push_or_add_intersect(&self, concat: Concat) {
868 use self::GroupState::*;
869
870 let mut stack = self.parser().stack_group.borrow_mut();
871 if let Some(&mut Intersection(ref mut alts)) = stack.last_mut() {
872 alts.asts.push(concat.into_ast());
873 return;
874 }
875 stack.push(Intersection(ast::Intersection {
876 span: Span::new(concat.span.start, self.pos()),
877 asts: vec![concat.into_ast()],
878 }));
879 }
880
881 #[inline(never)]
882 fn push_group(&self, mut concat: Concat) -> Result<Concat> {
883 assert_eq!(self.char(), '(');
884 match self.parse_group()? {
885 Either::Left(set) => {
886 let ignore = set.flags.flag_state(ast::Flag::IgnoreWhitespace);
887 if let Some(v) = ignore {
888 self.parser().ignore_whitespace.set(v);
889 }
890
891 concat.asts.push(Ast::flags(set));
892 Ok(concat)
893 }
894 Either::Right(group) => {
895 let old_ignore_whitespace = self.ignore_whitespace();
896 let new_ignore_whitespace = group
897 .flags()
898 .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
899 .unwrap_or(old_ignore_whitespace);
900 self.parser()
901 .stack_group
902 .borrow_mut()
903 .push(GroupState::Group {
904 concat,
905 group,
906 ignore_whitespace: old_ignore_whitespace,
907 });
908 self.parser().ignore_whitespace.set(new_ignore_whitespace);
909 Ok(Concat {
910 span: self.span(),
911 asts: vec![],
912 })
913 }
914 }
915 }
916
917 #[inline(never)]
918 fn push_compl_group(&self, concat: Concat) -> Result<Concat> {
919 assert_eq!(self.char(), '~');
920 self.bump();
921 if self.is_eof() || self.char() != '(' {
922 return Err(self.error(self.span(), ast::ErrorKind::ComplementGroupExpected));
923 }
924 let open_span = self.span_char();
925 self.bump();
926 let group = ast::Group {
927 span: open_span,
928 kind: ast::GroupKind::Complement,
929 ast: Box::new(Ast::empty(self.span())),
930 };
931
932 let old_ignore_whitespace = self.ignore_whitespace();
933 let new_ignore_whitespace = group
934 .flags()
935 .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
936 .unwrap_or(old_ignore_whitespace);
937 self.parser()
938 .stack_group
939 .borrow_mut()
940 .push(GroupState::Group {
941 concat,
942 group,
943 ignore_whitespace: old_ignore_whitespace,
944 });
945 self.parser().ignore_whitespace.set(new_ignore_whitespace);
946 Ok(Concat {
947 span: self.span(),
948 asts: vec![],
949 })
950 }
951
952 #[inline(never)]
953 fn pop_group(&self, mut group_concat: Concat) -> Result<Concat> {
954 use self::GroupState::*;
955 assert_eq!(self.char(), ')');
956 let mut stack = self.parser().stack_group.borrow_mut();
957 let topstack = stack.pop();
958
959 let (mut prior_concat, mut group, ignore_whitespace, alt) = match topstack {
960 Some(Group {
961 concat,
962 group,
963 ignore_whitespace,
964 }) => (concat, group, ignore_whitespace, None),
965 Some(Alternation(alt)) => match stack.pop() {
966 Some(Group {
967 concat,
968 group,
969 ignore_whitespace,
970 }) => (
971 concat,
972 group,
973 ignore_whitespace,
974 Some(Either::Left::<ast::Alternation, ast::Intersection>(alt)),
975 ),
976 None | Some(Alternation(_)) | Some(Intersection(_)) => {
977 return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
978 }
979 },
980 Some(Intersection(int)) => match stack.pop() {
981 Some(Group {
982 concat,
983 group,
984 ignore_whitespace,
985 }) => (
986 concat,
987 group,
988 ignore_whitespace,
989 Some(Either::Right::<ast::Alternation, ast::Intersection>(int)),
990 ),
991 None | Some(Alternation(_)) | Some(Intersection(_)) => {
992 return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
993 }
994 },
995
996 None => {
997 return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
998 }
999 };
1000 self.parser().ignore_whitespace.set(ignore_whitespace);
1001 group_concat.span.end = self.pos();
1002 self.bump();
1003 group.span.end = self.pos();
1004 match alt {
1005 Some(Either::Left(mut alt)) => {
1006 alt.span.end = group_concat.span.end;
1007 alt.asts.push(group_concat.into_ast());
1008 group.ast = Box::new(alt.into_ast());
1009 }
1010 Some(Either::Right(mut int)) => {
1011 int.span.end = group_concat.span.end;
1012 int.asts.push(group_concat.into_ast());
1013 group.ast = Box::new(int.into_ast());
1014 }
1015 None => {
1016 group.ast = Box::new(group_concat.into_ast());
1017 }
1018 }
1019
1020 if group.kind == GroupKind::Complement {
1021 let complement = ast::Complement {
1022 span: self.span(),
1023 ast: group.ast,
1024 };
1025 prior_concat.asts.push(Ast::complement(complement));
1026 }
1027 else {
1029 prior_concat.asts.push(Ast::group(group));
1030 }
1031 Ok(prior_concat)
1032 }
1033
1034 #[inline(never)]
1035 fn pop_group_end(&self, mut concat: ast::Concat) -> Result<Ast> {
1036 concat.span.end = self.pos();
1037 let mut stack = self.parser().stack_group.borrow_mut();
1038 let ast = match stack.pop() {
1039 None => Ok(concat.into_ast()),
1040 Some(GroupState::Alternation(mut alt)) => {
1041 alt.span.end = self.pos();
1042 alt.asts.push(concat.into_ast());
1043 Ok(Ast::alternation(alt))
1044 }
1045 Some(GroupState::Intersection(mut int)) => {
1046 int.span.end = self.pos();
1047 int.asts.push(concat.into_ast());
1048
1049 Ok(Ast::intersection(int))
1050 }
1051 Some(GroupState::Group { group, .. }) => {
1052 return Err(self.error(group.span, ast::ErrorKind::GroupUnclosed));
1053 }
1054 };
1055 match stack.pop() {
1057 None => ast,
1058 Some(GroupState::Alternation(alt)) => {
1059 Err(self.error(alt.span, ast::ErrorKind::UnsupportedResharpRegex))
1060 }
1061 Some(GroupState::Intersection(int)) => {
1062 Err(self.error(int.span, ast::ErrorKind::UnsupportedResharpRegex))
1063 }
1064 Some(GroupState::Group { group, .. }) => {
1065 Err(self.error(group.span, ast::ErrorKind::GroupUnclosed))
1066 }
1067 }
1068 }
1069
1070 #[inline(never)]
1071 fn push_class_open(
1072 &self,
1073 parent_union: regex_syntax::ast::ClassSetUnion,
1074 ) -> Result<regex_syntax::ast::ClassSetUnion> {
1075 assert_eq!(self.char(), '[');
1076
1077 let (nested_set, nested_union) = self.parse_set_class_open()?;
1078 self.parser()
1079 .stack_class
1080 .borrow_mut()
1081 .push(ClassState::Open {
1082 union: parent_union,
1083 set: nested_set,
1084 });
1085 Ok(nested_union)
1086 }
1087
1088 #[inline(never)]
1089 fn pop_class(
1090 &self,
1091 nested_union: regex_syntax::ast::ClassSetUnion,
1092 ) -> Result<Either<regex_syntax::ast::ClassSetUnion, regex_syntax::ast::ClassBracketed>> {
1093 assert_eq!(self.char(), ']');
1094
1095 let item = regex_syntax::ast::ClassSet::Item(nested_union.into_item());
1096 let prevset = self.pop_class_op(item);
1097 let mut stack = self.parser().stack_class.borrow_mut();
1098 match stack.pop() {
1099 None => panic!("unexpected empty character class stack"),
1100 Some(ClassState::Op { .. }) => panic!("unexpected ClassState::Op"),
1101 Some(ClassState::Open { mut union, mut set }) => {
1102 self.bump();
1103 set.span.end = self.pos();
1104 set.kind = prevset;
1105 if stack.is_empty() {
1106 Ok(Either::Right(set))
1107 } else {
1108 union.push(regex_syntax::ast::ClassSetItem::Bracketed(Box::new(set)));
1109 Ok(Either::Left(union))
1110 }
1111 }
1112 }
1113 }
1114
1115 #[inline(never)]
1116 fn unclosed_class_error(&self) -> ParseError {
1117 for state in self.parser().stack_class.borrow().iter().rev() {
1118 if let ClassState::Open { ref set, .. } = *state {
1119 return self.error(set.span, ast::ErrorKind::ClassUnclosed);
1120 }
1121 }
1122 panic!("no open character class found")
1123 }
1124
1125 #[inline(never)]
1126 fn push_class_op(
1127 &self,
1128 next_kind: regex_syntax::ast::ClassSetBinaryOpKind,
1129 next_union: regex_syntax::ast::ClassSetUnion,
1130 ) -> regex_syntax::ast::ClassSetUnion {
1131 let item = regex_syntax::ast::ClassSet::Item(next_union.into_item());
1132 let new_lhs = self.pop_class_op(item);
1133 self.parser().stack_class.borrow_mut().push(ClassState::Op {
1134 kind: next_kind,
1135 lhs: new_lhs,
1136 });
1137 regex_syntax::ast::ClassSetUnion {
1138 span: self.span(),
1139 items: vec![],
1140 }
1141 }
1142
1143 #[inline(never)]
1144 fn pop_class_op(&self, rhs: regex_syntax::ast::ClassSet) -> regex_syntax::ast::ClassSet {
1145 let mut stack = self.parser().stack_class.borrow_mut();
1146 let (kind, lhs) = match stack.pop() {
1147 Some(ClassState::Op { kind, lhs }) => (kind, lhs),
1148 Some(state @ ClassState::Open { .. }) => {
1149 stack.push(state);
1150 return rhs;
1151 }
1152 None => unreachable!(),
1153 };
1154 let span = Span::new(lhs.span().start, rhs.span().end);
1155 regex_syntax::ast::ClassSet::BinaryOp(regex_syntax::ast::ClassSetBinaryOp {
1156 span,
1157 kind,
1158 lhs: Box::new(lhs),
1159 rhs: Box::new(rhs),
1160 })
1161 }
1162
1163 fn hir_to_node_id(&self, hir: &hir::Hir, tb: &mut TB<'s>) -> Result<NodeId> {
1164 match hir.kind() {
1165 hir::HirKind::Empty => Ok(NodeId::EPS),
1166 hir::HirKind::Literal(l) => {
1167 if l.0.len() == 1 {
1168 let node = tb.mk_u8(l.0[0]);
1169 Ok(node)
1170 } else {
1171 let ws: Vec<_> = l.0.iter().map(|l| tb.mk_u8(*l)).collect();
1172 let conc = tb.mk_concats(ws.iter().copied());
1173 Ok(conc)
1174 }
1175 }
1176 hir::HirKind::Class(class) => match class {
1177 hir::Class::Unicode(class_unicode) => {
1178 let ranges = class_unicode.ranges();
1179 if ranges.len() == 1
1180 && ranges[0].start() == '\u{0}'
1181 && ranges[0].end() == '\u{10FFFF}'
1182 {
1183 return Ok(tb.mk_range_u8(0, 255));
1184 }
1185 let mut nodes = Vec::new();
1186 for range in ranges {
1187 for seq in Utf8Sequences::new(range.start(), range.end()) {
1188 let sl = seq.as_slice();
1189 let bytes: Vec<_> = sl.iter().map(|s| (s.start, s.end)).collect();
1190 let node = match bytes.len() {
1191 1 => tb.mk_range_u8(bytes[0].0, bytes[0].1),
1192 n => {
1193 let last = tb.mk_range_u8(bytes[n - 1].0, bytes[n - 1].1);
1194 let mut conc = last;
1195 for i in (0..n - 1).rev() {
1196 let b = tb.mk_range_u8(bytes[i].0, bytes[i].1);
1197 conc = tb.mk_concat(b, conc);
1198 }
1199 conc
1200 }
1201 };
1202 nodes.push(node);
1203 }
1204 }
1205 let merged = tb.mk_unions(nodes.into_iter());
1206 Ok(merged)
1207 }
1208 hir::Class::Bytes(class_bytes) => {
1209 let ranges = class_bytes.ranges();
1210 let mut result = NodeId::BOT;
1211 for range in ranges {
1212 let start = range.start();
1213 let end = range.end();
1214 let node = tb.mk_range_u8(start, end);
1215 result = tb.mk_union(result, node);
1216 }
1217 Ok(result)
1218 }
1219 },
1220 hir::HirKind::Look(_) => Err(self.error(
1221 Span::splat(self.pos()),
1222 ast::ErrorKind::UnsupportedResharpRegex,
1223 )),
1224 hir::HirKind::Repetition(_) => Err(self.error(
1225 Span::splat(self.pos()),
1226 ast::ErrorKind::UnsupportedResharpRegex,
1227 )),
1228 hir::HirKind::Capture(_) => Err(self.error(
1229 Span::splat(self.pos()),
1230 ast::ErrorKind::UnsupportedResharpRegex,
1231 )),
1232 hir::HirKind::Concat(body) => {
1233 let mut result = NodeId::EPS;
1234 for child in body {
1235 let node = self.hir_to_node_id(child, tb)?;
1236 result = tb.mk_concat(result, node);
1237 }
1238 Ok(result)
1239 }
1240 hir::HirKind::Alternation(_) => Err(self.error(
1241 Span::splat(self.pos()),
1242 ast::ErrorKind::UnsupportedResharpRegex,
1243 )),
1244 }
1245 }
1246
1247 fn translate_ast_to_hir(
1248 &mut self,
1249 orig_ast: ®ex_syntax::ast::Ast,
1250 tb: &mut TB<'s>,
1251 ) -> Result<NodeId> {
1252 match self.translator.translate("", orig_ast) {
1253 Err(_) => Err(self.error(self.span(), ast::ErrorKind::UnicodeClassInvalid)),
1254 Ok(hir) => self.hir_to_node_id(&hir, tb),
1255 }
1256 }
1257
1258 fn translator_to_node_id(
1259 &mut self,
1260 orig_ast: ®ex_syntax::ast::Ast,
1261 translator: &mut Option<Translator>,
1262 tb: &mut TB<'s>,
1263 ) -> Result<NodeId> {
1264 match translator {
1265 Some(tr) => {
1266 let hir = tr
1267 .translate("", orig_ast)
1268 .map_err(|e| self.unsupported_error(e))?;
1269 self.hir_to_node_id(&hir, tb)
1270 }
1271 None => self.translate_ast_to_hir(orig_ast, tb),
1272 }
1273 }
1274
1275 fn get_class(
1276 &mut self,
1277 negated: bool,
1278 kind: regex_syntax::ast::ClassPerlKind,
1279 tb: &mut TB<'s>,
1280 ) -> Result<NodeId> {
1281 let w = self
1282 .perl_classes
1283 .iter()
1284 .find(|(c_neg, c_kind, _)| *c_kind == kind && *c_neg == negated);
1285 match w {
1286 Some((_, _, value)) => Ok(*value),
1287 None => {
1288 let translated = if self.global_ascii_perl {
1289 let pos = match kind {
1290 regex_syntax::ast::ClassPerlKind::Word => {
1291 let az = tb.mk_range_u8(b'a', b'z');
1292 let big = tb.mk_range_u8(b'A', b'Z');
1293 let dig = tb.mk_range_u8(b'0', b'9');
1294 let us = tb.mk_u8(b'_');
1295 tb.mk_unions([az, big, dig, us].into_iter())
1296 }
1297 regex_syntax::ast::ClassPerlKind::Digit => tb.mk_range_u8(b'0', b'9'),
1298 regex_syntax::ast::ClassPerlKind::Space => {
1299 let sp = tb.mk_u8(b' ');
1300 let tab = tb.mk_u8(b'\t');
1301 let nl = tb.mk_u8(b'\n');
1302 let cr = tb.mk_u8(b'\r');
1303 let ff = tb.mk_u8(0x0C);
1304 let vt = tb.mk_u8(0x0B);
1305 tb.mk_unions([sp, tab, nl, cr, ff, vt].into_iter())
1306 }
1307 };
1308 if negated {
1309 resharp_algebra::neg_class(tb, pos)
1310 } else {
1311 pos
1312 }
1313 } else if self.global_unicode {
1314 match kind {
1315 regex_syntax::ast::ClassPerlKind::Word => {
1316 if self.global_full_unicode {
1317 self.unicode_classes.ensure_word_full(tb);
1318 } else {
1319 self.unicode_classes.ensure_word(tb);
1320 }
1321 if negated {
1322 self.unicode_classes.non_word
1323 } else {
1324 self.unicode_classes.word
1325 }
1326 }
1327 regex_syntax::ast::ClassPerlKind::Digit => {
1328 if self.global_full_unicode {
1329 self.unicode_classes.ensure_digit_full(tb);
1330 } else {
1331 self.unicode_classes.ensure_digit(tb);
1332 }
1333 if negated {
1334 self.unicode_classes.non_digit
1335 } else {
1336 self.unicode_classes.digit
1337 }
1338 }
1339 regex_syntax::ast::ClassPerlKind::Space => {
1340 if self.global_full_unicode {
1341 self.unicode_classes.ensure_space_full(tb);
1342 } else {
1343 self.unicode_classes.ensure_space(tb);
1344 }
1345 if negated {
1346 self.unicode_classes.non_space
1347 } else {
1348 self.unicode_classes.space
1349 }
1350 }
1351 }
1352 } else {
1353 let pos = match kind {
1354 regex_syntax::ast::ClassPerlKind::Word => {
1355 let az = tb.mk_range_u8(b'a', b'z');
1356 let big = tb.mk_range_u8(b'A', b'Z');
1357 let dig = tb.mk_range_u8(b'0', b'9');
1358 let us = tb.mk_u8(b'_');
1359 tb.mk_unions([az, big, dig, us].into_iter())
1360 }
1361 regex_syntax::ast::ClassPerlKind::Digit => tb.mk_range_u8(b'0', b'9'),
1362 regex_syntax::ast::ClassPerlKind::Space => {
1363 let sp = tb.mk_u8(b' ');
1364 let tab = tb.mk_u8(b'\t');
1365 let nl = tb.mk_u8(b'\n');
1366 let cr = tb.mk_u8(b'\r');
1367 let ff = tb.mk_u8(0x0C);
1368 let vt = tb.mk_u8(0x0B);
1369 tb.mk_unions([sp, tab, nl, cr, ff, vt].into_iter())
1370 }
1371 };
1372 if negated {
1373 resharp_algebra::neg_class(tb, pos)
1375 } else {
1376 pos
1377 }
1378 };
1379 self.perl_classes.push((negated, kind, translated));
1380 Ok(translated)
1381 }
1382 }
1383 }
1384
1385 fn word_char_kind(ast: &Ast, left: bool) -> WordCharKind {
1386 use WordCharKind::*;
1387 match ast {
1388 Ast::Literal(lit) => {
1389 if is_word_byte(lit.c as u8) {
1390 Word
1391 } else {
1392 NonWord
1393 }
1394 }
1395 Ast::ClassPerl(c) => match (&c.kind, c.negated) {
1396 (®ex_syntax::ast::ClassPerlKind::Word, false) => Word,
1397 (®ex_syntax::ast::ClassPerlKind::Word, true) => NonWord,
1398 (®ex_syntax::ast::ClassPerlKind::Space, false) => NonWord,
1399 (®ex_syntax::ast::ClassPerlKind::Space, true) => Unknown,
1400 (®ex_syntax::ast::ClassPerlKind::Digit, false) => Word,
1401 (®ex_syntax::ast::ClassPerlKind::Digit, true) => Unknown,
1402 },
1403 Ast::ClassBracketed(c) => class_bracketed_word_kind(c),
1404 Ast::Dot(_) | Ast::Top(_) => Unknown,
1405 Ast::Group(g) => Self::word_char_kind(&g.ast, left),
1406 Ast::Concat(c) if !c.asts.is_empty() => {
1407 let dir: isize = if left { -1 } else { 1 };
1408 let edge = match Self::concat_edge_index(&c.asts, left) {
1409 Some(e) => e,
1410 None => return Unknown,
1411 };
1412 let kind = Self::word_char_kind(&c.asts[edge], left);
1413 match kind {
1414 MaybeWord => {
1415 match Self::concat_neighbor_kind(&c.asts, edge, dir) {
1416 Word => Word,
1417 Edge => MaybeWord,
1418 _ => Unknown,
1419 }
1420 }
1421 MaybeNonWord => {
1422 match Self::concat_neighbor_kind(&c.asts, edge, dir) {
1423 NonWord => NonWord,
1424 Edge => MaybeNonWord,
1425 _ => Unknown,
1426 }
1427 }
1428 other => other,
1429 }
1430 }
1431 Ast::Alternation(alt) if !alt.asts.is_empty() => {
1432 let first = Self::word_char_kind(&alt.asts[0], left);
1433 if alt.asts[1..]
1434 .iter()
1435 .all(|a| Self::word_char_kind(a, left) == first)
1436 {
1437 first
1438 } else {
1439 Unknown
1440 }
1441 }
1442 Ast::Repetition(r) => {
1443 let inner = Self::word_char_kind(&r.ast, left);
1444 let nullable = matches!(
1445 &r.op.kind,
1446 ast::RepetitionKind::ZeroOrMore
1447 | ast::RepetitionKind::ZeroOrOne
1448 | ast::RepetitionKind::Range(
1449 ast::RepetitionRange::Bounded(0, _)
1450 | ast::RepetitionRange::Exactly(0)
1451 )
1452 );
1453 if nullable {
1454 match inner {
1455 Word => MaybeWord,
1456 NonWord => MaybeNonWord,
1457 _ => Unknown,
1458 }
1459 } else {
1460 inner
1461 }
1462 }
1463 Ast::Lookaround(la) => match la.kind {
1464 ast::LookaroundKind::PositiveLookahead
1465 | ast::LookaroundKind::PositiveLookbehind => Self::word_char_kind(&la.ast, left),
1466 _ => Unknown,
1467 },
1468 Ast::Assertion(a) => match (&a.kind, left) {
1469 (ast::AssertionKind::EndText, false) => NonWord,
1470 (ast::AssertionKind::StartText, true) => NonWord,
1471 _ => Unknown,
1472 },
1473 _ => Unknown,
1474 }
1475 }
1476
1477 fn edge_class_ast(ast: &Ast, left: bool) -> Option<&Ast> {
1479 match ast {
1480 Ast::Literal(_)
1481 | Ast::ClassPerl(_)
1482 | Ast::ClassBracketed(_)
1483 | Ast::ClassUnicode(_)
1484 | Ast::Dot(_)
1485 | Ast::Top(_) => Some(ast),
1486 Ast::Group(g) => Self::edge_class_ast(&g.ast, left),
1487 Ast::Concat(c) if !c.asts.is_empty() => {
1488 Self::concat_edge_index(&c.asts, left)
1489 .and_then(|e| Self::edge_class_ast(&c.asts[e], left))
1490 }
1491 Ast::Repetition(r) => {
1492 let nullable = matches!(
1493 &r.op.kind,
1494 ast::RepetitionKind::ZeroOrMore
1495 | ast::RepetitionKind::ZeroOrOne
1496 | ast::RepetitionKind::Range(
1497 ast::RepetitionRange::Bounded(0, _)
1498 | ast::RepetitionRange::Exactly(0)
1499 )
1500 );
1501 if nullable {
1502 None
1503 } else {
1504 Self::edge_class_ast(&r.ast, left)
1505 }
1506 }
1507 _ => None,
1508 }
1509 }
1510
1511 fn resolve_word_kind(
1512 &mut self,
1513 asts: &[Ast],
1514 idx: usize,
1515 dir: isize,
1516 translator: &mut Option<Translator>,
1517 tb: &mut TB<'s>,
1518 word_id: NodeId,
1519 not_word_id: NodeId,
1520 ) -> Result<WordCharKind> {
1521 use WordCharKind::*;
1522 let fast = Self::concat_neighbor_kind(asts, idx, dir);
1523 if fast != Unknown {
1524 return Ok(fast);
1525 }
1526 let neighbor_idx = (idx as isize + dir) as usize;
1527 let node = if let Some(edge) = Self::edge_class_ast(&asts[neighbor_idx], dir < 0) {
1528 self.ast_to_node_id(edge, translator, tb)?
1529 } else if dir > 0 {
1530 let mut bodies: Vec<NodeId> = vec![];
1531 let mut j = neighbor_idx;
1532 while j < asts.len() {
1533 match &asts[j] {
1534 Ast::Lookaround(la) => {
1535 let kind = la.kind.clone();
1536 let lookbehind = matches!(
1537 kind,
1538 ast::LookaroundKind::PositiveLookbehind
1539 | ast::LookaroundKind::NegativeLookbehind
1540 );
1541 if lookbehind {
1542 j += 1;
1543 continue;
1544 }
1545 let body = self.ast_to_node_id(&la.ast, translator, tb)?;
1546 let body = tb.try_elim_lookarounds(body).ok_or_else(|| {
1547 self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex)
1548 })?;
1549 let body_ts = tb.mk_concat(body, NodeId::TS);
1550 let constraint = match kind {
1551 ast::LookaroundKind::PositiveLookahead => body_ts,
1552 ast::LookaroundKind::NegativeLookahead => tb.mk_compl(body_ts),
1553 _ => unreachable!(),
1554 };
1555 bodies.push(constraint);
1556 j += 1;
1557 }
1558 other => {
1559 let n = self.ast_to_node_id(other, translator, tb)?;
1560 let n = tb.try_elim_lookarounds(n).ok_or_else(|| {
1561 self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex)
1562 })?;
1563 bodies.push(tb.mk_concat(n, NodeId::TS));
1564 break;
1565 }
1566 }
1567 }
1568 if bodies.is_empty() {
1569 return Ok(Unknown);
1570 }
1571 let combined = tb.mk_inters(bodies.into_iter());
1572 let word_prefix = tb.mk_concat(word_id, NodeId::TS);
1573 let non_word_prefix = tb.mk_concat(not_word_id, NodeId::TS);
1574 return if tb.subsumes(word_prefix, combined) == Some(true) {
1575 Ok(Word)
1576 } else if tb.subsumes(non_word_prefix, combined) == Some(true) {
1577 Ok(NonWord)
1578 } else {
1579 Ok(Unknown)
1580 };
1581 } else {
1582 let neighbor_node = self.ast_to_node_id(&asts[neighbor_idx], translator, tb)?;
1583 let neighbor_node = Self::strip_trailing_lookahead(tb, neighbor_node);
1584 let mut neighbor_node = tb
1585 .try_elim_lookarounds(neighbor_node)
1586 .ok_or_else(|| self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))?;
1587 neighbor_node = tb.reverse(neighbor_node).or_else(|_| {
1588 Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))
1589 })?;
1590 let word_prefix = tb.mk_concat(word_id, NodeId::TS);
1591 let non_word_prefix = tb.mk_concat(not_word_id, NodeId::TS);
1592 return if tb.subsumes(word_prefix, neighbor_node) == Some(true) {
1593 Ok(Word)
1594 } else if tb.subsumes(non_word_prefix, neighbor_node) == Some(true) {
1595 Ok(NonWord)
1596 } else {
1597 Ok(Unknown)
1598 };
1599 };
1600 if tb.subsumes(word_id, node) == Some(true) {
1601 Ok(Word)
1602 } else if tb.subsumes(not_word_id, node) == Some(true) {
1603 Ok(NonWord)
1604 } else {
1605 Ok(Unknown)
1606 }
1607 }
1608
1609 fn strip_trailing_lookahead(tb: &mut TB<'s>, node: NodeId) -> NodeId {
1610 match tb.get_kind(node) {
1611 Kind::Lookahead if tb.get_min_max_length(node).1 == 0 => NodeId::EPS,
1612 Kind::Concat => {
1613 let l = node.left(tb);
1614 let r = node.right(tb);
1615 let stripped_r = Self::strip_trailing_lookahead(tb, r);
1616 if stripped_r == NodeId::EPS {
1617 Self::strip_trailing_lookahead(tb, l)
1618 } else if stripped_r == r {
1619 node
1620 } else {
1621 tb.mk_concat(l, stripped_r)
1622 }
1623 }
1624 _ => node,
1625 }
1626 }
1627
1628 fn concat_edge_index(asts: &[Ast], left: bool) -> Option<usize> {
1629 let dir: isize = if left { -1 } else { 1 };
1630 let mut e = if left { asts.len() as isize - 1 } else { 0 };
1631 while e >= 0
1632 && (e as usize) < asts.len()
1633 && Self::is_transparent_for_dir(&asts[e as usize], dir)
1634 {
1635 e += dir;
1636 }
1637 if e < 0 || e as usize >= asts.len() {
1638 None
1639 } else {
1640 Some(e as usize)
1641 }
1642 }
1643
1644 fn is_transparent_for_dir(ast: &Ast, dir: isize) -> bool {
1645 match ast {
1646 Ast::Lookaround(la) => match la.kind {
1647 ast::LookaroundKind::PositiveLookahead | ast::LookaroundKind::NegativeLookahead => {
1648 dir < 0
1649 }
1650 ast::LookaroundKind::PositiveLookbehind
1651 | ast::LookaroundKind::NegativeLookbehind => dir > 0,
1652 },
1653 Ast::Repetition(r) => matches!(
1654 &r.op.kind,
1655 ast::RepetitionKind::Range(ast::RepetitionRange::Exactly(0))
1656 ),
1657 _ => false,
1658 }
1659 }
1660
1661 fn concat_neighbor_kind(asts: &[Ast], idx: usize, dir: isize) -> WordCharKind {
1662 use WordCharKind::*;
1663 let next = idx as isize + dir;
1664 if next < 0 || next >= asts.len() as isize {
1665 return Edge;
1666 }
1667 if Self::is_transparent_for_dir(&asts[next as usize], dir) {
1668 return Self::concat_neighbor_kind(asts, next as usize, dir);
1669 }
1670 let kind = Self::word_char_kind(&asts[next as usize], dir < 0);
1671 match kind {
1672 MaybeWord => match Self::concat_neighbor_kind(asts, next as usize, dir) {
1673 Word => Word,
1674 _ => Unknown,
1675 },
1676 MaybeNonWord => match Self::concat_neighbor_kind(asts, next as usize, dir) {
1677 NonWord => NonWord,
1678 _ => Unknown,
1679 },
1680 other => other,
1681 }
1682 }
1683
1684 fn specialize_word_boundaries(
1685 &mut self,
1686 children: &mut [NodeId],
1687 tb: &mut TB<'s>,
1688 ) -> Result<()> {
1689 let wb = self.unicode_classes.wb;
1690 let non_wb = self.unicode_classes.non_wb;
1691 if wb == NodeId::MISSING {
1692 return Ok(());
1693 }
1694 let word = self.unicode_classes.word;
1695 let non_word = self.unicode_classes.non_word;
1696 if word == NodeId::MISSING {
1697 return Ok(());
1698 }
1699 let word_pref = tb.mk_concat(word, NodeId::TS);
1702 let non_word_pref = tb.mk_concat(non_word, NodeId::TS);
1703 let word_suf = tb.mk_concat(NodeId::TS, word);
1704 let non_word_suf = tb.mk_concat(NodeId::TS, non_word);
1705 let len = children.len();
1706 for k in 0..len {
1707 let l = if k == 0 {
1708 WordCharKind::Edge
1709 } else {
1710 use resharp_algebra::Kind;
1711 if tb.get_kind(children[k - 1]) == Kind::End
1712 && (children[k] == wb || children[k] == non_wb)
1713 {
1714 return Err(
1715 self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex)
1716 );
1717 }
1718 Self::classify(tb, children[k - 1], word_suf, non_word_suf)
1719 };
1720 let r = if k + 1 >= len {
1721 WordCharKind::Edge
1722 } else {
1723 Self::classify(tb, children[k + 1], word_pref, non_word_pref)
1724 };
1725 children[k] = Self::rewrite_wb_in_node(tb, children[k], wb, non_wb, word, l, r)
1726 .ok_or_else(|| self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))?;
1727 }
1728 Ok(())
1729 }
1730
1731 fn rewrite_wb_in_node(
1732 b: &mut TB<'s>,
1733 node: NodeId,
1734 wb: NodeId,
1735 non_wb: NodeId,
1736 word: NodeId,
1737 left: WordCharKind,
1738 right: WordCharKind,
1739 ) -> Option<NodeId> {
1740 let boundary_match = if node == wb {
1741 true
1742 } else if node == non_wb {
1743 false
1744 } else if b.get_kind(node) == Kind::Union {
1745 let l = Self::rewrite_wb_in_node(b, node.left(b), wb, non_wb, word, left, right)?;
1746 let r = Self::rewrite_wb_in_node(b, node.right(b), wb, non_wb, word, left, right)?;
1747 return Some(b.mk_union(l, r));
1748 } else {
1749 return Some(node);
1750 };
1751 use WordCharKind::*;
1752 let result = match (left, right) {
1753 (NonWord, Word) | (Word, NonWord) => {
1754 if boundary_match {
1755 NodeId::EPS
1756 } else {
1757 NodeId::BOT
1758 }
1759 }
1760 (Word, Word) | (NonWord, NonWord) => {
1761 if boundary_match {
1762 NodeId::BOT
1763 } else {
1764 NodeId::EPS
1765 }
1766 }
1767 (Word, _) => {
1768 if boundary_match {
1769 b.mk_neg_lookahead(word, 0)
1770 } else {
1771 let tail = b.mk_concat(word, NodeId::TS);
1772 b.mk_lookahead(tail, NodeId::MISSING, 0)
1773 }
1774 }
1775 (NonWord, _) => {
1776 if boundary_match {
1777 let tail = b.mk_concat(word, NodeId::TS);
1778 b.mk_lookahead(tail, NodeId::MISSING, 0)
1779 } else {
1780 b.mk_neg_lookahead(word, 0)
1781 }
1782 }
1783 (_, Word) => {
1784 if boundary_match {
1785 b.mk_neg_lookbehind(word)
1786 } else {
1787 b.mk_lookbehind(word, NodeId::MISSING)
1788 }
1789 }
1790 (_, NonWord) => {
1791 if boundary_match {
1792 b.mk_lookbehind(word, NodeId::MISSING)
1793 } else {
1794 b.mk_neg_lookbehind(word)
1795 }
1796 }
1797 _ => return Some(node),
1798 };
1799 Some(result)
1800 }
1801
1802 fn classify(
1803 b: &mut TB<'s>,
1804 node: NodeId,
1805 word_dir: NodeId,
1806 non_word_dir: NodeId,
1807 ) -> WordCharKind {
1808 if b.contains_look(node) || b.contains_anchors(node) {
1809 return WordCharKind::Unknown;
1810 }
1811 if b.subsumes(word_dir, node) == Some(true) {
1812 WordCharKind::Word
1813 } else if b.subsumes(non_word_dir, node) == Some(true) {
1814 WordCharKind::NonWord
1815 } else {
1816 WordCharKind::Unknown
1817 }
1818 }
1819
1820 fn rewrite_word_boundary_in_concat(
1821 &mut self,
1822 asts: &[Ast],
1823 idx: usize,
1824 translator: &mut Option<Translator>,
1825 tb: &mut TB<'s>,
1826 negated: bool,
1827 ) -> Result<(NodeId, usize)> {
1828 use WordCharKind::*;
1829 if self.global_full_unicode {
1830 self.unicode_classes.ensure_word_full(tb);
1831 } else if self.global_unicode && !self.global_ascii_perl {
1832 self.unicode_classes.ensure_word(tb);
1833 } else {
1834 self.unicode_classes.ensure_word_ascii(tb);
1835 }
1836 let word_id = self.unicode_classes.word;
1837 let not_word_id = self.unicode_classes.non_word;
1838 let left = self.resolve_word_kind(asts, idx, -1, translator, tb, word_id, not_word_id)?;
1839 let right = self.resolve_word_kind(asts, idx, 1, translator, tb, word_id, not_word_id)?;
1840 let boundary_match = !negated;
1841 match (left, right) {
1842 (NonWord, Word) | (Word, NonWord) => Ok((
1843 if boundary_match {
1844 NodeId::EPS
1845 } else {
1846 NodeId::BOT
1847 },
1848 idx + 1,
1849 )),
1850 (Word, Word) | (NonWord, NonWord) => Ok((
1851 if boundary_match {
1852 NodeId::BOT
1853 } else {
1854 NodeId::EPS
1855 },
1856 idx + 1,
1857 )),
1858 (Word, _) => {
1859 if boundary_match {
1860 Ok((tb.mk_neg_lookahead(word_id, 0), idx + 1))
1861 } else {
1862 let tail = tb.mk_concat(word_id, NodeId::TS);
1863 self.merge_boundary_with_following_lookaheads(asts, idx, tail, translator, tb)
1864 }
1865 }
1866 (NonWord, _) => {
1867 if boundary_match {
1868 let tail = tb.mk_concat(word_id, NodeId::TS);
1869 self.merge_boundary_with_following_lookaheads(asts, idx, tail, translator, tb)
1870 } else {
1871 Ok((tb.mk_neg_lookahead(word_id, 0), idx + 1))
1872 }
1873 }
1874 (_, Word) => {
1875 if boundary_match {
1876 Ok((tb.mk_neg_lookbehind(word_id), idx + 1))
1877 } else {
1878 Ok((tb.mk_lookbehind(word_id, NodeId::MISSING), idx + 1))
1879 }
1880 }
1881 (_, NonWord) => {
1882 if boundary_match {
1883 Ok((tb.mk_lookbehind(word_id, NodeId::MISSING), idx + 1))
1884 } else {
1885 Ok((tb.mk_neg_lookbehind(word_id), idx + 1))
1886 }
1887 }
1888 _ => {
1892 self.unicode_classes.ensure_wb(tb);
1893 let node = if boundary_match {
1894 self.unicode_classes.wb
1895 } else {
1896 self.unicode_classes.non_wb
1897 };
1898 Ok((node, idx + 1))
1899 }
1900 }
1901 }
1902
1903 fn merge_boundary_with_following_lookaheads(
1904 &mut self,
1905 asts: &[Ast],
1906 wb_idx: usize,
1907 boundary_tail: NodeId,
1908 translator: &mut Option<Translator>,
1909 tb: &mut TB<'s>,
1910 ) -> Result<(NodeId, usize)> {
1911 let mut next = wb_idx + 1;
1912 let mut la_bodies = vec![boundary_tail];
1913 while next < asts.len() {
1914 match &asts[next] {
1915 Ast::Lookaround(la) if la.kind == ast::LookaroundKind::PositiveLookahead => {
1916 let body = self.ast_to_node_id(&la.ast, translator, tb)?;
1917 la_bodies.push(tb.mk_concat(body, NodeId::TS));
1918 next += 1;
1919 }
1920 _ => break,
1921 }
1922 }
1923 let merged = tb.mk_inters(la_bodies.into_iter());
1924 Ok((tb.mk_lookahead(merged, NodeId::MISSING, 0), next))
1925 }
1926
1927 fn ast_to_node_id(
1928 &mut self,
1929 ast: &Ast,
1930 translator: &mut Option<Translator>,
1931 tb: &mut TB<'s>,
1932 ) -> Result<NodeId> {
1933 match ast {
1934 Ast::Empty(_) => Ok(NodeId::EPS),
1935 Ast::Flags(f) => {
1936 if f.flags.flag_state(ast::Flag::SwapGreed).is_some() {
1937 return Err(self.error(f.span, ast::ErrorKind::UnsupportedResharpRegex));
1938 }
1939 let mut translator_builder = self.default_translator_builder();
1940 if let Some(state) = f.flags.flag_state(ast::Flag::CaseInsensitive) {
1941 translator_builder.case_insensitive(state);
1942 }
1943 if let Some(state) = f.flags.flag_state(ast::Flag::Unicode) {
1944 translator_builder.unicode(state);
1945 }
1946 if let Some(state) = f.flags.flag_state(ast::Flag::DotMatchesNewLine) {
1947 self.dot_all.set(state);
1948 }
1949 if let Some(state) = f.flags.flag_state(ast::Flag::MultiLine) {
1950 self.multiline.set(state);
1951 }
1952 let concat_translator = Some(translator_builder.build());
1953 *translator = concat_translator;
1954 Ok(NodeId::EPS)
1955 }
1956 Ast::Literal(l) => {
1957 let ast_lit = regex_syntax::ast::Ast::literal(*l.to_owned());
1958 self.translator_to_node_id(&ast_lit, translator, tb)
1959 }
1960 Ast::Top(_) => Ok(NodeId::TOP),
1961 Ast::Dot(_) => {
1962 let codepoint_dot = self.global_ascii_perl || self.global_full_unicode;
1963 let hirv = match (codepoint_dot, self.dot_all.get()) {
1964 (true, true) => hir::Hir::dot(hir::Dot::AnyChar),
1965 (true, false) => hir::Hir::dot(hir::Dot::AnyCharExceptLF),
1966 (false, true) => return Ok(NodeId::TOP),
1967 (false, false) => hir::Hir::dot(hir::Dot::AnyByteExceptLF),
1968 };
1969 self.hir_to_node_id(&hirv, tb)
1970 }
1971 Ast::Assertion(a) => match &a.kind {
1972 ast::AssertionKind::StartText => Ok(NodeId::BEGIN),
1973 ast::AssertionKind::EndText => Ok(NodeId::END),
1974 ast::AssertionKind::WordBoundary => {
1975 let only = Ast::Assertion(a.clone());
1976 let asts = std::slice::from_ref(&only);
1977 let (node, _) =
1978 self.rewrite_word_boundary_in_concat(asts, 0, translator, tb, false)?;
1979 Ok(node)
1980 }
1981 ast::AssertionKind::NotWordBoundary => {
1982 let only = Ast::Assertion(a.clone());
1984 let asts = std::slice::from_ref(&only);
1985 let (node, _) =
1986 self.rewrite_word_boundary_in_concat(asts, 0, translator, tb, true)?;
1987 Ok(node)
1988 }
1989 ast::AssertionKind::StartLine => {
1990 if !self.multiline.get() {
1991 return Ok(NodeId::BEGIN);
1992 }
1993 let left = NodeId::BEGIN;
1994 let right = tb.mk_u8(b'\n');
1995 let union = tb.mk_union(left, right);
1996 Ok(tb.mk_lookbehind(union, NodeId::MISSING))
1997 }
1998 ast::AssertionKind::EndLine => {
1999 if !self.multiline.get() {
2000 return Ok(NodeId::END);
2001 }
2002 let left = NodeId::END;
2003 let right = tb.mk_u8(b'\n');
2004 let union = tb.mk_union(left, right);
2005 Ok(tb.mk_lookahead(union, NodeId::MISSING, 0))
2006 }
2007 ast::AssertionKind::WordBoundaryStart => {
2008 Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
2009 }
2010 ast::AssertionKind::WordBoundaryEnd => {
2011 Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
2012 }
2013 ast::AssertionKind::WordBoundaryStartAngle => {
2014 Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
2015 }
2016 ast::AssertionKind::WordBoundaryEndAngle => {
2017 Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
2018 }
2019 ast::AssertionKind::WordBoundaryStartHalf => {
2020 Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
2021 }
2022 ast::AssertionKind::WordBoundaryEndHalf => {
2023 Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
2024 }
2025 },
2026 Ast::ClassUnicode(c) => {
2027 let tmp = regex_syntax::ast::ClassUnicode {
2028 span: c.span,
2029 negated: c.negated,
2030 kind: c.kind.clone(),
2031 };
2032 if !c.negated {
2033 if let regex_syntax::ast::ClassUnicodeKind::Named(s) = &c.kind {
2034 match s.as_str() {
2035 "ascii" => return Ok(tb.mk_range_u8(0, 127)),
2037 "utf8" => {
2039 let ascii = tb.mk_range_u8(0, 127);
2040 let beta = tb.mk_range_u8(128, 0xBF);
2041 let c0 = tb.mk_range_u8(0xC0, 0xDF);
2042 let c0s = tb.mk_concats([c0, beta].into_iter());
2043 let e0 = tb.mk_range_u8(0xE0, 0xEF);
2044 let e0s = tb.mk_concats([e0, beta, beta].into_iter());
2045 let f0 = tb.mk_range_u8(0xF0, 0xF7);
2046 let f0s = tb.mk_concats([f0, beta, beta, beta].into_iter());
2047 return Ok(tb.mk_unions([ascii, c0s, e0s, f0s].into_iter()));
2048 }
2049 "hex" => {
2050 let nums = tb.mk_range_u8(b'0', b'9');
2051 let lets = tb.mk_range_u8(b'a', b'f');
2052 let lets2 = tb.mk_range_u8(b'A', b'F');
2053 let merged = tb.mk_unions([nums, lets, lets2].into_iter());
2054 return Ok(merged);
2055 }
2056 _ => {}
2057 }
2058 };
2059 }
2060
2061 let orig_ast = regex_syntax::ast::Ast::class_unicode(tmp);
2062 self.translator_to_node_id(&orig_ast, translator, tb)
2063 }
2064 Ast::ClassPerl(c) => self.get_class(c.negated, c.kind.clone(), tb),
2065 Ast::ClassBracketed(c) => match &c.kind {
2066 regex_syntax::ast::ClassSet::Item(item) => {
2067 if !c.negated && is_universal_perl_pair(item) {
2068 return Ok(NodeId::TOP);
2069 }
2070 if let regex_syntax::ast::ClassSetItem::Perl(p) = item {
2071 return self.get_class(c.negated ^ p.negated, p.kind.clone(), tb);
2072 }
2073 let kind = if self.global_ascii_perl {
2074 rewrite_ascii_perl_set(&c.kind)
2075 } else {
2076 c.kind.clone()
2077 };
2078 let tmp = regex_syntax::ast::ClassBracketed {
2079 span: c.span,
2080 negated: c.negated,
2081 kind,
2082 };
2083 let orig_ast = regex_syntax::ast::Ast::class_bracketed(tmp);
2084 self.translator_to_node_id(&orig_ast, translator, tb)
2085 }
2086 regex_syntax::ast::ClassSet::BinaryOp(_) => {
2087 Err(self.error(c.span, ast::ErrorKind::UnsupportedResharpRegex))
2088 }
2089 },
2090 Ast::Repetition(r) => {
2091 let body = self.ast_to_node_id(&r.ast, translator, tb);
2092 match body {
2093 Ok(body) => match &r.op.kind {
2094 ast::RepetitionKind::ZeroOrOne => Ok(tb.mk_opt(body)),
2095 ast::RepetitionKind::ZeroOrMore => Ok(tb.mk_star(body)),
2096 ast::RepetitionKind::OneOrMore => Ok(tb.mk_plus(body)),
2097 ast::RepetitionKind::Range(r) => match r {
2098 ast::RepetitionRange::Exactly(n) => Ok(tb.mk_repeat(body, *n, *n)),
2099 ast::RepetitionRange::AtLeast(n) => {
2100 let rep = tb.mk_repeat(body, *n, *n);
2101 let st = tb.mk_star(body);
2102 Ok(tb.mk_concat(rep, st))
2103 }
2104
2105 ast::RepetitionRange::Bounded(n, m) => Ok(tb.mk_repeat(body, *n, *m)),
2106 },
2107 },
2108 Err(_) => body,
2109 }
2110 }
2111 Ast::Lookaround(g) => {
2112 let body = self.ast_to_node_id(&g.ast, translator, tb)?;
2113 match g.kind {
2114 ast::LookaroundKind::PositiveLookahead if body.contains_lookbehind(tb) => {
2115 let mut prefix = NodeId::EPS;
2116 let mut rest = body;
2117 while tb.get_kind(rest) == Kind::Concat
2118 && tb.get_kind(rest.left(tb)) == Kind::Lookbehind
2119 {
2120 prefix = tb.mk_concat(prefix, rest.left(tb));
2121 rest = rest.right(tb);
2122 }
2123 if prefix == NodeId::EPS || rest.contains_lookbehind(tb) {
2124 return Err(self.error(g.span, ast::ErrorKind::UnsupportedResharpRegex));
2125 }
2126 let la = tb.mk_lookahead(rest, NodeId::MISSING, 0);
2127 Ok(tb.mk_concat(prefix, la))
2128 }
2129 ast::LookaroundKind::NegativeLookahead if body.contains_lookbehind(tb) => {
2130 Err(self.error(g.span, ast::ErrorKind::UnsupportedResharpRegex))
2131 }
2132 ast::LookaroundKind::PositiveLookahead => {
2133 Ok(tb.mk_lookahead(body, NodeId::MISSING, 0))
2134 }
2135 ast::LookaroundKind::PositiveLookbehind
2136 | ast::LookaroundKind::NegativeLookbehind
2137 if body.contains_lookahead(tb) =>
2138 {
2139 Err(self.error(g.span, ast::ErrorKind::UnsupportedResharpRegex))
2140 }
2141 ast::LookaroundKind::PositiveLookbehind => {
2142 Ok(tb.mk_lookbehind(body, NodeId::MISSING))
2143 }
2144 ast::LookaroundKind::NegativeLookahead => Ok(tb.mk_neg_lookahead(body, 0)),
2145 ast::LookaroundKind::NegativeLookbehind => Ok(tb.mk_neg_lookbehind(body)),
2146 }
2147 }
2148 Ast::Group(g) => {
2149 if let ast::GroupKind::NonCapturing(ref flags) = g.kind {
2150 if !flags.items.is_empty() {
2151 let mut translator_builder = self.default_translator_builder();
2152 if let Some(state) = flags.flag_state(ast::Flag::CaseInsensitive) {
2153 translator_builder.case_insensitive(state);
2154 }
2155 if let Some(state) = flags.flag_state(ast::Flag::Unicode) {
2156 translator_builder.unicode(state);
2157 }
2158 let saved_dot_all = self.dot_all.get();
2159 if let Some(state) = flags.flag_state(ast::Flag::DotMatchesNewLine) {
2160 self.dot_all.set(state);
2161 }
2162 let saved_multiline = self.multiline.get();
2163 if let Some(state) = flags.flag_state(ast::Flag::MultiLine) {
2164 self.multiline.set(state);
2165 }
2166 let mut scoped = Some(translator_builder.build());
2167 let result = self.ast_to_node_id(&g.ast, &mut scoped, tb);
2168 self.dot_all.set(saved_dot_all);
2169 self.multiline.set(saved_multiline);
2170 return result;
2171 }
2172 }
2173 self.ast_to_node_id(&g.ast, translator, tb)
2174 }
2175 Ast::Alternation(a) => {
2176 let mut children = vec![];
2177 for ast in &a.asts {
2178 match self.ast_to_node_id(ast, translator, tb) {
2179 Ok(node_id) => children.push(node_id),
2180 Err(err) => return Err(err),
2181 }
2182 }
2183 Ok(tb.mk_unions(children.iter().copied()))
2184 }
2185 Ast::Concat(c) => {
2186 let mut concat_translator: Option<Translator> = None;
2187 let mut children = vec![];
2188 let mut prev_boundary_child: Option<usize> = None;
2189 let mut i = 0;
2190 while i < c.asts.len() {
2191 let ast = &c.asts[i];
2192 match ast {
2193 Ast::Flags(f) => {
2194 if f.flags.flag_state(ast::Flag::SwapGreed).is_some() {
2195 return Err(
2196 self.error(f.span, ast::ErrorKind::UnsupportedResharpRegex)
2197 );
2198 }
2199 let mut translator_builder = self.default_translator_builder();
2200 if let Some(state) = f.flags.flag_state(ast::Flag::CaseInsensitive) {
2201 translator_builder.case_insensitive(state);
2202 }
2203 if let Some(state) = f.flags.flag_state(ast::Flag::Unicode) {
2204 translator_builder.unicode(state);
2205 }
2206 if let Some(state) = f.flags.flag_state(ast::Flag::DotMatchesNewLine) {
2207 self.dot_all.set(state);
2208 }
2209 if let Some(state) = f.flags.flag_state(ast::Flag::MultiLine) {
2210 self.multiline.set(state);
2211 }
2212 concat_translator = Some(translator_builder.build());
2213 *translator = concat_translator.clone();
2214 i += 1;
2215 continue;
2216 }
2217 Ast::Assertion(a)
2218 if a.kind == ast::AssertionKind::WordBoundary
2219 || a.kind == ast::AssertionKind::NotWordBoundary =>
2220 {
2221 let negated = a.kind == ast::AssertionKind::NotWordBoundary;
2222 let node = self.rewrite_word_boundary_in_concat(
2223 &c.asts, i, translator, tb, negated,
2224 )?;
2225 match prev_boundary_child {
2226 Some(idx) => children[idx] = tb.mk_inter(children[idx], node.0),
2227 None => {
2228 children.push(node.0);
2229 prev_boundary_child = Some(children.len() - 1);
2230 }
2231 }
2232 i = node.1; continue;
2234 }
2235 _ => {}
2236 }
2237 match concat_translator {
2238 Some(_) => match self.ast_to_node_id(ast, &mut concat_translator, tb) {
2239 Ok(node_id) => {
2240 if node_id != resharp_algebra::NodeId::EPS {
2241 prev_boundary_child = None;
2242 children.push(node_id);
2243 }
2244 }
2245 Err(err) => return Err(err),
2246 },
2247 None => match self.ast_to_node_id(ast, translator, tb) {
2248 Ok(node_id) => {
2249 if node_id != resharp_algebra::NodeId::EPS {
2250 prev_boundary_child = None;
2251 children.push(node_id);
2252 }
2253 }
2254 Err(err) => return Err(err),
2255 },
2256 }
2257 i += 1;
2258 }
2259 self.specialize_word_boundaries(&mut children, tb)?;
2260 Ok(tb.mk_concats(children.iter().cloned()))
2261 }
2262 Ast::Intersection(intersection) => {
2263 let mut children = vec![];
2264 for ast in &intersection.asts {
2265 match self.ast_to_node_id(ast, translator, tb) {
2266 Ok(node_id) => children.push(node_id),
2267 Err(err) => return Err(err),
2268 }
2269 }
2270 Ok(tb.mk_inters(children.into_iter()))
2271 }
2272 Ast::Complement(complement) => {
2273 let body = self.ast_to_node_id(&complement.ast, translator, tb);
2274 body.map(|x| tb.mk_compl(x))
2275 }
2276 }
2277 }
2278
2279 fn parse_inner(&mut self) -> Result<Ast> {
2280 let mut concat = Concat {
2281 span: self.span(),
2282 asts: vec![],
2283 };
2284 loop {
2285 self.bump_space();
2286 if self.is_eof() {
2287 break;
2288 }
2289 match self.char() {
2290 '(' => concat = self.push_group(concat)?,
2291 ')' => concat = self.pop_group(concat)?,
2292 '|' => concat = self.push_alternate(concat)?,
2293 '&' => concat = self.push_intersect(concat)?,
2294 '~' => concat = self.push_compl_group(concat)?,
2295 '[' => {
2296 let class = self.parse_set_class()?;
2297 concat.asts.push(Ast::class_bracketed(class));
2298 }
2299 '?' => {
2300 concat =
2301 self.parse_uncounted_repetition(concat, ast::RepetitionKind::ZeroOrOne)?;
2302 }
2303 '*' => {
2304 concat =
2305 self.parse_uncounted_repetition(concat, ast::RepetitionKind::ZeroOrMore)?;
2306 }
2307 '+' => {
2308 concat =
2309 self.parse_uncounted_repetition(concat, ast::RepetitionKind::OneOrMore)?;
2310 }
2311 '{' => {
2312 concat = self.parse_counted_repetition(concat)?;
2313 }
2314 _ => concat.asts.push(self.parse_primitive()?.into_ast()),
2315 }
2316 if self.stack_group.borrow().len() > self.max_depth {
2317 return Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex));
2318 }
2319 }
2320 let ast = self.pop_group_end(concat)?;
2321 if expanded_ast_size(&ast, self.expanded_ast_limit) >= self.expanded_ast_limit
2322 || max_concat_length(&ast) >= self.max_list_len
2323 {
2324 return Err(self.error(*ast.span(), ast::ErrorKind::UnsupportedResharpRegex));
2325 }
2326 Ok(ast)
2327 }
2328
2329 fn parse(&mut self, tb: &mut TB<'s>) -> Result<NodeId> {
2330 let ast = self.parse_inner()?;
2331 if let Err(span) = ensure_lookbehind_at_start(&ast, true) {
2332 return Err(self.error(span, ast::ErrorKind::UnsupportedResharpRegex));
2333 }
2334 self.ast_to_node_id(&ast, &mut None, tb)
2335 }
2336
2337 #[inline(never)]
2338 fn parse_uncounted_repetition(
2339 &self,
2340 mut concat: ast::Concat,
2341 kind: ast::RepetitionKind,
2342 ) -> Result<ast::Concat> {
2343 let op_start = self.pos();
2345 let ast = match concat.asts.pop() {
2346 Some(ast) => ast,
2347 None => return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing)),
2348 };
2349 match ast {
2350 Ast::Empty(_) | Ast::Flags(_) => {
2351 return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing))
2352 }
2353 _ => {}
2354 }
2355 if self.bump() && self.char() == '?' {
2356 return Err(self.error(
2357 Span::new(op_start, self.pos()),
2358 ast::ErrorKind::UnsupportedLazyQuantifier,
2359 ));
2360 }
2361 concat.asts.push(Ast::repetition(ast::Repetition {
2362 span: ast.span().with_end(self.pos()),
2363 op: ast::RepetitionOp {
2364 span: Span::new(op_start, self.pos()),
2365 kind,
2366 },
2367 greedy: true,
2368 ast: Box::new(ast),
2369 }));
2370 Ok(concat)
2371 }
2372
2373 #[inline(never)]
2374 fn parse_counted_repetition(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
2375 assert!(self.char() == '{');
2376 let start = self.pos();
2377 let ast = match concat.asts.pop() {
2378 Some(ast) => ast,
2379 None => return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing)),
2380 };
2381 match ast {
2382 Ast::Empty(_) | Ast::Flags(_) => {
2383 return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing))
2384 }
2385 _ => {}
2386 }
2387 if !self.bump_and_bump_space() {
2388 return Err(self.error(
2389 Span::new(start, self.pos()),
2390 ast::ErrorKind::RepetitionCountUnclosed,
2391 ));
2392 }
2393 let count_start = specialize_err(
2394 self.parse_decimal(),
2395 ast::ErrorKind::DecimalEmpty,
2396 ast::ErrorKind::RepetitionCountDecimalEmpty,
2397 );
2398 if self.is_eof() {
2399 return Err(self.error(
2400 Span::new(start, self.pos()),
2401 ast::ErrorKind::RepetitionCountUnclosed,
2402 ));
2403 }
2404 let range = if self.char() == ',' {
2405 if !self.bump_and_bump_space() {
2406 return Err(self.error(
2407 Span::new(start, self.pos()),
2408 ast::ErrorKind::RepetitionCountUnclosed,
2409 ));
2410 }
2411 if self.char() != '}' {
2412 let count_start = match count_start {
2413 Ok(c) => c,
2414 Err(err) if err.kind == ast::ErrorKind::RepetitionCountDecimalEmpty => {
2415 if self.parser().empty_min_range {
2416 0
2417 } else {
2418 return Err(err);
2419 }
2420 }
2421 err => err?,
2422 };
2423 let count_end = specialize_err(
2424 self.parse_decimal(),
2425 ast::ErrorKind::DecimalEmpty,
2426 ast::ErrorKind::RepetitionCountDecimalEmpty,
2427 )?;
2428 ast::RepetitionRange::Bounded(count_start, count_end)
2429 } else {
2430 ast::RepetitionRange::AtLeast(count_start?)
2431 }
2432 } else {
2433 ast::RepetitionRange::Exactly(count_start?)
2434 };
2435
2436 if self.is_eof() || self.char() != '}' {
2437 return Err(self.error(
2438 Span::new(start, self.pos()),
2439 ast::ErrorKind::RepetitionCountUnclosed,
2440 ));
2441 }
2442
2443 if self.bump_and_bump_space() && self.char() == '?' {
2444 return Err(self.error(
2445 Span::new(start, self.pos()),
2446 ast::ErrorKind::UnsupportedLazyQuantifier,
2447 ));
2448 }
2449
2450 let op_span = Span::new(start, self.pos());
2451 if !range.is_valid() {
2452 return Err(self.error(op_span, ast::ErrorKind::RepetitionCountInvalid));
2453 }
2454
2455 let over_limit = match &range {
2456 ast::RepetitionRange::Exactly(n) => *n > self.max_repeat,
2457 ast::RepetitionRange::AtLeast(n) => *n > self.max_repeat,
2458 ast::RepetitionRange::Bounded(n, m) => *n > self.max_repeat || *m > self.max_repeat,
2459 };
2460 if over_limit {
2461 return Err(self.error(op_span, ast::ErrorKind::UnsupportedResharpRegex));
2462 }
2463 concat.asts.push(Ast::repetition(ast::Repetition {
2464 span: ast.span().with_end(self.pos()),
2465 op: ast::RepetitionOp {
2466 span: op_span,
2467 kind: ast::RepetitionKind::Range(range),
2468 },
2469 greedy: true,
2470 ast: Box::new(ast),
2471 }));
2472 Ok(concat)
2473 }
2474
2475 #[inline(never)]
2476 fn parse_group(&self) -> Result<Either<ast::SetFlags, ast::Group>> {
2477 assert_eq!(self.char(), '(');
2478 let open_span = self.span_char();
2479 self.bump();
2480 self.bump_space();
2481 if let Some((ahead, pos)) = self.is_lookaround_prefix() {
2482 let kind = match (pos, ahead) {
2483 (true, true) => LookaroundKind::PositiveLookahead,
2484 (true, false) => LookaroundKind::PositiveLookbehind,
2485 (false, true) => LookaroundKind::NegativeLookahead,
2486 (false, false) => LookaroundKind::NegativeLookbehind,
2487 };
2488 return Ok(Either::Right(ast::Group {
2489 span: open_span,
2490 kind: ast::GroupKind::Lookaround(kind),
2491 ast: Box::new(Ast::empty(self.span())),
2492 }));
2493 }
2494 let inner_span = self.span();
2495 let mut starts_with_p = true;
2496 if self.bump_if("?P<") || {
2497 starts_with_p = false;
2498 self.bump_if("?<")
2499 } {
2500 let capture_index = self.next_capture_index(open_span)?;
2501 let name = self.parse_capture_name(capture_index)?;
2502 Ok(Either::Right(ast::Group {
2503 span: open_span,
2504 kind: ast::GroupKind::CaptureName {
2505 starts_with_p,
2506 name,
2507 },
2508 ast: Box::new(Ast::empty(self.span())),
2509 }))
2510 } else if self.bump_if("?") {
2511 if self.is_eof() {
2512 return Err(self.error(open_span, ast::ErrorKind::GroupUnclosed));
2513 }
2514 let flags = self.parse_flags()?;
2515 let char_end = self.char();
2516 self.bump();
2517 if char_end == ')' {
2518 if flags.items.is_empty() {
2521 return Err(self.error(inner_span, ast::ErrorKind::RepetitionMissing));
2522 }
2523 Ok(Either::Left(ast::SetFlags {
2524 span: Span {
2525 end: self.pos(),
2526 ..open_span
2527 },
2528 flags,
2529 }))
2530 } else {
2531 assert_eq!(char_end, ':');
2532 Ok(Either::Right(ast::Group {
2533 span: open_span,
2534 kind: ast::GroupKind::NonCapturing(flags),
2535 ast: Box::new(Ast::empty(self.span())),
2536 }))
2537 }
2538 } else {
2539 let capture_index = self.next_capture_index(open_span)?;
2540 Ok(Either::Right(ast::Group {
2541 span: open_span,
2542 kind: ast::GroupKind::CaptureIndex(capture_index),
2543 ast: Box::new(Ast::empty(self.span())),
2544 }))
2545 }
2546 }
2547
2548 #[inline(never)]
2549 fn parse_capture_name(&self, capture_index: u32) -> Result<ast::CaptureName> {
2550 if self.is_eof() {
2551 return Err(self.error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
2552 }
2553 let start = self.pos();
2554 loop {
2555 if self.char() == '>' {
2556 break;
2557 }
2558 if !is_capture_char(self.char(), self.pos() == start) {
2559 return Err(self.error(self.span_char(), ast::ErrorKind::GroupNameInvalid));
2560 }
2561 if !self.bump() {
2562 break;
2563 }
2564 }
2565 let end = self.pos();
2566 if self.is_eof() {
2567 return Err(self.error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
2568 }
2569 assert_eq!(self.char(), '>');
2570 self.bump();
2571 let name = &self.pattern()[start.offset..end.offset];
2572 if name.is_empty() {
2573 return Err(self.error(Span::new(start, start), ast::ErrorKind::GroupNameEmpty));
2574 }
2575 let capname = ast::CaptureName {
2576 span: Span::new(start, end),
2577 name: name.to_string(),
2578 index: capture_index,
2579 };
2580 self.add_capture_name(&capname)?;
2581 Ok(capname)
2582 }
2583
2584 #[inline(never)]
2585 fn parse_flags(&self) -> Result<ast::Flags> {
2586 let mut flags = ast::Flags {
2587 span: self.span(),
2588 items: vec![],
2589 };
2590 let mut last_was_negation = None;
2591 while self.char() != ':' && self.char() != ')' {
2592 if self.char() == '-' {
2593 last_was_negation = Some(self.span_char());
2594 let item = ast::FlagsItem {
2595 span: self.span_char(),
2596 kind: ast::FlagsItemKind::Negation,
2597 };
2598 if let Some(i) = flags.add_item(item) {
2599 return Err(self.error(
2600 self.span_char(),
2601 ast::ErrorKind::FlagRepeatedNegation {
2602 original: flags.items[i].span,
2603 },
2604 ));
2605 }
2606 } else {
2607 last_was_negation = None;
2608 let item = ast::FlagsItem {
2609 span: self.span_char(),
2610 kind: ast::FlagsItemKind::Flag(self.parse_flag()?),
2611 };
2612 if let Some(i) = flags.add_item(item) {
2613 return Err(self.error(
2614 self.span_char(),
2615 ast::ErrorKind::FlagDuplicate {
2616 original: flags.items[i].span,
2617 },
2618 ));
2619 }
2620 }
2621 if !self.bump() {
2622 return Err(self.error(self.span(), ast::ErrorKind::FlagUnexpectedEof));
2623 }
2624 }
2625 if let Some(span) = last_was_negation {
2626 return Err(self.error(span, ast::ErrorKind::FlagDanglingNegation));
2627 }
2628 flags.span.end = self.pos();
2629 Ok(flags)
2630 }
2631
2632 #[inline(never)]
2633 fn parse_flag(&self) -> Result<ast::Flag> {
2634 match self.char() {
2635 'i' => Ok(ast::Flag::CaseInsensitive),
2636 'm' => Ok(ast::Flag::MultiLine),
2637 's' => Ok(ast::Flag::DotMatchesNewLine),
2638 'U' => Ok(ast::Flag::SwapGreed),
2639 'u' => Ok(ast::Flag::Unicode),
2640 'R' => Ok(ast::Flag::CRLF),
2641 'x' => Ok(ast::Flag::IgnoreWhitespace),
2642 _ => Err(self.error(self.span_char(), ast::ErrorKind::FlagUnrecognized)),
2643 }
2644 }
2645
2646 fn parse_primitive(&self) -> Result<Primitive> {
2647 match self.char() {
2648 '\\' => self.parse_escape(),
2649 '_' => {
2650 let ast = Primitive::Top(self.span_char());
2651 self.bump();
2652 Ok(ast)
2653 }
2654 '.' => {
2655 let ast = Primitive::Dot(self.span_char());
2656 self.bump();
2657 Ok(ast)
2658 }
2659 '^' => {
2660 let ast = Primitive::Assertion(ast::Assertion {
2661 span: self.span_char(),
2662 kind: ast::AssertionKind::StartLine,
2663 });
2664 self.bump();
2665 Ok(ast)
2666 }
2667 '$' => {
2668 let ast = Primitive::Assertion(ast::Assertion {
2669 span: self.span_char(),
2670 kind: ast::AssertionKind::EndLine,
2671 });
2672 self.bump();
2673 Ok(ast)
2674 }
2675 c => {
2676 let ast = Primitive::Literal(Literal {
2677 span: self.span_char(),
2678 kind: LiteralKind::Verbatim,
2679 c,
2680 });
2681 self.bump();
2682 Ok(ast)
2683 }
2684 }
2685 }
2686
2687 #[inline(never)]
2688 fn parse_escape(&self) -> Result<Primitive> {
2689 assert_eq!(self.char(), '\\');
2690 let start = self.pos();
2691 if !self.bump() {
2692 return Err(self.error(
2693 Span::new(start, self.pos()),
2694 ast::ErrorKind::EscapeUnexpectedEof,
2695 ));
2696 }
2697 let c = self.char();
2698 match c {
2700 '0'..='9' => {
2701 if !self.parser().octal {
2702 return Err(self.error(
2703 Span::new(start, self.span_char().end),
2704 ast::ErrorKind::UnsupportedBackreference,
2705 ));
2706 }
2707 let mut lit = self.parse_octal();
2708 lit.span.start = start;
2709 return Ok(Primitive::Literal(lit));
2710 }
2711 'x' | 'u' | 'U' => {
2712 let mut lit = self.parse_hex()?;
2713 lit.span.start = start;
2714 return Ok(Primitive::Literal(lit));
2715 }
2716 'p' | 'P' => {
2717 let mut cls = self.parse_unicode_class()?;
2718 cls.span.start = start;
2719 return Ok(Primitive::Unicode(cls));
2720 }
2721 'd' | 's' | 'w' | 'D' | 'S' | 'W' => {
2722 let mut cls = self.parse_perl_class();
2723 cls.span.start = start;
2724 return Ok(Primitive::Perl(cls));
2725 }
2726 _ => {}
2727 }
2728
2729 self.bump();
2731 let span = Span::new(start, self.pos());
2732 if is_meta_character(c) {
2733 return Ok(Primitive::Literal(Literal {
2734 span,
2735 kind: LiteralKind::Meta,
2736 c,
2737 }));
2738 }
2739 if is_escapeable_character(c) {
2740 return Ok(Primitive::Literal(Literal {
2741 span,
2742 kind: LiteralKind::Superfluous,
2743 c,
2744 }));
2745 }
2746 let special = |kind, c| {
2747 Ok(Primitive::Literal(Literal {
2748 span,
2749 kind: LiteralKind::Special(kind),
2750 c,
2751 }))
2752 };
2753 match c {
2754 'a' => special(SpecialLiteralKind::Bell, '\x07'),
2755 'f' => special(SpecialLiteralKind::FormFeed, '\x0C'),
2756 't' => special(SpecialLiteralKind::Tab, '\t'),
2757 'n' => special(SpecialLiteralKind::LineFeed, '\n'),
2758 'r' => special(SpecialLiteralKind::CarriageReturn, '\r'),
2759 'v' => special(SpecialLiteralKind::VerticalTab, '\x0B'),
2760 'A' => Ok(Primitive::Assertion(ast::Assertion {
2761 span,
2762 kind: ast::AssertionKind::StartText,
2763 })),
2764 'z' => Ok(Primitive::Assertion(ast::Assertion {
2765 span,
2766 kind: ast::AssertionKind::EndText,
2767 })),
2768 'b' => {
2769 let mut wb = ast::Assertion {
2770 span,
2771 kind: ast::AssertionKind::WordBoundary,
2772 };
2773 if !self.is_eof() && self.char() == '{' {
2776 if let Some(kind) = self.maybe_parse_special_word_boundary(start)? {
2777 wb.kind = kind;
2778 wb.span.end = self.pos();
2779 }
2780 }
2781 Ok(Primitive::Assertion(wb))
2782 }
2783 'B' => Ok(Primitive::Assertion(ast::Assertion {
2784 span,
2785 kind: ast::AssertionKind::NotWordBoundary,
2786 })),
2787 '<' => Ok(Primitive::Assertion(ast::Assertion {
2788 span,
2789 kind: ast::AssertionKind::WordBoundaryStartAngle,
2790 })),
2791 '>' => Ok(Primitive::Assertion(ast::Assertion {
2792 span,
2793 kind: ast::AssertionKind::WordBoundaryEndAngle,
2794 })),
2795 _ => Err(self.error(span, ast::ErrorKind::EscapeUnrecognized)),
2796 }
2797 }
2798
2799 fn maybe_parse_special_word_boundary(
2800 &self,
2801 wb_start: Position,
2802 ) -> Result<Option<ast::AssertionKind>> {
2803 assert_eq!(self.char(), '{');
2804
2805 let is_valid_char = |c| matches!(c, 'A'..='Z' | 'a'..='z' | '-');
2806 let start = self.pos();
2807 if !self.bump_and_bump_space() {
2808 return Err(self.error(
2809 Span::new(wb_start, self.pos()),
2810 ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof,
2811 ));
2812 }
2813 let start_contents = self.pos();
2814 if !is_valid_char(self.char()) {
2815 self.parser().pos.set(start);
2816 return Ok(None);
2817 }
2818
2819 let mut scratch = self.parser().scratch.borrow_mut();
2821 scratch.clear();
2822 while !self.is_eof() && is_valid_char(self.char()) {
2823 scratch.push(self.char());
2824 self.bump_and_bump_space();
2825 }
2826 if self.is_eof() || self.char() != '}' {
2827 return Err(self.error(
2828 Span::new(start, self.pos()),
2829 ast::ErrorKind::SpecialWordBoundaryUnclosed,
2830 ));
2831 }
2832 let end = self.pos();
2833 self.bump();
2834 let kind = match scratch.as_str() {
2835 "start" => ast::AssertionKind::WordBoundaryStart,
2836 "end" => ast::AssertionKind::WordBoundaryEnd,
2837 "start-half" => ast::AssertionKind::WordBoundaryStartHalf,
2838 "end-half" => ast::AssertionKind::WordBoundaryEndHalf,
2839 _ => {
2840 return Err(self.error(
2841 Span::new(start_contents, end),
2842 ast::ErrorKind::SpecialWordBoundaryUnrecognized,
2843 ))
2844 }
2845 };
2846 Ok(Some(kind))
2847 }
2848
2849 #[inline(never)]
2850 fn parse_octal(&self) -> Literal {
2851 assert!(self.parser().octal);
2852 assert!('0' <= self.char() && self.char() <= '7');
2853 let start = self.pos();
2854 while self.bump()
2856 && '0' <= self.char()
2857 && self.char() <= '7'
2858 && self.pos().offset - start.offset <= 2
2859 {}
2860 let end = self.pos();
2861 let octal = &self.pattern()[start.offset..end.offset];
2862 let codepoint = u32::from_str_radix(octal, 8).expect("valid octal number");
2865 let c = char::from_u32(codepoint).expect("Unicode scalar value");
2868 Literal {
2869 span: Span::new(start, end),
2870 kind: LiteralKind::Octal,
2871 c,
2872 }
2873 }
2874
2875 #[inline(never)]
2876 fn parse_hex(&self) -> Result<Literal> {
2877 assert!(self.char() == 'x' || self.char() == 'u' || self.char() == 'U');
2878
2879 let hex_kind = match self.char() {
2880 'x' => HexLiteralKind::X,
2881 'u' => HexLiteralKind::UnicodeShort,
2882 _ => HexLiteralKind::UnicodeLong,
2883 };
2884 if !self.bump_and_bump_space() {
2885 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2886 }
2887 if self.char() == '{' {
2888 self.parse_hex_brace(hex_kind)
2889 } else {
2890 self.parse_hex_digits(hex_kind)
2891 }
2892 }
2893
2894 #[inline(never)]
2895 fn parse_hex_digits(&self, kind: HexLiteralKind) -> Result<Literal> {
2896 let mut scratch = self.parser().scratch.borrow_mut();
2897 scratch.clear();
2898
2899 let start = self.pos();
2900 for i in 0..kind.digits() {
2901 if i > 0 && !self.bump_and_bump_space() {
2902 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2903 }
2904 if !is_hex(self.char()) {
2905 return Err(self.error(self.span_char(), ast::ErrorKind::EscapeHexInvalidDigit));
2906 }
2907 scratch.push(self.char());
2908 }
2909 self.bump_and_bump_space();
2910 let end = self.pos();
2911 let hex = scratch.as_str();
2912 match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
2913 None => Err(self.error(Span::new(start, end), ast::ErrorKind::EscapeHexInvalid)),
2914 Some(c) => Ok(Literal {
2915 span: Span::new(start, end),
2916 kind: LiteralKind::HexFixed(kind),
2917 c,
2918 }),
2919 }
2920 }
2921
2922 #[inline(never)]
2923 fn parse_hex_brace(&self, kind: HexLiteralKind) -> Result<Literal> {
2924 let mut scratch = self.parser().scratch.borrow_mut();
2925 scratch.clear();
2926
2927 let brace_pos = self.pos();
2928 let start = self.span_char().end;
2929 while self.bump_and_bump_space() && self.char() != '}' {
2930 if !is_hex(self.char()) {
2931 return Err(self.error(self.span_char(), ast::ErrorKind::EscapeHexInvalidDigit));
2932 }
2933 scratch.push(self.char());
2934 }
2935 if self.is_eof() {
2936 return Err(self.error(
2937 Span::new(brace_pos, self.pos()),
2938 ast::ErrorKind::EscapeUnexpectedEof,
2939 ));
2940 }
2941 let end = self.pos();
2942 let hex = scratch.as_str();
2943 assert_eq!(self.char(), '}');
2944 self.bump_and_bump_space();
2945
2946 if hex.is_empty() {
2947 return Err(self.error(
2948 Span::new(brace_pos, self.pos()),
2949 ast::ErrorKind::EscapeHexEmpty,
2950 ));
2951 }
2952 match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
2953 None => Err(self.error(Span::new(start, end), ast::ErrorKind::EscapeHexInvalid)),
2954 Some(c) => Ok(Literal {
2955 span: Span::new(start, self.pos()),
2956 kind: LiteralKind::HexBrace(kind),
2957 c,
2958 }),
2959 }
2960 }
2961
2962 fn parse_decimal(&self) -> Result<u32> {
2963 let mut scratch = self.parser().scratch.borrow_mut();
2964 scratch.clear();
2965
2966 while !self.is_eof() && self.char().is_whitespace() {
2967 self.bump();
2968 }
2969 let start = self.pos();
2970 while !self.is_eof() && '0' <= self.char() && self.char() <= '9' {
2971 scratch.push(self.char());
2972 self.bump_and_bump_space();
2973 }
2974 let span = Span::new(start, self.pos());
2975 while !self.is_eof() && self.char().is_whitespace() {
2976 self.bump_and_bump_space();
2977 }
2978 let digits = scratch.as_str();
2979 if digits.is_empty() {
2980 return Err(self.error(span, ast::ErrorKind::DecimalEmpty));
2981 }
2982 match digits.parse::<u32>().ok() {
2983 Some(n) => Ok(n),
2984 None => Err(self.error(span, ast::ErrorKind::DecimalInvalid)),
2985 }
2986 }
2987
2988 #[inline(never)]
2989 fn parse_set_class(&self) -> Result<ClassBracketed> {
2990 assert_eq!(self.char(), '[');
2991
2992 let mut union = ClassSetUnion {
2993 span: self.span(),
2994 items: vec![],
2995 };
2996 loop {
2997 self.bump_space();
2998 if self.is_eof() {
2999 return Err(self.unclosed_class_error());
3000 }
3001 match self.char() {
3002 '[' => {
3003 if !self.parser().stack_class.borrow().is_empty() {
3004 if let Some(cls) = self.maybe_parse_ascii_class() {
3005 union.push(ClassSetItem::Ascii(cls));
3006 continue;
3007 }
3008 }
3009 union = self.push_class_open(union)?;
3010 }
3011 ']' => match self.pop_class(union)? {
3012 Either::Left(nested_union) => {
3013 union = nested_union;
3014 }
3015 Either::Right(class) => return Ok(class),
3016 },
3017 '&' if self.peek() == Some('&') => {
3018 assert!(self.bump_if("&&"));
3019 union = self.push_class_op(ClassSetBinaryOpKind::Intersection, union);
3020 }
3021 '-' if self.peek() == Some('-') => {
3022 assert!(self.bump_if("--"));
3023 union = self.push_class_op(ClassSetBinaryOpKind::Difference, union);
3024 }
3025 '~' if self.peek() == Some('~') => {
3026 assert!(self.bump_if("~~"));
3027 union = self.push_class_op(ClassSetBinaryOpKind::SymmetricDifference, union);
3028 }
3029 _ => {
3030 union.push(self.parse_set_class_range()?);
3031 }
3032 }
3033 }
3034 }
3035
3036 #[inline(never)]
3037 fn parse_set_class_range(&self) -> Result<ClassSetItem> {
3038 let prim1 = self.parse_set_class_item()?;
3039 self.bump_space();
3040 if self.is_eof() {
3041 return Err(self.unclosed_class_error());
3042 }
3043 if self.char() != '-' || self.peek_space() == Some(']') || self.peek_space() == Some('-') {
3044 return prim1.into_class_set_item(self);
3045 }
3046 if !self.bump_and_bump_space() {
3047 return Err(self.unclosed_class_error());
3048 }
3049 let prim2 = self.parse_set_class_item()?;
3050 let range = ClassSetRange {
3051 span: Span::new(prim1.span().start, prim2.span().end),
3052 start: prim1.into_class_literal(self)?,
3053 end: prim2.into_class_literal(self)?,
3054 };
3055 if !range.is_valid() {
3056 return Err(self.error(range.span, ast::ErrorKind::ClassRangeInvalid));
3057 }
3058 Ok(ClassSetItem::Range(range))
3059 }
3060
3061 #[inline(never)]
3062 fn parse_set_class_item(&self) -> Result<Primitive> {
3063 if self.char() == '\\' {
3064 self.parse_escape()
3065 } else {
3066 let x = Primitive::Literal(Literal {
3067 span: self.span_char(),
3068 kind: LiteralKind::Verbatim,
3069 c: self.char(),
3070 });
3071 self.bump();
3072 Ok(x)
3073 }
3074 }
3075
3076 #[inline(never)]
3077 fn parse_set_class_open(&self) -> Result<(ClassBracketed, ClassSetUnion)> {
3078 assert_eq!(self.char(), '[');
3079 let start = self.pos();
3080 if !self.bump_and_bump_space() {
3081 return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
3082 }
3083
3084 let negated = if self.char() != '^' {
3085 false
3086 } else {
3087 if !self.bump_and_bump_space() {
3088 return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
3089 }
3090 true
3091 };
3092 let mut union = ClassSetUnion {
3094 span: self.span(),
3095 items: vec![],
3096 };
3097 while self.char() == '-' {
3098 union.push(ClassSetItem::Literal(Literal {
3099 span: self.span_char(),
3100 kind: LiteralKind::Verbatim,
3101 c: '-',
3102 }));
3103 if !self.bump_and_bump_space() {
3104 return Err(self.error(Span::new(start, start), ast::ErrorKind::ClassUnclosed));
3105 }
3106 }
3107 if union.items.is_empty() && self.char() == ']' {
3110 union.push(ClassSetItem::Literal(Literal {
3111 span: self.span_char(),
3112 kind: LiteralKind::Verbatim,
3113 c: ']',
3114 }));
3115 if !self.bump_and_bump_space() {
3116 return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
3117 }
3118 }
3119 let set = ClassBracketed {
3120 span: Span::new(start, self.pos()),
3121 negated,
3122 kind: ClassSet::union(ClassSetUnion {
3123 span: Span::new(union.span.start, union.span.start),
3124 items: vec![],
3125 }),
3126 };
3127 Ok((set, union))
3128 }
3129
3130 #[inline(never)]
3131 fn maybe_parse_ascii_class(&self) -> Option<ClassAscii> {
3132 assert_eq!(self.char(), '[');
3133 let start = self.pos();
3135 let mut negated = false;
3136 if !self.bump() || self.char() != ':' {
3137 self.parser().pos.set(start);
3138 return None;
3139 }
3140 if !self.bump() {
3141 self.parser().pos.set(start);
3142 return None;
3143 }
3144 if self.char() == '^' {
3145 negated = true;
3146 if !self.bump() {
3147 self.parser().pos.set(start);
3148 return None;
3149 }
3150 }
3151 let name_start = self.offset();
3152 while self.char() != ':' && self.bump() {}
3153 if self.is_eof() {
3154 self.parser().pos.set(start);
3155 return None;
3156 }
3157 let name = &self.pattern()[name_start..self.offset()];
3158 if !self.bump_if(":]") {
3159 self.parser().pos.set(start);
3160 return None;
3161 }
3162 let kind = match regex_syntax::ast::ClassAsciiKind::from_name(name) {
3163 Some(kind) => kind,
3164 None => {
3165 self.parser().pos.set(start);
3166 return None;
3167 }
3168 };
3169 Some(ClassAscii {
3170 span: Span::new(start, self.pos()),
3171 kind,
3172 negated,
3173 })
3174 }
3175
3176 #[inline(never)]
3177 fn parse_unicode_class(&self) -> Result<ClassUnicode> {
3178 assert!(self.char() == 'p' || self.char() == 'P');
3179
3180 let mut scratch = self.parser().scratch.borrow_mut();
3181 scratch.clear();
3182
3183 let negated = self.char() == 'P';
3184 if !self.bump_and_bump_space() {
3185 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
3186 }
3187 let (start, kind) = if self.char() == '{' {
3188 let start = self.span_char().end;
3189 while self.bump_and_bump_space() && self.char() != '}' {
3190 scratch.push(self.char());
3191 }
3192 if self.is_eof() {
3193 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
3194 }
3195 assert_eq!(self.char(), '}');
3196 self.bump();
3197
3198 let name = scratch.as_str();
3199 if let Some(i) = name.find("!=") {
3200 (
3201 start,
3202 ClassUnicodeKind::NamedValue {
3203 op: ClassUnicodeOpKind::NotEqual,
3204 name: name[..i].to_string(),
3205 value: name[i + 2..].to_string(),
3206 },
3207 )
3208 } else if let Some(i) = name.find(':') {
3209 (
3210 start,
3211 ClassUnicodeKind::NamedValue {
3212 op: ClassUnicodeOpKind::Colon,
3213 name: name[..i].to_string(),
3214 value: name[i + 1..].to_string(),
3215 },
3216 )
3217 } else if let Some(i) = name.find('=') {
3218 (
3219 start,
3220 ClassUnicodeKind::NamedValue {
3221 op: ClassUnicodeOpKind::Equal,
3222 name: name[..i].to_string(),
3223 value: name[i + 1..].to_string(),
3224 },
3225 )
3226 } else {
3227 (start, ClassUnicodeKind::Named(name.to_string()))
3228 }
3229 } else {
3230 let start = self.pos();
3231 let c = self.char();
3232 if c == '\\' {
3233 return Err(self.error(self.span_char(), ast::ErrorKind::UnicodeClassInvalid));
3234 }
3235 self.bump_and_bump_space();
3236 let kind = ClassUnicodeKind::OneLetter(c);
3237 (start, kind)
3238 };
3239 Ok(ClassUnicode {
3240 span: Span::new(start, self.pos()),
3241 negated,
3242 kind,
3243 })
3244 }
3245
3246 #[inline(never)]
3247 fn parse_perl_class(&self) -> ClassPerl {
3248 let c = self.char();
3249 let span = self.span_char();
3250 self.bump();
3251 let (negated, kind) = match c {
3252 'd' => (false, regex_syntax::ast::ClassPerlKind::Digit),
3253 'D' => (true, regex_syntax::ast::ClassPerlKind::Digit),
3254 's' => (false, regex_syntax::ast::ClassPerlKind::Space),
3255 'S' => (true, regex_syntax::ast::ClassPerlKind::Space),
3256 'w' => (false, regex_syntax::ast::ClassPerlKind::Word),
3257 'W' => (true, regex_syntax::ast::ClassPerlKind::Word),
3258 c => panic!("expected valid Perl class but got '{}'", c),
3259 };
3260 ClassPerl {
3261 span,
3262 kind,
3263 negated,
3264 }
3265 }
3266}
3267
3268fn is_universal_perl_pair(item: ®ex_syntax::ast::ClassSetItem) -> bool {
3271 use regex_syntax::ast::ClassSetItem;
3272 let items = match item {
3273 ClassSetItem::Union(u) => &u.items,
3274 _ => return false,
3275 };
3276 if items.len() != 2 {
3277 return false;
3278 }
3279 match (&items[0], &items[1]) {
3280 (ClassSetItem::Perl(a), ClassSetItem::Perl(b)) => {
3281 let is_all = a.kind == b.kind && a.negated != b.negated;
3282 is_all
3283 }
3284 _ => false,
3285 }
3286}
3287
3288pub fn max_concat_length(ast: &ast::Ast) -> usize {
3289 match ast {
3290 ast::Ast::Empty(_)
3291 | ast::Ast::Flags(_)
3292 | ast::Ast::Literal(_)
3293 | ast::Ast::Dot(_)
3294 | ast::Ast::Top(_)
3295 | ast::Ast::Assertion(_)
3296 | ast::Ast::ClassUnicode(_)
3297 | ast::Ast::ClassPerl(_)
3298 | ast::Ast::ClassBracketed(_) => 0,
3299 ast::Ast::Group(g) => max_concat_length(&g.ast),
3300 ast::Ast::Complement(c) => max_concat_length(&c.ast),
3301 ast::Ast::Lookaround(l) => max_concat_length(&l.ast),
3302 ast::Ast::Repetition(r) => max_concat_length(&r.ast),
3303 ast::Ast::Concat(c) => c
3304 .asts
3305 .len()
3306 .max(c.asts.iter().map(max_concat_length).max().unwrap_or(0)),
3307 ast::Ast::Alternation(a) => a.asts.iter().map(max_concat_length).max().unwrap_or(0),
3308 ast::Ast::Intersection(i) => i.asts.iter().map(max_concat_length).max().unwrap_or(0),
3309 }
3310}
3311
3312pub fn expanded_ast_size(ast: &ast::Ast, limit: u64) -> u64 {
3313 fn go(ast: &ast::Ast, limit: u64) -> u64 {
3314 match ast {
3315 ast::Ast::Empty(_) | ast::Ast::Flags(_) => 1,
3316 ast::Ast::Literal(_) | ast::Ast::Dot(_) | ast::Ast::Top(_) => 1,
3317 ast::Ast::Assertion(_) => 1,
3318 ast::Ast::ClassUnicode(_) | ast::Ast::ClassPerl(_) | ast::Ast::ClassBracketed(_) => 1,
3319 ast::Ast::Group(g) => go(&g.ast, limit).saturating_add(1).min(limit),
3320 ast::Ast::Complement(c) => go(&c.ast, limit).saturating_add(1).min(limit),
3321 ast::Ast::Lookaround(l) => go(&l.ast, limit).saturating_add(1).min(limit),
3322 ast::Ast::Concat(c) => sum_children(&c.asts, limit),
3323 ast::Ast::Alternation(a) => sum_children(&a.asts, limit),
3324 ast::Ast::Intersection(i) => sum_children(&i.asts, limit),
3325 ast::Ast::Repetition(r) => {
3326 let body = go(&r.ast, limit);
3327 let factor: u64 = match &r.op.kind {
3328 ast::RepetitionKind::ZeroOrOne => 2,
3329 ast::RepetitionKind::ZeroOrMore | ast::RepetitionKind::OneOrMore => 2,
3330 ast::RepetitionKind::Range(ast::RepetitionRange::Exactly(n)) => {
3331 (*n as u64).max(1)
3332 }
3333 ast::RepetitionKind::Range(ast::RepetitionRange::AtLeast(n)) => {
3334 (*n as u64).max(1).saturating_add(1)
3335 }
3336 ast::RepetitionKind::Range(ast::RepetitionRange::Bounded(_, m)) => {
3337 (*m as u64).max(1)
3338 }
3339 };
3340 body.saturating_mul(factor).min(limit)
3341 }
3342 }
3343 }
3344 fn sum_children(children: &[ast::Ast], limit: u64) -> u64 {
3345 let mut total: u64 = 0;
3346 for c in children {
3347 total = total.saturating_add(go(c, limit));
3348 if total >= limit {
3349 return limit;
3350 }
3351 }
3352 total
3353 }
3354 go(ast, limit)
3355}
3356
3357pub fn parse_ast<'s>(tb: &mut TB<'s>, pattern: &'s str) -> std::result::Result<NodeId, ParseError> {
3358 let mut p: ResharpParser<'s> = ResharpParser::new(pattern);
3359 p.parse(tb)
3360}
3361
3362pub fn parse_ast_with<'s>(
3363 tb: &mut TB<'s>,
3364 pattern: &'s str,
3365 flags: &PatternFlags,
3366) -> std::result::Result<NodeId, ParseError> {
3367 let mut p: ResharpParser<'s> = ResharpParser::with_flags(pattern, flags);
3368 p.parse(tb)
3369}
3370
3371pub fn parse_to_ast(pattern: &str) -> std::result::Result<ast::Ast, ParseError> {
3373 let mut p: ResharpParser = ResharpParser::new(pattern);
3374 p.parse_inner()
3375}