1#![warn(dead_code)]
6pub mod ast;
7use std::cell::{Cell, RefCell};
8
9use ast::{Ast, Concat, ErrorKind, GroupKind, LookaroundKind, RepetitionKind};
10use regex_syntax::{
11 ast::{
12 ClassAscii, ClassBracketed, ClassPerl, ClassSet, ClassSetBinaryOpKind, ClassSetItem,
13 ClassSetRange, ClassSetUnion, ClassUnicode, ClassUnicodeKind, ClassUnicodeOpKind,
14 HexLiteralKind, Literal, LiteralKind, Position, Span, SpecialLiteralKind,
15 },
16 hir::{
17 self,
18 translate::{Translator, TranslatorBuilder},
19 },
20 utf8::Utf8Sequences,
21};
22use resharp_algebra::{Kind, NodeId};
23
24type TB<'s> = resharp_algebra::RegexBuilder;
25
26pub struct PatternFlags {
28 pub unicode: bool,
30 pub full_unicode: bool,
32 pub case_insensitive: bool,
34 pub dot_matches_new_line: bool,
36 pub multiline: bool,
38 pub ignore_whitespace: bool,
40 pub ascii_perl_classes: bool,
43 pub expanded_ast_limit: u64,
46 pub max_list_len: usize,
49 pub max_repeat: u32,
51}
52
53pub const DEFAULT_MAX_REPEAT: u32 = 500;
56pub const DEFAULT_EXPANDED_AST_LIMIT: u64 = 50_000;
57pub const DEFAULT_MAX_LIST_LEN: usize = 4_000;
58
59impl Default for PatternFlags {
60 fn default() -> Self {
61 Self {
62 unicode: true,
63 full_unicode: false,
64 case_insensitive: false,
65 dot_matches_new_line: false,
66 multiline: true,
67 ignore_whitespace: false,
68 ascii_perl_classes: false,
69 expanded_ast_limit: DEFAULT_EXPANDED_AST_LIMIT,
70 max_list_len: DEFAULT_MAX_LIST_LEN,
71 max_repeat: DEFAULT_MAX_REPEAT,
72 }
73 }
74}
75
76#[derive(Clone, Copy, PartialEq, Debug)]
77enum WordCharKind {
78 Word,
79 NonWord,
80 MaybeWord,
81 MaybeNonWord,
82 Unknown,
83 Edge,
84}
85
86fn is_word_byte(b: u8) -> bool {
87 b.is_ascii_alphanumeric() || b == b'_'
88}
89
90fn class_set_item_word_kind(item: ®ex_syntax::ast::ClassSetItem) -> WordCharKind {
91 use regex_syntax::ast::{ClassPerlKind, ClassSetItem};
92 use WordCharKind::*;
93 match item {
94 ClassSetItem::Empty(_) => Unknown,
95 ClassSetItem::Literal(l) => {
96 if is_word_byte(l.c as u8) {
97 Word
98 } else {
99 NonWord
100 }
101 }
102 ClassSetItem::Range(r) => {
103 let all_word = (r.start.c as u8..=r.end.c as u8).all(is_word_byte);
104 let all_non = (r.start.c as u8..=r.end.c as u8).all(|b| !is_word_byte(b));
105 if all_word {
106 Word
107 } else if all_non {
108 NonWord
109 } else {
110 Unknown
111 }
112 }
113 ClassSetItem::Perl(p) => match (&p.kind, p.negated) {
114 (ClassPerlKind::Word, false) => Word,
115 (ClassPerlKind::Word, true) => NonWord,
116 (ClassPerlKind::Space, false) => NonWord,
117 (ClassPerlKind::Digit, false) => Word,
118 _ => Unknown,
119 },
120 ClassSetItem::Bracketed(b) => class_bracketed_word_kind(b),
121 ClassSetItem::Union(u) => {
122 let mut kind = Unknown;
123 for item in &u.items {
124 let k = class_set_item_word_kind(item);
125 kind = match (kind, k) {
126 (_, Unknown) => return Unknown,
127 (Unknown, _) => k,
128 (Word, Word) => Word,
129 (NonWord, NonWord) => NonWord,
130 _ => return Unknown,
131 };
132 }
133 kind
134 }
135 _ => Unknown,
136 }
137}
138
139fn class_bracketed_word_kind(c: ®ex_syntax::ast::ClassBracketed) -> WordCharKind {
140 use regex_syntax::ast::{ClassPerlKind, ClassSet, ClassSetItem};
141 use WordCharKind::*;
142 if c.negated {
143 return match &c.kind {
144 ClassSet::Item(ClassSetItem::Perl(p)) if p.kind == ClassPerlKind::Word => {
145 if p.negated {
146 Word
147 } else {
148 NonWord
149 }
150 }
151 _ => Unknown,
152 };
153 }
154 match &c.kind {
155 ClassSet::Item(item) => class_set_item_word_kind(item),
156 ClassSet::BinaryOp(_) => Unknown,
157 }
158}
159
160fn ascii_class_lit(span: Span, c: char) -> regex_syntax::ast::Literal {
161 regex_syntax::ast::Literal {
162 span,
163 kind: regex_syntax::ast::LiteralKind::Verbatim,
164 c,
165 }
166}
167
168fn ascii_class_range(span: Span, a: char, b: char) -> regex_syntax::ast::ClassSetItem {
169 regex_syntax::ast::ClassSetItem::Range(regex_syntax::ast::ClassSetRange {
170 span,
171 start: ascii_class_lit(span, a),
172 end: ascii_class_lit(span, b),
173 })
174}
175
176fn ascii_perl_positive(
177 span: Span,
178 kind: ®ex_syntax::ast::ClassPerlKind,
179) -> regex_syntax::ast::ClassSetItem {
180 use regex_syntax::ast::{ClassPerlKind, ClassSetItem, ClassSetUnion};
181 match kind {
182 ClassPerlKind::Digit => ascii_class_range(span, '0', '9'),
183 ClassPerlKind::Word => ClassSetItem::Union(ClassSetUnion {
184 span,
185 items: vec![
186 ascii_class_range(span, 'a', 'z'),
187 ascii_class_range(span, 'A', 'Z'),
188 ascii_class_range(span, '0', '9'),
189 ClassSetItem::Literal(ascii_class_lit(span, '_')),
190 ],
191 }),
192 ClassPerlKind::Space => ClassSetItem::Union(ClassSetUnion {
193 span,
194 items: ['\t', '\n', '\x0B', '\x0C', '\r', ' ']
195 .into_iter()
196 .map(|c| ClassSetItem::Literal(ascii_class_lit(span, c)))
197 .collect(),
198 }),
199 }
200}
201
202fn ascii_perl_set_item(
203 span: Span,
204 kind: ®ex_syntax::ast::ClassPerlKind,
205 negated: bool,
206) -> regex_syntax::ast::ClassSetItem {
207 use regex_syntax::ast::{ClassBracketed, ClassSet, ClassSetItem};
208 let positive = ascii_perl_positive(span, kind);
209 if negated {
210 ClassSetItem::Bracketed(Box::new(ClassBracketed {
211 span,
212 negated: true,
213 kind: ClassSet::Item(positive),
214 }))
215 } else {
216 positive
217 }
218}
219
220fn rewrite_ascii_perl_set(set: ®ex_syntax::ast::ClassSet) -> regex_syntax::ast::ClassSet {
221 use regex_syntax::ast::{ClassSet, ClassSetBinaryOp};
222 match set {
223 ClassSet::Item(item) => ClassSet::Item(rewrite_ascii_perl_item(item)),
224 ClassSet::BinaryOp(op) => ClassSet::BinaryOp(ClassSetBinaryOp {
225 span: op.span,
226 kind: op.kind.clone(),
227 lhs: Box::new(rewrite_ascii_perl_set(&op.lhs)),
228 rhs: Box::new(rewrite_ascii_perl_set(&op.rhs)),
229 }),
230 }
231}
232
233fn rewrite_ascii_perl_item(
234 item: ®ex_syntax::ast::ClassSetItem,
235) -> regex_syntax::ast::ClassSetItem {
236 use regex_syntax::ast::{ClassBracketed, ClassSetItem, ClassSetUnion};
237 match item {
238 ClassSetItem::Perl(p) => ascii_perl_set_item(p.span, &p.kind, p.negated),
239 ClassSetItem::Union(u) => ClassSetItem::Union(ClassSetUnion {
240 span: u.span,
241 items: u.items.iter().map(rewrite_ascii_perl_item).collect(),
242 }),
243 ClassSetItem::Bracketed(b) => ClassSetItem::Bracketed(Box::new(ClassBracketed {
244 span: b.span,
245 negated: b.negated,
246 kind: rewrite_ascii_perl_set(&b.kind),
247 })),
248 other => other.clone(),
249 }
250}
251
252#[derive(Clone, Debug, Eq, PartialEq)]
253enum Primitive {
254 Literal(Literal),
255 Assertion(ast::Assertion),
256 Dot(Span),
257 Top(Span),
258 Perl(ClassPerl),
259 Unicode(ClassUnicode),
260}
261
262impl Primitive {
263 fn span(&self) -> &Span {
264 match *self {
265 Primitive::Literal(ref x) => &x.span,
266 Primitive::Assertion(ref x) => &x.span,
267 Primitive::Dot(ref span) => span,
268 Primitive::Top(ref span) => span,
269 Primitive::Perl(ref x) => &x.span,
270 Primitive::Unicode(ref x) => &x.span,
271 }
272 }
273
274 fn into_ast(self) -> Ast {
275 match self {
276 Primitive::Literal(lit) => Ast::literal(lit),
277 Primitive::Assertion(assert) => Ast::assertion(assert),
278 Primitive::Dot(span) => Ast::dot(span),
279 Primitive::Top(span) => Ast::top(span),
280 Primitive::Perl(cls) => Ast::class_perl(cls),
281 Primitive::Unicode(cls) => Ast::class_unicode(cls),
282 }
283 }
284
285 fn into_class_set_item(self, p: &ResharpParser) -> Result<regex_syntax::ast::ClassSetItem> {
286 use self::Primitive::*;
287 use regex_syntax::ast::ClassSetItem;
288
289 match self {
290 Literal(lit) => Ok(ClassSetItem::Literal(lit)),
291 Perl(cls) => Ok(ClassSetItem::Perl(cls)),
292 Unicode(cls) => Ok(ClassSetItem::Unicode(cls)),
293 x => Err(p.error(*x.span(), ast::ErrorKind::ClassEscapeInvalid)),
294 }
295 }
296
297 fn into_class_literal(self, p: &ResharpParser) -> Result<Literal> {
298 use self::Primitive::*;
299
300 match self {
301 Literal(lit) => Ok(lit),
302 x => Err(p.error(*x.span(), ast::ErrorKind::ClassRangeLiteral)),
303 }
304 }
305}
306
307#[derive(Clone, Debug, Eq, PartialEq)]
308pub enum Either<Left, Right> {
309 Left(Left),
310 Right(Right),
311}
312
313#[derive(Clone, Debug, Eq, PartialEq)]
314pub struct ParseError {
315 pub kind: ErrorKind,
317 pattern: String,
320 pub span: Span,
322}
323
324impl std::fmt::Display for ParseError {
325 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
326 write!(f, "{:?}: {:?}", self.kind, self.span)
327 }
328}
329impl std::error::Error for ParseError {}
330
331type Result<T> = core::result::Result<T, ParseError>;
332
333#[derive(Clone, Debug)]
334enum GroupState {
335 Group {
337 concat: Concat,
339 group: ast::Group,
341 ignore_whitespace: bool,
343 },
344 Alternation(ast::Alternation),
345 Intersection(ast::Intersection),
346}
347
348#[derive(Clone, Debug)]
349enum ClassState {
350 Open {
352 union: regex_syntax::ast::ClassSetUnion,
354 set: regex_syntax::ast::ClassBracketed,
355 },
356 Op {
359 kind: regex_syntax::ast::ClassSetBinaryOpKind,
361 lhs: regex_syntax::ast::ClassSet,
363 },
364}
365
366pub struct ResharpParser<'s> {
368 perl_classes: Vec<(bool, regex_syntax::ast::ClassPerlKind, NodeId)>,
369 unicode_classes: resharp_algebra::UnicodeClassCache,
370 pub translator: regex_syntax::hir::translate::Translator,
371 pub pattern: &'s str,
372 pos: Cell<Position>,
373 capture_index: Cell<u32>,
374 octal: bool,
375 empty_min_range: bool,
376 ignore_whitespace: Cell<bool>,
377 dot_all: Cell<bool>,
378 multiline: Cell<bool>,
379 global_unicode: bool,
380 global_full_unicode: bool,
381 global_ascii_perl: bool,
382 global_case_insensitive: bool,
383 expanded_ast_limit: u64,
384 max_list_len: usize,
385 max_repeat: u32,
386 comments: RefCell<Vec<ast::Comment>>,
387 stack_group: RefCell<Vec<GroupState>>,
388 stack_class: RefCell<Vec<ClassState>>,
389 capture_names: RefCell<Vec<ast::CaptureName>>,
390 scratch: RefCell<String>,
391}
392
393fn specialize_err<T>(result: Result<T>, from: ast::ErrorKind, to: ast::ErrorKind) -> Result<T> {
394 result.map_err(|e| {
395 if e.kind == from {
396 ParseError {
397 kind: to,
398 pattern: e.pattern,
399 span: e.span,
400 }
401 } else {
402 e
403 }
404 })
405}
406
407fn is_capture_char(c: char, first: bool) -> bool {
408 if first {
409 c == '_' || c.is_alphabetic()
410 } else {
411 c == '_' || c == '.' || c == '[' || c == ']' || c.is_alphanumeric()
412 }
413}
414
415pub fn is_meta_character(c: char) -> bool {
416 matches!(
417 c,
418 '\\' | '.'
419 | '+'
420 | '*'
421 | '?'
422 | '('
423 | ')'
424 | '|'
425 | '['
426 | ']'
427 | '{'
428 | '}'
429 | '^'
430 | '$'
431 | '#'
432 | '&'
433 | '-'
434 | '~'
435 | '_'
436 )
437}
438
439pub fn escape(text: &str) -> String {
441 let mut buf = String::new();
442 escape_into(text, &mut buf);
443 buf
444}
445
446pub fn escape_into(text: &str, buf: &mut String) {
448 buf.reserve(text.len());
449 for c in text.chars() {
450 if is_meta_character(c) {
451 buf.push('\\');
452 }
453 buf.push(c);
454 }
455}
456
457pub fn is_escapeable_character(c: char) -> bool {
458 if is_meta_character(c) {
459 return true;
460 }
461 if !c.is_ascii() {
462 return false;
463 }
464 match c {
465 '0'..='9' | 'A'..='Z' | 'a'..='z' => false,
466 '<' | '>' => false,
467 _ => true,
468 }
469}
470
471fn is_hex(c: char) -> bool {
472 c.is_ascii_digit() || ('a'..='f').contains(&c) || ('A'..='F').contains(&c)
473}
474
475fn ensure_lookbehind_at_start(ast: &Ast, at_start: bool) -> core::result::Result<(), Span> {
476 match ast {
477 Ast::Concat(c) => {
478 let mut child_at_start = at_start;
479 for child in &c.asts {
480 ensure_lookbehind_at_start(child, child_at_start)?;
481 if ast_may_consume(child) {
482 child_at_start = false;
483 }
484 }
485 Ok(())
486 }
487 Ast::Alternation(a) => {
488 for child in &a.asts {
489 ensure_lookbehind_at_start(child, at_start)?;
490 }
491 Ok(())
492 }
493 Ast::Intersection(i) => {
494 for child in &i.asts {
495 ensure_lookbehind_at_start(child, at_start)?;
496 }
497 Ok(())
498 }
499 Ast::Complement(c) => ensure_lookbehind_at_start(&c.ast, at_start),
500 Ast::Group(g) => ensure_lookbehind_at_start(&g.ast, at_start),
501 Ast::Repetition(r) => ensure_lookbehind_at_start(&r.ast, at_start),
502 Ast::Lookaround(g) => {
503 match g.kind {
504 LookaroundKind::PositiveLookbehind | LookaroundKind::NegativeLookbehind => {
505 if !at_start {
506 return Err(g.span);
507 }
508 }
509 LookaroundKind::PositiveLookahead | LookaroundKind::NegativeLookahead => {}
510 }
511 ensure_lookbehind_at_start(&g.ast, true)
512 }
513 Ast::Empty(_)
514 | Ast::Flags(_)
515 | Ast::Literal(_)
516 | Ast::Dot(_)
517 | Ast::Top(_)
518 | Ast::Assertion(_)
519 | Ast::ClassUnicode(_)
520 | Ast::ClassPerl(_)
521 | Ast::ClassBracketed(_) => Ok(()),
522 }
523}
524
525fn ast_may_consume(ast: &Ast) -> bool {
526 match ast {
527 Ast::Empty(_) | Ast::Flags(_) | Ast::Assertion(_) | Ast::Lookaround(_) => false,
528 Ast::Literal(_)
529 | Ast::Dot(_)
530 | Ast::Top(_)
531 | Ast::ClassUnicode(_)
532 | Ast::ClassPerl(_)
533 | Ast::ClassBracketed(_) => true,
534 Ast::Group(g) => ast_may_consume(&g.ast),
535 Ast::Repetition(r) => {
536 if !ast_may_consume(&r.ast) {
537 return false;
538 }
539 match r.op.kind {
540 RepetitionKind::ZeroOrOne
541 | RepetitionKind::ZeroOrMore
542 | RepetitionKind::OneOrMore => true,
543 RepetitionKind::Range(ast::RepetitionRange::Exactly(0)) => false,
544 RepetitionKind::Range(ast::RepetitionRange::Bounded(_, 0)) => false,
545 RepetitionKind::Range(_) => true,
546 }
547 }
548 Ast::Alternation(a) => a.asts.iter().any(ast_may_consume),
549 Ast::Intersection(i) => i.asts.iter().any(ast_may_consume),
550 Ast::Complement(_) => true,
551 Ast::Concat(c) => c.asts.iter().any(ast_may_consume),
552 }
553}
554
555impl<'s> ResharpParser<'s> {
556 fn default_translator_builder(&self) -> TranslatorBuilder {
557 let mut trb = TranslatorBuilder::new();
558 trb.unicode(self.global_unicode);
559 trb.utf8(false);
560 trb.case_insensitive(self.global_case_insensitive);
561 trb
562 }
563
564 pub fn new(pattern: &'s str) -> Self {
565 Self::with_flags(pattern, &PatternFlags::default())
566 }
567
568 pub fn with_flags(pattern: &'s str, flags: &PatternFlags) -> Self {
569 let mut trb = TranslatorBuilder::new();
570 trb.unicode(flags.unicode);
571 trb.utf8(false);
572 trb.case_insensitive(flags.case_insensitive);
573 Self {
574 translator: trb.build(),
575 pattern,
576 perl_classes: vec![],
577 unicode_classes: resharp_algebra::UnicodeClassCache::default(),
578 pos: Cell::new(Position::new(0, 0, 0)),
579 capture_index: Cell::new(0),
580 octal: false,
581 empty_min_range: false,
582 ignore_whitespace: Cell::new(flags.ignore_whitespace),
583 dot_all: Cell::new(flags.dot_matches_new_line),
584 multiline: Cell::new(flags.multiline),
585 global_unicode: flags.unicode || flags.full_unicode || flags.ascii_perl_classes,
586 global_full_unicode: flags.full_unicode,
587 global_ascii_perl: flags.ascii_perl_classes,
588 global_case_insensitive: flags.case_insensitive,
589 expanded_ast_limit: flags.expanded_ast_limit,
590 max_list_len: flags.max_list_len,
591 max_repeat: flags.max_repeat,
592 comments: RefCell::new(vec![]),
593 stack_group: RefCell::new(vec![]),
594 stack_class: RefCell::new(vec![]),
595 capture_names: RefCell::new(vec![]),
596 scratch: RefCell::new(String::new()),
597 }
598 }
599
600 fn parser(&'_ self) -> &'_ ResharpParser<'_> {
601 self
602 }
603
604 fn pattern(&self) -> &str {
605 self.pattern
606 }
607
608 fn error(&self, span: Span, kind: ast::ErrorKind) -> ParseError {
609 ParseError {
610 kind,
611 pattern: self.pattern().to_string(),
612 span,
613 }
614 }
615
616 fn unsupported_error(&self, _: regex_syntax::hir::Error) -> ParseError {
617 self.error(
618 Span::splat(self.pos()),
619 ast::ErrorKind::UnsupportedResharpRegex,
620 )
621 }
622
623 fn offset(&self) -> usize {
624 self.parser().pos.get().offset
625 }
626
627 fn line(&self) -> usize {
628 self.parser().pos.get().line
629 }
630
631 fn column(&self) -> usize {
632 self.parser().pos.get().column
633 }
634
635 fn next_capture_index(&self, span: Span) -> Result<u32> {
636 let current = self.parser().capture_index.get();
637 let i = current
638 .checked_add(1)
639 .ok_or_else(|| self.error(span, ast::ErrorKind::CaptureLimitExceeded))?;
640 self.parser().capture_index.set(i);
641 Ok(i)
642 }
643
644 fn add_capture_name(&self, cap: &ast::CaptureName) -> Result<()> {
645 let mut names = self.parser().capture_names.borrow_mut();
646 match names.binary_search_by_key(&cap.name.as_str(), |c| c.name.as_str()) {
647 Err(i) => {
648 names.insert(i, cap.clone());
649 Ok(())
650 }
651 Ok(i) => Err(self.error(
652 cap.span,
653 ast::ErrorKind::GroupNameDuplicate {
654 original: names[i].span,
655 },
656 )),
657 }
658 }
659
660 fn ignore_whitespace(&self) -> bool {
661 self.parser().ignore_whitespace.get()
662 }
663
664 fn char(&self) -> char {
665 self.char_at(self.offset())
666 }
667
668 fn char_at(&self, i: usize) -> char {
669 self.pattern()[i..]
670 .chars()
671 .next()
672 .unwrap_or_else(|| panic!("expected char at offset {}", i))
673 }
674
675 fn bump(&self) -> bool {
676 if self.is_eof() {
677 return false;
678 }
679 let Position {
680 mut offset,
681 mut line,
682 mut column,
683 } = self.pos();
684 if self.char() == '\n' {
685 line = line.checked_add(1).unwrap();
686 column = 1;
687 } else {
688 column = column.checked_add(1).unwrap();
689 }
690 offset += self.char().len_utf8();
691 self.parser().pos.set(Position {
692 offset,
693 line,
694 column,
695 });
696 self.pattern()[self.offset()..].chars().next().is_some()
697 }
698
699 fn bump_if(&self, prefix: &str) -> bool {
700 if self.pattern()[self.offset()..].starts_with(prefix) {
701 for _ in 0..prefix.chars().count() {
702 self.bump();
703 }
704 true
705 } else {
706 false
707 }
708 }
709
710 fn is_lookaround_prefix(&self) -> Option<(bool, bool)> {
711 if self.bump_if("?=") {
712 return Some((true, true));
713 }
714 if self.bump_if("?!") {
715 return Some((true, false));
716 }
717 if self.bump_if("?<=") {
718 return Some((false, true));
719 }
720 if self.bump_if("?<!") {
721 return Some((false, false));
722 }
723 None
724 }
725
726 fn bump_and_bump_space(&self) -> bool {
727 if !self.bump() {
728 return false;
729 }
730 self.bump_space();
731 !self.is_eof()
732 }
733
734 fn bump_space(&self) {
735 if !self.ignore_whitespace() {
736 return;
737 }
738 while !self.is_eof() {
739 if self.char().is_whitespace() {
740 self.bump();
741 } else if self.char() == '#' {
742 let start = self.pos();
743 let mut comment_text = String::new();
744 self.bump();
745 while !self.is_eof() {
746 let c = self.char();
747 self.bump();
748 if c == '\n' {
749 break;
750 }
751 comment_text.push(c);
752 }
753 let comment = ast::Comment {
754 span: Span::new(start, self.pos()),
755 comment: comment_text,
756 };
757 self.parser().comments.borrow_mut().push(comment);
758 } else {
759 break;
760 }
761 }
762 }
763
764 fn peek(&self) -> Option<char> {
765 if self.is_eof() {
766 return None;
767 }
768 self.pattern()[self.offset() + self.char().len_utf8()..]
769 .chars()
770 .next()
771 }
772
773 fn peek_space(&self) -> Option<char> {
776 if !self.ignore_whitespace() {
777 return self.peek();
778 }
779 if self.is_eof() {
780 return None;
781 }
782 let mut start = self.offset() + self.char().len_utf8();
783 let mut in_comment = false;
784 for (i, c) in self.pattern()[start..].char_indices() {
785 if c.is_whitespace() {
786 continue;
787 } else if !in_comment && c == '#' {
788 in_comment = true;
789 } else if in_comment && c == '\n' {
790 in_comment = false;
791 } else {
792 start += i;
793 break;
794 }
795 }
796 self.pattern()[start..].chars().next()
797 }
798
799 fn is_eof(&self) -> bool {
800 self.offset() == self.pattern().len()
801 }
802
803 fn pos(&self) -> Position {
804 self.parser().pos.get()
805 }
806
807 fn span(&self) -> Span {
808 Span::splat(self.pos())
809 }
810
811 fn span_char(&self) -> Span {
812 let mut next = Position {
813 offset: self.offset().checked_add(self.char().len_utf8()).unwrap(),
814 line: self.line(),
815 column: self.column().checked_add(1).unwrap(),
816 };
817 if self.char() == '\n' {
818 next.line += 1;
819 next.column = 1;
820 }
821 Span::new(self.pos(), next)
822 }
823
824 #[inline(never)]
825 fn push_alternate(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
826 assert_eq!(self.char(), '|');
827 concat.span.end = self.pos();
828 self.push_or_add_alternation(concat);
829 self.bump();
830 Ok(ast::Concat {
831 span: self.span(),
832 asts: vec![],
833 })
834 }
835
836 fn push_or_add_alternation(&self, concat: Concat) {
837 use self::GroupState::*;
838
839 let mut stack = self.parser().stack_group.borrow_mut();
840 if let Some(&mut Alternation(ref mut alts)) = stack.last_mut() {
841 alts.asts.push(concat.into_ast());
842 return;
843 }
844 stack.push(Alternation(ast::Alternation {
845 span: Span::new(concat.span.start, self.pos()),
846 asts: vec![concat.into_ast()],
847 }));
848 }
849
850 #[inline(never)]
851 fn push_intersect(&self, mut concat: Concat) -> Result<Concat> {
852 assert_eq!(self.char(), '&');
853 concat.span.end = self.pos();
854 self.push_or_add_intersect(concat);
855 self.bump();
856 Ok(Concat {
857 span: self.span(),
858 asts: vec![],
859 })
860 }
861
862 fn push_or_add_intersect(&self, concat: Concat) {
863 use self::GroupState::*;
864
865 let mut stack = self.parser().stack_group.borrow_mut();
866 if let Some(&mut Intersection(ref mut alts)) = stack.last_mut() {
867 alts.asts.push(concat.into_ast());
868 return;
869 }
870 stack.push(Intersection(ast::Intersection {
871 span: Span::new(concat.span.start, self.pos()),
872 asts: vec![concat.into_ast()],
873 }));
874 }
875
876 #[inline(never)]
877 fn push_group(&self, mut concat: Concat) -> Result<Concat> {
878 assert_eq!(self.char(), '(');
879 match self.parse_group()? {
880 Either::Left(set) => {
881 let ignore = set.flags.flag_state(ast::Flag::IgnoreWhitespace);
882 if let Some(v) = ignore {
883 self.parser().ignore_whitespace.set(v);
884 }
885
886 concat.asts.push(Ast::flags(set));
887 Ok(concat)
888 }
889 Either::Right(group) => {
890 let old_ignore_whitespace = self.ignore_whitespace();
891 let new_ignore_whitespace = group
892 .flags()
893 .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
894 .unwrap_or(old_ignore_whitespace);
895 self.parser()
896 .stack_group
897 .borrow_mut()
898 .push(GroupState::Group {
899 concat,
900 group,
901 ignore_whitespace: old_ignore_whitespace,
902 });
903 self.parser().ignore_whitespace.set(new_ignore_whitespace);
904 Ok(Concat {
905 span: self.span(),
906 asts: vec![],
907 })
908 }
909 }
910 }
911
912 #[inline(never)]
913 fn push_compl_group(&self, concat: Concat) -> Result<Concat> {
914 assert_eq!(self.char(), '~');
915 self.bump();
916 if self.is_eof() || self.char() != '(' {
917 return Err(self.error(self.span(), ast::ErrorKind::ComplementGroupExpected));
918 }
919 let open_span = self.span_char();
920 self.bump();
921 let group = ast::Group {
922 span: open_span,
923 kind: ast::GroupKind::Complement,
924 ast: Box::new(Ast::empty(self.span())),
925 };
926
927 let old_ignore_whitespace = self.ignore_whitespace();
928 let new_ignore_whitespace = group
929 .flags()
930 .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
931 .unwrap_or(old_ignore_whitespace);
932 self.parser()
933 .stack_group
934 .borrow_mut()
935 .push(GroupState::Group {
936 concat,
937 group,
938 ignore_whitespace: old_ignore_whitespace,
939 });
940 self.parser().ignore_whitespace.set(new_ignore_whitespace);
941 Ok(Concat {
942 span: self.span(),
943 asts: vec![],
944 })
945 }
946
947 #[inline(never)]
948 fn pop_group(&self, mut group_concat: Concat) -> Result<Concat> {
949 use self::GroupState::*;
950 assert_eq!(self.char(), ')');
951 let mut stack = self.parser().stack_group.borrow_mut();
952 let topstack = stack.pop();
953
954 let (mut prior_concat, mut group, ignore_whitespace, alt) = match topstack {
955 Some(Group {
956 concat,
957 group,
958 ignore_whitespace,
959 }) => (concat, group, ignore_whitespace, None),
960 Some(Alternation(alt)) => match stack.pop() {
961 Some(Group {
962 concat,
963 group,
964 ignore_whitespace,
965 }) => (
966 concat,
967 group,
968 ignore_whitespace,
969 Some(Either::Left::<ast::Alternation, ast::Intersection>(alt)),
970 ),
971 None | Some(Alternation(_)) | Some(Intersection(_)) => {
972 return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
973 }
974 },
975 Some(Intersection(int)) => match stack.pop() {
976 Some(Group {
977 concat,
978 group,
979 ignore_whitespace,
980 }) => (
981 concat,
982 group,
983 ignore_whitespace,
984 Some(Either::Right::<ast::Alternation, ast::Intersection>(int)),
985 ),
986 None | Some(Alternation(_)) | Some(Intersection(_)) => {
987 return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
988 }
989 },
990
991 None => {
992 return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
993 }
994 };
995 self.parser().ignore_whitespace.set(ignore_whitespace);
996 group_concat.span.end = self.pos();
997 self.bump();
998 group.span.end = self.pos();
999 match alt {
1000 Some(Either::Left(mut alt)) => {
1001 alt.span.end = group_concat.span.end;
1002 alt.asts.push(group_concat.into_ast());
1003 group.ast = Box::new(alt.into_ast());
1004 }
1005 Some(Either::Right(mut int)) => {
1006 int.span.end = group_concat.span.end;
1007 int.asts.push(group_concat.into_ast());
1008 group.ast = Box::new(int.into_ast());
1009 }
1010 None => {
1011 group.ast = Box::new(group_concat.into_ast());
1012 }
1013 }
1014
1015 if group.kind == GroupKind::Complement {
1016 let complement = ast::Complement {
1017 span: self.span(),
1018 ast: group.ast,
1019 };
1020 prior_concat.asts.push(Ast::complement(complement));
1021 }
1022 else {
1024 prior_concat.asts.push(Ast::group(group));
1025 }
1026 Ok(prior_concat)
1027 }
1028
1029 #[inline(never)]
1030 fn pop_group_end(&self, mut concat: ast::Concat) -> Result<Ast> {
1031 concat.span.end = self.pos();
1032 let mut stack = self.parser().stack_group.borrow_mut();
1033 let ast = match stack.pop() {
1034 None => Ok(concat.into_ast()),
1035 Some(GroupState::Alternation(mut alt)) => {
1036 alt.span.end = self.pos();
1037 alt.asts.push(concat.into_ast());
1038 Ok(Ast::alternation(alt))
1039 }
1040 Some(GroupState::Intersection(mut int)) => {
1041 int.span.end = self.pos();
1042 int.asts.push(concat.into_ast());
1043
1044 Ok(Ast::intersection(int))
1045 }
1046 Some(GroupState::Group { group, .. }) => {
1047 return Err(self.error(group.span, ast::ErrorKind::GroupUnclosed));
1048 }
1049 };
1050 match stack.pop() {
1052 None => ast,
1053 Some(GroupState::Alternation(alt)) => {
1054 Err(self.error(alt.span, ast::ErrorKind::UnsupportedResharpRegex))
1055 }
1056 Some(GroupState::Intersection(int)) => {
1057 Err(self.error(int.span, ast::ErrorKind::UnsupportedResharpRegex))
1058 }
1059 Some(GroupState::Group { group, .. }) => {
1060 Err(self.error(group.span, ast::ErrorKind::GroupUnclosed))
1061 }
1062 }
1063 }
1064
1065 #[inline(never)]
1066 fn push_class_open(
1067 &self,
1068 parent_union: regex_syntax::ast::ClassSetUnion,
1069 ) -> Result<regex_syntax::ast::ClassSetUnion> {
1070 assert_eq!(self.char(), '[');
1071
1072 let (nested_set, nested_union) = self.parse_set_class_open()?;
1073 self.parser()
1074 .stack_class
1075 .borrow_mut()
1076 .push(ClassState::Open {
1077 union: parent_union,
1078 set: nested_set,
1079 });
1080 Ok(nested_union)
1081 }
1082
1083 #[inline(never)]
1084 fn pop_class(
1085 &self,
1086 nested_union: regex_syntax::ast::ClassSetUnion,
1087 ) -> Result<Either<regex_syntax::ast::ClassSetUnion, regex_syntax::ast::ClassBracketed>> {
1088 assert_eq!(self.char(), ']');
1089
1090 let item = regex_syntax::ast::ClassSet::Item(nested_union.into_item());
1091 let prevset = self.pop_class_op(item);
1092 let mut stack = self.parser().stack_class.borrow_mut();
1093 match stack.pop() {
1094 None => panic!("unexpected empty character class stack"),
1095 Some(ClassState::Op { .. }) => panic!("unexpected ClassState::Op"),
1096 Some(ClassState::Open { mut union, mut set }) => {
1097 self.bump();
1098 set.span.end = self.pos();
1099 set.kind = prevset;
1100 if stack.is_empty() {
1101 Ok(Either::Right(set))
1102 } else {
1103 union.push(regex_syntax::ast::ClassSetItem::Bracketed(Box::new(set)));
1104 Ok(Either::Left(union))
1105 }
1106 }
1107 }
1108 }
1109
1110 #[inline(never)]
1111 fn unclosed_class_error(&self) -> ParseError {
1112 for state in self.parser().stack_class.borrow().iter().rev() {
1113 if let ClassState::Open { ref set, .. } = *state {
1114 return self.error(set.span, ast::ErrorKind::ClassUnclosed);
1115 }
1116 }
1117 panic!("no open character class found")
1118 }
1119
1120 #[inline(never)]
1121 fn push_class_op(
1122 &self,
1123 next_kind: regex_syntax::ast::ClassSetBinaryOpKind,
1124 next_union: regex_syntax::ast::ClassSetUnion,
1125 ) -> regex_syntax::ast::ClassSetUnion {
1126 let item = regex_syntax::ast::ClassSet::Item(next_union.into_item());
1127 let new_lhs = self.pop_class_op(item);
1128 self.parser().stack_class.borrow_mut().push(ClassState::Op {
1129 kind: next_kind,
1130 lhs: new_lhs,
1131 });
1132 regex_syntax::ast::ClassSetUnion {
1133 span: self.span(),
1134 items: vec![],
1135 }
1136 }
1137
1138 #[inline(never)]
1139 fn pop_class_op(&self, rhs: regex_syntax::ast::ClassSet) -> regex_syntax::ast::ClassSet {
1140 let mut stack = self.parser().stack_class.borrow_mut();
1141 let (kind, lhs) = match stack.pop() {
1142 Some(ClassState::Op { kind, lhs }) => (kind, lhs),
1143 Some(state @ ClassState::Open { .. }) => {
1144 stack.push(state);
1145 return rhs;
1146 }
1147 None => unreachable!(),
1148 };
1149 let span = Span::new(lhs.span().start, rhs.span().end);
1150 regex_syntax::ast::ClassSet::BinaryOp(regex_syntax::ast::ClassSetBinaryOp {
1151 span,
1152 kind,
1153 lhs: Box::new(lhs),
1154 rhs: Box::new(rhs),
1155 })
1156 }
1157
1158 fn hir_to_node_id(&self, hir: &hir::Hir, tb: &mut TB<'s>) -> Result<NodeId> {
1159 match hir.kind() {
1160 hir::HirKind::Empty => Ok(NodeId::EPS),
1161 hir::HirKind::Literal(l) => {
1162 if l.0.len() == 1 {
1163 let node = tb.mk_u8(l.0[0]);
1164 Ok(node)
1165 } else {
1166 let ws: Vec<_> = l.0.iter().map(|l| tb.mk_u8(*l)).collect();
1167 let conc = tb.mk_concats(ws.iter().copied());
1168 Ok(conc)
1169 }
1170 }
1171 hir::HirKind::Class(class) => match class {
1172 hir::Class::Unicode(class_unicode) => {
1173 let ranges = class_unicode.ranges();
1174 if ranges.len() == 1
1175 && ranges[0].start() == '\u{0}'
1176 && ranges[0].end() == '\u{10FFFF}'
1177 {
1178 return Ok(tb.mk_range_u8(0, 255));
1179 }
1180 let mut nodes = Vec::new();
1181 for range in ranges {
1182 for seq in Utf8Sequences::new(range.start(), range.end()) {
1183 let sl = seq.as_slice();
1184 let bytes: Vec<_> = sl.iter().map(|s| (s.start, s.end)).collect();
1185 let node = match bytes.len() {
1186 1 => tb.mk_range_u8(bytes[0].0, bytes[0].1),
1187 n => {
1188 let last = tb.mk_range_u8(bytes[n - 1].0, bytes[n - 1].1);
1189 let mut conc = last;
1190 for i in (0..n - 1).rev() {
1191 let b = tb.mk_range_u8(bytes[i].0, bytes[i].1);
1192 conc = tb.mk_concat(b, conc);
1193 }
1194 conc
1195 }
1196 };
1197 nodes.push(node);
1198 }
1199 }
1200 let merged = tb.mk_unions(nodes.into_iter());
1201 Ok(merged)
1202 }
1203 hir::Class::Bytes(class_bytes) => {
1204 let ranges = class_bytes.ranges();
1205 let mut result = NodeId::BOT;
1206 for range in ranges {
1207 let start = range.start();
1208 let end = range.end();
1209 let node = tb.mk_range_u8(start, end);
1210 result = tb.mk_union(result, node);
1211 }
1212 Ok(result)
1213 }
1214 },
1215 hir::HirKind::Look(_) => Err(self.error(
1216 Span::splat(self.pos()),
1217 ast::ErrorKind::UnsupportedResharpRegex,
1218 )),
1219 hir::HirKind::Repetition(_) => Err(self.error(
1220 Span::splat(self.pos()),
1221 ast::ErrorKind::UnsupportedResharpRegex,
1222 )),
1223 hir::HirKind::Capture(_) => Err(self.error(
1224 Span::splat(self.pos()),
1225 ast::ErrorKind::UnsupportedResharpRegex,
1226 )),
1227 hir::HirKind::Concat(body) => {
1228 let mut result = NodeId::EPS;
1229 for child in body {
1230 let node = self.hir_to_node_id(child, tb)?;
1231 result = tb.mk_concat(result, node);
1232 }
1233 Ok(result)
1234 }
1235 hir::HirKind::Alternation(_) => Err(self.error(
1236 Span::splat(self.pos()),
1237 ast::ErrorKind::UnsupportedResharpRegex,
1238 )),
1239 }
1240 }
1241
1242 fn translate_ast_to_hir(
1243 &mut self,
1244 orig_ast: ®ex_syntax::ast::Ast,
1245 tb: &mut TB<'s>,
1246 ) -> Result<NodeId> {
1247 match self.translator.translate("", orig_ast) {
1248 Err(_) => Err(self.error(self.span(), ast::ErrorKind::UnicodeClassInvalid)),
1249 Ok(hir) => self.hir_to_node_id(&hir, tb),
1250 }
1251 }
1252
1253 fn translator_to_node_id(
1254 &mut self,
1255 orig_ast: ®ex_syntax::ast::Ast,
1256 translator: &mut Option<Translator>,
1257 tb: &mut TB<'s>,
1258 ) -> Result<NodeId> {
1259 match translator {
1260 Some(tr) => {
1261 let hir = tr
1262 .translate("", orig_ast)
1263 .map_err(|e| self.unsupported_error(e))?;
1264 self.hir_to_node_id(&hir, tb)
1265 }
1266 None => self.translate_ast_to_hir(orig_ast, tb),
1267 }
1268 }
1269
1270 fn get_class(
1271 &mut self,
1272 negated: bool,
1273 kind: regex_syntax::ast::ClassPerlKind,
1274 tb: &mut TB<'s>,
1275 ) -> Result<NodeId> {
1276 let w = self
1277 .perl_classes
1278 .iter()
1279 .find(|(c_neg, c_kind, _)| *c_kind == kind && *c_neg == negated);
1280 match w {
1281 Some((_, _, value)) => Ok(*value),
1282 None => {
1283 let translated = if self.global_ascii_perl {
1284 let pos = match kind {
1285 regex_syntax::ast::ClassPerlKind::Word => {
1286 let az = tb.mk_range_u8(b'a', b'z');
1287 let big = tb.mk_range_u8(b'A', b'Z');
1288 let dig = tb.mk_range_u8(b'0', b'9');
1289 let us = tb.mk_u8(b'_');
1290 tb.mk_unions([az, big, dig, us].into_iter())
1291 }
1292 regex_syntax::ast::ClassPerlKind::Digit => tb.mk_range_u8(b'0', b'9'),
1293 regex_syntax::ast::ClassPerlKind::Space => {
1294 let sp = tb.mk_u8(b' ');
1295 let tab = tb.mk_u8(b'\t');
1296 let nl = tb.mk_u8(b'\n');
1297 let cr = tb.mk_u8(b'\r');
1298 let ff = tb.mk_u8(0x0C);
1299 let vt = tb.mk_u8(0x0B);
1300 tb.mk_unions([sp, tab, nl, cr, ff, vt].into_iter())
1301 }
1302 };
1303 if negated {
1304 resharp_algebra::neg_class(tb, pos)
1305 } else {
1306 pos
1307 }
1308 } else if self.global_unicode {
1309 match kind {
1310 regex_syntax::ast::ClassPerlKind::Word => {
1311 if self.global_full_unicode {
1312 self.unicode_classes.ensure_word_full(tb);
1313 } else {
1314 self.unicode_classes.ensure_word(tb);
1315 }
1316 if negated {
1317 self.unicode_classes.non_word
1318 } else {
1319 self.unicode_classes.word
1320 }
1321 }
1322 regex_syntax::ast::ClassPerlKind::Digit => {
1323 if self.global_full_unicode {
1324 self.unicode_classes.ensure_digit_full(tb);
1325 } else {
1326 self.unicode_classes.ensure_digit(tb);
1327 }
1328 if negated {
1329 self.unicode_classes.non_digit
1330 } else {
1331 self.unicode_classes.digit
1332 }
1333 }
1334 regex_syntax::ast::ClassPerlKind::Space => {
1335 if self.global_full_unicode {
1336 self.unicode_classes.ensure_space_full(tb);
1337 } else {
1338 self.unicode_classes.ensure_space(tb);
1339 }
1340 if negated {
1341 self.unicode_classes.non_space
1342 } else {
1343 self.unicode_classes.space
1344 }
1345 }
1346 }
1347 } else {
1348 let pos = match kind {
1349 regex_syntax::ast::ClassPerlKind::Word => {
1350 let az = tb.mk_range_u8(b'a', b'z');
1351 let big = tb.mk_range_u8(b'A', b'Z');
1352 let dig = tb.mk_range_u8(b'0', b'9');
1353 let us = tb.mk_u8(b'_');
1354 tb.mk_unions([az, big, dig, us].into_iter())
1355 }
1356 regex_syntax::ast::ClassPerlKind::Digit => tb.mk_range_u8(b'0', b'9'),
1357 regex_syntax::ast::ClassPerlKind::Space => {
1358 let sp = tb.mk_u8(b' ');
1359 let tab = tb.mk_u8(b'\t');
1360 let nl = tb.mk_u8(b'\n');
1361 let cr = tb.mk_u8(b'\r');
1362 let ff = tb.mk_u8(0x0C);
1363 let vt = tb.mk_u8(0x0B);
1364 tb.mk_unions([sp, tab, nl, cr, ff, vt].into_iter())
1365 }
1366 };
1367 if negated {
1368 tb.mk_compl(pos)
1369 } else {
1370 pos
1371 }
1372 };
1373 self.perl_classes.push((negated, kind, translated));
1374 Ok(translated)
1375 }
1376 }
1377 }
1378
1379 fn word_char_kind(ast: &Ast, left: bool) -> WordCharKind {
1380 use WordCharKind::*;
1381 match ast {
1382 Ast::Literal(lit) => {
1383 if is_word_byte(lit.c as u8) {
1384 Word
1385 } else {
1386 NonWord
1387 }
1388 }
1389 Ast::ClassPerl(c) => match (&c.kind, c.negated) {
1390 (®ex_syntax::ast::ClassPerlKind::Word, false) => Word,
1391 (®ex_syntax::ast::ClassPerlKind::Word, true) => NonWord,
1392 (®ex_syntax::ast::ClassPerlKind::Space, false) => NonWord,
1393 (®ex_syntax::ast::ClassPerlKind::Space, true) => Unknown,
1394 (®ex_syntax::ast::ClassPerlKind::Digit, false) => Word,
1395 (®ex_syntax::ast::ClassPerlKind::Digit, true) => Unknown,
1396 },
1397 Ast::ClassBracketed(c) => class_bracketed_word_kind(c),
1398 Ast::Dot(_) | Ast::Top(_) => Unknown,
1399 Ast::Group(g) => Self::word_char_kind(&g.ast, left),
1400 Ast::Concat(c) if !c.asts.is_empty() => {
1401 let edge = if left { c.asts.len() - 1 } else { 0 };
1402 let kind = Self::word_char_kind(&c.asts[edge], left);
1403 match kind {
1404 MaybeWord => {
1405 let dir: isize = if left { -1 } else { 1 };
1406 match Self::concat_neighbor_kind(&c.asts, edge, dir) {
1407 Word => Word,
1408 _ => MaybeWord,
1409 }
1410 }
1411 MaybeNonWord => {
1412 let dir: isize = if left { -1 } else { 1 };
1413 match Self::concat_neighbor_kind(&c.asts, edge, dir) {
1414 NonWord => NonWord,
1415 _ => MaybeNonWord,
1416 }
1417 }
1418 other => other,
1419 }
1420 }
1421 Ast::Alternation(alt) if !alt.asts.is_empty() => {
1422 let first = Self::word_char_kind(&alt.asts[0], left);
1423 if alt.asts[1..]
1424 .iter()
1425 .all(|a| Self::word_char_kind(a, left) == first)
1426 {
1427 first
1428 } else {
1429 Unknown
1430 }
1431 }
1432 Ast::Repetition(r) => {
1433 let inner = Self::word_char_kind(&r.ast, left);
1434 let nullable = matches!(
1435 &r.op.kind,
1436 ast::RepetitionKind::ZeroOrMore
1437 | ast::RepetitionKind::ZeroOrOne
1438 | ast::RepetitionKind::Range(ast::RepetitionRange::Bounded(0, _))
1439 );
1440 if nullable {
1441 match inner {
1442 Word => MaybeWord,
1443 NonWord => MaybeNonWord,
1444 _ => Unknown,
1445 }
1446 } else {
1447 inner
1448 }
1449 }
1450 Ast::Lookaround(la) => match la.kind {
1451 ast::LookaroundKind::PositiveLookahead
1452 | ast::LookaroundKind::PositiveLookbehind => Self::word_char_kind(&la.ast, left),
1453 _ => Unknown,
1454 },
1455 _ => Unknown,
1456 }
1457 }
1458
1459 fn edge_class_ast(ast: &Ast, left: bool) -> Option<&Ast> {
1461 match ast {
1462 Ast::Literal(_)
1463 | Ast::ClassPerl(_)
1464 | Ast::ClassBracketed(_)
1465 | Ast::ClassUnicode(_)
1466 | Ast::Dot(_)
1467 | Ast::Top(_) => Some(ast),
1468 Ast::Group(g) => Self::edge_class_ast(&g.ast, left),
1469 Ast::Concat(c) if !c.asts.is_empty() => {
1470 Self::edge_class_ast(&c.asts[if left { c.asts.len() - 1 } else { 0 }], left)
1471 }
1472 Ast::Repetition(r) => {
1473 let nullable = matches!(
1474 &r.op.kind,
1475 ast::RepetitionKind::ZeroOrMore
1476 | ast::RepetitionKind::ZeroOrOne
1477 | ast::RepetitionKind::Range(ast::RepetitionRange::Bounded(0, _))
1478 );
1479 if nullable {
1480 None
1481 } else {
1482 Self::edge_class_ast(&r.ast, left)
1483 }
1484 }
1485 _ => None,
1486 }
1487 }
1488
1489 fn resolve_word_kind(
1490 &mut self,
1491 asts: &[Ast],
1492 idx: usize,
1493 dir: isize,
1494 translator: &mut Option<Translator>,
1495 tb: &mut TB<'s>,
1496 word_id: NodeId,
1497 not_word_id: NodeId,
1498 ) -> Result<WordCharKind> {
1499 use WordCharKind::*;
1500 let fast = Self::concat_neighbor_kind(asts, idx, dir);
1501 if fast != Unknown {
1502 return Ok(fast);
1503 }
1504 let neighbor_idx = (idx as isize + dir) as usize;
1505 let node = if let Some(edge) = Self::edge_class_ast(&asts[neighbor_idx], dir < 0) {
1506 self.ast_to_node_id(edge, translator, tb)?
1507 } else if dir > 0 {
1508 let mut bodies: Vec<NodeId> = vec![];
1509 let mut j = neighbor_idx;
1510 while j < asts.len() {
1511 match &asts[j] {
1512 Ast::Lookaround(la) => {
1513 let kind = la.kind.clone();
1514 let lookbehind = matches!(
1515 kind,
1516 ast::LookaroundKind::PositiveLookbehind
1517 | ast::LookaroundKind::NegativeLookbehind
1518 );
1519 if lookbehind {
1520 j += 1;
1521 continue;
1522 }
1523 let body = self.ast_to_node_id(&la.ast, translator, tb)?;
1524 let body = tb.try_elim_lookarounds(body).ok_or_else(|| {
1525 self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex)
1526 })?;
1527 let body_ts = tb.mk_concat(body, NodeId::TS);
1528 let constraint = match kind {
1529 ast::LookaroundKind::PositiveLookahead => body_ts,
1530 ast::LookaroundKind::NegativeLookahead => tb.mk_compl(body_ts),
1531 _ => unreachable!(),
1532 };
1533 bodies.push(constraint);
1534 j += 1;
1535 }
1536 other => {
1537 let n = self.ast_to_node_id(other, translator, tb)?;
1538 let n = tb.try_elim_lookarounds(n).ok_or_else(|| {
1539 self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex)
1540 })?;
1541 bodies.push(tb.mk_concat(n, NodeId::TS));
1542 break;
1543 }
1544 }
1545 }
1546 if bodies.is_empty() {
1547 return Ok(Unknown);
1548 }
1549 let combined = tb.mk_inters(bodies.into_iter());
1550 let word_prefix = tb.mk_concat(word_id, NodeId::TS);
1551 let non_word_prefix = tb.mk_concat(not_word_id, NodeId::TS);
1552 return if tb.subsumes(word_prefix, combined) == Some(true) {
1553 Ok(Word)
1554 } else if tb.subsumes(non_word_prefix, combined) == Some(true) {
1555 Ok(NonWord)
1556 } else {
1557 Ok(Unknown)
1558 };
1559 } else {
1560 let neighbor_node = self.ast_to_node_id(&asts[neighbor_idx], translator, tb)?;
1561 let mut neighbor_node = tb
1562 .try_elim_lookarounds(neighbor_node)
1563 .ok_or_else(|| self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))?;
1564 neighbor_node = tb.reverse(neighbor_node).or_else(|_| {
1565 Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))
1566 })?;
1567 let word_prefix = tb.mk_concat(word_id, NodeId::TS);
1568 let non_word_prefix = tb.mk_concat(not_word_id, NodeId::TS);
1569 return if tb.subsumes(word_prefix, neighbor_node) == Some(true) {
1570 Ok(Word)
1571 } else if tb.subsumes(non_word_prefix, neighbor_node) == Some(true) {
1572 Ok(NonWord)
1573 } else {
1574 Ok(Unknown)
1575 };
1576 };
1577 if tb.subsumes(word_id, node) == Some(true) {
1578 Ok(Word)
1579 } else if tb.subsumes(not_word_id, node) == Some(true) {
1580 Ok(NonWord)
1581 } else {
1582 Ok(Unknown)
1583 }
1584 }
1585
1586 fn is_transparent_for_dir(ast: &Ast, dir: isize) -> bool {
1587 match ast {
1588 Ast::Lookaround(la) => match la.kind {
1589 ast::LookaroundKind::PositiveLookahead | ast::LookaroundKind::NegativeLookahead => {
1590 dir < 0
1591 }
1592 ast::LookaroundKind::PositiveLookbehind
1593 | ast::LookaroundKind::NegativeLookbehind => dir > 0,
1594 },
1595 _ => false,
1596 }
1597 }
1598
1599 fn concat_neighbor_kind(asts: &[Ast], idx: usize, dir: isize) -> WordCharKind {
1600 use WordCharKind::*;
1601 let next = idx as isize + dir;
1602 if next < 0 || next >= asts.len() as isize {
1603 return Edge;
1604 }
1605 if Self::is_transparent_for_dir(&asts[next as usize], dir) {
1606 return Self::concat_neighbor_kind(asts, next as usize, dir);
1607 }
1608 let kind = Self::word_char_kind(&asts[next as usize], dir < 0);
1609 match kind {
1610 MaybeWord => match Self::concat_neighbor_kind(asts, next as usize, dir) {
1611 Word => Word,
1612 _ => Unknown,
1613 },
1614 MaybeNonWord => match Self::concat_neighbor_kind(asts, next as usize, dir) {
1615 NonWord => NonWord,
1616 _ => Unknown,
1617 },
1618 other => other,
1619 }
1620 }
1621
1622 fn specialize_word_boundaries(
1623 &mut self,
1624 children: &mut [NodeId],
1625 tb: &mut TB<'s>,
1626 ) -> Result<()> {
1627 let wb = self.unicode_classes.wb;
1628 let non_wb = self.unicode_classes.non_wb;
1629 if wb == NodeId::MISSING {
1630 return Ok(());
1631 }
1632 let word = self.unicode_classes.word;
1633 let non_word = self.unicode_classes.non_word;
1634 if word == NodeId::MISSING {
1635 return Ok(());
1636 }
1637 let word_pref = tb.mk_concat(word, NodeId::TS);
1640 let non_word_pref = tb.mk_concat(non_word, NodeId::TS);
1641 let word_suf = tb.mk_concat(NodeId::TS, word);
1642 let non_word_suf = tb.mk_concat(NodeId::TS, non_word);
1643 let len = children.len();
1644 for k in 0..len {
1645 let l = if k == 0 {
1646 WordCharKind::Edge
1647 } else {
1648 Self::classify(tb, children[k - 1], word_suf, non_word_suf)
1649 };
1650 let r = if k + 1 >= len {
1651 WordCharKind::Edge
1652 } else {
1653 Self::classify(tb, children[k + 1], word_pref, non_word_pref)
1654 };
1655 children[k] = Self::rewrite_wb_in_node(tb, children[k], wb, non_wb, word, l, r)
1656 .ok_or_else(|| self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))?;
1657 }
1658 Ok(())
1659 }
1660
1661 fn rewrite_wb_in_node(
1662 b: &mut TB<'s>,
1663 node: NodeId,
1664 wb: NodeId,
1665 non_wb: NodeId,
1666 word: NodeId,
1667 left: WordCharKind,
1668 right: WordCharKind,
1669 ) -> Option<NodeId> {
1670 let boundary_match = if node == wb {
1671 true
1672 } else if node == non_wb {
1673 false
1674 } else if b.get_kind(node) == Kind::Union {
1675 let l = Self::rewrite_wb_in_node(b, node.left(b), wb, non_wb, word, left, right)?;
1676 let r = Self::rewrite_wb_in_node(b, node.right(b), wb, non_wb, word, left, right)?;
1677 return Some(b.mk_union(l, r));
1678 } else {
1679 return Some(node);
1680 };
1681 use WordCharKind::*;
1682 let result = match (left, right) {
1683 (NonWord, Word) | (Word, NonWord) => {
1684 if boundary_match {
1685 NodeId::EPS
1686 } else {
1687 NodeId::BOT
1688 }
1689 }
1690 (Word, Word) | (NonWord, NonWord) => {
1691 if boundary_match {
1692 NodeId::BOT
1693 } else {
1694 NodeId::EPS
1695 }
1696 }
1697 (Word, _) => {
1698 if boundary_match {
1699 b.mk_neg_lookahead(word, 0)
1700 } else {
1701 let tail = b.mk_concat(word, NodeId::TS);
1702 b.mk_lookahead(tail, NodeId::MISSING, 0)
1703 }
1704 }
1705 (NonWord, _) => {
1706 if boundary_match {
1707 let tail = b.mk_concat(word, NodeId::TS);
1708 b.mk_lookahead(tail, NodeId::MISSING, 0)
1709 } else {
1710 b.mk_neg_lookahead(word, 0)
1711 }
1712 }
1713 (_, Word) => {
1714 if boundary_match {
1715 b.mk_neg_lookbehind(word)
1716 } else {
1717 b.mk_lookbehind(word, NodeId::MISSING)
1718 }
1719 }
1720 (_, NonWord) => {
1721 if boundary_match {
1722 b.mk_lookbehind(word, NodeId::MISSING)
1723 } else {
1724 b.mk_neg_lookbehind(word)
1725 }
1726 }
1727 _ => return None,
1728 };
1729 Some(result)
1730 }
1731
1732 fn classify(
1733 b: &mut TB<'s>,
1734 node: NodeId,
1735 word_dir: NodeId,
1736 non_word_dir: NodeId,
1737 ) -> WordCharKind {
1738 if b.contains_look(node) || b.contains_anchors(node) {
1739 return WordCharKind::Unknown;
1740 }
1741 if b.subsumes(word_dir, node) == Some(true) {
1742 WordCharKind::Word
1743 } else if b.subsumes(non_word_dir, node) == Some(true) {
1744 WordCharKind::NonWord
1745 } else {
1746 WordCharKind::Unknown
1747 }
1748 }
1749
1750 fn rewrite_word_boundary_in_concat(
1751 &mut self,
1752 asts: &[Ast],
1753 idx: usize,
1754 translator: &mut Option<Translator>,
1755 tb: &mut TB<'s>,
1756 negated: bool,
1757 ) -> Result<(NodeId, usize)> {
1758 use WordCharKind::*;
1759 if self.global_full_unicode {
1760 self.unicode_classes.ensure_word_full(tb);
1761 } else if self.global_unicode && !self.global_ascii_perl {
1762 self.unicode_classes.ensure_word(tb);
1763 } else {
1764 self.unicode_classes.ensure_word_ascii(tb);
1765 }
1766 let word_id = self.unicode_classes.word;
1767 let not_word_id = self.unicode_classes.non_word;
1768 let left = self.resolve_word_kind(asts, idx, -1, translator, tb, word_id, not_word_id)?;
1769 let right = self.resolve_word_kind(asts, idx, 1, translator, tb, word_id, not_word_id)?;
1770 let boundary_match = !negated;
1771 match (left, right) {
1772 (NonWord, Word) | (Word, NonWord) => Ok((
1773 if boundary_match {
1774 NodeId::EPS
1775 } else {
1776 NodeId::BOT
1777 },
1778 idx + 1,
1779 )),
1780 (Word, Word) | (NonWord, NonWord) => Ok((
1781 if boundary_match {
1782 NodeId::BOT
1783 } else {
1784 NodeId::EPS
1785 },
1786 idx + 1,
1787 )),
1788 (Word, _) => {
1789 if boundary_match {
1790 Ok((tb.mk_neg_lookahead(word_id, 0), idx + 1))
1791 } else {
1792 let tail = tb.mk_concat(word_id, NodeId::TS);
1793 self.merge_boundary_with_following_lookaheads(asts, idx, tail, translator, tb)
1794 }
1795 }
1796 (NonWord, _) => {
1797 if boundary_match {
1798 let tail = tb.mk_concat(word_id, NodeId::TS);
1799 self.merge_boundary_with_following_lookaheads(asts, idx, tail, translator, tb)
1800 } else {
1801 Ok((tb.mk_neg_lookahead(word_id, 0), idx + 1))
1802 }
1803 }
1804 (_, Word) => {
1805 if boundary_match {
1806 Ok((tb.mk_neg_lookbehind(word_id), idx + 1))
1807 } else {
1808 Ok((tb.mk_lookbehind(word_id, NodeId::MISSING), idx + 1))
1809 }
1810 }
1811 (_, NonWord) => {
1812 if boundary_match {
1813 Ok((tb.mk_lookbehind(word_id, NodeId::MISSING), idx + 1))
1814 } else {
1815 Ok((tb.mk_neg_lookbehind(word_id), idx + 1))
1816 }
1817 }
1818 _ => {
1822 self.unicode_classes.ensure_wb(tb);
1823 let node = if boundary_match {
1824 self.unicode_classes.wb
1825 } else {
1826 self.unicode_classes.non_wb
1827 };
1828 Ok((node, idx + 1))
1829 }
1830 }
1831 }
1832
1833 fn merge_boundary_with_following_lookaheads(
1834 &mut self,
1835 asts: &[Ast],
1836 wb_idx: usize,
1837 boundary_tail: NodeId,
1838 translator: &mut Option<Translator>,
1839 tb: &mut TB<'s>,
1840 ) -> Result<(NodeId, usize)> {
1841 let mut next = wb_idx + 1;
1842 let mut la_bodies = vec![boundary_tail];
1843 while next < asts.len() {
1844 match &asts[next] {
1845 Ast::Lookaround(la) if la.kind == ast::LookaroundKind::PositiveLookahead => {
1846 let body = self.ast_to_node_id(&la.ast, translator, tb)?;
1847 la_bodies.push(tb.mk_concat(body, NodeId::TS));
1848 next += 1;
1849 }
1850 _ => break,
1851 }
1852 }
1853 let merged = tb.mk_inters(la_bodies.into_iter());
1854 Ok((tb.mk_lookahead(merged, NodeId::MISSING, 0), next))
1855 }
1856
1857 fn ast_to_node_id(
1858 &mut self,
1859 ast: &Ast,
1860 translator: &mut Option<Translator>,
1861 tb: &mut TB<'s>,
1862 ) -> Result<NodeId> {
1863 match ast {
1864 Ast::Empty(_) => Ok(NodeId::EPS),
1865 Ast::Flags(f) => {
1866 if f.flags.flag_state(ast::Flag::SwapGreed).is_some() {
1867 return Err(self.error(f.span, ast::ErrorKind::UnsupportedResharpRegex));
1868 }
1869 let mut translator_builder = self.default_translator_builder();
1870 if let Some(state) = f.flags.flag_state(ast::Flag::CaseInsensitive) {
1871 translator_builder.case_insensitive(state);
1872 }
1873 if let Some(state) = f.flags.flag_state(ast::Flag::Unicode) {
1874 translator_builder.unicode(state);
1875 }
1876 if let Some(state) = f.flags.flag_state(ast::Flag::DotMatchesNewLine) {
1877 self.dot_all.set(state);
1878 }
1879 if let Some(state) = f.flags.flag_state(ast::Flag::MultiLine) {
1880 self.multiline.set(state);
1881 }
1882 let concat_translator = Some(translator_builder.build());
1883 *translator = concat_translator;
1884 Ok(NodeId::EPS)
1885 }
1886 Ast::Literal(l) => {
1887 let ast_lit = regex_syntax::ast::Ast::literal(*l.to_owned());
1888 self.translator_to_node_id(&ast_lit, translator, tb)
1889 }
1890 Ast::Top(_) => Ok(NodeId::TOP),
1891 Ast::Dot(_) => {
1892 let codepoint_dot = self.global_ascii_perl || self.global_full_unicode;
1893 let hirv = match (codepoint_dot, self.dot_all.get()) {
1894 (true, true) => hir::Hir::dot(hir::Dot::AnyChar),
1895 (true, false) => hir::Hir::dot(hir::Dot::AnyCharExceptLF),
1896 (false, true) => return Ok(NodeId::TOP),
1897 (false, false) => hir::Hir::dot(hir::Dot::AnyByteExceptLF),
1898 };
1899 self.hir_to_node_id(&hirv, tb)
1900 }
1901 Ast::Assertion(a) => match &a.kind {
1902 ast::AssertionKind::StartText => Ok(NodeId::BEGIN),
1903 ast::AssertionKind::EndText => Ok(NodeId::END),
1904 ast::AssertionKind::WordBoundary => {
1905 let only = Ast::Assertion(a.clone());
1906 let asts = std::slice::from_ref(&only);
1907 let (node, _) =
1908 self.rewrite_word_boundary_in_concat(asts, 0, translator, tb, false)?;
1909 Ok(node)
1910 }
1911 ast::AssertionKind::NotWordBoundary => {
1912 let only = Ast::Assertion(a.clone());
1914 let asts = std::slice::from_ref(&only);
1915 let (node, _) =
1916 self.rewrite_word_boundary_in_concat(asts, 0, translator, tb, true)?;
1917 Ok(node)
1918 }
1919 ast::AssertionKind::StartLine => {
1920 if !self.multiline.get() {
1921 return Ok(NodeId::BEGIN);
1922 }
1923 let left = NodeId::BEGIN;
1924 let right = tb.mk_u8(b'\n');
1925 let union = tb.mk_union(left, right);
1926 Ok(tb.mk_lookbehind(union, NodeId::MISSING))
1927 }
1928 ast::AssertionKind::EndLine => {
1929 if !self.multiline.get() {
1930 return Ok(NodeId::END);
1931 }
1932 let left = NodeId::END;
1933 let right = tb.mk_u8(b'\n');
1934 let union = tb.mk_union(left, right);
1935 Ok(tb.mk_lookahead(union, NodeId::MISSING, 0))
1936 }
1937 ast::AssertionKind::WordBoundaryStart => {
1938 Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
1939 }
1940 ast::AssertionKind::WordBoundaryEnd => {
1941 Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
1942 }
1943 ast::AssertionKind::WordBoundaryStartAngle => {
1944 Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
1945 }
1946 ast::AssertionKind::WordBoundaryEndAngle => {
1947 Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
1948 }
1949 ast::AssertionKind::WordBoundaryStartHalf => {
1950 Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
1951 }
1952 ast::AssertionKind::WordBoundaryEndHalf => {
1953 Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
1954 }
1955 },
1956 Ast::ClassUnicode(c) => {
1957 let tmp = regex_syntax::ast::ClassUnicode {
1958 span: c.span,
1959 negated: c.negated,
1960 kind: c.kind.clone(),
1961 };
1962 if !c.negated {
1963 if let regex_syntax::ast::ClassUnicodeKind::Named(s) = &c.kind {
1964 match s.as_str() {
1965 "ascii" => return Ok(tb.mk_range_u8(0, 127)),
1967 "utf8" => {
1969 let ascii = tb.mk_range_u8(0, 127);
1970 let beta = tb.mk_range_u8(128, 0xBF);
1971 let c0 = tb.mk_range_u8(0xC0, 0xDF);
1972 let c0s = tb.mk_concats([c0, beta].into_iter());
1973 let e0 = tb.mk_range_u8(0xE0, 0xEF);
1974 let e0s = tb.mk_concats([e0, beta, beta].into_iter());
1975 let f0 = tb.mk_range_u8(0xF0, 0xF7);
1976 let f0s = tb.mk_concats([f0, beta, beta, beta].into_iter());
1977 return Ok(tb.mk_unions([ascii, c0s, e0s, f0s].into_iter()));
1978 }
1979 "hex" => {
1980 let nums = tb.mk_range_u8(b'0', b'9');
1981 let lets = tb.mk_range_u8(b'a', b'f');
1982 let lets2 = tb.mk_range_u8(b'A', b'F');
1983 let merged = tb.mk_unions([nums, lets, lets2].into_iter());
1984 return Ok(merged);
1985 }
1986 _ => {}
1987 }
1988 };
1989 }
1990
1991 let orig_ast = regex_syntax::ast::Ast::class_unicode(tmp);
1992 self.translator_to_node_id(&orig_ast, translator, tb)
1993 }
1994 Ast::ClassPerl(c) => self.get_class(c.negated, c.kind.clone(), tb),
1995 Ast::ClassBracketed(c) => match &c.kind {
1996 regex_syntax::ast::ClassSet::Item(item) => {
1997 if !c.negated && is_universal_perl_pair(item) {
1998 return Ok(NodeId::TOP);
1999 }
2000 let kind = if self.global_ascii_perl {
2001 rewrite_ascii_perl_set(&c.kind)
2002 } else {
2003 c.kind.clone()
2004 };
2005 let tmp = regex_syntax::ast::ClassBracketed {
2006 span: c.span,
2007 negated: c.negated,
2008 kind,
2009 };
2010 let orig_ast = regex_syntax::ast::Ast::class_bracketed(tmp);
2011 self.translator_to_node_id(&orig_ast, translator, tb)
2012 }
2013 regex_syntax::ast::ClassSet::BinaryOp(_) => {
2014 Err(self.error(c.span, ast::ErrorKind::UnsupportedResharpRegex))
2015 }
2016 },
2017 Ast::Repetition(r) => {
2018 let body = self.ast_to_node_id(&r.ast, translator, tb);
2019 match body {
2020 Ok(body) => match &r.op.kind {
2021 ast::RepetitionKind::ZeroOrOne => Ok(tb.mk_opt(body)),
2022 ast::RepetitionKind::ZeroOrMore => Ok(tb.mk_star(body)),
2023 ast::RepetitionKind::OneOrMore => Ok(tb.mk_plus(body)),
2024 ast::RepetitionKind::Range(r) => match r {
2025 ast::RepetitionRange::Exactly(n) => Ok(tb.mk_repeat(body, *n, *n)),
2026 ast::RepetitionRange::AtLeast(n) => {
2027 let rep = tb.mk_repeat(body, *n, *n);
2028 let st = tb.mk_star(body);
2029 Ok(tb.mk_concat(rep, st))
2030 }
2031
2032 ast::RepetitionRange::Bounded(n, m) => Ok(tb.mk_repeat(body, *n, *m)),
2033 },
2034 },
2035 Err(_) => body,
2036 }
2037 }
2038 Ast::Lookaround(g) => {
2039 let body = self.ast_to_node_id(&g.ast, translator, tb)?;
2040 match g.kind {
2041 ast::LookaroundKind::PositiveLookahead if body.contains_lookbehind(tb) => {
2042 let mut prefix = NodeId::EPS;
2043 let mut rest = body;
2044 while tb.get_kind(rest) == Kind::Concat
2045 && tb.get_kind(rest.left(tb)) == Kind::Lookbehind
2046 {
2047 prefix = tb.mk_concat(prefix, rest.left(tb));
2048 rest = rest.right(tb);
2049 }
2050 if prefix == NodeId::EPS || rest.contains_lookbehind(tb) {
2051 return Err(self.error(g.span, ast::ErrorKind::UnsupportedResharpRegex));
2052 }
2053 let la = tb.mk_lookahead(rest, NodeId::MISSING, 0);
2054 Ok(tb.mk_concat(prefix, la))
2055 }
2056 ast::LookaroundKind::NegativeLookahead if body.contains_lookbehind(tb) => {
2057 Err(self.error(g.span, ast::ErrorKind::UnsupportedResharpRegex))
2058 }
2059 ast::LookaroundKind::PositiveLookahead => {
2060 Ok(tb.mk_lookahead(body, NodeId::MISSING, 0))
2061 }
2062 ast::LookaroundKind::PositiveLookbehind => {
2063 Ok(tb.mk_lookbehind(body, NodeId::MISSING))
2064 }
2065 ast::LookaroundKind::NegativeLookahead => Ok(tb.mk_neg_lookahead(body, 0)),
2066 ast::LookaroundKind::NegativeLookbehind => Ok(tb.mk_neg_lookbehind(body)),
2067 }
2068 }
2069 Ast::Group(g) => {
2070 if let ast::GroupKind::NonCapturing(ref flags) = g.kind {
2071 if !flags.items.is_empty() {
2072 let mut translator_builder = self.default_translator_builder();
2073 if let Some(state) = flags.flag_state(ast::Flag::CaseInsensitive) {
2074 translator_builder.case_insensitive(state);
2075 }
2076 if let Some(state) = flags.flag_state(ast::Flag::Unicode) {
2077 translator_builder.unicode(state);
2078 }
2079 let saved_dot_all = self.dot_all.get();
2080 if let Some(state) = flags.flag_state(ast::Flag::DotMatchesNewLine) {
2081 self.dot_all.set(state);
2082 }
2083 let saved_multiline = self.multiline.get();
2084 if let Some(state) = flags.flag_state(ast::Flag::MultiLine) {
2085 self.multiline.set(state);
2086 }
2087 let mut scoped = Some(translator_builder.build());
2088 let result = self.ast_to_node_id(&g.ast, &mut scoped, tb);
2089 self.dot_all.set(saved_dot_all);
2090 self.multiline.set(saved_multiline);
2091 return result;
2092 }
2093 }
2094 self.ast_to_node_id(&g.ast, translator, tb)
2095 }
2096 Ast::Alternation(a) => {
2097 let mut children = vec![];
2098 for ast in &a.asts {
2099 match self.ast_to_node_id(ast, translator, tb) {
2100 Ok(node_id) => children.push(node_id),
2101 Err(err) => return Err(err),
2102 }
2103 }
2104 Ok(tb.mk_unions(children.iter().copied()))
2105 }
2106 Ast::Concat(c) => {
2107 let mut concat_translator: Option<Translator> = None;
2108 let mut children = vec![];
2109 let mut prev_boundary_child: Option<usize> = None;
2110 let mut i = 0;
2111 while i < c.asts.len() {
2112 let ast = &c.asts[i];
2113 match ast {
2114 Ast::Flags(f) => {
2115 if f.flags.flag_state(ast::Flag::SwapGreed).is_some() {
2116 return Err(
2117 self.error(f.span, ast::ErrorKind::UnsupportedResharpRegex)
2118 );
2119 }
2120 let mut translator_builder = self.default_translator_builder();
2121 if let Some(state) = f.flags.flag_state(ast::Flag::CaseInsensitive) {
2122 translator_builder.case_insensitive(state);
2123 }
2124 if let Some(state) = f.flags.flag_state(ast::Flag::Unicode) {
2125 translator_builder.unicode(state);
2126 }
2127 if let Some(state) = f.flags.flag_state(ast::Flag::DotMatchesNewLine) {
2128 self.dot_all.set(state);
2129 }
2130 if let Some(state) = f.flags.flag_state(ast::Flag::MultiLine) {
2131 self.multiline.set(state);
2132 }
2133 concat_translator = Some(translator_builder.build());
2134 *translator = concat_translator.clone();
2135 i += 1;
2136 continue;
2137 }
2138 Ast::Assertion(a)
2139 if a.kind == ast::AssertionKind::WordBoundary
2140 || a.kind == ast::AssertionKind::NotWordBoundary =>
2141 {
2142 let negated = a.kind == ast::AssertionKind::NotWordBoundary;
2143 let node = self.rewrite_word_boundary_in_concat(
2144 &c.asts, i, translator, tb, negated,
2145 )?;
2146 match prev_boundary_child {
2147 Some(idx) => children[idx] = tb.mk_inter(children[idx], node.0),
2148 None => {
2149 children.push(node.0);
2150 prev_boundary_child = Some(children.len() - 1);
2151 }
2152 }
2153 i = node.1; continue;
2155 }
2156 _ => {}
2157 }
2158 prev_boundary_child = None;
2159 match concat_translator {
2160 Some(_) => match self.ast_to_node_id(ast, &mut concat_translator, tb) {
2161 Ok(node_id) => children.push(node_id),
2162 Err(err) => return Err(err),
2163 },
2164 None => match self.ast_to_node_id(ast, translator, tb) {
2165 Ok(node_id) => children.push(node_id),
2166 Err(err) => return Err(err),
2167 },
2168 }
2169 i += 1;
2170 }
2171 self.specialize_word_boundaries(&mut children, tb)?;
2172 Ok(tb.mk_concats(children.iter().cloned()))
2173 }
2174 Ast::Intersection(intersection) => {
2175 let mut children = vec![];
2176 for ast in &intersection.asts {
2177 match self.ast_to_node_id(ast, translator, tb) {
2178 Ok(node_id) => children.push(node_id),
2179 Err(err) => return Err(err),
2180 }
2181 }
2182 Ok(tb.mk_inters(children.into_iter()))
2183 }
2184 Ast::Complement(complement) => {
2185 let body = self.ast_to_node_id(&complement.ast, translator, tb);
2186 body.map(|x| tb.mk_compl(x))
2187 }
2188 }
2189 }
2190
2191 fn parse_inner(&mut self) -> Result<Ast> {
2192 let mut concat = Concat {
2193 span: self.span(),
2194 asts: vec![],
2195 };
2196 loop {
2197 self.bump_space();
2198 if self.is_eof() {
2199 break;
2200 }
2201 match self.char() {
2202 '(' => concat = self.push_group(concat)?,
2203 ')' => concat = self.pop_group(concat)?,
2204 '|' => concat = self.push_alternate(concat)?,
2205 '&' => concat = self.push_intersect(concat)?,
2206 '~' => concat = self.push_compl_group(concat)?,
2207 '[' => {
2208 let class = self.parse_set_class()?;
2209 concat.asts.push(Ast::class_bracketed(class));
2210 }
2211 '?' => {
2212 concat =
2213 self.parse_uncounted_repetition(concat, ast::RepetitionKind::ZeroOrOne)?;
2214 }
2215 '*' => {
2216 concat =
2217 self.parse_uncounted_repetition(concat, ast::RepetitionKind::ZeroOrMore)?;
2218 }
2219 '+' => {
2220 concat =
2221 self.parse_uncounted_repetition(concat, ast::RepetitionKind::OneOrMore)?;
2222 }
2223 '{' => {
2224 concat = self.parse_counted_repetition(concat)?;
2225 }
2226 _ => concat.asts.push(self.parse_primitive()?.into_ast()),
2227 }
2228 }
2229 let ast = self.pop_group_end(concat)?;
2230 if expanded_ast_size(&ast, self.expanded_ast_limit) >= self.expanded_ast_limit
2231 || max_concat_length(&ast) >= self.max_list_len
2232 {
2233 return Err(self.error(*ast.span(), ast::ErrorKind::UnsupportedResharpRegex));
2234 }
2235 Ok(ast)
2236 }
2237
2238 fn parse(&mut self, tb: &mut TB<'s>) -> Result<NodeId> {
2239 let ast = self.parse_inner()?;
2240 if let Err(span) = ensure_lookbehind_at_start(&ast, true) {
2241 return Err(self.error(span, ast::ErrorKind::UnsupportedResharpRegex));
2242 }
2243 self.ast_to_node_id(&ast, &mut None, tb)
2244 }
2245
2246 #[inline(never)]
2247 fn parse_uncounted_repetition(
2248 &self,
2249 mut concat: ast::Concat,
2250 kind: ast::RepetitionKind,
2251 ) -> Result<ast::Concat> {
2252 let op_start = self.pos();
2254 let ast = match concat.asts.pop() {
2255 Some(ast) => ast,
2256 None => return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing)),
2257 };
2258 match ast {
2259 Ast::Empty(_) | Ast::Flags(_) => {
2260 return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing))
2261 }
2262 _ => {}
2263 }
2264 if self.bump() && self.char() == '?' {
2265 return Err(self.error(
2266 Span::new(op_start, self.pos()),
2267 ast::ErrorKind::UnsupportedLazyQuantifier,
2268 ));
2269 }
2270 concat.asts.push(Ast::repetition(ast::Repetition {
2271 span: ast.span().with_end(self.pos()),
2272 op: ast::RepetitionOp {
2273 span: Span::new(op_start, self.pos()),
2274 kind,
2275 },
2276 greedy: true,
2277 ast: Box::new(ast),
2278 }));
2279 Ok(concat)
2280 }
2281
2282 #[inline(never)]
2283 fn parse_counted_repetition(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
2284 assert!(self.char() == '{');
2285 let start = self.pos();
2286 let ast = match concat.asts.pop() {
2287 Some(ast) => ast,
2288 None => return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing)),
2289 };
2290 match ast {
2291 Ast::Empty(_) | Ast::Flags(_) => {
2292 return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing))
2293 }
2294 _ => {}
2295 }
2296 if !self.bump_and_bump_space() {
2297 return Err(self.error(
2298 Span::new(start, self.pos()),
2299 ast::ErrorKind::RepetitionCountUnclosed,
2300 ));
2301 }
2302 let count_start = specialize_err(
2303 self.parse_decimal(),
2304 ast::ErrorKind::DecimalEmpty,
2305 ast::ErrorKind::RepetitionCountDecimalEmpty,
2306 );
2307 if self.is_eof() {
2308 return Err(self.error(
2309 Span::new(start, self.pos()),
2310 ast::ErrorKind::RepetitionCountUnclosed,
2311 ));
2312 }
2313 let range = if self.char() == ',' {
2314 if !self.bump_and_bump_space() {
2315 return Err(self.error(
2316 Span::new(start, self.pos()),
2317 ast::ErrorKind::RepetitionCountUnclosed,
2318 ));
2319 }
2320 if self.char() != '}' {
2321 let count_start = match count_start {
2322 Ok(c) => c,
2323 Err(err) if err.kind == ast::ErrorKind::RepetitionCountDecimalEmpty => {
2324 if self.parser().empty_min_range {
2325 0
2326 } else {
2327 return Err(err);
2328 }
2329 }
2330 err => err?,
2331 };
2332 let count_end = specialize_err(
2333 self.parse_decimal(),
2334 ast::ErrorKind::DecimalEmpty,
2335 ast::ErrorKind::RepetitionCountDecimalEmpty,
2336 )?;
2337 ast::RepetitionRange::Bounded(count_start, count_end)
2338 } else {
2339 ast::RepetitionRange::AtLeast(count_start?)
2340 }
2341 } else {
2342 ast::RepetitionRange::Exactly(count_start?)
2343 };
2344
2345 if self.is_eof() || self.char() != '}' {
2346 return Err(self.error(
2347 Span::new(start, self.pos()),
2348 ast::ErrorKind::RepetitionCountUnclosed,
2349 ));
2350 }
2351
2352 if self.bump_and_bump_space() && self.char() == '?' {
2353 return Err(self.error(
2354 Span::new(start, self.pos()),
2355 ast::ErrorKind::UnsupportedLazyQuantifier,
2356 ));
2357 }
2358
2359 let op_span = Span::new(start, self.pos());
2360 if !range.is_valid() {
2361 return Err(self.error(op_span, ast::ErrorKind::RepetitionCountInvalid));
2362 }
2363
2364 let over_limit = match &range {
2365 ast::RepetitionRange::Exactly(n) => *n > self.max_repeat,
2366 ast::RepetitionRange::AtLeast(n) => *n > self.max_repeat,
2367 ast::RepetitionRange::Bounded(n, m) => *n > self.max_repeat || *m > self.max_repeat,
2368 };
2369 if over_limit {
2370 return Err(self.error(op_span, ast::ErrorKind::UnsupportedResharpRegex));
2371 }
2372 concat.asts.push(Ast::repetition(ast::Repetition {
2373 span: ast.span().with_end(self.pos()),
2374 op: ast::RepetitionOp {
2375 span: op_span,
2376 kind: ast::RepetitionKind::Range(range),
2377 },
2378 greedy: true,
2379 ast: Box::new(ast),
2380 }));
2381 Ok(concat)
2382 }
2383
2384 #[inline(never)]
2385 fn parse_group(&self) -> Result<Either<ast::SetFlags, ast::Group>> {
2386 assert_eq!(self.char(), '(');
2387 let open_span = self.span_char();
2388 self.bump();
2389 self.bump_space();
2390 if let Some((ahead, pos)) = self.is_lookaround_prefix() {
2391 let kind = match (pos, ahead) {
2392 (true, true) => LookaroundKind::PositiveLookahead,
2393 (true, false) => LookaroundKind::PositiveLookbehind,
2394 (false, true) => LookaroundKind::NegativeLookahead,
2395 (false, false) => LookaroundKind::NegativeLookbehind,
2396 };
2397 return Ok(Either::Right(ast::Group {
2398 span: open_span,
2399 kind: ast::GroupKind::Lookaround(kind),
2400 ast: Box::new(Ast::empty(self.span())),
2401 }));
2402 }
2403 let inner_span = self.span();
2404 let mut starts_with_p = true;
2405 if self.bump_if("?P<") || {
2406 starts_with_p = false;
2407 self.bump_if("?<")
2408 } {
2409 let capture_index = self.next_capture_index(open_span)?;
2410 let name = self.parse_capture_name(capture_index)?;
2411 Ok(Either::Right(ast::Group {
2412 span: open_span,
2413 kind: ast::GroupKind::CaptureName {
2414 starts_with_p,
2415 name,
2416 },
2417 ast: Box::new(Ast::empty(self.span())),
2418 }))
2419 } else if self.bump_if("?") {
2420 if self.is_eof() {
2421 return Err(self.error(open_span, ast::ErrorKind::GroupUnclosed));
2422 }
2423 let flags = self.parse_flags()?;
2424 let char_end = self.char();
2425 self.bump();
2426 if char_end == ')' {
2427 if flags.items.is_empty() {
2430 return Err(self.error(inner_span, ast::ErrorKind::RepetitionMissing));
2431 }
2432 Ok(Either::Left(ast::SetFlags {
2433 span: Span {
2434 end: self.pos(),
2435 ..open_span
2436 },
2437 flags,
2438 }))
2439 } else {
2440 assert_eq!(char_end, ':');
2441 Ok(Either::Right(ast::Group {
2442 span: open_span,
2443 kind: ast::GroupKind::NonCapturing(flags),
2444 ast: Box::new(Ast::empty(self.span())),
2445 }))
2446 }
2447 } else {
2448 let capture_index = self.next_capture_index(open_span)?;
2449 Ok(Either::Right(ast::Group {
2450 span: open_span,
2451 kind: ast::GroupKind::CaptureIndex(capture_index),
2452 ast: Box::new(Ast::empty(self.span())),
2453 }))
2454 }
2455 }
2456
2457 #[inline(never)]
2458 fn parse_capture_name(&self, capture_index: u32) -> Result<ast::CaptureName> {
2459 if self.is_eof() {
2460 return Err(self.error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
2461 }
2462 let start = self.pos();
2463 loop {
2464 if self.char() == '>' {
2465 break;
2466 }
2467 if !is_capture_char(self.char(), self.pos() == start) {
2468 return Err(self.error(self.span_char(), ast::ErrorKind::GroupNameInvalid));
2469 }
2470 if !self.bump() {
2471 break;
2472 }
2473 }
2474 let end = self.pos();
2475 if self.is_eof() {
2476 return Err(self.error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
2477 }
2478 assert_eq!(self.char(), '>');
2479 self.bump();
2480 let name = &self.pattern()[start.offset..end.offset];
2481 if name.is_empty() {
2482 return Err(self.error(Span::new(start, start), ast::ErrorKind::GroupNameEmpty));
2483 }
2484 let capname = ast::CaptureName {
2485 span: Span::new(start, end),
2486 name: name.to_string(),
2487 index: capture_index,
2488 };
2489 self.add_capture_name(&capname)?;
2490 Ok(capname)
2491 }
2492
2493 #[inline(never)]
2494 fn parse_flags(&self) -> Result<ast::Flags> {
2495 let mut flags = ast::Flags {
2496 span: self.span(),
2497 items: vec![],
2498 };
2499 let mut last_was_negation = None;
2500 while self.char() != ':' && self.char() != ')' {
2501 if self.char() == '-' {
2502 last_was_negation = Some(self.span_char());
2503 let item = ast::FlagsItem {
2504 span: self.span_char(),
2505 kind: ast::FlagsItemKind::Negation,
2506 };
2507 if let Some(i) = flags.add_item(item) {
2508 return Err(self.error(
2509 self.span_char(),
2510 ast::ErrorKind::FlagRepeatedNegation {
2511 original: flags.items[i].span,
2512 },
2513 ));
2514 }
2515 } else {
2516 last_was_negation = None;
2517 let item = ast::FlagsItem {
2518 span: self.span_char(),
2519 kind: ast::FlagsItemKind::Flag(self.parse_flag()?),
2520 };
2521 if let Some(i) = flags.add_item(item) {
2522 return Err(self.error(
2523 self.span_char(),
2524 ast::ErrorKind::FlagDuplicate {
2525 original: flags.items[i].span,
2526 },
2527 ));
2528 }
2529 }
2530 if !self.bump() {
2531 return Err(self.error(self.span(), ast::ErrorKind::FlagUnexpectedEof));
2532 }
2533 }
2534 if let Some(span) = last_was_negation {
2535 return Err(self.error(span, ast::ErrorKind::FlagDanglingNegation));
2536 }
2537 flags.span.end = self.pos();
2538 Ok(flags)
2539 }
2540
2541 #[inline(never)]
2542 fn parse_flag(&self) -> Result<ast::Flag> {
2543 match self.char() {
2544 'i' => Ok(ast::Flag::CaseInsensitive),
2545 'm' => Ok(ast::Flag::MultiLine),
2546 's' => Ok(ast::Flag::DotMatchesNewLine),
2547 'U' => Ok(ast::Flag::SwapGreed),
2548 'u' => Ok(ast::Flag::Unicode),
2549 'R' => Ok(ast::Flag::CRLF),
2550 'x' => Ok(ast::Flag::IgnoreWhitespace),
2551 _ => Err(self.error(self.span_char(), ast::ErrorKind::FlagUnrecognized)),
2552 }
2553 }
2554
2555 fn parse_primitive(&self) -> Result<Primitive> {
2556 match self.char() {
2557 '\\' => self.parse_escape(),
2558 '_' => {
2559 let ast = Primitive::Top(self.span_char());
2560 self.bump();
2561 Ok(ast)
2562 }
2563 '.' => {
2564 let ast = Primitive::Dot(self.span_char());
2565 self.bump();
2566 Ok(ast)
2567 }
2568 '^' => {
2569 let ast = Primitive::Assertion(ast::Assertion {
2570 span: self.span_char(),
2571 kind: ast::AssertionKind::StartLine,
2572 });
2573 self.bump();
2574 Ok(ast)
2575 }
2576 '$' => {
2577 let ast = Primitive::Assertion(ast::Assertion {
2578 span: self.span_char(),
2579 kind: ast::AssertionKind::EndLine,
2580 });
2581 self.bump();
2582 Ok(ast)
2583 }
2584 c => {
2585 let ast = Primitive::Literal(Literal {
2586 span: self.span_char(),
2587 kind: LiteralKind::Verbatim,
2588 c,
2589 });
2590 self.bump();
2591 Ok(ast)
2592 }
2593 }
2594 }
2595
2596 #[inline(never)]
2597 fn parse_escape(&self) -> Result<Primitive> {
2598 assert_eq!(self.char(), '\\');
2599 let start = self.pos();
2600 if !self.bump() {
2601 return Err(self.error(
2602 Span::new(start, self.pos()),
2603 ast::ErrorKind::EscapeUnexpectedEof,
2604 ));
2605 }
2606 let c = self.char();
2607 match c {
2609 '0'..='9' => {
2610 if !self.parser().octal {
2611 return Err(self.error(
2612 Span::new(start, self.span_char().end),
2613 ast::ErrorKind::UnsupportedBackreference,
2614 ));
2615 }
2616 let mut lit = self.parse_octal();
2617 lit.span.start = start;
2618 return Ok(Primitive::Literal(lit));
2619 }
2620 'x' | 'u' | 'U' => {
2621 let mut lit = self.parse_hex()?;
2622 lit.span.start = start;
2623 return Ok(Primitive::Literal(lit));
2624 }
2625 'p' | 'P' => {
2626 let mut cls = self.parse_unicode_class()?;
2627 cls.span.start = start;
2628 return Ok(Primitive::Unicode(cls));
2629 }
2630 'd' | 's' | 'w' | 'D' | 'S' | 'W' => {
2631 let mut cls = self.parse_perl_class();
2632 cls.span.start = start;
2633 return Ok(Primitive::Perl(cls));
2634 }
2635 _ => {}
2636 }
2637
2638 self.bump();
2640 let span = Span::new(start, self.pos());
2641 if is_meta_character(c) {
2642 return Ok(Primitive::Literal(Literal {
2643 span,
2644 kind: LiteralKind::Meta,
2645 c,
2646 }));
2647 }
2648 if is_escapeable_character(c) {
2649 return Ok(Primitive::Literal(Literal {
2650 span,
2651 kind: LiteralKind::Superfluous,
2652 c,
2653 }));
2654 }
2655 let special = |kind, c| {
2656 Ok(Primitive::Literal(Literal {
2657 span,
2658 kind: LiteralKind::Special(kind),
2659 c,
2660 }))
2661 };
2662 match c {
2663 'a' => special(SpecialLiteralKind::Bell, '\x07'),
2664 'f' => special(SpecialLiteralKind::FormFeed, '\x0C'),
2665 't' => special(SpecialLiteralKind::Tab, '\t'),
2666 'n' => special(SpecialLiteralKind::LineFeed, '\n'),
2667 'r' => special(SpecialLiteralKind::CarriageReturn, '\r'),
2668 'v' => special(SpecialLiteralKind::VerticalTab, '\x0B'),
2669 'A' => Ok(Primitive::Assertion(ast::Assertion {
2670 span,
2671 kind: ast::AssertionKind::StartText,
2672 })),
2673 'z' => Ok(Primitive::Assertion(ast::Assertion {
2674 span,
2675 kind: ast::AssertionKind::EndText,
2676 })),
2677 'b' => {
2678 let mut wb = ast::Assertion {
2679 span,
2680 kind: ast::AssertionKind::WordBoundary,
2681 };
2682 if !self.is_eof() && self.char() == '{' {
2685 if let Some(kind) = self.maybe_parse_special_word_boundary(start)? {
2686 wb.kind = kind;
2687 wb.span.end = self.pos();
2688 }
2689 }
2690 Ok(Primitive::Assertion(wb))
2691 }
2692 'B' => Ok(Primitive::Assertion(ast::Assertion {
2693 span,
2694 kind: ast::AssertionKind::NotWordBoundary,
2695 })),
2696 '<' => Ok(Primitive::Assertion(ast::Assertion {
2697 span,
2698 kind: ast::AssertionKind::WordBoundaryStartAngle,
2699 })),
2700 '>' => Ok(Primitive::Assertion(ast::Assertion {
2701 span,
2702 kind: ast::AssertionKind::WordBoundaryEndAngle,
2703 })),
2704 _ => Err(self.error(span, ast::ErrorKind::EscapeUnrecognized)),
2705 }
2706 }
2707
2708 fn maybe_parse_special_word_boundary(
2709 &self,
2710 wb_start: Position,
2711 ) -> Result<Option<ast::AssertionKind>> {
2712 assert_eq!(self.char(), '{');
2713
2714 let is_valid_char = |c| matches!(c, 'A'..='Z' | 'a'..='z' | '-');
2715 let start = self.pos();
2716 if !self.bump_and_bump_space() {
2717 return Err(self.error(
2718 Span::new(wb_start, self.pos()),
2719 ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof,
2720 ));
2721 }
2722 let start_contents = self.pos();
2723 if !is_valid_char(self.char()) {
2724 self.parser().pos.set(start);
2725 return Ok(None);
2726 }
2727
2728 let mut scratch = self.parser().scratch.borrow_mut();
2730 scratch.clear();
2731 while !self.is_eof() && is_valid_char(self.char()) {
2732 scratch.push(self.char());
2733 self.bump_and_bump_space();
2734 }
2735 if self.is_eof() || self.char() != '}' {
2736 return Err(self.error(
2737 Span::new(start, self.pos()),
2738 ast::ErrorKind::SpecialWordBoundaryUnclosed,
2739 ));
2740 }
2741 let end = self.pos();
2742 self.bump();
2743 let kind = match scratch.as_str() {
2744 "start" => ast::AssertionKind::WordBoundaryStart,
2745 "end" => ast::AssertionKind::WordBoundaryEnd,
2746 "start-half" => ast::AssertionKind::WordBoundaryStartHalf,
2747 "end-half" => ast::AssertionKind::WordBoundaryEndHalf,
2748 _ => {
2749 return Err(self.error(
2750 Span::new(start_contents, end),
2751 ast::ErrorKind::SpecialWordBoundaryUnrecognized,
2752 ))
2753 }
2754 };
2755 Ok(Some(kind))
2756 }
2757
2758 #[inline(never)]
2759 fn parse_octal(&self) -> Literal {
2760 assert!(self.parser().octal);
2761 assert!('0' <= self.char() && self.char() <= '7');
2762 let start = self.pos();
2763 while self.bump()
2765 && '0' <= self.char()
2766 && self.char() <= '7'
2767 && self.pos().offset - start.offset <= 2
2768 {}
2769 let end = self.pos();
2770 let octal = &self.pattern()[start.offset..end.offset];
2771 let codepoint = u32::from_str_radix(octal, 8).expect("valid octal number");
2774 let c = char::from_u32(codepoint).expect("Unicode scalar value");
2777 Literal {
2778 span: Span::new(start, end),
2779 kind: LiteralKind::Octal,
2780 c,
2781 }
2782 }
2783
2784 #[inline(never)]
2785 fn parse_hex(&self) -> Result<Literal> {
2786 assert!(self.char() == 'x' || self.char() == 'u' || self.char() == 'U');
2787
2788 let hex_kind = match self.char() {
2789 'x' => HexLiteralKind::X,
2790 'u' => HexLiteralKind::UnicodeShort,
2791 _ => HexLiteralKind::UnicodeLong,
2792 };
2793 if !self.bump_and_bump_space() {
2794 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2795 }
2796 if self.char() == '{' {
2797 self.parse_hex_brace(hex_kind)
2798 } else {
2799 self.parse_hex_digits(hex_kind)
2800 }
2801 }
2802
2803 #[inline(never)]
2804 fn parse_hex_digits(&self, kind: HexLiteralKind) -> Result<Literal> {
2805 let mut scratch = self.parser().scratch.borrow_mut();
2806 scratch.clear();
2807
2808 let start = self.pos();
2809 for i in 0..kind.digits() {
2810 if i > 0 && !self.bump_and_bump_space() {
2811 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2812 }
2813 if !is_hex(self.char()) {
2814 return Err(self.error(self.span_char(), ast::ErrorKind::EscapeHexInvalidDigit));
2815 }
2816 scratch.push(self.char());
2817 }
2818 self.bump_and_bump_space();
2819 let end = self.pos();
2820 let hex = scratch.as_str();
2821 match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
2822 None => Err(self.error(Span::new(start, end), ast::ErrorKind::EscapeHexInvalid)),
2823 Some(c) => Ok(Literal {
2824 span: Span::new(start, end),
2825 kind: LiteralKind::HexFixed(kind),
2826 c,
2827 }),
2828 }
2829 }
2830
2831 #[inline(never)]
2832 fn parse_hex_brace(&self, kind: HexLiteralKind) -> Result<Literal> {
2833 let mut scratch = self.parser().scratch.borrow_mut();
2834 scratch.clear();
2835
2836 let brace_pos = self.pos();
2837 let start = self.span_char().end;
2838 while self.bump_and_bump_space() && self.char() != '}' {
2839 if !is_hex(self.char()) {
2840 return Err(self.error(self.span_char(), ast::ErrorKind::EscapeHexInvalidDigit));
2841 }
2842 scratch.push(self.char());
2843 }
2844 if self.is_eof() {
2845 return Err(self.error(
2846 Span::new(brace_pos, self.pos()),
2847 ast::ErrorKind::EscapeUnexpectedEof,
2848 ));
2849 }
2850 let end = self.pos();
2851 let hex = scratch.as_str();
2852 assert_eq!(self.char(), '}');
2853 self.bump_and_bump_space();
2854
2855 if hex.is_empty() {
2856 return Err(self.error(
2857 Span::new(brace_pos, self.pos()),
2858 ast::ErrorKind::EscapeHexEmpty,
2859 ));
2860 }
2861 match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
2862 None => Err(self.error(Span::new(start, end), ast::ErrorKind::EscapeHexInvalid)),
2863 Some(c) => Ok(Literal {
2864 span: Span::new(start, self.pos()),
2865 kind: LiteralKind::HexBrace(kind),
2866 c,
2867 }),
2868 }
2869 }
2870
2871 fn parse_decimal(&self) -> Result<u32> {
2872 let mut scratch = self.parser().scratch.borrow_mut();
2873 scratch.clear();
2874
2875 while !self.is_eof() && self.char().is_whitespace() {
2876 self.bump();
2877 }
2878 let start = self.pos();
2879 while !self.is_eof() && '0' <= self.char() && self.char() <= '9' {
2880 scratch.push(self.char());
2881 self.bump_and_bump_space();
2882 }
2883 let span = Span::new(start, self.pos());
2884 while !self.is_eof() && self.char().is_whitespace() {
2885 self.bump_and_bump_space();
2886 }
2887 let digits = scratch.as_str();
2888 if digits.is_empty() {
2889 return Err(self.error(span, ast::ErrorKind::DecimalEmpty));
2890 }
2891 match digits.parse::<u32>().ok() {
2892 Some(n) => Ok(n),
2893 None => Err(self.error(span, ast::ErrorKind::DecimalInvalid)),
2894 }
2895 }
2896
2897 #[inline(never)]
2898 fn parse_set_class(&self) -> Result<ClassBracketed> {
2899 assert_eq!(self.char(), '[');
2900
2901 let mut union = ClassSetUnion {
2902 span: self.span(),
2903 items: vec![],
2904 };
2905 loop {
2906 self.bump_space();
2907 if self.is_eof() {
2908 return Err(self.unclosed_class_error());
2909 }
2910 match self.char() {
2911 '[' => {
2912 if !self.parser().stack_class.borrow().is_empty() {
2913 if let Some(cls) = self.maybe_parse_ascii_class() {
2914 union.push(ClassSetItem::Ascii(cls));
2915 continue;
2916 }
2917 }
2918 union = self.push_class_open(union)?;
2919 }
2920 ']' => match self.pop_class(union)? {
2921 Either::Left(nested_union) => {
2922 union = nested_union;
2923 }
2924 Either::Right(class) => return Ok(class),
2925 },
2926 '&' if self.peek() == Some('&') => {
2927 assert!(self.bump_if("&&"));
2928 union = self.push_class_op(ClassSetBinaryOpKind::Intersection, union);
2929 }
2930 '-' if self.peek() == Some('-') => {
2931 assert!(self.bump_if("--"));
2932 union = self.push_class_op(ClassSetBinaryOpKind::Difference, union);
2933 }
2934 '~' if self.peek() == Some('~') => {
2935 assert!(self.bump_if("~~"));
2936 union = self.push_class_op(ClassSetBinaryOpKind::SymmetricDifference, union);
2937 }
2938 _ => {
2939 union.push(self.parse_set_class_range()?);
2940 }
2941 }
2942 }
2943 }
2944
2945 #[inline(never)]
2946 fn parse_set_class_range(&self) -> Result<ClassSetItem> {
2947 let prim1 = self.parse_set_class_item()?;
2948 self.bump_space();
2949 if self.is_eof() {
2950 return Err(self.unclosed_class_error());
2951 }
2952 if self.char() != '-' || self.peek_space() == Some(']') || self.peek_space() == Some('-') {
2953 return prim1.into_class_set_item(self);
2954 }
2955 if !self.bump_and_bump_space() {
2956 return Err(self.unclosed_class_error());
2957 }
2958 let prim2 = self.parse_set_class_item()?;
2959 let range = ClassSetRange {
2960 span: Span::new(prim1.span().start, prim2.span().end),
2961 start: prim1.into_class_literal(self)?,
2962 end: prim2.into_class_literal(self)?,
2963 };
2964 if !range.is_valid() {
2965 return Err(self.error(range.span, ast::ErrorKind::ClassRangeInvalid));
2966 }
2967 Ok(ClassSetItem::Range(range))
2968 }
2969
2970 #[inline(never)]
2971 fn parse_set_class_item(&self) -> Result<Primitive> {
2972 if self.char() == '\\' {
2973 self.parse_escape()
2974 } else {
2975 let x = Primitive::Literal(Literal {
2976 span: self.span_char(),
2977 kind: LiteralKind::Verbatim,
2978 c: self.char(),
2979 });
2980 self.bump();
2981 Ok(x)
2982 }
2983 }
2984
2985 #[inline(never)]
2986 fn parse_set_class_open(&self) -> Result<(ClassBracketed, ClassSetUnion)> {
2987 assert_eq!(self.char(), '[');
2988 let start = self.pos();
2989 if !self.bump_and_bump_space() {
2990 return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2991 }
2992
2993 let negated = if self.char() != '^' {
2994 false
2995 } else {
2996 if !self.bump_and_bump_space() {
2997 return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
2998 }
2999 true
3000 };
3001 let mut union = ClassSetUnion {
3003 span: self.span(),
3004 items: vec![],
3005 };
3006 while self.char() == '-' {
3007 union.push(ClassSetItem::Literal(Literal {
3008 span: self.span_char(),
3009 kind: LiteralKind::Verbatim,
3010 c: '-',
3011 }));
3012 if !self.bump_and_bump_space() {
3013 return Err(self.error(Span::new(start, start), ast::ErrorKind::ClassUnclosed));
3014 }
3015 }
3016 if union.items.is_empty() && self.char() == ']' {
3019 union.push(ClassSetItem::Literal(Literal {
3020 span: self.span_char(),
3021 kind: LiteralKind::Verbatim,
3022 c: ']',
3023 }));
3024 if !self.bump_and_bump_space() {
3025 return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
3026 }
3027 }
3028 let set = ClassBracketed {
3029 span: Span::new(start, self.pos()),
3030 negated,
3031 kind: ClassSet::union(ClassSetUnion {
3032 span: Span::new(union.span.start, union.span.start),
3033 items: vec![],
3034 }),
3035 };
3036 Ok((set, union))
3037 }
3038
3039 #[inline(never)]
3040 fn maybe_parse_ascii_class(&self) -> Option<ClassAscii> {
3041 assert_eq!(self.char(), '[');
3042 let start = self.pos();
3044 let mut negated = false;
3045 if !self.bump() || self.char() != ':' {
3046 self.parser().pos.set(start);
3047 return None;
3048 }
3049 if !self.bump() {
3050 self.parser().pos.set(start);
3051 return None;
3052 }
3053 if self.char() == '^' {
3054 negated = true;
3055 if !self.bump() {
3056 self.parser().pos.set(start);
3057 return None;
3058 }
3059 }
3060 let name_start = self.offset();
3061 while self.char() != ':' && self.bump() {}
3062 if self.is_eof() {
3063 self.parser().pos.set(start);
3064 return None;
3065 }
3066 let name = &self.pattern()[name_start..self.offset()];
3067 if !self.bump_if(":]") {
3068 self.parser().pos.set(start);
3069 return None;
3070 }
3071 let kind = match regex_syntax::ast::ClassAsciiKind::from_name(name) {
3072 Some(kind) => kind,
3073 None => {
3074 self.parser().pos.set(start);
3075 return None;
3076 }
3077 };
3078 Some(ClassAscii {
3079 span: Span::new(start, self.pos()),
3080 kind,
3081 negated,
3082 })
3083 }
3084
3085 #[inline(never)]
3086 fn parse_unicode_class(&self) -> Result<ClassUnicode> {
3087 assert!(self.char() == 'p' || self.char() == 'P');
3088
3089 let mut scratch = self.parser().scratch.borrow_mut();
3090 scratch.clear();
3091
3092 let negated = self.char() == 'P';
3093 if !self.bump_and_bump_space() {
3094 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
3095 }
3096 let (start, kind) = if self.char() == '{' {
3097 let start = self.span_char().end;
3098 while self.bump_and_bump_space() && self.char() != '}' {
3099 scratch.push(self.char());
3100 }
3101 if self.is_eof() {
3102 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
3103 }
3104 assert_eq!(self.char(), '}');
3105 self.bump();
3106
3107 let name = scratch.as_str();
3108 if let Some(i) = name.find("!=") {
3109 (
3110 start,
3111 ClassUnicodeKind::NamedValue {
3112 op: ClassUnicodeOpKind::NotEqual,
3113 name: name[..i].to_string(),
3114 value: name[i + 2..].to_string(),
3115 },
3116 )
3117 } else if let Some(i) = name.find(':') {
3118 (
3119 start,
3120 ClassUnicodeKind::NamedValue {
3121 op: ClassUnicodeOpKind::Colon,
3122 name: name[..i].to_string(),
3123 value: name[i + 1..].to_string(),
3124 },
3125 )
3126 } else if let Some(i) = name.find('=') {
3127 (
3128 start,
3129 ClassUnicodeKind::NamedValue {
3130 op: ClassUnicodeOpKind::Equal,
3131 name: name[..i].to_string(),
3132 value: name[i + 1..].to_string(),
3133 },
3134 )
3135 } else {
3136 (start, ClassUnicodeKind::Named(name.to_string()))
3137 }
3138 } else {
3139 let start = self.pos();
3140 let c = self.char();
3141 if c == '\\' {
3142 return Err(self.error(self.span_char(), ast::ErrorKind::UnicodeClassInvalid));
3143 }
3144 self.bump_and_bump_space();
3145 let kind = ClassUnicodeKind::OneLetter(c);
3146 (start, kind)
3147 };
3148 Ok(ClassUnicode {
3149 span: Span::new(start, self.pos()),
3150 negated,
3151 kind,
3152 })
3153 }
3154
3155 #[inline(never)]
3156 fn parse_perl_class(&self) -> ClassPerl {
3157 let c = self.char();
3158 let span = self.span_char();
3159 self.bump();
3160 let (negated, kind) = match c {
3161 'd' => (false, regex_syntax::ast::ClassPerlKind::Digit),
3162 'D' => (true, regex_syntax::ast::ClassPerlKind::Digit),
3163 's' => (false, regex_syntax::ast::ClassPerlKind::Space),
3164 'S' => (true, regex_syntax::ast::ClassPerlKind::Space),
3165 'w' => (false, regex_syntax::ast::ClassPerlKind::Word),
3166 'W' => (true, regex_syntax::ast::ClassPerlKind::Word),
3167 c => panic!("expected valid Perl class but got '{}'", c),
3168 };
3169 ClassPerl {
3170 span,
3171 kind,
3172 negated,
3173 }
3174 }
3175}
3176
3177fn is_universal_perl_pair(item: ®ex_syntax::ast::ClassSetItem) -> bool {
3180 use regex_syntax::ast::ClassSetItem;
3181 let items = match item {
3182 ClassSetItem::Union(u) => &u.items,
3183 _ => return false,
3184 };
3185 if items.len() != 2 {
3186 return false;
3187 }
3188 match (&items[0], &items[1]) {
3189 (ClassSetItem::Perl(a), ClassSetItem::Perl(b)) => {
3190 let is_all = a.kind == b.kind && a.negated != b.negated;
3191 is_all
3192 }
3193 _ => false,
3194 }
3195}
3196
3197pub fn max_concat_length(ast: &ast::Ast) -> usize {
3198 match ast {
3199 ast::Ast::Empty(_)
3200 | ast::Ast::Flags(_)
3201 | ast::Ast::Literal(_)
3202 | ast::Ast::Dot(_)
3203 | ast::Ast::Top(_)
3204 | ast::Ast::Assertion(_)
3205 | ast::Ast::ClassUnicode(_)
3206 | ast::Ast::ClassPerl(_)
3207 | ast::Ast::ClassBracketed(_) => 0,
3208 ast::Ast::Group(g) => max_concat_length(&g.ast),
3209 ast::Ast::Complement(c) => max_concat_length(&c.ast),
3210 ast::Ast::Lookaround(l) => max_concat_length(&l.ast),
3211 ast::Ast::Repetition(r) => max_concat_length(&r.ast),
3212 ast::Ast::Concat(c) => c
3213 .asts
3214 .len()
3215 .max(c.asts.iter().map(max_concat_length).max().unwrap_or(0)),
3216 ast::Ast::Alternation(a) => a.asts.iter().map(max_concat_length).max().unwrap_or(0),
3217 ast::Ast::Intersection(i) => i.asts.iter().map(max_concat_length).max().unwrap_or(0),
3218 }
3219}
3220
3221pub fn expanded_ast_size(ast: &ast::Ast, limit: u64) -> u64 {
3222 fn go(ast: &ast::Ast, limit: u64) -> u64 {
3223 match ast {
3224 ast::Ast::Empty(_) | ast::Ast::Flags(_) => 1,
3225 ast::Ast::Literal(_) | ast::Ast::Dot(_) | ast::Ast::Top(_) => 1,
3226 ast::Ast::Assertion(_) => 1,
3227 ast::Ast::ClassUnicode(_) | ast::Ast::ClassPerl(_) | ast::Ast::ClassBracketed(_) => 1,
3228 ast::Ast::Group(g) => go(&g.ast, limit).saturating_add(1).min(limit),
3229 ast::Ast::Complement(c) => go(&c.ast, limit).saturating_add(1).min(limit),
3230 ast::Ast::Lookaround(l) => go(&l.ast, limit).saturating_add(1).min(limit),
3231 ast::Ast::Concat(c) => sum_children(&c.asts, limit),
3232 ast::Ast::Alternation(a) => sum_children(&a.asts, limit),
3233 ast::Ast::Intersection(i) => sum_children(&i.asts, limit),
3234 ast::Ast::Repetition(r) => {
3235 let body = go(&r.ast, limit);
3236 let factor: u64 = match &r.op.kind {
3237 ast::RepetitionKind::ZeroOrOne => 2,
3238 ast::RepetitionKind::ZeroOrMore | ast::RepetitionKind::OneOrMore => 2,
3239 ast::RepetitionKind::Range(ast::RepetitionRange::Exactly(n)) => {
3240 (*n as u64).max(1)
3241 }
3242 ast::RepetitionKind::Range(ast::RepetitionRange::AtLeast(n)) => {
3243 (*n as u64).max(1).saturating_add(1)
3244 }
3245 ast::RepetitionKind::Range(ast::RepetitionRange::Bounded(_, m)) => {
3246 (*m as u64).max(1)
3247 }
3248 };
3249 body.saturating_mul(factor).min(limit)
3250 }
3251 }
3252 }
3253 fn sum_children(children: &[ast::Ast], limit: u64) -> u64 {
3254 let mut total: u64 = 0;
3255 for c in children {
3256 total = total.saturating_add(go(c, limit));
3257 if total >= limit {
3258 return limit;
3259 }
3260 }
3261 total
3262 }
3263 go(ast, limit)
3264}
3265
3266pub fn parse_ast<'s>(tb: &mut TB<'s>, pattern: &'s str) -> std::result::Result<NodeId, ParseError> {
3267 let mut p: ResharpParser<'s> = ResharpParser::new(pattern);
3268 p.parse(tb)
3269}
3270
3271pub fn parse_ast_with<'s>(
3272 tb: &mut TB<'s>,
3273 pattern: &'s str,
3274 flags: &PatternFlags,
3275) -> std::result::Result<NodeId, ParseError> {
3276 let mut p: ResharpParser<'s> = ResharpParser::with_flags(pattern, flags);
3277 p.parse(tb)
3278}
3279
3280pub fn parse_to_ast(pattern: &str) -> std::result::Result<ast::Ast, ParseError> {
3282 let mut p: ResharpParser = ResharpParser::new(pattern);
3283 p.parse_inner()
3284}