1#![warn(dead_code)]
6pub mod ast;
7use std::cell::{Cell, RefCell};
8
9use ast::{Ast, Concat, ErrorKind, GroupKind, LookaroundKind, RepetitionKind};
10use regex_syntax::{
11 ast::{
12 ClassAscii, ClassBracketed, ClassPerl, ClassSet, ClassSetBinaryOpKind, ClassSetItem,
13 ClassSetRange, ClassSetUnion, ClassUnicode, ClassUnicodeKind, ClassUnicodeOpKind,
14 HexLiteralKind, Literal, LiteralKind, Position, Span, SpecialLiteralKind,
15 },
16 hir::{
17 self,
18 translate::{Translator, TranslatorBuilder},
19 },
20 utf8::Utf8Sequences,
21};
22use resharp_algebra::{Kind, NodeId};
23
/// Shorthand for the algebra builder that parsed nodes are emitted into
/// (see the `tb.mk_*` calls in `hir_to_node_id`).
// NOTE(review): the `'s` parameter does not appear on the right-hand side.
type TB<'s> = resharp_algebra::RegexBuilder;
25
/// Options that configure a [`ResharpParser`]; consumed by
/// `ResharpParser::with_flags`.
pub struct PatternFlags {
    /// Forwarded to the HIR translator's `unicode` setting and folded into the
    /// parser's `global_unicode` flag.
    pub unicode: bool,
    /// Stored as the parser's `global_full_unicode`; also turns
    /// `global_unicode` on.
    pub full_unicode: bool,
    /// Forwarded to the HIR translator's `case_insensitive` setting.
    pub case_insensitive: bool,
    /// Initial value of the parser's `dot_all` state.
    pub dot_matches_new_line: bool,
    /// Initial value of the parser's `multiline` state.
    pub multiline: bool,
    /// Initial value of the parser's whitespace-insensitive mode, in which
    /// whitespace and `#` comments are skipped between tokens.
    pub ignore_whitespace: bool,
    /// Stored as the parser's `global_ascii_perl`; also turns `global_unicode`
    /// on. Presumably restricts `\w`, `\d`, `\s` to ASCII — confirm in
    /// `get_class`.
    pub ascii_perl_classes: bool,
    /// Copied into the parser; enforcement happens outside this section.
    pub expanded_ast_limit: u64,
    /// Copied into the parser; enforcement happens outside this section.
    pub max_list_len: usize,
    /// Copied into the parser; enforcement happens outside this section.
    pub max_repeat: u32,
    /// Copied into the parser; enforcement happens outside this section.
    pub max_depth: usize,
}
53
/// Default for [`PatternFlags::max_repeat`].
pub const DEFAULT_MAX_REPEAT: u32 = 500;
/// Default for [`PatternFlags::expanded_ast_limit`].
pub const DEFAULT_EXPANDED_AST_LIMIT: u64 = 50_000;
/// Default for [`PatternFlags::max_list_len`].
pub const DEFAULT_MAX_LIST_LEN: usize = 4_000;
/// Default for [`PatternFlags::max_depth`].
pub const DEFAULT_MAX_DEPTH: usize = 1_000;
60
61impl Default for PatternFlags {
62 fn default() -> Self {
63 Self {
64 unicode: true,
65 full_unicode: false,
66 case_insensitive: false,
67 dot_matches_new_line: false,
68 multiline: true,
69 ignore_whitespace: false,
70 ascii_perl_classes: false,
71 expanded_ast_limit: DEFAULT_EXPANDED_AST_LIMIT,
72 max_list_len: DEFAULT_MAX_LIST_LEN,
73 max_repeat: DEFAULT_MAX_REPEAT,
74 max_depth: DEFAULT_MAX_DEPTH,
75 }
76 }
77}
78
/// Coarse classification of what a pattern fragment can match with respect to
/// ASCII word bytes (see `is_word_byte`).
///
/// The class helpers in this section only ever produce `Word`, `NonWord` and
/// `Unknown`; the remaining variants are used elsewhere in the file.
#[derive(Clone, Copy, PartialEq, Debug)]
enum WordCharKind {
    /// Everything matched is a word byte.
    Word,
    /// Nothing matched is a word byte.
    NonWord,
    MaybeWord,
    MaybeNonWord,
    /// No decision could be made.
    Unknown,
    Edge,
}
88
/// Returns `true` for the ASCII word bytes `[0-9A-Za-z_]`.
fn is_word_byte(b: u8) -> bool {
    matches!(b, b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z' | b'_')
}
92
93fn class_set_item_word_kind(item: ®ex_syntax::ast::ClassSetItem) -> WordCharKind {
94 use regex_syntax::ast::{ClassPerlKind, ClassSetItem};
95 use WordCharKind::*;
96 match item {
97 ClassSetItem::Empty(_) => Unknown,
98 ClassSetItem::Literal(l) => {
99 if is_word_byte(l.c as u8) {
100 Word
101 } else {
102 NonWord
103 }
104 }
105 ClassSetItem::Range(r) => {
106 let all_word = (r.start.c as u8..=r.end.c as u8).all(is_word_byte);
107 let all_non = (r.start.c as u8..=r.end.c as u8).all(|b| !is_word_byte(b));
108 if all_word {
109 Word
110 } else if all_non {
111 NonWord
112 } else {
113 Unknown
114 }
115 }
116 ClassSetItem::Perl(p) => match (&p.kind, p.negated) {
117 (ClassPerlKind::Word, false) => Word,
118 (ClassPerlKind::Word, true) => NonWord,
119 (ClassPerlKind::Space, false) => NonWord,
120 (ClassPerlKind::Digit, false) => Word,
121 _ => Unknown,
122 },
123 ClassSetItem::Bracketed(b) => class_bracketed_word_kind(b),
124 ClassSetItem::Union(u) => {
125 let mut kind = Unknown;
126 for item in &u.items {
127 let k = class_set_item_word_kind(item);
128 kind = match (kind, k) {
129 (_, Unknown) => return Unknown,
130 (Unknown, _) => k,
131 (Word, Word) => Word,
132 (NonWord, NonWord) => NonWord,
133 _ => return Unknown,
134 };
135 }
136 kind
137 }
138 _ => Unknown,
139 }
140}
141
142fn class_bracketed_word_kind(c: ®ex_syntax::ast::ClassBracketed) -> WordCharKind {
143 use regex_syntax::ast::{ClassPerlKind, ClassSet, ClassSetItem};
144 use WordCharKind::*;
145 if c.negated {
146 return match &c.kind {
147 ClassSet::Item(ClassSetItem::Perl(p)) if p.kind == ClassPerlKind::Word => {
148 if p.negated {
149 Word
150 } else {
151 NonWord
152 }
153 }
154 _ => Unknown,
155 };
156 }
157 match &c.kind {
158 ClassSet::Item(item) => class_set_item_word_kind(item),
159 ClassSet::BinaryOp(_) => Unknown,
160 }
161}
162
163fn ascii_class_lit(span: Span, c: char) -> regex_syntax::ast::Literal {
164 regex_syntax::ast::Literal {
165 span,
166 kind: regex_syntax::ast::LiteralKind::Verbatim,
167 c,
168 }
169}
170
171fn ascii_class_range(span: Span, a: char, b: char) -> regex_syntax::ast::ClassSetItem {
172 regex_syntax::ast::ClassSetItem::Range(regex_syntax::ast::ClassSetRange {
173 span,
174 start: ascii_class_lit(span, a),
175 end: ascii_class_lit(span, b),
176 })
177}
178
179fn ascii_perl_positive(
180 span: Span,
181 kind: ®ex_syntax::ast::ClassPerlKind,
182) -> regex_syntax::ast::ClassSetItem {
183 use regex_syntax::ast::{ClassPerlKind, ClassSetItem, ClassSetUnion};
184 match kind {
185 ClassPerlKind::Digit => ascii_class_range(span, '0', '9'),
186 ClassPerlKind::Word => ClassSetItem::Union(ClassSetUnion {
187 span,
188 items: vec![
189 ascii_class_range(span, 'a', 'z'),
190 ascii_class_range(span, 'A', 'Z'),
191 ascii_class_range(span, '0', '9'),
192 ClassSetItem::Literal(ascii_class_lit(span, '_')),
193 ],
194 }),
195 ClassPerlKind::Space => ClassSetItem::Union(ClassSetUnion {
196 span,
197 items: ['\t', '\n', '\x0B', '\x0C', '\r', ' ']
198 .into_iter()
199 .map(|c| ClassSetItem::Literal(ascii_class_lit(span, c)))
200 .collect(),
201 }),
202 }
203}
204
205fn ascii_perl_set_item(
206 span: Span,
207 kind: ®ex_syntax::ast::ClassPerlKind,
208 negated: bool,
209) -> regex_syntax::ast::ClassSetItem {
210 use regex_syntax::ast::{ClassBracketed, ClassSet, ClassSetItem};
211 let positive = ascii_perl_positive(span, kind);
212 if negated {
213 ClassSetItem::Bracketed(Box::new(ClassBracketed {
214 span,
215 negated: true,
216 kind: ClassSet::Item(positive),
217 }))
218 } else {
219 positive
220 }
221}
222
223fn rewrite_ascii_perl_set(set: ®ex_syntax::ast::ClassSet) -> regex_syntax::ast::ClassSet {
224 use regex_syntax::ast::{ClassSet, ClassSetBinaryOp};
225 match set {
226 ClassSet::Item(item) => ClassSet::Item(rewrite_ascii_perl_item(item)),
227 ClassSet::BinaryOp(op) => ClassSet::BinaryOp(ClassSetBinaryOp {
228 span: op.span,
229 kind: op.kind.clone(),
230 lhs: Box::new(rewrite_ascii_perl_set(&op.lhs)),
231 rhs: Box::new(rewrite_ascii_perl_set(&op.rhs)),
232 }),
233 }
234}
235
236fn rewrite_ascii_perl_item(
237 item: ®ex_syntax::ast::ClassSetItem,
238) -> regex_syntax::ast::ClassSetItem {
239 use regex_syntax::ast::{ClassBracketed, ClassSetItem, ClassSetUnion};
240 match item {
241 ClassSetItem::Perl(p) => ascii_perl_set_item(p.span, &p.kind, p.negated),
242 ClassSetItem::Union(u) => ClassSetItem::Union(ClassSetUnion {
243 span: u.span,
244 items: u.items.iter().map(rewrite_ascii_perl_item).collect(),
245 }),
246 ClassSetItem::Bracketed(b) => ClassSetItem::Bracketed(Box::new(ClassBracketed {
247 span: b.span,
248 negated: b.negated,
249 kind: rewrite_ascii_perl_set(&b.kind),
250 })),
251 other => other.clone(),
252 }
253}
254
/// A single parsed unit that is either an AST node on its own or, for some
/// variants, a member of a character class (see the `into_*` conversions).
#[derive(Clone, Debug, Eq, PartialEq)]
enum Primitive {
    /// A literal character.
    Literal(Literal),
    /// A zero-width assertion.
    Assertion(ast::Assertion),
    /// The dot primitive; converted with `Ast::dot`.
    Dot(Span),
    /// The "top" primitive; converted with `Ast::top`.
    Top(Span),
    /// A Perl class.
    Perl(ClassPerl),
    /// A Unicode class.
    Unicode(ClassUnicode),
}
264
265impl Primitive {
266 fn span(&self) -> &Span {
267 match *self {
268 Primitive::Literal(ref x) => &x.span,
269 Primitive::Assertion(ref x) => &x.span,
270 Primitive::Dot(ref span) => span,
271 Primitive::Top(ref span) => span,
272 Primitive::Perl(ref x) => &x.span,
273 Primitive::Unicode(ref x) => &x.span,
274 }
275 }
276
277 fn into_ast(self) -> Ast {
278 match self {
279 Primitive::Literal(lit) => Ast::literal(lit),
280 Primitive::Assertion(assert) => Ast::assertion(assert),
281 Primitive::Dot(span) => Ast::dot(span),
282 Primitive::Top(span) => Ast::top(span),
283 Primitive::Perl(cls) => Ast::class_perl(cls),
284 Primitive::Unicode(cls) => Ast::class_unicode(cls),
285 }
286 }
287
288 fn into_class_set_item(self, p: &ResharpParser) -> Result<regex_syntax::ast::ClassSetItem> {
289 use self::Primitive::*;
290 use regex_syntax::ast::ClassSetItem;
291
292 match self {
293 Literal(lit) => Ok(ClassSetItem::Literal(lit)),
294 Perl(cls) => Ok(ClassSetItem::Perl(cls)),
295 Unicode(cls) => Ok(ClassSetItem::Unicode(cls)),
296 x => Err(p.error(*x.span(), ast::ErrorKind::ClassEscapeInvalid)),
297 }
298 }
299
300 fn into_class_literal(self, p: &ResharpParser) -> Result<Literal> {
301 use self::Primitive::*;
302
303 match self {
304 Literal(lit) => Ok(lit),
305 x => Err(p.error(*x.span(), ast::ErrorKind::ClassRangeLiteral)),
306 }
307 }
308}
309
/// A value that is one of two alternatives.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum Either<Left, Right> {
    /// The first alternative.
    Left(Left),
    /// The second alternative.
    Right(Right),
}
315
/// An error produced while parsing a pattern.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct ParseError {
    /// What went wrong.
    pub kind: ErrorKind,
    /// Copy of the pattern that was being parsed when the error was created.
    pattern: String,
    /// Location in the pattern the error refers to.
    pub span: Span,
}
326
327impl std::fmt::Display for ParseError {
328 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
329 write!(f, "{:?}: {:?}", self.kind, self.span)
330 }
331}
332impl std::error::Error for ParseError {}
333
/// Result alias used throughout the parser; the error is always [`ParseError`].
type Result<T> = core::result::Result<T, ParseError>;
335
/// An entry on the parser's group stack (`stack_group`).
#[derive(Clone, Debug)]
enum GroupState {
    /// An opened group that has not been closed yet.
    Group {
        /// The concatenation that was being built before the group opened.
        concat: Concat,
        /// The group itself; its `ast` is filled in when the group is popped.
        group: ast::Group,
        /// Whitespace-insensitive setting in effect before the group opened;
        /// restored when the group is popped.
        ignore_whitespace: bool,
    },
    /// Branches of a `|` alternation collected so far.
    Alternation(ast::Alternation),
    /// Branches of a `&` intersection collected so far.
    Intersection(ast::Intersection),
}
350
/// An entry on the parser's character-class stack (`stack_class`).
#[derive(Clone, Debug)]
enum ClassState {
    /// A bracketed class that has been opened but not closed.
    Open {
        /// The union of the enclosing class that was being built when this
        /// class opened.
        union: regex_syntax::ast::ClassSetUnion,
        /// The class being built; its span end and `kind` are filled in when
        /// it is popped.
        set: regex_syntax::ast::ClassBracketed,
    },
    /// A pending binary set operation whose right-hand side is still being
    /// parsed.
    Op {
        /// The operator.
        kind: regex_syntax::ast::ClassSetBinaryOpKind,
        /// The already-parsed left-hand side.
        lhs: regex_syntax::ast::ClassSet,
    },
}
368
/// Parser state for a single pattern.
///
/// Position and stacks use interior mutability so parsing helpers can take
/// `&self`.
pub struct ResharpParser<'s> {
    /// Cache of already-built Perl classes, keyed by `(negated, kind)`.
    perl_classes: Vec<(bool, regex_syntax::ast::ClassPerlKind, NodeId)>,
    /// Cache for Unicode classes; starts out as the default value.
    unicode_classes: resharp_algebra::UnicodeClassCache,
    /// `regex_syntax` AST-to-HIR translator built from the pattern flags.
    pub translator: regex_syntax::hir::translate::Translator,
    /// The pattern being parsed.
    pub pattern: &'s str,
    /// Current position (byte offset, line, column) in `pattern`.
    pos: Cell<Position>,
    /// Most recently assigned capture index; starts at 0.
    capture_index: Cell<u32>,
    // NOTE(review): always initialised to `false`; usage is outside this section.
    octal: bool,
    // NOTE(review): always initialised to `false`; usage is outside this section.
    empty_min_range: bool,
    /// Whether whitespace and `#` comments are currently skipped.
    ignore_whitespace: Cell<bool>,
    /// Initialised from `PatternFlags::dot_matches_new_line`.
    dot_all: Cell<bool>,
    /// Initialised from `PatternFlags::multiline`.
    multiline: Cell<bool>,
    /// `unicode || full_unicode || ascii_perl_classes` from the flags.
    global_unicode: bool,
    /// Copy of `PatternFlags::full_unicode`.
    global_full_unicode: bool,
    /// Copy of `PatternFlags::ascii_perl_classes`.
    global_ascii_perl: bool,
    /// Copy of `PatternFlags::case_insensitive`.
    global_case_insensitive: bool,
    /// Copy of `PatternFlags::expanded_ast_limit`.
    expanded_ast_limit: u64,
    /// Copy of `PatternFlags::max_list_len`.
    max_list_len: usize,
    /// Copy of `PatternFlags::max_repeat`.
    max_repeat: u32,
    /// Copy of `PatternFlags::max_depth`.
    max_depth: usize,
    /// Comments collected while skipping whitespace.
    comments: RefCell<Vec<ast::Comment>>,
    /// Stack of open groups, alternations and intersections.
    stack_group: RefCell<Vec<GroupState>>,
    /// Stack of open character classes and pending set operations.
    stack_class: RefCell<Vec<ClassState>>,
    /// Capture names seen so far, kept sorted by name.
    capture_names: RefCell<Vec<ast::CaptureName>>,
    /// Reusable string buffer; usage is outside this section.
    scratch: RefCell<String>,
}
396
397fn specialize_err<T>(result: Result<T>, from: ast::ErrorKind, to: ast::ErrorKind) -> Result<T> {
398 result.map_err(|e| {
399 if e.kind == from {
400 ParseError {
401 kind: to,
402 pattern: e.pattern,
403 span: e.span,
404 }
405 } else {
406 e
407 }
408 })
409}
410
/// Returns whether `c` may appear in a capture group name.
///
/// The first character must be `_` or alphabetic; later characters may also
/// be digits, `.`, `[` or `]`.
fn is_capture_char(c: char, first: bool) -> bool {
    match c {
        '_' => true,
        '.' | '[' | ']' => !first,
        _ if first => c.is_alphabetic(),
        _ => c.is_alphanumeric(),
    }
}
418
/// Returns `true` if `c` has special meaning in a pattern and therefore needs
/// a backslash to be matched literally.
pub fn is_meta_character(c: char) -> bool {
    match c {
        // Escape and wildcard-like characters.
        '\\' | '.' | '_' => true,
        // Repetition operators.
        '+' | '*' | '?' => true,
        // Grouping, classes and counted repetition.
        '(' | ')' | '[' | ']' | '{' | '}' => true,
        // Boolean operators.
        '|' | '&' | '~' => true,
        // Anchors, comments and ranges.
        '^' | '$' | '#' | '-' => true,
        _ => false,
    }
}
442
/// Returns a copy of `text` in which every meta character (see
/// `is_meta_character`) is preceded by a backslash.
pub fn escape(text: &str) -> String {
    let mut buf = String::new();
    escape_into(text, &mut buf);
    buf
}
449
450pub fn escape_into(text: &str, buf: &mut String) {
452 buf.reserve(text.len());
453 for c in text.chars() {
454 if is_meta_character(c) {
455 buf.push('\\');
456 }
457 buf.push(c);
458 }
459}
460
461pub fn is_escapeable_character(c: char) -> bool {
462 if is_meta_character(c) {
463 return true;
464 }
465 if !c.is_ascii() {
466 return false;
467 }
468 match c {
469 '0'..='9' | 'A'..='Z' | 'a'..='z' => false,
470 '<' | '>' => false,
471 _ => true,
472 }
473}
474
/// Returns `true` for ASCII hexadecimal digits (`0-9`, `a-f`, `A-F`).
fn is_hex(c: char) -> bool {
    c.is_ascii_hexdigit()
}
478
479fn ensure_lookbehind_at_start(ast: &Ast, at_start: bool) -> core::result::Result<(), Span> {
480 match ast {
481 Ast::Concat(c) => {
482 let mut child_at_start = at_start;
483 for child in &c.asts {
484 ensure_lookbehind_at_start(child, child_at_start)?;
485 if ast_may_consume(child) {
486 child_at_start = false;
487 }
488 }
489 Ok(())
490 }
491 Ast::Alternation(a) => {
492 for child in &a.asts {
493 ensure_lookbehind_at_start(child, at_start)?;
494 }
495 Ok(())
496 }
497 Ast::Intersection(i) => {
498 for child in &i.asts {
499 ensure_lookbehind_at_start(child, at_start)?;
500 }
501 Ok(())
502 }
503 Ast::Complement(c) => ensure_lookbehind_at_start(&c.ast, at_start),
504 Ast::Group(g) => ensure_lookbehind_at_start(&g.ast, at_start),
505 Ast::Repetition(r) => ensure_lookbehind_at_start(&r.ast, at_start),
506 Ast::Lookaround(g) => {
507 match g.kind {
508 LookaroundKind::PositiveLookbehind | LookaroundKind::NegativeLookbehind => {
509 if !at_start {
510 return Err(g.span);
511 }
512 }
513 LookaroundKind::PositiveLookahead | LookaroundKind::NegativeLookahead => {}
514 }
515 ensure_lookbehind_at_start(&g.ast, true)
516 }
517 Ast::Empty(_)
518 | Ast::Flags(_)
519 | Ast::Literal(_)
520 | Ast::Dot(_)
521 | Ast::Top(_)
522 | Ast::Assertion(_)
523 | Ast::ClassUnicode(_)
524 | Ast::ClassPerl(_)
525 | Ast::ClassBracketed(_) => Ok(()),
526 }
527}
528
529fn ast_may_consume(ast: &Ast) -> bool {
530 match ast {
531 Ast::Empty(_) | Ast::Flags(_) | Ast::Assertion(_) | Ast::Lookaround(_) => false,
532 Ast::Literal(_)
533 | Ast::Dot(_)
534 | Ast::Top(_)
535 | Ast::ClassUnicode(_)
536 | Ast::ClassPerl(_)
537 | Ast::ClassBracketed(_) => true,
538 Ast::Group(g) => ast_may_consume(&g.ast),
539 Ast::Repetition(r) => {
540 if !ast_may_consume(&r.ast) {
541 return false;
542 }
543 match r.op.kind {
544 RepetitionKind::ZeroOrOne
545 | RepetitionKind::ZeroOrMore
546 | RepetitionKind::OneOrMore => true,
547 RepetitionKind::Range(ast::RepetitionRange::Exactly(0)) => false,
548 RepetitionKind::Range(ast::RepetitionRange::Bounded(_, 0)) => false,
549 RepetitionKind::Range(_) => true,
550 }
551 }
552 Ast::Alternation(a) => a.asts.iter().any(ast_may_consume),
553 Ast::Intersection(i) => i.asts.iter().any(ast_may_consume),
554 Ast::Complement(_) => true,
555 Ast::Concat(c) => c.asts.iter().any(ast_may_consume),
556 }
557}
558
559impl<'s> ResharpParser<'s> {
560 fn default_translator_builder(&self) -> TranslatorBuilder {
561 let mut trb = TranslatorBuilder::new();
562 trb.unicode(self.global_unicode);
563 trb.utf8(false);
564 trb.case_insensitive(self.global_case_insensitive);
565 trb
566 }
567
568 pub fn new(pattern: &'s str) -> Self {
569 Self::with_flags(pattern, &PatternFlags::default())
570 }
571
572 pub fn with_flags(pattern: &'s str, flags: &PatternFlags) -> Self {
573 let mut trb = TranslatorBuilder::new();
574 trb.unicode(flags.unicode);
575 trb.utf8(false);
576 trb.case_insensitive(flags.case_insensitive);
577 Self {
578 translator: trb.build(),
579 pattern,
580 perl_classes: vec![],
581 unicode_classes: resharp_algebra::UnicodeClassCache::default(),
582 pos: Cell::new(Position::new(0, 0, 0)),
583 capture_index: Cell::new(0),
584 octal: false,
585 empty_min_range: false,
586 ignore_whitespace: Cell::new(flags.ignore_whitespace),
587 dot_all: Cell::new(flags.dot_matches_new_line),
588 multiline: Cell::new(flags.multiline),
589 global_unicode: flags.unicode || flags.full_unicode || flags.ascii_perl_classes,
590 global_full_unicode: flags.full_unicode,
591 global_ascii_perl: flags.ascii_perl_classes,
592 global_case_insensitive: flags.case_insensitive,
593 expanded_ast_limit: flags.expanded_ast_limit,
594 max_list_len: flags.max_list_len,
595 max_repeat: flags.max_repeat,
596 max_depth: flags.max_depth,
597 comments: RefCell::new(vec![]),
598 stack_group: RefCell::new(vec![]),
599 stack_class: RefCell::new(vec![]),
600 capture_names: RefCell::new(vec![]),
601 scratch: RefCell::new(String::new()),
602 }
603 }
604
    /// Returns `self`; the other helpers reach the parser state through this
    /// accessor.
    fn parser(&'_ self) -> &'_ ResharpParser<'_> {
        self
    }
608
    /// Returns the pattern being parsed.
    fn pattern(&self) -> &str {
        self.pattern
    }
612
613 fn error(&self, span: Span, kind: ast::ErrorKind) -> ParseError {
614 ParseError {
615 kind,
616 pattern: self.pattern().to_string(),
617 span,
618 }
619 }
620
621 fn unsupported_error(&self, _: regex_syntax::hir::Error) -> ParseError {
622 self.error(
623 Span::splat(self.pos()),
624 ast::ErrorKind::UnsupportedResharpRegex,
625 )
626 }
627
628 fn offset(&self) -> usize {
629 self.parser().pos.get().offset
630 }
631
632 fn line(&self) -> usize {
633 self.parser().pos.get().line
634 }
635
636 fn column(&self) -> usize {
637 self.parser().pos.get().column
638 }
639
640 fn next_capture_index(&self, span: Span) -> Result<u32> {
641 let current = self.parser().capture_index.get();
642 let i = current
643 .checked_add(1)
644 .ok_or_else(|| self.error(span, ast::ErrorKind::CaptureLimitExceeded))?;
645 self.parser().capture_index.set(i);
646 Ok(i)
647 }
648
649 fn add_capture_name(&self, cap: &ast::CaptureName) -> Result<()> {
650 let mut names = self.parser().capture_names.borrow_mut();
651 match names.binary_search_by_key(&cap.name.as_str(), |c| c.name.as_str()) {
652 Err(i) => {
653 names.insert(i, cap.clone());
654 Ok(())
655 }
656 Ok(i) => Err(self.error(
657 cap.span,
658 ast::ErrorKind::GroupNameDuplicate {
659 original: names[i].span,
660 },
661 )),
662 }
663 }
664
    /// Whether whitespace-insensitive mode is currently active.
    fn ignore_whitespace(&self) -> bool {
        self.parser().ignore_whitespace.get()
    }
668
    /// Returns the character at the current position.
    ///
    /// # Panics
    /// Panics (via `char_at`) when the parser is at the end of the pattern.
    fn char(&self) -> char {
        self.char_at(self.offset())
    }
672
673 fn char_at(&self, i: usize) -> char {
674 self.pattern()[i..]
675 .chars()
676 .next()
677 .unwrap_or_else(|| panic!("expected char at offset {}", i))
678 }
679
680 fn bump(&self) -> bool {
681 if self.is_eof() {
682 return false;
683 }
684 let Position {
685 mut offset,
686 mut line,
687 mut column,
688 } = self.pos();
689 if self.char() == '\n' {
690 line = line.checked_add(1).unwrap();
691 column = 1;
692 } else {
693 column = column.checked_add(1).unwrap();
694 }
695 offset += self.char().len_utf8();
696 self.parser().pos.set(Position {
697 offset,
698 line,
699 column,
700 });
701 self.pattern()[self.offset()..].chars().next().is_some()
702 }
703
704 fn bump_if(&self, prefix: &str) -> bool {
705 if self.pattern()[self.offset()..].starts_with(prefix) {
706 for _ in 0..prefix.chars().count() {
707 self.bump();
708 }
709 true
710 } else {
711 false
712 }
713 }
714
715 fn is_lookaround_prefix(&self) -> Option<(bool, bool)> {
716 if self.bump_if("?=") {
717 return Some((true, true));
718 }
719 if self.bump_if("?!") {
720 return Some((true, false));
721 }
722 if self.bump_if("?<=") {
723 return Some((false, true));
724 }
725 if self.bump_if("?<!") {
726 return Some((false, false));
727 }
728 None
729 }
730
731 fn bump_and_bump_space(&self) -> bool {
732 if !self.bump() {
733 return false;
734 }
735 self.bump_space();
736 !self.is_eof()
737 }
738
739 fn bump_space(&self) {
740 if !self.ignore_whitespace() {
741 return;
742 }
743 while !self.is_eof() {
744 if self.char().is_whitespace() {
745 self.bump();
746 } else if self.char() == '#' {
747 let start = self.pos();
748 let mut comment_text = String::new();
749 self.bump();
750 while !self.is_eof() {
751 let c = self.char();
752 self.bump();
753 if c == '\n' {
754 break;
755 }
756 comment_text.push(c);
757 }
758 let comment = ast::Comment {
759 span: Span::new(start, self.pos()),
760 comment: comment_text,
761 };
762 self.parser().comments.borrow_mut().push(comment);
763 } else {
764 break;
765 }
766 }
767 }
768
769 fn peek(&self) -> Option<char> {
770 if self.is_eof() {
771 return None;
772 }
773 self.pattern()[self.offset() + self.char().len_utf8()..]
774 .chars()
775 .next()
776 }
777
778 fn peek_space(&self) -> Option<char> {
781 if !self.ignore_whitespace() {
782 return self.peek();
783 }
784 if self.is_eof() {
785 return None;
786 }
787 let mut start = self.offset() + self.char().len_utf8();
788 let mut in_comment = false;
789 for (i, c) in self.pattern()[start..].char_indices() {
790 if c.is_whitespace() {
791 continue;
792 } else if !in_comment && c == '#' {
793 in_comment = true;
794 } else if in_comment && c == '\n' {
795 in_comment = false;
796 } else {
797 start += i;
798 break;
799 }
800 }
801 self.pattern()[start..].chars().next()
802 }
803
804 fn is_eof(&self) -> bool {
805 self.offset() == self.pattern().len()
806 }
807
    /// Returns the current position (offset, line, column).
    fn pos(&self) -> Position {
        self.parser().pos.get()
    }
811
    /// Returns a zero-width span at the current position.
    fn span(&self) -> Span {
        Span::splat(self.pos())
    }
815
816 fn span_char(&self) -> Span {
817 let mut next = Position {
818 offset: self.offset().checked_add(self.char().len_utf8()).unwrap(),
819 line: self.line(),
820 column: self.column().checked_add(1).unwrap(),
821 };
822 if self.char() == '\n' {
823 next.line += 1;
824 next.column = 1;
825 }
826 Span::new(self.pos(), next)
827 }
828
829 #[inline(never)]
830 fn push_alternate(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
831 assert_eq!(self.char(), '|');
832 concat.span.end = self.pos();
833 self.push_or_add_alternation(concat);
834 self.bump();
835 Ok(ast::Concat {
836 span: self.span(),
837 asts: vec![],
838 })
839 }
840
841 fn push_or_add_alternation(&self, concat: Concat) {
842 use self::GroupState::*;
843
844 let mut stack = self.parser().stack_group.borrow_mut();
845 if let Some(&mut Alternation(ref mut alts)) = stack.last_mut() {
846 alts.asts.push(concat.into_ast());
847 return;
848 }
849 stack.push(Alternation(ast::Alternation {
850 span: Span::new(concat.span.start, self.pos()),
851 asts: vec![concat.into_ast()],
852 }));
853 }
854
855 #[inline(never)]
856 fn push_intersect(&self, mut concat: Concat) -> Result<Concat> {
857 assert_eq!(self.char(), '&');
858 concat.span.end = self.pos();
859 self.push_or_add_intersect(concat);
860 self.bump();
861 Ok(Concat {
862 span: self.span(),
863 asts: vec![],
864 })
865 }
866
867 fn push_or_add_intersect(&self, concat: Concat) {
868 use self::GroupState::*;
869
870 let mut stack = self.parser().stack_group.borrow_mut();
871 if let Some(&mut Intersection(ref mut alts)) = stack.last_mut() {
872 alts.asts.push(concat.into_ast());
873 return;
874 }
875 stack.push(Intersection(ast::Intersection {
876 span: Span::new(concat.span.start, self.pos()),
877 asts: vec![concat.into_ast()],
878 }));
879 }
880
881 #[inline(never)]
882 fn push_group(&self, mut concat: Concat) -> Result<Concat> {
883 assert_eq!(self.char(), '(');
884 match self.parse_group()? {
885 Either::Left(set) => {
886 let ignore = set.flags.flag_state(ast::Flag::IgnoreWhitespace);
887 if let Some(v) = ignore {
888 self.parser().ignore_whitespace.set(v);
889 }
890
891 concat.asts.push(Ast::flags(set));
892 Ok(concat)
893 }
894 Either::Right(group) => {
895 let old_ignore_whitespace = self.ignore_whitespace();
896 let new_ignore_whitespace = group
897 .flags()
898 .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
899 .unwrap_or(old_ignore_whitespace);
900 self.parser()
901 .stack_group
902 .borrow_mut()
903 .push(GroupState::Group {
904 concat,
905 group,
906 ignore_whitespace: old_ignore_whitespace,
907 });
908 self.parser().ignore_whitespace.set(new_ignore_whitespace);
909 Ok(Concat {
910 span: self.span(),
911 asts: vec![],
912 })
913 }
914 }
915 }
916
917 #[inline(never)]
918 fn push_compl_group(&self, concat: Concat) -> Result<Concat> {
919 assert_eq!(self.char(), '~');
920 self.bump();
921 if self.is_eof() || self.char() != '(' {
922 return Err(self.error(self.span(), ast::ErrorKind::ComplementGroupExpected));
923 }
924 let open_span = self.span_char();
925 self.bump();
926 let group = ast::Group {
927 span: open_span,
928 kind: ast::GroupKind::Complement,
929 ast: Box::new(Ast::empty(self.span())),
930 };
931
932 let old_ignore_whitespace = self.ignore_whitespace();
933 let new_ignore_whitespace = group
934 .flags()
935 .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
936 .unwrap_or(old_ignore_whitespace);
937 self.parser()
938 .stack_group
939 .borrow_mut()
940 .push(GroupState::Group {
941 concat,
942 group,
943 ignore_whitespace: old_ignore_whitespace,
944 });
945 self.parser().ignore_whitespace.set(new_ignore_whitespace);
946 Ok(Concat {
947 span: self.span(),
948 asts: vec![],
949 })
950 }
951
    /// Handles `)`: closes the innermost open group.
    ///
    /// `group_concat` is the concatenation parsed since the group (or its
    /// last `|` / `&`) began. The finished group is appended to the
    /// concatenation that was active before the group opened, and that
    /// concatenation is returned.
    ///
    /// # Errors
    /// `GroupUnopened` if there is no matching open group on the stack.
    ///
    /// # Panics
    /// Panics if the current character is not `)`.
    #[inline(never)]
    fn pop_group(&self, mut group_concat: Concat) -> Result<Concat> {
        use self::GroupState::*;
        assert_eq!(self.char(), ')');
        let mut stack = self.parser().stack_group.borrow_mut();
        let topstack = stack.pop();

        // The top of the stack is either the group itself, or a pending
        // alternation/intersection that must sit directly on top of the group.
        let (mut prior_concat, mut group, ignore_whitespace, alt) = match topstack {
            Some(Group {
                concat,
                group,
                ignore_whitespace,
            }) => (concat, group, ignore_whitespace, None),
            Some(Alternation(alt)) => match stack.pop() {
                Some(Group {
                    concat,
                    group,
                    ignore_whitespace,
                }) => (
                    concat,
                    group,
                    ignore_whitespace,
                    Some(Either::Left::<ast::Alternation, ast::Intersection>(alt)),
                ),
                None | Some(Alternation(_)) | Some(Intersection(_)) => {
                    return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
                }
            },
            Some(Intersection(int)) => match stack.pop() {
                Some(Group {
                    concat,
                    group,
                    ignore_whitespace,
                }) => (
                    concat,
                    group,
                    ignore_whitespace,
                    Some(Either::Right::<ast::Alternation, ast::Intersection>(int)),
                ),
                None | Some(Alternation(_)) | Some(Intersection(_)) => {
                    return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
                }
            },

            None => {
                return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
            }
        };
        // Restore the whitespace mode that was active before the group.
        self.parser().ignore_whitespace.set(ignore_whitespace);
        // The body ends before `)`, the group itself after it.
        group_concat.span.end = self.pos();
        self.bump();
        group.span.end = self.pos();
        match alt {
            Some(Either::Left(mut alt)) => {
                alt.span.end = group_concat.span.end;
                alt.asts.push(group_concat.into_ast());
                group.ast = Box::new(alt.into_ast());
            }
            Some(Either::Right(mut int)) => {
                int.span.end = group_concat.span.end;
                int.asts.push(group_concat.into_ast());
                group.ast = Box::new(int.into_ast());
            }
            None => {
                group.ast = Box::new(group_concat.into_ast());
            }
        }

        if group.kind == GroupKind::Complement {
            // A complement group becomes a `Complement` node, not a `Group`.
            // NOTE(review): the node's span is a zero-width span just after
            // `)`, not `group.span` — confirm this is intended.
            let complement = ast::Complement {
                span: self.span(),
                ast: group.ast,
            };
            prior_concat.asts.push(Ast::complement(complement));
        } else {
            prior_concat.asts.push(Ast::group(group));
        }
        Ok(prior_concat)
    }
1033
1034 #[inline(never)]
1035 fn pop_group_end(&self, mut concat: ast::Concat) -> Result<Ast> {
1036 concat.span.end = self.pos();
1037 let mut stack = self.parser().stack_group.borrow_mut();
1038 let ast = match stack.pop() {
1039 None => Ok(concat.into_ast()),
1040 Some(GroupState::Alternation(mut alt)) => {
1041 alt.span.end = self.pos();
1042 alt.asts.push(concat.into_ast());
1043 Ok(Ast::alternation(alt))
1044 }
1045 Some(GroupState::Intersection(mut int)) => {
1046 int.span.end = self.pos();
1047 int.asts.push(concat.into_ast());
1048
1049 Ok(Ast::intersection(int))
1050 }
1051 Some(GroupState::Group { group, .. }) => {
1052 return Err(self.error(group.span, ast::ErrorKind::GroupUnclosed));
1053 }
1054 };
1055 match stack.pop() {
1057 None => ast,
1058 Some(GroupState::Alternation(alt)) => {
1059 Err(self.error(alt.span, ast::ErrorKind::UnsupportedResharpRegex))
1060 }
1061 Some(GroupState::Intersection(int)) => {
1062 Err(self.error(int.span, ast::ErrorKind::UnsupportedResharpRegex))
1063 }
1064 Some(GroupState::Group { group, .. }) => {
1065 Err(self.error(group.span, ast::ErrorKind::GroupUnclosed))
1066 }
1067 }
1068 }
1069
1070 #[inline(never)]
1071 fn push_class_open(
1072 &self,
1073 parent_union: regex_syntax::ast::ClassSetUnion,
1074 ) -> Result<regex_syntax::ast::ClassSetUnion> {
1075 assert_eq!(self.char(), '[');
1076
1077 let (nested_set, nested_union) = self.parse_set_class_open()?;
1078 self.parser()
1079 .stack_class
1080 .borrow_mut()
1081 .push(ClassState::Open {
1082 union: parent_union,
1083 set: nested_set,
1084 });
1085 Ok(nested_union)
1086 }
1087
1088 #[inline(never)]
1089 fn pop_class(
1090 &self,
1091 nested_union: regex_syntax::ast::ClassSetUnion,
1092 ) -> Result<Either<regex_syntax::ast::ClassSetUnion, regex_syntax::ast::ClassBracketed>> {
1093 assert_eq!(self.char(), ']');
1094
1095 let item = regex_syntax::ast::ClassSet::Item(nested_union.into_item());
1096 let prevset = self.pop_class_op(item);
1097 let mut stack = self.parser().stack_class.borrow_mut();
1098 match stack.pop() {
1099 None => panic!("unexpected empty character class stack"),
1100 Some(ClassState::Op { .. }) => panic!("unexpected ClassState::Op"),
1101 Some(ClassState::Open { mut union, mut set }) => {
1102 self.bump();
1103 set.span.end = self.pos();
1104 set.kind = prevset;
1105 if stack.is_empty() {
1106 Ok(Either::Right(set))
1107 } else {
1108 union.push(regex_syntax::ast::ClassSetItem::Bracketed(Box::new(set)));
1109 Ok(Either::Left(union))
1110 }
1111 }
1112 }
1113 }
1114
1115 #[inline(never)]
1116 fn unclosed_class_error(&self) -> ParseError {
1117 for state in self.parser().stack_class.borrow().iter().rev() {
1118 if let ClassState::Open { ref set, .. } = *state {
1119 return self.error(set.span, ast::ErrorKind::ClassUnclosed);
1120 }
1121 }
1122 panic!("no open character class found")
1123 }
1124
1125 #[inline(never)]
1126 fn push_class_op(
1127 &self,
1128 next_kind: regex_syntax::ast::ClassSetBinaryOpKind,
1129 next_union: regex_syntax::ast::ClassSetUnion,
1130 ) -> regex_syntax::ast::ClassSetUnion {
1131 let item = regex_syntax::ast::ClassSet::Item(next_union.into_item());
1132 let new_lhs = self.pop_class_op(item);
1133 self.parser().stack_class.borrow_mut().push(ClassState::Op {
1134 kind: next_kind,
1135 lhs: new_lhs,
1136 });
1137 regex_syntax::ast::ClassSetUnion {
1138 span: self.span(),
1139 items: vec![],
1140 }
1141 }
1142
1143 #[inline(never)]
1144 fn pop_class_op(&self, rhs: regex_syntax::ast::ClassSet) -> regex_syntax::ast::ClassSet {
1145 let mut stack = self.parser().stack_class.borrow_mut();
1146 let (kind, lhs) = match stack.pop() {
1147 Some(ClassState::Op { kind, lhs }) => (kind, lhs),
1148 Some(state @ ClassState::Open { .. }) => {
1149 stack.push(state);
1150 return rhs;
1151 }
1152 None => unreachable!(),
1153 };
1154 let span = Span::new(lhs.span().start, rhs.span().end);
1155 regex_syntax::ast::ClassSet::BinaryOp(regex_syntax::ast::ClassSetBinaryOp {
1156 span,
1157 kind,
1158 lhs: Box::new(lhs),
1159 rhs: Box::new(rhs),
1160 })
1161 }
1162
1163 fn hir_to_node_id(&self, hir: &hir::Hir, tb: &mut TB<'s>) -> Result<NodeId> {
1164 match hir.kind() {
1165 hir::HirKind::Empty => Ok(NodeId::EPS),
1166 hir::HirKind::Literal(l) => {
1167 if l.0.len() == 1 {
1168 let node = tb.mk_u8(l.0[0]);
1169 Ok(node)
1170 } else {
1171 let ws: Vec<_> = l.0.iter().map(|l| tb.mk_u8(*l)).collect();
1172 let conc = tb.mk_concats(ws.iter().copied());
1173 Ok(conc)
1174 }
1175 }
1176 hir::HirKind::Class(class) => match class {
1177 hir::Class::Unicode(class_unicode) => {
1178 let ranges = class_unicode.ranges();
1179 if ranges.len() == 1
1180 && ranges[0].start() == '\u{0}'
1181 && ranges[0].end() == '\u{10FFFF}'
1182 {
1183 return Ok(tb.mk_range_u8(0, 255));
1184 }
1185 let mut nodes = Vec::new();
1186 for range in ranges {
1187 for seq in Utf8Sequences::new(range.start(), range.end()) {
1188 let sl = seq.as_slice();
1189 let bytes: Vec<_> = sl.iter().map(|s| (s.start, s.end)).collect();
1190 let node = match bytes.len() {
1191 1 => tb.mk_range_u8(bytes[0].0, bytes[0].1),
1192 n => {
1193 let last = tb.mk_range_u8(bytes[n - 1].0, bytes[n - 1].1);
1194 let mut conc = last;
1195 for i in (0..n - 1).rev() {
1196 let b = tb.mk_range_u8(bytes[i].0, bytes[i].1);
1197 conc = tb.mk_concat(b, conc);
1198 }
1199 conc
1200 }
1201 };
1202 nodes.push(node);
1203 }
1204 }
1205 let merged = tb.mk_unions(nodes.into_iter());
1206 Ok(merged)
1207 }
1208 hir::Class::Bytes(class_bytes) => {
1209 let ranges = class_bytes.ranges();
1210 let mut result = NodeId::BOT;
1211 for range in ranges {
1212 let start = range.start();
1213 let end = range.end();
1214 let node = tb.mk_range_u8(start, end);
1215 result = tb.mk_union(result, node);
1216 }
1217 Ok(result)
1218 }
1219 },
1220 hir::HirKind::Look(_) => Err(self.error(
1221 Span::splat(self.pos()),
1222 ast::ErrorKind::UnsupportedResharpRegex,
1223 )),
1224 hir::HirKind::Repetition(_) => Err(self.error(
1225 Span::splat(self.pos()),
1226 ast::ErrorKind::UnsupportedResharpRegex,
1227 )),
1228 hir::HirKind::Capture(_) => Err(self.error(
1229 Span::splat(self.pos()),
1230 ast::ErrorKind::UnsupportedResharpRegex,
1231 )),
1232 hir::HirKind::Concat(body) => {
1233 let mut result = NodeId::EPS;
1234 for child in body {
1235 let node = self.hir_to_node_id(child, tb)?;
1236 result = tb.mk_concat(result, node);
1237 }
1238 Ok(result)
1239 }
1240 hir::HirKind::Alternation(_) => Err(self.error(
1241 Span::splat(self.pos()),
1242 ast::ErrorKind::UnsupportedResharpRegex,
1243 )),
1244 }
1245 }
1246
1247 fn translate_ast_to_hir(
1248 &mut self,
1249 orig_ast: ®ex_syntax::ast::Ast,
1250 tb: &mut TB<'s>,
1251 ) -> Result<NodeId> {
1252 match self.translator.translate("", orig_ast) {
1253 Err(_) => Err(self.error(self.span(), ast::ErrorKind::UnicodeClassInvalid)),
1254 Ok(hir) => self.hir_to_node_id(&hir, tb),
1255 }
1256 }
1257
1258 fn translator_to_node_id(
1259 &mut self,
1260 orig_ast: ®ex_syntax::ast::Ast,
1261 translator: &mut Option<Translator>,
1262 tb: &mut TB<'s>,
1263 ) -> Result<NodeId> {
1264 match translator {
1265 Some(tr) => {
1266 let hir = tr
1267 .translate("", orig_ast)
1268 .map_err(|e| self.unsupported_error(e))?;
1269 self.hir_to_node_id(&hir, tb)
1270 }
1271 None => self.translate_ast_to_hir(orig_ast, tb),
1272 }
1273 }
1274
1275 fn get_class(
1276 &mut self,
1277 negated: bool,
1278 kind: regex_syntax::ast::ClassPerlKind,
1279 tb: &mut TB<'s>,
1280 ) -> Result<NodeId> {
1281 let w = self
1282 .perl_classes
1283 .iter()
1284 .find(|(c_neg, c_kind, _)| *c_kind == kind && *c_neg == negated);
1285 match w {
1286 Some((_, _, value)) => Ok(*value),
1287 None => {
1288 let translated = if self.global_ascii_perl {
1289 let pos = match kind {
1290 regex_syntax::ast::ClassPerlKind::Word => {
1291 let az = tb.mk_range_u8(b'a', b'z');
1292 let big = tb.mk_range_u8(b'A', b'Z');
1293 let dig = tb.mk_range_u8(b'0', b'9');
1294 let us = tb.mk_u8(b'_');
1295 tb.mk_unions([az, big, dig, us].into_iter())
1296 }
1297 regex_syntax::ast::ClassPerlKind::Digit => tb.mk_range_u8(b'0', b'9'),
1298 regex_syntax::ast::ClassPerlKind::Space => {
1299 let sp = tb.mk_u8(b' ');
1300 let tab = tb.mk_u8(b'\t');
1301 let nl = tb.mk_u8(b'\n');
1302 let cr = tb.mk_u8(b'\r');
1303 let ff = tb.mk_u8(0x0C);
1304 let vt = tb.mk_u8(0x0B);
1305 tb.mk_unions([sp, tab, nl, cr, ff, vt].into_iter())
1306 }
1307 };
1308 if negated {
1309 resharp_algebra::neg_class(tb, pos)
1310 } else {
1311 pos
1312 }
1313 } else if self.global_unicode {
1314 match kind {
1315 regex_syntax::ast::ClassPerlKind::Word => {
1316 if self.global_full_unicode {
1317 self.unicode_classes.ensure_word_full(tb);
1318 } else {
1319 self.unicode_classes.ensure_word(tb);
1320 }
1321 if negated {
1322 self.unicode_classes.non_word
1323 } else {
1324 self.unicode_classes.word
1325 }
1326 }
1327 regex_syntax::ast::ClassPerlKind::Digit => {
1328 if self.global_full_unicode {
1329 self.unicode_classes.ensure_digit_full(tb);
1330 } else {
1331 self.unicode_classes.ensure_digit(tb);
1332 }
1333 if negated {
1334 self.unicode_classes.non_digit
1335 } else {
1336 self.unicode_classes.digit
1337 }
1338 }
1339 regex_syntax::ast::ClassPerlKind::Space => {
1340 if self.global_full_unicode {
1341 self.unicode_classes.ensure_space_full(tb);
1342 } else {
1343 self.unicode_classes.ensure_space(tb);
1344 }
1345 if negated {
1346 self.unicode_classes.non_space
1347 } else {
1348 self.unicode_classes.space
1349 }
1350 }
1351 }
1352 } else {
1353 let pos = match kind {
1354 regex_syntax::ast::ClassPerlKind::Word => {
1355 let az = tb.mk_range_u8(b'a', b'z');
1356 let big = tb.mk_range_u8(b'A', b'Z');
1357 let dig = tb.mk_range_u8(b'0', b'9');
1358 let us = tb.mk_u8(b'_');
1359 tb.mk_unions([az, big, dig, us].into_iter())
1360 }
1361 regex_syntax::ast::ClassPerlKind::Digit => tb.mk_range_u8(b'0', b'9'),
1362 regex_syntax::ast::ClassPerlKind::Space => {
1363 let sp = tb.mk_u8(b' ');
1364 let tab = tb.mk_u8(b'\t');
1365 let nl = tb.mk_u8(b'\n');
1366 let cr = tb.mk_u8(b'\r');
1367 let ff = tb.mk_u8(0x0C);
1368 let vt = tb.mk_u8(0x0B);
1369 tb.mk_unions([sp, tab, nl, cr, ff, vt].into_iter())
1370 }
1371 };
1372 if negated {
1373 resharp_algebra::neg_class(tb, pos)
1379 } else {
1380 pos
1381 }
1382 };
1383 self.perl_classes.push((negated, kind, translated));
1384 Ok(translated)
1385 }
1386 }
1387 }
1388
1389 fn word_char_kind(ast: &Ast, left: bool) -> WordCharKind {
1390 use WordCharKind::*;
1391 match ast {
1392 Ast::Literal(lit) => {
1393 if is_word_byte(lit.c as u8) {
1394 Word
1395 } else {
1396 NonWord
1397 }
1398 }
1399 Ast::ClassPerl(c) => match (&c.kind, c.negated) {
1400 (®ex_syntax::ast::ClassPerlKind::Word, false) => Word,
1401 (®ex_syntax::ast::ClassPerlKind::Word, true) => NonWord,
1402 (®ex_syntax::ast::ClassPerlKind::Space, false) => NonWord,
1403 (®ex_syntax::ast::ClassPerlKind::Space, true) => Unknown,
1404 (®ex_syntax::ast::ClassPerlKind::Digit, false) => Word,
1405 (®ex_syntax::ast::ClassPerlKind::Digit, true) => Unknown,
1406 },
1407 Ast::ClassBracketed(c) => class_bracketed_word_kind(c),
1408 Ast::Dot(_) | Ast::Top(_) => Unknown,
1409 Ast::Group(g) => Self::word_char_kind(&g.ast, left),
1410 Ast::Concat(c) if !c.asts.is_empty() => {
1411 let edge = if left { c.asts.len() - 1 } else { 0 };
1412 let kind = Self::word_char_kind(&c.asts[edge], left);
1413 match kind {
1414 MaybeWord => {
1415 let dir: isize = if left { -1 } else { 1 };
1416 match Self::concat_neighbor_kind(&c.asts, edge, dir) {
1417 Word => Word,
1418 _ => MaybeWord,
1419 }
1420 }
1421 MaybeNonWord => {
1422 let dir: isize = if left { -1 } else { 1 };
1423 match Self::concat_neighbor_kind(&c.asts, edge, dir) {
1424 NonWord => NonWord,
1425 _ => MaybeNonWord,
1426 }
1427 }
1428 other => other,
1429 }
1430 }
1431 Ast::Alternation(alt) if !alt.asts.is_empty() => {
1432 let first = Self::word_char_kind(&alt.asts[0], left);
1433 if alt.asts[1..]
1434 .iter()
1435 .all(|a| Self::word_char_kind(a, left) == first)
1436 {
1437 first
1438 } else {
1439 Unknown
1440 }
1441 }
1442 Ast::Repetition(r) => {
1443 let inner = Self::word_char_kind(&r.ast, left);
1444 let nullable = matches!(
1445 &r.op.kind,
1446 ast::RepetitionKind::ZeroOrMore
1447 | ast::RepetitionKind::ZeroOrOne
1448 | ast::RepetitionKind::Range(
1449 ast::RepetitionRange::Bounded(0, _)
1450 | ast::RepetitionRange::Exactly(0)
1451 )
1452 );
1453 if nullable {
1454 match inner {
1455 Word => MaybeWord,
1456 NonWord => MaybeNonWord,
1457 _ => Unknown,
1458 }
1459 } else {
1460 inner
1461 }
1462 }
1463 Ast::Lookaround(la) => match la.kind {
1464 ast::LookaroundKind::PositiveLookahead
1465 | ast::LookaroundKind::PositiveLookbehind => Self::word_char_kind(&la.ast, left),
1466 _ => Unknown,
1467 },
1468 Ast::Assertion(a) => match (&a.kind, left) {
1469 (ast::AssertionKind::EndText, false) => NonWord,
1470 (ast::AssertionKind::StartText, true) => NonWord,
1471 _ => Unknown,
1472 },
1473 _ => Unknown,
1474 }
1475 }
1476
1477 fn edge_class_ast(ast: &Ast, left: bool) -> Option<&Ast> {
1479 match ast {
1480 Ast::Literal(_)
1481 | Ast::ClassPerl(_)
1482 | Ast::ClassBracketed(_)
1483 | Ast::ClassUnicode(_)
1484 | Ast::Dot(_)
1485 | Ast::Top(_) => Some(ast),
1486 Ast::Group(g) => Self::edge_class_ast(&g.ast, left),
1487 Ast::Concat(c) if !c.asts.is_empty() => {
1488 Self::edge_class_ast(&c.asts[if left { c.asts.len() - 1 } else { 0 }], left)
1489 }
1490 Ast::Repetition(r) => {
1491 let nullable = matches!(
1492 &r.op.kind,
1493 ast::RepetitionKind::ZeroOrMore
1494 | ast::RepetitionKind::ZeroOrOne
1495 | ast::RepetitionKind::Range(
1496 ast::RepetitionRange::Bounded(0, _)
1497 | ast::RepetitionRange::Exactly(0)
1498 )
1499 );
1500 if nullable {
1501 None
1502 } else {
1503 Self::edge_class_ast(&r.ast, left)
1504 }
1505 }
1506 _ => None,
1507 }
1508 }
1509
1510 fn resolve_word_kind(
1511 &mut self,
1512 asts: &[Ast],
1513 idx: usize,
1514 dir: isize,
1515 translator: &mut Option<Translator>,
1516 tb: &mut TB<'s>,
1517 word_id: NodeId,
1518 not_word_id: NodeId,
1519 ) -> Result<WordCharKind> {
1520 use WordCharKind::*;
1521 let fast = Self::concat_neighbor_kind(asts, idx, dir);
1522 if fast != Unknown {
1523 return Ok(fast);
1524 }
1525 let neighbor_idx = (idx as isize + dir) as usize;
1526 let node = if let Some(edge) = Self::edge_class_ast(&asts[neighbor_idx], dir < 0) {
1527 self.ast_to_node_id(edge, translator, tb)?
1528 } else if dir > 0 {
1529 let mut bodies: Vec<NodeId> = vec![];
1530 let mut j = neighbor_idx;
1531 while j < asts.len() {
1532 match &asts[j] {
1533 Ast::Lookaround(la) => {
1534 let kind = la.kind.clone();
1535 let lookbehind = matches!(
1536 kind,
1537 ast::LookaroundKind::PositiveLookbehind
1538 | ast::LookaroundKind::NegativeLookbehind
1539 );
1540 if lookbehind {
1541 j += 1;
1542 continue;
1543 }
1544 let body = self.ast_to_node_id(&la.ast, translator, tb)?;
1545 let body = tb.try_elim_lookarounds(body).ok_or_else(|| {
1546 self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex)
1547 })?;
1548 let body_ts = tb.mk_concat(body, NodeId::TS);
1549 let constraint = match kind {
1550 ast::LookaroundKind::PositiveLookahead => body_ts,
1551 ast::LookaroundKind::NegativeLookahead => tb.mk_compl(body_ts),
1552 _ => unreachable!(),
1553 };
1554 bodies.push(constraint);
1555 j += 1;
1556 }
1557 other => {
1558 let n = self.ast_to_node_id(other, translator, tb)?;
1559 let n = tb.try_elim_lookarounds(n).ok_or_else(|| {
1560 self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex)
1561 })?;
1562 bodies.push(tb.mk_concat(n, NodeId::TS));
1563 break;
1564 }
1565 }
1566 }
1567 if bodies.is_empty() {
1568 return Ok(Unknown);
1569 }
1570 let combined = tb.mk_inters(bodies.into_iter());
1571 let word_prefix = tb.mk_concat(word_id, NodeId::TS);
1572 let non_word_prefix = tb.mk_concat(not_word_id, NodeId::TS);
1573 return if tb.subsumes(word_prefix, combined) == Some(true) {
1574 Ok(Word)
1575 } else if tb.subsumes(non_word_prefix, combined) == Some(true) {
1576 Ok(NonWord)
1577 } else {
1578 Ok(Unknown)
1579 };
1580 } else {
1581 let neighbor_node = self.ast_to_node_id(&asts[neighbor_idx], translator, tb)?;
1582 let mut neighbor_node = tb
1583 .try_elim_lookarounds(neighbor_node)
1584 .ok_or_else(|| self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))?;
1585 neighbor_node = tb.reverse(neighbor_node).or_else(|_| {
1586 Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))
1587 })?;
1588 let word_prefix = tb.mk_concat(word_id, NodeId::TS);
1589 let non_word_prefix = tb.mk_concat(not_word_id, NodeId::TS);
1590 return if tb.subsumes(word_prefix, neighbor_node) == Some(true) {
1591 Ok(Word)
1592 } else if tb.subsumes(non_word_prefix, neighbor_node) == Some(true) {
1593 Ok(NonWord)
1594 } else {
1595 Ok(Unknown)
1596 };
1597 };
1598 if tb.subsumes(word_id, node) == Some(true) {
1599 Ok(Word)
1600 } else if tb.subsumes(not_word_id, node) == Some(true) {
1601 Ok(NonWord)
1602 } else {
1603 Ok(Unknown)
1604 }
1605 }
1606
1607 fn is_transparent_for_dir(ast: &Ast, dir: isize) -> bool {
1608 match ast {
1609 Ast::Lookaround(la) => match la.kind {
1610 ast::LookaroundKind::PositiveLookahead | ast::LookaroundKind::NegativeLookahead => {
1611 dir < 0
1612 }
1613 ast::LookaroundKind::PositiveLookbehind
1614 | ast::LookaroundKind::NegativeLookbehind => dir > 0,
1615 },
1616 Ast::Repetition(r) => matches!(
1617 &r.op.kind,
1618 ast::RepetitionKind::Range(ast::RepetitionRange::Exactly(0))
1619 ),
1620 _ => false,
1621 }
1622 }
1623
1624 fn concat_neighbor_kind(asts: &[Ast], idx: usize, dir: isize) -> WordCharKind {
1625 use WordCharKind::*;
1626 let next = idx as isize + dir;
1627 if next < 0 || next >= asts.len() as isize {
1628 return Edge;
1629 }
1630 if Self::is_transparent_for_dir(&asts[next as usize], dir) {
1631 return Self::concat_neighbor_kind(asts, next as usize, dir);
1632 }
1633 let kind = Self::word_char_kind(&asts[next as usize], dir < 0);
1634 match kind {
1635 MaybeWord => match Self::concat_neighbor_kind(asts, next as usize, dir) {
1636 Word => Word,
1637 _ => Unknown,
1638 },
1639 MaybeNonWord => match Self::concat_neighbor_kind(asts, next as usize, dir) {
1640 NonWord => NonWord,
1641 _ => Unknown,
1642 },
1643 other => other,
1644 }
1645 }
1646
1647 fn specialize_word_boundaries(
1648 &mut self,
1649 children: &mut [NodeId],
1650 tb: &mut TB<'s>,
1651 ) -> Result<()> {
1652 let wb = self.unicode_classes.wb;
1653 let non_wb = self.unicode_classes.non_wb;
1654 if wb == NodeId::MISSING {
1655 return Ok(());
1656 }
1657 let word = self.unicode_classes.word;
1658 let non_word = self.unicode_classes.non_word;
1659 if word == NodeId::MISSING {
1660 return Ok(());
1661 }
1662 let word_pref = tb.mk_concat(word, NodeId::TS);
1665 let non_word_pref = tb.mk_concat(non_word, NodeId::TS);
1666 let word_suf = tb.mk_concat(NodeId::TS, word);
1667 let non_word_suf = tb.mk_concat(NodeId::TS, non_word);
1668 let len = children.len();
1669 for k in 0..len {
1670 let l = if k == 0 {
1671 WordCharKind::Edge
1672 } else {
1673 use resharp_algebra::Kind;
1674 if tb.get_kind(children[k - 1]) == Kind::End
1675 && (children[k] == wb || children[k] == non_wb)
1676 {
1677 return Err(
1678 self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex)
1679 );
1680 }
1681 Self::classify(tb, children[k - 1], word_suf, non_word_suf)
1682 };
1683 let r = if k + 1 >= len {
1684 WordCharKind::Edge
1685 } else {
1686 Self::classify(tb, children[k + 1], word_pref, non_word_pref)
1687 };
1688 children[k] = Self::rewrite_wb_in_node(tb, children[k], wb, non_wb, word, l, r)
1689 .ok_or_else(|| self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))?;
1690 }
1691 Ok(())
1692 }
1693
1694 fn rewrite_wb_in_node(
1695 b: &mut TB<'s>,
1696 node: NodeId,
1697 wb: NodeId,
1698 non_wb: NodeId,
1699 word: NodeId,
1700 left: WordCharKind,
1701 right: WordCharKind,
1702 ) -> Option<NodeId> {
1703 let boundary_match = if node == wb {
1704 true
1705 } else if node == non_wb {
1706 false
1707 } else if b.get_kind(node) == Kind::Union {
1708 let l = Self::rewrite_wb_in_node(b, node.left(b), wb, non_wb, word, left, right)?;
1709 let r = Self::rewrite_wb_in_node(b, node.right(b), wb, non_wb, word, left, right)?;
1710 return Some(b.mk_union(l, r));
1711 } else {
1712 return Some(node);
1713 };
1714 use WordCharKind::*;
1715 let result = match (left, right) {
1716 (NonWord, Word) | (Word, NonWord) => {
1717 if boundary_match {
1718 NodeId::EPS
1719 } else {
1720 NodeId::BOT
1721 }
1722 }
1723 (Word, Word) | (NonWord, NonWord) => {
1724 if boundary_match {
1725 NodeId::BOT
1726 } else {
1727 NodeId::EPS
1728 }
1729 }
1730 (Word, _) => {
1731 if boundary_match {
1732 b.mk_neg_lookahead(word, 0)
1733 } else {
1734 let tail = b.mk_concat(word, NodeId::TS);
1735 b.mk_lookahead(tail, NodeId::MISSING, 0)
1736 }
1737 }
1738 (NonWord, _) => {
1739 if boundary_match {
1740 let tail = b.mk_concat(word, NodeId::TS);
1741 b.mk_lookahead(tail, NodeId::MISSING, 0)
1742 } else {
1743 b.mk_neg_lookahead(word, 0)
1744 }
1745 }
1746 (_, Word) => {
1747 if boundary_match {
1748 b.mk_neg_lookbehind(word)
1749 } else {
1750 b.mk_lookbehind(word, NodeId::MISSING)
1751 }
1752 }
1753 (_, NonWord) => {
1754 if boundary_match {
1755 b.mk_lookbehind(word, NodeId::MISSING)
1756 } else {
1757 b.mk_neg_lookbehind(word)
1758 }
1759 }
1760 _ => return Some(node),
1761 };
1762 Some(result)
1763 }
1764
1765 fn classify(
1766 b: &mut TB<'s>,
1767 node: NodeId,
1768 word_dir: NodeId,
1769 non_word_dir: NodeId,
1770 ) -> WordCharKind {
1771 if b.contains_look(node) || b.contains_anchors(node) {
1772 return WordCharKind::Unknown;
1773 }
1774 if b.subsumes(word_dir, node) == Some(true) {
1775 WordCharKind::Word
1776 } else if b.subsumes(non_word_dir, node) == Some(true) {
1777 WordCharKind::NonWord
1778 } else {
1779 WordCharKind::Unknown
1780 }
1781 }
1782
1783 fn rewrite_word_boundary_in_concat(
1784 &mut self,
1785 asts: &[Ast],
1786 idx: usize,
1787 translator: &mut Option<Translator>,
1788 tb: &mut TB<'s>,
1789 negated: bool,
1790 ) -> Result<(NodeId, usize)> {
1791 use WordCharKind::*;
1792 if self.global_full_unicode {
1793 self.unicode_classes.ensure_word_full(tb);
1794 } else if self.global_unicode && !self.global_ascii_perl {
1795 self.unicode_classes.ensure_word(tb);
1796 } else {
1797 self.unicode_classes.ensure_word_ascii(tb);
1798 }
1799 let word_id = self.unicode_classes.word;
1800 let not_word_id = self.unicode_classes.non_word;
1801 let left = self.resolve_word_kind(asts, idx, -1, translator, tb, word_id, not_word_id)?;
1802 let right = self.resolve_word_kind(asts, idx, 1, translator, tb, word_id, not_word_id)?;
1803 let boundary_match = !negated;
1804 match (left, right) {
1805 (NonWord, Word) | (Word, NonWord) => Ok((
1806 if boundary_match {
1807 NodeId::EPS
1808 } else {
1809 NodeId::BOT
1810 },
1811 idx + 1,
1812 )),
1813 (Word, Word) | (NonWord, NonWord) => Ok((
1814 if boundary_match {
1815 NodeId::BOT
1816 } else {
1817 NodeId::EPS
1818 },
1819 idx + 1,
1820 )),
1821 (Word, _) => {
1822 if boundary_match {
1823 Ok((tb.mk_neg_lookahead(word_id, 0), idx + 1))
1824 } else {
1825 let tail = tb.mk_concat(word_id, NodeId::TS);
1826 self.merge_boundary_with_following_lookaheads(asts, idx, tail, translator, tb)
1827 }
1828 }
1829 (NonWord, _) => {
1830 if boundary_match {
1831 let tail = tb.mk_concat(word_id, NodeId::TS);
1832 self.merge_boundary_with_following_lookaheads(asts, idx, tail, translator, tb)
1833 } else {
1834 Ok((tb.mk_neg_lookahead(word_id, 0), idx + 1))
1835 }
1836 }
1837 (_, Word) => {
1838 if boundary_match {
1839 Ok((tb.mk_neg_lookbehind(word_id), idx + 1))
1840 } else {
1841 Ok((tb.mk_lookbehind(word_id, NodeId::MISSING), idx + 1))
1842 }
1843 }
1844 (_, NonWord) => {
1845 if boundary_match {
1846 Ok((tb.mk_lookbehind(word_id, NodeId::MISSING), idx + 1))
1847 } else {
1848 Ok((tb.mk_neg_lookbehind(word_id), idx + 1))
1849 }
1850 }
1851 _ => {
1855 self.unicode_classes.ensure_wb(tb);
1856 let node = if boundary_match {
1857 self.unicode_classes.wb
1858 } else {
1859 self.unicode_classes.non_wb
1860 };
1861 Ok((node, idx + 1))
1862 }
1863 }
1864 }
1865
1866 fn merge_boundary_with_following_lookaheads(
1867 &mut self,
1868 asts: &[Ast],
1869 wb_idx: usize,
1870 boundary_tail: NodeId,
1871 translator: &mut Option<Translator>,
1872 tb: &mut TB<'s>,
1873 ) -> Result<(NodeId, usize)> {
1874 let mut next = wb_idx + 1;
1875 let mut la_bodies = vec![boundary_tail];
1876 while next < asts.len() {
1877 match &asts[next] {
1878 Ast::Lookaround(la) if la.kind == ast::LookaroundKind::PositiveLookahead => {
1879 let body = self.ast_to_node_id(&la.ast, translator, tb)?;
1880 la_bodies.push(tb.mk_concat(body, NodeId::TS));
1881 next += 1;
1882 }
1883 _ => break,
1884 }
1885 }
1886 let merged = tb.mk_inters(la_bodies.into_iter());
1887 Ok((tb.mk_lookahead(merged, NodeId::MISSING, 0), next))
1888 }
1889
    /// Translates a parsed [`Ast`] into a node of the regex algebra.
    ///
    /// `translator` is the `regex_syntax` translator in effect for literals
    /// and classes. `None` means the parser's own `self.translator` is used
    /// (see `translator_to_node_id`). Inline flag items overwrite it.
    ///
    /// # Errors
    ///
    /// Returns `ErrorKind::UnsupportedResharpRegex` for the constructs
    /// rejected below: the swap-greed flag, the start/end/angle/half
    /// word-boundary assertions, bracketed classes built from a binary set
    /// operator, and certain lookaround nestings. Errors from nested
    /// translations are propagated unchanged.
    fn ast_to_node_id(
        &mut self,
        ast: &Ast,
        translator: &mut Option<Translator>,
        tb: &mut TB<'s>,
    ) -> Result<NodeId> {
        match ast {
            Ast::Empty(_) => Ok(NodeId::EPS),
            // A bare flags item: rebuild the translator from the defaults plus
            // the `i` / `u` flags it sets, record `s` / `m` on the parser, and
            // contribute nothing to the match.
            Ast::Flags(f) => {
                if f.flags.flag_state(ast::Flag::SwapGreed).is_some() {
                    return Err(self.error(f.span, ast::ErrorKind::UnsupportedResharpRegex));
                }
                let mut translator_builder = self.default_translator_builder();
                if let Some(state) = f.flags.flag_state(ast::Flag::CaseInsensitive) {
                    translator_builder.case_insensitive(state);
                }
                if let Some(state) = f.flags.flag_state(ast::Flag::Unicode) {
                    translator_builder.unicode(state);
                }
                if let Some(state) = f.flags.flag_state(ast::Flag::DotMatchesNewLine) {
                    self.dot_all.set(state);
                }
                if let Some(state) = f.flags.flag_state(ast::Flag::MultiLine) {
                    self.multiline.set(state);
                }
                let concat_translator = Some(translator_builder.build());
                *translator = concat_translator;
                Ok(NodeId::EPS)
            }
            // Literals go through the regex_syntax translator so that the
            // active flags apply to them.
            Ast::Literal(l) => {
                let ast_lit = regex_syntax::ast::Ast::literal(*l.to_owned());
                self.translator_to_node_id(&ast_lit, translator, tb)
            }
            Ast::Top(_) => Ok(NodeId::TOP),
            Ast::Dot(_) => {
                // Code-point dot in ASCII-Perl or full-Unicode mode, byte dot
                // otherwise; a byte dot that also matches newlines is `TOP`.
                let codepoint_dot = self.global_ascii_perl || self.global_full_unicode;
                let hirv = match (codepoint_dot, self.dot_all.get()) {
                    (true, true) => hir::Hir::dot(hir::Dot::AnyChar),
                    (true, false) => hir::Hir::dot(hir::Dot::AnyCharExceptLF),
                    (false, true) => return Ok(NodeId::TOP),
                    (false, false) => hir::Hir::dot(hir::Dot::AnyByteExceptLF),
                };
                self.hir_to_node_id(&hirv, tb)
            }
            Ast::Assertion(a) => match &a.kind {
                ast::AssertionKind::StartText => Ok(NodeId::BEGIN),
                ast::AssertionKind::EndText => Ok(NodeId::END),
                // A word boundary outside a concatenation is handled as a
                // one-element concatenation, i.e. with no neighbour on either
                // side.
                ast::AssertionKind::WordBoundary => {
                    let only = Ast::Assertion(a.clone());
                    let asts = std::slice::from_ref(&only);
                    let (node, _) =
                        self.rewrite_word_boundary_in_concat(asts, 0, translator, tb, false)?;
                    Ok(node)
                }
                ast::AssertionKind::NotWordBoundary => {
                    let only = Ast::Assertion(a.clone());
                    let asts = std::slice::from_ref(&only);
                    let (node, _) =
                        self.rewrite_word_boundary_in_concat(asts, 0, translator, tb, true)?;
                    Ok(node)
                }
                // `^`: start of text, or in multiline mode a lookbehind for
                // start of text or a line feed.
                ast::AssertionKind::StartLine => {
                    if !self.multiline.get() {
                        return Ok(NodeId::BEGIN);
                    }
                    let left = NodeId::BEGIN;
                    let right = tb.mk_u8(b'\n');
                    let union = tb.mk_union(left, right);
                    Ok(tb.mk_lookbehind(union, NodeId::MISSING))
                }
                // `$`: end of text, or in multiline mode a lookahead for end
                // of text or a line feed.
                ast::AssertionKind::EndLine => {
                    if !self.multiline.get() {
                        return Ok(NodeId::END);
                    }
                    let left = NodeId::END;
                    let right = tb.mk_u8(b'\n');
                    let union = tb.mk_union(left, right);
                    Ok(tb.mk_lookahead(union, NodeId::MISSING, 0))
                }
                // The remaining word-boundary flavours are not supported.
                ast::AssertionKind::WordBoundaryStart => {
                    Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
                }
                ast::AssertionKind::WordBoundaryEnd => {
                    Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
                }
                ast::AssertionKind::WordBoundaryStartAngle => {
                    Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
                }
                ast::AssertionKind::WordBoundaryEndAngle => {
                    Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
                }
                ast::AssertionKind::WordBoundaryStartHalf => {
                    Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
                }
                ast::AssertionKind::WordBoundaryEndHalf => {
                    Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
                }
            },
            Ast::ClassUnicode(c) => {
                let tmp = regex_syntax::ast::ClassUnicode {
                    span: c.span,
                    negated: c.negated,
                    kind: c.kind.clone(),
                };
                // A few non-negated named classes are built directly as byte
                // ranges instead of going through the translator.
                if !c.negated {
                    if let regex_syntax::ast::ClassUnicodeKind::Named(s) = &c.kind {
                        match s.as_str() {
                            "ascii" => return Ok(tb.mk_range_u8(0, 127)),
                            // One ASCII byte, or a lead byte from 0xC0..=0xDF,
                            // 0xE0..=0xEF or 0xF0..=0xF7 followed by one, two
                            // or three bytes from 0x80..=0xBF respectively.
                            "utf8" => {
                                let ascii = tb.mk_range_u8(0, 127);
                                let beta = tb.mk_range_u8(128, 0xBF);
                                let c0 = tb.mk_range_u8(0xC0, 0xDF);
                                let c0s = tb.mk_concats([c0, beta].into_iter());
                                let e0 = tb.mk_range_u8(0xE0, 0xEF);
                                let e0s = tb.mk_concats([e0, beta, beta].into_iter());
                                let f0 = tb.mk_range_u8(0xF0, 0xF7);
                                let f0s = tb.mk_concats([f0, beta, beta, beta].into_iter());
                                return Ok(tb.mk_unions([ascii, c0s, e0s, f0s].into_iter()));
                            }
                            // ASCII hexadecimal digits in either case.
                            "hex" => {
                                let nums = tb.mk_range_u8(b'0', b'9');
                                let lets = tb.mk_range_u8(b'a', b'f');
                                let lets2 = tb.mk_range_u8(b'A', b'F');
                                let merged = tb.mk_unions([nums, lets, lets2].into_iter());
                                return Ok(merged);
                            }
                            _ => {}
                        }
                    };
                }

                let orig_ast = regex_syntax::ast::Ast::class_unicode(tmp);
                self.translator_to_node_id(&orig_ast, translator, tb)
            }
            Ast::ClassPerl(c) => self.get_class(c.negated, c.kind.clone(), tb),
            Ast::ClassBracketed(c) => match &c.kind {
                regex_syntax::ast::ClassSet::Item(item) => {
                    // NOTE(review): `is_universal_perl_pair` is defined
                    // elsewhere; judging by its use here it detects a set
                    // that matches everything — confirm.
                    if !c.negated && is_universal_perl_pair(item) {
                        return Ok(NodeId::TOP);
                    }
                    // `[\w]` / `[^\w]` etc. reuse the cached Perl class; the
                    // two negations combine by XOR.
                    if let regex_syntax::ast::ClassSetItem::Perl(p) = item {
                        return self.get_class(c.negated ^ p.negated, p.kind.clone(), tb);
                    }
                    let kind = if self.global_ascii_perl {
                        rewrite_ascii_perl_set(&c.kind)
                    } else {
                        c.kind.clone()
                    };
                    let tmp = regex_syntax::ast::ClassBracketed {
                        span: c.span,
                        negated: c.negated,
                        kind,
                    };
                    let orig_ast = regex_syntax::ast::Ast::class_bracketed(tmp);
                    self.translator_to_node_id(&orig_ast, translator, tb)
                }
                regex_syntax::ast::ClassSet::BinaryOp(_) => {
                    Err(self.error(c.span, ast::ErrorKind::UnsupportedResharpRegex))
                }
            },
            Ast::Repetition(r) => {
                let body = self.ast_to_node_id(&r.ast, translator, tb);
                match body {
                    Ok(body) => match &r.op.kind {
                        ast::RepetitionKind::ZeroOrOne => Ok(tb.mk_opt(body)),
                        ast::RepetitionKind::ZeroOrMore => Ok(tb.mk_star(body)),
                        ast::RepetitionKind::OneOrMore => Ok(tb.mk_plus(body)),
                        ast::RepetitionKind::Range(r) => match r {
                            ast::RepetitionRange::Exactly(n) => Ok(tb.mk_repeat(body, *n, *n)),
                            // `x{n,}` is built as `x{n}` followed by `x*`.
                            ast::RepetitionRange::AtLeast(n) => {
                                let rep = tb.mk_repeat(body, *n, *n);
                                let st = tb.mk_star(body);
                                Ok(tb.mk_concat(rep, st))
                            }

                            ast::RepetitionRange::Bounded(n, m) => Ok(tb.mk_repeat(body, *n, *m)),
                        },
                    },
                    // Propagate the body's error unchanged.
                    Err(_) => body,
                }
            }
            Ast::Lookaround(g) => {
                let body = self.ast_to_node_id(&g.ast, translator, tb)?;
                match g.kind {
                    // Positive lookahead whose body contains lookbehinds: peel
                    // the leading lookbehinds off the body's concatenation and
                    // keep them in front of a lookahead over the remainder.
                    ast::LookaroundKind::PositiveLookahead if body.contains_lookbehind(tb) => {
                        let mut prefix = NodeId::EPS;
                        let mut rest = body;
                        while tb.get_kind(rest) == Kind::Concat
                            && tb.get_kind(rest.left(tb)) == Kind::Lookbehind
                        {
                            prefix = tb.mk_concat(prefix, rest.left(tb));
                            rest = rest.right(tb);
                        }
                        // Unsupported if nothing could be peeled off or a
                        // lookbehind remains deeper inside.
                        if prefix == NodeId::EPS || rest.contains_lookbehind(tb) {
                            return Err(self.error(g.span, ast::ErrorKind::UnsupportedResharpRegex));
                        }
                        let la = tb.mk_lookahead(rest, NodeId::MISSING, 0);
                        Ok(tb.mk_concat(prefix, la))
                    }
                    ast::LookaroundKind::NegativeLookahead if body.contains_lookbehind(tb) => {
                        Err(self.error(g.span, ast::ErrorKind::UnsupportedResharpRegex))
                    }
                    ast::LookaroundKind::PositiveLookahead => {
                        Ok(tb.mk_lookahead(body, NodeId::MISSING, 0))
                    }
                    // Lookbehinds may not contain lookaheads.
                    ast::LookaroundKind::PositiveLookbehind
                    | ast::LookaroundKind::NegativeLookbehind
                        if body.contains_lookahead(tb) =>
                    {
                        Err(self.error(g.span, ast::ErrorKind::UnsupportedResharpRegex))
                    }
                    ast::LookaroundKind::PositiveLookbehind => {
                        Ok(tb.mk_lookbehind(body, NodeId::MISSING))
                    }
                    ast::LookaroundKind::NegativeLookahead => Ok(tb.mk_neg_lookahead(body, 0)),
                    ast::LookaroundKind::NegativeLookbehind => Ok(tb.mk_neg_lookbehind(body)),
                }
            }
            Ast::Group(g) => {
                // A non-capturing group carrying flags gets its own translator
                // (defaults plus the group's `i` / `u` flags). `s` / `m` are
                // applied for the group's body and restored afterwards, also
                // when the body fails to translate.
                if let ast::GroupKind::NonCapturing(ref flags) = g.kind {
                    if !flags.items.is_empty() {
                        let mut translator_builder = self.default_translator_builder();
                        if let Some(state) = flags.flag_state(ast::Flag::CaseInsensitive) {
                            translator_builder.case_insensitive(state);
                        }
                        if let Some(state) = flags.flag_state(ast::Flag::Unicode) {
                            translator_builder.unicode(state);
                        }
                        let saved_dot_all = self.dot_all.get();
                        if let Some(state) = flags.flag_state(ast::Flag::DotMatchesNewLine) {
                            self.dot_all.set(state);
                        }
                        let saved_multiline = self.multiline.get();
                        if let Some(state) = flags.flag_state(ast::Flag::MultiLine) {
                            self.multiline.set(state);
                        }
                        let mut scoped = Some(translator_builder.build());
                        let result = self.ast_to_node_id(&g.ast, &mut scoped, tb);
                        self.dot_all.set(saved_dot_all);
                        self.multiline.set(saved_multiline);
                        return result;
                    }
                }
                // Any other group is transparent.
                self.ast_to_node_id(&g.ast, translator, tb)
            }
            Ast::Alternation(a) => {
                let mut children = vec![];
                for ast in &a.asts {
                    match self.ast_to_node_id(ast, translator, tb) {
                        Ok(node_id) => children.push(node_id),
                        Err(err) => return Err(err),
                    }
                }
                Ok(tb.mk_unions(children.iter().copied()))
            }
            Ast::Concat(c) => {
                // Translator set by a flags item inside this concatenation;
                // once set it is used for all following siblings.
                let mut concat_translator: Option<Translator> = None;
                let mut children = vec![];
                // Index in `children` of the node produced by the most recent
                // word-boundary assertion, if no other non-empty child has
                // been pushed since.
                let mut prev_boundary_child: Option<usize> = None;
                let mut i = 0;
                while i < c.asts.len() {
                    let ast = &c.asts[i];
                    match ast {
                        Ast::Flags(f) => {
                            if f.flags.flag_state(ast::Flag::SwapGreed).is_some() {
                                return Err(
                                    self.error(f.span, ast::ErrorKind::UnsupportedResharpRegex)
                                );
                            }
                            let mut translator_builder = self.default_translator_builder();
                            if let Some(state) = f.flags.flag_state(ast::Flag::CaseInsensitive) {
                                translator_builder.case_insensitive(state);
                            }
                            if let Some(state) = f.flags.flag_state(ast::Flag::Unicode) {
                                translator_builder.unicode(state);
                            }
                            if let Some(state) = f.flags.flag_state(ast::Flag::DotMatchesNewLine) {
                                self.dot_all.set(state);
                            }
                            if let Some(state) = f.flags.flag_state(ast::Flag::MultiLine) {
                                self.multiline.set(state);
                            }
                            // The new translator is also written back to the
                            // caller's slot.
                            concat_translator = Some(translator_builder.build());
                            *translator = concat_translator.clone();
                            i += 1;
                            continue;
                        }
                        Ast::Assertion(a)
                            if a.kind == ast::AssertionKind::WordBoundary
                                || a.kind == ast::AssertionKind::NotWordBoundary =>
                        {
                            let negated = a.kind == ast::AssertionKind::NotWordBoundary;
                            // `node.0` is the replacement, `node.1` the index
                            // to continue from (lookaheads may be consumed).
                            let node = self.rewrite_word_boundary_in_concat(
                                &c.asts, i, translator, tb, negated,
                            )?;
                            // Back-to-back boundary assertions are intersected
                            // into a single child.
                            match prev_boundary_child {
                                Some(idx) => children[idx] = tb.mk_inter(children[idx], node.0),
                                None => {
                                    children.push(node.0);
                                    prev_boundary_child = Some(children.len() - 1);
                                }
                            }
                            i = node.1; continue;
                        }
                        _ => {}
                    }
                    // Ordinary element: children equal to `EPS` are dropped
                    // and do not reset `prev_boundary_child`.
                    match concat_translator {
                        Some(_) => match self.ast_to_node_id(ast, &mut concat_translator, tb) {
                            Ok(node_id) => {
                                if node_id != resharp_algebra::NodeId::EPS {
                                    prev_boundary_child = None;
                                    children.push(node_id);
                                }
                            }
                            Err(err) => return Err(err),
                        },
                        None => match self.ast_to_node_id(ast, translator, tb) {
                            Ok(node_id) => {
                                if node_id != resharp_algebra::NodeId::EPS {
                                    prev_boundary_child = None;
                                    children.push(node_id);
                                }
                            }
                            Err(err) => return Err(err),
                        },
                    }
                    i += 1;
                }
                // Second pass: refine remaining generic boundary nodes now
                // that the neighbouring nodes exist.
                self.specialize_word_boundaries(&mut children, tb)?;
                Ok(tb.mk_concats(children.iter().cloned()))
            }
            Ast::Intersection(intersection) => {
                let mut children = vec![];
                for ast in &intersection.asts {
                    match self.ast_to_node_id(ast, translator, tb) {
                        Ok(node_id) => children.push(node_id),
                        Err(err) => return Err(err),
                    }
                }
                Ok(tb.mk_inters(children.into_iter()))
            }
            Ast::Complement(complement) => {
                let body = self.ast_to_node_id(&complement.ast, translator, tb);
                body.map(|x| tb.mk_compl(x))
            }
        }
    }
2241
2242 fn parse_inner(&mut self) -> Result<Ast> {
2243 let mut concat = Concat {
2244 span: self.span(),
2245 asts: vec![],
2246 };
2247 loop {
2248 self.bump_space();
2249 if self.is_eof() {
2250 break;
2251 }
2252 match self.char() {
2253 '(' => concat = self.push_group(concat)?,
2254 ')' => concat = self.pop_group(concat)?,
2255 '|' => concat = self.push_alternate(concat)?,
2256 '&' => concat = self.push_intersect(concat)?,
2257 '~' => concat = self.push_compl_group(concat)?,
2258 '[' => {
2259 let class = self.parse_set_class()?;
2260 concat.asts.push(Ast::class_bracketed(class));
2261 }
2262 '?' => {
2263 concat =
2264 self.parse_uncounted_repetition(concat, ast::RepetitionKind::ZeroOrOne)?;
2265 }
2266 '*' => {
2267 concat =
2268 self.parse_uncounted_repetition(concat, ast::RepetitionKind::ZeroOrMore)?;
2269 }
2270 '+' => {
2271 concat =
2272 self.parse_uncounted_repetition(concat, ast::RepetitionKind::OneOrMore)?;
2273 }
2274 '{' => {
2275 concat = self.parse_counted_repetition(concat)?;
2276 }
2277 _ => concat.asts.push(self.parse_primitive()?.into_ast()),
2278 }
2279 if self.stack_group.borrow().len() > self.max_depth {
2280 return Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex));
2281 }
2282 }
2283 let ast = self.pop_group_end(concat)?;
2284 if expanded_ast_size(&ast, self.expanded_ast_limit) >= self.expanded_ast_limit
2285 || max_concat_length(&ast) >= self.max_list_len
2286 {
2287 return Err(self.error(*ast.span(), ast::ErrorKind::UnsupportedResharpRegex));
2288 }
2289 Ok(ast)
2290 }
2291
2292 fn parse(&mut self, tb: &mut TB<'s>) -> Result<NodeId> {
2293 let ast = self.parse_inner()?;
2294 if let Err(span) = ensure_lookbehind_at_start(&ast, true) {
2295 return Err(self.error(span, ast::ErrorKind::UnsupportedResharpRegex));
2296 }
2297 self.ast_to_node_id(&ast, &mut None, tb)
2298 }
2299
2300 #[inline(never)]
2301 fn parse_uncounted_repetition(
2302 &self,
2303 mut concat: ast::Concat,
2304 kind: ast::RepetitionKind,
2305 ) -> Result<ast::Concat> {
2306 let op_start = self.pos();
2308 let ast = match concat.asts.pop() {
2309 Some(ast) => ast,
2310 None => return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing)),
2311 };
2312 match ast {
2313 Ast::Empty(_) | Ast::Flags(_) => {
2314 return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing))
2315 }
2316 _ => {}
2317 }
2318 if self.bump() && self.char() == '?' {
2319 return Err(self.error(
2320 Span::new(op_start, self.pos()),
2321 ast::ErrorKind::UnsupportedLazyQuantifier,
2322 ));
2323 }
2324 concat.asts.push(Ast::repetition(ast::Repetition {
2325 span: ast.span().with_end(self.pos()),
2326 op: ast::RepetitionOp {
2327 span: Span::new(op_start, self.pos()),
2328 kind,
2329 },
2330 greedy: true,
2331 ast: Box::new(ast),
2332 }));
2333 Ok(concat)
2334 }
2335
2336 #[inline(never)]
2337 fn parse_counted_repetition(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
2338 assert!(self.char() == '{');
2339 let start = self.pos();
2340 let ast = match concat.asts.pop() {
2341 Some(ast) => ast,
2342 None => return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing)),
2343 };
2344 match ast {
2345 Ast::Empty(_) | Ast::Flags(_) => {
2346 return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing))
2347 }
2348 _ => {}
2349 }
2350 if !self.bump_and_bump_space() {
2351 return Err(self.error(
2352 Span::new(start, self.pos()),
2353 ast::ErrorKind::RepetitionCountUnclosed,
2354 ));
2355 }
2356 let count_start = specialize_err(
2357 self.parse_decimal(),
2358 ast::ErrorKind::DecimalEmpty,
2359 ast::ErrorKind::RepetitionCountDecimalEmpty,
2360 );
2361 if self.is_eof() {
2362 return Err(self.error(
2363 Span::new(start, self.pos()),
2364 ast::ErrorKind::RepetitionCountUnclosed,
2365 ));
2366 }
2367 let range = if self.char() == ',' {
2368 if !self.bump_and_bump_space() {
2369 return Err(self.error(
2370 Span::new(start, self.pos()),
2371 ast::ErrorKind::RepetitionCountUnclosed,
2372 ));
2373 }
2374 if self.char() != '}' {
2375 let count_start = match count_start {
2376 Ok(c) => c,
2377 Err(err) if err.kind == ast::ErrorKind::RepetitionCountDecimalEmpty => {
2378 if self.parser().empty_min_range {
2379 0
2380 } else {
2381 return Err(err);
2382 }
2383 }
2384 err => err?,
2385 };
2386 let count_end = specialize_err(
2387 self.parse_decimal(),
2388 ast::ErrorKind::DecimalEmpty,
2389 ast::ErrorKind::RepetitionCountDecimalEmpty,
2390 )?;
2391 ast::RepetitionRange::Bounded(count_start, count_end)
2392 } else {
2393 ast::RepetitionRange::AtLeast(count_start?)
2394 }
2395 } else {
2396 ast::RepetitionRange::Exactly(count_start?)
2397 };
2398
2399 if self.is_eof() || self.char() != '}' {
2400 return Err(self.error(
2401 Span::new(start, self.pos()),
2402 ast::ErrorKind::RepetitionCountUnclosed,
2403 ));
2404 }
2405
2406 if self.bump_and_bump_space() && self.char() == '?' {
2407 return Err(self.error(
2408 Span::new(start, self.pos()),
2409 ast::ErrorKind::UnsupportedLazyQuantifier,
2410 ));
2411 }
2412
2413 let op_span = Span::new(start, self.pos());
2414 if !range.is_valid() {
2415 return Err(self.error(op_span, ast::ErrorKind::RepetitionCountInvalid));
2416 }
2417
2418 let over_limit = match &range {
2419 ast::RepetitionRange::Exactly(n) => *n > self.max_repeat,
2420 ast::RepetitionRange::AtLeast(n) => *n > self.max_repeat,
2421 ast::RepetitionRange::Bounded(n, m) => *n > self.max_repeat || *m > self.max_repeat,
2422 };
2423 if over_limit {
2424 return Err(self.error(op_span, ast::ErrorKind::UnsupportedResharpRegex));
2425 }
2426 concat.asts.push(Ast::repetition(ast::Repetition {
2427 span: ast.span().with_end(self.pos()),
2428 op: ast::RepetitionOp {
2429 span: op_span,
2430 kind: ast::RepetitionKind::Range(range),
2431 },
2432 greedy: true,
2433 ast: Box::new(ast),
2434 }));
2435 Ok(concat)
2436 }
2437
2438 #[inline(never)]
2439 fn parse_group(&self) -> Result<Either<ast::SetFlags, ast::Group>> {
2440 assert_eq!(self.char(), '(');
2441 let open_span = self.span_char();
2442 self.bump();
2443 self.bump_space();
2444 if let Some((ahead, pos)) = self.is_lookaround_prefix() {
2445 let kind = match (pos, ahead) {
2446 (true, true) => LookaroundKind::PositiveLookahead,
2447 (true, false) => LookaroundKind::PositiveLookbehind,
2448 (false, true) => LookaroundKind::NegativeLookahead,
2449 (false, false) => LookaroundKind::NegativeLookbehind,
2450 };
2451 return Ok(Either::Right(ast::Group {
2452 span: open_span,
2453 kind: ast::GroupKind::Lookaround(kind),
2454 ast: Box::new(Ast::empty(self.span())),
2455 }));
2456 }
2457 let inner_span = self.span();
2458 let mut starts_with_p = true;
2459 if self.bump_if("?P<") || {
2460 starts_with_p = false;
2461 self.bump_if("?<")
2462 } {
2463 let capture_index = self.next_capture_index(open_span)?;
2464 let name = self.parse_capture_name(capture_index)?;
2465 Ok(Either::Right(ast::Group {
2466 span: open_span,
2467 kind: ast::GroupKind::CaptureName {
2468 starts_with_p,
2469 name,
2470 },
2471 ast: Box::new(Ast::empty(self.span())),
2472 }))
2473 } else if self.bump_if("?") {
2474 if self.is_eof() {
2475 return Err(self.error(open_span, ast::ErrorKind::GroupUnclosed));
2476 }
2477 let flags = self.parse_flags()?;
2478 let char_end = self.char();
2479 self.bump();
2480 if char_end == ')' {
2481 if flags.items.is_empty() {
2484 return Err(self.error(inner_span, ast::ErrorKind::RepetitionMissing));
2485 }
2486 Ok(Either::Left(ast::SetFlags {
2487 span: Span {
2488 end: self.pos(),
2489 ..open_span
2490 },
2491 flags,
2492 }))
2493 } else {
2494 assert_eq!(char_end, ':');
2495 Ok(Either::Right(ast::Group {
2496 span: open_span,
2497 kind: ast::GroupKind::NonCapturing(flags),
2498 ast: Box::new(Ast::empty(self.span())),
2499 }))
2500 }
2501 } else {
2502 let capture_index = self.next_capture_index(open_span)?;
2503 Ok(Either::Right(ast::Group {
2504 span: open_span,
2505 kind: ast::GroupKind::CaptureIndex(capture_index),
2506 ast: Box::new(Ast::empty(self.span())),
2507 }))
2508 }
2509 }
2510
2511 #[inline(never)]
2512 fn parse_capture_name(&self, capture_index: u32) -> Result<ast::CaptureName> {
2513 if self.is_eof() {
2514 return Err(self.error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
2515 }
2516 let start = self.pos();
2517 loop {
2518 if self.char() == '>' {
2519 break;
2520 }
2521 if !is_capture_char(self.char(), self.pos() == start) {
2522 return Err(self.error(self.span_char(), ast::ErrorKind::GroupNameInvalid));
2523 }
2524 if !self.bump() {
2525 break;
2526 }
2527 }
2528 let end = self.pos();
2529 if self.is_eof() {
2530 return Err(self.error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
2531 }
2532 assert_eq!(self.char(), '>');
2533 self.bump();
2534 let name = &self.pattern()[start.offset..end.offset];
2535 if name.is_empty() {
2536 return Err(self.error(Span::new(start, start), ast::ErrorKind::GroupNameEmpty));
2537 }
2538 let capname = ast::CaptureName {
2539 span: Span::new(start, end),
2540 name: name.to_string(),
2541 index: capture_index,
2542 };
2543 self.add_capture_name(&capname)?;
2544 Ok(capname)
2545 }
2546
2547 #[inline(never)]
2548 fn parse_flags(&self) -> Result<ast::Flags> {
2549 let mut flags = ast::Flags {
2550 span: self.span(),
2551 items: vec![],
2552 };
2553 let mut last_was_negation = None;
2554 while self.char() != ':' && self.char() != ')' {
2555 if self.char() == '-' {
2556 last_was_negation = Some(self.span_char());
2557 let item = ast::FlagsItem {
2558 span: self.span_char(),
2559 kind: ast::FlagsItemKind::Negation,
2560 };
2561 if let Some(i) = flags.add_item(item) {
2562 return Err(self.error(
2563 self.span_char(),
2564 ast::ErrorKind::FlagRepeatedNegation {
2565 original: flags.items[i].span,
2566 },
2567 ));
2568 }
2569 } else {
2570 last_was_negation = None;
2571 let item = ast::FlagsItem {
2572 span: self.span_char(),
2573 kind: ast::FlagsItemKind::Flag(self.parse_flag()?),
2574 };
2575 if let Some(i) = flags.add_item(item) {
2576 return Err(self.error(
2577 self.span_char(),
2578 ast::ErrorKind::FlagDuplicate {
2579 original: flags.items[i].span,
2580 },
2581 ));
2582 }
2583 }
2584 if !self.bump() {
2585 return Err(self.error(self.span(), ast::ErrorKind::FlagUnexpectedEof));
2586 }
2587 }
2588 if let Some(span) = last_was_negation {
2589 return Err(self.error(span, ast::ErrorKind::FlagDanglingNegation));
2590 }
2591 flags.span.end = self.pos();
2592 Ok(flags)
2593 }
2594
2595 #[inline(never)]
2596 fn parse_flag(&self) -> Result<ast::Flag> {
2597 match self.char() {
2598 'i' => Ok(ast::Flag::CaseInsensitive),
2599 'm' => Ok(ast::Flag::MultiLine),
2600 's' => Ok(ast::Flag::DotMatchesNewLine),
2601 'U' => Ok(ast::Flag::SwapGreed),
2602 'u' => Ok(ast::Flag::Unicode),
2603 'R' => Ok(ast::Flag::CRLF),
2604 'x' => Ok(ast::Flag::IgnoreWhitespace),
2605 _ => Err(self.error(self.span_char(), ast::ErrorKind::FlagUnrecognized)),
2606 }
2607 }
2608
2609 fn parse_primitive(&self) -> Result<Primitive> {
2610 match self.char() {
2611 '\\' => self.parse_escape(),
2612 '_' => {
2613 let ast = Primitive::Top(self.span_char());
2614 self.bump();
2615 Ok(ast)
2616 }
2617 '.' => {
2618 let ast = Primitive::Dot(self.span_char());
2619 self.bump();
2620 Ok(ast)
2621 }
2622 '^' => {
2623 let ast = Primitive::Assertion(ast::Assertion {
2624 span: self.span_char(),
2625 kind: ast::AssertionKind::StartLine,
2626 });
2627 self.bump();
2628 Ok(ast)
2629 }
2630 '$' => {
2631 let ast = Primitive::Assertion(ast::Assertion {
2632 span: self.span_char(),
2633 kind: ast::AssertionKind::EndLine,
2634 });
2635 self.bump();
2636 Ok(ast)
2637 }
2638 c => {
2639 let ast = Primitive::Literal(Literal {
2640 span: self.span_char(),
2641 kind: LiteralKind::Verbatim,
2642 c,
2643 });
2644 self.bump();
2645 Ok(ast)
2646 }
2647 }
2648 }
2649
2650 #[inline(never)]
2651 fn parse_escape(&self) -> Result<Primitive> {
2652 assert_eq!(self.char(), '\\');
2653 let start = self.pos();
2654 if !self.bump() {
2655 return Err(self.error(
2656 Span::new(start, self.pos()),
2657 ast::ErrorKind::EscapeUnexpectedEof,
2658 ));
2659 }
2660 let c = self.char();
2661 match c {
2663 '0'..='9' => {
2664 if !self.parser().octal {
2665 return Err(self.error(
2666 Span::new(start, self.span_char().end),
2667 ast::ErrorKind::UnsupportedBackreference,
2668 ));
2669 }
2670 let mut lit = self.parse_octal();
2671 lit.span.start = start;
2672 return Ok(Primitive::Literal(lit));
2673 }
2674 'x' | 'u' | 'U' => {
2675 let mut lit = self.parse_hex()?;
2676 lit.span.start = start;
2677 return Ok(Primitive::Literal(lit));
2678 }
2679 'p' | 'P' => {
2680 let mut cls = self.parse_unicode_class()?;
2681 cls.span.start = start;
2682 return Ok(Primitive::Unicode(cls));
2683 }
2684 'd' | 's' | 'w' | 'D' | 'S' | 'W' => {
2685 let mut cls = self.parse_perl_class();
2686 cls.span.start = start;
2687 return Ok(Primitive::Perl(cls));
2688 }
2689 _ => {}
2690 }
2691
2692 self.bump();
2694 let span = Span::new(start, self.pos());
2695 if is_meta_character(c) {
2696 return Ok(Primitive::Literal(Literal {
2697 span,
2698 kind: LiteralKind::Meta,
2699 c,
2700 }));
2701 }
2702 if is_escapeable_character(c) {
2703 return Ok(Primitive::Literal(Literal {
2704 span,
2705 kind: LiteralKind::Superfluous,
2706 c,
2707 }));
2708 }
2709 let special = |kind, c| {
2710 Ok(Primitive::Literal(Literal {
2711 span,
2712 kind: LiteralKind::Special(kind),
2713 c,
2714 }))
2715 };
2716 match c {
2717 'a' => special(SpecialLiteralKind::Bell, '\x07'),
2718 'f' => special(SpecialLiteralKind::FormFeed, '\x0C'),
2719 't' => special(SpecialLiteralKind::Tab, '\t'),
2720 'n' => special(SpecialLiteralKind::LineFeed, '\n'),
2721 'r' => special(SpecialLiteralKind::CarriageReturn, '\r'),
2722 'v' => special(SpecialLiteralKind::VerticalTab, '\x0B'),
2723 'A' => Ok(Primitive::Assertion(ast::Assertion {
2724 span,
2725 kind: ast::AssertionKind::StartText,
2726 })),
2727 'z' => Ok(Primitive::Assertion(ast::Assertion {
2728 span,
2729 kind: ast::AssertionKind::EndText,
2730 })),
2731 'b' => {
2732 let mut wb = ast::Assertion {
2733 span,
2734 kind: ast::AssertionKind::WordBoundary,
2735 };
2736 if !self.is_eof() && self.char() == '{' {
2739 if let Some(kind) = self.maybe_parse_special_word_boundary(start)? {
2740 wb.kind = kind;
2741 wb.span.end = self.pos();
2742 }
2743 }
2744 Ok(Primitive::Assertion(wb))
2745 }
2746 'B' => Ok(Primitive::Assertion(ast::Assertion {
2747 span,
2748 kind: ast::AssertionKind::NotWordBoundary,
2749 })),
2750 '<' => Ok(Primitive::Assertion(ast::Assertion {
2751 span,
2752 kind: ast::AssertionKind::WordBoundaryStartAngle,
2753 })),
2754 '>' => Ok(Primitive::Assertion(ast::Assertion {
2755 span,
2756 kind: ast::AssertionKind::WordBoundaryEndAngle,
2757 })),
2758 _ => Err(self.error(span, ast::ErrorKind::EscapeUnrecognized)),
2759 }
2760 }
2761
2762 fn maybe_parse_special_word_boundary(
2763 &self,
2764 wb_start: Position,
2765 ) -> Result<Option<ast::AssertionKind>> {
2766 assert_eq!(self.char(), '{');
2767
2768 let is_valid_char = |c| matches!(c, 'A'..='Z' | 'a'..='z' | '-');
2769 let start = self.pos();
2770 if !self.bump_and_bump_space() {
2771 return Err(self.error(
2772 Span::new(wb_start, self.pos()),
2773 ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof,
2774 ));
2775 }
2776 let start_contents = self.pos();
2777 if !is_valid_char(self.char()) {
2778 self.parser().pos.set(start);
2779 return Ok(None);
2780 }
2781
2782 let mut scratch = self.parser().scratch.borrow_mut();
2784 scratch.clear();
2785 while !self.is_eof() && is_valid_char(self.char()) {
2786 scratch.push(self.char());
2787 self.bump_and_bump_space();
2788 }
2789 if self.is_eof() || self.char() != '}' {
2790 return Err(self.error(
2791 Span::new(start, self.pos()),
2792 ast::ErrorKind::SpecialWordBoundaryUnclosed,
2793 ));
2794 }
2795 let end = self.pos();
2796 self.bump();
2797 let kind = match scratch.as_str() {
2798 "start" => ast::AssertionKind::WordBoundaryStart,
2799 "end" => ast::AssertionKind::WordBoundaryEnd,
2800 "start-half" => ast::AssertionKind::WordBoundaryStartHalf,
2801 "end-half" => ast::AssertionKind::WordBoundaryEndHalf,
2802 _ => {
2803 return Err(self.error(
2804 Span::new(start_contents, end),
2805 ast::ErrorKind::SpecialWordBoundaryUnrecognized,
2806 ))
2807 }
2808 };
2809 Ok(Some(kind))
2810 }
2811
2812 #[inline(never)]
2813 fn parse_octal(&self) -> Literal {
2814 assert!(self.parser().octal);
2815 assert!('0' <= self.char() && self.char() <= '7');
2816 let start = self.pos();
2817 while self.bump()
2819 && '0' <= self.char()
2820 && self.char() <= '7'
2821 && self.pos().offset - start.offset <= 2
2822 {}
2823 let end = self.pos();
2824 let octal = &self.pattern()[start.offset..end.offset];
2825 let codepoint = u32::from_str_radix(octal, 8).expect("valid octal number");
2828 let c = char::from_u32(codepoint).expect("Unicode scalar value");
2831 Literal {
2832 span: Span::new(start, end),
2833 kind: LiteralKind::Octal,
2834 c,
2835 }
2836 }
2837
2838 #[inline(never)]
2839 fn parse_hex(&self) -> Result<Literal> {
2840 assert!(self.char() == 'x' || self.char() == 'u' || self.char() == 'U');
2841
2842 let hex_kind = match self.char() {
2843 'x' => HexLiteralKind::X,
2844 'u' => HexLiteralKind::UnicodeShort,
2845 _ => HexLiteralKind::UnicodeLong,
2846 };
2847 if !self.bump_and_bump_space() {
2848 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2849 }
2850 if self.char() == '{' {
2851 self.parse_hex_brace(hex_kind)
2852 } else {
2853 self.parse_hex_digits(hex_kind)
2854 }
2855 }
2856
2857 #[inline(never)]
2858 fn parse_hex_digits(&self, kind: HexLiteralKind) -> Result<Literal> {
2859 let mut scratch = self.parser().scratch.borrow_mut();
2860 scratch.clear();
2861
2862 let start = self.pos();
2863 for i in 0..kind.digits() {
2864 if i > 0 && !self.bump_and_bump_space() {
2865 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2866 }
2867 if !is_hex(self.char()) {
2868 return Err(self.error(self.span_char(), ast::ErrorKind::EscapeHexInvalidDigit));
2869 }
2870 scratch.push(self.char());
2871 }
2872 self.bump_and_bump_space();
2873 let end = self.pos();
2874 let hex = scratch.as_str();
2875 match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
2876 None => Err(self.error(Span::new(start, end), ast::ErrorKind::EscapeHexInvalid)),
2877 Some(c) => Ok(Literal {
2878 span: Span::new(start, end),
2879 kind: LiteralKind::HexFixed(kind),
2880 c,
2881 }),
2882 }
2883 }
2884
2885 #[inline(never)]
2886 fn parse_hex_brace(&self, kind: HexLiteralKind) -> Result<Literal> {
2887 let mut scratch = self.parser().scratch.borrow_mut();
2888 scratch.clear();
2889
2890 let brace_pos = self.pos();
2891 let start = self.span_char().end;
2892 while self.bump_and_bump_space() && self.char() != '}' {
2893 if !is_hex(self.char()) {
2894 return Err(self.error(self.span_char(), ast::ErrorKind::EscapeHexInvalidDigit));
2895 }
2896 scratch.push(self.char());
2897 }
2898 if self.is_eof() {
2899 return Err(self.error(
2900 Span::new(brace_pos, self.pos()),
2901 ast::ErrorKind::EscapeUnexpectedEof,
2902 ));
2903 }
2904 let end = self.pos();
2905 let hex = scratch.as_str();
2906 assert_eq!(self.char(), '}');
2907 self.bump_and_bump_space();
2908
2909 if hex.is_empty() {
2910 return Err(self.error(
2911 Span::new(brace_pos, self.pos()),
2912 ast::ErrorKind::EscapeHexEmpty,
2913 ));
2914 }
2915 match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
2916 None => Err(self.error(Span::new(start, end), ast::ErrorKind::EscapeHexInvalid)),
2917 Some(c) => Ok(Literal {
2918 span: Span::new(start, self.pos()),
2919 kind: LiteralKind::HexBrace(kind),
2920 c,
2921 }),
2922 }
2923 }
2924
2925 fn parse_decimal(&self) -> Result<u32> {
2926 let mut scratch = self.parser().scratch.borrow_mut();
2927 scratch.clear();
2928
2929 while !self.is_eof() && self.char().is_whitespace() {
2930 self.bump();
2931 }
2932 let start = self.pos();
2933 while !self.is_eof() && '0' <= self.char() && self.char() <= '9' {
2934 scratch.push(self.char());
2935 self.bump_and_bump_space();
2936 }
2937 let span = Span::new(start, self.pos());
2938 while !self.is_eof() && self.char().is_whitespace() {
2939 self.bump_and_bump_space();
2940 }
2941 let digits = scratch.as_str();
2942 if digits.is_empty() {
2943 return Err(self.error(span, ast::ErrorKind::DecimalEmpty));
2944 }
2945 match digits.parse::<u32>().ok() {
2946 Some(n) => Ok(n),
2947 None => Err(self.error(span, ast::ErrorKind::DecimalInvalid)),
2948 }
2949 }
2950
2951 #[inline(never)]
2952 fn parse_set_class(&self) -> Result<ClassBracketed> {
2953 assert_eq!(self.char(), '[');
2954
2955 let mut union = ClassSetUnion {
2956 span: self.span(),
2957 items: vec![],
2958 };
2959 loop {
2960 self.bump_space();
2961 if self.is_eof() {
2962 return Err(self.unclosed_class_error());
2963 }
2964 match self.char() {
2965 '[' => {
2966 if !self.parser().stack_class.borrow().is_empty() {
2967 if let Some(cls) = self.maybe_parse_ascii_class() {
2968 union.push(ClassSetItem::Ascii(cls));
2969 continue;
2970 }
2971 }
2972 union = self.push_class_open(union)?;
2973 }
2974 ']' => match self.pop_class(union)? {
2975 Either::Left(nested_union) => {
2976 union = nested_union;
2977 }
2978 Either::Right(class) => return Ok(class),
2979 },
2980 '&' if self.peek() == Some('&') => {
2981 assert!(self.bump_if("&&"));
2982 union = self.push_class_op(ClassSetBinaryOpKind::Intersection, union);
2983 }
2984 '-' if self.peek() == Some('-') => {
2985 assert!(self.bump_if("--"));
2986 union = self.push_class_op(ClassSetBinaryOpKind::Difference, union);
2987 }
2988 '~' if self.peek() == Some('~') => {
2989 assert!(self.bump_if("~~"));
2990 union = self.push_class_op(ClassSetBinaryOpKind::SymmetricDifference, union);
2991 }
2992 _ => {
2993 union.push(self.parse_set_class_range()?);
2994 }
2995 }
2996 }
2997 }
2998
2999 #[inline(never)]
3000 fn parse_set_class_range(&self) -> Result<ClassSetItem> {
3001 let prim1 = self.parse_set_class_item()?;
3002 self.bump_space();
3003 if self.is_eof() {
3004 return Err(self.unclosed_class_error());
3005 }
3006 if self.char() != '-' || self.peek_space() == Some(']') || self.peek_space() == Some('-') {
3007 return prim1.into_class_set_item(self);
3008 }
3009 if !self.bump_and_bump_space() {
3010 return Err(self.unclosed_class_error());
3011 }
3012 let prim2 = self.parse_set_class_item()?;
3013 let range = ClassSetRange {
3014 span: Span::new(prim1.span().start, prim2.span().end),
3015 start: prim1.into_class_literal(self)?,
3016 end: prim2.into_class_literal(self)?,
3017 };
3018 if !range.is_valid() {
3019 return Err(self.error(range.span, ast::ErrorKind::ClassRangeInvalid));
3020 }
3021 Ok(ClassSetItem::Range(range))
3022 }
3023
3024 #[inline(never)]
3025 fn parse_set_class_item(&self) -> Result<Primitive> {
3026 if self.char() == '\\' {
3027 self.parse_escape()
3028 } else {
3029 let x = Primitive::Literal(Literal {
3030 span: self.span_char(),
3031 kind: LiteralKind::Verbatim,
3032 c: self.char(),
3033 });
3034 self.bump();
3035 Ok(x)
3036 }
3037 }
3038
3039 #[inline(never)]
3040 fn parse_set_class_open(&self) -> Result<(ClassBracketed, ClassSetUnion)> {
3041 assert_eq!(self.char(), '[');
3042 let start = self.pos();
3043 if !self.bump_and_bump_space() {
3044 return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
3045 }
3046
3047 let negated = if self.char() != '^' {
3048 false
3049 } else {
3050 if !self.bump_and_bump_space() {
3051 return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
3052 }
3053 true
3054 };
3055 let mut union = ClassSetUnion {
3057 span: self.span(),
3058 items: vec![],
3059 };
3060 while self.char() == '-' {
3061 union.push(ClassSetItem::Literal(Literal {
3062 span: self.span_char(),
3063 kind: LiteralKind::Verbatim,
3064 c: '-',
3065 }));
3066 if !self.bump_and_bump_space() {
3067 return Err(self.error(Span::new(start, start), ast::ErrorKind::ClassUnclosed));
3068 }
3069 }
3070 if union.items.is_empty() && self.char() == ']' {
3073 union.push(ClassSetItem::Literal(Literal {
3074 span: self.span_char(),
3075 kind: LiteralKind::Verbatim,
3076 c: ']',
3077 }));
3078 if !self.bump_and_bump_space() {
3079 return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
3080 }
3081 }
3082 let set = ClassBracketed {
3083 span: Span::new(start, self.pos()),
3084 negated,
3085 kind: ClassSet::union(ClassSetUnion {
3086 span: Span::new(union.span.start, union.span.start),
3087 items: vec![],
3088 }),
3089 };
3090 Ok((set, union))
3091 }
3092
3093 #[inline(never)]
3094 fn maybe_parse_ascii_class(&self) -> Option<ClassAscii> {
3095 assert_eq!(self.char(), '[');
3096 let start = self.pos();
3098 let mut negated = false;
3099 if !self.bump() || self.char() != ':' {
3100 self.parser().pos.set(start);
3101 return None;
3102 }
3103 if !self.bump() {
3104 self.parser().pos.set(start);
3105 return None;
3106 }
3107 if self.char() == '^' {
3108 negated = true;
3109 if !self.bump() {
3110 self.parser().pos.set(start);
3111 return None;
3112 }
3113 }
3114 let name_start = self.offset();
3115 while self.char() != ':' && self.bump() {}
3116 if self.is_eof() {
3117 self.parser().pos.set(start);
3118 return None;
3119 }
3120 let name = &self.pattern()[name_start..self.offset()];
3121 if !self.bump_if(":]") {
3122 self.parser().pos.set(start);
3123 return None;
3124 }
3125 let kind = match regex_syntax::ast::ClassAsciiKind::from_name(name) {
3126 Some(kind) => kind,
3127 None => {
3128 self.parser().pos.set(start);
3129 return None;
3130 }
3131 };
3132 Some(ClassAscii {
3133 span: Span::new(start, self.pos()),
3134 kind,
3135 negated,
3136 })
3137 }
3138
3139 #[inline(never)]
3140 fn parse_unicode_class(&self) -> Result<ClassUnicode> {
3141 assert!(self.char() == 'p' || self.char() == 'P');
3142
3143 let mut scratch = self.parser().scratch.borrow_mut();
3144 scratch.clear();
3145
3146 let negated = self.char() == 'P';
3147 if !self.bump_and_bump_space() {
3148 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
3149 }
3150 let (start, kind) = if self.char() == '{' {
3151 let start = self.span_char().end;
3152 while self.bump_and_bump_space() && self.char() != '}' {
3153 scratch.push(self.char());
3154 }
3155 if self.is_eof() {
3156 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
3157 }
3158 assert_eq!(self.char(), '}');
3159 self.bump();
3160
3161 let name = scratch.as_str();
3162 if let Some(i) = name.find("!=") {
3163 (
3164 start,
3165 ClassUnicodeKind::NamedValue {
3166 op: ClassUnicodeOpKind::NotEqual,
3167 name: name[..i].to_string(),
3168 value: name[i + 2..].to_string(),
3169 },
3170 )
3171 } else if let Some(i) = name.find(':') {
3172 (
3173 start,
3174 ClassUnicodeKind::NamedValue {
3175 op: ClassUnicodeOpKind::Colon,
3176 name: name[..i].to_string(),
3177 value: name[i + 1..].to_string(),
3178 },
3179 )
3180 } else if let Some(i) = name.find('=') {
3181 (
3182 start,
3183 ClassUnicodeKind::NamedValue {
3184 op: ClassUnicodeOpKind::Equal,
3185 name: name[..i].to_string(),
3186 value: name[i + 1..].to_string(),
3187 },
3188 )
3189 } else {
3190 (start, ClassUnicodeKind::Named(name.to_string()))
3191 }
3192 } else {
3193 let start = self.pos();
3194 let c = self.char();
3195 if c == '\\' {
3196 return Err(self.error(self.span_char(), ast::ErrorKind::UnicodeClassInvalid));
3197 }
3198 self.bump_and_bump_space();
3199 let kind = ClassUnicodeKind::OneLetter(c);
3200 (start, kind)
3201 };
3202 Ok(ClassUnicode {
3203 span: Span::new(start, self.pos()),
3204 negated,
3205 kind,
3206 })
3207 }
3208
3209 #[inline(never)]
3210 fn parse_perl_class(&self) -> ClassPerl {
3211 let c = self.char();
3212 let span = self.span_char();
3213 self.bump();
3214 let (negated, kind) = match c {
3215 'd' => (false, regex_syntax::ast::ClassPerlKind::Digit),
3216 'D' => (true, regex_syntax::ast::ClassPerlKind::Digit),
3217 's' => (false, regex_syntax::ast::ClassPerlKind::Space),
3218 'S' => (true, regex_syntax::ast::ClassPerlKind::Space),
3219 'w' => (false, regex_syntax::ast::ClassPerlKind::Word),
3220 'W' => (true, regex_syntax::ast::ClassPerlKind::Word),
3221 c => panic!("expected valid Perl class but got '{}'", c),
3222 };
3223 ClassPerl {
3224 span,
3225 kind,
3226 negated,
3227 }
3228 }
3229}
3230
3231fn is_universal_perl_pair(item: ®ex_syntax::ast::ClassSetItem) -> bool {
3234 use regex_syntax::ast::ClassSetItem;
3235 let items = match item {
3236 ClassSetItem::Union(u) => &u.items,
3237 _ => return false,
3238 };
3239 if items.len() != 2 {
3240 return false;
3241 }
3242 match (&items[0], &items[1]) {
3243 (ClassSetItem::Perl(a), ClassSetItem::Perl(b)) => {
3244 let is_all = a.kind == b.kind && a.negated != b.negated;
3245 is_all
3246 }
3247 _ => false,
3248 }
3249}
3250
3251pub fn max_concat_length(ast: &ast::Ast) -> usize {
3252 match ast {
3253 ast::Ast::Empty(_)
3254 | ast::Ast::Flags(_)
3255 | ast::Ast::Literal(_)
3256 | ast::Ast::Dot(_)
3257 | ast::Ast::Top(_)
3258 | ast::Ast::Assertion(_)
3259 | ast::Ast::ClassUnicode(_)
3260 | ast::Ast::ClassPerl(_)
3261 | ast::Ast::ClassBracketed(_) => 0,
3262 ast::Ast::Group(g) => max_concat_length(&g.ast),
3263 ast::Ast::Complement(c) => max_concat_length(&c.ast),
3264 ast::Ast::Lookaround(l) => max_concat_length(&l.ast),
3265 ast::Ast::Repetition(r) => max_concat_length(&r.ast),
3266 ast::Ast::Concat(c) => c
3267 .asts
3268 .len()
3269 .max(c.asts.iter().map(max_concat_length).max().unwrap_or(0)),
3270 ast::Ast::Alternation(a) => a.asts.iter().map(max_concat_length).max().unwrap_or(0),
3271 ast::Ast::Intersection(i) => i.asts.iter().map(max_concat_length).max().unwrap_or(0),
3272 }
3273}
3274
3275pub fn expanded_ast_size(ast: &ast::Ast, limit: u64) -> u64 {
3276 fn go(ast: &ast::Ast, limit: u64) -> u64 {
3277 match ast {
3278 ast::Ast::Empty(_) | ast::Ast::Flags(_) => 1,
3279 ast::Ast::Literal(_) | ast::Ast::Dot(_) | ast::Ast::Top(_) => 1,
3280 ast::Ast::Assertion(_) => 1,
3281 ast::Ast::ClassUnicode(_) | ast::Ast::ClassPerl(_) | ast::Ast::ClassBracketed(_) => 1,
3282 ast::Ast::Group(g) => go(&g.ast, limit).saturating_add(1).min(limit),
3283 ast::Ast::Complement(c) => go(&c.ast, limit).saturating_add(1).min(limit),
3284 ast::Ast::Lookaround(l) => go(&l.ast, limit).saturating_add(1).min(limit),
3285 ast::Ast::Concat(c) => sum_children(&c.asts, limit),
3286 ast::Ast::Alternation(a) => sum_children(&a.asts, limit),
3287 ast::Ast::Intersection(i) => sum_children(&i.asts, limit),
3288 ast::Ast::Repetition(r) => {
3289 let body = go(&r.ast, limit);
3290 let factor: u64 = match &r.op.kind {
3291 ast::RepetitionKind::ZeroOrOne => 2,
3292 ast::RepetitionKind::ZeroOrMore | ast::RepetitionKind::OneOrMore => 2,
3293 ast::RepetitionKind::Range(ast::RepetitionRange::Exactly(n)) => {
3294 (*n as u64).max(1)
3295 }
3296 ast::RepetitionKind::Range(ast::RepetitionRange::AtLeast(n)) => {
3297 (*n as u64).max(1).saturating_add(1)
3298 }
3299 ast::RepetitionKind::Range(ast::RepetitionRange::Bounded(_, m)) => {
3300 (*m as u64).max(1)
3301 }
3302 };
3303 body.saturating_mul(factor).min(limit)
3304 }
3305 }
3306 }
3307 fn sum_children(children: &[ast::Ast], limit: u64) -> u64 {
3308 let mut total: u64 = 0;
3309 for c in children {
3310 total = total.saturating_add(go(c, limit));
3311 if total >= limit {
3312 return limit;
3313 }
3314 }
3315 total
3316 }
3317 go(ast, limit)
3318}
3319
3320pub fn parse_ast<'s>(tb: &mut TB<'s>, pattern: &'s str) -> std::result::Result<NodeId, ParseError> {
3321 let mut p: ResharpParser<'s> = ResharpParser::new(pattern);
3322 p.parse(tb)
3323}
3324
3325pub fn parse_ast_with<'s>(
3326 tb: &mut TB<'s>,
3327 pattern: &'s str,
3328 flags: &PatternFlags,
3329) -> std::result::Result<NodeId, ParseError> {
3330 let mut p: ResharpParser<'s> = ResharpParser::with_flags(pattern, flags);
3331 p.parse(tb)
3332}
3333
3334pub fn parse_to_ast(pattern: &str) -> std::result::Result<ast::Ast, ParseError> {
3336 let mut p: ResharpParser = ResharpParser::new(pattern);
3337 p.parse_inner()
3338}