1#![warn(dead_code)]
6pub mod ast;
7use std::cell::{Cell, RefCell};
8
9use ast::{Ast, Concat, ErrorKind, GroupKind, LookaroundKind, RepetitionKind};
10use regex_syntax::{
11 ast::{
12 ClassAscii, ClassBracketed, ClassPerl, ClassSet, ClassSetBinaryOpKind, ClassSetItem,
13 ClassSetRange, ClassSetUnion, ClassUnicode, ClassUnicodeKind, ClassUnicodeOpKind,
14 HexLiteralKind, Literal, LiteralKind, Position, Span, SpecialLiteralKind,
15 },
16 hir::{
17 self,
18 translate::{Translator, TranslatorBuilder},
19 },
20 utf8::Utf8Sequences,
21};
22use resharp_algebra::{Kind, NodeId};
23
/// Crate-local shorthand for the `resharp_algebra` regex builder that the
/// node-construction helpers in this file receive as `tb`.
type TB<'s> = resharp_algebra::RegexBuilder;
25
/// Options accepted by `ResharpParser::with_flags`.
///
/// `PatternFlags::default()` supplies the baseline values.
pub struct PatternFlags {
    /// Forwarded to the translator builder's `unicode` setting.
    pub unicode: bool,
    /// Stored on the parser as `global_full_unicode`; also turns the parser's
    /// `global_unicode` on. NOTE(review): the exact effect is decided outside
    /// this part of the file.
    pub full_unicode: bool,
    /// Forwarded to the translator builder's `case_insensitive` setting.
    pub case_insensitive: bool,
    /// Initial value of the parser's `dot_all` state.
    pub dot_matches_new_line: bool,
    /// Initial value of the parser's `multiline` state.
    pub multiline: bool,
    /// Initial value of the parser's whitespace-insensitive state, in which
    /// whitespace and `#` comments are skipped between tokens.
    pub ignore_whitespace: bool,
    /// Stored on the parser as `global_ascii_perl`; also turns `global_unicode`
    /// on. Presumably restricts Perl classes to ASCII — TODO confirm.
    pub ascii_perl_classes: bool,
    /// Copied into the parser. Presumably bounds the expanded AST size — TODO confirm.
    pub expanded_ast_limit: u64,
    /// Copied into the parser. Presumably bounds list lengths — TODO confirm.
    pub max_list_len: usize,
    /// Copied into the parser. Presumably bounds repetition counts — TODO confirm.
    pub max_repeat: u32,
    /// Copied into the parser. Presumably bounds nesting depth — TODO confirm.
    pub max_depth: usize,
}
53
/// Value `PatternFlags::default()` uses for `max_repeat`.
pub const DEFAULT_MAX_REPEAT: u32 = 500;
/// Value `PatternFlags::default()` uses for `expanded_ast_limit`.
pub const DEFAULT_EXPANDED_AST_LIMIT: u64 = 50_000;
/// Value `PatternFlags::default()` uses for `max_list_len`.
pub const DEFAULT_MAX_LIST_LEN: usize = 4_000;
/// Value `PatternFlags::default()` uses for `max_depth`.
pub const DEFAULT_MAX_DEPTH: usize = 1_000;
60
61impl Default for PatternFlags {
62 fn default() -> Self {
63 Self {
64 unicode: true,
65 full_unicode: false,
66 case_insensitive: false,
67 dot_matches_new_line: false,
68 multiline: true,
69 ignore_whitespace: false,
70 ascii_perl_classes: false,
71 expanded_ast_limit: DEFAULT_EXPANDED_AST_LIMIT,
72 max_list_len: DEFAULT_MAX_LIST_LEN,
73 max_repeat: DEFAULT_MAX_REPEAT,
74 max_depth: DEFAULT_MAX_DEPTH,
75 }
76 }
77}
78
/// Classification of a pattern element with respect to word characters
/// (`[A-Za-z0-9_]`, see `is_word_byte`).
///
/// NOTE(review): only `Word`, `NonWord` and `Unknown` are produced by the
/// classifiers visible here; the meaning of the other variants is defined by
/// code outside this view.
#[derive(Clone, Copy, PartialEq, Debug)]
enum WordCharKind {
    /// Every character the element can match is a word character.
    Word,
    /// No character the element can match is a word character.
    NonWord,
    MaybeWord,
    MaybeNonWord,
    /// Nothing could be established about the element.
    Unknown,
    Edge,
}
88
/// Returns `true` when `b` is an ASCII letter, an ASCII digit, or `_`.
fn is_word_byte(b: u8) -> bool {
    matches!(b, b'_' | b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z')
}
92
93fn class_set_item_word_kind(item: ®ex_syntax::ast::ClassSetItem) -> WordCharKind {
94 use regex_syntax::ast::{ClassPerlKind, ClassSetItem};
95 use WordCharKind::*;
96 match item {
97 ClassSetItem::Empty(_) => Unknown,
98 ClassSetItem::Literal(l) => {
99 if is_word_byte(l.c as u8) {
100 Word
101 } else {
102 NonWord
103 }
104 }
105 ClassSetItem::Range(r) => {
106 let all_word = (r.start.c as u8..=r.end.c as u8).all(is_word_byte);
107 let all_non = (r.start.c as u8..=r.end.c as u8).all(|b| !is_word_byte(b));
108 if all_word {
109 Word
110 } else if all_non {
111 NonWord
112 } else {
113 Unknown
114 }
115 }
116 ClassSetItem::Perl(p) => match (&p.kind, p.negated) {
117 (ClassPerlKind::Word, false) => Word,
118 (ClassPerlKind::Word, true) => NonWord,
119 (ClassPerlKind::Space, false) => NonWord,
120 (ClassPerlKind::Digit, false) => Word,
121 _ => Unknown,
122 },
123 ClassSetItem::Bracketed(b) => class_bracketed_word_kind(b),
124 ClassSetItem::Union(u) => {
125 let mut kind = Unknown;
126 for item in &u.items {
127 let k = class_set_item_word_kind(item);
128 kind = match (kind, k) {
129 (_, Unknown) => return Unknown,
130 (Unknown, _) => k,
131 (Word, Word) => Word,
132 (NonWord, NonWord) => NonWord,
133 _ => return Unknown,
134 };
135 }
136 kind
137 }
138 _ => Unknown,
139 }
140}
141
142fn utf8_codepoint_node(tb: &mut TB<'_>) -> NodeId {
143 let ascii = tb.mk_range_u8(0, 127);
144 let beta = tb.mk_range_u8(0x80, 0xBF);
145 let c0 = tb.mk_range_u8(0xC0, 0xDF);
146 let c0s = tb.mk_concats([c0, beta].into_iter());
147 let e0 = tb.mk_range_u8(0xE0, 0xEF);
148 let e0s = tb.mk_concats([e0, beta, beta].into_iter());
149 let f0 = tb.mk_range_u8(0xF0, 0xF7);
150 let f0s = tb.mk_concats([f0, beta, beta, beta].into_iter());
151 tb.mk_unions([ascii, c0s, e0s, f0s].into_iter())
152}
153
154fn unicode_ranges_to_node(ranges: &[hir::ClassUnicodeRange], tb: &mut TB<'_>) -> NodeId {
155 let mut nodes = Vec::new();
156 for range in ranges {
157 for seq in Utf8Sequences::new(range.start(), range.end()) {
158 let sl = seq.as_slice();
159 let bytes: Vec<_> = sl.iter().map(|s| (s.start, s.end)).collect();
160 let node = match bytes.len() {
161 1 => tb.mk_range_u8(bytes[0].0, bytes[0].1),
162 n => {
163 let last = tb.mk_range_u8(bytes[n - 1].0, bytes[n - 1].1);
164 let mut conc = last;
165 for i in (0..n - 1).rev() {
166 let b = tb.mk_range_u8(bytes[i].0, bytes[i].1);
167 conc = tb.mk_concat(b, conc);
168 }
169 conc
170 }
171 };
172 nodes.push(node);
173 }
174 }
175 tb.mk_unions(nodes.into_iter())
176}
177
178fn class_bracketed_word_kind(c: ®ex_syntax::ast::ClassBracketed) -> WordCharKind {
179 use regex_syntax::ast::{ClassPerlKind, ClassSet, ClassSetItem};
180 use WordCharKind::*;
181 if c.negated {
182 return match &c.kind {
183 ClassSet::Item(ClassSetItem::Perl(p)) if p.kind == ClassPerlKind::Word => {
184 if p.negated {
185 Word
186 } else {
187 NonWord
188 }
189 }
190 _ => Unknown,
191 };
192 }
193 match &c.kind {
194 ClassSet::Item(item) => class_set_item_word_kind(item),
195 ClassSet::BinaryOp(_) => Unknown,
196 }
197}
198
199fn ascii_class_lit(span: Span, c: char) -> regex_syntax::ast::Literal {
200 regex_syntax::ast::Literal {
201 span,
202 kind: regex_syntax::ast::LiteralKind::Verbatim,
203 c,
204 }
205}
206
207fn ascii_class_range(span: Span, a: char, b: char) -> regex_syntax::ast::ClassSetItem {
208 regex_syntax::ast::ClassSetItem::Range(regex_syntax::ast::ClassSetRange {
209 span,
210 start: ascii_class_lit(span, a),
211 end: ascii_class_lit(span, b),
212 })
213}
214
215fn ascii_perl_positive(
216 span: Span,
217 kind: ®ex_syntax::ast::ClassPerlKind,
218) -> regex_syntax::ast::ClassSetItem {
219 use regex_syntax::ast::{ClassPerlKind, ClassSetItem, ClassSetUnion};
220 match kind {
221 ClassPerlKind::Digit => ascii_class_range(span, '0', '9'),
222 ClassPerlKind::Word => ClassSetItem::Union(ClassSetUnion {
223 span,
224 items: vec![
225 ascii_class_range(span, 'a', 'z'),
226 ascii_class_range(span, 'A', 'Z'),
227 ascii_class_range(span, '0', '9'),
228 ClassSetItem::Literal(ascii_class_lit(span, '_')),
229 ],
230 }),
231 ClassPerlKind::Space => ClassSetItem::Union(ClassSetUnion {
232 span,
233 items: ['\t', '\n', '\x0B', '\x0C', '\r', ' ']
234 .into_iter()
235 .map(|c| ClassSetItem::Literal(ascii_class_lit(span, c)))
236 .collect(),
237 }),
238 }
239}
240
241fn ascii_perl_set_item(
242 span: Span,
243 kind: ®ex_syntax::ast::ClassPerlKind,
244 negated: bool,
245) -> regex_syntax::ast::ClassSetItem {
246 use regex_syntax::ast::{ClassBracketed, ClassSet, ClassSetItem};
247 let positive = ascii_perl_positive(span, kind);
248 if negated {
249 ClassSetItem::Bracketed(Box::new(ClassBracketed {
250 span,
251 negated: true,
252 kind: ClassSet::Item(positive),
253 }))
254 } else {
255 positive
256 }
257}
258
259fn rewrite_ascii_perl_set(set: ®ex_syntax::ast::ClassSet) -> regex_syntax::ast::ClassSet {
260 use regex_syntax::ast::{ClassSet, ClassSetBinaryOp};
261 match set {
262 ClassSet::Item(item) => ClassSet::Item(rewrite_ascii_perl_item(item)),
263 ClassSet::BinaryOp(op) => ClassSet::BinaryOp(ClassSetBinaryOp {
264 span: op.span,
265 kind: op.kind.clone(),
266 lhs: Box::new(rewrite_ascii_perl_set(&op.lhs)),
267 rhs: Box::new(rewrite_ascii_perl_set(&op.rhs)),
268 }),
269 }
270}
271
272fn rewrite_ascii_perl_item(
273 item: ®ex_syntax::ast::ClassSetItem,
274) -> regex_syntax::ast::ClassSetItem {
275 use regex_syntax::ast::{ClassBracketed, ClassSetItem, ClassSetUnion};
276 match item {
277 ClassSetItem::Perl(p) => ascii_perl_set_item(p.span, &p.kind, p.negated),
278 ClassSetItem::Union(u) => ClassSetItem::Union(ClassSetUnion {
279 span: u.span,
280 items: u.items.iter().map(rewrite_ascii_perl_item).collect(),
281 }),
282 ClassSetItem::Bracketed(b) => ClassSetItem::Bracketed(Box::new(ClassBracketed {
283 span: b.span,
284 negated: b.negated,
285 kind: rewrite_ascii_perl_set(&b.kind),
286 })),
287 other => other.clone(),
288 }
289}
290
/// A single parsed atom, kept separate from `Ast` so it can be turned either
/// into an `Ast` node or into a character-class item/literal (see the
/// conversion methods on this type).
#[derive(Clone, Debug, Eq, PartialEq)]
enum Primitive {
    /// A literal character.
    Literal(Literal),
    /// An assertion.
    Assertion(ast::Assertion),
    /// Converted with `Ast::dot`; carries only its span.
    Dot(Span),
    /// Converted with `Ast::top`; carries only its span.
    Top(Span),
    /// A Perl character class.
    Perl(ClassPerl),
    /// A Unicode character class.
    Unicode(ClassUnicode),
}
300
301impl Primitive {
302 fn span(&self) -> &Span {
303 match *self {
304 Primitive::Literal(ref x) => &x.span,
305 Primitive::Assertion(ref x) => &x.span,
306 Primitive::Dot(ref span) => span,
307 Primitive::Top(ref span) => span,
308 Primitive::Perl(ref x) => &x.span,
309 Primitive::Unicode(ref x) => &x.span,
310 }
311 }
312
313 fn into_ast(self) -> Ast {
314 match self {
315 Primitive::Literal(lit) => Ast::literal(lit),
316 Primitive::Assertion(assert) => Ast::assertion(assert),
317 Primitive::Dot(span) => Ast::dot(span),
318 Primitive::Top(span) => Ast::top(span),
319 Primitive::Perl(cls) => Ast::class_perl(cls),
320 Primitive::Unicode(cls) => Ast::class_unicode(cls),
321 }
322 }
323
324 fn into_class_set_item(self, p: &ResharpParser) -> Result<regex_syntax::ast::ClassSetItem> {
325 use self::Primitive::*;
326 use regex_syntax::ast::ClassSetItem;
327
328 match self {
329 Literal(lit) => Ok(ClassSetItem::Literal(lit)),
330 Perl(cls) => Ok(ClassSetItem::Perl(cls)),
331 Unicode(cls) => Ok(ClassSetItem::Unicode(cls)),
332 x => Err(p.error(*x.span(), ast::ErrorKind::ClassEscapeInvalid)),
333 }
334 }
335
336 fn into_class_literal(self, p: &ResharpParser) -> Result<Literal> {
337 use self::Primitive::*;
338
339 match self {
340 Literal(lit) => Ok(lit),
341 x => Err(p.error(*x.span(), ast::ErrorKind::ClassRangeLiteral)),
342 }
343 }
344}
345
/// A value that is one of two alternatives.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum Either<Left, Right> {
    /// The first alternative.
    Left(Left),
    /// The second alternative.
    Right(Right),
}
351
/// Error produced while parsing a pattern.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct ParseError {
    /// What went wrong.
    pub kind: ErrorKind,
    /// Copy of the pattern being parsed when the error was created.
    pattern: String,
    /// Location in the pattern the error refers to.
    pub span: Span,
}
362
363impl std::fmt::Display for ParseError {
364 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
365 write!(f, "{:?}: {:?}", self.kind, self.span)
366 }
367}
368impl std::error::Error for ParseError {}
369
/// Result type used by the parser in this file; the error is always `ParseError`.
type Result<T> = core::result::Result<T, ParseError>;
371
/// An entry on the parser's group stack (`stack_group`).
#[derive(Clone, Debug)]
enum GroupState {
    /// An opened group that has not been closed yet.
    Group {
        /// The concatenation that was being built when the group was opened.
        concat: Concat,
        /// The group itself; its inner AST is filled in when it is closed.
        group: ast::Group,
        /// The whitespace-insensitive setting in effect before the group was
        /// opened, restored when the group is closed.
        ignore_whitespace: bool,
    },
    /// Branches of a `|` alternation collected so far.
    Alternation(ast::Alternation),
    /// Branches of a `&` intersection collected so far.
    Intersection(ast::Intersection),
}
386
/// An entry on the parser's character-class stack (`stack_class`).
#[derive(Clone, Debug)]
enum ClassState {
    /// An opened bracketed class that has not been closed yet.
    Open {
        /// The union of the enclosing class that was being built when this
        /// class was opened.
        union: regex_syntax::ast::ClassSetUnion,
        /// The class being parsed; its span end and contents are filled in
        /// when it is closed.
        set: regex_syntax::ast::ClassBracketed,
    },
    /// A pending binary set operation waiting for its right-hand side.
    Op {
        /// The operator.
        kind: regex_syntax::ast::ClassSetBinaryOpKind,
        /// The already-parsed left-hand side.
        lhs: regex_syntax::ast::ClassSet,
    },
}
404
/// Parser state for a single pattern string.
///
/// Methods take `&self`; the mutable parsing state therefore lives in
/// `Cell`/`RefCell` fields.
pub struct ResharpParser<'s> {
    /// Entries of (negated, Perl class kind, node). Presumably a cache of
    /// already-built nodes — confirm against `get_class`.
    perl_classes: Vec<(bool, regex_syntax::ast::ClassPerlKind, NodeId)>,
    unicode_classes: resharp_algebra::UnicodeClassCache,
    /// Translator used to lower `regex_syntax` AST fragments to HIR.
    pub translator: regex_syntax::hir::translate::Translator,
    /// The pattern being parsed.
    pub pattern: &'s str,
    /// Current position (byte offset, line, column) in `pattern`.
    pos: Cell<Position>,
    /// Index handed out to the most recent capture group; 0 before any.
    capture_index: Cell<u32>,
    octal: bool,
    empty_min_range: bool,
    /// Whether whitespace and `#` comments are currently skipped.
    ignore_whitespace: Cell<bool>,
    dot_all: Cell<bool>,
    multiline: Cell<bool>,
    global_unicode: bool,
    global_full_unicode: bool,
    global_ascii_perl: bool,
    global_case_insensitive: bool,
    expanded_ast_limit: u64,
    max_list_len: usize,
    max_repeat: u32,
    max_depth: usize,
    /// Comments collected while skipping whitespace.
    comments: RefCell<Vec<ast::Comment>>,
    /// Stack of open groups and pending alternations/intersections.
    stack_group: RefCell<Vec<GroupState>>,
    /// Stack of open character classes and pending set operations.
    stack_class: RefCell<Vec<ClassState>>,
    /// Capture names seen so far, kept sorted by name for duplicate detection.
    capture_names: RefCell<Vec<ast::CaptureName>>,
    scratch: RefCell<String>,
}
432
433fn specialize_err<T>(result: Result<T>, from: ast::ErrorKind, to: ast::ErrorKind) -> Result<T> {
434 result.map_err(|e| {
435 if e.kind == from {
436 ParseError {
437 kind: to,
438 pattern: e.pattern,
439 span: e.span,
440 }
441 } else {
442 e
443 }
444 })
445}
446
/// Returns whether `c` may appear in a capture-group name.
///
/// `_` is always allowed. The first character must otherwise be alphabetic;
/// later characters may also be alphanumeric or one of `.`, `[`, `]`.
fn is_capture_char(c: char, first: bool) -> bool {
    match c {
        '_' => true,
        '.' | '[' | ']' => !first,
        _ if first => c.is_alphabetic(),
        _ => c.is_alphanumeric(),
    }
}
454
/// Returns `true` if `c` is one of the characters treated as syntax by this
/// parser: `\ . + * ? ( ) | [ ] { } ^ $ # & - ~ _`.
pub fn is_meta_character(c: char) -> bool {
    const META_CHARACTERS: &str = "\\.+*?()|[]{}^$#&-~_";
    META_CHARACTERS.contains(c)
}
478
479pub fn escape(text: &str) -> String {
481 let mut buf = String::new();
482 escape_into(text, &mut buf);
483 buf
484}
485
486pub fn escape_into(text: &str, buf: &mut String) {
488 buf.reserve(text.len());
489 for c in text.chars() {
490 if is_meta_character(c) {
491 buf.push('\\');
492 }
493 buf.push(c);
494 }
495}
496
497pub fn is_escapeable_character(c: char) -> bool {
498 if is_meta_character(c) {
499 return true;
500 }
501 if !c.is_ascii() {
502 return false;
503 }
504 match c {
505 '0'..='9' | 'A'..='Z' | 'a'..='z' => false,
506 '<' | '>' => false,
507 _ => true,
508 }
509}
510
/// Returns `true` for ASCII hexadecimal digits (`0-9`, `a-f`, `A-F`).
fn is_hex(c: char) -> bool {
    matches!(c, '0'..='9' | 'a'..='f' | 'A'..='F')
}
514
515fn ensure_lookbehind_at_start(ast: &Ast, at_start: bool) -> core::result::Result<(), Span> {
516 match ast {
517 Ast::Concat(c) => {
518 let mut child_at_start = at_start;
519 for child in &c.asts {
520 ensure_lookbehind_at_start(child, child_at_start)?;
521 if ast_may_consume(child) {
522 child_at_start = false;
523 }
524 }
525 Ok(())
526 }
527 Ast::Alternation(a) => {
528 for child in &a.asts {
529 ensure_lookbehind_at_start(child, at_start)?;
530 }
531 Ok(())
532 }
533 Ast::Intersection(i) => {
534 for child in &i.asts {
535 ensure_lookbehind_at_start(child, at_start)?;
536 }
537 Ok(())
538 }
539 Ast::Complement(c) => ensure_lookbehind_at_start(&c.ast, at_start),
540 Ast::Group(g) => ensure_lookbehind_at_start(&g.ast, at_start),
541 Ast::Repetition(r) => ensure_lookbehind_at_start(&r.ast, at_start),
542 Ast::Lookaround(g) => {
543 match g.kind {
544 LookaroundKind::PositiveLookbehind | LookaroundKind::NegativeLookbehind => {
545 if !at_start {
546 return Err(g.span);
547 }
548 }
549 LookaroundKind::PositiveLookahead | LookaroundKind::NegativeLookahead => {}
550 }
551 ensure_lookbehind_at_start(&g.ast, true)
552 }
553 Ast::Empty(_)
554 | Ast::Flags(_)
555 | Ast::Literal(_)
556 | Ast::Dot(_)
557 | Ast::Top(_)
558 | Ast::Assertion(_)
559 | Ast::ClassUnicode(_)
560 | Ast::ClassPerl(_)
561 | Ast::ClassBracketed(_) => Ok(()),
562 }
563}
564
565fn ast_may_consume(ast: &Ast) -> bool {
566 match ast {
567 Ast::Empty(_) | Ast::Flags(_) | Ast::Assertion(_) | Ast::Lookaround(_) => false,
568 Ast::Literal(_)
569 | Ast::Dot(_)
570 | Ast::Top(_)
571 | Ast::ClassUnicode(_)
572 | Ast::ClassPerl(_)
573 | Ast::ClassBracketed(_) => true,
574 Ast::Group(g) => ast_may_consume(&g.ast),
575 Ast::Repetition(r) => {
576 if !ast_may_consume(&r.ast) {
577 return false;
578 }
579 match r.op.kind {
580 RepetitionKind::ZeroOrOne
581 | RepetitionKind::ZeroOrMore
582 | RepetitionKind::OneOrMore => true,
583 RepetitionKind::Range(ast::RepetitionRange::Exactly(0)) => false,
584 RepetitionKind::Range(ast::RepetitionRange::Bounded(_, 0)) => false,
585 RepetitionKind::Range(_) => true,
586 }
587 }
588 Ast::Alternation(a) => a.asts.iter().any(ast_may_consume),
589 Ast::Intersection(i) => i.asts.iter().any(ast_may_consume),
590 Ast::Complement(_) => true,
591 Ast::Concat(c) => c.asts.iter().any(ast_may_consume),
592 }
593}
594
595impl<'s> ResharpParser<'s> {
596 fn default_translator_builder(&self) -> TranslatorBuilder {
597 let mut trb = TranslatorBuilder::new();
598 trb.unicode(self.global_unicode);
599 trb.utf8(false);
600 trb.case_insensitive(self.global_case_insensitive);
601 trb
602 }
603
604 pub fn new(pattern: &'s str) -> Self {
605 Self::with_flags(pattern, &PatternFlags::default())
606 }
607
608 pub fn with_flags(pattern: &'s str, flags: &PatternFlags) -> Self {
609 let mut trb = TranslatorBuilder::new();
610 trb.unicode(flags.unicode);
611 trb.utf8(false);
612 trb.case_insensitive(flags.case_insensitive);
613 Self {
614 translator: trb.build(),
615 pattern,
616 perl_classes: vec![],
617 unicode_classes: resharp_algebra::UnicodeClassCache::default(),
618 pos: Cell::new(Position::new(0, 0, 0)),
619 capture_index: Cell::new(0),
620 octal: false,
621 empty_min_range: false,
622 ignore_whitespace: Cell::new(flags.ignore_whitespace),
623 dot_all: Cell::new(flags.dot_matches_new_line),
624 multiline: Cell::new(flags.multiline),
625 global_unicode: flags.unicode || flags.full_unicode || flags.ascii_perl_classes,
626 global_full_unicode: flags.full_unicode,
627 global_ascii_perl: flags.ascii_perl_classes,
628 global_case_insensitive: flags.case_insensitive,
629 expanded_ast_limit: flags.expanded_ast_limit,
630 max_list_len: flags.max_list_len,
631 max_repeat: flags.max_repeat,
632 max_depth: flags.max_depth,
633 comments: RefCell::new(vec![]),
634 stack_group: RefCell::new(vec![]),
635 stack_class: RefCell::new(vec![]),
636 capture_names: RefCell::new(vec![]),
637 scratch: RefCell::new(String::new()),
638 }
639 }
640
    /// Returns `self`; the other methods reach the parser state through this
    /// accessor.
    fn parser(&'_ self) -> &'_ ResharpParser<'_> {
        self
    }
644
    /// Returns the pattern string being parsed.
    fn pattern(&self) -> &str {
        self.pattern
    }
648
649 fn error(&self, span: Span, kind: ast::ErrorKind) -> ParseError {
650 ParseError {
651 kind,
652 pattern: self.pattern().to_string(),
653 span,
654 }
655 }
656
657 fn unsupported_error(&self, _: regex_syntax::hir::Error) -> ParseError {
658 self.error(
659 Span::splat(self.pos()),
660 ast::ErrorKind::UnsupportedResharpRegex,
661 )
662 }
663
664 fn offset(&self) -> usize {
665 self.parser().pos.get().offset
666 }
667
668 fn line(&self) -> usize {
669 self.parser().pos.get().line
670 }
671
672 fn column(&self) -> usize {
673 self.parser().pos.get().column
674 }
675
676 fn next_capture_index(&self, span: Span) -> Result<u32> {
677 let current = self.parser().capture_index.get();
678 let i = current
679 .checked_add(1)
680 .ok_or_else(|| self.error(span, ast::ErrorKind::CaptureLimitExceeded))?;
681 self.parser().capture_index.set(i);
682 Ok(i)
683 }
684
685 fn add_capture_name(&self, cap: &ast::CaptureName) -> Result<()> {
686 let mut names = self.parser().capture_names.borrow_mut();
687 match names.binary_search_by_key(&cap.name.as_str(), |c| c.name.as_str()) {
688 Err(i) => {
689 names.insert(i, cap.clone());
690 Ok(())
691 }
692 Ok(i) => Err(self.error(
693 cap.span,
694 ast::ErrorKind::GroupNameDuplicate {
695 original: names[i].span,
696 },
697 )),
698 }
699 }
700
    /// Whether whitespace-insensitive mode is currently active.
    fn ignore_whitespace(&self) -> bool {
        self.parser().ignore_whitespace.get()
    }
704
    /// Returns the character at the current offset.
    ///
    /// Panics (via `char_at`) if the parser is at the end of the pattern.
    fn char(&self) -> char {
        self.char_at(self.offset())
    }
708
709 fn char_at(&self, i: usize) -> char {
710 self.pattern()[i..]
711 .chars()
712 .next()
713 .unwrap_or_else(|| panic!("expected char at offset {}", i))
714 }
715
716 fn bump(&self) -> bool {
717 if self.is_eof() {
718 return false;
719 }
720 let Position {
721 mut offset,
722 mut line,
723 mut column,
724 } = self.pos();
725 if self.char() == '\n' {
726 line = line.checked_add(1).unwrap();
727 column = 1;
728 } else {
729 column = column.checked_add(1).unwrap();
730 }
731 offset += self.char().len_utf8();
732 self.parser().pos.set(Position {
733 offset,
734 line,
735 column,
736 });
737 self.pattern()[self.offset()..].chars().next().is_some()
738 }
739
740 fn bump_if(&self, prefix: &str) -> bool {
741 if self.pattern()[self.offset()..].starts_with(prefix) {
742 for _ in 0..prefix.chars().count() {
743 self.bump();
744 }
745 true
746 } else {
747 false
748 }
749 }
750
751 fn is_lookaround_prefix(&self) -> Option<(bool, bool)> {
752 if self.bump_if("?=") {
753 return Some((true, true));
754 }
755 if self.bump_if("?!") {
756 return Some((true, false));
757 }
758 if self.bump_if("?<=") {
759 return Some((false, true));
760 }
761 if self.bump_if("?<!") {
762 return Some((false, false));
763 }
764 None
765 }
766
767 fn bump_and_bump_space(&self) -> bool {
768 if !self.bump() {
769 return false;
770 }
771 self.bump_space();
772 !self.is_eof()
773 }
774
775 fn bump_space(&self) {
776 if !self.ignore_whitespace() {
777 return;
778 }
779 while !self.is_eof() {
780 if self.char().is_whitespace() {
781 self.bump();
782 } else if self.char() == '#' {
783 let start = self.pos();
784 let mut comment_text = String::new();
785 self.bump();
786 while !self.is_eof() {
787 let c = self.char();
788 self.bump();
789 if c == '\n' {
790 break;
791 }
792 comment_text.push(c);
793 }
794 let comment = ast::Comment {
795 span: Span::new(start, self.pos()),
796 comment: comment_text,
797 };
798 self.parser().comments.borrow_mut().push(comment);
799 } else {
800 break;
801 }
802 }
803 }
804
805 fn peek(&self) -> Option<char> {
806 if self.is_eof() {
807 return None;
808 }
809 self.pattern()[self.offset() + self.char().len_utf8()..]
810 .chars()
811 .next()
812 }
813
814 fn peek_space(&self) -> Option<char> {
817 if !self.ignore_whitespace() {
818 return self.peek();
819 }
820 if self.is_eof() {
821 return None;
822 }
823 let mut start = self.offset() + self.char().len_utf8();
824 let mut in_comment = false;
825 for (i, c) in self.pattern()[start..].char_indices() {
826 if c.is_whitespace() {
827 continue;
828 } else if !in_comment && c == '#' {
829 in_comment = true;
830 } else if in_comment && c == '\n' {
831 in_comment = false;
832 } else {
833 start += i;
834 break;
835 }
836 }
837 self.pattern()[start..].chars().next()
838 }
839
    /// Whether the current offset is at the end of the pattern.
    fn is_eof(&self) -> bool {
        self.offset() == self.pattern().len()
    }
843
    /// Returns a copy of the current position.
    fn pos(&self) -> Position {
        self.parser().pos.get()
    }
847
848 fn span(&self) -> Span {
849 Span::splat(self.pos())
850 }
851
852 fn span_char(&self) -> Span {
853 let mut next = Position {
854 offset: self.offset().checked_add(self.char().len_utf8()).unwrap(),
855 line: self.line(),
856 column: self.column().checked_add(1).unwrap(),
857 };
858 if self.char() == '\n' {
859 next.line += 1;
860 next.column = 1;
861 }
862 Span::new(self.pos(), next)
863 }
864
865 #[inline(never)]
866 fn push_alternate(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
867 assert_eq!(self.char(), '|');
868 concat.span.end = self.pos();
869 self.push_or_add_alternation(concat);
870 self.bump();
871 Ok(ast::Concat {
872 span: self.span(),
873 asts: vec![],
874 })
875 }
876
877 fn push_or_add_alternation(&self, concat: Concat) {
878 use self::GroupState::*;
879
880 let mut stack = self.parser().stack_group.borrow_mut();
881 if let Some(&mut Alternation(ref mut alts)) = stack.last_mut() {
882 alts.asts.push(concat.into_ast());
883 return;
884 }
885 stack.push(Alternation(ast::Alternation {
886 span: Span::new(concat.span.start, self.pos()),
887 asts: vec![concat.into_ast()],
888 }));
889 }
890
891 #[inline(never)]
892 fn push_intersect(&self, mut concat: Concat) -> Result<Concat> {
893 assert_eq!(self.char(), '&');
894 concat.span.end = self.pos();
895 self.push_or_add_intersect(concat);
896 self.bump();
897 Ok(Concat {
898 span: self.span(),
899 asts: vec![],
900 })
901 }
902
903 fn push_or_add_intersect(&self, concat: Concat) {
904 use self::GroupState::*;
905
906 let mut stack = self.parser().stack_group.borrow_mut();
907 if let Some(&mut Intersection(ref mut alts)) = stack.last_mut() {
908 alts.asts.push(concat.into_ast());
909 return;
910 }
911 stack.push(Intersection(ast::Intersection {
912 span: Span::new(concat.span.start, self.pos()),
913 asts: vec![concat.into_ast()],
914 }));
915 }
916
917 #[inline(never)]
918 fn push_group(&self, mut concat: Concat) -> Result<Concat> {
919 assert_eq!(self.char(), '(');
920 match self.parse_group()? {
921 Either::Left(set) => {
922 let ignore = set.flags.flag_state(ast::Flag::IgnoreWhitespace);
923 if let Some(v) = ignore {
924 self.parser().ignore_whitespace.set(v);
925 }
926
927 concat.asts.push(Ast::flags(set));
928 Ok(concat)
929 }
930 Either::Right(group) => {
931 let old_ignore_whitespace = self.ignore_whitespace();
932 let new_ignore_whitespace = group
933 .flags()
934 .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
935 .unwrap_or(old_ignore_whitespace);
936 self.parser()
937 .stack_group
938 .borrow_mut()
939 .push(GroupState::Group {
940 concat,
941 group,
942 ignore_whitespace: old_ignore_whitespace,
943 });
944 self.parser().ignore_whitespace.set(new_ignore_whitespace);
945 Ok(Concat {
946 span: self.span(),
947 asts: vec![],
948 })
949 }
950 }
951 }
952
953 #[inline(never)]
954 fn push_compl_group(&self, concat: Concat) -> Result<Concat> {
955 assert_eq!(self.char(), '~');
956 self.bump();
957 if self.is_eof() || self.char() != '(' {
958 return Err(self.error(self.span(), ast::ErrorKind::ComplementGroupExpected));
959 }
960 let open_span = self.span_char();
961 self.bump();
962 let group = ast::Group {
963 span: open_span,
964 kind: ast::GroupKind::Complement,
965 ast: Box::new(Ast::empty(self.span())),
966 };
967
968 let old_ignore_whitespace = self.ignore_whitespace();
969 let new_ignore_whitespace = group
970 .flags()
971 .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
972 .unwrap_or(old_ignore_whitespace);
973 self.parser()
974 .stack_group
975 .borrow_mut()
976 .push(GroupState::Group {
977 concat,
978 group,
979 ignore_whitespace: old_ignore_whitespace,
980 });
981 self.parser().ignore_whitespace.set(new_ignore_whitespace);
982 Ok(Concat {
983 span: self.span(),
984 asts: vec![],
985 })
986 }
987
    /// Handles `)`: closes the innermost open group.
    ///
    /// `group_concat` is the concatenation parsed since the group was opened
    /// (or since the last `|` / `&` inside it). The matching `Group` entry is
    /// popped from the group stack — together with a pending alternation or
    /// intersection sitting above it, if any — the group's inner AST is
    /// assembled, and the result is appended to the concatenation that was
    /// active before the group opened, which is returned.
    ///
    /// Fails with `GroupUnopened` if there is no matching open group.
    #[inline(never)]
    fn pop_group(&self, mut group_concat: Concat) -> Result<Concat> {
        use self::GroupState::*;
        assert_eq!(self.char(), ')');
        let mut stack = self.parser().stack_group.borrow_mut();
        let topstack = stack.pop();

        // Find the group being closed. An alternation or intersection may sit
        // directly above it; anything else means the `)` has no opener.
        let (mut prior_concat, mut group, ignore_whitespace, alt) = match topstack {
            Some(Group {
                concat,
                group,
                ignore_whitespace,
            }) => (concat, group, ignore_whitespace, None),
            Some(Alternation(alt)) => match stack.pop() {
                Some(Group {
                    concat,
                    group,
                    ignore_whitespace,
                }) => (
                    concat,
                    group,
                    ignore_whitespace,
                    Some(Either::Left::<ast::Alternation, ast::Intersection>(alt)),
                ),
                None | Some(Alternation(_)) | Some(Intersection(_)) => {
                    return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
                }
            },
            Some(Intersection(int)) => match stack.pop() {
                Some(Group {
                    concat,
                    group,
                    ignore_whitespace,
                }) => (
                    concat,
                    group,
                    ignore_whitespace,
                    Some(Either::Right::<ast::Alternation, ast::Intersection>(int)),
                ),
                None | Some(Alternation(_)) | Some(Intersection(_)) => {
                    return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
                }
            },

            None => {
                return Err(self.error(self.span_char(), ast::ErrorKind::GroupUnopened));
            }
        };
        // Restore the whitespace mode that was active outside the group.
        self.parser().ignore_whitespace.set(ignore_whitespace);
        // The body ends before `)`; the group itself ends after it.
        group_concat.span.end = self.pos();
        self.bump();
        group.span.end = self.pos();
        match alt {
            Some(Either::Left(mut alt)) => {
                // The trailing concatenation is the last alternation branch.
                alt.span.end = group_concat.span.end;
                alt.asts.push(group_concat.into_ast());
                group.ast = Box::new(alt.into_ast());
            }
            Some(Either::Right(mut int)) => {
                // The trailing concatenation is the last intersection operand.
                int.span.end = group_concat.span.end;
                int.asts.push(group_concat.into_ast());
                group.ast = Box::new(int.into_ast());
            }
            None => {
                group.ast = Box::new(group_concat.into_ast());
            }
        }

        if group.kind == GroupKind::Complement {
            // A complement group is emitted as a complement node around its
            // body rather than as a group node.
            // NOTE(review): the span is an empty span at the position after
            // `)`, not the group's span — confirm this is intended.
            let complement = ast::Complement {
                span: self.span(),
                ast: group.ast,
            };
            prior_concat.asts.push(Ast::complement(complement));
        } else {
            prior_concat.asts.push(Ast::group(group));
        }
        Ok(prior_concat)
    }
1069
1070 #[inline(never)]
1071 fn pop_group_end(&self, mut concat: ast::Concat) -> Result<Ast> {
1072 concat.span.end = self.pos();
1073 let mut stack = self.parser().stack_group.borrow_mut();
1074 let ast = match stack.pop() {
1075 None => Ok(concat.into_ast()),
1076 Some(GroupState::Alternation(mut alt)) => {
1077 alt.span.end = self.pos();
1078 alt.asts.push(concat.into_ast());
1079 Ok(Ast::alternation(alt))
1080 }
1081 Some(GroupState::Intersection(mut int)) => {
1082 int.span.end = self.pos();
1083 int.asts.push(concat.into_ast());
1084
1085 Ok(Ast::intersection(int))
1086 }
1087 Some(GroupState::Group { group, .. }) => {
1088 return Err(self.error(group.span, ast::ErrorKind::GroupUnclosed));
1089 }
1090 };
1091 match stack.pop() {
1093 None => ast,
1094 Some(GroupState::Alternation(alt)) => {
1095 Err(self.error(alt.span, ast::ErrorKind::UnsupportedResharpRegex))
1096 }
1097 Some(GroupState::Intersection(int)) => {
1098 Err(self.error(int.span, ast::ErrorKind::UnsupportedResharpRegex))
1099 }
1100 Some(GroupState::Group { group, .. }) => {
1101 Err(self.error(group.span, ast::ErrorKind::GroupUnclosed))
1102 }
1103 }
1104 }
1105
1106 #[inline(never)]
1107 fn push_class_open(
1108 &self,
1109 parent_union: regex_syntax::ast::ClassSetUnion,
1110 ) -> Result<regex_syntax::ast::ClassSetUnion> {
1111 assert_eq!(self.char(), '[');
1112
1113 let (nested_set, nested_union) = self.parse_set_class_open()?;
1114 self.parser()
1115 .stack_class
1116 .borrow_mut()
1117 .push(ClassState::Open {
1118 union: parent_union,
1119 set: nested_set,
1120 });
1121 Ok(nested_union)
1122 }
1123
1124 #[inline(never)]
1125 fn pop_class(
1126 &self,
1127 nested_union: regex_syntax::ast::ClassSetUnion,
1128 ) -> Result<Either<regex_syntax::ast::ClassSetUnion, regex_syntax::ast::ClassBracketed>> {
1129 assert_eq!(self.char(), ']');
1130
1131 let item = regex_syntax::ast::ClassSet::Item(nested_union.into_item());
1132 let prevset = self.pop_class_op(item);
1133 let mut stack = self.parser().stack_class.borrow_mut();
1134 match stack.pop() {
1135 None => panic!("unexpected empty character class stack"),
1136 Some(ClassState::Op { .. }) => panic!("unexpected ClassState::Op"),
1137 Some(ClassState::Open { mut union, mut set }) => {
1138 self.bump();
1139 set.span.end = self.pos();
1140 set.kind = prevset;
1141 if stack.is_empty() {
1142 Ok(Either::Right(set))
1143 } else {
1144 union.push(regex_syntax::ast::ClassSetItem::Bracketed(Box::new(set)));
1145 Ok(Either::Left(union))
1146 }
1147 }
1148 }
1149 }
1150
1151 #[inline(never)]
1152 fn unclosed_class_error(&self) -> ParseError {
1153 for state in self.parser().stack_class.borrow().iter().rev() {
1154 if let ClassState::Open { ref set, .. } = *state {
1155 return self.error(set.span, ast::ErrorKind::ClassUnclosed);
1156 }
1157 }
1158 panic!("no open character class found")
1159 }
1160
1161 #[inline(never)]
1162 fn push_class_op(
1163 &self,
1164 next_kind: regex_syntax::ast::ClassSetBinaryOpKind,
1165 next_union: regex_syntax::ast::ClassSetUnion,
1166 ) -> regex_syntax::ast::ClassSetUnion {
1167 let item = regex_syntax::ast::ClassSet::Item(next_union.into_item());
1168 let new_lhs = self.pop_class_op(item);
1169 self.parser().stack_class.borrow_mut().push(ClassState::Op {
1170 kind: next_kind,
1171 lhs: new_lhs,
1172 });
1173 regex_syntax::ast::ClassSetUnion {
1174 span: self.span(),
1175 items: vec![],
1176 }
1177 }
1178
1179 #[inline(never)]
1180 fn pop_class_op(&self, rhs: regex_syntax::ast::ClassSet) -> regex_syntax::ast::ClassSet {
1181 let mut stack = self.parser().stack_class.borrow_mut();
1182 let (kind, lhs) = match stack.pop() {
1183 Some(ClassState::Op { kind, lhs }) => (kind, lhs),
1184 Some(state @ ClassState::Open { .. }) => {
1185 stack.push(state);
1186 return rhs;
1187 }
1188 None => unreachable!(),
1189 };
1190 let span = Span::new(lhs.span().start, rhs.span().end);
1191 regex_syntax::ast::ClassSet::BinaryOp(regex_syntax::ast::ClassSetBinaryOp {
1192 span,
1193 kind,
1194 lhs: Box::new(lhs),
1195 rhs: Box::new(rhs),
1196 })
1197 }
1198
    /// Returns the node built by `utf8_codepoint_node` for `tb`.
    fn any_codepoint_node(&self, tb: &mut TB<'_>) -> NodeId {
        utf8_codepoint_node(tb)
    }
1202
1203 fn hir_to_node_id(&self, hir: &hir::Hir, tb: &mut TB<'s>) -> Result<NodeId> {
1204 match hir.kind() {
1205 hir::HirKind::Empty => Ok(NodeId::EPS),
1206 hir::HirKind::Literal(l) => {
1207 if l.0.len() == 1 {
1208 let node = tb.mk_u8(l.0[0]);
1209 Ok(node)
1210 } else {
1211 let ws: Vec<_> = l.0.iter().map(|l| tb.mk_u8(*l)).collect();
1212 let conc = tb.mk_concats(ws.iter().copied());
1213 Ok(conc)
1214 }
1215 }
1216 hir::HirKind::Class(class) => match class {
1217 hir::Class::Unicode(class_unicode) => {
1218 Ok(unicode_ranges_to_node(class_unicode.ranges(), tb))
1219 }
1220 hir::Class::Bytes(class_bytes) => {
1221 let ranges = class_bytes.ranges();
1222 let mut result = NodeId::BOT;
1223 for range in ranges {
1224 let start = range.start();
1225 let end = range.end();
1226 let node = tb.mk_range_u8(start, end);
1227 result = tb.mk_union(result, node);
1228 }
1229 Ok(result)
1230 }
1231 },
1232 hir::HirKind::Look(_) => Err(self.error(
1233 Span::splat(self.pos()),
1234 ast::ErrorKind::UnsupportedResharpRegex,
1235 )),
1236 hir::HirKind::Repetition(_) => Err(self.error(
1237 Span::splat(self.pos()),
1238 ast::ErrorKind::UnsupportedResharpRegex,
1239 )),
1240 hir::HirKind::Capture(_) => Err(self.error(
1241 Span::splat(self.pos()),
1242 ast::ErrorKind::UnsupportedResharpRegex,
1243 )),
1244 hir::HirKind::Concat(body) => {
1245 let mut result = NodeId::EPS;
1246 for child in body {
1247 let node = self.hir_to_node_id(child, tb)?;
1248 result = tb.mk_concat(result, node);
1249 }
1250 Ok(result)
1251 }
1252 hir::HirKind::Alternation(_) => Err(self.error(
1253 Span::splat(self.pos()),
1254 ast::ErrorKind::UnsupportedResharpRegex,
1255 )),
1256 }
1257 }
1258
1259 fn translate_ast_to_hir(
1260 &mut self,
1261 orig_ast: ®ex_syntax::ast::Ast,
1262 tb: &mut TB<'s>,
1263 ) -> Result<NodeId> {
1264 match self.translator.translate("", orig_ast) {
1265 Err(_) => Err(self.error(self.span(), ast::ErrorKind::UnicodeClassInvalid)),
1266 Ok(hir) => self.hir_to_node_id(&hir, tb),
1267 }
1268 }
1269
1270 fn translator_to_node_id(
1271 &mut self,
1272 orig_ast: ®ex_syntax::ast::Ast,
1273 translator: &mut Option<Translator>,
1274 tb: &mut TB<'s>,
1275 ) -> Result<NodeId> {
1276 match translator {
1277 Some(tr) => {
1278 let hir = tr
1279 .translate("", orig_ast)
1280 .map_err(|e| self.unsupported_error(e))?;
1281 self.hir_to_node_id(&hir, tb)
1282 }
1283 None => self.translate_ast_to_hir(orig_ast, tb),
1284 }
1285 }
1286
1287 fn get_class(
1288 &mut self,
1289 negated: bool,
1290 kind: regex_syntax::ast::ClassPerlKind,
1291 tb: &mut TB<'s>,
1292 ) -> Result<NodeId> {
1293 let w = self
1294 .perl_classes
1295 .iter()
1296 .find(|(c_neg, c_kind, _)| *c_kind == kind && *c_neg == negated);
1297 match w {
1298 Some((_, _, value)) => Ok(*value),
1299 None => {
1300 let translated = if self.global_ascii_perl {
1301 let pos = match kind {
1302 regex_syntax::ast::ClassPerlKind::Word => {
1303 let az = tb.mk_range_u8(b'a', b'z');
1304 let big = tb.mk_range_u8(b'A', b'Z');
1305 let dig = tb.mk_range_u8(b'0', b'9');
1306 let us = tb.mk_u8(b'_');
1307 tb.mk_unions([az, big, dig, us].into_iter())
1308 }
1309 regex_syntax::ast::ClassPerlKind::Digit => tb.mk_range_u8(b'0', b'9'),
1310 regex_syntax::ast::ClassPerlKind::Space => {
1311 let sp = tb.mk_u8(b' ');
1312 let tab = tb.mk_u8(b'\t');
1313 let nl = tb.mk_u8(b'\n');
1314 let cr = tb.mk_u8(b'\r');
1315 let ff = tb.mk_u8(0x0C);
1316 let vt = tb.mk_u8(0x0B);
1317 tb.mk_unions([sp, tab, nl, cr, ff, vt].into_iter())
1318 }
1319 };
1320 if negated {
1321 resharp_algebra::neg_class(tb, pos)
1322 } else {
1323 pos
1324 }
1325 } else if self.global_unicode {
1326 match kind {
1327 regex_syntax::ast::ClassPerlKind::Word => {
1328 if self.global_full_unicode {
1329 self.unicode_classes.ensure_word_full(tb);
1330 } else {
1331 self.unicode_classes.ensure_word(tb);
1332 }
1333 if negated {
1334 self.unicode_classes.non_word
1335 } else {
1336 self.unicode_classes.word
1337 }
1338 }
1339 regex_syntax::ast::ClassPerlKind::Digit => {
1340 if self.global_full_unicode {
1341 self.unicode_classes.ensure_digit_full(tb);
1342 } else {
1343 self.unicode_classes.ensure_digit(tb);
1344 }
1345 if negated {
1346 self.unicode_classes.non_digit
1347 } else {
1348 self.unicode_classes.digit
1349 }
1350 }
1351 regex_syntax::ast::ClassPerlKind::Space => {
1352 if self.global_full_unicode {
1353 self.unicode_classes.ensure_space_full(tb);
1354 } else {
1355 self.unicode_classes.ensure_space(tb);
1356 }
1357 if negated {
1358 self.unicode_classes.non_space
1359 } else {
1360 self.unicode_classes.space
1361 }
1362 }
1363 }
1364 } else {
1365 let pos = match kind {
1366 regex_syntax::ast::ClassPerlKind::Word => {
1367 let az = tb.mk_range_u8(b'a', b'z');
1368 let big = tb.mk_range_u8(b'A', b'Z');
1369 let dig = tb.mk_range_u8(b'0', b'9');
1370 let us = tb.mk_u8(b'_');
1371 tb.mk_unions([az, big, dig, us].into_iter())
1372 }
1373 regex_syntax::ast::ClassPerlKind::Digit => tb.mk_range_u8(b'0', b'9'),
1374 regex_syntax::ast::ClassPerlKind::Space => {
1375 let sp = tb.mk_u8(b' ');
1376 let tab = tb.mk_u8(b'\t');
1377 let nl = tb.mk_u8(b'\n');
1378 let cr = tb.mk_u8(b'\r');
1379 let ff = tb.mk_u8(0x0C);
1380 let vt = tb.mk_u8(0x0B);
1381 tb.mk_unions([sp, tab, nl, cr, ff, vt].into_iter())
1382 }
1383 };
1384 if negated {
1385 resharp_algebra::neg_class(tb, pos)
1387 } else {
1388 pos
1389 }
1390 };
1391 self.perl_classes.push((negated, kind, translated));
1392 Ok(translated)
1393 }
1394 }
1395 }
1396
1397 fn word_char_kind(ast: &Ast, left: bool) -> WordCharKind {
1398 use WordCharKind::*;
1399 match ast {
1400 Ast::Literal(lit) => {
1401 if is_word_byte(lit.c as u8) {
1402 Word
1403 } else {
1404 NonWord
1405 }
1406 }
1407 Ast::ClassPerl(c) => match (&c.kind, c.negated) {
1408 (®ex_syntax::ast::ClassPerlKind::Word, false) => Word,
1409 (®ex_syntax::ast::ClassPerlKind::Word, true) => NonWord,
1410 (®ex_syntax::ast::ClassPerlKind::Space, false) => NonWord,
1411 (®ex_syntax::ast::ClassPerlKind::Space, true) => Unknown,
1412 (®ex_syntax::ast::ClassPerlKind::Digit, false) => Word,
1413 (®ex_syntax::ast::ClassPerlKind::Digit, true) => Unknown,
1414 },
1415 Ast::ClassBracketed(c) => class_bracketed_word_kind(c),
1416 Ast::Dot(_) | Ast::Top(_) => Unknown,
1417 Ast::Group(g) => Self::word_char_kind(&g.ast, left),
1418 Ast::Concat(c) if !c.asts.is_empty() => {
1419 let dir: isize = if left { -1 } else { 1 };
1420 let edge = match Self::concat_edge_index(&c.asts, left) {
1421 Some(e) => e,
1422 None => return Unknown,
1423 };
1424 let kind = Self::word_char_kind(&c.asts[edge], left);
1425 match kind {
1426 MaybeWord => {
1427 match Self::concat_neighbor_kind(&c.asts, edge, dir) {
1428 Word => Word,
1429 Edge => MaybeWord,
1430 _ => Unknown,
1431 }
1432 }
1433 MaybeNonWord => {
1434 match Self::concat_neighbor_kind(&c.asts, edge, dir) {
1435 NonWord => NonWord,
1436 Edge => MaybeNonWord,
1437 _ => Unknown,
1438 }
1439 }
1440 other => other,
1441 }
1442 }
1443 Ast::Alternation(alt) if !alt.asts.is_empty() => {
1444 let first = Self::word_char_kind(&alt.asts[0], left);
1445 if alt.asts[1..]
1446 .iter()
1447 .all(|a| Self::word_char_kind(a, left) == first)
1448 {
1449 first
1450 } else {
1451 Unknown
1452 }
1453 }
1454 Ast::Repetition(r) => {
1455 let inner = Self::word_char_kind(&r.ast, left);
1456 let nullable = matches!(
1457 &r.op.kind,
1458 ast::RepetitionKind::ZeroOrMore
1459 | ast::RepetitionKind::ZeroOrOne
1460 | ast::RepetitionKind::Range(
1461 ast::RepetitionRange::Bounded(0, _)
1462 | ast::RepetitionRange::Exactly(0)
1463 )
1464 );
1465 if nullable {
1466 match inner {
1467 Word => MaybeWord,
1468 NonWord => MaybeNonWord,
1469 _ => Unknown,
1470 }
1471 } else {
1472 inner
1473 }
1474 }
1475 Ast::Lookaround(la) => match la.kind {
1476 ast::LookaroundKind::PositiveLookahead
1477 | ast::LookaroundKind::PositiveLookbehind => Self::word_char_kind(&la.ast, left),
1478 _ => Unknown,
1479 },
1480 Ast::Assertion(a) => match (&a.kind, left) {
1481 (ast::AssertionKind::EndText, false) => NonWord,
1482 (ast::AssertionKind::StartText, true) => NonWord,
1483 _ => Unknown,
1484 },
1485 _ => Unknown,
1486 }
1487 }
1488
1489 fn edge_class_ast(ast: &Ast, left: bool) -> Option<&Ast> {
1491 match ast {
1492 Ast::Literal(_)
1493 | Ast::ClassPerl(_)
1494 | Ast::ClassBracketed(_)
1495 | Ast::ClassUnicode(_)
1496 | Ast::Dot(_)
1497 | Ast::Top(_) => Some(ast),
1498 Ast::Group(g) => Self::edge_class_ast(&g.ast, left),
1499 Ast::Concat(c) if !c.asts.is_empty() => {
1500 Self::concat_edge_index(&c.asts, left)
1501 .and_then(|e| Self::edge_class_ast(&c.asts[e], left))
1502 }
1503 Ast::Repetition(r) => {
1504 let nullable = matches!(
1505 &r.op.kind,
1506 ast::RepetitionKind::ZeroOrMore
1507 | ast::RepetitionKind::ZeroOrOne
1508 | ast::RepetitionKind::Range(
1509 ast::RepetitionRange::Bounded(0, _)
1510 | ast::RepetitionRange::Exactly(0)
1511 )
1512 );
1513 if nullable {
1514 None
1515 } else {
1516 Self::edge_class_ast(&r.ast, left)
1517 }
1518 }
1519 _ => None,
1520 }
1521 }
1522
1523 fn resolve_word_kind(
1524 &mut self,
1525 asts: &[Ast],
1526 idx: usize,
1527 dir: isize,
1528 translator: &mut Option<Translator>,
1529 tb: &mut TB<'s>,
1530 word_id: NodeId,
1531 not_word_id: NodeId,
1532 ) -> Result<WordCharKind> {
1533 use WordCharKind::*;
1534 let fast = Self::concat_neighbor_kind(asts, idx, dir);
1535 if fast != Unknown {
1536 return Ok(fast);
1537 }
1538 let neighbor_idx = (idx as isize + dir) as usize;
1539 let node = if let Some(edge) = Self::edge_class_ast(&asts[neighbor_idx], dir < 0) {
1540 self.ast_to_node_id(edge, translator, tb)?
1541 } else if dir > 0 {
1542 let mut bodies: Vec<NodeId> = vec![];
1543 let mut j = neighbor_idx;
1544 while j < asts.len() {
1545 match &asts[j] {
1546 Ast::Lookaround(la) => {
1547 let kind = la.kind.clone();
1548 let lookbehind = matches!(
1549 kind,
1550 ast::LookaroundKind::PositiveLookbehind
1551 | ast::LookaroundKind::NegativeLookbehind
1552 );
1553 if lookbehind {
1554 j += 1;
1555 continue;
1556 }
1557 let body = self.ast_to_node_id(&la.ast, translator, tb)?;
1558 let body = tb.try_elim_lookarounds(body).ok_or_else(|| {
1559 self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex)
1560 })?;
1561 let body_ts = tb.mk_concat(body, NodeId::TS);
1562 let constraint = match kind {
1563 ast::LookaroundKind::PositiveLookahead => body_ts,
1564 ast::LookaroundKind::NegativeLookahead => tb.mk_compl(body_ts),
1565 _ => unreachable!(),
1566 };
1567 bodies.push(constraint);
1568 j += 1;
1569 }
1570 other => {
1571 let n = self.ast_to_node_id(other, translator, tb)?;
1572 let n = tb.try_elim_lookarounds(n).ok_or_else(|| {
1573 self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex)
1574 })?;
1575 bodies.push(tb.mk_concat(n, NodeId::TS));
1576 break;
1577 }
1578 }
1579 }
1580 if bodies.is_empty() {
1581 return Ok(Unknown);
1582 }
1583 let combined = tb.mk_inters(bodies.into_iter());
1584 let word_prefix = tb.mk_concat(word_id, NodeId::TS);
1585 let non_word_prefix = tb.mk_concat(not_word_id, NodeId::TS);
1586 return if tb.subsumes(word_prefix, combined) == Some(true) {
1587 Ok(Word)
1588 } else if tb.subsumes(non_word_prefix, combined) == Some(true) {
1589 Ok(NonWord)
1590 } else {
1591 Ok(Unknown)
1592 };
1593 } else {
1594 let neighbor_node = self.ast_to_node_id(&asts[neighbor_idx], translator, tb)?;
1595 let neighbor_node = Self::strip_trailing_lookahead(tb, neighbor_node);
1596 let mut neighbor_node = tb
1597 .try_elim_lookarounds(neighbor_node)
1598 .ok_or_else(|| self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))?;
1599 neighbor_node = tb.reverse(neighbor_node).or_else(|_| {
1600 Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))
1601 })?;
1602 let word_prefix = tb.mk_concat(word_id, NodeId::TS);
1603 let non_word_prefix = tb.mk_concat(not_word_id, NodeId::TS);
1604 return if tb.subsumes(word_prefix, neighbor_node) == Some(true) {
1605 Ok(Word)
1606 } else if tb.subsumes(non_word_prefix, neighbor_node) == Some(true) {
1607 Ok(NonWord)
1608 } else {
1609 Ok(Unknown)
1610 };
1611 };
1612 if tb.subsumes(word_id, node) == Some(true) {
1613 Ok(Word)
1614 } else if tb.subsumes(not_word_id, node) == Some(true) {
1615 Ok(NonWord)
1616 } else {
1617 Ok(Unknown)
1618 }
1619 }
1620
1621 fn strip_trailing_lookahead(tb: &mut TB<'s>, node: NodeId) -> NodeId {
1622 match tb.get_kind(node) {
1623 Kind::Lookahead if tb.get_min_max_length(node).1 == 0 => NodeId::EPS,
1624 Kind::Concat => {
1625 let l = node.left(tb);
1626 let r = node.right(tb);
1627 let stripped_r = Self::strip_trailing_lookahead(tb, r);
1628 if stripped_r == NodeId::EPS {
1629 Self::strip_trailing_lookahead(tb, l)
1630 } else if stripped_r == r {
1631 node
1632 } else {
1633 tb.mk_concat(l, stripped_r)
1634 }
1635 }
1636 _ => node,
1637 }
1638 }
1639
1640 fn concat_edge_index(asts: &[Ast], left: bool) -> Option<usize> {
1641 let dir: isize = if left { -1 } else { 1 };
1642 let mut e = if left { asts.len() as isize - 1 } else { 0 };
1643 while e >= 0
1644 && (e as usize) < asts.len()
1645 && Self::is_transparent_for_dir(&asts[e as usize], dir)
1646 {
1647 e += dir;
1648 }
1649 if e < 0 || e as usize >= asts.len() {
1650 None
1651 } else {
1652 Some(e as usize)
1653 }
1654 }
1655
1656 fn is_transparent_for_dir(ast: &Ast, dir: isize) -> bool {
1657 match ast {
1658 Ast::Lookaround(la) => match la.kind {
1659 ast::LookaroundKind::PositiveLookahead | ast::LookaroundKind::NegativeLookahead => {
1660 dir < 0
1661 }
1662 ast::LookaroundKind::PositiveLookbehind
1663 | ast::LookaroundKind::NegativeLookbehind => dir > 0,
1664 },
1665 Ast::Repetition(r) => matches!(
1666 &r.op.kind,
1667 ast::RepetitionKind::Range(ast::RepetitionRange::Exactly(0))
1668 ),
1669 _ => false,
1670 }
1671 }
1672
1673 fn concat_neighbor_kind(asts: &[Ast], idx: usize, dir: isize) -> WordCharKind {
1674 use WordCharKind::*;
1675 let next = idx as isize + dir;
1676 if next < 0 || next >= asts.len() as isize {
1677 return Edge;
1678 }
1679 if Self::is_transparent_for_dir(&asts[next as usize], dir) {
1680 return Self::concat_neighbor_kind(asts, next as usize, dir);
1681 }
1682 let kind = Self::word_char_kind(&asts[next as usize], dir < 0);
1683 match kind {
1684 MaybeWord => match Self::concat_neighbor_kind(asts, next as usize, dir) {
1685 Word => Word,
1686 _ => Unknown,
1687 },
1688 MaybeNonWord => match Self::concat_neighbor_kind(asts, next as usize, dir) {
1689 NonWord => NonWord,
1690 _ => Unknown,
1691 },
1692 other => other,
1693 }
1694 }
1695
1696 fn specialize_word_boundaries(
1697 &mut self,
1698 children: &mut [NodeId],
1699 tb: &mut TB<'s>,
1700 ) -> Result<()> {
1701 let wb = self.unicode_classes.wb;
1702 let non_wb = self.unicode_classes.non_wb;
1703 if wb == NodeId::MISSING {
1704 return Ok(());
1705 }
1706 let word = self.unicode_classes.word;
1707 let non_word = self.unicode_classes.non_word;
1708 if word == NodeId::MISSING {
1709 return Ok(());
1710 }
1711 let word_pref = tb.mk_concat(word, NodeId::TS);
1714 let non_word_pref = tb.mk_concat(non_word, NodeId::TS);
1715 let word_suf = tb.mk_concat(NodeId::TS, word);
1716 let non_word_suf = tb.mk_concat(NodeId::TS, non_word);
1717 let len = children.len();
1718 for k in 0..len {
1719 let l = if k == 0 {
1720 WordCharKind::Edge
1721 } else {
1722 use resharp_algebra::Kind;
1723 if tb.get_kind(children[k - 1]) == Kind::End
1724 && (children[k] == wb || children[k] == non_wb)
1725 {
1726 return Err(
1727 self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex)
1728 );
1729 }
1730 Self::classify(tb, children[k - 1], word_suf, non_word_suf)
1731 };
1732 let r = if k + 1 >= len {
1733 WordCharKind::Edge
1734 } else {
1735 Self::classify(tb, children[k + 1], word_pref, non_word_pref)
1736 };
1737 children[k] = Self::rewrite_wb_in_node(tb, children[k], wb, non_wb, word, l, r)
1738 .ok_or_else(|| self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex))?;
1739 }
1740 Ok(())
1741 }
1742
1743 fn rewrite_wb_in_node(
1744 b: &mut TB<'s>,
1745 node: NodeId,
1746 wb: NodeId,
1747 non_wb: NodeId,
1748 word: NodeId,
1749 left: WordCharKind,
1750 right: WordCharKind,
1751 ) -> Option<NodeId> {
1752 let boundary_match = if node == wb {
1753 true
1754 } else if node == non_wb {
1755 false
1756 } else if b.get_kind(node) == Kind::Union {
1757 let l = Self::rewrite_wb_in_node(b, node.left(b), wb, non_wb, word, left, right)?;
1758 let r = Self::rewrite_wb_in_node(b, node.right(b), wb, non_wb, word, left, right)?;
1759 return Some(b.mk_union(l, r));
1760 } else {
1761 return Some(node);
1762 };
1763 use WordCharKind::*;
1764 let result = match (left, right) {
1765 (NonWord, Word) | (Word, NonWord) => {
1766 if boundary_match {
1767 NodeId::EPS
1768 } else {
1769 NodeId::BOT
1770 }
1771 }
1772 (Word, Word) | (NonWord, NonWord) => {
1773 if boundary_match {
1774 NodeId::BOT
1775 } else {
1776 NodeId::EPS
1777 }
1778 }
1779 (Word, _) => {
1780 if boundary_match {
1781 b.mk_neg_lookahead(word, 0)
1782 } else {
1783 let tail = b.mk_concat(word, NodeId::TS);
1784 b.mk_lookahead(tail, NodeId::MISSING, 0)
1785 }
1786 }
1787 (NonWord, _) => {
1788 if boundary_match {
1789 let tail = b.mk_concat(word, NodeId::TS);
1790 b.mk_lookahead(tail, NodeId::MISSING, 0)
1791 } else {
1792 b.mk_neg_lookahead(word, 0)
1793 }
1794 }
1795 (_, Word) => {
1796 if boundary_match {
1797 b.mk_neg_lookbehind(word)
1798 } else {
1799 b.mk_lookbehind(word, NodeId::MISSING)
1800 }
1801 }
1802 (_, NonWord) => {
1803 if boundary_match {
1804 b.mk_lookbehind(word, NodeId::MISSING)
1805 } else {
1806 b.mk_neg_lookbehind(word)
1807 }
1808 }
1809 _ => return Some(node),
1810 };
1811 Some(result)
1812 }
1813
1814 fn classify(
1815 b: &mut TB<'s>,
1816 node: NodeId,
1817 word_dir: NodeId,
1818 non_word_dir: NodeId,
1819 ) -> WordCharKind {
1820 if b.contains_look(node) || b.contains_anchors(node) {
1821 return WordCharKind::Unknown;
1822 }
1823 if b.subsumes(word_dir, node) == Some(true) {
1824 WordCharKind::Word
1825 } else if b.subsumes(non_word_dir, node) == Some(true) {
1826 WordCharKind::NonWord
1827 } else {
1828 WordCharKind::Unknown
1829 }
1830 }
1831
1832 fn rewrite_word_boundary_in_concat(
1833 &mut self,
1834 asts: &[Ast],
1835 idx: usize,
1836 translator: &mut Option<Translator>,
1837 tb: &mut TB<'s>,
1838 negated: bool,
1839 ) -> Result<(NodeId, usize)> {
1840 use WordCharKind::*;
1841 if self.global_full_unicode {
1842 self.unicode_classes.ensure_word_full(tb);
1843 } else if self.global_unicode && !self.global_ascii_perl {
1844 self.unicode_classes.ensure_word(tb);
1845 } else {
1846 self.unicode_classes.ensure_word_ascii(tb);
1847 }
1848 let word_id = self.unicode_classes.word;
1849 let not_word_id = self.unicode_classes.non_word;
1850 let left = self.resolve_word_kind(asts, idx, -1, translator, tb, word_id, not_word_id)?;
1851 let right = self.resolve_word_kind(asts, idx, 1, translator, tb, word_id, not_word_id)?;
1852 let boundary_match = !negated;
1853 match (left, right) {
1854 (NonWord, Word) | (Word, NonWord) => Ok((
1855 if boundary_match {
1856 NodeId::EPS
1857 } else {
1858 NodeId::BOT
1859 },
1860 idx + 1,
1861 )),
1862 (Word, Word) | (NonWord, NonWord) => Ok((
1863 if boundary_match {
1864 NodeId::BOT
1865 } else {
1866 NodeId::EPS
1867 },
1868 idx + 1,
1869 )),
1870 (Word, _) => {
1871 if boundary_match {
1872 Ok((tb.mk_neg_lookahead(word_id, 0), idx + 1))
1873 } else {
1874 let tail = tb.mk_concat(word_id, NodeId::TS);
1875 self.merge_boundary_with_following_lookaheads(asts, idx, tail, translator, tb)
1876 }
1877 }
1878 (NonWord, _) => {
1879 if boundary_match {
1880 let tail = tb.mk_concat(word_id, NodeId::TS);
1881 self.merge_boundary_with_following_lookaheads(asts, idx, tail, translator, tb)
1882 } else {
1883 Ok((tb.mk_neg_lookahead(word_id, 0), idx + 1))
1884 }
1885 }
1886 (_, Word) => {
1887 if boundary_match {
1888 Ok((tb.mk_neg_lookbehind(word_id), idx + 1))
1889 } else {
1890 Ok((tb.mk_lookbehind(word_id, NodeId::MISSING), idx + 1))
1891 }
1892 }
1893 (_, NonWord) => {
1894 if boundary_match {
1895 Ok((tb.mk_lookbehind(word_id, NodeId::MISSING), idx + 1))
1896 } else {
1897 Ok((tb.mk_neg_lookbehind(word_id), idx + 1))
1898 }
1899 }
1900 _ => {
1904 self.unicode_classes.ensure_wb(tb);
1905 let node = if boundary_match {
1906 self.unicode_classes.wb
1907 } else {
1908 self.unicode_classes.non_wb
1909 };
1910 Ok((node, idx + 1))
1911 }
1912 }
1913 }
1914
1915 fn merge_boundary_with_following_lookaheads(
1916 &mut self,
1917 asts: &[Ast],
1918 wb_idx: usize,
1919 boundary_tail: NodeId,
1920 translator: &mut Option<Translator>,
1921 tb: &mut TB<'s>,
1922 ) -> Result<(NodeId, usize)> {
1923 let mut next = wb_idx + 1;
1924 let mut la_bodies = vec![boundary_tail];
1925 while next < asts.len() {
1926 match &asts[next] {
1927 Ast::Lookaround(la) if la.kind == ast::LookaroundKind::PositiveLookahead => {
1928 let body = self.ast_to_node_id(&la.ast, translator, tb)?;
1929 la_bodies.push(tb.mk_concat(body, NodeId::TS));
1930 next += 1;
1931 }
1932 _ => break,
1933 }
1934 }
1935 let merged = tb.mk_inters(la_bodies.into_iter());
1936 Ok((tb.mk_lookahead(merged, NodeId::MISSING, 0), next))
1937 }
1938
/// Lowers a parsed `Ast` into an algebra node, recursing over the tree.
///
/// `translator` is the translator currently in effect for
/// literals and classes: `None` means the parser's own translator is used
/// (see `translator_to_node_id`). Inline flag directives replace it in
/// place, so the change is visible to the caller afterwards.
///
/// Returns `UnsupportedResharpRegex` for constructs that are not handled
/// here (swap-greed flag, the start/end/half word-boundary assertions,
/// binary-op class sets, and several look-around nestings).
fn ast_to_node_id(
    &mut self,
    ast: &Ast,
    translator: &mut Option<Translator>,
    tb: &mut TB<'s>,
) -> Result<NodeId> {
    match ast {
        Ast::Empty(_) => Ok(NodeId::EPS),
        // A bare flag directive: it matches nothing itself but updates the
        // translator and the dot-all / multiline state.
        Ast::Flags(f) => {
            if f.flags.flag_state(ast::Flag::SwapGreed).is_some() {
                return Err(self.error(f.span, ast::ErrorKind::UnsupportedResharpRegex));
            }
            // The new translator starts from the default builder; only the
            // flags named in this directive are applied on top of it.
            let mut translator_builder = self.default_translator_builder();
            if let Some(state) = f.flags.flag_state(ast::Flag::CaseInsensitive) {
                translator_builder.case_insensitive(state);
            }
            if let Some(state) = f.flags.flag_state(ast::Flag::Unicode) {
                translator_builder.unicode(state);
            }
            if let Some(state) = f.flags.flag_state(ast::Flag::DotMatchesNewLine) {
                self.dot_all.set(state);
            }
            if let Some(state) = f.flags.flag_state(ast::Flag::MultiLine) {
                self.multiline.set(state);
            }
            let concat_translator = Some(translator_builder.build());
            *translator = concat_translator;
            Ok(NodeId::EPS)
        }
        Ast::Literal(l) => {
            let ast_lit = regex_syntax::ast::Ast::literal(*l.to_owned());
            self.translator_to_node_id(&ast_lit, translator, tb)
        }
        Ast::Top(_) => Ok(NodeId::TOP),
        Ast::Dot(_) => {
            // Codepoint-wise dot when either ASCII-perl or full-Unicode mode
            // is on; otherwise dot is byte-wise.
            let codepoint_dot = self.global_ascii_perl || self.global_full_unicode;
            let hirv = match (codepoint_dot, self.dot_all.get()) {
                (true, true) => hir::Hir::dot(hir::Dot::AnyChar),
                (true, false) => hir::Hir::dot(hir::Dot::AnyCharExceptLF),
                (false, true) => return Ok(NodeId::TOP),
                (false, false) => hir::Hir::dot(hir::Dot::AnyByteExceptLF),
            };
            self.hir_to_node_id(&hirv, tb)
        }
        Ast::Assertion(a) => match &a.kind {
            ast::AssertionKind::StartText => Ok(NodeId::BEGIN),
            ast::AssertionKind::EndText => Ok(NodeId::END),
            // A boundary outside a concatenation has no neighbours: it is
            // lowered as if it were the only element of a one-item concat.
            ast::AssertionKind::WordBoundary => {
                let only = Ast::Assertion(a.clone());
                let asts = std::slice::from_ref(&only);
                let (node, _) =
                    self.rewrite_word_boundary_in_concat(asts, 0, translator, tb, false)?;
                Ok(node)
            }
            ast::AssertionKind::NotWordBoundary => {
                let only = Ast::Assertion(a.clone());
                let asts = std::slice::from_ref(&only);
                let (node, _) =
                    self.rewrite_word_boundary_in_concat(asts, 0, translator, tb, true)?;
                Ok(node)
            }
            // `^`: start of text, or (in multiline mode) a position preceded
            // by start of text or a line feed.
            ast::AssertionKind::StartLine => {
                if !self.multiline.get() {
                    return Ok(NodeId::BEGIN);
                }
                let left = NodeId::BEGIN;
                let right = tb.mk_u8(b'\n');
                let union = tb.mk_union(left, right);
                Ok(tb.mk_lookbehind(union, NodeId::MISSING))
            }
            // `$`: end of text, or (in multiline mode) a position followed
            // by end of text or a line feed.
            ast::AssertionKind::EndLine => {
                if !self.multiline.get() {
                    return Ok(NodeId::END);
                }
                let left = NodeId::END;
                let right = tb.mk_u8(b'\n');
                let union = tb.mk_union(left, right);
                Ok(tb.mk_lookahead(union, NodeId::MISSING, 0))
            }
            // The remaining word-boundary variants are not supported.
            ast::AssertionKind::WordBoundaryStart => {
                Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
            }
            ast::AssertionKind::WordBoundaryEnd => {
                Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
            }
            ast::AssertionKind::WordBoundaryStartAngle => {
                Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
            }
            ast::AssertionKind::WordBoundaryEndAngle => {
                Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
            }
            ast::AssertionKind::WordBoundaryStartHalf => {
                Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
            }
            ast::AssertionKind::WordBoundaryEndHalf => {
                Err(self.error(a.span, ast::ErrorKind::UnsupportedResharpRegex))
            }
        },
        Ast::ClassUnicode(c) => {
            let tmp = regex_syntax::ast::ClassUnicode {
                span: c.span,
                negated: c.negated,
                kind: c.kind.clone(),
            };
            // A few non-negated named classes are built directly here
            // instead of going through the translator.
            if !c.negated {
                if let regex_syntax::ast::ClassUnicodeKind::Named(s) = &c.kind {
                    match s.as_str() {
                        "ascii" => return Ok(tb.mk_range_u8(0, 127)),
                        "utf8" => return Ok(utf8_codepoint_node(tb)),
                        "hex" => {
                            let nums = tb.mk_range_u8(b'0', b'9');
                            let lets = tb.mk_range_u8(b'a', b'f');
                            let lets2 = tb.mk_range_u8(b'A', b'F');
                            let merged = tb.mk_unions([nums, lets, lets2].into_iter());
                            return Ok(merged);
                        }
                        _ => {}
                    }
                };
            }

            let orig_ast = regex_syntax::ast::Ast::class_unicode(tmp);
            self.translator_to_node_id(&orig_ast, translator, tb)
        }
        Ast::ClassPerl(c) => self.get_class(c.negated, c.kind.clone(), tb),
        Ast::ClassBracketed(c) => match &c.kind {
            regex_syntax::ast::ClassSet::Item(item) => {
                // When `is_universal_perl_pair` holds for a non-negated
                // class, the class is lowered as "anything": any codepoint
                // in the codepoint modes, otherwise `TOP`.
                if !c.negated && is_universal_perl_pair(item) {
                    if self.global_ascii_perl || self.global_full_unicode {
                        return Ok(self.any_codepoint_node(tb));
                    }
                    return Ok(NodeId::TOP);
                }
                // A bracket holding a single Perl class reuses the cached
                // class; the two negations cancel via XOR.
                if let regex_syntax::ast::ClassSetItem::Perl(p) = item {
                    return self.get_class(c.negated ^ p.negated, p.kind.clone(), tb);
                }
                let kind = if self.global_ascii_perl {
                    rewrite_ascii_perl_set(&c.kind)
                } else {
                    c.kind.clone()
                };
                let tmp = regex_syntax::ast::ClassBracketed {
                    span: c.span,
                    negated: c.negated,
                    kind,
                };
                let orig_ast = regex_syntax::ast::Ast::class_bracketed(tmp);
                self.translator_to_node_id(&orig_ast, translator, tb)
            }
            regex_syntax::ast::ClassSet::BinaryOp(_) => {
                Err(self.error(c.span, ast::ErrorKind::UnsupportedResharpRegex))
            }
        },
        Ast::Repetition(r) => {
            let body = self.ast_to_node_id(&r.ast, translator, tb);
            match body {
                Ok(body) => match &r.op.kind {
                    ast::RepetitionKind::ZeroOrOne => Ok(tb.mk_opt(body)),
                    ast::RepetitionKind::ZeroOrMore => Ok(tb.mk_star(body)),
                    ast::RepetitionKind::OneOrMore => Ok(tb.mk_plus(body)),
                    ast::RepetitionKind::Range(r) => match r {
                        ast::RepetitionRange::Exactly(n) => Ok(tb.mk_repeat(body, *n, *n)),
                        // `{n,}` is built as exactly-n followed by a star.
                        ast::RepetitionRange::AtLeast(n) => {
                            let rep = tb.mk_repeat(body, *n, *n);
                            let st = tb.mk_star(body);
                            Ok(tb.mk_concat(rep, st))
                        }

                        ast::RepetitionRange::Bounded(n, m) => Ok(tb.mk_repeat(body, *n, *m)),
                    },
                },
                // Propagate the body's error unchanged.
                Err(_) => body,
            }
        }
        Ast::Lookaround(g) => {
            let body = self.ast_to_node_id(&g.ast, translator, tb)?;
            match g.kind {
                // A positive look-ahead whose body starts with look-behinds:
                // the leading look-behinds are hoisted in front of the
                // look-ahead. Any look-behind elsewhere in the body is
                // rejected.
                ast::LookaroundKind::PositiveLookahead if body.contains_lookbehind(tb) => {
                    let mut prefix = NodeId::EPS;
                    let mut rest = body;
                    while tb.get_kind(rest) == Kind::Concat
                        && tb.get_kind(rest.left(tb)) == Kind::Lookbehind
                    {
                        prefix = tb.mk_concat(prefix, rest.left(tb));
                        rest = rest.right(tb);
                    }
                    if prefix == NodeId::EPS || rest.contains_lookbehind(tb) {
                        return Err(self.error(g.span, ast::ErrorKind::UnsupportedResharpRegex));
                    }
                    let la = tb.mk_lookahead(rest, NodeId::MISSING, 0);
                    Ok(tb.mk_concat(prefix, la))
                }
                ast::LookaroundKind::NegativeLookahead if body.contains_lookbehind(tb) => {
                    Err(self.error(g.span, ast::ErrorKind::UnsupportedResharpRegex))
                }
                ast::LookaroundKind::PositiveLookahead => {
                    Ok(tb.mk_lookahead(body, NodeId::MISSING, 0))
                }
                // Look-behinds whose body contains a look-ahead are rejected.
                ast::LookaroundKind::PositiveLookbehind
                | ast::LookaroundKind::NegativeLookbehind
                    if body.contains_lookahead(tb) =>
                {
                    Err(self.error(g.span, ast::ErrorKind::UnsupportedResharpRegex))
                }
                ast::LookaroundKind::PositiveLookbehind => {
                    Ok(tb.mk_lookbehind(body, NodeId::MISSING))
                }
                ast::LookaroundKind::NegativeLookahead => Ok(tb.mk_neg_lookahead(body, 0)),
                ast::LookaroundKind::NegativeLookbehind => Ok(tb.mk_neg_lookbehind(body)),
            }
        }
        Ast::Group(g) => {
            // A non-capturing group that carries flags gets its own scoped
            // translator; dot-all and multiline are restored afterwards.
            if let ast::GroupKind::NonCapturing(ref flags) = g.kind {
                if !flags.items.is_empty() {
                    let mut translator_builder = self.default_translator_builder();
                    if let Some(state) = flags.flag_state(ast::Flag::CaseInsensitive) {
                        translator_builder.case_insensitive(state);
                    }
                    if let Some(state) = flags.flag_state(ast::Flag::Unicode) {
                        translator_builder.unicode(state);
                    }
                    let saved_dot_all = self.dot_all.get();
                    if let Some(state) = flags.flag_state(ast::Flag::DotMatchesNewLine) {
                        self.dot_all.set(state);
                    }
                    let saved_multiline = self.multiline.get();
                    if let Some(state) = flags.flag_state(ast::Flag::MultiLine) {
                        self.multiline.set(state);
                    }
                    let mut scoped = Some(translator_builder.build());
                    let result = self.ast_to_node_id(&g.ast, &mut scoped, tb);
                    // Restore before returning, on success and on error.
                    self.dot_all.set(saved_dot_all);
                    self.multiline.set(saved_multiline);
                    return result;
                }
            }
            self.ast_to_node_id(&g.ast, translator, tb)
        }
        Ast::Alternation(a) => {
            let mut children = vec![];
            for ast in &a.asts {
                match self.ast_to_node_id(ast, translator, tb) {
                    Ok(node_id) => children.push(node_id),
                    Err(err) => return Err(err),
                }
            }
            Ok(tb.mk_unions(children.iter().copied()))
        }
        Ast::Concat(c) => {
            // Translator installed by a flag directive inside this concat.
            let mut concat_translator: Option<Translator> = None;
            let mut children = vec![];
            // Index in `children` of the boundary node most recently pushed,
            // as long as no non-empty element has been pushed since.
            let mut prev_boundary_child: Option<usize> = None;
            let mut i = 0;
            while i < c.asts.len() {
                let ast = &c.asts[i];
                match ast {
                    Ast::Flags(f) => {
                        if f.flags.flag_state(ast::Flag::SwapGreed).is_some() {
                            return Err(
                                self.error(f.span, ast::ErrorKind::UnsupportedResharpRegex)
                            );
                        }
                        let mut translator_builder = self.default_translator_builder();
                        if let Some(state) = f.flags.flag_state(ast::Flag::CaseInsensitive) {
                            translator_builder.case_insensitive(state);
                        }
                        if let Some(state) = f.flags.flag_state(ast::Flag::Unicode) {
                            translator_builder.unicode(state);
                        }
                        if let Some(state) = f.flags.flag_state(ast::Flag::DotMatchesNewLine) {
                            self.dot_all.set(state);
                        }
                        if let Some(state) = f.flags.flag_state(ast::Flag::MultiLine) {
                            self.multiline.set(state);
                        }
                        concat_translator = Some(translator_builder.build());
                        // Also hand the new translator back to the caller.
                        *translator = concat_translator.clone();
                        i += 1;
                        continue;
                    }
                    Ast::Assertion(a)
                        if a.kind == ast::AssertionKind::WordBoundary
                            || a.kind == ast::AssertionKind::NotWordBoundary =>
                    {
                        let negated = a.kind == ast::AssertionKind::NotWordBoundary;
                        let node = self.rewrite_word_boundary_in_concat(
                            &c.asts, i, translator, tb, negated,
                        )?;
                        // Adjacent boundary assertions are intersected into
                        // a single child instead of being concatenated.
                        match prev_boundary_child {
                            Some(idx) => children[idx] = tb.mk_inter(children[idx], node.0),
                            None => {
                                children.push(node.0);
                                prev_boundary_child = Some(children.len() - 1);
                            }
                        }
                        // The rewrite may have consumed following elements.
                        i = node.1;
                        continue;
                    }
                    _ => {}
                }
                match concat_translator {
                    Some(_) => match self.ast_to_node_id(ast, &mut concat_translator, tb) {
                        Ok(node_id) => {
                            // `EPS` children are dropped and do not end a
                            // run of adjacent boundaries.
                            if node_id != resharp_algebra::NodeId::EPS {
                                prev_boundary_child = None;
                                children.push(node_id);
                            }
                        }
                        Err(err) => return Err(err),
                    },
                    None => match self.ast_to_node_id(ast, translator, tb) {
                        Ok(node_id) => {
                            if node_id != resharp_algebra::NodeId::EPS {
                                prev_boundary_child = None;
                                children.push(node_id);
                            }
                        }
                        Err(err) => return Err(err),
                    },
                }
                i += 1;
            }
            // Second pass: specialise remaining generic boundary nodes now
            // that all neighbours are available as nodes.
            self.specialize_word_boundaries(&mut children, tb)?;
            Ok(tb.mk_concats(children.iter().cloned()))
        }
        Ast::Intersection(intersection) => {
            let mut children = vec![];
            for ast in &intersection.asts {
                match self.ast_to_node_id(ast, translator, tb) {
                    Ok(node_id) => children.push(node_id),
                    Err(err) => return Err(err),
                }
            }
            Ok(tb.mk_inters(children.into_iter()))
        }
        Ast::Complement(complement) => {
            let body = self.ast_to_node_id(&complement.ast, translator, tb);
            body.map(|x| tb.mk_compl(x))
        }
    }
}
2283
2284 fn parse_inner(&mut self) -> Result<Ast> {
2285 let mut concat = Concat {
2286 span: self.span(),
2287 asts: vec![],
2288 };
2289 loop {
2290 self.bump_space();
2291 if self.is_eof() {
2292 break;
2293 }
2294 match self.char() {
2295 '(' => concat = self.push_group(concat)?,
2296 ')' => concat = self.pop_group(concat)?,
2297 '|' => concat = self.push_alternate(concat)?,
2298 '&' => concat = self.push_intersect(concat)?,
2299 '~' => concat = self.push_compl_group(concat)?,
2300 '[' => {
2301 let class = self.parse_set_class()?;
2302 concat.asts.push(Ast::class_bracketed(class));
2303 }
2304 '?' => {
2305 concat =
2306 self.parse_uncounted_repetition(concat, ast::RepetitionKind::ZeroOrOne)?;
2307 }
2308 '*' => {
2309 concat =
2310 self.parse_uncounted_repetition(concat, ast::RepetitionKind::ZeroOrMore)?;
2311 }
2312 '+' => {
2313 concat =
2314 self.parse_uncounted_repetition(concat, ast::RepetitionKind::OneOrMore)?;
2315 }
2316 '{' => {
2317 concat = self.parse_counted_repetition(concat)?;
2318 }
2319 _ => concat.asts.push(self.parse_primitive()?.into_ast()),
2320 }
2321 if self.stack_group.borrow().len() > self.max_depth {
2322 return Err(self.error(self.span(), ast::ErrorKind::UnsupportedResharpRegex));
2323 }
2324 }
2325 let ast = self.pop_group_end(concat)?;
2326 if expanded_ast_size(&ast, self.expanded_ast_limit) >= self.expanded_ast_limit
2327 || max_concat_length(&ast) >= self.max_list_len
2328 {
2329 return Err(self.error(*ast.span(), ast::ErrorKind::UnsupportedResharpRegex));
2330 }
2331 Ok(ast)
2332 }
2333
2334 fn parse(&mut self, tb: &mut TB<'s>) -> Result<NodeId> {
2335 let ast = self.parse_inner()?;
2336 if let Err(span) = ensure_lookbehind_at_start(&ast, true) {
2337 return Err(self.error(span, ast::ErrorKind::UnsupportedResharpRegex));
2338 }
2339 self.ast_to_node_id(&ast, &mut None, tb)
2340 }
2341
2342 #[inline(never)]
2343 fn parse_uncounted_repetition(
2344 &self,
2345 mut concat: ast::Concat,
2346 kind: ast::RepetitionKind,
2347 ) -> Result<ast::Concat> {
2348 let op_start = self.pos();
2350 let ast = match concat.asts.pop() {
2351 Some(ast) => ast,
2352 None => return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing)),
2353 };
2354 match ast {
2355 Ast::Empty(_) | Ast::Flags(_) => {
2356 return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing))
2357 }
2358 _ => {}
2359 }
2360 if self.bump() && self.char() == '?' {
2361 return Err(self.error(
2362 Span::new(op_start, self.pos()),
2363 ast::ErrorKind::UnsupportedLazyQuantifier,
2364 ));
2365 }
2366 concat.asts.push(Ast::repetition(ast::Repetition {
2367 span: ast.span().with_end(self.pos()),
2368 op: ast::RepetitionOp {
2369 span: Span::new(op_start, self.pos()),
2370 kind,
2371 },
2372 greedy: true,
2373 ast: Box::new(ast),
2374 }));
2375 Ok(concat)
2376 }
2377
2378 #[inline(never)]
2379 fn parse_counted_repetition(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
2380 assert!(self.char() == '{');
2381 let start = self.pos();
2382 let ast = match concat.asts.pop() {
2383 Some(ast) => ast,
2384 None => return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing)),
2385 };
2386 match ast {
2387 Ast::Empty(_) | Ast::Flags(_) => {
2388 return Err(self.error(self.span(), ast::ErrorKind::RepetitionMissing))
2389 }
2390 _ => {}
2391 }
2392 if !self.bump_and_bump_space() {
2393 return Err(self.error(
2394 Span::new(start, self.pos()),
2395 ast::ErrorKind::RepetitionCountUnclosed,
2396 ));
2397 }
2398 let count_start = specialize_err(
2399 self.parse_decimal(),
2400 ast::ErrorKind::DecimalEmpty,
2401 ast::ErrorKind::RepetitionCountDecimalEmpty,
2402 );
2403 if self.is_eof() {
2404 return Err(self.error(
2405 Span::new(start, self.pos()),
2406 ast::ErrorKind::RepetitionCountUnclosed,
2407 ));
2408 }
2409 let range = if self.char() == ',' {
2410 if !self.bump_and_bump_space() {
2411 return Err(self.error(
2412 Span::new(start, self.pos()),
2413 ast::ErrorKind::RepetitionCountUnclosed,
2414 ));
2415 }
2416 if self.char() != '}' {
2417 let count_start = match count_start {
2418 Ok(c) => c,
2419 Err(err) if err.kind == ast::ErrorKind::RepetitionCountDecimalEmpty => {
2420 if self.parser().empty_min_range {
2421 0
2422 } else {
2423 return Err(err);
2424 }
2425 }
2426 err => err?,
2427 };
2428 let count_end = specialize_err(
2429 self.parse_decimal(),
2430 ast::ErrorKind::DecimalEmpty,
2431 ast::ErrorKind::RepetitionCountDecimalEmpty,
2432 )?;
2433 ast::RepetitionRange::Bounded(count_start, count_end)
2434 } else {
2435 ast::RepetitionRange::AtLeast(count_start?)
2436 }
2437 } else {
2438 ast::RepetitionRange::Exactly(count_start?)
2439 };
2440
2441 if self.is_eof() || self.char() != '}' {
2442 return Err(self.error(
2443 Span::new(start, self.pos()),
2444 ast::ErrorKind::RepetitionCountUnclosed,
2445 ));
2446 }
2447
2448 if self.bump_and_bump_space() && self.char() == '?' {
2449 return Err(self.error(
2450 Span::new(start, self.pos()),
2451 ast::ErrorKind::UnsupportedLazyQuantifier,
2452 ));
2453 }
2454
2455 let op_span = Span::new(start, self.pos());
2456 if !range.is_valid() {
2457 return Err(self.error(op_span, ast::ErrorKind::RepetitionCountInvalid));
2458 }
2459
2460 let over_limit = match &range {
2461 ast::RepetitionRange::Exactly(n) => *n > self.max_repeat,
2462 ast::RepetitionRange::AtLeast(n) => *n > self.max_repeat,
2463 ast::RepetitionRange::Bounded(n, m) => *n > self.max_repeat || *m > self.max_repeat,
2464 };
2465 if over_limit {
2466 return Err(self.error(op_span, ast::ErrorKind::UnsupportedResharpRegex));
2467 }
2468 concat.asts.push(Ast::repetition(ast::Repetition {
2469 span: ast.span().with_end(self.pos()),
2470 op: ast::RepetitionOp {
2471 span: op_span,
2472 kind: ast::RepetitionKind::Range(range),
2473 },
2474 greedy: true,
2475 ast: Box::new(ast),
2476 }));
2477 Ok(concat)
2478 }
2479
2480 #[inline(never)]
2481 fn parse_group(&self) -> Result<Either<ast::SetFlags, ast::Group>> {
2482 assert_eq!(self.char(), '(');
2483 let open_span = self.span_char();
2484 self.bump();
2485 self.bump_space();
2486 if let Some((ahead, pos)) = self.is_lookaround_prefix() {
2487 let kind = match (pos, ahead) {
2488 (true, true) => LookaroundKind::PositiveLookahead,
2489 (true, false) => LookaroundKind::PositiveLookbehind,
2490 (false, true) => LookaroundKind::NegativeLookahead,
2491 (false, false) => LookaroundKind::NegativeLookbehind,
2492 };
2493 return Ok(Either::Right(ast::Group {
2494 span: open_span,
2495 kind: ast::GroupKind::Lookaround(kind),
2496 ast: Box::new(Ast::empty(self.span())),
2497 }));
2498 }
2499 let inner_span = self.span();
2500 let mut starts_with_p = true;
2501 if self.bump_if("?P<") || {
2502 starts_with_p = false;
2503 self.bump_if("?<")
2504 } {
2505 let capture_index = self.next_capture_index(open_span)?;
2506 let name = self.parse_capture_name(capture_index)?;
2507 Ok(Either::Right(ast::Group {
2508 span: open_span,
2509 kind: ast::GroupKind::CaptureName {
2510 starts_with_p,
2511 name,
2512 },
2513 ast: Box::new(Ast::empty(self.span())),
2514 }))
2515 } else if self.bump_if("?") {
2516 if self.is_eof() {
2517 return Err(self.error(open_span, ast::ErrorKind::GroupUnclosed));
2518 }
2519 let flags = self.parse_flags()?;
2520 let char_end = self.char();
2521 self.bump();
2522 if char_end == ')' {
2523 if flags.items.is_empty() {
2526 return Err(self.error(inner_span, ast::ErrorKind::RepetitionMissing));
2527 }
2528 Ok(Either::Left(ast::SetFlags {
2529 span: Span {
2530 end: self.pos(),
2531 ..open_span
2532 },
2533 flags,
2534 }))
2535 } else {
2536 assert_eq!(char_end, ':');
2537 Ok(Either::Right(ast::Group {
2538 span: open_span,
2539 kind: ast::GroupKind::NonCapturing(flags),
2540 ast: Box::new(Ast::empty(self.span())),
2541 }))
2542 }
2543 } else {
2544 let capture_index = self.next_capture_index(open_span)?;
2545 Ok(Either::Right(ast::Group {
2546 span: open_span,
2547 kind: ast::GroupKind::CaptureIndex(capture_index),
2548 ast: Box::new(Ast::empty(self.span())),
2549 }))
2550 }
2551 }
2552
2553 #[inline(never)]
2554 fn parse_capture_name(&self, capture_index: u32) -> Result<ast::CaptureName> {
2555 if self.is_eof() {
2556 return Err(self.error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
2557 }
2558 let start = self.pos();
2559 loop {
2560 if self.char() == '>' {
2561 break;
2562 }
2563 if !is_capture_char(self.char(), self.pos() == start) {
2564 return Err(self.error(self.span_char(), ast::ErrorKind::GroupNameInvalid));
2565 }
2566 if !self.bump() {
2567 break;
2568 }
2569 }
2570 let end = self.pos();
2571 if self.is_eof() {
2572 return Err(self.error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
2573 }
2574 assert_eq!(self.char(), '>');
2575 self.bump();
2576 let name = &self.pattern()[start.offset..end.offset];
2577 if name.is_empty() {
2578 return Err(self.error(Span::new(start, start), ast::ErrorKind::GroupNameEmpty));
2579 }
2580 let capname = ast::CaptureName {
2581 span: Span::new(start, end),
2582 name: name.to_string(),
2583 index: capture_index,
2584 };
2585 self.add_capture_name(&capname)?;
2586 Ok(capname)
2587 }
2588
2589 #[inline(never)]
2590 fn parse_flags(&self) -> Result<ast::Flags> {
2591 let mut flags = ast::Flags {
2592 span: self.span(),
2593 items: vec![],
2594 };
2595 let mut last_was_negation = None;
2596 while self.char() != ':' && self.char() != ')' {
2597 if self.char() == '-' {
2598 last_was_negation = Some(self.span_char());
2599 let item = ast::FlagsItem {
2600 span: self.span_char(),
2601 kind: ast::FlagsItemKind::Negation,
2602 };
2603 if let Some(i) = flags.add_item(item) {
2604 return Err(self.error(
2605 self.span_char(),
2606 ast::ErrorKind::FlagRepeatedNegation {
2607 original: flags.items[i].span,
2608 },
2609 ));
2610 }
2611 } else {
2612 last_was_negation = None;
2613 let item = ast::FlagsItem {
2614 span: self.span_char(),
2615 kind: ast::FlagsItemKind::Flag(self.parse_flag()?),
2616 };
2617 if let Some(i) = flags.add_item(item) {
2618 return Err(self.error(
2619 self.span_char(),
2620 ast::ErrorKind::FlagDuplicate {
2621 original: flags.items[i].span,
2622 },
2623 ));
2624 }
2625 }
2626 if !self.bump() {
2627 return Err(self.error(self.span(), ast::ErrorKind::FlagUnexpectedEof));
2628 }
2629 }
2630 if let Some(span) = last_was_negation {
2631 return Err(self.error(span, ast::ErrorKind::FlagDanglingNegation));
2632 }
2633 flags.span.end = self.pos();
2634 Ok(flags)
2635 }
2636
2637 #[inline(never)]
2638 fn parse_flag(&self) -> Result<ast::Flag> {
2639 match self.char() {
2640 'i' => Ok(ast::Flag::CaseInsensitive),
2641 'm' => Ok(ast::Flag::MultiLine),
2642 's' => Ok(ast::Flag::DotMatchesNewLine),
2643 'U' => Ok(ast::Flag::SwapGreed),
2644 'u' => Ok(ast::Flag::Unicode),
2645 'R' => Ok(ast::Flag::CRLF),
2646 'x' => Ok(ast::Flag::IgnoreWhitespace),
2647 _ => Err(self.error(self.span_char(), ast::ErrorKind::FlagUnrecognized)),
2648 }
2649 }
2650
2651 fn parse_primitive(&self) -> Result<Primitive> {
2652 match self.char() {
2653 '\\' => self.parse_escape(),
2654 '_' => {
2655 let ast = Primitive::Top(self.span_char());
2656 self.bump();
2657 Ok(ast)
2658 }
2659 '.' => {
2660 let ast = Primitive::Dot(self.span_char());
2661 self.bump();
2662 Ok(ast)
2663 }
2664 '^' => {
2665 let ast = Primitive::Assertion(ast::Assertion {
2666 span: self.span_char(),
2667 kind: ast::AssertionKind::StartLine,
2668 });
2669 self.bump();
2670 Ok(ast)
2671 }
2672 '$' => {
2673 let ast = Primitive::Assertion(ast::Assertion {
2674 span: self.span_char(),
2675 kind: ast::AssertionKind::EndLine,
2676 });
2677 self.bump();
2678 Ok(ast)
2679 }
2680 c => {
2681 let ast = Primitive::Literal(Literal {
2682 span: self.span_char(),
2683 kind: LiteralKind::Verbatim,
2684 c,
2685 });
2686 self.bump();
2687 Ok(ast)
2688 }
2689 }
2690 }
2691
2692 #[inline(never)]
2693 fn parse_escape(&self) -> Result<Primitive> {
2694 assert_eq!(self.char(), '\\');
2695 let start = self.pos();
2696 if !self.bump() {
2697 return Err(self.error(
2698 Span::new(start, self.pos()),
2699 ast::ErrorKind::EscapeUnexpectedEof,
2700 ));
2701 }
2702 let c = self.char();
2703 match c {
2705 '0'..='9' => {
2706 if !self.parser().octal {
2707 return Err(self.error(
2708 Span::new(start, self.span_char().end),
2709 ast::ErrorKind::UnsupportedBackreference,
2710 ));
2711 }
2712 let mut lit = self.parse_octal();
2713 lit.span.start = start;
2714 return Ok(Primitive::Literal(lit));
2715 }
2716 'x' | 'u' | 'U' => {
2717 let mut lit = self.parse_hex()?;
2718 lit.span.start = start;
2719 return Ok(Primitive::Literal(lit));
2720 }
2721 'p' | 'P' => {
2722 let mut cls = self.parse_unicode_class()?;
2723 cls.span.start = start;
2724 return Ok(Primitive::Unicode(cls));
2725 }
2726 'd' | 's' | 'w' | 'D' | 'S' | 'W' => {
2727 let mut cls = self.parse_perl_class();
2728 cls.span.start = start;
2729 return Ok(Primitive::Perl(cls));
2730 }
2731 _ => {}
2732 }
2733
2734 self.bump();
2736 let span = Span::new(start, self.pos());
2737 if is_meta_character(c) {
2738 return Ok(Primitive::Literal(Literal {
2739 span,
2740 kind: LiteralKind::Meta,
2741 c,
2742 }));
2743 }
2744 if is_escapeable_character(c) {
2745 return Ok(Primitive::Literal(Literal {
2746 span,
2747 kind: LiteralKind::Superfluous,
2748 c,
2749 }));
2750 }
2751 let special = |kind, c| {
2752 Ok(Primitive::Literal(Literal {
2753 span,
2754 kind: LiteralKind::Special(kind),
2755 c,
2756 }))
2757 };
2758 match c {
2759 'a' => special(SpecialLiteralKind::Bell, '\x07'),
2760 'f' => special(SpecialLiteralKind::FormFeed, '\x0C'),
2761 't' => special(SpecialLiteralKind::Tab, '\t'),
2762 'n' => special(SpecialLiteralKind::LineFeed, '\n'),
2763 'r' => special(SpecialLiteralKind::CarriageReturn, '\r'),
2764 'v' => special(SpecialLiteralKind::VerticalTab, '\x0B'),
2765 'A' => Ok(Primitive::Assertion(ast::Assertion {
2766 span,
2767 kind: ast::AssertionKind::StartText,
2768 })),
2769 'z' => Ok(Primitive::Assertion(ast::Assertion {
2770 span,
2771 kind: ast::AssertionKind::EndText,
2772 })),
2773 'b' => {
2774 let mut wb = ast::Assertion {
2775 span,
2776 kind: ast::AssertionKind::WordBoundary,
2777 };
2778 if !self.is_eof() && self.char() == '{' {
2781 if let Some(kind) = self.maybe_parse_special_word_boundary(start)? {
2782 wb.kind = kind;
2783 wb.span.end = self.pos();
2784 }
2785 }
2786 Ok(Primitive::Assertion(wb))
2787 }
2788 'B' => Ok(Primitive::Assertion(ast::Assertion {
2789 span,
2790 kind: ast::AssertionKind::NotWordBoundary,
2791 })),
2792 '<' => Ok(Primitive::Assertion(ast::Assertion {
2793 span,
2794 kind: ast::AssertionKind::WordBoundaryStartAngle,
2795 })),
2796 '>' => Ok(Primitive::Assertion(ast::Assertion {
2797 span,
2798 kind: ast::AssertionKind::WordBoundaryEndAngle,
2799 })),
2800 _ => Err(self.error(span, ast::ErrorKind::EscapeUnrecognized)),
2801 }
2802 }
2803
2804 fn maybe_parse_special_word_boundary(
2805 &self,
2806 wb_start: Position,
2807 ) -> Result<Option<ast::AssertionKind>> {
2808 assert_eq!(self.char(), '{');
2809
2810 let is_valid_char = |c| matches!(c, 'A'..='Z' | 'a'..='z' | '-');
2811 let start = self.pos();
2812 if !self.bump_and_bump_space() {
2813 return Err(self.error(
2814 Span::new(wb_start, self.pos()),
2815 ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof,
2816 ));
2817 }
2818 let start_contents = self.pos();
2819 if !is_valid_char(self.char()) {
2820 self.parser().pos.set(start);
2821 return Ok(None);
2822 }
2823
2824 let mut scratch = self.parser().scratch.borrow_mut();
2826 scratch.clear();
2827 while !self.is_eof() && is_valid_char(self.char()) {
2828 scratch.push(self.char());
2829 self.bump_and_bump_space();
2830 }
2831 if self.is_eof() || self.char() != '}' {
2832 return Err(self.error(
2833 Span::new(start, self.pos()),
2834 ast::ErrorKind::SpecialWordBoundaryUnclosed,
2835 ));
2836 }
2837 let end = self.pos();
2838 self.bump();
2839 let kind = match scratch.as_str() {
2840 "start" => ast::AssertionKind::WordBoundaryStart,
2841 "end" => ast::AssertionKind::WordBoundaryEnd,
2842 "start-half" => ast::AssertionKind::WordBoundaryStartHalf,
2843 "end-half" => ast::AssertionKind::WordBoundaryEndHalf,
2844 _ => {
2845 return Err(self.error(
2846 Span::new(start_contents, end),
2847 ast::ErrorKind::SpecialWordBoundaryUnrecognized,
2848 ))
2849 }
2850 };
2851 Ok(Some(kind))
2852 }
2853
2854 #[inline(never)]
2855 fn parse_octal(&self) -> Literal {
2856 assert!(self.parser().octal);
2857 assert!('0' <= self.char() && self.char() <= '7');
2858 let start = self.pos();
2859 while self.bump()
2861 && '0' <= self.char()
2862 && self.char() <= '7'
2863 && self.pos().offset - start.offset <= 2
2864 {}
2865 let end = self.pos();
2866 let octal = &self.pattern()[start.offset..end.offset];
2867 let codepoint = u32::from_str_radix(octal, 8).expect("valid octal number");
2870 let c = char::from_u32(codepoint).expect("Unicode scalar value");
2873 Literal {
2874 span: Span::new(start, end),
2875 kind: LiteralKind::Octal,
2876 c,
2877 }
2878 }
2879
2880 #[inline(never)]
2881 fn parse_hex(&self) -> Result<Literal> {
2882 assert!(self.char() == 'x' || self.char() == 'u' || self.char() == 'U');
2883
2884 let hex_kind = match self.char() {
2885 'x' => HexLiteralKind::X,
2886 'u' => HexLiteralKind::UnicodeShort,
2887 _ => HexLiteralKind::UnicodeLong,
2888 };
2889 if !self.bump_and_bump_space() {
2890 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2891 }
2892 if self.char() == '{' {
2893 self.parse_hex_brace(hex_kind)
2894 } else {
2895 self.parse_hex_digits(hex_kind)
2896 }
2897 }
2898
2899 #[inline(never)]
2900 fn parse_hex_digits(&self, kind: HexLiteralKind) -> Result<Literal> {
2901 let mut scratch = self.parser().scratch.borrow_mut();
2902 scratch.clear();
2903
2904 let start = self.pos();
2905 for i in 0..kind.digits() {
2906 if i > 0 && !self.bump_and_bump_space() {
2907 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2908 }
2909 if !is_hex(self.char()) {
2910 return Err(self.error(self.span_char(), ast::ErrorKind::EscapeHexInvalidDigit));
2911 }
2912 scratch.push(self.char());
2913 }
2914 self.bump_and_bump_space();
2915 let end = self.pos();
2916 let hex = scratch.as_str();
2917 match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
2918 None => Err(self.error(Span::new(start, end), ast::ErrorKind::EscapeHexInvalid)),
2919 Some(c) => Ok(Literal {
2920 span: Span::new(start, end),
2921 kind: LiteralKind::HexFixed(kind),
2922 c,
2923 }),
2924 }
2925 }
2926
2927 #[inline(never)]
2928 fn parse_hex_brace(&self, kind: HexLiteralKind) -> Result<Literal> {
2929 let mut scratch = self.parser().scratch.borrow_mut();
2930 scratch.clear();
2931
2932 let brace_pos = self.pos();
2933 let start = self.span_char().end;
2934 while self.bump_and_bump_space() && self.char() != '}' {
2935 if !is_hex(self.char()) {
2936 return Err(self.error(self.span_char(), ast::ErrorKind::EscapeHexInvalidDigit));
2937 }
2938 scratch.push(self.char());
2939 }
2940 if self.is_eof() {
2941 return Err(self.error(
2942 Span::new(brace_pos, self.pos()),
2943 ast::ErrorKind::EscapeUnexpectedEof,
2944 ));
2945 }
2946 let end = self.pos();
2947 let hex = scratch.as_str();
2948 assert_eq!(self.char(), '}');
2949 self.bump_and_bump_space();
2950
2951 if hex.is_empty() {
2952 return Err(self.error(
2953 Span::new(brace_pos, self.pos()),
2954 ast::ErrorKind::EscapeHexEmpty,
2955 ));
2956 }
2957 match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
2958 None => Err(self.error(Span::new(start, end), ast::ErrorKind::EscapeHexInvalid)),
2959 Some(c) => Ok(Literal {
2960 span: Span::new(start, self.pos()),
2961 kind: LiteralKind::HexBrace(kind),
2962 c,
2963 }),
2964 }
2965 }
2966
2967 fn parse_decimal(&self) -> Result<u32> {
2968 let mut scratch = self.parser().scratch.borrow_mut();
2969 scratch.clear();
2970
2971 while !self.is_eof() && self.char().is_whitespace() {
2972 self.bump();
2973 }
2974 let start = self.pos();
2975 while !self.is_eof() && '0' <= self.char() && self.char() <= '9' {
2976 scratch.push(self.char());
2977 self.bump_and_bump_space();
2978 }
2979 let span = Span::new(start, self.pos());
2980 while !self.is_eof() && self.char().is_whitespace() {
2981 self.bump_and_bump_space();
2982 }
2983 let digits = scratch.as_str();
2984 if digits.is_empty() {
2985 return Err(self.error(span, ast::ErrorKind::DecimalEmpty));
2986 }
2987 match digits.parse::<u32>().ok() {
2988 Some(n) => Ok(n),
2989 None => Err(self.error(span, ast::ErrorKind::DecimalInvalid)),
2990 }
2991 }
2992
2993 #[inline(never)]
2994 fn parse_set_class(&self) -> Result<ClassBracketed> {
2995 assert_eq!(self.char(), '[');
2996
2997 let mut union = ClassSetUnion {
2998 span: self.span(),
2999 items: vec![],
3000 };
3001 loop {
3002 self.bump_space();
3003 if self.is_eof() {
3004 return Err(self.unclosed_class_error());
3005 }
3006 match self.char() {
3007 '[' => {
3008 if !self.parser().stack_class.borrow().is_empty() {
3009 if let Some(cls) = self.maybe_parse_ascii_class() {
3010 union.push(ClassSetItem::Ascii(cls));
3011 continue;
3012 }
3013 }
3014 union = self.push_class_open(union)?;
3015 }
3016 ']' => match self.pop_class(union)? {
3017 Either::Left(nested_union) => {
3018 union = nested_union;
3019 }
3020 Either::Right(class) => return Ok(class),
3021 },
3022 '&' if self.peek() == Some('&') => {
3023 assert!(self.bump_if("&&"));
3024 union = self.push_class_op(ClassSetBinaryOpKind::Intersection, union);
3025 }
3026 '-' if self.peek() == Some('-') => {
3027 assert!(self.bump_if("--"));
3028 union = self.push_class_op(ClassSetBinaryOpKind::Difference, union);
3029 }
3030 '~' if self.peek() == Some('~') => {
3031 assert!(self.bump_if("~~"));
3032 union = self.push_class_op(ClassSetBinaryOpKind::SymmetricDifference, union);
3033 }
3034 _ => {
3035 union.push(self.parse_set_class_range()?);
3036 }
3037 }
3038 }
3039 }
3040
3041 #[inline(never)]
3042 fn parse_set_class_range(&self) -> Result<ClassSetItem> {
3043 let prim1 = self.parse_set_class_item()?;
3044 self.bump_space();
3045 if self.is_eof() {
3046 return Err(self.unclosed_class_error());
3047 }
3048 if self.char() != '-' || self.peek_space() == Some(']') || self.peek_space() == Some('-') {
3049 return prim1.into_class_set_item(self);
3050 }
3051 if !self.bump_and_bump_space() {
3052 return Err(self.unclosed_class_error());
3053 }
3054 let prim2 = self.parse_set_class_item()?;
3055 let range = ClassSetRange {
3056 span: Span::new(prim1.span().start, prim2.span().end),
3057 start: prim1.into_class_literal(self)?,
3058 end: prim2.into_class_literal(self)?,
3059 };
3060 if !range.is_valid() {
3061 return Err(self.error(range.span, ast::ErrorKind::ClassRangeInvalid));
3062 }
3063 Ok(ClassSetItem::Range(range))
3064 }
3065
3066 #[inline(never)]
3067 fn parse_set_class_item(&self) -> Result<Primitive> {
3068 if self.char() == '\\' {
3069 self.parse_escape()
3070 } else {
3071 let x = Primitive::Literal(Literal {
3072 span: self.span_char(),
3073 kind: LiteralKind::Verbatim,
3074 c: self.char(),
3075 });
3076 self.bump();
3077 Ok(x)
3078 }
3079 }
3080
3081 #[inline(never)]
3082 fn parse_set_class_open(&self) -> Result<(ClassBracketed, ClassSetUnion)> {
3083 assert_eq!(self.char(), '[');
3084 let start = self.pos();
3085 if !self.bump_and_bump_space() {
3086 return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
3087 }
3088
3089 let negated = if self.char() != '^' {
3090 false
3091 } else {
3092 if !self.bump_and_bump_space() {
3093 return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
3094 }
3095 true
3096 };
3097 let mut union = ClassSetUnion {
3099 span: self.span(),
3100 items: vec![],
3101 };
3102 while self.char() == '-' {
3103 union.push(ClassSetItem::Literal(Literal {
3104 span: self.span_char(),
3105 kind: LiteralKind::Verbatim,
3106 c: '-',
3107 }));
3108 if !self.bump_and_bump_space() {
3109 return Err(self.error(Span::new(start, start), ast::ErrorKind::ClassUnclosed));
3110 }
3111 }
3112 if union.items.is_empty() && self.char() == ']' {
3115 union.push(ClassSetItem::Literal(Literal {
3116 span: self.span_char(),
3117 kind: LiteralKind::Verbatim,
3118 c: ']',
3119 }));
3120 if !self.bump_and_bump_space() {
3121 return Err(self.error(Span::new(start, self.pos()), ast::ErrorKind::ClassUnclosed));
3122 }
3123 }
3124 let set = ClassBracketed {
3125 span: Span::new(start, self.pos()),
3126 negated,
3127 kind: ClassSet::union(ClassSetUnion {
3128 span: Span::new(union.span.start, union.span.start),
3129 items: vec![],
3130 }),
3131 };
3132 Ok((set, union))
3133 }
3134
3135 #[inline(never)]
3136 fn maybe_parse_ascii_class(&self) -> Option<ClassAscii> {
3137 assert_eq!(self.char(), '[');
3138 let start = self.pos();
3140 let mut negated = false;
3141 if !self.bump() || self.char() != ':' {
3142 self.parser().pos.set(start);
3143 return None;
3144 }
3145 if !self.bump() {
3146 self.parser().pos.set(start);
3147 return None;
3148 }
3149 if self.char() == '^' {
3150 negated = true;
3151 if !self.bump() {
3152 self.parser().pos.set(start);
3153 return None;
3154 }
3155 }
3156 let name_start = self.offset();
3157 while self.char() != ':' && self.bump() {}
3158 if self.is_eof() {
3159 self.parser().pos.set(start);
3160 return None;
3161 }
3162 let name = &self.pattern()[name_start..self.offset()];
3163 if !self.bump_if(":]") {
3164 self.parser().pos.set(start);
3165 return None;
3166 }
3167 let kind = match regex_syntax::ast::ClassAsciiKind::from_name(name) {
3168 Some(kind) => kind,
3169 None => {
3170 self.parser().pos.set(start);
3171 return None;
3172 }
3173 };
3174 Some(ClassAscii {
3175 span: Span::new(start, self.pos()),
3176 kind,
3177 negated,
3178 })
3179 }
3180
3181 #[inline(never)]
3182 fn parse_unicode_class(&self) -> Result<ClassUnicode> {
3183 assert!(self.char() == 'p' || self.char() == 'P');
3184
3185 let mut scratch = self.parser().scratch.borrow_mut();
3186 scratch.clear();
3187
3188 let negated = self.char() == 'P';
3189 if !self.bump_and_bump_space() {
3190 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
3191 }
3192 let (start, kind) = if self.char() == '{' {
3193 let start = self.span_char().end;
3194 while self.bump_and_bump_space() && self.char() != '}' {
3195 scratch.push(self.char());
3196 }
3197 if self.is_eof() {
3198 return Err(self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
3199 }
3200 assert_eq!(self.char(), '}');
3201 self.bump();
3202
3203 let name = scratch.as_str();
3204 if let Some(i) = name.find("!=") {
3205 (
3206 start,
3207 ClassUnicodeKind::NamedValue {
3208 op: ClassUnicodeOpKind::NotEqual,
3209 name: name[..i].to_string(),
3210 value: name[i + 2..].to_string(),
3211 },
3212 )
3213 } else if let Some(i) = name.find(':') {
3214 (
3215 start,
3216 ClassUnicodeKind::NamedValue {
3217 op: ClassUnicodeOpKind::Colon,
3218 name: name[..i].to_string(),
3219 value: name[i + 1..].to_string(),
3220 },
3221 )
3222 } else if let Some(i) = name.find('=') {
3223 (
3224 start,
3225 ClassUnicodeKind::NamedValue {
3226 op: ClassUnicodeOpKind::Equal,
3227 name: name[..i].to_string(),
3228 value: name[i + 1..].to_string(),
3229 },
3230 )
3231 } else {
3232 (start, ClassUnicodeKind::Named(name.to_string()))
3233 }
3234 } else {
3235 let start = self.pos();
3236 let c = self.char();
3237 if c == '\\' {
3238 return Err(self.error(self.span_char(), ast::ErrorKind::UnicodeClassInvalid));
3239 }
3240 self.bump_and_bump_space();
3241 let kind = ClassUnicodeKind::OneLetter(c);
3242 (start, kind)
3243 };
3244 Ok(ClassUnicode {
3245 span: Span::new(start, self.pos()),
3246 negated,
3247 kind,
3248 })
3249 }
3250
3251 #[inline(never)]
3252 fn parse_perl_class(&self) -> ClassPerl {
3253 let c = self.char();
3254 let span = self.span_char();
3255 self.bump();
3256 let (negated, kind) = match c {
3257 'd' => (false, regex_syntax::ast::ClassPerlKind::Digit),
3258 'D' => (true, regex_syntax::ast::ClassPerlKind::Digit),
3259 's' => (false, regex_syntax::ast::ClassPerlKind::Space),
3260 'S' => (true, regex_syntax::ast::ClassPerlKind::Space),
3261 'w' => (false, regex_syntax::ast::ClassPerlKind::Word),
3262 'W' => (true, regex_syntax::ast::ClassPerlKind::Word),
3263 c => panic!("expected valid Perl class but got '{}'", c),
3264 };
3265 ClassPerl {
3266 span,
3267 kind,
3268 negated,
3269 }
3270 }
3271}
3272
3273fn is_universal_perl_pair(item: ®ex_syntax::ast::ClassSetItem) -> bool {
3274 use regex_syntax::ast::ClassSetItem;
3275 let items = match item {
3276 ClassSetItem::Union(u) => &u.items,
3277 _ => return false,
3278 };
3279 if items.len() != 2 {
3280 return false;
3281 }
3282 match (&items[0], &items[1]) {
3283 (ClassSetItem::Perl(a), ClassSetItem::Perl(b)) => {
3284 let is_all = a.kind == b.kind && a.negated != b.negated;
3285 is_all
3286 }
3287 _ => false,
3288 }
3289}
3290
3291pub fn max_concat_length(ast: &ast::Ast) -> usize {
3292 match ast {
3293 ast::Ast::Empty(_)
3294 | ast::Ast::Flags(_)
3295 | ast::Ast::Literal(_)
3296 | ast::Ast::Dot(_)
3297 | ast::Ast::Top(_)
3298 | ast::Ast::Assertion(_)
3299 | ast::Ast::ClassUnicode(_)
3300 | ast::Ast::ClassPerl(_)
3301 | ast::Ast::ClassBracketed(_) => 0,
3302 ast::Ast::Group(g) => max_concat_length(&g.ast),
3303 ast::Ast::Complement(c) => max_concat_length(&c.ast),
3304 ast::Ast::Lookaround(l) => max_concat_length(&l.ast),
3305 ast::Ast::Repetition(r) => max_concat_length(&r.ast),
3306 ast::Ast::Concat(c) => c
3307 .asts
3308 .len()
3309 .max(c.asts.iter().map(max_concat_length).max().unwrap_or(0)),
3310 ast::Ast::Alternation(a) => a.asts.iter().map(max_concat_length).max().unwrap_or(0),
3311 ast::Ast::Intersection(i) => i.asts.iter().map(max_concat_length).max().unwrap_or(0),
3312 }
3313}
3314
3315pub fn expanded_ast_size(ast: &ast::Ast, limit: u64) -> u64 {
3316 fn go(ast: &ast::Ast, limit: u64) -> u64 {
3317 match ast {
3318 ast::Ast::Empty(_) | ast::Ast::Flags(_) => 1,
3319 ast::Ast::Literal(_) | ast::Ast::Dot(_) | ast::Ast::Top(_) => 1,
3320 ast::Ast::Assertion(_) => 1,
3321 ast::Ast::ClassUnicode(_) | ast::Ast::ClassPerl(_) | ast::Ast::ClassBracketed(_) => 1,
3322 ast::Ast::Group(g) => go(&g.ast, limit).saturating_add(1).min(limit),
3323 ast::Ast::Complement(c) => go(&c.ast, limit).saturating_add(1).min(limit),
3324 ast::Ast::Lookaround(l) => go(&l.ast, limit).saturating_add(1).min(limit),
3325 ast::Ast::Concat(c) => sum_children(&c.asts, limit),
3326 ast::Ast::Alternation(a) => sum_children(&a.asts, limit),
3327 ast::Ast::Intersection(i) => sum_children(&i.asts, limit),
3328 ast::Ast::Repetition(r) => {
3329 let body = go(&r.ast, limit);
3330 let factor: u64 = match &r.op.kind {
3331 ast::RepetitionKind::ZeroOrOne => 2,
3332 ast::RepetitionKind::ZeroOrMore | ast::RepetitionKind::OneOrMore => 2,
3333 ast::RepetitionKind::Range(ast::RepetitionRange::Exactly(n)) => {
3334 (*n as u64).max(1)
3335 }
3336 ast::RepetitionKind::Range(ast::RepetitionRange::AtLeast(n)) => {
3337 (*n as u64).max(1).saturating_add(1)
3338 }
3339 ast::RepetitionKind::Range(ast::RepetitionRange::Bounded(_, m)) => {
3340 (*m as u64).max(1)
3341 }
3342 };
3343 body.saturating_mul(factor).min(limit)
3344 }
3345 }
3346 }
3347 fn sum_children(children: &[ast::Ast], limit: u64) -> u64 {
3348 let mut total: u64 = 0;
3349 for c in children {
3350 total = total.saturating_add(go(c, limit));
3351 if total >= limit {
3352 return limit;
3353 }
3354 }
3355 total
3356 }
3357 go(ast, limit)
3358}
3359
3360pub fn parse_ast<'s>(tb: &mut TB<'s>, pattern: &'s str) -> std::result::Result<NodeId, ParseError> {
3361 let mut p: ResharpParser<'s> = ResharpParser::new(pattern);
3362 p.parse(tb)
3363}
3364
3365pub fn parse_ast_with<'s>(
3366 tb: &mut TB<'s>,
3367 pattern: &'s str,
3368 flags: &PatternFlags,
3369) -> std::result::Result<NodeId, ParseError> {
3370 let mut p: ResharpParser<'s> = ResharpParser::with_flags(pattern, flags);
3371 p.parse(tb)
3372}
3373
3374pub fn parse_to_ast(pattern: &str) -> std::result::Result<ast::Ast, ParseError> {
3376 let mut p: ResharpParser = ResharpParser::new(pattern);
3377 p.parse_inner()
3378}