1use std::ops::Range;
3
4use nom::branch::alt;
5use nom::bytes::complete::{tag, take};
6use nom::character::complete::{anychar, char, digit0, digit1, multispace0, none_of};
7use nom::combinator::{cut, map, opt};
8use nom::multi::many0;
9use nom::sequence::{delimited, separated_pair, terminated};
10use nom::Parser;
11
12use crate::error::ErrorKind;
13
14use super::error::Error;
15use super::nom_recipes::rtrim;
16use super::types::{Input, ParseResult};
17
/// A parsed regular expression, together with its trailing flags.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct Regex {
    /// Syntax tree of the expression found between the `/` delimiters.
    pub ast: Node,
    /// Set when the `i` flag follows the closing `/`.
    pub case_insensitive: bool,
    /// Set when the `s` flag follows the closing `/`.
    pub dot_all: bool,

    /// Span of the regex in the parsed input, starting at the opening `/`
    /// and including the flags.
    pub span: Range<usize>,
}
31
/// A node of the regular expression syntax tree.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum Node {
    /// Alternation between several nodes, i.e. `a|b|...`.
    Alternation(Vec<Node>),

    /// Assertion such as `^`, `$`, `\b` or `\B`.
    Assertion(AssertionKind),

    /// Character class, either a perl class (`\w`, `\d`, ...) or a bracketed
    /// one (`[...]`).
    Class(ClassKind),

    /// Sequence of nodes written one after the other.
    Concat(Vec<Node>),

    /// The `.` metacharacter.
    Dot,

    /// Empty expression, e.g. the contents of `()` or an empty alternation
    /// branch.
    Empty,

    /// A single byte, either an ASCII character or the result of an escape
    /// sequence.
    Literal(Literal),

    /// A single non-ASCII character.
    Char(LiteralChar),

    /// A parenthesized sub-expression, i.e. `(...)`.
    Group(Box<Node>),

    /// A node followed by a repetition operator.
    Repetition {
        /// The node being repeated.
        node: Box<Node>,

        /// Which repetition operator was used.
        kind: RepetitionKind,

        /// `false` when the operator is followed by `?` (e.g. `*?`), `true`
        /// otherwise.
        greedy: bool,
    },
}
74
/// The two syntaxes a character class can be written in.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum ClassKind {
    /// Perl-style class, e.g. `\w` or `\D`.
    Perl(PerlClass),
    /// Bracketed class, e.g. `[a-z]` or `[^0-9]`.
    Bracketed(BracketedClass),
}
83
/// A perl-style character class such as `\w` or `\S`.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct PerlClass {
    /// Which class is designated.
    pub kind: PerlClassKind,
    /// `true` for the uppercase forms (`\W`, `\S`, `\D`).
    pub negated: bool,
}
92
/// The kinds of perl-style character classes.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum PerlClassKind {
    /// `\w` (or `\W` when negated).
    Word,
    /// `\s` (or `\S` when negated).
    Space,
    /// `\d` (or `\D` when negated).
    Digit,
}
103
/// A bracketed character class, i.e. `[...]`.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct BracketedClass {
    /// Items listed between the brackets.
    ///
    /// When the class starts with a literal `]` (as in `[]a]`), that literal
    /// is stored last, after all the other items.
    pub items: Vec<BracketedClassItem>,
    /// `true` when the class starts with `^`.
    pub negated: bool,
}
112
/// A single item of a bracketed character class.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum BracketedClassItem {
    /// Perl-style class nested in the brackets, e.g. the `\d` in `[\d_]`.
    Perl(PerlClass),
    /// A single byte.
    Literal(Literal),
    /// A range `from-to`; the parser rejects ranges where `to` is lower than
    /// `from`.
    Range(Literal, Literal),
}
123
/// The repetition operators.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum RepetitionKind {
    /// `?`
    ZeroOrOne,
    /// `*`
    ZeroOrMore,
    /// `+`
    OneOrMore,
    /// `{...}`
    Range(RepetitionRange),
}
136
/// The forms of the `{...}` repetition operator.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum RepetitionRange {
    /// `{n}`
    Exactly(u32),
    /// `{n,}`; also produced, with a value of 0, by `{,}`.
    AtLeast(u32),
    /// `{n,m}`; `{,m}` is parsed as `Bounded(0, m)`.
    Bounded(u32, u32),
}
147
/// The kinds of assertions.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum AssertionKind {
    /// `^`
    StartLine,
    /// `$`
    EndLine,
    /// `\b`
    WordBoundary,
    /// `\B`
    NonWordBoundary,
}
160
/// A literal non-ASCII character.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct LiteralChar {
    /// The character.
    pub c: char,

    /// Span of the character in the parsed input, including the leading
    /// backslash if there is one.
    pub span: Range<usize>,

    /// `true` when the character was preceded by a backslash, e.g. `\é`.
    pub escaped: bool,
}
175
/// A literal byte.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct Literal {
    /// Value of the byte.
    pub byte: u8,

    /// Span of the literal in the parsed input, including the whole escape
    /// sequence if there is one.
    pub span: Range<usize>,

    /// `true` when the byte was written as a backslash followed by a
    /// character that has no special meaning (e.g. `\k` or `\/`).
    ///
    /// It is `false` for unescaped characters and for the recognized escape
    /// sequences (`\n`, `\t`, `\r`, `\f`, `\a`, `\xNN`).
    pub escaped: bool,
}
190
191impl PartialOrd for Literal {
192 fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
193 self.byte.partial_cmp(&other.byte)
194 }
195}
196
197pub fn parse_regex(input: &str) -> Result<Regex, Error> {
205 use nom::Finish;
206
207 let input = Input::new(input);
208 let (_, res) = regex(input).finish()?;
209
210 Ok(res)
211}
212
213pub(crate) fn regex(input: Input) -> ParseResult<Regex> {
222 let start = input.pos();
223 let (input, _) = char('/').parse(input)?;
224
225 let (input, ast) = cut(terminated(alternative, char('/'))).parse(input)?;
228 let (input, (no_case, dot_all)) = rtrim((opt(char('i')), opt(char('s')))).parse(input)?;
229
230 Ok((
231 input,
232 Regex {
233 ast,
234 case_insensitive: no_case.is_some(),
235 dot_all: dot_all.is_some(),
236 span: input.get_span_from(start),
237 },
238 ))
239}
240
241fn alternative(mut input: Input) -> ParseResult<Node> {
242 if input.string_recursion_counter >= input.params.string_recursion_limit {
248 return Err(nom::Err::Failure(Error::new(
249 input.get_span_from(input.pos()),
250 ErrorKind::RegexTooDeep,
251 )));
252 }
253
254 let mut alts = Vec::new();
255 loop {
256 input.string_recursion_counter += 1;
257 let (mut input2, node) = concatenation(input)?;
258 input2.string_recursion_counter -= 1;
259
260 let (input2, has_alt_char) = eat_opt_char('|', input2);
261 if has_alt_char {
262 alts.push(node);
263 input = input2;
264 continue;
265 }
266
267 return Ok((
268 input2,
269 if alts.is_empty() {
270 node
271 } else {
272 alts.push(node);
273 Node::Alternation(alts)
274 },
275 ));
276 }
277}
278
279fn eat_opt_char(c: char, mut input: Input) -> (Input, bool) {
280 match input.cursor().chars().next() {
281 Some(c2) if c2 == c => {
282 input.advance(c.len_utf8());
283 (input, true)
284 }
285 _ => (input, false),
286 }
287}
288
289fn concatenation(input: Input) -> ParseResult<Node> {
290 let (input, mut nodes) = many0(repeat).parse(input)?;
291
292 let node = if nodes.is_empty() {
293 Node::Empty
294 } else if nodes.len() == 1 {
295 nodes.pop().unwrap()
296 } else {
297 Node::Concat(nodes)
298 };
299
300 Ok((input, node))
301}
302
303fn repeat(input: Input) -> ParseResult<Node> {
304 if let Ok((input, node)) = assertion(input) {
306 return Ok((input, node));
307 }
308
309 let (input, node) = single(input)?;
311 let (input, repetition) = opt(repetition).parse(input)?;
312 match repetition {
313 Some((kind, greedy)) => Ok((
314 input,
315 Node::Repetition {
316 node: Box::new(node),
317 kind,
318 greedy,
319 },
320 )),
321 None => Ok((input, node)),
322 }
323}
324
325fn assertion(input: Input) -> ParseResult<Node> {
327 alt((
328 map(tag(r"\b"), |_| Node::Assertion(AssertionKind::WordBoundary)),
329 map(tag(r"\B"), |_| {
330 Node::Assertion(AssertionKind::NonWordBoundary)
331 }),
332 map(char('^'), |_| Node::Assertion(AssertionKind::StartLine)),
333 map(char('$'), |_| Node::Assertion(AssertionKind::EndLine)),
334 ))
335 .parse(input)
336}
337
338fn repetition(input: Input) -> ParseResult<(RepetitionKind, bool)> {
339 alt((
340 map(tag("*?"), |_| (RepetitionKind::ZeroOrMore, false)),
341 map(tag("+?"), |_| (RepetitionKind::OneOrMore, false)),
342 map(tag("??"), |_| (RepetitionKind::ZeroOrOne, false)),
343 map(tag("*"), |_| (RepetitionKind::ZeroOrMore, true)),
344 map(tag("+"), |_| (RepetitionKind::OneOrMore, true)),
345 map(tag("?"), |_| (RepetitionKind::ZeroOrOne, true)),
346 map(range_repetition, |(kind, greedy)| {
347 (RepetitionKind::Range(kind), greedy)
348 }),
349 ))
350 .parse(input)
351}
352
353fn single(input: Input) -> ParseResult<Node> {
354 alt((
355 map(delimited(char('('), alternative, char(')')), |node| {
356 Node::Group(Box::new(node))
357 }),
358 map(char('.'), |_| Node::Dot),
359 map(perl_class, |p| Node::Class(ClassKind::Perl(p))),
360 map(bracketed_class, |p| Node::Class(ClassKind::Bracketed(p))),
361 escaped_char,
362 literal,
363 ))
364 .parse(input)
365}
366
367fn perl_class(input: Input) -> ParseResult<PerlClass> {
368 alt((
369 map(tag(r"\w"), |_| PerlClass {
370 kind: PerlClassKind::Word,
371 negated: false,
372 }),
373 map(tag(r"\W"), |_| PerlClass {
374 kind: PerlClassKind::Word,
375 negated: true,
376 }),
377 map(tag(r"\s"), |_| PerlClass {
378 kind: PerlClassKind::Space,
379 negated: false,
380 }),
381 map(tag(r"\S"), |_| PerlClass {
382 kind: PerlClassKind::Space,
383 negated: true,
384 }),
385 map(tag(r"\d"), |_| PerlClass {
386 kind: PerlClassKind::Digit,
387 negated: false,
388 }),
389 map(tag(r"\D"), |_| PerlClass {
390 kind: PerlClassKind::Digit,
391 negated: true,
392 }),
393 ))
394 .parse(input)
395}
396
397fn bracketed_class(input: Input) -> ParseResult<BracketedClass> {
398 let (input, _) = char('[').parse(input)?;
399 cut(bracketed_class_inner).parse(input)
401}
402
403fn bracketed_class_inner(input: Input) -> ParseResult<BracketedClass> {
404 let (input, negated) = eat_opt_char('^', input);
405 let start = input.pos();
406 let (input2, contains_closing_bracket) = eat_opt_char(']', input);
407
408 let (input, mut items) = many0(bracketed_class_item).parse(input2)?;
409 let (input, _) = char(']').parse(input)?;
410
411 if contains_closing_bracket {
412 items.push(BracketedClassItem::Literal(Literal {
413 byte: b']',
414 span: input2.get_span_from_no_rtrim(start),
415 escaped: false,
416 }));
417 }
418 Ok((input, BracketedClass { items, negated }))
419}
420
421fn bracketed_class_item(input: Input) -> ParseResult<BracketedClassItem> {
422 alt((
423 map(perl_class, BracketedClassItem::Perl),
424 bracketed_class_range_or_literal,
425 ))
426 .parse(input)
427}
428
429fn bracketed_class_range_or_literal(input: Input) -> ParseResult<BracketedClassItem> {
430 let start = input.pos();
431 let (input, lit) = bracketed_class_literal(input)?;
432 let (input2, has_dash) = eat_opt_char('-', input);
433
434 if has_dash {
435 let (input3, lit2) = opt(bracketed_class_literal).parse(input2)?;
436 match lit2 {
437 Some(lit2) if lit2 < lit => Err(nom::Err::Failure(Error::new(
438 input3.get_span_from_no_rtrim(start),
439 ErrorKind::RegexClassRangeInvalid,
440 ))),
441 Some(lit2) => Ok((input3, BracketedClassItem::Range(lit, lit2))),
442 None => Ok((input, BracketedClassItem::Literal(lit))),
443 }
444 } else {
445 Ok((input, BracketedClassItem::Literal(lit)))
446 }
447}
448
449fn bracketed_class_literal(input: Input) -> ParseResult<Literal> {
450 alt((escaped_char_only_ascii, bracketed_class_char)).parse(input)
451}
452
453fn bracketed_class_char(input: Input) -> ParseResult<Literal> {
454 let start = input.pos();
455
456 let (input, b) = none_of("/\n]").parse(input)?;
460 if b.is_ascii() {
461 Ok((
462 input,
463 Literal {
464 byte: b as u8,
465 span: input.get_span_from_no_rtrim(start),
466 escaped: false,
467 },
468 ))
469 } else {
470 Err(nom::Err::Failure(Error::new(
471 input.get_span_from_no_rtrim(start),
472 ErrorKind::RegexNonAsciiByte,
473 )))
474 }
475}
476
477fn literal(input: Input) -> ParseResult<Node> {
478 let start = input.pos();
479
480 let (input, c) = none_of("/\n()[\\|.$^+*?").parse(input)?;
484 let node = if c.is_ascii() {
485 Node::Literal(Literal {
486 byte: c as u8,
487 span: input.get_span_from_no_rtrim(start),
488 escaped: false,
489 })
490 } else {
491 Node::Char(LiteralChar {
492 c,
493 span: input.get_span_from_no_rtrim(start),
494 escaped: false,
495 })
496 };
497
498 Ok((input, node))
499}
500
501fn escaped_char(input: Input) -> ParseResult<Node> {
502 let (input, res) = escaped_char_inner(input)?;
503
504 let node = match res {
505 Escaped {
506 kind: EscapedKind::Byte(byte),
507 span,
508 escaped,
509 } => Node::Literal(Literal {
510 byte,
511 span,
512 escaped,
513 }),
514 Escaped {
515 kind: EscapedKind::Char(c),
516 span,
517 escaped,
518 } => Node::Char(LiteralChar { c, span, escaped }),
519 };
520
521 Ok((input, node))
522}
523
524fn escaped_char_only_ascii(input: Input) -> ParseResult<Literal> {
525 let (input, res) = escaped_char_inner(input)?;
526
527 match res {
528 Escaped {
529 kind: EscapedKind::Byte(byte),
530 span,
531 escaped,
532 } => Ok((
533 input,
534 Literal {
535 byte,
536 span,
537 escaped,
538 },
539 )),
540 Escaped {
541 kind: EscapedKind::Char(_),
542 span,
543 ..
544 } => Err(nom::Err::Failure(Error::new(
545 span,
546 ErrorKind::RegexNonAsciiByte,
547 ))),
548 }
549}
550
551fn escaped_char_inner(input: Input) -> ParseResult<Escaped> {
552 let start = input.pos();
553 let (input2, _) = char('\\').parse(input)?;
554 let (input, b) = anychar(input2)?;
555
556 let span = input.get_span_from_no_rtrim(start);
557 let (kind, escaped) = match b {
558 'n' => (EscapedKind::Byte(b'\n'), false),
559 't' => (EscapedKind::Byte(b'\t'), false),
560 'r' => (EscapedKind::Byte(b'\r'), false),
561 'f' => (EscapedKind::Byte(b'\x0C'), false),
562 'a' => (EscapedKind::Byte(b'\x07'), false),
563 'x' => {
564 let (input, n) = cut(take(2_u32)).parse(input)?;
565
566 let n = match u8::from_str_radix(&n, 16) {
567 Ok(n) => n,
568 Err(e) => {
569 return Err(nom::Err::Failure(Error::new(
570 input.get_span_from_no_rtrim(start),
571 ErrorKind::StrToHexIntError(e),
572 )));
573 }
574 };
575 return Ok((
576 input,
577 Escaped {
578 kind: EscapedKind::Byte(n),
579 span: input.get_span_from_no_rtrim(start),
580 escaped: false,
581 },
582 ));
583 }
584 c if c.is_ascii() => (EscapedKind::Byte(c as u8), true),
585 c => (EscapedKind::Char(c), true),
586 };
587
588 Ok((
589 input,
590 Escaped {
591 kind,
592 span,
593 escaped,
594 },
595 ))
596}
597
/// Result of parsing an escape sequence.
struct Escaped {
    /// The value the sequence resolves to.
    kind: EscapedKind,
    /// Span of the whole sequence, including the backslash.
    span: Range<usize>,
    /// `true` when the escaped character has no special meaning, `false` for
    /// the recognized sequences such as `\n` or `\xNN`.
    escaped: bool,
}
603
/// The value an escape sequence resolves to.
// The two variants wrap a `u8` and a `char`, hence the size difference the
// lint would report.
#[allow(variant_size_differences)]
enum EscapedKind {
    /// A single byte.
    Byte(u8),
    /// A non-ASCII character.
    Char(char),
}
609
610fn range_repetition(input: Input) -> ParseResult<(RepetitionRange, bool)> {
611 let (input, range) = alt((range_single, range_multi)).parse(input)?;
612 let (input, non_greedy) = eat_opt_char('?', input);
613
614 Ok((input, (range, !non_greedy)))
615}
616
617fn range_single(input: Input) -> ParseResult<RepetitionRange> {
618 let (input, v) = delimited(char('{'), parse_u32, char('}')).parse(input)?;
619
620 Ok((input, RepetitionRange::Exactly(v)))
621}
622
623fn range_multi(input: Input) -> ParseResult<RepetitionRange> {
624 let start = input.pos();
625 let (input, (from, to)) = delimited(
626 char('{'),
627 separated_pair(
628 parse_opt_u32,
629 delimited(multispace0, char(','), multispace0),
630 parse_opt_u32,
631 ),
632 char('}'),
633 )
634 .parse(input)?;
635
636 let range = match (from, to) {
637 (None, None) => RepetitionRange::AtLeast(0),
638 (Some(from), None) => RepetitionRange::AtLeast(from),
639 (None, Some(to)) => RepetitionRange::Bounded(0, to),
640 (Some(from), Some(to)) if to < from => {
641 return Err(nom::Err::Failure(Error::new(
642 input.get_span_from_no_rtrim(start),
643 ErrorKind::RegexRangeInvalid,
644 )))
645 }
646 (Some(from), Some(to)) => RepetitionRange::Bounded(from, to),
647 };
648
649 Ok((input, range))
650}
651
652fn parse_u32(input: Input) -> ParseResult<u32> {
653 let start = input.pos();
654 let (input, v) = digit1(input)?;
655
656 let n = match str::parse::<u32>(&v) {
657 Ok(n) => n,
658 Err(e) => {
659 return Err(nom::Err::Failure(Error::new(
660 input.get_span_from_no_rtrim(start),
661 ErrorKind::StrToIntError(e),
662 )))
663 }
664 };
665
666 Ok((input, n))
667}
668
669fn parse_opt_u32(input: Input) -> ParseResult<Option<u32>> {
670 let start = input.pos();
671 let (input, v) = match digit0::<_, Error>(input) {
672 Ok((input, s)) if !s.is_empty() => (input, s),
673 _ => return Ok((input, None)),
674 };
675
676 let n = match str::parse::<u32>(&v) {
677 Ok(n) => n,
678 Err(e) => {
679 return Err(nom::Err::Failure(Error::new(
680 input.get_span_from_no_rtrim(start),
681 ErrorKind::StrToIntError(e),
682 )))
683 }
684 };
685
686 Ok((input, Some(n)))
687}
688
// Unit tests for the regex parsers.
//
// Most tests go through the `parse` helper, called with a parser, an input,
// the part of the input expected to be left unparsed and the expected value,
// and through `parse_err`, called with a parser and an input that must be
// rejected.
#[cfg(test)]
mod tests {
    use crate::test_helpers::{parse, parse_err, test_public_type};
    use crate::types::Params;
    use nom::Finish;

    use super::*;

    // Shorthand to build a `Literal`.
    fn lit(byte: u8, span: Range<usize>, escaped: bool) -> Literal {
        Literal {
            byte,
            span,
            escaped,
        }
    }

    #[test]
    fn test_parse() {
        parse(
            regex,
            "/a/i",
            "",
            Regex {
                ast: Node::Literal(lit(b'a', 1..2, false)),
                case_insensitive: true,
                dot_all: false,
                span: 0..4,
            },
        );
        parse(
            regex,
            "/[^0-9]+/a",
            "a",
            Regex {
                ast: Node::Repetition {
                    node: Box::new(Node::Class(ClassKind::Bracketed(BracketedClass {
                        items: vec![BracketedClassItem::Range(
                            lit(b'0', 3..4, false),
                            lit(b'9', 5..6, false),
                        )],
                        negated: true,
                    }))),
                    kind: RepetitionKind::OneOrMore,
                    greedy: true,
                },
                case_insensitive: false,
                dot_all: false,
                span: 0..9,
            },
        );
        parse(
            regex,
            r"/a\/b\cd/isb",
            "b",
            Regex {
                ast: Node::Concat(vec![
                    Node::Literal(lit(b'a', 1..2, false)),
                    Node::Literal(lit(b'/', 2..4, true)),
                    Node::Literal(lit(b'b', 4..5, false)),
                    Node::Literal(lit(b'c', 5..7, true)),
                    Node::Literal(lit(b'd', 7..8, false)),
                ]),
                case_insensitive: true,
                dot_all: true,
                span: 0..11,
            },
        );
        // Flags are only recognized in the `i` then `s` order: the `i`
        // following the `s` is left unparsed.
        parse(
            regex,
            r"/.{2}/si c",
            "i c",
            Regex {
                ast: Node::Repetition {
                    node: Box::new(Node::Dot),
                    kind: RepetitionKind::Range(RepetitionRange::Exactly(2)),
                    greedy: true,
                },
                case_insensitive: false,
                dot_all: true,
                span: 0..7,
            },
        );
        parse(
            regex,
            "/\0\\\0/ c",
            "c",
            Regex {
                ast: Node::Concat(vec![
                    Node::Literal(lit(b'\0', 1..2, false)),
                    Node::Literal(lit(b'\0', 2..4, true)),
                ]),
                case_insensitive: false,
                dot_all: false,
                span: 0..5,
            },
        );

        parse_err(regex, "");
        parse_err(regex, "/");
        parse_err(regex, "/\n/");
        parse_err(regex, "/a{2}");
        parse_err(regex, "/a///");
        parse_err(regex, "/a{5,4}/");
    }

    #[test]
    fn test_alternative() {
        parse(alternative, "(", "(", Node::Empty);
        parse(
            alternative,
            "a)",
            ")",
            Node::Literal(lit(b'a', 0..1, false)),
        );
        parse(
            alternative,
            "a|b",
            "",
            Node::Alternation(vec![
                Node::Literal(lit(b'a', 0..1, false)),
                Node::Literal(lit(b'b', 2..3, false)),
            ]),
        );
        parse(
            alternative,
            "a|)",
            ")",
            Node::Alternation(vec![Node::Literal(lit(b'a', 0..1, false)), Node::Empty]),
        );

        parse(
            alternative,
            r"ab|.\||\b$|",
            "",
            Node::Alternation(vec![
                Node::Concat(vec![
                    Node::Literal(lit(b'a', 0..1, false)),
                    Node::Literal(lit(b'b', 1..2, false)),
                ]),
                Node::Concat(vec![Node::Dot, Node::Literal(lit(b'|', 4..6, true))]),
                Node::Concat(vec![
                    Node::Assertion(AssertionKind::WordBoundary),
                    Node::Assertion(AssertionKind::EndLine),
                ]),
                Node::Empty,
            ]),
        );

        parse_err(alternative, "\\xEG");
    }

    #[test]
    fn test_concatenation() {
        parse(concatenation, "", "", Node::Empty);
        parse(
            concatenation,
            "a",
            "",
            Node::Literal(lit(b'a', 0..1, false)),
        );
        parse(
            concatenation,
            "ab",
            "",
            Node::Concat(vec![
                Node::Literal(lit(b'a', 0..1, false)),
                Node::Literal(lit(b'b', 1..2, false)),
            ]),
        );
        // A repetition operator cannot be applied to an assertion.
        parse(
            concatenation,
            "a$*",
            "*",
            Node::Concat(vec![
                Node::Literal(lit(b'a', 0..1, false)),
                Node::Assertion(AssertionKind::EndLine),
            ]),
        );
        parse(
            concatenation,
            r"^a+\b\d{2,3}[^z]*?)",
            ")",
            Node::Concat(vec![
                Node::Assertion(AssertionKind::StartLine),
                Node::Repetition {
                    node: Box::new(Node::Literal(lit(b'a', 1..2, false))),
                    kind: RepetitionKind::OneOrMore,
                    greedy: true,
                },
                Node::Assertion(AssertionKind::WordBoundary),
                Node::Repetition {
                    node: Box::new(Node::Class(ClassKind::Perl(PerlClass {
                        kind: PerlClassKind::Digit,
                        negated: false,
                    }))),
                    kind: RepetitionKind::Range(RepetitionRange::Bounded(2, 3)),
                    greedy: true,
                },
                Node::Repetition {
                    node: Box::new(Node::Class(ClassKind::Bracketed(BracketedClass {
                        items: vec![BracketedClassItem::Literal(lit(b'z', 14..15, false))],
                        negated: true,
                    }))),
                    kind: RepetitionKind::ZeroOrMore,
                    greedy: false,
                },
            ]),
        );

        parse_err(concatenation, "\\xEG");
    }

    #[test]
    fn test_assertion() {
        parse(
            assertion,
            r"\ba",
            "a",
            Node::Assertion(AssertionKind::WordBoundary),
        );
        parse(
            assertion,
            r"\B ",
            " ",
            Node::Assertion(AssertionKind::NonWordBoundary),
        );
        parse(
            assertion,
            r"^^",
            "^",
            Node::Assertion(AssertionKind::StartLine),
        );
        parse(
            assertion,
            r"$^",
            "^",
            Node::Assertion(AssertionKind::EndLine),
        );

        parse_err(assertion, r"\w");
    }

    #[test]
    fn test_repetition() {
        parse(repetition, "*??", "?", (RepetitionKind::ZeroOrMore, false));
        parse(repetition, "+??", "?", (RepetitionKind::OneOrMore, false));
        parse(repetition, "???", "?", (RepetitionKind::ZeroOrOne, false));
        parse(repetition, "*a?", "a?", (RepetitionKind::ZeroOrMore, true));
        parse(repetition, "+a?", "a?", (RepetitionKind::OneOrMore, true));
        parse(repetition, "?a?", "a?", (RepetitionKind::ZeroOrOne, true));
        parse(
            repetition,
            "{5}??",
            "?",
            (RepetitionKind::Range(RepetitionRange::Exactly(5)), false),
        );

        parse_err(repetition, "5");
    }

    #[test]
    fn test_single() {
        parse(single, ".a", "a", Node::Dot);
        parse(single, "()a", "a", Node::Group(Box::new(Node::Empty)));
        parse(
            single,
            "(ab)a",
            "a",
            Node::Group(Box::new(Node::Concat(vec![
                Node::Literal(lit(b'a', 1..2, false)),
                Node::Literal(lit(b'b', 2..3, false)),
            ]))),
        );
        parse(
            single,
            r"\s",
            "",
            Node::Class(ClassKind::Perl(PerlClass {
                kind: PerlClassKind::Space,
                negated: false,
            })),
        );
        parse(
            single,
            r"[a-fA-F] ",
            " ",
            Node::Class(ClassKind::Bracketed(BracketedClass {
                items: vec![
                    BracketedClassItem::Range(lit(b'a', 1..2, false), lit(b'f', 3..4, false)),
                    BracketedClassItem::Range(lit(b'A', 4..5, false), lit(b'F', 6..7, false)),
                ],
                negated: false,
            })),
        );
        parse(
            single,
            r"\xFFa",
            "a",
            Node::Literal(lit(b'\xFF', 0..4, false)),
        );
        // A `]` outside of a class is a plain literal.
        parse(single, r"]a", "a", Node::Literal(lit(b']', 0..1, false)));

        parse_err(single, "");
        parse_err(single, "(");
        parse_err(single, ")");
        parse_err(single, "[");
        parse_err(single, "|");
        parse_err(single, "$");
        parse_err(single, "^");
        parse_err(single, "+");
        parse_err(single, "*");
        parse_err(single, "?");
        parse_err(single, "(a");
    }

    #[test]
    fn test_perl_class() {
        parse(
            perl_class,
            r"\w ",
            " ",
            PerlClass {
                kind: PerlClassKind::Word,
                negated: false,
            },
        );
        parse(
            perl_class,
            r"\Wa",
            "a",
            PerlClass {
                kind: PerlClassKind::Word,
                negated: true,
            },
        );
        parse(
            perl_class,
            r"\s",
            "",
            PerlClass {
                kind: PerlClassKind::Space,
                negated: false,
            },
        );
        parse(
            perl_class,
            r"\S\",
            "\\",
            PerlClass {
                kind: PerlClassKind::Space,
                negated: true,
            },
        );
        parse(
            perl_class,
            r"\d",
            "",
            PerlClass {
                kind: PerlClassKind::Digit,
                negated: false,
            },
        );
        parse(
            perl_class,
            r"\Da",
            "a",
            PerlClass {
                kind: PerlClassKind::Digit,
                negated: true,
            },
        );

        parse_err(perl_class, "");
        parse_err(perl_class, "\\");
        parse_err(perl_class, "\\k");
    }

    #[test]
    fn test_bracketed_class() {
        parse(
            bracketed_class,
            "[a]b",
            "b",
            BracketedClass {
                items: vec![BracketedClassItem::Literal(lit(b'a', 1..2, false))],
                negated: false,
            },
        );
        parse(
            bracketed_class,
            "[^a-z_\\S0-9]",
            "",
            BracketedClass {
                items: vec![
                    BracketedClassItem::Range(lit(b'a', 2..3, false), lit(b'z', 4..5, false)),
                    BracketedClassItem::Literal(lit(b'_', 5..6, false)),
                    BracketedClassItem::Perl(PerlClass {
                        kind: PerlClassKind::Space,
                        negated: true,
                    }),
                    BracketedClassItem::Range(lit(b'0', 8..9, false), lit(b'9', 10..11, false)),
                ],
                negated: true,
            },
        );
        // A leading `]` is a literal, and it is stored after the other items.
        parse(
            bracketed_class,
            "[]\\j]",
            "",
            BracketedClass {
                items: vec![
                    BracketedClassItem::Literal(lit(b'j', 2..4, true)),
                    BracketedClassItem::Literal(lit(b']', 1..2, false)),
                ],
                negated: false,
            },
        );
        parse(
            bracketed_class,
            "[]]",
            "",
            BracketedClass {
                items: vec![BracketedClassItem::Literal(lit(b']', 1..2, false))],
                negated: false,
            },
        );
        parse(
            bracketed_class,
            "[^]]",
            "",
            BracketedClass {
                items: vec![BracketedClassItem::Literal(lit(b']', 2..3, false))],
                negated: true,
            },
        );
        parse(
            bracketed_class,
            "[^a\\]b-]",
            "",
            BracketedClass {
                items: vec![
                    BracketedClassItem::Literal(lit(b'a', 2..3, false)),
                    BracketedClassItem::Literal(lit(b']', 3..5, true)),
                    BracketedClassItem::Literal(lit(b'b', 5..6, false)),
                    BracketedClassItem::Literal(lit(b'-', 6..7, false)),
                ],
                negated: true,
            },
        );

        parse_err(bracketed_class, "[");
        parse_err(bracketed_class, "[]");
        parse_err(bracketed_class, "[^]");
        parse_err(bracketed_class, "[é]");
        parse_err(bracketed_class, "[\\]");
        parse_err(bracketed_class, "[\\x]");
        parse_err(bracketed_class, "[\\x0]");
        parse_err(bracketed_class, "[\\é]");
    }

    #[test]
    fn test_bracketed_class_item() {
        parse(
            bracketed_class_item,
            "\\sw",
            "w",
            BracketedClassItem::Perl(PerlClass {
                kind: PerlClassKind::Space,
                negated: false,
            }),
        );
        parse(
            bracketed_class_item,
            "\\c-z]",
            "]",
            BracketedClassItem::Range(lit(b'c', 0..2, true), lit(b'z', 3..4, false)),
        );

        parse_err(bracketed_class_item, "é");
    }

    #[test]
    fn test_bracketed_class_range_or_literal() {
        parse(
            bracketed_class_range_or_literal,
            "ab",
            "b",
            BracketedClassItem::Literal(lit(b'a', 0..1, false)),
        );
        // A dash that is not followed by a literal is left in the input.
        parse(
            bracketed_class_range_or_literal,
            r"\x01-",
            "-",
            BracketedClassItem::Literal(lit(b'\x01', 0..4, false)),
        );
        parse(
            bracketed_class_range_or_literal,
            "-\\]",
            "\\]",
            BracketedClassItem::Literal(lit(b'-', 0..1, false)),
        );
        parse(
            bracketed_class_range_or_literal,
            "A-]",
            "-]",
            BracketedClassItem::Literal(lit(b'A', 0..1, false)),
        );

        parse(
            bracketed_class_range_or_literal,
            "a-\\sb",
            "b",
            BracketedClassItem::Range(lit(b'a', 0..1, false), lit(b's', 2..4, true)),
        );
        parse(
            bracketed_class_range_or_literal,
            "!--",
            "",
            BracketedClassItem::Range(lit(b'!', 0..1, false), lit(b'-', 2..3, false)),
        );
        parse(
            bracketed_class_range_or_literal,
            "---",
            "",
            BracketedClassItem::Range(lit(b'-', 0..1, false), lit(b'-', 2..3, false)),
        );
        parse(
            bracketed_class_range_or_literal,
            r"\n-\n",
            "",
            BracketedClassItem::Range(lit(b'\n', 0..2, false), lit(b'\n', 3..5, false)),
        );
        parse(
            bracketed_class_range_or_literal,
            r"\x01-\xFE",
            "",
            BracketedClassItem::Range(lit(b'\x01', 0..4, false), lit(b'\xFE', 5..9, false)),
        );

        parse_err(bracketed_class_range_or_literal, "é");
        parse_err(bracketed_class_range_or_literal, "b-a");
        parse_err(bracketed_class_range_or_literal, "é-a");
        parse_err(bracketed_class_range_or_literal, "a-é");
        parse_err(bracketed_class_range_or_literal, "]-a");
    }

    #[test]
    fn test_bracketed_class_literal() {
        parse(bracketed_class_literal, "ab", "b", lit(b'a', 0..1, false));
        parse(
            bracketed_class_literal,
            "\\nb",
            "b",
            lit(b'\n', 0..2, false),
        );
        parse(bracketed_class_literal, "\\]", "", lit(b']', 0..2, true));

        parse_err(bracketed_class_literal, "]b");
        parse_err(bracketed_class_literal, "é");
        parse_err(bracketed_class_literal, "\\x1");
        parse_err(bracketed_class_literal, "\\é");
    }

    #[test]
    fn test_bracketed_class_char() {
        parse(bracketed_class_char, "ab", "b", lit(b'a', 0..1, false));

        parse_err(bracketed_class_char, "]b");
        parse_err(bracketed_class_char, "é");
    }

    #[test]
    fn test_literal() {
        parse(literal, "ab", "b", Node::Literal(lit(b'a', 0..1, false)));
        parse(literal, "]", "", Node::Literal(lit(b']', 0..1, false)));

        // Non-ASCII characters are accepted outside of classes; the span is
        // in bytes.
        parse(
            literal,
            "éb",
            "b",
            Node::Char(LiteralChar {
                c: 'é',
                span: 0..2,
                escaped: false,
            }),
        );
    }

    #[test]
    fn test_escaped_char() {
        parse(
            escaped_char,
            "\\na",
            "a",
            Node::Literal(lit(b'\n', 0..2, false)),
        );
        parse(
            escaped_char,
            "\\ta",
            "a",
            Node::Literal(lit(b'\t', 0..2, false)),
        );
        parse(
            escaped_char,
            "\\ra",
            "a",
            Node::Literal(lit(b'\r', 0..2, false)),
        );
        parse(
            escaped_char,
            "\\fa",
            "a",
            Node::Literal(lit(b'\x0C', 0..2, false)),
        );
        parse(
            escaped_char,
            "\\aa",
            "a",
            Node::Literal(lit(b'\x07', 0..2, false)),
        );
        parse(
            escaped_char,
            "\\x00a",
            "a",
            Node::Literal(lit(b'\0', 0..4, false)),
        );
        parse(
            escaped_char,
            "\\xAF a",
            " a",
            Node::Literal(lit(b'\xAF', 0..4, false)),
        );
        parse(
            escaped_char,
            "\\k",
            "",
            Node::Literal(lit(b'k', 0..2, true)),
        );
        parse(
            escaped_char,
            "\\é_",
            "_",
            Node::Char(LiteralChar {
                c: 'é',
                span: 0..3,
                escaped: true,
            }),
        );

        parse_err(escaped_char, "\\");
        parse_err(escaped_char, "\\x");
        parse_err(escaped_char, "\\x2");
        parse_err(escaped_char, "\\x2G");
    }

    #[test]
    fn test_escaped_char_only_ascii() {
        parse(
            escaped_char_only_ascii,
            "\\na",
            "a",
            lit(b'\n', 0..2, false),
        );
        parse(
            escaped_char_only_ascii,
            "\\ta",
            "a",
            lit(b'\t', 0..2, false),
        );
        parse(
            escaped_char_only_ascii,
            "\\ra",
            "a",
            lit(b'\r', 0..2, false),
        );
        parse(
            escaped_char_only_ascii,
            "\\fa",
            "a",
            lit(b'\x0C', 0..2, false),
        );
        parse(
            escaped_char_only_ascii,
            "\\aa",
            "a",
            lit(b'\x07', 0..2, false),
        );
        parse(
            escaped_char_only_ascii,
            "\\x00a",
            "a",
            lit(b'\0', 0..4, false),
        );
        parse(
            escaped_char_only_ascii,
            "\\xAF a",
            " a",
            lit(b'\xAF', 0..4, false),
        );
        parse(escaped_char_only_ascii, "\\k", "", lit(b'k', 0..2, true));

        parse_err(escaped_char_only_ascii, "\\");
        parse_err(escaped_char_only_ascii, "\\é");
        parse_err(escaped_char_only_ascii, "\\x");
        parse_err(escaped_char_only_ascii, "\\x2");
        parse_err(escaped_char_only_ascii, "\\x2G");
    }

    #[test]
    fn test_range_repetition() {
        parse(
            range_repetition,
            "{0} ?a",
            " ?a",
            (RepetitionRange::Exactly(0), true),
        );
        parse(
            range_repetition,
            "{5}?a",
            "a",
            (RepetitionRange::Exactly(5), false),
        );

        parse(
            range_repetition,
            "{5,15} a",
            " a",
            (RepetitionRange::Bounded(5, 15), true),
        );
        parse(
            range_repetition,
            "{5,}?a",
            "a",
            (RepetitionRange::AtLeast(5), false),
        );

        parse_err(range_repetition, "{}?");
    }

    #[test]
    fn test_range_single() {
        parse(range_single, "{0}a", "a", RepetitionRange::Exactly(0));
        parse(range_single, "{350} a", " a", RepetitionRange::Exactly(350));

        parse_err(range_single, "{");
        parse_err(range_single, "{}");
        parse_err(range_single, "{-5}");
    }

    #[test]
    fn test_range_multi() {
        parse(range_multi, "{,5}a", "a", RepetitionRange::Bounded(0, 5));
        parse(range_multi, "{5,}a", "a", RepetitionRange::AtLeast(5));
        parse(range_multi, "{5,10}a", "a", RepetitionRange::Bounded(5, 10));
        parse(range_multi, "{0,0} a", " a", RepetitionRange::Bounded(0, 0));
        parse(range_multi, "{,}", "", RepetitionRange::AtLeast(0));
        // Whitespace is accepted around the comma...
        parse(range_multi, "{, }", "", RepetitionRange::AtLeast(0));
        parse(range_multi, "{ ,}", "", RepetitionRange::AtLeast(0));
        parse(range_multi, "{ , }", "", RepetitionRange::AtLeast(0));
        parse(range_multi, "{2 , }", "", RepetitionRange::AtLeast(2));
        parse(range_multi, "{ , 2}", "", RepetitionRange::Bounded(0, 2));
        parse(range_multi, "{1 , 2}", "", RepetitionRange::Bounded(1, 2));

        parse_err(range_multi, "{");
        parse_err(range_multi, "{,5");
        parse_err(range_multi, "{,-5}");
        parse_err(range_multi, "{-5,}");
        parse_err(range_multi, "{10,5}");
        // ... but not between a brace and a number.
        parse_err(range_multi, "{ 1,5}");
        parse_err(range_multi, "{1,5 }");
    }

    #[test]
    fn test_parse_u32() {
        parse(parse_u32, "5a", "a", 5_u32);

        parse_err(parse_u32, "a");
        parse_err(parse_u32, "-5a");
        parse_err(parse_u32, "5000000000000");
    }

    #[test]
    fn test_parse_opt_u32() {
        parse(parse_opt_u32, "a", "a", None);
        parse(parse_opt_u32, "5a", "a", Some(5));
        parse(parse_opt_u32, "-5a", "-5a", None);

        parse_err(parse_opt_u32, "5000000000000");
    }

    #[test]
    fn test_stack_overflow() {
        // Recursion limit used for this test, passed explicitly to the
        // parser through `Params`.
        const STRING_RECURSION_TEST_LIMIT: u8 = 10;

        // 1000 nested groups: far deeper than the limit.
        let mut v = String::new();
        v.push('/');
        for _ in 0..1_000 {
            v.push_str("a(b");
        }
        for _ in 0..1_000 {
            v.push_str(")c");
        }
        v.push('/');

        // Each nesting level takes 3 bytes after the leading `/`, so the
        // error is expected at offset 3 * limit.
        let input = Input::with_params(
            &v,
            Params::default().string_recursion_limit(STRING_RECURSION_TEST_LIMIT),
        );
        let res = regex(input).finish();
        assert_eq!(
            &res.unwrap_err(),
            &Error::new(30..30, ErrorKind::RegexTooDeep)
        );

        // The position of the error follows the configured limit.
        let input = Input::with_params(
            &v,
            Params::default().string_recursion_limit(STRING_RECURSION_TEST_LIMIT + 2),
        );
        let res = regex(input).finish();
        assert_eq!(
            &res.unwrap_err(),
            &Error::new(36..36, ErrorKind::RegexTooDeep)
        );

        // Two consecutive runs of nested groups, each one below the limit:
        // the depth reached by the first run must not count against the
        // second one.
        let mut v = String::new();
        v.push('/');
        let nb = STRING_RECURSION_TEST_LIMIT - 1;
        for _ in 0..nb {
            v.push_str("a(b");
        }
        for _ in 0..nb {
            v.push_str(")c");
        }
        v.push('d');
        for _ in 0..nb {
            v.push_str("e(f");
        }
        for _ in 0..nb {
            v.push_str(")h");
        }
        v.push('/');

        let input = Input::with_params(
            &v,
            Params::default().string_recursion_limit(STRING_RECURSION_TEST_LIMIT),
        );
        let _res = regex(input).unwrap();
        // NOTE(review): `input` is passed by value to `regex`, so this checks
        // the counter of the value kept here, not of the input returned by
        // the parser -- confirm this is the intent.
        assert_eq!(input.string_recursion_counter, 0);
    }

    #[test]
    fn test_parse_regex() {
        assert!(parse_regex(r"/a{2}/").is_ok());
        assert!(parse_regex(r"a{2}/").is_err());
    }

    #[test]
    fn test_public_types() {
        test_public_type(regex(Input::new(r"/a{2}[az]\b\s|.+$/")).unwrap());
    }
}