1use std::ops::Range;
3
4use nom::branch::alt;
5use nom::bytes::complete::{tag, take};
6use nom::character::complete::{anychar, char, digit0, digit1, multispace0, none_of};
7use nom::combinator::{cut, map, opt};
8use nom::multi::many0;
9use nom::sequence::{delimited, separated_pair, terminated};
10use nom::Parser;
11
12use crate::error::ErrorKind;
13
14use super::error::Error;
15use super::nom_recipes::rtrim;
16use super::types::{Input, ParseResult};
17
18const MAX_REGEX_RECURSION: usize = 10;
19
20#[derive(Clone, Debug, PartialEq, Eq)]
22pub struct Regex {
23 pub ast: Node,
25 pub case_insensitive: bool,
27 pub dot_all: bool,
29
30 pub span: Range<usize>,
32}
33
34#[derive(Clone, Debug, PartialEq, Eq)]
36pub enum Node {
37 Alternation(Vec<Node>),
39
40 Assertion(AssertionKind),
42
43 Class(ClassKind),
45
46 Concat(Vec<Node>),
48
49 Dot,
51
52 Empty,
54
55 Literal(Literal),
57
58 Char(LiteralChar),
60
61 Group(Box<Node>),
63
64 Repetition {
66 node: Box<Node>,
68
69 kind: RepetitionKind,
71
72 greedy: bool,
74 },
75}
76
77#[derive(Clone, Debug, PartialEq, Eq)]
79pub enum ClassKind {
80 Perl(PerlClass),
82 Bracketed(BracketedClass),
84}
85
86#[derive(Clone, Debug, PartialEq, Eq)]
88pub struct PerlClass {
89 pub kind: PerlClassKind,
91 pub negated: bool,
93}
94
95#[derive(Clone, Debug, PartialEq, Eq)]
97pub enum PerlClassKind {
98 Word,
100 Space,
102 Digit,
104}
105
106#[derive(Clone, Debug, PartialEq, Eq)]
108pub struct BracketedClass {
109 pub items: Vec<BracketedClassItem>,
111 pub negated: bool,
113}
114
115#[derive(Clone, Debug, PartialEq, Eq)]
117pub enum BracketedClassItem {
118 Perl(PerlClass),
120 Literal(Literal),
122 Range(Literal, Literal),
124}
125
126#[derive(Clone, Debug, PartialEq, Eq)]
128pub enum RepetitionKind {
129 ZeroOrOne,
131 ZeroOrMore,
133 OneOrMore,
135 Range(RepetitionRange),
137}
138
139#[derive(Clone, Debug, PartialEq, Eq)]
141pub enum RepetitionRange {
142 Exactly(u32),
144 AtLeast(u32),
146 Bounded(u32, u32),
148}
149
150#[derive(Clone, Debug, PartialEq, Eq)]
152pub enum AssertionKind {
153 StartLine,
155 EndLine,
157 WordBoundary,
159 NonWordBoundary,
161}
162
163#[derive(Clone, Debug, PartialEq, Eq)]
165pub struct LiteralChar {
166 pub c: char,
168
169 pub span: Range<usize>,
171
172 pub escaped: bool,
176}
177
178#[derive(Clone, Debug, PartialEq, Eq)]
180pub struct Literal {
181 pub byte: u8,
183
184 pub span: Range<usize>,
186
187 pub escaped: bool,
191}
192
193impl PartialOrd for Literal {
194 fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
195 self.byte.partial_cmp(&other.byte)
196 }
197}
198
199pub fn parse_regex(input: &str) -> Result<Regex, Error> {
207 use nom::Finish;
208
209 let input = Input::new(input);
210 let (_, res) = regex(input).finish()?;
211
212 Ok(res)
213}
214
215pub(crate) fn regex(input: Input) -> ParseResult<Regex> {
224 let start = input.pos();
225 let (input, _) = char('/').parse(input)?;
226
227 let (input, ast) = cut(terminated(alternative, char('/'))).parse(input)?;
230 let (input, (no_case, dot_all)) = rtrim((opt(char('i')), opt(char('s')))).parse(input)?;
231
232 Ok((
233 input,
234 Regex {
235 ast,
236 case_insensitive: no_case.is_some(),
237 dot_all: dot_all.is_some(),
238 span: input.get_span_from(start),
239 },
240 ))
241}
242
243fn alternative(mut input: Input) -> ParseResult<Node> {
244 if input.inner_recursion_counter >= MAX_REGEX_RECURSION {
250 return Err(nom::Err::Failure(Error::new(
251 input.get_span_from(input.pos()),
252 ErrorKind::RegexTooDeep,
253 )));
254 }
255
256 let mut alts = Vec::new();
257 loop {
258 input.inner_recursion_counter += 1;
259 let (mut input2, node) = concatenation(input)?;
260 input2.inner_recursion_counter -= 1;
261
262 let (input2, has_alt_char) = eat_opt_char('|', input2);
263 if has_alt_char {
264 alts.push(node);
265 input = input2;
266 continue;
267 }
268
269 return Ok((
270 input2,
271 if alts.is_empty() {
272 node
273 } else {
274 alts.push(node);
275 Node::Alternation(alts)
276 },
277 ));
278 }
279}
280
281fn eat_opt_char(c: char, mut input: Input) -> (Input, bool) {
282 match input.cursor().chars().next() {
283 Some(c2) if c2 == c => {
284 input.advance(c.len_utf8());
285 (input, true)
286 }
287 _ => (input, false),
288 }
289}
290
291fn concatenation(input: Input) -> ParseResult<Node> {
292 let (input, mut nodes) = many0(repeat).parse(input)?;
293
294 let node = if nodes.is_empty() {
295 Node::Empty
296 } else if nodes.len() == 1 {
297 nodes.pop().unwrap()
298 } else {
299 Node::Concat(nodes)
300 };
301
302 Ok((input, node))
303}
304
305fn repeat(input: Input) -> ParseResult<Node> {
306 if let Ok((input, node)) = assertion(input) {
308 return Ok((input, node));
309 }
310
311 let (input, node) = single(input)?;
313 let (input, repetition) = opt(repetition).parse(input)?;
314 match repetition {
315 Some((kind, greedy)) => Ok((
316 input,
317 Node::Repetition {
318 node: Box::new(node),
319 kind,
320 greedy,
321 },
322 )),
323 None => Ok((input, node)),
324 }
325}
326
327fn assertion(input: Input) -> ParseResult<Node> {
329 alt((
330 map(tag(r"\b"), |_| Node::Assertion(AssertionKind::WordBoundary)),
331 map(tag(r"\B"), |_| {
332 Node::Assertion(AssertionKind::NonWordBoundary)
333 }),
334 map(char('^'), |_| Node::Assertion(AssertionKind::StartLine)),
335 map(char('$'), |_| Node::Assertion(AssertionKind::EndLine)),
336 ))
337 .parse(input)
338}
339
340fn repetition(input: Input) -> ParseResult<(RepetitionKind, bool)> {
341 alt((
342 map(tag("*?"), |_| (RepetitionKind::ZeroOrMore, false)),
343 map(tag("+?"), |_| (RepetitionKind::OneOrMore, false)),
344 map(tag("??"), |_| (RepetitionKind::ZeroOrOne, false)),
345 map(tag("*"), |_| (RepetitionKind::ZeroOrMore, true)),
346 map(tag("+"), |_| (RepetitionKind::OneOrMore, true)),
347 map(tag("?"), |_| (RepetitionKind::ZeroOrOne, true)),
348 map(range_repetition, |(kind, greedy)| {
349 (RepetitionKind::Range(kind), greedy)
350 }),
351 ))
352 .parse(input)
353}
354
355fn single(input: Input) -> ParseResult<Node> {
356 alt((
357 map(delimited(char('('), alternative, char(')')), |node| {
358 Node::Group(Box::new(node))
359 }),
360 map(char('.'), |_| Node::Dot),
361 map(perl_class, |p| Node::Class(ClassKind::Perl(p))),
362 map(bracketed_class, |p| Node::Class(ClassKind::Bracketed(p))),
363 escaped_char,
364 literal,
365 ))
366 .parse(input)
367}
368
369fn perl_class(input: Input) -> ParseResult<PerlClass> {
370 alt((
371 map(tag(r"\w"), |_| PerlClass {
372 kind: PerlClassKind::Word,
373 negated: false,
374 }),
375 map(tag(r"\W"), |_| PerlClass {
376 kind: PerlClassKind::Word,
377 negated: true,
378 }),
379 map(tag(r"\s"), |_| PerlClass {
380 kind: PerlClassKind::Space,
381 negated: false,
382 }),
383 map(tag(r"\S"), |_| PerlClass {
384 kind: PerlClassKind::Space,
385 negated: true,
386 }),
387 map(tag(r"\d"), |_| PerlClass {
388 kind: PerlClassKind::Digit,
389 negated: false,
390 }),
391 map(tag(r"\D"), |_| PerlClass {
392 kind: PerlClassKind::Digit,
393 negated: true,
394 }),
395 ))
396 .parse(input)
397}
398
399fn bracketed_class(input: Input) -> ParseResult<BracketedClass> {
400 let (input, _) = char('[').parse(input)?;
401 cut(bracketed_class_inner).parse(input)
403}
404
405fn bracketed_class_inner(input: Input) -> ParseResult<BracketedClass> {
406 let (input, negated) = eat_opt_char('^', input);
407 let start = input.pos();
408 let (input2, contains_closing_bracket) = eat_opt_char(']', input);
409
410 let (input, mut items) = many0(bracketed_class_item).parse(input2)?;
411 let (input, _) = char(']').parse(input)?;
412
413 if contains_closing_bracket {
414 items.push(BracketedClassItem::Literal(Literal {
415 byte: b']',
416 span: input2.get_span_from_no_rtrim(start),
417 escaped: false,
418 }));
419 }
420 Ok((input, BracketedClass { items, negated }))
421}
422
423fn bracketed_class_item(input: Input) -> ParseResult<BracketedClassItem> {
424 alt((
425 map(perl_class, BracketedClassItem::Perl),
426 bracketed_class_range_or_literal,
427 ))
428 .parse(input)
429}
430
431fn bracketed_class_range_or_literal(input: Input) -> ParseResult<BracketedClassItem> {
432 let start = input.pos();
433 let (input, lit) = bracketed_class_literal(input)?;
434 let (input2, has_dash) = eat_opt_char('-', input);
435
436 if has_dash {
437 let (input3, lit2) = opt(bracketed_class_literal).parse(input2)?;
438 match lit2 {
439 Some(lit2) if lit2 < lit => Err(nom::Err::Failure(Error::new(
440 input3.get_span_from_no_rtrim(start),
441 ErrorKind::RegexClassRangeInvalid,
442 ))),
443 Some(lit2) => Ok((input3, BracketedClassItem::Range(lit, lit2))),
444 None => Ok((input, BracketedClassItem::Literal(lit))),
445 }
446 } else {
447 Ok((input, BracketedClassItem::Literal(lit)))
448 }
449}
450
451fn bracketed_class_literal(input: Input) -> ParseResult<Literal> {
452 alt((escaped_char_only_ascii, bracketed_class_char)).parse(input)
453}
454
455fn bracketed_class_char(input: Input) -> ParseResult<Literal> {
456 let start = input.pos();
457
458 let (input, b) = none_of("/\n]").parse(input)?;
462 if b.is_ascii() {
463 Ok((
464 input,
465 Literal {
466 byte: b as u8,
467 span: input.get_span_from_no_rtrim(start),
468 escaped: false,
469 },
470 ))
471 } else {
472 Err(nom::Err::Failure(Error::new(
473 input.get_span_from_no_rtrim(start),
474 ErrorKind::RegexNonAsciiByte,
475 )))
476 }
477}
478
479fn literal(input: Input) -> ParseResult<Node> {
480 let start = input.pos();
481
482 let (input, c) = none_of("/\n()[\\|.$^+*?").parse(input)?;
486 let node = if c.is_ascii() {
487 Node::Literal(Literal {
488 byte: c as u8,
489 span: input.get_span_from_no_rtrim(start),
490 escaped: false,
491 })
492 } else {
493 Node::Char(LiteralChar {
494 c,
495 span: input.get_span_from_no_rtrim(start),
496 escaped: false,
497 })
498 };
499
500 Ok((input, node))
501}
502
503fn escaped_char(input: Input) -> ParseResult<Node> {
504 let (input, res) = escaped_char_inner(input)?;
505
506 let node = match res {
507 Escaped {
508 kind: EscapedKind::Byte(byte),
509 span,
510 escaped,
511 } => Node::Literal(Literal {
512 byte,
513 span,
514 escaped,
515 }),
516 Escaped {
517 kind: EscapedKind::Char(c),
518 span,
519 escaped,
520 } => Node::Char(LiteralChar { c, span, escaped }),
521 };
522
523 Ok((input, node))
524}
525
526fn escaped_char_only_ascii(input: Input) -> ParseResult<Literal> {
527 let (input, res) = escaped_char_inner(input)?;
528
529 match res {
530 Escaped {
531 kind: EscapedKind::Byte(byte),
532 span,
533 escaped,
534 } => Ok((
535 input,
536 Literal {
537 byte,
538 span,
539 escaped,
540 },
541 )),
542 Escaped {
543 kind: EscapedKind::Char(_),
544 span,
545 ..
546 } => Err(nom::Err::Failure(Error::new(
547 span,
548 ErrorKind::RegexNonAsciiByte,
549 ))),
550 }
551}
552
553fn escaped_char_inner(input: Input) -> ParseResult<Escaped> {
554 let start = input.pos();
555 let (input2, _) = char('\\').parse(input)?;
556 let (input, b) = anychar(input2)?;
557
558 let span = input.get_span_from_no_rtrim(start);
559 let (kind, escaped) = match b {
560 'n' => (EscapedKind::Byte(b'\n'), false),
561 't' => (EscapedKind::Byte(b'\t'), false),
562 'r' => (EscapedKind::Byte(b'\r'), false),
563 'f' => (EscapedKind::Byte(b'\x0C'), false),
564 'a' => (EscapedKind::Byte(b'\x07'), false),
565 'x' => {
566 let (input, n) = cut(take(2_u32)).parse(input)?;
567
568 let n = match u8::from_str_radix(&n, 16) {
569 Ok(n) => n,
570 Err(e) => {
571 return Err(nom::Err::Failure(Error::new(
572 input.get_span_from_no_rtrim(start),
573 ErrorKind::StrToHexIntError(e),
574 )));
575 }
576 };
577 return Ok((
578 input,
579 Escaped {
580 kind: EscapedKind::Byte(n),
581 span: input.get_span_from_no_rtrim(start),
582 escaped: false,
583 },
584 ));
585 }
586 c if c.is_ascii() => (EscapedKind::Byte(c as u8), true),
587 c => (EscapedKind::Char(c), true),
588 };
589
590 Ok((
591 input,
592 Escaped {
593 kind,
594 span,
595 escaped,
596 },
597 ))
598}
599
600struct Escaped {
601 kind: EscapedKind,
602 span: Range<usize>,
603 escaped: bool,
604}
605
606#[allow(variant_size_differences)]
607enum EscapedKind {
608 Byte(u8),
609 Char(char),
610}
611
612fn range_repetition(input: Input) -> ParseResult<(RepetitionRange, bool)> {
613 let (input, range) = alt((range_single, range_multi)).parse(input)?;
614 let (input, non_greedy) = eat_opt_char('?', input);
615
616 Ok((input, (range, !non_greedy)))
617}
618
619fn range_single(input: Input) -> ParseResult<RepetitionRange> {
620 let (input, v) = delimited(char('{'), parse_u32, char('}')).parse(input)?;
621
622 Ok((input, RepetitionRange::Exactly(v)))
623}
624
625fn range_multi(input: Input) -> ParseResult<RepetitionRange> {
626 let start = input.pos();
627 let (input, (from, to)) = delimited(
628 char('{'),
629 separated_pair(
630 parse_opt_u32,
631 delimited(multispace0, char(','), multispace0),
632 parse_opt_u32,
633 ),
634 char('}'),
635 )
636 .parse(input)?;
637
638 let range = match (from, to) {
639 (None, None) => RepetitionRange::AtLeast(0),
640 (Some(from), None) => RepetitionRange::AtLeast(from),
641 (None, Some(to)) => RepetitionRange::Bounded(0, to),
642 (Some(from), Some(to)) if to < from => {
643 return Err(nom::Err::Failure(Error::new(
644 input.get_span_from_no_rtrim(start),
645 ErrorKind::RegexRangeInvalid,
646 )))
647 }
648 (Some(from), Some(to)) => RepetitionRange::Bounded(from, to),
649 };
650
651 Ok((input, range))
652}
653
654fn parse_u32(input: Input) -> ParseResult<u32> {
655 let start = input.pos();
656 let (input, v) = digit1(input)?;
657
658 let n = match str::parse::<u32>(&v) {
659 Ok(n) => n,
660 Err(e) => {
661 return Err(nom::Err::Failure(Error::new(
662 input.get_span_from_no_rtrim(start),
663 ErrorKind::StrToIntError(e),
664 )))
665 }
666 };
667
668 Ok((input, n))
669}
670
671fn parse_opt_u32(input: Input) -> ParseResult<Option<u32>> {
672 let start = input.pos();
673 let (input, v) = match digit0::<_, Error>(input) {
674 Ok((input, s)) if !s.is_empty() => (input, s),
675 _ => return Ok((input, None)),
676 };
677
678 let n = match str::parse::<u32>(&v) {
679 Ok(n) => n,
680 Err(e) => {
681 return Err(nom::Err::Failure(Error::new(
682 input.get_span_from_no_rtrim(start),
683 ErrorKind::StrToIntError(e),
684 )))
685 }
686 };
687
688 Ok((input, Some(n)))
689}
690
691#[cfg(test)]
692mod tests {
693 use super::*;
694 use crate::test_helpers::{parse, parse_err, parse_err_type, test_public_type};
695
696 fn lit(byte: u8, span: Range<usize>, escaped: bool) -> Literal {
697 Literal {
698 byte,
699 span,
700 escaped,
701 }
702 }
703
704 #[test]
705 fn test_parse() {
706 parse(
707 regex,
708 "/a/i",
709 "",
710 Regex {
711 ast: Node::Literal(lit(b'a', 1..2, false)),
712 case_insensitive: true,
713 dot_all: false,
714 span: 0..4,
715 },
716 );
717 parse(
718 regex,
719 "/[^0-9]+/a",
720 "a",
721 Regex {
722 ast: Node::Repetition {
723 node: Box::new(Node::Class(ClassKind::Bracketed(BracketedClass {
724 items: vec![BracketedClassItem::Range(
725 lit(b'0', 3..4, false),
726 lit(b'9', 5..6, false),
727 )],
728 negated: true,
729 }))),
730 kind: RepetitionKind::OneOrMore,
731 greedy: true,
732 },
733 case_insensitive: false,
734 dot_all: false,
735 span: 0..9,
736 },
737 );
738 parse(
739 regex,
740 r"/a\/b\cd/isb",
741 "b",
742 Regex {
743 ast: Node::Concat(vec![
744 Node::Literal(lit(b'a', 1..2, false)),
745 Node::Literal(lit(b'/', 2..4, true)),
746 Node::Literal(lit(b'b', 4..5, false)),
747 Node::Literal(lit(b'c', 5..7, true)),
748 Node::Literal(lit(b'd', 7..8, false)),
749 ]),
750 case_insensitive: true,
751 dot_all: true,
752 span: 0..11,
753 },
754 );
755 parse(
756 regex,
757 r"/.{2}/si c",
758 "i c",
759 Regex {
760 ast: Node::Repetition {
761 node: Box::new(Node::Dot),
762 kind: RepetitionKind::Range(RepetitionRange::Exactly(2)),
763 greedy: true,
764 },
765 case_insensitive: false,
766 dot_all: true,
767 span: 0..7,
768 },
769 );
770 parse(
771 regex,
772 "/\0\\\0/ c",
773 "c",
774 Regex {
775 ast: Node::Concat(vec![
776 Node::Literal(lit(b'\0', 1..2, false)),
777 Node::Literal(lit(b'\0', 2..4, true)),
778 ]),
779 case_insensitive: false,
780 dot_all: false,
781 span: 0..5,
782 },
783 );
784
785 parse_err(regex, "");
786 parse_err(regex, "/");
787 parse_err(regex, "/\n/");
788 parse_err(regex, "/a{2}");
789 parse_err(regex, "/a///");
790 parse_err(regex, "/a{5,4}/");
791 }
792
793 #[test]
794 fn test_alternative() {
795 parse(alternative, "(", "(", Node::Empty);
796 parse(
797 alternative,
798 "a)",
799 ")",
800 Node::Literal(lit(b'a', 0..1, false)),
801 );
802 parse(
803 alternative,
804 "a|b",
805 "",
806 Node::Alternation(vec![
807 Node::Literal(lit(b'a', 0..1, false)),
808 Node::Literal(lit(b'b', 2..3, false)),
809 ]),
810 );
811 parse(
812 alternative,
813 "a|)",
814 ")",
815 Node::Alternation(vec![Node::Literal(lit(b'a', 0..1, false)), Node::Empty]),
816 );
817
818 parse(
819 alternative,
820 r"ab|.\||\b$|",
821 "",
822 Node::Alternation(vec![
823 Node::Concat(vec![
824 Node::Literal(lit(b'a', 0..1, false)),
825 Node::Literal(lit(b'b', 1..2, false)),
826 ]),
827 Node::Concat(vec![Node::Dot, Node::Literal(lit(b'|', 4..6, true))]),
828 Node::Concat(vec![
829 Node::Assertion(AssertionKind::WordBoundary),
830 Node::Assertion(AssertionKind::EndLine),
831 ]),
832 Node::Empty,
833 ]),
834 );
835
836 parse_err(alternative, "\\xEG");
837 }
838
839 #[test]
840 fn test_concatenation() {
841 parse(concatenation, "", "", Node::Empty);
842 parse(
843 concatenation,
844 "a",
845 "",
846 Node::Literal(lit(b'a', 0..1, false)),
847 );
848 parse(
849 concatenation,
850 "ab",
851 "",
852 Node::Concat(vec![
853 Node::Literal(lit(b'a', 0..1, false)),
854 Node::Literal(lit(b'b', 1..2, false)),
855 ]),
856 );
857 parse(
858 concatenation,
859 "a$*",
860 "*",
861 Node::Concat(vec![
862 Node::Literal(lit(b'a', 0..1, false)),
863 Node::Assertion(AssertionKind::EndLine),
864 ]),
865 );
866 parse(
867 concatenation,
868 r"^a+\b\d{2,3}[^z]*?)",
869 ")",
870 Node::Concat(vec![
871 Node::Assertion(AssertionKind::StartLine),
872 Node::Repetition {
873 node: Box::new(Node::Literal(lit(b'a', 1..2, false))),
874 kind: RepetitionKind::OneOrMore,
875 greedy: true,
876 },
877 Node::Assertion(AssertionKind::WordBoundary),
878 Node::Repetition {
879 node: Box::new(Node::Class(ClassKind::Perl(PerlClass {
880 kind: PerlClassKind::Digit,
881 negated: false,
882 }))),
883 kind: RepetitionKind::Range(RepetitionRange::Bounded(2, 3)),
884 greedy: true,
885 },
886 Node::Repetition {
887 node: Box::new(Node::Class(ClassKind::Bracketed(BracketedClass {
888 items: vec![BracketedClassItem::Literal(lit(b'z', 14..15, false))],
889 negated: true,
890 }))),
891 kind: RepetitionKind::ZeroOrMore,
892 greedy: false,
893 },
894 ]),
895 );
896
897 parse_err(concatenation, "\\xEG");
898 }
899
900 #[test]
901 fn test_assertion() {
902 parse(
903 assertion,
904 r"\ba",
905 "a",
906 Node::Assertion(AssertionKind::WordBoundary),
907 );
908 parse(
909 assertion,
910 r"\B ",
911 " ",
912 Node::Assertion(AssertionKind::NonWordBoundary),
913 );
914 parse(
915 assertion,
916 r"^^",
917 "^",
918 Node::Assertion(AssertionKind::StartLine),
919 );
920 parse(
921 assertion,
922 r"$^",
923 "^",
924 Node::Assertion(AssertionKind::EndLine),
925 );
926
927 parse_err(assertion, r"\w");
928 }
929
930 #[test]
931 fn test_repetition() {
932 parse(repetition, "*??", "?", (RepetitionKind::ZeroOrMore, false));
933 parse(repetition, "+??", "?", (RepetitionKind::OneOrMore, false));
934 parse(repetition, "???", "?", (RepetitionKind::ZeroOrOne, false));
935 parse(repetition, "*a?", "a?", (RepetitionKind::ZeroOrMore, true));
936 parse(repetition, "+a?", "a?", (RepetitionKind::OneOrMore, true));
937 parse(repetition, "?a?", "a?", (RepetitionKind::ZeroOrOne, true));
938 parse(
939 repetition,
940 "{5}??",
941 "?",
942 (RepetitionKind::Range(RepetitionRange::Exactly(5)), false),
943 );
944
945 parse_err(repetition, "5");
946 }
947
948 #[test]
949 fn test_single() {
950 parse(single, ".a", "a", Node::Dot);
951 parse(single, "()a", "a", Node::Group(Box::new(Node::Empty)));
952 parse(
953 single,
954 "(ab)a",
955 "a",
956 Node::Group(Box::new(Node::Concat(vec![
957 Node::Literal(lit(b'a', 1..2, false)),
958 Node::Literal(lit(b'b', 2..3, false)),
959 ]))),
960 );
961 parse(
962 single,
963 r"\s",
964 "",
965 Node::Class(ClassKind::Perl(PerlClass {
966 kind: PerlClassKind::Space,
967 negated: false,
968 })),
969 );
970 parse(
971 single,
972 r"[a-fA-F] ",
973 " ",
974 Node::Class(ClassKind::Bracketed(BracketedClass {
975 items: vec![
976 BracketedClassItem::Range(lit(b'a', 1..2, false), lit(b'f', 3..4, false)),
977 BracketedClassItem::Range(lit(b'A', 4..5, false), lit(b'F', 6..7, false)),
978 ],
979 negated: false,
980 })),
981 );
982 parse(
983 single,
984 r"\xFFa",
985 "a",
986 Node::Literal(lit(b'\xFF', 0..4, false)),
987 );
988 parse(single, r"]a", "a", Node::Literal(lit(b']', 0..1, false)));
989
990 parse_err(single, "");
991 parse_err(single, "(");
992 parse_err(single, ")");
993 parse_err(single, "[");
994 parse_err(single, "|");
995 parse_err(single, "$");
996 parse_err(single, "^");
997 parse_err(single, "+");
998 parse_err(single, "*");
999 parse_err(single, "?");
1000 parse_err(single, "(a");
1001 }
1002
1003 #[test]
1004 fn test_perl_class() {
1005 parse(
1006 perl_class,
1007 r"\w ",
1008 " ",
1009 PerlClass {
1010 kind: PerlClassKind::Word,
1011 negated: false,
1012 },
1013 );
1014 parse(
1015 perl_class,
1016 r"\Wa",
1017 "a",
1018 PerlClass {
1019 kind: PerlClassKind::Word,
1020 negated: true,
1021 },
1022 );
1023 parse(
1024 perl_class,
1025 r"\s",
1026 "",
1027 PerlClass {
1028 kind: PerlClassKind::Space,
1029 negated: false,
1030 },
1031 );
1032 parse(
1033 perl_class,
1034 r"\S\",
1035 "\\",
1036 PerlClass {
1037 kind: PerlClassKind::Space,
1038 negated: true,
1039 },
1040 );
1041 parse(
1042 perl_class,
1043 r"\d",
1044 "",
1045 PerlClass {
1046 kind: PerlClassKind::Digit,
1047 negated: false,
1048 },
1049 );
1050 parse(
1051 perl_class,
1052 r"\Da",
1053 "a",
1054 PerlClass {
1055 kind: PerlClassKind::Digit,
1056 negated: true,
1057 },
1058 );
1059
1060 parse_err(perl_class, "");
1061 parse_err(perl_class, "\\");
1062 parse_err(perl_class, "\\k");
1063 }
1064
1065 #[test]
1066 fn test_bracketed_class() {
1067 parse(
1068 bracketed_class,
1069 "[a]b",
1070 "b",
1071 BracketedClass {
1072 items: vec![BracketedClassItem::Literal(lit(b'a', 1..2, false))],
1073 negated: false,
1074 },
1075 );
1076 parse(
1077 bracketed_class,
1078 "[^a-z_\\S0-9]",
1079 "",
1080 BracketedClass {
1081 items: vec![
1082 BracketedClassItem::Range(lit(b'a', 2..3, false), lit(b'z', 4..5, false)),
1083 BracketedClassItem::Literal(lit(b'_', 5..6, false)),
1084 BracketedClassItem::Perl(PerlClass {
1085 kind: PerlClassKind::Space,
1086 negated: true,
1087 }),
1088 BracketedClassItem::Range(lit(b'0', 8..9, false), lit(b'9', 10..11, false)),
1089 ],
1090 negated: true,
1091 },
1092 );
1093 parse(
1094 bracketed_class,
1095 "[]\\j]",
1096 "",
1097 BracketedClass {
1098 items: vec![
1099 BracketedClassItem::Literal(lit(b'j', 2..4, true)),
1100 BracketedClassItem::Literal(lit(b']', 1..2, false)),
1101 ],
1102 negated: false,
1103 },
1104 );
1105 parse(
1106 bracketed_class,
1107 "[]]",
1108 "",
1109 BracketedClass {
1110 items: vec![BracketedClassItem::Literal(lit(b']', 1..2, false))],
1111 negated: false,
1112 },
1113 );
1114 parse(
1115 bracketed_class,
1116 "[^]]",
1117 "",
1118 BracketedClass {
1119 items: vec![BracketedClassItem::Literal(lit(b']', 2..3, false))],
1120 negated: true,
1121 },
1122 );
1123 parse(
1124 bracketed_class,
1125 "[^a\\]b-]",
1126 "",
1127 BracketedClass {
1128 items: vec![
1129 BracketedClassItem::Literal(lit(b'a', 2..3, false)),
1130 BracketedClassItem::Literal(lit(b']', 3..5, true)),
1131 BracketedClassItem::Literal(lit(b'b', 5..6, false)),
1132 BracketedClassItem::Literal(lit(b'-', 6..7, false)),
1133 ],
1134 negated: true,
1135 },
1136 );
1137
1138 parse_err(bracketed_class, "[");
1139 parse_err(bracketed_class, "[]");
1140 parse_err(bracketed_class, "[^]");
1141 parse_err(bracketed_class, "[é]");
1142 parse_err(bracketed_class, "[\\]");
1143 parse_err(bracketed_class, "[\\x]");
1144 parse_err(bracketed_class, "[\\x0]");
1145 parse_err(bracketed_class, "[\\é]");
1146 }
1147
1148 #[test]
1149 fn test_bracketed_class_item() {
1150 parse(
1151 bracketed_class_item,
1152 "\\sw",
1153 "w",
1154 BracketedClassItem::Perl(PerlClass {
1155 kind: PerlClassKind::Space,
1156 negated: false,
1157 }),
1158 );
1159 parse(
1160 bracketed_class_item,
1161 "\\c-z]",
1162 "]",
1163 BracketedClassItem::Range(lit(b'c', 0..2, true), lit(b'z', 3..4, false)),
1164 );
1165
1166 parse_err(bracketed_class_item, "é");
1167 }
1168
1169 #[test]
1170 fn test_bracketed_class_range_or_literal() {
1171 parse(
1172 bracketed_class_range_or_literal,
1173 "ab",
1174 "b",
1175 BracketedClassItem::Literal(lit(b'a', 0..1, false)),
1176 );
1177 parse(
1178 bracketed_class_range_or_literal,
1179 r"\x01-",
1180 "-",
1181 BracketedClassItem::Literal(lit(b'\x01', 0..4, false)),
1182 );
1183 parse(
1184 bracketed_class_range_or_literal,
1185 "-\\]",
1186 "\\]",
1187 BracketedClassItem::Literal(lit(b'-', 0..1, false)),
1188 );
1189 parse(
1190 bracketed_class_range_or_literal,
1191 "A-]",
1192 "-]",
1193 BracketedClassItem::Literal(lit(b'A', 0..1, false)),
1194 );
1195
1196 parse(
1197 bracketed_class_range_or_literal,
1198 "a-\\sb",
1199 "b",
1200 BracketedClassItem::Range(lit(b'a', 0..1, false), lit(b's', 2..4, true)),
1201 );
1202 parse(
1203 bracketed_class_range_or_literal,
1204 "!--",
1205 "",
1206 BracketedClassItem::Range(lit(b'!', 0..1, false), lit(b'-', 2..3, false)),
1207 );
1208 parse(
1209 bracketed_class_range_or_literal,
1210 "---",
1211 "",
1212 BracketedClassItem::Range(lit(b'-', 0..1, false), lit(b'-', 2..3, false)),
1213 );
1214 parse(
1215 bracketed_class_range_or_literal,
1216 r"\n-\n",
1217 "",
1218 BracketedClassItem::Range(lit(b'\n', 0..2, false), lit(b'\n', 3..5, false)),
1219 );
1220 parse(
1221 bracketed_class_range_or_literal,
1222 r"\x01-\xFE",
1223 "",
1224 BracketedClassItem::Range(lit(b'\x01', 0..4, false), lit(b'\xFE', 5..9, false)),
1225 );
1226
1227 parse_err(bracketed_class_range_or_literal, "é");
1228 parse_err(bracketed_class_range_or_literal, "b-a");
1229 parse_err(bracketed_class_range_or_literal, "é-a");
1230 parse_err(bracketed_class_range_or_literal, "a-é");
1231 parse_err(bracketed_class_range_or_literal, "]-a");
1232 }
1233
1234 #[test]
1235 fn test_bracketed_class_literal() {
1236 parse(bracketed_class_literal, "ab", "b", lit(b'a', 0..1, false));
1237 parse(
1238 bracketed_class_literal,
1239 "\\nb",
1240 "b",
1241 lit(b'\n', 0..2, false),
1242 );
1243 parse(bracketed_class_literal, "\\]", "", lit(b']', 0..2, true));
1244
1245 parse_err(bracketed_class_literal, "]b");
1246 parse_err(bracketed_class_literal, "é");
1247 parse_err(bracketed_class_literal, "\\x1");
1248 parse_err(bracketed_class_literal, "\\é");
1249 }
1250
1251 #[test]
1252 fn test_bracketed_class_char() {
1253 parse(bracketed_class_char, "ab", "b", lit(b'a', 0..1, false));
1254
1255 parse_err(bracketed_class_char, "]b");
1256 parse_err(bracketed_class_char, "é");
1257 }
1258
1259 #[test]
1260 fn test_literal() {
1261 parse(literal, "ab", "b", Node::Literal(lit(b'a', 0..1, false)));
1262 parse(literal, "]", "", Node::Literal(lit(b']', 0..1, false)));
1263
1264 parse(
1265 literal,
1266 "éb",
1267 "b",
1268 Node::Char(LiteralChar {
1269 c: 'é',
1270 span: 0..2,
1271 escaped: false,
1272 }),
1273 );
1274 }
1275
1276 #[test]
1277 fn test_escaped_char() {
1278 parse(
1279 escaped_char,
1280 "\\na",
1281 "a",
1282 Node::Literal(lit(b'\n', 0..2, false)),
1283 );
1284 parse(
1285 escaped_char,
1286 "\\ta",
1287 "a",
1288 Node::Literal(lit(b'\t', 0..2, false)),
1289 );
1290 parse(
1291 escaped_char,
1292 "\\ra",
1293 "a",
1294 Node::Literal(lit(b'\r', 0..2, false)),
1295 );
1296 parse(
1297 escaped_char,
1298 "\\fa",
1299 "a",
1300 Node::Literal(lit(b'\x0C', 0..2, false)),
1301 );
1302 parse(
1303 escaped_char,
1304 "\\aa",
1305 "a",
1306 Node::Literal(lit(b'\x07', 0..2, false)),
1307 );
1308 parse(
1309 escaped_char,
1310 "\\x00a",
1311 "a",
1312 Node::Literal(lit(b'\0', 0..4, false)),
1313 );
1314 parse(
1315 escaped_char,
1316 "\\xAF a",
1317 " a",
1318 Node::Literal(lit(b'\xAF', 0..4, false)),
1319 );
1320 parse(
1321 escaped_char,
1322 "\\k",
1323 "",
1324 Node::Literal(lit(b'k', 0..2, true)),
1325 );
1326 parse(
1327 escaped_char,
1328 "\\é_",
1329 "_",
1330 Node::Char(LiteralChar {
1331 c: 'é',
1332 span: 0..3,
1333 escaped: true,
1334 }),
1335 );
1336
1337 parse_err(escaped_char, "\\");
1338 parse_err(escaped_char, "\\x");
1339 parse_err(escaped_char, "\\x2");
1340 parse_err(escaped_char, "\\x2G");
1341 }
1342
1343 #[test]
1344 fn test_escaped_char_only_ascii() {
1345 parse(
1346 escaped_char_only_ascii,
1347 "\\na",
1348 "a",
1349 lit(b'\n', 0..2, false),
1350 );
1351 parse(
1352 escaped_char_only_ascii,
1353 "\\ta",
1354 "a",
1355 lit(b'\t', 0..2, false),
1356 );
1357 parse(
1358 escaped_char_only_ascii,
1359 "\\ra",
1360 "a",
1361 lit(b'\r', 0..2, false),
1362 );
1363 parse(
1364 escaped_char_only_ascii,
1365 "\\fa",
1366 "a",
1367 lit(b'\x0C', 0..2, false),
1368 );
1369 parse(
1370 escaped_char_only_ascii,
1371 "\\aa",
1372 "a",
1373 lit(b'\x07', 0..2, false),
1374 );
1375 parse(
1376 escaped_char_only_ascii,
1377 "\\x00a",
1378 "a",
1379 lit(b'\0', 0..4, false),
1380 );
1381 parse(
1382 escaped_char_only_ascii,
1383 "\\xAF a",
1384 " a",
1385 lit(b'\xAF', 0..4, false),
1386 );
1387 parse(escaped_char_only_ascii, "\\k", "", lit(b'k', 0..2, true));
1388
1389 parse_err(escaped_char_only_ascii, "\\");
1390 parse_err(escaped_char_only_ascii, "\\é");
1391 parse_err(escaped_char_only_ascii, "\\x");
1392 parse_err(escaped_char_only_ascii, "\\x2");
1393 parse_err(escaped_char_only_ascii, "\\x2G");
1394 }
1395
1396 #[test]
1397 fn test_range_repetition() {
1398 parse(
1399 range_repetition,
1400 "{0} ?a",
1401 " ?a",
1402 (RepetitionRange::Exactly(0), true),
1403 );
1404 parse(
1405 range_repetition,
1406 "{5}?a",
1407 "a",
1408 (RepetitionRange::Exactly(5), false),
1409 );
1410
1411 parse(
1412 range_repetition,
1413 "{5,15} a",
1414 " a",
1415 (RepetitionRange::Bounded(5, 15), true),
1416 );
1417 parse(
1418 range_repetition,
1419 "{5,}?a",
1420 "a",
1421 (RepetitionRange::AtLeast(5), false),
1422 );
1423
1424 parse_err(range_repetition, "{}?");
1425 }
1426
1427 #[test]
1428 fn test_range_single() {
1429 parse(range_single, "{0}a", "a", RepetitionRange::Exactly(0));
1430 parse(range_single, "{350} a", " a", RepetitionRange::Exactly(350));
1431
1432 parse_err(range_single, "{");
1433 parse_err(range_single, "{}");
1434 parse_err(range_single, "{-5}");
1435 }
1436
1437 #[test]
1438 fn test_range_multi() {
1439 parse(range_multi, "{,5}a", "a", RepetitionRange::Bounded(0, 5));
1440 parse(range_multi, "{5,}a", "a", RepetitionRange::AtLeast(5));
1441 parse(range_multi, "{5,10}a", "a", RepetitionRange::Bounded(5, 10));
1442 parse(range_multi, "{0,0} a", " a", RepetitionRange::Bounded(0, 0));
1443 parse(range_multi, "{,}", "", RepetitionRange::AtLeast(0));
1444 parse(range_multi, "{, }", "", RepetitionRange::AtLeast(0));
1445 parse(range_multi, "{ ,}", "", RepetitionRange::AtLeast(0));
1446 parse(range_multi, "{ , }", "", RepetitionRange::AtLeast(0));
1447 parse(range_multi, "{2 , }", "", RepetitionRange::AtLeast(2));
1448 parse(range_multi, "{ , 2}", "", RepetitionRange::Bounded(0, 2));
1449 parse(range_multi, "{1 , 2}", "", RepetitionRange::Bounded(1, 2));
1450
1451 parse_err(range_multi, "{");
1452 parse_err(range_multi, "{,5");
1453 parse_err(range_multi, "{,-5}");
1454 parse_err(range_multi, "{-5,}");
1455 parse_err(range_multi, "{10,5}");
1456 parse_err(range_multi, "{ 1,5}");
1457 parse_err(range_multi, "{1,5 }");
1458 }
1459
1460 #[test]
1461 fn test_parse_u32() {
1462 parse(parse_u32, "5a", "a", 5_u32);
1463
1464 parse_err(parse_u32, "a");
1465 parse_err(parse_u32, "-5a");
1466 parse_err(parse_u32, "5000000000000");
1467 }
1468
1469 #[test]
1470 fn test_parse_opt_u32() {
1471 parse(parse_opt_u32, "a", "a", None);
1472 parse(parse_opt_u32, "5a", "a", Some(5));
1473 parse(parse_opt_u32, "-5a", "-5a", None);
1474
1475 parse_err(parse_opt_u32, "5000000000000");
1476 }
1477
1478 #[test]
1479 fn test_stack_overflow() {
1480 let mut v = String::new();
1483 v.push('/');
1484 for _ in 0..1_000 {
1485 v.push_str("a(b");
1486 }
1487 for _ in 0..1_000 {
1488 v.push_str(")c");
1489 }
1490 v.push('/');
1491
1492 parse_err_type(regex, &v, &Error::new(30..30, ErrorKind::RegexTooDeep));
1493
1494 let mut v = String::new();
1496 v.push('/');
1497 let nb = MAX_REGEX_RECURSION - 1;
1498 for _ in 0..nb {
1499 v.push_str("a(b");
1500 }
1501 for _ in 0..nb {
1502 v.push_str(")c");
1503 }
1504 v.push('d');
1505 for _ in 0..nb {
1506 v.push_str("e(f");
1507 }
1508 for _ in 0..nb {
1509 v.push_str(")h");
1510 }
1511 v.push('/');
1512
1513 let input = Input::new(&v);
1514 let _res = regex(input).unwrap();
1515 assert_eq!(input.inner_recursion_counter, 0);
1516 }
1517
1518 #[test]
1519 fn test_parse_regex() {
1520 assert!(parse_regex(r"/a{2}/").is_ok());
1521 assert!(parse_regex(r"a{2}/").is_err());
1522 }
1523
1524 #[test]
1525 fn test_public_types() {
1526 test_public_type(regex(Input::new(r"/a{2}[az]\b\s|.+$/")).unwrap());
1527 }
1528}