boreal_parser/
regex.rs

1//! AST elements related to YARA regexes.
2use std::ops::Range;
3
4use nom::branch::alt;
5use nom::bytes::complete::{tag, take};
6use nom::character::complete::{anychar, char, digit0, digit1, multispace0, none_of};
7use nom::combinator::{cut, map, opt};
8use nom::multi::many0;
9use nom::sequence::{delimited, separated_pair, terminated};
10use nom::Parser;
11
12use crate::error::ErrorKind;
13
14use super::error::Error;
15use super::nom_recipes::rtrim;
16use super::types::{Input, ParseResult};
17
18const MAX_REGEX_RECURSION: usize = 10;
19
20/// A regular expression.
21#[derive(Clone, Debug, PartialEq, Eq)]
22pub struct Regex {
23    /// The AST of the regular expression parsed inside the `/` delimiters.
24    pub ast: Node,
25    /// case insensitive (`i` flag).
26    pub case_insensitive: bool,
27    /// `.` matches `\n` (`s` flag).
28    pub dot_all: bool,
29
30    /// The span of the regex expression
31    pub span: Range<usize>,
32}
33
34/// AST node of a regular expression.
35#[derive(Clone, Debug, PartialEq, Eq)]
36pub enum Node {
37    /// Alternation of nodes, ie `node|node|...`.
38    Alternation(Vec<Node>),
39
40    /// Zero-width assertion, e.g. ^, \b, ...
41    Assertion(AssertionKind),
42
43    /// Set of allowed values for a single byte.
44    Class(ClassKind),
45
46    /// Concatenation, must match in order.
47    Concat(Vec<Node>),
48
49    /// The special `.` character.
50    Dot,
51
52    /// Empty expression.
53    Empty,
54
55    /// Literal byte.
56    Literal(Literal),
57
58    /// Literal char, not ascii.
59    Char(LiteralChar),
60
61    /// A group, i.e. (...).
62    Group(Box<Node>),
63
64    /// Repetition of an expression.
65    Repetition {
66        /// Expression to repeat.
67        node: Box<Node>,
68
69        /// Kind of repetition.
70        kind: RepetitionKind,
71
72        /// Is the repetition greedy or not.
73        greedy: bool,
74    },
75}
76
77/// Regex class.
78#[derive(Clone, Debug, PartialEq, Eq)]
79pub enum ClassKind {
80    /// Perl style class, e.g. `\w`, `\d`.
81    Perl(PerlClass),
82    /// Bracket class, i.e. `[...]`.
83    Bracketed(BracketedClass),
84}
85
86/// PERL style class.
87#[derive(Clone, Debug, PartialEq, Eq)]
88pub struct PerlClass {
89    /// Class kind.
90    pub kind: PerlClassKind,
91    /// Is the class negated.
92    pub negated: bool,
93}
94
95/// Kind of PERL style class.
96#[derive(Clone, Debug, PartialEq, Eq)]
97pub enum PerlClassKind {
98    /// Word class, i.e. `[a-zA-Z0-9_]`.
99    Word,
100    /// Space class, i.e. `[\t\n\v\f\r ]`.
101    Space,
102    /// Digit class, i.e. `[0-9]`.
103    Digit,
104}
105
106/// Class expressed in brackets.
107#[derive(Clone, Debug, PartialEq, Eq)]
108pub struct BracketedClass {
109    /// List of items in the class.
110    pub items: Vec<BracketedClassItem>,
111    /// Is the class negated.
112    pub negated: bool,
113}
114
115/// Item in a bracketed class.
116#[derive(Clone, Debug, PartialEq, Eq)]
117pub enum BracketedClassItem {
118    /// Perl style class.
119    Perl(PerlClass),
120    /// Literal byte.
121    Literal(Literal),
122    /// Range of bytes.
123    Range(Literal, Literal),
124}
125
126/// Kind of repetition.
127#[derive(Clone, Debug, PartialEq, Eq)]
128pub enum RepetitionKind {
129    /// zero or one, i.e. `?`
130    ZeroOrOne,
131    /// zero or more, i.e. `*`
132    ZeroOrMore,
133    /// one or more, i.e. `+`
134    OneOrMore,
135    /// Range, i.e. `{N}`, `{N,M}`, etc.
136    Range(RepetitionRange),
137}
138
139/// Repetition range.
140#[derive(Clone, Debug, PartialEq, Eq)]
141pub enum RepetitionRange {
142    /// Exactly the given number, i.e. `{N}`.
143    Exactly(u32),
144    /// At least the given number, i.e. `{N,}`.
145    AtLeast(u32),
146    /// Between two numbers, i.e. `{N,M}`.
147    Bounded(u32, u32),
148}
149
150/// Kind of zero-width assertion.
151#[derive(Clone, Debug, PartialEq, Eq)]
152pub enum AssertionKind {
153    /// Start of the line, i.e. `^`.
154    StartLine,
155    /// End of the line, i.e. `$`.
156    EndLine,
157    /// Word boundary, i.e. `\b`.
158    WordBoundary,
159    /// Non word boundary, i.e. `\B`.
160    NonWordBoundary,
161}
162
163/// Literal unicode character.
164#[derive(Clone, Debug, PartialEq, Eq)]
165pub struct LiteralChar {
166    /// The unicode character.
167    pub c: char,
168
169    /// Position in the input for this char.
170    pub span: Range<usize>,
171
172    /// Was the character escaped.
173    ///
174    /// See `Literal::escaped` for more details on what this means.
175    pub escaped: bool,
176}
177
178/// Literal byte
179#[derive(Clone, Debug, PartialEq, Eq)]
180pub struct Literal {
181    /// Byte value.
182    pub byte: u8,
183
184    /// Span of the literal
185    pub span: Range<usize>,
186
187    /// Was the byte escaped.
188    ///
189    /// This is for example true for '\[' or '\.', but not for '\n' or '\xAB'.
190    pub escaped: bool,
191}
192
193impl PartialOrd for Literal {
194    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
195        self.byte.partial_cmp(&other.byte)
196    }
197}
198
199/// Parse a regex.
200///
201/// The input is expected to look like `/<regex>/<modifiers>`.
202///
203/// # Errors
204///
205/// Returns an error if the parsing fails.
206pub fn parse_regex(input: &str) -> Result<Regex, Error> {
207    use nom::Finish;
208
209    let input = Input::new(input);
210    let (_, res) = regex(input).finish()?;
211
212    Ok(res)
213}
214
215/// Parse a regular expression.
216///
217/// Similar to the _REGEX_ lexical pattern in libyara. but the parsing of the AST is done
218/// directly.
219///
220/// XXX: There is change of behavior from libyara. `\<nul_byte>` was forbidden,
221/// but we do not have an issue about this (we do not save the regular expression
222/// as a C string). See [Issue #576 in Yara](https://github.com/VirusTotal/yara/issues/576).
223pub(crate) fn regex(input: Input) -> ParseResult<Regex> {
224    let start = input.pos();
225    let (input, _) = char('/').parse(input)?;
226
227    // We cannot use escaped_transform, as it is not an error to use
228    // the control character with any char other than `/`.
229    let (input, ast) = cut(terminated(alternative, char('/'))).parse(input)?;
230    let (input, (no_case, dot_all)) = rtrim((opt(char('i')), opt(char('s')))).parse(input)?;
231
232    Ok((
233        input,
234        Regex {
235            ast,
236            case_insensitive: no_case.is_some(),
237            dot_all: dot_all.is_some(),
238            span: input.get_span_from(start),
239        },
240    ))
241}
242
243fn alternative(mut input: Input) -> ParseResult<Node> {
244    // This combinator is recursive:
245    //
246    // tokens => hex_token => alternatives => tokens
247    //
248    // Use the inner recursive counter to make sure this recursion cannot grow too much.
249    if input.inner_recursion_counter >= MAX_REGEX_RECURSION {
250        return Err(nom::Err::Failure(Error::new(
251            input.get_span_from(input.pos()),
252            ErrorKind::RegexTooDeep,
253        )));
254    }
255
256    let mut alts = Vec::new();
257    loop {
258        input.inner_recursion_counter += 1;
259        let (mut input2, node) = concatenation(input)?;
260        input2.inner_recursion_counter -= 1;
261
262        let (input2, has_alt_char) = eat_opt_char('|', input2);
263        if has_alt_char {
264            alts.push(node);
265            input = input2;
266            continue;
267        }
268
269        return Ok((
270            input2,
271            if alts.is_empty() {
272                node
273            } else {
274                alts.push(node);
275                Node::Alternation(alts)
276            },
277        ));
278    }
279}
280
281fn eat_opt_char(c: char, mut input: Input) -> (Input, bool) {
282    match input.cursor().chars().next() {
283        Some(c2) if c2 == c => {
284            input.advance(c.len_utf8());
285            (input, true)
286        }
287        _ => (input, false),
288    }
289}
290
291fn concatenation(input: Input) -> ParseResult<Node> {
292    let (input, mut nodes) = many0(repeat).parse(input)?;
293
294    let node = if nodes.is_empty() {
295        Node::Empty
296    } else if nodes.len() == 1 {
297        nodes.pop().unwrap()
298    } else {
299        Node::Concat(nodes)
300    };
301
302    Ok((input, node))
303}
304
305fn repeat(input: Input) -> ParseResult<Node> {
306    // First, parse assertions
307    if let Ok((input, node)) = assertion(input) {
308        return Ok((input, node));
309    }
310
311    // Otherwise, parse single node with optional repetition
312    let (input, node) = single(input)?;
313    let (input, repetition) = opt(repetition).parse(input)?;
314    match repetition {
315        Some((kind, greedy)) => Ok((
316            input,
317            Node::Repetition {
318                node: Box::new(node),
319                kind,
320                greedy,
321            },
322        )),
323        None => Ok((input, node)),
324    }
325}
326
327// Parse node that contains a repetition, or nodes that cannot be repeated
328fn assertion(input: Input) -> ParseResult<Node> {
329    alt((
330        map(tag(r"\b"), |_| Node::Assertion(AssertionKind::WordBoundary)),
331        map(tag(r"\B"), |_| {
332            Node::Assertion(AssertionKind::NonWordBoundary)
333        }),
334        map(char('^'), |_| Node::Assertion(AssertionKind::StartLine)),
335        map(char('$'), |_| Node::Assertion(AssertionKind::EndLine)),
336    ))
337    .parse(input)
338}
339
340fn repetition(input: Input) -> ParseResult<(RepetitionKind, bool)> {
341    alt((
342        map(tag("*?"), |_| (RepetitionKind::ZeroOrMore, false)),
343        map(tag("+?"), |_| (RepetitionKind::OneOrMore, false)),
344        map(tag("??"), |_| (RepetitionKind::ZeroOrOne, false)),
345        map(tag("*"), |_| (RepetitionKind::ZeroOrMore, true)),
346        map(tag("+"), |_| (RepetitionKind::OneOrMore, true)),
347        map(tag("?"), |_| (RepetitionKind::ZeroOrOne, true)),
348        map(range_repetition, |(kind, greedy)| {
349            (RepetitionKind::Range(kind), greedy)
350        }),
351    ))
352    .parse(input)
353}
354
355fn single(input: Input) -> ParseResult<Node> {
356    alt((
357        map(delimited(char('('), alternative, char(')')), |node| {
358            Node::Group(Box::new(node))
359        }),
360        map(char('.'), |_| Node::Dot),
361        map(perl_class, |p| Node::Class(ClassKind::Perl(p))),
362        map(bracketed_class, |p| Node::Class(ClassKind::Bracketed(p))),
363        escaped_char,
364        literal,
365    ))
366    .parse(input)
367}
368
369fn perl_class(input: Input) -> ParseResult<PerlClass> {
370    alt((
371        map(tag(r"\w"), |_| PerlClass {
372            kind: PerlClassKind::Word,
373            negated: false,
374        }),
375        map(tag(r"\W"), |_| PerlClass {
376            kind: PerlClassKind::Word,
377            negated: true,
378        }),
379        map(tag(r"\s"), |_| PerlClass {
380            kind: PerlClassKind::Space,
381            negated: false,
382        }),
383        map(tag(r"\S"), |_| PerlClass {
384            kind: PerlClassKind::Space,
385            negated: true,
386        }),
387        map(tag(r"\d"), |_| PerlClass {
388            kind: PerlClassKind::Digit,
389            negated: false,
390        }),
391        map(tag(r"\D"), |_| PerlClass {
392            kind: PerlClassKind::Digit,
393            negated: true,
394        }),
395    ))
396    .parse(input)
397}
398
399fn bracketed_class(input: Input) -> ParseResult<BracketedClass> {
400    let (input, _) = char('[').parse(input)?;
401    // As soon as we parse a '[', we are in class mode, hence the cut if parsing fails.
402    cut(bracketed_class_inner).parse(input)
403}
404
405fn bracketed_class_inner(input: Input) -> ParseResult<BracketedClass> {
406    let (input, negated) = eat_opt_char('^', input);
407    let start = input.pos();
408    let (input2, contains_closing_bracket) = eat_opt_char(']', input);
409
410    let (input, mut items) = many0(bracketed_class_item).parse(input2)?;
411    let (input, _) = char(']').parse(input)?;
412
413    if contains_closing_bracket {
414        items.push(BracketedClassItem::Literal(Literal {
415            byte: b']',
416            span: input2.get_span_from_no_rtrim(start),
417            escaped: false,
418        }));
419    }
420    Ok((input, BracketedClass { items, negated }))
421}
422
423fn bracketed_class_item(input: Input) -> ParseResult<BracketedClassItem> {
424    alt((
425        map(perl_class, BracketedClassItem::Perl),
426        bracketed_class_range_or_literal,
427    ))
428    .parse(input)
429}
430
431fn bracketed_class_range_or_literal(input: Input) -> ParseResult<BracketedClassItem> {
432    let start = input.pos();
433    let (input, lit) = bracketed_class_literal(input)?;
434    let (input2, has_dash) = eat_opt_char('-', input);
435
436    if has_dash {
437        let (input3, lit2) = opt(bracketed_class_literal).parse(input2)?;
438        match lit2 {
439            Some(lit2) if lit2 < lit => Err(nom::Err::Failure(Error::new(
440                input3.get_span_from_no_rtrim(start),
441                ErrorKind::RegexClassRangeInvalid,
442            ))),
443            Some(lit2) => Ok((input3, BracketedClassItem::Range(lit, lit2))),
444            None => Ok((input, BracketedClassItem::Literal(lit))),
445        }
446    } else {
447        Ok((input, BracketedClassItem::Literal(lit)))
448    }
449}
450
451fn bracketed_class_literal(input: Input) -> ParseResult<Literal> {
452    alt((escaped_char_only_ascii, bracketed_class_char)).parse(input)
453}
454
455fn bracketed_class_char(input: Input) -> ParseResult<Literal> {
456    let start = input.pos();
457
458    // / and \n are disallowed because of the surrounding rule (we are parsing a /.../ variable,
459    // and newlines are not allowed
460    // ] is disallowed because it indicates the end of the class
461    let (input, b) = none_of("/\n]").parse(input)?;
462    if b.is_ascii() {
463        Ok((
464            input,
465            Literal {
466                byte: b as u8,
467                span: input.get_span_from_no_rtrim(start),
468                escaped: false,
469            },
470        ))
471    } else {
472        Err(nom::Err::Failure(Error::new(
473            input.get_span_from_no_rtrim(start),
474            ErrorKind::RegexNonAsciiByte,
475        )))
476    }
477}
478
479fn literal(input: Input) -> ParseResult<Node> {
480    let start = input.pos();
481
482    // / and \n are disallowed because of the surrounding rule (we are parsing a /.../ variable,
483    // and newlines are not allowed
484    // rest is disallowed because they have specific meaning.
485    let (input, c) = none_of("/\n()[\\|.$^+*?").parse(input)?;
486    let node = if c.is_ascii() {
487        Node::Literal(Literal {
488            byte: c as u8,
489            span: input.get_span_from_no_rtrim(start),
490            escaped: false,
491        })
492    } else {
493        Node::Char(LiteralChar {
494            c,
495            span: input.get_span_from_no_rtrim(start),
496            escaped: false,
497        })
498    };
499
500    Ok((input, node))
501}
502
503fn escaped_char(input: Input) -> ParseResult<Node> {
504    let (input, res) = escaped_char_inner(input)?;
505
506    let node = match res {
507        Escaped {
508            kind: EscapedKind::Byte(byte),
509            span,
510            escaped,
511        } => Node::Literal(Literal {
512            byte,
513            span,
514            escaped,
515        }),
516        Escaped {
517            kind: EscapedKind::Char(c),
518            span,
519            escaped,
520        } => Node::Char(LiteralChar { c, span, escaped }),
521    };
522
523    Ok((input, node))
524}
525
526fn escaped_char_only_ascii(input: Input) -> ParseResult<Literal> {
527    let (input, res) = escaped_char_inner(input)?;
528
529    match res {
530        Escaped {
531            kind: EscapedKind::Byte(byte),
532            span,
533            escaped,
534        } => Ok((
535            input,
536            Literal {
537                byte,
538                span,
539                escaped,
540            },
541        )),
542        Escaped {
543            kind: EscapedKind::Char(_),
544            span,
545            ..
546        } => Err(nom::Err::Failure(Error::new(
547            span,
548            ErrorKind::RegexNonAsciiByte,
549        ))),
550    }
551}
552
553fn escaped_char_inner(input: Input) -> ParseResult<Escaped> {
554    let start = input.pos();
555    let (input2, _) = char('\\').parse(input)?;
556    let (input, b) = anychar(input2)?;
557
558    let span = input.get_span_from_no_rtrim(start);
559    let (kind, escaped) = match b {
560        'n' => (EscapedKind::Byte(b'\n'), false),
561        't' => (EscapedKind::Byte(b'\t'), false),
562        'r' => (EscapedKind::Byte(b'\r'), false),
563        'f' => (EscapedKind::Byte(b'\x0C'), false),
564        'a' => (EscapedKind::Byte(b'\x07'), false),
565        'x' => {
566            let (input, n) = cut(take(2_u32)).parse(input)?;
567
568            let n = match u8::from_str_radix(&n, 16) {
569                Ok(n) => n,
570                Err(e) => {
571                    return Err(nom::Err::Failure(Error::new(
572                        input.get_span_from_no_rtrim(start),
573                        ErrorKind::StrToHexIntError(e),
574                    )));
575                }
576            };
577            return Ok((
578                input,
579                Escaped {
580                    kind: EscapedKind::Byte(n),
581                    span: input.get_span_from_no_rtrim(start),
582                    escaped: false,
583                },
584            ));
585        }
586        c if c.is_ascii() => (EscapedKind::Byte(c as u8), true),
587        c => (EscapedKind::Char(c), true),
588    };
589
590    Ok((
591        input,
592        Escaped {
593            kind,
594            span,
595            escaped,
596        },
597    ))
598}
599
600struct Escaped {
601    kind: EscapedKind,
602    span: Range<usize>,
603    escaped: bool,
604}
605
606#[allow(variant_size_differences)]
607enum EscapedKind {
608    Byte(u8),
609    Char(char),
610}
611
612fn range_repetition(input: Input) -> ParseResult<(RepetitionRange, bool)> {
613    let (input, range) = alt((range_single, range_multi)).parse(input)?;
614    let (input, non_greedy) = eat_opt_char('?', input);
615
616    Ok((input, (range, !non_greedy)))
617}
618
619fn range_single(input: Input) -> ParseResult<RepetitionRange> {
620    let (input, v) = delimited(char('{'), parse_u32, char('}')).parse(input)?;
621
622    Ok((input, RepetitionRange::Exactly(v)))
623}
624
625fn range_multi(input: Input) -> ParseResult<RepetitionRange> {
626    let start = input.pos();
627    let (input, (from, to)) = delimited(
628        char('{'),
629        separated_pair(
630            parse_opt_u32,
631            delimited(multispace0, char(','), multispace0),
632            parse_opt_u32,
633        ),
634        char('}'),
635    )
636    .parse(input)?;
637
638    let range = match (from, to) {
639        (None, None) => RepetitionRange::AtLeast(0),
640        (Some(from), None) => RepetitionRange::AtLeast(from),
641        (None, Some(to)) => RepetitionRange::Bounded(0, to),
642        (Some(from), Some(to)) if to < from => {
643            return Err(nom::Err::Failure(Error::new(
644                input.get_span_from_no_rtrim(start),
645                ErrorKind::RegexRangeInvalid,
646            )))
647        }
648        (Some(from), Some(to)) => RepetitionRange::Bounded(from, to),
649    };
650
651    Ok((input, range))
652}
653
654fn parse_u32(input: Input) -> ParseResult<u32> {
655    let start = input.pos();
656    let (input, v) = digit1(input)?;
657
658    let n = match str::parse::<u32>(&v) {
659        Ok(n) => n,
660        Err(e) => {
661            return Err(nom::Err::Failure(Error::new(
662                input.get_span_from_no_rtrim(start),
663                ErrorKind::StrToIntError(e),
664            )))
665        }
666    };
667
668    Ok((input, n))
669}
670
671fn parse_opt_u32(input: Input) -> ParseResult<Option<u32>> {
672    let start = input.pos();
673    let (input, v) = match digit0::<_, Error>(input) {
674        Ok((input, s)) if !s.is_empty() => (input, s),
675        _ => return Ok((input, None)),
676    };
677
678    let n = match str::parse::<u32>(&v) {
679        Ok(n) => n,
680        Err(e) => {
681            return Err(nom::Err::Failure(Error::new(
682                input.get_span_from_no_rtrim(start),
683                ErrorKind::StrToIntError(e),
684            )))
685        }
686    };
687
688    Ok((input, Some(n)))
689}
690
691#[cfg(test)]
692mod tests {
693    use super::*;
694    use crate::test_helpers::{parse, parse_err, parse_err_type, test_public_type};
695
696    fn lit(byte: u8, span: Range<usize>, escaped: bool) -> Literal {
697        Literal {
698            byte,
699            span,
700            escaped,
701        }
702    }
703
704    #[test]
705    fn test_parse() {
706        parse(
707            regex,
708            "/a/i",
709            "",
710            Regex {
711                ast: Node::Literal(lit(b'a', 1..2, false)),
712                case_insensitive: true,
713                dot_all: false,
714                span: 0..4,
715            },
716        );
717        parse(
718            regex,
719            "/[^0-9]+/a",
720            "a",
721            Regex {
722                ast: Node::Repetition {
723                    node: Box::new(Node::Class(ClassKind::Bracketed(BracketedClass {
724                        items: vec![BracketedClassItem::Range(
725                            lit(b'0', 3..4, false),
726                            lit(b'9', 5..6, false),
727                        )],
728                        negated: true,
729                    }))),
730                    kind: RepetitionKind::OneOrMore,
731                    greedy: true,
732                },
733                case_insensitive: false,
734                dot_all: false,
735                span: 0..9,
736            },
737        );
738        parse(
739            regex,
740            r"/a\/b\cd/isb",
741            "b",
742            Regex {
743                ast: Node::Concat(vec![
744                    Node::Literal(lit(b'a', 1..2, false)),
745                    Node::Literal(lit(b'/', 2..4, true)),
746                    Node::Literal(lit(b'b', 4..5, false)),
747                    Node::Literal(lit(b'c', 5..7, true)),
748                    Node::Literal(lit(b'd', 7..8, false)),
749                ]),
750                case_insensitive: true,
751                dot_all: true,
752                span: 0..11,
753            },
754        );
755        parse(
756            regex,
757            r"/.{2}/si c",
758            "i c",
759            Regex {
760                ast: Node::Repetition {
761                    node: Box::new(Node::Dot),
762                    kind: RepetitionKind::Range(RepetitionRange::Exactly(2)),
763                    greedy: true,
764                },
765                case_insensitive: false,
766                dot_all: true,
767                span: 0..7,
768            },
769        );
770        parse(
771            regex,
772            "/\0\\\0/ c",
773            "c",
774            Regex {
775                ast: Node::Concat(vec![
776                    Node::Literal(lit(b'\0', 1..2, false)),
777                    Node::Literal(lit(b'\0', 2..4, true)),
778                ]),
779                case_insensitive: false,
780                dot_all: false,
781                span: 0..5,
782            },
783        );
784
785        parse_err(regex, "");
786        parse_err(regex, "/");
787        parse_err(regex, "/\n/");
788        parse_err(regex, "/a{2}");
789        parse_err(regex, "/a///");
790        parse_err(regex, "/a{5,4}/");
791    }
792
793    #[test]
794    fn test_alternative() {
795        parse(alternative, "(", "(", Node::Empty);
796        parse(
797            alternative,
798            "a)",
799            ")",
800            Node::Literal(lit(b'a', 0..1, false)),
801        );
802        parse(
803            alternative,
804            "a|b",
805            "",
806            Node::Alternation(vec![
807                Node::Literal(lit(b'a', 0..1, false)),
808                Node::Literal(lit(b'b', 2..3, false)),
809            ]),
810        );
811        parse(
812            alternative,
813            "a|)",
814            ")",
815            Node::Alternation(vec![Node::Literal(lit(b'a', 0..1, false)), Node::Empty]),
816        );
817
818        parse(
819            alternative,
820            r"ab|.\||\b$|",
821            "",
822            Node::Alternation(vec![
823                Node::Concat(vec![
824                    Node::Literal(lit(b'a', 0..1, false)),
825                    Node::Literal(lit(b'b', 1..2, false)),
826                ]),
827                Node::Concat(vec![Node::Dot, Node::Literal(lit(b'|', 4..6, true))]),
828                Node::Concat(vec![
829                    Node::Assertion(AssertionKind::WordBoundary),
830                    Node::Assertion(AssertionKind::EndLine),
831                ]),
832                Node::Empty,
833            ]),
834        );
835
836        parse_err(alternative, "\\xEG");
837    }
838
839    #[test]
840    fn test_concatenation() {
841        parse(concatenation, "", "", Node::Empty);
842        parse(
843            concatenation,
844            "a",
845            "",
846            Node::Literal(lit(b'a', 0..1, false)),
847        );
848        parse(
849            concatenation,
850            "ab",
851            "",
852            Node::Concat(vec![
853                Node::Literal(lit(b'a', 0..1, false)),
854                Node::Literal(lit(b'b', 1..2, false)),
855            ]),
856        );
857        parse(
858            concatenation,
859            "a$*",
860            "*",
861            Node::Concat(vec![
862                Node::Literal(lit(b'a', 0..1, false)),
863                Node::Assertion(AssertionKind::EndLine),
864            ]),
865        );
866        parse(
867            concatenation,
868            r"^a+\b\d{2,3}[^z]*?)",
869            ")",
870            Node::Concat(vec![
871                Node::Assertion(AssertionKind::StartLine),
872                Node::Repetition {
873                    node: Box::new(Node::Literal(lit(b'a', 1..2, false))),
874                    kind: RepetitionKind::OneOrMore,
875                    greedy: true,
876                },
877                Node::Assertion(AssertionKind::WordBoundary),
878                Node::Repetition {
879                    node: Box::new(Node::Class(ClassKind::Perl(PerlClass {
880                        kind: PerlClassKind::Digit,
881                        negated: false,
882                    }))),
883                    kind: RepetitionKind::Range(RepetitionRange::Bounded(2, 3)),
884                    greedy: true,
885                },
886                Node::Repetition {
887                    node: Box::new(Node::Class(ClassKind::Bracketed(BracketedClass {
888                        items: vec![BracketedClassItem::Literal(lit(b'z', 14..15, false))],
889                        negated: true,
890                    }))),
891                    kind: RepetitionKind::ZeroOrMore,
892                    greedy: false,
893                },
894            ]),
895        );
896
897        parse_err(concatenation, "\\xEG");
898    }
899
900    #[test]
901    fn test_assertion() {
902        parse(
903            assertion,
904            r"\ba",
905            "a",
906            Node::Assertion(AssertionKind::WordBoundary),
907        );
908        parse(
909            assertion,
910            r"\B ",
911            " ",
912            Node::Assertion(AssertionKind::NonWordBoundary),
913        );
914        parse(
915            assertion,
916            r"^^",
917            "^",
918            Node::Assertion(AssertionKind::StartLine),
919        );
920        parse(
921            assertion,
922            r"$^",
923            "^",
924            Node::Assertion(AssertionKind::EndLine),
925        );
926
927        parse_err(assertion, r"\w");
928    }
929
930    #[test]
931    fn test_repetition() {
932        parse(repetition, "*??", "?", (RepetitionKind::ZeroOrMore, false));
933        parse(repetition, "+??", "?", (RepetitionKind::OneOrMore, false));
934        parse(repetition, "???", "?", (RepetitionKind::ZeroOrOne, false));
935        parse(repetition, "*a?", "a?", (RepetitionKind::ZeroOrMore, true));
936        parse(repetition, "+a?", "a?", (RepetitionKind::OneOrMore, true));
937        parse(repetition, "?a?", "a?", (RepetitionKind::ZeroOrOne, true));
938        parse(
939            repetition,
940            "{5}??",
941            "?",
942            (RepetitionKind::Range(RepetitionRange::Exactly(5)), false),
943        );
944
945        parse_err(repetition, "5");
946    }
947
948    #[test]
949    fn test_single() {
950        parse(single, ".a", "a", Node::Dot);
951        parse(single, "()a", "a", Node::Group(Box::new(Node::Empty)));
952        parse(
953            single,
954            "(ab)a",
955            "a",
956            Node::Group(Box::new(Node::Concat(vec![
957                Node::Literal(lit(b'a', 1..2, false)),
958                Node::Literal(lit(b'b', 2..3, false)),
959            ]))),
960        );
961        parse(
962            single,
963            r"\s",
964            "",
965            Node::Class(ClassKind::Perl(PerlClass {
966                kind: PerlClassKind::Space,
967                negated: false,
968            })),
969        );
970        parse(
971            single,
972            r"[a-fA-F] ",
973            " ",
974            Node::Class(ClassKind::Bracketed(BracketedClass {
975                items: vec![
976                    BracketedClassItem::Range(lit(b'a', 1..2, false), lit(b'f', 3..4, false)),
977                    BracketedClassItem::Range(lit(b'A', 4..5, false), lit(b'F', 6..7, false)),
978                ],
979                negated: false,
980            })),
981        );
982        parse(
983            single,
984            r"\xFFa",
985            "a",
986            Node::Literal(lit(b'\xFF', 0..4, false)),
987        );
988        parse(single, r"]a", "a", Node::Literal(lit(b']', 0..1, false)));
989
990        parse_err(single, "");
991        parse_err(single, "(");
992        parse_err(single, ")");
993        parse_err(single, "[");
994        parse_err(single, "|");
995        parse_err(single, "$");
996        parse_err(single, "^");
997        parse_err(single, "+");
998        parse_err(single, "*");
999        parse_err(single, "?");
1000        parse_err(single, "(a");
1001    }
1002
1003    #[test]
1004    fn test_perl_class() {
1005        parse(
1006            perl_class,
1007            r"\w ",
1008            " ",
1009            PerlClass {
1010                kind: PerlClassKind::Word,
1011                negated: false,
1012            },
1013        );
1014        parse(
1015            perl_class,
1016            r"\Wa",
1017            "a",
1018            PerlClass {
1019                kind: PerlClassKind::Word,
1020                negated: true,
1021            },
1022        );
1023        parse(
1024            perl_class,
1025            r"\s",
1026            "",
1027            PerlClass {
1028                kind: PerlClassKind::Space,
1029                negated: false,
1030            },
1031        );
1032        parse(
1033            perl_class,
1034            r"\S\",
1035            "\\",
1036            PerlClass {
1037                kind: PerlClassKind::Space,
1038                negated: true,
1039            },
1040        );
1041        parse(
1042            perl_class,
1043            r"\d",
1044            "",
1045            PerlClass {
1046                kind: PerlClassKind::Digit,
1047                negated: false,
1048            },
1049        );
1050        parse(
1051            perl_class,
1052            r"\Da",
1053            "a",
1054            PerlClass {
1055                kind: PerlClassKind::Digit,
1056                negated: true,
1057            },
1058        );
1059
1060        parse_err(perl_class, "");
1061        parse_err(perl_class, "\\");
1062        parse_err(perl_class, "\\k");
1063    }
1064
1065    #[test]
1066    fn test_bracketed_class() {
1067        parse(
1068            bracketed_class,
1069            "[a]b",
1070            "b",
1071            BracketedClass {
1072                items: vec![BracketedClassItem::Literal(lit(b'a', 1..2, false))],
1073                negated: false,
1074            },
1075        );
1076        parse(
1077            bracketed_class,
1078            "[^a-z_\\S0-9]",
1079            "",
1080            BracketedClass {
1081                items: vec![
1082                    BracketedClassItem::Range(lit(b'a', 2..3, false), lit(b'z', 4..5, false)),
1083                    BracketedClassItem::Literal(lit(b'_', 5..6, false)),
1084                    BracketedClassItem::Perl(PerlClass {
1085                        kind: PerlClassKind::Space,
1086                        negated: true,
1087                    }),
1088                    BracketedClassItem::Range(lit(b'0', 8..9, false), lit(b'9', 10..11, false)),
1089                ],
1090                negated: true,
1091            },
1092        );
1093        parse(
1094            bracketed_class,
1095            "[]\\j]",
1096            "",
1097            BracketedClass {
1098                items: vec![
1099                    BracketedClassItem::Literal(lit(b'j', 2..4, true)),
1100                    BracketedClassItem::Literal(lit(b']', 1..2, false)),
1101                ],
1102                negated: false,
1103            },
1104        );
1105        parse(
1106            bracketed_class,
1107            "[]]",
1108            "",
1109            BracketedClass {
1110                items: vec![BracketedClassItem::Literal(lit(b']', 1..2, false))],
1111                negated: false,
1112            },
1113        );
1114        parse(
1115            bracketed_class,
1116            "[^]]",
1117            "",
1118            BracketedClass {
1119                items: vec![BracketedClassItem::Literal(lit(b']', 2..3, false))],
1120                negated: true,
1121            },
1122        );
1123        parse(
1124            bracketed_class,
1125            "[^a\\]b-]",
1126            "",
1127            BracketedClass {
1128                items: vec![
1129                    BracketedClassItem::Literal(lit(b'a', 2..3, false)),
1130                    BracketedClassItem::Literal(lit(b']', 3..5, true)),
1131                    BracketedClassItem::Literal(lit(b'b', 5..6, false)),
1132                    BracketedClassItem::Literal(lit(b'-', 6..7, false)),
1133                ],
1134                negated: true,
1135            },
1136        );
1137
1138        parse_err(bracketed_class, "[");
1139        parse_err(bracketed_class, "[]");
1140        parse_err(bracketed_class, "[^]");
1141        parse_err(bracketed_class, "[é]");
1142        parse_err(bracketed_class, "[\\]");
1143        parse_err(bracketed_class, "[\\x]");
1144        parse_err(bracketed_class, "[\\x0]");
1145        parse_err(bracketed_class, "[\\é]");
1146    }
1147
1148    #[test]
1149    fn test_bracketed_class_item() {
1150        parse(
1151            bracketed_class_item,
1152            "\\sw",
1153            "w",
1154            BracketedClassItem::Perl(PerlClass {
1155                kind: PerlClassKind::Space,
1156                negated: false,
1157            }),
1158        );
1159        parse(
1160            bracketed_class_item,
1161            "\\c-z]",
1162            "]",
1163            BracketedClassItem::Range(lit(b'c', 0..2, true), lit(b'z', 3..4, false)),
1164        );
1165
1166        parse_err(bracketed_class_item, "é");
1167    }
1168
1169    #[test]
1170    fn test_bracketed_class_range_or_literal() {
1171        parse(
1172            bracketed_class_range_or_literal,
1173            "ab",
1174            "b",
1175            BracketedClassItem::Literal(lit(b'a', 0..1, false)),
1176        );
1177        parse(
1178            bracketed_class_range_or_literal,
1179            r"\x01-",
1180            "-",
1181            BracketedClassItem::Literal(lit(b'\x01', 0..4, false)),
1182        );
1183        parse(
1184            bracketed_class_range_or_literal,
1185            "-\\]",
1186            "\\]",
1187            BracketedClassItem::Literal(lit(b'-', 0..1, false)),
1188        );
1189        parse(
1190            bracketed_class_range_or_literal,
1191            "A-]",
1192            "-]",
1193            BracketedClassItem::Literal(lit(b'A', 0..1, false)),
1194        );
1195
1196        parse(
1197            bracketed_class_range_or_literal,
1198            "a-\\sb",
1199            "b",
1200            BracketedClassItem::Range(lit(b'a', 0..1, false), lit(b's', 2..4, true)),
1201        );
1202        parse(
1203            bracketed_class_range_or_literal,
1204            "!--",
1205            "",
1206            BracketedClassItem::Range(lit(b'!', 0..1, false), lit(b'-', 2..3, false)),
1207        );
1208        parse(
1209            bracketed_class_range_or_literal,
1210            "---",
1211            "",
1212            BracketedClassItem::Range(lit(b'-', 0..1, false), lit(b'-', 2..3, false)),
1213        );
1214        parse(
1215            bracketed_class_range_or_literal,
1216            r"\n-\n",
1217            "",
1218            BracketedClassItem::Range(lit(b'\n', 0..2, false), lit(b'\n', 3..5, false)),
1219        );
1220        parse(
1221            bracketed_class_range_or_literal,
1222            r"\x01-\xFE",
1223            "",
1224            BracketedClassItem::Range(lit(b'\x01', 0..4, false), lit(b'\xFE', 5..9, false)),
1225        );
1226
1227        parse_err(bracketed_class_range_or_literal, "é");
1228        parse_err(bracketed_class_range_or_literal, "b-a");
1229        parse_err(bracketed_class_range_or_literal, "é-a");
1230        parse_err(bracketed_class_range_or_literal, "a-é");
1231        parse_err(bracketed_class_range_or_literal, "]-a");
1232    }
1233
1234    #[test]
1235    fn test_bracketed_class_literal() {
1236        parse(bracketed_class_literal, "ab", "b", lit(b'a', 0..1, false));
1237        parse(
1238            bracketed_class_literal,
1239            "\\nb",
1240            "b",
1241            lit(b'\n', 0..2, false),
1242        );
1243        parse(bracketed_class_literal, "\\]", "", lit(b']', 0..2, true));
1244
1245        parse_err(bracketed_class_literal, "]b");
1246        parse_err(bracketed_class_literal, "é");
1247        parse_err(bracketed_class_literal, "\\x1");
1248        parse_err(bracketed_class_literal, "\\é");
1249    }
1250
1251    #[test]
1252    fn test_bracketed_class_char() {
1253        parse(bracketed_class_char, "ab", "b", lit(b'a', 0..1, false));
1254
1255        parse_err(bracketed_class_char, "]b");
1256        parse_err(bracketed_class_char, "é");
1257    }
1258
1259    #[test]
1260    fn test_literal() {
1261        parse(literal, "ab", "b", Node::Literal(lit(b'a', 0..1, false)));
1262        parse(literal, "]", "", Node::Literal(lit(b']', 0..1, false)));
1263
1264        parse(
1265            literal,
1266            "éb",
1267            "b",
1268            Node::Char(LiteralChar {
1269                c: 'é',
1270                span: 0..2,
1271                escaped: false,
1272            }),
1273        );
1274    }
1275
1276    #[test]
1277    fn test_escaped_char() {
1278        parse(
1279            escaped_char,
1280            "\\na",
1281            "a",
1282            Node::Literal(lit(b'\n', 0..2, false)),
1283        );
1284        parse(
1285            escaped_char,
1286            "\\ta",
1287            "a",
1288            Node::Literal(lit(b'\t', 0..2, false)),
1289        );
1290        parse(
1291            escaped_char,
1292            "\\ra",
1293            "a",
1294            Node::Literal(lit(b'\r', 0..2, false)),
1295        );
1296        parse(
1297            escaped_char,
1298            "\\fa",
1299            "a",
1300            Node::Literal(lit(b'\x0C', 0..2, false)),
1301        );
1302        parse(
1303            escaped_char,
1304            "\\aa",
1305            "a",
1306            Node::Literal(lit(b'\x07', 0..2, false)),
1307        );
1308        parse(
1309            escaped_char,
1310            "\\x00a",
1311            "a",
1312            Node::Literal(lit(b'\0', 0..4, false)),
1313        );
1314        parse(
1315            escaped_char,
1316            "\\xAF a",
1317            " a",
1318            Node::Literal(lit(b'\xAF', 0..4, false)),
1319        );
1320        parse(
1321            escaped_char,
1322            "\\k",
1323            "",
1324            Node::Literal(lit(b'k', 0..2, true)),
1325        );
1326        parse(
1327            escaped_char,
1328            "\\é_",
1329            "_",
1330            Node::Char(LiteralChar {
1331                c: 'é',
1332                span: 0..3,
1333                escaped: true,
1334            }),
1335        );
1336
1337        parse_err(escaped_char, "\\");
1338        parse_err(escaped_char, "\\x");
1339        parse_err(escaped_char, "\\x2");
1340        parse_err(escaped_char, "\\x2G");
1341    }
1342
1343    #[test]
1344    fn test_escaped_char_only_ascii() {
1345        parse(
1346            escaped_char_only_ascii,
1347            "\\na",
1348            "a",
1349            lit(b'\n', 0..2, false),
1350        );
1351        parse(
1352            escaped_char_only_ascii,
1353            "\\ta",
1354            "a",
1355            lit(b'\t', 0..2, false),
1356        );
1357        parse(
1358            escaped_char_only_ascii,
1359            "\\ra",
1360            "a",
1361            lit(b'\r', 0..2, false),
1362        );
1363        parse(
1364            escaped_char_only_ascii,
1365            "\\fa",
1366            "a",
1367            lit(b'\x0C', 0..2, false),
1368        );
1369        parse(
1370            escaped_char_only_ascii,
1371            "\\aa",
1372            "a",
1373            lit(b'\x07', 0..2, false),
1374        );
1375        parse(
1376            escaped_char_only_ascii,
1377            "\\x00a",
1378            "a",
1379            lit(b'\0', 0..4, false),
1380        );
1381        parse(
1382            escaped_char_only_ascii,
1383            "\\xAF a",
1384            " a",
1385            lit(b'\xAF', 0..4, false),
1386        );
1387        parse(escaped_char_only_ascii, "\\k", "", lit(b'k', 0..2, true));
1388
1389        parse_err(escaped_char_only_ascii, "\\");
1390        parse_err(escaped_char_only_ascii, "\\é");
1391        parse_err(escaped_char_only_ascii, "\\x");
1392        parse_err(escaped_char_only_ascii, "\\x2");
1393        parse_err(escaped_char_only_ascii, "\\x2G");
1394    }
1395
1396    #[test]
1397    fn test_range_repetition() {
1398        parse(
1399            range_repetition,
1400            "{0} ?a",
1401            " ?a",
1402            (RepetitionRange::Exactly(0), true),
1403        );
1404        parse(
1405            range_repetition,
1406            "{5}?a",
1407            "a",
1408            (RepetitionRange::Exactly(5), false),
1409        );
1410
1411        parse(
1412            range_repetition,
1413            "{5,15} a",
1414            " a",
1415            (RepetitionRange::Bounded(5, 15), true),
1416        );
1417        parse(
1418            range_repetition,
1419            "{5,}?a",
1420            "a",
1421            (RepetitionRange::AtLeast(5), false),
1422        );
1423
1424        parse_err(range_repetition, "{}?");
1425    }
1426
1427    #[test]
1428    fn test_range_single() {
1429        parse(range_single, "{0}a", "a", RepetitionRange::Exactly(0));
1430        parse(range_single, "{350} a", " a", RepetitionRange::Exactly(350));
1431
1432        parse_err(range_single, "{");
1433        parse_err(range_single, "{}");
1434        parse_err(range_single, "{-5}");
1435    }
1436
1437    #[test]
1438    fn test_range_multi() {
1439        parse(range_multi, "{,5}a", "a", RepetitionRange::Bounded(0, 5));
1440        parse(range_multi, "{5,}a", "a", RepetitionRange::AtLeast(5));
1441        parse(range_multi, "{5,10}a", "a", RepetitionRange::Bounded(5, 10));
1442        parse(range_multi, "{0,0} a", " a", RepetitionRange::Bounded(0, 0));
1443        parse(range_multi, "{,}", "", RepetitionRange::AtLeast(0));
1444        parse(range_multi, "{, }", "", RepetitionRange::AtLeast(0));
1445        parse(range_multi, "{ ,}", "", RepetitionRange::AtLeast(0));
1446        parse(range_multi, "{ , }", "", RepetitionRange::AtLeast(0));
1447        parse(range_multi, "{2 , }", "", RepetitionRange::AtLeast(2));
1448        parse(range_multi, "{ , 2}", "", RepetitionRange::Bounded(0, 2));
1449        parse(range_multi, "{1 , 2}", "", RepetitionRange::Bounded(1, 2));
1450
1451        parse_err(range_multi, "{");
1452        parse_err(range_multi, "{,5");
1453        parse_err(range_multi, "{,-5}");
1454        parse_err(range_multi, "{-5,}");
1455        parse_err(range_multi, "{10,5}");
1456        parse_err(range_multi, "{ 1,5}");
1457        parse_err(range_multi, "{1,5 }");
1458    }
1459
1460    #[test]
1461    fn test_parse_u32() {
1462        parse(parse_u32, "5a", "a", 5_u32);
1463
1464        parse_err(parse_u32, "a");
1465        parse_err(parse_u32, "-5a");
1466        parse_err(parse_u32, "5000000000000");
1467    }
1468
1469    #[test]
1470    fn test_parse_opt_u32() {
1471        parse(parse_opt_u32, "a", "a", None);
1472        parse(parse_opt_u32, "5a", "a", Some(5));
1473        parse(parse_opt_u32, "-5a", "-5a", None);
1474
1475        parse_err(parse_opt_u32, "5000000000000");
1476    }
1477
1478    #[test]
1479    fn test_stack_overflow() {
1480        // Parsing of a regex includes recursion, so it must be protected against
1481        // stack overflowing.
1482        let mut v = String::new();
1483        v.push('/');
1484        for _ in 0..1_000 {
1485            v.push_str("a(b");
1486        }
1487        for _ in 0..1_000 {
1488            v.push_str(")c");
1489        }
1490        v.push('/');
1491
1492        parse_err_type(regex, &v, &Error::new(30..30, ErrorKind::RegexTooDeep));
1493
1494        // counter should reset, so many imbricated groups, but all below the limit should be fine.
1495        let mut v = String::new();
1496        v.push('/');
1497        let nb = MAX_REGEX_RECURSION - 1;
1498        for _ in 0..nb {
1499            v.push_str("a(b");
1500        }
1501        for _ in 0..nb {
1502            v.push_str(")c");
1503        }
1504        v.push('d');
1505        for _ in 0..nb {
1506            v.push_str("e(f");
1507        }
1508        for _ in 0..nb {
1509            v.push_str(")h");
1510        }
1511        v.push('/');
1512
1513        let input = Input::new(&v);
1514        let _res = regex(input).unwrap();
1515        assert_eq!(input.inner_recursion_counter, 0);
1516    }
1517
1518    #[test]
1519    fn test_parse_regex() {
1520        assert!(parse_regex(r"/a{2}/").is_ok());
1521        assert!(parse_regex(r"a{2}/").is_err());
1522    }
1523
1524    #[test]
1525    fn test_public_types() {
1526        test_public_type(regex(Input::new(r"/a{2}[az]\b\s|.+$/")).unwrap());
1527    }
1528}