boreal_parser/
regex.rs

1//! AST elements related to YARA regexes.
2use std::ops::Range;
3
4use nom::branch::alt;
5use nom::bytes::complete::{tag, take};
6use nom::character::complete::{anychar, char, digit0, digit1, multispace0, none_of};
7use nom::combinator::{cut, map, opt};
8use nom::multi::many0;
9use nom::sequence::{delimited, separated_pair, terminated};
10use nom::Parser;
11
12use crate::error::ErrorKind;
13
14use super::error::Error;
15use super::nom_recipes::rtrim;
16use super::types::{Input, ParseResult};
17
18/// A regular expression.
19#[derive(Clone, Debug, PartialEq, Eq)]
20pub struct Regex {
21    /// The AST of the regular expression parsed inside the `/` delimiters.
22    pub ast: Node,
23    /// case insensitive (`i` flag).
24    pub case_insensitive: bool,
25    /// `.` matches `\n` (`s` flag).
26    pub dot_all: bool,
27
28    /// The span of the regex expression
29    pub span: Range<usize>,
30}
31
32/// AST node of a regular expression.
33#[derive(Clone, Debug, PartialEq, Eq)]
34pub enum Node {
35    /// Alternation of nodes, ie `node|node|...`.
36    Alternation(Vec<Node>),
37
38    /// Zero-width assertion, e.g. ^, \b, ...
39    Assertion(AssertionKind),
40
41    /// Set of allowed values for a single byte.
42    Class(ClassKind),
43
44    /// Concatenation, must match in order.
45    Concat(Vec<Node>),
46
47    /// The special `.` character.
48    Dot,
49
50    /// Empty expression.
51    Empty,
52
53    /// Literal byte.
54    Literal(Literal),
55
56    /// Literal char, not ascii.
57    Char(LiteralChar),
58
59    /// A group, i.e. (...).
60    Group(Box<Node>),
61
62    /// Repetition of an expression.
63    Repetition {
64        /// Expression to repeat.
65        node: Box<Node>,
66
67        /// Kind of repetition.
68        kind: RepetitionKind,
69
70        /// Is the repetition greedy or not.
71        greedy: bool,
72    },
73}
74
75/// Regex class.
76#[derive(Clone, Debug, PartialEq, Eq)]
77pub enum ClassKind {
78    /// Perl style class, e.g. `\w`, `\d`.
79    Perl(PerlClass),
80    /// Bracket class, i.e. `[...]`.
81    Bracketed(BracketedClass),
82}
83
84/// PERL style class.
85#[derive(Clone, Debug, PartialEq, Eq)]
86pub struct PerlClass {
87    /// Class kind.
88    pub kind: PerlClassKind,
89    /// Is the class negated.
90    pub negated: bool,
91}
92
93/// Kind of PERL style class.
94#[derive(Clone, Debug, PartialEq, Eq)]
95pub enum PerlClassKind {
96    /// Word class, i.e. `[a-zA-Z0-9_]`.
97    Word,
98    /// Space class, i.e. `[\t\n\v\f\r ]`.
99    Space,
100    /// Digit class, i.e. `[0-9]`.
101    Digit,
102}
103
104/// Class expressed in brackets.
105#[derive(Clone, Debug, PartialEq, Eq)]
106pub struct BracketedClass {
107    /// List of items in the class.
108    pub items: Vec<BracketedClassItem>,
109    /// Is the class negated.
110    pub negated: bool,
111}
112
113/// Item in a bracketed class.
114#[derive(Clone, Debug, PartialEq, Eq)]
115pub enum BracketedClassItem {
116    /// Perl style class.
117    Perl(PerlClass),
118    /// Literal byte.
119    Literal(Literal),
120    /// Range of bytes.
121    Range(Literal, Literal),
122}
123
124/// Kind of repetition.
125#[derive(Clone, Debug, PartialEq, Eq)]
126pub enum RepetitionKind {
127    /// zero or one, i.e. `?`
128    ZeroOrOne,
129    /// zero or more, i.e. `*`
130    ZeroOrMore,
131    /// one or more, i.e. `+`
132    OneOrMore,
133    /// Range, i.e. `{N}`, `{N,M}`, etc.
134    Range(RepetitionRange),
135}
136
137/// Repetition range.
138#[derive(Clone, Debug, PartialEq, Eq)]
139pub enum RepetitionRange {
140    /// Exactly the given number, i.e. `{N}`.
141    Exactly(u32),
142    /// At least the given number, i.e. `{N,}`.
143    AtLeast(u32),
144    /// Between two numbers, i.e. `{N,M}`.
145    Bounded(u32, u32),
146}
147
148/// Kind of zero-width assertion.
149#[derive(Clone, Debug, PartialEq, Eq)]
150pub enum AssertionKind {
151    /// Start of the line, i.e. `^`.
152    StartLine,
153    /// End of the line, i.e. `$`.
154    EndLine,
155    /// Word boundary, i.e. `\b`.
156    WordBoundary,
157    /// Non word boundary, i.e. `\B`.
158    NonWordBoundary,
159}
160
161/// Literal unicode character.
162#[derive(Clone, Debug, PartialEq, Eq)]
163pub struct LiteralChar {
164    /// The unicode character.
165    pub c: char,
166
167    /// Position in the input for this char.
168    pub span: Range<usize>,
169
170    /// Was the character escaped.
171    ///
172    /// See `Literal::escaped` for more details on what this means.
173    pub escaped: bool,
174}
175
176/// Literal byte
177#[derive(Clone, Debug, PartialEq, Eq)]
178pub struct Literal {
179    /// Byte value.
180    pub byte: u8,
181
182    /// Span of the literal
183    pub span: Range<usize>,
184
185    /// Was the byte escaped.
186    ///
187    /// This is for example true for '\[' or '\.', but not for '\n' or '\xAB'.
188    pub escaped: bool,
189}
190
191impl PartialOrd for Literal {
192    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
193        self.byte.partial_cmp(&other.byte)
194    }
195}
196
197/// Parse a regex.
198///
199/// The input is expected to look like `/<regex>/<modifiers>`.
200///
201/// # Errors
202///
203/// Returns an error if the parsing fails.
204pub fn parse_regex(input: &str) -> Result<Regex, Error> {
205    use nom::Finish;
206
207    let input = Input::new(input);
208    let (_, res) = regex(input).finish()?;
209
210    Ok(res)
211}
212
213/// Parse a regular expression.
214///
215/// Similar to the _REGEX_ lexical pattern in libyara. but the parsing of the AST is done
216/// directly.
217///
218/// XXX: There is change of behavior from libyara. `\<nul_byte>` was forbidden,
219/// but we do not have an issue about this (we do not save the regular expression
220/// as a C string). See [Issue #576 in Yara](https://github.com/VirusTotal/yara/issues/576).
221pub(crate) fn regex(input: Input) -> ParseResult<Regex> {
222    let start = input.pos();
223    let (input, _) = char('/').parse(input)?;
224
225    // We cannot use escaped_transform, as it is not an error to use
226    // the control character with any char other than `/`.
227    let (input, ast) = cut(terminated(alternative, char('/'))).parse(input)?;
228    let (input, (no_case, dot_all)) = rtrim((opt(char('i')), opt(char('s')))).parse(input)?;
229
230    Ok((
231        input,
232        Regex {
233            ast,
234            case_insensitive: no_case.is_some(),
235            dot_all: dot_all.is_some(),
236            span: input.get_span_from(start),
237        },
238    ))
239}
240
241fn alternative(mut input: Input) -> ParseResult<Node> {
242    // This combinator is recursive:
243    //
244    // tokens => hex_token => alternatives => tokens
245    //
246    // Use the string recursive counter to make sure this recursion cannot grow too much.
247    if input.string_recursion_counter >= input.params.string_recursion_limit {
248        return Err(nom::Err::Failure(Error::new(
249            input.get_span_from(input.pos()),
250            ErrorKind::RegexTooDeep,
251        )));
252    }
253
254    let mut alts = Vec::new();
255    loop {
256        input.string_recursion_counter += 1;
257        let (mut input2, node) = concatenation(input)?;
258        input2.string_recursion_counter -= 1;
259
260        let (input2, has_alt_char) = eat_opt_char('|', input2);
261        if has_alt_char {
262            alts.push(node);
263            input = input2;
264            continue;
265        }
266
267        return Ok((
268            input2,
269            if alts.is_empty() {
270                node
271            } else {
272                alts.push(node);
273                Node::Alternation(alts)
274            },
275        ));
276    }
277}
278
279fn eat_opt_char(c: char, mut input: Input) -> (Input, bool) {
280    match input.cursor().chars().next() {
281        Some(c2) if c2 == c => {
282            input.advance(c.len_utf8());
283            (input, true)
284        }
285        _ => (input, false),
286    }
287}
288
289fn concatenation(input: Input) -> ParseResult<Node> {
290    let (input, mut nodes) = many0(repeat).parse(input)?;
291
292    let node = if nodes.is_empty() {
293        Node::Empty
294    } else if nodes.len() == 1 {
295        nodes.pop().unwrap()
296    } else {
297        Node::Concat(nodes)
298    };
299
300    Ok((input, node))
301}
302
303fn repeat(input: Input) -> ParseResult<Node> {
304    // First, parse assertions
305    if let Ok((input, node)) = assertion(input) {
306        return Ok((input, node));
307    }
308
309    // Otherwise, parse single node with optional repetition
310    let (input, node) = single(input)?;
311    let (input, repetition) = opt(repetition).parse(input)?;
312    match repetition {
313        Some((kind, greedy)) => Ok((
314            input,
315            Node::Repetition {
316                node: Box::new(node),
317                kind,
318                greedy,
319            },
320        )),
321        None => Ok((input, node)),
322    }
323}
324
325// Parse node that contains a repetition, or nodes that cannot be repeated
326fn assertion(input: Input) -> ParseResult<Node> {
327    alt((
328        map(tag(r"\b"), |_| Node::Assertion(AssertionKind::WordBoundary)),
329        map(tag(r"\B"), |_| {
330            Node::Assertion(AssertionKind::NonWordBoundary)
331        }),
332        map(char('^'), |_| Node::Assertion(AssertionKind::StartLine)),
333        map(char('$'), |_| Node::Assertion(AssertionKind::EndLine)),
334    ))
335    .parse(input)
336}
337
338fn repetition(input: Input) -> ParseResult<(RepetitionKind, bool)> {
339    alt((
340        map(tag("*?"), |_| (RepetitionKind::ZeroOrMore, false)),
341        map(tag("+?"), |_| (RepetitionKind::OneOrMore, false)),
342        map(tag("??"), |_| (RepetitionKind::ZeroOrOne, false)),
343        map(tag("*"), |_| (RepetitionKind::ZeroOrMore, true)),
344        map(tag("+"), |_| (RepetitionKind::OneOrMore, true)),
345        map(tag("?"), |_| (RepetitionKind::ZeroOrOne, true)),
346        map(range_repetition, |(kind, greedy)| {
347            (RepetitionKind::Range(kind), greedy)
348        }),
349    ))
350    .parse(input)
351}
352
353fn single(input: Input) -> ParseResult<Node> {
354    alt((
355        map(delimited(char('('), alternative, char(')')), |node| {
356            Node::Group(Box::new(node))
357        }),
358        map(char('.'), |_| Node::Dot),
359        map(perl_class, |p| Node::Class(ClassKind::Perl(p))),
360        map(bracketed_class, |p| Node::Class(ClassKind::Bracketed(p))),
361        escaped_char,
362        literal,
363    ))
364    .parse(input)
365}
366
367fn perl_class(input: Input) -> ParseResult<PerlClass> {
368    alt((
369        map(tag(r"\w"), |_| PerlClass {
370            kind: PerlClassKind::Word,
371            negated: false,
372        }),
373        map(tag(r"\W"), |_| PerlClass {
374            kind: PerlClassKind::Word,
375            negated: true,
376        }),
377        map(tag(r"\s"), |_| PerlClass {
378            kind: PerlClassKind::Space,
379            negated: false,
380        }),
381        map(tag(r"\S"), |_| PerlClass {
382            kind: PerlClassKind::Space,
383            negated: true,
384        }),
385        map(tag(r"\d"), |_| PerlClass {
386            kind: PerlClassKind::Digit,
387            negated: false,
388        }),
389        map(tag(r"\D"), |_| PerlClass {
390            kind: PerlClassKind::Digit,
391            negated: true,
392        }),
393    ))
394    .parse(input)
395}
396
397fn bracketed_class(input: Input) -> ParseResult<BracketedClass> {
398    let (input, _) = char('[').parse(input)?;
399    // As soon as we parse a '[', we are in class mode, hence the cut if parsing fails.
400    cut(bracketed_class_inner).parse(input)
401}
402
403fn bracketed_class_inner(input: Input) -> ParseResult<BracketedClass> {
404    let (input, negated) = eat_opt_char('^', input);
405    let start = input.pos();
406    let (input2, contains_closing_bracket) = eat_opt_char(']', input);
407
408    let (input, mut items) = many0(bracketed_class_item).parse(input2)?;
409    let (input, _) = char(']').parse(input)?;
410
411    if contains_closing_bracket {
412        items.push(BracketedClassItem::Literal(Literal {
413            byte: b']',
414            span: input2.get_span_from_no_rtrim(start),
415            escaped: false,
416        }));
417    }
418    Ok((input, BracketedClass { items, negated }))
419}
420
421fn bracketed_class_item(input: Input) -> ParseResult<BracketedClassItem> {
422    alt((
423        map(perl_class, BracketedClassItem::Perl),
424        bracketed_class_range_or_literal,
425    ))
426    .parse(input)
427}
428
429fn bracketed_class_range_or_literal(input: Input) -> ParseResult<BracketedClassItem> {
430    let start = input.pos();
431    let (input, lit) = bracketed_class_literal(input)?;
432    let (input2, has_dash) = eat_opt_char('-', input);
433
434    if has_dash {
435        let (input3, lit2) = opt(bracketed_class_literal).parse(input2)?;
436        match lit2 {
437            Some(lit2) if lit2 < lit => Err(nom::Err::Failure(Error::new(
438                input3.get_span_from_no_rtrim(start),
439                ErrorKind::RegexClassRangeInvalid,
440            ))),
441            Some(lit2) => Ok((input3, BracketedClassItem::Range(lit, lit2))),
442            None => Ok((input, BracketedClassItem::Literal(lit))),
443        }
444    } else {
445        Ok((input, BracketedClassItem::Literal(lit)))
446    }
447}
448
449fn bracketed_class_literal(input: Input) -> ParseResult<Literal> {
450    alt((escaped_char_only_ascii, bracketed_class_char)).parse(input)
451}
452
453fn bracketed_class_char(input: Input) -> ParseResult<Literal> {
454    let start = input.pos();
455
456    // / and \n are disallowed because of the surrounding rule (we are parsing a /.../ variable,
457    // and newlines are not allowed
458    // ] is disallowed because it indicates the end of the class
459    let (input, b) = none_of("/\n]").parse(input)?;
460    if b.is_ascii() {
461        Ok((
462            input,
463            Literal {
464                byte: b as u8,
465                span: input.get_span_from_no_rtrim(start),
466                escaped: false,
467            },
468        ))
469    } else {
470        Err(nom::Err::Failure(Error::new(
471            input.get_span_from_no_rtrim(start),
472            ErrorKind::RegexNonAsciiByte,
473        )))
474    }
475}
476
477fn literal(input: Input) -> ParseResult<Node> {
478    let start = input.pos();
479
480    // / and \n are disallowed because of the surrounding rule (we are parsing a /.../ variable,
481    // and newlines are not allowed
482    // rest is disallowed because they have specific meaning.
483    let (input, c) = none_of("/\n()[\\|.$^+*?").parse(input)?;
484    let node = if c.is_ascii() {
485        Node::Literal(Literal {
486            byte: c as u8,
487            span: input.get_span_from_no_rtrim(start),
488            escaped: false,
489        })
490    } else {
491        Node::Char(LiteralChar {
492            c,
493            span: input.get_span_from_no_rtrim(start),
494            escaped: false,
495        })
496    };
497
498    Ok((input, node))
499}
500
501fn escaped_char(input: Input) -> ParseResult<Node> {
502    let (input, res) = escaped_char_inner(input)?;
503
504    let node = match res {
505        Escaped {
506            kind: EscapedKind::Byte(byte),
507            span,
508            escaped,
509        } => Node::Literal(Literal {
510            byte,
511            span,
512            escaped,
513        }),
514        Escaped {
515            kind: EscapedKind::Char(c),
516            span,
517            escaped,
518        } => Node::Char(LiteralChar { c, span, escaped }),
519    };
520
521    Ok((input, node))
522}
523
524fn escaped_char_only_ascii(input: Input) -> ParseResult<Literal> {
525    let (input, res) = escaped_char_inner(input)?;
526
527    match res {
528        Escaped {
529            kind: EscapedKind::Byte(byte),
530            span,
531            escaped,
532        } => Ok((
533            input,
534            Literal {
535                byte,
536                span,
537                escaped,
538            },
539        )),
540        Escaped {
541            kind: EscapedKind::Char(_),
542            span,
543            ..
544        } => Err(nom::Err::Failure(Error::new(
545            span,
546            ErrorKind::RegexNonAsciiByte,
547        ))),
548    }
549}
550
551fn escaped_char_inner(input: Input) -> ParseResult<Escaped> {
552    let start = input.pos();
553    let (input2, _) = char('\\').parse(input)?;
554    let (input, b) = anychar(input2)?;
555
556    let span = input.get_span_from_no_rtrim(start);
557    let (kind, escaped) = match b {
558        'n' => (EscapedKind::Byte(b'\n'), false),
559        't' => (EscapedKind::Byte(b'\t'), false),
560        'r' => (EscapedKind::Byte(b'\r'), false),
561        'f' => (EscapedKind::Byte(b'\x0C'), false),
562        'a' => (EscapedKind::Byte(b'\x07'), false),
563        'x' => {
564            let (input, n) = cut(take(2_u32)).parse(input)?;
565
566            let n = match u8::from_str_radix(&n, 16) {
567                Ok(n) => n,
568                Err(e) => {
569                    return Err(nom::Err::Failure(Error::new(
570                        input.get_span_from_no_rtrim(start),
571                        ErrorKind::StrToHexIntError(e),
572                    )));
573                }
574            };
575            return Ok((
576                input,
577                Escaped {
578                    kind: EscapedKind::Byte(n),
579                    span: input.get_span_from_no_rtrim(start),
580                    escaped: false,
581                },
582            ));
583        }
584        c if c.is_ascii() => (EscapedKind::Byte(c as u8), true),
585        c => (EscapedKind::Char(c), true),
586    };
587
588    Ok((
589        input,
590        Escaped {
591            kind,
592            span,
593            escaped,
594        },
595    ))
596}
597
598struct Escaped {
599    kind: EscapedKind,
600    span: Range<usize>,
601    escaped: bool,
602}
603
604#[allow(variant_size_differences)]
605enum EscapedKind {
606    Byte(u8),
607    Char(char),
608}
609
610fn range_repetition(input: Input) -> ParseResult<(RepetitionRange, bool)> {
611    let (input, range) = alt((range_single, range_multi)).parse(input)?;
612    let (input, non_greedy) = eat_opt_char('?', input);
613
614    Ok((input, (range, !non_greedy)))
615}
616
617fn range_single(input: Input) -> ParseResult<RepetitionRange> {
618    let (input, v) = delimited(char('{'), parse_u32, char('}')).parse(input)?;
619
620    Ok((input, RepetitionRange::Exactly(v)))
621}
622
623fn range_multi(input: Input) -> ParseResult<RepetitionRange> {
624    let start = input.pos();
625    let (input, (from, to)) = delimited(
626        char('{'),
627        separated_pair(
628            parse_opt_u32,
629            delimited(multispace0, char(','), multispace0),
630            parse_opt_u32,
631        ),
632        char('}'),
633    )
634    .parse(input)?;
635
636    let range = match (from, to) {
637        (None, None) => RepetitionRange::AtLeast(0),
638        (Some(from), None) => RepetitionRange::AtLeast(from),
639        (None, Some(to)) => RepetitionRange::Bounded(0, to),
640        (Some(from), Some(to)) if to < from => {
641            return Err(nom::Err::Failure(Error::new(
642                input.get_span_from_no_rtrim(start),
643                ErrorKind::RegexRangeInvalid,
644            )))
645        }
646        (Some(from), Some(to)) => RepetitionRange::Bounded(from, to),
647    };
648
649    Ok((input, range))
650}
651
652fn parse_u32(input: Input) -> ParseResult<u32> {
653    let start = input.pos();
654    let (input, v) = digit1(input)?;
655
656    let n = match str::parse::<u32>(&v) {
657        Ok(n) => n,
658        Err(e) => {
659            return Err(nom::Err::Failure(Error::new(
660                input.get_span_from_no_rtrim(start),
661                ErrorKind::StrToIntError(e),
662            )))
663        }
664    };
665
666    Ok((input, n))
667}
668
669fn parse_opt_u32(input: Input) -> ParseResult<Option<u32>> {
670    let start = input.pos();
671    let (input, v) = match digit0::<_, Error>(input) {
672        Ok((input, s)) if !s.is_empty() => (input, s),
673        _ => return Ok((input, None)),
674    };
675
676    let n = match str::parse::<u32>(&v) {
677        Ok(n) => n,
678        Err(e) => {
679            return Err(nom::Err::Failure(Error::new(
680                input.get_span_from_no_rtrim(start),
681                ErrorKind::StrToIntError(e),
682            )))
683        }
684    };
685
686    Ok((input, Some(n)))
687}
688
689#[cfg(test)]
690mod tests {
691    use crate::test_helpers::{parse, parse_err, test_public_type};
692    use crate::types::Params;
693    use nom::Finish;
694
695    use super::*;
696
697    fn lit(byte: u8, span: Range<usize>, escaped: bool) -> Literal {
698        Literal {
699            byte,
700            span,
701            escaped,
702        }
703    }
704
705    #[test]
706    fn test_parse() {
707        parse(
708            regex,
709            "/a/i",
710            "",
711            Regex {
712                ast: Node::Literal(lit(b'a', 1..2, false)),
713                case_insensitive: true,
714                dot_all: false,
715                span: 0..4,
716            },
717        );
718        parse(
719            regex,
720            "/[^0-9]+/a",
721            "a",
722            Regex {
723                ast: Node::Repetition {
724                    node: Box::new(Node::Class(ClassKind::Bracketed(BracketedClass {
725                        items: vec![BracketedClassItem::Range(
726                            lit(b'0', 3..4, false),
727                            lit(b'9', 5..6, false),
728                        )],
729                        negated: true,
730                    }))),
731                    kind: RepetitionKind::OneOrMore,
732                    greedy: true,
733                },
734                case_insensitive: false,
735                dot_all: false,
736                span: 0..9,
737            },
738        );
739        parse(
740            regex,
741            r"/a\/b\cd/isb",
742            "b",
743            Regex {
744                ast: Node::Concat(vec![
745                    Node::Literal(lit(b'a', 1..2, false)),
746                    Node::Literal(lit(b'/', 2..4, true)),
747                    Node::Literal(lit(b'b', 4..5, false)),
748                    Node::Literal(lit(b'c', 5..7, true)),
749                    Node::Literal(lit(b'd', 7..8, false)),
750                ]),
751                case_insensitive: true,
752                dot_all: true,
753                span: 0..11,
754            },
755        );
756        parse(
757            regex,
758            r"/.{2}/si c",
759            "i c",
760            Regex {
761                ast: Node::Repetition {
762                    node: Box::new(Node::Dot),
763                    kind: RepetitionKind::Range(RepetitionRange::Exactly(2)),
764                    greedy: true,
765                },
766                case_insensitive: false,
767                dot_all: true,
768                span: 0..7,
769            },
770        );
771        parse(
772            regex,
773            "/\0\\\0/ c",
774            "c",
775            Regex {
776                ast: Node::Concat(vec![
777                    Node::Literal(lit(b'\0', 1..2, false)),
778                    Node::Literal(lit(b'\0', 2..4, true)),
779                ]),
780                case_insensitive: false,
781                dot_all: false,
782                span: 0..5,
783            },
784        );
785
786        parse_err(regex, "");
787        parse_err(regex, "/");
788        parse_err(regex, "/\n/");
789        parse_err(regex, "/a{2}");
790        parse_err(regex, "/a///");
791        parse_err(regex, "/a{5,4}/");
792    }
793
794    #[test]
795    fn test_alternative() {
796        parse(alternative, "(", "(", Node::Empty);
797        parse(
798            alternative,
799            "a)",
800            ")",
801            Node::Literal(lit(b'a', 0..1, false)),
802        );
803        parse(
804            alternative,
805            "a|b",
806            "",
807            Node::Alternation(vec![
808                Node::Literal(lit(b'a', 0..1, false)),
809                Node::Literal(lit(b'b', 2..3, false)),
810            ]),
811        );
812        parse(
813            alternative,
814            "a|)",
815            ")",
816            Node::Alternation(vec![Node::Literal(lit(b'a', 0..1, false)), Node::Empty]),
817        );
818
819        parse(
820            alternative,
821            r"ab|.\||\b$|",
822            "",
823            Node::Alternation(vec![
824                Node::Concat(vec![
825                    Node::Literal(lit(b'a', 0..1, false)),
826                    Node::Literal(lit(b'b', 1..2, false)),
827                ]),
828                Node::Concat(vec![Node::Dot, Node::Literal(lit(b'|', 4..6, true))]),
829                Node::Concat(vec![
830                    Node::Assertion(AssertionKind::WordBoundary),
831                    Node::Assertion(AssertionKind::EndLine),
832                ]),
833                Node::Empty,
834            ]),
835        );
836
837        parse_err(alternative, "\\xEG");
838    }
839
840    #[test]
841    fn test_concatenation() {
842        parse(concatenation, "", "", Node::Empty);
843        parse(
844            concatenation,
845            "a",
846            "",
847            Node::Literal(lit(b'a', 0..1, false)),
848        );
849        parse(
850            concatenation,
851            "ab",
852            "",
853            Node::Concat(vec![
854                Node::Literal(lit(b'a', 0..1, false)),
855                Node::Literal(lit(b'b', 1..2, false)),
856            ]),
857        );
858        parse(
859            concatenation,
860            "a$*",
861            "*",
862            Node::Concat(vec![
863                Node::Literal(lit(b'a', 0..1, false)),
864                Node::Assertion(AssertionKind::EndLine),
865            ]),
866        );
867        parse(
868            concatenation,
869            r"^a+\b\d{2,3}[^z]*?)",
870            ")",
871            Node::Concat(vec![
872                Node::Assertion(AssertionKind::StartLine),
873                Node::Repetition {
874                    node: Box::new(Node::Literal(lit(b'a', 1..2, false))),
875                    kind: RepetitionKind::OneOrMore,
876                    greedy: true,
877                },
878                Node::Assertion(AssertionKind::WordBoundary),
879                Node::Repetition {
880                    node: Box::new(Node::Class(ClassKind::Perl(PerlClass {
881                        kind: PerlClassKind::Digit,
882                        negated: false,
883                    }))),
884                    kind: RepetitionKind::Range(RepetitionRange::Bounded(2, 3)),
885                    greedy: true,
886                },
887                Node::Repetition {
888                    node: Box::new(Node::Class(ClassKind::Bracketed(BracketedClass {
889                        items: vec![BracketedClassItem::Literal(lit(b'z', 14..15, false))],
890                        negated: true,
891                    }))),
892                    kind: RepetitionKind::ZeroOrMore,
893                    greedy: false,
894                },
895            ]),
896        );
897
898        parse_err(concatenation, "\\xEG");
899    }
900
901    #[test]
902    fn test_assertion() {
903        parse(
904            assertion,
905            r"\ba",
906            "a",
907            Node::Assertion(AssertionKind::WordBoundary),
908        );
909        parse(
910            assertion,
911            r"\B ",
912            " ",
913            Node::Assertion(AssertionKind::NonWordBoundary),
914        );
915        parse(
916            assertion,
917            r"^^",
918            "^",
919            Node::Assertion(AssertionKind::StartLine),
920        );
921        parse(
922            assertion,
923            r"$^",
924            "^",
925            Node::Assertion(AssertionKind::EndLine),
926        );
927
928        parse_err(assertion, r"\w");
929    }
930
931    #[test]
932    fn test_repetition() {
933        parse(repetition, "*??", "?", (RepetitionKind::ZeroOrMore, false));
934        parse(repetition, "+??", "?", (RepetitionKind::OneOrMore, false));
935        parse(repetition, "???", "?", (RepetitionKind::ZeroOrOne, false));
936        parse(repetition, "*a?", "a?", (RepetitionKind::ZeroOrMore, true));
937        parse(repetition, "+a?", "a?", (RepetitionKind::OneOrMore, true));
938        parse(repetition, "?a?", "a?", (RepetitionKind::ZeroOrOne, true));
939        parse(
940            repetition,
941            "{5}??",
942            "?",
943            (RepetitionKind::Range(RepetitionRange::Exactly(5)), false),
944        );
945
946        parse_err(repetition, "5");
947    }
948
949    #[test]
950    fn test_single() {
951        parse(single, ".a", "a", Node::Dot);
952        parse(single, "()a", "a", Node::Group(Box::new(Node::Empty)));
953        parse(
954            single,
955            "(ab)a",
956            "a",
957            Node::Group(Box::new(Node::Concat(vec![
958                Node::Literal(lit(b'a', 1..2, false)),
959                Node::Literal(lit(b'b', 2..3, false)),
960            ]))),
961        );
962        parse(
963            single,
964            r"\s",
965            "",
966            Node::Class(ClassKind::Perl(PerlClass {
967                kind: PerlClassKind::Space,
968                negated: false,
969            })),
970        );
971        parse(
972            single,
973            r"[a-fA-F] ",
974            " ",
975            Node::Class(ClassKind::Bracketed(BracketedClass {
976                items: vec![
977                    BracketedClassItem::Range(lit(b'a', 1..2, false), lit(b'f', 3..4, false)),
978                    BracketedClassItem::Range(lit(b'A', 4..5, false), lit(b'F', 6..7, false)),
979                ],
980                negated: false,
981            })),
982        );
983        parse(
984            single,
985            r"\xFFa",
986            "a",
987            Node::Literal(lit(b'\xFF', 0..4, false)),
988        );
989        parse(single, r"]a", "a", Node::Literal(lit(b']', 0..1, false)));
990
991        parse_err(single, "");
992        parse_err(single, "(");
993        parse_err(single, ")");
994        parse_err(single, "[");
995        parse_err(single, "|");
996        parse_err(single, "$");
997        parse_err(single, "^");
998        parse_err(single, "+");
999        parse_err(single, "*");
1000        parse_err(single, "?");
1001        parse_err(single, "(a");
1002    }
1003
1004    #[test]
1005    fn test_perl_class() {
1006        parse(
1007            perl_class,
1008            r"\w ",
1009            " ",
1010            PerlClass {
1011                kind: PerlClassKind::Word,
1012                negated: false,
1013            },
1014        );
1015        parse(
1016            perl_class,
1017            r"\Wa",
1018            "a",
1019            PerlClass {
1020                kind: PerlClassKind::Word,
1021                negated: true,
1022            },
1023        );
1024        parse(
1025            perl_class,
1026            r"\s",
1027            "",
1028            PerlClass {
1029                kind: PerlClassKind::Space,
1030                negated: false,
1031            },
1032        );
1033        parse(
1034            perl_class,
1035            r"\S\",
1036            "\\",
1037            PerlClass {
1038                kind: PerlClassKind::Space,
1039                negated: true,
1040            },
1041        );
1042        parse(
1043            perl_class,
1044            r"\d",
1045            "",
1046            PerlClass {
1047                kind: PerlClassKind::Digit,
1048                negated: false,
1049            },
1050        );
1051        parse(
1052            perl_class,
1053            r"\Da",
1054            "a",
1055            PerlClass {
1056                kind: PerlClassKind::Digit,
1057                negated: true,
1058            },
1059        );
1060
1061        parse_err(perl_class, "");
1062        parse_err(perl_class, "\\");
1063        parse_err(perl_class, "\\k");
1064    }
1065
1066    #[test]
1067    fn test_bracketed_class() {
1068        parse(
1069            bracketed_class,
1070            "[a]b",
1071            "b",
1072            BracketedClass {
1073                items: vec![BracketedClassItem::Literal(lit(b'a', 1..2, false))],
1074                negated: false,
1075            },
1076        );
1077        parse(
1078            bracketed_class,
1079            "[^a-z_\\S0-9]",
1080            "",
1081            BracketedClass {
1082                items: vec![
1083                    BracketedClassItem::Range(lit(b'a', 2..3, false), lit(b'z', 4..5, false)),
1084                    BracketedClassItem::Literal(lit(b'_', 5..6, false)),
1085                    BracketedClassItem::Perl(PerlClass {
1086                        kind: PerlClassKind::Space,
1087                        negated: true,
1088                    }),
1089                    BracketedClassItem::Range(lit(b'0', 8..9, false), lit(b'9', 10..11, false)),
1090                ],
1091                negated: true,
1092            },
1093        );
1094        parse(
1095            bracketed_class,
1096            "[]\\j]",
1097            "",
1098            BracketedClass {
1099                items: vec![
1100                    BracketedClassItem::Literal(lit(b'j', 2..4, true)),
1101                    BracketedClassItem::Literal(lit(b']', 1..2, false)),
1102                ],
1103                negated: false,
1104            },
1105        );
1106        parse(
1107            bracketed_class,
1108            "[]]",
1109            "",
1110            BracketedClass {
1111                items: vec![BracketedClassItem::Literal(lit(b']', 1..2, false))],
1112                negated: false,
1113            },
1114        );
1115        parse(
1116            bracketed_class,
1117            "[^]]",
1118            "",
1119            BracketedClass {
1120                items: vec![BracketedClassItem::Literal(lit(b']', 2..3, false))],
1121                negated: true,
1122            },
1123        );
1124        parse(
1125            bracketed_class,
1126            "[^a\\]b-]",
1127            "",
1128            BracketedClass {
1129                items: vec![
1130                    BracketedClassItem::Literal(lit(b'a', 2..3, false)),
1131                    BracketedClassItem::Literal(lit(b']', 3..5, true)),
1132                    BracketedClassItem::Literal(lit(b'b', 5..6, false)),
1133                    BracketedClassItem::Literal(lit(b'-', 6..7, false)),
1134                ],
1135                negated: true,
1136            },
1137        );
1138
1139        parse_err(bracketed_class, "[");
1140        parse_err(bracketed_class, "[]");
1141        parse_err(bracketed_class, "[^]");
1142        parse_err(bracketed_class, "[é]");
1143        parse_err(bracketed_class, "[\\]");
1144        parse_err(bracketed_class, "[\\x]");
1145        parse_err(bracketed_class, "[\\x0]");
1146        parse_err(bracketed_class, "[\\é]");
1147    }
1148
1149    #[test]
1150    fn test_bracketed_class_item() {
1151        parse(
1152            bracketed_class_item,
1153            "\\sw",
1154            "w",
1155            BracketedClassItem::Perl(PerlClass {
1156                kind: PerlClassKind::Space,
1157                negated: false,
1158            }),
1159        );
1160        parse(
1161            bracketed_class_item,
1162            "\\c-z]",
1163            "]",
1164            BracketedClassItem::Range(lit(b'c', 0..2, true), lit(b'z', 3..4, false)),
1165        );
1166
1167        parse_err(bracketed_class_item, "é");
1168    }
1169
1170    #[test]
1171    fn test_bracketed_class_range_or_literal() {
1172        parse(
1173            bracketed_class_range_or_literal,
1174            "ab",
1175            "b",
1176            BracketedClassItem::Literal(lit(b'a', 0..1, false)),
1177        );
1178        parse(
1179            bracketed_class_range_or_literal,
1180            r"\x01-",
1181            "-",
1182            BracketedClassItem::Literal(lit(b'\x01', 0..4, false)),
1183        );
1184        parse(
1185            bracketed_class_range_or_literal,
1186            "-\\]",
1187            "\\]",
1188            BracketedClassItem::Literal(lit(b'-', 0..1, false)),
1189        );
1190        parse(
1191            bracketed_class_range_or_literal,
1192            "A-]",
1193            "-]",
1194            BracketedClassItem::Literal(lit(b'A', 0..1, false)),
1195        );
1196
1197        parse(
1198            bracketed_class_range_or_literal,
1199            "a-\\sb",
1200            "b",
1201            BracketedClassItem::Range(lit(b'a', 0..1, false), lit(b's', 2..4, true)),
1202        );
1203        parse(
1204            bracketed_class_range_or_literal,
1205            "!--",
1206            "",
1207            BracketedClassItem::Range(lit(b'!', 0..1, false), lit(b'-', 2..3, false)),
1208        );
1209        parse(
1210            bracketed_class_range_or_literal,
1211            "---",
1212            "",
1213            BracketedClassItem::Range(lit(b'-', 0..1, false), lit(b'-', 2..3, false)),
1214        );
1215        parse(
1216            bracketed_class_range_or_literal,
1217            r"\n-\n",
1218            "",
1219            BracketedClassItem::Range(lit(b'\n', 0..2, false), lit(b'\n', 3..5, false)),
1220        );
1221        parse(
1222            bracketed_class_range_or_literal,
1223            r"\x01-\xFE",
1224            "",
1225            BracketedClassItem::Range(lit(b'\x01', 0..4, false), lit(b'\xFE', 5..9, false)),
1226        );
1227
1228        parse_err(bracketed_class_range_or_literal, "é");
1229        parse_err(bracketed_class_range_or_literal, "b-a");
1230        parse_err(bracketed_class_range_or_literal, "é-a");
1231        parse_err(bracketed_class_range_or_literal, "a-é");
1232        parse_err(bracketed_class_range_or_literal, "]-a");
1233    }
1234
1235    #[test]
1236    fn test_bracketed_class_literal() {
1237        parse(bracketed_class_literal, "ab", "b", lit(b'a', 0..1, false));
1238        parse(
1239            bracketed_class_literal,
1240            "\\nb",
1241            "b",
1242            lit(b'\n', 0..2, false),
1243        );
1244        parse(bracketed_class_literal, "\\]", "", lit(b']', 0..2, true));
1245
1246        parse_err(bracketed_class_literal, "]b");
1247        parse_err(bracketed_class_literal, "é");
1248        parse_err(bracketed_class_literal, "\\x1");
1249        parse_err(bracketed_class_literal, "\\é");
1250    }
1251
1252    #[test]
1253    fn test_bracketed_class_char() {
1254        parse(bracketed_class_char, "ab", "b", lit(b'a', 0..1, false));
1255
1256        parse_err(bracketed_class_char, "]b");
1257        parse_err(bracketed_class_char, "é");
1258    }
1259
1260    #[test]
1261    fn test_literal() {
1262        parse(literal, "ab", "b", Node::Literal(lit(b'a', 0..1, false)));
1263        parse(literal, "]", "", Node::Literal(lit(b']', 0..1, false)));
1264
1265        parse(
1266            literal,
1267            "éb",
1268            "b",
1269            Node::Char(LiteralChar {
1270                c: 'é',
1271                span: 0..2,
1272                escaped: false,
1273            }),
1274        );
1275    }
1276
1277    #[test]
1278    fn test_escaped_char() {
1279        parse(
1280            escaped_char,
1281            "\\na",
1282            "a",
1283            Node::Literal(lit(b'\n', 0..2, false)),
1284        );
1285        parse(
1286            escaped_char,
1287            "\\ta",
1288            "a",
1289            Node::Literal(lit(b'\t', 0..2, false)),
1290        );
1291        parse(
1292            escaped_char,
1293            "\\ra",
1294            "a",
1295            Node::Literal(lit(b'\r', 0..2, false)),
1296        );
1297        parse(
1298            escaped_char,
1299            "\\fa",
1300            "a",
1301            Node::Literal(lit(b'\x0C', 0..2, false)),
1302        );
1303        parse(
1304            escaped_char,
1305            "\\aa",
1306            "a",
1307            Node::Literal(lit(b'\x07', 0..2, false)),
1308        );
1309        parse(
1310            escaped_char,
1311            "\\x00a",
1312            "a",
1313            Node::Literal(lit(b'\0', 0..4, false)),
1314        );
1315        parse(
1316            escaped_char,
1317            "\\xAF a",
1318            " a",
1319            Node::Literal(lit(b'\xAF', 0..4, false)),
1320        );
1321        parse(
1322            escaped_char,
1323            "\\k",
1324            "",
1325            Node::Literal(lit(b'k', 0..2, true)),
1326        );
1327        parse(
1328            escaped_char,
1329            "\\é_",
1330            "_",
1331            Node::Char(LiteralChar {
1332                c: 'é',
1333                span: 0..3,
1334                escaped: true,
1335            }),
1336        );
1337
1338        parse_err(escaped_char, "\\");
1339        parse_err(escaped_char, "\\x");
1340        parse_err(escaped_char, "\\x2");
1341        parse_err(escaped_char, "\\x2G");
1342    }
1343
1344    #[test]
1345    fn test_escaped_char_only_ascii() {
1346        parse(
1347            escaped_char_only_ascii,
1348            "\\na",
1349            "a",
1350            lit(b'\n', 0..2, false),
1351        );
1352        parse(
1353            escaped_char_only_ascii,
1354            "\\ta",
1355            "a",
1356            lit(b'\t', 0..2, false),
1357        );
1358        parse(
1359            escaped_char_only_ascii,
1360            "\\ra",
1361            "a",
1362            lit(b'\r', 0..2, false),
1363        );
1364        parse(
1365            escaped_char_only_ascii,
1366            "\\fa",
1367            "a",
1368            lit(b'\x0C', 0..2, false),
1369        );
1370        parse(
1371            escaped_char_only_ascii,
1372            "\\aa",
1373            "a",
1374            lit(b'\x07', 0..2, false),
1375        );
1376        parse(
1377            escaped_char_only_ascii,
1378            "\\x00a",
1379            "a",
1380            lit(b'\0', 0..4, false),
1381        );
1382        parse(
1383            escaped_char_only_ascii,
1384            "\\xAF a",
1385            " a",
1386            lit(b'\xAF', 0..4, false),
1387        );
1388        parse(escaped_char_only_ascii, "\\k", "", lit(b'k', 0..2, true));
1389
1390        parse_err(escaped_char_only_ascii, "\\");
1391        parse_err(escaped_char_only_ascii, "\\é");
1392        parse_err(escaped_char_only_ascii, "\\x");
1393        parse_err(escaped_char_only_ascii, "\\x2");
1394        parse_err(escaped_char_only_ascii, "\\x2G");
1395    }
1396
1397    #[test]
1398    fn test_range_repetition() {
1399        parse(
1400            range_repetition,
1401            "{0} ?a",
1402            " ?a",
1403            (RepetitionRange::Exactly(0), true),
1404        );
1405        parse(
1406            range_repetition,
1407            "{5}?a",
1408            "a",
1409            (RepetitionRange::Exactly(5), false),
1410        );
1411
1412        parse(
1413            range_repetition,
1414            "{5,15} a",
1415            " a",
1416            (RepetitionRange::Bounded(5, 15), true),
1417        );
1418        parse(
1419            range_repetition,
1420            "{5,}?a",
1421            "a",
1422            (RepetitionRange::AtLeast(5), false),
1423        );
1424
1425        parse_err(range_repetition, "{}?");
1426    }
1427
1428    #[test]
1429    fn test_range_single() {
1430        parse(range_single, "{0}a", "a", RepetitionRange::Exactly(0));
1431        parse(range_single, "{350} a", " a", RepetitionRange::Exactly(350));
1432
1433        parse_err(range_single, "{");
1434        parse_err(range_single, "{}");
1435        parse_err(range_single, "{-5}");
1436    }
1437
1438    #[test]
1439    fn test_range_multi() {
1440        parse(range_multi, "{,5}a", "a", RepetitionRange::Bounded(0, 5));
1441        parse(range_multi, "{5,}a", "a", RepetitionRange::AtLeast(5));
1442        parse(range_multi, "{5,10}a", "a", RepetitionRange::Bounded(5, 10));
1443        parse(range_multi, "{0,0} a", " a", RepetitionRange::Bounded(0, 0));
1444        parse(range_multi, "{,}", "", RepetitionRange::AtLeast(0));
1445        parse(range_multi, "{, }", "", RepetitionRange::AtLeast(0));
1446        parse(range_multi, "{ ,}", "", RepetitionRange::AtLeast(0));
1447        parse(range_multi, "{ , }", "", RepetitionRange::AtLeast(0));
1448        parse(range_multi, "{2 , }", "", RepetitionRange::AtLeast(2));
1449        parse(range_multi, "{ , 2}", "", RepetitionRange::Bounded(0, 2));
1450        parse(range_multi, "{1 , 2}", "", RepetitionRange::Bounded(1, 2));
1451
1452        parse_err(range_multi, "{");
1453        parse_err(range_multi, "{,5");
1454        parse_err(range_multi, "{,-5}");
1455        parse_err(range_multi, "{-5,}");
1456        parse_err(range_multi, "{10,5}");
1457        parse_err(range_multi, "{ 1,5}");
1458        parse_err(range_multi, "{1,5 }");
1459    }
1460
1461    #[test]
1462    fn test_parse_u32() {
1463        parse(parse_u32, "5a", "a", 5_u32);
1464
1465        parse_err(parse_u32, "a");
1466        parse_err(parse_u32, "-5a");
1467        parse_err(parse_u32, "5000000000000");
1468    }
1469
1470    #[test]
1471    fn test_parse_opt_u32() {
1472        parse(parse_opt_u32, "a", "a", None);
1473        parse(parse_opt_u32, "5a", "a", Some(5));
1474        parse(parse_opt_u32, "-5a", "-5a", None);
1475
1476        parse_err(parse_opt_u32, "5000000000000");
1477    }
1478
1479    #[test]
1480    fn test_stack_overflow() {
1481        // Use a limit below the default one. This is because the default one
1482        // works well when compiled in release mode, but in debug mode the
1483        // stack grows much more quickly.
1484        const STRING_RECURSION_TEST_LIMIT: u8 = 10;
1485
1486        // Parsing of a regex includes recursion, so it must be protected against
1487        // stack overflowing.
1488        let mut v = String::new();
1489        v.push('/');
1490        for _ in 0..1_000 {
1491            v.push_str("a(b");
1492        }
1493        for _ in 0..1_000 {
1494            v.push_str(")c");
1495        }
1496        v.push('/');
1497
1498        let input = Input::with_params(
1499            &v,
1500            Params::default().string_recursion_limit(STRING_RECURSION_TEST_LIMIT),
1501        );
1502        let res = regex(input).finish();
1503        assert_eq!(
1504            &res.unwrap_err(),
1505            &Error::new(30..30, ErrorKind::RegexTooDeep)
1506        );
1507
1508        // test that changing the limit works
1509        let input = Input::with_params(
1510            &v,
1511            Params::default().string_recursion_limit(STRING_RECURSION_TEST_LIMIT + 2),
1512        );
1513        let res = regex(input).finish();
1514        assert_eq!(
1515            &res.unwrap_err(),
1516            &Error::new(36..36, ErrorKind::RegexTooDeep)
1517        );
1518
1519        // counter should reset, so many imbricated groups, but all below the limit should be fine.
1520        let mut v = String::new();
1521        v.push('/');
1522        let nb = STRING_RECURSION_TEST_LIMIT - 1;
1523        for _ in 0..nb {
1524            v.push_str("a(b");
1525        }
1526        for _ in 0..nb {
1527            v.push_str(")c");
1528        }
1529        v.push('d');
1530        for _ in 0..nb {
1531            v.push_str("e(f");
1532        }
1533        for _ in 0..nb {
1534            v.push_str(")h");
1535        }
1536        v.push('/');
1537
1538        let input = Input::with_params(
1539            &v,
1540            Params::default().string_recursion_limit(STRING_RECURSION_TEST_LIMIT),
1541        );
1542        let _res = regex(input).unwrap();
1543        assert_eq!(input.string_recursion_counter, 0);
1544    }
1545
1546    #[test]
1547    fn test_parse_regex() {
1548        assert!(parse_regex(r"/a{2}/").is_ok());
1549        assert!(parse_regex(r"a{2}/").is_err());
1550    }
1551
1552    #[test]
1553    fn test_public_types() {
1554        test_public_type(regex(Input::new(r"/a{2}[az]\b\s|.+$/")).unwrap());
1555    }
1556}
boreal_parser/regex.rs

boreal_parser/
regex.rs