boreal_parser/
hex_string.rs

1//! AST objects related to hex strings.
2use nom::branch::alt;
3use nom::bytes::complete::tag;
4use nom::character::complete::{char, digit1};
5use nom::combinator::{cut, map, opt, value};
6use nom::error::{ErrorKind as NomErrorKind, ParseError};
7use nom::multi::{many1, separated_list1};
8use nom::sequence::{delimited, preceded, terminated};
9use nom::Parser;
10
11use super::error::{Error, ErrorKind};
12use super::nom_recipes::{map_res, rtrim};
13use super::types::{Input, ParseResult};
14
15const JUMP_LIMIT_IN_ALTERNATIVES: u32 = 200;
16
17/// A token in an hex string.
18#[derive(Clone, Debug, PartialEq)]
19pub enum Token {
20    /// A fully declared byte, eg `9C`
21    Byte(u8),
22    /// Negation of a byte, eg `~9C`
23    NotByte(u8),
24    /// A masked byte, eg `?5`, `C?`, `??`
25    MaskedByte(u8, Mask),
26    /// Negation of a masked byte, eg `~?C`. The mask cannot be [`Mask::All`].
27    NotMaskedByte(u8, Mask),
28    /// A jump of unknown bytes, eg `[5-10]`, `[3-]`, ...
29    Jump(Jump),
30    /// Two possible list of tokens, eg `( 12 34 | 98 76 )`
31    Alternatives(Vec<Vec<Token>>),
32}
33
34/// Mask on a byte.
35#[derive(Clone, Debug, PartialEq, Eq)]
36pub enum Mask {
37    /// The left part is masked, ie ?X
38    Left,
39    /// The right part is masked, ie X?
40    Right,
41    /// Both parts are masked, ie ??
42    All,
43}
44
45/// A jump range, which can be expressed in multiple ways:
46///
47/// - `[a-b]` means between `a` and `b`, inclusive.
48/// - `[-b]` is equivalent to `[0-b]`.
49/// - `[a-]` means `a` or more.
50/// - `[-]` is equivalent to `[0-]`.
51/// - `[a]` is equivalent to `[a-a]`.
52#[derive(Clone, Debug, PartialEq, Eq)]
53pub struct Jump {
54    /// Beginning of the range, included.
55    pub from: u32,
56    /// Optional end of the range, included.
57    pub to: Option<u32>,
58}
59
60/// Parse a hex string.
61///
62/// The input is expected to look like `{ AB .. }`.
63///
64/// # Errors
65///
66/// Returns an error if the parsing fails.
67pub fn parse_hex_string(input: &str) -> Result<Vec<Token>, Error> {
68    use nom::Finish;
69
70    let input = Input::new(input);
71    let (_, tokens) = hex_string(input).finish()?;
72
73    Ok(tokens)
74}
75
76/// Parse an hex string.
77///
78/// This looks like `{ AB .. }`.
79///
80/// This is equivalent to the `hex_string` rule in `hex_grammar.y` in libyara.
81pub(crate) fn hex_string(input: Input) -> ParseResult<Vec<Token>> {
82    let (input, _) = rtrim(char('{')).parse(input)?;
83
84    cut(terminated(|input| tokens(input, false), rtrim(char('}')))).parse(input)
85}
86
87/// Parse an hex-digit, and return its value in [0-15].
88fn hex_digit(mut input: Input) -> ParseResult<u8> {
89    match input.cursor().chars().next().and_then(|c| {
90        // Cannot truncate, so allow lint
91        #[allow(clippy::cast_possible_truncation)]
92        c.to_digit(16).map(|v| v as u8)
93    }) {
94        Some(v) => {
95            input.advance(1);
96            Ok((input, v))
97        }
98        _ => Err(nom::Err::Error(Error::from_error_kind(
99            input,
100            NomErrorKind::HexDigit,
101        ))),
102    }
103}
104
105/// Parse a hex byte.
106///
107/// Equivalent to the _BYTE_ lexical pattern in libyara.
108fn byte(input: Input) -> ParseResult<u8> {
109    let (input, digit0) = hex_digit(input)?;
110
111    map(rtrim(hex_digit), move |digit1| (digit0 << 4) | digit1).parse(input)
112}
113
114/// Parse the not tokens.
115fn not_token(input: Input) -> ParseResult<Token> {
116    let start = input.pos();
117
118    let (input, _) = char('~').parse(input)?;
119    let (input, token) = cut(alt((
120        map(byte, Token::NotByte),
121        map(masked_byte, |(b, mask)| Token::NotMaskedByte(b, mask)),
122    )))
123    .parse(input)?;
124
125    if let Token::NotMaskedByte(_, Mask::All) = &token {
126        return Err(nom::Err::Failure(Error::new(
127            input.get_span_from(start),
128            ErrorKind::CannotNegateMaskAll,
129        )));
130    }
131
132    Ok((input, token))
133}
134
135/// Parse a masked hex byte, ie X?, ?X or ??.
136///
137/// Equivalent to the `_MASKED_BYTE_` lexical pattern in libyara.
138fn masked_byte(input: Input) -> ParseResult<(u8, Mask)> {
139    rtrim(alt((
140        map(tag("??"), |_| (0, Mask::All)),
141        map(preceded(char('?'), hex_digit), |v| (v, Mask::Left)),
142        map(terminated(hex_digit, char('?')), |v| (v, Mask::Right)),
143    )))
144    .parse(input)
145}
146
147/// Parse a jump range, which can be expressed in multiple ways:
148///
149/// - `[a-b]` means between `a` and `b`, inclusive.
150/// - `[-b]` is equivalent to `[0-b]`.
151/// - `[a-]` means `a` or more.
152/// - `[-]` is equivalent to `[0-]`.
153/// - `[a]` is equivalent to `[a-a]`.
154///
155/// This is equivalent to the range state in libyara.
156fn range(input: Input) -> ParseResult<Jump> {
157    let start = input.pos();
158    let (input, _) = rtrim(char('[')).parse(input)?;
159
160    // Parse 'a'
161    let (input, from) = opt(map_res(rtrim(digit1), |v| {
162        str::parse::<u32>(v.cursor()).map_err(ErrorKind::StrToIntError)
163    }))
164    .parse(input)?;
165
166    let (input, to) = match from {
167        Some(from) => {
168            alt((
169                // Parses -b?]
170                delimited(
171                    rtrim(char('-')),
172                    opt(map_res(rtrim(digit1), |v| {
173                        str::parse(v.cursor()).map_err(ErrorKind::StrToIntError)
174                    })),
175                    rtrim(char(']')),
176                ),
177                // Otherwise, this means '[a]'
178                value(Some(from), rtrim(char(']'))),
179            ))
180            .parse(input)?
181        }
182        None => delimited(
183            rtrim(char('-')),
184            opt(map_res(rtrim(digit1), |v| {
185                str::parse(v.cursor()).map_err(ErrorKind::StrToIntError)
186            })),
187            rtrim(char(']')),
188        )
189        .parse(input)?,
190    };
191
192    let jump = Jump {
193        from: from.unwrap_or(0),
194        to,
195    };
196
197    if let Err(kind) = validate_jump(&jump) {
198        return Err(nom::Err::Failure(Error::new(
199            input.get_span_from(start),
200            kind,
201        )));
202    }
203    Ok((input, jump))
204}
205
206/// Validate that a jump is well-formed.
207fn validate_jump(range: &Jump) -> Result<(), ErrorKind> {
208    if let Some(to) = range.to {
209        if range.from == 0 && to == 0 {
210            return Err(ErrorKind::JumpEmpty);
211        }
212        if range.from > to {
213            return Err(ErrorKind::JumpRangeInvalid {
214                from: range.from,
215                to,
216            });
217        }
218    }
219
220    Ok(())
221}
222
223/// Parse an alternative between two sets of tokens.
224///
225/// This looks like `( AB .. | CD .. [ | .. ] )`.
226///
227/// This is equivalent to the `alternatives` from `hex_grammar.y` in libyara.
228fn alternatives(input: Input) -> ParseResult<Token> {
229    let (input, _) = rtrim(char('(')).parse(input)?;
230
231    cut(terminated(
232        map(
233            separated_list1(rtrim(char('|')), |input| tokens(input, true)),
234            Token::Alternatives,
235        ),
236        rtrim(char(')')),
237    ))
238    .parse(input)
239}
240
241fn range_as_hex_token(input: Input, in_alternatives: bool) -> ParseResult<Token> {
242    let start = input.pos();
243    let (input, range) = range(input)?;
244
245    // Some jumps are forbidden inside an alternatives
246    if in_alternatives {
247        if let Err(kind) = validate_jump_in_alternatives(&range) {
248            return Err(nom::Err::Failure(Error::new(
249                input.get_span_from(start),
250                kind,
251            )));
252        }
253    }
254
255    // Jump of one is equivalent to ??
256    if let Some(to) = &range.to {
257        if range.from == *to && range.from == 1 {
258            return Ok((input, Token::MaskedByte(0, Mask::All)));
259        }
260    }
261    Ok((input, Token::Jump(range)))
262}
263
264fn validate_jump_in_alternatives(jump: &Jump) -> Result<(), ErrorKind> {
265    match jump.to {
266        None => Err(ErrorKind::JumpUnboundedInAlternation),
267        Some(to) => {
268            // No need to test from, as from <= to, if from is over the limit, to will be.
269            if to > JUMP_LIMIT_IN_ALTERNATIVES {
270                Err(ErrorKind::JumpTooBigInAlternation {
271                    limit: JUMP_LIMIT_IN_ALTERNATIVES,
272                })
273            } else {
274                Ok(())
275            }
276        }
277    }
278}
279
280/// Parse an hex token.
281///
282/// Some token are not allowed inside an alternatives, which is why a
283/// `in_alternatives` flag is needed.
284///
285/// This is equivalent to the `token_or_range` rule in `hex_grammar.y` in libyara.
286fn hex_token(input: Input, in_alternatives: bool) -> ParseResult<Token> {
287    alt((
288        not_token,
289        map(byte, Token::Byte),
290        map(masked_byte, |(v, mask)| Token::MaskedByte(v, mask)),
291        |input| range_as_hex_token(input, in_alternatives),
292        alternatives,
293    ))
294    .parse(input)
295}
296
297/// Parse a list of token
298///
299/// A jump is not allowed at the beginning or at the end of the list.
300///
301/// This is equivalent to the `tokens` rule in `hex_grammar.y` in libyara.
302fn tokens(mut input: Input, in_alternatives: bool) -> ParseResult<Vec<Token>> {
303    let start = input.pos();
304
305    // This combinator is recursive:
306    //
307    // tokens => hex_token => alternatives => tokens
308    //
309    // Use the string recursive counter to make sure this recursion cannot grow too much.
310    if input.string_recursion_counter >= input.params.string_recursion_limit {
311        return Err(nom::Err::Failure(Error::new(
312            input.get_span_from(start),
313            ErrorKind::HexStringTooDeep,
314        )));
315    }
316
317    input.string_recursion_counter += 1;
318    let (mut input, tokens) = many1(|input| hex_token(input, in_alternatives)).parse(input)?;
319    input.string_recursion_counter -= 1;
320
321    if matches!(tokens[0], Token::Jump(_))
322        || (tokens.len() > 1 && matches!(tokens[tokens.len() - 1], Token::Jump(_)))
323    {
324        Err(nom::Err::Failure(Error::new(
325            input.get_span_from(start),
326            ErrorKind::JumpAtBound,
327        )))
328    } else {
329        Ok((input, tokens))
330    }
331}
332
333#[cfg(test)]
334mod tests {
335    use crate::test_helpers::{parse, parse_err, test_public_type};
336    use crate::types::Params;
337    use nom::Finish;
338
339    use super::*;
340
341    #[test]
342    fn test_parse_hex_byte() {
343        parse(byte, "AF", "", 0xAF);
344        parse(byte, "10F", "F", 0x10);
345        parse(byte, "9E 1", "1", 0x9E);
346
347        parse_err(byte, "G1");
348        parse_err(byte, "1G");
349        parse_err(byte, "1");
350        parse_err(byte, " ");
351    }
352
353    #[test]
354    fn test_parse_masked_byte() {
355        parse(masked_byte, "?1", "", (1, Mask::Left));
356        parse(masked_byte, "C??", "?", (0xC, Mask::Right));
357        parse(masked_byte, "?? ", "", (0, Mask::All));
358
359        parse_err(masked_byte, "AB");
360        parse_err(masked_byte, " ?");
361        parse_err(masked_byte, "G?");
362        parse_err(masked_byte, "?G");
363    }
364
365    #[test]
366    fn test_parse_not_token() {
367        parse(not_token, "~23a", "a", Token::NotByte(0x23));
368        parse(
369            not_token,
370            "~?3b",
371            "b",
372            Token::NotMaskedByte(0x03, Mask::Left),
373        );
374        parse(
375            not_token,
376            "~F?",
377            "",
378            Token::NotMaskedByte(0x0F, Mask::Right),
379        );
380
381        parse_err(not_token, "~");
382        parse_err(not_token, "~1");
383        parse_err(not_token, "~1 2");
384        parse_err(not_token, "~ 12");
385        parse_err(not_token, "~??");
386        parse_err(not_token, "~g1");
387        parse_err(not_token, "~1g");
388        parse_err(not_token, "12");
389        parse_err(not_token, "?a");
390        parse_err(not_token, "a?");
391        parse_err(not_token, "??");
392    }
393
394    #[test]
395    fn test_range() {
396        parse(range, "[-] a", "a", Jump { from: 0, to: None });
397        parse(
398            range,
399            "[ 15 -35]",
400            "",
401            Jump {
402                from: 15,
403                to: Some(35),
404            },
405        );
406        parse(range, "[1-  ]", "", Jump { from: 1, to: None });
407        parse(
408            range,
409            "[1-2]]",
410            "]",
411            Jump {
412                from: 1,
413                to: Some(2),
414            },
415        );
416        parse(
417            range,
418            "[  1  -  2  ]",
419            "",
420            Jump {
421                from: 1,
422                to: Some(2),
423            },
424        );
425        parse(
426            range,
427            "[-1]",
428            "",
429            Jump {
430                from: 0,
431                to: Some(1),
432            },
433        );
434        parse(
435            range,
436            "[12 ]",
437            "",
438            Jump {
439                from: 12,
440                to: Some(12),
441            },
442        );
443
444        parse_err(range, "[");
445        parse_err(range, "[]");
446        parse_err(range, "[--]");
447        parse_err(range, "[1-2-3]");
448        parse_err(range, "[1-2-]");
449        parse_err(range, "[-2-]");
450        parse_err(range, "[d-e]");
451        parse_err(range, "[1 2]");
452        parse_err(range, "[999999999999-]");
453        parse_err(range, "[1-999999999999]");
454        parse_err(range, "[-999999999999]");
455
456        // validation errors
457        parse_err(range, "[4-2]");
458        parse_err(range, "[4-3]");
459        parse(
460            range,
461            "[4-4]",
462            "",
463            Jump {
464                from: 4,
465                to: Some(4),
466            },
467        );
468        parse_err(range, "[0]");
469        parse_err(range, "[0-0]");
470        parse(
471            range,
472            "[1]",
473            "",
474            Jump {
475                from: 1,
476                to: Some(1),
477            },
478        );
479    }
480
481    #[test]
482    fn test_alternatives() {
483        parse(
484            alternatives,
485            "( AB | 56 ?F ) ",
486            "",
487            Token::Alternatives(vec![
488                vec![Token::Byte(0xAB)],
489                vec![Token::Byte(0x56), Token::MaskedByte(0x0F, Mask::Left)],
490            ]),
491        );
492        parse(
493            alternatives,
494            "(12[1]C?|??[3-5]33)",
495            "",
496            Token::Alternatives(vec![
497                vec![
498                    Token::Byte(0x12),
499                    Token::MaskedByte(0, Mask::All),
500                    Token::MaskedByte(0x0C, Mask::Right),
501                ],
502                vec![
503                    Token::MaskedByte(0x00, Mask::All),
504                    Token::Jump(Jump {
505                        from: 3,
506                        to: Some(5),
507                    }),
508                    Token::Byte(0x33),
509                ],
510            ]),
511        );
512        parse(
513            alternatives,
514            "( ( ?D | 23)| 15) ",
515            "",
516            Token::Alternatives(vec![
517                vec![Token::Alternatives(vec![
518                    vec![Token::MaskedByte(0x0D, Mask::Left)],
519                    vec![Token::Byte(0x23)],
520                ])],
521                vec![Token::Byte(0x15)],
522            ]),
523        );
524        parse(
525            alternatives,
526            "( AA (BB | CC) | DD | EE FF )",
527            "",
528            Token::Alternatives(vec![
529                vec![
530                    Token::Byte(0xAA),
531                    Token::Alternatives(vec![vec![Token::Byte(0xBB)], vec![Token::Byte(0xCC)]]),
532                ],
533                vec![Token::Byte(0xDD)],
534                vec![Token::Byte(0xEE), Token::Byte(0xFF)],
535            ]),
536        );
537
538        parse_err(alternatives, "( AB | [-] )");
539        parse_err(alternatives, "( AB | [1-] )");
540        parse_err(alternatives, "( AB | [1-250] )");
541        parse_err(alternatives, "( AB | [199-201] )");
542        parse_err(alternatives, "( AB | [200-201] )");
543        parse_err(alternatives, ")");
544        parse_err(alternatives, "()");
545        parse_err(alternatives, "(");
546        parse_err(alternatives, "(|)");
547        parse_err(alternatives, "(|");
548        parse_err(alternatives, "(AB|)");
549        parse_err(alternatives, "(|12)");
550        parse_err(alternatives, "(|123)");
551
552        parse_err(alternatives, "( [-] AB | CD )");
553        parse_err(alternatives, "( AB [1-2] | CD )");
554        parse_err(alternatives, "( AB | [3-] CD )");
555        parse_err(alternatives, "( AB | CD EF [-5] )");
556    }
557
558    #[test]
559    fn test_hex_string() {
560        parse(hex_string, "{ AB }", "", vec![Token::Byte(0xAB)]);
561
562        parse(
563            hex_string,
564            "{ DE AD BE EF }",
565            "",
566            vec![
567                Token::Byte(0xDE),
568                Token::Byte(0xAD),
569                Token::Byte(0xBE),
570                Token::Byte(0xEF),
571            ],
572        );
573        parse(
574            hex_string,
575            "{ 01 ?2 ?? 3? [1-] ( AF | DC ) }",
576            "",
577            vec![
578                Token::Byte(1),
579                Token::MaskedByte(2, Mask::Left),
580                Token::MaskedByte(0, Mask::All),
581                Token::MaskedByte(3, Mask::Right),
582                Token::Jump(Jump { from: 1, to: None }),
583                Token::Alternatives(vec![vec![Token::Byte(0xAF)], vec![Token::Byte(0xDC)]]),
584            ],
585        );
586
587        parse(
588            hex_string,
589            "{ 01 [1] 02 [2] 03 }  a",
590            "a",
591            vec![
592                Token::Byte(1),
593                Token::MaskedByte(0, Mask::All),
594                Token::Byte(2),
595                Token::Jump(Jump {
596                    from: 2,
597                    to: Some(2),
598                }),
599                Token::Byte(3),
600            ],
601        );
602
603        parse_err(hex_string, "{ [-] }");
604        parse_err(hex_string, "{ [-] AB }");
605        parse_err(hex_string, "{ AB CD [-] }");
606
607        parse_err(hex_string, "AB");
608        parse_err(hex_string, "{");
609        parse_err(hex_string, "{}");
610        parse_err(hex_string, "{A}");
611        parse_err(hex_string, "{ABA}");
612        parse_err(hex_string, "{AB");
613    }
614
615    #[test]
616    fn test_stack_overflow() {
617        // Use a limit below the default one. This is because the default one
618        // works well when compiled in release mode, but in debug mode the
619        // stack grows much more quickly.
620        const STRING_RECURSION_TEST_LIMIT: u8 = 10;
621
622        // Parsing of a hex string includes recursion, so it must be protected against
623        // stack overflowing.
624        let mut hex = String::new();
625        hex.push_str("{ AB ");
626        for _ in 0..10_000 {
627            hex.push_str("( CD | ");
628        }
629        for _ in 0..10_000 {
630            hex.push(')');
631        }
632        hex.push('}');
633
634        let input = Input::with_params(
635            &hex,
636            Params::default().string_recursion_limit(STRING_RECURSION_TEST_LIMIT),
637        );
638        let res = hex_string(input).finish();
639        assert_eq!(
640            &res.unwrap_err(),
641            &Error::new(70..70, ErrorKind::HexStringTooDeep)
642        );
643
644        // test that changing the limit works
645        let input = Input::with_params(
646            &hex,
647            Params::default().string_recursion_limit(STRING_RECURSION_TEST_LIMIT + 2),
648        );
649        let res = hex_string(input).finish();
650        assert_eq!(
651            &res.unwrap_err(),
652            &Error::new(84..84, ErrorKind::HexStringTooDeep)
653        );
654
655        // counter should reset, so many imbricated alternations, but all below the limit should be
656        // fine.
657        let mut hex = String::new();
658        hex.push_str("{ AB ");
659        let nb = STRING_RECURSION_TEST_LIMIT - 1;
660        for _ in 0..nb {
661            hex.push_str("( CD | ");
662        }
663        for _ in 0..nb {
664            hex.push_str(" EF )");
665        }
666        hex.push_str(" EF ");
667        for _ in 0..nb {
668            hex.push_str("( CD | ");
669        }
670        for _ in 0..nb {
671            hex.push_str("EF )");
672        }
673        hex.push('}');
674
675        let input = Input::with_params(
676            &hex,
677            Params::default().string_recursion_limit(STRING_RECURSION_TEST_LIMIT),
678        );
679        let _res = hex_string(input).unwrap();
680        assert_eq!(input.string_recursion_counter, 0);
681    }
682
683    #[test]
684    fn test_parse_hex_string() {
685        assert!(parse_hex_string(r"{ AB }").is_ok());
686        assert!(parse_hex_string(r"AB").is_err());
687    }
688
689    #[test]
690    fn test_public_types() {
691        test_public_type(Token::Byte(3));
692        test_public_type(Mask::Left);
693        test_public_type(Jump { from: 3, to: None });
694    }
695}