boreal 1.1.0

A library to evaluate YARA rules, used to scan bytes for textual and binary pattern
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
use crate::bitmaps::Bitmap;
use boreal_parser::hex_string::{Mask, Token};
use boreal_parser::regex::{
    AssertionKind, BracketedClass, BracketedClassItem, ClassKind, Literal, LiteralChar, Node,
    PerlClass, PerlClassKind, RepetitionKind, RepetitionRange,
};
use std::ops::Range;

/// HIR of a regular expression.
///
/// Both regexes and hex strings are translated into this representation, which
/// is then compiled into optimized matchers.
#[derive(Clone, Debug)]
pub enum Hir {
    /// Alternation, ie `a|b|...`.
    Alternation(Vec<Hir>),

    /// Zero-width assertion, e.g. ^, \b, ...
    Assertion(AssertionKind),

    /// Set of allowed values for a single byte.
    Class(Class),

    /// Mask over a byte, e.g. `?A`, `~5?`, ...
    Mask {
        /// Value to compare once the mask has been applied.
        ///
        /// For example, for `5?`, this is `0x50`.
        value: u8,

        /// Mask to apply before comparing the value.
        ///
        /// For example, for `5?`, this is `0xF0`.
        mask: u8,

        /// Is the comparison negated.
        ///
        /// If true, this matches if the comparison does not match.
        negated: bool,
    },

    /// Concatenation, must match in order.
    Concat(Vec<Hir>),

    /// The special `.` character.
    Dot,

    /// Empty expression.
    Empty,

    /// Literal byte.
    Literal(u8),

    /// A group, i.e. (...).
    Group(Box<Hir>),

    /// Repetition of an expression.
    Repetition {
        /// Expression to repeat.
        hir: Box<Hir>,

        /// Kind of repetition.
        kind: RepetitionKind,

        /// Is the repetition greedy or not.
        greedy: bool,
    },
}

/// Class of bytes.
#[derive(Clone, Debug)]
pub struct Class {
    /// Class definition.
    //
    // TODO: This is kept around so that the HIR can be retranslated back into a proper regex
    // string when converting to the regex_automata expressions.
    // This could be removed by either:
    // - converting our HIR into the regex_automata HIR
    // - converting this class into an explicit class with one byte for each bit in the
    //   bitmap. But it would require ensuring this does not cause regressions (it should
    //   not).
    pub definition: ClassKind,

    /// Bitfield of which bytes are in the class.
    pub bitmap: Bitmap,
}

/// Convert a parsed regex AST into our HIR.
///
/// This is quite straightforward, but for one particular transformation: the parsing
/// is unicode aware, while the HIR is bytes only.
///
/// See <https://github.com/VirusTotal/yara/pull/1770#issuecomment-1357622486> for some
/// discussions on this. To be compatible with YARA, we need to accept unicode bytes, but
/// match as if the explicit bytes were provided. This function takes care of that.
// TODO: implement a visitor for the regex ast
pub(crate) fn regex_ast_to_hir(node: Node, warnings: &mut Vec<RegexAstError>) -> Hir {
    match node {
        Node::Alternation(v) => Hir::Alternation(
            v.into_iter()
                .map(|n| regex_ast_to_hir(n, warnings))
                .collect(),
        ),
        Node::Assertion(v) => Hir::Assertion(v),
        Node::Class(definition) => Hir::Class(Class {
            bitmap: class_to_bitmap(&definition, warnings),
            definition,
        }),
        Node::Concat(v) => Hir::Concat(
            v.into_iter()
                .map(|n| regex_ast_to_hir(n, warnings))
                .collect(),
        ),
        Node::Dot => Hir::Dot,
        Node::Empty => Hir::Empty,
        Node::Literal(lit) => {
            let byte = unwrap_literal(&lit, warnings);
            Hir::Literal(byte)
        }
        Node::Group(v) => Hir::Group(Box::new(regex_ast_to_hir(*v, warnings))),
        Node::Repetition { node, kind, greedy } => {
            match *node {
                // This special code is here to normalize the HIR. The issue is that
                // the parsing is unicode aware, but the yara engine is not. So non ascii
                // characters are transformed into the utf-8 representation, which
                // causes an issue with repetitions: when `<char><repetition>` is
                // parsed, what we want to evaluate is
                // `<byte0><byte1>...<byteN><repetition>`, so the repetition only
                // applies on the last byte of the utf-7 representation of the character.
                //
                // We *could* only transform Node::Char into an Hir::Concat
                // of its utf-8 bytes. This however leads to an HIR that is no
                // longer stable through a string representation. For example,
                // take this example:
                //
                // - the regex `ù+` is parsed, giving the AST
                //   `Repetition(Char('ù'), OneOrMore)`.
                // - this is normalized into the HIR
                //   `Repetition(Concat([b'\xC3', b'\xB9']), OneOrMore)`
                // - converting to a string gives `\xC3\xB9+`.
                // - this is parsed and converted into the HIR
                //   `Concat([b'\xC3', Repetition(b'\xB9', OneOrMore))`
                //
                // This attempts to fall into working currently, since we print
                // the HIR to give it to `regex_automata`. But it is extremely
                // flimsy, and could lead to a many bugs, since it means we
                // still work with an HIR that is invalid, since its
                // representation of the regex does not match the matching
                // behavior.
                //
                // This is all to say: we want to normalize the HIR into
                // a stable and faithful representation. Hence this code
                // a bit hacky, where we handle the special
                // "repetition over a char" case, to put the repetition
                // only over the last byte.
                Node::Char(LiteralChar { c, span, escaped }) => {
                    if escaped {
                        warnings.push(RegexAstError::UnknownEscape {
                            span: span.clone(),
                            c,
                        });
                    }
                    warnings.push(RegexAstError::NonAsciiChar { span });

                    let mut enc = vec![0; 4];
                    let _r = c.encode_utf8(&mut enc);
                    let len = c.len_utf8();

                    // Move the repetition to the last char only.
                    let mut concat = Vec::with_capacity(len);
                    for b in &enc[0..(len - 1)] {
                        concat.push(Hir::Literal(*b));
                    }
                    concat.push(Hir::Repetition {
                        hir: Box::new(Hir::Literal(enc[len - 1])),
                        kind,
                        greedy,
                    });
                    Hir::Concat(concat)
                }
                v => Hir::Repetition {
                    hir: Box::new(regex_ast_to_hir(v, warnings)),
                    kind,
                    greedy,
                },
            }
        }
        Node::Char(LiteralChar { c, span, escaped }) => {
            if escaped {
                warnings.push(RegexAstError::UnknownEscape {
                    span: span.clone(),
                    c,
                });
            }
            warnings.push(RegexAstError::NonAsciiChar { span });

            let mut enc = vec![0; 4];
            let res = c.encode_utf8(&mut enc);
            Hir::Concat(res.as_bytes().iter().map(|v| Hir::Literal(*v)).collect())
        }
    }
}

fn unwrap_literal(lit: &Literal, warnings: &mut Vec<RegexAstError>) -> u8 {
    let Literal {
        byte,
        span,
        escaped,
    } = lit;

    if *escaped && !is_meta_character(*byte) {
        warnings.push(RegexAstError::UnknownEscape {
            span: span.clone(),
            c: char::from(*byte),
        });
    }

    *byte
}

fn is_meta_character(byte: u8) -> bool {
    matches!(
        byte,
        b'\\'
            | b'/'
            | b'.'
            | b'+'
            | b'*'
            | b'?'
            | b'('
            | b')'
            | b'|'
            | b'['
            | b']'
            | b'{'
            | b'}'
            | b'^'
            | b'$'
            | b'-'
    )
}

fn class_to_bitmap(class_kind: &ClassKind, warnings: &mut Vec<RegexAstError>) -> Bitmap {
    match class_kind {
        ClassKind::Perl(p) => perl_class_to_bitmap(p),
        ClassKind::Bracketed(BracketedClass { items, negated }) => {
            let mut bitmap = Bitmap::new();

            for item in items {
                match item {
                    BracketedClassItem::Perl(p) => {
                        bitmap |= perl_class_to_bitmap(p);
                    }
                    BracketedClassItem::Literal(lit) => {
                        let byte = unwrap_literal(lit, warnings);
                        bitmap.set(byte);
                    }
                    BracketedClassItem::Range(lita, litb) => {
                        let a = unwrap_literal(lita, warnings);
                        let b = unwrap_literal(litb, warnings);
                        for c in a..=b {
                            bitmap.set(c);
                        }
                    }
                }
            }

            if *negated {
                bitmap.invert();
            }
            bitmap
        }
    }
}

fn perl_class_to_bitmap(cls: &PerlClass) -> Bitmap {
    let PerlClass { kind, negated } = cls;

    let mut bitmap = Bitmap::new();
    match kind {
        PerlClassKind::Word => {
            for c in b'0'..=b'9' {
                bitmap.set(c);
            }
            for c in b'A'..=b'Z' {
                bitmap.set(c);
            }
            bitmap.set(b'_');
            for c in b'a'..=b'z' {
                bitmap.set(c);
            }
        }
        PerlClassKind::Space => {
            for c in [b'\t', b'\n', b'\x0B', b'\x0C', b'\r', b' '] {
                bitmap.set(c);
            }
        }
        PerlClassKind::Digit => {
            for c in b'0'..=b'9' {
                bitmap.set(c);
            }
        }
    }
    if *negated {
        bitmap.invert();
    }
    bitmap
}

impl From<Vec<Token>> for Hir {
    fn from(tokens: Vec<Token>) -> Self {
        Hir::Concat(tokens.into_iter().map(Into::into).collect())
    }
}

impl From<Token> for Hir {
    fn from(token: Token) -> Self {
        match token {
            Token::Byte(b) => Hir::Literal(b),
            Token::NotByte(b) => {
                let mut bitmap = Bitmap::new();
                bitmap.set(b);
                bitmap.invert();

                Hir::Class(Class {
                    definition: ClassKind::Bracketed(BracketedClass {
                        items: vec![BracketedClassItem::Literal(Literal {
                            byte: b,
                            span: 0..1,
                            escaped: false,
                        })],
                        negated: true,
                    }),
                    bitmap,
                })
            }
            Token::MaskedByte(b, mask) => masked_byte_to_hir(b, &mask, false),
            Token::NotMaskedByte(b, mask) => masked_byte_to_hir(b, &mask, true),
            Token::Jump(jump) => {
                let kind = match (jump.from, jump.to) {
                    (from, None) => RepetitionKind::Range(RepetitionRange::AtLeast(from)),
                    (from, Some(to)) => RepetitionKind::Range(RepetitionRange::Bounded(from, to)),
                };
                Hir::Repetition {
                    hir: Box::new(Hir::Dot),
                    kind,
                    greedy: false,
                }
            }
            Token::Alternatives(elems) => Hir::Group(Box::new(Hir::Alternation(
                elems.into_iter().map(Into::into).collect(),
            ))),
        }
    }
}

fn masked_byte_to_hir(byte: u8, mask: &Mask, negated: bool) -> Hir {
    match mask {
        Mask::Left => Hir::Mask {
            value: byte,
            mask: 0x0F,
            negated,
        },
        Mask::Right => Hir::Mask {
            value: byte << 4,
            mask: 0xF0,
            negated,
        },
        Mask::All => Hir::Dot,
    }
}

/// Errors related to a regex AST.
#[derive(Clone, Debug)]
pub enum RegexAstError {
    /// A non ascii character is present in the regex.
    NonAsciiChar {
        /// Span of the character.
        span: Range<usize>,
    },
    /// An unknown escape sequence is present in the regex.
    UnknownEscape {
        /// Span of the escape sequence.
        span: Range<usize>,

        /// Character equivalent to the sequence, without the escape.
        c: char,
    },
}

#[cfg(test)]
mod tests {
    use crate::test_helpers::test_type_traits;

    use super::*;

    #[test]
    fn test_types_traits() {
        test_type_traits(Hir::Empty);
        test_type_traits(Class {
            definition: ClassKind::Perl(PerlClass {
                kind: PerlClassKind::Word,
                negated: false,
            }),
            bitmap: Bitmap::new(),
        });
        test_type_traits(RegexAstError::NonAsciiChar { span: 0..1 });
    }
}