yara_x/scanner/
blocks.rs

1/*! Scanner for scanning data in blocks.
2
3This scanner is designed for scenarios where the data to be scanned is not
4available as a single contiguous block of memory, but rather arrives in
5smaller, discrete blocks, allowing for incremental scanning.
6*/
7use std::collections::btree_map::Entry;
8use std::collections::BTreeMap;
9use std::mem;
10use std::mem::transmute;
11use std::pin::Pin;
12use std::time::Duration;
13
14use wasmtime::Store;
15
16use crate::errors::VariableError;
17use crate::scanner::context::{create_wasm_store_and_ctx, ScanState};
18use crate::scanner::{DataSnippets, ScanContext};
19use crate::{Rules, ScanError, ScanResults, Variable};
20
21/// Scans data in blocks
22///
23/// This scanner is designed for scenarios where the data to be scanned is not
24/// available as a single contiguous block of memory, but rather arrives in
25/// smaller, discrete blocks, allowing for incremental scanning.
26///
27/// # Examples
28///
29/// ```
30/// # use yara_x::{blocks, compile};
31///
32/// let rules = compile(r#"rule test { strings: $a = "abc" condition: $a }"#).unwrap();
33///
34/// let mut scanner = blocks::Scanner::new(&rules);
35///
36/// // Scan the first block of data.
37/// scanner.scan(0, b"xabcy").unwrap();
38///
39/// // Scan a second block of data, which can overlap with the first.
40/// scanner.scan(3, b"cyz").unwrap();
41///
42/// // Finish the scan and get the results.
43/// let results = scanner.finish().unwrap();
44///
45/// assert_eq!(results.matching_rules().len(), 1);
46/// ```
47///
48/// # Limitations of Block Scanning
49///
50/// Block scanning works by analyzing data in chunks rather than as a whole
51/// file. This makes it useful for streaming or memory-constrained scenarios,
52/// but it comes with important limitations compared to standard scanning:
53///
54/// 1) Modules won't work. Parsers for structured formats (e.g., PE, ELF)
55///    require access to the entire file and cannot be applied in block
56///    scanning mode.
57/// 2) Other modules like `hash` won't work either, as they require access to
58///    all the scanned data during the evaluation of the rule's condition,
59///    something that can't be guaranteed in block scanning mode. The hash
60///    functions will return `undefined` when used in a multi-block context.
61/// 3) Built-in functions like `uint8`, `uint16`, `uint32`, etc., have the
62///    same limitation. They also return `undefined` in block scanning mode.
63/// 4) The `filesize` keyword returns `undefined` in block scanning mode.
64/// 5) Patterns won't match across block boundaries. Every match will be
65///    completely contained within one of the blocks.
66///
67/// All these limitations imply that in block scanning mode you should only
68/// use rules that rely on text, hex or regex patterns.
69///
70/// # Data Consistency in Overlapping Blocks
71///
72/// When [`Scanner::scan`] is invoked multiple times with different blocks
73/// that may overlap, the user is responsible for ensuring data consistency.
74/// This means that if the same region of the original data is present in two
75/// or more overlapping blocks, the content of that region must be identical
76/// across all calls to this function.
77///
78/// Generally speaking, the scanner does not verify this consistency and
/// assumes the user provides accurate and consistent data. In debug builds
/// the scanner may try to verify this consistency, but only when some pattern
/// matches in the overlapping region.
pub struct Scanner<'r> {
    /// The compiled rules this scanner was created for. The field is not
    /// read by the scanner's own methods; it ties the scanner to the `'r`
    /// lifetime of the rules.
    _rules: &'r Rules,
    /// Pinned, heap-allocated WASM store whose associated data is the
    /// [`ScanContext`]. The context is stored with `'static` lifetimes and
    /// re-borrowed with the scanner's lifetimes by `scan_context` and
    /// `scan_context_mut`.
    wasm_store: Pin<Box<Store<ScanContext<'static, 'static>>>>,
    /// `true` when the scan context must be reset before scanning the next
    /// block: right after the scanner is created and after every call to
    /// `finish`.
    needs_reset: bool,
    /// Bytes of the matches found so far, keyed by the absolute offset
    /// (block base included) where each match starts.
    snippets: BTreeMap<usize, Vec<u8>>,
}
88
89impl<'r> Scanner<'r> {
90    /// Creates a new block scanner.
91    pub fn new(rules: &'r Rules) -> Scanner<'r> {
92        Scanner {
93            _rules: rules,
94            wasm_store: create_wasm_store_and_ctx(rules),
95            needs_reset: true,
96            snippets: BTreeMap::new(),
97        }
98    }
99}
100impl<'r> Scanner<'r> {
101    /// Scans a block of data.
102    ///
103    /// This method processes a given block of data, searching for patterns
104    /// defined in the YARA rules. The `base` argument specifies the offset
105    /// of the current block within the overall data being scanned. In most
106    /// cases you will want to call this method multiple times, providing a
107    /// different block on each call.
108    ///
109    /// # Arguments
110    ///
111    /// * `base` - The starting offset of the `data` block within overall
112    ///   data being scanned.
113    /// * `data` - The byte slice representing the current block of data to
114    ///   scan.
115    ///
116    /// # Returns
117    ///
118    /// A `Result` indicating success or a `ScanError` if the scan operation
119    /// fails.
120    pub fn scan(
121        &mut self,
122        base: usize,
123        data: &[u8],
124    ) -> Result<&mut Self, ScanError> {
125        // Reset the scanner if needed. This is done before scanning the first
126        // block after the scanner has been created, or when a previous scan
127        // has finished and the scanner is going to be reused.
128        if self.needs_reset {
129            self.scan_context_mut().reset();
130            self.needs_reset = false;
131        }
132        // Even when the scanner is not reset, we must clear unconfirmed matches
133        // between blocks. Otherwise, matches partially detected in one block
134        // could incorrectly be confirmed by data from a different block.
135        //
136        // This prevents matches from spanning multiple blocks — a scenario that
137        // could occur with patterns split into multiple subpatterns, for
138        // example:
139        //
140        // { 01 02 03 [-] 04 05 06}
141        //
142        // In this case, the subpattern `01 02 03` might match in one block, and
143        // `04 05 06` in the next. While supporting cross-block matches is
144        // technically possible, it would be inconsistent with patterns that
145        // cannot span blocks. To maintain a simple, uniform rule — that matches
146        // never cross block boundaries — we clear all unconfirmed matches here.
147        else {
148            self.scan_context_mut().unconfirmed_matches.clear();
149        }
150
151        let ctx = self.scan_context_mut();
152
153        ctx.scan_state = ScanState::ScanningBlock((base, data));
154        ctx.search_for_patterns()?;
155
156        for (_, match_list) in ctx.pattern_matches.matches_per_pattern() {
157            // Here we iterate the matches in order to gather snippets of data
158            // from where the matches occurred. Notice however that we are only
159            // interested in the matches that occurred in the recently scanned
160            // block (those were match.base == base).
161            for match_ in
162                match_list.iter().filter(|match_| match_.base == base)
163            {
164                if let Some(match_data) = data.get(match_.block_range()) {
165                    // Snippets are indexed by their offsets within the scanned
166                    // data. This offset is not relative to the start of the
167                    // memory block, it takes into account the block's base
168                    // offset.
169                    //
170                    // The matching data is stored into the snippets B-tree map.
171                    // If an entry exists for the same offset, it will be replaced
172                    // with the new matching data only if it's larger than the
173                    // existing one.
174                    match self.snippets.entry(match_.range.start) {
175                        Entry::Occupied(mut entry) => {
176                            let snippet = entry.get_mut();
177                            if match_data.len() > snippet.len() {
178                                debug_assert!(match_data.starts_with(snippet));
179                                entry.insert(match_data.to_vec());
180                            } else {
181                                debug_assert!(snippet.starts_with(match_data));
182                            }
183                        }
184                        Entry::Vacant(entry) => {
185                            entry.insert(match_data.to_vec());
186                        }
187                    }
188                } else {
189                    debug_assert!(false)
190                }
191            }
192        }
193
194        Ok(self)
195    }
196
197    /// Finalizes the scanning process.
198    ///
199    /// After all data blocks have been scanned, this method evaluates the
200    /// conditions of the YARA rules and produces the final scan results.
201    pub fn finish(&mut self) -> Result<ScanResults<'_, 'r>, ScanError> {
202        if self.needs_reset {
203            self.scan_context_mut().reset();
204        }
205
206        self.needs_reset = true;
207
208        let ctx = self.scan_context_mut();
209
210        ctx.eval_conditions()?;
211
212        ctx.scan_state = ScanState::Finished(DataSnippets::MultiBlock(
213            mem::take(&mut self.snippets),
214        ));
215
216        Ok(ScanResults::new(ctx))
217    }
218
219    /// Sets the value of a global variable.
220    ///
221    /// The variable must has been previously defined by calling
222    /// [`crate::Compiler::define_global`], and the type it has during the
223    /// definition must match the type of the new value (`T`).
224    ///
225    /// The variable will retain the new value in subsequent scans, unless this
226    /// function is called again for setting a new value.
227    pub fn set_global<T: TryInto<Variable>>(
228        &mut self,
229        ident: &str,
230        value: T,
231    ) -> Result<&mut Self, VariableError>
232    where
233        VariableError: From<<T as TryInto<Variable>>::Error>,
234    {
235        self.scan_context_mut().set_global(ident, value)?;
236        Ok(self)
237    }
238
239    /// Sets a timeout for scan operations.
240    ///
241    /// The scan functions will return an [ScanError::Timeout] once the
242    /// provided timeout duration has elapsed. The scanner will make every
243    /// effort to stop promptly after the designated timeout duration. However,
244    /// in some cases, particularly with rules containing only a few patterns,
245    /// the scanner could potentially continue running for a longer period than
246    /// the specified timeout.
247    pub fn set_timeout(&mut self, timeout: Duration) -> &mut Self {
248        self.scan_context_mut().set_timeout(timeout);
249        self
250    }
251
252    /// Sets the maximum number of matches per pattern.
253    ///
254    /// When some pattern reaches the maximum number of patterns it won't
255    /// produce more matches.
256    pub fn max_matches_per_pattern(&mut self, n: usize) -> &mut Self {
257        self.scan_context_mut().pattern_matches.max_matches_per_pattern(n);
258        self
259    }
260
261    /// Sets a callback that is invoked every time a YARA rule calls the
262    /// `console` module.
263    ///
264    /// The `callback` function is invoked with a string representing the
265    /// message being logged. The function can print the message to stdout,
266    /// append it to a file, etc. If no callback is set these messages are
267    /// ignored.
268    pub fn console_log<F>(&mut self, callback: F) -> &mut Self
269    where
270        F: FnMut(String) + 'r,
271    {
272        self.scan_context_mut().console_log = Some(Box::new(callback));
273        self
274    }
275
276    /// Returns profiling data for the slowest N rules.
277    ///
278    /// The profiling data reflects the cumulative execution time of each rule
279    /// across all scanned files. This information is useful for identifying
280    /// performance bottlenecks. To reset the profiling data and start fresh
281    /// for subsequent scans, use [`crate::Scanner::clear_profiling_data`].
282    #[cfg(feature = "rules-profiling")]
283    pub fn slowest_rules(
284        &self,
285        n: usize,
286    ) -> Vec<crate::scanner::ProfilingData<'_>> {
287        self.scan_context().slowest_rules(n)
288    }
289
290    /// Clears all accumulated profiling data.
291    ///
292    /// This method resets the profiling data collected during rule execution
293    /// across scanned files. Use this to start a new profiling session, ensuring
294    /// the results reflect only the data gathered after this method is called.
295    #[cfg(feature = "rules-profiling")]
296    pub fn clear_profiling_data(&mut self) {
297        self.scan_context_mut().clear_profiling_data()
298    }
299}
300
impl<'r> Scanner<'r> {
    /// Returns a shared reference to the [`ScanContext`] stored as the data
    /// of the WASM store.
    ///
    /// The context is stored with `'static` lifetimes; this helper hands it
    /// out with the scanner's `'r` lifetime instead. Only compiled when the
    /// `rules-profiling` feature is enabled.
    #[cfg(feature = "rules-profiling")]
    #[inline]
    fn scan_context<'a>(&self) -> &ScanContext<'r, 'a> {
        // SAFETY: the transmute changes only the lifetime parameters of the
        // context, not its type.
        // NOTE(review): presumably sound because the `'static` lifetimes are
        // placeholders for data that lives at least as long as `'r` — confirm
        // against `create_wasm_store_and_ctx`.
        unsafe {
            transmute::<&ScanContext<'static, 'static>, &ScanContext<'r, '_>>(
                self.wasm_store.data(),
            )
        }
    }
    /// Returns a mutable reference to the [`ScanContext`] stored as the data
    /// of the WASM store.
    ///
    /// The lifetime `'a` of the returned reference is chosen by the caller
    /// and is not tied to the borrow of `self`. This is what allows callers
    /// to keep using the context while accessing other fields of the
    /// scanner, but it also means the borrow checker does not prevent two
    /// live mutable references to the context; callers must not keep more
    /// than one at a time.
    #[inline]
    fn scan_context_mut<'a>(&mut self) -> &'a mut ScanContext<'r, 'a> {
        // SAFETY: the transmute changes only the lifetime parameters of the
        // reference and of the context, not the type itself.
        // NOTE(review): presumably relies on the store being pinned and owned
        // by the scanner, so the context is never moved while the scanner
        // lives — confirm that no returned reference outlives the scanner.
        unsafe {
            transmute::<
                &mut ScanContext<'static, 'static>,
                &mut ScanContext<'r, 'a>,
            >(self.wasm_store.data_mut())
        }
    }
}
321
322impl<'r> From<crate::scanner::Scanner<'r>> for Scanner<'r> {
323    fn from(scanner: crate::scanner::Scanner<'r>) -> Self {
324        Self {
325            _rules: scanner._rules,
326            wasm_store: scanner.wasm_store,
327            needs_reset: true,
328            snippets: Default::default(),
329        }
330    }
331}
332
#[cfg(test)]
mod tests {
    use crate::scanner::blocks::Scanner;
    use crate::{compile, Compiler};
    use std::time::Duration;

    /// A literal pattern is found independently in two non-contiguous
    /// blocks, and each match reports its absolute offset and its data.
    #[test]
    fn block_scanner_1() {
        let source = r#"
            rule test { strings: $a = "ipsum" condition: $a }"#;
        let rules = compile(source).unwrap();

        let mut scanner = Scanner::new(&rules);

        scanner.scan(0, b"Lorem ipsum").unwrap();
        scanner.scan(1000, b"dolor ipsum sit amet").unwrap();
        let results = scanner.finish().unwrap();

        assert_eq!(results.matching_rules().len(), 1);

        let matching_rule = results.matching_rules().next().unwrap();
        let first_pattern = matching_rule.patterns().next().unwrap();
        let mut found = first_pattern.matches();

        let first = found.next().unwrap();
        assert_eq!(first.data(), b"ipsum".as_slice());
        assert_eq!(first.range(), 6..11);

        let second = found.next().unwrap();
        assert_eq!(second.data(), b"ipsum".as_slice());
        assert_eq!(second.range(), 1006..1011);
    }

    /// A regexp that could span both blocks only matches inside the block
    /// that contains the whole match.
    #[test]
    fn block_scanner_2() {
        let source = r#"
            rule test { strings: $a = /ipsum.*amet/s condition: $a }"#;
        let rules = compile(source).unwrap();

        let mut scanner = Scanner::new(&rules);

        scanner.scan(0, b"Lorem ipsum").unwrap();
        scanner.scan(1000, b"dolor ipsum sit amet").unwrap();
        let results = scanner.finish().unwrap();

        let matching_rule = results.matching_rules().next().unwrap();
        let first_pattern = matching_rule.patterns().next().unwrap();
        let mut found = first_pattern.matches();

        let only = found.next().unwrap();
        assert_eq!(only.data(), b"ipsum sit amet".as_slice());
        assert_eq!(only.range(), 1006..1020);
    }

    /// The `in` operator works with offsets that take the block's base
    /// into account.
    #[test]
    fn block_scanner_match_in_range() {
        let source = r#"
            rule test { strings: $a = "ipsum" condition: $a in (1003..1008) }"#;
        let rules = compile(source).unwrap();

        let mut scanner = Scanner::new(&rules);

        scanner.scan(0, b"Lorem ipsum").unwrap();
        scanner.scan(1000, b"dolor ipsum sit amet").unwrap();
        let results = scanner.finish().unwrap();

        assert_eq!(results.matching_rules().len(), 1);

        let matching_rule = results.matching_rules().next().unwrap();
        let first_pattern = matching_rule.patterns().next().unwrap();
        let mut found = first_pattern.matches();

        let first = found.next().unwrap();
        assert_eq!(first.data(), b"ipsum".as_slice());
        assert_eq!(first.range(), 6..11);

        let second = found.next().unwrap();
        assert_eq!(second.data(), b"ipsum".as_slice());
        assert_eq!(second.range(), 1006..1011);
    }

    /// The `at` operator works with offsets that take the block's base
    /// into account.
    #[test]
    fn block_scanner_match_at_offset() {
        let source = r#"
            rule test { strings: $a = "ipsum" condition: $a at 1006 }"#;
        let rules = compile(source).unwrap();

        let mut scanner = Scanner::new(&rules);

        scanner.scan(1000, b"dolor ipsum sit amet").unwrap();
        let results = scanner.finish().unwrap();

        assert_eq!(results.matching_rules().len(), 1);
    }

    /// Global variables set on the block scanner are visible to the rule
    /// conditions, even when no block is scanned.
    #[test]
    fn block_scanner_global() {
        let mut compiler = Compiler::new();

        compiler.define_global("foo", "").unwrap();
        compiler
            .add_source(
                r#"
                rule test { condition: foo == "foo" }"#,
            )
            .unwrap();

        let rules = compiler.build();
        let mut scanner = Scanner::new(&rules);

        scanner.set_global("foo", "foo").unwrap();

        let results = scanner.finish().unwrap();
        assert_eq!(results.matching_rules().len(), 1);
    }

    /// A slow condition makes `finish` fail with a timeout error.
    #[test]
    fn block_scanner_timeout() {
        let source = r#"
            rule slow {
                condition: 
                    for any i in (0..1000000000) : (
                         uint8(i) == 0xCC
                    )
            }"#;
        let rules = compile(source).unwrap();

        let mut scanner = Scanner::new(&rules);
        let err = scanner
            .set_timeout(Duration::from_secs(1))
            .finish()
            .unwrap_err();

        assert_eq!(err.to_string(), "timeout");
    }

    /// `filesize` is undefined in block scanning mode.
    #[test]
    fn block_scanner_filesize() {
        let source = r#"
            rule filesize_undefined {
                condition: 
                    not defined filesize 
            }"#;
        let rules = compile(source).unwrap();

        let mut scanner = Scanner::new(&rules);
        let results = scanner.finish().unwrap();

        assert_eq!(results.matching_rules().len(), 1);
    }
}