Skip to main content

yara_x/scanner/
blocks.rs

1/*! Scanner for scanning data in blocks.
2
3This scanner is designed for scenarios where the data to be scanned is not
4available as a single contiguous block of memory, but rather arrives in
5smaller, discrete blocks, allowing for incremental scanning.
6*/
7use std::cmp;
8use std::collections::BTreeMap;
9use std::collections::btree_map::Entry;
10use std::mem;
11use std::mem::transmute;
12use std::pin::Pin;
13use std::time::Duration;
14
15use crate::errors::VariableError;
16use crate::scanner::context::{ScanState, create_wasm_store_and_ctx};
17use crate::scanner::{DataSnippets, ScanContext};
18use crate::wasm::runtime::Store;
19use crate::{Rules, ScanError, ScanResults, Variable};
20
21/// Scans data in blocks
22///
23/// This scanner is designed for scenarios where the data to be scanned is not
24/// available as a single contiguous block of memory, but rather arrives in
25/// smaller, discrete blocks, allowing for incremental scanning.
26///
27/// # Examples
28///
29/// ```
30/// # use yara_x::{blocks, compile};
31///
32/// let rules = compile(r#"rule test { strings: $a = "abc" condition: $a }"#).unwrap();
33///
34/// let mut scanner = blocks::Scanner::new(&rules);
35///
36/// // Scan the first block of data.
37/// scanner.scan(0, b"xabcy").unwrap();
38///
39/// // Scan a second block of data, which can overlap with the first.
40/// scanner.scan(3, b"cyz").unwrap();
41///
42/// // Finish the scan and get the results.
43/// let results = scanner.finish().unwrap();
44///
45/// assert_eq!(results.matching_rules().len(), 1);
46/// ```
47///
48/// # Limitations of Block Scanning
49///
50/// Block scanning works by analyzing data in chunks rather than as a whole
51/// file. This makes it useful for streaming or memory-constrained scenarios,
52/// but it comes with important limitations compared to standard scanning:
53///
54/// 1) Modules won't work. Parsers for structured formats (e.g., PE, ELF)
55///    require access to the entire file and cannot be applied in block
56///    scanning mode.
57/// 2) Other modules like `hash` won't work either, as they require access to
58///    all the scanned data during the evaluation of the rule's condition,
59///    something that can't be guaranteed in block scanning mode. The hash
60///    functions will return `undefined` when used in a multi-block context.
61/// 3) Built-in functions like `uint8`, `uint16`, `uint32`, etc., have the
62///    same limitation. They also return `undefined` in block scanning mode.
63/// 4) The `filesize` keyword returns `undefined` in block scanning mode.
64/// 5) Patterns won't match across block boundaries. Every match will be
65///    completely contained within one of the blocks.
66///
67/// All these limitations imply that in block scanning mode you should only
68/// use rules that rely on text, hex or regex patterns.
69///
70/// # Data Consistency in Overlapping Blocks
71///
72/// When [`Scanner::scan`] is invoked multiple times with different blocks
73/// that may overlap, the user is responsible for ensuring data consistency.
74/// This means that if the same region of the original data is present in two
75/// or more overlapping blocks, the content of that region must be identical
76/// across all calls to this function.
77///
/// Generally speaking, the scanner does not verify this consistency and
/// assumes the user provides accurate and consistent data. In debug builds
/// the scanner may try to verify this consistency, but only when some pattern
/// matches in the overlapping region.
pub struct Scanner<'r> {
    /// Rules this scanner was created for. The field is stored but is not
    /// read by any code visible in this file.
    _rules: &'r Rules,
    /// Pinned WASM store that holds the [`ScanContext`] used while scanning.
    /// The context is declared with `'static` lifetimes here and is
    /// re-borrowed with shorter lifetimes by `scan_context` and
    /// `scan_context_mut`.
    wasm_store: Pin<Box<Store<ScanContext<'static, 'static>>>>,
    /// `true` when the scan context must be reset before the next block is
    /// scanned: right after the scanner is created, and after `finish` has
    /// been called.
    needs_reset: bool,
    /// Data snippets gathered around the matches found so far, keyed by the
    /// offset (within the overall data) where each snippet starts.
    snippets: BTreeMap<usize, Vec<u8>>,
}
88
89impl<'r> Scanner<'r> {
90    /// Creates a new block scanner.
91    pub fn new(rules: &'r Rules) -> Scanner<'r> {
92        Scanner {
93            _rules: rules,
94            wasm_store: create_wasm_store_and_ctx(rules),
95            needs_reset: true,
96            snippets: BTreeMap::new(),
97        }
98    }
99
100    /// Sets the context size for matches.
101    ///
102    /// This specifies how many bytes at the left and right of each match will
103    /// be reported by [`crate::Match::data_with_context`]. By default, the
104    /// match context size is 0, which means that [`crate::Match::data_with_context`]
105    /// will return exactly the same data as [`crate::Match::data`].
106    pub fn match_context_size(&mut self, size: usize) -> &mut Self {
107        self.scan_context_mut().match_context_size = size;
108        self
109    }
110}
111impl<'r> Scanner<'r> {
112    /// Scans a block of data.
113    ///
114    /// This method processes a given block of data, searching for patterns
115    /// defined in the YARA rules. The `base` argument specifies the offset
116    /// of the current block within the overall data being scanned. In most
117    /// cases you will want to call this method multiple times, providing a
118    /// different block on each call.
119    ///
120    /// # Arguments
121    ///
122    /// * `base` - The starting offset of the `data` block within overall
123    ///   data being scanned.
124    /// * `data` - The byte slice representing the current block of data to
125    ///   scan.
126    ///
127    /// # Returns
128    ///
129    /// A `Result` indicating success or a `ScanError` if the scan operation
130    /// fails.
131    pub fn scan(
132        &mut self,
133        base: usize,
134        data: &[u8],
135    ) -> Result<&mut Self, ScanError> {
136        // Reset the scanner if needed. This is done before scanning the first
137        // block after the scanner has been created, or when a previous scan
138        // has finished and the scanner is going to be reused.
139        if self.needs_reset {
140            self.scan_context_mut().reset();
141            self.needs_reset = false;
142        }
143        // Even when the scanner is not reset, we must clear unconfirmed matches
144        // between blocks. Otherwise, matches partially detected in one block
145        // could incorrectly be confirmed by data from a different block.
146        //
147        // This prevents matches from spanning multiple blocks — a scenario that
148        // could occur with patterns split into multiple subpatterns, for
149        // example:
150        //
151        // { 01 02 03 [-] 04 05 06}
152        //
153        // In this case, the subpattern `01 02 03` might match in one block, and
154        // `04 05 06` in the next. While supporting cross-block matches is
155        // technically possible, it would be inconsistent with patterns that
156        // cannot span blocks. To maintain a simple, uniform rule — that matches
157        // never cross block boundaries — we clear all unconfirmed matches here.
158        else {
159            self.scan_context_mut().tracker.unconfirmed_matches.clear();
160        }
161
162        let ctx = self.scan_context_mut();
163
164        ctx.scan_state = ScanState::ScanningBlock((base, data));
165
166        ctx.set_pattern_search_done(false);
167        ctx.search_for_patterns()?;
168
169        ctx.scan_state = ScanState::Idle;
170
171        for (_, match_list) in
172            ctx.tracker.pattern_matches.matches_per_pattern()
173        {
174            // Here we iterate the matches in order to gather snippets of data
175            // from where the matches occurred. Notice however that we are only
176            // interested in the matches that occurred in the recently scanned
177            // block (those were match.base == base).
178            for match_ in
179                match_list.iter().filter(|match_| match_.base == base)
180            {
181                let context_start = cmp::max(
182                    match_.range.start.saturating_sub(ctx.match_context_size),
183                    base,
184                );
185
186                let context_end = cmp::min(
187                    match_.range.end + ctx.match_context_size,
188                    base + data.len(),
189                );
190
191                let block_start = context_start - base;
192                let block_end = context_end - base;
193
194                if let Some(context_data) = data.get(block_start..block_end) {
195                    // Snippets are indexed by the offset where the context starts.
196                    match self.snippets.entry(context_start) {
197                        Entry::Occupied(mut entry) => {
198                            let snippet = entry.get_mut();
199                            if context_data.len() > snippet.len() {
200                                entry.insert(context_data.to_vec());
201                            }
202                        }
203                        Entry::Vacant(entry) => {
204                            entry.insert(context_data.to_vec());
205                        }
206                    }
207                } else {
208                    debug_assert!(false)
209                }
210            }
211        }
212
213        Ok(self)
214    }
215
216    /// Finalizes the scanning process.
217    ///
218    /// After all data blocks have been scanned, this method evaluates the
219    /// conditions of the YARA rules and produces the final scan results.
220    pub fn finish(&mut self) -> Result<ScanResults<'_, 'r>, ScanError> {
221        if self.needs_reset {
222            self.scan_context_mut().reset();
223        }
224
225        self.needs_reset = true;
226
227        let ctx = self.scan_context_mut();
228
229        ctx.eval_conditions()?;
230
231        ctx.scan_state = ScanState::Finished(DataSnippets::MultiBlock(
232            mem::take(&mut self.snippets),
233        ));
234
235        Ok(ScanResults::new(ctx))
236    }
237
238    /// Sets the value of a global variable.
239    ///
240    /// The variable must has been previously defined by calling
241    /// [`crate::Compiler::define_global`], and the type it has during the
242    /// definition must match the type of the new value (`T`).
243    ///
244    /// The variable will retain the new value in subsequent scans, unless this
245    /// function is called again for setting a new value.
246    pub fn set_global<T: TryInto<Variable>>(
247        &mut self,
248        ident: &str,
249        value: T,
250    ) -> Result<&mut Self, VariableError>
251    where
252        VariableError: From<<T as TryInto<Variable>>::Error>,
253    {
254        self.scan_context_mut().set_global(ident, value)?;
255        Ok(self)
256    }
257
258    /// Sets a timeout for scan operations.
259    ///
260    /// The scan functions will return an [ScanError::Timeout] once the
261    /// provided timeout duration has elapsed. The scanner will make every
262    /// effort to stop promptly after the designated timeout duration. However,
263    /// in some cases, particularly with rules containing only a few patterns,
264    /// the scanner could potentially continue running for a longer period than
265    /// the specified timeout.
266    pub fn set_timeout(&mut self, timeout: Duration) -> &mut Self {
267        self.scan_context_mut().set_timeout(timeout);
268        self
269    }
270
271    /// Sets the maximum number of matches per pattern.
272    ///
273    /// When some pattern reaches the maximum number of patterns it won't
274    /// produce more matches.
275    pub fn max_matches_per_pattern(&mut self, n: usize) -> &mut Self {
276        self.scan_context_mut()
277            .tracker
278            .pattern_matches
279            .max_matches_per_pattern(n);
280        self
281    }
282
283    /// Enables or disables fast scan mode.
284    ///
285    /// In fast scan mode, the scanner avoids tracking matches for patterns
286    /// when it is not necessary (e.g. when a rule condition only performs a
287    /// simple boolean check `$a`).
288    ///
289    /// Note that using fast scan mode implies that not all matches will be
290    /// reported. For instance, when iterating matches using [`ScanResults`],
291    /// you won't get all occurrences of the pattern in the file, only the first
292    /// one.
293    pub fn fast_scan(&mut self, yes: bool) -> &mut Self {
294        self.scan_context_mut().tracker.fast_scan = yes;
295        self
296    }
297
298    /// Sets a callback that is invoked every time a YARA rule calls the
299    /// `console` module.
300    ///
301    /// The `callback` function is invoked with a string representing the
302    /// message being logged. The function can print the message to stdout,
303    /// append it to a file, etc. If no callback is set these messages are
304    /// ignored.
305    pub fn console_log<F>(&mut self, callback: F) -> &mut Self
306    where
307        F: FnMut(String) + 'r,
308    {
309        self.scan_context_mut().console_log = Some(Box::new(callback));
310        self
311    }
312
313    /// Returns profiling data for the slowest N rules.
314    ///
315    /// The profiling data reflects the cumulative execution time of each rule
316    /// across all scanned files. This information is useful for identifying
317    /// performance bottlenecks. To reset the profiling data and start fresh
318    /// for subsequent scans, use [`crate::Scanner::clear_profiling_data`].
319    #[cfg(feature = "rules-profiling")]
320    pub fn slowest_rules(
321        &self,
322        n: usize,
323    ) -> Vec<crate::scanner::ProfilingData<'_>> {
324        self.scan_context().slowest_rules(n)
325    }
326
327    /// Clears all accumulated profiling data.
328    ///
329    /// This method resets the profiling data collected during rule execution
330    /// across scanned files. Use this to start a new profiling session, ensuring
331    /// the results reflect only the data gathered after this method is called.
332    #[cfg(feature = "rules-profiling")]
333    pub fn clear_profiling_data(&mut self) {
334        self.scan_context_mut().clear_profiling_data()
335    }
336}
337
impl<'r> Scanner<'r> {
    /// Returns a shared reference to the scan context stored in the WASM
    /// store.
    ///
    /// The context is stored as `ScanContext<'static, 'static>`; this helper
    /// re-types the reference so that the context's lifetime parameters
    /// become `'r` and a caller-chosen `'a`.
    #[cfg(feature = "rules-profiling")]
    #[inline]
    fn scan_context<'a>(&self) -> &ScanContext<'r, 'a> {
        // SAFETY: the transmute only changes the lifetime parameters of the
        // referenced `ScanContext`, not its type or layout.
        // NOTE(review): this presumably relies on the context having been
        // created from rules that live for `'r` — confirm against
        // `create_wasm_store_and_ctx`.
        unsafe {
            transmute::<&ScanContext<'static, 'static>, &ScanContext<'r, '_>>(
                self.wasm_store.data(),
            )
        }
    }
    /// Returns a mutable reference to the scan context stored in the WASM
    /// store.
    ///
    /// Both the returned reference and the context's second lifetime
    /// parameter use `'a`, which is chosen by the caller and is not tied to
    /// the borrow of `self`. This is what allows callers in this file to keep
    /// using `self` while holding the returned context.
    #[inline]
    fn scan_context_mut<'a>(&mut self) -> &'a mut ScanContext<'r, 'a> {
        // SAFETY: the transmute only changes the lifetime parameters of the
        // reference and of the referenced `ScanContext`, not its type or
        // layout.
        // NOTE(review): since `'a` is unbounded, the compiler cannot check
        // that the returned reference does not outlive the scanner, nor that
        // data stored in the context (e.g. the block being scanned) outlives
        // its use — callers must uphold this themselves.
        unsafe {
            transmute::<
                &mut ScanContext<'static, 'static>,
                &mut ScanContext<'r, 'a>,
            >(self.wasm_store.data_mut())
        }
    }
}
358
359impl<'r> From<crate::scanner::Scanner<'r>> for Scanner<'r> {
360    fn from(scanner: crate::scanner::Scanner<'r>) -> Self {
361        Self {
362            _rules: scanner._rules,
363            wasm_store: scanner.wasm_store,
364            needs_reset: true,
365            snippets: Default::default(),
366        }
367    }
368}
369
370#[cfg(test)]
371mod tests {
372    use crate::scanner::blocks::Scanner;
373    use crate::{Compiler, compile};
374    use std::time::Duration;
375
376    #[test]
377    fn block_scanner_1() {
378        let rules = compile(
379            r#"
380            rule test { strings: $a = "ipsum" condition: $a }"#,
381        )
382        .unwrap();
383
384        let mut scanner = Scanner::new(&rules);
385
386        let results = scanner
387            .scan(0, b"Lorem ipsum")
388            .unwrap()
389            .scan(1000, b"dolor ipsum sit amet")
390            .unwrap()
391            .finish()
392            .unwrap();
393
394        assert_eq!(results.matching_rules().len(), 1);
395
396        let rule = results.matching_rules().next().unwrap();
397        let pattern = rule.patterns().next().unwrap();
398        let mut matches = pattern.matches();
399
400        let match1 = matches.next().unwrap();
401        assert_eq!(match1.data(), b"ipsum".as_slice());
402        assert_eq!(match1.range(), 6..11);
403
404        let match2 = matches.next().unwrap();
405        assert_eq!(match2.data(), b"ipsum".as_slice());
406        assert_eq!(match2.range(), 1006..1011);
407    }
408
409    #[test]
410    fn block_scanner_context() {
411        let rules = compile(
412            r#"
413            rule test { strings: $a = "ipsum" condition: $a }"#,
414        )
415        .unwrap();
416
417        let mut scanner = Scanner::new(&rules);
418
419        let results = scanner
420            .match_context_size(5)
421            .scan(0, b"Lorem ipsum sit amet")
422            .unwrap()
423            .scan(1000, b"dolor ipsum sit amet")
424            .unwrap()
425            .finish()
426            .unwrap();
427
428        assert_eq!(results.matching_rules().len(), 1);
429
430        let rule = results.matching_rules().next().unwrap();
431        let pattern = rule.patterns().next().unwrap();
432        let mut matches = pattern.matches();
433
434        let match1 = matches.next().unwrap();
435        let (data1, range1) = match1.data_with_context();
436        assert_eq!(data1, b"orem ipsum sit ".as_slice());
437        assert_eq!(range1, 5..10);
438
439        let match2 = matches.next().unwrap();
440        let (data2, range2) = match2.data_with_context();
441        assert_eq!(data2, b"olor ipsum sit ".as_slice());
442        assert_eq!(range2, 5..10);
443    }
444
445    #[test]
446    fn block_scanner_2() {
447        let rules = compile(
448            r#"
449            rule test { strings: $a = /ipsum.*amet/s condition: $a }"#,
450        )
451        .unwrap();
452
453        let mut scanner = Scanner::new(&rules);
454
455        let results = scanner
456            .scan(0, b"Lorem ipsum")
457            .unwrap()
458            .scan(1000, b"dolor ipsum sit amet")
459            .unwrap()
460            .finish()
461            .unwrap();
462
463        let rule = results.matching_rules().next().unwrap();
464        let pattern = rule.patterns().next().unwrap();
465        let mut matches = pattern.matches();
466
467        let match_ = matches.next().unwrap();
468        assert_eq!(match_.data(), b"ipsum sit amet".as_slice());
469        assert_eq!(match_.range(), 1006..1020);
470    }
471
472    #[test]
473    fn block_scanner_match_in_range() {
474        let rules = compile(
475            r#"
476            rule test { strings: $a = "ipsum" condition: $a in (1003..1008) }"#,
477        )
478        .unwrap();
479
480        let mut scanner = Scanner::new(&rules);
481
482        let results = scanner
483            .scan(0, b"Lorem ipsum")
484            .unwrap()
485            .scan(1000, b"dolor ipsum sit amet")
486            .unwrap()
487            .finish()
488            .unwrap();
489
490        assert_eq!(results.matching_rules().len(), 1);
491
492        let rule = results.matching_rules().next().unwrap();
493        let pattern = rule.patterns().next().unwrap();
494        let mut matches = pattern.matches();
495
496        let match1 = matches.next().unwrap();
497        assert_eq!(match1.data(), b"ipsum".as_slice());
498        assert_eq!(match1.range(), 6..11);
499
500        let match2 = matches.next().unwrap();
501        assert_eq!(match2.data(), b"ipsum".as_slice());
502        assert_eq!(match2.range(), 1006..1011);
503    }
504
505    #[test]
506    fn block_scanner_match_at_offset() {
507        let rules = compile(
508            r#"
509            rule test { strings: $a = "ipsum" condition: $a at 1006 }"#,
510        )
511        .unwrap();
512
513        let mut scanner = Scanner::new(&rules);
514
515        let results = scanner
516            .scan(1000, b"dolor ipsum sit amet")
517            .unwrap()
518            .finish()
519            .unwrap();
520
521        assert_eq!(results.matching_rules().len(), 1);
522    }
523
524    #[test]
525    fn block_scanner_global() {
526        let mut compiler = Compiler::new();
527
528        compiler
529            .define_global("foo", "")
530            .unwrap()
531            .add_source(
532                r#"
533                rule test { condition: foo == "foo" }"#,
534            )
535            .unwrap();
536
537        let rules = compiler.build();
538        let mut scanner = Scanner::new(&rules);
539        scanner.set_global("foo", "foo").unwrap();
540        let results = scanner.finish().unwrap();
541        assert_eq!(results.matching_rules().len(), 1);
542    }
543
544    #[test]
545    fn block_scanner_timeout() {
546        let rules = compile(
547            r#"
548            rule slow {
549                condition: 
550                    for any i in (0..1000000000) : (
551                         uint8(i) == 0xCC
552                    )
553            }"#,
554        )
555        .unwrap();
556
557        let mut scanner = Scanner::new(&rules);
558        scanner.set_timeout(Duration::from_secs(1));
559        let err = scanner.finish().unwrap_err();
560        assert_eq!(err.to_string(), "timeout");
561    }
562
563    #[test]
564    fn block_scanner_filesize() {
565        let rules = compile(
566            r#"
567            rule filesize_undefined {
568                condition: 
569                    not defined filesize 
570            }"#,
571        )
572        .unwrap();
573
574        let mut scanner = Scanner::new(&rules);
575        let results = scanner.finish().unwrap();
576
577        assert_eq!(results.matching_rules().len(), 1);
578    }
579
580    #[test]
581    fn block_scanner_fast_scan() {
582        let rules = compile(
583            r#"
584            rule test {
585                strings:
586                    $a = "foo"
587                condition:
588                    $a
589            }"#,
590        )
591        .unwrap();
592
593        let mut scanner = Scanner::new(&rules);
594        let results = scanner
595            .fast_scan(true)
596            .scan(0, b"foofoofoo")
597            .unwrap()
598            .finish()
599            .unwrap();
600
601        assert_eq!(results.matching_rules().len(), 1);
602
603        let rule = results.matching_rules().next().unwrap();
604        let pattern = rule.patterns().next().unwrap();
605        let mut matches = pattern.matches();
606
607        // Only a single match is returned because of the fast scan mode!
608        let match1 = matches.next().unwrap();
609        assert_eq!(match1.data(), b"foo".as_slice());
610        assert_eq!(match1.range(), 0..3);
611
612        assert!(matches.next().is_none());
613    }
614}