yara_x/scanner/blocks.rs
1/*! Scanner for scanning data in blocks.
2
3This scanner is designed for scenarios where the data to be scanned is not
4available as a single contiguous block of memory, but rather arrives in
5smaller, discrete blocks, allowing for incremental scanning.
6*/
7use std::cmp;
8use std::collections::BTreeMap;
9use std::collections::btree_map::Entry;
10use std::mem;
11use std::mem::transmute;
12use std::pin::Pin;
13use std::time::Duration;
14
15use crate::errors::VariableError;
16use crate::scanner::context::{ScanState, create_wasm_store_and_ctx};
17use crate::scanner::{DataSnippets, ScanContext};
18use crate::wasm::runtime::Store;
19use crate::{Rules, ScanError, ScanResults, Variable};
20
21/// Scans data in blocks
22///
23/// This scanner is designed for scenarios where the data to be scanned is not
24/// available as a single contiguous block of memory, but rather arrives in
25/// smaller, discrete blocks, allowing for incremental scanning.
26///
27/// # Examples
28///
29/// ```
30/// # use yara_x::{blocks, compile};
31///
32/// let rules = compile(r#"rule test { strings: $a = "abc" condition: $a }"#).unwrap();
33///
34/// let mut scanner = blocks::Scanner::new(&rules);
35///
36/// // Scan the first block of data.
37/// scanner.scan(0, b"xabcy").unwrap();
38///
39/// // Scan a second block of data, which can overlap with the first.
40/// scanner.scan(3, b"cyz").unwrap();
41///
42/// // Finish the scan and get the results.
43/// let results = scanner.finish().unwrap();
44///
45/// assert_eq!(results.matching_rules().len(), 1);
46/// ```
47///
48/// # Limitations of Block Scanning
49///
50/// Block scanning works by analyzing data in chunks rather than as a whole
51/// file. This makes it useful for streaming or memory-constrained scenarios,
52/// but it comes with important limitations compared to standard scanning:
53///
54/// 1) Modules won't work. Parsers for structured formats (e.g., PE, ELF)
55/// require access to the entire file and cannot be applied in block
56/// scanning mode.
57/// 2) Other modules like `hash` won't work either, as they require access to
58/// all the scanned data during the evaluation of the rule's condition,
59/// something that can't be guaranteed in block scanning mode. The hash
60/// functions will return `undefined` when used in a multi-block context.
61/// 3) Built-in functions like `uint8`, `uint16`, `uint32`, etc., have the
62/// same limitation. They also return `undefined` in block scanning mode.
63/// 4) The `filesize` keyword returns `undefined` in block scanning mode.
64/// 5) Patterns won't match across block boundaries. Every match will be
65/// completely contained within one of the blocks.
66///
67/// All these limitations imply that in block scanning mode you should only
68/// use rules that rely on text, hex or regex patterns.
69///
70/// # Data Consistency in Overlapping Blocks
71///
72/// When [`Scanner::scan`] is invoked multiple times with different blocks
73/// that may overlap, the user is responsible for ensuring data consistency.
74/// This means that if the same region of the original data is present in two
75/// or more overlapping blocks, the content of that region must be identical
76/// across all calls to this function.
77///
/// Generally speaking, the scanner does not verify this consistency and
/// assumes the user provides accurate and consistent data. In debug builds
/// the scanner may try to verify this consistency, but only when some pattern
/// matches in the overlapping region.
pub struct Scanner<'r> {
    // Reference to the compiled rules this scanner was created for. It is
    // not read by any code visible in this file; presumably kept so that the
    // scanner cannot outlive the rules (NOTE(review): confirm).
    _rules: &'r Rules,
    // Pinned, heap-allocated WASM store that owns the `ScanContext`. The
    // context is declared with `'static` lifetimes here and is re-labelled
    // with shorter lifetimes by `scan_context` / `scan_context_mut`.
    wasm_store: Pin<Box<Store<ScanContext<'static, 'static>>>>,
    // `true` when the scan context must be reset before the next block is
    // scanned: right after construction and after every call to `finish`.
    needs_reset: bool,
    // Data surrounding each match, keyed by the absolute offset where the
    // snippet starts. Filled by `scan` and handed to the results by `finish`.
    snippets: BTreeMap<usize, Vec<u8>>,
}
88
89impl<'r> Scanner<'r> {
90 /// Creates a new block scanner.
91 pub fn new(rules: &'r Rules) -> Scanner<'r> {
92 Scanner {
93 _rules: rules,
94 wasm_store: create_wasm_store_and_ctx(rules),
95 needs_reset: true,
96 snippets: BTreeMap::new(),
97 }
98 }
99
100 /// Sets the context size for matches.
101 ///
102 /// This specifies how many bytes at the left and right of each match will
103 /// be reported by [`crate::Match::data_with_context`]. By default, the
104 /// match context size is 0, which means that [`crate::Match::data_with_context`]
105 /// will return exactly the same data as [`crate::Match::data`].
106 pub fn match_context_size(&mut self, size: usize) -> &mut Self {
107 self.scan_context_mut().match_context_size = size;
108 self
109 }
110}
111impl<'r> Scanner<'r> {
112 /// Scans a block of data.
113 ///
114 /// This method processes a given block of data, searching for patterns
115 /// defined in the YARA rules. The `base` argument specifies the offset
116 /// of the current block within the overall data being scanned. In most
117 /// cases you will want to call this method multiple times, providing a
118 /// different block on each call.
119 ///
120 /// # Arguments
121 ///
122 /// * `base` - The starting offset of the `data` block within overall
123 /// data being scanned.
124 /// * `data` - The byte slice representing the current block of data to
125 /// scan.
126 ///
127 /// # Returns
128 ///
129 /// A `Result` indicating success or a `ScanError` if the scan operation
130 /// fails.
131 pub fn scan(
132 &mut self,
133 base: usize,
134 data: &[u8],
135 ) -> Result<&mut Self, ScanError> {
136 // Reset the scanner if needed. This is done before scanning the first
137 // block after the scanner has been created, or when a previous scan
138 // has finished and the scanner is going to be reused.
139 if self.needs_reset {
140 self.scan_context_mut().reset();
141 self.needs_reset = false;
142 }
143 // Even when the scanner is not reset, we must clear unconfirmed matches
144 // between blocks. Otherwise, matches partially detected in one block
145 // could incorrectly be confirmed by data from a different block.
146 //
147 // This prevents matches from spanning multiple blocks — a scenario that
148 // could occur with patterns split into multiple subpatterns, for
149 // example:
150 //
151 // { 01 02 03 [-] 04 05 06}
152 //
153 // In this case, the subpattern `01 02 03` might match in one block, and
154 // `04 05 06` in the next. While supporting cross-block matches is
155 // technically possible, it would be inconsistent with patterns that
156 // cannot span blocks. To maintain a simple, uniform rule — that matches
157 // never cross block boundaries — we clear all unconfirmed matches here.
158 else {
159 self.scan_context_mut().tracker.unconfirmed_matches.clear();
160 }
161
162 let ctx = self.scan_context_mut();
163
164 ctx.scan_state = ScanState::ScanningBlock((base, data));
165
166 ctx.set_pattern_search_done(false);
167 ctx.search_for_patterns()?;
168
169 ctx.scan_state = ScanState::Idle;
170
171 for (_, match_list) in
172 ctx.tracker.pattern_matches.matches_per_pattern()
173 {
174 // Here we iterate the matches in order to gather snippets of data
175 // from where the matches occurred. Notice however that we are only
176 // interested in the matches that occurred in the recently scanned
177 // block (those were match.base == base).
178 for match_ in
179 match_list.iter().filter(|match_| match_.base == base)
180 {
181 let context_start = cmp::max(
182 match_.range.start.saturating_sub(ctx.match_context_size),
183 base,
184 );
185
186 let context_end = cmp::min(
187 match_.range.end + ctx.match_context_size,
188 base + data.len(),
189 );
190
191 let block_start = context_start - base;
192 let block_end = context_end - base;
193
194 if let Some(context_data) = data.get(block_start..block_end) {
195 // Snippets are indexed by the offset where the context starts.
196 match self.snippets.entry(context_start) {
197 Entry::Occupied(mut entry) => {
198 let snippet = entry.get_mut();
199 if context_data.len() > snippet.len() {
200 entry.insert(context_data.to_vec());
201 }
202 }
203 Entry::Vacant(entry) => {
204 entry.insert(context_data.to_vec());
205 }
206 }
207 } else {
208 debug_assert!(false)
209 }
210 }
211 }
212
213 Ok(self)
214 }
215
216 /// Finalizes the scanning process.
217 ///
218 /// After all data blocks have been scanned, this method evaluates the
219 /// conditions of the YARA rules and produces the final scan results.
220 pub fn finish(&mut self) -> Result<ScanResults<'_, 'r>, ScanError> {
221 if self.needs_reset {
222 self.scan_context_mut().reset();
223 }
224
225 self.needs_reset = true;
226
227 let ctx = self.scan_context_mut();
228
229 ctx.eval_conditions()?;
230
231 ctx.scan_state = ScanState::Finished(DataSnippets::MultiBlock(
232 mem::take(&mut self.snippets),
233 ));
234
235 Ok(ScanResults::new(ctx))
236 }
237
238 /// Sets the value of a global variable.
239 ///
240 /// The variable must has been previously defined by calling
241 /// [`crate::Compiler::define_global`], and the type it has during the
242 /// definition must match the type of the new value (`T`).
243 ///
244 /// The variable will retain the new value in subsequent scans, unless this
245 /// function is called again for setting a new value.
246 pub fn set_global<T: TryInto<Variable>>(
247 &mut self,
248 ident: &str,
249 value: T,
250 ) -> Result<&mut Self, VariableError>
251 where
252 VariableError: From<<T as TryInto<Variable>>::Error>,
253 {
254 self.scan_context_mut().set_global(ident, value)?;
255 Ok(self)
256 }
257
258 /// Sets a timeout for scan operations.
259 ///
260 /// The scan functions will return an [ScanError::Timeout] once the
261 /// provided timeout duration has elapsed. The scanner will make every
262 /// effort to stop promptly after the designated timeout duration. However,
263 /// in some cases, particularly with rules containing only a few patterns,
264 /// the scanner could potentially continue running for a longer period than
265 /// the specified timeout.
266 pub fn set_timeout(&mut self, timeout: Duration) -> &mut Self {
267 self.scan_context_mut().set_timeout(timeout);
268 self
269 }
270
271 /// Sets the maximum number of matches per pattern.
272 ///
273 /// When some pattern reaches the maximum number of patterns it won't
274 /// produce more matches.
275 pub fn max_matches_per_pattern(&mut self, n: usize) -> &mut Self {
276 self.scan_context_mut()
277 .tracker
278 .pattern_matches
279 .max_matches_per_pattern(n);
280 self
281 }
282
283 /// Enables or disables fast scan mode.
284 ///
285 /// In fast scan mode, the scanner avoids tracking matches for patterns
286 /// when it is not necessary (e.g. when a rule condition only performs a
287 /// simple boolean check `$a`).
288 ///
289 /// Note that using fast scan mode implies that not all matches will be
290 /// reported. For instance, when iterating matches using [`ScanResults`],
291 /// you won't get all occurrences of the pattern in the file, only the first
292 /// one.
293 pub fn fast_scan(&mut self, yes: bool) -> &mut Self {
294 self.scan_context_mut().tracker.fast_scan = yes;
295 self
296 }
297
298 /// Sets a callback that is invoked every time a YARA rule calls the
299 /// `console` module.
300 ///
301 /// The `callback` function is invoked with a string representing the
302 /// message being logged. The function can print the message to stdout,
303 /// append it to a file, etc. If no callback is set these messages are
304 /// ignored.
305 pub fn console_log<F>(&mut self, callback: F) -> &mut Self
306 where
307 F: FnMut(String) + 'r,
308 {
309 self.scan_context_mut().console_log = Some(Box::new(callback));
310 self
311 }
312
313 /// Returns profiling data for the slowest N rules.
314 ///
315 /// The profiling data reflects the cumulative execution time of each rule
316 /// across all scanned files. This information is useful for identifying
317 /// performance bottlenecks. To reset the profiling data and start fresh
318 /// for subsequent scans, use [`crate::Scanner::clear_profiling_data`].
319 #[cfg(feature = "rules-profiling")]
320 pub fn slowest_rules(
321 &self,
322 n: usize,
323 ) -> Vec<crate::scanner::ProfilingData<'_>> {
324 self.scan_context().slowest_rules(n)
325 }
326
327 /// Clears all accumulated profiling data.
328 ///
329 /// This method resets the profiling data collected during rule execution
330 /// across scanned files. Use this to start a new profiling session, ensuring
331 /// the results reflect only the data gathered after this method is called.
332 #[cfg(feature = "rules-profiling")]
333 pub fn clear_profiling_data(&mut self) {
334 self.scan_context_mut().clear_profiling_data()
335 }
336}
337
338impl<'r> Scanner<'r> {
339 #[cfg(feature = "rules-profiling")]
340 #[inline]
341 fn scan_context<'a>(&self) -> &ScanContext<'r, 'a> {
342 unsafe {
343 transmute::<&ScanContext<'static, 'static>, &ScanContext<'r, '_>>(
344 self.wasm_store.data(),
345 )
346 }
347 }
348 #[inline]
349 fn scan_context_mut<'a>(&mut self) -> &'a mut ScanContext<'r, 'a> {
350 unsafe {
351 transmute::<
352 &mut ScanContext<'static, 'static>,
353 &mut ScanContext<'r, 'a>,
354 >(self.wasm_store.data_mut())
355 }
356 }
357}
358
359impl<'r> From<crate::scanner::Scanner<'r>> for Scanner<'r> {
360 fn from(scanner: crate::scanner::Scanner<'r>) -> Self {
361 Self {
362 _rules: scanner._rules,
363 wasm_store: scanner.wasm_store,
364 needs_reset: true,
365 snippets: Default::default(),
366 }
367 }
368}
369
#[cfg(test)]
mod tests {
    use crate::scanner::blocks::Scanner;
    use crate::{Compiler, compile};
    use std::time::Duration;

    /// A literal pattern is found independently in two separate blocks, and
    /// match offsets are reported relative to the base of each block.
    #[test]
    fn block_scanner_1() {
        let source = r#"
            rule test { strings: $a = "ipsum" condition: $a }"#;
        let rules = compile(source).unwrap();

        let mut scanner = Scanner::new(&rules);
        scanner.scan(0, b"Lorem ipsum").unwrap();
        scanner.scan(1000, b"dolor ipsum sit amet").unwrap();
        let results = scanner.finish().unwrap();

        assert_eq!(results.matching_rules().len(), 1);

        let rule = results.matching_rules().next().unwrap();
        let pattern = rule.patterns().next().unwrap();
        let mut matches = pattern.matches();

        for expected_range in [6..11, 1006..1011] {
            let found = matches.next().unwrap();
            assert_eq!(found.data(), b"ipsum".as_slice());
            assert_eq!(found.range(), expected_range);
        }
    }

    /// With a context size of 5, each match is reported together with the
    /// five bytes before and after it.
    #[test]
    fn block_scanner_context() {
        let source = r#"
            rule test { strings: $a = "ipsum" condition: $a }"#;
        let rules = compile(source).unwrap();

        let mut scanner = Scanner::new(&rules);
        scanner.match_context_size(5);
        scanner.scan(0, b"Lorem ipsum sit amet").unwrap();
        scanner.scan(1000, b"dolor ipsum sit amet").unwrap();
        let results = scanner.finish().unwrap();

        assert_eq!(results.matching_rules().len(), 1);

        let rule = results.matching_rules().next().unwrap();
        let pattern = rule.patterns().next().unwrap();
        let mut matches = pattern.matches();

        let expected_snippets =
            [b"orem ipsum sit ".as_slice(), b"olor ipsum sit ".as_slice()];

        for expected_data in expected_snippets {
            let found = matches.next().unwrap();
            let (data, range) = found.data_with_context();
            assert_eq!(data, expected_data);
            assert_eq!(range, 5..10);
        }
    }

    /// A regexp that could span both blocks only matches inside the second
    /// block, where the whole match is available.
    #[test]
    fn block_scanner_2() {
        let source = r#"
            rule test { strings: $a = /ipsum.*amet/s condition: $a }"#;
        let rules = compile(source).unwrap();

        let mut scanner = Scanner::new(&rules);
        scanner.scan(0, b"Lorem ipsum").unwrap();
        scanner.scan(1000, b"dolor ipsum sit amet").unwrap();
        let results = scanner.finish().unwrap();

        let rule = results.matching_rules().next().unwrap();
        let pattern = rule.patterns().next().unwrap();
        let mut matches = pattern.matches();

        let found = matches.next().unwrap();
        assert_eq!(found.data(), b"ipsum sit amet".as_slice());
        assert_eq!(found.range(), 1006..1020);
    }

    /// The `in (start..end)` operator works with offsets that belong to a
    /// block with a non-zero base.
    #[test]
    fn block_scanner_match_in_range() {
        let source = r#"
            rule test { strings: $a = "ipsum" condition: $a in (1003..1008) }"#;
        let rules = compile(source).unwrap();

        let mut scanner = Scanner::new(&rules);
        scanner.scan(0, b"Lorem ipsum").unwrap();
        scanner.scan(1000, b"dolor ipsum sit amet").unwrap();
        let results = scanner.finish().unwrap();

        assert_eq!(results.matching_rules().len(), 1);

        let rule = results.matching_rules().next().unwrap();
        let pattern = rule.patterns().next().unwrap();
        let mut matches = pattern.matches();

        for expected_range in [6..11, 1006..1011] {
            let found = matches.next().unwrap();
            assert_eq!(found.data(), b"ipsum".as_slice());
            assert_eq!(found.range(), expected_range);
        }
    }

    /// The `at` operator works with offsets that belong to a block with a
    /// non-zero base.
    #[test]
    fn block_scanner_match_at_offset() {
        let source = r#"
            rule test { strings: $a = "ipsum" condition: $a at 1006 }"#;
        let rules = compile(source).unwrap();

        let mut scanner = Scanner::new(&rules);
        scanner.scan(1000, b"dolor ipsum sit amet").unwrap();
        let results = scanner.finish().unwrap();

        assert_eq!(results.matching_rules().len(), 1);
    }

    /// A global variable set on the scanner is visible to rule conditions,
    /// even when no block has been scanned.
    #[test]
    fn block_scanner_global() {
        let source = r#"
            rule test { condition: foo == "foo" }"#;

        let mut compiler = Compiler::new();
        compiler.define_global("foo", "").unwrap();
        compiler.add_source(source).unwrap();

        let rules = compiler.build();

        let mut scanner = Scanner::new(&rules);
        scanner.set_global("foo", "foo").unwrap();

        let results = scanner.finish().unwrap();
        assert_eq!(results.matching_rules().len(), 1);
    }

    /// A slow condition makes `finish` fail with a timeout error.
    #[test]
    fn block_scanner_timeout() {
        let source = r#"
            rule slow {
                condition:
                    for any i in (0..1000000000) : (
                        uint8(i) == 0xCC
                    )
            }"#;
        let rules = compile(source).unwrap();

        let mut scanner = Scanner::new(&rules);
        scanner.set_timeout(Duration::from_secs(1));

        let err = scanner.finish().unwrap_err();
        assert_eq!(err.to_string(), "timeout");
    }

    /// `filesize` is undefined in block scanning mode.
    #[test]
    fn block_scanner_filesize() {
        let source = r#"
            rule filesize_undefined {
                condition:
                    not defined filesize
            }"#;
        let rules = compile(source).unwrap();

        let mut scanner = Scanner::new(&rules);
        let results = scanner.finish().unwrap();

        assert_eq!(results.matching_rules().len(), 1);
    }

    /// In fast scan mode only the first match of the pattern is reported.
    #[test]
    fn block_scanner_fast_scan() {
        let source = r#"
            rule test {
                strings:
                    $a = "foo"
                condition:
                    $a
            }"#;
        let rules = compile(source).unwrap();

        let mut scanner = Scanner::new(&rules);
        scanner.fast_scan(true);
        scanner.scan(0, b"foofoofoo").unwrap();
        let results = scanner.finish().unwrap();

        assert_eq!(results.matching_rules().len(), 1);

        let rule = results.matching_rules().next().unwrap();
        let pattern = rule.patterns().next().unwrap();
        let mut matches = pattern.matches();

        // Only a single match is returned because of the fast scan mode!
        let first = matches.next().unwrap();
        assert_eq!(first.data(), b"foo".as_slice());
        assert_eq!(first.range(), 0..3);

        assert!(matches.next().is_none());
    }
}