yara_x/scanner/blocks.rs
1/*! Scanner for scanning data in blocks.
2
3This scanner is designed for scenarios where the data to be scanned is not
4available as a single contiguous block of memory, but rather arrives in
5smaller, discrete blocks, allowing for incremental scanning.
6*/
7use std::collections::btree_map::Entry;
8use std::collections::BTreeMap;
9use std::mem;
10use std::mem::transmute;
11use std::pin::Pin;
12use std::time::Duration;
13
14use wasmtime::Store;
15
16use crate::errors::VariableError;
17use crate::scanner::context::{create_wasm_store_and_ctx, ScanState};
18use crate::scanner::{DataSnippets, ScanContext};
19use crate::{Rules, ScanError, ScanResults, Variable};
20
21/// Scans data in blocks
22///
23/// This scanner is designed for scenarios where the data to be scanned is not
24/// available as a single contiguous block of memory, but rather arrives in
25/// smaller, discrete blocks, allowing for incremental scanning.
26///
27/// # Examples
28///
29/// ```
30/// # use yara_x::{blocks, compile};
31///
32/// let rules = compile(r#"rule test { strings: $a = "abc" condition: $a }"#).unwrap();
33///
34/// let mut scanner = blocks::Scanner::new(&rules);
35///
36/// // Scan the first block of data.
37/// scanner.scan(0, b"xabcy").unwrap();
38///
39/// // Scan a second block of data, which can overlap with the first.
40/// scanner.scan(3, b"cyz").unwrap();
41///
42/// // Finish the scan and get the results.
43/// let results = scanner.finish().unwrap();
44///
45/// assert_eq!(results.matching_rules().len(), 1);
46/// ```
47///
48/// # Limitations of Block Scanning
49///
50/// Block scanning works by analyzing data in chunks rather than as a whole
51/// file. This makes it useful for streaming or memory-constrained scenarios,
52/// but it comes with important limitations compared to standard scanning:
53///
54/// 1) Modules won't work. Parsers for structured formats (e.g., PE, ELF)
55/// require access to the entire file and cannot be applied in block
56/// scanning mode.
57/// 2) Other modules like `hash` won't work either, as they require access to
58/// all the scanned data during the evaluation of the rule's condition,
59/// something that can't be guaranteed in block scanning mode. The hash
60/// functions will return `undefined` when used in a multi-block context.
61/// 3) Built-in functions like `uint8`, `uint16`, `uint32`, etc., have the
62/// same limitation. They also return `undefined` in block scanning mode.
63/// 4) The `filesize` keyword returns `undefined` in block scanning mode.
64/// 5) Patterns won't match across block boundaries. Every match will be
65/// completely contained within one of the blocks.
66///
67/// All these limitations imply that in block scanning mode you should only
68/// use rules that rely on text, hex or regex patterns.
69///
70/// # Data Consistency in Overlapping Blocks
71///
72/// When [`Scanner::scan`] is invoked multiple times with different blocks
73/// that may overlap, the user is responsible for ensuring data consistency.
74/// This means that if the same region of the original data is present in two
75/// or more overlapping blocks, the content of that region must be identical
76/// across all calls to this function.
77///
78/// Generally speaking, the scanner does not verify this consistency and
79/// assumes the user provides accurate and consistent data. In debug releases
80/// the scanner may try to verify this consistency, but only when some pattern
81/// matches in the overlapping region.
82pub struct Scanner<'r> {
83 _rules: &'r Rules,
84 wasm_store: Pin<Box<Store<ScanContext<'static, 'static>>>>,
85 needs_reset: bool,
86 snippets: BTreeMap<usize, Vec<u8>>,
87}
88
89impl<'r> Scanner<'r> {
90 /// Creates a new block scanner.
91 pub fn new(rules: &'r Rules) -> Scanner<'r> {
92 Scanner {
93 _rules: rules,
94 wasm_store: create_wasm_store_and_ctx(rules),
95 needs_reset: true,
96 snippets: BTreeMap::new(),
97 }
98 }
99}
100impl<'r> Scanner<'r> {
101 /// Scans a block of data.
102 ///
103 /// This method processes a given block of data, searching for patterns
104 /// defined in the YARA rules. The `base` argument specifies the offset
105 /// of the current block within the overall data being scanned. In most
106 /// cases you will want to call this method multiple times, providing a
107 /// different block on each call.
108 ///
109 /// # Arguments
110 ///
111 /// * `base` - The starting offset of the `data` block within overall
112 /// data being scanned.
113 /// * `data` - The byte slice representing the current block of data to
114 /// scan.
115 ///
116 /// # Returns
117 ///
118 /// A `Result` indicating success or a `ScanError` if the scan operation
119 /// fails.
120 pub fn scan(
121 &mut self,
122 base: usize,
123 data: &[u8],
124 ) -> Result<&mut Self, ScanError> {
125 // Reset the scanner if needed. This is done before scanning the first
126 // block after the scanner has been created, or when a previous scan
127 // has finished and the scanner is going to be reused.
128 if self.needs_reset {
129 self.scan_context_mut().reset();
130 self.needs_reset = false;
131 }
132 // Even when the scanner is not reset, we must clear unconfirmed matches
133 // between blocks. Otherwise, matches partially detected in one block
134 // could incorrectly be confirmed by data from a different block.
135 //
136 // This prevents matches from spanning multiple blocks — a scenario that
137 // could occur with patterns split into multiple subpatterns, for
138 // example:
139 //
140 // { 01 02 03 [-] 04 05 06}
141 //
142 // In this case, the subpattern `01 02 03` might match in one block, and
143 // `04 05 06` in the next. While supporting cross-block matches is
144 // technically possible, it would be inconsistent with patterns that
145 // cannot span blocks. To maintain a simple, uniform rule — that matches
146 // never cross block boundaries — we clear all unconfirmed matches here.
147 else {
148 self.scan_context_mut().unconfirmed_matches.clear();
149 }
150
151 let ctx = self.scan_context_mut();
152
153 ctx.scan_state = ScanState::ScanningBlock((base, data));
154 ctx.search_for_patterns()?;
155
156 for (_, match_list) in ctx.pattern_matches.matches_per_pattern() {
157 // Here we iterate the matches in order to gather snippets of data
158 // from where the matches occurred. Notice however that we are only
159 // interested in the matches that occurred in the recently scanned
160 // block (those were match.base == base).
161 for match_ in
162 match_list.iter().filter(|match_| match_.base == base)
163 {
164 if let Some(match_data) = data.get(match_.block_range()) {
165 // Snippets are indexed by their offsets within the scanned
166 // data. This offset is not relative to the start of the
167 // memory block, it takes into account the block's base
168 // offset.
169 //
170 // The matching data is stored into the snippets B-tree map.
171 // If an entry exists for the same offset, it will be replaced
172 // with the new matching data only if it's larger than the
173 // existing one.
174 match self.snippets.entry(match_.range.start) {
175 Entry::Occupied(mut entry) => {
176 let snippet = entry.get_mut();
177 if match_data.len() > snippet.len() {
178 debug_assert!(match_data.starts_with(snippet));
179 entry.insert(match_data.to_vec());
180 } else {
181 debug_assert!(snippet.starts_with(match_data));
182 }
183 }
184 Entry::Vacant(entry) => {
185 entry.insert(match_data.to_vec());
186 }
187 }
188 } else {
189 debug_assert!(false)
190 }
191 }
192 }
193
194 Ok(self)
195 }
196
197 /// Finalizes the scanning process.
198 ///
199 /// After all data blocks have been scanned, this method evaluates the
200 /// conditions of the YARA rules and produces the final scan results.
201 pub fn finish(&mut self) -> Result<ScanResults<'_, 'r>, ScanError> {
202 if self.needs_reset {
203 self.scan_context_mut().reset();
204 }
205
206 self.needs_reset = true;
207
208 let ctx = self.scan_context_mut();
209
210 ctx.eval_conditions()?;
211
212 ctx.scan_state = ScanState::Finished(DataSnippets::MultiBlock(
213 mem::take(&mut self.snippets),
214 ));
215
216 Ok(ScanResults::new(ctx))
217 }
218
219 /// Sets the value of a global variable.
220 ///
221 /// The variable must has been previously defined by calling
222 /// [`crate::Compiler::define_global`], and the type it has during the
223 /// definition must match the type of the new value (`T`).
224 ///
225 /// The variable will retain the new value in subsequent scans, unless this
226 /// function is called again for setting a new value.
227 pub fn set_global<T: TryInto<Variable>>(
228 &mut self,
229 ident: &str,
230 value: T,
231 ) -> Result<&mut Self, VariableError>
232 where
233 VariableError: From<<T as TryInto<Variable>>::Error>,
234 {
235 self.scan_context_mut().set_global(ident, value)?;
236 Ok(self)
237 }
238
239 /// Sets a timeout for scan operations.
240 ///
241 /// The scan functions will return an [ScanError::Timeout] once the
242 /// provided timeout duration has elapsed. The scanner will make every
243 /// effort to stop promptly after the designated timeout duration. However,
244 /// in some cases, particularly with rules containing only a few patterns,
245 /// the scanner could potentially continue running for a longer period than
246 /// the specified timeout.
247 pub fn set_timeout(&mut self, timeout: Duration) -> &mut Self {
248 self.scan_context_mut().set_timeout(timeout);
249 self
250 }
251
252 /// Sets the maximum number of matches per pattern.
253 ///
254 /// When some pattern reaches the maximum number of patterns it won't
255 /// produce more matches.
256 pub fn max_matches_per_pattern(&mut self, n: usize) -> &mut Self {
257 self.scan_context_mut().pattern_matches.max_matches_per_pattern(n);
258 self
259 }
260
261 /// Sets a callback that is invoked every time a YARA rule calls the
262 /// `console` module.
263 ///
264 /// The `callback` function is invoked with a string representing the
265 /// message being logged. The function can print the message to stdout,
266 /// append it to a file, etc. If no callback is set these messages are
267 /// ignored.
268 pub fn console_log<F>(&mut self, callback: F) -> &mut Self
269 where
270 F: FnMut(String) + 'r,
271 {
272 self.scan_context_mut().console_log = Some(Box::new(callback));
273 self
274 }
275
/// Returns profiling data for the `n` slowest rules.
///
/// The data reflects the cumulative execution time of each rule across
/// everything scanned so far, which makes it useful for spotting
/// performance bottlenecks. Use [`Scanner::clear_profiling_data`] to
/// discard the accumulated data and start a fresh profiling session.
#[cfg(feature = "rules-profiling")]
pub fn slowest_rules(
    &self,
    n: usize,
) -> Vec<crate::scanner::ProfilingData<'_>> {
    let ctx = self.scan_context();
    ctx.slowest_rules(n)
}
289
/// Discards all the profiling data accumulated so far.
///
/// Profiling data is collected during rule execution across scans. Call
/// this method to begin a new profiling session, so that later results
/// reflect only what was gathered after the call.
#[cfg(feature = "rules-profiling")]
pub fn clear_profiling_data(&mut self) {
    let ctx = self.scan_context_mut();
    ctx.clear_profiling_data();
}
299}
300
301impl<'r> Scanner<'r> {
302 #[cfg(feature = "rules-profiling")]
303 #[inline]
304 fn scan_context<'a>(&self) -> &ScanContext<'r, 'a> {
305 unsafe {
306 transmute::<&ScanContext<'static, 'static>, &ScanContext<'r, '_>>(
307 self.wasm_store.data(),
308 )
309 }
310 }
311 #[inline]
312 fn scan_context_mut<'a>(&mut self) -> &'a mut ScanContext<'r, 'a> {
313 unsafe {
314 transmute::<
315 &mut ScanContext<'static, 'static>,
316 &mut ScanContext<'r, 'a>,
317 >(self.wasm_store.data_mut())
318 }
319 }
320}
321
322impl<'r> From<crate::scanner::Scanner<'r>> for Scanner<'r> {
323 fn from(scanner: crate::scanner::Scanner<'r>) -> Self {
324 Self {
325 _rules: scanner._rules,
326 wasm_store: scanner.wasm_store,
327 needs_reset: true,
328 snippets: Default::default(),
329 }
330 }
331}
332
#[cfg(test)]
mod tests {
    use crate::scanner::blocks::Scanner;
    use crate::{compile, Compiler};
    use std::time::Duration;

    /// A literal pattern found in two separate blocks yields one match per
    /// block, each reported at its absolute offset.
    #[test]
    fn block_scanner_1() {
        let source = r#"
        rule test { strings: $a = "ipsum" condition: $a }"#;

        let rules = compile(source).unwrap();
        let mut scanner = Scanner::new(&rules);

        scanner.scan(0, b"Lorem ipsum").unwrap();
        scanner.scan(1000, b"dolor ipsum sit amet").unwrap();

        let results = scanner.finish().unwrap();

        assert_eq!(results.matching_rules().len(), 1);

        let rule = results.matching_rules().next().unwrap();
        let pattern = rule.patterns().next().unwrap();
        let mut found = pattern.matches();

        for expected_range in [6..11, 1006..1011] {
            let m = found.next().unwrap();
            assert_eq!(m.data(), b"ipsum".as_slice());
            assert_eq!(m.range(), expected_range);
        }
    }

    /// A regexp that could span both blocks only matches inside the second
    /// block, because matches never cross block boundaries.
    #[test]
    fn block_scanner_2() {
        let source = r#"
        rule test { strings: $a = /ipsum.*amet/s condition: $a }"#;

        let rules = compile(source).unwrap();
        let mut scanner = Scanner::new(&rules);

        scanner.scan(0, b"Lorem ipsum").unwrap();
        scanner.scan(1000, b"dolor ipsum sit amet").unwrap();

        let results = scanner.finish().unwrap();

        let rule = results.matching_rules().next().unwrap();
        let pattern = rule.patterns().next().unwrap();
        let first = pattern.matches().next().unwrap();

        assert_eq!(first.data(), b"ipsum sit amet".as_slice());
        assert_eq!(first.range(), 1006..1020);
    }

    /// The `in (a..b)` operator works with absolute offsets.
    #[test]
    fn block_scanner_match_in_range() {
        let source = r#"
        rule test { strings: $a = "ipsum" condition: $a in (1003..1008) }"#;

        let rules = compile(source).unwrap();
        let mut scanner = Scanner::new(&rules);

        scanner.scan(0, b"Lorem ipsum").unwrap();
        scanner.scan(1000, b"dolor ipsum sit amet").unwrap();

        let results = scanner.finish().unwrap();

        assert_eq!(results.matching_rules().len(), 1);

        let rule = results.matching_rules().next().unwrap();
        let pattern = rule.patterns().next().unwrap();
        let mut found = pattern.matches();

        for expected_range in [6..11, 1006..1011] {
            let m = found.next().unwrap();
            assert_eq!(m.data(), b"ipsum".as_slice());
            assert_eq!(m.range(), expected_range);
        }
    }

    /// The `at` operator works with absolute offsets.
    #[test]
    fn block_scanner_match_at_offset() {
        let source = r#"
        rule test { strings: $a = "ipsum" condition: $a at 1006 }"#;

        let rules = compile(source).unwrap();
        let mut scanner = Scanner::new(&rules);

        scanner.scan(1000, b"dolor ipsum sit amet").unwrap();

        let results = scanner.finish().unwrap();

        assert_eq!(results.matching_rules().len(), 1);
    }

    /// Global variables set on the scanner are visible to rule conditions,
    /// even when no block is scanned.
    #[test]
    fn block_scanner_global() {
        let mut compiler = Compiler::new();

        compiler.define_global("foo", "").unwrap();
        compiler
            .add_source(
                r#"
        rule test { condition: foo == "foo" }"#,
            )
            .unwrap();

        let rules = compiler.build();
        let mut scanner = Scanner::new(&rules);

        scanner.set_global("foo", "foo").unwrap();

        let results = scanner.finish().unwrap();

        assert_eq!(results.matching_rules().len(), 1);
    }

    /// A slow condition makes `finish` fail with a timeout error.
    #[test]
    fn block_scanner_timeout() {
        let source = r#"
        rule slow {
          condition:
            for any i in (0..1000000000) : (
                 uint8(i) == 0xCC
            )
        }"#;

        let rules = compile(source).unwrap();
        let mut scanner = Scanner::new(&rules);

        let err = scanner
            .set_timeout(Duration::from_secs(1))
            .finish()
            .unwrap_err();

        assert_eq!(err.to_string(), "timeout");
    }

    /// `filesize` is undefined in block scanning mode.
    #[test]
    fn block_scanner_filesize() {
        let source = r#"
        rule filesize_undefined {
          condition:
            not defined filesize
        }"#;

        let rules = compile(source).unwrap();
        let mut scanner = Scanner::new(&rules);

        let results = scanner.finish().unwrap();

        assert_eq!(results.matching_rules().len(), 1);
    }
}