Skip to main content

yara_x/scanner/
mod.rs

/*! This module implements the YARA scanner.

The scanner takes the rules produced by the compiler and scans data with them.
*/
5use std::collections::{BTreeMap, HashMap};
6use std::fmt::{Debug, Formatter};
7use std::fs;
8use std::io::Read;
9use std::mem::transmute;
10use std::ops::Range;
11use std::path::{Path, PathBuf};
12use std::pin::Pin;
13use std::slice::Iter;
14use std::sync::Once;
15use std::sync::atomic::AtomicU64;
16use std::time::Duration;
17
18use bitvec::prelude::*;
19#[cfg(unix)]
20use memmap2::Advice;
21use memmap2::{Mmap, MmapOptions};
22use protobuf::{CodedInputStream, MessageDyn};
23use thiserror::Error;
24
25use crate::Variable;
26use crate::compiler::{RuleId, Rules};
27use crate::models::Rule;
28use crate::modules::{ModuleError, RegisteredModule};
29pub(crate) use crate::scanner::context::RuntimeObject;
30pub(crate) use crate::scanner::context::RuntimeObjectHandle;
31pub(crate) use crate::scanner::context::ScanContext;
32pub(crate) use crate::scanner::context::ScanState;
33use crate::scanner::context::create_wasm_store_and_ctx;
34pub(crate) use crate::scanner::matches::Match;
35use crate::types::{Struct, TypeValue};
36use crate::variables::VariableError;
37use crate::wasm::MATCHING_RULES_BITMAP_BASE;
38use crate::wasm::runtime::Store;
39
40mod context;
41mod matches;
42
43pub mod blocks;
44
45#[cfg(test)]
46mod tests;
47
48/// Error returned when a scan operation fails.
49#[derive(Error, Debug)]
50#[non_exhaustive]
51pub enum ScanError {
52    /// The scan was aborted after the timeout period.
53    #[error("timeout")]
54    Timeout,
55    /// Could not open the scanned file.
56    #[error("can not open `{path}`: {err}")]
57    OpenError {
58        /// Path of the file being scanned.
59        path: PathBuf,
60        /// Error that occurred.
61        err: std::io::Error,
62    },
63    /// Could not map the scanned file into memory.
64    #[error("can not map `{path}`: {err}")]
65    MapError {
66        /// Path of the file being scanned.
67        path: PathBuf,
68        /// Error that occurred.
69        err: std::io::Error,
70    },
71    /// Could not deserialize the protobuf message for some YARA module.
72    #[error(
73        "can not deserialize protobuf message for YARA module `{module}`: {err}"
74    )]
75    ProtoError {
76        /// Module name.
77        module: String,
78        /// Error that occurred.
79        err: protobuf::Error,
80    },
81    /// The module is unknown.
82    #[error("unknown module `{module}`")]
83    UnknownModule {
84        /// Module name.
85        module: String,
86    },
87    /// Some module produced an error when it was invoked.
88    #[error("error in module `{module}`: {err}")]
89    ModuleError {
90        /// Module name.
91        module: String,
92        /// Error that occurred.
93        err: ModuleError,
94    },
95}
96
/// Process-wide counter that a dedicated thread increments once per second.
///
/// Scan operations use this counter to find out whether they have exceeded
/// their timeout.
static HEARTBEAT_COUNTER: AtomicU64 = AtomicU64::new(0);
101
/// Guard used for spawning the thread that increments `HEARTBEAT_COUNTER`.
static INIT_HEARTBEAT: Once = Once::new();
104
105/// Represents the data being scanned.
106///
107/// The scanned data can be backed by a slice owned by someone else, or a
108/// vector or memory-mapped file owned by `ScannedData` itself.
109pub enum ScannedData<'d> {
110    Slice(&'d [u8]),
111    Vec(Vec<u8>),
112    Mmap { mmap: Mmap, len: usize },
113}
114
115impl AsRef<[u8]> for ScannedData<'_> {
116    fn as_ref(&self) -> &[u8] {
117        match self {
118            ScannedData::Slice(s) => s,
119            ScannedData::Vec(v) => v.as_ref(),
120            ScannedData::Mmap { mmap, len } => &mmap.as_ref()[..*len],
121        }
122    }
123}
124
125impl ScannedData<'_> {
126    #[inline]
127    fn len(&self) -> usize {
128        self.as_ref().len()
129    }
130}
131
132impl<'d> TryInto<ScannedData<'d>> for &'d [u8] {
133    type Error = ScanError;
134    fn try_into(self) -> Result<ScannedData<'d>, Self::Error> {
135        Ok(ScannedData::Slice(self))
136    }
137}
138
139impl<'d, const N: usize> TryInto<ScannedData<'d>> for &'d [u8; N] {
140    type Error = ScanError;
141    fn try_into(self) -> Result<ScannedData<'d>, Self::Error> {
142        Ok(ScannedData::Slice(self))
143    }
144}
145
/// Time spent on a single rule, as reported by the profiler.
#[cfg(feature = "rules-profiling")]
pub struct ProfilingData<'r> {
    /// Namespace the rule belongs to.
    pub namespace: &'r str,
    /// Name of the rule.
    pub rule: &'r str,
    /// Time spent evaluating the condition of the rule.
    pub condition_exec_time: Duration,
    /// Time spent matching the patterns of the rule.
    pub pattern_matching_time: Duration,
}
158
/// Additional, optional information for a scan operation.
#[derive(Debug, Default)]
pub struct ScanOptions<'a> {
    /// Metadata passed to YARA modules, keyed by module name.
    module_metadata: HashMap<&'a str, &'a [u8]>,
}

impl<'a> ScanOptions<'a> {
    /// Creates a `ScanOptions` that carries no additional information.
    ///
    /// Use the other methods for adding information to it.
    pub fn new() -> Self {
        Self { module_metadata: HashMap::default() }
    }

    /// Sets the metadata for the YARA module named `module_name`.
    ///
    /// Metadata previously set for the same module is replaced.
    pub fn set_module_metadata(
        self,
        module_name: &'a str,
        metadata: &'a [u8],
    ) -> Self {
        let mut options = self;
        options.module_metadata.insert(module_name, metadata);
        options
    }
}
184
185/// Scans data with already compiled YARA rules.
186///
187/// The scanner receives a set of compiled [`Rules`] and scans data with those
188/// rules. The same scanner can be used for scanning multiple files or
189/// in-memory data sequentially, but you need multiple scanners for scanning in
190/// parallel.
191pub struct Scanner<'r> {
192    _rules: &'r Rules,
193    wasm_store: Pin<Box<Store<ScanContext<'static, 'static>>>>,
194    use_mmap: bool,
195    max_scan_size: Option<usize>,
196}
197
198impl<'r> Scanner<'r> {
199    /// Creates a new scanner.
200    pub fn new(rules: &'r Rules) -> Self {
201        let wasm_store = create_wasm_store_and_ctx(rules);
202        Self { _rules: rules, wasm_store, use_mmap: true, max_scan_size: None }
203    }
204
205    /// Sets a timeout for scan operations.
206    ///
207    /// The scan functions will return an [ScanError::Timeout] once the
208    /// provided timeout duration has elapsed. The scanner will make every
209    /// effort to stop promptly after the designated timeout duration. However,
210    /// in some cases, particularly with rules containing only a few patterns,
211    /// the scanner could potentially continue running for a longer period than
212    /// the specified timeout.
213    pub fn set_timeout(&mut self, timeout: Duration) -> &mut Self {
214        self.scan_context_mut().set_timeout(timeout);
215        self
216    }
217
218    /// Sets the maximum number of matches per pattern.
219    ///
220    /// When some pattern reaches the maximum number of patterns it won't
221    /// produce more matches.
222    pub fn max_matches_per_pattern(&mut self, n: usize) -> &mut Self {
223        self.scan_context_mut()
224            .tracker
225            .pattern_matches
226            .max_matches_per_pattern(n);
227        self
228    }
229
230    /// Enables or disables fast scan mode.
231    ///
232    /// In fast scan mode, the scanner avoids tracking matches for patterns
233    /// when it is not necessary (e.g. when a rule condition only performs a
234    /// simple boolean check `$a`).
235    ///
236    /// Note that using fast scan mode implies that not all matches will be
237    /// reported. For instance, when iterating matches using [`ScanResults`],
238    /// you won't get all occurrences of the pattern in the file, only the first
239    /// one.
240    pub fn fast_scan(&mut self, yes: bool) -> &mut Self {
241        self.scan_context_mut().tracker.fast_scan = yes;
242        self
243    }
244
245    /// Specifies whether [`Scanner::scan_file`] and [`Scanner::scan_file_with_options`]
246    /// may use memory-mapped files to read input.
247    ///
248    /// By default, the scanner uses memory mapping for very large files, as this
249    /// is typically faster than copying file contents into memory. However, this
250    /// approach has a drawback: if another process truncates the file during
251    /// scanning, a `SIGBUS` signal may occur.
252    ///
253    /// Setting this option disables memory mapping and forces the scanner to
254    /// always read files into an in-memory buffer instead. This method is slower,
255    /// but safer.
256    pub fn use_mmap(&mut self, yes: bool) -> &mut Self {
257        self.use_mmap = yes;
258        self
259    }
260
261    /// Sets the maximum size of the data that will be scanned.
262    ///
263    /// If the scanned data (either a file or an in-memory buffer) is larger
264    /// than this value, it will be truncated to the given size.
265    ///
266    /// The value returned by `filesize` will be also limited to the given
267    /// size.
268    ///
269    /// Also notice that some modules (pe, elf, macho, etc) may be unable
270    /// to properly parse truncated files.
271    pub fn max_scan_size(&mut self, size: usize) -> &mut Self {
272        self.max_scan_size = Some(size);
273        self
274    }
275
276    /// Sets a callback that is invoked every time a YARA rule calls the
277    /// `console` module.
278    ///
279    /// The `callback` function is invoked with a string representing the
280    /// message being logged. The function can print the message to stdout,
281    /// append it to a file, etc. If no callback is set these messages are
282    /// ignored.
283    pub fn console_log<F>(&mut self, callback: F) -> &mut Self
284    where
285        F: FnMut(String) + 'r,
286    {
287        self.scan_context_mut().console_log = Some(Box::new(callback));
288        self
289    }
290
291    /// Sets the context size for matches.
292    ///
293    /// This specifies how many bytes at the left and right of each match will
294    /// be reported by [`crate::Match::data_with_context`]. By default, the
295    /// match context size is 0, which means that [`crate::Match::data_with_context`]
296    /// will return exactly the same data as [`crate::Match::data`].
297    pub fn match_context_size(&mut self, size: usize) -> &mut Self {
298        self.scan_context_mut().match_context_size = size;
299        self
300    }
301
302    /// Scans in-memory data.
303    pub fn scan<'a>(
304        &'a mut self,
305        data: &'a [u8],
306    ) -> Result<ScanResults<'a, 'r>, ScanError> {
307        let mut data = data;
308        if let Some(max) = self.max_scan_size
309            && data.len() > max
310        {
311            data = &data[..max];
312        }
313        self.scan_impl(ScannedData::Slice(data), None)
314    }
315
316    /// Scans a file.
317    pub fn scan_file<'a, P>(
318        &'a mut self,
319        target: P,
320    ) -> Result<ScanResults<'a, 'r>, ScanError>
321    where
322        P: AsRef<Path>,
323    {
324        self.scan_impl(self.load_file(target.as_ref())?, None)
325    }
326
327    /// Like [`Scanner::scan`], but allows to specify additional scan options.
328    pub fn scan_with_options<'a, 'opts>(
329        &'a mut self,
330        data: &'a [u8],
331        options: ScanOptions<'opts>,
332    ) -> Result<ScanResults<'a, 'r>, ScanError> {
333        let mut data = data;
334        if let Some(max) = self.max_scan_size
335            && data.len() > max
336        {
337            data = &data[..max];
338        }
339        self.scan_impl(ScannedData::Slice(data), Some(options))
340    }
341
342    /// Like [`Scanner::scan_file`], but allows to specify additional scan
343    /// options.
344    pub fn scan_file_with_options<'opts, P>(
345        &mut self,
346        target: P,
347        options: ScanOptions<'opts>,
348    ) -> Result<ScanResults<'_, 'r>, ScanError>
349    where
350        P: AsRef<Path>,
351    {
352        self.scan_impl(self.load_file(target.as_ref())?, Some(options))
353    }
354
355    /// Sets the value of a global variable.
356    ///
357    /// The variable must has been previously defined by calling
358    /// [`crate::Compiler::define_global`], and the type it has during the
359    /// definition must match the type of the new value (`T`).
360    ///
361    /// The variable will retain the new value in subsequent scans, unless this
362    /// function is called again for setting a new value.
363    pub fn set_global<T: TryInto<Variable>>(
364        &mut self,
365        ident: &str,
366        value: T,
367    ) -> Result<&mut Self, VariableError>
368    where
369        VariableError: From<<T as TryInto<Variable>>::Error>,
370    {
371        self.scan_context_mut().set_global(ident, value)?;
372        Ok(self)
373    }
374
375    /// Sets the output data for a YARA module.
376    ///
377    /// Each YARA module generates an output consisting of a data structure that
378    /// contains information about the scanned file. This data structure is
379    /// represented by a Protocol Buffer message. Typically, you won't need to
380    /// provide this data yourself, as the YARA module automatically generates
381    /// different outputs for each file it scans.
382    ///
383    /// However, there are two scenarios in which you may want to provide the
384    /// output for a module yourself:
385    ///
386    /// 1) When the module does not produce any output on its own.
387    /// 2) When you already know the output of the module for the upcoming file
388    ///    to be scanned, and you prefer to reuse this data instead of generating
389    ///    it again.
390    ///
391    /// Case 1) applies to certain modules lacking a main function, thus
392    /// incapable of producing any output on their own. For such modules, you
393    /// must set the output before scanning the associated data. Since the
394    /// module's output typically varies with each scanned file, you need to
395    /// call [`Scanner::set_module_output`] prior to each invocation of
396    /// [`Scanner::scan`]. Once [`Scanner::scan`] is executed, the module's
397    /// output is consumed and will be empty unless set again before the
398    /// subsequent call.
399    ///
400    /// Case 2) applies when you have previously stored the module's output for
401    /// certain scanned data. In such cases, when rescanning the data, you can
402    /// utilize this function to supply the module's output, thereby preventing
403    /// redundant computation by the module. This optimization enhances
404    /// performance by eliminating the need for the module to reparse the
405    /// scanned data.
406    ///
407    /// <br>
408    ///
409    /// The `data` argument must be a Protocol Buffer message corresponding
410    /// to any of the existing YARA modules.
411    pub fn set_module_output(
412        &mut self,
413        data: Box<dyn MessageDyn>,
414    ) -> Result<&mut Self, ScanError> {
415        let descriptor = data.descriptor_dyn();
416        let full_name = descriptor.full_name();
417
418        // Check if the protobuf message passed to this function corresponds
419        // with any of the existing modules.
420        if !crate::modules::registered_modules()
421            .any(|m| m.root_descriptor().full_name() == full_name)
422        {
423            return Err(ScanError::UnknownModule {
424                module: full_name.to_string(),
425            });
426        }
427
428        self.scan_context_mut()
429            .user_provided_module_outputs
430            .insert(full_name.to_string(), data);
431
432        Ok(self)
433    }
434
435    /// Similar to [`Scanner::set_module_output`], but receives a module name
436    /// and the protobuf message as raw data.
437    ///
438    /// `name` can be either the YARA module name (i.e: "pe", "elf", "dotnet",
439    /// etc.) or the fully-qualified name for the protobuf message associated
440    /// to the module (i.e: "pe.PE", "elf.ELF", "dotnet.Dotnet", etc.).
441    pub fn set_module_output_raw(
442        &mut self,
443        name: &str,
444        data: &[u8],
445    ) -> Result<&mut Self, ScanError> {
446        // Try to find the module by name first, if not found, then try
447        // to find a module where the fully-qualified name for its protobuf
448        // message matches the `name` arguments.
449        let descriptor = crate::modules::registered_modules()
450            .find_map(|module| {
451                if module.name() == name {
452                    Some(module.root_descriptor())
453                } else {
454                    None
455                }
456            })
457            .or_else(|| {
458                crate::modules::registered_modules().find_map(|module| {
459                    let descriptor = module.root_descriptor();
460                    if descriptor.full_name() == name {
461                        Some(descriptor)
462                    } else {
463                        None
464                    }
465                })
466            });
467
468        if descriptor.is_none() {
469            return Err(ScanError::UnknownModule { module: name.to_string() });
470        }
471
472        let mut is = CodedInputStream::from_bytes(data);
473
474        // Default recursion limit is 100, that's not enough for some deeply
475        // nested structures like the process tree in the `vt` module.
476        is.set_recursion_limit(500);
477
478        self.set_module_output(
479            descriptor.unwrap().parse_from(&mut is).map_err(|err| {
480                ScanError::ProtoError { module: name.to_string(), err }
481            })?,
482        )
483    }
484
485    /// Returns profiling data for the slowest N rules.
486    ///
487    /// The profiling data reflects the cumulative execution time of each rule
488    /// across all scanned files. This information is useful for identifying
489    /// performance bottlenecks. To reset the profiling data and start fresh
490    /// for subsequent scans, use [`Scanner::clear_profiling_data`].
491    #[cfg(feature = "rules-profiling")]
492    pub fn slowest_rules(&self, n: usize) -> Vec<ProfilingData<'_>> {
493        self.scan_context().slowest_rules(n)
494    }
495
496    /// Clears all accumulated profiling data.
497    ///
498    /// This method resets the profiling data collected during rule execution
499    /// across scanned files. Use this to start a new profiling session, ensuring
500    /// the results reflect only the data gathered after this method is called.
501    #[cfg(feature = "rules-profiling")]
502    pub fn clear_profiling_data(&mut self) {
503        self.scan_context_mut().clear_profiling_data()
504    }
505}
506
507impl<'r> Scanner<'r> {
508    #[cfg(feature = "rules-profiling")]
509    #[inline]
510    fn scan_context<'a>(&self) -> &ScanContext<'r, 'a> {
511        unsafe {
512            transmute::<&ScanContext<'static, 'static>, &ScanContext<'r, '_>>(
513                self.wasm_store.data(),
514            )
515        }
516    }
517    #[inline]
518    fn scan_context_mut<'a>(&mut self) -> &mut ScanContext<'r, 'a> {
519        unsafe {
520            transmute::<
521                &mut ScanContext<'static, 'static>,
522                &mut ScanContext<'r, '_>,
523            >(self.wasm_store.data_mut())
524        }
525    }
526
527    fn load_file<'a>(
528        &self,
529        path: &Path,
530    ) -> Result<ScannedData<'a>, ScanError> {
531        let file = fs::File::open(path).map_err(|err| {
532            ScanError::OpenError { path: path.to_path_buf(), err }
533        })?;
534
535        let mut size = file.metadata().map(|m| m.len()).unwrap_or(0) as usize;
536
537        if let Some(max_scan_size) = self.max_scan_size {
538            size = std::cmp::min(size, max_scan_size);
539        }
540
541        // For files smaller than ~500MB reading the whole file is faster than
542        // using a memory-mapped file.
543        let data = if self.use_mmap && size > 500_000_000 {
544            let mapped_file = unsafe {
545                MmapOptions::new().map_copy_read_only(&file).map_err(|err| {
546                    ScanError::MapError { path: path.to_path_buf(), err }
547                })
548            }?;
549            #[cfg(unix)]
550            mapped_file.advise(Advice::Sequential).map_err(|err| {
551                ScanError::MapError { path: path.to_path_buf(), err }
552            })?;
553            ScannedData::Mmap { mmap: mapped_file, len: size }
554        } else {
555            let mut buffered_file = Vec::with_capacity(size);
556            (&file)
557                .take(size as u64)
558                .read_to_end(&mut buffered_file)
559                .map_err(|err| ScanError::OpenError {
560                    path: path.to_path_buf(),
561                    err,
562                })?;
563            ScannedData::Vec(buffered_file)
564        };
565
566        Ok(data)
567    }
568
569    fn scan_impl<'a, 'opts>(
570        &'a mut self,
571        data: ScannedData<'a>,
572        options: Option<ScanOptions<'opts>>,
573    ) -> Result<ScanResults<'a, 'r>, ScanError> {
574        let ctx = self.scan_context_mut();
575
576        // Clear information about matches found in a previous scan, if any.
577        ctx.reset();
578
579        // Set the global variable `filesize` to the size of the scanned data.
580        ctx.set_filesize(data.len() as i64);
581
582        // Indicate that the scanner is currently scanning the given data.
583        ctx.scan_state = ScanState::ScanningData(data);
584
585        for module_name in ctx.compiled_rules.imports() {
586            // Look up the module in the module registry.
587            let module = crate::modules::registered_modules()
588                .find(|module| module.name() == module_name)
589                .unwrap_or_else(|| panic!("module `{module_name}` not found"));
590
591            let module_root_descriptor = module.root_descriptor();
592            let root_struct_name = module_root_descriptor.full_name();
593
594            let module_output;
595            // If the user already provided some output for the module by
596            // calling `Scanner::set_module_output`, use that output. If not,
597            // call the module's main function (if the module has a main
598            // function) for getting its output.
599            if let Some(output) =
600                ctx.user_provided_module_outputs.remove(root_struct_name)
601            {
602                module_output = Some(output);
603            } else {
604                let meta: Option<&'opts [u8]> =
605                    options.as_ref().and_then(|options| {
606                        options.module_metadata.get(module_name).copied()
607                    });
608
609                if let Some(main_res) =
610                    module.main_fn(ctx.scanned_data().unwrap(), meta)
611                {
612                    module_output = Some(main_res.map_err(|err| {
613                        ScanError::ModuleError {
614                            module: module_name.to_string(),
615                            err,
616                        }
617                    })?);
618                } else {
619                    module_output = None;
620                }
621            }
622
623            if let Some(module_output) = &module_output {
624                // Make sure that the module is returning a protobuf message of
625                // the expected type.
626                debug_assert_eq!(
627                    module_output.descriptor_dyn().full_name(),
628                    root_struct_name,
629                    "main function of module `{}` must return `{}`, but returned `{}`",
630                    module_name,
631                    root_struct_name,
632                    module_output.descriptor_dyn().full_name(),
633                );
634
635                // Make sure that the module is returning a protobuf message
636                // where all required fields are initialized. This only applies
637                // to proto2, proto3 doesn't have "required" fields, all fields
638                // are optional.
639                debug_assert!(
640                    module_output.is_initialized_dyn(),
641                    "module `{}` returned a protobuf `{}` where some required fields are not initialized ",
642                    module_name,
643                    root_struct_name
644                );
645            }
646
647            // When constant folding is enabled we don't need to generate
648            // structure fields for enums. This is because during the
649            // optimization process symbols like MyEnum.ENUM_ITEM are resolved
650            // to their constant values at compile time. In other words, the
651            // compiler determines that MyEnum.ENUM_ITEM is equal to some value
652            // X, and uses that value in the generated code.
653            //
654            // However, without constant folding, enums are treated as any
655            // other field in a struct, and their values are determined at scan
656            // time. For that reason these fields must be generated for enums
657            // when constant folding is disabled.
658            let generate_fields_for_enums =
659                !cfg!(feature = "constant-folding");
660
661            let module_struct = Struct::from_proto_descriptor_and_msg(
662                &module_root_descriptor,
663                module_output.as_deref(),
664                generate_fields_for_enums,
665            );
666
667            if let Some(module_output) = module_output {
668                ctx.module_outputs
669                    .insert(root_struct_name.to_string(), module_output);
670            }
671
672            // The data structure obtained from the module is added to the
673            // root structure. Any data from previous scans will be replaced
674            // with the new data structure.
675            ctx.root_struct
676                .add_field(module_name, TypeValue::Struct(module_struct));
677        }
678
679        // The user provided module outputs are not needed anymore. Let's
680        // clear any remaining entry in the hash map (which can happen if
681        // the user has set outputs for modules that are not even imported
682        // by the rules.
683        ctx.user_provided_module_outputs.clear();
684
685        // Clear the flag that indicates that the search phase was done.
686        ctx.set_pattern_search_done(false);
687
688        // Evaluate the conditions of every rule, this will call
689        // `ScanContext::search_for_patterns` if necessary.
690        ctx.eval_conditions()?;
691
692        let data = match ctx.scan_state.take() {
693            ScanState::ScanningData(data) => data,
694            _ => unreachable!(),
695        };
696
697        ctx.scan_state = ScanState::Finished(DataSnippets::SingleBlock(data));
698
699        Ok(ScanResults::new(ctx))
700    }
701}
702
703/// Helper type that exposes the data matched during a scan operation.
704///
705/// Matching data can be accessed through the [`Match::data`] method. Normally,
706/// this data can be retrieved by slicing directly into the scanned input.
707/// However, that requires the original input to remain valid until the scan
708/// results are processed. This works fine for a single contiguous block of
709/// memory, but is impractical when scanning multiple blocks, since holding
710/// onto all of them until the end would consume excessive memory.
711///
712/// To handle this, two strategies are used:
713///
714/// - **Single-block scans**: Data is accessed directly from the input slice.
715/// - **Multi-block scans**: Matching fragments are copied and retained in a
716///   BTreeMap until the results are processed. The keys in the btree are
717///   the offsets where the snippets start and the values are vectors with
718///   the snippet's data.
719///
720/// Each strategy corresponds to a variant in this enum.
721pub(crate) enum DataSnippets<'d> {
722    SingleBlock(ScannedData<'d>),
723    MultiBlock(BTreeMap<usize, Vec<u8>>),
724}
725
726impl DataSnippets<'_> {
727    pub(crate) fn get(&self, range: Range<usize>) -> Option<&[u8]> {
728        self.get_with_context(range, 0).map(|(data, _)| data)
729    }
730
731    /// Gets the data for the given `range`, but adding `context_size` additional
732    /// bytes to the left and right.
733    ///
734    /// Returns a tuple where the first item is the data slice with context,
735    /// and the second item is a range relative to the slice indicating where
736    /// the `range` part is located.
737    ///
738    /// The result will be `None` only if the data for `range` can't be found.
739    /// The additional bytes at the left and right will be added if possible,
740    /// but otherwise won't affect the result.
741    pub(crate) fn get_with_context(
742        &self,
743        range: Range<usize>,
744        context_size: usize,
745    ) -> Option<(&[u8], Range<usize>)> {
746        match self {
747            Self::SingleBlock(data) => {
748                let start = range.start.saturating_sub(context_size);
749                let end = range.end.saturating_add(context_size);
750                let end = std::cmp::min(end, data.len());
751
752                let slice = data.as_ref().get(start..end)?;
753                let rel_start = range.start - start;
754                let rel_end = range.end - start;
755
756                Some((slice, rel_start..rel_end))
757            }
758            Self::MultiBlock(btree) => {
759                for (snippet_offset, snippet_data) in
760                    btree.range(..=range.start).rev()
761                {
762                    // Calculate the start and end of the slice within the snippet.
763                    let start = range.start.saturating_sub(*snippet_offset);
764                    let end = range.end.saturating_sub(*snippet_offset);
765
766                    if end > snippet_data.len() {
767                        continue;
768                    }
769
770                    let start = start.saturating_sub(context_size);
771                    let end = end.saturating_add(context_size);
772                    let end = std::cmp::min(end, snippet_data.len());
773
774                    match snippet_data.get(start..end) {
775                        Some(data) if !data.is_empty() => {
776                            let rel_start =
777                                range.start - (*snippet_offset + start);
778                            let rel_end =
779                                range.end - (*snippet_offset + start);
780                            return Some((data, rel_start..rel_end));
781                        }
782                        _ => continue,
783                    }
784                }
785
786                None
787            }
788        }
789    }
790}
791
792/// Results of a scan operation.
793///
794/// Allows iterating over both the matching and non-matching rules.
795pub struct ScanResults<'a, 'r> {
796    ctx: &'a ScanContext<'r, 'a>,
797}
798
799impl Debug for ScanResults<'_, '_> {
800    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
801        f.write_str("ScanResults")
802    }
803}
804
805impl<'a, 'r> ScanResults<'a, 'r> {
806    fn new(ctx: &'a ScanContext<'r, 'a>) -> Self {
807        Self { ctx }
808    }
809
810    /// Returns an iterator that yields the matching rules in arbitrary order.
811    pub fn matching_rules(&self) -> MatchingRules<'_, 'r> {
812        MatchingRules::new(self.ctx)
813    }
814
815    /// Returns an iterator that yields the non-matching rules in arbitrary
816    /// order.
817    pub fn non_matching_rules(&self) -> NonMatchingRules<'_, 'r> {
818        NonMatchingRules::new(self.ctx)
819    }
820
821    /// Returns the protobuf produced by a YARA module after processing the
822    /// data.
823    ///
824    /// The result will be `None` if the module doesn't exist or didn't
825    /// produce any output.
826    pub fn module_output(
827        &self,
828        module_name: &str,
829    ) -> Option<&'a dyn MessageDyn> {
830        let module_descriptor = crate::modules::registered_modules()
831            .find_map(|m| {
832                if m.name() == module_name {
833                    Some(m.root_descriptor())
834                } else {
835                    None
836                }
837            })?;
838        let module_output = self
839            .ctx
840            .module_outputs
841            .get(module_descriptor.full_name())?
842            .as_ref();
843        Some(module_output)
844    }
845
846    /// Returns an iterator that yields tuples composed of a YARA module name
847    /// and the protobuf produced by that module.
848    ///
849    /// Only returns the modules that produced some output.
850    pub fn module_outputs(&self) -> ModuleOutputs<'a, 'r> {
851        ModuleOutputs::new(self.ctx)
852    }
853}
854
/// Iterator that yields the rules that matched during a scan.
///
/// Private rules are not included by default, use
/// [`MatchingRules::include_private`] for changing this behaviour.
pub struct MatchingRules<'a, 'r> {
    /// Scan context the matching rules are read from.
    ctx: &'a ScanContext<'r, 'a>,
    /// Iterator over the rule IDs stored in `ctx.matching_rules`.
    iterator: Iter<'a, RuleId>,
    /// Number of non-private matching rules not yet consumed from
    /// `iterator`.
    len_non_private: usize,
    /// Number of private matching rules not yet consumed from `iterator`.
    len_private: usize,
    /// Whether private rules are yielded instead of being skipped.
    include_private: bool,
}
866
867impl<'a, 'r> MatchingRules<'a, 'r> {
868    fn new(ctx: &'a ScanContext<'r, 'a>) -> Self {
869        Self {
870            ctx,
871            iterator: ctx.matching_rules.iter(),
872            include_private: false,
873            len_non_private: ctx.matching_rules.len()
874                - ctx.num_matching_private_rules,
875            len_private: ctx.num_matching_private_rules,
876        }
877    }
878
879    /// Specifies whether the iterator should yield private rules.
880    ///
881    /// This does not reset the iterator to its initial state, the iterator will
882    /// continue from its current position.
883    pub fn include_private(mut self, yes: bool) -> Self {
884        self.include_private = yes;
885        self
886    }
887}
888
889impl<'a, 'r> Iterator for MatchingRules<'a, 'r> {
890    type Item = Rule<'a, 'r>;
891
892    fn next(&mut self) -> Option<Self::Item> {
893        let rules = self.ctx.compiled_rules;
894        loop {
895            let rule_id = *self.iterator.next()?;
896            let rule_info = rules.get(rule_id);
897            if rule_info.is_private {
898                self.len_private -= 1;
899            } else {
900                self.len_non_private -= 1;
901            }
902            if self.include_private || !rule_info.is_private {
903                return Some(Rule { ctx: Some(self.ctx), rule_info, rules });
904            }
905        }
906    }
907}
908
909impl ExactSizeIterator for MatchingRules<'_, '_> {
910    #[inline]
911    fn len(&self) -> usize {
912        if self.include_private {
913            self.len_non_private + self.len_private
914        } else {
915            self.len_non_private
916        }
917    }
918}
919
/// Iterator that yields the rules that didn't match during a scan.
///
/// Private rules are not included by default, use
/// [`NonMatchingRules::include_private`] for changing this behaviour.
pub struct NonMatchingRules<'a, 'r> {
    /// Scan context the non-matching rules are read from.
    ctx: &'a ScanContext<'r, 'a>,
    /// Iterator over the positions of the zero bits in the bitmap of
    /// matching rules; each position is converted into a `RuleId`.
    iterator: bitvec::slice::IterZeros<'a, u8, Lsb0>,
    /// Whether private rules are yielded instead of being skipped.
    include_private: bool,
    /// Number of private non-matching rules not yet consumed from
    /// `iterator`.
    len_private: usize,
    /// Number of non-private non-matching rules not yet consumed from
    /// `iterator`.
    len_non_private: usize,
}
931
932impl<'a, 'r> NonMatchingRules<'a, 'r> {
933    fn new(ctx: &'a ScanContext<'r, 'a>) -> Self {
934        let num_rules = ctx.compiled_rules.num_rules();
935        let main_memory = ctx
936            .wasm
937            .main_memory
938            .unwrap()
939            .data(unsafe { ctx.wasm.store.as_ref() });
940
941        let base = MATCHING_RULES_BITMAP_BASE as usize;
942
943        // Create a BitSlice that covers the region of main memory containing
944        // the bitmap that tells which rules matched and which did not.
945        let matching_rules_bitmap = BitSlice::<_, Lsb0>::from_slice(
946            &main_memory[base..base + num_rules / 8 + 1],
947        );
948
949        // The BitSlice will cover more bits than necessary, for example, if
950        // there are 3 rules the BitSlice will have 8 bits because it is
951        // created from a u8 slice that has 1 byte. Here we make sure that
952        // the BitSlice has exactly as many bits as existing rules.
953        let matching_rules_bitmap = &matching_rules_bitmap[0..num_rules];
954
955        Self {
956            ctx,
957            iterator: matching_rules_bitmap.iter_zeros(),
958            include_private: false,
959            len_non_private: ctx.compiled_rules.num_rules()
960                - ctx.matching_rules.len()
961                - ctx.num_non_matching_private_rules,
962            len_private: ctx.num_non_matching_private_rules,
963        }
964    }
965
966    /// Specifies whether the iterator should yield private rules.
967    ///
968    /// This does not reset the iterator to its initial state, the iterator will
969    /// continue from its current position.
970    pub fn include_private(mut self, yes: bool) -> Self {
971        self.include_private = yes;
972        self
973    }
974}
975
976impl<'a, 'r> Iterator for NonMatchingRules<'a, 'r> {
977    type Item = Rule<'a, 'r>;
978
979    fn next(&mut self) -> Option<Self::Item> {
980        let rules = self.ctx.compiled_rules;
981
982        loop {
983            let rule_id = RuleId::from(self.iterator.next()?);
984            let rule_info = rules.get(rule_id);
985
986            if rule_info.is_private {
987                self.len_private -= 1;
988            } else {
989                self.len_non_private -= 1;
990            }
991
992            if self.include_private || !rule_info.is_private {
993                return Some(Rule { ctx: Some(self.ctx), rule_info, rules });
994            }
995        }
996    }
997}
998
999impl ExactSizeIterator for NonMatchingRules<'_, '_> {
1000    #[inline]
1001    fn len(&self) -> usize {
1002        if self.include_private {
1003            self.len_non_private + self.len_private
1004        } else {
1005            self.len_non_private
1006        }
1007    }
1008}
1009
/// Iterator that returns the outputs produced by YARA modules.
pub struct ModuleOutputs<'a, 'r> {
    /// Scan context whose `module_outputs` are looked up.
    ctx: &'a ScanContext<'r, 'a>,
    /// Value reported by `len()`; initialized with the number of entries in
    /// `ctx.module_outputs`.
    len: usize,
    /// Iterator over the registered modules; each module is looked up in
    /// `ctx.module_outputs` to find out whether it produced output.
    iterator: Box<dyn Iterator<Item = &'static dyn RegisteredModule> + 'a>,
}
1016
1017impl<'a, 'r> ModuleOutputs<'a, 'r> {
1018    fn new(ctx: &'a ScanContext<'r, 'a>) -> Self {
1019        Self {
1020            ctx,
1021            len: ctx.module_outputs.len(),
1022            iterator: Box::new(crate::modules::registered_modules()),
1023        }
1024    }
1025}
1026
impl ExactSizeIterator for ModuleOutputs<'_, '_> {
    /// Returns the value of the `len` field, which is initialized with the
    /// number of entries in the scan context's `module_outputs`.
    #[inline]
    fn len(&self) -> usize {
        self.len
    }
}
1033
1034impl<'a> Iterator for ModuleOutputs<'a, '_> {
1035    type Item = (&'a str, &'a dyn MessageDyn);
1036
1037    fn next(&mut self) -> Option<Self::Item> {
1038        loop {
1039            let module = self.iterator.next()?;
1040            if let Some(module_output) = self
1041                .ctx
1042                .module_outputs
1043                .get(module.root_descriptor().full_name())
1044            {
1045                return Some((module.name(), module_output.as_ref()));
1046            }
1047        }
1048    }
1049}
1050
#[cfg(test)]
mod snippet_tests {
    use super::DataSnippets;
    use std::collections::BTreeMap;

    /// Exercises `get` and `get_with_context` on snippets stored as
    /// multiple blocks, including two blocks that overlap (the ones at
    /// offsets 50 and 52).
    #[test]
    fn snippets_multiblock() {
        let blocks = BTreeMap::from([
            (0, vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
            (50, vec![50, 51, 52, 53, 54]),
            (52, vec![52, 53]),
            (100, vec![100, 101, 102, 103]),
        ]);

        let snippets = DataSnippets::MultiBlock(blocks);

        // (requested range, expected data)
        for (range, expected) in [
            (0..2, Some([0, 1].as_slice())),
            (1..3, Some([1, 2].as_slice())),
            (8..9, Some([8].as_slice())),
            (10..11, None),
            (50..51, Some([50].as_slice())),
            (51..53, Some([51, 52].as_slice())),
            (50..54, Some([50, 51, 52, 53].as_slice())),
            (52..54, Some([52, 53].as_slice())),
            (52..55, Some([52, 53, 54].as_slice())),
            (52..53, Some([52].as_slice())),
            (50..56, None),
            (100..101, Some([100].as_slice())),
            (101..103, Some([101, 102].as_slice())),
        ] {
            assert_eq!(snippets.get(range), expected);
        }

        // (requested range, context size, expected (data, relative range))
        for (range, context_size, expected) in [
            (0..2, 1, Some(([0, 1, 2].as_slice(), 0..2))),
            (0..2, 2, Some(([0, 1, 2, 3].as_slice(), 0..2))),
            (2..4, 2, Some(([0, 1, 2, 3, 4, 5].as_slice(), 2..4))),
            (51..52, 3, Some(([50, 51, 52, 53, 54].as_slice(), 1..2))),
            (102..103, 3, Some(([100, 101, 102, 103].as_slice(), 2..3))),
        ] {
            assert_eq!(
                snippets.get_with_context(range, context_size),
                expected
            );
        }
    }

    /// Exercises `get` and `get_with_context` on snippets backed by a
    /// single block of data.
    #[test]
    fn snippets_singleblock() {
        let text = b"Lorem ipsum dolor sit amet".to_vec();
        let snippets =
            DataSnippets::SingleBlock(super::ScannedData::Vec(text));

        // Plain lookups: (requested range, expected data)
        for (range, expected) in [
            (6..11, Some(b"ipsum".as_slice())),
            (0..5, Some(b"Lorem".as_slice())),
            (20..26, Some(b"t amet".as_slice())),
            (27..30, None),
        ] {
            assert_eq!(snippets.get(range), expected);
        }

        // Lookups with 5 bytes of context:
        // (requested range, expected (data, relative range))
        for (range, expected) in [
            (6..11, Some((b"orem ipsum dolo".as_slice(), 5..10))),
            (0..5, Some((b"Lorem ipsu".as_slice(), 0..5))),
            (20..26, Some((b"or sit amet".as_slice(), 5..11))),
            (32..35, None),
        ] {
            assert_eq!(snippets.get_with_context(range, 5), expected);
        }
    }
}