yara_x/compiler/mod.rs
1/*! Compiles YARA source code into binary form.
2
3YARA rules must be compiled before they can be used for scanning data. This
4module implements the YARA compiler.
5*/
6
7use std::cell::RefCell;
8use std::collections::hash_map::Entry;
9use std::collections::{HashMap, HashSet};
10use std::io::Write;
11use std::path::{Path, PathBuf};
12use std::rc::Rc;
13#[cfg(feature = "logging")]
14use std::time::Instant;
15use std::{env, fmt, fs, io, iter};
16
17use bitflags::bitflags;
18use bstr::{BStr, ByteSlice};
19use itertools::{Itertools, MinMaxResult, izip};
20#[cfg(feature = "logging")]
21use log::*;
22use regex_syntax::hir;
23use rustc_hash::{FxHashMap, FxHashSet};
24use serde::{Deserialize, Serialize};
25use walrus::FunctionId;
26
27use yara_x_parser::ast;
28use yara_x_parser::ast::{AST, Ident, Import, Include, RuleFlags, WithSpan};
29use yara_x_parser::cst::CSTStream;
30use yara_x_parser::{Parser, Span};
31
32use crate::compiler::base64::base64_patterns;
33use crate::compiler::emit::{EmitContext, emit_rule_condition};
34use crate::compiler::errors::{
35 CompileError, ConflictingRuleIdentifier, CustomError, DuplicateRule,
36 DuplicateTag, EmitWasmError, InvalidRegexp, InvalidUTF8, UnknownModule,
37 UnusedPattern,
38};
39use crate::compiler::report::ReportBuilder;
40use crate::compiler::{CompileContext, VarStack};
41use crate::modules::BUILTIN_MODULES;
42use crate::re::hir::{ChainedPattern, ChainedPatternGap};
43use crate::string_pool::{BStringPool, StringPool};
44use crate::symbols::{StackedSymbolTable, Symbol, SymbolLookup, SymbolTable};
45use crate::types::{Func, Struct, TypeValue};
46use crate::utils::cast;
47use crate::variables::{Variable, VariableError, is_valid_identifier};
48use crate::wasm::builder::WasmModuleBuilder;
49use crate::wasm::{WasmSymbols, wasm_exports};
50use crate::{re, wasm};
51
52pub(crate) use crate::compiler::atoms::*;
53pub(crate) use crate::compiler::context::*;
54pub(crate) use crate::compiler::ir::*;
55
56use crate::compiler::wsh::WarningSuppressionHook;
57use crate::errors::{
58 CircularIncludes, IncludeError, IncludeNotAllowed, IncludeNotFound,
59 InvalidWarningCode,
60};
61use crate::linters::LinterResult;
62use crate::models::PatternKind;
63
64#[doc(inline)]
65pub use crate::compiler::report::Patch;
66#[doc(inline)]
67pub use crate::compiler::rules::*;
68#[doc(inline)]
69pub use crate::compiler::warnings::*;
70
71mod atoms;
72mod context;
73mod emit;
74mod ir;
75mod report;
76mod rules;
77
78#[cfg(test)]
79mod tests;
80
81pub mod base64;
82pub mod errors;
83pub mod linters;
84pub mod warnings;
85pub mod wsh;
86
/// A structure that describes some YARA source code.
///
/// This structure contains a `&str` pointing to the code itself, and an
/// optional `origin` that tells where the source code came from. The
/// most common use for `origin` is indicating the path of the file from
/// where the source code was obtained, but it can contain any arbitrary
/// string. This string, if provided, will appear in error messages. For
/// example, in this error message `origin` was set to `some_file.yar`:
///
/// ```text
/// error: syntax error
/// --> some_file.yar:4:17
/// |
/// 4 | ... more details
/// ```
///
/// # Example
///
/// ```
/// use yara_x::SourceCode;
/// let src = SourceCode::from("rule test { condition: true }").with_origin("some_file.yar");
/// ```
///
#[derive(Debug, Clone)]
pub struct SourceCode<'src> {
    /// A reference to the source code itself. This is a BStr because the
    /// source code could contain non-UTF8 content.
    pub(crate) raw: &'src BStr,
    /// A reference to the source code after validating that it is valid
    /// UTF-8. `None` while validation has not been performed yet; the
    /// `as_str` method performs the validation and caches its result here.
    pub(crate) valid: Option<&'src str>,
    /// An optional string that tells which is the origin of the code. Usually
    /// a file path.
    pub(crate) origin: Option<String>,
}
122
123impl<'src> SourceCode<'src> {
124 /// Sets a string that describes the origin of the source code.
125 ///
126 /// This is usually the path of the file that contained the source code,
127 /// but it can be an arbitrary string. The origin appears in error and
128 /// warning messages.
129 pub fn with_origin<S: Into<String>>(self, origin: S) -> Self {
130 Self { raw: self.raw, valid: self.valid, origin: Some(origin.into()) }
131 }
132
133 /// Returns the source code as a `&str`.
134 ///
135 /// If the source code is not valid UTF-8 it will return an error.
136 fn as_str(&mut self) -> Result<&'src str, bstr::Utf8Error> {
137 match self.valid {
138 // We already know that source code is valid UTF-8, return it
139 // as is.
140 Some(s) => Ok(s),
141 // We don't know yet if the source code is valid UTF-8, some
142 // validation must be done. If validation fails an error is
143 // returned.
144 None => {
145 let src = self.raw.to_str()?;
146 self.valid = Some(src);
147 Ok(src)
148 }
149 }
150 }
151}
152
153impl<'src> From<&'src str> for SourceCode<'src> {
154 /// Creates a new [`SourceCode`] from a `&str`.
155 fn from(src: &'src str) -> Self {
156 // The input is a &str, therefore it's guaranteed to be valid UTF-8
157 // and the `valid` field can be initialized.
158 Self { raw: BStr::new(src), valid: Some(src), origin: None }
159 }
160}
161
162impl<'src> From<&'src [u8]> for SourceCode<'src> {
163 /// Creates a new [`SourceCode`] from a `&[u8]`.
164 ///
165 /// As `src` is not guaranteed to be a valid UTF-8 string, the parser will
166 /// verify it and return an error if invalid UTF-8 characters are found.
167 fn from(src: &'src [u8]) -> Self {
168 // The input is a &[u8], its content is not guaranteed to be valid
169 // UTF-8 so the `valid` field is set to `None`. The `validate_utf8`
170 // function will be called for validating the source code before
171 // being parsed.
172 Self { raw: BStr::new(src), valid: None, origin: None }
173 }
174}
175
176/// Compiles a YARA source code.
177///
178/// This function receives any type that implements the `Into<SourceCode>` trait,
179/// which includes `&str`, `String` and [`SourceCode`] and produces compiled
180/// [`Rules`] that can be passed later to the scanner.
181///
182/// # Example
183///
184/// ```rust
185/// # use yara_x;
186/// let rules = yara_x::compile("rule test { condition: true }").unwrap();
187/// let mut scanner = yara_x::Scanner::new(&rules);
188/// let results = scanner.scan("Lorem ipsum".as_bytes()).unwrap();
189/// assert_eq!(results.matching_rules().len(), 1);
190/// ```
191pub fn compile<'src, S>(src: S) -> Result<Rules, CompileError>
192where
193 S: Into<SourceCode<'src>>,
194{
195 let mut compiler = Compiler::new();
196 compiler.add_source(src)?;
197 Ok(compiler.build())
198}
199
/// Structure that contains information about a rule namespace.
///
/// Includes NamespaceId, the IdentId corresponding to the namespace's
/// identifier, and the symbol table that contains the symbols defined
/// in the namespace.
struct Namespace {
    /// Numeric ID that uniquely identifies this namespace.
    id: NamespaceId,
    /// ID of the namespace's identifier within the compiler's `ident_pool`.
    ident_id: IdentId,
    /// Symbol table with the symbols defined in this namespace. The `Rc` is
    /// shared with the compiler's stacked symbol table.
    symbols: Rc<RefCell<SymbolTable>>,
}
210
/// Compiles YARA source code producing a set of compiled [`Rules`].
///
/// The two most important methods in this type are [`Compiler::add_source`]
/// and [`Compiler::build`]. The former tells the compiler which YARA source
/// code must be compiled, and can be called multiple times with different
/// set of rules. The latter consumes the compiler and produces a set of
/// compiled [`Rules`].
///
/// # Example
///
/// ```rust
/// # use yara_x;
/// let mut compiler = yara_x::Compiler::new();
///
/// compiler
///     .add_source(r#"
///         rule always_true {
///             condition: true
///         }"#)?
///     .add_source(r#"
///         rule always_false {
///             condition: false
///         }"#)?;
///
/// let rules = compiler.build();
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
pub struct Compiler<'a> {
    /// Mimics YARA behavior with respect to regular expressions, allowing
    /// some constructs that are invalid in YARA-X by default, like invalid
    /// escape sequences.
    relaxed_re_syntax: bool,

    /// If true, the compiler hoists loop-invariant expressions (i.e: those
    /// that don't vary on each iteration of the loop), moving them outside
    /// the loop.
    hoisting: bool,

    /// List of directories where the compiler should look for included files.
    /// If `None`, the current directory is used.
    include_dirs: Option<Vec<PathBuf>>,

    /// If true, slow patterns produce an error instead of a warning. A slow
    /// pattern is one with atoms shorter than 2 bytes.
    error_on_slow_pattern: bool,

    /// If true, a slow loop produces an error instead of a warning. A slow
    /// rule is one where the upper bound of the loop is potentially large.
    /// Like for example: `for all x in (0..filesize) : (...)`
    error_on_slow_loop: bool,

    /// If true, include statements are allowed. If false, include statements
    /// will produce a compile error.
    includes_enabled: bool,

    /// Tracks the paths of the files that have been included by nested
    /// includes. This is useful for detecting circular includes and resolving
    /// relative includes.
    include_stack: Vec<PathBuf>,

    /// Used for generating error and warning reports.
    report_builder: ReportBuilder,

    /// The main symbol table used by the compiler. This is actually a stack of
    /// symbol tables where the bottom-most table is the one that contains
    /// global identifiers like built-in functions and user-defined global
    /// identifiers.
    symbol_table: StackedSymbolTable,

    /// Symbol table that contains the global identifiers, including built-in
    /// functions like `uint8`, `uint16`, etc. This symbol table is at the
    /// bottom of the `symbol_table`'s stack. This field is used when we
    /// need to access the global symbol table directly, for example for
    /// defining new global variables.
    global_symbols: Rc<RefCell<SymbolTable>>,

    /// Information about the current namespace (i.e: the namespace that will
    /// contain any new rules added via a call to `add_source`).
    current_namespace: Namespace,

    /// Pool that contains all the identifiers used in the rules. Each
    /// identifier appears only once, even if they are used by multiple
    /// rules. For example, the pool contains a single copy of the common
    /// identifier `$a`. Each identifier has a unique 32-bit [`IdentId`]
    /// that can be used for retrieving the identifier from the pool.
    ident_pool: StringPool<IdentId>,

    /// Similar to `ident_pool` but for regular expressions found in rule
    /// conditions.
    regexp_pool: StringPool<RegexpId>,

    /// Similar to `ident_pool` but for string literals found in the source
    /// code. As literal strings in YARA can contain arbitrary bytes, a pool
    /// capable of storing [`bstr::BString`] must be used, the [`String`] type
    /// only accepts valid UTF-8. This pool also stores the atoms extracted
    /// from patterns.
    lit_pool: BStringPool<LiteralId>,

    /// Intermediate representation (IR) tree for condition of the rule that
    /// is currently being compiled. After compiling each rule the tree is
    /// cleared, but it will be reused for the next rule.
    ir: IR,

    /// Builder for creating the WebAssembly module that contains the code
    /// for all rule conditions.
    wasm_mod: WasmModuleBuilder,

    /// Struct that contains the IDs for WASM memories, global and local
    /// variables, etc.
    wasm_symbols: WasmSymbols,

    /// Map that contains the functions that are callable from WASM code. These
    /// are the same functions in [`static@WASM_EXPORTS`]. This map allows to
    /// retrieve the WASM [`FunctionId`] from the fully qualified mangled
    /// function name (e.g: `my_module.my_struct.my_func@ii@i`)
    wasm_exports: FxHashMap<String, FunctionId>,

    /// Map that associates a `PatternId` to a certain filesize bound.
    ///
    /// A condition like `filesize < 1000 and $a` only matches if `filesize`
    /// is less than 1000. Therefore, the pattern `$a` does not need to be
    /// checked for files of size 1000 bytes or larger.
    ///
    /// In this case, the map will contain an entry associating `$a` to a
    /// `FilesizeBounds` value like:
    /// `FilesizeBounds{start: Bound::Unbounded, end: Bound:Excluded(1000)}`.
    filesize_bounds: FxHashMap<PatternId, FilesizeBounds>,

    /// A vector with all the rules that have been compiled. A [`RuleId`] is
    /// an index in this vector.
    rules: Vec<RuleInfo>,

    /// Next (not used yet) [`PatternId`].
    next_pattern_id: PatternId,

    /// Map used for de-duplicating patterns. Keys are the pattern's IR and
    /// values are the `PatternId` assigned to each pattern. Every time a rule
    /// declares a pattern, this map is used for determining if the same
    /// pattern (i.e: a pattern with exactly the same IR) was already declared
    /// by some other rule. If that's the case, that same pattern is re-used.
    patterns: FxHashMap<Pattern, PatternId>,

    /// A vector with all the sub-patterns from all the rules. A
    /// [`SubPatternId`] is an index in this vector.
    sub_patterns: Vec<(PatternId, SubPattern)>,

    /// Vector that contains the [`SubPatternId`] for sub-patterns that can
    /// match only at a fixed offset within the scanned data. These sub-patterns
    /// are not added to the Aho-Corasick automaton.
    anchored_sub_patterns: Vec<SubPatternId>,

    /// A vector that contains all the atoms generated from the patterns.
    /// Each atom has an associated [`SubPatternId`] that indicates the
    /// sub-pattern it belongs to.
    atoms: Vec<SubPatternAtom>,

    /// A vector that contains the code for all regexp patterns (this includes
    /// hex patterns which are just a special case of regexp). The code for
    /// each regexp is appended to the vector, during the compilation process
    /// and the atoms extracted from the regexp contain offsets within this
    /// vector. This vector contains both forward and backward code.
    re_code: Vec<u8>,

    /// Vector with the names of all the imported modules. The vector contains
    /// the [`IdentId`] corresponding to the module's identifier.
    imported_modules: Vec<IdentId>,

    /// Names of modules that are known, but not supported. When an `import`
    /// statement with one of these modules is found, the statement is accepted
    /// without causing an error, but a warning is raised to let the user know
    /// that the module is not supported. Any rule that depends on an unsupported
    /// module is ignored.
    ignored_modules: FxHashSet<String>,

    /// Keys in this map are the modules that are banned, and values are a pair
    /// of strings with the title and message for the error that will be shown
    /// if the banned module is imported.
    banned_modules: FxHashMap<String, (String, String)>,

    /// Keys in this map are the name of rules that will be ignored because they
    /// depend on unsupported modules, either directly or indirectly. Values are
    /// the names of the unsupported modules they depend on.
    ignored_rules: FxHashMap<String, String>,

    /// Structure where each field corresponds to a global identifier or a module
    /// imported by the rules. For fields corresponding to modules, the value is
    /// the structure that describes the module.
    root_struct: Struct,

    /// Warnings generated while compiling the rules.
    warnings: Warnings,

    /// Errors generated while compiling the rules.
    errors: Vec<CompileError>,

    /// Features enabled for this compiler. See [`Compiler::enable_feature`]
    /// for details.
    features: FxHashSet<String>,

    /// Optional writer where the compiler writes the IR produced by each rule.
    /// This is used for test cases and debugging.
    ir_writer: Option<Box<dyn Write>>,

    /// Linters applied to each rule during compilation. The linters are added
    /// to the compiler using [`Compiler::add_linter`].
    linters: Vec<Box<dyn linters::Linter + 'a>>,
}
420
421impl<'a> Compiler<'a> {
    /// Creates a new YARA compiler.
    pub fn new() -> Self {
        let mut ident_pool = StringPool::new();
        let mut symbol_table = StackedSymbolTable::new();

        // The first layer pushed becomes the bottom of the stacked symbol
        // table; it holds the global symbols shared by every namespace.
        let global_symbols = symbol_table.push_new();

        // Add symbols for built-in functions like uint8, uint16, etc.
        for export in wasm_exports()
            // Get only the public exports not belonging to a YARA module.
            .filter(|e| e.public && e.builtin())
        {
            let func = Rc::new(Func::from(export.mangled_name));
            let symbol = Symbol::Func(func);

            global_symbols.borrow_mut().insert(export.name, symbol);
        }

        // Create the default namespace. Rule identifiers will be added to this
        // namespace, unless the user defines some namespace explicitly by calling
        // `Compiler::new_namespace`.
        let default_namespace = Namespace {
            id: NamespaceId(0),
            ident_id: ident_pool.get_or_intern("default"),
            symbols: symbol_table.push_new(),
        };

        // At this point the symbol table (which is a stacked symbol table) has
        // two layers, the global symbols at the bottom, and the default
        // namespace on top of it. Calls to `Compiler::new_namespace` replace
        // the top layer (default namespace) with a new one, but the bottom
        // layer remains, so the global symbols are shared by all namespaces.

        // Create a WASM module builder. This object is used for building the
        // WASM module that will execute the rule conditions.
        let mut wasm_mod = WasmModuleBuilder::new();

        // Tuning knobs controlling how many namespaces/rules are grouped into
        // each function of the generated WASM module — presumably chosen
        // empirically; see `WasmModuleBuilder` for the exact semantics.
        wasm_mod.namespaces_per_func(20);
        wasm_mod.rules_per_func(10);

        let wasm_symbols = wasm_mod.wasm_symbols();
        let wasm_exports = wasm_mod.wasm_exports();

        let mut ir = IR::new();

        // Constant folding in the IR is gated behind a crate feature.
        if cfg!(feature = "constant-folding") {
            ir.constant_folding(true);
        }

        Self {
            ir,
            ident_pool,
            global_symbols,
            symbol_table,
            wasm_mod,
            wasm_symbols,
            wasm_exports,
            relaxed_re_syntax: false,
            hoisting: false,
            error_on_slow_pattern: false,
            error_on_slow_loop: false,
            next_pattern_id: PatternId(0),
            current_namespace: default_namespace,
            features: FxHashSet::default(),
            warnings: Warnings::default(),
            errors: Vec::new(),
            rules: Vec::new(),
            sub_patterns: Vec::new(),
            anchored_sub_patterns: Vec::new(),
            atoms: Vec::new(),
            re_code: Vec::new(),
            imported_modules: Vec::new(),
            ignored_modules: FxHashSet::default(),
            banned_modules: FxHashMap::default(),
            ignored_rules: FxHashMap::default(),
            filesize_bounds: FxHashMap::default(),
            root_struct: Struct::new().make_root(),
            report_builder: ReportBuilder::new(),
            lit_pool: BStringPool::new(),
            regexp_pool: StringPool::new(),
            patterns: FxHashMap::default(),
            ir_writer: None,
            linters: Vec::new(),
            include_dirs: None,
            includes_enabled: true,
            include_stack: Vec::new(),
        }
    }
510
511 /// Adds a directory to the list of directories where the compiler should
512 /// look for included files.
513 ///
514 /// When an `include` statement is found, the compiler looks for the included
515 /// file in the directories added with this function, in the order they were
516 /// added.
517 ///
518 /// If this function is not called, the compiler will only look for included
519 /// files in the current directory.
520 ///
521 /// Use [Compiler::enable_includes] for controlling whether include statements
522 /// are allowed or not.
523 ///
524 /// # Example
525 ///
526 /// ```no_run
527 /// # use yara_x::Compiler;
528 /// # use std::path::Path;
529 /// let mut compiler = Compiler::new();
530 /// compiler.add_include_dir("/path/to/rules")
531 /// .add_include_dir("/another/path");
532 /// ```
533 pub fn add_include_dir<P: AsRef<std::path::Path>>(
534 &mut self,
535 dir: P,
536 ) -> &mut Self {
537 self.include_dirs
538 .get_or_insert_default()
539 .push(dir.as_ref().to_path_buf());
540 self
541 }
542
    /// Adds some YARA source code to be compiled.
    ///
    /// The `src` parameter accepts any type that implements [`Into<SourceCode>`],
    /// such as `&str`, `&[u8]`, or an instance of [`SourceCode`] itself. The source
    /// code may include one or more YARA rules.
    ///
    /// You can call this function multiple times to add different sets of rules.
    /// If the provided source code contains syntax or semantic errors that prevent
    /// compilation, the function returns the first encountered error. All errors
    /// found during compilation are also recorded and can be retrieved using
    /// [`Compiler::errors`].
    ///
    /// Even if previous calls to this function resulted in compilation errors,
    /// you may continue adding additional rules. Only successfully compiled rules
    /// will be included in the final rule set.
    pub fn add_source<'src, S>(
        &mut self,
        src: S,
    ) -> Result<&mut Self, CompileError>
    where
        S: Into<SourceCode<'src>>,
    {
        // Convert `src` into an instance of `SourceCode` if it is something
        // else, like a &str.
        let mut src = src.into();

        // Register source code, even before validating that it is UTF-8. In
        // case of UTF-8 encoding errors we want to report that error too,
        // and we need the source code registered for creating the report.
        self.report_builder.register_source(&src);

        // Make sure that the source code is valid UTF-8, or return an error
        // if otherwise.
        let ast = match src.as_str() {
            Ok(src) => {
                // Parse the source code and build the Abstract Syntax Tree.
                let cst = Parser::new(src.as_bytes());
                // Hook the parser's warning stream so that suppressed
                // warnings are recorded instead of being reported.
                let cst =
                    WarningSuppressionHook::from(cst).hook(|warning, span| {
                        self.warnings.suppress(warning, span);
                    });

                AST::from(CSTStream::new(src.as_bytes(), cst))
            }
            Err(err) => {
                let span_start = err.valid_up_to();
                let span_end = if let Some(error_len) = err.error_len() {
                    // `error_len` is the number of invalid UTF-8 bytes found
                    // after `span_start`. Round the number up to the next 3
                    // bytes boundary because invalid bytes are replaced with
                    // the Unicode replacement characters that takes 3 bytes.
                    // This way the span ends at a valid UTF-8 character
                    // boundary.
                    span_start + error_len.next_multiple_of(3)
                } else {
                    // `error_len` is `None` when the input ended while more
                    // bytes were expected; use an empty span at the error
                    // position.
                    span_start
                };

                let err = InvalidUTF8::build(
                    &self.report_builder,
                    self.report_builder.span_to_code_loc(Span(
                        span_start as u32..span_end as u32,
                    )),
                );

                self.errors.push(err.clone());
                return Err(err);
            }
        };

        // Store the current length of the `errors` vector, so that we can
        // know if more errors were added.
        let existing_errors = self.errors.len();

        // Compile the top-level items found in the AST.
        self.c_items(ast.items());

        // Suppressed warnings are per-source; discard them now that this
        // source has been processed.
        self.warnings.clear_suppressed();

        // Convert the parse errors stored in the AST into `CompileError` and
        // append them to the compiler's error list.
        self.errors.extend(
            ast.into_errors()
                .into_iter()
                .map(|err| CompileError::from(&self.report_builder, err)),
        );

        // More errors were added? Return the first error that was added.
        if self.errors.len() > existing_errors {
            return Err(self.errors[existing_errors].clone());
        }

        Ok(self)
    }
634
635 /// Defines a global variable and sets its initial value.
636 ///
637 /// Global variables must be defined before adding any YARA source code
638 /// that references them via [`Compiler::add_source`]. Once defined, the
639 /// variable's initial value is preserved in the compiled [`Rules`] and
640 /// will be used unless overridden.
641 ///
642 /// When scanning, each scanner instance can modify the initial value of
643 /// the variable using [`crate::Scanner::set_global`].
644 ///
645 /// `T` can be any type that implements [`TryInto<Variable>`], including:
646 /// `i64`, `i32`, `i16`, `i8`, `u32`, `u16`, `u8`, `f64`, `f32`, `bool`,
647 /// `&str`, `String` and [`serde_json::Value`].
648 ///
649 /// When using a [`serde_json::Value`] there are certain limitations: keys
650 /// in maps must be valid YARA identifiers (the first character must be `_`
651 /// or a letter, the remaining ones must be `_`, a letter or a digit),
652 /// because these maps are translated into YARA structures. Also, all items
653 /// in an array must have the same type.
654 ///
655 /// ```
656 /// # use yara_x::Compiler;
657 /// assert!(Compiler::new()
658 /// .define_global("some_int", 1)?
659 /// .add_source("rule some_int_not_zero {condition: some_int != 0}")
660 /// .is_ok());
661 ///
662 /// # Ok::<(), Box<dyn std::error::Error>>(())
663 /// ```
664 pub fn define_global<T: TryInto<Variable>>(
665 &mut self,
666 ident: &str,
667 value: T,
668 ) -> Result<&mut Self, VariableError>
669 where
670 VariableError: From<<T as TryInto<Variable>>::Error>,
671 {
672 if !is_valid_identifier(ident) {
673 return Err(VariableError::InvalidIdentifier(ident.to_string()));
674 }
675
676 let var: Variable = value.try_into()?;
677 let type_value: TypeValue = var.into();
678
679 if self.root_struct.add_field(ident, type_value).is_some() {
680 return Err(VariableError::AlreadyExists(ident.to_string()));
681 }
682
683 self.global_symbols
684 .borrow_mut()
685 .insert(ident, self.root_struct.lookup(ident).unwrap());
686
687 Ok(self)
688 }
689
690 /// Creates a new namespace.
691 ///
692 /// Further calls to [`Compiler::add_source`] will put the rules under the
693 /// newly created namespace. If the new namespace is named as the current
694 /// one, no new namespace is created.
695 ///
696 /// In the example below both rules `foo` and `bar` are put into the same
697 /// namespace (the default namespace), therefore `bar` can use `foo` as
698 /// part of its condition, and everything is ok.
699 ///
700 /// ```
701 /// # use yara_x::Compiler;
702 /// assert!(Compiler::new()
703 /// .add_source("rule foo {condition: true}")?
704 /// .add_source("rule bar {condition: foo}")
705 /// .is_ok());
706 ///
707 /// # Ok::<(), Box<dyn std::error::Error>>(())
708 /// ```
709 ///
710 /// In this other example the rule `foo` is put in the default namespace,
711 /// but the rule `bar` is put under the `bar` namespace. This implies that
712 /// `foo` is not visible to `bar`, and the second call to `add_source`
713 /// fails.
714 ///
715 /// ```
716 /// # use yara_x::Compiler;
717 /// assert!(Compiler::new()
718 /// .add_source("rule foo {condition: true}")?
719 /// .new_namespace("bar")
720 /// .add_source("rule bar {condition: foo}")
721 /// .is_err());
722 ///
723 /// # Ok::<(), Box<dyn std::error::Error>>(())
724 /// ```
725 pub fn new_namespace(&mut self, namespace: &str) -> &mut Self {
726 let current_namespace = self
727 .ident_pool
728 .get(self.current_namespace.ident_id)
729 .expect("expecting a namespace");
730 // If the current namespace is already named as the new namespace
731 // this function has no effect.
732 if namespace == current_namespace {
733 return self;
734 }
735 // Remove the symbol table corresponding to the current namespace.
736 self.symbol_table.pop().expect("expecting a namespace");
737 // Create a new namespace. The NamespaceId is simply the ID of the
738 // previous namespace + 1.
739 self.current_namespace = Namespace {
740 id: NamespaceId(self.current_namespace.id.0 + 1),
741 ident_id: self.ident_pool.get_or_intern(namespace),
742 symbols: self.symbol_table.push_new(),
743 };
744 self.ignored_rules.clear();
745 self.wasm_mod.new_namespace();
746 self
747 }
748
749 /// Builds the source code previously added to the compiler.
750 ///
751 /// This function consumes the compiler and returns an instance of
752 /// [`Rules`].
753 pub fn build(self) -> Rules {
754 // Finish building the WASM module.
755 let wasm_mod = self.wasm_mod.build().emit_wasm();
756
757 #[cfg(feature = "logging")]
758 let start = Instant::now();
759
760 // Compile the WASM module for the current platform. This panics
761 // if the WASM code is invalid, which should not happen as the code is
762 // emitted by YARA itself. If this ever happens is probably because
763 // wrong WASM code is being emitted.
764 let compiled_wasm_mod = wasm::runtime::Module::from_binary(
765 wasm::get_engine(),
766 wasm_mod.as_slice(),
767 )
768 .expect("WASM module is not valid");
769
770 #[cfg(feature = "logging")]
771 info!("WASM module build time: {:?}", Instant::elapsed(&start));
772
773 // The structure that contains the global variables is serialized before
774 // being passed to the `Rules` struct. This is because we want `Rules`
775 // to be `Send`, so that it can be shared with scanners running in
776 // different threads. In order for `Rules` to be `Send`, it can't
777 // contain fields that are not `Send`. As `Struct` is not `Send` we
778 // can't have a `Struct` field in `Rules`, so what we have a `Vec<u8>`
779 // with a serialized version of the struct.
780 //
781 // An alternative is changing the `Rc` in some variants of `TypeValue`
782 // to `Arc`, as the root cause that prevents `Struct` from being `Send`
783 // is the use of `Rc` in `TypeValue`.
784 let serialized_globals = bincode::serde::encode_to_vec(
785 &self.root_struct,
786 bincode::config::standard().with_variable_int_encoding(),
787 )
788 .expect("failed to serialize global variables");
789
790 let mut rules = Rules {
791 serialized_globals,
792 wasm_mod,
793 compiled_wasm_mod: Some(compiled_wasm_mod),
794 relaxed_re_syntax: self.relaxed_re_syntax,
795 ac: None,
796 num_patterns: self.next_pattern_id.0 as usize,
797 ident_pool: self.ident_pool,
798 regexp_pool: self.regexp_pool,
799 lit_pool: self.lit_pool,
800 imported_modules: self.imported_modules,
801 rules: self.rules,
802 sub_patterns: self.sub_patterns,
803 anchored_sub_patterns: self.anchored_sub_patterns,
804 atoms: self.atoms,
805 re_code: self.re_code,
806 warnings: self.warnings.into(),
807 filesize_bounds: self.filesize_bounds,
808 };
809
810 rules.build_ac_automaton();
811 rules
812 }
813
814 /// Adds a linter to the compiler.
815 ///
816 /// Linters perform additional checks to each YARA rule, generating
817 /// warnings when a rule does not meet the linter's requirements. See
818 /// [`crate::linters`] for a list of available linters.
819 pub fn add_linter<L: linters::Linter + 'a>(
820 &mut self,
821 linter: L,
822 ) -> &mut Self {
823 self.linters.push(Box::new(linter));
824 self
825 }
826
827 /// Enables a feature on this compiler.
828 ///
829 /// When defining the structure of a module in a `.proto` file, you can
830 /// specify that certain fields are accessible only when one or more
831 /// features are enabled. For example, the snippet below shows the
832 /// definition of a field named `requires_foo_and_bar`, which can be
833 /// accessed only when both features "foo" and "bar" are enabled.
834 ///
835 /// ```protobuf
836 /// optional uint64 requires_foo_and_bar = 500 [
837 /// (yara.field_options) = {
838 /// acl: [
839 /// {
840 /// allow_if: "foo",
841 /// error_title: "foo is required",
842 /// error_label: "this field was used without foo"
843 /// },
844 /// {
845 /// allow_if: "bar",
846 /// error_title: "bar is required",
847 /// error_label: "this field was used without bar"
848 /// }
849 /// ]
850 /// }
851 /// ];
852 /// ```
853 ///
854 /// If some of the required features are not enabled, using this field in
855 /// a YARA rule will cause an error while compiling the rules. The error
856 /// looks like:
857 ///
858 /// ```text
859 /// error[E034]: foo is required
860 /// --> line:5:29
861 /// |
862 /// 5 | test_proto2.requires_foo_and_bar == 0
863 /// | ^^^^^^^^^^^^^^^^^^^^ this field was used without foo
864 /// |
865 /// ```
866 ///
867 /// Notice that both the title and label in the error message are defined
868 /// in the .proto file.
869 ///
870 /// # Important
871 ///
872 /// This API is hidden from the public documentation because it is unstable
873 /// and subject to change.
874 #[doc(hidden)]
875 pub fn enable_feature<F: Into<String>>(
876 &mut self,
877 feature: F,
878 ) -> &mut Self {
879 self.features.insert(feature.into());
880 self
881 }
882
883 /// Tell the compiler that a YARA module is not supported.
884 ///
885 /// Import statements for ignored modules will be ignored without errors,
886 /// but a warning will be issued. Any rule that makes use of an ignored
887 /// module will be also ignored, while the rest of the rules that don't
888 /// rely on that module will be correctly compiled.
889 pub fn ignore_module<M: Into<String>>(&mut self, module: M) -> &mut Self {
890 self.ignored_modules.insert(module.into());
891 self
892 }
893
894 /// Tell the compiler that a YARA module can't be used.
895 ///
896 /// Import statements for the banned module will cause an error. The error
897 /// message can be customized by using the given error title and message.
898 ///
899 /// If this function is called multiple times with the same module name,
900 /// the error title and message will be updated.
901 pub fn ban_module<M: Into<String>, T: Into<String>, E: Into<String>>(
902 &mut self,
903 module: M,
904 error_title: T,
905 error_message: E,
906 ) -> &mut Self {
907 self.banned_modules
908 .insert(module.into(), (error_title.into(), error_message.into()));
909 self
910 }
911
912 /// Specifies whether the compiler should produce colorful error messages.
913 ///
914 /// Colorized error messages contain ANSI escape sequences that make them
915 /// look nicer on compatible consoles.
916 ///
917 /// The default setting is `false`.
918 pub fn colorize_errors(&mut self, yes: bool) -> &mut Self {
919 self.report_builder.with_colors(yes);
920 self
921 }
922
923 /// Sets the maximum number of columns in error messages.
924 ///
925 /// The default value is 140.
926 pub fn errors_max_width(&mut self, width: usize) -> &mut Self {
927 self.report_builder.max_width(width);
928 self
929 }
930
931 /// Enables or disables a specific type of warning.
932 ///
933 /// Each warning type has a description code (i.e: `slow_pattern`,
934 /// `unsupported_module`, etc.). This function allows to enable or disable
935 /// a specific type of warning identified by the given code.
936 ///
937 /// Returns an error if the given warning code doesn't exist.
938 pub fn switch_warning(
939 &mut self,
940 code: &str,
941 enabled: bool,
942 ) -> Result<&mut Self, InvalidWarningCode> {
943 self.warnings.switch_warning(code, enabled)?;
944 Ok(self)
945 }
946
947 /// Enables or disables all warnings.
948 pub fn switch_all_warnings(&mut self, enabled: bool) -> &mut Self {
949 self.warnings.switch_all_warnings(enabled);
950 self
951 }
952
953 /// Enables a more relaxed syntax check for regular expressions.
954 ///
955 /// YARA-X enforces stricter regular expression syntax compared to YARA.
956 /// For instance, YARA accepts invalid escape sequences and treats them
957 /// as literal characters (e.g., \R is interpreted as a literal 'R'). It
958 /// also allows some special characters to appear unescaped, inferring
959 /// their meaning from the context (e.g., `{` and `}` in `/foo{}bar/` are
960 /// literal, but in `/foo{0,1}bar/` they form the repetition operator
961 /// `{0,1}`).
962 ///
963 /// This setting controls whether the compiler should mimic YARA's behavior,
964 /// allowing constructs that YARA-X doesn't accept by default.
965 ///
966 /// This should be called before any rule is added to the compiler.
967 ///
968 /// # Panics
969 ///
970 /// If called after adding rules to the compiler.
971 pub fn relaxed_re_syntax(&mut self, yes: bool) -> &mut Self {
972 if !self.rules.is_empty() {
973 panic!("calling relaxed_re_syntax in non-empty compiler")
974 }
975 self.relaxed_re_syntax = yes;
976 self
977 }
978
979 /// When enabled, slow patterns produce an error instead of a warning.
980 ///
981 /// This is disabled by default.
982 pub fn error_on_slow_pattern(&mut self, yes: bool) -> &mut Self {
983 self.error_on_slow_pattern = yes;
984 self
985 }
986
987 /// When enabled, potentially slow loops produce an error instead of a
988 /// warning.
989 ///
990 /// This is disabled by default.
991 pub fn error_on_slow_loop(&mut self, yes: bool) -> &mut Self {
992 self.error_on_slow_loop = yes;
993 self
994 }
995
996 /// Controls whether `include` statements are allowed.
997 ///
998 /// By default, the compiler allows the use of `include` statements, which
999 /// include the content of other files. When includes are disabled, any
1000 /// attempt to use an `include` statement will result in a compile error.
1001 ///
1002 /// ```
1003 /// # use yara_x::Compiler;
1004 /// let mut compiler = Compiler::new();
1005 /// compiler.enable_includes(false); // Disable includes
1006 /// ```
1007 pub fn enable_includes(&mut self, yes: bool) -> &mut Self {
1008 self.includes_enabled = yes;
1009 self
1010 }
1011
1012 /// When enabled, the compiler tries to optimize rule conditions.
1013 ///
1014 /// The optimizations usually reduce condition evaluation times, specially
1015 /// in complex rules that contain loops, but it can break short-circuit
1016 /// evaluation rules because some subexpressions are not executed in the
1017 /// order they appear in the source code.
1018 ///
1019 /// This is a very experimental feature.
1020 #[doc(hidden)]
1021 pub fn condition_optimization(&mut self, yes: bool) -> &mut Self {
1022 self.hoisting(yes)
1023 }
1024
1025 pub(crate) fn hoisting(&mut self, yes: bool) -> &mut Self {
1026 self.hoisting = yes;
1027 self
1028 }
1029
1030 /// Retrieves all errors generated by the compiler.
1031 ///
1032 /// This method returns every error encountered during the compilation,
1033 /// across all invocations of [`Compiler::add_source`].
1034 #[inline]
1035 pub fn errors(&self) -> &[CompileError] {
1036 self.errors.as_slice()
1037 }
1038
1039 /// Returns the warnings emitted by the compiler.
1040 ///
1041 /// This method returns every warning issued during the compilation,
1042 /// across all invocations of [`Compiler::add_source`].
1043 #[inline]
1044 pub fn warnings(&self) -> &[Warning] {
1045 self.warnings.as_slice()
1046 }
1047
1048 /// Emits a `.wasm` file with the WASM module generated by the compiler.
1049 ///
1050 /// This file can be inspected and converted to WASM text format by using
1051 /// third-party [tooling](https://github.com/WebAssembly/wabt). This is
1052 /// useful for debugging issues with incorrectly emitted WASM code.
1053 pub fn emit_wasm_file<P>(self, path: P) -> Result<(), EmitWasmError>
1054 where
1055 P: AsRef<Path>,
1056 {
1057 let mut wasm_mod = self.wasm_mod.build();
1058 Ok(wasm_mod.emit_wasm_file(path)?)
1059 }
1060
1061 /// Sets a writer where the compiler will write the Intermediate
1062 /// Representation (IR) of compiled conditions.
1063 ///
1064 /// This is used for testing and debugging purposes.
1065 #[doc(hidden)]
1066 pub fn set_ir_writer<W: Write + 'static>(&mut self, w: W) -> &mut Self {
1067 self.ir_writer = Some(Box::new(w));
1068 self
1069 }
1070}
1071
impl Compiler<'_> {
    /// Registers a new sub-pattern, together with the atoms derived from it.
    ///
    /// `atoms` yields the atoms extracted from the sub-pattern, and `f`
    /// converts each of them into a [`SubPatternAtom`] associated to the
    /// newly created sub-pattern. Returns the ID assigned to the new
    /// sub-pattern.
    fn add_sub_pattern<I, F, A>(
        &mut self,
        pattern_id: PatternId,
        sub_pattern: SubPattern,
        atoms: I,
        f: F,
    ) -> SubPatternId
    where
        I: Iterator<Item = A>,
        F: Fn(SubPatternId, A) -> SubPatternAtom,
    {
        // The sub-pattern's ID is the index it will occupy in the
        // `sub_patterns` vector.
        let sub_pattern_id = SubPatternId(self.sub_patterns.len() as u32);

        // Sub-patterns that are anchored at some fixed offset are not added to
        // the Aho-Corasick automata. Instead, their IDs are added to the
        // anchored_sub_patterns list.
        if let SubPattern::Literal { anchored_at: Some(_), .. } = sub_pattern {
            self.anchored_sub_patterns.push(sub_pattern_id);
        } else {
            self.atoms.extend(atoms.map(|atom| f(sub_pattern_id, atom)));
        }

        self.sub_patterns.push((pattern_id, sub_pattern));

        sub_pattern_id
    }

    /// Checks if another rule, module or variable has the given identifier and
    /// returns an error in that case.
    fn check_for_existing_identifier(
        &self,
        ident: &Ident,
    ) -> Result<(), CompileError> {
        if let Some(symbol) = self.symbol_table.lookup(ident.name) {
            return match symbol {
                // Found another rule with the same name.
                Symbol::Rule { rule_id, .. } => Err(DuplicateRule::build(
                    &self.report_builder,
                    ident.name.to_string(),
                    self.report_builder.span_to_code_loc(ident.span()),
                    self.rules
                        .get(rule_id.0 as usize)
                        .unwrap()
                        .ident_ref
                        .clone(),
                )),
                // Found another symbol that is not a rule, but has the same
                // name.
                _ => Err(ConflictingRuleIdentifier::build(
                    &self.report_builder,
                    ident.name.to_string(),
                    self.report_builder.span_to_code_loc(ident.span()),
                )),
            };
        }
        Ok(())
    }

    /// Checks that tags are not duplicate.
    ///
    /// Returns a [`DuplicateTag`] error on the first repeated tag found.
    fn check_for_duplicate_tags(
        &self,
        tags: &[Ident],
    ) -> Result<(), CompileError> {
        let mut s = HashSet::new();
        for tag in tags {
            // `insert` returns false when the tag was already in the set.
            if !s.insert(tag.name) {
                return Err(DuplicateTag::build(
                    &self.report_builder,
                    tag.name.to_string(),
                    self.report_builder.span_to_code_loc(tag.span()),
                ));
            }
        }
        Ok(())
    }

    /// Interns a literal in the literals pool.
    ///
    /// If `wide` is true the literal gets zeroes interleaved between each byte
    /// before being interned.
    fn intern_literal(&mut self, literal: &[u8], wide: bool) -> LiteralId {
        let wide_pattern;
        let literal_bytes = if wide {
            wide_pattern = make_wide(literal);
            wide_pattern.as_bytes()
        } else {
            literal
        };
        self.lit_pool.get_or_intern(literal_bytes)
    }

    /// Takes a snapshot of the compiler's state at this moment.
    ///
    /// The returned [`Snapshot`] can be passed to [`Compiler::restore_snapshot`]
    /// for restoring the compiler to the state it was when the snapshot was
    /// taken.
    ///
    /// This is useful when the compilation of a rule fails, for restoring the
    /// compiler to the state it had before starting compiling the failed rule,
    /// which avoids leaving junk in the compiler's internal structures.
    fn take_snapshot(&self) -> Snapshot {
        Snapshot {
            next_pattern_id: self.next_pattern_id,
            rules_len: self.rules.len(),
            atoms_len: self.atoms.len(),
            re_code_len: self.re_code.len(),
            sub_patterns_len: self.sub_patterns.len(),
            symbol_table_len: self.symbol_table.len(),
        }
    }

    /// Restores the compiler's to a previous state.
    ///
    /// Use [`Compiler::take_snapshot`] for taking a snapshot of the compiler's
    /// state.
    fn restore_snapshot(&mut self, snapshot: Snapshot) {
        self.next_pattern_id = snapshot.next_pattern_id;
        self.rules.truncate(snapshot.rules_len);
        self.sub_patterns.truncate(snapshot.sub_patterns_len);
        self.re_code.truncate(snapshot.re_code_len);
        self.atoms.truncate(snapshot.atoms_len);
        self.symbol_table.truncate(snapshot.symbol_table_len);

        // Pattern IDs that are >= next_pattern_id, are being discarded. Any pattern
        // or file size bound associated to such IDs must be removed.

        self.patterns
            .retain(|_, pattern_id| *pattern_id < snapshot.next_pattern_id);

        self.filesize_bounds
            .retain(|pattern_id, _| *pattern_id < snapshot.next_pattern_id);
    }

    /// Returns true if every byte in the slice is 0x00, every byte is 0x90,
    /// or every byte is 0xff.
    ///
    /// An empty slice returns true. Mixing any two of those values, or
    /// finding any other byte, returns false.
    fn common_byte_repetition(bytes: &[u8]) -> bool {
        // One flag per candidate value; a flag remains true only while all
        // bytes seen so far are equal to that value.
        let mut all_x00 = true;
        let mut all_x90 = true;
        let mut all_xff = true;

        for b in bytes {
            match *b {
                0x00 => {
                    all_x90 = false;
                    all_xff = false;
                }
                0x90 => {
                    all_x00 = false;
                    all_xff = false;
                }
                0xff => {
                    all_x00 = false;
                    all_x90 = false;
                }
                _ => return false,
            }
            // Early exit: once all three flags are cleared the slice mixes
            // different values and can't be a common repetition.
            if !all_x00 && !all_x90 && !all_xff {
                return false;
            }
        }

        true
    }

    /// Reads the file specified by an `include` statement.
    ///
    /// Tries to read the file in the include directories that were specified
    /// with [`Compiler::add_include_dir`], or in the current directory, if
    /// no include directories were specified.
    ///
    /// The function returns both the content and the path of the included file
    /// relative to the current directory, or an error if the included file could
    /// not be read.
    fn read_included_file(
        &mut self,
        include: &Include,
    ) -> Result<(Vec<u8>, PathBuf), CompileError> {
        // Helper that reads the file at `path` and returns its content
        // together with the path, canonicalized and, when possible, made
        // relative to the current working directory.
        let read_file =
            |path: PathBuf| -> Result<(Vec<u8>, PathBuf), io::Error> {
                let mut path = path.canonicalize()?;
                let content = fs::read(&path)?;

                if let Ok(cwd) =
                    env::current_dir().and_then(|dir| dir.canonicalize())
                    && let Ok(relative_path) = path.strip_prefix(cwd)
                {
                    path = relative_path.to_path_buf();
                }

                Ok((content, path))
            };

        // Look for the included file in the directory at the top of the
        // include stack.
        if let Some(dir) =
            self.include_stack.last().and_then(|path| path.parent())
            && let Ok(result) = read_file(dir.join(include.file_name))
        {
            return Ok(result);
        }

        // If one or more include directory were specified, try to find the
        // included file in them, in the order they were specified. Otherwise,
        // try to find the included file in the current directory.
        if let Some(include_dirs) = &self.include_dirs {
            if let Some(result) = include_dirs
                .iter()
                .find_map(|dir| read_file(dir.join(include.file_name)).ok())
            {
                Ok(result)
            } else {
                Err(IncludeNotFound::build(
                    &self.report_builder,
                    include.file_name.to_string(),
                    self.report_builder.span_to_code_loc(include.span()),
                ))
            }
        } else {
            read_file(PathBuf::from(include.file_name)).map_err(|err| {
                // A missing file gets a dedicated error; any other I/O
                // failure is reported as a generic include error.
                if err.kind() == io::ErrorKind::NotFound {
                    IncludeNotFound::build(
                        &self.report_builder,
                        include.file_name.to_string(),
                        self.report_builder.span_to_code_loc(include.span()),
                    )
                } else {
                    IncludeError::build(
                        &self.report_builder,
                        self.report_builder.span_to_code_loc(include.span()),
                        err.to_string(),
                    )
                }
            })
        }
    }
}
1308
1309impl Compiler<'_> {
    /// Compiles the top-level items of a source file: `import` statements,
    /// `include` statements and rule declarations.
    ///
    /// Errors produced while processing individual items are accumulated in
    /// `self.errors`; compilation continues with the remaining items.
    fn c_items<'a, I>(&mut self, items: I)
    where
        I: Iterator<Item = &'a ast::Item<'a>>,
    {
        // Maps each module name already imported in this file to the code
        // location of its first import, used to detect duplicated imports.
        let mut already_imported = FxHashMap::default();

        for item in items {
            match item {
                ast::Item::Import(import) => {
                    // Checks that all imported modules actually exist, and
                    // raise warnings in case of duplicated imports within
                    // the same source file. For each module add a symbol to
                    // the current namespace.
                    if let Some(existing_import) = already_imported.insert(
                        &import.module_name,
                        self.report_builder.span_to_code_loc(import.span()),
                    ) {
                        let duplicated_import = self
                            .report_builder
                            .span_to_code_loc(import.span());

                        let mut warning = warnings::DuplicateImport::build(
                            &self.report_builder,
                            import.module_name.to_string(),
                            duplicated_import.clone(),
                            existing_import,
                        );

                        // NOTE(review): the patch replaces the duplicated
                        // import's text with an empty string in the report,
                        // presumably to support automatic removal of the
                        // duplicate — confirm against the report machinery.
                        warning.report_mut().patch(duplicated_import, "");

                        self.warnings.add(|| warning)
                    }
                    // Import the module. This updates `self.root_struct` if
                    // necessary.
                    if let Err(err) = self.c_import(import) {
                        self.errors.push(err);
                    }
                }
                ast::Item::Include(include) => {
                    // Return an error if includes are disabled
                    if !self.includes_enabled {
                        self.errors.push(IncludeNotAllowed::build(
                            &self.report_builder,
                            self.report_builder
                                .span_to_code_loc(include.span()),
                        ));
                        continue;
                    }

                    let (included_src, included_path) =
                        match self.read_included_file(include) {
                            Ok(included) => included,
                            Err(err) => {
                                self.errors.push(err);
                                continue;
                            }
                        };

                    // Detect circular includes: the file being included is
                    // already somewhere in the current include chain.
                    if self.include_stack.contains(&included_path) {
                        self.errors.push(CircularIncludes::build(
                            &self.report_builder,
                            self.report_builder
                                .span_to_code_loc(include.span()),
                            Some(format!(
                                "include dependencies:\n{}",
                                self.include_stack
                                    .iter()
                                    .enumerate()
                                    .map(|(i, path)| format!(
                                        "{:>width$}↳ {}",
                                        "",
                                        path.display(),
                                        width = i * 2
                                    ))
                                    .collect::<Vec<_>>()
                                    .join("\n")
                            )),
                        ));
                        continue;
                    }

                    // Save the current source ID from the report builder in
                    // order to restore it later. Any recursive call to
                    // `add_source` will change the current source ID, and we
                    // need to restore after `add_source` returns.
                    let source_id =
                        self.report_builder.get_current_source_id().unwrap();

                    let source_code =
                        SourceCode::from(included_src.as_slice()).with_origin(
                            // On Windows the path separators are backslashes,
                            // but we want to use slashes.
                            included_path.to_str().unwrap().replace("\\", "/"),
                        );

                    self.include_stack.push(included_path);

                    // Any error generated while processing the included source
                    // code will be added to `self.errors`. The error returned
                    // by `add_source` is simply the first of the added errors,
                    // we don't need to handle the error here.
                    let _ = self.add_source(source_code);

                    // Restore the current source ID to the value it had before
                    // calling `add_source`.
                    self.report_builder.set_current_source_id(source_id);

                    self.include_stack.pop().unwrap();
                }
                ast::Item::Rule(rule) => {
                    if let Err(err) = self.c_rule(rule) {
                        self.errors.push(err);
                    }
                }
            }
        }
    }
1427
    /// Compiles a single YARA rule.
    ///
    /// On success the rule is appended to `self.rules`, its patterns are
    /// processed (atoms extracted and sub-patterns registered), a symbol for
    /// the rule is inserted in the current namespace's symbol table, and the
    /// WASM code for the rule's condition is emitted. On failure the compiler
    /// is restored to the state it had before this function was called,
    /// except for strings already interned in `ident_pool` and `lit_pool`,
    /// which can't be removed.
    fn c_rule(&mut self, rule: &ast::Rule) -> Result<(), CompileError> {
        // Check if another rule, module or variable has the same identifier
        // and return an error in that case.
        self.check_for_existing_identifier(&rule.identifier)?;

        // Check that rule tags, if any, don't contain duplicates.
        if let Some(tags) = &rule.tags {
            self.check_for_duplicate_tags(tags.as_slice())?;
        }

        // Check the rule with all the linters. Only the first linter error is
        // returned from this function; any additional linter errors are pushed
        // directly to `self.errors`.
        let mut first_linter_err: Option<CompileError> = None;
        for linter in self.linters.iter() {
            match linter.check(&self.report_builder, rule) {
                LinterResult::Ok => {}
                LinterResult::Warn(warning) => {
                    self.warnings.add(|| warning);
                }
                LinterResult::Warns(warnings) => {
                    for warning in warnings {
                        self.warnings.add(|| warning);
                    }
                }
                LinterResult::Err(err) => {
                    if first_linter_err.is_none() {
                        first_linter_err = Some(err);
                    } else {
                        self.errors.push(err);
                    }
                }
            }
        }
        if let Some(err) = first_linter_err {
            return Err(err);
        }

        // Take snapshot of the current compiler state. In case of error
        // compiling the current rule this snapshot allows restoring the
        // compiler to the state it had before starting compiling the rule.
        // This way we don't leave too much junk, like atoms, or sub-patterns
        // corresponding to failed rules. However, there is some junk left
        // behind in `ident_pool` and `lit_pool`, because once a string is
        // added to one of these pools it can't be removed.
        let snapshot = self.take_snapshot();

        // Intern the rule tags, if any, in the identifier pool.
        let tags: Vec<IdentId> = rule
            .tags
            .iter()
            .flatten()
            .map(|t| self.ident_pool.get_or_intern(t.name))
            .collect();

        // Helper function that converts from `ast::MetaValue` to
        // `compiler::rules::MetaValue`.
        let mut convert_meta_value = |value: &ast::MetaValue| match value {
            ast::MetaValue::Integer((i, _)) => MetaValue::Integer(*i),
            ast::MetaValue::Float((f, _)) => MetaValue::Float(*f),
            ast::MetaValue::Bool((b, _)) => MetaValue::Bool(*b),
            ast::MetaValue::String((s, _)) => {
                MetaValue::String(self.lit_pool.get_or_intern(s))
            }
            ast::MetaValue::Bytes((s, _)) => {
                MetaValue::Bytes(self.lit_pool.get_or_intern(s))
            }
        };

        // Build a vector of pairs (IdentId, MetaValue) for every meta defined
        // in the rule.
        let metadata = rule
            .meta
            .iter()
            .flatten()
            .map(|m| {
                (
                    self.ident_pool.get_or_intern(m.identifier.name),
                    convert_meta_value(&m.value),
                )
            })
            .collect();

        let mut rule_patterns = Vec::new();

        let mut ctx = CompileContext {
            ir: &mut self.ir,
            relaxed_re_syntax: self.relaxed_re_syntax,
            error_on_slow_loop: self.error_on_slow_loop,
            one_shot_symbol_table: None,
            symbol_table: &mut self.symbol_table,
            report_builder: &self.report_builder,
            current_rule_patterns: &mut rule_patterns,
            warnings: &mut self.warnings,
            vars: VarStack::new(),
            for_of_depth: 0,
            features: &self.features,
            loop_iteration_multiplier: 1,
        };

        // Convert the patterns from AST to IR. This populates the
        // `ctx.current_rule_patterns` vector.
        if let Err(err) = patterns_from_ast(&mut ctx, rule) {
            drop(ctx);
            self.restore_snapshot(snapshot);
            return Err(err);
        }

        // Convert the condition from AST to IR. Also updates the patterns
        // with information about whether they are used in the condition and
        // if they are anchored or not.
        let condition = rule_condition_from_ast(&mut ctx, rule);

        // `ctx` holds mutable borrows of `self`; drop it before touching
        // the compiler's state again.
        drop(ctx);

        // Search for patterns that are very common byte repetitions like:
        //
        // 00 00 00 00 00 00 ....
        // 90 90 90 90 90 90 ....
        // FF FF FF FF FF FF ....
        //
        // Raise a warning when such a pattern is found, except in the
        // following cases:
        //
        // 1) When the pattern is anchored, because anchored pattern can appear
        //    only at a fixed offset and are not searched by Aho-Corasick.
        //
        // 2) When the pattern has attributes: xor, fullword, base64 or
        //    base64wide, because in those cases the real pattern is not that
        //    common.
        //
        // Note: this can't be done before calling `rule_condition_from_ast`,
        // because we don't know which patterns are anchored until the condition
        // is processed.
        for pat in rule_patterns.iter() {
            if pat.anchored_at().is_none()
                && !pat.pattern().flags().intersects(
                    PatternFlags::Xor
                        | PatternFlags::Fullword
                        | PatternFlags::Base64
                        | PatternFlags::Base64Wide,
                )
            {
                let literal_bytes = match pat.pattern() {
                    Pattern::Text(lit) => Some(lit.text.as_bytes()),
                    Pattern::Regexp(re) => re.hir.as_literal_bytes(),
                    Pattern::Hex(re) => re.hir.as_literal_bytes(),
                };
                if let Some(literal_bytes) = literal_bytes
                    && Self::common_byte_repetition(literal_bytes)
                {
                    self.warnings.add(|| {
                        warnings::SlowPattern::build(
                            &self.report_builder,
                            self.report_builder
                                .span_to_code_loc(pat.span().clone()),
                            None,
                        )
                    });
                }
            }
        }

        // In case of error, restore the compiler to the state it was before
        // entering this function. Also, if the error is due to an unknown
        // identifier, but the identifier is one of the unsupported modules,
        // the error is tolerated and a warning is issued instead.
        let mut condition = match condition {
            Ok(condition) => condition,
            Err(CompileError::UnknownIdentifier(unknown))
                if self.ignored_rules.contains_key(unknown.identifier())
                    || self.ignored_modules.contains(unknown.identifier()) =>
            {
                self.restore_snapshot(snapshot);

                // If the unknown identifier is a previously ignored rule,
                // warn that this rule is ignored because it depends on it;
                // otherwise the identifier is an ignored module.
                if let Some(module_name) =
                    self.ignored_rules.get(unknown.identifier())
                {
                    self.warnings.add(|| {
                        warnings::IgnoredRule::build(
                            &self.report_builder,
                            module_name.clone(),
                            rule.identifier.name.to_string(),
                            unknown.identifier_location().clone(),
                        )
                    });
                    self.ignored_rules.insert(
                        rule.identifier.name.to_string(),
                        module_name.clone(),
                    );
                } else {
                    self.warnings.add(|| {
                        warnings::IgnoredModule::build(
                            &self.report_builder,
                            unknown.identifier().to_string(),
                            unknown.identifier_location().clone(),
                            Some(format!(
                                "the whole rule `{}` will be ignored",
                                rule.identifier.name
                            )),
                        )
                    });
                    self.ignored_rules.insert(
                        rule.identifier.name.to_string(),
                        unknown.identifier().to_string(),
                    );
                }

                return Ok(());
            }
            Err(err) => {
                self.restore_snapshot(snapshot);
                return Err(err);
            }
        };

        // Optionally apply the experimental hoisting optimization, which
        // produces a new condition root.
        if self.hoisting {
            condition = self.ir.hoisting();
        }

        // Analyze the condition and determine the bounds it imposes to
        // `filesize`, if any.
        let filesize_bounds = self.ir.filesize_bounds();

        // Set the bounds to all patterns in the rule. This must be done
        // before assigning the PatternId to each pattern, as the filesize
        // bounds are taken into account when determining if the pattern
        // is unique or re-used from a previous rule.
        if !filesize_bounds.unbounded() {
            for pattern in &mut rule_patterns {
                pattern.pattern_mut().set_filesize_bounds(&filesize_bounds);
            }
        }

        // Dump the IR (and filesize bounds, when present) to the debug
        // writer configured with `set_ir_writer`, if any.
        if let Some(w) = &mut self.ir_writer {
            writeln!(w, "RULE {}", rule.identifier.name).unwrap();
            writeln!(w, "{:?}", self.ir).unwrap();
            if !filesize_bounds.unbounded() {
                writeln!(w, "{filesize_bounds:?}\n",).unwrap();
            }
        }

        let mut pattern_ids = Vec::with_capacity(rule_patterns.len());
        let mut patterns = Vec::with_capacity(rule_patterns.len());
        let mut pending_patterns = HashSet::new();
        let mut num_private_patterns = 0;

        for pattern in &rule_patterns {
            // Raise an error if some pattern was not used, except if the
            // pattern identifier starts with underscore.
            if !pattern.in_use() && !pattern.identifier().starts_with("$_") {
                self.restore_snapshot(snapshot);
                return Err(UnusedPattern::build(
                    &self.report_builder,
                    pattern.identifier().name.to_string(),
                    self.report_builder
                        .span_to_code_loc(pattern.identifier().span()),
                ));
            }

            if pattern.pattern().flags().contains(PatternFlags::Private) {
                num_private_patterns += 1;
            }

            // Check if this pattern has been declared before, in this rule or
            // in some other rule. In such cases the pattern ID is re-used, and
            // we don't need to process (i.e: extract atoms and add them to
            // Aho-Corasick automaton) the pattern again. Two patterns are
            // considered equal if they are exactly the same, including any
            // modifiers associated to the pattern, both are non-anchored
            // or anchored at the same file offset, and if they have the same
            // file size bounds.
            let pattern_id =
                match self.patterns.entry(pattern.pattern().clone()) {
                    // The pattern already exists, return the existing ID.
                    Entry::Occupied(entry) => *entry.get(),
                    // The pattern didn't exist.
                    Entry::Vacant(entry) => {
                        let pattern_id = self.next_pattern_id;
                        self.next_pattern_id.incr(1);
                        pending_patterns.insert(pattern_id);
                        entry.insert(pattern_id);
                        pattern_id
                    }
                };

            let kind = match pattern.pattern() {
                Pattern::Text(_) => PatternKind::Text,
                Pattern::Regexp(_) => PatternKind::Regexp,
                Pattern::Hex(_) => PatternKind::Hex,
            };

            patterns.push(PatternInfo {
                kind,
                pattern_id,
                ident_id: self
                    .ident_pool
                    .get_or_intern(pattern.identifier().name),
                is_private: pattern
                    .pattern()
                    .flags()
                    .contains(PatternFlags::Private),
            });

            pattern_ids.push(pattern_id);
        }

        // The RuleId for the new rule is current length of `self.rules`. The
        // first rule has RuleId = 0.
        let rule_id = RuleId::from(self.rules.len());

        self.rules.push(RuleInfo {
            tags,
            metadata,
            patterns,
            num_private_patterns,
            is_global: rule.flags.contains(RuleFlags::Global),
            is_private: rule.flags.contains(RuleFlags::Private),
            namespace_id: self.current_namespace.id,
            namespace_ident_id: self.current_namespace.ident_id,
            ident_id: self.ident_pool.get_or_intern(rule.identifier.name),
            ident_ref: self
                .report_builder
                .span_to_code_loc(rule.identifier.span()),
        });

        // Process the patterns in the rule. This extracts the best atoms
        // from each pattern, adding them to the `self.atoms` vector, it
        // also creates one or more sub-patterns per pattern and adds them
        // to `self.sub_patterns`
        for (pattern_id, pattern) in
            izip!(pattern_ids.iter(), rule_patterns.into_iter())
        {
            if pending_patterns.contains(pattern_id) {
                let pattern_span = pattern.span().clone();
                match pattern.into_pattern() {
                    Pattern::Text(pattern) => {
                        self.c_literal_pattern(*pattern_id, pattern);
                    }
                    Pattern::Regexp(pattern) | Pattern::Hex(pattern) => {
                        if let Err(err) = self.c_regexp_pattern(
                            *pattern_id,
                            pattern,
                            pattern_span,
                        ) {
                            self.restore_snapshot(snapshot);
                            return Err(err);
                        }
                    }
                };
                if !filesize_bounds.unbounded()
                    && self
                        .filesize_bounds
                        .insert(*pattern_id, filesize_bounds.clone())
                        .is_some()
                {
                    // This should not happen.
                    panic!(
                        "modifying the file size bounds of an existing pattern"
                    )
                }
                pending_patterns.remove(pattern_id);
            }
        }

        // Create a new symbol of bool type for the rule.
        let new_symbol = Symbol::Rule {
            rule_id,
            is_global: rule.flags.contains(RuleFlags::Global),
        };

        // Insert the symbol in the symbol table corresponding to the
        // current namespace. This must be done after every fallible function
        // has been called; once the symbol is inserted in the symbol table,
        // it can't be undone.
        let existing_symbol = self
            .current_namespace
            .symbols
            .as_ref()
            .borrow_mut()
            .insert(rule.identifier.name, new_symbol);

        // No other symbol with the same identifier should exist.
        assert!(existing_symbol.is_none());

        // The last step is emitting the WASM code corresponding to the rule's
        // condition. This is done after every fallible function has been called
        // because once the code is emitted it cannot be undone, which means
        // that if this function fails after emitting the code, some code debris
        // will remain in the WASM module.
        let mut ctx = EmitContext {
            current_rule: self.rules.last_mut().unwrap(),
            lit_pool: &mut self.lit_pool,
            regexp_pool: &mut self.regexp_pool,
            wasm_symbols: &self.wasm_symbols,
            wasm_exports: &self.wasm_exports,
            exception_handler_stack: Vec::new(),
            lookup_list: Vec::new(),
            emit_search_for_pattern_stack: Vec::new(),
        };

        emit_rule_condition(
            &mut ctx,
            &self.ir,
            rule_id,
            condition,
            &mut self.wasm_mod,
        );

        Ok(())
    }
1836
    /// Compiles an `import` statement.
    ///
    /// Resolves `import.module_name` against [`BUILTIN_MODULES`], adds the
    /// module's structure to `self.root_struct` (if not already present) and
    /// creates a symbol for the module in the current namespace's symbol
    /// table.
    ///
    /// Unknown modules produce an [`UnknownModule`] error, unless they appear
    /// in `self.ignored_modules`, in which case only a warning is raised.
    /// Modules listed in `self.banned_modules` produce a custom error, but
    /// only after the symbol has been inserted (see the comment near the end
    /// of this function for the rationale).
    fn c_import(&mut self, import: &Import) -> Result<(), CompileError> {
        let module_name = import.module_name;
        let module = BUILTIN_MODULES.get(module_name);

        // Does a module with the given name actually exist? ...
        if module.is_none() {
            // The module does not exist, but it is included in the list
            // of unsupported modules. In such cases we don't raise an error,
            // only a warning.
            return if self.ignored_modules.iter().any(|m| m == module_name) {
                self.warnings.add(|| {
                    warnings::IgnoredModule::build(
                        &self.report_builder,
                        module_name.to_string(),
                        self.report_builder.span_to_code_loc(import.span()),
                        None,
                    )
                });
                Ok(())
            } else {
                // The module does not exist, and is not explicitly added to
                // the list of unsupported modules, that's an error.
                Err(UnknownModule::build(
                    &self.report_builder,
                    module_name.to_string(),
                    self.report_builder.span_to_code_loc(import.span()),
                ))
            };
        }

        // Yes, module exists.
        let module = module.unwrap();

        // If the module has not been added to `self.root_struct` and
        // `self.imported_modules`, do it.
        if !self.root_struct.has_field(module_name) {
            // Add the module to the list of imported modules.
            self.imported_modules
                .push(self.ident_pool.get_or_intern(module_name));

            // Create the `Struct` that describes the module.
            let module_struct = Rc::<Struct>::from(module);

            // Insert the module in the struct that contains all imported
            // modules. This struct contains all modules imported, from
            // all namespaces. Panic if the module was already in the struct.
            if self
                .root_struct
                .add_field(module_name, TypeValue::Struct(module_struct))
                .is_some()
            {
                panic!("duplicate module `{module_name}`")
            }
        }

        // Borrow the current namespace's symbol table for the rest of the
        // function; the `RefCell` borrow is released when it goes out of
        // scope.
        let mut symbol_table =
            self.current_namespace.symbols.as_ref().borrow_mut();

        // Create a symbol for the module and insert it in the symbol
        // table for this namespace, if it doesn't exist.
        if !symbol_table.contains(module_name) {
            symbol_table.insert(
                module_name,
                self.root_struct.lookup(module_name).unwrap(),
            );
        }

        // Is the module banned? If yes, produce an error. Notice however that
        // this check is done after the module has been added to the symbol
        // table because we don't want additional errors due to undefined
        // identifiers when the banned module is used in some rule condition.
        if let Some((error_title, error_msg)) =
            self.banned_modules.get(module_name)
        {
            return Err(CustomError::build(
                &self.report_builder,
                error_title.clone(),
                error_msg.clone(),
                self.report_builder.span_to_code_loc(import.span()),
            ));
        }

        Ok(())
    }
1921
    /// Compiles a literal pattern into one or more [`SubPattern`]s.
    ///
    /// Depending on the pattern's modifiers (`ascii`, `wide`, `xor`,
    /// `nocase`, `base64`, `base64wide`, `fullword`) this function creates
    /// the corresponding sub-patterns and registers them, together with the
    /// atoms that the Aho-Corasick pass uses for locating candidate matches.
    ///
    /// This function is infallible; invalid modifier combinations must have
    /// been rejected earlier in compilation — the `debug_assert!` calls
    /// below document those invariants.
    fn c_literal_pattern(
        &mut self,
        pattern_id: PatternId,
        pattern: LiteralPattern,
    ) {
        let full_word = pattern.flags.contains(PatternFlags::Fullword);
        let mut flags = SubPatternFlags::empty();

        if full_word {
            flags.insert(SubPatternFlags::FullwordLeft);
            flags.insert(SubPatternFlags::FullwordRight);
        }

        // Depending on the combination of `ascii` and `wide` modifiers, the
        // `main_patterns` vector will contain either the pattern's `ascii`
        // version, the `wide` version, or both. Each item in `main_patterns`
        // also contains the best atom for the pattern.
        let mut main_patterns = Vec::new();
        // Declared outside the `if` so the wide byte buffer outlives the
        // slice pushed into `main_patterns`.
        let wide_pattern;

        if pattern.flags.contains(PatternFlags::Wide) {
            wide_pattern = make_wide(pattern.text.as_bytes());
            main_patterns.push((
                wide_pattern.as_slice(),
                best_atom_in_bytes(wide_pattern.as_slice()),
                flags | SubPatternFlags::Wide,
            ));
        }

        if pattern.flags.contains(PatternFlags::Ascii) {
            main_patterns.push((
                pattern.text.as_bytes(),
                best_atom_in_bytes(pattern.text.as_bytes()),
                flags,
            ));
        }

        for (main_pattern, best_atom, flags) in main_patterns {
            let pattern_lit_id = self.lit_pool.get_or_intern(main_pattern);

            if pattern.flags.contains(PatternFlags::Xor) {
                // When `xor` is used, `base64`, `base64wide` and `nocase` are
                // not accepted.
                debug_assert!(!pattern.flags.contains(
                    PatternFlags::Base64
                        | PatternFlags::Base64Wide
                        | PatternFlags::Nocase,
                ));

                // `xor_range` is always `Some` when the `Xor` flag is set.
                let xor_range = pattern.xor_range.clone().unwrap();
                self.add_sub_pattern(
                    pattern_id,
                    SubPattern::Xor { pattern: pattern_lit_id, flags },
                    best_atom.xor_combinations(xor_range),
                    SubPatternAtom::from_atom,
                );
            } else if pattern.flags.contains(PatternFlags::Nocase) {
                // When `nocase` is used, `base64`, `base64wide` and `xor` are
                // not accepted.
                debug_assert!(!pattern.flags.contains(
                    PatternFlags::Base64
                        | PatternFlags::Base64Wide
                        | PatternFlags::Xor,
                ));

                self.add_sub_pattern(
                    pattern_id,
                    SubPattern::Literal {
                        pattern: pattern_lit_id,
                        flags: flags | SubPatternFlags::Nocase,
                        anchored_at: None,
                    },
                    best_atom.case_combinations(),
                    SubPatternAtom::from_atom,
                );
            }
            // Used `base64`, or `base64wide`, or both.
            else if pattern
                .flags
                .intersects(PatternFlags::Base64 | PatternFlags::Base64Wide)
            {
                // When `base64` or `base64wide` are used, `xor`, `fullword`
                // and `nocase` are not accepted.
                debug_assert!(!pattern.flags.contains(
                    PatternFlags::Xor
                        | PatternFlags::Fullword
                        | PatternFlags::Nocase,
                ));

                if pattern.flags.contains(PatternFlags::Base64) {
                    for (padding, base64_pattern) in base64_patterns(
                        main_pattern,
                        pattern.base64_alphabet.as_deref(),
                    ) {
                        let sub_pattern = if let Some(alphabet) =
                            pattern.base64_alphabet.as_deref()
                        {
                            SubPattern::CustomBase64 {
                                pattern: pattern_lit_id,
                                alphabet: self
                                    .lit_pool
                                    .get_or_intern(alphabet),
                                padding,
                            }
                        } else {
                            SubPattern::Base64 {
                                pattern: pattern_lit_id,
                                padding,
                            }
                        };

                        self.add_sub_pattern(
                            pattern_id,
                            sub_pattern,
                            iter::once({
                                let mut atom = best_atom_in_bytes(
                                    base64_pattern.as_slice(),
                                );
                                // Atoms for base64 patterns are always
                                // inexact, they require verification.
                                atom.make_inexact();
                                atom
                            }),
                            SubPatternAtom::from_atom,
                        );
                    }
                }

                if pattern.flags.contains(PatternFlags::Base64Wide) {
                    for (padding, base64_pattern) in base64_patterns(
                        main_pattern,
                        pattern.base64wide_alphabet.as_deref(),
                    ) {
                        let sub_pattern = if let Some(alphabet) =
                            pattern.base64wide_alphabet.as_deref()
                        {
                            SubPattern::CustomBase64Wide {
                                pattern: pattern_lit_id,
                                alphabet: self
                                    .lit_pool
                                    .get_or_intern(alphabet),
                                padding,
                            }
                        } else {
                            SubPattern::Base64Wide {
                                pattern: pattern_lit_id,
                                padding,
                            }
                        };

                        // The atom is derived from the wide version of the
                        // base64-encoded pattern.
                        let wide = make_wide(base64_pattern.as_slice());

                        self.add_sub_pattern(
                            pattern_id,
                            sub_pattern,
                            iter::once({
                                let mut atom =
                                    best_atom_in_bytes(wide.as_slice());
                                // Atoms for base64 patterns are always
                                // inexact, they require verification.
                                atom.make_inexact();
                                atom
                            }),
                            SubPatternAtom::from_atom,
                        );
                    }
                }
            } else {
                // No special modifiers: a plain literal sub-pattern.
                self.add_sub_pattern(
                    pattern_id,
                    SubPattern::Literal {
                        pattern: pattern_lit_id,
                        anchored_at: pattern.anchored_at,
                        flags,
                    },
                    iter::once(best_atom),
                    SubPatternAtom::from_atom,
                );
            }
        }
    }
2103
    /// Compiles a regexp (or hex) pattern into one or more [`SubPattern`]s.
    ///
    /// The pattern is first split at large gaps into a chain of sub-patterns
    /// (handled by [`Self::c_chain`]). If it can't be split and it reduces to
    /// a literal or alternation of literals it is handled by
    /// [`Self::c_alternation_literal`]. Otherwise it is compiled as a true
    /// regexp sub-pattern, once per `ascii`/`wide` variant.
    ///
    /// Returns an error if the regexp fails to compile or can match empty
    /// strings (propagated from [`Self::c_regexp`]).
    fn c_regexp_pattern(
        &mut self,
        pattern_id: PatternId,
        pattern: RegexpPattern,
        span: Span,
    ) -> Result<(), CompileError> {
        // Try splitting the regexp into multiple chained sub-patterns if it
        // contains large gaps. For example, `{ 01 02 03 [-] 04 05 06 }` is
        // split into `{ 01 02 03 }` and `{ 04 05 06 }`, where `{ 04 05 06 }`
        // is chained to `{ 01 02 03 }`.
        //
        // If the regexp can't be split then `head` is the whole regexp.
        let (head, tail) = pattern.hir.split_at_large_gaps();

        if !tail.is_empty() {
            // The pattern was split into multiple chained regexps.
            return self.c_chain(
                pattern_id,
                &head,
                &tail,
                pattern.flags,
                span,
            );
        }

        if head.is_alternation_literal() {
            // The pattern is either a literal, or an alternation of literals.
            // Examples:
            // /foo/
            // /foo|bar|baz/
            // { 01 02 03 }
            // { (01 02 03 | 04 05 06 ) }
            return self.c_alternation_literal(
                pattern_id,
                head,
                pattern.anchored_at,
                pattern.flags,
            );
        }

        // If this point is reached, this is a pattern that can't be split into
        // multiple chained patterns, and is neither a literal or alternation
        // of literals. Most patterns fall in this category.
        let mut flags = SubPatternFlags::empty();

        if pattern.flags.contains(PatternFlags::Nocase) {
            flags.insert(SubPatternFlags::Nocase);
        }

        if pattern.flags.contains(PatternFlags::Fullword) {
            flags.insert(SubPatternFlags::FullwordLeft);
            flags.insert(SubPatternFlags::FullwordRight);
        }

        if matches!(head.is_greedy(), Some(true)) {
            flags.insert(SubPatternFlags::GreedyRegexp);
        }

        let (atoms, is_fast_regexp) = self.c_regexp(&head, span)?;

        if is_fast_regexp {
            flags.insert(SubPatternFlags::FastRegexp);
        }

        if pattern.flags.contains(PatternFlags::Wide) {
            self.add_sub_pattern(
                pattern_id,
                SubPattern::Regexp { flags: flags | SubPatternFlags::Wide },
                atoms.iter().cloned().map(|atom| atom.make_wide()),
                SubPatternAtom::from_regexp_atom,
            );
        }

        if pattern.flags.contains(PatternFlags::Ascii) {
            self.add_sub_pattern(
                pattern_id,
                SubPattern::Regexp { flags },
                atoms.into_iter(),
                SubPatternAtom::from_regexp_atom,
            );
        }

        Ok(())
    }
2188
    /// Compiles a pattern that is a literal, or an alternation of literals.
    ///
    /// For each literal in the alternation, and for each requested variant
    /// (`ascii` and/or `wide`), a [`SubPattern::Literal`] is created. With
    /// `nocase`, the atoms include all case combinations of the best atom.
    ///
    /// The caller guarantees (via `is_alternation_literal`) that `hir` is a
    /// literal or an alternation of literals — hence the `unreachable!` in
    /// the final match arm.
    fn c_alternation_literal(
        &mut self,
        pattern_id: PatternId,
        hir: re::hir::Hir,
        anchored_at: Option<usize>,
        flags: PatternFlags,
    ) -> Result<(), CompileError> {
        let ascii = flags.contains(PatternFlags::Ascii);
        let wide = flags.contains(PatternFlags::Wide);
        let case_insensitive = flags.contains(PatternFlags::Nocase);
        let full_word = flags.contains(PatternFlags::Fullword);

        let mut flags = SubPatternFlags::empty();

        if case_insensitive {
            flags.insert(SubPatternFlags::Nocase);
        }

        if full_word {
            flags.insert(SubPatternFlags::FullwordLeft);
            flags.insert(SubPatternFlags::FullwordRight);
        }

        // Helper closure that creates the sub-pattern and its atoms for a
        // single literal, in either its ascii (`wide == false`) or wide
        // variant.
        let mut process_literal = |literal: &hir::Literal, wide: bool| {
            let pattern_lit_id =
                self.intern_literal(literal.0.as_bytes(), wide);

            let best_atom = best_atom_in_bytes(
                self.lit_pool.get_bytes(pattern_lit_id).unwrap(),
            );

            let flags =
                if wide { flags | SubPatternFlags::Wide } else { flags };

            let sub_pattern = SubPattern::Literal {
                pattern: pattern_lit_id,
                anchored_at,
                flags,
            };

            if case_insensitive {
                self.add_sub_pattern(
                    pattern_id,
                    sub_pattern,
                    best_atom.case_combinations(),
                    SubPatternAtom::from_atom,
                );
            } else {
                self.add_sub_pattern(
                    pattern_id,
                    sub_pattern,
                    iter::once(best_atom),
                    SubPatternAtom::from_atom,
                );
            }
        };

        // `inner` keeps ownership of the unwrapped HIR alive so that `hir`
        // (a borrow) remains valid in both branches below.
        let inner;

        // Strip a single capture group wrapper, if present, so the match
        // below only has to deal with literals and alternations.
        let hir = if let hir::HirKind::Capture(group) = hir.kind() {
            group.sub.as_ref()
        } else {
            inner = hir.into_inner();
            &inner
        };

        match hir.kind() {
            hir::HirKind::Literal(literal) => {
                if ascii {
                    process_literal(literal, false);
                }
                if wide {
                    process_literal(literal, true);
                }
            }
            hir::HirKind::Alternation(literals) => {
                let literals = literals
                    .iter()
                    .map(|l| cast!(l.kind(), hir::HirKind::Literal));
                for literal in literals {
                    if ascii {
                        process_literal(literal, false);
                    }
                    if wide {
                        process_literal(literal, true);
                    }
                }
            }
            // Guaranteed by the caller's `is_alternation_literal` check.
            _ => unreachable!(),
        }

        Ok(())
    }
2282
    /// Compiles a pattern that was split into a chain of sub-patterns.
    ///
    /// `leading` is the head of the chain and each entry in `trailing` is
    /// chained to the previous one at a distance bounded by its `gap`. Both
    /// the head and the tail pieces can be literals or regexps, and each is
    /// emitted once per requested `ascii`/`wide` variant. Only the head
    /// carries `FullwordLeft`, and only the last piece carries
    /// `FullwordRight` and `LastInChain`.
    ///
    /// Returns an error if any regexp piece fails to compile (propagated
    /// from [`Self::c_regexp`]).
    fn c_chain(
        &mut self,
        pattern_id: PatternId,
        leading: &re::hir::Hir,
        trailing: &[ChainedPattern],
        flags: PatternFlags,
        span: Span,
    ) -> Result<(), CompileError> {
        let ascii = flags.contains(PatternFlags::Ascii);
        let wide = flags.contains(PatternFlags::Wide);
        let case_insensitive = flags.contains(PatternFlags::Nocase);
        let full_word = flags.contains(PatternFlags::Fullword);

        let mut common_flags = SubPatternFlags::empty();

        if case_insensitive {
            common_flags.insert(SubPatternFlags::Nocase);
        }

        if matches!(leading.is_greedy(), Some(true)) {
            common_flags.insert(SubPatternFlags::GreedyRegexp);
        }

        // Placeholders; both are overwritten below before being used as a
        // `chained_to` target, because the head is always compiled first.
        let mut prev_sub_pattern_ascii = SubPatternId(0);
        let mut prev_sub_pattern_wide = SubPatternId(0);

        if let hir::HirKind::Literal(literal) = leading.kind() {
            let mut flags = common_flags;

            // Only the chain's head checks the left word boundary.
            if full_word {
                flags.insert(SubPatternFlags::FullwordLeft);
            }

            if ascii {
                prev_sub_pattern_ascii =
                    self.c_literal_chain_head(pattern_id, literal, flags);
            }

            if wide {
                prev_sub_pattern_wide = self.c_literal_chain_head(
                    pattern_id,
                    literal,
                    flags | SubPatternFlags::Wide,
                );
            };
        } else {
            let mut flags = common_flags;

            let (atoms, is_fast_regexp) =
                self.c_regexp(leading, span.clone())?;

            if is_fast_regexp {
                flags.insert(SubPatternFlags::FastRegexp);
            }

            if full_word {
                flags.insert(SubPatternFlags::FullwordLeft);
            }

            if wide {
                prev_sub_pattern_wide = self.add_sub_pattern(
                    pattern_id,
                    SubPattern::RegexpChainHead {
                        flags: flags | SubPatternFlags::Wide,
                    },
                    atoms.iter().cloned().map(|atom| atom.make_wide()),
                    SubPatternAtom::from_regexp_atom,
                );
            }

            if ascii {
                prev_sub_pattern_ascii = self.add_sub_pattern(
                    pattern_id,
                    SubPattern::RegexpChainHead { flags },
                    atoms.into_iter(),
                    SubPatternAtom::from_regexp_atom,
                );
            }
        }

        for (i, p) in trailing.iter().enumerate() {
            let mut flags = common_flags;

            // The last pattern in the chain has the `LastInChain` flag and
            // the `FullwordRight` if the original pattern was `Fullword`.
            // Patterns in the middle of the chain won't have either of these
            // flags.
            if i == trailing.len() - 1 {
                flags.insert(SubPatternFlags::LastInChain);
                if full_word {
                    flags.insert(SubPatternFlags::FullwordRight);
                }
            }

            if let hir::HirKind::Literal(literal) = p.hir.kind() {
                if wide {
                    prev_sub_pattern_wide = self.c_literal_chain_tail(
                        pattern_id,
                        literal,
                        prev_sub_pattern_wide,
                        p.gap.clone(),
                        flags | SubPatternFlags::Wide,
                    );
                };
                if ascii {
                    prev_sub_pattern_ascii = self.c_literal_chain_tail(
                        pattern_id,
                        literal,
                        prev_sub_pattern_ascii,
                        p.gap.clone(),
                        flags,
                    );
                }
            } else {
                if matches!(p.hir.is_greedy(), Some(true)) {
                    flags.insert(SubPatternFlags::GreedyRegexp);
                }

                let (atoms, is_fast_regexp) =
                    self.c_regexp(&p.hir, span.clone())?;

                if is_fast_regexp {
                    flags.insert(SubPatternFlags::FastRegexp);
                }

                if wide {
                    prev_sub_pattern_wide = self.add_sub_pattern(
                        pattern_id,
                        SubPattern::RegexpChainTail {
                            chained_to: prev_sub_pattern_wide,
                            gap: p.gap.clone(),
                            flags: flags | SubPatternFlags::Wide,
                        },
                        atoms.iter().cloned().map(|atom| atom.make_wide()),
                        SubPatternAtom::from_regexp_atom,
                    )
                }

                if ascii {
                    prev_sub_pattern_ascii = self.add_sub_pattern(
                        pattern_id,
                        SubPattern::RegexpChainTail {
                            chained_to: prev_sub_pattern_ascii,
                            gap: p.gap.clone(),
                            flags,
                        },
                        atoms.into_iter(),
                        SubPatternAtom::from_regexp_atom,
                    );
                }
            }
        }

        Ok(())
    }
2438
    /// Compiles a regexp HIR, appending its code to `self.re_code`.
    ///
    /// Returns the atoms extracted from the regexp and a boolean indicating
    /// whether it was compiled for `FastVM` (`true`) or `PikeVM` (`false`).
    ///
    /// Errors are returned when the regexp fails to compile, when it can
    /// match empty strings, or — if `self.error_on_slow_pattern` is set —
    /// when the extracted atoms are of poor quality (otherwise a
    /// `SlowPattern` warning is raised instead).
    fn c_regexp(
        &mut self,
        hir: &re::hir::Hir,
        span: Span,
    ) -> Result<(Vec<re::RegexpAtom>, bool), CompileError> {
        // When the `fast-regexp` feature is enabled, try to compile the regexp
        // for `FastVM` first, if it fails with `Error::FastIncompatible`, the
        // regexp is not compatible for `FastVM` and `PikeVM` must be used
        // instead.
        #[cfg(feature = "fast-regexp")]
        let (result, is_fast_regexp) = match re::fast::Compiler::new()
            .compile(hir, &mut self.re_code)
        {
            Err(re::Error::FastIncompatible) => (
                re::thompson::Compiler::new().compile(hir, &mut self.re_code),
                false,
            ),
            result => (result, true),
        };

        #[cfg(not(feature = "fast-regexp"))]
        let (result, is_fast_regexp) = (
            re::thompson::Compiler::new().compile(hir, &mut self.re_code),
            false,
        );

        let re_atoms = result.map_err(|err| {
            InvalidRegexp::build(
                &self.report_builder,
                err.to_string(),
                self.report_builder.span_to_code_loc(span.clone()),
                None,
            )
        })?;

        // Regexps that can match the empty string are rejected, they would
        // match at every position of the scanned data.
        if matches!(hir.minimum_len(), Some(0)) {
            return Err(InvalidRegexp::build(
                &self.report_builder,
                "this regexp can match empty strings".to_string(),
                self.report_builder.span_to_code_loc(span),
                None,
            ));
        }

        // Heuristics that classify the pattern as "slow" based on the number
        // and length of its atoms; short or missing atoms fire too often
        // during scanning.
        let (slow_pattern, note) =
            match re_atoms.iter().map(|re_atom| re_atom.atom.len()).minmax() {
                // No atoms, slow pattern.
                MinMaxResult::NoElements => (true, None),
                // Only one atom of len 0.
                MinMaxResult::OneElement(0) => (
                    true,
                    Some(
                        "this is an exceptionally extreme case that may severely degrade scanning throughput"
                            .to_string(),
                    ),
                ),
                // Only one atom shorter than 2 bytes, slow pattern.
                MinMaxResult::OneElement(len) if len < 2 => (true, None),
                // More than one atom, at least one is shorter than 2 bytes.
                MinMaxResult::MinMax(min, _) if min < 2 => (true, None),
                // More than 2700 atoms, all with exactly 2 bytes.
                // Why 2700?. The larger the number of atoms the higher the
                // odds of finding one of them in the data, which slows down
                // the scan. The regex [A-Za-z]{N,} (with N>=2) produces
                // (26+26)^2 = 2704 atoms. So, 2700 is large enough, but
                // produces a warning with the aforementioned regex.
                MinMaxResult::MinMax(2, 2) if re_atoms.len() > 2700 => {
                    (true, None)
                }
                // In all other cases the pattern is not slow.
                _ => (false, None),
            };

        if slow_pattern {
            if self.error_on_slow_pattern {
                return Err(errors::SlowPattern::build(
                    &self.report_builder,
                    self.report_builder.span_to_code_loc(span),
                    note,
                ));
            } else {
                self.warnings.add(|| {
                    warnings::SlowPattern::build(
                        &self.report_builder,
                        self.report_builder.span_to_code_loc(span),
                        note,
                    )
                });
            }
        }

        Ok((re_atoms, is_fast_regexp))
    }
2532
    /// Creates the [`SubPattern::LiteralChainHead`] for a literal that is
    /// the first piece of a chained pattern.
    ///
    /// The literal is interned (widened first when the `Wide` flag is set)
    /// and its atoms are extracted from the interned bytes. Returns the ID
    /// of the new sub-pattern so the next piece can be chained to it.
    fn c_literal_chain_head(
        &mut self,
        pattern_id: PatternId,
        literal: &hir::Literal,
        flags: SubPatternFlags,
    ) -> SubPatternId {
        let pattern_lit_id = self.intern_literal(
            literal.0.as_bytes(),
            flags.contains(SubPatternFlags::Wide),
        );
        self.add_sub_pattern(
            pattern_id,
            SubPattern::LiteralChainHead { pattern: pattern_lit_id, flags },
            extract_atoms(
                self.lit_pool.get_bytes(pattern_lit_id).unwrap(),
                flags,
            ),
            SubPatternAtom::from_atom,
        )
    }
2553
    /// Creates a [`SubPattern::LiteralChainTail`] for a literal that follows
    /// `chained_to` in a chained pattern, at a distance bounded by `gap`.
    ///
    /// The literal is interned (widened first when the `Wide` flag is set)
    /// and its atoms are extracted from the interned bytes. Returns the ID
    /// of the new sub-pattern so the next piece can be chained to it.
    fn c_literal_chain_tail(
        &mut self,
        pattern_id: PatternId,
        literal: &hir::Literal,
        chained_to: SubPatternId,
        gap: ChainedPatternGap,
        flags: SubPatternFlags,
    ) -> SubPatternId {
        let pattern_lit_id = self.intern_literal(
            literal.0.as_bytes(),
            flags.contains(SubPatternFlags::Wide),
        );
        self.add_sub_pattern(
            pattern_id,
            SubPattern::LiteralChainTail {
                pattern: pattern_lit_id,
                chained_to,
                gap,
                flags,
            },
            extract_atoms(
                self.lit_pool.get_bytes(pattern_lit_id).unwrap(),
                flags,
            ),
            SubPatternAtom::from_atom,
        )
    }
2581}
2582
2583impl fmt::Debug for Compiler<'_> {
2584 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
2585 write!(f, "Compiler")
2586 }
2587}
2588
2589impl Default for Compiler<'_> {
2590 fn default() -> Self {
2591 Self::new()
2592 }
2593}
2594
2595/// ID associated to each identifier in the identifiers pool.
2596#[derive(Eq, PartialEq, Hash, Debug, Copy, Clone, Serialize, Deserialize)]
2597#[serde(transparent)]
2598pub(crate) struct IdentId(u32);
2599
2600impl From<u32> for IdentId {
2601 fn from(v: u32) -> Self {
2602 Self(v)
2603 }
2604}
2605
2606impl From<IdentId> for u32 {
2607 fn from(v: IdentId) -> Self {
2608 v.0
2609 }
2610}
2611
2612/// ID associated to each literal string in the literals pool.
2613#[derive(PartialEq, Debug, Copy, Clone, Serialize, Deserialize)]
2614#[serde(transparent)]
2615pub(crate) struct LiteralId(u32);
2616
2617impl From<i32> for LiteralId {
2618 fn from(v: i32) -> Self {
2619 Self(v as u32)
2620 }
2621}
2622
2623impl From<u32> for LiteralId {
2624 fn from(v: u32) -> Self {
2625 Self(v)
2626 }
2627}
2628
2629impl From<LiteralId> for u32 {
2630 fn from(v: LiteralId) -> Self {
2631 v.0
2632 }
2633}
2634
2635impl From<LiteralId> for i64 {
2636 fn from(v: LiteralId) -> Self {
2637 v.0 as i64
2638 }
2639}
2640
2641impl From<LiteralId> for u64 {
2642 fn from(v: LiteralId) -> Self {
2643 v.0 as u64
2644 }
2645}
2646
/// ID associated to each namespace.
///
/// Each namespace declared in the compiled rules gets its own unique ID.
// NOTE(review): the ID is an `i32`, presumably so it can be passed to WASM
// code directly — confirm against the emit code outside this chunk.
#[derive(Copy, Clone, Debug, Eq, PartialEq, Hash, Serialize, Deserialize)]
#[serde(transparent)]
pub(crate) struct NamespaceId(i32);
2651
/// ID associated to each rule.
#[derive(Copy, Clone, Debug, Default, Eq, PartialEq, Hash)]
pub(crate) struct RuleId(i32);

impl RuleId {
    /// Returns the [`RuleId`] that comes after this one.
    ///
    /// This simply adds 1 to the ID.
    #[allow(dead_code)]
    pub(crate) fn next(&self) -> Self {
        Self(self.0 + 1)
    }
}

impl From<i32> for RuleId {
    #[inline]
    fn from(value: i32) -> Self {
        RuleId(value)
    }
}

impl From<usize> for RuleId {
    /// Panics if `value` doesn't fit in an `i32`.
    #[inline]
    fn from(value: usize) -> Self {
        RuleId(i32::try_from(value).unwrap())
    }
}

impl From<RuleId> for usize {
    #[inline]
    fn from(id: RuleId) -> Self {
        id.0 as usize
    }
}

impl From<RuleId> for i32 {
    #[inline]
    fn from(id: RuleId) -> Self {
        id.0
    }
}
2693
/// ID associated to each regexp used in a rule condition.
#[derive(Copy, Clone, Debug, Eq, PartialEq, Hash)]
pub(crate) struct RegexpId(i32);

impl From<i32> for RegexpId {
    #[inline]
    fn from(value: i32) -> Self {
        RegexpId(value)
    }
}

impl From<u32> for RegexpId {
    /// Panics if `value` exceeds `i32::MAX`.
    #[inline]
    fn from(value: u32) -> Self {
        RegexpId(i32::try_from(value).unwrap())
    }
}

impl From<i64> for RegexpId {
    /// Panics if `value` is outside the `i32` range.
    #[inline]
    fn from(value: i64) -> Self {
        RegexpId(i32::try_from(value).unwrap())
    }
}

impl From<RegexpId> for usize {
    #[inline]
    fn from(id: RegexpId) -> Self {
        id.0 as usize
    }
}

impl From<RegexpId> for i32 {
    #[inline]
    fn from(id: RegexpId) -> Self {
        id.0
    }
}

impl From<RegexpId> for u32 {
    /// Panics if the ID is negative.
    #[inline]
    fn from(id: RegexpId) -> Self {
        u32::try_from(id.0).unwrap()
    }
}
2739
2740/// ID associated to each pattern.
2741///
2742/// For each unique pattern defined in a set of YARA rules there's a PatternId
2743/// that identifies it. If two different rules define exactly the same pattern
2744/// there's a single instance of the pattern and therefore a single PatternId
2745/// shared by both rules. For example, if one rule defines `$a = "mz"` and
2746/// another one `$mz = "mz"`, the pattern `"mz"` is shared by the two rules.
2747///
2748/// However, in order to be considered the same, the following conditions must
2749/// be met:
2750///
2751/// * Both patterns must have the same modifiers (i.e: `"mz" nocase` is not the
2752/// same pattern as `"mz"`),
2753/// * Both patterns must be either non-anchored, or anchored to the same offset.
2754/// * Both patterns must have the same file size bounds (or no bounds at all).
2755#[derive(
2756 Copy, Clone, Debug, Eq, Hash, PartialEq, PartialOrd, Serialize, Deserialize,
2757)]
2758#[serde(transparent)]
2759#[derive(Ord)]
2760pub(crate) struct PatternId(i32);
2761
2762impl PatternId {
2763 #[inline]
2764 fn incr(&mut self, amount: usize) {
2765 self.0 += amount as i32;
2766 }
2767}
2768
2769impl From<i32> for PatternId {
2770 #[inline]
2771 fn from(value: i32) -> Self {
2772 Self(value)
2773 }
2774}
2775
2776impl From<usize> for PatternId {
2777 #[inline]
2778 fn from(value: usize) -> Self {
2779 Self(value as i32)
2780 }
2781}
2782
2783impl From<PatternId> for i32 {
2784 #[inline]
2785 fn from(value: PatternId) -> Self {
2786 value.0
2787 }
2788}
2789
2790impl From<PatternId> for i64 {
2791 #[inline]
2792 fn from(value: PatternId) -> Self {
2793 value.0 as i64
2794 }
2795}
2796
2797impl From<PatternId> for usize {
2798 #[inline]
2799 fn from(value: PatternId) -> Self {
2800 value.0 as usize
2801 }
2802}
2803
/// ID associated to each sub-pattern.
///
/// For each pattern there's one or more sub-patterns, depending on the pattern
/// and its modifiers. For example the pattern `"foo" ascii wide` may have one
/// subpattern for the ascii case and another one for the wide case.
///
/// Values of this type are produced by `Compiler::add_sub_pattern` and used
/// as `chained_to` links between chained sub-patterns.
#[derive(Copy, Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
#[serde(transparent)]
pub(crate) struct SubPatternId(u32);
2812
2813/// Iterator that yields the names of the modules imported by the rules.
2814pub struct Imports<'a> {
2815 iter: std::slice::Iter<'a, IdentId>,
2816 ident_pool: &'a StringPool<IdentId>,
2817}
2818
2819impl<'a> Iterator for Imports<'a> {
2820 type Item = &'a str;
2821
2822 fn next(&mut self) -> Option<Self::Item> {
2823 self.iter.next().map(|id| self.ident_pool.get(*id).unwrap())
2824 }
2825}
2826
bitflags! {
    /// Flags associated to some kinds of [`SubPattern`].
    #[derive(Debug, Clone, Copy, Hash, Serialize, Deserialize, PartialEq, Eq)]
    pub struct SubPatternFlags: u16 {
        // The sub-pattern is the wide variant of the pattern (set for
        // patterns with the `wide` modifier).
        const Wide = 0x01;
        // The sub-pattern matches case-insensitively (set for patterns with
        // the `nocase` modifier).
        const Nocase = 0x02;
        // Indicates that the pattern is the last one in chain. Applies only
        // to chained sub-patterns.
        const LastInChain = 0x04;
        // Left-side `fullword` check. For chained sub-patterns only the
        // chain's head carries this flag.
        const FullwordLeft = 0x08;
        // Right-side `fullword` check. For chained sub-patterns only the
        // last piece in the chain carries this flag.
        const FullwordRight = 0x10;
        // Indicates that the pattern is a greedy regexp. Apply only to regexp
        // sub-patterns, or to any sub-pattern is part of chain that corresponds
        // to a greedy regexp.
        const GreedyRegexp = 0x20;
        // Indicates that the pattern is a fast regexp. A fast regexp is one
        // that can be matched by the FastVM.
        const FastRegexp = 0x40;
    }
}
2847
/// A sub-pattern in the compiled rules.
///
/// Each pattern in a rule has one or more associated sub-patterns. For
/// example, the pattern `$a = "foo" ascii wide` has a sub-pattern for the
/// ASCII variant of "foo", and another one for the wide variant.
///
/// Also, each [`Atom`] is associated to a [`SubPattern`]. When the atom is
/// found in the scanned data by the Aho-Corasick algorithm, the scanner
/// verifies that the sub-pattern actually matches.
#[derive(Serialize, Deserialize)]
pub(crate) enum SubPattern {
    /// A plain literal, optionally anchored at a fixed offset.
    Literal {
        pattern: LiteralId,
        anchored_at: Option<usize>,
        flags: SubPatternFlags,
    },

    /// A literal that is the first piece of a chained pattern.
    LiteralChainHead {
        pattern: LiteralId,
        flags: SubPatternFlags,
    },

    /// A literal chained to a previous sub-pattern, at a distance bounded
    /// by `gap`.
    LiteralChainTail {
        pattern: LiteralId,
        chained_to: SubPatternId,
        gap: ChainedPatternGap,
        flags: SubPatternFlags,
    },

    /// A regexp that couldn't be reduced to literals or split into a chain.
    Regexp {
        flags: SubPatternFlags,
    },

    /// A regexp that is the first piece of a chained pattern.
    RegexpChainHead {
        flags: SubPatternFlags,
    },

    /// A regexp chained to a previous sub-pattern, at a distance bounded
    /// by `gap`.
    RegexpChainTail {
        chained_to: SubPatternId,
        gap: ChainedPatternGap,
        flags: SubPatternFlags,
    },

    /// A literal with the `xor` modifier; atoms cover every key in the
    /// pattern's xor range.
    Xor {
        pattern: LiteralId,
        flags: SubPatternFlags,
    },

    /// A literal with the `base64` modifier and the default alphabet.
    Base64 {
        pattern: LiteralId,
        padding: u8,
    },

    /// A literal with the `base64wide` modifier and the default alphabet.
    Base64Wide {
        pattern: LiteralId,
        padding: u8,
    },

    /// A literal with the `base64` modifier and a custom alphabet.
    CustomBase64 {
        pattern: LiteralId,
        alphabet: LiteralId,
        padding: u8,
    },

    /// A literal with the `base64wide` modifier and a custom alphabet.
    CustomBase64Wide {
        pattern: LiteralId,
        alphabet: LiteralId,
        padding: u8,
    },
}
2918
2919impl SubPattern {
2920 /// If this sub-pattern is chained to another one, returns the
2921 /// [`SubPatternId`] associated to this other pattern.
2922 pub fn chained_to(&self) -> Option<SubPatternId> {
2923 match self {
2924 SubPattern::LiteralChainTail { chained_to, .. }
2925 | SubPattern::RegexpChainTail { chained_to, .. } => {
2926 Some(*chained_to)
2927 }
2928 _ => None,
2929 }
2930 }
2931}
2932
/// A snapshot that represents the state of the compiler at a particular moment.
// NOTE(review): presumably used to roll the compiler back when compiling a
// source fails — confirm against the code that creates/consumes `Snapshot`
// values (outside this chunk).
#[derive(Debug, PartialEq, Eq)]
struct Snapshot {
    // `PatternId` that would be assigned to the next pattern.
    next_pattern_id: PatternId,
    // Number of rules at the time of the snapshot.
    rules_len: usize,
    // Number of atoms at the time of the snapshot.
    atoms_len: usize,
    // Length of the compiled regexp code buffer.
    re_code_len: usize,
    // Number of sub-patterns at the time of the snapshot.
    sub_patterns_len: usize,
    // Number of symbols in the symbol table.
    symbol_table_len: usize,
}
2943
/// Represents a list of warnings.
///
/// This is a wrapper around a `Vec<Warning>` that contains additional logic
/// for limiting the number of warnings stored in the vector and silencing some
/// warnings types.
pub(crate) struct Warnings {
    // Warnings collected so far.
    warnings: Vec<Warning>,
    /// Maximum number of warnings that will be stored in `warnings`.
    max_warnings: usize,
    /// Warnings that are globally disabled.
    disabled_warnings: HashSet<String>,
    /// Warnings that are suppressed for a specific code span. Keys are
    /// warning identifiers, and values are the code spans in which the
    /// warning is disabled.
    suppressed_warnings: HashMap<String, Vec<Span>>,
}
2960
2961impl Default for Warnings {
2962 fn default() -> Self {
2963 Self {
2964 warnings: Vec::new(),
2965 max_warnings: 100,
2966 disabled_warnings: HashSet::default(),
2967 suppressed_warnings: HashMap::default(),
2968 }
2969 }
2970}
2971
impl Warnings {
    /// Adds the warning returned by `f` to the list.
    ///
    /// If the maximum number of warnings has been reached the warning is not
    /// added. The closure is only invoked when there is still room, so the
    /// cost of building a warning that would be discarded is avoided.
    #[inline]
    pub fn add(&mut self, f: impl FnOnce() -> Warning) {
        if self.warnings.len() < self.max_warnings {
            let warning = f();
            let mut warn = !self.disabled_warnings.contains(warning.code());

            // A warning is also dropped when any of its labels falls inside
            // a span for which the warning's code was suppressed.
            if warn
                && let Some(spans) =
                    self.suppressed_warnings.get(warning.code())
            {
                'l: for disabled_span in spans {
                    for label in warning.labels() {
                        if disabled_span.contains(label.span()) {
                            warn = false;
                            break 'l;
                        }
                    }
                }
            }

            if warn {
                self.warnings.push(warning);
            }
        }
    }

    /// Returns true if the given code is a valid warning code.
    pub fn is_valid_code(code: &str) -> bool {
        Warning::all_codes().contains(&code)
    }

    /// Enables or disables a specific warning identified by `code`.
    ///
    /// Returns `true` if the warning was previously enabled, or `false` if
    /// otherwise. Returns an error if the code doesn't correspond to any
    /// of the existing warnings.
    #[inline]
    pub fn switch_warning(
        &mut self,
        code: &str,
        enabled: bool,
    ) -> Result<bool, InvalidWarningCode> {
        if !Self::is_valid_code(code) {
            return Err(InvalidWarningCode::new(code.to_string()));
        }
        if enabled {
            // `remove` returns true if the code was disabled, so the
            // previous "enabled" state is its negation.
            Ok(!self.disabled_warnings.remove(code))
        } else {
            // `insert` returns true if the code was not already disabled,
            // i.e. the warning was previously enabled.
            Ok(self.disabled_warnings.insert(code.to_string()))
        }
    }

    /// Enable or disables all warnings.
    pub fn switch_all_warnings(&mut self, enabled: bool) {
        if enabled {
            self.disabled_warnings.clear();
        } else {
            for c in Warning::all_codes() {
                self.disabled_warnings.insert(c.to_string());
            }
        }
    }

    /// Clear suppressed warnings.
    pub fn clear_suppressed(&mut self) {
        self.suppressed_warnings.clear();
    }

    /// Suppress the warning with the given code, for the given span.
    pub fn suppress(&mut self, code: &str, span: Span) {
        self.suppressed_warnings
            .entry(code.to_string())
            .or_default()
            .push(span);
    }

    /// Returns the warnings collected so far.
    #[inline]
    pub fn as_slice(&self) -> &[Warning] {
        self.warnings.as_slice()
    }
}
3058
3059impl From<Warnings> for Vec<Warning> {
3060 fn from(value: Warnings) -> Self {
3061 value.warnings
3062 }
3063}