yara_x/compiler/
mod.rs

1/*! Compiles YARA source code into binary form.
2
3YARA rules must be compiled before they can be used for scanning data. This
4module implements the YARA compiler.
5*/
6
7use std::cell::RefCell;
8use std::collections::hash_map::Entry;
9use std::collections::{HashMap, HashSet};
10use std::io::Write;
11use std::path::{Path, PathBuf};
12use std::rc::Rc;
13#[cfg(feature = "logging")]
14use std::time::Instant;
15use std::{env, fmt, fs, io, iter};
16
17use bitflags::bitflags;
18use bstr::{BStr, ByteSlice};
19use itertools::{Itertools, MinMaxResult, izip};
20#[cfg(feature = "logging")]
21use log::*;
22use regex_syntax::hir;
23use rustc_hash::{FxHashMap, FxHashSet};
24use serde::{Deserialize, Serialize};
25use walrus::FunctionId;
26
27use yara_x_parser::ast;
28use yara_x_parser::ast::{AST, Ident, Import, Include, RuleFlags, WithSpan};
29use yara_x_parser::cst::CSTStream;
30use yara_x_parser::{Parser, Span};
31
32use crate::compiler::base64::base64_patterns;
33use crate::compiler::emit::{EmitContext, emit_rule_condition};
34use crate::compiler::errors::{
35    CompileError, ConflictingRuleIdentifier, CustomError, DuplicateRule,
36    DuplicateTag, EmitWasmError, InvalidRegexp, InvalidUTF8, UnknownModule,
37    UnusedPattern,
38};
39use crate::compiler::report::ReportBuilder;
40use crate::compiler::{CompileContext, VarStack};
41use crate::re::hir::{ChainedPattern, ChainedPatternGap};
42use crate::string_pool::{BStringPool, StringPool};
43use crate::symbols::{StackedSymbolTable, Symbol, SymbolLookup, SymbolTable};
44use crate::types::{Func, Struct, TypeValue};
45use crate::utils::cast;
46use crate::variables::{Variable, VariableError, is_valid_identifier};
47use crate::wasm::builder::WasmModuleBuilder;
48use crate::wasm::{WasmSymbols, wasm_exports};
49use crate::{re, wasm};
50
51pub(crate) use crate::compiler::atoms::*;
52pub(crate) use crate::compiler::context::*;
53pub(crate) use crate::compiler::ir::*;
54
55use crate::compiler::wsh::WarningSuppressionHook;
56use crate::errors::{
57    CircularIncludes, IncludeError, IncludeNotAllowed, IncludeNotFound,
58    InvalidWarningCode,
59};
60use crate::linters::LinterResult;
61use crate::models::PatternKind;
62
63#[doc(inline)]
64pub use crate::compiler::report::Patch;
65#[doc(inline)]
66pub use crate::compiler::rules::*;
67#[doc(inline)]
68pub use crate::compiler::warnings::*;
69
70mod atoms;
71mod context;
72mod emit;
73mod ir;
74mod report;
75mod rules;
76
77#[cfg(test)]
78mod tests;
79
80pub mod base64;
81pub mod errors;
82pub mod linters;
83pub mod warnings;
84pub mod wsh;
85
/// A structure that describes some YARA source code.
///
/// This structure contains a reference to the code itself, and an optional
/// `origin` that tells where the source code came from. The most common use
/// for `origin` is indicating the path of the file from where the source
/// code was obtained, but it can contain any arbitrary string. This string,
/// if provided, will appear in error messages. For example, in this error
/// message `origin` was set to `some_file.yar`:
///
/// ```text
/// error: syntax error
///  --> some_file.yar:4:17
///   |
/// 4 | ... more details
/// ```
///
/// # Example
///
/// ```
/// use yara_x::SourceCode;
/// let src = SourceCode::from("rule test { condition: true }").with_origin("some_file.yar");
/// ```
///
#[derive(Debug, Clone)]
pub struct SourceCode<'src> {
    /// A reference to the source code itself. This is a `BStr` because the
    /// source code could contain non-UTF8 content.
    pub(crate) raw: &'src BStr,
    /// A reference to the source code once it is known to be valid UTF-8.
    /// It is `None` while the content has not been validated yet.
    pub(crate) valid: Option<&'src str>,
    /// An optional string that tells the origin of the code. Usually a file
    /// path.
    pub(crate) origin: Option<String>,
}
121
122impl<'src> SourceCode<'src> {
123    /// Sets a string that describes the origin of the source code.
124    ///
125    /// This is usually the path of the file that contained the source code,
126    /// but it can be an arbitrary string. The origin appears in error and
127    /// warning messages.
128    pub fn with_origin<S: Into<String>>(self, origin: S) -> Self {
129        Self { raw: self.raw, valid: self.valid, origin: Some(origin.into()) }
130    }
131
132    /// Returns the source code as a `&str`.
133    ///
134    /// If the source code is not valid UTF-8 it will return an error.
135    fn as_str(&mut self) -> Result<&'src str, bstr::Utf8Error> {
136        match self.valid {
137            // We already know that source code is valid UTF-8, return it
138            // as is.
139            Some(s) => Ok(s),
140            // We don't know yet if the source code is valid UTF-8, some
141            // validation must be done. If validation fails an error is
142            // returned.
143            None => {
144                let src = self.raw.to_str()?;
145                self.valid = Some(src);
146                Ok(src)
147            }
148        }
149    }
150}
151
152impl<'src> From<&'src str> for SourceCode<'src> {
153    /// Creates a new [`SourceCode`] from a `&str`.
154    fn from(src: &'src str) -> Self {
155        // The input is a &str, therefore it's guaranteed to be valid UTF-8
156        // and the `valid` field can be initialized.
157        Self { raw: BStr::new(src), valid: Some(src), origin: None }
158    }
159}
160
161impl<'src> From<&'src [u8]> for SourceCode<'src> {
162    /// Creates a new [`SourceCode`] from a `&[u8]`.
163    ///
164    /// As `src` is not guaranteed to be a valid UTF-8 string, the parser will
165    /// verify it and return an error if invalid UTF-8 characters are found.
166    fn from(src: &'src [u8]) -> Self {
167        // The input is a &[u8], its content is not guaranteed to be valid
168        // UTF-8 so the `valid` field is set to `None`. The `validate_utf8`
169        // function will be called for validating the source code before
170        // being parsed.
171        Self { raw: BStr::new(src), valid: None, origin: None }
172    }
173}
174
175/// Compiles a YARA source code.
176///
177/// This function receives any type that implements the `Into<SourceCode>` trait,
178/// which includes `&str`, `String` and [`SourceCode`] and produces compiled
179/// [`Rules`] that can be passed later to the scanner.
180///
181/// # Example
182///
183/// ```rust
184/// # use yara_x;
185/// let rules = yara_x::compile("rule test { condition: true }").unwrap();
186/// let mut scanner = yara_x::Scanner::new(&rules);
187/// let results = scanner.scan("Lorem ipsum".as_bytes()).unwrap();
188/// assert_eq!(results.matching_rules().len(), 1);
189/// ```
190pub fn compile<'src, S>(src: S) -> Result<Rules, CompileError>
191where
192    S: Into<SourceCode<'src>>,
193{
194    let mut compiler = Compiler::new();
195    compiler.add_source(src)?;
196    Ok(compiler.build())
197}
198
/// Structure that contains information about a rule namespace.
///
/// Includes NamespaceId, the IdentId corresponding to the namespace's
/// identifier, and the symbol table that contains the symbols defined
/// in the namespace.
struct Namespace {
    /// ID of the namespace. The default namespace is `NamespaceId(0)` and
    /// every namespace created afterwards gets the previous ID plus one.
    id: NamespaceId,
    /// ID of the namespace's name within the compiler's identifier pool.
    ident_id: IdentId,
    /// Symbol table with the symbols defined in the namespace. This is the
    /// top-most layer of the compiler's stacked symbol table.
    symbols: Rc<RefCell<SymbolTable>>,
}
209
/// Compiles YARA source code producing a set of compiled [`Rules`].
///
/// The two most important methods in this type are [`Compiler::add_source`]
/// and [`Compiler::build`]. The former tells the compiler which YARA source
/// code must be compiled, and can be called multiple times with different
/// sets of rules. The latter consumes the compiler and produces a set of
/// compiled [`Rules`].
///
/// # Example
///
/// ```rust
/// # use yara_x;
/// let mut compiler = yara_x::Compiler::new();
///
/// compiler
///     .add_source(r#"
///         rule always_true {
///             condition: true
///         }"#)?
///     .add_source(r#"
///         rule always_false {
///             condition: false
///         }"#)?;
///
/// let rules = compiler.build();
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
pub struct Compiler<'a> {
    /// Mimics YARA behavior with respect to regular expressions, allowing
    /// some constructs that are invalid in YARA-X by default, like invalid
    /// escape sequences.
    relaxed_re_syntax: bool,

    /// If true, the compiler hoists loop-invariant expressions (i.e: those
    /// that don't vary on each iteration of the loop), moving them outside
    /// the loop.
    hoisting: bool,

    /// List of directories where the compiler should look for included files.
    /// If `None`, the current directory is used.
    include_dirs: Option<Vec<PathBuf>>,

    /// If true, slow patterns produce an error instead of a warning. A slow
    /// pattern is one with atoms shorter than 2 bytes.
    error_on_slow_pattern: bool,

    /// If true, a slow loop produces an error instead of a warning. A slow
    /// loop is one where the upper bound of the loop is potentially large.
    /// Like for example: `for all x in (0..filesize) : (...)`
    error_on_slow_loop: bool,

    /// If true, include statements are allowed. If false, include statements
    /// will produce a compile error.
    includes_enabled: bool,

    /// Tracks the paths of the files that have been included by nested
    /// includes. This is useful for detecting circular includes and resolving
    /// relative includes.
    include_stack: Vec<PathBuf>,

    /// Used for generating error and warning reports.
    report_builder: ReportBuilder,

    /// The main symbol table used by the compiler. This is actually a stack of
    /// symbol tables where the bottom-most table is the one that contains
    /// global identifiers like built-in functions and user-defined global
    /// identifiers.
    symbol_table: StackedSymbolTable,

    /// Symbol table that contains the global identifiers, including built-in
    /// functions like `uint8`, `uint16`, etc. This symbol table is at the
    /// bottom of the `symbol_table`'s stack. This field is used when we
    /// need to access the global symbol table directly, for example for
    /// defining new global variables.
    global_symbols: Rc<RefCell<SymbolTable>>,

    /// Information about the current namespace (i.e: the namespace that will
    /// contain any new rules added via a call to `add_source`).
    current_namespace: Namespace,

    /// Pool that contains all the identifiers used in the rules. Each
    /// identifier appears only once, even if they are used by multiple
    /// rules. For example, the pool contains a single copy of the common
    /// identifier `$a`. Each identifier has a unique 32-bits [`IdentId`]
    /// that can be used for retrieving the identifier from the pool.
    ident_pool: StringPool<IdentId>,

    /// Similar to `ident_pool` but for regular expressions found in rule
    /// conditions.
    regex_pool: StringPool<RegexId>,

    /// Similar to `ident_pool` but for string literals found in the source
    /// code. As literal strings in YARA can contain arbitrary bytes, a pool
    /// capable of storing [`bstr::BString`] must be used, the [`String`] type
    /// only accepts valid UTF-8. This pool also stores the atoms extracted
    /// from patterns.
    lit_pool: BStringPool<LiteralId>,

    /// Intermediate representation (IR) tree for condition of the rule that
    /// is currently being compiled. After compiling each rule the tree is
    /// cleared, but it will be reused for the next rule.
    ir: IR,

    /// Builder for creating the WebAssembly module that contains the code
    /// for all rule conditions.
    wasm_mod: WasmModuleBuilder,

    /// Struct that contains the IDs for WASM memories, global and local
    /// variables, etc.
    wasm_symbols: WasmSymbols,

    /// Map that contains the functions that are callable from WASM code. These
    /// are the same functions in [`static@WASM_EXPORTS`]. This map allows to
    /// retrieve the WASM [`FunctionId`] from the fully qualified mangled
    /// function name (e.g: `my_module.my_struct.my_func@ii@i`)
    wasm_exports: FxHashMap<String, FunctionId>,

    /// Map that associates a `PatternId` to a certain filesize bound.
    ///
    /// A condition like `filesize < 1000 and $a` only matches if `filesize`
    /// is less than 1000. Therefore, the pattern `$a` does not need to be
    /// checked for files of size 1000 bytes or larger.
    ///
    /// In this case, the map will contain an entry associating `$a` to a
    /// `FilesizeBounds` value like:
    /// `FilesizeBounds{start: Bound::Unbounded, end: Bound::Excluded(1000)}`.
    filesize_bounds: FxHashMap<PatternId, FilesizeBounds>,

    /// A vector with all the rules that have been compiled. A [`RuleId`] is
    /// an index in this vector.
    rules: Vec<RuleInfo>,

    /// Next (not used yet) [`PatternId`].
    next_pattern_id: PatternId,

    /// Vector where the N-th boolean indicates whether the pattern with
    /// PatternId = N is a fast-scan pattern.
    fast_scan_patterns: bitvec::vec::BitVec,

    /// Map used for de-duplicating patterns. Keys are the pattern's IR and
    /// values are the `PatternId` assigned to each pattern. Every time a rule
    /// declares a pattern, this map is used for determining if the same
    /// pattern (i.e: a pattern with exactly the same IR) was already declared
    /// by some other rule. If that's the case, that same pattern is re-used.
    patterns: FxHashMap<Pattern, PatternId>,

    /// A vector with all the sub-patterns from all the rules. A
    /// [`SubPatternId`] is an index in this vector.
    sub_patterns: Vec<(PatternId, SubPattern)>,

    /// Vector that contains the [`SubPatternId`] for sub-patterns that can
    /// match only at a fixed offset within the scanned data. These sub-patterns
    /// are not added to the Aho-Corasick automaton.
    anchored_sub_patterns: Vec<SubPatternId>,

    /// A vector that contains all the atoms generated from the patterns.
    /// Each atom has an associated [`SubPatternId`] that indicates the
    /// sub-pattern it belongs to.
    atoms: Vec<SubPatternAtom>,

    /// A vector that contains the code for all regexp patterns (this includes
    /// hex patterns which are just a special case of regexp). The code for
    /// each regexp is appended to the vector during the compilation process,
    /// and the atoms extracted from the regexp contain offsets within this
    /// vector. This vector contains both forward and backward code.
    re_code: Vec<u8>,

    /// Vector with the names of all the imported modules. The vector contains
    /// the [`IdentId`] corresponding to the module's identifier.
    imported_modules: Vec<IdentId>,

    /// Names of modules that are known, but not supported. When an `import`
    /// statement with one of these modules is found, the statement is accepted
    /// without causing an error, but a warning is raised to let the user know
    /// that the module is not supported. Any rule that depends on an unsupported
    /// module is ignored.
    ignored_modules: FxHashSet<String>,

    /// Keys in this map are the modules that are banned, and values are a pair
    /// of strings with the title and message for the error that will be shown
    /// if the banned module is imported.
    banned_modules: FxHashMap<String, (String, String)>,

    /// Keys in this map are the names of rules that will be ignored because they
    /// depend on unsupported modules, either directly or indirectly. Values are
    /// the names of the unsupported modules they depend on.
    ignored_rules: FxHashMap<String, String>,

    /// Structure where each field corresponds to a global identifier or a module
    /// imported by the rules. For fields corresponding to modules, the value is
    /// the structure that describes the module.
    root_struct: Struct,

    /// Warnings generated while compiling the rules.
    warnings: Warnings,

    /// Errors generated while compiling the rules.
    errors: Vec<CompileError>,

    /// Features enabled for this compiler. See [`Compiler::enable_feature`]
    /// for details.
    features: FxHashSet<String>,

    /// Optional writer where the compiler writes the IR produced by each rule.
    /// This is used for test cases and debugging.
    ir_writer: Option<Box<dyn Write>>,

    /// Linters applied to each rule during compilation. The linters are added
    /// to the compiler using [`Compiler::add_linter`].
    linters: Vec<Box<dyn linters::Linter + 'a>>,

    /// Grouped RegexSets constructed during IR creation for or-expressions.
    pub(crate) regex_sets: FxHashMap<RegexSetId, Vec<RegexId>>,
}
426
427impl<'a> Compiler<'a> {
428    /// Creates a new YARA compiler.
429    pub fn new() -> Self {
430        let mut ident_pool = StringPool::new();
431        let mut symbol_table = StackedSymbolTable::new();
432
433        let global_symbols = symbol_table.push_new();
434
435        // Add symbols for built-in functions like uint8, uint16, etc.
436        for export in wasm_exports()
437            // Get only the public exports not belonging to a YARA module.
438            .filter(|e| e.public && e.builtin())
439        {
440            let func = Rc::new(Func::from(export.mangled_name));
441            let symbol = Symbol::Func(func);
442
443            global_symbols.borrow_mut().insert(export.name, symbol);
444        }
445
446        // Create the default namespace. Rule identifiers will be added to this
447        // namespace, unless the user defines some namespace explicitly by calling
448        // `Compiler::new_namespace`.
449        let default_namespace = Namespace {
450            id: NamespaceId(0),
451            ident_id: ident_pool.get_or_intern("default"),
452            symbols: symbol_table.push_new(),
453        };
454
455        // At this point the symbol table (which is a stacked symbol table) has
456        // two layers, the global symbols at the bottom, and the default
457        // namespace on top of it. Calls to `Compiler::new_namespace` replace
458        // the top layer (default namespace) with a new one, but the bottom
459        // layer remains, so the global symbols are shared by all namespaces.
460
461        // Create a WASM module builder. This object is used for building the
462        // WASM module that will execute the rule conditions.
463        let mut wasm_mod = WasmModuleBuilder::new();
464
465        wasm_mod.namespaces_per_func(20);
466        wasm_mod.rules_per_func(10);
467
468        let wasm_symbols = wasm_mod.wasm_symbols();
469        let wasm_exports = wasm_mod.wasm_exports();
470
471        let mut ir = IR::new();
472
473        if cfg!(feature = "constant-folding") {
474            ir.constant_folding(true);
475        }
476
477        Self {
478            ir,
479            ident_pool,
480            global_symbols,
481            symbol_table,
482            wasm_mod,
483            wasm_symbols,
484            wasm_exports,
485            relaxed_re_syntax: false,
486            hoisting: false,
487            error_on_slow_pattern: false,
488            error_on_slow_loop: false,
489            next_pattern_id: PatternId(0),
490            fast_scan_patterns: bitvec::vec::BitVec::new(),
491            current_namespace: default_namespace,
492            features: FxHashSet::default(),
493            warnings: Warnings::default(),
494            errors: Vec::new(),
495            rules: Vec::new(),
496            sub_patterns: Vec::new(),
497            anchored_sub_patterns: Vec::new(),
498            atoms: Vec::new(),
499            re_code: Vec::new(),
500            imported_modules: Vec::new(),
501            ignored_modules: FxHashSet::default(),
502            banned_modules: FxHashMap::default(),
503            ignored_rules: FxHashMap::default(),
504            filesize_bounds: FxHashMap::default(),
505            root_struct: Struct::new().make_root(),
506            report_builder: ReportBuilder::new(),
507            lit_pool: BStringPool::new(),
508            regex_pool: StringPool::new(),
509            patterns: FxHashMap::default(),
510            ir_writer: None,
511            linters: Vec::new(),
512            include_dirs: None,
513            includes_enabled: true,
514            include_stack: Vec::new(),
515            regex_sets: FxHashMap::default(),
516        }
517    }
518
519    /// Adds a directory to the list of directories where the compiler should
520    /// look for included files.
521    ///
522    /// When an `include` statement is found, the compiler looks for the included
523    /// file in the directories added with this function, in the order they were
524    /// added.
525    ///
526    /// If this function is not called, the compiler will only look for included
527    /// files in the current directory.
528    ///
529    /// Use [Compiler::enable_includes] for controlling whether include statements
530    /// are allowed or not.
531    ///
532    /// # Example
533    ///
534    /// ```no_run
535    /// # use yara_x::Compiler;
536    /// # use std::path::Path;
537    /// let mut compiler = Compiler::new();
538    /// compiler.add_include_dir("/path/to/rules")
539    ///         .add_include_dir("/another/path");
540    /// ```
541    pub fn add_include_dir<P: AsRef<std::path::Path>>(
542        &mut self,
543        dir: P,
544    ) -> &mut Self {
545        self.include_dirs
546            .get_or_insert_default()
547            .push(dir.as_ref().to_path_buf());
548        self
549    }
550
551    /// Adds some YARA source code to be compiled.
552    ///
553    /// The `src` parameter accepts any type that implements [`Into<SourceCode>`],
554    /// such as `&str`, `&[u8]`, or an instance of [`SourceCode`] itself. The source
555    /// code may include one or more YARA rules.
556    ///
557    /// You can call this function multiple times to add different sets of rules.
558    /// If the provided source code contains syntax or semantic errors that prevent
559    /// compilation, the function returns the first encountered error. All errors
560    /// found during compilation are also recorded and can be retrieved using
561    /// [`Compiler::errors`].
562    ///
563    /// Even if previous calls to this function resulted in compilation errors,
564    /// you may continue adding additional rules. Only successfully compiled rules
565    /// will be included in the final rule set.
566    pub fn add_source<'src, S>(
567        &mut self,
568        src: S,
569    ) -> Result<&mut Self, CompileError>
570    where
571        S: Into<SourceCode<'src>>,
572    {
573        // Convert `src` into an instance of `SourceCode` if it is something
574        // else, like a &str.
575        let mut src = src.into();
576
577        // Register source code, even before validating that it is UTF-8. In
578        // case of UTF-8 encoding errors we want to report that error too,
579        // and we need the source code registered for creating the report.
580        self.report_builder.register_source(&src);
581
582        // Make sure that the source code is valid UTF-8, or return an error
583        // if otherwise.
584        let ast = match src.as_str() {
585            Ok(src) => {
586                // Parse the source code and build the Abstract Syntax Tree.
587                let cst = Parser::new(src.as_bytes());
588                let cst =
589                    WarningSuppressionHook::from(cst).hook(|warning, span| {
590                        self.warnings.suppress(warning, span);
591                    });
592
593                AST::from(CSTStream::new(src.as_bytes(), cst))
594            }
595            Err(err) => {
596                let span_start = err.valid_up_to();
597                let span_end = if let Some(error_len) = err.error_len() {
598                    // `error_len` is the number of invalid UTF-8 bytes found
599                    // after `span_start`. Round the number up to the next 3
600                    // bytes boundary because invalid bytes are replaced with
601                    // the Unicode replacement characters that takes 3 bytes.
602                    // This way the span ends at a valid UTF-8 character
603                    // boundary.
604                    span_start + error_len.next_multiple_of(3)
605                } else {
606                    span_start
607                };
608
609                let err = InvalidUTF8::build(
610                    &self.report_builder,
611                    self.report_builder.span_to_code_loc(Span(
612                        span_start as u32..span_end as u32,
613                    )),
614                );
615
616                self.errors.push(err.clone());
617                return Err(err);
618            }
619        };
620
621        // Store the current length of the `errors` vector, so that we can
622        // know if more errors were added.
623        let existing_errors = self.errors.len();
624
625        self.c_items(ast.items());
626
627        self.warnings.clear_suppressed();
628
629        self.errors.extend(
630            ast.into_errors()
631                .into_iter()
632                .map(|err| CompileError::from(&self.report_builder, err)),
633        );
634
635        // More errors were added? Return the first error that was added.
636        if self.errors.len() > existing_errors {
637            return Err(self.errors[existing_errors].clone());
638        }
639
640        Ok(self)
641    }
642
643    /// Defines a global variable and sets its initial value.
644    ///
645    /// Global variables must be defined before adding any YARA source code
646    /// that references them via [`Compiler::add_source`]. Once defined, the
647    /// variable's initial value is preserved in the compiled [`Rules`] and
648    /// will be used unless overridden.
649    ///
650    /// When scanning, each scanner instance can modify the initial value of
651    /// the variable using [`crate::Scanner::set_global`].
652    ///
653    /// `T` can be any type that implements [`TryInto<Variable>`], including:
654    /// `i64`, `i32`, `i16`, `i8`, `u32`, `u16`, `u8`, `f64`, `f32`, `bool`,
655    /// `&str`, `String` and [`serde_json::Value`].
656    ///
657    /// When using a [`serde_json::Value`] there are certain limitations: keys
658    /// in maps must be valid YARA identifiers (the first character must be `_`
659    /// or a letter, the remaining ones must be `_`, a letter or a digit),
660    /// because these maps are translated into YARA structures. Also, all items
661    /// in an array must have the same type.
662    ///
663    /// ```
664    /// # use yara_x::Compiler;
665    /// assert!(Compiler::new()
666    ///     .define_global("some_int", 1)?
667    ///     .add_source("rule some_int_not_zero {condition: some_int != 0}")
668    ///     .is_ok());
669    ///
670    /// # Ok::<(), Box<dyn std::error::Error>>(())
671    /// ```
672    pub fn define_global<T: TryInto<Variable>>(
673        &mut self,
674        ident: &str,
675        value: T,
676    ) -> Result<&mut Self, VariableError>
677    where
678        VariableError: From<<T as TryInto<Variable>>::Error>,
679    {
680        if !is_valid_identifier(ident) {
681            return Err(VariableError::InvalidIdentifier(ident.to_string()));
682        }
683
684        let var: Variable = value.try_into()?;
685        let type_value: TypeValue = var.into();
686
687        if self.root_struct.add_field(ident, type_value).is_some() {
688            return Err(VariableError::AlreadyExists(ident.to_string()));
689        }
690
691        self.global_symbols
692            .borrow_mut()
693            .insert(ident, self.root_struct.lookup(ident).unwrap());
694
695        Ok(self)
696    }
697
698    /// Creates a new namespace.
699    ///
700    /// Further calls to [`Compiler::add_source`] will put the rules under the
701    /// newly created namespace. If the new namespace is named as the current
702    /// one, no new namespace is created.
703    ///
704    /// In the example below both rules `foo` and `bar` are put into the same
705    /// namespace (the default namespace), therefore `bar` can use `foo` as
706    /// part of its condition, and everything is ok.
707    ///
708    /// ```
709    /// # use yara_x::Compiler;
710    /// assert!(Compiler::new()
711    ///     .add_source("rule foo {condition: true}")?
712    ///     .add_source("rule bar {condition: foo}")
713    ///     .is_ok());
714    ///
715    /// # Ok::<(), Box<dyn std::error::Error>>(())
716    /// ```
717    ///
718    /// In this other example the rule `foo` is put in the default namespace,
719    /// but the rule `bar` is put under the `bar` namespace. This implies that
720    /// `foo` is not visible to `bar`, and the second call to `add_source`
721    /// fails.
722    ///
723    /// ```
724    /// # use yara_x::Compiler;
725    /// assert!(Compiler::new()
726    ///     .add_source("rule foo {condition: true}")?
727    ///     .new_namespace("bar")
728    ///     .add_source("rule bar {condition: foo}")
729    ///     .is_err());
730    ///
731    /// # Ok::<(), Box<dyn std::error::Error>>(())
732    /// ```
733    pub fn new_namespace(&mut self, namespace: &str) -> &mut Self {
734        let current_namespace = self
735            .ident_pool
736            .get(self.current_namespace.ident_id)
737            .expect("expecting a namespace");
738        // If the current namespace is already named as the new namespace
739        // this function has no effect.
740        if namespace == current_namespace {
741            return self;
742        }
743        // Remove the symbol table corresponding to the current namespace.
744        self.symbol_table.pop().expect("expecting a namespace");
745        // Create a new namespace. The NamespaceId is simply the ID of the
746        // previous namespace + 1.
747        self.current_namespace = Namespace {
748            id: NamespaceId(self.current_namespace.id.0 + 1),
749            ident_id: self.ident_pool.get_or_intern(namespace),
750            symbols: self.symbol_table.push_new(),
751        };
752        self.ignored_rules.clear();
753        self.wasm_mod.new_namespace();
754        self
755    }
756
757    /// Builds the source code previously added to the compiler.
758    ///
759    /// This function consumes the compiler and returns an instance of
760    /// [`Rules`].
761    pub fn build(self) -> Rules {
762        // Finish building the WASM module.
763        let wasm_mod = self.wasm_mod.build().emit_wasm();
764
765        #[cfg(feature = "logging")]
766        let start = Instant::now();
767
768        // Compile the WASM module for the current platform. This panics
769        // if the WASM code is invalid, which should not happen as the code is
770        // emitted by YARA itself. If this ever happens is probably because
771        // wrong WASM code is being emitted.
772        let compiled_wasm_mod = wasm::runtime::Module::from_binary(
773            wasm::get_engine(),
774            wasm_mod.as_slice(),
775        )
776        .expect("WASM module is not valid");
777
778        #[cfg(feature = "logging")]
779        info!("WASM module build time: {:?}", Instant::elapsed(&start));
780
781        // The structure that contains the global variables is serialized before
782        // being passed to the `Rules` struct. This is because we want `Rules`
783        // to be `Send`, so that it can be shared with scanners running in
784        // different threads. In order for `Rules` to be `Send`, it can't
785        // contain fields that are not `Send`. As `Struct` is not `Send` we
786        // can't have a `Struct` field in `Rules`, so what we have a `Vec<u8>`
787        // with a serialized version of the struct.
788        //
789        // An alternative is changing the `Rc` in some variants of `TypeValue`
790        // to `Arc`, as the root cause that prevents `Struct` from being `Send`
791        // is the use of `Rc` in `TypeValue`.
792        let serialized_globals = bincode::serde::encode_to_vec(
793            &self.root_struct,
794            bincode::config::standard().with_variable_int_encoding(),
795        )
796        .expect("failed to serialize global variables");
797
798        let mut rules = Rules {
799            serialized_globals,
800            wasm_mod,
801            compiled_wasm_mod: Some(compiled_wasm_mod),
802            relaxed_re_syntax: self.relaxed_re_syntax,
803            ac: None,
804            num_patterns: self.next_pattern_id.0 as usize,
805            ident_pool: self.ident_pool,
806            regex_pool: self.regex_pool,
807            lit_pool: self.lit_pool,
808            imported_modules: self.imported_modules,
809            rules: self.rules,
810            sub_patterns: self.sub_patterns,
811            anchored_sub_patterns: self.anchored_sub_patterns,
812            atoms: self.atoms,
813            re_code: self.re_code,
814            warnings: self.warnings.into(),
815            filesize_bounds: self.filesize_bounds,
816            regex_sets: self.regex_sets,
817            fast_scan_patterns: self.fast_scan_patterns,
818        };
819
820        rules.build_ac_automaton();
821        rules
822    }
823
824    /// Adds a linter to the compiler.
825    ///
826    /// Linters perform additional checks to each YARA rule, generating
827    /// warnings when a rule does not meet the linter's requirements. See
828    /// [`crate::linters`] for a list of available linters.
829    pub fn add_linter<L: linters::Linter + 'a>(
830        &mut self,
831        linter: L,
832    ) -> &mut Self {
833        self.linters.push(Box::new(linter));
834        self
835    }
836
837    /// Enables a feature on this compiler.
838    ///
839    /// When defining the structure of a module in a `.proto` file, you can
840    /// specify that certain fields are accessible only when one or more
841    /// features are enabled. For example, the snippet below shows the
842    /// definition of a field named `requires_foo_and_bar`, which can be
843    /// accessed only when both features "foo" and "bar" are enabled.
844    ///
845    /// ```protobuf
846    /// optional uint64 requires_foo_and_bar = 500 [
847    ///   (yara.field_options) = {
848    ///     acl: [
849    ///       {
850    ///         allow_if: "foo",
851    ///         error_title: "foo is required",
852    ///         error_label: "this field was used without foo"
853    ///       },
854    ///       {
855    ///         allow_if: "bar",
856    ///         error_title: "bar is required",
857    ///         error_label: "this field was used without bar"
858    ///       }
859    ///     ]
860    ///   }
861    /// ];
862    /// ```
863    ///
864    /// If some of the required features are not enabled, using this field in
865    /// a YARA rule will cause an error while compiling the rules. The error
866    /// looks like:
867    ///
868    /// ```text
869    /// error[E034]: foo is required
870    ///  --> line:5:29
871    ///   |
872    /// 5 |  test_proto2.requires_foo_and_bar == 0
873    ///   |              ^^^^^^^^^^^^^^^^^^^^ this field was used without foo
874    ///   |
875    /// ```
876    ///
877    /// Notice that both the title and label in the error message are defined
878    /// in the .proto file.
879    ///
880    /// # Important
881    ///
882    /// This API is hidden from the public documentation because it is unstable
883    /// and subject to change.
884    #[doc(hidden)]
885    pub fn enable_feature<F: Into<String>>(
886        &mut self,
887        feature: F,
888    ) -> &mut Self {
889        self.features.insert(feature.into());
890        self
891    }
892
893    /// Tell the compiler that a YARA module is not supported.
894    ///
895    /// Import statements for ignored modules will be ignored without errors,
896    /// but a warning will be issued. Any rule that makes use of an ignored
897    /// module will be also ignored, while the rest of the rules that don't
898    /// rely on that module will be correctly compiled.
899    pub fn ignore_module<M: Into<String>>(&mut self, module: M) -> &mut Self {
900        self.ignored_modules.insert(module.into());
901        self
902    }
903
904    /// Tell the compiler that a YARA module can't be used.
905    ///
906    /// Import statements for the banned module will cause an error. The error
907    /// message can be customized by using the given error title and message.
908    ///
909    /// If this function is called multiple times with the same module name,
910    /// the error title and message will be updated.
911    pub fn ban_module<M: Into<String>, T: Into<String>, E: Into<String>>(
912        &mut self,
913        module: M,
914        error_title: T,
915        error_message: E,
916    ) -> &mut Self {
917        self.banned_modules
918            .insert(module.into(), (error_title.into(), error_message.into()));
919        self
920    }
921
922    /// Specifies whether the compiler should produce colorful error messages.
923    ///
924    /// Colorized error messages contain ANSI escape sequences that make them
925    /// look nicer on compatible consoles.
926    ///
927    /// The default setting is `false`.
928    pub fn colorize_errors(&mut self, yes: bool) -> &mut Self {
929        self.report_builder.with_colors(yes);
930        self
931    }
932
933    /// Sets the maximum number of columns in error messages.
934    ///
935    /// The default value is 140.
936    pub fn errors_max_width(&mut self, width: usize) -> &mut Self {
937        self.report_builder.max_width(width);
938        self
939    }
940
941    /// Enables or disables a specific type of warning.
942    ///
943    /// Each warning type has a description code (i.e: `slow_pattern`,
944    /// `unsupported_module`, etc.). This function allows to enable or disable
945    /// a specific type of warning identified by the given code.
946    ///
947    /// Returns an error if the given warning code doesn't exist.
948    pub fn switch_warning(
949        &mut self,
950        code: &str,
951        enabled: bool,
952    ) -> Result<&mut Self, InvalidWarningCode> {
953        self.warnings.switch_warning(code, enabled)?;
954        Ok(self)
955    }
956
957    /// Enables or disables all warnings.
958    pub fn switch_all_warnings(&mut self, enabled: bool) -> &mut Self {
959        self.warnings.switch_all_warnings(enabled);
960        self
961    }
962
963    /// Sets the maximum number of warnings.
964    ///
965    /// The compiler will report only the first `n` warnings.
966    pub fn max_warnings(&mut self, n: usize) -> &mut Self {
967        self.warnings.max_warnings = Some(n);
968        self
969    }
970
971    /// Enables a more relaxed syntax check for regular expressions.
972    ///
973    /// YARA-X enforces stricter regular expression syntax compared to YARA.
974    /// For instance, YARA accepts invalid escape sequences and treats them
975    /// as literal characters (e.g., \R is interpreted as a literal 'R'). It
976    /// also allows some special characters to appear unescaped, inferring
977    /// their meaning from the context (e.g., `{` and `}` in `/foo{}bar/` are
978    /// literal, but in `/foo{0,1}bar/` they form the repetition operator
979    /// `{0,1}`).
980    ///
981    /// This setting controls whether the compiler should mimic YARA's behavior,
982    /// allowing constructs that YARA-X doesn't accept by default.
983    ///
984    /// This should be called before any rule is added to the compiler.
985    ///
986    /// # Panics
987    ///
988    /// If called after adding rules to the compiler.
989    pub fn relaxed_re_syntax(&mut self, yes: bool) -> &mut Self {
990        if !self.rules.is_empty() {
991            panic!("calling relaxed_re_syntax in non-empty compiler")
992        }
993        self.relaxed_re_syntax = yes;
994        self
995    }
996
997    /// When enabled, slow patterns produce an error instead of a warning.
998    ///
999    /// This is disabled by default.
1000    pub fn error_on_slow_pattern(&mut self, yes: bool) -> &mut Self {
1001        self.error_on_slow_pattern = yes;
1002        self
1003    }
1004
1005    /// When enabled, potentially slow loops produce an error instead of a
1006    /// warning.
1007    ///
1008    /// This is disabled by default.
1009    pub fn error_on_slow_loop(&mut self, yes: bool) -> &mut Self {
1010        self.error_on_slow_loop = yes;
1011        self
1012    }
1013
1014    /// Controls whether `include` statements are allowed.
1015    ///
1016    /// By default, the compiler allows the use of `include` statements, which
1017    /// include the content of other files. When includes are disabled, any
1018    /// attempt to use an `include` statement will result in a compile error.
1019    ///
1020    /// ```
1021    /// # use yara_x::Compiler;
1022    /// let mut compiler = Compiler::new();
1023    /// compiler.enable_includes(false);  // Disable includes
1024    /// ```
1025    pub fn enable_includes(&mut self, yes: bool) -> &mut Self {
1026        self.includes_enabled = yes;
1027        self
1028    }
1029
1030    /// When enabled, the compiler tries to optimize rule conditions.
1031    ///
1032    /// The optimizations usually reduce condition evaluation times, specially
1033    /// in complex rules that contain loops, but it can break short-circuit
1034    /// evaluation rules because some subexpressions are not executed in the
1035    /// order they appear in the source code.
1036    ///
1037    /// This is a very experimental feature.
1038    #[doc(hidden)]
1039    pub fn condition_optimization(&mut self, yes: bool) -> &mut Self {
1040        self.hoisting(yes)
1041    }
1042
1043    pub(crate) fn hoisting(&mut self, yes: bool) -> &mut Self {
1044        self.hoisting = yes;
1045        self
1046    }
1047
1048    /// Retrieves all errors generated by the compiler.
1049    ///
1050    /// This method returns every error encountered during the compilation,
1051    /// across all invocations of [`Compiler::add_source`].
1052    #[inline]
1053    pub fn errors(&self) -> &[CompileError] {
1054        self.errors.as_slice()
1055    }
1056
1057    /// Returns the warnings emitted by the compiler.
1058    ///
1059    /// This method returns every warning issued during the compilation,
1060    /// across all invocations of [`Compiler::add_source`].
1061    #[inline]
1062    pub fn warnings(&self) -> &[Warning] {
1063        self.warnings.as_slice()
1064    }
1065
1066    /// Emits a `.wasm` file with the WASM module generated by the compiler.
1067    ///
1068    /// This file can be inspected and converted to WASM text format by using
1069    /// third-party [tooling](https://github.com/WebAssembly/wabt). This is
1070    /// useful for debugging issues with incorrectly emitted WASM code.
1071    pub fn emit_wasm_file<P>(self, path: P) -> Result<(), EmitWasmError>
1072    where
1073        P: AsRef<Path>,
1074    {
1075        let mut wasm_mod = self.wasm_mod.build();
1076        Ok(wasm_mod.emit_wasm_file(path)?)
1077    }
1078
1079    /// Sets a writer where the compiler will write the Intermediate
1080    /// Representation (IR) of compiled conditions.
1081    ///
1082    /// This is used for testing and debugging purposes.
1083    #[doc(hidden)]
1084    pub fn set_ir_writer<W: Write + 'static>(&mut self, w: W) -> &mut Self {
1085        self.ir_writer = Some(Box::new(w));
1086        self
1087    }
1088}
1089
1090impl Compiler<'_> {
1091    fn add_sub_pattern<I, F, A>(
1092        &mut self,
1093        pattern_id: PatternId,
1094        sub_pattern: SubPattern,
1095        atoms: I,
1096        f: F,
1097    ) -> SubPatternId
1098    where
1099        I: Iterator<Item = A>,
1100        F: Fn(SubPatternId, A) -> SubPatternAtom,
1101    {
1102        let sub_pattern_id = SubPatternId(self.sub_patterns.len() as u32);
1103
1104        // Sub-patterns that are anchored at some fixed offset are not added to
1105        // the Aho-Corasick automata. Instead, their IDs are added to the
1106        // anchored_sub_patterns list.
1107        if let SubPattern::Literal { anchored_at: Some(_), .. } = sub_pattern {
1108            self.anchored_sub_patterns.push(sub_pattern_id);
1109        } else {
1110            self.atoms.extend(atoms.map(|atom| f(sub_pattern_id, atom)));
1111        }
1112
1113        self.sub_patterns.push((pattern_id, sub_pattern));
1114
1115        sub_pattern_id
1116    }
1117
1118    /// Checks if another rule, module or variable has the given identifier and
1119    /// return an error in that case.
1120    fn check_for_existing_identifier(
1121        &self,
1122        ident: &Ident,
1123    ) -> Result<(), CompileError> {
1124        if let Some(symbol) = self.symbol_table.lookup(ident.name) {
1125            return match symbol {
1126                // Found another rule with the same name.
1127                Symbol::Rule { rule_id, .. } => Err(DuplicateRule::build(
1128                    &self.report_builder,
1129                    ident.name.to_string(),
1130                    self.report_builder.span_to_code_loc(ident.span()),
1131                    self.rules
1132                        .get(rule_id.0 as usize)
1133                        .unwrap()
1134                        .ident_ref
1135                        .clone(),
1136                )),
1137                // Found another symbol that is not a rule, but has the same
1138                // name.
1139                _ => Err(ConflictingRuleIdentifier::build(
1140                    &self.report_builder,
1141                    ident.name.to_string(),
1142                    self.report_builder.span_to_code_loc(ident.span()),
1143                )),
1144            };
1145        }
1146        Ok(())
1147    }
1148
1149    /// Checks that tags are not duplicate.
1150    fn check_for_duplicate_tags(
1151        &self,
1152        tags: &[Ident],
1153    ) -> Result<(), CompileError> {
1154        let mut s = HashSet::new();
1155        for tag in tags {
1156            if !s.insert(tag.name) {
1157                return Err(DuplicateTag::build(
1158                    &self.report_builder,
1159                    tag.name.to_string(),
1160                    self.report_builder.span_to_code_loc(tag.span()),
1161                ));
1162            }
1163        }
1164        Ok(())
1165    }
1166
1167    /// Interns a literal in the literals pool.
1168    ///
1169    /// If `wide` is true the literal gets zeroes interleaved between each byte
1170    /// before being interned.
1171    fn intern_literal(&mut self, literal: &[u8], wide: bool) -> LiteralId {
1172        let wide_pattern;
1173        let literal_bytes = if wide {
1174            wide_pattern = make_wide(literal);
1175            wide_pattern.as_bytes()
1176        } else {
1177            literal
1178        };
1179        self.lit_pool.get_or_intern(literal_bytes)
1180    }
1181
1182    /// Takes a snapshot of the compiler's state at this moment.
1183    ///
1184    /// The returned [`Snapshot`] can be passed to [`Compiler::restore_snapshot`]
1185    /// for restoring the compiler to the state it was when the snapshot was
1186    /// taken.
1187    ///
1188    /// This is useful when the compilation of a rule fails, for restoring the
1189    /// compiler to the state it had before starting compiling the failed rule,
1190    /// which avoids leaving junk in the compiler's internal structures.
1191    fn take_snapshot(&self) -> Snapshot {
1192        Snapshot {
1193            next_pattern_id: self.next_pattern_id,
1194            rules_len: self.rules.len(),
1195            atoms_len: self.atoms.len(),
1196            re_code_len: self.re_code.len(),
1197            sub_patterns_len: self.sub_patterns.len(),
1198            symbol_table_len: self.symbol_table.len(),
1199            fast_scan_patterns_len: self.fast_scan_patterns.len(),
1200        }
1201    }
1202
1203    /// Restores the compiler's to a previous state.
1204    ///
1205    /// Use [`Compiler::take_snapshot`] for taking a snapshot of the compiler's
1206    /// state.
1207    fn restore_snapshot(&mut self, snapshot: Snapshot) {
1208        self.next_pattern_id = snapshot.next_pattern_id;
1209        self.rules.truncate(snapshot.rules_len);
1210        self.sub_patterns.truncate(snapshot.sub_patterns_len);
1211        self.re_code.truncate(snapshot.re_code_len);
1212        self.atoms.truncate(snapshot.atoms_len);
1213        self.symbol_table.truncate(snapshot.symbol_table_len);
1214        self.fast_scan_patterns.truncate(snapshot.fast_scan_patterns_len);
1215
1216        // Pattern IDs that are >= next_pattern_id, are being discarded. Any pattern
1217        // or file size bound associated to such IDs must be removed.
1218
1219        self.patterns
1220            .retain(|_, pattern_id| *pattern_id < snapshot.next_pattern_id);
1221
1222        self.filesize_bounds
1223            .retain(|pattern_id, _| *pattern_id < snapshot.next_pattern_id);
1224    }
1225
1226    /// Returns true if the bytes in the slice are all 0x00, 0x90, or 0xff.
1227    fn common_byte_repetition(bytes: &[u8]) -> bool {
1228        let mut all_x00 = true;
1229        let mut all_x90 = true;
1230        let mut all_xff = true;
1231
1232        for b in bytes {
1233            match *b {
1234                0x00 => {
1235                    all_x90 = false;
1236                    all_xff = false;
1237                }
1238                0x90 => {
1239                    all_x00 = false;
1240                    all_xff = false;
1241                }
1242                0xff => {
1243                    all_x00 = false;
1244                    all_x90 = false;
1245                }
1246                _ => return false,
1247            }
1248            if !all_x00 && !all_x90 && !all_xff {
1249                return false;
1250            }
1251        }
1252
1253        true
1254    }
1255
1256    /// Reads the file specified by an `include` statement.
1257    ///
1258    /// Tries to read the file in the include directories that were specified
1259    /// with [`Compiler::add_include_dir`], or in the current directory, if
1260    /// no include directories were specified.
1261    ///
1262    /// The function returns both the content and the path of the included file
1263    /// relative to the current directory, or an error if the included file could
1264    /// not be read.
1265    fn read_included_file(
1266        &mut self,
1267        include: &Include,
1268    ) -> Result<(Vec<u8>, PathBuf), CompileError> {
1269        let read_file =
1270            |path: PathBuf| -> Result<(Vec<u8>, PathBuf), io::Error> {
1271                let mut path = path.canonicalize()?;
1272                let content = fs::read(&path)?;
1273
1274                if let Ok(cwd) =
1275                    env::current_dir().and_then(|dir| dir.canonicalize())
1276                    && let Ok(relative_path) = path.strip_prefix(cwd)
1277                {
1278                    path = relative_path.to_path_buf();
1279                }
1280
1281                Ok((content, path))
1282            };
1283
1284        // Look for the included file in the directory at the top of the
1285        // include stack.
1286        if let Some(dir) =
1287            self.include_stack.last().and_then(|path| path.parent())
1288            && let Ok(result) = read_file(dir.join(include.file_name))
1289        {
1290            return Ok(result);
1291        }
1292
1293        // If one or more include directory were specified, try to find the
1294        // included file in them, in the order they were specified. Otherwise,
1295        // try to find the included file in the current directory.
1296        if let Some(include_dirs) = &self.include_dirs {
1297            if let Some(result) = include_dirs
1298                .iter()
1299                .find_map(|dir| read_file(dir.join(include.file_name)).ok())
1300            {
1301                Ok(result)
1302            } else {
1303                Err(IncludeNotFound::build(
1304                    &self.report_builder,
1305                    include.file_name.to_string(),
1306                    self.report_builder.span_to_code_loc(include.span()),
1307                ))
1308            }
1309        } else {
1310            read_file(PathBuf::from(include.file_name)).map_err(|err| {
1311                if err.kind() == io::ErrorKind::NotFound {
1312                    IncludeNotFound::build(
1313                        &self.report_builder,
1314                        include.file_name.to_string(),
1315                        self.report_builder.span_to_code_loc(include.span()),
1316                    )
1317                } else {
1318                    IncludeError::build(
1319                        &self.report_builder,
1320                        self.report_builder.span_to_code_loc(include.span()),
1321                        err.to_string(),
1322                    )
1323                }
1324            })
1325        }
1326    }
1327}
1328
1329impl Compiler<'_> {
1330    fn c_items<'a, I>(&mut self, items: I)
1331    where
1332        I: Iterator<Item = &'a ast::Item<'a>>,
1333    {
1334        let mut already_imported = FxHashMap::default();
1335
1336        for item in items {
1337            match item {
1338                ast::Item::Import(import) => {
1339                    // Checks that all imported modules actually exist, and
1340                    // raise warnings in case of duplicated imports within
1341                    // the same source file. For each module add a symbol to
1342                    // the current namespace.
1343                    if let Some(existing_import) = already_imported.insert(
1344                        &import.module_name,
1345                        self.report_builder.span_to_code_loc(import.span()),
1346                    ) {
1347                        let duplicated_import = self
1348                            .report_builder
1349                            .span_to_code_loc(import.span());
1350
1351                        let mut warning = warnings::DuplicateImport::build(
1352                            &self.report_builder,
1353                            import.module_name.to_string(),
1354                            duplicated_import.clone(),
1355                            existing_import,
1356                        );
1357
1358                        warning.report_mut().patch(duplicated_import, "");
1359
1360                        self.warnings.add(|| warning)
1361                    }
1362                    // Import the module. This updates `self.root_struct` if
1363                    // necessary.
1364                    if let Err(err) = self.c_import(import) {
1365                        self.errors.push(err);
1366                    }
1367                }
1368                ast::Item::Include(include) => {
1369                    // Return an error if includes are disabled
1370                    if !self.includes_enabled {
1371                        self.errors.push(IncludeNotAllowed::build(
1372                            &self.report_builder,
1373                            self.report_builder
1374                                .span_to_code_loc(include.span()),
1375                        ));
1376                        continue;
1377                    }
1378
1379                    let (included_src, included_path) =
1380                        match self.read_included_file(include) {
1381                            Ok(included) => included,
1382                            Err(err) => {
1383                                self.errors.push(err);
1384                                continue;
1385                            }
1386                        };
1387
1388                    if self.include_stack.contains(&included_path) {
1389                        self.errors.push(CircularIncludes::build(
1390                            &self.report_builder,
1391                            self.report_builder
1392                                .span_to_code_loc(include.span()),
1393                            Some(format!(
1394                                "include dependencies:\n{}",
1395                                self.include_stack
1396                                    .iter()
1397                                    .enumerate()
1398                                    .map(|(i, path)| format!(
1399                                        "{:>width$}↳ {}",
1400                                        "",
1401                                        path.display(),
1402                                        width = i * 2
1403                                    ))
1404                                    .collect::<Vec<_>>()
1405                                    .join("\n")
1406                            )),
1407                        ));
1408                        continue;
1409                    }
1410
1411                    // Save the current source ID from the report builder in
1412                    // order to restore it later. Any recursive call to
1413                    // `add_source` will change the current source ID, and we
1414                    // need to restore after `add_source` returns.
1415                    let source_id =
1416                        self.report_builder.get_current_source_id().unwrap();
1417
1418                    let source_code =
1419                        SourceCode::from(included_src.as_slice()).with_origin(
1420                            // In Windows the paths separators are backslashes, but we
1421                            // want to use slashes.
1422                            included_path.to_str().unwrap().replace("\\", "/"),
1423                        );
1424
1425                    self.include_stack.push(included_path);
1426
1427                    // Any error generated while processing the included source
1428                    // code will be added to `self.errors`. The error returned
1429                    // by `add_source` is simply the first of the added errors,
1430                    // we don't need to handle the error here.
1431                    let _ = self.add_source(source_code);
1432
1433                    // Restore the current source ID to the value it had before
1434                    // calling `add_source`.
1435                    self.report_builder.set_current_source_id(source_id);
1436
1437                    self.include_stack.pop().unwrap();
1438                }
1439                ast::Item::Rule(rule) => {
1440                    if let Err(err) = self.c_rule(rule) {
1441                        self.errors.push(err);
1442                    }
1443                }
1444            }
1445        }
1446    }
1447
1448    fn c_rule(&mut self, rule: &ast::Rule) -> Result<(), CompileError> {
1449        // Check if another rule, module or variable has the same identifier
1450        // and return an error in that case.
1451        self.check_for_existing_identifier(&rule.identifier)?;
1452
1453        // Check that rule tags, if any, doesn't contain duplicates.
1454        if let Some(tags) = &rule.tags {
1455            self.check_for_duplicate_tags(tags.as_slice())?;
1456        }
1457
1458        // Check the rule with all the linters.
1459        let mut first_linter_err: Option<CompileError> = None;
1460        for linter in self.linters.iter() {
1461            match linter.check(&self.report_builder, rule) {
1462                LinterResult::Ok => {}
1463                LinterResult::Warn(warning) => {
1464                    self.warnings.add(|| warning);
1465                }
1466                LinterResult::Warns(warnings) => {
1467                    for warning in warnings {
1468                        self.warnings.add(|| warning);
1469                    }
1470                }
1471                LinterResult::Err(err) => {
1472                    if first_linter_err.is_none() {
1473                        first_linter_err = Some(err);
1474                    } else {
1475                        self.errors.push(err);
1476                    }
1477                }
1478            }
1479        }
1480        if let Some(err) = first_linter_err {
1481            return Err(err);
1482        }
1483
1484        // Take snapshot of the current compiler state. In case of error
1485        // compiling the current rule this snapshot allows restoring the
1486        // compiler to the state it had before starting compiling the rule.
1487        // This way we don't leave too much junk, like atoms, or sub-patterns
1488        // corresponding to failed rules. However, there is some junk left
1489        // behind in `ident_pool` and `lit_pool`, because once a string is
1490        // added to one of these pools it can't be removed.
1491        let snapshot = self.take_snapshot();
1492
1493        let tags: Vec<IdentId> = rule
1494            .tags
1495            .iter()
1496            .flatten()
1497            .map(|t| self.ident_pool.get_or_intern(t.name))
1498            .collect();
1499
1500        // Helper function that converts from `ast::MetaValue` to
1501        // `compiler::rules::MetaValue`.
1502        let mut convert_meta_value = |value: &ast::MetaValue| match value {
1503            ast::MetaValue::Integer((i, _)) => MetaValue::Integer(*i),
1504            ast::MetaValue::Float((f, _)) => MetaValue::Float(*f),
1505            ast::MetaValue::Bool((b, _)) => MetaValue::Bool(*b),
1506            ast::MetaValue::String((s, _)) => {
1507                MetaValue::String(self.lit_pool.get_or_intern(s))
1508            }
1509            ast::MetaValue::Bytes((s, _)) => {
1510                MetaValue::Bytes(self.lit_pool.get_or_intern(s))
1511            }
1512        };
1513
1514        // Build a vector of pairs (IdentId, MetaValue) for every meta defined
1515        // in the rule.
1516        let metadata = rule
1517            .meta
1518            .iter()
1519            .flatten()
1520            .map(|m| {
1521                (
1522                    self.ident_pool.get_or_intern(m.identifier.name),
1523                    convert_meta_value(&m.value),
1524                )
1525            })
1526            .collect();
1527
1528        let mut rule_patterns = Vec::new();
1529
1530        let mut ctx = CompileContext {
1531            ir: &mut self.ir,
1532            relaxed_re_syntax: self.relaxed_re_syntax,
1533            error_on_slow_loop: self.error_on_slow_loop,
1534            one_shot_symbol_table: None,
1535            symbol_table: &mut self.symbol_table,
1536            report_builder: &self.report_builder,
1537            current_rule_patterns: &mut rule_patterns,
1538            warnings: &mut self.warnings,
1539            vars: VarStack::new(),
1540            for_of_depth: 0,
1541            features: &self.features,
1542            loop_iteration_multiplier: 1,
1543            regex_sets: &mut self.regex_sets,
1544            regex_pool: &mut self.regex_pool,
1545        };
1546
1547        // Convert the patterns from AST to IR. This populates the
1548        // `ctx.current_rule_patterns` vector.
1549        if let Err(err) = patterns_from_ast(&mut ctx, rule) {
1550            drop(ctx);
1551            self.restore_snapshot(snapshot);
1552            return Err(err);
1553        }
1554
1555        // Convert the condition from AST to IR. Also updates the patterns
1556        // with information about whether they are used in the condition and
1557        // if they are anchored or not.
1558        let condition = rule_condition_from_ast(&mut ctx, rule);
1559
1560        drop(ctx);
1561
1562        // Search for patterns that are very common byte repetitions like:
1563        //
1564        //   00 00 00 00 00 00 ....
1565        //   90 90 90 90 90 90 ....
1566        //   FF FF FF FF FF FF ....
1567        //
1568        // Raise a warning when such a pattern is found, except in the
1569        // following cases:
1570        //
1571        // 1) When the pattern is anchored, because anchored patterns can
1572        //    appear only at a fixed offset and are not searched by Aho-Corasick.
1573        //
1574        // 2) When the pattern has attributes: xor, fullword, base64 or
1575        //    base64wide, because in those cases the real pattern is not that
1576        //    common.
1577        //
1578        // Note: this can't be done before calling `rule_condition_from_ast`,
1579        // because we don't know which patterns are anchored until the condition
1580        // is processed.
1581        for pat in rule_patterns.iter() {
1582            if pat.anchored_at().is_none()
1583                && !pat.pattern().flags().intersects(
1584                    PatternFlags::Xor
1585                        | PatternFlags::Fullword
1586                        | PatternFlags::Base64
1587                        | PatternFlags::Base64Wide,
1588                )
1589            {
1590                let literal_bytes = match pat.pattern() {
1591                    Pattern::Text(lit) => Some(lit.text.as_bytes()),
1592                    Pattern::Regexp(re) => re.hir.as_literal_bytes(),
1593                    Pattern::Hex(re) => re.hir.as_literal_bytes(),
1594                };
1595                if let Some(literal_bytes) = literal_bytes
1596                    && Self::common_byte_repetition(literal_bytes)
1597                {
1598                    self.warnings.add(|| {
1599                        warnings::SlowPattern::build(
1600                            &self.report_builder,
1601                            self.report_builder
1602                                .span_to_code_loc(pat.span().clone()),
1603                            None,
1604                        )
1605                    });
1606                }
1607            }
1608        }
1609
1610        // In case of error, restore the compiler to the state it was before
1611        // entering this function. Also, if the error is due to an unknown
1612        // identifier, but the identifier is one of the unsupported modules,
1613        // the error is tolerated and a warning is issued instead.
1614        let mut condition = match condition {
1615            Ok(condition) => condition,
1616            Err(CompileError::UnknownIdentifier(unknown))
1617                if self.ignored_rules.contains_key(unknown.identifier())
1618                    || self.ignored_modules.contains(unknown.identifier()) =>
1619            {
1620                self.restore_snapshot(snapshot);
1621
1622                if let Some(module_name) =
1623                    self.ignored_rules.get(unknown.identifier())
1624                {
1625                    self.warnings.add(|| {
1626                        warnings::IgnoredRule::build(
1627                            &self.report_builder,
1628                            module_name.clone(),
1629                            rule.identifier.name.to_string(),
1630                            unknown.identifier_location().clone(),
1631                        )
1632                    });
1633                    self.ignored_rules.insert(
1634                        rule.identifier.name.to_string(),
1635                        module_name.clone(),
1636                    );
1637                } else {
1638                    self.warnings.add(|| {
1639                        warnings::IgnoredModule::build(
1640                            &self.report_builder,
1641                            unknown.identifier().to_string(),
1642                            unknown.identifier_location().clone(),
1643                            Some(format!(
1644                                "the whole rule `{}` will be ignored",
1645                                rule.identifier.name
1646                            )),
1647                        )
1648                    });
1649                    self.ignored_rules.insert(
1650                        rule.identifier.name.to_string(),
1651                        unknown.identifier().to_string(),
1652                    );
1653                }
1654
1655                return Ok(());
1656            }
1657            Err(err) => {
1658                self.restore_snapshot(snapshot);
1659                return Err(err);
1660            }
1661        };
1662
1663        if self.hoisting {
1664            condition = self.ir.hoisting();
1665        }
1666
1667        // Analyze the condition and determine the bounds it imposes to
1668        // `filesize`, if any.
1669        let filesize_bounds = self.ir.filesize_bounds();
1670
1671        // Set the bounds to all patterns in the rule. This must be done
1672        // before assigning the PatternId to each pattern, as the filesize
1673        // bounds are taken into account when determining if the pattern
1674        // is unique or re-used from a previous rule.
1675        if !filesize_bounds.unbounded() {
1676            for pattern in &mut rule_patterns {
1677                pattern.pattern_mut().set_filesize_bounds(&filesize_bounds);
1678            }
1679        }
1680
1681        if let Some(w) = &mut self.ir_writer {
1682            writeln!(w, "RULE {}", rule.identifier.name).unwrap();
1683            writeln!(w, "{:?}", self.ir).unwrap();
1684            if !filesize_bounds.unbounded() {
1685                writeln!(w, "{filesize_bounds:?}\n",).unwrap();
1686            }
1687        }
1688
1689        let mut pattern_ids = Vec::with_capacity(rule_patterns.len());
1690        let mut patterns = Vec::with_capacity(rule_patterns.len());
1691        let mut pending_patterns = HashSet::new();
1692        let mut num_private_patterns = 0;
1693
1694        for pattern in &rule_patterns {
1695            // Raise an error if some pattern was not used, except if the pattern
1696            // identifier starts with underscore.
1697            if !pattern.in_use() && !pattern.identifier().starts_with("$_") {
1698                self.restore_snapshot(snapshot);
1699                return Err(UnusedPattern::build(
1700                    &self.report_builder,
1701                    pattern.identifier().name.to_string(),
1702                    self.report_builder
1703                        .span_to_code_loc(pattern.identifier().span()),
1704                ));
1705            }
1706
1707            if pattern.pattern().flags().contains(PatternFlags::Private) {
1708                num_private_patterns += 1;
1709            }
1710
1711            // Check if this pattern has been declared before, in this rule or
1712            // in some other rule. In such cases the pattern ID is re-used, and
1713            // we don't need to process (i.e: extract atoms and add them to
1714            // Aho-Corasick automaton) the pattern again. Two patterns are
1715            // considered equal if they are exactly the same, including any
1716            // modifiers associated to the pattern, both are non-anchored
1717            // or anchored at the same file offset, and if they have the same
1718            // file size bounds.
1719            let pattern_id =
1720                match self.patterns.entry(pattern.pattern().clone()) {
1721                    // The pattern already exists, return the existing ID.
1722                    Entry::Occupied(entry) => *entry.get(),
1723                    // The pattern didn't exist.
1724                    Entry::Vacant(entry) => {
1725                        let pattern_id = self.next_pattern_id;
1726                        self.next_pattern_id.incr(1);
1727                        self.fast_scan_patterns.push(true);
1728                        pending_patterns.insert(pattern_id);
1729                        entry.insert(pattern_id);
1730                        pattern_id
1731                    }
1732                };
1733
1734            if !pattern.fast_scan_allowed() {
1735                self.fast_scan_patterns.set(usize::from(pattern_id), false);
1736            }
1737
1738            let kind = match pattern.pattern() {
1739                Pattern::Text(_) => PatternKind::Text,
1740                Pattern::Regexp(_) => PatternKind::Regexp,
1741                Pattern::Hex(_) => PatternKind::Hex,
1742            };
1743
1744            patterns.push(PatternInfo {
1745                kind,
1746                pattern_id,
1747                ident_id: self
1748                    .ident_pool
1749                    .get_or_intern(pattern.identifier().name),
1750                is_private: pattern
1751                    .pattern()
1752                    .flags()
1753                    .contains(PatternFlags::Private),
1754            });
1755
1756            pattern_ids.push(pattern_id);
1757        }
1758
1759        // The RuleId for the new rule is current length of `self.rules`. The
1760        // first rule has RuleId = 0.
1761        let rule_id = RuleId::from(self.rules.len());
1762
1763        self.rules.push(RuleInfo {
1764            tags,
1765            metadata,
1766            patterns,
1767            num_private_patterns,
1768            is_global: rule.flags.contains(RuleFlags::Global),
1769            is_private: rule.flags.contains(RuleFlags::Private),
1770            namespace_id: self.current_namespace.id,
1771            namespace_ident_id: self.current_namespace.ident_id,
1772            ident_id: self.ident_pool.get_or_intern(rule.identifier.name),
1773            ident_ref: self
1774                .report_builder
1775                .span_to_code_loc(rule.identifier.span()),
1776        });
1777
1778        // Process the patterns in the rule. This extracts the best atoms
1779        // from each pattern, adding them to the `self.atoms` vector, it
1780        // also creates one or more sub-patterns per pattern and adds them
1781        // to `self.sub_patterns`
1782        for (pattern_id, pattern) in
1783            izip!(pattern_ids.iter(), rule_patterns.into_iter())
1784        {
1785            if pending_patterns.contains(pattern_id) {
1786                let pattern_span = pattern.span().clone();
1787                match pattern.into_pattern() {
1788                    Pattern::Text(pattern) => {
1789                        self.c_literal_pattern(*pattern_id, pattern);
1790                    }
1791                    Pattern::Regexp(pattern) | Pattern::Hex(pattern) => {
1792                        if let Err(err) = self.c_regexp_pattern(
1793                            *pattern_id,
1794                            pattern,
1795                            pattern_span,
1796                        ) {
1797                            self.restore_snapshot(snapshot);
1798                            return Err(err);
1799                        }
1800                    }
1801                };
1802                if !filesize_bounds.unbounded()
1803                    && self
1804                        .filesize_bounds
1805                        .insert(*pattern_id, filesize_bounds.clone())
1806                        .is_some()
1807                {
1808                    // This should not happen.
1809                    panic!(
1810                        "modifying the file size bounds of an existing pattern"
1811                    )
1812                }
1813                pending_patterns.remove(pattern_id);
1814            }
1815        }
1816
1817        // Create a new symbol of bool type for the rule.
1818        let new_symbol = Symbol::Rule {
1819            rule_id,
1820            is_global: rule.flags.contains(RuleFlags::Global),
1821        };
1822
1823        // Insert the symbol in the symbol table corresponding to the
1824        // current namespace. This must be done after every fallible function
1825        // has been called; once the symbol is inserted in the symbol table,
1826        // it can't be undone.
1827        let existing_symbol = self
1828            .current_namespace
1829            .symbols
1830            .as_ref()
1831            .borrow_mut()
1832            .insert(rule.identifier.name, new_symbol);
1833
1834        // No other symbol with the same identifier should exist.
1835        assert!(existing_symbol.is_none());
1836
1837        // The last step is emitting the WASM code corresponding to the rule's
1838        // condition. This is done after every fallible function has been called
1839        // because once the code is emitted it cannot be undone, which means
1840        // that if this function fails after emitting the code, some code debris
1841        // will remain in the WASM module.
1842        let mut ctx = EmitContext {
1843            current_rule: self.rules.last_mut().unwrap(),
1844            lit_pool: &mut self.lit_pool,
1845            regex_pool: &mut self.regex_pool,
1846            wasm_symbols: &self.wasm_symbols,
1847            wasm_exports: &self.wasm_exports,
1848            exception_handler_stack: Vec::new(),
1849            lookup_list: Vec::new(),
1850            emit_search_for_pattern_stack: Vec::new(),
1851        };
1852
1853        emit_rule_condition(
1854            &mut ctx,
1855            &self.ir,
1856            rule_id,
1857            condition,
1858            &mut self.wasm_mod,
1859        );
1860
1861        Ok(())
1862    }
1863
1864    fn c_import(&mut self, import: &Import) -> Result<(), CompileError> {
1865        let module_name = import.module_name;
1866        let module = crate::modules::registered_modules()
1867            .find(|m| m.name() == module_name);
1868
1869        // Does a module with the given name actually exist? ...
1870        if module.is_none() {
1871            // The module does not exist, but it is included in the list
1872            // of unsupported modules. In such cases we don't raise an error,
1873            // only a warning.
1874            return if self.ignored_modules.iter().any(|m| m == module_name) {
1875                self.warnings.add(|| {
1876                    warnings::IgnoredModule::build(
1877                        &self.report_builder,
1878                        module_name.to_string(),
1879                        self.report_builder.span_to_code_loc(import.span()),
1880                        None,
1881                    )
1882                });
1883                Ok(())
1884            } else {
1885                // The module does not exist, and is not explicitly added to
1886                // the list of unsupported modules, that's an error.
1887                Err(UnknownModule::build(
1888                    &self.report_builder,
1889                    module_name.to_string(),
1890                    self.report_builder.span_to_code_loc(import.span()),
1891                ))
1892            };
1893        }
1894
1895        // Yes, module exists.
1896        let module = module.unwrap();
1897
1898        // If the module has not been added to `self.root_struct` and
1899        // `self.imported_modules`, do it.
1900        if !self.root_struct.has_field(module_name) {
1901            // Add the module to the list of imported modules.
1902            self.imported_modules
1903                .push(self.ident_pool.get_or_intern(module_name));
1904
1905            // Create the `Struct` that describes the module.
1906            let module_struct = Rc::<Struct>::from(module);
1907
1908            // Insert the module in the struct that contains all imported
1909            // modules. This struct contains all modules imported, from
1910            // all namespaces. Panic if the module was already in the struct.
1911            if self
1912                .root_struct
1913                .add_field(module_name, TypeValue::Struct(module_struct))
1914                .is_some()
1915            {
1916                panic!("duplicate module `{module_name}`")
1917            }
1918        }
1919
1920        let mut symbol_table =
1921            self.current_namespace.symbols.as_ref().borrow_mut();
1922
1923        // Create a symbol for the module and insert it in the symbol
1924        // table for this namespace, if it doesn't exist.
1925        if !symbol_table.contains(module_name) {
1926            symbol_table.insert(
1927                module_name,
1928                self.root_struct.lookup(module_name).unwrap(),
1929            );
1930        }
1931
1932        // Is the module banned? If yes, produce an error. Notice however that
1933        // this check is done after the module has been added to the symbol
1934        // table because we don't want additional errors due to undefined
1935        // identifiers when the banned module is used in some rule condition.
1936        if let Some((error_title, error_msg)) =
1937            self.banned_modules.get(module_name)
1938        {
1939            return Err(CustomError::build(
1940                &self.report_builder,
1941                error_title.clone(),
1942                error_msg.clone(),
1943                self.report_builder.span_to_code_loc(import.span()),
1944            ));
1945        }
1946
1947        Ok(())
1948    }
1949
1950    fn c_literal_pattern(
1951        &mut self,
1952        pattern_id: PatternId,
1953        pattern: LiteralPattern,
1954    ) {
1955        let full_word = pattern.flags.contains(PatternFlags::Fullword);
1956        let mut flags = SubPatternFlags::empty();
1957
1958        if full_word {
1959            flags.insert(SubPatternFlags::FullwordLeft);
1960            flags.insert(SubPatternFlags::FullwordRight);
1961        }
1962
1963        // Depending on the combination of `ascii` and `wide` modifiers, the
1964        // `main_patterns` vector will contain either the pattern's `ascii`
1965        // version, the `wide` version, or both. Each item in `main_patterns`
1966        // also contains the best atom for the pattern.
1967        let mut main_patterns = Vec::new();
1968        let wide_pattern;
1969
1970        if pattern.flags.contains(PatternFlags::Wide) {
1971            wide_pattern = make_wide(pattern.text.as_bytes());
1972            main_patterns.push((
1973                wide_pattern.as_slice(),
1974                best_atom_in_bytes(wide_pattern.as_slice()),
1975                flags | SubPatternFlags::Wide,
1976            ));
1977        }
1978
1979        if pattern.flags.contains(PatternFlags::Ascii) {
1980            main_patterns.push((
1981                pattern.text.as_bytes(),
1982                best_atom_in_bytes(pattern.text.as_bytes()),
1983                flags,
1984            ));
1985        }
1986
1987        for (main_pattern, best_atom, flags) in main_patterns {
1988            let pattern_lit_id = self.lit_pool.get_or_intern(main_pattern);
1989
1990            if pattern.flags.contains(PatternFlags::Xor) {
1991                // When `xor` is used, `base64`, `base64wide` and `nocase` are
1992                // not accepted.
1993                debug_assert!(!pattern.flags.contains(
1994                    PatternFlags::Base64
1995                        | PatternFlags::Base64Wide
1996                        | PatternFlags::Nocase,
1997                ));
1998
1999                let xor_range = pattern.xor_range.clone().unwrap();
2000                self.add_sub_pattern(
2001                    pattern_id,
2002                    SubPattern::Xor { pattern: pattern_lit_id, flags },
2003                    best_atom.xor_combinations(xor_range),
2004                    SubPatternAtom::from_atom,
2005                );
2006            } else if pattern.flags.contains(PatternFlags::Nocase) {
2007                // When `nocase` is used, `base64`, `base64wide` and `xor` are
2008                // not accepted.
2009                debug_assert!(!pattern.flags.contains(
2010                    PatternFlags::Base64
2011                        | PatternFlags::Base64Wide
2012                        | PatternFlags::Xor,
2013                ));
2014
2015                self.add_sub_pattern(
2016                    pattern_id,
2017                    SubPattern::Literal {
2018                        pattern: pattern_lit_id,
2019                        flags: flags | SubPatternFlags::Nocase,
2020                        anchored_at: None,
2021                    },
2022                    best_atom.case_combinations(),
2023                    SubPatternAtom::from_atom,
2024                );
2025            }
2026            // Used `base64`, or `base64wide`, or both.
2027            else if pattern
2028                .flags
2029                .intersects(PatternFlags::Base64 | PatternFlags::Base64Wide)
2030            {
2031                // When `base64` or `base64wide` are used, `xor`, `fullword`
2032                // and `nocase` are not accepted.
2033                debug_assert!(!pattern.flags.contains(
2034                    PatternFlags::Xor
2035                        | PatternFlags::Fullword
2036                        | PatternFlags::Nocase,
2037                ));
2038
2039                if pattern.flags.contains(PatternFlags::Base64) {
2040                    for (padding, base64_pattern) in base64_patterns(
2041                        main_pattern,
2042                        pattern.base64_alphabet.as_deref(),
2043                    ) {
2044                        let sub_pattern = if let Some(alphabet) =
2045                            pattern.base64_alphabet.as_deref()
2046                        {
2047                            SubPattern::CustomBase64 {
2048                                pattern: pattern_lit_id,
2049                                alphabet: self
2050                                    .lit_pool
2051                                    .get_or_intern(alphabet),
2052                                padding,
2053                            }
2054                        } else {
2055                            SubPattern::Base64 {
2056                                pattern: pattern_lit_id,
2057                                padding,
2058                            }
2059                        };
2060
2061                        self.add_sub_pattern(
2062                            pattern_id,
2063                            sub_pattern,
2064                            iter::once({
2065                                let mut atom = best_atom_in_bytes(
2066                                    base64_pattern.as_slice(),
2067                                );
2068                                // Atoms for base64 patterns are always
2069                                // inexact, they require verification.
2070                                atom.make_inexact();
2071                                atom
2072                            }),
2073                            SubPatternAtom::from_atom,
2074                        );
2075                    }
2076                }
2077
2078                if pattern.flags.contains(PatternFlags::Base64Wide) {
2079                    for (padding, base64_pattern) in base64_patterns(
2080                        main_pattern,
2081                        pattern.base64wide_alphabet.as_deref(),
2082                    ) {
2083                        let sub_pattern = if let Some(alphabet) =
2084                            pattern.base64wide_alphabet.as_deref()
2085                        {
2086                            SubPattern::CustomBase64Wide {
2087                                pattern: pattern_lit_id,
2088                                alphabet: self
2089                                    .lit_pool
2090                                    .get_or_intern(alphabet),
2091                                padding,
2092                            }
2093                        } else {
2094                            SubPattern::Base64Wide {
2095                                pattern: pattern_lit_id,
2096                                padding,
2097                            }
2098                        };
2099
2100                        let wide = make_wide(base64_pattern.as_slice());
2101
2102                        self.add_sub_pattern(
2103                            pattern_id,
2104                            sub_pattern,
2105                            iter::once({
2106                                let mut atom =
2107                                    best_atom_in_bytes(wide.as_slice());
2108                                // Atoms for base64 patterns are always
2109                                // inexact, they require verification.
2110                                atom.make_inexact();
2111                                atom
2112                            }),
2113                            SubPatternAtom::from_atom,
2114                        );
2115                    }
2116                }
2117            } else {
2118                self.add_sub_pattern(
2119                    pattern_id,
2120                    SubPattern::Literal {
2121                        pattern: pattern_lit_id,
2122                        anchored_at: pattern.anchored_at,
2123                        flags,
2124                    },
2125                    iter::once(best_atom),
2126                    SubPatternAtom::from_atom,
2127                );
2128            }
2129        }
2130    }
2131
2132    fn c_regexp_pattern(
2133        &mut self,
2134        pattern_id: PatternId,
2135        pattern: RegexpPattern,
2136        span: Span,
2137    ) -> Result<(), CompileError> {
2138        // Try splitting the regexp into multiple chained sub-patterns if it
2139        // contains large gaps. For example, `{ 01 02 03 [-] 04 05 06 }` is
2140        // split into `{ 01 02 03 }` and `{ 04 05 06 }`, where `{ 04 05 06 }`
2141        // is chained to `{ 01 02 03 }`.
2142        //
2143        // If the regexp can't be split then `head` is the whole regexp.
2144        let (head, tail) = pattern.hir.split_at_large_gaps();
2145
2146        if !tail.is_empty() {
2147            // The pattern was split into multiple chained regexps.
2148            return self.c_chain(
2149                pattern_id,
2150                &head,
2151                &tail,
2152                pattern.flags,
2153                span,
2154            );
2155        }
2156
2157        if head.is_alternation_literal() {
2158            // The pattern is either a literal, or an alternation of literals.
2159            // Examples:
2160            //   /foo/
2161            //   /foo|bar|baz/
2162            //   { 01 02 03 }
2163            //   { (01 02 03 | 04 05 06 ) }
2164            return self.c_alternation_literal(
2165                pattern_id,
2166                head,
2167                pattern.anchored_at,
2168                pattern.flags,
2169            );
2170        }
2171
2172        // If this point is reached, this is a pattern that can't be split into
2173        // multiple chained patterns, and is neither a literal or alternation
2174        // of literals. Most patterns fall in this category.
2175        let mut flags = SubPatternFlags::empty();
2176
2177        if pattern.flags.contains(PatternFlags::Nocase) {
2178            flags.insert(SubPatternFlags::Nocase);
2179        }
2180
2181        if pattern.flags.contains(PatternFlags::Fullword) {
2182            flags.insert(SubPatternFlags::FullwordLeft);
2183            flags.insert(SubPatternFlags::FullwordRight);
2184        }
2185
2186        if matches!(head.is_greedy(), Some(true)) {
2187            flags.insert(SubPatternFlags::GreedyRegexp);
2188        }
2189
2190        let (atoms, is_fast_regexp) = self.c_regexp(&head, span)?;
2191
2192        if is_fast_regexp {
2193            flags.insert(SubPatternFlags::FastRegexp);
2194        }
2195
2196        if pattern.flags.contains(PatternFlags::Wide) {
2197            self.add_sub_pattern(
2198                pattern_id,
2199                SubPattern::Regexp { flags: flags | SubPatternFlags::Wide },
2200                atoms.iter().cloned().map(|atom| atom.make_wide()),
2201                SubPatternAtom::from_regexp_atom,
2202            );
2203        }
2204
2205        if pattern.flags.contains(PatternFlags::Ascii) {
2206            self.add_sub_pattern(
2207                pattern_id,
2208                SubPattern::Regexp { flags },
2209                atoms.into_iter(),
2210                SubPatternAtom::from_regexp_atom,
2211            );
2212        }
2213
2214        Ok(())
2215    }
2216
2217    fn c_alternation_literal(
2218        &mut self,
2219        pattern_id: PatternId,
2220        hir: re::hir::Hir,
2221        anchored_at: Option<usize>,
2222        flags: PatternFlags,
2223    ) -> Result<(), CompileError> {
2224        let ascii = flags.contains(PatternFlags::Ascii);
2225        let wide = flags.contains(PatternFlags::Wide);
2226        let case_insensitive = flags.contains(PatternFlags::Nocase);
2227        let full_word = flags.contains(PatternFlags::Fullword);
2228
2229        let mut flags = SubPatternFlags::empty();
2230
2231        if case_insensitive {
2232            flags.insert(SubPatternFlags::Nocase);
2233        }
2234
2235        if full_word {
2236            flags.insert(SubPatternFlags::FullwordLeft);
2237            flags.insert(SubPatternFlags::FullwordRight);
2238        }
2239
2240        let mut process_literal = |literal: &hir::Literal, wide: bool| {
2241            let pattern_lit_id =
2242                self.intern_literal(literal.0.as_bytes(), wide);
2243
2244            let best_atom = best_atom_in_bytes(
2245                self.lit_pool.get_bytes(pattern_lit_id).unwrap(),
2246            );
2247
2248            let flags =
2249                if wide { flags | SubPatternFlags::Wide } else { flags };
2250
2251            let sub_pattern = SubPattern::Literal {
2252                pattern: pattern_lit_id,
2253                anchored_at,
2254                flags,
2255            };
2256
2257            if case_insensitive {
2258                self.add_sub_pattern(
2259                    pattern_id,
2260                    sub_pattern,
2261                    best_atom.case_combinations(),
2262                    SubPatternAtom::from_atom,
2263                );
2264            } else {
2265                self.add_sub_pattern(
2266                    pattern_id,
2267                    sub_pattern,
2268                    iter::once(best_atom),
2269                    SubPatternAtom::from_atom,
2270                );
2271            }
2272        };
2273
2274        let inner;
2275
2276        let hir = if let hir::HirKind::Capture(group) = hir.kind() {
2277            group.sub.as_ref()
2278        } else {
2279            inner = hir.into_inner();
2280            &inner
2281        };
2282
2283        match hir.kind() {
2284            hir::HirKind::Literal(literal) => {
2285                if ascii {
2286                    process_literal(literal, false);
2287                }
2288                if wide {
2289                    process_literal(literal, true);
2290                }
2291            }
2292            hir::HirKind::Alternation(literals) => {
2293                let literals = literals
2294                    .iter()
2295                    .map(|l| cast!(l.kind(), hir::HirKind::Literal));
2296                for literal in literals {
2297                    if ascii {
2298                        process_literal(literal, false);
2299                    }
2300                    if wide {
2301                        process_literal(literal, true);
2302                    }
2303                }
2304            }
2305            _ => unreachable!(),
2306        }
2307
2308        Ok(())
2309    }
2310
    /// Compiles a chained pattern, made of a `leading` piece followed by
    /// zero or more `trailing` pieces separated by gaps.
    ///
    /// For each enabled variant (`PatternFlags::Ascii` and/or
    /// `PatternFlags::Wide`) one chain-head sub-pattern is added for
    /// `leading`, and one chain-tail sub-pattern is added for each item in
    /// `trailing`. Every tail is linked, through `chained_to`, to the
    /// sub-pattern that was added for the previous piece of the same
    /// variant.
    ///
    /// Pieces whose HIR is a plain literal become `LiteralChainHead` /
    /// `LiteralChainTail` sub-patterns; any other piece is compiled with
    /// [`Self::c_regexp`] and becomes a `RegexpChainHead` /
    /// `RegexpChainTail`.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by [`Self::c_regexp`] while compiling
    /// a non-literal piece.
    fn c_chain(
        &mut self,
        pattern_id: PatternId,
        leading: &re::hir::Hir,
        trailing: &[ChainedPattern],
        flags: PatternFlags,
        span: Span,
    ) -> Result<(), CompileError> {
        let ascii = flags.contains(PatternFlags::Ascii);
        let wide = flags.contains(PatternFlags::Wide);
        let case_insensitive = flags.contains(PatternFlags::Nocase);
        let full_word = flags.contains(PatternFlags::Fullword);

        // Flags shared by every sub-pattern in the chain. Each piece starts
        // from a copy of these and adds its own flags on top.
        let mut common_flags = SubPatternFlags::empty();

        if case_insensitive {
            common_flags.insert(SubPatternFlags::Nocase);
        }

        // If the leading piece is greedy, the whole chain is flagged as
        // greedy, because this flag is inherited by all trailing pieces.
        if matches!(leading.is_greedy(), Some(true)) {
            common_flags.insert(SubPatternFlags::GreedyRegexp);
        }

        // IDs of the most recently added sub-pattern for each variant. They
        // are overwritten by the chain head below when the corresponding
        // variant is enabled; `SubPatternId(0)` is only a placeholder.
        let mut prev_sub_pattern_ascii = SubPatternId(0);
        let mut prev_sub_pattern_wide = SubPatternId(0);

        if let hir::HirKind::Literal(literal) = leading.kind() {
            let mut flags = common_flags;

            // Only the head of the chain is checked for a word boundary on
            // its left side.
            if full_word {
                flags.insert(SubPatternFlags::FullwordLeft);
            }

            if ascii {
                prev_sub_pattern_ascii =
                    self.c_literal_chain_head(pattern_id, literal, flags);
            }

            if wide {
                prev_sub_pattern_wide = self.c_literal_chain_head(
                    pattern_id,
                    literal,
                    flags | SubPatternFlags::Wide,
                );
            };
        } else {
            let mut flags = common_flags;

            let (atoms, is_fast_regexp) =
                self.c_regexp(leading, span.clone())?;

            if is_fast_regexp {
                flags.insert(SubPatternFlags::FastRegexp);
            }

            if full_word {
                flags.insert(SubPatternFlags::FullwordLeft);
            }

            // The wide variant is added first because it only needs a
            // cloned view of `atoms`; the ascii variant below can then
            // consume the vector without an extra copy.
            if wide {
                prev_sub_pattern_wide = self.add_sub_pattern(
                    pattern_id,
                    SubPattern::RegexpChainHead {
                        flags: flags | SubPatternFlags::Wide,
                    },
                    atoms.iter().cloned().map(|atom| atom.make_wide()),
                    SubPatternAtom::from_regexp_atom,
                );
            }

            if ascii {
                prev_sub_pattern_ascii = self.add_sub_pattern(
                    pattern_id,
                    SubPattern::RegexpChainHead { flags },
                    atoms.into_iter(),
                    SubPatternAtom::from_regexp_atom,
                );
            }
        }

        for (i, p) in trailing.iter().enumerate() {
            let mut flags = common_flags;

            // The last pattern in the chain has the `LastInChain` flag and
            // the `FullwordRight` if the original pattern was `Fullword`.
            // Patterns in the middle of the chain won't have either of these
            // flags.
            //
            // `trailing.len() - 1` cannot underflow here: the loop body only
            // runs when `trailing` has at least one item.
            if i == trailing.len() - 1 {
                flags.insert(SubPatternFlags::LastInChain);
                if full_word {
                    flags.insert(SubPatternFlags::FullwordRight);
                }
            }

            if let hir::HirKind::Literal(literal) = p.hir.kind() {
                if wide {
                    prev_sub_pattern_wide = self.c_literal_chain_tail(
                        pattern_id,
                        literal,
                        prev_sub_pattern_wide,
                        p.gap.clone(),
                        flags | SubPatternFlags::Wide,
                    );
                };
                if ascii {
                    prev_sub_pattern_ascii = self.c_literal_chain_tail(
                        pattern_id,
                        literal,
                        prev_sub_pattern_ascii,
                        p.gap.clone(),
                        flags,
                    );
                }
            } else {
                // A greedy trailing piece is flagged as greedy even if the
                // leading piece was not.
                if matches!(p.hir.is_greedy(), Some(true)) {
                    flags.insert(SubPatternFlags::GreedyRegexp);
                }

                let (atoms, is_fast_regexp) =
                    self.c_regexp(&p.hir, span.clone())?;

                if is_fast_regexp {
                    flags.insert(SubPatternFlags::FastRegexp);
                }

                // As with the chain head, the wide variant goes first so
                // that the ascii variant can take ownership of `atoms`.
                if wide {
                    prev_sub_pattern_wide = self.add_sub_pattern(
                        pattern_id,
                        SubPattern::RegexpChainTail {
                            chained_to: prev_sub_pattern_wide,
                            gap: p.gap.clone(),
                            flags: flags | SubPatternFlags::Wide,
                        },
                        atoms.iter().cloned().map(|atom| atom.make_wide()),
                        SubPatternAtom::from_regexp_atom,
                    )
                }

                if ascii {
                    prev_sub_pattern_ascii = self.add_sub_pattern(
                        pattern_id,
                        SubPattern::RegexpChainTail {
                            chained_to: prev_sub_pattern_ascii,
                            gap: p.gap.clone(),
                            flags,
                        },
                        atoms.into_iter(),
                        SubPatternAtom::from_regexp_atom,
                    );
                }
            }
        }

        Ok(())
    }
2466
    /// Compiles the regular expression described by `hir`.
    ///
    /// On success returns the atoms produced by the regexp compiler,
    /// together with a flag that is `true` when the regexp was compiled by
    /// `re::fast::Compiler` and `false` when it was compiled by
    /// `re::thompson::Compiler`. Both compilers receive `&mut self.re_code`,
    /// which is presumably where the generated code is stored.
    ///
    /// # Errors
    ///
    /// * [`InvalidRegexp`] if the regexp compiler fails, or if
    ///   `hir.minimum_len()` is `Some(0)`, which means that the regexp can
    ///   match the empty string.
    /// * `errors::SlowPattern` if the extracted atoms are too short (or too
    ///   many) and `self.error_on_slow_pattern` is set. When it is not set,
    ///   a `warnings::SlowPattern` warning is added instead and the
    ///   function still succeeds.
    fn c_regexp(
        &mut self,
        hir: &re::hir::Hir,
        span: Span,
    ) -> Result<(Vec<re::RegexpAtom>, bool), CompileError> {
        // When the `fast-regexp` feature is enabled, try to compile the regexp
        // for `FastVM` first, if it fails with `Error::FastIncompatible`, the
        // regexp is not compatible for `FastVM` and `PikeVM` must be used
        // instead.
        #[cfg(feature = "fast-regexp")]
        let (result, is_fast_regexp) = match re::fast::Compiler::new()
            .compile(hir, &mut self.re_code)
        {
            Err(re::Error::FastIncompatible) => (
                re::thompson::Compiler::new().compile(hir, &mut self.re_code),
                false,
            ),
            result => (result, true),
        };

        // Without the `fast-regexp` feature the Thompson compiler is the
        // only option.
        #[cfg(not(feature = "fast-regexp"))]
        let (result, is_fast_regexp) = (
            re::thompson::Compiler::new().compile(hir, &mut self.re_code),
            false,
        );

        // Any remaining compiler error is reported as an invalid regexp,
        // using the error's own message.
        let re_atoms = result.map_err(|err| {
            InvalidRegexp::build(
                &self.report_builder,
                err.to_string(),
                self.report_builder.span_to_code_loc(span.clone()),
                None,
            )
        })?;

        // Regexps that can match the empty string are rejected.
        if matches!(hir.minimum_len(), Some(0)) {
            return Err(InvalidRegexp::build(
                &self.report_builder,
                "this regexp can match empty strings".to_string(),
                self.report_builder.span_to_code_loc(span),
                None,
            ));
        }

        // Decide whether the pattern is slow by looking at the shortest and
        // longest atom lengths. `note` is an optional extra message attached
        // to the resulting error or warning.
        let (slow_pattern, note) =
            match re_atoms.iter().map(|re_atom| re_atom.atom.len()).minmax() {
                // No atoms, slow pattern.
                MinMaxResult::NoElements => (true, None),
                // Only one atom of len 0.
                MinMaxResult::OneElement(0) => (
                    true,
                    Some(
                        "this is an exceptionally extreme case that may severely degrade scanning throughput"
                            .to_string(),
                    ),
                ),
                // Only one atom shorter than 2 bytes, slow pattern.
                MinMaxResult::OneElement(len) if len < 2 => (true, None),
                // More than one atom, at least one is shorter than 2 bytes.
                MinMaxResult::MinMax(min, _) if min < 2 => (true, None),
                // More than 2700 atoms, all with exactly 2 bytes.
                // Why 2700? The larger the number of atoms the higher the
                // odds of finding one of them in the data, which slows down
                // the scan. The regex [A-Za-z]{N,} (with N>=2) produces
                // (26+26)^2 = 2704 atoms. So, 2700 is large enough, but
                // produces a warning with the aforementioned regex.
                MinMaxResult::MinMax(2, 2) if re_atoms.len() > 2700 => {
                    (true, None)
                }
                // In all other cases the pattern is not slow.
                _ => (false, None),
            };

        // A slow pattern is either a hard error or just a warning, depending
        // on how the compiler was configured.
        if slow_pattern {
            if self.error_on_slow_pattern {
                return Err(errors::SlowPattern::build(
                    &self.report_builder,
                    self.report_builder.span_to_code_loc(span),
                    note,
                ));
            } else {
                self.warnings.add(|| {
                    warnings::SlowPattern::build(
                        &self.report_builder,
                        self.report_builder.span_to_code_loc(span),
                        note,
                    )
                });
            }
        }

        Ok((re_atoms, is_fast_regexp))
    }
2560
2561    fn c_literal_chain_head(
2562        &mut self,
2563        pattern_id: PatternId,
2564        literal: &hir::Literal,
2565        flags: SubPatternFlags,
2566    ) -> SubPatternId {
2567        let pattern_lit_id = self.intern_literal(
2568            literal.0.as_bytes(),
2569            flags.contains(SubPatternFlags::Wide),
2570        );
2571        self.add_sub_pattern(
2572            pattern_id,
2573            SubPattern::LiteralChainHead { pattern: pattern_lit_id, flags },
2574            extract_atoms(
2575                self.lit_pool.get_bytes(pattern_lit_id).unwrap(),
2576                flags,
2577            ),
2578            SubPatternAtom::from_atom,
2579        )
2580    }
2581
2582    fn c_literal_chain_tail(
2583        &mut self,
2584        pattern_id: PatternId,
2585        literal: &hir::Literal,
2586        chained_to: SubPatternId,
2587        gap: ChainedPatternGap,
2588        flags: SubPatternFlags,
2589    ) -> SubPatternId {
2590        let pattern_lit_id = self.intern_literal(
2591            literal.0.as_bytes(),
2592            flags.contains(SubPatternFlags::Wide),
2593        );
2594        self.add_sub_pattern(
2595            pattern_id,
2596            SubPattern::LiteralChainTail {
2597                pattern: pattern_lit_id,
2598                chained_to,
2599                gap,
2600                flags,
2601            },
2602            extract_atoms(
2603                self.lit_pool.get_bytes(pattern_lit_id).unwrap(),
2604                flags,
2605            ),
2606            SubPatternAtom::from_atom,
2607        )
2608    }
2609}
2610
2611impl fmt::Debug for Compiler<'_> {
2612    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
2613        write!(f, "Compiler")
2614    }
2615}
2616
2617impl Default for Compiler<'_> {
2618    fn default() -> Self {
2619        Self::new()
2620    }
2621}
2622
2623/// ID associated to each identifier in the identifiers pool.
2624#[derive(Eq, PartialEq, Hash, Debug, Copy, Clone, Serialize, Deserialize)]
2625#[serde(transparent)]
2626pub(crate) struct IdentId(u32);
2627
2628impl From<u32> for IdentId {
2629    fn from(v: u32) -> Self {
2630        Self(v)
2631    }
2632}
2633
2634impl From<IdentId> for u32 {
2635    fn from(v: IdentId) -> Self {
2636        v.0
2637    }
2638}
2639
2640/// ID associated to each literal string in the literals pool.
2641#[derive(PartialEq, Debug, Copy, Clone, Serialize, Deserialize)]
2642#[serde(transparent)]
2643pub struct LiteralId(u32);
2644
2645impl From<i32> for LiteralId {
2646    fn from(v: i32) -> Self {
2647        Self(v as u32)
2648    }
2649}
2650
2651impl From<u32> for LiteralId {
2652    fn from(v: u32) -> Self {
2653        Self(v)
2654    }
2655}
2656
2657impl From<LiteralId> for u32 {
2658    fn from(v: LiteralId) -> Self {
2659        v.0
2660    }
2661}
2662
2663impl From<LiteralId> for i64 {
2664    fn from(v: LiteralId) -> Self {
2665        v.0 as i64
2666    }
2667}
2668
2669impl From<LiteralId> for u64 {
2670    fn from(v: LiteralId) -> Self {
2671        v.0 as u64
2672    }
2673}
2674
2675/// ID associated to each namespace.
2676#[derive(Copy, Clone, Debug, Eq, PartialEq, Hash, Serialize, Deserialize)]
2677#[serde(transparent)]
2678pub(crate) struct NamespaceId(i32);
2679
2680impl From<i32> for NamespaceId {
2681    #[inline]
2682    fn from(v: i32) -> Self {
2683        Self(v)
2684    }
2685}
2686
/// Identifier assigned to each rule.
#[derive(Copy, Clone, Debug, Default, Eq, PartialEq, Hash)]
pub(crate) struct RuleId(i32);

impl RuleId {
    /// Returns the [`RuleId`] that follows this one, which is the current
    /// ID plus 1.
    #[allow(dead_code)]
    pub(crate) fn next(&self) -> Self {
        Self(self.0 + 1)
    }
}

/// Wraps a raw `i32` into a [`RuleId`].
impl From<i32> for RuleId {
    #[inline]
    fn from(raw: i32) -> Self {
        RuleId(raw)
    }
}

/// Builds a [`RuleId`] from a `usize`.
///
/// # Panics
///
/// Panics if the value doesn't fit in an `i32`.
impl From<usize> for RuleId {
    #[inline]
    fn from(raw: usize) -> Self {
        RuleId(i32::try_from(raw).unwrap())
    }
}

/// Converts a [`RuleId`] into a `usize` with an `as` cast.
impl From<RuleId> for usize {
    #[inline]
    fn from(id: RuleId) -> Self {
        let RuleId(raw) = id;
        raw as usize
    }
}

/// Extracts the raw `i32` stored in a [`RuleId`].
impl From<RuleId> for i32 {
    #[inline]
    fn from(id: RuleId) -> Self {
        let RuleId(raw) = id;
        raw
    }
}
2728
2729/// ID associated to each regexp used in a rule condition.
2730#[derive(Copy, Clone, Debug, Eq, PartialEq, Hash, Serialize, Deserialize)]
2731pub(crate) struct RegexId(i32);
2732
2733impl From<i32> for RegexId {
2734    #[inline]
2735    fn from(value: i32) -> Self {
2736        Self(value)
2737    }
2738}
2739
2740impl From<u32> for RegexId {
2741    #[inline]
2742    fn from(value: u32) -> Self {
2743        Self(value.try_into().unwrap())
2744    }
2745}
2746
2747impl From<i64> for RegexId {
2748    #[inline]
2749    fn from(value: i64) -> Self {
2750        Self(value.try_into().unwrap())
2751    }
2752}
2753
2754impl From<RegexId> for usize {
2755    #[inline]
2756    fn from(value: RegexId) -> Self {
2757        value.0 as usize
2758    }
2759}
2760
2761impl From<RegexId> for i32 {
2762    #[inline]
2763    fn from(value: RegexId) -> Self {
2764        value.0
2765    }
2766}
2767
2768impl From<RegexId> for u32 {
2769    #[inline]
2770    fn from(value: RegexId) -> Self {
2771        value.0.try_into().unwrap()
2772    }
2773}
2774
2775/// ID associated to each grouped `RegexSet`.
2776///
2777/// When compiling multiple rules, identical string expressions (such as a
2778/// specific field access like `vt.net.domain.raw`) are frequently matched
2779/// against multiple distinct regular expressions. To optimize these
2780/// evaluations, the compiler identifies identical targets, assigns them a
2781/// unique `RegexSetId`, and groups all their associated regular expressions
2782/// together. At runtime, the entire set is evaluated simultaneously in a
2783/// single pass.
2784#[derive(Copy, Clone, Debug, Eq, PartialEq, Hash, Serialize, Deserialize)]
2785pub(crate) struct RegexSetId(i32);
2786
2787impl From<i32> for RegexSetId {
2788    #[inline]
2789    fn from(value: i32) -> Self {
2790        Self(value)
2791    }
2792}
2793
2794impl From<RegexSetId> for usize {
2795    #[inline]
2796    fn from(value: RegexSetId) -> Self {
2797        value.0 as usize
2798    }
2799}
2800
2801impl From<RegexSetId> for i32 {
2802    #[inline]
2803    fn from(value: RegexSetId) -> Self {
2804        value.0
2805    }
2806}
2807
2808/// ID associated to each pattern.
2809///
2810/// For each unique pattern defined in a set of YARA rules there's a PatternId
2811/// that identifies it. If two different rules define exactly the same pattern
2812/// there's a single instance of the pattern and therefore a single PatternId
2813/// shared by both rules. For example, if one rule defines `$a = "mz"` and
2814/// another one `$mz = "mz"`, the pattern `"mz"` is shared by the two rules.
2815///
2816/// However, in order to be considered the same, the following conditions must
2817/// be met:
2818///
2819/// * Both patterns must have the same modifiers (i.e: `"mz" nocase` is not the
2820///   same pattern as `"mz"`),
2821/// * Both patterns must be either non-anchored, or anchored to the same offset.
2822/// * Both patterns must have the same file size bounds (or no bounds at all).
2823#[derive(
2824    Copy, Clone, Debug, Eq, Hash, PartialEq, PartialOrd, Serialize, Deserialize,
2825)]
2826#[serde(transparent)]
2827#[derive(Ord)]
2828pub(crate) struct PatternId(i32);
2829
2830impl PatternId {
2831    #[inline]
2832    fn incr(&mut self, amount: usize) {
2833        self.0 += amount as i32;
2834    }
2835}
2836
2837impl From<i32> for PatternId {
2838    #[inline]
2839    fn from(value: i32) -> Self {
2840        Self(value)
2841    }
2842}
2843
2844impl From<usize> for PatternId {
2845    #[inline]
2846    fn from(value: usize) -> Self {
2847        Self(value as i32)
2848    }
2849}
2850
2851impl From<PatternId> for i32 {
2852    #[inline]
2853    fn from(value: PatternId) -> Self {
2854        value.0
2855    }
2856}
2857
2858impl From<PatternId> for i64 {
2859    #[inline]
2860    fn from(value: PatternId) -> Self {
2861        value.0 as i64
2862    }
2863}
2864
2865impl From<PatternId> for usize {
2866    #[inline]
2867    fn from(value: PatternId) -> Self {
2868        value.0 as usize
2869    }
2870}
2871
/// ID associated to each sub-pattern.
///
/// For each pattern there are one or more sub-patterns, depending on the
/// pattern and its modifiers. For example the pattern `"foo" ascii wide` may
/// have one sub-pattern for the ascii case and another one for the wide
/// case.
#[derive(Copy, Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
#[serde(transparent)]
pub(crate) struct SubPatternId(u32);
2880
2881/// Iterator that yields the names of the modules imported by the rules.
2882pub struct Imports<'a> {
2883    iter: std::slice::Iter<'a, IdentId>,
2884    ident_pool: &'a StringPool<IdentId>,
2885}
2886
2887impl<'a> Iterator for Imports<'a> {
2888    type Item = &'a str;
2889
2890    fn next(&mut self) -> Option<Self::Item> {
2891        self.iter.next().map(|id| self.ident_pool.get(*id).unwrap())
2892    }
2893}
2894
bitflags! {
    /// Flags associated to some kinds of [`SubPattern`].
    #[derive(Debug, Clone, Copy, Hash, Serialize, Deserialize, PartialEq, Eq)]
    pub struct SubPatternFlags: u16  {
        const Wide                 = 0x01;
        const Nocase               = 0x02;
        // Indicates that the pattern is the last one in a chain. Applies
        // only to chained sub-patterns.
        const LastInChain          = 0x04;
        const FullwordLeft         = 0x08;
        const FullwordRight        = 0x10;
        // Indicates that the pattern is a greedy regexp. Applies only to
        // regexp sub-patterns, or to any sub-pattern that is part of a chain
        // that corresponds to a greedy regexp.
        const GreedyRegexp         = 0x20;
        // Indicates that the pattern is a fast regexp. A fast regexp is one
        // that can be matched by the FastVM.
        const FastRegexp           = 0x40;
    }
}
2915
/// A sub-pattern in the compiled rules.
///
/// Each pattern in a rule has one or more associated sub-patterns. For
/// example, the pattern `$a = "foo" ascii wide` has a sub-pattern for the
/// ASCII variant of "foo", and another one for the wide variant.
///
/// Also, each [`Atom`] is associated to a [`SubPattern`]. When the atom is
/// found in the scanned data by the Aho-Corasick algorithm, the scanner
/// verifies that the sub-pattern actually matches.
#[derive(Serialize, Deserialize)]
pub(crate) enum SubPattern {
    /// A literal that is not part of a chain. `pattern` identifies the
    /// literal in the literals pool.
    Literal {
        pattern: LiteralId,
        // NOTE(review): presumably the offset at which the pattern is
        // anchored, if any — confirm against the callers.
        anchored_at: Option<usize>,
        flags: SubPatternFlags,
    },

    /// A literal that is the first piece of a chain.
    LiteralChainHead {
        pattern: LiteralId,
        flags: SubPatternFlags,
    },

    /// A literal that follows another piece in a chain. `chained_to` is the
    /// sub-pattern this one is chained to, and `gap` describes the gap
    /// between both.
    LiteralChainTail {
        pattern: LiteralId,
        chained_to: SubPatternId,
        gap: ChainedPatternGap,
        flags: SubPatternFlags,
    },

    /// A regexp that is not part of a chain.
    Regexp {
        flags: SubPatternFlags,
    },

    /// A regexp that is the first piece of a chain.
    RegexpChainHead {
        flags: SubPatternFlags,
    },

    /// A regexp that follows another piece in a chain. `chained_to` is the
    /// sub-pattern this one is chained to, and `gap` describes the gap
    /// between both.
    RegexpChainTail {
        chained_to: SubPatternId,
        gap: ChainedPatternGap,
        flags: SubPatternFlags,
    },

    // NOTE(review): the variants below presumably correspond to the `xor`,
    // `base64` and `base64wide` pattern modifiers (with or without a custom
    // alphabet) — confirm against the code that creates them.
    Xor {
        pattern: LiteralId,
        flags: SubPatternFlags,
    },

    Base64 {
        pattern: LiteralId,
        padding: u8,
    },

    Base64Wide {
        pattern: LiteralId,
        padding: u8,
    },

    CustomBase64 {
        pattern: LiteralId,
        alphabet: LiteralId,
        padding: u8,
    },

    CustomBase64Wide {
        pattern: LiteralId,
        alphabet: LiteralId,
        padding: u8,
    },
}
2986
2987impl SubPattern {
2988    /// If this sub-pattern is chained to another one, returns the
2989    /// [`SubPatternId`] associated to this other pattern.
2990    pub fn chained_to(&self) -> Option<SubPatternId> {
2991        match self {
2992            SubPattern::LiteralChainTail { chained_to, .. }
2993            | SubPattern::RegexpChainTail { chained_to, .. } => {
2994                Some(*chained_to)
2995            }
2996            _ => None,
2997        }
2998    }
2999}
3000
/// A snapshot that represents the state of the compiler at a particular moment.
///
/// NOTE(review): apart from `next_pattern_id`, every field looks like the
/// length of one of the compiler's internal collections at the time the
/// snapshot was taken — confirm against the code that creates and restores
/// snapshots.
#[derive(Debug, PartialEq, Eq)]
struct Snapshot {
    next_pattern_id: PatternId,
    rules_len: usize,
    atoms_len: usize,
    re_code_len: usize,
    sub_patterns_len: usize,
    symbol_table_len: usize,
    fast_scan_patterns_len: usize,
}
3012
/// Represents a list of warnings.
///
/// This is a wrapper around a `Vec<Warning>` that contains additional logic
/// for limiting the number of warnings stored in the vector and silencing
/// some warning types.
#[derive(Default)]
pub(crate) struct Warnings {
    /// Warnings accumulated so far.
    warnings: Vec<Warning>,
    /// Maximum number of warnings that will be stored in `warnings`. If this
    /// is `None`, there is no limit.
    max_warnings: Option<usize>,
    /// Warnings that are globally disabled.
    disabled_warnings: HashSet<String>,
    /// Warnings that are suppressed for a specific code span. Keys are
    /// warning identifiers, and values are the code spans in which the
    /// warning is disabled.
    suppressed_warnings: HashMap<String, Vec<Span>>,
}
3031
3032impl Warnings {
3033    /// Adds the warning returned by `f` to the list.
3034    ///
3035    /// If the maximum number of warnings has been reached the warning is not
3036    /// added.
3037    #[inline]
3038    pub fn add(&mut self, f: impl FnOnce() -> Warning) {
3039        if self.warnings.len() < self.max_warnings.unwrap_or(usize::MAX) {
3040            let warning = f();
3041            let mut warn = !self.disabled_warnings.contains(warning.code());
3042
3043            if warn
3044                && let Some(spans) =
3045                    self.suppressed_warnings.get(warning.code())
3046            {
3047                'l: for disabled_span in spans {
3048                    for label in warning.labels() {
3049                        if disabled_span.contains(label.span()) {
3050                            warn = false;
3051                            break 'l;
3052                        }
3053                    }
3054                }
3055            }
3056
3057            if warn {
3058                self.warnings.push(warning);
3059            }
3060        }
3061    }
3062
3063    /// Returns true if the given code is a valid warning code.
3064    pub fn is_valid_code(code: &str) -> bool {
3065        Warning::all_codes().contains(&code)
3066    }
3067
3068    /// Enables or disables a specific warning identified by `code`.
3069    ///
3070    /// Returns `true` if the warning was previously enabled, or `false` if
3071    /// otherwise. Returns an error if the code doesn't correspond to any
3072    /// of the existing warnings.
3073    #[inline]
3074    pub fn switch_warning(
3075        &mut self,
3076        code: &str,
3077        enabled: bool,
3078    ) -> Result<bool, InvalidWarningCode> {
3079        if !Self::is_valid_code(code) {
3080            return Err(InvalidWarningCode::new(code.to_string()));
3081        }
3082        if enabled {
3083            Ok(!self.disabled_warnings.remove(code))
3084        } else {
3085            Ok(self.disabled_warnings.insert(code.to_string()))
3086        }
3087    }
3088
3089    /// Enable or disables all warnings.
3090    pub fn switch_all_warnings(&mut self, enabled: bool) {
3091        if enabled {
3092            self.disabled_warnings.clear();
3093        } else {
3094            for c in Warning::all_codes() {
3095                self.disabled_warnings.insert(c.to_string());
3096            }
3097        }
3098    }
3099
3100    /// Clear suppressed warnings.
3101    pub fn clear_suppressed(&mut self) {
3102        self.suppressed_warnings.clear();
3103    }
3104
3105    /// Suppress the warning with the given code, for the given span.
3106    pub fn suppress(&mut self, code: &str, span: Span) {
3107        self.suppressed_warnings
3108            .entry(code.to_string())
3109            .or_default()
3110            .push(span);
3111    }
3112
3113    #[inline]
3114    pub fn as_slice(&self) -> &[Warning] {
3115        self.warnings.as_slice()
3116    }
3117}
3118
3119impl From<Warnings> for Vec<Warning> {
3120    fn from(value: Warnings) -> Self {
3121        value.warnings
3122    }
3123}
yara_x/compiler/mod.rs

yara_x/compiler/
mod.rs