yara_x/compiler/mod.rs
1/*! Compiles YARA source code into binary form.
2
3YARA rules must be compiled before they can be used for scanning data. This
4module implements the YARA compiler.
5*/
6
7use std::cell::RefCell;
8use std::collections::hash_map::Entry;
9use std::collections::{HashMap, HashSet};
10use std::io::Write;
11use std::path::{Path, PathBuf};
12use std::rc::Rc;
13#[cfg(feature = "logging")]
14use std::time::Instant;
15use std::{env, fmt, fs, io, iter};
16
17use bitflags::bitflags;
18use bstr::{BStr, ByteSlice};
19use itertools::{izip, Itertools, MinMaxResult};
20#[cfg(feature = "logging")]
21use log::*;
22use regex_syntax::hir;
23use rustc_hash::{FxHashMap, FxHashSet};
24use serde::{Deserialize, Serialize};
25use walrus::FunctionId;
26
27use yara_x_parser::ast;
28use yara_x_parser::ast::{Ident, Import, Include, RuleFlags, WithSpan, AST};
29use yara_x_parser::cst::CSTStream;
30use yara_x_parser::{Parser, Span};
31
32use crate::compiler::base64::base64_patterns;
33use crate::compiler::emit::{emit_rule_condition, EmitContext};
34use crate::compiler::errors::{
35 CompileError, ConflictingRuleIdentifier, CustomError, DuplicateRule,
36 DuplicateTag, EmitWasmError, InvalidRegexp, InvalidUTF8, UnknownModule,
37 UnusedPattern,
38};
39use crate::compiler::report::ReportBuilder;
40use crate::compiler::{CompileContext, VarStack};
41use crate::modules::BUILTIN_MODULES;
42use crate::re::hir::{ChainedPattern, ChainedPatternGap};
43use crate::string_pool::{BStringPool, StringPool};
44use crate::symbols::{StackedSymbolTable, Symbol, SymbolLookup, SymbolTable};
45use crate::types::{Func, Struct, TypeValue};
46use crate::utils::cast;
47use crate::variables::{is_valid_identifier, Variable, VariableError};
48use crate::wasm::builder::WasmModuleBuilder;
49use crate::wasm::{wasm_exports, WasmExport, WasmSymbols};
50use crate::{re, wasm};
51
52pub(crate) use crate::compiler::atoms::*;
53pub(crate) use crate::compiler::context::*;
54pub(crate) use crate::compiler::ir::*;
55
56use crate::compiler::wsh::WarningSuppressionHook;
57use crate::errors::{
58 CircularIncludes, IncludeError, IncludeNotAllowed, IncludeNotFound,
59 InvalidWarningCode,
60};
61use crate::linters::LinterResult;
62use crate::models::PatternKind;
63
64#[doc(inline)]
65pub use crate::compiler::report::Patch;
66#[doc(inline)]
67pub use crate::compiler::rules::*;
68#[doc(inline)]
69pub use crate::compiler::warnings::*;
70
71mod atoms;
72mod context;
73mod emit;
74mod ir;
75mod report;
76mod rules;
77
78#[cfg(test)]
79mod tests;
80
81pub mod base64;
82pub mod errors;
83pub mod linters;
84pub mod warnings;
85pub mod wsh;
86
87/// A structure that describes some YARA source code.
88///
89/// This structure contains a `&str` pointing to the code itself, and an
90/// optional `origin` that tells where the source code came from. The
91/// most common use for `origin` is indicating the path of the file from
92/// where the source code was obtained, but it can contain any arbitrary
93/// string. This string, if provided, will appear in error messages. For
94/// example, in this error message `origin` was set to `some_file.yar`:
95///
96/// ```text
97/// error: syntax error
98/// --> some_file.yar:4:17
99/// |
100/// 4 | ... more details
101/// ```
102///
103/// # Example
104///
105/// ```
106/// use yara_x::SourceCode;
107/// let src = SourceCode::from("rule test { condition: true }").with_origin("some_file.yar");
108/// ```
109///
110#[derive(Debug, Clone)]
111pub struct SourceCode<'src> {
112 /// A reference to the source code itself. This is a BStr because the
113 /// source code could contain non-UTF8 content.
114 pub(crate) raw: &'src BStr,
115 /// A reference to the source code after validating that it is valid
116 /// UTF-8.
117 pub(crate) valid: Option<&'src str>,
118 /// An optional string that tells which is the origin of the code. Usually
119 /// a file path.
120 pub(crate) origin: Option<String>,
121}
122
123impl<'src> SourceCode<'src> {
124 /// Sets a string that describes the origin of the source code.
125 ///
126 /// This is usually the path of the file that contained the source code,
127 /// but it can be an arbitrary string. The origin appears in error and
128 /// warning messages.
129 pub fn with_origin<S: Into<String>>(self, origin: S) -> Self {
130 Self { raw: self.raw, valid: self.valid, origin: Some(origin.into()) }
131 }
132
133 /// Returns the source code as a `&str`.
134 ///
135 /// If the source code is not valid UTF-8 it will return an error.
136 fn as_str(&mut self) -> Result<&'src str, bstr::Utf8Error> {
137 match self.valid {
138 // We already know that source code is valid UTF-8, return it
139 // as is.
140 Some(s) => Ok(s),
141 // We don't know yet if the source code is valid UTF-8, some
142 // validation must be done. If validation fails an error is
143 // returned.
144 None => {
145 let src = self.raw.to_str()?;
146 self.valid = Some(src);
147 Ok(src)
148 }
149 }
150 }
151}
152
153impl<'src> From<&'src str> for SourceCode<'src> {
154 /// Creates a new [`SourceCode`] from a `&str`.
155 fn from(src: &'src str) -> Self {
156 // The input is a &str, therefore it's guaranteed to be valid UTF-8
157 // and the `valid` field can be initialized.
158 Self { raw: BStr::new(src), valid: Some(src), origin: None }
159 }
160}
161
162impl<'src> From<&'src [u8]> for SourceCode<'src> {
163 /// Creates a new [`SourceCode`] from a `&[u8]`.
164 ///
165 /// As `src` is not guaranteed to be a valid UTF-8 string, the parser will
166 /// verify it and return an error if invalid UTF-8 characters are found.
167 fn from(src: &'src [u8]) -> Self {
168 // The input is a &[u8], its content is not guaranteed to be valid
169 // UTF-8 so the `valid` field is set to `None`. The `validate_utf8`
170 // function will be called for validating the source code before
171 // being parsed.
172 Self { raw: BStr::new(src), valid: None, origin: None }
173 }
174}
175
176/// Compiles a YARA source code.
177///
178/// This function receives any type that implements the `Into<SourceCode>` trait,
179/// which includes `&str`, `String` and [`SourceCode`] and produces compiled
180/// [`Rules`] that can be passed later to the scanner.
181///
182/// # Example
183///
184/// ```rust
185/// # use yara_x;
186/// let rules = yara_x::compile("rule test { condition: true }").unwrap();
187/// let mut scanner = yara_x::Scanner::new(&rules);
188/// let results = scanner.scan("Lorem ipsum".as_bytes()).unwrap();
189/// assert_eq!(results.matching_rules().len(), 1);
190/// ```
191pub fn compile<'src, S>(src: S) -> Result<Rules, CompileError>
192where
193 S: Into<SourceCode<'src>>,
194{
195 let mut compiler = Compiler::new();
196 compiler.add_source(src)?;
197 Ok(compiler.build())
198}
199
200/// Structure that contains information about a rule namespace.
201///
202/// Includes NamespaceId, the IdentId corresponding to the namespace's
203/// identifier, and the symbol table that contains the symbols defined
204/// in the namespace.
205struct Namespace {
206 id: NamespaceId,
207 ident_id: IdentId,
208 symbols: Rc<RefCell<SymbolTable>>,
209}
210
211/// Compiles YARA source code producing a set of compiled [`Rules`].
212///
213/// The two most important methods in this type are [`Compiler::add_source`]
214/// and [`Compiler::build`]. The former tells the compiler which YARA source
215/// code must be compiled, and can be called multiple times with different
216/// set of rules. The latter consumes the compiler and produces a set of
217/// compiled [`Rules`].
218///
219/// # Example
220///
221/// ```rust
222/// # use yara_x;
223/// let mut compiler = yara_x::Compiler::new();
224///
225/// compiler
226/// .add_source(r#"
227/// rule always_true {
228/// condition: true
229/// }"#)?
230/// .add_source(r#"
231/// rule always_false {
232/// condition: false
233/// }"#)?;///
234///
235/// let rules = compiler.build();
236///
237/// # Ok::<(), Box<dyn std::error::Error>>(())
238/// ```
239///
240pub struct Compiler<'a> {
241 /// Mimics YARA behaviour with respect to regular expressions, allowing
242 /// some constructs that are invalid in YARA-X by default, like invalid
243 /// escape sequences.
244 relaxed_re_syntax: bool,
245
246 /// If true, the compiler hoists loop-invariant expressions (i.e: those
247 /// that don't vary on each iteration of the loop), moving them outside
248 /// the loop.
249 hoisting: bool,
250
251 /// List of directories where the compiler should look for included files.
252 /// If `None`, the current directory is used.
253 include_dirs: Option<Vec<PathBuf>>,
254
255 /// If true, slow patterns produce an error instead of a warning. A slow
256 /// pattern is one with atoms shorter than 2 bytes.
257 error_on_slow_pattern: bool,
258
259 /// If true, a slow loop produces an error instead of a warning. A slow
260 /// rule is one where the upper bound of the loop is potentially large.
261 /// Like for example: `for all x in (0..filesize) : (...)`
262 error_on_slow_loop: bool,
263
264 /// If true, include statements are allowed. If false, include statements
265 /// will produce a compile error.
266 includes_enabled: bool,
267
268 /// Tracks the paths of the files that have been included by nested
269 /// includes. This is useful for detecting circular includes and resolving
270 /// relative includes.
271 include_stack: Vec<PathBuf>,
272
273 /// Used for generating error and warning reports.
274 report_builder: ReportBuilder,
275
276 /// The main symbol table used by the compiler. This is actually a stack of
277 /// symbol tables where the bottom-most table is the one that contains
278 /// global identifiers like built-in functions and user-defined global
279 /// identifiers.
280 symbol_table: StackedSymbolTable,
281
282 /// Symbol table that contains the global identifiers, including built-in
283 /// functions like `uint8`, `uint16`, etc. This symbol table is at the
284 /// bottom of the `symbol_table`'s stack. This field is used when we
285 /// need to access the global symbol table directly, for example for
286 /// defining new global variables.
287 global_symbols: Rc<RefCell<SymbolTable>>,
288
289 /// Information about the current namespace (i.e: the namespace that will
290 /// contain any new rules added via a call to `add_sources`.
291 current_namespace: Namespace,
292
293 /// Pool that contains all the identifiers used in the rules. Each
294 /// identifier appears only once, even if they are used by multiple
295 /// rules. For example, the pool contains a single copy of the common
296 /// identifier `$a`. Each identifier have a unique 32-bits [`IdentId`]
297 /// that can be used for retrieving the identifier from the pool.
298 ident_pool: StringPool<IdentId>,
299
300 /// Similar to `ident_pool` but for regular expressions found in rule
301 /// conditions.
302 regexp_pool: StringPool<RegexpId>,
303
304 /// Similar to `ident_pool` but for string literals found in the source
305 /// code. As literal strings in YARA can contain arbitrary bytes, a pool
306 /// capable of storing [`bstr::BString`] must be used, the [`String`] type
307 /// only accepts valid UTF-8. This pool also stores the atoms extracted
308 /// from patterns.
309 lit_pool: BStringPool<LiteralId>,
310
311 /// Intermediate representation (IR) tree for condition of the rule that
312 /// is currently being compiled. After compiling each rule the tree is
313 /// cleared, but it will be reused for the next rule.
314 ir: IR,
315
316 /// Builder for creating the WebAssembly module that contains the code
317 /// for all rule conditions.
318 wasm_mod: WasmModuleBuilder,
319
320 /// Struct that contains the IDs for WASM memories, global and local
321 /// variables, etc.
322 wasm_symbols: WasmSymbols,
323
324 /// Map that contains the functions that are callable from WASM code. These
325 /// are the same functions in [`static@WASM_EXPORTS`]. This map allows to
326 /// retrieve the WASM [`FunctionId`] from the fully qualified mangled
327 /// function name (e.g: `my_module.my_struct.my_func@ii@i`)
328 wasm_exports: FxHashMap<String, FunctionId>,
329
330 /// Map that associates a `PatternId` to a certain filesize bound.
331 ///
332 /// A condition like `filesize < 1000 and $a` only matches if `filesize`
333 /// is less than 1000. Therefore, the pattern `$a` does not need be
334 /// checked for files of size 1000 bytes or larger.
335 ///
336 /// In this case, the map will contain an entry associating `$a` to a
337 /// `FilesizeBounds` value like:
338 /// `FilesizeBounds{start: Bound::Unbounded, end: Bound:Excluded(1000)}`.
339 filesize_bounds: FxHashMap<PatternId, FilesizeBounds>,
340
341 /// A vector with all the rules that has been compiled. A [`RuleId`] is
342 /// an index in this vector.
343 rules: Vec<RuleInfo>,
344
345 /// Next (not used yet) [`PatternId`].
346 next_pattern_id: PatternId,
347
348 /// Map used for de-duplicating pattern. Keys are the pattern's IR and
349 /// values are the `PatternId` assigned to each pattern. Every time a rule
350 /// declares a pattern, this map is used for determining if the same
351 /// pattern (i.e: a pattern with exactly the same IR) was already declared
352 /// by some other rule. If that's the case, that same pattern is re-used.
353 patterns: FxHashMap<Pattern, PatternId>,
354
355 /// A vector with all the sub-patterns from all the rules. A
356 /// [`SubPatternId`] is an index in this vector.
357 sub_patterns: Vec<(PatternId, SubPattern)>,
358
359 /// Vector that contains the [`SubPatternId`] for sub-patterns that can
360 /// match only at a fixed offset within the scanned data. These sub-patterns
361 /// are not added to the Aho-Corasick automaton.
362 anchored_sub_patterns: Vec<SubPatternId>,
363
364 /// A vector that contains all the atoms generated from the patterns.
365 /// Each atom has an associated [`SubPatternId`] that indicates the
366 /// sub-pattern it belongs to.
367 atoms: Vec<SubPatternAtom>,
368
369 /// A vector that contains the code for all regexp patterns (this includes
370 /// hex patterns which are just a special case of regexp). The code for
371 /// each regexp is appended to the vector, during the compilation process
372 /// and the atoms extracted from the regexp contain offsets within this
373 /// vector. This vector contains both forward and backward code.
374 re_code: Vec<u8>,
375
376 /// Vector with the names of all the imported modules. The vector contains
377 /// the [`IdentId`] corresponding to the module's identifier.
378 imported_modules: Vec<IdentId>,
379
380 /// Names of modules that are known, but not supported. When an `import`
381 /// statement with one of these modules is found, the statement is accepted
382 /// without causing an error, but a warning is raised to let the user know
383 /// that the module is not supported. Any rule that depends on an unsupported
384 /// module is ignored.
385 ignored_modules: FxHashSet<String>,
386
387 /// Keys in this map are the modules that are banned, and values are a pair
388 /// of strings with the title and message for the error that will be shown
389 /// if the banned module is imported.
390 banned_modules: FxHashMap<String, (String, String)>,
391
392 /// Keys in this map are the name of rules that will be ignored because they
393 /// depend on unsupported modules, either directly or indirectly. Values are
394 /// the names of the unsupported modules they depend on.
395 ignored_rules: FxHashMap<String, String>,
396
397 /// Structure where each field corresponds to a global identifier or a module
398 /// imported by the rules. For fields corresponding to modules, the value is
399 /// the structure that describes the module.
400 root_struct: Struct,
401
402 /// Warnings generated while compiling the rules.
403 warnings: Warnings,
404
405 /// Errors generated while compiling the rules.
406 errors: Vec<CompileError>,
407
408 /// Features enabled for this compiler. See [`Compiler::enable_feature`]
409 /// for details.
410 features: FxHashSet<String>,
411
412 /// Optional writer where the compiler writes the IR produced by each rule.
413 /// This is used for test cases and debugging.
414 ir_writer: Option<Box<dyn Write>>,
415
416 /// Linters applied to each rule during compilation. The linters are added
417 /// to the compiler using [`Compiler::add_linter`]:
418 linters: Vec<Box<dyn linters::Linter + 'a>>,
419}
420
421impl<'a> Compiler<'a> {
422 /// Creates a new YARA compiler.
423 pub fn new() -> Self {
424 let mut ident_pool = StringPool::new();
425 let mut symbol_table = StackedSymbolTable::new();
426
427 let global_symbols = symbol_table.push_new();
428
429 // Add symbols for built-in functions like uint8, uint16, etc.
430 for export in wasm_exports()
431 // Get only the public exports not belonging to a YARA module.
432 .filter(|e| e.public && e.builtin())
433 {
434 let func = Rc::new(Func::from(export.mangled_name));
435 let symbol = Symbol::Func(func);
436
437 global_symbols.borrow_mut().insert(export.name, symbol);
438 }
439
440 // Create the default namespace. Rule identifiers will be added to this
441 // namespace, unless the user defines some namespace explicitly by calling
442 // `Compiler::new_namespace`.
443 let default_namespace = Namespace {
444 id: NamespaceId(0),
445 ident_id: ident_pool.get_or_intern("default"),
446 symbols: symbol_table.push_new(),
447 };
448
449 // At this point the symbol table (which is a stacked symbol table) has
450 // two layers, the global symbols at the bottom, and the default
451 // namespace on top of it. Calls to `Compiler::new_namespace` replace
452 // the top layer (default namespace) with a new one, but the bottom
453 // layer remains, so the global symbols are shared by all namespaces.
454
455 // Create a WASM module builder. This object is used for building the
456 // WASM module that will execute the rule conditions.
457 let mut wasm_mod = WasmModuleBuilder::new();
458
459 wasm_mod.namespaces_per_func(20);
460 wasm_mod.rules_per_func(10);
461
462 let wasm_symbols = wasm_mod.wasm_symbols();
463 let wasm_exports = wasm_mod.wasm_exports();
464
465 let mut ir = IR::new();
466
467 if cfg!(feature = "constant-folding") {
468 ir.constant_folding(true);
469 }
470
471 Self {
472 ir,
473 ident_pool,
474 global_symbols,
475 symbol_table,
476 wasm_mod,
477 wasm_symbols,
478 wasm_exports,
479 relaxed_re_syntax: false,
480 hoisting: false,
481 error_on_slow_pattern: false,
482 error_on_slow_loop: false,
483 next_pattern_id: PatternId(0),
484 current_namespace: default_namespace,
485 features: FxHashSet::default(),
486 warnings: Warnings::default(),
487 errors: Vec::new(),
488 rules: Vec::new(),
489 sub_patterns: Vec::new(),
490 anchored_sub_patterns: Vec::new(),
491 atoms: Vec::new(),
492 re_code: Vec::new(),
493 imported_modules: Vec::new(),
494 ignored_modules: FxHashSet::default(),
495 banned_modules: FxHashMap::default(),
496 ignored_rules: FxHashMap::default(),
497 filesize_bounds: FxHashMap::default(),
498 root_struct: Struct::new().make_root(),
499 report_builder: ReportBuilder::new(),
500 lit_pool: BStringPool::new(),
501 regexp_pool: StringPool::new(),
502 patterns: FxHashMap::default(),
503 ir_writer: None,
504 linters: Vec::new(),
505 include_dirs: None,
506 includes_enabled: true,
507 include_stack: Vec::new(),
508 }
509 }
510
511 /// Adds a directory to the list of directories where the compiler should
512 /// look for included files.
513 ///
514 /// When an `include` statement is found, the compiler looks for the included
515 /// file in the directories added with this function, in the order they were
516 /// added.
517 ///
518 /// If this function is not called, the compiler will only look for included
519 /// files in the current directory.
520 ///
521 /// Use [Compiler::enable_includes] for controlling whether include statements
522 /// are allowed or not.
523 ///
524 /// # Example
525 ///
526 /// ```no_run
527 /// # use yara_x::Compiler;
528 /// # use std::path::Path;
529 /// let mut compiler = Compiler::new();
530 /// compiler.add_include_dir("/path/to/rules")
531 /// .add_include_dir("/another/path");
532 /// ```
533 pub fn add_include_dir<P: AsRef<std::path::Path>>(
534 &mut self,
535 dir: P,
536 ) -> &mut Self {
537 self.include_dirs
538 .get_or_insert_default()
539 .push(dir.as_ref().to_path_buf());
540 self
541 }
542
543 /// Adds some YARA source code to be compiled.
544 ///
545 /// The `src` parameter accepts any type that implements [`Into<SourceCode>`],
546 /// such as `&str`, `&[u8]`, or an instance of [`SourceCode`] itself. The source
547 /// code may include one or more YARA rules.
548 ///
549 /// You can call this function multiple times to add different sets of rules.
550 /// If the provided source code contains syntax or semantic errors that prevent
551 /// compilation, the function returns the first encountered error. All errors
552 /// found during compilation are also recorded and can be retrieved using
553 /// [`Compiler::errors`].
554 ///
555 /// Even if previous calls to this function resulted in compilation errors,
556 /// you may continue adding additional rules. Only successfully compiled rules
557 /// will be included in the final rule set.
558 pub fn add_source<'src, S>(
559 &mut self,
560 src: S,
561 ) -> Result<&mut Self, CompileError>
562 where
563 S: Into<SourceCode<'src>>,
564 {
565 // Convert `src` into an instance of `SourceCode` if it is something
566 // else, like a &str.
567 let mut src = src.into();
568
569 // Register source code, even before validating that it is UTF-8. In
570 // case of UTF-8 encoding errors we want to report that error too,
571 // and we need the source code registered for creating the report.
572 self.report_builder.register_source(&src);
573
574 // Make sure that the source code is valid UTF-8, or return an error
575 // if otherwise.
576 let ast = match src.as_str() {
577 Ok(src) => {
578 // Parse the source code and build the Abstract Syntax Tree.
579 let cst = Parser::new(src.as_bytes());
580 let cst =
581 WarningSuppressionHook::from(cst).hook(|warning, span| {
582 self.warnings.suppress(warning, span);
583 });
584
585 AST::from(CSTStream::new(src.as_bytes(), cst))
586 }
587 Err(err) => {
588 let span_start = err.valid_up_to();
589 let span_end = if let Some(error_len) = err.error_len() {
590 // `error_len` is the number of invalid UTF-8 bytes found
591 // after `span_start`. Round the number up to the next 3
592 // bytes boundary because invalid bytes are replaced with
593 // the Unicode replacement characters that takes 3 bytes.
594 // This way the span ends at a valid UTF-8 character
595 // boundary.
596 span_start + error_len.next_multiple_of(3)
597 } else {
598 span_start
599 };
600
601 let err = InvalidUTF8::build(
602 &self.report_builder,
603 self.report_builder.span_to_code_loc(Span(
604 span_start as u32..span_end as u32,
605 )),
606 );
607
608 self.errors.push(err.clone());
609 return Err(err);
610 }
611 };
612
613 // Store the current length of the `errors` vector, so that we can
614 // know if more errors were added.
615 let existing_errors = self.errors.len();
616
617 self.c_items(ast.items());
618
619 self.warnings.clear_suppressed();
620
621 self.errors.extend(
622 ast.into_errors()
623 .into_iter()
624 .map(|err| CompileError::from(&self.report_builder, err)),
625 );
626
627 // More errors were added? Return the first error that was added.
628 if self.errors.len() > existing_errors {
629 return Err(self.errors[existing_errors].clone());
630 }
631
632 Ok(self)
633 }
634
635 /// Defines a global variable and sets its initial value.
636 ///
637 /// Global variables must be defined before adding any YARA source code
638 /// that references them via [`Compiler::add_source`]. Once defined, the
639 /// variable's initial value is preserved in the compiled [`Rules`] and
640 /// will be used unless overridden.
641 ///
642 /// When scanning, each scanner instance can modify the initial value of
643 /// the variable using [`crate::Scanner::set_global`].
644 ///
645 /// `T` can be any type that implements [`TryInto<Variable>`], including:
646 /// `i64`, `i32`, `i16`, `i8`, `u32`, `u16`, `u8`, `f64`, `f32`, `bool`,
647 /// `&str`, `String` and [`serde_json::Value`].
648 ///
649 /// ```
650 /// # use yara_x::Compiler;
651 /// assert!(Compiler::new()
652 /// .define_global("some_int", 1)?
653 /// .add_source("rule some_int_not_zero {condition: some_int != 0}")
654 /// .is_ok());
655 ///
656 /// # Ok::<(), Box<dyn std::error::Error>>(())
657 /// ```
658 pub fn define_global<T: TryInto<Variable>>(
659 &mut self,
660 ident: &str,
661 value: T,
662 ) -> Result<&mut Self, VariableError>
663 where
664 VariableError: From<<T as TryInto<Variable>>::Error>,
665 {
666 if !is_valid_identifier(ident) {
667 return Err(VariableError::InvalidIdentifier(ident.to_string()));
668 }
669
670 let var: Variable = value.try_into()?;
671 let type_value: TypeValue = var.into();
672
673 if self.root_struct.add_field(ident, type_value).is_some() {
674 return Err(VariableError::AlreadyExists(ident.to_string()));
675 }
676
677 self.global_symbols
678 .borrow_mut()
679 .insert(ident, self.root_struct.lookup(ident).unwrap());
680
681 Ok(self)
682 }
683
684 /// Creates a new namespace.
685 ///
686 /// Further calls to [`Compiler::add_source`] will put the rules under the
687 /// newly created namespace. If the new namespace is named as the current
688 /// one, no new namespace is created.
689 ///
690 /// In the example below both rules `foo` and `bar` are put into the same
691 /// namespace (the default namespace), therefore `bar` can use `foo` as
692 /// part of its condition, and everything is ok.
693 ///
694 /// ```
695 /// # use yara_x::Compiler;
696 /// assert!(Compiler::new()
697 /// .add_source("rule foo {condition: true}")?
698 /// .add_source("rule bar {condition: foo}")
699 /// .is_ok());
700 ///
701 /// # Ok::<(), Box<dyn std::error::Error>>(())
702 /// ```
703 ///
704 /// In this other example the rule `foo` is put in the default namespace,
705 /// but the rule `bar` is put under the `bar` namespace. This implies that
706 /// `foo` is not visible to `bar`, and the second call to `add_source`
707 /// fails.
708 ///
709 /// ```
710 /// # use yara_x::Compiler;
711 /// assert!(Compiler::new()
712 /// .add_source("rule foo {condition: true}")?
713 /// .new_namespace("bar")
714 /// .add_source("rule bar {condition: foo}")
715 /// .is_err());
716 ///
717 /// # Ok::<(), Box<dyn std::error::Error>>(())
718 /// ```
719 pub fn new_namespace(&mut self, namespace: &str) -> &mut Self {
720 let current_namespace = self
721 .ident_pool
722 .get(self.current_namespace.ident_id)
723 .expect("expecting a namespace");
724 // If the current namespace is already named as the new namespace
725 // this function has no effect.
726 if namespace == current_namespace {
727 return self;
728 }
729 // Remove the symbol table corresponding to the current namespace.
730 self.symbol_table.pop().expect("expecting a namespace");
731 // Create a new namespace. The NamespaceId is simply the ID of the
732 // previous namespace + 1.
733 self.current_namespace = Namespace {
734 id: NamespaceId(self.current_namespace.id.0 + 1),
735 ident_id: self.ident_pool.get_or_intern(namespace),
736 symbols: self.symbol_table.push_new(),
737 };
738 self.ignored_rules.clear();
739 self.wasm_mod.new_namespace();
740 self
741 }
742
743 /// Builds the source code previously added to the compiler.
744 ///
745 /// This function consumes the compiler and returns an instance of
746 /// [`Rules`].
747 pub fn build(self) -> Rules {
748 // Finish building the WASM module.
749 let wasm_mod = self.wasm_mod.build().emit_wasm();
750
751 #[cfg(feature = "logging")]
752 let start = Instant::now();
753
754 // Compile the WASM module for the current platform. This panics
755 // if the WASM code is invalid, which should not happen as the code is
756 // emitted by YARA itself. If this ever happens is probably because
757 // wrong WASM code is being emitted.
758 let compiled_wasm_mod = wasmtime::Module::from_binary(
759 wasm::get_engine(),
760 wasm_mod.as_slice(),
761 )
762 .expect("WASM module is not valid");
763
764 #[cfg(feature = "logging")]
765 info!("WASM module build time: {:?}", Instant::elapsed(&start));
766
767 // The structure that contains the global variables is serialized before
768 // being passed to the `Rules` struct. This is because we want `Rules`
769 // to be `Send`, so that it can be shared with scanners running in
770 // different threads. In order for `Rules` to be `Send`, it can't
771 // contain fields that are not `Send`. As `Struct` is not `Send` we
772 // can't have a `Struct` field in `Rules`, so what we have a `Vec<u8>`
773 // with a serialized version of the struct.
774 //
775 // An alternative is changing the `Rc` in some variants of `TypeValue`
776 // to `Arc`, as the root cause that prevents `Struct` from being `Send`
777 // is the use of `Rc` in `TypeValue`.
778 let serialized_globals = bincode::serde::encode_to_vec(
779 &self.root_struct,
780 bincode::config::standard().with_variable_int_encoding(),
781 )
782 .expect("failed to serialize global variables");
783
784 let mut rules = Rules {
785 serialized_globals,
786 wasm_mod,
787 compiled_wasm_mod: Some(compiled_wasm_mod),
788 relaxed_re_syntax: self.relaxed_re_syntax,
789 ac: None,
790 num_patterns: self.next_pattern_id.0 as usize,
791 ident_pool: self.ident_pool,
792 regexp_pool: self.regexp_pool,
793 lit_pool: self.lit_pool,
794 imported_modules: self.imported_modules,
795 rules: self.rules,
796 sub_patterns: self.sub_patterns,
797 anchored_sub_patterns: self.anchored_sub_patterns,
798 atoms: self.atoms,
799 re_code: self.re_code,
800 warnings: self.warnings.into(),
801 filesize_bounds: self.filesize_bounds,
802 };
803
804 rules.build_ac_automaton();
805 rules
806 }
807
808 /// Adds a linter to the compiler.
809 ///
810 /// Linters perform additional checks to each YARA rule, generating
811 /// warnings when a rule does not meet the linter's requirements. See
812 /// [`crate::linters`] for a list of available linters.
813 pub fn add_linter<L: linters::Linter + 'a>(
814 &mut self,
815 linter: L,
816 ) -> &mut Self {
817 self.linters.push(Box::new(linter));
818 self
819 }
820
821 /// Enables a feature on this compiler.
822 ///
823 /// When defining the structure of a module in a `.proto` file, you can
824 /// specify that certain fields are accessible only when one or more
825 /// features are enabled. For example, the snippet below shows the
826 /// definition of a field named `requires_foo_and_bar`, which can be
827 /// accessed only when both features "foo" and "bar" are enabled.
828 ///
829 /// ```protobuf
830 /// optional uint64 requires_foo_and_bar = 500 [
831 /// (yara.field_options) = {
832 /// acl: [
833 /// {
834 /// allow_if: "foo",
835 /// error_title: "foo is required",
836 /// error_label: "this field was used without foo"
837 /// },
838 /// {
839 /// allow_if: "bar",
840 /// error_title: "bar is required",
841 /// error_label: "this field was used without bar"
842 /// }
843 /// ]
844 /// }
845 /// ];
846 /// ```
847 ///
848 /// If some of the required features are not enabled, using this field in
849 /// a YARA rule will cause an error while compiling the rules. The error
850 /// looks like:
851 ///
852 /// ```text
853 /// error[E034]: foo is required
854 /// --> line:5:29
855 /// |
856 /// 5 | test_proto2.requires_foo_and_bar == 0
857 /// | ^^^^^^^^^^^^^^^^^^^^ this field was used without foo
858 /// |
859 /// ```
860 ///
861 /// Notice that both the title and label in the error message are defined
862 /// in the .proto file.
863 ///
864 /// # Important
865 ///
866 /// This API is hidden from the public documentation because it is unstable
867 /// and subject to change.
868 #[doc(hidden)]
869 pub fn enable_feature<F: Into<String>>(
870 &mut self,
871 feature: F,
872 ) -> &mut Self {
873 self.features.insert(feature.into());
874 self
875 }
876
877 /// Tell the compiler that a YARA module is not supported.
878 ///
879 /// Import statements for ignored modules will be ignored without errors,
880 /// but a warning will be issued. Any rule that makes use of an ignored
881 /// module will be also ignored, while the rest of the rules that don't
882 /// rely on that module will be correctly compiled.
883 pub fn ignore_module<M: Into<String>>(&mut self, module: M) -> &mut Self {
884 self.ignored_modules.insert(module.into());
885 self
886 }
887
888 /// Tell the compiler that a YARA module can't be used.
889 ///
890 /// Import statements for the banned module will cause an error. The error
891 /// message can be customized by using the given error title and message.
892 ///
893 /// If this function is called multiple times with the same module name,
894 /// the error title and message will be updated.
895 pub fn ban_module<M: Into<String>, T: Into<String>, E: Into<String>>(
896 &mut self,
897 module: M,
898 error_title: T,
899 error_message: E,
900 ) -> &mut Self {
901 self.banned_modules
902 .insert(module.into(), (error_title.into(), error_message.into()));
903 self
904 }
905
906 /// Specifies whether the compiler should produce colorful error messages.
907 ///
908 /// Colorized error messages contain ANSI escape sequences that make them
909 /// look nicer on compatible consoles.
910 ///
911 /// The default setting is `false`.
912 pub fn colorize_errors(&mut self, yes: bool) -> &mut Self {
913 self.report_builder.with_colors(yes);
914 self
915 }
916
917 /// Sets the maximum number of columns in error messages.
918 ///
919 /// The default value is 140.
920 pub fn errors_max_width(&mut self, width: usize) -> &mut Self {
921 self.report_builder.max_width(width);
922 self
923 }
924
925 /// Enables or disables a specific type of warning.
926 ///
927 /// Each warning type has a description code (i.e: `slow_pattern`,
928 /// `unsupported_module`, etc.). This function allows to enable or disable
929 /// a specific type of warning identified by the given code.
930 ///
931 /// Returns an error if the given warning code doesn't exist.
932 pub fn switch_warning(
933 &mut self,
934 code: &str,
935 enabled: bool,
936 ) -> Result<&mut Self, InvalidWarningCode> {
937 self.warnings.switch_warning(code, enabled)?;
938 Ok(self)
939 }
940
941 /// Enables or disables all warnings.
942 pub fn switch_all_warnings(&mut self, enabled: bool) -> &mut Self {
943 self.warnings.switch_all_warnings(enabled);
944 self
945 }
946
947 /// Enables a more relaxed syntax check for regular expressions.
948 ///
949 /// YARA-X enforces stricter regular expression syntax compared to YARA.
950 /// For instance, YARA accepts invalid escape sequences and treats them
951 /// as literal characters (e.g., \R is interpreted as a literal 'R'). It
952 /// also allows some special characters to appear unescaped, inferring
953 /// their meaning from the context (e.g., `{` and `}` in `/foo{}bar/` are
954 /// literal, but in `/foo{0,1}bar/` they form the repetition operator
955 /// `{0,1}`).
956 ///
957 /// This setting controls whether the compiler should mimic YARA's behavior,
958 /// allowing constructs that YARA-X doesn't accept by default.
959 ///
960 /// This should be called before any rule is added to the compiler.
961 ///
962 /// # Panics
963 ///
964 /// If called after adding rules to the compiler.
965 pub fn relaxed_re_syntax(&mut self, yes: bool) -> &mut Self {
966 if !self.rules.is_empty() {
967 panic!("calling relaxed_re_syntax in non-empty compiler")
968 }
969 self.relaxed_re_syntax = yes;
970 self
971 }
972
973 /// When enabled, slow patterns produce an error instead of a warning.
974 ///
975 /// This is disabled by default.
976 pub fn error_on_slow_pattern(&mut self, yes: bool) -> &mut Self {
977 self.error_on_slow_pattern = yes;
978 self
979 }
980
981 /// When enabled, potentially slow loops produce an error instead of a
982 /// warning.
983 ///
984 /// This is disabled by default.
985 pub fn error_on_slow_loop(&mut self, yes: bool) -> &mut Self {
986 self.error_on_slow_loop = yes;
987 self
988 }
989
990 /// Controls whether `include` statements are allowed.
991 ///
992 /// By default, the compiler allows the use of `include` statements, which
993 /// include the content of other files. When includes are disabled, any
994 /// attempt to use an `include` statement will result in a compile error.
995 ///
996 /// ```
997 /// # use yara_x::Compiler;
998 /// let mut compiler = Compiler::new();
999 /// compiler.enable_includes(false); // Disable includes
1000 /// ```
1001 pub fn enable_includes(&mut self, yes: bool) -> &mut Self {
1002 self.includes_enabled = yes;
1003 self
1004 }
1005
1006 /// When enabled, the compiler tries to optimize rule conditions.
1007 ///
1008 /// The optimizations usually reduce condition evaluation times, specially
1009 /// in complex rules that contain loops, but it can break short-circuit
1010 /// evaluation rules because some subexpressions are not executed in the
1011 /// order they appear in the source code.
1012 ///
1013 /// This is a very experimental feature.
1014 #[doc(hidden)]
1015 pub fn condition_optimization(&mut self, yes: bool) -> &mut Self {
1016 self.hoisting(yes)
1017 }
1018
1019 pub(crate) fn hoisting(&mut self, yes: bool) -> &mut Self {
1020 self.hoisting = yes;
1021 self
1022 }
1023
1024 /// Retrieves all errors generated by the compiler.
1025 ///
1026 /// This method returns every error encountered during the compilation,
1027 /// across all invocations of [`Compiler::add_source`].
1028 #[inline]
1029 pub fn errors(&self) -> &[CompileError] {
1030 self.errors.as_slice()
1031 }
1032
1033 /// Returns the warnings emitted by the compiler.
1034 ///
1035 /// This method returns every warning issued during the compilation,
1036 /// across all invocations of [`Compiler::add_source`].
1037 #[inline]
1038 pub fn warnings(&self) -> &[Warning] {
1039 self.warnings.as_slice()
1040 }
1041
1042 /// Emits a `.wasm` file with the WASM module generated by the compiler.
1043 ///
1044 /// This file can be inspected and converted to WASM text format by using
1045 /// third-party [tooling](https://github.com/WebAssembly/wabt). This is
1046 /// useful for debugging issues with incorrectly emitted WASM code.
1047 pub fn emit_wasm_file<P>(self, path: P) -> Result<(), EmitWasmError>
1048 where
1049 P: AsRef<Path>,
1050 {
1051 let mut wasm_mod = self.wasm_mod.build();
1052 Ok(wasm_mod.emit_wasm_file(path)?)
1053 }
1054
1055 /// Sets a writer where the compiler will write the Intermediate
1056 /// Representation (IR) of compiled conditions.
1057 ///
1058 /// This is used for testing and debugging purposes.
1059 #[doc(hidden)]
1060 pub fn set_ir_writer<W: Write + 'static>(&mut self, w: W) -> &mut Self {
1061 self.ir_writer = Some(Box::new(w));
1062 self
1063 }
1064}
1065
1066impl Compiler<'_> {
1067 fn add_sub_pattern<I, F, A>(
1068 &mut self,
1069 pattern_id: PatternId,
1070 sub_pattern: SubPattern,
1071 atoms: I,
1072 f: F,
1073 ) -> SubPatternId
1074 where
1075 I: Iterator<Item = A>,
1076 F: Fn(SubPatternId, A) -> SubPatternAtom,
1077 {
1078 let sub_pattern_id = SubPatternId(self.sub_patterns.len() as u32);
1079
1080 // Sub-patterns that are anchored at some fixed offset are not added to
1081 // the Aho-Corasick automata. Instead, their IDs are added to the
1082 // anchored_sub_patterns list.
1083 if let SubPattern::Literal { anchored_at: Some(_), .. } = sub_pattern {
1084 self.anchored_sub_patterns.push(sub_pattern_id);
1085 } else {
1086 self.atoms.extend(atoms.map(|atom| f(sub_pattern_id, atom)));
1087 }
1088
1089 self.sub_patterns.push((pattern_id, sub_pattern));
1090
1091 sub_pattern_id
1092 }
1093
1094 /// Checks if another rule, module or variable has the given identifier and
1095 /// return an error in that case.
1096 fn check_for_existing_identifier(
1097 &self,
1098 ident: &Ident,
1099 ) -> Result<(), CompileError> {
1100 if let Some(symbol) = self.symbol_table.lookup(ident.name) {
1101 return match symbol {
1102 // Found another rule with the same name.
1103 Symbol::Rule { rule_id, .. } => Err(DuplicateRule::build(
1104 &self.report_builder,
1105 ident.name.to_string(),
1106 self.report_builder.span_to_code_loc(ident.span()),
1107 self.rules
1108 .get(rule_id.0 as usize)
1109 .unwrap()
1110 .ident_ref
1111 .clone(),
1112 )),
1113 // Found another symbol that is not a rule, but has the same
1114 // name.
1115 _ => Err(ConflictingRuleIdentifier::build(
1116 &self.report_builder,
1117 ident.name.to_string(),
1118 self.report_builder.span_to_code_loc(ident.span()),
1119 )),
1120 };
1121 }
1122 Ok(())
1123 }
1124
1125 /// Checks that tags are not duplicate.
1126 fn check_for_duplicate_tags(
1127 &self,
1128 tags: &[Ident],
1129 ) -> Result<(), CompileError> {
1130 let mut s = HashSet::new();
1131 for tag in tags {
1132 if !s.insert(tag.name) {
1133 return Err(DuplicateTag::build(
1134 &self.report_builder,
1135 tag.name.to_string(),
1136 self.report_builder.span_to_code_loc(tag.span()),
1137 ));
1138 }
1139 }
1140 Ok(())
1141 }
1142
1143 /// Interns a literal in the literals pool.
1144 ///
1145 /// If `wide` is true the literal gets zeroes interleaved between each byte
1146 /// before being interned.
1147 fn intern_literal(&mut self, literal: &[u8], wide: bool) -> LiteralId {
1148 let wide_pattern;
1149 let literal_bytes = if wide {
1150 wide_pattern = make_wide(literal);
1151 wide_pattern.as_bytes()
1152 } else {
1153 literal
1154 };
1155 self.lit_pool.get_or_intern(literal_bytes)
1156 }
1157
1158 /// Takes a snapshot of the compiler's state at this moment.
1159 ///
1160 /// The returned [`Snapshot`] can be passed to [`Compiler::restore_snapshot`]
1161 /// for restoring the compiler to the state it was when the snapshot was
1162 /// taken.
1163 ///
1164 /// This is useful when the compilation of a rule fails, for restoring the
1165 /// compiler to the state it had before starting compiling the failed rule,
1166 /// which avoids leaving junk in the compiler's internal structures.
1167 fn take_snapshot(&self) -> Snapshot {
1168 Snapshot {
1169 next_pattern_id: self.next_pattern_id,
1170 rules_len: self.rules.len(),
1171 atoms_len: self.atoms.len(),
1172 re_code_len: self.re_code.len(),
1173 sub_patterns_len: self.sub_patterns.len(),
1174 symbol_table_len: self.symbol_table.len(),
1175 }
1176 }
1177
1178 /// Restores the compiler's to a previous state.
1179 ///
1180 /// Use [`Compiler::take_snapshot`] for taking a snapshot of the compiler's
1181 /// state.
1182 fn restore_snapshot(&mut self, snapshot: Snapshot) {
1183 self.next_pattern_id = snapshot.next_pattern_id;
1184 self.rules.truncate(snapshot.rules_len);
1185 self.sub_patterns.truncate(snapshot.sub_patterns_len);
1186 self.re_code.truncate(snapshot.re_code_len);
1187 self.atoms.truncate(snapshot.atoms_len);
1188 self.symbol_table.truncate(snapshot.symbol_table_len);
1189
1190 // Pattern IDs that are >= next_pattern_id, are being discarded. Any pattern
1191 // or file size bound associated to such IDs must be removed.
1192
1193 self.patterns
1194 .retain(|_, pattern_id| *pattern_id < snapshot.next_pattern_id);
1195
1196 self.filesize_bounds
1197 .retain(|pattern_id, _| *pattern_id < snapshot.next_pattern_id);
1198 }
1199
/// Returns true if the slice is a repetition of a single byte, and that
/// byte is 0x00, 0x90 or 0xff.
///
/// An empty slice also yields true.
fn common_byte_repetition(bytes: &[u8]) -> bool {
    match bytes.split_first() {
        // There is no byte that breaks the repetition.
        None => true,
        // The first byte decides which of the three values is repeated,
        // all the remaining bytes must be equal to it.
        Some((first, rest)) => {
            matches!(*first, 0x00 | 0x90 | 0xff)
                && rest.iter().all(|byte| byte == first)
        }
    }
}
1229
1230 /// Reads the file specified by an `include` statement.
1231 ///
1232 /// Tries to read the file in the include directories that were specified
1233 /// with [`Compiler::add_include_dir`], or in the current directory, if
1234 /// no include directories were specified.
1235 ///
1236 /// The function returns both the content and the path of the included file
1237 /// relative to the current directory, or an error if the included file could
1238 /// not be read.
1239 fn read_included_file(
1240 &mut self,
1241 include: &Include,
1242 ) -> Result<(Vec<u8>, PathBuf), CompileError> {
1243 let read_file =
1244 |path: PathBuf| -> Result<(Vec<u8>, PathBuf), io::Error> {
1245 let mut path = path.canonicalize()?;
1246 let content = fs::read(&path)?;
1247
1248 if let Ok(cwd) =
1249 env::current_dir().and_then(|dir| dir.canonicalize())
1250 {
1251 if let Ok(relative_path) = path.strip_prefix(cwd) {
1252 path = relative_path.to_path_buf();
1253 }
1254 }
1255
1256 Ok((content, path))
1257 };
1258
1259 // Look for the included file in the directory at the top of the
1260 // include stack.
1261 if let Some(dir) =
1262 self.include_stack.last().and_then(|path| path.parent())
1263 {
1264 if let Ok(result) = read_file(dir.join(include.file_name)) {
1265 return Ok(result);
1266 }
1267 }
1268
1269 // If one or more include directory were specified, try to find the
1270 // included file in them, in the order they were specified. Otherwise,
1271 // try to find the included file in the current directory.
1272 if let Some(include_dirs) = &self.include_dirs {
1273 if let Some(result) = include_dirs
1274 .iter()
1275 .find_map(|dir| read_file(dir.join(include.file_name)).ok())
1276 {
1277 Ok(result)
1278 } else {
1279 Err(IncludeNotFound::build(
1280 &self.report_builder,
1281 include.file_name.to_string(),
1282 self.report_builder.span_to_code_loc(include.span()),
1283 ))
1284 }
1285 } else {
1286 read_file(PathBuf::from(include.file_name)).map_err(|err| {
1287 if err.kind() == io::ErrorKind::NotFound {
1288 IncludeNotFound::build(
1289 &self.report_builder,
1290 include.file_name.to_string(),
1291 self.report_builder.span_to_code_loc(include.span()),
1292 )
1293 } else {
1294 IncludeError::build(
1295 &self.report_builder,
1296 self.report_builder.span_to_code_loc(include.span()),
1297 err.to_string(),
1298 )
1299 }
1300 })
1301 }
1302 }
1303}
1304
1305impl Compiler<'_> {
1306 fn c_items<'a, I>(&mut self, items: I)
1307 where
1308 I: Iterator<Item = &'a ast::Item<'a>>,
1309 {
1310 let mut already_imported = FxHashMap::default();
1311
1312 for item in items {
1313 match item {
1314 ast::Item::Import(import) => {
1315 // Checks that all imported modules actually exist, and
1316 // raise warnings in case of duplicated imports within
1317 // the same source file. For each module add a symbol to
1318 // the current namespace.
1319 if let Some(existing_import) = already_imported.insert(
1320 &import.module_name,
1321 self.report_builder.span_to_code_loc(import.span()),
1322 ) {
1323 let duplicated_import = self
1324 .report_builder
1325 .span_to_code_loc(import.span());
1326
1327 let mut warning = warnings::DuplicateImport::build(
1328 &self.report_builder,
1329 import.module_name.to_string(),
1330 duplicated_import.clone(),
1331 existing_import,
1332 );
1333
1334 warning.report_mut().patch(duplicated_import, "");
1335
1336 self.warnings.add(|| warning)
1337 }
1338 // Import the module. This updates `self.root_struct` if
1339 // necessary.
1340 if let Err(err) = self.c_import(import) {
1341 self.errors.push(err);
1342 }
1343 }
1344 ast::Item::Include(include) => {
1345 // Return an error if includes are disabled
1346 if !self.includes_enabled {
1347 self.errors.push(IncludeNotAllowed::build(
1348 &self.report_builder,
1349 self.report_builder
1350 .span_to_code_loc(include.span()),
1351 ));
1352 continue;
1353 }
1354
1355 let (included_src, included_path) =
1356 match self.read_included_file(include) {
1357 Ok(included) => included,
1358 Err(err) => {
1359 self.errors.push(err);
1360 continue;
1361 }
1362 };
1363
1364 if self.include_stack.contains(&included_path) {
1365 self.errors.push(CircularIncludes::build(
1366 &self.report_builder,
1367 self.report_builder
1368 .span_to_code_loc(include.span()),
1369 Some(format!(
1370 "include dependencies:\n{}",
1371 self.include_stack
1372 .iter()
1373 .enumerate()
1374 .map(|(i, path)| format!(
1375 "{:>width$}↳ {}",
1376 "",
1377 path.display(),
1378 width = i * 2
1379 ))
1380 .collect::<Vec<_>>()
1381 .join("\n")
1382 )),
1383 ));
1384 continue;
1385 }
1386
1387 // Save the current source ID from the report builder in
1388 // order to restore it later. Any recursive call to
1389 // `add_source` will change the current source ID, and we
1390 // need to restore after `add_source` returns.
1391 let source_id =
1392 self.report_builder.get_current_source_id().unwrap();
1393
1394 let source_code =
1395 SourceCode::from(included_src.as_slice()).with_origin(
1396 // In Windows the paths separators are backslashes, but we
1397 // want to use slashes.
1398 included_path.to_str().unwrap().replace("\\", "/"),
1399 );
1400
1401 self.include_stack.push(included_path);
1402
1403 // Any error generated while processing the included source
1404 // code will be added to `self.errors`. The error returned
1405 // by `add_source` is simply the first of the added errors,
1406 // we don't need to handle the error here.
1407 let _ = self.add_source(source_code);
1408
1409 // Restore the current source ID to the value it had before
1410 // calling `add_source`.
1411 self.report_builder.set_current_source_id(source_id);
1412
1413 self.include_stack.pop().unwrap();
1414 }
1415 ast::Item::Rule(rule) => {
1416 if let Err(err) = self.c_rule(rule) {
1417 self.errors.push(err);
1418 }
1419 }
1420 }
1421 }
1422 }
1423
1424 fn c_rule(&mut self, rule: &ast::Rule) -> Result<(), CompileError> {
1425 // Check if another rule, module or variable has the same identifier
1426 // and return an error in that case.
1427 self.check_for_existing_identifier(&rule.identifier)?;
1428
1429 // Check that rule tags, if any, doesn't contain duplicates.
1430 if let Some(tags) = &rule.tags {
1431 self.check_for_duplicate_tags(tags.as_slice())?;
1432 }
1433
1434 // Check the rule with all the linters.
1435 for linter in self.linters.iter() {
1436 match linter.check(&self.report_builder, rule) {
1437 LinterResult::Ok => {}
1438 LinterResult::Warn(warning) => {
1439 self.warnings.add(|| warning);
1440 }
1441 LinterResult::Warns(warnings) => {
1442 for warning in warnings {
1443 self.warnings.add(|| warning);
1444 }
1445 }
1446 LinterResult::Err(err) => return Err(err),
1447 }
1448 }
1449
1450 // Take snapshot of the current compiler state. In case of error
1451 // compiling the current rule this snapshot allows restoring the
1452 // compiler to the state it had before starting compiling the rule.
1453 // This way we don't leave too much junk, like atoms, or sub-patterns
1454 // corresponding to failed rules. However, there is some junk left
1455 // behind in `ident_pool` and `lit_pool`, because once a string is
1456 // added to one of these pools it can't be removed.
1457 let snapshot = self.take_snapshot();
1458
1459 let tags: Vec<IdentId> = rule
1460 .tags
1461 .iter()
1462 .flatten()
1463 .map(|t| self.ident_pool.get_or_intern(t.name))
1464 .collect();
1465
1466 // Helper function that converts from `ast::MetaValue` to
1467 // `compiler::rules::MetaValue`.
1468 let mut convert_meta_value = |value: &ast::MetaValue| match value {
1469 ast::MetaValue::Integer((i, _)) => MetaValue::Integer(*i),
1470 ast::MetaValue::Float((f, _)) => MetaValue::Float(*f),
1471 ast::MetaValue::Bool((b, _)) => MetaValue::Bool(*b),
1472 ast::MetaValue::String((s, _)) => {
1473 MetaValue::String(self.lit_pool.get_or_intern(s))
1474 }
1475 ast::MetaValue::Bytes((s, _)) => {
1476 MetaValue::Bytes(self.lit_pool.get_or_intern(s))
1477 }
1478 };
1479
1480 // Build a vector of pairs (IdentId, MetaValue) for every meta defined
1481 // in the rule.
1482 let metadata = rule
1483 .meta
1484 .iter()
1485 .flatten()
1486 .map(|m| {
1487 (
1488 self.ident_pool.get_or_intern(m.identifier.name),
1489 convert_meta_value(&m.value),
1490 )
1491 })
1492 .collect();
1493
1494 let mut rule_patterns = Vec::new();
1495
1496 let mut ctx = CompileContext {
1497 ir: &mut self.ir,
1498 relaxed_re_syntax: self.relaxed_re_syntax,
1499 error_on_slow_loop: self.error_on_slow_loop,
1500 one_shot_symbol_table: None,
1501 symbol_table: &mut self.symbol_table,
1502 report_builder: &self.report_builder,
1503 current_rule_patterns: &mut rule_patterns,
1504 warnings: &mut self.warnings,
1505 vars: VarStack::new(),
1506 for_of_depth: 0,
1507 features: &self.features,
1508 loop_iteration_multiplier: 1,
1509 };
1510
1511 // Convert the patterns from AST to IR. This populates the
1512 // `ctx.current_rule_patterns` vector.
1513 if let Err(err) = patterns_from_ast(&mut ctx, rule) {
1514 drop(ctx);
1515 self.restore_snapshot(snapshot);
1516 return Err(err);
1517 }
1518
1519 // Convert the condition from AST to IR. Also updates the patterns
1520 // with information about whether they are used in the condition and
1521 // if they are anchored or not.
1522 let condition = rule_condition_from_ast(&mut ctx, rule);
1523
1524 drop(ctx);
1525
1526 // Search for patterns that are very common byte repetitions like:
1527 //
1528 // 00 00 00 00 00 00 ....
1529 // 90 90 09 90 90 90 ....
1530 // FF FF FF FF FF FF ....
1531 //
1532 // Raise a warning when such a pattern is found, except in the
1533 // following cases:
1534 //
1535 // 1) When the pattern is anchored, because anchored pattern can appear
1536 // only at a fixed offset and are not searched by Aho-Corasick.
1537 //
1538 // 2) When the pattern has attributes: xor, fullword, base64 or
1539 // base64wide, because in those cases the real pattern is not that
1540 // common.
1541 //
1542 // Note: this can't be done before calling `rule_condition_from_ast`,
1543 // because we don't know which patterns are anchored until the condition
1544 // is processed.
1545 for pat in rule_patterns.iter() {
1546 if pat.anchored_at().is_none()
1547 && !pat.pattern().flags().intersects(
1548 PatternFlags::Xor
1549 | PatternFlags::Fullword
1550 | PatternFlags::Base64
1551 | PatternFlags::Base64Wide,
1552 )
1553 {
1554 let literal_bytes = match pat.pattern() {
1555 Pattern::Text(lit) => Some(lit.text.as_bytes()),
1556 Pattern::Regexp(re) => re.hir.as_literal_bytes(),
1557 Pattern::Hex(re) => re.hir.as_literal_bytes(),
1558 };
1559 if let Some(literal_bytes) = literal_bytes {
1560 if Self::common_byte_repetition(literal_bytes) {
1561 self.warnings.add(|| {
1562 warnings::SlowPattern::build(
1563 &self.report_builder,
1564 self.report_builder
1565 .span_to_code_loc(pat.span().clone()),
1566 None,
1567 )
1568 });
1569 }
1570 }
1571 }
1572 }
1573
1574 // In case of error, restore the compiler to the state it was before
1575 // entering this function. Also, if the error is due to an unknown
1576 // identifier, but the identifier is one of the unsupported modules,
1577 // the error is tolerated and a warning is issued instead.
1578 let mut condition = match condition {
1579 Ok(condition) => condition,
1580 Err(CompileError::UnknownIdentifier(unknown))
1581 if self.ignored_rules.contains_key(unknown.identifier())
1582 || self.ignored_modules.contains(unknown.identifier()) =>
1583 {
1584 self.restore_snapshot(snapshot);
1585
1586 if let Some(module_name) =
1587 self.ignored_rules.get(unknown.identifier())
1588 {
1589 self.warnings.add(|| {
1590 warnings::IgnoredRule::build(
1591 &self.report_builder,
1592 module_name.clone(),
1593 rule.identifier.name.to_string(),
1594 unknown.identifier_location().clone(),
1595 )
1596 });
1597 self.ignored_rules.insert(
1598 rule.identifier.name.to_string(),
1599 module_name.clone(),
1600 );
1601 } else {
1602 self.warnings.add(|| {
1603 warnings::IgnoredModule::build(
1604 &self.report_builder,
1605 unknown.identifier().to_string(),
1606 unknown.identifier_location().clone(),
1607 Some(format!(
1608 "the whole rule `{}` will be ignored",
1609 rule.identifier.name
1610 )),
1611 )
1612 });
1613 self.ignored_rules.insert(
1614 rule.identifier.name.to_string(),
1615 unknown.identifier().to_string(),
1616 );
1617 }
1618
1619 return Ok(());
1620 }
1621 Err(err) => {
1622 self.restore_snapshot(snapshot);
1623 return Err(err);
1624 }
1625 };
1626
1627 if self.hoisting {
1628 condition = self.ir.hoisting();
1629 }
1630
1631 // Analyze the condition and determine the bounds it imposes to
1632 // `filesize`, if any.
1633 let filesize_bounds = self.ir.filesize_bounds();
1634
1635 // Set the bounds to all patterns in the rule. This must be done
1636 // before assigning the PatternId to each pattern, as the filesize
1637 // bounds are taken into account when determining if the pattern
1638 // is unique or re-used from a previous rule.
1639 if !filesize_bounds.unbounded() {
1640 for pattern in &mut rule_patterns {
1641 pattern.pattern_mut().set_filesize_bounds(&filesize_bounds);
1642 }
1643 }
1644
1645 if let Some(w) = &mut self.ir_writer {
1646 writeln!(w, "RULE {}", rule.identifier.name).unwrap();
1647 writeln!(w, "{:?}", self.ir).unwrap();
1648 if !filesize_bounds.unbounded() {
1649 writeln!(w, "{filesize_bounds:?}\n",).unwrap();
1650 }
1651 }
1652
1653 let mut pattern_ids = Vec::with_capacity(rule_patterns.len());
1654 let mut patterns = Vec::with_capacity(rule_patterns.len());
1655 let mut pending_patterns = HashSet::new();
1656 let mut num_private_patterns = 0;
1657
1658 for pattern in &rule_patterns {
1659 // Raise error is some pattern was not used, except if the pattern
1660 // identifier starts with underscore.
1661 if !pattern.in_use() && !pattern.identifier().starts_with("$_") {
1662 self.restore_snapshot(snapshot);
1663 return Err(UnusedPattern::build(
1664 &self.report_builder,
1665 pattern.identifier().name.to_string(),
1666 self.report_builder
1667 .span_to_code_loc(pattern.identifier().span()),
1668 ));
1669 }
1670
1671 if pattern.pattern().flags().contains(PatternFlags::Private) {
1672 num_private_patterns += 1;
1673 }
1674
1675 // Check if this pattern has been declared before, in this rule or
1676 // in some other rule. In such cases the pattern ID is re-used, and
1677 // we don't need to process (i.e: extract atoms and add them to
1678 // Aho-Corasick automaton) the pattern again. Two patterns are
1679 // considered equal if they are exactly the same, including any
1680 // modifiers associated to the pattern, both are non-anchored
1681 // or anchored at the same file offset, and if they have the same
1682 // file size bounds.
1683 let pattern_id =
1684 match self.patterns.entry(pattern.pattern().clone()) {
1685 // The pattern already exists, return the existing ID.
1686 Entry::Occupied(entry) => *entry.get(),
1687 // The pattern didn't exist.
1688 Entry::Vacant(entry) => {
1689 let pattern_id = self.next_pattern_id;
1690 self.next_pattern_id.incr(1);
1691 pending_patterns.insert(pattern_id);
1692 entry.insert(pattern_id);
1693 pattern_id
1694 }
1695 };
1696
1697 let kind = match pattern.pattern() {
1698 Pattern::Text(_) => PatternKind::Text,
1699 Pattern::Regexp(_) => PatternKind::Regexp,
1700 Pattern::Hex(_) => PatternKind::Hex,
1701 };
1702
1703 patterns.push(PatternInfo {
1704 kind,
1705 pattern_id,
1706 ident_id: self
1707 .ident_pool
1708 .get_or_intern(pattern.identifier().name),
1709 is_private: pattern
1710 .pattern()
1711 .flags()
1712 .contains(PatternFlags::Private),
1713 });
1714
1715 pattern_ids.push(pattern_id);
1716 }
1717
1718 // The RuleId for the new rule is current length of `self.rules`. The
1719 // first rule has RuleId = 0.
1720 let rule_id = RuleId::from(self.rules.len());
1721
1722 self.rules.push(RuleInfo {
1723 tags,
1724 metadata,
1725 patterns,
1726 num_private_patterns,
1727 is_global: rule.flags.contains(RuleFlags::Global),
1728 is_private: rule.flags.contains(RuleFlags::Private),
1729 namespace_id: self.current_namespace.id,
1730 namespace_ident_id: self.current_namespace.ident_id,
1731 ident_id: self.ident_pool.get_or_intern(rule.identifier.name),
1732 ident_ref: self
1733 .report_builder
1734 .span_to_code_loc(rule.identifier.span()),
1735 });
1736
1737 // Process the patterns in the rule. This extracts the best atoms
1738 // from each pattern, adding them to the `self.atoms` vector, it
1739 // also creates one or more sub-patterns per pattern and adds them
1740 // to `self.sub_patterns`
1741 for (pattern_id, pattern) in
1742 izip!(pattern_ids.iter(), rule_patterns.into_iter())
1743 {
1744 if pending_patterns.contains(pattern_id) {
1745 let pattern_span = pattern.span().clone();
1746 match pattern.into_pattern() {
1747 Pattern::Text(pattern) => {
1748 self.c_literal_pattern(*pattern_id, pattern);
1749 }
1750 Pattern::Regexp(pattern) | Pattern::Hex(pattern) => {
1751 if let Err(err) = self.c_regexp_pattern(
1752 *pattern_id,
1753 pattern,
1754 pattern_span,
1755 ) {
1756 self.restore_snapshot(snapshot);
1757 return Err(err);
1758 }
1759 }
1760 };
1761 if !filesize_bounds.unbounded()
1762 && self
1763 .filesize_bounds
1764 .insert(*pattern_id, filesize_bounds.clone())
1765 .is_some()
1766 {
1767 // This should not happen.
1768 panic!("modifying the file size bounds of an existing pattern")
1769 }
1770 pending_patterns.remove(pattern_id);
1771 }
1772 }
1773
1774 // Create a new symbol of bool type for the rule.
1775 let new_symbol = Symbol::Rule {
1776 rule_id,
1777 is_global: rule.flags.contains(RuleFlags::Global),
1778 };
1779
1780 // Insert the symbol in the symbol table corresponding to the
1781 // current namespace. This must be done after every fallible function
1782 // has been called; once the symbol is inserted in the symbol table,
1783 // it can't be undone.
1784 let existing_symbol = self
1785 .current_namespace
1786 .symbols
1787 .as_ref()
1788 .borrow_mut()
1789 .insert(rule.identifier.name, new_symbol);
1790
1791 // No other symbol with the same identifier should exist.
1792 assert!(existing_symbol.is_none());
1793
1794 // The last step is emitting the WASM code corresponding to the rule's
1795 // condition. This is done after every fallible function has been called
1796 // because once the code is emitted it cannot be undone, which means
1797 // that if this function fails after emitting the code, some code debris
1798 // will remain in the WASM module.
1799 let mut ctx = EmitContext {
1800 current_rule: self.rules.last_mut().unwrap(),
1801 lit_pool: &mut self.lit_pool,
1802 regexp_pool: &mut self.regexp_pool,
1803 wasm_symbols: &self.wasm_symbols,
1804 wasm_exports: &self.wasm_exports,
1805 exception_handler_stack: Vec::new(),
1806 lookup_list: Vec::new(),
1807 emit_search_for_pattern_stack: Vec::new(),
1808 };
1809
1810 emit_rule_condition(
1811 &mut ctx,
1812 &self.ir,
1813 rule_id,
1814 condition,
1815 &mut self.wasm_mod,
1816 );
1817
1818 Ok(())
1819 }
1820
1821 fn c_import(&mut self, import: &Import) -> Result<(), CompileError> {
1822 let module_name = import.module_name;
1823 let module = BUILTIN_MODULES.get(module_name);
1824
1825 // Does a module with the given name actually exist? ...
1826 if module.is_none() {
1827 // The module does not exist, but it is included in the list
1828 // of unsupported modules. In such cases we don't raise an error,
1829 // only a warning.
1830 return if self.ignored_modules.iter().any(|m| m == module_name) {
1831 self.warnings.add(|| {
1832 warnings::IgnoredModule::build(
1833 &self.report_builder,
1834 module_name.to_string(),
1835 self.report_builder.span_to_code_loc(import.span()),
1836 None,
1837 )
1838 });
1839 Ok(())
1840 } else {
1841 // The module does not exist, and is not explicitly added to
1842 // the list of unsupported modules, that's an error.
1843 Err(UnknownModule::build(
1844 &self.report_builder,
1845 module_name.to_string(),
1846 self.report_builder.span_to_code_loc(import.span()),
1847 ))
1848 };
1849 }
1850
1851 // Yes, module exists.
1852 let module = module.unwrap();
1853
1854 // If the module has not been added to `self.root_struct` and
1855 // `self.imported_modules`, do it.
1856 if !self.root_struct.has_field(module_name) {
1857 // Add the module to the list of imported modules.
1858 self.imported_modules
1859 .push(self.ident_pool.get_or_intern(module_name));
1860
1861 // Create the structure that describes the module.
1862 let mut module_struct = Struct::from_proto_descriptor_and_msg(
1863 &module.root_struct_descriptor,
1864 None,
1865 true,
1866 );
1867
1868 // Get a mutable reference for the module's structure. This is
1869 // possible because there's only one Rc pointing to the structure,
1870 // otherwise the `.unwrap()` panics.
1871 let module_struct_mut =
1872 Rc::<Struct>::get_mut(&mut module_struct).unwrap();
1873
1874 // If the YARA module has an associated Rust module, check if it
1875 // exports some function and add it to the structure.
1876 if let Some(rust_module_name) = module.rust_module_name {
1877 let functions = WasmExport::get_functions(|export| {
1878 export.public
1879 && export.rust_module_path.contains(rust_module_name)
1880 });
1881 for (name, func) in functions {
1882 let func = TypeValue::Func(Rc::new(func));
1883 if module_struct_mut.add_field(name, func).is_some() {
1884 panic!(
1885 "function `{name}` has the same name than a field in `{rust_module_name}`",
1886 )
1887 };
1888 }
1889 }
1890
1891 // Iterate over all substructures of the module's main structure and
1892 // add any methods defined for them.
1893 module_struct_mut.enum_substructures(&mut |sub_struct| {
1894 let methods = sub_struct.protobuf_type_name().map(WasmExport::get_methods);
1895 if let Some(methods) = methods {
1896 for (name, func) in methods {
1897 let func = TypeValue::Func(Rc::new(func));
1898 if sub_struct.add_field(name, func).is_some() {
1899 panic!(
1900 "method `{name}` has the same name than a field in `{}`",
1901 sub_struct.protobuf_type_name().unwrap(),
1902 )
1903 };
1904 }
1905 }
1906 });
1907
1908 // Insert the module in the struct that contains all imported
1909 // modules. This struct contains all modules imported, from
1910 // all namespaces. Panic if the module was already in the struct.
1911 if self
1912 .root_struct
1913 .add_field(module_name, TypeValue::Struct(module_struct))
1914 .is_some()
1915 {
1916 panic!("duplicate module `{module_name}`")
1917 }
1918 }
1919
1920 let mut symbol_table =
1921 self.current_namespace.symbols.as_ref().borrow_mut();
1922
1923 // Create a symbol for the module and insert it in the symbol
1924 // table for this namespace, if it doesn't exist.
1925 if !symbol_table.contains(module_name) {
1926 symbol_table.insert(
1927 module_name,
1928 self.root_struct.lookup(module_name).unwrap(),
1929 );
1930 }
1931
1932 // Is the module banned? If yes, produce an error. Notice however that
1933 // this check is done after the module has been added to the symbol
1934 // table because we don't want additional errors due to undefined
1935 // identifiers when the banned module is used in some rule condition.
1936 if let Some((error_title, error_msg)) =
1937 self.banned_modules.get(module_name)
1938 {
1939 return Err(CustomError::build(
1940 &self.report_builder,
1941 error_title.clone(),
1942 error_msg.clone(),
1943 self.report_builder.span_to_code_loc(import.span()),
1944 ));
1945 }
1946
1947 Ok(())
1948 }
1949
1950 fn c_literal_pattern(
1951 &mut self,
1952 pattern_id: PatternId,
1953 pattern: LiteralPattern,
1954 ) {
1955 let full_word = pattern.flags.contains(PatternFlags::Fullword);
1956 let mut flags = SubPatternFlags::empty();
1957
1958 if full_word {
1959 flags.insert(SubPatternFlags::FullwordLeft);
1960 flags.insert(SubPatternFlags::FullwordRight);
1961 }
1962
1963 // Depending on the combination of `ascii` and `wide` modifiers, the
1964 // `main_patterns` vector will contain either the pattern's `ascii`
1965 // version, the `wide` version, or both. Each item in `main_patterns`
1966 // also contains the best atom for the pattern.
1967 let mut main_patterns = Vec::new();
1968 let wide_pattern;
1969
1970 if pattern.flags.contains(PatternFlags::Wide) {
1971 wide_pattern = make_wide(pattern.text.as_bytes());
1972 main_patterns.push((
1973 wide_pattern.as_slice(),
1974 best_atom_in_bytes(wide_pattern.as_slice()),
1975 flags | SubPatternFlags::Wide,
1976 ));
1977 }
1978
1979 if pattern.flags.contains(PatternFlags::Ascii) {
1980 main_patterns.push((
1981 pattern.text.as_bytes(),
1982 best_atom_in_bytes(pattern.text.as_bytes()),
1983 flags,
1984 ));
1985 }
1986
1987 for (main_pattern, best_atom, flags) in main_patterns {
1988 let pattern_lit_id = self.lit_pool.get_or_intern(main_pattern);
1989
1990 if pattern.flags.contains(PatternFlags::Xor) {
1991 // When `xor` is used, `base64`, `base64wide` and `nocase` are
1992 // not accepted.
1993 debug_assert!(!pattern.flags.contains(
1994 PatternFlags::Base64
1995 | PatternFlags::Base64Wide
1996 | PatternFlags::Nocase,
1997 ));
1998
1999 let xor_range = pattern.xor_range.clone().unwrap();
2000 self.add_sub_pattern(
2001 pattern_id,
2002 SubPattern::Xor { pattern: pattern_lit_id, flags },
2003 best_atom.xor_combinations(xor_range),
2004 SubPatternAtom::from_atom,
2005 );
2006 } else if pattern.flags.contains(PatternFlags::Nocase) {
2007 // When `nocase` is used, `base64`, `base64wide` and `xor` are
2008 // not accepted.
2009 debug_assert!(!pattern.flags.contains(
2010 PatternFlags::Base64
2011 | PatternFlags::Base64Wide
2012 | PatternFlags::Xor,
2013 ));
2014
2015 self.add_sub_pattern(
2016 pattern_id,
2017 SubPattern::Literal {
2018 pattern: pattern_lit_id,
2019 flags: flags | SubPatternFlags::Nocase,
2020 anchored_at: None,
2021 },
2022 best_atom.case_combinations(),
2023 SubPatternAtom::from_atom,
2024 );
2025 }
2026 // Used `base64`, or `base64wide`, or both.
2027 else if pattern
2028 .flags
2029 .intersects(PatternFlags::Base64 | PatternFlags::Base64Wide)
2030 {
2031 // When `base64` or `base64wide` are used, `xor`, `fullword`
2032 // and `nocase` are not accepted.
2033 debug_assert!(!pattern.flags.contains(
2034 PatternFlags::Xor
2035 | PatternFlags::Fullword
2036 | PatternFlags::Nocase,
2037 ));
2038
2039 if pattern.flags.contains(PatternFlags::Base64) {
2040 for (padding, base64_pattern) in base64_patterns(
2041 main_pattern,
2042 pattern.base64_alphabet.as_deref(),
2043 ) {
2044 let sub_pattern = if let Some(alphabet) =
2045 pattern.base64_alphabet.as_deref()
2046 {
2047 SubPattern::CustomBase64 {
2048 pattern: pattern_lit_id,
2049 alphabet: self
2050 .lit_pool
2051 .get_or_intern(alphabet),
2052 padding,
2053 }
2054 } else {
2055 SubPattern::Base64 {
2056 pattern: pattern_lit_id,
2057 padding,
2058 }
2059 };
2060
2061 self.add_sub_pattern(
2062 pattern_id,
2063 sub_pattern,
2064 iter::once({
2065 let mut atom = best_atom_in_bytes(
2066 base64_pattern.as_slice(),
2067 );
2068 // Atoms for base64 patterns are always
2069 // inexact, they require verification.
2070 atom.make_inexact();
2071 atom
2072 }),
2073 SubPatternAtom::from_atom,
2074 );
2075 }
2076 }
2077
2078 if pattern.flags.contains(PatternFlags::Base64Wide) {
2079 for (padding, base64_pattern) in base64_patterns(
2080 main_pattern,
2081 pattern.base64wide_alphabet.as_deref(),
2082 ) {
2083 let sub_pattern = if let Some(alphabet) =
2084 pattern.base64wide_alphabet.as_deref()
2085 {
2086 SubPattern::CustomBase64Wide {
2087 pattern: pattern_lit_id,
2088 alphabet: self
2089 .lit_pool
2090 .get_or_intern(alphabet),
2091 padding,
2092 }
2093 } else {
2094 SubPattern::Base64Wide {
2095 pattern: pattern_lit_id,
2096 padding,
2097 }
2098 };
2099
2100 let wide = make_wide(base64_pattern.as_slice());
2101
2102 self.add_sub_pattern(
2103 pattern_id,
2104 sub_pattern,
2105 iter::once({
2106 let mut atom =
2107 best_atom_in_bytes(wide.as_slice());
2108 // Atoms for base64 patterns are always
2109 // inexact, they require verification.
2110 atom.make_inexact();
2111 atom
2112 }),
2113 SubPatternAtom::from_atom,
2114 );
2115 }
2116 }
2117 } else {
2118 self.add_sub_pattern(
2119 pattern_id,
2120 SubPattern::Literal {
2121 pattern: pattern_lit_id,
2122 anchored_at: pattern.anchored_at,
2123 flags,
2124 },
2125 iter::once(best_atom),
2126 SubPatternAtom::from_atom,
2127 );
2128 }
2129 }
2130 }
2131
2132 fn c_regexp_pattern(
2133 &mut self,
2134 pattern_id: PatternId,
2135 pattern: RegexpPattern,
2136 span: Span,
2137 ) -> Result<(), CompileError> {
2138 // Try splitting the regexp into multiple chained sub-patterns if it
2139 // contains large gaps. For example, `{ 01 02 03 [-] 04 05 06 }` is
2140 // split into `{ 01 02 03 }` and `{ 04 05 06 }`, where `{ 04 05 06 }`
2141 // is chained to `{ 01 02 03 }`.
2142 //
2143 // If the regexp can't be split then `head` is the whole regexp.
2144 let (head, tail) = pattern.hir.split_at_large_gaps();
2145
2146 if !tail.is_empty() {
2147 // The pattern was split into multiple chained regexps.
2148 return self.c_chain(
2149 pattern_id,
2150 &head,
2151 &tail,
2152 pattern.flags,
2153 span,
2154 );
2155 }
2156
2157 if head.is_alternation_literal() {
2158 // The pattern is either a literal, or an alternation of literals.
2159 // Examples:
2160 // /foo/
2161 // /foo|bar|baz/
2162 // { 01 02 03 }
2163 // { (01 02 03 | 04 05 06 ) }
2164 return self.c_alternation_literal(
2165 pattern_id,
2166 head,
2167 pattern.anchored_at,
2168 pattern.flags,
2169 );
2170 }
2171
2172 // If this point is reached, this is a pattern that can't be split into
2173 // multiple chained patterns, and is neither a literal or alternation
2174 // of literals. Most patterns fall in this category.
2175 let mut flags = SubPatternFlags::empty();
2176
2177 if pattern.flags.contains(PatternFlags::Nocase) {
2178 flags.insert(SubPatternFlags::Nocase);
2179 }
2180
2181 if pattern.flags.contains(PatternFlags::Fullword) {
2182 flags.insert(SubPatternFlags::FullwordLeft);
2183 flags.insert(SubPatternFlags::FullwordRight);
2184 }
2185
2186 if matches!(head.is_greedy(), Some(true)) {
2187 flags.insert(SubPatternFlags::GreedyRegexp);
2188 }
2189
2190 let (atoms, is_fast_regexp) = self.c_regexp(&head, span)?;
2191
2192 if is_fast_regexp {
2193 flags.insert(SubPatternFlags::FastRegexp);
2194 }
2195
2196 if pattern.flags.contains(PatternFlags::Wide) {
2197 self.add_sub_pattern(
2198 pattern_id,
2199 SubPattern::Regexp { flags: flags | SubPatternFlags::Wide },
2200 atoms.iter().cloned().map(|atom| atom.make_wide()),
2201 SubPatternAtom::from_regexp_atom,
2202 );
2203 }
2204
2205 if pattern.flags.contains(PatternFlags::Ascii) {
2206 self.add_sub_pattern(
2207 pattern_id,
2208 SubPattern::Regexp { flags },
2209 atoms.into_iter(),
2210 SubPatternAtom::from_regexp_atom,
2211 );
2212 }
2213
2214 Ok(())
2215 }
2216
2217 fn c_alternation_literal(
2218 &mut self,
2219 pattern_id: PatternId,
2220 hir: re::hir::Hir,
2221 anchored_at: Option<usize>,
2222 flags: PatternFlags,
2223 ) -> Result<(), CompileError> {
2224 let ascii = flags.contains(PatternFlags::Ascii);
2225 let wide = flags.contains(PatternFlags::Wide);
2226 let case_insensitive = flags.contains(PatternFlags::Nocase);
2227 let full_word = flags.contains(PatternFlags::Fullword);
2228
2229 let mut flags = SubPatternFlags::empty();
2230
2231 if case_insensitive {
2232 flags.insert(SubPatternFlags::Nocase);
2233 }
2234
2235 if full_word {
2236 flags.insert(SubPatternFlags::FullwordLeft);
2237 flags.insert(SubPatternFlags::FullwordRight);
2238 }
2239
2240 let mut process_literal = |literal: &hir::Literal, wide: bool| {
2241 let pattern_lit_id =
2242 self.intern_literal(literal.0.as_bytes(), wide);
2243
2244 let best_atom = best_atom_in_bytes(
2245 self.lit_pool.get_bytes(pattern_lit_id).unwrap(),
2246 );
2247
2248 let flags =
2249 if wide { flags | SubPatternFlags::Wide } else { flags };
2250
2251 let sub_pattern = SubPattern::Literal {
2252 pattern: pattern_lit_id,
2253 anchored_at,
2254 flags,
2255 };
2256
2257 if case_insensitive {
2258 self.add_sub_pattern(
2259 pattern_id,
2260 sub_pattern,
2261 best_atom.case_combinations(),
2262 SubPatternAtom::from_atom,
2263 );
2264 } else {
2265 self.add_sub_pattern(
2266 pattern_id,
2267 sub_pattern,
2268 iter::once(best_atom),
2269 SubPatternAtom::from_atom,
2270 );
2271 }
2272 };
2273
2274 let inner;
2275
2276 let hir = if let hir::HirKind::Capture(group) = hir.kind() {
2277 group.sub.as_ref()
2278 } else {
2279 inner = hir.into_inner();
2280 &inner
2281 };
2282
2283 match hir.kind() {
2284 hir::HirKind::Literal(literal) => {
2285 if ascii {
2286 process_literal(literal, false);
2287 }
2288 if wide {
2289 process_literal(literal, true);
2290 }
2291 }
2292 hir::HirKind::Alternation(literals) => {
2293 let literals = literals
2294 .iter()
2295 .map(|l| cast!(l.kind(), hir::HirKind::Literal));
2296 for literal in literals {
2297 if ascii {
2298 process_literal(literal, false);
2299 }
2300 if wide {
2301 process_literal(literal, true);
2302 }
2303 }
2304 }
2305 _ => unreachable!(),
2306 }
2307
2308 Ok(())
2309 }
2310
2311 fn c_chain(
2312 &mut self,
2313 pattern_id: PatternId,
2314 leading: &re::hir::Hir,
2315 trailing: &[ChainedPattern],
2316 flags: PatternFlags,
2317 span: Span,
2318 ) -> Result<(), CompileError> {
2319 let ascii = flags.contains(PatternFlags::Ascii);
2320 let wide = flags.contains(PatternFlags::Wide);
2321 let case_insensitive = flags.contains(PatternFlags::Nocase);
2322 let full_word = flags.contains(PatternFlags::Fullword);
2323
2324 let mut common_flags = SubPatternFlags::empty();
2325
2326 if case_insensitive {
2327 common_flags.insert(SubPatternFlags::Nocase);
2328 }
2329
2330 if matches!(leading.is_greedy(), Some(true)) {
2331 common_flags.insert(SubPatternFlags::GreedyRegexp);
2332 }
2333
2334 let mut prev_sub_pattern_ascii = SubPatternId(0);
2335 let mut prev_sub_pattern_wide = SubPatternId(0);
2336
2337 if let hir::HirKind::Literal(literal) = leading.kind() {
2338 let mut flags = common_flags;
2339
2340 if full_word {
2341 flags.insert(SubPatternFlags::FullwordLeft);
2342 }
2343
2344 if ascii {
2345 prev_sub_pattern_ascii =
2346 self.c_literal_chain_head(pattern_id, literal, flags);
2347 }
2348
2349 if wide {
2350 prev_sub_pattern_wide = self.c_literal_chain_head(
2351 pattern_id,
2352 literal,
2353 flags | SubPatternFlags::Wide,
2354 );
2355 };
2356 } else {
2357 let mut flags = common_flags;
2358
2359 let (atoms, is_fast_regexp) =
2360 self.c_regexp(leading, span.clone())?;
2361
2362 if is_fast_regexp {
2363 flags.insert(SubPatternFlags::FastRegexp);
2364 }
2365
2366 if full_word {
2367 flags.insert(SubPatternFlags::FullwordLeft);
2368 }
2369
2370 if wide {
2371 prev_sub_pattern_wide = self.add_sub_pattern(
2372 pattern_id,
2373 SubPattern::RegexpChainHead {
2374 flags: flags | SubPatternFlags::Wide,
2375 },
2376 atoms.iter().cloned().map(|atom| atom.make_wide()),
2377 SubPatternAtom::from_regexp_atom,
2378 );
2379 }
2380
2381 if ascii {
2382 prev_sub_pattern_ascii = self.add_sub_pattern(
2383 pattern_id,
2384 SubPattern::RegexpChainHead { flags },
2385 atoms.into_iter(),
2386 SubPatternAtom::from_regexp_atom,
2387 );
2388 }
2389 }
2390
2391 for (i, p) in trailing.iter().enumerate() {
2392 let mut flags = common_flags;
2393
2394 // The last pattern in the chain has the `LastInChain` flag and
2395 // the `FullwordRight` if the original pattern was `Fullword`.
2396 // Patterns in the middle of the chain won't have either of these
2397 // flags.
2398 if i == trailing.len() - 1 {
2399 flags.insert(SubPatternFlags::LastInChain);
2400 if full_word {
2401 flags.insert(SubPatternFlags::FullwordRight);
2402 }
2403 }
2404
2405 if let hir::HirKind::Literal(literal) = p.hir.kind() {
2406 if wide {
2407 prev_sub_pattern_wide = self.c_literal_chain_tail(
2408 pattern_id,
2409 literal,
2410 prev_sub_pattern_wide,
2411 p.gap.clone(),
2412 flags | SubPatternFlags::Wide,
2413 );
2414 };
2415 if ascii {
2416 prev_sub_pattern_ascii = self.c_literal_chain_tail(
2417 pattern_id,
2418 literal,
2419 prev_sub_pattern_ascii,
2420 p.gap.clone(),
2421 flags,
2422 );
2423 }
2424 } else {
2425 if matches!(p.hir.is_greedy(), Some(true)) {
2426 flags.insert(SubPatternFlags::GreedyRegexp);
2427 }
2428
2429 let (atoms, is_fast_regexp) =
2430 self.c_regexp(&p.hir, span.clone())?;
2431
2432 if is_fast_regexp {
2433 flags.insert(SubPatternFlags::FastRegexp);
2434 }
2435
2436 if wide {
2437 prev_sub_pattern_wide = self.add_sub_pattern(
2438 pattern_id,
2439 SubPattern::RegexpChainTail {
2440 chained_to: prev_sub_pattern_wide,
2441 gap: p.gap.clone(),
2442 flags: flags | SubPatternFlags::Wide,
2443 },
2444 atoms.iter().cloned().map(|atom| atom.make_wide()),
2445 SubPatternAtom::from_regexp_atom,
2446 )
2447 }
2448
2449 if ascii {
2450 prev_sub_pattern_ascii = self.add_sub_pattern(
2451 pattern_id,
2452 SubPattern::RegexpChainTail {
2453 chained_to: prev_sub_pattern_ascii,
2454 gap: p.gap.clone(),
2455 flags,
2456 },
2457 atoms.into_iter(),
2458 SubPatternAtom::from_regexp_atom,
2459 );
2460 }
2461 }
2462 }
2463
2464 Ok(())
2465 }
2466
2467 fn c_regexp(
2468 &mut self,
2469 hir: &re::hir::Hir,
2470 span: Span,
2471 ) -> Result<(Vec<re::RegexpAtom>, bool), CompileError> {
2472 // When the `fast-regexp` feature is enabled, try to compile the regexp
2473 // for `FastVM` first, if it fails with `Error::FastIncompatible`, the
2474 // regexp is not compatible for `FastVM` and `PikeVM` must be used
2475 // instead.
2476 #[cfg(feature = "fast-regexp")]
2477 let (result, is_fast_regexp) = match re::fast::Compiler::new()
2478 .compile(hir, &mut self.re_code)
2479 {
2480 Err(re::Error::FastIncompatible) => (
2481 re::thompson::Compiler::new().compile(hir, &mut self.re_code),
2482 false,
2483 ),
2484 result => (result, true),
2485 };
2486
2487 #[cfg(not(feature = "fast-regexp"))]
2488 let (result, is_fast_regexp) = (
2489 re::thompson::Compiler::new().compile(hir, &mut self.re_code),
2490 false,
2491 );
2492
2493 let re_atoms = result.map_err(|err| {
2494 InvalidRegexp::build(
2495 &self.report_builder,
2496 err.to_string(),
2497 self.report_builder.span_to_code_loc(span.clone()),
2498 None,
2499 )
2500 })?;
2501
2502 if matches!(hir.minimum_len(), Some(0)) {
2503 return Err(InvalidRegexp::build(
2504 &self.report_builder,
2505 "this regexp can match empty strings".to_string(),
2506 self.report_builder.span_to_code_loc(span),
2507 None,
2508 ));
2509 }
2510
2511 let (slow_pattern, note) =
2512 match re_atoms.iter().map(|re_atom| re_atom.atom.len()).minmax() {
2513 // No atoms, slow pattern.
2514 MinMaxResult::NoElements => (true, None),
2515 // Only one atom of len 0.
2516 MinMaxResult::OneElement(0) => (
2517 true,
2518 Some(
2519 "this is an exceptionally extreme case that may severely degrade scanning throughput"
2520 .to_string(),
2521 ),
2522 ),
2523 // Only one atom shorter than 2 bytes, slow pattern.
2524 MinMaxResult::OneElement(len) if len < 2 => (true, None),
2525 // More than one atom, at least one is shorter than 2 bytes.
2526 MinMaxResult::MinMax(min, _) if min < 2 => (true, None),
2527 // More than 2700 atoms, all with exactly 2 bytes.
2528 // Why 2700?. The larger the number of atoms the higher the
2529 // odds of finding one of them in the data, which slows down
2530 // the scan. The regex [A-Za-z]{N,} (with N>=2) produces
2531 // (26+26)^2 = 2704 atoms. So, 2700 is large enough, but
2532 // produces a warning with the aforementioned regex.
2533 MinMaxResult::MinMax(2, 2) if re_atoms.len() > 2700 => {
2534 (true, None)
2535 }
2536 // In all other cases the pattern is not slow.
2537 _ => (false, None),
2538 };
2539
2540 if slow_pattern {
2541 if self.error_on_slow_pattern {
2542 return Err(errors::SlowPattern::build(
2543 &self.report_builder,
2544 self.report_builder.span_to_code_loc(span),
2545 note,
2546 ));
2547 } else {
2548 self.warnings.add(|| {
2549 warnings::SlowPattern::build(
2550 &self.report_builder,
2551 self.report_builder.span_to_code_loc(span),
2552 note,
2553 )
2554 });
2555 }
2556 }
2557
2558 Ok((re_atoms, is_fast_regexp))
2559 }
2560
2561 fn c_literal_chain_head(
2562 &mut self,
2563 pattern_id: PatternId,
2564 literal: &hir::Literal,
2565 flags: SubPatternFlags,
2566 ) -> SubPatternId {
2567 let pattern_lit_id = self.intern_literal(
2568 literal.0.as_bytes(),
2569 flags.contains(SubPatternFlags::Wide),
2570 );
2571 self.add_sub_pattern(
2572 pattern_id,
2573 SubPattern::LiteralChainHead { pattern: pattern_lit_id, flags },
2574 extract_atoms(
2575 self.lit_pool.get_bytes(pattern_lit_id).unwrap(),
2576 flags,
2577 ),
2578 SubPatternAtom::from_atom,
2579 )
2580 }
2581
2582 fn c_literal_chain_tail(
2583 &mut self,
2584 pattern_id: PatternId,
2585 literal: &hir::Literal,
2586 chained_to: SubPatternId,
2587 gap: ChainedPatternGap,
2588 flags: SubPatternFlags,
2589 ) -> SubPatternId {
2590 let pattern_lit_id = self.intern_literal(
2591 literal.0.as_bytes(),
2592 flags.contains(SubPatternFlags::Wide),
2593 );
2594 self.add_sub_pattern(
2595 pattern_id,
2596 SubPattern::LiteralChainTail {
2597 pattern: pattern_lit_id,
2598 chained_to,
2599 gap,
2600 flags,
2601 },
2602 extract_atoms(
2603 self.lit_pool.get_bytes(pattern_lit_id).unwrap(),
2604 flags,
2605 ),
2606 SubPatternAtom::from_atom,
2607 )
2608 }
2609}
2610
2611impl fmt::Debug for Compiler<'_> {
2612 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
2613 write!(f, "Compiler")
2614 }
2615}
2616
2617impl Default for Compiler<'_> {
2618 fn default() -> Self {
2619 Self::new()
2620 }
2621}
2622
2623/// ID associated to each identifier in the identifiers pool.
2624#[derive(Eq, PartialEq, Hash, Debug, Copy, Clone, Serialize, Deserialize)]
2625#[serde(transparent)]
2626pub(crate) struct IdentId(u32);
2627
2628impl From<u32> for IdentId {
2629 fn from(v: u32) -> Self {
2630 Self(v)
2631 }
2632}
2633
2634impl From<IdentId> for u32 {
2635 fn from(v: IdentId) -> Self {
2636 v.0
2637 }
2638}
2639
2640/// ID associated to each literal string in the literals pool.
2641#[derive(PartialEq, Debug, Copy, Clone, Serialize, Deserialize)]
2642#[serde(transparent)]
2643pub(crate) struct LiteralId(u32);
2644
2645impl From<i32> for LiteralId {
2646 fn from(v: i32) -> Self {
2647 Self(v as u32)
2648 }
2649}
2650
2651impl From<u32> for LiteralId {
2652 fn from(v: u32) -> Self {
2653 Self(v)
2654 }
2655}
2656
2657impl From<LiteralId> for u32 {
2658 fn from(v: LiteralId) -> Self {
2659 v.0
2660 }
2661}
2662
2663impl From<LiteralId> for i64 {
2664 fn from(v: LiteralId) -> Self {
2665 v.0 as i64
2666 }
2667}
2668
2669impl From<LiteralId> for u64 {
2670 fn from(v: LiteralId) -> Self {
2671 v.0 as u64
2672 }
2673}
2674
2675/// ID associated to each namespace.
2676#[derive(Copy, Clone, Debug, Eq, PartialEq, Hash, Serialize, Deserialize)]
2677#[serde(transparent)]
2678pub(crate) struct NamespaceId(i32);
2679
/// Identifier of a rule.
///
/// This is a thin wrapper around an `i32`. The default value wraps `0`.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Hash)]
pub(crate) struct RuleId(i32);

impl RuleId {
    /// Returns the [`RuleId`] that follows this one, i.e. the ID that wraps
    /// the current value plus 1.
    #[allow(dead_code)]
    pub(crate) fn next(&self) -> Self {
        let RuleId(current) = *self;
        Self(current + 1)
    }
}

/// Wraps a raw `i32` into a [`RuleId`].
impl From<i32> for RuleId {
    #[inline]
    fn from(raw: i32) -> Self {
        RuleId(raw)
    }
}

/// Builds a [`RuleId`] from a `usize`.
///
/// Panics if the value doesn't fit in an `i32`.
impl From<usize> for RuleId {
    #[inline]
    fn from(raw: usize) -> Self {
        RuleId(i32::try_from(raw).unwrap())
    }
}

/// Converts a [`RuleId`] into a `usize` with a plain `as` cast, which
/// means that negative IDs are not rejected.
impl From<RuleId> for usize {
    #[inline]
    fn from(id: RuleId) -> Self {
        id.0 as usize
    }
}

/// Extracts the raw `i32` from a [`RuleId`].
impl From<RuleId> for i32 {
    #[inline]
    fn from(id: RuleId) -> Self {
        id.0
    }
}
2721
/// Identifier of a regexp used in a rule condition.
///
/// This is a thin wrapper around an `i32`.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub(crate) struct RegexpId(i32);

/// Wraps a raw `i32` into a [`RegexpId`].
impl From<i32> for RegexpId {
    #[inline]
    fn from(raw: i32) -> Self {
        RegexpId(raw)
    }
}

/// Builds a [`RegexpId`] from a `u32`.
///
/// Panics if the value doesn't fit in an `i32`.
impl From<u32> for RegexpId {
    #[inline]
    fn from(raw: u32) -> Self {
        RegexpId(i32::try_from(raw).unwrap())
    }
}

/// Builds a [`RegexpId`] from an `i64`.
///
/// Panics if the value doesn't fit in an `i32`.
impl From<i64> for RegexpId {
    #[inline]
    fn from(raw: i64) -> Self {
        RegexpId(i32::try_from(raw).unwrap())
    }
}

/// Converts a [`RegexpId`] into a `usize` with a plain `as` cast, which
/// means that negative IDs are not rejected.
impl From<RegexpId> for usize {
    #[inline]
    fn from(id: RegexpId) -> Self {
        id.0 as usize
    }
}

/// Extracts the raw `i32` from a [`RegexpId`].
impl From<RegexpId> for i32 {
    #[inline]
    fn from(id: RegexpId) -> Self {
        id.0
    }
}

/// Converts a [`RegexpId`] into a `u32`.
///
/// Panics if the ID is negative.
impl From<RegexpId> for u32 {
    #[inline]
    fn from(id: RegexpId) -> Self {
        u32::try_from(id.0).unwrap()
    }
}
2767
2768/// ID associated to each pattern.
2769///
2770/// For each unique pattern defined in a set of YARA rules there's a PatternId
2771/// that identifies it. If two different rules define exactly the same pattern
2772/// there's a single instance of the pattern and therefore a single PatternId
2773/// shared by both rules. For example, if one rule defines `$a = "mz"` and
2774/// another one `$mz = "mz"`, the pattern `"mz"` is shared by the two rules.
2775///
2776/// However, in order to be considered the same, the following conditions must
2777/// be met:
2778///
2779/// * Both patterns must have the same modifiers (i.e: `"mz" nocase` is not the
2780/// same pattern as `"mz"`),
2781/// * Both patterns must be either non-anchored, or anchored to the same offset.
2782/// * Both patterns must have the same file size bounds (or no bounds at all).
2783#[derive(
2784 Copy, Clone, Debug, Eq, Hash, PartialEq, PartialOrd, Serialize, Deserialize,
2785)]
2786#[serde(transparent)]
2787#[derive(Ord)]
2788pub(crate) struct PatternId(i32);
2789
2790impl PatternId {
2791 #[inline]
2792 fn incr(&mut self, amount: usize) {
2793 self.0 += amount as i32;
2794 }
2795}
2796
2797impl From<i32> for PatternId {
2798 #[inline]
2799 fn from(value: i32) -> Self {
2800 Self(value)
2801 }
2802}
2803
2804impl From<usize> for PatternId {
2805 #[inline]
2806 fn from(value: usize) -> Self {
2807 Self(value as i32)
2808 }
2809}
2810
2811impl From<PatternId> for i32 {
2812 #[inline]
2813 fn from(value: PatternId) -> Self {
2814 value.0
2815 }
2816}
2817
2818impl From<PatternId> for i64 {
2819 #[inline]
2820 fn from(value: PatternId) -> Self {
2821 value.0 as i64
2822 }
2823}
2824
2825impl From<PatternId> for usize {
2826 #[inline]
2827 fn from(value: PatternId) -> Self {
2828 value.0 as usize
2829 }
2830}
2831
2832/// ID associated to each sub-pattern.
2833///
2834/// For each pattern there's one or more sub-patterns, depending on the pattern
2835/// and its modifiers. For example the pattern `"foo" ascii wide` may have one
2836/// subpattern for the ascii case and another one for the wide case.
2837#[derive(Copy, Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
2838#[serde(transparent)]
2839pub(crate) struct SubPatternId(u32);
2840
2841/// Iterator that yields the names of the modules imported by the rules.
2842pub struct Imports<'a> {
2843 iter: std::slice::Iter<'a, IdentId>,
2844 ident_pool: &'a StringPool<IdentId>,
2845}
2846
2847impl<'a> Iterator for Imports<'a> {
2848 type Item = &'a str;
2849
2850 fn next(&mut self) -> Option<Self::Item> {
2851 self.iter.next().map(|id| self.ident_pool.get(*id).unwrap())
2852 }
2853}
2854
2855bitflags! {
2856 /// Flags associated to some kinds of [`SubPattern`].
2857 #[derive(Debug, Clone, Copy, Hash, Serialize, Deserialize, PartialEq, Eq)]
2858 pub struct SubPatternFlags: u16 {
2859 const Wide = 0x01;
2860 const Nocase = 0x02;
2861 // Indicates that the pattern is the last one in chain. Applies only
2862 // to chained sub-patterns.
2863 const LastInChain = 0x04;
2864 const FullwordLeft = 0x08;
2865 const FullwordRight = 0x10;
2866 // Indicates that the pattern is a greedy regexp. Apply only to regexp
2867 // sub-patterns, or to any sub-pattern is part of chain that corresponds
2868 // to a greedy regexp.
2869 const GreedyRegexp = 0x20;
2870 // Indicates that the pattern is a fast regexp. A fast regexp is one
2871 // that can be matched by the FastVM.
2872 const FastRegexp = 0x40;
2873 }
2874}
2875
2876/// A sub-pattern in the compiled rules.
2877///
2878/// Each pattern in a rule has one or more associated sub-patterns. For
2879/// example, the pattern `$a = "foo" ascii wide` has a sub-pattern for the
2880/// ASCII variant of "foo", and another one for the wide variant.
2881///
2882/// Also, each [`Atom`] is associated to a [`SubPattern`]. When the atom is
2883/// found in the scanned data by the Aho-Corasick algorithm, the scanner
2884/// verifies that the sub-pattern actually matches.
2885#[derive(Serialize, Deserialize)]
2886pub(crate) enum SubPattern {
2887 Literal {
2888 pattern: LiteralId,
2889 anchored_at: Option<usize>,
2890 flags: SubPatternFlags,
2891 },
2892
2893 LiteralChainHead {
2894 pattern: LiteralId,
2895 flags: SubPatternFlags,
2896 },
2897
2898 LiteralChainTail {
2899 pattern: LiteralId,
2900 chained_to: SubPatternId,
2901 gap: ChainedPatternGap,
2902 flags: SubPatternFlags,
2903 },
2904
2905 Regexp {
2906 flags: SubPatternFlags,
2907 },
2908
2909 RegexpChainHead {
2910 flags: SubPatternFlags,
2911 },
2912
2913 RegexpChainTail {
2914 chained_to: SubPatternId,
2915 gap: ChainedPatternGap,
2916 flags: SubPatternFlags,
2917 },
2918
2919 Xor {
2920 pattern: LiteralId,
2921 flags: SubPatternFlags,
2922 },
2923
2924 Base64 {
2925 pattern: LiteralId,
2926 padding: u8,
2927 },
2928
2929 Base64Wide {
2930 pattern: LiteralId,
2931 padding: u8,
2932 },
2933
2934 CustomBase64 {
2935 pattern: LiteralId,
2936 alphabet: LiteralId,
2937 padding: u8,
2938 },
2939
2940 CustomBase64Wide {
2941 pattern: LiteralId,
2942 alphabet: LiteralId,
2943 padding: u8,
2944 },
2945}
2946
2947impl SubPattern {
2948 /// If this sub-pattern is chained to another one, returns the
2949 /// [`SubPatternId`] associated to this other pattern.
2950 pub fn chained_to(&self) -> Option<SubPatternId> {
2951 match self {
2952 SubPattern::LiteralChainTail { chained_to, .. }
2953 | SubPattern::RegexpChainTail { chained_to, .. } => {
2954 Some(*chained_to)
2955 }
2956 _ => None,
2957 }
2958 }
2959}
2960
2961/// A snapshot that represents the state of the compiler at a particular moment.
2962#[derive(Debug, PartialEq, Eq)]
2963struct Snapshot {
2964 next_pattern_id: PatternId,
2965 rules_len: usize,
2966 atoms_len: usize,
2967 re_code_len: usize,
2968 sub_patterns_len: usize,
2969 symbol_table_len: usize,
2970}
2971
2972/// Represents a list of warnings.
2973///
2974/// This is a wrapper around a `Vec<Warning>` that contains additional logic
2975/// for limiting the number of warnings stored in the vector and silencing some
2976/// warnings types.
2977pub(crate) struct Warnings {
2978 warnings: Vec<Warning>,
2979 /// Maximum number of warnings that will be stored in `warnings`.
2980 max_warnings: usize,
2981 /// Warnings that are globally disabled.
2982 disabled_warnings: HashSet<String>,
2983 /// Warnings that are suppressed for a specific code span. Keys are
2984 /// warning identifiers, and values are the code spans in which the
2985 /// warning is disabled.
2986 suppressed_warnings: HashMap<String, Vec<Span>>,
2987}
2988
2989impl Default for Warnings {
2990 fn default() -> Self {
2991 Self {
2992 warnings: Vec::new(),
2993 max_warnings: 100,
2994 disabled_warnings: HashSet::default(),
2995 suppressed_warnings: HashMap::default(),
2996 }
2997 }
2998}
2999
3000impl Warnings {
3001 /// Adds the warning returned by `f` to the list.
3002 ///
3003 /// If the maximum number of warnings has been reached the warning is not
3004 /// added.
3005 #[inline]
3006 pub fn add(&mut self, f: impl FnOnce() -> Warning) {
3007 if self.warnings.len() < self.max_warnings {
3008 let warning = f();
3009 let mut warn = !self.disabled_warnings.contains(warning.code());
3010
3011 if warn {
3012 if let Some(spans) =
3013 self.suppressed_warnings.get(warning.code())
3014 {
3015 'l: for disabled_span in spans {
3016 for label in warning.labels() {
3017 if disabled_span.contains(label.span()) {
3018 warn = false;
3019 break 'l;
3020 }
3021 }
3022 }
3023 }
3024 }
3025
3026 if warn {
3027 self.warnings.push(warning);
3028 }
3029 }
3030 }
3031
3032 /// Returns true if the given code is a valid warning code.
3033 pub fn is_valid_code(code: &str) -> bool {
3034 Warning::all_codes().contains(&code)
3035 }
3036
3037 /// Enables or disables a specific warning identified by `code`.
3038 ///
3039 /// Returns `true` if the warning was previously enabled, or `false` if
3040 /// otherwise. Returns an error if the code doesn't correspond to any
3041 /// of the existing warnings.
3042 #[inline]
3043 pub fn switch_warning(
3044 &mut self,
3045 code: &str,
3046 enabled: bool,
3047 ) -> Result<bool, InvalidWarningCode> {
3048 if !Self::is_valid_code(code) {
3049 return Err(InvalidWarningCode::new(code.to_string()));
3050 }
3051 if enabled {
3052 Ok(!self.disabled_warnings.remove(code))
3053 } else {
3054 Ok(self.disabled_warnings.insert(code.to_string()))
3055 }
3056 }
3057
3058 /// Enable or disables all warnings.
3059 pub fn switch_all_warnings(&mut self, enabled: bool) {
3060 if enabled {
3061 self.disabled_warnings.clear();
3062 } else {
3063 for c in Warning::all_codes() {
3064 self.disabled_warnings.insert(c.to_string());
3065 }
3066 }
3067 }
3068
3069 /// Clear suppressed warnings.
3070 pub fn clear_suppressed(&mut self) {
3071 self.suppressed_warnings.clear();
3072 }
3073
3074 /// Suppress the warning with the given code, for the given span.
3075 pub fn suppress(&mut self, code: &str, span: Span) {
3076 self.suppressed_warnings
3077 .entry(code.to_string())
3078 .or_default()
3079 .push(span);
3080 }
3081
3082 #[inline]
3083 pub fn as_slice(&self) -> &[Warning] {
3084 self.warnings.as_slice()
3085 }
3086}
3087
3088impl From<Warnings> for Vec<Warning> {
3089 fn from(value: Warnings) -> Self {
3090 value.warnings
3091 }
3092}