1use std::fmt;
2use std::io::{BufWriter, Read, Write};
3use std::ops::{Bound, RangeBounds};
4use std::slice::Iter;
5#[cfg(feature = "logging")]
6use std::time::Instant;
7
8use aho_corasick::AhoCorasick;
9#[cfg(feature = "logging")]
10use log::*;
11use regex_automata::meta::Regex;
12use rustc_hash::FxHashMap;
13use serde::{Deserialize, Deserializer, Serialize, Serializer};
14
15use crate::compiler::atoms::Atom;
16use crate::compiler::errors::SerializationError;
17use crate::compiler::report::CodeLoc;
18use crate::compiler::warnings::Warning;
19use crate::compiler::{
20 IdentId, Imports, LiteralId, NamespaceId, PatternId, RegexpId, RuleId,
21 SubPattern, SubPatternId,
22};
23use crate::models::PatternKind;
24use crate::re::{BckCodeLoc, FwdCodeLoc, RegexpAtom};
25use crate::string_pool::{BStringPool, StringPool};
26use crate::{re, types, wasm, Rule};
27
28#[derive(Serialize, Deserialize)]
32pub struct Rules {
33 pub(in crate::compiler) ident_pool: StringPool<IdentId>,
37
38 pub(in crate::compiler) regexp_pool: StringPool<RegexpId>,
43
44 pub(in crate::compiler) relaxed_re_syntax: bool,
47
48 pub(in crate::compiler) lit_pool: BStringPool<LiteralId>,
52
53 pub(in crate::compiler) wasm_mod: Vec<u8>,
55
56 #[serde(
60 serialize_with = "serialize_wasm_mod",
61 deserialize_with = "deserialize_wasm_mod"
62 )]
63 pub(in crate::compiler) compiled_wasm_mod: Option<wasmtime::Module>,
64
65 pub(in crate::compiler) imported_modules: Vec<IdentId>,
68
69 pub(in crate::compiler) rules: Vec<RuleInfo>,
72
73 pub(in crate::compiler) num_patterns: usize,
76
77 pub(in crate::compiler) sub_patterns: Vec<(PatternId, SubPattern)>,
88
89 pub(in crate::compiler) filesize_bounds:
100 FxHashMap<PatternId, FilesizeBounds>,
101
102 pub(in crate::compiler) anchored_sub_patterns: Vec<SubPatternId>,
106
107 pub(in crate::compiler) atoms: Vec<SubPatternAtom>,
111
112 pub(in crate::compiler) re_code: Vec<u8>,
118
119 pub(in crate::compiler) serialized_globals: Vec<u8>,
123
124 #[serde(skip)]
131 pub(in crate::compiler) ac: Option<AhoCorasick>,
132
133 #[serde(skip)]
137 pub(in crate::compiler) warnings: Vec<Warning>,
138}
139
140impl Rules {
141 pub fn imports(&self) -> Imports<'_> {
144 Imports {
145 iter: self.imported_modules.iter(),
146 ident_pool: &self.ident_pool,
147 }
148 }
149
150 pub fn warnings(&self) -> &[Warning] {
152 self.warnings.as_slice()
153 }
154
155 pub fn serialize(&self) -> Result<Vec<u8>, SerializationError> {
160 let mut bytes = Vec::new();
161 self.serialize_into(&mut bytes)?;
162 Ok(bytes)
163 }
164
165 pub fn deserialize<B>(bytes: B) -> Result<Self, SerializationError>
168 where
169 B: AsRef<[u8]>,
170 {
171 let bytes = bytes.as_ref();
172 let magic = b"YARA-X";
173
174 if bytes.len() < magic.len() || &bytes[0..magic.len()] != magic {
175 return Err(SerializationError::InvalidFormat);
176 }
177
178 #[cfg(feature = "logging")]
179 let start = Instant::now();
180
181 let (mut rules, _len): (Self, usize) =
183 bincode::serde::decode_from_slice(
184 &bytes[magic.len()..],
185 bincode::config::standard(),
186 )?;
187
188 #[cfg(feature = "logging")]
189 info!("Deserialization time: {:?}", Instant::elapsed(&start));
190
191 if rules.compiled_wasm_mod.is_none() {
204 #[cfg(feature = "logging")]
205 let start = Instant::now();
206
207 rules.compiled_wasm_mod = Some(wasmtime::Module::from_binary(
208 wasm::get_engine(),
209 rules.wasm_mod.as_slice(),
210 )?);
211
212 #[cfg(feature = "logging")]
213 info!("WASM build time: {:?}", Instant::elapsed(&start));
214 }
215
216 rules.build_ac_automaton();
217
218 Ok(rules)
219 }
220
221 pub fn serialize_into<W>(
223 &self,
224 writer: W,
225 ) -> Result<(), SerializationError>
226 where
227 W: Write,
228 {
229 let mut writer = BufWriter::new(writer);
230
231 writer.write_all(b"YARA-X")?;
233
234 bincode::serde::encode_into_std_write(
235 self,
236 &mut writer,
237 bincode::config::standard(),
238 )?;
239
240 Ok(())
241 }
242
243 pub fn deserialize_from<R>(
245 mut reader: R,
246 ) -> Result<Self, SerializationError>
247 where
248 R: Read,
249 {
250 let mut bytes = Vec::new();
251 let _ = reader.read_to_end(&mut bytes)?;
252 Self::deserialize(bytes)
253 }
254
255 pub fn iter(&self) -> RulesIter<'_> {
275 RulesIter { rules: self, iterator: self.rules.iter() }
276 }
277
278 pub(crate) fn get(&self, rule_id: RuleId) -> &RuleInfo {
284 self.rules.get(rule_id.0 as usize).unwrap()
285 }
286
287 #[inline]
293 pub(crate) fn get_regexp(&self, regexp_id: RegexpId) -> Regex {
294 let re = types::Regexp::new(self.regexp_pool.get(regexp_id).unwrap());
295
296 let parser = re::parser::Parser::new()
297 .relaxed_re_syntax(self.relaxed_re_syntax);
298
299 let hir = parser.parse(&re).unwrap().into_inner();
300
301 let config = regex_automata::meta::Config::new()
305 .nfa_size_limit(Some(50 * 1024 * 1024));
306
307 regex_automata::meta::Builder::new()
308 .configure(config)
309 .build_from_hir(&hir)
310 .unwrap_or_else(|err| {
311 panic!("error compiling regex `{}`: {:#?}", re.as_str(), err)
312 })
313 }
314
315 #[inline]
317 pub(crate) fn get_sub_pattern(
318 &self,
319 sub_pattern_id: SubPatternId,
320 ) -> &(PatternId, SubPattern) {
321 unsafe { self.sub_patterns.get_unchecked(sub_pattern_id.0 as usize) }
322 }
323
324 #[cfg(feature = "logging")]
331 pub(crate) fn get_rule_and_pattern_by_sub_pattern_id(
332 &self,
333 sub_pattern_id: SubPatternId,
334 ) -> Option<(RuleId, IdentId)> {
335 let (target_pattern_id, _) = self.get_sub_pattern(sub_pattern_id);
336 for (rule_id, rule) in self.rules.iter().enumerate() {
337 for p in &rule.patterns {
338 if p.pattern_id == *target_pattern_id {
339 return Some((rule_id.into(), p.ident_id));
340 };
341 }
342 }
343 None
344 }
345
346 #[cfg(feature = "rules-profiling")]
347 #[inline]
348 pub(crate) fn rules(&self) -> &[RuleInfo] {
349 self.rules.as_slice()
350 }
351
352 #[inline]
353 pub(crate) fn atoms(&self) -> &[SubPatternAtom] {
354 self.atoms.as_slice()
355 }
356
357 #[inline]
358 pub(crate) fn anchored_sub_patterns(&self) -> &[SubPatternId] {
359 self.anchored_sub_patterns.as_slice()
360 }
361
362 #[inline]
363 pub(crate) fn re_code(&self) -> &[u8] {
364 self.re_code.as_slice()
365 }
366
367 #[inline]
368 pub(crate) fn num_rules(&self) -> usize {
369 self.rules.len()
370 }
371
372 #[inline]
373 pub(crate) fn num_patterns(&self) -> usize {
374 self.num_patterns
375 }
376
377 #[inline]
380 pub(crate) fn ac_automaton(&self) -> &AhoCorasick {
381 self.ac.as_ref().expect("Aho-Corasick automaton not compiled")
382 }
383
384 pub(crate) fn build_ac_automaton(&mut self) {
385 if self.ac.is_some() {
386 return;
387 }
388
389 #[cfg(feature = "logging")]
390 let start = Instant::now();
391
392 #[cfg(feature = "logging")]
393 let mut num_atoms = [0_usize; 6];
394
395 let atoms = self.atoms.iter().map(|x| {
396 #[cfg(feature = "logging")]
397 {
398 match x.atom.len() {
399 atom_len @ 0..=4 => num_atoms[atom_len] += 1,
400 _ => num_atoms[num_atoms.len() - 1] += 1,
401 }
402
403 if x.atom.len() < 2 {
404 let (rule_id, pattern_ident_id) = self
405 .get_rule_and_pattern_by_sub_pattern_id(
406 x.sub_pattern_id,
407 )
408 .unwrap();
409
410 let rule = self.get(rule_id);
411
412 info!(
413 "Very short atom in pattern `{}` in rule `{}:{}` (length: {})",
414 self.ident_pool.get(pattern_ident_id).unwrap(),
415 self.ident_pool
416 .get(rule.namespace_ident_id)
417 .unwrap(),
418 self.ident_pool.get(rule.ident_id).unwrap(),
419 x.atom.len()
420 );
421 }
422 }
423
424 x.atom.as_ref()
425 });
426
427 self.ac = Some(
428 AhoCorasick::new(atoms)
429 .expect("failed to build Aho-Corasick automaton"),
430 );
431
432 #[cfg(feature = "logging")]
433 {
434 info!(
435 "Aho-Corasick automaton build time: {:?}",
436 Instant::elapsed(&start)
437 );
438
439 info!("Number of rules: {}", self.num_rules());
440 info!("Number of patterns: {}", self.num_patterns());
441 info!(
442 "Number of anchored sub-patterns: {}",
443 self.anchored_sub_patterns.len()
444 );
445 info!("Number of atoms: {}", self.atoms.len());
446 info!("Atoms with len = 0: {}", num_atoms[0]);
447 info!("Atoms with len = 1: {}", num_atoms[1]);
448 info!("Atoms with len = 2: {}", num_atoms[2]);
449 info!("Atoms with len = 3: {}", num_atoms[3]);
450 info!("Atoms with len = 4: {}", num_atoms[4]);
451 info!("Atoms with len > 4: {}", num_atoms[5]);
452 }
453 }
454
455 #[inline]
456 pub(crate) fn lit_pool(&self) -> &BStringPool<LiteralId> {
457 &self.lit_pool
458 }
459
460 #[inline]
461 pub(crate) fn ident_pool(&self) -> &StringPool<IdentId> {
462 &self.ident_pool
463 }
464
465 #[inline]
466 pub(crate) fn globals(&self) -> types::Struct {
467 let (globals, _): (types::Struct, usize) =
468 bincode::serde::decode_from_slice(
469 self.serialized_globals.as_slice(),
470 bincode::config::standard(),
471 )
472 .expect("error deserializing global variables");
473 globals
474 }
475
476 #[inline]
477 pub(crate) fn wasm_mod(&self) -> &wasmtime::Module {
478 self.compiled_wasm_mod.as_ref().unwrap()
479 }
480
481 #[inline]
482 pub(crate) fn filesize_bounds(
483 &self,
484 pattern_id: PatternId,
485 ) -> Option<&FilesizeBounds> {
486 self.filesize_bounds.get(&pattern_id)
487 }
488}
489
/// Serializes the compiled WASM module as an optional byte string.
///
/// This variant is used when the `native-code-serialization` feature is
/// enabled. `None` is serialized as such; a module is serialized as the
/// bytes returned by `wasmtime::Module::serialize`, and any failure there is
/// reported as a custom serde error carrying the error's message.
#[cfg(feature = "native-code-serialization")]
fn serialize_wasm_mod<S>(
    wasm_mod: &Option<wasmtime::Module>,
    serializer: S,
) -> Result<S::Ok, S::Error>
where
    S: Serializer,
{
    match wasm_mod {
        None => serializer.serialize_none(),
        Some(module) => {
            let native_code = module
                .serialize()
                .map_err(|err| serde::ser::Error::custom(err.to_string()))?;

            serializer.serialize_some(native_code.as_slice())
        }
    }
}
508
509#[cfg(not(feature = "native-code-serialization"))]
510fn serialize_wasm_mod<S>(
511 _wasm_mod: &Option<wasmtime::Module>,
512 serializer: S,
513) -> Result<S::Ok, S::Error>
514where
515 S: Serializer,
516{
517 serializer.serialize_none()
518}
519
520pub fn deserialize_wasm_mod<'de, D>(
521 deserializer: D,
522) -> Result<Option<wasmtime::Module>, D::Error>
523where
524 D: Deserializer<'de>,
525{
526 let bytes: Option<&[u8]> = Deserialize::deserialize(deserializer)?;
527 let module = if let Some(bytes) = bytes {
528 unsafe {
529 wasmtime::Module::deserialize(wasm::get_engine(), bytes).ok()
530 }
531 } else {
532 None
533 };
534
535 Ok(module)
536}
537
538pub struct RulesIter<'a> {
540 rules: &'a Rules,
541 iterator: Iter<'a, RuleInfo>,
542}
543
544impl<'a> Iterator for RulesIter<'a> {
545 type Item = Rule<'a, 'a>;
546
547 fn next(&mut self) -> Option<Self::Item> {
548 Some(Rule {
549 ctx: None,
550 rules: self.rules,
551 rule_info: self.iterator.next()?,
552 })
553 }
554}
555
556impl ExactSizeIterator for RulesIter<'_> {
557 #[inline]
558 fn len(&self) -> usize {
559 self.iterator.len()
560 }
561}
562
563impl fmt::Debug for Rules {
564 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
565 for (id, rule) in self.rules.iter().enumerate() {
566 let name = self.ident_pool.get(rule.ident_id).unwrap();
567 let namespace =
568 self.ident_pool.get(rule.namespace_ident_id).unwrap();
569 writeln!(f, "RuleId({id})")?;
570 writeln!(f, " namespace: {namespace}")?;
571 writeln!(f, " name: {name}")?;
572 writeln!(f, " patterns:")?;
573 for pattern in &rule.patterns {
574 let ident = self.ident_pool.get(pattern.ident_id).unwrap();
575 writeln!(f, " {:?} {ident} ", pattern.pattern_id)?;
576 }
577 }
578
579 for (id, (pattern_id, _)) in self.sub_patterns.iter().enumerate() {
580 writeln!(f, "SubPatternId({id}) -> {pattern_id:?}")?;
581 }
582
583 Ok(())
584 }
585}
586
587#[derive(Serialize, Deserialize)]
589pub(crate) enum MetaValue {
590 Bool(bool),
591 Integer(i64),
592 Float(f64),
593 String(LiteralId),
594 Bytes(LiteralId),
595}
596
597#[derive(Serialize, Deserialize)]
599pub(crate) struct RuleInfo {
600 pub namespace_id: NamespaceId,
602 pub namespace_ident_id: IdentId,
604 pub ident_id: IdentId,
606 pub tags: Vec<IdentId>,
608 #[serde(skip)]
613 pub ident_ref: CodeLoc,
614 pub metadata: Vec<(IdentId, MetaValue)>,
616 pub patterns: Vec<PatternInfo>,
619 pub num_private_patterns: usize,
622 pub is_global: bool,
624 pub is_private: bool,
626}
627
628#[derive(Serialize, Deserialize)]
630pub(crate) struct PatternInfo {
631 pub pattern_id: PatternId,
633 pub ident_id: IdentId,
635 pub kind: PatternKind,
637 pub is_private: bool,
639}
640
641#[derive(Debug, PartialEq, Serialize, Deserialize, Clone, Hash, Eq)]
658pub(crate) struct FilesizeBounds {
659 start: Bound<i64>,
660 end: Bound<i64>,
661}
662
663impl Default for FilesizeBounds {
664 fn default() -> Self {
665 Self { start: Bound::Unbounded, end: Bound::Unbounded }
666 }
667}
668
669impl<T: RangeBounds<i64>> From<T> for FilesizeBounds {
670 fn from(value: T) -> Self {
671 Self {
672 start: value.start_bound().cloned(),
673 end: value.end_bound().cloned(),
674 }
675 }
676}
677
678impl FilesizeBounds {
679 pub fn unbounded(&self) -> bool {
680 matches!(self.start, Bound::Unbounded)
681 && matches!(self.end, Bound::Unbounded)
682 }
683
684 pub fn contains(&self, value: i64) -> bool {
685 let start_ok = match self.start {
686 Bound::Included(start) => value >= start,
687 Bound::Excluded(start) => value > start,
688 Bound::Unbounded => true,
689 };
690
691 let end_ok = match self.end {
692 Bound::Included(end) => value <= end,
693 Bound::Excluded(end) => value < end,
694 Bound::Unbounded => true,
695 };
696
697 start_ok && end_ok
698 }
699 pub fn max_start(&mut self, bound: Bound<i64>) -> &mut Self {
700 match (&self.start, &bound) {
701 (Bound::Included(current), Bound::Included(new)) => {
702 if new > current {
703 self.start = Bound::Included(*new);
704 }
705 }
706 (Bound::Included(current), Bound::Excluded(new)) => {
707 if new >= current {
708 self.start = Bound::Excluded(*new);
709 }
710 }
711 (Bound::Excluded(current), Bound::Included(new)) => {
712 if new > current {
713 self.start = Bound::Included(*new);
714 }
715 }
716 (Bound::Excluded(current), Bound::Excluded(new)) => {
717 if new > current {
718 self.start = Bound::Excluded(*new);
719 }
720 }
721 (Bound::Unbounded, new) => {
722 self.start = *new;
723 }
724 (_, Bound::Unbounded) => {}
725 }
726 self
727 }
728
729 pub fn min_end(&mut self, bound: Bound<i64>) -> &mut Self {
730 match (&self.end, &bound) {
731 (Bound::Included(current), Bound::Included(new)) => {
732 if new < current {
733 self.end = Bound::Included(*new);
734 }
735 }
736 (Bound::Included(current), Bound::Excluded(new)) => {
737 if new <= current {
738 self.end = Bound::Excluded(*new);
739 }
740 }
741 (Bound::Excluded(current), Bound::Included(new)) => {
742 if new < current {
743 self.end = Bound::Included(*new);
744 }
745 }
746 (Bound::Excluded(current), Bound::Excluded(new)) => {
747 if new < current {
748 self.end = Bound::Excluded(*new)
749 }
750 }
751 (Bound::Unbounded, new) => {
752 self.end = *new;
753 }
754 (_, Bound::Unbounded) => {}
755 }
756 self
757 }
758}
759
760#[derive(Serialize, Deserialize)]
767pub(crate) struct SubPatternAtom {
768 sub_pattern_id: SubPatternId,
771 atom: Atom,
773 fwd_code: Option<FwdCodeLoc>,
775 bck_code: Option<BckCodeLoc>,
777}
778
779impl SubPatternAtom {
780 #[inline]
781 pub(crate) fn from_atom(sub_pattern_id: SubPatternId, atom: Atom) -> Self {
782 Self { sub_pattern_id, atom, bck_code: None, fwd_code: None }
783 }
784
785 pub(crate) fn from_regexp_atom(
786 sub_pattern_id: SubPatternId,
787 value: RegexpAtom,
788 ) -> Self {
789 Self {
790 sub_pattern_id,
791 atom: value.atom,
792 fwd_code: value.fwd_code,
793 bck_code: value.bck_code,
794 }
795 }
796
797 #[inline]
798 pub(crate) fn sub_pattern_id(&self) -> SubPatternId {
799 self.sub_pattern_id
800 }
801
802 #[cfg(feature = "exact-atoms")]
803 #[inline]
804 pub(crate) fn is_exact(&self) -> bool {
805 self.atom.is_exact()
806 }
807
808 #[inline]
809 pub(crate) fn len(&self) -> usize {
810 self.atom.len()
811 }
812
813 #[inline]
814 pub(crate) fn backtrack(&self) -> usize {
815 self.atom.backtrack() as usize
816 }
817
818 #[inline]
819 pub(crate) fn as_slice(&self) -> &[u8] {
820 self.atom.as_ref()
821 }
822
823 #[inline]
824 pub(crate) fn fwd_code(&self) -> Option<FwdCodeLoc> {
825 self.fwd_code
826 }
827
828 #[inline]
829 pub(crate) fn bck_code(&self) -> Option<BckCodeLoc> {
830 self.bck_code
831 }
832}