use std::fmt;
use std::io::{BufWriter, Read, Write};
use std::ops::{Bound, RangeBounds};
use std::slice::Iter;
#[cfg(feature = "logging")]
use std::time::Instant;
use aho_corasick::AhoCorasick;
#[cfg(feature = "logging")]
use log::*;
use regex_automata::meta::Regex;
use rustc_hash::FxHashMap;
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use crate::compiler::atoms::Atom;
use crate::compiler::errors::SerializationError;
use crate::compiler::report::CodeLoc;
use crate::compiler::warnings::Warning;
use crate::compiler::{
IdentId, Imports, LiteralId, NamespaceId, PatternId, RegexpId, RuleId,
SubPattern, SubPatternId,
};
use crate::models::PatternKind;
use crate::re::{BckCodeLoc, FwdCodeLoc, RegexpAtom};
use crate::string_pool::{BStringPool, StringPool};
use crate::{Rule, re, types, wasm};
/// Byte sequence written at the very beginning of serialized rules.
/// `Rules::deserialize` rejects any input that does not start with it.
const MAGIC: &[u8] = b"YARA-X\0\0";
/// Version of the serialization format. It is written right after [`MAGIC`]
/// as a little-endian `u32`, and `Rules::deserialize` fails with
/// `SerializationError::InvalidVersion` when the stored value differs.
const SERIALIZATION_VERSION: u32 = 1;
/// A set of compiled rules.
///
/// The structure is serialized with `serde`; fields marked `#[serde(skip)]`
/// are not part of the serialized form.
#[derive(Serialize, Deserialize)]
pub struct Rules {
/// Pool of identifier strings, looked up by [`IdentId`] (rule names,
/// namespace names, pattern identifiers, imported module names).
pub(in crate::compiler) ident_pool: StringPool<IdentId>,
/// Pool with the source of regular expressions, looked up by [`RegexpId`].
pub(in crate::compiler) regexp_pool: StringPool<RegexpId>,
/// Flag forwarded to the regexp parser when a regexp from `regexp_pool`
/// is parsed again in `get_regexp`.
pub(in crate::compiler) relaxed_re_syntax: bool,
/// Pool of byte strings looked up by [`LiteralId`].
pub(in crate::compiler) lit_pool: BStringPool<LiteralId>,
/// Binary form of the WASM module. It is used for building
/// `compiled_wasm_mod` when deserialization did not produce one.
pub(in crate::compiler) wasm_mod: Vec<u8>,
/// Compiled WASM module. How (and whether) it is serialized is decided by
/// `serialize_wasm_mod` / `deserialize_wasm_mod`.
#[serde(
serialize_with = "serialize_wasm_mod",
deserialize_with = "deserialize_wasm_mod"
)]
pub(in crate::compiler) compiled_wasm_mod: Option<wasm::runtime::Module>,
/// Identifiers of the modules listed by `Rules::imports`.
pub(in crate::compiler) imported_modules: Vec<IdentId>,
/// Information about each rule; a [`RuleId`] is used as an index here.
pub(in crate::compiler) rules: Vec<RuleInfo>,
/// Value returned by `Rules::num_patterns`.
pub(in crate::compiler) num_patterns: usize,
/// Sub-patterns, each paired with the [`PatternId`] it belongs to. A
/// [`SubPatternId`] is used as an index into this vector.
pub(in crate::compiler) sub_patterns: Vec<(PatternId, SubPattern)>,
/// File size bounds associated with individual patterns.
pub(in crate::compiler) filesize_bounds:
FxHashMap<PatternId, FilesizeBounds>,
/// Identifiers of the anchored sub-patterns.
// NOTE(review): the exact meaning of "anchored" is defined elsewhere.
pub(in crate::compiler) anchored_sub_patterns: Vec<SubPatternId>,
/// Atoms fed to the Aho-Corasick automaton, in the order in which they
/// are passed to `AhoCorasick::new`.
pub(in crate::compiler) atoms: Vec<SubPatternAtom>,
/// Raw bytes returned by `Rules::re_code`.
// NOTE(review): presumably the code referenced by `FwdCodeLoc` and
// `BckCodeLoc` — confirm.
pub(in crate::compiler) re_code: Vec<u8>,
/// Global variables, stored as a bincode-encoded `types::Struct` that is
/// decoded on every call to `Rules::globals`.
pub(in crate::compiler) serialized_globals: Vec<u8>,
/// Aho-Corasick automaton built from `atoms`. Not serialized; it is
/// (re)built by `Rules::build_ac_automaton`.
#[serde(skip)]
pub(in crate::compiler) ac: Option<AhoCorasick>,
/// Warnings exposed through `Rules::warnings`. Not serialized.
#[serde(skip)]
pub(in crate::compiler) warnings: Vec<Warning>,
}
impl Rules {
pub fn imports(&self) -> Imports<'_> {
Imports {
iter: self.imported_modules.iter(),
ident_pool: &self.ident_pool,
}
}
/// Returns the warnings stored alongside these rules.
///
/// The `warnings` field is skipped during serialization, so rules obtained
/// via deserialization start with that field's default value.
pub fn warnings(&self) -> &[Warning] {
    &self.warnings
}
/// Serializes the rules into a freshly allocated byte vector.
///
/// This is a convenience wrapper around [`Rules::serialize_into`].
///
/// # Errors
///
/// Propagates any [`SerializationError`] returned by `serialize_into`.
pub fn serialize(&self) -> Result<Vec<u8>, SerializationError> {
    let mut buffer = Vec::new();
    self.serialize_into(&mut buffer).map(|()| buffer)
}
/// Deserializes rules from a byte slice produced by [`Rules::serialize`]
/// or [`Rules::serialize_into`].
///
/// The input must start with `MAGIC`, followed by the format version as a
/// little-endian `u32`, followed by the bincode-encoded rules.
///
/// # Errors
///
/// * `SerializationError::InvalidFormat` if the input is too short or does
///   not start with `MAGIC`.
/// * `SerializationError::InvalidVersion` if the stored version is not
///   `SERIALIZATION_VERSION`.
/// * Any error produced while decoding the rules or while building the
///   WASM module from its binary form.
pub fn deserialize<B>(bytes: B) -> Result<Self, SerializationError>
where
    B: AsRef<[u8]>,
{
    // Peel the header off the front of the input, piece by piece. Running
    // out of bytes at any point means the data is not in our format.
    let payload = bytes
        .as_ref()
        .strip_prefix(MAGIC)
        .ok_or(SerializationError::InvalidFormat)?;

    let (version_bytes, payload) = payload
        .split_first_chunk::<{ size_of::<u32>() }>()
        .ok_or(SerializationError::InvalidFormat)?;

    let actual = u32::from_le_bytes(*version_bytes);

    if actual != SERIALIZATION_VERSION {
        return Err(SerializationError::InvalidVersion {
            expected: SERIALIZATION_VERSION,
            actual,
        });
    }

    #[cfg(feature = "logging")]
    let decode_timer = Instant::now();

    let (mut rules, _consumed): (Self, usize) =
        bincode::serde::decode_from_slice(
            payload,
            bincode::config::standard(),
        )?;

    #[cfg(feature = "logging")]
    info!("Deserialization time: {:?}", Instant::elapsed(&decode_timer));

    // When the serialized data carried no usable compiled module, build
    // one from the WASM binary.
    if rules.compiled_wasm_mod.is_none() {
        #[cfg(feature = "logging")]
        let build_timer = Instant::now();

        let module = wasm::runtime::Module::from_binary(
            wasm::get_engine(),
            rules.wasm_mod.as_slice(),
        )?;
        rules.compiled_wasm_mod = Some(module);

        #[cfg(feature = "logging")]
        info!("WASM build time: {:?}", Instant::elapsed(&build_timer));
    }

    // The automaton is not serialized, it must be rebuilt from the atoms.
    rules.build_ac_automaton();

    Ok(rules)
}
/// Serializes the rules and writes the result into `writer`.
///
/// The output consists of `MAGIC`, the serialization format version as a
/// little-endian `u32`, and the bincode-encoded rules, in that order.
/// Writes go through an internal [`BufWriter`], so `writer` does not need
/// to be buffered by the caller.
///
/// # Errors
///
/// Returns a [`SerializationError`] if writing to `writer` fails, if the
/// rules can't be encoded, or if flushing the buffered data fails.
pub fn serialize_into<W>(
    &self,
    writer: W,
) -> Result<(), SerializationError>
where
    W: Write,
{
    let mut writer = BufWriter::new(writer);

    // File header, followed by the serialization format version.
    writer.write_all(MAGIC)?;
    writer.write_all(&SERIALIZATION_VERSION.to_le_bytes())?;

    bincode::serde::encode_into_std_write(
        self,
        &mut writer,
        bincode::config::standard(),
    )?;

    // Flush explicitly: when a `BufWriter` is dropped, errors that occur
    // while writing out the remaining buffered bytes are silently ignored,
    // which would make this function report success for truncated output.
    writer.flush()?;

    Ok(())
}
/// Reads `reader` until end-of-file and deserializes the rules contained
/// in the data that was read.
///
/// # Errors
///
/// Returns a [`SerializationError`] if reading fails, or any error
/// produced by [`Rules::deserialize`].
pub fn deserialize_from<R>(
    mut reader: R,
) -> Result<Self, SerializationError>
where
    R: Read,
{
    let mut buffer = Vec::new();
    reader.read_to_end(&mut buffer)?;
    Self::deserialize(buffer)
}
/// Returns an iterator that yields one [`Rule`] per compiled rule.
pub fn iter(&self) -> RulesIter<'_> {
    let iterator = self.rules.iter();
    RulesIter { rules: self, iterator }
}
/// Returns the [`RuleInfo`] for the rule identified by `rule_id`.
///
/// # Panics
///
/// Panics if `rule_id` does not correspond to any rule.
pub(crate) fn get(&self, rule_id: RuleId) -> &RuleInfo {
    let index = rule_id.0 as usize;
    self.rules.get(index).unwrap()
}
/// Builds a [`Regex`] from the regular expression identified by
/// `regexp_id`.
///
/// The regexp source is fetched from the regexp pool, parsed again
/// (honoring `relaxed_re_syntax`) and compiled with a 50MB NFA size limit.
///
/// # Panics
///
/// Panics if `regexp_id` is not in the pool, if the regexp can't be
/// parsed, or if it can't be compiled.
#[inline]
pub(crate) fn get_regexp(&self, regexp_id: RegexpId) -> Regex {
    let source = self.regexp_pool.get(regexp_id).unwrap();
    let regexp = types::Regexp::new(source);

    let parser = re::parser::Parser::new()
        .relaxed_re_syntax(self.relaxed_re_syntax);

    let hir = parser.parse(&regexp).unwrap().into_inner();

    let mut builder = regex_automata::meta::Builder::new();

    builder.configure(
        regex_automata::meta::Config::new()
            .nfa_size_limit(Some(50 * 1024 * 1024)),
    );

    match builder.build_from_hir(&hir) {
        Ok(regex) => regex,
        Err(err) => {
            panic!("error compiling regex `{}`: {:#?}", regexp.as_str(), err)
        }
    }
}
/// Returns the sub-pattern identified by `sub_pattern_id`, together with
/// the [`PatternId`] of the pattern it belongs to.
///
/// The lookup is not bounds-checked: passing an identifier that is out of
/// range for these rules is undefined behavior.
#[inline]
pub(crate) fn get_sub_pattern(
    &self,
    sub_pattern_id: SubPatternId,
) -> &(PatternId, SubPattern) {
    let index = sub_pattern_id.0 as usize;
    // SAFETY: relies on `index` being smaller than `sub_patterns.len()`.
    // NOTE(review): nothing in this function enforces that; presumably
    // every `SubPatternId` handed out by the compiler is a valid index into
    // this vector — confirm, especially for deserialized rules.
    unsafe { self.sub_patterns.get_unchecked(index) }
}
/// Finds the rule and pattern identifier that own the given sub-pattern.
///
/// Rules are scanned in order and, within each rule, its patterns in
/// order; the first pattern whose id equals the sub-pattern's
/// [`PatternId`] wins. Returns `None` when no rule declares that pattern.
#[cfg(feature = "logging")]
pub(crate) fn get_rule_and_pattern_by_sub_pattern_id(
    &self,
    sub_pattern_id: SubPatternId,
) -> Option<(RuleId, IdentId)> {
    let (wanted, _) = self.get_sub_pattern(sub_pattern_id);

    self.rules.iter().enumerate().find_map(|(index, rule)| {
        rule.patterns
            .iter()
            .find(|pattern| pattern.pattern_id == *wanted)
            .map(|pattern| (index.into(), pattern.ident_id))
    })
}
/// Returns the information for every rule, indexed by rule id.
#[cfg(feature = "rules-profiling")]
#[inline]
pub(crate) fn rules(&self) -> &[RuleInfo] {
    &self.rules
}
/// Returns the atoms from which the Aho-Corasick automaton is built.
#[inline]
pub(crate) fn atoms(&self) -> &[SubPatternAtom] {
    &self.atoms
}
/// Returns the identifiers of the anchored sub-patterns.
#[inline]
pub(crate) fn anchored_sub_patterns(&self) -> &[SubPatternId] {
    &self.anchored_sub_patterns
}
/// Returns the raw bytes stored in the `re_code` field.
#[inline]
pub(crate) fn re_code(&self) -> &[u8] {
    &self.re_code
}
/// Returns the number of rules.
#[inline]
pub(crate) fn num_rules(&self) -> usize {
self.rules.len()
}
/// Returns the value of the `num_patterns` field.
#[inline]
pub(crate) fn num_patterns(&self) -> usize {
self.num_patterns
}
/// Returns the Aho-Corasick automaton built from the atoms.
///
/// # Panics
///
/// Panics if the automaton has not been built yet, see
/// [`Rules::build_ac_automaton`].
#[inline]
pub(crate) fn ac_automaton(&self) -> &AhoCorasick {
    match &self.ac {
        Some(automaton) => automaton,
        None => panic!("Aho-Corasick automaton not compiled"),
    }
}
/// Builds the Aho-Corasick automaton from `self.atoms` and stores it in
/// `self.ac`. Does nothing if the automaton already exists.
///
/// With the `logging` feature enabled it also logs the build time, some
/// statistics about the rules, and every atom shorter than 2 bytes.
///
/// # Panics
///
/// Panics if the automaton can't be built. With `logging` enabled it also
/// panics if the rule owning a very short atom can't be found.
pub(crate) fn build_ac_automaton(&mut self) {
// Building is idempotent: keep the automaton we already have.
if self.ac.is_some() {
return;
}
#[cfg(feature = "logging")]
let start = Instant::now();
// Histogram of atom lengths: slots 0..=4 count atoms of exactly that
// length, the last slot counts every atom longer than 4 bytes.
#[cfg(feature = "logging")]
let mut num_atoms = [0_usize; 6];
// This iterator is lazy, the closure (including the logging code) runs
// while `AhoCorasick::new` consumes it below.
let atoms = self.atoms.iter().map(|x| {
#[cfg(feature = "logging")]
{
match x.atom.len() {
atom_len @ 0..=4 => num_atoms[atom_len] += 1,
_ => num_atoms[num_atoms.len() - 1] += 1,
}
// Report atoms of length 0 or 1, naming the pattern and rule
// they come from.
if x.atom.len() < 2 {
let (rule_id, pattern_ident_id) = self
.get_rule_and_pattern_by_sub_pattern_id(
x.sub_pattern_id,
)
.unwrap();
let rule = self.get(rule_id);
info!(
"Very short atom in pattern `{}` in rule `{}:{}` (length: {})",
self.ident_pool.get(pattern_ident_id).unwrap(),
self.ident_pool
.get(rule.namespace_ident_id)
.unwrap(),
self.ident_pool.get(rule.ident_id).unwrap(),
x.atom.len()
);
}
}
// The automaton only needs the atom's bytes.
x.atom.as_ref()
});
self.ac = Some(
AhoCorasick::new(atoms)
.expect("failed to build Aho-Corasick automaton"),
);
#[cfg(feature = "logging")]
{
info!(
"Aho-Corasick automaton build time: {:?}",
Instant::elapsed(&start)
);
info!("Number of rules: {}", self.num_rules());
info!("Number of patterns: {}", self.num_patterns());
info!(
"Number of anchored sub-patterns: {}",
self.anchored_sub_patterns.len()
);
info!("Number of atoms: {}", self.atoms.len());
info!("Atoms with len = 0: {}", num_atoms[0]);
info!("Atoms with len = 1: {}", num_atoms[1]);
info!("Atoms with len = 2: {}", num_atoms[2]);
info!("Atoms with len = 3: {}", num_atoms[3]);
info!("Atoms with len = 4: {}", num_atoms[4]);
info!("Atoms with len > 4: {}", num_atoms[5]);
}
}
/// Returns the pool of byte strings that are looked up by [`LiteralId`].
#[inline]
pub(crate) fn lit_pool(&self) -> &BStringPool<LiteralId> {
&self.lit_pool
}
/// Returns the pool of identifier strings that are looked up by
/// [`IdentId`].
#[inline]
pub(crate) fn ident_pool(&self) -> &StringPool<IdentId> {
&self.ident_pool
}
/// Returns a structure with the global variables.
///
/// The structure is decoded from its serialized form on every call, so
/// each call produces a new, independent value.
///
/// # Panics
///
/// Panics if the serialized globals can't be decoded.
#[inline]
pub(crate) fn globals(&self) -> types::Struct {
    let decoded: Result<(types::Struct, usize), _> =
        bincode::serde::decode_from_slice(
            self.serialized_globals.as_slice(),
            bincode::config::standard(),
        );

    // The second tuple item is the number of bytes consumed, not needed.
    decoded.expect("error deserializing global variables").0
}
/// Returns the compiled WASM module.
///
/// # Panics
///
/// Panics if `compiled_wasm_mod` is `None`.
#[inline]
pub(crate) fn wasm_mod(&self) -> &wasm::runtime::Module {
self.compiled_wasm_mod.as_ref().unwrap()
}
/// Returns the file size bounds recorded for the pattern identified by
/// `pattern_id`, or `None` if there's no entry for that pattern.
#[inline]
pub(crate) fn filesize_bounds(
&self,
pattern_id: PatternId,
) -> Option<&FilesizeBounds> {
self.filesize_bounds.get(&pattern_id)
}
}
/// Serializes the compiled WASM module (`native-code-serialization`
/// variant).
///
/// When a module is present, the bytes returned by its `serialize` method
/// are emitted as `Some(bytes)`; otherwise `None` is emitted. A failure
/// while serializing the module is reported as a custom serde error
/// carrying the original error's message.
#[cfg(feature = "native-code-serialization")]
fn serialize_wasm_mod<S>(
    wasm_mod: &Option<wasm::runtime::Module>,
    serializer: S,
) -> Result<S::Ok, S::Error>
where
    S: Serializer,
{
    let Some(module) = wasm_mod else {
        return serializer.serialize_none();
    };

    let native_code = module
        .serialize()
        .map_err(|err| serde::ser::Error::custom(err.to_string()))?;

    serializer.serialize_some(native_code.as_slice())
}
/// Serializes the compiled WASM module (variant used when the
/// `native-code-serialization` feature is disabled).
///
/// The module is never written out: `None` is always emitted, regardless
/// of the value of `_wasm_mod`.
#[cfg(not(feature = "native-code-serialization"))]
fn serialize_wasm_mod<S>(
_wasm_mod: &Option<wasm::runtime::Module>,
serializer: S,
) -> Result<S::Ok, S::Error>
where
S: Serializer,
{
serializer.serialize_none()
}
/// Deserializes the compiled WASM module.
///
/// Reads an optional byte slice. When bytes are present they are handed to
/// `Module::deserialize`; if that fails the error is discarded and `None`
/// is returned, exactly as if no bytes had been stored.
///
/// # Errors
///
/// Fails only if the optional byte slice itself can't be deserialized.
pub fn deserialize_wasm_mod<'de, D>(
    deserializer: D,
) -> Result<Option<wasm::runtime::Module>, D::Error>
where
    D: Deserializer<'de>,
{
    let stored: Option<&[u8]> = Deserialize::deserialize(deserializer)?;

    Ok(stored.and_then(|native_code| {
        // NOTE(review): the contract that makes this call sound is not
        // visible here. Presumably the bytes must come unmodified from a
        // matching `Module::serialize` — confirm, as serialized rules may
        // originate from untrusted sources.
        unsafe {
            wasm::runtime::Module::deserialize(
                wasm::get_engine(),
                native_code,
            )
            .ok()
        }
    }))
}
/// Iterator that yields the rules contained in a [`Rules`] value, as
/// returned by [`Rules::iter`].
pub struct RulesIter<'a> {
// The rules being iterated; every yielded `Rule` keeps this reference.
rules: &'a Rules,
// Iterator over the per-rule information stored in `rules`.
iterator: Iter<'a, RuleInfo>,
}
impl<'a> Iterator for RulesIter<'a> {
    type Item = Rule<'a, 'a>;

    /// Returns the next rule, or `None` once all rules have been yielded.
    /// The yielded [`Rule`] carries no context (`ctx` is `None`).
    fn next(&mut self) -> Option<Self::Item> {
        let rule_info = self.iterator.next()?;
        Some(Rule { ctx: None, rules: self.rules, rule_info })
    }

    /// Returns the exact number of remaining rules as both bounds.
    ///
    /// This iterator also implements `ExactSizeIterator`, whose contract
    /// requires `size_hint` to report the exact size; the default
    /// `(0, None)` would violate it and prevent consumers like `collect`
    /// from pre-allocating.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.iterator.size_hint()
    }
}
impl ExactSizeIterator for RulesIter<'_> {
/// Returns the number of rules that have not been yielded yet, as
/// reported by the underlying slice iterator.
#[inline]
fn len(&self) -> usize {
self.iterator.len()
}
}
impl fmt::Debug for Rules {
    /// Writes a human-readable listing with every rule (id, namespace,
    /// name and patterns), followed by the mapping from each sub-pattern
    /// id to the id of the pattern it belongs to.
    ///
    /// # Panics
    ///
    /// Panics if some identifier is missing from the identifier pool.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let idents = &self.ident_pool;

        for (id, info) in self.rules.iter().enumerate() {
            let name = idents.get(info.ident_id).unwrap();
            let namespace = idents.get(info.namespace_ident_id).unwrap();

            writeln!(f, "RuleId({id})")?;
            writeln!(f, " namespace: {namespace}")?;
            writeln!(f, " name: {name}")?;
            writeln!(f, " patterns:")?;

            info.patterns.iter().try_for_each(|pattern| {
                let ident = idents.get(pattern.ident_id).unwrap();
                writeln!(f, " {:?} {ident} ", pattern.pattern_id)
            })?;
        }

        self.sub_patterns.iter().enumerate().try_for_each(
            |(id, (pattern_id, _))| {
                writeln!(f, "SubPatternId({id}) -> {pattern_id:?}")
            },
        )
    }
}
/// Value of a metadata entry associated with a rule (see
/// `RuleInfo::metadata`).
#[derive(Serialize, Deserialize)]
pub(crate) enum MetaValue {
/// A boolean value.
Bool(bool),
/// A 64-bit signed integer value.
Integer(i64),
/// A 64-bit floating point value.
Float(f64),
/// A string, stored as a [`LiteralId`].
// NOTE(review): presumably resolved through `Rules::lit_pool` — confirm.
String(LiteralId),
/// A byte string, stored as a [`LiteralId`].
// NOTE(review): presumably resolved through `Rules::lit_pool` — confirm.
Bytes(LiteralId),
}
/// Information about a compiled rule.
#[derive(Serialize, Deserialize)]
pub(crate) struct RuleInfo {
/// Identifier of the namespace the rule belongs to.
pub namespace_id: NamespaceId,
/// Identifier of the namespace's name in the identifier pool.
pub namespace_ident_id: IdentId,
/// Identifier of the rule's name in the identifier pool.
pub ident_id: IdentId,
/// Identifiers of the rule's tags.
pub tags: Vec<IdentId>,
/// Code location associated with the rule identifier. This field is not
/// serialized.
#[serde(skip)]
pub ident_ref: CodeLoc,
/// Metadata entries, as (identifier, value) pairs.
pub metadata: Vec<(IdentId, MetaValue)>,
/// Information about the patterns declared by the rule.
pub patterns: Vec<PatternInfo>,
/// Number of private patterns.
// NOTE(review): presumably the count of `patterns` entries that have
// `is_private` set — confirm.
pub num_private_patterns: usize,
/// Whether the rule is global.
pub is_global: bool,
/// Whether the rule is private.
pub is_private: bool,
}
/// Information about a pattern declared in a rule.
#[derive(Serialize, Deserialize)]
pub(crate) struct PatternInfo {
/// Identifier of the pattern; sub-patterns refer to their pattern
/// through this id.
pub pattern_id: PatternId,
/// Identifier of the pattern's name in the identifier pool.
pub ident_id: IdentId,
/// The kind of pattern.
pub kind: PatternKind,
/// Whether the pattern is private.
pub is_private: bool,
}
/// A range of file sizes, described by a start and an end bound. Each
/// bound can be inclusive, exclusive or absent.
#[derive(Debug, PartialEq, Serialize, Deserialize, Clone, Hash, Eq)]
pub(crate) struct FilesizeBounds {
// Lower bound of the range.
start: Bound<i64>,
// Upper bound of the range.
end: Bound<i64>,
}
impl Default for FilesizeBounds {
    /// Returns bounds that are open on both ends, i.e. bounds that impose
    /// no restriction at all.
    fn default() -> Self {
        let (start, end) = (Bound::Unbounded, Bound::Unbounded);
        Self { start, end }
    }
}
impl<T: RangeBounds<i64>> From<T> for FilesizeBounds {
    /// Creates the bounds from anything that behaves like a range of `i64`
    /// (e.g. `0..100`, `10..=20`, `..5`, `..`), copying both of its ends.
    fn from(value: T) -> Self {
        let (start, end) = (value.start_bound(), value.end_bound());
        Self { start: start.cloned(), end: end.cloned() }
    }
}
impl FilesizeBounds {
pub fn unbounded(&self) -> bool {
matches!(self.start, Bound::Unbounded)
&& matches!(self.end, Bound::Unbounded)
}
pub fn contains(&self, value: i64) -> bool {
let start_ok = match self.start {
Bound::Included(start) => value >= start,
Bound::Excluded(start) => value > start,
Bound::Unbounded => true,
};
let end_ok = match self.end {
Bound::Included(end) => value <= end,
Bound::Excluded(end) => value < end,
Bound::Unbounded => true,
};
start_ok && end_ok
}
pub fn max_start(&mut self, bound: Bound<i64>) -> &mut Self {
match (&self.start, &bound) {
(Bound::Included(current), Bound::Included(new)) => {
if new > current {
self.start = Bound::Included(*new);
}
}
(Bound::Included(current), Bound::Excluded(new)) => {
if new >= current {
self.start = Bound::Excluded(*new);
}
}
(Bound::Excluded(current), Bound::Included(new)) => {
if new > current {
self.start = Bound::Included(*new);
}
}
(Bound::Excluded(current), Bound::Excluded(new)) => {
if new > current {
self.start = Bound::Excluded(*new);
}
}
(Bound::Unbounded, new) => {
self.start = *new;
}
(_, Bound::Unbounded) => {}
}
self
}
pub fn min_end(&mut self, bound: Bound<i64>) -> &mut Self {
match (&self.end, &bound) {
(Bound::Included(current), Bound::Included(new)) => {
if new < current {
self.end = Bound::Included(*new);
}
}
(Bound::Included(current), Bound::Excluded(new)) => {
if new <= current {
self.end = Bound::Excluded(*new);
}
}
(Bound::Excluded(current), Bound::Included(new)) => {
if new < current {
self.end = Bound::Included(*new);
}
}
(Bound::Excluded(current), Bound::Excluded(new)) => {
if new < current {
self.end = Bound::Excluded(*new)
}
}
(Bound::Unbounded, new) => {
self.end = *new;
}
(_, Bound::Unbounded) => {}
}
self
}
}
/// An atom together with the sub-pattern it belongs to and, optionally,
/// the locations of the forward and backward code associated with it.
#[derive(Serialize, Deserialize)]
pub(crate) struct SubPatternAtom {
// Identifier of the sub-pattern this atom belongs to.
sub_pattern_id: SubPatternId,
// The atom itself.
atom: Atom,
// Location of the forward code. `None` for atoms created with
// `from_atom`.
fwd_code: Option<FwdCodeLoc>,
// Location of the backward code. `None` for atoms created with
// `from_atom`.
bck_code: Option<BckCodeLoc>,
}
impl SubPatternAtom {
/// Creates a [`SubPatternAtom`] from a plain [`Atom`], with neither
/// forward nor backward code.
#[inline]
pub(crate) fn from_atom(sub_pattern_id: SubPatternId, atom: Atom) -> Self {
Self { sub_pattern_id, atom, bck_code: None, fwd_code: None }
}
/// Creates a [`SubPatternAtom`] from a [`RegexpAtom`], taking over its
/// atom and its forward and backward code locations.
pub(crate) fn from_regexp_atom(
sub_pattern_id: SubPatternId,
value: RegexpAtom,
) -> Self {
Self {
sub_pattern_id,
atom: value.atom,
fwd_code: value.fwd_code,
bck_code: value.bck_code,
}
}
/// Returns the identifier of the sub-pattern this atom belongs to.
#[inline]
pub(crate) fn sub_pattern_id(&self) -> SubPatternId {
self.sub_pattern_id
}
/// Returns whatever the underlying atom's `is_exact` reports.
#[cfg(feature = "exact-atoms")]
#[inline]
pub(crate) fn is_exact(&self) -> bool {
self.atom.is_exact()
}
/// Returns the length of the underlying atom.
#[inline]
pub(crate) fn len(&self) -> usize {
self.atom.len()
}
/// Returns the underlying atom's backtrack value, converted to `usize`.
#[inline]
pub(crate) fn backtrack(&self) -> usize {
self.atom.backtrack() as usize
}
/// Returns the bytes of the underlying atom.
#[inline]
pub(crate) fn as_slice(&self) -> &[u8] {
self.atom.as_ref()
}
/// Returns the location of the forward code, if any.
#[inline]
pub(crate) fn fwd_code(&self) -> Option<FwdCodeLoc> {
self.fwd_code
}
/// Returns the location of the backward code, if any.
#[inline]
pub(crate) fn bck_code(&self) -> Option<BckCodeLoc> {
self.bck_code
}
}