use std::ops::Range;
use boreal_parser::{VariableDeclaration, VariableDeclarationValue};
use boreal_parser::{VariableFlags, VariableModifiers};
use crate::regex::Regex;
use super::base64::encode_base64;
use super::CompilationError;
mod atom;
pub use atom::atom_rank;
mod hex_string;
mod regex;
/// A compiled variable, produced by `compile_variable` from a parsed declaration.
#[derive(Debug)]
pub struct Variable {
/// Name of the variable, copied from the declaration.
pub name: String,
/// Whether the `PRIVATE` flag was set on the declaration.
pub is_private: bool,
/// Byte literals extracted from the variable.
///
/// `confirm_ac_literal` indexes into this list with the `literal_index` it receives.
pub literals: Vec<Vec<u8>>,
/// Modifier flags, after the adjustments applied during compilation.
flags: VariableFlags,
/// How a literal hit is turned into a match of the whole variable.
matcher_type: MatcherType,
/// When set, regex run on the "unwidened" bytes of a wide match, to confirm the match and
/// recompute its end.
non_wide_regex: Option<Regex>,
}
/// Strategy used by `Variable::process_ac_match` to handle a literal hit.
#[derive(Debug)]
enum MatcherType {
/// A literal hit only goes through the generic match validation (fullword and wide checks).
Literals,
/// A literal hit is extended using optional validators before being validated.
Atomized {
/// Regex searched in the bytes that end at the end of the literal hit: the start of each of
/// its matches gives the start of a candidate match.
left_validator: Option<Regex>,
/// Regex searched from the start of the literal hit: the end of its match gives the end of
/// the candidate match.
right_validator: Option<Regex>,
},
/// Literal hits cannot be resolved (`process_ac_match` returns `Unknown`): the regex is run
/// on the memory by `find_next_match_at`.
Raw(Regex),
}
/// Outcome of `Variable::process_ac_match` for one literal hit.
#[derive(Clone, Debug)]
pub enum AcMatchStatus {
/// Zero or more validated matches; produced when a left validator is present.
Multiple(Vec<Range<usize>>),
/// Exactly one validated match.
Single(Range<usize>),
/// The hit did not lead to a valid match.
None,
/// The variable uses a raw regex matcher, so the hit alone cannot be resolved.
Unknown,
}
/// Compile a parsed variable declaration into a `Variable`.
///
/// The modifier flags are normalized along the way:
/// - a variable without the `WIDE` flag always gets the `ASCII` flag;
/// - a case-insensitive regex gets the `NOCASE` flag;
/// - a hex string loses the `FULLWORD` and `WIDE` flags.
///
/// # Errors
///
/// A failure to compile the value is wrapped in `CompilationError::VariableCompilation`,
/// together with the name and span of the declaration.
pub(crate) fn compile_variable(decl: VariableDeclaration) -> Result<Variable, CompilationError> {
    let VariableDeclaration {
        name,
        value,
        mut modifiers,
        span,
    } = decl;

    if !modifiers.flags.contains(VariableFlags::WIDE) {
        modifiers.flags.insert(VariableFlags::ASCII);
    }

    let compiled = match value {
        VariableDeclarationValue::Bytes(bytes) => Ok(compile_bytes(bytes, &modifiers)),
        VariableDeclarationValue::Regex(boreal_parser::Regex {
            ast,
            case_insensitive,
            dot_all,
            span: _,
        }) => {
            if case_insensitive {
                modifiers.flags.insert(VariableFlags::NOCASE);
            }
            regex::compile_regex(&ast, case_insensitive, dot_all, modifiers.flags)
        }
        VariableDeclarationValue::HexString(hex) => {
            modifiers.flags.remove(VariableFlags::FULLWORD);
            modifiers.flags.remove(VariableFlags::WIDE);

            if hex_string::can_use_only_literals(&hex) {
                let literals = hex_string::hex_string_to_only_literals(hex);
                Ok(CompiledVariable {
                    literals,
                    matcher_type: MatcherType::Literals,
                    non_wide_regex: None,
                })
            } else {
                // Hex strings are compiled case-sensitive, with `.` matching every byte.
                let ast = hex_string::hex_string_to_ast(hex);
                regex::compile_regex(&ast, false, true, modifiers.flags)
            }
        }
    };

    let CompiledVariable {
        literals,
        matcher_type,
        non_wide_regex,
    } = match compiled {
        Ok(compiled) => compiled,
        Err(error) => {
            return Err(CompilationError::VariableCompilation {
                variable_name: name,
                span,
                error,
            });
        }
    };

    let flags = modifiers.flags;
    Ok(Variable {
        name,
        is_private: flags.contains(VariableFlags::PRIVATE),
        literals,
        flags,
        matcher_type,
        non_wide_regex,
    })
}
/// Intermediate result of compiling a variable value, before `compile_variable` attaches the
/// name and flags to build a `Variable`.
struct CompiledVariable {
/// Byte literals extracted from the value.
literals: Vec<Vec<u8>>,
/// Matching strategy for the variable.
matcher_type: MatcherType,
/// Copied as is into `Variable::non_wide_regex`.
non_wide_regex: Option<Regex>,
}
/// Compile a raw byte string into the list of literals to search for.
///
/// The literals are built in three steps:
/// 1. the wide and/or plain form of `value`, depending on the `WIDE` and `ASCII` flags;
/// 2. with the `XOR` flag, every literal is xored with each byte of the xor range, and the
///    result is returned right away (the base64 flags are not looked at);
/// 3. with the `BASE64` and/or `BASE64WIDE` flags, every literal is replaced by its base64
///    encodings at offsets 0, 1 and 2 (when the encoder yields one), in wide and/or plain form.
///
/// The result always uses `MatcherType::Literals` and has no non-wide regex.
fn compile_bytes(value: Vec<u8>, modifiers: &VariableModifiers) -> CompiledVariable {
    let flags = &modifiers.flags;
    let is_wide = flags.contains(VariableFlags::WIDE);
    let is_ascii = flags.contains(VariableFlags::ASCII);

    // The wide form, when requested, always comes before the plain form.
    let mut literals = Vec::with_capacity(2);
    if is_wide {
        literals.push(string_to_wide(&value));
    }
    if !is_wide || is_ascii {
        literals.push(value);
    }

    if flags.contains(VariableFlags::XOR) {
        let keys = modifiers.xor_range.0..=modifiers.xor_range.1;
        let mut xored: Vec<Vec<u8>> = Vec::with_capacity(literals.len() * keys.len());
        for literal in &literals {
            xored.extend(
                keys.clone()
                    .map(|key| literal.iter().map(|byte| byte ^ key).collect::<Vec<u8>>()),
            );
        }
        return CompiledVariable {
            literals: xored,
            matcher_type: MatcherType::Literals,
            non_wide_regex: None,
        };
    }

    let plain_base64 = flags.contains(VariableFlags::BASE64);
    let wide_base64 = flags.contains(VariableFlags::BASE64WIDE);
    if plain_base64 || wide_base64 {
        let capacity = literals.len() * 3;
        let sources = std::mem::replace(&mut literals, Vec::with_capacity(capacity));
        for source in &sources {
            for offset in 0..=2 {
                if let Some(encoded) = encode_base64(source, &modifiers.base64_alphabet, offset) {
                    // Wide form first, then the plain one: each only if its flag is set.
                    if wide_base64 {
                        literals.push(string_to_wide(&encoded));
                    }
                    if plain_base64 {
                        literals.push(encoded);
                    }
                }
            }
        }
    }

    CompiledVariable {
        literals,
        matcher_type: MatcherType::Literals,
        non_wide_regex: None,
    }
}
impl Variable {
    /// Check that the bytes of `mem` covered by `mat` are equal to the literal at index
    /// `literal_index`, ignoring ASCII case if the variable has the `NOCASE` flag.
    ///
    /// # Panics
    ///
    /// Panics if `literal_index` is not a valid index in `literals`, or if `mat` is not a valid
    /// range in `mem`.
    pub fn confirm_ac_literal(&self, mem: &[u8], mat: &Range<usize>, literal_index: usize) -> bool {
        let literal = &self.literals[literal_index];
        let candidate = &mem[mat.start..mat.end];

        if self.flags.contains(VariableFlags::NOCASE) {
            literal.eq_ignore_ascii_case(candidate)
        } else {
            literal.as_slice() == candidate
        }
    }

    /// Turn a literal hit into matches of the variable.
    ///
    /// - `mem`: the scanned memory.
    /// - `mat`: range of the literal hit in `mem`.
    /// - `start_position`: offset in `mem` from which the left validator starts searching.
    ///
    /// Returns `Unknown` for a raw regex matcher, `Single` or `None` when there is no left
    /// validator, and `Multiple` (possibly empty) when there is one.
    pub fn process_ac_match(
        &self,
        mem: &[u8],
        mat: Range<usize>,
        mut start_position: usize,
    ) -> AcMatchStatus {
        let (left_validator, right_validator) = match &self.matcher_type {
            MatcherType::Raw(_) => return AcMatchStatus::Unknown,
            MatcherType::Literals => {
                return self
                    .validate_and_update_match(mem, mat)
                    .map_or(AcMatchStatus::None, AcMatchStatus::Single);
            }
            MatcherType::Atomized {
                left_validator,
                right_validator,
            } => (left_validator, right_validator),
        };

        // The right validator, searched from the start of the hit, gives the end of the match.
        let end = match right_validator {
            None => mat.end,
            Some(validator) => match validator.as_regex().find(&mem[mat.start..]) {
                Some(found) => mat.start + found.end(),
                None => return AcMatchStatus::None,
            },
        };

        let validator = match left_validator {
            Some(validator) => validator,
            None => {
                return self
                    .validate_and_update_match(mem, mat.start..end)
                    .map_or(AcMatchStatus::None, AcMatchStatus::Single);
            }
        };

        // Every match of the left validator is the start of a candidate. The search is
        // restarted one byte after the start of the previous candidate until nothing is found.
        let mut matches = Vec::new();
        while let Some(found) = validator.as_regex().find(&mem[start_position..mat.end]) {
            let candidate_start = found.start() + start_position;
            start_position = candidate_start + 1;

            if let Some(valid) = self.validate_and_update_match(mem, candidate_start..end) {
                matches.push(valid);
            }
        }
        AcMatchStatus::Multiple(matches)
    }

    /// Find the first valid match of the raw regex in `mem`, searching from `offset`.
    ///
    /// Candidates rejected by the validation are skipped, and the search restarts one byte
    /// after the start of the rejected candidate.
    ///
    /// Returns `None` if there is no valid match. This must only be called on a variable using
    /// a raw regex matcher: otherwise, it asserts in debug builds and returns `None`.
    pub fn find_next_match_at(&self, mem: &[u8], mut offset: usize) -> Option<Range<usize>> {
        let regex = if let MatcherType::Raw(regex) = &self.matcher_type {
            regex
        } else {
            debug_assert!(false);
            return None;
        };

        while offset < mem.len() {
            let candidate = regex.as_regex().find_at(mem, offset)?.range();
            let next_offset = candidate.start + 1;

            if let Some(valid) = self.validate_and_update_match(mem, candidate) {
                return Some(valid);
            }
            offset = next_offset;
        }
        None
    }

    /// Apply the checks shared by all matchers to a candidate match.
    ///
    /// The candidate is rejected if the variable has the `FULLWORD` flag and the fullword check
    /// fails. Then, if a non-wide regex is set, the wide word boundaries check decides the
    /// final range. Otherwise, the candidate is returned unchanged.
    fn validate_and_update_match(&self, mem: &[u8], mat: Range<usize>) -> Option<Range<usize>> {
        let is_fullword = self.flags.contains(VariableFlags::FULLWORD);
        if is_fullword && !check_fullword(mem, &mat, self.flags) {
            return None;
        }

        if let Some(regex) = &self.non_wide_regex {
            apply_wide_word_boundaries(mat, mem, regex)
        } else {
            Some(mat)
        }
    }
}
/// Check that the match `mat` in `mem` is not directly surrounded by alphanumeric characters.
///
/// With the `WIDE` flag, if the match itself looks wide, the neighbours are checked as wide
/// characters: an ASCII alphanumeric byte paired with a nul byte. Otherwise, with the `ASCII`
/// flag, the single bytes right before and right after the match are checked.
///
/// Returns `false` if such a neighbour is found, `true` otherwise.
fn check_fullword(mem: &[u8], mat: &Range<usize>, flags: VariableFlags) -> bool {
    let match_is_wide = flags.contains(VariableFlags::WIDE) && is_match_wide(mat, mem);

    if match_is_wide {
        let wide_alnum_before =
            mat.start > 1 && mem[mat.start - 1] == b'\0' && is_ascii_alnum(mem[mat.start - 2]);
        if wide_alnum_before {
            return false;
        }

        let wide_alnum_after =
            mat.end + 1 < mem.len() && is_ascii_alnum(mem[mat.end]) && mem[mat.end + 1] == b'\0';
        if wide_alnum_after {
            return false;
        }
    }

    if !match_is_wide && flags.contains(VariableFlags::ASCII) {
        let alnum_before = mat.start > 0 && is_ascii_alnum(mem[mat.start - 1]);
        if alnum_before {
            return false;
        }

        let alnum_after = mat.end < mem.len() && is_ascii_alnum(mem[mat.end]);
        if alnum_after {
            return false;
        }
    }

    true
}
/// Confirm a wide match by running `regex` on the "unwidened" bytes around it.
///
/// A match that is not wide is returned unchanged. Otherwise, the memory is unwidened starting
/// from the wide character preceding the match (if there is one) and up to 500 bytes past the
/// end of the match. The regex must then match exactly where the original match starts in this
/// unwidened buffer: the end of the returned range is recomputed from the length of this new
/// match, doubled to account for the wide encoding. If it does not, `None` is returned.
fn apply_wide_word_boundaries(
    mut mat: Range<usize>,
    mem: &[u8],
    regex: &Regex,
) -> Option<Range<usize>> {
    if !is_match_wide(&mat, mem) {
        return Some(mat);
    }

    // Include the previous character only if it is a wide one.
    let has_wide_prefix = mat.start >= 2 && mem[mat.start - 1] == b'\0';
    let start = if has_wide_prefix {
        mat.start - 2
    } else {
        mat.start
    };
    let window_end = std::cmp::min(mem.len(), mat.end + 500);
    let unwiden_mem = unwide(&mem[start..window_end]);

    // The prefix, when present, takes exactly one byte in the unwidened buffer.
    let expected_start = usize::from(has_wide_prefix);
    let found = regex
        .as_regex()
        .find(&unwiden_mem)
        .filter(|found| found.start() == expected_start)?;

    mat.end = mat.start + 2 * (found.end() - found.start());
    Some(mat)
}
/// Convert the leading wide characters of `mem` back into single bytes.
///
/// `mem` is read two bytes at a time: the first byte of each pair is kept for as long as the
/// second one is nul. The conversion stops at the first pair whose second byte is not nul, and
/// a trailing unpaired byte is ignored.
fn unwide(mem: &[u8]) -> Vec<u8> {
    mem.chunks_exact(2)
        .take_while(|pair| pair[1] == b'\0')
        .map(|pair| pair[0])
        .collect()
}
/// Tell whether the bytes of `mem` covered by `mat` look like a wide string.
///
/// This is the case when the match has an even length and every second byte, starting from
/// the one right after the start of the match, is nul. An empty match is considered wide.
///
/// # Panics
///
/// Panics if `mat` is not empty, has an even length and is not a valid range in `mem`.
fn is_match_wide(mat: &Range<usize>, mem: &[u8]) -> bool {
    let has_even_len = (mat.end - mat.start) % 2 == 0;

    has_even_len
        && (mat.is_empty()
            || mem[(mat.start + 1)..mat.end]
                .iter()
                .step_by(2)
                .all(|byte| *byte == b'\0'))
}
/// Tell whether `c` is an ASCII letter (`a-z`, `A-Z`) or an ASCII digit (`0-9`).
fn is_ascii_alnum(c: u8) -> bool {
    // The standard library check covers exactly the three ranges that were hand-written here.
    c.is_ascii_alphanumeric()
}
/// Build the wide form of `s`: every byte of `s` followed by a nul byte.
fn string_to_wide(s: &[u8]) -> Vec<u8> {
    // Start from a buffer full of nul bytes, then copy the input in the even slots.
    let mut wide = vec![b'\0'; s.len() * 2];
    for (slot, byte) in wide.iter_mut().step_by(2).zip(s) {
        *slot = *byte;
    }
    wide
}
/// Error describing why the value of a variable could not be compiled.
// NOTE(review): presumably the error type returned by `regex::compile_regex` and stored in
// `CompilationError::VariableCompilation` -- confirm against those definitions.
#[derive(Debug)]
pub enum VariableCompilationError {
/// Error on the regex itself; displayed using the formatting of the inner error.
Regex(crate::regex::Error),
/// Displayed as "unable to extract atoms".
AtomsExtractionError,
/// Displayed as "unable to apply the wide modifier to the regex".
WidenError,
}
impl std::fmt::Display for VariableCompilationError {
    /// Write the description of the error.
    ///
    /// A regex error is formatted by the inner error, the other variants use a fixed message.
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        let message = match self {
            Self::Regex(e) => return e.fmt(f),
            Self::AtomsExtractionError => "unable to extract atoms",
            Self::WidenError => "unable to apply the wide modifier to the regex",
        };
        f.write_str(message)
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::test_helpers::{test_type_traits, test_type_traits_non_clonable};

    /// Run the shared trait checks on the types of this module, and check the fixed messages
    /// of the compilation errors.
    #[test]
    fn test_types_traits() {
        let declaration = VariableDeclaration {
            name: "a".to_owned(),
            value: VariableDeclarationValue::Bytes(Vec::new()),
            modifiers: VariableModifiers::default(),
            span: 0..1,
        };
        test_type_traits_non_clonable(compile_variable(declaration));
        test_type_traits_non_clonable(MatcherType::Literals);
        test_type_traits(AcMatchStatus::Unknown);
        test_type_traits_non_clonable(VariableCompilationError::WidenError);

        let expected_messages = [
            (
                VariableCompilationError::AtomsExtractionError,
                "unable to extract atoms",
            ),
            (
                VariableCompilationError::WidenError,
                "unable to apply the wide modifier to the regex",
            ),
        ];
        for (error, message) in expected_messages {
            assert_eq!(error.to_string(), message);
        }
    }
}