use lazy_static::lazy_static;
use regex::Regex;
use std::collections::{BTreeMap, HashMap};
lazy_static! {
// Explicitly signed hexadecimal literal, e.g. `-0x10`. The `num` capture group
// includes the `0x` prefix; `sign` holds the leading `+` or `-`.
static ref RE_NUMBER_HEX: Regex =
Regex::new(r"(?P<sign>[+\-])(?P<num>0x[a-fA-F0-9]+)").unwrap();
// Explicitly signed decimal literal, e.g. `+42`. Neither pattern is anchored, so a
// match may start anywhere in the haystack.
static ref RE_NUMBER_INT: Regex = Regex::new(r"(?P<sign>[+\-])(?P<num>[0-9]+)").unwrap();
}
use iced_x86::{FlowControl, Mnemonic, OpKind, Register};
use smda::{
Disassembler,
SmdaConfig,
disassembler::{
DecodedInsn,
aarch64_ops::{decode_adr, decode_adrp, decode_ldr_str_uimm},
},
function::{Function, Instruction},
report::DisassemblyReport,
};
use crate::{
Result,
consts::{FileFormat, Os},
error::Error,
};
/// Wrapper around an smda `Instruction`; implements `super::Instruction` for it.
#[derive(Debug, Clone)]
struct InstructionS {
// The wrapped smda instruction.
i: Instruction,
}
impl super::Instruction for InstructionS {
    /// Returns `true` when the wrapped instruction reports a non-zero printable length.
    fn is_mov_imm_to_stack(&self) -> Result<bool> {
        let printable = self.i.get_printable_len()?;
        Ok(printable > 0)
    }

    /// Forwards to the wrapped instruction's `get_printable_len`.
    fn get_printable_len(&self) -> Result<u64> {
        let printable = self.i.get_printable_len()?;
        Ok(printable)
    }

    /// Exposes the wrapper as `Any` so callers can downcast back to `InstructionS`.
    fn as_any(&self) -> &dyn std::any::Any {
        self
    }
}
/// Wrapper around an smda `Function`; implements `super::Function` for it.
#[derive(Debug, Clone)]
struct FunctionS {
// The wrapped smda function.
f: Function,
}
impl super::Function for FunctionS {
    /// Incoming references recorded on the wrapped function.
    fn inrefs(&self) -> &Vec<u64> {
        &self.f.inrefs
    }

    /// Block-to-block references recorded on the wrapped function.
    fn blockrefs(&self) -> &HashMap<u64, Vec<u64>> {
        &self.f.blockrefs
    }

    /// Offset of the wrapped function.
    fn offset(&self) -> u64 {
        self.f.offset
    }

    /// Re-packages every block of the wrapped function as a list of boxed
    /// `InstructionS` wrappers, keyed by the block key reported by smda.
    fn get_blocks(&self) -> Result<BTreeMap<u64, Vec<Box<dyn super::Instruction>>>> {
        let mut blocks = BTreeMap::<u64, Vec<Box<dyn super::Instruction>>>::new();
        for (start, insns) in self.f.get_blocks()? {
            let wrapped: Vec<Box<dyn super::Instruction>> = insns
                .into_iter()
                .map(|insn| Box::new(InstructionS { i: *insn }) as Box<dyn super::Instruction>)
                .collect();
            blocks.insert(*start, wrapped);
        }
        Ok(blocks)
    }

    /// Exposes the wrapper as `Any` so callers can downcast back to `FunctionS`.
    fn as_any(&self) -> &dyn std::any::Any {
        self
    }
}
/// Feature extractor backed by an smda disassembly report.
pub struct Extractor<'a> {
// Disassembly report produced by smda for `buf`.
report: DisassemblyReport<'a>,
// The raw input bytes the report was built from.
buf: &'a [u8],
// Path given to `new`, or "<buffer>" when built via `from_buffer`.
path: String,
}
impl std::fmt::Debug for Extractor<'_> {
    /// Prints only the path and the number of functions in the report; the
    /// remaining fields are elided (non-exhaustive output).
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let function_count = self.report.functions.len();
        let mut out = f.debug_struct("Extractor");
        out.field("path", &self.path);
        out.field("functions", &function_count);
        out.finish_non_exhaustive()
    }
}
impl<'data> super::Extractor for Extractor<'data> {
/// Always `false`: this extractor never reports its input as .NET.
fn is_dot_net(&self) -> bool {
false
}
/// Returns the bytes at `addr` (at most `max_len`, as decided by
/// `bytes_at_best_effort`), or `None` when that lookup fails.
fn function_bytes(&self, addr: u64, max_len: u32) -> Option<&[u8]> {
    let lookup = self
        .report()
        .binary_info
        .bytes_at_best_effort(addr, max_len);
    lookup.ok()
}
/// Returns the base address stored in the disassembly report.
fn get_base_address(&self) -> Result<u64> {
Ok(self.report().base_addr)
}
/// Returns the architecture stored in the disassembly report.
fn arch(&self) -> Result<crate::FileArchitecture> {
Ok(self.report().architecture)
}
/// Maps the report's smda file format onto this crate's `FileFormat`.
/// Raw buffers and any format not listed explicitly are reported as PE.
fn format(&self) -> FileFormat {
    match self.report().format {
        smda::FileFormat::ELF => FileFormat::ELF,
        smda::FileFormat::MachO => FileFormat::Macho,
        smda::FileFormat::PE | smda::FileFormat::Buffer => FileFormat::PE,
        // Fallback for every other smda format.
        _ => FileFormat::PE,
    }
}
/// Returns the bitness stored in the disassembly report.
fn bitness(&self) -> u32 {
self.report().bitness
}
/// Produces the two global features — OS first, then architecture — both at address 0.
fn extract_global_features(&self) -> Result<Vec<(crate::rules::features::Feature, u64)>> {
    let os_name = self.extract_os()?.to_string();
    let os_feature = crate::rules::features::Feature::Os(
        crate::rules::features::OsFeature::new(&os_name, "")?,
    );

    let arch_name = self.extract_arch()?.to_string();
    let arch_feature = crate::rules::features::Feature::Arch(
        crate::rules::features::ArchFeature::new(&arch_name, "")?,
    );

    Ok(vec![(os_feature, 0), (arch_feature, 0)])
}
/// Concatenates all file-scope features in a fixed order: exports, imports,
/// section names, embedded PEs, strings, function names and file format.
fn extract_file_features(&self) -> Result<Vec<(crate::rules::features::Feature, u64)>> {
    let mut features = vec![];
    let exports = self.extract_file_export_names()?;
    features.extend(exports);
    let imports = self.extract_file_import_names()?;
    features.extend(imports);
    let sections = self.extract_file_section_names()?;
    features.extend(sections);
    let embedded = self.extract_file_embedded_pe()?;
    features.extend(embedded);
    let strings = self.extract_file_strings()?;
    features.extend(strings);
    let names = self.extract_file_function_names()?;
    features.extend(names);
    let format = self.extract_file_format()?;
    features.extend(format);
    Ok(features)
}
/// Clones every function in the report into a boxed `FunctionS` wrapper,
/// keyed by the key the report uses for it.
fn get_functions(&self) -> Result<BTreeMap<u64, Box<dyn super::Function>>> {
    let mut functions: BTreeMap<u64, Box<dyn super::Function>> = BTreeMap::new();
    for (addr, func) in self.report().get_functions()? {
        let wrapped = FunctionS { f: func.clone() };
        functions.insert(*addr, Box::new(wrapped));
    }
    Ok(functions)
}
/// Emits function-scope features: one "calls to" characteristic per incoming
/// reference (located at the referencing address) and a single "loop"
/// characteristic at the function offset when `has_loop` finds a cycle in the
/// block-reference graph.
fn extract_function_features(
    &self,
    f: &Box<dyn super::Function>,
) -> Result<Vec<(crate::rules::features::Feature, u64)>> {
    let mut features = vec![];
    for caller in f.inrefs() {
        let calls_to = crate::rules::features::CharacteristicFeature::new("calls to", "")?;
        features.push((
            crate::rules::features::Feature::Characteristic(calls_to),
            *caller,
        ));
    }

    // Flatten the block references into an edge list; only blocks that take part
    // in at least one edge become vertices.
    let edges: Vec<(u64, u64)> = f
        .blockrefs()
        .iter()
        .flat_map(|(from, tos)| tos.iter().map(move |to| (*from, *to)))
        .collect();
    let vertices: std::collections::HashSet<u64> =
        edges.iter().flat_map(|&(from, to)| [from, to]).collect();

    if !edges.is_empty() && self.has_loop(&vertices, &edges)? {
        let looping = crate::rules::features::CharacteristicFeature::new("loop", "")?;
        features.push((
            crate::rules::features::Feature::Characteristic(looping),
            f.offset(),
        ));
    }
    Ok(features)
}
/// Returns the basic blocks of `f` by delegating to `f.get_blocks()`.
fn get_basic_blocks(
&self,
f: &Box<dyn super::Function>,
) -> Result<BTreeMap<u64, Vec<Box<dyn super::Instruction>>>> {
f.get_blocks()
}
/// Returns the instruction list of the basic block `bb` (its second tuple
/// element); the function argument is unused.
fn get_instructions<'a>(
&self,
_f: &Box<dyn super::Function>,
bb: &'a (&u64, &Vec<Box<dyn super::Instruction>>),
) -> Result<&'a Vec<Box<dyn super::Instruction>>> {
Ok(bb.1)
}
/// Emits basic-block-scope features, all located at the block key `*bb.0`:
///
/// * a `BasicBlock` feature (always),
/// * a "tight loop" characteristic when the block lists itself among its own
///   block references,
/// * a "stack string" characteristic when the accumulated printable length of
///   the instructions flagged by `is_mov_imm_to_stack` exceeds 8.
///
/// The "stack string" characteristic is emitted at most once per block. The
/// previous version pushed it again for every instruction that followed the
/// one crossing the threshold, producing duplicate features.
fn extract_basic_block_features(
    &self,
    f: &Box<dyn super::Function>,
    bb: &(&u64, &Vec<Box<dyn super::Instruction>>),
) -> Result<Vec<(crate::rules::features::Feature, u64)>> {
    let block_start = *bb.0;
    let mut res = vec![(
        crate::rules::features::Feature::BasicBlock(
            crate::rules::features::BasicBlockFeature::new()?,
        ),
        block_start,
    )];

    let branches_to_itself = f
        .blockrefs()
        .get(bb.0)
        .is_some_and(|successors| successors.contains(bb.0));
    if branches_to_itself {
        res.push((
            crate::rules::features::Feature::Characteristic(
                crate::rules::features::CharacteristicFeature::new("tight loop", "")?,
            ),
            block_start,
        ));
    }

    let mut printable = 0;
    for instr in bb.1 {
        if instr.is_mov_imm_to_stack()? {
            printable += instr.get_printable_len()?;
        }
        if printable > 8 {
            res.push((
                crate::rules::features::Feature::Characteristic(
                    crate::rules::features::CharacteristicFeature::new("stack string", "")?,
                ),
                block_start,
            ));
            // One feature per block is enough; stop scanning.
            break;
        }
    }
    Ok(res)
}
/// Runs every instruction-scope extractor on `insn` and concatenates the
/// results in a fixed order, followed by the file-format and global features.
///
/// # Panics
/// Panics if `f` is not a `FunctionS` or `insn` is not an `InstructionS`.
fn extract_insn_features(
    &self,
    f: &Box<dyn super::Function>,
    insn: &Box<dyn super::Instruction>,
) -> Result<Vec<(crate::rules::features::Feature, u64)>> {
    let func = &f.as_any().downcast_ref::<FunctionS>().unwrap().f;
    let ins = &insn.as_any().downcast_ref::<InstructionS>().unwrap().i;

    let mut features = vec![];
    features.extend(self.extract_insn_api_features(func, ins)?);
    features.extend(self.extract_insn_number_features(func, ins)?);
    features.extend(self.extract_insn_string_features(func, ins)?);
    features.extend(self.extract_insn_bytes_features(func, ins)?);
    features.extend(self.extract_insn_offset_features(func, ins)?);
    features.extend(self.extract_insn_nzxor_characteristic_features(func, ins)?);
    features.extend(self.extract_insn_obfs_call_plus_5_characteristic_features(func, ins)?);
    features.extend(self.extract_insn_mnemonic_features(func, ins)?);
    features.extend(self.extract_insn_peb_access_characteristic_features(func, ins)?);
    features.extend(self.extract_insn_cross_section_cflow(func, ins)?);
    features.extend(self.extract_insn_segment_access_features(func, ins)?);
    features.extend(self.extract_function_calls_from(func, ins)?);
    features.extend(self.extract_function_indirect_call_characteristic_features(func, ins)?);
    // File-format and global features are appended for every instruction as well.
    features.extend(self.extract_file_format()?);
    features.extend(self.extract_global_features()?);
    Ok(features)
}
}
/// AArch64 counterpart of the offset extractor, working on the raw opcode word.
///
/// * When `decode_ldr_str_uimm` decodes `raw` and the base register number is
///   neither 29 nor 31, emits an `Offset` and an `OperandOffset` (operand
///   index 1) feature for the decoded offset.
/// * When `decode_adr` / `decode_adrp` decode `raw`, emits a `Number` feature
///   for the address each of them returns.
///
/// Every feature is located at `insn.offset`.
fn extract_insn_offset_features_aarch64(
    f: &Function,
    insn: &Instruction,
    raw: u32,
) -> Result<Vec<(crate::rules::features::Feature, u64)>> {
    let mut features = vec![];

    match decode_ldr_str_uimm(raw) {
        Some(op) if op.rn != 31 && op.rn != 29 => {
            let disp = op.offset as i64 as i128;
            features.push((
                crate::rules::features::Feature::Offset(
                    crate::rules::features::OffsetFeature::new(f.bitness, &disp, "")?,
                ),
                insn.offset,
            ));
            features.push((
                crate::rules::features::Feature::OperandOffset(
                    crate::rules::features::OperandOffsetFeature::new(&1usize, &disp, "")?,
                ),
                insn.offset,
            ));
        }
        _ => {}
    }

    if let Some(target) = decode_adr(raw, insn.offset).map(|(_rd, target)| target) {
        let value = target as i64 as i128;
        let number = crate::rules::features::NumberFeature::new(f.bitness, &value, "")?;
        features.push((crate::rules::features::Feature::Number(number), insn.offset));
    }

    if let Some(page_va) = decode_adrp(raw, insn.offset).map(|(_rd, page_va)| page_va) {
        let value = page_va as i64 as i128;
        let number = crate::rules::features::NumberFeature::new(f.bitness, &value, "")?;
        features.push((crate::rules::features::Feature::Number(number), insn.offset));
    }

    Ok(features)
}
impl<'data> Extractor<'data> {
/// Disassembles `data` with smda and wraps the resulting report.
///
/// `path`, `high_accuracy` and `resolve_tailcalls` are forwarded to the
/// `SmdaConfig`; `path` is also kept for the `Debug` output.
///
/// # Errors
/// Propagates any error returned by `Disassembler::parse`.
pub fn new(
    path: &str,
    high_accuracy: bool,
    resolve_tailcalls: bool,
    data: &'data [u8],
) -> Result<Extractor<'data>> {
    let mut cfg = SmdaConfig::new();
    cfg = cfg.path(path);
    cfg = cfg.high_accuracy(high_accuracy);
    cfg = cfg.resolve_tailcalls(resolve_tailcalls);

    let report = Disassembler::parse(data, &cfg)?;
    let extractor = Extractor {
        report,
        buf: data,
        path: String::from(path),
    };
    Ok(extractor)
}
/// Disassembles a raw buffer with smda using the given base address and
/// bitness; the extractor's path is set to the placeholder `"<buffer>"`.
///
/// # Errors
/// Propagates any error returned by `Disassembler::parse_buffer`.
pub fn from_buffer(
    data: &'data [u8],
    base_addr: u64,
    bitness: u32,
    high_accuracy: bool,
    resolve_tailcalls: bool,
) -> Result<Extractor<'data>> {
    let mut cfg = SmdaConfig::new();
    cfg = cfg.high_accuracy(high_accuracy);
    cfg = cfg.resolve_tailcalls(resolve_tailcalls);

    let report = Disassembler::parse_buffer(data, base_addr, bitness, &cfg)?;
    let extractor = Extractor {
        report,
        buf: data,
        path: String::from("<buffer>"),
    };
    Ok(extractor)
}
/// Borrows the smda disassembly report held by this extractor.
pub(crate) fn report(&self) -> &DisassemblyReport<'data> {
&self.report
}
/// Borrows the raw input bytes this extractor was created from.
fn buf(&self) -> &[u8] {
self.buf
}
/// Maps byte 7 of the ELF identification array (`e_ident[7]`) to an `Os`.
///
/// Values 0x00 and 0x03, as well as every value without a dedicated arm,
/// yield `Os::LINUX`. This function never returns an error.
pub fn get_elf_os(elf: &goblin::elf::Elf) -> Result<Os> {
    let os = match elf.header.e_ident[7] {
        0x01 => Os::HPUX,
        0x02 => Os::NETBSD,
        0x04 => Os::HURD,
        0x06 => Os::SOLARIS,
        0x07 => Os::AIX,
        0x08 => Os::IRIX,
        0x09 => Os::FREEBSD,
        0x0A => Os::TRU64,
        0x0B => Os::MODESTO,
        0x0C => Os::OPENBSD,
        0x0D => Os::OPENVMS,
        0x0E => Os::NSK,
        0x0F => Os::AROS,
        0x10 => Os::FENIXOS,
        0x11 => Os::CLOUD,
        // 0x00, 0x03 and all unlisted values default to Linux.
        _ => Os::LINUX,
    };
    Ok(os)
}
/// Determines the target operating system.
///
/// Mach-O reports are classified by `classify_macho_os`, raw buffers are
/// treated as Windows; anything else is re-parsed with goblin and dispatched
/// on the detected object kind.
///
/// # Errors
/// Returns `Error::UnsupportedOsError` for object kinds other than ELF, PE and
/// Mach-O, and propagates goblin parse errors.
pub fn extract_os(&self) -> Result<Os> {
    if matches!(self.report().format, smda::FileFormat::MachO) {
        return classify_macho_os(self.buf());
    }
    if matches!(self.report().format, smda::FileFormat::Buffer) {
        return Ok(Os::WINDOWS);
    }

    let object = goblin::Object::parse(self.buf())?;
    match object {
        goblin::Object::Elf(elf) => Extractor::get_elf_os(&elf),
        goblin::Object::PE(_) => Ok(Os::WINDOWS),
        goblin::Object::Mach(_) => classify_macho_os(self.buf()),
        _ => Err(Error::UnsupportedOsError),
    }
}
/// Returns the architecture stored in the disassembly report.
pub fn extract_arch(&self) -> Result<crate::FileArchitecture> {
Ok(self.report().architecture)
}
/// Reports whether the directed graph described by `vertices_names` and
/// `edges` contains a strongly connected component with two or more nodes.
///
/// # Panics
/// Panics if an edge refers to a vertex that is missing from `vertices_names`.
pub fn has_loop(
    &self,
    vertices_names: &std::collections::HashSet<u64>,
    edges: &[(u64, u64)],
) -> Result<bool> {
    let mut graph = petgraph::graph::Graph::<u64, ()>::new();

    // One graph node per vertex name, remembered for edge construction.
    let node_of: std::collections::HashMap<&u64, petgraph::graph::NodeIndex> = vertices_names
        .iter()
        .map(|name| (name, graph.add_node(*name)))
        .collect();

    let indexed_edges: Vec<(petgraph::graph::NodeIndex, petgraph::graph::NodeIndex)> = edges
        .iter()
        .map(|(from, to)| (node_of[from], node_of[to]))
        .collect();
    graph.extend_with_edges(indexed_edges);

    let components = petgraph::algo::kosaraju_scc(&graph);
    Ok(components.iter().any(|component| component.len() >= 2))
}
/// Emits a single `Format` feature at address 0. Raw buffers and any format
/// not listed explicitly are reported as "pe".
fn extract_file_format(&self) -> Result<Vec<(crate::rules::features::Feature, u64)>> {
    let name = match self.report().format {
        smda::FileFormat::ELF => "elf",
        smda::FileFormat::MachO => "macho",
        smda::FileFormat::PE | smda::FileFormat::Buffer => "pe",
        _ => "pe",
    };
    let format = crate::rules::features::FormatFeature::new(name, "")?;
    Ok(vec![(crate::rules::features::Feature::Format(format), 0)])
}
/// Emits one "embedded pe" characteristic per header found by
/// `find_embedded_pe_headers` in the report's raw data, located at the offset
/// of the embedded MZ header.
fn extract_file_embedded_pe(&self) -> Result<Vec<(crate::rules::features::Feature, u64)>> {
    let headers = Extractor::find_embedded_pe_headers(self.report().binary_info.raw_data);
    let mut features = vec![];
    for (mz_offset, _pe_offset, _key) in headers {
        let embedded = crate::rules::features::CharacteristicFeature::new("embedded pe", "")?;
        features.push((
            crate::rules::features::Feature::Characteristic(embedded),
            mz_offset,
        ));
    }
    Ok(features)
}
/// Emits a `Section` feature for every section in the report, with NUL
/// characters trimmed from both ends of the name, located at the section's
/// second tuple field.
pub fn extract_file_section_names(
    &self,
) -> Result<Vec<(crate::rules::features::Feature, u64)>> {
    let mut features = vec![];
    for (name, begin, _end) in &self.report().sections {
        let clean_name = name.trim_matches('\0');
        let section = crate::rules::features::SectionFeature::new(clean_name, "")?;
        features.push((crate::rules::features::Feature::Section(section), *begin));
    }
    Ok(features)
}
/// Emits an `Export` feature per export in the report. When the export carries
/// a forwarding target, that target is used as the feature name and a
/// "forwarded export" characteristic is emitted right after it.
pub fn extract_file_export_names(&self) -> Result<Vec<(crate::rules::features::Feature, u64)>> {
    let mut features = vec![];
    for (export_name, offset, forwarded) in &self.report().exports {
        let location = *offset as u64;
        let name = match forwarded {
            Some(target) => target,
            None => export_name,
        };
        features.push((
            crate::rules::features::Feature::Export(
                crate::rules::features::ExportFeature::new(name, "")?,
            ),
            location,
        ));
        if forwarded.is_some() {
            features.push((
                crate::rules::features::Feature::Characteristic(
                    crate::rules::features::CharacteristicFeature::new("forwarded export", "")?,
                ),
                location,
            ));
        }
    }
    Ok(features)
}
/// Emits an `Import` feature for every symbol spelling that `generate_symbols`
/// derives from each (dll, function) import pair in the report.
pub fn extract_file_import_names(&self) -> Result<Vec<(crate::rules::features::Feature, u64)>> {
    let mut features = vec![];
    for (dll, function, offset) in &self.report().imports {
        let location = *offset as u64;
        let spellings = generate_symbols(&Some(dll.to_string()), &Some(function.to_string()))?;
        for spelling in spellings {
            let import = crate::rules::features::ImportFeature::new(&spelling, "")?;
            features.push((crate::rules::features::Feature::Import(import), location));
        }
    }
    Ok(features)
}
/// Emits a `FunctionName` feature for every function in the report whose
/// `function_name()` is non-empty.
fn extract_file_function_names(&self) -> Result<Vec<(crate::rules::features::Feature, u64)>> {
    let mut features = vec![];
    for (addr, func) in self.report().get_functions()? {
        let name = func.function_name();
        if !name.is_empty() {
            let named = crate::rules::features::FunctionNameFeature::new(name, "")?;
            features.push((crate::rules::features::Feature::FunctionName(named), *addr));
        }
    }
    Ok(features)
}
/// Emits a `String` feature for every string found in the raw buffer by the
/// free function `extract_file_strings`, after trimming surrounding
/// whitespace; strings that are empty after trimming are skipped.
fn extract_file_strings(&self) -> Result<Vec<(crate::rules::features::Feature, u64)>> {
    let mut features = vec![];
    for (text, addr) in extract_file_strings(self.buf())? {
        let text = text.trim();
        if !text.is_empty() {
            let string = crate::rules::features::StringFeature::new(text, "")?;
            features.push((crate::rules::features::Feature::String(string), addr));
        }
    }
    Ok(features)
}
/// Emits an "indirect call" characteristic for call instructions whose
/// formatted operands are none of:
///
/// * text starting with `0x`,
/// * text containing both `qword ptr` and `rip`,
/// * text starting with `dword ptr [0x`.
///
/// Nothing is emitted for non-calls or when no operand text is available.
pub fn extract_function_indirect_call_characteristic_features(
    &self,
    _f: &Function,
    insn: &Instruction,
) -> Result<Vec<(crate::rules::features::Feature, u64)>> {
    let mut features = vec![];
    if !insn.is_call() {
        return Ok(features);
    }
    let Some(operands) = insn.format_operands() else {
        return Ok(features);
    };

    let is_excluded = operands.starts_with("0x")
        || (operands.contains("qword ptr") && operands.contains("rip"))
        || operands.starts_with("dword ptr [0x");
    if !is_excluded {
        let indirect = crate::rules::features::CharacteristicFeature::new("indirect call", "")?;
        features.push((
            crate::rules::features::Feature::Characteristic(indirect),
            insn.offset,
        ));
    }
    Ok(features)
}
/// For call instructions, emits:
///
/// * a "calls from" characteristic per outgoing reference of the instruction,
///   located at the reference target, plus a "recursive call" characteristic
///   when the target equals the function's own offset;
/// * a "calls from" characteristic at the instruction offset when the
///   instruction has an API reference.
pub fn extract_function_calls_from(
    &self,
    f: &Function,
    insn: &Instruction,
) -> Result<Vec<(crate::rules::features::Feature, u64)>> {
    let mut features = vec![];
    if !insn.is_call() {
        return Ok(features);
    }

    if let Some(targets) = f.outrefs.get(&insn.offset) {
        for target in targets {
            let calls_from = crate::rules::features::CharacteristicFeature::new("calls from", "")?;
            features.push((
                crate::rules::features::Feature::Characteristic(calls_from),
                *target,
            ));
            if *target == f.offset {
                let recursive =
                    crate::rules::features::CharacteristicFeature::new("recursive call", "")?;
                features.push((
                    crate::rules::features::Feature::Characteristic(recursive),
                    *target,
                ));
            }
        }
    }

    if f.apirefs.contains_key(&insn.offset) {
        let calls_from = crate::rules::features::CharacteristicFeature::new("calls from", "")?;
        features.push((
            crate::rules::features::Feature::Characteristic(calls_from),
            insn.offset,
        ));
    }
    Ok(features)
}
/// Emits an "fs access" / "gs access" characteristic for every comma-separated
/// operand whose text contains `fs:` / `gs:` respectively.
pub fn extract_insn_segment_access_features(
    &self,
    _f: &Function,
    insn: &Instruction,
) -> Result<Vec<(crate::rules::features::Feature, u64)>> {
    let mut features = vec![];
    let Some(operand_text) = insn.format_operands() else {
        return Ok(features);
    };

    for operand in operand_text.split(',') {
        // Checked in this order so the fs feature precedes the gs one per operand.
        for (marker, label) in [("fs:", "fs access"), ("gs:", "gs access")] {
            if operand.contains(marker) {
                let access = crate::rules::features::CharacteristicFeature::new(label, "")?;
                features.push((
                    crate::rules::features::Feature::Characteristic(access),
                    insn.offset,
                ));
            }
        }
    }
    Ok(features)
}
/// Emits a "cross section flow" characteristic for calls and jumps whose
/// target lies in a different section than the instruction itself.
///
/// Instructions with an API reference are ignored. Targets come from the
/// function's outgoing references when present; otherwise from an operand
/// string that starts with `0x`.
pub fn extract_insn_cross_section_cflow(
    &self,
    f: &Function,
    insn: &Instruction,
) -> Result<Vec<(crate::rules::features::Feature, u64)>> {
    let mut features = vec![];
    if !(insn.is_call() || insn.is_jmp()) {
        return Ok(features);
    }
    if f.apirefs.contains_key(&insn.offset) {
        return Ok(features);
    }

    // True when the instruction and `target` resolve to different sections.
    let leaves_section = |target: &u64| -> Result<bool> {
        Ok(self.report().get_section(&insn.offset)? != self.report().get_section(target)?)
    };

    if let Some(targets) = f.outrefs.get(&insn.offset) {
        for target in targets {
            if leaves_section(target)? {
                features.push((
                    crate::rules::features::Feature::Characteristic(
                        crate::rules::features::CharacteristicFeature::new(
                            "cross section flow",
                            "",
                        )?,
                    ),
                    insn.offset,
                ));
            }
        }
    } else if let Some(operand_text) = insn.format_operands() {
        if let Some(hex) = operand_text.strip_prefix("0x") {
            let target = u64::from_str_radix(hex, 16)?;
            if leaves_section(&target)? {
                features.push((
                    crate::rules::features::Feature::Characteristic(
                        crate::rules::features::CharacteristicFeature::new(
                            "cross section flow",
                            "",
                        )?,
                    ),
                    insn.offset,
                ));
            }
        }
    }
    Ok(features)
}
/// Emits a "peb access" characteristic.
///
/// * AArch64: when `decode_ldr_str_uimm` decodes the opcode as a non-store
///   with base register number 18.
/// * Otherwise: for `push`/`mov` instructions, once per comma-separated operand
///   that contains `fs:` together with `0x30`, or `gs:` together with `0x60`.
pub fn extract_insn_peb_access_characteristic_features(
    &self,
    _f: &Function,
    insn: &Instruction,
) -> Result<Vec<(crate::rules::features::Feature, u64)>> {
    let mut features = vec![];

    if let DecodedInsn::Aarch64(a) = insn.decoded {
        let loads_via_x18 = matches!(
            decode_ldr_str_uimm(a.opcode),
            Some(op) if op.rn == 18 && !op.is_store
        );
        if loads_via_x18 {
            features.push((
                crate::rules::features::Feature::Characteristic(
                    crate::rules::features::CharacteristicFeature::new("peb access", "")?,
                ),
                insn.offset,
            ));
        }
        return Ok(features);
    }

    if !matches!(insn.mnemonic_enum(), Mnemonic::Push | Mnemonic::Mov) {
        return Ok(features);
    }
    let Some(operand_text) = insn.format_operands() else {
        return Ok(features);
    };

    for operand in operand_text.split(',') {
        let via_fs = operand.contains("fs:") && operand.contains("0x30");
        let via_gs = operand.contains("gs:") && operand.contains("0x60");
        if via_fs || via_gs {
            features.push((
                crate::rules::features::Feature::Characteristic(
                    crate::rules::features::CharacteristicFeature::new("peb access", "")?,
                ),
                insn.offset,
            ));
        }
    }
    Ok(features)
}
/// Emits a single `Mnemonic` feature at the instruction offset, preferring the
/// name from `mnemonic_aarch64()` and falling back to `format_mnemonic()`.
pub fn extract_insn_mnemonic_features(
    &self,
    _f: &Function,
    insn: &Instruction,
) -> Result<Vec<(crate::rules::features::Feature, u64)>> {
    let name = insn
        .mnemonic_aarch64()
        .unwrap_or_else(|| insn.format_mnemonic());
    let mnemonic = crate::rules::features::MnemonicFeature::new(&name, "")?;
    Ok(vec![(
        crate::rules::features::Feature::Mnemonic(mnemonic),
        insn.offset,
    )])
}
/// Classifies XOR-style instructions.
///
/// * An XOR of an operand with itself yields a `Number` feature with value 0.
/// * Any other XOR yields an "nzxor" characteristic, except (non-AArch64 only)
///   when `is_security_cookie` recognises the instruction.
///
/// AArch64 instructions are matched on the mnemonics `eor`, `eor3` and `eors`
/// and compare the register fields at bits 5..10 and 16..21 of the opcode;
/// other instructions are matched on `xor`, `xorpd`, `xorps` and `pxor` and
/// compare the first two formatted operands.
///
/// # Panics
/// Panics if a matched non-AArch64 instruction formats fewer than two operands.
pub fn extract_insn_nzxor_characteristic_features(
    &self,
    f: &Function,
    insn: &Instruction,
) -> Result<Vec<(crate::rules::features::Feature, u64)>> {
    let mut features = vec![];

    if let DecodedInsn::Aarch64(a) = insn.decoded {
        let mnemonic = insn.mnemonic_aarch64();
        if !matches!(
            mnemonic.as_deref(),
            Some("eor") | Some("eor3") | Some("eors")
        ) {
            return Ok(features);
        }
        let rn = (a.opcode >> 5) & 0x1f;
        let rm = (a.opcode >> 16) & 0x1f;
        let feature = if rn == rm {
            crate::rules::features::Feature::Number(crate::rules::features::NumberFeature::new(
                f.bitness, &0_i128, "",
            )?)
        } else {
            crate::rules::features::Feature::Characteristic(
                crate::rules::features::CharacteristicFeature::new("nzxor", "")?,
            )
        };
        features.push((feature, insn.offset));
        return Ok(features);
    }

    let is_xor = matches!(
        insn.mnemonic_enum(),
        Mnemonic::Xor | Mnemonic::Xorpd | Mnemonic::Xorps | Mnemonic::Pxor
    );
    if !is_xor {
        return Ok(features);
    }

    if let Some(operand_text) = insn.format_operands() {
        let operands: Vec<&str> = operand_text.split(',').map(|s| s.trim()).collect();
        if operands[0] == operands[1] {
            features.push((
                crate::rules::features::Feature::Number(
                    crate::rules::features::NumberFeature::new(f.bitness, &0_i128, "")?,
                ),
                insn.offset,
            ));
            return Ok(features);
        }
    }

    if !is_security_cookie(f, insn)? {
        features.push((
            crate::rules::features::Feature::Characteristic(
                crate::rules::features::CharacteristicFeature::new("nzxor", "")?,
            ),
            insn.offset,
        ));
    }
    Ok(features)
}
/// Emits a "call $+5" characteristic for call instructions whose operand text
/// is a `0x`-prefixed hexadecimal target equal to the instruction offset plus 5.
///
/// # Errors
/// Fails when the text after `0x` is not valid hexadecimal.
pub fn extract_insn_obfs_call_plus_5_characteristic_features(
    &self,
    _f: &Function,
    insn: &Instruction,
) -> Result<Vec<(crate::rules::features::Feature, u64)>> {
    let mut features = vec![];
    if !insn.is_call() {
        return Ok(features);
    }
    let Some(operand_text) = insn.format_operands() else {
        return Ok(features);
    };
    let Some(hex) = operand_text.strip_prefix("0x") else {
        return Ok(features);
    };

    let target = u64::from_str_radix(hex, 16)?;
    if target == insn.offset + 5 {
        features.push((
            crate::rules::features::Feature::Characteristic(
                crate::rules::features::CharacteristicFeature::new("call $+5", "")?,
            ),
            insn.offset,
        ));
    }
    Ok(features)
}
/// Emits memory-displacement features.
///
/// AArch64 instructions are delegated to `extract_insn_offset_features_aarch64`.
/// For other instructions, every memory operand whose base register is not
/// `EBP`/`RBP` produces an `Offset` and an `OperandOffset` feature for the
/// instruction's 64-bit memory displacement; `lea` additionally produces a
/// `Number` feature with the same value.
pub fn extract_insn_offset_features(
    &self,
    f: &Function,
    insn: &Instruction,
) -> Result<Vec<(crate::rules::features::Feature, u64)>> {
    if let DecodedInsn::Aarch64(a) = insn.decoded {
        return extract_insn_offset_features_aarch64(f, insn, a.opcode);
    }

    let emit_number = insn.mnemonic_enum() == Mnemonic::Lea;
    let mut features = vec![];
    for index in 0..insn.op_count() {
        // Skip non-memory operands and frame-pointer-relative accesses.
        if insn.op_kind(index) != OpKind::Memory
            || matches!(insn.memory_base(), Register::EBP | Register::RBP)
        {
            continue;
        }

        let disp = insn.memory_displacement64() as i64 as i128;
        features.push((
            crate::rules::features::Feature::Offset(
                crate::rules::features::OffsetFeature::new(f.bitness, &disp, "")?,
            ),
            insn.offset,
        ));
        features.push((
            crate::rules::features::Feature::OperandOffset(
                crate::rules::features::OperandOffsetFeature::new(&(index as usize), &disp, "")?,
            ),
            insn.offset,
        ));
        if emit_number {
            features.push((
                crate::rules::features::Feature::Number(
                    crate::rules::features::NumberFeature::new(f.bitness, &disp, "")?,
                ),
                insn.offset,
            ));
        }
    }
    Ok(features)
}
/// Emits a `String` feature for every string readable at an address reachable
/// (via `derefs`) from one of the instruction's data references. Strings are
/// whitespace-trimmed, skipped when empty, and stripped of trailing NULs.
pub fn extract_insn_string_features(
    &self,
    _f: &Function,
    insn: &Instruction,
) -> Result<Vec<(crate::rules::features::Feature, u64)>> {
    let report = self.report();
    let mut features = vec![];
    for data_ref in insn.get_data_refs(report)? {
        for addr in derefs(report, &data_ref)? {
            let raw_text = read_string(report, &addr)?;
            let text = raw_text.trim();
            if !text.is_empty() {
                let string = crate::rules::features::StringFeature::new(
                    text.trim_end_matches('\0'),
                    "",
                )?;
                features.push((crate::rules::features::Feature::String(string), insn.offset));
            }
        }
    }
    Ok(features)
}
/// Emits a `Bytes` feature holding up to 0x100 bytes read at every address
/// reachable (via `derefs`) from one of the instruction's data references.
/// Buffers that are all zero or pure 0x00/0xFF padding are skipped.
pub fn extract_insn_bytes_features(
    &self,
    _f: &Function,
    insn: &Instruction,
) -> Result<Vec<(crate::rules::features::Feature, u64)>> {
    let report = self.report();
    let mut features = vec![];
    for data_ref in insn.get_data_refs(report)? {
        for addr in derefs(report, &data_ref)? {
            let chunk = read_bytes(report, &addr, 0x100)?;
            let uninteresting = all_zeros(chunk)? || is_padding(chunk)?;
            if !uninteresting {
                let bytes = crate::rules::features::BytesFeature::new(chunk, "")?;
                features.push((crate::rules::features::Feature::Bytes(bytes), insn.offset));
            }
        }
    }
    Ok(features)
}
/// Parses a formatted operand as an integer literal, returning `None` when the
/// operand is not a number.
///
/// Accepted spellings (after trimming whitespace):
///
/// * `0x`-prefixed hexadecimal (`0x1f`);
/// * `h`-suffixed hexadecimal that starts with a digit, optionally signed (`0ffh`, `-10h`);
/// * explicitly signed hexadecimal or decimal (`-0x10`, `+42`);
/// * plain decimal (`42`);
/// * bare hexadecimal that starts with a digit (`1f`).
///
/// Two defects of the previous version are fixed here:
///
/// * Signed hexadecimal never parsed, because the regex capture includes the
///   `0x` prefix, which `from_str_radix` rejects. The prefix is now stripped.
/// * Register names made only of hex digits were reported as numbers (`ah`,
///   `bh`, `ch`, `dh` through the `h` suffix rule; names such as `d0` through
///   the bare-hex fallback). Both rules now require a leading digit.
fn parse_operand_to_number(&self, operand: &str) -> Option<i128> {
    // True when `s`, ignoring one optional sign, begins with an ASCII digit.
    fn is_digit_led(s: &str) -> bool {
        let unsigned = s
            .strip_prefix(|c: char| c == '+' || c == '-')
            .unwrap_or(s);
        unsigned.starts_with(|c: char| c.is_ascii_digit())
    }

    let operand = operand.trim();
    if let Some(hex) = operand.strip_prefix("0x") {
        return i128::from_str_radix(hex, 16).ok();
    }
    if let Some(hex) = operand.strip_suffix('h') {
        if !is_digit_led(hex) {
            // e.g. the registers `ah`, `bh`, `ch`, `dh`.
            return None;
        }
        return i128::from_str_radix(hex, 16).ok();
    }
    if operand.starts_with('-') || operand.starts_with('+') {
        if let Some(captures) = RE_NUMBER_HEX.captures(operand) {
            let sign = &captures["sign"];
            let number = &captures["num"];
            // The capture still carries the `0x` prefix.
            let digits = number.strip_prefix("0x").unwrap_or(number);
            let value = i128::from_str_radix(digits, 16).ok()?;
            return Some(if sign == "-" { -value } else { value });
        }
        if let Some(captures) = RE_NUMBER_INT.captures(operand) {
            let sign = &captures["sign"];
            let number = &captures["num"];
            let value = number.parse::<i128>().ok()?;
            return Some(if sign == "-" { -value } else { value });
        }
    }
    if let Ok(val) = operand.parse::<i128>() {
        return Some(val);
    }
    if !is_digit_led(operand) {
        return None;
    }
    i128::from_str_radix(operand, 16).ok()
}
/// Emits `Number` and `OperandNumber` features for every comma-separated
/// operand that `parse_operand_to_number` recognises.
///
/// `add` instructions whose first operand is `esp`/`rsp` produce no features.
/// Negative values are truncated to 32 bits for the `Number` feature, while
/// the `OperandNumber` feature keeps the parsed value.
pub fn extract_insn_number_features(
    &self,
    f: &Function,
    insn: &Instruction,
) -> Result<Vec<(crate::rules::features::Feature, u64)>> {
    let mut features = vec![];
    let Some(operand_text) = insn.format_operands() else {
        return Ok(features);
    };
    let operands: Vec<&str> = operand_text.split(',').map(|s| s.trim()).collect();

    // Stack-pointer adjustments are not interesting numbers.
    let adjusts_stack =
        insn.mnemonic_enum() == Mnemonic::Add && ["esp", "rsp"].contains(&operands[0]);
    if adjusts_stack {
        return Ok(vec![]);
    }

    for (index, operand) in operands.iter().enumerate() {
        let Some(parsed) = self.parse_operand_to_number(operand) else {
            continue;
        };
        let number_value = if parsed >= 0 {
            parsed
        } else {
            (parsed as u32) as i128
        };
        features.push((
            crate::rules::features::Feature::Number(crate::rules::features::NumberFeature::new(
                f.bitness,
                &number_value,
                "",
            )?),
            insn.offset,
        ));
        features.push((
            crate::rules::features::Feature::OperandNumber(
                crate::rules::features::OperandNumberFeature::new(&index, &parsed, "")?,
            ),
            insn.offset,
        ));
    }
    Ok(features)
}
/// Emits `Api` features for the API an instruction refers to, trying three
/// strategies in order and stopping at the first that applies:
///
/// 1. The instruction has an entry in the function's `apirefs`.
/// 2. One of the instruction's outgoing references is a key of the report's
///    `addr_to_api` map; the candidate with the highest address is used.
/// 3. The instruction has exactly one outgoing reference leading, through at
///    most five single-instruction forwarding functions, to an API thunk.
///
/// Whenever the API has a name, the feature is located at the highest address
/// that `addr_to_api` records for that name; otherwise at the fallback address
/// of the respective strategy.
///
/// This version restores the mis-encoded token `¤t_instruction` in the
/// thunk-chain lookup to `&current_instruction`.
///
/// # Errors
/// Propagates errors from `push_api_features` (including a missing API name)
/// and from the smda function accessors.
pub fn extract_insn_api_features(
    &self,
    f: &Function,
    insn: &Instruction,
) -> Result<Vec<(crate::rules::features::Feature, u64)>> {
    let mut res = vec![];
    let va = self.report().base_addr + insn.offset;

    // Strategy 1: direct API reference on this instruction.
    if let Some((dll, api)) = f.apirefs.get(&insn.offset) {
        let addr = api
            .as_deref()
            .and_then(|api_name| self.find_highest_address_for_symbol(api_name))
            .unwrap_or(va);
        self.push_api_features(&mut res, dll, api, addr)?;
        return Ok(res);
    }

    // Strategy 2: an outgoing reference that is itself a known API address.
    if let Some(targets) = f.outrefs.get(&insn.offset) {
        let mut api_candidates = Vec::new();
        for target in targets.iter().rev() {
            if let Some((dll, api)) = self.report().addr_to_api.get(target) {
                api_candidates.push((*target, dll.clone(), api.clone()));
            }
        }
        if let Some(best_candidate) = api_candidates.iter().max_by_key(|(addr, _, _)| *addr) {
            let (mut best_addr, dll, api) = best_candidate.clone();
            if let Some(api_name) = &api {
                if let Some(highest) = self.find_highest_address_for_symbol(api_name) {
                    best_addr = highest;
                }
            }
            self.push_api_features(&mut res, &dll, &api, best_addr)?;
            return Ok(res);
        }
    }

    // Strategy 3: follow a chain of single-jump forwarders towards an API thunk.
    let mut current_function = f;
    let mut current_instruction = insn;
    for _ in 0..5 {
        let Some(targets) = current_function.outrefs.get(&current_instruction.offset) else {
            break;
        };
        if targets.len() != 1 {
            break;
        }
        let chain_target = targets[0];
        let referenced_function = match self.report().get_function(chain_target) {
            Ok(func) => func,
            Err(_) => break,
        };

        if referenced_function.is_api_thunk()? {
            if let Some((dll, api)) = referenced_function.apirefs.get(&chain_target) {
                if let Some(api_name) = api {
                    if let Some(highest) = self.find_highest_address_for_symbol(api_name) {
                        self.push_api_features(&mut res, dll, api, highest)?;
                        return Ok(res);
                    }
                }
                self.push_api_features(&mut res, dll, api, chain_target)?;
            }
            break;
        }

        // Only keep following functions that consist of one forwarding instruction.
        if referenced_function.get_num_instructions()? == 1
            && referenced_function.get_num_outrefs()? == 1
        {
            current_function = referenced_function;
            current_instruction = referenced_function.get_instructions()?[0];
        } else {
            break;
        }
    }
    Ok(res)
}
/// Returns the largest key of the report's `addr_to_api` map whose API name
/// equals `symbol_name`, or `None` when no entry matches.
fn find_highest_address_for_symbol(&self, symbol_name: &str) -> Option<u64> {
    self.report()
        .addr_to_api
        .iter()
        .filter(|(_, (_, api))| api.as_deref() == Some(symbol_name))
        .map(|(addr, _)| *addr)
        .max()
}
/// Appends one `Api` feature at `va` for every symbol spelling that
/// `generate_symbols` derives from `dll` and `api`.
///
/// # Errors
/// Propagates errors from `generate_symbols` and `ApiFeature::new`.
fn push_api_features(
    &self,
    res: &mut Vec<(crate::rules::features::Feature, u64)>,
    dll: &Option<String>,
    api: &Option<String>,
    va: u64,
) -> Result<()> {
    let spellings = generate_symbols(dll, api)?;
    for spelling in &spellings {
        let api_feature = crate::rules::features::ApiFeature::new(spelling, "")?;
        res.push((crate::rules::features::Feature::Api(api_feature), va));
    }
    Ok(())
}
/// Scans `pbytes` for embedded, optionally single-byte-XOR-encoded PE images.
///
/// Scanning starts at offset 64 and stops 0x40 bytes before the end. At each
/// position the bytes at +0x3E and +0x3F must be equal; their value is taken
/// as the XOR key. A hit requires a decoded `MZ` at the position and a decoded
/// `PE` followed by two key bytes at the position plus the decoded 16-bit
/// little-endian value stored at +0x3C.
///
/// Returns `(mz_offset, pe_offset, key)` triples in ascending offset order;
/// after a hit, scanning resumes four bytes past the PE signature.
fn find_embedded_pe_headers(pbytes: &[u8]) -> Vec<(u64, u64, u8)> {
    let mut found = Vec::new();
    let limit = pbytes.len().saturating_sub(0x40);
    let mut pos = 64usize;

    while pos < limit {
        let key = pbytes[pos + 0x3E];
        let has_mz = key == pbytes[pos + 0x3F]
            && pbytes[pos] ^ key == b'M'
            && pbytes[pos + 1] ^ key == b'Z';

        if has_mz {
            // Only the low 16 bits of e_lfanew are considered.
            let e_lfanew = usize::from(u16::from_le_bytes([
                pbytes[pos + 0x3C] ^ key,
                pbytes[pos + 0x3D] ^ key,
            ]));
            let pe = pos + e_lfanew;
            let has_pe = pe + 0x18 <= limit
                && pbytes[pe] ^ key == b'P'
                && pbytes[pe + 1] ^ key == b'E'
                && pbytes[pe + 2] == key
                && pbytes[pe + 3] == key;
            if has_pe {
                found.push((pos as u64, pe as u64, key));
                pos = pe + 4;
                continue;
            }
        }
        pos += 1;
    }
    found
}
}
/// Removes one library suffix from `dll_name`: `.so.6`, otherwise `.so`,
/// otherwise `.dll` (case-sensitive). Names without such a suffix are returned
/// unchanged.
fn clean_dll_name(dll_name: &str) -> String {
    let stem = dll_name
        .strip_suffix(".so.6")
        .or_else(|| dll_name.strip_suffix(".so"))
        .or_else(|| dll_name.strip_suffix(".dll"))
        .unwrap_or(dll_name);
    stem.to_owned()
}
pub fn generate_symbols(dll: &Option<String>, symbol: &Option<String>) -> Result<Vec<String>> {
let mut res = vec![];
let symbol_name = symbol
.clone()
.ok_or_else(|| Error::InvalidRule(line!(), file!().to_string()))?;
if !symbol_name.starts_with('#') {
res.push(symbol_name.clone());
}
if let Some(dll_ref) = dll {
let dll_clean = clean_dll_name(dll_ref);
let dll_symbol = format!("{}.{}", dll_clean, symbol_name);
if dll_symbol != symbol_name {
res.push(dll_symbol);
}
}
if !symbol_name.starts_with("_Z") && (symbol_name.ends_with('A') || symbol_name.ends_with('W'))
{
let base_name = &symbol_name[..symbol_name.len() - 1];
if !res.contains(&base_name.to_string()) {
res.push(base_name.to_string());
}
if let Some(dll_ref) = dll {
let dll_clean = clean_dll_name(dll_ref);
res.push(format!("{}.{}", dll_clean, base_name));
}
}
Ok(res)
}
/// Follows a chain of 32-bit little-endian pointers starting at `p`.
///
/// Every visited address that lies within the memory image is recorded. The
/// walk ends when an address falls outside the image, when a location points
/// to itself, or after 11 addresses have been recorded.
///
/// # Errors
/// Propagates errors from the image lookup and from `read_bytes`, and fails
/// when fewer than four bytes can be read at a visited address.
pub fn derefs(report: &DisassemblyReport<'_>, p: &u64) -> Result<Vec<u64>> {
    let mut visited = vec![];
    let mut cursor = *p;
    // Eleven rounds match the original depth limit (depth 0 through 10).
    for _ in 0..=10 {
        if !report.is_addr_within_memory_image(&cursor)? {
            break;
        }
        visited.push(cursor);

        let word: [u8; 4] = read_bytes(report, &cursor, 4)?.try_into()?;
        let next = u32::from_le_bytes(word) as u64;
        if next == cursor {
            break;
        }
        cursor = next;
    }
    Ok(visited)
}
pub fn read_bytes<'a>(
report: &'a DisassemblyReport<'_>,
offset: &u64,
num_bytes: usize,
) -> Result<&'a [u8]> {
let raw = report.binary_info.raw_data;
let rva = offset - report.base_addr;
let buffer_end = raw.len();
let mut end_of_string = rva + num_bytes as u64;
if end_of_string > buffer_end as u64 {
end_of_string = buffer_end as u64;
}
if rva > buffer_end as u64 {
return Err(Error::BufferOverflowError);
}
Ok(&raw[rva as usize..end_of_string as usize])
}
/// Reads the string at address `offset`.
///
/// An ASCII string is preferred when `detect_ascii_len` reports more than one
/// byte; otherwise a UTF-16LE string is decoded when `detect_unicode_len`
/// reports more than two bytes. An empty string is returned when neither applies.
///
/// # Errors
/// Propagates detection/read errors and UTF-8 / UTF-16 decoding failures.
pub fn read_string(report: &DisassemblyReport<'_>, offset: &u64) -> Result<String> {
    let ascii_len = detect_ascii_len(report, offset)?;
    if ascii_len > 1 {
        let bytes = read_bytes(report, offset, ascii_len)?;
        let text = std::str::from_utf8(bytes)?;
        return Ok(text.to_string());
    }

    let wide_len = detect_unicode_len(report, offset)?;
    if wide_len <= 2 {
        return Ok("".to_string());
    }
    let bytes = read_bytes(report, offset, wide_len)?;
    let units: Vec<u16> = bytes
        .chunks_exact(2)
        .map(|pair| u16::from_le_bytes([pair[0], pair[1]]))
        .collect();
    Ok(std::string::String::from_utf16(&units)?)
}
/// Counts the leading bytes at address `offset` that are non-zero ASCII and
/// members of the accepted character set.
///
/// # Errors
/// Fails when `offset` is below the base address, when the relative position
/// is at or beyond the end of the raw data, or when the counted run reaches
/// the end of the raw data.
pub fn detect_ascii_len(report: &DisassemblyReport<'_>, offset: &u64) -> Result<usize> {
    const ACCEPTED: &[u8] = b"0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+, -./:;<=>?@[\\]^_`{|}~ \r\n";

    let raw = report.binary_info.raw_data;
    let rva = offset.checked_sub(report.base_addr).ok_or_else(|| {
        std::io::Error::other("Offset is out of bounds relative to the base address")
    })?;
    let start = rva as usize;
    if start >= raw.len() {
        Err(std::io::Error::other("RVA is beyond buffer length"))?;
    }

    let mut ascii_len = 0usize;
    for &ch in &raw[start..] {
        let accepted = ch != 0 && ch.is_ascii() && ACCEPTED.contains(&ch);
        if !accepted {
            break;
        }
        ascii_len += 1;
    }

    if rva + ascii_len as u64 >= raw.len() as u64 {
        Err(std::io::Error::other(
            "Buffer overflow detected while detecting ASCII length",
        ))?;
    }
    Ok(ascii_len)
}
/// Returns the byte length of the UTF-16LE run at address `offset`, or 0 when
/// there is none.
///
/// A run is a sequence of two-byte units whose low byte belongs to the
/// accepted character set and whose high byte is zero. It only counts when it
/// is followed by a two-byte zero terminator inside the raw data.
///
/// The previous version computed `offset - base_addr` unchecked, which panics
/// in debug builds when `offset` lies below the base address. Such addresses,
/// like any position outside the raw data, now yield 0.
pub fn detect_unicode_len(report: &DisassemblyReport<'_>, offset: &u64) -> Result<usize> {
    const ACCEPTED: &[u8] = b"0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+, -./:;<=>?@[\\]^_`{|}~ \r\n";

    let raw = report.binary_info.raw_data;
    let Some(rva) = offset.checked_sub(report.base_addr) else {
        return Ok(0);
    };
    if rva >= raw.len() as u64 {
        return Ok(0);
    }
    // `rva` is below `raw.len()`, so the cast cannot truncate.
    let mut pos = rva as usize;
    let mut unicode_len = 0;

    loop {
        // Both bytes of the current unit must lie inside the buffer.
        let (ch, second_char) = match (raw.get(pos), raw.get(pos + 1)) {
            (Some(&low), Some(&high)) => (low, high),
            _ => return Ok(0),
        };
        if ch < 127 && ACCEPTED.contains(&ch) && second_char == 0 {
            unicode_len += 2;
            pos += 2;
            continue;
        }
        // The run only counts when it ends in a two-byte zero terminator.
        return Ok(if ch == 0 && second_char == 0 {
            unicode_len
        } else {
            0
        });
    }
}
/// Returns `true` when every byte of `bytez` is zero (also for an empty slice).
/// This function never returns an error.
pub fn all_zeros(bytez: &[u8]) -> Result<bool> {
    let only_zeros = !bytez.iter().any(|&byte| byte != 0);
    Ok(only_zeros)
}
/// Returns `true` when every byte of `bytez` is either 0x00 or 0xFF (also for
/// an empty slice). This function never returns an error.
pub fn is_padding(bytez: &[u8]) -> Result<bool> {
    let only_filler = bytez.iter().all(|byte| matches!(*byte, 0x00 | 0xFF));
    Ok(only_filler)
}
/// Heuristically decides whether `insn` is part of a stack-cookie check.
///
/// The instruction's second formatted operand must be one of `esp`, `ebp`,
/// `rsp`, `rbp`. It then counts as a cookie access when it lies within 0x40
/// bytes after the first instruction of the first block, or within 0x40 bytes
/// before the last instruction of a block that ends in a return.
///
/// Changes from the previous version:
/// * an instruction with fewer than two operands returns `false` instead of
///   panicking on `operands[1]`;
/// * blocks without instructions are skipped instead of panicking on indexing;
/// * `last.offset - 0x40` could underflow for blocks located below 0x40; the
///   comparison is now done on the addition side with saturating arithmetic.
pub fn is_security_cookie(f: &Function, insn: &Instruction) -> Result<bool> {
    let Some(operand_text) = insn.format_operands() else {
        return Ok(false);
    };
    let second_operand = operand_text.split(',').nth(1).map(|s| s.trim());
    if !matches!(second_operand, Some("esp" | "ebp" | "rsp" | "rbp")) {
        return Ok(false);
    }

    for (index, block) in f.get_blocks()?.iter().enumerate() {
        let (Some(first), Some(last)) = (block.1.first(), block.1.last()) else {
            continue;
        };
        // Near the start of the function's first block.
        if index == 0 && insn.offset < first.offset.saturating_add(0x40) {
            return Ok(true);
        }
        // Near the end of a returning block; same test as
        // `insn.offset > last.offset - 0x40`, without the underflow.
        if last.flow_control() == FlowControl::Return
            && insn.offset.saturating_add(0x40) > last.offset
        {
            return Ok(true);
        }
    }
    Ok(false)
}
/// Collects every string of at least four characters found in `buf`.
///
/// The results of `extract_ascii_strings` come first, followed by the results
/// of `extract_unicode_strings`; each entry pairs the string with the offset
/// reported by the respective extractor.
///
/// # Errors
///
/// Propagates the first error returned by either extractor.
fn extract_file_strings(buf: &[u8]) -> Result<Vec<(String, u64)>> {
    let mut collected: Vec<(String, u64)> = Vec::new();
    collected.extend(extract_ascii_strings(buf, 4)?);
    collected.extend(extract_unicode_strings(buf, 4)?);
    Ok(collected)
}
// Body of a regex character class listing the bytes accepted as part of an
// ASCII string. `extract_ascii_strings` interpolates it between `[` and `]`,
// so regex metacharacters are escaped here (`\.`, `\[`, `\]`, `\\`); `\x60`
// is the backtick and `\t` the tab character.
const ASCII_BYTE: &str = r##" !"#$%&'()*+,-\./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ\[\]^_\x60abcdefghijklmnopqrstuvwxyz{|}\\~\t"##;
// Number of bytes `buf_filled_with` compares per step.
const SLICE_SIZE: usize = 4096;
lazy_static::lazy_static! {
// Leading byte values for which `extract_ascii_strings` checks whether the
// whole buffer is filled with that single byte (and, if so, returns nothing).
static ref REPEATS: Vec<u8> = vec![b'A', 0, 0xfe, 0xff];
}
/// Finds runs of at least `min_length` bytes from the `ASCII_BYTE` set in
/// `data`.
///
/// Each result pairs the matched text with the byte offset of the match
/// within `data`. When the first byte of `data` is listed in `REPEATS` and the
/// whole buffer consists of that one byte, nothing is extracted and an empty
/// vector is returned.
///
/// # Errors
///
/// Returns an error if the search pattern fails to compile.
pub fn extract_ascii_strings(data: &[u8], min_length: usize) -> Result<Vec<(String, u64)>> {
    // Skip buffers that are just one of the known filler bytes repeated.
    if let Some(lead) = data.first() {
        if REPEATS.contains(lead) && buf_filled_with(data, lead) {
            return Ok(vec![]);
        }
    }

    let pattern = format!(r##"([{}]{{{},}})"##, ASCII_BYTE, min_length);
    let matcher = regex::bytes::Regex::new(&pattern)?;

    let mut found = Vec::new();
    for hit in matcher.find_iter(data) {
        let text = std::string::String::from_utf8_lossy(hit.as_bytes()).into_owned();
        found.push((text, hit.start() as u64));
    }
    Ok(found)
}
/// Finds printable strings of at least `min_length` characters in `data`
/// using three passes, in this order:
///
/// 1. UTF-16 little-endian: a printable byte (`0x20..=0x7E`) followed by `0x00`;
/// 2. UTF-16 big-endian: `0x00` followed by a printable byte;
/// 3. single-byte: plain runs of printable bytes.
///
/// Each result pairs the decoded string with the byte offset of the match
/// within `data`. Strings that are blank after trimming are dropped, and the
/// remaining ones are passed through `clean_string`.
///
/// Buffers shorter than `min_length * 2` bytes yield an empty vector.
///
/// # Errors
///
/// Returns an error if one of the search patterns fails to compile.
pub fn extract_unicode_strings(data: &[u8], min_length: usize) -> Result<Vec<(String, u64)>> {
    if data.len() < min_length * 2 {
        return Ok(vec![]);
    }

    let re_le = regex::bytes::Regex::new(&format!(r"((?:[\x20-\x7E]\x00){{{},}})", min_length))?;
    let re_be = regex::bytes::Regex::new(&format!(r"((?:\x00[\x20-\x7E]){{{},}})", min_length))?;
    let re_utf8 = regex::bytes::Regex::new(&format!(r"((?:[\x20-\x7E]){{{},}})", min_length))?;

    let mut results = Vec::new();

    // Both UTF-16 patterns only ever match an even number of bytes, so
    // `chunks_exact(2)` never leaves a remainder.
    for mat in re_le.find_iter(data) {
        let utf16_units: Vec<u16> = mat
            .as_bytes()
            .chunks_exact(2)
            .map(|pair| u16::from_le_bytes([pair[0], pair[1]]))
            .collect();
        if let Ok(decoded_string) = String::from_utf16(&utf16_units) {
            results.push((decoded_string, mat.start() as u64));
        }
    }

    for mat in re_be.find_iter(data) {
        let utf16_units: Vec<u16> = mat
            .as_bytes()
            .chunks_exact(2)
            // Big-endian: the bytes are already most-significant first, so
            // they are passed in their original order. Swapping them would
            // turn `00 41` into U+4100 instead of 'A'.
            .map(|pair| u16::from_be_bytes([pair[0], pair[1]]))
            .collect();
        if let Ok(decoded_string) = String::from_utf16(&utf16_units) {
            results.push((decoded_string, mat.start() as u64));
        }
    }

    for mat in re_utf8.find_iter(data) {
        let decoded_string = String::from_utf8_lossy(mat.as_bytes()).into_owned();
        results.push((decoded_string, mat.start() as u64));
    }

    let cleaned_results = results
        .into_iter()
        .filter(|(s, _)| !s.trim().is_empty())
        .map(|(s, pos)| (clean_string(&s), pos))
        .collect::<Vec<(String, u64)>>();
    Ok(cleaned_results)
}
/// Returns a copy of `s` that keeps only ASCII graphic characters and ASCII
/// whitespace; every other character (including NUL) is removed.
fn clean_string(s: &str) -> String {
    let mut kept = String::with_capacity(s.len());
    for ch in s.chars() {
        if ch.is_ascii_graphic() || ch.is_ascii_whitespace() {
            kept.push(ch);
        }
    }
    kept
}
fn buf_filled_with(data: &[u8], character: &u8) -> bool {
let dupe_chunk = vec![*character; SLICE_SIZE];
let mut offset = 0;
while offset < data.len() {
let new_chunk = if offset + SLICE_SIZE >= data.len() {
data[offset..].to_vec()
} else {
data[offset..offset + SLICE_SIZE].to_vec()
};
if dupe_chunk[..new_chunk.len()] != new_chunk {
return false;
}
offset += SLICE_SIZE;
}
true
}
/// Classifies a Mach-O image in `buf` as [`Os::MACOS`] or [`Os::IOS`].
///
/// For a single-architecture binary the load commands are inspected; for a
/// fat binary the first architecture slice that parses as Mach-O is used.
/// Anything that cannot be classified (non-Mach-O objects, fat binaries
/// without a usable slice, binaries without version load commands, unknown
/// platform ids) falls back to [`Os::MACOS`].
///
/// # Errors
///
/// Propagates the error from `goblin::Object::parse` when `buf` cannot be
/// parsed at all.
pub(crate) fn classify_macho_os(buf: &[u8]) -> Result<Os> {
    use goblin::mach::{Mach, load_command::CommandVariant};

    // Platform identifiers compared against the `platform` field of a
    // build-version load command.
    const PLATFORM_MACOS: u32 = 1;
    const PLATFORM_IOS: u32 = 2;
    const PLATFORM_TVOS: u32 = 3;
    const PLATFORM_WATCHOS: u32 = 4;
    const PLATFORM_BRIDGEOS: u32 = 5;
    const PLATFORM_MACCATALYST: u32 = 6;
    const PLATFORM_IOSSIMULATOR: u32 = 7;
    const PLATFORM_TVOSSIMULATOR: u32 = 8;
    const PLATFORM_WATCHOSSIMULATOR: u32 = 9;
    const PLATFORM_DRIVERKIT: u32 = 10;

    /// Classifies one Mach-O slice from its load commands.
    fn classify(slice: &goblin::mach::MachO) -> Os {
        // A build-version command, when present, takes precedence.
        let build_platform = slice.load_commands.iter().find_map(|lc| match &lc.command {
            CommandVariant::BuildVersion(bv) => Some(bv.platform),
            _ => None,
        });
        if let Some(platform) = build_platform {
            return match platform {
                PLATFORM_IOS
                | PLATFORM_TVOS
                | PLATFORM_WATCHOS
                | PLATFORM_BRIDGEOS
                | PLATFORM_IOSSIMULATOR
                | PLATFORM_TVOSSIMULATOR
                | PLATFORM_WATCHOSSIMULATOR => Os::IOS,
                PLATFORM_MACOS | PLATFORM_MACCATALYST | PLATFORM_DRIVERKIT => Os::MACOS,
                _ => Os::MACOS,
            };
        }

        // Otherwise fall back to the first version-min command found.
        slice
            .load_commands
            .iter()
            .find_map(|lc| match &lc.command {
                CommandVariant::VersionMinMacosx(_) => Some(Os::MACOS),
                CommandVariant::VersionMinIphoneos(_)
                | CommandVariant::VersionMinTvos(_)
                | CommandVariant::VersionMinWatchos(_) => Some(Os::IOS),
                _ => None,
            })
            .unwrap_or(Os::MACOS)
    }

    let os = match goblin::Object::parse(buf)? {
        goblin::Object::Mach(Mach::Binary(m)) => classify(&m),
        goblin::Object::Mach(Mach::Fat(fat)) => fat
            .iter_arches()
            .enumerate()
            // Use the first slice that can be extracted as a Mach-O image.
            .find_map(|(i, _arch)| match fat.get(i) {
                Ok(goblin::mach::SingleArch::MachO(m)) => Some(classify(&m)),
                _ => None,
            })
            .unwrap_or(Os::MACOS),
        _ => Os::MACOS,
    };
    Ok(os)
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::rules::features::{CharacteristicFeature, Feature, NumberFeature};

    /// Walks every XOR-family instruction in the `data/Demo64.dll` fixture
    /// whose first two operands are textually identical and checks that the
    /// nzxor extractor reports `Number(0)` for it and does not report the
    /// `nzxor` characteristic. Fails if the fixture contains no such
    /// instruction, since nothing would have been verified.
    #[test]
    fn upstream_parity_2997_xor_self_emits_number_zero() {
        let path = "data/Demo64.dll";
        let bytes = match std::fs::read(path) {
            Ok(contents) => contents,
            Err(e) => panic!("test fixture missing: {path}: {e}"),
        };
        let extractor = Extractor::new(path, false, false, &bytes).expect("smda parse Demo64.dll");
        let report = extractor.report();
        let functions = report.get_functions().expect("smda get_functions");

        let want_zero =
            Feature::Number(NumberFeature::new(64, &0_i128, "").expect("NumberFeature::new"));
        let want_nzxor = Feature::Characteristic(
            CharacteristicFeature::new("nzxor", "").expect("CharacteristicFeature::new"),
        );

        let mut sites_checked = 0_usize;
        for smda_func in functions.values() {
            // Functions whose blocks cannot be retrieved are skipped.
            let blocks = match smda_func.get_blocks() {
                Ok(found) => found,
                Err(_) => continue,
            };
            for insn in blocks.values().flatten() {
                let is_xor_family = matches!(
                    insn.mnemonic_enum(),
                    Mnemonic::Xor | Mnemonic::Xorpd | Mnemonic::Xorps | Mnemonic::Pxor
                );
                if !is_xor_family {
                    continue;
                }
                let Some(operand_str) = insn.format_operands() else {
                    continue;
                };
                // Only instructions whose first two operands match are of interest.
                let mut parts = operand_str.split(',').map(str::trim);
                let (Some(dst), Some(src)) = (parts.next(), parts.next()) else {
                    continue;
                };
                if dst != src {
                    continue;
                }

                sites_checked += 1;
                let res = extractor
                    .extract_insn_nzxor_characteristic_features(smda_func, insn)
                    .expect("extract_insn_nzxor_characteristic_features");
                assert!(
                    res.iter().any(|(f, _)| f == &want_zero),
                    "self-XOR at {:#x} did not emit Number(0); features were {:?}",
                    insn.offset,
                    res.iter().map(|(f, _)| f).collect::<Vec<_>>(),
                );
                assert!(
                    !res.iter().any(|(f, _)| f == &want_nzxor),
                    "self-XOR at {:#x} incorrectly tagged as nzxor",
                    insn.offset,
                );
            }
        }

        assert!(
            sites_checked > 0,
            "no `xor reg, reg` site found in {path} — \
             the test cannot verify the fix; switch the fixture to a \
             binary that contains at least one self-XOR.",
        );
        eprintln!("upstream parity #2997: verified {sites_checked} self-XOR site(s) in {path}");
    }
}