#![allow(clippy::upper_case_acronyms)]
use std::{
    collections::{BTreeMap, BTreeSet},
    io::{Read, Write},
    ops::Not,
};
use anyhow::Result;
use log::error;
use serde_json::json;
use lancelot::{
analysis::dis::{self, get_first_operand, get_operand_xref},
aspace::AddressSpace,
module::Permissions,
util,
workspace::{
config::{empty, Configuration},
workspace_from_bytes,
},
RVA, VA,
};
/// Features collected from a single instruction, or merged across all
/// instructions of a function.
struct Features {
/// ASCII strings read from addresses referenced by instruction operands.
strings: BTreeSet<String>,
/// Immediate operand values that passed the noise filters.
numbers: BTreeSet<u64>,
/// Names of imports, externs, and named functions targeted by CALL/JMP instructions.
apis: BTreeSet<String>,
}
/// Report whether `insn` is a jump instruction, conditional or unconditional.
///
/// CALL is not part of this set; callers that care about calls test for
/// that mnemonic separately.
fn is_jump(insn: &dis::zydis::DecodedInstruction) -> bool {
    use dis::zydis::Mnemonic as M;

    // The unconditional form is by far the most common, so check it first.
    if matches!(insn.mnemonic, M::JMP) {
        return true;
    }

    // Every conditional jump mnemonic.
    matches!(
        insn.mnemonic,
        M::JB
            | M::JBE
            | M::JCXZ
            | M::JECXZ
            | M::JKNZD
            | M::JKZD
            | M::JL
            | M::JLE
            | M::JNB
            | M::JNBE
            | M::JNL
            | M::JNLE
            | M::JNO
            | M::JNP
            | M::JNS
            | M::JNZ
            | M::JO
            | M::JP
            | M::JRCXZ
            | M::JS
            | M::JZ
    )
}
fn extract_insn_features(
ws: &dyn lancelot::workspace::Workspace,
insn: &dis::zydis::DecodedInstruction,
va: VA,
) -> Result<Features> {
let mut strings: BTreeSet<String> = Default::default();
let mut numbers: BTreeSet<u64> = Default::default();
let mut apis: BTreeSet<String> = Default::default();
{
for op in insn
.operands
.iter()
.filter(|op| op.visibility == dis::zydis::OperandVisibility::EXPLICIT)
.take(3)
{
if is_jump(insn) {
continue;
}
if matches!(insn.mnemonic, dis::zydis::Mnemonic::CALL) {
continue;
}
let n = match op.ty {
dis::zydis::OperandType::IMMEDIATE => op.imm.value,
_ => continue,
};
if ws.module().probe_va(n, Permissions::R) {
continue;
}
let op0 = get_first_operand(insn).expect("no operands");
if matches!(insn.mnemonic, dis::zydis::Mnemonic::ADD)
&& matches!(op0.ty, dis::zydis::OperandType::REGISTER)
&& matches!(op0.reg, dis::zydis::Register::RSP | dis::zydis::Register::ESP)
{
continue;
}
if matches!(insn.mnemonic, dis::zydis::Mnemonic::SUB)
&& matches!(op0.ty, dis::zydis::OperandType::REGISTER)
&& matches!(op0.reg, dis::zydis::Register::RSP | dis::zydis::Register::ESP)
{
continue;
}
if n < 0x100 {
continue;
}
if n == 0xFFFFFFFF {
continue;
}
if n < 0x001_0000 && n % 0x1000 == 0 {
continue;
}
if n > u64::MAX - 0x1000 {
continue;
}
numbers.insert(n);
}
}
{
for op in insn
.operands
.iter()
.filter(|op| op.visibility == dis::zydis::OperandVisibility::EXPLICIT)
.take(3)
{
if is_jump(insn) {
continue;
}
if matches!(insn.mnemonic, dis::zydis::Mnemonic::CALL) {
continue;
}
let x = match get_operand_xref(ws.module(), va, insn, op) {
Err(_) => continue,
Ok(None) => continue,
Ok(Some(dis::Target::Indirect(ptr))) => ptr,
Ok(Some(dis::Target::Direct(va))) => va,
};
if ws.module().probe_va(x, Permissions::R).not() {
continue;
}
if let Ok(s) = ws.module().address_space.read_ascii(x, 4) {
strings.insert(s);
} else {
let Ok(ptr) = ws.module().read_va_at_va(x) else {
continue;
};
if let Ok(s) = ws.module().address_space.read_ascii(ptr, 4) {
strings.insert(s);
}
}
}
}
{
if matches!(insn.mnemonic, dis::zydis::Mnemonic::CALL | dis::zydis::Mnemonic::JMP) {
for op in insn
.operands
.iter()
.filter(|op| op.visibility == dis::zydis::OperandVisibility::EXPLICIT)
.take(1)
{
let x = match get_operand_xref(ws.module(), va, insn, op) {
Err(_) => continue,
Ok(None) => continue,
Ok(Some(dis::Target::Indirect(ptr))) => ptr,
Ok(Some(dis::Target::Direct(va))) => va,
};
if let Some(import) = ws.analysis().imports.get(&x) {
if let lancelot::analysis::pe::ImportedSymbol::Name(name) = &import.symbol {
let name = format!("{}!{}", import.dll, name);
apis.insert(name);
}
}
if let Some(extern_) = ws.analysis().externs.get(&x) {
if extern_.starts_with("__imp_") {
apis.insert(extern_.replace("__imp_", ""));
} else {
apis.insert(extern_.clone());
}
}
if ws.analysis().functions.contains_key(&x) {
if let Some(name) = ws.analysis().names.names_by_address.get(&x) {
if name.starts_with("sub_") {
continue;
}
apis.insert(name.clone());
}
}
}
}
}
Ok(Features { strings, numbers, apis })
}
/// Merge the instruction-level features of every basic block reachable from
/// the function at `va`.
///
/// # Errors
///
/// Returns an error when the disassembler cannot be created for the module,
/// when the bytes of a basic block cannot be read, or when feature extraction
/// fails for an instruction.
fn extract_function_features(ws: &dyn lancelot::workspace::Workspace, va: VA) -> Result<Features> {
    let mut strings: BTreeSet<String> = Default::default();
    let mut numbers: BTreeSet<u64> = Default::default();
    let mut apis: BTreeSet<String> = Default::default();

    // visit blocks in address order so processing is deterministic.
    let mut blocks = ws.cfg().get_reachable_blocks(va).collect::<Vec<_>>();
    blocks.sort_unstable_by_key(|&bb| bb.address);

    // propagate a decoder construction failure to the caller rather than panicking.
    let decoder = dis::get_disassembler(ws.module())?;

    for bb in blocks {
        let block_len = bb.length as usize;

        // read 0x10 bytes past the end of the block; presumably so the decoder
        // always has enough bytes for the final instruction - TODO confirm.
        let buf = ws.module().address_space.read_bytes(bb.address, block_len + 0x10)?;

        for (offset, insn) in dis::linear_disassemble(&decoder, &buf) {
            // anything decoded from the over-read tail is not part of this block.
            if offset >= block_len {
                break;
            }

            // offsets that fail to decode are skipped.
            if let Ok(Some(insn)) = insn {
                let va = bb.address + offset as RVA;
                let ifeatures = extract_insn_features(ws, &insn, va)?;
                strings.extend(ifeatures.strings);
                numbers.extend(ifeatures.numbers);
                apis.extend(ifeatures.apis);
            }
        }
    }

    Ok(Features { strings, numbers, apis })
}
/// A named function together with the features extracted from it.
struct FunctionDescriptor {
/// Name of the function as found in the workspace's address-to-name table.
name: String,
/// Address of the function. Stored but never read in this file, hence the lint allowance.
#[allow(dead_code)]
address: VA,
/// Features merged over the function's reachable basic blocks.
features: Features,
}
/// Function descriptors keyed by function address.
type FunctionsFeatures = BTreeMap<VA, FunctionDescriptor>;
fn extract_workspace_features(ws: &dyn lancelot::workspace::Workspace) -> Result<FunctionsFeatures> {
let descriptors = ws
.analysis()
.functions
.iter()
.filter_map(|(&va, md)| {
if md.flags.intersects(lancelot::workspace::FunctionFlags::THUNK) {
return None;
}
let Some(name) = ws.analysis().names.names_by_address.get(&va) else {
return None;
};
let Ok(features) = extract_function_features(ws, va) else {
return None;
};
Some((
va,
FunctionDescriptor {
name: name.to_string(),
address: va,
features,
},
))
})
.collect::<BTreeMap<VA, FunctionDescriptor>>();
Ok(descriptors)
}
/// Build a workspace from the raw bytes in `buf` using `config`, then extract
/// the features of its functions.
///
/// Fails when the workspace cannot be constructed from `buf` or when the
/// workspace-level extraction reports an error.
fn extract_buf_features(config: Box<dyn Configuration>, buf: &[u8]) -> Result<FunctionsFeatures> {
    let workspace = workspace_from_bytes(config, buf)?;
    let features = extract_workspace_features(&*workspace)?;
    Ok(features)
}
/// Build provenance values taken from the command line and repeated as the
/// leading columns of every output row.
struct BuildSettings {
/// Value of the `triplet` positional argument.
triplet: String,
/// Value of the `compiler` positional argument.
compiler: String,
/// Value of the `library` positional argument.
library: String,
/// Value of the `version` positional argument.
version: String,
/// Value of the `profile` positional argument.
profile: String,
}
fn output_functions_features(build: &BuildSettings, path: &str, features: &FunctionsFeatures) -> Result<()> {
for desc in features.values() {
for v in desc.features.numbers.iter() {
print!(
"{},{},{},{},{},",
build.triplet, build.compiler, build.library, build.version, build.profile
);
println!("{},{},number,0x{:08x}", path, desc.name, v);
}
for v in desc.features.apis.iter() {
print!(
"{},{},{},{},{},",
build.triplet, build.compiler, build.library, build.version, build.profile
);
println!("{},{},api,{}", path, desc.name, v);
}
for v in desc.features.strings.iter() {
print!(
"{},{},{},{},{},",
build.triplet, build.compiler, build.library, build.version, build.profile
);
println!("{},{},string,{}", path, desc.name, json!(v));
}
}
Ok(())
}
fn _main() -> Result<()> {
better_panic::install();
let matches = clap::App::new("jh")
.author("Willi Ballenthin <william.ballenthin@mandiant.com>")
.about("extract interesting features from functions")
.arg(
clap::Arg::new("verbose")
.short('v')
.long("verbose")
.multiple_occurrences(true)
.help("log verbose messages"),
)
.arg(
clap::Arg::new("quiet")
.short('q')
.long("quiet")
.help("disable informational messages"),
)
.arg(clap::Arg::new("triplet").required(true).index(1))
.arg(clap::Arg::new("compiler").required(true).index(2))
.arg(clap::Arg::new("library").required(true).index(3))
.arg(clap::Arg::new("version").required(true).index(4))
.arg(clap::Arg::new("profile").required(true).index(5))
.arg(
clap::Arg::new("input")
.required(true)
.index(6)
.help("path to file to analyze"),
)
.get_matches();
let log_level = if matches.is_present("quiet") {
log::LevelFilter::Error
} else {
match matches.occurrences_of("verbose") {
0 => log::LevelFilter::Info,
1 => log::LevelFilter::Debug,
2 => log::LevelFilter::Trace,
_ => log::LevelFilter::Trace,
}
};
let build = BuildSettings {
triplet: matches.value_of("triplet").unwrap().to_string(),
compiler: matches.value_of("compiler").unwrap().to_string(),
library: matches.value_of("library").unwrap().to_string(),
version: matches.value_of("version").unwrap().to_string(),
profile: matches.value_of("profile").unwrap().to_string(),
};
fern::Dispatch::new()
.format(move |out, message, record| {
out.finish(format_args!(
"{} [{:5}] {} {}",
chrono::Local::now().format("%Y-%m-%d %H:%M:%S"),
record.level(),
if log_level == log::LevelFilter::Trace {
record.target()
} else {
""
},
message
))
})
.level(log_level)
.chain(std::io::stderr())
.filter(|metadata| !metadata.target().starts_with("goblin::pe"))
.apply()
.expect("failed to configure logging");
let config = empty();
let filename = matches.value_of("input").unwrap();
let buf = util::read_file(filename)?;
if buf.starts_with(b"MZ") || buf.starts_with(&[0x64, 0x86]) {
let features = extract_buf_features(config.clone(), &buf)?;
println!("# triplet,compiler,library,version,profile,path,function,type,value");
output_functions_features(&build, "/", &features)?;
} else if buf.starts_with(b"!<arch>\n") {
println!("# triplet,compiler,library,version,profile,path,function,type,value");
let mut ar = ar::Archive::new(buf.as_slice());
while let Some(entry_result) = ar.next_entry() {
let Ok(mut entry) = entry_result else {
continue;
};
let Ok(path) = String::from_utf8(entry.header().identifier().to_vec()) else {
continue;
};
let path = path.replace('\\', "/");
let mut sbuf = Vec::with_capacity(entry.header().size() as usize);
entry.read_to_end(&mut sbuf)?;
let features = extract_buf_features(config.clone(), &sbuf)?;
output_functions_features(&build, &path, &features)?;
}
} else {
error!("unrecognized file format");
return Ok(());
}
Ok(())
}
/// Program entry point: run `_main`, and on failure log the error and exit
/// with a non-zero status so that shells and scripts can detect the failure.
fn main() {
    if let Err(e) = _main() {
        // debug builds log the detailed representation, release builds the short message.
        #[cfg(debug_assertions)]
        error!("{:?}", e);
        #[cfg(not(debug_assertions))]
        error!("{:}", e);

        std::process::exit(1);
    }
}