use super::{DetectionCategory, RecommendedAction, ScanResult, Scanner, Severity};
use serde::{Deserialize, Serialize};
use std::path::Path;
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HeuristicConfig {
pub high_entropy_threshold: f64,
pub min_entropy_size: u64,
pub analyze_elf: bool,
pub analyze_scripts: bool,
pub max_scan_size: u64,
}
impl Default for HeuristicConfig {
fn default() -> Self {
Self {
high_entropy_threshold: 7.2,
min_entropy_size: 1024,
analyze_elf: true,
analyze_scripts: true,
max_scan_size: 52_428_800, }
}
}
pub struct HeuristicEngine {
config: HeuristicConfig,
active: bool,
}
impl HeuristicEngine {
pub fn new(config: HeuristicConfig) -> Self {
Self {
config,
active: true,
}
}
pub fn shannon_entropy(data: &[u8]) -> f64 {
if data.is_empty() {
return 0.0;
}
let mut freq = [0u64; 256];
for &byte in data {
freq[byte as usize] += 1;
}
let len = data.len() as f64;
let mut entropy = 0.0;
for &count in &freq {
if count > 0 {
let p = count as f64 / len;
entropy -= p * p.log2();
}
}
entropy
}
fn check_entropy(&self, data: &[u8], path: &Path) -> Vec<ScanResult> {
if (data.len() as u64) < self.config.min_entropy_size {
return Vec::new();
}
let entropy = Self::shannon_entropy(data);
if entropy > self.config.high_entropy_threshold {
vec![ScanResult::new(
"heuristic_engine",
path.to_string_lossy(),
Severity::Medium,
DetectionCategory::HeuristicAnomaly {
rule: "high_entropy".to_string(),
},
format!(
"Suspiciously high entropy ({:.2}/8.0) — file may be packed, encrypted, or compressed",
entropy
),
0.7,
RecommendedAction::Alert,
)]
} else {
Vec::new()
}
}
fn check_elf_header(&self, data: &[u8], path: &Path) -> Vec<ScanResult> {
if !self.config.analyze_elf || data.len() < 64 {
return Vec::new();
}
if data[0] != 0x7F || data[1] != b'E' || data[2] != b'L' || data[3] != b'F' {
return Vec::new();
}
let mut results = Vec::new();
let is_64bit = data[4] == 2;
let little_endian = data[5] == 1;
if is_64bit && little_endian && data.len() >= 64 {
let e_shnum = u16::from_le_bytes([data[60], data[61]]);
if e_shnum == 0 {
results.push(ScanResult::new(
"heuristic_engine",
path.to_string_lossy(),
Severity::Low,
DetectionCategory::HeuristicAnomaly {
rule: "elf_stripped".to_string(),
},
"ELF binary has no section headers (stripped) — may be intentionally obfuscated",
0.3,
RecommendedAction::LogOnly,
));
}
let e_phoff = u64::from_le_bytes([
data[32], data[33], data[34], data[35],
data[36], data[37], data[38], data[39],
]) as usize;
let e_phnum = u16::from_le_bytes([data[56], data[57]]) as usize;
let phent_size = 56usize;
for i in 0..e_phnum {
let offset = e_phoff + i * phent_size;
if offset + phent_size > data.len() {
break;
}
let p_type = u32::from_le_bytes([
data[offset], data[offset + 1], data[offset + 2], data[offset + 3],
]);
let p_flags = u32::from_le_bytes([
data[offset + 4], data[offset + 5], data[offset + 6], data[offset + 7],
]);
if p_type == 1 && (p_flags & 0x3) == 0x3 {
results.push(ScanResult::new(
"heuristic_engine",
path.to_string_lossy(),
Severity::High,
DetectionCategory::HeuristicAnomaly {
rule: "elf_wx_segment".to_string(),
},
format!(
"ELF has write+execute segment (PT_LOAD #{}) — common in packed malware",
i
),
0.8,
RecommendedAction::Alert,
));
}
}
} else if !is_64bit && little_endian && data.len() >= 52 {
let e_shnum = u16::from_le_bytes([data[48], data[49]]);
if e_shnum == 0 {
results.push(ScanResult::new(
"heuristic_engine",
path.to_string_lossy(),
Severity::Low,
DetectionCategory::HeuristicAnomaly {
rule: "elf_stripped".to_string(),
},
"32-bit ELF binary has no section headers (stripped)",
0.3,
RecommendedAction::LogOnly,
));
}
}
results
}
fn check_file_type_mismatch(&self, data: &[u8], path: &Path) -> Vec<ScanResult> {
if data.len() < 4 {
return Vec::new();
}
let ext = path
.extension()
.map(|e| e.to_string_lossy().to_lowercase())
.unwrap_or_default();
if ext.is_empty() {
return Vec::new();
}
let actual_type = detect_magic_type(data);
let claimed_type = match ext.as_str() {
"pdf" => Some("pdf"),
"png" => Some("png"),
"jpg" | "jpeg" => Some("jpeg"),
"gif" => Some("gif"),
"zip" | "jar" | "docx" | "xlsx" | "pptx" => Some("zip"),
"gz" | "tgz" => Some("gzip"),
"exe" | "dll" | "sys" => Some("pe"),
"elf" | "bin" => Some("elf"),
"bmp" => Some("bmp"),
_ => None,
};
let claimed = match claimed_type {
Some(c) => c,
None => return Vec::new(),
};
let actual = match actual_type {
Some(a) => a,
None => return Vec::new(),
};
if claimed == actual {
return Vec::new();
}
let is_exe_disguise = (claimed == "pdf" || claimed == "png" || claimed == "jpeg"
|| claimed == "gif" || claimed == "bmp" || claimed == "zip")
&& (actual == "pe" || actual == "elf");
let severity = if is_exe_disguise {
Severity::High
} else {
Severity::Medium
};
vec![ScanResult::new(
"heuristic_engine",
path.to_string_lossy(),
severity,
DetectionCategory::HeuristicAnomaly {
rule: "file_type_mismatch".to_string(),
},
format!(
"Extension claims '{}' ({}) but magic bytes indicate '{}' — possible disguised executable",
ext, claimed, actual
),
0.9,
RecommendedAction::Quarantine {
source_path: path.to_path_buf(),
},
)]
}
fn check_script_obfuscation(&self, data: &[u8], path: &Path) -> Vec<ScanResult> {
if !self.config.analyze_scripts {
return Vec::new();
}
let ext = path
.extension()
.map(|e| e.to_string_lossy().to_lowercase())
.unwrap_or_default();
let is_script = matches!(
ext.as_str(),
"sh" | "bash" | "py" | "ps1" | "js" | "rb" | "pl" | "php" | "vbs"
);
if !is_script {
return Vec::new();
}
let text = String::from_utf8_lossy(data);
let mut results = Vec::new();
let b64_re = regex::Regex::new(r"[A-Za-z0-9+/]{200,}={0,2}").unwrap();
if b64_re.is_match(&text) {
results.push(ScanResult::new(
"heuristic_engine",
path.to_string_lossy(),
Severity::Medium,
DetectionCategory::HeuristicAnomaly {
rule: "script_base64_block".to_string(),
},
"Script contains large base64-encoded block (>200 chars) — possible payload hiding",
0.7,
RecommendedAction::Alert,
));
}
let hex_re = regex::Regex::new(r"(\\x[0-9a-fA-F]{2}){50,}").unwrap();
if hex_re.is_match(&text) {
results.push(ScanResult::new(
"heuristic_engine",
path.to_string_lossy(),
Severity::Medium,
DetectionCategory::HeuristicAnomaly {
rule: "script_hex_payload".to_string(),
},
"Script contains long hex-encoded byte sequence — possible shellcode",
0.75,
RecommendedAction::Alert,
));
}
let eval_re =
regex::Regex::new(r"(?i)(eval|exec)\s*\(.*?(base64|decode|fromcharcode|chr\()").unwrap();
if eval_re.is_match(&text) {
results.push(ScanResult::new(
"heuristic_engine",
path.to_string_lossy(),
Severity::Medium,
DetectionCategory::HeuristicAnomaly {
rule: "script_eval_encoded".to_string(),
},
"Script uses eval/exec with encoded arguments — common obfuscation technique",
0.8,
RecommendedAction::Alert,
));
}
let rev_re = regex::Regex::new(r"(?i)(rev\s|reverse\(|\[::-1\]|\.reverse\(\))").unwrap();
if rev_re.is_match(&text) {
let has_exec = regex::Regex::new(r"(?i)(eval|exec|system|passthru|shell_exec)")
.unwrap()
.is_match(&text);
if has_exec {
results.push(ScanResult::new(
"heuristic_engine",
path.to_string_lossy(),
Severity::Medium,
DetectionCategory::HeuristicAnomaly {
rule: "script_reversal_exec".to_string(),
},
"Script combines string reversal with code execution — obfuscation technique",
0.65,
RecommendedAction::Alert,
));
}
}
results
}
fn check_embedded_executable(&self, data: &[u8], path: &Path) -> Vec<ScanResult> {
let ext = path
.extension()
.map(|e| e.to_string_lossy().to_lowercase())
.unwrap_or_default();
let is_document = matches!(
ext.as_str(),
"pdf" | "doc" | "docx" | "xls" | "xlsx" | "ppt" | "pptx" | "csv" | "txt" | "rtf" | "odt" | "ods"
);
if !is_document {
return Vec::new();
}
let search_start = 1024.min(data.len());
let search_data = &data[search_start..];
let mut results = Vec::new();
for (i, window) in search_data.windows(2).enumerate() {
if window[0] == 0x4D && window[1] == 0x5A {
let offset = search_start + i;
let remaining = &data[offset..];
if remaining.len() > 64 {
results.push(ScanResult::new(
"heuristic_engine",
path.to_string_lossy(),
Severity::High,
DetectionCategory::HeuristicAnomaly {
rule: "embedded_pe_executable".to_string(),
},
format!(
"Windows PE executable (MZ header) embedded at offset {} in {} file",
offset, ext
),
0.85,
RecommendedAction::Quarantine {
source_path: path.to_path_buf(),
},
));
break; }
}
}
if search_data.len() >= 4 {
for (i, window) in search_data.windows(4).enumerate() {
if window == b"\x7FELF" {
let offset = search_start + i;
results.push(ScanResult::new(
"heuristic_engine",
path.to_string_lossy(),
Severity::High,
DetectionCategory::HeuristicAnomaly {
rule: "embedded_elf_executable".to_string(),
},
format!(
"Linux ELF executable embedded at offset {} in {} file",
offset, ext
),
0.85,
RecommendedAction::Quarantine {
source_path: path.to_path_buf(),
},
));
break;
}
}
}
if search_data.len() >= 2 {
for (i, window) in search_data.windows(2).enumerate() {
if window == b"#!" {
let offset = search_start + i;
if offset + 3 < data.len() && data[offset + 2] == b'/' {
results.push(ScanResult::new(
"heuristic_engine",
path.to_string_lossy(),
Severity::Medium,
DetectionCategory::HeuristicAnomaly {
rule: "embedded_script".to_string(),
},
format!(
"Script shebang (#!) found at offset {} in {} file",
offset, ext
),
0.6,
RecommendedAction::Alert,
));
break;
}
}
}
}
results
}
}
fn detect_magic_type(data: &[u8]) -> Option<&'static str> {
if data.len() < 4 {
return None;
}
if data.starts_with(b"%PDF") {
Some("pdf")
} else if data.starts_with(&[0x89, b'P', b'N', b'G']) {
Some("png")
} else if data.starts_with(&[0xFF, 0xD8, 0xFF]) {
Some("jpeg")
} else if data.starts_with(b"GIF8") {
Some("gif")
} else if data.starts_with(&[0x50, 0x4B, 0x03, 0x04]) {
Some("zip")
} else if data.starts_with(&[0x1F, 0x8B]) {
Some("gzip")
} else if data.starts_with(&[0x4D, 0x5A]) {
Some("pe")
} else if data.starts_with(&[0x7F, b'E', b'L', b'F']) {
Some("elf")
} else if data.starts_with(&[0x42, 0x4D]) {
Some("bmp")
} else {
None
}
}
#[async_trait::async_trait]
impl Scanner for HeuristicEngine {
fn name(&self) -> &str {
"heuristic_engine"
}
fn is_active(&self) -> bool {
self.active
}
async fn scan_file(&self, path: &Path) -> Vec<ScanResult> {
let metadata = match std::fs::metadata(path) {
Ok(m) => m,
Err(_) => return Vec::new(),
};
if metadata.len() > self.config.max_scan_size {
return Vec::new();
}
let data = match std::fs::read(path) {
Ok(d) => d,
Err(_) => return Vec::new(),
};
let mut results = Vec::new();
results.extend(self.check_entropy(&data, path));
results.extend(self.check_elf_header(&data, path));
results.extend(self.check_file_type_mismatch(&data, path));
results.extend(self.check_script_obfuscation(&data, path));
results.extend(self.check_embedded_executable(&data, path));
results
}
async fn scan_bytes(&self, data: &[u8], label: &str) -> Vec<ScanResult> {
let path = Path::new(label);
let mut results = Vec::new();
results.extend(self.check_entropy(data, path));
results.extend(self.check_script_obfuscation(data, path));
results
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn entropy_all_zeros() {
let data = vec![0u8; 1024];
let entropy = HeuristicEngine::shannon_entropy(&data);
assert!((entropy - 0.0).abs() < 0.001);
}
#[test]
fn entropy_uniform_random() {
let mut data = Vec::with_capacity(256 * 100);
for _ in 0..100 {
for b in 0..=255u8 {
data.push(b);
}
}
let entropy = HeuristicEngine::shannon_entropy(&data);
assert!((entropy - 8.0).abs() < 0.001, "entropy was {}", entropy);
}
#[test]
fn entropy_normal_text() {
let text = b"The quick brown fox jumps over the lazy dog. This is a normal English sentence with typical entropy levels for natural language text content.";
let entropy = HeuristicEngine::shannon_entropy(text);
assert!(entropy > 3.0 && entropy < 6.0, "entropy was {}", entropy);
}
#[test]
fn entropy_empty() {
assert_eq!(HeuristicEngine::shannon_entropy(&[]), 0.0);
}
#[test]
fn elf_header_detection() {
let mut elf = vec![0u8; 128];
elf[0] = 0x7F; elf[1] = b'E'; elf[2] = b'L'; elf[3] = b'F'; elf[4] = 2; elf[5] = 1; elf[6] = 1; elf[60] = 0; elf[61] = 0;
let engine = HeuristicEngine::new(HeuristicConfig::default());
let results = engine.check_elf_header(&elf, Path::new("/tmp/test.elf"));
assert!(!results.is_empty());
assert!(results.iter().any(|r| r.description.contains("stripped")));
}
#[test]
fn file_type_mismatch_pdf_with_mz() {
let mut data = vec![0x4D, 0x5A]; data.extend_from_slice(&[0u8; 100]);
let engine = HeuristicEngine::new(HeuristicConfig::default());
let results = engine.check_file_type_mismatch(&data, Path::new("/tmp/report.pdf"));
assert!(!results.is_empty());
assert_eq!(results[0].severity, Severity::High);
}
#[test]
fn file_type_mismatch_clean_pdf() {
let mut data = b"%PDF-1.4 ".to_vec();
data.extend_from_slice(&[0u8; 100]);
let engine = HeuristicEngine::new(HeuristicConfig::default());
let results = engine.check_file_type_mismatch(&data, Path::new("/tmp/report.pdf"));
assert!(results.is_empty());
}
#[test]
fn script_base64_obfuscation() {
let mut script = b"#!/bin/bash\neval $(echo '".to_vec();
script.extend_from_slice(&[b'A'; 250]);
script.extend_from_slice(b"==' | base64 -d)\n");
let engine = HeuristicEngine::new(HeuristicConfig::default());
let results = engine.check_script_obfuscation(&script, Path::new("/tmp/evil.sh"));
assert!(!results.is_empty());
}
#[test]
fn clean_python_script_passes() {
let script = b"#!/usr/bin/env python3\nimport os\nprint('hello world')\n";
let engine = HeuristicEngine::new(HeuristicConfig::default());
let results = engine.check_script_obfuscation(script, Path::new("/tmp/app.py"));
assert!(results.is_empty());
}
#[test]
fn embedded_elf_in_pdf() {
let mut data = b"%PDF-1.4 some pdf content here padding ".to_vec();
data.extend_from_slice(&[b' '; 1024]);
data.extend_from_slice(&[0x7F, b'E', b'L', b'F']);
data.extend_from_slice(&[0u8; 100]);
let engine = HeuristicEngine::new(HeuristicConfig::default());
let results = engine.check_embedded_executable(&data, Path::new("/tmp/report.pdf"));
assert!(!results.is_empty());
assert!(results.iter().any(|r| r.description.contains("ELF")));
}
#[test]
fn max_scan_size_respected() {
let config = HeuristicConfig {
max_scan_size: 100,
..Default::default()
};
let engine = HeuristicEngine::new(config);
let dir = std::env::temp_dir().join("nexus-heur-test");
let _ = std::fs::create_dir_all(&dir);
let path = dir.join("large.bin");
std::fs::write(&path, &[0xAA; 200]).unwrap();
let rt = tokio::runtime::Runtime::new().unwrap();
let results = rt.block_on(engine.scan_file(&path));
assert!(results.is_empty());
let _ = std::fs::remove_dir_all(&dir);
}
#[test]
fn config_defaults_are_reasonable() {
let config = HeuristicConfig::default();
assert!(config.high_entropy_threshold > 7.0);
assert!(config.high_entropy_threshold < 8.0);
assert!(config.min_entropy_size > 0);
assert!(config.max_scan_size > 1_000_000);
assert!(config.analyze_elf);
assert!(config.analyze_scripts);
}
#[test]
fn detect_magic_types() {
assert_eq!(detect_magic_type(b"%PDF-1.4"), Some("pdf"));
assert_eq!(detect_magic_type(&[0x89, b'P', b'N', b'G']), Some("png"));
assert_eq!(detect_magic_type(&[0xFF, 0xD8, 0xFF, 0xE0]), Some("jpeg"));
assert_eq!(detect_magic_type(b"GIF89a"), Some("gif"));
assert_eq!(detect_magic_type(&[0x4D, 0x5A, 0x90, 0x00]), Some("pe"));
assert_eq!(detect_magic_type(&[0x7F, b'E', b'L', b'F']), Some("elf"));
assert_eq!(detect_magic_type(b"Hello"), None);
}
}