use std::collections::BTreeMap;
use std::path::{Path, PathBuf};
use anyhow::Result;
use tokmd_analysis_types::{EntropyClass, EntropyFinding, EntropyReport};
use tokmd_types::{ExportData, FileKind, FileRow};
use tokmd_analysis_types::{AnalysisLimits, normalize_path};
const DEFAULT_SAMPLE_BYTES: usize = 1024;
const MAX_SUSPECTS: usize = 50;
pub(crate) fn build_entropy_report(
root: &Path,
files: &[PathBuf],
export: &ExportData,
limits: &AnalysisLimits,
) -> Result<EntropyReport> {
let mut row_map: BTreeMap<String, &FileRow> = BTreeMap::new();
for row in export.rows.iter().filter(|r| r.kind == FileKind::Parent) {
row_map.insert(normalize_path(&row.path, root), row);
}
let mut suspects = Vec::new();
let mut total_bytes = 0u64;
let max_total = limits.max_bytes;
let per_file_limit = limits.max_file_bytes.unwrap_or(DEFAULT_SAMPLE_BYTES as u64) as usize;
for rel in files {
if max_total.is_some_and(|limit| total_bytes >= limit) {
break;
}
let rel_str = rel.to_string_lossy().replace('\\', "/");
let module = row_map
.get(&rel_str)
.map(|r| r.module.clone())
.unwrap_or_else(|| "(unknown)".to_string());
let path = root.join(rel);
let bytes = crate::content::io::read_head_tail(&path, per_file_limit)?;
total_bytes += bytes.len() as u64;
if bytes.is_empty() {
continue;
}
let entropy = crate::content::io::entropy_bits_per_byte(&bytes);
let class = classify_entropy(entropy);
if class != EntropyClass::Normal {
suspects.push(EntropyFinding {
path: rel_str,
module,
entropy_bits_per_byte: entropy,
sample_bytes: bytes.len() as u32,
class,
});
}
}
suspects.sort_by(|a, b| {
b.entropy_bits_per_byte
.partial_cmp(&a.entropy_bits_per_byte)
.unwrap_or(std::cmp::Ordering::Equal)
.then_with(|| a.path.cmp(&b.path))
});
suspects.truncate(MAX_SUSPECTS);
Ok(EntropyReport { suspects })
}
#[cfg(test)]
#[path = "tests.rs"]
mod moved_tests;
fn classify_entropy(entropy: f32) -> EntropyClass {
if entropy > 7.5 {
EntropyClass::High
} else if entropy >= 6.5 {
EntropyClass::Suspicious
} else if entropy < 2.0 {
EntropyClass::Low
} else {
EntropyClass::Normal
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::fs;
use tempfile::tempdir;
use tokmd_types::{ChildIncludeMode, ExportData, FileKind, FileRow};
fn export_for_paths(paths: &[&str]) -> ExportData {
let rows = paths
.iter()
.map(|p| FileRow {
path: (*p).to_string(),
module: "(root)".to_string(),
lang: "Text".to_string(),
kind: FileKind::Parent,
code: 1,
comments: 0,
blanks: 0,
lines: 1,
bytes: 10,
tokens: 2,
})
.collect();
ExportData {
rows,
module_roots: vec![],
module_depth: 1,
children: ChildIncludeMode::Separate,
}
}
fn write_repeated(path: &Path, byte: u8, len: usize) {
let data = vec![byte; len];
fs::write(path, data).unwrap();
}
fn write_pseudorandom(path: &Path, len: usize) {
let mut data = Vec::with_capacity(len);
let mut x = 0x12345678u32;
for _ in 0..len {
x = x.wrapping_mul(1664525).wrapping_add(1013904223);
data.push((x & 0xFF) as u8);
}
fs::write(path, data).unwrap();
}
#[test]
fn detects_low_and_high_entropy() {
let dir = tempdir().unwrap();
let low = dir.path().join("low.txt");
let high = dir.path().join("high.bin");
write_repeated(&low, b'A', 1024);
write_pseudorandom(&high, 1024);
let export = export_for_paths(&["low.txt", "high.bin"]);
let files = vec![PathBuf::from("low.txt"), PathBuf::from("high.bin")];
let report =
build_entropy_report(dir.path(), &files, &export, &AnalysisLimits::default()).unwrap();
assert!(
report
.suspects
.iter()
.any(|f| f.path == "low.txt" && f.class == EntropyClass::Low)
);
assert!(
report
.suspects
.iter()
.any(|f| f.path == "high.bin" && f.class == EntropyClass::High)
);
}
}