tokmd_analysis_entropy/
lib.rs1use std::collections::BTreeMap;
2use std::path::{Path, PathBuf};
3
4use anyhow::Result;
5use tokmd_analysis_types::{EntropyClass, EntropyFinding, EntropyReport};
6use tokmd_types::{ExportData, FileKind, FileRow};
7
8use tokmd_analysis_util::{AnalysisLimits, normalize_path};
9
10const DEFAULT_SAMPLE_BYTES: usize = 1024;
11const MAX_SUSPECTS: usize = 50;
12
13pub fn build_entropy_report(
14 root: &Path,
15 files: &[PathBuf],
16 export: &ExportData,
17 limits: &AnalysisLimits,
18) -> Result<EntropyReport> {
19 let mut row_map: BTreeMap<String, &FileRow> = BTreeMap::new();
20 for row in export.rows.iter().filter(|r| r.kind == FileKind::Parent) {
21 row_map.insert(normalize_path(&row.path, root), row);
22 }
23
24 let mut suspects = Vec::new();
25 let mut total_bytes = 0u64;
26 let max_total = limits.max_bytes;
27 let per_file_limit = limits.max_file_bytes.unwrap_or(DEFAULT_SAMPLE_BYTES as u64) as usize;
28
29 for rel in files {
30 if max_total.is_some_and(|limit| total_bytes >= limit) {
31 break;
32 }
33 let rel_str = rel.to_string_lossy().replace('\\', "/");
34 let module = row_map
35 .get(&rel_str)
36 .map(|r| r.module.clone())
37 .unwrap_or_else(|| "(unknown)".to_string());
38
39 let path = root.join(rel);
40 let bytes = tokmd_content::read_head_tail(&path, per_file_limit)?;
41 total_bytes += bytes.len() as u64;
42 if bytes.is_empty() {
43 continue;
44 }
45 let entropy = tokmd_content::entropy_bits_per_byte(&bytes);
46 let class = classify_entropy(entropy);
47 if class != EntropyClass::Normal {
48 suspects.push(EntropyFinding {
49 path: rel_str,
50 module,
51 entropy_bits_per_byte: entropy,
52 sample_bytes: bytes.len() as u32,
53 class,
54 });
55 }
56 }
57
58 suspects.sort_by(|a, b| {
59 b.entropy_bits_per_byte
60 .partial_cmp(&a.entropy_bits_per_byte)
61 .unwrap_or(std::cmp::Ordering::Equal)
62 .then_with(|| a.path.cmp(&b.path))
63 });
64 suspects.truncate(MAX_SUSPECTS);
65
66 Ok(EntropyReport { suspects })
67}
68
69fn classify_entropy(entropy: f32) -> EntropyClass {
70 if entropy > 7.5 {
71 EntropyClass::High
72 } else if entropy >= 6.5 {
73 EntropyClass::Suspicious
74 } else if entropy < 2.0 {
75 EntropyClass::Low
76 } else {
77 EntropyClass::Normal
78 }
79}
80
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::tempdir;
    use tokmd_types::{ChildIncludeMode, ExportData, FileKind, FileRow};

    // Builds a minimal export holding one parent row per path, all in module "(root)".
    fn export_for_paths(paths: &[&str]) -> ExportData {
        fn parent_row(path: &str) -> FileRow {
            FileRow {
                path: path.to_string(),
                module: "(root)".to_string(),
                lang: "Text".to_string(),
                kind: FileKind::Parent,
                code: 1,
                comments: 0,
                blanks: 0,
                lines: 1,
                bytes: 10,
                tokens: 2,
            }
        }

        ExportData {
            rows: paths.iter().copied().map(parent_row).collect(),
            module_roots: vec![],
            module_depth: 1,
            children: ChildIncludeMode::Separate,
        }
    }

    // Writes `len` copies of `byte`: a single repeated symbol.
    fn write_repeated(path: &Path, byte: u8, len: usize) {
        fs::write(path, vec![byte; len]).unwrap();
    }

    // Writes `len` bytes taken from the low byte of a fixed-seed linear congruential
    // generator, so the fixture is deterministic.
    fn write_pseudorandom(path: &Path, len: usize) {
        let mut state = 0x12345678u32;
        let data: Vec<u8> = std::iter::repeat_with(|| {
            state = state.wrapping_mul(1664525).wrapping_add(1013904223);
            (state & 0xFF) as u8
        })
        .take(len)
        .collect();
        fs::write(path, data).unwrap();
    }

    // True when the report contains a finding for `path` with the given class.
    fn has_finding(report: &EntropyReport, path: &str, class: EntropyClass) -> bool {
        report
            .suspects
            .iter()
            .any(|f| f.path == path && f.class == class)
    }

    #[test]
    fn detects_low_and_high_entropy() {
        let dir = tempdir().unwrap();
        write_repeated(&dir.path().join("low.txt"), b'A', 1024);
        write_pseudorandom(&dir.path().join("high.bin"), 1024);

        let export = export_for_paths(&["low.txt", "high.bin"]);
        let files = vec![PathBuf::from("low.txt"), PathBuf::from("high.bin")];
        let report =
            build_entropy_report(dir.path(), &files, &export, &AnalysisLimits::default()).unwrap();

        assert!(has_finding(&report, "low.txt", EntropyClass::Low));
        assert!(has_finding(&report, "high.bin", EntropyClass::High));
    }
}