Skip to main content

tokmd_analysis_entropy/lib.rs

1use std::collections::BTreeMap;
2use std::path::{Path, PathBuf};
3
4use anyhow::Result;
5use tokmd_analysis_types::{EntropyClass, EntropyFinding, EntropyReport};
6use tokmd_types::{ExportData, FileKind, FileRow};
7
8use tokmd_analysis_util::{AnalysisLimits, normalize_path};
9
/// Per-file sample size, in bytes, used when `AnalysisLimits::max_file_bytes` is not set.
const DEFAULT_SAMPLE_BYTES: usize = 1024;
/// Maximum number of findings kept in an `EntropyReport`; the sorted list is truncated to this.
const MAX_SUSPECTS: usize = 50;
12
13pub fn build_entropy_report(
14    root: &Path,
15    files: &[PathBuf],
16    export: &ExportData,
17    limits: &AnalysisLimits,
18) -> Result<EntropyReport> {
19    let mut row_map: BTreeMap<String, &FileRow> = BTreeMap::new();
20    for row in export.rows.iter().filter(|r| r.kind == FileKind::Parent) {
21        row_map.insert(normalize_path(&row.path, root), row);
22    }
23
24    let mut suspects = Vec::new();
25    let mut total_bytes = 0u64;
26    let max_total = limits.max_bytes;
27    let per_file_limit = limits.max_file_bytes.unwrap_or(DEFAULT_SAMPLE_BYTES as u64) as usize;
28
29    for rel in files {
30        if max_total.is_some_and(|limit| total_bytes >= limit) {
31            break;
32        }
33        let rel_str = rel.to_string_lossy().replace('\\', "/");
34        let module = row_map
35            .get(&rel_str)
36            .map(|r| r.module.clone())
37            .unwrap_or_else(|| "(unknown)".to_string());
38
39        let path = root.join(rel);
40        let bytes = tokmd_content::read_head_tail(&path, per_file_limit)?;
41        total_bytes += bytes.len() as u64;
42        if bytes.is_empty() {
43            continue;
44        }
45        let entropy = tokmd_content::entropy_bits_per_byte(&bytes);
46        let class = classify_entropy(entropy);
47        if class != EntropyClass::Normal {
48            suspects.push(EntropyFinding {
49                path: rel_str,
50                module,
51                entropy_bits_per_byte: entropy,
52                sample_bytes: bytes.len() as u32,
53                class,
54            });
55        }
56    }
57
58    suspects.sort_by(|a, b| {
59        b.entropy_bits_per_byte
60            .partial_cmp(&a.entropy_bits_per_byte)
61            .unwrap_or(std::cmp::Ordering::Equal)
62            .then_with(|| a.path.cmp(&b.path))
63    });
64    suspects.truncate(MAX_SUSPECTS);
65
66    Ok(EntropyReport { suspects })
67}
68
69fn classify_entropy(entropy: f32) -> EntropyClass {
70    if entropy > 7.5 {
71        EntropyClass::High
72    } else if entropy >= 6.5 {
73        EntropyClass::Suspicious
74    } else if entropy < 2.0 {
75        EntropyClass::Low
76    } else {
77        EntropyClass::Normal
78    }
79}
80
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::tempdir;
    use tokmd_types::{ChildIncludeMode, ExportData, FileKind, FileRow};

    /// Builds an export holding one single-line `Text` parent row in `(root)` per given path.
    fn export_for_paths(paths: &[&str]) -> ExportData {
        let mut rows = Vec::with_capacity(paths.len());
        for path in paths {
            rows.push(FileRow {
                path: (*path).to_string(),
                module: "(root)".to_string(),
                lang: "Text".to_string(),
                kind: FileKind::Parent,
                code: 1,
                comments: 0,
                blanks: 0,
                lines: 1,
                bytes: 10,
                tokens: 2,
            });
        }

        ExportData {
            rows,
            module_roots: vec![],
            module_depth: 1,
            children: ChildIncludeMode::Separate,
        }
    }

    /// Writes `len` copies of `byte` to `path`.
    fn write_repeated(path: &Path, byte: u8, len: usize) {
        fs::write(path, vec![byte; len]).unwrap();
    }

    /// Writes `len` bytes to `path`, each the low byte of a fixed-seed linear congruential
    /// generator, so the content is deterministic across runs.
    fn write_pseudorandom(path: &Path, len: usize) {
        let mut state = 0x12345678u32;
        let data: Vec<u8> = (0..len)
            .map(|_| {
                state = state.wrapping_mul(1664525).wrapping_add(1013904223);
                (state & 0xFF) as u8
            })
            .collect();
        fs::write(path, data).unwrap();
    }

    #[test]
    fn detects_low_and_high_entropy() {
        let dir = tempdir().unwrap();
        write_repeated(&dir.path().join("low.txt"), b'A', 1024);
        write_pseudorandom(&dir.path().join("high.bin"), 1024);

        let names = ["low.txt", "high.bin"];
        let export = export_for_paths(&names);
        let files: Vec<PathBuf> = names.iter().map(PathBuf::from).collect();
        let report =
            build_entropy_report(dir.path(), &files, &export, &AnalysisLimits::default()).unwrap();

        // True when the report holds a finding for `path` with the expected class.
        let flagged = |path: &str, class: EntropyClass| {
            report
                .suspects
                .iter()
                .any(|f| f.path == path && f.class == class)
        };

        assert!(flagged("low.txt", EntropyClass::Low));
        assert!(flagged("high.bin", EntropyClass::High));
    }
}
153}