1use statrs::function::erf::erfc;
14use std::f64::consts::SQRT_2;
15
/// Randomness statistics gathered over a byte buffer by [`EntStats::from_data`].
#[derive(Debug, Clone)]
pub struct EntStats {
    /// Shannon entropy in bits per symbol (per bit in bit mode, per byte otherwise).
    pub entropy: f64,
    /// `100 * (1 - entropy / max_entropy)`, where the maximum is 1 bit in bit
    /// mode and 8 bits in byte mode.
    pub compression_percent: f64,
    /// Chi-square statistic of the observed symbol counts against a uniform
    /// distribution.
    pub chisquare: f64,
    /// p-value reported together with [`EntStats::chisquare`].
    pub p_value: f64,
    /// Arithmetic mean of the byte values.
    pub mean: f64,
    /// Monte Carlo estimate of pi derived from consecutive 6-byte groups.
    pub pi_estimate: f64,
    /// Correlation between each byte and its successor; `-99999.0` marks an
    /// undefined result (fewer than two bytes, or a zero denominator).
    pub serial_correlation: f64,
    /// `(byte value, count, fraction)` for every byte value; populated only in
    /// byte mode.
    pub byte_frequencies: Option<Vec<(u8, usize, f64)>>,
    /// `(count, fraction)` for bit value 0 and bit value 1, in that order;
    /// populated only in bit mode.
    pub bit_frequencies: Option<[(usize, f64); 2]>,
}
38
39impl EntStats {
40 pub fn from_data(data: &[u8], bit_mode: bool) -> Self {
42 let entropy = calculate_entropy(data, bit_mode);
43 let compression_percent = if bit_mode {
44 100.0 * (1.0 - entropy)
45 } else {
46 100.0 * (1.0 - entropy / 8.0)
47 };
48 let (chisquare, p_value) = calculate_chisquare(data, bit_mode);
49 let mean = calculate_mean(data);
50 let pi_estimate = estimate_pi(data);
51 let serial_correlation = serial_correlation(data);
52
53 let (byte_frequencies, bit_frequencies) = if bit_mode {
54 (None, Some(bit_occurrences(data)))
55 } else {
56 (Some(byte_occurrences(data)), None)
57 };
58
59 EntStats {
60 entropy,
61 compression_percent,
62 chisquare,
63 p_value,
64 mean,
65 pi_estimate,
66 serial_correlation,
67 byte_frequencies,
68 bit_frequencies,
69 }
70 }
71}
72
/// Shannon entropy of `data`, in bits per symbol.
///
/// Symbols are individual bits when `bit_mode` is set (result in `0.0..=1.0`)
/// and whole bytes otherwise (result in `0.0..=8.0`). Symbols that never occur
/// contribute nothing to the sum, so empty input evaluates to zero.
fn calculate_entropy(data: &[u8], bit_mode: bool) -> f64 {
    // Occurrence count per symbol: two bins for bits, 256 bins for bytes.
    let histogram: Vec<usize> = if bit_mode {
        let ones: usize = data.iter().map(|byte| byte.count_ones() as usize).sum();
        vec![data.len() * 8 - ones, ones]
    } else {
        let mut bins = vec![0usize; 256];
        data.iter().for_each(|&byte| bins[usize::from(byte)] += 1);
        bins
    };

    let symbol_count = if bit_mode {
        8.0 * data.len() as f64
    } else {
        data.len() as f64
    };

    histogram
        .into_iter()
        .map(|hits| hits as f64 / symbol_count)
        // Drops unseen symbols (and the NaN ratios produced by empty input).
        .filter(|&share| share > 0.0)
        .map(|share| -share * share.log2())
        .sum()
}
107
108fn calculate_chisquare(data: &[u8], bit_mode: bool) -> (f64, f64) {
109 if bit_mode {
110 let mut count = [0usize; 2];
111 for &b in data {
112 for i in 0..8 {
113 count[(b >> i) as usize & 1] += 1;
114 }
115 }
116 let total = data.len() * 8;
117 let expected = total as f64 / 2.0;
118 let chisq = count
119 .iter()
120 .map(|&obs| {
121 let diff = obs as f64 - expected;
122 diff * diff / expected
123 })
124 .sum::<f64>();
125 let z = (chisq - 1.0).sqrt();
126 (chisq, 1.0 - 0.5 * erfc(-z / SQRT_2))
127 } else {
128 let mut count = [0usize; 256];
129 for &b in data {
130 count[b as usize] += 1;
131 }
132 let total = data.len();
133 let expected = total as f64 / 256.0;
134 let chisq = count
135 .iter()
136 .map(|&obs| {
137 let diff = obs as f64 - expected;
138 diff * diff / expected
139 })
140 .sum::<f64>();
141 let z = (chisq - 255.0).sqrt();
142 (chisq, 1.0 - 0.5 * erfc(-z / SQRT_2))
143 }
144}
145
/// Arithmetic mean of the byte values in `data`.
///
/// Empty input divides zero by zero and therefore yields NaN.
fn calculate_mean(data: &[u8]) -> f64 {
    let byte_sum: f64 = data.iter().copied().map(f64::from).sum();
    let byte_count = data.len() as f64;
    byte_sum / byte_count
}
149
/// Monte Carlo estimate of pi from `data`.
///
/// Every complete 6-byte group is read as a point `(x, y)` with two 24-bit
/// big-endian coordinates. The share of points whose squared distance from the
/// origin is below `2^48` (a quarter circle of radius `2^24`) is multiplied by
/// four. Trailing bytes that do not fill a group are ignored; fewer than six
/// bytes yields `0.0`.
fn estimate_pi(data: &[u8]) -> f64 {
    const POINT_BYTES: usize = 6;
    const RADIUS_SQ: u64 = 1 << 48;

    /// Assembles big-endian bytes into an unsigned integer.
    fn coordinate(bytes: &[u8]) -> u64 {
        bytes
            .iter()
            .fold(0, |acc, &byte| (acc << 8) | u64::from(byte))
    }

    // Counts are `usize` (bounded by `data.len()`), so they cannot overflow
    // the way an inferred `i32` counter would on multi-gigabyte inputs.
    let total = data.len() / POINT_BYTES;
    if total == 0 {
        return 0.0;
    }

    let hits = data
        .chunks_exact(POINT_BYTES)
        .filter(|point| {
            let x = coordinate(&point[..3]);
            let y = coordinate(&point[3..]);
            // Each coordinate is below 2^24, so the sum stays below 2^49.
            x * x + y * y < RADIUS_SQ
        })
        .count();

    4.0 * hits as f64 / total as f64
}
171
/// Correlation coefficient between each byte and the byte that follows it.
///
/// The pairs are `(data[i - 1], data[i])` for every `i >= 1`. Returns the
/// sentinel `-99999.0` when there are fewer than two bytes or when the
/// denominator is zero (for example, constant input).
fn serial_correlation(data: &[u8]) -> f64 {
    const UNDEFINED: f64 = -99999.0;

    if data.len() < 2 {
        return UNDEFINED;
    }

    // Running sums over all (previous, next) pairs.
    let (mut prev_sum, mut next_sum) = (0f64, 0f64);
    let (mut prev_sq_sum, mut next_sq_sum) = (0f64, 0f64);
    let mut cross_sum = 0f64;

    for (&prev, &next) in data.iter().zip(&data[1..]) {
        let (prev, next) = (f64::from(prev), f64::from(next));
        prev_sum += prev;
        next_sum += next;
        cross_sum += prev * next;
        prev_sq_sum += prev * prev;
        next_sq_sum += next * next;
    }

    let pair_count = (data.len() - 1) as f64;
    let covariance = pair_count * cross_sum - prev_sum * next_sum;
    let prev_spread = pair_count * prev_sq_sum - prev_sum.powi(2);
    let next_spread = pair_count * next_sq_sum - next_sum.powi(2);
    let scale = (prev_spread * next_spread).sqrt();

    if scale != 0.0 {
        covariance / scale
    } else {
        UNDEFINED
    }
}
203
/// Frequency table over all 256 byte values.
///
/// Returns one `(byte value, count, fraction of data.len())` entry per value,
/// ordered from 0 to 255. Values that never occur are still listed with a
/// count of zero.
fn byte_occurrences(data: &[u8]) -> Vec<(u8, usize, f64)> {
    let mut histogram = [0usize; 256];
    data.iter()
        .for_each(|&byte| histogram[usize::from(byte)] += 1);

    let sample_size = data.len() as f64;
    (0u8..=255)
        .zip(histogram.iter())
        .map(|(value, &hits)| (value, hits, hits as f64 / sample_size))
        .collect()
}
214
/// Counts cleared and set bits across `data`.
///
/// Returns `[(zeros, zeros / total_bits), (ones, ones / total_bits)]`, where
/// `total_bits` is `8 * data.len()`.
fn bit_occurrences(data: &[u8]) -> [(usize, f64); 2] {
    let bit_total = data.len() * 8;
    let ones: usize = data.iter().map(|byte| byte.count_ones() as usize).sum();
    let zeros = bit_total - ones;

    let share = |hits: usize| hits as f64 / bit_total as f64;
    [(zeros, share(zeros)), (ones, share(ones))]
}
228
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds `len` bytes by cycling through the values 0 through 255 in order.
    fn cycling_bytes(len: usize) -> Vec<u8> {
        (0..=255).cycle().take(len).collect()
    }

    /// A buffer holding one repeated byte value has low byte-level entropy.
    #[test]
    fn test_entropy_on_uniform_data() {
        let constant = [0xAAu8; 4096];
        let stats = EntStats::from_data(&constant, false);

        assert!(stats.entropy < 1.0, "Expected low entropy for uniform data");
        assert!(stats.compression_percent > 85.0);
    }

    /// A buffer containing every byte value equally often has high entropy.
    #[test]
    fn test_entropy_on_random_data() {
        let stats = EntStats::from_data(&cycling_bytes(4096), false);

        assert!(
            stats.entropy > 7.5,
            "Expected high entropy for diverse data"
        );
        assert!(stats.compression_percent < 10.0);
    }

    /// Heavily zero-biased input gives a positive statistic and a valid p-value.
    #[test]
    fn test_chisquare_and_pvalue_validity() {
        // 3072 zero bytes followed by 1024 cycling bytes.
        let biased: Vec<u8> = std::iter::repeat(0u8)
            .take(3072)
            .chain(cycling_bytes(1024))
            .collect();
        let stats = EntStats::from_data(&biased, false);

        assert!(
            stats.chisquare > 0.0,
            "Chi-square should be > 0 for biased input"
        );
        assert!(
            (0.0..=1.0).contains(&stats.p_value),
            "p-value should be in [0, 1]"
        );
    }

    /// The mean of the two extreme byte values sits at the midpoint.
    #[test]
    fn test_mean_value_byte_mode() {
        let stats = EntStats::from_data(&[0x00, 0xFF], false);
        let deviation = (stats.mean - 127.5).abs();

        assert!(deviation < 1.0, "Mean should be close to 127.5");
    }

    /// The pi estimate for cycling input stays within loose bounds.
    #[test]
    fn test_pi_estimation_sanity() {
        let stats = EntStats::from_data(&cycling_bytes(8192), false);

        assert!(
            (2.5..=3.8).contains(&stats.pi_estimate),
            "Pi estimate should be within realistic bounds"
        );
    }

    /// Constant input has no variance, so the sentinel value is reported.
    #[test]
    fn test_serial_correlation_constant() {
        let constant = [0x33u8; 2048];
        let stats = EntStats::from_data(&constant, false);

        assert_eq!(
            stats.serial_correlation, -99999.0,
            "All values are equal, correlation should be undefined"
        );
    }

    /// Alternating bits give one bit of entropy and an even zero/one split.
    #[test]
    fn test_bit_mode_entropy_and_frequencies() {
        let alternating = [0b10101010u8; 1024];
        let stats = EntStats::from_data(&alternating, true);

        assert!(
            (stats.entropy - 1.0).abs() < 0.01,
            "Expected full bit entropy"
        );
        assert!(stats.bit_frequencies.is_some());
        let freqs = stats.bit_frequencies.unwrap();
        for &(_, share) in freqs.iter() {
            assert!((share - 0.5).abs() < 0.01);
        }
    }

    /// Byte mode always reports a frequency entry for each of the 256 values.
    #[test]
    fn test_byte_frequency_distribution_length() {
        let stats = EntStats::from_data(&cycling_bytes(4096), false);
        let table = stats.byte_frequencies.as_ref().unwrap();

        assert_eq!(table.len(), 256);
    }
}