ent_rs/
lib.rs

1//! # ent-rs
2//!
3//! `ent-rs` is a library for analyzing the entropy and randomness of binary data.
4//! It provides byte/bit entropy, chi-square testing, mean, Pi estimation, and serial correlation.
5//!
6//! ```rust
7//! use ent_rs::EntStats;
8//! let data = b"example data";
9//! let stats = EntStats::from_data(data, false);
10//! println!("Entropy: {}", stats.entropy);
11//! ```
12
13use statrs::function::erf::erfc;
14use std::f64::consts::SQRT_2;
15
/// Result of statistical analysis on binary data.
///
/// Produced by [`EntStats::from_data`]. Exactly one of `byte_frequencies` and
/// `bit_frequencies` is populated, depending on the mode the analysis ran in.
///
/// `PartialEq` is derived so results can be compared directly; note that any
/// field holding `NaN` makes two values compare unequal.
#[derive(Debug, Clone, PartialEq)]
pub struct EntStats {
    /// Shannon entropy in bits per byte (byte mode) or bits per bit (bit mode).
    pub entropy: f64,
    /// Ideal compression percentage based on entropy: how far the entropy
    /// falls short of the maximum for the mode (8 bits per byte, 1 bit per bit).
    pub compression_percent: f64,
    /// Chi-square test value against a uniform distribution of bytes or bits.
    pub chisquare: f64,
    /// p-value reported for the chi-square test.
    pub p_value: f64,
    /// Arithmetic mean of all data bytes.
    pub mean: f64,
    /// Estimated value of Pi from Monte Carlo method; `0.0` when the input is
    /// shorter than the six bytes needed for a single sample point.
    pub pi_estimate: f64,
    /// Serial correlation coefficient between adjacent values; the sentinel
    /// `-99999.0` is stored when the coefficient is undefined.
    pub serial_correlation: f64,
    /// Byte frequency table: (value, count, fraction). `Some` only in byte mode.
    pub byte_frequencies: Option<Vec<(u8, usize, f64)>>,
    /// Bit frequency table: [(count, fraction) for 0, 1]. `Some` only in bit mode.
    pub bit_frequencies: Option<[(usize, f64); 2]>,
}
38
39impl EntStats {
40    /// Compute entropy statistics from byte slice, using bit mode or byte mode.
41    pub fn from_data(data: &[u8], bit_mode: bool) -> Self {
42        let entropy = calculate_entropy(data, bit_mode);
43        let compression_percent = if bit_mode {
44            100.0 * (1.0 - entropy)
45        } else {
46            100.0 * (1.0 - entropy / 8.0)
47        };
48        let (chisquare, p_value) = calculate_chisquare(data, bit_mode);
49        let mean = calculate_mean(data);
50        let pi_estimate = estimate_pi(data);
51        let serial_correlation = serial_correlation(data);
52
53        let (byte_frequencies, bit_frequencies) = if bit_mode {
54            (None, Some(bit_occurrences(data)))
55        } else {
56            (Some(byte_occurrences(data)), None)
57        };
58
59        EntStats {
60            entropy,
61            compression_percent,
62            chisquare,
63            p_value,
64            mean,
65            pi_estimate,
66            serial_correlation,
67            byte_frequencies,
68            bit_frequencies,
69        }
70    }
71}
72
73// Internal computation functions
74
/// Shannon entropy of `data`, in bits per symbol.
///
/// In bit mode the symbols are the individual bits of every byte (two
/// possible values); otherwise they are the bytes themselves (256 possible
/// values). Symbols that never occur contribute nothing to the sum, so empty
/// input yields zero.
fn calculate_entropy(data: &[u8], bit_mode: bool) -> f64 {
    let probabilities: Vec<f64> = if bit_mode {
        // Counting set bits per byte gives the same tally as testing each
        // bit position individually.
        let ones: usize = data.iter().map(|byte| byte.count_ones() as usize).sum();
        let bits = 8.0 * data.len() as f64;
        let ones = ones as f64;
        let zeros = bits - ones;
        vec![zeros / bits, ones / bits]
    } else {
        let mut histogram = [0usize; 256];
        for &byte in data {
            histogram[usize::from(byte)] += 1;
        }
        let bytes = data.len() as f64;
        histogram.iter().map(|&count| count as f64 / bytes).collect()
    };

    // The `p > 0.0` filter also discards the NaN probabilities that arise
    // from dividing by zero on empty input.
    probabilities
        .iter()
        .filter(|&&p| p > 0.0)
        .map(|&p| -p * p.log2())
        .sum()
}
107
108fn calculate_chisquare(data: &[u8], bit_mode: bool) -> (f64, f64) {
109    if bit_mode {
110        let mut count = [0usize; 2];
111        for &b in data {
112            for i in 0..8 {
113                count[(b >> i) as usize & 1] += 1;
114            }
115        }
116        let total = data.len() * 8;
117        let expected = total as f64 / 2.0;
118        let chisq = count
119            .iter()
120            .map(|&obs| {
121                let diff = obs as f64 - expected;
122                diff * diff / expected
123            })
124            .sum::<f64>();
125        let z = (chisq - 1.0).sqrt();
126        (chisq, 1.0 - 0.5 * erfc(-z / SQRT_2))
127    } else {
128        let mut count = [0usize; 256];
129        for &b in data {
130            count[b as usize] += 1;
131        }
132        let total = data.len();
133        let expected = total as f64 / 256.0;
134        let chisq = count
135            .iter()
136            .map(|&obs| {
137                let diff = obs as f64 - expected;
138                diff * diff / expected
139            })
140            .sum::<f64>();
141        let z = (chisq - 255.0).sqrt();
142        (chisq, 1.0 - 0.5 * erfc(-z / SQRT_2))
143    }
144}
145
/// Arithmetic mean of the byte values in `data`.
///
/// Empty input divides zero by zero and therefore yields `NaN`.
fn calculate_mean(data: &[u8]) -> f64 {
    let total = data
        .iter()
        .fold(0.0_f64, |running, &byte| running + f64::from(byte));
    let count = data.len() as f64;
    total / count
}
149
/// Monte Carlo estimate of Pi from `data`.
///
/// Every complete group of six bytes is read as a point `(x, y)` with two
/// 24-bit big-endian coordinates. The fraction of points whose squared
/// distance from the origin is below `2^48` (a quarter circle of radius
/// `2^24`), multiplied by four, is the estimate. Trailing bytes that do not
/// fill a group are ignored; with no complete group the result is `0.0`.
fn estimate_pi(data: &[u8]) -> f64 {
    const BYTES_PER_POINT: usize = 6;
    const RADIUS_SQUARED: u64 = 1 << 48;

    // Big-endian assembly of a 24-bit coordinate from three bytes.
    let coordinate =
        |bytes: &[u8]| bytes.iter().fold(0u64, |acc, &byte| (acc << 8) | u64::from(byte));

    let points = data.len() / BYTES_PER_POINT;
    if points == 0 {
        return 0.0;
    }

    // Counted in `usize`, which cannot overflow for any slice length; untyped
    // integer counters would default to `i32` and overflow on inputs of
    // roughly 12.9 GB and above.
    let hits = data
        .chunks_exact(BYTES_PER_POINT)
        .filter(|chunk| {
            let (x_bytes, y_bytes) = chunk.split_at(BYTES_PER_POINT / 2);
            let x = coordinate(x_bytes);
            let y = coordinate(y_bytes);
            // Each square is below 2^48, so the sum fits comfortably in u64.
            x * x + y * y < RADIUS_SQUARED
        })
        .count();

    4.0 * hits as f64 / points as f64
}
171
/// Correlation coefficient between each byte and the byte that follows it.
///
/// Returns the sentinel `-99999.0` when the coefficient is undefined: fewer
/// than two bytes, or a zero denominator (for example constant input).
fn serial_correlation(data: &[u8]) -> f64 {
    const UNDEFINED: f64 = -99999.0;

    if data.len() < 2 {
        return UNDEFINED;
    }

    // Running sums over every adjacent (previous, next) pair.
    let mut prev_sum = 0f64;
    let mut next_sum = 0f64;
    let mut product_sum = 0f64;
    let mut prev_sq_sum = 0f64;
    let mut next_sq_sum = 0f64;

    for pair in data.windows(2) {
        let prev = f64::from(pair[0]);
        let next = f64::from(pair[1]);
        prev_sum += prev;
        next_sum += next;
        product_sum += prev * next;
        prev_sq_sum += prev * prev;
        next_sq_sum += next * next;
    }

    let pairs = (data.len() - 1) as f64;
    let numerator = pairs * product_sum - prev_sum * next_sum;
    let prev_spread = pairs * prev_sq_sum - prev_sum.powi(2);
    let next_spread = pairs * next_sq_sum - next_sum.powi(2);
    let denominator = (prev_spread * next_spread).sqrt();

    if denominator == 0.0 {
        return UNDEFINED;
    }
    numerator / denominator
}
203
/// Frequency table over all 256 byte values.
///
/// The result always has 256 entries in ascending value order, each holding
/// `(value, occurrences, occurrences / data.len())`. For empty input every
/// fraction is `NaN`.
fn byte_occurrences(data: &[u8]) -> Vec<(u8, usize, f64)> {
    let mut histogram = [0usize; 256];
    data.iter()
        .for_each(|&byte| histogram[usize::from(byte)] += 1);

    let len = data.len() as f64;
    (0u8..=255)
        .zip(histogram.iter())
        .map(|(value, &seen)| (value, seen, seen as f64 / len))
        .collect()
}
214
/// Frequency table over the two bit values.
///
/// Index 0 describes cleared bits and index 1 set bits, each as
/// `(occurrences, occurrences / total_bits)`. For empty input both fractions
/// are `NaN`.
fn bit_occurrences(data: &[u8]) -> [(usize, f64); 2] {
    let bit_total = data.len() * 8;
    // Population count per byte is equivalent to testing all eight positions.
    let ones: usize = data.iter().map(|byte| byte.count_ones() as usize).sum();
    let zeros = bit_total - ones;

    let denominator = bit_total as f64;
    [
        (zeros, zeros as f64 / denominator),
        (ones, ones as f64 / denominator),
    ]
}
228
#[cfg(test)]
mod tests {
    use super::*;

    /// Produces `len` bytes that repeatedly walk through every value 0..=255,
    /// giving a perfectly even byte distribution when `len` is a multiple of 256.
    fn cycling_bytes(len: usize) -> Vec<u8> {
        (0..=255).cycle().take(len).collect()
    }

    #[test]
    fn test_entropy_on_uniform_data() {
        // A single repeated byte value carries no information.
        let constant = [0xAAu8; 4096];
        let stats = EntStats::from_data(&constant, false);
        assert!(stats.entropy < 1.0, "Expected low entropy for uniform data");
        assert!(stats.compression_percent > 85.0);
    }

    #[test]
    fn test_entropy_on_random_data() {
        // Every byte value occurs equally often.
        let stats = EntStats::from_data(&cycling_bytes(4096), false);
        assert!(
            stats.entropy > 7.5,
            "Expected high entropy for diverse data"
        );
        assert!(stats.compression_percent < 10.0);
    }

    #[test]
    fn test_chisquare_and_pvalue_validity() {
        // Three quarters zeros followed by one quarter evenly spread values.
        let mut biased = vec![0u8; 3072];
        biased.extend(cycling_bytes(1024));
        let stats = EntStats::from_data(&biased, false);
        assert!(
            stats.chisquare > 0.0,
            "Chi-square should be > 0 for biased input"
        );
        assert!(
            (0.0..=1.0).contains(&stats.p_value),
            "p-value should be in [0, 1]"
        );
    }

    #[test]
    fn test_mean_value_byte_mode() {
        let extremes = [0x00u8, 0xFF];
        let stats = EntStats::from_data(&extremes, false);
        let deviation = (stats.mean - 127.5).abs();
        assert!(deviation < 1.0, "Mean should be close to 127.5");
    }

    #[test]
    fn test_pi_estimation_sanity() {
        // Deterministic cycling input, not genuinely random.
        let stats = EntStats::from_data(&cycling_bytes(8192), false);
        assert!(
            (2.5..=3.8).contains(&stats.pi_estimate),
            "Pi estimate should be within realistic bounds"
        );
    }

    #[test]
    fn test_serial_correlation_constant() {
        let constant = [0x33u8; 2048];
        let stats = EntStats::from_data(&constant, false);
        assert_eq!(
            stats.serial_correlation, -99999.0,
            "All values are equal, correlation should be undefined"
        );
    }

    #[test]
    fn test_bit_mode_entropy_and_frequencies() {
        // 0b10101010 has four set and four cleared bits.
        let balanced = [0b10101010u8; 1024];
        let stats = EntStats::from_data(&balanced, true);
        assert!(
            (stats.entropy - 1.0).abs() < 0.01,
            "Expected full bit entropy"
        );
        assert!(stats.bit_frequencies.is_some());
        let [zeros, ones] = stats.bit_frequencies.unwrap();
        assert!((zeros.1 - 0.5).abs() < 0.01);
        assert!((ones.1 - 0.5).abs() < 0.01);
    }

    #[test]
    fn test_byte_frequency_distribution_length() {
        let stats = EntStats::from_data(&cycling_bytes(4096), false);
        let table = stats.byte_frequencies.as_ref().unwrap();
        assert_eq!(table.len(), 256);
    }
}