//! entropy/lib.rs — Shannon and metric entropy computed from a file's
//! byte-frequency table.

1use std::io::Read;
2use std::fs::File;
3
/// Contains metadata about the file that's being used in the Entropy calculation.
///
/// `byte_count` is a lookup table that contains the number of occurrences of
/// a byte specified by the index, e.g. the count for 0x00 is `byte_count[0]`.
///
/// `length` is the total number of bytes counted (the denominator used when
/// turning counts into probabilities).
#[derive(Debug, Clone)]
pub struct Entropy {
    pub byte_count: [u64; 256],
    pub length: u64,
}
14
15impl Entropy {
16    /// Gets metadata for the Entropy calculation from a File reference
17    pub fn new(file: &File) -> Entropy {
18        let mut byte_count = [0u64; 256];
19        for byte in file.bytes() {
20            byte_count[byte.unwrap() as usize] += 1
21        }
22
23        Entropy {
24            byte_count: byte_count,
25            length: file.metadata().unwrap().len(),
26        }
27    }
28
29    /// Measures the Shannon entropy based on the frequency table and returns
30    /// it as a float.
31    ///
32    /// The equation is defined as: H(X) = - \sum_{i=0}^{n} P(x_i) log_2 P(x_i)
33    /// It can be described as the minimum number of bits (per symbol) to encode
34    /// the input. Thus the output will be between 0 and 8.
35    /// See https://en.wikipedia.org/wiki/Entropy_(information_theory) for
36    /// more information.
37    pub fn shannon_entropy(&self) -> f32 {
38        let mut entropy = 0f32;
39        for &count in self.byte_count.iter() {
40            if count != 0 {
41                let symbol_probability = count as f32 / self.length as f32;
42                entropy += symbol_probability * symbol_probability.log2();
43            }
44        }
45        -entropy
46    }
47
48    /// Measures the metric entropy based on the Shannon entropy of the
49    /// generated frequency table and returns it as a float between 0 and 1.
50    ///
51    /// Metric entropy is derived by dividing the Shannon entropy by the length
52    /// of the string being measured.
53    /// It can be described as the uncertainty or randomness of a string, where
54    /// 1 means information is uniformly distributed across the string.
55    pub fn metric_entropy(&self) -> f32 {
56        self.shannon_entropy() / 8f32
57    }
58}
59
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs::File;
    use std::io::{Seek, SeekFrom, Write};
    use tempfile::tempfile;

    /// Creates a temporary file containing the five-byte fixture
    /// [0x00, 0x00, 0x01, 0x01, 0x02] and rewinds it to the start.
    fn fixture_file() -> File {
        let mut f = tempfile().unwrap();
        // write_all guarantees every byte lands, unlike write(), which may
        // perform a short write.
        f.write_all(&[0x00, 0x00, 0x01, 0x01, 0x02]).unwrap();
        f.seek(SeekFrom::Start(0)).unwrap();
        f
    }

    #[test]
    fn test_new() {
        let test_entropy = Entropy::new(&fixture_file());

        // Test that the frequency table was populated correctly
        assert_eq!(test_entropy.byte_count[0], 2);
        assert_eq!(test_entropy.byte_count[1], 2);
        assert_eq!(test_entropy.byte_count[2], 1);

        // Test the length
        assert_eq!(test_entropy.length, 5);
    }

    #[test]
    fn test_shannon_entropy() {
        let test_entropy = Entropy::new(&fixture_file());

        // H = -(2 * 0.4*log2(0.4) + 0.2*log2(0.2)) as f32
        let shannon_entropy = test_entropy.shannon_entropy();
        assert_eq!(shannon_entropy, 1.5219281);
    }

    #[test]
    fn test_metric_entropy() {
        let test_entropy = Entropy::new(&fixture_file());

        // Shannon entropy normalized by the 8-bit maximum.
        let metric_entropy = test_entropy.metric_entropy();
        assert_eq!(metric_entropy, 0.19024101);
    }
}