lossless_transform_utils/entropy/mod.rs

//! Entropy calculation module for histograms.
//!
//! This module provides functions to calculate the entropy (average code length) of data
//! represented by histograms. The implementations are tuned for accuracy rather than raw
//! performance, because the histogram only has 256 elements.
//!
//! # Examples
//!
//! Basic usage:
//!
//! ```
//! use lossless_transform_utils::histogram::Histogram32;
//! use lossless_transform_utils::entropy::code_length_of_histogram32;
//!
//! let mut histogram = Histogram32::default();
//! histogram.inner.counter[0] = 3;
//! histogram.inner.counter[1] = 2;
//! histogram.inner.counter[2] = 1;
//!
//! let entropy = code_length_of_histogram32(&histogram, 6);
//! println!("Entropy: {}", entropy);
//! ```
//!
//! # Performance Note
//!
//! The implementation in this module favours accuracy over raw performance: it performs
//! proper [f64] arithmetic with `log2`, which is normally slow.
//!
//! However, because the input histograms have only 256 elements, trading some performance
//! for accuracy is considered worthwhile here.

use crate::histogram::Histogram32;

/// Calculates the Shannon entropy of a [Histogram32]'s counts using floating point arithmetic.
/// The entropy is the average number of bits needed to represent each symbol.
///
/// This lets us estimate how compressible the data is during 'entropy coding' steps.
///
/// # Arguments
///
/// * `counter` - The histogram's 256 symbol counts (e.g. from a [Histogram32])
/// * `total` - The total count of all symbols
///
/// # Returns
///
/// The Shannon entropy in bits, i.e. the average number of bits needed to represent each symbol.
///
/// # Example
///
/// ```
/// use lossless_transform_utils::histogram::Histogram32;
/// use lossless_transform_utils::entropy::shannon_entropy_of_histogram32;
///
/// let mut histogram = Histogram32::default();
/// histogram.inner.counter[0] = 3;
/// histogram.inner.counter[1] = 2;
/// histogram.inner.counter[2] = 1;
///
/// let entropy = shannon_entropy_of_histogram32(&histogram.counter, 6);
/// println!("Entropy: {}", entropy);
/// ```
///
/// # Notes
///
/// - This implementation prioritizes accuracy over performance, which is acceptable for
///   small histograms (256 elements).
/// - For high-throughput scenarios, consider a more optimized method if this becomes a bottleneck.
pub fn shannon_entropy_of_histogram32(counter: &[u32; 256], total: u64) -> f64 {
    // Pseudocode for Shannon entropy 'the proper way':
    //
    // double entropy = 0.0;
    // for (int i = 0; i < MAX_VALUE; i++) {
    //     if (frequencies[i] > 0) {
    //         double probability = (double)frequencies[i] / length;
    //         entropy -= probability * log2(probability);
    //     }
    // }

    let total = total as f64;
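    // The unrolled fast path evaluates p * log2(p) for every bin unconditionally,
    // which turns an empty bin into 0.0 * -inf = NaN, so it is only safe when
    // every count is nonzero. Otherwise, take the branching slow path.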
    if counter.iter().all(|&x| x > 0) {
        shannon_entropy_of_histogram32_fast(counter, total)
    } else {
        shannon_entropy_of_histogram32_slow(counter, total)
    }
}

#[inline(always)]
fn shannon_entropy_of_histogram32_fast(counter: &[u32; 256], total: f64) -> f64 {
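    // Four independent accumulators break the dependency chain between
    // successive floating-point adds, letting the CPU overlap iterations.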
    let mut entropy0 = 0.0;
    let mut entropy1 = 0.0;
    let mut entropy2 = 0.0;
    let mut entropy3 = 0.0;

    // 256 is divisible by 4, so chunks_exact never leaves a remainder.
    for chunk in counter.chunks_exact(4) {
        let p0 = chunk[0] as f64 / total;
        let p1 = chunk[1] as f64 / total;
        let p2 = chunk[2] as f64 / total;
        let p3 = chunk[3] as f64 / total;

        entropy0 -= p0 * p0.log2();
        entropy1 -= p1 * p1.log2();
        entropy2 -= p2 * p2.log2();
        entropy3 -= p3 * p3.log2();
    }

    entropy0 + entropy1 + entropy2 + entropy3
}

#[inline(always)]
fn shannon_entropy_of_histogram32_slow(counter: &[u32; 256], total: f64) -> f64 {
    let mut entropy = 0.0;
    for count in counter {
        // Skip empty bins: they contribute nothing, and 0 * log2(0) would be NaN.
        if *count == 0 {
            continue;
        }
        let probability = *count as f64 / total;
        entropy -= probability * probability.log2();
    }
    entropy
}

/// Calculates the ideal code length in bits for a given histogram,
/// deriving the total symbol count from the histogram itself.
/// This lets us estimate how compressible the data is during 'entropy coding' steps.
///
/// See [`shannon_entropy_of_histogram32`] for more details; this is just a wrapper around that function.
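///
/// # Example
///
/// For a uniform distribution over four distinct byte values, the ideal code length
/// is 2 bits per symbol:
///
/// ```
/// use lossless_transform_utils::histogram::Histogram32;
/// use lossless_transform_utils::entropy::code_length_of_histogram32_no_size;
///
/// let histogram = Histogram32::from_bytes(&[0, 1, 2, 3]);
/// let bits = code_length_of_histogram32_no_size(&histogram);
/// assert!((bits - 2.0).abs() < 1e-10);
/// ```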
pub fn code_length_of_histogram32_no_size(histogram: &Histogram32) -> f64 {
    let total: u64 = histogram.counter.iter().map(|&x| x as u64).sum();
    code_length_of_histogram32(histogram, total)
}

/// Calculates the ideal code length in bits for a given histogram.
/// This lets us estimate how compressible the data is during 'entropy coding' steps.
///
/// See [`shannon_entropy_of_histogram32`] for more details; this is just a wrapper around that function.
pub fn code_length_of_histogram32(histogram: &Histogram32, total: u64) -> f64 {
    shannon_entropy_of_histogram32(&histogram.counter, total)
}

#[cfg(test)]
mod tests {
    use std::vec::Vec;

    use super::*;
    use crate::histogram::Histogram32;

    #[test]
    fn with_uniform_distribution() {
        // Create a sequence with equal counts of 0,1,2,3: [0,1,2,3]
        let hist = Histogram32::from_bytes(&[0, 1, 2, 3]);
        let total = 4;

        // For a uniform distribution of 4 different values, entropy should be 2.0 bits,
        // because log2(4) = 2 bits are needed to encode 4 equally likely possibilities.
        assert!((code_length_of_histogram32(&hist, total) - 2.0).abs() < 1e-10);
    }

    #[test]
    fn with_single_value() {
        let hist = Histogram32::from_bytes(&[1, 1, 1, 1]);
        let total = 4;

        // For a single-value distribution, entropy should be 0 bits.
        assert!(code_length_of_histogram32(&hist, total).abs() < 1e-10);
    }

    #[test]
    fn with_binary_distribution() {
        let hist = Histogram32::from_bytes(&[0, 0, 1, 1]);
        let total = 4;

        // For a 50-50 binary distribution, entropy should be 1.0 bit.
        assert!((code_length_of_histogram32(&hist, total) - 1.0).abs() < 1e-10);
    }

    #[test]
    fn with_skewed_distribution() {
        let hist = Histogram32::from_bytes(&[0, 0, 0, 1]);
        let total = 4;

        // For p = [0.75, 0.25]:
        // -0.75*log2(0.75) - 0.25*log2(0.25) ≈ 0.811278124459
        let expected = 0.811278124459;
        assert!((code_length_of_histogram32(&hist, total) - expected).abs() < 1e-10);
    }

    #[test]
    fn with_empty_histogram() {
        let hist = Histogram32::from_bytes(&[]);
        assert!(code_length_of_histogram32(&hist, 0).abs() < 1e-10);
    }

    #[test]
    fn code_length_no_size_equals_with_size() {
        let hist = Histogram32::from_bytes(&[0, 0, 0, 1]);
        let total = 4;

        assert!(
            (code_length_of_histogram32_no_size(&hist) - code_length_of_histogram32(&hist, total))
                .abs()
                < 1e-10
        );
    }

    #[test]
    fn fast_path_matches_slow_path() {
        // Generate data covering all 256 byte values: 33 is coprime with 256, so
        // (x * 33) as u8 cycles through every byte value. This guarantees every
        // histogram bin is nonzero, which the fast path requires.
        let data: Vec<u8> = (0..10_000_u32).map(|x| (x * 33) as u8).collect();

        let total = data.len() as u64;
        let hist = Histogram32::from_bytes(&data);

        let fast = shannon_entropy_of_histogram32_fast(&hist.counter, total as f64);
        let slow = shannon_entropy_of_histogram32_slow(&hist.counter, total as f64);

        assert!(
            (fast - slow).abs() < 1e-10,
            "Non-zero case mismatch: fast={fast} slow={slow}"
        );
    }
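
    #[test]
    fn handles_zero_counts_without_nan() {
        // Histograms with empty bins must route through the slow path; the
        // unrolled fast path would produce 0.0 * -inf = NaN for those bins.
        // A 50-50 split over two symbols should still give exactly 1 bit.
        let hist = Histogram32::from_bytes(&[0, 0, 1, 1]);
        let entropy = shannon_entropy_of_histogram32(&hist.counter, 4);
        assert!(entropy.is_finite());
        assert!((entropy - 1.0).abs() < 1e-10);
    }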
}