lossless_transform_utils/entropy/mod.rs
//! Entropy calculation module for histograms.
//!
//! This module provides functions to calculate the entropy (average code length) of data
//! represented by histograms. The implementations are tuned for accuracy rather than raw
//! performance, because the histogram only has 256 elements.
//!
//! # Examples
//!
//! Basic usage:
//!
//! ```
//! use lossless_transform_utils::histogram::Histogram32;
//! use lossless_transform_utils::entropy::code_length_of_histogram32;
//!
//! let mut histogram = Histogram32::default();
//! histogram.inner.counter[0] = 3;
//! histogram.inner.counter[1] = 2;
//! histogram.inner.counter[2] = 1;
//!
//! let entropy = code_length_of_histogram32(&histogram, 6);
//! println!("Entropy: {}", entropy);
//! ```
//!
//! # Performance Note
//!
//! The implementation in this module favours accuracy over performance, so it uses proper
//! [f64] arithmetic with `log2`, which is normally slow.
//!
//! However, because the input histograms only have 256 elements, trading a little
//! performance for accuracy is considered worthwhile here.
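//!
//! # Formula
//!
//! The functions in this module compute the standard Shannon entropy, matching the
//! pseudocode kept in the source below:
//!
//! ```text
//! H = -Σ p[i] * log2(p[i]),    where p[i] = counter[i] / total
//! ```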

use crate::histogram::Histogram32;

/// Calculates the Shannon entropy of a [Histogram32]'s counter array using floating point
/// arithmetic. The entropy is the average number of bits needed to represent each symbol.
///
/// This lets us estimate how compressible the data is during 'entropy coding' steps.
///
/// # Arguments
///
/// * `counter` - The symbol counts of a [Histogram32]
/// * `total` - The total count of all symbols
///
/// # Returns
///
/// The Shannon entropy in bits, i.e. the average number of bits needed to represent each symbol.
///
/// # Example
///
/// ```
/// use lossless_transform_utils::histogram::Histogram32;
/// use lossless_transform_utils::entropy::shannon_entropy_of_histogram32;
///
/// let mut histogram = Histogram32::default();
/// histogram.inner.counter[0] = 3;
/// histogram.inner.counter[1] = 2;
/// histogram.inner.counter[2] = 1;
///
/// let entropy = shannon_entropy_of_histogram32(&histogram.counter, 6);
/// println!("Entropy: {}", entropy);
/// ```
///
/// # Notes
///
/// - This implementation prioritizes accuracy over performance for small histograms (256 elements).
/// - For high-throughput scenarios, consider using more optimized methods if performance is critical.
pub fn shannon_entropy_of_histogram32(counter: &[u32; 256], total: u64) -> f64 {
    // Pseudocode for Shannon Entropy 'the proper way':
    //
    // double entropy = 0.0;
    // for (int i = 0; i < MAX_VALUE; i++) {
    //     if (frequencies[i] > 0) {
    //         double probability = (double)frequencies[i] / length;
    //         entropy -= probability * log2(probability);
    //     }
    // }

    let total = total as f64;

    // The fast path evaluates `p * p.log2()` for every bucket unconditionally.
    // For a zero count that is `0.0 * -inf == NaN` in IEEE 754 arithmetic, so
    // the fast path is only valid when every counter is non-zero.
    if counter.iter().all(|&x| x > 0) {
        shannon_entropy_of_histogram32_fast(counter, total)
    } else {
        shannon_entropy_of_histogram32_slow(counter, total)
    }
}

#[inline(always)]
fn shannon_entropy_of_histogram32_fast(counter: &[u32; 256], total: f64) -> f64 {
    // Four independent accumulators break the dependency chain between loop
    // iterations, letting the CPU overlap the long-latency divisions and `log2` calls.
    let mut entropy0 = 0.0;
    let mut entropy1 = 0.0;
    let mut entropy2 = 0.0;
    let mut entropy3 = 0.0;

    // 256 is divisible by 4, so every chunk has exactly 4 elements.
    for chunk in counter.chunks(4) {
        let p0 = chunk[0] as f64 / total;
        let p1 = chunk[1] as f64 / total;
        let p2 = chunk[2] as f64 / total;
        let p3 = chunk[3] as f64 / total;

        entropy0 -= p0 * p0.log2();
        entropy1 -= p1 * p1.log2();
        entropy2 -= p2 * p2.log2();
        entropy3 -= p3 * p3.log2();
    }

    entropy0 + entropy1 + entropy2 + entropy3
}

#[inline(always)]
fn shannon_entropy_of_histogram32_slow(counter: &[u32; 256], total: f64) -> f64 {
    let mut entropy = 0.0;
    for count in counter {
        // Zero counts contribute nothing; skipping them avoids `0.0 * -inf == NaN`.
        if *count == 0 {
            continue;
        }
        let probability = *count as f64 / total;
        let entropy_value = probability * probability.log2();
        entropy -= entropy_value;
    }
    entropy
}

/// Calculates the ideal code length in bits for a given histogram.
/// This lets us estimate how compressible the data is during 'entropy coding' steps.
///
/// Unlike [`code_length_of_histogram32`], this variant derives the total symbol count by
/// summing the histogram itself. See [`shannon_entropy_of_histogram32`] for more details;
/// this is just a wrapper around that function.
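///
/// # Example
///
/// A minimal sketch in the style of the module-level example above:
///
/// ```
/// use lossless_transform_utils::histogram::Histogram32;
/// use lossless_transform_utils::entropy::code_length_of_histogram32_no_size;
///
/// let mut histogram = Histogram32::default();
/// histogram.inner.counter[0] = 1;
/// histogram.inner.counter[1] = 1;
/// histogram.inner.counter[2] = 1;
/// histogram.inner.counter[3] = 1;
///
/// // Four equally likely symbols need log2(4) == 2 bits each; the total (4) is
/// // derived from the histogram itself.
/// let bits = code_length_of_histogram32_no_size(&histogram);
/// assert!((bits - 2.0).abs() < 1e-10);
/// ```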
pub fn code_length_of_histogram32_no_size(histogram: &Histogram32) -> f64 {
    let total: u64 = histogram.counter.iter().map(|&x| x as u64).sum();
    code_length_of_histogram32(histogram, total)
}

/// Calculates the ideal code length in bits for a given histogram.
/// This lets us estimate how compressible the data is during 'entropy coding' steps.
///
/// `total` is the total number of symbols counted, i.e. the length of the source data.
/// See [`shannon_entropy_of_histogram32`] for more details; this is just a wrapper around
/// that function.
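///
/// # Example
///
/// A minimal sketch in the style of the module-level example above:
///
/// ```
/// use lossless_transform_utils::histogram::Histogram32;
/// use lossless_transform_utils::entropy::code_length_of_histogram32;
///
/// let mut histogram = Histogram32::default();
/// histogram.inner.counter[0] = 2;
/// histogram.inner.counter[1] = 2;
///
/// // A 50/50 split over two symbols costs exactly 1 bit per symbol.
/// let bits = code_length_of_histogram32(&histogram, 4);
/// assert!((bits - 1.0).abs() < 1e-10);
/// ```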
pub fn code_length_of_histogram32(histogram: &Histogram32, total: u64) -> f64 {
    shannon_entropy_of_histogram32(&histogram.counter, total)
}

#[cfg(test)]
mod tests {
    use std::vec::Vec;

    use super::*;
    use crate::histogram::Histogram32;

    #[test]
    fn with_uniform_distribution() {
        // Create a sequence with equal counts of 0,1,2,3: [0,1,2,3]
        let hist = Histogram32::from_bytes(&[0, 1, 2, 3]);
        let total = 4;

        // For a uniform distribution of 4 different values, entropy should be 2.0 bits,
        // because log2(4) = 2 bits are needed to encode 4 equally likely possibilities.
        assert!((code_length_of_histogram32(&hist, total) - 2.0).abs() < 1e-10);
    }

    #[test]
    fn with_single_value() {
        let hist = Histogram32::from_bytes(&[1, 1, 1, 1]);
        let total = 4;

        // For a single value distribution, entropy should be 0 bits
        assert!(code_length_of_histogram32(&hist, total).abs() < 1e-10);
    }

    #[test]
    fn with_binary_distribution() {
        let hist = Histogram32::from_bytes(&[0, 0, 1, 1]);
        let total = 4;

        // For a 50-50 binary distribution, entropy should be 1.0 bit
        assert!((code_length_of_histogram32(&hist, total) - 1.0).abs() < 1e-10);
    }

    #[test]
    fn with_skewed_distribution() {
        let hist = Histogram32::from_bytes(&[0, 0, 0, 1]);
        let total = 4;

        // For p=[0.75, 0.25] distribution:
        // -0.75*log2(0.75) - 0.25*log2(0.25) ≈ 0.811278124459
        let expected = 0.811278124459;
        assert!((code_length_of_histogram32(&hist, total) - expected).abs() < 1e-10);
    }

    #[test]
    fn with_empty_histogram() {
        let hist = Histogram32::from_bytes(&[]);
        assert!((code_length_of_histogram32(&hist, 0) - 0.0).abs() < 1e-10);
    }

    #[test]
    fn code_length_no_size_equals_with_size() {
        let hist = Histogram32::from_bytes(&[0, 0, 0, 1]);
        let total = 4;

        assert!(
            (code_length_of_histogram32_no_size(&hist) - code_length_of_histogram32(&hist, total))
                .abs()
                < 1e-10
        );
    }

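    // A sketch of an extra check (not in the original suite): a histogram with many
    // zero counters must dispatch to the slow path and still return a finite, correct
    // result. Only two byte values appear here, each with probability 0.5.
    #[test]
    fn slow_path_handles_zero_counts() {
        let hist = Histogram32::from_bytes(&[7, 7, 42, 42]);
        let entropy = code_length_of_histogram32(&hist, 4);
        assert!(entropy.is_finite());
        assert!((entropy - 1.0).abs() < 1e-10);
    }
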
    #[test]
    fn fast_path_matches_slow_path() {
        // Generate a large array of bytes that cycles through every value 0-255,
        // so each histogram counter ends up non-zero and the fast path is valid.
        let data: Vec<u8> = (0..10_000_u32)
            .map(|x| (x * 33) as u8) // 33 is coprime to 256, so all byte values occur
            .collect();

        let total = data.len() as u64;
        let hist = Histogram32::from_bytes(&data);

        // Test the all-non-zero case
        let fast = shannon_entropy_of_histogram32_fast(&hist.counter, total as f64);
        let slow = shannon_entropy_of_histogram32_slow(&hist.counter, total as f64);

        assert!(
            (fast - slow).abs() < 1e-10,
            "Non-zero case mismatch: fast={fast} slow={slow}"
        );
222 }
223}