Skip to main content

haagenti_simd/
histogram.rs

1//! SIMD-accelerated byte histogram computation.
2//!
3//! Used for entropy estimation in compression analysis.
4
5/// Compute byte histogram (frequency of each byte value 0-255).
6///
7/// Uses SIMD acceleration when available.
8#[inline]
9pub fn byte_histogram(data: &[u8]) -> [u32; 256] {
10    #[cfg(target_arch = "x86_64")]
11    {
12        if is_x86_feature_detected!("avx2") && data.len() >= 64 {
13            return unsafe { byte_histogram_avx2(data) };
14        }
15    }
16
17    byte_histogram_scalar(data)
18}
19
20/// SIMD-accelerated histogram using scatter/gather approach.
21///
22/// This uses multiple histogram arrays to reduce memory conflicts
23/// and processes data in parallel lanes.
24#[inline]
25pub fn byte_histogram_simd(data: &[u8]) -> [u32; 256] {
26    byte_histogram(data)
27}
28
29/// Scalar histogram implementation with loop unrolling.
30#[inline]
31fn byte_histogram_scalar(data: &[u8]) -> [u32; 256] {
32    let mut hist = [0u32; 256];
33
34    // Process 4 bytes at a time to reduce loop overhead
35    let chunks = data.chunks_exact(4);
36    let remainder = chunks.remainder();
37
38    for chunk in chunks {
39        hist[chunk[0] as usize] += 1;
40        hist[chunk[1] as usize] += 1;
41        hist[chunk[2] as usize] += 1;
42        hist[chunk[3] as usize] += 1;
43    }
44
45    for &b in remainder {
46        hist[b as usize] += 1;
47    }
48
49    hist
50}
51
52/// AVX2-accelerated histogram using multiple histogram banks.
53///
54/// Uses 4 parallel histogram arrays to avoid memory conflicts,
55/// then merges them at the end.
56#[cfg(target_arch = "x86_64")]
57#[target_feature(enable = "avx2")]
58unsafe fn byte_histogram_avx2(data: &[u8]) -> [u32; 256] {
59    // Use 4 parallel histograms to reduce conflicts
60    let mut hist0 = [0u32; 256];
61    let mut hist1 = [0u32; 256];
62    let mut hist2 = [0u32; 256];
63    let mut hist3 = [0u32; 256];
64
65    // Process 4 bytes per iteration, spread across 4 histograms
66    let chunks = data.chunks_exact(16);
67    let remainder = chunks.remainder();
68
69    for chunk in chunks {
70        // Unroll 16 bytes across 4 histograms
71        hist0[chunk[0] as usize] += 1;
72        hist1[chunk[1] as usize] += 1;
73        hist2[chunk[2] as usize] += 1;
74        hist3[chunk[3] as usize] += 1;
75
76        hist0[chunk[4] as usize] += 1;
77        hist1[chunk[5] as usize] += 1;
78        hist2[chunk[6] as usize] += 1;
79        hist3[chunk[7] as usize] += 1;
80
81        hist0[chunk[8] as usize] += 1;
82        hist1[chunk[9] as usize] += 1;
83        hist2[chunk[10] as usize] += 1;
84        hist3[chunk[11] as usize] += 1;
85
86        hist0[chunk[12] as usize] += 1;
87        hist1[chunk[13] as usize] += 1;
88        hist2[chunk[14] as usize] += 1;
89        hist3[chunk[15] as usize] += 1;
90    }
91
92    // Handle remainder
93    for &b in remainder {
94        hist0[b as usize] += 1;
95    }
96
97    // Merge histograms using SIMD
98    use std::arch::x86_64::*;
99
100    let mut result = [0u32; 256];
101
102    // Process 8 u32s at a time with AVX2
103    for i in (0..256).step_by(8) {
104        // SAFETY: AVX2 is enabled via target_feature, pointers are valid and aligned
105        unsafe {
106            let v0 = _mm256_loadu_si256(hist0[i..].as_ptr() as *const __m256i);
107            let v1 = _mm256_loadu_si256(hist1[i..].as_ptr() as *const __m256i);
108            let v2 = _mm256_loadu_si256(hist2[i..].as_ptr() as *const __m256i);
109            let v3 = _mm256_loadu_si256(hist3[i..].as_ptr() as *const __m256i);
110
111            // Add all 4 histograms
112            let sum01 = _mm256_add_epi32(v0, v1);
113            let sum23 = _mm256_add_epi32(v2, v3);
114            let sum = _mm256_add_epi32(sum01, sum23);
115
116            _mm256_storeu_si256(result[i..].as_mut_ptr() as *mut __m256i, sum);
117        }
118    }
119
120    result
121}
122
123#[cfg(test)]
124mod tests {
125    use super::*;
126
127    #[test]
128    fn test_histogram_empty() {
129        let hist = byte_histogram(&[]);
130        assert!(hist.iter().all(|&c| c == 0));
131    }
132
133    #[test]
134    fn test_histogram_single_byte() {
135        let data = vec![42u8; 100];
136        let hist = byte_histogram(&data);
137        assert_eq!(hist[42], 100);
138        assert_eq!(hist.iter().filter(|&&c| c > 0).count(), 1);
139    }
140
141    #[test]
142    fn test_histogram_all_bytes() {
143        let data: Vec<u8> = (0..=255).collect();
144        let hist = byte_histogram(&data);
145        assert!(hist.iter().all(|&c| c == 1));
146    }
147
148    #[test]
149    fn test_histogram_repeated_pattern() {
150        let pattern = b"ABCD";
151        let data: Vec<u8> = pattern.iter().cycle().take(1000).cloned().collect();
152        let hist = byte_histogram(&data);
153
154        assert_eq!(hist[b'A' as usize], 250);
155        assert_eq!(hist[b'B' as usize], 250);
156        assert_eq!(hist[b'C' as usize], 250);
157        assert_eq!(hist[b'D' as usize], 250);
158    }
159
160    #[test]
161    fn test_histogram_large() {
162        // Test with large data to exercise SIMD path
163        let data: Vec<u8> = (0..100_000).map(|i| (i % 256) as u8).collect();
164        let hist = byte_histogram(&data);
165
166        // Each byte value should appear ~390 times (100000 / 256)
167        let expected = 100_000 / 256;
168        for &count in hist.iter() {
169            assert!(count >= expected as u32);
170            assert!(count <= (expected + 1) as u32);
171        }
172    }
173}