/// Inputs shorter than this many bytes are handed straight to
/// `histogram_scalar` by `byte_histogram`, before any CPU feature detection.
const SCALAR_THRESHOLD: usize = 96;
/// Minimum input length (in bytes) for `byte_histogram` to pick the AVX2
/// kernel on x86_64.
///
/// NOTE(review): this currently equals `SCALAR_THRESHOLD`, so the length check
/// in the dispatcher is already implied by the scalar early return.
const AVX2_THRESHOLD: usize = 96;
/// Minimum input length (in bytes) for `byte_histogram` to pick the AVX-512
/// kernel on x86_64 (when both `avx512f` and `avx512bw` are detected).
const AVX512_THRESHOLD: usize = 256;
/// Minimum input length (in bytes) for `byte_histogram` to pick
/// `histogram_neon_unrolled` over `histogram_neon` on aarch64.
const NEON_UNROLLED_THRESHOLD: usize = 192;
/// Counts how many times each byte value occurs in `data`.
///
/// Element `b` of the returned array is the number of bytes in `data` equal
/// to `b`.
///
/// Short inputs (fewer than `SCALAR_THRESHOLD` bytes) always use the scalar
/// loop. Longer inputs are dispatched at runtime to an architecture-specific
/// kernel when the CPU reports the required features and the input meets that
/// kernel's length threshold; otherwise the scalar loop is used.
#[inline]
pub fn byte_histogram(data: &[u8]) -> [u32; 256] {
    let len = data.len();

    if len >= SCALAR_THRESHOLD {
        #[cfg(target_arch = "x86_64")]
        {
            let use_avx512 = len >= AVX512_THRESHOLD
                && is_x86_feature_detected!("avx512f")
                && is_x86_feature_detected!("avx512bw");
            if use_avx512 {
                // SAFETY: both target features the kernel is compiled with
                // were detected on the running CPU just above.
                return unsafe { histogram_avx512(data) };
            }

            if len >= AVX2_THRESHOLD && is_x86_feature_detected!("avx2") {
                // SAFETY: AVX2 support was detected on the running CPU.
                return unsafe { histogram_avx2(data) };
            }
        }

        #[cfg(target_arch = "aarch64")]
        {
            if std::arch::is_aarch64_feature_detected!("neon") {
                // SAFETY (both arms): NEON support was detected on the
                // running CPU.
                return if len >= NEON_UNROLLED_THRESHOLD {
                    unsafe { histogram_neon_unrolled(data) }
                } else {
                    unsafe { histogram_neon(data) }
                };
            }
        }
    }

    // Short input, unsupported architecture, or no usable CPU feature.
    histogram_scalar(data)
}
/// Portable byte histogram: one pass over `data`, one counter per byte value.
///
/// Returns an array whose element `b` is the number of occurrences of byte
/// `b` in `data`. An empty slice yields all zeros.
#[inline]
fn histogram_scalar(data: &[u8]) -> [u32; 256] {
    let mut counts = [0u32; 256];
    data.iter()
        .for_each(|&byte| counts[usize::from(byte)] += 1);
    counts
}
/// Byte histogram for x86_64 CPUs with AVX2.
///
/// The input is consumed in 32-byte blocks. Each block is loaded with an
/// unaligned 256-bit load and its bytes are spread round-robin over four
/// independent count tables (byte `j` of a block goes to table `j % 4`).
/// Leftover bytes that do not fill a block are counted into the first table,
/// and the four tables are summed at the end.
///
/// # Safety
///
/// The caller must ensure the running CPU supports AVX2.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn histogram_avx2(data: &[u8]) -> [u32; 256] {
    use std::arch::x86_64::*;

    const BLOCK: usize = 32;

    let mut tables = [[0u32; 256]; 4];

    let mut blocks = data.chunks_exact(BLOCK);
    for block in blocks.by_ref() {
        // `block` is exactly 32 readable bytes; the load has no alignment
        // requirement.
        let vector = _mm256_loadu_si256(block.as_ptr().cast::<__m256i>());
        let lanes: [u8; BLOCK] = std::mem::transmute(vector);

        for quad in lanes.chunks_exact(4) {
            tables[0][quad[0] as usize] += 1;
            tables[1][quad[1] as usize] += 1;
            tables[2][quad[2] as usize] += 1;
            tables[3][quad[3] as usize] += 1;
        }
    }

    // Tail shorter than one block.
    for &byte in blocks.remainder() {
        tables[0][byte as usize] += 1;
    }

    // Fold the three auxiliary tables into the first one.
    let [mut total, second, third, fourth] = tables;
    for (value, slot) in total.iter_mut().enumerate() {
        *slot += second[value] + third[value] + fourth[value];
    }
    total
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f", enable = "avx512bw")]
unsafe fn histogram_avx512(data: &[u8]) -> [u32; 256] {
use std::arch::x86_64::*;
let mut freq0 = [0u32; 256];
let mut freq1 = [0u32; 256];
let mut freq2 = [0u32; 256];
let mut freq3 = [0u32; 256];
let mut freq4 = [0u32; 256];
let mut freq5 = [0u32; 256];
let mut freq6 = [0u32; 256];
let mut freq7 = [0u32; 256];
let chunks = data.len() / 64;
let ptr = data.as_ptr();
for i in 0..chunks {
let chunk = _mm512_loadu_si512(ptr.add(i * 64) as *const i32);
let bytes: [u8; 64] = std::mem::transmute(chunk);
freq0[bytes[0] as usize] += 1;
freq1[bytes[1] as usize] += 1;
freq2[bytes[2] as usize] += 1;
freq3[bytes[3] as usize] += 1;
freq4[bytes[4] as usize] += 1;
freq5[bytes[5] as usize] += 1;
freq6[bytes[6] as usize] += 1;
freq7[bytes[7] as usize] += 1;
freq0[bytes[8] as usize] += 1;
freq1[bytes[9] as usize] += 1;
freq2[bytes[10] as usize] += 1;
freq3[bytes[11] as usize] += 1;
freq4[bytes[12] as usize] += 1;
freq5[bytes[13] as usize] += 1;
freq6[bytes[14] as usize] += 1;
freq7[bytes[15] as usize] += 1;
freq0[bytes[16] as usize] += 1;
freq1[bytes[17] as usize] += 1;
freq2[bytes[18] as usize] += 1;
freq3[bytes[19] as usize] += 1;
freq4[bytes[20] as usize] += 1;
freq5[bytes[21] as usize] += 1;
freq6[bytes[22] as usize] += 1;
freq7[bytes[23] as usize] += 1;
freq0[bytes[24] as usize] += 1;
freq1[bytes[25] as usize] += 1;
freq2[bytes[26] as usize] += 1;
freq3[bytes[27] as usize] += 1;
freq4[bytes[28] as usize] += 1;
freq5[bytes[29] as usize] += 1;
freq6[bytes[30] as usize] += 1;
freq7[bytes[31] as usize] += 1;
freq0[bytes[32] as usize] += 1;
freq1[bytes[33] as usize] += 1;
freq2[bytes[34] as usize] += 1;
freq3[bytes[35] as usize] += 1;
freq4[bytes[36] as usize] += 1;
freq5[bytes[37] as usize] += 1;
freq6[bytes[38] as usize] += 1;
freq7[bytes[39] as usize] += 1;
freq0[bytes[40] as usize] += 1;
freq1[bytes[41] as usize] += 1;
freq2[bytes[42] as usize] += 1;
freq3[bytes[43] as usize] += 1;
freq4[bytes[44] as usize] += 1;
freq5[bytes[45] as usize] += 1;
freq6[bytes[46] as usize] += 1;
freq7[bytes[47] as usize] += 1;
freq0[bytes[48] as usize] += 1;
freq1[bytes[49] as usize] += 1;
freq2[bytes[50] as usize] += 1;
freq3[bytes[51] as usize] += 1;
freq4[bytes[52] as usize] += 1;
freq5[bytes[53] as usize] += 1;
freq6[bytes[54] as usize] += 1;
freq7[bytes[55] as usize] += 1;
freq0[bytes[56] as usize] += 1;
freq1[bytes[57] as usize] += 1;
freq2[bytes[58] as usize] += 1;
freq3[bytes[59] as usize] += 1;
freq4[bytes[60] as usize] += 1;
freq5[bytes[61] as usize] += 1;
freq6[bytes[62] as usize] += 1;
freq7[bytes[63] as usize] += 1;
}
for &b in &data[chunks * 64..] {
freq0[b as usize] += 1;
}
let freq0_ptr = freq0.as_mut_ptr();
let freq1_ptr = freq1.as_ptr();
let freq2_ptr = freq2.as_ptr();
let freq3_ptr = freq3.as_ptr();
let freq4_ptr = freq4.as_ptr();
let freq5_ptr = freq5.as_ptr();
let freq6_ptr = freq6.as_ptr();
let freq7_ptr = freq7.as_ptr();
for i in 0..16 {
let offset = i * 16;
let v0 = _mm512_loadu_si512(freq0_ptr.add(offset) as *const i32);
let v1 = _mm512_loadu_si512(freq1_ptr.add(offset) as *const i32);
let v2 = _mm512_loadu_si512(freq2_ptr.add(offset) as *const i32);
let v3 = _mm512_loadu_si512(freq3_ptr.add(offset) as *const i32);
let v4 = _mm512_loadu_si512(freq4_ptr.add(offset) as *const i32);
let v5 = _mm512_loadu_si512(freq5_ptr.add(offset) as *const i32);
let v6 = _mm512_loadu_si512(freq6_ptr.add(offset) as *const i32);
let v7 = _mm512_loadu_si512(freq7_ptr.add(offset) as *const i32);
let sum01 = _mm512_add_epi32(v0, v1);
let sum23 = _mm512_add_epi32(v2, v3);
let sum45 = _mm512_add_epi32(v4, v5);
let sum67 = _mm512_add_epi32(v6, v7);
let sum0123 = _mm512_add_epi32(sum01, sum23);
let sum4567 = _mm512_add_epi32(sum45, sum67);
let total = _mm512_add_epi32(sum0123, sum4567);
_mm512_storeu_si512(freq0_ptr.add(offset) as *mut i32, total);
}
freq0
}
/// Byte histogram for aarch64 CPUs with NEON, one 16-byte vector per step.
///
/// Each 16-byte block is loaded with `vld1q_u8` and its bytes are spread
/// round-robin over four independent count tables (byte `j` of a block goes
/// to table `j % 4`). Leftover bytes that do not fill a block are counted
/// into the first table. The four tables are then summed four counters at a
/// time with NEON adds.
///
/// # Safety
///
/// The caller must ensure the running CPU supports NEON.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
unsafe fn histogram_neon(data: &[u8]) -> [u32; 256] {
    use std::arch::aarch64::*;

    const BLOCK: usize = 16;

    let mut tables = [[0u32; 256]; 4];

    let mut blocks = data.chunks_exact(BLOCK);
    for block in blocks.by_ref() {
        // `block` is exactly 16 readable bytes.
        let lanes: [u8; BLOCK] = std::mem::transmute(vld1q_u8(block.as_ptr()));

        for quad in lanes.chunks_exact(4) {
            tables[0][quad[0] as usize] += 1;
            tables[1][quad[1] as usize] += 1;
            tables[2][quad[2] as usize] += 1;
            tables[3][quad[3] as usize] += 1;
        }
    }

    // Tail shorter than one block.
    for &byte in blocks.remainder() {
        tables[0][byte as usize] += 1;
    }

    // Fold tables 1..4 into table 0, four counters per 128-bit vector.
    let (head, partials) = tables.split_at_mut(1);
    let total = &mut head[0];
    for offset in (0..256).step_by(4) {
        let first_pair = vaddq_u32(
            vld1q_u32(total.as_ptr().add(offset)),
            vld1q_u32(partials[0].as_ptr().add(offset)),
        );
        let second_pair = vaddq_u32(
            vld1q_u32(partials[1].as_ptr().add(offset)),
            vld1q_u32(partials[2].as_ptr().add(offset)),
        );
        vst1q_u32(
            total.as_mut_ptr().add(offset),
            vaddq_u32(first_pair, second_pair),
        );
    }

    tables[0]
}
/// Byte histogram for aarch64 CPUs with NEON, four 16-byte vectors per step.
///
/// The input is consumed in 64-byte blocks, each read as four `vld1q_u8`
/// loads. Within every 16-byte vector the bytes are spread round-robin over
/// eight independent count tables (byte `j` of a vector goes to table
/// `j % 8`). Leftover bytes that do not fill a 64-byte block are counted into
/// the first table. The eight tables are then summed four counters at a time
/// with NEON adds.
///
/// # Safety
///
/// The caller must ensure the running CPU supports NEON.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
unsafe fn histogram_neon_unrolled(data: &[u8]) -> [u32; 256] {
    use std::arch::aarch64::*;

    const BLOCK: usize = 64;

    let mut tables = [[0u32; 256]; 8];

    let mut blocks = data.chunks_exact(BLOCK);
    for block in blocks.by_ref() {
        // `block` is exactly 64 readable bytes, i.e. four full vectors.
        let base = block.as_ptr();
        let quarters: [[u8; 16]; 4] = [
            std::mem::transmute(vld1q_u8(base)),
            std::mem::transmute(vld1q_u8(base.add(16))),
            std::mem::transmute(vld1q_u8(base.add(32))),
            std::mem::transmute(vld1q_u8(base.add(48))),
        ];

        for lanes in quarters.iter() {
            for octet in lanes.chunks_exact(8) {
                tables[0][octet[0] as usize] += 1;
                tables[1][octet[1] as usize] += 1;
                tables[2][octet[2] as usize] += 1;
                tables[3][octet[3] as usize] += 1;
                tables[4][octet[4] as usize] += 1;
                tables[5][octet[5] as usize] += 1;
                tables[6][octet[6] as usize] += 1;
                tables[7][octet[7] as usize] += 1;
            }
        }
    }

    // Tail shorter than one block.
    for &byte in blocks.remainder() {
        tables[0][byte as usize] += 1;
    }

    // Fold tables 1..8 into table 0, four counters per 128-bit vector, using
    // a pairwise reduction.
    let (head, partials) = tables.split_at_mut(1);
    let total = &mut head[0];
    for offset in (0..256).step_by(4) {
        let pair_01 = vaddq_u32(
            vld1q_u32(total.as_ptr().add(offset)),
            vld1q_u32(partials[0].as_ptr().add(offset)),
        );
        let pair_23 = vaddq_u32(
            vld1q_u32(partials[1].as_ptr().add(offset)),
            vld1q_u32(partials[2].as_ptr().add(offset)),
        );
        let pair_45 = vaddq_u32(
            vld1q_u32(partials[3].as_ptr().add(offset)),
            vld1q_u32(partials[4].as_ptr().add(offset)),
        );
        let pair_67 = vaddq_u32(
            vld1q_u32(partials[5].as_ptr().add(offset)),
            vld1q_u32(partials[6].as_ptr().add(offset)),
        );
        let low_half = vaddq_u32(pair_01, pair_23);
        let high_half = vaddq_u32(pair_45, pair_67);
        vst1q_u32(
            total.as_mut_ptr().add(offset),
            vaddq_u32(low_half, high_half),
        );
    }

    tables[0]
}
/// Computes the Shannon entropy of `data` in bits per byte.
///
/// The byte histogram from `byte_histogram` is turned into probabilities
/// `p = count / data.len()`, and `-p * log2(p)` is summed over every byte
/// value that occurs at least once. Returns `0.0` for an empty slice.
#[inline]
pub fn calculate_entropy(data: &[u8]) -> f64 {
    if data.is_empty() {
        return 0.0;
    }

    let total = data.len() as f64;
    let counts = byte_histogram(data);

    counts
        .iter()
        // Byte values that never occur contribute nothing and would make
        // `log2` receive zero.
        .filter(|&&count| count != 0)
        .map(|&count| f64::from(count) / total)
        .map(|p| -p * p.log2())
        .sum()
}
/// Computes the Shannon entropy of a string by applying `calculate_entropy`
/// to its UTF-8 bytes.
#[inline]
pub fn calculate_entropy_str(s: &str) -> f64 {
    let bytes: &[u8] = s.as_bytes();
    calculate_entropy(bytes)
}
/// Unit tests for the histogram dispatcher and the entropy helpers.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_histogram_empty() {
        let hist = byte_histogram(&[]);
        assert!(hist.iter().all(|&c| c == 0));
    }

    #[test]
    fn test_histogram_single_byte() {
        let hist = byte_histogram(&[0x42]);
        assert_eq!(hist[0x42], 1);
        assert_eq!(hist.iter().sum::<u32>(), 1);
    }

    #[test]
    fn test_histogram_repeated() {
        let data = vec![0xAB; 100];
        let hist = byte_histogram(&data);
        assert_eq!(hist[0xAB], 100);
        assert_eq!(hist.iter().sum::<u32>(), 100);
    }

    #[test]
    fn test_histogram_all_bytes() {
        let data: Vec<u8> = (0u8..=255).collect();
        let hist = byte_histogram(&data);
        assert!(hist.iter().all(|&c| c == 1));
    }

    #[test]
    fn test_histogram_long_string() {
        let data = "The quick brown fox jumps over the lazy dog".repeat(100);
        let hist = byte_histogram(data.as_bytes());
        assert_eq!(hist.iter().sum::<u32>(), data.len() as u32);
    }

    /// The dispatcher must agree with the scalar reference at every length,
    /// including the lengths where it switches kernels.
    #[test]
    fn test_histogram_scalar_matches_simd() {
        let mut lengths: Vec<usize> = vec![0, 1, 15, 16, 31, 32, 63, 64, 100, 1000];
        // Probe just below, at, and just above each dispatch threshold so the
        // kernel switch-over points and their tail handling are exercised.
        for threshold in [
            SCALAR_THRESHOLD,
            AVX2_THRESHOLD,
            AVX512_THRESHOLD,
            NEON_UNROLLED_THRESHOLD,
        ] {
            lengths.extend([threshold - 1, threshold, threshold + 1]);
        }

        for len in lengths {
            let data: Vec<u8> = (0..len).map(|i| (i * 7) as u8).collect();
            let scalar = histogram_scalar(&data);
            let simd = byte_histogram(&data);
            assert_eq!(scalar, simd, "Mismatch at length {}", len);
        }
    }

    #[test]
    fn test_entropy_empty() {
        assert_eq!(calculate_entropy(&[]), 0.0);
    }

    #[test]
    fn test_entropy_uniform() {
        // Every byte value exactly once: 8 bits of entropy per byte.
        let data: Vec<u8> = (0u8..=255).collect();
        let entropy = calculate_entropy(&data);
        assert!((entropy - 8.0).abs() < 0.01);
    }

    #[test]
    fn test_entropy_single_value() {
        let data = vec![0x42; 100];
        let entropy = calculate_entropy(&data);
        assert_eq!(entropy, 0.0);
    }

    #[test]
    fn test_entropy_str() {
        let entropy = calculate_entropy_str("hello world");
        assert!(entropy > 0.0);
        assert!(entropy < 8.0);
    }

    #[test]
    fn test_entropy_high_randomness() {
        let data: Vec<u8> = (0..1000).map(|i| ((i * 17 + 31) % 256) as u8).collect();
        let entropy = calculate_entropy(&data);
        assert!(entropy > 5.0);
    }
}