use super::Match;
/// Cheap Shannon-entropy estimate (bits per byte) computed from at most 256
/// evenly spaced sample bytes of `data`.
///
/// Inputs shorter than 16 bytes return a neutral 4.0 instead of sampling,
/// since the estimate would be meaningless.
#[inline]
pub fn fast_entropy_estimate(data: &[u8]) -> f32 {
    if data.len() < 16 {
        return 4.0;
    }
    const SAMPLE_SIZE: usize = 256;
    // Stride chosen so roughly SAMPLE_SIZE evenly spaced bytes are visited.
    let step = (data.len() / SAMPLE_SIZE.min(data.len())).max(1);
    let mut freq = [0u16; 256];
    let mut sample_count = 0u32;
    for &byte in data.iter().step_by(step).take(SAMPLE_SIZE) {
        freq[byte as usize] += 1;
        sample_count += 1;
    }
    // Defensive: cannot happen for len >= 16, but keep the neutral fallback.
    if sample_count == 0 {
        return 4.0;
    }
    let n = sample_count as f32;
    // Shannon entropy over the sampled histogram: -sum(p * log2 p).
    freq.iter()
        .filter(|&&count| count > 0)
        .map(|&count| {
            let p = count as f32 / n;
            -p * p.log2()
        })
        .sum()
}
/// Quick decision: is `data` worth handing to the compressor at all?
///
/// Tiny payloads (< 32 bytes) are always accepted; otherwise the sampled
/// entropy must stay below ~7.5 bits/byte.
#[inline]
pub fn fast_should_compress(data: &[u8]) -> bool {
    if data.len() < 32 {
        true
    } else {
        fast_entropy_estimate(data) < 7.5
    }
}
/// Coarse block classification produced by `fast_predict_block_type`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FastBlockType {
    /// Store the bytes verbatim; compression is unlikely to help.
    Raw,
    /// Block looks uniform; run-length encode it.
    Rle,
    /// Block has exploitable redundancy; run the full compressor.
    Compress,
}
/// Fast prediction of how a block should be encoded.
///
/// Order of checks: tiny blocks go raw, uniform-looking blocks go RLE,
/// and everything else is decided by the sampled entropy estimate.
#[inline]
pub fn fast_predict_block_type(data: &[u8]) -> FastBlockType {
    if data.len() < 4 {
        return FastBlockType::Raw;
    }
    let first = data[0];
    // Cheap prefilter: are the first (up to) 64 bytes all identical?
    let prefix_uniform = data[..64.min(data.len())].iter().all(|&b| b == first);
    if prefix_uniform {
        // For longer blocks, spot-check 16 evenly spaced positions before
        // committing to RLE.
        let whole_looks_uniform = if data.len() <= 64 {
            true
        } else {
            let step = data.len() / 16;
            (0..16).all(|i| data.get(i * step) == Some(&first))
        };
        if whole_looks_uniform {
            return FastBlockType::Rle;
        }
    }
    if fast_entropy_estimate(data) > 7.5 {
        FastBlockType::Raw
    } else {
        FastBlockType::Compress
    }
}
/// Summary of how compressible a buffer looks, produced by
/// `CompressibilityFingerprint::analyze`.
#[derive(Debug, Clone)]
pub struct CompressibilityFingerprint {
    /// Shannon entropy of the full byte histogram, in bits per byte (0.0..=8.0).
    pub entropy: f32,
    /// Structural pattern detected in the data.
    pub pattern: PatternType,
    /// Rough predicted compressed/original size ratio (smaller is better;
    /// values above 1.0 indicate expected expansion).
    pub estimated_ratio: f32,
    /// Recommended encoding strategy for this data.
    pub strategy: CompressionStrategy,
}
/// Structural pattern classes assigned during analysis.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum PatternType {
    /// Every byte is identical (or input is empty).
    Uniform,
    /// Skewed histogram: entropy below 3.0 bits/byte.
    LowEntropy,
    /// Data repeats with the given period in bytes (1..=8 as detected).
    Periodic { period: usize },
    /// Mostly printable ASCII with common English letter frequencies
    /// (entropy below 5.5 bits/byte).
    TextLike,
    /// Dense histogram but still below the random threshold.
    HighEntropy,
    /// Near-maximal entropy (above 7.5 bits/byte); effectively incompressible.
    Random,
}
/// Encoding strategy recommended by the fingerprint analysis.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CompressionStrategy {
    /// Emit a single run-length-encoded block (uniform data).
    RleBlock,
    /// Use repeat/RLE-style sequence coding (short periods, very regular matches).
    RleSequences,
    /// Entropy-code with the predefined FSE tables.
    PredefinedFse,
    /// Store the block uncompressed.
    RawBlock,
}
impl CompressibilityFingerprint {
    /// Analyze `data` and produce a full fingerprint: exact histogram
    /// entropy, pattern classification, a rough compression-ratio estimate,
    /// and a strategy recommendation.
    pub fn analyze(data: &[u8]) -> Self {
        // Empty input: nothing to compress, store raw.
        if data.is_empty() {
            return Self {
                entropy: 0.0,
                pattern: PatternType::Uniform,
                estimated_ratio: 1.0,
                strategy: CompressionStrategy::RawBlock,
            };
        }
        // Full byte histogram; the `simd` feature delegates to the project's
        // haagenti_simd crate, otherwise a scalar count is used.
        #[cfg(feature = "simd")]
        let freq = haagenti_simd::byte_histogram(data);
        #[cfg(not(feature = "simd"))]
        let freq = {
            let mut f = [0u32; 256];
            for &b in data {
                f[b as usize] += 1;
            }
            f
        };
        let unique_bytes = freq.iter().filter(|&&f| f > 0).count();
        // Single distinct byte value: pure RLE, near-zero output size.
        if unique_bytes == 1 {
            return Self {
                entropy: 0.0,
                pattern: PatternType::Uniform,
                estimated_ratio: 0.01,
                strategy: CompressionStrategy::RleBlock,
            };
        }
        // Exact Shannon entropy (bits/byte) over the full histogram.
        let len = data.len() as f32;
        let entropy: f32 = freq
            .iter()
            .filter(|&&f| f > 0)
            .map(|&f| {
                let p = f as f32 / len;
                -p * p.log2()
            })
            .sum();
        // Classify: detected periodicity wins, then entropy bands.
        let period = detect_period(data);
        let pattern = if let Some(p) = period {
            PatternType::Periodic { period: p }
        } else if entropy < 3.0 {
            PatternType::LowEntropy
        } else if entropy < 5.5 && is_text_like(&freq) {
            PatternType::TextLike
        } else if entropy > 7.5 {
            PatternType::Random
        } else {
            PatternType::HighEntropy
        };
        // Heuristic ratio estimate per pattern class (an estimate only,
        // not a guarantee from the actual encoder).
        let estimated_ratio = match pattern {
            PatternType::Uniform => 0.01,
            PatternType::Periodic { period } => 0.1 + (period as f32 * 0.02),
            PatternType::LowEntropy => 0.3 + (entropy / 10.0),
            PatternType::TextLike => 0.4 + (entropy / 20.0),
            PatternType::HighEntropy => 0.8 + (entropy / 40.0),
            PatternType::Random => 1.1,
        };
        // Scan the first 256 bytes for runs of >= 4 identical bytes; three
        // or more such runs hint at RLE-friendly structure even when the
        // overall entropy is high.
        let has_runs = data.len() >= 8 && {
            let mut runs = 0;
            let mut i = 0;
            while i < data.len().min(256) {
                let start = i;
                while i < data.len().min(256) && data[i] == data[start] {
                    i += 1;
                }
                if i - start >= 4 {
                    runs += 1;
                }
            }
            runs >= 3
        };
        // Map the pattern (plus run/ratio hints) to a concrete strategy.
        let strategy = match pattern {
            PatternType::Uniform => CompressionStrategy::RleBlock,
            PatternType::Periodic { period } if period <= 4 => CompressionStrategy::RleSequences,
            PatternType::LowEntropy if unique_bytes <= 16 => CompressionStrategy::RleSequences,
            PatternType::TextLike => CompressionStrategy::PredefinedFse,
            PatternType::Periodic { .. } => CompressionStrategy::PredefinedFse,
            PatternType::LowEntropy => CompressionStrategy::PredefinedFse,
            PatternType::HighEntropy if has_runs => CompressionStrategy::PredefinedFse,
            PatternType::HighEntropy if estimated_ratio < 0.95 => {
                CompressionStrategy::PredefinedFse
            }
            _ => CompressionStrategy::RawBlock,
        };
        Self {
            entropy,
            pattern,
            estimated_ratio,
            strategy,
        }
    }
    /// Refine the strategy and ratio estimate using previously found
    /// matches (offset/length pairs) over the same `data`.
    ///
    /// NOTE(review): assumes `data` is non-empty when `matches` is non-empty;
    /// an empty `data` would make `coverage` NaN (all branches then no-op).
    pub fn refine_with_matches(&mut self, data: &[u8], matches: &[Match]) {
        // No matches: sequence-based strategies have nothing to code, so
        // fall back to a raw block with a slight expansion estimate.
        if matches.is_empty() {
            if self.strategy == CompressionStrategy::RleSequences
                || self.strategy == CompressionStrategy::PredefinedFse
            {
                self.strategy = CompressionStrategy::RawBlock;
                self.estimated_ratio = 1.05;
            }
            return;
        }
        // Fraction of the input covered by match lengths.
        let matched_bytes: usize = matches.iter().map(|m| m.length).sum();
        let coverage = matched_bytes as f32 / data.len() as f32;
        let (uniform_offsets, uniform_lengths) = analyze_match_uniformity(matches);
        if coverage > 0.5 && uniform_offsets && uniform_lengths {
            // Highly regular matches: repeat/RLE sequence coding pays off.
            self.strategy = CompressionStrategy::RleSequences;
            self.estimated_ratio = 0.2 + (1.0 - coverage) * 0.5;
        } else if coverage > 0.3 {
            self.strategy = CompressionStrategy::PredefinedFse;
            self.estimated_ratio = 0.4 + (1.0 - coverage) * 0.4;
        } else if coverage < 0.1 {
            // Matches too sparse to pay for themselves; store raw.
            self.strategy = CompressionStrategy::RawBlock;
            self.estimated_ratio = 1.02;
        }
        // Coverage between 0.1 and 0.3: keep the histogram-based estimate.
    }
}
/// Look for a short repeating period (1..=8 bytes) in `data`.
///
/// For each candidate period, samples byte pairs `data[i]` vs
/// `data[i - period]` at a stride chosen so at most ~32 comparisons are
/// made, and returns the smallest period for which more than 90% of the
/// sampled pairs agree. Returns `None` for inputs shorter than 8 bytes or
/// when no candidate passes the threshold.
fn detect_period(data: &[u8]) -> Option<usize> {
    if data.len() < 8 {
        return None;
    }
    // The sampling stride does not depend on the candidate period; hoist it
    // out of the loop instead of recomputing it for every candidate.
    let step = (data.len() / 32).max(1);
    for period in 1..=8.min(data.len() / 2) {
        let mut matches = 0;
        let mut checks = 0;
        let mut i = period;
        while i < data.len() {
            if data[i] == data[i - period] {
                matches += 1;
            }
            checks += 1;
            i += step;
        }
        // Accept the first (smallest) period where >90% of samples repeat.
        if checks > 0 && matches as f32 / checks as f32 > 0.9 {
            return Some(period);
        }
    }
    None
}
/// Heuristic check whether a byte histogram looks like ASCII text:
/// mostly printable characters, with a healthy share of the most common
/// English letters (and space).
fn is_text_like(freq: &[u32; 256]) -> bool {
    let total: u32 = freq.iter().sum();
    if total == 0 {
        return false;
    }
    // Printable ASCII range: space (32) through tilde (126).
    let printable: u32 = freq[32..=126].iter().sum();
    // Space plus the most frequent English letters.
    let common_text: u32 = [32, b'e' as usize, b't' as usize, b'a' as usize, b'o' as usize, b'n' as usize]
        .iter()
        .map(|&idx| freq[idx])
        .sum();
    let total = total as f32;
    printable as f32 / total > 0.8 && common_text as f32 / total > 0.2
}
/// Report whether the matches have near-constant offsets and lengths,
/// measured against the first match (offsets within 3, lengths within 2).
/// Zero or one match is trivially uniform in both respects.
fn analyze_match_uniformity(matches: &[Match]) -> (bool, bool) {
    let (first, rest) = match matches.split_first() {
        Some(split) => split,
        None => return (true, true),
    };
    let offsets_uniform = rest.iter().all(|m| m.offset.abs_diff(first.offset) <= 3);
    let lengths_uniform = rest.iter().all(|m| m.length.abs_diff(first.length) <= 2);
    (offsets_uniform, lengths_uniform)
}
#[cfg(test)]
mod tests {
    use super::*;

    // --- Fingerprint pattern classification ---

    #[test]
    fn test_uniform_detection() {
        let data = vec![b'A'; 100];
        let fp = CompressibilityFingerprint::analyze(&data);
        assert_eq!(fp.pattern, PatternType::Uniform);
        assert_eq!(fp.strategy, CompressionStrategy::RleBlock);
        assert!(fp.estimated_ratio < 0.1);
    }

    #[test]
    fn test_random_detection() {
        // Simple LCG-style pseudo-random bytes.
        let data: Vec<u8> = (0u64..256)
            .map(|i| {
                let x = i.wrapping_mul(1103515245).wrapping_add(12345);
                (x >> 16) as u8
            })
            .collect();
        let fp = CompressibilityFingerprint::analyze(&data);
        assert!(fp.entropy > 6.0, "Entropy was {}", fp.entropy);
    }

    #[test]
    fn test_periodic_detection() {
        let pattern = b"ABCD";
        let data: Vec<u8> = pattern.iter().cycle().take(100).copied().collect();
        let fp = CompressibilityFingerprint::analyze(&data);
        if let PatternType::Periodic { period } = fp.pattern {
            assert_eq!(period, 4);
        } else {
            panic!("Expected Periodic pattern, got {:?}", fp.pattern);
        }
    }

    #[test]
    fn test_text_like_detection() {
        let data = b"The quick brown fox jumps over the lazy dog.";
        let fp = CompressibilityFingerprint::analyze(data);
        assert_eq!(fp.pattern, PatternType::TextLike);
    }

    #[test]
    fn test_low_entropy() {
        // Three symbols with a 50/30/20 split: entropy well under 2 bits.
        let mut data = Vec::new();
        data.extend(std::iter::repeat(0u8).take(50));
        data.extend(std::iter::repeat(1u8).take(30));
        data.extend(std::iter::repeat(2u8).take(20));
        let fp = CompressibilityFingerprint::analyze(&data);
        assert!(fp.entropy < 2.0, "Entropy was {}", fp.entropy);
        // Long runs may also register as periodic; both are acceptable.
        assert!(matches!(
            fp.pattern,
            PatternType::LowEntropy | PatternType::Periodic { .. }
        ));
    }

    #[test]
    fn test_empty_data() {
        let fp = CompressibilityFingerprint::analyze(&[]);
        assert_eq!(fp.strategy, CompressionStrategy::RawBlock);
    }

    // --- Match-uniformity helper ---

    #[test]
    fn test_match_uniformity_analysis() {
        let uniform_matches = vec![
            Match::new(10, 5, 4),
            Match::new(20, 5, 4),
            Match::new(30, 5, 4),
        ];
        let (uniform_off, uniform_len) = analyze_match_uniformity(&uniform_matches);
        assert!(uniform_off);
        assert!(uniform_len);
    }

    #[test]
    fn test_match_non_uniformity() {
        let varied_matches = vec![
            Match::new(10, 5, 4),
            Match::new(20, 100, 20),
            Match::new(50, 3, 3),
        ];
        let (uniform_off, uniform_len) = analyze_match_uniformity(&varied_matches);
        assert!(!uniform_off);
        assert!(!uniform_len);
    }

    // --- Refinement with match data ---

    #[test]
    fn test_refine_with_no_matches() {
        let data = b"unique data with no repetition xyz";
        let mut fp = CompressibilityFingerprint::analyze(data);
        fp.refine_with_matches(data, &[]);
        assert_eq!(fp.strategy, CompressionStrategy::RawBlock);
    }

    #[test]
    fn test_refine_with_good_matches() {
        let data = b"abcdabcdabcdabcdabcdabcd";
        let mut fp = CompressibilityFingerprint::analyze(data);
        // Uniform offsets and lengths with high coverage -> RLE sequences.
        let matches = vec![
            Match::new(4, 4, 4),
            Match::new(8, 4, 4),
            Match::new(12, 4, 4),
            Match::new(16, 4, 4),
            Match::new(20, 4, 4),
        ];
        fp.refine_with_matches(data, &matches);
        assert_eq!(fp.strategy, CompressionStrategy::RleSequences);
    }

    // --- Fast entropy estimate ---

    #[test]
    fn test_fast_entropy_estimate_zeros() {
        let data = vec![0u8; 1000];
        let entropy = fast_entropy_estimate(&data);
        assert!(
            entropy < 0.1,
            "Zeros should have ~0 entropy, got {}",
            entropy
        );
    }

    #[test]
    fn test_fast_entropy_estimate_random() {
        let data: Vec<u8> = (0u64..1000)
            .map(|i| {
                let x = i.wrapping_mul(1103515245).wrapping_add(12345);
                (x % 256) as u8
            })
            .collect();
        let entropy = fast_entropy_estimate(&data);
        assert!(
            entropy > 7.0,
            "Random should have high entropy, got {}",
            entropy
        );
    }

    #[test]
    fn test_fast_entropy_estimate_text() {
        let data = b"The quick brown fox jumps over the lazy dog. ";
        let entropy = fast_entropy_estimate(data);
        assert!(
            entropy > 3.5 && entropy < 6.0,
            "Text should have moderate entropy, got {}",
            entropy
        );
    }

    // --- Fast should-compress decision ---

    #[test]
    fn test_fast_should_compress_compressible() {
        let zeros = vec![0u8; 1000];
        assert!(fast_should_compress(&zeros), "Zeros should be compressible");
        let text = b"The quick brown fox jumps over the lazy dog. ";
        assert!(fast_should_compress(text), "Text should be compressible");
        let repeated = b"abcdefgh".repeat(100);
        assert!(
            fast_should_compress(&repeated),
            "Repeated pattern should be compressible"
        );
    }

    #[test]
    fn test_fast_should_compress_incompressible() {
        // PRNG output; borderline by design, so the result is only logged.
        let random: Vec<u8> = (0u64..1000)
            .map(|i| {
                let x = i
                    .wrapping_mul(0x5851f42d4c957f2d)
                    .wrapping_add(0x14057b7ef767814f);
                ((x >> 32) ^ x) as u8
            })
            .collect();
        let should = fast_should_compress(&random);
        println!(
            "Random data should_compress: {} (entropy: {})",
            should,
            fast_entropy_estimate(&random)
        );
    }

    // --- Fast block-type prediction ---

    #[test]
    fn test_fast_predict_block_type_rle() {
        let uniform = vec![b'X'; 1000];
        assert_eq!(fast_predict_block_type(&uniform), FastBlockType::Rle);
    }

    #[test]
    fn test_fast_predict_block_type_compress() {
        let text = b"The quick brown fox jumps over the lazy dog repeatedly.";
        assert_eq!(fast_predict_block_type(text), FastBlockType::Compress);
    }

    #[test]
    fn test_fast_predict_block_type_raw_for_random() {
        // Borderline entropy; observed classification is logged, not asserted.
        let data: Vec<u8> = (0..1000)
            .map(|i| {
                let x = (i as u64).wrapping_mul(0x5851f42d4c957f2d);
                ((x >> 32) ^ x) as u8
            })
            .collect();
        let block_type = fast_predict_block_type(&data);
        println!(
            "High entropy block type: {:?} (entropy: {})",
            block_type,
            fast_entropy_estimate(&data)
        );
    }

    #[test]
    fn test_fast_entropy_estimate_speed() {
        // Coarse smoke test that the sampled estimate stays cheap.
        let data = vec![0u8; 100_000];
        let start = std::time::Instant::now();
        for _ in 0..10_000 {
            std::hint::black_box(fast_entropy_estimate(&data));
        }
        let elapsed = start.elapsed();
        assert!(
            elapsed.as_millis() < 100,
            "fast_entropy_estimate too slow: {:?} for 10K iterations",
            elapsed
        );
    }
}