/// Statistical summary of a byte buffer together with the blocking and codec
/// choices derived from it.
///
/// Values of this type are produced by `AdaptiveBlocker::analyze_block_profile`.
#[derive(Clone, Debug)]
pub struct BlockProfile {
    /// Block size, in bytes, selected for splitting the analyzed data.
    pub optimal_block_size: usize,
    /// Shannon entropy of the byte distribution, in bits per byte (0.0 to 8.0).
    pub entropy: f64,
    /// Fraction of the bytes that sit inside a run of two or more identical
    /// consecutive bytes (0.0 to 1.0).
    pub repetitiveness: f64,
    /// Name of the codec suggested for the data, picked from its entropy.
    pub recommended_codec: &'static str,
}
pub struct AdaptiveBlocker;
impl AdaptiveBlocker {
const MIN_BLOCK_SIZE: usize = 4096; const MAX_BLOCK_SIZE: usize = 262144; const DEFAULT_BLOCK_SIZE: usize = 65536;
pub fn analyze_block_profile(data: &[u8]) -> BlockProfile {
let entropy = Self::calculate_entropy(data);
let repetitiveness = Self::calculate_repetitiveness(data);
let optimal_block_size = Self::calculate_optimal_block_size(entropy, repetitiveness);
let recommended_codec = Self::recommend_codec(entropy);
BlockProfile {
optimal_block_size,
entropy,
repetitiveness,
recommended_codec,
}
}
fn calculate_entropy(data: &[u8]) -> f64 {
let mut freq = [0usize; 256];
for &byte in data {
freq[byte as usize] += 1;
}
let len = data.len() as f64;
let mut entropy = 0.0;
for count in &freq {
if *count > 0 {
let p = *count as f64 / len;
entropy -= p * p.log2();
}
}
entropy
}
fn calculate_repetitiveness(data: &[u8]) -> f64 {
if data.is_empty() {
return 0.0;
}
let mut run_count = 0;
let mut total_run_bytes = 0;
let mut current_run = 1;
for window in data.windows(2) {
if window[0] == window[1] {
current_run += 1;
} else {
if current_run > 1 {
run_count += 1;
total_run_bytes += current_run;
}
current_run = 1;
}
}
if current_run > 1 {
run_count += 1;
total_run_bytes += current_run;
}
let run_ratio = total_run_bytes as f64 / data.len() as f64;
run_ratio.min(1.0)
}
fn calculate_optimal_block_size(entropy: f64, repetitiveness: f64) -> usize {
let compressibility = (8.0 - entropy) + repetitiveness;
match compressibility {
c if c > 6.0 => 262144, c if c > 5.0 => 131072, c if c > 4.0 => 65536, c if c > 3.0 => 32768, c if c > 2.0 => 16384, _ => 8192, }
}
fn recommend_codec(entropy: f64) -> &'static str {
match entropy {
e if e < 2.0 => "RLE", e if e < 3.0 => "Dictionary", e if e < 4.0 => "FOR", e if e < 5.0 => "LZSS", e if e < 6.0 => "ZSTD_Advanced", _ => "LZ4", }
}
pub fn create_adaptive_blocks(data: &[u8]) -> Vec<Vec<u8>> {
let profile = Self::analyze_block_profile(data);
let block_size = profile.optimal_block_size;
data.chunks(block_size)
.map(|chunk| chunk.to_vec())
.collect()
}
pub fn estimate_compression_ratio(block: &[u8]) -> f64 {
let entropy = Self::calculate_entropy(block);
let theoretical_min = entropy / 8.0;
let overhead = 0.02; (theoretical_min + overhead).min(1.0) }
}
/// Outcome of compressing a single block.
#[derive(Clone, Debug)]
pub struct BlockCompressionResult {
    /// Position of the block, as supplied by the caller.
    pub block_index: usize,
    /// Size of the block before compression, in bytes.
    pub original_size: usize,
    /// Size of the block after compression, in bytes.
    pub compressed_size: usize,
    /// `compressed_size / original_size`; lower means better compression.
    pub compression_ratio: f64,
    /// Name of the codec that produced the compressed block.
    pub codec_used: String,
    /// Time spent compressing (presumably microseconds, judging by the `_us`
    /// suffix). `new` initializes it to 0.
    pub compression_time_us: u128,
}

impl BlockCompressionResult {
    /// Builds a result and derives its compression ratio from the two sizes.
    ///
    /// A zero-length original cannot be divided by, so its ratio is reported
    /// as `1.0`. `compression_time_us` starts at 0.
    pub fn new(
        block_index: usize,
        original_size: usize,
        compressed_size: usize,
        codec_used: String,
    ) -> Self {
        let compression_ratio = match original_size {
            0 => 1.0,
            size => compressed_size as f64 / size as f64,
        };

        Self {
            block_index,
            original_size,
            compressed_size,
            compression_ratio,
            codec_used,
            compression_time_us: 0,
        }
    }

    /// Number of bytes removed by compression, or 0 when the block grew.
    pub fn bytes_saved(&self) -> usize {
        let Self {
            original_size,
            compressed_size,
            ..
        } = self;
        original_size.saturating_sub(*compressed_size)
    }
}
/// Collects per-block compression results and keeps running byte totals.
pub struct BlockCompressionOrchestrator {
    /// Every result recorded so far, in insertion order.
    results: Vec<BlockCompressionResult>,
    /// Sum of `original_size` over all recorded results.
    total_original: usize,
    /// Sum of `compressed_size` over all recorded results.
    total_compressed: usize,
}

impl BlockCompressionOrchestrator {
    /// Bytes per megabyte as reported by `summary` (1024 * 1024).
    const BYTES_PER_MB: f64 = 1024.0 * 1024.0;

    /// Creates an orchestrator with no recorded results.
    pub fn new() -> Self {
        Self {
            results: Vec::new(),
            total_original: 0,
            total_compressed: 0,
        }
    }

    /// Records `result` and adds its sizes to the running totals.
    pub fn add_result(&mut self, result: BlockCompressionResult) {
        self.total_original += result.original_size;
        self.total_compressed += result.compressed_size;
        self.results.push(result);
    }

    /// Returns total compressed bytes divided by total original bytes.
    ///
    /// Returns `1.0` when no original bytes have been recorded.
    pub fn overall_ratio(&self) -> f64 {
        if self.total_original == 0 {
            1.0
        } else {
            self.total_compressed as f64 / self.total_original as f64
        }
    }

    /// Returns the recorded result with the lowest compression ratio, or
    /// `None` when nothing has been recorded.
    pub fn best_block(&self) -> Option<&BlockCompressionResult> {
        self.results.iter().min_by(|a, b| Self::compare_ratios(a, b))
    }

    /// Returns the recorded result with the highest compression ratio, or
    /// `None` when nothing has been recorded.
    pub fn worst_block(&self) -> Option<&BlockCompressionResult> {
        self.results.iter().max_by(|a, b| Self::compare_ratios(a, b))
    }

    /// Formats the block count, both totals in megabytes, and the overall
    /// ratio as a percentage.
    ///
    /// Sizes are converted to floating point before dividing so that totals
    /// below one megabyte, and fractional megabytes in general, are not
    /// truncated away.
    pub fn summary(&self) -> String {
        format!(
            "Blocks: {} | Original: {:.2} MB | Compressed: {:.2} MB | Ratio: {:.1}%",
            self.results.len(),
            self.total_original as f64 / Self::BYTES_PER_MB,
            self.total_compressed as f64 / Self::BYTES_PER_MB,
            self.overall_ratio() * 100.0
        )
    }

    /// Orders two results by compression ratio; pairs that cannot be ordered
    /// (a NaN ratio) are treated as equal.
    fn compare_ratios(
        a: &BlockCompressionResult,
        b: &BlockCompressionResult,
    ) -> std::cmp::Ordering {
        a.compression_ratio
            .partial_cmp(&b.compression_ratio)
            .unwrap_or(std::cmp::Ordering::Equal)
    }
}

impl Default for BlockCompressionOrchestrator {
    /// Equivalent to [`BlockCompressionOrchestrator::new`].
    fn default() -> Self {
        Self::new()
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A buffer holding one repeated byte value carries almost no entropy.
    #[test]
    fn test_entropy_uniform() {
        let constant_bytes = [42u8; 1000];
        assert!(AdaptiveBlocker::calculate_entropy(&constant_bytes) < 0.1);
    }

    /// Cycling evenly through every byte value approaches the 8-bit maximum.
    #[test]
    fn test_entropy_random() {
        let cycled: Vec<u8> = (0..=u8::MAX).cycle().take(1024).collect();
        assert!(AdaptiveBlocker::calculate_entropy(&cycled) > 7.5);
    }

    /// Two long runs back to back make the buffer mostly repetitive.
    #[test]
    fn test_repetitiveness_high() {
        let mut two_runs = vec![1u8; 10];
        two_runs.extend_from_slice(&[2u8; 6]);
        assert!(AdaptiveBlocker::calculate_repetitiveness(&two_runs) > 0.5);
    }

    /// Noisy data gets small blocks; highly compressible data gets the largest.
    #[test]
    fn test_block_size_selection() {
        assert!(AdaptiveBlocker::calculate_optimal_block_size(7.5, 0.1) <= 65536);
        assert_eq!(
            AdaptiveBlocker::calculate_optimal_block_size(1.0, 0.9),
            262144
        );
    }

    /// A constant buffer is estimated to compress to well under half its size.
    #[test]
    fn test_compression_ratio_estimation() {
        let constant_bytes = [1u8; 1000];
        assert!(AdaptiveBlocker::estimate_compression_ratio(&constant_bytes) < 0.5);
    }

    /// Blocking non-empty data yields at least one block within the size cap.
    #[test]
    fn test_adaptive_blocks() {
        let data: Vec<u8> = (0..=u8::MAX).cycle().take(10000).collect();
        let blocks = AdaptiveBlocker::create_adaptive_blocks(&data);
        assert!(!blocks.is_empty());
        assert!(blocks[0].len() <= AdaptiveBlocker::MAX_BLOCK_SIZE);
    }

    /// Totals accumulate across results and a best block can be picked.
    #[test]
    fn test_orchestrator() {
        let mut orch = BlockCompressionOrchestrator::new();
        let recorded = [(0, 1000, 500, "RLE"), (1, 1000, 700, "ZSTD")];
        for (index, original, compressed, codec) in recorded {
            orch.add_result(BlockCompressionResult::new(
                index,
                original,
                compressed,
                codec.to_string(),
            ));
        }
        assert_eq!(orch.overall_ratio(), 0.6);
        assert!(orch.best_block().is_some());
    }
}