use crate::compression::dict_zip::{
compression_types::{CompressionType, Match, calculate_encoding_cost, choose_best_compression_type},
dictionary::{SuffixArrayDictionary, MatchStats},
local_matcher::{LocalMatcher, LocalMatcherConfig, LocalMatch, LocalMatcherStats},
dfa_cache::{CacheStats},
reference_encoding::{compress_record_reference},
};
use crate::error::{Result, ZiporaError};
use crate::memory::SecureMemoryPool;
#[cfg(test)]
use crate::memory::SecurePoolConfig;
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};
use std::sync::Arc;
use std::time::{Duration, Instant};
/// Tuning knobs for [`PaZipCompressor`].
///
/// Instances are usually obtained from the presets on this type
/// (`fast_compression`, `high_compression`, `balanced`, `realtime`,
/// `reference_compliant`) and tweaked from there.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct PaZipCompressorConfig {
    /// Configuration forwarded verbatim to the local (sliding-window) matcher.
    pub local_config: LocalMatcherConfig,
    /// Probe budget passed to the local matcher per position.
    pub max_local_probe_distance: u32,
    /// Probe budget intended for global (dictionary) matching.
    /// NOTE(review): currently not read by `find_global_match` — confirm.
    pub max_global_probe_distance: u32,
    /// Minimum net benefit (bits saved minus encoding cost) a match must
    /// reach before it beats the literal fallback.
    pub min_net_benefit: i32,
    /// Cost, in bits, charged per literal byte when pricing strategies.
    pub literal_cost_bits: u32,
    /// Extra cost, in bits, charged for a global-dictionary access.
    pub global_access_cost: u32,
    /// Learning rate for the adaptive-threshold feedback loop (0.0..=1.0).
    pub learning_rate: f64,
    /// Enables the adaptive-threshold feedback loop during compression.
    pub adaptive_thresholds: bool,
    /// Routes compression through the reference encoding path
    /// (`compress_record_reference`) instead of the legacy encoder.
    pub use_reference_encoding: bool,
    /// Uses suffix-array based local matching in the reference path.
    pub use_suffix_array_local_match: bool,
    /// Enables SIMD acceleration where available.
    pub enable_simd: bool,
    /// Allows block-wise processing for large inputs (see
    /// `multithreading_threshold`).
    pub enable_multithreading: bool,
    /// Input size, in bytes, above which block-wise processing kicks in.
    pub multithreading_threshold: usize,
    /// Initial capacity of the internal output staging buffer.
    pub output_buffer_size: usize,
    /// Enables collection of more expensive per-operation statistics.
    pub collect_detailed_stats: bool,
}

impl Default for PaZipCompressorConfig {
    /// Balanced defaults; identical to [`PaZipCompressorConfig::balanced`].
    fn default() -> Self {
        Self {
            local_config: LocalMatcherConfig::default(),
            max_local_probe_distance: 8,
            max_global_probe_distance: 16,
            min_net_benefit: 2,
            literal_cost_bits: 8,
            global_access_cost: 4,
            learning_rate: 0.1,
            adaptive_thresholds: true,
            use_reference_encoding: false,
            use_suffix_array_local_match: false,
            enable_simd: true,
            enable_multithreading: true,
            multithreading_threshold: 64 * 1024,
            output_buffer_size: 1024 * 1024,
            collect_detailed_stats: false,
        }
    }
}
impl PaZipCompressorConfig {
    /// Preset favoring throughput: shallow probing, low match threshold,
    /// slow-moving adaptation, no detailed statistics.
    pub fn fast_compression() -> Self {
        Self {
            max_local_probe_distance: 4,
            max_global_probe_distance: 8,
            min_net_benefit: 1,
            learning_rate: 0.05,
            collect_detailed_stats: false,
            ..Default::default()
        }
    }

    /// Preset favoring ratio: deep probing, stricter match threshold,
    /// faster adaptation, detailed statistics enabled.
    pub fn high_compression() -> Self {
        Self {
            max_local_probe_distance: 16,
            max_global_probe_distance: 32,
            min_net_benefit: 3,
            learning_rate: 0.15,
            collect_detailed_stats: true,
            ..Default::default()
        }
    }

    /// Preset balancing speed and ratio; same as [`Default`].
    pub fn balanced() -> Self {
        Self::default()
    }

    /// Preset for latency-sensitive use: minimal probing, no adaptation,
    /// single-threaded, legacy encoding only.
    pub fn realtime() -> Self {
        Self {
            max_local_probe_distance: 2,
            max_global_probe_distance: 4,
            min_net_benefit: 0,
            adaptive_thresholds: false,
            use_reference_encoding: false,
            use_suffix_array_local_match: false,
            enable_multithreading: false,
            collect_detailed_stats: false,
            ..Default::default()
        }
    }

    /// Preset mirroring the upstream reference implementation: deep probing,
    /// reference encoding with suffix-array local matching, deterministic
    /// (no adaptation, single-threaded) and fully instrumented.
    pub fn reference_compliant() -> Self {
        Self {
            max_local_probe_distance: 30,
            max_global_probe_distance: 100,
            use_reference_encoding: true,
            use_suffix_array_local_match: true,
            adaptive_thresholds: false,
            enable_multithreading: false,
            collect_detailed_stats: true,
            ..Default::default()
        }
    }
}
/// The encoding decision made for one position of the input.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum CompressionStrategy {
    /// Emit `length` raw bytes.
    Literal { length: u8 },
    /// Back-reference into already-emitted output.
    Local { distance: u32, length: u32, match_type: CompressionType },
    /// Reference into the shared global dictionary.
    Global { dict_offset: u32, length: u32, match_type: CompressionType },
}

/// Cost breakdown used to rank candidate strategies against each other.
#[derive(Debug, Clone)]
pub struct CostAnalysis {
    /// Bits saved minus total cost; negative for pure literals.
    pub net_benefit: i32,
    /// Cost of the encoded representation, in bits.
    pub encoding_cost: u32,
    /// Extra access cost (only non-zero for global matches).
    pub access_cost: u32,
    /// `encoding_cost + access_cost`.
    pub total_cost: u32,
    /// Number of input bytes the strategy covers.
    pub match_length: u32,
    /// Fraction of the raw bit size saved; 0.0 for literals.
    pub efficiency: f64,
}
/// Aggregated counters and derived metrics for one compression run.
#[derive(Debug, Clone, Default)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct CompressionStats {
    /// Total input bytes consumed.
    pub bytes_processed: u64,
    /// Total output bytes produced.
    pub bytes_output: u64,
    /// `bytes_output / bytes_processed` (smaller is better); set by `finalize`.
    pub compression_ratio: f64,
    /// Number of literal strategies emitted.
    pub literal_count: u64,
    /// Number of local (back-reference) matches emitted.
    pub local_matches: u64,
    /// Number of global (dictionary) matches emitted.
    pub global_matches: u64,
    /// Input bytes covered by local matches.
    pub local_bytes_saved: u64,
    /// Input bytes covered by global matches.
    pub global_bytes_saved: u64,
    /// Mean local match length; set by `finalize`.
    pub avg_local_length: f64,
    /// Mean global match length; set by `finalize`.
    pub avg_global_length: f64,
    /// DFA-cache hit rate, if collected.
    pub cache_hit_rate: f64,
    /// Wall-clock duration of the run; set by `finalize`.
    pub compression_time: Duration,
    /// Throughput in bytes/second; set by `finalize`.
    pub processing_speed: f64,
    /// Transitions between strategy kinds during the run.
    pub strategy_switches: u64,
    /// Number of adaptive-threshold adjustments performed.
    pub threshold_adjustments: u64,
    /// Usage histogram indexed by `CompressionType as usize` (8 variants).
    pub compression_type_usage: [u64; 8],
}

impl CompressionStats {
    /// Creates a zeroed statistics record.
    pub fn new() -> Self {
        Self::default()
    }

    /// Computes the derived metrics (ratio, speed, averages) from the raw
    /// counters, timing the run relative to `start_time`.
    pub fn finalize(&mut self, start_time: Instant) {
        self.compression_time = start_time.elapsed();
        if self.bytes_processed > 0 {
            self.compression_ratio = self.bytes_output as f64 / self.bytes_processed as f64;
            let secs = self.compression_time.as_secs_f64();
            // Guard against a zero-duration clock reading, which would
            // otherwise yield an infinite/NaN throughput.
            if secs > 0.0 {
                self.processing_speed = self.bytes_processed as f64 / secs;
            }
        }
        if self.local_matches > 0 {
            self.avg_local_length = self.local_bytes_saved as f64 / self.local_matches as f64;
        }
        if self.global_matches > 0 {
            self.avg_global_length = self.global_bytes_saved as f64 / self.global_matches as f64;
        }
    }

    /// Accumulates the raw counters of `other` into `self`.
    ///
    /// Derived metrics (`compression_ratio`, `processing_speed`, averages) and
    /// `compression_time` are NOT merged; call [`finalize`](Self::finalize)
    /// afterwards to recompute them.
    pub fn merge(&mut self, other: &CompressionStats) {
        self.bytes_processed += other.bytes_processed;
        self.bytes_output += other.bytes_output;
        self.literal_count += other.literal_count;
        self.local_matches += other.local_matches;
        self.global_matches += other.global_matches;
        self.local_bytes_saved += other.local_bytes_saved;
        self.global_bytes_saved += other.global_bytes_saved;
        self.strategy_switches += other.strategy_switches;
        self.threshold_adjustments += other.threshold_adjustments;
        for (mine, theirs) in self
            .compression_type_usage
            .iter_mut()
            .zip(other.compression_type_usage.iter())
        {
            *mine += *theirs;
        }
    }
}
/// PA-Zip compressor combining a global suffix-array dictionary with a
/// local sliding-window matcher, selecting per-position strategies by cost.
#[derive(Clone)]
pub struct PaZipCompressor {
    // Global dictionary used for cross-record matches.
    dictionary: SuffixArrayDictionary,
    // Sliding-window matcher for back-references within the record.
    local_matcher: LocalMatcher,
    // Tuning knobs; fixed for the lifetime of the compressor.
    config: PaZipCompressorConfig,
    // Pool backing internal allocations (shared with the local matcher).
    memory_pool: Arc<SecureMemoryPool>,
    // Feedback-adjusted decision thresholds (see AdaptiveThresholds).
    adaptive_thresholds: AdaptiveThresholds,
    // Statistics for the current/most recent run.
    stats: CompressionStats,
    // Staging buffer the legacy encoder appends to before copying out.
    output_buffer: Vec<u8>,
    // Strategy chosen at the previous position, for switch counting.
    current_strategy: Option<CompressionStrategy>,
}

/// Mutable decision thresholds tuned online from observed efficiency.
#[derive(Debug, Clone)]
struct AdaptiveThresholds {
    // Minimum adjusted benefit required to displace the fallback strategy.
    min_net_benefit: f64,
    // Positive values favor global matches, negative favor local ones.
    global_bias: f64,
    // Penalty subtracted from literal candidates.
    literal_threshold: f64,
    // Smoothing factor for threshold updates (0..1).
    momentum: f64,
    // Number of updates applied so far.
    update_count: u64,
}

impl Default for AdaptiveThresholds {
    fn default() -> Self {
        Self {
            min_net_benefit: 2.0,
            global_bias: 0.0,
            literal_threshold: 1.0,
            momentum: 0.9,
            update_count: 0,
        }
    }
}
impl AdaptiveThresholds {
    /// Nudges the decision thresholds based on the efficiency observed after
    /// applying `strategy`, then clamps everything into its sane range.
    ///
    /// Good global matches (efficiency > 0.8) raise the global bias; poor
    /// ones lower it. Local matches push the bias the opposite way at half
    /// strength. Inefficient literals raise the literal penalty.
    fn update(&mut self, efficiency: f64, strategy: CompressionStrategy, learning_rate: f64) {
        // Momentum-damped step size: mostly `momentum`, plus a small
        // learning-rate-scaled remainder.
        let update_factor = learning_rate * (1.0 - self.momentum) + self.momentum;
        match strategy {
            CompressionStrategy::Global { .. } => {
                if efficiency > 0.8 {
                    self.global_bias += update_factor * 0.1;
                } else if efficiency < 0.4 {
                    self.global_bias -= update_factor * 0.1;
                }
            },
            CompressionStrategy::Local { .. } => {
                if efficiency > 0.8 {
                    self.global_bias -= update_factor * 0.05;
                } else if efficiency < 0.4 {
                    self.global_bias += update_factor * 0.05;
                }
            },
            CompressionStrategy::Literal { .. } => {
                if efficiency < 0.3 {
                    self.literal_threshold += update_factor * 0.2;
                }
            },
        }
        self.global_bias = self.global_bias.clamp(-2.0, 2.0);
        self.literal_threshold = self.literal_threshold.clamp(0.5, 5.0);
        // NOTE(review): min_net_benefit is clamped here but never modified by
        // this method, so the clamp is currently a no-op — confirm intent.
        self.min_net_benefit = self.min_net_benefit.clamp(0.5, 10.0);
        self.update_count += 1;
    }
}
impl PaZipCompressor {
/// Creates a compressor over a pre-built `dictionary`.
///
/// The `memory_pool` is shared with the internal local matcher. Fails if
/// the local matcher cannot be constructed from `config.local_config`.
pub fn new(
    dictionary: SuffixArrayDictionary,
    config: PaZipCompressorConfig,
    memory_pool: Arc<SecureMemoryPool>,
) -> Result<Self> {
    let local_matcher = LocalMatcher::new(config.local_config.clone(), memory_pool.clone())?;
    // Pre-size the staging buffer so the common case never reallocates.
    let output_buffer = Vec::with_capacity(config.output_buffer_size);
    Ok(Self {
        dictionary,
        local_matcher,
        config,
        memory_pool,
        adaptive_thresholds: AdaptiveThresholds::default(),
        stats: CompressionStats::new(),
        output_buffer,
        current_strategy: None,
    })
}
/// Compresses `input`, appending the encoded stream to `output`, and
/// returns the statistics for this run.
///
/// Resets the previous run's statistics first. Inputs at or above
/// `multithreading_threshold` (when enabled) take the block-wise path.
pub fn compress(&mut self, input: &[u8], output: &mut Vec<u8>) -> Result<CompressionStats> {
    let start_time = Instant::now();
    self.stats = CompressionStats::new();
    self.output_buffer.clear();
    if input.is_empty() {
        return Ok(self.stats.clone());
    }
    if self.config.enable_multithreading && input.len() >= self.config.multithreading_threshold {
        self.compress_parallel(input, output)?;
    } else {
        self.compress_sequential(input, output)?;
    }
    self.stats.finalize(start_time);
    Ok(self.stats.clone())
}

/// Dispatches to the reference or legacy single-pass encoder per config.
fn compress_sequential(&mut self, input: &[u8], output: &mut Vec<u8>) -> Result<()> {
    if self.config.use_reference_encoding {
        self.compress_sequential_reference(input, output)
    } else {
        self.compress_sequential_legacy(input, output)
    }
}
/// Compresses via the reference-format encoder, updating the byte counters
/// from its return value.
fn compress_sequential_reference(&mut self, input: &[u8], output: &mut Vec<u8>) -> Result<()> {
    // Only pass a dictionary slice when the dictionary actually holds data.
    let global_dictionary = if self.dictionary.size_in_bytes() > 0 {
        Some(self.dictionary.data())
    } else {
        None
    };
    // Reference-format parameters: 24-bit global offsets and a short-match
    // length cap of 32 — presumably mirroring the upstream reference
    // encoder's defaults; TODO confirm against `compress_record_reference`.
    let g_offset_bits = 24;
    let g_max_short_len = 32;
    let bytes_written = compress_record_reference(
        input,
        output,
        self.config.use_suffix_array_local_match,
        global_dictionary,
        g_offset_bits,
        g_max_short_len,
    )?;
    self.stats.bytes_processed = input.len() as u64;
    self.stats.bytes_output = bytes_written as u64;
    Ok(())
}
fn compress_sequential_legacy(&mut self, input: &[u8], output: &mut Vec<u8>) -> Result<()> {
let mut pos = 0;
while pos < input.len() {
let local_match = self.find_local_match(input, pos)?;
let global_match = self.find_global_match(input, pos)?;
let strategies = self.calculate_strategy_costs(input, pos, local_match, global_match)?;
let selected_strategy = self.select_optimal_strategy(strategies)?;
let mut temp_buffer = Vec::new();
let advance_length = self.apply_compression_strategy(input, pos, selected_strategy, &mut temp_buffer)?;
self.output_buffer.extend_from_slice(&temp_buffer);
self.update_statistics(selected_strategy, advance_length);
if self.config.adaptive_thresholds {
self.update_adaptive_thresholds(selected_strategy);
}
pos += advance_length;
self.current_strategy = Some(selected_strategy);
}
output.extend_from_slice(&self.output_buffer);
self.stats.bytes_processed = input.len() as u64;
self.stats.bytes_output = output.len() as u64;
Ok(())
}
/// Decompresses a legacy-format stream produced by `compress`, replacing
/// the contents of `output`.
///
/// Each record starts with a one-byte type tag (0..=7) followed by a
/// type-specific payload decoded by `decompress_match`.
pub fn decompress(&mut self, input: &[u8], output: &mut Vec<u8>) -> Result<()> {
    if input.is_empty() {
        return Ok(());
    }
    output.clear();
    // Heuristic pre-allocation; compressed data usually expands ~2x here.
    output.reserve(input.len() * 2);
    let mut pos = 0;
    while pos < input.len() {
        let compression_type = match input[pos] {
            0 => CompressionType::Literal,
            1 => CompressionType::Global,
            2 => CompressionType::RLE,
            3 => CompressionType::NearShort,
            4 => CompressionType::Far1Short,
            5 => CompressionType::Far2Short,
            6 => CompressionType::Far2Long,
            7 => CompressionType::Far3Long,
            // Unknown tags (8..=255) are treated as literals; the encoder's
            // fallback path also rewrites unsupported types to literal tags.
            _ => CompressionType::Literal,
        };
        pos += 1;
        pos = self.decompress_match(input, pos, compression_type, output)?;
    }
    Ok(())
}
/// Decodes one record payload of the given `compression_type` starting at
/// `pos` (just past the type tag), appends the decoded bytes to `output`,
/// and returns the position of the next record.
///
/// Payload layouts mirror `apply_compression_strategy` exactly:
/// Literal: len:u8 + bytes; Global: offset:u16 + len:u16;
/// RLE: byte:u8 + len:u8; NearShort: dist:u8 + len:u8;
/// Far1Short: dist:u16 + len:u8; Far2Short: dist:u32 + len:u8;
/// Far2Long: dist:u16 + len:u16; Far3Long: dist:u32 + len:u32.
///
/// Truncated payloads are reported as errors rather than silently ignored,
/// since resynchronizing on arbitrary bytes would decode garbage.
fn decompress_match(
    &mut self,
    input: &[u8],
    pos: usize,
    compression_type: CompressionType,
    output: &mut Vec<u8>,
) -> Result<usize> {
    let mut new_pos = pos;
    match compression_type {
        CompressionType::Literal => {
            if new_pos >= input.len() {
                return Err(ZiporaError::invalid_data("Truncated literal header"));
            }
            let length = input[new_pos] as usize;
            new_pos += 1;
            if new_pos + length > input.len() {
                return Err(ZiporaError::invalid_data("Literal data exceeds input bounds"));
            }
            output.extend_from_slice(&input[new_pos..new_pos + length]);
            new_pos += length;
        }
        CompressionType::Global => {
            if new_pos + 3 >= input.len() {
                return Err(ZiporaError::invalid_data("Truncated global match"));
            }
            let offset = u16::from_le_bytes([input[new_pos], input[new_pos + 1]]) as usize;
            let length = u16::from_le_bytes([input[new_pos + 2], input[new_pos + 3]]) as usize;
            new_pos += 4;
            let dict_text = self.dictionary.dictionary_text();
            if offset + length > dict_text.len() {
                return Err(ZiporaError::invalid_data("Global match exceeds dictionary bounds"));
            }
            output.extend_from_slice(&dict_text[offset..offset + length]);
        }
        CompressionType::RLE => {
            if new_pos + 1 >= input.len() {
                return Err(ZiporaError::invalid_data("Truncated RLE match"));
            }
            let byte_value = input[new_pos];
            let length = input[new_pos + 1] as usize;
            new_pos += 2;
            // Repeat a single byte `length` times.
            output.extend(std::iter::repeat(byte_value).take(length));
        }
        CompressionType::NearShort => {
            if new_pos + 1 >= input.len() {
                return Err(ZiporaError::invalid_data("Truncated NearShort match"));
            }
            let distance = input[new_pos] as usize;
            let length = input[new_pos + 1] as usize;
            new_pos += 2;
            self.copy_from_distance(output, distance, length)?;
        }
        CompressionType::Far1Short => {
            // The encoder writes a 2-byte distance for Far1Short
            // (see `apply_compression_strategy`); read it back the same way.
            if new_pos + 2 >= input.len() {
                return Err(ZiporaError::invalid_data("Truncated Far1Short match"));
            }
            let distance = u16::from_le_bytes([input[new_pos], input[new_pos + 1]]) as usize;
            let length = input[new_pos + 2] as usize;
            new_pos += 3;
            self.copy_from_distance(output, distance, length)?;
        }
        CompressionType::Far2Short => {
            // The encoder writes the full 4-byte (u32) distance for
            // Far2Short; mirror that here.
            if new_pos + 4 >= input.len() {
                return Err(ZiporaError::invalid_data("Truncated Far2Short match"));
            }
            let distance = u32::from_le_bytes([
                input[new_pos], input[new_pos + 1], input[new_pos + 2], input[new_pos + 3]
            ]) as usize;
            let length = input[new_pos + 4] as usize;
            new_pos += 5;
            self.copy_from_distance(output, distance, length)?;
        }
        CompressionType::Far2Long => {
            if new_pos + 3 >= input.len() {
                return Err(ZiporaError::invalid_data("Truncated Far2Long match"));
            }
            let distance = u16::from_le_bytes([input[new_pos], input[new_pos + 1]]) as usize;
            let length = u16::from_le_bytes([input[new_pos + 2], input[new_pos + 3]]) as usize;
            new_pos += 4;
            self.copy_from_distance(output, distance, length)?;
        }
        CompressionType::Far3Long => {
            if new_pos + 7 >= input.len() {
                return Err(ZiporaError::invalid_data("Truncated Far3Long match"));
            }
            let distance = u32::from_le_bytes([
                input[new_pos], input[new_pos + 1], input[new_pos + 2], input[new_pos + 3]
            ]) as usize;
            let length = u32::from_le_bytes([
                input[new_pos + 4], input[new_pos + 5], input[new_pos + 6], input[new_pos + 7]
            ]) as usize;
            new_pos += 8;
            self.copy_from_distance(output, distance, length)?;
        }
    }
    Ok(new_pos)
}
/// Appends `length` bytes copied from `distance` bytes behind the current
/// end of `output` (an LZ-style back-reference copy).
///
/// Overlapping matches (`length > distance`) are handled by cycling
/// through the `distance`-sized source window, which is always in bounds,
/// so exactly `length` bytes are appended.
///
/// # Errors
/// Returns an error if `distance` is zero or reaches before the start of
/// the output produced so far.
fn copy_from_distance(&self, output: &mut Vec<u8>, distance: usize, length: usize) -> Result<()> {
    if distance == 0 || distance > output.len() {
        return Err(ZiporaError::invalid_data("Invalid backreference distance"));
    }
    let start_pos = output.len() - distance;
    for i in 0..length {
        // `start_pos + (i % distance)` is always < the pre-copy length,
        // hence always in bounds even as `output` grows.
        let byte = output[start_pos + (i % distance)];
        output.push(byte);
    }
    Ok(())
}
/// Compresses large inputs in fixed-size blocks and concatenates the
/// per-block streams into `output`.
///
/// NOTE(review): despite the name, blocks are currently processed one
/// after another on the calling thread — no actual parallelism yet.
/// NOTE(review): each per-block call overwrites `stats.bytes_processed` /
/// `bytes_output`, so the final statistics reflect only the last block —
/// confirm whether totals should be accumulated instead.
fn compress_parallel(&mut self, input: &[u8], output: &mut Vec<u8>) -> Result<()> {
    const PARALLEL_THRESHOLD: usize = 1024 * 1024;
    const BLOCK_SIZE: usize = 64 * 1024;
    if input.len() < PARALLEL_THRESHOLD {
        return self.compress_sequential(input, output);
    }
    output.clear();
    // One scratch buffer reused across blocks instead of collecting every
    // block's output into a Vec<Vec<u8>> first.
    let mut block_output = Vec::new();
    for block in input.chunks(BLOCK_SIZE) {
        // Defensive: the legacy sequential path accumulates into
        // self.output_buffer; reset it so each block encodes independently.
        self.output_buffer.clear();
        block_output.clear();
        self.compress_sequential(block, &mut block_output)?;
        output.extend_from_slice(&block_output);
    }
    Ok(())
}
/// Asks the local matcher for the best back-reference starting at `pos`,
/// capped by the configured probe distance and maximum match length.
fn find_local_match(&mut self, input: &[u8], pos: usize) -> Result<Option<LocalMatch>> {
    if pos >= input.len() {
        return Ok(None);
    }
    let remaining = &input[pos..];
    let max_length = remaining.len().min(self.config.local_config.max_match_length);
    self.local_matcher.find_match(remaining, self.config.max_local_probe_distance as usize, max_length)
}

/// Asks the global dictionary for the longest match starting at `pos`.
///
/// NOTE(review): the match length is hard-capped at 256 and
/// `config.max_global_probe_distance` is not consulted here — confirm.
fn find_global_match(&mut self, input: &[u8], pos: usize) -> Result<Option<crate::compression::dict_zip::matcher::Match>> {
    if pos >= input.len() {
        return Ok(None);
    }
    let remaining = &input[pos..];
    let max_length = remaining.len().min(256);
    self.dictionary.find_longest_match(remaining, 0, max_length)
}
/// Builds the list of candidate strategies for the current position with
/// their cost analyses. The single-byte literal is always a candidate;
/// local/global matches are added when present and encodable.
fn calculate_strategy_costs(
    &self,
    _input: &[u8],
    _pos: usize,
    local_match: Option<LocalMatch>,
    global_match: Option<crate::compression::dict_zip::matcher::Match>,
) -> Result<Vec<(CompressionStrategy, CostAnalysis)>> {
    // Seed with the literal fallback so the list is never empty.
    let mut candidates = vec![(
        CompressionStrategy::Literal { length: 1 },
        self.calculate_literal_cost(1),
    )];
    if let Some(m) = local_match {
        if let Some(entry) = self.calculate_local_match_cost(m)? {
            candidates.push(entry);
        }
    }
    if let Some(m) = global_match {
        if let Some(entry) = self.calculate_global_match_cost(m)? {
            candidates.push(entry);
        }
    }
    Ok(candidates)
}
/// Prices emitting `length` literal bytes: pure cost, zero savings, so the
/// net benefit is the negated encoding cost and efficiency is 0.
fn calculate_literal_cost(&self, length: u32) -> CostAnalysis {
    let bits = self.config.literal_cost_bits * length;
    CostAnalysis {
        net_benefit: -(bits as i32),
        encoding_cost: bits,
        access_cost: 0,
        total_cost: bits,
        match_length: length,
        efficiency: 0.0,
    }
}
/// Prices a local back-reference, returning `None` when no compression
/// type can encode its distance/length combination.
fn calculate_local_match_cost(&self, local_match: LocalMatch) -> Result<Option<(CompressionStrategy, CostAnalysis)>> {
    // Copy the fields we still need before the match is consumed below;
    // this avoids the clone the conversion would otherwise force.
    let distance = local_match.distance;
    let length = local_match.length;
    let compression_type = match choose_best_compression_type(distance, length) {
        Some(ct) => ct,
        None => return Ok(None),
    };
    let temp_match = Match::from_local_match(local_match, compression_type);
    let encoding_cost = calculate_encoding_cost(&temp_match);
    // Local matches incur no global-dictionary access cost.
    let total_cost = encoding_cost;
    // Benefit = bits covered by the match minus bits spent encoding it.
    let net_benefit = length as i32 * 8 - total_cost as i32;
    let strategy = CompressionStrategy::Local {
        distance: distance as u32,
        length: length as u32,
        match_type: compression_type,
    };
    let cost_analysis = CostAnalysis {
        net_benefit,
        encoding_cost: encoding_cost as u32,
        access_cost: 0,
        total_cost: total_cost as u32,
        match_length: length as u32,
        efficiency: if length > 0 {
            (length as f64 * 8.0 - total_cost as f64) / (length as f64 * 8.0)
        } else {
            0.0
        },
    };
    Ok(Some((strategy, cost_analysis)))
}
/// Prices a global dictionary match, charging the configured dictionary
/// access cost on top of the encoding cost.
fn calculate_global_match_cost(&self, global_match: crate::compression::dict_zip::matcher::Match) -> Result<Option<(CompressionStrategy, CostAnalysis)>> {
    let length = global_match.length;
    let dict_position = global_match.dict_position;
    // Throwaway Match built solely to price its encoded size.
    let temp_match = Match::Global {
        dict_position: dict_position as u32,
        length: length as u16,
    };
    let encoding_cost = calculate_encoding_cost(&temp_match) as u32;
    let access_cost = self.config.global_access_cost;
    let total_cost = encoding_cost + access_cost;
    // Benefit = bits covered minus total (encoding + access) cost.
    let net_benefit = length as i32 * 8 - total_cost as i32;
    let saved_bits = length as f64 * 8.0;
    let efficiency = if length > 0 {
        (saved_bits - total_cost as f64) / saved_bits
    } else {
        0.0
    };
    let strategy = CompressionStrategy::Global {
        dict_offset: dict_position as u32,
        length: length as u32,
        match_type: CompressionType::Global,
    };
    let cost_analysis = CostAnalysis {
        net_benefit,
        encoding_cost,
        access_cost,
        total_cost,
        match_length: length as u32,
        efficiency,
    };
    Ok(Some((strategy, cost_analysis)))
}
/// Picks the winning strategy after applying the adaptive bias/penalties.
///
/// The first candidate (conventionally the literal fallback) is the
/// default winner at its raw, unadjusted benefit; any candidate whose
/// adjusted benefit both clears `min_net_benefit` and beats the incumbent
/// takes over.
fn select_optimal_strategy(
    &self,
    strategies: Vec<(CompressionStrategy, CostAnalysis)>,
) -> Result<CompressionStrategy> {
    if strategies.is_empty() {
        return Ok(CompressionStrategy::Literal { length: 1 });
    }
    let mut best_strategy = strategies[0].0;
    let mut best_benefit = strategies[0].1.net_benefit as f64;
    for (strategy, analysis) in strategies.iter() {
        // Bias: reward/penalize by strategy kind using the learned thresholds.
        let bias = match strategy {
            CompressionStrategy::Global { .. } => self.adaptive_thresholds.global_bias,
            CompressionStrategy::Local { .. } => -(self.adaptive_thresholds.global_bias * 0.5),
            CompressionStrategy::Literal { .. } => -self.adaptive_thresholds.literal_threshold,
        };
        let adjusted_benefit = analysis.net_benefit as f64 + bias;
        let clears_threshold = adjusted_benefit >= self.adaptive_thresholds.min_net_benefit;
        if clears_threshold && adjusted_benefit > best_benefit {
            best_strategy = *strategy;
            best_benefit = adjusted_benefit;
        }
    }
    Ok(best_strategy)
}
/// Serializes one strategy into `output` (type tag + payload) and returns
/// how many input bytes it consumed.
///
/// Payload layouts must stay in sync with `decompress_match`.
fn apply_compression_strategy(
    &mut self,
    input: &[u8],
    pos: usize,
    strategy: CompressionStrategy,
    output: &mut Vec<u8>,
) -> Result<usize> {
    match strategy {
        CompressionStrategy::Literal { length } => {
            // Clamp to the input end so the recorded length byte always
            // matches the number of bytes actually emitted — the decoder
            // trusts it verbatim.
            let end_pos = (pos + length as usize).min(input.len());
            let actual = end_pos - pos;
            output.push(0); // type tag: literal
            output.push(actual as u8);
            output.extend_from_slice(&input[pos..end_pos]);
            Ok(actual)
        },
        CompressionStrategy::Local { distance, length, match_type } => {
            output.push(match_type as u8);
            match match_type {
                CompressionType::RLE => {
                    // Repeated byte value followed by the run length.
                    output.push(if pos < input.len() { input[pos] } else { 0 });
                    output.push(length as u8);
                },
                CompressionType::NearShort => {
                    output.push(distance as u8);
                    output.push(length as u8);
                },
                CompressionType::Far1Short => {
                    output.extend_from_slice(&(distance as u16).to_le_bytes());
                    output.push(length as u8);
                },
                CompressionType::Far2Short => {
                    output.extend_from_slice(&distance.to_le_bytes());
                    output.push(length as u8);
                },
                CompressionType::Far2Long => {
                    output.extend_from_slice(&(distance as u16).to_le_bytes());
                    output.extend_from_slice(&(length as u16).to_le_bytes());
                },
                CompressionType::Far3Long => {
                    output.extend_from_slice(&distance.to_le_bytes());
                    output.extend_from_slice(&length.to_le_bytes());
                },
                _ => {
                    // Unsupported local type: rewrite the tag to literal and
                    // emit raw bytes, recording (and returning) the count
                    // actually copied so the decoder is never lied to.
                    let type_byte_index = output.len() - 1;
                    output[type_byte_index] = 0;
                    let end_pos = (pos + length as usize).min(input.len());
                    let actual = end_pos - pos;
                    output.push(actual as u8);
                    output.extend_from_slice(&input[pos..end_pos]);
                    return Ok(actual);
                }
            }
            Ok(length as usize)
        },
        CompressionStrategy::Global { dict_offset, length, match_type: _ } => {
            // NOTE(review): offset and length are truncated to u16 on the
            // wire; values above 65535 would encode incorrectly — confirm
            // that upstream limits guarantee they fit.
            output.push(1); // type tag: global
            output.extend_from_slice(&(dict_offset as u16).to_le_bytes());
            output.extend_from_slice(&(length as u16).to_le_bytes());
            Ok(length as usize)
        },
    }
}
/// Builds a typed `Match` from raw distance/length values, validating that
/// each value fits the chosen variant's field widths.
///
/// # Errors
/// Returns an error when a value does not fit its variant's field, or when
/// `match_type` is not a local-match type.
fn create_local_match(&self, distance: u32, length: u32, match_type: CompressionType) -> Result<Match> {
    match match_type {
        CompressionType::RLE => {
            Ok(Match::RLE {
                // NOTE(review): byte_value is filled with 0 here and
                // presumably patched by the caller — confirm.
                byte_value: 0, length: length.try_into().map_err(|_| ZiporaError::invalid_data("RLE length too large"))?
            })
        },
        CompressionType::NearShort => Ok(Match::NearShort {
            distance: distance.try_into().map_err(|_| ZiporaError::invalid_data("NearShort distance too large"))?,
            length: length.try_into().map_err(|_| ZiporaError::invalid_data("NearShort length too large"))?
        }),
        CompressionType::Far1Short => Ok(Match::Far1Short {
            distance: distance.try_into().map_err(|_| ZiporaError::invalid_data("Far1Short distance too large"))?,
            length: length.try_into().map_err(|_| ZiporaError::invalid_data("Far1Short length too large"))?
        }),
        CompressionType::Far2Short => Ok(Match::Far2Short {
            distance,
            length: length.try_into().map_err(|_| ZiporaError::invalid_data("Far2Short length too large"))?
        }),
        CompressionType::Far2Long => Ok(Match::Far2Long {
            distance: distance.try_into().map_err(|_| ZiporaError::invalid_data("Far2Long distance too large"))?,
            length: length.try_into().map_err(|_| ZiporaError::invalid_data("Far2Long length too large"))?
        }),
        CompressionType::Far3Long => Ok(Match::Far3Long { distance, length }),
        _ => Err(ZiporaError::invalid_data("Invalid match type for local match")),
    }
}
/// Records the applied strategy in the per-kind counters and the
/// per-compression-type histogram, and counts strategy-kind switches.
fn update_statistics(&mut self, strategy: CompressionStrategy, _advance_length: usize) {
    match strategy {
        CompressionStrategy::Literal { .. } => {
            self.stats.literal_count += 1;
            // Slot 0 of the histogram is the literal type.
            self.stats.compression_type_usage[0] += 1;
        },
        CompressionStrategy::Local { length, match_type, .. } => {
            self.stats.local_matches += 1;
            self.stats.local_bytes_saved += length as u64;
            self.stats.compression_type_usage[match_type as usize] += 1;
        },
        CompressionStrategy::Global { length, .. } => {
            self.stats.global_matches += 1;
            self.stats.global_bytes_saved += length as u64;
            self.stats.compression_type_usage[CompressionType::Global as usize] += 1;
        },
    }
    // A "switch" is a change of strategy kind (literal/local/global),
    // regardless of the kinds' payload values.
    if let Some(previous) = self.current_strategy {
        let switched =
            std::mem::discriminant(&strategy) != std::mem::discriminant(&previous);
        if switched {
            self.stats.strategy_switches += 1;
        }
    }
}
/// Feeds the current run's efficiency back into the adaptive thresholds.
///
/// NOTE(review): `stats.bytes_processed` is only assigned at the *end* of
/// the legacy pass, so during compression this computes efficiency = 0.0
/// on every call — confirm whether incremental counters were intended.
fn update_adaptive_thresholds(&mut self, strategy: CompressionStrategy) {
    if !self.config.adaptive_thresholds {
        return;
    }
    // Efficiency = fraction of input bytes eliminated so far.
    let efficiency = if self.stats.bytes_processed > 0 {
        1.0 - (self.stats.bytes_output as f64 / self.stats.bytes_processed as f64)
    } else {
        0.0
    };
    self.adaptive_thresholds.update(efficiency, strategy, self.config.learning_rate);
    self.stats.threshold_adjustments += 1;
}
/// Returns the statistics of the current/most recent compression run.
pub fn stats(&self) -> &CompressionStats {
    &self.stats
}

/// Returns the global dictionary's match statistics.
pub fn dictionary_stats(&self) -> &MatchStats {
    self.dictionary.match_stats()
}

/// Returns the local matcher's statistics.
pub fn local_matcher_stats(&self) -> &LocalMatcherStats {
    self.local_matcher.stats()
}

/// Returns the dictionary's DFA-cache statistics.
pub fn cache_stats(&self) -> Result<CacheStats> {
    Ok(self.dictionary.cache_stats())
}

/// Clears the compressor's and the local matcher's statistics.
pub fn reset_stats(&mut self) {
    self.stats = CompressionStats::new();
    self.local_matcher.reset_stats();
}
/// Validates the compressor's configuration.
///
/// # Errors
/// Fails when `min_net_benefit` is negative, `learning_rate` is outside
/// `0.0..=1.0` (or NaN), or `literal_cost_bits` is zero.
pub fn validate(&self) -> Result<()> {
    if self.config.min_net_benefit < 0 {
        return Err(ZiporaError::invalid_data("Minimum net benefit must be >= 0"));
    }
    // `contains` is false for NaN, so a NaN learning rate is rejected too
    // (the old `< 0.0 || > 1.0` comparison silently accepted NaN).
    if !(0.0..=1.0).contains(&self.config.learning_rate) {
        return Err(ZiporaError::invalid_data("Learning rate must be between 0.0 and 1.0"));
    }
    if self.config.literal_cost_bits == 0 {
        return Err(ZiporaError::invalid_data("Literal cost bits must be > 0"));
    }
    Ok(())
}
}
/// Conversion from a local-matcher match into a typed `Match`, given the
/// compression type chosen for it.
trait MatchConversion {
    fn from_local_match(local: LocalMatch, compression_type: CompressionType) -> Self;
}

impl MatchConversion for Match {
    /// Infallible conversion used for cost estimation.
    ///
    /// NOTE(review): out-of-range values saturate to each field's maximum
    /// via `unwrap_or` instead of failing; acceptable for pricing, but this
    /// must not be used to build wire-accurate matches — confirm callers.
    fn from_local_match(local: LocalMatch, compression_type: CompressionType) -> Self {
        match compression_type {
            CompressionType::RLE => Match::RLE {
                // byte_value is unknown at this point; 0 placeholder.
                byte_value: 0, length: local.length.try_into().unwrap_or(255)
            },
            CompressionType::NearShort => Match::NearShort {
                distance: local.distance.try_into().unwrap_or(255),
                length: local.length.try_into().unwrap_or(255)
            },
            CompressionType::Far1Short => Match::Far1Short {
                distance: local.distance.try_into().unwrap_or(65535),
                length: local.length.try_into().unwrap_or(255)
            },
            CompressionType::Far2Short => Match::Far2Short {
                distance: local.distance as u32,
                length: local.length.try_into().unwrap_or(255)
            },
            CompressionType::Far2Long => Match::Far2Long {
                distance: local.distance.try_into().unwrap_or(65535),
                length: local.length.try_into().unwrap_or(65535)
            },
            CompressionType::Far3Long => Match::Far3Long {
                distance: local.distance as u32,
                length: local.length as u32
            },
            // Any non-local type degrades to a literal of the same length.
            _ => Match::Literal { length: local.length.try_into().unwrap_or(255) },
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::compression::dict_zip::{DictionaryBuilder, DictionaryBuilderConfig};
    use crate::memory::SecureMemoryPool;

    /// Builds a compressor over a small repetitive training corpus.
    /// `pub` so `bench_tests` below can reuse it.
    pub fn setup_test_compressor() -> Result<PaZipCompressor> {
        let training_data = b"the quick brown fox jumps over the lazy dog. the quick brown fox jumps again.";
        let dict_config = DictionaryBuilderConfig {
            target_dict_size: 2048,
            max_dict_size: 4096,
            validate_result: true,
            ..Default::default()
        };
        let builder = DictionaryBuilder::with_config(dict_config);
        let dictionary = builder.build(training_data)?;
        let config = PaZipCompressorConfig::balanced();
        let pool = SecureMemoryPool::new(SecurePoolConfig::new(4096, 1024, 8))?;
        PaZipCompressor::new(dictionary, config, pool)
    }

    /// Construction succeeds and yields a valid configuration.
    #[test]
    fn test_compressor_creation() -> Result<()> {
        let compressor = setup_test_compressor()?;
        assert!(compressor.validate().is_ok());
        Ok(())
    }

    /// Empty input produces empty output and zeroed counters.
    #[test]
    fn test_empty_input_compression() -> Result<()> {
        let mut compressor = setup_test_compressor()?;
        let input = b"";
        let mut output = Vec::new();
        let stats = compressor.compress(input, &mut output)?;
        assert_eq!(stats.bytes_processed, 0);
        assert_eq!(stats.bytes_output, 0);
        assert!(output.is_empty());
        Ok(())
    }

    /// A short input is processed fully and never expands the ratio past 1.
    #[test]
    fn test_small_input_compression() -> Result<()> {
        let mut compressor = setup_test_compressor()?;
        let input = b"the quick brown fox";
        let mut output = Vec::new();
        let stats = compressor.compress(input, &mut output)?;
        assert_eq!(stats.bytes_processed, input.len() as u64);
        assert!(stats.compression_ratio <= 1.0);
        Ok(())
    }

    /// Repetitive input should produce matches and actually shrink.
    #[test]
    fn test_compression_with_repetitive_data() -> Result<()> {
        let mut compressor = setup_test_compressor()?;
        let input = b"the quick brown fox jumps over the lazy dog. the quick brown fox jumps over the lazy dog.";
        let mut output = Vec::new();
        let stats = compressor.compress(input, &mut output)?;
        assert!(stats.global_matches > 0 || stats.local_matches > 0);
        assert!(stats.compression_ratio < 1.0);
        Ok(())
    }

    /// Sanity-checks the relative ordering of the preset configurations.
    #[test]
    fn test_configuration_presets() {
        let fast = PaZipCompressorConfig::fast_compression();
        let high = PaZipCompressorConfig::high_compression();
        let balanced = PaZipCompressorConfig::balanced();
        let realtime = PaZipCompressorConfig::realtime();
        assert!(fast.max_local_probe_distance < high.max_local_probe_distance);
        assert!(realtime.max_global_probe_distance < balanced.max_global_probe_distance);
        assert!(!realtime.adaptive_thresholds);
        assert!(high.collect_detailed_stats);
        assert!(!fast.collect_detailed_stats);
    }

    /// A high-efficiency global strategy should raise the global bias.
    #[test]
    fn test_adaptive_thresholds() {
        let mut thresholds = AdaptiveThresholds::default();
        let initial_bias = thresholds.global_bias;
        thresholds.update(0.9, CompressionStrategy::Global {
            dict_offset: 0,
            length: 10,
            match_type: CompressionType::Global
        }, 0.1);
        assert!(thresholds.global_bias > initial_bias);
        assert_eq!(thresholds.update_count, 1);
    }

    /// Compression updates the byte counters and records a nonzero duration.
    #[test]
    fn test_statistics_tracking() -> Result<()> {
        let mut compressor = setup_test_compressor()?;
        let input = b"test data for statistics tracking";
        let mut output = Vec::new();
        let initial_stats = compressor.stats().clone();
        compressor.compress(input, &mut output)?;
        let final_stats = compressor.stats();
        assert!(final_stats.bytes_processed > initial_stats.bytes_processed);
        assert!(final_stats.compression_time > Duration::from_nanos(0));
        Ok(())
    }

    /// End-to-end roundtrip: decompress(compress(x)) == x.
    #[test]
    fn test_compression_decompression_roundtrip() -> Result<()> {
        let mut compressor = setup_test_compressor()?;
        let original_input = b"The quick brown fox jumps over the lazy dog";
        let mut compressed_output = Vec::new();
        let _stats = compressor.compress(original_input, &mut compressed_output)?;
        assert!(!compressed_output.is_empty(), "Compression should produce output");
        let mut decompressed_output = Vec::new();
        compressor.decompress(&compressed_output, &mut decompressed_output)?;
        assert_eq!(
            original_input,
            &decompressed_output[..],
            "Decompressed output should match original input.\nOriginal: {:?}\nDecompressed: {:?}",
            std::str::from_utf8(original_input).unwrap_or("(invalid UTF-8)"),
            std::str::from_utf8(&decompressed_output).unwrap_or("(invalid UTF-8)")
        );
        Ok(())
    }

    /// Roundtrip for input likely dominated by literal encoding.
    #[test]
    fn test_simple_literal_compression() -> Result<()> {
        let mut compressor = setup_test_compressor()?;
        let simple_input = b"hello world";
        let mut compressed_output = Vec::new();
        let _stats = compressor.compress(simple_input, &mut compressed_output)?;
        assert!(!compressed_output.is_empty(), "Compression should produce output");
        println!("Original: {:?}", simple_input);
        println!("Compressed: {:?}", compressed_output);
        let mut decompressed_output = Vec::new();
        compressor.decompress(&compressed_output, &mut decompressed_output)?;
        println!("Decompressed: {:?}", decompressed_output);
        assert_eq!(
            simple_input,
            &decompressed_output[..],
            "Simple literal compression/decompression failed"
        );
        Ok(())
    }

    /// Literal pricing matches the configured per-byte bit cost.
    #[test]
    fn test_cost_analysis() -> Result<()> {
        let compressor = setup_test_compressor()?;
        let literal_cost = compressor.calculate_literal_cost(5);
        assert_eq!(literal_cost.match_length, 5);
        assert_eq!(literal_cost.encoding_cost, compressor.config.literal_cost_bits * 5);
        assert_eq!(literal_cost.efficiency, 0.0);
        Ok(())
    }

    /// A clearly beneficial local match must beat the literal fallback.
    #[test]
    fn test_strategy_selection() -> Result<()> {
        let compressor = setup_test_compressor()?;
        let strategies = vec![
            (CompressionStrategy::Literal { length: 1 }, CostAnalysis {
                net_benefit: -8,
                encoding_cost: 8,
                access_cost: 0,
                total_cost: 8,
                match_length: 1,
                efficiency: 0.0,
            }),
            (CompressionStrategy::Local {
                distance: 10,
                length: 8,
                match_type: CompressionType::NearShort
            }, CostAnalysis {
                net_benefit: 50,
                encoding_cost: 14,
                access_cost: 0,
                total_cost: 14,
                match_length: 8,
                efficiency: 0.7,
            }),
        ];
        let selected = compressor.select_optimal_strategy(strategies)?;
        match selected {
            CompressionStrategy::Local { length: 8, .. } => {},
            _ => panic!("Expected local strategy to be selected"),
        }
        Ok(())
    }

    /// Out-of-range configuration values must fail validation.
    #[test]
    fn test_validation() -> Result<()> {
        let compressor = setup_test_compressor()?;
        assert!(compressor.validate().is_ok());
        let mut invalid_config = PaZipCompressorConfig::default();
        invalid_config.min_net_benefit = -10;
        invalid_config.learning_rate = 2.0;
        let training_data = b"test";
        let builder = DictionaryBuilder::default();
        let dictionary = builder.build(training_data)?;
        let pool = SecureMemoryPool::new(SecurePoolConfig::new(4096, 1024, 8))?;
        // Construction does not validate; the explicit validate() call must.
        let invalid_compressor = PaZipCompressor::new(dictionary, invalid_config, pool)?;
        assert!(invalid_compressor.validate().is_err());
        Ok(())
    }
}
#[cfg(test)]
mod bench_tests {
    use super::*;
    use std::time::Instant;

    /// Smoke benchmark: compresses ~45 KB of repetitive text and asserts a
    /// minimal throughput and an actual size reduction. Not a precise
    /// benchmark — just guards against pathological regressions.
    #[test]
    fn bench_compression_speed() -> Result<()> {
        let mut compressor = setup_test_compressor()?;
        let test_data = "the quick brown fox jumps over the lazy dog. ".repeat(1000);
        let input = test_data.as_bytes();
        let mut output = Vec::new();
        let start = Instant::now();
        let stats = compressor.compress(input, &mut output)?;
        let elapsed = start.elapsed();
        let speed_mbps = (input.len() as f64 / 1024.0 / 1024.0) / elapsed.as_secs_f64();
        println!("Compression speed: {:.2} MB/s", speed_mbps);
        println!("Compression ratio: {:.3}", stats.compression_ratio);
        println!("Global matches: {}, Local matches: {}", stats.global_matches, stats.local_matches);
        // Very loose bound so debug builds still pass.
        assert!(speed_mbps > 0.1);
        assert!(stats.compression_ratio < 1.0);
        Ok(())
    }

    // `use` items may appear anywhere in a module; this one pulls in the
    // shared fixture from the sibling test module.
    use super::tests::setup_test_compressor;
}
#[cfg(test)]
mod reference_compliance_tests {
    use super::*;
    use crate::compression::dict_zip::{DictionaryBuilder, DictionaryBuilderConfig};
    use crate::memory::SecureMemoryPool;

    /// Builds a compressor in reference-compliant mode over a small fixed
    /// training corpus, with dictionary validation disabled for speed.
    fn setup_reference_compliant_compressor() -> Result<PaZipCompressor> {
        let training_data = b"The quick brown fox jumps over the lazy dog.";
        let dict_config = DictionaryBuilderConfig {
            target_dict_size: 1024,
            max_dict_size: 4096,
            validate_result: false,
            sample_ratio: 1.0,
            ..Default::default()
        };
        let builder = DictionaryBuilder::with_config(dict_config);
        let dictionary = builder.build(training_data)?;
        let config = PaZipCompressorConfig::reference_compliant();
        let pool = SecureMemoryPool::new(SecurePoolConfig::new(4096, 1024, 8))?;
        PaZipCompressor::new(dictionary, config, pool)
    }

    /// Basic sanity of the reference-compliant output: non-empty, bounded
    /// expansion, and self-consistent statistics.
    #[test]
    fn test_reference_compliant_compression_basic() -> Result<()> {
        let mut compressor = setup_reference_compliant_compressor()?;
        let input = b"The quick brown fox";
        let mut output = Vec::new();
        let stats = compressor.compress(input, &mut output)?;
        assert!(!output.is_empty(), "Compression should produce output");
        assert!(output.len() <= input.len() + 20, "Output should not be much larger than input");
        assert!(stats.bytes_processed > 0, "Should process some bytes");
        assert!(stats.bytes_processed <= input.len() as u64, "Should not process more than input");
        assert!(stats.bytes_output > 0, "Should produce some output");
        assert!(stats.compression_ratio > 0.0);
        assert!(output.len() >= 4, "Output should have at least some encoding overhead");
        println!(
            "Reference compression: {} -> {} bytes (ratio: {:.3})",
            input.len(), output.len(), stats.compression_ratio
        );
        println!(
            "Output format validation: first 10 bytes = {:?}",
            &output[..output.len().min(10)]
        );
        Ok(())
    }

    /// Exercises a spread of input patterns (alphabet, runs, text, digits,
    /// zeros, full byte range) and checks each produces a plausible encoding.
    #[test]
    fn test_reference_compliant_compression_patterns() -> Result<()> {
        let mut compressor = setup_reference_compliant_compressor()?;
        let test_cases = vec![
            b"abcdefghijklmnopqrstuvwxyz".to_vec(),
            b"aaaaaaaaaaaaaaaaaaaaaaaa".to_vec(),
            b"The quick brown fox jumps over the lazy dog".to_vec(),
            b"1234567890".repeat(5),
            vec![0u8; 64],
            (0..=255u8).collect(),
        ];
        for (i, input) in test_cases.iter().enumerate() {
            let mut output = Vec::new();
            let stats = compressor.compress(input, &mut output)?;
            assert!(!output.is_empty(), "Test case {} should produce output", i);
            assert!(output.len() >= 2, "Test case {} should have minimum encoding overhead", i);
            println!(
                "Test case {}: input len={}, output len={}, ratio={:.3}",
                i, input.len(), output.len(), stats.compression_ratio
            );
        }
        Ok(())
    }

    /// Checks the reported statistics are populated for input that repeats a
    /// phrase from the training corpus. (Previously each assertion here was
    /// duplicated; the redundant copies have been removed.)
    #[test]
    fn test_reference_compliant_compression_type_usage() -> Result<()> {
        let mut compressor = setup_reference_compliant_compressor()?;
        let input = b"The quick brown fox jumps over the lazy dog. The quick brown fox.";
        let mut output = Vec::new();
        let stats = compressor.compress(input, &mut output)?;
        assert!(stats.bytes_processed > 0, "Should process some bytes");
        assert!(stats.bytes_output > 0, "Should produce some output");
        assert!(stats.compression_ratio > 0.0);
        println!(
            "Compression stats: global={}, local={}, literals={}",
            stats.global_matches, stats.local_matches, stats.literal_count
        );
        Ok(())
    }

    /// Runs the reference-compliant and legacy (default-config) pipelines on
    /// the same input/dictionary and checks both produce valid output.
    #[test]
    fn test_reference_vs_legacy_compression() -> Result<()> {
        let base_compressor = setup_reference_compliant_compressor()?;
        // Both compressors share an identical dictionary for a fair comparison.
        let dictionary1 = base_compressor.dictionary.clone();
        let dictionary2 = dictionary1.clone();
        let pool = SecureMemoryPool::new(SecurePoolConfig::new(4096, 1024, 8))?;
        let ref_config = PaZipCompressorConfig::reference_compliant();
        let mut ref_compressor = PaZipCompressor::new(dictionary1, ref_config, pool.clone())?;
        let legacy_config = PaZipCompressorConfig::default();
        let mut legacy_compressor = PaZipCompressor::new(dictionary2, legacy_config, pool)?;

        let input = b"The quick brown fox jumps over the lazy dog";
        let mut ref_output = Vec::new();
        let ref_stats = ref_compressor.compress(input, &mut ref_output)?;
        let mut legacy_output = Vec::new();
        let legacy_stats = legacy_compressor.compress(input, &mut legacy_output)?;

        assert!(!ref_output.is_empty());
        assert!(!legacy_output.is_empty());
        assert!(ref_output.len() >= 4, "Reference compression should have minimum overhead");
        assert!(legacy_output.len() >= 4, "Legacy compression should have minimum overhead");
        println!(
            "Reference compression: {} -> {} bytes (ratio: {:.3})",
            input.len(), ref_output.len(), ref_stats.compression_ratio
        );
        println!(
            "Legacy compression: {} -> {} bytes (ratio: {:.3})",
            input.len(), legacy_output.len(), legacy_stats.compression_ratio
        );
        assert!(ref_stats.compression_ratio > 0.0);
        assert!(legacy_stats.compression_ratio > 0.0);
        Ok(())
    }

    /// Tiny and boundary inputs (1-4 bytes, extreme byte values) must still
    /// produce a non-empty encoding with the minimum overhead.
    #[test]
    fn test_reference_compliant_edge_cases() -> Result<()> {
        let mut compressor = setup_reference_compliant_compressor()?;
        let edge_cases = vec![
            vec![0u8],
            vec![255u8],
            vec![0u8, 255u8],
            b"A".to_vec(),
            b"AA".to_vec(),
            b"AB".to_vec(),
            b"ABC".to_vec(),
            b"ABCD".to_vec(),
        ];
        for (i, input) in edge_cases.iter().enumerate() {
            let mut output = Vec::new();
            let _stats = compressor.compress(input, &mut output)?;
            assert!(!output.is_empty(), "Edge case {} should produce output", i);
            assert!(output.len() >= 2, "Edge case {} should have minimum encoding overhead", i);
        }
        Ok(())
    }

    /// Compares the suffix-array and hash-table local-match back-ends on the
    /// same dictionary; both must emit a valid, minimally-sized encoding.
    #[test]
    fn test_reference_compliant_suffix_array_vs_hash_table() -> Result<()> {
        let base_compressor = setup_reference_compliant_compressor()?;
        let dictionary1 = base_compressor.dictionary.clone();
        let dictionary2 = dictionary1.clone();
        let pool = SecureMemoryPool::new(SecurePoolConfig::new(4096, 1024, 8))?;
        let mut sa_config = PaZipCompressorConfig::reference_compliant();
        sa_config.use_suffix_array_local_match = true;
        let mut sa_compressor = PaZipCompressor::new(dictionary1, sa_config, pool.clone())?;
        let mut ht_config = PaZipCompressorConfig::reference_compliant();
        ht_config.use_suffix_array_local_match = false;
        let mut ht_compressor = PaZipCompressor::new(dictionary2, ht_config, pool)?;

        let input = b"The quick brown fox jumps";
        let mut sa_output = Vec::new();
        let sa_stats = sa_compressor.compress(input, &mut sa_output)?;
        let mut ht_output = Vec::new();
        let ht_stats = ht_compressor.compress(input, &mut ht_output)?;

        assert!(!sa_output.is_empty());
        assert!(!ht_output.is_empty());
        assert!(sa_output.len() >= 4, "Suffix array compression should have minimum overhead");
        assert!(ht_output.len() >= 4, "Hash table compression should have minimum overhead");
        println!(
            "Suffix array compression: {} -> {} bytes (ratio: {:.3})",
            input.len(), sa_output.len(), sa_stats.compression_ratio
        );
        println!(
            "Hash table compression: {} -> {} bytes (ratio: {:.3})",
            input.len(), ht_output.len(), ht_stats.compression_ratio
        );
        Ok(())
    }
}