mod analysis;
mod arena;
pub mod block;
mod match_finder;
mod repeat_offset_finder;
mod sequences;
pub mod speculative;
pub use analysis::{
fast_entropy_estimate,
fast_predict_block_type,
fast_should_compress,
CompressibilityFingerprint,
CompressionStrategy,
FastBlockType,
PatternType,
};
pub use arena::{Arena, ArenaVec, DEFAULT_ARENA_SIZE};
pub use block::{encode_block, BlockEncoder};
pub use match_finder::{CacheAligned, LazyMatchFinder, Match, MatchFinder};
pub use repeat_offset_finder::RepeatOffsetMatchFinder;
pub use sequences::{
analyze_for_rle, encode_sequences_fse, encode_sequences_fse_with_encoded, encode_sequences_rle,
encode_sequences_with_custom_tables, EncodedSequence, RleSuitability,
};
pub use speculative::{SpeculativeCompressor, SpeculativeStrategy};
use crate::frame::{xxhash64, ZSTD_MAGIC};
use crate::{CustomFseTables, CustomHuffmanTable};
use haagenti_core::{CompressionLevel, Result};
/// Match-finder implementation selected from the compression level in
/// `CompressContext::new`.
///
/// An enum (rather than a trait object) so dispatch in `find_matches`
/// stays static.
#[derive(Debug)]
enum MatchFinderVariant {
    /// Greedy search; chosen for `CompressionLevel::None`.
    Greedy(MatchFinder),
    /// Lazy matching; chosen for Fast/Default/Custom levels.
    Lazy(LazyMatchFinder),
    /// Repeat-offset-aware search; chosen for Best/Ultra levels.
    RepeatAware(RepeatOffsetMatchFinder),
}
/// Reusable compression context: owns the level-selected match finder, a
/// scratch arena recycled across `compress` calls, and optional
/// caller-provided entropy tables.
pub struct CompressContext {
    // Kept for Debug output and tests; the level's effect on behavior is
    // baked into `match_finder` at construction time.
    #[allow(dead_code)] level: CompressionLevel,
    // Match finder chosen from the level in `CompressContext::new`.
    match_finder: MatchFinderVariant,
    // Dictionary ID set via `set_dictionary_id`.
    // NOTE(review): stored and shown in Debug, but `write_frame_header`
    // never emits it into the frame — confirm this is intentional.
    dictionary_id: Option<u32>,
    // Scratch allocator; reset at the start of every compression.
    arena: Arena,
    // Optional custom FSE tables used for sequence encoding.
    custom_tables: Option<CustomFseTables>,
    // Optional custom Huffman table used for literal encoding.
    custom_huffman: Option<CustomHuffmanTable>,
}
impl core::fmt::Debug for CompressContext {
    /// Manual `Debug`: reports arena usage numbers and table *presence*
    /// rather than dumping table contents.
    fn fmt(&self, formatter: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        let mut builder = formatter.debug_struct("CompressContext");
        builder.field("level", &self.level);
        builder.field("match_finder", &self.match_finder);
        builder.field("dictionary_id", &self.dictionary_id);
        builder.field("arena_usage", &self.arena.usage());
        builder.field("arena_capacity", &self.arena.capacity());
        builder.field("has_custom_tables", &self.custom_tables.is_some());
        builder.field("has_custom_huffman", &self.custom_huffman.is_some());
        builder.finish()
    }
}
impl CompressContext {
    /// Creates a context tuned for `level`.
    ///
    /// The level picks both the match-search depth and the finder variant:
    /// greedy for `None`, repeat-offset-aware for `Best`/`Ultra`, lazy for
    /// everything else.
    pub fn new(level: CompressionLevel) -> Self {
        let search_depth = match level {
            CompressionLevel::None => 1,
            CompressionLevel::Fast => 8,
            CompressionLevel::Default => 24,
            CompressionLevel::Best => 48,
            CompressionLevel::Ultra => 128,
            // Cap custom depths so pathological values stay bounded.
            CompressionLevel::Custom(n) => n.min(255) as usize,
        };
        let match_finder = match level {
            CompressionLevel::None => MatchFinderVariant::Greedy(MatchFinder::new(search_depth)),
            CompressionLevel::Best | CompressionLevel::Ultra => {
                MatchFinderVariant::RepeatAware(RepeatOffsetMatchFinder::new(search_depth))
            }
            _ => MatchFinderVariant::Lazy(LazyMatchFinder::new(search_depth)),
        };
        Self {
            level,
            match_finder,
            dictionary_id: None,
            arena: Arena::with_default_size(),
            custom_tables: None,
            custom_huffman: None,
        }
    }

    /// Like [`CompressContext::new`], but with caller-provided FSE tables
    /// for sequence encoding.
    pub fn with_custom_tables(level: CompressionLevel, custom_tables: CustomFseTables) -> Self {
        let mut ctx = Self::new(level);
        ctx.custom_tables = Some(custom_tables);
        ctx
    }

    /// Like [`CompressContext::new`], but with optional custom FSE and/or
    /// Huffman tables.
    pub fn with_options(
        level: CompressionLevel,
        custom_tables: Option<CustomFseTables>,
        custom_huffman: Option<CustomHuffmanTable>,
    ) -> Self {
        let mut ctx = Self::new(level);
        ctx.custom_tables = custom_tables;
        ctx.custom_huffman = custom_huffman;
        ctx
    }

    /// Like [`CompressContext::new`], but with an explicit scratch-arena size.
    pub fn with_arena_size(level: CompressionLevel, arena_size: usize) -> Self {
        let mut ctx = Self::new(level);
        ctx.arena = Arena::new(arena_size);
        ctx
    }

    /// Custom FSE tables, if any were configured.
    pub fn custom_tables(&self) -> Option<&CustomFseTables> {
        self.custom_tables.as_ref()
    }

    /// Custom Huffman table, if one was configured.
    pub fn custom_huffman(&self) -> Option<&CustomHuffmanTable> {
        self.custom_huffman.as_ref()
    }

    /// Peak scratch-arena usage observed so far.
    pub fn arena_peak_usage(&self) -> usize {
        self.arena.peak_usage()
    }

    /// Records the dictionary ID for this context.
    ///
    /// NOTE(review): the ID is stored (and shown in Debug output) but
    /// `write_frame_header` does not currently emit it — confirm intended.
    pub fn set_dictionary_id(&mut self, dict_id: u32) {
        self.dictionary_id = Some(dict_id);
    }

    /// Runs the level-selected match finder over `input`.
    fn find_matches(&mut self, input: &[u8]) -> Vec<Match> {
        match &mut self.match_finder {
            MatchFinderVariant::Greedy(mf) => mf.find_matches(input),
            MatchFinderVariant::Lazy(mf) => mf.find_matches_auto(input),
            MatchFinderVariant::RepeatAware(mf) => mf.find_matches(input),
        }
    }

    /// Compresses `input` into a complete frame without a content checksum.
    pub fn compress(&mut self, input: &[u8]) -> Result<Vec<u8>> {
        self.compress_internal(input, false)
    }

    /// Compresses `input` and appends the 4-byte content checksum.
    #[allow(dead_code)]
    pub fn compress_with_checksum(&mut self, input: &[u8]) -> Result<Vec<u8>> {
        self.compress_internal(input, true)
    }

    /// Compresses via the speculative compressor with default settings.
    #[allow(dead_code)]
    pub fn compress_speculative(&self, input: &[u8]) -> Result<Vec<u8>> {
        let compressor = speculative::SpeculativeCompressor::new();
        compressor.compress(input)
    }

    /// Compresses via the speculative compressor tuned for speed.
    #[allow(dead_code)]
    pub fn compress_speculative_fast(&self, input: &[u8]) -> Result<Vec<u8>> {
        let compressor = speculative::SpeculativeCompressor::fast();
        compressor.compress(input)
    }

    /// Compresses via the speculative compressor tuned for ratio.
    #[allow(dead_code)]
    pub fn compress_speculative_best(&self, input: &[u8]) -> Result<Vec<u8>> {
        let compressor = speculative::SpeculativeCompressor::best();
        compressor.compress(input)
    }

    /// Shared pipeline: magic number, frame header, blocks, then the
    /// optional trailing checksum (low 32 bits of XXH64 over the input).
    fn compress_internal(&mut self, input: &[u8], include_checksum: bool) -> Result<Vec<u8>> {
        // Recycle scratch memory from previous calls instead of reallocating.
        self.arena.reset();
        let mut output = Vec::with_capacity(input.len() + 32);
        output.extend_from_slice(&ZSTD_MAGIC.to_le_bytes());
        self.write_frame_header(input.len(), include_checksum, &mut output)?;
        self.encode_blocks(input, &mut output)?;
        if include_checksum {
            // `as u32` keeps exactly the low 32 bits of the 64-bit hash.
            let checksum = xxhash64(input, 0) as u32;
            output.extend_from_slice(&checksum.to_le_bytes());
        }
        Ok(output)
    }

    /// Read-only access to the scratch arena (diagnostics/tests).
    pub fn arena(&self) -> &Arena {
        &self.arena
    }

    /// Writes the frame header: descriptor byte, window descriptor, and the
    /// optional frame-content-size (FCS) field.
    ///
    /// The FCS is omitted for small frames, stored as 4 bytes up to
    /// `u32::MAX`, and as 8 bytes beyond that. The 8-byte case fixes a
    /// silent `as u32` truncation the old code performed on >4 GiB inputs.
    fn write_frame_header(
        &self,
        content_size: usize,
        include_checksum: bool,
        output: &mut Vec<u8>,
    ) -> Result<()> {
        let checksum_flag = if include_checksum { 0x04 } else { 0x00 };
        // ceil(log2(content_size)) via integer bit tricks (no float round
        // trip), clamped to the supported window range [19, 30].
        let window_log = if content_size <= 1 {
            19u8
        } else {
            let log = (usize::BITS - (content_size - 1).leading_zeros()) as u8;
            log.clamp(19, 30)
        };
        // FCS_Field_Size codes in the descriptor's top bits:
        // 0b00 => absent, 0b10 (0x80) => 4 bytes, 0b11 (0xC0) => 8 bytes.
        let (fcs_size, descriptor) = if content_size > u32::MAX as usize {
            (8, 0xC0u8 | checksum_flag)
        } else if content_size > 131071 {
            (4, 0x80u8 | checksum_flag)
        } else {
            (0, checksum_flag)
        };
        output.push(descriptor);
        // Window descriptor stores the exponent (window_log - 10) in the
        // upper 5 bits; mantissa bits stay zero.
        let window_descriptor = (window_log - 10) << 3;
        output.push(window_descriptor);
        match fcs_size {
            8 => output.extend_from_slice(&(content_size as u64).to_le_bytes()),
            4 => output.extend_from_slice(&(content_size as u32).to_le_bytes()),
            _ => {}
        }
        Ok(())
    }

    /// Splits `input` into blocks of at most 128 KiB - 1 bytes and encodes
    /// each. An empty input emits a single empty raw block marked last.
    fn encode_blocks(&mut self, input: &[u8], output: &mut Vec<u8>) -> Result<()> {
        const MAX_BLOCK_SIZE: usize = 128 * 1024 - 1;
        if input.is_empty() {
            // Block header: last=1, type=raw(0), size=0.
            let header = 1u32;
            output.push((header & 0xFF) as u8);
            output.push(((header >> 8) & 0xFF) as u8);
            output.push(((header >> 16) & 0xFF) as u8);
            return Ok(());
        }
        let mut pos = 0;
        while pos < input.len() {
            let remaining = input.len() - pos;
            let block_size = remaining.min(MAX_BLOCK_SIZE);
            let is_last = pos + block_size >= input.len();
            let block_data = &input[pos..pos + block_size];
            self.encode_single_block(block_data, is_last, output)?;
            pos += block_size;
        }
        Ok(())
    }

    /// Encodes one block: a 3-byte little-endian header (1-bit last flag,
    /// 2-bit type, 21-bit size) followed by the payload.
    fn encode_single_block(
        &mut self,
        input: &[u8],
        is_last: bool,
        output: &mut Vec<u8>,
    ) -> Result<()> {
        let (block_type, encoded) = self.select_block_encoding(input)?;
        let mut header = if is_last { 1u32 } else { 0u32 };
        header |= (block_type as u32) << 1;
        // Raw/RLE headers carry the regenerated size; compressed headers
        // carry the on-wire payload size.
        let size = match block_type {
            0 | 1 => input.len(),
            2 => encoded.len(),
            _ => unreachable!("block type is always 0 (raw), 1 (RLE) or 2 (compressed)"),
        };
        header |= (size as u32) << 3;
        output.push((header & 0xFF) as u8);
        output.push(((header >> 8) & 0xFF) as u8);
        output.push(((header >> 16) & 0xFF) as u8);
        output.extend_from_slice(&encoded);
        Ok(())
    }

    /// Picks a block encoding (0 = raw, 1 = RLE, 2 = compressed) and returns
    /// the payload. Blocks >= 4 KiB take a cheaper fast path that skips the
    /// full fingerprint analysis.
    fn select_block_encoding(&mut self, input: &[u8]) -> Result<(u8, Vec<u8>)> {
        if input.len() >= 4096 {
            return self.select_block_encoding_fast(input);
        }
        let fingerprint = analysis::CompressibilityFingerprint::analyze(input);
        if fingerprint.strategy == CompressionStrategy::RleBlock {
            return Ok((1, vec![input[0]]));
        }
        let matches = self.find_matches(input);
        let mut fingerprint = fingerprint;
        fingerprint.refine_with_matches(input, &matches);
        match fingerprint.strategy {
            CompressionStrategy::RleBlock => Ok((1, vec![input[0]])),
            CompressionStrategy::RawBlock => {
                let compressed = self.encode_literals_only(input)?;
                // Only use the compressed form if it actually saves bytes.
                if compressed.len() < input.len() {
                    Ok((2, compressed))
                } else {
                    Ok((0, input.to_vec()))
                }
            }
            CompressionStrategy::RleSequences | CompressionStrategy::PredefinedFse => {
                let compressed = self.encode_with_rle_sequences(input, &matches)?;
                if compressed.len() < input.len() {
                    Ok((2, compressed))
                } else {
                    Ok((0, input.to_vec()))
                }
            }
        }
    }

    /// Fast-path encoding selection for large blocks.
    ///
    /// BUG FIX: the old code emitted an RLE block after checking only the
    /// first 64 bytes plus 31 sampled positions. An RLE block asserts that
    /// EVERY byte equals `first`, so a block passing the samples but
    /// differing elsewhere was silently corrupted. The sampled probe is kept
    /// as a cheap early-out, but a full scan now gates the RLE decision.
    fn select_block_encoding_fast(&mut self, input: &[u8]) -> Result<(u8, Vec<u8>)> {
        let first = input[0];
        let is_uniform = input.len() >= 64
            && input[..64].iter().all(|&b| b == first)
            && {
                // Cheap sampled pre-check: bail out early on clearly
                // non-uniform data before paying for the full scan.
                let step = input.len() / 32;
                (1..32).all(|i| input.get(i * step).copied() == Some(first))
            }
            // Full verification — required for a sound RLE block.
            && input[64..].iter().all(|&b| b == first);
        if is_uniform {
            return Ok((1, vec![first]));
        }
        let matches = self.find_matches(input);
        let compressed = self.encode_with_rle_sequences(input, &matches)?;
        if compressed.len() < input.len() {
            Ok((2, compressed))
        } else {
            Ok((0, input.to_vec()))
        }
    }

    /// Encodes literals plus the sequences derived from `matches`, using the
    /// custom FSE tables when configured.
    fn encode_with_rle_sequences(&mut self, input: &[u8], matches: &[Match]) -> Result<Vec<u8>> {
        let mut output = Vec::with_capacity(input.len());
        let (literals, seqs) = block::matches_to_sequences(input, matches);
        if seqs.is_empty() {
            // No usable matches: the whole block becomes literals with an
            // empty sequence section.
            self.encode_literals_with_options(input, &mut output)?;
            block::encode_sequences(&[], &mut output)?;
        } else {
            let suitability = sequences::analyze_for_rle(&seqs);
            self.encode_literals_with_options(&literals, &mut output)?;
            if let Some(custom_tables) = &self.custom_tables {
                sequences::encode_sequences_with_custom_tables(
                    &suitability.encoded,
                    custom_tables,
                    &mut output,
                )?;
            } else {
                sequences::encode_sequences_fse_with_encoded(&suitability.encoded, &mut output)?;
            }
        }
        Ok(output)
    }

    /// Encodes literals with the custom Huffman encoder when configured,
    /// otherwise with the default literal encoder.
    fn encode_literals_with_options(&self, literals: &[u8], output: &mut Vec<u8>) -> Result<()> {
        if let Some(custom_huffman) = &self.custom_huffman {
            block::encode_literals_with_encoder(literals, custom_huffman.encoder(), output)
        } else {
            block::encode_literals(literals, output)
        }
    }

    /// Encodes the entire input as literals plus an empty sequence section.
    fn encode_literals_only(&mut self, input: &[u8]) -> Result<Vec<u8>> {
        let mut output = Vec::with_capacity(input.len());
        self.encode_literals_with_options(input, &mut output)?;
        block::encode_sequences(&[], &mut output)?;
        Ok(output)
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    // The context records the level it was constructed with.
    #[test]
    fn test_compress_context_creation() {
        let ctx = CompressContext::new(CompressionLevel::Default);
        assert_eq!(ctx.level, CompressionLevel::Default);
    }

    // Empty input still produces a valid frame: 4-byte magic + >=2-byte
    // header + 3-byte empty block header.
    #[test]
    fn test_compress_empty() {
        let mut ctx = CompressContext::new(CompressionLevel::Fast);
        let result = ctx.compress(&[]).unwrap();
        assert!(
            result.len() >= 4 + 2 + 3,
            "expected at least 9 bytes, got {}",
            result.len()
        );
        // Little-endian zstd magic number.
        assert_eq!(&result[0..4], &[0x28, 0xB5, 0x2F, 0xFD]);
    }

    // A tiny input compresses without error and starts with the magic.
    #[test]
    fn test_compress_small() {
        let mut ctx = CompressContext::new(CompressionLevel::Fast);
        let input = b"Hello, World!";
        let result = ctx.compress(input).unwrap();
        assert_eq!(&result[0..4], &[0x28, 0xB5, 0x2F, 0xFD]);
    }

    // A run of one repeated byte should be detected and shrink.
    #[test]
    fn test_compress_rle_detection() {
        let mut ctx = CompressContext::new(CompressionLevel::Fast);
        let input = vec![b'A'; 100];
        let result = ctx.compress(&input).unwrap();
        assert!(result.len() < input.len());
    }

    // Every listed level should compress without error.
    #[test]
    fn test_compression_levels() {
        for level in [
            CompressionLevel::Fast,
            CompressionLevel::Default,
            CompressionLevel::Best,
        ] {
            let mut ctx = CompressContext::new(level);
            let input = b"Test compression with different levels";
            let result = ctx.compress(input);
            assert!(result.is_ok());
        }
    }

    // Repeated English text should reach at least a 2.5x ratio.
    #[test]
    fn test_text_compression_ratio_baseline() {
        let data = b"The quick brown fox jumps over the lazy dog. ".repeat(1000);
        let mut ctx = CompressContext::new(CompressionLevel::Default);
        let compressed = ctx.compress(&data).unwrap();
        let ratio = data.len() as f64 / compressed.len() as f64;
        assert!(
            ratio >= 2.5,
            "Text compression ratio {:.2}x below baseline 2.5x",
            ratio
        );
    }

    // Highly repetitive data should compress at least 50x.
    #[test]
    fn test_repetitive_data_compression_ratio() {
        let data = b"abcdabcdabcdabcd".repeat(5000);
        let mut ctx = CompressContext::new(CompressionLevel::Default);
        let compressed = ctx.compress(&data).unwrap();
        let ratio = data.len() as f64 / compressed.len() as f64;
        assert!(
            ratio >= 50.0,
            "Repetitive data ratio {:.2}x below expected 50x",
            ratio
        );
    }

    // Higher levels should not produce larger output than lower levels
    // (Default is allowed a 50-byte slack versus Fast).
    #[test]
    fn test_compression_levels_ratio_ordering() {
        let data = b"This is test data that will be compressed at different levels. ".repeat(500);
        let mut fast_ctx = CompressContext::new(CompressionLevel::Fast);
        let mut default_ctx = CompressContext::new(CompressionLevel::Default);
        let mut best_ctx = CompressContext::new(CompressionLevel::Best);
        let fast = fast_ctx.compress(&data).unwrap();
        let default = default_ctx.compress(&data).unwrap();
        let best = best_ctx.compress(&data).unwrap();
        assert!(
            best.len() <= default.len(),
            "Best ({}) should be <= Default ({})",
            best.len(),
            default.len()
        );
        assert!(
            default.len() <= fast.len() + 50,
            "Default ({}) should be close to or better than Fast ({})",
            default.len(),
            fast.len()
        );
    }

    // Compression time should scale roughly linearly with input size
    // (100x more data must not take more than ~200x longer).
    #[test]
    fn test_adaptive_search_depth_performance() {
        let small_data = vec![b'a'; 1000];
        let large_data = vec![b'a'; 100_000];
        let mut ctx_small = CompressContext::new(CompressionLevel::Fast);
        let mut ctx_large = CompressContext::new(CompressionLevel::Fast);
        let start = std::time::Instant::now();
        let _ = ctx_small.compress(&small_data).unwrap();
        let small_time = start.elapsed();
        let start = std::time::Instant::now();
        let _ = ctx_large.compress(&large_data).unwrap();
        let large_time = start.elapsed();
        // .max(1) guards against a zero-duration division.
        let time_ratio = large_time.as_nanos() as f64 / small_time.as_nanos().max(1) as f64;
        assert!(
            time_ratio < 200.0,
            "Time ratio {:.1}x exceeds expected linear scaling",
            time_ratio
        );
    }

    // The lazy finder must produce well-formed matches on repetitive input.
    #[test]
    fn test_lazy_match_finder_produces_matches() {
        let data = b"abcdefabcdefabcdef".repeat(100);
        let mut lazy_mf = LazyMatchFinder::new(24);
        let matches = lazy_mf.find_matches(&data);
        assert!(
            !matches.is_empty(),
            "Lazy match finder should find matches in repetitive data"
        );
        for m in &matches {
            assert!(m.offset > 0, "Match offset should be positive");
            assert!(m.length >= 3, "Match length should be at least 3");
        }
    }

    // A pattern repeated 10 KB apart should still be matched.
    #[test]
    fn test_long_distance_pattern_detection() {
        let pattern = b"This is a distinctive pattern.";
        let mut data = Vec::new();
        data.extend_from_slice(pattern);
        data.extend_from_slice(&vec![b'x'; 10_000]);
        data.extend_from_slice(pattern);
        let mut ctx = CompressContext::new(CompressionLevel::Default);
        let compressed = ctx.compress(&data).unwrap();
        assert!(
            compressed.len() < data.len() - 20,
            "Should find long-distance pattern match"
        );
    }

    // All-zero input (minimal entropy) should compress >1000x.
    #[test]
    fn test_entropy_based_strategy_selection() {
        let low_entropy = vec![0u8; 100_000];
        let mut ctx = CompressContext::new(CompressionLevel::Fast);
        let compressed = ctx.compress(&low_entropy).unwrap();
        let ratio = low_entropy.len() as f64 / compressed.len() as f64;
        assert!(
            ratio > 1000.0,
            "Low entropy ratio {:.0}x should be >1000x",
            ratio
        );
    }

    // Compressible text around an incompressible middle should still shrink
    // overall.
    #[test]
    fn test_mixed_content_compression() {
        let mut data = Vec::new();
        data.extend_from_slice(b"Compressible repeated text. ".repeat(50).as_slice());
        data.extend_from_slice(&(0u8..=255u8).cycle().take(1000).collect::<Vec<u8>>());
        data.extend_from_slice(b"More compressible text here. ".repeat(50).as_slice());
        let mut ctx = CompressContext::new(CompressionLevel::Default);
        let compressed = ctx.compress(&data).unwrap();
        assert!(
            compressed.len() < data.len(),
            "Mixed content should still compress"
        );
    }

    // The speculative path should succeed and actually compress.
    #[test]
    fn test_speculative_compression_available() {
        let data = b"Test data for speculative compression. ".repeat(100);
        let ctx = CompressContext::new(CompressionLevel::Default);
        let result = ctx.compress_speculative(&data);
        assert!(result.is_ok(), "Speculative compression should work");
        assert!(
            result.unwrap().len() < data.len(),
            "Should produce compression"
        );
    }

    // The content checksum adds exactly 4 trailing bytes.
    #[test]
    fn test_compression_with_checksum() {
        let data = b"Data to compress with checksum verification.".repeat(100);
        let mut ctx = CompressContext::new(CompressionLevel::Default);
        let with_checksum = ctx.compress_with_checksum(&data).unwrap();
        let without_checksum = ctx.compress(&data).unwrap();
        assert_eq!(
            with_checksum.len(),
            without_checksum.len() + 4,
            "Checksum adds 4 bytes"
        );
    }

    // Repeated compressions reuse the same arena without losing capacity.
    #[test]
    fn test_arena_reuse_efficiency() {
        let mut ctx = CompressContext::new(CompressionLevel::Default);
        for i in 0..5 {
            let data = format!("Test arena reuse iteration {}. ", i).repeat(1000);
            let result = ctx.compress(data.as_bytes()).unwrap();
            assert!(!result.is_empty());
        }
        let capacity = ctx.arena().capacity();
        assert!(capacity > 0, "Arena should have capacity");
    }

    // Input larger than one 128 KiB block must split and still compress.
    #[test]
    fn test_block_size_handling() {
        let large_data = vec![b'A'; 150_000];
        let mut ctx = CompressContext::new(CompressionLevel::Fast);
        let compressed = ctx.compress(&large_data).unwrap();
        assert!(!compressed.is_empty());
        assert!(
            compressed.len() < large_data.len(),
            "Large RLE data should compress well"
        );
    }

    // Greedy (None) and repeat-aware (Best) variants both work, with Best
    // never worse than None.
    #[test]
    fn test_match_finder_variant_selection() {
        let data = b"test pattern for matching variants".repeat(100);
        let mut none_ctx = CompressContext::new(CompressionLevel::None);
        let none_result = none_ctx.compress(&data);
        assert!(none_result.is_ok());
        let mut best_ctx = CompressContext::new(CompressionLevel::Best);
        let best_result = best_ctx.compress(&data);
        assert!(best_result.is_ok());
        assert!(
            best_result.unwrap().len() <= none_result.unwrap().len(),
            "Best should be at least as good as None"
        );
    }

    // The repeat-offset finder must find matches in periodic data.
    #[test]
    fn test_repeat_offset_match_finder() {
        let data = b"abcdefghijklmnopabcdefghijklmnopabcdefghijklmnop".repeat(50);
        let mut mf = RepeatOffsetMatchFinder::new(48);
        let matches = mf.find_matches(&data);
        assert!(
            !matches.is_empty(),
            "RepeatAware should find matches in repetitive data"
        );
    }

    // The fingerprint should classify uniform and text-like inputs sanely.
    #[test]
    fn test_compression_fingerprint_accuracy() {
        let uniform_data = vec![42u8; 1000];
        let fp_uniform = analysis::CompressibilityFingerprint::analyze(&uniform_data);
        assert_eq!(
            fp_uniform.pattern,
            PatternType::Uniform,
            "Should detect uniform pattern"
        );
        let text_data = b"The quick brown fox. ".repeat(100);
        let fp_text = analysis::CompressibilityFingerprint::analyze(&text_data);
        assert!(
            matches!(
                fp_text.pattern,
                PatternType::TextLike | PatternType::Periodic { .. } | PatternType::LowEntropy
            ),
            "Should detect text-like or periodic pattern, got {:?}",
            fp_text.pattern
        );
    }

    // The fast entropy estimate should separate constant from varied data.
    #[test]
    fn test_fast_entropy_estimate() {
        let low_entropy = vec![0u8; 10000];
        let est_low = fast_entropy_estimate(&low_entropy);
        assert!(est_low < 1.0, "Low entropy data should have low estimate");
        let high_entropy: Vec<u8> = (0..10000).map(|i| (i % 256) as u8).collect();
        let est_high = fast_entropy_estimate(&high_entropy);
        assert!(
            est_high > 5.0,
            "High entropy data should have high estimate: {}",
            est_high
        );
    }

    // The fast predictor should pick RLE for uniform data and a reasonable
    // type for text.
    #[test]
    fn test_fast_predict_block_type() {
        let zeros = vec![0u8; 1000];
        assert_eq!(
            fast_predict_block_type(&zeros),
            FastBlockType::Rle,
            "Uniform data should predict RLE"
        );
        let text = b"Hello world! ".repeat(100);
        let predicted = fast_predict_block_type(&text);
        assert!(
            predicted == FastBlockType::Compress || predicted == FastBlockType::Raw,
            "Text should predict Compress or Raw"
        );
    }
}