#![cfg_attr(test, allow(unused_imports))]
#![cfg_attr(test, allow(unused_variables))]
#![cfg_attr(test, allow(unused_mut))]
#![cfg_attr(test, allow(clippy::int_plus_one))]
#![cfg_attr(test, allow(clippy::precedence))]
#![cfg_attr(test, allow(clippy::unnecessary_unwrap))]
#![cfg_attr(test, allow(clippy::slow_vector_initialization))]
#![cfg_attr(test, allow(clippy::manual_repeat_n))]
#![cfg_attr(test, allow(clippy::len_zero))]
#![cfg_attr(test, allow(clippy::manual_range_contains))]
#![cfg_attr(test, allow(clippy::identity_op))]
#![cfg_attr(test, allow(clippy::needless_range_loop))]
#![cfg_attr(test, allow(clippy::assertions_on_constants))]
#![cfg_attr(test, allow(clippy::same_item_push))]
#![cfg_attr(test, allow(clippy::if_same_then_else))]
#![cfg_attr(test, allow(clippy::expect_fun_call))]
#![cfg_attr(test, allow(clippy::redundant_slicing))]
#![cfg_attr(test, allow(clippy::collapsible_else_if))]
#![cfg_attr(test, allow(clippy::redundant_closure))]
#![cfg_attr(test, allow(clippy::manual_div_ceil))]
#![cfg_attr(test, allow(clippy::useless_vec))]
pub mod block;
pub mod compress;
pub mod decompress;
pub mod dictionary;
pub mod frame;
pub mod fse;
pub mod huffman;
#[cfg(test)]
mod perf_tests;
pub use dictionary::{ZstdDictCompressor, ZstdDictDecompressor, ZstdDictionary};
use haagenti_core::{
Algorithm, Codec, CompressionLevel, CompressionStats, Compressor, Decompressor, Error, Result,
};
/// Zstandard frame magic number (`0xFD2FB528`). Frames in this crate's tests
/// start with its little-endian byte sequence `28 B5 2F FD`.
pub const ZSTD_MAGIC: u32 = 0xFD2FB528;
/// Upper window-size constant: 2^27 bytes (128 MiB).
pub const MAX_WINDOW_SIZE: usize = 1 << 27;
/// Lower window-size constant: 2^10 bytes (1 KiB).
pub const MIN_WINDOW_SIZE: usize = 1 << 10;
use fse::FseTable;
use huffman::HuffmanEncoder;
use std::sync::Arc;
/// Shareable handle to a pre-built [`HuffmanEncoder`].
///
/// The encoder is stored behind an [`Arc`], so cloning the table only bumps a
/// reference count instead of copying the encoder.
#[derive(Debug, Clone)]
pub struct CustomHuffmanTable {
    encoder: Arc<HuffmanEncoder>,
}

impl CustomHuffmanTable {
    /// Takes ownership of `encoder` and wraps it in a reference-counted table.
    pub fn new(encoder: HuffmanEncoder) -> Self {
        let encoder = Arc::new(encoder);
        Self { encoder }
    }

    /// Borrows the wrapped encoder.
    pub fn encoder(&self) -> &HuffmanEncoder {
        &*self.encoder
    }
}
/// Optional caller-supplied FSE tables, one slot per table kind.
///
/// Every slot defaults to `None`. The `ll`/`of`/`ml` prefixes presumably stand
/// for literal-length, offset and match-length (TODO confirm against `fse`).
#[derive(Debug, Clone, Default)]
pub struct CustomFseTables {
    pub ll_table: Option<Arc<FseTable>>,
    pub of_table: Option<Arc<FseTable>>,
    pub ml_table: Option<Arc<FseTable>>,
}

impl CustomFseTables {
    /// Creates a set with no table installed.
    pub fn new() -> Self {
        Self {
            ll_table: None,
            of_table: None,
            ml_table: None,
        }
    }

    /// Returns `self` with the `ll_table` slot replaced by `table`.
    pub fn with_ll_table(self, table: FseTable) -> Self {
        Self {
            ll_table: Some(Arc::new(table)),
            ..self
        }
    }

    /// Returns `self` with the `of_table` slot replaced by `table`.
    pub fn with_of_table(self, table: FseTable) -> Self {
        Self {
            of_table: Some(Arc::new(table)),
            ..self
        }
    }

    /// Returns `self` with the `ml_table` slot replaced by `table`.
    pub fn with_ml_table(self, table: FseTable) -> Self {
        Self {
            ml_table: Some(Arc::new(table)),
            ..self
        }
    }

    /// Reports whether at least one of the three slots is populated.
    pub fn has_custom_tables(&self) -> bool {
        [&self.ll_table, &self.of_table, &self.ml_table]
            .iter()
            .any(|slot| slot.is_some())
    }
}
/// Zstandard compressor configured with a level and optional custom entropy
/// tables.
#[derive(Debug, Clone)]
pub struct ZstdCompressor {
    level: CompressionLevel,
    custom_tables: Option<CustomFseTables>,
    custom_huffman: Option<CustomHuffmanTable>,
}

impl ZstdCompressor {
    /// Compressor at `CompressionLevel::Default` with no custom tables.
    pub fn new() -> Self {
        Self::with_all_options(CompressionLevel::Default, None, None)
    }

    /// Compressor at `level` with no custom tables.
    pub fn with_level(level: CompressionLevel) -> Self {
        Self::with_all_options(level, None, None)
    }

    /// Default-level compressor carrying `custom_tables`.
    pub fn with_custom_tables(custom_tables: CustomFseTables) -> Self {
        Self::with_all_options(CompressionLevel::Default, Some(custom_tables), None)
    }

    /// Default-level compressor carrying `custom_huffman`.
    pub fn with_custom_huffman(custom_huffman: CustomHuffmanTable) -> Self {
        Self::with_all_options(CompressionLevel::Default, None, Some(custom_huffman))
    }

    /// Compressor at `level` carrying `custom_tables`.
    pub fn with_level_and_tables(level: CompressionLevel, custom_tables: CustomFseTables) -> Self {
        Self::with_all_options(level, Some(custom_tables), None)
    }

    /// Fully explicit constructor; every other constructor funnels into it.
    pub fn with_all_options(
        level: CompressionLevel,
        custom_tables: Option<CustomFseTables>,
        custom_huffman: Option<CustomHuffmanTable>,
    ) -> Self {
        Self {
            level,
            custom_tables,
            custom_huffman,
        }
    }

    /// The custom FSE tables, if any were supplied.
    pub fn custom_tables(&self) -> Option<&CustomFseTables> {
        self.custom_tables.as_ref()
    }

    /// The custom Huffman table, if one was supplied.
    pub fn custom_huffman(&self) -> Option<&CustomHuffmanTable> {
        self.custom_huffman.as_ref()
    }
}

impl Default for ZstdCompressor {
    /// Same configuration as [`ZstdCompressor::new`].
    fn default() -> Self {
        Self::new()
    }
}
impl Compressor for ZstdCompressor {
fn algorithm(&self) -> Algorithm {
Algorithm::Zstd
}
fn level(&self) -> CompressionLevel {
self.level
}
fn compress(&self, input: &[u8]) -> Result<Vec<u8>> {
let mut ctx = compress::CompressContext::with_options(
self.level,
self.custom_tables.clone(),
self.custom_huffman.clone(),
);
ctx.compress(input)
}
fn compress_to(&self, input: &[u8], output: &mut [u8]) -> Result<usize> {
let compressed = self.compress(input)?;
if compressed.len() > output.len() {
return Err(Error::buffer_too_small(output.len(), compressed.len()));
}
output[..compressed.len()].copy_from_slice(&compressed);
Ok(compressed.len())
}
fn max_compressed_size(&self, input_len: usize) -> usize {
input_len + (input_len >> 7) + 512
}
fn stats(&self) -> Option<CompressionStats> {
None
}
}
/// Stateless Zstandard decompressor.
#[derive(Debug, Clone, Default)]
pub struct ZstdDecompressor;

impl ZstdDecompressor {
    /// Creates a decompressor; the type carries no configuration.
    pub fn new() -> Self {
        ZstdDecompressor
    }
}
impl Decompressor for ZstdDecompressor {
fn algorithm(&self) -> Algorithm {
Algorithm::Zstd
}
fn decompress(&self, input: &[u8]) -> Result<Vec<u8>> {
decompress::decompress_frame(input)
}
fn decompress_to(&self, input: &[u8], output: &mut [u8]) -> Result<usize> {
let result = self.decompress(input)?;
if result.len() > output.len() {
return Err(Error::buffer_too_small(output.len(), result.len()));
}
output[..result.len()].copy_from_slice(&result);
Ok(result.len())
}
fn stats(&self) -> Option<CompressionStats> {
None
}
}
/// Combined compressor/decompressor that only remembers a compression level.
#[derive(Debug, Clone)]
pub struct ZstdCodec {
    level: CompressionLevel,
}

impl ZstdCodec {
    /// Codec at `CompressionLevel::Default`.
    pub fn new() -> Self {
        Self::with_level(CompressionLevel::Default)
    }

    /// Codec at the given `level`.
    pub fn with_level(level: CompressionLevel) -> Self {
        Self { level }
    }
}

impl Default for ZstdCodec {
    /// Same configuration as [`ZstdCodec::new`].
    fn default() -> Self {
        Self {
            level: CompressionLevel::Default,
        }
    }
}
impl Compressor for ZstdCodec {
    /// Always reports [`Algorithm::Zstd`].
    fn algorithm(&self) -> Algorithm {
        Algorithm::Zstd
    }

    /// Returns the level this codec was configured with.
    fn level(&self) -> CompressionLevel {
        self.level
    }

    /// Compresses `input` with a temporary [`ZstdCompressor`] at this codec's
    /// level.
    fn compress(&self, input: &[u8]) -> Result<Vec<u8>> {
        ZstdCompressor::with_level(self.level).compress(input)
    }

    /// Compresses `input` into `output` with a temporary [`ZstdCompressor`]
    /// at this codec's level; returns the number of bytes written.
    fn compress_to(&self, input: &[u8], output: &mut [u8]) -> Result<usize> {
        ZstdCompressor::with_level(self.level).compress_to(input, output)
    }

    /// Size bound reported by a [`ZstdCompressor`] at this codec's level.
    ///
    /// The compressor is built from `self.level`, like `compress` and
    /// `compress_to`, so the bound always describes the compressor that
    /// actually runs.
    fn max_compressed_size(&self, input_len: usize) -> usize {
        ZstdCompressor::with_level(self.level).max_compressed_size(input_len)
    }

    /// No statistics are collected; always `None`.
    fn stats(&self) -> Option<CompressionStats> {
        None
    }
}
impl Decompressor for ZstdCodec {
    /// Always reports [`Algorithm::Zstd`].
    fn algorithm(&self) -> Algorithm {
        Algorithm::Zstd
    }

    /// Decodes `input` by delegating to a [`ZstdDecompressor`].
    fn decompress(&self, input: &[u8]) -> Result<Vec<u8>> {
        let delegate = ZstdDecompressor::new();
        delegate.decompress(input)
    }

    /// Decodes `input` into `output` by delegating to a [`ZstdDecompressor`].
    fn decompress_to(&self, input: &[u8], output: &mut [u8]) -> Result<usize> {
        let delegate = ZstdDecompressor::new();
        delegate.decompress_to(input, output)
    }

    /// No statistics are collected; always `None`.
    fn stats(&self) -> Option<CompressionStats> {
        None
    }
}
impl Codec for ZstdCodec {
fn new() -> Self {
ZstdCodec::new()
}
fn with_level(level: CompressionLevel) -> Self {
ZstdCodec::with_level(level)
}
}
#[cfg(test)]
mod tests {
use super::*;
/// The exported magic constant has the expected numeric value.
#[test]
fn test_magic_number() {
    const EXPECTED_MAGIC: u32 = 0xFD2FB528;
    assert_eq!(ZSTD_MAGIC, EXPECTED_MAGIC);
}
/// Input without the magic, and input with the magic but no usable frame
/// after it, are both rejected.
#[test]
fn test_decompressor_validates_magic() {
    let decoder = ZstdDecompressor::new();

    // Five zero bytes: does not start with the frame magic.
    let no_magic = [0x00u8; 5];
    assert!(decoder.decompress(&no_magic).is_err());

    // Correct magic followed by a single byte.
    let magic_then_one_byte = [0x28u8, 0xB5, 0x2F, 0xFD, 0x00];
    assert!(decoder.decompress(&magic_then_one_byte).is_err());
}
/// Two bytes are not enough to hold even the magic number.
#[test]
fn test_too_short_input() {
    let half_magic = [0x28u8, 0xB5];
    let outcome = ZstdDecompressor::new().decompress(&half_magic);
    assert!(outcome.is_err());
}
/// Compressing a tiny input succeeds and the output starts with the magic
/// bytes.
#[test]
fn test_compressor_works() {
    let outcome = ZstdCompressor::new().compress(b"test");
    assert!(outcome.is_ok());
    let frame = outcome.unwrap();
    assert_eq!(&frame[..4], &[0x28, 0xB5, 0x2F, 0xFD]);
}
/// The size bound exceeds the input length but stays below 110% for a 1 MB
/// input.
#[test]
fn test_max_compressed_size() {
    let compressor = ZstdCompressor::new();

    let small_bound = compressor.max_compressed_size(100);
    assert!(small_bound > 100);

    let large_bound = compressor.max_compressed_size(1_000_000);
    assert!(large_bound > 1_000_000);
    assert!(large_bound < 1_100_000);
}
/// Both trait views of the codec report the Zstd algorithm.
#[test]
fn test_codec_algorithm() {
    let codec = ZstdCodec::new();
    let via_compressor = <ZstdCodec as Compressor>::algorithm(&codec);
    let via_decompressor = <ZstdCodec as Decompressor>::algorithm(&codec);
    assert_eq!(via_compressor, Algorithm::Zstd);
    assert_eq!(via_decompressor, Algorithm::Zstd);
}
/// `with_level` stores the requested level and `level()` reads it back.
#[test]
fn test_compression_levels() {
    let levels = [
        CompressionLevel::Fast,
        CompressionLevel::Default,
        CompressionLevel::Best,
    ];
    for &level in levels.iter() {
        let compressor = ZstdCompressor::with_level(level);
        assert_eq!(compressor.level(), level);
    }
}
/// A hand-built frame with one raw block decodes to its payload.
#[test]
fn test_decompressor_raw_block() {
    // magic | descriptor 0x20 | content size 5 | block header 0x29 (last, raw, 5 bytes)
    let mut frame: Vec<u8> = vec![0x28, 0xB5, 0x2F, 0xFD, 0x20, 5, 0x29, 0x00, 0x00];
    frame.extend_from_slice(b"Hello");

    let decoded = ZstdDecompressor::new().decompress(&frame).unwrap();
    assert_eq!(decoded, b"Hello");
}
/// A hand-built frame with one RLE block expands to ten copies of the byte.
#[test]
fn test_decompressor_rle_block() {
    // magic | descriptor 0x20 | content size 10 | block header 0x53 (last, RLE, 10) | byte
    let frame: Vec<u8> = vec![
        0x28, 0xB5, 0x2F, 0xFD, 0x20, 10, 0x53, 0x00, 0x00, b'X',
    ];

    let decoded = ZstdDecompressor::new().decompress(&frame).unwrap();
    assert_eq!(decoded, vec![b'X'; 10]);
}
/// Two raw blocks in one frame are concatenated in order.
#[test]
fn test_decompressor_multi_block() {
    // magic | descriptor 0x20 | content size 8
    let mut frame: Vec<u8> = vec![0x28, 0xB5, 0x2F, 0xFD, 0x20, 8];

    // (3-byte block header, payload): a non-last 5-byte block, then a last 3-byte block.
    let blocks = [
        ([0x28u8, 0x00, 0x00], &b"Hello"[..]),
        ([0x19u8, 0x00, 0x00], &b"!!!"[..]),
    ];
    for (header, payload) in blocks.iter() {
        frame.extend_from_slice(header);
        frame.extend_from_slice(payload);
    }

    let decoded = ZstdDecompressor::new().decompress(&frame).unwrap();
    assert_eq!(decoded, b"Hello!!!");
}
/// A frame carrying a matching 32-bit content checksum decodes successfully.
#[test]
fn test_decompressor_with_checksum() {
    use crate::frame::xxhash64;

    let payload = b"Hello";
    // magic | descriptor 0x24 (checksum flag set) | content size 5 | block header 0x29
    let mut frame: Vec<u8> = vec![0x28, 0xB5, 0x2F, 0xFD, 0x24, 5, 0x29, 0x00, 0x00];
    frame.extend_from_slice(payload);

    // The trailer is the low 32 bits of the seed-0 hash, little-endian.
    let digest = xxhash64(payload, 0);
    let trailer = (digest & 0xFFFFFFFF) as u32;
    frame.extend_from_slice(&trailer.to_le_bytes());

    let decoded = ZstdDecompressor::new().decompress(&frame).unwrap();
    assert_eq!(decoded, b"Hello");
}
/// `decompress_to` writes the decoded bytes to the front of a larger buffer.
#[test]
fn test_decompress_to() {
    let mut frame: Vec<u8> = vec![0x28, 0xB5, 0x2F, 0xFD, 0x20, 5, 0x29, 0x00, 0x00];
    frame.extend_from_slice(b"Hello");

    let mut destination = vec![0u8; 10];
    let written = ZstdDecompressor::new()
        .decompress_to(&frame, &mut destination)
        .unwrap();

    assert_eq!(written, 5);
    assert_eq!(&destination[..5], b"Hello");
}
/// `decompress_to` fails when the destination is shorter than the decoded
/// data.
#[test]
fn test_decompress_to_buffer_too_small() {
    let mut frame: Vec<u8> = vec![0x28, 0xB5, 0x2F, 0xFD, 0x20, 5, 0x29, 0x00, 0x00];
    frame.extend_from_slice(b"Hello");

    // Two bytes cannot hold the five decoded bytes.
    let mut destination = vec![0u8; 2];
    let outcome = ZstdDecompressor::new().decompress_to(&frame, &mut destination);
    assert!(outcome.is_err());
}
/// Assembles a Zstandard frame from raw (`block_type == 0`) and RLE
/// (`block_type == 1`) blocks, for use as decoder test input.
///
/// * `content_size` - optional Frame_Content_Size; the narrowest field that
///   can hold the value is chosen (1, 2, 4 or 8 bytes).
/// * `has_checksum` - when set, the low 32 bits of `xxhash64` (seed 0) over
///   the decoded content are appended little-endian.
/// * `blocks` - `(is_last, block_type, data)` triples. For an RLE block,
///   `data` is the *expanded* run: its first byte is the repeated byte and
///   its length is the run length.
///
/// A Window_Descriptor byte (`0x00`) is written whenever the single-segment
/// flag is clear, including when `content_size` is `None`; the frame format
/// requires it in that case.
///
/// # Panics
/// Panics if an RLE block has empty `data`, or if a block is too large for
/// the 21-bit block-size field.
fn build_frame(
    content_size: Option<u64>,
    has_checksum: bool,
    blocks: Vec<(bool, u8, Vec<u8>)>,
) -> Vec<u8> {
    const RLE_BLOCK: u8 = 1;
    const SINGLE_SEGMENT_FLAG: u8 = 0x20;
    const MAX_BLOCK_SIZE: usize = (1 << 21) - 1;

    let mut frame: Vec<u8> = vec![0x28, 0xB5, 0x2F, 0xFD];

    // Frame header descriptor plus the already-encoded content-size field.
    let mut descriptor = if has_checksum { 0x04u8 } else { 0x00 };
    let fcs_field: Vec<u8> = match content_size {
        None => Vec::new(),
        Some(size) if size <= 255 => {
            descriptor |= SINGLE_SEGMENT_FLAG;
            vec![size as u8]
        }
        Some(size) if size <= 65791 => {
            descriptor |= 0x40;
            // The 2-byte field stores the size minus 256.
            ((size - 256) as u16).to_le_bytes().to_vec()
        }
        Some(size) if size <= 0xFFFF_FFFF => {
            descriptor |= 0x80;
            (size as u32).to_le_bytes().to_vec()
        }
        Some(size) => {
            descriptor |= 0xC0;
            size.to_le_bytes().to_vec()
        }
    };
    frame.push(descriptor);
    if descriptor & SINGLE_SEGMENT_FLAG == 0 {
        frame.push(0x00);
    }
    frame.extend_from_slice(&fcs_field);

    // Blocks; `decoded` mirrors what a decoder should produce so the checksum
    // can be computed over it.
    let mut decoded: Vec<u8> = Vec::new();
    for (is_last, block_type, data) in blocks {
        let block_size = data.len();
        assert!(
            block_size <= MAX_BLOCK_SIZE,
            "block of {} bytes does not fit the 21-bit block size field",
            block_size
        );

        let header =
            u32::from(is_last) | (u32::from(block_type) << 1) | ((block_size as u32) << 3);
        frame.extend_from_slice(&header.to_le_bytes()[..3]);

        if block_type == RLE_BLOCK {
            let byte = *data
                .first()
                .expect("RLE block data must contain the byte to repeat");
            frame.push(byte);
            decoded.resize(decoded.len() + block_size, byte);
        } else {
            frame.extend_from_slice(&data);
            decoded.extend_from_slice(&data);
        }
    }

    if has_checksum {
        let digest = crate::frame::xxhash64(&decoded, 0);
        let trailer = (digest & 0xFFFF_FFFF) as u32;
        frame.extend_from_slice(&trailer.to_le_bytes());
    }
    frame
}
/// A frame whose only block is an empty raw block decodes to nothing.
#[test]
fn test_integration_empty_frame() {
    let only_block = (true, 0, vec![]);
    let frame = build_frame(Some(0), false, vec![only_block]);

    let decoded = ZstdDecompressor::new().decompress(&frame).unwrap();
    assert!(decoded.is_empty());
}
/// Three raw blocks plus a checksum decode to the concatenated text.
#[test]
fn test_integration_multiple_raw_blocks() {
    // (is_last, payload) for each raw block, in frame order.
    let pieces = [
        (false, &b"Hello"[..]),
        (false, &b", "[..]),
        (true, &b"World!!!"[..]),
    ];
    let blocks = pieces
        .iter()
        .map(|&(is_last, payload)| (is_last, 0u8, payload.to_vec()))
        .collect();
    let frame = build_frame(Some(15), true, blocks);

    let decoded = ZstdDecompressor::new().decompress(&frame).unwrap();
    assert_eq!(decoded, b"Hello, World!!!");
}
/// Raw, RLE and raw blocks in one checksummed frame decode to "Start---End".
#[test]
fn test_integration_mixed_raw_rle() {
    // Packs `size << 3 | type << 1 | last` into the 3-byte little-endian header.
    let block_header = |size: u32, block_type: u32, last: u32| -> [u8; 3] {
        let bits = (size << 3) | (block_type << 1) | last;
        let le = bits.to_le_bytes();
        [le[0], le[1], le[2]]
    };

    // magic | descriptor 0x24 | content size 11
    let mut frame: Vec<u8> = vec![0x28, 0xB5, 0x2F, 0xFD, 0x24, 11];

    frame.extend_from_slice(&block_header(5, 0, 0));
    frame.extend_from_slice(b"Start");

    frame.extend_from_slice(&block_header(3, 1, 0));
    frame.push(b'-');

    frame.extend_from_slice(&block_header(3, 0, 1));
    frame.extend_from_slice(b"End");

    let expected = b"Start---End";
    let digest = crate::frame::xxhash64(expected, 0);
    let trailer = (digest & 0xFFFFFFFF) as u32;
    frame.extend_from_slice(&trailer.to_le_bytes());

    let decoded = ZstdDecompressor::new().decompress(&frame).unwrap();
    assert_eq!(decoded, b"Start---End");
}
/// A single RLE block of length 200 expands to 200 identical bytes.
#[test]
fn test_integration_large_rle() {
    // Block header: size 200, type 1 (RLE), last-block bit set.
    let header: u32 = (200 << 3) | (1 << 1) | 1;

    let mut frame: Vec<u8> = vec![0x28, 0xB5, 0x2F, 0xFD, 0x20, 200];
    frame.extend_from_slice(&header.to_le_bytes()[..3]);
    frame.push(b'X');

    let decoded = ZstdDecompressor::new().decompress(&frame).unwrap();
    assert_eq!(decoded.len(), 200);
    assert!(decoded.iter().all(|&byte| byte == b'X'));
}
/// A 300-byte raw block with a 2-byte content-size field (stored as
/// size - 256) decodes intact.
#[test]
fn test_integration_two_byte_fcs() {
    let size = 300usize;
    // 0, 1, ..., 255, 0, 1, ... truncated to `size` bytes.
    let data: Vec<u8> = (0u8..=255).cycle().take(size).collect();

    // magic | descriptor 0x40 | window descriptor 0x00
    let mut frame: Vec<u8> = vec![0x28, 0xB5, 0x2F, 0xFD, 0x40, 0x00];
    let stored_size = (size - 256) as u16;
    frame.extend_from_slice(&stored_size.to_le_bytes());

    let header = ((size as u32) << 3) | 1;
    frame.extend_from_slice(&header.to_le_bytes()[..3]);
    frame.extend_from_slice(&data);

    let decoded = ZstdDecompressor::new().decompress(&frame).unwrap();
    assert_eq!(decoded.len(), size);
    assert_eq!(decoded, data);
}
/// All 256 byte values survive a raw block unchanged.
#[test]
fn test_integration_binary_data() {
    let data: Vec<u8> = (0..=255).collect();

    // magic | descriptor 0x40 | window descriptor 0x00 | 2-byte size field 0 (256 - 256)
    let mut frame: Vec<u8> = vec![0x28, 0xB5, 0x2F, 0xFD, 0x40, 0x00];
    frame.extend_from_slice(&0u16.to_le_bytes());

    let header: u32 = (256 << 3) | 1;
    frame.extend_from_slice(&header.to_le_bytes()[..3]);
    frame.extend_from_slice(&data);

    let decoded = ZstdDecompressor::new().decompress(&frame).unwrap();
    assert_eq!(decoded, data);
}
/// A frame with a correct checksum trailer is accepted.
#[test]
fn test_integration_checksum_verification() {
    let data = b"Test data for checksum verification!";

    let mut frame: Vec<u8> = vec![0x28, 0xB5, 0x2F, 0xFD, 0x24, data.len() as u8];
    let header = ((data.len() as u32) << 3) | 1;
    frame.extend_from_slice(&header.to_le_bytes()[..3]);
    frame.extend_from_slice(data);

    let digest = crate::frame::xxhash64(data, 0);
    let trailer = (digest & 0xFFFFFFFF) as u32;
    frame.extend_from_slice(&trailer.to_le_bytes());

    let decoded = ZstdDecompressor::new().decompress(&frame).unwrap();
    assert_eq!(decoded, data);
}
/// A frame whose checksum trailer is arbitrary bytes is rejected.
#[test]
fn test_integration_invalid_checksum_rejected() {
    let data = b"Test data";

    let mut frame: Vec<u8> = vec![0x28, 0xB5, 0x2F, 0xFD, 0x24, data.len() as u8];
    let header = ((data.len() as u32) << 3) | 1;
    frame.extend_from_slice(&header.to_le_bytes()[..3]);
    frame.extend_from_slice(data);

    // Trailer that is not the hash of `data`.
    let bogus_trailer = [0xDE, 0xAD, 0xBE, 0xEF];
    frame.extend_from_slice(&bogus_trailer);

    let outcome = ZstdDecompressor::new().decompress(&frame);
    assert!(outcome.is_err());
}
/// A frame declaring 100 content bytes but carrying 5 is rejected.
#[test]
fn test_integration_content_size_mismatch_rejected() {
    let data = b"Short";

    // The header claims 100 bytes; the single block only holds `data`.
    let mut frame: Vec<u8> = vec![0x28, 0xB5, 0x2F, 0xFD, 0x20, 100];
    let header = ((data.len() as u32) << 3) | 1;
    frame.extend_from_slice(&header.to_le_bytes()[..3]);
    frame.extend_from_slice(data);

    let outcome = ZstdDecompressor::new().decompress(&frame);
    assert!(outcome.is_err());
}
/// Empty input round-trips to empty output.
#[test]
fn test_roundtrip_empty() {
    let original: &[u8] = &[];
    let packed = ZstdCompressor::new().compress(original).unwrap();
    let restored = ZstdDecompressor::new().decompress(&packed).unwrap();
    assert_eq!(restored, original);
}
/// A short ASCII string round-trips unchanged.
#[test]
fn test_roundtrip_small() {
    let original = b"Hello, World!";
    let packed = ZstdCompressor::new().compress(original).unwrap();
    let restored = ZstdDecompressor::new().decompress(&packed).unwrap();
    assert_eq!(restored, original);
}
/// A run of 100 identical bytes round-trips and shrinks.
#[test]
fn test_roundtrip_rle() {
    let original = vec![b'A'; 100];
    let packed = ZstdCompressor::new().compress(&original).unwrap();
    let restored = ZstdDecompressor::new().decompress(&packed).unwrap();
    assert_eq!(restored, original);
    assert!(packed.len() < original.len());
}
/// The byte sequence 0..=255 round-trips unchanged.
#[test]
fn test_roundtrip_binary() {
    let original: Vec<u8> = (0..=255).collect();
    let packed = ZstdCompressor::new().compress(&original).unwrap();
    let restored = ZstdDecompressor::new().decompress(&packed).unwrap();
    assert_eq!(restored, original);
}
/// Ten repetitions of a 16-byte pattern round-trip unchanged.
#[test]
fn test_roundtrip_repeated_pattern() {
    let original = b"0123456789ABCDEF".repeat(10);
    let packed = ZstdCompressor::new().compress(&original).unwrap();
    let restored = ZstdDecompressor::new().decompress(&packed).unwrap();
    assert_eq!(restored, original);
}
/// The same input round-trips at every compression level.
#[test]
fn test_roundtrip_compression_levels() {
    let input = b"Test data for compression level testing. This needs to be long enough to trigger actual compression.";
    let levels = [
        CompressionLevel::None,
        CompressionLevel::Fast,
        CompressionLevel::Default,
        CompressionLevel::Best,
    ];
    let decoder = ZstdDecompressor::new();

    for &level in levels.iter() {
        let packed = ZstdCompressor::with_level(level).compress(input).unwrap();
        let restored = decoder.decompress(&packed).unwrap();
        assert_eq!(restored, input, "Roundtrip failed for level {:?}", level);
    }
}
/// One `ZstdCodec` value can compress and then decompress its own output.
#[test]
fn test_codec_roundtrip() {
    let codec = ZstdCodec::new();
    let original = b"Testing the codec roundtrip functionality";
    let packed = <ZstdCodec as Compressor>::compress(&codec, original).unwrap();
    let restored = <ZstdCodec as Decompressor>::decompress(&codec, &packed).unwrap();
    assert_eq!(restored, original);
}
/// A 4-byte pattern repeated eight times round-trips unchanged.
#[test]
fn test_roundtrip_uniform_pattern() {
    let original = b"abcdabcdabcdabcdabcdabcdabcdabcd";
    let packed = ZstdCompressor::new().compress(original).unwrap();
    let restored = ZstdDecompressor::new().decompress(&packed).unwrap();
    assert_eq!(restored, original);
}
/// Twenty repetitions of a 13-byte phrase round-trip and shrink.
#[test]
fn test_roundtrip_longer_uniform_pattern() {
    let original = b"Hello World! ".repeat(20);
    let packed = ZstdCompressor::new().compress(&original).unwrap();
    let restored = ZstdDecompressor::new().decompress(&packed).unwrap();
    assert_eq!(restored, original);
    assert!(packed.len() < original.len());
}
/// Two long single-byte runs round-trip and compress to under half the
/// input size.
#[test]
fn test_roundtrip_overlapping_matches() {
    let original = b"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb";
    let packed = ZstdCompressor::new().compress(original).unwrap();
    let restored = ZstdDecompressor::new().decompress(&packed).unwrap();
    assert_eq!(restored, original);
    assert!(packed.len() < original.len() / 2);
}
/// A prefix, ten repeats of a pattern and a suffix round-trip unchanged.
#[test]
fn test_roundtrip_mixed_patterns() {
    let mut original = b"prefix_".to_vec();
    original.extend(b"pattern_".repeat(10));
    original.extend_from_slice(b"suffix");

    let packed = ZstdCompressor::new().compress(&original).unwrap();
    let restored = ZstdDecompressor::new().decompress(&packed).unwrap();
    assert_eq!(restored, original);
}
/// Alternating 20-byte runs of 'X' and 'Y' (ten pairs) round-trip unchanged.
#[test]
fn test_roundtrip_single_byte_repeats() {
    let mut pair = vec![b'X'; 20];
    pair.extend_from_slice(&[b'Y'; 20]);
    let original = pair.repeat(10);

    let packed = ZstdCompressor::new().compress(&original).unwrap();
    let restored = ZstdDecompressor::new().decompress(&packed).unwrap();
    assert_eq!(restored, original);
}
/// Patterns of 3 to 8 distinct bytes, each repeated 20 times, round-trip.
#[test]
fn test_roundtrip_various_pattern_lengths() {
    let encoder = ZstdCompressor::new();
    let decoder = ZstdDecompressor::new();

    for pattern_len in 3u8..=8 {
        // 'A', 'B', ... for `pattern_len` letters.
        let pattern: Vec<u8> = (b'A'..b'A' + pattern_len).collect();
        let original = pattern.repeat(20);

        let packed = encoder.compress(&original).unwrap();
        let restored = decoder.decompress(&packed).unwrap();
        assert_eq!(
            restored, original,
            "Failed for pattern length {}",
            pattern_len
        );
    }
}
// Round-trips 1024- and 4096-byte inputs made by cycling nine 16-bit values
// (serialized little-endian), and prints diagnostics about the compressed
// layout along the way. Only the final round-trip comparison asserts; the
// header decoding below is informational.
#[test]
fn test_roundtrip_llm_weights_pattern() {
let compressor = ZstdCompressor::new();
let decompressor = ZstdDecompressor::new();
// Presumably IEEE half-precision bit patterns typical of model weights
// (going by the variable name) - TODO confirm.
let f16_patterns: &[u16] = &[
0x0000, 0x1400, 0x9400, 0x2000, 0xA000, 0x2E00, 0xAE00, 0x3800, 0xB800, ];
for size in [1024, 4096] {
let mut input = Vec::with_capacity(size);
let mut idx = 0;
while input.len() < size {
let val = f16_patterns[idx % f16_patterns.len()];
input.extend_from_slice(&val.to_le_bytes());
idx += 1;
}
// Each step appends two bytes, so trim any overshoot past `size`.
input.truncate(size);
let compressed = compressor.compress(&input).unwrap();
eprintln!(
"Size {}: input={} bytes, compressed={} bytes",
size,
input.len(),
compressed.len()
);
// NOTE(review): assumes the first block's payload starts at byte 11 of the
// frame; this depends on the exact frame-header size the compressor emits
// and panics if the output is shorter - confirm.
let block_data = &compressed[11..]; let lit_byte0 = block_data[0];
let lit_type = lit_byte0 & 0x03;
let size_format = (lit_byte0 >> 2) & 0x03;
eprintln!("Literals: type={}, size_format={}", lit_type, size_format);
// NOTE(review): the decoding below reads a 16-bit `regen` and a 14-bit `comp`
// and reports a 5-byte header. RFC 8878 describes size_format 2 as a 4-byte
// header with two 14-bit sizes and size_format 3 as the 5-byte/18-bit form,
// so these printed values may not match the real header - verify.
if lit_type == 2 && size_format == 2 {
let regen = ((block_data[0] >> 4) as usize)
| ((block_data[1] as usize) << 4)
| (((block_data[2] & 0x0F) as usize) << 12);
let comp = ((block_data[2] >> 4) as usize)
| ((block_data[3] as usize) << 4)
| (((block_data[4] & 0x03) as usize) << 12);
eprintln!(
"Literals header: regen={}, comp={}, header_size=5",
regen, comp
);
eprintln!("Total literals section: {}", 5 + comp);
let weights_header = block_data[5];
eprintln!(
"Huffman weights header byte: {:02x} ({})",
weights_header, weights_header
);
use crate::huffman::HuffmanEncoder;
// Build an encoder straight from the input to print its serialized weights
// next to what was read from the stream.
if let Some(test_encoder) = HuffmanEncoder::build(&input) {
let test_weights = test_encoder.serialize_weights();
eprintln!(
"Encoder produced weights: first 10 bytes = {:02x?}",
&test_weights[..10.min(test_weights.len())]
);
eprintln!("Weights length = {}", test_weights.len());
}
let seq_pos = 5 + comp;
if block_data.len() > seq_pos {
eprintln!("Sequences start byte: {:02x}", block_data[seq_pos]);
}
}
// The actual check: decompression must succeed and reproduce the input.
match decompressor.decompress(&compressed) {
Ok(decompressed) => {
assert_eq!(
decompressed, input,
"LLM weights roundtrip failed for size {}",
size
);
}
Err(e) => {
eprintln!("Decompression failed for size {}: {:?}", size, e);
if compressed.len() > 12 {
eprintln!("Frame header bytes: {:02x?}", &compressed[..12]);
}
panic!("Decompression failed for size {}: {:?}", size, e);
}
}
}
}
/// One hundred repetitions of a 10-byte pattern round-trip unchanged.
#[test]
fn test_roundtrip_large_pattern_block() {
    let original = b"0123456789".repeat(100);
    let packed = ZstdCompressor::new().compress(&original).unwrap();
    let restored = ZstdDecompressor::new().decompress(&packed).unwrap();
    assert_eq!(restored, original);
}
/// A compressor built with an empty `CustomFseTables` still produces frames
/// the standard decompressor can read.
#[test]
fn test_custom_table_in_zstd_frame() {
    let encoder = ZstdCompressor::with_custom_tables(CustomFseTables::new());
    let original = b"ABCDABCDABCDABCD".repeat(100);

    let packed = encoder.compress(&original).unwrap();
    let restored = ZstdDecompressor::new().decompress(&packed).unwrap();
    assert_eq!(restored, original);
}
/// `with_level_and_tables` keeps the requested level and still round-trips.
#[test]
fn test_custom_tables_with_level() {
    let encoder =
        ZstdCompressor::with_level_and_tables(CompressionLevel::Best, CustomFseTables::new());
    let original = b"Test data for custom tables with compression level.".repeat(50);

    let packed = encoder.compress(&original).unwrap();
    let restored = ZstdDecompressor::new().decompress(&packed).unwrap();

    assert_eq!(restored, original);
    assert_eq!(encoder.level(), CompressionLevel::Best);
}
/// A fresh table set is empty; installing one table fills only that slot.
#[test]
fn test_custom_tables_api() {
    let empty = CustomFseTables::new();
    assert!(!empty.has_custom_tables());

    let cached = fse::cached_ll_table().clone();
    let only_ll = CustomFseTables::new().with_ll_table(cached);
    assert!(only_ll.has_custom_tables());
    assert!(only_ll.ll_table.is_some());
    assert!(only_ll.of_table.is_none());
    assert!(only_ll.ml_table.is_none());
}
/// `custom_tables()` is `Some` only when tables were supplied.
#[test]
fn test_compressor_with_custom_tables_getter() {
    let configured = ZstdCompressor::with_custom_tables(CustomFseTables::new());
    assert!(configured.custom_tables().is_some());

    let plain = ZstdCompressor::new();
    assert!(plain.custom_tables().is_none());
}
/// A Huffman table trained on one text compresses a different, similar text
/// and the result still round-trips.
#[test]
fn test_huffman_integration_with_zstd() {
    let training = b"The quick brown fox jumps over the lazy dog. ".repeat(100);
    let trained =
        huffman::HuffmanEncoder::build(&training).expect("Should build Huffman encoder");
    let encoder = ZstdCompressor::with_custom_huffman(CustomHuffmanTable::new(trained));

    let original = b"The lazy fox quickly jumps over the brown dog. ".repeat(50);
    let packed = encoder.compress(&original).unwrap();
    let restored = ZstdDecompressor::new().decompress(&packed).unwrap();
    assert_eq!(restored, original);
}
/// An encoder built from five explicit weights reports five symbols, at most
/// 11 bits per code, and non-empty codes for the weighted symbols checked.
#[test]
fn test_huffman_encoder_from_weights() {
    let mut weights = vec![0u8; 256];
    let assigned = [(b'a', 8u8), (b'b', 7), (b'c', 6), (b'd', 5), (b'e', 4)];
    for &(symbol, weight) in assigned.iter() {
        weights[symbol as usize] = weight;
    }

    let encoder =
        huffman::HuffmanEncoder::from_weights(&weights).expect("Should build from weights");
    assert_eq!(encoder.num_symbols(), 5);
    assert!(encoder.max_bits() <= 11);

    let codes = encoder.get_codes();
    assert!(codes[b'a' as usize].num_bits > 0);
    assert!(codes[b'b' as usize].num_bits > 0);
}
/// The encoder reachable through `CustomHuffmanTable` has a code for a
/// symbol present in the training data.
#[test]
fn test_custom_huffman_api() {
    let training = b"test data for huffman".repeat(100);
    let trained = huffman::HuffmanEncoder::build(&training).expect("Should build encoder");
    let table = CustomHuffmanTable::new(trained);

    let codes = table.encoder().get_codes();
    assert!(codes[b't' as usize].num_bits > 0);
}
/// `with_all_options` keeps both custom table sets and still round-trips.
#[test]
fn test_compressor_with_all_options() {
    let training = b"Sample data for training ".repeat(100);
    let trained = huffman::HuffmanEncoder::build(&training).expect("Should build encoder");

    let encoder = ZstdCompressor::with_all_options(
        CompressionLevel::Default,
        Some(CustomFseTables::new()),
        Some(CustomHuffmanTable::new(trained)),
    );

    let original = b"Sample text for compression testing ".repeat(50);
    let packed = encoder.compress(&original).unwrap();
    let restored = ZstdDecompressor::new().decompress(&packed).unwrap();

    assert_eq!(restored, original);
    assert!(encoder.custom_tables().is_some());
    assert!(encoder.custom_huffman().is_some());
}
/// `custom_huffman()` is `Some` only when a table was supplied.
#[test]
fn test_custom_huffman_getter() {
    let training = b"test".repeat(100);
    let trained = huffman::HuffmanEncoder::build(&training).unwrap();

    let configured = ZstdCompressor::with_custom_huffman(CustomHuffmanTable::new(trained));
    assert!(configured.custom_huffman().is_some());

    let plain = ZstdCompressor::new();
    assert!(plain.custom_huffman().is_none());
}
}
#[cfg(test)]
mod huffman_debug_tests {
use crate::huffman::{build_table_from_weights, parse_huffman_weights, HuffmanEncoder};
/// Produces exactly `size` bytes of pseudo-text by cycling through a fixed
/// word list; the final word is cut short if it would overshoot.
fn generate_text_like_data(size: usize) -> Vec<u8> {
    const WORDS: [&str; 13] = [
        "the ",
        "quick ",
        "brown ",
        "fox ",
        "jumps ",
        "over ",
        "lazy ",
        "dog ",
        "compression ",
        "algorithm ",
        "performance ",
        "benchmark ",
        "testing ",
    ];

    let mut data = Vec::with_capacity(size);
    data.extend(
        WORDS
            .iter()
            .cycle()
            .flat_map(|word| word.bytes())
            .take(size),
    );
    data
}
/// Traces the Huffman weight pipeline on 20 000 bytes of pseudo-text.
///
/// Steps: build an encoder, serialize its weights, parse them back, print
/// weight statistics and a Kraft-style sum, then try to rebuild a decoding
/// table. The test is diagnostic: it prints rather than asserts, and returns
/// early when no encoder can be built.
#[test]
fn test_trace_huffman_weights_text() {
    let data = generate_text_like_data(20000);
    let Some(encoder) = HuffmanEncoder::build(&data) else {
        println!("Encoder returned None - Huffman not suitable for data");
        return;
    };

    let weights = encoder.serialize_weights();
    let header = weights[0];
    println!(
        "Serialized weights: {} bytes, header={}",
        weights.len(),
        header
    );
    // `header - 127` is only meaningful for header bytes of 128 and above
    // (RFC 8878 direct representation); smaller values would underflow the
    // subtraction, so they are reported instead of decoded.
    if header >= 128 {
        let num_symbols = (header - 127) as usize;
        println!("Number of symbols from header: {}", num_symbols);
    } else {
        println!(
            "Header byte {} is below 128: no direct symbol count to report",
            header
        );
    }

    let (parsed_weights, consumed) = parse_huffman_weights(&weights).expect("Should parse");
    println!(
        "Parsed {} weights, consumed {} bytes",
        parsed_weights.len(),
        consumed
    );

    let non_zero: Vec<_> = parsed_weights
        .iter()
        .enumerate()
        .filter(|&(_, &w)| w > 0)
        .map(|(i, &w)| (i as u8 as char, w))
        .collect();
    println!(
        "Non-zero weights ({} total): {:?}",
        non_zero.len(),
        non_zero
    );

    let max_w = *parsed_weights.iter().max().unwrap_or(&0);
    let weight_sum: u64 = parsed_weights
        .iter()
        .filter(|&&w| w > 0)
        .map(|&w| 1u64 << w)
        .sum();
    println!("Max weight: {}, sum(2^w): {}", max_w, weight_sum);
    println!("Expected sum: 2^{} = {}", max_w + 1, 1u64 << (max_w + 1));

    // Histogram of code lengths, where code length = max_w + 1 - weight.
    let mut bl_count = vec![0u32; max_w as usize + 2];
    for &w in &parsed_weights {
        if w > 0 {
            let code_len = (max_w + 1 - w) as usize;
            if code_len < bl_count.len() {
                bl_count[code_len] += 1;
            }
        }
    }

    // Kraft-style sum scaled by 2^max_w; a complete prefix code gives exactly
    // 2^max_w.
    let kraft_sum: u64 = bl_count
        .iter()
        .enumerate()
        .skip(1)
        .filter(|&(len, _)| len <= max_w as usize)
        .map(|(len, &count)| (1u64 << (max_w as usize - len)) * count as u64)
        .sum();
    let expected_kraft = 1u64 << max_w;
    println!(
        "Kraft check: sum={}, expected={} (ratio: {})",
        kraft_sum,
        expected_kraft,
        kraft_sum as f64 / expected_kraft as f64
    );

    // The weights are not needed afterwards, so they are moved rather than
    // cloned.
    let result = build_table_from_weights(parsed_weights);
    println!("Build result: {:?}", result.is_ok());
    if let Err(e) = &result {
        println!("Error: {:?}", e);
    }
}
}
#[cfg(test)]
mod debug_tests {
    use super::*;
    use crate::compress::CompressContext;
    use crate::huffman::HuffmanEncoder;
    use haagenti_core::CompressionLevel;

    /// Produces exactly `size` bytes by cycling through a fixed word list in
    /// order; the last word is cut short when it would overshoot `size`.
    fn generate_text_data(size: usize) -> Vec<u8> {
        const VOCABULARY: &[&str] = &[
            "the ",
            "quick ",
            "brown ",
            "fox ",
            "jumps ",
            "over ",
            "lazy ",
            "dog ",
            "compression ",
            "algorithm ",
            "performance ",
            "benchmark ",
            "testing ",
            "data ",
            "stream ",
            "encode ",
            "decode ",
            "entropy ",
            "symbol ",
            "table ",
        ];
        VOCABULARY
            .iter()
            .cycle()
            .flat_map(|word| word.bytes())
            .take(size)
            .collect()
    }

    /// Prints Huffman statistics and the overall compression ratio for
    /// 102400 bytes of cyclic word text. Nothing is asserted; the test only
    /// fails if compression itself returns an error.
    #[test]
    fn test_trace_100kb_text() {
        let data = generate_text_data(102400);

        // Byte histogram, used only to count the distinct symbols.
        let mut freq = [0u64; 256];
        data.iter().for_each(|&byte| freq[usize::from(byte)] += 1);
        let unique_count = freq.iter().filter(|&&occurrences| occurrences > 0).count();
        println!("100KB text: {} unique symbols", unique_count);

        let encoder = HuffmanEncoder::build(&data);
        println!("Huffman encoder built: {}", encoder.is_some());
        if let Some(enc) = encoder.as_ref() {
            let estimated = enc.estimate_size(&data);
            println!("Estimated size: {} (original: {})", estimated, data.len());
            let payload = enc.encode(&data);
            let table = enc.serialize_weights();
            println!(
                "Actual compressed: {} + {} weights = {}",
                payload.len(),
                table.len(),
                payload.len() + table.len()
            );
        }

        let mut ctx = CompressContext::new(CompressionLevel::Default);
        let frame = ctx.compress(&data).unwrap();
        println!(
            "Full compression: {} -> {} bytes ({:.2}x)",
            data.len(),
            frame.len(),
            data.len() as f64 / frame.len() as f64
        );
    }
}
#[cfg(test)]
mod debug_tests2 {
    use super::*;
    use crate::compress::CompressContext;
    use crate::huffman::HuffmanEncoder;
    use haagenti_core::CompressionLevel;
    use rand::rngs::StdRng;
    use rand::{Rng, SeedableRng};

    /// Produces exactly `size` bytes by appending words drawn from a fixed
    /// list with an `StdRng` seeded with 456; the last word is cut short when
    /// it would overshoot `size`.
    fn generate_text_random(size: usize) -> Vec<u8> {
        const VOCABULARY: &[&str] = &[
            "the ",
            "quick ",
            "brown ",
            "fox ",
            "jumps ",
            "over ",
            "lazy ",
            "dog ",
            "compression ",
            "algorithm ",
            "performance ",
            "benchmark ",
            "testing ",
            "data ",
            "stream ",
            "encode ",
            "decode ",
            "entropy ",
            "symbol ",
            "table ",
        ];
        let mut rng = StdRng::seed_from_u64(456);
        let mut out = Vec::with_capacity(size);
        loop {
            // `out` never grows past `size`, so this cannot underflow.
            let missing = size - out.len();
            if missing == 0 {
                break;
            }
            // One draw per appended word, including the truncated last one.
            let bytes = VOCABULARY[rng.gen_range(0..VOCABULARY.len())].as_bytes();
            out.extend_from_slice(&bytes[..bytes.len().min(missing)]);
        }
        out
    }

    /// Prints symbol statistics, the Huffman size estimate and the overall
    /// compression ratio for 102400 bytes of randomly ordered word text.
    /// Nothing is asserted; the test only fails if compression returns an error.
    #[test]
    fn test_trace_100kb_text_random() {
        let data = generate_text_random(102400);

        let mut freq = [0u64; 256];
        data.iter().for_each(|&byte| freq[usize::from(byte)] += 1);
        let unique_count = freq.iter().filter(|&&occurrences| occurrences > 0).count();
        println!("100KB random text: {} unique symbols", unique_count);

        // Occurring symbols, most frequent first; the stable sort keeps ties
        // in ascending byte order.
        let mut ranked: Vec<(usize, u64)> = freq
            .iter()
            .copied()
            .enumerate()
            .filter(|&(_, occurrences)| occurrences > 0)
            .collect();
        ranked.sort_by_key(|&(_, occurrences)| std::cmp::Reverse(occurrences));
        let top: Vec<(char, u64)> = ranked
            .iter()
            .take(10)
            .map(|&(symbol, occurrences)| ((symbol as u8) as char, occurrences))
            .collect();
        println!("Top frequencies: {:?}", top);

        let encoder = HuffmanEncoder::build(&data);
        println!("Huffman encoder built: {}", encoder.is_some());
        if let Some(enc) = encoder.as_ref() {
            let estimated = enc.estimate_size(&data);
            println!("Estimated size: {} (original: {})", estimated, data.len());
        }

        let mut ctx = CompressContext::new(CompressionLevel::Default);
        let frame = ctx.compress(&data).unwrap();
        println!(
            "Full compression: {} -> {} bytes ({:.2}x)",
            data.len(),
            frame.len(),
            data.len() as f64 / frame.len() as f64
        );
    }
}
#[cfg(test)]
mod large_tests {
    use super::*;

    /// Sentence repeated to build the inputs for the roundtrip tests below.
    const PATTERN: &[u8] = b"The quick brown fox jumps over the lazy dog. ";

    /// Returns exactly `len` bytes of `PATTERN` repeated back to back.
    fn patterned_input(len: usize) -> Vec<u8> {
        PATTERN.iter().copied().cycle().take(len).collect()
    }

    /// Compresses and decompresses `len` patterned bytes with this crate's
    /// codec and asserts the output equals the input.
    fn assert_roundtrip(len: usize) {
        let data = patterned_input(len);
        let compressor = ZstdCompressor::new();
        let compressed = compressor.compress(&data).expect("Compression failed");
        let decompressor = ZstdDecompressor::new();
        let decompressed = decompressor
            .decompress(&compressed)
            .expect("Decompression failed");
        assert_eq!(data.len(), decompressed.len(), "Length mismatch");
        assert_eq!(data, decompressed, "Content mismatch");
    }

    /// Roundtrip of 65536 patterned bytes; currently ignored.
    #[test]
    #[ignore = "Pre-existing bug: checksum mismatch at 65KB+ sizes"]
    fn test_benchmark_text_65kb() {
        assert_roundtrip(65536);
    }

    /// Roundtrip of 16384 patterned bytes.
    #[test]
    fn test_roundtrip_16kb() {
        assert_roundtrip(16384);
    }
}
#[cfg(test)]
mod cross_library_tests {
use super::*;
/// Returns exactly `size` bytes made by repeating a fixed 45-byte sentence,
/// cutting the final repetition short when needed.
fn generate_test_data(size: usize) -> Vec<u8> {
    const SENTENCE: &[u8] = b"The quick brown fox jumps over the lazy dog. ";
    SENTENCE.iter().copied().cycle().take(size).collect()
}
/// Compresses 65536 patterned bytes with this crate and requires the `zstd`
/// crate to decode them back to the original, reporting the first differing
/// byte before panicking on a content mismatch.
#[test]
fn test_haagenti_compress_zstd_decompress_65kb() {
    let data = generate_test_data(65536);
    let compressor = ZstdCompressor::new();
    let compressed = compressor
        .compress(&data)
        .expect("Haagenti compression failed");

    let decompressed = match zstd::decode_all(compressed.as_slice()) {
        Ok(bytes) => bytes,
        Err(e) => {
            println!(
                "Reference zstd failed to decompress haagenti output: {:?}",
                e
            );
            println!("This confirms the bug is in HAAGENTI COMPRESSION");
            panic!("Haagenti compression output is invalid");
        }
    };

    assert_eq!(data.len(), decompressed.len(), "Length mismatch");
    if data == decompressed {
        return;
    }
    let first_diff = data
        .iter()
        .zip(decompressed.iter())
        .position(|(expected, actual)| expected != actual);
    if let Some(i) = first_diff {
        println!(
            "First divergence at byte {}: expected {:02x}, got {:02x}",
            i, data[i], decompressed[i]
        );
    }
    panic!("Content mismatch - haagenti compression produces invalid output for reference zstd");
}
/// Compresses two small inputs (100 and 200 bytes of a simple arithmetic
/// byte sequence) with the `zstd` crate at level 1 and requires this crate's
/// decompressor to reproduce them exactly.
#[test]
fn test_zstd_reference_raw_blocks() {
    for size in [100, 200] {
        let data: Vec<u8> = (0..size).map(|i| ((i * 17 + 31) % 256) as u8).collect();
        let compressed =
            zstd::encode_all(data.as_slice(), 1).expect("Reference zstd compression failed");
        let decompressor = ZstdDecompressor::new();
        // Build the failure message lazily; `expect(&format!(..))` would
        // allocate it on every iteration even when decompression succeeds.
        let decompressed = decompressor
            .decompress(&compressed)
            .unwrap_or_else(|e| panic!("Failed to decompress size {}: {:?}", size, e));
        assert_eq!(data, decompressed, "Size {} content mismatch", size);
    }
}
/// Compresses 65536 patterned bytes with the `zstd` crate at level 3, dumps
/// the start of the frame and its header flags, then requires this crate's
/// decompressor to reproduce the input. Currently ignored.
#[test]
#[ignore = "Pre-existing bug: reference zstd compatibility for compressed blocks"]
fn test_zstd_compress_haagenti_decompress_65kb() {
    let data = generate_test_data(65536);
    let compressed =
        zstd::encode_all(data.as_slice(), 3).expect("Reference zstd compression failed");
    println!("Compressed size: {} bytes", compressed.len());

    // Hex dump of up to 64 leading bytes, 16 per row.
    print!("First 64 bytes: ");
    let preview = &compressed[..compressed.len().min(64)];
    for row in preview.chunks(16) {
        print!("\n ");
        for byte in row {
            print!("{:02x} ", byte);
        }
    }
    println!();

    if let &[m0, m1, m2, m3, ..] = compressed.as_slice() {
        let magic = u32::from_le_bytes([m0, m1, m2, m3]);
        println!("Magic: 0x{:08x} (expected 0xfd2fb528)", magic);
    }
    if let Some(&fhd) = compressed.get(4) {
        println!("Frame header descriptor: 0x{:02x}", fhd);
        println!(" - Checksum flag: {}", (fhd >> 2) & 1);
        println!(" - Single segment flag: {}", (fhd >> 5) & 1);
        println!(" - Dictionary ID flag: {}", fhd & 0x03);
        println!(" - FCS field size: {}", (fhd >> 6) & 0x03);
    }

    let decompressor = ZstdDecompressor::new();
    let decompressed = match decompressor.decompress(&compressed) {
        Ok(bytes) => bytes,
        Err(e) => {
            println!(
                "Haagenti failed to decompress reference zstd output: {:?}",
                e
            );
            println!("This confirms the bug is in HAAGENTI DECOMPRESSION");
            panic!("Haagenti decompression failed on valid zstd data");
        }
    };

    assert_eq!(data.len(), decompressed.len(), "Length mismatch");
    if data == decompressed {
        return;
    }
    let first_diff = data
        .iter()
        .zip(decompressed.iter())
        .position(|(expected, actual)| expected != actual);
    if let Some(i) = first_diff {
        println!(
            "First divergence at byte {}: expected {:02x}, got {:02x}",
            i, data[i], decompressed[i]
        );
    }
    panic!("Content mismatch - haagenti decompression produces incorrect output");
}
/// Roundtrips patterned inputs from 16 KiB to 32 KiB in 1 KiB steps through
/// this crate's codec and prints one status line per size. Only a
/// compression error fails the test; decode problems are merely reported.
#[test]
fn test_find_threshold_size() {
    for size in (16 * 1024..=32 * 1024).step_by(1024) {
        let data = generate_test_data(size);
        let compressor = ZstdCompressor::new();
        let decompressor = ZstdDecompressor::new();
        let compressed = compressor.compress(&data).expect("Compression failed");
        let kib = size / 1024;
        match decompressor.decompress(&compressed) {
            Err(e) => println!("Size {} ({}KB): ERROR - {:?}", size, kib, e),
            Ok(out) if out == data => println!("Size {} ({}KB): OK", size, kib),
            Ok(out) => println!(
                "Size {} ({}KB): CONTENT MISMATCH (len: {} vs {})",
                size,
                kib,
                data.len(),
                out.len()
            ),
        }
    }
}
/// For a handful of input sizes, prints the compressed size produced by this
/// crate and by the `zstd` crate (level 3), plus whether the `zstd` crate can
/// decode this crate's output. Decode failures are reported, not asserted.
#[test]
fn test_analyze_compression_failure() {
    let sizes = [16384, 20000, 24000, 28000, 32768];
    for size in sizes {
        let data = generate_test_data(size);
        let compressor = ZstdCompressor::new();
        let ours = compressor.compress(&data).expect("Compression failed");
        let reference = zstd::encode_all(data.as_slice(), 3).expect("zstd failed");
        // Decoded length on success, the debug-formatted error otherwise.
        let cross_decode: Result<usize, String> = match zstd::decode_all(ours.as_slice()) {
            Ok(bytes) => Ok(bytes.len()),
            Err(e) => Err(format!("{:?}", e)),
        };
        println!(
            "Size {}: haagenti={} bytes, zstd={} bytes, zstd_decode_haagenti={:?}",
            size,
            ours.len(),
            reference.len(),
            cross_decode
        );
    }
}
/// Compresses inputs of several sizes with this crate and prints whether the
/// `zstd` crate decodes each one back to the original. Only a compression
/// error fails the test.
#[test]
fn test_check_block_boundaries() {
    for size in [8192, 16384, 16385, 20000, 24576, 32768, 32769] {
        let data = generate_test_data(size);
        let compressor = ZstdCompressor::new();
        let compressed = compressor.compress(&data).expect("Compression failed");
        let verdict = match zstd::decode_all(compressed.as_slice()) {
            Err(e) => format!("ERROR: {}", e),
            Ok(decoded) if decoded == data => "OK".to_string(),
            Ok(decoded) => format!("MISMATCH (len {})", decoded.len()),
        };
        println!(
            "Size {}: compressed={} bytes, zstd_decode={:?}",
            size,
            compressed.len(),
            verdict
        );
    }
}
/// Compresses 25600 patterned bytes with this crate and prints a manual
/// decode of the frame header and the first block header, followed by the
/// result of decoding the frame with the `zstd` crate.
///
/// Nothing is asserted on the parsed values; the test fails only if
/// compression errors or the frame is too short for the indexed reads.
#[test]
fn test_debug_compression_trace() {
    let size = 25600;
    let data = generate_test_data(size);
    println!("Input size: {} bytes", data.len());
    println!("First 50 bytes: {:?}", &data[..50.min(data.len())]);
    let compressor = ZstdCompressor::new();
    let compressed = compressor.compress(&data).expect("Compression failed");
    println!("Compressed size: {} bytes", compressed.len());
    println!(
        "Compressed header: {:02x?}",
        &compressed[..20.min(compressed.len())]
    );
    let magic = u32::from_le_bytes([compressed[0], compressed[1], compressed[2], compressed[3]]);
    println!("Magic: 0x{:08X} (valid={})", magic, magic == 0xFD2FB528);

    // Frame header descriptor: bit 2 = checksum, bit 5 = single segment,
    // bits 7-6 = Frame_Content_Size flag, bits 1-0 = Dictionary_ID flag.
    let descriptor = compressed[4];
    let has_checksum = (descriptor & 0x04) != 0;
    let single_segment = (descriptor & 0x20) != 0;
    let fcs_size = match descriptor >> 6 {
        0 => {
            if single_segment {
                1
            } else {
                0
            }
        }
        1 => 2,
        2 => 4,
        3 => 8,
        _ => 0,
    };
    // Width of the optional Dictionary_ID field, which sits between the
    // window descriptor and the Frame_Content_Size. Skipping it would
    // misplace every later offset whenever the flag is non-zero.
    let dict_id_size = match descriptor & 0x03 {
        0 => 0,
        1 => 1,
        2 => 2,
        _ => 4,
    };
    println!(
        "Descriptor: 0x{:02X}, checksum={}, single_segment={}, fcs_size={}",
        descriptor, has_checksum, single_segment, fcs_size
    );
    // The window descriptor byte is present only when single_segment is unset.
    let window_descriptor_size = if single_segment { 0 } else { 1 };
    let fcs_start = 5 + window_descriptor_size + dict_id_size;
    let fcs = match fcs_size {
        1 => compressed[fcs_start] as u64,
        // The 2-byte form stores the size minus 256.
        2 => u16::from_le_bytes([compressed[fcs_start], compressed[fcs_start + 1]]) as u64 + 256,
        4 => u32::from_le_bytes([
            compressed[fcs_start],
            compressed[fcs_start + 1],
            compressed[fcs_start + 2],
            compressed[fcs_start + 3],
        ]) as u64,
        8 => u64::from_le_bytes(compressed[fcs_start..fcs_start + 8].try_into().unwrap()),
        _ => 0,
    };
    println!("Frame Content Size: {} (input was {})", fcs, size);

    // 3-byte little-endian block header: bit 0 = last block, bits 2-1 = type,
    // remaining 21 bits = block size.
    let block_start = fcs_start + fcs_size;
    let block_header = u32::from_le_bytes([
        compressed[block_start],
        compressed[block_start + 1],
        compressed[block_start + 2],
        0,
    ]);
    let is_last = (block_header & 1) != 0;
    let block_type = (block_header >> 1) & 3;
    let block_size = (block_header >> 3) as usize;
    let block_type_name = match block_type {
        0 => "Raw",
        1 => "RLE",
        2 => "Compressed",
        _ => "Reserved",
    };
    println!(
        "Block: type={} ({}), size={}, is_last={}",
        block_type, block_type_name, block_size, is_last
    );
    let result = zstd::decode_all(compressed.as_slice());
    println!(
        "Reference zstd decode: {:?}",
        result.as_ref().map(|v| v.len())
    );
}
/// Prints symbol statistics for 25600 patterned bytes and, when a Huffman
/// encoder can be built, checks (by printing) that every occurring symbol
/// received a code and compares the estimated and actual encoded sizes.
/// Nothing is asserted.
#[test]
fn test_debug_huffman_encoding() {
    use crate::huffman::HuffmanEncoder;
    let data = generate_test_data(25600);

    let mut freq = [0u64; 256];
    data.iter().for_each(|&byte| freq[usize::from(byte)] += 1);
    let unique_count = freq.iter().filter(|&&occurrences| occurrences > 0).count();
    println!(
        "Input: {} bytes, {} unique symbols",
        data.len(),
        unique_count
    );

    // Occurring symbols, most frequent first; the stable sort keeps ties in
    // ascending byte order.
    let mut ranked: Vec<(u8, u64)> = freq
        .iter()
        .enumerate()
        .filter(|&(_, &occurrences)| occurrences > 0)
        .map(|(symbol, &occurrences)| (symbol as u8, occurrences))
        .collect();
    ranked.sort_by_key(|&(_, occurrences)| std::cmp::Reverse(occurrences));
    let top: Vec<(char, u64)> = ranked
        .iter()
        .take(15)
        .map(|&(symbol, occurrences)| (symbol as char, occurrences))
        .collect();
    println!("Symbol frequencies (top 15): {:?}", top);

    let Some(encoder) = HuffmanEncoder::build(&data) else {
        println!("Huffman encoder build failed!");
        return;
    };
    println!(
        "Huffman encoder built: max_bits={}, num_symbols={}",
        encoder.max_bits(),
        encoder.num_symbols()
    );

    let codes = encoder.get_codes();
    let mut symbols_with_codes = 0;
    let mut symbols_without_codes = 0;
    for (symbol, code) in codes.iter().enumerate() {
        if freq[symbol] == 0 {
            continue;
        }
        if code.num_bits > 0 {
            symbols_with_codes += 1;
        } else {
            symbols_without_codes += 1;
            println!(
                "WARNING: Symbol {} (freq={}) has no code!",
                symbol, freq[symbol]
            );
        }
    }
    println!(
        "Symbols with codes: {}, without codes: {}",
        symbols_with_codes, symbols_without_codes
    );

    let payload = encoder.encode(&data);
    let table = encoder.serialize_weights();
    let actual_total = payload.len() + table.len();
    println!(
        "Huffman output: {} bytes data + {} bytes weights = {} total",
        payload.len(),
        table.len(),
        actual_total
    );
    let estimated = encoder.estimate_size(&data);
    println!(
        "Estimated: {} bytes, actual: {} bytes",
        estimated, actual_total
    );
}
/// Runs `MatchFinder` (constructed with 16) over 25600 patterned bytes and
/// prints the first matches, the total match coverage, and extra detail when
/// exactly one match was found. Nothing is asserted.
#[test]
fn test_debug_match_finder() {
    use crate::compress::MatchFinder;
    let data = generate_test_data(25600);
    println!("Input size: {} bytes", data.len());
    println!(
        "Pattern: first 45 bytes = {:?}",
        String::from_utf8_lossy(&data[..45])
    );

    let mut mf = MatchFinder::new(16);
    let matches = mf.find_matches(&data);
    println!("Total matches found: {}", matches.len());
    for (index, found) in matches.iter().enumerate().take(10) {
        println!(
            "Match {}: pos={}, offset={}, length={}",
            index, found.position, found.offset, found.length
        );
    }

    let total_match_len: usize = matches.iter().map(|found| found.length).sum();
    let coverage_pct = 100.0 * total_match_len as f64 / data.len() as f64;
    println!(
        "Total match coverage: {} bytes ({:.1}% of input)",
        total_match_len, coverage_pct
    );

    if let [only] = &matches[..] {
        println!("\nSingle match analysis:");
        println!(
            " Position {} to {} (length {})",
            only.position,
            only.position + only.length,
            only.length
        );
        println!(" References data at offset {} back", only.offset);
        println!(
            " Expected decompressed output: literals[0..{}] + match copy",
            only.position
        );
    }
}
/// Compresses 25600 patterned bytes with this crate and prints a manual
/// decode of the first block header and, for compressed blocks with
/// Huffman-compressed literals, the literals section header, followed by a
/// hex dump of the block data. Nothing is asserted on the parsed values.
#[test]
fn test_debug_block_encoding() {
    let size = 25600;
    let data = generate_test_data(size);
    let compressor = ZstdCompressor::new();
    let full_compressed = compressor.compress(&data).unwrap();
    println!("Full frame: {} bytes", full_compressed.len());
    // NOTE(review): the block offset is hard-coded; it presumably matches the
    // frame header this compressor emits for this input size - confirm if the
    // header layout changes.
    let block_start = 8;
    let block_header = u32::from_le_bytes([
        full_compressed[block_start],
        full_compressed[block_start + 1],
        full_compressed[block_start + 2],
        0,
    ]);
    let is_last = (block_header & 1) != 0;
    let btype = (block_header >> 1) & 3;
    let block_size = (block_header >> 3) as usize;
    println!(
        "Block header: type={}, size={}, is_last={}",
        btype, block_size, is_last
    );
    if btype == 2 {
        let lit = &full_compressed[block_start + 3..];
        let lit_header = lit[0];
        let lit_type = lit_header & 0x03;
        let lit_size_format = (lit_header >> 2) & 0x03;
        println!(
            "Literals section: type={}, size_format={}",
            lit_type, lit_size_format
        );
        if lit_type == 2 {
            // The header is a little-endian bit field: 2 bits type, 2 bits
            // Size_Format, then Regenerated_Size and Compressed_Size of equal
            // width. Per the Zstandard format, Size_Format 0 and 1 use 10-bit
            // sizes (3-byte header; 0 = single stream, 1 = 4 streams),
            // 2 uses 14-bit sizes (4 bytes) and 3 uses 18-bit sizes (5 bytes).
            let byte = |i: usize| u32::from(lit[i]);
            match lit_size_format {
                0 | 1 => {
                    let regen = (byte(0) >> 4) | ((byte(1) & 0x3F) << 4);
                    let comp = (byte(1) >> 6) | (byte(2) << 2);
                    let streams = if lit_size_format == 0 {
                        "single stream"
                    } else {
                        "4 streams"
                    };
                    println!(
                        "Size_Format={} ({}, 10-bit sizes): regen={}, comp={}",
                        lit_size_format, streams, regen, comp
                    );
                }
                2 => {
                    let regen = (byte(0) >> 4) | (byte(1) << 4) | ((byte(2) & 0x3) << 12);
                    let comp = (byte(2) >> 2) | (byte(3) << 6);
                    println!(
                        "Size_Format=2 (4 streams, 14-bit sizes): regen={}, comp={}",
                        regen, comp
                    );
                }
                _ => {
                    let regen = (byte(0) >> 4) | (byte(1) << 4) | ((byte(2) & 0x3F) << 12);
                    let comp = (byte(2) >> 6) | (byte(3) << 2) | (byte(4) << 10);
                    println!(
                        "Size_Format=3 (4 streams, 18-bit sizes): regen={}, comp={}",
                        regen, comp
                    );
                }
            }
        }
    }
    println!("\nBlock data (first 60 bytes):");
    let block_data_start = block_start + 3;
    // Stop 4 bytes before the end of the frame (presumably to skip a trailing
    // checksum - confirm). `saturating_sub` and `get` keep a short frame or
    // an inverted range from panicking; such cases just print nothing.
    let frame_limit = full_compressed.len().saturating_sub(4);
    let block_end = (block_data_start + block_size).min(frame_limit);
    let block_bytes = full_compressed
        .get(block_data_start..block_end)
        .unwrap_or_default();
    for (i, chunk) in block_bytes.chunks(20).enumerate() {
        println!(" {:04x}: {:02x?}", i * 20, chunk);
    }
}
/// Encodes two fixed sequences with `encode_sequences_fse`, prints the
/// resulting count byte, mode byte and bitstream, then prints the accuracy
/// log and size of the three predefined FSE tables. An encoding failure is
/// printed rather than asserted; only the table construction can panic.
#[test]
fn test_fse_bitstream_comparison() {
    use crate::block::Sequence;
    use crate::compress::encode_sequences_fse;
    use crate::fse::{
        FseTable, LITERAL_LENGTH_ACCURACY_LOG, LITERAL_LENGTH_DEFAULT_DISTRIBUTION,
    };
    use crate::fse::{MATCH_LENGTH_ACCURACY_LOG, MATCH_LENGTH_DEFAULT_DISTRIBUTION};
    use crate::fse::{OFFSET_ACCURACY_LOG, OFFSET_DEFAULT_DISTRIBUTION};

    let sequences = vec![
        Sequence {
            literal_length: 5,
            match_length: 10,
            offset: 100,
        },
        Sequence {
            literal_length: 3,
            match_length: 8,
            offset: 50,
        },
    ];
    println!("=== FSE Bitstream Comparison Test ===");
    println!("Sequences: {:?}", sequences);

    let mut our_output = Vec::new();
    if let Err(e) = encode_sequences_fse(&sequences, &mut our_output) {
        println!("Our FSE encoding failed: {:?}", e);
    } else {
        println!("\nOur FSE encoding succeeded: {} bytes", our_output.len());
        println!("Output bytes: {:02x?}", our_output);
        if let Some(&seq_count) = our_output.first() {
            println!("Sequence count byte: {}", seq_count);
            if let Some(&mode_byte) = our_output.get(1) {
                println!(
                    "Mode byte: 0x{:02x} (LL={}, OF={}, ML={})",
                    mode_byte,
                    (mode_byte >> 6) & 0x3,
                    (mode_byte >> 4) & 0x3,
                    (mode_byte >> 2) & 0x3
                );
            }
            if our_output.len() > 2 {
                let bitstream = &our_output[2..];
                println!("\nBitstream ({} bytes):", bitstream.len());
                // 16 bytes per row; a line break follows each full row only.
                for row in bitstream.chunks(16) {
                    for byte in row {
                        print!("{:02x} ", byte);
                    }
                    if row.len() == 16 {
                        println!();
                    }
                }
                println!();
            }
        }
    }

    println!("\n=== Decode Table Info ===");
    let ll_table = FseTable::from_predefined(
        &LITERAL_LENGTH_DEFAULT_DISTRIBUTION,
        LITERAL_LENGTH_ACCURACY_LOG,
    )
    .unwrap();
    let of_table =
        FseTable::from_predefined(&OFFSET_DEFAULT_DISTRIBUTION, OFFSET_ACCURACY_LOG).unwrap();
    let ml_table = FseTable::from_predefined(
        &MATCH_LENGTH_DEFAULT_DISTRIBUTION,
        MATCH_LENGTH_ACCURACY_LOG,
    )
    .unwrap();
    let (ll_log, ll_size) = (ll_table.accuracy_log(), ll_table.size());
    println!("LL table: accuracy_log={}, size={}", ll_log, ll_size);
    let (of_log, of_size) = (of_table.accuracy_log(), of_table.size());
    println!("OF table: accuracy_log={}, size={}", of_log, of_size);
    let (ml_log, ml_size) = (ml_table.accuracy_log(), ml_table.size());
    println!("ML table: accuracy_log={}, size={}", ml_log, ml_size);
}
/// Diagnostic walk through a frame produced by the `zstd` crate, compared
/// against this crate's sequence encoder.
///
/// Steps, in order:
/// 1. Build a 70-byte input (50 ascending bytes from 0x30, then the first 20
///    of them again) and compress it with `zstd::encode_all` at level 3.
/// 2. Manually parse and print the frame header, the first block header, the
///    literals section header and the sequences section.
/// 3. Assert that `zstd::decode_all` reproduces the input (the only assertion).
/// 4. Encode one fixed sequence with `encode_sequences_fse` and print it next
///    to a hard-coded reference bitstream; a difference is printed, not asserted.
#[test]
fn test_analyze_reference_sequence_bitstream() {
// Input: bytes 0x30..0x62 followed by a repeat of the first 20 of them.
let mut data = Vec::new();
for i in 0..50u8 {
data.push(i + 0x30); }
for i in 0..20u8 {
data.push(i + 0x30);
}
let data = &data[..];
println!("=== Analyze Reference Sequence Bitstream ===");
println!(
"Input: {:?} ({} bytes)",
String::from_utf8_lossy(data),
data.len()
);
let compressed = zstd::encode_all(&data[..], 3).expect("compress failed");
println!(
"\nReference compressed ({} bytes): {:02x?}",
compressed.len(),
compressed
);
if compressed.len() >= 4 {
let magic =
u32::from_le_bytes([compressed[0], compressed[1], compressed[2], compressed[3]]);
println!("Magic: 0x{:08x}", magic);
}
if compressed.len() > 4 {
// Frame header descriptor: bit 5 = single segment, bits 7-6 = FCS flag.
let fhd = compressed[4];
let single_segment = (fhd >> 5) & 0x1 != 0;
let fcs_field = (fhd >> 6) & 0x3;
// FCS flag 0 means a 1-byte size only when single_segment is set.
let fcs_size = match fcs_field {
0 => {
if single_segment {
1
} else {
0
}
}
1 => 2,
2 => 4,
3 => 8,
_ => 0,
};
// One window descriptor byte is counted when single_segment is unset.
let window_size = if single_segment { 0 } else { 1 };
// NOTE(review): no Dictionary_ID field is accounted for here; this
// offset is only right when the frame carries none - confirm.
let header_end = 5 + window_size + fcs_size;
println!(
"FHD: 0x{:02x}, single_segment={}, fcs_size={}",
fhd, single_segment, fcs_size
);
println!("Header ends at: {}", header_end);
if compressed.len() > header_end + 3 {
// 3-byte little-endian block header: bit 0 = last, bits 2-1 = type,
// remaining bits = block size.
let bh = u32::from_le_bytes([
compressed[header_end],
compressed[header_end + 1],
compressed[header_end + 2],
0,
]);
let last = bh & 1 != 0;
let block_type = (bh >> 1) & 3;
let block_size = (bh >> 3) as usize;
println!("\nBlock at {}:", header_end);
println!(
" Last: {}, Type: {} ({}), Size: {}",
last,
block_type,
match block_type {
0 => "Raw",
1 => "RLE",
2 => "Compressed",
_ => "?",
},
block_size
);
// Only compressed blocks that fit inside the buffer are inspected further.
if block_type == 2 && compressed.len() >= header_end + 3 + block_size {
let block_start = header_end + 3;
let block_data = &compressed[block_start..block_start + block_size];
println!(
"\nBlock content ({} bytes): {:02x?}",
block_data.len(),
block_data
);
if !block_data.is_empty() {
// Literals section header: bits 1-0 = type, bits 3-2 = size format.
let lit_type = block_data[0] & 0x3;
let lit_size_format = (block_data[0] >> 2) & 0x3;
println!(
"\nLiterals type: {} ({})",
lit_type,
match lit_type {
0 => "Raw",
1 => "RLE",
2 => "Compressed",
3 => "Treeless",
_ => "?",
}
);
// Only Raw (0) and RLE (1) headers are decoded; any other type
// yields (0, 0), so the sequence offset below is then 0.
let (lit_regen_size, lit_header_size) = if lit_type == 0 || lit_type == 1 {
match lit_size_format {
// 1-byte header: size in the top 5 bits.
0 | 2 => (((block_data[0] >> 3) & 0x1F) as usize, 1usize),
// 2-byte header: 4 bits from byte 0 plus all of byte 1.
1 => {
let s = ((block_data[0] >> 4) as usize)
| ((block_data[1] as usize) << 4);
(s, 2)
}
// 3-byte header: byte 2 is masked to its low 6 bits here.
3 => {
let s = ((block_data[0] >> 4) as usize)
| ((block_data[1] as usize) << 4)
| (((block_data[2] & 0x3F) as usize) << 12);
(s, 3)
}
_ => (0, 1),
}
} else {
(0, 0)
};
println!(
"Literals regenerated size: {}, header size: {}",
lit_regen_size, lit_header_size
);
// Raw literals occupy `lit_regen_size` bytes after the header, RLE
// literals occupy one byte; other types add nothing here.
let seq_start = lit_header_size
+ if lit_type == 0 {
lit_regen_size
} else {
if lit_type == 1 {
1
} else {
0
}
};
if seq_start < block_data.len() {
println!("\nSequence section at offset {}:", seq_start);
let seq_data = &block_data[seq_start..];
println!(" Sequence data: {:02x?}", seq_data);
if !seq_data.is_empty() {
let seq_count = seq_data[0];
// NOTE(review): this printed count uses the 2-byte formula for every
// first byte >= 128 (including 255) and reads seq_data[1] unchecked.
println!(
" Sequence count byte: {} (count = {})",
seq_data[0],
if seq_count < 128 {
seq_count as usize
} else {
((seq_count as usize - 128) << 8) | seq_data[1] as usize
}
);
// Sequence count is 1, 2 or 3 bytes depending on the first byte.
// `count` itself is not used afterwards; only `header_len` is.
let (count, header_len) = if seq_count < 128 {
(seq_count as usize, 1)
} else if seq_count < 255 {
(((seq_count as usize - 128) << 8) | seq_data[1] as usize, 2)
} else {
// `+` binds tighter than `|`: this is byte1 | ((byte2 << 8) + 0x7F00).
(
seq_data[1] as usize
| ((seq_data[2] as usize) << 8) + 0x7F00,
3,
)
};
if seq_data.len() > header_len {
// Mode byte: two bits each for LL (7-6), OF (5-4) and ML (3-2).
let mode_byte = seq_data[header_len];
println!(
" Mode byte: 0x{:02x} (LL={}, OF={}, ML={})",
mode_byte,
(mode_byte >> 6) & 3,
(mode_byte >> 4) & 3,
(mode_byte >> 2) & 3
);
}
if seq_data.len() > header_len + 1 {
// Everything after the mode byte is printed as the bitstream.
let bitstream = &seq_data[header_len + 1..];
println!(
" FSE Bitstream ({} bytes): {:02x?}",
bitstream.len(),
bitstream
);
}
}
}
}
}
}
}
// The only assertion in this test: the reference codec roundtrips the input.
let decompressed = zstd::decode_all(&compressed[..]).expect("decompress failed");
assert_eq!(&decompressed, data);
println!("\nRoundtrip verified!");
use crate::block::Sequence;
use crate::compress::encode_sequences_fse;
// A single hand-built sequence to run through this crate's encoder.
let sequences = vec![Sequence {
literal_length: 50,
match_length: 20,
offset: 53,
}];
println!("\n=== Our Encoding ===");
println!("Sequence: ll=50, ml=20, offset_value=53 (actual offset 50)");
let mut our_output = Vec::new();
encode_sequences_fse(&sequences, &mut our_output).expect("encode failed");
println!(
"Our sequence section ({} bytes): {:02x?}",
our_output.len(),
our_output
);
if our_output.len() >= 2 {
println!(" Count: {}", our_output[0]);
println!(" Mode: 0x{:02x}", our_output[1]);
if our_output.len() > 2 {
println!(" Bitstream: {:02x?}", &our_output[2..]);
}
}
// Hard-coded expected bitstream; presumably captured from the reference
// encoder's output for this input - TODO confirm.
let ref_bitstream = &[0x52, 0x69, 0x05, 0x05];
// Our bitstream is whatever follows the count and mode bytes.
let our_bitstream = if our_output.len() > 2 {
&our_output[2..]
} else {
&[]
};
println!("\n=== Comparison ===");
println!("Reference: {:02x?}", ref_bitstream);
println!("Ours: {:02x?}", our_bitstream);
// A difference is only reported; `decode_bitstream_bits` is a helper
// defined outside this function.
if ref_bitstream == our_bitstream {
println!("BITSTREAMS MATCH!");
} else {
println!("BITSTREAMS DIFFER!");
decode_bitstream_bits("Reference", ref_bitstream);
decode_bitstream_bits("Ours", our_bitstream);
}
}
/// Compresses 100 bytes of repeating "ABCD" with this crate, printing the
/// matches and encoded sequences along the way, and requires the `zstd`
/// crate to decode the frame back to the input. On a reference decode error
/// it prints extra diagnostics (including this crate's own decode result)
/// before panicking.
#[test]
fn test_reference_decodes_our_fse() {
    use haagenti_core::{Compressor, Decompressor};
    let data: Vec<u8> = b"ABCD".iter().cycle().take(100).copied().collect();
    println!("=== Test Reference Decodes Our FSE ===");
    println!("Input: {} bytes", data.len());

    let mut mf = crate::compress::LazyMatchFinder::new(16);
    let matches = mf.find_matches(&data);
    println!("Matches found: {}", matches.len());
    for (index, found) in matches.iter().enumerate() {
        println!(
            " Match[{}]: pos={}, len={}, offset={}",
            index, found.position, found.length, found.offset
        );
    }

    let (_literals, seqs) = crate::compress::block::matches_to_sequences(&data, &matches);
    println!("Sequences: {}", seqs.len());
    for (index, seq) in seqs.iter().enumerate() {
        println!(
            " Seq[{}]: ll={}, offset={}, ml={}",
            index, seq.literal_length, seq.offset, seq.match_length
        );
        let enc = crate::compress::EncodedSequence::from_sequence(seq);
        println!(
            " Encoded: ll_code={}, of_code={}, ml_code={}",
            enc.ll_code, enc.of_code, enc.ml_code
        );
        println!(
            " Extra: ll_bits={}, of_extra={}, ml_extra={}",
            enc.ll_bits, enc.of_extra, enc.ml_extra
        );
    }

    let compressor = ZstdCompressor::new();
    let compressed = compressor.compress(&data).expect("our compress failed");
    println!("Compressed: {} bytes", compressed.len());
    println!("Bytes: {:02x?}", compressed);

    let decoded = match zstd::decode_all(&compressed[..]) {
        Ok(bytes) => bytes,
        Err(e) => {
            println!("FAILED: Reference zstd could not decode: {:?}", e);
            if compressed.len() >= 4 {
                let mut magic_bytes = [0u8; 4];
                magic_bytes.copy_from_slice(&compressed[..4]);
                println!("Magic: 0x{:08x}", u32::from_le_bytes(magic_bytes));
            }
            if compressed.len() > 4 {
                println!("FHD: 0x{:02x}", compressed[4]);
            }
            // Check whether this crate's own decoder accepts the frame.
            let decompressor = ZstdDecompressor::new();
            match decompressor.decompress(&compressed) {
                Err(e2) => println!("Our decoder also failed: {:?}", e2),
                Ok(own) => {
                    println!("Our decoder succeeded: {} bytes", own.len());
                    if own == data {
                        println!("Our roundtrip works, issue is reference compatibility");
                    }
                }
            }
            panic!("Reference zstd failed to decode our output");
        }
    };

    println!("Reference zstd decoded: {} bytes", decoded.len());
    if decoded == data {
        println!("SUCCESS! Reference zstd correctly decoded our output!");
    } else {
        println!("MISMATCH! Decoded data differs from original");
        println!("Expected: {:?}", data);
        println!("Got: {:?}", decoded);
    }
    assert_eq!(decoded, data, "Reference decode mismatch");
}
/// Compresses 500 bytes of repeating "ABCD" with this crate, printing the
/// matches, encoded sequences, both compressed forms and selected entries of
/// the predefined match-length table, then requires the `zstd` crate to
/// decode this crate's frame back to the input.
#[test]
fn test_two_sequences() {
    use crate::fse::{FseTable, MATCH_LENGTH_ACCURACY_LOG, MATCH_LENGTH_DEFAULT_DISTRIBUTION};
    use haagenti_core::Compressor;

    let data: Vec<u8> = b"ABCD".iter().cycle().take(500).copied().collect();
    println!("=== Test Two Sequences ===");
    println!("Input: {} bytes", data.len());

    let mut mf = crate::compress::LazyMatchFinder::new(16);
    let matches = mf.find_matches(&data);
    println!("Matches found: {}", matches.len());
    for (index, found) in matches.iter().enumerate() {
        println!(
            " Match[{}]: pos={}, len={}, offset={}",
            index, found.position, found.length, found.offset
        );
    }

    let (_literals, seqs) = crate::compress::block::matches_to_sequences(&data, &matches);
    println!("Sequences: {}", seqs.len());
    for (index, seq) in seqs.iter().enumerate() {
        println!(
            " Seq[{}]: ll={}, offset={}, ml={}",
            index, seq.literal_length, seq.offset, seq.match_length
        );
        let enc = crate::compress::EncodedSequence::from_sequence(seq);
        println!(
            " Encoded: ll_code={}, of_code={}, ml_code={}",
            enc.ll_code, enc.of_code, enc.ml_code
        );
        println!(
            " Extra: ll_extra={}({} bits), of_extra={}({} bits), ml_extra={}({} bits)",
            enc.ll_extra, enc.ll_bits, enc.of_extra, enc.of_bits, enc.ml_extra, enc.ml_bits
        );
    }

    let compressor = ZstdCompressor::new();
    let compressed = compressor.compress(&data).expect("our compress failed");
    println!("Compressed: {} bytes", compressed.len());
    println!("Bytes: {:02x?}", compressed);
    let ref_compressed = zstd::encode_all(&data[..], 1).expect("ref compress failed");
    println!("Reference compressed: {} bytes", ref_compressed.len());
    println!("Reference bytes: {:02x?}", ref_compressed);

    // List every decode-table slot that yields match-length code 46.
    let ml_table = FseTable::from_predefined(
        &MATCH_LENGTH_DEFAULT_DISTRIBUTION,
        MATCH_LENGTH_ACCURACY_LOG,
    )
    .unwrap();
    println!("\nML code 46 positions in decode table:");
    for pos in 0..ml_table.size() {
        let entry = ml_table.decode(pos);
        if entry.symbol != 46 {
            continue;
        }
        println!(
            " Position {}: symbol={}, nb_bits={}, baseline={}",
            pos, entry.symbol, entry.num_bits, entry.baseline
        );
    }
    let entry63 = ml_table.decode(63);
    let entry42 = ml_table.decode(42);
    println!("Position 63 decodes to: symbol={}", entry63.symbol);
    println!("Position 42 decodes to: symbol={}", entry42.symbol);

    let decoded = match zstd::decode_all(&compressed[..]) {
        Ok(bytes) => bytes,
        Err(e) => {
            println!("FAILED: Reference zstd could not decode: {:?}", e);
            panic!("Reference zstd failed to decode our 2-sequence output");
        }
    };
    println!("Reference zstd decoded: {} bytes", decoded.len());
    if decoded == data {
        println!("SUCCESS! Reference zstd correctly decoded our 2-sequence output!");
    } else {
        println!("MISMATCH! Decoded data differs from original");
    }
    assert_eq!(decoded, data, "Reference decode mismatch");
}
/// Checks whether a reference decode failure comes from the content checksum
/// or from the block encoding.
///
/// Compresses 150 bytes (0..100 followed by 0..50) with this crate, clears
/// the Content_Checksum_flag in the frame header descriptor, removes the
/// trailing 4-byte checksum when one was present, and feeds the result to
/// `zstd::decode_all`. Outcomes are printed; nothing is asserted beyond
/// compression succeeding.
#[test]
fn test_reference_decode_no_checksum() {
    use haagenti_core::{Compressor, Decompressor};
    let data: Vec<u8> = (0..100u8).chain(0..50u8).collect();
    println!("=== Test Reference Decode Without Checksum ===");
    println!("Input: {} bytes", data.len());
    let compressor = ZstdCompressor::new();
    let compressed = compressor.compress(&data).expect("compress failed");
    println!("Original compressed: {} bytes", compressed.len());
    println!("Full bytes: {:02x?}", compressed);
    let fhd = compressed[4];
    let has_checksum = fhd & 0x04 != 0;
    println!("\nFHD byte: 0x{:02x}", fhd);
    println!(" Content_Checksum_flag: {}", (fhd >> 2) & 1);
    println!(" Single_Segment_flag: {}", (fhd >> 5) & 1);
    let mut modified = compressed.clone();
    modified[4] = fhd & !0x04;
    println!("\nModified FHD byte: 0x{:02x}", modified[4]);
    // Only strip the trailing 4 bytes when they really are a checksum.
    // Truncating a frame without one would cut into block data and make the
    // diagnosis below blame the block encoding wrongly.
    if has_checksum {
        let stripped_len = modified.len().saturating_sub(4);
        modified.truncate(stripped_len);
    } else {
        println!("Content_Checksum_flag was not set: no trailing checksum to strip");
    }
    println!("Modified compressed: {} bytes", modified.len());
    println!("Modified bytes: {:02x?}", modified);
    match zstd::decode_all(&modified[..]) {
        Ok(decoded) => {
            println!(
                "SUCCESS! Reference decoded without checksum: {} bytes",
                decoded.len()
            );
            if decoded == data {
                println!("Data matches! Issue is CHECKSUM, not block encoding");
            } else {
                println!("Data mismatch! Both checksum AND block encoding have issues");
                println!("Expected first 20: {:?}", &data[..20]);
                println!("Got first 20: {:?}", &decoded[..20.min(decoded.len())]);
            }
        }
        Err(e) => {
            println!("FAILED even without checksum: {:?}", e);
            println!("Issue is in BLOCK ENCODING, not checksum");
            // Check whether this crate's own decoder accepts the modified frame.
            let decompressor = ZstdDecompressor::new();
            match decompressor.decompress(&modified) {
                Ok(decoded) => {
                    println!("Our decoder succeeded on modified: {} bytes", decoded.len());
                }
                Err(e2) => {
                    println!("Our decoder also failed on modified: {:?}", e2);
                }
            }
        }
    }
}
/// Debug aid: encodes one hand-built sequence, prints the initial tANS states
/// and the bitstream assembled from them next to a hard-coded reference
/// bitstream, then lists every decode-table slot that holds each code.
///
/// Nothing is asserted; the test can only fail if a predefined table cannot be
/// built or the produced bitstream has fewer than two bytes.
#[test]
fn test_debug_fse_state_values() {
    use crate::block::Sequence;
    use crate::compress::EncodedSequence;
    use crate::fse::{
        FseBitWriter, FseTable, InterleavedTansEncoder, LITERAL_LENGTH_ACCURACY_LOG,
        LITERAL_LENGTH_DEFAULT_DISTRIBUTION, MATCH_LENGTH_ACCURACY_LOG,
        MATCH_LENGTH_DEFAULT_DISTRIBUTION, OFFSET_ACCURACY_LOG, OFFSET_DEFAULT_DISTRIBUTION,
    };

    println!("=== Debug FSE State Values ===");

    // The sequence under inspection and its code / extra-bit decomposition.
    let sequence = Sequence::new(100, 103, 50);
    let enc = EncodedSequence::from_sequence(&sequence);
    println!(
        "Sequence: ll={}, of={}, ml={}",
        sequence.literal_length, sequence.offset, sequence.match_length
    );
    println!(
        "Encoded: ll_code={}, of_code={}, ml_code={}",
        enc.ll_code, enc.of_code, enc.ml_code
    );
    // `of_code` doubles as the offset's extra-bit count, here and when writing below.
    println!(
        "Extra bits: ll={}({} bits), of={}({} bits), ml={}({} bits)",
        enc.ll_extra, enc.ll_bits, enc.of_extra, enc.of_code, enc.ml_extra, enc.ml_bits
    );

    // Predefined decode tables for the three symbol streams.
    let ll_table = FseTable::from_predefined(
        &LITERAL_LENGTH_DEFAULT_DISTRIBUTION,
        LITERAL_LENGTH_ACCURACY_LOG,
    )
    .unwrap();
    let of_table =
        FseTable::from_predefined(&OFFSET_DEFAULT_DISTRIBUTION, OFFSET_ACCURACY_LOG).unwrap();
    let ml_table = FseTable::from_predefined(
        &MATCH_LENGTH_DEFAULT_DISTRIBUTION,
        MATCH_LENGTH_ACCURACY_LOG,
    )
    .unwrap();
    println!(
        "\nTable sizes: LL={}, OF={}, ML={}",
        ll_table.size(),
        of_table.size(),
        ml_table.size()
    );
    println!(
        "Accuracy logs: LL={}, OF={}, ML={}",
        LITERAL_LENGTH_ACCURACY_LOG, OFFSET_ACCURACY_LOG, MATCH_LENGTH_ACCURACY_LOG
    );

    // Initial encoder states for the single sequence.
    let mut tans = InterleavedTansEncoder::new(&ll_table, &of_table, &ml_table);
    tans.init_states(enc.ll_code, enc.of_code, enc.ml_code);
    let (ll_state, of_state, ml_state) = tans.get_states();
    println!(
        "\nAfter init_states({}, {}, {}):",
        enc.ll_code, enc.of_code, enc.ml_code
    );
    println!(" LL state: {}", ll_state);
    println!(" OF state: {}", of_state);
    println!(" ML state: {}", ml_state);

    // Assemble the bitstream: extra bits first, then the three states.
    let mut writer = FseBitWriter::new();
    writer.write_bits(enc.of_extra, enc.of_code);
    writer.write_bits(enc.ml_extra, enc.ml_bits);
    writer.write_bits(enc.ll_extra, enc.ll_bits);
    let (ll_log, of_log, ml_log) = tans.accuracy_logs();
    writer.write_bits(ml_state, ml_log);
    writer.write_bits(of_state, of_log);
    writer.write_bits(ll_state, ll_log);
    let bitstream = writer.finish();

    println!("\nOur bitstream: {:02x?}", bitstream);
    println!("Reference bitstream: [e4, 67, 14, a2]");
    let ours_head = u16::from_le_bytes([bitstream[0], bitstream[1]]);
    let reference_head = u16::from_le_bytes([0xe4, 0x67]);
    println!(
        "\nFirst 16 bits (le): ours=0x{:04x} ref=0x{:04x}",
        ours_head, reference_head
    );
    println!("Ours binary: {:016b}", ours_head);
    println!("Ref binary: {:016b}", reference_head);

    // For each stream, list the decode-table slots whose symbol is the code we encoded.
    println!("\n=== Decode table positions ===");
    let scans = [
        (
            format!("LL code {} appears at positions:", enc.ll_code),
            &ll_table,
            enc.ll_code,
        ),
        (
            format!("OF code {} appears at positions:", enc.of_code),
            &of_table,
            enc.of_code,
        ),
        (
            format!("ML code {} appears at positions:", enc.ml_code),
            &ml_table,
            enc.ml_code,
        ),
    ];
    for (heading, table, code) in scans {
        println!("{}", heading);
        for pos in 0..table.size() {
            let entry = table.decode(pos);
            if entry.symbol != code {
                continue;
            }
            println!(
                " Position {}: symbol={}, nb_bits={}, baseline={}",
                pos, entry.symbol, entry.num_bits, entry.baseline
            );
        }
    }
}
/// Debug aid: compresses the same 150-byte input with the reference `zstd`
/// crate and with `ZstdCompressor`, then prints both frames side by side:
/// frame header descriptor, first block header, literals header and the last
/// bytes of the first block.
///
/// Nothing is asserted about the two encodings; the test fails only if a
/// compressor returns an error or a frame is too short to index.
#[test]
fn test_compare_block_structure() {
    use haagenti_core::Compressor;

    /// Fields of a Frame_Header_Descriptor byte that this test reports.
    struct HeaderInfo {
        single_segment: bool,
        has_checksum: bool,
        /// Bytes from the descriptor up to the first block, magic excluded.
        header_size: usize,
    }

    /// Decodes the descriptor byte. The header size counts the descriptor,
    /// the optional window descriptor, the optional dictionary ID and the
    /// optional frame content size.
    fn describe_header(fhd: u8) -> HeaderInfo {
        let single_segment = (fhd >> 5) & 1 == 1;
        let has_checksum = (fhd >> 2) & 1 == 1;
        let fcs_size = match fhd >> 6 {
            0 if single_segment => 1,
            0 => 0,
            1 => 2,
            2 => 4,
            _ => 8,
        };
        // Previously omitted: a non-zero Dictionary_ID flag shifts the first block.
        let dict_id_size = match fhd & 0x03 {
            0 => 0,
            1 => 1,
            2 => 2,
            _ => 4,
        };
        let window_size = if single_segment { 0 } else { 1 };
        HeaderInfo {
            single_segment,
            has_checksum,
            header_size: 1 + window_size + dict_id_size + fcs_size,
        }
    }

    /// Splits the 3-byte little-endian block header at `start` into
    /// `(is_last, block_type, block_size)`.
    fn split_block_header(frame: &[u8], start: usize) -> (bool, u32, u32) {
        let raw = u32::from_le_bytes([frame[start], frame[start + 1], frame[start + 2], 0]);
        (raw & 1 == 1, (raw >> 1) & 3, raw >> 3)
    }

    // 0..100 followed by 0..50, so the second run can be matched against the first.
    let data: Vec<u8> = (0..100u8).chain(0..50u8).collect();
    println!("=== Compare Block Structure ===");
    println!("Input: {} bytes", data.len());

    let ref_compressed = zstd::encode_all(&data[..], 1).expect("ref compress failed");
    println!("\nReference compressed: {} bytes", ref_compressed.len());
    println!("Reference bytes: {:02x?}", ref_compressed);
    let ref_fhd = ref_compressed[4];
    println!("\nReference FHD: 0x{:02x}", ref_fhd);

    let compressor = ZstdCompressor::new();
    let our_compressed = compressor.compress(&data).expect("our compress failed");
    println!("\nOur compressed: {} bytes", our_compressed.len());
    println!("Our bytes: {:02x?}", our_compressed);
    let our_fhd = our_compressed[4];
    println!("\nOur FHD: 0x{:02x}", our_fhd);

    let ref_header = describe_header(ref_fhd);
    println!(
        "\nReference frame header size: {} bytes",
        ref_header.header_size
    );
    println!(" Single segment: {}", ref_header.single_segment);
    println!(" Has checksum: {}", ref_header.has_checksum);

    let our_header = describe_header(our_fhd);
    println!("\nOur frame header size: {} bytes", our_header.header_size);
    println!(" Single segment: {}", our_header.single_segment);
    println!(" Has checksum: {}", our_header.has_checksum);

    // The first block starts right after the 4-byte magic and the frame header.
    let ref_block_start = 4 + ref_header.header_size;
    let our_block_start = 4 + our_header.header_size;
    println!(
        "\nReference block header at offset {}: {:02x?}",
        ref_block_start,
        &ref_compressed[ref_block_start..ref_block_start + 3]
    );
    println!(
        "Our block header at offset {}: {:02x?}",
        our_block_start,
        &our_compressed[our_block_start..our_block_start + 3]
    );

    let (ref_is_last, ref_block_type, ref_block_size) =
        split_block_header(&ref_compressed, ref_block_start);
    let (our_is_last, our_block_type, our_block_size) =
        split_block_header(&our_compressed, our_block_start);
    println!(
        "\nReference block: is_last={}, type={}, size={}",
        ref_is_last, ref_block_type, ref_block_size
    );
    println!(
        "Our block: is_last={}, type={}, size={}",
        our_is_last, our_block_type, our_block_size
    );

    let ref_block_content_start = ref_block_start + 3;
    let our_block_content_start = our_block_start + 3;

    println!("\n=== Literals Section ===");
    let ref_lit_header = ref_compressed[ref_block_content_start];
    let our_lit_header = our_compressed[our_block_content_start];
    println!("Reference literals header: 0x{:02x}", ref_lit_header);
    println!("Our literals header: 0x{:02x}", our_lit_header);
    // The low two bits of the literals header select the literals block type.
    println!(
        "Reference literals type: {} (0=Raw, 1=RLE, 2=Compressed, 3=Treeless)",
        ref_lit_header & 3
    );
    println!(
        "Our literals type: {} (0=Raw, 1=RLE, 2=Compressed, 3=Treeless)",
        our_lit_header & 3
    );

    // Tail of each first block; skipped when the declared size overruns the frame.
    let ref_block_end = ref_block_content_start + ref_block_size as usize;
    let our_block_end = our_block_content_start + our_block_size as usize;
    if ref_block_end <= ref_compressed.len() {
        println!(
            "\nReference block last 15 bytes: {:02x?}",
            &ref_compressed[ref_block_end.saturating_sub(15)..ref_block_end]
        );
    }
    if our_block_end <= our_compressed.len() {
        println!(
            "Our block last 15 bytes: {:02x?}",
            &our_compressed[our_block_end.saturating_sub(15)..our_block_end]
        );
    }
}
/// Verifies `xxhash64` against the published XXH64 digest of the empty input
/// (seed 0) and, when the reference encoder emits a content checksum, against
/// the checksum stored in the reference frame.
///
/// The diagnostic output is kept, but mismatches now fail the test instead of
/// only being printed.
#[test]
fn test_xxhash64_against_known_values() {
    use crate::frame::xxhash64;

    println!("=== XXHash64 Verification ===");
    let empty_hash = xxhash64(&[], 0);
    println!("xxhash64('', 0) = 0x{:016x}", empty_hash);
    let expected_empty = 0xEF46DB3751D8E999u64;
    println!("Expected: 0x{:016x}", expected_empty);
    if empty_hash == expected_empty {
        println!(" ✓ MATCH");
    } else {
        println!(" ✗ MISMATCH");
    }
    assert_eq!(
        empty_hash, expected_empty,
        "xxhash64 of the empty input with seed 0 must equal the published XXH64 value"
    );

    // No expected values are pinned for these two; they are printed for manual comparison.
    let hello_hash = xxhash64(b"Hello", 0);
    println!("\nxxhash64('Hello', 0) = 0x{:016x}", hello_hash);
    let digits_hash = xxhash64(b"0123456789", 0);
    println!("xxhash64('0123456789', 0) = 0x{:016x}", digits_hash);

    // 0..100 followed by 0..50.
    let test_data: Vec<u8> = (0..100u8).chain(0..50u8).collect();
    let our_hash = xxhash64(&test_data, 0);
    // The frame checksum is the low 32 bits of the 64-bit hash.
    let our_checksum = (our_hash & 0xFFFFFFFF) as u32;
    println!("\nFor 150-byte test data:");
    println!(" Our full xxhash64: 0x{:016x}", our_hash);
    println!(" Our 32-bit checksum: 0x{:08x}", our_checksum);

    let ref_compressed = zstd::encode_all(&test_data[..], 1).expect("ref compress failed");
    println!("\nReference compressed: {} bytes", ref_compressed.len());
    let ref_fhd = ref_compressed[4];
    println!("Reference FHD: 0x{:02x}", ref_fhd);
    let has_checksum = (ref_fhd >> 2) & 1 == 1;
    println!("Reference has checksum: {}", has_checksum);

    // The comparison is only possible when the reference frame carries a checksum.
    if has_checksum {
        let end = ref_compressed.len();
        let ref_checksum = u32::from_le_bytes([
            ref_compressed[end - 4],
            ref_compressed[end - 3],
            ref_compressed[end - 2],
            ref_compressed[end - 1],
        ]);
        println!("Reference 32-bit checksum: 0x{:08x}", ref_checksum);
        if our_checksum == ref_checksum {
            println!(" ✓ CHECKSUMS MATCH!");
        } else {
            println!(" ✗ CHECKSUMS DIFFER!");
        }
        assert_eq!(
            our_checksum, ref_checksum,
            "low 32 bits of xxhash64 must equal the reference frame's content checksum"
        );
    }
}
/// Debug aid: dumps the predefined offset decode table and prints the state
/// produced by initialising a tANS encoder with offset code 5, both through a
/// standalone `TansEncoder` and through the interleaved three-stream encoder.
///
/// Nothing is asserted; the expected values are only printed for comparison.
#[test]
fn test_debug_of_init_state() {
    use crate::fse::TansEncoder;
    use crate::fse::{
        FseTable, InterleavedTansEncoder, LITERAL_LENGTH_ACCURACY_LOG,
        LITERAL_LENGTH_DEFAULT_DISTRIBUTION, MATCH_LENGTH_ACCURACY_LOG,
        MATCH_LENGTH_DEFAULT_DISTRIBUTION, OFFSET_ACCURACY_LOG, OFFSET_DEFAULT_DISTRIBUTION,
    };

    let of_table =
        FseTable::from_predefined(&OFFSET_DEFAULT_DISTRIBUTION, OFFSET_ACCURACY_LOG).unwrap();
    println!("=== Debug OF Init State for Code 5 ===");
    println!("OF accuracy log: {}", OFFSET_ACCURACY_LOG);
    println!("OF table size: {}", of_table.size());

    // First only the slots that decode to symbol 5 ...
    println!("\nOF Decode Table:");
    println!(" Positions where symbol 5 appears:");
    (0..of_table.size())
        .map(|pos| (pos, of_table.decode(pos)))
        .filter(|(_, entry)| entry.symbol == 5)
        .for_each(|(pos, entry)| {
            println!(
                " Position {} -> symbol={}, nb_bits={}, baseline={}",
                pos, entry.symbol, entry.num_bits, entry.baseline
            );
        });

    // ... then the complete table.
    println!("\n All positions:");
    (0..of_table.size()).for_each(|pos| {
        let entry = of_table.decode(pos);
        println!(
            " {:2}: symbol={:2}, nb_bits={}, baseline={:2}",
            pos, entry.symbol, entry.num_bits, entry.baseline
        );
    });

    // Standalone offset encoder.
    let mut single = TansEncoder::from_decode_table(&of_table);
    single.init_state(5);
    let single_output_state = single.get_state();
    println!("\nSingle OF encoder:");
    println!(" init_state(5) -> output state = {}", single_output_state);

    // Interleaved encoder, which needs the other two predefined tables as well.
    let ll_table = FseTable::from_predefined(
        &LITERAL_LENGTH_DEFAULT_DISTRIBUTION,
        LITERAL_LENGTH_ACCURACY_LOG,
    )
    .unwrap();
    let ml_table = FseTable::from_predefined(
        &MATCH_LENGTH_DEFAULT_DISTRIBUTION,
        MATCH_LENGTH_ACCURACY_LOG,
    )
    .unwrap();
    let mut triple = InterleavedTansEncoder::new(&ll_table, &of_table, &ml_table);
    triple.init_states(23, 5, 17);
    let (ll_state, of_state, ml_state) = triple.get_states();
    println!("\nInterleaved encoder (like sequence encoding):");
    println!(" init_states(23, 5, 17) -> states:");
    println!(" LL = {}", ll_state);
    println!(" OF = {}", of_state);
    println!(" ML = {}", ml_state);
    println!(" Expected OF = 18 (position 18 in decode table)");
    println!(" Expected LL = 38 (position 38 in decode table)");

    let slot_18 = of_table.decode(18);
    println!(
        "\n Position 18 has: symbol={}, nb_bits={}, baseline={}",
        slot_18.symbol, slot_18.num_bits, slot_18.baseline
    );
}
/// Prints the three initial FSE states stored at the end of a sequence bitstream.
///
/// The stream is treated as a little-endian bit container that is consumed
/// from its end: the highest set bit of the final byte is the sentinel, and
/// the bits directly below it hold the LL state (6 bits), the OF state
/// (5 bits) and the ML state (6 bits), in that order.
///
/// * `name`  - label used in the printed output.
/// * `bytes` - the raw bitstream; may be empty or malformed.
///
/// Malformed input (empty stream, zero final byte, fewer than 17 data bits)
/// is reported on stdout instead of panicking.
fn decode_bitstream_bits(name: &str, bytes: &[u8]) {
    // State widths; these match the 0x3F / 0x1F / 0x3F masks used below.
    const LL_STATE_BITS: usize = 6;
    const OF_STATE_BITS: usize = 5;
    const ML_STATE_BITS: usize = 6;
    const ALL_STATE_BITS: usize = LL_STATE_BITS + OF_STATE_BITS + ML_STATE_BITS;

    let last = match bytes.last() {
        Some(&byte) => byte,
        None => {
            println!(" {} is empty", name);
            return;
        }
    };
    println!(" {} bits:", name);

    // A zero final byte has no sentinel bit, so the end of the data is undefined.
    if last == 0 {
        println!(" Last byte: 0x00, no sentinel bit found");
        return;
    }
    let sentinel_pos = 7 - last.leading_zeros();
    println!(
        " Last byte: 0x{:02x}, sentinel at bit {}",
        last, sentinel_pos
    );

    let total_bits = (bytes.len() - 1) * 8 + sentinel_pos as usize;
    println!(" Total data bits: {}", total_bits);
    if total_bits < ALL_STATE_BITS {
        println!(
            " Too few data bits for the {} initial state bits",
            ALL_STATE_BITS
        );
        return;
    }

    // Only the final eight bytes can contain the states, and they fit in a u64.
    let tail = &bytes[bytes.len().saturating_sub(8)..];
    // Little-endian assembly: the final byte (holding the sentinel) ends up on top.
    let window = tail
        .iter()
        .rev()
        .fold(0u64, |acc, &byte| (acc << 8) | u64::from(byte));
    // At most 7 * 8 + 7 = 63 bits, so the shift below cannot overflow.
    let window_bits = (tail.len() - 1) * 8 + sentinel_pos as usize;
    // Drop the sentinel and the padding above it.
    let data_bits = window & ((1u64 << window_bits) - 1);

    let ll_state = (data_bits >> (window_bits - LL_STATE_BITS)) & 0x3F;
    let of_state = (data_bits >> (window_bits - LL_STATE_BITS - OF_STATE_BITS)) & 0x1F;
    let ml_state = (data_bits >> (window_bits - ALL_STATE_BITS)) & 0x3F;
    println!(
        " Initial states: LL={} OF={} ML={}",
        ll_state, of_state, ml_state
    );
    let remaining = total_bits - ALL_STATE_BITS;
    println!(" Remaining bits after states: {}", remaining);
}
/// Debug aid: compresses one input with the reference `zstd` crate and with
/// `ZstdCompressor`, dumps both frames, and tries to decode our frame with
/// both decoders.
///
/// Asserts that the reference encoder round-trips and that, if our decoder
/// succeeds on our frame, it returns the original data. Decoder failures are
/// only printed.
#[test]
fn test_reference_zstd_comparison() {
    use haagenti_core::{Compressor, Decompressor};

    // 0..100, a repeat of 0..50, three marker bytes, then a repeat of 50..80.
    let mut data: Vec<u8> = Vec::new();
    data.extend(0..100u8);
    data.extend(0..50u8);
    data.extend_from_slice(&[0xAA, 0xBB, 0xCC]);
    data.extend(50..80u8);

    println!("=== Reference Zstd Comparison ===");
    println!(
        "Input data ({} bytes): {:?}",
        data.len(),
        String::from_utf8_lossy(&data)
    );

    // Reference encoder.
    let ref_compressed =
        zstd::encode_all(&data[..], 3).expect("reference zstd compress failed");
    println!(
        "\nReference zstd compressed: {} bytes",
        ref_compressed.len()
    );
    println!("Reference bytes: {:02x?}", ref_compressed);
    parse_zstd_frame("Reference", &ref_compressed);

    // Our encoder.
    let our_compressed = ZstdCompressor::new()
        .compress(&data)
        .expect("our compress failed");
    println!(
        "\nOur implementation compressed: {} bytes",
        our_compressed.len()
    );
    println!("Our bytes: {:02x?}", our_compressed);
    parse_zstd_frame("Ours", &our_compressed);

    // Sanity check of the reference path itself.
    let ref_decompressed =
        zstd::decode_all(&ref_compressed[..]).expect("reference decode failed");
    assert_eq!(&ref_decompressed, &data, "Reference roundtrip failed");

    println!("\n=== Decoding Tests ===");

    // Reference decoder on our frame: reported, never asserted.
    match zstd::decode_all(&our_compressed[..]) {
        Err(e) => {
            println!("Reference zstd FAILED to decode our output: {:?}", e);
        }
        Ok(decoded) => {
            println!("Reference zstd decoded our output: {} bytes", decoded.len());
            if decoded != data {
                println!("Reference zstd decoded WRONG data!");
                println!("Expected {} bytes, got {} bytes", data.len(), decoded.len());
            } else {
                println!("Reference zstd roundtrip SUCCEEDED!");
            }
        }
    }

    // Our decoder on our frame: the content is asserted only when decoding succeeds.
    match ZstdDecompressor::new().decompress(&our_compressed) {
        Err(e) => {
            println!("Our decoder FAILED: {:?}", e);
        }
        Ok(decoded) => {
            println!("Our decoder succeeded: {} bytes", decoded.len());
            assert_eq!(&decoded, &data, "Our roundtrip failed");
        }
    }
    println!("\n=== Done ===");
}
/// Prints a breakdown of a Zstandard frame: the magic number, the fields of
/// the frame header descriptor, and the header plus a hex dump of the first
/// block.
///
/// * `name` - label used in the section heading.
/// * `data` - the complete frame; truncated input stops the dump early.
fn parse_zstd_frame(name: &str, data: &[u8]) {
    println!("\n--- {} Frame Structure ---", name);
    if data.len() < 4 {
        println!("Frame too short!");
        return;
    }
    let magic = u32::from_le_bytes([data[0], data[1], data[2], data[3]]);
    println!("Magic: 0x{:08x} (expected: 0xFD2FB528)", magic);

    // Frame header descriptor; without it there is nothing more to report.
    let fhd = match data.get(4) {
        Some(&byte) => byte,
        None => return,
    };
    let single_segment = (fhd >> 5) & 0x1 != 0;
    let content_checksum = (fhd >> 2) & 0x1 != 0;
    // A content-size flag of 0 still means one byte when the frame is single-segment.
    let fcs_size: usize = match (fhd >> 6) & 0x3 {
        0 => usize::from(single_segment),
        1 => 2,
        2 => 4,
        _ => 8,
    };
    let dict_id_size: usize = [0, 1, 2, 4][usize::from(fhd & 0x3)];

    println!("Frame Header Descriptor: 0x{:02x}", fhd);
    println!(" - FCS size: {} bytes", fcs_size);
    println!(" - Single segment: {}", single_segment);
    println!(" - Content checksum: {}", content_checksum);
    println!(" - Dict ID size: {} bytes", dict_id_size);

    // Magic (4) + descriptor (1) + optional window descriptor + dict ID + content size.
    let header_size = 5 + usize::from(!single_segment) + dict_id_size + fcs_size;
    println!("Header ends at byte {}", header_size);

    // The first block header needs three bytes right after the frame header.
    let block_start = header_size;
    if block_start + 3 > data.len() {
        return;
    }
    let raw = &data[block_start..block_start + 3];
    let block_header = u32::from_le_bytes([raw[0], raw[1], raw[2], 0]);
    let last_block = block_header & 0x1 != 0;
    let block_type = (block_header >> 1) & 0x3;
    let block_size = (block_header >> 3) as usize;
    let type_name = ["Raw", "RLE", "Compressed", "Reserved"]
        .get(block_type as usize)
        .copied()
        .unwrap_or("Unknown");

    println!("\nFirst Block at offset {}:", block_start);
    println!(
        " - Block header bytes: {:02x} {:02x} {:02x}",
        raw[0], raw[1], raw[2]
    );
    println!(" - Last block: {}", last_block);
    println!(" - Block type: {} ({})", block_type, type_name);
    println!(" - Block size: {} bytes", block_size);

    // Hex dump of the block body, clamped to the bytes actually present.
    let body_start = block_start + 3;
    let body_end = data.len().min(body_start + block_size);
    let body = &data[body_start..body_end];
    println!("\nBlock content ({} bytes):", body.len());
    for (row, chunk) in body.chunks(16).enumerate() {
        print!(" {:04x}: ", row * 16);
        chunk.iter().for_each(|b| print!("{:02x} ", b));
        println!();
    }
}
/// Debug aid: feeds a hand-written 19-byte frame to the reference decoder,
/// once unchanged and once with byte 16 (the first byte of the trailing
/// three-byte bitstream) replaced by the value our encoder produced.
///
/// Nothing is asserted; both outcomes are only printed.
#[test]
fn test_fse_bytes_in_reference_frame() {
    // Magic, frame header, block header, literals, then the bitstream `fd e4 88`.
    let reference_frame = [
        0x28u8, 0xb5, 0x2f, 0xfd, 0x00, 0x48, 0x55, 0x00, 0x00, 0x20, 0x41, 0x42, 0x43, 0x44,
        0x01, 0x00, 0xfd, 0xe4, 0x88,
    ];
    println!("=== Test FSE Bytes in Reference Frame ===");
    println!("Reference frame: {:02x?}", reference_frame);

    let reference_outcome = zstd::decode_all(&reference_frame[..]);
    match reference_outcome {
        Err(e) => {
            println!("Reference frame with reference FSE: FAILED {:?}", e);
        }
        Ok(decoded) => {
            println!(
                "Reference frame with reference FSE: SUCCESS ({} bytes)",
                decoded.len()
            );
            println!(" Decoded: {:?}", String::from_utf8_lossy(&decoded));
        }
    }

    // Same frame, but with our first bitstream byte swapped in.
    let mut patched_frame = reference_frame;
    patched_frame[16] = 0xf7;
    println!("\nOur FSE frame: {:02x?}", patched_frame);

    let patched_outcome = zstd::decode_all(&patched_frame[..]);
    match patched_outcome {
        Err(e) => {
            println!("Reference frame with OUR FSE: FAILED {:?}", e);
            println!("This confirms FSE encoding difference is the issue");
        }
        Ok(decoded) => {
            println!(
                "Reference frame with OUR FSE: SUCCESS ({} bytes)",
                decoded.len()
            );
            println!(" Decoded: {:?}", String::from_utf8_lossy(&decoded));
        }
    }
}
}
#[cfg(test)]
mod profiling_tests {
use crate::compress::block::matches_to_sequences;
use crate::compress::{
analyze_for_rle, CompressContext, EncodedSequence, LazyMatchFinder, MatchFinder,
};
use crate::huffman::HuffmanEncoder;
use crate::{ZstdCompressor, ZstdDecompressor};
use haagenti_core::{CompressionLevel, Compressor, Decompressor};
/// Statistics gathered by `profile_compression` for one input buffer and
/// printed by `print_profile`. Every field starts at its `Default` value and
/// keeps it when the corresponding stage produced nothing.
#[derive(Debug, Default)]
struct CompressionProfile {
    // Length of the input buffer in bytes.
    input_size: usize,
    // Length of the `CompressContext::compress` output; stays 0 if compression failed.
    output_size: usize,
    // Number of matches returned by the match finder.
    num_matches: usize,
    // Sum of the lengths of all matches.
    total_match_bytes: usize,
    // Length of the literals buffer returned by `matches_to_sequences`.
    literal_bytes: usize,
    // Mean match length; stays 0.0 when no match was found.
    avg_match_length: f64,
    // Mean match offset; stays 0.0 when no match was found.
    avg_offset: f64,
    // Number of sequences returned by `matches_to_sequences`.
    num_sequences: usize,
    // Result of `analyze_for_rle(..).all_uniform()` on the sequences.
    rle_suitable: bool,
    // Number of distinct literal-length codes among the encoded sequences.
    ll_codes_unique: usize,
    // Number of distinct offset codes among the encoded sequences.
    of_codes_unique: usize,
    // Number of distinct match-length codes among the encoded sequences.
    ml_codes_unique: usize,
    // True when there were at least 64 literal bytes and `HuffmanEncoder::build` succeeded.
    huffman_viable: bool,
    // `estimate_size` of that encoder for the literals; 0 when not viable.
    huffman_estimated_size: usize,
    // Length of the `zstd::encode_all(data, 3)` output; stays 0 if that call failed.
    zstd_size: usize,
}
/// Runs each compression stage on `data` separately and records what it
/// produced: match-finder statistics, sequence code diversity, Huffman
/// viability of the literals, and the final output size next to the size
/// produced by the reference `zstd` crate at level 3.
///
/// `level` picks the match finder (`MatchFinder` with depth 4 for `Fast` and
/// `None`, `LazyMatchFinder` with depth 16 otherwise) and configures the
/// `CompressContext`. Failures of either compressor leave the matching size at 0.
fn profile_compression(data: &[u8], level: CompressionLevel) -> CompressionProfile {
    use std::collections::HashSet;

    // Stage 1: match finding.
    let matches = if matches!(level, CompressionLevel::Fast | CompressionLevel::None) {
        MatchFinder::new(4).find_matches(data)
    } else {
        LazyMatchFinder::new(16).find_matches(data)
    };
    let match_count = matches.len();
    let (length_sum, offset_sum) = matches
        .iter()
        .fold((0usize, 0usize), |(lengths, offsets), m| {
            (lengths + m.length, offsets + m.offset)
        });
    // The averages keep their default of 0.0 when nothing matched.
    let (avg_match_length, avg_offset) = if match_count == 0 {
        (0.0, 0.0)
    } else {
        (
            length_sum as f64 / match_count as f64,
            offset_sum as f64 / match_count as f64,
        )
    };

    // Stage 2: sequences and the diversity of their codes.
    let (literals, sequences) = matches_to_sequences(data, &matches);
    let rle_suitable = analyze_for_rle(&sequences).all_uniform();
    let mut ll_seen = HashSet::new();
    let mut of_seen = HashSet::new();
    let mut ml_seen = HashSet::new();
    for seq in sequences.iter() {
        let encoded = EncodedSequence::from_sequence(seq);
        ll_seen.insert(encoded.ll_code);
        of_seen.insert(encoded.of_code);
        ml_seen.insert(encoded.ml_code);
    }

    // Stage 3: Huffman is only attempted for at least 64 literal bytes.
    let huffman = if literals.len() >= 64 {
        HuffmanEncoder::build(&literals)
    } else {
        None
    };
    let huffman_viable = huffman.is_some();
    let huffman_estimated_size = huffman.map_or(0, |encoder| encoder.estimate_size(&literals));

    // Stage 4: full pipeline versus the reference encoder.
    let mut ctx = CompressContext::new(level);
    let output_size = ctx.compress(data).ok().map_or(0, |out| out.len());
    let zstd_size = zstd::encode_all(data, 3).ok().map_or(0, |out| out.len());

    CompressionProfile {
        input_size: data.len(),
        output_size,
        num_matches: match_count,
        total_match_bytes: length_sum,
        literal_bytes: literals.len(),
        avg_match_length,
        avg_offset,
        num_sequences: sequences.len(),
        rle_suitable,
        ll_codes_unique: ll_seen.len(),
        of_codes_unique: of_seen.len(),
        ml_codes_unique: ml_seen.len(),
        huffman_viable,
        huffman_estimated_size,
        zstd_size,
    }
}
/// Writes a `CompressionProfile` to stdout as a sectioned report
/// (match finding, sequences, literals, output sizes) under the heading `name`.
fn print_profile(name: &str, p: &CompressionProfile) {
    // Percentages are relative to the input size; ratios guard the divisor with `max(1)`.
    let input_bytes = p.input_size as f64;
    let match_pct = 100.0 * p.total_match_bytes as f64 / input_bytes;
    let literal_pct = 100.0 * p.literal_bytes as f64 / input_bytes;
    let our_ratio = input_bytes / p.output_size.max(1) as f64;
    let zstd_ratio = input_bytes / p.zstd_size.max(1) as f64;
    let gap_bytes = p.output_size as i64 - p.zstd_size as i64;
    let gap_pct = 100.0 * (p.output_size as f64 / p.zstd_size.max(1) as f64 - 1.0);

    println!("\n=== {} ===", name);
    println!("Input: {} bytes", p.input_size);
    println!();

    println!("MATCH FINDING:");
    println!(" Matches found: {}", p.num_matches);
    println!(
        " Match coverage: {} bytes ({:.1}%)",
        p.total_match_bytes, match_pct
    );
    println!(" Literal bytes: {} ({:.1}%)", p.literal_bytes, literal_pct);
    println!(" Avg match length: {:.1}", p.avg_match_length);
    println!(" Avg offset: {:.1}", p.avg_offset);
    println!();

    println!("SEQUENCES:");
    println!(" Sequences: {}", p.num_sequences);
    println!(" RLE suitable: {}", p.rle_suitable);
    println!(" Unique LL codes: {}", p.ll_codes_unique);
    println!(" Unique OF codes: {}", p.of_codes_unique);
    println!(" Unique ML codes: {}", p.ml_codes_unique);
    println!();

    println!("LITERALS:");
    println!(" Huffman viable: {}", p.huffman_viable);
    if p.huffman_viable {
        let huffman_pct = 100.0 * p.huffman_estimated_size as f64 / p.literal_bytes.max(1) as f64;
        println!(
            " Huffman estimated: {} bytes ({:.1}% of literals)",
            p.huffman_estimated_size, huffman_pct
        );
    }
    println!();

    println!("OUTPUT:");
    println!(
        " Haagenti: {} bytes ({:.2}x ratio)",
        p.output_size, our_ratio
    );
    println!(" Zstd ref: {} bytes ({:.2}x ratio)", p.zstd_size, zstd_ratio);
    println!(" Gap: {} bytes ({:.1}% larger)", gap_bytes, gap_pct);
}
/// Returns exactly `size` bytes made of a fixed pangram sentence repeated
/// back to back, with the final repetition cut off where needed.
fn generate_text(size: usize) -> Vec<u8> {
    const SENTENCE: &[u8] = b"The quick brown fox jumps over the lazy dog. ";
    SENTENCE.iter().copied().cycle().take(size).collect()
}
/// Returns exactly `size` bytes of text built by concatenating words drawn
/// from a fixed vocabulary with a `StdRng` seeded from `seed`, so equal
/// arguments always yield the same bytes.
fn generate_random_text(size: usize, seed: u64) -> Vec<u8> {
    use rand::rngs::StdRng;
    use rand::{Rng, SeedableRng};

    // Every word carries its own trailing space.
    const VOCABULARY: &[&str] = &[
        "the ",
        "quick ",
        "brown ",
        "fox ",
        "jumps ",
        "over ",
        "lazy ",
        "dog ",
        "compression ",
        "algorithm ",
        "data ",
        "stream ",
        "entropy ",
    ];

    let mut rng = StdRng::seed_from_u64(seed);
    let mut text: Vec<u8> = Vec::with_capacity(size);
    loop {
        if text.len() >= size {
            break;
        }
        let pick = rng.gen_range(0..VOCABULARY.len());
        text.extend_from_slice(VOCABULARY[pick].as_bytes());
    }
    // The last word usually overshoots; cut back to the requested length.
    text.truncate(size);
    text
}
/// Returns `size` bytes drawn one at a time from a `StdRng` seeded with
/// `seed`, so equal arguments always yield the same bytes.
fn generate_binary(size: usize, seed: u64) -> Vec<u8> {
    use rand::rngs::StdRng;
    use rand::{Rng, SeedableRng};

    let mut rng = StdRng::seed_from_u64(seed);
    let mut bytes = Vec::with_capacity(size);
    for _ in 0..size {
        bytes.push(rng.r#gen::<u8>());
    }
    bytes
}
/// Profiles four synthetic inputs (repeating text at 16 KB and 64 KB, random
/// word text, random bytes) at the default compression level and prints one
/// report per input. Nothing is asserted.
#[test]
fn test_profile_text_patterns() {
    println!("\n========== COMPRESSION PROFILING ==========\n");

    let workloads = [
        ("16KB Repeating Text", generate_text(16384)),
        ("16KB Random Text", generate_random_text(16384, 12345)),
        ("64KB Repeating Text", generate_text(65536)),
        ("16KB Random Binary", generate_binary(16384, 54321)),
    ];
    for (title, bytes) in workloads {
        let report = profile_compression(&bytes, CompressionLevel::Default);
        print_profile(title, &report);
    }
}
/// Compares the greedy (`MatchFinder`, depth 4) and lazy (`LazyMatchFinder`,
/// depth 16) match finders on 16 KB of repeating text: match count, coverage
/// and average length for both, plus a length histogram for the lazy finder.
/// Nothing is asserted.
#[test]
fn test_profile_match_finder_quality() {
    println!("\n========== MATCH FINDER ANALYSIS ==========\n");
    let data = generate_text(16384);

    let greedy_matches = MatchFinder::new(4).find_matches(&data);
    let lazy_matches = LazyMatchFinder::new(16).find_matches(&data);

    // Coverage and average length for one finder; only called when `count > 0`.
    let print_coverage = |covered: usize, count: usize| {
        println!(
            " Coverage: {} bytes ({:.1}%)",
            covered,
            100.0 * covered as f64 / data.len() as f64
        );
        println!(" Avg length: {:.1}", covered as f64 / count as f64);
    };

    println!("Greedy (depth=4):");
    println!(" Matches: {}", greedy_matches.len());
    if greedy_matches.len() > 0 {
        let covered: usize = greedy_matches.iter().map(|m| m.length).sum();
        print_coverage(covered, greedy_matches.len());
    }

    println!("\nLazy (depth=16):");
    println!(" Matches: {}", lazy_matches.len());
    if lazy_matches.len() > 0 {
        let covered: usize = lazy_matches.iter().map(|m| m.length).sum();
        print_coverage(covered, lazy_matches.len());
    }

    println!("\nMatch length distribution (Lazy):");
    // Inclusive upper bound of buckets 0..=8; bucket 9 collects everything else,
    // including lengths below 3.
    const UPPER_BOUNDS: [usize; 9] = [3, 4, 7, 15, 31, 63, 127, 255, 1023];
    let mut histogram = [0usize; 10];
    for found in lazy_matches.iter() {
        let len = found.length;
        let slot = if len < 3 {
            9
        } else {
            UPPER_BOUNDS
                .iter()
                .position(|&bound| len <= bound)
                .unwrap_or(9)
        };
        histogram[slot] += 1;
    }
    println!(" 3: {}", histogram[0]);
    println!(" 4: {}", histogram[1]);
    println!(" 5-7: {}", histogram[2]);
    println!(" 8-15: {}", histogram[3]);
    println!(" 16-31: {}", histogram[4]);
    println!(" 32-63: {}", histogram[5]);
    println!(" 64-127: {}", histogram[6]);
    println!(" 128-255: {}", histogram[7]);
    println!(" 256-1023: {}", histogram[8]);
    println!(" 1024+: {}", histogram[9]);
}
/// For four 4 KB inputs of decreasing regularity, prints how many sequences
/// the lazy match finder yields, whether `analyze_for_rle` reports them as
/// uniform, and how many distinct LL / OF / ML codes they use.
/// Nothing is asserted.
#[test]
fn test_profile_sequence_encoding_paths() {
    use std::collections::HashSet;

    println!("\n========== SEQUENCE ENCODING PATHS ==========\n");

    let uniform: Vec<u8> = b"abcd".iter().copied().cycle().take(4096).collect();
    let mixed: Vec<u8> = [generate_text(2048), generate_random_text(2048, 888)].concat();
    let test_cases: Vec<(&str, Vec<u8>)> = vec![
        ("Uniform pattern (abcd repeat)", uniform),
        ("Semi-uniform (sentence repeat)", generate_text(4096)),
        ("Random text order", generate_random_text(4096, 999)),
        ("Mixed content", mixed),
    ];

    for (name, data) in test_cases {
        let matches = LazyMatchFinder::new(16).find_matches(&data);
        let (_literals, sequences) = matches_to_sequences(&data, &matches);
        let suitability = analyze_for_rle(&sequences);

        // Distinct codes per stream; the sets stay empty when there are no sequences.
        let mut ll_codes = HashSet::new();
        let mut of_codes = HashSet::new();
        let mut ml_codes = HashSet::new();
        for seq in sequences.iter() {
            let encoded = EncodedSequence::from_sequence(seq);
            ll_codes.insert(encoded.ll_code);
            of_codes.insert(encoded.of_code);
            ml_codes.insert(encoded.ml_code);
        }

        println!(
            "{}: {} seqs, RLE={}, LL={} OF={} ML={} unique codes",
            name,
            sequences.len(),
            suitability.all_uniform(),
            ll_codes.len(),
            of_codes.len(),
            ml_codes.len(),
        );
    }
}
/// Debug aid for runs of a single byte: builds ten repetitions of 20 `X`
/// followed by 20 `Y`, prints the matches, sequences and RLE analysis, then
/// compresses the input and tries both our decoder and the reference one.
///
/// Only compression itself must succeed; decoder results are just printed.
#[test]
fn test_debug_single_byte_repeats() {
    let mut input: Vec<u8> = Vec::new();
    for _ in 0..10 {
        input.extend_from_slice(&[b'X'; 20]);
        input.extend_from_slice(&[b'Y'; 20]);
    }
    println!("Input: {} bytes", input.len());
    println!(
        "Pattern preview: {:?}",
        String::from_utf8_lossy(&input[..60])
    );

    // Match finding.
    let matches = LazyMatchFinder::new(16).find_matches(&input);
    println!("\nMatches found: {}", matches.len());
    for (idx, found) in matches.iter().enumerate().take(10) {
        println!(
            " Match[{}]: pos={}, len={}, offset={}",
            idx, found.position, found.length, found.offset
        );
    }

    // Sequences and their RLE suitability.
    let (literals, seqs) = matches_to_sequences(&input, &matches);
    println!("\nLiterals: {} bytes", literals.len());
    println!("Sequences: {}", seqs.len());
    let suitability = analyze_for_rle(&seqs);
    println!("RLE suitable: {}", suitability.all_uniform());
    println!(
        " LL uniform: {} (code={})",
        suitability.ll_uniform, suitability.ll_code
    );
    println!(
        " OF uniform: {} (code={})",
        suitability.of_uniform, suitability.of_code
    );
    println!(
        " ML uniform: {} (code={})",
        suitability.ml_uniform, suitability.ml_code
    );

    if seqs.len() > 0 {
        println!("\nFirst 5 encoded sequences:");
        let encoded: Vec<_> = seqs.iter().map(EncodedSequence::from_sequence).collect();
        for (idx, e) in encoded.iter().enumerate().take(5) {
            println!(" Seq[{}]: ll_code={}, of_code={}, ml_code={}, ll_extra={}, of_extra={}, ml_extra={}",
                idx, e.ll_code, e.of_code, e.ml_code, e.ll_extra, e.of_extra, e.ml_extra);
        }
    }

    // Full compression and a hex dump of the frame.
    let compressed = ZstdCompressor::new()
        .compress(&input)
        .expect("Compression failed");
    println!("\nCompressed: {} bytes", compressed.len());
    println!("Full compressed data:");
    for (row, chunk) in compressed.chunks(16).enumerate() {
        print!(" {:04x}: ", row * 16);
        chunk.iter().for_each(|b| print!("{:02x} ", b));
        println!();
    }

    // Both decoders are tried; their outcome is reported, not asserted.
    match ZstdDecompressor::new().decompress(&compressed) {
        Err(e) => println!("\nOur decompressor: FAILED: {:?}", e),
        Ok(decompressed) => {
            println!("\nOur decompressor: SUCCESS, {} bytes", decompressed.len())
        }
    }
    match zstd::decode_all(compressed.as_slice()) {
        Err(e) => println!("Reference zstd: FAILED: {:?}", e),
        Ok(decompressed) => println!("Reference zstd: SUCCESS, {} bytes", decompressed.len()),
    }
}
}
#[cfg(test)]
mod minimal_fse_debug {
use crate::fse::{
FseBitWriter, FseTable, InterleavedTansEncoder, LITERAL_LENGTH_ACCURACY_LOG,
LITERAL_LENGTH_DEFAULT_DISTRIBUTION, MATCH_LENGTH_ACCURACY_LOG,
MATCH_LENGTH_DEFAULT_DISTRIBUTION, OFFSET_ACCURACY_LOG, OFFSET_DEFAULT_DISTRIBUTION,
};
/// Builds the sequence bitstream for a single sequence (LL code 4, OF code 2,
/// ML code 41) by hand, checks that it is exactly three bytes long and that
/// each initial state decodes back to its code, then decodes the reference
/// bitstream `fd e4 88` for comparison.
///
/// The reference extra bits are read in decoder order (offset, then match
/// length, then literal length). Reading match length first, as this test
/// used to, mixes the two fields and yields values that do not match the
/// expected match-length extra of 13.
#[test]
fn test_single_sequence_bitstream_size() {
    let ll_code: u8 = 4;
    let of_code: u8 = 2;
    let ml_code: u8 = 41;
    let of_extra: u32 = 0;
    let ml_extra: u32 = 13;
    let ml_bits: u8 = 4;
    println!(
        "Encoded (matching reference): ll_code={}, of_code={}, ml_code={}",
        ll_code, of_code, ml_code
    );
    // The offset code doubles as the number of offset extra bits.
    println!("OF extra bits: {} bits, value {}", of_code, of_extra);
    println!("ML extra bits: {} bits, value {}", ml_bits, ml_extra);

    let ll_table = FseTable::from_predefined(
        &LITERAL_LENGTH_DEFAULT_DISTRIBUTION,
        LITERAL_LENGTH_ACCURACY_LOG,
    )
    .unwrap();
    let of_table =
        FseTable::from_predefined(&OFFSET_DEFAULT_DISTRIBUTION, OFFSET_ACCURACY_LOG).unwrap();
    let ml_table = FseTable::from_predefined(
        &MATCH_LENGTH_DEFAULT_DISTRIBUTION,
        MATCH_LENGTH_ACCURACY_LOG,
    )
    .unwrap();

    let mut tans = InterleavedTansEncoder::new(&ll_table, &of_table, &ml_table);
    let (ll_log, of_log, ml_log) = tans.accuracy_logs();
    println!("Accuracy logs: ll={}, of={}, ml={}", ll_log, of_log, ml_log);

    let mut bits = FseBitWriter::new();
    tans.init_states(ll_code, of_code, ml_code);
    let (init_ll, init_of, init_ml) = tans.get_states();
    println!(
        "After init_states: ll_state={}, of_state={}, ml_state={}",
        init_ll, init_of, init_ml
    );
    let (ll_state, of_state, ml_state) = tans.get_states();
    println!(
        "States (from init): ll={}, of={}, ml={}",
        ll_state, of_state, ml_state
    );

    // NOTE(review): extra bits are written OF first, then ML. A decoder that
    // reads the stream backwards sees them in the opposite order - confirm
    // this matches what the production encoder is meant to emit.
    if of_code > 0 {
        println!("Writing OF extra: value={}, bits={}", of_extra, of_code);
        bits.write_bits(of_extra, of_code);
    }
    if ml_bits > 0 {
        println!("Writing ML extra: value={}, bits={}", ml_extra, ml_bits);
        bits.write_bits(ml_extra, ml_bits);
    }
    // The states go last: ML, OF, then LL.
    bits.write_bits(ml_state, ml_log);
    bits.write_bits(of_state, of_log);
    bits.write_bits(ll_state, ll_log);
    println!("No FSE encode for single sequence (captured by init_state)");
    let bitstream = bits.finish();
    println!("Bitstream ({} bytes): {:02x?}", bitstream.len(), bitstream);

    println!("\nTotal bits written:");
    let total_extra = of_code as u32 + ml_bits as u32;
    let state_bits = ll_log + of_log + ml_log;
    println!(" OF extra: {} bits", of_code);
    println!(" ML extra: {} bits", ml_bits);
    println!(" FSE encode: 0 bits (none for single sequence)");
    println!(" Init states: {} bits", state_bits);
    println!(
        " Total: {} bits = {} bytes",
        total_extra + state_bits as u32,
        ((total_extra + state_bits as u32) + 7) / 8
    );
    assert_eq!(
        bitstream.len(),
        3,
        "Bitstream should be exactly 3 bytes for 1 sequence, got {}",
        bitstream.len()
    );

    println!("\n=== Comparing with reference ===");
    println!("Our bitstream: {:02x?}", bitstream);
    println!(
        "Our init states: LL={}, OF={}, ML={}",
        init_ll, init_of, init_ml
    );
    // Each initial state must sit on a decode-table slot holding its code.
    let ll_sym = ll_table.decode(init_ll as usize).symbol;
    let of_sym = of_table.decode(init_of as usize).symbol;
    let ml_sym = ml_table.decode(init_ml as usize).symbol;
    println!(
        "Symbols at our states: LL={}, OF={}, ML={}",
        ll_sym, of_sym, ml_sym
    );
    println!(
        "Expected symbols: LL={}, OF={}, ML={}",
        ll_code, of_code, ml_code
    );
    assert_eq!(
        ll_sym, ll_code,
        "LL init state {} decodes to {} instead of {}",
        init_ll, ll_sym, ll_code
    );
    assert_eq!(
        of_sym, of_code,
        "OF init state {} decodes to {} instead of {}",
        init_of, of_sym, of_code
    );
    assert_eq!(
        ml_sym, ml_code,
        "ML init state {} decodes to {} instead of {}",
        init_ml, ml_sym, ml_code
    );

    println!("\n=== Decoding reference bitstream ===");
    let ref_bitstream = vec![0xfd, 0xe4, 0x88];
    use crate::fse::{BitReader, FseDecoder};
    let mut bits = BitReader::new(&ref_bitstream);
    bits.init_from_end().unwrap();
    let mut ll_dec = FseDecoder::new(&ll_table);
    let mut of_dec = FseDecoder::new(&of_table);
    let mut ml_dec = FseDecoder::new(&ml_table);
    // Initial states are read in the order LL, OF, ML.
    ll_dec.init_state(&mut bits).unwrap();
    of_dec.init_state(&mut bits).unwrap();
    ml_dec.init_state(&mut bits).unwrap();
    let ref_ll_state = ll_dec.state();
    let ref_of_state = of_dec.state();
    let ref_ml_state = ml_dec.state();
    println!(
        "Reference init states: LL={}, OF={}, ML={}",
        ref_ll_state, ref_of_state, ref_ml_state
    );
    let ref_ll_sym = ll_table.decode(ref_ll_state).symbol;
    let ref_of_sym = of_table.decode(ref_of_state).symbol;
    let ref_ml_sym = ml_table.decode(ref_ml_state).symbol;
    println!(
        "Reference symbols: LL={}, OF={}, ML={}",
        ref_ll_sym, ref_of_sym, ref_ml_sym
    );
    let remaining_bits = bits.bits_remaining();
    println!("Remaining bits after init states: {}", remaining_bits);

    // Extra bits follow the states in the order offset, match length, literal
    // length. No literal-length bits are read, as before.
    let of_extra = bits.read_bits(2).unwrap();
    let ml_extra = bits.read_bits(4).unwrap();
    let ll_extra = 0;
    println!(
        "Reference extra bits: LL={}, ML={}, OF={}",
        ll_extra, ml_extra, of_extra
    );
    println!("Expected extra bits: LL=0, ML=13, OF=3");
    let ref_ml = 83 + ml_extra;
    println!("Reference match_length = 83 + {} = {}", ml_extra, ref_ml);
    // OF code 2 is not a repeat offset: its value is (1 << 2) + extra, and
    // values above 3 are real offsets stored with a bias of 3.
    let ref_offset_value = 4 + of_extra;
    println!(
        "Reference offset_value = (1 << 2) + {} = {}",
        of_extra, ref_offset_value
    );
    println!(
        "Offset values above 3 carry a +3 bias, so the match offset is {}",
        ref_offset_value - 3
    );
}
/// Dumps the layout of a frame produced by the reference `zstd` encoder.
///
/// 100 bytes of repeating `ABCD` are compressed at level 1 and the frame
/// header, first block header, literals section header and the start of the
/// sequences section are printed for manual comparison. The test is purely
/// diagnostic: it makes no assertions and only fails if the reference encoder
/// errors or the dump indexes past the end of the frame.
#[test]
fn test_compare_with_reference_bitstream() {
    let data: Vec<u8> = b"ABCD".iter().cycle().take(100).copied().collect();
    let ref_compressed = zstd::encode_all(data.as_slice(), 1).unwrap();
    println!(
        "Reference compressed ({} bytes): {:02x?}",
        ref_compressed.len(),
        ref_compressed
    );

    let magic = u32::from_le_bytes([
        ref_compressed[0],
        ref_compressed[1],
        ref_compressed[2],
        ref_compressed[3],
    ]);
    println!("Magic: 0x{:08x}", magic);

    // Frame header descriptor: bits 7-6 pick the content-size width, bit 5 is
    // the single-segment flag (which also removes the window descriptor).
    let fhd = ref_compressed[4];
    println!("FHD: 0x{:02x}", fhd);
    let content_size_flag = (fhd >> 6) & 0x03;
    let single_segment_flag = (fhd >> 5) & 0x01;
    let window_desc_size = if single_segment_flag == 0 { 1 } else { 0 };
    let content_size_bytes = match (content_size_flag, single_segment_flag) {
        (0, 1) => 1,
        (0, 0) => 0,
        (1, _) => 2,
        (2, _) => 4,
        (3, _) => 8,
        _ => 0,
    };
    // NOTE(review): a Dictionary_ID field is not accounted for here; this
    // presumably relies on the reference encoder not emitting one — confirm.
    let frame_header_size = 1 + window_desc_size + content_size_bytes;
    println!(
        "Frame header: FHD=1 + Window_Desc={} + Content_Size={} = {} bytes",
        window_desc_size, content_size_bytes, frame_header_size
    );

    // 3-byte little-endian block header: bit 0 = last block, bits 1-2 = type,
    // remaining 21 bits = block size.
    let block_start = 4 + frame_header_size;
    let block_header = u32::from_le_bytes([
        ref_compressed[block_start],
        ref_compressed[block_start + 1],
        ref_compressed[block_start + 2],
        0,
    ]);
    let block_type = (block_header >> 1) & 0x03;
    let block_size = (block_header >> 3) as usize;
    println!("Block header: 0x{:06x}", block_header);
    println!("Block type: {} (0=raw, 1=rle, 2=compressed)", block_type);
    println!("Block size: {} bytes", block_size);
    if block_type != 2 {
        return;
    }

    let block_content_start = block_start + 3;
    let block_content = &ref_compressed[block_content_start..block_content_start + block_size];
    println!(
        "Block content ({} bytes): {:02x?}",
        block_content.len(),
        block_content
    );

    let lit_header = block_content[0];
    let lit_type = lit_header & 0x03;
    println!("Literals header: 0x{:02x}, type={}", lit_header, lit_type);

    // Size_Format (bits 2-3) decides how many header bytes follow and how wide
    // the size fields are. The previous check on `lit_header < 128` misread
    // any 1-byte header with a size of 16 or more and any multi-byte header
    // whose first byte was below 128.
    let size_format = (lit_header >> 2) & 0x03;
    // (regenerated size, header length, bytes of literal payload after the header)
    let (lit_block_size, lit_header_size, lit_payload_size) = match lit_type {
        // Raw (0) and RLE (1) literals carry only a regenerated size.
        0 | 1 => {
            let (regenerated, header_len) = match size_format {
                // Size_Format 00 or 10: one byte, 5-bit size.
                0 | 2 => ((lit_header >> 3) as usize, 1),
                // Size_Format 01: two bytes, 12-bit size.
                1 => (
                    ((lit_header as usize) >> 4) + ((block_content[1] as usize) << 4),
                    2,
                ),
                // Size_Format 11: three bytes, 20-bit size.
                _ => (
                    ((lit_header as usize) >> 4)
                        + ((block_content[1] as usize) << 4)
                        + ((block_content[2] as usize) << 12),
                    3,
                ),
            };
            // An RLE literals section stores a single byte regardless of size.
            let payload = if lit_type == 1 { 1 } else { regenerated };
            (regenerated, header_len, payload)
        }
        // Compressed (2) and treeless (3) literals carry a regenerated size
        // followed by a compressed size of the same bit width.
        _ => {
            let (header_len, size_bits) = match size_format {
                0 | 1 => (3usize, 10u32),
                2 => (4, 14),
                _ => (5, 18),
            };
            // Assemble the header bytes as one little-endian integer.
            let raw = block_content[..header_len]
                .iter()
                .rev()
                .fold(0u64, |acc, &byte| (acc << 8) | u64::from(byte));
            let mask = (1u64 << size_bits) - 1;
            let regenerated = ((raw >> 4) & mask) as usize;
            let compressed = ((raw >> (4 + size_bits)) & mask) as usize;
            (regenerated, header_len, compressed)
        }
    };
    println!(
        "Literals block: type={}, size={} bytes, header={} bytes",
        lit_type, lit_block_size, lit_header_size
    );

    let seq_start = lit_header_size + lit_payload_size;
    println!("Sequences start at offset: {}", seq_start);
    if seq_start >= block_content.len() {
        return;
    }

    // Non-empty by construction: seq_start is strictly inside block_content.
    let seq_section = &block_content[seq_start..];
    println!(
        "Sequences section ({} bytes): {:02x?}",
        seq_section.len(),
        seq_section
    );

    // NOTE(review): only the first byte is read as the sequence count, so
    // counts of 128 or more (multi-byte encoding) are not decoded here.
    let seq_count = seq_section[0];
    println!("Sequence count: {}", seq_count);
    if seq_count == 0 || seq_section.len() <= 1 {
        return;
    }

    let mode = seq_section[1];
    println!("Mode byte: 0x{:02x}", mode);
    // Skip the count and mode bytes; for a non-zero mode byte three further
    // bytes are skipped. NOTE(review): that is only right when each table
    // description occupies exactly one byte — confirm before relying on it.
    let bitstream_start = if mode == 0 { 2 } else { 2 + 3 };
    if bitstream_start < seq_section.len() {
        let bitstream = &seq_section[bitstream_start..];
        println!(
            "FSE bitstream ({} bytes): {:02x?}",
            bitstream.len(),
            bitstream
        );
    }
}
}
#[cfg(test)]
mod internal_roundtrip_tests {
use super::*;
use haagenti_core::{Compressor, Decompressor};
/// Round-trips 500 bytes of repeating `ABCD` through `ZstdCompressor` and
/// `ZstdDecompressor`, printing the intermediate sizes and bytes.
///
/// Panics if decompression fails or the output differs from the input.
#[test]
fn test_internal_roundtrip_500() {
    let data: Vec<u8> = b"ABCD".repeat(125);
    println!("=== Internal Roundtrip Test (500 bytes) ===");
    println!("Input: {} bytes", data.len());

    let compressor = ZstdCompressor::new();
    let compressed = compressor.compress(&data).expect("compress failed");
    println!("Compressed: {} bytes", compressed.len());
    println!("Compressed bytes: {:02x?}", &compressed);

    let decompressor = ZstdDecompressor::new();
    let decompressed = match decompressor.decompress(&compressed) {
        Ok(bytes) => bytes,
        Err(e) => {
            println!("FAILED: Our decoder failed: {:?}", e);
            panic!("Internal roundtrip failed");
        }
    };

    println!("Decompressed: {} bytes", decompressed.len());
    if decompressed != data {
        // Show the head of both buffers before the assertion below fires.
        println!("MISMATCH!");
        println!("First 20 original: {:?}", &data[..20]);
        let shown = 20.min(decompressed.len());
        println!("First 20 decoded: {:?}", &decompressed[..shown]);
    } else {
        println!("SUCCESS! Internal roundtrip works!");
    }
    assert_eq!(decompressed, data);
}
/// Cross-checks the predefined match-length FSE table against
/// `MATCH_LENGTH_BASELINE`.
///
/// For each of the 64 states, the entry's `seq_base` and `seq_extra_bits` must
/// equal the baseline tuple for the entry's symbol. A handful of states are
/// also printed, and the table must not decode every state to symbol 0.
#[test]
fn test_debug_ml_table_symbols() {
    use crate::block::MATCH_LENGTH_BASELINE;
    use crate::fse::{FseTable, MATCH_LENGTH_ACCURACY_LOG, MATCH_LENGTH_DEFAULT_DISTRIBUTION};

    let ml_table = FseTable::from_predefined(
        &MATCH_LENGTH_DEFAULT_DISTRIBUTION,
        MATCH_LENGTH_ACCURACY_LOG,
    )
    .unwrap();
    println!("=== ML Table Symbols Debug ===");

    // Count (and report) every state whose table entry disagrees with the baseline.
    let mismatches = (0usize..64)
        .filter(|&state| {
            let entry = ml_table.decode(state);
            let symbol = entry.symbol as usize;
            // Symbols outside the baseline table are not compared.
            if symbol >= MATCH_LENGTH_BASELINE.len() {
                return false;
            }
            let (expected_bits, expected_base) = MATCH_LENGTH_BASELINE[symbol];
            let differs =
                entry.seq_base != expected_base || entry.seq_extra_bits != expected_bits;
            if differs {
                println!("MISMATCH State {}: symbol={}", state, symbol);
                println!(
                    " Table: seq_base={}, seq_extra_bits={}",
                    entry.seq_base, entry.seq_extra_bits
                );
                println!(
                    " MATCH_LENGTH_BASELINE[{}]: baseline={}, bits={}",
                    symbol, expected_base, expected_bits
                );
            }
            differs
        })
        .count();
    println!("\nTotal mismatches: {}", mismatches);

    // Spot-print a fixed selection of states.
    for &state in &[19usize, 41, 42, 43, 44, 45, 62, 63] {
        let entry = ml_table.decode(state);
        println!(
            "State {}: symbol={}, seq_base={}, seq_extra_bits={}",
            state, entry.symbol, entry.seq_base, entry.seq_extra_bits
        );
        let symbol = entry.symbol as usize;
        if symbol < MATCH_LENGTH_BASELINE.len() {
            let (bits, base) = MATCH_LENGTH_BASELINE[symbol];
            println!(" Expected: baseline={}, bits={}", base, bits);
        }
    }

    let all_zero = (0usize..64).all(|state| ml_table.decode(state).symbol == 0);
    assert!(!all_zero, "ML table has all symbol=0, which is wrong!");
    assert_eq!(
        mismatches, 0,
        "Found {} mismatches between table and MATCH_LENGTH_BASELINE",
        mismatches
    );
}
}
#[cfg(test)]
mod ref_decode_tests {
use super::*;
use haagenti_core::Decompressor;
/// Hand-decodes a fixed 4-byte sequence bitstream with the predefined FSE
/// tables and checks the resulting literal and match lengths.
///
/// The bit fields are first sliced out manually for display, then the same
/// bytes are run through `BitReader`/`FseDecoder`. The decoded sequence must
/// have a literal length of 4 and a match length of 496.
#[test]
fn test_trace_reference_bitstream() {
    use crate::block::{LITERAL_LENGTH_BASELINE, MATCH_LENGTH_BASELINE};
    use crate::fse::{
        BitReader, FseDecoder, FseTable, LITERAL_LENGTH_ACCURACY_LOG,
        LITERAL_LENGTH_DEFAULT_DISTRIBUTION, MATCH_LENGTH_ACCURACY_LOG,
        MATCH_LENGTH_DEFAULT_DISTRIBUTION, OFFSET_ACCURACY_LOG, OFFSET_DEFAULT_DISTRIBUTION,
    };

    let fse_bytes: [u8; 4] = [0xed, 0xab, 0x8e, 0x08];
    println!("=== Trace Reference Bitstream ===");
    println!("Bytes: {:02x?}", fse_bytes);
    let value = u32::from_le_bytes(fse_bytes);
    println!("As u32 LE: 0x{:08x} = {:032b}", value, value);
    // Index of the highest set bit.
    let sentinel_pos = u32::BITS - 1 - value.leading_zeros();
    println!("Sentinel at bit {}", sentinel_pos);

    // Extracts `width` bits of `value` starting at bit `shift`.
    let field = |shift: u32, width: u32| (value >> shift) & ((1u32 << width) - 1);
    let ll_state_bits = field(21, 6);
    let of_state_bits = field(16, 5);
    let ml_state_bits = field(10, 6);
    println!("Manual extraction (assuming sentinel at 27):");
    println!(" LL bits 26-21: {:06b} = {}", ll_state_bits, ll_state_bits);
    println!(" OF bits 20-16: {:05b} = {}", of_state_bits, of_state_bits);
    println!(" ML bits 15-10: {:06b} = {}", ml_state_bits, ml_state_bits);

    let ll_table = FseTable::from_predefined(
        &LITERAL_LENGTH_DEFAULT_DISTRIBUTION,
        LITERAL_LENGTH_ACCURACY_LOG,
    )
    .unwrap();
    let of_table =
        FseTable::from_predefined(&OFFSET_DEFAULT_DISTRIBUTION, OFFSET_ACCURACY_LOG).unwrap();
    let ml_table = FseTable::from_predefined(
        &MATCH_LENGTH_DEFAULT_DISTRIBUTION,
        MATCH_LENGTH_ACCURACY_LOG,
    )
    .unwrap();

    let mut ll_decoder = FseDecoder::new(&ll_table);
    let mut of_decoder = FseDecoder::new(&of_table);
    let mut ml_decoder = FseDecoder::new(&ml_table);

    // Initial states are read in the order LL, OF, ML.
    let mut bits = BitReader::new(&fse_bytes);
    bits.init_from_end().expect("init_from_end");
    ll_decoder.init_state(&mut bits).expect("ll init");
    of_decoder.init_state(&mut bits).expect("of init");
    ml_decoder.init_state(&mut bits).expect("ml init");
    let (ll_state, of_state, ml_state) =
        (ll_decoder.state(), of_decoder.state(), ml_decoder.state());
    println!(
        "Initial states: LL={}, OF={}, ML={}",
        ll_state, of_state, ml_state
    );

    let ll_code = ll_decoder.peek_symbol();
    let of_code = of_decoder.peek_symbol();
    let ml_code = ml_decoder.peek_symbol();
    println!(
        "Symbols: LL_code={}, OF_code={}, ML_code={}",
        ll_code, of_code, ml_code
    );

    // Look each code up once; codes outside the baseline table contribute
    // zero extra bits and a zero baseline.
    let ll_entry = (ll_code < LITERAL_LENGTH_BASELINE.len() as u8)
        .then(|| LITERAL_LENGTH_BASELINE[ll_code as usize]);
    let ml_entry = (ml_code < MATCH_LENGTH_BASELINE.len() as u8)
        .then(|| MATCH_LENGTH_BASELINE[ml_code as usize]);
    let ll_bits = ll_entry.map_or(0, |entry| entry.0);
    let ml_bits = ml_entry.map_or(0, |entry| entry.0);
    // The offset code doubles as its extra-bit count when below 32.
    let of_bits = if of_code < 32 { of_code } else { 0 };
    println!(
        "Extra bits needed: LL={}, ML={}, OF={}",
        ll_bits, ml_bits, of_bits
    );

    // Extra bits are read in the order LL, ML, OF after switching modes.
    bits.switch_to_lsb_mode().expect("switch");
    let ll_extra = if ll_bits > 0 {
        bits.read_bits(ll_bits as usize).expect("ll extra")
    } else {
        0
    };
    let ml_extra = if ml_bits > 0 {
        bits.read_bits(ml_bits as usize).expect("ml extra")
    } else {
        0
    };
    let of_extra = if of_bits > 0 {
        bits.read_bits(of_bits as usize).expect("of extra")
    } else {
        0
    };
    println!(
        "Extra bits values: LL={}, ML={}, OF={}",
        ll_extra, ml_extra, of_extra
    );

    let ll_baseline = ll_entry.map_or(0, |entry| entry.1);
    let ml_baseline = ml_entry.map_or(0, |entry| entry.1);
    let literal_length = ll_baseline + ll_extra;
    let match_length = ml_baseline + ml_extra;
    let offset_value = (1u32 << of_code) + of_extra;
    println!(
        "Decoded: literal_length={}, match_length={}, offset_value={}",
        literal_length, match_length, offset_value
    );
    println!(
        "Total output would be: {} literals + {} match = {}",
        literal_length,
        match_length,
        literal_length + match_length
    );

    assert_eq!(literal_length, 4, "literal_length");
    assert_eq!(match_length, 496, "match_length should be 496");
}
/// Decompresses a fixed 20-byte reference frame and expects 125 repetitions
/// of `ABCD` (500 bytes).
///
/// Panics if decompression fails or the output differs from the expectation.
#[test]
fn test_decode_reference_500() {
    // The frame starts with the little-endian zstd magic number and ends with
    // the same four bitstream bytes traced by `test_trace_reference_bitstream`.
    let ref_compressed: [u8; 20] = [
        0x28, 0xb5, 0x2f, 0xfd, // magic
        0x00, 0x48, 0x5d, 0x00, 0x00, 0x20, //
        0x41, 0x42, 0x43, 0x44, // "ABCD"
        0x01, 0x00, //
        0xed, 0xab, 0x8e, 0x08, // sequence bitstream
    ];
    println!("=== Test Decode Reference 500 ===");
    println!("Reference compressed: {} bytes", ref_compressed.len());
    println!("Bytes: {:02x?}", ref_compressed);

    let decompressor = ZstdDecompressor::new();
    let decompressed = match decompressor.decompress(&ref_compressed) {
        Ok(bytes) => bytes,
        Err(e) => {
            println!("FAILED: {:?}", e);
            panic!("Failed to decompress reference");
        }
    };

    let expected = "ABCD".repeat(125);
    println!("Decompressed: {} bytes", decompressed.len());
    if decompressed != expected.as_bytes() {
        // Show the head of both buffers before the assertion below fires.
        println!("MISMATCH!");
        println!("First 20 expected: {:?}", &expected.as_bytes()[..20]);
        let shown = 20.min(decompressed.len());
        println!("First 20 got: {:?}", &decompressed[..shown]);
    } else {
        println!("SUCCESS! Reference decompression matches!");
    }
    assert_eq!(decompressed, expected.as_bytes());
}
}
#[cfg(test)]
mod throughput_tests {
use super::*;
use std::time::Instant;
/// Returns exactly `size` bytes of repetitive text.
///
/// The output is the endless concatenation of three fixed sentences, taken in
/// order and repeated, cut off after `size` bytes. `size == 0` yields an empty
/// vector.
fn generate_compressible_data(size: usize) -> Vec<u8> {
    const PATTERNS: [&[u8]; 3] = [
        b"The quick brown fox jumps over the lazy dog. ",
        b"Lorem ipsum dolor sit amet, consectetur adipiscing elit. ",
        b"Pack my box with five dozen liquor jugs. ",
    ];

    let mut data = Vec::with_capacity(size);
    data.extend(
        PATTERNS
            .iter()
            .cycle()
            .flat_map(|pattern| pattern.iter().copied())
            .take(size),
    );
    data
}
/// Times 100 compressions of `64 * 1024` bytes of generated text and prints
/// the throughput in MB/s (1 MB = 1,000,000 bytes).
///
/// Only asserts that the measured throughput is positive.
#[test]
fn test_64kb_compression_throughput() {
    let data = generate_compressible_data(64 * 1024);
    let compressor = ZstdCompressor::new();

    let start = Instant::now();
    let iterations = 100;
    (0..iterations).for_each(|_| drop(compressor.compress(&data).unwrap()));
    let elapsed = start.elapsed();

    let total_bytes = iterations as f64 * data.len() as f64;
    let throughput_mbs = total_bytes / elapsed.as_secs_f64() / 1_000_000.0;
    assert!(
        throughput_mbs > 0.0,
        "64KB throughput: {:.1} MB/s",
        throughput_mbs
    );
    println!("64KB compression throughput: {:.1} MB/s", throughput_mbs);
}
/// Times 20 compressions of `1024 * 1024` bytes of generated text and prints
/// the throughput in MB/s (1 MB = 1,000,000 bytes).
///
/// Only asserts that the measured throughput is positive.
#[test]
fn test_1mb_compression_throughput() {
    let data = generate_compressible_data(1024 * 1024);
    let compressor = ZstdCompressor::new();

    let start = Instant::now();
    let iterations = 20;
    (0..iterations).for_each(|_| drop(compressor.compress(&data).unwrap()));
    let elapsed = start.elapsed();

    let total_bytes = iterations as f64 * data.len() as f64;
    let throughput_mbs = total_bytes / elapsed.as_secs_f64() / 1_000_000.0;
    assert!(
        throughput_mbs > 0.0,
        "1MB throughput: {:.1} MB/s",
        throughput_mbs
    );
    println!("1MB compression throughput: {:.1} MB/s", throughput_mbs);
}
/// Times 50 decompressions of a compressed `1024 * 1024`-byte text buffer and
/// prints the throughput in MB/s, measured against the uncompressed size.
///
/// Only asserts that the measured throughput is positive.
#[test]
fn test_decompression_throughput() {
    let data = generate_compressible_data(1024 * 1024);
    let compressed = ZstdCompressor::new().compress(&data).unwrap();
    let decompressor = ZstdDecompressor::new();

    let start = Instant::now();
    let iterations = 50;
    (0..iterations).for_each(|_| drop(decompressor.decompress(&compressed).unwrap()));
    let elapsed = start.elapsed();

    // Throughput is reported in terms of bytes produced, not bytes consumed.
    let total_bytes = iterations as f64 * data.len() as f64;
    let throughput_mbs = total_bytes / elapsed.as_secs_f64() / 1_000_000.0;
    assert!(
        throughput_mbs > 0.0,
        "Decompression throughput: {:.1} MB/s",
        throughput_mbs
    );
    println!("Decompression throughput: {:.1} MB/s", throughput_mbs);
}
/// Measures compression cost per byte at four input sizes and checks that the
/// largest input is not disproportionately slower than the smallest.
///
/// Passes when the largest size costs less than five times the smallest per
/// byte, or less than 100 ns per byte in absolute terms.
#[test]
fn test_adaptive_search_depth_scaling() {
    let compressor = ZstdCompressor::new();
    let sizes = [4096usize, 16384, 65536, 262144];

    let times_per_byte: Vec<(usize, f64)> = sizes
        .iter()
        .map(|&size| {
            let data = generate_compressible_data(size);
            let start = Instant::now();
            // Roughly 1 MB of total input per size, but at least one pass.
            let iterations = (1_000_000 / size).max(1);
            for _ in 0..iterations {
                drop(compressor.compress(&data).unwrap());
            }
            let elapsed = start.elapsed();
            let ns_per_byte = elapsed.as_nanos() as f64 / (iterations * size) as f64;
            (size, ns_per_byte)
        })
        .collect();

    let (_, small_time) = times_per_byte[0];
    let (_, large_time) = times_per_byte[3];
    assert!(
        large_time < small_time * 5.0 || large_time < 100.0,
        "Large data too slow: {:.2} ns/byte vs {:.2} ns/byte for small",
        large_time,
        small_time
    );
}
/// Compresses `256 * 1024` bytes of generated text at the `Fast`, `Default`
/// and `Best` levels, ten times each.
///
/// Asserts that the `Fast` and `Best` throughputs are positive and that `Best`
/// produces output no larger than `Fast`.
#[test]
fn test_throughput_vs_level_tradeoff() {
    let data = generate_compressible_data(256 * 1024);
    let levels = [
        CompressionLevel::Fast,
        CompressionLevel::Default,
        CompressionLevel::Best,
    ];

    // One (level, MB/s, compressed size) record per level, in `levels` order.
    let results: Vec<(CompressionLevel, f64, usize)> = levels
        .iter()
        .map(|&level| {
            let compressor = ZstdCompressor::with_level(level);
            let iterations = 10;
            let start = Instant::now();
            let mut compressed_size = 0;
            for _ in 0..iterations {
                compressed_size = compressor.compress(&data).unwrap().len();
            }
            let elapsed = start.elapsed();
            let total_bytes = iterations as f64 * data.len() as f64;
            let throughput_mbs = total_bytes / elapsed.as_secs_f64() / 1_000_000.0;
            (level, throughput_mbs, compressed_size)
        })
        .collect();

    let (_, fast_throughput, fast_size) = results[0];
    let (_, best_throughput, best_size) = results[2];
    assert!(fast_throughput > 0.0, "Fast throughput should be positive");
    assert!(best_throughput > 0.0, "Best throughput should be positive");
    assert!(
        best_size <= fast_size,
        "Best should compress at least as well: best={} fast={}",
        best_size,
        fast_size
    );
}
/// Checks that generated text compresses to a better ratio than a
/// same-sized arithmetic byte sequence.
///
/// Both inputs are `64 * 1024` bytes; the second is the low byte of
/// `17 * i + i * i` for each index `i`.
#[test]
fn test_compression_efficiency_binary_vs_text() {
    // Low byte of 17*i + i*i, computed with wrapping arithmetic.
    let pseudo_byte = |i: u64| (i.wrapping_mul(i).wrapping_add(i.wrapping_mul(17)) & 0xFF) as u8;
    let ratio_of = |raw_len: usize, packed_len: usize| raw_len as f64 / packed_len as f64;

    let text_data = generate_compressible_data(64 * 1024);
    let binary_data: Vec<u8> = (0u64..64 * 1024).map(pseudo_byte).collect();

    let compressor = ZstdCompressor::new();
    let text_compressed = compressor.compress(&text_data).unwrap();
    let binary_compressed = compressor.compress(&binary_data).unwrap();

    let text_ratio = ratio_of(text_data.len(), text_compressed.len());
    let binary_ratio = ratio_of(binary_data.len(), binary_compressed.len());
    assert!(
        text_ratio > binary_ratio,
        "Text ratio {:.2}x should be better than binary {:.2}x",
        text_ratio,
        binary_ratio
    );
}
/// Round-trips `512 * 1024` bytes of generated text and checks that both the
/// length and the content survive unchanged.
#[test]
fn test_roundtrip_preserves_data_large() {
    let data = generate_compressible_data(512 * 1024);
    let (compressor, decompressor) = (ZstdCompressor::new(), ZstdDecompressor::new());

    let compressed = compressor.compress(&data).unwrap();
    let decompressed = decompressor.decompress(&compressed).unwrap();

    // Length is checked first so a size mismatch gets its own message.
    assert_eq!(
        data.len(),
        decompressed.len(),
        "Large data roundtrip size mismatch"
    );
    assert_eq!(data, decompressed, "Large data roundtrip content mismatch");
}
/// Compresses `1024 * 1024` bytes of generated text, requires a compression
/// ratio above 1.5x, and verifies the data round-trips.
#[test]
fn test_memory_efficiency_large_data() {
    let data = generate_compressible_data(1024 * 1024);

    let compressor = ZstdCompressor::new();
    let compressed = compressor.compress(&data).unwrap();
    let original_len = data.len() as f64;
    let ratio = original_len / compressed.len() as f64;
    assert!(
        ratio > 1.5,
        "1MB text should compress at least 1.5x, got {:.2}x",
        ratio
    );

    let decompressor = ZstdDecompressor::new();
    let decompressed = decompressor.decompress(&compressed).unwrap();
    assert_eq!(data, decompressed);
}
}