use crate::error::{Result, ZiporaError};
use once_cell::sync::OnceCell;
use std::cmp::Reverse;
use std::collections::{BinaryHeap, HashMap};
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum InterleavingFactor {
X1,
X2,
X4,
X8,
}
impl InterleavingFactor {
#[inline]
pub const fn streams(&self) -> usize {
match self {
Self::X1 => 1,
Self::X2 => 2,
Self::X4 => 4,
Self::X8 => 8,
}
}
#[cfg(target_arch = "x86_64")]
#[inline]
pub fn has_simd_support(&self) -> bool {
match self {
Self::X4 | Self::X8 => is_x86_feature_detected!("avx2"),
_ => false,
}
}
#[cfg(not(target_arch = "x86_64"))]
#[inline]
pub fn has_simd_support(&self) -> bool {
false
}
}
impl Default for InterleavingFactor {
fn default() -> Self {
Self::X1
}
}
#[derive(Debug, Clone, Copy, Default)]
#[repr(C)]
pub struct HuffmanEncSymbol {
pub bits: u16,
pub bit_count: u16,
}
impl HuffmanEncSymbol {
#[inline(always)]
pub const fn new(bits: u16, bit_count: u16) -> Self {
Self { bits, bit_count }
}
}
type FastSymbolTable = Box<[[HuffmanEncSymbol; 256]; 257]>;
#[derive(Debug)]
struct BitStreamWriter {
buffer: Vec<u8>,
current: u64,
bit_count: usize,
}
impl BitStreamWriter {
fn new() -> Self {
Self {
buffer: Vec::new(),
current: 0,
bit_count: 0,
}
}
#[inline]
fn write(&mut self, bits: u64, count: usize) {
debug_assert!(count <= 64);
self.current |= bits << self.bit_count;
self.bit_count += count;
while self.bit_count >= 8 {
self.buffer.push(self.current as u8);
self.current >>= 8;
self.bit_count -= 8;
}
}
fn finish(mut self) -> Vec<u8> {
if self.bit_count > 0 {
self.buffer.push(self.current as u8);
}
self.buffer
}
#[inline]
fn len_bits(&self) -> usize {
self.buffer.len() * 8 + self.bit_count
}
}
#[derive(Debug)]
struct BitStreamReader<'a> {
data: &'a [u8],
current: u64,
bit_count: usize,
byte_pos: usize,
}
impl<'a> BitStreamReader<'a> {
fn new(data: &'a [u8]) -> Self {
let mut reader = Self {
data,
current: 0,
bit_count: 0,
byte_pos: 0,
};
reader.refill();
reader
}
#[inline]
fn refill(&mut self) {
while self.bit_count <= 56 && self.byte_pos < self.data.len() {
self.current |= (self.data[self.byte_pos] as u64) << self.bit_count;
self.bit_count += 8;
self.byte_pos += 1;
}
}
#[inline]
fn peek(&self, count: usize) -> u64 {
debug_assert!(count <= 64);
self.current & ((1u64 << count) - 1)
}
#[inline]
fn consume(&mut self, count: usize) {
debug_assert!(count <= self.bit_count);
self.current >>= count;
self.bit_count -= count;
}
#[inline]
fn read(&mut self, count: usize) -> u64 {
if count > self.bit_count {
self.refill();
}
let result = self.peek(count);
self.consume(count);
result
}
#[inline]
fn has_bits(&self) -> bool {
self.bit_count > 0 || self.byte_pos < self.data.len()
}
#[inline]
fn remaining_bits(&self) -> usize {
self.bit_count + (self.data.len() - self.byte_pos) * 8
}
}
#[derive(Debug, Clone, Copy)]
struct HuffmanSymbol {
bits: u64, bit_count: u8,
}
impl HuffmanSymbol {
#[inline]
fn new(bits: u64, bit_count: u8) -> Self {
Self { bits, bit_count }
}
}
#[derive(Debug, Clone, PartialEq, Eq)]
enum HuffmanNode {
Leaf {
symbol: u8,
frequency: u32,
},
Internal {
frequency: u32,
left: Box<HuffmanNode>,
right: Box<HuffmanNode>,
},
}
impl HuffmanNode {
fn frequency(&self) -> u32 {
match self {
HuffmanNode::Leaf { frequency, .. } => *frequency,
HuffmanNode::Internal { frequency, .. } => *frequency,
}
}
}
impl PartialOrd for HuffmanNode {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
Some(self.cmp(other))
}
}
impl Ord for HuffmanNode {
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
other.frequency().cmp(&self.frequency())
}
}
#[derive(Debug, Clone)]
pub struct HuffmanTree {
root: Option<HuffmanNode>,
codes: HashMap<u8, Vec<bool>>,
max_code_length: usize,
}
impl HuffmanTree {
pub fn from_frequencies(frequencies: &[u32; 256]) -> Result<Self> {
const MAX_CODE_LENGTH: usize = 64;
let mut heap = BinaryHeap::new();
let mut symbol_count = 0;
for (symbol, &freq) in frequencies.iter().enumerate() {
if freq > 0 {
heap.push(Reverse(HuffmanNode::Leaf {
symbol: symbol as u8,
frequency: freq,
}));
symbol_count += 1;
}
}
if symbol_count == 0 {
return Ok(Self {
root: None,
codes: HashMap::new(),
max_code_length: 0,
});
}
if symbol_count == 1 {
let node = heap.pop().expect("heap non-empty by loop invariant").0;
if let HuffmanNode::Leaf { symbol, .. } = node {
let mut codes = HashMap::new();
codes.insert(symbol, vec![false]); return Ok(Self {
root: Some(node),
codes,
max_code_length: 1,
});
}
}
while heap.len() > 1 {
let left = heap.pop().expect("heap has >= 2 nodes").0;
let right = heap.pop().expect("heap has >= 2 nodes").0;
let merged = HuffmanNode::Internal {
frequency: left.frequency() + right.frequency(),
left: Box::new(left),
right: Box::new(right),
};
heap.push(Reverse(merged));
}
let root = heap.pop().expect("heap has final root").0;
let mut codes = HashMap::new();
let mut max_code_length = 0;
Self::generate_codes(&root, Vec::new(), &mut codes, &mut max_code_length);
if max_code_length > MAX_CODE_LENGTH {
return Self::from_frequencies_fixed_length(frequencies);
}
Ok(Self {
root: Some(root),
codes,
max_code_length,
})
}
fn from_frequencies_fixed_length(frequencies: &[u32; 256]) -> Result<Self> {
let symbols: Vec<u8> = frequencies.iter()
.enumerate()
.filter(|(_, f)| **f > 0)
.map(|(s, _)| s as u8)
.collect();
if symbols.is_empty() {
return Ok(Self {
root: None,
codes: HashMap::new(),
max_code_length: 0,
});
}
let code_length = 8;
let mut codes = HashMap::new();
for (i, &symbol) in symbols.iter().enumerate() {
let mut code = Vec::with_capacity(code_length);
let mut val = i;
for _ in 0..code_length {
code.push((val & 1) != 0);
val >>= 1;
}
codes.insert(symbol, code);
}
let root = Self::build_decoding_tree_from_codes(&codes)?;
Ok(Self {
root,
codes,
max_code_length: code_length,
})
}
pub fn from_data(data: &[u8]) -> Result<Self> {
let mut frequencies = [0u32; 256];
for &byte in data {
frequencies[byte as usize] += 1;
}
Self::from_frequencies(&frequencies)
}
fn generate_codes(
node: &HuffmanNode,
code: Vec<bool>,
codes: &mut HashMap<u8, Vec<bool>>,
max_length: &mut usize,
) {
match node {
HuffmanNode::Leaf { symbol, .. } => {
*max_length = (*max_length).max(code.len());
codes.insert(*symbol, code);
}
HuffmanNode::Internal { left, right, .. } => {
let mut left_code = code.clone();
left_code.push(false);
Self::generate_codes(left, left_code, codes, max_length);
let mut right_code = code;
right_code.push(true);
Self::generate_codes(right, right_code, codes, max_length);
}
}
}
pub fn get_code(&self, symbol: u8) -> Option<&Vec<bool>> {
self.codes.get(&symbol)
}
pub fn max_code_length(&self) -> usize {
self.max_code_length
}
fn root(&self) -> Option<&HuffmanNode> {
self.root.as_ref()
}
pub fn serialize(&self) -> Vec<u8> {
let mut result = Vec::new();
let symbol_count = self.codes.len() as u16;
result.extend_from_slice(&symbol_count.to_le_bytes());
for (&symbol, code) in &self.codes {
result.push(symbol);
result.push(code.len() as u8);
let mut bit_index = 0;
let mut current_byte = 0u8;
for &bit in code {
if bit {
current_byte |= 1 << bit_index;
}
bit_index += 1;
if bit_index == 8 {
result.push(current_byte);
current_byte = 0;
bit_index = 0;
}
}
if bit_index > 0 {
result.push(current_byte);
}
}
result
}
pub fn deserialize(data: &[u8]) -> Result<Self> {
if data.len() < 2 {
return Err(ZiporaError::invalid_data("Huffman tree data too short"));
}
let symbol_count = u16::from_le_bytes([data[0], data[1]]) as usize;
let mut codes = HashMap::new();
let mut max_code_length = 0;
let mut offset = 2;
for _ in 0..symbol_count {
if offset + 2 > data.len() {
return Err(ZiporaError::invalid_data("Truncated Huffman tree data"));
}
let symbol = data[offset];
let code_length = data[offset + 1] as usize;
offset += 2;
max_code_length = max_code_length.max(code_length);
let byte_count = (code_length + 7) / 8;
if offset + byte_count > data.len() {
return Err(ZiporaError::invalid_data("Truncated Huffman code data"));
}
let mut code = Vec::with_capacity(code_length);
for i in 0..code_length {
let byte_offset = i / 8;
let bit_offset = i % 8;
let byte_value = data[offset + byte_offset];
let bit = (byte_value >> bit_offset) & 1 == 1;
code.push(bit);
}
codes.insert(symbol, code);
offset += byte_count;
}
let root = Self::build_decoding_tree_from_codes(&codes)?;
Ok(Self {
root,
codes,
max_code_length,
})
}
fn build_decoding_tree_from_codes(
codes: &HashMap<u8, Vec<bool>>,
) -> Result<Option<HuffmanNode>> {
if codes.is_empty() {
return Ok(None);
}
if codes.len() == 1 {
let (&symbol, _) = codes.iter().next().expect("at least one symbol in codes");
return Ok(Some(HuffmanNode::Leaf {
symbol,
frequency: 1, }));
}
let mut root = HuffmanNode::Internal {
frequency: 0,
left: Box::new(HuffmanNode::Leaf {
symbol: 0,
frequency: 0,
}),
right: Box::new(HuffmanNode::Leaf {
symbol: 0,
frequency: 0,
}),
};
for (&symbol, code) in codes {
Self::insert_code_into_tree(&mut root, symbol, code)?;
}
Ok(Some(root))
}
fn insert_code_into_tree(node: &mut HuffmanNode, symbol: u8, code: &[bool]) -> Result<()> {
if code.is_empty() {
*node = HuffmanNode::Leaf {
symbol,
frequency: 1, };
return Ok(());
}
match node {
HuffmanNode::Leaf { frequency: 0, .. } => {
let next_bit = code[0];
let remaining_code = &code[1..];
if remaining_code.is_empty() {
let leaf = HuffmanNode::Leaf {
symbol,
frequency: 1,
};
let placeholder = HuffmanNode::Leaf {
symbol: 0,
frequency: 0,
};
if next_bit {
*node = HuffmanNode::Internal {
frequency: 0,
left: Box::new(placeholder),
right: Box::new(leaf),
};
} else {
*node = HuffmanNode::Internal {
frequency: 0,
left: Box::new(leaf),
right: Box::new(placeholder),
};
}
} else {
let placeholder = HuffmanNode::Leaf {
symbol: 0,
frequency: 0,
};
if next_bit {
let mut right_child = HuffmanNode::Leaf {
symbol: 0,
frequency: 0,
};
Self::insert_code_into_tree(&mut right_child, symbol, remaining_code)?;
*node = HuffmanNode::Internal {
frequency: 0,
left: Box::new(placeholder),
right: Box::new(right_child),
};
} else {
let mut left_child = HuffmanNode::Leaf {
symbol: 0,
frequency: 0,
};
Self::insert_code_into_tree(&mut left_child, symbol, remaining_code)?;
*node = HuffmanNode::Internal {
frequency: 0,
left: Box::new(left_child),
right: Box::new(placeholder),
};
}
}
return Ok(());
}
HuffmanNode::Leaf { .. } => {
return Err(ZiporaError::invalid_data(
"Code collision: trying to overwrite existing symbol",
));
}
HuffmanNode::Internal { .. } => {
}
}
match node {
HuffmanNode::Internal { left, right, .. } => {
let next_bit = code[0];
let remaining_code = &code[1..];
if next_bit {
Self::insert_code_into_tree(right, symbol, remaining_code)?;
} else {
Self::insert_code_into_tree(left, symbol, remaining_code)?;
}
}
HuffmanNode::Leaf { .. } => {
return Err(ZiporaError::invalid_data(
"Unexpected leaf node during tree construction",
));
}
}
Ok(())
}
}
#[derive(Debug)]
pub struct HuffmanEncoder {
tree: HuffmanTree,
}
impl HuffmanEncoder {
pub fn new(data: &[u8]) -> Result<Self> {
let tree = HuffmanTree::from_data(data)?;
Ok(Self { tree })
}
pub fn from_frequencies(frequencies: &[u32; 256]) -> Result<Self> {
let tree = HuffmanTree::from_frequencies(frequencies)?;
Ok(Self { tree })
}
pub fn encode(&self, data: &[u8]) -> Result<Vec<u8>> {
if data.is_empty() {
return Ok(Vec::new());
}
let mut bits = Vec::new();
for &symbol in data {
if let Some(code) = self.tree.get_code(symbol) {
bits.extend_from_slice(code);
} else {
return Err(ZiporaError::invalid_data(format!(
"Symbol {} not in Huffman tree",
symbol
)));
}
}
let mut result = Vec::new();
let mut current_byte = 0u8;
let mut bit_count = 0;
for bit in bits {
if bit {
current_byte |= 1 << bit_count;
}
bit_count += 1;
if bit_count == 8 {
result.push(current_byte);
current_byte = 0;
bit_count = 0;
}
}
if bit_count > 0 {
result.push(current_byte);
}
Ok(result)
}
pub fn tree(&self) -> &HuffmanTree {
&self.tree
}
pub fn estimate_compression_ratio(&self, data: &[u8]) -> f64 {
if data.is_empty() {
return 0.0;
}
let mut total_bits = 0;
for &symbol in data {
if let Some(code) = self.tree.get_code(symbol) {
total_bits += code.len();
}
}
let compressed_bytes = (total_bits + 7) / 8;
compressed_bytes as f64 / data.len() as f64
}
}
#[derive(Debug)]
pub struct HuffmanDecoder {
tree: HuffmanTree,
}
impl HuffmanDecoder {
pub fn new(tree: HuffmanTree) -> Self {
Self { tree }
}
pub fn decode(&self, encoded_data: &[u8], output_length: usize) -> Result<Vec<u8>> {
if encoded_data.is_empty() || output_length == 0 {
return Ok(Vec::new());
}
let root = match self.tree.root() {
Some(root) => root,
None => return Err(ZiporaError::invalid_data("Empty Huffman tree")),
};
let mut result = Vec::with_capacity(output_length);
let mut current_node = root;
for &byte in encoded_data {
for bit_pos in 0..8 {
if result.len() >= output_length {
break;
}
let bit = (byte >> bit_pos) & 1 == 1;
match current_node {
HuffmanNode::Leaf { symbol, .. } => {
result.push(*symbol);
current_node = root;
current_node = match current_node {
HuffmanNode::Leaf { .. } => {
current_node
}
HuffmanNode::Internal { left, right, .. } => {
if bit {
right
} else {
left
}
}
};
}
HuffmanNode::Internal { left, right, .. } => {
current_node = if bit { right } else { left };
}
}
}
if result.len() >= output_length {
break;
}
}
if let HuffmanNode::Leaf { symbol, .. } = current_node {
if result.len() < output_length {
result.push(*symbol);
}
}
if result.len() != output_length {
return Err(ZiporaError::invalid_data(format!(
"Decoded length {} != expected {}",
result.len(),
output_length
)));
}
Ok(result)
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum HuffmanOrder {
Order0,
Order1,
Order2,
}
impl Default for HuffmanOrder {
fn default() -> Self {
Self::Order0
}
}
#[derive(Debug)]
pub struct ContextualHuffmanEncoder {
order: HuffmanOrder,
trees: Vec<HuffmanTree>,
context_map: HashMap<u32, usize>,
fast_symbol_table: OnceCell<FastSymbolTable>,
}
impl ContextualHuffmanEncoder {
pub fn new(data: &[u8], order: HuffmanOrder) -> Result<Self> {
match order {
HuffmanOrder::Order0 => Self::new_order0(data),
HuffmanOrder::Order1 => Self::new_order1(data),
HuffmanOrder::Order2 => Self::new_order2(data),
}
}
fn new_order0(data: &[u8]) -> Result<Self> {
let tree = HuffmanTree::from_data(data)?;
Ok(Self {
order: HuffmanOrder::Order0,
trees: vec![tree],
context_map: {
let mut map = HashMap::new();
map.insert(0, 0);
map
},
fast_symbol_table: OnceCell::new(),
})
}
fn new_order1(data: &[u8]) -> Result<Self> {
if data.len() < 2 {
return Self::new_order0(data);
}
let mut order0_freqs = [0u32; 256];
for &byte in data {
order0_freqs[byte as usize] += 1;
}
for freq in &mut order0_freqs {
if *freq == 0 {
*freq = 1;
}
}
let order0_tree = HuffmanTree::from_frequencies(&order0_freqs)?;
let mut trees = vec![order0_tree];
let mut context_map = HashMap::new();
let mut context_frequencies: HashMap<u8, [u32; 256]> = HashMap::new();
for i in 1..data.len() {
let context = data[i - 1];
let symbol = data[i];
let freqs = context_frequencies.entry(context).or_insert([0u32; 256]);
freqs[symbol as usize] += 1;
}
for (&context, context_freqs) in &context_frequencies {
let mut merged_freqs = [0u32; 256];
let context_total: u32 = context_freqs.iter().sum();
for symbol in 0..256 {
if context_freqs[symbol] > 0 {
merged_freqs[symbol] = context_freqs[symbol] * 100;
} else if order0_freqs[symbol] > 0 {
merged_freqs[symbol] = order0_freqs[symbol];
} else {
merged_freqs[symbol] = 1;
}
}
let symbol_count = merged_freqs.iter().filter(|&&f| f > 0).count();
if symbol_count > 0 {
let tree = HuffmanTree::from_frequencies(&merged_freqs)?;
context_map.insert(context as u32, trees.len());
trees.push(tree);
}
}
Ok(Self {
order: HuffmanOrder::Order1,
trees,
context_map,
fast_symbol_table: OnceCell::new(),
})
}
fn new_order2(data: &[u8]) -> Result<Self> {
if data.len() < 3 {
return Self::new_order1(data);
}
let mut order0_freqs = [0u32; 256];
for &byte in data {
order0_freqs[byte as usize] += 1;
}
for freq in &mut order0_freqs {
if *freq == 0 {
*freq = 1;
}
}
let order0_tree = HuffmanTree::from_frequencies(&order0_freqs)?;
let mut trees = vec![order0_tree];
let mut context_map = HashMap::new();
let mut context_frequencies: HashMap<u16, [u32; 256]> = HashMap::new();
for i in 2..data.len() {
let context = ((data[i - 2] as u16) << 8) | (data[i - 1] as u16);
let symbol = data[i];
let freqs = context_frequencies.entry(context).or_insert([0u32; 256]);
freqs[symbol as usize] += 1;
}
let mut contexts: Vec<_> = context_frequencies.into_iter().collect();
contexts.sort_by_key(|(_, freqs)| freqs.iter().sum::<u32>());
contexts.reverse();
let max_contexts = 1024.min(contexts.len());
for (context, context_freqs) in contexts.into_iter().take(max_contexts) {
let mut merged_freqs = [0u32; 256];
for symbol in 0..256 {
if context_freqs[symbol] > 0 {
merged_freqs[symbol] = context_freqs[symbol] * 100;
} else if order0_freqs[symbol] > 0 {
merged_freqs[symbol] = order0_freqs[symbol];
} else {
merged_freqs[symbol] = 1;
}
}
let symbol_count = merged_freqs.iter().filter(|&&f| f > 0).count();
if symbol_count > 0 {
let tree = HuffmanTree::from_frequencies(&merged_freqs)?;
context_map.insert(context as u32, trees.len());
trees.push(tree);
}
}
Ok(Self {
order: HuffmanOrder::Order2,
trees,
context_map,
fast_symbol_table: OnceCell::new(),
})
}
pub fn encode(&self, data: &[u8]) -> Result<Vec<u8>> {
if data.is_empty() {
return Ok(Vec::new());
}
let mut bits = Vec::new();
match self.order {
HuffmanOrder::Order0 => {
let tree = &self.trees[0];
for &symbol in data {
if let Some(code) = tree.get_code(symbol) {
bits.extend_from_slice(code);
} else {
return Err(ZiporaError::invalid_data(format!(
"Symbol {} not in Huffman tree", symbol
)));
}
}
}
HuffmanOrder::Order1 => {
let order0_tree = &self.trees[0];
if let Some(code) = order0_tree.get_code(data[0]) {
bits.extend_from_slice(code);
} else {
return Err(ZiporaError::invalid_data(format!(
"First symbol {} not in Order-0 tree", data[0]
)));
}
for i in 1..data.len() {
let context = data[i - 1] as u32;
let symbol = data[i];
if let Some(&tree_idx) = self.context_map.get(&context) {
let tree = &self.trees[tree_idx];
if let Some(code) = tree.get_code(symbol) {
bits.extend_from_slice(code);
continue;
}
}
if let Some(code) = self.trees[0].get_code(symbol) {
bits.extend_from_slice(code);
} else {
return Err(ZiporaError::invalid_data(format!(
"Symbol {} not found in any tree", symbol
)));
}
}
}
HuffmanOrder::Order2 => {
let order0_tree = &self.trees[0];
for i in 0..2.min(data.len()) {
if let Some(code) = order0_tree.get_code(data[i]) {
bits.extend_from_slice(code);
} else {
return Err(ZiporaError::invalid_data(format!(
"Symbol {} not in Order-0 tree", data[i]
)));
}
}
for i in 2..data.len() {
let context = ((data[i - 2] as u32) << 8) | (data[i - 1] as u32);
let symbol = data[i];
if let Some(&tree_idx) = self.context_map.get(&context) {
let tree = &self.trees[tree_idx];
if let Some(code) = tree.get_code(symbol) {
bits.extend_from_slice(code);
continue;
}
}
if let Some(code) = self.trees[0].get_code(symbol) {
bits.extend_from_slice(code);
} else {
return Err(ZiporaError::invalid_data(format!(
"Symbol {} not found in any tree", symbol
)));
}
}
}
}
let mut result = Vec::new();
let mut current_byte = 0u8;
let mut bit_count = 0;
for bit in bits {
if bit {
current_byte |= 1 << bit_count;
}
bit_count += 1;
if bit_count == 8 {
result.push(current_byte);
current_byte = 0;
bit_count = 0;
}
}
if bit_count > 0 {
result.push(current_byte);
}
Ok(result)
}
pub fn order(&self) -> HuffmanOrder {
self.order
}
pub fn tree_count(&self) -> usize {
self.trees.len()
}
pub fn estimate_compression_ratio(&self, data: &[u8]) -> f64 {
if data.is_empty() {
return 0.0;
}
let mut total_bits = 0;
match self.order {
HuffmanOrder::Order0 => {
let tree = &self.trees[0];
for &symbol in data {
if let Some(code) = tree.get_code(symbol) {
total_bits += code.len();
}
}
}
HuffmanOrder::Order1 => {
if let Some(tree) = self.trees.first() {
if let Some(code) = tree.get_code(data[0]) {
total_bits += code.len();
}
}
for i in 1..data.len() {
let context = data[i - 1] as u32;
let symbol = data[i];
let tree_idx = self.context_map.get(&context).copied().unwrap_or(0);
let tree = &self.trees[tree_idx];
if let Some(code) = tree.get_code(symbol) {
total_bits += code.len();
} else if let Some(code) = self.trees[0].get_code(symbol) {
total_bits += code.len();
}
}
}
HuffmanOrder::Order2 => {
for i in 0..2.min(data.len()) {
if let Some(tree) = self.trees.first() {
if let Some(code) = tree.get_code(data[i]) {
total_bits += code.len();
}
}
}
for i in 2..data.len() {
let context = ((data[i - 2] as u32) << 8) | (data[i - 1] as u32);
let symbol = data[i];
let tree_idx = self.context_map.get(&context).copied().unwrap_or(0);
let tree = &self.trees[tree_idx];
if let Some(code) = tree.get_code(symbol) {
total_bits += code.len();
} else if let Some(code) = self.trees[0].get_code(symbol) {
total_bits += code.len();
}
}
}
}
let compressed_bytes = (total_bits + 7) / 8;
compressed_bytes as f64 / data.len() as f64
}
pub fn serialize(&self) -> Vec<u8> {
let mut result = Vec::new();
result.push(self.order as u8);
let tree_count = self.trees.len() as u32;
result.extend_from_slice(&tree_count.to_le_bytes());
let context_count = self.context_map.len() as u32;
result.extend_from_slice(&context_count.to_le_bytes());
for (&context, &tree_idx) in &self.context_map {
result.extend_from_slice(&context.to_le_bytes());
result.extend_from_slice(&(tree_idx as u32).to_le_bytes());
}
for tree in &self.trees {
let tree_data = tree.serialize();
result.extend_from_slice(&(tree_data.len() as u32).to_le_bytes());
result.extend_from_slice(&tree_data);
}
result
}
pub fn deserialize(data: &[u8]) -> Result<Self> {
if data.is_empty() {
return Err(ZiporaError::invalid_data("Empty contextual Huffman data"));
}
let mut offset = 0;
let order = match data[offset] {
0 => HuffmanOrder::Order0,
1 => HuffmanOrder::Order1,
2 => HuffmanOrder::Order2,
_ => return Err(ZiporaError::invalid_data("Invalid Huffman order")),
};
offset += 1;
if offset + 4 > data.len() {
return Err(ZiporaError::invalid_data("Truncated tree count"));
}
let tree_count = u32::from_le_bytes([data[offset], data[offset + 1], data[offset + 2], data[offset + 3]]) as usize;
offset += 4;
if offset + 4 > data.len() {
return Err(ZiporaError::invalid_data("Truncated context count"));
}
let context_count = u32::from_le_bytes([data[offset], data[offset + 1], data[offset + 2], data[offset + 3]]) as usize;
offset += 4;
let mut context_map = HashMap::new();
for _ in 0..context_count {
if offset + 8 > data.len() {
return Err(ZiporaError::invalid_data("Truncated context map"));
}
let context = u32::from_le_bytes([data[offset], data[offset + 1], data[offset + 2], data[offset + 3]]);
offset += 4;
let tree_idx = u32::from_le_bytes([data[offset], data[offset + 1], data[offset + 2], data[offset + 3]]) as usize;
offset += 4;
context_map.insert(context, tree_idx);
}
let mut trees = Vec::with_capacity(tree_count);
for _ in 0..tree_count {
if offset + 4 > data.len() {
return Err(ZiporaError::invalid_data("Truncated tree size"));
}
let tree_size = u32::from_le_bytes([data[offset], data[offset + 1], data[offset + 2], data[offset + 3]]) as usize;
offset += 4;
if offset + tree_size > data.len() {
return Err(ZiporaError::invalid_data("Truncated tree data"));
}
let tree_data = &data[offset..offset + tree_size];
offset += tree_size;
let tree = HuffmanTree::deserialize(tree_data)?;
trees.push(tree);
}
Ok(Self {
order,
trees,
context_map,
fast_symbol_table: OnceCell::new(),
})
}
pub fn encode_with_interleaving(
&self,
data: &[u8],
factor: InterleavingFactor,
) -> Result<Vec<u8>> {
if self.order != HuffmanOrder::Order1 {
return Err(ZiporaError::invalid_operation(
"Interleaving only supported for Order-1 Huffman encoding",
));
}
if data.is_empty() {
return Ok(Vec::new());
}
match factor {
InterleavingFactor::X1 => self.encode_xn::<1>(data),
InterleavingFactor::X2 => self.encode_xn::<2>(data),
InterleavingFactor::X4 => self.encode_xn::<4>(data),
InterleavingFactor::X8 => self.encode_xn::<8>(data),
}
}
pub fn decode_with_interleaving(
&self,
data: &[u8],
output_size: usize,
factor: InterleavingFactor,
) -> Result<Vec<u8>> {
if self.order != HuffmanOrder::Order1 {
return Err(ZiporaError::invalid_operation(
"Interleaving only supported for Order-1 Huffman decoding",
));
}
match factor {
InterleavingFactor::X1 => self.decode_xn::<1>(data, output_size),
InterleavingFactor::X2 => self.decode_xn::<2>(data, output_size),
InterleavingFactor::X4 => self.decode_xn::<4>(data, output_size),
InterleavingFactor::X8 => self.decode_xn::<8>(data, output_size),
}
}
pub fn encode_x1(&self, data: &[u8]) -> Result<Vec<u8>> {
self.encode_with_interleaving(data, InterleavingFactor::X1)
}
pub fn encode_x2(&self, data: &[u8]) -> Result<Vec<u8>> {
self.encode_with_interleaving(data, InterleavingFactor::X2)
}
pub fn encode_x4(&self, data: &[u8]) -> Result<Vec<u8>> {
self.encode_with_interleaving(data, InterleavingFactor::X4)
}
pub fn encode_x8(&self, data: &[u8]) -> Result<Vec<u8>> {
self.encode_with_interleaving(data, InterleavingFactor::X8)
}
pub fn decode_x1(&self, data: &[u8], output_size: usize) -> Result<Vec<u8>> {
self.decode_with_interleaving(data, output_size, InterleavingFactor::X1)
}
pub fn decode_x2(&self, data: &[u8], output_size: usize) -> Result<Vec<u8>> {
self.decode_with_interleaving(data, output_size, InterleavingFactor::X2)
}
pub fn decode_x4(&self, data: &[u8], output_size: usize) -> Result<Vec<u8>> {
self.decode_with_interleaving(data, output_size, InterleavingFactor::X4)
}
pub fn decode_x8(&self, data: &[u8], output_size: usize) -> Result<Vec<u8>> {
self.decode_with_interleaving(data, output_size, InterleavingFactor::X8)
}
fn encode_xn<const N: usize>(&self, data: &[u8]) -> Result<Vec<u8>> {
debug_assert!(N > 0 && N <= 8);
if data.is_empty() {
return Ok(Vec::new());
}
let syms = self.get_or_init_fast_symbol_table();
let mut writer = BitStreamWriter::new();
let record_size = data.len();
let mut stream_starts = [0usize; 8];
let mut stream_ends = [0usize; 8];
for n in 0..N {
let stream_size = record_size / N + if n < record_size % N { 1 } else { 0 };
stream_starts[n] = if n == 0 { 0 } else { stream_ends[n - 1] };
stream_ends[n] = stream_starts[n] + stream_size;
}
debug_assert_eq!(stream_ends[N - 1], record_size);
let mut contexts = [256usize; 8]; let mut positions = stream_starts;
let mut total_encoded = 0;
while total_encoded < record_size {
for n in 0..N {
if positions[n] >= stream_ends[n] {
continue;
}
let pos = positions[n];
let symbol = data[pos] as usize;
let context = contexts[n];
let code = syms[context][symbol];
writer.write(code.bits as u64, code.bit_count as usize);
contexts[n] = symbol;
positions[n] += 1;
total_encoded += 1;
}
}
Ok(writer.finish())
}
fn decode_xn<const N: usize>(&self, data: &[u8], output_size: usize) -> Result<Vec<u8>> {
debug_assert!(N > 0 && N <= 8);
if data.is_empty() {
return Ok(Vec::new());
}
let decode_table = self.build_decode_table()?;
let mut reader = BitStreamReader::new(data);
let mut stream_starts = [0usize; 8];
let mut stream_ends = [0usize; 8];
for n in 0..N {
let stream_size = output_size / N + if n < output_size % N { 1 } else { 0 };
stream_starts[n] = if n == 0 { 0 } else { stream_ends[n - 1] };
stream_ends[n] = stream_starts[n] + stream_size;
}
let mut output = vec![0u8; output_size];
let mut contexts = [256u16; 8]; let mut stream_positions = stream_starts;
let mut total_decoded = 0;
while total_decoded < output_size {
for n in 0..N {
if stream_positions[n] >= stream_ends[n] {
continue;
}
let pos = stream_positions[n];
let context = contexts[n];
let symbol = self.decode_one_symbol(&mut reader, context, &decode_table)?;
output[pos] = symbol;
contexts[n] = symbol as u16;
stream_positions[n] += 1;
total_decoded += 1;
if total_decoded >= output_size {
break;
}
}
}
Ok(output)
}
fn build_symbol_table(&self) -> Result<HashMap<(u16, u8), HuffmanSymbol>> {
let mut table = HashMap::new();
for context in 0..=256u16 {
let tree_idx = if context == 256 {
0 } else {
*self.context_map.get(&(context as u32)).unwrap_or(&0)
};
let tree = &self.trees[tree_idx];
for symbol in 0..=255u8 {
if let Some(code) = tree.get_code(symbol) {
let mut bits = 0u64;
let bit_count = code.len() as u8;
if bit_count > 64 {
return Err(ZiporaError::invalid_data(
format!("Huffman code too long: {} bits", bit_count)
));
}
for (i, &bit) in code.iter().enumerate() {
if bit && i < 64 {
bits |= 1u64 << i;
}
}
table.insert((context, symbol), HuffmanSymbol::new(bits, bit_count));
} else {
return Err(ZiporaError::invalid_data(
format!("Symbol {} not found in tree for context {}", symbol, context)
));
}
}
}
Ok(table)
}
fn get_or_init_fast_symbol_table(&self) -> &FastSymbolTable {
self.fast_symbol_table.get_or_init(|| {
self.build_fast_symbol_table_inner()
})
}
fn build_fast_symbol_table_inner(&self) -> FastSymbolTable {
let mut table: FastSymbolTable = Box::new([[HuffmanEncSymbol::default(); 256]; 257]);
for context in 0..=256usize {
let tree_idx = if context == 256 {
0 } else {
*self.context_map.get(&(context as u32)).unwrap_or(&0)
};
let tree = &self.trees[tree_idx];
for symbol in 0..=255u8 {
if let Some(code) = tree.get_code(symbol) {
let mut bits = 0u16;
let bit_count = code.len() as u16;
let safe_bit_count = bit_count.min(16);
for (i, &bit) in code.iter().take(16).enumerate() {
if bit {
bits |= 1u16 << i;
}
}
table[context][symbol as usize] = HuffmanEncSymbol::new(bits, safe_bit_count);
} else {
table[context][symbol as usize] = HuffmanEncSymbol::new(0, 1);
}
}
}
table
}
fn build_decode_table(&self) -> Result<HashMap<u16, Vec<(u64, u8, u8)>>> {
const BLOCK_BITS: usize = 12;
let mut table: HashMap<u16, Vec<(u64, u8, u8)>> = HashMap::new();
let build_tree_table = |tree: &HuffmanTree| -> Vec<(u64, u8, u8)> {
let mut tree_table = vec![(0u64, 0u8, 0u8); 1 << BLOCK_BITS];
if let Some(root) = tree.root() {
for peek_value in 0..(1 << BLOCK_BITS) {
let mut current = root;
let mut bits_used = 0;
let mut code_bits = 0u64;
for bit_pos in 0..BLOCK_BITS {
match current {
HuffmanNode::Leaf { symbol, .. } => {
tree_table[peek_value] = (code_bits, *symbol, bits_used);
break;
}
HuffmanNode::Internal { left, right, .. } => {
let bit = (peek_value >> bit_pos) & 1;
if bit == 1 {
code_bits |= 1u64 << bit_pos;
current = right;
} else {
current = left;
}
bits_used += 1;
}
}
}
if let HuffmanNode::Leaf { symbol, .. } = current {
if tree_table[peek_value].2 == 0 {
tree_table[peek_value] = (code_bits, *symbol, bits_used);
}
}
}
}
tree_table
};
for context in 0..=256u16 {
let tree_idx = if context == 256 {
0
} else {
*self.context_map.get(&(context as u32)).unwrap_or(&0)
};
let context_table = build_tree_table(&self.trees[tree_idx]);
table.insert(context, context_table);
}
Ok(table)
}
fn decode_one_symbol(
&self,
reader: &mut BitStreamReader,
context: u16,
decode_table: &HashMap<u16, Vec<(u64, u8, u8)>>,
) -> Result<u8> {
const BLOCK_BITS: usize = 12;
if reader.bit_count < BLOCK_BITS {
reader.refill();
}
if reader.bit_count < BLOCK_BITS {
return self.decode_one_symbol_tree(reader, context);
}
let context_table = decode_table.get(&context)
.ok_or_else(|| ZiporaError::invalid_data(format!("Context {} not found in decode table", context)))?;
let peek_bits = reader.peek(BLOCK_BITS);
let (_, symbol, bit_count) = context_table[peek_bits as usize];
if bit_count == 0 {
return self.decode_one_symbol_tree(reader, context);
}
if reader.bit_count < bit_count as usize {
return self.decode_one_symbol_tree(reader, context);
}
reader.consume(bit_count as usize);
reader.refill();
Ok(symbol)
}
fn decode_one_symbol_tree(
&self,
reader: &mut BitStreamReader,
context: u16,
) -> Result<u8> {
let tree_idx = if context == 256 {
0
} else {
*self.context_map.get(&(context as u32)).unwrap_or(&0)
};
let tree = &self.trees[tree_idx];
let root = tree.root().ok_or_else(|| ZiporaError::invalid_data("Empty tree"))?;
let mut current = root;
loop {
match current {
HuffmanNode::Leaf { symbol, .. } => {
return Ok(*symbol);
}
HuffmanNode::Internal { left, right, .. } => {
if reader.bit_count == 0 {
reader.refill();
if reader.bit_count == 0 {
return Err(ZiporaError::invalid_data("Unexpected end of stream"));
}
}
let bit = reader.peek(1) & 1;
reader.consume(1);
current = if bit == 1 { right } else { left };
}
}
}
}
}
#[derive(Debug)]
pub struct ContextualHuffmanDecoder {
encoder: ContextualHuffmanEncoder,
}
impl ContextualHuffmanDecoder {
pub fn new(encoder: ContextualHuffmanEncoder) -> Self {
Self { encoder }
}
pub fn decode(&self, encoded_data: &[u8], output_length: usize) -> Result<Vec<u8>> {
if encoded_data.is_empty() || output_length == 0 {
return Ok(Vec::new());
}
let mut result = Vec::with_capacity(output_length);
match self.encoder.order {
HuffmanOrder::Order0 => {
let tree = &self.encoder.trees[0];
result = self.decode_order0(encoded_data, tree, output_length)?;
}
HuffmanOrder::Order1 => {
result = self.decode_order1(encoded_data, output_length)?;
}
HuffmanOrder::Order2 => {
result = self.decode_order2(encoded_data, output_length)?;
}
}
if result.len() != output_length {
return Err(ZiporaError::invalid_data(format!(
"Decoded length {} != expected {}",
result.len(),
output_length
)));
}
Ok(result)
}
fn decode_order0(&self, encoded_data: &[u8], tree: &HuffmanTree, output_length: usize) -> Result<Vec<u8>> {
let root = tree.root().ok_or_else(|| ZiporaError::invalid_data("Empty tree"))?;
let mut result = Vec::with_capacity(output_length);
let mut current_node = root;
for &byte in encoded_data {
for bit_pos in 0..8 {
if result.len() >= output_length {
break;
}
let bit = (byte >> bit_pos) & 1 == 1;
match current_node {
HuffmanNode::Leaf { symbol, .. } => {
result.push(*symbol);
current_node = root;
current_node = match current_node {
HuffmanNode::Leaf { .. } => current_node,
HuffmanNode::Internal { left, right, .. } => {
if bit { right } else { left }
}
};
}
HuffmanNode::Internal { left, right, .. } => {
current_node = if bit { right } else { left };
}
}
}
if result.len() >= output_length {
break;
}
}
if let HuffmanNode::Leaf { symbol, .. } = current_node {
if result.len() < output_length {
result.push(*symbol);
}
}
Ok(result)
}
fn decode_order1(&self, encoded_data: &[u8], output_length: usize) -> Result<Vec<u8>> {
if self.encoder.trees.is_empty() {
return Ok(Vec::new());
}
let mut result = Vec::with_capacity(output_length);
let mut byte_idx = 0;
let mut bit_pos = 0;
let first_tree = &self.encoder.trees[0];
if let Ok(first_symbol) = self.decode_next_symbol(encoded_data, &mut byte_idx, &mut bit_pos, first_tree) {
result.push(first_symbol);
}
while result.len() < output_length && byte_idx < encoded_data.len() {
let context = *result.last().expect("result non-empty after push") as u32;
let tree_idx = self.encoder.context_map.get(&context).copied().unwrap_or(0);
let tree = &self.encoder.trees[tree_idx];
if let Ok(symbol) = self.decode_next_symbol(encoded_data, &mut byte_idx, &mut bit_pos, tree) {
result.push(symbol);
} else {
break;
}
}
Ok(result)
}
fn decode_order2(&self, encoded_data: &[u8], output_length: usize) -> Result<Vec<u8>> {
if self.encoder.trees.is_empty() {
return Ok(Vec::new());
}
let mut result = Vec::with_capacity(output_length);
let mut byte_idx = 0;
let mut bit_pos = 0;
let first_tree = &self.encoder.trees[0];
for _ in 0..2.min(output_length) {
if let Ok(symbol) = self.decode_next_symbol(encoded_data, &mut byte_idx, &mut bit_pos, first_tree) {
result.push(symbol);
} else {
break;
}
}
while result.len() < output_length && byte_idx < encoded_data.len() {
let len = result.len();
let context = ((result[len - 2] as u32) << 8) | (result[len - 1] as u32);
let tree_idx = self.encoder.context_map.get(&context).copied().unwrap_or(0);
let tree = &self.encoder.trees[tree_idx];
if let Ok(symbol) = self.decode_next_symbol(encoded_data, &mut byte_idx, &mut bit_pos, tree) {
result.push(symbol);
} else {
break;
}
}
Ok(result)
}
fn decode_next_symbol(&self, encoded_data: &[u8], byte_idx: &mut usize, bit_pos: &mut usize, tree: &HuffmanTree) -> Result<u8> {
let root = tree.root().ok_or_else(|| ZiporaError::invalid_data("Empty tree"))?;
let mut current_node = root;
while *byte_idx < encoded_data.len() {
let byte = encoded_data[*byte_idx];
while *bit_pos < 8 {
let bit = (byte >> *bit_pos) & 1 == 1;
match current_node {
HuffmanNode::Leaf { symbol, .. } => {
return Ok(*symbol);
}
HuffmanNode::Internal { left, right, .. } => {
current_node = if bit { right } else { left };
*bit_pos += 1;
}
}
}
*bit_pos = 0;
*byte_idx += 1;
}
if let HuffmanNode::Leaf { symbol, .. } = current_node {
Ok(*symbol)
} else {
Err(ZiporaError::invalid_data("Incomplete symbol"))
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_huffman_tree_single_symbol() {
let mut frequencies = [0u32; 256];
frequencies[65] = 100;
let tree = HuffmanTree::from_frequencies(&frequencies).unwrap();
assert_eq!(tree.max_code_length(), 1);
assert_eq!(tree.get_code(65).unwrap(), &vec![false]);
}
#[test]
fn test_huffman_tree_two_symbols() {
let mut frequencies = [0u32; 256];
frequencies[65] = 100; frequencies[66] = 50;
let tree = HuffmanTree::from_frequencies(&frequencies).unwrap();
assert!(tree.get_code(65).is_some());
assert!(tree.get_code(66).is_some());
assert_eq!(tree.max_code_length(), 1);
}
#[test]
fn test_huffman_encoding_decoding() {
let data = b"hello world! this is a test message for huffman coding.";
let encoder = HuffmanEncoder::new(data).unwrap();
let encoded = encoder.encode(data).unwrap();
let decoder = HuffmanDecoder::new(encoder.tree().clone());
let decoded = decoder.decode(&encoded, data.len()).unwrap();
assert_eq!(data.to_vec(), decoded);
}
#[test]
fn test_huffman_compression_ratio() {
let data = b"aaaaaabbbbcccc";
let encoder = HuffmanEncoder::new(data).unwrap();
let ratio = encoder.estimate_compression_ratio(data);
assert!(ratio < 1.0);
}
#[test]
fn test_huffman_tree_serialization() {
let data = b"hello world";
let tree = HuffmanTree::from_data(data).unwrap();
let serialized = tree.serialize();
let deserialized = HuffmanTree::deserialize(&serialized).unwrap();
for (&symbol, code) in &tree.codes {
assert_eq!(deserialized.get_code(symbol), Some(code));
}
}
#[test]
fn test_empty_data() {
let data = b"";
let encoder = HuffmanEncoder::new(data).unwrap();
let encoded = encoder.encode(data).unwrap();
assert!(encoded.is_empty());
}
#[test]
fn test_large_alphabet() {
let data: Vec<u8> = (0..=255).cycle().take(1000).collect();
let encoder = HuffmanEncoder::new(&data).unwrap();
let encoded = encoder.encode(&data).unwrap();
let decoder = HuffmanDecoder::new(encoder.tree().clone());
let decoded = decoder.decode(&encoded, data.len()).unwrap();
assert_eq!(data, decoded);
}
#[test]
fn test_huffman_tree_frequencies() {
let mut frequencies = [0u32; 256];
frequencies[b'a' as usize] = 45;
frequencies[b'b' as usize] = 13;
frequencies[b'c' as usize] = 12;
frequencies[b'd' as usize] = 16;
frequencies[b'e' as usize] = 9;
frequencies[b'f' as usize] = 5;
let tree = HuffmanTree::from_frequencies(&frequencies).unwrap();
let code_a = tree.get_code(b'a').unwrap();
let code_f = tree.get_code(b'f').unwrap();
assert!(!code_a.is_empty());
assert!(!code_f.is_empty());
let max_length = tree.max_code_length();
assert!(max_length > 0);
}
#[test]
fn test_contextual_huffman_order0() {
let data = b"hello world! this is a test message for huffman coding.";
let encoder = ContextualHuffmanEncoder::new(data, HuffmanOrder::Order0).unwrap();
assert_eq!(encoder.order(), HuffmanOrder::Order0);
assert_eq!(encoder.tree_count(), 1);
let encoded = encoder.encode(data).unwrap();
let decoder = ContextualHuffmanDecoder::new(encoder);
let decoded = decoder.decode(&encoded, data.len()).unwrap();
assert_eq!(data.to_vec(), decoded);
}
#[test]
fn test_contextual_huffman_order1() {
let data = b"abababab";
let encoder = ContextualHuffmanEncoder::new(data, HuffmanOrder::Order1).unwrap();
assert_eq!(encoder.order(), HuffmanOrder::Order1);
assert!(encoder.tree_count() >= 1);
let encoded = encoder.encode(data).unwrap();
let decoder = ContextualHuffmanDecoder::new(encoder);
let decoded = decoder.decode(&encoded, data.len()).unwrap();
assert_eq!(data.to_vec(), decoded);
}
#[test]
fn test_contextual_huffman_order2() {
let data = b"abcabcabcabc";
let encoder = ContextualHuffmanEncoder::new(data, HuffmanOrder::Order2).unwrap();
assert_eq!(encoder.order(), HuffmanOrder::Order2);
assert!(encoder.tree_count() >= 1);
let encoded = encoder.encode(data).unwrap();
let decoder = ContextualHuffmanDecoder::new(encoder);
let decoded = decoder.decode(&encoded, data.len()).unwrap();
assert_eq!(data.to_vec(), decoded);
}
#[test]
fn test_contextual_huffman_compression_comparison() {
let data = b"aaaaabbbbbcccccdddddeeeeefffff";
let encoder0 = ContextualHuffmanEncoder::new(data, HuffmanOrder::Order0).unwrap();
let encoder1 = ContextualHuffmanEncoder::new(data, HuffmanOrder::Order1).unwrap();
let encoder2 = ContextualHuffmanEncoder::new(data, HuffmanOrder::Order2).unwrap();
let ratio0 = encoder0.estimate_compression_ratio(data);
let ratio1 = encoder1.estimate_compression_ratio(data);
let ratio2 = encoder2.estimate_compression_ratio(data);
println!("Order-0 ratio: {:.3}", ratio0);
println!("Order-1 ratio: {:.3}", ratio1);
println!("Order-2 ratio: {:.3}", ratio2);
assert!(ratio0 < 1.0, "Order-0 ratio should be < 1.0, got {:.3}", ratio0);
assert!(ratio1 <= 1.5, "Order-1 ratio too high, got {:.3}", ratio1);
assert!(ratio2 <= 1.5, "Order-2 ratio too high, got {:.3}", ratio2);
let encoded0 = encoder0.encode(data).unwrap();
let decoder0 = ContextualHuffmanDecoder::new(encoder0);
let decoded0 = decoder0.decode(&encoded0, data.len()).unwrap();
assert_eq!(data.to_vec(), decoded0);
let encoder1 = ContextualHuffmanEncoder::new(data, HuffmanOrder::Order1).unwrap();
let encoded1 = encoder1.encode(data).unwrap();
let decoder1 = ContextualHuffmanDecoder::new(encoder1);
let decoded1 = decoder1.decode(&encoded1, data.len()).unwrap();
assert_eq!(data.to_vec(), decoded1);
let encoder2 = ContextualHuffmanEncoder::new(data, HuffmanOrder::Order2).unwrap();
let encoded2 = encoder2.encode(data).unwrap();
let decoder2 = ContextualHuffmanDecoder::new(encoder2);
let decoded2 = decoder2.decode(&encoded2, data.len()).unwrap();
assert_eq!(data.to_vec(), decoded2);
}
#[test]
fn test_contextual_huffman_serialization() {
let data = b"test data for serialization";
let encoder = ContextualHuffmanEncoder::new(data, HuffmanOrder::Order1).unwrap();
let serialized = encoder.serialize();
let deserialized = ContextualHuffmanEncoder::deserialize(&serialized).unwrap();
assert_eq!(encoder.order(), deserialized.order());
assert_eq!(encoder.tree_count(), deserialized.tree_count());
let encoded1 = encoder.encode(data).unwrap();
let encoded2 = deserialized.encode(data).unwrap();
assert_eq!(encoded1, encoded2);
}
#[test]
fn test_contextual_huffman_edge_cases() {
let short_data = b"a";
let encoder = ContextualHuffmanEncoder::new(short_data, HuffmanOrder::Order2).unwrap();
assert!(encoder.order() == HuffmanOrder::Order0 || encoder.order() == HuffmanOrder::Order1);
let empty_data = b"";
let encoder = ContextualHuffmanEncoder::new(empty_data, HuffmanOrder::Order1).unwrap();
let encoded = encoder.encode(empty_data).unwrap();
assert!(encoded.is_empty());
let repeated_data = b"aaaaaaaaaa";
let encoder = ContextualHuffmanEncoder::new(repeated_data, HuffmanOrder::Order1).unwrap();
let encoded = encoder.encode(repeated_data).unwrap();
let decoder = ContextualHuffmanDecoder::new(encoder);
let decoded = decoder.decode(&encoded, repeated_data.len()).unwrap();
assert_eq!(repeated_data.to_vec(), decoded);
}
#[test]
fn test_huffman_order_enum() {
assert_eq!(HuffmanOrder::default(), HuffmanOrder::Order0);
let orders = [HuffmanOrder::Order0, HuffmanOrder::Order1, HuffmanOrder::Order2];
for order in orders {
let data = b"test data";
let encoder = ContextualHuffmanEncoder::new(data, order).unwrap();
assert_eq!(encoder.order(), order);
}
}
#[test]
fn test_interleaving_factor_streams() {
assert_eq!(InterleavingFactor::X1.streams(), 1);
assert_eq!(InterleavingFactor::X2.streams(), 2);
assert_eq!(InterleavingFactor::X4.streams(), 4);
assert_eq!(InterleavingFactor::X8.streams(), 8);
}
#[test]
fn test_interleaving_factor_default() {
assert_eq!(InterleavingFactor::default(), InterleavingFactor::X1);
}
#[test]
fn test_encode_x1_basic() {
let data = b"hello world! this is a test for interleaved huffman coding with order-1 context.";
let encoder = ContextualHuffmanEncoder::new(data, HuffmanOrder::Order1).unwrap();
let encoded = encoder.encode_x1(data).unwrap();
let decoded = encoder.decode_x1(&encoded, data.len()).unwrap();
assert_eq!(data.to_vec(), decoded, "X1 encode-decode round trip failed");
}
#[test]
fn test_encode_x2_basic() {
let data = b"hello world! this is a test for x2 interleaved huffman coding.";
let encoder = ContextualHuffmanEncoder::new(data, HuffmanOrder::Order1).unwrap();
let encoded = encoder.encode_x2(data).unwrap();
let decoded = encoder.decode_x2(&encoded, data.len()).unwrap();
assert_eq!(data.to_vec(), decoded, "X2 encode-decode round trip failed");
}
#[test]
fn test_encode_x4_basic() {
let data = b"hello world! this is a test for x4 interleaved huffman coding with more data.";
let encoder = ContextualHuffmanEncoder::new(data, HuffmanOrder::Order1).unwrap();
let encoded = encoder.encode_x4(data).unwrap();
let decoded = encoder.decode_x4(&encoded, data.len()).unwrap();
assert_eq!(data.to_vec(), decoded, "X4 encode-decode round trip failed");
}
#[test]
fn test_encode_x8_basic() {
let data = b"hello world! this is a test for x8 interleaved huffman coding with even more data to test.";
let encoder = ContextualHuffmanEncoder::new(data, HuffmanOrder::Order1).unwrap();
let encoded = encoder.encode_x8(data).unwrap();
let decoded = encoder.decode_x8(&encoded, data.len()).unwrap();
assert_eq!(data.to_vec(), decoded, "X8 encode-decode round trip failed");
}
#[test]
fn test_interleaving_all_variants() {
let data = b"The quick brown fox jumps over the lazy dog. Pack my box with five dozen liquor jugs.";
let encoder = ContextualHuffmanEncoder::new(data, HuffmanOrder::Order1).unwrap();
for factor in [InterleavingFactor::X1, InterleavingFactor::X2,
InterleavingFactor::X4, InterleavingFactor::X8] {
let encoded = encoder.encode_with_interleaving(data, factor).unwrap();
let decoded = encoder.decode_with_interleaving(&encoded, data.len(), factor).unwrap();
assert_eq!(data.to_vec(), decoded,
"Round trip failed for {:?} interleaving", factor);
}
}
#[test]
fn test_interleaving_empty_data() {
let data = b"";
let encoder = ContextualHuffmanEncoder::new(b"training data", HuffmanOrder::Order1).unwrap();
let encoded = encoder.encode_x1(data).unwrap();
assert!(encoded.is_empty());
let decoded = encoder.decode_x1(&encoded, 0).unwrap();
assert!(decoded.is_empty());
}
#[test]
fn test_interleaving_single_byte() {
let data = b"a";
let encoder = ContextualHuffmanEncoder::new(b"abcdef", HuffmanOrder::Order1).unwrap();
for factor in [InterleavingFactor::X1, InterleavingFactor::X2,
InterleavingFactor::X4, InterleavingFactor::X8] {
let encoded = encoder.encode_with_interleaving(data, factor).unwrap();
let decoded = encoder.decode_with_interleaving(&encoded, data.len(), factor).unwrap();
assert_eq!(data.to_vec(), decoded,
"Single byte failed for {:?}", factor);
}
}
#[test]
fn test_interleaving_two_bytes() {
let data = b"ab";
let encoder = ContextualHuffmanEncoder::new(b"abcdef", HuffmanOrder::Order1).unwrap();
for factor in [InterleavingFactor::X1, InterleavingFactor::X2,
InterleavingFactor::X4, InterleavingFactor::X8] {
let encoded = encoder.encode_with_interleaving(data, factor).unwrap();
let decoded = encoder.decode_with_interleaving(&encoded, data.len(), factor).unwrap();
assert_eq!(data.to_vec(), decoded,
"Two bytes failed for {:?}", factor);
}
}
#[test]
fn test_interleaving_power_of_two_sizes() {
let training_data = b"The quick brown fox jumps over the lazy dog.";
let encoder = ContextualHuffmanEncoder::new(training_data, HuffmanOrder::Order1).unwrap();
for size in [8, 16, 32, 64, 128, 256] {
let data: Vec<u8> = training_data.iter().cycle().take(size).copied().collect();
for factor in [InterleavingFactor::X1, InterleavingFactor::X2,
InterleavingFactor::X4, InterleavingFactor::X8] {
let encoded = encoder.encode_with_interleaving(&data, factor).unwrap();
let decoded = encoder.decode_with_interleaving(&encoded, data.len(), factor).unwrap();
assert_eq!(data, decoded,
"Power-of-2 size {} failed for {:?}", size, factor);
}
}
}
#[test]
fn test_interleaving_non_power_of_two_sizes() {
let training_data = b"The quick brown fox jumps over the lazy dog.";
let encoder = ContextualHuffmanEncoder::new(training_data, HuffmanOrder::Order1).unwrap();
for size in [7, 15, 31, 63, 127, 255] {
let data: Vec<u8> = training_data.iter().cycle().take(size).copied().collect();
for factor in [InterleavingFactor::X1, InterleavingFactor::X2,
InterleavingFactor::X4, InterleavingFactor::X8] {
let encoded = encoder.encode_with_interleaving(&data, factor).unwrap();
let decoded = encoder.decode_with_interleaving(&encoded, data.len(), factor).unwrap();
assert_eq!(data, decoded,
"Non-power-of-2 size {} failed for {:?}", size, factor);
}
}
}
#[test]
fn test_interleaving_repeated_symbols() {
let data = b"aaaaaaaaaaaaaaaa"; let encoder = ContextualHuffmanEncoder::new(b"abc", HuffmanOrder::Order1).unwrap();
for factor in [InterleavingFactor::X1, InterleavingFactor::X2,
InterleavingFactor::X4, InterleavingFactor::X8] {
let encoded = encoder.encode_with_interleaving(data, factor).unwrap();
let decoded = encoder.decode_with_interleaving(&encoded, data.len(), factor).unwrap();
assert_eq!(data.to_vec(), decoded,
"Repeated symbols failed for {:?}", factor);
}
}
#[test]
fn test_interleaving_alternating_symbols() {
let data = b"abababababababab"; let encoder = ContextualHuffmanEncoder::new(data, HuffmanOrder::Order1).unwrap();
for factor in [InterleavingFactor::X1, InterleavingFactor::X2,
InterleavingFactor::X4, InterleavingFactor::X8] {
let encoded = encoder.encode_with_interleaving(data, factor).unwrap();
let decoded = encoder.decode_with_interleaving(&encoded, data.len(), factor).unwrap();
assert_eq!(data.to_vec(), decoded,
"Alternating pattern failed for {:?}", factor);
}
}
#[test]
fn test_interleaving_all_bytes() {
let data: Vec<u8> = (0..=255u8).cycle().take(512).collect();
let encoder = ContextualHuffmanEncoder::new(&data, HuffmanOrder::Order1).unwrap();
for factor in [InterleavingFactor::X1, InterleavingFactor::X2,
InterleavingFactor::X4, InterleavingFactor::X8] {
let encoded = encoder.encode_with_interleaving(&data, factor).unwrap();
let decoded = encoder.decode_with_interleaving(&encoded, data.len(), factor).unwrap();
assert_eq!(data, decoded,
"All bytes test failed for {:?}", factor);
}
}
#[test]
fn test_interleaving_large_data() {
let base = b"The quick brown fox jumps over the lazy dog. Pack my box with five dozen liquor jugs.";
let data: Vec<u8> = base.iter().cycle().take(1024).copied().collect();
let encoder = ContextualHuffmanEncoder::new(&data, HuffmanOrder::Order1).unwrap();
for factor in [InterleavingFactor::X1, InterleavingFactor::X2,
InterleavingFactor::X4, InterleavingFactor::X8] {
let encoded = encoder.encode_with_interleaving(&data, factor).unwrap();
let decoded = encoder.decode_with_interleaving(&encoded, data.len(), factor).unwrap();
assert_eq!(data, decoded,
"Large data (1KB) failed for {:?}", factor);
}
}
#[test]
fn test_interleaving_only_order1() {
let data = b"test data";
let encoder0 = ContextualHuffmanEncoder::new(data, HuffmanOrder::Order0).unwrap();
assert!(encoder0.encode_with_interleaving(data, InterleavingFactor::X2).is_err());
let encoder2 = ContextualHuffmanEncoder::new(data, HuffmanOrder::Order2).unwrap();
assert!(encoder2.encode_with_interleaving(data, InterleavingFactor::X2).is_err());
let encoder1 = ContextualHuffmanEncoder::new(data, HuffmanOrder::Order1).unwrap();
assert!(encoder1.encode_with_interleaving(data, InterleavingFactor::X2).is_ok());
}
#[test]
fn test_interleaving_compression_ratio() {
let data = b"The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog.";
let encoder = ContextualHuffmanEncoder::new(data, HuffmanOrder::Order1).unwrap();
for factor in [InterleavingFactor::X1, InterleavingFactor::X2,
InterleavingFactor::X4, InterleavingFactor::X8] {
let encoded = encoder.encode_with_interleaving(data, factor).unwrap();
let decoded = encoder.decode_with_interleaving(&encoded, data.len(), factor).unwrap();
assert_eq!(data.to_vec(), decoded,
"Round trip failed for {:?}", factor);
let ratio = encoded.len() as f64 / data.len() as f64;
assert!(ratio <= 1.2,
"Compression ratio too high for {:?}, ratio: {:.3}", factor, ratio);
}
}
#[test]
fn test_bitstream_writer_basic() {
let mut writer = BitStreamWriter::new();
writer.write(0b10101010, 8);
let result = writer.finish();
assert_eq!(result.len(), 1);
assert_eq!(result[0], 0b10101010);
}
#[test]
fn test_bitstream_writer_partial_byte() {
let mut writer = BitStreamWriter::new();
writer.write(0b1010, 4);
let result = writer.finish();
assert_eq!(result.len(), 1);
assert_eq!(result[0], 0b1010);
}
#[test]
fn test_bitstream_writer_multiple_writes() {
let mut writer = BitStreamWriter::new();
writer.write(0b1010, 4);
writer.write(0b0101, 4);
let result = writer.finish();
assert_eq!(result.len(), 1);
assert_eq!(result[0], 0b01011010); }
#[test]
fn test_bitstream_reader_basic() {
let data = vec![0b10101010];
let mut reader = BitStreamReader::new(&data);
let bits = reader.read(8);
assert_eq!(bits, 0b10101010);
}
#[test]
fn test_bitstream_reader_partial() {
let data = vec![0b10101010];
let mut reader = BitStreamReader::new(&data);
let first = reader.read(4);
let second = reader.read(4);
assert_eq!(first, 0b1010);
assert_eq!(second, 0b1010);
}
#[test]
fn test_bitstream_roundtrip() {
let mut writer = BitStreamWriter::new();
writer.write(0b101, 3);
writer.write(0b11110000, 8);
writer.write(0b1, 1);
writer.write(0b111111, 6);
let data = writer.finish();
let mut reader = BitStreamReader::new(&data);
assert_eq!(reader.read(3), 0b101);
assert_eq!(reader.read(8), 0b11110000);
assert_eq!(reader.read(1), 0b1);
assert_eq!(reader.read(6), 0b111111);
}
}