mod decoder;
mod encoder;
mod table;
pub use decoder::{build_table_from_weights, parse_huffman_weights, HuffmanDecoder};
pub use encoder::{HuffmanCode, HuffmanEncoder};
pub use table::{HuffmanTable, HuffmanTableEntry};
pub const HUFFMAN_MAX_SYMBOLS: usize = 256;
pub const HUFFMAN_MAX_WEIGHT: u8 = 12;
pub const HUFFMAN_MAX_BITS: u8 = 11;
pub const HUFFMAN_MIN_HEADER_SIZE: usize = 1;
#[cfg(test)]
mod tests {
use super::*;
use crate::fse::BitReader;
#[test]
fn test_constants() {
assert_eq!(HUFFMAN_MAX_SYMBOLS, 256);
assert!(HUFFMAN_MAX_WEIGHT <= 12);
assert!(HUFFMAN_MAX_BITS <= 11);
}
#[test]
fn test_huffman_encoder_decoder_roundtrip() {
let mut data = Vec::new();
for _ in 0..100 {
data.push(b'a');
}
for _ in 0..50 {
data.push(b'b');
}
for _ in 0..25 {
data.push(b'c');
}
let encoder = HuffmanEncoder::build(&data).expect("Should build encoder");
let weights = encoder.serialize_weights();
let (parsed_weights, _) = parse_huffman_weights(&weights).expect("Should parse weights");
let table = build_table_from_weights(parsed_weights).expect("Should build table");
let decoder = HuffmanDecoder::new(&table);
let encoded = encoder.encode(&data);
assert!(encoded.len() < data.len(), "Huffman should compress");
let mut bits = BitReader::new_reversed(&encoded).expect("Should create reversed reader");
let mut decoded = Vec::new();
for _ in 0..data.len() {
let symbol = decoder
.decode_symbol(&mut bits)
.expect("Should decode symbol");
decoded.push(symbol);
}
assert_eq!(decoded.len(), data.len(), "Should decode all symbols");
assert_eq!(decoded, data, "Huffman roundtrip should match");
}
#[test]
fn test_huffman_simple_encode_decode() {
let mut data = Vec::new();
for _ in 0..50 {
data.push(b'a');
}
for _ in 0..50 {
data.push(b'b');
}
let encoder = HuffmanEncoder::build(&data).expect("Should build encoder");
let encoded = encoder.encode(&data);
assert!(encoded.len() < data.len(), "Huffman should compress");
let last_byte = encoded[encoded.len() - 1];
assert!(last_byte != 0, "Should have sentinel bit");
}
#[test]
fn test_huffman_minimal() {
let weights = [2u8, 1, 1];
let table = HuffmanTable::from_weights(&weights).unwrap();
let decoder = HuffmanDecoder::new(&table);
assert_eq!(table.max_bits(), 2);
let encoded = [0b00000100u8];
let mut bits = BitReader::new_reversed(&encoded).unwrap();
assert_eq!(bits.bits_remaining(), 2);
let sym = decoder.decode_symbol(&mut bits).unwrap();
assert_eq!(sym, 0, "Should decode symbol 0");
}
#[test]
fn test_huffman_table_from_frequencies() {
let mut data = Vec::new();
for _ in 0..100 {
data.push(b'a');
}
for _ in 0..50 {
data.push(b'b');
}
for _ in 0..25 {
data.push(b'c');
}
for _ in 0..12 {
data.push(b'd');
}
for _ in 0..6 {
data.push(b'e');
}
for _ in 0..3 {
data.push(b'f');
}
for _ in 0..2 {
data.push(b'g');
}
for _ in 0..1 {
data.push(b'h');
}
let encoder = HuffmanEncoder::build(&data).expect("Should build from frequencies");
let codes = encoder.get_codes();
let a_bits = codes[b'a' as usize].num_bits;
let h_bits = codes[b'h' as usize].num_bits;
assert!(
a_bits <= h_bits,
"Most frequent symbol should have shorter code: a={} bits, h={} bits",
a_bits,
h_bits
);
}
#[test]
fn test_huffman_table_serialization_roundtrip() {
let mut data = Vec::new();
for _ in 0..100 {
data.push(b'a');
}
for _ in 0..50 {
data.push(b'b');
}
for _ in 0..25 {
data.push(b'c');
}
for _ in 0..12 {
data.push(b'd');
}
let encoder = HuffmanEncoder::build(&data).expect("Should build encoder");
let serialized = encoder.serialize_weights();
let (parsed_weights, bytes_read) =
parse_huffman_weights(&serialized).expect("Should parse serialized weights");
assert!(bytes_read > 0, "Should read some bytes");
assert!(!parsed_weights.is_empty(), "Should have parsed weights");
let table = build_table_from_weights(parsed_weights)
.expect("Should build table from parsed weights");
assert!(
table.max_bits() <= HUFFMAN_MAX_BITS,
"Max bits should be within limit"
);
}
#[test]
fn test_huffman_max_code_length() {
let mut data = Vec::new();
for i in 0..11u8 {
let count = 1usize << i; for _ in 0..count {
data.push(i);
}
}
let encoder = HuffmanEncoder::build(&data).expect("Should build encoder");
assert!(
encoder.max_bits() <= HUFFMAN_MAX_BITS,
"Max code length {} exceeds limit {}",
encoder.max_bits(),
HUFFMAN_MAX_BITS
);
let codes = encoder.get_codes();
for i in 0..11u8 {
let code = &codes[i as usize];
if code.num_bits > 0 {
assert!(
code.num_bits <= HUFFMAN_MAX_BITS,
"Symbol {} has code length {} exceeding max {}",
i,
code.num_bits,
HUFFMAN_MAX_BITS
);
}
}
}
#[test]
fn test_huffman_many_symbols() {
let mut data = Vec::new();
for i in 0..20u8 {
let count = 200usize.saturating_sub(i as usize * 8).max(10);
for _ in 0..count {
data.push(i);
}
}
let encoder = HuffmanEncoder::build(&data).expect("Should build encoder for many symbols");
assert!(encoder.num_symbols() >= 15, "Should have many symbols");
assert!(
encoder.max_bits() <= HUFFMAN_MAX_BITS,
"Should respect max bits"
);
let encoded = encoder.encode(&data);
assert!(encoded.len() < data.len(), "Should compress data");
let weights = encoder.serialize_weights();
assert!(!weights.is_empty(), "Should produce non-empty weights");
assert!(weights[0] >= 128, "Should use direct encoding format");
}
#[test]
fn test_huffman_compression_ratio() {
let data = b"aaaaaabbbbcccdde".repeat(1000);
let encoder = HuffmanEncoder::build(&data).expect("Should build encoder");
let encoded = encoder.encode(&data);
let weights = encoder.serialize_weights();
let compressed_size = encoded.len() + weights.len();
let original_size = data.len();
let ratio = original_size as f64 / compressed_size as f64;
assert!(
ratio >= 2.0,
"Compression ratio {:.2}x is below expected 2x (original: {}, compressed: {})",
ratio,
original_size,
compressed_size
);
}
#[test]
fn test_huffman_empty_data() {
let encoder = HuffmanEncoder::build(&[]);
assert!(encoder.is_none(), "Empty data should not build encoder");
}
#[test]
fn test_huffman_single_value_data() {
let data = vec![b'x'; 1000];
let encoder = HuffmanEncoder::build(&data);
assert!(
encoder.is_none(),
"Single symbol data should not use Huffman"
);
}
#[test]
fn test_huffman_two_symbols_equal_frequency() {
let mut data = Vec::new();
for _ in 0..500 {
data.push(b'0');
}
for _ in 0..500 {
data.push(b'1');
}
let encoder = HuffmanEncoder::build(&data).expect("Should build encoder");
let codes = encoder.get_codes();
assert_eq!(
codes[b'0' as usize].num_bits, 1,
"Symbol '0' should have 1-bit code"
);
assert_eq!(
codes[b'1' as usize].num_bits, 1,
"Symbol '1' should have 1-bit code"
);
let encoded = encoder.encode(&data);
assert!(
encoded.len() <= data.len() / 2 + 10, "Two equal-freq symbols should compress to ~half size"
);
}
#[test]
fn test_huffman_text_pattern() {
let text = b"The quick brown fox jumps over the lazy dog. ";
let data = text.repeat(100);
let encoder = HuffmanEncoder::build(&data).expect("Should build encoder for text");
assert!(
encoder.num_symbols() > 10,
"Text should have multiple symbols"
);
let encoded = encoder.encode(&data);
let weights = encoder.serialize_weights();
let total_compressed = encoded.len() + weights.len();
assert!(
total_compressed < data.len(),
"Should compress text: {} < {}",
total_compressed,
data.len()
);
let space_code = encoder.get_codes()[b' ' as usize];
assert!(space_code.num_bits > 0, "Space should be encoded");
assert!(
space_code.num_bits <= encoder.max_bits(),
"Space code should fit within max bits"
);
}
#[test]
fn test_huffman_batch_encode_consistency() {
let mut data = Vec::new();
for _ in 0..100 {
data.push(b'a');
}
for _ in 0..50 {
data.push(b'b');
}
for _ in 0..25 {
data.push(b'c');
}
for _ in 0..12 {
data.push(b'd');
}
let encoder = HuffmanEncoder::build(&data).expect("Should build encoder");
let regular = encoder.encode(&data);
let batch = encoder.encode_batch(&data);
assert!(!regular.is_empty(), "Regular encode should produce output");
assert!(!batch.is_empty(), "Batch encode should produce output");
assert!(regular.len() < data.len(), "Regular should compress");
assert!(batch.len() < data.len(), "Batch should compress");
}
}