use alloc::string::{String, ToString};
use alloc::vec::Vec;
use super::types::{
CompressOp, CompressedHeader, CompressionDict, DictError, DictResult, MIN_MATCH_LEN,
OP_DICT_REF, OP_LITERAL,
};
fn compute_checksum(data: &[u8]) -> u32 {
let mut hash: u32 = 0;
for (i, &byte) in data.iter().enumerate() {
hash = hash.wrapping_add(byte as u32);
hash = hash.wrapping_mul(31);
hash ^= (i as u32).rotate_left(5);
}
hash
}
pub fn compress(data: &[u8], dict: &CompressionDict) -> DictResult<Vec<u8>> {
if data.is_empty() {
let header = CompressedHeader::new(dict.id, 0, 0, 0);
return Ok(header.to_bytes().to_vec());
}
let checksum = compute_checksum(data);
let ops = compress_to_ops(data, &dict.data);
let compressed_data = encode_ops(&ops);
let header = CompressedHeader::new(
dict.id,
data.len() as u32,
compressed_data.len() as u32,
checksum,
);
let mut result = Vec::with_capacity(CompressedHeader::SIZE + compressed_data.len());
result.extend_from_slice(&header.to_bytes());
result.extend_from_slice(&compressed_data);
Ok(result)
}
fn compress_to_ops(data: &[u8], dict: &[u8]) -> Vec<CompressOp> {
let mut ops = Vec::new();
let mut pos = 0;
let mut literal_start = 0;
while pos < data.len() {
let (match_offset, match_len) = find_best_match(dict, &data[pos..]);
if match_len >= MIN_MATCH_LEN {
if pos > literal_start {
ops.push(CompressOp::literal(data[literal_start..pos].to_vec()));
}
ops.push(CompressOp::dict_ref(match_offset as u16, match_len as u16));
pos += match_len;
literal_start = pos;
} else {
pos += 1;
}
}
if literal_start < data.len() {
ops.push(CompressOp::literal(data[literal_start..].to_vec()));
}
ops
}
fn encode_ops(ops: &[CompressOp]) -> Vec<u8> {
let mut result = Vec::new();
for op in ops {
op.encode(&mut result);
}
result
}
fn find_best_match(dict: &[u8], data: &[u8]) -> (usize, usize) {
if dict.is_empty() || data.len() < MIN_MATCH_LEN {
return (0, 0);
}
let mut best_offset = 0;
let mut best_len = 0;
let max_match = data.len().min(u16::MAX as usize);
for offset in 0..dict.len() {
let max_len = (dict.len() - offset).min(max_match);
let mut len = 0;
while len < max_len && len < data.len() && dict[offset + len] == data[len] {
len += 1;
}
if len > best_len && len >= MIN_MATCH_LEN {
best_offset = offset;
best_len = len;
if best_len >= 64 {
break;
}
}
}
(best_offset, best_len)
}
pub fn decompress(compressed: &[u8], dict: &CompressionDict) -> DictResult<Vec<u8>> {
let header = CompressedHeader::from_bytes(compressed)
.ok_or_else(|| DictError::InvalidData("invalid header".to_string()))?;
if header.dict_id != dict.id {
return Err(DictError::DictNotFound(header.dict_id));
}
if header.original_size == 0 {
return Ok(Vec::new());
}
let compressed_data = &compressed[CompressedHeader::SIZE..];
let result = decompress_ops(compressed_data, &dict.data, header.original_size as usize)?;
let actual_checksum = compute_checksum(&result);
if actual_checksum != header.checksum {
return Err(DictError::ChecksumMismatch {
expected: header.checksum,
actual: actual_checksum,
});
}
if result.len() != header.original_size as usize {
return Err(DictError::InvalidData(alloc::format!(
"size mismatch: expected {}, got {}",
header.original_size,
result.len()
)));
}
Ok(result)
}
fn decompress_ops(compressed: &[u8], dict: &[u8], expected_size: usize) -> DictResult<Vec<u8>> {
let mut result = Vec::with_capacity(expected_size);
let mut pos = 0;
while pos < compressed.len() {
let (op, consumed) = CompressOp::decode(&compressed[pos..])
.ok_or_else(|| DictError::InvalidData("invalid operation".to_string()))?;
match op {
CompressOp::DictRef { offset, length } => {
let start = offset as usize;
let end = start + length as usize;
if end > dict.len() {
return Err(DictError::InvalidData(alloc::format!(
"dict reference out of bounds: {}..{}",
start,
end
)));
}
result.extend_from_slice(&dict[start..end]);
}
CompressOp::Literal { data } => {
result.extend_from_slice(&data);
}
}
pos += consumed;
}
Ok(result)
}
pub fn decompress_with_dict_data(compressed: &[u8], dict_data: &[u8]) -> DictResult<Vec<u8>> {
let header = CompressedHeader::from_bytes(compressed)
.ok_or_else(|| DictError::InvalidData("invalid header".to_string()))?;
if header.original_size == 0 {
return Ok(Vec::new());
}
let compressed_data = &compressed[CompressedHeader::SIZE..];
let result = decompress_ops(compressed_data, dict_data, header.original_size as usize)?;
let actual_checksum = compute_checksum(&result);
if actual_checksum != header.checksum {
return Err(DictError::ChecksumMismatch {
expected: header.checksum,
actual: actual_checksum,
});
}
Ok(result)
}
pub fn compression_ratio(data: &[u8], dict: &CompressionDict) -> f64 {
if data.is_empty() {
return 1.0;
}
let compressed = match compress(data, dict) {
Ok(c) => c,
Err(_) => return 1.0,
};
compressed.len() as f64 / data.len() as f64
}
pub fn is_beneficial(data: &[u8], dict: &CompressionDict) -> bool {
if data.len() < 100 {
return false;
}
let ratio = compression_ratio(data, dict);
ratio < 0.95 }
#[cfg(test)]
mod tests {
use super::*;
use alloc::vec;
fn make_dict(data: &[u8]) -> CompressionDict {
CompressionDict::new(1, "test", data.to_vec(), "*", "ds", 0)
}
#[test]
fn test_checksum() {
let data1 = b"hello world";
let data2 = b"hello worle";
let sum1 = compute_checksum(data1);
let sum2 = compute_checksum(data2);
assert_ne!(sum1, sum2);
assert_eq!(sum1, compute_checksum(data1));
}
#[test]
fn test_compress_empty() {
let dict = make_dict(b"some dict data");
let result = compress(&[], &dict).unwrap();
assert_eq!(result.len(), CompressedHeader::SIZE);
}
#[test]
fn test_compress_decompress_roundtrip() {
let dict = make_dict(b"hello world this is a test");
let data = b"hello world test hello";
let compressed = compress(data, &dict).unwrap();
let decompressed = decompress(&compressed, &dict).unwrap();
assert_eq!(decompressed, data);
}
#[test]
fn test_compress_with_dict_matches() {
let dict_data = b"the quick brown fox";
let dict = make_dict(dict_data);
let data = b"the quick brown fox jumps over the quick fox";
let compressed = compress(data, &dict).unwrap();
let decompressed = decompress(&compressed, &dict).unwrap();
assert_eq!(decompressed, data);
}
#[test]
fn test_compress_no_dict_match() {
let dict = make_dict(b"aaaa bbbb cccc");
let data = b"xxxx yyyy zzzz";
let compressed = compress(data, &dict).unwrap();
let decompressed = decompress(&compressed, &dict).unwrap();
assert_eq!(decompressed, data);
}
#[test]
fn test_decompress_wrong_dict() {
let dict1 = CompressionDict::new(1, "d1", vec![1, 2, 3], "*", "ds", 0);
let dict2 = CompressionDict::new(2, "d2", vec![4, 5, 6], "*", "ds", 0);
let data = b"test data";
let compressed = compress(data, &dict1).unwrap();
let result = decompress(&compressed, &dict2);
assert!(matches!(result, Err(DictError::DictNotFound(1))));
}
#[test]
fn test_compress_to_ops() {
let dict = b"hello";
let data = b"hello world hello";
let ops = compress_to_ops(data, dict);
assert!(!ops.is_empty());
let encoded = encode_ops(&ops);
assert!(!encoded.is_empty());
}
#[test]
fn test_find_best_match() {
let dict = b"hello world";
let (off1, len1) = find_best_match(dict, b"hello there");
assert_eq!(off1, 0);
assert_eq!(len1, 6);
let (off2, len2) = find_best_match(dict, b"world cup");
assert_eq!(off2, 6);
assert_eq!(len2, 5);
let (_, len3) = find_best_match(dict, b"xyz");
assert_eq!(len3, 0);
}
#[test]
fn test_compression_ratio() {
let dict = make_dict(b"repeated pattern here");
let data = b"repeated pattern here and repeated pattern here again";
let ratio = compression_ratio(data, &dict);
assert!(ratio <= 1.5); }
#[test]
fn test_is_beneficial() {
let dict = make_dict(b"common substring");
assert!(!is_beneficial(b"tiny", &dict));
let large_data = b"common substring appears multiple times in common substring data";
let _ = is_beneficial(large_data, &dict);
}
#[test]
fn test_large_data_roundtrip() {
let dict_data: Vec<u8> = (0..1000).map(|i| (i % 256) as u8).collect();
let dict = make_dict(&dict_data);
let mut data = Vec::with_capacity(10000);
for i in 0..10000 {
data.push((i % 256) as u8);
}
let compressed = compress(&data, &dict).unwrap();
let decompressed = decompress(&compressed, &dict).unwrap();
assert_eq!(decompressed, data);
}
#[test]
fn test_decompress_with_dict_data() {
let dict_data = b"test dictionary";
let dict = make_dict(dict_data);
let data = b"test dictionary is used here test";
let compressed = compress(data, &dict).unwrap();
let decompressed = decompress_with_dict_data(&compressed, dict_data).unwrap();
assert_eq!(decompressed, data);
}
}