use crate::error::{Result, ZiporaError};
use std::io::Write;
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(u8)]
pub enum DzType {
Literal = 0,
Global = 1,
RLE = 2,
NearShort = 3,
Far1Short = 4,
Far2Short = 5,
Far2Long = 6,
Far3Long = 7,
}
impl DzType {
pub fn from_u8(value: u8) -> Result<Self> {
match value {
0 => Ok(DzType::Literal),
1 => Ok(DzType::Global),
2 => Ok(DzType::RLE),
3 => Ok(DzType::NearShort),
4 => Ok(DzType::Far1Short),
5 => Ok(DzType::Far2Short),
6 => Ok(DzType::Far2Long),
7 => Ok(DzType::Far3Long),
_ => Err(ZiporaError::invalid_data(format!("Invalid DzType: {}", value))),
}
}
pub fn as_u8(self) -> u8 {
self as u8
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct DzEncodingMeta {
pub dz_type: DzType,
pub len: i8, }
pub fn get_back_ref_encoding_meta(distance: usize, length: usize) -> DzEncodingMeta {
if length == 1 {
return DzEncodingMeta {
dz_type: DzType::Literal,
len: 2,
};
}
if distance == 1 && length <= 33 {
return DzEncodingMeta {
dz_type: DzType::RLE,
len: 1,
};
}
if distance >= 2 && distance <= 9 && length <= 5 {
return DzEncodingMeta {
dz_type: DzType::NearShort,
len: 1,
};
}
if distance >= 2 && distance <= 257 && length <= 33 {
return DzEncodingMeta {
dz_type: DzType::Far1Short,
len: 2,
};
}
if distance >= 258 && distance <= 258 + 65535 && length <= 33 {
return DzEncodingMeta {
dz_type: DzType::Far2Short,
len: 3,
};
}
if distance <= 65535 && length >= 34 {
if length <= 34 + 30 {
return DzEncodingMeta {
dz_type: DzType::Far2Long,
len: 3,
};
} else {
return DzEncodingMeta {
dz_type: DzType::Far2Long,
len: 6,
};
}
}
if length <= 35 {
DzEncodingMeta {
dz_type: DzType::Far3Long,
len: 4,
}
} else {
DzEncodingMeta {
dz_type: DzType::Far3Long,
len: 7,
}
}
}
#[inline]
pub fn write_uint_bytes<W: Write>(writer: &mut W, value: u32, bytes: usize) -> Result<()> {
match bytes {
1 => writer.write_all(&[value as u8])?,
2 => writer.write_all(&(value as u16).to_le_bytes())?,
3 => {
let bytes = value.to_le_bytes();
writer.write_all(&bytes[0..3])?;
}
4 => writer.write_all(&value.to_le_bytes())?,
_ => return Err(ZiporaError::invalid_parameter(format!("Invalid byte count: {}", bytes))),
}
Ok(())
}
#[inline]
pub fn write_var_size_t<W: Write>(writer: &mut W, mut value: usize) -> Result<()> {
while value >= 128 {
writer.write_all(&[(value & 0x7F) as u8 | 0x80])?;
value >>= 7;
}
writer.write_all(&[value as u8])?;
Ok(())
}
pub struct ReferenceEncoder<W: Write> {
writer: W,
}
impl<W: Write> ReferenceEncoder<W> {
pub fn new(writer: W) -> Self {
Self { writer }
}
pub fn encode_rle(&mut self, length: usize) -> Result<()> {
debug_assert!(length >= 2 && length <= 33);
let encoded = DzType::RLE.as_u8() | (((length - 2) << 3) as u8);
self.writer.write_all(&[encoded])?;
Ok(())
}
pub fn encode_near_short(&mut self, distance: usize, length: usize) -> Result<()> {
debug_assert!(distance >= 2 && distance <= 9);
debug_assert!(length >= 2 && length <= 5);
let encoded = DzType::NearShort.as_u8()
| (((length - 2) << 3) as u8)
| (((distance - 2) << 5) as u8);
self.writer.write_all(&[encoded])?;
Ok(())
}
pub fn encode_far1_short(&mut self, distance: usize, length: usize) -> Result<()> {
debug_assert!(distance >= 2 && distance <= 257);
debug_assert!(length >= 2 && length <= 33);
let encoded = DzType::Far1Short.as_u8() | (((length - 2) << 3) as u8);
self.writer.write_all(&[encoded])?;
self.writer.write_all(&[(distance - 2) as u8])?;
Ok(())
}
pub fn encode_far2_short(&mut self, distance: usize, length: usize) -> Result<()> {
debug_assert!(distance >= 258 && distance <= 258 + 65535);
debug_assert!(length >= 2 && length <= 33);
let encoded = DzType::Far2Short.as_u8() | (((length - 2) << 3) as u8);
self.writer.write_all(&[encoded])?;
write_uint_bytes(&mut self.writer, (distance - 258) as u32, 2)?;
Ok(())
}
pub fn encode_far2_long(&mut self, distance: usize, length: usize) -> Result<()> {
debug_assert!(distance <= 65535);
debug_assert!(length >= 34);
if length <= 34 + 30 && (length - 34) <= 31 {
let encoded = DzType::Far2Long.as_u8() | (((length - 34) << 3) as u8);
self.writer.write_all(&[encoded])?;
} else {
let encoded = DzType::Far2Long.as_u8() | (31 << 3);
self.writer.write_all(&[encoded])?;
write_var_size_t(&mut self.writer, length - 65)?;
}
write_uint_bytes(&mut self.writer, distance as u32, 2)?;
Ok(())
}
pub fn encode_far3_long(&mut self, distance: usize, length: usize) -> Result<()> {
debug_assert!(distance < (1 << 24));
debug_assert!(length >= 5);
if length <= 35 && (length - 5) <= 31 {
let encoded = DzType::Far3Long.as_u8() | (((length - 5) << 3) as u8);
self.writer.write_all(&[encoded])?;
} else {
let encoded = DzType::Far3Long.as_u8() | (31 << 3);
self.writer.write_all(&[encoded])?;
write_var_size_t(&mut self.writer, length - 36)?;
}
write_uint_bytes(&mut self.writer, distance as u32, 3)?;
Ok(())
}
pub fn encode_global(
&mut self,
dict_position: u32,
length: usize,
g_offset_bits: usize,
g_max_short_len: usize
) -> Result<()> {
debug_assert!(length >= 6);
let enc_len = length - 6;
if length <= g_max_short_len && enc_len <= 31 {
let encoded = DzType::Global.as_u8() | ((enc_len << 3) as u8);
self.writer.write_all(&[encoded])?;
if g_offset_bits < 24 {
let offset_encoded = (dict_position << (24 - g_offset_bits)) | ((enc_len >> 5) as u32);
write_uint_bytes(&mut self.writer, offset_encoded, 3)?;
} else if g_offset_bits > 24 {
let offset_encoded = (dict_position << (32 - g_offset_bits)) | ((enc_len >> 5) as u32);
write_uint_bytes(&mut self.writer, offset_encoded, 4)?;
} else {
write_uint_bytes(&mut self.writer, dict_position, 3)?;
}
} else {
let encoded = DzType::Global.as_u8() | (31 << 3);
self.writer.write_all(&[encoded])?;
if g_offset_bits < 24 {
let offset_encoded = (dict_position << (24 - g_offset_bits)) | (0x00FFFFFF >> g_offset_bits);
write_uint_bytes(&mut self.writer, offset_encoded, 3)?;
} else if g_offset_bits > 24 {
let offset_encoded = (dict_position << (32 - g_offset_bits)) | (0xFFFFFFFF >> g_offset_bits);
write_uint_bytes(&mut self.writer, offset_encoded, 4)?;
} else {
write_uint_bytes(&mut self.writer, dict_position, 3)?;
}
write_var_size_t(&mut self.writer, length - g_max_short_len - 1)?;
}
Ok(())
}
pub fn encode_literal(&mut self, literal_data: &[u8]) -> Result<()> {
let mut remaining = literal_data.len();
let mut pos = 0;
while remaining >= 32 {
let encoded = DzType::Literal.as_u8() | (31 << 3);
self.writer.write_all(&[encoded])?;
self.writer.write_all(&literal_data[pos..pos + 32])?;
pos += 32;
remaining -= 32;
}
if remaining > 0 {
let encoded = DzType::Literal.as_u8() | (((remaining - 1) << 3) as u8);
self.writer.write_all(&[encoded])?;
self.writer.write_all(&literal_data[pos..pos + remaining])?;
}
Ok(())
}
pub fn into_writer(self) -> W {
self.writer
}
}
#[derive(Debug)]
struct LocalMatchHashTable {
table: Vec<u32>,
table_size: usize,
hash_mask: u32,
base_position: usize,
}
impl LocalMatchHashTable {
fn new(log_size: u8) -> Self {
let table_size = 1 << log_size;
let hash_mask = (table_size - 1) as u32;
Self {
table: vec![0; table_size],
table_size,
hash_mask,
base_position: 0,
}
}
#[inline]
fn hash_3bytes(data: &[u8], pos: usize) -> u32 {
if pos + 2 >= data.len() {
return 0;
}
let b0 = data[pos] as u32;
let b1 = data[pos + 1] as u32;
let b2 = data[pos + 2] as u32;
((b0 << 16) | (b1 << 8) | b2).wrapping_mul(0x1e35a7bd)
}
#[inline]
fn insert(&mut self, data: &[u8], pos: usize) {
let hash = Self::hash_3bytes(data, pos) & self.hash_mask;
self.table[hash as usize] = (pos + 1) as u32;
}
#[inline]
fn find_match(&self, data: &[u8], pos: usize, max_distance: usize) -> Option<(usize, usize)> {
if pos + 2 >= data.len() {
return None;
}
let hash = Self::hash_3bytes(data, pos) & self.hash_mask;
let stored_pos_plus_one = self.table[hash as usize];
if stored_pos_plus_one == 0 {
return None; }
let match_pos = (stored_pos_plus_one - 1) as usize;
if match_pos >= pos || pos - match_pos > max_distance {
return None;
}
let match_len = self.find_match_length(data, match_pos, pos);
if match_len >= 3 {
Some((pos - match_pos, match_len))
} else {
None
}
}
#[inline]
fn find_match_length(&self, data: &[u8], pos1: usize, pos2: usize) -> usize {
let max_len = (data.len() - pos2).min(255); let mut len = 0;
while len + 8 <= max_len && pos1 + len + 8 <= data.len() && pos2 + len + 8 <= data.len() {
let chunk1 = u64::from_le_bytes([
data[pos1 + len], data[pos1 + len + 1], data[pos1 + len + 2], data[pos1 + len + 3],
data[pos1 + len + 4], data[pos1 + len + 5], data[pos1 + len + 6], data[pos1 + len + 7],
]);
let chunk2 = u64::from_le_bytes([
data[pos2 + len], data[pos2 + len + 1], data[pos2 + len + 2], data[pos2 + len + 3],
data[pos2 + len + 4], data[pos2 + len + 5], data[pos2 + len + 6], data[pos2 + len + 7],
]);
if chunk1 != chunk2 {
break;
}
len += 8;
}
while len < max_len && pos1 + len < data.len() && pos2 + len < data.len()
&& data[pos1 + len] == data[pos2 + len] {
len += 1;
}
len
}
#[inline]
fn update_base_position(&mut self, new_base: usize) {
self.base_position = new_base;
}
}
pub fn compress_record_reference<W: Write>(
input_data: &[u8],
writer: &mut W,
use_suffix_array_local_match: bool,
global_dictionary: Option<&[u8]>,
g_offset_bits: usize,
g_max_short_len: usize,
) -> Result<usize> {
if use_suffix_array_local_match {
compress_record_with_suffix_array(input_data, writer, global_dictionary, g_offset_bits, g_max_short_len)
} else {
compress_record_with_hash_table(input_data, writer, global_dictionary, g_offset_bits, g_max_short_len)
}
}
#[inline]
fn compress_record_with_hash_table<W: Write>(
input_data: &[u8],
writer: &mut W,
global_dictionary: Option<&[u8]>,
g_offset_bits: usize,
g_max_short_len: usize,
) -> Result<usize> {
if input_data.is_empty() {
return Ok(0);
}
let mut encoder = ReferenceEncoder::new(writer);
let mut hash_table = LocalMatchHashTable::new(16); let mut pos = 0;
let mut literal_len = 0;
let mut literal_start = 0;
while pos < input_data.len() {
let mut best_match: Option<(usize, usize, DzType)> = None;
let mut best_cost = i32::MAX;
if let Some((distance, length)) = hash_table.find_match(input_data, pos, 16777215) {
if length >= 2 {
let meta = get_back_ref_encoding_meta(distance, length);
let cost = meta.len as i32;
let benefit = (length as i32 * 8) - (cost * 8);
if benefit > best_cost {
best_match = Some((distance, length, meta.dz_type));
best_cost = benefit;
}
}
}
if let Some(dict_data) = global_dictionary {
if let Some((dict_offset, length)) = find_global_match(input_data, pos, dict_data) {
if length >= 6 { let cost = if length <= g_max_short_len { 3 } else { 6 }; let benefit = (length as i32 * 8) - (cost * 8);
if benefit > best_cost {
best_match = Some((dict_offset, length, DzType::Global));
best_cost = benefit;
}
}
}
}
if let Some((distance_or_offset, length, dz_type)) = best_match {
if best_cost > 0 { if literal_len > 0 {
encoder.encode_literal(&input_data[literal_start..literal_start + literal_len])?;
literal_len = 0;
}
match dz_type {
DzType::RLE => {
encoder.encode_rle(length)?;
},
DzType::NearShort => {
encoder.encode_near_short(distance_or_offset, length)?;
},
DzType::Far1Short => {
encoder.encode_far1_short(distance_or_offset, length)?;
},
DzType::Far2Short => {
encoder.encode_far2_short(distance_or_offset, length)?;
},
DzType::Far2Long => {
encoder.encode_far2_long(distance_or_offset, length)?;
},
DzType::Far3Long => {
encoder.encode_far3_long(distance_or_offset, length)?;
},
DzType::Global => {
encoder.encode_global(
distance_or_offset as u32,
length,
g_offset_bits,
g_max_short_len
)?;
},
DzType::Literal => {
if literal_len == 0 {
literal_start = pos;
}
literal_len += 1;
},
}
for i in 0..length {
if pos + i + 2 < input_data.len() {
hash_table.insert(input_data, pos + i);
}
}
pos += length;
continue;
}
}
if literal_len == 0 {
literal_start = pos;
}
literal_len += 1;
hash_table.insert(input_data, pos);
pos += 1;
}
if literal_len > 0 {
encoder.encode_literal(&input_data[literal_start..literal_start + literal_len])?;
}
Ok(pos)
}
#[inline]
fn compress_record_with_suffix_array<W: Write>(
input_data: &[u8],
writer: &mut W,
global_dictionary: Option<&[u8]>,
g_offset_bits: usize,
g_max_short_len: usize,
) -> Result<usize> {
if input_data.is_empty() {
return Ok(0);
}
let mut encoder = ReferenceEncoder::new(writer);
let suffix_array = build_suffix_array(input_data)?;
let mut pos = 0;
let mut literal_len = 0;
let mut literal_start = 0;
while pos < input_data.len() {
let mut best_match: Option<(usize, usize, DzType)> = None;
let mut best_cost = i32::MAX;
if let Some((distance, length)) = find_suffix_array_match(input_data, pos, &suffix_array, 30) {
if length >= 2 {
let meta = get_back_ref_encoding_meta(distance, length);
let cost = meta.len as i32;
let benefit = (length as i32 * 8) - (cost * 8);
if benefit > best_cost {
best_match = Some((distance, length, meta.dz_type));
best_cost = benefit;
}
}
}
if let Some(dict_data) = global_dictionary {
if let Some((dict_offset, length)) = find_global_match(input_data, pos, dict_data) {
if length >= 6 { let cost = if length <= g_max_short_len { 3 } else { 6 }; let benefit = (length as i32 * 8) - (cost * 8);
if benefit > best_cost {
best_match = Some((dict_offset, length, DzType::Global));
best_cost = benefit;
}
}
}
}
if let Some((distance_or_offset, length, dz_type)) = best_match {
if best_cost > 0 { if literal_len > 0 {
encoder.encode_literal(&input_data[literal_start..literal_start + literal_len])?;
literal_len = 0;
}
match dz_type {
DzType::RLE => {
encoder.encode_rle(length)?;
},
DzType::NearShort => {
encoder.encode_near_short(distance_or_offset, length)?;
},
DzType::Far1Short => {
encoder.encode_far1_short(distance_or_offset, length)?;
},
DzType::Far2Short => {
encoder.encode_far2_short(distance_or_offset, length)?;
},
DzType::Far2Long => {
encoder.encode_far2_long(distance_or_offset, length)?;
},
DzType::Far3Long => {
encoder.encode_far3_long(distance_or_offset, length)?;
},
DzType::Global => {
encoder.encode_global(
distance_or_offset as u32,
length,
g_offset_bits,
g_max_short_len
)?;
},
DzType::Literal => {
if literal_len == 0 {
literal_start = pos;
}
literal_len += 1;
},
}
pos += length;
continue;
}
}
if literal_len == 0 {
literal_start = pos;
}
literal_len += 1;
pos += 1;
}
if literal_len > 0 {
encoder.encode_literal(&input_data[literal_start..literal_start + literal_len])?;
}
Ok(pos)
}
fn build_suffix_array(data: &[u8]) -> Result<Vec<u32>> {
let n = data.len();
if n == 0 {
return Ok(Vec::new());
}
let mut suffixes: Vec<(u32, &[u8])> = (0..n as u32)
.map(|i| (i, &data[i as usize..]))
.collect();
suffixes.sort_unstable_by(|a, b| a.1.cmp(b.1));
Ok(suffixes.into_iter().map(|(i, _)| i).collect())
}
fn find_suffix_array_match(
data: &[u8],
pos: usize,
suffix_array: &[u32],
max_probe: usize
) -> Option<(usize, usize)> {
if pos + 2 >= data.len() || suffix_array.is_empty() {
return None;
}
let pattern = &data[pos..];
let mut best_match: Option<(usize, usize)> = None;
let mut probes = 0;
let mut left = 0;
let mut right = suffix_array.len();
while left < right && probes < max_probe {
let mid = (left + right) / 2;
let suffix_pos = suffix_array[mid] as usize;
if suffix_pos >= pos || suffix_pos + 2 >= data.len() {
right = mid;
probes += 1;
continue;
}
let suffix = &data[suffix_pos..];
let cmp = pattern.cmp(suffix);
match cmp {
std::cmp::Ordering::Equal => {
let distance = pos - suffix_pos;
if distance > 0 && distance < (1 << 24) {
let length = find_match_length(data, suffix_pos, pos);
if length >= 2 {
best_match = Some((distance, length));
}
}
break;
},
std::cmp::Ordering::Less => {
right = mid;
},
std::cmp::Ordering::Greater => {
left = mid + 1;
}
}
probes += 1;
}
let search_start = left.saturating_sub(max_probe / 2);
let search_end = (left + max_probe / 2).min(suffix_array.len());
for i in search_start..search_end {
if probes >= max_probe {
break;
}
let suffix_pos = suffix_array[i] as usize;
if suffix_pos >= pos || suffix_pos + 2 >= data.len() {
probes += 1;
continue;
}
let distance = pos - suffix_pos;
if distance > 0 && distance < (1 << 24) {
let length = find_match_length(data, suffix_pos, pos);
if length >= 2 {
if best_match.is_none() || length > best_match.expect("best_match set by condition").1 {
best_match = Some((distance, length));
}
}
}
probes += 1;
}
best_match
}
#[inline]
fn find_match_length(data: &[u8], pos1: usize, pos2: usize) -> usize {
let max_len = (data.len() - pos2).min(255); let mut len = 0;
while len + 8 <= max_len && pos1 + len + 8 <= data.len() && pos2 + len + 8 <= data.len() {
let chunk1 = u64::from_le_bytes([
data[pos1 + len], data[pos1 + len + 1], data[pos1 + len + 2], data[pos1 + len + 3],
data[pos1 + len + 4], data[pos1 + len + 5], data[pos1 + len + 6], data[pos1 + len + 7],
]);
let chunk2 = u64::from_le_bytes([
data[pos2 + len], data[pos2 + len + 1], data[pos2 + len + 2], data[pos2 + len + 3],
data[pos2 + len + 4], data[pos2 + len + 5], data[pos2 + len + 6], data[pos2 + len + 7],
]);
if chunk1 != chunk2 {
break;
}
len += 8;
}
while len < max_len && pos1 + len < data.len() && pos2 + len < data.len()
&& data[pos1 + len] == data[pos2 + len] {
len += 1;
}
len
}
fn find_global_match(input_data: &[u8], pos: usize, dict_data: &[u8]) -> Option<(usize, usize)> {
if pos >= input_data.len() || dict_data.is_empty() {
return None;
}
let remaining = &input_data[pos..];
let mut best_match: Option<(usize, usize)> = None;
for dict_pos in 0..dict_data.len() {
if dict_pos + 6 > dict_data.len() {
break; }
let max_len = (dict_data.len() - dict_pos).min(remaining.len()).min(255);
let mut match_len = 0;
while match_len < max_len && dict_data[dict_pos + match_len] == remaining[match_len] {
match_len += 1;
}
if match_len >= 6 { if best_match.is_none() || match_len > best_match.expect("best_match set by condition").1 {
best_match = Some((dict_pos, match_len));
}
}
}
best_match
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_dz_type_conversions() {
assert_eq!(DzType::Literal.as_u8(), 0);
assert_eq!(DzType::Global.as_u8(), 1);
assert_eq!(DzType::RLE.as_u8(), 2);
assert_eq!(DzType::NearShort.as_u8(), 3);
assert_eq!(DzType::Far1Short.as_u8(), 4);
assert_eq!(DzType::Far2Short.as_u8(), 5);
assert_eq!(DzType::Far2Long.as_u8(), 6);
assert_eq!(DzType::Far3Long.as_u8(), 7);
assert_eq!(DzType::from_u8(0).unwrap(), DzType::Literal);
assert_eq!(DzType::from_u8(7).unwrap(), DzType::Far3Long);
assert!(DzType::from_u8(8).is_err());
}
#[test]
fn test_get_back_ref_encoding_meta_reference() {
let meta = get_back_ref_encoding_meta(0, 1);
assert_eq!(meta.dz_type, DzType::Literal);
assert_eq!(meta.len, 2);
let meta = get_back_ref_encoding_meta(1, 5);
assert_eq!(meta.dz_type, DzType::RLE);
assert_eq!(meta.len, 1);
let meta = get_back_ref_encoding_meta(1, 33);
assert_eq!(meta.dz_type, DzType::RLE);
assert_eq!(meta.len, 1);
let meta = get_back_ref_encoding_meta(2, 5);
assert_eq!(meta.dz_type, DzType::NearShort);
assert_eq!(meta.len, 1);
let meta = get_back_ref_encoding_meta(9, 5);
assert_eq!(meta.dz_type, DzType::NearShort);
assert_eq!(meta.len, 1);
let meta = get_back_ref_encoding_meta(10, 20);
assert_eq!(meta.dz_type, DzType::Far1Short);
assert_eq!(meta.len, 2);
let meta = get_back_ref_encoding_meta(257, 33);
assert_eq!(meta.dz_type, DzType::Far1Short);
assert_eq!(meta.len, 2);
let meta = get_back_ref_encoding_meta(258, 20);
assert_eq!(meta.dz_type, DzType::Far2Short);
assert_eq!(meta.len, 3);
let meta = get_back_ref_encoding_meta(258 + 65535, 33);
assert_eq!(meta.dz_type, DzType::Far2Short);
assert_eq!(meta.len, 3);
let meta = get_back_ref_encoding_meta(30000, 50); assert_eq!(meta.dz_type, DzType::Far2Long);
assert_eq!(meta.len, 3);
let meta = get_back_ref_encoding_meta(30000, 100); assert_eq!(meta.dz_type, DzType::Far2Long);
assert_eq!(meta.len, 6);
let meta = get_back_ref_encoding_meta(1000000, 35); assert_eq!(meta.dz_type, DzType::Far3Long);
assert_eq!(meta.len, 4);
let meta = get_back_ref_encoding_meta(1000000, 100); assert_eq!(meta.dz_type, DzType::Far3Long);
assert_eq!(meta.len, 7);
}
#[test]
fn test_reference_encoder() {
let mut buffer = Vec::new();
{
let mut encoder = ReferenceEncoder::new(&mut buffer);
encoder.encode_rle(5).unwrap();
}
assert_eq!(buffer.len(), 1);
let encoded = buffer[0];
assert_eq!(encoded & 0x7, DzType::RLE.as_u8()); assert_eq!(encoded >> 3, 5 - 2);
buffer.clear();
{
let mut encoder = ReferenceEncoder::new(&mut buffer);
encoder.encode_near_short(3, 4).unwrap();
}
assert_eq!(buffer.len(), 1);
let encoded = buffer[0];
assert_eq!(encoded & 0x7, DzType::NearShort.as_u8()); assert_eq!((encoded >> 3) & 0x3, 4 - 2); assert_eq!(encoded >> 5, 3 - 2); }
#[test]
fn test_write_uint_bytes() {
let mut buffer = Vec::new();
write_uint_bytes(&mut buffer, 0x12, 1).unwrap();
assert_eq!(buffer, vec![0x12]);
buffer.clear();
write_uint_bytes(&mut buffer, 0x1234, 2).unwrap();
assert_eq!(buffer, vec![0x34, 0x12]);
buffer.clear();
write_uint_bytes(&mut buffer, 0x123456, 3).unwrap();
assert_eq!(buffer, vec![0x56, 0x34, 0x12]);
buffer.clear();
write_uint_bytes(&mut buffer, 0x12345678, 4).unwrap();
assert_eq!(buffer, vec![0x78, 0x56, 0x34, 0x12]); }
#[test]
fn test_write_var_size_t() {
let mut buffer = Vec::new();
write_var_size_t(&mut buffer, 42).unwrap();
assert_eq!(buffer, vec![42]);
buffer.clear();
write_var_size_t(&mut buffer, 300).unwrap();
assert_eq!(buffer, vec![(300 & 0x7F | 0x80) as u8, (300 >> 7) as u8]);
buffer.clear();
write_var_size_t(&mut buffer, 16384).unwrap();
assert_eq!(buffer, vec![0x80, 0x80, 0x01]); }
#[test]
fn test_local_match_hash_table() {
let mut hash_table = LocalMatchHashTable::new(8); let test_data = b"abcdefghijklmnopqrstuvwxyzabcdefghijklmnop";
hash_table.insert(test_data, 0); hash_table.insert(test_data, 10);
if let Some((distance, length)) = hash_table.find_match(test_data, 26, 100) {
assert_eq!(distance, 26); assert!(length >= 3); } else {
panic!("Expected to find match for repeated pattern");
}
let _no_match = hash_table.find_match(test_data, 5, 100);
}
#[test]
fn test_find_global_match() {
let input_data = b"the quick brown fox jumps over the lazy dog";
let dict_data = b"the quick brown fox";
if let Some((offset, length)) = find_global_match(input_data, 0, dict_data) {
assert_eq!(offset, 0);
assert_eq!(length, dict_data.len());
} else {
panic!("Expected to find global match at beginning");
}
if let Some((offset, length)) = find_global_match(input_data, 31, dict_data) {
assert_eq!(offset, 0); assert!(length >= 3);
}
let no_match = find_global_match(b"xyz", 0, dict_data);
assert!(no_match.is_none() || no_match.unwrap().1 < 6); }
#[test]
fn test_compress_record_reference_simple() {
let input_data = b"hello world hello world";
let mut output = Vec::new();
let result = compress_record_reference(
input_data,
&mut output,
false, None, 0, 0, );
assert!(result.is_ok());
assert!(!output.is_empty());
println!("Input: {} bytes", input_data.len());
println!("Output: {} bytes", output.len());
println!("Compression ratio: {:.2}", output.len() as f64 / input_data.len() as f64);
}
#[test]
fn test_compress_record_reference_with_dictionary() {
let input_data = b"the quick brown fox jumps over the lazy dog";
let dictionary = b"the quick brown fox jumps over the lazy";
let mut output = Vec::new();
let result = compress_record_reference(
input_data,
&mut output,
false, Some(dictionary),
24, 32, );
assert!(result.is_ok());
assert!(!output.is_empty());
println!("Input: {} bytes", input_data.len());
println!("Output: {} bytes", output.len());
println!("Compression ratio: {:.2}", output.len() as f64 / input_data.len() as f64);
assert!(output.len() <= input_data.len() + 10); }
#[test]
fn test_compress_record_reference_rle_pattern() {
let input_data = b"aaaaaaaaaa"; let mut output = Vec::new();
let result = compress_record_reference(
input_data,
&mut output,
false,
None,
0,
0,
);
assert!(result.is_ok());
assert!(!output.is_empty());
println!("RLE Input: {} bytes", input_data.len());
println!("RLE Output: {} bytes", output.len());
assert!(output.len() <= input_data.len() + 5); }
#[test]
fn test_compress_record_reference_literal_only() {
let input_data = b"abcdefghijklmnopqrstuvwxyz"; let mut output = Vec::new();
let result = compress_record_reference(
input_data,
&mut output,
false,
None,
0,
0,
);
assert!(result.is_ok());
assert!(!output.is_empty());
println!("Literal Input: {} bytes", input_data.len());
println!("Literal Output: {} bytes", output.len());
}
#[test]
fn test_compress_record_various_compression_types() {
let test_cases = vec![
(b"aaaaaa".as_slice(), "RLE pattern"),
(b"abcabcabc".as_slice(), "Near short match pattern"),
(b"hello world hello world hello".as_slice(), "Repeated phrase"),
(b"abcdefghijklmnopqrstuvwxyz0123456789".as_slice(), "Mixed content"),
];
for (input_data, description) in test_cases {
let mut output = Vec::new();
let result = compress_record_reference(
input_data,
&mut output,
false,
None,
0,
0,
);
assert!(result.is_ok(), "Compression failed for: {}", description);
assert!(!output.is_empty(), "No output for: {}", description);
println!("{}: {} -> {} bytes (ratio: {:.2})",
description,
input_data.len(),
output.len(),
output.len() as f64 / input_data.len() as f64);
}
}
#[test]
fn test_literal_encoding() {
let mut buffer = Vec::new();
let data = b"hello";
{
let mut encoder = ReferenceEncoder::new(&mut buffer);
encoder.encode_literal(data).unwrap();
}
assert_eq!(buffer.len(), 1 + data.len());
let encoded = buffer[0];
assert_eq!(encoded & 0x7, DzType::Literal.as_u8());
assert_eq!(encoded >> 3, (data.len() - 1) as u8);
assert_eq!(&buffer[1..], data);
buffer.clear();
let large_data = vec![b'A'; 64];
{
let mut encoder = ReferenceEncoder::new(&mut buffer);
encoder.encode_literal(&large_data).unwrap();
}
assert_eq!(buffer.len(), 2 + 64); }
#[test]
fn test_compression_type_boundaries() {
assert_eq!(get_back_ref_encoding_meta(1, 2).dz_type, DzType::RLE);
assert_eq!(get_back_ref_encoding_meta(1, 33).dz_type, DzType::RLE);
assert_ne!(get_back_ref_encoding_meta(2, 2).dz_type, DzType::RLE);
assert_ne!(get_back_ref_encoding_meta(1, 34).dz_type, DzType::RLE);
assert_eq!(get_back_ref_encoding_meta(2, 2).dz_type, DzType::NearShort);
assert_eq!(get_back_ref_encoding_meta(9, 5).dz_type, DzType::NearShort);
assert_ne!(get_back_ref_encoding_meta(10, 2).dz_type, DzType::NearShort);
assert_ne!(get_back_ref_encoding_meta(2, 6).dz_type, DzType::NearShort);
assert_eq!(get_back_ref_encoding_meta(10, 6).dz_type, DzType::Far1Short); assert_eq!(get_back_ref_encoding_meta(257, 33).dz_type, DzType::Far1Short);
assert_ne!(get_back_ref_encoding_meta(258, 33).dz_type, DzType::Far1Short);
assert_eq!(get_back_ref_encoding_meta(258, 33).dz_type, DzType::Far2Short);
assert_eq!(get_back_ref_encoding_meta(258 + 65535, 33).dz_type, DzType::Far2Short);
assert_ne!(get_back_ref_encoding_meta(258, 34).dz_type, DzType::Far2Short);
assert_eq!(get_back_ref_encoding_meta(65535, 34).dz_type, DzType::Far2Long);
assert_eq!(get_back_ref_encoding_meta(1000, 100).dz_type, DzType::Far2Long);
assert_eq!(get_back_ref_encoding_meta(65536, 34).dz_type, DzType::Far3Long);
assert_eq!(get_back_ref_encoding_meta(1000000, 35).dz_type, DzType::Far3Long);
assert_eq!(get_back_ref_encoding_meta(1000000, 100).dz_type, DzType::Far3Long);
}
#[test]
fn test_encoding_length_calculations() {
assert_eq!(get_back_ref_encoding_meta(1, 2).len, 1);
assert_eq!(get_back_ref_encoding_meta(1, 33).len, 1);
assert_eq!(get_back_ref_encoding_meta(2, 2).len, 1);
assert_eq!(get_back_ref_encoding_meta(9, 5).len, 1);
assert_eq!(get_back_ref_encoding_meta(10, 20).len, 2);
assert_eq!(get_back_ref_encoding_meta(257, 33).len, 2);
assert_eq!(get_back_ref_encoding_meta(258, 20).len, 3);
assert_eq!(get_back_ref_encoding_meta(65000, 33).len, 3);
assert_eq!(get_back_ref_encoding_meta(30000, 34).len, 3); assert_eq!(get_back_ref_encoding_meta(30000, 64).len, 3); assert_eq!(get_back_ref_encoding_meta(30000, 65).len, 6);
assert_eq!(get_back_ref_encoding_meta(1000000, 35).len, 4); assert_eq!(get_back_ref_encoding_meta(1000000, 36).len, 7); }
#[test]
fn test_all_encoder_types() {
let mut buffer = Vec::new();
{
buffer.clear();
let mut encoder = ReferenceEncoder::new(&mut buffer);
encoder.encode_rle(2).unwrap(); assert_eq!(buffer.len(), 1);
let encoded = buffer[0];
assert_eq!(encoded & 0x7, DzType::RLE.as_u8());
assert_eq!(encoded >> 3, 0); }
{
buffer.clear();
let mut encoder = ReferenceEncoder::new(&mut buffer);
encoder.encode_rle(33).unwrap(); assert_eq!(buffer.len(), 1);
let encoded = buffer[0];
assert_eq!(encoded & 0x7, DzType::RLE.as_u8());
assert_eq!(encoded >> 3, 31); }
{
buffer.clear();
let mut encoder = ReferenceEncoder::new(&mut buffer);
encoder.encode_near_short(2, 2).unwrap(); assert_eq!(buffer.len(), 1);
let encoded = buffer[0];
assert_eq!(encoded & 0x7, DzType::NearShort.as_u8());
assert_eq!((encoded >> 3) & 0x3, 0); assert_eq!(encoded >> 5, 0); }
{
buffer.clear();
let mut encoder = ReferenceEncoder::new(&mut buffer);
encoder.encode_near_short(9, 5).unwrap(); assert_eq!(buffer.len(), 1);
let encoded = buffer[0];
assert_eq!(encoded & 0x7, DzType::NearShort.as_u8());
assert_eq!((encoded >> 3) & 0x3, 3); assert_eq!(encoded >> 5, 7); }
{
buffer.clear();
let mut encoder = ReferenceEncoder::new(&mut buffer);
encoder.encode_far1_short(2, 2).unwrap(); assert_eq!(buffer.len(), 2);
let encoded = buffer[0];
assert_eq!(encoded & 0x7, DzType::Far1Short.as_u8());
assert_eq!(encoded >> 3, 0); assert_eq!(buffer[1], 0); }
{
buffer.clear();
let mut encoder = ReferenceEncoder::new(&mut buffer);
encoder.encode_far1_short(257, 33).unwrap(); assert_eq!(buffer.len(), 2);
let encoded = buffer[0];
assert_eq!(encoded & 0x7, DzType::Far1Short.as_u8());
assert_eq!(encoded >> 3, 31); assert_eq!(buffer[1], 255); }
{
buffer.clear();
let mut encoder = ReferenceEncoder::new(&mut buffer);
encoder.encode_far2_short(258, 2).unwrap(); assert_eq!(buffer.len(), 3);
let encoded = buffer[0];
assert_eq!(encoded & 0x7, DzType::Far2Short.as_u8());
assert_eq!(encoded >> 3, 0); assert_eq!(u16::from_le_bytes([buffer[1], buffer[2]]), 0);
}
{
buffer.clear();
let mut encoder = ReferenceEncoder::new(&mut buffer);
encoder.encode_far2_long(1000, 34).unwrap(); assert_eq!(buffer.len(), 3);
let encoded = buffer[0];
assert_eq!(encoded & 0x7, DzType::Far2Long.as_u8());
assert_eq!(encoded >> 3, 0); assert_eq!(u16::from_le_bytes([buffer[1], buffer[2]]), 1000);
}
{
buffer.clear();
let mut encoder = ReferenceEncoder::new(&mut buffer);
encoder.encode_far3_long(1000000, 5).unwrap(); assert_eq!(buffer.len(), 4);
let encoded = buffer[0];
assert_eq!(encoded & 0x7, DzType::Far3Long.as_u8());
assert_eq!(encoded >> 3, 0); let distance_bytes = [buffer[1], buffer[2], buffer[3]];
let distance = u32::from_le_bytes([distance_bytes[0], distance_bytes[1], distance_bytes[2], 0]);
assert_eq!(distance, 1000000);
}
}
#[test]
fn test_variable_length_encoding() {
let mut buffer = Vec::new();
write_var_size_t(&mut buffer, 0).unwrap();
assert_eq!(buffer, vec![0]);
buffer.clear();
write_var_size_t(&mut buffer, 127).unwrap();
assert_eq!(buffer, vec![127]);
buffer.clear();
write_var_size_t(&mut buffer, 128).unwrap();
assert_eq!(buffer, vec![0x80, 0x01]);
buffer.clear();
write_var_size_t(&mut buffer, 255).unwrap();
assert_eq!(buffer, vec![0xFF, 0x01]);
buffer.clear();
write_var_size_t(&mut buffer, 16384).unwrap();
assert_eq!(buffer, vec![0x80, 0x80, 0x01]);
buffer.clear();
write_var_size_t(&mut buffer, 2097151).unwrap(); assert_eq!(buffer, vec![0xFF, 0xFF, 0x7F]);
}
#[test]
fn test_uint_bytes_encoding() {
let mut buffer = Vec::new();
write_uint_bytes(&mut buffer, 0x42, 1).unwrap();
assert_eq!(buffer, vec![0x42]);
buffer.clear();
write_uint_bytes(&mut buffer, 0x1234, 2).unwrap();
assert_eq!(buffer, vec![0x34, 0x12]);
buffer.clear();
write_uint_bytes(&mut buffer, 0x123456, 3).unwrap();
assert_eq!(buffer, vec![0x56, 0x34, 0x12]);
buffer.clear();
write_uint_bytes(&mut buffer, 0x12345678, 4).unwrap();
assert_eq!(buffer, vec![0x78, 0x56, 0x34, 0x12]);
buffer.clear();
write_uint_bytes(&mut buffer, 0xFF, 1).unwrap();
assert_eq!(buffer, vec![0xFF]);
buffer.clear();
write_uint_bytes(&mut buffer, 0xFFFF, 2).unwrap();
assert_eq!(buffer, vec![0xFF, 0xFF]);
buffer.clear();
write_uint_bytes(&mut buffer, 0xFFFFFF, 3).unwrap();
assert_eq!(buffer, vec![0xFF, 0xFF, 0xFF]);
}
#[test]
fn test_global_encoding_boundary_conditions() {
let mut buffer = Vec::new();
{
buffer.clear();
let mut encoder = ReferenceEncoder::new(&mut buffer);
encoder.encode_global(1000, 6, 20, 32).unwrap();
assert!(!buffer.is_empty());
let encoded = buffer[0];
assert_eq!(encoded & 0x7, DzType::Global.as_u8());
}
{
buffer.clear();
let mut encoder = ReferenceEncoder::new(&mut buffer);
encoder.encode_global(1000, 6, 24, 32).unwrap();
assert!(!buffer.is_empty());
let encoded = buffer[0];
assert_eq!(encoded & 0x7, DzType::Global.as_u8());
}
{
buffer.clear();
let mut encoder = ReferenceEncoder::new(&mut buffer);
encoder.encode_global(1000, 6, 28, 32).unwrap();
assert!(!buffer.is_empty());
let encoded = buffer[0];
assert_eq!(encoded & 0x7, DzType::Global.as_u8());
}
}
#[test]
fn test_edge_case_compression_scenarios() {
let rle_data = vec![b'A'; 20];
let mut output = Vec::new();
let result = compress_record_reference(&rle_data, &mut output, false, None, 0, 0);
assert!(result.is_ok());
assert!(!output.is_empty());
let pattern_data = b"abababababab";
let mut output = Vec::new();
let result = compress_record_reference(pattern_data, &mut output, false, None, 0, 0);
assert!(result.is_ok());
assert!(!output.is_empty());
let dictionary = b"common pattern text";
let input_data = b"common pattern";
let mut output = Vec::new();
let result = compress_record_reference(input_data, &mut output, false, Some(dictionary), 20, 32);
assert!(result.is_ok());
assert!(!output.is_empty());
let empty_data = b"";
let mut output = Vec::new();
let result = compress_record_reference(empty_data, &mut output, false, None, 0, 0);
assert!(result.is_ok());
assert!(output.is_empty());
let single_byte = b"A";
let mut output = Vec::new();
let result = compress_record_reference(single_byte, &mut output, false, None, 0, 0);
assert!(result.is_ok());
assert!(!output.is_empty());
}
#[test]
fn test_hash_table_collision_handling() {
let mut hash_table = LocalMatchHashTable::new(4); let test_data = b"abcdefghijklmnopqrstuvwxyz";
for i in 0..test_data.len().saturating_sub(2) {
hash_table.insert(test_data, i);
}
if let Some((distance, length)) = hash_table.find_match(test_data, 3, 100) {
assert!(distance > 0);
assert!(length >= 3);
}
let repeated_data = b"abcdefabcdefabcdef";
let mut hash_table = LocalMatchHashTable::new(8);
hash_table.insert(repeated_data, 0);
if let Some((distance, length)) = hash_table.find_match(repeated_data, 6, 100) {
assert_eq!(distance, 6);
assert!(length >= 3);
}
}
}