/// Maximum back-reference distance (the 32 KiB DEFLATE window).
pub const MAX_DISTANCE: usize = 32768;
/// Matches at least this long skip the lazy one-byte look-ahead.
const GOOD_MATCH_LENGTH: usize = 16;
/// Longest match DEFLATE can encode.
pub const MAX_MATCH_LENGTH: usize = 258;
/// Shortest match DEFLATE can encode.
pub const MIN_MATCH_LENGTH: usize = 3;
/// Bucket count of the 4-byte hash table.
const HASH_SIZE: usize = 1 << 16;
/// Bucket count of the 3-byte hash table.
const HASH3_SIZE: usize = 1 << 15;
const HT_HASH_BITS: usize = 15;
/// Bucket count of the small table used by the fastest level.
const HT_HASH_SIZE: usize = 1 << HT_HASH_BITS;
/// Candidates kept per bucket in the small table.
const HT_BUCKET_SIZE: usize = 2;
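/// One LZ77 output symbol: either a raw literal byte or a back-reference that
/// copies `length` bytes starting `distance` bytes behind the current output.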
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Token {
Literal(u8),
Match {
length: u16,
distance: u16,
},
}
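/// A `Token` packed into one `u32`: bit 31 set marks a literal with the byte in
/// the low 8 bits; otherwise the low 16 bits hold the match length and bits
/// 16..31 hold `distance - 1`.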
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct PackedToken(u32);
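/// Lazy-matching strategy: `None` emits matches greedily, `Lazy` defers a match
/// by one byte when the next position offers a clearly better one, and `Lazy2`
/// does the same with a halved chain budget for the look-ahead probe.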
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum LazyKind {
None,
Lazy,
Lazy2,
}
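/// Per-level search parameters: hash-chain length, overall search depth cap,
/// the match length at which searching stops early (`nice_length`), the lazy
/// strategy, and whether to use the small bucketed hash table (`use_ht`).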
#[derive(Debug, Clone, Copy)]
struct LevelConfig {
max_chain_length: usize,
max_search_depth: usize,
nice_length: usize,
lazy: LazyKind,
use_ht: bool,
}
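/// Per-position cache of match-search results: for each position it stores the
/// shortest distance reaching every match length (`sublen[len]`) and the
/// longest length found, so repeated optimal passes can skip the chain search.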
pub struct LongestMatchCache {
entries: Vec<CacheEntry>,
}
#[derive(Clone)]
struct CacheEntry {
sublen: Box<[u16; 259]>,
max_length: u16,
}
impl Default for CacheEntry {
fn default() -> Self {
Self {
sublen: Box::new([0u16; 259]),
max_length: 0,
}
}
}
impl LongestMatchCache {
pub fn new(data_len: usize) -> Self {
Self {
entries: (0..data_len).map(|_| CacheEntry::default()).collect(),
}
}
#[inline]
pub fn get(&self, pos: usize) -> Option<(&[u16; 259], usize)> {
self.entries
.get(pos)
.map(|e| (&*e.sublen, e.max_length as usize))
}
#[inline]
pub fn set(&mut self, pos: usize, sublen: [u16; 259], max_length: usize) {
if let Some(entry) = self.entries.get_mut(pos) {
*entry.sublen = sublen;
entry.max_length = max_length as u16;
}
}
#[inline]
pub fn is_populated(&self) -> bool {
self.entries.iter().any(|e| e.max_length > 0)
}
}
impl PackedToken {
const LITERAL_FLAG: u32 = 0x8000_0000;
#[inline]
pub fn literal(byte: u8) -> Self {
Self(Self::LITERAL_FLAG | byte as u32)
}
#[inline]
pub fn match_(length: u16, distance: u16) -> Self {
debug_assert!(distance >= 1, "Distance must be at least 1");
let dist_minus_one = (distance - 1) as u32;
let val = dist_minus_one << 16 | (length as u32);
Self(val)
}
#[inline]
pub fn is_literal(self) -> bool {
(self.0 & Self::LITERAL_FLAG) != 0
}
#[inline]
pub fn as_literal(self) -> Option<u8> {
if self.is_literal() {
Some(self.0 as u8)
} else {
None
}
}
#[inline]
pub fn as_match(self) -> Option<(u16, u16)> {
if self.is_literal() {
None
} else {
let length = (self.0 & 0xFFFF) as u16;
let distance = ((self.0 >> 16) as u16) + 1;
Some((length, distance))
}
}
}
trait TokenSink {
fn clear(&mut self);
fn reserve(&mut self, additional: usize);
fn push_literal(&mut self, byte: u8);
fn push_match(&mut self, length: u16, distance: u16);
}
impl TokenSink for Vec<Token> {
fn clear(&mut self) {
Vec::clear(self)
}
fn reserve(&mut self, additional: usize) {
Vec::reserve(self, additional)
}
fn push_literal(&mut self, byte: u8) {
self.push(Token::Literal(byte));
}
fn push_match(&mut self, length: u16, distance: u16) {
self.push(Token::Match { length, distance });
}
}
impl TokenSink for Vec<PackedToken> {
fn clear(&mut self) {
Vec::clear(self)
}
fn reserve(&mut self, additional: usize) {
Vec::reserve(self, additional)
}
fn push_literal(&mut self, byte: u8) {
self.push(PackedToken::literal(byte));
}
fn push_match(&mut self, length: u16, distance: u16) {
self.push(PackedToken::match_(length, distance));
}
}
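/// Multiplicative hash of the 4 bytes at `pos`, reduced to `HASH_SIZE` buckets;
/// returns 0 when fewer than 4 bytes remain.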
#[inline(always)]
fn hash4(data: &[u8], pos: usize) -> usize {
if pos + 3 >= data.len() {
return 0;
}
let val = u32::from_le_bytes([data[pos], data[pos + 1], data[pos + 2], data[pos + 3]]);
((val.wrapping_mul(0x1E35_A7BD)) >> 16) as usize & (HASH_SIZE - 1)
}
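/// Multiplicative hash of the 3 bytes at `pos`, reduced to `HASH3_SIZE` buckets;
/// returns 0 when fewer than 3 bytes remain.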
#[inline(always)]
fn hash3(data: &[u8], pos: usize) -> usize {
if pos + 2 >= data.len() {
return 0;
}
let val = u32::from_le_bytes([data[pos], data[pos + 1], data[pos + 2], 0]);
((val.wrapping_mul(0x1E35_A7BD)) >> 17) as usize & (HASH3_SIZE - 1)
}
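/// Length of the run of identical bytes starting at `pos`, capped at
/// `MAX_MATCH_LENGTH`. Runs are coded as distance-1 matches.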
#[inline]
fn detect_same_byte_run(data: &[u8], pos: usize) -> usize {
if pos >= data.len() {
return 0;
}
if pos + 1 >= data.len() {
return 1;
}
let byte = data[pos];
let remaining = data.len() - pos;
let max_run = remaining.min(MAX_MATCH_LENGTH);
if data[pos + 1] != byte {
return 1;
}
let mut len = 2;
while len + 8 <= max_run {
let chunk = &data[pos + len..pos + len + 8];
if chunk.iter().all(|&b| b == byte) {
len += 8;
} else {
for &b in chunk {
if b == byte && len < max_run {
len += 1;
} else {
return len;
}
}
return len;
}
}
while len < max_run && data[pos + len] == byte {
len += 1;
}
len
}
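/// 4-byte hash for the small bucketed table used by the fastest level.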
#[inline(always)]
fn hash4_ht(data: &[u8], pos: usize) -> usize {
if pos + 3 >= data.len() {
return 0;
}
let val = u32::from_le_bytes([data[pos], data[pos + 1], data[pos + 2], data[pos + 3]]);
(val.wrapping_mul(0x1E35_A7BD) >> (32 - HT_HASH_BITS)) as usize & (HT_HASH_SIZE - 1)
}
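/// Counts the distinct byte values in (up to) the first 4 KiB of input and
/// derives a minimum match length from that estimate.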
fn calculate_min_match_len(data: &[u8], max_search_depth: usize) -> usize {
let mut used = [false; 256];
let mut num_used = 0usize;
let scan = data.len().min(4096);
for &b in &data[..scan] {
if !used[b as usize] {
used[b as usize] = true;
num_used += 1;
}
}
choose_min_match_len(num_used, max_search_depth)
}
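/// Picks the minimum match length worth emitting: inputs with more distinct
/// byte values, searched at deeper levels, get a larger minimum.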
fn choose_min_match_len(num_used_literals: usize, max_search_depth: usize) -> usize {
if max_search_depth <= 4 {
return MIN_MATCH_LENGTH;
}
let mut min_len = MIN_MATCH_LENGTH;
if num_used_literals > 32 {
min_len = 4;
}
if num_used_literals > 64 && max_search_depth >= 10 {
min_len = 5;
}
if num_used_literals > 96 && max_search_depth >= 20 {
min_len = 6;
}
min_len.min(MAX_MATCH_LENGTH)
}
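/// Greedy/lazy LZ77 matcher over a 32 KiB window.
///
/// `head` holds the most recent position per 4-byte hash, `prev` chains older
/// positions indexed modulo the window size, `head3` tracks 3-byte hashes, and
/// `ht_buckets` is the two-entry-per-bucket table used by the fastest level.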
pub struct Lz77Compressor {
head: Vec<i32>,
head3: Vec<i32>,
ht_buckets: Vec<[i32; HT_BUCKET_SIZE]>,
prev: Vec<i32>,
config: LevelConfig,
}
impl Lz77Compressor {
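    /// Creates a compressor with the search parameters for `level`, clamped to 1..=9.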
pub fn new(level: u8) -> Self {
let level = level.clamp(1, 9);
let config = Self::config_for_level(level);
Self {
head: vec![-1; HASH_SIZE],
head3: vec![-1; HASH3_SIZE],
ht_buckets: vec![[-1; HT_BUCKET_SIZE]; HT_HASH_SIZE],
prev: vec![-1; MAX_DISTANCE],
config,
}
}
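    /// Greedy/lazy compression of `data` into a freshly allocated token vector.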
pub fn compress(&mut self, data: &[u8]) -> Vec<Token> {
let mut tokens = Vec::with_capacity(data.len());
self.compress_into_sink(data, &mut tokens);
tokens
}
pub fn compress_into(&mut self, data: &[u8], tokens: &mut Vec<Token>) {
self.compress_into_sink(data, tokens);
}
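    /// Core greedy/lazy loop shared by every token sink. Tracks a literal streak
    /// to drop into a cheap "incompressible" mode that only probes for matches
    /// occasionally, and (depending on the level) defers a match by one byte
    /// when the next position offers a clearly better one.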
fn compress_into_sink<T: TokenSink>(&mut self, data: &[u8], sink: &mut T) {
if data.is_empty() {
sink.clear();
return;
}
let min_match_len = calculate_min_match_len(data, self.config.max_search_depth);
sink.clear();
sink.reserve(data.len());
let mut pos = 0;
let mut literal_streak = 0usize;
let mut incompressible_mode = false;
let mut probe_since_last = 0usize;
let mut incompressible_updates = 0usize;
let mut pending_match: Option<(usize, usize)> = None;
self.head.fill(-1);
self.head3.fill(-1);
for bucket in &mut self.ht_buckets {
*bucket = [-1; HT_BUCKET_SIZE];
}
self.prev.fill(-1);
while pos < data.len() {
if incompressible_mode {
if probe_since_last >= INCOMPRESSIBLE_PROBE_INTERVAL {
probe_since_last = 0;
if let Some((length, distance)) = self.find_best_match(
data,
pos,
INCOMPRESSIBLE_CHAIN_LIMIT.min(self.config.max_search_depth),
self.config.nice_length,
min_match_len,
) {
incompressible_mode = false;
literal_streak = 0;
sink.push_match(length as u16, distance as u16);
if distance == 1 && length >= MIN_MATCH_LENGTH {
self.update_hash(data, pos);
if length > 1 {
self.update_hash(data, pos + length - 1);
}
} else {
for i in 0..length {
self.update_hash(data, pos + i);
}
}
pos += length;
continue;
}
}
sink.push_literal(data[pos]);
incompressible_updates += 1;
if incompressible_updates >= INCOMPRESSIBLE_UPDATE_INTERVAL {
self.update_hash(data, pos);
incompressible_updates = 0;
}
pos += 1;
literal_streak = literal_streak.saturating_add(1);
probe_since_last = probe_since_last.saturating_add(1);
continue;
}
let chain_limit = if literal_streak >= INCOMPRESSIBLE_LITERAL_THRESHOLD {
incompressible_mode = true;
probe_since_last = 0;
INCOMPRESSIBLE_CHAIN_LIMIT
} else {
self.config.max_chain_length
};
let best_match = if let Some(pending) = pending_match.take() {
Some(pending)
} else if self.config.use_ht {
self.find_best_match_ht(data, pos, self.config.nice_length, min_match_len)
} else {
self.find_best_match(
data,
pos,
chain_limit.min(self.config.max_search_depth),
self.config.nice_length,
min_match_len,
)
};
if let Some((length, distance)) = best_match {
literal_streak = 0;
incompressible_mode = false;
probe_since_last = 0;
if distance == 0 {
sink.push_literal(data[pos]);
self.update_hash(data, pos);
pos += 1;
continue;
}
if self.config.lazy != LazyKind::None
&& length < self.config.nice_length
&& length < GOOD_MATCH_LENGTH
&& pos + 1 < data.len()
{
self.update_hash(data, pos);
let next_chain = if self.config.lazy == LazyKind::Lazy2 {
(chain_limit / 2).max(1)
} else {
chain_limit
};
let next_match = if self.config.use_ht {
self.find_best_match_ht(
data,
pos + 1,
self.config.nice_length,
min_match_len,
)
} else {
self.find_best_match(
data,
pos + 1,
next_chain.min(self.config.max_search_depth),
self.config.nice_length,
min_match_len,
)
};
if let Some((next_length, next_distance)) = next_match {
if next_length >= length + 3 || next_length >= self.config.nice_length {
sink.push_literal(data[pos]);
pending_match = Some((next_length, next_distance));
pos += 1;
continue;
}
}
}
sink.push_match(length as u16, distance as u16);
if distance == 1 && length >= MIN_MATCH_LENGTH {
self.update_hash(data, pos);
if length > 1 {
self.update_hash(data, pos + length - 1);
}
} else {
for i in 0..length {
self.update_hash(data, pos + i);
}
}
pos += length;
} else {
literal_streak = literal_streak.saturating_add(1);
if literal_streak >= INCOMPRESSIBLE_LITERAL_THRESHOLD {
incompressible_mode = true;
probe_since_last = 0;
incompressible_updates = 0;
}
sink.push_literal(data[pos]);
self.update_hash(data, pos);
pos += 1;
}
}
}
pub fn compress_packed(&mut self, data: &[u8]) -> Vec<PackedToken> {
let mut tokens = Vec::with_capacity(data.len());
self.compress_into_sink(data, &mut tokens);
tokens
}
pub fn compress_packed_into(&mut self, data: &[u8], tokens: &mut Vec<PackedToken>) {
self.compress_into_sink(data, tokens);
}
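    /// Walks the hash chain to find the longest match for `pos` within the
    /// window, preferring shorter distances on equal length and rejecting
    /// length-3 matches farther than 8 KiB back. Returns `(length, distance)`
    /// or `None` if nothing reaches `min_match_length`.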
fn find_best_match(
&self,
data: &[u8],
pos: usize,
chain_limit: usize,
nice_length: usize,
min_match_length: usize,
) -> Option<(usize, usize)> {
if pos + MIN_MATCH_LENGTH > data.len() {
return None;
}
        // A run of identical bytes continuing from pos - 1 can always be coded
        // as a distance-1 match, so take it straight away when it is long enough.
        let run_len = detect_same_byte_run(data, pos);
        let mut best_length = min_match_length.saturating_sub(1);
        let mut best_distance = 0;
        if run_len >= min_match_length && pos >= 1 && data[pos - 1] == data[pos] {
            if run_len >= nice_length || run_len >= MAX_MATCH_LENGTH {
                return Some((run_len.min(MAX_MATCH_LENGTH), 1));
            }
            best_length = run_len;
            best_distance = 1;
        }
let hash3 = hash3(data, pos);
let cand3 = self.head3[hash3];
if cand3 >= 0 {
let match_pos = cand3 as usize;
let distance = pos - match_pos;
            if distance != 0 && distance <= MAX_DISTANCE && match_pos + 3 <= data.len() {
let a = &data[pos..pos + 3];
let b = &data[match_pos..match_pos + 3];
if a == b {
let len = self
.match_length(data, match_pos, pos)
.min(MAX_MATCH_LENGTH);
if len >= min_match_length
&& !(len == 3 && distance > 8192)
&& (len > best_length || (len == best_length && distance < best_distance))
{
best_length = len;
best_distance = distance;
if best_length >= nice_length {
return Some((best_length, best_distance));
}
}
}
}
}
let hash = hash4(data, pos);
let mut chain_pos = self.head[hash];
let max_distance = pos.min(MAX_DISTANCE);
let mut chain_remaining = chain_limit;
let target_prefix = if pos + 4 <= data.len() {
Some(u32::from_le_bytes([
data[pos],
data[pos + 1],
data[pos + 2],
data[pos + 3],
]))
} else {
None
};
while chain_pos >= 0 && chain_remaining > 0 {
let match_pos = chain_pos as usize;
let distance = pos - match_pos;
if distance == 0 {
chain_pos = self.prev[match_pos % MAX_DISTANCE];
chain_remaining -= 1;
continue;
}
if distance > max_distance {
break;
}
if let Some(prefix) = target_prefix {
if match_pos + 4 <= data.len() {
let cand = u32::from_le_bytes([
data[match_pos],
data[match_pos + 1],
data[match_pos + 2],
data[match_pos + 3],
]);
if cand != prefix {
chain_pos = self.prev[match_pos % MAX_DISTANCE];
chain_remaining -= 1;
continue;
}
}
}
let length = self.match_length(data, match_pos, pos);
if length >= min_match_length
&& !(length == 3 && distance > 8192)
&& (length > best_length || (length == best_length && distance < best_distance))
{
best_length = length;
best_distance = distance;
if length >= MAX_MATCH_LENGTH || best_length >= nice_length {
break;
}
}
chain_pos = self.prev[match_pos % MAX_DISTANCE];
chain_remaining -= 1;
}
if best_length >= min_match_length {
Some((best_length, best_distance))
} else {
None
}
}
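    /// Match search for the fastest level: checks at most the two candidates in
    /// the bucketed hash table and records `pos` for later lookups.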
fn find_best_match_ht(
&mut self,
data: &[u8],
pos: usize,
nice_length: usize,
min_match_length: usize,
) -> Option<(usize, usize)> {
if pos + MIN_MATCH_LENGTH > data.len() {
return None;
}
let hash = hash4_ht(data, pos);
let bucket = &mut self.ht_buckets[hash];
let cand0 = bucket[0];
let cand1 = bucket[1];
bucket[1] = cand0;
bucket[0] = pos as i32;
let mut best_len = min_match_length.saturating_sub(1);
let mut best_dist = 0usize;
for &cand in [cand0, cand1].iter() {
if cand < 0 {
continue;
}
let match_pos = cand as usize;
let distance = pos - match_pos;
if distance == 0 || distance > MAX_DISTANCE || match_pos + 3 > data.len() {
continue;
}
let a = &data[pos..pos + 3];
let b = &data[match_pos..match_pos + 3];
if a != b {
continue;
}
let len = self
.match_length(data, match_pos, pos)
.min(MAX_MATCH_LENGTH);
if len < min_match_length || (len == 3 && distance > 8192) {
continue;
}
if len > best_len {
best_len = len;
best_dist = distance;
if best_len >= nice_length {
break;
}
}
}
if best_len >= min_match_length {
Some((best_len, best_dist))
} else {
None
}
}
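    /// Length of the common prefix of the suffixes at `pos1` and `pos2`, capped
    /// at `MAX_MATCH_LENGTH`; dispatches to the SIMD helper when the `simd`
    /// feature is enabled.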
#[inline(always)]
fn match_length(&self, data: &[u8], pos1: usize, pos2: usize) -> usize {
let max_len = (data.len() - pos2).min(MAX_MATCH_LENGTH);
#[cfg(feature = "simd")]
{
crate::simd::match_length(data, pos1, pos2, max_len)
}
#[cfg(not(feature = "simd"))]
{
Self::match_length_scalar(data, pos1, pos2, max_len)
}
}
#[cfg(not(feature = "simd"))]
#[inline]
fn match_length_scalar(data: &[u8], pos1: usize, pos2: usize, max_len: usize) -> usize {
let mut length = 0;
while length + 8 <= max_len {
let a = u64::from_ne_bytes(data[pos1 + length..pos1 + length + 8].try_into().unwrap());
let b = u64::from_ne_bytes(data[pos2 + length..pos2 + length + 8].try_into().unwrap());
if a != b {
let xor = a ^ b;
#[cfg(target_endian = "little")]
{
length += (xor.trailing_zeros() / 8) as usize;
}
#[cfg(target_endian = "big")]
{
length += (xor.leading_zeros() / 8) as usize;
}
return length;
}
length += 8;
}
while length < max_len && data[pos1 + length] == data[pos2 + length] {
length += 1;
}
length
}
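    /// Records `pos` in the 3-byte and 4-byte hash tables and links it into the
    /// `prev` chain; a no-op near the end of the input where no 4-byte hash fits.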
#[inline(always)]
fn update_hash(&mut self, data: &[u8], pos: usize) {
if pos + 3 >= data.len() {
return;
}
let h3 = hash3(data, pos);
self.head3[h3] = pos as i32;
let hash = hash4(data, pos);
self.prev[pos % MAX_DISTANCE] = self.head[hash];
self.head[hash] = pos as i32;
}
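    /// Like `find_best_match`, but records the shortest distance reaching every
    /// achievable match length in `sublen`, which is what the optimal parser
    /// consumes per position.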
fn find_match_with_sublen(&self, data: &[u8], pos: usize) -> ([u16; 259], usize) {
        let mut sublen = [0u16; 259];
        let mut max_length = 0usize;
if pos + MIN_MATCH_LENGTH > data.len() {
return (sublen, 0);
}
let run_len = detect_same_byte_run(data, pos);
if run_len >= MIN_MATCH_LENGTH && pos >= 1 && data[pos - 1] == data[pos] {
for len in MIN_MATCH_LENGTH..=run_len {
sublen[len] = 1;
}
max_length = run_len;
if run_len >= MAX_MATCH_LENGTH {
return (sublen, max_length);
}
}
let hash3 = hash3(data, pos);
let cand3 = self.head3[hash3];
if cand3 >= 0 {
let match_pos = cand3 as usize;
let distance = pos - match_pos;
            if distance != 0 && distance <= MAX_DISTANCE && match_pos + 3 <= data.len() {
let a = &data[pos..pos + 3];
let b = &data[match_pos..match_pos + 3];
if a == b {
if sublen[3] == 0 || (distance as u16) < sublen[3] {
sublen[3] = distance as u16;
}
if max_length < 3 {
max_length = 3;
}
}
}
}
let hash = hash4(data, pos);
let mut chain_pos = self.head[hash];
let max_distance = pos.min(MAX_DISTANCE);
let mut chain_remaining = self
.config
.max_chain_length
.min(self.config.max_search_depth);
while chain_pos >= 0 && chain_remaining > 0 {
let match_pos = chain_pos as usize;
let distance = pos - match_pos;
if distance == 0 {
chain_pos = self.prev[match_pos % MAX_DISTANCE];
chain_remaining -= 1;
continue;
}
if distance > max_distance {
break;
}
let length = self.match_length(data, match_pos, pos);
if length >= MIN_MATCH_LENGTH && !(length == 3 && distance > 8192) {
for len in MIN_MATCH_LENGTH..=length {
if sublen[len] == 0 || (distance as u16) < sublen[len] {
sublen[len] = distance as u16;
}
}
if length > max_length {
max_length = length;
if max_length >= MAX_MATCH_LENGTH {
break;
}
}
}
chain_pos = self.prev[match_pos % MAX_DISTANCE];
chain_remaining -= 1;
}
(sublen, max_length)
}
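    /// Optimal parse under `cost_model`: a shortest-path DP where `costs[i]` is
    /// the cheapest estimated bit count for the first `i` bytes, relaxed by a
    /// literal edge and by one edge per reachable match length (at its shortest
    /// distance). The chosen steps are replayed by `trace_backwards`.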
pub fn compress_optimal(&mut self, data: &[u8], cost_model: &CostModel) -> Vec<Token> {
if data.is_empty() {
return Vec::new();
}
let n = data.len();
self.head.fill(-1);
self.head3.fill(-1);
self.prev.fill(-1);
let mut costs = vec![f32::MAX; n + 1];
let mut length_array = vec![0u16; n + 1];
let mut dist_array = vec![0u16; n + 1];
costs[0] = 0.0;
for i in 0..n {
if costs[i] >= f32::MAX {
continue;
}
let lit_cost = costs[i] + cost_model.literal_cost(data[i]);
if lit_cost < costs[i + 1] {
costs[i + 1] = lit_cost;
length_array[i + 1] = 1;
dist_array[i + 1] = 0;
}
let (sublen, max_len) = self.find_match_with_sublen(data, i);
for len in MIN_MATCH_LENGTH..=max_len {
let dist = sublen[len];
if dist == 0 {
continue;
}
let match_cost = costs[i] + cost_model.match_cost(len as u16, dist);
let end_pos = i + len;
if end_pos <= n && match_cost < costs[end_pos] {
costs[end_pos] = match_cost;
length_array[end_pos] = len as u16;
dist_array[end_pos] = dist;
}
}
self.update_hash(data, i);
}
self.trace_backwards(&length_array, &dist_array, data)
}
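    /// Same as `compress_optimal`, but match searches are served from (or, on
    /// the first pass, recorded into) `cache`, so later passes with different
    /// cost models skip the hash-chain work.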
pub fn compress_optimal_cached(
&mut self,
data: &[u8],
cost_model: &CostModel,
cache: &mut LongestMatchCache,
) -> Vec<Token> {
if data.is_empty() {
return Vec::new();
}
let n = data.len();
let use_cache = cache.is_populated();
if !use_cache {
self.head.fill(-1);
self.head3.fill(-1);
self.prev.fill(-1);
}
let mut costs = vec![f32::MAX; n + 1];
let mut length_array = vec![0u16; n + 1];
let mut dist_array = vec![0u16; n + 1];
costs[0] = 0.0;
for i in 0..n {
if costs[i] >= f32::MAX {
continue;
}
let lit_cost = costs[i] + cost_model.literal_cost(data[i]);
if lit_cost < costs[i + 1] {
costs[i + 1] = lit_cost;
length_array[i + 1] = 1;
dist_array[i + 1] = 0;
}
let (sublen, max_len) = if use_cache {
cache.get(i).unwrap_or((&[0u16; 259], 0))
} else {
let (sublen, max_len) = self.find_match_with_sublen(data, i);
cache.set(i, sublen, max_len);
self.update_hash(data, i);
cache.get(i).unwrap_or((&[0u16; 259], 0))
};
for len in MIN_MATCH_LENGTH..=max_len {
let dist = sublen[len];
if dist == 0 {
continue;
}
let match_cost = costs[i] + cost_model.match_cost(len as u16, dist);
let end_pos = i + len;
if end_pos <= n && match_cost < costs[end_pos] {
costs[end_pos] = match_cost;
length_array[end_pos] = len as u16;
dist_array[end_pos] = dist;
}
}
}
self.trace_backwards(&length_array, &dist_array, data)
}
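    /// Walks the DP arrays backwards from the end of the input to collect the
    /// chosen (length, distance) steps, then replays them forwards as tokens;
    /// a step with distance 0 is emitted as a single literal.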
fn trace_backwards(&self, length_array: &[u16], dist_array: &[u16], data: &[u8]) -> Vec<Token> {
let n = data.len();
let mut tokens = Vec::new();
let mut lengths_rev = Vec::new();
let mut p = n;
while p > 0 {
let len = length_array[p] as usize;
let dist = dist_array[p];
if len == 0 {
break;
}
lengths_rev.push((len, dist));
p -= len;
}
let mut data_pos = 0;
for (len, dist) in lengths_rev.into_iter().rev() {
if len == 1 && dist == 0 {
tokens.push(Token::Literal(data[data_pos]));
} else {
if dist == 0 {
tokens.push(Token::Literal(data[data_pos]));
data_pos += len;
continue;
}
tokens.push(Token::Match {
length: len as u16,
distance: dist,
});
}
data_pos += len;
}
tokens
}
}
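/// Estimated bit costs for the 286 literal/length symbols and the 30 distance
/// symbols, used by the optimal parser to compare candidate parses.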
#[derive(Clone)]
pub struct CostModel {
pub lit_len_costs: [f32; 286],
pub dist_costs: [f32; 30],
}
impl CostModel {
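    /// Costs matching DEFLATE's fixed Huffman code: 8 or 9 bits per literal,
    /// 7 or 8 bits per length symbol, and 5 bits per distance symbol.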
pub fn fixed() -> Self {
let mut lit_len_costs = [0.0f32; 286];
let mut dist_costs = [0.0f32; 30];
for i in 0..144 {
lit_len_costs[i] = 8.0;
}
for i in 144..256 {
lit_len_costs[i] = 9.0;
}
for i in 256..280 {
lit_len_costs[i] = 7.0;
}
for i in 280..286 {
lit_len_costs[i] = 8.0;
}
for i in 0..30 {
dist_costs[i] = 5.0;
}
Self {
lit_len_costs,
dist_costs,
}
}
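    /// Derives costs from observed symbol counts as `-log2(frequency)`, charging
    /// 15 bits for unseen symbols. Falls back to the fixed model when there are
    /// no literal counts, and to flat 5-bit distances when there are no distance
    /// counts.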
pub fn from_statistics(lit_len_counts: &[u32; 286], dist_counts: &[u32; 30]) -> Self {
let mut lit_len_costs = [0.0f32; 286];
let mut dist_costs = [0.0f32; 30];
let lit_total: u32 = lit_len_counts.iter().sum();
if lit_total > 0 {
let log_total = (lit_total as f32).log2();
for (i, &count) in lit_len_counts.iter().enumerate() {
if count > 0 {
lit_len_costs[i] = log_total - (count as f32).log2();
} else {
lit_len_costs[i] = 15.0;
}
}
} else {
return Self::fixed();
}
let dist_total: u32 = dist_counts.iter().sum();
if dist_total > 0 {
let log_total = (dist_total as f32).log2();
for (i, &count) in dist_counts.iter().enumerate() {
if count > 0 {
dist_costs[i] = log_total - (count as f32).log2();
} else {
dist_costs[i] = 15.0;
}
}
} else {
for i in 0..30 {
dist_costs[i] = 5.0;
}
}
Self {
lit_len_costs,
dist_costs,
}
}
#[inline]
pub fn literal_cost(&self, byte: u8) -> f32 {
self.lit_len_costs[byte as usize]
}
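    /// Cost of a match: the length symbol's cost plus its extra bits, plus the
    /// distance symbol's cost plus its extra bits.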
#[inline]
pub fn match_cost(&self, length: u16, distance: u16) -> f32 {
let (len_symbol, len_extra_bits) = length_to_symbol(length);
let len_cost = self.lit_len_costs[len_symbol as usize] + len_extra_bits as f32;
let (dist_symbol, dist_extra_bits) = distance_to_symbol(distance);
let dist_cost = self.dist_costs[dist_symbol as usize] + dist_extra_bits as f32;
len_cost + dist_cost
}
}
impl Default for CostModel {
fn default() -> Self {
Self::fixed()
}
}
const LENGTH_BASE: [u16; 29] = [
3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31, 35, 43, 51, 59, 67, 83, 99, 115, 131,
163, 195, 227, 258,
];
const LENGTH_EXTRA: [u8; 29] = [
0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0,
];
#[allow(dead_code)]
const DISTANCE_BASE: [u16; 30] = [
1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193, 257, 385, 513, 769, 1025, 1537,
2049, 3073, 4097, 6145, 8193, 12289, 16385, 24577,
];
const DISTANCE_EXTRA: [u8; 30] = [
0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13,
13,
];
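/// Maps a match length (3..=258) to its DEFLATE length symbol (257..=285) and
/// that symbol's extra-bit count.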
#[inline]
fn length_to_symbol(length: u16) -> (u16, u8) {
debug_assert!(
(MIN_MATCH_LENGTH as u16..=MAX_MATCH_LENGTH as u16).contains(&length),
"Invalid length: {length}"
);
for i in 0..28 {
if length < LENGTH_BASE[i + 1] {
return (257 + i as u16, LENGTH_EXTRA[i]);
}
}
    (285, 0)
}
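/// Maps a match distance (1..=32768) to its DEFLATE distance symbol (0..=29)
/// and that symbol's extra-bit count.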
#[inline]
fn distance_to_symbol(distance: u16) -> (u16, u8) {
debug_assert!((1..=32768).contains(&distance), "Invalid distance");
if distance < 5 {
return (distance - 1, 0);
}
let d = distance as u32 - 1;
let msb = 31 - d.leading_zeros();
let second_bit = (d >> (msb - 1)) & 1;
let code = (2 * msb + second_bit) as usize;
let code = code.min(29);
(code as u16, DISTANCE_EXTRA[code])
}
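// Heuristics for data that is not compressing: after INCOMPRESSIBLE_LITERAL_THRESHOLD
// consecutive literals the matcher drops to a chain limit of INCOMPRESSIBLE_CHAIN_LIMIT,
// probes for a match only every INCOMPRESSIBLE_PROBE_INTERVAL bytes, and refreshes the
// hash tables only every INCOMPRESSIBLE_UPDATE_INTERVAL bytes.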
const INCOMPRESSIBLE_LITERAL_THRESHOLD: usize = 512;
const INCOMPRESSIBLE_CHAIN_LIMIT: usize = 1;
const INCOMPRESSIBLE_PROBE_INTERVAL: usize = 256;
const INCOMPRESSIBLE_UPDATE_INTERVAL: usize = 64;
impl Default for Lz77Compressor {
fn default() -> Self {
Self::new(6)
}
}
impl Lz77Compressor {
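    /// Search parameters for each compression level (1 = fastest, 9 = most
    /// thorough); the fallback arm mirrors level 9, although `new` already
    /// clamps the level into range.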
fn config_for_level(level: u8) -> LevelConfig {
match level {
1 => LevelConfig {
max_chain_length: 4,
max_search_depth: 4,
nice_length: 32,
lazy: LazyKind::None,
use_ht: true,
},
2 => LevelConfig {
max_chain_length: 8,
max_search_depth: 6,
nice_length: 10,
lazy: LazyKind::None,
use_ht: false,
},
3 => LevelConfig {
max_chain_length: 16,
max_search_depth: 12,
nice_length: 14,
lazy: LazyKind::None,
use_ht: false,
},
4 => LevelConfig {
max_chain_length: 32,
max_search_depth: 16,
nice_length: 30,
lazy: LazyKind::None,
use_ht: false,
},
5 => LevelConfig {
max_chain_length: 64,
max_search_depth: 16,
nice_length: 30,
lazy: LazyKind::Lazy,
use_ht: false,
},
6 => LevelConfig {
max_chain_length: 128,
max_search_depth: 35,
nice_length: 65,
lazy: LazyKind::Lazy,
use_ht: false,
},
7 => LevelConfig {
max_chain_length: 256,
max_search_depth: 100,
nice_length: 130,
lazy: LazyKind::Lazy,
use_ht: false,
},
8 => LevelConfig {
max_chain_length: 1024,
max_search_depth: 300,
nice_length: MAX_MATCH_LENGTH,
lazy: LazyKind::Lazy2,
use_ht: false,
},
9 => LevelConfig {
max_chain_length: 4096,
max_search_depth: 600,
nice_length: MAX_MATCH_LENGTH,
lazy: LazyKind::Lazy2,
use_ht: false,
},
_ => LevelConfig {
max_chain_length: 4096,
max_search_depth: 600,
nice_length: MAX_MATCH_LENGTH,
lazy: LazyKind::Lazy2,
use_ht: false,
},
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_lz77_no_matches() {
let mut compressor = Lz77Compressor::new(6);
let data = b"abcdefgh";
let tokens = compressor.compress(data);
assert_eq!(tokens.len(), 8);
for (i, &token) in tokens.iter().enumerate() {
assert_eq!(token, Token::Literal(data[i]));
}
}
#[test]
fn test_lz77_simple_repeat() {
let mut compressor = Lz77Compressor::new(6);
let data = b"abcabcabc";
let tokens = compressor.compress(data);
        assert!(tokens.len() < 9);
    }
#[test]
fn test_lz77_long_repeat() {
let mut compressor = Lz77Compressor::new(6);
let data = b"abcdefghijabcdefghijabcdefghij";
let tokens = compressor.compress(data);
assert!(tokens.len() < 20);
}
#[test]
fn test_lz77_empty() {
let mut compressor = Lz77Compressor::new(6);
let tokens = compressor.compress(&[]);
assert!(tokens.is_empty());
}
#[test]
fn test_packed_token_literal() {
for byte in 0u8..=255 {
let token = PackedToken::literal(byte);
assert!(
token.is_literal(),
"Literal should be identified as literal"
);
assert_eq!(token.as_literal(), Some(byte), "Literal value mismatch");
assert_eq!(token.as_match(), None, "Literal should not be a match");
}
}
#[test]
fn test_packed_token_match() {
let test_cases = [
            (3, 1),
            (258, 1),
            (3, 32768),
            (258, 32768),
            (100, 100),
            (3, 32767),
        ];
for (length, distance) in test_cases {
let token = PackedToken::match_(length, distance);
assert!(
!token.is_literal(),
"Match with length={length}, distance={distance} should NOT be identified as literal",
);
assert_eq!(
token.as_match(),
Some((length, distance)),
"Match values mismatch for length={length}, distance={distance}",
);
assert_eq!(
token.as_literal(),
None,
"Match should not return a literal value for length={length}, distance={distance}",
);
}
}
#[test]
fn test_packed_token_max_distance_not_literal() {
let token = PackedToken::match_(10, 32768);
assert!(
!token.is_literal(),
"Match with max distance (32768) was incorrectly identified as literal"
);
assert_eq!(
token.as_match(),
Some((10, 32768)),
"Match with max distance failed to roundtrip correctly"
);
}
#[test]
fn test_optimal_lz77_empty() {
let mut compressor = Lz77Compressor::new(6);
let cost_model = CostModel::fixed();
let tokens = compressor.compress_optimal(&[], &cost_model);
assert!(tokens.is_empty());
}
#[test]
fn test_optimal_lz77_no_matches() {
let mut compressor = Lz77Compressor::new(6);
let cost_model = CostModel::fixed();
let data = b"abcdefgh";
let tokens = compressor.compress_optimal(data, &cost_model);
assert_eq!(tokens.len(), 8);
for (i, &token) in tokens.iter().enumerate() {
assert_eq!(token, Token::Literal(data[i]));
}
}
#[test]
fn test_optimal_lz77_simple_repeat() {
let mut compressor = Lz77Compressor::new(6);
let cost_model = CostModel::fixed();
let data = b"abcabcabc";
let tokens = compressor.compress_optimal(data, &cost_model);
assert!(
tokens.len() < 9,
"Expected fewer than 9 tokens, got {}",
tokens.len()
);
}
#[test]
fn test_optimal_lz77_produces_valid_tokens() {
let mut compressor = Lz77Compressor::new(6);
let cost_model = CostModel::fixed();
let data = b"The quick brown fox jumps over the lazy dog. The quick brown fox jumps.";
let tokens = compressor.compress_optimal(data, &cost_model);
let mut reconstructed = Vec::new();
for token in &tokens {
match token {
Token::Literal(b) => reconstructed.push(*b),
Token::Match { length, distance } => {
let start = reconstructed.len() - *distance as usize;
for i in 0..*length as usize {
reconstructed.push(reconstructed[start + i]);
}
}
}
}
assert_eq!(reconstructed.as_slice(), data.as_slice());
}
#[test]
fn test_cost_model_fixed() {
let model = CostModel::fixed();
assert!((model.literal_cost(b'a') - 8.0).abs() < 0.01);
assert!((model.literal_cost(200) - 9.0).abs() < 0.01);
let match_cost = model.match_cost(3, 1);
assert!((match_cost - 12.0).abs() < 0.01);
}
#[test]
fn test_cost_model_from_statistics() {
let mut lit_counts = [0u32; 286];
let mut dist_counts = [0u32; 30];
lit_counts[b'a' as usize] = 100;
lit_counts[b'b' as usize] = 10;
lit_counts[256] = 1;
dist_counts[0] = 50;
dist_counts[1] = 10;
let model = CostModel::from_statistics(&lit_counts, &dist_counts);
assert!(model.literal_cost(b'a') < model.literal_cost(b'b'));
}
#[test]
fn test_length_to_symbol() {
        assert_eq!(length_to_symbol(3), (257, 0));
        assert_eq!(length_to_symbol(10), (264, 0));
        assert_eq!(length_to_symbol(11), (265, 1));
        assert_eq!(length_to_symbol(258), (285, 0));
    }
#[test]
fn test_distance_to_symbol() {
assert_eq!(distance_to_symbol(1), (0, 0));
assert_eq!(distance_to_symbol(4), (3, 0));
assert_eq!(distance_to_symbol(5), (4, 1));
assert_eq!(distance_to_symbol(6), (4, 1));
}
#[test]
fn test_find_best_match_no_zero_distance() {
let mut compressor = Lz77Compressor::new(6);
let mut unique_data: Vec<u8> = (0..=255).collect();
unique_data.extend_from_slice(b"xyz");
let result = compressor.compress(&unique_data);
for token in &result {
if let Token::Match { distance, .. } = token {
assert!(*distance > 0, "Found invalid match with distance 0");
}
}
}
#[test]
fn test_find_best_match_returns_none_for_no_match() {
let compressor = Lz77Compressor::new(6);
let data: Vec<u8> = (0..10).collect();
let result = compressor.find_best_match(&data, 5, 128, 65, 4);
assert!(
result.is_none() || result.unwrap().1 > 0,
"find_best_match returned invalid match with distance 0: {result:?}"
);
}
#[test]
fn test_find_best_match_respects_min_match_length() {
let mut compressor = Lz77Compressor::new(6);
let data = b"abcdXYZWabcdPQRS";
compressor.head.fill(-1);
compressor.head3.fill(-1);
compressor.prev.fill(-1);
for i in 0..8 {
compressor.update_hash(data, i);
}
let result = compressor.find_best_match(data, 8, 100, 258, 5);
assert!(
result.is_none(),
"find_best_match returned {result:?} when no match >= min_match_length exists"
);
let result = compressor.find_best_match(data, 8, 100, 258, 4);
assert!(result.is_some(), "Expected a match with min_match_length=4");
let (len, dist) = result.unwrap();
assert!(len >= 4, "Match length should be >= min_match_length");
assert!(dist > 0, "Match distance must be > 0");
assert_eq!(dist, 8, "Distance should be 8 (back to position 0)");
}
#[test]
fn test_find_best_match_ht_respects_min_match_length() {
let mut compressor = Lz77Compressor::new(1);
let data = b"abcdXYZWabcdPQRS";
for bucket in &mut compressor.ht_buckets {
*bucket = [-1; HT_BUCKET_SIZE];
}
let hash = hash4_ht(data, 0);
compressor.ht_buckets[hash][0] = 0;
let result = compressor.find_best_match_ht(data, 8, 258, 5);
assert!(
result.is_none(),
"find_best_match_ht returned {result:?} when no match >= min_match_length exists"
);
let result = compressor.find_best_match_ht(data, 8, 258, 4);
assert!(result.is_some(), "Expected a match with min_match_length=4");
let (len, dist) = result.unwrap();
assert!(len >= 4, "Match length should be >= min_match_length");
assert!(dist > 0, "Match distance must be > 0");
assert_eq!(dist, 8, "Distance should be 8 (back to position 0)");
}
#[test]
fn test_find_best_match_never_returns_zero_distance() {
let mut compressor = Lz77Compressor::new(6);
let data = b"abcdefghijklmnop";
compressor.head.fill(-1);
compressor.head3.fill(-1);
compressor.prev.fill(-1);
for i in 0..8 {
compressor.update_hash(data, i);
}
for min_len in 3..=6 {
let result = compressor.find_best_match(data, 8, 100, 258, min_len);
if let Some((len, dist)) = result {
assert!(
dist > 0,
"find_best_match returned zero distance: len={len}, dist={dist}, min_len={min_len}"
);
assert!(
len >= min_len,
"find_best_match returned length {len} < min_match_length {min_len}"
);
}
}
}
#[test]
fn test_lz77_default() {
let compressor = Lz77Compressor::default();
assert_eq!(compressor.config.max_chain_length, 128);
}
#[test]
fn test_lz77_compress_into() {
let mut compressor = Lz77Compressor::new(6);
let data = b"abcdefgh";
let mut tokens = Vec::new();
compressor.compress_into(data, &mut tokens);
assert_eq!(tokens.len(), 8);
}
#[test]
fn test_lz77_compress_packed() {
let mut compressor = Lz77Compressor::new(6);
let data = b"abcabcabc";
let tokens = compressor.compress_packed(data);
assert!(!tokens.is_empty());
}
#[test]
fn test_lz77_compress_packed_into() {
let mut compressor = Lz77Compressor::new(6);
let data = b"abcabcabc";
let mut tokens = Vec::new();
compressor.compress_packed_into(data, &mut tokens);
assert!(!tokens.is_empty());
}
#[test]
fn test_cost_model_default() {
let model = CostModel::default();
assert!((model.literal_cost(b'a') - 8.0).abs() < 0.01);
}
#[test]
fn test_cost_model_from_empty_statistics() {
let lit_counts = [0u32; 286];
let dist_counts = [0u32; 30];
let model = CostModel::from_statistics(&lit_counts, &dist_counts);
assert!((model.literal_cost(b'a') - 8.0).abs() < 0.01);
}
#[test]
fn test_cost_model_from_statistics_no_distances() {
let mut lit_counts = [0u32; 286];
let dist_counts = [0u32; 30];
lit_counts[b'a' as usize] = 100;
lit_counts[256] = 1;
let model = CostModel::from_statistics(&lit_counts, &dist_counts);
assert!(model.dist_costs[0] > 0.0);
}
#[test]
fn test_length_to_symbol_all_ranges() {
for length in 3..=258u16 {
let (symbol, extra_bits) = length_to_symbol(length);
assert!((257..=285).contains(&symbol));
assert!(extra_bits <= 5);
}
}
#[test]
fn test_distance_to_symbol_all_ranges() {
for &dist in &[1, 2, 3, 4, 5, 10, 100, 1000, 10000, 32768] {
let (symbol, extra_bits) = distance_to_symbol(dist);
assert!(symbol < 30);
assert!(extra_bits <= 13);
}
}
#[test]
fn test_lz77_level_clamping() {
        let compressor = Lz77Compressor::new(0);
        assert!(compressor.config.max_chain_length > 0);
        let compressor = Lz77Compressor::new(100);
        assert!(compressor.config.max_chain_length > 0);
}
#[test]
fn test_lz77_all_levels() {
let data = b"abcabcabcabcabc";
for level in 1..=9 {
let mut compressor = Lz77Compressor::new(level);
let tokens = compressor.compress(data);
assert!(!tokens.is_empty());
}
}
#[test]
fn test_cost_model_match_cost_various_lengths() {
let model = CostModel::fixed();
let cost_3 = model.match_cost(3, 1);
let cost_10 = model.match_cost(10, 1);
let cost_100 = model.match_cost(100, 1);
let cost_258 = model.match_cost(258, 1);
assert!(cost_3 > 0.0);
assert!(cost_10 > 0.0);
assert!(cost_100 > 0.0);
assert!(cost_258 > 0.0);
}
#[test]
fn test_cost_model_match_cost_various_distances() {
let model = CostModel::fixed();
let cost_1 = model.match_cost(3, 1);
let cost_100 = model.match_cost(3, 100);
let cost_1000 = model.match_cost(3, 1000);
let cost_max = model.match_cost(3, 32768);
assert!(cost_1 > 0.0);
assert!(cost_100 > 0.0);
assert!(cost_1000 > 0.0);
assert!(cost_max > 0.0);
assert!(cost_max >= cost_1);
}
#[test]
fn test_cost_model_literal_all_bytes() {
let model = CostModel::fixed();
for byte in 0u8..=255 {
let cost = model.literal_cost(byte);
assert!(cost > 0.0, "Literal {byte} should have positive cost");
assert!(cost <= 16.0, "Literal {byte} cost should be reasonable");
}
}
#[test]
fn test_lz77_single_byte() {
let mut compressor = Lz77Compressor::new(6);
let data = b"a";
let tokens = compressor.compress(data);
assert_eq!(tokens.len(), 1);
assert_eq!(tokens[0], Token::Literal(b'a'));
}
#[test]
fn test_lz77_two_bytes() {
let mut compressor = Lz77Compressor::new(6);
let data = b"ab";
let tokens = compressor.compress(data);
assert_eq!(tokens.len(), 2);
}
#[test]
fn test_lz77_all_same_bytes() {
let mut compressor = Lz77Compressor::new(6);
let data = vec![b'a'; 100];
let tokens = compressor.compress(&data);
assert!(tokens.len() < 10, "All same bytes should compress well");
}
#[test]
fn test_lz77_run_length_encoding() {
let mut compressor = Lz77Compressor::new(6);
let data: Vec<u8> = vec![0xFF; 1000];
let tokens = compressor.compress(&data);
assert!(tokens.len() < 50);
}
#[test]
fn test_packed_token_boundary_lengths() {
let min = PackedToken::match_(3, 1);
assert_eq!(min.as_match(), Some((3, 1)));
let max_len = PackedToken::match_(258, 1);
assert_eq!(max_len.as_match(), Some((258, 1)));
}
#[test]
fn test_packed_token_boundary_distances() {
let min = PackedToken::match_(3, 1);
assert_eq!(min.as_match(), Some((3, 1)));
let max_dist = PackedToken::match_(3, 32768);
assert_eq!(max_dist.as_match(), Some((3, 32768)));
}
#[test]
fn test_lz77_compress_deterministic() {
let data = b"hello world hello world";
let mut compressor1 = Lz77Compressor::new(6);
let tokens1 = compressor1.compress(data);
let mut compressor2 = Lz77Compressor::new(6);
let tokens2 = compressor2.compress(data);
assert_eq!(tokens1, tokens2, "Compression should be deterministic");
}
#[test]
fn test_optimal_vs_greedy() {
let mut compressor = Lz77Compressor::new(6);
let cost_model = CostModel::fixed();
let data = b"abcdeabcdfabcde";
let greedy_tokens = compressor.compress(data);
let optimal_tokens = compressor.compress_optimal(data, &cost_model);
let greedy_output = reconstruct_from_tokens(&greedy_tokens);
let optimal_output = reconstruct_from_tokens(&optimal_tokens);
assert_eq!(greedy_output, data.to_vec());
assert_eq!(optimal_output, data.to_vec());
}
fn reconstruct_from_tokens(tokens: &[Token]) -> Vec<u8> {
let mut result = Vec::new();
for token in tokens {
match token {
Token::Literal(b) => result.push(*b),
Token::Match { length, distance } => {
let start = result.len() - *distance as usize;
for i in 0..*length as usize {
result.push(result[start + i]);
}
}
}
}
result
}
#[test]
fn test_cost_model_short_match_cost() {
let model = CostModel::fixed();
let short = model.match_cost(3, 1);
let long = model.match_cost(100, 1);
assert!(short > 0.0);
assert!(long > 0.0);
}
#[test]
fn test_cost_model_frequency_based() {
let mut lit_counts = [0u32; 286];
let mut dist_counts = [0u32; 30];
        lit_counts[0] = 1000;
        lit_counts[255] = 1;
dist_counts[0] = 100;
let model = CostModel::from_statistics(&lit_counts, &dist_counts);
assert!(model.literal_cost(0) < model.literal_cost(255));
}
#[test]
fn test_lz77_window_size() {
let mut data = vec![0u8; 40000];
for i in 0..10 {
data[i] = (i + 1) as u8;
}
for i in 0..10 {
data[35000 + i] = (i + 1) as u8;
}
let mut compressor = Lz77Compressor::new(6);
let tokens = compressor.compress(&data);
let reconstructed = reconstruct_from_tokens(&tokens);
assert_eq!(reconstructed, data);
}
#[test]
fn test_length_to_symbol_edge_cases() {
assert_eq!(length_to_symbol(3).0, 257);
assert_eq!(length_to_symbol(4).0, 258);
assert_eq!(length_to_symbol(5).0, 259);
assert_eq!(length_to_symbol(6).0, 260);
assert_eq!(length_to_symbol(7).0, 261);
assert_eq!(length_to_symbol(8).0, 262);
assert_eq!(length_to_symbol(9).0, 263);
assert_eq!(length_to_symbol(10).0, 264);
}
#[test]
fn test_distance_to_symbol_edge_cases() {
assert_eq!(distance_to_symbol(1).0, 0);
assert_eq!(distance_to_symbol(2).0, 1);
assert_eq!(distance_to_symbol(3).0, 2);
assert_eq!(distance_to_symbol(4).0, 3);
}
#[test]
fn test_optimal_lz77_long_input() {
let mut compressor = Lz77Compressor::new(6);
let cost_model = CostModel::fixed();
let mut data = Vec::new();
for _ in 0..100 {
data.extend_from_slice(b"pattern");
}
let tokens = compressor.compress_optimal(&data, &cost_model);
let reconstructed = reconstruct_from_tokens(&tokens);
assert_eq!(reconstructed, data);
}
#[test]
fn test_hash_collisions() {
let mut compressor = Lz77Compressor::new(6);
let mut data = Vec::new();
for i in 0..1000 {
data.push((i % 4) as u8);
}
let tokens = compressor.compress(&data);
let reconstructed = reconstruct_from_tokens(&tokens);
assert_eq!(reconstructed, data);
}
}
#[cfg(test)]
mod trace_backwards_tests {
use super::*;
#[test]
fn test_trace_backwards_zero_distance_match_multibyte() {
let compressor = Lz77Compressor::new(6);
let length_array = vec![0u16, 0, 0, 0, 0, 5, 0, 2];
let dist_array = vec![0u16, 0, 0, 0, 0, 0, 0, 0];
let data = b"abcdefg";
let tokens = compressor.trace_backwards(&length_array, &dist_array, data);
assert_eq!(tokens.len(), 2, "Should produce 2 tokens");
match tokens[1] {
Token::Literal(b) => {
assert_eq!(
b, b'f',
"Second literal should be 'f' (byte at position 5), got '{}'. \
This indicates data_pos was not advanced correctly.",
b as char
);
}
Token::Match { .. } => {
panic!("Expected literal, got match");
}
}
}
}
#[cfg(test)]
mod cache_tests {
use super::*;
#[test]
fn test_longest_match_cache_new() {
let cache = LongestMatchCache::new(100);
assert!(!cache.is_populated());
}
#[test]
fn test_longest_match_cache_set_get() {
let mut cache = LongestMatchCache::new(10);
let mut sublen = [0u16; 259];
sublen[3] = 5;
sublen[4] = 3;
sublen[5] = 2;
cache.set(3, sublen, 5);
let (retrieved_sublen, max_len) = cache.get(3).unwrap();
assert_eq!(max_len, 5);
assert_eq!(retrieved_sublen[3], 5);
assert_eq!(retrieved_sublen[4], 3);
assert_eq!(retrieved_sublen[5], 2);
}
#[test]
fn test_longest_match_cache_is_populated() {
let mut cache = LongestMatchCache::new(10);
assert!(!cache.is_populated());
let mut sublen = [0u16; 259];
sublen[3] = 1;
cache.set(0, sublen, 3);
assert!(cache.is_populated());
}
#[test]
fn test_compress_optimal_cached_equivalence() {
let data = b"The quick brown fox jumps over the lazy dog. The quick brown fox jumps.";
let cost_model = CostModel::fixed();
let mut compressor1 = Lz77Compressor::new(6);
let tokens1 = compressor1.compress_optimal(data, &cost_model);
let mut compressor2 = Lz77Compressor::new(6);
let mut cache = LongestMatchCache::new(data.len());
let tokens2 = compressor2.compress_optimal_cached(data, &cost_model, &mut cache);
assert_eq!(
tokens1, tokens2,
"Cached version should match non-cached version"
);
}
#[test]
fn test_compress_optimal_cached_reuse() {
let data = b"abcabcabcabcabc";
let mut compressor = Lz77Compressor::new(6);
let mut cache = LongestMatchCache::new(data.len());
let cost_model1 = CostModel::fixed();
let tokens1 = compressor.compress_optimal_cached(data, &cost_model1, &mut cache);
assert!(cache.is_populated());
let cost_model2 = CostModel::fixed();
let tokens2 = compressor.compress_optimal_cached(data, &cost_model2, &mut cache);
let reconstructed1 = reconstruct_from_tokens(&tokens1);
let reconstructed2 = reconstruct_from_tokens(&tokens2);
assert_eq!(reconstructed1, data.to_vec());
assert_eq!(reconstructed2, data.to_vec());
}
fn reconstruct_from_tokens(tokens: &[Token]) -> Vec<u8> {
let mut result = Vec::new();
for token in tokens {
match token {
Token::Literal(b) => result.push(*b),
Token::Match { length, distance } => {
let start = result.len() - *distance as usize;
for i in 0..*length as usize {
result.push(result[start + i]);
}
}
}
}
result
}
}
#[cfg(test)]
mod run_detection_tests {
use super::*;
#[test]
fn test_detect_same_byte_run_no_run() {
let data = b"abcdefgh";
assert_eq!(detect_same_byte_run(data, 0), 1);
assert_eq!(detect_same_byte_run(data, 3), 1);
}
#[test]
fn test_detect_same_byte_run_short_run() {
let data = b"aabbcc";
assert_eq!(detect_same_byte_run(data, 0), 2);
assert_eq!(detect_same_byte_run(data, 2), 2);
assert_eq!(detect_same_byte_run(data, 4), 2);
}
#[test]
fn test_detect_same_byte_run_long_run() {
let data = vec![0u8; 500];
let run_len = detect_same_byte_run(&data, 0);
        assert_eq!(run_len, MAX_MATCH_LENGTH);
    }
#[test]
fn test_detect_same_byte_run_at_end() {
let data = b"abcaaaa";
        assert_eq!(detect_same_byte_run(data, 3), 4);
    }
#[test]
fn test_detect_same_byte_run_empty() {
let data: &[u8] = &[];
assert_eq!(detect_same_byte_run(data, 0), 0);
}
#[test]
fn test_detect_same_byte_run_single_byte() {
let data = b"a";
assert_eq!(detect_same_byte_run(data, 0), 1);
}
#[test]
fn test_lz77_zero_run_compression() {
let mut data = Vec::new();
data.extend_from_slice(&[0u8; 1000]);
let mut compressor = Lz77Compressor::new(6);
let tokens = compressor.compress(&data);
let match_count = tokens
.iter()
.filter(|t| matches!(t, Token::Match { .. }))
.count();
assert!(
match_count > 0,
"Zero run should produce at least one match"
);
assert!(tokens.len() < 50, "Zero run should compress to few tokens");
}
#[test]
fn test_lz77_mixed_runs_compression() {
let mut data = Vec::new();
for _ in 0..10 {
            data.extend_from_slice(&[0u8; 100]);
            data.extend_from_slice(&[1, 2, 3, 4, 5]);
        }
let mut compressor = Lz77Compressor::new(6);
let tokens = compressor.compress(&data);
let mut reconstructed = Vec::new();
for token in &tokens {
match token {
Token::Literal(b) => reconstructed.push(*b),
Token::Match { length, distance } => {
let start = reconstructed.len() - *distance as usize;
for i in 0..*length as usize {
reconstructed.push(reconstructed[start + i]);
}
}
}
}
assert_eq!(reconstructed, data);
}
#[test]
fn test_run_detection_in_optimal_parsing() {
let data = vec![42u8; 500];
let mut compressor = Lz77Compressor::new(9);
let cost_model = CostModel::fixed();
let tokens = compressor.compress_optimal(&data, &cost_model);
let mut reconstructed = Vec::new();
for token in &tokens {
match token {
Token::Literal(b) => reconstructed.push(*b),
Token::Match { length, distance } => {
assert!(
*distance >= 1,
"Match distance should be >= 1, got {distance}"
);
let start = reconstructed.len() - *distance as usize;
for i in 0..*length as usize {
reconstructed.push(reconstructed[start + i]);
}
}
}
}
assert_eq!(reconstructed, data);
}
}