pub const WINDOW_SIZE: usize = 32768;
pub const MIN_MATCH: usize = 3;
pub const MAX_MATCH: usize = 258;
const HASH_SIZE: usize = 32768;
const HASH_MASK: usize = HASH_SIZE - 1;
const MAX_CHAIN_LENGTH: usize = 4096;
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Lz77Token {
Literal(u8),
Match {
length: u16,
distance: u16,
},
}
#[derive(Debug)]
pub struct Lz77Encoder {
window: Vec<u8>,
window_pos: usize,
hash_table: Vec<u16>,
hash_chain: Vec<u16>,
max_chain: usize,
min_match: usize,
lazy_match: bool,
nice_length: usize,
}
impl Lz77Encoder {
pub fn new() -> Self {
Self::with_level(6)
}
pub fn with_level(level: u8) -> Self {
let level = level.min(9);
let (max_chain, min_match, lazy_match, nice_length) = match level {
0 => (0, MAX_MATCH + 1, false, MAX_MATCH), 1 => (4, 4, false, MAX_MATCH),
2 => (8, 4, false, MAX_MATCH),
3 => (16, 4, false, MAX_MATCH),
4 => (32, 4, false, MAX_MATCH),
5 => (64, 4, true, 64),
6 => (128, 4, true, 128),
7 => (256, 3, true, 192),
8 => (1024, 3, true, 250),
9 => (MAX_CHAIN_LENGTH, 3, true, MAX_MATCH),
_ => unreachable!(),
};
Self {
window: vec![0; WINDOW_SIZE * 2],
window_pos: 0,
hash_table: vec![0; HASH_SIZE],
hash_chain: vec![0; WINDOW_SIZE],
max_chain,
min_match,
lazy_match,
nice_length,
}
}
pub fn reset(&mut self) {
self.window_pos = 0;
self.hash_table.fill(0);
self.hash_chain.fill(0);
}
pub fn set_dictionary(&mut self, dictionary: &[u8]) -> u32 {
self.reset();
let dict_to_use = if dictionary.len() > WINDOW_SIZE {
&dictionary[dictionary.len() - WINDOW_SIZE..]
} else {
dictionary
};
self.window[..dict_to_use.len()].copy_from_slice(dict_to_use);
self.window_pos = dict_to_use.len();
let len = dict_to_use.len();
if len >= 4 {
for pos in 0..len.saturating_sub(3) {
let h = Self::hash(
dict_to_use[pos],
dict_to_use[pos + 1],
dict_to_use[pos + 2],
dict_to_use[pos + 3],
);
let prev = self.hash_table[h];
self.hash_chain[pos & (WINDOW_SIZE - 1)] = prev;
self.hash_table[h] = pos as u16;
}
}
Self::adler32(dictionary)
}
fn adler32(data: &[u8]) -> u32 {
const MOD_ADLER: u32 = 65521;
const NMAX: usize = 5552;
let mut a: u32 = 1;
let mut b: u32 = 0;
let mut remaining = data;
while remaining.len() >= NMAX {
let (chunk, rest) = remaining.split_at(NMAX);
remaining = rest;
for &byte in chunk {
a += byte as u32;
b += a;
}
a %= MOD_ADLER;
b %= MOD_ADLER;
}
for &byte in remaining {
a += byte as u32;
b += a;
}
((b % MOD_ADLER) << 16) | (a % MOD_ADLER)
}
pub fn has_dictionary(&self) -> bool {
self.window_pos > 0
}
pub fn dictionary_size(&self) -> usize {
self.window_pos
}
#[inline(always)]
fn hash(b0: u8, b1: u8, b2: u8, b3: u8) -> usize {
let h = ((b0 as usize).wrapping_mul(506832829))
^ ((b1 as usize).wrapping_mul(2654435761) << 8)
^ ((b2 as usize).wrapping_mul(374761393) << 16)
^ ((b3 as usize).wrapping_mul(1000000007) << 24);
(h ^ (h >> 15)) & HASH_MASK
}
fn update_hash(&mut self, pos: usize) {
if pos + 3 < self.window.len() {
let h = Self::hash(
self.window[pos],
self.window[pos + 1],
self.window[pos + 2],
self.window[pos + 3],
);
let prev = self.hash_table[h];
self.hash_chain[pos & (WINDOW_SIZE - 1)] = prev;
self.hash_table[h] = pos as u16;
}
}
fn find_match(&self, pos: usize, max_len: usize) -> Option<(u16, u16)> {
if pos < MIN_MATCH || max_len < self.min_match {
return None;
}
let h = if pos + 3 < self.window.len() {
Self::hash(
self.window[pos],
self.window[pos + 1],
self.window[pos + 2],
self.window[pos + 3],
)
} else if pos + 2 < self.window.len() {
Self::hash(
self.window[pos],
self.window[pos + 1],
self.window[pos + 2],
0,
)
} else {
return None;
};
let mut match_pos = self.hash_table[h] as usize;
let mut best_len = self.min_match - 1;
let mut best_dist = 0usize;
let min_pos = pos.saturating_sub(WINDOW_SIZE);
let mut chain_len = 0;
let max_check = max_len.min(MAX_MATCH);
while match_pos >= min_pos && match_pos < pos && chain_len < self.max_chain {
let dist = pos - match_pos;
if dist > 0 && dist <= WINDOW_SIZE {
if self.window[match_pos + best_len] == self.window[pos + best_len] {
if self.window[match_pos] == self.window[pos] {
let mut len = 0;
if len < max_check && self.window[match_pos] == self.window[pos] {
len = 1;
if len < max_check && self.window[match_pos + 1] == self.window[pos + 1]
{
len = 2;
if len < max_check
&& self.window[match_pos + 2] == self.window[pos + 2]
{
len = 3;
while len < max_check
&& self.window[match_pos + len] == self.window[pos + len]
{
len += 1;
}
}
}
}
if len > best_len {
best_len = len;
best_dist = dist;
if len >= max_len || len >= MAX_MATCH || best_len >= self.nice_length {
break;
}
}
}
}
}
match_pos = self.hash_chain[match_pos & (WINDOW_SIZE - 1)] as usize;
chain_len += 1;
}
if best_len >= self.min_match && best_len >= MIN_MATCH {
Some((best_len as u16, best_dist as u16))
} else {
None
}
}
pub fn compress(&mut self, input: &[u8]) -> Vec<Lz77Token> {
let mut tokens = Vec::with_capacity(input.len());
let mut input_pos = 0;
while input_pos < input.len() {
let space_in_window = self.window.len().saturating_sub(self.window_pos);
let chunk_size = space_in_window.min(input.len() - input_pos);
let start = self.window_pos;
self.window[start..start + chunk_size]
.copy_from_slice(&input[input_pos..input_pos + chunk_size]);
let end = start + chunk_size;
let mut pos = start;
while pos < end {
let remaining = end - pos;
let match_result = self.find_match(pos, remaining);
if let Some((length, distance)) = match_result {
let mut use_match = true;
if self.lazy_match && pos + 1 < end {
self.update_hash(pos);
if let Some((next_len, _)) = self.find_match(pos + 1, remaining - 1) {
if next_len > length + 1 {
use_match = false;
}
}
}
if use_match {
tokens.push(Lz77Token::Match { length, distance });
for i in 0..length as usize {
self.update_hash(pos + i);
}
pos += length as usize;
continue;
}
}
tokens.push(Lz77Token::Literal(self.window[pos]));
self.update_hash(pos);
pos += 1;
}
self.window_pos = end;
input_pos += chunk_size;
if self.window_pos >= WINDOW_SIZE + WINDOW_SIZE / 2 {
self.slide_window();
}
}
tokens
}
fn slide_window(&mut self) {
let slide_amount = WINDOW_SIZE;
self.window.copy_within(slide_amount..self.window_pos, 0);
self.window_pos -= slide_amount;
for entry in &mut self.hash_table {
if *entry >= slide_amount as u16 {
*entry -= slide_amount as u16;
} else {
*entry = 0;
}
}
for entry in &mut self.hash_chain {
if *entry >= slide_amount as u16 {
*entry -= slide_amount as u16;
} else {
*entry = 0;
}
}
}
pub fn compress_all(input: &[u8], level: u8) -> Vec<Lz77Token> {
let mut encoder = Self::with_level(level);
encoder.compress(input)
}
}
impl Default for Lz77Encoder {
fn default() -> Self {
Self::new()
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_literals_only() {
let input = b"abcdefgh";
let tokens = Lz77Encoder::compress_all(input, 6);
assert!(tokens.iter().all(|t| matches!(t, Lz77Token::Literal(_))));
assert_eq!(tokens.len(), 8);
}
#[test]
fn test_simple_match() {
let input = b"abcabcabc";
let tokens = Lz77Encoder::compress_all(input, 6);
let has_match = tokens.iter().any(|t| matches!(t, Lz77Token::Match { .. }));
assert!(has_match, "Should find at least one match");
}
#[test]
fn test_repeated_char() {
let input = b"aaaaaaaaaa";
let tokens = Lz77Encoder::compress_all(input, 6);
let total_output: usize = tokens
.iter()
.map(|t| match t {
Lz77Token::Literal(_) => 1,
Lz77Token::Match { length, .. } => *length as usize,
})
.sum();
assert_eq!(total_output, 10);
assert!(tokens.len() < 10, "Should compress repeated chars");
}
#[test]
fn test_decode_matches() {
let input = b"Hello, Hello, Hello!";
let tokens = Lz77Encoder::compress_all(input, 6);
let mut output = Vec::new();
for token in &tokens {
match token {
Lz77Token::Literal(b) => output.push(*b),
Lz77Token::Match { length, distance } => {
for _ in 0..*length {
let pos = output.len() - *distance as usize;
output.push(output[pos]);
}
}
}
}
assert_eq!(output, input);
}
#[test]
fn test_level_0_store() {
let input = b"test data test data";
let tokens = Lz77Encoder::compress_all(input, 0);
assert!(tokens.iter().all(|t| matches!(t, Lz77Token::Literal(_))));
}
#[test]
fn test_hash() {
let h1 = Lz77Encoder::hash(b'a', b'b', b'c', b'd');
let h2 = Lz77Encoder::hash(b'a', b'b', b'c', b'd');
assert_eq!(h1, h2);
let h3 = Lz77Encoder::hash(b'x', b'y', b'z', b'w');
let _ = h3;
}
}