#![deny(missing_docs)]
mod bitstream;
mod huffman;
mod match_finder;
mod pretree;
mod verbatim;
use bitstream::BitWriter;
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[allow(missing_docs)] pub enum WindowSize {
KB32,
KB64,
KB128,
KB256,
KB512,
MB1,
MB2,
MB4,
MB8,
MB16,
MB32,
}
impl WindowSize {
pub fn bytes(self) -> u32 {
match self {
WindowSize::KB32 => 0x8000,
WindowSize::KB64 => 0x10000,
WindowSize::KB128 => 0x20000,
WindowSize::KB256 => 0x40000,
WindowSize::KB512 => 0x80000,
WindowSize::MB1 => 0x100000,
WindowSize::MB2 => 0x200000,
WindowSize::MB4 => 0x400000,
WindowSize::MB8 => 0x800000,
WindowSize::MB16 => 0x1000000,
WindowSize::MB32 => 0x2000000,
}
}
pub fn position_slots(self) -> usize {
match self {
WindowSize::KB32 => 30,
WindowSize::KB64 => 32,
WindowSize::KB128 => 34,
WindowSize::KB256 => 36,
WindowSize::KB512 => 38,
WindowSize::MB1 => 42,
WindowSize::MB2 => 50,
WindowSize::MB4 => 66,
WindowSize::MB8 => 98,
WindowSize::MB16 => 162,
WindowSize::MB32 => 290,
}
}
pub fn max_match_offset(self) -> usize {
const fn footer_bits(s: usize) -> u32 {
if s < 4 {
0
} else if s >= 36 {
17
} else {
(s as u32 - 2) / 2
}
}
const BASE: [u32; 291] = {
let mut t = [0u32; 291];
let mut i = 1usize;
while i < 291 {
t[i] = t[i - 1].wrapping_add(1u32 << footer_bits(i - 1));
i += 1;
}
t
};
let top = BASE[self.position_slots()];
(top as usize).saturating_sub(3)
}
}
pub const MAX_CHUNK_SIZE: usize = 32 * 1024;
const BLOCK_TYPE_UNCOMPRESSED: u32 = 0b011;
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Strategy {
Uncompressed,
LiteralOnly,
Greedy,
}
pub struct Encoder {
window_size: WindowSize,
e8_translation_size: Option<u32>,
first_chunk: bool,
input_offset: u64,
strategy: Strategy,
verbatim_state: verbatim::VerbatimState,
match_finder: match_finder::MatchFinder,
tokens: Vec<verbatim::Token>,
e8_scratch: Vec<u8>,
}
impl Encoder {
pub fn new(window_size: WindowSize) -> Self {
Self {
window_size,
e8_translation_size: None,
first_chunk: true,
input_offset: 0,
strategy: Strategy::Greedy,
verbatim_state: verbatim::VerbatimState::new(window_size),
match_finder: match_finder::MatchFinder::new(window_size.max_match_offset()),
tokens: Vec::new(),
e8_scratch: Vec::new(),
}
}
pub fn set_e8_translation(&mut self, translation_size: u32) {
self.e8_translation_size = Some(translation_size);
}
pub fn with_e8_translation(mut self, translation_size: u32) -> Self {
self.set_e8_translation(translation_size);
self
}
pub fn set_strategy(&mut self, strategy: Strategy) {
self.strategy = strategy;
}
pub fn with_strategy(mut self, strategy: Strategy) -> Self {
self.set_strategy(strategy);
self
}
pub fn encode_chunk(&mut self, input: &[u8]) -> Vec<u8> {
assert!(input.len() <= MAX_CHUNK_SIZE);
let mut w = BitWriter::with_capacity(input.len() + 64);
if self.first_chunk {
self.first_chunk = false;
match self.e8_translation_size {
Some(size) => {
w.write_bit(true);
w.write_bits(size, 32);
}
None => w.write_bit(false),
}
}
let effective_input: &[u8] = if let Some(size) = self.e8_translation_size {
self.e8_scratch.clear();
self.e8_scratch.extend_from_slice(input);
e8_preprocess_in_place(&mut self.e8_scratch, self.input_offset, size as i32);
&self.e8_scratch
} else {
input
};
let has_tokens = match self.strategy {
Strategy::Uncompressed => {
self.tokens.clear();
false
}
Strategy::LiteralOnly => {
self.tokens.clear();
self.tokens.extend(effective_input.iter().map(|&b| verbatim::Token::Literal(b)));
true
}
Strategy::Greedy => {
self.match_finder.process(effective_input, &mut self.tokens);
true
}
};
let current_r = self.verbatim_state.r;
let use_verbatim = has_tokens && verbatim::main_tree_is_nondegenerate(&self.tokens, current_r);
if use_verbatim {
verbatim::emit_verbatim_block(
&mut w,
&mut self.verbatim_state,
&self.tokens,
effective_input.len() as u32,
self.window_size,
);
} else {
self.emit_uncompressed_block(&mut w, effective_input);
}
self.input_offset += input.len() as u64;
w.finish()
}
fn emit_uncompressed_block(&self, w: &mut BitWriter, data: &[u8]) {
w.write_bits(BLOCK_TYPE_UNCOMPRESSED, 3);
w.write_u24_be(data.len() as u32);
w.align();
w.write_u32_le(1);
w.write_u32_le(1);
w.write_u32_le(1);
w.write_raw(data);
if !data.len().is_multiple_of(2) {
w.write_raw(&[0]);
}
}
pub fn window_size(&self) -> WindowSize {
self.window_size
}
}
fn e8_preprocess_in_place(buf: &mut [u8], base_offset: u64, translation_size: i32) {
if buf.len() <= 10 || translation_size <= 0 {
return;
}
let scan_end = buf.len() - 10;
let mut pos = 0;
while pos < scan_end {
let cp64 = base_offset.saturating_add(pos as u64);
if cp64 >= translation_size as u64 {
break;
}
if buf[pos] != 0xE8 {
pos += 1;
continue;
}
let cp = cp64 as i32;
let rel = i32::from_le_bytes([buf[pos + 1], buf[pos + 2], buf[pos + 3], buf[pos + 4]]);
let abs_pos = rel.wrapping_add(cp);
let abs_neg = rel.wrapping_sub(translation_size);
let abs = if abs_pos > 0 && abs_pos < translation_size {
Some(abs_pos)
} else if abs_neg >= -cp && abs_neg <= 0 {
Some(abs_neg)
} else {
None
};
if let Some(a) = abs {
buf[pos + 1..pos + 5].copy_from_slice(&a.to_le_bytes());
pos += 5;
} else {
pos += 1;
}
}
}
#[must_use = "EncoderWriter is an unterminated compressed stream; call .finish() \
to surface the final write's errors and recover the inner sink"]
pub struct EncoderWriter<W: std::io::Write> {
inner: Option<W>,
encoder: Encoder,
buf: Vec<u8>,
}
impl<W: std::io::Write> EncoderWriter<W> {
pub fn new(inner: W, window_size: WindowSize) -> Self {
Self { inner: Some(inner), encoder: Encoder::new(window_size), buf: Vec::with_capacity(MAX_CHUNK_SIZE) }
}
pub fn with_e8_translation(mut self, translation_size: u32) -> Self {
self.encoder.set_e8_translation(translation_size);
self
}
pub fn with_strategy(mut self, strategy: Strategy) -> Self {
self.encoder.set_strategy(strategy);
self
}
pub fn finish(mut self) -> std::io::Result<W> {
self.flush_trailing()?;
Ok(self.inner.take().expect("EncoderWriter::finish called twice"))
}
fn flush_trailing(&mut self) -> std::io::Result<()> {
if !self.buf.is_empty() && self.inner.is_some() {
self.emit_chunk()?;
}
Ok(())
}
fn emit_chunk(&mut self) -> std::io::Result<()> {
let chunk = self.encoder.encode_chunk(&self.buf);
let size = u16::try_from(chunk.len()).map_err(|_| {
std::io::Error::new(std::io::ErrorKind::InvalidData, "compressed chunk exceeds u16 size prefix")
})?;
let inner = self.inner.as_mut().expect("inner sink taken; cannot emit");
inner.write_all(&size.to_be_bytes())?;
inner.write_all(&chunk)?;
self.buf.clear();
Ok(())
}
}
impl<W: std::io::Write> std::io::Write for EncoderWriter<W> {
fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
assert!(self.inner.is_some(), "EncoderWriter: write after finish");
let mut written = 0;
while written < buf.len() {
let want = MAX_CHUNK_SIZE - self.buf.len();
let take = want.min(buf.len() - written);
self.buf.extend_from_slice(&buf[written..written + take]);
written += take;
if self.buf.len() == MAX_CHUNK_SIZE {
self.emit_chunk()?;
}
}
Ok(written)
}
fn flush(&mut self) -> std::io::Result<()> {
self.inner.as_mut().expect("EncoderWriter: flush after finish").flush()
}
}
impl<W: std::io::Write> Drop for EncoderWriter<W> {
fn drop(&mut self) {
let _ = self.flush_trailing();
}
}
#[cfg(test)]
mod tests {
use super::*;
fn to_lzxd(w: WindowSize) -> lzxd::WindowSize {
match w {
WindowSize::KB32 => lzxd::WindowSize::KB32,
WindowSize::KB64 => lzxd::WindowSize::KB64,
WindowSize::KB128 => lzxd::WindowSize::KB128,
WindowSize::KB256 => lzxd::WindowSize::KB256,
WindowSize::KB512 => lzxd::WindowSize::KB512,
WindowSize::MB1 => lzxd::WindowSize::MB1,
WindowSize::MB2 => lzxd::WindowSize::MB2,
WindowSize::MB4 => lzxd::WindowSize::MB4,
WindowSize::MB8 => lzxd::WindowSize::MB8,
WindowSize::MB16 => lzxd::WindowSize::MB16,
WindowSize::MB32 => lzxd::WindowSize::MB32,
}
}
fn roundtrip(input: &[u8], window: WindowSize) {
let mut enc = Encoder::new(window);
let mut dec = lzxd::Lzxd::new(to_lzxd(window));
let mut decoded: Vec<u8> = Vec::with_capacity(input.len());
let mut offset = 0;
let chunks: Vec<&[u8]> = if input.is_empty() { vec![&[][..]] } else { input.chunks(MAX_CHUNK_SIZE).collect() };
for chunk in chunks {
let chunk_bytes = enc.encode_chunk(chunk);
if chunk.is_empty() {
continue;
}
let out = dec.decompress_next(&chunk_bytes, chunk.len()).unwrap();
assert_eq!(out, chunk, "chunk at offset {} mismatch", offset);
decoded.extend_from_slice(out);
offset += chunk.len();
}
assert_eq!(decoded, input);
}
#[test]
fn empty_input() {
roundtrip(&[], WindowSize::KB32);
}
#[test]
fn short_input() {
roundtrip(b"hello", WindowSize::KB32);
}
#[test]
fn odd_length_input() {
roundtrip(b"hello world!", WindowSize::KB32);
}
#[test]
fn max_chunk_size() {
let input: Vec<u8> = (0..MAX_CHUNK_SIZE).map(|i| (i & 0xFF) as u8).collect();
roundtrip(&input, WindowSize::KB32);
}
#[test]
fn multiple_chunks() {
let input: Vec<u8> = (0..MAX_CHUNK_SIZE * 3 + 100).map(|i| (i & 0xFF) as u8).collect();
roundtrip(&input, WindowSize::KB64);
}
#[test]
fn random_bytes() {
let mut state = 0xDEAD_BEEFu32;
let input: Vec<u8> = (0..5000)
.map(|_| {
state = state.wrapping_mul(1_103_515_245).wrapping_add(12_345);
(state >> 16) as u8
})
.collect();
roundtrip(&input, WindowSize::KB128);
}
#[test]
fn highly_repetitive_compresses() {
let mut input = Vec::with_capacity(20_000);
for _ in 0..(20_000 / 16) {
input.extend_from_slice(b"abcdefghijklmnop");
}
roundtrip(&input, WindowSize::KB32);
let mut enc = Encoder::new(WindowSize::KB32);
let compressed: usize = input.chunks(MAX_CHUNK_SIZE).map(|c| enc.encode_chunk(c).len()).sum();
assert!(
compressed < input.len() / 4,
"expected at least 4x compression, got {} -> {}",
input.len(),
compressed
);
}
#[test]
fn text_data_roundtrips() {
let text = b"Lorem ipsum dolor sit amet, consectetur adipiscing elit. \
Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. \
Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris \
nisi ut aliquip ex ea commodo consequat. \
Lorem ipsum dolor sit amet, consectetur adipiscing elit.";
roundtrip(text, WindowSize::KB32);
}
#[test]
fn literal_only_strategy_roundtrips() {
let mut enc = Encoder::new(WindowSize::KB32).with_strategy(Strategy::LiteralOnly);
let mut dec = lzxd::Lzxd::new(lzxd::WindowSize::KB32);
let input = b"hello literal-only path!";
let bytes = enc.encode_chunk(input);
let out = dec.decompress_next(&bytes, input.len()).unwrap();
assert_eq!(out, input);
}
#[test]
fn uncompressed_strategy_roundtrips() {
let mut enc = Encoder::new(WindowSize::KB32).with_strategy(Strategy::Uncompressed);
let mut dec = lzxd::Lzxd::new(lzxd::WindowSize::KB32);
let input = b"hello uncompressed!";
let bytes = enc.encode_chunk(input);
let out = dec.decompress_next(&bytes, input.len()).unwrap();
assert_eq!(out, input);
}
#[test]
fn two_chunks_random() {
let mut state = 0x1234_5678u32;
let input: Vec<u8> = (0..40_000)
.map(|_| {
state = state.wrapping_mul(1_103_515_245).wrapping_add(12_345);
(state >> 16) as u8
})
.collect();
roundtrip(&input, WindowSize::MB1);
}
#[test]
fn four_chunks_random() {
let mut state = 0x1234_5678u32;
let input: Vec<u8> = (0..130_000)
.map(|_| {
state = state.wrapping_mul(1_103_515_245).wrapping_add(12_345);
(state >> 16) as u8
})
.collect();
roundtrip(&input, WindowSize::MB1);
}
#[test]
fn three_chunks_random() {
let mut state = 0x1234_5678u32;
let input: Vec<u8> = (0..70_000)
.map(|_| {
state = state.wrapping_mul(1_103_515_245).wrapping_add(12_345);
(state >> 16) as u8
})
.collect();
roundtrip(&input, WindowSize::MB1);
}
#[test]
fn large_random_multichunk() {
let mut state = 0x1234_5678u32;
let input: Vec<u8> = (0..200_000)
.map(|_| {
state = state.wrapping_mul(1_103_515_245).wrapping_add(12_345);
(state >> 16) as u8
})
.collect();
roundtrip(&input, WindowSize::MB1);
}
#[test]
fn large_random_literal_only() {
let mut state = 0x1234_5678u32;
let input: Vec<u8> = (0..200_000)
.map(|_| {
state = state.wrapping_mul(1_103_515_245).wrapping_add(12_345);
(state >> 16) as u8
})
.collect();
let mut enc = Encoder::new(WindowSize::MB1).with_strategy(Strategy::LiteralOnly);
let mut dec = lzxd::Lzxd::new(lzxd::WindowSize::MB1);
for chunk in input.chunks(MAX_CHUNK_SIZE) {
let bytes = enc.encode_chunk(chunk);
let out = dec.decompress_next(&bytes, chunk.len()).unwrap();
assert_eq!(out, chunk);
}
}
#[test]
fn real_pe_basefile_roundtrips() {
let Ok(basefile) = std::fs::read("../xex_files/afplayer.xex") else {
eprintln!("skipping: afplayer.xex not found");
return;
};
let mut enc = Encoder::new(WindowSize::KB64);
let mut dec = lzxd::Lzxd::new(lzxd::WindowSize::KB64);
let mut decoded = Vec::with_capacity(basefile.len());
let mut compressed: usize = 0;
for chunk in basefile.chunks(MAX_CHUNK_SIZE) {
let bytes = enc.encode_chunk(chunk);
compressed += bytes.len();
let out = dec.decompress_next(&bytes, chunk.len()).unwrap();
decoded.extend_from_slice(out);
}
assert_eq!(decoded, basefile, "round-trip failed on real XEX bytes");
eprintln!(
"afplayer.xex: {} -> {} ({:.1}%)",
basefile.len(),
compressed,
100.0 * compressed as f64 / basefile.len() as f64
);
}
#[test]
fn single_byte_input_falls_back_to_uncompressed() {
let mut enc = Encoder::new(WindowSize::KB32);
let mut dec = lzxd::Lzxd::new(lzxd::WindowSize::KB32);
let bytes = enc.encode_chunk(b"a");
let out = dec.decompress_next(&bytes, 1).unwrap();
assert_eq!(out, b"a");
}
#[test]
fn identical_chunks_produce_identical_trees() {
let input: Vec<u8> = (0..40_000).map(|i| (i & 0xFF) as u8).collect();
roundtrip(&input, WindowSize::KB64);
}
#[test]
fn e8_preprocessing_round_trips_through_lzxd() {
let mut input = vec![0u8; 8192];
for (i, b) in input.iter_mut().enumerate() {
*b = (i & 0x7F) as u8; }
input[100] = 0xE8;
input[101..105].copy_from_slice(&0x1000i32.to_le_bytes());
input[2000] = 0xE8;
input[2001..2005].copy_from_slice(&42i32.to_le_bytes());
let mut enc = Encoder::new(WindowSize::KB32).with_e8_translation(input.len() as u32);
let bytes = enc.encode_chunk(&input);
let mut dec = lzxd::Lzxd::new(lzxd::WindowSize::KB32);
let out = dec.decompress_next(&bytes, input.len()).unwrap();
assert_eq!(out, input, "E8 preprocessing must round-trip through lzxd");
}
#[test]
fn e8_preprocessing_leaves_non_e8_bytes_alone() {
let input = vec![0x00u8; 4096];
let mut buf = input.clone();
e8_preprocess_in_place(&mut buf, 0, input.len() as i32);
assert_eq!(buf, input);
}
#[test]
fn encoder_writer_drop_flushes_trailing_chunk() {
let input = b"just a tiny bit of input, not even one chunk worth".to_vec();
let mut out: Vec<u8> = Vec::new();
{
let mut w = EncoderWriter::new(&mut out, WindowSize::KB32);
use std::io::Write as _;
w.write_all(&input).unwrap();
}
let mut dec = lzxd::Lzxd::new(lzxd::WindowSize::KB32);
let size = u16::from_be_bytes([out[0], out[1]]) as usize;
let chunk = &out[2..2 + size];
let plain = dec.decompress_next(chunk, input.len()).unwrap();
assert_eq!(plain, input);
}
#[test]
fn encoder_writer_roundtrips_through_u16_prefixed_frames() {
let input: Vec<u8> = (0..80_000).map(|i| ((i * 7) & 0xFF) as u8).collect();
let mut out: Vec<u8> = Vec::new();
{
let mut w = EncoderWriter::new(&mut out, WindowSize::KB64);
use std::io::Write as _;
w.write_all(&input).unwrap();
w.finish().unwrap();
}
let mut dec = lzxd::Lzxd::new(lzxd::WindowSize::KB64);
let mut decoded = Vec::with_capacity(input.len());
let mut p = 0;
let mut remaining = input.len();
while p < out.len() && remaining > 0 {
let size = u16::from_be_bytes([out[p], out[p + 1]]) as usize;
p += 2;
let chunk = &out[p..p + size];
p += size;
let want = remaining.min(MAX_CHUNK_SIZE);
let plain = dec.decompress_next(chunk, want).unwrap();
decoded.extend_from_slice(plain);
remaining -= plain.len();
}
assert_eq!(decoded, input);
}
#[test]
fn mixed_pattern_compression() {
let mut input = Vec::with_capacity(128 * 1024);
for _ in 0..(64 * 1024 / 16) {
input.extend_from_slice(b"The quick brown ");
}
let mut state = 0xFEED_FACEu32;
for _ in 0..(64 * 1024) {
state = state.wrapping_mul(1_103_515_245).wrapping_add(12_345);
input.push((state >> 16) as u8);
}
roundtrip(&input, WindowSize::KB128);
}
}