use std::io::{self, Write};
use super::radix_mf::RadixMatchFinder;
/// Upper bound on the number of payload bytes placed in a single uncompressed chunk.
///
/// The uncompressed chunk header written in this file stores `size - 1` as a
/// big-endian `u16`.
// NOTE(review): a 16-bit size-minus-one field can describe up to 65536 bytes;
// confirm whether stopping at 65535 is intentional.
const MAX_UNCOMPRESSED_CHUNK_SIZE: usize = 65535;
/// Default block size in bytes (1 MiB). Not referenced by the visible code in this file.
pub const DEFAULT_BLOCK_SIZE: usize = 1024 * 1024;
/// One unit of tokenizer output: either a raw byte or a reference to earlier data.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Token {
    /// A single byte emitted verbatim.
    Literal(u8),
    /// A reference to previously seen data.
    Match {
        /// Offset of the referenced data, as reported by the match finder.
        distance: u32,
        /// Number of bytes covered by this match.
        length: u32,
    },
}

impl Token {
    /// Returns `true` when this token is a [`Token::Literal`].
    pub fn is_literal(&self) -> bool {
        match self {
            Token::Literal(_) => true,
            Token::Match { .. } => false,
        }
    }

    /// Returns `true` when this token is a [`Token::Match`].
    pub fn is_match(&self) -> bool {
        // The enum has exactly two variants, so "not a literal" means "match".
        !self.is_literal()
    }
}
/// Tokenizes `data` greedily using matches reported by `match_finder`.
///
/// At every position the match finder is queried once. A reported match of
/// length 2 or more is taken immediately and the position advances by the
/// match length; otherwise the current byte is emitted as a literal and the
/// position advances by one.
///
/// Returns the tokens in input order.
pub fn encode_greedy(data: &[u8], match_finder: &RadixMatchFinder) -> Vec<Token> {
    // Worst case is one literal per input byte.
    let mut tokens = Vec::with_capacity(data.len());
    let mut cursor = 0usize;

    while let Some(&byte) = data.get(cursor) {
        match match_finder.get_match(data, cursor) {
            Some(found) if found.length >= 2 => {
                tokens.push(Token::Match {
                    distance: found.offset,
                    length: found.length,
                });
                cursor += found.length as usize;
            }
            // No match, or one too short to be worth a back-reference.
            _ => {
                tokens.push(Token::Literal(byte));
                cursor += 1;
            }
        }
    }

    tokens
}
/// Aggregate counts describing a token sequence.
#[derive(Debug, Clone, Default)]
pub struct EncodingStats {
    /// Number of literal tokens.
    pub literals: usize,
    /// Number of match tokens.
    pub matches: usize,
    /// Sum of the `length` fields of all match tokens.
    pub matched_bytes: usize,
}

impl EncodingStats {
    /// Tallies literals, matches and matched bytes over `tokens`.
    pub fn from_tokens(tokens: &[Token]) -> Self {
        tokens.iter().fold(Self::default(), |mut tally, token| {
            match *token {
                Token::Match { length, .. } => {
                    tally.matches += 1;
                    tally.matched_bytes += length as usize;
                }
                Token::Literal(_) => tally.literals += 1,
            }
            tally
        })
    }

    /// Fraction of the covered bytes (`literals + matched_bytes`) that came from matches.
    ///
    /// Returns `0.0` when no bytes are covered at all.
    pub fn match_ratio(&self) -> f64 {
        match self.literals + self.matched_bytes {
            0 => 0.0,
            covered => self.matched_bytes as f64 / covered as f64,
        }
    }
}
/// Writes a single uncompressed chunk to `output`.
///
/// The chunk consists of a control byte (`0x01` when `reset_dict` is set,
/// `0x02` otherwise), the payload size minus one as a big-endian `u16`, and
/// the payload itself.
///
/// Empty `data` writes nothing. Only the first `MAX_UNCOMPRESSED_CHUNK_SIZE`
/// bytes of `data` are written; anything beyond that is silently ignored, so
/// callers with larger buffers must split them (see `write_uncompressed_data`).
///
/// # Errors
///
/// Propagates any error returned by the underlying writer.
pub fn write_uncompressed_chunk(
    output: &mut impl Write,
    data: &[u8],
    reset_dict: bool,
) -> io::Result<()> {
    let payload = &data[..data.len().min(MAX_UNCOMPRESSED_CHUNK_SIZE)];
    if payload.is_empty() {
        return Ok(());
    }

    let control: u8 = match reset_dict {
        true => 0x01,
        false => 0x02,
    };
    // The payload is capped above, so `len - 1` fits in 16 bits.
    let size_field = ((payload.len() - 1) as u16).to_be_bytes();

    output.write_all(&[control])?;
    output.write_all(&size_field)?;
    output.write_all(payload)
}
/// Writes `data` as a sequence of uncompressed chunks.
///
/// The input is cut into pieces of at most `MAX_UNCOMPRESSED_CHUNK_SIZE`
/// bytes, each emitted through `write_uncompressed_chunk`. Only the first
/// chunk carries the dictionary-reset flag, and only when `reset_dict_first`
/// is set. Empty input writes nothing.
///
/// # Errors
///
/// Stops at and returns the first error reported while writing a chunk.
pub fn write_uncompressed_data(
    output: &mut impl Write,
    data: &[u8],
    reset_dict_first: bool,
) -> io::Result<()> {
    for (index, piece) in data.chunks(MAX_UNCOMPRESSED_CHUNK_SIZE).enumerate() {
        let reset = index == 0 && reset_dict_first;
        write_uncompressed_chunk(output, piece, reset)?;
    }
    Ok(())
}
/// Writes the single `0x00` byte that terminates the chunk sequence.
///
/// # Errors
///
/// Propagates any error returned by the underlying writer.
pub fn write_end_marker(output: &mut impl Write) -> io::Result<()> {
    const END_OF_STREAM: [u8; 1] = [0x00];
    output.write_all(&END_OF_STREAM)
}
/// Named control-byte values for chunk headers.
///
/// The writer functions visible in this file use literal bytes and
/// `ChunkResetMode` rather than these constants, which is why `dead_code`
/// is allowed on the module.
#[allow(dead_code)] mod ctrl {
/// Byte written by `write_end_marker`.
pub const END_MARKER: u8 = 0x00;
/// Control byte `write_uncompressed_chunk` uses when `reset_dict` is true.
pub const UNCOMPRESSED_RESET: u8 = 0x01;
/// Control byte `write_uncompressed_chunk` uses when `reset_dict` is false.
pub const UNCOMPRESSED_NO_RESET: u8 = 0x02;
/// Lowest control-byte value in the compressed-chunk range defined here.
pub const LZMA_BASE: u8 = 0x80;
/// Same value as `ChunkResetMode::StateReset`.
// NOTE(review): identical to LZMA_BASE. The xz LZMA2 decoder documents 0x80 as
// "no reset" and 0xA0 as "state reset" — confirm which meaning is intended.
pub const LZMA_RESET_STATE: u8 = 0x80;
/// Same value as `ChunkResetMode::StatePropsReset`.
pub const LZMA_RESET_STATE_PROPS: u8 = 0xC0;
/// Same value as `ChunkResetMode::AllReset`.
pub const LZMA_RESET_ALL: u8 = 0xE0;
}
/// Reset behaviour requested for a compressed chunk.
///
/// Each discriminant is the base value OR-ed into the chunk's control byte by
/// `write_compressed_chunk`.
// NOTE(review): the xz LZMA2 decoder documents 0x80 as "no reset" and 0xA0 as
// "state reset", so `None` and `StateReset` look swapped here. The tests in
// this file pin `StateReset` to 0x80; confirm against callers and a reference
// decoder before changing either value.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(u8)]
pub enum ChunkResetMode {
    /// No reset requested (control base `0xA0`).
    None = 0xA0,
    /// State reset (control base `0x80`).
    StateReset = 0x80,
    /// State reset plus a properties byte in the header (control base `0xC0`).
    StatePropsReset = 0xC0,
    /// Full reset plus a properties byte in the header (control base `0xE0`).
    AllReset = 0xE0,
}

impl ChunkResetMode {
    /// Returns `true` for the modes whose chunk header carries a properties byte.
    pub fn includes_props(self) -> bool {
        match self {
            ChunkResetMode::None | ChunkResetMode::StateReset => false,
            ChunkResetMode::StatePropsReset | ChunkResetMode::AllReset => true,
        }
    }
}
/// Largest uncompressed size callers are expected to pass for one compressed
/// chunk; checked with a `debug_assert!` in `write_compressed_chunk`.
const MAX_COMPRESSED_CHUNK_UNPACK_SIZE: usize = 1 << 16;

/// Writes one compressed chunk (header followed by `compressed`) to `output`.
///
/// Header layout, in order:
/// 1. control byte: `reset_mode as u8` OR-ed with bits 16..=20 of
///    `uncompressed_size - 1`;
/// 2. low 16 bits of `uncompressed_size - 1`, big-endian;
/// 3. `compressed.len() - 1` as a big-endian `u16`;
/// 4. a properties byte, only when `reset_mode.includes_props()`;
///    `props` defaults to `0x5D` when `None`.
///
/// Nothing is written when `compressed` is empty or `uncompressed_size` is 0.
///
/// # Errors
///
/// Returns `io::ErrorKind::InvalidInput`, without writing anything, when a
/// size cannot be represented in the header: `compressed.len()` above 65536
/// or `uncompressed_size` above 2 MiB (`1 << 21`). Previously such values were
/// silently truncated into a corrupt header. Errors from the underlying
/// writer are propagated.
///
/// # Panics
///
/// In debug builds, panics if `uncompressed_size` exceeds
/// `MAX_COMPRESSED_CHUNK_UNPACK_SIZE` (while still being representable).
pub fn write_compressed_chunk(
    output: &mut impl Write,
    compressed: &[u8],
    uncompressed_size: usize,
    reset_mode: ChunkResetMode,
    props: Option<u8>,
) -> io::Result<()> {
    // The packed size is stored as `len - 1` in 16 bits.
    const PACK_SIZE_LIMIT: usize = 1 << 16;
    // The unpacked size is stored as `size - 1` in 5 control bits + 16 bits.
    const UNPACK_SIZE_LIMIT: usize = 1 << 21;

    if compressed.is_empty() || uncompressed_size == 0 {
        return Ok(());
    }
    if compressed.len() > PACK_SIZE_LIMIT {
        return Err(io::Error::new(
            io::ErrorKind::InvalidInput,
            "compressed chunk payload exceeds 65536 bytes",
        ));
    }
    if uncompressed_size > UNPACK_SIZE_LIMIT {
        return Err(io::Error::new(
            io::ErrorKind::InvalidInput,
            "uncompressed chunk size exceeds 2 MiB",
        ));
    }
    debug_assert!(uncompressed_size <= MAX_COMPRESSED_CHUNK_UNPACK_SIZE);

    // Both casts below are lossless thanks to the range checks above.
    let unpack_size_minus1 = uncompressed_size - 1;
    let unpack_high_bits = ((unpack_size_minus1 >> 16) & 0x1F) as u8;
    let unpack_size_low = (unpack_size_minus1 & 0xFFFF) as u16;
    let pack_size_minus1 = (compressed.len() - 1) as u16;
    let ctrl = (reset_mode as u8) | unpack_high_bits;

    output.write_all(&[ctrl])?;
    output.write_all(&unpack_size_low.to_be_bytes())?;
    output.write_all(&pack_size_minus1.to_be_bytes())?;
    if reset_mode.includes_props() {
        output.write_all(&[props.unwrap_or(0x5D)])?;
    }
    output.write_all(compressed)?;
    Ok(())
}
#[cfg(test)]
mod tests {
    use super::*;

    /// `is_literal` is true for literals and false for matches.
    #[test]
    fn test_token_is_literal() {
        let literal = Token::Literal(b'a');
        let back_ref = Token::Match {
            distance: 1,
            length: 2,
        };
        assert!(literal.is_literal());
        assert!(!back_ref.is_literal());
    }

    /// `is_match` is true for matches and false for literals.
    #[test]
    fn test_token_is_match() {
        let literal = Token::Literal(b'a');
        let back_ref = Token::Match {
            distance: 1,
            length: 2,
        };
        assert!(!literal.is_match());
        assert!(back_ref.is_match());
    }

    /// Three literals and one 5-byte match are tallied separately.
    #[test]
    fn test_encoding_stats() {
        let tokens = vec![
            Token::Literal(b'a'),
            Token::Literal(b'b'),
            Token::Match {
                distance: 2,
                length: 5,
            },
            Token::Literal(b'c'),
        ];
        let stats = EncodingStats::from_tokens(&tokens);
        assert_eq!(stats.literals, 3);
        assert_eq!(stats.matches, 1);
        assert_eq!(stats.matched_bytes, 5);
    }

    /// Header is control byte 0x01, size-minus-one, then the raw payload.
    #[test]
    fn test_write_uncompressed_chunk() {
        let data = b"Hello, World!";
        let mut output: Vec<u8> = Vec::new();
        write_uncompressed_chunk(&mut output, data, true).unwrap();

        assert_eq!(output[0], 0x01);
        let size = u16::from_be_bytes([output[1], output[2]]);
        assert_eq!(size, 12);
        assert_eq!(&output[3..], data);
    }

    /// Without a dictionary reset the control byte is 0x02.
    #[test]
    fn test_write_uncompressed_chunk_no_reset() {
        let data = b"Test";
        let mut output: Vec<u8> = Vec::new();
        write_uncompressed_chunk(&mut output, data, false).unwrap();
        assert_eq!(output[0], 0x02);
    }

    /// The end marker is a single zero byte.
    #[test]
    fn test_write_end_marker() {
        let mut output: Vec<u8> = Vec::new();
        write_end_marker(&mut output).unwrap();
        assert_eq!(output, vec![0x00]);
    }

    /// Input without repetition is tokenized as one literal per byte.
    #[test]
    fn test_encode_greedy_all_literals() {
        let data = b"abcdefgh";
        let mut mf = RadixMatchFinder::new(1024, 32);
        mf.build(data);

        let tokens = encode_greedy(data, &mf);
        assert_eq!(tokens.len(), 8);
        for (token, &byte) in tokens.iter().zip(data.iter()) {
            assert_eq!(*token, Token::Literal(byte));
        }
    }

    /// Repetitive input yields a mix of matches and literals.
    #[test]
    fn test_encode_greedy_with_matches() {
        let data = b"abcabcabc";
        let mut mf = RadixMatchFinder::new(1024, 32);
        mf.build(data);

        let stats = EncodingStats::from_tokens(&encode_greedy(data, &mf));
        assert!(stats.matches > 0, "Should find at least one match");
        assert!(stats.literals > 0, "Should have some literals");
    }

    /// Input larger than one chunk still round-trips with header overhead added.
    #[test]
    fn test_write_uncompressed_data_large() {
        let data: Vec<u8> = (0..70000).map(|i| (i % 256) as u8).collect();
        let mut output: Vec<u8> = Vec::new();
        write_uncompressed_data(&mut output, &data, true).unwrap();
        assert!(output.len() > data.len());
    }

    /// Only the two props-carrying modes report `includes_props`.
    #[test]
    fn test_chunk_reset_mode_includes_props() {
        assert!(!ChunkResetMode::None.includes_props());
        assert!(!ChunkResetMode::StateReset.includes_props());
        assert!(ChunkResetMode::StatePropsReset.includes_props());
        assert!(ChunkResetMode::AllReset.includes_props());
    }

    /// Full header with props: control, unpack size, pack size, props, payload.
    #[test]
    fn test_write_compressed_chunk_format() {
        let compressed = vec![0x00, 0x01, 0x02, 0x03, 0x04];
        let uncompressed_size = 100;
        let mut output: Vec<u8> = Vec::new();
        write_compressed_chunk(
            &mut output,
            &compressed,
            uncompressed_size,
            ChunkResetMode::AllReset,
            Some(0x5D),
        )
        .unwrap();

        assert_eq!(output[0], 0xE0);
        let unpack_size = u16::from_be_bytes([output[1], output[2]]);
        assert_eq!(unpack_size, 99);
        let pack_size = u16::from_be_bytes([output[3], output[4]]);
        assert_eq!(pack_size, 4);
        assert_eq!(output[5], 0x5D);
        assert_eq!(&output[6..], &compressed[..]);
    }

    /// Modes without props produce a 5-byte header followed by the payload.
    #[test]
    fn test_write_compressed_chunk_no_props() {
        let compressed = vec![0xAA, 0xBB, 0xCC];
        let uncompressed_size = 50;
        let mut output: Vec<u8> = Vec::new();
        write_compressed_chunk(
            &mut output,
            &compressed,
            uncompressed_size,
            ChunkResetMode::StateReset,
            None,
        )
        .unwrap();

        assert_eq!(output[0], 0x80);
        assert_eq!(output.len(), 5 + 3);
        assert_eq!(&output[5..], &compressed[..]);
    }

    /// An empty payload or a zero uncompressed size writes nothing.
    #[test]
    fn test_write_compressed_chunk_empty() {
        let mut output: Vec<u8> = Vec::new();

        write_compressed_chunk(&mut output, &[], 100, ChunkResetMode::AllReset, None).unwrap();
        assert!(output.is_empty());

        write_compressed_chunk(
            &mut output,
            &[0x01, 0x02],
            0,
            ChunkResetMode::AllReset,
            None,
        )
        .unwrap();
        assert!(output.is_empty());
    }
}