use super::sequences::{analyze_for_rle, encode_sequences_fse_with_encoded};
use super::Match;
use crate::block::Sequence;
use crate::huffman::HuffmanEncoder;
use haagenti_core::Result;
#[cfg(feature = "parallel")]
use rayon::prelude::*;
#[derive(Debug)]
pub struct BlockEncoder {
literals: Vec<u8>,
sequences: Vec<Sequence>,
}
impl BlockEncoder {
pub fn new() -> Self {
Self {
literals: Vec::new(),
sequences: Vec::new(),
}
}
pub fn add_literal(&mut self, byte: u8) {
self.literals.push(byte);
}
pub fn add_literals(&mut self, bytes: &[u8]) {
self.literals.extend_from_slice(bytes);
}
pub fn add_match(&mut self, literal_length: u32, offset: u32, match_length: u32) {
self.sequences
.push(Sequence::new(literal_length, offset, match_length));
}
pub fn literals(&self) -> &[u8] {
&self.literals
}
pub fn sequences(&self) -> &[Sequence] {
&self.sequences
}
pub fn clear(&mut self) {
self.literals.clear();
self.sequences.clear();
}
}
impl Default for BlockEncoder {
fn default() -> Self {
Self::new()
}
}
const MAX_MATCH_LENGTH_PER_SEQUENCE: usize = 131074;
const MIN_MATCH_LENGTH: usize = 3;
struct RepeatOffsetsEncoder {
offsets: [u32; 3],
}
impl RepeatOffsetsEncoder {
fn new() -> Self {
Self { offsets: [1, 4, 8] }
}
fn encode(&mut self, actual_offset: u32, literal_length: u32) -> u32 {
if literal_length > 0 {
if actual_offset == self.offsets[0] {
return 1;
} else if actual_offset == self.offsets[1] {
self.offsets.swap(1, 0);
return 2;
} else if actual_offset == self.offsets[2] {
let temp = self.offsets[2];
self.offsets[2] = self.offsets[1];
self.offsets[1] = self.offsets[0];
self.offsets[0] = temp;
return 3;
}
} else {
if actual_offset == self.offsets[1] {
self.offsets.swap(0, 1);
return 1;
} else if actual_offset == self.offsets[2] {
let temp = self.offsets[2];
self.offsets[2] = self.offsets[1];
self.offsets[1] = self.offsets[0];
self.offsets[0] = temp;
return 2;
} else if actual_offset == self.offsets[0].saturating_sub(1).max(1) {
let new_offset = self.offsets[0].saturating_sub(1).max(1);
self.offsets[2] = self.offsets[1];
self.offsets[1] = self.offsets[0];
self.offsets[0] = new_offset;
return 3;
}
}
self.offsets[2] = self.offsets[1];
self.offsets[1] = self.offsets[0];
self.offsets[0] = actual_offset;
actual_offset + 3
}
}
pub fn matches_to_sequences(input: &[u8], matches: &[Match]) -> (Vec<u8>, Vec<Sequence>) {
let mut literals = Vec::with_capacity(input.len() / 2);
let mut sequences = Vec::with_capacity(matches.len());
let mut repeat_offsets = RepeatOffsetsEncoder::new();
let mut pos = 0;
for m in matches {
let literal_length = m.position - pos;
if literal_length > 0 {
literals.extend_from_slice(&input[pos..m.position]);
}
let actual_offset = m.offset as u32;
let chunk_size = MAX_MATCH_LENGTH_PER_SEQUENCE;
let mut remaining_match = m.length;
let mut first_sequence = true;
while remaining_match > 0 {
let match_len = remaining_match.min(chunk_size);
let after_this = remaining_match - match_len;
let match_len = if after_this > 0 && after_this < MIN_MATCH_LENGTH {
remaining_match - MIN_MATCH_LENGTH
} else {
match_len
};
let ll = if first_sequence {
literal_length as u32
} else {
0
};
first_sequence = false;
let offset_value = repeat_offsets.encode(actual_offset, ll);
sequences.push(Sequence::new(ll, offset_value, match_len as u32));
remaining_match -= match_len;
}
pos = m.position + m.length;
}
if pos < input.len() {
literals.extend_from_slice(&input[pos..]);
}
(literals, sequences)
}
pub fn encode_literals(literals: &[u8], output: &mut Vec<u8>) -> Result<()> {
let size = literals.len();
if size == 0 {
output.push(0x00);
return Ok(());
}
if literals.iter().all(|&b| b == literals[0]) {
return encode_rle_literals(literals[0], size, output);
}
if size >= 64 {
if let Some(encoder) = HuffmanEncoder::build(literals) {
let estimated = encoder.estimate_size(literals);
if estimated + 20 < size {
return encode_huffman_literals(literals, &encoder, output);
}
}
}
encode_raw_literals(literals, output)
}
pub fn encode_literals_with_encoder(
literals: &[u8],
encoder: &HuffmanEncoder,
output: &mut Vec<u8>,
) -> Result<()> {
let size = literals.len();
if size == 0 {
output.push(0x00);
return Ok(());
}
if literals.iter().all(|&b| b == literals[0]) {
return encode_rle_literals(literals[0], size, output);
}
if size >= 64 {
let estimated = encoder.estimate_size(literals);
if estimated + 20 < size {
return encode_huffman_literals(literals, encoder, output);
}
}
encode_raw_literals(literals, output)
}
fn encode_huffman_literals(
literals: &[u8],
encoder: &HuffmanEncoder,
output: &mut Vec<u8>,
) -> Result<()> {
let regenerated_size = literals.len();
let weights = encoder.serialize_weights();
if weights.is_empty() {
return encode_raw_literals(literals, output);
}
if regenerated_size <= 1023 {
let compressed = encoder.encode(literals);
let compressed_size = weights.len() + compressed.len();
if compressed_size <= 1023 {
let byte0 = (((regenerated_size & 0x0F) << 4) | 0x0E) as u8;
let byte1 = (((compressed_size & 0x03) << 6) | ((regenerated_size >> 4) & 0x3F)) as u8;
let byte2 = ((compressed_size >> 2) & 0xFF) as u8;
output.push(byte0);
output.push(byte1);
output.push(byte2);
output.extend_from_slice(&weights);
output.extend_from_slice(&compressed);
return Ok(());
}
}
if regenerated_size <= 262143 {
return encode_huffman_4stream(literals, encoder, &weights, output);
}
encode_raw_literals(literals, output)
}
fn encode_huffman_4stream(
literals: &[u8],
encoder: &HuffmanEncoder,
weights: &[u8],
output: &mut Vec<u8>,
) -> Result<()> {
let regenerated_size = literals.len();
let segment_size = regenerated_size.div_ceil(4);
let mut segments: [&[u8]; 4] = [&[], &[], &[], &[]];
for (i, segment) in segments.iter_mut().enumerate() {
let start = i * segment_size;
if start >= regenerated_size {
*segment = &[];
} else {
let end = ((i + 1) * segment_size).min(regenerated_size);
*segment = &literals[start..end];
}
}
#[cfg(feature = "parallel")]
let (compressed_0, compressed_1, compressed_2, compressed_3) = {
let compressed: Vec<Vec<u8>> = segments.par_iter().map(|seg| encoder.encode(seg)).collect();
let mut iter = compressed.into_iter();
(
iter.next().unwrap_or_default(),
iter.next().unwrap_or_default(),
iter.next().unwrap_or_default(),
iter.next().unwrap_or_default(),
)
};
#[cfg(not(feature = "parallel"))]
let (compressed_0, compressed_1, compressed_2, compressed_3) = (
encoder.encode(segments[0]),
encoder.encode(segments[1]),
encoder.encode(segments[2]),
encoder.encode(segments[3]),
);
let stream_sizes = [
compressed_0.len(),
compressed_1.len(),
compressed_2.len(),
compressed_3.len(),
];
let total_compressed = weights.len() + 6 + stream_sizes.iter().sum::<usize>();
if stream_sizes.iter().any(|&s| s > 65535) {
return encode_raw_literals(literals, output);
}
if regenerated_size <= 1023 && total_compressed <= 1023 {
let byte0 = (((regenerated_size & 0x0F) << 4) | 0x02) as u8;
let byte1 = (((total_compressed & 0x03) << 6) | ((regenerated_size >> 4) & 0x3F)) as u8;
let byte2 = ((total_compressed >> 2) & 0xFF) as u8;
output.push(byte0);
output.push(byte1);
output.push(byte2);
} else if regenerated_size <= 65535 && total_compressed <= 16383 {
let byte0 = (((regenerated_size & 0x0F) << 4) | 0x0A) as u8;
let byte1 = ((regenerated_size >> 4) & 0xFF) as u8;
let byte2 = (((total_compressed & 0x0F) << 4) | ((regenerated_size >> 12) & 0x0F)) as u8;
let byte3 = ((total_compressed >> 4) & 0xFF) as u8;
let byte4 = ((total_compressed >> 12) & 0x03) as u8;
output.push(byte0);
output.push(byte1);
output.push(byte2);
output.push(byte3);
output.push(byte4);
} else {
return encode_raw_literals(literals, output);
}
output.extend_from_slice(weights);
let jump1 = stream_sizes[0];
let jump2 = jump1 + stream_sizes[1];
let jump3 = jump2 + stream_sizes[2];
output.extend_from_slice(&(jump1 as u16).to_le_bytes());
output.extend_from_slice(&(jump2 as u16).to_le_bytes());
output.extend_from_slice(&(jump3 as u16).to_le_bytes());
output.extend_from_slice(&compressed_0);
output.extend_from_slice(&compressed_1);
output.extend_from_slice(&compressed_2);
output.extend_from_slice(&compressed_3);
Ok(())
}
fn encode_raw_literals(literals: &[u8], output: &mut Vec<u8>) -> Result<()> {
let size = literals.len();
if size == 0 {
output.push(0x00);
} else if size <= 31 {
let byte0 = (size << 3) as u8;
output.push(byte0);
} else if size <= 4095 {
let byte0 = ((size & 0x0F) << 4) | 0x04;
let byte1 = (size >> 4) & 0xFF;
output.push(byte0 as u8);
output.push(byte1 as u8);
} else {
let byte0 = ((size & 0x0F) << 4) | 0x0C;
let byte1 = (size >> 4) & 0xFF;
let byte2 = (size >> 12) & 0xFF;
output.push(byte0 as u8);
output.push(byte1 as u8);
output.push(byte2 as u8);
}
output.extend_from_slice(literals);
Ok(())
}
fn encode_rle_literals(byte: u8, count: usize, output: &mut Vec<u8>) -> Result<()> {
if count == 0 {
output.push(0x01); } else if count <= 31 {
let byte0 = ((count << 3) | 1) as u8;
output.push(byte0);
} else if count <= 4095 {
let byte0 = ((count & 0x0F) << 4) | 0x05;
let byte1 = (count >> 4) & 0xFF;
output.push(byte0 as u8);
output.push(byte1 as u8);
} else {
let byte0 = ((count & 0x0F) << 4) | 0x0D;
let byte1 = (count >> 4) & 0xFF;
let byte2 = (count >> 12) & 0xFF;
output.push(byte0 as u8);
output.push(byte1 as u8);
output.push(byte2 as u8);
}
output.push(byte);
Ok(())
}
pub fn encode_sequences(sequences: &[Sequence], output: &mut Vec<u8>) -> Result<()> {
if sequences.is_empty() {
output.push(0);
return Ok(());
}
let suitability = analyze_for_rle(sequences);
encode_sequences_fse_with_encoded(&suitability.encoded, output)
}
pub fn encode_block(input: &[u8], matches: &[Match], output: &mut Vec<u8>) -> Result<()> {
let (literals, sequences) = matches_to_sequences(input, matches);
encode_literals(&literals, output)?;
encode_sequences(&sequences, output)?;
Ok(())
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_block_encoder_creation() {
let encoder = BlockEncoder::new();
assert!(encoder.literals().is_empty());
assert!(encoder.sequences().is_empty());
}
#[test]
fn test_add_literals() {
let mut encoder = BlockEncoder::new();
encoder.add_literal(b'A');
encoder.add_literals(b"BC");
assert_eq!(encoder.literals(), b"ABC");
}
#[test]
fn test_add_match() {
let mut encoder = BlockEncoder::new();
encoder.add_match(5, 10, 4);
assert_eq!(encoder.sequences().len(), 1);
let seq = &encoder.sequences()[0];
assert_eq!(seq.literal_length, 5);
assert_eq!(seq.offset, 10);
assert_eq!(seq.match_length, 4);
}
#[test]
fn test_matches_to_sequences_empty() {
let input = b"hello";
let matches = vec![];
let (literals, sequences) = matches_to_sequences(input, &matches);
assert_eq!(literals, b"hello");
assert!(sequences.is_empty());
}
#[test]
fn test_matches_to_sequences_with_match() {
let input = b"abcdabcd";
let matches = vec![Match::new(4, 4, 4)];
let (literals, sequences) = matches_to_sequences(input, &matches);
assert_eq!(literals, b"abcd"); assert_eq!(sequences.len(), 1);
assert_eq!(sequences[0].literal_length, 4);
assert_eq!(sequences[0].offset, 2);
assert_eq!(sequences[0].match_length, 4);
}
#[test]
fn test_matches_to_sequences_new_offset() {
let input = b"abcdefXXXabcdef";
let matches = vec![Match::new(9, 9, 6)];
let (literals, sequences) = matches_to_sequences(input, &matches);
assert_eq!(literals, b"abcdefXXX"); assert_eq!(sequences.len(), 1);
assert_eq!(sequences[0].literal_length, 9);
assert_eq!(sequences[0].offset, 12);
assert_eq!(sequences[0].match_length, 6);
}
#[test]
fn test_repeat_offset_encoder() {
let mut encoder = RepeatOffsetsEncoder::new();
assert_eq!(encoder.encode(4, 5), 2);
assert_eq!(encoder.encode(4, 5), 1);
assert_eq!(encoder.encode(1, 5), 2);
assert_eq!(encoder.encode(100, 5), 103);
}
#[test]
fn test_encode_raw_literals_small() {
let mut output = Vec::new();
encode_raw_literals(b"Hello", &mut output).unwrap();
assert_eq!(output[0], 0x28);
assert_eq!(&output[1..], b"Hello");
}
#[test]
fn test_encode_raw_literals_medium() {
let data: Vec<u8> = (0..100).collect();
let mut output = Vec::new();
encode_raw_literals(&data, &mut output).unwrap();
assert_eq!(output.len(), 2 + 100);
}
#[test]
fn test_encode_rle_literals() {
let mut output = Vec::new();
encode_rle_literals(b'X', 10, &mut output).unwrap();
assert_eq!(output[0], 0x51);
assert_eq!(output[1], b'X');
}
#[test]
fn test_encode_sequences_empty() {
let mut output = Vec::new();
encode_sequences(&[], &mut output).unwrap();
assert_eq!(output, vec![0]);
}
#[test]
fn test_encode_sequences_count_small() {
let sequences = vec![Sequence::new(0, 4, 3)];
let mut output = Vec::new();
encode_sequences(&sequences, &mut output).unwrap();
assert_eq!(output[0], 1);
}
#[test]
fn test_encode_literals_detects_rle() {
let rle_data = vec![b'A'; 50];
let mut output = Vec::new();
encode_literals(&rle_data, &mut output).unwrap();
assert!(output.len() < 50);
}
#[test]
fn test_encode_block() {
let input = b"abcdabcd";
let matches = vec![Match::new(4, 4, 4)];
let mut output = Vec::new();
encode_block(input, &matches, &mut output).unwrap();
assert!(!output.is_empty());
}
}