use alloc::vec::Vec;
use crate::error::Error;
const MIN_MATCH: usize = 4;
const MAX_DISTANCE: usize = 65_535;
const LAST_LITERALS: usize = 5;
const MFLIMIT: usize = 12;
const HASH_LOG: u32 = 12;
const HASH_TABLE_SIZE: usize = 1 << HASH_LOG;
const HC_HASH_LOG: u32 = 15;
const HC_HASH_TABLE_SIZE: usize = 1 << HC_HASH_LOG;
const HASH_EMPTY: u32 = u32::MAX;
const HC_LEVEL_THRESHOLD: u8 = 3;
const OPT_LEVEL_THRESHOLD: u8 = 10;
#[inline]
fn hash4(bytes: [u8; 4]) -> usize {
let v = u32::from_le_bytes(bytes);
((v.wrapping_mul(2_654_435_761)) >> (32 - HASH_LOG)) as usize
}
#[inline]
fn hc_hash4(bytes: [u8; 4]) -> usize {
let v = u32::from_le_bytes(bytes);
((v.wrapping_mul(2_654_435_761)) >> (32 - HC_HASH_LOG)) as usize
}
pub fn compress_bound(input_len: usize) -> usize {
input_len + (input_len / 255) + 16
}
pub fn encode_block(input: &[u8], out: &mut Vec<u8>) {
out.clear();
if input.is_empty() {
return;
}
if input.len() < MFLIMIT + 1 {
emit_last_literals(input, out);
return;
}
let mut table = [HASH_EMPTY; HASH_TABLE_SIZE];
let mut ip: usize = 0; let mut anchor: usize = 0;
let match_limit = input.len() - MFLIMIT;
let mut next_ip = ip;
while next_ip <= match_limit {
ip = next_ip;
let mut step = 1usize;
let mut search_match_nb = 1u32 << 6;
let mut match_pos;
loop {
if ip > match_limit {
emit_last_literals(&input[anchor..], out);
return;
}
let h = hash4([input[ip], input[ip + 1], input[ip + 2], input[ip + 3]]);
let candidate = table[h];
table[h] = ip as u32;
if candidate != HASH_EMPTY {
let cand = candidate as usize;
if ip - cand <= MAX_DISTANCE
&& input[cand] == input[ip]
&& input[cand + 1] == input[ip + 1]
&& input[cand + 2] == input[ip + 2]
&& input[cand + 3] == input[ip + 3]
{
match_pos = cand;
break;
}
}
next_ip = ip + step;
step = (search_match_nb >> 6) as usize;
search_match_nb += 1;
ip = next_ip;
}
while ip > anchor && match_pos > 0 && input[ip - 1] == input[match_pos - 1] {
ip -= 1;
match_pos -= 1;
}
let forward_limit = input.len() - LAST_LITERALS;
let mut match_len = MIN_MATCH;
while ip + match_len < forward_limit
&& input[match_pos + match_len] == input[ip + match_len]
{
match_len += 1;
}
let literal_len = ip - anchor;
let offset = (ip - match_pos) as u16;
let match_excess = match_len - MIN_MATCH;
emit_sequence(&input[anchor..ip], literal_len, offset, match_excess, out);
ip += match_len;
anchor = ip;
if ip >= 2 {
let seed = ip - 2;
if seed + MIN_MATCH <= input.len() {
let h = hash4([
input[seed],
input[seed + 1],
input[seed + 2],
input[seed + 3],
]);
table[h] = seed as u32;
}
}
next_ip = ip;
}
emit_last_literals(&input[anchor..], out);
}
pub fn encode_block_level(input: &[u8], out: &mut Vec<u8>, level: u8) {
encode_block_level_dict(&[], input, out, level);
}
pub fn encode_block_level_dict(dict: &[u8], input: &[u8], out: &mut Vec<u8>, level: u8) {
if dict.is_empty() {
if level < HC_LEVEL_THRESHOLD {
encode_block(input, out);
} else if level < OPT_LEVEL_THRESHOLD {
encode_block_hc_dict(&[], input, out, level);
} else {
encode_block_optimal_dict(&[], input, out, level);
}
return;
}
if level < OPT_LEVEL_THRESHOLD {
encode_block_hc_dict(dict, input, out, level);
} else {
encode_block_optimal_dict(dict, input, out, level);
}
}
fn nb_attempts_for_level(level: u8) -> u32 {
match level {
0..=3 => 8,
4 => 16,
5 => 32,
6 => 64,
7 => 128,
8 => 256,
9 => 512,
10 => 1024,
11 => 2048,
_ => 4096,
}
}
#[inline]
fn hc_insert(input: &[u8], p: usize, head: &mut [u32], chain: &mut [u32]) {
let h = hc_hash4([input[p], input[p + 1], input[p + 2], input[p + 3]]);
chain[p] = head[h];
head[h] = p as u32;
}
fn hc_longest_match(
input: &[u8],
pos: usize,
head: &[u32],
chain: &[u32],
nb_attempts: u32,
forward_limit: usize,
) -> Option<(usize, usize)> {
let h = hc_hash4([input[pos], input[pos + 1], input[pos + 2], input[pos + 3]]);
let mut cand = head[h];
let min_pos = pos.saturating_sub(MAX_DISTANCE);
let mut best_len = MIN_MATCH - 1;
let mut best_pos = 0usize;
let mut attempts = nb_attempts;
while cand != HASH_EMPTY && attempts > 0 {
let c = cand as usize;
if c >= pos {
cand = chain[c];
attempts -= 1;
continue;
}
if c < min_pos {
break; }
if pos + best_len < forward_limit
&& input[c + best_len] == input[pos + best_len]
&& input[c] == input[pos]
{
let mut l = 0usize;
while pos + l < forward_limit && input[c + l] == input[pos + l] {
l += 1;
}
if l > best_len {
best_len = l;
best_pos = c;
if pos + best_len >= forward_limit {
break; }
}
}
cand = chain[c];
attempts -= 1;
}
if best_len < MIN_MATCH {
None
} else {
Some((best_pos, best_len))
}
}
#[inline]
fn hc_resolve(
input: &[u8],
pos: usize,
found: (usize, usize),
anchor: usize,
) -> (usize, usize, usize) {
let (mut mpos, mut mlen) = found;
let mut spos = pos;
while spos > anchor && mpos > 0 && input[spos - 1] == input[mpos - 1] {
spos -= 1;
mpos -= 1;
mlen += 1;
}
(spos, mpos, mlen)
}
fn encode_block_hc_dict(dict: &[u8], input: &[u8], out: &mut Vec<u8>, level: u8) {
out.clear();
if input.is_empty() {
return;
}
if input.len() < MFLIMIT + 1 {
emit_last_literals(input, out);
return;
}
let dict_len = dict.len();
let buf: Vec<u8>;
let input: &[u8] = if dict_len == 0 {
input
} else {
let mut v = Vec::with_capacity(dict_len + input.len());
v.extend_from_slice(dict);
v.extend_from_slice(input);
buf = v;
&buf
};
let n = input.len();
let nb_attempts = nb_attempts_for_level(level);
let mut head = alloc::vec![HASH_EMPTY; HC_HASH_TABLE_SIZE];
let mut chain = alloc::vec![HASH_EMPTY; n];
let match_limit = n - MFLIMIT; let hash_limit = n - MIN_MATCH - LAST_LITERALS; let forward_limit = n - LAST_LITERALS;
let mut inserted_through: usize = 0;
let mut anchor: usize = dict_len;
let mut ip: usize = dict_len;
macro_rules! insert_up_to {
($up_to:expr) => {{
let up_to = $up_to;
while inserted_through < up_to && inserted_through <= hash_limit {
hc_insert(input, inserted_through, &mut head, &mut chain);
inserted_through += 1;
}
}};
}
insert_up_to!(dict_len);
while ip <= match_limit && ip <= hash_limit {
insert_up_to!(ip + 1);
let found = hc_longest_match(input, ip, &head, &chain, nb_attempts, forward_limit);
let (mut cur_start, mut cur_mpos, mut cur_len) = match found {
None => {
ip += 1;
continue;
}
Some(f) => hc_resolve(input, ip, f, anchor),
};
loop {
let next = cur_start + 1;
if next > match_limit || next > hash_limit {
break;
}
insert_up_to!(next + 1);
if let Some(f) =
hc_longest_match(input, next, &head, &chain, nb_attempts, forward_limit)
{
let (ns, nmp, nl) = hc_resolve(input, next, f, anchor);
if nl > cur_len {
cur_start = ns;
cur_mpos = nmp;
cur_len = nl;
continue;
}
}
break;
}
let literal_len = cur_start - anchor;
let offset = (cur_start - cur_mpos) as u16;
let match_excess = cur_len - MIN_MATCH;
emit_sequence(
&input[anchor..cur_start],
literal_len,
offset,
match_excess,
out,
);
let match_end = cur_start + cur_len;
insert_up_to!(match_end);
anchor = match_end;
ip = match_end;
}
emit_last_literals(&input[anchor..], out);
}
#[inline]
fn literals_price(litlen: usize) -> usize {
let mut price = litlen;
if litlen >= 15 {
price += 1 + (litlen - 15) / 255;
}
price
}
#[inline]
fn marginal_literal_price(run: usize) -> usize {
1 + (literals_price(run + 1) - literals_price(run) - 1)
}
#[inline]
fn sequence_overhead(mlen: usize) -> usize {
let mut price = 1 + 2; if mlen >= ML_MASK_PLUS_MIN {
price += 1 + (mlen - ML_MASK_PLUS_MIN) / 255;
}
price
}
const ML_MASK_PLUS_MIN: usize = 15 + MIN_MATCH;
const OPT_SUFFICIENT_LEN: usize = 64;
#[derive(Clone, Copy)]
struct OptStep {
litlen: usize,
match_pos: usize,
match_len: usize,
}
fn encode_block_optimal_dict(dict: &[u8], input: &[u8], out: &mut Vec<u8>, level: u8) {
out.clear();
if input.is_empty() {
return;
}
if input.len() < MFLIMIT + 1 {
emit_last_literals(input, out);
return;
}
let dict_len = dict.len();
let buf: Vec<u8>;
let input: &[u8] = if dict_len == 0 {
input
} else {
let mut v = Vec::with_capacity(dict_len + input.len());
v.extend_from_slice(dict);
v.extend_from_slice(input);
buf = v;
&buf
};
let n = input.len();
let nb_attempts = nb_attempts_for_level(level);
let mut head = alloc::vec![HASH_EMPTY; HC_HASH_TABLE_SIZE];
let mut chain = alloc::vec![HASH_EMPTY; n];
let match_limit = n - MFLIMIT; let hash_limit = n - MIN_MATCH - LAST_LITERALS; let forward_limit = n - LAST_LITERALS;
let mut price = alloc::vec![usize::MAX; n + 1];
let mut run = alloc::vec![0usize; n + 1];
let mut step = alloc::vec![
OptStep {
litlen: 0,
match_pos: usize::MAX,
match_len: 0,
};
n + 1
];
price[dict_len] = 0;
let mut inserted_through = 0usize;
macro_rules! insert_up_to {
($up_to:expr) => {{
let up_to = $up_to;
while inserted_through < up_to && inserted_through <= hash_limit {
hc_insert(input, inserted_through, &mut head, &mut chain);
inserted_through += 1;
}
}};
}
insert_up_to!(dict_len);
let mut i = dict_len;
while i < n {
if price[i] == usize::MAX {
i += 1;
continue; }
let cur_price = price[i];
let cur_run = run[i];
{
let lit_cost = cur_price + marginal_literal_price(cur_run);
if lit_cost < price[i + 1] {
price[i + 1] = lit_cost;
run[i + 1] = cur_run + 1;
step[i + 1] = OptStep {
litlen: cur_run + 1,
match_pos: usize::MAX,
match_len: 0,
};
}
}
if i > match_limit || i > hash_limit {
i += 1;
continue;
}
insert_up_to!(i + 1);
let found = hc_longest_match(input, i, &head, &chain, nb_attempts, forward_limit);
let (best_pos, best_len) = match found {
Some(f) => f,
None => {
i += 1;
continue;
}
};
if best_len >= OPT_SUFFICIENT_LEN {
let end = i + best_len;
let cost = cur_price + sequence_overhead(best_len);
if cost < price[end] {
price[end] = cost;
run[end] = 0;
step[end] = OptStep {
litlen: cur_run,
match_pos: best_pos,
match_len: best_len,
};
}
insert_up_to!(end);
i = end;
continue;
}
for mlen in MIN_MATCH..=best_len {
let end = i + mlen;
if end > n {
break;
}
let cost = cur_price + sequence_overhead(mlen);
if cost < price[end] {
price[end] = cost;
run[end] = 0;
step[end] = OptStep {
litlen: cur_run,
match_pos: best_pos,
match_len: mlen,
};
}
}
i += 1;
}
let mut path: Vec<OptStep> = Vec::new();
let mut pos = n;
while pos > dict_len {
let s = step[pos];
if s.match_pos == usize::MAX {
pos -= 1;
} else {
path.push(s);
pos -= s.match_len;
}
}
path.reverse();
let mut anchor = dict_len;
for s in &path {
let match_start = {
anchor + s.litlen
};
let offset = (match_start - s.match_pos) as u16;
let match_excess = s.match_len - MIN_MATCH;
emit_sequence(
&input[anchor..match_start],
s.litlen,
offset,
match_excess,
out,
);
anchor = match_start + s.match_len;
}
emit_last_literals(&input[anchor..], out);
}
fn emit_sequence(
literals: &[u8],
literal_len: usize,
offset: u16,
match_excess: usize,
out: &mut Vec<u8>,
) {
let lit_high = if literal_len >= 15 {
15u8
} else {
literal_len as u8
};
let match_low = if match_excess >= 15 {
15u8
} else {
match_excess as u8
};
let token = (lit_high << 4) | match_low;
out.push(token);
if literal_len >= 15 {
let mut rem = literal_len - 15;
while rem >= 255 {
out.push(255);
rem -= 255;
}
out.push(rem as u8);
}
out.extend_from_slice(literals);
out.push((offset & 0xFF) as u8);
out.push((offset >> 8) as u8);
if match_excess >= 15 {
let mut rem = match_excess - 15;
while rem >= 255 {
out.push(255);
rem -= 255;
}
out.push(rem as u8);
}
}
fn emit_last_literals(literals: &[u8], out: &mut Vec<u8>) {
let literal_len = literals.len();
let lit_high = if literal_len >= 15 {
15u8
} else {
literal_len as u8
};
out.push(lit_high << 4);
if literal_len >= 15 {
let mut rem = literal_len - 15;
while rem >= 255 {
out.push(255);
rem -= 255;
}
out.push(rem as u8);
}
out.extend_from_slice(literals);
}
pub fn decode_block(input: &[u8], out: &mut Vec<u8>, raw_max: usize) -> Result<(), Error> {
out.clear();
if input.is_empty() {
return Ok(());
}
let mut ip = 0usize;
let n = input.len();
loop {
if ip >= n {
return Err(Error::UnexpectedEnd);
}
let token = input[ip];
ip += 1;
let mut lit_len = (token >> 4) as usize;
if lit_len == 15 {
loop {
if ip >= n {
return Err(Error::UnexpectedEnd);
}
let b = input[ip];
ip += 1;
lit_len = lit_len.checked_add(b as usize).ok_or(Error::Corrupt)?;
if b != 255 {
break;
}
}
}
if lit_len > 0 {
if ip + lit_len > n {
return Err(Error::UnexpectedEnd);
}
if out.len() + lit_len > raw_max {
return Err(Error::Corrupt);
}
out.extend_from_slice(&input[ip..ip + lit_len]);
ip += lit_len;
}
if ip == n {
return Ok(());
}
if ip + 2 > n {
return Err(Error::UnexpectedEnd);
}
let offset = (input[ip] as usize) | ((input[ip + 1] as usize) << 8);
ip += 2;
if offset == 0 {
return Err(Error::InvalidDistance);
}
if offset > out.len() {
return Err(Error::InvalidDistance);
}
let mut match_excess = (token & 0x0F) as usize;
if match_excess == 15 {
loop {
if ip >= n {
return Err(Error::UnexpectedEnd);
}
let b = input[ip];
ip += 1;
match_excess = match_excess.checked_add(b as usize).ok_or(Error::Corrupt)?;
if b != 255 {
break;
}
}
}
let match_len = MIN_MATCH + match_excess;
if out.len() + match_len > raw_max {
return Err(Error::Corrupt);
}
let start = out.len() - offset;
if offset >= match_len {
out.extend_from_within(start..start + match_len);
} else if offset == 1 {
let b = out[start];
out.resize(out.len() + match_len, b);
} else {
let mut remaining = match_len;
while remaining > 0 {
let chunk = remaining.min(offset);
let s = out.len() - offset;
out.extend_from_within(s..s + chunk);
remaining -= chunk;
}
}
}
}
#[cfg(test)]
mod tests {
use super::*;
fn round_trip(data: &[u8]) {
let mut encoded = Vec::new();
encode_block(data, &mut encoded);
let mut decoded = Vec::new();
decode_block(&encoded, &mut decoded, usize::MAX).expect("decode");
assert_eq!(decoded, data);
}
fn round_trip_level(data: &[u8], level: u8) {
let mut encoded = Vec::new();
encode_block_level(data, &mut encoded, level);
let mut decoded = Vec::new();
decode_block(&encoded, &mut decoded, usize::MAX).expect("decode");
assert_eq!(decoded, data, "round-trip mismatch at level {level}");
}
#[test]
fn empty() {
round_trip(&[]);
}
#[test]
fn short() {
round_trip(b"hello");
}
#[test]
fn run() {
let v = alloc::vec![b'a'; 1024];
round_trip(&v);
}
#[test]
fn repeated_text() {
let mut v = Vec::new();
for _ in 0..200 {
v.extend_from_slice(b"the quick brown fox jumps over the lazy dog. ");
}
round_trip(&v);
}
#[test]
fn hc_round_trip_all_levels() {
let mut text = Vec::new();
for _ in 0..200 {
text.extend_from_slice(b"the quick brown fox jumps over the lazy dog. ");
}
let mut prng = Vec::new();
let mut s: u32 = 0x1234_5678;
for _ in 0..8192 {
s = s.wrapping_mul(1_103_515_245).wrapping_add(12345);
prng.push((s >> 16) as u8);
}
for level in 0..=12u8 {
round_trip_level(&text, level);
round_trip_level(b"hello", level);
round_trip_level(&[], level);
round_trip_level(&alloc::vec![b'x'; 5000], level);
round_trip_level(&prng, level);
}
}
#[test]
fn hc_not_worse_than_fast() {
let mut v = Vec::new();
for i in 0..5000u32 {
v.extend_from_slice(&i.to_le_bytes());
v.extend_from_slice(b"common suffix string here ");
}
let mut fast = Vec::new();
encode_block(&v, &mut fast);
let mut hc = Vec::new();
encode_block_level(&v, &mut hc, 9);
assert!(
hc.len() <= fast.len(),
"hc {} should be <= fast {}",
hc.len(),
fast.len()
);
}
fn assert_eob_rules(encoded: &[u8], raw_len: usize) {
if encoded.is_empty() {
assert_eq!(raw_len, 0);
return;
}
let mut i = 0usize;
let mut outpos = 0usize;
let n = encoded.len();
loop {
let token = encoded[i];
i += 1;
let mut lit = (token >> 4) as usize;
if lit == 15 {
loop {
let b = encoded[i];
i += 1;
lit += b as usize;
if b != 255 {
break;
}
}
}
i += lit;
outpos += lit;
if i == n {
if raw_len >= LAST_LITERALS {
assert!(
lit >= LAST_LITERALS,
"final literal run {lit} < {LAST_LITERALS}"
);
}
break;
}
let match_start = outpos;
assert!(
match_start + MFLIMIT <= raw_len,
"match starts at {match_start}, within MFLIMIT of end {raw_len}"
);
i += 2; let mut ml = (token & 0x0F) as usize;
if ml == 15 {
loop {
let b = encoded[i];
i += 1;
ml += b as usize;
if b != 255 {
break;
}
}
}
ml += MIN_MATCH;
outpos += ml;
}
assert_eq!(outpos, raw_len, "decoded length mismatch");
}
#[test]
fn end_of_block_rules_all_levels() {
let mut v = Vec::new();
for _ in 0..400 {
v.extend_from_slice(b"alpha beta gamma delta epsilon ");
}
v.extend_from_slice(b"alpha beta gamma delta epsilon");
for level in 0..=12u8 {
let mut enc = Vec::new();
encode_block_level(&v, &mut enc, level);
assert_eob_rules(&enc, v.len());
let mut dec = Vec::new();
decode_block(&enc, &mut dec, usize::MAX).expect("decode");
assert_eq!(dec, v, "round-trip at level {level}");
}
}
}