const MAX_HASH_TABLE_SIZE: usize = 1 << 14;
const MIN_MATCH_LEN: usize = 4;
const MAX_BLOCK_SIZE: usize = 1 << 24;
pub fn max_compress_len(input_len: usize) -> usize {
let varint_len = varint_encoded_len(input_len);
varint_len + 32 + input_len + input_len / 6
}
pub fn compress(input: &[u8]) -> Vec<u8> {
if input.is_empty() {
return vec![0];
}
let mut output = Vec::with_capacity(max_compress_len(input.len()));
encode_varint(input.len(), &mut output);
if input.len() < MIN_MATCH_LEN {
emit_literal(input, &mut output);
return output;
}
let mut src_pos = 0;
while src_pos < input.len() {
let block_end = (src_pos + MAX_BLOCK_SIZE).min(input.len());
let block = &input[src_pos..block_end];
compress_block(block, src_pos, &mut output);
src_pos = block_end;
}
output
}
fn compress_block(input: &[u8], _base_offset: usize, output: &mut Vec<u8>) {
let input_len = input.len();
if input_len < MIN_MATCH_LEN {
emit_literal(input, output);
return;
}
let hash_table_size = hash_table_size(input_len);
let hash_shift = 32u32.saturating_sub(log2_floor(hash_table_size as u32));
let mut hash_table = vec![0u32; hash_table_size];
let mut literal_start = 0;
let input_limit = if input_len > MIN_MATCH_LEN + 1 {
input_len - MIN_MATCH_LEN - 1
} else {
emit_literal(input, output);
return;
};
let mut next_hash = hash4(&input[1..], hash_shift);
let mut pos = 1;
'outer: loop {
let mut candidate;
let mut skip = 32;
let mut next_pos;
loop {
let hash = next_hash;
let bytes_between = skip >> 5;
skip += 1;
next_pos = pos + bytes_between;
if next_pos > input_limit {
break 'outer;
}
candidate = hash_table[hash as usize] as usize;
hash_table[hash as usize] = pos as u32;
next_hash = hash4(&input[next_pos..], hash_shift);
if candidate < pos
&& pos.wrapping_sub(candidate) <= 65535
&& input[candidate..candidate + 4] == input[pos..pos + 4]
{
break;
}
pos = next_pos;
}
if literal_start < pos {
emit_literal(&input[literal_start..pos], output);
}
let match_offset = pos - candidate;
let match_len = find_match_length(&input[candidate + 4..], &input[pos + 4..]) + 4;
emit_copy(match_offset, match_len, output);
pos += match_len;
literal_start = pos;
if pos >= input_limit {
break;
}
let insert_end = (pos - 1).min(pos.saturating_sub(1));
if pos >= 2 {
let h = hash4(&input[pos - 2..], hash_shift);
hash_table[h as usize] = (pos - 2) as u32;
let h = hash4(&input[pos - 1..], hash_shift);
hash_table[h as usize] = (pos - 1) as u32;
}
let _ = insert_end;
literal_start = pos;
next_hash = hash4(&input[pos + 1..], hash_shift);
pos += 1;
}
if literal_start < input_len {
emit_literal(&input[literal_start..], output);
}
}
fn find_match_length(s1: &[u8], s2: &[u8]) -> usize {
let limit = s1.len().min(s2.len());
let mut i = 0;
while i + 8 <= limit {
let v1 = u64::from_le_bytes([
s1[i],
s1[i + 1],
s1[i + 2],
s1[i + 3],
s1[i + 4],
s1[i + 5],
s1[i + 6],
s1[i + 7],
]);
let v2 = u64::from_le_bytes([
s2[i],
s2[i + 1],
s2[i + 2],
s2[i + 3],
s2[i + 4],
s2[i + 5],
s2[i + 6],
s2[i + 7],
]);
if v1 != v2 {
let xor = v1 ^ v2;
return i + (xor.trailing_zeros() as usize / 8);
}
i += 8;
}
while i < limit && s1[i] == s2[i] {
i += 1;
}
i
}
fn emit_literal(literal: &[u8], output: &mut Vec<u8>) {
let n = literal.len();
if n == 0 {
return;
}
let n_minus_1 = n - 1;
if n_minus_1 < 60 {
output.push((n_minus_1 as u8) << 2);
} else if n_minus_1 < 256 {
output.push(60 << 2);
output.push(n_minus_1 as u8);
} else if n_minus_1 < 65536 {
output.push(61 << 2);
output.push(n_minus_1 as u8);
output.push((n_minus_1 >> 8) as u8);
} else if n_minus_1 < (1 << 24) {
output.push(62 << 2);
output.push(n_minus_1 as u8);
output.push((n_minus_1 >> 8) as u8);
output.push((n_minus_1 >> 16) as u8);
} else {
output.push(63 << 2);
output.push(n_minus_1 as u8);
output.push((n_minus_1 >> 8) as u8);
output.push((n_minus_1 >> 16) as u8);
output.push((n_minus_1 >> 24) as u8);
}
output.extend_from_slice(literal);
}
fn emit_copy(offset: usize, mut length: usize, output: &mut Vec<u8>) {
while length > 0 {
if (4..=11).contains(&length) && offset <= 2047 {
let len_field = (length - 4) as u8;
let offset_hi = ((offset >> 8) & 0x07) as u8;
let offset_lo = (offset & 0xFF) as u8;
output.push((offset_hi << 5) | (len_field << 2) | 0x01);
output.push(offset_lo);
return;
} else if offset <= 65535 {
let emit_len = length.min(64);
let len_field = (emit_len - 1) as u8;
output.push((len_field << 2) | 0x02);
output.push((offset & 0xFF) as u8);
output.push(((offset >> 8) & 0xFF) as u8);
length -= emit_len;
} else {
let emit_len = length.min(64);
let len_field = (emit_len - 1) as u8;
output.push((len_field << 2) | 0x03);
output.push((offset & 0xFF) as u8);
output.push(((offset >> 8) & 0xFF) as u8);
output.push(((offset >> 16) & 0xFF) as u8);
output.push(((offset >> 24) & 0xFF) as u8);
length -= emit_len;
}
}
}
fn hash4(data: &[u8], shift: u32) -> u32 {
debug_assert!(data.len() >= 4);
let v = u32::from_le_bytes([data[0], data[1], data[2], data[3]]);
v.wrapping_mul(0x1E35A7BD) >> shift
}
fn hash_table_size(input_len: usize) -> usize {
let mut size = 256;
while size < MAX_HASH_TABLE_SIZE && size < input_len {
size *= 2;
}
size
}
fn log2_floor(n: u32) -> u32 {
if n == 0 {
return 0;
}
31 - n.leading_zeros()
}
fn encode_varint(mut value: usize, output: &mut Vec<u8>) {
loop {
if value < 128 {
output.push(value as u8);
return;
}
output.push((value as u8 & 0x7F) | 0x80);
value >>= 7;
}
}
fn varint_encoded_len(value: usize) -> usize {
let mut n = value;
let mut len = 1;
while n >= 128 {
n >>= 7;
len += 1;
}
len
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_varint_encoding() {
let mut buf = Vec::new();
encode_varint(0, &mut buf);
assert_eq!(buf, [0]);
buf.clear();
encode_varint(127, &mut buf);
assert_eq!(buf, [127]);
buf.clear();
encode_varint(128, &mut buf);
assert_eq!(buf, [0x80, 0x01]);
buf.clear();
encode_varint(300, &mut buf);
assert_eq!(buf, [0xAC, 0x02]);
}
#[test]
fn test_max_compress_len() {
assert!(max_compress_len(0) > 0);
assert!(max_compress_len(100) > 100);
assert!(max_compress_len(1_000_000) > 1_000_000);
}
#[test]
fn test_compress_empty() {
let result = compress(b"");
assert_eq!(result, vec![0]);
}
#[test]
fn test_compress_small() {
let input = b"abc";
let compressed = compress(input);
assert_eq!(compressed[0], 3);
assert!(compressed.len() > 1);
}
#[test]
fn test_compress_repeated() {
let input = vec![b'A'; 1000];
let compressed = compress(&input);
assert!(compressed.len() < input.len());
}
#[test]
fn test_hash4_deterministic() {
let data = [0x01, 0x02, 0x03, 0x04, 0x05];
let h1 = hash4(&data, 18);
let h2 = hash4(&data, 18);
assert_eq!(h1, h2);
}
#[test]
fn test_find_match_length() {
let s1 = b"abcdefgh";
let s2 = b"abcdefgh";
assert_eq!(find_match_length(s1, s2), 8);
let s2 = b"abcdXfgh";
assert_eq!(find_match_length(s1, s2), 4);
let s2 = b"Xbcdefgh";
assert_eq!(find_match_length(s1, s2), 0);
}
#[test]
fn test_emit_literal_small() {
let mut out = Vec::new();
emit_literal(b"Hello", &mut out);
assert_eq!(out[0], 16);
assert_eq!(&out[1..], b"Hello");
}
#[test]
fn test_emit_literal_large() {
let data = vec![0x42; 256];
let mut out = Vec::new();
emit_literal(&data, &mut out);
assert_eq!(out[0], (60 << 2));
assert_eq!(out[1], 255); }
#[test]
fn test_emit_copy_short() {
let mut out = Vec::new();
emit_copy(10, 4, &mut out);
assert_eq!(out.len(), 2);
assert_eq!(out[0] & 0x03, 0x01); }
#[test]
fn test_emit_copy_medium() {
let mut out = Vec::new();
emit_copy(3000, 20, &mut out);
assert_eq!(out.len(), 3);
assert_eq!(out[0] & 0x03, 0x02); }
#[test]
fn test_log2_floor() {
assert_eq!(log2_floor(1), 0);
assert_eq!(log2_floor(2), 1);
assert_eq!(log2_floor(3), 1);
assert_eq!(log2_floor(4), 2);
assert_eq!(log2_floor(16384), 14);
}
}