use crate::verbatim::Token;
use core::mem::MaybeUninit;
const HASH_BITS: u32 = 15;
const HASH_SIZE: usize = 1 << HASH_BITS;
const HASH_MASK: usize = HASH_SIZE - 1;
const MAX_CHAIN_DEPTH: usize = 16;
pub const MIN_MATCH: usize = 3;
pub const MAX_MATCH: usize = crate::verbatim::MAX_MATCH_LEN;
struct BestMatch {
length: usize,
offset: u32,
}
fn hash4(bytes: &[u8]) -> usize {
let v = u32::from_le_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]);
((v.wrapping_mul(2654435761)) >> (32 - HASH_BITS)) as usize & HASH_MASK
}
fn common_prefix_len(a: &[u8], b: &[u8], max: usize) -> usize {
let max = max.min(a.len()).min(b.len());
let mut len = 0;
while len + 8 <= max {
let aw = usize::from_ne_bytes(a[len..len + 8].try_into().unwrap());
let bw = usize::from_ne_bytes(b[len..len + 8].try_into().unwrap());
let diff = aw ^ bw;
if diff == 0 {
len += 8;
continue;
}
let bit = if cfg!(target_endian = "little") { diff.trailing_zeros() } else { diff.leading_zeros() };
return len + (bit / 8) as usize;
}
while len < max && a[len] == b[len] {
len += 1;
}
len
}
pub struct MatchFinder {
max_offset: usize,
history: Vec<u8>,
base: u64,
head: Vec<u32>,
prev: Vec<u32>,
}
impl MatchFinder {
pub fn new(max_offset: usize) -> Self {
Self { max_offset, history: Vec::new(), base: 0, head: vec![u32::MAX; HASH_SIZE], prev: Vec::new() }
}
pub fn process(&mut self, chunk: &[u8], tokens: &mut Vec<Token>) {
tokens.clear();
tokens.reserve(chunk.len());
let chunk_start_abs = self.base + self.history.len() as u64;
self.history.extend_from_slice(chunk);
self.prev.resize(self.history.len(), u32::MAX);
let post_extend_end_abs = self.base + self.history.len() as u64;
debug_assert!(
post_extend_end_abs <= u32::MAX as u64,
"lzxc MatchFinder: input crossed 4 GiB (base={}, history_len={}); \
stored chain positions truncate to u32 past this point and compression \
will silently fall back to all-literal output. Segment your input and \
construct a fresh Encoder per segment.",
self.base,
self.history.len(),
);
let base = self.base;
let head: &mut [u32] = &mut self.head;
let prev: &mut [u32] = &mut self.prev;
let history: &[u8] = &self.history;
let history_len = history.len();
let end_abs = base + history_len as u64;
let chunk_start_rel = (chunk_start_abs - base) as usize;
let mut p_rel = chunk_start_rel;
let mut tok_written: usize = 0;
{
let tokens_spare: &mut [MaybeUninit<Token>] = tokens.spare_capacity_mut();
while p_rel + 4 <= history_len {
let p_abs = base + p_rel as u64;
let h = hash4(&history[p_rel..p_rel + 4]);
let best = Self::find_best_match(history, head, prev, base, p_abs, p_rel, h, end_abs, self.max_offset);
if best.length >= MIN_MATCH {
let length = core::num::NonZeroU32::new(best.length as u32).expect("best.length >= MIN_MATCH >= 3");
tokens_spare[tok_written] = MaybeUninit::new(Token::Match { offset: best.offset, length });
prev[p_rel] = head[h];
head[h] = p_abs as u32;
tok_written += 1;
p_rel += best.length;
} else {
tokens_spare[tok_written] = MaybeUninit::new(Token::Literal(history[p_rel]));
prev[p_rel] = head[h];
head[h] = p_abs as u32;
tok_written += 1;
p_rel += 1;
}
}
for &byte in &history[p_rel..history_len] {
tokens_spare[tok_written] = MaybeUninit::new(Token::Literal(byte));
tok_written += 1;
}
}
unsafe { tokens.set_len(tok_written) };
self.trim_history();
}
#[allow(clippy::too_many_arguments)]
fn find_best_match(
history: &[u8],
head: &[u32],
prev: &[u32],
base: u64,
p_abs: u64,
p_rel: usize,
hash: usize,
end_abs: u64,
max_offset: usize,
) -> BestMatch {
let max_possible = MAX_MATCH.min((end_abs - p_abs) as usize);
let mut best = BestMatch { length: 0, offset: 0 };
let mut candidate_abs = head[hash] as u64;
let mut depth = 0;
while depth < MAX_CHAIN_DEPTH && candidate_abs != u32::MAX as u64 {
if candidate_abs < base || candidate_abs >= p_abs {
break;
}
let offset = p_abs - candidate_abs;
if offset > max_offset as u64 || offset == 0 {
break;
}
let c_rel = (candidate_abs - base) as usize;
let next_candidate_abs = prev[c_rel] as u64;
if best.length > 0 && history[c_rel + best.length] != history[p_rel + best.length] {
candidate_abs = next_candidate_abs;
depth += 1;
continue;
}
let len = common_prefix_len(&history[c_rel..], &history[p_rel..], max_possible);
if len > best.length && len >= MIN_MATCH {
best = BestMatch { length: len, offset: offset as u32 };
if len >= max_possible {
break;
}
}
candidate_abs = next_candidate_abs;
depth += 1;
}
best
}
fn trim_history(&mut self) {
let keep = self.max_offset + crate::MAX_CHUNK_SIZE;
let trim_threshold = keep.saturating_add(keep);
if self.history.len() >= trim_threshold {
let drop = self.history.len() - keep;
self.history.drain(0..drop);
self.prev.drain(0..drop);
self.base += drop as u64;
}
}
}
#[cfg(test)]
fn greedy_match_finder(input: &[u8]) -> Vec<Token> {
let mut mf = MatchFinder::new(input.len().max(MAX_MATCH));
let mut tokens = Vec::new();
mf.process(input, &mut tokens);
tokens
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn no_matches_short_input() {
let tokens = greedy_match_finder(b"ab");
assert_eq!(tokens.len(), 2);
for t in &tokens {
assert!(matches!(t, Token::Literal(_)));
}
}
#[test]
fn finds_simple_repeat() {
let input = b"abcdabcd";
let tokens = greedy_match_finder(input);
let mut seen_match = false;
for t in &tokens {
if let Token::Match { offset, length } = t {
seen_match = true;
assert_eq!(*offset, 4);
assert_eq!(length.get(), 4);
}
}
assert!(seen_match);
}
#[test]
fn cross_chunk_match_detected() {
let mut mf = MatchFinder::new(1 << 16);
let chunk1 = b"filler-bytes-here..WXYZ";
let chunk2 = b"more-stuff-then-WXYZ-again";
let mut tokens1 = Vec::new();
let mut tokens2 = Vec::new();
mf.process(chunk1, &mut tokens1);
mf.process(chunk2, &mut tokens2);
let mut pos = 0usize;
let mut found_cross_chunk = false;
for t in &tokens2 {
match t {
Token::Literal(_) => pos += 1,
Token::Match { offset, length } => {
if length.get() >= 4 && (*offset as usize) > pos {
found_cross_chunk = true;
}
pos += length.get() as usize;
}
}
}
assert!(found_cross_chunk, "expected a match referencing chunk 1; tokens2 = {:?}", tokens2);
}
}