#![allow(dead_code)]
use alloc::vec;
use alloc::vec::Vec;
use core::hash::Hasher;
use twox_hash::XxHash64;
use super::opt::ldm::HcRawSeq;
pub(crate) mod gear_hash;
pub(crate) mod params;
pub(crate) mod search;
pub(crate) mod table;
use gear_hash::{GearHashState, LDM_BATCH_SIZE};
use params::LdmParams;
use search::{FindBestMatchInputs, find_best_match};
use table::LdmHashTable;
const LDM_XXH64_SEED: u64 = 0;
pub(crate) struct LdmProducer {
params: LdmParams,
hash_state: GearHashState,
hash_table: LdmHashTable,
splits_scratch: Vec<usize>,
}
impl LdmProducer {
pub(crate) fn new(params: LdmParams) -> Self {
let hash_state = GearHashState::new(params.min_match_length as usize, params.hash_rate_log);
let hash_table = LdmHashTable::new(params.hash_log, params.bucket_size_log);
Self {
params,
hash_state,
hash_table,
splits_scratch: vec![0usize; LDM_BATCH_SIZE],
}
}
pub(crate) fn with_window_and_strategy(window_log: u32, strategy: u32) -> Self {
Self::new(LdmParams::adjust_for(window_log, strategy))
}
pub(crate) fn clear(&mut self) {
self.hash_table.clear();
self.hash_state = GearHashState::new(
self.params.min_match_length as usize,
self.params.hash_rate_log,
);
}
pub(crate) fn params(&self) -> LdmParams {
self.params
}
pub(crate) fn generate_into(
&mut self,
live_history: &[u8],
history_abs_start: usize,
block_start_abs: usize,
block_end_abs: usize,
out: &mut Vec<HcRawSeq>,
) {
debug_assert!(history_abs_start <= block_start_abs);
debug_assert!(block_start_abs <= block_end_abs);
debug_assert!(block_end_abs <= history_abs_start + live_history.len());
let min_match = self.params.min_match_length as usize;
if block_end_abs.saturating_sub(block_start_abs) < min_match {
return;
}
let hash_id_mask: u32 = self.hash_table.bucket_mask();
let to_idx = |abs: usize| abs - history_abs_start;
self.hash_state = GearHashState::new(min_match, self.params.hash_rate_log);
let reset_start_idx = to_idx(block_start_abs);
gear_hash::reset(
&mut self.hash_state,
&live_history[reset_start_idx..reset_start_idx + min_match],
);
let mut anchor_abs = block_start_abs;
let mut ip_abs = block_start_abs + min_match;
const HASH_READ_SIZE: usize = 8;
let ilimit_abs = block_end_abs.saturating_sub(HASH_READ_SIZE);
while ip_abs < ilimit_abs {
let chunk_idx = to_idx(ip_abs);
let chunk_end_idx = to_idx(ilimit_abs);
let chunk = &live_history[chunk_idx..chunk_end_idx];
let (hashed, num_splits) =
gear_hash::feed(&mut self.hash_state, chunk, &mut self.splits_scratch);
let mut split_n = 0usize;
while split_n < num_splits {
let s = self.splits_scratch[split_n];
split_n += 1;
if s + ip_abs < block_start_abs + min_match {
continue;
}
let split_abs = ip_abs + s - min_match;
let window_end_abs = split_abs + min_match;
if window_end_abs > block_end_abs {
continue;
}
let split_idx = to_idx(split_abs);
let window_end_idx = to_idx(window_end_abs);
let mut hasher = XxHash64::with_seed(LDM_XXH64_SEED);
hasher.write(&live_history[split_idx..window_end_idx]);
let xxhash = hasher.finish();
let hash_id = (xxhash as u32) & hash_id_mask;
let checksum = (xxhash >> 32) as u32;
self.hash_table.ensure_room_for(split_abs);
if split_abs < anchor_abs {
self.hash_table
.insert_absolute(hash_id, split_abs, checksum);
continue;
}
let best = find_best_match(
&self.hash_table,
hash_id,
checksum,
FindBestMatchInputs {
live_history,
history_abs_start,
split_abs,
anchor_abs,
lowest_index_abs: history_abs_start,
iend_abs: block_end_abs,
min_match_length: min_match,
},
);
let Some(best) = best else {
self.hash_table
.insert_absolute(hash_id, split_abs, checksum);
continue;
};
let offset = split_abs - best.match_pos;
let lit_length = split_abs - best.backward_len - anchor_abs;
let match_length = best.total_len();
out.push(HcRawSeq {
lit_length,
offset,
match_length,
});
self.hash_table
.insert_absolute(hash_id, split_abs, checksum);
anchor_abs = split_abs + best.forward_len;
if anchor_abs > ip_abs + hashed {
let reset_start_abs = anchor_abs.saturating_sub(min_match);
if reset_start_abs + min_match <= block_end_abs
&& reset_start_abs >= history_abs_start
{
let reset_idx = to_idx(reset_start_abs);
self.hash_state = GearHashState::new(min_match, self.params.hash_rate_log);
gear_hash::reset(
&mut self.hash_state,
&live_history[reset_idx..reset_idx + min_match],
);
ip_abs = anchor_abs.saturating_sub(hashed).max(history_abs_start);
}
break;
}
}
ip_abs += hashed.max(1);
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn producer_constructs_with_donor_default_params() {
let p = LdmParams::adjust_for(27, 9);
assert_eq!(p.window_log, 27);
assert_eq!(p.min_match_length, 32);
assert_eq!(p.hash_rate_log, 4);
assert_eq!(p.bucket_size_log, 8);
}
fn test_params() -> LdmParams {
LdmParams {
window_log: 27,
hash_log: 10,
hash_rate_log: 4,
min_match_length: 32,
bucket_size_log: 4,
}
}
#[test]
fn clear_resets_rolling_hash_state() {
let mut producer = LdmProducer::new(test_params());
let mut out = Vec::new();
let data = [0xAAu8; 256];
producer.generate_into(&data, 0, 0, data.len(), &mut out);
let advanced = producer.hash_state.rolling;
assert_ne!(
advanced,
gear_hash::GEAR_HASH_INIT,
"rolling hash should have moved after generate_into"
);
producer.clear();
assert_eq!(
producer.hash_state.rolling,
gear_hash::GEAR_HASH_INIT,
"clear must rewind to GEAR_HASH_INIT"
);
}
#[test]
fn generate_into_emits_long_range_match_on_repeated_payload() {
let mut producer = LdmProducer::new(test_params());
let p = producer.params();
assert_eq!(p.min_match_length, 32);
const PAYLOAD: usize = 4096;
const GAP: usize = 64 * 1024;
let mut history = Vec::with_capacity(2 * PAYLOAD + GAP);
let mut prng: u32 = 0x1234_5678;
let payload: alloc::vec::Vec<u8> = (0..PAYLOAD)
.map(|_| {
prng = prng.wrapping_mul(1_103_515_245).wrapping_add(12_345);
(prng >> 16) as u8
})
.collect();
history.extend_from_slice(&payload);
history.extend((0..GAP).map(|i| (i % 251) as u8));
history.extend_from_slice(&payload);
let mut out = Vec::new();
producer.generate_into(&history, 0, 0, history.len(), &mut out);
assert!(
!out.is_empty(),
"long-range repetition must produce at least one LDM sequence"
);
for seq in &out {
assert!(
seq.match_length >= p.min_match_length as usize,
"every emitted match must reach min_match_length \
(got match_length = {})",
seq.match_length
);
}
let crossing_gap = out.iter().any(|s| s.offset >= GAP);
assert!(
crossing_gap,
"at least one emitted sequence must have offset >= GAP \
({GAP}); offsets observed: {:?}",
out.iter().map(|s| s.offset).collect::<alloc::vec::Vec<_>>()
);
}
#[test]
fn generate_into_preserves_bucket_entries_across_history_slide() {
let mut producer = LdmProducer::new(test_params());
let p = producer.params();
assert_eq!(p.min_match_length, 32);
const PAYLOAD: usize = 4096;
const GAP_A: usize = 32 * 1024;
const GAP_B: usize = 32 * 1024;
let mut prng: u32 = 0xC0FFEEEE;
let payload: alloc::vec::Vec<u8> = (0..PAYLOAD)
.map(|_| {
prng = prng.wrapping_mul(1_103_515_245).wrapping_add(12_345);
(prng >> 16) as u8
})
.collect();
let mut frame = alloc::vec::Vec::with_capacity(2 * PAYLOAD + GAP_A + GAP_B);
frame.extend_from_slice(&payload);
frame.extend((0..GAP_A).map(|i| (i % 251) as u8));
frame.extend_from_slice(&payload);
frame.extend((0..GAP_B).map(|i| ((i + 17) % 241) as u8));
let mut out1 = Vec::new();
let split_at = PAYLOAD + GAP_A;
producer.generate_into(&frame[..split_at], 0, 0, split_at, &mut out1);
let eviction = PAYLOAD / 2; let live = &frame[eviction..];
let history_abs_start = eviction;
let block_start_abs = PAYLOAD + GAP_A; let block_end_abs = block_start_abs + PAYLOAD;
let mut out2 = Vec::new();
producer.generate_into(
live,
history_abs_start,
block_start_abs,
block_end_abs,
&mut out2,
);
assert!(
!out2.is_empty(),
"cross-slide long-range match must survive a window eviction"
);
let live_len = live.len();
for seq in &out2 {
assert!(
seq.match_length >= p.min_match_length as usize,
"every emitted match must reach min_match_length (got {})",
seq.match_length
);
assert!(
seq.offset <= live_len,
"back-ref offset {} must stay within the live \
history (= {} bytes); offsets larger than this \
are the smoking gun for stale slice-relative \
entries surviving the eviction",
seq.offset,
live_len
);
}
let crossed_gap = out2.iter().any(|s| s.offset >= GAP_A);
assert!(
crossed_gap,
"at least one emitted sequence must hit a back-ref of \
>= GAP_A ({GAP_A}) — that's the long-range match \
into the surviving tail of payload #1 the bucket \
entries from call 1 are supposed to produce; \
offsets observed: {:?}",
out2.iter()
.map(|s| s.offset)
.collect::<alloc::vec::Vec<_>>()
);
}
#[test]
fn generate_into_empty_range_is_noop() {
let mut producer = LdmProducer::new(test_params());
let mut out = Vec::new();
let data = [0u8; 128];
let pre = producer.hash_state.rolling;
producer.generate_into(&data, 0, 64, 64, &mut out);
assert!(out.is_empty());
assert_eq!(producer.hash_state.rolling, pre);
}
}