use super::count::{count_forward, count_forward_dict_2segment};
use super::hash_table::{FastHashTable, hash_ptr_raw};
use crate::encoding::Sequence;
#[cfg(feature = "kernel_trace")]
macro_rules! ktrace {
($($arg:tt)*) => {
if crate::encoding::simple::fast_kernel::kernel::kernel_trace_enabled() {
::std::eprintln!($($arg)*);
}
};
}
#[cfg(not(feature = "kernel_trace"))]
macro_rules! ktrace {
($($arg:tt)*) => {};
}
#[cfg(feature = "kernel_trace")]
pub(crate) fn kernel_trace_enabled() -> bool {
use core::sync::atomic::{AtomicU8, Ordering};
static CACHED: AtomicU8 = AtomicU8::new(0); match CACHED.load(Ordering::Relaxed) {
1 => false,
2 => true,
_ => {
let on = std::env::var("STRUCTURED_ZSTD_KERNEL_TRACE")
.map(|v| !v.is_empty())
.unwrap_or(false);
CACHED.store(if on { 2 } else { 1 }, Ordering::Relaxed);
on
}
}
}
const SEARCH_STRENGTH: usize = 8;
const K_STEP_INCR: usize = 1 << (SEARCH_STRENGTH - 1);
const HASH_READ_SIZE: usize = 8;
const MIN_DICT_MATCH_LEN: usize = 8;
#[inline(always)]
unsafe fn read32(ptr: *const u8) -> u32 {
unsafe { core::ptr::read_unaligned(ptr.cast::<u32>()) }
}
const CMOV_DUMMY: [u8; 4] = [0x12, 0x34, 0x56, 0x78];
#[inline(always)]
unsafe fn match_found<const USE_CMOV: bool>(
ip: *const u8,
base: *const u8,
match_idx: u32,
prefix_start_index: u32,
) -> bool {
let match_pos = match_idx as usize;
if USE_CMOV {
let in_range = match_idx >= prefix_start_index;
let mval_addr = if in_range {
unsafe { base.add(match_pos) }
} else {
CMOV_DUMMY.as_ptr()
};
let bytes_match = unsafe { read32(ip) == read32(mval_addr) };
#[allow(clippy::needless_bitwise_bool)]
let r = bytes_match & in_range;
r
} else {
if match_idx < prefix_start_index {
return false;
}
unsafe { read32(ip) == read32(base.add(match_pos)) }
}
}
pub(crate) struct FastBlockResult {
pub(crate) rep: [u32; 2],
pub(crate) tail_literals_len: usize,
}
#[derive(Clone, Copy)]
pub(crate) struct PrefixBounds {
pub prefix_start_index: u32,
pub window_low: u32,
}
#[inline(always)]
pub(crate) fn compress_block_fast<const MLS: u32, const USE_CMOV: bool>(
data: &[u8],
block_start: usize,
bounds: PrefixBounds,
hash_table: &mut FastHashTable,
rep: [u32; 2],
step_size: usize,
mut handle_sequence: impl for<'a> FnMut(Sequence<'a>),
) -> FastBlockResult {
let prefix_start_index = bounds.prefix_start_index;
let window_low = bounds.window_low;
assert!(
step_size >= 2,
"Fast kernel requires step_size >= 2 (got {step_size}); \
the upstream zstd formula clamps to a min of 2",
);
assert!(
(4..=8).contains(&MLS),
"Fast kernel only supports MLS in 4..=8 (got {MLS})",
);
assert_eq!(
MLS,
hash_table.mls(),
"compress_block_fast<{MLS}> called with hash_table whose mls = {}; \
the table's hash formula must match the kernel's monomorphised mls",
hash_table.mls(),
);
assert!(
block_start <= data.len(),
"block_start ({block_start}) must not exceed data.len() ({})",
data.len(),
);
assert!(
data.len() <= u32::MAX as usize,
"FastKernel does not support data.len() ({}) > u32::MAX ({}); \
the kernel stores absolute positions in a u32 hash table and \
u32 offset codes, so larger inputs would silently truncate",
data.len(),
u32::MAX,
);
if data.len() < block_start + HASH_READ_SIZE {
return FastBlockResult {
rep,
tail_literals_len: data.len() - block_start,
};
}
let base = data.as_ptr();
let iend_addr = data.len();
let ilimit = iend_addr - HASH_READ_SIZE;
let mut anchor: usize = block_start;
let mut ip0: usize = block_start;
if ip0 == 0 {
ip0 = 1;
}
let mut rep_offset1: u32 = rep[0];
let mut rep_offset2: u32 = rep[1];
let mut offset_saved1: u32 = 0;
let mut offset_saved2: u32 = 0;
{
let max_rep = (ip0 as u32).saturating_sub(window_low);
if rep_offset2 > max_rep {
offset_saved2 = rep_offset2;
rep_offset2 = 0;
}
if rep_offset1 > max_rep {
offset_saved1 = rep_offset1;
rep_offset1 = 0;
}
}
let mut step: usize = step_size;
let mut next_step: usize = ip0.saturating_add(K_STEP_INCR);
ktrace!(
"ENTER block_start={} ip0_initial={} ilimit={} window_low={} prefix={} rep1={} rep2={} step={} mls={}",
block_start,
ip0,
ilimit,
window_low,
prefix_start_index,
rep_offset1,
rep_offset2,
step_size,
MLS,
);
let (table, hlog) = hash_table.hot_state();
'restart: while ip0 < ilimit {
let mut ip1 = ip0 + 1;
let Some(mut ip2) = ip0.checked_add(step) else {
break;
};
let Some(mut ip3) = ip2.checked_add(1) else {
break;
};
if ip3 > ilimit {
break;
}
let mut hash0 = unsafe { hash_ptr_raw::<MLS>(base.add(ip0), hlog) };
let mut hash1 = unsafe { hash_ptr_raw::<MLS>(base.add(ip1), hlog) };
let mut match_idx = unsafe { *table.get_unchecked(hash0 as usize) };
ktrace!(
"OUTER ip0={} ip1={} ip2={} ip3={} step={} hash0={} hash1={} match_idx={} rep1={} rep2={}",
ip0,
ip1,
ip2,
ip3,
step,
hash0,
hash1,
match_idx,
rep_offset1,
rep_offset2
);
enum MatchFound {
Rep {
new_ip: usize,
match0: usize,
m_len: usize,
current0: usize,
},
Explicit {
new_ip: usize,
match_idx: u32,
current0: usize,
},
}
let found: Option<MatchFound> = loop {
let rval = unsafe { read32(base.add(ip2 - rep_offset1 as usize)) };
ktrace!("PUT hash0={} pos={} (iter-start)", hash0, ip0);
unsafe { *table.get_unchecked_mut(hash0 as usize) = ip0 as u32 };
if (rep_offset1 > 0) & (unsafe { read32(base.add(ip2)) } == rval) {
let mut new_ip = ip2;
let mut match0 = new_ip - rep_offset1 as usize;
let mut m_len: usize = 4;
if new_ip > anchor
&& match0 > window_low as usize
&& unsafe { *base.add(new_ip - 1) == *base.add(match0 - 1) }
{
new_ip -= 1;
match0 -= 1;
m_len += 1;
}
ktrace!("PUT hash1={} pos={} (rep-emit post)", hash1, ip1);
unsafe { *table.get_unchecked_mut(hash1 as usize) = ip1 as u32 };
ktrace!(
"MATCH rep new_ip={} match0={} m_len={} offset={}",
new_ip,
match0,
m_len,
rep_offset1
);
break Some(MatchFound::Rep {
new_ip,
match0,
m_len,
current0: ip0,
});
}
ktrace!("PROBE1 ip0={} match_idx={}", ip0, match_idx);
if unsafe {
match_found::<USE_CMOV>(base.add(ip0), base, match_idx, prefix_start_index)
} {
ktrace!("PUT hash1={} pos={} (explicit1 post)", hash1, ip1);
unsafe { *table.get_unchecked_mut(hash1 as usize) = ip1 as u32 };
ktrace!(
"MATCH explicit1 ip0={} match_idx={} offset={}",
ip0,
match_idx,
ip0 as i64 - match_idx as i64
);
break Some(MatchFound::Explicit {
new_ip: ip0,
match_idx,
current0: ip0,
});
}
match_idx = unsafe { *table.get_unchecked(hash1 as usize) };
hash0 = hash1;
hash1 = unsafe { hash_ptr_raw::<MLS>(base.add(ip2), hlog) };
ip0 = ip1;
ip1 = ip2;
ip2 = ip3;
ktrace!("PUT hash0={} pos={} (post-shift1)", hash0, ip0);
unsafe { *table.get_unchecked_mut(hash0 as usize) = ip0 as u32 };
ktrace!("PROBE2 ip0={} match_idx={}", ip0, match_idx);
if unsafe {
match_found::<USE_CMOV>(base.add(ip0), base, match_idx, prefix_start_index)
} {
if step <= 4 {
ktrace!("PUT hash1={} pos={} (explicit2 post, step<=4)", hash1, ip1);
unsafe { *table.get_unchecked_mut(hash1 as usize) = ip1 as u32 };
}
ktrace!(
"MATCH explicit2 ip0={} match_idx={} offset={}",
ip0,
match_idx,
ip0 as i64 - match_idx as i64
);
break Some(MatchFound::Explicit {
new_ip: ip0,
match_idx,
current0: ip0,
});
}
match_idx = unsafe { *table.get_unchecked(hash1 as usize) };
hash0 = hash1;
hash1 = unsafe { hash_ptr_raw::<MLS>(base.add(ip2), hlog) };
ip0 = ip1;
ip1 = ip2;
let Some(new_ip2) = ip0.checked_add(step) else {
break None;
};
let Some(new_ip3) = ip1.checked_add(step) else {
break None;
};
ip2 = new_ip2;
ip3 = new_ip3;
if ip2 >= next_step {
step += 1;
next_step = next_step.saturating_add(K_STEP_INCR);
}
if ip3 > ilimit {
break None;
}
};
let Some(found) = found else {
break 'restart;
};
let current0 = match found {
MatchFound::Rep { current0, .. } => current0,
MatchFound::Explicit { current0, .. } => current0,
};
let (mut match_ip, mut match_pos, mut m_len, offset, is_rep) = match found {
MatchFound::Rep {
new_ip,
match0,
m_len,
current0: _,
} => (new_ip, match0, m_len, rep_offset1 as usize, true),
MatchFound::Explicit {
new_ip,
match_idx,
current0: _,
} => {
let match_pos = match_idx as usize;
debug_assert!(
match_pos < new_ip,
"kernel invariant violated: match_pos ({match_pos}) >= new_ip ({new_ip}); \
hash table holds forward-pointing entry — driver/test broke writeback ordering"
);
let offset = new_ip - match_pos;
rep_offset2 = rep_offset1;
rep_offset1 = offset as u32;
(new_ip, match_pos, 4usize, offset, false)
}
};
if !is_rep {
while match_ip > anchor
&& match_pos > window_low as usize
&& unsafe { *base.add(match_ip - 1) == *base.add(match_pos - 1) }
{
match_ip -= 1;
match_pos -= 1;
m_len += 1;
}
}
let forward = unsafe {
count_forward(
base.add(match_ip + m_len),
base.add(match_pos + m_len),
base.add(iend_addr),
)
};
m_len += forward;
let literals = unsafe { data.get_unchecked(anchor..match_ip) };
handle_sequence(Sequence::Triple {
literals,
offset,
match_len: m_len,
});
ip0 = match_ip + m_len;
anchor = ip0;
if ip0 <= ilimit {
let in_range_fwd = current0
.checked_add(2)
.and_then(|c| c.checked_add(HASH_READ_SIZE))
.is_some_and(|end| end <= iend_addr);
if in_range_fwd {
let current0_plus_2 = current0 + 2;
let h_fwd = unsafe { hash_ptr_raw::<MLS>(base.add(current0_plus_2), hlog) };
unsafe { *table.get_unchecked_mut(h_fwd as usize) = current0_plus_2 as u32 };
}
if ip0 >= 2 {
let h_back = unsafe { hash_ptr_raw::<MLS>(base.add(ip0 - 2), hlog) };
unsafe { *table.get_unchecked_mut(h_back as usize) = (ip0 - 2) as u32 };
}
while rep_offset2 > 0
&& ip0 <= ilimit
&& ip0 >= rep_offset2 as usize
&& unsafe { read32(base.add(ip0)) == read32(base.add(ip0 - rep_offset2 as usize)) }
{
let r_off = rep_offset2 as usize;
let r_extra = unsafe {
count_forward(
base.add(ip0 + 4),
base.add(ip0 + 4 - r_off),
base.add(iend_addr),
)
};
let r_len = 4 + r_extra;
core::mem::swap(&mut rep_offset1, &mut rep_offset2);
let h_at = unsafe { hash_ptr_raw::<MLS>(base.add(ip0), hlog) };
unsafe { *table.get_unchecked_mut(h_at as usize) = ip0 as u32 };
handle_sequence(Sequence::Triple {
literals: unsafe { data.get_unchecked(anchor..ip0) },
offset: r_off,
match_len: r_len,
});
ip0 += r_len;
anchor = ip0;
}
}
step = step_size;
next_step = ip0.saturating_add(K_STEP_INCR);
continue 'restart;
}
if offset_saved1 != 0 && rep_offset1 != 0 {
offset_saved2 = offset_saved1;
}
let final_rep = [
if rep_offset1 != 0 {
rep_offset1
} else {
offset_saved1
},
if rep_offset2 != 0 {
rep_offset2
} else {
offset_saved2
},
];
FastBlockResult {
rep: final_rep,
tail_literals_len: data.len() - anchor,
}
}
#[allow(clippy::too_many_arguments)]
pub(crate) fn compress_block_fast_dict<const MLS: u32, const USE_CMOV: bool>(
data: &[u8],
block_start: usize,
bounds: PrefixBounds,
main_table: &mut FastHashTable,
dict_table: &FastHashTable,
dict_end: u32,
rep: [u32; 2],
step_size: usize,
mut handle_sequence: impl for<'a> FnMut(Sequence<'a>),
) -> FastBlockResult {
assert!(
block_start <= data.len(),
"block_start ({block_start}) must not exceed data.len() ({})",
data.len(),
);
assert!(
data.len() <= u32::MAX as usize,
"FastKernel does not support data.len() ({}) > u32::MAX",
data.len(),
);
debug_assert_eq!(MLS, main_table.mls());
debug_assert_eq!(MLS, dict_table.mls());
debug_assert_eq!(main_table.hash_log(), dict_table.hash_log());
let prefix_start_index = bounds.prefix_start_index;
let window_low = bounds.window_low as usize;
let dict_end = dict_end as usize;
if data.len() < block_start + HASH_READ_SIZE {
return FastBlockResult {
rep,
tail_literals_len: data.len() - block_start,
};
}
let base = data.as_ptr();
let iend_addr = data.len();
let ilimit = iend_addr - HASH_READ_SIZE;
let mut anchor: usize = block_start;
let mut ip0: usize = block_start;
if ip0 == 0 {
ip0 = 1;
}
let mut ip1 = ip0 + step_size;
let mut offset_1: u32 = rep[0];
let mut offset_2: u32 = rep[1];
struct DictMatch {
lit_end: usize,
offset: usize,
m_len: usize,
curr: usize,
}
'outer: while ip1 <= ilimit {
let mut hash0 = unsafe { main_table.hash_ptr::<MLS>(base.add(ip0)) };
let mut main_idx = unsafe { main_table.get(hash0) };
let mut dict_idx = unsafe { dict_table.get(hash0) };
let mut curr = ip0;
let found: Option<DictMatch> = loop {
let hash1 = unsafe { main_table.hash_ptr::<MLS>(base.add(ip1)) };
unsafe { main_table.put(hash0, curr as u32) };
if offset_1 > 0 && curr + 1 >= offset_1 as usize + window_low {
let rep_index = curr + 1 - offset_1 as usize;
if unsafe { read32(base.add(ip0 + 1)) == read32(base.add(rep_index)) } {
let m_len = 4 + unsafe {
count_forward(
base.add(ip0 + 1 + 4),
base.add(rep_index + 4),
base.add(iend_addr),
)
};
break Some(DictMatch {
lit_end: ip0 + 1,
offset: offset_1 as usize,
m_len,
curr,
});
}
}
if main_idx < prefix_start_index {
let dpos = dict_idx as usize;
if dict_idx >= 1
&& dpos < dict_end
&& dpos >= window_low
&& unsafe { read32(base.add(ip0)) == read32(base.add(dpos)) }
{
let mut match_ip = ip0;
let mut match_pos = dpos;
let mut m_len = 4 + unsafe {
count_forward(base.add(ip0 + 4), base.add(dpos + 4), base.add(iend_addr))
};
while match_ip > anchor
&& match_pos > window_low
&& unsafe { *base.add(match_ip - 1) == *base.add(match_pos - 1) }
{
match_ip -= 1;
match_pos -= 1;
m_len += 1;
}
if m_len >= MIN_DICT_MATCH_LEN {
let offset = match_ip - match_pos;
offset_2 = offset_1;
offset_1 = offset as u32;
break Some(DictMatch {
lit_end: match_ip,
offset,
m_len,
curr,
});
}
}
}
if unsafe { match_found::<USE_CMOV>(base.add(ip0), base, main_idx, prefix_start_index) }
{
let mut match_ip = ip0;
let mut match_pos = main_idx as usize;
let mut m_len = 4 + unsafe {
count_forward(
base.add(ip0 + 4),
base.add(match_pos + 4),
base.add(iend_addr),
)
};
while match_ip > anchor
&& match_pos > window_low
&& unsafe { *base.add(match_ip - 1) == *base.add(match_pos - 1) }
{
match_ip -= 1;
match_pos -= 1;
m_len += 1;
}
let offset = match_ip - match_pos;
offset_2 = offset_1;
offset_1 = offset as u32;
break Some(DictMatch {
lit_end: match_ip,
offset,
m_len,
curr,
});
}
dict_idx = unsafe { dict_table.get(hash1) };
main_idx = unsafe { main_table.get(hash1) };
ip0 = ip1;
ip1 += step_size;
if ip1 > ilimit {
break None;
}
curr = ip0;
hash0 = hash1;
};
let Some(m) = found else {
break 'outer;
};
handle_sequence(Sequence::Triple {
literals: &data[anchor..m.lit_end],
offset: m.offset,
match_len: m.m_len,
});
ip0 = m.lit_end + m.m_len;
anchor = ip0;
if ip0 <= ilimit {
if m.curr + 2 + HASH_READ_SIZE <= iend_addr {
let h = unsafe { main_table.hash_ptr::<MLS>(base.add(m.curr + 2)) };
unsafe { main_table.put(h, (m.curr + 2) as u32) };
}
if ip0 >= 2 {
let h = unsafe { main_table.hash_ptr::<MLS>(base.add(ip0 - 2)) };
unsafe { main_table.put(h, (ip0 - 2) as u32) };
}
while ip0 <= ilimit
&& offset_2 > 0
&& ip0 >= offset_2 as usize + window_low
&& unsafe { read32(base.add(ip0)) == read32(base.add(ip0 - offset_2 as usize)) }
{
let r_off = offset_2 as usize;
let r_len = 4 + unsafe {
count_forward(
base.add(ip0 + 4),
base.add(ip0 + 4 - r_off),
base.add(iend_addr),
)
};
core::mem::swap(&mut offset_1, &mut offset_2);
let h = unsafe { main_table.hash_ptr::<MLS>(base.add(ip0)) };
unsafe { main_table.put(h, ip0 as u32) };
handle_sequence(Sequence::Triple {
literals: &data[anchor..ip0],
offset: r_off,
match_len: r_len,
});
ip0 += r_len;
anchor = ip0;
}
}
ip1 = ip0 + step_size;
}
FastBlockResult {
rep: [offset_1, offset_2],
tail_literals_len: data.len() - anchor,
}
}
#[inline(always)]
#[allow(clippy::too_many_arguments)]
unsafe fn borrowed_candidate_len<C: Fn(*const u8, *const u8, usize) -> usize>(
cand_abs: usize,
cur_off: usize,
dict_end: usize,
dict: &[u8],
inp: &[u8],
inp_base: *const u8,
block_end: usize,
cpl: &C,
) -> usize {
if cand_abs >= dict_end {
let cand_off = cand_abs - dict_end;
if unsafe { read32(inp_base.add(cur_off)) != read32(inp_base.add(cand_off)) } {
return 0;
}
4 + unsafe {
cpl(
inp_base.add(cur_off + 4),
inp_base.add(cand_off + 4),
block_end - (cur_off + 4),
)
}
} else {
let l = count_forward_dict_2segment(dict, cand_abs, inp, cur_off);
if l >= 4 { l } else { 0 }
}
}
#[inline(always)]
#[allow(clippy::too_many_arguments)]
fn compress_block_fast_dict_borrowed_impl<
const MLS: u32,
const USE_CMOV: bool,
C: Fn(*const u8, *const u8, usize) -> usize,
>(
inp: &[u8],
dict: &[u8],
block_start: usize,
block_end: usize,
main_table: &mut FastHashTable,
dict_table: &FastHashTable,
bounds: PrefixBounds,
rep: [u32; 2],
step_size: usize,
mut handle_sequence: impl for<'a> FnMut(Sequence<'a>),
cpl: C,
) -> FastBlockResult {
assert!(
block_start <= block_end && block_end <= inp.len(),
"borrowed dict block bounds out of range: start={block_start} end={block_end} inp_len={}",
inp.len(),
);
let dict_end = dict.len();
assert!(
dict_end >= 1,
"borrowed dict kernel requires a non-empty dictionary (sentinel-0 safety)",
);
assert!(
dict_end
.checked_add(block_end)
.is_some_and(|v| v <= u32::MAX as usize),
"FastKernel does not support dict_end + block_end > u32::MAX (dict_end={dict_end}, block_end={block_end})",
);
assert!(
step_size >= 2,
"borrowed dict kernel requires step_size >= 2 (got {step_size})",
);
assert_eq!(MLS, main_table.mls());
assert_eq!(MLS, dict_table.mls());
assert_eq!(main_table.hash_log(), dict_table.hash_log());
let prefix_start_index = bounds.prefix_start_index;
let window_low = bounds.window_low as usize;
if block_end < block_start + HASH_READ_SIZE {
return FastBlockResult {
rep,
tail_literals_len: block_end - block_start,
};
}
let inp_base = inp.as_ptr();
let ilimit = block_end - HASH_READ_SIZE;
let mut anchor: usize = block_start;
let mut ip0: usize = block_start;
let mut ip1 = ip0 + step_size;
let mut offset_1: u32 = rep[0];
let mut offset_2: u32 = rep[1];
struct DictMatch {
lit_end: usize,
offset: usize,
m_len: usize,
curr: usize,
}
'outer: while ip1 <= ilimit {
let mut hash0 = unsafe { main_table.hash_ptr::<MLS>(inp_base.add(ip0)) };
let mut main_idx = unsafe { main_table.get(hash0) };
let mut dict_idx = unsafe { dict_table.get(hash0) };
let mut curr = ip0;
let found: Option<DictMatch> = loop {
let hash1 = unsafe { main_table.hash_ptr::<MLS>(inp_base.add(ip1)) };
let cur_abs = dict_end + curr;
unsafe { main_table.put(hash0, cur_abs as u32) };
if offset_1 > 0 && cur_abs + 1 >= offset_1 as usize + window_low {
let rep_abs = cur_abs + 1 - offset_1 as usize;
let m_len = unsafe {
borrowed_candidate_len(
rep_abs,
curr + 1,
dict_end,
dict,
inp,
inp_base,
block_end,
&cpl,
)
};
if m_len >= 4 {
break Some(DictMatch {
lit_end: curr + 1,
offset: offset_1 as usize,
m_len,
curr,
});
}
}
if main_idx < prefix_start_index {
let dpos = dict_idx as usize;
if dict_idx >= 1 && dpos < dict_end && dpos >= window_low {
let m0 = count_forward_dict_2segment(dict, dpos, inp, curr);
if m0 >= 4 {
let mut match_ip = curr;
let mut match_pos = dpos;
let mut m_len = m0;
while match_ip > anchor
&& match_pos > window_low
&& unsafe { *inp_base.add(match_ip - 1) == dict[match_pos - 1] }
{
match_ip -= 1;
match_pos -= 1;
m_len += 1;
}
if m_len >= MIN_DICT_MATCH_LEN {
let offset = (dict_end + match_ip) - match_pos;
offset_2 = offset_1;
offset_1 = offset as u32;
break Some(DictMatch {
lit_end: match_ip,
offset,
m_len,
curr,
});
}
}
}
}
debug_assert!(
main_idx == 0 || main_idx as usize >= dict_end,
"main-table entry must be the sentinel or a virtual input position (>= dict_end)",
);
let main_valid = if USE_CMOV {
let in_range = main_idx >= prefix_start_index;
let mval_addr = if in_range {
unsafe { inp_base.add(main_idx as usize - dict_end) }
} else {
CMOV_DUMMY.as_ptr()
};
let bytes_match = unsafe { read32(inp_base.add(ip0)) == read32(mval_addr) };
#[allow(clippy::needless_bitwise_bool)]
let r = bytes_match & in_range;
r
} else {
main_idx >= prefix_start_index
&& unsafe {
read32(inp_base.add(ip0))
== read32(inp_base.add(main_idx as usize - dict_end))
}
};
if main_valid {
let mut match_ip = ip0;
let mut match_pos = main_idx as usize - dict_end;
let mut m_len = 4 + unsafe {
cpl(
inp_base.add(ip0 + 4),
inp_base.add(match_pos + 4),
block_end - (ip0 + 4),
)
};
while match_ip > anchor
&& match_pos > 0
&& (dict_end + match_pos) > window_low
&& unsafe { *inp_base.add(match_ip - 1) == *inp_base.add(match_pos - 1) }
{
match_ip -= 1;
match_pos -= 1;
m_len += 1;
}
let offset = match_ip - match_pos;
offset_2 = offset_1;
offset_1 = offset as u32;
break Some(DictMatch {
lit_end: match_ip,
offset,
m_len,
curr,
});
}
dict_idx = unsafe { dict_table.get(hash1) };
main_idx = unsafe { main_table.get(hash1) };
ip0 = ip1;
ip1 += step_size;
if ip1 > ilimit {
break None;
}
curr = ip0;
hash0 = hash1;
};
let Some(m) = found else {
break 'outer;
};
handle_sequence(Sequence::Triple {
literals: &inp[anchor..m.lit_end],
offset: m.offset,
match_len: m.m_len,
});
ip0 = m.lit_end + m.m_len;
anchor = ip0;
if ip0 <= ilimit {
if m.curr + 2 + HASH_READ_SIZE <= block_end {
let h = unsafe { main_table.hash_ptr::<MLS>(inp_base.add(m.curr + 2)) };
unsafe { main_table.put(h, (dict_end + m.curr + 2) as u32) };
}
if ip0 >= 2 {
let h = unsafe { main_table.hash_ptr::<MLS>(inp_base.add(ip0 - 2)) };
unsafe { main_table.put(h, (dict_end + ip0 - 2) as u32) };
}
while ip0 <= ilimit && offset_2 > 0 && dict_end + ip0 >= offset_2 as usize + window_low
{
let rep_abs = dict_end + ip0 - offset_2 as usize;
let r_len = unsafe {
borrowed_candidate_len(
rep_abs, ip0, dict_end, dict, inp, inp_base, block_end, &cpl,
)
};
if r_len < 4 {
break;
}
let r_off = offset_2 as usize;
core::mem::swap(&mut offset_1, &mut offset_2);
let h = unsafe { main_table.hash_ptr::<MLS>(inp_base.add(ip0)) };
unsafe { main_table.put(h, (dict_end + ip0) as u32) };
handle_sequence(Sequence::Triple {
literals: &inp[anchor..ip0],
offset: r_off,
match_len: r_len,
});
ip0 += r_len;
anchor = ip0;
}
}
ip1 = ip0 + step_size;
}
FastBlockResult {
rep: [offset_1, offset_2],
tail_literals_len: block_end - anchor,
}
}
macro_rules! fast_dict_borrowed_wrapper {
($(#[$attr:meta])* $name:ident, $cpl:path) => {
$(#[$attr])*
#[allow(clippy::too_many_arguments)]
unsafe fn $name<const MLS: u32, const USE_CMOV: bool>(
inp: &[u8],
dict: &[u8],
block_start: usize,
block_end: usize,
main_table: &mut FastHashTable,
dict_table: &FastHashTable,
bounds: PrefixBounds,
rep: [u32; 2],
step_size: usize,
handle_sequence: impl for<'a> FnMut(Sequence<'a>),
) -> FastBlockResult {
compress_block_fast_dict_borrowed_impl::<MLS, USE_CMOV, _>(
inp,
dict,
block_start,
block_end,
main_table,
dict_table,
bounds,
rep,
step_size,
handle_sequence,
|l, r, m| unsafe { $cpl(l, r, m) },
)
}
};
}
fast_dict_borrowed_wrapper!(
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2,bmi2")]
cbfd_borrowed_avx2_bmi2,
crate::encoding::fastpath::avx2_bmi2::common_prefix_len_ptr
);
fast_dict_borrowed_wrapper!(
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "sse4.2")]
cbfd_borrowed_sse42,
crate::encoding::fastpath::sse42::common_prefix_len_ptr
);
fast_dict_borrowed_wrapper!(
#[cfg(all(target_arch = "aarch64", target_endian = "little"))]
#[target_feature(enable = "neon")]
cbfd_borrowed_neon,
crate::encoding::fastpath::neon::common_prefix_len_ptr
);
fast_dict_borrowed_wrapper!(
#[cfg(all(
target_arch = "wasm32",
target_feature = "simd128",
feature = "kernel_simd128"
))]
#[target_feature(enable = "simd128")]
cbfd_borrowed_simd128,
crate::encoding::fastpath::simd128::common_prefix_len_ptr
);
#[allow(clippy::too_many_arguments)]
pub(crate) fn compress_block_fast_dict_borrowed<const MLS: u32, const USE_CMOV: bool>(
inp: &[u8],
dict: &[u8],
block_start: usize,
block_end: usize,
main_table: &mut FastHashTable,
dict_table: &FastHashTable,
bounds: PrefixBounds,
rep: [u32; 2],
step_size: usize,
handle_sequence: impl for<'a> FnMut(Sequence<'a>),
kernel: crate::encoding::fastpath::FastpathKernel,
) -> FastBlockResult {
#[allow(unused_imports)]
use crate::encoding::fastpath::FastpathKernel;
let scalar = |inp: &[u8],
dict: &[u8],
main_table: &mut FastHashTable,
dict_table: &FastHashTable,
handle_sequence: &mut dyn for<'a> FnMut(Sequence<'a>)|
-> FastBlockResult {
compress_block_fast_dict_borrowed_impl::<MLS, USE_CMOV, _>(
inp,
dict,
block_start,
block_end,
main_table,
dict_table,
bounds,
rep,
step_size,
handle_sequence,
|l, r, m| unsafe { count_forward(l, r, l.add(m)) },
)
};
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
{
match kernel {
FastpathKernel::Avx2Bmi2 => unsafe {
cbfd_borrowed_avx2_bmi2::<MLS, USE_CMOV>(
inp,
dict,
block_start,
block_end,
main_table,
dict_table,
bounds,
rep,
step_size,
handle_sequence,
)
},
FastpathKernel::Sse42 => unsafe {
cbfd_borrowed_sse42::<MLS, USE_CMOV>(
inp,
dict,
block_start,
block_end,
main_table,
dict_table,
bounds,
rep,
step_size,
handle_sequence,
)
},
FastpathKernel::Scalar => {
let mut hs = handle_sequence;
scalar(inp, dict, main_table, dict_table, &mut hs)
}
}
}
#[cfg(all(target_arch = "aarch64", target_endian = "little"))]
{
match kernel {
FastpathKernel::Neon => unsafe {
cbfd_borrowed_neon::<MLS, USE_CMOV>(
inp,
dict,
block_start,
block_end,
main_table,
dict_table,
bounds,
rep,
step_size,
handle_sequence,
)
},
FastpathKernel::Scalar => {
let mut hs = handle_sequence;
scalar(inp, dict, main_table, dict_table, &mut hs)
}
}
}
#[cfg(all(
target_arch = "wasm32",
target_feature = "simd128",
feature = "kernel_simd128"
))]
{
match kernel {
FastpathKernel::Simd128 => unsafe {
cbfd_borrowed_simd128::<MLS, USE_CMOV>(
inp,
dict,
block_start,
block_end,
main_table,
dict_table,
bounds,
rep,
step_size,
handle_sequence,
)
},
FastpathKernel::Scalar => {
let mut hs = handle_sequence;
scalar(inp, dict, main_table, dict_table, &mut hs)
}
}
}
#[cfg(not(any(
target_arch = "x86",
target_arch = "x86_64",
all(target_arch = "aarch64", target_endian = "little"),
all(
target_arch = "wasm32",
target_feature = "simd128",
feature = "kernel_simd128"
)
)))]
{
let _ = kernel;
let mut hs = handle_sequence;
scalar(inp, dict, main_table, dict_table, &mut hs)
}
}
#[cfg(test)]
mod tests {
use super::*;
use alloc::vec;
use alloc::vec::Vec;
fn run_block(
data: &[u8],
hash_log: u32,
mls: u32,
) -> (Vec<(Vec<u8>, usize, usize)>, FastBlockResult) {
let mut table = FastHashTable::new(hash_log, mls);
let mut tuples: Vec<(Vec<u8>, usize, usize)> = Vec::new();
let mut handle = |seq: Sequence<'_>| match seq {
Sequence::Triple {
literals,
offset,
match_len,
} => {
tuples.push((literals.to_vec(), offset, match_len));
}
Sequence::Literals { literals } => {
tuples.push((literals.to_vec(), 0, 0));
}
};
let result = match mls {
4 => compress_block_fast::<4, false>(
data,
0,
PrefixBounds {
prefix_start_index: 1,
window_low: 0,
},
&mut table,
[0, 0],
2,
&mut handle,
),
5 => compress_block_fast::<5, false>(
data,
0,
PrefixBounds {
prefix_start_index: 1,
window_low: 0,
},
&mut table,
[0, 0],
2,
&mut handle,
),
_ => panic!("test helper only supports mls=4 and mls=5"),
};
let acct: usize = tuples
.iter()
.map(|(lits, _off, mlen)| lits.len() + mlen)
.sum::<usize>()
+ result.tail_literals_len;
assert_eq!(acct, data.len(), "kernel must account for every input byte",);
(tuples, result)
}
#[test]
fn short_input_reports_tail_without_emission() {
let data = [1u8, 2, 3, 4, 5];
let (tuples, result) = run_block(&data, 8, 4);
assert!(
tuples.is_empty(),
"kernel must NOT emit sequences for short inputs (got {tuples:?})",
);
assert_eq!(result.tail_literals_len, data.len());
}
#[test]
fn finds_long_repeat_in_simple_pattern() {
let mut data = Vec::new();
data.extend_from_slice(b"ABCDEFGHIJKLMNOP");
data.extend_from_slice(b"ABCDEFGHIJKLMNOP");
data.extend_from_slice(b"________");
let (tuples, _result) = run_block(&data, 12, 4);
let triple = tuples
.iter()
.find(|(_, _, m)| *m > 0)
.expect("kernel must emit at least one Triple for the repeated half");
assert!(
triple.2 >= 4,
"match_len must be ≥ MIN_MATCH=4 (got {})",
triple.2,
);
assert!(
triple.1 > 0,
"explicit-offset match must have offset > 0 (got {})",
triple.1,
);
}
fn run_block_with_rep(
data: &[u8],
hash_log: u32,
rep: [u32; 2],
) -> (Vec<(Vec<u8>, usize, usize)>, FastBlockResult) {
let mut table = FastHashTable::new(hash_log, 4);
let mut tuples: Vec<(Vec<u8>, usize, usize)> = Vec::new();
let mut handle = |seq: Sequence<'_>| match seq {
Sequence::Triple {
literals,
offset,
match_len,
} => tuples.push((literals.to_vec(), offset, match_len)),
Sequence::Literals { literals } => tuples.push((literals.to_vec(), 0, 0)),
};
let result = compress_block_fast::<4, false>(
data,
0,
PrefixBounds {
prefix_start_index: 1,
window_low: 0,
},
&mut table,
rep,
2,
&mut handle,
);
let acct: usize = tuples
.iter()
.map(|(lits, _off, mlen)| lits.len() + mlen)
.sum::<usize>()
+ result.tail_literals_len;
assert_eq!(acct, data.len(), "kernel must account for every input byte");
(tuples, result)
}
#[test]
fn repcode_match_emits_with_rep_offset_one() {
let data = vec![0x42u8; 64];
let (tuples, _) = run_block_with_rep(&data, 8, [1, 4]);
let rep_triple = tuples
.iter()
.find(|(_, off, m)| *off == 1 && *m > 0)
.unwrap_or_else(|| panic!("repcode Triple at offset=1 expected, got {tuples:?}"));
assert!(
rep_triple.2 >= 4,
"match_len must be ≥ MIN_MATCH=4 (got {})",
rep_triple.2,
);
assert!(
rep_triple.2 >= 32,
"uniform-byte rep extension must consume most of the buffer, got {}",
rep_triple.2,
);
}
#[test]
fn explicit_match_backward_extension_extends_by_marker_byte() {
let mut data: Vec<u8> = (0..15u8).collect();
data.push(b'Z');
data.extend_from_slice(b"AAAAAAAA");
for i in 0..8u8 {
data.push(0x80 + i);
}
data.push(b'Z');
data.extend_from_slice(b"AAAAAAAA");
for i in 0..16u8 {
data.push(0x40 + (i % 16));
}
let (tuples, _) = run_block_with_rep(&data, 12, [0, 0]);
let triple = tuples
.iter()
.find(|(_, _, m)| *m > 0)
.unwrap_or_else(|| panic!("expected an explicit-match Triple, got {tuples:?}"));
assert!(
triple.2 >= 5,
"expected match_len ≥ 5 from backward extension (got {})",
triple.2,
);
assert!(
!triple.0.ends_with(b"Z"),
"backward extension must consume the 'Z' marker (literals = {:?})",
triple.0,
);
}
#[test]
fn prefix_start_index_filter_rejects_below_window() {
let data = vec![0xAAu8; 64];
let mut table = FastHashTable::new(8, 4);
let h = unsafe { table.hash_ptr::<4>(data.as_ptr().add(1)) };
unsafe { table.put(h, 0) };
let mut tuples: Vec<(Vec<u8>, usize, usize)> = Vec::new();
let mut handle = |seq: Sequence<'_>| match seq {
Sequence::Triple {
literals,
offset,
match_len,
} => tuples.push((literals.to_vec(), offset, match_len)),
Sequence::Literals { literals } => tuples.push((literals.to_vec(), 0, 0)),
};
let _ = compress_block_fast::<4, false>(
&data,
0,
PrefixBounds {
prefix_start_index: 5,
window_low: 5,
},
&mut table,
[0, 0],
2,
&mut handle,
);
let mut anchor: usize = 0;
for (lits, off, m) in &tuples {
if *m > 0 {
let match_start = anchor + lits.len();
let match_src = match_start
.checked_sub(*off)
.expect("offset must not exceed match_start (would wrap)");
assert!(
match_src >= 5,
"match source {match_src} below prefix_start_index=5 \
(match_start={match_start}, offset={off})",
);
anchor = match_start + m;
} else {
anchor += lits.len();
}
}
}
#[test]
fn match_found_rejects_stale_entry_below_prefix_floor() {
let data = vec![0u8; 200];
let mut table = FastHashTable::new(8, 4);
let h = unsafe { table.hash_ptr::<4>(data.as_ptr().add(50)) };
unsafe { table.put(h, 5) };
let mut tuples: Vec<(Vec<u8>, usize, usize)> = Vec::new();
let mut handle = |seq: Sequence<'_>| match seq {
Sequence::Triple {
literals,
offset,
match_len,
} => tuples.push((literals.to_vec(), offset, match_len)),
Sequence::Literals { literals } => tuples.push((literals.to_vec(), 0, 0)),
};
let _ = compress_block_fast::<4, false>(
&data,
50,
PrefixBounds {
prefix_start_index: 50,
window_low: 50,
},
&mut table,
[0, 0],
2,
&mut handle,
);
for (_, off, m) in &tuples {
if *m > 0 {
assert!(
*off > 0 && *off <= data.len(),
"every emitted offset must reference an in-buffer backward position (got {off})",
);
}
}
}
#[test]
fn block_exactly_hash_read_size_emits_no_sequences() {
let data = [1u8, 2, 3, 4, 5, 6, 7, 8];
let (tuples, result) = run_block_with_rep(&data, 8, [0, 0]);
assert!(
tuples.is_empty(),
"exactly HASH_READ_SIZE bytes must produce no main-loop iterations",
);
assert_eq!(result.tail_literals_len, data.len());
}
#[test]
fn block_just_below_hash_read_size_emits_no_sequences() {
let data = [1u8, 2, 3, 4, 5, 6, 7];
let (tuples, result) = run_block_with_rep(&data, 8, [0, 0]);
assert!(tuples.is_empty());
assert_eq!(result.tail_literals_len, data.len());
}
#[test]
fn rep_offset_save_restore_when_out_of_range() {
let mut data = vec![0u8; 64];
let mut state = 0x1234_5678u32;
for byte in &mut data {
state ^= state << 13;
state ^= state >> 17;
state ^= state << 5;
*byte = state as u8;
}
let huge = 9999;
let mut table = FastHashTable::new(10, 4);
let mut tuples: Vec<(Vec<u8>, usize, usize)> = Vec::new();
let mut handle = |seq: Sequence<'_>| match seq {
Sequence::Triple {
literals,
offset,
match_len,
} => tuples.push((literals.to_vec(), offset, match_len)),
Sequence::Literals { literals } => tuples.push((literals.to_vec(), 0, 0)),
};
let result = compress_block_fast::<4, false>(
&data,
0,
PrefixBounds {
prefix_start_index: 1,
window_low: 0,
},
&mut table,
[huge, 7],
2,
&mut handle,
);
assert_eq!(
result.rep[0], huge,
"out-of-range rep_offset1 must be restored verbatim across the block",
);
assert_eq!(result.rep[1], 7, "rep_offset2 (also stashed) must restore");
}
#[test]
fn cmov_variant_matches_branch_variant_output() {
let mut data = alloc::vec::Vec::new();
for i in 0..512u32 {
data.push((i & 0xFF) as u8);
}
let tail = data[0..64].to_vec();
data.extend_from_slice(&tail);
let collect = |use_cmov: bool| -> alloc::vec::Vec<(alloc::vec::Vec<u8>, usize, usize)> {
let mut table = FastHashTable::new(12, 4);
let mut tuples = alloc::vec::Vec::new();
let mut handle = |seq: Sequence<'_>| match seq {
Sequence::Triple {
literals,
offset,
match_len,
} => {
tuples.push((literals.to_vec(), offset, match_len));
}
Sequence::Literals { literals } => {
tuples.push((literals.to_vec(), 0, 0));
}
};
if use_cmov {
let _ = compress_block_fast::<4, true>(
&data,
0,
PrefixBounds {
prefix_start_index: 1,
window_low: 0,
},
&mut table,
[0, 0],
2,
&mut handle,
);
} else {
let _ = compress_block_fast::<4, false>(
&data,
0,
PrefixBounds {
prefix_start_index: 1,
window_low: 0,
},
&mut table,
[0, 0],
2,
&mut handle,
);
}
tuples
};
let out_branch = collect(false);
let out_cmov = collect(true);
assert_eq!(
out_branch, out_cmov,
"cmov and branch variants must emit identical sequences"
);
}
#[test]
fn cmov_variant_rejects_out_of_window_when_ip_equals_dummy() {
let mut data: alloc::vec::Vec<u8> = alloc::vec![0xAA; 32];
data[16] = 0x12;
data[17] = 0x34;
data[18] = 0x56;
data[19] = 0x78;
let base = data.as_ptr();
let ip_pos = 16usize;
let ip = unsafe { base.add(ip_pos) };
let branch_result = unsafe { match_found::<false>(ip, base, 4, 10) };
assert!(
!branch_result,
"branch variant must reject out-of-window match_idx"
);
let cmov_result = unsafe { match_found::<true>(ip, base, 4, 10) };
assert!(
!cmov_result,
"cmov variant must reject out-of-window match_idx even when \
ip bytes coincide with CMOV_DUMMY",
);
}
#[test]
fn borrowed_dict_kernel_reconstructs_via_dual_base() {
use crate::encoding::fastpath::FastpathKernel;
let hash_log = 12u32;
const MLS: u32 = 4;
let dict: Vec<u8> = (0u8..40).collect();
let mut inp: Vec<u8> = Vec::new();
inp.extend_from_slice(&dict[0..20]); inp.extend_from_slice(&dict[0..20]); inp.extend_from_slice(&dict[4..24]); inp.extend_from_slice(b"tail-literals-xyz");
let dict_end = dict.len();
let mut main_table = FastHashTable::new(hash_log, MLS);
let mut dict_table = FastHashTable::new(hash_log, MLS);
for pos in 0..=dict.len() - HASH_READ_SIZE {
let h = unsafe { dict_table.hash_ptr::<MLS>(dict.as_ptr().add(pos)) };
unsafe { dict_table.put(h, pos as u32) };
}
let mut tuples: Vec<(Vec<u8>, usize, usize)> = Vec::new();
let mut handle = |seq: Sequence<'_>| match seq {
Sequence::Triple {
literals,
offset,
match_len,
} => tuples.push((literals.to_vec(), offset, match_len)),
Sequence::Literals { literals } => tuples.push((literals.to_vec(), 0, 0)),
};
let result = compress_block_fast_dict_borrowed::<MLS, false>(
&inp,
&dict,
0,
inp.len(),
&mut main_table,
&dict_table,
PrefixBounds {
prefix_start_index: 1,
window_low: 0,
},
[0, 0],
2,
&mut handle,
FastpathKernel::Scalar,
);
let mut window = dict.clone();
let mut saw_dict_region_match = false;
for (literals, offset, match_len) in &tuples {
window.extend_from_slice(literals);
if *match_len > 0 {
let start = window.len() - offset;
if start < dict_end {
saw_dict_region_match = true;
}
for i in 0..*match_len {
let b = window[start + i];
window.push(b);
}
}
}
let tail_start = inp.len() - result.tail_literals_len;
window.extend_from_slice(&inp[tail_start..]);
assert_eq!(
&window[dict_end..],
&inp[..],
"borrowed dict kernel must reconstruct the input from the [dict][input] window",
);
assert!(
tuples.iter().any(|(_, _, m)| *m >= 4),
"expected at least one match Triple",
);
assert!(
saw_dict_region_match,
"expected at least one match reading from the dictionary region (dual-base path)",
);
}
}