#![allow(dead_code)]
#[cfg(target_arch = "aarch64")]
use std::arch::aarch64::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64;
use crate::decompress::inflate::jit_decode::TableFingerprint;
use crate::decompress::inflate::libdeflate_entry::{DistTable, LitLenTable};
use std::cell::RefCell;
use std::collections::HashMap;
use std::io::{Error, ErrorKind, Result};
#[inline(always)]
fn unlikely(b: bool) -> bool {
b
}
thread_local! {
static TABLE_CACHE: RefCell<HashMap<TableFingerprint, (LitLenTable, DistTable)>> =
RefCell::new(HashMap::new());
static CACHE_STATS: RefCell<(usize, usize)> = const { RefCell::new((0, 0)) }; static SPEC_CACHE: RefCell<crate::decompress::inflate::specialized_decode::SpecializedCache> =
RefCell::new(crate::decompress::inflate::specialized_decode::SpecializedCache::new());
static SPEC_STATS: RefCell<(usize, usize)> = const { RefCell::new((0, 0)) }; static BLOCK_STATS: RefCell<BlockStats> = const { RefCell::new(BlockStats::new()) };
static TIMING_STATS: RefCell<TimingStats> = const { RefCell::new(TimingStats::new()) };
}
#[derive(Debug, Clone, Copy, Default)]
pub struct TimingStats {
pub table_build_nanos: u64,
pub decode_nanos: u64,
pub header_parse_nanos: u64,
pub table_build_count: u64,
pub decode_count: u64,
}
impl TimingStats {
pub const fn new() -> Self {
Self {
table_build_nanos: 0,
decode_nanos: 0,
header_parse_nanos: 0,
table_build_count: 0,
decode_count: 0,
}
}
}
#[derive(Debug, Clone, Copy, Default)]
pub struct BlockStats {
pub stored_blocks: usize,
pub fixed_blocks: usize,
pub dynamic_blocks: usize,
pub stored_bytes: usize,
pub fixed_bytes: usize,
pub dynamic_bytes: usize,
}
impl BlockStats {
pub const fn new() -> Self {
Self {
stored_blocks: 0,
fixed_blocks: 0,
dynamic_blocks: 0,
stored_bytes: 0,
fixed_bytes: 0,
dynamic_bytes: 0,
}
}
pub fn total_blocks(&self) -> usize {
self.stored_blocks + self.fixed_blocks + self.dynamic_blocks
}
pub fn total_bytes(&self) -> usize {
self.stored_bytes + self.fixed_bytes + self.dynamic_bytes
}
}
pub fn get_cache_stats() -> (usize, usize, f64) {
CACHE_STATS.with(|stats| {
let (hits, misses) = *stats.borrow();
let total = hits + misses;
let rate = if total > 0 {
hits as f64 / total as f64
} else {
0.0
};
(hits, misses, rate)
})
}
pub fn get_spec_stats() -> (usize, usize) {
SPEC_STATS.with(|stats| *stats.borrow())
}
pub fn get_block_stats() -> BlockStats {
BLOCK_STATS.with(|stats| *stats.borrow())
}
pub fn get_table_cache_size() -> usize {
TABLE_CACHE.with(|cache| cache.borrow().len())
}
pub fn get_spec_cache_stats() -> (usize, usize, usize, usize) {
SPEC_CACHE.with(|cache| cache.borrow().detailed_stats())
}
pub fn get_timing_stats() -> TimingStats {
TIMING_STATS.with(|stats| *stats.borrow())
}
pub fn reset_cache_stats() {
CACHE_STATS.with(|stats| {
*stats.borrow_mut() = (0, 0);
});
SPEC_STATS.with(|stats| {
*stats.borrow_mut() = (0, 0);
});
BLOCK_STATS.with(|stats| {
*stats.borrow_mut() = BlockStats::new();
});
TABLE_CACHE.with(|cache| {
cache.borrow_mut().clear();
});
SPEC_CACHE.with(|cache| {
*cache.borrow_mut() =
crate::decompress::inflate::specialized_decode::SpecializedCache::new();
});
TIMING_STATS.with(|stats| {
*stats.borrow_mut() = TimingStats::new();
});
}
#[inline(always)]
fn bzhi_u64(x: u64, n: u32) -> u64 {
#[cfg(all(target_arch = "x86_64", target_feature = "bmi2"))]
{
use std::arch::x86_64::_bzhi_u64;
unsafe { _bzhi_u64(x, n) }
}
#[cfg(not(all(target_arch = "x86_64", target_feature = "bmi2")))]
{
let mask = (1u64 << (n & 63)).wrapping_sub(1);
x & mask
}
}
#[allow(dead_code)]
#[inline(always)]
fn extract_bits(value: u64, n: u32) -> u64 {
bzhi_u64(value, n)
}
pub struct Bits<'a> {
pub data: &'a [u8],
pub pos: usize,
pub bitbuf: u64,
pub bitsleft: u32,
}
impl<'a> Bits<'a> {
pub fn new(data: &'a [u8]) -> Self {
let mut bits = Self {
data,
pos: 0,
bitbuf: 0,
bitsleft: 0,
};
bits.refill();
bits
}
#[inline(always)]
pub fn refill(&mut self) {
let bits_u8 = self.bitsleft as u8;
let bits_u8 = if bits_u8 > 64 {
self.bitbuf = 0;
0
} else {
bits_u8
};
if self.pos + 8 <= self.data.len() {
let word = unsafe { (self.data.as_ptr().add(self.pos) as *const u64).read_unaligned() };
let word = u64::from_le(word);
self.bitbuf |= word << bits_u8;
self.pos += (7 - ((bits_u8 >> 3) & 7)) as usize;
self.bitsleft = (bits_u8 as u32) | 56;
} else {
self.refill_slow_with_bits(bits_u8);
}
}
#[inline(never)]
fn refill_slow_with_bits(&mut self, mut bits_u8: u8) {
while bits_u8 <= 56 {
if self.pos < self.data.len() {
self.bitbuf |= (self.data[self.pos] as u64) << bits_u8;
self.pos += 1;
bits_u8 += 8;
} else {
break;
}
}
self.bitsleft = bits_u8 as u32;
}
#[inline(never)]
pub fn refill_slow(&mut self) {
let bits_u8 = if (self.bitsleft as u8) > 64 {
self.bitbuf = 0;
0u8
} else {
self.bitsleft as u8
};
self.refill_slow_with_bits(bits_u8);
}
#[inline(always)]
pub fn peek(&self) -> u64 {
self.bitbuf
}
#[inline(always)]
pub fn consume(&mut self, n: u32) {
self.bitbuf >>= n as u8;
self.bitsleft -= n;
}
#[inline(always)]
pub fn consume_entry(&mut self, entry: u32) {
self.bitbuf >>= entry as u8;
self.bitsleft = self.bitsleft.wrapping_sub(entry & 0x1F);
}
#[inline(always)]
pub fn available(&self) -> u32 {
(self.bitsleft as u8) as u32
}
pub fn align_to_byte(&mut self) {
let discard = (self.bitsleft as u8) & 7;
self.consume(discard as u32);
}
pub fn read_u16(&mut self) -> u16 {
self.align_to_byte();
if self.available() >= 16 {
let val = (self.bitbuf & 0xFFFF) as u16;
self.consume(16);
return val;
}
self.refill();
let val = (self.bitbuf & 0xFFFF) as u16;
self.consume(16);
val
}
}
#[cfg(target_arch = "x86_64")]
#[inline(always)]
fn prefetch_read(ptr: *const u8) {
unsafe {
core::arch::x86_64::_mm_prefetch(ptr as *const i8, core::arch::x86_64::_MM_HINT_T0);
}
}
#[cfg(not(target_arch = "x86_64"))]
#[inline(always)]
fn prefetch_read(_ptr: *const u8) {}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[inline]
unsafe fn copy_match_avx2(mut dst: *mut u8, mut src: *const u8, end: *const u8) {
while dst.add(32) <= end as *mut u8 {
let v = x86_64::_mm256_loadu_si256(src as *const x86_64::__m256i);
x86_64::_mm256_storeu_si256(dst as *mut x86_64::__m256i, v);
src = src.add(32);
dst = dst.add(32);
}
while dst < end as *mut u8 {
(dst as *mut u64).write_unaligned((src as *const u64).read_unaligned());
src = src.add(8);
dst = dst.add(8);
}
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[inline]
unsafe fn fill_byte_avx2(mut dst: *mut u8, end: *const u8, byte: u8) {
let pattern = x86_64::_mm256_set1_epi8(byte as i8);
while dst.add(32) <= end as *mut u8 {
x86_64::_mm256_storeu_si256(dst as *mut x86_64::__m256i, pattern);
dst = dst.add(32);
}
let v = 0x0101010101010101u64 * (byte as u64);
while dst < end as *mut u8 {
(dst as *mut u64).write_unaligned(v);
dst = dst.add(8);
}
}
#[inline(always)]
fn copy_match_fast(output: &mut [u8], out_pos: usize, distance: u32, length: u32) -> usize {
let dist = distance as usize;
let len = length as usize;
unsafe {
let out_ptr = output.as_mut_ptr();
let mut dst = out_ptr.add(out_pos);
let mut src = out_ptr.add(out_pos - dist);
let end = dst.add(len);
if len > 40 {
prefetch_read(src.add(40));
}
if dist >= 32 && len >= 64 {
#[cfg(target_arch = "aarch64")]
{
while dst.add(32) <= end {
let v0 = vld1q_u8(src);
let v1 = vld1q_u8(src.add(16));
vst1q_u8(dst, v0);
vst1q_u8(dst.add(16), v1);
src = src.add(32);
dst = dst.add(32);
}
while dst.add(16) <= end {
let v = vld1q_u8(src);
vst1q_u8(dst, v);
src = src.add(16);
dst = dst.add(16);
}
while dst < end {
(dst as *mut u64).write_unaligned((src as *const u64).read_unaligned());
src = src.add(8);
dst = dst.add(8);
}
}
#[cfg(target_arch = "x86_64")]
{
if is_x86_feature_detected!("avx2") {
copy_match_avx2(dst, src, end);
} else {
while dst < end {
(dst as *mut u64).write_unaligned((src as *const u64).read_unaligned());
src = src.add(8);
dst = dst.add(8);
}
}
}
#[cfg(not(any(target_arch = "aarch64", target_arch = "x86_64")))]
{
while dst < end {
(dst as *mut u64).write_unaligned((src as *const u64).read_unaligned());
src = src.add(8);
dst = dst.add(8);
}
}
} else if dist >= 8 {
(dst as *mut u64).write_unaligned((src as *const u64).read_unaligned());
src = src.add(8);
dst = dst.add(8);
(dst as *mut u64).write_unaligned((src as *const u64).read_unaligned());
src = src.add(8);
dst = dst.add(8);
(dst as *mut u64).write_unaligned((src as *const u64).read_unaligned());
src = src.add(8);
dst = dst.add(8);
(dst as *mut u64).write_unaligned((src as *const u64).read_unaligned());
src = src.add(8);
dst = dst.add(8);
(dst as *mut u64).write_unaligned((src as *const u64).read_unaligned());
src = src.add(8);
dst = dst.add(8);
while dst < end {
(dst as *mut u64).write_unaligned((src as *const u64).read_unaligned());
src = src.add(8);
dst = dst.add(8);
}
} else if dist == 1 {
let byte = *src;
#[cfg(target_arch = "aarch64")]
{
if len >= 32 {
let pattern = vdupq_n_u8(byte);
while dst.add(32) <= end {
vst1q_u8(dst, pattern);
vst1q_u8(dst.add(16), pattern);
dst = dst.add(32);
}
while dst.add(16) <= end {
vst1q_u8(dst, pattern);
dst = dst.add(16);
}
}
}
#[cfg(target_arch = "x86_64")]
{
if len >= 64 && is_x86_feature_detected!("avx2") {
fill_byte_avx2(dst, end, byte);
return out_pos + len;
}
}
let v = 0x0101010101010101u64 * (byte as u64);
while dst < end {
(dst as *mut u64).write_unaligned(v);
dst = dst.add(8);
}
} else {
(dst as *mut u64).write_unaligned((src as *const u64).read_unaligned());
src = src.add(dist);
dst = dst.add(dist);
(dst as *mut u64).write_unaligned((src as *const u64).read_unaligned());
src = src.add(dist);
dst = dst.add(dist);
(dst as *mut u64).write_unaligned((src as *const u64).read_unaligned());
src = src.add(dist);
dst = dst.add(dist);
(dst as *mut u64).write_unaligned((src as *const u64).read_unaligned());
src = src.add(dist);
dst = dst.add(dist);
while dst < end {
(dst as *mut u64).write_unaligned((src as *const u64).read_unaligned());
src = src.add(dist);
dst = dst.add(dist);
}
}
}
out_pos + len
}
#[inline(always)]
fn copy_match_safe(output: &mut [u8], out_pos: usize, distance: u32, length: u32) -> usize {
let dist = distance as usize;
let len = length as usize;
let src_start = out_pos - dist;
if dist >= len {
output.copy_within(src_start..src_start + len, out_pos);
} else if dist == 1 {
let byte = output[src_start];
for i in 0..len {
output[out_pos + i] = byte;
}
} else {
for i in 0..len {
output[out_pos + i] = output[src_start + (i % dist)];
}
}
out_pos + len
}
fn decode_huffman_cf_vector(
bits: &mut Bits,
output: &mut [u8],
mut out_pos: usize,
litlen: &LitLenTable,
dist_table: &DistTable,
vector_table: &crate::decompress::inflate::vector_huffman::VectorTable,
) -> Result<usize> {
const FASTLOOP_MARGIN: usize = 320;
while out_pos + FASTLOOP_MARGIN <= output.len() && bits.pos < bits.data.len() {
if bits.available() < 32 {
bits.refill();
}
let (symbols, count, bits_count) =
crate::decompress::inflate::vector_huffman::decode_multi_literals(
bits.peek(),
&vector_table.table,
);
if count > 0 {
output[out_pos..(out_pos + count)].copy_from_slice(&symbols[..count]);
out_pos += count;
bits.consume(bits_count);
continue;
}
let saved_bitbuf = bits.peek();
let mut entry = litlen.lookup(saved_bitbuf);
bits.consume_entry(entry.raw());
if (entry.raw() as i32) < 0 {
output[out_pos] = entry.literal_value();
out_pos += 1;
continue;
}
if entry.is_exceptional() {
if entry.is_end_of_block() {
return Ok(out_pos);
}
let sub_saved = bits.peek();
entry = litlen.lookup_subtable(entry, saved_bitbuf);
bits.consume_entry(entry.raw());
if entry.is_end_of_block() {
return Ok(out_pos);
}
if (entry.raw() as i32) < 0 {
output[out_pos] = entry.literal_value();
out_pos += 1;
continue;
}
let length_val = entry.decode_length(sub_saved);
out_pos = decode_huffman_match(bits, output, out_pos, length_val, dist_table)?;
} else {
let length_val = entry.decode_length(saved_bitbuf);
out_pos = decode_huffman_match(bits, output, out_pos, length_val, dist_table)?;
}
}
decode_huffman_cf(bits, output, out_pos, litlen, dist_table)
}
fn decode_huffman_match(
bits: &mut Bits,
output: &mut [u8],
mut out_pos: usize,
length: u32,
dist_table: &DistTable,
) -> Result<usize> {
bits.refill();
let dist_saved = bits.peek();
let mut dist_entry = dist_table.lookup(dist_saved);
if dist_entry.is_subtable_ptr() {
bits.consume(DistTable::TABLE_BITS as u32);
dist_entry = dist_table.lookup_subtable(dist_entry, dist_saved);
}
let dist_extra_saved = bits.peek();
bits.consume_entry(dist_entry.raw());
let distance = dist_entry.decode_distance(dist_extra_saved);
if distance == 0 || distance as usize > out_pos {
return Err(Error::new(
ErrorKind::InvalidData,
format!("Invalid distance {} at pos {}", distance, out_pos),
));
}
out_pos = copy_match_fast(output, out_pos, distance, length);
bits.refill();
Ok(out_pos)
}
#[cfg(feature = "debug_decode")]
use std::io::Write;
#[cfg(feature = "debug_decode")]
thread_local! {
static DEBUG_LOG: std::cell::RefCell<Option<std::fs::File>> = const { std::cell::RefCell::new(None) };
static DEBUG_POS: std::cell::Cell<usize> = std::cell::Cell::new(
std::env::var("DEBUG_POS").ok().and_then(|s| s.parse().ok()).unwrap_or(124088477)
);
}
#[cfg(feature = "debug_decode")]
macro_rules! debug_write {
($out_pos:expr, $($arg:tt)*) => {
DEBUG_POS.with(|pos| {
let target = pos.get();
if $out_pos >= target.saturating_sub(200) && $out_pos <= target + 200 {
DEBUG_LOG.with(|log| {
let mut log = log.borrow_mut();
if log.is_none() {
*log = std::fs::File::create("/tmp/decode_debug.log").ok();
}
if let Some(ref mut f) = *log {
let _ = writeln!(f, $($arg)*);
}
});
}
});
};
}
#[inline(never)]
fn decode_huffman_libdeflate_style(
bits: &mut Bits,
output: &mut [u8],
mut out_pos: usize,
litlen: &LitLenTable,
dist: &DistTable,
) -> Result<usize> {
const FASTLOOP_MARGIN: usize = 320;
const LITLEN_TABLEMASK: u64 = (1u64 << LitLenTable::TABLE_BITS) - 1;
let out_ptr = output.as_mut_ptr();
let out_end = output.len();
let litlen_ptr = litlen.entries_ptr();
let mut bitbuf = bits.bitbuf;
let mut bitsleft = bits.bitsleft;
let mut in_pos = bits.pos;
let in_data = bits.data;
let in_ptr = in_data.as_ptr();
let in_fastloop_end = in_data.len().saturating_sub(32);
macro_rules! refill_branchless_fast {
() => {
unsafe {
let bits_u8 = bitsleft as u8;
let word = (in_ptr.add(in_pos) as *const u64).read_unaligned();
let word = u64::from_le(word);
bitbuf |= word << bits_u8;
in_pos += (7 - ((bits_u8 >> 3) & 7)) as usize;
bitsleft = (bits_u8 as u32) | 56;
}
};
}
macro_rules! refill_branchless {
() => {
if in_pos + 8 <= in_data.len() {
refill_branchless_fast!();
} else {
let mut bits_u8 = bitsleft as u8;
if bits_u8 > 64 {
bits_u8 = 0;
bitbuf = 0;
}
while bits_u8 <= 56 && in_pos < in_data.len() {
bitbuf |= (in_data[in_pos] as u64) << bits_u8;
in_pos += 1;
bits_u8 = bits_u8.wrapping_add(8);
}
bitsleft = bits_u8 as u32;
}
};
}
macro_rules! lookup {
() => {
unsafe { (*litlen_ptr.add((bitbuf & LITLEN_TABLEMASK) as usize)).raw() }
};
}
macro_rules! consume {
($entry:expr) => {{
let saved = bitbuf;
bitbuf >>= $entry as u8;
bitsleft = bitsleft.wrapping_sub($entry);
saved
}};
}
macro_rules! huffdec {
($entry:expr) => {{
bitbuf >>= $entry as u8;
bitsleft = bitsleft.wrapping_sub($entry);
$entry = lookup!();
}};
}
refill_branchless!();
let mut entry = lookup!();
const REFILL_THRESHOLD: u8 = if LitLenTable::TABLE_BITS * 4 <= 56 {
LitLenTable::TABLE_BITS * 4
} else {
56
};
while in_pos < in_fastloop_end && out_pos + FASTLOOP_MARGIN <= out_end {
if (bitsleft as u8) < REFILL_THRESHOLD {
refill_branchless_fast!();
}
let saved_bitbuf = consume!(entry);
if (entry as i32) < 0 {
let lit1 = (entry >> 16) as u8;
entry = lookup!();
if (entry as i32) < 0 {
let lit2 = (entry >> 16) as u8;
huffdec!(entry);
if (entry as i32) < 0 {
let lit3 = (entry >> 16) as u8;
if LitLenTable::TABLE_BITS > 13 && (bitsleft as u8) < 32 {
refill_branchless_fast!();
}
huffdec!(entry);
if (entry as i32) < 0 {
let lit4 = (entry >> 16) as u8;
consume!(entry);
refill_branchless_fast!();
entry = lookup!();
if (entry as i32) < 0 {
let lit5 = (entry >> 16) as u8;
huffdec!(entry);
if (entry as i32) < 0 {
let lit6 = (entry >> 16) as u8;
huffdec!(entry);
if (entry as i32) < 0 {
let lit7 = (entry >> 16) as u8;
consume!(entry);
refill_branchless_fast!();
entry = lookup!();
if (entry as i32) < 0 {
let lit8 = (entry >> 16) as u8;
huffdec!(entry);
let packed = (lit1 as u64)
| ((lit2 as u64) << 8)
| ((lit3 as u64) << 16)
| ((lit4 as u64) << 24)
| ((lit5 as u64) << 32)
| ((lit6 as u64) << 40)
| ((lit7 as u64) << 48)
| ((lit8 as u64) << 56);
unsafe {
(out_ptr.add(out_pos) as *mut u64)
.write_unaligned(packed);
}
out_pos += 8;
continue;
}
let packed = (lit1 as u64)
| ((lit2 as u64) << 8)
| ((lit3 as u64) << 16)
| ((lit4 as u64) << 24)
| ((lit5 as u64) << 32)
| ((lit6 as u64) << 40)
| ((lit7 as u64) << 48);
unsafe {
(out_ptr.add(out_pos) as *mut u64).write_unaligned(packed);
}
out_pos += 7;
continue;
}
let packed = (lit1 as u64)
| ((lit2 as u64) << 8)
| ((lit3 as u64) << 16)
| ((lit4 as u64) << 24)
| ((lit5 as u64) << 32)
| ((lit6 as u64) << 40);
unsafe {
(out_ptr.add(out_pos) as *mut u64).write_unaligned(packed);
}
out_pos += 6;
refill_branchless_fast!();
continue;
}
let packed = (lit1 as u64)
| ((lit2 as u64) << 8)
| ((lit3 as u64) << 16)
| ((lit4 as u64) << 24)
| ((lit5 as u64) << 32);
unsafe {
(out_ptr.add(out_pos) as *mut u64).write_unaligned(packed);
}
out_pos += 5;
refill_branchless_fast!();
continue;
}
let packed = (lit1 as u32)
| ((lit2 as u32) << 8)
| ((lit3 as u32) << 16)
| ((lit4 as u32) << 24);
unsafe {
(out_ptr.add(out_pos) as *mut u32).write_unaligned(packed);
}
out_pos += 4;
continue;
}
let packed = (lit1 as u32) | ((lit2 as u32) << 8) | ((lit3 as u32) << 16);
unsafe {
(out_ptr.add(out_pos) as *mut u32).write_unaligned(packed);
}
out_pos += 3;
if (bitsleft as u8) < 32 {
refill_branchless_fast!();
}
continue;
}
let packed = (lit1 as u16) | ((lit2 as u16) << 8);
unsafe {
(out_ptr.add(out_pos) as *mut u16).write_unaligned(packed);
}
out_pos += 2;
if (bitsleft as u8) < 32 {
refill_branchless_fast!();
}
continue;
}
unsafe {
*out_ptr.add(out_pos) = lit1;
}
out_pos += 1;
if (bitsleft as u8) < 32 {
refill_branchless_fast!();
}
continue;
}
if (entry & 0x8000) != 0 {
if (entry & 0x2000) != 0 {
bits.bitbuf = bitbuf;
bits.bitsleft = bitsleft;
bits.pos = in_pos;
return Ok(out_pos);
}
let subtable_start = (entry >> 16) as usize;
let subtable_bits = ((entry >> 8) & 0x3F) as u64;
let sub_idx = (bitbuf & ((1u64 << subtable_bits) - 1)) as usize;
entry = unsafe { (*litlen_ptr.add(subtable_start + sub_idx)).raw() };
let saved_sub = consume!(entry);
if (entry as i32) < 0 {
let lit = ((entry >> 16) & 0xFF) as u8;
refill_branchless_fast!();
entry = lookup!();
unsafe {
*out_ptr.add(out_pos) = lit;
}
out_pos += 1;
continue;
}
if (entry & 0x2000) != 0 {
bits.bitbuf = bitbuf;
bits.bitsleft = bitsleft;
bits.pos = in_pos;
return Ok(out_pos);
}
let length = (entry >> 16)
+ (extract_bits(saved_sub, (entry as u8) as u32) >> ((entry >> 8) as u8)) as u32;
refill_branchless_fast!();
let mut dist_entry = dist.lookup(bitbuf);
if dist_entry.is_subtable_ptr() {
bitbuf >>= DistTable::TABLE_BITS;
bitsleft = bitsleft.wrapping_sub(DistTable::TABLE_BITS as u32);
dist_entry = dist.lookup_subtable_direct(dist_entry, bitbuf);
}
let dist_extra_saved = bitbuf;
let dist_raw = dist_entry.raw();
bitbuf >>= dist_raw as u8;
bitsleft = bitsleft.wrapping_sub(dist_raw);
let distance = (dist_raw >> 16)
+ (extract_bits(dist_extra_saved, (dist_raw as u8) as u32)
>> ((dist_raw >> 8) as u8)) as u32;
if distance == 0 || distance as usize > out_pos {
bits.bitbuf = bitbuf;
bits.bitsleft = bitsleft;
bits.pos = in_pos;
return Err(Error::new(ErrorKind::InvalidData, "Invalid distance"));
}
refill_branchless_fast!();
entry = lookup!();
out_pos = copy_match_fast(output, out_pos, distance, length);
continue;
}
let mut dist_entry = dist.lookup(bitbuf);
let length = (entry >> 16)
+ (extract_bits(saved_bitbuf, (entry as u8) as u32) >> ((entry >> 8) as u8)) as u32;
if (bitsleft as u8) < 32 {
refill_branchless_fast!();
}
if dist_entry.is_subtable_ptr() {
bitbuf >>= DistTable::TABLE_BITS;
bitsleft = bitsleft.wrapping_sub(DistTable::TABLE_BITS as u32);
dist_entry = dist.lookup_subtable_direct(dist_entry, bitbuf);
}
let dist_extra_saved = bitbuf;
let dist_raw = dist_entry.raw();
bitbuf >>= dist_raw as u8;
bitsleft = bitsleft.wrapping_sub(dist_raw);
let distance = (dist_raw >> 16)
+ (extract_bits(dist_extra_saved, (dist_raw as u8) as u32) >> ((dist_raw >> 8) as u8))
as u32;
if distance == 0 || distance as usize > out_pos {
bits.bitbuf = bitbuf;
bits.bitsleft = bitsleft;
bits.pos = in_pos;
return Err(Error::new(ErrorKind::InvalidData, "Invalid distance"));
}
refill_branchless_fast!();
entry = lookup!();
out_pos = copy_match_fast(output, out_pos, distance, length);
}
bits.bitbuf = bitbuf;
bits.bitsleft = bitsleft & 0xFF; bits.pos = in_pos;
decode_huffman_cf(bits, output, out_pos, litlen, dist)
}
#[inline(never)]
fn decode_huffman_optimized(
bits: &mut Bits,
output: &mut [u8],
mut out_pos: usize,
litlen: &LitLenTable,
dist: &DistTable,
) -> Result<usize> {
const FASTLOOP_MARGIN: usize = 320;
const LITLEN_TABLEMASK: u64 = (1u64 << LitLenTable::TABLE_BITS) - 1;
let out_ptr = output.as_mut_ptr();
let out_end = output.len();
let litlen_ptr = litlen.entries_ptr();
let mut bitbuf = bits.bitbuf;
let mut bitsleft = bits.bitsleft;
let mut in_pos = bits.pos;
let in_data = bits.data;
macro_rules! refill_branchless {
() => {
let mut bits_u8 = bitsleft as u8;
if bits_u8 > 64 {
bits_u8 = 0;
bitbuf = 0;
}
if in_pos + 8 <= in_data.len() {
let word = unsafe { (in_data.as_ptr().add(in_pos) as *const u64).read_unaligned() };
let word = u64::from_le(word);
bitbuf |= word << bits_u8;
in_pos += (7 - ((bits_u8 >> 3) & 7)) as usize;
bitsleft = (bits_u8 as u32) | 56;
} else {
while bits_u8 <= 56 && in_pos < in_data.len() {
bitbuf |= (in_data[in_pos] as u64) << bits_u8;
in_pos += 1;
bits_u8 = bits_u8.wrapping_add(8);
}
bitsleft = bits_u8 as u32;
}
};
}
macro_rules! lookup {
() => {
unsafe { (*litlen_ptr.add((bitbuf & LITLEN_TABLEMASK) as usize)).raw() }
};
}
macro_rules! bzhi {
($val:expr, $bits:expr) => {
bzhi_u64($val, $bits)
};
}
macro_rules! consume {
($entry:expr) => {{
let saved = bitbuf;
bitbuf >>= $entry as u8;
bitsleft = bitsleft.wrapping_sub($entry);
saved
}};
}
macro_rules! huffdec {
($entry:expr) => {{
bitbuf >>= $entry as u8;
bitsleft = bitsleft.wrapping_sub($entry);
$entry = lookup!();
}};
}
refill_branchless!();
let mut entry = lookup!();
const REFILL_THRESHOLD2: u8 = if LitLenTable::TABLE_BITS * 4 <= 56 {
LitLenTable::TABLE_BITS * 4
} else {
56
};
while out_pos + FASTLOOP_MARGIN <= out_end {
if (bitsleft as u8) < REFILL_THRESHOLD2 {
refill_branchless!();
}
let saved_bitbuf = consume!(entry);
if (entry as i32) < 0 {
let lit1 = (entry >> 16) as u8;
entry = lookup!();
if (entry as i32) < 0 {
let lit2 = (entry >> 16) as u8;
huffdec!(entry);
if (entry as i32) < 0 {
let lit3 = (entry >> 16) as u8;
if LitLenTable::TABLE_BITS > 13 && (bitsleft as u8) < 32 {
refill_branchless!();
}
huffdec!(entry);
if (entry as i32) < 0 {
let lit4 = (entry >> 16) as u8;
consume!(entry);
refill_branchless!();
entry = lookup!();
let packed = (lit1 as u32)
| ((lit2 as u32) << 8)
| ((lit3 as u32) << 16)
| ((lit4 as u32) << 24);
unsafe { (out_ptr.add(out_pos) as *mut u32).write_unaligned(packed) };
out_pos += 4;
continue;
}
let packed = (lit1 as u32) | ((lit2 as u32) << 8) | ((lit3 as u32) << 16);
unsafe { (out_ptr.add(out_pos) as *mut u32).write_unaligned(packed) };
out_pos += 3;
if (bitsleft as u8) < 32 {
refill_branchless!();
}
continue;
}
let packed = (lit1 as u16) | ((lit2 as u16) << 8);
unsafe { (out_ptr.add(out_pos) as *mut u16).write_unaligned(packed) };
out_pos += 2;
if (bitsleft as u8) < 32 {
refill_branchless!();
}
continue;
}
unsafe { *out_ptr.add(out_pos) = lit1 };
out_pos += 1;
if (bitsleft as u8) < 32 {
refill_branchless!();
}
continue;
}
if (entry & 0x8000) != 0 {
if (entry & 0x2000) != 0 {
bits.bitbuf = bitbuf;
bits.bitsleft = bitsleft;
bits.pos = in_pos;
return Ok(out_pos);
}
let subtable_start = (entry >> 16) as usize;
let subtable_bits = (entry >> 8) & 0x3F;
let sub_idx = bzhi!(bitbuf, subtable_bits) as usize;
entry = unsafe { (*litlen_ptr.add(subtable_start + sub_idx)).raw() };
let saved_sub = consume!(entry);
if (entry as i32) < 0 {
let lit = ((entry >> 16) & 0xFF) as u8;
refill_branchless!();
entry = lookup!();
unsafe { *out_ptr.add(out_pos) = lit };
out_pos += 1;
continue;
}
if (entry & 0x2000) != 0 {
bits.bitbuf = bitbuf;
bits.bitsleft = bitsleft;
bits.pos = in_pos;
return Ok(out_pos);
}
let length = (entry >> 16)
+ (bzhi!(saved_sub, (entry as u8) as u32) >> ((entry >> 8) as u8)) as u32;
refill_branchless!();
let mut dist_entry = dist.lookup(bitbuf);
if dist_entry.is_subtable_ptr() {
bitbuf >>= DistTable::TABLE_BITS;
bitsleft = bitsleft.wrapping_sub(DistTable::TABLE_BITS as u32);
dist_entry = dist.lookup_subtable_direct(dist_entry, bitbuf);
}
let dist_extra_saved = bitbuf;
let dist_raw = dist_entry.raw();
bitbuf >>= dist_raw as u8;
bitsleft = bitsleft.wrapping_sub(dist_raw);
let distance = (dist_raw >> 16)
+ (bzhi!(dist_extra_saved, (dist_raw as u8) as u32) >> ((dist_raw >> 8) as u8))
as u32;
if distance == 0 || distance as usize > out_pos {
bits.bitbuf = bitbuf;
bits.bitsleft = bitsleft;
bits.pos = in_pos;
return Err(Error::new(ErrorKind::InvalidData, "Invalid distance"));
}
refill_branchless!();
entry = lookup!();
out_pos = copy_match_fast(output, out_pos, distance, length);
continue;
}
let length = (entry >> 16)
+ (bzhi!(saved_bitbuf, (entry as u8) as u32) >> ((entry >> 8) as u8)) as u32;
let mut dist_entry = dist.lookup(bitbuf);
if (bitsleft as u8) < 32 {
refill_branchless!();
}
if dist_entry.is_subtable_ptr() {
bitbuf >>= DistTable::TABLE_BITS;
bitsleft = bitsleft.wrapping_sub(DistTable::TABLE_BITS as u32);
dist_entry = dist.lookup_subtable_direct(dist_entry, bitbuf);
}
let dist_extra_saved = bitbuf;
let dist_raw = dist_entry.raw();
bitbuf >>= dist_raw as u8;
bitsleft = bitsleft.wrapping_sub(dist_raw);
let distance = (dist_raw >> 16)
+ (bzhi!(dist_extra_saved, (dist_raw as u8) as u32) >> ((dist_raw >> 8) as u8)) as u32;
if distance == 0 || distance as usize > out_pos {
bits.bitbuf = bitbuf;
bits.bitsleft = bitsleft;
bits.pos = in_pos;
return Err(Error::new(ErrorKind::InvalidData, "Invalid distance"));
}
refill_branchless!();
entry = lookup!();
out_pos = copy_match_fast(output, out_pos, distance, length);
}
bits.bitbuf = bitbuf;
bits.bitsleft = bitsleft & 0xFF;
bits.pos = in_pos;
decode_huffman_cf(bits, output, out_pos, litlen, dist)
}
fn decode_huffman_with_dispatch(
bits: &mut Bits,
output: &mut [u8],
out_pos: usize,
litlen: &LitLenTable,
dist: &DistTable,
) -> Result<usize> {
decode_huffman_libdeflate_style(bits, output, out_pos, litlen, dist)
}
fn decode_huffman_flat(
bits: &mut Bits,
output: &mut [u8],
mut out_pos: usize,
litlen: &LitLenTable,
dist: &DistTable,
) -> Result<usize> {
const FASTLOOP_MARGIN: usize = 320;
let out_ptr = output.as_mut_ptr();
let out_end = output.len();
'fastloop: while out_pos + FASTLOOP_MARGIN <= out_end {
bits.refill();
'literals: loop {
let saved_bitbuf = bits.peek();
let entry = litlen.lookup(saved_bitbuf);
bits.consume_entry(entry.raw());
if (entry.raw() as i32) >= 0 {
if entry.is_exceptional() {
if entry.is_end_of_block() {
return Ok(out_pos);
}
let sub_entry = litlen.lookup_subtable(entry, saved_bitbuf);
let sub_saved = bits.peek();
bits.consume_entry(sub_entry.raw());
if sub_entry.is_end_of_block() {
return Ok(out_pos);
}
if (sub_entry.raw() as i32) < 0 {
unsafe {
*out_ptr.add(out_pos) = sub_entry.literal_value();
}
out_pos += 1;
continue 'literals;
}
let length = sub_entry.decode_length(sub_saved);
out_pos = decode_match_inline(bits, output, out_pos, length, dist)?;
continue 'fastloop;
}
let length = entry.decode_length(saved_bitbuf);
out_pos = decode_match_inline(bits, output, out_pos, length, dist)?;
continue 'fastloop;
}
unsafe {
*out_ptr.add(out_pos) = entry.literal_value();
}
out_pos += 1;
if bits.available() < 32 {
continue 'fastloop;
}
}
}
decode_huffman_cf(bits, output, out_pos, litlen, dist)
}
#[inline(always)]
fn decode_match_inline(
bits: &mut Bits,
output: &mut [u8],
out_pos: usize,
length: u32,
dist: &DistTable,
) -> Result<usize> {
bits.refill();
let dist_saved = bits.peek();
let mut dist_entry = dist.lookup(dist_saved);
if dist_entry.is_subtable_ptr() {
bits.consume(DistTable::TABLE_BITS as u32);
dist_entry = dist.lookup_subtable(dist_entry, dist_saved);
}
let dist_extra_saved = bits.peek();
bits.consume_entry(dist_entry.raw());
let distance = dist_entry.decode_distance(dist_extra_saved);
if distance == 0 || distance as usize > out_pos {
return Err(Error::new(ErrorKind::InvalidData, "Invalid distance"));
}
Ok(copy_match_fast(output, out_pos, distance, length))
}
pub fn decode_huffman_cf_pub(
bits: &mut Bits,
output: &mut [u8],
out_pos: usize,
litlen: &LitLenTable,
dist: &DistTable,
) -> Result<usize> {
decode_huffman_cf(bits, output, out_pos, litlen, dist)
}
pub fn decode_stored_pub(bits: &mut Bits, output: &mut [u8], out_pos: usize) -> Result<usize> {
decode_stored(bits, output, out_pos)
}
pub fn decode_fixed_pub(bits: &mut Bits, output: &mut [u8], out_pos: usize) -> Result<usize> {
decode_fixed(bits, output, out_pos)
}
pub fn decode_dynamic_pub(bits: &mut Bits, output: &mut [u8], out_pos: usize) -> Result<usize> {
decode_dynamic(bits, output, out_pos)
}
fn decode_huffman_cf(
bits: &mut Bits,
output: &mut [u8],
mut out_pos: usize,
litlen: &LitLenTable,
dist: &DistTable,
) -> Result<usize> {
const FASTLOOP_MARGIN: usize = 320;
bits.refill();
let mut entry = litlen.lookup(bits.peek());
while out_pos + FASTLOOP_MARGIN <= output.len() {
let saved_bitbuf = bits.peek();
bits.consume_entry(entry.raw());
if (entry.raw() as i32) < 0 {
let out_ptr = output.as_mut_ptr();
let lit1 = entry.literal_value();
entry = litlen.lookup(bits.peek());
unsafe {
*out_ptr.add(out_pos) = lit1;
}
out_pos += 1;
if (entry.raw() as i32) < 0 {
bits.consume_entry(entry.raw());
let lit2 = entry.literal_value();
entry = litlen.lookup(bits.peek());
unsafe {
*out_ptr.add(out_pos) = lit2;
}
out_pos += 1;
if (entry.raw() as i32) < 0 {
bits.consume_entry(entry.raw());
let lit3 = entry.literal_value();
entry = litlen.lookup(bits.peek());
unsafe {
*out_ptr.add(out_pos) = lit3;
}
out_pos += 1;
if (entry.raw() as i32) < 0 {
bits.consume_entry(entry.raw());
let lit4 = entry.literal_value();
entry = litlen.lookup(bits.peek());
unsafe {
*out_ptr.add(out_pos) = lit4;
}
out_pos += 1;
if (entry.raw() as i32) < 0 {
bits.consume_entry(entry.raw());
let lit5 = entry.literal_value();
bits.refill();
entry = litlen.lookup(bits.peek());
unsafe {
*out_ptr.add(out_pos) = lit5;
}
out_pos += 1;
continue;
}
}
if bits.available() < 32 {
bits.refill();
}
continue;
}
if bits.available() < 32 {
bits.refill();
}
continue;
}
if bits.available() < 32 {
bits.refill();
}
continue;
}
if entry.is_exceptional() {
if entry.is_end_of_block() {
return Ok(out_pos);
}
entry = litlen.lookup_subtable(entry, saved_bitbuf);
let saved_sub = bits.peek();
bits.consume_entry(entry.raw());
if (entry.raw() as i32) < 0 {
let lit = entry.literal_value();
bits.refill();
entry = litlen.lookup(bits.peek());
unsafe {
*output.as_mut_ptr().add(out_pos) = lit;
}
out_pos += 1;
continue; }
if entry.is_end_of_block() {
return Ok(out_pos);
}
let length = entry.decode_length(saved_sub);
bits.refill();
let dist_saved = bits.peek();
let mut dist_entry = dist.lookup(dist_saved);
let is_subtable = dist_entry.is_subtable_ptr();
if is_subtable {
bits.consume(DistTable::TABLE_BITS as u32);
dist_entry = dist.lookup_subtable(dist_entry, dist_saved);
}
let dist_extra_saved = bits.peek();
bits.consume_entry(dist_entry.raw());
let distance = dist_entry.decode_distance(dist_extra_saved);
if distance == 0 || distance as usize > out_pos {
return Err(Error::new(
ErrorKind::InvalidData,
format!(
"Invalid distance {} at pos {} from subtable",
distance, out_pos
),
));
}
bits.refill();
entry = litlen.lookup(bits.peek());
out_pos = copy_match_fast(output, out_pos, distance, length);
continue; }
let length = entry.decode_length(saved_bitbuf);
bits.refill();
let dist_saved = bits.peek();
let mut dist_entry = dist.lookup(dist_saved);
if dist_entry.is_subtable_ptr() {
bits.consume(DistTable::TABLE_BITS as u32);
dist_entry = dist.lookup_subtable(dist_entry, dist_saved);
}
let dist_extra_saved = bits.peek();
bits.consume_entry(dist_entry.raw());
let distance = dist_entry.decode_distance(dist_extra_saved);
if distance == 0 || distance as usize > out_pos {
return Err(Error::new(
ErrorKind::InvalidData,
format!("Invalid distance {} at pos {}", distance, out_pos),
));
}
bits.refill();
entry = litlen.lookup(bits.peek());
out_pos = copy_match_fast(output, out_pos, distance, length);
}
loop {
bits.refill();
let mut saved_bitbuf = bits.peek();
entry = litlen.lookup(saved_bitbuf);
if entry.is_subtable_ptr() {
bits.consume(LitLenTable::TABLE_BITS as u32);
entry = litlen.lookup_subtable(entry, saved_bitbuf);
saved_bitbuf = bits.peek();
bits.consume_entry(entry.raw()); } else {
bits.consume_entry(entry.raw()); }
if (entry.raw() as i32) < 0 {
if out_pos >= output.len() {
return Err(Error::new(
ErrorKind::WriteZero,
format!(
"Generic literal overflow: out_pos={} output.len={}",
out_pos,
output.len()
),
));
}
output[out_pos] = entry.literal_value();
out_pos += 1;
continue;
}
if entry.is_end_of_block() {
return Ok(out_pos);
}
let length = entry.decode_length(saved_bitbuf);
bits.refill();
let dist_saved = bits.peek();
let mut dist_entry = dist.lookup(dist_saved);
if dist_entry.is_subtable_ptr() {
bits.consume(DistTable::TABLE_BITS as u32);
dist_entry = dist.lookup_subtable(dist_entry, dist_saved);
}
let dist_extra_saved = bits.peek();
bits.consume_entry(dist_entry.raw());
let distance = dist_entry.decode_distance(dist_extra_saved);
if distance == 0 || distance as usize > out_pos {
return Err(Error::new(
ErrorKind::InvalidData,
format!("Invalid distance {} at pos {}", distance, out_pos),
));
}
if out_pos + length as usize > output.len() {
return Err(Error::new(
ErrorKind::WriteZero,
format!(
"Generic match overflow: out_pos={} length={} output.len={}",
out_pos,
length,
output.len()
),
));
}
out_pos = copy_match_safe(output, out_pos, distance, length);
}
}
fn decode_stored(bits: &mut Bits, output: &mut [u8], mut out_pos: usize) -> Result<usize> {
bits.align_to_byte();
let len = bits.read_u16();
let nlen = bits.read_u16();
if len != !nlen {
eprintln!(
"Invalid stored block: len={:#x}, nlen={:#x}, !nlen={:#x}, pos={}, out_pos={}",
len, nlen, !nlen, bits.pos, out_pos
);
return Err(Error::new(ErrorKind::InvalidData, "Invalid stored block"));
}
let len = len as usize;
if len == 0 {
return Ok(out_pos);
}
if out_pos + len > output.len() {
return Err(Error::new(ErrorKind::WriteZero, "Output full"));
}
let _bytes_in_buffer = bits.available() as usize / 8;
let mut remaining = len;
while remaining > 0 && bits.available() >= 8 {
output[out_pos] = (bits.bitbuf & 0xFF) as u8;
bits.consume(8);
out_pos += 1;
remaining -= 1;
}
if remaining > 0 {
if bits.pos + remaining > bits.data.len() {
return Err(Error::new(
ErrorKind::UnexpectedEof,
"Truncated stored block",
));
}
output[out_pos..out_pos + remaining]
.copy_from_slice(&bits.data[bits.pos..bits.pos + remaining]);
bits.pos += remaining;
out_pos += remaining;
}
bits.bitbuf = 0;
bits.bitsleft = 0;
Ok(out_pos)
}
fn get_fixed_double_lit_cache(
) -> &'static crate::decompress::inflate::double_literal::DoubleLitCache {
use std::sync::OnceLock;
static CACHE: OnceLock<crate::decompress::inflate::double_literal::DoubleLitCache> =
OnceLock::new();
CACHE.get_or_init(|| {
let tables = crate::decompress::inflate::libdeflate_decode::get_fixed_tables();
crate::decompress::inflate::double_literal::DoubleLitCache::build(&tables.0)
})
}
fn get_fixed_specialized_decoder(
) -> &'static crate::decompress::inflate::specialized_decode::SpecializedDecoder {
use std::sync::OnceLock;
static DECODER: OnceLock<crate::decompress::inflate::specialized_decode::SpecializedDecoder> =
OnceLock::new();
DECODER.get_or_init(|| {
let mut litlen_lens = vec![0u8; 288];
litlen_lens[0..144].fill(8);
litlen_lens[144..256].fill(9);
litlen_lens[256..280].fill(7);
litlen_lens[280..288].fill(8);
let mut dist_lens = vec![0u8; 32];
dist_lens.fill(5);
crate::decompress::inflate::specialized_decode::SpecializedDecoder::build(
&litlen_lens,
&dist_lens,
)
.expect("Fixed Huffman should always build")
})
}
fn decode_fixed(bits: &mut Bits, output: &mut [u8], out_pos: usize) -> Result<usize> {
let (litlen, dist) = crate::decompress::inflate::libdeflate_decode::get_fixed_tables();
decode_huffman_libdeflate_style(bits, output, out_pos, litlen, dist)
}
fn decode_huffman_cf_double(
bits: &mut Bits,
output: &mut [u8],
mut out_pos: usize,
litlen: &LitLenTable,
dist: &DistTable,
double_cache: &crate::decompress::inflate::double_literal::DoubleLitCache,
) -> Result<usize> {
#![allow(unused_imports)]
use crate::decompress::inflate::double_literal::DoubleLitEntry;
const FASTLOOP_MARGIN: usize = 320;
while out_pos + FASTLOOP_MARGIN <= output.len() && bits.available() >= 16 {
let saved_bitbuf = bits.peek();
let double_entry = double_cache.lookup(saved_bitbuf);
if double_entry.is_literal() {
if double_entry.has_second() {
let out_ptr = output.as_mut_ptr();
unsafe {
*out_ptr.add(out_pos) = double_entry.symbol1();
*out_ptr.add(out_pos + 1) = double_entry.symbol2();
}
out_pos += 2;
bits.consume(double_entry.total_bits() as u32);
bits.refill();
continue;
} else {
let out_ptr = output.as_mut_ptr();
unsafe {
*out_ptr.add(out_pos) = double_entry.symbol1();
}
out_pos += 1;
bits.consume(double_entry.total_bits() as u32);
bits.refill();
continue;
}
}
let entry = litlen.lookup(saved_bitbuf);
bits.consume_entry(entry.raw());
if entry.is_exceptional() {
if entry.is_end_of_block() {
return Ok(out_pos);
}
let sub_saved = bits.peek();
let sub_entry = litlen.lookup_subtable(entry, saved_bitbuf);
bits.consume_entry(sub_entry.raw());
if sub_entry.is_end_of_block() {
return Ok(out_pos);
}
if (sub_entry.raw() as i32) < 0 {
let lit = sub_entry.literal_value();
let out_ptr = output.as_mut_ptr();
unsafe {
*out_ptr.add(out_pos) = lit;
}
out_pos += 1;
bits.refill();
continue;
}
let length = sub_entry.decode_length(sub_saved);
bits.refill();
let dist_saved = bits.peek();
let mut dist_entry = dist.lookup(dist_saved);
if dist_entry.is_subtable_ptr() {
bits.consume(DistTable::TABLE_BITS as u32);
dist_entry = dist.lookup_subtable(dist_entry, dist_saved);
}
let dist_extra_saved = bits.peek();
bits.consume_entry(dist_entry.raw());
let distance = dist_entry.decode_distance(dist_extra_saved);
if distance == 0 || distance as usize > out_pos {
return Err(Error::new(
ErrorKind::InvalidData,
format!("Invalid distance {} at pos {}", distance, out_pos),
));
}
bits.refill();
out_pos = copy_match_fast(output, out_pos, distance, length);
continue;
}
let length = entry.decode_length(saved_bitbuf);
bits.refill();
let dist_saved = bits.peek();
let mut dist_entry = dist.lookup(dist_saved);
if dist_entry.is_subtable_ptr() {
bits.consume(DistTable::TABLE_BITS as u32);
dist_entry = dist.lookup_subtable(dist_entry, dist_saved);
}
let dist_extra_saved = bits.peek();
bits.consume_entry(dist_entry.raw());
let distance = dist_entry.decode_distance(dist_extra_saved);
if distance == 0 || distance as usize > out_pos {
return Err(Error::new(
ErrorKind::InvalidData,
format!("Invalid distance {} at pos {}", distance, out_pos),
));
}
bits.refill();
out_pos = copy_match_fast(output, out_pos, distance, length);
}
decode_huffman_cf(bits, output, out_pos, litlen, dist)
}
fn decode_dynamic(bits: &mut Bits, output: &mut [u8], out_pos: usize) -> Result<usize> {
if bits.available() < 14 {
bits.refill();
}
let hlit = (bits.peek() & 0x1F) as usize + 257;
bits.consume(5);
let hdist = (bits.peek() & 0x1F) as usize + 1;
bits.consume(5);
let hclen = (bits.peek() & 0xF) as usize + 4;
bits.consume(4);
const CODE_LENGTH_ORDER: [usize; 19] = [
16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15,
];
let mut code_length_lengths = [0u8; 19];
for i in 0..hclen {
if bits.available() < 3 {
bits.refill();
}
code_length_lengths[CODE_LENGTH_ORDER[i]] = (bits.peek() & 0x7) as u8;
bits.consume(3);
}
let cl_table = build_code_length_table(&code_length_lengths)?;
let mut all_lengths = vec![0u8; hlit + hdist];
let mut i = 0;
while i < hlit + hdist {
if bits.available() < 15 {
bits.refill();
}
let entry = cl_table[(bits.peek() & 0x7F) as usize];
let symbol = (entry >> 8) as u8;
let len = (entry & 0xFF) as u8;
bits.consume(len as u32);
match symbol {
0..=15 => {
all_lengths[i] = symbol;
i += 1;
}
16 => {
if i == 0 {
return Err(Error::new(ErrorKind::InvalidData, "Invalid repeat"));
}
let repeat = 3 + (bits.peek() & 0x3) as usize;
bits.consume(2);
let val = all_lengths[i - 1];
for _ in 0..repeat {
if i >= hlit + hdist {
break;
}
all_lengths[i] = val;
i += 1;
}
}
17 => {
let repeat = 3 + (bits.peek() & 0x7) as usize;
bits.consume(3);
for _ in 0..repeat {
if i >= hlit + hdist {
break;
}
all_lengths[i] = 0;
i += 1;
}
}
18 => {
let repeat = 11 + (bits.peek() & 0x7F) as usize;
bits.consume(7);
for _ in 0..repeat {
if i >= hlit + hdist {
break;
}
all_lengths[i] = 0;
i += 1;
}
}
_ => {
return Err(Error::new(
ErrorKind::InvalidData,
"Invalid code length symbol",
))
}
}
}
let litlen_lengths = &all_lengths[..hlit];
let dist_lengths = &all_lengths[hlit..];
let fingerprint = crate::decompress::inflate::jit_decode::TableFingerprint::combined(
litlen_lengths,
dist_lengths,
);
SPEC_STATS.with(|s| s.borrow_mut().1 += 1);
decode_dynamic_fallback(
bits,
output,
out_pos,
litlen_lengths,
dist_lengths,
fingerprint,
)
}
fn decode_dynamic_fallback(
bits: &mut Bits,
output: &mut [u8],
out_pos: usize,
litlen_lengths: &[u8],
dist_lengths: &[u8],
fingerprint: TableFingerprint,
) -> Result<usize> {
#[cfg(feature = "profile")]
let ((litlen_table, dist_table), build_time) = {
use std::time::Instant;
TABLE_CACHE.with(|cache| {
let mut cache = cache.borrow_mut();
if let Some(tables) = cache.get(&fingerprint) {
CACHE_STATS.with(|s| s.borrow_mut().0 += 1); return (tables.clone(), 0u64);
}
CACHE_STATS.with(|s| s.borrow_mut().1 += 1);
let start = Instant::now();
let litlen = LitLenTable::build(litlen_lengths).expect("Invalid litlen table");
let dist = DistTable::build(dist_lengths).expect("Invalid dist table");
let build_nanos = start.elapsed().as_nanos() as u64;
let tables = (litlen, dist);
cache.insert(fingerprint, tables.clone());
(tables, build_nanos)
})
};
#[cfg(not(feature = "profile"))]
let (litlen_table, dist_table) = {
#[cfg(feature = "debug_decode")]
{
let litlen = LitLenTable::build(litlen_lengths).expect("Invalid litlen table");
let dist = DistTable::build(dist_lengths).expect("Invalid dist table");
(litlen, dist)
}
#[cfg(not(feature = "debug_decode"))]
TABLE_CACHE.with(|cache| {
let mut cache = cache.borrow_mut();
if let Some(tables) = cache.get(&fingerprint) {
CACHE_STATS.with(|s| s.borrow_mut().0 += 1); return tables.clone();
}
CACHE_STATS.with(|s| s.borrow_mut().1 += 1);
let litlen = LitLenTable::build(litlen_lengths).expect("Invalid litlen table");
let dist = DistTable::build(dist_lengths).expect("Invalid dist table");
let tables = (litlen, dist);
cache.insert(fingerprint, tables.clone());
tables
})
};
#[cfg(feature = "profile")]
if build_time > 0 {
TIMING_STATS.with(|s| {
let mut stats = s.borrow_mut();
stats.table_build_nanos += build_time;
stats.table_build_count += 1;
});
}
#[cfg(feature = "profile")]
let decode_start = std::time::Instant::now();
let result = decode_dynamic_speculative(bits, output, out_pos, &litlen_table, &dist_table);
#[cfg(feature = "profile")]
{
let decode_nanos = decode_start.elapsed().as_nanos() as u64;
TIMING_STATS.with(|s| {
let mut stats = s.borrow_mut();
stats.decode_nanos += decode_nanos;
stats.decode_count += 1;
});
}
result
}
#[inline(never)]
fn decode_with_specialized_tables(
bits: &mut Bits,
output: &mut [u8],
mut out_pos: usize,
spec: &crate::decompress::inflate::specialized_decode::SpecializedDecoder,
) -> Result<usize> {
#[allow(unused_imports)]
use crate::decompress::inflate::specialized_decode::SpecEntry;
const FASTLOOP_MARGIN: usize = 320;
let out_ptr = output.as_mut_ptr();
let out_end = output.len();
let in_data = bits.data;
let mut bitbuf = bits.bitbuf;
let mut bitsleft = bits.bitsleft;
let mut in_pos = bits.pos;
macro_rules! refill {
() => {
let mut bits_u8 = bitsleft as u8;
if bits_u8 > 64 {
bits_u8 = 0;
bitbuf = 0;
}
if in_pos + 8 <= in_data.len() {
let word = unsafe { (in_data.as_ptr().add(in_pos) as *const u64).read_unaligned() };
let word = u64::from_le(word);
bitbuf |= word << bits_u8;
in_pos += (7 - ((bits_u8 >> 3) & 7)) as usize;
bitsleft = (bits_u8 as u32) | 56;
} else {
while bits_u8 <= 56 && in_pos < in_data.len() {
bitbuf |= (in_data[in_pos] as u64) << bits_u8;
in_pos += 1;
bits_u8 = bits_u8.wrapping_add(8);
}
bitsleft = bits_u8 as u32;
}
};
}
refill!();
while out_pos + FASTLOOP_MARGIN <= out_end {
if (bitsleft as u8) < 48 {
refill!();
}
let mut entry = unsafe { *spec.litlen.get_unchecked((bitbuf & 0x7FF) as usize) };
let bits_used = entry.total_bits() as u32;
bitbuf >>= bits_used;
bitsleft = bitsleft.wrapping_sub(bits_used);
if entry.is_literal() {
if entry.is_double() {
unsafe {
let ptr = out_ptr.add(out_pos);
ptr.write(entry.lit1());
ptr.add(1).write(entry.lit2());
}
out_pos += 2;
} else {
unsafe {
*out_ptr.add(out_pos) = entry.literal_value();
}
out_pos += 1;
}
entry = unsafe { *spec.litlen.get_unchecked((bitbuf & 0x7FF) as usize) };
if entry.is_literal() {
let bits2 = entry.total_bits() as u32;
bitbuf >>= bits2;
bitsleft = bitsleft.wrapping_sub(bits2);
if entry.is_double() {
unsafe {
let ptr = out_ptr.add(out_pos);
ptr.write(entry.lit1());
ptr.add(1).write(entry.lit2());
}
out_pos += 2;
} else {
unsafe {
*out_ptr.add(out_pos) = entry.literal_value();
}
out_pos += 1;
}
if bitsleft >= 32 {
let next_entry =
unsafe { *spec.litlen.get_unchecked((bitbuf & 0x7FF) as usize) };
if next_entry.is_literal() {
let bits3 = next_entry.total_bits() as u32;
bitbuf >>= bits3;
bitsleft = bitsleft.wrapping_sub(bits3);
if next_entry.is_double() {
unsafe {
let ptr = out_ptr.add(out_pos);
ptr.write(next_entry.lit1());
ptr.add(1).write(next_entry.lit2());
}
out_pos += 2;
} else {
unsafe {
*out_ptr.add(out_pos) = next_entry.literal_value();
}
out_pos += 1;
}
}
}
refill!();
continue;
}
refill!();
continue;
}
if entry.is_eob() {
bits.bitbuf = bitbuf;
bits.bitsleft = bitsleft;
bits.pos = in_pos;
return Ok(out_pos);
}
let length_base = entry.length_base() as u32;
let extra = entry.extra_bits();
let length = if extra > 0 {
let extra_val = (bitbuf & ((1u64 << extra) - 1)) as u32;
bitbuf >>= extra;
bitsleft = bitsleft.wrapping_sub(extra as u32);
length_base + extra_val
} else {
length_base
};
refill!();
let mut dist_entry = spec.dist[(bitbuf & 0x7FF) as usize];
if unlikely(dist_entry.is_subtable()) {
let offset = dist_entry.subtable_offset() as usize;
let sub_bits = dist_entry.subtable_bits();
let idx = (bitbuf >> 11) & ((1 << sub_bits) - 1);
dist_entry = spec.dist[offset + idx as usize];
}
let dist_bits = dist_entry.total_bits() as u32;
bitbuf >>= dist_bits;
bitsleft = bitsleft.wrapping_sub(dist_bits);
let dist_base = dist_entry.symbol() as u32;
let dist_extra = dist_entry.extra_bits();
let distance = if dist_extra > 0 {
let extra_val = (bitbuf & ((1u64 << dist_extra) - 1)) as u32;
bitbuf >>= dist_extra;
bitsleft = bitsleft.wrapping_sub(dist_extra as u32);
dist_base + extra_val
} else {
dist_base
};
if distance == 0 || distance as usize > out_pos {
bits.bitbuf = bitbuf;
bits.bitsleft = bitsleft;
bits.pos = in_pos;
return Err(Error::new(ErrorKind::InvalidData, "Invalid distance"));
}
refill!();
out_pos = copy_match_fast(output, out_pos, distance, length);
}
bits.bitbuf = bitbuf;
bits.bitsleft = bitsleft & 0xFF;
bits.pos = in_pos;
decode_generic_with_spec(bits, output, out_pos, spec)
}
fn decode_generic_with_spec(
bits: &mut Bits,
output: &mut [u8],
mut out_pos: usize,
spec: &crate::decompress::inflate::specialized_decode::SpecializedDecoder,
) -> Result<usize> {
loop {
bits.refill();
let entry = spec.decode_symbol(bits.peek());
let bits_used = entry.total_bits() as u32;
bits.consume(bits_used);
if entry.is_eob() {
return Ok(out_pos);
}
if entry.is_literal() {
if out_pos + 2 > output.len() {
return Err(Error::new(ErrorKind::InvalidData, "Output overflow"));
}
if entry.is_double() {
output[out_pos] = entry.lit1();
output[out_pos + 1] = entry.lit2();
out_pos += 2;
} else {
output[out_pos] = entry.literal_value();
out_pos += 1;
}
continue;
}
let length_base = entry.length_base() as u32;
let extra = entry.extra_bits();
let length = if extra > 0 {
let extra_val = (bits.peek() & ((1u64 << extra) - 1)) as u32;
bits.consume(extra as u32);
length_base + extra_val
} else {
length_base
};
bits.refill();
let dist_entry = spec.decode_distance(bits.peek());
let dist_bits = dist_entry.total_bits() as u32;
bits.consume(dist_bits);
let dist_base = dist_entry.symbol() as u32;
let dist_extra = dist_entry.extra_bits();
let distance = if dist_extra > 0 {
let extra_val = (bits.peek() & ((1u64 << dist_extra) - 1)) as u32;
bits.consume(dist_extra as u32);
dist_base + extra_val
} else {
dist_base
};
if distance == 0 || distance as usize > out_pos {
return Err(Error::new(ErrorKind::InvalidData, "Invalid distance"));
}
if out_pos + length as usize > output.len() {
return Err(Error::new(ErrorKind::InvalidData, "Output overflow"));
}
out_pos = copy_match_safe(output, out_pos, distance, length);
}
}
#[inline(never)]
fn decode_dynamic_speculative(
bits: &mut Bits,
output: &mut [u8],
out_pos: usize,
litlen: &LitLenTable,
dist: &DistTable,
) -> Result<usize> {
decode_huffman_with_dispatch(bits, output, out_pos, litlen, dist)
}
#[inline(never)] fn decode_dynamic_hyperfast(
bits: &mut Bits,
output: &mut [u8],
mut out_pos: usize,
litlen: &LitLenTable,
dist: &DistTable,
) -> Result<usize> {
const FASTLOOP_MARGIN: usize = 320;
let litlen_ptr = litlen.entries_ptr();
let out_ptr = output.as_mut_ptr();
let out_end = output.len();
bits.refill();
while out_pos + FASTLOOP_MARGIN <= out_end {
if bits.available() < 48 {
bits.refill();
}
let saved_bitbuf = bits.bitbuf;
let entry = unsafe { (*litlen_ptr.add((saved_bitbuf as usize) & 0x7FF)).raw() };
bits.bitbuf >>= entry as u8;
bits.bitsleft = bits.bitsleft.wrapping_sub(entry & 0x1F);
if (entry as i32) < 0 {
let lit = ((entry >> 16) & 0xFF) as u8;
unsafe {
*out_ptr.add(out_pos) = lit;
}
out_pos += 1;
let entry2 = unsafe { (*litlen_ptr.add((bits.bitbuf as usize) & 0x7FF)).raw() };
if (entry2 as i32) < 0 {
bits.bitbuf >>= entry2 as u8;
bits.bitsleft = bits.bitsleft.wrapping_sub(entry2 & 0x1F);
let lit2 = ((entry2 >> 16) & 0xFF) as u8;
unsafe {
*out_ptr.add(out_pos) = lit2;
}
out_pos += 1;
let entry3 = unsafe { (*litlen_ptr.add((bits.bitbuf as usize) & 0x7FF)).raw() };
if (entry3 as i32) < 0 {
bits.bitbuf >>= entry3 as u8;
bits.bitsleft = bits.bitsleft.wrapping_sub(entry3 & 0x1F);
let lit3 = ((entry3 >> 16) & 0xFF) as u8;
unsafe {
*out_ptr.add(out_pos) = lit3;
}
out_pos += 1;
let entry4 = unsafe { (*litlen_ptr.add((bits.bitbuf as usize) & 0x7FF)).raw() };
if (entry4 as i32) < 0 {
bits.bitbuf >>= entry4 as u8;
bits.bitsleft = bits.bitsleft.wrapping_sub(entry4 & 0x1F);
let lit4 = ((entry4 >> 16) & 0xFF) as u8;
unsafe {
*out_ptr.add(out_pos) = lit4;
}
out_pos += 1;
if (bits.bitsleft as u8) < 32 {
bits.refill();
}
let entry5 =
unsafe { (*litlen_ptr.add((bits.bitbuf as usize) & 0x7FF)).raw() };
if (entry5 as i32) < 0 {
bits.bitbuf >>= entry5 as u8;
bits.bitsleft = bits.bitsleft.wrapping_sub(entry5 & 0x1F);
let lit5 = ((entry5 >> 16) & 0xFF) as u8;
unsafe {
*out_ptr.add(out_pos) = lit5;
}
out_pos += 1;
bits.refill();
continue;
}
continue;
}
if (bits.bitsleft as u8) < 32 {
bits.refill();
}
continue;
}
if (bits.bitsleft as u8) < 32 {
bits.refill();
}
continue;
}
if (bits.bitsleft as u8) < 32 {
bits.refill();
}
continue;
}
let entry = crate::decompress::inflate::libdeflate_entry::LitLenEntry::from_raw(entry);
if entry.is_exceptional() {
if entry.is_end_of_block() {
return Ok(out_pos);
}
let sub_entry = litlen.lookup_subtable(entry, saved_bitbuf);
let sub_saved = bits.peek();
bits.consume_entry(sub_entry.raw());
if (sub_entry.raw() as i32) < 0 {
unsafe {
*out_ptr.add(out_pos) = sub_entry.literal_value();
}
out_pos += 1;
bits.refill();
continue;
}
if sub_entry.is_end_of_block() {
return Ok(out_pos);
}
let length = sub_entry.decode_length(sub_saved);
bits.refill();
let dist_saved = bits.peek();
let mut dist_entry = dist.lookup(dist_saved);
if dist_entry.is_subtable_ptr() {
bits.consume(DistTable::TABLE_BITS as u32);
dist_entry = dist.lookup_subtable(dist_entry, dist_saved);
}
let dist_extra_saved = bits.peek();
bits.consume_entry(dist_entry.raw());
let distance = dist_entry.decode_distance(dist_extra_saved);
if distance == 0 || distance as usize > out_pos {
return Err(Error::new(
ErrorKind::InvalidData,
format!("Invalid distance {} at pos {}", distance, out_pos),
));
}
bits.refill();
out_pos = copy_match_fast(output, out_pos, distance, length);
continue;
}
let length = entry.decode_length(saved_bitbuf);
bits.refill();
let dist_saved = bits.peek();
let mut dist_entry = dist.lookup(dist_saved);
if dist_entry.is_subtable_ptr() {
bits.consume(DistTable::TABLE_BITS as u32);
dist_entry = dist.lookup_subtable(dist_entry, dist_saved);
}
let dist_extra_saved = bits.peek();
bits.consume_entry(dist_entry.raw());
let distance = dist_entry.decode_distance(dist_extra_saved);
if distance == 0 || distance as usize > out_pos {
return Err(Error::new(
ErrorKind::InvalidData,
format!("Invalid distance {} at pos {}", distance, out_pos),
));
}
bits.refill();
out_pos = copy_match_fast(output, out_pos, distance, length);
}
decode_huffman_cf(bits, output, out_pos, litlen, dist)
}
fn build_code_length_table(lengths: &[u8; 19]) -> Result<[u16; 128]> {
let mut table = [0u16; 128];
let mut count = [0u16; 8];
for &len in lengths.iter() {
if len > 0 && len <= 7 {
count[len as usize] += 1;
}
}
let mut code = 0u32;
let mut first_code = [0u32; 8];
for len in 1..=7 {
code = (code + count[len - 1] as u32) << 1;
first_code[len] = code;
}
let mut next_code = first_code;
for (symbol, &len) in lengths.iter().enumerate() {
if len == 0 {
continue;
}
let len = len as usize;
let codeword = next_code[len];
next_code[len] += 1;
let mut reversed = 0u32;
let mut c = codeword;
for _ in 0..len {
reversed = (reversed << 1) | (c & 1);
c >>= 1;
}
let stride = 1usize << len;
let mut idx = reversed as usize;
while idx < 128 {
table[idx] = ((symbol as u16) << 8) | (len as u16);
idx += stride;
}
}
Ok(table)
}
pub fn inflate_consume_first(input: &[u8], output: &mut [u8]) -> Result<usize> {
let mut bits = Bits::new(input);
let out_size = inflate_consume_first_bits(&mut bits, output)?;
Ok(out_size)
}
pub fn inflate_consume_first_bits(bits: &mut Bits, output: &mut [u8]) -> Result<usize> {
let mut out_pos = 0;
loop {
if bits.available() < 3 {
bits.refill();
}
let bfinal = (bits.peek() & 1) != 0;
let btype = ((bits.peek() >> 1) & 3) as u8;
bits.consume(3);
let prev_pos = out_pos;
match btype {
0 => out_pos = decode_stored(bits, output, out_pos)?,
1 => out_pos = decode_fixed(bits, output, out_pos)?,
2 => out_pos = decode_dynamic(bits, output, out_pos)?,
3 => return Err(Error::new(ErrorKind::InvalidData, "Reserved block type")),
_ => unreachable!(),
}
let block_bytes = out_pos - prev_pos;
BLOCK_STATS.with(|stats| {
let mut s = stats.borrow_mut();
match btype {
0 => {
s.stored_blocks += 1;
s.stored_bytes += block_bytes;
}
1 => {
s.fixed_blocks += 1;
s.fixed_bytes += block_bytes;
}
2 => {
s.dynamic_blocks += 1;
s.dynamic_bytes += block_bytes;
}
_ => {}
}
});
if bfinal {
bits.align_to_byte();
return Ok(out_pos);
}
}
}
#[derive(Clone)]
pub struct DecodeLane<'a> {
pub data: &'a [u8],
pub pos: usize,
pub bitbuf: u64,
pub bitsleft: u32,
pub out_pos: usize,
pub done: bool,
}
impl<'a> DecodeLane<'a> {
pub fn new(data: &'a [u8]) -> Self {
let mut lane = Self {
data,
pos: 0,
bitbuf: 0,
bitsleft: 0,
out_pos: 0,
done: false,
};
lane.refill();
lane
}
#[inline(always)]
pub fn refill(&mut self) {
let mut bits_u8 = self.bitsleft as u8;
if bits_u8 > 64 {
bits_u8 = 0;
self.bitbuf = 0;
}
if self.pos + 8 <= self.data.len() {
let word = unsafe { (self.data.as_ptr().add(self.pos) as *const u64).read_unaligned() };
let word = u64::from_le(word);
self.bitbuf |= word << bits_u8;
self.pos += (7 - ((bits_u8 >> 3) & 7)) as usize;
self.bitsleft = (bits_u8 as u32) | 56;
} else {
while bits_u8 <= 56 && self.pos < self.data.len() {
self.bitbuf |= (self.data[self.pos] as u64) << bits_u8;
self.pos += 1;
bits_u8 += 8;
}
self.bitsleft = bits_u8 as u32;
}
}
#[inline(always)]
pub fn peek(&self) -> u64 {
self.bitbuf
}
#[inline(always)]
pub fn consume(&mut self, n: u32) {
self.bitbuf >>= n as u8;
self.bitsleft -= n;
}
}
pub fn decode_interleaved_4(
blocks: [&[u8]; 4],
outputs: [&mut [u8]; 4],
litlen: &LitLenTable,
dist: &DistTable,
) -> Result<[usize; 4]> {
const LITLEN_TABLEMASK: u64 = (1u64 << LitLenTable::TABLE_BITS) - 1;
let litlen_ptr = litlen.entries_ptr();
let mut lanes: [DecodeLane; 4] = [
DecodeLane::new(blocks[0]),
DecodeLane::new(blocks[1]),
DecodeLane::new(blocks[2]),
DecodeLane::new(blocks[3]),
];
let out_ptrs: [*mut u8; 4] = [
outputs[0].as_mut_ptr(),
outputs[1].as_mut_ptr(),
outputs[2].as_mut_ptr(),
outputs[3].as_mut_ptr(),
];
let _out_ends: [usize; 4] = [
outputs[0].len(),
outputs[1].len(),
outputs[2].len(),
outputs[3].len(),
];
let mut entries: [u32; 4] = [
unsafe { (*litlen_ptr.add((lanes[0].peek() & LITLEN_TABLEMASK) as usize)).raw() },
unsafe { (*litlen_ptr.add((lanes[1].peek() & LITLEN_TABLEMASK) as usize)).raw() },
unsafe { (*litlen_ptr.add((lanes[2].peek() & LITLEN_TABLEMASK) as usize)).raw() },
unsafe { (*litlen_ptr.add((lanes[3].peek() & LITLEN_TABLEMASK) as usize)).raw() },
];
loop {
let mut all_done = true;
let mut any_literal = false;
for i in 0..4 {
if lanes[i].done {
continue;
}
all_done = false;
if (entries[i] as i32) < 0 {
any_literal = true;
}
}
if all_done {
break;
}
if any_literal {
for i in 0..4 {
if lanes[i].done || (entries[i] as i32) >= 0 {
continue;
}
let lit = (entries[i] >> 16) as u8;
lanes[i].bitbuf >>= entries[i] as u8;
lanes[i].bitsleft = lanes[i].bitsleft.wrapping_sub(entries[i]);
unsafe {
*out_ptrs[i].add(lanes[i].out_pos) = lit;
}
lanes[i].out_pos += 1;
if lanes[i].bitsleft < 32 {
lanes[i].refill();
}
entries[i] = unsafe {
(*litlen_ptr.add((lanes[i].peek() & LITLEN_TABLEMASK) as usize)).raw()
};
}
continue;
}
for i in 0..4 {
if lanes[i].done {
continue;
}
let entry =
crate::decompress::inflate::libdeflate_entry::LitLenEntry::from_raw(entries[i]);
let saved = lanes[i].peek();
lanes[i].bitbuf >>= entries[i] as u8;
lanes[i].bitsleft = lanes[i].bitsleft.wrapping_sub(entries[i]);
if entry.is_exceptional() {
if entry.is_end_of_block() {
lanes[i].done = true;
continue;
}
let sub_entry = litlen.lookup_subtable(entry, saved);
let sub_saved = lanes[i].peek();
lanes[i].bitbuf >>= sub_entry.raw() as u8;
lanes[i].bitsleft = lanes[i].bitsleft.wrapping_sub(sub_entry.raw());
if (sub_entry.raw() as i32) < 0 {
unsafe {
*out_ptrs[i].add(lanes[i].out_pos) = sub_entry.literal_value();
}
lanes[i].out_pos += 1;
lanes[i].refill();
entries[i] = unsafe {
(*litlen_ptr.add((lanes[i].peek() & LITLEN_TABLEMASK) as usize)).raw()
};
continue;
}
if sub_entry.is_end_of_block() {
lanes[i].done = true;
continue;
}
let length = sub_entry.decode_length(sub_saved);
decode_match_for_lane(&mut lanes[i], outputs[i], out_ptrs[i], length, dist)?;
entries[i] = unsafe {
(*litlen_ptr.add((lanes[i].peek() & LITLEN_TABLEMASK) as usize)).raw()
};
continue;
}
let length = entry.decode_length(saved);
decode_match_for_lane(&mut lanes[i], outputs[i], out_ptrs[i], length, dist)?;
entries[i] =
unsafe { (*litlen_ptr.add((lanes[i].peek() & LITLEN_TABLEMASK) as usize)).raw() };
}
}
Ok([
lanes[0].out_pos,
lanes[1].out_pos,
lanes[2].out_pos,
lanes[3].out_pos,
])
}
#[inline(always)]
fn decode_match_for_lane(
lane: &mut DecodeLane,
_output: &mut [u8],
out_ptr: *mut u8,
length: u32,
dist: &DistTable,
) -> Result<()> {
lane.refill();
let dist_saved = lane.peek();
let mut dist_entry = dist.lookup(dist_saved);
if dist_entry.is_subtable_ptr() {
lane.consume(DistTable::TABLE_BITS as u32);
dist_entry = dist.lookup_subtable(dist_entry, dist_saved);
}
let dist_extra_saved = lane.peek();
lane.bitbuf >>= dist_entry.raw() as u8;
lane.bitsleft = lane.bitsleft.wrapping_sub(dist_entry.raw());
let distance = dist_entry.decode_distance(dist_extra_saved);
if distance == 0 || distance as usize > lane.out_pos {
return Err(Error::new(ErrorKind::InvalidData, "Invalid distance"));
}
lane.refill();
let dist_usize = distance as usize;
let len_usize = length as usize;
unsafe {
let dst = out_ptr.add(lane.out_pos);
let src = out_ptr.add(lane.out_pos - dist_usize);
if dist_usize >= 8 {
let mut s = src;
let mut d = dst;
let end = dst.add(len_usize);
while d < end {
(d as *mut u64).write_unaligned((s as *const u64).read_unaligned());
s = s.add(8);
d = d.add(8);
}
} else {
for j in 0..len_usize {
*dst.add(j) = *src.add(j % dist_usize);
}
}
}
lane.out_pos += len_usize;
Ok(())
}
#[cfg(test)]
mod tests {
use super::*;
use crate::assert_slices_eq;
fn run_bench(name: &str, gz_path: &str) {
let _ = crate::tests::datasets::prepare_datasets();
let gz = match std::fs::read(gz_path) {
Ok(d) => d,
Err(_) => {
eprintln!("Skipping {} benchmark - file not found: {}", name, gz_path);
return;
}
};
let mut pos = 10;
let flg = gz[3];
if (flg & 0x04) != 0 {
let xlen = u16::from_le_bytes([gz[pos], gz[pos + 1]]) as usize;
pos += 2 + xlen;
}
if (flg & 0x08) != 0 {
while pos < gz.len() && gz[pos] != 0 {
pos += 1;
}
pos += 1;
}
if (flg & 0x10) != 0 {
while pos < gz.len() && gz[pos] != 0 {
pos += 1;
}
pos += 1;
}
if (flg & 0x02) != 0 {
pos += 2;
}
let deflate = &gz[pos..gz.len() - 8];
let isize = u32::from_le_bytes([
gz[gz.len() - 4],
gz[gz.len() - 3],
gz[gz.len() - 2],
gz[gz.len() - 1],
]) as usize;
let mut output = vec![0u8; isize + 1024];
let mut lib_output = vec![0u8; isize + 1024];
let lib_size = libdeflater::Decompressor::new()
.deflate_decompress(deflate, &mut lib_output)
.expect("libdeflate failed");
let our_result = inflate_consume_first(deflate, &mut output);
if let Err(e) = &our_result {
eprintln!("Error decoding {}: {:?}", name, e);
let check_len = lib_size.min(output.len());
for i in 0..check_len {
if output[i] != lib_output[i] {
eprintln!(
"First mismatch at byte {}: got {:02x} expected {:02x}",
i, output[i], lib_output[i]
);
break;
}
}
panic!("Our decode failed for {}", name);
}
let our_size = our_result.unwrap();
assert_eq!(our_size, lib_size, "Size mismatch for {}", name);
let check_len = 10000.min(our_size);
if output[..check_len] != lib_output[..check_len] {
for i in 0..check_len {
if output[i] != lib_output[i] {
panic!("Data mismatch at byte {} for {}", i, name);
}
}
}
let iterations = 10;
reset_cache_stats();
let start_t = std::time::Instant::now();
for _ in 0..iterations {
let _ = inflate_consume_first(deflate, &mut output);
}
let our_time = start_t.elapsed();
let start_t = std::time::Instant::now();
for _ in 0..iterations {
let _ = libdeflater::Decompressor::new().deflate_decompress(deflate, &mut lib_output);
}
let lib_time = start_t.elapsed();
let our_throughput = (isize * iterations) as f64 / our_time.as_secs_f64() / 1e6;
let lib_throughput = (isize * iterations) as f64 / lib_time.as_secs_f64() / 1e6;
let (hits, misses, hit_rate) = get_cache_stats();
eprintln!(
"\n=== [EXPERIMENTAL] PURE-RUST INFLATE {} ===",
name.to_uppercase()
);
eprintln!("Data size: {:.1} MB", isize as f64 / 1_000_000.0);
eprintln!(
"experimental inflate_consume_first: {:>8.1} MB/s",
our_throughput
);
eprintln!(
"production inflate_into_pub (FFI): {:>8.1} MB/s",
lib_throughput
);
eprintln!(
"Ratio: {:.1}% (goal: >100% to replace production)",
100.0 * our_throughput / lib_throughput
);
eprintln!(
"Cache: {} hits, {} misses ({:.1}% hit rate)",
hits,
misses,
hit_rate * 100.0
);
let (spec_used, spec_fallback) = get_spec_stats();
if spec_used + spec_fallback > 0 {
let spec_rate = spec_used as f64 / (spec_used + spec_fallback) as f64;
eprintln!(
"Specialized: {} used, {} fallback ({:.1}% specialized)",
spec_used,
spec_fallback,
spec_rate * 100.0
);
}
eprintln!("=============================\n");
}
#[test]
fn test_cf_simple_literals() {
let original = b"Hello, World!";
let mut compressed = Vec::new();
{
use std::io::Write;
let mut enc =
flate2::write::DeflateEncoder::new(&mut compressed, flate2::Compression::default());
enc.write_all(original).unwrap();
enc.finish().unwrap();
}
let mut output = vec![0u8; original.len() + 100];
let size = inflate_consume_first(&compressed, &mut output).expect("Decode failed");
assert_eq!(size, original.len());
assert_slices_eq!(&output[..size], original.as_slice());
}
#[test]
fn test_cf_rle() {
let original = b"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA";
let mut compressed = Vec::new();
{
use std::io::Write;
let mut enc =
flate2::write::DeflateEncoder::new(&mut compressed, flate2::Compression::default());
enc.write_all(original).unwrap();
enc.finish().unwrap();
}
let mut output = vec![0u8; original.len() + 100];
let size = inflate_consume_first(&compressed, &mut output).expect("Decode failed");
assert_eq!(size, original.len());
assert_slices_eq!(&output[..size], original.as_slice());
}
#[test]
fn test_cf_matches() {
let original = b"abcabcabcabcabcabcabcabc";
let mut compressed = Vec::new();
{
use std::io::Write;
let mut enc =
flate2::write::DeflateEncoder::new(&mut compressed, flate2::Compression::best());
enc.write_all(original).unwrap();
enc.finish().unwrap();
}
let mut output = vec![0u8; original.len() + 100];
let size = inflate_consume_first(&compressed, &mut output).expect("Decode failed");
assert_eq!(size, original.len());
assert_slices_eq!(&output[..size], original.as_slice());
}
#[test]
fn test_cf_large_data() {
let original = b"This is a test of the consume-first decoder. ".repeat(1000);
let mut compressed = Vec::new();
{
use std::io::Write;
let mut enc =
flate2::write::DeflateEncoder::new(&mut compressed, flate2::Compression::best());
enc.write_all(&original).unwrap();
enc.finish().unwrap();
}
let mut output = vec![0u8; original.len() + 100];
let size = inflate_consume_first(&compressed, &mut output).expect("Decode failed");
assert_eq!(size, original.len());
assert_slices_eq!(&output[..size], original.as_slice());
}
#[test]
fn bench_cf_silesia() {
run_bench("silesia", "benchmark_data/silesia-gzip.tar.gz");
}
#[test]
fn bench_cf_software() {
run_bench("software", "benchmark_data/software.archive.gz");
}
#[test]
fn bench_cf_logs() {
run_bench("logs", "benchmark_data/logs.txt.gz");
}
#[test]
fn test_cf_single_bgzf_block() {
let data = match std::fs::read("benchmark_data/test-gzippy-l1-t14.gz") {
Ok(d) => d,
Err(_) => {
eprintln!("Skip - no file");
return;
}
};
use std::io::Read;
let mut expected = Vec::new();
let mut decoder = flate2::read::MultiGzDecoder::new(&data[..]);
decoder.read_to_end(&mut expected).unwrap();
let xlen = u16::from_le_bytes([data[10], data[11]]) as usize;
let header_end = 12 + xlen;
let mut pos = 12;
let mut bsize = None;
while pos < header_end {
let slen = u16::from_le_bytes([data[pos + 2], data[pos + 3]]) as usize;
let si = [data[pos], data[pos + 1]];
if si == *b"GZ" && slen >= 4 {
bsize = Some(u32::from_le_bytes([
data[pos + 4],
data[pos + 5],
data[pos + 6],
data[pos + 7],
]) as usize);
} else if si == *b"BC" && slen == 2 {
bsize = Some(u16::from_le_bytes([data[pos + 4], data[pos + 5]]) as usize + 1);
}
pos += 4 + slen;
}
let bsize = bsize.expect("No block size in BGZF header");
let isize_expected = u32::from_le_bytes([
data[bsize - 4],
data[bsize - 3],
data[bsize - 2],
data[bsize - 1],
]) as usize;
let deflate_data = &data[header_end..bsize - 8];
let mut lib_out = vec![0u8; isize_expected];
let lib_size = libdeflater::Decompressor::new()
.deflate_decompress(deflate_data, &mut lib_out)
.expect("libdeflate failed");
assert_eq!(lib_size, isize_expected, "libdeflate size mismatch");
let mut our_out = vec![0u8; isize_expected];
let our_size =
inflate_consume_first(deflate_data, &mut our_out).expect("our decoder failed");
assert_eq!(our_size, isize_expected, "size mismatch");
assert_eq!(
&our_out[..our_size],
&lib_out[..lib_size],
"content mismatch"
);
}
#[test]
fn bench_interleaved_4_blocks() {
use flate2::write::DeflateEncoder;
use flate2::Compression;
use std::io::Write;
let chunks: Vec<Vec<u8>> = (0..4)
.map(|i| {
(0..50_000)
.map(|j| ((i * 37 + j * 13) % 256) as u8)
.collect()
})
.collect();
let compressed: Vec<Vec<u8>> = chunks
.iter()
.map(|chunk| {
let mut encoder = DeflateEncoder::new(Vec::new(), Compression::default());
encoder.write_all(chunk).unwrap();
encoder.finish().unwrap()
})
.collect();
let tables = crate::decompress::inflate::libdeflate_decode::get_fixed_tables();
let _litlen = &tables.0;
let _dist = &tables.1;
let mut outputs: Vec<Vec<u8>> = chunks.iter().map(|c| vec![0u8; c.len() + 1000]).collect();
for (i, (comp, expected)) in compressed.iter().zip(chunks.iter()).enumerate() {
let mut out = vec![0u8; expected.len() + 100];
let size = inflate_consume_first(comp, &mut out).expect("decode failed");
assert_eq!(size, expected.len(), "Block {} size mismatch", i);
assert_eq!(&out[..size], &expected[..], "Block {} content mismatch", i);
}
eprintln!("\n=== INTERLEAVED 4-BLOCK BENCHMARK ===");
let iterations = 100;
let start = std::time::Instant::now();
for _ in 0..iterations {
for (comp, out) in compressed.iter().zip(outputs.iter_mut()) {
let _ = inflate_consume_first(comp, out);
}
}
let seq_time = start.elapsed();
let total_bytes: usize = chunks.iter().map(|c| c.len()).sum();
let seq_throughput = (total_bytes * iterations) as f64 / seq_time.as_secs_f64() / 1e6;
eprintln!("Sequential 4 blocks: {:.1} MB/s", seq_throughput);
eprintln!("(Interleaved decode requires per-block table building - testing concept)");
eprintln!("=====================================\n");
}
#[test]
#[cfg(target_arch = "x86_64")]
fn test_avx2_detected_on_x86() {
assert!(
is_x86_feature_detected!("avx2"),
"AVX2 not detected on this x86_64 machine. \
GitHub Actions runners always support AVX2. \
If this fails in CI, the AVX2 fast path in copy_match_fast \
and fill_byte_avx2 will be unused, reducing decompression throughput."
);
}
}