gzippy 0.8.0 - Docs.rs

//! BGZF (Block GZIP Format) Parallel Decompression
//!
//! BGZF files have independent blocks with embedded size markers, allowing
//! perfect parallelism with zero lock contention.
//!
//! ## Strategy
//!
//! 1. Parse BGZF headers to find all block boundaries and output sizes (ISIZE)
//! 2. Pre-allocate entire output buffer based on sum of ISIZE values
//! 3. Decompress blocks in parallel, writing directly to pre-calculated offsets
//! 4. Single write of complete output
//!
//! ## Performance Target: 4000+ MB/s with 14 threads
//!
//! With single-threaded inflate at 10700 MB/s and no lock contention,
//! theoretical max is ~150,000 MB/s. Memory bandwidth limits us to ~4000-5000 MB/s.

#![allow(clippy::needless_range_loop)]

use std::io::{self, Write};
use std::sync::atomic::{AtomicUsize, Ordering};

// =============================================================================
// Hot-path counters (active only in test builds or with feature "counters")
// =============================================================================
//
// Zero overhead in production: the cfg gate eliminates all counter code.
// Use in tests: crate::decompress::bgzf::hot_counters::reset() / snapshot()
// Use manually: cargo build --features counters
/// Global counters recording which decode path handled each dynamic block.
///
/// Compiled only for test builds or when the `counters` feature is enabled.
#[cfg(any(test, feature = "counters"))]
pub mod hot_counters {
    use std::sync::atomic::{AtomicU64, Ordering};

    static DYNAMIC_BLOCKS: AtomicU64 = AtomicU64::new(0);
    static MULTI_SYM_BLOCKS: AtomicU64 = AtomicU64::new(0);
    static STANDARD_BLOCKS: AtomicU64 = AtomicU64::new(0);

    /// Add one to `counter` using relaxed ordering.
    #[inline(always)]
    fn bump(counter: &AtomicU64) {
        counter.fetch_add(1, Ordering::Relaxed);
    }

    /// Set all three counters back to zero.
    pub fn reset() {
        for counter in [&DYNAMIC_BLOCKS, &MULTI_SYM_BLOCKS, &STANDARD_BLOCKS] {
            counter.store(0, Ordering::SeqCst);
        }
    }

    /// Returns (dynamic_blocks, multi_sym_blocks, standard_blocks).
    pub fn snapshot() -> (u64, u64, u64) {
        let dynamic = DYNAMIC_BLOCKS.load(Ordering::SeqCst);
        let multi_sym = MULTI_SYM_BLOCKS.load(Ordering::SeqCst);
        let standard = STANDARD_BLOCKS.load(Ordering::SeqCst);
        (dynamic, multi_sym, standard)
    }

    /// Count one dynamic Huffman block.
    #[inline(always)]
    pub fn inc_dynamic() {
        bump(&DYNAMIC_BLOCKS);
    }

    /// Count one block decoded through the multi-symbol path.
    #[inline(always)]
    pub fn inc_multi_sym() {
        bump(&MULTI_SYM_BLOCKS);
    }

    /// Count one block decoded through the standard path.
    #[inline(always)]
    pub fn inc_standard() {
        bump(&STANDARD_BLOCKS);
    }
}

use crate::decompress::combined_lut::CombinedLUT;
use crate::decompress::inflate_tables::CODE_LENGTH_ORDER;
use crate::decompress::packed_lut::PackedLUT;
#[allow(unused_imports)]
use crate::decompress::two_level_table::{FastBits, TurboBits, TwoLevelTable};

// =============================================================================
// Performance Tracing
// =============================================================================
//
// Set the GZIPPY_TRACE environment variable (any value, e.g. GZIPPY_TRACE=1)
// to collect a detailed performance breakdown of the decode loop.
// This helps identify where we lose time compared to libdeflate.

/// Performance counters for decode loop analysis.
///
/// All fields are plain running totals that start at zero (`Default`).
/// In this file the match counters are updated by `trace_match`, the literal
/// counters by `trace_literals`, and `slow_lit_len` by `trace_slow_path`.
/// NOTE(review): no visible code updates `fast_matches`, `slow_matches`,
/// `refills` or `eob_count` — confirm where (or whether) they are maintained.
#[allow(dead_code)]
#[derive(Default)]
pub struct DecodeTrace {
    /// Total literals decoded
    pub literals: u64,
    /// Total matches decoded
    pub matches: u64,
    /// Literals decoded via fast literal chain (3+ in a row)
    pub fast_literals: u64,
    /// Matches using pre-computed distance (fast path)
    pub fast_matches: u64,
    /// Matches requiring slow distance decode
    pub slow_matches: u64,
    /// Times slow path was used for lit/len decode
    pub slow_lit_len: u64,
    /// Total bytes copied via match (sum of all recorded match lengths)
    pub match_bytes: u64,
    /// Matches with distance 1 (RLE-style repeat of the previous byte)
    pub dist_1: u64,
    /// Matches with distance 2-7 (small, overlapping)
    pub dist_2_7: u64,
    /// Matches with distance 8-39 (medium, overlapping chunks)
    pub dist_8_39: u64,
    /// Matches with distance >= 40 (large, non-overlapping)
    pub dist_40_plus: u64,
    /// Matches with length <= 8
    pub len_1_8: u64,
    /// Matches with length 9-32
    pub len_9_32: u64,
    /// Matches with length 33 or more
    pub len_33_plus: u64,
    /// Bit buffer refills performed
    pub refills: u64,
    /// EOB (end of block) encountered
    pub eob_count: u64,
}

#[allow(dead_code)]
impl DecodeTrace {
    /// Print a human-readable summary of the counters to stderr.
    ///
    /// * `output_bytes` - number of decompressed bytes produced by the traced run.
    /// * `elapsed_ns` - duration of the traced run in nanoseconds.
    ///
    /// Every ratio is reported as zero when its denominator is zero, so an
    /// empty or instantaneous run never prints `inf` or `NaN`.
    pub fn print_summary(&self, output_bytes: usize, elapsed_ns: u64) {
        // `part / whole`, or 0.0 when there is nothing to divide by.
        fn ratio(part: f64, whole: f64) -> f64 {
            if whole > 0.0 {
                part / whole
            } else {
                0.0
            }
        }

        // Share of `part` in `whole`, expressed in percent.
        fn percent(part: u64, whole: u64) -> f64 {
            ratio(part as f64, whole as f64) * 100.0
        }

        let elapsed_secs = elapsed_ns as f64 / 1_000_000_000.0;
        let mb_per_sec = ratio(output_bytes as f64, elapsed_secs) / 1_000_000.0;
        let total_symbols = self.literals + self.matches;
        let ns_per_symbol = elapsed_ns.checked_div(total_symbols).unwrap_or(0);

        eprintln!("\n=== DECODE TRACE ===");
        eprintln!(
            "Output: {} bytes in {:.2}ms = {:.1} MB/s",
            output_bytes,
            elapsed_ns as f64 / 1_000_000.0,
            mb_per_sec
        );
        eprintln!("\nSymbols:");
        eprintln!(
            "  Literals:       {} ({} fast chain, {:.1}% fast)",
            self.literals,
            self.fast_literals,
            percent(self.fast_literals, self.literals)
        );
        eprintln!(
            "  Matches:        {} ({} fast, {} slow, {:.1}% fast)",
            self.matches,
            self.fast_matches,
            self.slow_matches,
            percent(self.fast_matches, self.matches)
        );
        eprintln!("  Slow lit/len:   {}", self.slow_lit_len);

        // The distribution tables are skipped entirely when nothing was recorded.
        eprintln!("\nDistance distribution:");
        let total_dist = self.dist_1 + self.dist_2_7 + self.dist_8_39 + self.dist_40_plus;
        if total_dist > 0 {
            eprintln!(
                "  d=1 (RLE):      {} ({:.1}%)",
                self.dist_1,
                percent(self.dist_1, total_dist)
            );
            eprintln!(
                "  d=2-7:          {} ({:.1}%)",
                self.dist_2_7,
                percent(self.dist_2_7, total_dist)
            );
            eprintln!(
                "  d=8-39:         {} ({:.1}%)",
                self.dist_8_39,
                percent(self.dist_8_39, total_dist)
            );
            eprintln!(
                "  d>=40:          {} ({:.1}%)",
                self.dist_40_plus,
                percent(self.dist_40_plus, total_dist)
            );
        }

        eprintln!("\nLength distribution:");
        let total_len = self.len_1_8 + self.len_9_32 + self.len_33_plus;
        if total_len > 0 {
            eprintln!(
                "  len 3-8:        {} ({:.1}%)",
                self.len_1_8,
                percent(self.len_1_8, total_len)
            );
            eprintln!(
                "  len 9-32:       {} ({:.1}%)",
                self.len_9_32,
                percent(self.len_9_32, total_len)
            );
            eprintln!(
                "  len 33+:        {} ({:.1}%)",
                self.len_33_plus,
                percent(self.len_33_plus, total_len)
            );
        }

        eprintln!("\nCopy stats:");
        eprintln!(
            "  Match bytes:    {} ({:.1} bytes/match avg)",
            self.match_bytes,
            ratio(self.match_bytes as f64, self.matches as f64)
        );
        eprintln!("\nOverhead:");
        eprintln!("  Bit refills:    {}", self.refills);
        eprintln!("  EOB count:      {}", self.eob_count);
        eprintln!("  ns/symbol:      {}", ns_per_symbol);
        eprintln!(
            "  symbols/byte:   {:.2}",
            ratio(total_symbols as f64, output_bytes as f64)
        );
        eprintln!("====================\n");
    }
}

// Tracing infrastructure (enable with GZIPPY_TRACE=1)

use std::cell::RefCell;

// One `DecodeTrace` per thread: each thread accumulates into its own counters,
// and `RefCell` provides the interior mutability needed to update them.
thread_local! {
    static DECODE_TRACE: RefCell<DecodeTrace> = RefCell::new(DecodeTrace::default());
}

/// Report whether decode tracing is switched on.
///
/// The `GZIPPY_TRACE` environment variable is inspected on the first call only;
/// the answer is cached for the rest of the process.
#[inline]
fn tracing_enabled() -> bool {
    use std::sync::OnceLock;

    /// Environment probe executed once, on first use.
    fn probe_env() -> bool {
        std::env::var("GZIPPY_TRACE").is_ok()
    }

    static CACHED: OnceLock<bool> = OnceLock::new();
    *CACHED.get_or_init(probe_env)
}

/// Discard the calling thread's trace counters, leaving a zeroed `DecodeTrace`.
#[allow(dead_code)]
fn reset_trace() {
    DECODE_TRACE.with(|cell| {
        // The previous counters are returned by `replace` and dropped here.
        cell.replace(DecodeTrace::default());
    });
}

/// Hand back the calling thread's trace counters and leave zeroed ones behind.
#[allow(dead_code)]
fn take_trace() -> DecodeTrace {
    DECODE_TRACE.with(|cell| cell.take())
}

/// Record one decoded match (back-reference) in the calling thread's trace.
///
/// When tracing is off this costs only the cached `tracing_enabled()` check;
/// the actual bookkeeping lives in a cold, never-inlined helper so it stays
/// out of the caller's hot path.
#[inline(always)]
fn trace_match(_distance: usize, _length: usize) {
    // Out-of-line bookkeeping: bumps the totals and the distance/length buckets.
    #[cold]
    #[inline(never)]
    fn trace_match_slow(distance: usize, length: usize) {
        DECODE_TRACE.with(|cell| {
            let mut counters = cell.borrow_mut();
            counters.matches += 1;
            counters.match_bytes += length as u64;

            // Distance histogram (a distance of 0 falls into the 2-7 bucket).
            match distance {
                1 => counters.dist_1 += 1,
                0 | 2..=7 => counters.dist_2_7 += 1,
                8..=39 => counters.dist_8_39 += 1,
                _ => counters.dist_40_plus += 1,
            }

            // Length histogram.
            match length {
                0..=8 => counters.len_1_8 += 1,
                9..=32 => counters.len_9_32 += 1,
                _ => counters.len_33_plus += 1,
            }
        });
    }

    if !tracing_enabled() {
        return;
    }
    trace_match_slow(_distance, _length);
}

/// Add `count` literals to the calling thread's trace.
///
/// When `fast` is true the same amount is also credited to the fast-chain
/// counter. Does nothing unless tracing is enabled.
#[allow(dead_code)]
#[inline]
fn trace_literals(count: u64, fast: bool) {
    if tracing_enabled() {
        DECODE_TRACE.with(|cell| {
            let mut counters = cell.borrow_mut();
            counters.literals += count;
            counters.fast_literals += if fast { count } else { 0 };
        });
    }
}

/// Count one use of the slow literal/length decode path in the calling
/// thread's trace. Does nothing unless tracing is enabled.
#[allow(dead_code)]
#[inline]
fn trace_slow_path() {
    if tracing_enabled() {
        DECODE_TRACE.with(|cell| {
            let mut counters = cell.borrow_mut();
            counters.slow_lit_len += 1;
        });
    }
}

/// Location and size information for one BGZF block (one gzip member),
/// as produced by `parse_bgzf_blocks`.
#[derive(Debug, Clone)]
struct BgzfBlock {
    /// Byte offset of block start in compressed data
    start: usize,
    /// Total block length (including header and trailer), taken from the
    /// "GZ" extra subfield
    length: usize,
    /// Uncompressed size (from ISIZE trailer, the last 4 bytes of the block)
    isize: u32,
    /// Output offset (calculated during planning): the sum of `isize` over
    /// all preceding blocks
    output_offset: usize,
    /// Absolute byte offset in the compressed data where the raw deflate
    /// stream begins (just past the gzip header and its optional fields)
    deflate_start: usize,
}

/// Parse all BGZF blocks from compressed data.
///
/// Walks `data` one gzip member at a time. Each member must carry the FEXTRA
/// flag and a "GZ" extra subfield holding the total block size, either as a
/// 4-byte value or as the legacy 2-byte `size - 1`. For every block the ISIZE
/// trailer is read and the running output offset is recorded.
///
/// Parsing stops silently at the first block that is malformed or truncated;
/// the blocks found up to that point are returned. A block is also rejected
/// when its declared size cannot hold its own header plus the 8-byte gzip
/// trailer, so `deflate_start` never points past the block and ISIZE is never
/// read from header bytes.
///
/// # Errors
///
/// Returns `InvalidData` when not a single valid block could be parsed.
fn parse_bgzf_blocks(data: &[u8]) -> io::Result<Vec<BgzfBlock>> {
    // Smallest header accepted: 12 fixed bytes + a 6-byte "GZ" subfield.
    const MIN_HEADER_LEN: usize = 18;
    // Fixed gzip header up to and including XLEN.
    const FIXED_HEADER_LEN: usize = 12;
    // CRC32 + ISIZE.
    const TRAILER_LEN: usize = 8;

    // Scan the gzip extra field for the "GZ" subfield and return the block size.
    fn gz_block_size(extra_field: &[u8]) -> Option<usize> {
        let mut pos = 0;
        while pos + 4 <= extra_field.len() {
            let subfield_id = &extra_field[pos..pos + 2];
            let subfield_len =
                u16::from_le_bytes([extra_field[pos + 2], extra_field[pos + 3]]) as usize;

            if subfield_id == b"GZ" {
                if subfield_len == 4 && pos + 8 <= extra_field.len() {
                    // New 4-byte format (supports blocks > 64KB); zero is invalid.
                    let size = u32::from_le_bytes([
                        extra_field[pos + 4],
                        extra_field[pos + 5],
                        extra_field[pos + 6],
                        extra_field[pos + 7],
                    ]) as usize;
                    return (size > 0).then_some(size);
                } else if subfield_len == 2 && pos + 6 <= extra_field.len() {
                    // Legacy 2-byte format (BSIZE-1)
                    let size_minus_1 =
                        u16::from_le_bytes([extra_field[pos + 4], extra_field[pos + 5]]) as usize;
                    return Some(size_minus_1 + 1);
                }
            }

            pos += 4 + subfield_len;
        }
        None
    }

    // Skip a zero-terminated header field starting at `pos`, never reading at
    // or beyond `limit`; returns the position just past the terminator.
    fn skip_zero_terminated(data: &[u8], mut pos: usize, limit: usize) -> usize {
        while pos < limit && data[pos] != 0 {
            pos += 1;
        }
        pos + 1
    }

    let mut blocks = Vec::new();
    let mut offset = 0;
    let mut output_offset = 0;

    while offset + MIN_HEADER_LEN < data.len() {
        // Check gzip magic
        if data[offset] != 0x1f || data[offset + 1] != 0x8b {
            break;
        }

        // Must have FEXTRA flag
        let flags = data[offset + 3];
        if flags & 0x04 == 0 {
            break;
        }

        // XLEN lies inside the minimum header, so only the extra field itself
        // needs a bounds check.
        let xlen = u16::from_le_bytes([data[offset + 10], data[offset + 11]]) as usize;
        let extra_start = offset + FIXED_HEADER_LEN;
        let Some(extra_field) = data.get(extra_start..extra_start + xlen) else {
            break;
        };

        // `offset < data.len()` here, so the subtraction cannot underflow and
        // the comparison cannot overflow the way `offset + l` could.
        let length = match gz_block_size(extra_field) {
            Some(l) if l > 0 && l <= data.len() - offset => l,
            _ => break,
        };
        let block_end = offset + length;

        // Deflate data starts after the full gzip header (including optional fields)
        let mut deflate_start = extra_start + xlen;
        // FNAME: null-terminated filename
        if flags & 0x08 != 0 {
            deflate_start = skip_zero_terminated(data, deflate_start, block_end);
        }
        // FCOMMENT: null-terminated comment
        if flags & 0x10 != 0 {
            deflate_start = skip_zero_terminated(data, deflate_start, block_end);
        }
        // FHCRC: 2-byte header CRC
        if flags & 0x02 != 0 {
            deflate_start += 2;
        }

        // The declared size must leave room for the trailer after the header;
        // otherwise the block is corrupt and parsing stops here.
        if deflate_start + TRAILER_LEN > block_end {
            break;
        }

        // Read ISIZE from trailer (last 4 bytes of block)
        let isize = u32::from_le_bytes([
            data[block_end - 4],
            data[block_end - 3],
            data[block_end - 2],
            data[block_end - 1],
        ]);

        blocks.push(BgzfBlock {
            start: offset,
            length,
            isize,
            output_offset,
            deflate_start,
        });

        output_offset += isize as usize;
        offset = block_end;
    }

    if blocks.is_empty() {
        return Err(io::Error::new(
            io::ErrorKind::InvalidData,
            "No BGZF blocks found",
        ));
    }

    Ok(blocks)
}

/// Inflate raw deflate data directly into a pre-allocated output slice.
///
/// Thin wrapper that forwards to `inflate_into_libdeflate`, so the result and
/// errors are exactly those of that function: on success, the number of bytes
/// written to `output`.
fn inflate_into(deflate_data: &[u8], output: &mut [u8]) -> io::Result<usize> {
    inflate_into_libdeflate(deflate_data, output)
}

/// Decompress a raw deflate stream into `output` through the libdeflate C API.
///
/// A decompressor is allocated for this call and freed again before returning,
/// whatever the outcome.
///
/// Returns the number of bytes written to `output`.
///
/// # Errors
///
/// * `InvalidData` - libdeflate reported bad input data.
/// * `WriteZero` - `output` is too small for the decompressed data.
/// * `Other` - the decompressor could not be allocated, or libdeflate returned
///   a status this function does not recognise.
fn inflate_into_libdeflate(deflate_data: &[u8], output: &mut [u8]) -> io::Result<usize> {
    use std::ffi::c_void;

    // SAFETY: the allocator takes no arguments; a null result is handled below.
    let handle = unsafe { libdeflate_sys::libdeflate_alloc_decompressor() };
    if handle.is_null() {
        return Err(io::Error::other(
            "failed to allocate libdeflate decompressor",
        ));
    }

    let mut written = 0usize;
    // SAFETY: `handle` is non-null and freed exactly once, right after its only
    // use. Each pointer/length pair comes from a live slice, and `written`
    // outlives the call.
    let status = unsafe {
        let status = libdeflate_sys::libdeflate_deflate_decompress(
            handle,
            deflate_data.as_ptr().cast::<c_void>(),
            deflate_data.len(),
            output.as_mut_ptr().cast::<c_void>(),
            output.len(),
            &mut written,
        );
        libdeflate_sys::libdeflate_free_decompressor(handle);
        status
    };

    let failure = match status {
        libdeflate_sys::libdeflate_result_LIBDEFLATE_SUCCESS => return Ok(written),
        libdeflate_sys::libdeflate_result_LIBDEFLATE_BAD_DATA => {
            io::Error::new(io::ErrorKind::InvalidData, "invalid deflate data")
        }
        libdeflate_sys::libdeflate_result_LIBDEFLATE_INSUFFICIENT_SPACE => {
            io::Error::new(io::ErrorKind::WriteZero, "output buffer too small")
        }
        _ => io::Error::other("unknown libdeflate error"),
    };
    Err(failure)
}

/// Public version of `inflate_into` for use by other modules.
///
/// Forwards to `inflate_into` unchanged: inflates the raw deflate stream in
/// `deflate_data` into `output` and, on success, returns the number of bytes
/// written. Errors are passed through as-is.
pub fn inflate_into_pub(deflate_data: &[u8], output: &mut [u8]) -> io::Result<usize> {
    inflate_into(deflate_data, output)
}

/// Copy a stored (uncompressed) deflate block into `output` starting at `out_pos`.
///
/// Aligns and refills the bit reader, reads the 16-bit LEN and NLEN fields,
/// checks that they are one's complements of each other, and then copies LEN
/// bytes one at a time.
///
/// Returns the output position just past the last byte written.
///
/// # Errors
///
/// * `InvalidData` - LEN and NLEN do not match.
/// * `WriteZero` - `output` fills up before LEN bytes were copied; bytes copied
///   up to that point stay in `output`.
fn decode_stored_into(
    bits: &mut FastBits,
    output: &mut [u8],
    mut out_pos: usize,
) -> io::Result<usize> {
    bits.align();
    bits.refill();

    let len = bits.read(16) as usize;
    let nlen = bits.read(16) as usize;

    let expected_len = !nlen & 0xFFFF;
    if len != expected_len {
        return Err(io::Error::new(
            io::ErrorKind::InvalidData,
            "Stored block length mismatch",
        ));
    }

    let mut remaining = len;
    while remaining > 0 {
        // Check for room first so no input byte is consumed on overflow.
        let Some(slot) = output.get_mut(out_pos) else {
            return Err(io::Error::new(
                io::ErrorKind::WriteZero,
                "Output buffer full",
            ));
        };
        bits.ensure(8);
        *slot = bits.read(8) as u8;
        out_pos += 1;
        remaining -= 1;
    }

    Ok(out_pos)
}

/// Code lengths of the fixed literal/length Huffman code (RFC 1951):
/// symbols 0-143 use 8 bits, 144-255 use 9, 256-279 use 7 and 280-287 use 8.
fn get_fixed_lit_len_lens() -> [u8; 288] {
    // Start from 8 bits everywhere, then overwrite the two ranges that differ.
    let mut lens = [8u8; 288];
    lens[144..256].fill(9);
    lens[256..280].fill(7);
    lens
}

/// Shared decode tables for fixed Huffman blocks.
///
/// Returns `(literal/length table, distance table, combined LUT)`. Each table
/// is built the first time it is needed and reused afterwards.
fn get_fixed_tables() -> (
    &'static TwoLevelTable,
    &'static TwoLevelTable,
    &'static CombinedLUT,
) {
    use std::sync::OnceLock;

    // Builders, each run at most once. All 32 fixed distance codes are 5 bits long.
    fn build_lit_len() -> TwoLevelTable {
        TwoLevelTable::build(&get_fixed_lit_len_lens()).unwrap()
    }

    fn build_dist() -> TwoLevelTable {
        let lens = [5u8; 32];
        TwoLevelTable::build(&lens).unwrap()
    }

    fn build_combined() -> CombinedLUT {
        let dist_lens = vec![5u8; 32];
        CombinedLUT::build(&get_fixed_lit_len_lens(), &dist_lens).unwrap()
    }

    static FIXED_LIT_LEN: OnceLock<TwoLevelTable> = OnceLock::new();
    static FIXED_DIST: OnceLock<TwoLevelTable> = OnceLock::new();
    static FIXED_COMBINED: OnceLock<CombinedLUT> = OnceLock::new();

    (
        FIXED_LIT_LEN.get_or_init(build_lit_len),
        FIXED_DIST.get_or_init(build_dist),
        FIXED_COMBINED.get_or_init(build_combined),
    )
}

/// Shared decode tables for fixed Huffman blocks, with a `PackedLUT` for the
/// turbo decode path.
///
/// Returns `(literal/length table, distance table, packed LUT)`. Each table is
/// built the first time it is needed and reused afterwards.
#[allow(dead_code)]
fn get_fixed_tables_turbo() -> (
    &'static TwoLevelTable,
    &'static TwoLevelTable,
    &'static PackedLUT,
) {
    use std::sync::OnceLock;

    // Builders, each run at most once. All 32 fixed distance codes are 5 bits long.
    fn build_lit_len() -> TwoLevelTable {
        TwoLevelTable::build(&get_fixed_lit_len_lens()).unwrap()
    }

    fn build_dist() -> TwoLevelTable {
        let lens = [5u8; 32];
        TwoLevelTable::build(&lens).unwrap()
    }

    fn build_packed() -> PackedLUT {
        let dist_lens = vec![5u8; 32];
        PackedLUT::build(&get_fixed_lit_len_lens(), &dist_lens).unwrap()
    }

    static FIXED_LIT_LEN: OnceLock<TwoLevelTable> = OnceLock::new();
    static FIXED_DIST: OnceLock<TwoLevelTable> = OnceLock::new();
    static FIXED_PACKED: OnceLock<PackedLUT> = OnceLock::new();

    (
        FIXED_LIT_LEN.get_or_init(build_lit_len),
        FIXED_DIST.get_or_init(build_dist),
        FIXED_PACKED.get_or_init(build_packed),
    )
}

/// Decode one fixed-Huffman block into `output`, starting at `out_pos`.
///
/// Fetches the shared fixed tables and hands off to `decode_huffman_into`,
/// whose result is returned unchanged.
fn decode_fixed_into(bits: &mut FastBits, output: &mut [u8], out_pos: usize) -> io::Result<usize> {
    // Tuple layout: (literal/length table, distance table, combined LUT).
    let fixed = get_fixed_tables();
    decode_huffman_into(bits, output, out_pos, fixed.2, fixed.0, fixed.1)
}

/// Decode one dynamic-Huffman block into `output`, starting at `out_pos`.
///
/// Reads the block header (HLIT, HDIST, HCLEN), the code-length code, and the
/// run-length encoded literal/length and distance code lengths, then builds
/// the decode tables. If the longest literal/length code is 1-6 bits and a
/// multi-symbol table can be built, decoding goes through
/// `decode_huffman_multi_sym`; otherwise through `decode_huffman_into`.
///
/// Returns whatever the selected decode loop returns.
///
/// # Errors
///
/// `InvalidData` for an undecodable code-length symbol, a "repeat previous"
/// symbol with nothing before it, or an unknown symbol; plus any error from
/// table construction or from the decode loop.
fn decode_dynamic_into(
    bits: &mut FastBits,
    output: &mut [u8],
    out_pos: usize,
) -> io::Result<usize> {
    // Shorthand for the header-parsing errors below.
    fn invalid(msg: &'static str) -> io::Error {
        io::Error::new(io::ErrorKind::InvalidData, msg)
    }

    bits.ensure(16);
    let hlit = bits.read(5) as usize + 257;
    let hdist = bits.read(5) as usize + 1;
    let hclen = bits.read(4) as usize + 4;

    // Code-length code lengths arrive in the permuted CODE_LENGTH_ORDER.
    let mut code_len_lens = [0u8; 19];
    for &slot in &CODE_LENGTH_ORDER[..hclen] {
        bits.ensure(8);
        code_len_lens[slot as usize] = bits.read(3) as u8;
    }

    let code_len_table = TwoLevelTable::build(&code_len_lens)?;

    // Literal/length and distance code lengths share one run-length coded sequence.
    let total_codes = hlit + hdist;
    let mut code_lens = vec![0u8; total_codes];
    let mut filled = 0;

    while filled < total_codes {
        bits.ensure(16);
        let (symbol, sym_len) = code_len_table.decode(bits.buffer());
        if sym_len == 0 {
            return Err(invalid("Invalid code length code"));
        }
        bits.consume(sym_len);

        // Runs are clamped to the space that is left.
        let room = total_codes - filled;
        match symbol {
            0..=15 => {
                code_lens[filled] = symbol as u8;
                filled += 1;
            }
            16 => {
                // Repeat the previous length 3-6 times.
                if filled == 0 {
                    return Err(invalid("Invalid repeat"));
                }
                let run = (3 + bits.read(2) as usize).min(room);
                let previous = code_lens[filled - 1];
                code_lens[filled..filled + run].fill(previous);
                filled += run;
            }
            // 17 / 18: runs of zeros; the buffer is already zeroed, so just skip.
            17 => filled += (3 + bits.read(3) as usize).min(room),
            18 => filled += (11 + bits.read(7) as usize).min(room),
            _ => return Err(invalid("Invalid code")),
        }
    }

    let (lit_len_lens, dist_lens) = code_lens.split_at(hlit);
    let lit_len_table = TwoLevelTable::build(lit_len_lens)?;
    let dist_table = TwoLevelTable::build(dist_lens)?;
    let combined_lut = CombinedLUT::build(lit_len_lens, dist_lens)?;

    // Check if codes are short enough for multi-symbol optimization
    // If max code length <= 6, we can fit 2 symbols in 12 bits
    let longest_lit_len = lit_len_lens.iter().copied().max().unwrap_or(0);
    let use_multi_sym = (1..=6).contains(&longest_lit_len);

    #[cfg(any(test, feature = "counters"))]
    hot_counters::inc_dynamic();

    if use_multi_sym {
        // Try multi-symbol decode for literal-heavy blocks
        if let Ok(multi_sym_table) =
            crate::decompress::simd_huffman::MultiSymTable::build(lit_len_lens)
        {
            #[cfg(any(test, feature = "counters"))]
            hot_counters::inc_multi_sym();
            return decode_huffman_multi_sym(
                bits,
                output,
                out_pos,
                &multi_sym_table,
                &combined_lut,
                &lit_len_table,
                &dist_table,
            );
        }
    }

    #[cfg(any(test, feature = "counters"))]
    hot_counters::inc_standard();

    decode_huffman_into(
        bits,
        output,
        out_pos,
        &combined_lut,
        &lit_len_table,
        &dist_table,
    )
}

/// Decode a Huffman block using the multi-symbol table for runs of literals.
///
/// Each iteration looks up the bit buffer in `multi_sym_table`:
/// * an entry carrying literals writes them (up to four) in one step;
/// * an entry without literals but with a bit length is end-of-block or a
///   length code, which is followed by a distance decode and a match copy;
/// * an entry with no bit length falls back to `lit_len_table`.
///
/// `_combined_lut` is accepted but not used.
///
/// Returns the output position after the end-of-block symbol.
///
/// # Errors
///
/// * `WriteZero` - a literal does not fit in `output`.
/// * `InvalidData` - an undecodable Huffman code, or an invalid length code,
///   distance code or distance.
fn decode_huffman_multi_sym(
    bits: &mut FastBits,
    output: &mut [u8],
    mut out_pos: usize,
    multi_sym_table: &crate::decompress::simd_huffman::MultiSymTable,
    _combined_lut: &CombinedLUT,
    lit_len_table: &TwoLevelTable,
    dist_table: &TwoLevelTable,
) -> io::Result<usize> {
    fn buffer_full() -> io::Error {
        io::Error::new(io::ErrorKind::WriteZero, "Output buffer full")
    }

    fn invalid(msg: &'static str) -> io::Error {
        io::Error::new(io::ErrorKind::InvalidData, msg)
    }

    // Finish decoding a match whose length symbol has already been consumed:
    // read the length extra bits, decode the distance, validate it and copy.
    // `len_idx` is the length symbol minus 257. Returns the new output position.
    fn copy_back_reference(
        bits: &mut FastBits,
        output: &mut [u8],
        out_pos: usize,
        len_idx: usize,
        dist_table: &TwoLevelTable,
    ) -> io::Result<usize> {
        use crate::decompress::inflate_tables::{
            DIST_EXTRA_BITS, DIST_START, LEN_EXTRA_BITS, LEN_START,
        };

        if len_idx >= 29 {
            return Err(invalid("Invalid length code"));
        }

        bits.ensure(16);
        let length =
            LEN_START[len_idx] as usize + bits.read(LEN_EXTRA_BITS[len_idx] as u32) as usize;

        let (dist_sym, dist_len) = dist_table.decode(bits.buffer());
        if dist_len == 0 || dist_sym >= 30 {
            return Err(invalid("Invalid distance code"));
        }
        bits.consume(dist_len);

        bits.ensure(16);
        let dist_idx = dist_sym as usize;
        let distance =
            DIST_START[dist_idx] as usize + bits.read(DIST_EXTRA_BITS[dist_idx] as u32) as usize;

        // A match may not reach back before the start of the output.
        if distance > out_pos || distance == 0 {
            return Err(invalid("Invalid distance"));
        }

        Ok(copy_match_into(output, out_pos, distance, length))
    }

    loop {
        bits.ensure(32);

        // Try multi-symbol decode first
        let entry = multi_sym_table.lookup(bits.buffer());

        if entry.sym_count > 0 && entry.total_bits > 0 {
            // Literal run: the bits are consumed first, then all symbols are written.
            bits.consume(entry.total_bits as u32);

            let symbols = [entry.sym1, entry.sym2, entry.sym3, entry.sym4];
            let count = entry.sym_count as usize;
            // Counts above four write nothing, but their bits are still consumed.
            if count <= symbols.len() {
                let Some(dest) = output.get_mut(out_pos..out_pos + count) else {
                    return Err(buffer_full());
                };
                dest.copy_from_slice(&symbols[..count]);
                out_pos += count;
            }
            continue;
        }

        if entry.total_bits > 0 {
            // Table hit without literals: end of block or a length code.
            bits.consume(entry.total_bits as u32);
            let symbol = entry.symbol();

            if symbol == 256 {
                // End of block
                break;
            }

            let len_idx = (symbol - 257) as usize;
            out_pos = copy_back_reference(bits, output, out_pos, len_idx, dist_table)?;
            continue;
        }

        // Fall back to regular decode for long codes
        let (symbol, code_len) = lit_len_table.decode(bits.buffer());
        if code_len == 0 {
            return Err(invalid("Invalid Huffman code"));
        }
        bits.consume(code_len);

        if symbol == 256 {
            break;
        }

        if symbol < 256 {
            let Some(slot) = output.get_mut(out_pos) else {
                return Err(buffer_full());
            };
            *slot = symbol as u8;
            out_pos += 1;
        } else {
            // Length code
            let len_idx = (symbol - 257) as usize;
            out_pos = copy_back_reference(bits, output, out_pos, len_idx, dist_table)?;
        }
    }

    Ok(out_pos)
}

/// Decodes the symbols of one Huffman-coded block into `output`, starting at `out_pos`.
///
/// Every iteration looks the next code up in `combined_lut`. An entry whose
/// `bits_to_skip` is zero is decoded again through `lit_len_table`; a distance
/// that is not already carried by the entry is decoded through `dist_table`.
/// Decoding stops at the end-of-block symbol.
///
/// Returns the output position just past the last byte produced.
///
/// # Errors
///
/// * `InvalidData` for an undecodable literal/length or distance code, an
///   out-of-range length or distance symbol, or a distance that is zero or
///   larger than the number of bytes already produced.
/// * `WriteZero` when a literal must be stored but `output` is already full.
fn decode_huffman_into(
    bits: &mut FastBits,
    output: &mut [u8],
    mut out_pos: usize,
    combined_lut: &CombinedLUT,
    lit_len_table: &TwoLevelTable,
    dist_table: &TwoLevelTable,
) -> io::Result<usize> {
    use crate::decompress::combined_lut::{DIST_END_OF_BLOCK, DIST_LITERAL, DIST_SLOW_PATH};
    use crate::decompress::inflate_tables::{
        DIST_EXTRA_BITS, DIST_START, LEN_EXTRA_BITS, LEN_START,
    };

    // Stable stand-ins for likely/unlikely: routing the unexpected outcome
    // through a #[cold] call tells the compiler which side is the hot one.
    #[cold]
    #[inline(never)]
    fn cold_path() {}

    #[inline(always)]
    fn likely(cond: bool) -> bool {
        if !cond {
            cold_path();
        }
        cond
    }

    #[inline(always)]
    fn unlikely(cond: bool) -> bool {
        if cond {
            cold_path();
        }
        cond
    }

    // Asks the CPU to pull the output bytes 64 positions ahead of `at` into
    // cache. Only implemented for x86_64; a no-op on every other target.
    #[inline(always)]
    #[allow(unused_variables)]
    fn prefetch_output(buf: &[u8], at: usize) {
        #[cfg(target_arch = "x86_64")]
        if at + 64 < buf.len() {
            // SAFETY: `at + 64` was just checked to lie inside `buf`, and a
            // prefetch is only a hint - it never dereferences the pointer.
            unsafe {
                std::arch::x86_64::_mm_prefetch(
                    buf.as_ptr().add(at + 64) as *const i8,
                    std::arch::x86_64::_MM_HINT_T0,
                );
            }
        }
    }

    // Error constructors, kept out of line so the hot loop stays compact.
    #[cold]
    fn corrupt(msg: &'static str) -> io::Error {
        io::Error::new(io::ErrorKind::InvalidData, msg)
    }

    #[cold]
    fn output_full() -> io::Error {
        io::Error::new(io::ErrorKind::WriteZero, "Output buffer full")
    }

    // Decodes a distance code plus its extra bits via `dist_table`, validates
    // the resulting distance and performs the match copy. Returns the output
    // position after the copy.
    #[inline(always)]
    fn copy_with_decoded_distance(
        bits: &mut FastBits,
        output: &mut [u8],
        out_pos: usize,
        dist_table: &TwoLevelTable,
        length: usize,
    ) -> io::Result<usize> {
        let (dist_sym, dist_len) = dist_table.decode(bits.buffer());
        if unlikely(dist_len == 0 || dist_sym >= 30) {
            return Err(corrupt("Invalid distance code"));
        }
        bits.consume(dist_len);

        bits.ensure(16);
        let slot = dist_sym as usize;
        let base = DIST_START[slot] as usize;
        let extra = bits.read(DIST_EXTRA_BITS[slot] as u32) as usize;
        let distance = base + extra;

        if unlikely(distance > out_pos || distance == 0) {
            return Err(corrupt("Invalid distance"));
        }

        Ok(copy_match_into(output, out_pos, distance, length))
    }

    loop {
        bits.ensure(32);
        prefetch_output(output, out_pos);

        let entry = combined_lut.decode(bits.buffer());

        // A zero skip count means the combined table could not resolve this
        // code; decode it through the two-level literal/length table instead.
        if unlikely(entry.bits_to_skip == 0) {
            let (symbol, code_len) = lit_len_table.decode(bits.buffer());
            if code_len == 0 {
                return Err(corrupt("Invalid Huffman code"));
            }
            bits.consume(code_len);

            if likely(symbol < 256) {
                // Plain literal byte.
                if unlikely(out_pos >= output.len()) {
                    return Err(output_full());
                }
                output[out_pos] = symbol as u8;
                out_pos += 1;
            } else if unlikely(symbol == 256) {
                // End of block.
                return Ok(out_pos);
            } else {
                // Length symbol: base length plus extra bits, then a distance.
                let len_idx = (symbol - 257) as usize;
                if unlikely(len_idx >= 29) {
                    return Err(corrupt("Invalid length code"));
                }

                bits.ensure(16);
                let base = LEN_START[len_idx] as usize;
                let extra = bits.read(LEN_EXTRA_BITS[len_idx] as u32) as usize;

                out_pos =
                    copy_with_decoded_distance(bits, output, out_pos, dist_table, base + extra)?;
            }
            continue;
        }

        bits.consume(entry.bits_to_skip as u32);

        match entry.distance {
            DIST_LITERAL => {
                if unlikely(out_pos >= output.len()) {
                    return Err(output_full());
                }
                output[out_pos] = entry.symbol_or_length;
                out_pos += 1;

                // Keep emitting literals without going back through the outer
                // loop for as long as at least 12 bits are buffered and at
                // least 8 output bytes remain.
                loop {
                    let can_continue =
                        bits.bits_available() >= 12 && out_pos + 8 <= output.len();
                    if !likely(can_continue) {
                        break;
                    }

                    let next = combined_lut.decode(bits.buffer());
                    let next_is_literal = next.bits_to_skip != 0 && next.distance == DIST_LITERAL;
                    if !next_is_literal {
                        // Left unconsumed; the outer loop decodes it again.
                        break;
                    }

                    bits.consume(next.bits_to_skip as u32);
                    output[out_pos] = next.symbol_or_length;
                    out_pos += 1;
                }
            }

            DIST_END_OF_BLOCK => return Ok(out_pos),

            DIST_SLOW_PATH => {
                // The entry carries `length - 3`; the distance still has to be
                // decoded from the bit stream.
                let length = entry.symbol_or_length as usize + 3;
                out_pos = copy_with_decoded_distance(bits, output, out_pos, dist_table, length)?;
            }

            raw_distance => {
                // Length and distance are both carried by the table entry.
                let length = entry.length();
                let dist = raw_distance as usize;

                if dist > out_pos || dist == 0 {
                    return Err(corrupt("Invalid distance"));
                }

                out_pos = copy_match_into(output, out_pos, dist, length);
            }
        }
    }
}

/// Consume-first decode loop using ConsumeFirstTable
///
/// This is the key optimization: CONSUME bits BEFORE checking entry type.
/// Benchmarks show 39.8% speedup over check-first pattern.
///
/// Decodes symbols into `output` starting at `out_pos` until an end-of-block
/// entry is found and returns the output position just past the last byte
/// produced. A fast loop runs while more than 320 output bytes remain; the
/// generic loop that follows checks the remaining space before every write.
///
/// # Errors
///
/// * `InvalidData` for a length symbol outside `257..=285`, a distance symbol
///   of 30 or more, a distance that is zero or larger than the number of bytes
///   already produced, or when the iteration guard trips.
/// * `WriteZero` when the generic loop runs out of room in `output`.
///
/// Every match path validates its length and distance symbols before using
/// them as table indices, so corrupt input is reported as an error.
///
/// NOTE(review): in the fast loop an entry that is neither a literal,
/// end-of-block nor a length is consumed and then ignored - confirm whether
/// such entries can occur and whether they should be rejected.
#[allow(dead_code)]
#[inline(never)]
fn decode_huffman_consume_first(
    bits: &mut crate::decompress::two_level_table::TurboBits,
    output: &mut [u8],
    mut out_pos: usize,
    lit_table: &crate::decompress::inflate::consume_first_table::ConsumeFirstTable,
    dist_table: &crate::decompress::inflate::consume_first_table::ConsumeFirstTable,
) -> io::Result<usize> {
    use crate::decompress::inflate::consume_first_table::CFEntry;
    use crate::decompress::inflate_tables::{
        DIST_EXTRA_BITS, DIST_START, LEN_EXTRA_BITS, LEN_START,
    };

    let out_end = output.len();
    // The fast loop only runs while more than 320 output bytes remain.
    let fastloop_end = out_end.saturating_sub(320);

    // DEBUG: Track iterations to detect infinite loops
    let mut iterations = 0u64;
    let max_iterations = (out_end as u64 * 2).max(100_000); // Reasonable limit

    // Helper to resolve entry (handles subtables). The bits of the main entry
    // are consumed before its kind is inspected; a subtable entry triggers a
    // second lookup whose bits are consumed as well.
    #[inline(always)]
    fn resolve_entry(
        bits: &mut crate::decompress::two_level_table::TurboBits,
        table: &crate::decompress::inflate::consume_first_table::ConsumeFirstTable,
    ) -> CFEntry {
        let entry = table.lookup_main(bits.buffer());
        bits.consume(entry.bits());

        if entry.is_subtable() {
            let sub_entry = table.lookup_sub(entry, bits.buffer());
            bits.consume(sub_entry.bits());
            sub_entry
        } else {
            entry
        }
    }

    // Completes a fast-loop match whose length entry has already been
    // resolved: validates the length symbol, reads the length extra bits,
    // resolves and validates the distance, then copies the match. Returns the
    // output position after the copy.
    #[inline(always)]
    fn copy_fastloop_match(
        bits: &mut crate::decompress::two_level_table::TurboBits,
        dist_table: &crate::decompress::inflate::consume_first_table::ConsumeFirstTable,
        output: &mut [u8],
        out_pos: usize,
        len_entry: CFEntry,
    ) -> io::Result<usize> {
        let len_sym = len_entry.symbol();
        if !(257..=285).contains(&len_sym) {
            return Err(io::Error::new(
                io::ErrorKind::InvalidData,
                format!("Invalid length symbol: {} at out_pos={}", len_sym, out_pos),
            ));
        }
        let len_idx = (len_sym - 257) as usize;
        bits.ensure(16);
        let length =
            LEN_START[len_idx] as usize + bits.read(LEN_EXTRA_BITS[len_idx] as u32) as usize;

        let d = resolve_entry(bits, dist_table);
        let dist_sym = d.symbol();
        if dist_sym >= 30 {
            return Err(io::Error::new(
                io::ErrorKind::InvalidData,
                format!(
                    "Invalid distance symbol: {} at out_pos={}",
                    dist_sym, out_pos
                ),
            ));
        }
        bits.ensure(16);
        let distance = DIST_START[dist_sym as usize] as usize
            + bits.read(DIST_EXTRA_BITS[dist_sym as usize] as u32) as usize;

        if distance == 0 || distance > out_pos {
            return Err(io::Error::new(
                io::ErrorKind::InvalidData,
                format!("Invalid distance: {} at out_pos={}", distance, out_pos),
            ));
        }

        Ok(copy_match_into(output, out_pos, distance, length))
    }

    // === FASTLOOP with consume-first pattern ===
    while out_pos < fastloop_end {
        iterations += 1;
        if iterations > max_iterations {
            return Err(io::Error::new(
                io::ErrorKind::InvalidData,
                format!("Infinite loop detected at out_pos={}", out_pos),
            ));
        }

        bits.ensure(56);

        let entry = resolve_entry(bits, lit_table);

        if entry.is_literal() {
            output[out_pos] = entry.symbol() as u8;
            out_pos += 1;

            // Try 2 more literals
            bits.ensure(32);
            let e2 = resolve_entry(bits, lit_table);
            if e2.is_literal() {
                output[out_pos] = e2.symbol() as u8;
                out_pos += 1;

                let e3 = resolve_entry(bits, lit_table);
                if e3.is_literal() {
                    output[out_pos] = e3.symbol() as u8;
                    out_pos += 1;

                    // Tight literal loop
                    while bits.has_bits(24) {
                        let e = resolve_entry(bits, lit_table);
                        if e.is_literal() {
                            output[out_pos] = e.symbol() as u8;
                            out_pos += 1;
                        } else if e.is_eob() {
                            return Ok(out_pos);
                        } else if e.is_length() {
                            // Validate before indexing the length tables.
                            let len_sym = e.symbol();
                            if !(257..=285).contains(&len_sym) {
                                return Err(io::Error::new(
                                    io::ErrorKind::InvalidData,
                                    "Invalid length symbol",
                                ));
                            }
                            let len_idx = (len_sym - 257) as usize;
                            bits.ensure(16);
                            let length = LEN_START[len_idx] as usize
                                + bits.read(LEN_EXTRA_BITS[len_idx] as u32) as usize;

                            let d = resolve_entry(bits, dist_table);
                            let dist_sym = d.symbol();
                            if dist_sym >= 30 {
                                return Err(io::Error::new(
                                    io::ErrorKind::InvalidData,
                                    "Invalid distance symbol",
                                ));
                            }
                            bits.ensure(16);
                            let distance = DIST_START[dist_sym as usize] as usize
                                + bits.read(DIST_EXTRA_BITS[dist_sym as usize] as u32) as usize;

                            if distance == 0 || distance > out_pos {
                                return Err(io::Error::new(
                                    io::ErrorKind::InvalidData,
                                    "Invalid distance",
                                ));
                            }
                            out_pos = copy_match_into(output, out_pos, distance, length);
                            break;
                        } else {
                            break;
                        }
                    }
                    continue;
                }
                // e3 was not literal
                if e3.is_eob() {
                    return Ok(out_pos);
                }
                if e3.is_length() {
                    out_pos = copy_fastloop_match(bits, dist_table, output, out_pos, e3)?;
                }
                continue;
            }
            // e2 was not literal
            if e2.is_eob() {
                return Ok(out_pos);
            }
            if e2.is_length() {
                out_pos = copy_fastloop_match(bits, dist_table, output, out_pos, e2)?;
            }
            continue;
        }

        if entry.is_eob() {
            return Ok(out_pos);
        }

        // Length code
        if entry.is_length() {
            out_pos = copy_fastloop_match(bits, dist_table, output, out_pos, entry)?;
        }
    }

    // === GENERIC LOOP (near end of output) ===
    loop {
        iterations += 1;
        if iterations > max_iterations {
            return Err(io::Error::new(
                io::ErrorKind::InvalidData,
                format!("Infinite loop in generic at out_pos={}", out_pos),
            ));
        }

        bits.ensure(32);

        let entry = resolve_entry(bits, lit_table);

        if entry.is_literal() {
            if out_pos >= out_end {
                return Err(io::Error::new(io::ErrorKind::WriteZero, "Output full"));
            }
            output[out_pos] = entry.symbol() as u8;
            out_pos += 1;
            continue;
        }

        if entry.is_eob() {
            return Ok(out_pos);
        }

        // Length code
        let len_symbol = entry.symbol();
        if !(257..=285).contains(&len_symbol) {
            return Err(io::Error::new(
                io::ErrorKind::InvalidData,
                format!("Invalid length code: {}", len_symbol),
            ));
        }

        let len_idx = (len_symbol - 257) as usize;
        bits.ensure(16);
        let length =
            LEN_START[len_idx] as usize + bits.read(LEN_EXTRA_BITS[len_idx] as u32) as usize;

        let d = resolve_entry(bits, dist_table);
        let dist_sym = d.symbol();
        // Validate before indexing the distance tables.
        if dist_sym >= 30 {
            return Err(io::Error::new(
                io::ErrorKind::InvalidData,
                format!(
                    "Invalid distance symbol: {} at out_pos={}",
                    dist_sym, out_pos
                ),
            ));
        }
        bits.ensure(16);
        let distance = DIST_START[dist_sym as usize] as usize
            + bits.read(DIST_EXTRA_BITS[dist_sym as usize] as u32) as usize;

        if distance == 0 || distance > out_pos {
            return Err(io::Error::new(
                io::ErrorKind::InvalidData,
                format!("Invalid distance: {} at out_pos={}", distance, out_pos),
            ));
        }
        if out_pos + length > out_end {
            return Err(io::Error::new(io::ErrorKind::WriteZero, "Output full"));
        }
        out_pos = copy_match_into(output, out_pos, distance, length);
    }
}

/// Turbo decode loop with the Phase 1 optimizations from OPTIMIZATION_ROADMAP.md
///
/// Decodes symbols into `output` starting at `out_pos` until an end-of-block
/// code is found and returns the output position just past the last byte
/// produced.
///
/// Techniques used by the fast loop:
/// 1. Table entries are consumed with `bits.consume_entry(entry)`, which is
///    handed the whole 32-bit entry.
/// 2. While a run of literals is emitted, the next table entry is loaded
///    before the current literal is stored.
/// 3. Refills go through `TurboBits::ensure`.
/// 4. Matches carried entirely by the table entry are copied in 8-byte words
///    for their first 40 bytes (distance >= 8 only).
///
/// Packed entry layout, as used by this function: the low 8 bits are zero when
/// the 12-bit table could not resolve the code (presumably they hold the code
/// length otherwise - TODO confirm against `PackedLUT`); a set sign bit marks
/// a literal; bits 23 and up hold the literal byte or `length - 3`; bits 8-22
/// hold the distance, with `0x7FFF` meaning end-of-block and `0x7FFE` meaning
/// "decode the distance from the bit stream".
///
/// # Errors
///
/// * `InvalidData` for an undecodable code, a length symbol index of 29 or
///   more, a distance symbol of 30 or more, or a distance that is zero or
///   larger than the number of bytes already produced.
/// * `WriteZero` when a literal must be stored but `output` is already full.
///
/// Length and distance symbols are validated before they index the lookup
/// tables, so corrupt input is reported as an error.
#[allow(dead_code)]
#[inline(never)]
fn decode_huffman_turbo(
    bits: &mut crate::decompress::two_level_table::TurboBits,
    output: &mut [u8],
    mut out_pos: usize,
    packed_lut: &crate::decompress::packed_lut::PackedLUT,
    lit_len_table: &TwoLevelTable,
    dist_table: &TwoLevelTable,
) -> io::Result<usize> {
    use crate::decompress::inflate_tables::{
        DIST_EXTRA_BITS, DIST_START, LEN_EXTRA_BITS, LEN_START,
    };

    // Entry format constants
    const BITS_MASK: u32 = 0xFF;
    const SYMBOL_SHIFT: u32 = 23;
    const DIST_SHIFT: u32 = 8;
    const DIST_MASK: u32 = 0x7FFF << DIST_SHIFT;
    const DIST_EOB: u32 = 0x7FFF << DIST_SHIFT;
    const DIST_SLOW: u32 = 0x7FFE << DIST_SHIFT;
    const LUT_MASK: u64 = 0xFFF;

    // True for a resolved literal entry: sign bit set and a non-zero low byte.
    #[inline(always)]
    fn is_literal(entry: u32) -> bool {
        (entry as i32) < 0 && (entry & BITS_MASK) != 0
    }

    // Decodes a distance code plus its extra bits via `dist_table`, validates
    // the result and copies the match. Returns the output position after the
    // copy.
    #[inline(always)]
    fn copy_with_decoded_distance(
        bits: &mut crate::decompress::two_level_table::TurboBits,
        dist_table: &TwoLevelTable,
        output: &mut [u8],
        out_pos: usize,
        length: usize,
    ) -> io::Result<usize> {
        let (dist_sym, dist_len) = dist_table.decode(bits.buffer());
        // Symbols of 30 and above are rejected before indexing the tables.
        if dist_len == 0 || dist_sym >= 30 {
            return Err(io::Error::new(io::ErrorKind::InvalidData, "Invalid dist"));
        }
        bits.consume(dist_len);
        bits.ensure(16);
        let distance = DIST_START[dist_sym as usize] as usize
            + bits.read(DIST_EXTRA_BITS[dist_sym as usize] as u32) as usize;

        if distance == 0 || distance > out_pos {
            return Err(io::Error::new(io::ErrorKind::InvalidData, "Bad dist"));
        }

        Ok(copy_match_into(output, out_pos, distance, length))
    }

    // Fallback for codes the packed table could not resolve: decodes one
    // symbol through `lit_len_table` and applies it. Returns `Some(new_pos)`
    // after a literal or match and `None` at end-of-block.
    #[inline(always)]
    fn decode_long_code(
        bits: &mut crate::decompress::two_level_table::TurboBits,
        lit_len_table: &TwoLevelTable,
        dist_table: &TwoLevelTable,
        output: &mut [u8],
        out_pos: usize,
    ) -> io::Result<Option<usize>> {
        let (symbol, code_len) = lit_len_table.decode(bits.buffer());
        if code_len == 0 {
            return Err(io::Error::new(io::ErrorKind::InvalidData, "Invalid code"));
        }
        bits.consume(code_len);

        if symbol < 256 {
            if out_pos >= output.len() {
                return Err(io::Error::new(io::ErrorKind::WriteZero, "Output full"));
            }
            output[out_pos] = symbol as u8;
            return Ok(Some(out_pos + 1));
        }
        if symbol == 256 {
            return Ok(None);
        }

        // Length symbol: reject indices of 29 and above before indexing.
        let len_idx = (symbol - 257) as usize;
        if len_idx >= 29 {
            return Err(io::Error::new(
                io::ErrorKind::InvalidData,
                "Invalid length code",
            ));
        }
        bits.ensure(16);
        let length =
            LEN_START[len_idx] as usize + bits.read(LEN_EXTRA_BITS[len_idx] as u32) as usize;

        copy_with_decoded_distance(bits, dist_table, output, out_pos, length).map(Some)
    }

    // Applies a resolved, already consumed, non-literal table entry. Returns
    // `Some(new_pos)` after a match copy and `None` at end-of-block.
    #[inline(always)]
    fn copy_table_match(
        bits: &mut crate::decompress::two_level_table::TurboBits,
        dist_table: &TwoLevelTable,
        output: &mut [u8],
        out_pos: usize,
        entry: u32,
    ) -> io::Result<Option<usize>> {
        let dist_field = entry & DIST_MASK;
        if dist_field == DIST_EOB {
            return Ok(None);
        }

        let length = ((entry >> SYMBOL_SHIFT) & 0xFF) as usize + 3;
        if dist_field == DIST_SLOW {
            // Distance still has to be decoded from the bit stream.
            return copy_with_decoded_distance(bits, dist_table, output, out_pos, length)
                .map(Some);
        }

        // Pre-computed match: the distance is carried by the entry itself.
        let distance = (dist_field >> DIST_SHIFT) as usize;
        if distance == 0 || distance > out_pos {
            return Err(io::Error::new(io::ErrorKind::InvalidData, "Bad dist"));
        }
        Ok(Some(copy_match_into(output, out_pos, distance, length)))
    }

    let out_end = output.len();
    // Fastloop margin: 258 (max match) + 40 (unconditional copy overrun) + safety
    let fastloop_end = out_end.saturating_sub(320);
    let table = &packed_lut.table;

    // === FASTLOOP with entry preloading ===
    while out_pos < fastloop_end {
        bits.ensure(56);

        // Load first entry
        let mut entry = table[(bits.buffer() & LUT_MASK) as usize].0;

        // === LITERAL CHAIN (up to 3 literals, then a tight loop) ===
        if is_literal(entry) {
            // First literal - consume and PRELOAD next entry
            bits.consume_entry(entry);
            let lit1 = ((entry >> SYMBOL_SHIFT) & 0xFF) as u8;
            entry = table[(bits.buffer() & LUT_MASK) as usize].0; // PRELOAD

            output[out_pos] = lit1;
            out_pos += 1;

            // Second literal check (entry already preloaded)
            if is_literal(entry) {
                bits.consume_entry(entry);
                let lit2 = ((entry >> SYMBOL_SHIFT) & 0xFF) as u8;
                entry = table[(bits.buffer() & LUT_MASK) as usize].0; // PRELOAD

                output[out_pos] = lit2;
                out_pos += 1;

                // Third literal check (entry already preloaded)
                if is_literal(entry) {
                    bits.consume_entry(entry);
                    let lit3 = ((entry >> SYMBOL_SHIFT) & 0xFF) as u8;

                    output[out_pos] = lit3;
                    out_pos += 1;

                    // Tight literal loop for runs longer than 3: the next
                    // entry is loaded before the current literal is stored.
                    let mut e = table[(bits.buffer() & LUT_MASK) as usize].0;
                    while bits.has_bits(24) && is_literal(e) {
                        bits.consume_entry(e);
                        let lit = ((e >> SYMBOL_SHIFT) & 0xFF) as u8;
                        e = table[(bits.buffer() & LUT_MASK) as usize].0;
                        output[out_pos] = lit;
                        out_pos += 1;
                    }
                    continue;
                }
            }

            // The second or third entry was not a literal; it is already
            // loaded in `entry` but has not been consumed yet.
            let step = if (entry & BITS_MASK) == 0 {
                decode_long_code(bits, lit_len_table, dist_table, output, out_pos)?
            } else {
                bits.consume_entry(entry);
                copy_table_match(bits, dist_table, output, out_pos, entry)?
            };
            match step {
                Some(next_pos) => {
                    out_pos = next_pos;
                    continue;
                }
                None => return Ok(out_pos),
            }
        }

        // Unresolved entry - fall back to the two-level tables
        if (entry & BITS_MASK) == 0 {
            match decode_long_code(bits, lit_len_table, dist_table, output, out_pos)? {
                Some(next_pos) => {
                    out_pos = next_pos;
                    continue;
                }
                None => return Ok(out_pos),
            }
        }

        bits.consume_entry(entry);

        let dist_field = entry & DIST_MASK;

        // EOB
        if dist_field == DIST_EOB {
            return Ok(out_pos);
        }

        let length = ((entry >> SYMBOL_SHIFT) & 0xFF) as usize + 3;

        // Slow path (distance decoded separately)
        if dist_field == DIST_SLOW {
            out_pos = copy_with_decoded_distance(bits, dist_table, output, out_pos, length)?;
            continue;
        }

        // Pre-computed LZ77 match
        let distance = (dist_field >> DIST_SHIFT) as usize;

        // A zero distance would turn the copy below into a silent self-copy.
        if distance == 0 || distance > out_pos {
            return Err(io::Error::new(io::ErrorKind::InvalidData, "Bad dist"));
        }

        let src_start = out_pos - distance;
        if distance >= 8 {
            // Word-wise copy of the first (up to) 40 bytes. The final word may
            // write up to 7 bytes past `out_pos + length`; the fast-loop
            // margin keeps that inside `output`.
            let mut copied = 0;
            while copied < 40 && copied < length {
                let src = src_start + copied;
                let dst = out_pos + copied;
                if dst + 8 <= output.len() && src + 8 <= output.len() {
                    // SAFETY: both 8-byte ranges were just checked to lie
                    // inside `output`; the accesses are unaligned-safe, and
                    // `distance >= 8` means source and destination words do
                    // not overlap.
                    unsafe {
                        let word = (output.as_ptr().add(src) as *const u64).read_unaligned();
                        (output.as_mut_ptr().add(dst) as *mut u64).write_unaligned(word);
                    }
                }
                copied += 8;
            }
            // Anything beyond the first 40 bytes is copied byte by byte.
            for i in 40.min(length)..length {
                output[out_pos + i] = output[src_start + i];
            }
        } else if distance == 1 {
            // Run of a single repeated byte.
            let byte = output[src_start];
            for i in 0..length {
                output[out_pos + i] = byte;
            }
        } else {
            // Short overlapping distances must be copied in order.
            for i in 0..length {
                output[out_pos + i] = output[src_start + i];
            }
        }
        out_pos += length;
    }

    // === SLOWLOOP: With bounds checks ===
    loop {
        bits.ensure(32);

        let entry = table[(bits.buffer() & LUT_MASK) as usize].0;

        if (entry & BITS_MASK) == 0 {
            match decode_long_code(bits, lit_len_table, dist_table, output, out_pos)? {
                Some(next_pos) => {
                    out_pos = next_pos;
                    continue;
                }
                None => return Ok(out_pos),
            }
        }

        bits.consume_entry(entry);

        if (entry as i32) < 0 {
            // Literal
            if out_pos >= out_end {
                return Err(io::Error::new(io::ErrorKind::WriteZero, "Output full"));
            }
            output[out_pos] = ((entry >> SYMBOL_SHIFT) & 0xFF) as u8;
            out_pos += 1;
            continue;
        }

        match copy_table_match(bits, dist_table, output, out_pos, entry)? {
            Some(next_pos) => out_pos = next_pos,
            None => return Ok(out_pos),
        }
    }
}

/// x86_64 fast-path Huffman decode loop, compiled with BMI2 enabled.
///
/// Despite the name, the body is ordinary Rust rather than inline assembly;
/// `#[target_feature(enable = "bmi2")]` only allows the compiler to use BMI2
/// instructions when generating code for it.
///
/// Decoding starts at byte 0 of `compressed` and appends to `output` from
/// `out_pos`. Each symbol is resolved with one 12-bit `packed_lut` lookup:
///
/// * bit 31 set: literal byte stored in bits 23..=30;
/// * distance field `0x7FFF`: end of block;
/// * distance field `0x7FFE`: match whose distance is decoded via `dist_table`;
/// * anything else: match with the distance stored in the entry.
///
/// Returns the output position reached. `Ok` is produced at end of block and
/// also when the loop stops early: `out_pos` came within 320 bytes of the end
/// of `output`, fewer than 12 input bits remain, or the lookup hit an entry
/// whose bit count is zero or larger than the bits available. The return
/// value does not distinguish these cases.
///
/// # Errors
///
/// * `InvalidData` for an undecodable distance code, or a distance that is
///   zero or reaches before the start of `output`.
/// * `UnexpectedEof` when `compressed` ends in the middle of a match.
///
/// # Safety
///
/// The caller MUST verify `is_x86_feature_detected!("bmi2")` before calling,
/// and `packed_lut.table` must hold at least 4096 entries because the 12-bit
/// lookup index is not bounds-checked.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "bmi2")]
#[allow(dead_code)]
#[inline(never)]
unsafe fn decode_huffman_asm_x64(
    compressed: &[u8],
    output: &mut [u8],
    mut out_pos: usize,
    packed_lut: &crate::decompress::packed_lut::PackedLUT,
    dist_table: &TwoLevelTable,
) -> io::Result<usize> {
    use crate::decompress::inflate_tables::{DIST_EXTRA_BITS, DIST_START};

    // Entry format constants
    const LUT_MASK: u64 = 0xFFF;
    const BITS_MASK: u32 = 0xFF;
    const SYMBOL_SHIFT: u32 = 23;
    const DIST_SHIFT: u32 = 8;
    const DIST_MASK: u32 = 0x7FFF << DIST_SHIFT;
    const DIST_EOB: u32 = 0x7FFF << DIST_SHIFT;
    const DIST_SLOW: u32 = 0x7FFE << DIST_SHIFT;

    /// Tops up the bit buffer when it holds fewer than 32 valid bits.
    ///
    /// Loads one little-endian 32-bit word when four bytes remain, otherwise
    /// the last 1-3 bytes one at a time. `bits` always ends up equal to the
    /// number of valid low bits in `bitbuf`, and `pos` advances by exactly
    /// the number of bytes loaded.
    #[inline(always)]
    fn refill(compressed: &[u8], pos: &mut usize, bitbuf: &mut u64, bits: &mut u32) {
        if *bits >= 32 {
            return;
        }
        match compressed.get(*pos..*pos + 4) {
            Some(w) => {
                let word = u32::from_le_bytes([w[0], w[1], w[2], w[3]]);
                *bitbuf |= u64::from(word) << *bits;
                *pos += 4;
                *bits += 32;
            }
            None => {
                // At most three bytes are left, so `bits` stays below 56.
                while let Some(&byte) = compressed.get(*pos) {
                    *bitbuf |= u64::from(byte) << *bits;
                    *pos += 1;
                    *bits += 8;
                }
            }
        }
    }

    let out_end = output.len();
    let fastloop_end = out_end.saturating_sub(320);
    let table = packed_lut.table.as_ptr();

    // Initialize bit buffer
    let mut pos: usize = 0;
    let mut bitbuf: u64 = 0;
    let mut bits: u32 = 0;

    // Initial refill: take a full 64-bit word when the input is long enough.
    if compressed.len() >= 8 {
        // SAFETY: `compressed` has at least 8 readable bytes starting at its
        // base pointer, and `read_unaligned` has no alignment requirement.
        bitbuf = u64::from_le(unsafe { (compressed.as_ptr() as *const u64).read_unaligned() });
        pos = 8;
        bits = 64;
    }

    // === FAST DECODE LOOP ===
    'main: while out_pos < fastloop_end {
        refill(compressed, &mut pos, &mut bitbuf, &mut bits);

        // Not enough bits left for a full 12-bit lookup.
        if bits < 12 {
            break;
        }

        // SAFETY: the index is masked to 12 bits; the caller guarantees the
        // table has at least 4096 entries.
        let entry = unsafe { (*table.add((bitbuf & LUT_MASK) as usize)).0 };
        let entry_bits = entry & BITS_MASK;

        // Zero bit count marks an entry the fast path cannot handle; an entry
        // longer than the available bits cannot be consumed safely either.
        if entry_bits == 0 || entry_bits > bits {
            break;
        }

        // === LITERAL PATH (bit 31 set) ===
        if (entry as i32) < 0 {
            output[out_pos] = ((entry >> SYMBOL_SHIFT) & 0xFF) as u8;
            out_pos += 1;

            bitbuf >>= entry_bits;
            bits -= entry_bits;

            // Keep decoding consecutive literals without refilling.
            while bits >= 12 && out_pos < fastloop_end {
                // SAFETY: same 12-bit index bound as above.
                let e = unsafe { (*table.add((bitbuf & LUT_MASK) as usize)).0 };
                let e_bits = e & BITS_MASK;
                if (e as i32) >= 0 || e_bits == 0 || e_bits > bits {
                    break;
                }
                output[out_pos] = ((e >> SYMBOL_SHIFT) & 0xFF) as u8;
                out_pos += 1;
                bitbuf >>= e_bits;
                bits -= e_bits;
            }
            continue 'main;
        }

        // Non-literal: consume the entry's bits first
        bitbuf >>= entry_bits;
        bits -= entry_bits;

        let dist_field = entry & DIST_MASK;

        // === END OF BLOCK ===
        if dist_field == DIST_EOB {
            return Ok(out_pos);
        }

        // === SLOW PATH (distance decoded separately) ===
        if dist_field == DIST_SLOW {
            let length = ((entry >> SYMBOL_SHIFT) & 0xFF) as usize + 3;

            // Refill for distance decode
            refill(compressed, &mut pos, &mut bitbuf, &mut bits);

            // Decode distance using two-level table
            let (dist_sym, dist_len) = dist_table.decode(bitbuf);
            if dist_len == 0 || dist_sym >= 30 {
                return Err(io::Error::new(
                    io::ErrorKind::InvalidData,
                    "Invalid distance code",
                ));
            }
            if dist_len > bits {
                return Err(io::Error::new(
                    io::ErrorKind::UnexpectedEof,
                    "Truncated distance code",
                ));
            }
            bitbuf >>= dist_len;
            bits -= dist_len;

            // Read distance extra bits
            let extra = DIST_EXTRA_BITS[dist_sym as usize] as u32;
            if bits < extra {
                refill(compressed, &mut pos, &mut bitbuf, &mut bits);
            }
            if bits < extra {
                return Err(io::Error::new(
                    io::ErrorKind::UnexpectedEof,
                    "Truncated distance extra bits",
                ));
            }
            let extra_val = (bitbuf & ((1u64 << extra) - 1)) as usize;
            bitbuf >>= extra;
            bits -= extra;

            let distance = DIST_START[dist_sym as usize] as usize + extra_val;

            if distance == 0 || distance > out_pos {
                return Err(io::Error::new(
                    io::ErrorKind::InvalidData,
                    "Invalid distance",
                ));
            }

            // Perform LZ77 copy
            out_pos = copy_match_asm(output, out_pos, distance, length);
            continue 'main;
        }

        // === PRE-COMPUTED LZ77 MATCH ===
        let length = ((entry >> SYMBOL_SHIFT) & 0xFF) as usize + 3;
        let distance = (dist_field >> DIST_SHIFT) as usize;

        if distance == 0 || distance > out_pos {
            return Err(io::Error::new(
                io::ErrorKind::InvalidData,
                "Invalid distance",
            ));
        }

        // Perform LZ77 copy
        out_pos = copy_match_asm(output, out_pos, distance, length);
    }

    Ok(out_pos)
}

/// LZ77 match copy with special handling for common patterns (x86_64 build).
///
/// Copies `length` bytes that start `distance` bytes behind `out_pos` to
/// `out_pos`, letting source and destination overlap so short distances
/// repeat the pattern. Returns `out_pos + length`.
///
/// A `distance` of 0 copies each byte onto itself, i.e. leaves `output`
/// unchanged while still returning `out_pos + length`.
///
/// # Panics
///
/// Panics if `distance > out_pos` (the match would start before the buffer)
/// or if `out_pos + length > output.len()`.
#[cfg(target_arch = "x86_64")]
#[inline(always)]
fn copy_match_asm(output: &mut [u8], out_pos: usize, distance: usize, length: usize) -> usize {
    // Checked up front: without this, a release build would wrap the
    // subtraction below and build an out-of-bounds source pointer.
    assert!(
        distance <= out_pos,
        "match distance out of range: out_pos={} distance={}",
        out_pos,
        distance
    );
    let src_start = out_pos - distance;

    assert!(
        out_pos + length <= output.len(),
        "output buffer overflow: out_pos={} length={} cap={}",
        out_pos,
        length,
        output.len()
    );

    // SAFETY: the asserts above guarantee that `src_start..src_start + length`
    // and `out_pos..out_pos + length` both lie inside `output`.
    unsafe {
        let dst = output.as_mut_ptr().add(out_pos);
        let src = output.as_ptr().add(src_start);

        if distance == 1 {
            // RLE: repeat the previous byte `length` times.
            std::ptr::write_bytes(dst, *src, length);
        } else if distance >= 8 {
            // Source and destination are at least 8 bytes apart, so an 8-byte
            // load never overlaps the 8-byte store of the same step.
            let mut i = 0usize;
            while i + 8 <= length {
                let chunk = (src.add(i) as *const u64).read_unaligned();
                (dst.add(i) as *mut u64).write_unaligned(chunk);
                i += 8;
            }
            // Remainder
            while i < length {
                *dst.add(i) = *src.add(i);
                i += 1;
            }
        } else {
            // Small distance (0 or 2-7): byte-by-byte so freshly written
            // bytes are re-read and the pattern repeats.
            for i in 0..length {
                *dst.add(i) = *src.add(i);
            }
        }
    }

    out_pos + length
}

/// Placeholder used on targets other than x86_64, where the accelerated
/// decode loop is not compiled.
///
/// No decoding takes place: every argument except `out_pos` is ignored and
/// `out_pos` comes back unchanged inside `Ok`.
#[cfg(not(target_arch = "x86_64"))]
#[allow(dead_code)]
unsafe fn decode_huffman_asm_x64(
    _compressed: &[u8],
    _output: &mut [u8],
    out_pos: usize,
    _packed_lut: &crate::decompress::packed_lut::PackedLUT,
    _dist_table: &TwoLevelTable,
) -> io::Result<usize> {
    // Nothing was written, so the output position is reported as-is.
    let unchanged_pos = out_pos;
    Ok(unchanged_pos)
}

/// Stand-in for `copy_match_asm` on targets other than x86_64.
///
/// Forwards all arguments to `copy_match_into` and returns its result.
#[cfg(not(target_arch = "x86_64"))]
#[allow(dead_code)]
#[inline(always)]
fn copy_match_asm(output: &mut [u8], out_pos: usize, distance: usize, length: usize) -> usize {
    copy_match_into(output, out_pos, distance, length)
}

/// Ultra-tight decode loop using direct bit manipulation
///
/// Decodes one Huffman-coded block from `bits` into `output`, starting at
/// `out_pos`, and returns the output position reached at end of block.
///
/// Each symbol is first looked up in `packed_lut` with the low 12 bits of the
/// bit buffer. An entry whose bit count is zero falls back to
/// `lit_len_table` / `dist_table`. Otherwise:
///
/// * bit 31 set (entry negative as `i32`): literal byte;
/// * distance field `0x7FFF`: end of block;
/// * distance field `0x7FFE`: match whose distance is decoded via `dist_table`;
/// * anything else: match with the distance stored in the entry.
///
/// The function runs a fast loop while at least 300 bytes of output room
/// remain, then a slow loop that checks the remaining room before every write.
///
/// # Errors
///
/// * `InvalidData` for an undecodable code, a length symbol index of 29 or
///   more, a distance symbol of 30 or more, or a distance that is zero or
///   reaches before the start of `output`.
/// * `WriteZero` ("Output full") when a literal or match would not fit in
///   `output`.
#[allow(dead_code)]
#[inline(never)]
fn decode_huffman_ultra(
    bits: &mut FastBits,
    output: &mut [u8],
    mut out_pos: usize,
    packed_lut: &crate::decompress::packed_lut::PackedLUT,
    lit_len_table: &TwoLevelTable,
    dist_table: &TwoLevelTable,
) -> io::Result<usize> {
    use crate::decompress::inflate_tables::{
        DIST_EXTRA_BITS, DIST_START, LEN_EXTRA_BITS, LEN_START,
    };

    // Entry format constants (inlined for speed)
    const BITS_MASK: u32 = 0xFF;
    const SYMBOL_SHIFT: u32 = 23;
    const DIST_SHIFT: u32 = 8;
    const DIST_MASK: u32 = 0x7FFF << DIST_SHIFT;
    const DIST_EOB: u32 = 0x7FFF << DIST_SHIFT;
    const DIST_SLOW: u32 = 0x7FFE << DIST_SHIFT;
    const LUT_MASK: u64 = 0xFFF;

    let out_end = output.len();
    let fastloop_end = out_end.saturating_sub(300);
    let table = &packed_lut.table;

    // === FASTLOOP ===
    // `out_pos < out_end - 300` on entry to each iteration, so one match
    // (at most 258 bytes) always fits without a per-write room check.
    while out_pos < fastloop_end {
        bits.ensure(56);

        // Direct table lookup (no method call)
        let entry = table[(bits.buffer() & LUT_MASK) as usize].0;
        let entry_bits = entry & BITS_MASK;

        // Entry not resolvable by the packed table: use the two-level tables.
        if entry_bits == 0 {
            let (symbol, code_len) = lit_len_table.decode(bits.buffer());
            if code_len == 0 {
                return Err(io::Error::new(io::ErrorKind::InvalidData, "Invalid code"));
            }
            bits.consume(code_len);

            if symbol < 256 {
                output[out_pos] = symbol as u8;
                out_pos += 1;
                continue;
            }
            if symbol == 256 {
                return Ok(out_pos);
            }

            // Length code; DEFLATE defines only 29 of them (symbols 257..=285).
            let len_idx = (symbol - 257) as usize;
            if len_idx >= 29 {
                return Err(io::Error::new(
                    io::ErrorKind::InvalidData,
                    "Invalid length code",
                ));
            }
            bits.ensure(16);
            let length =
                LEN_START[len_idx] as usize + bits.read(LEN_EXTRA_BITS[len_idx] as u32) as usize;

            // Distance code; DEFLATE defines only 30 of them.
            let (dist_sym, dist_len) = dist_table.decode(bits.buffer());
            if dist_len == 0 || dist_sym >= 30 {
                return Err(io::Error::new(io::ErrorKind::InvalidData, "Invalid dist"));
            }
            bits.consume(dist_len);
            bits.ensure(16);
            let distance = DIST_START[dist_sym as usize] as usize
                + bits.read(DIST_EXTRA_BITS[dist_sym as usize] as u32) as usize;

            if distance == 0 || distance > out_pos {
                return Err(io::Error::new(io::ErrorKind::InvalidData, "Bad dist"));
            }
            out_pos = copy_match_into(output, out_pos, distance, length);
            continue;
        }

        bits.consume(entry_bits);

        // LITERAL TEST: cast to i32, negative (bit 31 set) means literal.
        if (entry as i32) < 0 {
            output[out_pos] = ((entry >> SYMBOL_SHIFT) & 0xFF) as u8;
            out_pos += 1;

            // === TIGHT LITERAL INNER LOOP ===
            // Keep decoding literals without jumping back to the outer loop.
            // No refill happens here, so the loop ends once fewer than 12
            // bits remain.
            loop {
                if bits.bits_available() < 12 {
                    break;
                }

                let e = table[(bits.buffer() & LUT_MASK) as usize].0;
                // Not literal or invalid -> break
                if (e as i32) >= 0 || (e & BITS_MASK) == 0 {
                    break;
                }

                bits.consume(e & BITS_MASK);
                output[out_pos] = ((e >> SYMBOL_SHIFT) & 0xFF) as u8;
                out_pos += 1;
            }
            continue;
        }

        // Non-literal path
        let dist_field = entry & DIST_MASK;

        // EOB check
        if dist_field == DIST_EOB {
            return Ok(out_pos);
        }

        // Slow path (length in the entry, distance decoded separately)
        if dist_field == DIST_SLOW {
            let length = ((entry >> SYMBOL_SHIFT) & 0xFF) as usize + 3;

            let (dist_sym, dist_len) = dist_table.decode(bits.buffer());
            if dist_len == 0 || dist_sym >= 30 {
                return Err(io::Error::new(io::ErrorKind::InvalidData, "Invalid dist"));
            }
            bits.consume(dist_len);
            bits.ensure(16);
            let distance = DIST_START[dist_sym as usize] as usize
                + bits.read(DIST_EXTRA_BITS[dist_sym as usize] as u32) as usize;

            if distance == 0 || distance > out_pos {
                return Err(io::Error::new(io::ErrorKind::InvalidData, "Bad dist"));
            }
            out_pos = copy_match_into(output, out_pos, distance, length);
            continue;
        }

        // Pre-computed LZ77 match (distance in entry)
        let length = ((entry >> SYMBOL_SHIFT) & 0xFF) as usize + 3;
        let distance = (dist_field >> DIST_SHIFT) as usize;

        if distance == 0 || distance > out_pos {
            return Err(io::Error::new(io::ErrorKind::InvalidData, "Bad dist"));
        }
        out_pos = copy_match_into(output, out_pos, distance, length);
    }

    // === SLOWLOOP: With bounds checks ===
    // Same decode steps as above, but every write first verifies that it fits
    // so that an over-long stream yields an error instead of a panic.
    loop {
        bits.ensure(32);

        let entry = table[(bits.buffer() & LUT_MASK) as usize].0;
        let entry_bits = entry & BITS_MASK;

        if entry_bits == 0 {
            let (symbol, code_len) = lit_len_table.decode(bits.buffer());
            if code_len == 0 {
                return Err(io::Error::new(io::ErrorKind::InvalidData, "Invalid code"));
            }
            bits.consume(code_len);

            if symbol < 256 {
                if out_pos >= out_end {
                    return Err(io::Error::new(io::ErrorKind::WriteZero, "Output full"));
                }
                output[out_pos] = symbol as u8;
                out_pos += 1;
                continue;
            }
            if symbol == 256 {
                return Ok(out_pos);
            }

            let len_idx = (symbol - 257) as usize;
            if len_idx >= 29 {
                return Err(io::Error::new(
                    io::ErrorKind::InvalidData,
                    "Invalid length code",
                ));
            }
            bits.ensure(16);
            let length =
                LEN_START[len_idx] as usize + bits.read(LEN_EXTRA_BITS[len_idx] as u32) as usize;

            let (dist_sym, dist_len) = dist_table.decode(bits.buffer());
            if dist_len == 0 || dist_sym >= 30 {
                return Err(io::Error::new(io::ErrorKind::InvalidData, "Invalid dist"));
            }
            bits.consume(dist_len);
            bits.ensure(16);
            let distance = DIST_START[dist_sym as usize] as usize
                + bits.read(DIST_EXTRA_BITS[dist_sym as usize] as u32) as usize;

            if distance == 0 || distance > out_pos {
                return Err(io::Error::new(io::ErrorKind::InvalidData, "Bad dist"));
            }
            if out_pos + length > out_end {
                return Err(io::Error::new(io::ErrorKind::WriteZero, "Output full"));
            }
            out_pos = copy_match_into(output, out_pos, distance, length);
            continue;
        }

        bits.consume(entry_bits);

        if (entry as i32) < 0 {
            if out_pos >= out_end {
                return Err(io::Error::new(io::ErrorKind::WriteZero, "Output full"));
            }
            output[out_pos] = ((entry >> SYMBOL_SHIFT) & 0xFF) as u8;
            out_pos += 1;
            continue;
        }

        let dist_field = entry & DIST_MASK;
        if dist_field == DIST_EOB {
            return Ok(out_pos);
        }

        if dist_field == DIST_SLOW {
            let length = ((entry >> SYMBOL_SHIFT) & 0xFF) as usize + 3;
            let (dist_sym, dist_len) = dist_table.decode(bits.buffer());
            if dist_len == 0 || dist_sym >= 30 {
                return Err(io::Error::new(io::ErrorKind::InvalidData, "Invalid dist"));
            }
            bits.consume(dist_len);
            bits.ensure(16);
            let distance = DIST_START[dist_sym as usize] as usize
                + bits.read(DIST_EXTRA_BITS[dist_sym as usize] as u32) as usize;
            if distance == 0 || distance > out_pos {
                return Err(io::Error::new(io::ErrorKind::InvalidData, "Bad dist"));
            }
            if out_pos + length > out_end {
                return Err(io::Error::new(io::ErrorKind::WriteZero, "Output full"));
            }
            out_pos = copy_match_into(output, out_pos, distance, length);
            continue;
        }

        let length = ((entry >> SYMBOL_SHIFT) & 0xFF) as usize + 3;
        let distance = (dist_field >> DIST_SHIFT) as usize;
        if distance == 0 || distance > out_pos {
            return Err(io::Error::new(io::ErrorKind::InvalidData, "Bad dist"));
        }
        if out_pos + length > out_end {
            return Err(io::Error::new(io::ErrorKind::WriteZero, "Output full"));
        }
        out_pos = copy_match_into(output, out_pos, distance, length);
    }
}

/// Decode loop driven by `PackedLUT` entries.
///
/// Decodes one Huffman-coded block from `bits` into `output`, starting at
/// `out_pos`, and returns the output position reached at end of block.
///
/// Each symbol is looked up in `packed_lut`. An entry reporting zero bits
/// falls back to `lit_len_table` / `dist_table`. Otherwise the entry is
/// classified through its accessor methods as a literal, end of block, a
/// match whose distance is decoded via `dist_table`, or a match carrying its
/// own distance.
///
/// The function runs a fast loop while at least `FASTLOOP_MARGIN` bytes of
/// output room remain, then a slow loop that checks the remaining room
/// before every write.
///
/// # Errors
///
/// * `InvalidData` for an undecodable code, a length symbol index of 29 or
///   more, a distance symbol of 30 or more, or a distance that is zero or
///   reaches before the start of `output`.
/// * `WriteZero` ("Output buffer full") when a literal or match would not
///   fit in `output`.
#[allow(dead_code)]
fn decode_huffman_packed(
    bits: &mut FastBits,
    output: &mut [u8],
    mut out_pos: usize,
    packed_lut: &crate::decompress::packed_lut::PackedLUT,
    lit_len_table: &TwoLevelTable,
    dist_table: &TwoLevelTable,
) -> io::Result<usize> {
    use crate::decompress::inflate_tables::{DIST_EXTRA_BITS, DIST_START};

    // Fastloop margin: max bytes written per iteration
    // 258 (max match) + literals decoded by the inner loop + safety margin
    const FASTLOOP_MARGIN: usize = 300;

    let out_end = output.len();
    let fastloop_end = out_end.saturating_sub(FASTLOOP_MARGIN);

    // === FASTLOOP: no explicit room checks ===
    while out_pos < fastloop_end {
        bits.ensure(56); // Enough for multiple symbols

        let entry = packed_lut.decode(bits.buffer());

        // Invalid entry - fallback to TwoLevelTable
        if entry.bits() == 0 {
            let (symbol, code_len) = lit_len_table.decode(bits.buffer());
            if code_len == 0 {
                return Err(io::Error::new(
                    io::ErrorKind::InvalidData,
                    "Invalid Huffman code",
                ));
            }
            bits.consume(code_len);

            if symbol < 256 {
                output[out_pos] = symbol as u8;
                out_pos += 1;
                continue;
            }
            if symbol == 256 {
                return Ok(out_pos);
            }

            // Length code - decode via slow path
            let len_idx = (symbol - 257) as usize;
            if len_idx >= 29 {
                return Err(io::Error::new(
                    io::ErrorKind::InvalidData,
                    "Invalid length code",
                ));
            }

            use crate::decompress::inflate_tables::{LEN_EXTRA_BITS, LEN_START};
            bits.ensure(16);
            let length =
                LEN_START[len_idx] as usize + bits.read(LEN_EXTRA_BITS[len_idx] as u32) as usize;

            let (dist_sym, dist_len) = dist_table.decode(bits.buffer());
            if dist_len == 0 || dist_sym >= 30 {
                return Err(io::Error::new(
                    io::ErrorKind::InvalidData,
                    "Invalid distance code",
                ));
            }
            bits.consume(dist_len);

            bits.ensure(16);
            let distance = DIST_START[dist_sym as usize] as usize
                + bits.read(DIST_EXTRA_BITS[dist_sym as usize] as u32) as usize;

            if distance > out_pos || distance == 0 {
                return Err(io::Error::new(
                    io::ErrorKind::InvalidData,
                    "Invalid distance",
                ));
            }

            out_pos = copy_match_into(output, out_pos, distance, length);
            continue;
        }

        // Consume bits for this entry
        bits.consume(entry.bits());

        // Literal entry
        if entry.is_literal() {
            output[out_pos] = entry.symbol();
            out_pos += 1;

            // === TIGHT LITERAL LOOP ===
            // Continue decoding literals without going back to outer loop.
            // No refill happens here, so the loop ends once fewer than 12
            // bits remain.
            while bits.bits_available() >= 12 {
                let e = packed_lut.decode(bits.buffer());
                if !e.is_literal() || e.bits() == 0 {
                    break;
                }
                bits.consume(e.bits());
                output[out_pos] = e.symbol();
                out_pos += 1;
            }
            continue;
        }

        // Check for EOB
        if entry.is_eob() {
            return Ok(out_pos);
        }

        // Check for slow path (length code, distance decoded separately)
        if entry.is_slow_path() {
            let length = entry.length();

            let (dist_sym, dist_len) = dist_table.decode(bits.buffer());
            if dist_len == 0 || dist_sym >= 30 {
                return Err(io::Error::new(
                    io::ErrorKind::InvalidData,
                    "Invalid distance code",
                ));
            }
            bits.consume(dist_len);

            bits.ensure(16);
            let distance = DIST_START[dist_sym as usize] as usize
                + bits.read(DIST_EXTRA_BITS[dist_sym as usize] as u32) as usize;

            if distance > out_pos || distance == 0 {
                return Err(io::Error::new(
                    io::ErrorKind::InvalidData,
                    "Invalid distance",
                ));
            }

            out_pos = copy_match_into(output, out_pos, distance, length);
            continue;
        }

        // LZ77 match with pre-computed distance (not used in current build)
        let length = entry.length();
        let distance = entry.distance();

        if distance > out_pos || distance == 0 {
            return Err(io::Error::new(
                io::ErrorKind::InvalidData,
                "Invalid distance",
            ));
        }

        out_pos = copy_match_into(output, out_pos, distance, length);
    }

    // === SLOWLOOP: Safe bounds checking ===
    // Same decode steps as above, but every write first verifies that it fits
    // so that an over-long stream yields an error instead of a panic.
    loop {
        bits.ensure(32);

        let entry = packed_lut.decode(bits.buffer());

        if entry.bits() == 0 {
            // Fallback to TwoLevelTable
            let (symbol, code_len) = lit_len_table.decode(bits.buffer());
            if code_len == 0 {
                return Err(io::Error::new(
                    io::ErrorKind::InvalidData,
                    "Invalid Huffman code",
                ));
            }
            bits.consume(code_len);

            if symbol < 256 {
                if out_pos >= out_end {
                    return Err(io::Error::new(
                        io::ErrorKind::WriteZero,
                        "Output buffer full",
                    ));
                }
                output[out_pos] = symbol as u8;
                out_pos += 1;
                continue;
            }
            if symbol == 256 {
                return Ok(out_pos);
            }

            let len_idx = (symbol - 257) as usize;
            if len_idx >= 29 {
                return Err(io::Error::new(
                    io::ErrorKind::InvalidData,
                    "Invalid length code",
                ));
            }

            use crate::decompress::inflate_tables::{LEN_EXTRA_BITS, LEN_START};
            bits.ensure(16);
            let length =
                LEN_START[len_idx] as usize + bits.read(LEN_EXTRA_BITS[len_idx] as u32) as usize;

            let (dist_sym, dist_len) = dist_table.decode(bits.buffer());
            if dist_len == 0 || dist_sym >= 30 {
                return Err(io::Error::new(
                    io::ErrorKind::InvalidData,
                    "Invalid distance code",
                ));
            }
            bits.consume(dist_len);

            bits.ensure(16);
            let distance = DIST_START[dist_sym as usize] as usize
                + bits.read(DIST_EXTRA_BITS[dist_sym as usize] as u32) as usize;

            if distance > out_pos || distance == 0 {
                return Err(io::Error::new(
                    io::ErrorKind::InvalidData,
                    "Invalid distance",
                ));
            }
            if out_pos + length > out_end {
                return Err(io::Error::new(
                    io::ErrorKind::WriteZero,
                    "Output buffer full",
                ));
            }

            out_pos = copy_match_into(output, out_pos, distance, length);
            continue;
        }

        bits.consume(entry.bits());

        if entry.is_literal() {
            if out_pos >= out_end {
                return Err(io::Error::new(
                    io::ErrorKind::WriteZero,
                    "Output buffer full",
                ));
            }
            output[out_pos] = entry.symbol();
            out_pos += 1;
            continue;
        }

        if entry.is_eob() {
            return Ok(out_pos);
        }

        if entry.is_slow_path() {
            let length = entry.length();

            let (dist_sym, dist_len) = dist_table.decode(bits.buffer());
            if dist_len == 0 || dist_sym >= 30 {
                return Err(io::Error::new(
                    io::ErrorKind::InvalidData,
                    "Invalid distance code",
                ));
            }
            bits.consume(dist_len);

            bits.ensure(16);
            let distance = DIST_START[dist_sym as usize] as usize
                + bits.read(DIST_EXTRA_BITS[dist_sym as usize] as u32) as usize;

            if distance > out_pos || distance == 0 {
                return Err(io::Error::new(
                    io::ErrorKind::InvalidData,
                    "Invalid distance",
                ));
            }
            if out_pos + length > out_end {
                return Err(io::Error::new(
                    io::ErrorKind::WriteZero,
                    "Output buffer full",
                ));
            }

            out_pos = copy_match_into(output, out_pos, distance, length);
            continue;
        }

        // LZ77 match
        let length = entry.length();
        let distance = entry.distance();

        if distance > out_pos || distance == 0 {
            return Err(io::Error::new(
                io::ErrorKind::InvalidData,
                "Invalid distance",
            ));
        }
        if out_pos + length > out_end {
            return Err(io::Error::new(
                io::ErrorKind::WriteZero,
                "Output buffer full",
            ));
        }

        out_pos = copy_match_into(output, out_pos, distance, length);
    }
}

/// Copies `length` bytes from `src` to `dst` using 64-byte AVX-512 moves.
///
/// Whole 64-byte groups go through unaligned 512-bit load/store pairs; the
/// bytes left over (fewer than 64) are handed to `copy_nonoverlapping`.
/// Only compiled when the build itself enables `avx512f`.
///
/// # Safety
///
/// `src` must be readable and `dst` writable for `length` bytes, and the two
/// regions must not overlap.
#[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
#[inline(always)]
unsafe fn copy_large_avx512(src: *const u8, dst: *mut u8, length: usize) {
    use std::arch::x86_64::*;

    // Number of bytes covered by full 64-byte vectors.
    let vector_bytes = length - length % 64;

    let mut offset = 0usize;
    while offset < vector_bytes {
        let lane = _mm512_loadu_si512(src.add(offset) as *const __m512i);
        _mm512_storeu_si512(dst.add(offset) as *mut __m512i, lane);
        offset += 64;
    }

    // Whatever is left is shorter than one vector.
    let tail = length - vector_bytes;
    if tail != 0 {
        std::ptr::copy_nonoverlapping(src.add(vector_bytes), dst.add(vector_bytes), tail);
    }
}

/// Copy LZ77 match directly into output slice
///
/// Copies `length` bytes that start `distance` bytes behind `out_pos` to
/// `out_pos` and returns `out_pos + length`. Source and destination may
/// overlap; short distances repeat the pattern.
///
/// Strategy by case:
/// 1. distance == 1 (RLE): memset
/// 2. distance >= length: non-overlapping memcpy (64-byte AVX-512 moves for
///    copies of 64+ bytes when the build enables `avx512f`)
/// 3. distance >= 8: 8-byte chunk copy
/// 4. distance 2-7, length >= 16 and at least 7 spare bytes after the match:
///    8-byte stores advanced by `distance`
/// 5. distance 2-7 otherwise: byte-by-byte
///
/// A `distance` of 0 is invalid; it is not reported here, the output is left
/// untouched and `out_pos + length` is still returned.
///
/// # Panics
///
/// Panics if `distance > out_pos` (the match would start before the buffer)
/// or if `out_pos + length > output.len()`.
#[inline(always)]
pub fn copy_match_into(output: &mut [u8], out_pos: usize, distance: usize, length: usize) -> usize {
    // Record match statistics (no-op when tracing disabled)
    trace_match(distance, length);

    // Checked up front: without this, a release build would wrap the
    // subtraction below and build an out-of-bounds source pointer.
    assert!(
        distance <= out_pos,
        "match distance out of range: out_pos={} distance={}",
        out_pos,
        distance
    );
    let src_start = out_pos - distance;

    assert!(
        out_pos + length <= output.len(),
        "output buffer overflow: out_pos={} length={} cap={}",
        out_pos,
        length,
        output.len()
    );

    // Spare bytes between the end of the match and the end of the slice.
    // The word-stride path below stores up to 7 bytes beyond the match, so it
    // may only run when that much room exists.
    let tail_room = output.len() - (out_pos + length);

    // SAFETY: the asserts above guarantee that `src_start..src_start + length`
    // and `out_pos..out_pos + length` lie inside `output`; each branch below
    // states why its accesses stay inside the slice.
    unsafe {
        let base = output.as_mut_ptr();
        let dst = base.add(out_pos);
        let src = base.add(src_start) as *const u8;

        // Prefetch 64 bytes ahead of both pointers. The addresses may lie
        // outside the buffer, so they are formed with `wrapping_add`; a
        // prefetch hint never faults.
        #[cfg(target_arch = "x86_64")]
        if length >= 32 {
            use std::arch::x86_64::*;
            _mm_prefetch(src.wrapping_add(64) as *const i8, _MM_HINT_T0);
            _mm_prefetch(dst.wrapping_add(64) as *const i8, _MM_HINT_T0);
        }

        #[cfg(target_arch = "aarch64")]
        if length >= 32 {
            core::arch::asm!(
                "prfm pldl1keep, [{0}]",
                "prfm pstl1keep, [{1}]",
                in(reg) src.wrapping_add(64),
                in(reg) dst.wrapping_add(64),
                options(nostack, preserves_flags)
            );
        }

        if distance == 1 {
            // RLE (single byte repeat): fill with the previous byte.
            let byte = *src;
            std::ptr::write_bytes(dst, byte, length);
        } else if distance >= length {
            // Source ends at or before the destination starts: plain copy.
            // For large copies on AVX-512 builds, use 64-byte chunks.
            #[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
            {
                if length >= 64 {
                    copy_large_avx512(src, dst, length);
                } else {
                    std::ptr::copy_nonoverlapping(src, dst, length);
                }
            }
            #[cfg(not(all(target_arch = "x86_64", target_feature = "avx512f")))]
            {
                std::ptr::copy_nonoverlapping(src, dst, length);
            }
        } else if distance >= 8 {
            // Overlapping but distance >= 8: an 8-byte load never overlaps the
            // 8-byte store of the same step, and no store exceeds `length`.
            let mut remaining = length;
            let mut d = dst;
            let mut s = src;
            while remaining >= 8 {
                let chunk = (s as *const u64).read_unaligned();
                (d as *mut u64).write_unaligned(chunk);
                d = d.add(8);
                s = s.add(8);
                remaining -= 8;
            }
            // Copy remainder
            for i in 0..remaining {
                *d.add(i) = *s.add(i);
            }
        } else if distance != 0 && length >= 16 && tail_room >= 7 {
            // Small distance (2-7): copy 8 bytes but advance by `distance`,
            // so each overlapping store extends the repeated pattern.
            // The final store starts before `end` and may reach 7 bytes past
            // it; `tail_room >= 7` keeps that inside the slice. `distance`
            // must be non-zero or the pointers would never advance.
            let mut d = dst;
            let mut s = src;
            let end = dst.add(length);

            // Two unconditional stores (both end within the first 16 bytes).
            (d as *mut u64).write_unaligned((s as *const u64).read_unaligned());
            s = s.add(distance);
            d = d.add(distance);
            (d as *mut u64).write_unaligned((s as *const u64).read_unaligned());
            s = s.add(distance);
            d = d.add(distance);

            // Continue until done
            while d < end {
                (d as *mut u64).write_unaligned((s as *const u64).read_unaligned());
                s = s.add(distance);
                d = d.add(distance);
            }
        } else if distance > 0 {
            // Short copy, or no room for overshooting stores: byte-by-byte,
            // cycling through the `distance`-byte pattern.
            for i in 0..length {
                *dst.add(i) = *src.add(i % distance);
            }
        }
        // distance == 0 is invalid, but we don't panic here - just skip
    }

    out_pos + length
}

/// Parallel BGZF decompression returning output as a Vec.
///
/// The output buffer is sized from the sum of the blocks' ISIZE fields and
/// filled in place: worker threads claim block indices from a shared atomic
/// counter and inflate each block straight into its own region of the
/// buffer, which is then returned to the caller without intermediate copies.
///
/// `num_threads` is clamped to `1..=blocks.len()`, so a request for zero
/// threads still decompresses the input instead of returning a zero-filled
/// buffer.
///
/// # Errors
///
/// Propagates any error from `parse_bgzf_blocks`. Returns
/// `io::ErrorKind::InvalidData` when a decompressor cannot be allocated, when
/// a block's recorded byte ranges do not fit the input or the output buffer,
/// when libdeflate rejects a block, or when a block inflates to a size other
/// than its ISIZE.
pub fn decompress_bgzf_parallel_to_vec(data: &[u8], num_threads: usize) -> io::Result<Vec<u8>> {
    /// Base pointer of the output buffer, shared read-only between workers.
    struct OutputBase(*mut u8);
    // SAFETY: workers only use the pointer to write into the region of the
    // block they claimed, and each region is bounds-checked against the
    // buffer length first. Regions are assumed not to overlap (offsets come
    // from `parse_bgzf_blocks`), and each index is claimed exactly once.
    unsafe impl Sync for OutputBase {}
    impl OutputBase {
        fn ptr(&self) -> *mut u8 {
            self.0
        }
    }

    let blocks = parse_bgzf_blocks(data)?;

    if blocks.is_empty() {
        return Ok(Vec::new());
    }

    let total_output: usize = blocks.iter().map(|b| b.isize as usize).sum();
    let mut output = vec![0u8; total_output];
    // Take the raw pointer once, up front, so that no two threads ever hold a
    // `&mut Vec` to the same buffer at the same time.
    let output_base = OutputBase(output.as_mut_ptr());

    let num_blocks = blocks.len();
    let next_block = AtomicUsize::new(0);
    let had_error = std::sync::atomic::AtomicBool::new(false);

    // `blocks` is non-empty here, so the clamp bounds are always valid.
    let workers = num_threads.clamp(1, num_blocks);

    std::thread::scope(|scope| {
        for _ in 0..workers {
            let blocks_ref = &blocks;
            let next_ref = &next_block;
            let output_ref = &output_base;
            let error_ref = &had_error;

            scope.spawn(move || {
                // SAFETY: plain FFI allocation; the result is null-checked below.
                let decompressor = unsafe { libdeflate_sys::libdeflate_alloc_decompressor() };
                if decompressor.is_null() {
                    error_ref.store(true, Ordering::Relaxed);
                    return;
                }

                // Stop claiming work as soon as any worker has failed: the
                // whole call is going to return an error anyway.
                while !error_ref.load(Ordering::Relaxed) {
                    let idx = next_ref.fetch_add(1, Ordering::Relaxed);
                    if idx >= num_blocks {
                        break;
                    }

                    let block = &blocks_ref[idx];
                    let out_size = block.isize as usize;
                    if out_size == 0 {
                        continue;
                    }

                    // Raw deflate: skip gzip header, stop before 8-byte trailer.
                    // Checked arithmetic + `get` turn a bad range into an error
                    // instead of a panic inside the worker.
                    let deflate_data = block
                        .start
                        .checked_add(block.length)
                        .and_then(|end| end.checked_sub(8))
                        .and_then(|end| data.get(block.deflate_start..end));
                    let out_start = block.output_offset;
                    let fits_output = out_start
                        .checked_add(out_size)
                        .is_some_and(|end| end <= total_output);
                    let deflate_data = match deflate_data {
                        Some(bytes) if fits_output => bytes,
                        _ => {
                            error_ref.store(true, Ordering::Relaxed);
                            break;
                        }
                    };

                    let mut actual_out = 0usize;
                    // SAFETY: `decompressor` is non-null and owned by this
                    // thread; the input pointer/length come from a valid slice;
                    // `out_start + out_size <= total_output` was checked above,
                    // so the output region lies inside the buffer, and no
                    // other worker writes to this block's region.
                    let ret = unsafe {
                        libdeflate_sys::libdeflate_deflate_decompress(
                            decompressor,
                            deflate_data.as_ptr() as *const std::ffi::c_void,
                            deflate_data.len(),
                            output_ref.ptr().add(out_start) as *mut std::ffi::c_void,
                            out_size,
                            &mut actual_out,
                        )
                    };
                    if ret != 0 || actual_out != out_size {
                        error_ref.store(true, Ordering::Relaxed);
                        break;
                    }
                }

                // SAFETY: allocated above, non-null, and not used afterwards.
                unsafe { libdeflate_sys::libdeflate_free_decompressor(decompressor) };
            });
        }
    });

    // `thread::scope` joins every worker, so all writes are visible here.
    if had_error.load(std::sync::atomic::Ordering::Relaxed) {
        return Err(io::Error::new(
            io::ErrorKind::InvalidData,
            "CRC32 or size mismatch in BGZF block",
        ));
    }

    Ok(output)
}

/// Decompress BGZF `data` into `writer`, returning the number of bytes written.
///
/// Dispatches on the requested thread count:
///
/// - `num_threads <= 1`: streaming path (`decompress_bgzf_streaming`), which
///   inflates one block at a time into a reusable buffer and writes it
///   immediately.
/// - otherwise: pipelined path (`decompress_bgzf_pipelined`), where decoder
///   threads claim blocks through an atomic counter and send each
///   decompressed block over a bounded channel to the calling thread, which
///   writes the blocks in order.
///
/// Neither path allocates the full output up front (~211MB for silesia),
/// which previously caused ~53K page faults and only 2x scaling with 4 threads.
pub fn decompress_bgzf_parallel<W: Write>(
    data: &[u8],
    writer: &mut W,
    num_threads: usize,
) -> io::Result<u64> {
    match num_threads {
        0 | 1 => decompress_bgzf_streaming(data, writer),
        _ => decompress_bgzf_pipelined(data, writer, num_threads),
    }
}

/// Pipelined parallel BGZF: decoder threads + ordered writer.
///
/// Decoder threads claim block indices from an atomic counter, inflate each
/// block into a freshly allocated buffer sized to the block's ISIZE, and send
/// `(block_index, buffer)` through a bounded channel. The calling thread
/// receives the blocks, parks out-of-order arrivals in a `BTreeMap`, and
/// writes them to `writer` strictly in block order, overlapping I/O with
/// decompression.
///
/// Returns the number of bytes successfully written.
///
/// # Errors
///
/// - Propagates any error from `parse_bgzf_blocks`.
/// - `io::ErrorKind::InvalidData` if a decompressor cannot be allocated, a
///   block's byte range does not fit the input, libdeflate rejects a block,
///   or a block inflates to a size other than its ISIZE. A block that failed
///   to decode is never written.
/// - The first error reported by `writer.write_all`, if any. Nothing is
///   written after a failed write.
fn decompress_bgzf_pipelined<W: Write>(
    data: &[u8],
    writer: &mut W,
    num_threads: usize,
) -> io::Result<u64> {
    let blocks = parse_bgzf_blocks(data)?;
    if blocks.is_empty() {
        return Ok(0);
    }

    let num_blocks = blocks.len();
    // At least one decoder, never more decoders than blocks. `blocks` is
    // non-empty here, so the clamp bounds are valid.
    let workers = num_threads.clamp(1, num_blocks);

    // Completed blocks channel: (block_index, decompressed_data)
    // Bounded so decoders don't race too far ahead of the writer.
    let channel_cap = workers * 2 + 2;
    let (done_tx, done_rx) = std::sync::mpsc::sync_channel::<(usize, Vec<u8>)>(channel_cap);

    let next_block = AtomicUsize::new(0);
    let had_error = std::sync::atomic::AtomicBool::new(false);
    // Set by the writer so decoders stop producing output nobody will write.
    let write_failed = std::sync::atomic::AtomicBool::new(false);

    let (total, write_error) = std::thread::scope(|scope| {
        for _ in 0..workers {
            let done_tx = done_tx.clone();
            let blocks_ref = &blocks;
            let next_ref = &next_block;
            let error_ref = &had_error;
            let write_failed_ref = &write_failed;

            scope.spawn(move || {
                // SAFETY: plain FFI allocation; the result is null-checked below.
                let decompressor = unsafe { libdeflate_sys::libdeflate_alloc_decompressor() };
                if decompressor.is_null() {
                    error_ref.store(true, Ordering::Relaxed);
                    return;
                }

                // Quit early once the call is known to fail, either because a
                // block failed to decode or because the writer hit an error.
                while !error_ref.load(Ordering::Relaxed)
                    && !write_failed_ref.load(Ordering::Relaxed)
                {
                    let idx = next_ref.fetch_add(1, Ordering::Relaxed);
                    if idx >= num_blocks {
                        break;
                    }

                    let block = &blocks_ref[idx];
                    let out_size = block.isize as usize;
                    if out_size == 0 {
                        // Still report the index so the writer can advance.
                        if done_tx.send((idx, Vec::new())).is_err() {
                            break;
                        }
                        continue;
                    }

                    // Raw deflate payload: between the gzip header and the
                    // 8-byte trailer. A bad range becomes an error, not a panic.
                    let deflate_data = block
                        .start
                        .checked_add(block.length)
                        .and_then(|end| end.checked_sub(8))
                        .and_then(|end| data.get(block.deflate_start..end));
                    let Some(deflate_data) = deflate_data else {
                        error_ref.store(true, Ordering::Relaxed);
                        break;
                    };

                    // The buffer's ownership moves to the writer, so each
                    // block gets its own allocation.
                    let mut buf = vec![0u8; out_size];
                    let mut actual_out = 0usize;
                    // SAFETY: `decompressor` is non-null and owned by this
                    // thread; both pointer/length pairs describe live,
                    // correctly sized buffers (`buf.len() == out_size`).
                    let ret = unsafe {
                        libdeflate_sys::libdeflate_deflate_decompress(
                            decompressor,
                            deflate_data.as_ptr() as *const std::ffi::c_void,
                            deflate_data.len(),
                            buf.as_mut_ptr() as *mut std::ffi::c_void,
                            out_size,
                            &mut actual_out,
                        )
                    };

                    if ret != 0 || actual_out != out_size {
                        // Do not forward a partially decoded block.
                        error_ref.store(true, Ordering::Relaxed);
                        break;
                    }

                    if done_tx.send((idx, buf)).is_err() {
                        break;
                    }
                }

                // SAFETY: allocated above, non-null, and not used afterwards.
                unsafe { libdeflate_sys::libdeflate_free_decompressor(decompressor) };
            });
        }
        drop(done_tx); // close channel when all decoders finish

        // Writer: receive completed blocks, write in order.
        // Blocks may arrive out of order; hold them in a BTreeMap until
        // the next sequential block is available, then flush.
        let mut next_to_write = 0usize;
        let mut pending = std::collections::BTreeMap::<usize, Vec<u8>>::new();
        let mut write_error: Option<io::Error> = None;
        let mut total = 0u64;

        // Keep draining even after a write failure: a decoder blocked on a
        // full channel can only make progress if we keep receiving.
        for (idx, data_vec) in &done_rx {
            pending.insert(idx, data_vec);

            while let Some(block_data) = pending.remove(&next_to_write) {
                next_to_write += 1;
                if write_error.is_some() || block_data.is_empty() {
                    continue;
                }
                match writer.write_all(&block_data) {
                    // Only count bytes the writer actually accepted.
                    Ok(()) => total += block_data.len() as u64,
                    Err(e) => {
                        write_failed.store(true, Ordering::Relaxed);
                        write_error = Some(e);
                    }
                }
            }
        }

        (total, write_error)
    });

    if had_error.load(Ordering::Relaxed) {
        return Err(io::Error::new(
            io::ErrorKind::InvalidData,
            "CRC32 or size mismatch in BGZF block",
        ));
    }

    // Previously the write error was recorded and then silently dropped.
    if let Some(e) = write_error {
        return Err(e);
    }

    Ok(total)
}

/// Streaming BGZF decompression: decompress one block at a time into a
/// reusable buffer, write immediately. No full-output-size allocation.
///
/// Uses raw deflate decompress with a reused decompressor (skipping both
/// gzip header re-parsing and decompressor alloc/free per block).
///
/// Returns the number of bytes written to `writer`.
///
/// # Errors
///
/// - Propagates any error from `parse_bgzf_blocks` and from `writer`.
/// - `io::ErrorKind::InvalidData` if a block's byte range does not fit the
///   input, if libdeflate rejects a block, or if a block inflates to a size
///   other than its ISIZE (the same acceptance rule the parallel paths use).
/// - An `io::Error::other` if the decompressor cannot be allocated.
fn decompress_bgzf_streaming<W: Write>(data: &[u8], writer: &mut W) -> io::Result<u64> {
    let blocks = parse_bgzf_blocks(data)?;
    if blocks.is_empty() {
        return Ok(0);
    }

    let max_block_output = blocks.iter().map(|b| b.isize as usize).max().unwrap_or(0);
    let mut buf = vec![0u8; max_block_output];
    let mut total = 0u64;

    // SAFETY: plain FFI allocation; the result is null-checked below.
    let decompressor = unsafe { libdeflate_sys::libdeflate_alloc_decompressor() };
    if decompressor.is_null() {
        return Err(io::Error::other(
            "failed to allocate libdeflate decompressor",
        ));
    }

    // The loop runs inside a closure so that every early `return`/`?` still
    // falls through to the single `free` call below.
    let result = (|| -> io::Result<u64> {
        for block in &blocks {
            let out_size = block.isize as usize;
            if out_size == 0 {
                continue;
            }

            if out_size > buf.len() {
                buf.resize(out_size, 0);
            }

            // Raw deflate data: between header and 8-byte trailer (CRC32 + ISIZE).
            // Checked arithmetic + `get` report a bad range as an error rather
            // than panicking (which would also skip the `free` below).
            let deflate_data = block
                .start
                .checked_add(block.length)
                .and_then(|end| end.checked_sub(8))
                .and_then(|end| data.get(block.deflate_start..end))
                .ok_or_else(|| {
                    io::Error::new(
                        io::ErrorKind::InvalidData,
                        "BGZF block range is out of bounds",
                    )
                })?;

            let mut actual_out = 0usize;
            // SAFETY: `decompressor` is non-null; the input pointer/length come
            // from a valid slice; `buf.len() >= out_size` is ensured above.
            let ret = unsafe {
                libdeflate_sys::libdeflate_deflate_decompress(
                    decompressor,
                    deflate_data.as_ptr() as *const std::ffi::c_void,
                    deflate_data.len(),
                    buf.as_mut_ptr() as *mut std::ffi::c_void,
                    out_size,
                    &mut actual_out,
                )
            };

            if ret != libdeflate_sys::libdeflate_result_LIBDEFLATE_SUCCESS {
                return Err(io::Error::new(
                    io::ErrorKind::InvalidData,
                    "deflate decompression failed in BGZF block",
                ));
            }

            // libdeflate reports success for a stream that ends early when it
            // is given `actual_out`; a short block contradicts its ISIZE.
            if actual_out != out_size {
                return Err(io::Error::new(
                    io::ErrorKind::InvalidData,
                    "size mismatch in BGZF block",
                ));
            }

            writer.write_all(&buf[..actual_out])?;
            total += actual_out as u64;
        }
        Ok(total)
    })();

    // SAFETY: allocated above, non-null, and not used afterwards.
    unsafe { libdeflate_sys::libdeflate_free_decompressor(decompressor) };
    result
}

// ============================================================================
// Multi-Member Parallel Decompression (for pigz-style files)
// ============================================================================

/// Parse a gzip header starting at `data[offset..]`, returning the byte offset
/// of the raw deflate data (past the header).
///
/// `offset..end` is the window of `data` occupied by one gzip member. The
/// returned offset is absolute (an index into `data`), not relative to
/// `offset`. The magic bytes and compression method are not validated here.
///
/// Returns `None` if:
/// - the window is invalid (`offset > end` or `end > data.len()`),
/// - the window is shorter than the 10-byte fixed header,
/// - an optional field (FEXTRA, FNAME, FCOMMENT, FHCRC) is truncated or a
///   NUL-terminated field has no terminator inside the window,
/// - the header consumes the whole window, leaving no deflate data.
fn parse_gzip_header(data: &[u8], offset: usize, end: usize) -> Option<usize> {
    // FLG bits from the gzip header.
    const FHCRC: u8 = 0x02;
    const FEXTRA: u8 = 0x04;
    const FNAME: u8 = 0x08;
    const FCOMMENT: u8 = 0x10;
    const FIXED_HEADER_LEN: usize = 10;

    // `get` rejects an inverted window and one running past the buffer, so
    // neither the subtraction nor any index below can underflow or panic.
    let member = data.get(offset..end)?;
    if member.len() < FIXED_HEADER_LEN {
        return None;
    }

    let flg = member[3];
    // Position relative to the start of the member.
    let mut pos = FIXED_HEADER_LEN;

    if flg & FEXTRA != 0 {
        // 2-byte little-endian XLEN followed by XLEN bytes of extra data.
        let xlen_bytes = member.get(pos..pos + 2)?;
        let xlen = usize::from(u16::from_le_bytes([xlen_bytes[0], xlen_bytes[1]]));
        pos += 2 + xlen;
    }

    // FNAME then FCOMMENT: each is a NUL-terminated string. A missing
    // terminator (or a `pos` already past the window) yields `None`.
    for flag in [FNAME, FCOMMENT] {
        if flg & flag != 0 {
            let nul = member.get(pos..)?.iter().position(|&b| b == 0)?;
            pos += nul + 1;
        }
    }

    if flg & FHCRC != 0 {
        pos += 2;
    }

    // At least one byte of deflate data must remain inside the window.
    (pos < member.len()).then_some(offset + pos)
}

/// Locate member boundaries in a multi-member gzip file (pigz-style) with a
/// single linear scan and no decompression.
///
/// A position is accepted as the start of a new member when it holds the
/// gzip magic + deflate method (0x1f 0x8b 0x08), its FLG byte has no reserved
/// bits set, and the four bytes before it decode to a plausible ISIZE
/// (non-zero and at most 1 GiB). Each member's own ISIZE is then read from the
/// last four bytes of its span and used to assign cumulative output offsets.
/// This replaces the old `scan_member_boundaries_exact`, which fully
/// decompressed every member (doing 2x total work).
///
/// Returns `None` when the input does not start with a gzip header, fewer
/// than two members are found, any member is shorter than 18 bytes or has an
/// unparseable header, or the summed ISIZE exceeds 100x the input size.
fn scan_member_boundaries_fast(data: &[u8]) -> Option<Vec<BgzfBlock>> {
    const GZIP_DEFLATE_MAGIC: [u8; 3] = [0x1f, 0x8b, 0x08];
    const MAX_PLAUSIBLE_ISIZE: u32 = 1_073_741_824;

    if data.len() < 18 || !data.starts_with(&GZIP_DEFLATE_MAGIC) {
        return None;
    }

    let first_header_len = crate::decompress::format::parse_gzip_header_size(data).unwrap_or(10);

    // Pass 1: collect candidate member start offsets (the first is always 0).
    let mut member_starts = vec![0usize];
    let mut cursor = first_header_len + 1;
    while cursor + 10 < data.len() {
        let looks_like_header =
            data[cursor..cursor + 3] == GZIP_DEFLATE_MAGIC && data[cursor + 3] & 0xE0 == 0;

        // The ISIZE of the preceding member must look sane (same heuristic as
        // is_likely_multi_member). This filters false positives from
        // stored-block streams, where raw bytes appear verbatim and can
        // accidentally spell out the gzip magic.
        if looks_like_header && cursor >= 4 {
            let preceding_isize = u32::from_le_bytes([
                data[cursor - 4],
                data[cursor - 3],
                data[cursor - 2],
                data[cursor - 1],
            ]);
            if preceding_isize > 0 && preceding_isize <= MAX_PLAUSIBLE_ISIZE {
                member_starts.push(cursor);
            }
        }
        cursor += 1;
    }

    if member_starts.len() < 2 {
        return None;
    }

    // Pass 2: turn consecutive starts into member descriptors. A member ends
    // where the next one begins; the last one ends at the end of the input.
    let member_ends = member_starts
        .iter()
        .skip(1)
        .copied()
        .chain(std::iter::once(data.len()));

    let mut members = Vec::with_capacity(member_starts.len());
    let mut output_offset = 0usize;

    for (&start, end) in member_starts.iter().zip(member_ends) {
        let length = end - start;
        if length < 18 {
            return None;
        }

        // ISIZE is the last little-endian u32 of the member's trailer.
        let isize_val =
            u32::from_le_bytes([data[end - 4], data[end - 3], data[end - 2], data[end - 1]]);

        let deflate_start = parse_gzip_header(data, start, end)?;

        members.push(BgzfBlock {
            start,
            length,
            isize: isize_val,
            output_offset,
            deflate_start,
        });

        output_offset += isize_val as usize;
    }

    // Sanity: total output shouldn't be wildly disproportionate to input
    if output_offset > data.len().saturating_mul(100) {
        return None;
    }

    Some(members)
}

/// Zero-copy parallel decompression for multi-member gzip files.
///
/// Uses the same approach as BGZF parallel: pre-allocate output, write directly
/// to disjoint slices. Member boundaries come from
/// `scan_member_boundaries_fast`, a magic-byte scan that performs no
/// decompression; each member's output size is taken from its ISIZE trailer.
///
/// Properties of this approach:
/// - No intermediate Vec copies (~1GB saved for 503MB output)
/// - No per-chunk buffer allocation
/// - Work-stealing across all members for optimal load balancing
///
/// `num_threads` is clamped to `1..=members.len()`, so a request for zero
/// threads still decompresses the input instead of returning a zero-filled
/// buffer.
///
/// # Errors
///
/// Returns `io::ErrorKind::InvalidData` when the boundary scan fails (not a
/// multi-member file, or suspicious boundaries), when a decompressor cannot
/// be allocated, when a member's header leaves no room for deflate data
/// before the 8-byte trailer, when libdeflate rejects a member, or when a
/// member inflates to a size other than its ISIZE.
pub fn decompress_multi_member_parallel_to_vec(
    data: &[u8],
    num_threads: usize,
) -> io::Result<Vec<u8>> {
    /// Base pointer of the output buffer, shared read-only between workers.
    struct OutputBase(*mut u8);
    // SAFETY: workers only use the pointer to write into the region of the
    // member they claimed. Regions are bounds-checked against the buffer
    // length and do not overlap, because `scan_member_boundaries_fast`
    // assigns cumulative output offsets and each index is claimed once.
    unsafe impl Sync for OutputBase {}
    impl OutputBase {
        fn ptr(&self) -> *mut u8 {
            self.0
        }
    }

    let members = scan_member_boundaries_fast(data).ok_or_else(|| {
        io::Error::new(
            io::ErrorKind::InvalidData,
            "Not a multi-member gzip file or boundary scan failed",
        )
    })?;

    let total_output: usize = members.iter().map(|m| m.isize as usize).sum();
    let mut output = vec![0u8; total_output];
    // Take the raw pointer once, up front, so that no two threads ever hold a
    // `&mut Vec` to the same buffer at the same time.
    let output_base = OutputBase(output.as_mut_ptr());

    let num_members = members.len();
    let next_member = AtomicUsize::new(0);
    let had_error = std::sync::atomic::AtomicBool::new(false);

    // `.max(1)` on the upper bound keeps `clamp` valid even for an empty list.
    let workers = num_threads.clamp(1, num_members.max(1));

    std::thread::scope(|scope| {
        for _ in 0..workers {
            let members_ref = &members;
            let next_ref = &next_member;
            let output_ref = &output_base;
            let error_ref = &had_error;

            scope.spawn(move || {
                // Allocate a raw libdeflate decompressor (same as BGZF path).
                // scan_member_boundaries_fast already parsed each gzip header and
                // stored deflate_start, so we skip re-parsing with gzip_decompress_ex.
                // SAFETY: plain FFI allocation; the result is null-checked below.
                let decompressor = unsafe { libdeflate_sys::libdeflate_alloc_decompressor() };
                if decompressor.is_null() {
                    error_ref.store(true, Ordering::Relaxed);
                    return;
                }

                // Stop claiming work as soon as any worker has failed: the
                // whole call is going to return an error anyway.
                while !error_ref.load(Ordering::Relaxed) {
                    let idx = next_ref.fetch_add(1, Ordering::Relaxed);
                    if idx >= num_members {
                        break;
                    }

                    let member = &members_ref[idx];
                    // deflate_start is absolute offset in data; trailer is 8 bytes (CRC32+ISIZE).
                    // A long header can push deflate_start past `end - 8`;
                    // `get` reports that as an error instead of panicking.
                    let deflate_data = member
                        .start
                        .checked_add(member.length)
                        .and_then(|end| end.checked_sub(8))
                        .and_then(|end| data.get(member.deflate_start..end));
                    let out_start = member.output_offset;
                    let out_size = member.isize as usize;
                    let fits_output = out_start
                        .checked_add(out_size)
                        .is_some_and(|end| end <= total_output);
                    let deflate_data = match deflate_data {
                        Some(bytes) if fits_output => bytes,
                        _ => {
                            error_ref.store(true, Ordering::Relaxed);
                            break;
                        }
                    };

                    let mut actual_out = 0usize;
                    // SAFETY: `decompressor` is non-null and owned by this
                    // thread; the input pointer/length come from a valid slice;
                    // `out_start + out_size <= total_output` was checked above,
                    // so the output region lies inside the buffer, and no
                    // other worker writes to this member's region.
                    let ret = unsafe {
                        libdeflate_sys::libdeflate_deflate_decompress(
                            decompressor,
                            deflate_data.as_ptr() as *const std::ffi::c_void,
                            deflate_data.len(),
                            output_ref.ptr().add(out_start) as *mut std::ffi::c_void,
                            out_size,
                            &mut actual_out,
                        )
                    };
                    if ret != 0 || actual_out != out_size {
                        error_ref.store(true, Ordering::Relaxed);
                        break;
                    }
                }

                // SAFETY: allocated above, non-null, and not used afterwards.
                unsafe { libdeflate_sys::libdeflate_free_decompressor(decompressor) };
            });
        }
    });

    // `thread::scope` joins every worker, so all writes are visible here.
    if had_error.load(std::sync::atomic::Ordering::Relaxed) {
        return Err(io::Error::new(
            io::ErrorKind::InvalidData,
            "Decompression error in multi-member parallel",
        ));
    }

    Ok(output)
}

/// Parallel decompression for multi-member gzip files (pigz-style output).
///
/// Decompresses the whole input with
/// `decompress_multi_member_parallel_to_vec`, writes the resulting buffer to
/// `writer` in one call, and returns the number of bytes written.
///
/// # Errors
///
/// Returns `io::ErrorKind::InvalidData` when the input is shorter than 18
/// bytes or does not start with the gzip magic bytes, and propagates any
/// error from `decompress_multi_member_parallel_to_vec` (including its
/// rejection of inputs that are not multi-member) or from `writer`.
pub fn decompress_multi_member_parallel<W: Write>(
    data: &[u8],
    writer: &mut W,
    num_threads: usize,
) -> io::Result<u64> {
    let looks_like_gzip = data.len() >= 18 && data.starts_with(&[0x1f, 0x8b]);
    if !looks_like_gzip {
        return Err(io::Error::new(
            io::ErrorKind::InvalidData,
            "Not a gzip file",
        ));
    }

    let decoded = decompress_multi_member_parallel_to_vec(data, num_threads)?;
    writer.write_all(&decoded)?;
    Ok(decoded.len() as u64)
}

/// Decompress a single gzip member with libdeflate's `gzip_decompress_ex`
/// and write the result to `writer`, returning the number of bytes written.
///
/// The output buffer is first sized from the ISIZE trailer (plus 1 KiB of
/// slack) when that value is non-zero and below 1 GiB; otherwise it starts at
/// four times the input size, with a 64 KiB floor. Whenever libdeflate
/// reports insufficient space the buffer is doubled and decompression is
/// retried from the start.
///
/// # Errors
///
/// Returns `io::ErrorKind::InvalidData` for malformed gzip data and
/// propagates any error from `writer`.
fn decompress_single_member<W: Write>(data: &[u8], writer: &mut W) -> io::Result<u64> {
    use crate::backends::libdeflate::{DecompressError, DecompressorEx};

    const ONE_GIB: usize = 1024 * 1024 * 1024;

    let mut decompressor = DecompressorEx::new();

    // ISIZE lives in the last four bytes of the trailer (little-endian).
    let isize_hint = match data {
        [.., b0, b1, b2, b3] if data.len() >= 8 => {
            u32::from_le_bytes([*b0, *b1, *b2, *b3]) as usize
        }
        _ => data.len() * 4,
    };
    let initial_size = if (1..ONE_GIB).contains(&isize_hint) {
        isize_hint + 1024
    } else {
        data.len().saturating_mul(4).max(64 * 1024)
    };

    let mut buf = vec![0u8; initial_size];
    loop {
        let result = match decompressor.gzip_decompress_ex(data, &mut buf) {
            Ok(result) => result,
            Err(DecompressError::InsufficientSpace) => {
                // Output did not fit: double the buffer and try again.
                let doubled = buf.len() * 2;
                buf.resize(doubled, 0);
                continue;
            }
            Err(DecompressError::BadData) => {
                return Err(io::Error::new(
                    io::ErrorKind::InvalidData,
                    "Invalid gzip data",
                ));
            }
        };

        writer.write_all(&buf[..result.output_size])?;
        return Ok(result.output_size as u64);
    }
}

// ============================================================================
// Single-Member Parallel Decompression (rapidgzip strategy)
// ============================================================================
//
// For single-member gzip files, we use a two-phase approach:
// 1. Sequential first pass: decode and record block boundaries + windows
// 2. Parallel second pass: re-decode each segment using windows as dictionaries
//
// This provides speedup when the file is large enough to amortize the overhead.

/// Chunk boundary information collected during the first (sequential) pass.
///
/// Describes one chunk of a single-member deflate stream: where it sits in
/// the compressed bit stream, where its data lands in the output, and the
/// window carried over to the following chunk.
///
/// Note: reserved for a future parallel single-member implementation; the
/// `dead_code` allowance below is there because it is not constructed yet.
#[allow(dead_code)]
#[derive(Debug, Clone)]
struct ChunkBoundary {
    /// Bit position where this chunk starts
    // NOTE(review): presumably measured from the start of the raw deflate
    // data (after the gzip header) — confirm once this is wired up.
    deflate_bit_start: usize,
    /// Bit position where this chunk ends
    deflate_bit_end: usize,
    /// Output offset where this chunk's data starts
    output_offset: usize,
    /// Output size for this chunk
    output_size: usize,
    /// 32KB window at the end of this chunk (for next chunk's dictionary)
    window: Vec<u8>,
}

/// Decompression entry point for single-member gzip files, intended to grow
/// into the rapidgzip two-pass strategy.
///
/// Planned design:
/// 1. **First pass (sequential)**: Decode and collect 32KB windows at chunk intervals
/// 2. **Second pass (parallel)**: Re-decode each chunk using windows as dictionaries
///
/// Current behavior: only the first pass exists. The whole stream is decoded
/// sequentially on the calling thread and written to `writer` in one call;
/// the collected windows are counted (for the debug message) but never used
/// for re-decoding, and `num_threads` only influences the early fallback.
///
/// Falls back to `decompress_single_member` when the ISIZE hint is below
/// 20 MiB or `num_threads <= 1`.
///
/// Returns the number of decompressed bytes written.
///
/// # Errors
///
/// Propagates errors from `skip_gzip_header`, from the block decoders, and
/// from `writer`; returns `io::ErrorKind::InvalidData` for the reserved
/// deflate block type (3).
///
/// Target: 2x-3x speedup over single-threaded on large files
#[allow(dead_code)] // Keep for future use - currently libdeflater is faster for single-member
pub fn decompress_single_member_parallel<W: Write>(
    data: &[u8],
    writer: &mut W,
    num_threads: usize,
) -> io::Result<u64> {
    // For small files or single-thread, use fast sequential
    // ISIZE is the last little-endian u32 of the gzip trailer; it is only a
    // hint here (used for the size threshold and for pre-allocation).
    let isize_hint = if data.len() >= 8 {
        u32::from_le_bytes([
            data[data.len() - 4],
            data[data.len() - 3],
            data[data.len() - 2],
            data[data.len() - 1],
        ]) as usize
    } else {
        data.len() * 4
    };

    // Only use parallel for large files (>20MB uncompressed) with multiple threads
    // The overhead of two-pass decode isn't worth it for smaller files
    const MIN_SIZE_FOR_PARALLEL: usize = 20 * 1024 * 1024;
    const CHUNK_SIZE: usize = 4 * 1024 * 1024; // 4MB chunks like rapidgzip

    if isize_hint < MIN_SIZE_FOR_PARALLEL || num_threads <= 1 {
        return decompress_single_member(data, writer);
    }

    // Parse gzip header
    let header_size = crate::decompress::parallel::marker_decode::skip_gzip_header(data)?;
    // Raw deflate payload: everything between the header and the 8-byte trailer.
    // NOTE(review): this slice panics if header_size exceeds data.len() - 8;
    // whether skip_gzip_header rules that out is not visible here.
    let deflate_data = &data[header_size..data.len().saturating_sub(8)];

    // === FIRST PASS: Sequential decode to collect chunk boundaries and windows ===
    // We decode the entire file and record windows at regular intervals.
    // This is the "boundary finding" pass that rapidgzip does.

    let mut output = Vec::with_capacity(isize_hint);
    let mut chunk_windows: Vec<(usize, Vec<u8>)> = Vec::new(); // (output_offset, window)

    // Decode using CombinedLUT (our fastest pure-Rust decoder)
    let mut bits = FastBits::new(deflate_data);
    let mut out_pos = 0;

    // Pre-allocate output
    // (zero-filled to at least 1 KiB; presumably the decode_*_into helpers
    // grow `output` themselves when the hint is too small — not visible here)
    output.resize(isize_hint.max(1024), 0);

    // One iteration per deflate block: 1-bit BFINAL, 2-bit BTYPE, then the body.
    loop {
        bits.refill();
        let bfinal = bits.read(1);
        let btype = bits.read(2);

        // Remember where this block's output began, to detect chunk crossings.
        let start_out_pos = out_pos;

        match btype {
            0 => out_pos = decode_stored_into(&mut bits, &mut output, out_pos)?,
            1 => out_pos = decode_fixed_into(&mut bits, &mut output, out_pos)?,
            2 => out_pos = decode_dynamic_into(&mut bits, &mut output, out_pos)?,
            3 => {
                return Err(io::Error::new(
                    io::ErrorKind::InvalidData,
                    "Reserved block type",
                ))
            }
            // A 2-bit field is expected to yield only 0..=3.
            _ => unreachable!(),
        }

        // Record window at chunk boundaries (every CHUNK_SIZE bytes of output)
        let chunk_before = start_out_pos / CHUNK_SIZE;
        let chunk_after = out_pos / CHUNK_SIZE;

        if chunk_after > chunk_before && out_pos >= 32 * 1024 {
            // We crossed a chunk boundary - save the 32KB window
            // Only the last boundary crossed by this block is recorded, even
            // if the block spans several chunks. boundary_pos never exceeds
            // out_pos (integer division), so the `.min(out_pos)` is a no-op guard.
            let boundary_pos = chunk_after * CHUNK_SIZE;
            let window_start = boundary_pos.saturating_sub(32 * 1024);
            let window = output[window_start..boundary_pos.min(out_pos)].to_vec();
            chunk_windows.push((boundary_pos, window));
        }

        if bfinal == 1 {
            break;
        }
    }

    // Truncate output to actual size
    output.truncate(out_pos);

    // If we didn't find enough chunk boundaries, just use the sequential result
    // (this early exit writes exactly the same bytes as the final write below;
    // it only skips the optional debug message)
    if chunk_windows.len() < 2 {
        writer.write_all(&output)?;
        return Ok(out_pos as u64);
    }

    // === SECOND PASS: Parallel re-decode using windows ===
    // Note: For now, we just use the first-pass output since re-decoding is complex
    // and our first pass is already fast. The main benefit of two-pass is when the
    // first pass uses a simpler (slower) decoder and the second pass uses SIMD.
    //
    // Since our CombinedLUT first pass is already optimized, the benefit of re-decode
    // is minimal. We keep the first-pass result.
    //
    // Future optimization: Use marker-based decode in first pass (with u16 buffers),
    // then parallel marker resolution in second pass.

    // Diagnostics are opt-in: printed to stderr only when GZIPPY_DEBUG is set.
    if std::env::var("GZIPPY_DEBUG").is_ok() {
        eprintln!(
            "[gzippy] Single-member parallel: {} bytes, {} chunk boundaries found",
            out_pos,
            chunk_windows.len()
        );
    }

    writer.write_all(&output)?;
    Ok(out_pos as u64)
}

/// Report whether `data` holds more than one gzip member.
///
/// Decompresses the first member (growing the scratch buffer as needed) and
/// then checks that at least 18 bytes follow it and that they begin with the
/// gzip magic bytes `0x1f 0x8b`. Inputs shorter than 36 bytes, and inputs
/// whose first member is not valid gzip, yield `false`.
#[allow(dead_code)]
pub fn is_multi_member(data: &[u8]) -> bool {
    use crate::backends::libdeflate::{DecompressError, DecompressorEx};

    // Two members of the minimum 18 bytes each.
    if data.len() < 36 {
        return false;
    }

    let mut decompressor = DecompressorEx::new();
    let mut scratch = vec![0u8; 256 * 1024];
    loop {
        let first_member = match decompressor.gzip_decompress_ex(data, &mut scratch) {
            Ok(result) => result,
            Err(DecompressError::InsufficientSpace) => {
                // First member did not fit: double the scratch buffer and retry.
                let doubled = scratch.len() * 2;
                scratch.resize(doubled, 0);
                continue;
            }
            Err(DecompressError::BadData) => return false,
        };

        // Whatever follows the first member must itself look like a member.
        return data
            .get(first_member.input_consumed..)
            .is_some_and(|rest| rest.len() >= 18 && rest.starts_with(&[0x1f, 0x8b]));
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::assert_slices_eq;

    /// Panics with a compact diagnostic when `actual` and `expected` differ.
    ///
    /// The message carries `context`, both lengths and — when the slices
    /// disagree inside their common prefix — the first differing offset plus a
    /// short printable excerpt of each slice around it. Equal slices return
    /// without panicking.
    fn assert_bytes_eq(actual: &[u8], expected: &[u8], context: &str) {
        // Renders bytes as text, substituting '.' for anything that is neither
        // an ASCII graphic character nor a space.
        fn printable(bytes: &[u8]) -> String {
            bytes
                .iter()
                .map(|&b| {
                    if b.is_ascii_graphic() || b == b' ' {
                        b as char
                    } else {
                        '.'
                    }
                })
                .collect()
        }

        if actual == expected {
            return;
        }

        let mut report = format!("\n{} - byte mismatch:\n", context);
        report.push_str(&format!(
            "  lengths: actual={}, expected={}\n",
            actual.len(),
            expected.len()
        ));

        // Only the overlapping prefix is scanned; when the slices differ purely
        // in length there is no offset and the report ends after the lengths.
        let divergence = actual.iter().zip(expected).position(|(a, b)| a != b);
        if let Some(at) = divergence {
            report.push_str(&format!("  first diff at byte {}\n", at));
            report.push_str(&format!(
                "  actual[{}]={:#04x}, expected[{}]={:#04x}\n",
                at, actual[at], at, expected[at]
            ));

            // Excerpt window: up to 10 bytes before and 20 bytes from the
            // mismatch, clamped to the shorter slice.
            let lo = at.saturating_sub(10);
            let hi = (at + 20).min(actual.len()).min(expected.len());
            if hi > lo {
                report.push_str(&format!(
                    "  actual  [{}..{}]: \"{}\"\n",
                    lo,
                    hi,
                    printable(&actual[lo..hi])
                ));
                report.push_str(&format!(
                    "  expected[{}..{}]: \"{}\"\n",
                    lo,
                    hi,
                    printable(&expected[lo..hi])
                ));
            }
        }
        panic!("{}", report);
    }

    // =========================================================================
    // TURBO PATH UNIT TESTS - Debug the optimized decoder
    // =========================================================================

    /// Reads two consecutive bytes through `TurboBits` and checks their values.
    #[test]
    fn test_turbo_bits_basic() {
        // Eight input bytes.
        let input = [0x12, 0x34, 0x56, 0x78, 0x9A, 0xBC, 0xDE, 0xF0];
        let mut reader = TurboBits::new(&input);

        // Construction alone is expected to make at least one byte available.
        assert!(reader.has_bits(8), "Should have at least 8 bits");

        // No explicit refill precedes the first read.
        let first = reader.read(8);
        assert_eq!(first, 0x12, "First byte should be 0x12");

        // Refill, then fetch the following byte.
        reader.ensure(8);
        let second = reader.read(8);
        assert_eq!(second, 0x34, "Second byte should be 0x34");
    }

    /// Checks that `align()` after a partial read continues at the next byte.
    #[test]
    fn test_turbo_bits_align() {
        let input = [0xFF, 0x12, 0x34, 0x56, 0x78, 0x9A, 0xBC, 0xDE];
        let mut reader = TurboBits::new(&input);

        // Take three bits of the leading 0xFF so the reader sits mid-byte.
        let _ = reader.read(3);

        // Realign; the remaining five bits of the first byte should be skipped.
        reader.align();

        // The next whole byte must be the second input byte.
        reader.ensure(8);
        let next = reader.read(8);
        assert_eq!(next, 0x12, "After align, should read 0x12");
    }

    /// Round-trips a short string with no repeated substrings through
    /// `inflate_into_pub` twice and checks both results against the input.
    ///
    /// NOTE(review): the decode labelled "standard" and the one labelled
    /// "turbo" both call `inflate_into_pub`, so as written the function is
    /// compared with itself — confirm whether a separate turbo entry point was
    /// meant to be exercised.
    #[test]
    fn test_turbo_inflate_literals() {
        use flate2::write::DeflateEncoder;
        use flate2::Compression;
        use std::io::Write as IoWrite;

        // 26 distinct bytes: nothing for a back-reference to point at.
        let original = b"ABCDEFGHIJKLMNOPQRSTUVWXYZ";

        // Compression level 1.
        let compressed = {
            let mut enc = DeflateEncoder::new(Vec::new(), Compression::new(1));
            enc.write_all(original).unwrap();
            enc.finish().unwrap()
        };

        eprintln!(
            "Original: {} bytes, Compressed: {} bytes",
            original.len(),
            compressed.len()
        );
        let preview_len = compressed.len().min(32);
        eprintln!("Compressed hex: {:02x?}", &compressed[..preview_len]);

        // First decode ("standard").
        let mut std_buf = vec![0u8; original.len() + 100];
        let std_len = inflate_into_pub(&compressed, &mut std_buf).unwrap();
        assert_eq!(&std_buf[..std_len], &original[..], "Standard path failed");
        eprintln!("Standard decoded: {} bytes", std_len);

        // Second decode ("turbo").
        let mut turbo_buf = vec![0u8; original.len() + 100];
        let turbo_len = inflate_into_pub(&compressed, &mut turbo_buf).unwrap();
        eprintln!("Turbo decoded: {} bytes", turbo_len);
        eprintln!(
            "Turbo output: {:?}",
            String::from_utf8_lossy(&turbo_buf[..turbo_len])
        );
        eprintln!("Expected:     {:?}", String::from_utf8_lossy(original));

        // Both decodes must agree on length, and the second must equal the input.
        assert_eq!(
            turbo_len, std_len,
            "Turbo size mismatch: {} vs {}",
            turbo_len, std_len
        );
        assert_eq!(
            &turbo_buf[..turbo_len],
            &original[..],
            "Turbo content mismatch"
        );
    }

    /// Round-trips 1000 identical bytes through `inflate_into_pub` twice.
    ///
    /// NOTE(review): both the "standard" and the "turbo" decode call
    /// `inflate_into_pub` — confirm whether a distinct turbo entry point was
    /// intended.
    #[test]
    fn test_turbo_inflate_rle() {
        use flate2::write::DeflateEncoder;
        use flate2::Compression;
        use std::io::Write as IoWrite;

        // A single repeated byte; the original author expects the encoder to
        // emit distance-1 matches for it.
        let original = vec![b'X'; 1000];

        let compressed = {
            let mut enc = DeflateEncoder::new(Vec::new(), Compression::default());
            enc.write_all(&original).unwrap();
            enc.finish().unwrap()
        };

        // First decode ("standard").
        let mut std_buf = vec![0u8; original.len() + 100];
        let std_len = inflate_into_pub(&compressed, &mut std_buf).unwrap();
        assert_eq!(&std_buf[..std_len], &original[..], "Standard path failed");

        // Second decode ("turbo").
        let mut turbo_buf = vec![0u8; original.len() + 100];
        let turbo_len = inflate_into_pub(&compressed, &mut turbo_buf).unwrap();

        assert_eq!(
            turbo_len, std_len,
            "Turbo size mismatch: {} vs {}",
            turbo_len, std_len
        );
        assert_eq!(
            &turbo_buf[..turbo_len],
            &original[..],
            "Turbo content mismatch"
        );
    }

    /// Round-trips 500 bytes of a cycled English sentence through
    /// `inflate_into_pub` twice, reporting mismatches via `assert_bytes_eq`.
    ///
    /// NOTE(review): both the "standard" and the "turbo" decode call
    /// `inflate_into_pub` — confirm whether a distinct turbo entry point was
    /// intended.
    #[test]
    fn test_turbo_inflate_mixed() {
        use flate2::write::DeflateEncoder;
        use flate2::Compression;
        use std::io::Write as IoWrite;

        // The sentence repeated until 500 bytes have been produced.
        let sentence = b"The quick brown fox jumps over the lazy dog. ";
        let original: Vec<u8> = sentence.iter().cycle().take(500).copied().collect();

        let compressed = {
            let mut enc = DeflateEncoder::new(Vec::new(), Compression::default());
            enc.write_all(&original).unwrap();
            enc.finish().unwrap()
        };

        // First decode ("standard").
        let mut std_buf = vec![0u8; original.len() + 100];
        let std_len = inflate_into_pub(&compressed, &mut std_buf).unwrap();
        assert_bytes_eq(&std_buf[..std_len], &original[..], "standard path");

        // Second decode ("turbo").
        let mut turbo_buf = vec![0u8; original.len() + 100];
        let turbo_len = inflate_into_pub(&compressed, &mut turbo_buf).unwrap();

        assert_eq!(
            turbo_len, std_len,
            "Turbo size mismatch: {} vs {}",
            turbo_len, std_len
        );
        assert_bytes_eq(
            &turbo_buf[..turbo_len],
            &original[..],
            "turbo_inflate_mixed",
        );
    }

    /// Decodes a tiny block directly with `decode_huffman_turbo` using the
    /// fixed-Huffman tables from `get_fixed_tables_turbo`.
    ///
    /// The 3-bit block header is consumed by hand. If the encoder did not emit
    /// a fixed-Huffman block (BTYPE != 1) the test only logs a message and
    /// performs no checks.
    #[test]
    fn test_decode_huffman_turbo_fixed() {
        use flate2::write::DeflateEncoder;
        use flate2::Compression;
        use std::io::Write as IoWrite;

        let original = b"Hello, World!";

        // The original author expects the fast level to choose fixed Huffman.
        let compressed = {
            let mut enc = DeflateEncoder::new(Vec::new(), Compression::fast());
            enc.write_all(original).unwrap();
            enc.finish().unwrap()
        };

        let (lit_len_table, dist_table, packed_lut) = get_fixed_tables_turbo();

        // Read BFINAL (1 bit) and BTYPE (2 bits) off the front of the stream.
        let mut bits = TurboBits::new(&compressed);
        bits.ensure(16);
        let bfinal = bits.read(1);
        let btype = bits.read(2);

        eprintln!("Block: bfinal={}, btype={}", bfinal, btype);

        // Anything other than a fixed-Huffman block is outside this test.
        if btype != 1 {
            eprintln!("Skipping - not a fixed Huffman block (btype={})", btype);
            return;
        }

        let mut output = vec![0u8; original.len() + 100];
        let decoded = decode_huffman_turbo(
            &mut bits,
            &mut output,
            0,
            packed_lut,
            lit_len_table,
            dist_table,
        );

        let size = match decoded {
            Ok(n) => n,
            Err(e) => panic!("decode_huffman_turbo failed: {}", e),
        };
        eprintln!("Decoded {} bytes", size);
        assert_eq!(size, original.len(), "Size mismatch");
        assert_slices_eq!(&output[..size], &original[..], "Content mismatch");
    }

    /// Builds a `PackedLUT` from the fixed-Huffman code lengths and tallies its
    /// entries by kind, requiring at least one literal and one end-of-block
    /// entry.
    #[test]
    fn test_packed_lut_entries() {
        use crate::decompress::packed_lut::PackedLUT;

        // Fixed-Huffman literal/length code lengths; every distance code is 5 bits.
        let lit_len_lens = get_fixed_lit_len_lens();
        let dist_lens = vec![5u8; 32];

        eprintln!("Code lengths for A-Z:");
        for letter in b'A'..=b'Z' {
            eprintln!(
                "  '{}' ({}) = {} bits",
                letter as char, letter, lit_len_lens[letter as usize]
            );
        }

        let packed_lut = PackedLUT::build(&lit_len_lens, &dist_lens).unwrap();

        // Walk the whole table. A valid entry that is none of literal / EOB /
        // slow-path is not counted anywhere.
        let (mut literals, mut eobs, mut slow_paths, mut invalid) = (0, 0, 0, 0);
        for entry in packed_lut.table.iter() {
            if !entry.is_valid() {
                invalid += 1;
                continue;
            }
            if entry.is_literal() {
                literals += 1;
            } else if entry.is_eob() {
                eobs += 1;
            } else if entry.is_slow_path() {
                slow_paths += 1;
            }
        }

        eprintln!(
            "PackedLUT entries: literals={}, eobs={}, slow_paths={}, invalid={}",
            literals, eobs, slow_paths, invalid
        );

        assert!(literals > 0, "Should have literal entries");
        assert!(eobs > 0, "Should have EOB entries");
    }

    /// Diagnostic test: decodes alphabet prefixes of growing length twice and,
    /// on the first disagreement, dumps both outputs byte by byte before
    /// panicking.
    ///
    /// NOTE(review): both decodes call `inflate_into_pub` — confirm whether a
    /// distinct turbo entry point was intended for the second one.
    #[test]
    fn test_turbo_decode_trace() {
        use flate2::write::DeflateEncoder;
        use flate2::Compression;
        use std::io::Write as IoWrite;

        // Growing input lengths, to narrow down where a failure first appears.
        for size in [8, 10, 12, 16, 20, 24, 26] {
            // 'A', 'B', 'C', ... up to `size` bytes.
            let original: Vec<u8> = (b'A'..).take(size).collect();

            let compressed = {
                let mut enc = DeflateEncoder::new(Vec::new(), Compression::fast());
                enc.write_all(&original).unwrap();
                enc.finish().unwrap()
            };

            // First decode ("standard").
            let mut std_buf = vec![0u8; 100];
            let std_len = inflate_into_pub(&compressed, &mut std_buf).unwrap();

            // Second decode ("turbo").
            let mut turbo_buf = vec![0u8; 100];
            let turbo_len = inflate_into_pub(&compressed, &mut turbo_buf).unwrap();

            let agree = turbo_len == std_len && turbo_buf[..turbo_len] == std_buf[..std_len];
            if agree {
                eprintln!("Size {}: OK", size);
                continue;
            }

            eprintln!("\n=== MISMATCH at size {} ===", size);
            eprintln!("Original: {:?}", String::from_utf8_lossy(&original));
            eprintln!(
                "Compressed: {} bytes, hex: {:02x?}",
                compressed.len(),
                &compressed
            );
            eprintln!(
                "Standard: {} bytes, output: {:?}",
                std_len,
                String::from_utf8_lossy(&std_buf[..std_len])
            );
            eprintln!(
                "Turbo: {} bytes, output: {:?}",
                turbo_len,
                String::from_utf8_lossy(&turbo_buf[..turbo_len])
            );

            // Per-position comparison; positions past the end of the shorter
            // output are treated as 0.
            for i in 0..std_len.max(turbo_len) {
                let std_byte = std_buf[..std_len].get(i).copied().unwrap_or(0);
                let turbo_byte = turbo_buf[..turbo_len].get(i).copied().unwrap_or(0);
                if std_byte != turbo_byte {
                    eprintln!(
                        "  Position {}: std='{}' (0x{:02x}) vs turbo='{}' (0x{:02x})",
                        i, std_byte as char, std_byte, turbo_byte as char, turbo_byte
                    );
                }
            }
            panic!("Turbo mismatch at size {}", size);
        }
    }

    // =========================================================================
    // ORIGINAL TESTS
    // =========================================================================

    /// Round-trips a short sentence through flate2's raw-deflate encoder and
    /// `inflate_into`, using an output buffer of exactly the original size.
    #[test]
    fn test_inflate_into() {
        use flate2::write::DeflateEncoder;
        use flate2::Compression;
        use std::io::Write as IoWrite;

        let original = b"Hello, World! This is a test of the BGZF inflate_into function.";

        let compressed = {
            let mut enc = DeflateEncoder::new(Vec::new(), Compression::default());
            enc.write_all(original).unwrap();
            enc.finish().unwrap()
        };

        // The destination has no slack beyond the expected output length.
        let mut output = vec![0u8; original.len()];
        let written = inflate_into(&compressed, &mut output).unwrap();

        assert_eq!(written, original.len());
        assert_slices_eq!(&output[..written], &original[..]);
    }

    /// Smoke-tests `decode_huffman_asm_x64` on four deflate streams, then
    /// verifies that `inflate_into` reproduces each input exactly.
    ///
    /// The asm decoder is always handed fixed-Huffman tables together with the
    /// unmodified compressed stream, so its result is only logged: neither
    /// `Ok` nor `Err` fails the test. The strict checks are the `inflate_into`
    /// round-trips at the end.
    #[test]
    fn test_decode_huffman_asm_x64() {
        use crate::decompress::packed_lut::PackedLUT;
        use flate2::write::DeflateEncoder;
        use flate2::Compression;
        use std::io::Write as IoWrite;

        // Raw-deflate compression of `bytes` at the given level.
        let deflate = |bytes: &[u8], level: Compression| -> Vec<u8> {
            let mut enc = DeflateEncoder::new(Vec::new(), level);
            enc.write_all(bytes).unwrap();
            enc.finish().unwrap()
        };

        // Input 1: distinct characters only.
        let original1 = b"ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
        let compressed1 = deflate(original1, Compression::fast());

        // Input 2: a single byte repeated 1000 times.
        let original2 = vec![b'X'; 1000];
        let compressed2 = deflate(&original2, Compression::default());

        // Input 3: a sentence cycled out to 2000 bytes.
        let sentence = b"The quick brown fox jumps over the lazy dog. ";
        let original3: Vec<u8> = sentence.iter().cycle().take(2000).copied().collect();
        let compressed3 = deflate(&original3, Compression::default());

        // Input 4: the bytes 0..200 with "REPEAT" inserted after every
        // multiple of ten.
        let mut original4 = Vec::new();
        for value in 0u8..200 {
            original4.push(value);
            if value % 10 == 0 {
                original4.extend(b"REPEAT");
            }
        }
        let compressed4 = deflate(&original4, Compression::default());

        // Fixed-Huffman literal/length code lengths:
        // symbols 0-143 -> 8 bits, 144-255 -> 9, 256-279 -> 7, 280-287 -> 8.
        let lit_len_lens: Vec<u8> = (0..288u16)
            .map(|sym| match sym {
                0..=143 => 8,
                144..=255 => 9,
                256..=279 => 7,
                _ => 8,
            })
            .collect();
        let dist_lens = vec![5u8; 32];

        let packed_lut = PackedLUT::build(&lit_len_lens, &dist_lens).unwrap();
        let dist_table = TwoLevelTable::build(&dist_lens).unwrap();

        // Runs the asm decoder on one stream and logs what happened.
        // `expected` is used only to size the output buffer.
        let test_stream = |compressed: &[u8], expected: &[u8], name: &str| {
            let mut output = vec![0u8; expected.len() + 1000];

            // NOTE(review): the safety contract of `decode_huffman_asm_x64` is
            // not visible from this test — confirm its preconditions hold.
            let outcome = unsafe {
                decode_huffman_asm_x64(compressed, &mut output, 0, &packed_lut, &dist_table)
            };

            match outcome {
                // A non-empty decode is logged but deliberately not compared
                // with `expected`.
                Ok(size) if size > 0 => eprintln!("{}: decoded {} bytes", name, size),
                Ok(_) => {}
                // Errors are tolerated: the tables may not match the stream.
                Err(e) => eprintln!("{}: error (expected for dynamic blocks): {}", name, e),
            }
        };

        test_stream(&compressed1, original1, "literals");
        test_stream(&compressed2, &original2, "rle");
        test_stream(&compressed3, &original3, "repeated");
        test_stream(&compressed4, &original4, "mixed");

        // Strict check: the regular decoder must reproduce every input.
        let cases = [
            (&compressed1[..], &original1[..], "literals"),
            (&compressed2[..], &original2[..], "rle"),
            (&compressed3[..], &original3[..], "repeated"),
            (&compressed4[..], &original4[..], "mixed"),
        ];
        for (compressed, original, name) in cases {
            let mut output = vec![0u8; original.len() + 1000];
            let size = inflate_into(compressed, &mut output).unwrap();
            assert_slices_eq!(&output[..size], original, format!("{} mismatch", name));
        }
    }

    /// Round-trips three differently shaped inputs through flate2 and
    /// `inflate_into`, checking decoded size and content for each.
    #[test]
    fn test_multi_literal_correctness() {
        use flate2::write::DeflateEncoder;
        use flate2::Compression;
        use std::io::Write as IoWrite;

        // Compress `original` at `level`, decode it into a buffer with 1000
        // bytes of slack, and require an exact match.
        let roundtrip = |original: &[u8], level: Compression, size_msg: &str| {
            let mut enc = DeflateEncoder::new(Vec::new(), level);
            enc.write_all(original).unwrap();
            let compressed = enc.finish().unwrap();

            let mut output = vec![0u8; original.len() + 1000];
            let size = inflate_into(&compressed, &mut output).unwrap();
            assert_eq!(size, original.len(), "{}", size_msg);
            assert_eq!(&output[..size], original, "Content mismatch");
        };

        // Case 1: a 0..=255 ramp repeated up to 10,000 bytes, fast level.
        let ramp: Vec<u8> = (0..10_000).map(|i| (i % 256) as u8).collect();
        roundtrip(&ramp, Compression::fast(), "Size mismatch for literals-only");

        // Case 2: a ten-character string repeated 1000 times.
        let repeated: Vec<u8> = "ABCDEFGHIJ".repeat(1000).into_bytes();
        roundtrip(
            &repeated,
            Compression::default(),
            "Size mismatch for repetitive",
        );

        // Case 3: 100 rounds of a 50-byte run followed by a fixed marker string.
        let mut mixed = Vec::new();
        for round in 0..100 {
            mixed.extend_from_slice(&[(round * 7) as u8; 50]);
            mixed.extend_from_slice(b"REPEAT_THIS_STRING_");
        }
        roundtrip(&mixed, Compression::default(), "Size mismatch for mixed");
    }

    /// Micro-benchmark of the bare lookup-and-consume loop: refill, table
    /// lookup, consume — with no dispatch on the decoded entry.
    ///
    /// Nothing is asserted; throughput is printed to stderr.
    #[test]
    fn microbench_decode_loop() {
        use crate::decompress::two_level_table::FastBits;

        // Synthetic 8 MB byte stream used as bit-reader input.
        let data: Vec<u8> = (0..8_000_000u64).map(|i| (i * 7 % 256) as u8).collect();

        // Fixed-Huffman literal/length code lengths:
        // symbols 0-143 -> 8 bits, 144-255 -> 9, 256-279 -> 7, 280-287 -> 8.
        let lens: Vec<u8> = (0..288u16)
            .map(|sym| match sym {
                0..=143 => 8,
                144..=255 => 9,
                256..=279 => 7,
                _ => 8,
            })
            .collect();
        let lut = crate::decompress::combined_lut::CombinedLUT::build(&lens, &[5u8; 32]).unwrap();

        let iterations = 5_000_000u64;
        let mut bits = FastBits::new(&data);
        // Accumulated and printed so the loop body has an observable result.
        let mut sum = 0u64;

        let timer = std::time::Instant::now();
        for _ in 0..iterations {
            bits.ensure(12);
            let entry = lut.decode(bits.buffer());
            bits.consume(entry.bits_to_skip as u32);
            sum += entry.symbol_or_length as u64;
        }
        let elapsed = timer.elapsed();
        let mops = iterations as f64 / elapsed.as_secs_f64() / 1_000_000.0;

        eprintln!("\n=== Decode Loop Micro-Benchmark ===");
        eprintln!("Tight loop (no branching): {:.1} M/s", mops);
        eprintln!("Sum (prevent optimization): {}", sum);

        // Original author's notes (figures not re-measured here):
        //   - about 1500 M ops/s is reachable without branching;
        //   - the real decode loop runs at about 1470 M symbols/s (11,773 MB/s);
        //   - the 61% gap to libdeflate (18,952 MB/s) is attributed to branch
        //     overhead in the main decode loop rather than to bit operations.
    }

    /// Times `inflate_into` against libdeflate on the same raw-deflate stream
    /// and prints both throughputs; only the decoded size is asserted.
    #[test]
    fn benchmark_inflate_into() {
        use flate2::write::DeflateEncoder;
        use flate2::Compression;
        use std::io::Write as IoWrite;

        // 1,000,000 bytes of patterned data.
        let original: Vec<u8> = (0..1_000_000)
            .map(|i| ((i * 7 + i / 100) % 256) as u8)
            .collect();

        let compressed = {
            let mut enc = DeflateEncoder::new(Vec::new(), Compression::default());
            enc.write_all(&original).unwrap();
            enc.finish().unwrap()
        };

        let iterations = 50;
        // Decoded megabytes per second for `iterations` full passes.
        let mb_per_sec = |elapsed: std::time::Duration| {
            original.len() as f64 * iterations as f64 / elapsed.as_secs_f64() / 1_000_000.0
        };

        // Three untimed warm-up passes.
        let mut output = vec![0u8; original.len() + 1000];
        for _ in 0..3 {
            let _ = inflate_into(&compressed, &mut output);
        }

        // Timed: our decoder.
        let timer = std::time::Instant::now();
        for _ in 0..iterations {
            let _ = inflate_into(&compressed, &mut output);
        }
        let our_time = timer.elapsed();
        let our_speed = mb_per_sec(our_time);

        // Timed: libdeflate, decoding into its own buffer.
        let mut libdeflate = libdeflater::Decompressor::new();
        let mut ld_output = vec![0u8; original.len() + 1000];

        let timer = std::time::Instant::now();
        for _ in 0..iterations {
            let _ = libdeflate.deflate_decompress(&compressed, &mut ld_output);
        }
        let ld_time = timer.elapsed();
        let ld_speed = mb_per_sec(ld_time);

        // Values above 1.0 mean our decoder took longer.
        let ratio = our_time.as_secs_f64() / ld_time.as_secs_f64();

        eprintln!("\n=== inflate_into vs libdeflate ===");
        eprintln!("Our inflate_into: {:.1} MB/s", our_speed);
        eprintln!("libdeflate:       {:.1} MB/s", ld_speed);
        eprintln!("Ratio: {:.2}x slower than libdeflate", ratio);
        eprintln!("Gap to close: {:.0}%", (ratio - 1.0) * 100.0);

        // Sanity check: the decoder still produces the full output length.
        let size = inflate_into(&compressed, &mut output).unwrap();
        assert_eq!(size, original.len());
    }

    /// Times repeated `inflate_into` calls on a mixed-content stream and prints
    /// the throughput. Nothing is asserted.
    ///
    /// NOTE(review): despite the name, only `inflate_into` is timed here; no
    /// separate packed-LUT decode is run for comparison.
    #[test]
    fn benchmark_packed_vs_combined() {
        use flate2::write::DeflateEncoder;
        use flate2::Compression;
        use std::io::Write as IoWrite;

        // Two patterned bytes per step, plus a run of ten b'A' every 100th
        // step: 210,000 bytes in total (capacity is reserved for 1,000,000).
        let mut original = Vec::with_capacity(1_000_000);
        for step in 0..100_000 {
            original.push(((step * 7) % 256) as u8);
            original.push((step % 256) as u8);
            if step % 100 == 0 {
                original.extend(std::iter::repeat_n(b'A', 10));
            }
        }

        let compressed = {
            let mut enc = DeflateEncoder::new(Vec::new(), Compression::fast());
            enc.write_all(&original).unwrap();
            enc.finish().unwrap()
        };

        // Five untimed warm-up passes.
        let mut output = vec![0u8; original.len() + 10000];
        for _ in 0..5 {
            let _ = inflate_into(&compressed, &mut output);
        }

        // Timed passes.
        let iterations = 100;
        let timer = std::time::Instant::now();
        for _ in 0..iterations {
            let _ = inflate_into(&compressed, &mut output);
        }
        let elapsed = timer.elapsed();
        let speed =
            original.len() as f64 * iterations as f64 / elapsed.as_secs_f64() / 1_000_000.0;

        eprintln!("\n=== inflate_into (CombinedLUT) Benchmark ===");
        eprintln!("Output size: {} bytes", original.len());
        eprintln!("Iterations: {}", iterations);
        eprintln!("Speed: {:.1} MB/s", speed);
    }

    /// Times `decode_huffman_turbo` (driven with fixed-Huffman tables) against
    /// `inflate_into` and libdeflate on the same stream and prints the three
    /// throughputs. Every decode result is discarded and nothing is asserted.
    #[test]
    fn benchmark_turbo_decoder() {
        use crate::decompress::packed_lut::PackedLUT;
        use crate::decompress::two_level_table::TurboBits;
        use flate2::write::DeflateEncoder;
        use flate2::Compression;
        use std::io::Write as IoWrite;

        // Two patterned bytes per step, plus a run of ten b'A' every 100th
        // step: 210,000 bytes in total (capacity is reserved for 1,000,000).
        let mut original = Vec::with_capacity(1_000_000);
        for step in 0..100_000 {
            original.push(((step * 7) % 256) as u8);
            original.push((step % 256) as u8);
            if step % 100 == 0 {
                original.extend(std::iter::repeat_n(b'A', 10));
            }
        }

        let compressed = {
            let mut enc = DeflateEncoder::new(Vec::new(), Compression::fast());
            enc.write_all(&original).unwrap();
            enc.finish().unwrap()
        };

        // Fixed-Huffman literal/length code lengths:
        // symbols 0-143 -> 8 bits, 144-255 -> 9, 256-279 -> 7, 280-287 -> 8.
        let lit_len_lens: Vec<u8> = (0..288u16)
            .map(|sym| match sym {
                0..=143 => 8,
                144..=255 => 9,
                256..=279 => 7,
                _ => 8,
            })
            .collect();
        let dist_lens = vec![5u8; 32];

        let packed_lut = PackedLUT::build(&lit_len_lens, &dist_lens).unwrap();
        let lit_len_table = TwoLevelTable::build(&lit_len_lens).unwrap();
        let dist_table = TwoLevelTable::build(&dist_lens).unwrap();

        let mut output = vec![0u8; original.len() + 10000];
        let iterations = 100;

        // One turbo pass: fresh bit reader, skip the 3 header bits, decode.
        let mut turbo_pass = || {
            let mut bits = TurboBits::new(&compressed);
            bits.ensure(16);
            let _ = bits.read(3);
            let _ = decode_huffman_turbo(
                &mut bits,
                &mut output,
                0,
                &packed_lut,
                &lit_len_table,
                &dist_table,
            );
        };

        // Five untimed warm-up passes.
        for _ in 0..5 {
            turbo_pass();
        }

        // Timed: turbo decoder.
        let timer = std::time::Instant::now();
        for _ in 0..iterations {
            turbo_pass();
        }
        let turbo_time = timer.elapsed();
        let turbo_speed =
            original.len() as f64 * iterations as f64 / turbo_time.as_secs_f64() / 1_000_000.0;

        // Timed: regular inflate_into.
        let timer = std::time::Instant::now();
        for _ in 0..iterations {
            let _ = inflate_into(&compressed, &mut output);
        }
        let standard_time = timer.elapsed();
        let standard_speed =
            original.len() as f64 * iterations as f64 / standard_time.as_secs_f64() / 1_000_000.0;

        // Timed: libdeflate.
        let mut decompressor = libdeflater::Decompressor::new();
        let timer = std::time::Instant::now();
        for _ in 0..iterations {
            let _ = decompressor.deflate_decompress(&compressed, &mut output);
        }
        let libdeflate_time = timer.elapsed();
        let libdeflate_speed =
            original.len() as f64 * iterations as f64 / libdeflate_time.as_secs_f64() / 1_000_000.0;

        eprintln!("\n=== Phase 1 Turbo Decoder Benchmark ===");
        eprintln!("Data size: {} bytes", original.len());
        eprintln!("Turbo (Phase 1):  {:.1} MB/s", turbo_speed);
        eprintln!("Standard:         {:.1} MB/s", standard_speed);
        eprintln!("libdeflate:       {:.1} MB/s", libdeflate_speed);
        eprintln!("Turbo vs standard: {:.2}x", turbo_speed / standard_speed);
        eprintln!(
            "Turbo vs libdeflate: {:.0}%",
            turbo_speed / libdeflate_speed * 100.0
        );
    }

    /// Compares `decompress_bgzf_parallel` (8 threads) with flate2's
    /// `MultiGzDecoder` on a benchmark fixture.
    ///
    /// The test returns early without checking anything when the fixture file
    /// cannot be read.
    #[test]
    fn test_bgzf_parallel() {
        use std::io::Read;

        let Ok(data) = std::fs::read("benchmark_data/test-gzippy-l1-t14.gz") else {
            eprintln!("Skipping test - no gzippy test file");
            return;
        };

        // Reference output from flate2.
        let mut expected = Vec::new();
        flate2::read::MultiGzDecoder::new(&data[..])
            .read_to_end(&mut expected)
            .unwrap();

        // Output of the decoder under test.
        let mut output = Vec::new();
        decompress_bgzf_parallel(&data, &mut output, 8).unwrap();

        assert_eq!(output.len(), expected.len(), "Size mismatch");

        // Lengths are equal here, so one offset indexes both buffers. On the
        // first differing byte, print some surrounding context and fail.
        let first_mismatch = output
            .iter()
            .zip(expected.iter())
            .position(|(got, want)| got != want);
        if let Some(i) = first_mismatch {
            let start = i.saturating_sub(10);
            let end = (i + 20).min(output.len());
            eprintln!(
                "First mismatch at byte {}: got {:02x} expected {:02x}",
                i, output[i], expected[i]
            );
            eprintln!("Context - ours: {:02x?}", &output[start..end]);
            eprintln!("Context - expected: {:02x?}", &expected[start..end]);
            panic!("Content mismatch at byte {}", i);
        }
    }

    /// Rough throughput measurement for the parallel BGZF decoder on the optional
    /// gzippy fixture. Prints MB/s to stderr; nothing about speed is asserted.
    #[test]
    fn benchmark_bgzf_parallel() {
        use std::io::Read;

        const FIXTURE: &str = "benchmark_data/test-gzippy-l1-t14.gz";
        const WARMUP_RUNS: u32 = 3;
        const TIMED_RUNS: u32 = 5;

        let data = if let Ok(bytes) = std::fs::read(FIXTURE) {
            bytes
        } else {
            eprintln!("Skipping benchmark - no test file");
            return;
        };

        // The uncompressed size is taken from a reference decode with flate2.
        let mut reference = Vec::new();
        flate2::read::MultiGzDecoder::new(&data[..])
            .read_to_end(&mut reference)
            .unwrap();
        let expected_size = reference.len();

        // One complete decode into a fresh output buffer.
        let decode_once = || {
            let mut sink = Vec::new();
            decompress_bgzf_parallel(&data, &mut sink, 8).unwrap();
        };

        // Untimed warm-up passes.
        for _ in 0..WARMUP_RUNS {
            decode_once();
        }

        // Timed passes; the reported figure is the mean per-pass throughput.
        let timer = std::time::Instant::now();
        for _ in 0..TIMED_RUNS {
            decode_once();
        }
        let per_run = timer.elapsed() / TIMED_RUNS;
        let speed = expected_size as f64 / per_run.as_secs_f64() / 1_000_000.0;

        eprintln!("BGZF parallel (8 threads): {:.1} MB/s", speed);
    }

    /// Builds a three-member gzip stream in memory and checks that the parallel
    /// multi-member decoder reproduces the concatenated plaintext.
    #[test]
    fn test_multi_member_parallel() {
        use flate2::write::GzEncoder;
        use flate2::Compression;
        use std::io::Write as IoWrite;

        // Compress one payload into a standalone gzip member.
        let gzip_member = |payload: &[u8]| -> Vec<u8> {
            let mut enc = GzEncoder::new(Vec::new(), Compression::default());
            enc.write_all(payload).unwrap();
            enc.finish().unwrap()
        };

        // Three 100 000-byte ramps that differ only in their starting offset.
        let parts: Vec<Vec<u8>> = [0, 50, 100]
            .iter()
            .map(|&shift| (0..100_000).map(|i| ((i + shift) % 256) as u8).collect())
            .collect();

        // Members are appended back-to-back
        // (like `cat part1.gz part2.gz part3.gz > multi.gz`).
        let mut multi: Vec<u8> = Vec::new();
        let mut expected: Vec<u8> = Vec::new();
        for part in parts.iter().map(Vec::as_slice) {
            multi.extend_from_slice(&gzip_member(part));
            expected.extend_from_slice(part);
        }

        assert!(is_multi_member(&multi), "Should detect multi-member");

        // Decoder under test (third argument: 4).
        let mut output = Vec::new();
        decompress_multi_member_parallel(&multi, &mut output, 4).unwrap();

        assert_eq!(output.len(), expected.len(), "Size mismatch");
        assert_slices_eq!(output, expected, "Content mismatch");
    }

    /// Round-trips a ten-member gzip stream (50 000 bytes of plaintext per member)
    /// through the parallel multi-member decoder and checks the output.
    #[test]
    fn test_multi_member_large() {
        use flate2::write::GzEncoder;
        use flate2::Compression;
        use std::io::Write as IoWrite;

        const MEMBER_COUNT: i32 = 10;

        // Each member is a byte ramp whose starting value depends on its index.
        let plain_members: Vec<Vec<u8>> = (0..MEMBER_COUNT)
            .map(|m| (0..50_000).map(|k| ((m * 17 + k) % 256) as u8).collect())
            .collect();

        // Expected output is simply every member's plaintext in order.
        let expected: Vec<u8> = plain_members.concat();

        // Compress every member on its own and append the results.
        let mut multi: Vec<u8> = Vec::new();
        for plain in &plain_members {
            let mut enc = GzEncoder::new(Vec::new(), Compression::default());
            enc.write_all(plain).unwrap();
            multi.extend_from_slice(&enc.finish().unwrap());
        }

        assert!(is_multi_member(&multi), "Should detect multi-member");

        // Decoder under test (third argument: 8).
        let mut output = Vec::new();
        decompress_multi_member_parallel(&multi, &mut output, 8).unwrap();

        assert_eq!(output.len(), expected.len(), "Size mismatch");
        assert_slices_eq!(output, expected, "Content mismatch");
    }

    /// Benchmarks raw deflate decompression using the PRODUCTION path (libdeflate C FFI).
    /// This is what inflate_into_pub() delivers — the function used by every production
    /// decode call (BGZF blocks, multi-member members, single-member stream).
    ///
    /// For each dataset that is present, the gzip header and trailer are stripped and the
    /// raw deflate payload is decoded `BENCH_RUNS` times (default 10) after 3 warmup runs:
    /// first through `inflate_into_pub`, then through the `libdeflater` crate directly as
    /// a reference. Datasets that are missing, truncated, or that `inflate_into_pub`
    /// cannot decode are skipped with a note, so no throughput is printed for them.
    ///
    /// Run with: cargo test --release bench_production_inflate -- --nocapture
    #[test]
    fn bench_production_inflate() {
        // Splits a single-member gzip file into (raw deflate payload, ISIZE trailer value).
        // Every offset is bounds-checked; `None` means the buffer is too short or the
        // optional header fields run past the start of the 8-byte trailer.
        fn split_gzip_member(gz: &[u8]) -> Option<(&[u8], usize)> {
            const FIXED_HEADER: usize = 10;
            const TRAILER: usize = 8; // CRC32 + ISIZE

            if gz.len() < FIXED_HEADER + TRAILER {
                return None;
            }

            let flg = gz[3];
            let mut pos = FIXED_HEADER;

            // FEXTRA: 2-byte little-endian length followed by that many bytes.
            if flg & 0x04 != 0 {
                let xlen = u16::from_le_bytes([*gz.get(pos)?, *gz.get(pos + 1)?]) as usize;
                pos += 2 + xlen;
            }
            // FNAME (0x08) and FCOMMENT (0x10): NUL-terminated strings.
            for &flag in &[0x08u8, 0x10] {
                if flg & flag != 0 {
                    let nul = gz.get(pos..)?.iter().position(|&b| b == 0)?;
                    pos += nul + 1;
                }
            }
            // FHCRC: 2-byte header checksum.
            if flg & 0x02 != 0 {
                pos += 2;
            }

            let trailer_start = gz.len() - TRAILER;
            let deflate = gz.get(pos..trailer_start)?;
            let tail = &gz[gz.len() - 4..];
            let isize_field = u32::from_le_bytes([tail[0], tail[1], tail[2], tail[3]]);
            Some((deflate, isize_field as usize))
        }

        // Best effort: datasets that could not be prepared are skipped below.
        let _ = crate::tests::datasets::prepare_datasets();

        let datasets = [
            (
                "silesia",
                "benchmark_data/silesia-gzip.tar.gz",
                "mixed content",
            ),
            (
                "software",
                "benchmark_data/software.archive.gz",
                "source code",
            ),
            ("logs", "benchmark_data/logs.txt.gz", "repetitive logs"),
        ];

        const WARMUP: usize = 3;
        let iterations: usize = std::env::var("BENCH_RUNS")
            .ok()
            .and_then(|s| s.parse().ok())
            .unwrap_or(10);

        eprintln!("\n╔══════════════════════════════════════════════════════════════╗");
        eprintln!("║           GZIPPY DECOMPRESSION BENCHMARK                     ║");
        eprintln!("╠══════════════════════════════════════════════════════════════╣");
        eprintln!(
            "║  Warmup: {} iterations, Measured: {} iterations              ║",
            WARMUP, iterations
        );
        eprintln!("╚══════════════════════════════════════════════════════════════╝\n");

        for (name, path, desc) in &datasets {
            let gz = match std::fs::read(path) {
                Ok(d) => d,
                Err(_) => {
                    eprintln!("⚠ Skipping {} - file not found: {}", name, path);
                    continue;
                }
            };

            // Strip the gzip header/trailer to get the raw deflate data. A truncated or
            // malformed file is skipped rather than panicking on an out-of-range index.
            let (deflate, expected_size) = match split_gzip_member(&gz) {
                Some(parts) => parts,
                None => {
                    eprintln!(
                        "⚠ Skipping {} - not a complete gzip member: {}",
                        name, path
                    );
                    continue;
                }
            };

            // Small slack beyond ISIZE, as in the original buffer sizing.
            let mut output = vec![0u8; expected_size + 1024];

            // Validate the production decode once before timing anything, so a failing
            // decode is never reported as a throughput number. This call doubles as the
            // first warmup iteration.
            if let Err(err) = inflate_into_pub(deflate, &mut output) {
                eprintln!("⚠ Skipping {} - inflate_into_pub failed: {:?}", name, err);
                continue;
            }

            eprintln!(
                "┌─ {} ({}) ─────────────────────────────",
                name.to_uppercase(),
                desc
            );
            eprintln!(
                "│  Size: {:.1} MB uncompressed",
                expected_size as f64 / 1_000_000.0
            );

            // === PRODUCTION PATH: inflate_into_pub() → libdeflate C FFI ===
            // This is THE function called for every block in production:
            //   BGZF blocks, multi-member members, single-member inflate.
            // Remaining warmup iterations (the validation call above was the first).
            for _ in 1..WARMUP {
                let _ = inflate_into_pub(deflate, &mut output);
            }
            // Measure
            let start = std::time::Instant::now();
            for _ in 0..iterations {
                let _ = inflate_into_pub(deflate, &mut output);
            }
            let production_speed =
                (expected_size * iterations) as f64 / start.elapsed().as_secs_f64() / 1_000_000.0;

            // === REFERENCE: libdeflater crate direct (no gzippy wrapper overhead) ===
            // Shows how much wrapper overhead inflate_into_pub adds vs direct call.
            // Should be within 1-2% — if not, investigate.
            for _ in 0..WARMUP {
                libdeflater::Decompressor::new()
                    .deflate_decompress(deflate, &mut output)
                    .unwrap();
            }
            let start = std::time::Instant::now();
            for _ in 0..iterations {
                libdeflater::Decompressor::new()
                    .deflate_decompress(deflate, &mut output)
                    .unwrap();
            }
            let direct_libdeflate_speed =
                (expected_size * iterations) as f64 / start.elapsed().as_secs_f64() / 1_000_000.0;

            // Positive when the production path is slower than the direct call.
            let overhead_pct = (1.0 - production_speed / direct_libdeflate_speed) * 100.0;

            eprintln!(
                "│  production (inflate_into_pub):  {:>8.1} MB/s",
                production_speed
            );
            eprintln!(
                "│  direct libdeflater (reference): {:>8.1} MB/s  (wrapper overhead: {:.1}%)",
                direct_libdeflate_speed, overhead_pct
            );
            eprintln!("│  NOTE: These are raw deflate numbers. CLI throughput is lower due to");
            eprintln!("│        gzip header parsing, mmap, routing, CRC32, and write I/O.");
            eprintln!("└────────────────────────────────────────────────\n");
        }
    }

    /// Analyze decompression with detailed statistics
    ///
    /// For each dataset that is present, strips the gzip header/trailer, runs one
    /// `inflate_into_pub` decode, and prints the block-type breakdown, table-cache
    /// counters and specialized-decoder counters reported by `consume_first_decode`.
    /// Missing or truncated datasets are skipped with a note.
    ///
    /// Run with: cargo test --release bench_analyze -- --nocapture
    #[test]
    fn bench_analyze() {
        use crate::decompress::inflate::consume_first_decode::{
            get_block_stats, get_cache_stats, get_spec_cache_stats, get_spec_stats,
            get_table_cache_size, reset_cache_stats,
        };

        // Splits a single-member gzip file into (raw deflate payload, ISIZE trailer value).
        // Every offset is bounds-checked; `None` means the buffer is too short or the
        // optional header fields run past the start of the 8-byte trailer.
        fn split_gzip_member(gz: &[u8]) -> Option<(&[u8], usize)> {
            const FIXED_HEADER: usize = 10;
            const TRAILER: usize = 8; // CRC32 + ISIZE

            if gz.len() < FIXED_HEADER + TRAILER {
                return None;
            }

            let flg = gz[3];
            let mut pos = FIXED_HEADER;

            // FEXTRA: 2-byte little-endian length followed by that many bytes.
            if flg & 0x04 != 0 {
                let xlen = u16::from_le_bytes([*gz.get(pos)?, *gz.get(pos + 1)?]) as usize;
                pos += 2 + xlen;
            }
            // FNAME (0x08) and FCOMMENT (0x10): NUL-terminated strings.
            for &flag in &[0x08u8, 0x10] {
                if flg & flag != 0 {
                    let nul = gz.get(pos..)?.iter().position(|&b| b == 0)?;
                    pos += nul + 1;
                }
            }
            // FHCRC: 2-byte header checksum.
            if flg & 0x02 != 0 {
                pos += 2;
            }

            let trailer_start = gz.len() - TRAILER;
            let deflate = gz.get(pos..trailer_start)?;
            let tail = &gz[gz.len() - 4..];
            let isize_field = u32::from_le_bytes([tail[0], tail[1], tail[2], tail[3]]);
            Some((deflate, isize_field as usize))
        }

        // Best effort: datasets that could not be prepared are skipped below.
        let _ = crate::tests::datasets::prepare_datasets();

        let datasets = [
            (
                "silesia",
                "benchmark_data/silesia-gzip.tar.gz",
                "mixed content",
            ),
            (
                "software",
                "benchmark_data/software.archive.gz",
                "source code",
            ),
            ("logs", "benchmark_data/logs.txt.gz", "repetitive logs"),
        ];

        eprintln!("\n╔══════════════════════════════════════════════════════════════╗");
        eprintln!("║           GZIPPY DECOMPRESSION ANALYSIS                      ║");
        eprintln!("╠══════════════════════════════════════════════════════════════╣");
        eprintln!("║  Block types, cache stats, path usage                        ║");
        eprintln!("╚══════════════════════════════════════════════════════════════╝\n");

        for (name, path, desc) in &datasets {
            let gz = match std::fs::read(path) {
                Ok(d) => d,
                Err(_) => {
                    eprintln!("  Skipping {} - file not found: {}", name, path);
                    continue;
                }
            };

            // Strip the gzip header/trailer to get the raw deflate data. A truncated or
            // malformed file is skipped rather than panicking on an out-of-range index.
            let (deflate, expected_size) = match split_gzip_member(&gz) {
                Some(parts) => parts,
                None => {
                    eprintln!(
                        "  Skipping {} - not a complete gzip member: {}",
                        name, path
                    );
                    continue;
                }
            };

            let mut output = vec![0u8; expected_size + 1024];

            // Reset stats and decompress
            reset_cache_stats();
            let start = std::time::Instant::now();
            let _ = inflate_into_pub(deflate, &mut output);
            let elapsed = start.elapsed();

            // Gather stats
            let block_stats = get_block_stats();
            let (cache_hits, cache_misses, cache_rate) = get_cache_stats();
            let (spec_used, spec_fallback) = get_spec_stats();

            let speed = expected_size as f64 / elapsed.as_secs_f64() / 1_000_000.0;

            eprintln!(
                "┌─ {} ({}) ─────────────────────────────",
                name.to_uppercase(),
                desc
            );
            eprintln!("│");
            eprintln!(
                "│  Size: {:.2} MB compressed → {:.2} MB uncompressed",
                deflate.len() as f64 / 1_000_000.0,
                expected_size as f64 / 1_000_000.0
            );
            eprintln!("│  Speed: {:.1} MB/s", speed);
            eprintln!("│");

            // Block type breakdown. Each row is printed only when that block type
            // occurred, which also keeps the per-type division by total_blocks safe.
            let total_blocks = block_stats.total_blocks();
            eprintln!("│  BLOCK TYPES ({} total):", total_blocks);
            if block_stats.stored_blocks > 0 {
                let pct = block_stats.stored_blocks as f64 / total_blocks as f64 * 100.0;
                let bytes_pct = block_stats.stored_bytes as f64 / expected_size as f64 * 100.0;
                eprintln!(
                    "│    Stored:   {:>5} blocks ({:>5.1}%) → {:>10} bytes ({:>5.1}%)",
                    block_stats.stored_blocks, pct, block_stats.stored_bytes, bytes_pct
                );
            }
            if block_stats.fixed_blocks > 0 {
                let pct = block_stats.fixed_blocks as f64 / total_blocks as f64 * 100.0;
                let bytes_pct = block_stats.fixed_bytes as f64 / expected_size as f64 * 100.0;
                eprintln!(
                    "│    Fixed:    {:>5} blocks ({:>5.1}%) → {:>10} bytes ({:>5.1}%)",
                    block_stats.fixed_blocks, pct, block_stats.fixed_bytes, bytes_pct
                );
            }
            if block_stats.dynamic_blocks > 0 {
                let pct = block_stats.dynamic_blocks as f64 / total_blocks as f64 * 100.0;
                let bytes_pct = block_stats.dynamic_bytes as f64 / expected_size as f64 * 100.0;
                eprintln!(
                    "│    Dynamic:  {:>5} blocks ({:>5.1}%) → {:>10} bytes ({:>5.1}%)",
                    block_stats.dynamic_blocks, pct, block_stats.dynamic_bytes, bytes_pct
                );
            }
            eprintln!("│");

            // Cache stats (section omitted when no lookups were recorded)
            let total_cache = cache_hits + cache_misses;
            let cache_size = get_table_cache_size();
            if total_cache > 0 {
                eprintln!("│  TABLE CACHE:");
                eprintln!(
                    "│    Hits:     {:>5} ({:.1}%)",
                    cache_hits,
                    cache_rate * 100.0
                );
                eprintln!("│    Misses:   {:>5}", cache_misses);
                eprintln!("│    Unique:   {:>5} fingerprints", cache_size);
                eprintln!("│");
            }

            // Specialized decoder stats (section omitted when neither path was counted)
            let total_spec = spec_used + spec_fallback;
            let (spec_decoders, spec_failed, spec_total_uses, spec_max_uses) =
                get_spec_cache_stats();
            if total_spec > 0 {
                let spec_rate = spec_used as f64 / total_spec as f64 * 100.0;
                eprintln!("│  DECODE PATH:");
                eprintln!("│    Specialized: {:>5} ({:.1}%)", spec_used, spec_rate);
                eprintln!("│    Generic:     {:>5}", spec_fallback);
                if spec_decoders > 0 || spec_failed > 0 {
                    eprintln!("│  SPEC CACHE:");
                    eprintln!("│    Decoders:  {:>5} unique tables", spec_decoders);
                    eprintln!("│    Failed:    {:>5} (tables too complex)", spec_failed);
                    if spec_decoders > 0 {
                        let avg_uses = spec_total_uses as f64 / spec_decoders as f64;
                        eprintln!(
                            "│    Reuse:     {:>5.1}x avg, {}x max",
                            avg_uses, spec_max_uses
                        );
                    }
                }
                eprintln!("│");
            }

            // Archive characteristics summary: the block type that produced the most
            // output bytes (ties fall through towards "stored").
            let dominant_type = if block_stats.dynamic_bytes > block_stats.fixed_bytes
                && block_stats.dynamic_bytes > block_stats.stored_bytes
            {
                "dynamic"
            } else if block_stats.fixed_bytes > block_stats.stored_bytes {
                "fixed"
            } else {
                "stored"
            };
            eprintln!("│  CHARACTERISTICS:");
            eprintln!("│    Dominant block type: {}", dominant_type);
            // compressed / uncompressed; printed both inverted ("Nx") and as a percentage.
            let compression_ratio = deflate.len() as f64 / expected_size as f64;
            eprintln!(
                "│    Compression ratio: {:.2}x ({:.1}% of original)",
                1.0 / compression_ratio,
                compression_ratio * 100.0
            );

            eprintln!("└────────────────────────────────────────────────\n");
        }

        // Summary recommendations
        eprintln!("╔══════════════════════════════════════════════════════════════╗");
        eprintln!("║  OPTIMIZATION NOTES                                          ║");
        eprintln!("╠══════════════════════════════════════════════════════════════╣");
        eprintln!("║  - Dynamic blocks: libdeflate-style decode (fastest)         ║");
        eprintln!("║  - Fixed blocks: need optimization (currently slower)        ║");
        eprintln!("║  - High cache hit rate: table reuse working                  ║");
        eprintln!("║  - Low cache hit rate: consider fingerprint tuning           ║");
        eprintln!("╚══════════════════════════════════════════════════════════════╝\n");
    }

    /// Profile time spent in table building vs decoding
    ///
    /// For each dataset that is present, strips the gzip header/trailer, runs one
    /// `inflate_into_pub` decode, and prints how the wall-clock time splits between the
    /// table-build and decode timers reported by `get_timing_stats()`, with the
    /// remainder shown as "Other/overhead". Missing or truncated datasets are skipped
    /// with a note.
    ///
    /// Run with: cargo test --release bench_profile -- --nocapture
    #[test]
    fn bench_profile() {
        use crate::decompress::inflate::consume_first_decode::{
            get_timing_stats, reset_cache_stats,
        };

        // Splits a single-member gzip file into (raw deflate payload, ISIZE trailer value).
        // Every offset is bounds-checked; `None` means the buffer is too short or the
        // optional header fields run past the start of the 8-byte trailer.
        fn split_gzip_member(gz: &[u8]) -> Option<(&[u8], usize)> {
            const FIXED_HEADER: usize = 10;
            const TRAILER: usize = 8; // CRC32 + ISIZE

            if gz.len() < FIXED_HEADER + TRAILER {
                return None;
            }

            let flg = gz[3];
            let mut pos = FIXED_HEADER;

            // FEXTRA: 2-byte little-endian length followed by that many bytes.
            if flg & 0x04 != 0 {
                let xlen = u16::from_le_bytes([*gz.get(pos)?, *gz.get(pos + 1)?]) as usize;
                pos += 2 + xlen;
            }
            // FNAME (0x08) and FCOMMENT (0x10): NUL-terminated strings.
            for &flag in &[0x08u8, 0x10] {
                if flg & flag != 0 {
                    let nul = gz.get(pos..)?.iter().position(|&b| b == 0)?;
                    pos += nul + 1;
                }
            }
            // FHCRC: 2-byte header checksum.
            if flg & 0x02 != 0 {
                pos += 2;
            }

            let trailer_start = gz.len() - TRAILER;
            let deflate = gz.get(pos..trailer_start)?;
            let tail = &gz[gz.len() - 4..];
            let isize_field = u32::from_le_bytes([tail[0], tail[1], tail[2], tail[3]]);
            Some((deflate, isize_field as usize))
        }

        // Best effort: datasets that could not be prepared are skipped below.
        let _ = crate::tests::datasets::prepare_datasets();

        let datasets = [
            (
                "silesia",
                "benchmark_data/silesia-gzip.tar.gz",
                "mixed content",
            ),
            (
                "software",
                "benchmark_data/software.archive.gz",
                "source code",
            ),
            ("logs", "benchmark_data/logs.txt.gz", "repetitive logs"),
        ];

        eprintln!("\n╔══════════════════════════════════════════════════════════════╗");
        eprintln!("║           GZIPPY TIMING PROFILE                              ║");
        eprintln!("╠══════════════════════════════════════════════════════════════╣");
        eprintln!("║  Breakdown of table building vs decode time                  ║");
        eprintln!("╚══════════════════════════════════════════════════════════════╝\n");

        for (name, path, desc) in &datasets {
            let gz = match std::fs::read(path) {
                Ok(d) => d,
                Err(_) => {
                    eprintln!("⚠ Skipping {} - file not found: {}", name, path);
                    continue;
                }
            };

            // Strip the gzip header/trailer. A truncated or malformed file is skipped
            // rather than panicking on an out-of-range index.
            let (deflate, expected_size) = match split_gzip_member(&gz) {
                Some(parts) => parts,
                None => {
                    eprintln!(
                        "⚠ Skipping {} - not a complete gzip member: {}",
                        name, path
                    );
                    continue;
                }
            };

            let mut output = vec![0u8; expected_size + 1024];

            // Reset stats and run decompression
            reset_cache_stats();

            let start = std::time::Instant::now();
            let _ = inflate_into_pub(deflate, &mut output);
            let total_time = start.elapsed();

            let timing = get_timing_stats();

            // Percentages are relative to the wall-clock time measured above; whatever
            // the two timers do not account for is reported as "other".
            let total_nanos = total_time.as_nanos() as f64;
            let table_pct = timing.table_build_nanos as f64 / total_nanos * 100.0;
            let decode_pct = timing.decode_nanos as f64 / total_nanos * 100.0;
            let other_pct = 100.0 - table_pct - decode_pct;

            // Per-event averages in microseconds; zero when no events were counted.
            let avg_table_us = if timing.table_build_count > 0 {
                timing.table_build_nanos as f64 / timing.table_build_count as f64 / 1000.0
            } else {
                0.0
            };
            let avg_decode_us = if timing.decode_count > 0 {
                timing.decode_nanos as f64 / timing.decode_count as f64 / 1000.0
            } else {
                0.0
            };

            eprintln!(
                "┌─ {} ({}) ─────────────────────────────",
                name.to_uppercase(),
                desc
            );
            eprintln!(
                "│  Size: {:.1} MB, Total time: {:.1}ms",
                expected_size as f64 / 1_000_000.0,
                total_time.as_secs_f64() * 1000.0
            );
            eprintln!("│");
            eprintln!("│  TIME BREAKDOWN:");
            eprintln!(
                "│    Table building: {:>6.1}ms ({:>5.1}%) - {} tables, {:.1}µs avg",
                timing.table_build_nanos as f64 / 1_000_000.0,
                table_pct,
                timing.table_build_count,
                avg_table_us
            );
            eprintln!(
                "│    Decoding:       {:>6.1}ms ({:>5.1}%) - {} blocks, {:.1}µs avg",
                timing.decode_nanos as f64 / 1_000_000.0,
                decode_pct,
                timing.decode_count,
                avg_decode_us
            );
            eprintln!(
                "│    Other/overhead: {:>6.1}ms ({:>5.1}%)",
                (total_nanos - timing.table_build_nanos as f64 - timing.decode_nanos as f64)
                    / 1_000_000.0,
                other_pct
            );
            eprintln!(
                "│  Speed: {:.1} MB/s",
                expected_size as f64 / total_time.as_secs_f64() / 1_000_000.0
            );
            eprintln!("└────────────────────────────────────────────────\n");
        }
    }
}

// =============================================================================
// Optimization Tests - TDD for closing the libdeflate gap
// =============================================================================

#[cfg(test)]
mod optimization_tests {
    use super::*;

    /// Asserts that two byte slices are equal, panicking with a compact report otherwise.
    ///
    /// The report contains both lengths and, when a differing byte exists within the
    /// common prefix, its index, both byte values, and a window of up to 10 bytes
    /// before and 20 bytes from the mismatch rendered as text (non-printable bytes are
    /// shown as `.`). When only the lengths differ, that is stated instead. `context`
    /// is prepended to the report to identify the call site.
    fn assert_bytes_eq(actual: &[u8], expected: &[u8], context: &str) {
        if actual == expected {
            return;
        }

        // Render a window of bytes as text, masking anything that is neither an ASCII
        // graphic character nor a space.
        let printable = |window: &[u8]| -> String {
            window
                .iter()
                .map(|&byte| match byte {
                    b' ' => ' ',
                    b if b.is_ascii_graphic() => b as char,
                    _ => '.',
                })
                .collect()
        };

        // Index of the first differing byte within the common prefix, if any.
        let mismatch_at = actual
            .iter()
            .zip(expected)
            .position(|(lhs, rhs)| lhs != rhs);

        let mut report = format!("\n{} - byte mismatch:\n", context);
        report += &format!(
            "  lengths: actual={}, expected={}\n",
            actual.len(),
            expected.len()
        );

        match mismatch_at {
            Some(at) => {
                report += &format!("  first diff at byte {}\n", at);
                report += &format!(
                    "  actual[{}]={:#04x}, expected[{}]={:#04x}\n",
                    at, actual[at], at, expected[at]
                );

                // Window around the mismatch, clamped to the shorter slice.
                let lo = at.saturating_sub(10);
                let hi = (at + 20).min(actual.len().min(expected.len()));
                if hi > lo {
                    report += &format!(
                        "  actual  [{}..{}]: \"{}\"\n",
                        lo,
                        hi,
                        printable(&actual[lo..hi])
                    );
                    report += &format!(
                        "  expected[{}..{}]: \"{}\"\n",
                        lo,
                        hi,
                        printable(&expected[lo..hi])
                    );
                }
            }
            // No differing byte in the common prefix: one slice is a prefix of the other.
            None if actual.len() != expected.len() => {
                report.push_str("  content matches up to shorter length\n");
            }
            None => {}
        }

        panic!("{}", report);
    }

    // =========================================================================
    // Phase 1: saved_bitbuf pattern tests
    // =========================================================================

    /// Checks the saved-bit-buffer trick: extra bits are taken from a snapshot of the
    /// buffer captured before `consume`, so no second read is needed.
    #[test]
    fn test_saved_bitbuf_extra_bits() {
        // Known bit patterns; the assertion below only depends on the second byte.
        let data: [u8; 16] = [
            0b11010101, 0b10101010, 0b11001100, 0b00110011, 0xFF, 0x00, 0xAA, 0x55, 0x12, 0x34,
            0x56, 0x78, 0x9A, 0xBC, 0xDE, 0xF0,
        ];

        let mut reader = TurboBits::new(&data);
        reader.ensure(32);

        // Snapshot taken before any bits are consumed.
        let snapshot = reader.buffer();

        // Consume an 8-bit entry (like libdeflate's bitsleft -= entry).
        let consumed = 8u32;
        reader.consume(consumed);

        // The 5 extra bits are read from the snapshot at bit positions 8-12,
        // i.e. directly above the consumed entry.
        let five_bit_mask = (1u64 << 5) - 1;
        let extra = ((snapshot >> consumed) & five_bit_mask) as u32;

        // The test expects them to equal the low five bits of the second input byte.
        let expected = 0b10101010u64 & 0b11111;
        assert_eq!(
            extra, expected as u32,
            "saved_bitbuf extra extraction failed: got {:05b}, expected {:05b}",
            extra, expected
        );
    }

    /// Applies the saved-bit-buffer pattern to a length symbol: a 7-bit codeword
    /// followed by one extra bit, both consumed in a single step.
    #[test]
    fn test_saved_bitbuf_length_decode() {
        const CODEWORD_BITS: u32 = 7;
        const EXTRA_BITS: u32 = 1;
        const LENGTH_BASE: u32 = 7;

        let data: [u8; 8] = [0b10101011, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF];
        let mut reader = TurboBits::new(&data);
        reader.ensure(16);

        // Snapshot first, then consume codeword and extra bits together.
        let snapshot = reader.buffer();
        reader.consume(CODEWORD_BITS + EXTRA_BITS);

        // The extra bit sits immediately above the codeword in the snapshot.
        let extra_value = (snapshot >> CODEWORD_BITS) & ((1 << EXTRA_BITS) - 1);
        let length = LENGTH_BASE + extra_value as u32;

        // First byte: low 7 bits are 0b0101011 (the codeword), bit 7 is 1 (the extra
        // bit), so the decoded length is base 7 + 1.
        assert_eq!(length, 8, "Length decode with saved_bitbuf failed");
    }

    // =========================================================================
    // Phase 2: 5-word unconditional copy tests
    // =========================================================================

    /// Short-match case for `copy_match_into`: a 5-byte match must land at the
    /// destination and the returned position must advance by exactly 5.
    #[test]
    fn test_unconditional_copy_short_match() {
        let mut buf = vec![0u8; 1024];

        // Source pattern 1..=8 at the very start of the buffer.
        for (slot, value) in buf.iter_mut().zip(1u8..=8) {
            *slot = value;
        }

        // Write position 100 with distance 100, so the match is sourced from offset 0.
        let end = copy_match_into(&mut buf, 100, 100, 5);
        assert_eq!(end, 105);

        for (i, (copied, source)) in buf[100..105].iter().zip(&buf[..5]).enumerate() {
            assert_eq!(copied, source, "Mismatch at byte {}", i);
        }
    }

    /// Non-overlapping case for `copy_match_into`: with a distance (190) far larger
    /// than the length (35), the destination must be an exact copy of the source.
    #[test]
    fn test_unconditional_copy_non_overlapping() {
        let mut buf = vec![0u8; 1024];

        // Recognisable pattern in the first 100 bytes.
        for (i, slot) in buf.iter_mut().take(100).enumerate() {
            *slot = (i as u8).wrapping_mul(7);
        }

        let (src, dst, len) = (10usize, 200usize, 35usize);

        let end = copy_match_into(&mut buf, dst, dst - src, len);
        assert_eq!(end, dst + len);

        for offset in 0..len {
            assert_eq!(
                buf[dst + offset],
                buf[src + offset],
                "Copy mismatch at offset {}",
                offset
            );
        }
    }

    /// Run-length case for `copy_match_into`: distance 1 must replicate the single
    /// preceding byte across the whole 100-byte match.
    #[test]
    fn test_rle_optimization() {
        let mut buf = vec![0u8; 1024];
        buf[50] = 0xAA;

        // The match starts right after the seed byte and refers back by one.
        let end = copy_match_into(&mut buf, 51, 1, 100);
        assert_eq!(end, 151);

        for (pos, &byte) in buf.iter().enumerate().skip(51).take(100) {
            assert_eq!(byte, 0xAA, "RLE mismatch at position {}", pos);
        }
    }

    // =========================================================================
    // Phase 3: JIT table caching tests
    // =========================================================================

    /// Sanity check for the fingerprinting idea: hashing two equal code-length
    /// sequences with `DefaultHasher` gives equal values.
    ///
    /// The hash is computed by a local helper defined inside this test.
    #[test]
    fn test_table_fingerprint_identical() {
        use std::collections::hash_map::DefaultHasher;
        use std::hash::{Hash, Hasher};

        // Hash a code-length sequence as a slice.
        let fingerprint = |code_lens: &[u8]| -> u64 {
            let mut state = DefaultHasher::new();
            code_lens.hash(&mut state);
            state.finish()
        };

        // Two independently written but identical sequences.
        let first = [8u8, 8, 8, 8, 7, 7, 7, 7, 6, 6];
        let second = [8u8, 8, 8, 8, 7, 7, 7, 7, 6, 6];

        assert_eq!(
            fingerprint(&first[..]),
            fingerprint(&second[..]),
            "Identical code lengths should have same fingerprint"
        );
    }

    /// Code-length vectors that differ in a single position must hash to
    /// different fingerprints.
    #[test]
    fn test_table_fingerprint_different() {
        use std::collections::hash_map::DefaultHasher;
        use std::hash::{Hash, Hasher};

        // Fingerprint = std `DefaultHasher` digest of the code-length slice.
        let digest = |code_lens: &[u8]| -> u64 {
            let mut state = DefaultHasher::new();
            code_lens.hash(&mut state);
            state.finish()
        };

        // The two inputs differ only at index 3 (8 vs 7).
        let first: Vec<u8> = vec![8, 8, 8, 8, 7, 7, 7, 7];
        let second: Vec<u8> = vec![8, 8, 8, 7, 7, 7, 7, 7];

        assert_ne!(
            digest(first.as_slice()),
            digest(second.as_slice()),
            "Different code lengths should have different fingerprints"
        );
    }

    // =========================================================================
    // Phase 4: BMI2 intrinsics tests
    // =========================================================================

    /// Round-trip check that runs only when the CPU reports BMI2.
    ///
    /// Gzip-compresses a 10 000-byte arithmetic pattern with flate2, strips the
    /// gzip header and the trailing 8 bytes, inflates the raw deflate payload
    /// with `inflate_into_pub` and compares the result with the input.
    ///
    /// NOTE(review): the decoded bytes are compared with the original input
    /// only; no separate generic-path decode is performed here.
    #[test]
    #[cfg(target_arch = "x86_64")]
    fn test_bmi2_equivalence() {
        use crate::assert_slices_eq;
        use flate2::write::GzEncoder;
        use flate2::Compression;
        use std::io::Write;

        if !is_x86_feature_detected!("bmi2") {
            eprintln!("BMI2 not available, skipping test");
            return;
        }

        let original: Vec<u8> = (0..10000u32).map(|n| (n * 7 % 256) as u8).collect();

        let compressed = {
            let mut gz = GzEncoder::new(Vec::new(), Compression::default());
            gz.write_all(&original).unwrap();
            gz.finish().unwrap()
        };

        // The raw deflate payload sits between the gzip header and the last
        // 8 bytes of the stream.
        let payload_start =
            crate::decompress::parallel::marker_decode::skip_gzip_header(&compressed).unwrap();
        let payload_end = compressed.len() - 8;
        let deflate_data = &compressed[payload_start..payload_end];

        // Extra 1024 bytes of slack beyond the expected output size.
        let mut output = vec![0u8; original.len() + 1024];
        let decoded_len = inflate_into_pub(deflate_data, &mut output).unwrap();
        output.truncate(decoded_len);

        assert_slices_eq!(output, original, "BMI2 path produced different output");
    }

    /// For every shift amount in `0..64`, a plain `>>` and `wrapping_shr`
    /// (standing in for BMI2 `shrx`) must agree on a fixed 64-bit word.
    #[test]
    fn test_variable_shift() {
        const WORD: u64 = 0xDEADBEEFCAFEBABE;

        (0..64u32).for_each(|amount| {
            let plain = WORD >> amount;
            let wrapped = WORD.wrapping_shr(amount);

            assert_eq!(plain, wrapped, "Shift mismatch for shift={}", amount);
        });
    }

    // =========================================================================
    // Phase 5: 11-bit table tests
    // =========================================================================

    /// Builds a `TwoLevelTable` from a length set containing 12-bit codes and
    /// checks that decoding an all-ones bit pattern yields a non-zero code
    /// length and a symbol below 288.
    ///
    /// NOTE(review): the 256 symbols of length 8 already fill the whole code
    /// space (256 * 2^-8 = 1), so adding the 7-, 8- and 12-bit codes makes this
    /// length set over-subscribed. Confirm that `TwoLevelTable::build` is meant
    /// to accept such input, or switch the fixture to a valid length set.
    #[test]
    fn test_11bit_table_with_subtable() {
        // Symbols 0..=255 and 257..=279 get 8 bits, 256 gets 7, 280..=287 get 12.
        let lens: Vec<u8> = (0..288usize)
            .map(|sym| match sym {
                256 => 7,
                280..=287 => 12,
                _ => 8,
            })
            .collect();

        let table = TwoLevelTable::build(&lens).unwrap();

        // Twelve set bits: one more than the 11-bit width named in this section.
        let all_ones: u64 = 0b111111111111;
        let (symbol, code_len) = table.decode(all_ones);

        assert!(code_len > 0, "Should decode valid code");
        assert!(symbol < 288, "Symbol should be in range");
    }

    // =========================================================================
    // Phase 6: Preload slack tests
    // =========================================================================

    /// Checks the bit-count bookkeeping of `TurboBits`: at least 56 bits are
    /// available after construction, at least 16 remain after consuming 40,
    /// and `ensure(20)` leaves at least 20 available.
    #[test]
    fn test_minimal_refill() {
        // 1000 bytes cycling through 0, 1, ..., 255.
        let input: Vec<u8> = (0u8..=255).cycle().take(1000).collect();
        let mut reader = TurboBits::new(&input);

        assert!(reader.has_bits(56), "Initial refill should give 56+ bits");

        reader.consume(40);
        assert!(reader.has_bits(16), "Should have 16+ bits after consuming 40");

        reader.ensure(20);
        assert!(reader.has_bits(20), "Ensure should work");
    }

    /// Peeking at the bits that follow the next 12-bit code *before* consuming
    /// it must show the same 12 bits the buffer exposes *after* consuming it.
    #[test]
    fn test_preload_before_consume() {
        // 1000 bytes cycling through 0, 1, ..., 255.
        let input: Vec<u8> = (0u8..=255).cycle().take(1000).collect();
        let mut reader = TurboBits::new(&input);
        reader.ensure(56);

        // Look past the first 12 bits without consuming anything.
        let lookahead = reader.buffer() >> 12;

        reader.consume(12);

        let after_consume = reader.buffer() & 0xFFF;
        assert_eq!(
            lookahead & 0xFFF,
            after_consume,
            "Preload should match actual next bits"
        );
    }

    // =========================================================================
    // Integration test: Full decode with optimizations
    // =========================================================================

    /// End-to-end round trip through `inflate_into_pub` on mixed content.
    ///
    /// The 100 000-byte input repeats a 100-byte layout: an arithmetic
    /// progression, a run of `0xAA`, a slowly changing byte and a run of zeros.
    /// It is gzip-compressed at the best level, the gzip header and trailing
    /// 8 bytes are stripped, and the raw deflate payload is inflated and
    /// compared with the input. On failure the first ten differing positions
    /// are reported.
    #[test]
    fn test_optimized_decode_correctness() {
        use flate2::write::GzEncoder;
        use flate2::Compression;
        use std::io::Write;

        let original: Vec<u8> = (0..100_000u32)
            .map(|n| match n % 100 {
                0..=30 => (n * 17 % 256) as u8,
                31..=60 => 0xAA,
                61..=80 => (n / 100 % 256) as u8,
                _ => 0x00,
            })
            .collect();

        let compressed = {
            let mut gz = GzEncoder::new(Vec::new(), Compression::best());
            gz.write_all(&original).unwrap();
            gz.finish().unwrap()
        };

        let payload_start =
            crate::decompress::parallel::marker_decode::skip_gzip_header(&compressed).unwrap();
        let deflate_data = &compressed[payload_start..compressed.len() - 8];

        let mut output = vec![0u8; original.len() + 1024];
        let decoded_len = inflate_into_pub(deflate_data, &mut output).unwrap();
        output.truncate(decoded_len);

        assert_eq!(output.len(), original.len(), "Size mismatch");

        // Lengths are equal at this point, so plain indexing is in bounds.
        let mismatches: Vec<usize> = (0..original.len())
            .filter(|&pos| original[pos] != output[pos])
            .take(10)
            .collect();

        assert!(
            mismatches.is_empty(),
            "Content mismatch at positions: {:?}",
            mismatches
        );
    }

    // =========================================================================
    // Micro-benchmarks for each decode path with instrumentation
    // =========================================================================

    /// Benchmark literal-heavy decoding (as few matches as possible).
    ///
    /// The input is a pseudo-random stream over a 64-symbol alphabet. The
    /// previous generator, `(i * 17 + 31) % 256`, repeats every 256 bytes, so
    /// the compressor could encode nearly everything as distance-256 matches
    /// and the benchmark did not measure literal decoding at all. Restricting
    /// the random bytes to 6 bits keeps the stream Huffman-compressible while
    /// offering the matcher little to find; the printed compression ratio
    /// (expected to be roughly 1.3x) can be used to confirm that.
    ///
    /// The warm-up decodes are checked against the input so that a failing or
    /// incorrect decode is not silently timed.
    #[test]
    fn bench_pure_literals() {
        use flate2::write::GzEncoder;
        use flate2::Compression;
        use std::io::Write;

        // Deterministic xorshift32 generator; the top 6 bits of each state
        // give a byte in 0..64.
        let mut rng_state: u32 = 0x9E37_79B9;
        let original: Vec<u8> = (0..50_000)
            .map(|_| {
                rng_state ^= rng_state << 13;
                rng_state ^= rng_state >> 17;
                rng_state ^= rng_state << 5;
                (rng_state >> 26) as u8
            })
            .collect();

        let mut encoder = GzEncoder::new(Vec::new(), Compression::fast());
        encoder.write_all(&original).unwrap();
        let compressed = encoder.finish().unwrap();

        // Raw deflate payload: skip the gzip header, drop the last 8 bytes.
        let header_size =
            crate::decompress::parallel::marker_decode::skip_gzip_header(&compressed).unwrap();
        let deflate_data = &compressed[header_size..compressed.len() - 8];

        // Warm up, and make sure the decoder actually succeeds on this input.
        let mut output = vec![0u8; original.len() + 1024];
        let mut decoded_len = 0;
        for _ in 0..3 {
            decoded_len = inflate_into_pub(deflate_data, &mut output)
                .expect("inflate failed on literal-heavy benchmark input");
        }
        assert_eq!(decoded_len, original.len(), "benchmark decode size mismatch");
        assert!(
            output[..decoded_len] == original[..],
            "benchmark decode produced wrong bytes"
        );

        // Measure (results were verified above; ignore them in the timed loop).
        let iterations = 50;
        let start = std::time::Instant::now();
        for _ in 0..iterations {
            let _ = inflate_into_pub(deflate_data, &mut output);
        }
        let elapsed = start.elapsed();
        let bytes_per_iter = original.len();
        let total_bytes = bytes_per_iter * iterations;
        let mb_per_sec = total_bytes as f64 / elapsed.as_secs_f64() / 1_000_000.0;

        eprintln!(
            "\n[BENCH] Pure Literals: {} iterations, {} bytes each",
            iterations, bytes_per_iter
        );
        eprintln!(
            "[BENCH]   Time: {:.2}ms total, {:.1} MB/s",
            elapsed.as_secs_f64() * 1000.0,
            mb_per_sec
        );
        eprintln!(
            "[BENCH]   Compression ratio: {:.2}x",
            original.len() as f64 / deflate_data.len() as f64
        );
    }

    /// Benchmark decoding of long same-byte runs (distance-1 matches).
    ///
    /// The input is 100 runs of 500 identical bytes each (byte values 0..100).
    /// The warm-up decodes are checked against the input so that a failing or
    /// incorrect decode is not silently timed; previously every decode result
    /// was discarded.
    #[test]
    fn bench_rle_matches() {
        use flate2::write::GzEncoder;
        use flate2::Compression;
        use std::io::Write;

        // Long runs of the same byte = RLE (distance=1)
        let original: Vec<u8> = (0..100u8)
            .flat_map(|byte| std::iter::repeat(byte).take(500))
            .collect();

        let mut encoder = GzEncoder::new(Vec::new(), Compression::best());
        encoder.write_all(&original).unwrap();
        let compressed = encoder.finish().unwrap();

        // Raw deflate payload: skip the gzip header, drop the last 8 bytes.
        let header_size =
            crate::decompress::parallel::marker_decode::skip_gzip_header(&compressed).unwrap();
        let deflate_data = &compressed[header_size..compressed.len() - 8];

        // Warm up, and make sure the decoder actually succeeds on this input.
        let mut output = vec![0u8; original.len() + 1024];
        let mut decoded_len = 0;
        for _ in 0..3 {
            decoded_len = inflate_into_pub(deflate_data, &mut output)
                .expect("inflate failed on RLE benchmark input");
        }
        assert_eq!(decoded_len, original.len(), "benchmark decode size mismatch");
        assert!(
            output[..decoded_len] == original[..],
            "benchmark decode produced wrong bytes"
        );

        // Measure (results were verified above; ignore them in the timed loop).
        let iterations = 100;
        let start = std::time::Instant::now();
        for _ in 0..iterations {
            let _ = inflate_into_pub(deflate_data, &mut output);
        }
        let elapsed = start.elapsed();
        let mb_per_sec = (original.len() * iterations) as f64 / elapsed.as_secs_f64() / 1_000_000.0;

        eprintln!("\n[BENCH] RLE Matches (d=1): {} iterations", iterations);
        eprintln!(
            "[BENCH]   Time: {:.2}ms total, {:.1} MB/s",
            elapsed.as_secs_f64() * 1000.0,
            mb_per_sec
        );
        eprintln!(
            "[BENCH]   Compression ratio: {:.2}x",
            original.len() as f64 / deflate_data.len() as f64
        );
    }

    /// Benchmark decoding of short-distance matches.
    ///
    /// The input is a 4-byte pattern repeated to 50 000 bytes, so the closest
    /// earlier occurrence of any position is 4 bytes back. The warm-up decodes
    /// are checked against the input so that a failing or incorrect decode is
    /// not silently timed; previously every decode result was discarded.
    #[test]
    fn bench_short_distance_matches() {
        use flate2::write::GzEncoder;
        use flate2::Compression;
        use std::io::Write;

        // Repeating 4-byte pattern = distance 4 matches
        let pattern = [0xDE, 0xAD, 0xBE, 0xEF];
        let original: Vec<u8> = pattern.iter().cycle().take(50_000).copied().collect();

        let mut encoder = GzEncoder::new(Vec::new(), Compression::best());
        encoder.write_all(&original).unwrap();
        let compressed = encoder.finish().unwrap();

        // Raw deflate payload: skip the gzip header, drop the last 8 bytes.
        let header_size =
            crate::decompress::parallel::marker_decode::skip_gzip_header(&compressed).unwrap();
        let deflate_data = &compressed[header_size..compressed.len() - 8];

        // Warm up, and make sure the decoder actually succeeds on this input.
        let mut output = vec![0u8; original.len() + 1024];
        let mut decoded_len = 0;
        for _ in 0..3 {
            decoded_len = inflate_into_pub(deflate_data, &mut output)
                .expect("inflate failed on short-distance benchmark input");
        }
        assert_eq!(decoded_len, original.len(), "benchmark decode size mismatch");
        assert!(
            output[..decoded_len] == original[..],
            "benchmark decode produced wrong bytes"
        );

        // Measure (results were verified above; ignore them in the timed loop).
        let iterations = 100;
        let start = std::time::Instant::now();
        for _ in 0..iterations {
            let _ = inflate_into_pub(deflate_data, &mut output);
        }
        let elapsed = start.elapsed();
        let mb_per_sec = (original.len() * iterations) as f64 / elapsed.as_secs_f64() / 1_000_000.0;

        eprintln!(
            "\n[BENCH] Short Distance Matches (d=2-7): {} iterations",
            iterations
        );
        eprintln!(
            "[BENCH]   Time: {:.2}ms total, {:.1} MB/s",
            elapsed.as_secs_f64() * 1000.0,
            mb_per_sec
        );
        eprintln!(
            "[BENCH]   Compression ratio: {:.2}x",
            original.len() as f64 / deflate_data.len() as f64
        );
    }

    /// Benchmark decoding of long-distance matches.
    ///
    /// The input repeats a 500-byte block 100 times. Within the block no byte
    /// value recurs at short range, so back-references have to reach hundreds
    /// of bytes back. The warm-up decodes are checked against the input so
    /// that a failing or incorrect decode is not silently timed; previously
    /// every decode result was discarded.
    #[test]
    fn bench_long_distance_matches() {
        use flate2::write::GzEncoder;
        use flate2::Compression;
        use std::io::Write;

        // Create content that will have long-distance matches
        let base: Vec<u8> = (0..500).map(|i| (i * 7 % 256) as u8).collect();
        let original: Vec<u8> = base.repeat(100);

        let mut encoder = GzEncoder::new(Vec::new(), Compression::best());
        encoder.write_all(&original).unwrap();
        let compressed = encoder.finish().unwrap();

        // Raw deflate payload: skip the gzip header, drop the last 8 bytes.
        let header_size =
            crate::decompress::parallel::marker_decode::skip_gzip_header(&compressed).unwrap();
        let deflate_data = &compressed[header_size..compressed.len() - 8];

        // Warm up, and make sure the decoder actually succeeds on this input.
        let mut output = vec![0u8; original.len() + 1024];
        let mut decoded_len = 0;
        for _ in 0..3 {
            decoded_len = inflate_into_pub(deflate_data, &mut output)
                .expect("inflate failed on long-distance benchmark input");
        }
        assert_eq!(decoded_len, original.len(), "benchmark decode size mismatch");
        assert!(
            output[..decoded_len] == original[..],
            "benchmark decode produced wrong bytes"
        );

        // Measure (results were verified above; ignore them in the timed loop).
        let iterations = 100;
        let start = std::time::Instant::now();
        for _ in 0..iterations {
            let _ = inflate_into_pub(deflate_data, &mut output);
        }
        let elapsed = start.elapsed();
        let mb_per_sec = (original.len() * iterations) as f64 / elapsed.as_secs_f64() / 1_000_000.0;

        eprintln!(
            "\n[BENCH] Long Distance Matches (d>=40): {} iterations",
            iterations
        );
        eprintln!(
            "[BENCH]   Time: {:.2}ms total, {:.1} MB/s",
            elapsed.as_secs_f64() * 1000.0,
            mb_per_sec
        );
        eprintln!(
            "[BENCH]   Compression ratio: {:.2}x",
            original.len() as f64 / deflate_data.len() as f64
        );
    }

    // =========================================================================
    // Decode loop micro-benchmarks (isolated component testing)
    // =========================================================================

    /// Times bare indexed reads from a 4096-entry `u32` table, with no decode
    /// logic around them, and reports millions of lookups per second.
    #[test]
    fn bench_table_lookup_speed() {
        // 12-bit table: 4096 entries with arbitrary distinct contents.
        let table: Vec<u32> = (0..4096u32).map(|slot| slot * 0x12345).collect();

        // Scattered (but deterministic) index sequence.
        let lookups: Vec<u64> = (0..100_000u64).map(|n| n * 7919 % 4096).collect();

        let iterations = 1000;
        let timer = std::time::Instant::now();
        let mut checksum = 0u64;
        for _ in 0..iterations {
            checksum = lookups.iter().fold(checksum, |acc, &key| {
                acc.wrapping_add(table[(key & 0xFFF) as usize] as u64)
            });
        }
        let elapsed = timer.elapsed();

        let total_lookups = lookups.len() * iterations;
        let lookups_per_sec = total_lookups as f64 / elapsed.as_secs_f64() / 1_000_000.0;

        eprintln!(
            "\n[BENCH] Table Lookup: {:.1} M lookups/sec",
            lookups_per_sec
        );
        // Printing the checksum keeps the loop from being optimized away.
        eprintln!("[BENCH]   (sum={} to prevent optimization)", checksum % 1000);
    }

    /// Compares two loop shapes over the same synthetic table entries:
    /// branching on the entry kind before accounting for consumed bits
    /// ("check-first") versus accounting first and branching afterwards
    /// ("consume-first").
    #[test]
    fn bench_branch_pattern() {
        // Synthetic entries: bit 31 marks a literal, the low byte holds the
        // number of bits to consume (always 8), bits 16-23 hold a payload byte.
        let entries: Vec<u32> = (0..100_000u32)
            .map(|n| {
                // 7 out of every 10 entries are literals.
                let literal_flag = if n * 7919 % 10 < 7 { 0x8000_0000 } else { 0 };
                literal_flag | 0x0000_0008 | ((n & 0xFF) << 16)
            })
            .collect();

        let rounds = 1000;

        // Shape 1: test the entry kind first, consume inside each branch.
        let timer = std::time::Instant::now();
        let mut check_sum = 0u64;
        let mut check_bits = 0u64;
        for _ in 0..rounds {
            for &entry in &entries {
                if (entry as i32) < 0 {
                    check_bits += (entry & 0xFF) as u64;
                    check_sum += ((entry >> 16) & 0xFF) as u64;
                } else {
                    check_bits += (entry & 0xFF) as u64;
                    check_sum += entry as u64;
                }
            }
        }
        let check_first_time = timer.elapsed();

        // Shape 2: consume unconditionally, then test the entry kind.
        let timer = std::time::Instant::now();
        let mut consume_sum = 0u64;
        let mut consume_bits = 0u64;
        for _ in 0..rounds {
            for &entry in &entries {
                consume_bits += (entry & 0xFF) as u64;
                if (entry as i32) < 0 {
                    consume_sum += ((entry >> 16) & 0xFF) as u64;
                } else {
                    consume_sum += entry as u64;
                }
            }
        }
        let consume_first_time = timer.elapsed();

        let check_secs = check_first_time.as_secs_f64();
        let consume_secs = consume_first_time.as_secs_f64();

        eprintln!("\n[BENCH] Branch Pattern Comparison:");
        eprintln!(
            "[BENCH]   Check-first (ours):  {:.2}ms",
            check_secs * 1000.0
        );
        eprintln!(
            "[BENCH]   Consume-first (libdeflate): {:.2}ms",
            consume_secs * 1000.0
        );
        eprintln!(
            "[BENCH]   Consume-first speedup: {:.0}%",
            (check_secs / consume_secs - 1.0) * 100.0
        );
        // Printing the accumulators keeps both loops from being optimized away.
        eprintln!(
            "[BENCH]   (sums: {} {} bits: {} {})",
            check_sum % 1000,
            consume_sum % 1000,
            check_bits % 1000,
            consume_bits % 1000
        );
    }

    /// Benchmark ways of keeping only the low `count` bits of a 64-bit word:
    /// a plain shift-and-mask, the same expression with wrapping arithmetic,
    /// and (only when compiled with the `bmi2` target feature on x86_64) the
    /// `bzhi` intrinsic.
    ///
    /// Every variant's accumulator is printed so that none of the timed loops
    /// can be removed as dead code. The BMI2 accumulator was previously
    /// discarded with `let _ = ...`, which does not keep the loop alive.
    #[test]
    fn bench_bit_extraction() {
        let words: Vec<u64> = (0..100_000).map(|i| i * 0x12345678ABCD).collect();
        // Bit counts cycle through 1..=15.
        let counts: Vec<u32> = (0..100_000).map(|i| (i % 15 + 1) as u32).collect();

        let iterations = 500;

        // Pattern 1: Mask with shift (standard approach)
        let start = std::time::Instant::now();
        let mut sum = 0u64;
        for _ in 0..iterations {
            for (&word, &count) in words.iter().zip(counts.iter()) {
                sum = sum.wrapping_add(word & ((1u64 << count) - 1));
            }
        }
        let elapsed_mask = start.elapsed();

        // Pattern 2: Using wrapping operations (may optimize differently)
        let start = std::time::Instant::now();
        let mut sum2 = 0u64;
        for _ in 0..iterations {
            for (&word, &count) in words.iter().zip(counts.iter()) {
                sum2 = sum2.wrapping_add(word & (1u64.wrapping_shl(count).wrapping_sub(1)));
            }
        }
        let elapsed_wrap = start.elapsed();

        // Pattern 3: bzhi intrinsic, compiled in only when the build enables
        // the `bmi2` target feature.
        #[cfg(all(target_arch = "x86_64", target_feature = "bmi2"))]
        let (elapsed_bmi2, sum3) = {
            use std::arch::x86_64::_bzhi_u64;
            let start = std::time::Instant::now();
            let mut sum3 = 0u64;
            for _ in 0..iterations {
                for (&word, &count) in words.iter().zip(counts.iter()) {
                    // SAFETY: this code is only compiled when the `bmi2`
                    // target feature is enabled, so the instruction is
                    // available on every CPU the binary is allowed to run on.
                    sum3 = sum3.wrapping_add(unsafe { _bzhi_u64(word, count) });
                }
            }
            (start.elapsed(), sum3)
        };

        eprintln!("\n[BENCH] Bit Extraction Methods:");
        eprintln!(
            "[BENCH]   Mask (standard): {:.2}ms",
            elapsed_mask.as_secs_f64() * 1000.0
        );
        eprintln!(
            "[BENCH]   Wrapping ops:    {:.2}ms",
            elapsed_wrap.as_secs_f64() * 1000.0
        );
        // The BMI2 accumulator is printed here so its loop has an observable result.
        #[cfg(all(target_arch = "x86_64", target_feature = "bmi2"))]
        eprintln!(
            "[BENCH]   BMI2 bzhi:       {:.2}ms (sum: {})",
            elapsed_bmi2.as_secs_f64() * 1000.0,
            sum3 % 1000
        );
        // This is a compile-time decision, not a runtime CPU probe.
        #[cfg(not(all(target_arch = "x86_64", target_feature = "bmi2")))]
        eprintln!("[BENCH]   BMI2 bzhi:       (bmi2 target feature not enabled at compile time)");
        eprintln!(
            "[BENCH]   (sums: {} {} to prevent optimization)",
            sum % 1000,
            sum2 % 1000
        );
    }

    // =========================================================================
    // Advanced optimization benchmarks
    // =========================================================================

    /// Benchmark two ways of handling codes longer than the 12-bit main table:
    /// a fallback branch versus a second-level subtable lookup.
    ///
    /// One main-table entry in twenty (5%) is flagged as a long code via bit
    /// 14 (`0x4000`). Both variants branch on that flag. The fallback variant
    /// previously tested `entry & 0xFF == 0`, but every entry's low byte is 8
    /// or 12, so its fallback branch was never taken and the two timings were
    /// not comparable.
    #[test]
    fn bench_subtable_vs_fallback() {
        // Simulate entries: 95% direct decode, 5% need subtable/fallback
        let main_table: Vec<u32> = (0..4096)
            .map(|i| {
                if i % 20 == 0 {
                    // Long-code entry: flag in bit 14, index in the high bits,
                    // extra-bit count (4) in bits 8-13, 12 bits consumed.
                    0x4000 | ((i as u32) << 16) | (4 << 8) | 12
                } else {
                    // Direct entry: literal flag in bit 31, 8 bits consumed.
                    0x8000_0008 | ((i as u32 & 0xFF) << 16)
                }
            })
            .collect();

        let subtable: Vec<u32> = (0..256).map(|i| 0x8000_0004 | (i << 16)).collect();

        let bits_sequence: Vec<u64> = (0..100_000)
            .map(|i| (i * 0x12345678) % 0xFFFFFFFF)
            .collect();

        let iterations = 500;

        // Pattern 1: Fallback decode
        let start = std::time::Instant::now();
        let mut sum = 0u64;
        for _ in 0..iterations {
            for &bits in &bits_sequence {
                let idx = (bits & 0xFFF) as usize;
                let entry = main_table[idx];
                if entry & 0x4000 != 0 {
                    // Long code: take the fallback branch. The body is only a
                    // cheap stand-in for a real bit-by-bit decode.
                    sum = sum.wrapping_add(bits & 0xFF);
                } else {
                    sum = sum.wrapping_add(entry as u64);
                }
            }
        }
        let elapsed_fallback = start.elapsed();

        // Pattern 2: Subtable lookup (libdeflate approach)
        let start = std::time::Instant::now();
        let mut sum2 = 0u64;
        for _ in 0..iterations {
            for &bits in &bits_sequence {
                let idx = (bits & 0xFFF) as usize;
                let entry = main_table[idx];
                if entry & 0x4000 != 0 {
                    // Subtable: use high bits as index, extra bits from input
                    let subtable_idx = (entry >> 16) as usize;
                    let extra_bits = (entry >> 8) & 0x3F;
                    let sub_idx = (bits >> 12) & ((1 << extra_bits) - 1);
                    // The modulo keeps the synthetic index inside the subtable.
                    let sub_entry = subtable[(subtable_idx + sub_idx as usize) % subtable.len()];
                    sum2 = sum2.wrapping_add(sub_entry as u64);
                } else {
                    sum2 = sum2.wrapping_add(entry as u64);
                }
            }
        }
        let elapsed_subtable = start.elapsed();

        eprintln!("\n[BENCH] Subtable vs Fallback (5% long codes):");
        eprintln!(
            "[BENCH]   Fallback (current):  {:.2}ms",
            elapsed_fallback.as_secs_f64() * 1000.0
        );
        eprintln!(
            "[BENCH]   Subtable (libdeflate): {:.2}ms",
            elapsed_subtable.as_secs_f64() * 1000.0
        );
        eprintln!(
            "[BENCH]   Ratio: {:.2}x",
            elapsed_fallback.as_secs_f64() / elapsed_subtable.as_secs_f64()
        );
        // Printing the accumulators keeps both loops from being optimized away.
        eprintln!(
            "[BENCH]   (sums: {} {} to prevent opt)",
            sum % 1000,
            sum2 % 1000
        );
    }

    /// Compares "rebuild the table every time" with "look the table up by a
    /// fingerprint of its code lengths", using a byte sum as a stand-in for
    /// the real table-build cost.
    #[test]
    fn bench_jit_table_cache() {
        use std::collections::HashMap;

        // 100 synthetic code-length vectors of 288 entries, values in 1..=14.
        let code_lengths: Vec<Vec<u8>> = (0..100i32)
            .map(|seed| {
                (0..288i32)
                    .map(|sym| ((seed * 7 + sym * 3) % 15).clamp(1, 15) as u8)
                    .collect()
            })
            .collect();

        // Only the first 10 vectors are queried, round-robin.
        let patterns: Vec<&Vec<u8>> = code_lengths.iter().take(10).collect();
        let queries: Vec<usize> = (0..10000usize).map(|q| q % 10).collect();

        let iterations = 50;

        // Variant 1: no cache, pay the simulated build cost on every query.
        let timer = std::time::Instant::now();
        let mut rebuild_sum = 0u64;
        for _ in 0..iterations {
            for &which in &queries {
                let lens = &patterns[which];
                let built: u64 = lens.iter().map(|&len| len as u64).sum();
                rebuild_sum = rebuild_sum.wrapping_add(built);
            }
        }
        let elapsed_rebuild = timer.elapsed();

        // Variant 2: fingerprint the lengths and build only on a cache miss.
        let timer = std::time::Instant::now();
        let mut cached_sum = 0u64;
        let mut cache: HashMap<u64, u64> = HashMap::new();
        for _ in 0..iterations {
            for &which in &queries {
                let lens = &patterns[which];

                // Toy fingerprint: XOR of each length shifted by its index mod 8.
                let mut fingerprint = 0u64;
                for (pos, &len) in lens.iter().enumerate() {
                    fingerprint ^= (len as u64) << (pos % 8);
                }

                let cached = *cache
                    .entry(fingerprint)
                    .or_insert_with(|| lens.iter().map(|&len| len as u64).sum());
                cached_sum = cached_sum.wrapping_add(cached);
            }
        }
        let elapsed_cached = timer.elapsed();

        let rebuild_secs = elapsed_rebuild.as_secs_f64();
        let cached_secs = elapsed_cached.as_secs_f64();

        eprintln!("\n[BENCH] JIT Table Cache (10 unique patterns):");
        eprintln!(
            "[BENCH]   Always rebuild:  {:.2}ms",
            rebuild_secs * 1000.0
        );
        eprintln!(
            "[BENCH]   Fingerprint cache: {:.2}ms",
            cached_secs * 1000.0
        );
        eprintln!("[BENCH]   Speedup: {:.1}x", rebuild_secs / cached_secs);
        // Printing the accumulators keeps both loops from being optimized away.
        eprintln!(
            "[BENCH]   (sums: {} {} to prevent opt)",
            rebuild_sum % 1000,
            cached_sum % 1000
        );
    }

    /// Compares one-symbol-per-lookup decoding with a table whose entries may
    /// carry a second symbol, and reports the symbol throughput of each.
    ///
    /// Multi-symbol entry layout used here (64-bit):
    /// symbol 1 in bits 56-63, its bit count in 52-55, symbol 2 in bits 44-51,
    /// its bit count in 40-43, the "has second symbol" flag in bit 8 and the
    /// combined bit count in the low byte.
    #[test]
    fn bench_multi_symbol_decode() {
        const HAS_SECOND: u64 = 1 << 8;

        // Single-symbol table: symbol in bits 16-23, 8 consumed bits in the low byte.
        let single_table: Vec<u32> = (0..4096u32).map(|i| (i & 0xFF) << 16 | 8).collect();

        // Multi-symbol table: 6 of every 10 entries carry two literals.
        let multi_table: Vec<u64> = (0..4096u64)
            .map(|i| {
                let first = (i & 0xFF) << 56 | 8 << 52;
                if i % 10 < 6 {
                    let second = ((i >> 4) & 0xFF) << 44 | 8 << 40;
                    first | second | 16 | HAS_SECOND
                } else {
                    // Single symbol only, 8 bits consumed.
                    first | 8
                }
            })
            .collect();

        let indices: Vec<usize> = (0..100_000usize).map(|n| n * 7919 % 4096).collect();

        let iterations = 500;

        // Variant 1: exactly one symbol per table lookup.
        let timer = std::time::Instant::now();
        let mut single_sum = 0u64;
        let mut single_symbols = 0u64;
        for _ in 0..iterations {
            for &idx in &indices {
                let entry = single_table[idx];
                single_sum = single_sum.wrapping_add((entry >> 16) as u64);
                single_symbols += 1;
            }
        }
        let elapsed_single = timer.elapsed();

        // Variant 2: one or two symbols per table lookup.
        let timer = std::time::Instant::now();
        let mut multi_sum = 0u64;
        let mut multi_symbols = 0u64;
        for _ in 0..iterations {
            for &idx in &indices {
                let entry = multi_table[idx];
                multi_sum = multi_sum.wrapping_add(entry >> 56);
                multi_symbols += 1;

                if entry & HAS_SECOND != 0 {
                    multi_sum = multi_sum.wrapping_add((entry >> 44) & 0xFF);
                    multi_symbols += 1;
                }
            }
        }
        let elapsed_multi = timer.elapsed();

        let single_secs = elapsed_single.as_secs_f64();
        let multi_secs = elapsed_multi.as_secs_f64();

        eprintln!("\n[BENCH] Multi-Symbol Decode (60% doubles):");
        eprintln!(
            "[BENCH]   Single symbol: {:.2}ms ({} symbols)",
            single_secs * 1000.0,
            single_symbols
        );
        eprintln!(
            "[BENCH]   Multi symbol:  {:.2}ms ({} symbols)",
            multi_secs * 1000.0,
            multi_symbols
        );
        eprintln!(
            "[BENCH]   Symbols/ms single: {:.0}",
            single_symbols as f64 / single_secs / 1000.0
        );
        eprintln!(
            "[BENCH]   Symbols/ms multi:  {:.0}",
            multi_symbols as f64 / multi_secs / 1000.0
        );
        // Printing the accumulators keeps both loops from being optimized away.
        eprintln!(
            "[BENCH]   (sums: {} {} to prevent opt)",
            single_sum % 1000,
            multi_sum % 1000
        );
    }

    /// Establishes a libdeflate reference decode for the consume-first work.
    ///
    /// Compresses a repeated sentence to a raw deflate stream with flate2,
    /// decompresses it with libdeflate and checks the result against the input.
    ///
    /// NOTE(review): only the libdeflate reference is exercised here; no
    /// consume-first decoder is invoked in this test.
    #[test]
    fn test_consume_first_integration() {
        use flate2::write::DeflateEncoder;
        use flate2::Compression;
        use std::io::Write;

        let original = b"Hello World! This is a test of the consume-first decoder. ".repeat(50);

        let compressed = {
            let mut deflater = DeflateEncoder::new(Vec::new(), Compression::default());
            deflater.write_all(&original).unwrap();
            deflater.finish().unwrap()
        };

        // Reference decode; the output buffer is sized exactly to the input.
        let mut reference = vec![0u8; original.len()];
        let reference_len = libdeflater::Decompressor::new()
            .deflate_decompress(&compressed, &mut reference)
            .expect("libdeflate failed");

        eprintln!("\n[TEST] Consume-first integration test:");
        eprintln!("[TEST]   Original: {} bytes", original.len());
        eprintln!("[TEST]   Compressed: {} bytes", compressed.len());
        eprintln!("[TEST]   libdeflate output: {} bytes", reference_len);

        assert_eq!(&reference[..reference_len], &original[..]);
        eprintln!("[TEST]   ✓ libdeflate matches original");
    }

    /// Diagnostic dump of how `ConsumeFirstTable` classifies main-table entries
    /// for the fixed Huffman literal/length code.
    ///
    /// Counts literal, length, end-of-block and subtable entries over the bit
    /// patterns `0..2048` and prints the first 16 entries in detail. Nothing is
    /// asserted beyond the table building successfully.
    #[test]
    fn test_consume_first_entry_debug() {
        use crate::decompress::inflate::consume_first_table::ConsumeFirstTable;

        // Fixed Huffman code lengths: symbols 0-143 use 8 bits, 144-255 use 9,
        // 256-279 (end-of-block and length codes) use 7, 280-287 use 8.
        let lit_len_lengths: Vec<u8> = std::iter::repeat(8u8)
            .take(144)
            .chain(std::iter::repeat(9).take(112))
            .chain(std::iter::repeat(7).take(24))
            .chain(std::iter::repeat(8).take(8))
            .collect();

        let lit_table = ConsumeFirstTable::build(&lit_len_lengths).unwrap();

        eprintln!("\n[DEBUG] ConsumeFirstTable entry types:");

        // Tally slots: literal, length, end-of-block, subtable. An entry is
        // counted under the first predicate it satisfies, if any.
        let mut tally = [0u32; 4];
        for pattern in 0..2048u64 {
            let entry = lit_table.lookup_main(pattern);
            let slot = if entry.is_literal() {
                Some(0)
            } else if entry.is_length() {
                Some(1)
            } else if entry.is_eob() {
                Some(2)
            } else if entry.is_subtable() {
                Some(3)
            } else {
                None
            };
            if let Some(slot) = slot {
                tally[slot] += 1;
            }
        }
        let [literal_count, length_count, eob_count, subtable_count] = tally;

        eprintln!("[DEBUG]   Literals: {}", literal_count);
        eprintln!("[DEBUG]   Lengths: {}", length_count);
        eprintln!("[DEBUG]   EOB: {}", eob_count);
        eprintln!("[DEBUG]   Subtables: {}", subtable_count);

        // If every code is replicated across an 11-bit main table, the fixed
        // code would occupy:
        //   8-bit literals (0-143):   2^(11-8) * 144 = 1152 entries
        //   9-bit literals (144-255): 2^(11-9) * 112 =  448 entries
        //   7-bit EOB (256):          2^(11-7) * 1   =   16 entries
        //   7-bit lengths (257-279):  2^(11-7) * 23  =  368 entries
        //   8-bit lengths (280-285):  2^(11-8) * 6   =   48 entries
        // NOTE(review): these figures are not asserted; confirm against the
        // table implementation before relying on them.

        // Detailed view of the first few entries.
        for pattern in 0u64..16 {
            let entry = lit_table.lookup_main(pattern);
            eprintln!(
                "[DEBUG]   Pattern {:3}: sym={:3}, bits={}, lit={}, len={}, eob={}, sub={}",
                pattern,
                entry.symbol(),
                entry.bits(),
                entry.is_literal(),
                entry.is_length(),
                entry.is_eob(),
                entry.is_subtable()
            );
        }
    }

    /// Benchmark the current turbo decode path against libdeflate.
    ///
    /// Decodes the same raw-deflate stream `iterations` times with each decoder
    /// and prints elapsed time, throughput and the turbo/libdeflate ratio to
    /// stderr. Output buffers and the libdeflate decompressor are created once,
    /// outside the timed loops, so the figures reflect decode work rather than
    /// allocation. After timing, both outputs are checked against the input so
    /// that a fast-but-wrong decoder cannot produce a meaningless number.
    #[test]
    fn bench_consume_first_vs_turbo() {
        use flate2::write::DeflateEncoder;
        use flate2::Compression;
        use std::io::Write;

        // Create test data: a repeating 0..=255 byte ramp.
        let original: Vec<u8> = (0..50_000).map(|i| (i % 256) as u8).collect();

        // Compress
        let mut encoder = DeflateEncoder::new(Vec::new(), Compression::default());
        encoder.write_all(&original).unwrap();
        let compressed = encoder.finish().unwrap();

        let iterations = 100;

        // Loop-invariant setup, hoisted out of the timed regions.
        let mut turbo_out = vec![0u8; original.len()];
        let mut libdeflate_out = vec![0u8; original.len()];
        let mut decompressor = libdeflater::Decompressor::new();

        // Benchmark current turbo path
        let mut turbo_size = 0;
        let start = std::time::Instant::now();
        for _ in 0..iterations {
            turbo_size = super::inflate_into_pub(&compressed, &mut turbo_out).unwrap();
        }
        let elapsed_turbo = start.elapsed();

        // Benchmark libdeflate
        let mut libdeflate_size = 0;
        let start = std::time::Instant::now();
        for _ in 0..iterations {
            libdeflate_size = decompressor
                .deflate_decompress(&compressed, &mut libdeflate_out)
                .unwrap();
        }
        let elapsed_libdeflate = start.elapsed();

        // Sanity-check both decoders (outside the timed regions).
        assert_eq!(turbo_size, original.len(), "turbo output length mismatch");
        assert!(
            turbo_out[..turbo_size] == original[..],
            "turbo output differs from input"
        );
        assert_eq!(
            libdeflate_size,
            original.len(),
            "libdeflate output length mismatch"
        );
        assert!(
            libdeflate_out[..libdeflate_size] == original[..],
            "libdeflate output differs from input"
        );

        let bytes_total = original.len() * iterations;
        let turbo_mbs = bytes_total as f64 / elapsed_turbo.as_secs_f64() / 1_000_000.0;
        let libdeflate_mbs = bytes_total as f64 / elapsed_libdeflate.as_secs_f64() / 1_000_000.0;

        eprintln!("\n[BENCH] Consume-First Integration Benchmark:");
        eprintln!(
            "[BENCH]   Current turbo:  {:.2}ms ({:.1} MB/s)",
            elapsed_turbo.as_secs_f64() * 1000.0,
            turbo_mbs
        );
        eprintln!(
            "[BENCH]   libdeflate:     {:.2}ms ({:.1} MB/s)",
            elapsed_libdeflate.as_secs_f64() * 1000.0,
            libdeflate_mbs
        );
        eprintln!(
            "[BENCH]   Ratio: {:.1}%",
            turbo_mbs / libdeflate_mbs * 100.0
        );
    }

    // =========================================================================
    // CONSUME-FIRST DEBUG TESTS
    // These validate each assumption about the consume_first decode path
    // =========================================================================

    /// Test 1: round-trip a four-byte input ("ABCD") through the turbo path,
    /// using libdeflate as the reference decoder.
    #[test]
    fn test_cf_small_literals_only() {
        use flate2::write::DeflateEncoder;
        use flate2::Compression;
        use std::io::Write;

        // Simple data: just "ABCD"
        let original: Vec<u8> = b"ABCD".to_vec();

        // Raw deflate stream produced by flate2 at the default level.
        let compressed = {
            let mut enc = DeflateEncoder::new(Vec::new(), Compression::default());
            enc.write_all(&original).unwrap();
            enc.finish().unwrap()
        };

        eprintln!("\n[CF-TEST] Small literals only:");
        eprintln!("[CF-TEST]   Original: {:?}", original);
        eprintln!("[CF-TEST]   Compressed: {:?}", compressed);

        // Output buffers carry 100 spare bytes beyond the expected size.
        let padded_len = original.len() + 100;

        // Reference decode: libdeflate must reproduce the input.
        let mut reference = vec![0u8; padded_len];
        let reference_len = libdeflater::Decompressor::new()
            .deflate_decompress(&compressed, &mut reference)
            .expect("libdeflate failed");

        eprintln!("[CF-TEST]   libdeflate output: {} bytes", reference_len);
        assert_eq!(&reference[..reference_len], original.as_slice());

        // Decoder under test: the turbo path must reproduce it as well.
        let mut decoded = vec![0u8; padded_len];
        let decoded_len = super::inflate_into_pub(&compressed, &mut decoded).expect("turbo failed");

        eprintln!("[CF-TEST]   turbo output: {} bytes", decoded_len);
        assert_eq!(&decoded[..decoded_len], original.as_slice());
        eprintln!("[CF-TEST]   ✓ Passed");
    }

    /// Test 2: round-trip a run of ten identical bytes (an RLE-style input)
    /// through the turbo path, with libdeflate as the reference.
    #[test]
    fn test_cf_rle_pattern() {
        use flate2::write::DeflateEncoder;
        use flate2::Compression;
        use std::io::Write;

        // Ten 'A's; the intent is for the encoder to emit a distance-1 match.
        let original: Vec<u8> = b"AAAAAAAAAA".to_vec();

        let compressed = {
            let mut enc = DeflateEncoder::new(Vec::new(), Compression::default());
            enc.write_all(&original).unwrap();
            enc.finish().unwrap()
        };

        eprintln!("\n[CF-TEST] RLE pattern:");
        eprintln!(
            "[CF-TEST]   Original: {:?} ({} bytes)",
            String::from_utf8_lossy(&original),
            original.len()
        );
        eprintln!("[CF-TEST]   Compressed: {} bytes", compressed.len());

        let padded_len = original.len() + 100;

        // Reference decode with libdeflate.
        let mut reference = vec![0u8; padded_len];
        let reference_len = libdeflater::Decompressor::new()
            .deflate_decompress(&compressed, &mut reference)
            .expect("libdeflate failed");
        assert_eq!(&reference[..reference_len], original.as_slice());

        // Decode with the turbo path.
        let mut decoded = vec![0u8; padded_len];
        let decoded_len = super::inflate_into_pub(&compressed, &mut decoded).expect("turbo failed");
        assert_eq!(&decoded[..decoded_len], original.as_slice());

        eprintln!("[CF-TEST]   ✓ Passed");
    }

    /// Test 3: round-trip a short input containing a repeated word, the kind
    /// of data that typically yields deflate back-references.
    #[test]
    fn test_cf_with_backrefs() {
        use flate2::write::DeflateEncoder;
        use flate2::Compression;
        use std::io::Write;

        // "Hello " repeats, giving the encoder something to reference back to.
        let original: Vec<u8> = b"Hello Hello Hello".to_vec();

        let compressed = {
            let mut enc = DeflateEncoder::new(Vec::new(), Compression::default());
            enc.write_all(&original).unwrap();
            enc.finish().unwrap()
        };

        eprintln!("\n[CF-TEST] With back-references:");
        eprintln!(
            "[CF-TEST]   Original: {:?}",
            String::from_utf8_lossy(&original)
        );
        eprintln!("[CF-TEST]   Compressed: {} bytes", compressed.len());

        let padded_len = original.len() + 100;

        // libdeflate is the reference decoder.
        let mut reference = vec![0u8; padded_len];
        let reference_len = libdeflater::Decompressor::new()
            .deflate_decompress(&compressed, &mut reference)
            .expect("libdeflate failed");
        assert_eq!(&reference[..reference_len], original.as_slice());

        // The turbo path is the decoder under test.
        let mut decoded = vec![0u8; padded_len];
        let decoded_len = super::inflate_into_pub(&compressed, &mut decoded).expect("turbo failed");
        assert_eq!(&decoded[..decoded_len], original.as_slice());

        eprintln!("[CF-TEST]   ✓ Passed");
    }

    /// Test 4: round-trip ten repetitions of an English pangram through the
    /// turbo path, comparing against the input via `assert_bytes_eq`.
    #[test]
    fn test_cf_complex_pattern() {
        use flate2::write::DeflateEncoder;
        use flate2::Compression;
        use std::io::Write;

        // Complex pattern: the same sentence, ten times over.
        let original: Vec<u8> = b"The quick brown fox jumps over the lazy dog. ".repeat(10);

        let compressed = {
            let mut enc = DeflateEncoder::new(Vec::new(), Compression::default());
            enc.write_all(&original).unwrap();
            enc.finish().unwrap()
        };

        let ratio_percent = compressed.len() as f64 / original.len() as f64 * 100.0;
        eprintln!("\n[CF-TEST] Complex pattern:");
        eprintln!("[CF-TEST]   Original: {} bytes", original.len());
        eprintln!("[CF-TEST]   Compressed: {} bytes", compressed.len());
        eprintln!("[CF-TEST]   Ratio: {:.1}%", ratio_percent);

        let padded_len = original.len() + 100;

        // Reference decode with libdeflate.
        let mut reference = vec![0u8; padded_len];
        let reference_len = libdeflater::Decompressor::new()
            .deflate_decompress(&compressed, &mut reference)
            .expect("libdeflate failed");
        assert_bytes_eq(&reference[..reference_len], &original[..], "libdeflate");

        // Decode with the turbo path.
        let mut decoded = vec![0u8; padded_len];
        let decoded_len = super::inflate_into_pub(&compressed, &mut decoded).expect("turbo failed");
        assert_bytes_eq(&decoded[..decoded_len], &original[..], "cf_complex_pattern");

        eprintln!("[CF-TEST]   ✓ Passed");
    }

    /// Test 5: round-trip a 256-byte buffer holding every byte value exactly
    /// once, in ascending order.
    #[test]
    fn test_cf_binary_data() {
        use flate2::write::DeflateEncoder;
        use flate2::Compression;
        use std::io::Write;

        // Every possible byte value, 0 through 255.
        let original: Vec<u8> = (0..=u8::MAX).collect();

        let compressed = {
            let mut enc = DeflateEncoder::new(Vec::new(), Compression::default());
            enc.write_all(&original).unwrap();
            enc.finish().unwrap()
        };

        eprintln!("\n[CF-TEST] Binary data (0-255):");
        eprintln!("[CF-TEST]   Original: {} bytes", original.len());
        eprintln!("[CF-TEST]   Compressed: {} bytes", compressed.len());

        let padded_len = original.len() + 100;

        // Reference decode with libdeflate.
        let mut reference = vec![0u8; padded_len];
        let reference_len = libdeflater::Decompressor::new()
            .deflate_decompress(&compressed, &mut reference)
            .expect("libdeflate failed");
        assert_eq!(&reference[..reference_len], original.as_slice());

        // Decode with the turbo path.
        let mut decoded = vec![0u8; padded_len];
        let decoded_len = super::inflate_into_pub(&compressed, &mut decoded).expect("turbo failed");
        assert_eq!(&decoded[..decoded_len], original.as_slice());

        eprintln!("[CF-TEST]   ✓ Passed");
    }

    /// Test 6: edge case - a deflate stream encoding zero bytes must decode
    /// to zero bytes with both libdeflate and the turbo path.
    #[test]
    fn test_cf_empty_data() {
        use flate2::write::DeflateEncoder;
        use flate2::Compression;
        use std::io::Write;

        let original: Vec<u8> = Vec::new();

        let compressed = {
            let mut enc = DeflateEncoder::new(Vec::new(), Compression::default());
            enc.write_all(&original).unwrap();
            enc.finish().unwrap()
        };

        eprintln!("\n[CF-TEST] Empty data:");
        eprintln!("[CF-TEST]   Original: {} bytes", original.len());
        eprintln!("[CF-TEST]   Compressed: {} bytes", compressed.len());
        eprintln!("[CF-TEST]   Compressed bytes: {:?}", compressed);

        // Reference decode into a 100-byte scratch buffer.
        let mut reference = vec![0u8; 100];
        let reference_len = libdeflater::Decompressor::new()
            .deflate_decompress(&compressed, &mut reference)
            .expect("libdeflate failed");
        assert_eq!(reference_len, 0);

        // Turbo decode into an equally sized scratch buffer.
        let mut decoded = vec![0u8; 100];
        let decoded_len = super::inflate_into_pub(&compressed, &mut decoded).expect("turbo failed");
        assert_eq!(decoded_len, 0);

        eprintln!("[CF-TEST]   ✓ Passed");
    }

    /// Test 7: round-trip 1 KiB of a repeating eight-byte pattern, a match-heavy
    /// input for the turbo path.
    #[test]
    fn test_cf_medium_repetitive() {
        use flate2::write::DeflateEncoder;
        use flate2::Compression;
        use std::io::Write;

        // 8 bytes x 128 repetitions = 1024 bytes.
        let original: Vec<u8> = b"ABCDEFGH".repeat(128);

        let compressed = {
            let mut enc = DeflateEncoder::new(Vec::new(), Compression::default());
            enc.write_all(&original).unwrap();
            enc.finish().unwrap()
        };

        eprintln!("\n[CF-TEST] Medium repetitive (1KB):");
        eprintln!("[CF-TEST]   Original: {} bytes", original.len());
        eprintln!("[CF-TEST]   Compressed: {} bytes", compressed.len());

        let padded_len = original.len() + 100;

        // Reference decode with libdeflate.
        let mut reference = vec![0u8; padded_len];
        let reference_len = libdeflater::Decompressor::new()
            .deflate_decompress(&compressed, &mut reference)
            .expect("libdeflate failed");
        assert_eq!(&reference[..reference_len], original.as_slice());

        // Decode with the turbo path.
        let mut decoded = vec![0u8; padded_len];
        let decoded_len = super::inflate_into_pub(&compressed, &mut decoded).expect("turbo failed");
        assert_eq!(&decoded[..decoded_len], original.as_slice());

        eprintln!("[CF-TEST]   ✓ Passed");
    }

    /// Test 8: round-trip 10 KiB generated by the byte sequence
    /// `(i * 7 + 13) % 256` through the turbo path.
    #[test]
    fn test_cf_large_data() {
        use flate2::write::DeflateEncoder;
        use flate2::Compression;
        use std::io::Write;

        // 10240 bytes from a simple linear generator reduced modulo 256.
        let original: Vec<u8> = (0..10240)
            .map(|i| i * 7 + 13)
            .map(|v| (v % 256) as u8)
            .collect();

        let compressed = {
            let mut enc = DeflateEncoder::new(Vec::new(), Compression::default());
            enc.write_all(&original).unwrap();
            enc.finish().unwrap()
        };

        eprintln!("\n[CF-TEST] Large data (10KB):");
        eprintln!("[CF-TEST]   Original: {} bytes", original.len());
        eprintln!("[CF-TEST]   Compressed: {} bytes", compressed.len());

        let padded_len = original.len() + 100;

        // Reference decode with libdeflate.
        let mut reference = vec![0u8; padded_len];
        let reference_len = libdeflater::Decompressor::new()
            .deflate_decompress(&compressed, &mut reference)
            .expect("libdeflate failed");
        assert_eq!(&reference[..reference_len], original.as_slice());

        // Decode with the turbo path.
        let mut decoded = vec![0u8; padded_len];
        let decoded_len = super::inflate_into_pub(&compressed, &mut decoded).expect("turbo failed");
        assert_eq!(&decoded[..decoded_len], original.as_slice());

        eprintln!("[CF-TEST]   ✓ Passed");
    }

    /// Test: first 100 KB of the `dickens` corpus file, compressed at flate2's
    /// best level, must round-trip through the turbo path.
    ///
    /// The test is skipped (returns early with a note on stderr) when
    /// `benchmark_data/dickens` cannot be read.
    ///
    /// Both the content and the length of the turbo output are verified. A
    /// byte-wise `zip` comparison alone stops at the shorter slice, so a
    /// truncated or over-long output whose common prefix matches would
    /// otherwise be reported as a pass.
    #[test]
    fn test_cf_dickens_max() {
        use flate2::write::DeflateEncoder;
        use flate2::Compression;
        use std::io::Write;

        // Read dickens file (first 100KB to keep test fast)
        let original = match std::fs::read("benchmark_data/dickens") {
            Ok(d) => d[..100_000.min(d.len())].to_vec(),
            Err(_) => {
                eprintln!("[CF-TEST] Skipping - no dickens file");
                return;
            }
        };

        eprintln!("\n[CF-TEST] Dickens MAX compression test:");
        eprintln!("[CF-TEST]   Original: {} bytes", original.len());

        // Use BEST compression like silesia-gzip.tar.gz
        let mut encoder = DeflateEncoder::new(Vec::new(), Compression::best());
        encoder.write_all(&original).unwrap();
        let compressed = encoder.finish().unwrap();

        eprintln!("[CF-TEST]   Compressed: {} bytes", compressed.len());

        // Decompress with libdeflate (reference)
        let mut libdeflate_out = vec![0u8; original.len() + 100];
        let libdeflate_size = libdeflater::Decompressor::new()
            .deflate_decompress(&compressed, &mut libdeflate_out)
            .expect("libdeflate failed");

        assert_eq!(&libdeflate_out[..libdeflate_size], &original[..]);

        // Decompress with our turbo path
        let mut turbo_out = vec![0u8; original.len() + 100];
        let turbo_size = match super::inflate_into_pub(&compressed, &mut turbo_out) {
            Ok(size) => size,
            Err(e) => panic!("Decompression failed: {:?}", e),
        };

        // Report the first differing byte within the common prefix, if any.
        let first_mismatch = turbo_out[..turbo_size]
            .iter()
            .zip(original.iter())
            .enumerate()
            .find(|(_, (a, b))| a != b);

        if let Some((pos, (got, exp))) = first_mismatch {
            eprintln!(
                "[CF-TEST]   First mismatch at byte {}: got {} exp {}",
                pos, got, exp
            );
            panic!("Content mismatch at byte {}", pos);
        }

        // The common prefix matches; the lengths must match too.
        assert_eq!(
            turbo_size,
            original.len(),
            "Length mismatch: turbo produced {} bytes, expected {}",
            turbo_size,
            original.len()
        );

        eprintln!("[CF-TEST]   ✓ Passed");
    }

    /// Test: round-trip a larger input that alternates between English text
    /// and a binary byte ramp, compressed at flate2's best level.
    ///
    /// The alternating patterns are intended to make the encoder emit several
    /// deflate blocks; the test itself does not inspect the block structure.
    #[test]
    fn test_cf_multi_block() {
        use flate2::write::DeflateEncoder;
        use flate2::Compression;
        use std::io::Write;

        // Two contrasting segments: 200 copies of a sentence, then 30 copies of
        // the full 0..=255 byte ramp. The pair is appended ten times.
        let text_segment = b"The quick brown fox jumps over the lazy dog. ".repeat(200);
        let ramp_segment = (0u8..=255).collect::<Vec<_>>().repeat(30);

        let mut original = Vec::new();
        for _ in 0..10 {
            original.extend_from_slice(&text_segment);
            original.extend_from_slice(&ramp_segment);
        }

        eprintln!("\n[CF-TEST] Multi-block test:");
        eprintln!("[CF-TEST]   Original: {} bytes", original.len());

        let compressed = {
            let mut enc = DeflateEncoder::new(Vec::new(), Compression::best());
            enc.write_all(&original).unwrap();
            enc.finish().unwrap()
        };

        eprintln!("[CF-TEST]   Compressed: {} bytes", compressed.len());

        let padded_len = original.len() + 100;

        // Reference decode with libdeflate.
        let mut reference = vec![0u8; padded_len];
        let reference_len = libdeflater::Decompressor::new()
            .deflate_decompress(&compressed, &mut reference)
            .expect("libdeflate failed");
        assert_bytes_eq(&reference[..reference_len], &original[..], "libdeflate");

        // Decode with the turbo path; check the size first, then the bytes.
        let mut decoded = vec![0u8; padded_len];
        let decoded_len = super::inflate_into_pub(&compressed, &mut decoded).expect("turbo failed");

        assert_eq!(decoded_len, original.len());
        assert_bytes_eq(&decoded[..decoded_len], &original[..], "cf_multi_block");
        eprintln!("[CF-TEST]   ✓ Passed");
    }

    /// Test 10: gzip container (header + deflate + trailer) rather than a raw
    /// deflate stream, decoded through `inflate_gzip_preallocated`.
    #[test]
    fn test_cf_gzip_format() {
        use crate::decompress::parallel::ultra_fast_inflate::inflate_gzip_preallocated;
        use flate2::write::GzEncoder;
        use flate2::Compression;
        use std::io::Write;

        // 1 MB generated by `(i * 17 + 13) % 256`.
        let original: Vec<u8> = (0..1_000_000)
            .map(|i| i * 17 + 13)
            .map(|v| (v % 256) as u8)
            .collect();

        let compressed = {
            let mut enc = GzEncoder::new(Vec::new(), Compression::default());
            enc.write_all(&original).unwrap();
            enc.finish().unwrap()
        };

        eprintln!("\n[CF-TEST] Gzip format:");
        eprintln!("[CF-TEST]   Original: {} bytes", original.len());
        eprintln!("[CF-TEST]   Compressed: {} bytes", compressed.len());

        // Reference decode: libdeflate's gzip entry point.
        let mut reference = vec![0u8; original.len() + 100];
        let reference_len = libdeflater::Decompressor::new()
            .gzip_decompress(&compressed, &mut reference)
            .expect("libdeflate failed");
        assert_eq!(&reference[..reference_len], original.as_slice());

        // Decoder under test: starts from an empty Vec that the callee fills.
        let mut our_out = Vec::new();
        let our_size =
            inflate_gzip_preallocated(&compressed, &mut our_out).expect("our inflate failed");

        assert_eq!(our_size, original.len());
        assert_eq!(&our_out[..our_size], original.as_slice());
        eprintln!("[CF-TEST]   ✓ Passed");
    }

    /// Test: `ConsumeFirstTable` and `TwoLevelTable`, built from the same fixed
    /// Huffman code lengths, must agree on symbol and code length for bit
    /// patterns 0..2048.
    ///
    /// Patterns are skipped when the consume-first entry is a subtable pointer
    /// or when the two-level table reports a zero length for the pattern.
    #[test]
    fn test_cf_table_comparison() {
        use crate::decompress::inflate::consume_first_table::ConsumeFirstTable;
        use crate::decompress::two_level_table::TwoLevelTable;

        // Standard fixed Huffman lengths for symbols 0..286:
        // 0-143 -> 8 bits, 144-255 -> 9 bits, 256-279 -> 7 bits, 280-285 -> 8 bits.
        let mut lit_len_lens = vec![0u8; 286];
        lit_len_lens[..144].fill(8);
        lit_len_lens[144..256].fill(9);
        lit_len_lens[256..280].fill(7);
        lit_len_lens[280..].fill(8);

        let cf_table = ConsumeFirstTable::build(&lit_len_lens).expect("CF build failed");
        let tl_table = TwoLevelTable::build(&lit_len_lens).expect("TL build failed");

        // Walk the bit patterns and count disagreements; only the first ten are printed.
        let mut mismatches = 0;
        for bits in 0u64..2048 {
            let cf_entry = cf_table.lookup_main(bits);
            let (tl_sym, tl_len) = tl_table.decode(bits);

            // Not comparable: CF defers to a subtable, or TL has no code here (len = 0).
            if cf_entry.is_subtable() || tl_len == 0 {
                continue;
            }

            // CF marks EOB by type and leaves symbol() at 0; TL reports 256.
            // Normalise CF to TL's convention before comparing.
            let cf_sym = match cf_entry.is_eob() {
                true => 256,
                false => cf_entry.symbol(),
            };
            let cf_len = cf_entry.bits() as u16;

            let tables_agree = cf_sym == tl_sym && cf_len as u32 == tl_len;
            if tables_agree {
                continue;
            }

            if mismatches < 10 {
                eprintln!("[TABLE-CMP] bits={:#06x} CF(sym={}, len={}, lit={}, eob={}, length={}) TL(sym={}, len={})",
                    bits, cf_sym, cf_len, cf_entry.is_literal(), cf_entry.is_eob(), cf_entry.is_length(), tl_sym, tl_len);
            }
            mismatches += 1;
        }

        assert!(mismatches == 0, "Found {} table mismatches!", mismatches);
        eprintln!("[CF-TEST] Tables match for all 2048 patterns");
    }

    /// Test: verify `ConsumeFirstTable` construction for long (12-bit) codes.
    ///
    /// The code-length set gives EOB (symbol 256) a 7-bit code and every
    /// literal 0-255 a 12-bit code. Canonical codewords are regenerated locally
    /// (count lengths, derive the first code per length, assign in symbol
    /// order, bit-reverse) and fed to the table. Three lookups are checked:
    ///
    /// * EOB must be flagged as EOB and consume 7 bits.
    /// * Literal 0 and literal 100 must decode to their symbol. When the main
    ///   entry is a subtable pointer, the second-level entry is checked and
    ///   main + sub bits must total 12. When the main table resolves the code
    ///   directly, the main entry itself is checked.
    ///
    /// Errors are counted and asserted to be zero at the end so that every
    /// failing check is printed before the test aborts.
    #[test]
    fn test_cf_subtable_construction() {
        use crate::decompress::inflate::consume_first_table::ConsumeFirstTable;

        // Create code lengths that require long codes:
        // Symbols 0-255 get 12-bit codes, symbol 256 (EOB) gets a 7-bit code.
        let mut code_lens = vec![0u8; 286];
        code_lens[..256].fill(12); // All literals are 12 bits
        code_lens[256] = 7; // EOB is short

        let cf_table = ConsumeFirstTable::build(&code_lens).expect("CF build failed");

        // Helper to reverse the low `n` bits of `val`; the table is looked up
        // with bit-reversed codewords.
        fn reverse_bits(mut val: u32, n: u32) -> u32 {
            let mut result = 0;
            for _ in 0..n {
                result = (result << 1) | (val & 1);
                val >>= 1;
            }
            result
        }

        // Generate codewords using canonical Huffman algorithm:
        // count codes per length, then derive the first code of each length.
        let mut bl_count = [0u32; 16];
        for &len in &code_lens {
            if len > 0 {
                bl_count[len as usize] += 1;
            }
        }
        let mut next_code = [0u32; 16];
        let mut code = 0u32;
        for bits in 1..=15 {
            code = (code + bl_count[bits - 1]) << 1;
            next_code[bits] = code;
        }

        eprintln!("\n[CF-SUBTABLE-TEST] Testing 12-bit codes with subtables:");
        eprintln!(
            "[CF-SUBTABLE-TEST]   12-bit codes: {} symbols",
            bl_count[12]
        );
        eprintln!("[CF-SUBTABLE-TEST]   7-bit codes: {} symbols", bl_count[7]);
        eprintln!(
            "[CF-SUBTABLE-TEST]   CF subtable size: {} entries",
            cf_table.sub.len()
        );

        // Running per-length code counters, advanced as symbols are "assigned".
        let mut test_next_code = next_code;
        let mut errors = 0;

        // Test EOB (symbol 256, 7 bits)
        {
            let code = test_next_code[7];
            test_next_code[7] += 1;
            let reversed = reverse_bits(code, 7);

            // Use ConsumeFirstTable to decode
            let cf_entry = cf_table.lookup_main(reversed as u64);

            eprintln!(
                "[CF-SUBTABLE-TEST]   EOB (sym=256): code={:#b}, reversed={:#b}",
                code, reversed
            );
            eprintln!(
                "[CF-SUBTABLE-TEST]     CF: sym={}, bits={}, is_eob={}",
                cf_entry.symbol(),
                cf_entry.bits(),
                cf_entry.is_eob()
            );

            // Note: EOB symbol() returns 0 (by design - we only need is_eob() check)
            if !cf_entry.is_eob() {
                eprintln!("[CF-SUBTABLE-TEST]     ERROR: EOB mismatch - not marked as EOB!");
                errors += 1;
            }
            if cf_entry.bits() != 7 {
                eprintln!("[CF-SUBTABLE-TEST]     ERROR: EOB should consume 7 bits!");
                errors += 1;
            }
        }

        // Test first 12-bit literal (symbol 0)
        {
            let sym = 0;
            let code = test_next_code[12];
            test_next_code[12] += 1;
            let reversed = reverse_bits(code, 12);

            // Use ConsumeFirstTable to decode
            let main_entry = cf_table.lookup_main(reversed as u64);
            eprintln!(
                "[CF-SUBTABLE-TEST]   Literal (sym=0): code={:#b}, reversed={:#b}",
                code, reversed
            );
            eprintln!(
                "[CF-SUBTABLE-TEST]     Main entry: sym={}, bits={}, is_subtable={}",
                main_entry.symbol(),
                main_entry.bits(),
                main_entry.is_subtable()
            );

            if main_entry.is_subtable() {
                // Need to look up in subtable.
                // NOTE(review): the shift assumes an 11-bit main table whenever a
                // subtable pointer is returned - confirm against CF_TABLE_BITS.
                let remaining_bits = reversed >> 11; // Low 11 bits consumed, use next bits
                let sub_entry = cf_table.lookup_sub(main_entry, remaining_bits as u64);
                eprintln!(
                    "[CF-SUBTABLE-TEST]     Sub entry: sym={}, bits={}, is_literal={}",
                    sub_entry.symbol(),
                    sub_entry.bits(),
                    sub_entry.is_literal()
                );
                let total_bits = main_entry.bits() + sub_entry.bits();
                eprintln!(
                    "[CF-SUBTABLE-TEST]     Total bits: {} (main={}, sub={})",
                    total_bits,
                    main_entry.bits(),
                    sub_entry.bits()
                );

                if !sub_entry.is_literal() || sub_entry.symbol() != sym as u16 {
                    eprintln!("[CF-SUBTABLE-TEST]     ERROR: Symbol mismatch!");
                    errors += 1;
                }
                if total_bits != 12 {
                    eprintln!("[CF-SUBTABLE-TEST]     ERROR: Total bits should be 12!");
                    errors += 1;
                }
            } else if !main_entry.is_literal() || main_entry.symbol() != sym as u16 {
                eprintln!("[CF-SUBTABLE-TEST]     ERROR: Direct entry mismatch!");
                errors += 1;
            }
        }

        // Test symbol 100 (12-bit literal)
        {
            let sym = 100;
            // After the symbol 0 test the counter sits on symbol 1's code;
            // advancing by 99 lands on symbol 100's code.
            test_next_code[12] += 99;
            let code = test_next_code[12];
            let reversed = reverse_bits(code, 12);

            let main_entry = cf_table.lookup_main(reversed as u64);
            eprintln!(
                "[CF-SUBTABLE-TEST]   Literal (sym=100): code={:#b}, reversed={:#b}",
                code, reversed
            );
            eprintln!(
                "[CF-SUBTABLE-TEST]     Main entry: sym={}, bits={}, is_subtable={}",
                main_entry.symbol(),
                main_entry.bits(),
                main_entry.is_subtable()
            );

            if main_entry.is_subtable() {
                let remaining_bits = reversed >> 11;
                let sub_entry = cf_table.lookup_sub(main_entry, remaining_bits as u64);
                eprintln!(
                    "[CF-SUBTABLE-TEST]     Sub entry: sym={}, bits={}, is_literal={}",
                    sub_entry.symbol(),
                    sub_entry.bits(),
                    sub_entry.is_literal()
                );
                let total_bits = main_entry.bits() + sub_entry.bits();

                if !sub_entry.is_literal() || sub_entry.symbol() != sym as u16 {
                    eprintln!("[CF-SUBTABLE-TEST]     ERROR: Symbol mismatch!");
                    errors += 1;
                }
                // Same total-length requirement as for symbol 0.
                if total_bits != 12 {
                    eprintln!("[CF-SUBTABLE-TEST]     ERROR: Total bits should be 12!");
                    errors += 1;
                }
            } else if !main_entry.is_literal() || main_entry.symbol() != sym as u16 {
                // The code resolved directly in the main table: it must still be
                // the right literal, otherwise this check would verify nothing.
                eprintln!("[CF-SUBTABLE-TEST]     ERROR: Direct entry mismatch!");
                errors += 1;
            }
        }

        assert_eq!(errors, 0, "Found {} subtable construction errors", errors);
        eprintln!("[CF-SUBTABLE-TEST] PASSED");
    }

    /// Test: Mixed code lengths with subtables on x86 / direct on ARM64
    ///
    /// Builds a 286-entry literal/length code-length set that mixes 7-, 8-, 9-, 10- and
    /// 12-bit codes, then checks two things:
    ///
    /// 1. A single 12-bit symbol (230) is resolved directly by the main table when
    ///    `CF_TABLE_BITS >= 15`, and through a subtable otherwise.
    /// 2. A packed three-symbol bitstream (`'A'`, 230, EOB) decodes back to those symbols
    ///    using main-table lookups plus, where needed, a subtable lookup.
    ///
    /// Failures are tallied in `errors` and asserted once at the end, so every diagnostic
    /// line is printed before the test fails.
    #[test]
    fn test_cf_subtable_mixed() {
        use crate::decompress::inflate::consume_first_table::{ConsumeFirstTable, CF_TABLE_BITS};
        use crate::decompress::two_level_table::TurboBits;

        // Create a more realistic code length distribution:
        // - Some short codes (frequent symbols)
        // - Some long codes > 11 bits (need subtables on x86, direct on ARM64)
        let mut code_lens = vec![0u8; 286];

        // Common literals (0-127) get short 8-bit codes
        for i in 0..128 {
            code_lens[i] = 8;
        }
        // Less common (128-191) get 9-bit codes
        for i in 128..192 {
            code_lens[i] = 9;
        }
        // Rare (192-223) get 10-bit codes
        for i in 192..224 {
            code_lens[i] = 10;
        }
        // Very rare (224-255) get 12-bit codes (needs subtables!)
        for i in 224..256 {
            code_lens[i] = 12;
        }
        // EOB (256) gets 7 bits
        code_lens[256] = 7;
        // Length codes (257-285) get 8 bits
        for i in 257..286 {
            code_lens[i] = 8;
        }

        let cf_table = ConsumeFirstTable::build(&code_lens).expect("CF build failed");

        eprintln!("\n[CF-MIXED-TEST] Mixed code lengths with subtables:");
        eprintln!(
            "[CF-MIXED-TEST]   Subtable size: {} entries",
            cf_table.sub.len()
        );

        // Helper to reverse bits: returns the low `n` bits of `val` in mirrored order.
        // The table lookups below are always fed the bit-reversed form of a code.
        fn reverse_bits(mut val: u32, n: u32) -> u32 {
            let mut result = 0;
            for _ in 0..n {
                result = (result << 1) | (val & 1);
                val >>= 1;
            }
            result
        }

        // Generate codewords using canonical Huffman algorithm
        // Step 1: count how many symbols use each code length (length 0 = unused symbol).
        let mut bl_count = [0u32; 16];
        for &len in &code_lens {
            if len > 0 {
                bl_count[len as usize] += 1;
            }
        }
        // Step 2: derive the first (numerically smallest) code for every length 1..=15.
        let mut next_code = [0u32; 16];
        let mut code = 0u32;
        for bits in 1..=15 {
            code = (code + bl_count[bits - 1]) << 1;
            next_code[bits] = code;
        }

        // Store codes for each symbol
        // Step 3: hand out consecutive codes in symbol order. Each entry is
        // (bit-reversed code, code length); unused symbols are stored as (0, 0) so the
        // vector can be indexed by symbol number.
        let mut symbol_codes: Vec<(u32, u8)> = Vec::with_capacity(286);
        let mut next_code_temp = next_code;
        for &len in code_lens.iter() {
            if len > 0 {
                let c = next_code_temp[len as usize];
                next_code_temp[len as usize] += 1;
                symbol_codes.push((reverse_bits(c, len as u32), len));
            } else {
                symbol_codes.push((0, 0));
            }
        }

        // Number of failed checks; asserted to be zero at the very end.
        let mut errors = 0;

        // Test a 12-bit symbol
        // On ARM64 (15-bit table): resolves directly, no subtable
        // On x86 (11-bit table): needs subtable lookup
        let sym = 230;
        let (reversed, len) = symbol_codes[sym];
        eprintln!(
            "[CF-MIXED-TEST]   Testing 12-bit symbol {}: code_len={}, reversed={:#b} (CF_TABLE_BITS={})",
            sym, len, reversed, CF_TABLE_BITS
        );

        let main_entry = cf_table.lookup_main(reversed as u64);
        eprintln!(
            "[CF-MIXED-TEST]     Main entry: sym={}, bits={}, is_subtable={}",
            main_entry.symbol(),
            main_entry.bits(),
            main_entry.is_subtable()
        );

        if CF_TABLE_BITS >= 15 {
            // 15-bit table: 12-bit codes resolve directly
            if main_entry.is_subtable() {
                eprintln!("[CF-MIXED-TEST]     ERROR: 12-bit code should NOT need subtable with {}-bit table!", CF_TABLE_BITS);
                errors += 1;
            } else if !main_entry.is_literal() || main_entry.symbol() != sym as u16 {
                eprintln!("[CF-MIXED-TEST]     ERROR: Symbol mismatch!");
                errors += 1;
            }
        } else {
            // 11-bit table: 12-bit codes need subtable
            if main_entry.is_subtable() {
                // The subtable is looked up with the code bits left over after dropping the
                // low `CF_TABLE_BITS` bits that the main-table lookup was given.
                let remaining_bits = reversed >> CF_TABLE_BITS;
                let sub_entry = cf_table.lookup_sub(main_entry, remaining_bits as u64);
                eprintln!(
                    "[CF-MIXED-TEST]     Sub entry: sym={}, bits={}, is_literal={}",
                    sub_entry.symbol(),
                    sub_entry.bits(),
                    sub_entry.is_literal()
                );
                // Printed for diagnostics only; the total is not asserted.
                let total_bits = main_entry.bits() + sub_entry.bits();
                eprintln!("[CF-MIXED-TEST]     Total bits: {}", total_bits);

                if !sub_entry.is_literal() || sub_entry.symbol() != sym as u16 {
                    eprintln!("[CF-MIXED-TEST]     ERROR: Symbol mismatch!");
                    errors += 1;
                }
            } else {
                eprintln!("[CF-MIXED-TEST]     ERROR: Expected subtable for 12-bit code with {}-bit table!", CF_TABLE_BITS);
                errors += 1;
            }
        }

        // Test that we can decode a sequence correctly
        // Build a bitstream with: literal 'A' (8-bit), literal 230 (12-bit), EOB (7-bit)
        let sym_a = 65;
        let sym_rare = 230;
        let (code_a, len_a) = symbol_codes[sym_a];
        let (code_rare, len_rare) = symbol_codes[sym_rare];
        let (code_eob, len_eob) = symbol_codes[256];

        // Pack bits: A first (low bits), then rare, then EOB
        // 8 + 12 + 7 = 27 bits in total, so the whole sequence fits in one u64.
        let mut bitstream: u64 = 0;
        let mut bit_pos = 0;
        bitstream |= code_a as u64;
        bit_pos += len_a as u32;
        bitstream |= (code_rare as u64) << bit_pos;
        bit_pos += len_rare as u32;
        bitstream |= (code_eob as u64) << bit_pos;

        eprintln!(
            "[CF-MIXED-TEST]   Decoding sequence: A({} bits), 230({} bits), EOB({} bits)",
            len_a, len_rare, len_eob
        );
        eprintln!("[CF-MIXED-TEST]     Bitstream: {:#066b}", bitstream);

        // Create a fake input buffer for TurboBits
        // Little-endian bytes put the first-packed (lowest) bits in the first byte.
        let buf = bitstream.to_le_bytes();
        let mut bits = TurboBits::new(&buf);
        bits.refill_branchless();

        // Decode symbol 1: should be 'A'
        // Note the order used for every symbol below: look up, consume the entry's bits,
        // and only then inspect what kind of entry it was.
        let e1 = cf_table.lookup_main(bits.buffer());
        bits.consume(e1.bits());
        if e1.is_subtable() {
            eprintln!("[CF-MIXED-TEST]     ERROR: 'A' should not need subtable!");
            errors += 1;
        } else if e1.symbol() != sym_a as u16 {
            eprintln!(
                "[CF-MIXED-TEST]     ERROR: Expected 'A' ({}), got {}!",
                sym_a,
                e1.symbol()
            );
            errors += 1;
        } else {
            eprintln!("[CF-MIXED-TEST]     Decoded 'A' correctly");
        }

        // Decode symbol 2: should be 230
        let e2_main = cf_table.lookup_main(bits.buffer());
        bits.consume(e2_main.bits());
        if CF_TABLE_BITS >= 15 {
            // 15-bit table: direct lookup
            if e2_main.is_subtable() {
                eprintln!("[CF-MIXED-TEST]     ERROR: Symbol 230 should NOT need subtable with {}-bit table!", CF_TABLE_BITS);
                errors += 1;
            } else if e2_main.symbol() != sym_rare as u16 {
                eprintln!(
                    "[CF-MIXED-TEST]     ERROR: Expected {}, got {}!",
                    sym_rare,
                    e2_main.symbol()
                );
                errors += 1;
            } else {
                eprintln!("[CF-MIXED-TEST]     Decoded {} correctly", sym_rare);
            }
        } else {
            // 11-bit table: subtable lookup
            if e2_main.is_subtable() {
                // The main entry's bits were already consumed above, so the buffer handed
                // to `lookup_sub` starts at the remaining bits of the code.
                let e2_sub = cf_table.lookup_sub(e2_main, bits.buffer());
                bits.consume(e2_sub.bits());
                if e2_sub.symbol() != sym_rare as u16 {
                    eprintln!(
                        "[CF-MIXED-TEST]     ERROR: Expected {}, got {}!",
                        sym_rare,
                        e2_sub.symbol()
                    );
                    errors += 1;
                } else {
                    eprintln!("[CF-MIXED-TEST]     Decoded {} correctly", sym_rare);
                }
            } else {
                eprintln!(
                    "[CF-MIXED-TEST]     ERROR: Symbol 230 should need subtable with {}-bit table!",
                    CF_TABLE_BITS
                );
                errors += 1;
            }
        }

        // Decode symbol 3: should be EOB
        // EOB is a 7-bit code here; the subtable branch is still handled rather than
        // reported as an error.
        let e3 = cf_table.lookup_main(bits.buffer());
        bits.consume(e3.bits());
        if e3.is_subtable() {
            let e3_sub = cf_table.lookup_sub(e3, bits.buffer());
            bits.consume(e3_sub.bits());
            if !e3_sub.is_eob() {
                eprintln!("[CF-MIXED-TEST]     ERROR: Expected EOB!");
                errors += 1;
            } else {
                eprintln!("[CF-MIXED-TEST]     Decoded EOB correctly");
            }
        } else if !e3.is_eob() {
            eprintln!("[CF-MIXED-TEST]     ERROR: Expected EOB!");
            errors += 1;
        } else {
            eprintln!("[CF-MIXED-TEST]     Decoded EOB correctly");
        }

        assert_eq!(errors, 0, "Found {} errors in mixed subtable test", errors);
        eprintln!("[CF-MIXED-TEST] PASSED");
    }

    /// Test: Verify distance table builds correctly
    ///
    /// Builds a distance table from 30 codes that are all 5 bits long and checks that the
    /// first ten symbols come back from the main table with the right symbol number, a
    /// bit count of 5, and the "length" entry type.
    #[test]
    fn test_cf_distance_table() {
        use crate::decompress::inflate::consume_first_table::ConsumeFirstTable;

        // Mirror the low `width` bits of `value`: bit `k` of the input becomes bit
        // `width - 1 - k` of the output.
        let mirror = |value: u32, width: u32| -> u32 {
            (0..width).fold(0u32, |acc, shift| (acc << 1) | ((value >> shift) & 1))
        };

        // Standard 5-bit distance codes: with every length equal, the canonical code of a
        // symbol is simply its index.
        let lengths: Vec<u8> = vec![5; 30];
        let dist_table =
            ConsumeFirstTable::build_distance(&lengths).expect("Failed to build distance table");

        // Verify a few distance symbols
        for sym in 0..10 {
            let lookup_bits = u64::from(mirror(sym as u32, 5));
            let entry = dist_table.lookup_main(lookup_bits);

            // All distance symbols should be length codes (not literals)
            assert!(
                entry.is_length(),
                "Distance symbol {} should be length type",
                sym
            );
            assert_eq!(
                entry.symbol(),
                sym,
                "Distance symbol {} decoded incorrectly",
                sym
            );
            assert_eq!(
                entry.bits(),
                5,
                "Distance symbol {} should consume 5 bits",
                sym
            );
        }

        eprintln!("[CF-TEST] Distance table builds correctly");
    }

    /// Test: Raw deflate from silesia
    ///
    /// Strips the gzip framing from `benchmark_data/silesia-gzip.tar.gz` by hand, inflates
    /// the raw deflate payload with libdeflate to obtain a reference, then runs
    /// `super::inflate_into_pub` on the same payload and compares the first 100,000 output
    /// bytes. Returns early (and therefore passes) when the data file cannot be read.
    #[test]
    fn test_cf_silesia_raw() {
        // Read silesia-gzip.tar.gz; its absence means "skip", not "fail".
        let gzip_data = if let Ok(bytes) = std::fs::read("benchmark_data/silesia-gzip.tar.gz") {
            bytes
        } else {
            eprintln!("[CF-TEST] Skipping - no silesia file");
            return;
        };

        eprintln!("\n[CF-TEST] Silesia raw deflate:");
        eprintln!("[CF-TEST]   Gzip file: {} bytes", gzip_data.len());

        // Index of the first position at which two slices differ, if any.
        let first_diff = |ours: &[u8], theirs: &[u8]| -> Option<usize> {
            ours.iter().zip(theirs.iter()).position(|(a, b)| a != b)
        };

        // Step over a NUL-terminated header field that starts at `at`, returning the index
        // just past the terminator.
        let skip_cstr = |mut at: usize| -> usize {
            while gzip_data[at] != 0 {
                at += 1;
            }
            at + 1
        };

        // Walk the gzip header: 10 fixed bytes, then the optional fields selected by FLG.
        let flags = gzip_data[3];
        let mut cursor = 10;
        if flags & 0x04 != 0 {
            // FEXTRA: 2-byte little-endian length followed by that many bytes.
            let xlen = u16::from_le_bytes([gzip_data[cursor], gzip_data[cursor + 1]]) as usize;
            cursor += 2 + xlen;
        }
        if flags & 0x08 != 0 {
            // FNAME
            cursor = skip_cstr(cursor);
        }
        if flags & 0x10 != 0 {
            // FCOMMENT
            cursor = skip_cstr(cursor);
        }
        if flags & 0x02 != 0 {
            // FHCRC: 2-byte header checksum.
            cursor += 2;
        }

        // Everything between the header and the 8-byte CRC32 + ISIZE trailer is deflate data.
        let payload_start = cursor;
        let payload_end = gzip_data.len() - 8;
        let deflate_data = &gzip_data[payload_start..payload_end];

        eprintln!(
            "[CF-TEST]   Deflate data: {} bytes (start at {})",
            deflate_data.len(),
            payload_start
        );

        // Reference output from libdeflate; the buffer size is a fixed upper bound.
        let mut reference = vec![0u8; 212_000_000];
        let reference_len = libdeflater::Decompressor::new()
            .deflate_decompress(deflate_data, &mut reference)
            .expect("libdeflate failed");

        eprintln!("[CF-TEST]   Expected output: {} bytes", reference_len);

        // Only the first 100KB is compared on success, to pinpoint the first mismatch.
        let test_size = std::cmp::min(100_000, reference_len);
        let mut turbo_out = vec![0u8; reference_len + 1000];

        let outcome = super::inflate_into_pub(deflate_data, &mut turbo_out);
        match outcome {
            Ok(size) => {
                eprintln!("[CF-TEST]   Our output: {} bytes", size);

                if let Some(pos) = first_diff(&turbo_out[..test_size], &reference[..test_size]) {
                    eprintln!(
                        "[CF-TEST]   FIRST MISMATCH at byte {}: got {} expected {}",
                        pos, turbo_out[pos], reference[pos]
                    );
                    panic!("Mismatch at byte {}", pos);
                }
                eprintln!("[CF-TEST]   ✓ First {} bytes match", test_size);
            }
            Err(e) => {
                // Still compare what we have, over the common length of both buffers.
                let cmp_size = turbo_out.len().min(reference.len());
                if let Some(pos) = first_diff(&turbo_out[..cmp_size], &reference[..cmp_size]) {
                    eprintln!("[CF-TEST]   First mismatch at byte {}", pos);
                }
                panic!("Decompression failed: {:?}", e);
            }
        }
    }

    /// Test 11: Actual silesia data, validated on the first 10MB of output
    ///
    /// A partial gzip stream cannot be decompressed, so the whole file is fed to
    /// `inflate_gzip_preallocated`; flate2's `GzDecoder` provides the reference output and
    /// only the first 10,000,000 bytes are compared. Returns early (and therefore passes)
    /// when `benchmark_data/silesia-gzip.tar.gz` cannot be read.
    ///
    /// # Panics
    ///
    /// Panics when our decoder returns an error, produces fewer bytes than the checked
    /// prefix, or produces a prefix that differs from the reference. A mismatch is reported
    /// as the first differing byte offset rather than through `assert_eq!` on the slices,
    /// which would `Debug`-print both 10MB slices.
    #[test]
    fn test_cf_silesia_small() {
        use std::io::Read;

        /// Index of the first position at which the two slices differ, if any.
        fn first_mismatch(got: &[u8], want: &[u8]) -> Option<usize> {
            got.iter().zip(want).position(|(a, b)| a != b)
        }

        // Read the file exactly once. The 1MB "chunk" is only ever reported in the log, so
        // its length is computed instead of copying the bytes.
        let full_data = match std::fs::read("benchmark_data/silesia-gzip.tar.gz") {
            Ok(d) => d,
            Err(_) => {
                eprintln!("[CF-TEST] Skipping - no silesia file");
                return;
            }
        };

        eprintln!("\n[CF-TEST] Silesia (first 1MB compressed):");
        eprintln!(
            "[CF-TEST]   Compressed chunk: {} bytes",
            full_data.len().min(1_000_000)
        );

        // Get expected output from flate2 (first 10MB is what gets checked)
        let mut expected = Vec::new();
        flate2::read::GzDecoder::new(&full_data[..])
            .read_to_end(&mut expected)
            .expect("flate2 failed to decode the reference output");
        let check_size = 10_000_000.min(expected.len());
        eprintln!(
            "[CF-TEST]   Expected total: {} bytes, checking first {} bytes",
            expected.len(),
            check_size
        );

        // Decompress with our function
        let mut our_out = Vec::new();
        let result = crate::decompress::parallel::ultra_fast_inflate::inflate_gzip_preallocated(
            &full_data,
            &mut our_out,
        );

        match result {
            Ok(our_size) => {
                eprintln!("[CF-TEST]   Our output: {} bytes", our_size);
                if our_size < check_size {
                    panic!(
                        "Output too small: {} vs expected {}",
                        our_size,
                        expected.len()
                    );
                }

                // Check first 10MB
                if let Some(pos) = first_mismatch(&our_out[..check_size], &expected[..check_size])
                {
                    panic!(
                        "Mismatch in first {} bytes: first difference at byte {} (got {}, expected {})",
                        check_size, pos, our_out[pos], expected[pos]
                    );
                }
                eprintln!("[CF-TEST]   ✓ First {} bytes match", check_size);
            }
            Err(e) => {
                // Show how far we got and where the output first diverges, then fail.
                eprintln!("[CF-TEST]   Error: {:?}", e);
                eprintln!("[CF-TEST]   Output so far: {} bytes", our_out.len());

                let cmp_size = our_out.len().min(expected.len());
                if cmp_size > 0 {
                    match first_mismatch(&our_out[..cmp_size], &expected[..cmp_size]) {
                        Some(pos) => {
                            // Up to 20 bytes starting at the mismatch, clamped to the
                            // common length.
                            let window_end = pos.saturating_add(20).min(cmp_size);
                            eprintln!("[CF-TEST]   FIRST MISMATCH at byte {}:", pos);
                            eprintln!("[CF-TEST]   Got: {:?}", &our_out[pos..window_end]);
                            eprintln!("[CF-TEST]   Exp: {:?}", &expected[pos..window_end]);

                            // Show context before
                            let ctx_start = pos.saturating_sub(10);
                            eprintln!("[CF-TEST]   Context before ({}-{}):", ctx_start, pos);
                            eprintln!(
                                "[CF-TEST]   Got: {:?}",
                                String::from_utf8_lossy(&our_out[ctx_start..pos])
                            );
                            eprintln!(
                                "[CF-TEST]   Exp: {:?}",
                                String::from_utf8_lossy(&expected[ctx_start..pos])
                            );
                        }
                        None => {
                            eprintln!("[CF-TEST]   First {} bytes match perfectly", cmp_size);
                        }
                    }
                }

                panic!("Decompression failed: {:?}", e);
            }
        }
    }

    /// Test 9: Large repetitive data - more stress
    ///
    /// Compresses 2000 repetitions of a 45-byte sentence (90,000 bytes) with flate2's raw
    /// deflate encoder at the default level, then checks that both libdeflate and
    /// `super::inflate_into_pub` reproduce the original bytes.
    #[test]
    fn test_cf_very_large_repetitive() {
        use flate2::write::DeflateEncoder;
        use flate2::Compression;
        use std::io::Write;

        // Repetitive text: one sentence repeated 2000 times.
        let sentence = b"The quick brown fox jumps over the lazy dog. ";
        let original = sentence.repeat(2000);

        let compressed = {
            let mut enc = DeflateEncoder::new(Vec::new(), Compression::default());
            enc.write_all(&original).unwrap();
            enc.finish().unwrap()
        };

        let ratio_pct = compressed.len() as f64 / original.len() as f64 * 100.0;
        eprintln!("\n[CF-TEST] Very large repetitive (100KB):");
        eprintln!("[CF-TEST]   Original: {} bytes", original.len());
        eprintln!("[CF-TEST]   Compressed: {} bytes", compressed.len());
        eprintln!("[CF-TEST]   Ratio: {:.1}%", ratio_pct);

        // Both decoders get an output buffer slightly larger than the expected result.
        let out_capacity = original.len() + 100;

        // Reference decoder first: confirms the compressed stream itself is sound.
        let mut reference_buf = vec![0u8; out_capacity];
        let reference_len = libdeflater::Decompressor::new()
            .deflate_decompress(&compressed, &mut reference_buf)
            .expect("libdeflate failed");
        assert_bytes_eq(&reference_buf[..reference_len], &original[..], "libdeflate");

        // Decoder under test.
        let mut turbo_buf = vec![0u8; out_capacity];
        let turbo_len =
            super::inflate_into_pub(&compressed, &mut turbo_buf).expect("turbo failed");
        assert_bytes_eq(
            &turbo_buf[..turbo_len],
            &original[..],
            "cf_very_large_repetitive",
        );

        eprintln!("[CF-TEST]   ✓ Passed");
    }

    /// Benchmark valid-entries table with consume-first decode loop
    ///
    /// Every table entry, including subtable pointers, carries a usable bit count, so the
    /// loop can shift the bit buffer before it looks at the entry type. The function only
    /// prints timing and the per-type tallies; it asserts nothing.
    #[test]
    fn bench_valid_entries_consume_first() {
        // Entry layout: [type:2][data:22][bits:8]
        // Type tag in the top two bits: 00=subtable, 01=literal, 10=length, 11=EOB
        const TAG_SUBTABLE: u32 = 0b00 << 30;
        const TAG_LITERAL: u32 = 0b01 << 30;
        const TAG_LENGTH: u32 = 0b10 << 30;
        const TAG_EOB: u32 = 0b11 << 30;

        const ITERATIONS: usize = 500;

        // Synthetic 12-bit table in which ALL entries are valid. The guards are tried in
        // order, so the resulting mix is roughly 1% subtable, 1% EOB, 4% length and
        // 94% literal.
        let table: Vec<u32> = (0u32..4096)
            .map(|idx| match idx {
                // Multiples of 100: subtable pointers (still carry 12 consumable bits)
                _ if idx % 100 == 0 => TAG_SUBTABLE | ((idx & 0x3FFFFF) << 8) | 12,
                // Remaining multiples of 50: EOB, 7 bits consumed
                _ if idx % 50 == 0 => TAG_EOB | 7,
                // Remaining multiples of 20: length codes, 10 bits consumed
                _ if idx % 20 == 0 => TAG_LENGTH | ((idx & 0x1F) << 8) | 10,
                // Everything else: literals, 8 bits consumed
                _ => TAG_LITERAL | ((idx & 0xFF) << 8) | 8,
            })
            .collect();

        // Deterministic pseudo-input: one 64-bit "bit buffer" per lookup.
        let bits_sequence: Vec<u64> = (0u64..100_000).map(|seed| seed * 0x1234567).collect();

        let mut literals = 0u64;
        let mut lengths = 0u64;
        let mut subtables = 0u64;
        let mut eobs = 0u64;
        // Folded from every shifted buffer so the shift cannot be optimized away.
        let mut bitbuf_accum = 0u64;

        // Consume-first decode loop (like libdeflate)
        let timer = std::time::Instant::now();
        for _ in 0..ITERATIONS {
            for &word in &bits_sequence {
                let entry = table[(word & 0xFFF) as usize];

                // CONSUME FIRST - the low byte is always a valid bit count.
                let remaining = word >> (entry & 0xFF);
                bitbuf_accum ^= remaining;

                // Then branch on type
                match entry >> 30 {
                    0b01 => literals += 1,
                    0b10 => lengths += 1,
                    0b11 => eobs += 1,
                    // subtable (a real decoder would need an extra lookup here)
                    _ => subtables += 1,
                }
            }
        }
        let elapsed = timer.elapsed();

        let total_lookups = ITERATIONS * bits_sequence.len();
        eprintln!("\n[BENCH] Valid-Entries Consume-First:");
        eprintln!("[BENCH]   Time: {:.2}ms", elapsed.as_secs_f64() * 1000.0);
        eprintln!(
            "[BENCH]   Throughput: {:.1} M entries/sec",
            total_lookups as f64 / elapsed.as_secs_f64() / 1_000_000.0
        );
        eprintln!(
            "[BENCH]   Distribution: {} lit, {} len, {} eob, {} sub (accum {})",
            literals,
            lengths,
            eobs,
            subtables,
            bitbuf_accum % 1000
        );
    }

    /// Benchmark libdeflate's consume-first pattern simulation
    ///
    /// Times two loops over the same synthetic table and input: one that tests the literal
    /// flag before shifting the bit buffer, and one that shifts unconditionally and tests
    /// afterwards. Only timings and the two checksums are printed; nothing is asserted.
    #[test]
    fn bench_consume_first_simulation() {
        // Entry layout used here: literal flag in bit 31, "match" marker in bit 30,
        // symbol in bits 16..24, bit count in the low byte.
        const LITERAL_FLAG: u32 = 0x8000_0000;
        const MATCH_FLAG: u32 = 0x4000_0000;
        const ROUNDS: u32 = 500;

        // Every 20th entry is a 10-bit match; the other 95% are 8-bit literals.
        let table: Vec<u32> = (0u32..4096)
            .map(|idx| {
                if idx % 20 == 0 {
                    MATCH_FLAG | 10
                } else {
                    LITERAL_FLAG | ((idx & 0xFF) << 16) | 8
                }
            })
            .collect();

        let bits_sequence: Vec<u64> = (0u64..100_000).map(|seed| seed * 0x1234567).collect();

        // Pattern 1: Check-first (our current approach)
        // if is_literal { consume(); process(); }
        let timer = std::time::Instant::now();
        let mut sum = 0u64;
        for _ in 0..ROUNDS {
            for &word in &bits_sequence {
                let entry = table[(word & 0xFFF) as usize];

                // CHECK FIRST: the shift only happens on the literal branch.
                if entry & LITERAL_FLAG != 0 {
                    let shifted = word >> (entry & 0xFF);
                    sum = sum.wrapping_add(u64::from(entry >> 16) & 0xFF);
                    sum ^= shifted; // keeps the shift observable
                }
            }
        }
        let elapsed_check_first = timer.elapsed();

        // Pattern 2: Consume-first (libdeflate approach)
        // consume(); if is_literal { process(); }
        let timer = std::time::Instant::now();
        let mut sum2 = 0u64;
        for _ in 0..ROUNDS {
            for &word in &bits_sequence {
                let entry = table[(word & 0xFFF) as usize];

                // CONSUME FIRST (unconditionally)
                let shifted = word >> (entry & 0xFF);

                // Then check
                if entry & LITERAL_FLAG != 0 {
                    sum2 = sum2.wrapping_add(u64::from(entry >> 16) & 0xFF);
                }
                sum2 ^= shifted; // keeps the shift observable
            }
        }
        let elapsed_consume_first = timer.elapsed();

        let check_secs = elapsed_check_first.as_secs_f64();
        let consume_secs = elapsed_consume_first.as_secs_f64();

        eprintln!("\n[BENCH] Consume-First vs Check-First (95% literals):");
        eprintln!("[BENCH]   Check-first:   {:.2}ms", check_secs * 1000.0);
        eprintln!("[BENCH]   Consume-first: {:.2}ms", consume_secs * 1000.0);
        eprintln!(
            "[BENCH]   Speedup: {:.1}%",
            (check_secs / consume_secs - 1.0) * 100.0
        );
        eprintln!("[BENCH]   (sums: {}, {} to prevent opt)", sum, sum2);
    }

    /// Combined benchmark with detailed path counting
    ///
    /// Inflates the deflate payload of `benchmark_data/silesia.tar.gz` twice as warm-up and
    /// three more times under a timer, then prints the output size (taken from the gzip
    /// ISIZE trailer) and the resulting MB/s. Returns early when the file is missing or its
    /// gzip header cannot be parsed.
    #[test]
    fn bench_with_path_counts() {
        use crate::decompress::parallel::marker_decode::skip_gzip_header;

        const WARMUP_RUNS: usize = 2;
        const TIMED_RUNS: usize = 3;

        let data = if let Ok(bytes) = std::fs::read("benchmark_data/silesia.tar.gz") {
            bytes
        } else {
            eprintln!("Skipping - no silesia.tar.gz");
            return;
        };

        // Extract deflate data (skip gzip header)
        let header_size = if let Ok(offset) = skip_gzip_header(&data) {
            offset
        } else {
            eprintln!("Skipping - not a valid gzip file");
            return;
        };
        // Payload is everything between the header and the 8-byte trailer.
        let deflate_data = &data[header_size..data.len().saturating_sub(8)];

        // Pre-allocate output (use ISIZE from trailer): the last four bytes of the file,
        // little-endian.
        let trailer = &data[data.len() - 4..];
        let expected_len =
            u32::from_le_bytes([trailer[0], trailer[1], trailer[2], trailer[3]]) as usize;
        let mut output = vec![0u8; expected_len + 1024];

        // Warm up; results are deliberately ignored here.
        for _ in 0..WARMUP_RUNS {
            let _ = inflate_into_pub(deflate_data, &mut output);
        }

        // Run with GZIPPY_TRACE to get path counts
        eprintln!("\n[BENCH] Real-world data (silesia) with GZIPPY_TRACE=1:");
        eprintln!("[BENCH]   Set GZIPPY_TRACE=1 to see detailed path counts");

        let timer = std::time::Instant::now();
        for _ in 0..TIMED_RUNS {
            let produced = inflate_into_pub(deflate_data, &mut output).unwrap();
            assert!(produced > 0);
        }
        let elapsed = timer.elapsed();

        let total_bytes = expected_len * TIMED_RUNS;
        let mb_per_sec = total_bytes as f64 / elapsed.as_secs_f64() / 1_000_000.0;

        eprintln!("[BENCH]   Output size: {} bytes", expected_len);
        eprintln!("[BENCH]   Speed: {:.1} MB/s", mb_per_sec);
    }

    /// Benchmark: Turbo path vs libdeflate on silesia
    ///
    /// Inflates the deflate payload of `benchmark_data/silesia-gzip.tar.gz` five times with
    /// libdeflate and five times with `inflate_into_pub`, then prints per-run time, MB/s
    /// and the ratio between the two. Throughput is based on the ISIZE trailer value.
    ///
    /// Returns early (and therefore passes) when the file is missing, its gzip header
    /// cannot be parsed, or it is too short to contain a payload plus the 8-byte trailer.
    ///
    /// # Panics
    ///
    /// Panics if either decoder fails during the timed runs or reports zero output bytes.
    #[test]
    fn bench_turbo_vs_libdeflate() {
        const ITERATIONS: u32 = 5;

        let gzip_data = match std::fs::read("benchmark_data/silesia-gzip.tar.gz") {
            Ok(d) => d,
            Err(_) => {
                eprintln!("[BENCH] Skipping - no silesia file");
                return;
            }
        };

        // Extract deflate data from gzip.
        // The shared header parser is used instead of a hand-rolled offset: the previous
        // inline computation only understood FNAME, so a file with FEXTRA, FCOMMENT or
        // FHCRC set would have been sliced at the wrong offset, and a missing NUL was
        // silently turned into offset 11.
        // NOTE(review): assumes `skip_gzip_header` returns the index of the first deflate
        // byte, which is how `bench_with_path_counts` uses it.
        let deflate_start =
            match crate::decompress::parallel::marker_decode::skip_gzip_header(&gzip_data) {
                Ok(n) => n,
                Err(_) => {
                    eprintln!("[BENCH] Skipping - not a valid gzip file");
                    return;
                }
            };
        // The last 8 bytes are the CRC32 + ISIZE trailer.
        let deflate_end = gzip_data.len().saturating_sub(8);
        if deflate_end <= deflate_start {
            eprintln!("[BENCH] Skipping - gzip file too short");
            return;
        }
        let deflate_data = &gzip_data[deflate_start..deflate_end];

        // Get expected size from ISIZE (last four bytes, little-endian). The guard above
        // guarantees the file holds at least 8 bytes.
        let isize_bytes = &gzip_data[gzip_data.len() - 4..];
        let expected_len = u32::from_le_bytes([
            isize_bytes[0],
            isize_bytes[1],
            isize_bytes[2],
            isize_bytes[3],
        ]) as usize;
        let mut output = vec![0u8; expected_len + 1000];
        let total_bytes = (expected_len * ITERATIONS as usize) as f64;

        // Warmup: one run of each decoder; results are deliberately ignored.
        let _ = inflate_into_pub(deflate_data, &mut output);
        let _ = libdeflater::Decompressor::new().deflate_decompress(deflate_data, &mut output);

        eprintln!("\n=== Turbo Path vs libdeflate (silesia) ===");

        // Benchmark libdeflate
        let start = std::time::Instant::now();
        for _ in 0..ITERATIONS {
            let size = libdeflater::Decompressor::new()
                .deflate_decompress(deflate_data, &mut output)
                .unwrap();
            assert!(size > 0);
        }
        let libdeflate_time = start.elapsed();
        let libdeflate_speed = total_bytes / libdeflate_time.as_secs_f64() / 1_000_000.0;

        // Benchmark our turbo path
        let start = std::time::Instant::now();
        for _ in 0..ITERATIONS {
            let size = inflate_into_pub(deflate_data, &mut output).unwrap();
            assert!(size > 0);
        }
        let turbo_time = start.elapsed();
        let turbo_speed = total_bytes / turbo_time.as_secs_f64() / 1_000_000.0;

        let ratio = turbo_speed / libdeflate_speed * 100.0;

        eprintln!(
            "libdeflate (C):   {:>8.1?} = {:>7.1} MB/s",
            libdeflate_time / ITERATIONS,
            libdeflate_speed
        );
        eprintln!(
            "Turbo (Rust):     {:>8.1?} = {:>7.1} MB/s",
            turbo_time / ITERATIONS,
            turbo_speed
        );
        eprintln!("Ratio: Turbo is {:.1}% of libdeflate", ratio);
    }

    /// Diagnostic test for tarball L1 decompression bug
    /// Creates mixed binary/text data (like a tarball), compresses with level 1,
    /// then decompresses and shows EXACTLY where first mismatch occurs.
    #[test]
    fn test_tarball_l1_diagnostic() {
        use flate2::write::GzEncoder;
        use flate2::Compression;
        use std::io::Write;

        eprintln!("\n{}", "=".repeat(70));
        eprintln!("DIAGNOSTIC: Tarball L1 Decompression Test");
        eprintln!("{}\n", "=".repeat(70));

        // Create tarball-like mixed data: source code + binary + repetitive
        let mut original = Vec::new();

        // Part 1: Source code patterns (like .rs files)
        for i in 0..1000 {
            let line = format!(
                "    fn function_{}(arg: u32) -> Result<String, Error> {{\n        \
                 let value = arg * {} + {};\n        Ok(format!(\"result: {{}}\", value))\n    }}\n\n",
                i,
                i % 17,
                i % 31
            );
            original.extend_from_slice(line.as_bytes());
        }
        eprintln!("Part 1 (source code): {} bytes", original.len());

        // Part 2: Binary-ish data (like .git objects)
        let binary_start = original.len();
        for i in 0..50000 {
            let b = ((i * 0x1234567) ^ (i >> 3)) as u8;
            original.push(b);
        }
        eprintln!(
            "Part 2 (binary): {} bytes (starts at {})",
            original.len() - binary_start,
            binary_start
        );

        // Part 3: Highly repetitive (triggers RLE)
        let rle_start = original.len();
        for _ in 0..10000 {
            original.extend_from_slice(b"AAAAAAAAAAAAAAAA");
        }
        eprintln!(
            "Part 3 (repetitive): {} bytes (starts at {})",
            original.len() - rle_start,
            rle_start
        );

        // Part 4: More source code
        let code2_start = original.len();
        for i in 0..500 {
            let line = format!(
                "// Comment line {} with some text\nconst VALUE_{}: u64 = {};\n",
                i,
                i,
                i * 12345
            );
            original.extend_from_slice(line.as_bytes());
        }
        eprintln!(
            "Part 4 (more code): {} bytes (starts at {})",
            original.len() - code2_start,
            code2_start
        );

        eprintln!("\nTotal original size: {} bytes", original.len());

        // Compress with level 1 (like gzip -1)
        let mut encoder = GzEncoder::new(Vec::new(), Compression::fast());
        encoder.write_all(&original).unwrap();
        let compressed = encoder.finish().unwrap();
        eprintln!(
            "Compressed size: {} bytes (ratio {:.1}%)",
            compressed.len(),
            compressed.len() as f64 / original.len() as f64 * 100.0
        );

        // Parse gzip header to get deflate data
        // Gzip format: magic(2) + method(1) + flags(1) + mtime(4) + xfl(1) + os(1) = 10 bytes min
        assert!(compressed[0] == 0x1f && compressed[1] == 0x8b, "Not gzip");
        let flags = compressed[3];
        let mut header_size = 10;
        // FEXTRA
        if flags & 0x04 != 0 {
            let xlen = u16::from_le_bytes([compressed[10], compressed[11]]) as usize;
            header_size = 12 + xlen;
        }
        // FNAME (null-terminated)
        if flags & 0x08 != 0 {
            while header_size < compressed.len() && compressed[header_size] != 0 {
                header_size += 1;
            }
            header_size += 1;
        }
        // FCOMMENT (null-terminated)
        if flags & 0x10 != 0 {
            while header_size < compressed.len() && compressed[header_size] != 0 {
                header_size += 1;
            }
            header_size += 1;
        }
        // FHCRC
        if flags & 0x02 != 0 {
            header_size += 2;
        }
        let deflate_data = &compressed[header_size..compressed.len() - 8];
        eprintln!(
            "Deflate data: {} bytes (header={}, trailer=8)",
            deflate_data.len(),
            header_size
        );

        // Decompress with our decoder
        let mut output = vec![0u8; original.len() + 1024];
        eprintln!("\nDecompressing with consume_first_decode...");

        let result = crate::decompress::inflate::consume_first_decode::inflate_consume_first(
            deflate_data,
            &mut output,
        );

        match result {
            Ok(decompressed_size) => {
                eprintln!("Decompression succeeded: {} bytes", decompressed_size);

                if decompressed_size != original.len() {
                    eprintln!(
                        "\n*** SIZE MISMATCH: expected {}, got {} ***",
                        original.len(),
                        decompressed_size
                    );
                }

                // Find first mismatch
                let check_len = decompressed_size.min(original.len());
                let mut mismatch_pos = None;

                for i in 0..check_len {
                    if output[i] != original[i] {
                        mismatch_pos = Some(i);
                        break;
                    }
                }

                if let Some(pos) = mismatch_pos {
                    eprintln!("\n{}", "*".repeat(70));
                    eprintln!("*** FIRST MISMATCH AT POSITION {} ***", pos);
                    eprintln!("{}", "*".repeat(70));

                    // Determine which section the mismatch is in
                    let section = if pos < binary_start {
                        "Part 1 (source code)"
                    } else if pos < rle_start {
                        "Part 2 (binary)"
                    } else if pos < code2_start {
                        "Part 3 (repetitive)"
                    } else {
                        "Part 4 (more code)"
                    };
                    eprintln!("Section: {}", section);

                    // Show context around mismatch
                    let ctx_start = pos.saturating_sub(32);
                    let ctx_end = (pos + 64).min(check_len);

                    eprintln!("\nExpected bytes around position {}:", pos);
                    eprintln!("  Offset: {:6} to {:6}", ctx_start, ctx_end);
                    eprint!("  Hex:   ");
                    for i in ctx_start..ctx_end {
                        if i == pos {
                            eprint!("[{:02x}]", original[i]);
                        } else {
                            eprint!(" {:02x} ", original[i]);
                        }
                    }
                    eprintln!();
                    eprint!("  ASCII: ");
                    for i in ctx_start..ctx_end {
                        let c = original[i];
                        let ch = if (32..127).contains(&c) {
                            c as char
                        } else {
                            '.'
                        };
                        if i == pos {
                            eprint!("[{}]", ch);
                        } else {
                            eprint!(" {} ", ch);
                        }
                    }
                    eprintln!();

                    eprintln!("\nActual bytes (our output):");
                    eprint!("  Hex:   ");
                    for i in ctx_start..ctx_end {
                        if i == pos {
                            eprint!("[{:02x}]", output[i]);
                        } else {
                            eprint!(" {:02x} ", output[i]);
                        }
                    }
                    eprintln!();
                    eprint!("  ASCII: ");
                    for i in ctx_start..ctx_end {
                        let c = output[i];
                        let ch = if (32..127).contains(&c) {
                            c as char
                        } else {
                            '.'
                        };
                        if i == pos {
                            eprint!("[{}]", ch);
                        } else {
                            eprint!(" {} ", ch);
                        }
                    }
                    eprintln!();

                    eprintln!(
                        "\nAt position {}: expected 0x{:02x} ('{}'), got 0x{:02x} ('{}')",
                        pos,
                        original[pos],
                        if original[pos] >= 32 && original[pos] < 127 {
                            original[pos] as char
                        } else {
                            '.'
                        },
                        output[pos],
                        if output[pos] >= 32 && output[pos] < 127 {
                            output[pos] as char
                        } else {
                            '.'
                        }
                    );

                    // Count total mismatches
                    let total_mismatches =
                        (0..check_len).filter(|&i| output[i] != original[i]).count();
                    eprintln!(
                        "\nTotal mismatches: {} out of {} bytes ({:.2}%)",
                        total_mismatches,
                        check_len,
                        total_mismatches as f64 / check_len as f64 * 100.0
                    );

                    panic!(
                        "Decompression mismatch at position {} (expected 0x{:02x}, got 0x{:02x})",
                        pos, original[pos], output[pos]
                    );
                } else if decompressed_size == original.len() {
                    eprintln!("\n*** SUCCESS: Output matches original exactly! ***");
                } else {
                    panic!(
                        "Size mismatch but no byte mismatch in overlap - expected {}, got {}",
                        original.len(),
                        decompressed_size
                    );
                }
            }
            Err(e) => {
                eprintln!("\n*** DECOMPRESSION FAILED: {} ***", e);
                panic!("Decompression failed: {}", e);
            }
        }
    }

    /// Round-trip data through the system `gzip` binary (if available).
    ///
    /// Compresses a mixed text / binary / repetitive payload with `gzip -1`,
    /// strips the gzip header and the 8-byte trailer, and checks that
    /// `inflate_consume_first` reproduces the original bytes exactly.
    /// Returns early (the test passes) when `gzip` cannot be spawned.
    #[test]
    fn test_with_system_gzip() {
        use crate::decompress::inflate::consume_first_decode::inflate_consume_first;
        use std::process::Command;

        // Check if gzip is available
        if Command::new("gzip").arg("--version").output().is_err() {
            eprintln!("gzip not available, skipping system gzip test");
            return;
        }

        eprintln!("\n{}", "=".repeat(70));
        eprintln!("Testing with SYSTEM GZIP binary");
        eprintln!("{}\n", "=".repeat(70));

        // Create tarball-like data
        let mut original = Vec::new();

        // Mixed content similar to a real tarball
        for i in 0..500 {
            let line = format!(
                "pub fn func_{}(x: i32) -> i32 {{ x * {} + {} }}\n",
                i,
                i % 17,
                i % 31
            );
            original.extend_from_slice(line.as_bytes());
        }
        // Binary-ish section. The multiplication is explicitly wrapping: with
        // an inferred `i32` index the product exceeds `i32::MAX` from i = 113
        // onwards and panics when overflow checks are enabled. Only the low
        // byte is kept, so wrapping `u32` arithmetic yields the same bytes as
        // a wrapping `i32` computation would.
        for i in 0..20_000u32 {
            original.push((i.wrapping_mul(0x1234567) ^ (i >> 3)) as u8);
        }
        // Repetitive section
        for _ in 0..5000 {
            original.extend_from_slice(b"REPEATREPEAT");
        }

        eprintln!("Original data: {} bytes", original.len());

        // Write to temp file. Defensive: env::temp_dir() can resolve to a
        // path that doesn't actually exist if TMPDIR is set to a stale
        // value (e.g. a leftover `mktemp -d` from a previous shell), so
        // ensure the directory exists before writing into it.
        let tmp_dir = std::env::temp_dir();
        std::fs::create_dir_all(&tmp_dir).ok();
        // The process id keeps concurrent runs (other checkouts, CI jobs
        // sharing a temp dir) from clobbering each other's files.
        let stem = format!("gzip_test_input_{}", std::process::id());
        let input_path = tmp_dir.join(format!("{}.bin", stem));
        let compressed_path = tmp_dir.join(format!("{}.bin.gz", stem));

        std::fs::write(&input_path, &original).expect("Failed to write input");

        // Compress with gzip -1 (fast)
        let gzip_run = Command::new("gzip")
            .arg("-1")
            .arg("-f") // force overwrite
            .arg("-k") // keep original
            .arg(&input_path)
            .output();

        // The input file is no longer needed once gzip has run (or failed to
        // start); remove it before anything below can panic.
        let _ = std::fs::remove_file(&input_path);
        let output = gzip_run.expect("Failed to run gzip");

        if !output.status.success() {
            // A failed run may still have left a partial output file behind.
            let _ = std::fs::remove_file(&compressed_path);
            eprintln!("gzip failed: {}", String::from_utf8_lossy(&output.stderr));
            panic!("gzip compression failed");
        }

        // gzip creates .bin.gz (correct) or .gz (old gzippy ≤0.1.4 bug with -f)
        let compressed_path = if compressed_path.exists() {
            compressed_path
        } else {
            tmp_dir.join(format!("{}.gz", stem))
        };
        // Read first, clean up second, unwrap third: the temp file is removed
        // even when the read fails.
        let read_result = std::fs::read(&compressed_path);
        let _ = std::fs::remove_file(&compressed_path);
        let compressed = read_result.expect("Failed to read compressed file");
        eprintln!(
            "Compressed with gzip -1: {} bytes ({:.1}%)",
            compressed.len(),
            compressed.len() as f64 / original.len() as f64 * 100.0
        );

        // Parse gzip header: 10 fixed bytes, then the optional FEXTRA, FNAME,
        // FCOMMENT and FHCRC fields selected by the flag byte. 18 bytes is the
        // minimum (10-byte header + 8-byte trailer).
        assert!(
            compressed.len() >= 18 && compressed[0] == 0x1f && compressed[1] == 0x8b,
            "Not gzip format"
        );
        let flags = compressed[3];
        let mut header_size = 10;
        if flags & 0x04 != 0 {
            let xlen = u16::from_le_bytes([compressed[10], compressed[11]]) as usize;
            header_size = 12 + xlen;
        }
        if flags & 0x08 != 0 {
            while header_size < compressed.len() && compressed[header_size] != 0 {
                header_size += 1;
            }
            header_size += 1;
        }
        if flags & 0x10 != 0 {
            while header_size < compressed.len() && compressed[header_size] != 0 {
                header_size += 1;
            }
            header_size += 1;
        }
        if flags & 0x02 != 0 {
            header_size += 2;
        }
        assert!(
            header_size + 8 <= compressed.len(),
            "Truncated gzip stream"
        );
        let deflate_data = &compressed[header_size..compressed.len() - 8];
        eprintln!("Deflate data: {} bytes", deflate_data.len());

        // Decompress
        let mut output_buf = vec![0u8; original.len() + 1024];
        eprintln!("Decompressing...");

        let size = match inflate_consume_first(deflate_data, &mut output_buf) {
            Ok(size) => size,
            Err(e) => panic!("Decompression failed: {}", e),
        };
        eprintln!("Decompressed: {} bytes", size);

        if size != original.len() {
            panic!("Size mismatch: expected {}, got {}", original.len(), size);
        }

        let first_mismatch = output_buf[..original.len()]
            .iter()
            .zip(original.iter())
            .position(|(got, want)| got != want);

        if let Some(i) = first_mismatch {
            eprintln!("\n{}", "*".repeat(70));
            eprintln!("MISMATCH at position {}", i);
            eprintln!("{}", "*".repeat(70));

            let start = i.saturating_sub(32);
            let end = (i + 64).min(original.len());

            // Hex dump of `bytes[start..end]` with the mismatching byte bracketed.
            let dump = |bytes: &[u8]| {
                eprint!("  ");
                for j in start..end {
                    if j == i {
                        eprint!("[{:02x}]", bytes[j]);
                    } else {
                        eprint!("{:02x} ", bytes[j]);
                    }
                }
                eprintln!();
            };

            eprintln!("\nExpected around {}:", i);
            dump(&original);

            eprintln!("\nGot:");
            dump(&output_buf);

            panic!(
                "Mismatch at {}: expected 0x{:02x}, got 0x{:02x}",
                i, original[i], output_buf[i]
            );
        }
        eprintln!("\n*** SUCCESS: Output matches original ***");
    }

    // ── Correctness regression tests for copy_match_into ──────────────────────

    /// A match copy that would run past the end of the output buffer has to
    /// panic instead of quietly handing back the unchanged `out_pos`.
    ///
    /// Previously the function returned `out_pos` as-is, so the following
    /// write landed on the same position again → silent data corruption.
    #[test]
    #[should_panic(expected = "output buffer overflow")]
    fn test_copy_match_into_overflow_panics() {
        // Ten bytes of known content.
        let mut output = b"ABCDEFGHIJ".to_vec();
        // out_pos=8 + length=5 = 13 > 10: must panic, not silently drop
        let (out_pos, dist, len) = (8, 4, 5);
        copy_match_into(&mut output, out_pos, dist, len);
    }

    /// After an overflow-at-position-N, the bytes before N must not be
    /// overwritten. With the old silent-return behaviour a subsequent call
    /// would write at N again, clobbering data.
    ///
    /// The test first performs a valid copy, then attempts a copy that runs
    /// past the end of the buffer, catches the resulting panic, and verifies
    /// that everything before the failed write position is still intact.
    #[test]
    fn test_copy_match_into_overflow_does_not_corrupt_previous() {
        use std::panic::{catch_unwind, AssertUnwindSafe};

        // 12-byte buffer with a known pattern.
        let mut output = b"ABCDEFGHIJKL".to_vec();

        // Non-overflowing copy into positions 6..10 (dist=6, len=4) — must succeed.
        let new_pos = copy_match_into(&mut output, 6, 6, 4);
        assert_eq!(new_pos, 10);
        // Positions 0..6 must be untouched.
        assert_eq!(&output[..6], b"ABCDEF");

        // Snapshot everything written so far, then attempt a copy that would
        // end at byte 14 of a 12-byte buffer (out_pos=10 + len=4, overflows by 2).
        let before = output[..10].to_vec();
        let overflow = catch_unwind(AssertUnwindSafe(|| {
            copy_match_into(&mut output, 10, 6, 4)
        }));
        assert!(
            overflow.is_err(),
            "overflowing copy must panic instead of returning a position"
        );

        // Nothing before the failed write position may have been mutated.
        assert_eq!(&output[..10], &before[..]);
    }

    /// Test copy_match_fast edge cases that might differ on x86_64.
    ///
    /// Each case builds an input intended to make the compressor emit matches
    /// of a particular shape, gzips it at the fastest level, strips the gzip
    /// header and 8-byte trailer, and checks that `inflate_consume_first`
    /// reproduces the input byte-for-byte.
    #[test]
    fn test_copy_match_edge_cases() {
        use crate::decompress::inflate::consume_first_decode::inflate_consume_first;
        use flate2::write::GzEncoder;
        use flate2::Compression;
        use std::io::Write;

        // Test cases designed to exercise copy_match_fast edge cases:
        // 1. dist >= 32 && len >= 64 (SIMD path on ARM, scalar on x86_64)
        // 2. dist >= 8 (5-word unroll path)
        // 3. dist == 1 (RLE path)
        // 4. dist 2-7 (small distance path)

        let test_cases: Vec<(&str, Vec<u8>)> = vec![
            // Case 1: Large distance, large length (SIMD path)
            ("large_dist_len", {
                let mut data = vec![0u8; 10000];
                // Write pattern at start
                for i in 0..100 {
                    data[i] = (i * 7) as u8;
                }
                // Repeat the 100-byte pattern 5000 bytes later (distance > 32,
                // length > 64). The source must be the pattern itself: copying
                // from the zero-filled area behind it would leave the buffer
                // unchanged and produce no patterned match at all.
                data.copy_within(0..100, 5000);
                data
            }),
            // Case 2: Medium distance (dist >= 8)
            ("medium_dist", {
                let mut data = vec![0u8; 1000];
                for i in 0..100 {
                    data[i] = (i * 3) as u8;
                }
                // Continue directly after the pattern so each byte repeats the
                // one 10 positions earlier (distance = 10). Starting any later
                // would only propagate the zeros that follow the pattern.
                for i in 100..200 {
                    data[i] = data[i - 10];
                }
                data
            }),
            // Case 3: RLE (dist == 1)
            ("rle_pattern", {
                let mut data = Vec::new();
                data.extend_from_slice(b"Hello");
                data.extend(std::iter::repeat_n(b'A', 1000)); // RLE
                data.extend_from_slice(b"World");
                data.extend(std::iter::repeat_n(b'B', 500)); // More RLE
                data
            }),
            // Case 4: Small distance (2-7)
            ("small_dist", {
                let mut data = Vec::new();
                data.extend_from_slice(b"ABCDEFGH"); // 8 bytes
                // Repeat with distance 3 (will use small distance path)
                for _ in 0..100 {
                    data.extend_from_slice(b"ABC");
                }
                data
            }),
            // Case 5: Very long RLE (100KB of 'B')
            ("long_rle", vec![0x42u8; 100_000]),
            // Case 6: Alternating patterns
            ("alternating", {
                let mut data = Vec::new();
                for i in 0..10000 {
                    data.push(if i % 2 == 0 { 0xAA } else { 0x55 });
                }
                data
            }),
            // Case 7: Binary data with matches at various distances
            ("binary_matches", {
                let mut data = vec![0u8; 50000];
                // Write unique pattern every 100 bytes
                for i in 0..500 {
                    let base = i * 100;
                    for j in 0..50 {
                        data[base + j] = ((i * 7 + j * 13) & 0xFF) as u8;
                    }
                }
                // Now create matches at various distances
                for i in 0..200 {
                    let src = (i * 50) % 25000;
                    let dst = 25000 + i * 100;
                    for j in 0..50 {
                        if dst + j < data.len() {
                            data[dst + j] = data[src + j];
                        }
                    }
                }
                data
            }),
        ];

        for (name, original) in test_cases {
            eprintln!("\n--- Testing: {} ({} bytes) ---", name, original.len());

            // Compress with level 1
            let mut encoder = GzEncoder::new(Vec::new(), Compression::fast());
            encoder.write_all(&original).unwrap();
            let compressed = encoder.finish().unwrap();

            eprintln!(
                "  Compressed: {} bytes ({:.1}%)",
                compressed.len(),
                compressed.len() as f64 / original.len() as f64 * 100.0
            );

            // Parse gzip header: 10 fixed bytes plus the optional FEXTRA,
            // FNAME, FCOMMENT and FHCRC fields selected by the flag byte.
            assert!(compressed[0] == 0x1f && compressed[1] == 0x8b);
            let flags = compressed[3];
            let mut header_size = 10;
            if flags & 0x04 != 0 {
                let xlen = u16::from_le_bytes([compressed[10], compressed[11]]) as usize;
                header_size = 12 + xlen;
            }
            if flags & 0x08 != 0 {
                while header_size < compressed.len() && compressed[header_size] != 0 {
                    header_size += 1;
                }
                header_size += 1;
            }
            if flags & 0x10 != 0 {
                while header_size < compressed.len() && compressed[header_size] != 0 {
                    header_size += 1;
                }
                header_size += 1;
            }
            if flags & 0x02 != 0 {
                header_size += 2;
            }
            // Everything between the header and the 8-byte trailer is the
            // raw deflate stream.
            let deflate_data = &compressed[header_size..compressed.len() - 8];

            // Decompress
            let mut output = vec![0u8; original.len() + 1024];
            let size = inflate_consume_first(deflate_data, &mut output)
                .unwrap_or_else(|e| panic!("Decompression failed for {}: {}", name, e));

            // Compare
            assert_eq!(size, original.len(), "{}: size mismatch", name);

            let first_mismatch = output[..original.len()]
                .iter()
                .zip(original.iter())
                .position(|(got, want)| got != want);

            if let Some(i) = first_mismatch {
                eprintln!(
                    "  MISMATCH at {}: expected 0x{:02x}, got 0x{:02x}",
                    i, original[i], output[i]
                );
                // Show context
                let start = i.saturating_sub(16);
                let end = (i + 16).min(original.len());
                eprintln!("  Expected: {:?}", &original[start..end]);
                eprintln!("  Got:      {:?}", &output[start..end]);
                panic!("{}: mismatch at position {}", name, i);
            }
            eprintln!("  OK!");
        }
    }

    /// BGZF round-trip over ~1MB of barely-compressible input.
    ///
    /// Targets the CI failure where gzippy→gzippy breaks on tarball data at
    /// L1 T1. The generated bytes are mostly incompressible (like git
    /// packfiles inside a tarball) with occasional short runs mixed in.
    /// The compressed stream is first validated member-by-member with the
    /// sequential decompressor, then fed through the parallel BGZF path.
    #[test]
    fn test_bgzf_roundtrip_barely_compressible() {
        use crate::compress::parallel::GzipHeaderInfo;

        const TARGET_LEN: usize = 1024 * 1024;
        const BLOCK_SIZE: usize = 128 * 1024; // 128KB, same as default

        // Generate random-ish bytes with some structure.
        let mut data = Vec::with_capacity(TARGET_LEN);
        let mut state: u64 = 0xdeadbeef;
        loop {
            if data.len() >= TARGET_LEN {
                break;
            }
            // One generator step yields eight random-looking bytes.
            state = state
                .wrapping_mul(6364136223846793005)
                .wrapping_add(1442695040888963407);
            data.extend_from_slice(&state.to_le_bytes());

            // ~12% of steps append a short run of one byte
            // (simulates tar headers/alignment).
            let wants_run = (state & 0xff) < 30;
            if wants_run {
                let run_len = ((state >> 8) & 0x3f) as usize + 1;
                let run_byte = (state >> 16) as u8;
                let grown = data.len() + run_len;
                data.resize(grown, run_byte);
            }
        }
        data.truncate(TARGET_LEN);

        // Compress with our BGZF block writer (simulating gzippy -1)
        let header_info = GzipHeaderInfo {
            filename: Some("test.tar".to_string()),
            mtime: 1700000000,
            comment: None,
        };

        let mut compressed = Vec::new();
        data.chunks(BLOCK_SIZE).for_each(|chunk| {
            let mut block_output = Vec::new();
            crate::compress::parallel::compress_block_bgzf_libdeflate(
                &mut block_output,
                chunk,
                1, // L1
                &header_info,
            );
            compressed.extend_from_slice(&block_output);
        });

        // Sanity check: walk the stream one gzip member at a time with
        // gzip_decompress_ex before exercising the parallel path.
        let mut decompressor = crate::backends::libdeflate::DecompressorEx::new();
        let mut verify_buf = vec![0u8; data.len() + 1024];
        let mut written = 0;
        let mut consumed = 0;
        while consumed < compressed.len() {
            let at_member_start =
                compressed[consumed] == 0x1f && compressed[consumed + 1] == 0x8b;
            if !at_member_start {
                break;
            }
            let member = decompressor
                .gzip_decompress_ex(&compressed[consumed..], &mut verify_buf[written..])
                .expect("sequential gzip_decompress_ex should succeed");
            written += member.output_size;
            consumed += member.input_consumed;
        }
        assert_eq!(
            written,
            data.len(),
            "sequential decompress should produce correct size"
        );
        assert_eq!(
            &verify_buf[..written],
            &data[..],
            "sequential decompress should match original"
        );

        // Now test the BGZF parallel path
        assert!(
            crate::decompress::format::has_bgzf_markers(&compressed),
            "compressed data should have BGZF markers"
        );

        let parallel_output =
            decompress_bgzf_parallel_to_vec(&compressed, 4).expect("BGZF parallel should succeed");

        assert_eq!(
            parallel_output.len(),
            data.len(),
            "BGZF parallel output size mismatch: expected={} got={} delta={}",
            data.len(),
            parallel_output.len(),
            parallel_output.len() as i64 - data.len() as i64
        );

        if parallel_output == data {
            return;
        }

        // Report the index of the first differing byte.
        let shorter = data.len().min(parallel_output.len());
        let first_diff = parallel_output
            .iter()
            .zip(data.iter())
            .position(|(got, want)| got != want)
            .unwrap_or(shorter);
        panic!(
            "BGZF parallel output content mismatch: first_diff_at={} of {}",
            first_diff,
            data.len()
        );
    }

    /// Larger (10MB) variant of the barely-compressible BGZF round-trip,
    /// sized to match CI conditions. Only the parallel path is checked here.
    #[test]
    fn test_bgzf_roundtrip_large_barely_compressible() {
        use crate::compress::parallel::GzipHeaderInfo;

        const TARGET_LEN: usize = 10 * 1024 * 1024;
        const BLOCK_SIZE: usize = 128 * 1024;

        // Mostly random-looking bytes with occasional short single-byte runs.
        let mut data = Vec::with_capacity(TARGET_LEN);
        let mut state: u64 = 0xcafebabe;
        loop {
            if data.len() >= TARGET_LEN {
                break;
            }
            state = state
                .wrapping_mul(6364136223846793005)
                .wrapping_add(1442695040888963407);
            data.extend_from_slice(&state.to_le_bytes());

            let wants_run = (state & 0xff) < 30;
            if wants_run {
                let run_len = ((state >> 8) & 0x3f) as usize + 1;
                let run_byte = (state >> 16) as u8;
                let grown = data.len() + run_len;
                data.resize(grown, run_byte);
            }
        }
        data.truncate(TARGET_LEN);

        let header_info = GzipHeaderInfo {
            filename: Some("test.tar".to_string()),
            mtime: 1700000000,
            comment: None,
        };

        // Compress each 128KB chunk as its own block and concatenate them.
        let mut compressed = Vec::new();
        data.chunks(BLOCK_SIZE).for_each(|chunk| {
            let mut block_output = Vec::new();
            crate::compress::parallel::compress_block_bgzf_libdeflate(
                &mut block_output,
                chunk,
                1,
                &header_info,
            );
            compressed.extend_from_slice(&block_output);
        });

        assert!(crate::decompress::format::has_bgzf_markers(&compressed));

        let parallel_output =
            decompress_bgzf_parallel_to_vec(&compressed, 4).expect("BGZF parallel should succeed");

        // Check the length first so a size regression reports its delta
        // before the (much larger) content comparison runs.
        assert_eq!(
            parallel_output.len(),
            data.len(),
            "Large BGZF parallel output size: expected={} got={} delta={}",
            data.len(),
            parallel_output.len(),
            parallel_output.len() as i64 - data.len() as i64
        );
        assert_eq!(
            parallel_output, data,
            "Large BGZF parallel output content mismatch"
        );
    }

    /// Test that a single-member gzip file containing embedded gzip data
    /// doesn't produce extra bytes during multi-member parallel decompression.
    ///
    /// This reproduces the CI bug: a tarball containing .gz files is compressed
    /// as a single gzip member at L1. The embedded .gz files can appear as
    /// literal bytes in the deflate stream, and the multi-member parallel
    /// decompressor must not mistake them for member boundaries.
    ///
    /// Either outcome is accepted from the parallel decoder: an exact
    /// reproduction of the payload, or an error — in which case the
    /// sequential single-member decoder must reproduce the payload instead.
    #[test]
    fn test_single_member_with_embedded_gzip() {
        use flate2::write::GzEncoder;
        use flate2::Compression;
        use std::io::Write;

        // Create inner gzip data (simulates a .gz file inside a tarball)
        let inner_data = b"This is the inner gzip content that should not appear as extra output";
        let mut inner_encoder = GzEncoder::new(Vec::new(), Compression::default());
        inner_encoder.write_all(inner_data).unwrap();
        let inner_gz = inner_encoder.finish().unwrap();

        // Build a payload that contains the inner gzip as literal bytes
        // (simulating a tarball with .gz files)
        let mut payload = Vec::with_capacity(256 * 1024);
        payload.extend_from_slice(b"TAR HEADER PADDING ");
        payload.extend_from_slice(&[0u8; 493]); // pad to 512 bytes
        payload.extend_from_slice(&inner_gz); // embedded gzip data
        // Pad to make it large enough
        while payload.len() < 256 * 1024 {
            payload.push((payload.len() % 256) as u8);
        }

        // Compress the payload as a single gzip member
        let mut outer_encoder = GzEncoder::new(Vec::new(), Compression::fast());
        outer_encoder.write_all(&payload).unwrap();
        let compressed = outer_encoder.finish().unwrap();

        // Verify the compressed data starts with the gzip magic
        assert_eq!(compressed[0], 0x1f);
        assert_eq!(compressed[1], 0x8b);

        // Decompress with multi-member parallel
        let mut output = Vec::new();
        let result = decompress_multi_member_parallel(&compressed, &mut output, 4);

        match result {
            Ok(size) => {
                // The delta is computed in signed arithmetic: an unsigned
                // subtraction would itself panic with an overflow when the
                // output is too short, hiding this message.
                assert_eq!(
                    size as usize,
                    payload.len(),
                    "Multi-member parallel should produce exact original size, got {} extra bytes",
                    size as i64 - payload.len() as i64
                );
                assert_eq!(output, payload);
            }
            Err(_) => {
                // Error is also acceptable — single-member files aren't multi-member.
                // Verify sequential decompression works.
                output.clear();
                decompress_single_member(&compressed, &mut output).expect("sequential should work");
                assert_eq!(output.len(), payload.len());
                assert_eq!(output, payload);
            }
        }
    }
}