zrip-encode 0.2.1

#[cfg(feature = "alloc")]
use alloc::vec::Vec;

use zrip_core::Sequence;
use zrip_core::fse::table_builder::{
    build_decode_table, normalize_counts, serialize_fse_table_description,
};
use zrip_core::fse::{
    FseDecodeEntry, LL_BASELINE_TABLE, LL_BITS_TABLE, LL_DEFAULT_ACCURACY, LL_DEFAULT_DIST,
    ML_BASELINE_TABLE, ML_BITS_TABLE, ML_DEFAULT_ACCURACY, ML_DEFAULT_DIST, OF_DEFAULT_ACCURACY,
    OF_DEFAULT_DIST,
};
use zrip_core::huffman::encode::HuffmanEncodeTable;

#[derive(Clone, Copy)]
struct PackedSeq {
    extra_bits: u64,
    ll_c: u8,
    ml_c: u8,
    of_c: u8,
    extra_nbits: u8,
}

pub(crate) struct BlockEncodeWorkspace {
    pub lit_buf: Vec<u8>,
    pub lit_section: Vec<u8>,
    pub pred_seq: Vec<u8>,
    pub cust_seq: Vec<u8>,
    pub huf_concat: Vec<u8>,
    pub huf_stream: Vec<u8>,
    pub pred_writer_buf: Vec<u8>,
    pub cust_writer_buf: Vec<u8>,
    packed_seqs: Vec<PackedSeq>,
    pub prev_huffman: Option<HuffmanEncodeTable>,
}

impl BlockEncodeWorkspace {
    pub fn new() -> Self {
        Self {
            lit_buf: Vec::new(),
            lit_section: Vec::new(),
            pred_seq: Vec::new(),
            cust_seq: Vec::new(),
            huf_concat: Vec::new(),
            huf_stream: Vec::new(),
            pred_writer_buf: Vec::new(),
            cust_writer_buf: Vec::new(),
            packed_seqs: Vec::new(),
            prev_huffman: None,
        }
    }
}

/// C zstd-style symbolTT entry. Two fields per symbol encode the
/// variable-width bit emission and state transition in one lookup.
/// Total predefined table footprint: ~1.3 KB (was ~20 KB).
#[derive(Clone, Copy)]
#[repr(C)]
struct SymbolTT {
    delta_nb_bits: u32,
    delta_find_state: i32,
}

struct FseEncodeTable {
    symbol_tt: [SymbolTT; MAX_SYMBOLS],
    state_table: [u16; MAX_TABLE_SIZE],
    table_size: u32,
    table_log: u8,
}

const MAX_SYMBOLS: usize = 53;
const MAX_TABLE_SIZE: usize = 512;

impl FseEncodeTable {
    fn build(
        decode_table: &[FseDecodeEntry],
        table_size: usize,
        max_symbol: usize,
        table_log: u8,
    ) -> Self {
        let num_symbols = max_symbol + 1;
        debug_assert!(num_symbols <= MAX_SYMBOLS);
        debug_assert!(table_size <= MAX_TABLE_SIZE);

        let mut count = [0u32; MAX_SYMBOLS];
        for i in 0..table_size {
            count[decode_table[i].symbol as usize] += 1;
        }

        let mut cumul = [0u32; MAX_SYMBOLS + 1];
        for s in 0..num_symbols {
            cumul[s + 1] = cumul[s] + count[s];
        }

        let default_delta = (table_log as u32 + 1) << 16;
        let mut symbol_tt = [SymbolTT {
            delta_nb_bits: default_delta,
            delta_find_state: 0,
        }; MAX_SYMBOLS];
        let mut total = 0i32;
        for s in 0..num_symbols {
            let c = count[s];
            if c == 0 {
                symbol_tt[s].delta_nb_bits = default_delta | (1u32 << table_log);
                continue;
            }
            let max_bits_out = if c == 1 {
                table_log as u32
            } else {
                table_log as u32 - (31 - (c - 1).leading_zeros())
            };
            let min_state_plus = c << max_bits_out;
            symbol_tt[s].delta_nb_bits = (max_bits_out << 16).wrapping_sub(min_state_plus);
            symbol_tt[s].delta_find_state = total - c as i32;
            total += c as i32;
        }

        let mut state_table = [0u16; MAX_TABLE_SIZE];
        let mut cumul_copy = cumul;
        for i in 0..table_size {
            let s = decode_table[i].symbol as usize;
            let idx = cumul_copy[s] as usize;
            state_table[idx] = (table_size + i) as u16;
            cumul_copy[s] += 1;
        }

        Self {
            symbol_tt,
            state_table,
            table_size: table_size as u32,
            table_log,
        }
    }

    #[inline]
    fn init_state(&self, symbol: u8) -> u32 {
        let tt = unsafe { self.symbol_tt.get_unchecked(symbol as usize) };
        let nb_bits_out = tt.delta_nb_bits.wrapping_add(1 << 15) >> 16;
        let base_state = (nb_bits_out << 16).wrapping_sub(tt.delta_nb_bits);
        let idx = (base_state >> nb_bits_out) as i32 + tt.delta_find_state;
        (unsafe { *self.state_table.get_unchecked(idx as usize) }) as u32
    }
}

static PREDEFINED_TABLES: std::sync::LazyLock<PredefinedEncodeTables> =
    std::sync::LazyLock::new(PredefinedEncodeTables::build);

struct PredefinedEncodeTables {
    ll: FseEncodeTable,
    ml: FseEncodeTable,
    of: FseEncodeTable,
}

impl PredefinedEncodeTables {
    fn build() -> Self {
        use zrip_core::fse::table_builder::build_decode_table_from_default;

        let ll_decode = build_decode_table_from_default(&LL_DEFAULT_DIST, LL_DEFAULT_ACCURACY);
        let ml_decode = build_decode_table_from_default(&ML_DEFAULT_DIST, ML_DEFAULT_ACCURACY);
        let of_decode = build_decode_table_from_default(&OF_DEFAULT_DIST, OF_DEFAULT_ACCURACY);

        Self {
            ll: FseEncodeTable::build(
                &ll_decode,
                1 << LL_DEFAULT_ACCURACY,
                35,
                LL_DEFAULT_ACCURACY,
            ),
            ml: FseEncodeTable::build(
                &ml_decode,
                1 << ML_DEFAULT_ACCURACY,
                52,
                ML_DEFAULT_ACCURACY,
            ),
            of: FseEncodeTable::build(
                &of_decode,
                1 << OF_DEFAULT_ACCURACY,
                31,
                OF_DEFAULT_ACCURACY,
            ),
        }
    }
}

pub fn encode_raw_block(data: &[u8], last: bool, output: &mut Vec<u8>) {
    let block_size = data.len() as u32;
    let header = (block_size << 3) | if last { 1 } else { 0 };
    output.push(header as u8);
    output.push((header >> 8) as u8);
    output.push((header >> 16) as u8);
    output.extend_from_slice(data);
}

pub(crate) fn encode_compressed_block(
    src: &[u8],
    sequences: &[Sequence],
    rep_offsets: &mut [u32; 3],
    last: bool,
    output: &mut Vec<u8>,
    workspace: &mut BlockEncodeWorkspace,
) {
    if sequences.is_empty() {
        encode_raw_block(src, last, output);
        return;
    }

    let n = sequences.len();
    let total_match: usize = sequences.iter().map(|s| s.match_length as usize).sum();
    if total_match <= n * 3 {
        encode_raw_block(src, last, output);
        return;
    }

    let saved_rep = *rep_offsets;

    workspace.lit_buf.clear();
    workspace.lit_buf.reserve(src.len() + 16);
    workspace.packed_seqs.clear();
    workspace.packed_seqs.reserve(n);
    let lit_dst = workspace.lit_buf.as_mut_ptr();
    let src_ptr = src.as_ptr();
    let mut lit_pos = 0usize;
    let mut lit_offset = 0usize;

    let do_codes = n >= 64;
    let mut ll_freq = [0u32; 36];
    let mut ml_freq = [0u32; 53];
    let mut of_freq = [0u32; 32];
    let mut total_extra_bits: u64 = 0;

    // Local rep offsets: avoids &mut aliasing barrier per iteration
    let mut rep0 = rep_offsets[0];
    let mut rep1 = rep_offsets[1];
    let mut rep2 = rep_offsets[2];

    let packed_ptr = workspace.packed_seqs.as_mut_ptr();
    let seq_ptr = sequences.as_ptr();
    let src_len = src.len();

    for i in 0..n {
        let seq = unsafe { &*seq_ptr.add(i) };
        let ll = seq.literal_length as usize;
        if lit_offset + ll <= src_len {
            unsafe {
                let s = src_ptr.add(lit_offset);
                let d = lit_dst.add(lit_pos);
                if ll <= 16 && lit_offset + 16 <= src_len {
                    (d as *mut u64).write_unaligned((s as *const u64).read_unaligned());
                    (d.add(8) as *mut u64)
                        .write_unaligned((s.add(8) as *const u64).read_unaligned());
                } else {
                    core::ptr::copy_nonoverlapping(s, d, ll);
                }
            }
            lit_pos += ll;
        }
        lit_offset += ll + seq.match_length as usize;

        let actual_offset = seq.offset;
        let ov = if seq.literal_length > 0 {
            if actual_offset == rep0 {
                1
            } else if actual_offset == rep1 {
                rep1 = rep0;
                rep0 = actual_offset;
                2
            } else if actual_offset == rep2 {
                rep2 = rep1;
                rep1 = rep0;
                rep0 = actual_offset;
                3
            } else {
                rep2 = rep1;
                rep1 = rep0;
                rep0 = actual_offset;
                actual_offset + 3
            }
        } else if actual_offset == rep1 {
            rep1 = rep0;
            rep0 = actual_offset;
            1
        } else if actual_offset == rep2 {
            rep2 = rep1;
            rep1 = rep0;
            rep0 = actual_offset;
            2
        } else if rep0 > 1 && actual_offset == rep0 - 1 {
            rep2 = rep1;
            rep1 = rep0;
            rep0 = actual_offset;
            3
        } else {
            rep2 = rep1;
            rep1 = rep0;
            rep0 = actual_offset;
            actual_offset + 3
        };

        let ll_c = ll_code(seq.literal_length);
        let ml_c = ml_code(seq.match_length);
        let of_c = of_code(ov);

        let ll_nb = LL_BITS_TABLE[ll_c as usize];
        let ml_nb = ML_BITS_TABLE[ml_c as usize];
        let ll_extra = seq.literal_length - LL_BASELINE_TABLE[ll_c as usize];
        let ml_extra = seq.match_length - ML_BASELINE_TABLE[ml_c as usize];
        let of_extra = ov - (1u32 << of_c);
        let extra_bits = (ll_extra as u64)
            | ((ml_extra as u64) << ll_nb)
            | ((of_extra as u64) << (ll_nb + ml_nb));
        let extra_nbits = ll_nb + ml_nb + of_c;

        unsafe {
            packed_ptr.add(i).write(PackedSeq {
                extra_bits,
                ll_c,
                ml_c,
                of_c,
                extra_nbits,
            });
        }

        if do_codes {
            ll_freq[ll_c as usize] += 1;
            ml_freq[ml_c as usize] += 1;
            of_freq[of_c as usize] += 1;
            total_extra_bits += extra_nbits as u64;
        }
    }
    unsafe {
        workspace.packed_seqs.set_len(n);
    }
    rep_offsets[0] = rep0;
    rep_offsets[1] = rep1;
    rep_offsets[2] = rep2;

    if lit_offset < src_len {
        let tail = src_len - lit_offset;
        unsafe {
            core::ptr::copy_nonoverlapping(src_ptr.add(lit_offset), lit_dst.add(lit_pos), tail);
        }
        lit_pos += tail;
    }
    unsafe {
        workspace.lit_buf.set_len(lit_pos);
    }

    encode_literals_section(
        &workspace.lit_buf,
        &mut workspace.lit_section,
        &mut workspace.huf_concat,
        &mut workspace.huf_stream,
        &mut workspace.prev_huffman,
    );

    let seq_data = if do_codes {
        let pred_est = estimate_predefined_cost(&ll_freq, &ml_freq, &of_freq, total_extra_bits, n);
        let use_custom = encode_seq_custom(
            &workspace.packed_seqs,
            &ll_freq,
            &ml_freq,
            &of_freq,
            n,
            pred_est,
            &mut workspace.cust_seq,
            &mut workspace.cust_writer_buf,
        );
        if use_custom && workspace.cust_seq.len() < pred_est {
            &workspace.cust_seq
        } else {
            encode_seq_predefined(
                &workspace.packed_seqs,
                &mut workspace.pred_seq,
                &mut workspace.pred_writer_buf,
            );
            if use_custom && workspace.cust_seq.len() < workspace.pred_seq.len() {
                &workspace.cust_seq
            } else {
                &workspace.pred_seq
            }
        }
    } else {
        encode_seq_predefined(
            &workspace.packed_seqs,
            &mut workspace.pred_seq,
            &mut workspace.pred_writer_buf,
        );
        &workspace.pred_seq
    };

    let block_len = workspace.lit_section.len() + seq_data.len();
    if block_len >= src.len() {
        *rep_offsets = saved_rep;
        encode_raw_block(src, last, output);
        return;
    }

    let block_size = block_len as u32;
    let header = (block_size << 3) | 0x04 | if last { 1 } else { 0 };
    output.push(header as u8);
    output.push((header >> 8) as u8);
    output.push((header >> 16) as u8);
    output.extend_from_slice(&workspace.lit_section);
    output.extend_from_slice(seq_data);
}

pub(crate) fn encode_compressed_block_raw(
    src: &[u8],
    sequences: &[Sequence],
    rep_offsets: &mut [u32; 3],
    last: bool,
    output: &mut Vec<u8>,
    workspace: &mut BlockEncodeWorkspace,
) {
    if sequences.is_empty() {
        encode_raw_block(src, last, output);
        return;
    }

    let n = sequences.len();
    let total_match: usize = sequences.iter().map(|s| s.match_length as usize).sum();
    if total_match <= n * 3 {
        encode_raw_block(src, last, output);
        return;
    }

    let saved_rep = *rep_offsets;

    workspace.lit_buf.clear();
    workspace.lit_buf.reserve(src.len() + 16);
    workspace.packed_seqs.clear();
    workspace.packed_seqs.reserve(n);
    let lit_dst = workspace.lit_buf.as_mut_ptr();
    let src_ptr = src.as_ptr();
    let mut lit_pos = 0usize;
    let mut lit_offset = 0usize;

    let mut rep0 = rep_offsets[0];
    let mut rep1 = rep_offsets[1];
    let mut rep2 = rep_offsets[2];
    let packed_ptr = workspace.packed_seqs.as_mut_ptr();
    let seq_ptr = sequences.as_ptr();
    let src_len = src.len();

    for i in 0..n {
        let seq = unsafe { &*seq_ptr.add(i) };
        let ll = seq.literal_length as usize;
        if lit_offset + ll <= src_len {
            unsafe {
                let s = src_ptr.add(lit_offset);
                let d = lit_dst.add(lit_pos);
                if ll <= 16 && lit_offset + 16 <= src_len {
                    (d as *mut u64).write_unaligned((s as *const u64).read_unaligned());
                    (d.add(8) as *mut u64)
                        .write_unaligned((s.add(8) as *const u64).read_unaligned());
                } else {
                    core::ptr::copy_nonoverlapping(s, d, ll);
                }
            }
            lit_pos += ll;
        }
        lit_offset += ll + seq.match_length as usize;

        let actual_offset = seq.offset;
        let ov = if seq.literal_length > 0 {
            if actual_offset == rep0 {
                1
            } else if actual_offset == rep1 {
                rep1 = rep0;
                rep0 = actual_offset;
                2
            } else if actual_offset == rep2 {
                rep2 = rep1;
                rep1 = rep0;
                rep0 = actual_offset;
                3
            } else {
                rep2 = rep1;
                rep1 = rep0;
                rep0 = actual_offset;
                actual_offset + 3
            }
        } else if actual_offset == rep1 {
            rep1 = rep0;
            rep0 = actual_offset;
            1
        } else if actual_offset == rep2 {
            rep2 = rep1;
            rep1 = rep0;
            rep0 = actual_offset;
            2
        } else if rep0 > 1 && actual_offset == rep0 - 1 {
            rep2 = rep1;
            rep1 = rep0;
            rep0 = actual_offset;
            3
        } else {
            rep2 = rep1;
            rep1 = rep0;
            rep0 = actual_offset;
            actual_offset + 3
        };

        let ll_c = ll_code(seq.literal_length);
        let ml_c = ml_code(seq.match_length);
        let of_c = of_code(ov);

        let ll_nb = LL_BITS_TABLE[ll_c as usize];
        let ml_nb = ML_BITS_TABLE[ml_c as usize];
        let ll_extra = seq.literal_length - LL_BASELINE_TABLE[ll_c as usize];
        let ml_extra = seq.match_length - ML_BASELINE_TABLE[ml_c as usize];
        let of_extra = ov - (1u32 << of_c);
        let extra_bits = (ll_extra as u64)
            | ((ml_extra as u64) << ll_nb)
            | ((of_extra as u64) << (ll_nb + ml_nb));
        let extra_nbits = ll_nb + ml_nb + of_c;

        unsafe {
            packed_ptr.add(i).write(PackedSeq {
                extra_bits,
                ll_c,
                ml_c,
                of_c,
                extra_nbits,
            });
        }
    }
    unsafe {
        workspace.packed_seqs.set_len(n);
    }
    rep_offsets[0] = rep0;
    rep_offsets[1] = rep1;
    rep_offsets[2] = rep2;

    if lit_offset < src_len {
        let tail = src_len - lit_offset;
        unsafe {
            core::ptr::copy_nonoverlapping(src_ptr.add(lit_offset), lit_dst.add(lit_pos), tail);
        }
        lit_pos += tail;
    }
    unsafe {
        workspace.lit_buf.set_len(lit_pos);
    }

    workspace.lit_section.clear();
    encode_raw_literals_section(&workspace.lit_buf, &mut workspace.lit_section);

    encode_seq_predefined(
        &workspace.packed_seqs,
        &mut workspace.pred_seq,
        &mut workspace.pred_writer_buf,
    );

    let block_len = workspace.lit_section.len() + workspace.pred_seq.len();
    if block_len >= src.len() {
        *rep_offsets = saved_rep;
        encode_raw_block(src, last, output);
        return;
    }

    let block_size = block_len as u32;
    let header = (block_size << 3) | 0x04 | if last { 1 } else { 0 };
    output.push(header as u8);
    output.push((header >> 8) as u8);
    output.push((header >> 16) as u8);
    output.extend_from_slice(&workspace.lit_section);
    output.extend_from_slice(&workspace.pred_seq);
}

fn encode_literals_section(
    lits: &[u8],
    output: &mut Vec<u8>,
    huf_concat: &mut Vec<u8>,
    huf_stream: &mut Vec<u8>,
    prev_huffman: &mut Option<HuffmanEncodeTable>,
) {
    output.clear();

    let use_4_streams = lits.len() >= 1024;

    // Try reusing the previous Huffman table (treeless block)
    if let Some(prev) = prev_huffman.as_ref() {
        if prev.can_encode(lits) {
            if use_4_streams {
                prev.encode_4_streams_into(lits, huf_concat, huf_stream);
            } else {
                prev.encode_single_stream_into(lits, huf_concat);
            }
            let compressed_size = huf_concat.len();
            if compressed_size < lits.len() {
                encode_treeless_literals_header(lits.len(), compressed_size, use_4_streams, output);
                output.extend_from_slice(huf_concat);
                return;
            }
        }
    }

    if let Some(table) = HuffmanEncodeTable::from_data(lits) {
        let tree_desc = table.serialize_weights();
        if use_4_streams {
            table.encode_4_streams_into(lits, huf_concat, huf_stream);
        } else {
            table.encode_single_stream_into(lits, huf_concat);
        }

        let compressed_size = tree_desc.len() + huf_concat.len();

        if compressed_size < lits.len() {
            encode_compressed_literals_header(lits.len(), compressed_size, use_4_streams, output);
            output.extend_from_slice(&tree_desc);
            output.extend_from_slice(huf_concat);
            *prev_huffman = Some(table);
            return;
        }
    }

    *prev_huffman = None;
    encode_raw_literals_section(lits, output);
}

fn encode_treeless_literals_header(
    regen_size: usize,
    comp_size: usize,
    four_streams: bool,
    output: &mut Vec<u8>,
) {
    encode_huf_literals_header(0x03, regen_size, comp_size, four_streams, output);
}

fn encode_compressed_literals_header(
    regen_size: usize,
    comp_size: usize,
    four_streams: bool,
    output: &mut Vec<u8>,
) {
    encode_huf_literals_header(0x02, regen_size, comp_size, four_streams, output);
}

fn encode_huf_literals_header(
    block_type: u8,
    regen_size: usize,
    comp_size: usize,
    four_streams: bool,
    output: &mut Vec<u8>,
) {
    if regen_size <= 1023 && comp_size <= 1023 {
        let size_format: u8 = if four_streams { 1 } else { 0 };
        let both = (regen_size as u32) | ((comp_size as u32) << 10);
        output.push(block_type | (size_format << 2) | (((both & 0x0F) as u8) << 4));
        output.push((both >> 4) as u8);
        output.push((both >> 12) as u8);
    } else if regen_size <= 16383 && comp_size <= 16383 {
        let both = (regen_size as u32) | ((comp_size as u32) << 14);
        output.push(block_type | (2 << 2) | (((both & 0x0F) as u8) << 4));
        output.push((both >> 4) as u8);
        output.push((both >> 12) as u8);
        output.push((both >> 20) as u8);
    } else {
        let both = (regen_size as u64) | ((comp_size as u64) << 18);
        output.push(block_type | (3 << 2) | (((both & 0x0F) as u8) << 4));
        output.push((both >> 4) as u8);
        output.push((both >> 12) as u8);
        output.push((both >> 20) as u8);
        output.push((both >> 28) as u8);
    }
}

fn encode_raw_literals_section(lits: &[u8], output: &mut Vec<u8>) {
    let size = lits.len();

    if size <= 31 {
        output.push((size as u8) << 3);
    } else if size <= 4095 {
        let b0 = 0x04 | ((size as u8 & 0x0F) << 4);
        let b1 = (size >> 4) as u8;
        output.push(b0);
        output.push(b1);
    } else {
        let b0 = 0x0C | ((size as u8 & 0x0F) << 4);
        let b1 = (size >> 4) as u8;
        let b2 = (size >> 12) as u8;
        output.push(b0);
        output.push(b1);
        output.push(b2);
    }

    output.extend_from_slice(lits);
}

fn encode_seq_predefined(packed: &[PackedSeq], output: &mut Vec<u8>, writer_buf: &mut Vec<u8>) {
    let n = packed.len();
    let num_seq = n as u32;
    output.clear();
    output.reserve(n * 4 + 4);

    if num_seq < 128 {
        output.push(num_seq as u8);
    } else if num_seq < 0x7F00 {
        output.push(((num_seq >> 8) + 128) as u8);
        output.push(num_seq as u8);
    } else {
        output.push(0xFF);
        let adj = num_seq - 0x7F00;
        output.push(adj as u8);
        output.push((adj >> 8) as u8);
    }

    output.push(0x00);

    let tables = &*PREDEFINED_TABLES;
    let last = unsafe { packed.get_unchecked(n - 1) };

    let max_out = n * 18 + 16;
    writer_buf.clear();
    writer_buf.reserve(max_out);

    let mut ll_state = tables.ll.init_state(last.ll_c);
    let mut of_state = tables.of.init_state(last.of_c);
    let mut ml_state = tables.ml.init_state(last.ml_c);

    let buf_ptr = writer_buf.as_mut_ptr();
    let mut pos: usize = 0;
    let mut bits: u64 = 0;
    let mut bits_used: u32 = 0;

    macro_rules! add_bits {
        ($val:expr, $n:expr) => {
            bits |= ($val as u64) << bits_used;
            bits_used += $n as u32;
        };
    }
    macro_rules! flush_fast {
        () => {
            unsafe {
                (buf_ptr.add(pos) as *mut u64).write_unaligned(bits.to_le());
            }
            let nb = (bits_used >> 3) as usize;
            pos += nb;
            bits >>= (nb << 3) as u64;
            bits_used &= 7;
        };
    }
    macro_rules! encode_transition {
        ($table:expr, $symbol:expr, $state:expr) => {{
            let tt = unsafe { $table.symbol_tt.get_unchecked($symbol as usize) };
            let nb = (tt.delta_nb_bits.wrapping_add($state)) >> 16;
            add_bits!($state & ((1u32 << nb) - 1), nb);
            let idx = ($state >> nb) as i32 + tt.delta_find_state;
            $state = unsafe { *$table.state_table.get_unchecked(idx as usize) } as u32;
        }};
    }

    add_bits!(last.extra_bits, last.extra_nbits);

    for k in (0..n - 1).rev() {
        let p = unsafe { packed.get_unchecked(k) };

        flush_fast!();
        encode_transition!(tables.of, p.of_c, of_state);
        encode_transition!(tables.ml, p.ml_c, ml_state);
        encode_transition!(tables.ll, p.ll_c, ll_state);
        flush_fast!();

        add_bits!(p.extra_bits, p.extra_nbits);
    }

    flush_fast!();
    let ll_ts = tables.ll.table_size;
    let of_ts = tables.of.table_size;
    let ml_ts = tables.ml.table_size;
    add_bits!(ml_state - ml_ts, ML_DEFAULT_ACCURACY);
    add_bits!(of_state - of_ts, OF_DEFAULT_ACCURACY);
    add_bits!(ll_state - ll_ts, LL_DEFAULT_ACCURACY);

    add_bits!(1u32, 1u8);
    flush_fast!();
    while bits_used > 0 {
        unsafe {
            *buf_ptr.add(pos) = bits as u8;
        }
        bits >>= 8;
        bits_used = bits_used.saturating_sub(8);
        pos += 1;
    }
    unsafe {
        writer_buf.set_len(pos);
    }
    output.extend_from_slice(writer_buf);
}

enum TableEnc {
    Compressed {
        enc_table: FseEncodeTable,
        header: Vec<u8>,
    },
    Rle(u8),
}

fn build_table_enc(freqs: &[u32], max_sym: usize, max_log: u8, n: usize) -> Option<TableEnc> {
    let distinct: Vec<usize> = (0..=max_sym).filter(|&s| freqs[s] > 0).collect();
    if distinct.len() <= 1 {
        return Some(TableEnc::Rle(distinct.first().copied().unwrap_or(0) as u8));
    }
    let num_distinct = distinct.len();
    let min_log_for_sym = (32 - ((num_distinct as u32 + 1).leading_zeros())) as u8;
    let acc = optimal_table_log(max_log, n, max_sym).max(min_log_for_sym);
    if acc > max_log {
        return None;
    }
    let dist = normalize_counts(&freqs[..=max_sym], acc);
    for s in 0..=max_sym {
        if freqs[s] > 0 && dist[s] == 0 {
            return None;
        }
    }
    let header = serialize_fse_table_description(&dist, acc);
    let decode_table = build_decode_table(&dist, acc).ok()?;
    let enc_table = FseEncodeTable::build(&decode_table, 1 << acc, max_sym, acc);
    Some(TableEnc::Compressed { enc_table, header })
}

fn encode_seq_custom(
    packed: &[PackedSeq],
    ll_freq: &[u32; 36],
    ml_freq: &[u32; 53],
    of_freq: &[u32; 32],
    n: usize,
    pred_size: usize,
    output: &mut Vec<u8>,
    writer_buf: &mut Vec<u8>,
) -> bool {
    output.clear();

    let ll_max = ll_freq.iter().rposition(|&f| f > 0).unwrap_or(0);
    let ml_max = ml_freq.iter().rposition(|&f| f > 0).unwrap_or(0);
    let of_max = of_freq.iter().rposition(|&f| f > 0).unwrap_or(0);

    let estimated_bits = estimate_fse_bits(&ll_freq[..=ll_max], n)
        + estimate_fse_bits(&ml_freq[..=ml_max], n)
        + estimate_fse_bits(&of_freq[..=of_max], n);
    let estimated_header = 20;
    let estimated_bytes = (estimated_bits + 7) / 8 + estimated_header + 4;
    if estimated_bytes >= pred_size {
        return false;
    }

    let ll_enc = match build_table_enc(ll_freq, ll_max, 9, n) {
        Some(e) => e,
        None => return false,
    };
    let of_enc = match build_table_enc(of_freq, of_max, 8, n) {
        Some(e) => e,
        None => return false,
    };
    let ml_enc = match build_table_enc(ml_freq, ml_max, 9, n) {
        Some(e) => e,
        None => return false,
    };

    let ll_mode: u8 = match &ll_enc {
        TableEnc::Compressed { .. } => 2,
        TableEnc::Rle(_) => 1,
    };
    let of_mode: u8 = match &of_enc {
        TableEnc::Compressed { .. } => 2,
        TableEnc::Rle(_) => 1,
    };
    let ml_mode: u8 = match &ml_enc {
        TableEnc::Compressed { .. } => 2,
        TableEnc::Rle(_) => 1,
    };

    let num_seq = n as u32;
    output.reserve(n * 4 + 64);

    if num_seq < 128 {
        output.push(num_seq as u8);
    } else if num_seq < 0x7F00 {
        output.push(((num_seq >> 8) + 128) as u8);
        output.push(num_seq as u8);
    } else {
        output.push(0xFF);
        let adj = num_seq - 0x7F00;
        output.push(adj as u8);
        output.push((adj >> 8) as u8);
    }

    output.push((ll_mode << 6) | (of_mode << 4) | (ml_mode << 2));

    match &ll_enc {
        TableEnc::Rle(sym) => output.push(*sym),
        TableEnc::Compressed { header, .. } => output.extend_from_slice(header),
    }
    match &of_enc {
        TableEnc::Rle(sym) => output.push(*sym),
        TableEnc::Compressed { header, .. } => output.extend_from_slice(header),
    }
    match &ml_enc {
        TableEnc::Rle(sym) => output.push(*sym),
        TableEnc::Compressed { header, .. } => output.extend_from_slice(header),
    }

    let last = &packed[n - 1];

    let max_out = n * 18 + 16;
    writer_buf.clear();
    writer_buf.reserve(max_out);
    let buf_ptr = writer_buf.as_mut_ptr();
    let mut pos: usize = 0;
    let mut bits: u64 = 0;
    let mut bits_used: u32 = 0;

    macro_rules! add_bits {
        ($val:expr, $n:expr) => {
            bits |= ($val as u64) << bits_used;
            bits_used += $n as u32;
        };
    }
    macro_rules! flush_fast {
        () => {
            unsafe {
                (buf_ptr.add(pos) as *mut u64).write_unaligned(bits.to_le());
            }
            let nb = (bits_used >> 3) as usize;
            pos += nb;
            bits >>= (nb << 3) as u64;
            bits_used &= 7;
        };
    }

    add_bits!(last.extra_bits, last.extra_nbits);

    match (&ll_enc, &of_enc, &ml_enc) {
        (
            TableEnc::Compressed {
                enc_table: ll_t, ..
            },
            TableEnc::Compressed {
                enc_table: of_t, ..
            },
            TableEnc::Compressed {
                enc_table: ml_t, ..
            },
        ) => {
            let mut ll_s = ll_t.init_state(last.ll_c);
            let mut of_s = of_t.init_state(last.of_c);
            let mut ml_s = ml_t.init_state(last.ml_c);

            for k in (0..n - 1).rev() {
                let p = unsafe { packed.get_unchecked(k) };

                flush_fast!();
                {
                    let tt = unsafe { of_t.symbol_tt.get_unchecked(p.of_c as usize) };
                    let nb = (tt.delta_nb_bits.wrapping_add(of_s)) >> 16;
                    add_bits!(of_s & ((1u32 << nb) - 1), nb);
                    let idx = (of_s >> nb) as i32 + tt.delta_find_state;
                    of_s = unsafe { *of_t.state_table.get_unchecked(idx as usize) } as u32;
                }
                {
                    let tt = unsafe { ml_t.symbol_tt.get_unchecked(p.ml_c as usize) };
                    let nb = (tt.delta_nb_bits.wrapping_add(ml_s)) >> 16;
                    add_bits!(ml_s & ((1u32 << nb) - 1), nb);
                    let idx = (ml_s >> nb) as i32 + tt.delta_find_state;
                    ml_s = unsafe { *ml_t.state_table.get_unchecked(idx as usize) } as u32;
                }
                {
                    let tt = unsafe { ll_t.symbol_tt.get_unchecked(p.ll_c as usize) };
                    let nb = (tt.delta_nb_bits.wrapping_add(ll_s)) >> 16;
                    add_bits!(ll_s & ((1u32 << nb) - 1), nb);
                    let idx = (ll_s >> nb) as i32 + tt.delta_find_state;
                    ll_s = unsafe { *ll_t.state_table.get_unchecked(idx as usize) } as u32;
                }
                flush_fast!();

                add_bits!(p.extra_bits, p.extra_nbits);
            }

            flush_fast!();
            add_bits!(ml_s - ml_t.table_size, ml_t.table_log);
            add_bits!(of_s - of_t.table_size, of_t.table_log);
            add_bits!(ll_s - ll_t.table_size, ll_t.table_log);
        }
        _ => {
            let ll_table = match &ll_enc {
                TableEnc::Compressed { enc_table, .. } => Some(enc_table),
                TableEnc::Rle(_) => None,
            };
            let of_table = match &of_enc {
                TableEnc::Compressed { enc_table, .. } => Some(enc_table),
                TableEnc::Rle(_) => None,
            };
            let ml_table = match &ml_enc {
                TableEnc::Compressed { enc_table, .. } => Some(enc_table),
                TableEnc::Rle(_) => None,
            };

            let mut ll_state = ll_table.as_ref().map(|t| t.init_state(last.ll_c));
            let mut of_state = of_table.as_ref().map(|t| t.init_state(last.of_c));
            let mut ml_state = ml_table.as_ref().map(|t| t.init_state(last.ml_c));

            macro_rules! encode_transition_opt {
                ($table:expr, $state:expr, $symbol:expr) => {
                    if let (Some(t), Some(s)) = (&$table, &mut $state) {
                        let tt = unsafe { t.symbol_tt.get_unchecked($symbol as usize) };
                        let nb = (tt.delta_nb_bits.wrapping_add(*s)) >> 16;
                        add_bits!(*s & ((1u32 << nb) - 1), nb);
                        let idx = (*s >> nb) as i32 + tt.delta_find_state;
                        *s = unsafe { *t.state_table.get_unchecked(idx as usize) } as u32;
                    }
                };
            }

            for k in (0..n - 1).rev() {
                let p = &packed[k];

                flush_fast!();
                encode_transition_opt!(of_table, of_state, p.of_c);
                encode_transition_opt!(ml_table, ml_state, p.ml_c);
                encode_transition_opt!(ll_table, ll_state, p.ll_c);
                flush_fast!();

                add_bits!(p.extra_bits, p.extra_nbits);
            }

            flush_fast!();
            if let (Some(t), Some(s)) = (&ml_table, ml_state) {
                add_bits!(s - t.table_size, t.table_log);
            }
            if let (Some(t), Some(s)) = (&of_table, of_state) {
                add_bits!(s - t.table_size, t.table_log);
            }
            if let (Some(t), Some(s)) = (&ll_table, ll_state) {
                add_bits!(s - t.table_size, t.table_log);
            }
        }
    }

    add_bits!(1u32, 1u8);
    flush_fast!();
    while bits_used > 0 {
        unsafe {
            *buf_ptr.add(pos) = bits as u8;
        }
        bits >>= 8;
        bits_used = bits_used.saturating_sub(8);
        pos += 1;
    }
    unsafe {
        writer_buf.set_len(pos);
    }
    output.extend_from_slice(writer_buf);

    true
}

// Predefined FSE bit costs per symbol, in 256ths of a bit.
// cost_256 = 256 * (accuracy_log - log2(max(1, |dist[s]|)))
#[rustfmt::skip]
static LL_PRED_COST_256: [u16; 36] = [
    1024, 1130, 1280, 1280, 1280, 1280, 1280, 1280, 1280, 1280, 1280, 1280, 1280,
    1536, 1536, 1536,
    1280, 1280, 1280, 1280, 1280, 1280, 1280, 1280, 1280,
    1130, 1280,
    1536, 1536, 1536, 1536, 1536,
    1536, 1536, 1536, 1536,
];

#[rustfmt::skip]
static ML_PRED_COST_256: [u16; 53] = [
    1536,
    1024, 1130, 1280, 1280, 1280, 1280, 1280, 1280,
    1536, 1536, 1536, 1536, 1536, 1536, 1536, 1536, 1536, 1536, 1536,
    1536, 1536, 1536, 1536, 1536, 1536, 1536, 1536, 1536, 1536, 1536, 1536,
    1536, 1536, 1536, 1536, 1536, 1536, 1536, 1536, 1536, 1536, 1536, 1536, 1536, 1536,
    1536, 1536, 1536, 1536, 1536, 1536, 1536,
];

#[rustfmt::skip]
static OF_PRED_COST_256: [u16; 32] = [
    1280, 1280, 1280, 1280, 1280, 1280,
    1024, 1024, 1024,
    1280, 1280, 1280, 1280, 1280, 1280, 1280, 1280, 1280, 1280, 1280,
    1280, 1280, 1280, 1280,
    1280, 1280, 1280, 1280, 1280, 1280, 1280, 1280,
];

fn estimate_predefined_cost(
    ll_freq: &[u32; 36],
    ml_freq: &[u32; 53],
    of_freq: &[u32; 32],
    extra_bits: u64,
    n: usize,
) -> usize {
    let mut fse_cost_256: u64 = 0;
    for (c, &f) in ll_freq.iter().enumerate() {
        fse_cost_256 += f as u64 * LL_PRED_COST_256[c] as u64;
    }
    for (c, &f) in ml_freq.iter().enumerate() {
        fse_cost_256 += f as u64 * ML_PRED_COST_256[c] as u64;
    }
    for (c, &f) in of_freq.iter().enumerate() {
        fse_cost_256 += f as u64 * OF_PRED_COST_256[c] as u64;
    }

    let total_bits = (fse_cost_256 + 255) / 256 + extra_bits;
    let header_bytes = if n < 128 {
        2
    } else if n < 0x7F00 {
        3
    } else {
        4
    };
    (total_bits as usize + 17 + 8) / 8 + header_bytes
}

fn estimate_fse_bits(freqs: &[u32], total: usize) -> usize {
    if total == 0 {
        return 0;
    }
    let t = total as f64;
    let mut bits = 0.0f64;
    for &f in freqs {
        if f > 0 {
            let p = f as f64 / t;
            bits -= f as f64 * p.log2();
        }
    }
    bits as usize
}

fn optimal_table_log(max_log: u8, num_seq: usize, max_symbol: usize) -> u8 {
    let min_log = 5u8;
    if num_seq <= 1 {
        return min_log;
    }
    let seq_log = 32 - ((num_seq as u32 - 1).leading_zeros());
    let min_bits_sym = if max_symbol > 0 {
        (32 - (max_symbol as u32).leading_zeros()) as u8 + 2
    } else {
        0
    };
    let target = seq_log.saturating_sub(2) as u8;
    target.max(min_log).max(min_bits_sym).min(max_log)
}

#[rustfmt::skip]
static LL_CODE_TABLE: [u8; 64] = [
     0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
    16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21,
    22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23,
    24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
];

#[inline]
fn ll_code(ll: u32) -> u8 {
    if ll < 64 {
        return LL_CODE_TABLE[ll as usize];
    }
    let high_bit = 31 - ll.leading_zeros();
    high_bit as u8 + 19
}

#[rustfmt::skip]
static ML_CODE_TABLE: [u8; 128] = [
    // mlBase = matchLength - 3, indexed 0..127
     0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
    32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 36, 36, 37, 37, 37, 37,
    38, 38, 38, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39, 39, 39, 39,
    40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
    41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41,
    42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42,
    42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42,
];

#[inline]
fn ml_code(ml: u32) -> u8 {
    let ml_base = ml - 3;
    if ml_base < 128 {
        return ML_CODE_TABLE[ml_base as usize];
    }
    let high_bit = 31 - ml_base.leading_zeros();
    high_bit as u8 + 36
}

#[inline]
fn of_code(offset_value: u32) -> u8 {
    (31 - offset_value.leading_zeros()) as u8
}