structured-zstd 0.0.24

Pure Rust zstd implementation — managed fork of ruzstd. Dictionary decompression, no FFI.
Documentation
//! FSE, short for Finite State Entropy, is an encoding technique
//! that assigns shorter codes to symbols that appear more frequently in data,
//! and longer codes to less frequent symbols.
//!
//! FSE works by mutating a state and using that state to index into a table.
//!
//! Zstandard uses two different kinds of entropy encoding: FSE, and Huffman coding.
//! Huffman is used to compress literals,
//! while FSE is used for all other symbols (literal length code, match length code, offset code).
//!
//! <https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#fse>
//!
//! <https://arxiv.org/pdf/1311.2540>

mod fse_decoder;

pub use fse_decoder::*;

pub mod fse_encoder;

#[test]
fn decoder_entry_is_packed_4_bytes() {
    assert_eq!(core::mem::size_of::<fse_decoder::Entry>(), 4);
    assert_eq!(core::mem::offset_of!(fse_decoder::Entry, new_state), 0);
    assert_eq!(core::mem::offset_of!(fse_decoder::Entry, symbol), 2);
    assert_eq!(core::mem::offset_of!(fse_decoder::Entry, num_bits), 3);
}

#[test]
fn build_from_probabilities_rejects_acc_log_over_entry_limit() {
    let mut dec_table = FSETable::new(255);
    let err = dec_table
        .build_from_probabilities(17, &[1, 1, 1, 1])
        .unwrap_err();
    assert!(matches!(
        err,
        crate::decoding::errors::FSETableError::AccLogTooBig { got: 17, max: 16 }
    ));
}

#[test]
fn build_decoder_empty_input_reports_bits_error_with_large_max_log() {
    let mut dec_table = FSETable::new(255);
    let err = dec_table.build_decoder(&[], 17).unwrap_err();
    assert!(matches!(
        err,
        crate::decoding::errors::FSETableError::GetBitsError(_)
    ));
}

#[test]
fn tables_equal() {
    let probs = &[0, 0, -1, 3, 2, 2, (1 << 6) - 8];
    let mut dec_table = FSETable::new(255);
    dec_table.build_from_probabilities(6, probs).unwrap();
    let enc_table = fse_encoder::build_table_from_probabilities(probs, 6);

    check_tables(&dec_table, &enc_table);
}

#[cfg(any(test, feature = "fuzz_exports"))]
fn check_tables(dec_table: &fse_decoder::FSETable, enc_table: &fse_encoder::FSETable) {
    // Per-symbol `Vec<State>` storage was dropped in #110 — the encoder now
    // holds only `nextStateTable` (donor parity) and per-symbol
    // `symbolTT`. Verify decoder/encoder parity by enumerating every
    // valid `(symbol, input_state)` transition and asserting that:
    //   (a) `next.index` lands on a decoder slot owned by `symbol`
    //       (the transition reaches a state that decodes back to the
    //       same symbol — without this, a broken encoder routing
    //       symbol A into symbol B's slot would slip through), and
    //   (b) `baseline` / `num_bits` from the encoder match what the
    //       decoder records for that slot.
    // After enumerating, every decoder slot must have been hit at
    // least once (full coverage).
    let table_size = enc_table.table_size;
    let mut hit = alloc::vec![false; table_size];
    for symbol_u in 0..=255u16 {
        let symbol = symbol_u as u8;
        if enc_table.symbol_probability(symbol) == 0 {
            continue;
        }
        for input_state in 0..table_size {
            let next = enc_table.next_state(symbol, input_state);
            let dec_state = &dec_table.decode[next.index];
            assert_eq!(
                dec_state.symbol, symbol,
                "encoder transition for symbol {symbol} from state {input_state} \
                 lands on decoder slot {} which decodes to symbol {} \
                 (encoder/decoder routing mismatch)",
                next.index, dec_state.symbol
            );
            assert_eq!(
                next.baseline, dec_state.new_state as usize,
                "decode/encode baseline mismatch at slot {} (symbol {symbol})",
                next.index
            );
            assert_eq!(
                next.num_bits, dec_state.num_bits,
                "decode/encode num_bits mismatch at slot {} (symbol {symbol})",
                next.index
            );
            hit[next.index] = true;
        }
    }
    for (idx, was_hit) in hit.iter().enumerate() {
        assert!(
            *was_hit,
            "decoder slot {idx} not reached by any (symbol, prev_state) encoder transition"
        );
    }
}

/// Verify `table_header_bits()` matches the actual byte count written by `write_table()`.
#[test]
#[allow(clippy::borrow_deref_ref)]
fn table_header_bits_exact() {
    use crate::bit_io::BitWriter;
    use fse_encoder::{
        build_table_from_data, build_table_from_probabilities, default_ll_table, default_ml_table,
        default_of_table,
    };

    let check = |table: &fse_encoder::FSETable| {
        let mut buf = alloc::vec::Vec::new();
        let mut writer = BitWriter::from(&mut buf);
        table.write_table(&mut writer);
        writer.flush();
        let written_bits = buf.len() * 8; // flush pads to byte boundary
        let computed_bits = table.table_header_bits();
        assert_eq!(
            computed_bits, written_bits,
            "table_header_bits() mismatch: computed={computed_bits}, written={written_bits}"
        );
    };

    // Predefined tables. Borrow via `&*` so the call compiles on
    // both `FseDefaultTable` shapes — `&'static FSETable` (atomic /
    // `critical-section`) and `Box<FSETable>` (no-atomic-no-CS).
    check(&*default_ll_table());
    check(&*default_ml_table());
    check(&*default_of_table());

    // Tables built from synthetic data
    let data: alloc::vec::Vec<u8> = (0u8..32).cycle().take(1000).collect();
    check(&build_table_from_data(data.iter().copied(), 9, true));

    let data2: alloc::vec::Vec<u8> = alloc::vec![0, 1, 2, 3]
        .into_iter()
        .cycle()
        .take(500)
        .collect();
    check(&build_table_from_data(data2.iter().copied(), 8, true));

    // Uniform distribution: 32 symbols × prob=2 = 64 = 1<<6 (acc_log=6 requires sum=64)
    check(&build_table_from_probabilities(
        &[
            2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
            2, 2, 2,
        ],
        6,
    ));
}

#[test]
fn roundtrip() {
    round_trip(&(0..64).collect::<alloc::vec::Vec<_>>());
    let mut data = alloc::vec![];
    data.extend(0..32);
    data.extend(0..32);
    data.extend(0..32);
    data.extend(0..32);
    data.extend(0..32);
    data.extend(20..32);
    data.extend(20..32);
    data.extend(0..32);
    data.extend(20..32);
    data.extend(100..255);
    data.extend(20..32);
    data.extend(20..32);
    round_trip(&data);

    #[cfg(feature = "std")]
    if std::fs::exists("fuzz/artifacts/fse").unwrap_or(false) {
        for file in std::fs::read_dir("fuzz/artifacts/fse").unwrap() {
            if file.as_ref().unwrap().file_type().unwrap().is_file() {
                let data = std::fs::read(file.unwrap().path()).unwrap();
                round_trip(&data);
            }
        }
    }
}

/// Only needed for testing.
///
/// Encodes the data with a table built from that data
/// Decodes the result again by first decoding the table and then the data
/// Asserts that the decoded data equals the input
#[cfg(any(test, feature = "fuzz_exports"))]
pub fn round_trip(data: &[u8]) {
    use crate::bit_io::{BitReaderReversed, BitWriter};
    use fse_encoder::FSEEncoder;

    if data.len() < 2 {
        return;
    }
    if data.iter().all(|x| *x == data[0]) {
        return;
    }
    if data.len() < 64 {
        return;
    }

    let mut writer = BitWriter::new();
    let mut encoder = FSEEncoder::new(
        fse_encoder::build_table_from_data(data.iter().copied(), 22, false),
        &mut writer,
    );
    let mut dec_table = FSETable::new(255);
    encoder.encode(data);
    let acc_log = encoder.acc_log();
    let enc_table = encoder.into_table();
    let encoded = writer.dump();

    let table_bytes = dec_table.build_decoder(&encoded, acc_log).unwrap();
    let encoded = &encoded[table_bytes..];
    let mut decoder = FSEDecoder::new(&dec_table);

    check_tables(&dec_table, &enc_table);

    let mut br = BitReaderReversed::new(encoded);
    let mut skipped_bits = 0;
    loop {
        let val = br.get_bits(1);
        skipped_bits += 1;
        if val == 1 || skipped_bits > 8 {
            break;
        }
    }
    if skipped_bits > 8 {
        //if more than 7 bits are 0, this is not the correct end of the bitstream. Either a bug or corrupted data
        panic!("Corrupted end marker");
    }
    decoder.init_state(&mut br).unwrap();
    let mut decoded = alloc::vec::Vec::new();

    for x in data {
        let w = decoder.decode_symbol();
        assert_eq!(w, *x);
        decoded.push(w);
        if decoded.len() < data.len() {
            decoder.update_state(&mut br);
        }
    }

    assert_eq!(&decoded, data);

    assert_eq!(br.bits_remaining(), 0);
}