compcol 0.6.4

A no_std collection of compression algorithms behind a uniform streaming trait, gated per-algorithm by Cargo features.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
//! Single-block Lizard decode (compressed-block payload only).
//!
//! The frame-level state machine in [`super`] hands us the bytes of a
//! single compressed block — everything between the 4-byte block-size
//! word and the start of the next block. We parse the block-internal
//! 1-byte `compressionLevel`, then the 1-byte `res` flag, then either
//! the in-block uncompressed payload (when `res == 0x80`) or the
//! five sub-streams (lengths, offset16, offset24, flags, literals).
//!
//! Only the LZ4-codeword sequence loop (levels 10..=19, 30..=39) with
//! all sub-streams stored raw (no Huffman entropy stage) is
//! implemented; everything else returns [`Error::Unsupported`].
//!
//! Two paths stay `Unsupported` for documented, validation-driven
//! reasons (see the inline comments at the `huffman_bits` and LIZv1
//! rejections below):
//!
//!  * **Huff0 entropy stage** (any sub-stream flag bit set): Lizard's
//!    generic `HUF_decompress` recomputes an X1-vs-X2 decoder choice
//!    that is never carried in the stream; the crate has only an X1
//!    Huff0 decoder (private to `zstd`), and there is no `lizard` CLI
//!    or fixture here to validate an X2 decoder against. A round-trip
//!    against our own X1-only encoder would prove nothing.
//!  * **LIZv1 codewords** (levels 20..=29, 40..=49): a separate, larger
//!    sequence format, out of scope for this round.

use alloc::vec::Vec;

use crate::error::Error;

const LIZARD_MIN_CLEVEL: u8 = 10;
const LIZARD_MAX_CLEVEL: u8 = 49;

const FLAG_LITERALS: u8 = 0x01;
const FLAG_FLAGS: u8 = 0x02;
const FLAG_OFFSET16: u8 = 0x04;
const FLAG_OFFSET24: u8 = 0x08;
const FLAG_LEN: u8 = 0x10;
const FLAG_UNCOMPRESSED: u8 = 0x80;

const MINMATCH: usize = 4;
const RUN_BITS_LZ4: u32 = 4;
const RUN_MASK_LZ4: u8 = (1 << RUN_BITS_LZ4) - 1;
const ML_MASK_LZ4: u8 = (1 << RUN_BITS_LZ4) - 1;

/// Type alias kept exported for callers that want to instantiate the
/// internal LZ4-codeword decoder directly (e.g. for tests). The
/// frame-level [`Decoder`](super::Decoder) is the normal entry point.
pub struct Lz4ModeDecoder;

/// Decode a single Lizard compressed block (everything between the
/// frame's 4-byte block-size word and the start of the next block).
/// `out` is appended to — the caller is expected to clear it first if
/// they want only this block's bytes.
///
/// `cap` is the per-frame ceiling on this block's decoded size. Output
/// is checked against it *before* every append (literal run, match copy,
/// in-block-uncompressed payload, trailing literals) so a malformed
/// stream cannot balloon `out` to gigabytes before the size is rejected.
/// A valid block always stays `<= cap`. On overflow we return
/// `Error::Corrupt`, matching the caller's post-hoc size check.
pub fn decode_compressed_block(input: &[u8], out: &mut Vec<u8>, cap: usize) -> Result<(), Error> {
    if input.is_empty() {
        return Err(Error::UnexpectedEnd);
    }
    let mut ip = 0usize;

    // compressionLevel byte.
    let clevel = input[ip];
    ip += 1;
    if !(LIZARD_MIN_CLEVEL..=LIZARD_MAX_CLEVEL).contains(&clevel) {
        return Err(Error::Corrupt);
    }
    // Lizard groups levels by decompression strategy:
    //   10..=19, 30..=39  →  LZ4 codewords (this build supports)
    //   20..=29, 40..=49  →  LIZv1 codewords (not supported)
    //
    // LIZv1 is a distinct, larger sequence format (`Lizard_decompress_LIZv1`
    // vs `Lizard_decompress_LZ4` in the reference): different token layout,
    // explicit `lengths`/`offset16`/`offset24` streams, and a 24-bit offset
    // path. Implementing it is a separate effort from the Huffman stage and
    // is out of scope for this round, so it stays `Unsupported`.
    let is_lz4_mode = matches!(clevel, 10..=19 | 30..=39);
    if !is_lz4_mode {
        return Err(Error::Unsupported);
    }

    // res flag byte.
    if ip >= input.len() {
        return Err(Error::UnexpectedEnd);
    }
    let res = input[ip];
    ip += 1;

    if res == FLAG_UNCOMPRESSED {
        // In-block uncompressed: 3-byte LE length + raw bytes.
        if ip + 3 > input.len() {
            return Err(Error::UnexpectedEnd);
        }
        let length = read_u24_le(&input[ip..]);
        ip += 3;
        if ip + length > input.len() {
            return Err(Error::UnexpectedEnd);
        }
        if out.len() + length > cap {
            return Err(Error::Corrupt);
        }
        out.extend_from_slice(&input[ip..ip + length]);
        return Ok(());
    }

    // LIZARD_FLAG_LEN being set on the block-flag byte is the reference
    // implementation's "reserved / not used" marker; reject so unknown
    // future variants don't decode silently to wrong output.
    if res & FLAG_LEN != 0 {
        return Err(Error::Corrupt);
    }
    // Any Huffman bit set on a sub-stream means the stream is entropy-coded
    // with Huff0 (Yann Collet's FiniteStateEntropy library) and must be
    // `HUF_decompress`'d before the sequence loop runs. Each such sub-stream
    // is framed as a 6-byte header (3-byte LE regenerated size + 3-byte LE
    // compressed size) followed by `compressed_size` bytes of Huff0 payload
    // (`Lizard_readStream` → `HUF_decompress(op, regenSize, ip + 6, comprLen)`).
    //
    // This stays `Unsupported`. The decision is deliberate, not a TODO —
    // there is no faithful way to *validate* such a decoder in this
    // environment, and the crate's policy (see `lzham`, `sit13`) is to mark
    // formats we cannot validate bit-exactly as `Unsupported` rather than
    // ship a blind decoder. Concretely:
    //
    //   * The crate already has a Huff0 decoder in `src/zstd/huffman.rs`, but
    //     it is (a) private to the `zstd` module (`mod huffman;`, not
    //     reachable from here without re-exporting it) and (b) implements
    //     only the **X1** (single-symbol) decode table that zstd's *literals*
    //     spec restricts itself to.
    //   * Lizard calls the *generic* `HUF_decompress`, which selects **X1 or
    //     X2** (double-symbol) at runtime via `HUF_selectDecoder`. That
    //     choice is **recomputed from (regenSize, comprLen)** and is **never
    //     stored in the stream**, so a conformant decoder must implement both
    //     X1 and X2 *and* reproduce `HUF_selectDecoder`'s timing heuristic
    //     exactly. The crate has no X2 decoder anywhere. (The 4-stream jump
    //     table — three LE u16 sizes — does match zstd's literals framing, so
    //     that part would be reusable; the X1/X2 split is the blocker.)
    //   * The lz5 encoder here is store-only, and there is no `lizard` CLI or
    //     Huff0 fixture in this environment. A round-trip against a
    //     hand-written X1-only encoder would always select X1 and "pass"
    //     while proving nothing about a real (possibly X2) Lizard block — a
    //     self-validating fiction. Absent a real fixture or reference
    //     encoder there is no honest round-trip, so we do not ship.
    let huffman_bits = res & (FLAG_LITERALS | FLAG_FLAGS | FLAG_OFFSET16 | FLAG_OFFSET24);
    if huffman_bits != 0 {
        return Err(Error::Unsupported);
    }

    // Parse five raw streams: lengths, offset16, offset24, flags, literals.
    // Each starts with a 3-byte LE length.
    let lengths = read_raw_stream(input, &mut ip)?;
    let offset16 = read_raw_stream(input, &mut ip)?;
    let offset24 = read_raw_stream(input, &mut ip)?;
    let flags = read_raw_stream(input, &mut ip)?;
    let literals = read_raw_stream(input, &mut ip)?;
    if ip != input.len() {
        return Err(Error::Corrupt);
    }

    // In LZ4 mode the lengths/offset16/offset24 streams must be empty
    // — the encoder packs all length-extensions and offsets inline in
    // the literals stream. Reject any extra bytes there as malformed.
    if !lengths.is_empty() || !offset16.is_empty() || !offset24.is_empty() {
        return Err(Error::Corrupt);
    }

    decode_lz4_sequences(flags, literals, out, cap)
}

/// Read one raw (non-Huffman) sub-stream: 3-byte LE length + bytes.
/// Returns the byte slice and advances `ip` past it.
fn read_raw_stream<'a>(input: &'a [u8], ip: &mut usize) -> Result<&'a [u8], Error> {
    if *ip + 3 > input.len() {
        return Err(Error::UnexpectedEnd);
    }
    let len = read_u24_le(&input[*ip..]);
    *ip += 3;
    if *ip + len > input.len() {
        return Err(Error::UnexpectedEnd);
    }
    let slice = &input[*ip..*ip + len];
    *ip += len;
    Ok(slice)
}

#[inline]
fn read_u24_le(s: &[u8]) -> usize {
    (s[0] as usize) | ((s[1] as usize) << 8) | ((s[2] as usize) << 16)
}

/// LZ4-codeword sequence loop.
///
/// In Lizard's LZ4 mode the `flags` stream contains the sequence
/// tokens (one byte per sequence). The `literals` stream contains, in
/// order, the literal bytes themselves, the 1- or 3-byte literal-length
/// extension bytes, the 2-byte offsets, and the 1- or 3-byte
/// match-length extension bytes. All bytes that the token's literal-run
/// and match-length nibbles refer to are pulled from `literals` via a
/// single cursor that advances through it monotonically.
fn decode_lz4_sequences(
    flags: &[u8],
    literals: &[u8],
    out: &mut Vec<u8>,
    cap: usize,
) -> Result<(), Error> {
    let mut lp = 0usize; // literals-stream cursor

    for &token in flags {
        // Literal length: low nibble of the token (LZ4 mode reverses
        // the LZ4-classic ordering — literal length is low, match
        // length excess is high). Sentinel value 15 → one or three
        // extension bytes follow.
        let mut lit_len = (token & RUN_MASK_LZ4) as usize;
        if lit_len == RUN_MASK_LZ4 as usize {
            lit_len = read_length_ext(literals, &mut lp)?;
            lit_len = lit_len
                .checked_add(RUN_MASK_LZ4 as usize)
                .ok_or(Error::Corrupt)?;
        }
        // Copy literals.
        if lit_len > 0 {
            if lp + lit_len > literals.len() {
                return Err(Error::UnexpectedEnd);
            }
            if out.len() + lit_len > cap {
                return Err(Error::Corrupt);
            }
            out.extend_from_slice(&literals[lp..lp + lit_len]);
            lp += lit_len;
        }
        // 2-byte LE offset.
        if lp + 2 > literals.len() {
            return Err(Error::UnexpectedEnd);
        }
        let offset = (literals[lp] as usize) | ((literals[lp + 1] as usize) << 8);
        lp += 2;
        if offset == 0 {
            return Err(Error::InvalidDistance);
        }
        if offset > out.len() {
            return Err(Error::InvalidDistance);
        }
        // Match length: high nibble of the token. Sentinel 15 → ext.
        let mut match_excess = ((token >> RUN_BITS_LZ4) & ML_MASK_LZ4) as usize;
        if match_excess == ML_MASK_LZ4 as usize {
            match_excess = read_length_ext(literals, &mut lp)?;
            match_excess = match_excess
                .checked_add(ML_MASK_LZ4 as usize)
                .ok_or(Error::Corrupt)?;
        }
        let match_len = match_excess.checked_add(MINMATCH).ok_or(Error::Corrupt)?;
        copy_match(out, offset, match_len, cap)?;
    }

    // Trailing literals: everything left in the literals stream is
    // copied verbatim after the last token.
    if lp < literals.len() {
        let tail = literals.len() - lp;
        if out.len() + tail > cap {
            return Err(Error::Corrupt);
        }
        out.extend_from_slice(&literals[lp..]);
    }
    Ok(())
}

/// Read one literal-/match-length extension from the literals stream.
/// Encoding (per `lizard_compress_lz4.h`):
/// - byte < 254 → length = byte
/// - byte == 254 → length = next 2 bytes LE
/// - byte == 255 → length = next 3 bytes LE
fn read_length_ext(literals: &[u8], lp: &mut usize) -> Result<usize, Error> {
    if *lp >= literals.len() {
        return Err(Error::UnexpectedEnd);
    }
    let first = literals[*lp];
    if first < 254 {
        *lp += 1;
        return Ok(first as usize);
    }
    if first == 254 {
        if *lp + 3 > literals.len() {
            return Err(Error::UnexpectedEnd);
        }
        let v = (literals[*lp + 1] as usize) | ((literals[*lp + 2] as usize) << 8);
        *lp += 3;
        return Ok(v);
    }
    // first == 255
    if *lp + 4 > literals.len() {
        return Err(Error::UnexpectedEnd);
    }
    let v = (literals[*lp + 1] as usize)
        | ((literals[*lp + 2] as usize) << 8)
        | ((literals[*lp + 3] as usize) << 16);
    *lp += 4;
    Ok(v)
}

/// Copy `match_len` bytes from `out[start..]` to the end of `out`,
/// where `start = out.len() - offset`. Handles LZ77 self-overlap
/// (offset < match_len) byte-by-byte. Rejects (`Error::Corrupt`) any
/// match that would grow `out` past `cap` before copying a single byte.
fn copy_match(out: &mut Vec<u8>, offset: usize, match_len: usize, cap: usize) -> Result<(), Error> {
    if offset > out.len() {
        return Err(Error::InvalidDistance);
    }
    if out.len() + match_len > cap {
        return Err(Error::Corrupt);
    }
    let start = out.len() - offset;
    if offset >= match_len {
        // Non-overlapping — bulk copy.
        out.extend_from_within(start..start + match_len);
    } else if offset == 1 {
        // Byte-splat fast path (very common: long runs of one byte).
        let b = out[start];
        out.resize(out.len() + match_len, b);
    } else {
        // Self-overlap — copy in `offset`-sized chunks. Each round duplicates
        // the tail produced so far, doubling the source region, so the loop
        // runs a logarithmic number of times instead of once per byte.
        let mut remaining = match_len;
        while remaining > 0 {
            let chunk = remaining.min(offset);
            let s = out.len() - offset;
            out.extend_from_within(s..s + chunk);
            remaining -= chunk;
        }
    }
    Ok(())
}

#[cfg(test)]
mod tests {
    use super::*;

    // Tiny synthetic block (no sequences, just trailing literals): the
    // simplest possible LZ4-mode block — produces a few bytes of output.
    #[test]
    fn empty_flags_just_literals() {
        // compressionLevel=10, res=0, len streams all empty, flags=0 bytes,
        // literals = "hi" (2 bytes).
        let mut block = alloc::vec![
            10, // clevel
            0,  // res
            0, 0, 0, // lengths len 0
            0, 0, 0, // offset16 len 0
            0, 0, 0, // offset24 len 0
            0, 0, 0, // flags len 0
            2, 0, 0, // literals len 2
        ];
        block.extend_from_slice(b"hi");

        let mut out = Vec::new();
        decode_compressed_block(&block, &mut out, usize::MAX).unwrap();
        assert_eq!(out, b"hi");
    }

    #[test]
    fn in_block_uncompressed() {
        let mut block = alloc::vec![
            10,                // clevel
            FLAG_UNCOMPRESSED, // res with uncompressed bit
            5,
            0,
            0, // 3-byte length = 5
        ];
        block.extend_from_slice(b"hello");

        let mut out = Vec::new();
        decode_compressed_block(&block, &mut out, usize::MAX).unwrap();
        assert_eq!(out, b"hello");
    }

    #[test]
    fn rejects_lizv1_mode() {
        let block = alloc::vec![20u8, 0u8]; // clevel 20 → LIZv1
        let mut out = Vec::new();
        assert_eq!(
            decode_compressed_block(&block, &mut out, usize::MAX),
            Err(Error::Unsupported)
        );
    }

    #[test]
    fn rejects_huffman_flag() {
        let block = alloc::vec![10u8, FLAG_LITERALS]; // Huffman literals stream
        let mut out = Vec::new();
        assert_eq!(
            decode_compressed_block(&block, &mut out, usize::MAX),
            Err(Error::Unsupported)
        );
    }

    #[test]
    fn rejects_bad_clevel() {
        let block = alloc::vec![9u8, 0u8];
        let mut out = Vec::new();
        assert_eq!(
            decode_compressed_block(&block, &mut out, usize::MAX),
            Err(Error::Corrupt)
        );
    }
}