structured_zstd/decoding/frame_decoder.rs
//! `FrameDecoder` is the main low-level struct users interact with to decode zstd frames.
2//!
3//! Zstandard compressed data is made of one or more frames. Each frame is independent and can be
4//! decompressed independently of other frames. This module contains structures
5//! and utilities that can be used to decode a frame.
6
7use super::frame;
8use crate::decoding;
9use crate::decoding::block_decoder::BlockDecoder;
10use crate::decoding::buffer_backend::BufferBackend;
11use crate::decoding::dictionary::{Dictionary, DictionaryHandle};
12use crate::decoding::errors::{DecodeBlockContentError, FrameDecoderError};
13use crate::decoding::flat_buf::FlatBuf;
14use crate::decoding::ringbuffer::RingBuffer;
15use crate::decoding::scratch::DecoderScratch;
16use crate::io::{Error, Read, Write};
17use alloc::collections::BTreeMap;
18use alloc::vec::Vec;
19use core::convert::TryInto;
20
21use crate::common::MAXIMUM_ALLOWED_WINDOW_SIZE;
22
/// Constructs the error reported when a block header cannot be decoded.
///
/// This is the `lsm` build of the helper. The returned
/// [`FrameDecoderError::FailedToReadBlockHeaderAt`] records the index of the
/// failing block and its offset in the frame, which enables block-precise
/// recovery. Builds without `lsm` use the legacy positionless variant
/// instead, so the default build's error surface stays byte-identical to the
/// donor.
#[cfg(feature = "lsm")]
fn block_header_decode_error(
    source: crate::decoding::errors::BlockHeaderReadError,
    block_index: u32,
    frame_offset: u32,
) -> FrameDecoderError {
    FrameDecoderError::FailedToReadBlockHeaderAt {
        source,
        block_index,
        frame_offset,
    }
}
39#[cfg(not(feature = "lsm"))]
40fn block_header_decode_error(
41 source: crate::decoding::errors::BlockHeaderReadError,
42 _block_index: u32,
43 _frame_offset: u32,
44) -> FrameDecoderError {
45 FrameDecoderError::FailedToReadBlockHeader(source)
46}
47
/// Constructs the error reported when a block body fails to decode.
///
/// This is the `lsm` build of the helper. The returned
/// [`FrameDecoderError::FailedToReadBlockBodyAt`] carries the block index,
/// the frame offset, and a `FrameBlock` holding the failing block's
/// structural metadata, reconstructed from `header` and `header_size`.
/// Builds without `lsm` use the legacy variant instead.
#[cfg(feature = "lsm")]
fn block_body_decode_error(
    source: DecodeBlockContentError,
    block_index: u32,
    frame_offset: u32,
    header: &crate::blocks::block::BlockHeader,
    header_size: u8,
) -> FrameDecoderError {
    use crate::blocks::block::BlockType;
    use crate::encoding::frame_emit_info::FrameBlock;

    // The physical wire body and the raw `Block_Size` field differ only for
    // RLE: it writes a single body byte while `Block_Size` carries the repeat
    // count. For every other block type the body matches the field.
    let is_rle = matches!(header.block_type, BlockType::RLE);
    let body_size = if is_rle { 1u32 } else { header.content_size };
    let block_size_field = if is_rle {
        header.decompressed_size
    } else {
        header.content_size
    };

    let block = FrameBlock {
        offset_in_frame: frame_offset,
        header_size,
        body_size,
        block_size_field,
        block_type: header.block_type,
        last_block: header.last_block,
        // Raw/RLE headers carry the regenerated size. A Compressed block's
        // size is unknown until it is decoded, so `read_block_header` leaves
        // `decompressed_size` at 0 for it.
        decompressed_size: header.decompressed_size,
    };

    FrameDecoderError::FailedToReadBlockBodyAt {
        source,
        block_index,
        frame_offset,
        block,
    }
}
85#[cfg(not(feature = "lsm"))]
86fn block_body_decode_error(
87 source: DecodeBlockContentError,
88 _block_index: u32,
89 _frame_offset: u32,
90 _header: &crate::blocks::block::BlockHeader,
91 _header_size: u8,
92) -> FrameDecoderError {
93 FrameDecoderError::FailedToReadBlockBody(source)
94}
95
96/// Low level Zstandard decoder that can be used to decompress frames with fine control over when and how many bytes are decoded.
97///
98/// This decoder is able to decode frames only partially and gives control
99/// over how many bytes/blocks will be decoded at a time (so you don't have to decode a 10GB file into memory all at once).
100/// It reads bytes as needed from a provided source and can be read from to collect partial results.
101///
102/// If you want to just read the whole frame with an `io::Read` without having to deal with manually calling [FrameDecoder::decode_blocks]
103/// you can use the provided [crate::decoding::StreamingDecoder] wich wraps this FrameDecoder.
104///
105/// Workflow is as follows:
106/// ```
107/// use structured_zstd::decoding::BlockDecodingStrategy;
108///
109/// # #[cfg(feature = "std")]
110/// use std::io::{Read, Write};
111///
112/// // no_std environments can use the crate's own Read traits
113/// # #[cfg(not(feature = "std"))]
114/// use structured_zstd::io::{Read, Write};
115///
116/// fn decode_this(mut file: impl Read) {
117/// //Create a new decoder
118/// let mut frame_dec = structured_zstd::decoding::FrameDecoder::new();
119/// let mut result = Vec::new();
120///
121/// // Use reset or init to make the decoder ready to decode the frame from the io::Read
122/// frame_dec.reset(&mut file).unwrap();
123///
124/// // Loop until the frame has been decoded completely
125/// while !frame_dec.is_finished() {
126/// // decode (roughly) batch_size many bytes
127/// frame_dec.decode_blocks(&mut file, BlockDecodingStrategy::UptoBytes(1024)).unwrap();
128///
129/// // read from the decoder to collect bytes from the internal buffer
130/// let bytes_read = frame_dec.read(result.as_mut_slice()).unwrap();
131///
132/// // then do something with it
133/// do_something(&result[0..bytes_read]);
134/// }
135///
136/// // handle the last chunk of data
137/// while frame_dec.can_collect() > 0 {
138/// let x = frame_dec.read(result.as_mut_slice()).unwrap();
139///
140/// do_something(&result[0..x]);
141/// }
142/// }
143///
144/// fn do_something(data: &[u8]) {
145/// # #[cfg(feature = "std")]
146/// std::io::stdout().write_all(data).unwrap();
147/// }
148/// ```
149pub struct FrameDecoder {
150 state: Option<FrameDecoderState>,
151 /// Test-only observability: frames decoded via `run_direct_decode`.
152 /// The direct and buffered paths are byte-identical, so dispatch
153 /// regressions (e.g. re-excluding dictionary frames from the direct
154 /// gate) are invisible to output assertions; tests pin the path here.
155 #[cfg(test)]
156 direct_frames: u64,
157 // Registered dictionaries are stored by shared handle (Arc/Rc) so a
158 // single content copy is referenced by every frame the decoder decodes
159 // (donor `ZSTD_refDDict`), rather than re-copied into the decode buffer
160 // per frame. `add_dict` wraps an owned `Dictionary` into a handle.
161 owned_dicts: BTreeMap<u32, DictionaryHandle>,
162 #[cfg(target_has_atomic = "ptr")]
163 shared_dicts: BTreeMap<u32, DictionaryHandle>,
164 #[cfg(not(target_has_atomic = "ptr"))]
165 shared_dicts: (),
166 /// `ZSTD_f_zstd1_magicless` — when true, [`init`] / [`reset`]
167 /// expect frames without the 4-byte magic number prefix.
168 /// Default false (standard zstd format).
169 magicless: bool,
170 /// How the optional content checksum is handled. Default
171 /// [`ContentChecksum::EmitOnly`] (compute + expose, no error on
172 /// mismatch). Set via [`Self::set_content_checksum`].
173 content_checksum: ContentChecksum,
174 /// Pinned `Dictionary_ID` expectation set via
175 /// [`Self::expect_dict_id`]. `None` (default) disables the
176 /// check; `Some(0)` matches frames whose header omits the
177 /// optional dict_id (treated as "no dictionary"). Validated in
178 /// [`Self::reset`] AFTER the frame header parses successfully
179 /// and BEFORE any block decode work.
180 #[cfg(feature = "lsm")]
181 expect_dict_id: Option<u32>,
182 /// Pinned `Window_Descriptor` byte expectation set via
183 /// [`Self::expect_window_descriptor`]. `None` (default)
184 /// disables the check. Validated in [`Self::reset`] AFTER the
185 /// frame header parses successfully and BEFORE any block
186 /// decode work. Single-segment frames (which omit the
187 /// `Window_Descriptor` byte from the wire) surface as
188 /// [`crate::decoding::errors::FrameDecoderError::UnexpectedWindowDescriptor`]
189 /// with `found: None`.
190 #[cfg(feature = "lsm")]
191 expect_window_descriptor: Option<u8>,
192 /// When `true`, the per-block decode loop XXH64-hashes each
193 /// block's decompressed bytes and stores the low-32-bit digest in
194 /// [`Self::computed_block_checksums`]. Default `false` (zero
195 /// cost). Set via [`Self::enable_per_block_checksums`]. Gated on
196 /// `all(lsm, hash)` because XXH64 lives behind the `hash`
197 /// feature.
198 #[cfg(all(feature = "lsm", feature = "hash"))]
199 per_block_checksums_enabled: bool,
200 /// Per-block XXH64 (low 32 bits) digests captured during the
201 /// current frame's decode when `per_block_checksums_enabled` is
202 /// set. Reset at the start of every new frame. Gated on
203 /// `all(lsm, hash)` (see `per_block_checksums_enabled`).
204 #[cfg(all(feature = "lsm", feature = "hash"))]
205 computed_block_checksums: alloc::vec::Vec<u32>,
206}
207
/// Policy for a frame's optional XXH64 content checksum (RFC 8878
/// `Content_Checksum_flag`).
///
/// Hashing the decompressed output with XXH64 takes a measurable share of
/// decode time, so this policy lets callers skip it.
///
/// ```
/// use structured_zstd::decoding::{ContentChecksum, FrameDecoder};
/// let mut decoder = FrameDecoder::new();
/// decoder.set_content_checksum(ContentChecksum::Verify);
/// ```
#[derive(Copy, Clone, PartialEq, Eq, Debug, Default)]
pub enum ContentChecksum {
    /// No XXH64 pass at all: nothing is computed and nothing is verified.
    /// `get_calculated_checksum()` returns `None`.
    None,
    /// The checksum is computed and exposed through the accessors, but a
    /// mismatch is not an error. This is the default and matches the
    /// historical behaviour (callers verify manually if they wish).
    #[default]
    EmitOnly,
    /// The checksum is computed and compared against the value stored in the
    /// frame; a disagreement fails the decode with
    /// [`FrameDecoderError::ChecksumMismatch`](crate::decoding::errors::FrameDecoderError::ChecksumMismatch).
    /// Without the `hash` feature no digest can be computed, so `Verify`
    /// cannot detect a mismatch and behaves like `None`.
    Verify,
}
234
/// The decode-relevant identity of a frame.
///
/// Used to reject a [`ResumeState`] that was captured from one frame and is
/// being applied to a frame of a different shape. It covers every header
/// field that changes how blocks decode: buffer sizing, backend kind,
/// entropy/dictionary context, trailing-checksum handling, declared content
/// size and magicless framing.
///
/// This is a SHAPE guard, not a content-unique fingerprint. Two distinct
/// frames that share all of these header fields yield the same key, because
/// no cheap header field uniquely identifies frame content. The guard turns
/// the realistic accidental misuse — a snapshot applied to a frame with a
/// different window/dictionary/size — into a typed error instead of
/// byte-wrong output. Pairing a `ResumeState` with the correct frame's
/// compressed source and `window_prime` remains the caller's contract.
#[cfg(feature = "lsm")]
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
struct FrameKey {
    /// Taken from the header's `window_size()`; 0 when that call fails.
    window_size: u64,
    /// Taken from the header's `frame_content_size()`.
    frame_content_size: u64,
    /// `Dictionary_ID` declared in the frame header (`None` when omitted).
    dictionary_id: Option<u32>,
    /// The dictionary actually applied to the decoder (`state.using_dict`).
    /// This differs from `dictionary_id`: a frame with a dictless header can
    /// still be decoded with an explicit dictionary via
    /// `reset_with_dict_handle` / `force_dict`. Two such decodes with
    /// different dictionaries must NOT compare equal, and keying only on the
    /// header field would miss that.
    active_dictionary_id: Option<u32>,
    /// The frame descriptor's single-segment flag.
    single_segment: bool,
    /// The frame descriptor's content-checksum flag.
    content_checksum: bool,
    /// Whether the decoder was configured for magicless framing.
    magicless: bool,
}
265
#[cfg(feature = "lsm")]
impl FrameKey {
    /// Derives the key for the frame currently held in `state`.
    ///
    /// `magicless` is supplied by the caller because it is a decoder setting
    /// and not part of the parsed frame header. A header whose
    /// `window_size()` fails contributes a window size of 0.
    fn from_state(state: &FrameDecoderState, magicless: bool) -> FrameKey {
        let header = &state.frame_header;
        let descriptor = &header.descriptor;
        Self {
            window_size: header.window_size().unwrap_or(0),
            frame_content_size: header.frame_content_size(),
            dictionary_id: header.dictionary_id(),
            active_dictionary_id: state.using_dict,
            single_segment: descriptor.single_segment_flag(),
            content_checksum: descriptor.content_checksum_flag(),
            magicless,
        }
    }
}
281
/// Hashes one contiguous byte slice with XXH64 (seed 0).
///
/// This is the resume-side counterpart to
/// [`DecoderScratchKind::window_tail_hash`]. Streaming XXH64 does not depend
/// on chunk boundaries, so hashing the bytes as a single slice here gives the
/// same digest as the emit-side hash over two slices of the same bytes.
#[cfg(all(feature = "lsm", feature = "hash"))]
fn xxh64_of(bytes: &[u8]) -> u64 {
    let mut hasher = twox_hash::XxHash64::with_seed(0);
    core::hash::Hasher::write(&mut hasher, bytes);
    core::hash::Hasher::finish(&hasher)
}
293
/// Snapshot of the cross-block decode state needed to resume a cold partial
/// decode at an inner block boundary.
///
/// [`FrameDecoder::decode_blocks_partial`] emits it when called with
/// `emit_resume` set to `true` (it is returned in
/// [`PartialDecode::resume_state`]). It is fed back through that same
/// method's [`resume`](FrameDecoder::decode_blocks_partial) argument as part
/// of a [`ResumeInput`].
///
/// A zstd block cannot always be decoded in isolation. Apart from the shared
/// match window (the decompressed output history), a Compressed block may
/// reuse the previous block's entropy tables through `Repeat_Mode` (literals
/// Huffman plus the LL/OF/ML FSE distributions), and it always continues the
/// running repeat-offset history. This snapshot holds exactly that carry-over
/// state together with the resume coordinates, so a resumed decode is
/// byte-identical to a contiguous one even across a dropped decoder.
///
/// Two things are deliberately NOT stored:
///
/// * The window. The caller supplies it again through
///   [`ResumeInput::window_prime`], taken from the decompressed output it
///   already persists.
/// * The dictionary. For a dictionary frame the caller re-attaches it to the
///   resuming decoder via [`FrameDecoder::reset`] /
///   [`FrameDecoder::reset_with_dict_handle`] (it already holds the
///   dictionary from encode time). The snapshot records only the
///   dictionary's identity, so that a resume under a different dictionary is
///   rejected.
///
/// Behind the `lsm` Cargo feature.
#[cfg(feature = "lsm")]
#[cfg_attr(docsrs, doc(cfg(feature = "lsm")))]
pub struct ResumeState {
    /// Identity of the frame the snapshot came from. It is compared with the
    /// frame currently reset into the decoder before any state is restored;
    /// a snapshot from a different frame shape is rejected with
    /// [`FrameDecoderError::ResumeFrameMismatch`] rather than silently
    /// producing byte-wrong output.
    frame_key: FrameKey,
    /// Index of the block to resume AT (the first block NOT yet decoded).
    block_index: u32,
    /// Cumulative decompressed byte count produced before `block_index`.
    output_offset: u64,
    /// FSE tables (LL/OF/ML) as of the last decoded block; a `Repeat_Mode`
    /// resume block resolves against these.
    fse: crate::decoding::scratch::FSEScratch,
    /// Huffman literals table as of the last decoded block; a treeless
    /// (repeat) literals resume block resolves against it.
    huf: crate::decoding::scratch::HuffmanScratch,
    /// Running repeat-offset history (`offset_hist`) as of the last decoded
    /// block.
    offset_hist: [u32; 3],
    /// XXH64 of the exact window-prime bytes captured at emit time, i.e. the
    /// last `min(window_size, output_offset)` decompressed bytes. At resume it
    /// is verified against the caller-supplied
    /// [`ResumeInput::window_prime`]. A content mismatch (wrong frame, wrong
    /// or corrupted prime) is a near-unique (≈2⁻⁶⁴) signal and is rejected
    /// with [`FrameDecoderError::ResumeFrameMismatch`]. This is the
    /// content-exact guard; [`FrameKey`] is the cheap shape pre-check that
    /// also works without the `hash` feature. Behind `all(lsm, hash)`.
    #[cfg(feature = "hash")]
    window_hash: u64,
}
350
#[cfg(feature = "lsm")]
impl ResumeState {
    /// Returns the inner block index at which this state resumes, i.e. the
    /// first block that has not been decoded yet.
    ///
    /// Use it as the lower bound for `end_block` (and as `start_block`) of
    /// the resuming
    /// [`decode_blocks_partial`](FrameDecoder::decode_blocks_partial) call.
    pub fn block_index(&self) -> u32 {
        self.block_index
    }

    /// Returns the cumulative decompressed byte count produced before
    /// [`block_index`](Self::block_index), which is the decompressed offset
    /// where the resumed output begins.
    ///
    /// Equals `FrameEmitInfo::decompressed_byte_range(block_index).start`.
    /// Use it to slice the `window_prime` tail the resumed call needs.
    pub fn output_offset(&self) -> u64 {
        self.output_offset
    }
}
370
// Hand-written `Debug`: the entropy tables are large internal scratch with no
// useful Debug surface, so only the resume coordinates are printed. Having
// this impl also lets `PartialDecode` keep its derived `Debug`.
#[cfg(feature = "lsm")]
impl core::fmt::Debug for ResumeState {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        let mut out = f.debug_struct("ResumeState");
        out.field("block_index", &self.block_index);
        out.field("output_offset", &self.output_offset);
        // The remaining fields are intentionally elided.
        out.finish_non_exhaustive()
    }
}
383
/// What a caller passes as the `resume` argument of
/// [`FrameDecoder::decode_blocks_partial`] to continue a cold partial decode
/// without decompressing the preceding blocks again.
///
/// Behind the `lsm` Cargo feature.
#[cfg(feature = "lsm")]
#[cfg_attr(docsrs, doc(cfg(feature = "lsm")))]
pub struct ResumeInput<'a> {
    /// Already-decompressed output of the caller, ending just before
    /// [`ResumeState::block_index`].
    ///
    /// It must hold at least the last `min(window_size, output_offset)`
    /// bytes: a full match window, or the whole prefix when that is shorter
    /// than one window. Bytes before the last `window_size` are ignored, so
    /// passing the entire prefix is valid too; it is capped internally, which
    /// bounds resume memory to one window.
    pub window_prime: &'a [u8],
    /// Cross-block entropy/repcode state emitted by the earlier
    /// [`decode_blocks_partial`](FrameDecoder::decode_blocks_partial) call.
    pub state: &'a ResumeState,
}
403
404/// Backend-tagged decode scratch — chosen at frame-reset time based
405/// on the parsed `FrameHeader.descriptor.single_segment_flag()` and
406/// kept stable through the lifetime of the frame. The match in each
407/// helper below dispatches **once per call** (e.g. once per block in
408/// `decode_block_content`, once per drain in `drain_to_writer`) —
409/// never inside the hot push/repeat loop, which is fully
410/// monomorphised through the `DecoderScratch<B>` generic.
411enum DecoderScratchKind {
412 Ring(DecoderScratch<RingBuffer>),
413 Flat(DecoderScratch<FlatBuf>),
414}
415
416impl DecoderScratchKind {
417 fn new_ring(window_size: usize) -> Self {
418 // Lazy ring-buffer allocation: do NOT `reserve(window_size)` here.
419 // The direct-decode path (`run_direct_decode`) writes through
420 // `UserSliceBackend` and never touches the ring; allocating it
421 // eagerly wastes one full window of peak memory on the common
422 // direct-eligible frame. On the non-direct path the window is
423 // pre-reserved once at frame entry (`decode_all_impl` and
424 // `decode_blocks` both call `DecoderScratchKind::reserve_buffer`
425 // before any block writes), so multi-block frames pay one
426 // amortised grow instead of repeated `reserve_amortized` steps
427 // per block. Issue #279 round 2.
428 let s = DecoderScratch::<RingBuffer>::new(window_size);
429 Self::Ring(s)
430 }
431
432 /// Construct a flat-backed scratch for a single-segment frame.
433 /// `frame_content_size` is the upcoming output size in bytes
434 /// (== `window_size` when the flag is set).
435 ///
436 /// Lazy buffer allocation (mirrors [`Self::new_ring`]): do NOT
437 /// pre-size the `FlatBuf`. The direct-decode path
438 /// (`run_direct_decode`) writes through `UserSliceBackend` and never
439 /// touches this buffer, so eagerly allocating a full FCS wastes one
440 /// whole content-size of peak memory on the common direct-eligible
441 /// single-segment frame. The non-direct fallback reserves it once via
442 /// `reserve_buffer(window_size)` at frame entry before any block
443 /// write (`FlatBuf::reserve` adds the `WILDCOPY_OVERLENGTH` slack),
444 /// and every inline-exec site (trait method and per-kernel macros)
445 /// now carries a tight-tail bounded copy, so a tight buffer can never
446 /// overshoot regardless of construction-time slack.
447 fn new_flat(frame_content_size: usize) -> Self {
448 let s = DecoderScratch::<FlatBuf>::new(frame_content_size);
449 Self::Flat(s)
450 }
451
452 /// Reset (or transition between) backends for a new frame.
453 /// Reuses the existing `DecoderScratch` allocations (FSE / HUF
454 /// tables, sequence vec, etc.) when the backend kind is unchanged
455 /// — only the underlying buffer is re-sized for the new frame.
456 /// Building a fresh `DecoderScratch` on every frame would
457 /// re-allocate everything and was measured at +255 % vs ring on
458 /// small frames; reusing it keeps the small-frame cost flat.
459 fn reset(&mut self, frame: &frame::FrameHeader, window_size: usize) {
460 if frame.descriptor.single_segment_flag() {
461 match self {
462 Self::Flat(s) => {
463 s.reset(window_size);
464 // `DecoderScratch::reset` clears the backing buffer and
465 // updates `window_size` WITHOUT reserving it (it may still
466 // resize the per-block scratch Vecs up to
467 // `min(window_size, MAX_BLOCK_SIZE)`). Backing-buffer
468 // capacity is decided one layer up: direct-eligible frames
469 // never touch it, and the non-direct path pre-reserves once
470 // via `reserve_buffer(window_size)` at frame entry.
471 }
472 Self::Ring(_) => *self = Self::new_flat(window_size),
473 }
474 } else {
475 match self {
476 Self::Ring(s) => s.reset(window_size),
477 Self::Flat(_) => *self = Self::new_ring(window_size),
478 }
479 }
480 }
481
482 fn init_from_dict(&mut self, dict: &DictionaryHandle) {
483 match self {
484 Self::Ring(s) => s.init_from_dict(dict),
485 Self::Flat(s) => s.init_from_dict(dict),
486 }
487 }
488
489 #[inline]
490 fn buffer_len(&self) -> usize {
491 match self {
492 Self::Ring(s) => s.buffer.len(),
493 Self::Flat(s) => s.buffer.len(),
494 }
495 }
496
497 fn workspace_bytes(&self) -> usize {
498 match self {
499 Self::Ring(s) => s.workspace_bytes(),
500 Self::Flat(s) => s.workspace_bytes(),
501 }
502 }
503
504 /// Pre-reserve the backing buffer to `window_size` in a single
505 /// allocation. Called once on the non-direct (`decode_blocks`) path
506 /// after direct-eligibility is ruled out, so multi-segment fallback
507 /// decodes don't pay repeated `reserve_amortized` grow steps
508 /// (128 KiB → 256 KiB → ... → window) as blocks accumulate.
509 ///
510 /// Direct-eligible frames never call this and pay zero backing-buffer
511 /// allocation for the window, on BOTH backends: `new_ring` and
512 /// `new_flat` are each lazy (no pre-reserve), so a direct-eligible
513 /// frame writes only through `UserSliceBackend` and leaves this
514 /// buffer empty.
515 ///
516 /// `window_size` is the TARGET visible-window capacity: callers pass
517 /// the full window, and the method itself computes the shortfall past
518 /// the bytes already buffered before calling the backend's
519 /// ADDITIONAL-semantics `reserve_exact`. That keeps re-entries (the
520 /// decode_all fallback loop runs `decode_blocks` once per strategy
521 /// chunk, and streaming callers invoke it per call) from growing a
522 /// window-full buffer toward 2x window, while per-block growth keeps
523 /// the amortized `reserve`.
524 #[inline]
525 fn reserve_buffer(&mut self, window_size: usize) {
526 // Exact growth: this is the one-shot pre-reservation, and a request
527 // landing one slack past the retained capacity (e.g. a dictionary
528 // prefix already loaded into the buffer) must not DOUBLE a
529 // window-sized allocation through the amortized policy. Per-block
530 // growth keeps the amortized `reserve`.
531 //
532 // `reserve_exact` takes ADDITIONAL capacity, so request only the
533 // shortfall past the bytes already buffered: the decode_all
534 // fallback loop re-enters `decode_blocks` once per strategy chunk,
535 // and re-requesting the full window each iteration would grow a
536 // window-sized buffer toward 2x window.
537 match self {
538 Self::Ring(s) => {
539 let additional = window_size.saturating_sub(s.buffer.len());
540 s.buffer.reserve_exact(additional);
541 }
542 Self::Flat(s) => {
543 let additional = window_size.saturating_sub(s.buffer.len());
544 s.buffer.reserve_exact(additional);
545 }
546 }
547 }
548
549 /// Last `n` bytes of the visible buffer as `(s1, s2)` (wrap-aware).
550 /// Routes through whichever backend the current scratch holds.
551 #[cfg(all(feature = "lsm", feature = "hash"))]
552 fn last_n_as_slices(&self, n: usize) -> (&[u8], &[u8]) {
553 match self {
554 Self::Ring(s) => s.buffer.last_n_as_slices(n),
555 Self::Flat(s) => s.buffer.last_n_as_slices(n),
556 }
557 }
558
559 fn buffer_drain(&mut self) -> Vec<u8> {
560 match self {
561 Self::Ring(s) => s.buffer.drain(),
562 Self::Flat(s) => s.buffer.drain(),
563 }
564 }
565
566 fn buffer_drain_to_window_size(&mut self) -> Option<Vec<u8>> {
567 match self {
568 Self::Ring(s) => s.buffer.drain_to_window_size(),
569 Self::Flat(s) => s.buffer.drain_to_window_size(),
570 }
571 }
572
573 fn buffer_drain_to_writer(&mut self, sink: impl Write) -> Result<usize, Error> {
574 match self {
575 Self::Ring(s) => s.buffer.drain_to_writer(sink),
576 Self::Flat(s) => s.buffer.drain_to_writer(sink),
577 }
578 }
579
580 fn buffer_drain_to_window_size_writer(&mut self, sink: impl Write) -> Result<usize, Error> {
581 match self {
582 Self::Ring(s) => s.buffer.drain_to_window_size_writer(sink),
583 Self::Flat(s) => s.buffer.drain_to_window_size_writer(sink),
584 }
585 }
586
587 fn buffer_can_drain(&self) -> usize {
588 match self {
589 Self::Ring(s) => s.buffer.can_drain(),
590 Self::Flat(s) => s.buffer.can_drain(),
591 }
592 }
593
594 fn buffer_can_drain_to_window_size(&self) -> Option<usize> {
595 match self {
596 Self::Ring(s) => s.buffer.can_drain_to_window_size(),
597 Self::Flat(s) => s.buffer.can_drain_to_window_size(),
598 }
599 }
600
601 fn buffer_read(&mut self, target: &mut [u8]) -> Result<usize, Error> {
602 match self {
603 Self::Ring(s) => s.buffer.read(target),
604 Self::Flat(s) => s.buffer.read(target),
605 }
606 }
607
608 fn buffer_read_all(&mut self, target: &mut [u8]) -> Result<usize, Error> {
609 match self {
610 Self::Ring(s) => s.buffer.read_all(target),
611 Self::Flat(s) => s.buffer.read_all(target),
612 }
613 }
614
615 /// Drop visible output beyond `window_size` without producing it,
616 /// keeping the most recent `window_size` bytes available to back
617 /// future match copies. Used by `decode_blocks_partial` to bound
618 /// memory while decoding the leading (skipped) blocks into the window.
619 #[cfg(feature = "lsm")]
620 fn buffer_drop_to_window_size(&mut self) -> usize {
621 match self {
622 Self::Ring(s) => s.buffer.drop_to_window_size(),
623 Self::Flat(s) => s.buffer.drop_to_window_size(),
624 }
625 }
626
627 /// Drop exactly `n` bytes from the front of the visible output without
628 /// producing them. Used by `decode_blocks_partial` to discard the
629 /// leading blocks' window-context bytes once the in-range blocks are
630 /// decoded (match resolution complete), leaving only the in-range output.
631 #[cfg(feature = "lsm")]
632 fn buffer_discard_front(&mut self, n: usize) {
633 match self {
634 Self::Ring(s) => s.buffer.discard_front(n),
635 Self::Flat(s) => s.buffer.discard_front(n),
636 }
637 }
638
639 /// Prime the match window with the caller's already-decompressed tail for
640 /// a resumed partial decode. Routes through whichever backend the current
641 /// scratch holds. See [`DecodeBuffer::prime_window`].
642 #[cfg(feature = "lsm")]
643 fn prime_window(&mut self, prefix: &[u8], total_output: u64) {
644 match self {
645 Self::Ring(s) => s.buffer.prime_window(prefix, total_output),
646 Self::Flat(s) => s.buffer.prime_window(prefix, total_output),
647 }
648 }
649
650 /// Total decompressed bytes produced so far (the buffer's running output
651 /// counter, unaffected by window drops / drains). Used to stamp a captured
652 /// [`ResumeState`]'s `output_offset`.
653 #[cfg(feature = "lsm")]
654 fn total_output(&self) -> u64 {
655 match self {
656 Self::Ring(s) => s.buffer.total_output(),
657 Self::Flat(s) => s.buffer.total_output(),
658 }
659 }
660
661 /// Clone the cross-block entropy/repcode state (FSE + Huffman tables +
662 /// `offset_hist`) out of the live scratch for a [`ResumeState`] snapshot.
663 #[cfg(feature = "lsm")]
664 fn export_entropy(
665 &self,
666 ) -> (
667 crate::decoding::scratch::FSEScratch,
668 crate::decoding::scratch::HuffmanScratch,
669 [u32; 3],
670 ) {
671 let (fse_src, huf_src, offset_hist) = match self {
672 Self::Ring(s) => (&s.fse, &s.huf, s.offset_hist),
673 Self::Flat(s) => (&s.fse, &s.huf, s.offset_hist),
674 };
675 let mut fse = crate::decoding::scratch::FSEScratch::new();
676 fse.reinit_from(fse_src);
677 let mut huf = crate::decoding::scratch::HuffmanScratch::new();
678 huf.reinit_resolved_from(huf_src);
679 (fse, huf, offset_hist)
680 }
681
682 /// Install entropy/repcode state from a [`ResumeState`] into the live
683 /// scratch so a `Repeat_Mode` / treeless resume block resolves against the
684 /// same tables a contiguous decode would have carried over.
685 #[cfg(feature = "lsm")]
686 fn restore_entropy(&mut self, state: &ResumeState) {
687 match self {
688 Self::Ring(s) => {
689 s.fse.reinit_from(&state.fse);
690 s.huf.reinit_resolved_from(&state.huf);
691 s.offset_hist = state.offset_hist;
692 }
693 Self::Flat(s) => {
694 s.fse.reinit_from(&state.fse);
695 s.huf.reinit_resolved_from(&state.huf);
696 s.offset_hist = state.offset_hist;
697 }
698 }
699 }
700
701 /// XXH64 of the window-prime bytes for a [`ResumeState`]: the last
702 /// `min(window_size, buffer_len)` bytes of the current buffer, which at emit
703 /// time are exactly the match-window context the resume block will see.
704 /// Wrap-aware via `last_n_as_slices` — streaming XXH64 over the two slices
705 /// equals a single hash over the contiguous `window_prime` at resume.
706 #[cfg(all(feature = "lsm", feature = "hash"))]
707 fn window_tail_hash(&self, window_size: usize) -> u64 {
708 use core::hash::Hasher;
709 let n = core::cmp::min(window_size, self.buffer_len());
710 let (s1, s2) = self.last_n_as_slices(n);
711 let mut h = twox_hash::XxHash64::with_seed(0);
712 h.write(s1);
713 h.write(s2);
714 h.finish()
715 }
716
717 fn decode_block_content<R: Read>(
718 &mut self,
719 decoder: &mut BlockDecoder,
720 header: &crate::blocks::block::BlockHeader,
721 source: R,
722 ) -> Result<u64, DecodeBlockContentError> {
723 match self {
724 Self::Ring(s) => decoder.decode_block_content(header, s, source),
725 Self::Flat(s) => decoder.decode_block_content(header, s, source),
726 }
727 }
728
729 #[cfg(feature = "hash")]
730 fn hash_finish(&self) -> u64 {
731 use core::hash::Hasher;
732 match self {
733 Self::Ring(s) => s.buffer.hash.finish(),
734 Self::Flat(s) => s.buffer.hash.finish(),
735 }
736 }
737
738 /// Forward the drain-time hash toggle to the inner `DecodeBuffer`
739 /// (streaming path). Called by the frame layer from the decoder's
740 /// `ContentChecksum` mode before each decode.
741 #[cfg(feature = "hash")]
742 fn set_compute_hash(&mut self, compute: bool) {
743 match self {
744 Self::Ring(s) => s.buffer.set_compute_hash(compute),
745 Self::Flat(s) => s.buffer.set_compute_hash(compute),
746 }
747 }
748}
749
/// Per-frame decoding state held by a `FrameDecoder` once a frame header has
/// been parsed. Created by `new_with_format` and recycled for later frames by
/// `reset_with_format`.
struct FrameDecoderState {
    /// Header parsed from the start of the current frame.
    pub frame_header: frame::FrameHeader,
    /// Backend-specific scratch (ring or flat) that blocks are decoded into.
    decoder_scratch: DecoderScratchKind,
    /// Cleared whenever a frame is (re)initialised; consulted by
    /// `FrameDecoder::is_finished`.
    frame_finished: bool,
    /// Number of blocks decoded so far in the current frame.
    block_counter: usize,
    /// Bytes consumed from the source for the current frame. Starts at the
    /// frame-header size and grows by each block header and block body read.
    bytes_read_counter: u64,
    /// Checksum read from the frame data, once available; reset to `None` for
    /// every new frame.
    check_sum: Option<u32>,
    /// ID of the dictionary applied to the current frame, if any.
    using_dict: Option<u32>,
}
759
/// Strategy handed to `FrameDecoder::decode_blocks`; it influences how many
/// blocks are decoded before that call returns, which matters when memory
/// consumption has to be managed carefully.
pub enum BlockDecodingStrategy {
    /// Decode all blocks of the frame into the buffer.
    All,
    /// NOTE(review): presumably stops after at most this many blocks per
    /// call — confirm against the body of `decode_blocks`.
    UptoBlocks(usize),
    /// NOTE(review): presumably stops once roughly this many bytes were
    /// decoded in the call — confirm against the body of `decode_blocks`.
    UptoBytes(usize),
}
765
/// Result of [`FrameDecoder::decode_blocks_partial`].
///
/// Carries the decompressed bytes of the requested inner-block range together
/// with a report of where decoding stopped early, if it did.
///
/// Available only with the `lsm` Cargo feature.
#[cfg(feature = "lsm")]
#[derive(Debug)]
pub struct PartialDecode {
    /// One contiguous buffer holding, in frame order, the decompressed output
    /// of every in-range block that was actually decoded. `data.len()` is the
    /// sum of the decompressed sizes of blocks
    /// `start_block .. start_block + blocks_decoded`.
    pub data: Vec<u8>,
    /// Index of the first block represented in [`data`](Self::data). On a
    /// fresh decode this is the requested `start_block`; when resuming it is
    /// [`ResumeState::block_index`], and the `start_block` supplied by the
    /// caller is ignored.
    pub start_block: u32,
    /// How many in-range blocks were decoded successfully into
    /// [`data`](Self::data).
    pub blocks_decoded: u32,
    /// Where decoding gave up, if it did.
    ///
    /// `Some((block_index, error))` means a block failed before `end_block`
    /// was reached — either a corrupt block inside the range or a leading
    /// block that was needed for window context. `None` means the requested
    /// range decoded cleanly, or the frame's last block came first.
    ///
    /// If the failing block is a leading context block
    /// (`block_index < start_block`), the window for the range could not be
    /// built: [`data`](Self::data) is empty and `blocks_decoded` is 0.
    pub stopped_at: Option<(u32, FrameDecoderError)>,
    /// Set when this decode reached the frame's last block.
    pub frame_finished: bool,
    /// Carry-over state that lets a later call continue with the next extent.
    ///
    /// Pass it back (along with the matching `window_prime`) through the
    /// `resume` argument of a later [`FrameDecoder::decode_blocks_partial`]
    /// to pick up at [`ResumeState::block_index`] without re-decompressing
    /// the prefix.
    ///
    /// This is `None` in two situations:
    ///
    /// * emission was not requested (`emit_resume = false`);
    /// * the decode reached the frame's last block ([`frame_finished`] is
    ///   `true`). Nothing follows that could be resumed, so no snapshot is
    ///   produced even with `emit_resume = true`.
    ///
    /// Callers stepping through a frame incrementally should therefore stop
    /// on `frame_finished` instead of reading a `None` here as "emission
    /// disabled".
    ///
    /// [`frame_finished`]: Self::frame_finished
    pub resume_state: Option<ResumeState>,
}
812
813impl FrameDecoderState {
814 /// Window size to actually reserve for this frame's decode buffer.
815 /// A declared content size caps the useful window: matches can never
816 /// reference further back than the bytes that will ever exist, so an
817 /// encoder-declared window above the FCS (e.g. a level-preset window
818 /// on a smaller input) must not inflate the reservation. Every
819 /// `reserve_buffer` site routes through this so the cap is uniform
820 /// across `decode_all_impl`, `decode_blocks`, and the partial path.
821 fn useful_window_size(&self) -> usize {
822 let window_size = self.frame_header.window_size().unwrap_or(0);
823 if self.frame_header.fcs_declared() {
824 window_size.min(self.frame_header.frame_content_size()) as usize
825 } else {
826 window_size as usize
827 }
828 }
829
830 /// Construct a new frame decoder state, reading the frame header
831 /// from `source`. When `magicless` is `true`, the 4-byte magic
832 /// number prefix is NOT consumed (donor `ZSTD_f_zstd1_magicless`).
833 /// Crate-internal — reached only via `FrameDecoder::init` /
834 /// `FrameDecoder::init_with_dict_handle`. The decode buffer is
835 /// allocated lazily on BOTH backends (`new_ring` and `new_flat`):
836 /// direct-eligible frames pay zero buffer allocation, and the
837 /// non-direct fallback reserves `window_size` once in
838 /// `decode_all_impl` / `decode_blocks` via `reserve_buffer` before
839 /// any block write.
840 pub(crate) fn new_with_format(
841 source: impl Read,
842 magicless: bool,
843 ) -> Result<FrameDecoderState, FrameDecoderError> {
844 let (frame, header_size) = frame::read_frame_header_with_format(source, magicless)?;
845 let window_size = frame.window_size()?;
846
847 if window_size > MAXIMUM_ALLOWED_WINDOW_SIZE {
848 return Err(FrameDecoderError::WindowSizeTooBig {
849 requested: window_size,
850 });
851 }
852
853 let decoder_scratch = if frame.descriptor.single_segment_flag() {
854 DecoderScratchKind::new_flat(window_size as usize)
855 } else {
856 DecoderScratchKind::new_ring(window_size as usize)
857 };
858 Ok(FrameDecoderState {
859 frame_header: frame,
860 frame_finished: false,
861 block_counter: 0,
862 decoder_scratch,
863 bytes_read_counter: u64::from(header_size),
864 check_sum: None,
865 using_dict: None,
866 })
867 }
868
869 /// Reset this state for a new frame read from `source`, reusing
870 /// existing allocations. When `magicless` is `true`, the frame
871 /// header is read WITHOUT expecting a magic-number prefix
872 /// (donor `ZSTD_f_zstd1_magicless`). Crate-internal — reached
873 /// only via `FrameDecoder::reset`.
874 ///
875 /// `DecodeBuffer::reset` no longer reserves window_size for either
876 /// backend — capacity decisions live one layer up. Both backends are
877 /// lazy: direct-eligible frames pay zero backing-buffer allocation
878 /// here (they write through `UserSliceBackend`), and the non-direct
879 /// path is pre-reserved by `decode_all_impl` / `decode_blocks` via
880 /// `DecoderScratchKind::reserve_buffer(window_size)` before any block
881 /// write. A reused scratch whose new frame fits within prior capacity
882 /// reuses it; a larger one grows on that same `reserve_buffer` call.
883 pub(crate) fn reset_with_format(
884 &mut self,
885 source: impl Read,
886 magicless: bool,
887 ) -> Result<(), FrameDecoderError> {
888 let (frame_header, header_size) = frame::read_frame_header_with_format(source, magicless)?;
889 let window_size = frame_header.window_size()?;
890
891 if window_size > MAXIMUM_ALLOWED_WINDOW_SIZE {
892 return Err(FrameDecoderError::WindowSizeTooBig {
893 requested: window_size,
894 });
895 }
896
897 self.decoder_scratch
898 .reset(&frame_header, window_size as usize);
899 self.frame_header = frame_header;
900 self.frame_finished = false;
901 self.block_counter = 0;
902 self.bytes_read_counter = u64::from(header_size);
903 self.check_sum = None;
904 self.using_dict = None;
905 Ok(())
906 }
907}
908
909impl Default for FrameDecoder {
910 fn default() -> Self {
911 Self::new()
912 }
913}
914
915impl FrameDecoder {
916 /// This will create a new decoder without allocating anything yet.
917 /// init()/reset() will allocate all needed buffers if it is the first time this decoder is used
918 /// else they just reset these buffers with not further allocations
919 pub fn new() -> FrameDecoder {
920 FrameDecoder {
921 state: None,
922 #[cfg(test)]
923 direct_frames: 0,
924 owned_dicts: BTreeMap::new(),
925 #[cfg(target_has_atomic = "ptr")]
926 shared_dicts: BTreeMap::new(),
927 #[cfg(not(target_has_atomic = "ptr"))]
928 shared_dicts: (),
929 magicless: false,
930 content_checksum: ContentChecksum::EmitOnly,
931 #[cfg(feature = "lsm")]
932 expect_dict_id: None,
933 #[cfg(feature = "lsm")]
934 expect_window_descriptor: None,
935 #[cfg(all(feature = "lsm", feature = "hash"))]
936 per_block_checksums_enabled: false,
937 #[cfg(all(feature = "lsm", feature = "hash"))]
938 computed_block_checksums: alloc::vec::Vec::new(),
939 }
940 }
941
942 /// Heap bytes currently held by the decoder's lazily-grown workspace:
943 /// the decode-window buffer plus the per-block literal/content buffers
944 /// and the entropy tables. Returns 0 before the first frame is initialised
945 /// (no workspace allocated yet). The window allocation dominates and grows
946 /// with the frame's window size; this is the value to track for decode-time
947 /// memory pressure, mirroring the workspace term of upstream
948 /// `ZSTD_sizeof_DCtx`. Shared dictionaries (ref-counted handles) are not
949 /// counted, matching upstream excluding `refDDict` memory.
950 pub fn workspace_size(&self) -> usize {
951 self.state
952 .as_ref()
953 .map_or(0, |s| s.decoder_scratch.workspace_bytes())
954 }
955
956 /// Select how the frame's optional content checksum is handled
957 /// (compute, expose, verify, or skip). See [`ContentChecksum`].
958 /// Default [`ContentChecksum::EmitOnly`]. Takes effect on the next
959 /// decode; safe to call between frames on a reused decoder.
960 pub fn set_content_checksum(&mut self, mode: ContentChecksum) {
961 self.content_checksum = mode;
962 }
963
964 /// Opt in to per-block XXH64 verification during decode.
965 /// Default off; zero cost when disabled. Each block's decompressed
966 /// bytes are XXH64-hashed (low 32 bits) and appended to
967 /// [`Self::computed_block_checksums`] as the decode progresses.
968 /// Callers compare the captured digests against externally-stored
969 /// expected values (e.g. from a per-block sidecar in the
970 /// containing application protocol).
971 ///
972 /// Behind `all(feature = "lsm", feature = "hash")` — the XXH64
973 /// primitive lives behind the `hash` feature, so this method
974 /// only compiles when both are enabled.
975 #[cfg(all(feature = "lsm", feature = "hash"))]
976 pub fn enable_per_block_checksums(&mut self) {
977 self.per_block_checksums_enabled = true;
978 }
979
980 /// Per-block XXH64 (low 32 bits) digests captured during the
981 /// current frame's decode. Empty unless
982 /// [`Self::enable_per_block_checksums`] was called before
983 /// [`Self::decode_all`] / [`Self::reset`].
984 ///
985 /// Reset at the start of every new frame.
986 ///
987 /// Behind `all(feature = "lsm", feature = "hash")`.
988 #[cfg(all(feature = "lsm", feature = "hash"))]
989 pub fn computed_block_checksums(&self) -> &[u32] {
990 &self.computed_block_checksums
991 }
992
993 /// Pin the expected `Dictionary_ID` for the next frame.
994 ///
995 /// When `expected` is set, [`Self::init`] / [`Self::reset`]
996 /// validate it against the parsed frame header BEFORE any
997 /// block decode work runs. A mismatch returns
998 /// [`crate::decoding::errors::FrameDecoderError::UnexpectedDictId`]
999 /// before any block decode and before any output is produced.
1000 /// Scratch buffer allocation / reservation for the decode
1001 /// pipeline happens during frame-header parsing, which is
1002 /// already complete when this validation fires — the cost of
1003 /// scratch sizing is paid even on a mismatched header. The
1004 /// guarantee is "no block decode, no XXH64 init, no partial
1005 /// output", not "zero allocation".
1006 ///
1007 /// `Some(0)` is treated as "no dictionary expected": a frame
1008 /// whose header omits the optional `Dictionary_ID` field
1009 /// (flag value 0) passes the check; a frame that carries an
1010 /// explicit non-zero id fails.
1011 ///
1012 /// `None` (default) disables the check.
1013 ///
1014 /// Primary use case: post-AEAD-decrypt sanity check in
1015 /// wire-format consumers (e.g. lsm-tree's encrypted block
1016 /// format pins the `dict_id` baked into the AAD against the
1017 /// inner zstd frame's `dict_id` to defeat dict-substitution
1018 /// attacks).
1019 ///
1020 /// NOT a replacement for AEAD authentication. NOT the same
1021 /// semantic as donor `ZSTD_d_windowLogMax` (which is a
1022 /// ceiling-style limit, separate concern).
1023 #[cfg(feature = "lsm")]
1024 #[cfg_attr(docsrs, doc(cfg(feature = "lsm")))]
1025 pub fn expect_dict_id(&mut self, expected: Option<u32>) {
1026 self.expect_dict_id = expected;
1027 }
1028
1029 /// Pin the expected raw `Window_Descriptor` byte (RFC 8878
1030 /// §3.1.1.1.2 layout: `(exp << 3) | mantissa`) for the next
1031 /// frame.
1032 ///
1033 /// When `expected` is set, [`Self::init`] / [`Self::reset`]
1034 /// validate it against the parsed frame header BEFORE any
1035 /// block decode work runs. A mismatch returns
1036 /// [`crate::decoding::errors::FrameDecoderError::UnexpectedWindowDescriptor`].
1037 ///
1038 /// Single-segment frames omit the `Window_Descriptor` byte
1039 /// from the wire entirely. Setting an expectation while
1040 /// receiving a single-segment frame fails the check with
1041 /// `found: None` — there is no on-wire byte to match against,
1042 /// which is reported explicitly rather than silently passing.
1043 ///
1044 /// `None` (default) disables the check.
1045 ///
1046 /// Byte-exact equality, NOT a ceiling. Donor
1047 /// `ZSTD_d_windowLogMax` is a separate ceiling-style limit
1048 /// available through the C FFI surface; this method is for
1049 /// strict equality validation against a pinned expectation
1050 /// (e.g. lsm-tree's wire format pins the window descriptor
1051 /// from the AAD to defeat decompression-bomb-swap attacks).
1052 #[cfg(feature = "lsm")]
1053 #[cfg_attr(docsrs, doc(cfg(feature = "lsm")))]
1054 pub fn expect_window_descriptor(&mut self, expected: Option<u8>) {
1055 self.expect_window_descriptor = expected;
1056 }
1057
1058 /// Validate the just-parsed frame header against any pinned
1059 /// expectations set via [`Self::expect_dict_id`] /
1060 /// [`Self::expect_window_descriptor`].
1061 ///
1062 /// Returns the typed error variant on mismatch and leaves
1063 /// `self.state` in a re-resettable shape — a subsequent
1064 /// `reset()` will overwrite `frame_header` from the new source
1065 /// without needing intermediate cleanup.
1066 #[cfg(feature = "lsm")]
1067 fn validate_expectations(
1068 &self,
1069 frame_header: &frame::FrameHeader,
1070 ) -> Result<(), FrameDecoderError> {
1071 if let Some(expected) = self.expect_dict_id {
1072 let found = frame_header.dictionary_id();
1073 // `Some(0)` is the "no dictionary expected" sentinel —
1074 // matches a frame whose header omits the optional
1075 // dict_id field (which is reported as `None` by the
1076 // parser). All other values must match exactly.
1077 let matches = match (expected, found) {
1078 (0, None) => true,
1079 (e, Some(f)) => e == f,
1080 _ => false,
1081 };
1082 if !matches {
1083 return Err(FrameDecoderError::UnexpectedDictId {
1084 expected: Some(expected),
1085 found,
1086 });
1087 }
1088 }
1089 if let Some(expected) = self.expect_window_descriptor {
1090 let found = frame_header.window_descriptor();
1091 if found != Some(expected) {
1092 return Err(FrameDecoderError::UnexpectedWindowDescriptor { expected, found });
1093 }
1094 }
1095 Ok(())
1096 }
1097
1098 /// Enable or disable magicless frame format
1099 /// (`ZSTD_f_zstd1_magicless`). When set to `true`, subsequent
1100 /// [`init`] / [`reset`] calls expect the frame header to begin
1101 /// directly with the frame-header descriptor — no 4-byte magic
1102 /// number prefix. Default false. Must match the encoder's
1103 /// magicless setting; the format is unambiguous only when the
1104 /// caller knows it out-of-band.
1105 ///
1106 /// Note: magicless mode also disables skippable-frame detection.
1107 /// The `0x184D2A50..=0x184D2A5F` skippable-frame magic range is
1108 /// only recognised when the 4-byte magic prefix is consumed, so
1109 /// `decode_all` / `init` / `reset` will treat a skippable frame
1110 /// at the head of a magicless stream as a malformed frame header
1111 /// (bad descriptor / window-size error) instead of skipping it.
1112 /// Mixed-format streams that interleave skippable frames must be
1113 /// pre-split by the caller; `set_magicless(true)` is only safe
1114 /// when the entire stream is known to be magicless zstd frames.
1115 pub fn set_magicless(&mut self, magicless: bool) {
1116 self.magicless = magicless;
1117 }
1118
1119 #[cfg(target_has_atomic = "ptr")]
1120 fn shared_dict_exists(&self, dict_id: u32) -> bool {
1121 self.shared_dicts.contains_key(&dict_id)
1122 }
1123
1124 #[cfg(not(target_has_atomic = "ptr"))]
1125 fn shared_dict_exists(&self, _dict_id: u32) -> bool {
1126 false
1127 }
1128
1129 fn validate_registered_dictionary(dict: &Dictionary) -> Result<(), FrameDecoderError> {
1130 use crate::decoding::errors::DictionaryDecodeError as dict_err;
1131
1132 if dict.id == 0 {
1133 return Err(FrameDecoderError::from(dict_err::ZeroDictionaryId));
1134 }
1135 if let Some(index) = dict.offset_hist.iter().position(|&rep| rep == 0) {
1136 return Err(FrameDecoderError::from(
1137 dict_err::ZeroRepeatOffsetInDictionary { index: index as u8 },
1138 ));
1139 }
1140 Ok(())
1141 }
1142
1143 /// init() will allocate all needed buffers if it is the first time this decoder is used
1144 /// else they just reset these buffers with not further allocations
1145 ///
1146 /// Note that all bytes currently in the decodebuffer from any previous frame will be lost. Collect them with collect()/collect_to_writer()
1147 ///
1148 /// equivalent to reset()
1149 pub fn init(&mut self, source: impl Read) -> Result<(), FrameDecoderError> {
1150 self.reset(source)
1151 }
1152
1153 /// Initialize the decoder for a new frame using a pre-parsed dictionary handle.
1154 ///
1155 /// If the frame header has a dictionary ID, this validates it against
1156 /// `dict.id()` and returns [`FrameDecoderError::DictIdMismatch`] on mismatch.
1157 ///
1158 /// If the header omits the optional dictionary ID, this still applies the
1159 /// provided dictionary handle.
1160 ///
1161 /// # Warning
1162 ///
1163 /// This method always applies `dict` unless the frame header contains a
1164 /// non-matching dictionary ID. Callers must only use this API when they
1165 /// already know the frame was encoded with the provided dictionary, even if
1166 /// the frame header omits the dictionary ID or encodes an explicit
1167 /// dictionary ID of `0`.
1168 ///
1169 /// Passing a dictionary for a frame that was not encoded with it can
1170 /// silently corrupt the decoded output.
1171 pub fn init_with_dict_handle(
1172 &mut self,
1173 source: impl Read,
1174 dict: &DictionaryHandle,
1175 ) -> Result<(), FrameDecoderError> {
1176 self.reset_with_dict_handle(source, dict)
1177 }
1178
1179 /// reset() will allocate all needed buffers if it is the first time this decoder is used
1180 /// else they just reset these buffers with not further allocations
1181 ///
1182 /// Note that all bytes currently in the decodebuffer from any previous frame will be lost. Collect them with collect()/collect_to_writer()
1183 ///
1184 /// equivalent to init()
1185 pub fn reset(&mut self, source: impl Read) -> Result<(), FrameDecoderError> {
1186 use FrameDecoderError as err;
1187 // Fresh frame → start with an empty per-block checksum vec so
1188 // the values for the next frame don't carry over from the
1189 // previous one.
1190 #[cfg(all(feature = "lsm", feature = "hash"))]
1191 self.computed_block_checksums.clear();
1192 let magicless = self.magicless;
1193 let dict_id = match &mut self.state {
1194 Some(s) => {
1195 s.reset_with_format(source, magicless)?;
1196 s.frame_header.dictionary_id()
1197 }
1198 None => {
1199 self.state = Some(FrameDecoderState::new_with_format(source, magicless)?);
1200 self.state
1201 .as_ref()
1202 .and_then(|state| state.frame_header.dictionary_id())
1203 }
1204 };
1205 // Validate any pinned expectations BEFORE block decode work
1206 // runs. Catches dict_id substitution / window-descriptor
1207 // tampering on inputs already authenticated by an outer
1208 // layer (e.g. AEAD). Returning here leaves `self.state` in
1209 // a re-resettable shape — next `reset()` re-parses the
1210 // frame header without intermediate cleanup.
1211 #[cfg(feature = "lsm")]
1212 if let Some(state) = self.state.as_ref() {
1213 self.validate_expectations(&state.frame_header)?;
1214 }
1215 if let Some(dict_id) = dict_id {
1216 let state = self.state.as_mut().expect("state initialized");
1217 let owned_dicts = &self.owned_dicts;
1218 #[cfg(target_has_atomic = "ptr")]
1219 let shared_dicts = &self.shared_dicts;
1220 let dict = owned_dicts
1221 .get(&dict_id)
1222 .or_else(|| {
1223 #[cfg(target_has_atomic = "ptr")]
1224 {
1225 shared_dicts.get(&dict_id)
1226 }
1227 #[cfg(not(target_has_atomic = "ptr"))]
1228 {
1229 None
1230 }
1231 })
1232 .ok_or(err::DictNotProvided { dict_id })?;
1233 state.decoder_scratch.init_from_dict(dict);
1234 state.using_dict = Some(dict_id);
1235 }
1236 Ok(())
1237 }
1238
1239 /// Reset this decoder for a new frame using a pre-parsed dictionary handle.
1240 ///
1241 /// If the frame header has a dictionary ID, this validates it against
1242 /// `dict.id()` and returns [`FrameDecoderError::DictIdMismatch`] on mismatch.
1243 ///
1244 /// If the header omits the optional dictionary ID, this still applies the
1245 /// provided dictionary handle.
1246 ///
1247 /// # Warning
1248 ///
1249 /// This method always applies `dict` unless the frame header contains a
1250 /// non-matching dictionary ID. Callers must only use this API when they
1251 /// already know the frame was encoded with the provided dictionary, even if
1252 /// the frame header omits the dictionary ID or encodes an explicit
1253 /// dictionary ID of `0`.
1254 ///
1255 /// Passing a dictionary for a frame that was not encoded with it can
1256 /// silently corrupt the decoded output.
1257 pub fn reset_with_dict_handle(
1258 &mut self,
1259 source: impl Read,
1260 dict: &DictionaryHandle,
1261 ) -> Result<(), FrameDecoderError> {
1262 use FrameDecoderError as err;
1263 // Fresh frame → drop the previous frame's per-block checksum
1264 // digests so the next decode starts with an empty vec.
1265 // Mirrors the same clear in `reset()`; reset_with_dict_handle
1266 // is a parallel entry point so it needs its own call.
1267 #[cfg(all(feature = "lsm", feature = "hash"))]
1268 self.computed_block_checksums.clear();
1269 Self::validate_registered_dictionary(dict.as_dict())?;
1270 let magicless = self.magicless;
1271 // Scope the &mut borrow of `self.state` to the header parse
1272 // alone, so the subsequent `validate_expectations(&self, ...)`
1273 // call below can take a fresh shared borrow of self without
1274 // tripping the borrow checker.
1275 match &mut self.state {
1276 Some(s) => s.reset_with_format(source, magicless)?,
1277 None => {
1278 self.state = Some(FrameDecoderState::new_with_format(source, magicless)?);
1279 }
1280 }
1281 // Single source of truth: route through the same
1282 // `validate_expectations` used by `reset()`. Routing through
1283 // the helper keeps the two code paths from drifting (e.g.,
1284 // if expect-semantics or error wiring changes later).
1285 #[cfg(feature = "lsm")]
1286 {
1287 let header = &self
1288 .state
1289 .as_ref()
1290 .expect("state populated by reset_with_format/new_with_format")
1291 .frame_header;
1292 self.validate_expectations(header)?;
1293 }
1294 let state = self
1295 .state
1296 .as_mut()
1297 .expect("state populated by reset_with_format/new_with_format");
1298 if let Some(dict_id) = state.frame_header.dictionary_id()
1299 && dict_id != dict.id()
1300 {
1301 return Err(err::DictIdMismatch {
1302 expected: dict_id,
1303 provided: dict.id(),
1304 });
1305 }
1306 state.decoder_scratch.init_from_dict(dict);
1307 state.using_dict = Some(dict.id());
1308 Ok(())
1309 }
1310
1311 /// Add a dictionary that can be selected dynamically by frame dictionary ID.
1312 ///
1313 /// Returns [`FrameDecoderError::DictAlreadyRegistered`] if the ID is already
1314 /// registered (either as owned or shared).
1315 pub fn add_dict(&mut self, dict: Dictionary) -> Result<(), FrameDecoderError> {
1316 Self::validate_registered_dictionary(&dict)?;
1317 let dict_id = dict.id;
1318 if self.owned_dicts.contains_key(&dict_id) || self.shared_dict_exists(dict_id) {
1319 return Err(FrameDecoderError::DictAlreadyRegistered { dict_id });
1320 }
1321 self.owned_dicts
1322 .insert(dict_id, DictionaryHandle::from_dictionary(dict));
1323 Ok(())
1324 }
1325
1326 /// Parse and add a serialized dictionary blob.
1327 pub fn add_dict_from_bytes(&mut self, raw_dictionary: &[u8]) -> Result<(), FrameDecoderError> {
1328 let dict = Dictionary::decode_dict(raw_dictionary)?;
1329 self.add_dict(dict)
1330 }
1331
1332 /// Add a pre-parsed dictionary handle for reuse across decoders.
1333 ///
1334 /// This API is available on targets with pointer-width atomics
1335 /// (`target_has_atomic = "ptr"`).
1336 ///
1337 /// Returns [`FrameDecoderError::DictAlreadyRegistered`] if the ID is already
1338 /// registered (either as owned or shared).
1339 #[cfg(target_has_atomic = "ptr")]
1340 pub fn add_dict_handle(&mut self, dict: DictionaryHandle) -> Result<(), FrameDecoderError> {
1341 Self::validate_registered_dictionary(dict.as_dict())?;
1342 let dict_id = dict.id();
1343 if self.owned_dicts.contains_key(&dict_id) || self.shared_dicts.contains_key(&dict_id) {
1344 return Err(FrameDecoderError::DictAlreadyRegistered { dict_id });
1345 }
1346 self.shared_dicts.insert(dict_id, dict);
1347 Ok(())
1348 }
1349
1350 pub fn force_dict(&mut self, dict_id: u32) -> Result<(), FrameDecoderError> {
1351 use FrameDecoderError as err;
1352 let state = self.state.as_mut().ok_or(err::NotYetInitialized)?;
1353 let owned_dicts = &self.owned_dicts;
1354 #[cfg(target_has_atomic = "ptr")]
1355 let shared_dicts = &self.shared_dicts;
1356
1357 let dict = owned_dicts
1358 .get(&dict_id)
1359 .or_else(|| {
1360 #[cfg(target_has_atomic = "ptr")]
1361 {
1362 shared_dicts.get(&dict_id)
1363 }
1364 #[cfg(not(target_has_atomic = "ptr"))]
1365 {
1366 None
1367 }
1368 })
1369 .ok_or(err::DictNotProvided { dict_id })?;
1370 state.decoder_scratch.init_from_dict(dict);
1371 state.using_dict = Some(dict_id);
1372
1373 Ok(())
1374 }
1375
1376 /// Returns how many bytes the frame contains after decompression
1377 pub fn content_size(&self) -> u64 {
1378 match &self.state {
1379 None => 0,
1380 Some(s) => s.frame_header.frame_content_size(),
1381 }
1382 }
1383
1384 /// Returns the checksum that was read from the data. Only available after all bytes have been read. It is the last 4 bytes of a zstd-frame
1385 pub fn get_checksum_from_data(&self) -> Option<u32> {
1386 let state = self.state.as_ref()?;
1387
1388 state.check_sum
1389 }
1390
1391 /// Returns the checksum that was calculated while decoding.
1392 /// Only a sensible value after all decoded bytes have been collected/read from the FrameDecoder.
1393 /// Returns `None` when the frame header has `content_checksum_flag = 0`:
1394 /// no hash is computed for such frames (the post-decode XXH64 pass was a
1395 /// 63 % decode-wall hotspot on flag-off frames; skipping it when the
1396 /// frame format declares no trailing digest avoids that wasted work).
1397 #[cfg(feature = "hash")]
1398 pub fn get_calculated_checksum(&self) -> Option<u32> {
1399 let state = self.state.as_ref()?;
1400 // `ContentChecksum::None` skips the XXH64 pass entirely, so there is
1401 // no calculated digest to report.
1402 if self.content_checksum == ContentChecksum::None {
1403 return None;
1404 }
1405 if !state.frame_header.descriptor.content_checksum_flag() {
1406 return None;
1407 }
1408 let cksum_64bit = state.decoder_scratch.hash_finish();
1409 //truncate to lower 32bit because reasons...
1410 Some(cksum_64bit as u32)
1411 }
1412
1413 /// Compare the frame's stored content checksum against the digest the
1414 /// decoder computed, returning [`FrameDecoderError::ChecksumMismatch`] on
1415 /// disagreement. No-op unless the mode is [`ContentChecksum::Verify`] and
1416 /// the frame carries a trailing checksum.
1417 ///
1418 /// [`decode_all`](Self::decode_all) and the streaming reader call this
1419 /// automatically. Callers driving [`decode_blocks`](Self::decode_blocks)
1420 /// directly invoke it themselves once per frame, after the frame is fully
1421 /// decoded AND fully drained (e.g. via [`collect`](Self::collect)), so both
1422 /// the stored value and the running digest are final.
1423 #[cfg(feature = "hash")]
1424 pub fn verify_content_checksum(&self) -> Result<(), FrameDecoderError> {
1425 if self.content_checksum != ContentChecksum::Verify {
1426 return Ok(());
1427 }
1428 let Some(state) = self.state.as_ref() else {
1429 return Ok(());
1430 };
1431 if !state.frame_header.descriptor.content_checksum_flag() {
1432 return Ok(());
1433 }
1434 let Some(expected) = state.check_sum else {
1435 return Ok(());
1436 };
1437 let calculated = state.decoder_scratch.hash_finish() as u32;
1438 if expected != calculated {
1439 return Err(FrameDecoderError::ChecksumMismatch {
1440 expected,
1441 calculated,
1442 });
1443 }
1444 Ok(())
1445 }
1446
1447 /// Counter for how many bytes have been consumed while decoding the frame
1448 pub fn bytes_read_from_source(&self) -> u64 {
1449 let state = match &self.state {
1450 None => return 0,
1451 Some(s) => s,
1452 };
1453 state.bytes_read_counter
1454 }
1455
1456 /// Whether the current frames last block has been decoded yet
1457 /// If this returns true you can call the drain* functions to get all content
1458 /// (the read() function will drain automatically if this returns true)
1459 pub fn is_finished(&self) -> bool {
1460 let state = match &self.state {
1461 None => return true,
1462 Some(s) => s,
1463 };
1464 if state.frame_header.descriptor.content_checksum_flag() {
1465 state.frame_finished && state.check_sum.is_some()
1466 } else {
1467 state.frame_finished
1468 }
1469 }
1470
1471 /// Counter for how many blocks have already been decoded
1472 pub fn blocks_decoded(&self) -> usize {
1473 let state = match &self.state {
1474 None => return 0,
1475 Some(s) => s,
1476 };
1477 state.block_counter
1478 }
1479
1480 /// Decodes blocks from a reader. It requires that the framedecoder has been initialized first.
1481 /// The Strategy influences how many blocks will be decoded before the function returns
1482 /// This is important if you want to manage memory consumption carefully. If you don't care
1483 /// about that you can just choose the strategy "All" and have all blocks of the frame decoded into the buffer
1484 pub fn decode_blocks(
1485 &mut self,
1486 mut source: impl Read,
1487 strat: BlockDecodingStrategy,
1488 ) -> Result<bool, FrameDecoderError> {
1489 use FrameDecoderError as err;
1490 // Apply the content-checksum mode to the streaming drain hash before
1491 // any block decodes into the ring. Hash only when a digest is both
1492 // wanted (mode != None) AND present in the frame (content_checksum_flag
1493 // set) — a flag-off frame has nothing to verify or expose, so hashing
1494 // it is wasted work. Mirrors the direct path and get_calculated_checksum.
1495 #[cfg(feature = "hash")]
1496 let checksum_mode = self.content_checksum;
1497 let state = self.state.as_mut().ok_or(err::NotYetInitialized)?;
1498 #[cfg(feature = "hash")]
1499 {
1500 let compute_hash = checksum_mode != ContentChecksum::None
1501 && state.frame_header.descriptor.content_checksum_flag();
1502 state.decoder_scratch.set_compute_hash(compute_hash);
1503 }
1504
1505 // Streaming entry point: pre-reserve the backing buffer to
1506 // the FCS-capped window so multi-block frames don't pay repeated
1507 // `reserve_amortized` grow steps (128 KiB → 256 KiB → ... →
1508 // window) as blocks accumulate. `decode_all` does the same up
1509 // front in `decode_all_impl`; this mirrors it for callers
1510 // driving `decode_blocks` directly. Idempotent — the
1511 // backend's `reserve` early-returns when capacity is already
1512 // sufficient.
1513 let useful_window = state.useful_window_size();
1514 state.decoder_scratch.reserve_buffer(useful_window);
1515
1516 let mut block_dec = decoding::block_decoder::new();
1517
1518 let buffer_size_before = state.decoder_scratch.buffer_len();
1519 let block_counter_before = state.block_counter;
1520 loop {
1521 vprintln!("################");
1522 vprintln!("Next Block: {}", state.block_counter);
1523 vprintln!("################");
1524 // Capture the failing-block coordinates BEFORE the header read so
1525 // the error carries where it happened: `bytes_read_counter` is the
1526 // frame-absolute offset of this block's header (not yet advanced),
1527 // `block_counter` its 0-based index. Used by both the header- and
1528 // body-error builders below (block-precise recovery under `lsm`).
1529 let block_index = state.block_counter as u32;
1530 let block_frame_offset = state.bytes_read_counter as u32;
1531 let (block_header, block_header_size) =
1532 block_dec.read_block_header(&mut source).map_err(|source| {
1533 block_header_decode_error(source, block_index, block_frame_offset)
1534 })?;
1535 state.bytes_read_counter += u64::from(block_header_size);
1536
1537 vprintln!();
1538 vprintln!(
1539 "Found {} block with size: {}, which will be of size: {}",
1540 block_header.block_type,
1541 block_header.content_size,
1542 block_header.decompressed_size
1543 );
1544
1545 #[cfg(all(feature = "lsm", feature = "hash"))]
1546 let len_before_block: Option<usize> = if self.per_block_checksums_enabled {
1547 Some(state.decoder_scratch.buffer_len())
1548 } else {
1549 None
1550 };
1551 let bytes_read_in_block_body = state
1552 .decoder_scratch
1553 .decode_block_content(&mut block_dec, &block_header, &mut source)
1554 .map_err(|source| {
1555 block_body_decode_error(
1556 source,
1557 block_index,
1558 block_frame_offset,
1559 &block_header,
1560 block_header_size,
1561 )
1562 })?;
1563 state.bytes_read_counter += bytes_read_in_block_body;
1564
1565 // Per-block XXH64 (low 32 bits) of the just-decompressed
1566 // bytes. Hashed from `last_n_as_slices` so RingBuffer wrap
1567 // is handled in-place, no extra copy.
1568 #[cfg(all(feature = "lsm", feature = "hash"))]
1569 if let Some(len_before_block) = len_before_block {
1570 let added = state.decoder_scratch.buffer_len() - len_before_block;
1571 let (s1, s2) = state.decoder_scratch.last_n_as_slices(added);
1572 let mut h = twox_hash::XxHash64::with_seed(0);
1573 use core::hash::Hasher;
1574 h.write(s1);
1575 h.write(s2);
1576 self.computed_block_checksums.push(h.finish() as u32);
1577 }
1578
1579 state.block_counter += 1;
1580
1581 vprintln!("Output: {}", state.decoder_scratch.buffer_len());
1582
1583 if block_header.last_block {
1584 state.frame_finished = true;
1585 if state.frame_header.descriptor.content_checksum_flag() {
1586 let mut chksum = [0u8; 4];
1587 source
1588 .read_exact(&mut chksum)
1589 .map_err(err::FailedToReadChecksum)?;
1590 state.bytes_read_counter += 4;
1591 let chksum = u32::from_le_bytes(chksum);
1592 state.check_sum = Some(chksum);
1593 }
1594 break;
1595 }
1596
1597 match strat {
1598 BlockDecodingStrategy::All => { /* keep going */ }
1599 BlockDecodingStrategy::UptoBlocks(n) => {
1600 if state.block_counter - block_counter_before >= n {
1601 break;
1602 }
1603 }
1604 BlockDecodingStrategy::UptoBytes(n) => {
1605 if state.decoder_scratch.buffer_len() - buffer_size_before >= n {
1606 break;
1607 }
1608 }
1609 }
1610 }
1611
1612 Ok(state.frame_finished)
1613 }
1614
1615 /// Decode the inner blocks `[start_block, end_block)` of the current
1616 /// frame and return their decompressed bytes as one contiguous buffer.
1617 ///
1618 /// Serves two consumer needs with one call:
1619 ///
1620 /// - **Range-query performance:** decode only the inner zstd blocks that
1621 /// cover a key range instead of the whole frame. Blocks before
1622 /// `start_block` are decoded into the window (zstd blocks share one
1623 /// window, so a leading block's bytes may be the match source for an
1624 /// in-range block and cannot simply be skipped) but their output is not
1625 /// returned; blocks at or after `end_block` are not decoded at all,
1626 /// which is the trailing-block work saving. Map a decompressed byte
1627 /// offset to a block index with
1628 /// [`FrameEmitInfo::decompressed_byte_range`].
1629 /// - **Best-effort recovery:** if a block decode fails, decoding stops,
1630 /// the clean prefix of in-range output is preserved in
1631 /// [`PartialDecode::data`], and the failure is reported via
1632 /// [`PartialDecode::stopped_at`]. Passing `(0, u32::MAX)` decodes the
1633 /// whole frame, stopping at the first corrupt block (pure recovery).
1634 ///
1635 /// `end_block` is exclusive; pass `u32::MAX` to decode to the end of the
1636 /// frame. Call on a freshly [`reset`](Self::reset) decoder (it decodes
1637 /// from the frame's first block).
1638 ///
1639 /// # Resume (cold incremental / top-up)
1640 ///
1641 /// A plain call drains its in-range output from the match window on return,
1642 /// so two consecutive calls cannot resume one another and growing a decoded
1643 /// extent would mean re-decoding the covering prefix from block 0
1644 /// (`O(extent)` per growth, `O(N²)` for a forward walk). The `resume` /
1645 /// `emit_resume` arguments make a symmetric one-call grow-loop possible:
1646 ///
1647 /// - `emit_resume = true` captures the cross-block carry-over state (entropy
1648 /// tables + repcode history + the next block index / output offset) into
1649 /// [`PartialDecode::resume_state`]. The entropy-table snapshot clone is
1650 /// only paid when this is set. The snapshot is `None` when the decode
1651 /// reaches the frame's last block ([`PartialDecode::frame_finished`]):
1652 /// there is no following block to resume from, so an incremental walk
1653 /// stops on `frame_finished` rather than on a `None` snapshot.
1654 /// - `resume = Some(`[`ResumeInput`]`)` continues from a previously emitted
1655 /// [`ResumeState`] WITHOUT re-decompressing the preceding blocks: the
1656 /// match window is primed from [`ResumeInput::window_prime`] and the
1657 /// entropy/repcode tables are restored from the state, so a `Repeat_Mode`
1658 /// resume block resolves byte-identically to a contiguous decode — even
1659 /// across a dropped (cold) decoder.
1660 ///
1661 /// When `resume` is `Some`, decoding resumes at
1662 /// [`ResumeState::block_index`] and the `start_block` argument is ignored
1663 /// (pass `resume.state.block_index()`); position `source` at that block's
1664 /// compressed frame offset
1665 /// ([`FrameEmitInfo::blocks`]`[block_index].offset_in_frame`). After a
1666 /// resumed call, [`bytes_read_from_source`](Self::bytes_read_from_source)
1667 /// and any `stopped_at` offsets are relative to the repositioned `source`.
1668 ///
1669 /// **Dictionaries:** [`ResumeState`] does NOT carry the dictionary content.
1670 /// For a dictionary frame, attach the dictionary to the resuming decoder the
1671 /// same way as for a fresh decode — [`reset`](Self::reset) with the
1672 /// dictionary registered (or
1673 /// [`reset_with_dict_handle`](Self::reset_with_dict_handle)) BEFORE this
1674 /// call — so dict-sourced matches near the frame start resolve. The caller
1675 /// already holds the dictionary (it supplied it at encode time), so
1676 /// re-supplying it on resume is free; storing it in the snapshot would only
1677 /// duplicate it. The resume guard records the applied dictionary's identity
1678 /// and rejects ([`FrameDecoderError::ResumeFrameMismatch`]) a resume whose
1679 /// active dictionary differs from the one the snapshot was captured under.
1680 ///
1681 /// # Errors
1682 ///
1683 /// Returns [`FrameDecoderError::NotYetInitialized`] if the decoder has not
1684 /// been reset, [`FrameDecoderError::InvalidBlockRange`] if the effective
1685 /// start exceeds `end_block`, [`FrameDecoderError::ResumeWindowTooShort`]
1686 /// if `resume`'s `window_prime` is shorter than the match window the resume
1687 /// block can reach back into (`min(window_size, output_offset)`), and
1688 /// [`FrameDecoderError::ResumeFrameMismatch`] if the snapshot was captured
1689 /// from a frame with a different decode shape / dictionary, or (with the
1690 /// `hash` feature) a `window_prime` whose content does not match what was
1691 /// captured — all rejected up front rather than silently mis-resolving
1692 /// matches. A corrupt block is NOT an `Err` here: it is reported via
1693 /// [`PartialDecode::stopped_at`] so the clean prefix survives.
1694 ///
1695 /// [`FrameEmitInfo::decompressed_byte_range`]: crate::encoding::frame_emit_info::FrameEmitInfo::decompressed_byte_range
1696 /// [`FrameEmitInfo::blocks`]: crate::encoding::frame_emit_info::FrameEmitInfo::blocks
1697 #[cfg(feature = "lsm")]
1698 #[cfg_attr(docsrs, doc(cfg(feature = "lsm")))]
1699 pub fn decode_blocks_partial(
1700 &mut self,
1701 mut source: impl Read,
1702 start_block: u32,
1703 end_block: u32,
1704 resume: Option<ResumeInput<'_>>,
1705 emit_resume: bool,
1706 ) -> Result<PartialDecode, FrameDecoderError> {
1707 use FrameDecoderError as err;
1708 #[cfg(feature = "hash")]
1709 let checksum_mode = self.content_checksum;
1710 let magicless = self.magicless;
1711 let state = self.state.as_mut().ok_or(err::NotYetInitialized)?;
1712
1713 // Honor the checksum mode before any drain/read can hash: `None` must
1714 // compute no XXH64. `decode_blocks` sets this; the partial path must too,
1715 // or a reused scratch keeps hashing with the default-enabled state.
1716 #[cfg(feature = "hash")]
1717 {
1718 let compute_hash = checksum_mode != ContentChecksum::None
1719 && state.frame_header.descriptor.content_checksum_flag();
1720 state.decoder_scratch.set_compute_hash(compute_hash);
1721 }
1722
1723 // Mirror `decode_blocks`: pre-reserve the backing buffer to the
1724 // FCS-capped window so multi-block frames don't pay repeated grow
1725 // steps. The RAW frame window stays separately bound — the resume
1726 // logic below bounds match reach by the frame's window semantics,
1727 // not by the (possibly smaller) reservation cap.
1728 let window_size = state.frame_header.window_size().unwrap_or(0) as usize;
1729 let useful_window = state.useful_window_size();
1730 state.decoder_scratch.reserve_buffer(useful_window);
1731
1732 // Cold resume: prime the match window + restore entropy/repcode state +
1733 // advance the block cursor BEFORE the loop, so the first in-range block
1734 // resolves its matches and `Repeat_Mode` tables against the caller's
1735 // persisted state instead of re-decoded prefix blocks. The effective
1736 // start is the resume state's block index (the passed `start_block` is
1737 // ignored in resume mode, per the doc).
1738 let effective_start = if let Some(r) = resume {
1739 // Reject a snapshot captured from a different frame shape BEFORE
1740 // touching any decoder state: restoring entropy/repcode tables that
1741 // belong to another frame would silently produce byte-wrong output.
1742 let current_key = FrameKey::from_state(state, magicless);
1743 if current_key != r.state.frame_key {
1744 return Err(err::ResumeFrameMismatch);
1745 }
1746 let output_offset = r.state.output_offset;
1747 // The window the resume block can reach back into is bounded by the
1748 // smaller of the frame's window_size and the bytes produced so far.
1749 let required = core::cmp::min(window_size as u64, output_offset) as usize;
1750 if r.window_prime.len() < required {
1751 return Err(err::ResumeWindowTooShort {
1752 got: r.window_prime.len(),
1753 need: required,
1754 });
1755 }
1756 // Only the most recent `window_size` bytes can ever back a match
1757 // (offset <= window_size by the frame invariant); load just those
1758 // even if the caller handed us a longer prefix, bounding resume
1759 // memory to one window regardless of the skipped prefix's size.
1760 let prime = if r.window_prime.len() > window_size {
1761 &r.window_prime[r.window_prime.len() - window_size..]
1762 } else {
1763 r.window_prime
1764 };
1765 // Content-exact identity: the primed window must hash to what was
1766 // captured at emit. Catches a same-shape-but-different-frame
1767 // snapshot and a wrong/corrupted window_prime (which FrameKey alone
1768 // cannot), before any state is restored. O(window) one-time per
1769 // resume — negligible next to the decode it guards.
1770 #[cfg(feature = "hash")]
1771 if xxh64_of(prime) != r.state.window_hash {
1772 return Err(err::ResumeFrameMismatch);
1773 }
1774 // Validate the effective range (resume mode begins at the resume
1775 // block, ignoring the caller's `start_block`) BEFORE mutating the
1776 // decoder: an inverted `end_block` must fail without priming the
1777 // window / entropy or advancing the cursor, leaving the decoder
1778 // re-resettable rather than in a half-resumed state.
1779 let effective_start = r.state.block_index;
1780 if effective_start > end_block {
1781 return Err(err::InvalidBlockRange {
1782 start_block: effective_start,
1783 end_block,
1784 });
1785 }
1786 state.decoder_scratch.restore_entropy(r.state);
1787 state.decoder_scratch.prime_window(prime, output_offset);
1788 state.block_counter = effective_start as usize;
1789 // The caller repositions `source` to the resume block; report
1790 // consumed bytes relative to that point (reset left this at the
1791 // frame-header size).
1792 state.bytes_read_counter = 0;
1793 effective_start
1794 } else {
1795 // Fresh decode: validate the caller's range (no state to mutate).
1796 if start_block > end_block {
1797 return Err(err::InvalidBlockRange {
1798 start_block,
1799 end_block,
1800 });
1801 }
1802 start_block
1803 };
1804
1805 let mut block_dec = decoding::block_decoder::new();
1806
1807 // Bytes of prefix-window output that physically precede the first
1808 // in-range block in the buffer. Captured at the prefix → in-range
1809 // transition (after leading blocks were dropped to the window) so we
1810 // can discard exactly those bytes once decoding is done. `None` until
1811 // the first in-range block is reached.
1812 let mut prefix_window_len: Option<usize> = None;
1813 // Exact count of clean in-range decompressed bytes (sum of per-block
1814 // length deltas of the in-range blocks that succeeded). Any partial
1815 // bytes of a failing in-range block are excluded — the fused executor
1816 // rolls the buffer back to the pre-block checkpoint on a sequence
1817 // error, and anything left over is never counted here, so it is not
1818 // drained into `data`.
1819 let mut subset_bytes: u64 = 0;
1820 let mut blocks_decoded: u32 = 0;
1821 let mut stopped_at: Option<(u32, FrameDecoderError)> = None;
1822
1823 loop {
1824 let block_index = state.block_counter as u32;
1825 // Stop before decoding `end_block`: the trailing blocks are never
1826 // touched (the perf win), and the frame's tail is left unread.
1827 if block_index >= end_block || state.frame_finished {
1828 break;
1829 }
1830 let in_range = block_index >= effective_start;
1831 // Snapshot the window length at the prefix → in-range boundary.
1832 if in_range && prefix_window_len.is_none() {
1833 prefix_window_len = Some(state.decoder_scratch.buffer_len());
1834 }
1835
1836 let block_frame_offset = state.bytes_read_counter as u32;
1837 let (block_header, block_header_size) = match block_dec.read_block_header(&mut source) {
1838 Ok(v) => v,
1839 Err(e) => {
1840 stopped_at = Some((
1841 block_index,
1842 block_header_decode_error(e, block_index, block_frame_offset),
1843 ));
1844 break;
1845 }
1846 };
1847 state.bytes_read_counter += u64::from(block_header_size);
1848
1849 let len_before = state.decoder_scratch.buffer_len();
1850 match state.decoder_scratch.decode_block_content(
1851 &mut block_dec,
1852 &block_header,
1853 &mut source,
1854 ) {
1855 Ok(body_read) => state.bytes_read_counter += body_read,
1856 Err(e) => {
1857 stopped_at = Some((
1858 block_index,
1859 block_body_decode_error(
1860 e,
1861 block_index,
1862 block_frame_offset,
1863 &block_header,
1864 block_header_size,
1865 ),
1866 ));
1867 break;
1868 }
1869 }
1870 let produced = state.decoder_scratch.buffer_len() - len_before;
1871 // Per-block XXH64 capture, mirroring `decode_blocks`: hash this
1872 // block's just-decoded bytes BEFORE any window drop so the digest
1873 // count stays 1:1 with the blocks decoded on this path too. Covers
1874 // context (out-of-range) blocks as well, matching `decode_blocks`
1875 // which hashes every block it decodes.
1876 #[cfg(all(feature = "lsm", feature = "hash"))]
1877 if self.per_block_checksums_enabled {
1878 use core::hash::Hasher;
1879 let (s1, s2) = state.decoder_scratch.last_n_as_slices(produced);
1880 let mut h = twox_hash::XxHash64::with_seed(0);
1881 h.write(s1);
1882 h.write(s2);
1883 self.computed_block_checksums.push(h.finish() as u32);
1884 }
1885 state.block_counter += 1;
1886 if in_range {
1887 subset_bytes += produced as u64;
1888 blocks_decoded += 1;
1889 }
1890
1891 if block_header.last_block {
1892 state.frame_finished = true;
1893 if state.frame_header.descriptor.content_checksum_flag() {
1894 let mut chksum = [0u8; 4];
1895 match source.read_exact(&mut chksum) {
1896 Ok(()) => {
1897 state.bytes_read_counter += 4;
1898 state.check_sum = Some(u32::from_le_bytes(chksum));
1899 }
1900 // A trailing-checksum read failure does not invalidate
1901 // the decoded bytes; surface it so the caller knows the
1902 // frame tail was truncated, but keep `data`.
1903 Err(e) => {
1904 stopped_at = Some((block_index, err::FailedToReadChecksum(e)));
1905 }
1906 }
1907 }
1908 break;
1909 }
1910
1911 // Leading (out-of-range) block: bound memory to the window. We
1912 // must NOT drop once in-range, or the in-range output we are about
1913 // to return would be discarded.
1914 if !in_range {
1915 state.decoder_scratch.buffer_drop_to_window_size();
1916 }
1917 }
1918
1919 // Emit cross-block carry-over state for a later resume, if requested.
1920 // Captured AFTER the loop (entropy tables / repcode history are final)
1921 // but BEFORE the drain — the drain only touches the visible output, not
1922 // the entropy state or `total_output_counter`. `block_counter` /
1923 // `total_output()` give the resume coordinates: the next block to decode
1924 // and the cumulative decompressed offset before it (clean even after an
1925 // early stop, since a failed block rolls both back to its checkpoint).
1926 // Suppress the snapshot on the terminal block: `block_counter` is then
1927 // one past the last block (EOF), for which there is no next-block source
1928 // position to resume from. A resume needs a real following block.
1929 let resume_state = if emit_resume && !state.frame_finished {
1930 let (fse, huf, offset_hist) = state.decoder_scratch.export_entropy();
1931 Some(ResumeState {
1932 frame_key: FrameKey::from_state(state, magicless),
1933 block_index: state.block_counter as u32,
1934 output_offset: state.decoder_scratch.total_output(),
1935 fse,
1936 huf,
1937 offset_hist,
1938 #[cfg(feature = "hash")]
1939 window_hash: state.decoder_scratch.window_tail_hash(window_size),
1940 })
1941 } else {
1942 None
1943 };
1944
1945 // The visible buffer is now `[prefix window][in-range clean][maybe
1946 // trailing garbage from a failed in-range block]`. Drop the prefix
1947 // window from the front (match resolution is complete, so it is no
1948 // longer needed), then drain exactly the clean in-range byte count.
1949 let w = prefix_window_len.unwrap_or(0);
1950 state.decoder_scratch.buffer_discard_front(w);
1951 let mut data = alloc::vec![0u8; subset_bytes as usize];
1952 state
1953 .decoder_scratch
1954 .buffer_read_all(&mut data)
1955 .map_err(err::FailedToDrainDecodebuffer)?;
1956
1957 // Clear anything still buffered so a later `read()`/`collect()` on this
1958 // decoder cannot surface out-of-range bytes: the leading-block window
1959 // when no in-range block was reached (`prefix_window_len` stayed
1960 // `None`, so `w` was 0), or trailing garbage from a failed in-range
1961 // block. Only the returned `data` is the partial decode's output.
1962 let residual = state.decoder_scratch.buffer_len();
1963 state.decoder_scratch.buffer_discard_front(residual);
1964
1965 Ok(PartialDecode {
1966 data,
1967 start_block: effective_start,
1968 blocks_decoded,
1969 stopped_at,
1970 frame_finished: state.frame_finished,
1971 resume_state,
1972 })
1973 }
1974
1975 /// Collect bytes and retain window_size bytes while decoding is still going on.
1976 /// After decoding of the frame (is_finished() == true) has finished it will collect all remaining bytes
1977 pub fn collect(&mut self) -> Option<Vec<u8>> {
1978 let finished = self.is_finished();
1979 let state = self.state.as_mut()?;
1980 if finished {
1981 Some(state.decoder_scratch.buffer_drain())
1982 } else {
1983 state.decoder_scratch.buffer_drain_to_window_size()
1984 }
1985 }
1986
1987 /// Collect bytes and retain window_size bytes while decoding is still going on.
1988 /// After decoding of the frame (is_finished() == true) has finished it will collect all remaining bytes
1989 pub fn collect_to_writer(&mut self, w: impl Write) -> Result<usize, Error> {
1990 let finished = self.is_finished();
1991 let state = match &mut self.state {
1992 None => return Ok(0),
1993 Some(s) => s,
1994 };
1995 if finished {
1996 state.decoder_scratch.buffer_drain_to_writer(w)
1997 } else {
1998 state.decoder_scratch.buffer_drain_to_window_size_writer(w)
1999 }
2000 }
2001
2002 /// How many bytes can currently be collected from the decodebuffer, while decoding is going on this will be lower than the actual decodbuffer size
2003 /// because window_size bytes need to be retained for decoding.
2004 /// After decoding of the frame (is_finished() == true) has finished it will report all remaining bytes
2005 pub fn can_collect(&self) -> usize {
2006 let finished = self.is_finished();
2007 let state = match &self.state {
2008 None => return 0,
2009 Some(s) => s,
2010 };
2011 if finished {
2012 state.decoder_scratch.buffer_can_drain()
2013 } else {
2014 state
2015 .decoder_scratch
2016 .buffer_can_drain_to_window_size()
2017 .unwrap_or(0)
2018 }
2019 }
2020
2021 /// Decodes as many blocks as possible from the source slice and reads from the decodebuffer into the target slice
2022 /// The source slice may contain only parts of a frame but must contain at least one full block to make progress
2023 ///
2024 /// By all means use decode_blocks if you have a io.Reader available. This is just for compatibility with other decompressors
2025 /// which try to serve an old-style c api
2026 ///
2027 /// Returns (read, written), if read == 0 then the source did not contain a full block and further calls with the same
2028 /// input will not make any progress!
2029 ///
2030 /// Note that no kind of block can be bigger than 128kb.
2031 /// So to be safe use at least 128*1024 (max block content size) + 3 (block_header size) + 18 (max frame_header size) bytes as your source buffer
2032 ///
2033 /// You may call this function with an empty source after all bytes have been decoded. This is equivalent to just call decoder.read(&mut target)
2034 pub fn decode_from_to(
2035 &mut self,
2036 source: &[u8],
2037 target: &mut [u8],
2038 ) -> Result<(usize, usize), FrameDecoderError> {
2039 use FrameDecoderError as err;
2040 let bytes_read_at_start = match &self.state {
2041 Some(s) => s.bytes_read_counter,
2042 None => 0,
2043 };
2044
2045 if !self.is_finished() || self.state.is_none() {
2046 let mut mt_source = source;
2047
2048 if self.state.is_none() {
2049 self.init(&mut mt_source)?;
2050 }
2051
2052 //pseudo block to scope "state" so we can borrow self again after the block
2053 {
2054 let state = match &mut self.state {
2055 Some(s) => s,
2056 None => panic!("Bug in library"),
2057 };
2058 let mut block_dec = decoding::block_decoder::new();
2059
2060 // Honour the content-checksum mode on this hand-rolled decode
2061 // loop (it does not go through `decode_blocks`): hash only when
2062 // a digest is wanted and the frame carries one. `None` skips the
2063 // XXH64 pass; verification happens after the final drain below.
2064 #[cfg(feature = "hash")]
2065 {
2066 let compute_hash = self.content_checksum != ContentChecksum::None
2067 && state.frame_header.descriptor.content_checksum_flag();
2068 state.decoder_scratch.set_compute_hash(compute_hash);
2069 }
2070
2071 if state.frame_header.descriptor.content_checksum_flag()
2072 && state.frame_finished
2073 && state.check_sum.is_none()
2074 {
2075 // The trailing checksum arrived on a separate call (the last
2076 // block finished earlier). Consume it and fall through to the
2077 // shared `self.read` + post-drain verify below — NOT an early
2078 // return — so any output still buffered from a prior
2079 // small-`target` call is flushed on this call too, and the
2080 // checksum is verified through the one shared path.
2081 if mt_source.len() >= 4 {
2082 let chksum = mt_source[..4].try_into().expect("optimized away");
2083 state.bytes_read_counter += 4;
2084 let chksum = u32::from_le_bytes(chksum);
2085 state.check_sum = Some(chksum);
2086 mt_source = &mt_source[4..];
2087 }
2088 }
2089
2090 loop {
2091 // The frame is fully decoded (last block seen, trailer
2092 // consumed above); no more blocks to read. Any leftover
2093 // bytes are not a block header — stop before misreading them.
2094 if state.frame_finished {
2095 break;
2096 }
2097 //check if there are enough bytes for the next header
2098 if mt_source.len() < 3 {
2099 break;
2100 }
2101 let block_index = state.block_counter as u32;
2102 let block_frame_offset = state.bytes_read_counter as u32;
2103 let (block_header, block_header_size) = block_dec
2104 .read_block_header(&mut mt_source)
2105 .map_err(|source| {
2106 block_header_decode_error(source, block_index, block_frame_offset)
2107 })?;
2108
2109 // check the needed size for the block before updating counters.
2110 // If not enough bytes are in the source, the header will have to be read again, so act like we never read it in the first place
2111 if mt_source.len() < block_header.content_size as usize {
2112 break;
2113 }
2114 state.bytes_read_counter += u64::from(block_header_size);
2115
2116 let bytes_read_in_block_body = state
2117 .decoder_scratch
2118 .decode_block_content(&mut block_dec, &block_header, &mut mt_source)
2119 .map_err(|source| {
2120 block_body_decode_error(
2121 source,
2122 block_index,
2123 block_frame_offset,
2124 &block_header,
2125 block_header_size,
2126 )
2127 })?;
2128 state.bytes_read_counter += bytes_read_in_block_body;
2129 state.block_counter += 1;
2130
2131 if block_header.last_block {
2132 state.frame_finished = true;
2133 if state.frame_header.descriptor.content_checksum_flag() {
2134 //if there are enough bytes handle this here. Else the block at the start of this function will handle it at the next call
2135 if mt_source.len() >= 4 {
2136 let chksum = mt_source[..4].try_into().expect("optimized away");
2137 state.bytes_read_counter += 4;
2138 let chksum = u32::from_le_bytes(chksum);
2139 state.check_sum = Some(chksum);
2140 }
2141 }
2142 break;
2143 }
2144 }
2145 }
2146 }
2147
2148 let result_len = self.read(target).map_err(err::FailedToDrainDecodebuffer)?;
2149 // Once the frame is fully decoded and drained, the running digest is
2150 // final: validate it in `Verify` mode (no-op otherwise). Same finish
2151 // point as the streaming reader.
2152 #[cfg(feature = "hash")]
2153 if self.is_finished() && self.can_collect() == 0 {
2154 self.verify_content_checksum()?;
2155 }
2156 let bytes_read_at_end = match &mut self.state {
2157 Some(s) => s.bytes_read_counter,
2158 None => panic!("Bug in library"),
2159 };
2160 let read_len = bytes_read_at_end - bytes_read_at_start;
2161 Ok((read_len as usize, result_len))
2162 }
2163
2164 /// Decode multiple frames into the output slice.
2165 ///
2166 /// `input` must contain an exact number of frames. Skippable frames are allowed and will be
2167 /// skipped during decode.
2168 ///
2169 /// `output` must be large enough to hold the decompressed data. If you don't know
2170 /// how large the output will be, use [`FrameDecoder::decode_blocks`] instead.
2171 ///
2172 /// This calls [`FrameDecoder::init`], and all bytes currently in the decoder will be lost.
2173 ///
2174 /// Returns the number of bytes written to `output`.
2175 pub fn decode_all(
2176 &mut self,
2177 input: &[u8],
2178 output: &mut [u8],
2179 ) -> Result<usize, FrameDecoderError> {
2180 #[cfg(not(feature = "lsm"))]
2181 {
2182 self.decode_all_impl(input, output, |this, src| this.init(src))
2183 }
2184 #[cfg(feature = "lsm")]
2185 {
2186 self.decode_all_impl(input, output, |this, src| this.init(src), None)
2187 }
2188 }
2189
2190 /// Decode multiple frames into the output slice, invoking `visitor`
2191 /// for every skippable frame encountered before advancing past it.
2192 ///
2193 /// `input` must contain an exact number of frames. Skippable frames
2194 /// (RFC 8878 §3.1.2 magic numbers `0x184D2A50..=0x184D2A5F`) are
2195 /// allowed and will be both visited AND skipped: the visitor gets
2196 /// `(magic_variant, payload)` where `magic_variant` is the low
2197 /// nibble of the magic (`magic - 0x184D2A50`, range `0..=15`) and
2198 /// `payload` is a borrowed slice of the on-wire payload bytes (the
2199 /// skippable frame's `Frame_Size` field worth of data) into
2200 /// `input` — no allocation.
2201 ///
2202 /// The visitor sees skippable frames in stream order; interleaved
2203 /// regular zstd frames continue to decompress into `output` exactly
2204 /// as `decode_all` does.
2205 ///
2206 /// `output` must be large enough to hold the decompressed data.
2207 /// Returns the number of bytes written to `output`.
2208 ///
2209 /// # Example
2210 ///
2211 /// ```ignore
2212 /// use structured_zstd::decoding::FrameDecoder;
2213 ///
2214 /// let mut decoder = FrameDecoder::new();
2215 /// let mut output = vec![0u8; 1024];
2216 /// let mut collected: Vec<(u8, Vec<u8>)> = Vec::new();
2217 /// let n = decoder.decode_all_with_skippable_visitor(
2218 /// input,
2219 /// &mut output,
2220 /// |variant, payload| collected.push((variant, payload.to_vec())),
2221 /// )?;
2222 /// ```
2223 #[cfg(feature = "lsm")]
2224 #[cfg_attr(docsrs, doc(cfg(feature = "lsm")))]
2225 pub fn decode_all_with_skippable_visitor<F>(
2226 &mut self,
2227 input: &[u8],
2228 output: &mut [u8],
2229 mut visitor: F,
2230 ) -> Result<usize, FrameDecoderError>
2231 where
2232 F: FnMut(u8, &[u8]),
2233 {
2234 self.decode_all_impl(
2235 input,
2236 output,
2237 |this, src| this.init(src),
2238 Some(&mut visitor),
2239 )
2240 }
2241
2242 /// Decode multiple frames into the output slice using a pre-parsed dictionary handle.
2243 ///
2244 /// `input` must contain an exact number of frames. Skippable frames are allowed and will be
2245 /// skipped during decode.
2246 ///
2247 /// `output` must be large enough to hold the decompressed data. If you don't know
2248 /// how large the output will be, use [`FrameDecoder::decode_blocks`] instead.
2249 ///
2250 /// This calls [`FrameDecoder::init_with_dict_handle`], and all bytes currently in the
2251 /// decoder will be lost.
2252 ///
2253 /// # Warning
2254 ///
2255 /// Each decoded frame is initialized with `dict`, even when a frame header
2256 /// omits the optional dictionary ID. Callers must only use this API when
2257 /// they already know the input frames were encoded with the provided
2258 /// dictionary; otherwise decoded output can be silently corrupted.
2259 pub fn decode_all_with_dict_handle(
2260 &mut self,
2261 input: &[u8],
2262 output: &mut [u8],
2263 dict: &DictionaryHandle,
2264 ) -> Result<usize, FrameDecoderError> {
2265 #[cfg(not(feature = "lsm"))]
2266 {
2267 self.decode_all_impl(input, output, |this, src| {
2268 this.init_with_dict_handle(src, dict)
2269 })
2270 }
2271 #[cfg(feature = "lsm")]
2272 {
2273 self.decode_all_impl(
2274 input,
2275 output,
2276 |this, src| this.init_with_dict_handle(src, dict),
2277 None,
2278 )
2279 }
2280 }
2281
2282 /// Default-feature decode_all_impl: no visitor parameter so the
2283 /// no-lsm build's call surface and codegen are byte-identical to
2284 /// the pre-#172 implementation. Compiles only when `lsm` is OFF.
2285 #[cfg(not(feature = "lsm"))]
2286 fn decode_all_impl(
2287 &mut self,
2288 mut input: &[u8],
2289 mut output: &mut [u8],
2290 mut init_frame: impl FnMut(&mut Self, &mut &[u8]) -> Result<(), FrameDecoderError>,
2291 ) -> Result<usize, FrameDecoderError> {
2292 let mut total_bytes_written = 0;
2293 while !input.is_empty() {
2294 match init_frame(self, &mut input) {
2295 Ok(_) => {}
2296 Err(FrameDecoderError::ReadFrameHeaderError(
2297 crate::decoding::errors::ReadFrameHeaderError::SkipFrame { length, .. },
2298 )) => {
2299 input = input
2300 .get(length as usize..)
2301 .ok_or(FrameDecoderError::FailedToSkipFrame)?;
2302 continue;
2303 }
2304 Err(e) => return Err(e),
2305 };
2306 // Per-frame direct-path dispatch. Now safe to route the
2307 // public `decode_all` here because
2308 // `UserSliceBackend::exec_sequence_inline` returns
2309 // `Result<(), ExecuteSequencesError>` instead of
2310 // panicking on capacity overflow; the error propagates
2311 // up as `FrameDecoderError`. Eligibility (FCS > 0,
2312 // remaining `output` slice holds the declared content)
2313 // puts the frame on the fast path that bypasses the
2314 // FlatBuf/Ring -> `read()` drain copy. Ineligible frames
2315 // (no FCS, output too small) fall through to the legacy
2316 // `decode_blocks` + `read` drain loop below. Dictionary
2317 // frames are eligible: `run_direct_decode` hands the
2318 // shared dict handle to its buffer, and beyond-prefix
2319 // offsets resolve through `repeat_from_dict`.
2320 let (content_size, fcs_declared) = {
2321 let state_ref = self.state.as_ref().expect("init populated state");
2322 (
2323 state_ref.frame_header.frame_content_size(),
2324 state_ref.frame_header.fcs_declared(),
2325 )
2326 };
2327 // Direct decode requires only that the caller slice holds the
2328 // declared content; the inline sequence-exec path no longer
2329 // needs `WILDCOPY_OVERLENGTH` trailing slack because the
2330 // trailing sequence(s) take the bounded (non-overshooting)
2331 // copy in `UserSliceBackend::exec_sequence_bounded`. This is
2332 // the universal "decode into an FCS-sized buffer" case (a
2333 // caller sizing `output` to exactly `frame_content_size`),
2334 // so dropping the slack requirement halves its peak alloc.
2335 //
2336 // Per-block checksums collected inside `run_direct_decode`
2337 // post-loop (over recorded (start, end) ranges of `output`)
2338 // so the direct path stays eligible AND keeps the
2339 // window-size cap (`drop_to_window_size`) between blocks
2340 // that the spec relies on for `offset <= window_size`
2341 // validation. Path choice no longer alters checksum
2342 // semantics.
2343 let direct_eligible = content_size > 0 && (output.len() as u64) >= content_size;
2344 if direct_eligible {
2345 let written = self.run_direct_decode(&mut input, output, content_size)?;
2346 output = &mut output[written..];
2347 total_bytes_written += written;
2348 // Per-frame content-checksum verification (no-op unless the
2349 // mode is `Verify` and the frame carries a checksum).
2350 #[cfg(feature = "hash")]
2351 self.verify_content_checksum()?;
2352 continue;
2353 }
2354 // Non-direct fallback: pre-reserve the backing buffer to
2355 // `window_size` in a single allocation before block decode
2356 // starts, so multi-segment frames don't pay repeated
2357 // `reserve_amortized` grow steps as blocks accumulate (each
2358 // block only reserves MAX_BLOCK_SIZE = 128 KiB, so a window
2359 // > 128 KiB otherwise grows through several intermediate
2360 // sizes with `alloc_zeroed + memcpy` each time).
2361 if let Some(state) = self.state.as_mut() {
2362 // FCS-capped via `useful_window_size` — the same cap
2363 // `decode_blocks` applies, so its per-iteration reserve in
2364 // the loop below cannot grow the buffer back to the raw
2365 // frame window.
2366 let useful_window = state.useful_window_size();
2367 state.decoder_scratch.reserve_buffer(useful_window);
2368 }
2369 let frame_start_total = total_bytes_written;
2370 loop {
2371 self.decode_blocks(&mut input, BlockDecodingStrategy::UptoBytes(1024 * 1024))?;
2372 let bytes_written = self
2373 .read(output)
2374 .map_err(FrameDecoderError::FailedToDrainDecodebuffer)?;
2375 output = &mut output[bytes_written..];
2376 total_bytes_written += bytes_written;
2377 if self.can_collect() != 0 {
2378 return Err(FrameDecoderError::TargetTooSmall);
2379 }
2380 if self.is_finished() {
2381 break;
2382 }
2383 }
2384 // Per-frame FCS validation on the legacy fallback path.
2385 // Use `fcs_declared()` (NOT `content_size > 0`) so an
2386 // empty frame with explicit FCS=0 on the wire still gets
2387 // validated.
2388 if fcs_declared {
2389 let produced = (total_bytes_written - frame_start_total) as u64;
2390 if produced != content_size {
2391 return Err(FrameDecoderError::FrameContentSizeMismatch {
2392 declared: content_size,
2393 produced,
2394 });
2395 }
2396 }
2397 // Per-frame content-checksum verification on the drain path: the
2398 // frame is fully decoded and drained here (is_finished + nothing
2399 // left to collect), so the running digest and stored value are
2400 // final. No-op unless the mode is `Verify`.
2401 #[cfg(feature = "hash")]
2402 self.verify_content_checksum()?;
2403 }
2404
2405 Ok(total_bytes_written)
2406 }
2407
/// Multi-frame decode loop for builds where the `lsm` feature is ON.
///
/// Compared with the default-feature variant, this one accepts an optional
/// skippable-frame visitor, used by
/// [`Self::decode_all_with_skippable_visitor`]. The frame decoding itself
/// (direct-path dispatch, fallback drain loop, FCS validation, checksum
/// verification) follows the same steps as the default-feature variant.
///
/// When `init_frame` reports a skippable frame, the payload length is
/// checked against the remaining input, the payload is split off, the
/// visitor (when `Some`) is called with the frame variant and the borrowed
/// payload, and only then is the input advanced past the payload.
///
/// Returns the total number of bytes written to `output` across all frames.
#[cfg(feature = "lsm")]
#[allow(clippy::type_complexity)]
fn decode_all_impl(
    &mut self,
    mut input: &[u8],
    mut output: &mut [u8],
    mut init_frame: impl FnMut(&mut Self, &mut &[u8]) -> Result<(), FrameDecoderError>,
    mut skippable_visitor: Option<&mut dyn FnMut(u8, &[u8])>,
) -> Result<usize, FrameDecoderError> {
    let mut written_total = 0;
    while !input.is_empty() {
        if let Err(init_error) = init_frame(self, &mut input) {
            match init_error {
                FrameDecoderError::ReadFrameHeaderError(
                    crate::decoding::errors::ReadFrameHeaderError::SkipFrame {
                        magic_number,
                        length,
                    },
                ) => {
                    let skip_len = length as usize;
                    if skip_len > input.len() {
                        return Err(FrameDecoderError::FailedToSkipFrame);
                    }
                    // The visitor gets the borrowed payload (no allocation)
                    // before the input moves past it.
                    let (payload, remainder) = input.split_at(skip_len);
                    if let Some(visit) = skippable_visitor.as_mut() {
                        // The variant is the low nibble of the magic number
                        // (RFC 8878 §3.1.2). `read_frame_header` only reports
                        // SkipFrame for 0x184D2A50..=0x184D2A5F, so the
                        // difference fits in 0..=15.
                        let variant = (magic_number - 0x184D2A50) as u8;
                        visit(variant, payload);
                    }
                    input = remainder;
                    continue;
                }
                other => return Err(other),
            }
        }

        // Header facts that drive path selection and the size validation.
        let header = &self
            .state
            .as_ref()
            .expect("init populated state")
            .frame_header;
        let content_size = header.frame_content_size();
        let fcs_declared = header.fcs_declared();

        // Direct path: the header carries a non-zero content size and the
        // remaining `output` can hold it. An exact fit is enough; no
        // `WILDCOPY_OVERLENGTH` slack is demanded because the trailing
        // sequence(s) use the bounded copy in
        // `UserSliceBackend::exec_sequence_bounded`. Capacity overflow inside
        // the direct decode surfaces as a `FrameDecoderError` instead of a
        // panic. Dictionary frames qualify as well.
        if content_size > 0 && (output.len() as u64) >= content_size {
            let frame_bytes = self.run_direct_decode(&mut input, output, content_size)?;
            output = &mut output[frame_bytes..];
            written_total += frame_bytes;
            // No-op unless the mode is `Verify` and the frame carries a
            // checksum.
            #[cfg(feature = "hash")]
            self.verify_content_checksum()?;
            continue;
        }

        // Fallback path (no FCS, or `output` too small). Reserve the backing
        // buffer once up front so the per-block growth cycle is skipped. The
        // size is the FCS-capped `useful_window_size`, the same cap
        // `decode_blocks` applies, so the loop below cannot grow the buffer
        // back to the raw frame window.
        if let Some(state) = self.state.as_mut() {
            let useful_window = state.useful_window_size();
            state.decoder_scratch.reserve_buffer(useful_window);
        }

        let before_frame = written_total;
        loop {
            self.decode_blocks(&mut input, BlockDecodingStrategy::UptoBytes(1024 * 1024))?;
            let drained = self
                .read(output)
                .map_err(FrameDecoderError::FailedToDrainDecodebuffer)?;
            output = &mut output[drained..];
            written_total += drained;
            // Bytes still collectable after the drain mean `output` ran out.
            if self.can_collect() != 0 {
                return Err(FrameDecoderError::TargetTooSmall);
            }
            if self.is_finished() {
                break;
            }
        }

        // Size validation keys off `fcs_declared` rather than
        // `content_size > 0`, so an empty frame with an explicit FCS of 0 is
        // still checked.
        let frame_bytes = (written_total - before_frame) as u64;
        if fcs_declared && frame_bytes != content_size {
            return Err(FrameDecoderError::FrameContentSizeMismatch {
                declared: content_size,
                produced: frame_bytes,
            });
        }

        // The frame is finished and fully drained, so the running digest and
        // the stored checksum are final. No-op unless the mode is `Verify`.
        #[cfg(feature = "hash")]
        self.verify_content_checksum()?;
    }

    Ok(written_total)
}
2541
2542 /// Decode multiple frames into the output slice using a serialized dictionary.
2543 ///
2544 /// # Warning
2545 ///
2546 /// Each decoded frame is initialized with the parsed dictionary, even when a
2547 /// frame header omits the optional dictionary ID. Callers must only use this
2548 /// API when they already know the input frames were encoded with that
2549 /// dictionary; otherwise decoded output can be silently corrupted.
2550 pub fn decode_all_with_dict_bytes(
2551 &mut self,
2552 input: &[u8],
2553 output: &mut [u8],
2554 raw_dictionary: &[u8],
2555 ) -> Result<usize, FrameDecoderError> {
2556 let dict = DictionaryHandle::decode_dict(raw_dictionary)?;
2557 self.decode_all_with_dict_handle(input, output, &dict)
2558 }
2559
2560 /// Decode multiple frames into the extra capacity of the output vector.
2561 ///
2562 /// `input` must contain an exact number of frames.
2563 ///
2564 /// `output` must have enough spare capacity to hold the decompressed
2565 /// data. This adds no extra slack: exact-fit output is now eligible
2566 /// for the direct decode path, so a `Vec::with_capacity(fcs)` is
2567 /// decoded straight into without a growth/reallocation. It will NOT
2568 /// grow the vector to fit the decompressed payload itself; the
2569 /// caller's pre-allocated capacity must already cover the data. If
2570 /// you don't know how large the output will be, use
2571 /// [`FrameDecoder::decode_blocks`] instead.
2572 ///
2573 /// This calls [`FrameDecoder::init`], and all bytes currently in the decoder will be lost.
2574 ///
2575 /// The length of the output vector is updated to include the
2576 /// decompressed data. The length is not changed if an error occurs.
2577 pub fn decode_all_to_vec(
2578 &mut self,
2579 input: &[u8],
2580 output: &mut Vec<u8>,
2581 ) -> Result<(), FrameDecoderError> {
2582 let len = output.len();
2583 let cap = output.capacity();
2584 output.resize(cap, 0);
2585 match self.decode_all(input, &mut output[len..]) {
2586 Ok(bytes_written) => {
2587 let new_len = core::cmp::min(len + bytes_written, cap); // Sanitizes `bytes_written`.
2588 output.resize(new_len, 0);
2589 Ok(())
2590 }
2591 Err(e) => {
2592 output.resize(len, 0);
2593 Err(e)
2594 }
2595 }
2596 }
2597
2598 /// Single-frame direct-decode path. Decodes one zstd frame into
2599 /// `output[..content_size]` via a stack-local
2600 /// `DecodeBuffer<UserSliceBackend>`, bypassing the per-block
2601 /// FlatBuf/Ring -> `read()` drain copy.
2602 ///
2603 /// # Preconditions (caller-enforced)
2604 ///
2605 /// - `self.init` (or `init_with_dict_handle`) was called for
2606 /// this frame so `self.state` is populated.
2607 /// - `content_size` matches `self.state.frame_header
2608 /// .frame_content_size()` and is `> 0` (caller already passed
2609 /// the eligibility gate).
2610 /// - `output.len() >= content_size`. No `WILDCOPY_OVERLENGTH`
2611 /// trailing slack is required: the trailing sequence(s) take the
2612 /// bounded (non-overshooting) copy in
2613 /// [`UserSliceBackend::exec_sequence_bounded`].
2614 ///
2615 /// Dictionary frames are supported: the scratch buffer's shared
2616 /// dict handle is forwarded to the stack-local `DecodeBuffer`, so
2617 /// offsets reaching past the frame's own output resolve through
2618 /// `repeat_from_dict` (the ext-dict slow path).
2619 ///
2620 /// On return, `input` points at the byte immediately after the
2621 /// frame's checksum (or after the last block, when the frame
2622 /// has `content_checksum_flag = 0`). `self.state.frame_finished`
2623 /// is set so [`Self::is_finished`] reports `true`.
2624 fn run_direct_decode(
2625 &mut self,
2626 input: &mut &[u8],
2627 output: &mut [u8],
2628 content_size: u64,
2629 ) -> Result<usize, FrameDecoderError> {
2630 #[cfg(test)]
2631 {
2632 self.direct_frames += 1;
2633 }
2634 use super::block_decoder;
2635 use super::decode_buffer::DecodeBuffer;
2636 use super::scratch::DirectScratch;
2637 use super::user_slice_buf::UserSliceBackend;
2638 use crate::io::Read;
2639 use FrameDecoderError as err;
2640
2641 let state = self
2642 .state
2643 .as_mut()
2644 .expect("caller ensures init populated state");
2645
2646 // Borrow persistent fields out of whichever scratch variant
2647 // `init` produced (Flat for single_segment, Ring for
2648 // multi-segment) — both expose the same HUF/FSE/Vec
2649 // fields; only `buffer` differs and we don't use that here.
2650 // Macro-style binding avoids the closure / generic
2651 // gymnastics of returning multiple `&mut` from a match arm.
2652 let (huf, fse, offset_hist, literals_buffer, block_content_buffer, window_size, dict) =
2653 match &mut state.decoder_scratch {
2654 DecoderScratchKind::Flat(s) => (
2655 &mut s.huf,
2656 &mut s.fse,
2657 &mut s.offset_hist,
2658 &mut s.literals_buffer,
2659 &mut s.block_content_buffer,
2660 s.buffer.window_size,
2661 s.buffer.dict.clone(),
2662 ),
2663 DecoderScratchKind::Ring(s) => (
2664 &mut s.huf,
2665 &mut s.fse,
2666 &mut s.offset_hist,
2667 &mut s.literals_buffer,
2668 &mut s.block_content_buffer,
2669 s.buffer.window_size,
2670 s.buffer.dict.clone(),
2671 ),
2672 };
2673 let backend = UserSliceBackend::from_slice(output);
2674 let mut buffer = DecodeBuffer::from_backend(backend, window_size);
2675 // Dictionary matches on the direct path: hand the shared handle
2676 // (refcount bump, no copy) to the stack-local buffer so offsets
2677 // reaching past the frame's own output resolve through
2678 // `repeat_from_dict` — the same ext-dict slow path the
2679 // FlatBuf/Ring backends use. The per-sequence hot path is
2680 // untouched: the inline-exec dispatch already routes
2681 // beyond-prefix offsets to the cold `repeat()` fallback.
2682 if let Some(handle) = dict {
2683 buffer.set_dict(handle);
2684 }
2685 let mut direct = DirectScratch {
2686 huf,
2687 fse,
2688 offset_hist,
2689 literals_buffer,
2690 block_content_buffer,
2691 buffer,
2692 };
2693
2694 // Block loop. Mirrors `decode_blocks` (without the
2695 // strategy-bounded early exit — we always decode the whole
2696 // frame in one shot for the direct path). Keeps
2697 // `state.bytes_read_counter` / `state.block_counter` in
2698 // sync with `decode_blocks` so post-call accessors
2699 // (`bytes_read_from_source`, `blocks_decoded`) return
2700 // accurate values.
2701 let mut block_dec = block_decoder::new();
2702 // Track total output bytes against the declared
2703 // `frame_content_size` via the buffer's actual write
2704 // counter — `BlockHeader.decompressed_size` is 0 for
2705 // Compressed blocks (the header parser can't know the
2706 // expanded size before decoding the body), so per-header
2707 // tracking would always count 0 for those blocks and
2708 // miscount frames that aren't pure Raw/RLE.
2709 let mut produced: u64 = 0;
2710 // Per-block output ranges captured during the direct-path
2711 // loop. After the loop we re-borrow `output` (post-drop of
2712 // `direct`) and XXH64 each range into
2713 // `self.computed_block_checksums`, so the digests vector
2714 // stays consistent with the legacy `decode_blocks` path
2715 // regardless of which dispatch the frame took.
2716 // `Vec::new()` does not allocate, so this stays free when
2717 // `per_block_checksums_enabled` is false: the `push` and the
2718 // post-loop hashing loop are both gated by the same flag.
2719 #[cfg(all(feature = "lsm", feature = "hash"))]
2720 let mut block_ranges: alloc::vec::Vec<(usize, usize)> = alloc::vec::Vec::new();
2721 // Frame-level XXH64, accumulated PER BLOCK right after each block
2722 // decodes — the bytes are still cache-resident then. The previous
2723 // shape hashed the whole output once after the loop, which re-read
2724 // the entire frame cold: a full extra memory pass that the
2725 // reference implementation does not make (it hashes incrementally
2726 // per block). Invisible on outputs that fit L3, ~1.14x wall on a
2727 // 100 MiB all-raw decode and the dominant CI gap on
2728 // bandwidth-limited hosts.
2729 #[cfg(feature = "hash")]
2730 let mut running_hash: Option<twox_hash::XxHash64> =
2731 if state.frame_header.descriptor.content_checksum_flag()
2732 && self.content_checksum != ContentChecksum::None
2733 {
2734 Some(twox_hash::XxHash64::with_seed(0))
2735 } else {
2736 None
2737 };
2738 loop {
2739 #[cfg(all(feature = "lsm", feature = "hash"))]
2740 let produced_before: Option<usize> = if self.per_block_checksums_enabled {
2741 Some(produced as usize)
2742 } else {
2743 None
2744 };
2745 // Failing-block coordinates captured before the header read (see
2746 // the `decode_blocks` loop for the rationale).
2747 let block_index = state.block_counter as u32;
2748 let block_frame_offset = state.bytes_read_counter as u32;
2749 let (block_header, hsize) =
2750 block_dec.read_block_header(&mut *input).map_err(|source| {
2751 block_header_decode_error(source, block_index, block_frame_offset)
2752 })?;
2753 state.bytes_read_counter += u64::from(hsize);
2754 // Pre-flight FCS check ONLY for Raw / RLE blocks where
2755 // `decompressed_size` is the actual block output size.
2756 // For Compressed blocks the header field is 0; the
2757 // post-decode check below catches overflow via the
2758 // backend's actual write counter delta.
2759 let block_upper = u64::from(block_header.decompressed_size);
2760 if block_upper > 0 && produced + block_upper > content_size {
2761 // Frame is corrupt — Raw/RLE block headers claim
2762 // more output than the FCS allows.
2763 return Err(err::FrameContentSizeMismatch {
2764 declared: content_size,
2765 produced: produced + block_upper,
2766 });
2767 }
2768 // Slice-source fast path: consume the block body
2769 // straight from `input` without copying into the
2770 // persistent `block_content_buffer`.
2771 let body_consumed = match block_dec.decode_block_content_from_slice(
2772 &block_header,
2773 &mut direct,
2774 &mut *input,
2775 ) {
2776 Ok(n) => n,
2777 // Defense-in-depth: RLE / Raw block whose declared
2778 // `decompressed_size` slipped past the per-block
2779 // pre-flight above and tripped the backend's
2780 // fallible write surface.
2781 Err(crate::decoding::errors::DecodeBlockContentError::BackendOverflow {
2782 ..
2783 }) => {
2784 // Use saturating_add on the
2785 // `produced + decompressed_size` sum. Each block
2786 // is bounded by 128 KiB (MAX_BLOCK_SIZE), but
2787 // accumulated `produced` can grow toward
2788 // u64::MAX across adversarial frames. Saturating
2789 // avoids a panic on the error path itself.
2790 return Err(err::FrameContentSizeMismatch {
2791 declared: content_size,
2792 produced: produced
2793 .saturating_add(u64::from(block_header.decompressed_size)),
2794 });
2795 }
2796 // Compressed-block in-block overshoot: the sequence
2797 // executor (donor-inline path) or the match-repeat
2798 // fallback tripped the fixed-capacity backend's per-write
2799 // check. Unlike Raw/RLE, a Compressed block carries no
2800 // header-declared output size, so `produced` is computed
2801 // from the partial fill: `tail` bytes were written before
2802 // the failing op, and `requested` is what overflowed —
2803 // their sum is a strict lower bound on the frame's true
2804 // expanded size and is always > `content_size` (the
2805 // direct path is only entered when the slice is sized to
2806 // `content_size + WILDCOPY_OVERLENGTH`, so any overflow
2807 // means the frame exceeded the declared FCS, never a
2808 // caller-undersized buffer). Folds into the same
2809 // `FrameContentSizeMismatch` contract as Raw/RLE.
2810 Err(crate::decoding::errors::DecodeBlockContentError::DecompressBlockError(
2811 crate::decoding::errors::DecompressBlockError::ExecuteSequencesError(ref e),
2812 )) if e.output_overflow_requested().is_some() => {
2813 let requested = e
2814 .output_overflow_requested()
2815 .expect("guard guarantees Some") as u64;
2816 let tail = direct.buffer.buffer_ref().tail() as u64;
2817 return Err(err::FrameContentSizeMismatch {
2818 declared: content_size,
2819 produced: tail.saturating_add(requested),
2820 });
2821 }
2822 Err(e) => {
2823 return Err(block_body_decode_error(
2824 e,
2825 block_index,
2826 block_frame_offset,
2827 &block_header,
2828 hsize,
2829 ));
2830 }
2831 };
2832 // Hash this block's freshly-written bytes while they are hot
2833 // (see `running_hash` above). `tail()` is the physical write
2834 // cursor: `drop_to_window_size` below only advances the head,
2835 // so `[prev_tail, tail)` is exactly this block's output.
2836 #[cfg(feature = "hash")]
2837 if let Some(hasher) = running_hash.as_mut() {
2838 use core::hash::Hasher;
2839 hasher.write(direct.buffer.buffer_ref().written_since(produced as usize));
2840 }
2841 produced = direct.buffer.buffer_ref().tail() as u64;
2842 // Post-decode FCS overflow check.
2843 if produced > content_size {
2844 return Err(err::FrameContentSizeMismatch {
2845 declared: content_size,
2846 produced,
2847 });
2848 }
2849 state.bytes_read_counter += body_consumed;
2850 state.block_counter += 1;
2851 #[cfg(all(feature = "lsm", feature = "hash"))]
2852 if let Some(produced_before) = produced_before {
2853 block_ranges.push((produced_before, produced as usize));
2854 }
2855 // Cap the visible buffer at window_size between blocks
2856 // so the next block's match-offset validation matches
2857 // the spec's `offset <= window_size` rule.
2858 direct.buffer.drop_to_window_size();
2859 if block_header.last_block {
2860 if state.frame_header.descriptor.content_checksum_flag() {
2861 let mut chksum = [0u8; 4];
2862 input
2863 .read_exact(&mut chksum)
2864 .map_err(err::FailedToReadChecksum)?;
2865 state.bytes_read_counter += 4;
2866 state.check_sum = Some(u32::from_le_bytes(chksum));
2867 }
2868 break;
2869 }
2870 }
2871 // Final sanity: blocks summed to exactly `content_size`.
2872 if produced != content_size {
2873 return Err(err::FrameContentSizeMismatch {
2874 declared: content_size,
2875 produced,
2876 });
2877 }
2878
2879 let written = content_size as usize;
2880 state.frame_finished = true;
2881 // Drop the stack-local DirectScratch (and its DecodeBuffer
2882 // borrow on `output`) so we can re-borrow `output` for the
2883 // hash pass below.
2884 drop(direct);
2885 // Per-block XXH64 (low 32 bits) over the captured ranges.
2886 // Mirrors `decode_blocks`' per-block hashing so the digests
2887 // vector stays identical regardless of which dispatch path
2888 // the frame took. Ranges were recorded inside the loop while
2889 // `direct` held a mutable borrow on `output`; now that the
2890 // borrow is dropped we can read the slices directly.
2891 #[cfg(all(feature = "lsm", feature = "hash"))]
2892 if self.per_block_checksums_enabled {
2893 use core::hash::Hasher;
2894 for (start, end) in &block_ranges {
2895 let mut h = twox_hash::XxHash64::with_seed(0);
2896 h.write(&output[*start..*end]);
2897 self.computed_block_checksums.push(h.finish() as u32);
2898 }
2899 }
2900 #[cfg(feature = "hash")]
2901 if let Some(hasher) = running_hash {
2902 // Propagate the per-block-accumulated hasher state (see the
2903 // `running_hash` rationale above the loop) so the frame-tail
2904 // XXH64 check and `get_calculated_checksum()` read the digest.
2905 // `running_hash` is `None` for flag-off frames or
2906 // `ContentChecksum::None` — nothing to verify there, and
2907 // `get_calculated_checksum()` returns `None`, matching the skip.
2908 match &mut state.decoder_scratch {
2909 DecoderScratchKind::Flat(s) => s.buffer.hash = hasher,
2910 DecoderScratchKind::Ring(s) => s.buffer.hash = hasher,
2911 }
2912 }
2913 Ok(written)
2914 }
2915}
2916
2917/// Read bytes from the decode_buffer that are no longer needed. While the frame is not yet finished
2918/// this will retain window_size bytes, else it will drain it completely
2919impl Read for FrameDecoder {
2920 fn read(&mut self, target: &mut [u8]) -> Result<usize, Error> {
2921 let state = match &mut self.state {
2922 None => return Ok(0),
2923 Some(s) => s,
2924 };
2925 if state.frame_finished {
2926 state.decoder_scratch.buffer_read_all(target)
2927 } else {
2928 state.decoder_scratch.buffer_read(target)
2929 }
2930 }
2931}
2932
2933#[cfg(test)]
2934mod tests {
2935 extern crate std;
2936
2937 use super::{DictionaryHandle, FrameDecoder};
2938 use crate::encoding::{CompressionLevel, FrameCompressor};
2939 use alloc::vec::Vec;
2940
#[test]
fn decode_all_tight_and_slack_outputs_match_on_single_segment_frame() {
    // Round-trip a small payload through the encoder, then decode it with
    // `decode_all` into two output shapes. Both satisfy the direct-decode
    // gate (`output.len() >= content_size`) but select different
    // sequence-exec paths inside the direct decode:
    //   1. Tight output (exactly `frame_content_size`, no
    //      WILDCOPY_OVERLENGTH slack): the trailing sequence(s) take the
    //      bounded (non-overshooting) copy in
    //      `UserSliceBackend::exec_sequence_bounded`.
    //   2. Output with WILDCOPY slack: all sequences take the SIMD wildcopy
    //      fast path.
    // Both must reproduce the payload byte for byte, and therefore agree
    // with each other. This is the regression gate for the relaxed
    // direct-decode gate (`cap >= content_size`).
    let payload: Vec<u8> = (0..4096u32).map(|i| (i & 0xFF) as u8).collect();
    let mut compressor = FrameCompressor::new(CompressionLevel::Default);
    compressor.set_source(payload.as_slice());
    let mut compressed = Vec::new();
    compressor.set_drain(&mut compressed);
    compressor.compress();

    // Tight output: exactly `payload.len()` bytes, bounded tail copy.
    let mut dec_a = FrameDecoder::new();
    let mut out_a = alloc::vec![0u8; payload.len()];
    let n_a = dec_a
        .decode_all(compressed.as_slice(), &mut out_a)
        .expect("decode_all (tight output) should succeed");
    assert_eq!(n_a, payload.len());
    assert_eq!(&out_a[..n_a], payload.as_slice());

    // Slack output: `WILDCOPY_OVERLENGTH` spare bytes past the payload.
    let slack = super::super::buffer_backend::WILDCOPY_OVERLENGTH;
    let mut dec_b = FrameDecoder::new();
    let mut out_b = alloc::vec![0u8; payload.len() + slack];
    let n_b = dec_b
        .decode_all(compressed.as_slice(), &mut out_b)
        .expect("decode_all (slack output) should succeed");
    assert_eq!(
        n_b,
        payload.len(),
        "direct decode produced wrong byte count"
    );
    assert_eq!(&out_b[..n_b], payload.as_slice());

    // The comparison the test is named for: both shapes decode identically.
    assert_eq!(
        &out_a[..n_a],
        &out_b[..n_b],
        "tight and slack outputs diverged"
    );
}
2987
#[test]
fn decode_all_tight_output_overlapping_tail_match_roundtrips() {
    // Regression gate for the overlap branch of `exec_sequence_bounded`.
    //
    // The payload ends in a long run of one byte, which encodes as an
    // offset-1 match whose length far exceeds its offset (an OVERLAPPING
    // match, offset < match_length). With the output sized to exactly
    // `frame_content_size`, that match is the trailing sequence, so the
    // bounded copy's forward byte-by-byte branch runs at the buffer tail,
    // where the SIMD overshoot would otherwise run past `cap`.
    let payload: Vec<u8> = (0..=255u8)
        .chain(core::iter::repeat_n(0xABu8, 8192))
        .collect();

    let mut compressed = Vec::new();
    let mut compressor = FrameCompressor::new(CompressionLevel::Default);
    compressor.set_source(payload.as_slice());
    compressor.set_drain(&mut compressed);
    compressor.compress();

    // Anti-vacuous precondition: the 8 KiB run must have become a
    // Compressed block dominated by one long offset-1 match, not a Raw
    // block. A raw block would be about `payload.len()` bytes, whereas an
    // offset-1 run match is tens of bytes, so require the frame to be a
    // small fraction of the input. Without this the test could pass without
    // ever reaching the overlapping forward-copy branch.
    let compressed_len = compressed.len();
    let payload_len = payload.len();
    assert!(
        compressed_len < payload_len / 8,
        "expected an overlapping-tail match to dominate the frame \
         (compressed={} payload={}); the bounded overlap branch would \
         not be exercised otherwise",
        compressed_len,
        payload_len,
    );

    // Tight output: exactly the content size, no WILDCOPY slack.
    let mut decoder = FrameDecoder::new();
    let mut decoded = alloc::vec![0u8; payload_len];
    let decoded_len = decoder
        .decode_all(compressed.as_slice(), &mut decoded)
        .expect("tight-output decode with overlapping tail match should succeed");
    assert_eq!(decoded_len, payload_len);
    assert_eq!(decoded, payload, "bounded overlap tail copy corrupted output");
}
3035
#[test]
fn decode_all_multi_segment_frame_decodes_correctly() {
    // Multi-segment frame: the payload is meant to be large enough that the
    // encoder's default layout clears `single_segment_flag` and picks a
    // `window_size` below `frame_content_size` (the flag is verified at the
    // end of the test). The direct path has to cap the visible buffer at
    // window_size after every block (`drop_to_window_size`) so match-offset
    // validation follows the spec rule `offset <= window_size`, and it must
    // still reproduce the payload exactly.
    //
    // The payload is 2 MiB of bytes derived from a multiplicative index
    // hash.
    let mut payload: Vec<u8> = Vec::with_capacity(2 * 1024 * 1024);
    let target_len = payload.capacity();
    payload.extend((0..target_len).map(|i| (i.wrapping_mul(2_654_435_761) & 0xFF) as u8));

    let mut compressed = Vec::new();
    let mut compressor = FrameCompressor::new(CompressionLevel::Default);
    compressor.set_source(payload.as_slice());
    compressor.set_drain(&mut compressed);
    compressor.compress();

    // First decode: output sized to exactly the payload length.
    let mut tight_decoder = FrameDecoder::new();
    let mut tight_out = alloc::vec![0u8; payload.len()];
    let tight_len = tight_decoder
        .decode_all(compressed.as_slice(), &mut tight_out)
        .expect("decode_all should succeed");
    assert_eq!(tight_len, payload.len());
    assert_eq!(&tight_out[..tight_len], payload.as_slice());

    // Second decode: output with `WILDCOPY_OVERLENGTH` spare bytes. It must
    // yield the same bytes through UserSliceBackend plus the per-block
    // `drop_to_window_size`.
    let slack = super::super::buffer_backend::WILDCOPY_OVERLENGTH;
    let mut slack_decoder = FrameDecoder::new();
    let mut slack_out = alloc::vec![0u8; payload.len() + slack];
    let slack_len = slack_decoder
        .decode_all(compressed.as_slice(), &mut slack_out)
        .expect("decode_all should succeed on multi-segment frame");
    assert_eq!(slack_len, payload.len(), "wrong byte count on direct path");
    assert_eq!(&slack_out[..slack_len], payload.as_slice());

    // Precondition check: the encoded frame really is multi-segment. If a
    // future encoder default changes, failing here beats silently testing a
    // single-segment frame under this name.
    let mut probe = FrameDecoder::new();
    probe.init(&mut compressed.as_slice()).unwrap();
    let is_single_segment = probe
        .state
        .as_ref()
        .unwrap()
        .frame_header
        .descriptor
        .single_segment_flag();
    assert!(
        !is_single_segment,
        "test precondition violated: frame is single-segment, rename or resize"
    );
}
3097
#[cfg(feature = "hash")]
#[test]
fn decode_all_propagates_checksum_into_persistent_scratch() {
    // A frame encoded with the content checksum enabled records a
    // trailing digest. After `decode_all` the decoder must expose both
    // the stored value and the one it calculated itself, and the two
    // must agree.
    let payload: Vec<u8> = (0..8192u32).map(|v| v as u8).collect();

    let mut compressed = Vec::new();
    {
        let mut encoder = FrameCompressor::new(CompressionLevel::Default);
        encoder.set_content_checksum(true);
        encoder.set_source(payload.as_slice());
        encoder.set_drain(&mut compressed);
        encoder.compress();
    }

    // Output padded with WILDCOPY slack.
    let padded_len = payload.len() + super::super::buffer_backend::WILDCOPY_OVERLENGTH;
    let mut out = alloc::vec![0u8; padded_len];
    let mut dec = FrameDecoder::new();
    let written = dec
        .decode_all(compressed.as_slice(), &mut out)
        .expect("decode_all with checksum must succeed");
    assert_eq!(written, payload.len());
    assert_eq!(&out[..written], payload.as_slice());

    // The stored digest comes from the frame data, the calculated one
    // from the decoder; both must be present and equal.
    let from_frame = dec.get_checksum_from_data();
    let from_decoder = dec.get_calculated_checksum();
    assert!(from_frame.is_some(), "frame must carry stored checksum");
    assert!(
        from_decoder.is_some(),
        "direct path must propagate calculated checksum"
    );
    assert_eq!(
        from_frame, from_decoder,
        "stored vs calculated checksum mismatch on direct path"
    );
}
3140
#[cfg(feature = "hash")]
#[test]
fn verify_mode_accepts_a_valid_frame() {
    // With `ContentChecksum::Verify` set, an untouched checksummed
    // frame must decode successfully to the original bytes.
    use crate::decoding::ContentChecksum;

    let payload: Vec<u8> = (0..8192u32).map(|v| v as u8).collect();
    let mut compressed = Vec::new();
    {
        let mut encoder = FrameCompressor::new(CompressionLevel::Default);
        encoder.set_content_checksum(true);
        encoder.set_source(payload.as_slice());
        encoder.set_drain(&mut compressed);
        encoder.compress();
    }

    let padded_len = payload.len() + super::super::buffer_backend::WILDCOPY_OVERLENGTH;
    let mut out = alloc::vec![0u8; padded_len];
    let mut dec = FrameDecoder::new();
    dec.set_content_checksum(ContentChecksum::Verify);
    let written = dec
        .decode_all(compressed.as_slice(), &mut out)
        .expect("Verify mode must accept a frame with a correct checksum");
    assert_eq!(&out[..written], payload.as_slice());
}
3162
#[cfg(feature = "hash")]
#[test]
fn verify_mode_rejects_a_corrupted_checksum() {
    // With `ContentChecksum::Verify` set, a frame whose trailing digest
    // was tampered with must be rejected with `ChecksumMismatch`.
    use crate::decoding::ContentChecksum;
    use crate::decoding::errors::FrameDecoderError;

    let payload: Vec<u8> = (0..8192u32).map(|v| v as u8).collect();
    let mut compressed = Vec::new();
    {
        let mut encoder = FrameCompressor::new(CompressionLevel::Default);
        encoder.set_content_checksum(true);
        encoder.set_source(payload.as_slice());
        encoder.set_drain(&mut compressed);
        encoder.compress();
    }

    // Invert the final byte, which belongs to the trailing 4-byte
    // content checksum: the frame body still decodes to the correct
    // bytes, but the stored digest no longer matches the computed one.
    let final_idx = compressed.len() - 1;
    compressed[final_idx] ^= 0xFF;

    let padded_len = payload.len() + super::super::buffer_backend::WILDCOPY_OVERLENGTH;
    let mut out = alloc::vec![0u8; padded_len];
    let mut dec = FrameDecoder::new();
    dec.set_content_checksum(ContentChecksum::Verify);
    let err = dec
        .decode_all(compressed.as_slice(), &mut out)
        .expect_err("Verify mode must reject a corrupted checksum");
    assert!(
        matches!(err, FrameDecoderError::ChecksumMismatch { .. }),
        "expected ChecksumMismatch, got {err:?}"
    );
}
3194
#[cfg(feature = "hash")]
#[test]
fn decode_from_to_verify_rejects_corrupted_checksum() {
    // `decode_from_to` runs its own block loop (not decode_blocks), so
    // it needs separate coverage: in Verify mode it must reject a
    // corrupted trailer instead of silently accepting it.
    use crate::decoding::ContentChecksum;
    use crate::decoding::errors::FrameDecoderError;

    let payload: Vec<u8> = (0..8192u32).map(|v| v as u8).collect();
    let mut compressed = Vec::new();
    {
        let mut encoder = FrameCompressor::new(CompressionLevel::Default);
        encoder.set_content_checksum(true);
        encoder.set_source(payload.as_slice());
        encoder.set_drain(&mut compressed);
        encoder.compress();
    }
    // Invert the last byte of the trailing checksum.
    let final_idx = compressed.len() - 1;
    compressed[final_idx] ^= 0xFF;

    let padded_len = payload.len() + super::super::buffer_backend::WILDCOPY_OVERLENGTH;
    let mut out = alloc::vec![0u8; padded_len];
    let mut dec = FrameDecoder::new();
    dec.set_content_checksum(ContentChecksum::Verify);

    // Feed the trailing 4-byte checksum in a SEPARATE call so that
    // verification has to happen on the checksum-only path rather than
    // the post-drain path — the incremental case.
    let trailer_at = compressed.len() - 4;
    let (body, trailer) = (&compressed[..trailer_at], &compressed[trailer_at..]);
    let (_r1, w1) = dec
        .decode_from_to(body, &mut out)
        .expect("blocks decode without the trailer");
    let err = dec
        .decode_from_to(trailer, &mut out[w1..])
        .expect_err("decode_from_to in Verify mode must reject a corrupted checksum");
    assert!(
        matches!(err, FrameDecoderError::ChecksumMismatch { .. }),
        "expected ChecksumMismatch, got {err:?}"
    );
}
3233
#[cfg(feature = "hash")]
#[test]
fn decode_from_to_small_target_split_trailer_flushes_tail() {
    // Regression: if an earlier call decoded the last block but a small
    // `target` left output buffered, the trailer-only call must still
    // flush that buffered tail (it used to early-return Ok((4,0)) and
    // lose it).
    let payload: Vec<u8> = (0..8192u32).map(|v| v as u8).collect();
    let mut compressed = Vec::new();
    {
        let mut encoder = FrameCompressor::new(CompressionLevel::Default);
        encoder.set_content_checksum(true);
        encoder.set_source(payload.as_slice());
        encoder.set_drain(&mut compressed);
        encoder.compress();
    }

    let trailer_at = compressed.len() - 4;
    let (body, trailer) = (&compressed[..trailer_at], &compressed[trailer_at..]);
    let mut dec = FrameDecoder::new();
    let mut out = alloc::vec![0u8; payload.len()];

    // First call: every block, but only a 64-byte target, so the
    // remainder stays buffered inside the decoder.
    let (_r1, w1) = dec
        .decode_from_to(body, &mut out[..64])
        .expect("blocks decode with a small target");
    assert!(w1 <= 64);

    // Second call: just the 4-byte trailer. It must drain the buffered
    // tail through the shared read path instead of returning early.
    let (_r2, w2) = dec
        .decode_from_to(trailer, &mut out[w1..])
        .expect("trailer call must flush the buffered tail");
    let total = w1 + w2;
    assert_eq!(total, payload.len(), "buffered tail was dropped");
    assert_eq!(&out[..total], payload.as_slice());
}
3265
#[cfg(feature = "hash")]
#[test]
fn none_mode_skips_the_checksum_pass() {
    // With `ContentChecksum::None` the frame still decodes correctly
    // and still reports its stored digest, but the decoder exposes no
    // calculated checksum.
    use crate::decoding::ContentChecksum;

    let payload: Vec<u8> = (0..8192u32).map(|v| v as u8).collect();
    let mut compressed = Vec::new();
    {
        let mut encoder = FrameCompressor::new(CompressionLevel::Default);
        encoder.set_content_checksum(true);
        encoder.set_source(payload.as_slice());
        encoder.set_drain(&mut compressed);
        encoder.compress();
    }

    let padded_len = payload.len() + super::super::buffer_backend::WILDCOPY_OVERLENGTH;
    let mut out = alloc::vec![0u8; padded_len];
    let mut dec = FrameDecoder::new();
    dec.set_content_checksum(ContentChecksum::None);
    let written = dec
        .decode_all(compressed.as_slice(), &mut out)
        .expect("None mode must still decode correctly");
    assert_eq!(&out[..written], payload.as_slice());

    // The frame carries a digest, yet none is computed in None mode.
    assert!(dec.get_checksum_from_data().is_some());
    assert!(dec.get_calculated_checksum().is_none());
}
3290
#[cfg(feature = "hash")]
#[test]
fn encoder_without_checksum_emits_no_trailing_digest() {
    // Encode the same payload twice — checksum on, then checksum off —
    // and check that the second frame is shorter by exactly the 4-byte
    // digest and reports no stored checksum after decoding.
    let payload: Vec<u8> = (0..8192u32).map(|v| v as u8).collect();

    let mut with = Vec::new();
    {
        let mut encoder = FrameCompressor::new(CompressionLevel::Default);
        encoder.set_content_checksum(true);
        encoder.set_source(payload.as_slice());
        encoder.set_drain(&mut with);
        encoder.compress();
    }

    let mut without = Vec::new();
    {
        let mut encoder = FrameCompressor::new(CompressionLevel::Default);
        encoder.set_content_checksum(false);
        encoder.set_source(payload.as_slice());
        encoder.set_drain(&mut without);
        encoder.compress();
    }

    // Only the trailing digest separates the two encodings.
    assert_eq!(with.len(), without.len() + 4);

    let padded_len = payload.len() + super::super::buffer_backend::WILDCOPY_OVERLENGTH;
    let mut out = alloc::vec![0u8; padded_len];
    let mut dec = FrameDecoder::new();
    let written = dec
        .decode_all(without.as_slice(), &mut out)
        .expect("a frame without a content checksum must decode");
    assert_eq!(&out[..written], payload.as_slice());
    assert!(
        dec.get_checksum_from_data().is_none(),
        "no trailing checksum should be reported"
    );
}
3325
#[test]
fn decode_all_fcs_overflow_via_corrupt_frame_returns_structured_error() {
    // A hand-built corrupt frame declares frame_content_size = 4 while
    // its single (last) Raw block carries 10 bytes. Decoding must
    // report the structured FrameContentSizeMismatch variant — not a
    // panic, not a generic TargetTooSmall.
    //
    // Wire layout (single_segment, FCS=4):
    //   4 B  magic          0xFD2FB528, little-endian
    //   1 B  FHD            FCS flag (bits 7-6)=0, single_segment
    //                       (bit 5)=1, checksum (bit 2)=0, dict id
    //                       flag (bits 1-0)=0 → 1-byte FCS field
    //   1 B  FCS            0x04
    //   3 B  block header   bit 0 = last, bits 1-2 = type, bits 3-23 =
    //                       size → last=1, type=Raw(0), size=10
    //   10 B block payload  0xAA repeated
    let cblock_size: u32 = 10;
    let block_header: u32 = (cblock_size << 3) | 1;

    let mut frame = alloc::vec::Vec::new();
    frame.extend_from_slice(&0xFD2FB528u32.to_le_bytes());
    frame.extend_from_slice(&[0b0010_0000, 4]);
    // Only the low three little-endian bytes of the header are on the wire.
    frame.extend_from_slice(&block_header.to_le_bytes()[..3]);
    // Ten payload bytes: more than the declared FCS of 4 allows.
    frame.extend(core::iter::repeat_n(0xAAu8, 10));

    let padded_len = 4 + super::super::buffer_backend::WILDCOPY_OVERLENGTH;
    let mut out = alloc::vec![0u8; padded_len];
    let mut dec = FrameDecoder::new();
    let err = dec
        .decode_all(&frame, &mut out)
        .expect_err("FCS-overflow frame must fail decode");
    let is_size_mismatch = matches!(
        err,
        super::FrameDecoderError::FrameContentSizeMismatch { .. }
    );
    assert!(
        is_size_mismatch,
        "expected FrameContentSizeMismatch, got {:?}",
        err
    );
}
3376
#[test]
fn decode_all_compressed_block_fcs_overflow_returns_structured_error() {
    // Acceptance test for #246: a malformed frame whose *Compressed*
    // block expands past the declared `frame_content_size` must yield
    // `FrameContentSizeMismatch` — NOT a panic and NOT a generic
    // FailedToReadBlockBody. The Raw-block sibling test covers the
    // raw-copy overflow; this one targets the overflow raised while
    // executing sequences.
    //
    // Approach: compress a highly compressible payload to obtain a
    // genuine Compressed block plus a header-declared FCS, then patch
    // the FCS field down to a tiny value. The block body is
    // independent of FCS, so it still decodes and overruns the small
    // output slice.
    const TARGET_LEN: usize = 4 * 1024;
    let phrase = b"The quick brown fox jumps over the lazy dog. ";
    let mut payload = Vec::with_capacity(TARGET_LEN);
    // Appends whole phrases, so the final length slightly exceeds 4 KiB.
    while payload.len() < TARGET_LEN {
        payload.extend_from_slice(phrase);
    }

    let mut frame = Vec::new();
    {
        let mut encoder = FrameCompressor::new(CompressionLevel::Default);
        encoder.set_source(payload.as_slice());
        encoder.set_drain(&mut frame);
        encoder.compress();
    }
    // Sanity: the encoder really shrank the input, so the frame holds a
    // Compressed block rather than a raw-stored fallback.
    assert!(frame.len() < payload.len());

    // The FCS field occupies the final `fcs_len` bytes of the frame
    // header, and `header_size` counts the magic too. A payload of
    // roughly 4 KiB falls inside the 2-byte field range [256, 65791]
    // (RFC 8878 §3.1.1.1.4); asserting that keeps the patch below a
    // single deterministic branch.
    let (header, header_size) =
        super::super::frame::read_frame_header(frame.as_slice()).expect("valid header");
    let fcs_len = header
        .descriptor
        .frame_content_size_bytes()
        .expect("fcs present") as usize;
    assert_eq!(
        fcs_len, 2,
        "4 KiB single-segment frame must use a 2-byte FCS"
    );
    let fcs_off = header_size as usize - fcs_len;

    // Zero both stored bytes: with the 2-byte field's `+256` bias this
    // declares 256, far below what the block actually produces.
    let patched_declared: u64 = 256;
    frame[fcs_off..fcs_off + 2].copy_from_slice(&[0, 0]);

    // Output = declared size + WILDCOPY slack, so the overflow is
    // attributable to the frame and not to an undersized buffer.
    let padded_len =
        patched_declared as usize + super::super::buffer_backend::WILDCOPY_OVERLENGTH;
    let mut out = alloc::vec![0u8; padded_len];
    let mut dec = FrameDecoder::new();
    let err = dec
        .decode_all(frame.as_slice(), &mut out)
        .expect_err("Compressed block exceeding FCS must fail decode");
    match err {
        super::FrameDecoderError::FrameContentSizeMismatch { declared, produced } => {
            assert_eq!(declared, patched_declared, "declared echoes patched FCS");
            assert!(produced > declared, "produced must exceed declared");
        }
        other => panic!("expected FrameContentSizeMismatch, got {other:?}"),
    }
}
3451
/// Block-precise error positions (#174): when a block header or body
/// fails to decode, the error carries that block's 0-based index and
/// its frame-absolute offset, matching the encoder's
/// `FrameEmitInfo.blocks[index].offset_in_frame`.
#[cfg(feature = "lsm")]
#[test]
fn block_precise_errors_carry_index_and_offset() {
    use crate::encoding::{CompressionLevel, FrameCompressor};

    // Roughly 1.3 MiB of incompressible xorshift output, intended to
    // produce many 128 KiB raw blocks so that blocks 3 and 7 both exist
    // and neither is the final block.
    const DATA_LEN: usize = 1_300_000;
    let mut data = alloc::vec::Vec::with_capacity(DATA_LEN);
    let mut rng: u64 = 0x2545_F491_4F6C_DD1D;
    while data.len() < DATA_LEN {
        rng ^= rng << 13;
        rng ^= rng >> 7;
        rng ^= rng << 17;
        data.push((rng >> 33) as u8);
    }

    // Encode and keep the per-block layout the encoder reports.
    let mut frame = alloc::vec::Vec::new();
    let blocks = {
        let mut encoder = FrameCompressor::new(CompressionLevel::Level(1));
        encoder.set_source(data.as_slice());
        encoder.set_drain(&mut frame);
        encoder.compress();
        encoder
            .last_frame_emit_info()
            .expect("emit info present under lsm")
            .blocks
            .clone()
    };
    assert!(blocks.len() > 7, "need >7 blocks, got {}", blocks.len());

    let mut out = alloc::vec![0u8; data.len() + 4096];

    // (1) Header failure at block 7: setting both Block_Type bits turns
    // the type into Reserved (3), so reading that header must fail.
    let header7_at = blocks[7].offset_in_frame as usize;
    let mut corrupt = frame.clone();
    corrupt[header7_at] |= 0b0000_0110;
    let header_err = FrameDecoder::new()
        .decode_all(&corrupt, &mut out)
        .expect_err("reserved block-7 header must fail");
    match header_err {
        super::FrameDecoderError::FailedToReadBlockHeaderAt {
            block_index,
            frame_offset,
            ..
        } => {
            assert_eq!(block_index, 7);
            assert_eq!(frame_offset, blocks[7].offset_in_frame);
        }
        other => panic!("expected FailedToReadBlockHeaderAt, got {other:?}"),
    }

    // (2) Body failure at block 3: cut the frame right after block 3's
    // header, leaving the header intact and the body absent.
    let body3_at = blocks[3].offset_in_frame as usize + blocks[3].header_size as usize;
    let body_err = FrameDecoder::new()
        .decode_all(&frame[..body3_at], &mut out)
        .expect_err("truncated block-3 body must fail");
    match body_err {
        super::FrameDecoderError::FailedToReadBlockBodyAt {
            block_index,
            frame_offset,
            block,
            ..
        } => {
            assert_eq!(block_index, 3);
            assert_eq!(frame_offset, blocks[3].offset_in_frame);
            assert_eq!(block.offset_in_frame, blocks[3].offset_in_frame);
        }
        other => panic!("expected FailedToReadBlockBodyAt, got {other:?}"),
    }
}
3527
#[test]
fn decode_all_exact_fit_output_decodes_correctly() {
    // An output buffer sized exactly to frame_content_size (without
    // WILDCOPY_OVERLENGTH slack) is eligible for the direct path:
    // every output-write site is exact-fit-safe (sequence execution
    // falls back to the bounded, non-overshooting copy on the trailing
    // sequence(s); Raw/RLE blocks copy exactly). The result must match
    // what a slack-padded buffer would produce. On x86 this runs
    // through the per-kernel AVX2/SSE2 inline-exec macros, which carry
    // the same tight-tail branch.
    let payload: Vec<u8> = (0..2048u32)
        .map(|v| v.wrapping_mul(31) as u8)
        .collect();

    let mut compressed = Vec::new();
    {
        let mut encoder = FrameCompressor::new(CompressionLevel::Default);
        encoder.set_source(payload.as_slice());
        encoder.set_drain(&mut compressed);
        encoder.compress();
    }

    // Exactly payload.len() bytes of output — no slack at all.
    let mut out = alloc::vec![0u8; payload.len()];
    let mut dec = FrameDecoder::new();
    let written = dec
        .decode_all(compressed.as_slice(), &mut out)
        .expect("exact-fit decode_all should succeed");
    assert_eq!(written, payload.len());
    assert_eq!(&out[..written], payload.as_slice());
}
3556
#[test]
fn decode_all_fallback_validates_fcs_against_total_output() {
    // Synthetic single-segment frame: FCS = 20 bytes, but the
    // last-block flag fires after only 4 bytes of raw payload.
    // On the direct path this would trip the post-block FCS check;
    // the fallback path used to silently return Ok(4). It must now
    // surface `FrameContentSizeMismatch`, matching the direct path.
    //
    // The output buffer is deliberately SMALLER than the declared FCS
    // (see below) so the direct path is gated out and the fallback
    // drain loop — the path under test — handles the frame.
    //
    // Frame layout: 4 B magic | 1 B FHD (single_segment=1,
    // FCS_flag=3 → 8-byte FCS) | 8 B FCS=20 | block header
    // (Raw, last, size=4) | 4 raw bytes.
    const DECLARED_FCS: usize = 20;

    let mut wire = Vec::new();
    wire.extend_from_slice(&0xFD2F_B528u32.to_le_bytes()); // magic
    // FHD: FCS_flag=3 (8-byte FCS) <<6 | single_segment=1 <<5.
    wire.push(0b1110_0000);
    wire.extend_from_slice(&(DECLARED_FCS as u64).to_le_bytes()); // declared FCS
    // Block header: (size << 3) | (block_type << 1) | last_block.
    // Raw block (block_type=0), last_block=1, size=4 → 0b00100001 = 0x21.
    wire.push(0x21);
    wire.push(0x00);
    wire.push(0x00);
    wire.extend_from_slice(&[1u8, 2, 3, 4]);

    let mut dec = FrameDecoder::new();
    // Size output SMALLER than the declared FCS so direct-decode is
    // gated out (`output.len() >= content_size` is false) and the
    // frame takes the legacy fallback drain loop. The corrupt frame
    // only produces 4 bytes, so 19 is ample room.
    let mut out = alloc::vec![0u8; DECLARED_FCS - 1];
    // The gate is an ordering test, so the precondition must be a
    // strict `<`: a buffer LARGER than the FCS would satisfy a mere
    // inequality check while still being direct-path eligible.
    assert!(
        out.len() < DECLARED_FCS,
        "output must be smaller than FCS to exercise the fallback path",
    );
    let err = dec
        .decode_all(wire.as_slice(), &mut out)
        .expect_err("fallback must reject corrupt FCS underflow");
    match err {
        crate::decoding::errors::FrameDecoderError::FrameContentSizeMismatch {
            declared,
            produced,
        } => {
            assert_eq!(declared, DECLARED_FCS as u64);
            assert_eq!(produced, 4);
        }
        other => panic!("expected FrameContentSizeMismatch, got {other:?}"),
    }
}
3610
#[test]
fn decode_all_fallback_treats_explicit_fcs_zero_as_declared() {
    // Synthetic multi-segment frame whose 4-byte FCS field
    // (FCS_flag=2) is explicitly 0, while the body carries a 5-byte
    // raw last-block. Because the field is present on the wire,
    // `fcs_declared()` must be true and the fallback's post-decode size
    // check must flag the mismatch even though `frame_content_size ==
    // 0`. This is the FCS=0 edge case a `content_size > 0` proxy would
    // have silently accepted.
    //
    // Wire layout:
    //   4 B magic              28 B5 2F FD
    //   1 B FHD                FCS_flag=2 (bits 7-6), no single_segment,
    //                          content_checksum=0, dict_id_flag=0
    //                          → 0b1000_0000
    //   1 B window_descriptor  exp=10, mantissa=0 → window=1 MiB
    //   4 B FCS                0, little-endian
    //   3 B block header       (size << 3) | (block_type << 1) | last
    //                          = (5 << 3) | (0 << 1) | 1 = 0x29, then
    //                          two zero bytes
    //   5 B raw payload        any non-empty bytes
    let mut wire = Vec::new();
    wire.extend_from_slice(&0xFD2F_B528u32.to_le_bytes());
    wire.extend_from_slice(&[0b1000_0000, 0x50]);
    wire.extend_from_slice(&0u32.to_le_bytes());
    wire.extend_from_slice(&[0x29, 0x00, 0x00]);
    wire.extend_from_slice(&[1u8, 2, 3, 4, 5]);

    // With FCS=0 declared, the eligibility condition
    // (`content_size > 0`) is false and the frame falls through to the
    // drain loop. The output only needs enough room for `read()` to
    // drain the block.
    let mut out = alloc::vec![0u8; 16];
    let mut dec = FrameDecoder::new();
    let err = dec
        .decode_all(wire.as_slice(), &mut out)
        .expect_err("corrupt FCS=0 + 5-byte block must error");
    match err {
        crate::decoding::errors::FrameDecoderError::FrameContentSizeMismatch {
            declared,
            produced,
        } => {
            assert_eq!(declared, 0);
            assert_eq!(produced, 5);
        }
        other => panic!("expected FrameContentSizeMismatch, got {other:?}"),
    }
}
3664
#[test]
fn decode_all_fallback_accepts_honest_explicit_fcs_zero() {
    // Counterpart to the corrupt-FCS=0 test: an HONEST empty frame
    // with a 4-byte FCS field (FCS_flag=2) explicitly set to 0 AND a
    // 0-byte raw last-block. `fcs_declared()` is true and
    // `content_size == 0 == total_written`, so the fallback validation
    // must accept the frame rather than report a mismatch.
    //
    // (A single-segment FCS=0 frame would probe a similar invariant
    // but trips header-stage validation first: `window_size =
    // frame_content_size = 0 < MIN_WINDOW_SIZE` fails the window-size
    // sanity check before decode runs. The multi-segment shape takes
    // `window_size` from `window_descriptor`, independent of FCS.)
    //
    // Wire layout:
    //   4 B magic
    //   1 B FHD                FCS_flag=2, everything else 0 → 0x80
    //   1 B window_descriptor  exp=10 → 1 MiB window
    //   4 B FCS                0, little-endian
    //   3 B block header       (0 << 3) | (0 << 1) | 1 = 0x01, then two
    //                          zero bytes (raw, last, size=0)
    let mut wire = Vec::new();
    wire.extend_from_slice(&0xFD2F_B528u32.to_le_bytes());
    wire.extend_from_slice(&[0b1000_0000, 0x50]);
    wire.extend_from_slice(&0u32.to_le_bytes());
    wire.extend_from_slice(&[0x01, 0x00, 0x00]);

    let mut out = alloc::vec![0u8; 16];
    let mut dec = FrameDecoder::new();
    let written = dec
        .decode_all(wire.as_slice(), &mut out)
        .expect("honest FCS=0 + empty block must succeed");
    assert_eq!(written, 0);
}
3704
#[test]
fn reset_with_dict_handle_applies_dict_when_no_dict_id() {
    // A frame encoded without a dictionary has no dictionary id in its
    // header; resetting with an explicit handle must still record that
    // handle's id as the dictionary in use.
    let payload = b"reset-without-dict-id";
    let mut compressed = Vec::new();
    {
        let mut encoder = FrameCompressor::new(CompressionLevel::Default);
        encoder.set_source(payload.as_slice());
        encoder.set_drain(&mut compressed);
        encoder.compress();
    }

    let dict_raw = include_bytes!("../../dict_tests/dictionary");
    let handle = DictionaryHandle::decode_dict(dict_raw).expect("dictionary should parse");

    let mut decoder = FrameDecoder::new();
    decoder
        .reset_with_dict_handle(compressed.as_slice(), &handle)
        .expect("reset should succeed");

    let state = decoder.state.as_ref().expect("state should be initialized");
    // Header carries no dictionary id ...
    assert!(state.frame_header.dictionary_id().is_none());
    // ... yet the supplied handle is the one recorded as active.
    assert_eq!(state.using_dict, Some(handle.id()));
}
3725
#[test]
fn reserve_buffer_reserves_the_shortfall_not_the_full_window_again() {
    // `Vec::reserve_exact` takes ADDITIONAL capacity. The decode_all
    // fallback loop re-enters decode_blocks once per strategy chunk and
    // pre-reserves the window on each entry. Asking for the FULL window
    // again on a buffer that already holds about a window of history
    // would push it toward 2x window and defeat the peak-memory cap the
    // exact-growth policy exists for.
    use super::DecoderScratchKind;

    let window = 1usize << 20;
    let mut scratch = DecoderScratchKind::new_flat(window);

    // First reservation, then fill the buffer with one window of data.
    scratch.reserve_buffer(window);
    let history = alloc::vec![0u8; window];
    match &mut scratch {
        DecoderScratchKind::Flat(flat) => flat.buffer.push(&history),
        DecoderScratchKind::Ring(_) => unreachable!("new_flat builds Flat"),
    }

    // Second reservation must stay well below one extra window.
    scratch.reserve_buffer(window);
    let workspace = scratch.workspace_bytes();
    let ceiling = window * 3 / 2;
    assert!(
        workspace < ceiling,
        "second reserve_buffer grew a full window past the buffered \
         history: workspace {workspace} bytes vs window {window}"
    );
}
3751
#[test]
fn dict_frame_decodes_through_direct_path() {
    // A dictionary frame decoded with `decode_all_with_dict_handle`
    // into a buffer sized exactly to the content takes the direct path
    // (UserSliceBackend); matches that reach into dictionary content
    // must resolve through `repeat_from_dict`. The payload starts with
    // dictionary content verbatim so the encoder emits dict-region
    // matches from the first bytes of the frame.
    let dict_raw = include_bytes!("../../dict_tests/dictionary");
    let handle = DictionaryHandle::decode_dict(dict_raw).expect("dictionary should parse");

    // Last (up to) 2048 bytes of the dictionary content, in order.
    let mut payload: alloc::vec::Vec<u8> = handle
        .as_dict()
        .dict_content
        .iter()
        .rev()
        .take(2048)
        .rev()
        .copied()
        .collect();
    // Only ONE copy of the dictionary bytes goes into the payload. With
    // a second copy the encoder could encode the later one as an
    // in-frame match, and the test would stay green even if the direct
    // path stopped forwarding the dictionary handle. A single copy
    // forces every dict-region match through `repeat_from_dict`.
    payload.extend_from_slice(b"unique suffix after dictionary material 0123456789");

    let mut compressed = Vec::new();
    {
        let mut encoder = FrameCompressor::new(CompressionLevel::Default);
        encoder
            .set_dictionary_from_bytes(dict_raw)
            .expect("dict load");
        encoder.set_source(payload.as_slice());
        encoder.set_drain(&mut compressed);
        encoder.compress();
    }

    // Fixture sanity: encode once more WITHOUT the dictionary. If the
    // dictionary frame is not smaller, the decode below would never
    // exercise dict-region match resolution.
    let mut plain = Vec::new();
    {
        let mut no_dict = FrameCompressor::new(CompressionLevel::Default);
        no_dict.set_source(payload.as_slice());
        no_dict.set_drain(&mut plain);
        no_dict.compress();
    }
    assert!(
        compressed.len() < plain.len(),
        "fixture must depend on the dictionary: dict {} bytes vs plain {} bytes",
        compressed.len(),
        plain.len()
    );

    let mut out = alloc::vec![0u8; payload.len()];
    let mut decoder = FrameDecoder::new();
    let written = decoder
        .decode_all_with_dict_handle(compressed.as_slice(), &mut out, &handle)
        .expect("dict frame must decode on the direct path");
    assert_eq!(written, payload.len());
    assert_eq!(out, payload, "direct-path dict decode must be byte-exact");
    // Both decode paths yield identical bytes, so pin the dispatch
    // itself: a re-introduced dict exclusion in the direct gate would
    // silently fall back to the buffered path and leave the assertions
    // above green.
    assert_eq!(
        decoder.direct_frames, 1,
        "dict frame must take the direct path, not the buffered fallback"
    );
}
3819
/// Tests for the opt-in frame-header expectations (`expect_dict_id`,
/// `expect_window_descriptor`) and the errors `reset` /
/// `reset_with_dict_handle` must report when a header does not match.
#[cfg(feature = "lsm")]
mod expect_validation {
    use super::*;
    use crate::decoding::errors::FrameDecoderError;

    /// Encode `payload` as one frame at the default level, without a dictionary.
    fn compress(payload: &[u8]) -> Vec<u8> {
        let mut frame = Vec::new();
        let mut encoder = FrameCompressor::new(CompressionLevel::Default);
        encoder.set_source(payload);
        encoder.set_drain(&mut frame);
        encoder.compress();
        frame
    }

    /// Like [`compress`], but the encoder is first loaded with the raw
    /// dictionary bytes in `dict_raw`.
    fn compress_with_dict(payload: &[u8], dict_raw: &[u8]) -> Vec<u8> {
        let mut frame = Vec::new();
        let mut encoder = FrameCompressor::new(CompressionLevel::Default);
        encoder
            .set_dictionary_from_bytes(dict_raw)
            .expect("dict load");
        encoder.set_source(payload);
        encoder.set_drain(&mut frame);
        encoder.compress();
        frame
    }

    #[test]
    fn expect_dict_id_none_default_allows_anything() {
        // A decoder with no expectation configured must accept the frame.
        let frame = compress(b"hello-no-expect");
        let mut dec = FrameDecoder::new();
        let outcome = dec.reset(frame.as_slice());
        outcome.expect("default None passes");
    }

    #[test]
    fn expect_dict_id_zero_matches_frame_without_dict_id() {
        // The default encoder writes no dict_id, and an expectation of
        // Some(0) means "no dictionary expected" — the two must agree.
        let frame = compress(b"payload");
        let mut dec = FrameDecoder::new();
        dec.expect_dict_id(Some(0));
        let outcome = dec.reset(frame.as_slice());
        outcome.expect("Some(0) ~ None");
    }

    #[test]
    fn expect_dict_id_matching_value_passes() {
        let dict_raw = include_bytes!("../../dict_tests/dictionary");
        let handle = DictionaryHandle::decode_dict(dict_raw).expect("dict parse");
        let frame = compress_with_dict(b"payload-with-dict", dict_raw);

        let mut dec = FrameDecoder::new();
        dec.expect_dict_id(Some(handle.id()));
        // The frame references the dictionary, so the reset goes through
        // reset_with_dict_handle to make it available.
        let outcome = dec.reset_with_dict_handle(frame.as_slice(), &handle);
        outcome.expect("matching dict_id passes");
    }

    #[test]
    fn expect_dict_id_mismatching_value_fails_before_decode() {
        let dict_raw = include_bytes!("../../dict_tests/dictionary");
        let handle = DictionaryHandle::decode_dict(dict_raw).expect("dict parse");
        let frame = compress_with_dict(b"payload-with-dict", dict_raw);

        // Any id other than the dictionary's own must be rejected.
        let actual_id = handle.id();
        let wrong_id = actual_id.wrapping_add(1);

        let mut dec = FrameDecoder::new();
        dec.expect_dict_id(Some(wrong_id));
        match dec
            .reset_with_dict_handle(frame.as_slice(), &handle)
            .expect_err("mismatch must fail")
        {
            FrameDecoderError::UnexpectedDictId { expected, found } => {
                assert_eq!(expected, Some(wrong_id));
                assert_eq!(found, Some(actual_id));
            }
            other => panic!("expected UnexpectedDictId, got {other:?}"),
        }
    }

    #[test]
    fn expect_dict_id_nonzero_fails_on_frame_without_dict_id() {
        // A dictionary-less frame checked against a non-zero expectation
        // must be rejected, reporting `found = None`.
        let frame = compress(b"no-dict-frame");
        let mut dec = FrameDecoder::new();
        dec.expect_dict_id(Some(42));
        match dec
            .reset(frame.as_slice())
            .expect_err("nonzero expectation on dictless frame must fail")
        {
            FrameDecoderError::UnexpectedDictId { expected, found } => {
                assert_eq!(expected, Some(42));
                assert_eq!(found, None);
            }
            other => panic!("expected UnexpectedDictId, got {other:?}"),
        }
    }

    #[test]
    fn expect_window_descriptor_none_default_allows_anything() {
        // No window-descriptor expectation configured: reset must succeed.
        let frame = compress(b"hello-no-wd-expect");
        let mut dec = FrameDecoder::new();
        let outcome = dec.reset(frame.as_slice());
        outcome.expect("default None passes");
    }

    #[test]
    fn expect_window_descriptor_mismatch_fails_before_decode() {
        // 512 KiB of input; the `expect` on `window_descriptor()` below
        // checks that the resulting frame really carries a descriptor byte
        // (i.e. it is not single-segment).
        let payload = alloc::vec![0xABu8; 512 * 1024];
        let frame = compress(&payload);

        // Learn the real descriptor from an expectation-free reset, then
        // derive a value that is guaranteed to differ from it.
        let mut probe = FrameDecoder::new();
        probe.reset(frame.as_slice()).unwrap();
        let header = &probe.state.as_ref().unwrap().frame_header;
        let actual_wd = header
            .window_descriptor()
            .expect("multi-segment frame should expose window_descriptor");
        let wrong_wd = actual_wd.wrapping_add(0x10);

        let mut dec = FrameDecoder::new();
        dec.expect_window_descriptor(Some(wrong_wd));
        match dec
            .reset(frame.as_slice())
            .expect_err("wrong window_descriptor must fail")
        {
            FrameDecoderError::UnexpectedWindowDescriptor { expected, found } => {
                assert_eq!(expected, wrong_wd);
                assert_eq!(found, Some(actual_wd));
            }
            other => panic!("expected UnexpectedWindowDescriptor, got {other:?}"),
        }
    }

    /// Hand-assemble a minimal single-segment zstd frame whose only block
    /// is a raw, last block holding `payload` (at most 255 bytes).
    ///
    /// The frame is built by hand because the default `FrameCompressor`
    /// settings don't emit `single_segment_flag` for tiny inputs.
    ///
    /// Bytes produced for a 4-byte payload (13 in total):
    /// ```text
    /// 28 B5 2F FD   magic
    /// 20            frame header descriptor: single_segment=1, FCS_flag=0
    /// 04            frame content size (one byte = payload.len())
    /// 21 00 00      block header: raw, last, size=4
    /// .. .. .. ..   payload bytes
    /// ```
    fn synth_single_segment_frame(payload: &[u8]) -> Vec<u8> {
        assert!(payload.len() <= 255, "1-byte FCS field caps at 255");
        assert!(payload.len() < (1usize << 21), "block size 21-bit max");

        // Block header word: bit 0 = last_block (set), bits 1-2 =
        // block_type (0 = Raw), bits 3.. = block size. Only the low three
        // bytes go on the wire, little-endian.
        let block_header: u32 = ((payload.len() as u32) << 3) | 1;

        let mut frame = Vec::new();
        // Magic number 0xFD2FB528, little-endian.
        frame.extend_from_slice(&0xFD2F_B528u32.to_le_bytes());
        // Descriptor with only bit 5 (single_segment_flag) set: the content
        // size field is then one byte and no window_descriptor byte follows.
        frame.push(0b0010_0000);
        frame.push(payload.len() as u8);
        frame.extend_from_slice(&block_header.to_le_bytes()[..3]);
        frame.extend_from_slice(payload);
        frame
    }

    #[test]
    fn expect_window_descriptor_on_single_segment_frame_fails_with_found_none() {
        // A single-segment frame has no window_descriptor byte at all, so
        // an expectation must fail with `found: None` — letting callers
        // tell "wrong descriptor" apart from "no descriptor on the wire".
        let frame = synth_single_segment_frame(b"tiny");

        // Sanity check: without expectations the synthetic frame resets
        // cleanly and exposes no descriptor.
        let mut probe = FrameDecoder::new();
        probe.reset(frame.as_slice()).expect("synth frame parses");
        let probe_header = &probe.state.as_ref().unwrap().frame_header;
        assert!(
            probe_header.window_descriptor().is_none(),
            "synth frame must be single-segment"
        );

        let mut dec = FrameDecoder::new();
        dec.expect_window_descriptor(Some(0x40));
        match dec
            .reset(frame.as_slice())
            .expect_err("single-segment + expectation must fail")
        {
            FrameDecoderError::UnexpectedWindowDescriptor { expected, found } => {
                assert_eq!(expected, 0x40);
                assert_eq!(found, None);
            }
            other => panic!("expected UnexpectedWindowDescriptor, got {other:?}"),
        }
    }

    #[test]
    fn validation_failure_leaves_decoder_re_resettable() {
        // A reset rejected with UnexpectedDictId must not leave the decoder
        // in a failed state: once the expectation is cleared, resetting on
        // the same bytes has to succeed.
        let frame = compress(b"re-resettable");

        let mut dec = FrameDecoder::new();
        dec.expect_dict_id(Some(42));
        let first = dec.reset(frame.as_slice()).expect_err("first reset fails");
        assert!(matches!(first, FrameDecoderError::UnexpectedDictId { .. }));

        // Drop the expectation and reset again from the start of the frame.
        dec.expect_dict_id(None);
        let second = dec.reset(frame.as_slice());
        second.expect("retry after clearing expectation should succeed");
    }
}
4068
4069 /// Build a skippable frame on the wire: 4-byte LE magic + 4-byte LE
4070 /// length + payload bytes. RFC 8878 §3.1.2 restricts the magic
4071 /// variant to `0..=15`; assert here so accidental misuse of the
4072 /// helper can't smuggle a non-skippable magic past the tests.
4073 #[cfg(feature = "lsm")]
fn build_skippable_frame(variant: u8, payload: &[u8]) -> Vec<u8> {
    assert!(
        variant <= 15,
        "skippable-frame variant {variant} outside RFC 8878 0..=15 range",
    );
    // The variant is added to the base skippable magic, so valid magics
    // span 0x184D2A50..=0x184D2A5F. Both header fields are little-endian.
    let magic_bytes = (0x184D2A50u32 + u32::from(variant)).to_le_bytes();
    let length_bytes = u32::try_from(payload.len()).unwrap().to_le_bytes();

    let mut frame = Vec::with_capacity(magic_bytes.len() + length_bytes.len() + payload.len());
    frame.extend_from_slice(&magic_bytes);
    frame.extend_from_slice(&length_bytes);
    frame.extend_from_slice(payload);
    frame
}
4086
#[cfg(feature = "lsm")]
#[test]
fn decode_all_with_skippable_visitor_sees_payloads_in_order() {
    // Compress `payload` into a standalone default-level frame.
    fn encode(payload: &[u8]) -> Vec<u8> {
        let mut frame = Vec::new();
        let mut encoder = FrameCompressor::new(CompressionLevel::Default);
        encoder.set_source(payload);
        encoder.set_drain(&mut frame);
        encoder.compress();
        frame
    }

    // Stream under test:
    //   skippable(v0, "alpha") | frame A | skippable(v3, "beta") |
    //   frame B | skippable(v15, "")
    // The visitor must fire once per skippable frame, in stream order,
    // while both zstd frames still decode normally.
    let payload_a: Vec<u8> = (0..=255u8).collect();
    let payload_b: Vec<u8> = (0..=255u8).map(|b| b ^ 0xAA).collect();
    let frame_a = encode(&payload_a);
    let frame_b = encode(&payload_b);

    let mut stream = Vec::new();
    stream.extend_from_slice(&build_skippable_frame(0, b"alpha"));
    stream.extend_from_slice(&frame_a);
    stream.extend_from_slice(&build_skippable_frame(3, b"beta"));
    stream.extend_from_slice(&frame_b);
    stream.extend_from_slice(&build_skippable_frame(15, &[]));

    let mut seen: Vec<(u8, Vec<u8>)> = Vec::new();
    let mut out = alloc::vec![0u8; payload_a.len() + payload_b.len()];
    let mut decoder = FrameDecoder::new();
    let written = decoder
        .decode_all_with_skippable_visitor(stream.as_slice(), &mut out, |variant, payload| {
            seen.push((variant, payload.to_vec()));
        })
        .expect("decode_all_with_skippable_visitor should succeed");

    // Exactly three visits, carrying the right (variant, payload) pairs
    // in the order they appear in the stream.
    let expected: Vec<(u8, Vec<u8>)> = alloc::vec![
        (0, b"alpha".to_vec()),
        (3, b"beta".to_vec()),
        (15, Vec::new()),
    ];
    assert_eq!(seen, expected);

    // The two decoded payloads sit back-to-back in `out`.
    let boundary = payload_a.len();
    assert_eq!(written, boundary + payload_b.len());
    assert_eq!(&out[..boundary], payload_a.as_slice());
    assert_eq!(&out[boundary..written], payload_b.as_slice());
}
4141
#[cfg(feature = "lsm")]
#[test]
fn decode_all_silently_skips_when_no_visitor() {
    // Regression gate for the visitor refactor: plain `decode_all` has to
    // keep skipping skippable frames silently (the behavior RFC 8878
    // mandates) and still decode the zstd frame that follows.
    let payload: Vec<u8> = (0..512usize).map(|i| i as u8).collect();

    let mut frame = Vec::new();
    let mut encoder = FrameCompressor::new(CompressionLevel::Default);
    encoder.set_source(payload.as_slice());
    encoder.set_drain(&mut frame);
    encoder.compress();

    // Skippable frame first, then the real frame.
    let mut stream = build_skippable_frame(7, b"ignored sidecar");
    stream.extend_from_slice(&frame);

    let mut out = alloc::vec![0u8; payload.len()];
    let mut decoder = FrameDecoder::new();
    let written = decoder
        .decode_all(stream.as_slice(), &mut out)
        .expect("decode_all should succeed on skippable + zstd stream");
    assert_eq!(written, payload.len());
    assert_eq!(&out[..written], payload.as_slice());
}
4168
#[cfg(feature = "lsm")]
#[test]
fn frame_emit_info_describes_emitted_block_layout() {
    use crate::encoding::frame_emit_info::BlockType;

    // Compress a 200 000-byte ramp (intended to span more than one block),
    // then check that every entry of `FrameEmitInfo::blocks` agrees with
    // the bytes that actually landed in the drain buffer.
    let payload: Vec<u8> = (0..200_000usize).map(|i| i as u8).collect();
    let mut compressed = Vec::new();
    let mut compressor = FrameCompressor::new(CompressionLevel::Default);
    // The content checksum is opt-in (the library default mirrors
    // libzstd's checksum-off); turn it on so the `checksum_range`
    // assertions at the end cover the hash-gated trailer accounting.
    compressor.set_content_checksum(true);
    compressor.set_source(payload.as_slice());
    compressor.set_drain(&mut compressed);
    compressor.compress();

    let info = compressor
        .last_frame_emit_info()
        .expect("last_frame_emit_info populated after compress")
        .clone();
    drop(compressor);

    // The frame header occupies a non-empty range starting at offset 0,
    // and the recorded total equals the number of bytes written.
    assert_eq!(info.frame_header_range.start, 0);
    assert!(info.frame_header_range.end > 0);
    assert_eq!(info.total_size as usize, compressed.len());

    // There is at least one block; only the final entry is flagged last.
    assert!(!info.blocks.is_empty());
    let block_count = info.blocks.len();
    for (i, block) in info.blocks.iter().enumerate() {
        assert_eq!(block.last_block, i + 1 == block_count);
    }

    for (i, block) in info.blocks.iter().enumerate() {
        // Re-decode the 3-byte little-endian block header from the wire:
        // bit 0 = last flag, bits 1-2 = block type, bits 3.. = size field.
        assert_eq!(block.header_size, 3);
        let at = block.offset_in_frame as usize;
        let word = u32::from_le_bytes([compressed[at], compressed[at + 1], compressed[at + 2], 0]);
        let last_flag = (word & 1) != 0;
        let type_bits = (word >> 1) & 0b11;
        let size_field = word >> 3;
        assert_eq!(last_flag, block.last_block);
        assert_eq!(size_field, block.block_size_field);

        // `body_size` is the PHYSICAL length on the wire: the header's
        // size field for every type except RLE, whose body is always one
        // byte.
        let (wire_type, wire_len) = match block.block_type {
            BlockType::Raw => (0, size_field),
            BlockType::RLE => (1, 1),
            BlockType::Compressed => (2, size_field),
            BlockType::Reserved => (3, size_field),
        };
        assert_eq!(block.body_size, wire_len);
        assert_eq!(type_bits, wire_type);

        // Walking invariant: offset + header + body must land exactly on
        // the next block, or — for the final block — on the checksum if
        // one was recorded, else on the end of the frame.
        let next_off = block.offset_in_frame + block.header_size as u32 + block.body_size;
        match (info.blocks.get(i + 1), info.checksum_range.as_ref()) {
            (Some(next), _) => assert_eq!(
                next_off, next.offset_in_frame,
                "block {i} body_size doesn't reach next block's offset_in_frame",
            ),
            (None, Some(cs)) => assert_eq!(
                next_off, cs.start,
                "last block body_size doesn't reach checksum_range.start",
            ),
            (None, None) => assert_eq!(
                next_off, info.total_size,
                "last block body_size doesn't reach total_size",
            ),
        }
    }

    // A checksum range is recorded exactly when the `hash` feature is on.
    assert_eq!(info.checksum_range.is_some(), cfg!(feature = "hash"));
}
4257
#[cfg(all(feature = "lsm", feature = "hash"))]
#[test]
fn per_block_checksum_round_trip() {
    // Encoder and decoder both run with per-block checksums switched on.
    // Each side is expected to produce exactly one checksum per physical
    // block on the wire (the encoder per emission site, including every
    // post-split partition; the decoder per decoded block), so the two
    // lists must match in length and element by element.
    let payload: Vec<u8> = (0..200_000usize).map(|i| i as u8).collect();
    let mut compressed = Vec::new();
    let mut compressor = FrameCompressor::new(CompressionLevel::Default);
    compressor.set_source(payload.as_slice());
    compressor.enable_per_block_checksums();
    compressor.set_drain(&mut compressed);
    compressor.compress();

    let from_encoder = compressor
        .last_frame_block_checksums()
        .expect("checksums populated after enable + compress")
        .to_vec();
    drop(compressor);
    assert!(!from_encoder.is_empty());

    // Decode with verification enabled and make sure the data survives.
    let mut decoder = FrameDecoder::new();
    decoder.enable_per_block_checksums();
    let mut decoded = alloc::vec![0u8; payload.len()];
    let written = decoder
        .decode_all(compressed.as_slice(), &mut decoded)
        .expect("decode_all should succeed");
    assert_eq!(written, payload.len());
    assert_eq!(&decoded[..written], payload.as_slice());

    // The decoder's digests must equal the encoder's.
    let from_decoder = decoder.computed_block_checksums();
    assert_eq!(from_decoder, &from_encoder[..]);
}
4296
4297 // ── decode_blocks_partial (block-subset partial decode, lsm) ──
4298
/// Generate a deterministic, compressible payload of at least 400 KiB,
/// compress it at the default level and return
/// `(compressed, full_decode, emit_info)`.
///
/// `full_decode` is asserted to equal the generated payload. Callers use
/// `emit_info.decompressed_byte_range(i)` to find the decompressed byte
/// range covered by block `i`.
#[cfg(feature = "lsm")]
fn multi_block_fixture() -> (
    Vec<u8>,
    Vec<u8>,
    crate::encoding::frame_emit_info::FrameEmitInfo,
) {
    const TARGET_LEN: usize = 400 * 1024;
    const PHRASE: &[u8] = b"the quick brown fox jumps over the lazy dog\n";

    // Each round appends a run of 16..=63 copies of one byte followed by a
    // fixed phrase; run length and byte both come from a xorshift32 state.
    let mut data: Vec<u8> = Vec::with_capacity(TARGET_LEN);
    let mut state = 0x9E37_79B9u32;
    while data.len() < TARGET_LEN {
        state ^= state << 13;
        state ^= state >> 17;
        state ^= state << 5;
        let run_len = 16 + (state as usize % 48);
        let fill = (state >> 24) as u8;
        data.resize(data.len() + run_len, fill);
        data.extend_from_slice(PHRASE);
    }

    let mut compressed = Vec::new();
    let mut encoder = FrameCompressor::new(CompressionLevel::Default);
    encoder.set_source(data.as_slice());
    encoder.set_drain(&mut compressed);
    encoder.compress();
    let info = encoder
        .last_frame_emit_info()
        .expect("emit info populated")
        .clone();
    drop(encoder);

    // Reference decode of the whole frame; it must reproduce the input.
    let mut full = alloc::vec![0u8; data.len()];
    let mut decoder = FrameDecoder::new();
    let written = decoder
        .decode_all(compressed.as_slice(), &mut full)
        .expect("full decode");
    full.truncate(written);
    assert_eq!(full, data, "fixture must round-trip");

    (compressed, full, info)
}
4342
#[cfg(feature = "lsm")]
#[test]
fn decode_blocks_partial_subset_matches_full_decode() {
    let (compressed, full, info) = multi_block_fixture();
    let nblocks = info.blocks.len() as u32;
    assert!(
        nblocks >= 4,
        "fixture must have several blocks, got {nblocks}"
    );
    let half = nblocks / 2;

    // Block ranges `[s, e)` to try: one block, two blocks, the first half,
    // the whole frame, and two ranges with a non-zero start. The leading
    // `(0, u32::MAX)` case covers the "decode to end of frame" sentinel,
    // which is a separate public contract from an explicit upper bound.
    let cases = [
        (0u32, u32::MAX),
        (0, 1),
        (0, 2),
        (0, half),
        (0, nblocks),
        (1, 2),
        (half, nblocks),
    ];
    for (s, e) in cases.iter().copied() {
        // For the expected-slice and block-count arithmetic the sentinel
        // stands for "through the last block".
        let effective_end = match e {
            u32::MAX => nblocks,
            bounded => bounded,
        };

        let mut source = compressed.as_slice();
        let mut dec = FrameDecoder::new();
        dec.reset(&mut source).unwrap();
        let pd = match dec.decode_blocks_partial(&mut source, s, e, None, false) {
            Ok(pd) => pd,
            Err(err) => panic!("range [{s},{e}) errored: {err:?}"),
        };

        // Expected bytes: from the start of block `s` to the end of the
        // last block in the range, taken from the full decode.
        let first = info.decompressed_byte_range(s as usize).unwrap();
        let last = info
            .decompressed_byte_range((effective_end - 1) as usize)
            .unwrap();
        let expected = &full[first.start as usize..last.end as usize];

        assert_eq!(
            pd.data.as_slice(),
            expected,
            "subset bytes must equal the full-decode slice for [{s},{e})"
        );
        assert_eq!(pd.start_block, s);
        assert_eq!(pd.blocks_decoded, effective_end - s);
        assert!(pd.stopped_at.is_none(), "clean range [{s},{e})");
    }
}
4390
#[cfg(feature = "lsm")]
#[test]
fn decode_blocks_partial_recovers_clean_prefix_on_truncated_block() {
    let (compressed, full, info) = multi_block_fixture();
    let k = info.blocks.len() / 2;
    assert!(k >= 1, "need a clean prefix before the failing block");

    // Cut the input immediately after block k's header: with zero body
    // bytes available the body read fails whatever the block type is.
    let victim = &info.blocks[k];
    let cut = victim.offset_in_frame as usize + victim.header_size as usize;
    let mut source = &compressed[..cut];

    let mut dec = FrameDecoder::new();
    dec.reset(&mut source).unwrap();
    let pd = dec
        .decode_blocks_partial(&mut source, 0, u32::MAX, None, false)
        .unwrap();

    // The call itself succeeds; the failure is reported via `stopped_at`
    // and everything decoded before it is kept.
    let stop_index = pd
        .stopped_at
        .as_ref()
        .map(|(idx, _)| *idx)
        .expect("must stop on the truncated block");
    assert_eq!(stop_index, k as u32, "stopped at the truncated block index");
    assert_eq!(pd.blocks_decoded, k as u32, "blocks 0..k decoded cleanly");
    assert!(!pd.frame_finished);

    let clean_end = info.decompressed_byte_range(k).unwrap().start as usize;
    assert_eq!(
        pd.data.as_slice(),
        &full[..clean_end],
        "clean prefix preserved through the failure"
    );
}
4422
#[cfg(feature = "lsm")]
#[test]
fn decode_blocks_partial_invalid_range_errors() {
    use crate::decoding::errors::FrameDecoderError;

    let (compressed, ..) = multi_block_fixture();
    let mut source = compressed.as_slice();
    let mut dec = FrameDecoder::new();
    dec.reset(&mut source).unwrap();

    // A range whose start lies after its end must be rejected, and the
    // error must echo both bounds.
    let err = dec
        .decode_blocks_partial(&mut source, 5, 2, None, false)
        .expect_err("start > end must error");
    let is_invalid_range = matches!(
        err,
        FrameDecoderError::InvalidBlockRange {
            start_block: 5,
            end_block: 2,
        }
    );
    assert!(is_invalid_range);
}
4441
#[cfg(feature = "lsm")]
#[test]
fn decode_blocks_partial_skips_trailing_blocks() {
    let (compressed, full, info) = multi_block_fixture();
    assert!(info.blocks.len() >= 3);

    // Ask for block 0 only.
    let mut source = compressed.as_slice();
    let mut dec = FrameDecoder::new();
    dec.reset(&mut source).unwrap();
    let pd = dec
        .decode_blocks_partial(&mut source, 0, 1, None, false)
        .unwrap();

    assert_eq!(pd.blocks_decoded, 1);
    assert!(pd.stopped_at.is_none());
    assert!(!pd.frame_finished, "block 0 is not the last block");
    let block0_end = info.decompressed_byte_range(0).unwrap().end as usize;
    assert_eq!(pd.data.as_slice(), &full[..block0_end]);

    // The blocks after block 0 (and any checksum) must be left unread in
    // the source.
    let consumed = dec.bytes_read_from_source();
    assert!(
        consumed < u64::from(info.total_size),
        "only block 0's region should be consumed, read {} of {}",
        consumed,
        info.total_size
    );
}
4467
#[cfg(feature = "lsm")]
#[test]
fn lsm_style_range_query_partial_recovery() {
    // Models lsm-tree's range-query path: a key range becomes a window of
    // decompressed bytes, `decompressed_byte_range` maps that window onto
    // inner zstd block indices, and only the covering blocks are decoded.
    // The wanted window must then be recoverable exactly.
    let (compressed, full, info) = multi_block_fixture();
    let total = full.len() as u64;
    let (want_start, want_end) = (total / 3, (total * 2) / 3);

    // Find the block containing `want_start` and the block in which the
    // half-open window `[want_start, want_end)` ends; stop scanning there.
    let nblocks = info.blocks.len();
    let mut start_block = 0u32;
    let mut end_block = nblocks as u32;
    for (i, r) in (0..nblocks).map(|i| (i, info.decompressed_byte_range(i).unwrap())) {
        let holds_start = r.start <= want_start && want_start < r.end;
        let holds_end = r.start < want_end && want_end <= r.end;
        if holds_start {
            start_block = i as u32;
        }
        if holds_end {
            end_block = i as u32 + 1;
            break;
        }
    }

    let mut source = compressed.as_slice();
    let mut dec = FrameDecoder::new();
    dec.reset(&mut source).unwrap();
    let pd = dec
        .decode_blocks_partial(&mut source, start_block, end_block, None, false)
        .unwrap();
    assert!(pd.stopped_at.is_none());

    // Decompressed span actually covered by the chosen blocks.
    let covered = {
        let first = info.decompressed_byte_range(start_block as usize).unwrap();
        let last = info
            .decompressed_byte_range((end_block - 1) as usize)
            .unwrap();
        first.start..last.end
    };
    assert!(
        covered.start <= want_start && want_end <= covered.end,
        "covering blocks must contain the wanted window"
    );
    assert_eq!(
        pd.data.as_slice(),
        &full[covered.start as usize..covered.end as usize],
        "covered subset must equal the full-decode slice"
    );

    // Cut the exact wanted window out of the covered subset.
    let off = (want_start - covered.start) as usize;
    let len = (want_end - want_start) as usize;
    assert_eq!(
        &pd.data[off..off + len],
        &full[want_start as usize..want_end as usize],
        "exact key range recovered from the partial decode"
    );
}
4529
#[cfg(feature = "lsm")]
#[test]
fn decode_blocks_partial_leaves_no_residual_when_no_in_range_block() {
    // Regression: with `start_block` beyond the end of the frame, every
    // block is decoded only as window context and `PartialDecode::data`
    // stays empty. Those context bytes must not remain in the decoder's
    // buffer, otherwise a later collect()/read() on the same decoder
    // would hand back out-of-range data.
    let (compressed, _full, info) = multi_block_fixture();
    let past_end = info.blocks.len() as u32 + 5;

    let mut source = compressed.as_slice();
    let mut dec = FrameDecoder::new();
    dec.reset(&mut source).unwrap();
    let pd = dec
        .decode_blocks_partial(&mut source, past_end, u32::MAX, None, false)
        .unwrap();

    assert!(pd.data.is_empty(), "no in-range block → empty data");
    assert_eq!(pd.blocks_decoded, 0);
    assert!(
        pd.frame_finished,
        "frame's last block was reached as context"
    );
    let collectable = dec.can_collect();
    assert_eq!(
        collectable, 0,
        "context bytes must not leak via collect()/read() when data is empty"
    );
}
4558
#[cfg(feature = "lsm")]
#[test]
fn decode_blocks_partial_empty_range_leaves_no_residual() {
    // Sibling of the start-past-EOF case: an empty range `[k, k)` inside
    // the frame (k < EOF) follows the same `prefix_window_len == None`
    // path, but ends with `frame_finished == false` and with up to
    // `window_size` context bytes still physically in the buffer. The
    // buffer length is checked directly because `can_collect()` holds the
    // window back and would report 0 even if <= window_size bytes were
    // retained.
    let (compressed, _full, info) = multi_block_fixture();
    let k = ((info.blocks.len() as u32) / 2).max(1);

    let mut source = compressed.as_slice();
    let mut dec = FrameDecoder::new();
    dec.reset(&mut source).unwrap();
    let pd = dec
        .decode_blocks_partial(&mut source, k, k, None, false)
        .unwrap();

    assert!(pd.data.is_empty(), "empty range must yield empty data");
    assert_eq!(pd.blocks_decoded, 0);
    assert!(
        !pd.frame_finished,
        "frame should still have trailing blocks"
    );
    let retained = dec.state.as_ref().unwrap().decoder_scratch.buffer_len();
    assert_eq!(
        retained, 0,
        "empty-range partial decode must not retain context bytes"
    );
}
4589
#[cfg(all(feature = "lsm", feature = "hash"))]
#[test]
fn decode_blocks_partial_captures_per_block_checksums() {
    // Regression: with per-block checksums enabled, decode_blocks_partial
    // must populate computed_block_checksums just like decode_blocks /
    // decode_all — otherwise callers verifying per-block digests silently
    // lose them on the partial path.
    //
    // Only the compressed bytes are needed here; the fixture's reference
    // decode and emit info are deliberately ignored.
    let (compressed, _full, _info) = multi_block_fixture();

    // Reference digests via decode_blocks (the path that captures them).
    let mut ref_dec = FrameDecoder::new();
    ref_dec.enable_per_block_checksums();
    let mut rsrc = compressed.as_slice();
    ref_dec.reset(&mut rsrc).unwrap();
    while !ref_dec.is_finished() {
        ref_dec
            .decode_blocks(&mut rsrc, crate::decoding::BlockDecodingStrategy::All)
            .unwrap();
    }
    let expected = ref_dec.computed_block_checksums().to_vec();
    // Require more than one digest so the check matches its message and
    // the comparison below really spans several blocks.
    assert!(expected.len() > 1, "fixture must have multiple blocks");

    // Partial decode of the whole frame must capture the same digests.
    let mut source = compressed.as_slice();
    let mut dec = FrameDecoder::new();
    dec.enable_per_block_checksums();
    dec.reset(&mut source).unwrap();
    // The decoded payload is irrelevant here; only the digests recorded
    // on the decoder are compared.
    let _ = dec
        .decode_blocks_partial(&mut source, 0, u32::MAX, None, false)
        .unwrap();
    assert_eq!(
        dec.computed_block_checksums(),
        expected.as_slice(),
        "partial decode must capture the same per-block checksums as full decode"
    );
}
4627
4628 // ── resume (window-priming + entropy cold resume, lsm) ───────────
4629
/// Reset a fresh decoder on `compressed` and report the window size stored
/// in its frame header, or 0 when the header yields none.
#[cfg(feature = "lsm")]
fn frame_window_size(compressed: &[u8]) -> usize {
    let mut src = compressed;
    let mut dec = FrameDecoder::new();
    dec.reset(&mut src).unwrap();

    let state = dec.state.as_ref().unwrap();
    let window = state.frame_header.window_size().unwrap_or(0);
    window as usize
}
4643
/// Generate a deterministic, compressible payload of at least 3 MiB,
/// compress it at the default level and return
/// `(compressed, full_decode, emit_info)`.
///
/// The payload is sized so the encoder is expected to produce a
/// MULTI-SEGMENT frame (window_size < content size, so mid-frame blocks
/// can only reach back into a bounded window). That precondition is
/// asserted, as is the round-trip of `full_decode`.
#[cfg(feature = "lsm")]
fn multi_segment_block_fixture() -> (
    Vec<u8>,
    Vec<u8>,
    crate::encoding::frame_emit_info::FrameEmitInfo,
) {
    const TARGET_LEN: usize = 3 * 1024 * 1024;
    const PHRASE: &[u8] = b"the quick brown fox jumps over the lazy dog\n";

    // Each round appends a run of 16..=63 copies of one byte followed by a
    // fixed phrase; run length and byte both come from a xorshift32 state.
    let mut data: Vec<u8> = Vec::with_capacity(TARGET_LEN);
    let mut state = 0x9E37_79B9u32;
    while data.len() < TARGET_LEN {
        state ^= state << 13;
        state ^= state >> 17;
        state ^= state << 5;
        let run_len = 16 + (state as usize % 48);
        let fill = (state >> 24) as u8;
        data.resize(data.len() + run_len, fill);
        data.extend_from_slice(PHRASE);
    }

    let mut compressed = Vec::new();
    let mut encoder = FrameCompressor::new(CompressionLevel::Default);
    encoder.set_source(data.as_slice());
    encoder.set_drain(&mut compressed);
    encoder.compress();
    let info = encoder
        .last_frame_emit_info()
        .expect("emit info populated")
        .clone();
    drop(encoder);

    // Precondition check: the header must not carry the single-segment
    // flag, otherwise the fixture no longer tests what it claims to.
    let mut sanity = FrameDecoder::new();
    sanity.init(&mut compressed.as_slice()).unwrap();
    let single_segment = sanity
        .state
        .as_ref()
        .unwrap()
        .frame_header
        .descriptor
        .single_segment_flag();
    assert!(
        !single_segment,
        "fixture precondition: frame must be multi-segment (resize if encoder default changed)"
    );

    // Reference decode of the whole frame; it must reproduce the input.
    let mut full = alloc::vec![0u8; data.len()];
    let mut decoder = FrameDecoder::new();
    let written = decoder
        .decode_all(compressed.as_slice(), &mut full)
        .expect("full decode");
    full.truncate(written);
    assert_eq!(full, data, "fixture must round-trip");

    (compressed, full, info)
}
4703
/// Produces the [`ResumeState`] needed to resume at block `n`: a throwaway
/// decoder decodes blocks `[0, n)` with `emit_resume = true` and the snapshot
/// it reports is returned.
///
/// Panics if the prefix decode fails or no snapshot was emitted.
#[cfg(feature = "lsm")]
fn emit_resume_state_at(compressed: &[u8], n: u32) -> super::ResumeState {
    let mut reader = compressed;
    let mut producer = FrameDecoder::new();
    producer.reset(&mut reader).unwrap();

    let prefix = producer
        .decode_blocks_partial(&mut reader, 0, n, None, true)
        .expect("prefix decode for resume-state emission");
    prefix
        .resume_state
        .expect("emit_resume should populate resume_state")
}
4717
#[cfg(feature = "lsm")]
#[test]
fn resume_matches_full_decode_at_first_mid_last() {
    // Acceptance criterion: a cold decoder resumed at block N (window primed,
    // entropy restored) must produce, via decode_blocks_partial, exactly the
    // bytes a full decode produces from block N's start through the end of
    // the last block -- checked for N in {1, mid, last}. Repeat_Mode entropy
    // blocks are covered because the emitted ResumeState carries the
    // carry-over tables.
    let (compressed, full, info) = multi_block_fixture();
    let nblocks = info.blocks.len() as u32;
    assert!(nblocks >= 4, "need several blocks, got {nblocks}");

    // Every iteration decodes through the final block, so the expected end
    // offset is the same for all N.
    let frame_end = info
        .decompressed_byte_range((nblocks - 1) as usize)
        .unwrap()
        .end as usize;

    for n in [1u32, nblocks / 2, nblocks - 1] {
        // Producer side: a separate decoder emits the snapshot for block n.
        let snapshot = emit_resume_state_at(&compressed, n);
        assert_eq!(snapshot.block_index(), n);
        let output_offset = info.decompressed_byte_range(n as usize).unwrap().start;
        assert_eq!(snapshot.output_offset(), output_offset);

        // Consumer side: a FRESH (cold) decoder that has only seen the header.
        let mut consumer = FrameDecoder::new();
        let mut header_src = compressed.as_slice();
        consumer.reset(&mut header_src).unwrap();

        // The caller is responsible for positioning the source at block n's
        // compressed offset within the frame.
        let block_offset = info.blocks[n as usize].offset_in_frame as usize;
        let mut block_src = &compressed[block_offset..];

        // The WHOLE decompressed prefix is handed over as window_prime; it is
        // capped to one window internally, exercising the cap path.
        let start = output_offset as usize;
        let resume = super::ResumeInput {
            window_prime: &full[..start],
            state: &snapshot,
        };
        let pd = match consumer.decode_blocks_partial(
            &mut block_src,
            n,
            u32::MAX,
            Some(resume),
            false,
        ) {
            Ok(decoded) => decoded,
            Err(e) => panic!("resume decode at N={n} errored: {e:?}"),
        };

        assert_eq!(
            pd.data.as_slice(),
            &full[start..frame_end],
            "resumed bytes must equal the full-decode slice for N={n}"
        );
        assert_eq!(pd.start_block, n);
        assert_eq!(pd.blocks_decoded, nblocks - n);
        assert!(pd.stopped_at.is_none(), "clean resume at N={n}");
        assert!(pd.frame_finished, "decoded through the last block");
    }
}
4776
#[cfg(feature = "lsm")]
#[test]
fn resume_with_exact_window_tail_matches_full_decode() {
    // The realistic cold-resume shape on a MULTI-SEGMENT frame: instead of the
    // whole decompressed prefix, the caller hands over only its final
    // `window_size` bytes -- the only history a match can ever reference.
    let (compressed, full, info) = multi_segment_block_fixture();
    let window_size = frame_window_size(&compressed);
    let nblocks = info.blocks.len() as u32;

    // Decompressed offset at which block `i` begins.
    let block_start = |i: u32| info.decompressed_byte_range(i as usize).unwrap().start;

    // Pick the first block preceded by more than one window of output, so
    // the supplied tail is a genuine truncation of the prefix.
    let n = (1..nblocks)
        .find(|&i| block_start(i) as usize > window_size)
        .expect("multi-segment frame must have a block past one window");
    let snapshot = emit_resume_state_at(&compressed, n);

    let resume_at = block_start(n) as usize;
    assert!(resume_at > window_size);
    let window_prime = &full[resume_at - window_size..resume_at];

    // Cold decoder: header parsed, then source positioned at block n.
    let mut consumer = FrameDecoder::new();
    let mut header_src = compressed.as_slice();
    consumer.reset(&mut header_src).unwrap();
    let block_offset = info.blocks[n as usize].offset_in_frame as usize;
    let mut block_src = &compressed[block_offset..];

    let resume = super::ResumeInput {
        window_prime,
        state: &snapshot,
    };
    let pd = consumer
        .decode_blocks_partial(&mut block_src, n, u32::MAX, Some(resume), false)
        .unwrap();

    let frame_end = info
        .decompressed_byte_range((nblocks - 1) as usize)
        .unwrap()
        .end as usize;
    assert_eq!(pd.data.as_slice(), &full[resume_at..frame_end]);
    assert_eq!(pd.blocks_decoded, nblocks - n);
}
4824
#[cfg(feature = "lsm")]
#[test]
fn resume_rejects_short_window_prime() {
    // Acceptance criterion: supplying fewer window bytes than the resume
    // requires must surface as a typed error instead of silently decoding
    // against incomplete history.
    let (compressed, full, info) = multi_block_fixture();
    let window_size = frame_window_size(&compressed);
    let nblocks = info.blocks.len() as u32;
    let n = nblocks / 2;
    let snapshot = emit_resume_state_at(&compressed, n);

    // The required window is the smaller of the frame's window size and the
    // output produced before block n.
    let output_offset = info.decompressed_byte_range(n as usize).unwrap().start;
    let required = core::cmp::min(window_size as u64, output_offset) as usize;
    assert!(required > 0, "mid block must require a non-empty window");

    // Hand over a tail that is exactly one byte too short.
    let resume_at = output_offset as usize;
    let short_len = required - 1;
    let prime = &full[resume_at - short_len..resume_at];

    let mut consumer = FrameDecoder::new();
    let mut header_src = compressed.as_slice();
    consumer.reset(&mut header_src).unwrap();
    let block_offset = info.blocks[n as usize].offset_in_frame as usize;
    let mut block_src = &compressed[block_offset..];

    let resume = super::ResumeInput {
        window_prime: prime,
        state: &snapshot,
    };
    let err = consumer
        .decode_blocks_partial(&mut block_src, n, u32::MAX, Some(resume), false)
        .expect_err("short window_prime must be rejected");

    // The error must report both the supplied and the required length.
    match err {
        crate::decoding::errors::FrameDecoderError::ResumeWindowTooShort { got, need } => {
            assert_eq!(got, short_len);
            assert_eq!(need, required);
        }
        other => panic!("expected ResumeWindowTooShort, got {other:?}"),
    }
}
4867
#[cfg(feature = "lsm")]
#[test]
fn resume_range_validates_against_effective_start_not_start_block() {
    // When resuming, `start_block` is ignored: decoding starts at
    // `state.block_index()`. The range guard therefore has to compare the
    // EFFECTIVE start with `end_block`. An `end_block` below the resume block
    // is an inverted range and must be reported as an error rather than
    // producing a silent empty decode. The caller passes the conventional
    // (ignored) `start_block = 0`.
    let (compressed, full, info) = multi_block_fixture();
    let nblocks = info.blocks.len() as u32;
    let n = (nblocks / 2).max(2);
    let snapshot = emit_resume_state_at(&compressed, n);
    let resume_at = info.decompressed_byte_range(n as usize).unwrap().start as usize;

    let mut consumer = FrameDecoder::new();
    let mut header_src = compressed.as_slice();
    consumer.reset(&mut header_src).unwrap();
    let block_offset = info.blocks[n as usize].offset_in_frame as usize;
    let mut block_src = &compressed[block_offset..];

    // One below the resume block n: the effective range is inverted.
    let inverted_end = n - 1;
    let resume = super::ResumeInput {
        window_prime: &full[..resume_at],
        state: &snapshot,
    };
    let err = consumer
        .decode_blocks_partial(&mut block_src, 0, inverted_end, Some(resume), false)
        .expect_err("end_block below the resume block must be an inverted range");

    match err {
        crate::decoding::errors::FrameDecoderError::InvalidBlockRange {
            start_block,
            end_block,
        } => {
            assert_eq!(start_block, n, "error must report the effective start");
            assert_eq!(end_block, inverted_end);
        }
        other => panic!("expected InvalidBlockRange, got {other:?}"),
    }
}
4911
#[cfg(feature = "lsm")]
#[test]
fn resume_rejects_state_from_a_different_frame() {
    // A ResumeState captured from one frame must not be applied to a frame
    // with a different decode shape (window size / single-segment / dict):
    // restoring foreign entropy tables would yield byte-wrong output. The
    // frame-identity guard must reject it up front with a typed error.
    let (frame_a, _full_a, info_a) = multi_block_fixture();
    let (frame_b, full_b, _info_b) = multi_segment_block_fixture();

    // Sanity: the two fixtures must differ in decode shape for the guard to
    // be exercised (single-segment vs multi-segment here). Read both headers
    // and assert it, so a future fixture change cannot quietly make this
    // test pass or fail for an unrelated reason.
    let decode_shape = |frame: &[u8]| {
        let mut src = frame;
        let mut probe = FrameDecoder::new();
        probe.reset(&mut src).unwrap();
        let header = &probe.state.as_ref().unwrap().frame_header;
        (
            header.descriptor.single_segment_flag(),
            header.window_size().unwrap_or(0),
        )
    };
    assert_ne!(
        decode_shape(frame_a.as_slice()),
        decode_shape(frame_b.as_slice()),
        "fixture precondition: frames A and B must differ in (single_segment, window_size)"
    );

    // Snapshot taken from frame A at a mid-frame block.
    let st = emit_resume_state_at(&frame_a, (info_a.blocks.len() as u32 / 2).max(1));

    // Cold decoder initialised from frame B's header.
    let mut header_src = frame_b.as_slice();
    let mut dec = FrameDecoder::new();
    dec.reset(&mut header_src).unwrap();

    // The frame-key check runs before the window-length check, so even a
    // valid-length window_prime for frame B is rejected on identity.
    let err = dec
        .decode_blocks_partial(
            &mut frame_b.as_slice(),
            st.block_index(),
            u32::MAX,
            Some(super::ResumeInput {
                window_prime: &full_b,
                state: &st,
            }),
            false,
        )
        .expect_err("resume state from a different frame must be rejected");
    assert!(
        matches!(
            err,
            crate::decoding::errors::FrameDecoderError::ResumeFrameMismatch
        ),
        "expected ResumeFrameMismatch, got {err:?}"
    );
}
4950
#[cfg(all(feature = "lsm", feature = "hash"))]
#[test]
fn resume_rejects_wrong_window_prime_content() {
    // The frame is the same one the snapshot came from (so the FrameKey
    // matches), but the supplied window_prime has a single flipped byte. The
    // shape key is blind to that; the content-exact XXH64 of the window has
    // to catch it and reject before anything is restored, rather than
    // resolving matches against corrupted history.
    let (compressed, full, info) = multi_block_fixture();
    let nblocks = info.blocks.len() as u32;
    let n = (nblocks / 2).max(1);
    let snapshot = emit_resume_state_at(&compressed, n);
    let resume_at = info.decompressed_byte_range(n as usize).unwrap().start as usize;
    assert!(resume_at > 0);

    // Take the correct prefix and invert its final byte (a byte inside the
    // window the resume block reaches back into).
    let mut corrupted = full[..resume_at].to_vec();
    *corrupted.last_mut().unwrap() ^= 0xFF;

    let mut consumer = FrameDecoder::new();
    let mut header_src = compressed.as_slice();
    consumer.reset(&mut header_src).unwrap();
    let block_offset = info.blocks[n as usize].offset_in_frame as usize;
    let mut block_src = &compressed[block_offset..];

    let resume = super::ResumeInput {
        window_prime: &corrupted,
        state: &snapshot,
    };
    let err = consumer
        .decode_blocks_partial(&mut block_src, n, u32::MAX, Some(resume), false)
        .expect_err("corrupted window_prime must be rejected by content hash");

    let is_mismatch = matches!(
        err,
        crate::decoding::errors::FrameDecoderError::ResumeFrameMismatch
    );
    assert!(is_mismatch, "expected ResumeFrameMismatch, got {err:?}");
}
4996
#[cfg(feature = "lsm")]
#[test]
fn resume_rejects_state_with_different_active_dictionary() {
    // A frame whose header names no dictionary can still be decoded with a
    // dictionary applied explicitly at runtime (force_dict /
    // reset_with_dict_handle). Two such decodes have different
    // entropy/repcode/dict context although the header dictionary_id is
    // absent in both, so the resume guard has to key on the ACTIVE
    // dictionary and not merely on the header field. The snapshot below is
    // taken with no active dictionary; resuming with one applied must be
    // refused before any state is restored.
    let (compressed, full, info) = multi_block_fixture();
    let nblocks = info.blocks.len() as u32;
    let n = (nblocks / 2).max(1);
    // Captured without a dictionary: active_dictionary_id = None.
    let snapshot = emit_resume_state_at(&compressed, n);
    let resume_at = info.decompressed_byte_range(n as usize).unwrap().start as usize;

    // Load the dictionary fixture and remember its id for force_dict.
    let dict_bytes = std::fs::read("./dict_tests/dictionary").expect("dictionary fixture");
    let dict =
        crate::decoding::dictionary::Dictionary::decode_dict(&dict_bytes).expect("parse dict");
    let dict_id = dict.id;

    // Consumer: register the dictionary, parse the header, then force the
    // dictionary active (active_dictionary_id = Some(dict_id)).
    let mut consumer = FrameDecoder::new();
    consumer.add_dict(dict).unwrap();
    let mut header_src = compressed.as_slice();
    consumer.reset(&mut header_src).unwrap();
    consumer.force_dict(dict_id).unwrap();

    let block_offset = info.blocks[n as usize].offset_in_frame as usize;
    let mut block_src = &compressed[block_offset..];
    let resume = super::ResumeInput {
        window_prime: &full[..resume_at],
        state: &snapshot,
    };
    let err = consumer
        .decode_blocks_partial(&mut block_src, n, u32::MAX, Some(resume), false)
        .expect_err("resume with a different active dictionary must be rejected");

    let is_mismatch = matches!(
        err,
        crate::decoding::errors::FrameDecoderError::ResumeFrameMismatch
    );
    assert!(is_mismatch, "expected ResumeFrameMismatch, got {err:?}");
}
5044
#[cfg(feature = "lsm")]
#[test]
fn resume_invalid_range_does_not_mutate_decoder_state() {
    // Rejecting an inverted effective range must leave the decoder
    // untouched: no entropy restore, no window prime, no cursor advance.
    // Before the fix those mutations ran ahead of the range check, so the
    // error path left the decoder in a synthetic resumed state.
    let (compressed, full, info) = multi_block_fixture();
    let nblocks = info.blocks.len() as u32;
    let n = (nblocks / 2).max(2);
    let snapshot = emit_resume_state_at(&compressed, n);
    let resume_at = info.decompressed_byte_range(n as usize).unwrap().start as usize;

    let mut consumer = FrameDecoder::new();
    let mut header_src = compressed.as_slice();
    consumer.reset(&mut header_src).unwrap();
    // Baseline: a freshly reset decoder has its cursor at block 0.
    assert_eq!(consumer.state.as_ref().unwrap().block_counter, 0);

    let block_offset = info.blocks[n as usize].offset_in_frame as usize;
    let mut block_src = &compressed[block_offset..];

    // end_block sits below the resume block, making the range inverted.
    let inverted_end = n - 1;
    let resume = super::ResumeInput {
        window_prime: &full[..resume_at],
        state: &snapshot,
    };
    let err = consumer
        .decode_blocks_partial(&mut block_src, 0, inverted_end, Some(resume), false)
        .expect_err("inverted range must error");
    assert!(matches!(
        err,
        crate::decoding::errors::FrameDecoderError::InvalidBlockRange { .. }
    ));

    // The cursor must still be where reset() left it.
    let cursor_after = consumer.state.as_ref().unwrap().block_counter;
    assert_eq!(
        cursor_after, 0,
        "error path must not advance the cursor (validate before priming)"
    );
}
5088
#[cfg(feature = "lsm")]
#[test]
fn emit_resume_state_absent_on_terminal_block() {
    // Once a decode has consumed the frame's last block there is no "next
    // block" left to resume at: a snapshot would carry a block_index one past
    // EOF, for which the caller has no offset_in_frame. With emit_resume set,
    // the terminal block must therefore produce None instead of a dangling
    // snapshot.
    let (compressed, _full, info) = multi_block_fixture();
    let block_count = info.blocks.len() as u32;

    let mut decoder = FrameDecoder::new();
    let mut reader = compressed.as_slice();
    decoder.reset(&mut reader).unwrap();

    // Decode the entire frame while asking for a resume snapshot.
    let whole = decoder
        .decode_blocks_partial(&mut reader, 0, block_count, None, true)
        .unwrap();

    assert!(whole.frame_finished, "decode must reach the last block");
    let snapshot_absent = whole.resume_state.is_none();
    assert!(
        snapshot_absent,
        "no resume state past the frame's last block"
    );
}
5110
#[cfg(feature = "lsm")]
#[test]
fn emit_resume_state_absent_when_not_requested() {
    // Default partial decode (emit_resume = false) must NOT pay the entropy
    // clone: resume_state stays None.
    //
    // The decoded range deliberately stops BEFORE the frame's last block. A
    // decode that reaches the terminal block reports no resume state even
    // when one is requested (see emit_resume_state_absent_on_terminal_block),
    // so decoding the whole frame here could not tell "not requested" apart
    // from "nothing left to resume".
    let (compressed, _full, info) = multi_block_fixture();
    let nblocks = info.blocks.len() as u32;
    assert!(
        nblocks >= 2,
        "need a non-terminal stop point, got {nblocks} block(s)"
    );
    let end = (nblocks / 2).max(1);

    // Positive control: the same range with emit_resume = true does yield a
    // snapshot (the helper panics if none is emitted), so the None asserted
    // below is attributable to the flag alone.
    let control = emit_resume_state_at(&compressed, end);
    assert_eq!(control.block_index(), end);

    let mut src = compressed.as_slice();
    let mut dec = FrameDecoder::new();
    dec.reset(&mut src).unwrap();
    let pd = dec
        .decode_blocks_partial(&mut src, 0, end, None, false)
        .unwrap();
    assert!(
        pd.resume_state.is_none(),
        "resume_state must be None unless emit_resume is set"
    );
}
5129
#[cfg(feature = "lsm")]
#[test]
fn resume_grow_loop_reconstructs_full() {
    // The motivating scenario: a symmetric, one-call-per-extent grow-loop.
    // Every call consumes the ResumeState emitted by the previous call and
    // emits the one for the next, decoding only its own extent. Concatenating
    // the extents must reproduce the full output without any prefix being
    // decompressed a second time.
    let (compressed, full, info) = multi_block_fixture();
    let nblocks = info.blocks.len() as u32;
    assert!(nblocks >= 4);

    // Extent size in blocks (the last extent may be shorter).
    let step = (nblocks / 3).max(1);

    let mut combined: Vec<u8> = Vec::new();
    let mut carry: Option<super::ResumeState> = None;
    let mut next: u32 = 0;

    while next < nblocks {
        let end = (next + step).min(nblocks);

        // Each extent is decoded by a cold decoder that has only parsed the
        // frame header.
        let mut header_src = compressed.as_slice();
        let mut dec = FrameDecoder::new();
        dec.reset(&mut header_src).unwrap();

        // Source positioned at the extent's first block.
        let block_offset = info.blocks[next as usize].offset_in_frame as usize;
        let mut block_src = &compressed[block_offset..];

        // With a carried snapshot, resume from it using the prefix decoded so
        // far as the window; the first extent has none and passes `None`.
        let output_offset = info.decompressed_byte_range(next as usize).unwrap().start;
        let resume = carry.as_ref().map(|st| super::ResumeInput {
            window_prime: &full[..output_offset as usize],
            state: st,
        });

        // Always ask for a snapshot so the following extent can resume.
        let pd = dec
            .decode_blocks_partial(&mut block_src, next, end, resume, true)
            .unwrap();

        combined.extend_from_slice(&pd.data);
        carry = pd.resume_state;
        next = end;
    }

    assert_eq!(
        combined, full,
        "grow-loop extents must reconstruct the full output"
    );
}
5187
#[cfg(all(feature = "lsm", feature = "hash"))]
#[test]
fn resume_does_not_redecode_prefix_blocks() {
    // Instrumented check that a resume never re-decodes blocks below N. The
    // resuming decoder has per-block checksums enabled, so the number of
    // recorded digests counts the blocks it actually decoded: it must be
    // exactly the in-range count (end - N), not the frame's block count.
    let (compressed, full, info) = multi_block_fixture();
    let nblocks = info.blocks.len() as u32;
    let n = nblocks / 2;
    let snapshot = emit_resume_state_at(&compressed, n);
    let resume_at = info.decompressed_byte_range(n as usize).unwrap().start as usize;

    // Checksum capture is switched on before the header is parsed.
    let mut consumer = FrameDecoder::new();
    consumer.enable_per_block_checksums();
    let mut header_src = compressed.as_slice();
    consumer.reset(&mut header_src).unwrap();

    let block_offset = info.blocks[n as usize].offset_in_frame as usize;
    let mut block_src = &compressed[block_offset..];
    let resume = super::ResumeInput {
        window_prime: &full[..resume_at],
        state: &snapshot,
    };
    // Only the recorded digests matter here; the decoded bytes are dropped.
    let _ = consumer
        .decode_blocks_partial(&mut block_src, n, u32::MAX, Some(resume), false)
        .unwrap();

    let digests_recorded = consumer.computed_block_checksums().len() as u32;
    assert_eq!(
        digests_recorded,
        nblocks - n,
        "resume must decode only in-range blocks, not re-decode the prefix"
    );
}
5226}