structured_zstd/decoding/frame_decoder.rs
1//! Framedecoder is the main low-level struct users interact with to decode zstd frames
2//!
3//! Zstandard compressed data is made of one or more frames. Each frame is independent and can be
4//! decompressed independently of other frames. This module contains structures
5//! and utilities that can be used to decode a frame.
6
7use super::frame;
8use crate::decoding;
9use crate::decoding::block_decoder::BlockDecoder;
10use crate::decoding::decode_buffer::DecodeBuffer;
11use crate::decoding::dictionary::{Dictionary, DictionaryHandle};
12use crate::decoding::errors::{DecodeBlockContentError, FrameDecoderError};
13use crate::decoding::flat_buf::FlatBuf;
14use crate::decoding::ringbuffer::RingBuffer;
15use crate::decoding::scratch::DecoderScratch;
16use crate::io::{Error, Read, Write};
17use alloc::collections::BTreeMap;
18use alloc::vec::Vec;
19use core::convert::TryInto;
20
21use crate::common::MAXIMUM_ALLOWED_WINDOW_SIZE;
22
23/// Low level Zstandard decoder that can be used to decompress frames with fine control over when and how many bytes are decoded.
24///
25/// This decoder is able to decode frames only partially and gives control
26/// over how many bytes/blocks will be decoded at a time (so you don't have to decode a 10GB file into memory all at once).
27/// It reads bytes as needed from a provided source and can be read from to collect partial results.
28///
29/// If you want to just read the whole frame with an `io::Read` without having to deal with manually calling [FrameDecoder::decode_blocks]
/// you can use the provided [crate::decoding::StreamingDecoder] which wraps this FrameDecoder.
31///
32/// Workflow is as follows:
33/// ```
34/// use structured_zstd::decoding::BlockDecodingStrategy;
35///
36/// # #[cfg(feature = "std")]
37/// use std::io::{Read, Write};
38///
39/// // no_std environments can use the crate's own Read traits
40/// # #[cfg(not(feature = "std"))]
41/// use structured_zstd::io::{Read, Write};
42///
43/// fn decode_this(mut file: impl Read) {
44/// //Create a new decoder
45/// let mut frame_dec = structured_zstd::decoding::FrameDecoder::new();
/// let mut result = vec![0u8; 1024];
47///
48/// // Use reset or init to make the decoder ready to decode the frame from the io::Read
49/// frame_dec.reset(&mut file).unwrap();
50///
51/// // Loop until the frame has been decoded completely
52/// while !frame_dec.is_finished() {
53/// // decode (roughly) batch_size many bytes
54/// frame_dec.decode_blocks(&mut file, BlockDecodingStrategy::UptoBytes(1024)).unwrap();
55///
56/// // read from the decoder to collect bytes from the internal buffer
57/// let bytes_read = frame_dec.read(result.as_mut_slice()).unwrap();
58///
59/// // then do something with it
60/// do_something(&result[0..bytes_read]);
61/// }
62///
63/// // handle the last chunk of data
64/// while frame_dec.can_collect() > 0 {
65/// let x = frame_dec.read(result.as_mut_slice()).unwrap();
66///
67/// do_something(&result[0..x]);
68/// }
69/// }
70///
71/// fn do_something(data: &[u8]) {
72/// # #[cfg(feature = "std")]
73/// std::io::stdout().write_all(data).unwrap();
74/// }
75/// ```
pub struct FrameDecoder {
    /// State of the frame currently being decoded. `None` until the
    /// first successful [`Self::init`] / [`Self::reset`] call.
    state: Option<FrameDecoderState>,
    /// Dictionaries owned by this decoder, keyed by dictionary ID
    /// (registered via [`Self::add_dict`]).
    owned_dicts: BTreeMap<u32, Dictionary>,
    /// Shared dictionary handles keyed by dictionary ID (registered
    /// via `add_dict_handle`). Only present on targets with
    /// pointer-width atomics.
    #[cfg(target_has_atomic = "ptr")]
    shared_dicts: BTreeMap<u32, DictionaryHandle>,
    /// Unit placeholder on targets without pointer-width atomics,
    /// where no shared handles can be registered.
    #[cfg(not(target_has_atomic = "ptr"))]
    shared_dicts: (),
    /// `ZSTD_f_zstd1_magicless` — when true, [`Self::init`] /
    /// [`Self::reset`] expect frames without the 4-byte magic number
    /// prefix. Default false (standard zstd format).
    magicless: bool,
    /// Pinned `Dictionary_ID` expectation set via
    /// [`Self::expect_dict_id`]. `None` (default) disables the
    /// check; `Some(0)` matches frames whose header omits the
    /// optional dict_id (treated as "no dictionary"). Validated in
    /// [`Self::reset`] AFTER the frame header parses successfully
    /// and BEFORE any block decode work.
    #[cfg(feature = "lsm")]
    expect_dict_id: Option<u32>,
    /// Pinned `Window_Descriptor` byte expectation set via
    /// [`Self::expect_window_descriptor`]. `None` (default)
    /// disables the check. Validated in [`Self::reset`] AFTER the
    /// frame header parses successfully and BEFORE any block
    /// decode work. Single-segment frames (which omit the
    /// `Window_Descriptor` byte from the wire) surface as
    /// [`crate::decoding::errors::FrameDecoderError::UnexpectedWindowDescriptor`]
    /// with `found: None`.
    #[cfg(feature = "lsm")]
    expect_window_descriptor: Option<u8>,
}
106
/// Backend-tagged decode scratch — chosen at frame-reset time based
/// on the parsed `FrameHeader.descriptor.single_segment_flag()` and
/// kept stable through the lifetime of the frame. The match in each
/// helper below dispatches **once per call** (e.g. once per block in
/// `decode_block_content`, once per drain in `drain_to_writer`) —
/// never inside the hot push/repeat loop, which is fully
/// monomorphised through the `DecoderScratch<B>` generic.
enum DecoderScratchKind {
    /// Scratch backed by a [`RingBuffer`]; selected when the frame's
    /// single-segment flag is not set.
    Ring(DecoderScratch<RingBuffer>),
    /// Scratch backed by a [`FlatBuf`]; selected for single-segment
    /// frames.
    Flat(DecoderScratch<FlatBuf>),
}
118
119impl DecoderScratchKind {
120 fn new_ring(window_size: usize) -> Self {
121 let mut s = DecoderScratch::<RingBuffer>::new(window_size);
122 s.buffer.reserve(window_size);
123 Self::Ring(s)
124 }
125
126 /// Construct a flat-backed scratch sized for a single-segment
127 /// frame. `frame_content_size` is the upcoming output size in
128 /// bytes (== `window_size` when the flag is set).
129 fn new_flat(frame_content_size: usize) -> Self {
130 let flat = FlatBuf::with_capacity(frame_content_size);
131 // DecoderScratch's default ctor would discard the pre-sized
132 // FlatBuf — go through from_backend so the buffer carries the
133 // capacity the constructor wants.
134 let mut s = DecoderScratch::<FlatBuf>::new(frame_content_size);
135 s.buffer = DecodeBuffer::from_backend(flat, frame_content_size);
136 Self::Flat(s)
137 }
138
139 /// Reset (or transition between) backends for a new frame.
140 /// Reuses the existing `DecoderScratch` allocations (FSE / HUF
141 /// tables, sequence vec, etc.) when the backend kind is unchanged
142 /// — only the underlying buffer is re-sized for the new frame.
143 /// Building a fresh `DecoderScratch` on every frame would
144 /// re-allocate everything and was measured at +255 % vs ring on
145 /// small frames; reusing it keeps the small-frame cost flat.
146 fn reset(&mut self, frame: &frame::FrameHeader, window_size: usize) {
147 if frame.descriptor.single_segment_flag() {
148 match self {
149 Self::Flat(s) => {
150 s.reset(window_size);
151 // DecodeBuffer::reset clears + reserves
152 // window_size; FlatBuf's reserve grows the
153 // backing Vec if the new FCS is larger than
154 // what's already allocated. No alloc when the
155 // previous flat frame had >= this capacity.
156 }
157 Self::Ring(_) => *self = Self::new_flat(window_size),
158 }
159 } else {
160 match self {
161 Self::Ring(s) => s.reset(window_size),
162 Self::Flat(_) => *self = Self::new_ring(window_size),
163 }
164 }
165 }
166
167 fn init_from_dict(&mut self, dict: &Dictionary) {
168 match self {
169 Self::Ring(s) => s.init_from_dict(dict),
170 Self::Flat(s) => s.init_from_dict(dict),
171 }
172 }
173
174 #[inline]
175 fn buffer_len(&self) -> usize {
176 match self {
177 Self::Ring(s) => s.buffer.len(),
178 Self::Flat(s) => s.buffer.len(),
179 }
180 }
181
182 fn buffer_drain(&mut self) -> Vec<u8> {
183 match self {
184 Self::Ring(s) => s.buffer.drain(),
185 Self::Flat(s) => s.buffer.drain(),
186 }
187 }
188
189 fn buffer_drain_to_window_size(&mut self) -> Option<Vec<u8>> {
190 match self {
191 Self::Ring(s) => s.buffer.drain_to_window_size(),
192 Self::Flat(s) => s.buffer.drain_to_window_size(),
193 }
194 }
195
196 fn buffer_drain_to_writer(&mut self, sink: impl Write) -> Result<usize, Error> {
197 match self {
198 Self::Ring(s) => s.buffer.drain_to_writer(sink),
199 Self::Flat(s) => s.buffer.drain_to_writer(sink),
200 }
201 }
202
203 fn buffer_drain_to_window_size_writer(&mut self, sink: impl Write) -> Result<usize, Error> {
204 match self {
205 Self::Ring(s) => s.buffer.drain_to_window_size_writer(sink),
206 Self::Flat(s) => s.buffer.drain_to_window_size_writer(sink),
207 }
208 }
209
210 fn buffer_can_drain(&self) -> usize {
211 match self {
212 Self::Ring(s) => s.buffer.can_drain(),
213 Self::Flat(s) => s.buffer.can_drain(),
214 }
215 }
216
217 fn buffer_can_drain_to_window_size(&self) -> Option<usize> {
218 match self {
219 Self::Ring(s) => s.buffer.can_drain_to_window_size(),
220 Self::Flat(s) => s.buffer.can_drain_to_window_size(),
221 }
222 }
223
224 fn buffer_read(&mut self, target: &mut [u8]) -> Result<usize, Error> {
225 match self {
226 Self::Ring(s) => s.buffer.read(target),
227 Self::Flat(s) => s.buffer.read(target),
228 }
229 }
230
231 fn buffer_read_all(&mut self, target: &mut [u8]) -> Result<usize, Error> {
232 match self {
233 Self::Ring(s) => s.buffer.read_all(target),
234 Self::Flat(s) => s.buffer.read_all(target),
235 }
236 }
237
238 fn decode_block_content<R: Read>(
239 &mut self,
240 decoder: &mut BlockDecoder,
241 header: &crate::blocks::block::BlockHeader,
242 source: R,
243 ) -> Result<u64, DecodeBlockContentError> {
244 match self {
245 Self::Ring(s) => decoder.decode_block_content(header, s, source),
246 Self::Flat(s) => decoder.decode_block_content(header, s, source),
247 }
248 }
249
250 #[cfg(feature = "hash")]
251 fn hash_finish(&self) -> u64 {
252 use core::hash::Hasher;
253 match self {
254 Self::Ring(s) => s.buffer.hash.finish(),
255 Self::Flat(s) => s.buffer.hash.finish(),
256 }
257 }
258}
259
/// Per-frame decoding state owned by `FrameDecoder`; rebuilt or reset
/// for every new frame.
struct FrameDecoderState {
    /// Header parsed from the start of the current frame.
    pub frame_header: frame::FrameHeader,
    /// Decode scratch (ring- or flat-backed) used for this frame.
    decoder_scratch: DecoderScratchKind,
    /// Set once a block with the `last_block` flag has been decoded.
    frame_finished: bool,
    /// Number of blocks decoded so far in this frame.
    block_counter: usize,
    /// Bytes consumed from the source for this frame: header, block
    /// headers, block bodies and (if read) the 4-byte checksum.
    bytes_read_counter: u64,
    /// Checksum read from the source after the last block; only set when
    /// the header's content-checksum flag is set.
    check_sum: Option<u32>,
    /// ID of the dictionary applied to this frame, if any.
    using_dict: Option<u32>,
}
269
/// Controls how much work a single `FrameDecoder::decode_blocks` call
/// performs before returning. The limits are checked after each decoded
/// block, so every call decodes at least one block.
pub enum BlockDecodingStrategy {
    /// Keep decoding until the frame's last block has been decoded.
    All,
    /// Stop once at least this many blocks have been decoded by the
    /// current call (or the last block was reached).
    UptoBlocks(usize),
    /// Stop once the current call has grown the decode buffer by at
    /// least this many bytes (or the last block was reached).
    UptoBytes(usize),
}
275
276impl FrameDecoderState {
277 /// Construct a new frame decoder state, reading the frame header
278 /// from `source`. When `magicless` is `true`, the 4-byte magic
279 /// number prefix is NOT consumed (donor `ZSTD_f_zstd1_magicless`).
280 /// Crate-internal — reached only via `FrameDecoder::init` /
281 /// `FrameDecoder::init_with_dict_handle`. Pre-allocates the
282 /// decode buffer to `window_size` so the first block does not
283 /// trigger incremental growth from zero capacity.
284 pub(crate) fn new_with_format(
285 source: impl Read,
286 magicless: bool,
287 ) -> Result<FrameDecoderState, FrameDecoderError> {
288 let (frame, header_size) = frame::read_frame_header_with_format(source, magicless)?;
289 let window_size = frame.window_size()?;
290
291 if window_size > MAXIMUM_ALLOWED_WINDOW_SIZE {
292 return Err(FrameDecoderError::WindowSizeTooBig {
293 requested: window_size,
294 });
295 }
296
297 let decoder_scratch = if frame.descriptor.single_segment_flag() {
298 DecoderScratchKind::new_flat(window_size as usize)
299 } else {
300 DecoderScratchKind::new_ring(window_size as usize)
301 };
302 Ok(FrameDecoderState {
303 frame_header: frame,
304 frame_finished: false,
305 block_counter: 0,
306 decoder_scratch,
307 bytes_read_counter: u64::from(header_size),
308 check_sum: None,
309 using_dict: None,
310 })
311 }
312
313 /// Reset this state for a new frame read from `source`, reusing
314 /// existing allocations. When `magicless` is `true`, the frame
315 /// header is read WITHOUT expecting a magic-number prefix
316 /// (donor `ZSTD_f_zstd1_magicless`). Crate-internal — reached
317 /// only via `FrameDecoder::reset`.
318 ///
319 /// `DecodeBuffer::reset` reserves `window_size` internally, so
320 /// no additional frame-level reservation is needed here.
321 /// Further buffer growth during decoding is performed on demand
322 /// by the active block path.
323 pub(crate) fn reset_with_format(
324 &mut self,
325 source: impl Read,
326 magicless: bool,
327 ) -> Result<(), FrameDecoderError> {
328 let (frame_header, header_size) = frame::read_frame_header_with_format(source, magicless)?;
329 let window_size = frame_header.window_size()?;
330
331 if window_size > MAXIMUM_ALLOWED_WINDOW_SIZE {
332 return Err(FrameDecoderError::WindowSizeTooBig {
333 requested: window_size,
334 });
335 }
336
337 self.decoder_scratch
338 .reset(&frame_header, window_size as usize);
339 self.frame_header = frame_header;
340 self.frame_finished = false;
341 self.block_counter = 0;
342 self.bytes_read_counter = u64::from(header_size);
343 self.check_sum = None;
344 self.using_dict = None;
345 Ok(())
346 }
347}
348
349impl Default for FrameDecoder {
350 fn default() -> Self {
351 Self::new()
352 }
353}
354
355impl FrameDecoder {
356 /// This will create a new decoder without allocating anything yet.
357 /// init()/reset() will allocate all needed buffers if it is the first time this decoder is used
358 /// else they just reset these buffers with not further allocations
359 pub fn new() -> FrameDecoder {
360 FrameDecoder {
361 state: None,
362 owned_dicts: BTreeMap::new(),
363 #[cfg(target_has_atomic = "ptr")]
364 shared_dicts: BTreeMap::new(),
365 #[cfg(not(target_has_atomic = "ptr"))]
366 shared_dicts: (),
367 magicless: false,
368 #[cfg(feature = "lsm")]
369 expect_dict_id: None,
370 #[cfg(feature = "lsm")]
371 expect_window_descriptor: None,
372 }
373 }
374
/// Sets (or clears) the `Dictionary_ID` the next frame header must carry.
///
/// With an expectation in place, [`Self::init`] / [`Self::reset`] compare
/// it against the parsed frame header before any block is decoded and
/// return
/// [`crate::decoding::errors::FrameDecoderError::UnexpectedDictId`] on a
/// mismatch. Because the check runs after header parsing, scratch
/// allocation / reservation has already happened by then: the guarantee is
/// "no block decode, no XXH64 init, no partial output", not "zero
/// allocation".
///
/// * `Some(0)` means "no dictionary expected": a header that omits the
///   optional `Dictionary_ID` field passes, a header with an explicit
///   non-zero id fails.
/// * `None` (the default) turns the check off.
///
/// Intended use: a post-AEAD-decrypt sanity check in wire-format consumers
/// (e.g. lsm-tree's encrypted block format pins the `dict_id` baked into
/// the AAD against the inner zstd frame's `dict_id` to defeat
/// dict-substitution attacks).
///
/// This is NOT a replacement for AEAD authentication, and NOT the same
/// semantic as donor `ZSTD_d_windowLogMax` (a ceiling-style limit, which
/// is a separate concern).
#[cfg(feature = "lsm")]
#[cfg_attr(docsrs, doc(cfg(feature = "lsm")))]
pub fn expect_dict_id(&mut self, expected: Option<u32>) {
    self.expect_dict_id = expected;
}
410
/// Sets (or clears) the raw `Window_Descriptor` byte the next frame header
/// must carry (RFC 8878 §3.1.1.1.2 layout: `(exp << 3) | mantissa`).
///
/// With an expectation in place, [`Self::init`] / [`Self::reset`] compare
/// it against the parsed frame header before any block is decoded and
/// return
/// [`crate::decoding::errors::FrameDecoderError::UnexpectedWindowDescriptor`]
/// on a mismatch.
///
/// Single-segment frames carry no `Window_Descriptor` byte on the wire, so
/// with an expectation set they fail the check with `found: None` rather
/// than silently passing.
///
/// `None` (the default) turns the check off.
///
/// The comparison is byte-exact equality, NOT a ceiling. Donor
/// `ZSTD_d_windowLogMax` is a separate ceiling-style limit available
/// through the C FFI surface; this method is for strict validation against
/// a pinned value (e.g. lsm-tree's wire format pins the window descriptor
/// from the AAD to defeat decompression-bomb-swap attacks).
#[cfg(feature = "lsm")]
#[cfg_attr(docsrs, doc(cfg(feature = "lsm")))]
pub fn expect_window_descriptor(&mut self, expected: Option<u8>) {
    self.expect_window_descriptor = expected;
}
439
/// Checks a freshly parsed `frame_header` against the expectations pinned
/// through [`Self::expect_dict_id`] and
/// [`Self::expect_window_descriptor`].
///
/// The dictionary-ID check runs first, then the window-descriptor check;
/// the first mismatch is returned as its typed error variant. This method
/// only reads `self`, so `self.state` stays usable for a later `reset()`.
#[cfg(feature = "lsm")]
fn validate_expectations(
    &self,
    frame_header: &frame::FrameHeader,
) -> Result<(), FrameDecoderError> {
    if let Some(expected) = self.expect_dict_id {
        let found = frame_header.dictionary_id();
        // An absent dict_id in the header only satisfies the
        // "no dictionary" sentinel (0); a present one must be equal.
        let accepted = match found {
            Some(actual) => actual == expected,
            None => expected == 0,
        };
        if !accepted {
            return Err(FrameDecoderError::UnexpectedDictId {
                expected: Some(expected),
                found,
            });
        }
    }

    if let Some(expected) = self.expect_window_descriptor {
        match frame_header.window_descriptor() {
            Some(actual) if actual == expected => {}
            found => {
                return Err(FrameDecoderError::UnexpectedWindowDescriptor { expected, found });
            }
        }
    }

    Ok(())
}
479
480 /// Enable or disable magicless frame format
481 /// (`ZSTD_f_zstd1_magicless`). When set to `true`, subsequent
482 /// [`init`] / [`reset`] calls expect the frame header to begin
483 /// directly with the frame-header descriptor — no 4-byte magic
484 /// number prefix. Default false. Must match the encoder's
485 /// magicless setting; the format is unambiguous only when the
486 /// caller knows it out-of-band.
487 ///
488 /// Note: magicless mode also disables skippable-frame detection.
489 /// The `0x184D2A50..=0x184D2A5F` skippable-frame magic range is
490 /// only recognised when the 4-byte magic prefix is consumed, so
491 /// `decode_all` / `init` / `reset` will treat a skippable frame
492 /// at the head of a magicless stream as a malformed frame header
493 /// (bad descriptor / window-size error) instead of skipping it.
494 /// Mixed-format streams that interleave skippable frames must be
495 /// pre-split by the caller; `set_magicless(true)` is only safe
496 /// when the entire stream is known to be magicless zstd frames.
497 pub fn set_magicless(&mut self, magicless: bool) {
498 self.magicless = magicless;
499 }
500
501 #[cfg(target_has_atomic = "ptr")]
502 fn shared_dict_exists(&self, dict_id: u32) -> bool {
503 self.shared_dicts.contains_key(&dict_id)
504 }
505
506 #[cfg(not(target_has_atomic = "ptr"))]
507 fn shared_dict_exists(&self, _dict_id: u32) -> bool {
508 false
509 }
510
511 fn validate_registered_dictionary(dict: &Dictionary) -> Result<(), FrameDecoderError> {
512 use crate::decoding::errors::DictionaryDecodeError as dict_err;
513
514 if dict.id == 0 {
515 return Err(FrameDecoderError::from(dict_err::ZeroDictionaryId));
516 }
517 if let Some(index) = dict.offset_hist.iter().position(|&rep| rep == 0) {
518 return Err(FrameDecoderError::from(
519 dict_err::ZeroRepeatOffsetInDictionary { index: index as u8 },
520 ));
521 }
522 Ok(())
523 }
524
525 /// init() will allocate all needed buffers if it is the first time this decoder is used
526 /// else they just reset these buffers with not further allocations
527 ///
528 /// Note that all bytes currently in the decodebuffer from any previous frame will be lost. Collect them with collect()/collect_to_writer()
529 ///
530 /// equivalent to reset()
531 pub fn init(&mut self, source: impl Read) -> Result<(), FrameDecoderError> {
532 self.reset(source)
533 }
534
535 /// Initialize the decoder for a new frame using a pre-parsed dictionary handle.
536 ///
537 /// If the frame header has a dictionary ID, this validates it against
538 /// `dict.id()` and returns [`FrameDecoderError::DictIdMismatch`] on mismatch.
539 ///
540 /// If the header omits the optional dictionary ID, this still applies the
541 /// provided dictionary handle.
542 ///
543 /// # Warning
544 ///
545 /// This method always applies `dict` unless the frame header contains a
546 /// non-matching dictionary ID. Callers must only use this API when they
547 /// already know the frame was encoded with the provided dictionary, even if
548 /// the frame header omits the dictionary ID or encodes an explicit
549 /// dictionary ID of `0`.
550 ///
551 /// Passing a dictionary for a frame that was not encoded with it can
552 /// silently corrupt the decoded output.
553 pub fn init_with_dict_handle(
554 &mut self,
555 source: impl Read,
556 dict: &DictionaryHandle,
557 ) -> Result<(), FrameDecoderError> {
558 self.reset_with_dict_handle(source, dict)
559 }
560
561 /// reset() will allocate all needed buffers if it is the first time this decoder is used
562 /// else they just reset these buffers with not further allocations
563 ///
564 /// Note that all bytes currently in the decodebuffer from any previous frame will be lost. Collect them with collect()/collect_to_writer()
565 ///
566 /// equivalent to init()
567 pub fn reset(&mut self, source: impl Read) -> Result<(), FrameDecoderError> {
568 use FrameDecoderError as err;
569 let magicless = self.magicless;
570 let dict_id = match &mut self.state {
571 Some(s) => {
572 s.reset_with_format(source, magicless)?;
573 s.frame_header.dictionary_id()
574 }
575 None => {
576 self.state = Some(FrameDecoderState::new_with_format(source, magicless)?);
577 self.state
578 .as_ref()
579 .and_then(|state| state.frame_header.dictionary_id())
580 }
581 };
582 // Validate any pinned expectations BEFORE block decode work
583 // runs. Catches dict_id substitution / window-descriptor
584 // tampering on inputs already authenticated by an outer
585 // layer (e.g. AEAD). Returning here leaves `self.state` in
586 // a re-resettable shape — next `reset()` re-parses the
587 // frame header without intermediate cleanup.
588 #[cfg(feature = "lsm")]
589 if let Some(state) = self.state.as_ref() {
590 self.validate_expectations(&state.frame_header)?;
591 }
592 if let Some(dict_id) = dict_id {
593 let state = self.state.as_mut().expect("state initialized");
594 let owned_dicts = &self.owned_dicts;
595 #[cfg(target_has_atomic = "ptr")]
596 let shared_dicts = &self.shared_dicts;
597 let dict = owned_dicts
598 .get(&dict_id)
599 .or_else(|| {
600 #[cfg(target_has_atomic = "ptr")]
601 {
602 shared_dicts.get(&dict_id).map(DictionaryHandle::as_dict)
603 }
604 #[cfg(not(target_has_atomic = "ptr"))]
605 {
606 None
607 }
608 })
609 .ok_or(err::DictNotProvided { dict_id })?;
610 state.decoder_scratch.init_from_dict(dict);
611 state.using_dict = Some(dict_id);
612 }
613 Ok(())
614 }
615
616 /// Reset this decoder for a new frame using a pre-parsed dictionary handle.
617 ///
618 /// If the frame header has a dictionary ID, this validates it against
619 /// `dict.id()` and returns [`FrameDecoderError::DictIdMismatch`] on mismatch.
620 ///
621 /// If the header omits the optional dictionary ID, this still applies the
622 /// provided dictionary handle.
623 ///
624 /// # Warning
625 ///
626 /// This method always applies `dict` unless the frame header contains a
627 /// non-matching dictionary ID. Callers must only use this API when they
628 /// already know the frame was encoded with the provided dictionary, even if
629 /// the frame header omits the dictionary ID or encodes an explicit
630 /// dictionary ID of `0`.
631 ///
632 /// Passing a dictionary for a frame that was not encoded with it can
633 /// silently corrupt the decoded output.
634 pub fn reset_with_dict_handle(
635 &mut self,
636 source: impl Read,
637 dict: &DictionaryHandle,
638 ) -> Result<(), FrameDecoderError> {
639 use FrameDecoderError as err;
640 Self::validate_registered_dictionary(dict.as_dict())?;
641 let magicless = self.magicless;
642 // Scope the &mut borrow of `self.state` to the header parse
643 // alone, so the subsequent `validate_expectations(&self, ...)`
644 // call below can take a fresh shared borrow of self without
645 // tripping the borrow checker.
646 match &mut self.state {
647 Some(s) => s.reset_with_format(source, magicless)?,
648 None => {
649 self.state = Some(FrameDecoderState::new_with_format(source, magicless)?);
650 }
651 }
652 // Single source of truth: route through the same
653 // `validate_expectations` used by `reset()`. Routing through
654 // the helper keeps the two code paths from drifting (e.g.,
655 // if expect-semantics or error wiring changes later).
656 #[cfg(feature = "lsm")]
657 {
658 let header = &self
659 .state
660 .as_ref()
661 .expect("state populated by reset_with_format/new_with_format")
662 .frame_header;
663 self.validate_expectations(header)?;
664 }
665 let state = self
666 .state
667 .as_mut()
668 .expect("state populated by reset_with_format/new_with_format");
669 if let Some(dict_id) = state.frame_header.dictionary_id()
670 && dict_id != dict.id()
671 {
672 return Err(err::DictIdMismatch {
673 expected: dict_id,
674 provided: dict.id(),
675 });
676 }
677 state.decoder_scratch.init_from_dict(dict.as_dict());
678 state.using_dict = Some(dict.id());
679 Ok(())
680 }
681
682 /// Add a dictionary that can be selected dynamically by frame dictionary ID.
683 ///
684 /// Returns [`FrameDecoderError::DictAlreadyRegistered`] if the ID is already
685 /// registered (either as owned or shared).
686 pub fn add_dict(&mut self, dict: Dictionary) -> Result<(), FrameDecoderError> {
687 Self::validate_registered_dictionary(&dict)?;
688 let dict_id = dict.id;
689 if self.owned_dicts.contains_key(&dict_id) || self.shared_dict_exists(dict_id) {
690 return Err(FrameDecoderError::DictAlreadyRegistered { dict_id });
691 }
692 self.owned_dicts.insert(dict_id, dict);
693 Ok(())
694 }
695
696 /// Parse and add a serialized dictionary blob.
697 pub fn add_dict_from_bytes(&mut self, raw_dictionary: &[u8]) -> Result<(), FrameDecoderError> {
698 let dict = Dictionary::decode_dict(raw_dictionary)?;
699 self.add_dict(dict)
700 }
701
702 /// Add a pre-parsed dictionary handle for reuse across decoders.
703 ///
704 /// This API is available on targets with pointer-width atomics
705 /// (`target_has_atomic = "ptr"`).
706 ///
707 /// Returns [`FrameDecoderError::DictAlreadyRegistered`] if the ID is already
708 /// registered (either as owned or shared).
709 #[cfg(target_has_atomic = "ptr")]
710 pub fn add_dict_handle(&mut self, dict: DictionaryHandle) -> Result<(), FrameDecoderError> {
711 Self::validate_registered_dictionary(dict.as_dict())?;
712 let dict_id = dict.id();
713 if self.owned_dicts.contains_key(&dict_id) || self.shared_dicts.contains_key(&dict_id) {
714 return Err(FrameDecoderError::DictAlreadyRegistered { dict_id });
715 }
716 self.shared_dicts.insert(dict_id, dict);
717 Ok(())
718 }
719
720 pub fn force_dict(&mut self, dict_id: u32) -> Result<(), FrameDecoderError> {
721 use FrameDecoderError as err;
722 let state = self.state.as_mut().ok_or(err::NotYetInitialized)?;
723 let owned_dicts = &self.owned_dicts;
724 #[cfg(target_has_atomic = "ptr")]
725 let shared_dicts = &self.shared_dicts;
726
727 let dict = owned_dicts
728 .get(&dict_id)
729 .or_else(|| {
730 #[cfg(target_has_atomic = "ptr")]
731 {
732 shared_dicts.get(&dict_id).map(DictionaryHandle::as_dict)
733 }
734 #[cfg(not(target_has_atomic = "ptr"))]
735 {
736 None
737 }
738 })
739 .ok_or(err::DictNotProvided { dict_id })?;
740 state.decoder_scratch.init_from_dict(dict);
741 state.using_dict = Some(dict_id);
742
743 Ok(())
744 }
745
746 /// Returns how many bytes the frame contains after decompression
747 pub fn content_size(&self) -> u64 {
748 match &self.state {
749 None => 0,
750 Some(s) => s.frame_header.frame_content_size(),
751 }
752 }
753
754 /// Returns the checksum that was read from the data. Only available after all bytes have been read. It is the last 4 bytes of a zstd-frame
755 pub fn get_checksum_from_data(&self) -> Option<u32> {
756 let state = self.state.as_ref()?;
757
758 state.check_sum
759 }
760
/// Checksum computed over the decoded data while decoding, or `None`
/// when no frame has been initialized.
///
/// Only meaningful once all decoded bytes have been collected/read from
/// the FrameDecoder. The 64-bit hash is truncated to its lower 32 bits.
#[cfg(feature = "hash")]
pub fn get_calculated_checksum(&self) -> Option<u32> {
    self.state
        .as_ref()
        .map(|state| state.decoder_scratch.hash_finish() as u32)
}
770
771 /// Counter for how many bytes have been consumed while decoding the frame
772 pub fn bytes_read_from_source(&self) -> u64 {
773 let state = match &self.state {
774 None => return 0,
775 Some(s) => s,
776 };
777 state.bytes_read_counter
778 }
779
780 /// Whether the current frames last block has been decoded yet
781 /// If this returns true you can call the drain* functions to get all content
782 /// (the read() function will drain automatically if this returns true)
783 pub fn is_finished(&self) -> bool {
784 let state = match &self.state {
785 None => return true,
786 Some(s) => s,
787 };
788 if state.frame_header.descriptor.content_checksum_flag() {
789 state.frame_finished && state.check_sum.is_some()
790 } else {
791 state.frame_finished
792 }
793 }
794
795 /// Counter for how many blocks have already been decoded
796 pub fn blocks_decoded(&self) -> usize {
797 let state = match &self.state {
798 None => return 0,
799 Some(s) => s,
800 };
801 state.block_counter
802 }
803
804 /// Decodes blocks from a reader. It requires that the framedecoder has been initialized first.
805 /// The Strategy influences how many blocks will be decoded before the function returns
806 /// This is important if you want to manage memory consumption carefully. If you don't care
807 /// about that you can just choose the strategy "All" and have all blocks of the frame decoded into the buffer
808 pub fn decode_blocks(
809 &mut self,
810 mut source: impl Read,
811 strat: BlockDecodingStrategy,
812 ) -> Result<bool, FrameDecoderError> {
813 use FrameDecoderError as err;
814 let state = self.state.as_mut().ok_or(err::NotYetInitialized)?;
815
816 let mut block_dec = decoding::block_decoder::new();
817
818 let buffer_size_before = state.decoder_scratch.buffer_len();
819 let block_counter_before = state.block_counter;
820 loop {
821 vprintln!("################");
822 vprintln!("Next Block: {}", state.block_counter);
823 vprintln!("################");
824 let (block_header, block_header_size) = block_dec
825 .read_block_header(&mut source)
826 .map_err(err::FailedToReadBlockHeader)?;
827 state.bytes_read_counter += u64::from(block_header_size);
828
829 vprintln!();
830 vprintln!(
831 "Found {} block with size: {}, which will be of size: {}",
832 block_header.block_type,
833 block_header.content_size,
834 block_header.decompressed_size
835 );
836
837 let bytes_read_in_block_body = state
838 .decoder_scratch
839 .decode_block_content(&mut block_dec, &block_header, &mut source)
840 .map_err(err::FailedToReadBlockBody)?;
841 state.bytes_read_counter += bytes_read_in_block_body;
842
843 state.block_counter += 1;
844
845 vprintln!("Output: {}", state.decoder_scratch.buffer_len());
846
847 if block_header.last_block {
848 state.frame_finished = true;
849 if state.frame_header.descriptor.content_checksum_flag() {
850 let mut chksum = [0u8; 4];
851 source
852 .read_exact(&mut chksum)
853 .map_err(err::FailedToReadChecksum)?;
854 state.bytes_read_counter += 4;
855 let chksum = u32::from_le_bytes(chksum);
856 state.check_sum = Some(chksum);
857 }
858 break;
859 }
860
861 match strat {
862 BlockDecodingStrategy::All => { /* keep going */ }
863 BlockDecodingStrategy::UptoBlocks(n) => {
864 if state.block_counter - block_counter_before >= n {
865 break;
866 }
867 }
868 BlockDecodingStrategy::UptoBytes(n) => {
869 if state.decoder_scratch.buffer_len() - buffer_size_before >= n {
870 break;
871 }
872 }
873 }
874 }
875
876 Ok(state.frame_finished)
877 }
878
879 /// Collect bytes and retain window_size bytes while decoding is still going on.
880 /// After decoding of the frame (is_finished() == true) has finished it will collect all remaining bytes
881 pub fn collect(&mut self) -> Option<Vec<u8>> {
882 let finished = self.is_finished();
883 let state = self.state.as_mut()?;
884 if finished {
885 Some(state.decoder_scratch.buffer_drain())
886 } else {
887 state.decoder_scratch.buffer_drain_to_window_size()
888 }
889 }
890
891 /// Collect bytes and retain window_size bytes while decoding is still going on.
892 /// After decoding of the frame (is_finished() == true) has finished it will collect all remaining bytes
893 pub fn collect_to_writer(&mut self, w: impl Write) -> Result<usize, Error> {
894 let finished = self.is_finished();
895 let state = match &mut self.state {
896 None => return Ok(0),
897 Some(s) => s,
898 };
899 if finished {
900 state.decoder_scratch.buffer_drain_to_writer(w)
901 } else {
902 state.decoder_scratch.buffer_drain_to_window_size_writer(w)
903 }
904 }
905
906 /// How many bytes can currently be collected from the decodebuffer, while decoding is going on this will be lower than the actual decodbuffer size
907 /// because window_size bytes need to be retained for decoding.
908 /// After decoding of the frame (is_finished() == true) has finished it will report all remaining bytes
909 pub fn can_collect(&self) -> usize {
910 let finished = self.is_finished();
911 let state = match &self.state {
912 None => return 0,
913 Some(s) => s,
914 };
915 if finished {
916 state.decoder_scratch.buffer_can_drain()
917 } else {
918 state
919 .decoder_scratch
920 .buffer_can_drain_to_window_size()
921 .unwrap_or(0)
922 }
923 }
924
925 /// Decodes as many blocks as possible from the source slice and reads from the decodebuffer into the target slice
926 /// The source slice may contain only parts of a frame but must contain at least one full block to make progress
927 ///
928 /// By all means use decode_blocks if you have a io.Reader available. This is just for compatibility with other decompressors
929 /// which try to serve an old-style c api
930 ///
931 /// Returns (read, written), if read == 0 then the source did not contain a full block and further calls with the same
932 /// input will not make any progress!
933 ///
934 /// Note that no kind of block can be bigger than 128kb.
935 /// So to be safe use at least 128*1024 (max block content size) + 3 (block_header size) + 18 (max frame_header size) bytes as your source buffer
936 ///
937 /// You may call this function with an empty source after all bytes have been decoded. This is equivalent to just call decoder.read(&mut target)
938 pub fn decode_from_to(
939 &mut self,
940 source: &[u8],
941 target: &mut [u8],
942 ) -> Result<(usize, usize), FrameDecoderError> {
943 use FrameDecoderError as err;
944 let bytes_read_at_start = match &self.state {
945 Some(s) => s.bytes_read_counter,
946 None => 0,
947 };
948
949 if !self.is_finished() || self.state.is_none() {
950 let mut mt_source = source;
951
952 if self.state.is_none() {
953 self.init(&mut mt_source)?;
954 }
955
956 //pseudo block to scope "state" so we can borrow self again after the block
957 {
958 let state = match &mut self.state {
959 Some(s) => s,
960 None => panic!("Bug in library"),
961 };
962 let mut block_dec = decoding::block_decoder::new();
963
964 if state.frame_header.descriptor.content_checksum_flag()
965 && state.frame_finished
966 && state.check_sum.is_none()
967 {
968 //this block is needed if the checksum were the only 4 bytes that were not included in the last decode_from_to call for a frame
969 if mt_source.len() >= 4 {
970 let chksum = mt_source[..4].try_into().expect("optimized away");
971 state.bytes_read_counter += 4;
972 let chksum = u32::from_le_bytes(chksum);
973 state.check_sum = Some(chksum);
974 }
975 return Ok((4, 0));
976 }
977
978 loop {
979 //check if there are enough bytes for the next header
980 if mt_source.len() < 3 {
981 break;
982 }
983 let (block_header, block_header_size) = block_dec
984 .read_block_header(&mut mt_source)
985 .map_err(err::FailedToReadBlockHeader)?;
986
987 // check the needed size for the block before updating counters.
988 // If not enough bytes are in the source, the header will have to be read again, so act like we never read it in the first place
989 if mt_source.len() < block_header.content_size as usize {
990 break;
991 }
992 state.bytes_read_counter += u64::from(block_header_size);
993
994 let bytes_read_in_block_body = state
995 .decoder_scratch
996 .decode_block_content(&mut block_dec, &block_header, &mut mt_source)
997 .map_err(err::FailedToReadBlockBody)?;
998 state.bytes_read_counter += bytes_read_in_block_body;
999 state.block_counter += 1;
1000
1001 if block_header.last_block {
1002 state.frame_finished = true;
1003 if state.frame_header.descriptor.content_checksum_flag() {
1004 //if there are enough bytes handle this here. Else the block at the start of this function will handle it at the next call
1005 if mt_source.len() >= 4 {
1006 let chksum = mt_source[..4].try_into().expect("optimized away");
1007 state.bytes_read_counter += 4;
1008 let chksum = u32::from_le_bytes(chksum);
1009 state.check_sum = Some(chksum);
1010 }
1011 }
1012 break;
1013 }
1014 }
1015 }
1016 }
1017
1018 let result_len = self.read(target).map_err(err::FailedToDrainDecodebuffer)?;
1019 let bytes_read_at_end = match &mut self.state {
1020 Some(s) => s.bytes_read_counter,
1021 None => panic!("Bug in library"),
1022 };
1023 let read_len = bytes_read_at_end - bytes_read_at_start;
1024 Ok((read_len as usize, result_len))
1025 }
1026
1027 /// Decode multiple frames into the output slice.
1028 ///
1029 /// `input` must contain an exact number of frames. Skippable frames are allowed and will be
1030 /// skipped during decode.
1031 ///
1032 /// `output` must be large enough to hold the decompressed data. If you don't know
1033 /// how large the output will be, use [`FrameDecoder::decode_blocks`] instead.
1034 ///
1035 /// This calls [`FrameDecoder::init`], and all bytes currently in the decoder will be lost.
1036 ///
1037 /// Returns the number of bytes written to `output`.
1038 pub fn decode_all(
1039 &mut self,
1040 input: &[u8],
1041 output: &mut [u8],
1042 ) -> Result<usize, FrameDecoderError> {
1043 self.decode_all_impl(input, output, |this, src| this.init(src))
1044 }
1045
1046 /// Decode multiple frames into the output slice using a pre-parsed dictionary handle.
1047 ///
1048 /// `input` must contain an exact number of frames. Skippable frames are allowed and will be
1049 /// skipped during decode.
1050 ///
1051 /// `output` must be large enough to hold the decompressed data. If you don't know
1052 /// how large the output will be, use [`FrameDecoder::decode_blocks`] instead.
1053 ///
1054 /// This calls [`FrameDecoder::init_with_dict_handle`], and all bytes currently in the
1055 /// decoder will be lost.
1056 ///
1057 /// # Warning
1058 ///
1059 /// Each decoded frame is initialized with `dict`, even when a frame header
1060 /// omits the optional dictionary ID. Callers must only use this API when
1061 /// they already know the input frames were encoded with the provided
1062 /// dictionary; otherwise decoded output can be silently corrupted.
1063 pub fn decode_all_with_dict_handle(
1064 &mut self,
1065 input: &[u8],
1066 output: &mut [u8],
1067 dict: &DictionaryHandle,
1068 ) -> Result<usize, FrameDecoderError> {
1069 self.decode_all_impl(input, output, |this, src| {
1070 this.init_with_dict_handle(src, dict)
1071 })
1072 }
1073
1074 fn decode_all_impl(
1075 &mut self,
1076 mut input: &[u8],
1077 mut output: &mut [u8],
1078 mut init_frame: impl FnMut(&mut Self, &mut &[u8]) -> Result<(), FrameDecoderError>,
1079 ) -> Result<usize, FrameDecoderError> {
1080 let mut total_bytes_written = 0;
1081 while !input.is_empty() {
1082 match init_frame(self, &mut input) {
1083 Ok(_) => {}
1084 Err(FrameDecoderError::ReadFrameHeaderError(
1085 crate::decoding::errors::ReadFrameHeaderError::SkipFrame { length, .. },
1086 )) => {
1087 input = input
1088 .get(length as usize..)
1089 .ok_or(FrameDecoderError::FailedToSkipFrame)?;
1090 continue;
1091 }
1092 Err(e) => return Err(e),
1093 };
1094 loop {
1095 self.decode_blocks(&mut input, BlockDecodingStrategy::UptoBytes(1024 * 1024))?;
1096 let bytes_written = self
1097 .read(output)
1098 .map_err(FrameDecoderError::FailedToDrainDecodebuffer)?;
1099 output = &mut output[bytes_written..];
1100 total_bytes_written += bytes_written;
1101 if self.can_collect() != 0 {
1102 return Err(FrameDecoderError::TargetTooSmall);
1103 }
1104 if self.is_finished() {
1105 break;
1106 }
1107 }
1108 }
1109
1110 Ok(total_bytes_written)
1111 }
1112
1113 /// Decode multiple frames into the output slice using a serialized dictionary.
1114 ///
1115 /// # Warning
1116 ///
1117 /// Each decoded frame is initialized with the parsed dictionary, even when a
1118 /// frame header omits the optional dictionary ID. Callers must only use this
1119 /// API when they already know the input frames were encoded with that
1120 /// dictionary; otherwise decoded output can be silently corrupted.
1121 pub fn decode_all_with_dict_bytes(
1122 &mut self,
1123 input: &[u8],
1124 output: &mut [u8],
1125 raw_dictionary: &[u8],
1126 ) -> Result<usize, FrameDecoderError> {
1127 let dict = DictionaryHandle::decode_dict(raw_dictionary)?;
1128 self.decode_all_with_dict_handle(input, output, &dict)
1129 }
1130
1131 /// Decode multiple frames into the extra capacity of the output vector.
1132 ///
1133 /// `input` must contain an exact number of frames.
1134 ///
1135 /// `output` must have enough extra capacity to hold the decompressed data.
1136 /// This function will not reallocate or grow the vector. If you don't know
1137 /// how large the output will be, use [`FrameDecoder::decode_blocks`] instead.
1138 ///
1139 /// This calls [`FrameDecoder::init`], and all bytes currently in the decoder will be lost.
1140 ///
1141 /// The length of the output vector is updated to include the decompressed data.
1142 /// The length is not changed if an error occurs.
1143 pub fn decode_all_to_vec(
1144 &mut self,
1145 input: &[u8],
1146 output: &mut Vec<u8>,
1147 ) -> Result<(), FrameDecoderError> {
1148 let len = output.len();
1149 let cap = output.capacity();
1150 output.resize(cap, 0);
1151 match self.decode_all(input, &mut output[len..]) {
1152 Ok(bytes_written) => {
1153 let new_len = core::cmp::min(len + bytes_written, cap); // Sanitizes `bytes_written`.
1154 output.resize(new_len, 0);
1155 Ok(())
1156 }
1157 Err(e) => {
1158 output.resize(len, 0);
1159 Err(e)
1160 }
1161 }
1162 }
1163
/// Decode a single zstd frame from `input` directly into
/// `output`, bypassing the internal `DecodeBuffer` -> `read()`
/// drain copy when the frame is eligible. Donor parity with the
/// `ZSTD_in_dst` litBuffer placement strategy.
///
/// Returns the number of decoded bytes `n`; the decoded data
/// occupies `output[..n]`.
///
/// # Eligibility for the direct path
///
/// Eligibility requires all of:
/// - `frame_content_size` is present in the header (> 0).
/// - `output.len() >= frame_content_size + WILDCOPY_OVERLENGTH`
///   (room for the SIMD wildcopy overshoot slack).
/// - No active dictionary on `self.state` (dict_content is not
///   carried into the stack-local DecodeBuffer this method
///   builds).
///
/// `content_checksum_flag` is NOT a disqualifier: with the
/// `hash` feature enabled, the direct path hashes the decoded
/// `output[..content_size]` once at the end of every successful
/// direct decode and propagates the digest into the persistent
/// scratch's `hash` so [`Self::get_calculated_checksum`] returns
/// the right value.
///
/// On the direct path the literal pushes and
/// sequence-execution match copies write straight into
/// `output`, eliminating the FlatBuf-as-intermediate `read()`
/// drain that dominates poorly-compressed L-7-class corpora
/// (~28% of decode time on
/// `level_-7_fast/decodecorpus-z000033/rust_stream`).
///
/// # Multi-segment frames and match offsets
///
/// Both single-segment and multi-segment frames take the direct
/// path. Multi-segment frames are supported via a per-block
/// `DecodeBuffer::drop_to_window_size` call that caps the
/// visible buffer at `window_size` at block boundaries. The
/// discarded bytes stay physically in the user slice (they're
/// the frame's already-decoded output); only their
/// `BufferBackend::head` visibility moves forward.
///
/// Note: `drop_to_window_size` runs only BETWEEN blocks, so
/// within a single block `buffer.len()` can temporarily exceed
/// `window_size`. `DecodeBuffer::repeat` validates match
/// offsets against `buffer.len()` (not against `window_size`),
/// so corrupted streams with `offset > window_size` but
/// `offset <= current buffer.len()` are NOT rejected by this
/// gate. Strict spec compliance for offsets in multi-segment
/// frames would require an in-block offset bound that we don't
/// currently enforce on either the direct or the fallback path.
///
/// # Fallback path
///
/// Frames that aren't eligible (zero `frame_content_size`,
/// active dictionary, undersized `output`) transparently fall
/// back to a single-frame `decode_blocks` + `read` drain loop,
/// draining into the caller's `output` slice. The fallback is
/// NOT [`Self::decode_all`] semantics: it processes only one
/// frame (no trailing-frame iteration, no silent
/// skippable-frame skip) and returns
/// [`FrameDecoderError::TargetTooSmall`] if the decoded output
/// does not fit in `output`. When the header declares a frame
/// content size, the fallback also returns
/// `FrameContentSizeMismatch` if the drained byte count differs
/// from it.
///
/// # Input expectations
///
/// `input` is expected to contain exactly ONE non-skippable
/// zstd frame. Bytes past the end of that frame are NOT
/// validated and are silently ignored — this differs from
/// [`Self::decode_all`], which loops until `input` is fully
/// consumed and will attempt to parse a second frame (or error)
/// on trailing bytes.
///
/// **Skippable frames are rejected with
/// `ReadFrameHeaderError::SkipFrame` from `init`** — this
/// method does NOT skip them. Multi-frame input or input that
/// might contain skippable frames must go through
/// [`Self::decode_all`], which iterates `init` and handles
/// `SkipFrame` by advancing past the skippable payload.
///
/// # State observability after this call
///
/// On the direct path, decoded bytes are written into `output`
/// via a stack-local `DecodeBuffer<UserSliceBackend>` that is
/// dropped before this function returns. The persistent
/// `state.decoder_scratch.buffer` stays empty. Consequently,
/// after `decode_to_slice_trusted` returns:
///
/// - [`Self::is_finished`] returns `true`,
/// - [`Self::can_collect`] returns `0`,
/// - [`Self::read`] (the crate's `io::Read` impl, which under
///   `feature = "std"` is `std::io::Read`) reads 0 bytes,
/// - [`Self::collect`] returns `Some(Vec::new())`,
/// - [`Self::get_calculated_checksum`] returns the correct
///   value when the frame had `content_checksum_flag` set —
///   the direct path walks the output once at end of decode
///   and propagates the digest into the persistent scratch's
///   hasher so this accessor reads the right state.
///
/// Callers must use the bytes from `output[..n]` (where `n`
/// is the returned count); do not mix `decode_to_slice_trusted` with
/// `read`/`collect` on the same `FrameDecoder`.
///
/// # Panic / DoS surface
///
/// **For trusted input only.** On the direct path
/// `UserSliceBackend` uses release-mode `assert!` for capacity
/// checks across all three write entry points (`extend`,
/// `extend_and_fill`, `extend_from_within_unchecked`). A
/// malformed Compressed block whose payload expands past the
/// declared `frame_content_size` (and beyond the
/// `WILDCOPY_OVERLENGTH` slack the caller sized into `output`)
/// will panic mid-block rather than returning a structured
/// error. The per-block `produced > content_size` guard catches
/// the overshoot AFTER the block, but cannot prevent the
/// in-block writes from running first.
///
/// The trade-off is deliberate for this PR. Making the writes
/// fallible requires extending the `BufferBackend` trait
/// surface, touching every backend implementation, and
/// propagating `Result<_, _>` through the entire sequence
/// executor — a refactor too large to fold into the direct
/// decode wiring without losing review tractability. The
/// follow-up issue tracking that work (referenced below in
/// "Fallible BufferBackend writes") is a hard prerequisite
/// before this entry point becomes safe to expose on
/// untrusted streams.
///
/// Callers handling untrusted input must use [`Self::decode_all`]
/// which routes through `FlatBuf` / `RingBuffer`. Those
/// backends grow via `Vec::reserve` (succeeds or aborts on
/// alloc failure — not error-returning), but the growable Vec
/// capacity absorbs a malformed block's overshoot inside the
/// allocation; the frame-level checks then turn the size
/// mismatch into `FrameContentSizeMismatch` instead of OOB
/// writes into a fixed-size user slice. Fallible
/// `BufferBackend` writes that would let `decode_to_slice_trusted`
/// remain safe on adversarial input are tracked in issue #246.
// The `_trusted` suffix is part of the API contract: this
// entry point is for trusted input only. Adversarial /
// malformed input can panic via release-mode `assert!` inside
// `UserSliceBackend`. Callers handling untrusted data MUST use
// `decode_all` instead. The fallible-`BufferBackend` refactor
// that would let this entry point be safe on adversarial
// input is tracked as a follow-up.
#[doc(alias = "decode_to_slice")]
#[must_use = "decode_to_slice_trusted returns the decoded byte count; ignoring it leaves the output's effective length ambiguous"]
pub fn decode_to_slice_trusted(
    &mut self,
    mut input: &[u8],
    output: &mut [u8],
) -> Result<usize, FrameDecoderError> {
    use super::block_decoder;
    use super::buffer_backend::WILDCOPY_OVERLENGTH;
    use super::decode_buffer::DecodeBuffer;
    use super::scratch::DirectScratch;
    use super::user_slice_buf::UserSliceBackend;
    use crate::io::Read;
    use FrameDecoderError as err;

    // Parse the frame header. This populates `self.state` with
    // the frame descriptor + resets the per-frame scratch
    // (DecoderScratchKind::Flat for single-segment, ::Ring
    // otherwise).
    //
    // Skippable frames are reported by `init` as
    // `ReadFrameHeaderError::SkipFrame`. The direct path
    // doesn't have a state model for "skip + decode next" since
    // it processes a single frame at most — propagate the error
    // unchanged so callers learn this input needs `decode_all`
    // (which iterates init and advances past skippable
    // payloads).
    self.init(&mut input)?;

    let state = self.state.as_mut().ok_or(err::NotYetInitialized)?;
    let content_size = state.frame_header.frame_content_size();
    // Output bytes required for the direct path: the declared
    // content size plus the wildcopy overshoot slack. Saturating
    // so an absurd declared size cannot wrap around.
    let needed = content_size.saturating_add(WILDCOPY_OVERLENGTH as u64);
    // Eligibility independent of `single_segment_flag`:
    // multi-segment frames work via a coarse, block-boundary
    // cap on the visible buffer. The post-block
    // `drop_to_window_size` call in the loop below advances the
    // backend's `head` so `buffer.len()` doesn't grow past
    // `window_size` between blocks — bytes physically remain
    // in the user slice, just leave `len()`'s visible range so
    // a subsequent block's match-offset cannot reach back
    // arbitrarily far via stale history.
    //
    // This is NOT strict spec enforcement of
    // `offset <= window_size`: within a single block
    // `buffer.len()` can temporarily exceed `window_size` and
    // `DecodeBuffer::repeat` validates against `buffer.len()`
    // (not `window_size`), so an in-block match with
    // `offset > window_size` but `offset <= current
    // buffer.len()` is accepted on both direct and fallback
    // paths. See the `decode_to_slice_trusted` doc for the full
    // limitation note.
    //
    // Disabled when a dictionary is active: the persistent
    // `DecoderScratch::buffer.dict_content` seeded by
    // `init_from_dict` is NOT carried into the stack-local
    // `DecodeBuffer<UserSliceBackend>` we build below, so any
    // match that reaches into the external dictionary would
    // decode against an empty prefix. Fall back to the regular
    // path which keeps the dict in the persistent buffer.
    let dict_active = state.using_dict.is_some();
    // `content_checksum_flag` is not a disqualifier. The
    // direct path writes the decoded bytes into the user's
    // `output` slice, and we hash that slice once at the end of
    // decode (single sequential pass over cache-hot data). The
    // computed digest is then propagated into the persistent
    // `state.decoder_scratch.buffer.hash` so the public
    // `get_calculated_checksum()` accessor reads the correct
    // value just like on the `decode_all` path.
    let eligible = content_size > 0 && !dict_active && (output.len() as u64) >= needed;

    if !eligible {
        // Frame doesn't qualify for direct decode (zero
        // frame_content_size — e.g. header lacks FCS — an
        // active dictionary, or output too small for the
        // WILDCOPY slack). Fall through to the existing
        // per-block decode + drain path — `init` already
        // populated `self.state`, so `decode_blocks` + `read`
        // pick up from there.
        let mut output_tail: &mut [u8] = output;
        let mut total_bytes_written = 0;
        loop {
            self.decode_blocks(&mut input, BlockDecodingStrategy::UptoBytes(1024 * 1024))?;
            let bytes_written = self
                .read(output_tail)
                .map_err(err::FailedToDrainDecodebuffer)?;
            output_tail = &mut output_tail[bytes_written..];
            total_bytes_written += bytes_written;
            // Collectable bytes left over after the drain mean
            // `output` could not take everything.
            if self.can_collect() != 0 {
                return Err(err::TargetTooSmall);
            }
            if self.is_finished() {
                break;
            }
        }
        // When the frame header declares a `frame_content_size`,
        // ensure the drained byte count actually matches it.
        // Without this check a corrupt frame that sets FCS but
        // ends early (e.g. last-block flag on a sub-FCS payload)
        // would silently return success here, while the
        // eligible-frame direct path catches the same condition
        // via `FrameContentSizeMismatch`. The fallback path
        // matches that behaviour.
        //
        // Use `fcs_declared()` (NOT `content_size > 0`) as the
        // "is FCS on the wire" gate. The two diverge on the
        // legitimate edge case of an empty frame with an
        // EXPLICIT FCS=0 on the wire (FCS_flag>=1 with bytes
        // reading 0, or single_segment+FCS_flag=0 with the
        // 1-byte FCS=0): `content_size` is 0 in BOTH the
        // "absent" and "explicitly zero" cases, while
        // `fcs_declared()` returns false only in the truly
        // absent case.
        let state = self.state.as_ref().expect("state populated by init");
        if state.frame_header.fcs_declared() && (total_bytes_written as u64) != content_size {
            return Err(err::FrameContentSizeMismatch {
                declared: content_size,
                produced: total_bytes_written as u64,
            });
        }
        return Ok(total_bytes_written);
    }

    // Direct decode path. Borrow the persistent fields
    // (HUF/FSE tables, offset_hist, scratch Vecs) out of
    // whichever scratch variant `init` produced (Flat for
    // single_segment, Ring for multi-segment); we keep them
    // populated across `decode_to_slice_trusted` calls (HUF
    // table reuse is the main scratch-reuse win on small
    // frames). Both variants expose the same set of
    // HUF/FSE/Vec fields; only `buffer` differs and all we
    // read from it here is `window_size`. Then construct a
    // stack-local DecodeBuffer<UserSliceBackend<'o>> over
    // `output` and bundle everything into a `DirectScratch`.
    //
    // The tuple binding avoids the closure / generic
    // gymnastics of returning multiple &mut from a match arm.
    let (huf, fse, offset_hist, literals_buffer, sequences, block_content_buffer, window_size) =
        match &mut state.decoder_scratch {
            DecoderScratchKind::Flat(s) => (
                &mut s.huf,
                &mut s.fse,
                &mut s.offset_hist,
                &mut s.literals_buffer,
                &mut s.sequences,
                &mut s.block_content_buffer,
                s.buffer.window_size,
            ),
            DecoderScratchKind::Ring(s) => (
                &mut s.huf,
                &mut s.fse,
                &mut s.offset_hist,
                &mut s.literals_buffer,
                &mut s.sequences,
                &mut s.block_content_buffer,
                s.buffer.window_size,
            ),
        };
    let backend = UserSliceBackend::from_slice(output);
    let buffer = DecodeBuffer::from_backend(backend, window_size);
    let mut direct = DirectScratch {
        huf,
        fse,
        offset_hist,
        literals_buffer,
        sequences,
        block_content_buffer,
        buffer,
    };

    // Block loop. Mirrors `decode_blocks` (without the
    // strategy-bounded early exit — we always decode the whole
    // frame in one shot for the direct path). Keeps
    // `state.bytes_read_counter` / `state.block_counter` in
    // sync with `decode_blocks` so post-call accessors
    // (`bytes_read_from_source`, `blocks_decoded`) return
    // accurate values.
    let mut block_dec = block_decoder::new();
    // Track total output bytes against the declared
    // `frame_content_size` via the buffer's actual write
    // counter — `BlockHeader.decompressed_size` is 0 for
    // Compressed blocks (the header parser can't know the
    // expanded size before decoding the body), so per-header
    // tracking would always count 0 for those blocks and
    // miscount frames that aren't pure Raw/RLE.
    let mut produced: u64 = 0;
    loop {
        let (block_header, hsize) = block_dec
            .read_block_header(&mut input)
            .map_err(err::FailedToReadBlockHeader)?;
        state.bytes_read_counter += u64::from(hsize);
        // Pre-flight FCS check ONLY for Raw / RLE blocks where
        // `decompressed_size` is the actual block output size.
        // For Compressed blocks the header field is 0; the
        // post-decode check below catches overflow via the
        // backend's actual write counter delta.
        let block_upper = u64::from(block_header.decompressed_size);
        if block_upper > 0 && produced + block_upper > content_size {
            // Frame is corrupt — Raw/RLE block headers claim
            // more output than the FCS allows. Caller's buffer
            // was sized against FCS, so this is decoder-side
            // corruption, not user sizing.
            return Err(err::FrameContentSizeMismatch {
                declared: content_size,
                produced: produced + block_upper,
            });
        }
        // Slice-source fast path: consume the block body
        // straight from `input` without copying into the
        // persistent `block_content_buffer`.
        let before = direct.buffer.total_produced();
        let body_consumed = block_dec
            .decode_block_content_from_slice(&block_header, &mut direct, &mut input)
            .map_err(err::FailedToReadBlockBody)?;
        produced = direct.buffer.total_produced();
        // Post-decode FCS overflow check. Works uniformly for
        // Raw/RLE/Compressed since it reads the actual bytes
        // written by the backend rather than the header's
        // (possibly zero) `decompressed_size`.
        if produced > content_size {
            return Err(err::FrameContentSizeMismatch {
                declared: content_size,
                produced,
            });
        }
        // Silence unused-binding warning when this delta isn't
        // consulted — it's there so future debug builds can
        // assert (produced - before) <= MAX_BLOCK_SIZE if the
        // spec invariant ever needs an explicit gate.
        let _ = before;
        state.bytes_read_counter += body_consumed;
        state.block_counter += 1;
        // Cap the visible buffer at window_size between blocks
        // so the next block's match offsets cannot reach
        // further back than `window_size` at its start (this is
        // a block-boundary cap, not a strict in-block
        // `offset <= window_size` check). Bytes physically stay
        // in the user slice; we just narrow the visible range
        // via `head`. For single-segment frames
        // `content_size <= window_size` so this is a no-op on
        // every iteration; for multi-segment frames it advances
        // `head` once `tail - head` outgrows the window.
        direct.buffer.drop_to_window_size();
        if block_header.last_block {
            // The 4-byte little-endian content checksum, when
            // announced by the header, directly follows the
            // last block.
            if state.frame_header.descriptor.content_checksum_flag() {
                let mut chksum = [0u8; 4];
                input
                    .read_exact(&mut chksum)
                    .map_err(err::FailedToReadChecksum)?;
                state.bytes_read_counter += 4;
                state.check_sum = Some(u32::from_le_bytes(chksum));
            }
            break;
        }
    }
    // Final sanity: blocks summed to exactly `content_size`. A
    // malformed frame with `last_block` set early (or one whose
    // block headers under-count) would land here. Distinct from
    // TargetTooSmall — the caller did their part, the frame
    // itself is corrupt.
    if produced != content_size {
        return Err(err::FrameContentSizeMismatch {
            declared: content_size,
            produced,
        });
    }

    // `direct.buffer.len()` would only show the visible (post
    // head-advance) range, not the total bytes physically
    // written into the user slice. The decode succeeded so
    // `content_size` is the authoritative byte count — the
    // backend wrote exactly that many bytes starting at
    // `output[0]`.
    let written = content_size as usize;
    state.frame_finished = true;
    // Drop the stack-local DirectScratch (and its DecodeBuffer
    // borrow on `output`) so we can re-borrow `output` for the
    // hash pass below. After this point `direct` is gone.
    drop(direct);
    #[cfg(feature = "hash")]
    {
        // Direct path bypasses the per-write hash accounting
        // (DecodeBuffer hashes during drain; the direct path
        // never drains because the user slice IS the buffer).
        // Walk the decoded output once and propagate the
        // resulting hasher state into the persistent scratch's
        // buffer so `get_calculated_checksum()` returns the
        // right value. Cost: ~330 us / MiB at xxhash's
        // ~3 GB/s throughput on x86_64, against cache-hot data.
        //
        // Done unconditionally for every successful direct
        // decode (not just frames with `content_checksum_flag`)
        // so `get_calculated_checksum()` returns the running
        // digest path-independently — matches what
        // `decode_all`'s drain-time hashing produces on
        // checksumless frames too.
        use core::hash::Hasher;
        let mut hasher = twox_hash::XxHash64::with_seed(0);
        hasher.write(&output[..written]);
        match &mut state.decoder_scratch {
            DecoderScratchKind::Flat(s) => s.buffer.hash = hasher,
            DecoderScratchKind::Ring(s) => s.buffer.hash = hasher,
        }
    }
    Ok(written)
}
1615}
1616
1617/// Read bytes from the decode_buffer that are no longer needed. While the frame is not yet finished
1618/// this will retain window_size bytes, else it will drain it completely
1619impl Read for FrameDecoder {
1620 fn read(&mut self, target: &mut [u8]) -> Result<usize, Error> {
1621 let state = match &mut self.state {
1622 None => return Ok(0),
1623 Some(s) => s,
1624 };
1625 if state.frame_finished {
1626 state.decoder_scratch.buffer_read_all(target)
1627 } else {
1628 state.decoder_scratch.buffer_read(target)
1629 }
1630 }
1631}
1632
1633#[cfg(test)]
1634mod tests {
1635 extern crate std;
1636
1637 use super::{DictionaryHandle, FrameDecoder};
1638 use crate::encoding::{CompressionLevel, FrameCompressor};
1639 use alloc::vec::Vec;
1640
1641 #[test]
1642 fn decode_to_slice_trusted_matches_decode_all_on_single_segment_frame() {
1643 // Roundtrip a small payload through the encoder, then decode
1644 // it via both `decode_all` and `decode_to_slice_trusted`. Both paths
1645 // must produce identical output bytes — the only difference
1646 // is the internal buffer/drain shape, not the decoded
1647 // semantics. This is the regression gate for the
1648 // direct-decode wiring.
1649 let payload: Vec<u8> = (0..4096u32).map(|i| (i & 0xFF) as u8).collect();
1650 let mut compressor = FrameCompressor::new(CompressionLevel::Default);
1651 compressor.set_source(payload.as_slice());
1652 let mut compressed = Vec::new();
1653 compressor.set_drain(&mut compressed);
1654 compressor.compress();
1655
1656 // Baseline: decode_all.
1657 let mut dec_a = FrameDecoder::new();
1658 let mut out_a = alloc::vec![0u8; payload.len()];
1659 let n_a = dec_a
1660 .decode_all(compressed.as_slice(), &mut out_a)
1661 .expect("decode_all should succeed");
1662 assert_eq!(n_a, payload.len());
1663 assert_eq!(&out_a[..n_a], payload.as_slice());
1664
1665 // Direct: decode_to_slice_trusted with WILDCOPY slack.
1666 let slack = super::super::buffer_backend::WILDCOPY_OVERLENGTH;
1667 let mut dec_b = FrameDecoder::new();
1668 let mut out_b = alloc::vec![0u8; payload.len() + slack];
1669 let n_b = dec_b
1670 .decode_to_slice_trusted(compressed.as_slice(), &mut out_b)
1671 .expect("decode_to_slice_trusted should succeed");
1672 assert_eq!(
1673 n_b,
1674 payload.len(),
1675 "direct decode produced wrong byte count"
1676 );
1677 assert_eq!(&out_b[..n_b], payload.as_slice());
1678 }
1679
1680 #[test]
1681 fn decode_to_slice_trusted_multi_segment_frame_decodes_correctly() {
1682 // Multi-segment frame: payload large enough that the
1683 // encoder's default frame layout has `single_segment_flag =
1684 // false` and `window_size < frame_content_size`. The direct
1685 // path must cap the visible buffer at window_size after each
1686 // block (drop_to_window_size) so match-offset validation
1687 // matches the spec rule `offset <= window_size`, and still
1688 // produce the same bytes as decode_all on the
1689 // FlatBuf/Ring-backed path.
1690 //
1691 // Make the payload structured so multi-segment behavior
1692 // actually kicks in: 2 MiB of repeating + random-ish bytes
1693 // forces window_size lower than content_size at the encoder.
1694 let mut payload: Vec<u8> = Vec::with_capacity(2 * 1024 * 1024);
1695 for i in 0..payload.capacity() {
1696 payload.push((i.wrapping_mul(2_654_435_761) & 0xFF) as u8);
1697 }
1698 let mut compressor = FrameCompressor::new(CompressionLevel::Default);
1699 compressor.set_source(payload.as_slice());
1700 let mut compressed = Vec::new();
1701 compressor.set_drain(&mut compressed);
1702 compressor.compress();
1703
1704 // Baseline: decode_all through the FlatBuf+drain path.
1705 let mut dec_a = FrameDecoder::new();
1706 let mut out_a = alloc::vec![0u8; payload.len()];
1707 let n_a = dec_a
1708 .decode_all(compressed.as_slice(), &mut out_a)
1709 .expect("decode_all should succeed");
1710 assert_eq!(n_a, payload.len());
1711 assert_eq!(&out_a[..n_a], payload.as_slice());
1712
1713 // Direct path: must give identical bytes via UserSliceBackend
1714 // + per-block drop_to_window_size.
1715 let slack = super::super::buffer_backend::WILDCOPY_OVERLENGTH;
1716 let mut dec_b = FrameDecoder::new();
1717 let mut out_b = alloc::vec![0u8; payload.len() + slack];
1718 let n_b = dec_b
1719 .decode_to_slice_trusted(compressed.as_slice(), &mut out_b)
1720 .expect("decode_to_slice_trusted should succeed on multi-segment frame");
1721 assert_eq!(n_b, payload.len(), "wrong byte count on direct path");
1722 assert_eq!(&out_b[..n_b], payload.as_slice());
1723
1724 // Sanity-check: confirm the encoded frame really IS
1725 // multi-segment. If a future encoder default changes,
1726 // catching the assumption here is better than silently
1727 // testing single_segment on this name.
1728 let mut sanity = FrameDecoder::new();
1729 sanity.init(&mut compressed.as_slice()).unwrap();
1730 assert!(
1731 !sanity
1732 .state
1733 .as_ref()
1734 .unwrap()
1735 .frame_header
1736 .descriptor
1737 .single_segment_flag(),
1738 "test precondition violated: frame is single-segment, rename or resize"
1739 );
1740 }
1741
1742 #[cfg(feature = "hash")]
1743 #[test]
1744 fn decode_to_slice_trusted_propagates_checksum_into_persistent_scratch() {
1745 // Direct path on a checksum-flagged frame: the FrameCompressor
1746 // under `feature = "hash"` sets content_checksum_flag, so the
1747 // decoded frame has a recorded checksum. After
1748 // decode_to_slice_trusted we must be able to verify it matches via
1749 // the public get_calculated_checksum() accessor — the digest
1750 // is computed by walking output at end of decode and stored
1751 // into the persistent scratch's hasher.
1752 let payload: Vec<u8> = (0..8192u32).map(|i| (i & 0xFF) as u8).collect();
1753 let mut compressor = FrameCompressor::new(CompressionLevel::Default);
1754 compressor.set_source(payload.as_slice());
1755 let mut compressed = Vec::new();
1756 compressor.set_drain(&mut compressed);
1757 compressor.compress();
1758
1759 let slack = super::super::buffer_backend::WILDCOPY_OVERLENGTH;
1760 let mut dec = FrameDecoder::new();
1761 let mut out = alloc::vec![0u8; payload.len() + slack];
1762 let n = dec
1763 .decode_to_slice_trusted(compressed.as_slice(), &mut out)
1764 .expect("decode_to_slice_trusted with checksum must succeed");
1765 assert_eq!(n, payload.len());
1766 assert_eq!(&out[..n], payload.as_slice());
1767
1768 // Both sides must report the same checksum: the frame header
1769 // carries the stored u32, and get_calculated_checksum reads
1770 // the running digest the direct path just propagated.
1771 let stored = dec.get_checksum_from_data();
1772 let calculated = dec.get_calculated_checksum();
1773 assert!(stored.is_some(), "frame must carry stored checksum");
1774 assert!(
1775 calculated.is_some(),
1776 "direct path must propagate calculated checksum"
1777 );
1778 assert_eq!(
1779 stored, calculated,
1780 "stored vs calculated checksum mismatch on direct path"
1781 );
1782 }
1783
1784 #[test]
1785 fn decode_to_slice_trusted_fcs_overflow_via_corrupt_frame_returns_structured_error() {
1786 // Hand-build a corrupt frame that declares
1787 // frame_content_size = 4 but the (last) block carries a
1788 // larger Raw payload. The pre-flight FCS check inside the
1789 // direct path's block loop catches this and returns the
1790 // structured FrameContentSizeMismatch variant — not a
1791 // panic, not a generic TargetTooSmall.
1792 //
1793 // Frame layout (single_segment, FCS=4):
1794 // magic 4 bytes 0xFD2FB528
1795 // FHD 1 byte single_segment=1, no checksum,
1796 // FCS field size = 0 (-> 1-byte FCS)
1797 // FCS 1 byte 0x04
1798 // block_header 3 bytes last=1, type=Raw, block_size=10
1799 // block_payload 10 bytes 0xAA repeated
1800 let mut frame = alloc::vec::Vec::new();
1801 // magic
1802 frame.extend_from_slice(&0xFD2FB528u32.to_le_bytes());
1803 // FHD: single_segment=1, fcs_flag=0 (1-byte FCS), no checksum,
1804 // no dict. Bit layout: FCS(7-6)=0, single_segment(5)=1,
1805 // reserved/uncs(4)=0, content_checksum(2)=0, dict(0-1)=00.
1806 frame.push(0b0010_0000);
1807 // FCS: 1 byte
1808 frame.push(4);
1809 // Block header: cBlockSize=10, type=Raw (0), last=1
1810 // 3-byte LE: bit0=last, bits1-2=type(2 bits), bits3-23=size
1811 let cblock_size: u32 = 10;
1812 let bh: u32 = 1 | (cblock_size << 3); // last=1, type=Raw=0
1813 frame.push((bh & 0xFF) as u8);
1814 frame.push((bh >> 8) as u8);
1815 frame.push((bh >> 16) as u8);
1816 // Payload — 10 bytes that, if decoded, would exceed FCS=4.
1817 frame.extend(core::iter::repeat_n(0xAAu8, 10));
1818
1819 let slack = super::super::buffer_backend::WILDCOPY_OVERLENGTH;
1820 let mut dec = FrameDecoder::new();
1821 let mut out = alloc::vec![0u8; 4 + slack];
1822 let err = dec
1823 .decode_to_slice_trusted(&frame, &mut out)
1824 .expect_err("FCS-overflow frame must fail decode");
1825 assert!(
1826 matches!(
1827 err,
1828 super::FrameDecoderError::FrameContentSizeMismatch { .. }
1829 ),
1830 "expected FrameContentSizeMismatch, got {:?}",
1831 err
1832 );
1833 }
1834
1835 #[test]
1836 fn decode_to_slice_trusted_falls_back_when_output_too_small_for_wildcopy_slack() {
1837 // Output sized exactly to frame_content_size (no
1838 // WILDCOPY_OVERLENGTH slack) must NOT trigger the direct
1839 // path — the burst's `extend_from_within_unchecked` writes
1840 // past `tail` into the slack region. Direct dispatcher
1841 // recognises this and falls back to the FlatBuf + drain
1842 // path which still produces the right output.
1843 let payload: Vec<u8> = (0..2048u32)
1844 .map(|i| (i.wrapping_mul(31) & 0xFF) as u8)
1845 .collect();
1846 let mut compressor = FrameCompressor::new(CompressionLevel::Default);
1847 compressor.set_source(payload.as_slice());
1848 let mut compressed = Vec::new();
1849 compressor.set_drain(&mut compressed);
1850 compressor.compress();
1851
1852 let mut dec = FrameDecoder::new();
1853 // Exactly payload.len(), no slack — direct path is gated out.
1854 let mut out = alloc::vec![0u8; payload.len()];
1855 let n = dec
1856 .decode_to_slice_trusted(compressed.as_slice(), &mut out)
1857 .expect("decode_to_slice_trusted should still succeed via fallback");
1858 assert_eq!(n, payload.len());
1859 assert_eq!(&out[..n], payload.as_slice());
1860 }
1861
1862 #[test]
1863 fn decode_to_slice_trusted_fallback_validates_fcs_against_total_output() {
1864 // Synthetic single-segment frame: FCS = 20 bytes, but the
1865 // last-block flag fires after only 4 bytes of raw payload.
1866 // On the direct path this would trip the post-block
1867 // `produced > content_size` check; the fallback path
1868 // (eligible=false because output is sized exactly to FCS,
1869 // no WILDCOPY slack) used to silently return Ok(4). With
1870 // the fix it now surfaces `FrameContentSizeMismatch`
1871 // matching the direct path.
1872 //
1873 // Frame layout: 4 B magic | 1 B FHD (single_segment=1,
1874 // FCS_flag=3 → 8-byte FCS) | 8 B FCS=20 | block header
1875 // (Raw, last, size=4) | 4 raw bytes.
1876 let mut wire = Vec::new();
1877 wire.extend_from_slice(&0xFD2F_B528u32.to_le_bytes()); // magic
1878 // FHD: FCS_flag=3 (8-byte FCS) <<6 | single_segment=1 <<5.
1879 wire.push(0b1110_0000);
1880 wire.extend_from_slice(&20u64.to_le_bytes()); // declared FCS
1881 // Block header: (size << 3) | (block_type << 1) | last_block.
1882 // Raw block (block_type=0), last_block=1, size=4 → 0b00100001 = 0x21.
1883 wire.push(0x21);
1884 wire.push(0x00);
1885 wire.push(0x00);
1886 wire.extend_from_slice(&[1u8, 2, 3, 4]);
1887
1888 let mut dec = FrameDecoder::new();
1889 // Size output exactly at declared FCS (no WILDCOPY slack)
1890 // so the eligibility check gates the direct path out.
1891 let mut out = alloc::vec![0u8; 20];
1892 let err = dec
1893 .decode_to_slice_trusted(wire.as_slice(), &mut out)
1894 .expect_err("fallback must reject corrupt FCS underflow");
1895 match err {
1896 crate::decoding::errors::FrameDecoderError::FrameContentSizeMismatch {
1897 declared,
1898 produced,
1899 } => {
1900 assert_eq!(declared, 20);
1901 assert_eq!(produced, 4);
1902 }
1903 other => panic!("expected FrameContentSizeMismatch, got {other:?}"),
1904 }
1905 }
1906
1907 #[test]
1908 fn decode_to_slice_trusted_fallback_treats_explicit_fcs_zero_as_declared() {
1909 // Synthetic multi-segment frame with FCS_flag=2 (4-byte
1910 // FCS) explicitly set to 0. The header DECLARES zero
1911 // content, but the body carries a 5-byte raw last-block.
1912 // `fcs_declared()` must return true (the field is on the
1913 // wire) so the fallback's post-decode size check sees the
1914 // mismatch — even though `frame_content_size == 0`. This
1915 // is exactly the FCS=0 edge case where the previous
1916 // `content_size > 0` proxy would have silently accepted
1917 // the corrupt frame.
1918 //
1919 // Frame layout:
1920 // 4 B magic — 28 B5 2F FD
1921 // 1 B FHD — FCS_flag=2 (bits 7-6), no
1922 // single_segment, content_checksum=0,
1923 // dict_id_flag=0 → 0b1000_0000
1924 // 1 B window_descriptor — exp=10, mantissa=0 → window=1 MiB
1925 // 4 B FCS — 0 LE
1926 // 3 B block header — raw, last, size=5 → 0x29 0x00 0x00
1927 // 5 B raw payload — anything non-empty
1928 let mut wire = Vec::new();
1929 wire.extend_from_slice(&0xFD2F_B528u32.to_le_bytes());
1930 wire.push(0b1000_0000); // FHD: FCS_flag=2, others 0.
1931 wire.push(0x50); // window_descriptor: exp=10, mantissa=0.
1932 wire.extend_from_slice(&0u32.to_le_bytes()); // FCS = 0.
1933 // Block header (24-bit LE): (size << 3) | (block_type << 1) | last_block
1934 // = (5 << 3) | (0 << 1) | 1 = 0x29.
1935 wire.push(0x29);
1936 wire.push(0x00);
1937 wire.push(0x00);
1938 wire.extend_from_slice(&[1u8, 2, 3, 4, 5]);
1939
1940 let mut dec = FrameDecoder::new();
1941 // FCS=0 declared, so eligibility (`content_size > 0`)
1942 // false — falls through to the drain loop. Output buffer
1943 // size doesn't matter for the eligibility check here;
1944 // give it some room so `read()` can drain the block.
1945 let mut out = alloc::vec![0u8; 16];
1946 let err = dec
1947 .decode_to_slice_trusted(wire.as_slice(), &mut out)
1948 .expect_err("corrupt FCS=0 + 5-byte block must error");
1949 match err {
1950 crate::decoding::errors::FrameDecoderError::FrameContentSizeMismatch {
1951 declared,
1952 produced,
1953 } => {
1954 assert_eq!(declared, 0);
1955 assert_eq!(produced, 5);
1956 }
1957 other => panic!("expected FrameContentSizeMismatch, got {other:?}"),
1958 }
1959 }
1960
1961 #[test]
1962 fn decode_to_slice_trusted_fallback_accepts_honest_explicit_fcs_zero() {
1963 // Companion to the corrupt-FCS=0 test above: an HONEST
1964 // empty frame with FCS_flag=2 (4-byte FCS) explicitly set
1965 // to 0 AND a 0-byte raw last-block. `fcs_declared()`
1966 // returns true and `content_size == 0 == total_written`,
1967 // so the fallback validation accepts the frame instead of
1968 // misreporting a mismatch.
1969 //
1970 // (Single-segment FCS=0 would test a similar invariant
1971 // but trips header-stage validation: `window_size =
1972 // frame_content_size = 0 < MIN_WINDOW_SIZE` fails the
1973 // window-size sanity check before decode runs. Use the
1974 // multi-segment shape where `window_size` comes from
1975 // `window_descriptor` independently of FCS.)
1976 //
1977 // Frame layout:
1978 // 4 B magic
1979 // 1 B FHD — FCS_flag=2, others 0 → 0x80
1980 // 1 B window_descriptor — exp=10 → 1 MiB window
1981 // 4 B FCS — 0 LE
1982 // 3 B block header — raw, last, size=0 → 0x01 0x00 0x00
1983 let mut wire = Vec::new();
1984 wire.extend_from_slice(&0xFD2F_B528u32.to_le_bytes());
1985 wire.push(0b1000_0000);
1986 wire.push(0x50);
1987 wire.extend_from_slice(&0u32.to_le_bytes());
1988 // Block header: (0 << 3) | (0 << 1) | 1 = 0x01.
1989 wire.push(0x01);
1990 wire.push(0x00);
1991 wire.push(0x00);
1992
1993 let mut dec = FrameDecoder::new();
1994 let mut out = alloc::vec![0u8; 16];
1995 let n = dec
1996 .decode_to_slice_trusted(wire.as_slice(), &mut out)
1997 .expect("honest FCS=0 + empty block must succeed");
1998 assert_eq!(n, 0);
1999 }
2000
2001 #[test]
2002 fn reset_with_dict_handle_applies_dict_when_no_dict_id() {
2003 let payload = b"reset-without-dict-id";
2004 let mut compressor = FrameCompressor::new(CompressionLevel::Default);
2005 compressor.set_source(payload.as_slice());
2006 let mut compressed = Vec::new();
2007 compressor.set_drain(&mut compressed);
2008 compressor.compress();
2009
2010 let dict_raw = include_bytes!("../../dict_tests/dictionary");
2011 let handle = DictionaryHandle::decode_dict(dict_raw).expect("dictionary should parse");
2012
2013 let mut decoder = FrameDecoder::new();
2014 decoder
2015 .reset_with_dict_handle(compressed.as_slice(), &handle)
2016 .expect("reset should succeed");
2017 let state = decoder.state.as_ref().expect("state should be initialized");
2018 assert!(state.frame_header.dictionary_id().is_none());
2019 assert_eq!(state.using_dict, Some(handle.id()));
2020 }
2021
2022 #[cfg(feature = "lsm")]
2023 mod expect_validation {
2024 use super::*;
2025 use crate::decoding::errors::FrameDecoderError;
2026
2027 fn compress(payload: &[u8]) -> Vec<u8> {
2028 let mut compressor = FrameCompressor::new(CompressionLevel::Default);
2029 compressor.set_source(payload);
2030 let mut compressed = Vec::new();
2031 compressor.set_drain(&mut compressed);
2032 compressor.compress();
2033 compressed
2034 }
2035
2036 fn compress_with_dict(payload: &[u8], dict_raw: &[u8]) -> Vec<u8> {
2037 let mut compressor = FrameCompressor::new(CompressionLevel::Default);
2038 compressor
2039 .set_dictionary_from_bytes(dict_raw)
2040 .expect("dict load");
2041 compressor.set_source(payload);
2042 let mut compressed = Vec::new();
2043 compressor.set_drain(&mut compressed);
2044 compressor.compress();
2045 compressed
2046 }
2047
2048 #[test]
2049 fn expect_dict_id_none_default_allows_anything() {
2050 let compressed = compress(b"hello-no-expect");
2051 let mut decoder = FrameDecoder::new();
2052 decoder
2053 .reset(compressed.as_slice())
2054 .expect("default None passes");
2055 }
2056
2057 #[test]
2058 fn expect_dict_id_zero_matches_frame_without_dict_id() {
2059 // Default-encoded frame has no dict_id; pinning Some(0)
2060 // ("no dictionary expected") must accept it.
2061 let compressed = compress(b"payload");
2062 let mut decoder = FrameDecoder::new();
2063 decoder.expect_dict_id(Some(0));
2064 decoder
2065 .reset(compressed.as_slice())
2066 .expect("Some(0) ~ None");
2067 }
2068
2069 #[test]
2070 fn expect_dict_id_matching_value_passes() {
2071 let dict_raw = include_bytes!("../../dict_tests/dictionary");
2072 let handle = DictionaryHandle::decode_dict(dict_raw).expect("dict parse");
2073 let actual_id = handle.id();
2074
2075 let compressed = compress_with_dict(b"payload-with-dict", dict_raw);
2076
2077 let mut decoder = FrameDecoder::new();
2078 decoder.expect_dict_id(Some(actual_id));
2079 // Decode requires the dict to be registered; using
2080 // reset_with_dict_handle for that.
2081 decoder
2082 .reset_with_dict_handle(compressed.as_slice(), &handle)
2083 .expect("matching dict_id passes");
2084 }
2085
2086 #[test]
2087 fn expect_dict_id_mismatching_value_fails_before_decode() {
2088 let dict_raw = include_bytes!("../../dict_tests/dictionary");
2089 let handle = DictionaryHandle::decode_dict(dict_raw).expect("dict parse");
2090 let actual_id = handle.id();
2091 let wrong_id = actual_id.wrapping_add(1);
2092
2093 let compressed = compress_with_dict(b"payload-with-dict", dict_raw);
2094
2095 let mut decoder = FrameDecoder::new();
2096 decoder.expect_dict_id(Some(wrong_id));
2097 let err = decoder
2098 .reset_with_dict_handle(compressed.as_slice(), &handle)
2099 .expect_err("mismatch must fail");
2100 match err {
2101 FrameDecoderError::UnexpectedDictId { expected, found } => {
2102 assert_eq!(expected, Some(wrong_id));
2103 assert_eq!(found, Some(actual_id));
2104 }
2105 other => panic!("expected UnexpectedDictId, got {other:?}"),
2106 }
2107 }
2108
2109 #[test]
2110 fn expect_dict_id_nonzero_fails_on_frame_without_dict_id() {
2111 // Frame has no dict_id; expecting Some(42) (non-zero)
2112 // must fail with found = None.
2113 let compressed = compress(b"no-dict-frame");
2114 let mut decoder = FrameDecoder::new();
2115 decoder.expect_dict_id(Some(42));
2116 let err = decoder
2117 .reset(compressed.as_slice())
2118 .expect_err("nonzero expectation on dictless frame must fail");
2119 match err {
2120 FrameDecoderError::UnexpectedDictId { expected, found } => {
2121 assert_eq!(expected, Some(42));
2122 assert_eq!(found, None);
2123 }
2124 other => panic!("expected UnexpectedDictId, got {other:?}"),
2125 }
2126 }
2127
2128 #[test]
2129 fn expect_window_descriptor_none_default_allows_anything() {
2130 let compressed = compress(b"hello-no-wd-expect");
2131 let mut decoder = FrameDecoder::new();
2132 decoder
2133 .reset(compressed.as_slice())
2134 .expect("default None passes");
2135 }
2136
2137 #[test]
2138 fn expect_window_descriptor_mismatch_fails_before_decode() {
2139 // Compress a payload large enough to force a
2140 // multi-segment frame (window_descriptor on wire).
2141 // Default compression at >256 KiB produces multi-
2142 // segment frames with a real window_descriptor byte.
2143 let payload = alloc::vec![0xABu8; 512 * 1024];
2144 let compressed = compress(&payload);
2145
2146 // Read the actual window_descriptor by decoding once
2147 // without expectations, then pin a wrong value.
2148 let mut probe_decoder = FrameDecoder::new();
2149 probe_decoder.reset(compressed.as_slice()).unwrap();
2150 let probe_state = probe_decoder.state.as_ref().unwrap();
2151 let actual_wd = probe_state
2152 .frame_header
2153 .window_descriptor()
2154 .expect("multi-segment frame should expose window_descriptor");
2155 let wrong_wd = actual_wd.wrapping_add(0x10); // bump exponent
2156
2157 let mut decoder = FrameDecoder::new();
2158 decoder.expect_window_descriptor(Some(wrong_wd));
2159 let err = decoder
2160 .reset(compressed.as_slice())
2161 .expect_err("wrong window_descriptor must fail");
2162 match err {
2163 FrameDecoderError::UnexpectedWindowDescriptor { expected, found } => {
2164 assert_eq!(expected, wrong_wd);
2165 assert_eq!(found, Some(actual_wd));
2166 }
2167 other => panic!("expected UnexpectedWindowDescriptor, got {other:?}"),
2168 }
2169 }
2170
2171 /// Build a minimal synthetic single-segment zstd frame
2172 /// carrying a 4-byte raw payload. RFC 8878 §3.1.1.1
2173 /// layout, hand-rolled because our default
2174 /// `FrameCompressor` settings don't emit
2175 /// `single_segment_flag` for tiny inputs.
2176 ///
2177 /// Wire bytes (13 total for 4-byte payload):
2178 /// ```text
2179 /// 28 B5 2F FD magic
2180 /// 20 FHD: single_segment=1, FCS_flag=0
2181 /// 04 FCS (single byte, value = payload.len())
2182 /// 21 00 00 block header: raw, last, size=4
2183 /// .. .. .. .. payload bytes
2184 /// ```
2185 fn synth_single_segment_frame(payload: &[u8]) -> Vec<u8> {
2186 assert!(payload.len() <= 255, "1-byte FCS field caps at 255");
2187 assert!(payload.len() < (1usize << 21), "block size 21-bit max");
2188 let mut out = Vec::new();
2189 // Magic 0xFD2FB528 LE.
2190 out.extend_from_slice(&0xFD2F_B528u32.to_le_bytes());
2191 // FHD: single_segment_flag (bit 5) set, everything
2192 // else zero. With single_segment + FCS_flag=0 the FCS
2193 // field is 1 byte. No window_descriptor on wire.
2194 out.push(0b0010_0000);
2195 // 1-byte FCS = payload length.
2196 out.push(payload.len() as u8);
2197 // Block header (3 bytes LE):
2198 // last_block=1, block_type=0 (Raw), block_size=payload.len().
2199 // Encoded: (size << 3) | (block_type << 1) | last_block.
2200 // Block header: last_block flag in bit 0, block_type
2201 // (0 = Raw) in bits 1-2, block size in bits 3+.
2202 let bh: u32 = ((payload.len() as u32) << 3) | 1;
2203 out.push((bh & 0xFF) as u8);
2204 out.push(((bh >> 8) & 0xFF) as u8);
2205 out.push(((bh >> 16) & 0xFF) as u8);
2206 // Raw payload.
2207 out.extend_from_slice(payload);
2208 out
2209 }
2210
2211 #[test]
2212 fn expect_window_descriptor_on_single_segment_frame_fails_with_found_none() {
2213 // Single-segment frames omit the window_descriptor
2214 // byte from the wire entirely. Setting an expectation
2215 // here must surface `found: None` so callers
2216 // distinguish "wrong descriptor" from "no descriptor
2217 // on the wire" — never silently pass.
2218 let compressed = synth_single_segment_frame(b"tiny");
2219
2220 // First sanity-check: the synthetic frame decodes
2221 // cleanly without any expectation.
2222 {
2223 let mut probe = FrameDecoder::new();
2224 probe
2225 .reset(compressed.as_slice())
2226 .expect("synth frame parses");
2227 let probe_state = probe.state.as_ref().unwrap();
2228 assert!(
2229 probe_state.frame_header.window_descriptor().is_none(),
2230 "synth frame must be single-segment"
2231 );
2232 }
2233
2234 let mut decoder = FrameDecoder::new();
2235 decoder.expect_window_descriptor(Some(0x40));
2236 let err = decoder
2237 .reset(compressed.as_slice())
2238 .expect_err("single-segment + expectation must fail");
2239 match err {
2240 FrameDecoderError::UnexpectedWindowDescriptor { expected, found } => {
2241 assert_eq!(expected, 0x40);
2242 assert_eq!(found, None);
2243 }
2244 other => panic!("expected UnexpectedWindowDescriptor, got {other:?}"),
2245 }
2246 }
2247
2248 #[test]
2249 fn validation_failure_leaves_decoder_re_resettable() {
2250 // After UnexpectedDictId on a wrong-expectation reset,
2251 // clearing the expectation and re-calling reset must
2252 // succeed on the same source — no lingering failed
2253 // state.
2254 let compressed = compress(b"re-resettable");
2255
2256 let mut decoder = FrameDecoder::new();
2257 decoder.expect_dict_id(Some(42));
2258 let err = decoder
2259 .reset(compressed.as_slice())
2260 .expect_err("first reset fails");
2261 assert!(matches!(err, FrameDecoderError::UnexpectedDictId { .. }));
2262
2263 // Clear expectation and retry on a fresh source.
2264 decoder.expect_dict_id(None);
2265 decoder
2266 .reset(compressed.as_slice())
2267 .expect("retry after clearing expectation should succeed");
2268 }
2269 }
2270}