Skip to main content

structured_zstd/encoding/
mod.rs

1//! Structures and utilities used for compressing/encoding data into the Zstd format.
2
3pub(crate) mod block_header;
4pub(crate) mod blocks;
5pub(crate) mod frame_header;
6pub(crate) mod incompressible;
7pub(crate) mod match_generator;
8pub(crate) mod util;
9
10mod frame_compressor;
11mod levels;
12mod streaming_encoder;
13pub use frame_compressor::FrameCompressor;
14pub use match_generator::MatchGeneratorDriver;
15pub use streaming_encoder::StreamingEncoder;
16
17use crate::io::{Read, Write};
18use alloc::vec::Vec;
19
20pub(crate) const BETTER_WINDOW_LOG: u8 = 23;
21
22/// Convenience function to compress some source into a target without reusing any resources of the compressor
23/// ```rust
24/// use structured_zstd::encoding::{compress, CompressionLevel};
25/// let data: &[u8] = &[0,0,0,0,0,0,0,0,0,0,0,0];
26/// let mut target = Vec::new();
27/// compress(data, &mut target, CompressionLevel::Fastest);
28/// ```
29pub fn compress<R: Read, W: Write>(source: R, target: W, level: CompressionLevel) {
30    let mut frame_enc = FrameCompressor::new(level);
31    frame_enc.set_source(source);
32    frame_enc.set_drain(target);
33    frame_enc.compress();
34}
35
36/// Convenience function to compress some source into a Vec without reusing any resources of the compressor.
37///
38/// This helper eagerly buffers the full input (`Read`) before compression so it
39/// can provide a source-size hint to the one-shot encoder path. Peak memory can
40/// therefore be roughly `input_size + output_size`. For very large payloads or
41/// tighter memory budgets, prefer streaming APIs such as [`StreamingEncoder`].
42/// ```rust
43/// use structured_zstd::encoding::{compress_to_vec, CompressionLevel};
44/// let data: &[u8] = &[0,0,0,0,0,0,0,0,0,0,0,0];
45/// let compressed = compress_to_vec(data, CompressionLevel::Fastest);
46/// ```
47pub fn compress_to_vec<R: Read>(source: R, level: CompressionLevel) -> Vec<u8> {
48    let mut source = source;
49    let mut input = Vec::new();
50    source.read_to_end(&mut input).unwrap();
51
52    let mut vec = Vec::new();
53    let mut frame_enc = FrameCompressor::new(level);
54    frame_enc.set_source_size_hint(input.len() as u64);
55    frame_enc.set_source(input.as_slice());
56    frame_enc.set_drain(&mut vec);
57    frame_enc.compress();
58    vec
59}
60
61/// The compression mode used impacts the speed of compression,
62/// and resulting compression ratios. Faster compression will result
63/// in worse compression ratios, and vice versa.
64#[derive(Copy, Clone, Debug)]
65pub enum CompressionLevel {
66    /// This level does not compress the data at all, and simply wraps
67    /// it in a Zstandard frame.
68    Uncompressed,
69    /// This level is roughly equivalent to Zstd compression level 1
70    Fastest,
71    /// This level uses the crate's dedicated `dfast`-style matcher to
72    /// target a better speed/ratio tradeoff than [`CompressionLevel::Fastest`].
73    ///
74    /// It represents this crate's "default" compression setting and may
75    /// evolve in future versions as the implementation moves closer to
76    /// reference zstd level 3 behavior.
77    Default,
78    /// This level is roughly equivalent to Zstd level 7.
79    ///
80    /// Uses the hash-chain matcher with a lazy2 matching strategy: the encoder
81    /// evaluates up to two positions ahead before committing to a match,
82    /// trading speed for a better compression ratio than [`CompressionLevel::Default`].
83    Better,
84    /// This level is roughly equivalent to Zstd level 11.
85    ///
86    /// Uses the hash-chain matcher with a deep lazy2 matching strategy and
87    /// a 16 MiB window. Compared to [`CompressionLevel::Better`], this level
88    /// uses larger hash and chain tables (2 M / 1 M entries vs 1 M / 512 K),
89    /// a deeper search (32 candidates vs 16), and a higher target match
90    /// length (128 vs 48), trading speed for the best compression ratio
91    /// available in this crate.
92    Best,
93    /// Numeric compression level.
94    ///
95    /// Levels 1–22 correspond to the C zstd level numbering.  Higher values
96    /// produce smaller output at the cost of more CPU time.  Negative values
97    /// select ultra-fast modes that trade ratio for speed.  Level 0 is
98    /// treated as [`DEFAULT_LEVEL`](Self::DEFAULT_LEVEL), matching C zstd
99    /// semantics.
100    ///
101    /// Named variants map to specific numeric levels:
102    /// [`Fastest`](Self::Fastest) = 1, [`Default`](Self::Default) = 3,
103    /// [`Better`](Self::Better) = 7, [`Best`](Self::Best) = 11.
104    /// [`Best`](Self::Best) remains the highest-ratio named preset, but
105    /// [`Level`](Self::Level) values above 11 can target stronger (slower)
106    /// tuning than the named hierarchy.
107    ///
108    /// Levels above 11 use progressively larger windows and deeper search
109    /// with the lazy2 hash-chain backend.  Levels that require strategies
110    /// this crate has not yet implemented (btopt, btultra) are approximated
111    /// with the closest available matcher.
112    ///
113    /// Semver note: this variant was added after the initial enum shape and
114    /// is a breaking API change for downstream crates that exhaustively
115    /// `match` on [`CompressionLevel`] without a wildcard arm.
116    Level(i32),
117}
118
119impl CompressionLevel {
120    /// The minimum supported numeric compression level (ultra-fast mode).
121    pub const MIN_LEVEL: i32 = -131072;
122    /// The maximum supported numeric compression level.
123    pub const MAX_LEVEL: i32 = 22;
124    /// The default numeric compression level (equivalent to [`Default`](Self::Default)).
125    pub const DEFAULT_LEVEL: i32 = 3;
126
127    /// Create a compression level from a numeric value.
128    ///
129    /// Returns named variants for canonical levels (`0`/`3`, `1`, `7`, `11`)
130    /// and [`Level`](Self::Level) for all other values.
131    ///
132    /// With the default matcher backend (`MatchGeneratorDriver`), values
133    /// outside [`MIN_LEVEL`](Self::MIN_LEVEL)..=[`MAX_LEVEL`](Self::MAX_LEVEL)
134    /// are silently clamped during built-in level parameter resolution.
135    pub const fn from_level(level: i32) -> Self {
136        match level {
137            0 | Self::DEFAULT_LEVEL => Self::Default,
138            1 => Self::Fastest,
139            7 => Self::Better,
140            11 => Self::Best,
141            _ => Self::Level(level),
142        }
143    }
144}
145
146/// Trait used by the encoder that users can use to extend the matching facilities with their own algorithm
147/// making their own tradeoffs between runtime, memory usage and compression ratio
148///
149/// This trait operates on buffers that represent the chunks of data the matching algorithm wants to work on.
150/// Each one of these buffers is referred to as a *space*. One or more of these buffers represent the window
151/// the decoder will need to decode the data again.
152///
153/// This library asks the Matcher for a new buffer using `get_next_space` to allow reusing of allocated buffers when they are no longer part of the
154/// window of data that is being used for matching.
155///
156/// The library fills the buffer with data that is to be compressed and commits them back to the matcher using `commit_space`.
157///
158/// Then it will either call `start_matching` or, if the space is deemed not worth compressing, `skip_matching` is called.
159///
160/// This is repeated until no more data is left to be compressed.
161pub trait Matcher {
162    /// Get a space where we can put data to be matched on. Will be encoded as one block. The maximum allowed size is 128 kB.
163    fn get_next_space(&mut self) -> alloc::vec::Vec<u8>;
164    /// Get a reference to the last committed space
165    fn get_last_space(&mut self) -> &[u8];
166    /// Commit a space to the matcher so it can be matched against
167    fn commit_space(&mut self, space: alloc::vec::Vec<u8>);
168    /// Just process the data in the last committed space for future matching.
169    fn skip_matching(&mut self);
170    /// Hint-aware skip path used internally to thread a precomputed block
171    /// incompressibility verdict to matcher backends.
172    ///
173    /// Default implementation preserves backwards compatibility for external
174    /// custom matchers by delegating to [`skip_matching`](Self::skip_matching).
175    fn skip_matching_with_hint(&mut self, _incompressible_hint: Option<bool>) {
176        self.skip_matching();
177    }
178    /// Process the data in the last committed space for future matching AND generate matches for the data
179    fn start_matching(&mut self, handle_sequence: impl for<'a> FnMut(Sequence<'a>));
180    /// Reset this matcher so it can be used for the next new frame
181    fn reset(&mut self, level: CompressionLevel);
182    /// Provide a hint about the total uncompressed size for the next frame.
183    ///
184    /// Implementations may use this to select smaller hash tables and windows
185    /// for small inputs, matching the C zstd source-size-class behavior.
186    /// Called before [`reset`](Self::reset) when the caller knows the input
187    /// size (e.g. from pledged content size or file metadata).
188    ///
189    /// The default implementation is a no-op for custom matchers and
190    /// test stubs. The built-in runtime matcher (`MatchGeneratorDriver`)
191    /// overrides this hook and applies the hint during level resolution.
192    fn set_source_size_hint(&mut self, _size: u64) {}
193    /// Prime matcher state with dictionary history before compressing the next frame.
194    /// Default implementation is a no-op for custom matchers that do not support this.
195    fn prime_with_dictionary(&mut self, _dict_content: &[u8], _offset_hist: [u32; 3]) {}
196    /// Returns whether this matcher can consume dictionary priming state and produce
197    /// dictionary-dependent sequences. Defaults to `false` for custom matchers.
198    fn supports_dictionary_priming(&self) -> bool {
199        false
200    }
201    /// The size of the window the decoder will need to execute all sequences produced by this matcher.
202    ///
203    /// Must return a positive (non-zero) value; returning 0 causes
204    /// [`StreamingEncoder`] to reject the first write with an invalid-input error
205    /// (`InvalidInput` with `std`, `Other` with `no_std`).
206    ///
207    /// Must remain stable for the lifetime of a frame.
208    /// It may change only after `reset()` is called for the next frame
209    /// (for example because the compression level changed).
210    fn window_size(&self) -> u64;
211}
212
213#[derive(PartialEq, Eq, Debug)]
214/// Sequences that a [`Matcher`] can produce
215pub enum Sequence<'data> {
216    /// Is encoded as a sequence for the decoder sequence execution.
217    ///
218    /// First the literals will be copied to the decoded data,
219    /// then `match_len` bytes are copied from `offset` bytes back in the decoded data
220    Triple {
221        literals: &'data [u8],
222        offset: usize,
223        match_len: usize,
224    },
225    /// This is returned as the last sequence in a block
226    ///
227    /// These literals will just be copied at the end of the sequence execution by the decoder
228    Literals { literals: &'data [u8] },
229}