// structured_zstd/encoding/mod.rs
1//! Structures and utilities used for compressing/encoding data into the Zstd format.
2
3pub(crate) mod block_header;
4pub(crate) mod blocks;
5pub(crate) mod frame_header;
6pub(crate) mod match_generator;
7pub(crate) mod util;
8
9mod frame_compressor;
10mod levels;
11mod streaming_encoder;
12pub use frame_compressor::FrameCompressor;
13pub use match_generator::MatchGeneratorDriver;
14pub use streaming_encoder::StreamingEncoder;
15
16use crate::io::{Read, Write};
17use alloc::vec::Vec;
18
19/// Convenience function to compress some source into a target without reusing any resources of the compressor
20/// ```rust
21/// use structured_zstd::encoding::{compress, CompressionLevel};
22/// let data: &[u8] = &[0,0,0,0,0,0,0,0,0,0,0,0];
23/// let mut target = Vec::new();
24/// compress(data, &mut target, CompressionLevel::Fastest);
25/// ```
26pub fn compress<R: Read, W: Write>(source: R, target: W, level: CompressionLevel) {
27 let mut frame_enc = FrameCompressor::new(level);
28 frame_enc.set_source(source);
29 frame_enc.set_drain(target);
30 frame_enc.compress();
31}
32
33/// Convenience function to compress some source into a Vec without reusing any resources of the compressor.
34///
35/// This helper eagerly buffers the full input (`Read`) before compression so it
36/// can provide a source-size hint to the one-shot encoder path. Peak memory can
37/// therefore be roughly `input_size + output_size`. For very large payloads or
38/// tighter memory budgets, prefer streaming APIs such as [`StreamingEncoder`].
39/// ```rust
40/// use structured_zstd::encoding::{compress_to_vec, CompressionLevel};
41/// let data: &[u8] = &[0,0,0,0,0,0,0,0,0,0,0,0];
42/// let compressed = compress_to_vec(data, CompressionLevel::Fastest);
43/// ```
44pub fn compress_to_vec<R: Read>(source: R, level: CompressionLevel) -> Vec<u8> {
45 let mut source = source;
46 let mut input = Vec::new();
47 source.read_to_end(&mut input).unwrap();
48
49 let mut vec = Vec::new();
50 let mut frame_enc = FrameCompressor::new(level);
51 frame_enc.set_source_size_hint(input.len() as u64);
52 frame_enc.set_source(input.as_slice());
53 frame_enc.set_drain(&mut vec);
54 frame_enc.compress();
55 vec
56}
57
/// The compression mode used impacts the speed of compression,
/// and resulting compression ratios. Faster compression will result
/// in worse compression ratios, and vice versa.
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
pub enum CompressionLevel {
    /// This level does not compress the data at all, and simply wraps
    /// it in a Zstandard frame.
    Uncompressed,
    /// This level is roughly equivalent to Zstd compression level 1
    Fastest,
    /// This level uses the crate's dedicated `dfast`-style matcher to
    /// target a better speed/ratio tradeoff than [`CompressionLevel::Fastest`].
    ///
    /// It represents this crate's "default" compression setting and may
    /// evolve in future versions as the implementation moves closer to
    /// reference zstd level 3 behavior.
    Default,
    /// This level is roughly equivalent to Zstd level 7.
    ///
    /// Uses the hash-chain matcher with a lazy2 matching strategy: the encoder
    /// evaluates up to two positions ahead before committing to a match,
    /// trading speed for a better compression ratio than [`CompressionLevel::Default`].
    Better,
    /// This level is roughly equivalent to Zstd level 11.
    ///
    /// Uses the hash-chain matcher with a deep lazy2 matching strategy and
    /// a 16 MiB window. Compared to [`CompressionLevel::Better`], this level
    /// uses larger hash and chain tables (2 M / 1 M entries vs 1 M / 512 K),
    /// a deeper search (32 candidates vs 16), and a higher target match
    /// length (128 vs 48), trading speed for the best compression ratio
    /// available in this crate.
    Best,
    /// Numeric compression level.
    ///
    /// Levels 1–22 correspond to the C zstd level numbering. Higher values
    /// produce smaller output at the cost of more CPU time. Negative values
    /// select ultra-fast modes that trade ratio for speed. Level 0 is
    /// treated as [`DEFAULT_LEVEL`](Self::DEFAULT_LEVEL), matching C zstd
    /// semantics.
    ///
    /// Named variants map to specific numeric levels:
    /// [`Fastest`](Self::Fastest) = 1, [`Default`](Self::Default) = 3,
    /// [`Better`](Self::Better) = 7, [`Best`](Self::Best) = 11.
    /// [`Best`](Self::Best) remains the highest-ratio named preset, but
    /// [`Level`](Self::Level) values above 11 can target stronger (slower)
    /// tuning than the named hierarchy.
    ///
    /// Levels above 11 use progressively larger windows and deeper search
    /// with the lazy2 hash-chain backend. Levels that require strategies
    /// this crate has not yet implemented (btopt, btultra) are approximated
    /// with the closest available matcher.
    ///
    /// Semver note: this variant was added after the initial enum shape and
    /// is a breaking API change for downstream crates that exhaustively
    /// `match` on [`CompressionLevel`] without a wildcard arm.
    Level(i32),
}
115
116impl CompressionLevel {
117 /// The minimum supported numeric compression level (ultra-fast mode).
118 pub const MIN_LEVEL: i32 = -131072;
119 /// The maximum supported numeric compression level.
120 pub const MAX_LEVEL: i32 = 22;
121 /// The default numeric compression level (equivalent to [`Default`](Self::Default)).
122 pub const DEFAULT_LEVEL: i32 = 3;
123
124 /// Create a compression level from a numeric value.
125 ///
126 /// Returns named variants for canonical levels (`0`/`3`, `1`, `7`, `11`)
127 /// and [`Level`](Self::Level) for all other values.
128 ///
129 /// With the default matcher backend (`MatchGeneratorDriver`), values
130 /// outside [`MIN_LEVEL`](Self::MIN_LEVEL)..=[`MAX_LEVEL`](Self::MAX_LEVEL)
131 /// are silently clamped during built-in level parameter resolution.
132 pub const fn from_level(level: i32) -> Self {
133 match level {
134 0 | Self::DEFAULT_LEVEL => Self::Default,
135 1 => Self::Fastest,
136 7 => Self::Better,
137 11 => Self::Best,
138 _ => Self::Level(level),
139 }
140 }
141}
142
143/// Trait used by the encoder that users can use to extend the matching facilities with their own algorithm
144/// making their own tradeoffs between runtime, memory usage and compression ratio
145///
146/// This trait operates on buffers that represent the chunks of data the matching algorithm wants to work on.
147/// Each one of these buffers is referred to as a *space*. One or more of these buffers represent the window
148/// the decoder will need to decode the data again.
149///
150/// This library asks the Matcher for a new buffer using `get_next_space` to allow reusing of allocated buffers when they are no longer part of the
151/// window of data that is being used for matching.
152///
153/// The library fills the buffer with data that is to be compressed and commits them back to the matcher using `commit_space`.
154///
155/// Then it will either call `start_matching` or, if the space is deemed not worth compressing, `skip_matching` is called.
156///
157/// This is repeated until no more data is left to be compressed.
158pub trait Matcher {
159 /// Get a space where we can put data to be matched on. Will be encoded as one block. The maximum allowed size is 128 kB.
160 fn get_next_space(&mut self) -> alloc::vec::Vec<u8>;
161 /// Get a reference to the last commited space
162 fn get_last_space(&mut self) -> &[u8];
163 /// Commit a space to the matcher so it can be matched against
164 fn commit_space(&mut self, space: alloc::vec::Vec<u8>);
165 /// Just process the data in the last commited space for future matching
166 fn skip_matching(&mut self);
167 /// Process the data in the last commited space for future matching AND generate matches for the data
168 fn start_matching(&mut self, handle_sequence: impl for<'a> FnMut(Sequence<'a>));
169 /// Reset this matcher so it can be used for the next new frame
170 fn reset(&mut self, level: CompressionLevel);
171 /// Provide a hint about the total uncompressed size for the next frame.
172 ///
173 /// Implementations may use this to select smaller hash tables and windows
174 /// for small inputs, matching the C zstd source-size-class behavior.
175 /// Called before [`reset`](Self::reset) when the caller knows the input
176 /// size (e.g. from pledged content size or file metadata).
177 ///
178 /// The default implementation is a no-op for custom matchers and
179 /// test stubs. The built-in runtime matcher (`MatchGeneratorDriver`)
180 /// overrides this hook and applies the hint during level resolution.
181 fn set_source_size_hint(&mut self, _size: u64) {}
182 /// Prime matcher state with dictionary history before compressing the next frame.
183 /// Default implementation is a no-op for custom matchers that do not support this.
184 fn prime_with_dictionary(&mut self, _dict_content: &[u8], _offset_hist: [u32; 3]) {}
185 /// Returns whether this matcher can consume dictionary priming state and produce
186 /// dictionary-dependent sequences. Defaults to `false` for custom matchers.
187 fn supports_dictionary_priming(&self) -> bool {
188 false
189 }
190 /// The size of the window the decoder will need to execute all sequences produced by this matcher.
191 ///
192 /// Must return a positive (non-zero) value; returning 0 causes
193 /// [`StreamingEncoder`] to reject the first write with an invalid-input error
194 /// (`InvalidInput` with `std`, `Other` with `no_std`).
195 ///
196 /// Must remain stable for the lifetime of a frame.
197 /// It may change only after `reset()` is called for the next frame
198 /// (for example because the compression level changed).
199 fn window_size(&self) -> u64;
200}
201
/// Sequences that a [`Matcher`] can produce
#[derive(Copy, Clone, PartialEq, Eq, Debug)]
pub enum Sequence<'data> {
    /// Is encoded as a sequence for the decoder sequence execution.
    ///
    /// First the literals will be copied to the decoded data,
    /// then `match_len` bytes are copied from `offset` bytes back in the decoded data
    Triple {
        literals: &'data [u8],
        offset: usize,
        match_len: usize,
    },
    /// This is returned as the last sequence in a block
    ///
    /// These literals will just be copied at the end of the sequence execution by the decoder
    Literals { literals: &'data [u8] },
}