Skip to main content

structured_zstd/encoding/
mod.rs

1//! Structures and utilities used for compressing/encoding data into the Zstd format.
2
3pub(crate) mod block_header;
4pub(crate) mod blocks;
5pub(crate) mod frame_header;
6pub(crate) mod match_generator;
7pub(crate) mod util;
8
9mod frame_compressor;
10mod levels;
11mod streaming_encoder;
12pub use frame_compressor::FrameCompressor;
13pub use match_generator::MatchGeneratorDriver;
14pub use streaming_encoder::StreamingEncoder;
15
16use crate::io::{Read, Write};
17use alloc::vec::Vec;
18
19/// Convenience function to compress some source into a target without reusing any resources of the compressor
20/// ```rust
21/// use structured_zstd::encoding::{compress, CompressionLevel};
22/// let data: &[u8] = &[0,0,0,0,0,0,0,0,0,0,0,0];
23/// let mut target = Vec::new();
24/// compress(data, &mut target, CompressionLevel::Fastest);
25/// ```
26pub fn compress<R: Read, W: Write>(source: R, target: W, level: CompressionLevel) {
27    let mut frame_enc = FrameCompressor::new(level);
28    frame_enc.set_source(source);
29    frame_enc.set_drain(target);
30    frame_enc.compress();
31}
32
33/// Convenience function to compress some source into a Vec without reusing any resources of the compressor
34/// ```rust
35/// use structured_zstd::encoding::{compress_to_vec, CompressionLevel};
36/// let data: &[u8] = &[0,0,0,0,0,0,0,0,0,0,0,0];
37/// let compressed = compress_to_vec(data, CompressionLevel::Fastest);
38/// ```
39pub fn compress_to_vec<R: Read>(source: R, level: CompressionLevel) -> Vec<u8> {
40    let mut vec = Vec::new();
41    compress(source, &mut vec, level);
42    vec
43}
44
/// The compression mode used impacts the speed of compression,
/// and resulting compression ratios. Faster compression will result
/// in worse compression ratios, and vice versa.
///
/// Levels are plain, field-less values: they can be copied freely, compared for
/// equality and printed with `{:?}`.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum CompressionLevel {
    /// This level does not compress the data at all, and simply wraps
    /// it in a Zstandard frame.
    Uncompressed,
    /// This level is roughly equivalent to Zstd compression level 1
    Fastest,
    /// This level uses the crate's dedicated `dfast`-style matcher to
    /// target a better speed/ratio tradeoff than [`CompressionLevel::Fastest`].
    ///
    /// It represents this crate's "default" compression setting and may
    /// evolve in future versions as the implementation moves closer to
    /// reference zstd level 3 behavior.
    Default,
    /// This level is roughly equivalent to Zstd level 7.
    ///
    /// Uses the hash-chain matcher with a lazy2 matching strategy: the encoder
    /// evaluates up to two positions ahead before committing to a match,
    /// trading speed for a better compression ratio than [`CompressionLevel::Default`].
    ///
    /// **Limitation:** hash-chain tables use 32-bit positions. For single-frame
    /// inputs exceeding ~4 GiB, matches can still be found for roughly one
    /// window past that point; once all in-window positions exceed `u32::MAX`
    /// (≈4 GiB + window size), matching becomes effectively repcode-only.
    /// Prefer [`CompressionLevel::Default`] for very large single-frame streams
    /// until table rebasing is implemented.
    Better,
    /// This level is roughly equivalent to Zstd level 11.
    ///
    /// Uses the hash-chain matcher with a deep lazy2 matching strategy and
    /// a 16 MiB window. Compared to [`CompressionLevel::Better`], this level
    /// uses larger hash and chain tables (2 M / 1 M entries vs 1 M / 512 K),
    /// a deeper search (32 candidates vs 16), and a higher target match
    /// length (128 vs 48), trading speed for the best compression ratio
    /// available in this crate.
    ///
    /// **Limitation:** hash-chain tables use 32-bit positions. For single-frame
    /// inputs exceeding ~4 GiB, matches can still be found for roughly one
    /// window past that point; once all in-window positions exceed `u32::MAX`
    /// (≈4 GiB + window size), matching becomes effectively repcode-only.
    /// Prefer [`CompressionLevel::Default`] for very large single-frame
    /// streams until table rebasing is implemented.
    Best,
}
92
93/// Trait used by the encoder that users can use to extend the matching facilities with their own algorithm
94/// making their own tradeoffs between runtime, memory usage and compression ratio
95///
96/// This trait operates on buffers that represent the chunks of data the matching algorithm wants to work on.
97/// Each one of these buffers is referred to as a *space*. One or more of these buffers represent the window
98/// the decoder will need to decode the data again.
99///
100/// This library asks the Matcher for a new buffer using `get_next_space` to allow reusing of allocated buffers when they are no longer part of the
101/// window of data that is being used for matching.
102///
103/// The library fills the buffer with data that is to be compressed and commits them back to the matcher using `commit_space`.
104///
105/// Then it will either call `start_matching` or, if the space is deemed not worth compressing, `skip_matching` is called.
106///
107/// This is repeated until no more data is left to be compressed.
108pub trait Matcher {
109    /// Get a space where we can put data to be matched on. Will be encoded as one block. The maximum allowed size is 128 kB.
110    fn get_next_space(&mut self) -> alloc::vec::Vec<u8>;
111    /// Get a reference to the last commited space
112    fn get_last_space(&mut self) -> &[u8];
113    /// Commit a space to the matcher so it can be matched against
114    fn commit_space(&mut self, space: alloc::vec::Vec<u8>);
115    /// Just process the data in the last commited space for future matching
116    fn skip_matching(&mut self);
117    /// Process the data in the last commited space for future matching AND generate matches for the data
118    fn start_matching(&mut self, handle_sequence: impl for<'a> FnMut(Sequence<'a>));
119    /// Reset this matcher so it can be used for the next new frame
120    fn reset(&mut self, level: CompressionLevel);
121    /// Prime matcher state with dictionary history before compressing the next frame.
122    /// Default implementation is a no-op for custom matchers that do not support this.
123    fn prime_with_dictionary(&mut self, _dict_content: &[u8], _offset_hist: [u32; 3]) {}
124    /// Returns whether this matcher can consume dictionary priming state and produce
125    /// dictionary-dependent sequences. Defaults to `false` for custom matchers.
126    fn supports_dictionary_priming(&self) -> bool {
127        false
128    }
129    /// The size of the window the decoder will need to execute all sequences produced by this matcher.
130    ///
131    /// Must return a positive (non-zero) value; returning 0 causes
132    /// [`StreamingEncoder`] to reject the first write with an invalid-input error
133    /// (`InvalidInput` with `std`, `Other` with `no_std`).
134    ///
135    /// Must remain stable for the lifetime of a frame.
136    /// It may change only after `reset()` is called for the next frame
137    /// (for example because the compression level changed).
138    fn window_size(&self) -> u64;
139}
140
/// Sequences that a [`Matcher`] can produce
#[derive(Debug, PartialEq, Eq)]
pub enum Sequence<'data> {
    /// Is encoded as a sequence for the decoder sequence execution.
    ///
    /// The decoder first copies `literals` to the decoded data, then copies
    /// `match_len` bytes from `offset` bytes back in the decoded data.
    Triple {
        /// Bytes copied to the decoded data before the match is applied.
        literals: &'data [u8],
        /// How many bytes back in the decoded data the match is copied from.
        offset: usize,
        /// Number of bytes copied from the match.
        match_len: usize,
    },
    /// Returned as the last sequence in a block.
    ///
    /// The decoder just copies these literals at the end of the sequence execution.
    Literals {
        /// Bytes copied to the decoded data.
        literals: &'data [u8],
    },
}