Skip to main content

structured_zstd/decoding/
dictionary.rs

1#[cfg(not(target_has_atomic = "ptr"))]
2use alloc::rc::Rc;
3#[cfg(target_has_atomic = "ptr")]
4use alloc::sync::Arc;
5use alloc::vec::Vec;
6use core::convert::TryInto;
7
8use crate::decoding::errors::DictionaryDecodeError;
9use crate::decoding::scratch::FSEScratch;
10use crate::decoding::scratch::HuffmanScratch;
11
/// A parsed Zstandard dictionary, ready to be attached to a decoder.
///
/// A dictionary carries an id, optional pre-built entropy tables, a block of
/// content bytes that acts as history in front of the frame data, and the
/// initial repeat offsets. Zstandard also supports "raw content"
/// dictionaries, which consist of content bytes only; those are built with
/// [`Dictionary::from_raw_content`], while formatted dictionaries are parsed
/// with [`Dictionary::decode_dict`].
///
/// <https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary-format>
#[derive(Clone)]
pub struct Dictionary {
    /// A 4 byte value used by decoders to check that they are using
    /// the correct dictionary. This value must not be zero; both
    /// constructors reject a zero id.
    pub id: u32,
    /// FSE state populated from the dictionary's entropy section: the
    /// offsets, match-lengths and literal-lengths tables. Left in its
    /// `FSEScratch::new()` state for raw-content dictionaries.
    pub fse: FSEScratch,
    /// Huffman state populated from the dictionary's entropy section.
    /// Left in its `HuffmanScratch::new()` state for raw-content
    /// dictionaries.
    pub huf: HuffmanScratch,
    /// The content of a dictionary acts as a "past" in front of data
    /// to compress or decompress,
    /// so it can be referenced in sequence commands.
    /// As long as the amount of data decoded from this frame is less than or
    /// equal to Window_Size, sequence commands may specify offsets longer than
    /// the total length of decoded output so far to reference back to the
    /// dictionary, even parts of the dictionary with offsets larger than Window_Size.
    /// After the total output has surpassed Window_Size, however,
    /// this is no longer allowed and the dictionary is no longer accessible.
    pub dict_content: Vec<u8>,
    /// The 3 most recent offsets are stored so that they can be used
    /// during sequence execution, see
    /// <https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#repeat-offsets>
    /// for more. Defaults to `[1, 4, 8]` when the dictionary does not
    /// supply its own values.
    pub offset_hist: [u32; 3],
}
43
// Shared-ownership pointer that backs `DictionaryHandle`. Exactly one of the
// two aliases is compiled in: `Arc` when the target has pointer-width atomics
// (`target_has_atomic = "ptr"`), `Rc` otherwise.
#[cfg(target_has_atomic = "ptr")]
type SharedDictionary = Arc<Dictionary>;
#[cfg(not(target_has_atomic = "ptr"))]
type SharedDictionary = Rc<Dictionary>;
48
/// Shared pre-parsed dictionary handle for repeated decoding.
///
/// The handle owns a reference-counted pointer to a [`Dictionary`], so
/// cloning it shares the same parsed dictionary instead of copying it.
///
/// Uses `Arc` on targets with atomics and falls back to `Rc` otherwise.
#[derive(Clone)]
pub struct DictionaryHandle {
    // Reference-counted pointer to the parsed dictionary.
    inner: SharedDictionary,
}
56
/// The 4-byte magic number that marks the start of a formatted dictionary.
///
/// These are the bytes as they appear in the serialized dictionary, i.e. the
/// little-endian encoding of `0xEC30A437`.
pub const MAGIC_NUM: [u8; 4] = [0x37, 0xA4, 0x30, 0xEC];
59
60impl Dictionary {
61    /// Heap bytes owned by this dictionary: the content plus the parsed
62    /// entropy tables' heap (the fixed-size FSE decode arrays are inline,
63    /// counted by `size_of::<Dictionary>()`).
64    pub fn heap_bytes(&self) -> usize {
65        self.dict_content.capacity() + self.fse.heap_bytes() + self.huf.heap_bytes()
66    }
67
68    /// Build a dictionary from raw content bytes (without entropy table sections).
69    ///
70    /// This is primarily intended for dictionaries produced by the `dict_builder`
71    /// module, which currently emits raw-content dictionaries.
72    pub fn from_raw_content(
73        id: u32,
74        dict_content: Vec<u8>,
75    ) -> Result<Dictionary, DictionaryDecodeError> {
76        if id == 0 {
77            return Err(DictionaryDecodeError::ZeroDictionaryId);
78        }
79        if dict_content.is_empty() {
80            return Err(DictionaryDecodeError::DictionaryTooSmall { got: 0, need: 1 });
81        }
82
83        Ok(Dictionary {
84            id,
85            fse: FSEScratch::new(),
86            huf: HuffmanScratch::new(),
87            dict_content,
88            offset_hist: [1, 4, 8],
89        })
90    }
91
92    /// Parses the dictionary from `raw`, initializes its tables,
93    /// and returns a fully constructed [`Dictionary`] whose `id` can be
94    /// checked against the frame's `dict_id`.
95    pub fn decode_dict(raw: &[u8]) -> Result<Dictionary, DictionaryDecodeError> {
96        Self::decode_dict_inner(raw, true)
97    }
98
99    /// Parse a dictionary for ENCODER use: builds the entropy
100    /// probabilities/weights needed by `to_encoder_table` but skips the
101    /// decode-only work the encoder never reads — the FSE *decoding*
102    /// tables + their `enrich_*` post-passes, and the HUF decode lookup
103    /// table (`packed_decode`). Produces a [`Dictionary`] whose FSE
104    /// `symbol_probabilities` / `accuracy_log` and HUF `bits` /
105    /// `max_num_bits` match `decode_dict` exactly, so the encoder entropy
106    /// tables — and thus the emitted frame — are byte-identical; only the
107    /// wasted decode-table builds are dropped. Offset history + content
108    /// are parsed the same way.
109    /// Crate-internal: the returned [`Dictionary`] deliberately has no
110    /// decode lookup tables (`packed_decode` / FSE `decode`), so it is
111    /// NOT safe to feed into a [`FrameDecoder`](crate::decoding::FrameDecoder)
112    /// — Huffman decode would index an empty `packed_decode`. The only caller
113    /// is `EncoderDictionary::from_bytes`, which wraps the result in the
114    /// encoder-only `EncoderDictionary` type (no decode path), so this
115    /// incomplete dictionary can never escape to the decode side. Keeping
116    /// this `pub(crate)` keeps it off the public `Dictionary` API entirely.
117    pub(crate) fn decode_dict_for_encoding(
118        raw: &[u8],
119    ) -> Result<Dictionary, DictionaryDecodeError> {
120        Self::decode_dict_inner(raw, false)
121    }
122
123    /// Shared dictionary parser. `build_decode_tables` selects whether the
124    /// FSE/HUF tables get their full decoding tables (FSE decode table +
125    /// `enrich_*`, HUF `packed_decode`; decoder path) or only the
126    /// probability/weight parse (encoder path — see
127    /// [`Self::decode_dict_for_encoding`]).
128    fn decode_dict_inner(
129        raw: &[u8],
130        build_decode_tables: bool,
131    ) -> Result<Dictionary, DictionaryDecodeError> {
132        const MIN_MAGIC_AND_ID_LEN: usize = 8;
133        const OFFSET_HISTORY_LEN: usize = 12;
134
135        if raw.len() < MIN_MAGIC_AND_ID_LEN {
136            return Err(DictionaryDecodeError::DictionaryTooSmall {
137                got: raw.len(),
138                need: MIN_MAGIC_AND_ID_LEN,
139            });
140        }
141
142        let mut new_dict = Dictionary {
143            id: 0,
144            fse: FSEScratch::new(),
145            huf: HuffmanScratch::new(),
146            dict_content: Vec::new(),
147            offset_hist: [1, 4, 8],
148        };
149
150        let magic_num: [u8; 4] = raw[..4].try_into().expect("optimized away");
151        if magic_num != MAGIC_NUM {
152            return Err(DictionaryDecodeError::BadMagicNum { got: magic_num });
153        }
154
155        let dict_id = raw[4..8].try_into().expect("optimized away");
156        let dict_id = u32::from_le_bytes(dict_id);
157        if dict_id == 0 {
158            return Err(DictionaryDecodeError::ZeroDictionaryId);
159        }
160        new_dict.id = dict_id;
161
162        let raw_tables = &raw[8..];
163
164        let huf_size = if build_decode_tables {
165            new_dict.huf.table.build_decoder(raw_tables)?
166        } else {
167            new_dict.huf.table.build_weights_only(raw_tables)?
168        };
169        let raw_tables = &raw_tables[huf_size as usize..];
170
171        let of_size = if build_decode_tables {
172            let n = new_dict.fse.offsets.build_decoder(
173                raw_tables,
174                crate::decoding::sequence_section_decoder::OF_MAX_LOG,
175            )?;
176            new_dict.fse.offsets.enrich_for_offsets();
177            // Compute the pipeline-gate long-offset share ONCE here, while the
178            // dictionary handle is built, so the per-decode `init_from_dict`
179            // path can COPY it instead of re-walking the offsets table on every
180            // `decode_*_with_dict_handle` call (the dict is immutable, so the
181            // share never changes after this).
182            new_dict.fse.offsets_long_share =
183                crate::decoding::sequence_section_decoder::compute_offsets_long_share(
184                    &new_dict.fse.offsets,
185                );
186            n
187        } else {
188            new_dict.fse.offsets.read_table_probabilities(
189                raw_tables,
190                crate::decoding::sequence_section_decoder::OF_MAX_LOG,
191            )?
192        };
193        let raw_tables = &raw_tables[of_size..];
194
195        let ml_size = if build_decode_tables {
196            let n = new_dict.fse.match_lengths.build_decoder(
197                raw_tables,
198                crate::decoding::sequence_section_decoder::ML_MAX_LOG,
199            )?;
200            new_dict
201                .fse
202                .match_lengths
203                .enrich_with_packed_seq_meta(&crate::decoding::sequence_section_decoder::ML_META);
204            n
205        } else {
206            new_dict.fse.match_lengths.read_table_probabilities(
207                raw_tables,
208                crate::decoding::sequence_section_decoder::ML_MAX_LOG,
209            )?
210        };
211        let raw_tables = &raw_tables[ml_size..];
212
213        let ll_size = if build_decode_tables {
214            let n = new_dict.fse.literal_lengths.build_decoder(
215                raw_tables,
216                crate::decoding::sequence_section_decoder::LL_MAX_LOG,
217            )?;
218            new_dict
219                .fse
220                .literal_lengths
221                .enrich_with_packed_seq_meta(&crate::decoding::sequence_section_decoder::LL_META);
222            n
223        } else {
224            new_dict.fse.literal_lengths.read_table_probabilities(
225                raw_tables,
226                crate::decoding::sequence_section_decoder::LL_MAX_LOG,
227            )?
228        };
229        let raw_tables = &raw_tables[ll_size..];
230
231        if raw_tables.len() < OFFSET_HISTORY_LEN {
232            return Err(DictionaryDecodeError::DictionaryTooSmall {
233                got: raw_tables.len(),
234                need: OFFSET_HISTORY_LEN,
235            });
236        }
237
238        let offset1 = raw_tables[0..4].try_into().expect("optimized away");
239        let offset1 = u32::from_le_bytes(offset1);
240
241        let offset2 = raw_tables[4..8].try_into().expect("optimized away");
242        let offset2 = u32::from_le_bytes(offset2);
243
244        let offset3 = raw_tables[8..12].try_into().expect("optimized away");
245        let offset3 = u32::from_le_bytes(offset3);
246
247        if offset1 == 0 {
248            return Err(DictionaryDecodeError::ZeroRepeatOffsetInDictionary { index: 0 });
249        }
250        if offset2 == 0 {
251            return Err(DictionaryDecodeError::ZeroRepeatOffsetInDictionary { index: 1 });
252        }
253        if offset3 == 0 {
254            return Err(DictionaryDecodeError::ZeroRepeatOffsetInDictionary { index: 2 });
255        }
256
257        new_dict.offset_hist[0] = offset1;
258        new_dict.offset_hist[1] = offset2;
259        new_dict.offset_hist[2] = offset3;
260
261        let raw_content = &raw_tables[12..];
262        new_dict.dict_content.extend(raw_content);
263
264        Ok(new_dict)
265    }
266
267    /// Convert this parsed dictionary into a reusable shared handle.
268    pub fn into_handle(self) -> DictionaryHandle {
269        DictionaryHandle::from_dictionary(self)
270    }
271}
272
273impl DictionaryHandle {
274    /// Wrap an already-parsed dictionary in a shared handle.
275    pub fn from_dictionary(dict: Dictionary) -> Self {
276        Self {
277            inner: SharedDictionary::new(dict),
278        }
279    }
280
281    /// Parse a serialized dictionary and return a reusable shared handle.
282    pub fn decode_dict(raw: &[u8]) -> Result<Self, DictionaryDecodeError> {
283        Dictionary::decode_dict(raw).map(Self::from_dictionary)
284    }
285
286    pub fn id(&self) -> u32 {
287        self.inner.id
288    }
289
290    pub fn as_dict(&self) -> &Dictionary {
291        &self.inner
292    }
293}
294
295impl AsRef<Dictionary> for DictionaryHandle {
296    fn as_ref(&self) -> &Dictionary {
297        self.as_dict()
298    }
299}
300
301impl From<Dictionary> for DictionaryHandle {
302    fn from(dict: Dictionary) -> Self {
303        DictionaryHandle::from_dictionary(dict)
304    }
305}
306
307#[cfg(test)]
308mod tests {
309    use super::*;
310    use alloc::vec;
311
312    fn offset_history_start(raw: &[u8]) -> usize {
313        let mut huf = crate::decoding::scratch::HuffmanScratch::new();
314        let mut fse = crate::decoding::scratch::FSEScratch::new();
315        let mut cursor = 8usize;
316
317        let huf_size = huf
318            .table
319            .build_decoder(&raw[cursor..])
320            .expect("reference dictionary huffman table should decode");
321        cursor += huf_size as usize;
322
323        let of_size = fse
324            .offsets
325            .build_decoder(
326                &raw[cursor..],
327                crate::decoding::sequence_section_decoder::OF_MAX_LOG,
328            )
329            .expect("reference dictionary OF table should decode");
330        cursor += of_size;
331
332        let ml_size = fse
333            .match_lengths
334            .build_decoder(
335                &raw[cursor..],
336                crate::decoding::sequence_section_decoder::ML_MAX_LOG,
337            )
338            .expect("reference dictionary ML table should decode");
339        cursor += ml_size;
340
341        let ll_size = fse
342            .literal_lengths
343            .build_decoder(
344                &raw[cursor..],
345                crate::decoding::sequence_section_decoder::LL_MAX_LOG,
346            )
347            .expect("reference dictionary LL table should decode");
348        cursor += ll_size;
349
350        cursor
351    }
352
353    #[test]
354    fn decode_dict_rejects_short_buffer_before_magic_and_id() {
355        let err = match Dictionary::decode_dict(&[]) {
356            Ok(_) => panic!("expected short dictionary to fail"),
357            Err(err) => err,
358        };
359        assert!(matches!(
360            err,
361            DictionaryDecodeError::DictionaryTooSmall { got: 0, need: 8 }
362        ));
363    }
364
365    #[test]
366    fn decode_dict_malformed_input_returns_error_instead_of_panicking() {
367        let mut raw = Vec::new();
368        raw.extend_from_slice(&MAGIC_NUM);
369        raw.extend_from_slice(&1u32.to_le_bytes());
370        raw.extend_from_slice(&[0u8; 7]);
371
372        let result = std::panic::catch_unwind(|| Dictionary::decode_dict(&raw));
373        assert!(
374            result.is_ok(),
375            "decode_dict must not panic on malformed input"
376        );
377        assert!(
378            result.unwrap().is_err(),
379            "malformed dictionary must return error"
380        );
381    }
382
383    #[test]
384    fn decode_dict_rejects_zero_repeat_offsets() {
385        let mut raw = include_bytes!("../../dict_tests/dictionary").to_vec();
386        let offset_start = offset_history_start(&raw);
387
388        // Corrupt rep0 to zero.
389        raw[offset_start..offset_start + 4].copy_from_slice(&0u32.to_le_bytes());
390        let decoded = Dictionary::decode_dict(&raw);
391        assert!(matches!(
392            decoded,
393            Err(DictionaryDecodeError::ZeroRepeatOffsetInDictionary { index: 0 })
394        ));
395    }
396
397    #[test]
398    fn from_raw_content_rejects_empty_dictionary_content() {
399        let result = Dictionary::from_raw_content(1, Vec::new());
400        assert!(matches!(
401            result,
402            Err(DictionaryDecodeError::DictionaryTooSmall { got: 0, need: 1 })
403        ));
404    }
405
406    #[test]
407    fn dictionary_handle_from_raw_content_supports_as_ref() {
408        let dict = Dictionary::from_raw_content(7, vec![42]).expect("raw dict should build");
409        let handle = dict.into_handle();
410        let dict_ref: &Dictionary = handle.as_ref();
411
412        assert_eq!(dict_ref.id, 7);
413        assert_eq!(dict_ref.dict_content.as_slice(), &[42]);
414    }
415
416    #[test]
417    fn dictionary_handle_clones_share_inner() {
418        let raw = include_bytes!("../../dict_tests/dictionary");
419        let handle = DictionaryHandle::decode_dict(raw).expect("dictionary should parse");
420        let clone = handle.clone();
421
422        assert_eq!(handle.id(), clone.id());
423        assert!(SharedDictionary::ptr_eq(&handle.inner, &clone.inner));
424    }
425}