Skip to main content

structured_zstd/decoding/
dictionary.rs

1#[cfg(not(target_has_atomic = "ptr"))]
2use alloc::rc::Rc;
3#[cfg(target_has_atomic = "ptr")]
4use alloc::sync::Arc;
5use alloc::vec::Vec;
6use core::convert::TryInto;
7
8use crate::decoding::errors::DictionaryDecodeError;
9use crate::decoding::scratch::FSEScratch;
10use crate::decoding::scratch::HuffmanScratch;
11
12/// Zstandard includes support for "raw content" dictionaries, that store bytes optionally used
13/// during sequence execution.
14///
15/// <https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary-format>
16pub struct Dictionary {
17    /// A 4 byte value used by decoders to check if they can use
18    /// the correct dictionary. This value must not be zero.
19    pub id: u32,
20    /// A dictionary can contain an entropy table, either FSE or
21    /// Huffman.
22    pub fse: FSEScratch,
23    /// A dictionary can contain an entropy table, either FSE or
24    /// Huffman.
25    pub huf: HuffmanScratch,
26    /// The content of a dictionary acts as a "past" in front of data
27    /// to compress or decompress,
28    /// so it can be referenced in sequence commands.
29    /// As long as the amount of data decoded from this frame is less than or
30    /// equal to Window_Size, sequence commands may specify offsets longer than
31    /// the total length of decoded output so far to reference back to the
32    /// dictionary, even parts of the dictionary with offsets larger than Window_Size.
33    /// After the total output has surpassed Window_Size however,
34    /// this is no longer allowed and the dictionary is no longer accessible
35    pub dict_content: Vec<u8>,
36    /// The 3 most recent offsets are stored so that they can be used
37    /// during sequence execution, see
38    /// <https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#repeat-offsets>
39    /// for more.
40    pub offset_hist: [u32; 3],
41}
42
43#[cfg(target_has_atomic = "ptr")]
44type SharedDictionary = Arc<Dictionary>;
45#[cfg(not(target_has_atomic = "ptr"))]
46type SharedDictionary = Rc<Dictionary>;
47
48/// Shared pre-parsed dictionary handle for repeated decoding.
49///
50/// Uses `Arc` on targets with atomics and falls back to `Rc` otherwise.
51#[derive(Clone)]
52pub struct DictionaryHandle {
53    inner: SharedDictionary,
54}
55
/// Magic number found at the start of a serialized dictionary: the 32-bit
/// value `0xEC30A437`, laid out as 4 little endian bytes.
pub const MAGIC_NUM: [u8; 4] = 0xEC30_A437_u32.to_le_bytes();
58
59impl Dictionary {
60    /// Build a dictionary from raw content bytes (without entropy table sections).
61    ///
62    /// This is primarily intended for dictionaries produced by the `dict_builder`
63    /// module, which currently emits raw-content dictionaries.
64    pub fn from_raw_content(
65        id: u32,
66        dict_content: Vec<u8>,
67    ) -> Result<Dictionary, DictionaryDecodeError> {
68        if id == 0 {
69            return Err(DictionaryDecodeError::ZeroDictionaryId);
70        }
71        if dict_content.is_empty() {
72            return Err(DictionaryDecodeError::DictionaryTooSmall { got: 0, need: 1 });
73        }
74
75        Ok(Dictionary {
76            id,
77            fse: FSEScratch::new(),
78            huf: HuffmanScratch::new(),
79            dict_content,
80            offset_hist: [1, 4, 8],
81        })
82    }
83
84    /// Parses the dictionary from `raw`, initializes its tables,
85    /// and returns a fully constructed [`Dictionary`] whose `id` can be
86    /// checked against the frame's `dict_id`.
87    pub fn decode_dict(raw: &[u8]) -> Result<Dictionary, DictionaryDecodeError> {
88        const MIN_MAGIC_AND_ID_LEN: usize = 8;
89        const OFFSET_HISTORY_LEN: usize = 12;
90
91        if raw.len() < MIN_MAGIC_AND_ID_LEN {
92            return Err(DictionaryDecodeError::DictionaryTooSmall {
93                got: raw.len(),
94                need: MIN_MAGIC_AND_ID_LEN,
95            });
96        }
97
98        let mut new_dict = Dictionary {
99            id: 0,
100            fse: FSEScratch::new(),
101            huf: HuffmanScratch::new(),
102            dict_content: Vec::new(),
103            offset_hist: [1, 4, 8],
104        };
105
106        let magic_num: [u8; 4] = raw[..4].try_into().expect("optimized away");
107        if magic_num != MAGIC_NUM {
108            return Err(DictionaryDecodeError::BadMagicNum { got: magic_num });
109        }
110
111        let dict_id = raw[4..8].try_into().expect("optimized away");
112        let dict_id = u32::from_le_bytes(dict_id);
113        if dict_id == 0 {
114            return Err(DictionaryDecodeError::ZeroDictionaryId);
115        }
116        new_dict.id = dict_id;
117
118        let raw_tables = &raw[8..];
119
120        let huf_size = new_dict.huf.table.build_decoder(raw_tables)?;
121        let raw_tables = &raw_tables[huf_size as usize..];
122
123        let of_size = new_dict.fse.offsets.build_decoder(
124            raw_tables,
125            crate::decoding::sequence_section_decoder::OF_MAX_LOG,
126        )?;
127        new_dict.fse.offsets.enrich_for_offsets();
128        let raw_tables = &raw_tables[of_size..];
129
130        let ml_size = new_dict.fse.match_lengths.build_decoder(
131            raw_tables,
132            crate::decoding::sequence_section_decoder::ML_MAX_LOG,
133        )?;
134        new_dict
135            .fse
136            .match_lengths
137            .enrich_with_packed_seq_meta(&crate::decoding::sequence_section_decoder::ML_META);
138        let raw_tables = &raw_tables[ml_size..];
139
140        let ll_size = new_dict.fse.literal_lengths.build_decoder(
141            raw_tables,
142            crate::decoding::sequence_section_decoder::LL_MAX_LOG,
143        )?;
144        new_dict
145            .fse
146            .literal_lengths
147            .enrich_with_packed_seq_meta(&crate::decoding::sequence_section_decoder::LL_META);
148        let raw_tables = &raw_tables[ll_size..];
149
150        if raw_tables.len() < OFFSET_HISTORY_LEN {
151            return Err(DictionaryDecodeError::DictionaryTooSmall {
152                got: raw_tables.len(),
153                need: OFFSET_HISTORY_LEN,
154            });
155        }
156
157        let offset1 = raw_tables[0..4].try_into().expect("optimized away");
158        let offset1 = u32::from_le_bytes(offset1);
159
160        let offset2 = raw_tables[4..8].try_into().expect("optimized away");
161        let offset2 = u32::from_le_bytes(offset2);
162
163        let offset3 = raw_tables[8..12].try_into().expect("optimized away");
164        let offset3 = u32::from_le_bytes(offset3);
165
166        if offset1 == 0 {
167            return Err(DictionaryDecodeError::ZeroRepeatOffsetInDictionary { index: 0 });
168        }
169        if offset2 == 0 {
170            return Err(DictionaryDecodeError::ZeroRepeatOffsetInDictionary { index: 1 });
171        }
172        if offset3 == 0 {
173            return Err(DictionaryDecodeError::ZeroRepeatOffsetInDictionary { index: 2 });
174        }
175
176        new_dict.offset_hist[0] = offset1;
177        new_dict.offset_hist[1] = offset2;
178        new_dict.offset_hist[2] = offset3;
179
180        let raw_content = &raw_tables[12..];
181        new_dict.dict_content.extend(raw_content);
182
183        Ok(new_dict)
184    }
185
186    /// Convert this parsed dictionary into a reusable shared handle.
187    pub fn into_handle(self) -> DictionaryHandle {
188        DictionaryHandle::from_dictionary(self)
189    }
190}
191
192impl DictionaryHandle {
193    /// Wrap an already-parsed dictionary in a shared handle.
194    pub fn from_dictionary(dict: Dictionary) -> Self {
195        Self {
196            inner: SharedDictionary::new(dict),
197        }
198    }
199
200    /// Parse a serialized dictionary and return a reusable shared handle.
201    pub fn decode_dict(raw: &[u8]) -> Result<Self, DictionaryDecodeError> {
202        Dictionary::decode_dict(raw).map(Self::from_dictionary)
203    }
204
205    pub fn id(&self) -> u32 {
206        self.inner.id
207    }
208
209    pub fn as_dict(&self) -> &Dictionary {
210        &self.inner
211    }
212}
213
214impl AsRef<Dictionary> for DictionaryHandle {
215    fn as_ref(&self) -> &Dictionary {
216        self.as_dict()
217    }
218}
219
220impl From<Dictionary> for DictionaryHandle {
221    fn from(dict: Dictionary) -> Self {
222        DictionaryHandle::from_dictionary(dict)
223    }
224}
225
#[cfg(test)]
mod tests {
    use super::*;
    use alloc::vec;

    /// Returns the byte index in `raw` at which the three repeat offsets
    /// start, found by decoding the Huffman table and the three FSE tables in
    /// the same order as `Dictionary::decode_dict` and summing the sizes they
    /// report. Panics if any table fails to decode.
    fn offset_history_start(raw: &[u8]) -> usize {
        let mut huf = crate::decoding::scratch::HuffmanScratch::new();
        let mut fse = crate::decoding::scratch::FSEScratch::new();
        // Skip the 4 byte magic number and the 4 byte dictionary id.
        let mut cursor = 8usize;

        let huf_size = huf
            .table
            .build_decoder(&raw[cursor..])
            .expect("reference dictionary huffman table should decode");
        cursor += huf_size as usize;

        let of_size = fse
            .offsets
            .build_decoder(
                &raw[cursor..],
                crate::decoding::sequence_section_decoder::OF_MAX_LOG,
            )
            .expect("reference dictionary OF table should decode");
        cursor += of_size;

        let ml_size = fse
            .match_lengths
            .build_decoder(
                &raw[cursor..],
                crate::decoding::sequence_section_decoder::ML_MAX_LOG,
            )
            .expect("reference dictionary ML table should decode");
        cursor += ml_size;

        let ll_size = fse
            .literal_lengths
            .build_decoder(
                &raw[cursor..],
                crate::decoding::sequence_section_decoder::LL_MAX_LOG,
            )
            .expect("reference dictionary LL table should decode");
        cursor += ll_size;

        cursor
    }

    #[test]
    fn decode_dict_rejects_short_buffer_before_magic_and_id() {
        let err = match Dictionary::decode_dict(&[]) {
            Ok(_) => panic!("expected short dictionary to fail"),
            Err(err) => err,
        };
        assert!(matches!(
            err,
            DictionaryDecodeError::DictionaryTooSmall { got: 0, need: 8 }
        ));
    }

    #[test]
    fn decode_dict_malformed_input_returns_error_instead_of_panicking() {
        // Valid magic number and id followed by 7 zero bytes of table data.
        let mut raw = Vec::new();
        raw.extend_from_slice(&MAGIC_NUM);
        raw.extend_from_slice(&1u32.to_le_bytes());
        raw.extend_from_slice(&[0u8; 7]);

        let result = std::panic::catch_unwind(|| Dictionary::decode_dict(&raw));
        assert!(
            result.is_ok(),
            "decode_dict must not panic on malformed input"
        );
        assert!(
            result.unwrap().is_err(),
            "malformed dictionary must return error"
        );
    }

    #[test]
    fn decode_dict_rejects_zero_repeat_offsets() {
        let pristine = include_bytes!("../../dict_tests/dictionary");
        let offset_start = offset_history_start(pristine);

        // Decode a copy of the reference dictionary in which exactly one of
        // the three repeat offsets has been overwritten with zero.
        let decode_with_zeroed_slot = |slot: usize| {
            let mut raw = pristine.to_vec();
            let at = offset_start + slot * 4;
            raw[at..at + 4].copy_from_slice(&0u32.to_le_bytes());
            Dictionary::decode_dict(&raw)
        };

        assert!(matches!(
            decode_with_zeroed_slot(0),
            Err(DictionaryDecodeError::ZeroRepeatOffsetInDictionary { index: 0 })
        ));
        assert!(matches!(
            decode_with_zeroed_slot(1),
            Err(DictionaryDecodeError::ZeroRepeatOffsetInDictionary { index: 1 })
        ));
        assert!(matches!(
            decode_with_zeroed_slot(2),
            Err(DictionaryDecodeError::ZeroRepeatOffsetInDictionary { index: 2 })
        ));
    }

    #[test]
    fn from_raw_content_rejects_empty_dictionary_content() {
        let result = Dictionary::from_raw_content(1, Vec::new());
        assert!(matches!(
            result,
            Err(DictionaryDecodeError::DictionaryTooSmall { got: 0, need: 1 })
        ));
    }

    #[test]
    fn dictionary_handle_from_raw_content_supports_as_ref() {
        let dict = Dictionary::from_raw_content(7, vec![42]).expect("raw dict should build");
        let handle = dict.into_handle();
        let dict_ref: &Dictionary = handle.as_ref();

        assert_eq!(dict_ref.id, 7);
        assert_eq!(dict_ref.dict_content.as_slice(), &[42]);
    }

    #[test]
    fn dictionary_handle_clones_share_inner() {
        let raw = include_bytes!("../../dict_tests/dictionary");
        let handle = DictionaryHandle::decode_dict(raw).expect("dictionary should parse");
        let clone = handle.clone();

        assert_eq!(handle.id(), clone.id());
        // Cloning the handle must not copy the dictionary itself.
        assert!(SharedDictionary::ptr_eq(&handle.inner, &clone.inner));
    }
}