Skip to main content

structured_zstd/decoding/
dictionary.rs

1use alloc::vec::Vec;
2use core::convert::TryInto;
3
4use crate::decoding::errors::DictionaryDecodeError;
5use crate::decoding::scratch::FSEScratch;
6use crate::decoding::scratch::HuffmanScratch;
7
8/// Zstandard includes support for "raw content" dictionaries, that store bytes optionally used
9/// during sequence execution.
10///
11/// <https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary-format>
12pub struct Dictionary {
13    /// A 4 byte value used by decoders to check if they can use
14    /// the correct dictionary. This value must not be zero.
15    pub id: u32,
16    /// A dictionary can contain an entropy table, either FSE or
17    /// Huffman.
18    pub fse: FSEScratch,
19    /// A dictionary can contain an entropy table, either FSE or
20    /// Huffman.
21    pub huf: HuffmanScratch,
22    /// The content of a dictionary acts as a "past" in front of data
23    /// to compress or decompress,
24    /// so it can be referenced in sequence commands.
25    /// As long as the amount of data decoded from this frame is less than or
26    /// equal to Window_Size, sequence commands may specify offsets longer than
27    /// the total length of decoded output so far to reference back to the
28    /// dictionary, even parts of the dictionary with offsets larger than Window_Size.
29    /// After the total output has surpassed Window_Size however,
30    /// this is no longer allowed and the dictionary is no longer accessible
31    pub dict_content: Vec<u8>,
32    /// The 3 most recent offsets are stored so that they can be used
33    /// during sequence execution, see
34    /// <https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#repeat-offsets>
35    /// for more.
36    pub offset_hist: [u32; 3],
37}
38
/// Magic number found at the very start of a formatted dictionary: the 32-bit
/// value `0xEC30A437` laid out as 4 little-endian bytes.
pub const MAGIC_NUM: [u8; 4] = 0xEC30_A437_u32.to_le_bytes();
41
42impl Dictionary {
43    /// Build a dictionary from raw content bytes (without entropy table sections).
44    ///
45    /// This is primarily intended for dictionaries produced by the `dict_builder`
46    /// module, which currently emits raw-content dictionaries.
47    pub fn from_raw_content(
48        id: u32,
49        dict_content: Vec<u8>,
50    ) -> Result<Dictionary, DictionaryDecodeError> {
51        if id == 0 {
52            return Err(DictionaryDecodeError::ZeroDictionaryId);
53        }
54        if dict_content.is_empty() {
55            return Err(DictionaryDecodeError::DictionaryTooSmall { got: 0, need: 1 });
56        }
57
58        Ok(Dictionary {
59            id,
60            fse: FSEScratch::new(),
61            huf: HuffmanScratch::new(),
62            dict_content,
63            offset_hist: [1, 4, 8],
64        })
65    }
66
67    /// Parses the dictionary from `raw`, initializes its tables,
68    /// and returns a fully constructed [`Dictionary`] whose `id` can be
69    /// checked against the frame's `dict_id`.
70    pub fn decode_dict(raw: &[u8]) -> Result<Dictionary, DictionaryDecodeError> {
71        const MIN_MAGIC_AND_ID_LEN: usize = 8;
72        const OFFSET_HISTORY_LEN: usize = 12;
73
74        if raw.len() < MIN_MAGIC_AND_ID_LEN {
75            return Err(DictionaryDecodeError::DictionaryTooSmall {
76                got: raw.len(),
77                need: MIN_MAGIC_AND_ID_LEN,
78            });
79        }
80
81        let mut new_dict = Dictionary {
82            id: 0,
83            fse: FSEScratch::new(),
84            huf: HuffmanScratch::new(),
85            dict_content: Vec::new(),
86            offset_hist: [1, 4, 8],
87        };
88
89        let magic_num: [u8; 4] = raw[..4].try_into().expect("optimized away");
90        if magic_num != MAGIC_NUM {
91            return Err(DictionaryDecodeError::BadMagicNum { got: magic_num });
92        }
93
94        let dict_id = raw[4..8].try_into().expect("optimized away");
95        let dict_id = u32::from_le_bytes(dict_id);
96        if dict_id == 0 {
97            return Err(DictionaryDecodeError::ZeroDictionaryId);
98        }
99        new_dict.id = dict_id;
100
101        let raw_tables = &raw[8..];
102
103        let huf_size = new_dict.huf.table.build_decoder(raw_tables)?;
104        let raw_tables = &raw_tables[huf_size as usize..];
105
106        let of_size = new_dict.fse.offsets.build_decoder(
107            raw_tables,
108            crate::decoding::sequence_section_decoder::OF_MAX_LOG,
109        )?;
110        let raw_tables = &raw_tables[of_size..];
111
112        let ml_size = new_dict.fse.match_lengths.build_decoder(
113            raw_tables,
114            crate::decoding::sequence_section_decoder::ML_MAX_LOG,
115        )?;
116        let raw_tables = &raw_tables[ml_size..];
117
118        let ll_size = new_dict.fse.literal_lengths.build_decoder(
119            raw_tables,
120            crate::decoding::sequence_section_decoder::LL_MAX_LOG,
121        )?;
122        let raw_tables = &raw_tables[ll_size..];
123
124        if raw_tables.len() < OFFSET_HISTORY_LEN {
125            return Err(DictionaryDecodeError::DictionaryTooSmall {
126                got: raw_tables.len(),
127                need: OFFSET_HISTORY_LEN,
128            });
129        }
130
131        let offset1 = raw_tables[0..4].try_into().expect("optimized away");
132        let offset1 = u32::from_le_bytes(offset1);
133
134        let offset2 = raw_tables[4..8].try_into().expect("optimized away");
135        let offset2 = u32::from_le_bytes(offset2);
136
137        let offset3 = raw_tables[8..12].try_into().expect("optimized away");
138        let offset3 = u32::from_le_bytes(offset3);
139
140        if offset1 == 0 {
141            return Err(DictionaryDecodeError::ZeroRepeatOffsetInDictionary { index: 0 });
142        }
143        if offset2 == 0 {
144            return Err(DictionaryDecodeError::ZeroRepeatOffsetInDictionary { index: 1 });
145        }
146        if offset3 == 0 {
147            return Err(DictionaryDecodeError::ZeroRepeatOffsetInDictionary { index: 2 });
148        }
149
150        new_dict.offset_hist[0] = offset1;
151        new_dict.offset_hist[1] = offset2;
152        new_dict.offset_hist[2] = offset3;
153
154        let raw_content = &raw_tables[12..];
155        new_dict.dict_content.extend(raw_content);
156
157        Ok(new_dict)
158    }
159}
160
#[cfg(test)]
mod tests {
    use super::*;
    use crate::decoding::sequence_section_decoder::{LL_MAX_LOG, ML_MAX_LOG, OF_MAX_LOG};

    /// Returns the index in `raw` at which the 12-byte repeat-offset history
    /// begins, found by decoding the tables in the same order as
    /// `Dictionary::decode_dict`: Huffman, then the offset, match-length and
    /// literal-length FSE tables, starting after the 8-byte header.
    ///
    /// Panics if any table fails to decode; it is only meant to be used on the
    /// reference dictionary.
    fn offset_history_start(raw: &[u8]) -> usize {
        let mut huf = HuffmanScratch::new();
        let mut fse = FSEScratch::new();
        let mut cursor = 8usize;

        let huf_size = huf
            .table
            .build_decoder(&raw[cursor..])
            .expect("reference dictionary huffman table should decode");
        cursor += huf_size as usize;

        let of_size = fse
            .offsets
            .build_decoder(&raw[cursor..], OF_MAX_LOG)
            .expect("reference dictionary OF table should decode");
        cursor += of_size;

        let ml_size = fse
            .match_lengths
            .build_decoder(&raw[cursor..], ML_MAX_LOG)
            .expect("reference dictionary ML table should decode");
        cursor += ml_size;

        let ll_size = fse
            .literal_lengths
            .build_decoder(&raw[cursor..], LL_MAX_LOG)
            .expect("reference dictionary LL table should decode");
        cursor += ll_size;

        cursor
    }

    #[test]
    fn decode_dict_rejects_short_buffer_before_magic_and_id() {
        let err = match Dictionary::decode_dict(&[]) {
            Ok(_) => panic!("expected short dictionary to fail"),
            Err(err) => err,
        };
        assert!(matches!(
            err,
            DictionaryDecodeError::DictionaryTooSmall { got: 0, need: 8 }
        ));
    }

    #[test]
    fn decode_dict_malformed_input_returns_error_instead_of_panicking() {
        // Valid magic and id followed by 7 zero bytes of table data.
        let mut raw = Vec::new();
        raw.extend_from_slice(&MAGIC_NUM);
        raw.extend_from_slice(&1u32.to_le_bytes());
        raw.extend_from_slice(&[0u8; 7]);

        let result = std::panic::catch_unwind(|| Dictionary::decode_dict(&raw));
        assert!(
            result.is_ok(),
            "decode_dict must not panic on malformed input"
        );
        assert!(
            result.unwrap().is_err(),
            "malformed dictionary must return error"
        );
    }

    #[test]
    fn decode_dict_rejects_zero_repeat_offsets() {
        let reference: &[u8] = include_bytes!("../../dict_tests/dictionary");
        let offset_start = offset_history_start(reference);

        // Decode a copy of the reference dictionary in which exactly one of
        // the three stored repeat offsets has been overwritten with zero.
        let decode_with_zeroed_offset = |slot: usize| {
            let mut raw = reference.to_vec();
            let at = offset_start + 4 * slot;
            raw[at..at + 4].copy_from_slice(&0u32.to_le_bytes());
            Dictionary::decode_dict(&raw)
        };

        assert!(matches!(
            decode_with_zeroed_offset(0),
            Err(DictionaryDecodeError::ZeroRepeatOffsetInDictionary { index: 0 })
        ));
        assert!(matches!(
            decode_with_zeroed_offset(1),
            Err(DictionaryDecodeError::ZeroRepeatOffsetInDictionary { index: 1 })
        ));
        assert!(matches!(
            decode_with_zeroed_offset(2),
            Err(DictionaryDecodeError::ZeroRepeatOffsetInDictionary { index: 2 })
        ));
    }

    #[test]
    fn from_raw_content_rejects_zero_dictionary_id() {
        let result = Dictionary::from_raw_content(0, [1u8].to_vec());
        assert!(matches!(
            result,
            Err(DictionaryDecodeError::ZeroDictionaryId)
        ));
    }

    #[test]
    fn from_raw_content_rejects_empty_dictionary_content() {
        let result = Dictionary::from_raw_content(1, Vec::new());
        assert!(matches!(
            result,
            Err(DictionaryDecodeError::DictionaryTooSmall { got: 0, need: 1 })
        ));
    }
}