structured_zstd/decoding/
dictionary.rs1#[cfg(not(target_has_atomic = "ptr"))]
2use alloc::rc::Rc;
3#[cfg(target_has_atomic = "ptr")]
4use alloc::sync::Arc;
5use alloc::vec::Vec;
6use core::convert::TryInto;
7
8use crate::decoding::errors::DictionaryDecodeError;
9use crate::decoding::scratch::FSEScratch;
10use crate::decoding::scratch::HuffmanScratch;
11
12#[derive(Clone)]
17pub struct Dictionary {
18 pub id: u32,
21 pub fse: FSEScratch,
24 pub huf: HuffmanScratch,
27 pub dict_content: Vec<u8>,
37 pub offset_hist: [u32; 3],
42}
43
44#[cfg(target_has_atomic = "ptr")]
45type SharedDictionary = Arc<Dictionary>;
46#[cfg(not(target_has_atomic = "ptr"))]
47type SharedDictionary = Rc<Dictionary>;
48
49#[derive(Clone)]
53pub struct DictionaryHandle {
54 inner: SharedDictionary,
55}
56
/// Magic number that must open a serialized dictionary: `0xEC30A437` stored
/// little-endian, i.e. the bytes `37 A4 30 EC`.
pub const MAGIC_NUM: [u8; 4] = 0xEC30_A437u32.to_le_bytes();
59
60impl Dictionary {
61 pub fn heap_bytes(&self) -> usize {
65 self.dict_content.capacity() + self.fse.heap_bytes() + self.huf.heap_bytes()
66 }
67
68 pub fn from_raw_content(
73 id: u32,
74 dict_content: Vec<u8>,
75 ) -> Result<Dictionary, DictionaryDecodeError> {
76 if id == 0 {
77 return Err(DictionaryDecodeError::ZeroDictionaryId);
78 }
79 if dict_content.is_empty() {
80 return Err(DictionaryDecodeError::DictionaryTooSmall { got: 0, need: 1 });
81 }
82
83 Ok(Dictionary {
84 id,
85 fse: FSEScratch::new(),
86 huf: HuffmanScratch::new(),
87 dict_content,
88 offset_hist: [1, 4, 8],
89 })
90 }
91
92 pub fn decode_dict(raw: &[u8]) -> Result<Dictionary, DictionaryDecodeError> {
96 Self::decode_dict_inner(raw, true)
97 }
98
99 pub(crate) fn decode_dict_for_encoding(
118 raw: &[u8],
119 ) -> Result<Dictionary, DictionaryDecodeError> {
120 Self::decode_dict_inner(raw, false)
121 }
122
123 fn decode_dict_inner(
129 raw: &[u8],
130 build_decode_tables: bool,
131 ) -> Result<Dictionary, DictionaryDecodeError> {
132 const MIN_MAGIC_AND_ID_LEN: usize = 8;
133 const OFFSET_HISTORY_LEN: usize = 12;
134
135 if raw.len() < MIN_MAGIC_AND_ID_LEN {
136 return Err(DictionaryDecodeError::DictionaryTooSmall {
137 got: raw.len(),
138 need: MIN_MAGIC_AND_ID_LEN,
139 });
140 }
141
142 let mut new_dict = Dictionary {
143 id: 0,
144 fse: FSEScratch::new(),
145 huf: HuffmanScratch::new(),
146 dict_content: Vec::new(),
147 offset_hist: [1, 4, 8],
148 };
149
150 let magic_num: [u8; 4] = raw[..4].try_into().expect("optimized away");
151 if magic_num != MAGIC_NUM {
152 return Err(DictionaryDecodeError::BadMagicNum { got: magic_num });
153 }
154
155 let dict_id = raw[4..8].try_into().expect("optimized away");
156 let dict_id = u32::from_le_bytes(dict_id);
157 if dict_id == 0 {
158 return Err(DictionaryDecodeError::ZeroDictionaryId);
159 }
160 new_dict.id = dict_id;
161
162 let raw_tables = &raw[8..];
163
164 let huf_size = if build_decode_tables {
165 new_dict.huf.table.build_decoder(raw_tables)?
166 } else {
167 new_dict.huf.table.build_weights_only(raw_tables)?
168 };
169 let raw_tables = &raw_tables[huf_size as usize..];
170
171 let of_size = if build_decode_tables {
172 let n = new_dict.fse.offsets.build_decoder(
173 raw_tables,
174 crate::decoding::sequence_section_decoder::OF_MAX_LOG,
175 )?;
176 new_dict.fse.offsets.enrich_for_offsets();
177 new_dict.fse.offsets_long_share =
183 crate::decoding::sequence_section_decoder::compute_offsets_long_share(
184 &new_dict.fse.offsets,
185 );
186 n
187 } else {
188 new_dict.fse.offsets.read_table_probabilities(
189 raw_tables,
190 crate::decoding::sequence_section_decoder::OF_MAX_LOG,
191 )?
192 };
193 let raw_tables = &raw_tables[of_size..];
194
195 let ml_size = if build_decode_tables {
196 let n = new_dict.fse.match_lengths.build_decoder(
197 raw_tables,
198 crate::decoding::sequence_section_decoder::ML_MAX_LOG,
199 )?;
200 new_dict
201 .fse
202 .match_lengths
203 .enrich_with_packed_seq_meta(&crate::decoding::sequence_section_decoder::ML_META);
204 n
205 } else {
206 new_dict.fse.match_lengths.read_table_probabilities(
207 raw_tables,
208 crate::decoding::sequence_section_decoder::ML_MAX_LOG,
209 )?
210 };
211 let raw_tables = &raw_tables[ml_size..];
212
213 let ll_size = if build_decode_tables {
214 let n = new_dict.fse.literal_lengths.build_decoder(
215 raw_tables,
216 crate::decoding::sequence_section_decoder::LL_MAX_LOG,
217 )?;
218 new_dict
219 .fse
220 .literal_lengths
221 .enrich_with_packed_seq_meta(&crate::decoding::sequence_section_decoder::LL_META);
222 n
223 } else {
224 new_dict.fse.literal_lengths.read_table_probabilities(
225 raw_tables,
226 crate::decoding::sequence_section_decoder::LL_MAX_LOG,
227 )?
228 };
229 let raw_tables = &raw_tables[ll_size..];
230
231 if raw_tables.len() < OFFSET_HISTORY_LEN {
232 return Err(DictionaryDecodeError::DictionaryTooSmall {
233 got: raw_tables.len(),
234 need: OFFSET_HISTORY_LEN,
235 });
236 }
237
238 let offset1 = raw_tables[0..4].try_into().expect("optimized away");
239 let offset1 = u32::from_le_bytes(offset1);
240
241 let offset2 = raw_tables[4..8].try_into().expect("optimized away");
242 let offset2 = u32::from_le_bytes(offset2);
243
244 let offset3 = raw_tables[8..12].try_into().expect("optimized away");
245 let offset3 = u32::from_le_bytes(offset3);
246
247 if offset1 == 0 {
248 return Err(DictionaryDecodeError::ZeroRepeatOffsetInDictionary { index: 0 });
249 }
250 if offset2 == 0 {
251 return Err(DictionaryDecodeError::ZeroRepeatOffsetInDictionary { index: 1 });
252 }
253 if offset3 == 0 {
254 return Err(DictionaryDecodeError::ZeroRepeatOffsetInDictionary { index: 2 });
255 }
256
257 new_dict.offset_hist[0] = offset1;
258 new_dict.offset_hist[1] = offset2;
259 new_dict.offset_hist[2] = offset3;
260
261 let raw_content = &raw_tables[12..];
262 new_dict.dict_content.extend(raw_content);
263
264 Ok(new_dict)
265 }
266
267 pub fn into_handle(self) -> DictionaryHandle {
269 DictionaryHandle::from_dictionary(self)
270 }
271}
272
273impl DictionaryHandle {
274 pub fn from_dictionary(dict: Dictionary) -> Self {
276 Self {
277 inner: SharedDictionary::new(dict),
278 }
279 }
280
281 pub fn decode_dict(raw: &[u8]) -> Result<Self, DictionaryDecodeError> {
283 Dictionary::decode_dict(raw).map(Self::from_dictionary)
284 }
285
286 pub fn id(&self) -> u32 {
287 self.inner.id
288 }
289
290 pub fn as_dict(&self) -> &Dictionary {
291 &self.inner
292 }
293}
294
295impl AsRef<Dictionary> for DictionaryHandle {
296 fn as_ref(&self) -> &Dictionary {
297 self.as_dict()
298 }
299}
300
301impl From<Dictionary> for DictionaryHandle {
302 fn from(dict: Dictionary) -> Self {
303 DictionaryHandle::from_dictionary(dict)
304 }
305}
306
#[cfg(test)]
mod tests {
    use super::*;
    use alloc::vec;

    /// Returns the index in `raw` where the 12-byte repeat-offset history begins.
    ///
    /// Walks the same sections the decoder does: skips the 8-byte magic/id
    /// header, then the Huffman table and the OF, ML and LL FSE tables.
    /// Panics if any table of the reference dictionary fails to decode.
    fn offset_history_start(raw: &[u8]) -> usize {
        let mut huf = crate::decoding::scratch::HuffmanScratch::new();
        let mut fse = crate::decoding::scratch::FSEScratch::new();
        let mut cursor = 8usize;

        let huf_size = huf
            .table
            .build_decoder(&raw[cursor..])
            .expect("reference dictionary huffman table should decode");
        cursor += huf_size as usize;

        let of_size = fse
            .offsets
            .build_decoder(
                &raw[cursor..],
                crate::decoding::sequence_section_decoder::OF_MAX_LOG,
            )
            .expect("reference dictionary OF table should decode");
        cursor += of_size;

        let ml_size = fse
            .match_lengths
            .build_decoder(
                &raw[cursor..],
                crate::decoding::sequence_section_decoder::ML_MAX_LOG,
            )
            .expect("reference dictionary ML table should decode");
        cursor += ml_size;

        let ll_size = fse
            .literal_lengths
            .build_decoder(
                &raw[cursor..],
                crate::decoding::sequence_section_decoder::LL_MAX_LOG,
            )
            .expect("reference dictionary LL table should decode");
        cursor += ll_size;

        cursor
    }

    #[test]
    fn decode_dict_rejects_short_buffer_before_magic_and_id() {
        let err = match Dictionary::decode_dict(&[]) {
            Ok(_) => panic!("expected short dictionary to fail"),
            Err(err) => err,
        };
        assert!(matches!(
            err,
            DictionaryDecodeError::DictionaryTooSmall { got: 0, need: 8 }
        ));
    }

    #[test]
    fn decode_dict_malformed_input_returns_error_instead_of_panicking() {
        // Valid magic and id followed by seven zero bytes of table data.
        let mut raw = Vec::new();
        raw.extend_from_slice(&MAGIC_NUM);
        raw.extend_from_slice(&1u32.to_le_bytes());
        raw.extend_from_slice(&[0u8; 7]);

        let result = std::panic::catch_unwind(|| Dictionary::decode_dict(&raw));
        assert!(
            result.is_ok(),
            "decode_dict must not panic on malformed input"
        );
        assert!(
            result.unwrap().is_err(),
            "malformed dictionary must return error"
        );
    }

    #[test]
    fn decode_dict_rejects_zero_repeat_offsets() {
        let pristine = include_bytes!("../../dict_tests/dictionary").to_vec();
        let offset_start = offset_history_start(&pristine);

        // Zero each repeat offset in turn, leaving the other two intact, so
        // every rejection branch of the decoder is exercised.
        for slot in 0..3usize {
            let mut raw = pristine.clone();
            let at = offset_start + slot * 4;
            raw[at..at + 4].copy_from_slice(&0u32.to_le_bytes());

            let decoded = Dictionary::decode_dict(&raw);
            let rejected = match slot {
                0 => matches!(
                    decoded,
                    Err(DictionaryDecodeError::ZeroRepeatOffsetInDictionary { index: 0 })
                ),
                1 => matches!(
                    decoded,
                    Err(DictionaryDecodeError::ZeroRepeatOffsetInDictionary { index: 1 })
                ),
                _ => matches!(
                    decoded,
                    Err(DictionaryDecodeError::ZeroRepeatOffsetInDictionary { index: 2 })
                ),
            };
            assert!(
                rejected,
                "zero repeat offset at index {} must be rejected",
                slot
            );
        }
    }

    #[test]
    fn from_raw_content_rejects_empty_dictionary_content() {
        let result = Dictionary::from_raw_content(1, Vec::new());
        assert!(matches!(
            result,
            Err(DictionaryDecodeError::DictionaryTooSmall { got: 0, need: 1 })
        ));
    }

    #[test]
    fn dictionary_handle_from_raw_content_supports_as_ref() {
        let dict = Dictionary::from_raw_content(7, vec![42]).expect("raw dict should build");
        let handle = dict.into_handle();
        let dict_ref: &Dictionary = handle.as_ref();

        assert_eq!(dict_ref.id, 7);
        assert_eq!(dict_ref.dict_content.as_slice(), &[42]);
    }

    #[test]
    fn dictionary_handle_clones_share_inner() {
        let raw = include_bytes!("../../dict_tests/dictionary");
        let handle = DictionaryHandle::decode_dict(raw).expect("dictionary should parse");
        let clone = handle.clone();

        assert_eq!(handle.id(), clone.id());
        // Cloning must copy the pointer, not the dictionary.
        assert!(SharedDictionary::ptr_eq(&handle.inner, &clone.inner));
    }
}