Skip to main content

zrip_core/dict/
mod.rs

1#![forbid(unsafe_code)]
2
3#[cfg(feature = "dict_builder")]
4pub mod cover;
5#[cfg(feature = "dict_builder")]
6pub mod fastcover;
7#[cfg(feature = "dict_builder")]
8pub mod finalize;
9
10#[cfg(feature = "alloc")]
11use alloc::vec::Vec;
12
13use crate::bitstream::reader::BitReader;
14use crate::error::DecompressError;
15use crate::fse::table_builder::{build_decode_table, parse_fse_table_description};
16use crate::fse::{FseDecodeEntry, LL_MAX_SYMBOL, ML_MAX_SYMBOL, OF_MAX_SYMBOL};
17use crate::huffman::HuffmanDecodeEntry;
18use crate::huffman::weights::{build_huffman_decode_table, parse_huffman_weights};
19
/// Magic number that opens a serialized dictionary.
///
/// [`Dictionary::from_bytes`] reads the first four bytes of its input as a
/// little-endian `u32` and rejects the input unless it equals this value.
pub const DICT_MAGIC: u32 = 0xEC30A437;
21
/// A pre-trained zstd dictionary for improved compression of small data.
///
/// Load from raw bytes with [`Dictionary::from_bytes`], or train with
/// `train_dict_fastcover` (requires the `dict_builder` feature).
///
/// All fields are private; use the accessor methods to read them.
#[cfg(feature = "alloc")]
pub struct Dictionary {
    /// Dictionary ID, read as a little-endian `u32` from header bytes 4..8.
    id: u32,
    /// Every byte that follows the three repeat offsets in the serialized form.
    content: Vec<u8>,
    /// Huffman decode table paired with its table log; `None` when the
    /// dictionary's Huffman weight list parsed as empty.
    huf_table: Option<(Vec<HuffmanDecodeEntry>, u8)>,
    /// FSE decode table parsed with `OF_MAX_SYMBOL`, paired with its accuracy log.
    of_table: Option<(Vec<FseDecodeEntry>, u8)>,
    /// FSE decode table parsed with `ML_MAX_SYMBOL`, paired with its accuracy log.
    ml_table: Option<(Vec<FseDecodeEntry>, u8)>,
    /// FSE decode table parsed with `LL_MAX_SYMBOL`, paired with its accuracy log.
    ll_table: Option<(Vec<FseDecodeEntry>, u8)>,
    /// The three repeat offsets, in the order they are stored in the dictionary.
    rep_offsets: [u32; 3],
}
36
#[cfg(feature = "alloc")]
impl Dictionary {
    /// Parses a dictionary from its raw byte representation.
    ///
    /// The expected layout is, in order: the 4-byte magic ([`DICT_MAGIC`]), the
    /// 4-byte dictionary ID, the Huffman table description, three FSE table
    /// descriptions (parsed with `OF_MAX_SYMBOL`, `ML_MAX_SYMBOL` and
    /// `LL_MAX_SYMBOL` respectively), three 4-byte repeat offsets, and finally
    /// the raw content. All fixed-width integers are little-endian.
    ///
    /// # Errors
    ///
    /// Returns [`DecompressError::InvalidDictionary`] when:
    /// - the input is shorter than the 8-byte header or the magic mismatches,
    /// - the input ends before any of the entropy tables,
    /// - an FSE decode table cannot be built from its description,
    /// - fewer than 12 bytes remain for the repeat offsets, or
    /// - a repeat offset is zero or larger than the content that follows.
    ///
    /// Errors raised by the Huffman and FSE description parsers are propagated
    /// through `?`.
    pub fn from_bytes(data: &[u8]) -> Result<Self, DecompressError> {
        // Header: magic then dictionary ID, four little-endian bytes each.
        let (magic, id) = match data {
            [m0, m1, m2, m3, i0, i1, i2, i3, ..] => (
                u32::from_le_bytes([*m0, *m1, *m2, *m3]),
                u32::from_le_bytes([*i0, *i1, *i2, *i3]),
            ),
            _ => return Err(DecompressError::InvalidDictionary),
        };
        if magic != DICT_MAGIC {
            return Err(DecompressError::InvalidDictionary);
        }
        let mut pos = 8;

        let (huf_table, huf_consumed) = parse_dict_huffman(Self::tail(data, pos)?)?;
        pos += huf_consumed;

        let (of_table, of_consumed) = parse_dict_fse(Self::tail(data, pos)?, OF_MAX_SYMBOL)?;
        pos += of_consumed;

        let (ml_table, ml_consumed) = parse_dict_fse(Self::tail(data, pos)?, ML_MAX_SYMBOL)?;
        pos += ml_consumed;

        let (ll_table, ll_consumed) = parse_dict_fse(Self::tail(data, pos)?, LL_MAX_SYMBOL)?;
        pos += ll_consumed;

        // Whatever is left must hold the 12 bytes of repeat offsets; the
        // remainder after those is the dictionary content.
        let rest = Self::tail(data, pos)?;
        if rest.len() < 12 {
            return Err(DecompressError::InvalidDictionary);
        }
        let (rep_bytes, content) = rest.split_at(12);

        let mut rep_offsets = [0u32; 3];
        for (slot, word) in rep_offsets.iter_mut().zip(rep_bytes.chunks_exact(4)) {
            *slot = u32::from_le_bytes([word[0], word[1], word[2], word[3]]);
        }

        // The zstd dictionary format requires every repeat offset to be
        // non-zero and no larger than the content size; anything else would
        // describe a back-reference that starts before the dictionary content.
        // Validate before copying the content so invalid input never allocates.
        let content_len = content.len() as u64;
        let bad_offset = rep_offsets
            .iter()
            .any(|&rep| rep == 0 || u64::from(rep) > content_len);
        if bad_offset {
            return Err(DecompressError::InvalidDictionary);
        }

        Ok(Self {
            id,
            content: content.to_vec(),
            huf_table,
            of_table,
            ml_table,
            ll_table,
            rep_offsets,
        })
    }

    /// Returns `data[pos..]`, or `InvalidDictionary` if `pos` lies past the end.
    ///
    /// Guards against a sub-parser reporting more consumed bytes than it was
    /// handed, which would otherwise panic on the slice.
    fn tail(data: &[u8], pos: usize) -> Result<&[u8], DecompressError> {
        data.get(pos..).ok_or(DecompressError::InvalidDictionary)
    }

    /// Returns the dictionary ID embedded in the header.
    pub fn id(&self) -> u32 {
        self.id
    }

    /// Returns the raw content segment used as match-finding history prefix.
    pub fn content(&self) -> &[u8] {
        &self.content
    }

    /// Returns the three initial repeat offsets stored in the dictionary.
    pub fn rep_offsets(&self) -> &[u32; 3] {
        &self.rep_offsets
    }

    /// Returns the Huffman decode table and its table log.
    ///
    /// `None` when the dictionary's Huffman weight list was empty.
    pub fn huf_table(&self) -> Option<(&[HuffmanDecodeEntry], u8)> {
        self.huf_table.as_ref().map(|(t, l)| (t.as_slice(), *l))
    }

    /// Returns the FSE decode table parsed with `OF_MAX_SYMBOL` and its
    /// accuracy log, if present.
    pub fn of_table(&self) -> Option<(&[FseDecodeEntry], u8)> {
        self.of_table.as_ref().map(|(t, l)| (t.as_slice(), *l))
    }

    /// Returns the FSE decode table parsed with `ML_MAX_SYMBOL` and its
    /// accuracy log, if present.
    pub fn ml_table(&self) -> Option<(&[FseDecodeEntry], u8)> {
        self.ml_table.as_ref().map(|(t, l)| (t.as_slice(), *l))
    }

    /// Returns the FSE decode table parsed with `LL_MAX_SYMBOL` and its
    /// accuracy log, if present.
    pub fn ll_table(&self) -> Option<(&[FseDecodeEntry], u8)> {
        self.ll_table.as_ref().map(|(t, l)| (t.as_slice(), *l))
    }
}
128
/// Parses the Huffman table description at the start of `data`.
///
/// Returns the decode table with its table log (or `None` when the parsed
/// weight list is empty) together with the number of bytes the weight parser
/// reported as consumed.
///
/// # Errors
///
/// Returns [`DecompressError::InvalidDictionary`] for empty input; errors from
/// weight parsing and table construction are propagated through `?`.
#[cfg(feature = "alloc")]
fn parse_dict_huffman(
    data: &[u8],
) -> Result<(Option<(Vec<HuffmanDecodeEntry>, u8)>, usize), DecompressError> {
    if data.is_empty() {
        return Err(DecompressError::InvalidDictionary);
    }

    let (weights, used) = parse_huffman_weights(data)?;

    // An empty weight list means there is no table to build.
    let decoder = match weights.is_empty() {
        true => None,
        false => Some(build_huffman_decode_table(&weights)?),
    };
    Ok((decoder, used))
}
144
/// Parses one FSE table description at the start of `data` and builds its
/// decode table.
///
/// `max_symbol` is forwarded to the description parser. On success the table
/// is always returned as `Some`, paired with its accuracy log, alongside the
/// number of bytes the bit reader reports as consumed.
///
/// # Errors
///
/// Returns [`DecompressError::InvalidDictionary`] for empty input or when the
/// decode table cannot be built; description-parsing errors are propagated
/// through `?`.
#[cfg(feature = "alloc")]
fn parse_dict_fse(
    data: &[u8],
    max_symbol: u8,
) -> Result<(Option<(Vec<FseDecodeEntry>, u8)>, usize), DecompressError> {
    if data.is_empty() {
        return Err(DecompressError::InvalidDictionary);
    }

    let mut bits = BitReader::new(data);
    let (norm_counts, acc_log) = parse_fse_table_description(&mut bits, max_symbol)?;
    // Capture the byte count right after the description is read, before the
    // table is built.
    let used = bits.bytes_consumed();

    match build_decode_table(&norm_counts, acc_log) {
        Ok(decode_table) => Ok((Some((decode_table, acc_log)), used)),
        Err(_) => Err(DecompressError::InvalidDictionary),
    }
}
161
/// Trains a dictionary from sample data using the FastCOVER algorithm.
///
/// Segments are selected from `samples` by `fastcover::select_segments`,
/// serialized by `finalize::finalize_dictionary`, and the resulting bytes are
/// parsed back with [`Dictionary::from_bytes`].
///
/// # Panics
///
/// Panics if the finalized bytes fail to parse as a dictionary.
#[cfg(feature = "dict_builder")]
pub fn train_dict_fastcover(
    samples: &[&[u8]],
    dict_size: usize,
    params: fastcover::FastCoverParams,
) -> Dictionary {
    let segments = fastcover::select_segments(samples, dict_size, &params);
    let serialized = finalize::finalize_dictionary(&segments, samples, dict_size);
    let parsed = Dictionary::from_bytes(&serialized);
    parsed.expect("finalized dictionary must be valid")
}