Skip to main content

zrip_core/dict/
mod.rs

1#![forbid(unsafe_code)]
2
3#[cfg(feature = "dict_builder")]
4pub mod cover;
5#[cfg(feature = "dict_builder")]
6pub mod fastcover;
7#[cfg(feature = "dict_builder")]
8pub mod finalize;
9
10#[cfg(feature = "alloc")]
11use alloc::vec::Vec;
12
13use crate::bitstream::reader::BitReader;
14use crate::error::DecompressError;
15use crate::fse::table_builder::{build_decode_table, parse_fse_table_description};
16use crate::fse::{FseDecodeEntry, LL_MAX_SYMBOL, ML_MAX_SYMBOL, OF_MAX_SYMBOL};
17use crate::huffman::HuffmanDecodeEntry;
18use crate::huffman::weights::{build_huffman_decode_table, parse_huffman_weights};
19
/// Magic number that marks a zstd dictionary.
///
/// `Dictionary::from_bytes` reads the first four bytes of its input as a
/// little-endian `u32` and rejects the input unless they equal this value.
20pub const DICT_MAGIC: u32 = 0xEC30_A437;
21
22/// A pre-trained zstd dictionary for improved compression of small data.
23///
24/// Load from raw bytes with [`Dictionary::from_bytes`], or train with
25/// [`train_dict_fastcover`] (requires `dict_builder` feature).
26#[cfg(feature = "alloc")]
27#[derive(Clone)]
28pub struct Dictionary {
    /// Dictionary ID decoded from header bytes 4..8 (little-endian).
    /// `from_bytes` rejects a value of zero.
29    id: u32,
    /// Raw bytes that follow the repeat offsets, up to the end of the input.
30    content: Vec<u8>,
    /// Huffman decode table paired with its table log; `None` when the
    /// dictionary's Huffman weight list parsed as empty.
31    huf_table: Option<(Vec<HuffmanDecodeEntry>, u8)>,
    /// Offset-code FSE decode table paired with its accuracy log
    /// (parsed with `OF_MAX_SYMBOL` as the symbol limit).
32    of_table: Option<(Vec<FseDecodeEntry>, u8)>,
    /// Match-length FSE decode table paired with its accuracy log
    /// (parsed with `ML_MAX_SYMBOL` as the symbol limit).
33    ml_table: Option<(Vec<FseDecodeEntry>, u8)>,
    /// Literal-length FSE decode table paired with its accuracy log
    /// (parsed with `LL_MAX_SYMBOL` as the symbol limit).
34    ll_table: Option<(Vec<FseDecodeEntry>, u8)>,
    /// The three initial repeat offsets, in stored order. `from_bytes`
    /// rejects a dictionary in which any of them is zero.
35    rep_offsets: [u32; 3],
36}
37
#[cfg(feature = "alloc")]
impl Dictionary {
    /// Parses a dictionary from its raw byte representation.
    ///
    /// The input is read front to back in this order:
    ///
    /// 1. a 4-byte little-endian magic number that must equal [`DICT_MAGIC`];
    /// 2. a 4-byte little-endian dictionary ID that must be non-zero;
    /// 3. a Huffman weight description;
    /// 4. three FSE table descriptions: offset codes, match lengths and
    ///    literal lengths, in that order;
    /// 5. three 4-byte little-endian repeat offsets, each non-zero;
    /// 6. the dictionary content, which is everything that remains.
    ///
    /// # Errors
    ///
    /// Returns [`DecompressError::InvalidDictionary`] when the input is
    /// truncated at any stage, the magic number does not match, the ID or a
    /// repeat offset is zero, or an FSE decode table cannot be built. Errors
    /// raised by the Huffman and FSE description parsers are propagated.
    ///
    /// This function does not panic on malformed input: every section length
    /// reported by a parser is checked against the bytes that are left.
    pub fn from_bytes(data: &[u8]) -> Result<Self, DecompressError> {
        const HEADER_LEN: usize = 8;
        const REP_OFFSETS_LEN: usize = 12;

        if data.len() < HEADER_LEN {
            return Err(DecompressError::InvalidDictionary);
        }
        let (header, rest) = data.split_at(HEADER_LEN);

        let magic = u32::from_le_bytes([header[0], header[1], header[2], header[3]]);
        if magic != DICT_MAGIC {
            return Err(DecompressError::InvalidDictionary);
        }

        let id = u32::from_le_bytes([header[4], header[5], header[6], header[7]]);
        if id == 0 {
            return Err(DecompressError::InvalidDictionary);
        }

        // Each parser reports how many bytes it used. `get` (rather than
        // indexing) turns an over-reported length into an error instead of a
        // slice-bounds panic on untrusted input.
        let (huf_table, used) = parse_dict_huffman(rest)?;
        let rest = rest
            .get(used..)
            .ok_or(DecompressError::InvalidDictionary)?;

        let (of_table, used) = parse_dict_fse(rest, OF_MAX_SYMBOL)?;
        let rest = rest
            .get(used..)
            .ok_or(DecompressError::InvalidDictionary)?;

        let (ml_table, used) = parse_dict_fse(rest, ML_MAX_SYMBOL)?;
        let rest = rest
            .get(used..)
            .ok_or(DecompressError::InvalidDictionary)?;

        let (ll_table, used) = parse_dict_fse(rest, LL_MAX_SYMBOL)?;
        let rest = rest
            .get(used..)
            .ok_or(DecompressError::InvalidDictionary)?;

        if rest.len() < REP_OFFSETS_LEN {
            return Err(DecompressError::InvalidDictionary);
        }
        let (rep_bytes, content) = rest.split_at(REP_OFFSETS_LEN);

        let mut rep_offsets = [0u32; 3];
        for (slot, chunk) in rep_offsets.iter_mut().zip(rep_bytes.chunks_exact(4)) {
            *slot = u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]);
        }

        // NOTE(review): the zstd dictionary format also requires every repeat
        // offset to be <= the content size. Only the non-zero rule is enforced
        // here; confirm whether the decoder validates the upper bound.
        if rep_offsets.contains(&0) {
            return Err(DecompressError::InvalidDictionary);
        }

        Ok(Self {
            id,
            content: content.to_vec(),
            huf_table,
            of_table,
            ml_table,
            ll_table,
            rep_offsets,
        })
    }

    /// Returns the dictionary ID embedded in the header.
    pub fn id(&self) -> u32 {
        self.id
    }

    /// Returns the raw content segment used as match-finding history prefix.
    pub fn content(&self) -> &[u8] {
        &self.content
    }

    /// Returns the three initial repeat offsets stored in the dictionary.
    pub fn rep_offsets(&self) -> &[u32; 3] {
        &self.rep_offsets
    }

    /// Returns the Huffman decode table and its log2 size, if present.
    pub fn huf_table(&self) -> Option<(&[HuffmanDecodeEntry], u8)> {
        self.huf_table
            .as_ref()
            .map(|(entries, log)| (entries.as_slice(), *log))
    }

    /// Returns the offset-code FSE decode table and accuracy log, if present.
    pub fn of_table(&self) -> Option<(&[FseDecodeEntry], u8)> {
        self.of_table
            .as_ref()
            .map(|(entries, log)| (entries.as_slice(), *log))
    }

    /// Returns the match-length FSE decode table and accuracy log, if present.
    pub fn ml_table(&self) -> Option<(&[FseDecodeEntry], u8)> {
        self.ml_table
            .as_ref()
            .map(|(entries, log)| (entries.as_slice(), *log))
    }

    /// Returns the literal-length FSE decode table and accuracy log, if present.
    pub fn ll_table(&self) -> Option<(&[FseDecodeEntry], u8)> {
        self.ll_table
            .as_ref()
            .map(|(entries, log)| (entries.as_slice(), *log))
    }
}
136
/// Parses the Huffman section found at the start of `data`.
///
/// On success returns the decode table together with its table log (or
/// `None` when the parsed weight list is empty), plus the number of bytes
/// that `parse_huffman_weights` reported as consumed.
///
/// # Errors
///
/// Returns [`DecompressError::InvalidDictionary`] when `data` is empty, and
/// propagates any error from parsing the weights or building the table.
#[cfg(feature = "alloc")]
// The nested tuple mirrors the shape in which `Dictionary` stores the table.
#[allow(clippy::type_complexity)]
fn parse_dict_huffman(
    data: &[u8],
) -> Result<(Option<(Vec<HuffmanDecodeEntry>, u8)>, usize), DecompressError> {
    if data.is_empty() {
        return Err(DecompressError::InvalidDictionary);
    }

    let (weights, consumed) = parse_huffman_weights(data)?;

    // An empty weight list means there is no table to build.
    let decode_table = if weights.is_empty() {
        None
    } else {
        Some(build_huffman_decode_table(&weights)?)
    };

    Ok((decode_table, consumed))
}
153
/// Parses one FSE table description from the start of `data` and builds its
/// decode table.
///
/// `max_symbol` is forwarded to `parse_fse_table_description` as the symbol
/// limit. On success the table is always `Some`, paired with its accuracy
/// log, and accompanied by the number of bytes the bit reader reports as
/// consumed.
///
/// # Errors
///
/// Returns [`DecompressError::InvalidDictionary`] when `data` is empty or the
/// decode table cannot be built, and propagates any error from parsing the
/// table description.
#[cfg(feature = "alloc")]
// The nested tuple mirrors the shape in which `Dictionary` stores the table.
#[allow(clippy::type_complexity)]
fn parse_dict_fse(
    data: &[u8],
    max_symbol: u8,
) -> Result<(Option<(Vec<FseDecodeEntry>, u8)>, usize), DecompressError> {
    if data.is_empty() {
        return Err(DecompressError::InvalidDictionary);
    }

    let mut bits = BitReader::new(data);
    let (counts, log) = parse_fse_table_description(&mut bits, max_symbol)?;
    let used = bits.bytes_consumed();

    // Any table-construction failure is reported as a dictionary error.
    match build_decode_table(&counts, log) {
        Ok(entries) => Ok((Some((entries, log)), used)),
        Err(_) => Err(DecompressError::InvalidDictionary),
    }
}
171
/// Trains a dictionary from sample data using the FastCOVER algorithm.
///
/// Content is chosen by `fastcover::select_segments`, turned into dictionary
/// bytes by `finalize::finalize_dictionary`, and then parsed back through
/// [`Dictionary::from_bytes`] to produce the returned value.
///
/// # Panics
///
/// Panics if the finalized bytes are rejected by [`Dictionary::from_bytes`].
#[cfg(feature = "dict_builder")]
pub fn train_dict_fastcover(
    samples: &[&[u8]],
    dict_size: usize,
    params: fastcover::FastCoverParams,
) -> Dictionary {
    let segments = fastcover::select_segments(samples, dict_size, &params);
    let serialized = finalize::finalize_dictionary(&segments, samples, dict_size);
    Dictionary::from_bytes(&serialized).expect("finalized dictionary must be valid")
}