avx_arrow/compression/
dictionary.rs

1//! Dictionary Encoding implementation
2//!
3//! Optimized for low-cardinality data (categorical columns)
4
5use crate::error::{ArrowError, Result};
6use std::collections::HashMap;
7
8/// Encode data using dictionary encoding
9pub fn encode(data: &[u8]) -> Result<Vec<u8>> {
10    if data.is_empty() {
11        return Ok(Vec::new());
12    }
13
14    let mut dictionary = Vec::new();
15    let mut map: HashMap<u8, u8> = HashMap::new();
16    let mut indices = Vec::with_capacity(data.len());
17
18    // Build dictionary
19    for &value in data {
20        let index = *map.entry(value).or_insert_with(|| {
21            let idx = dictionary.len() as u8;
22            dictionary.push(value);
23            idx
24        });
25        indices.push(index);
26    }
27
28    if dictionary.len() > 255 {
29        return Err(ArrowError::InvalidData(
30            "Dictionary too large (>255 entries)".to_string()
31        ));
32    }
33
34    // Output format: [dict_size][dict_values...][indices...]
35    let mut output = Vec::with_capacity(1 + dictionary.len() + indices.len());
36    output.push(dictionary.len() as u8);
37    output.extend_from_slice(&dictionary);
38    output.extend_from_slice(&indices);
39
40    Ok(output)
41}
42
43/// Decode dictionary-encoded data
44pub fn decode(data: &[u8]) -> Result<Vec<u8>> {
45    if data.is_empty() {
46        return Ok(Vec::new());
47    }
48
49    if data.len() < 1 {
50        return Err(ArrowError::InvalidData(
51            "Dictionary data too short".to_string()
52        ));
53    }
54
55    let dict_size = data[0] as usize;
56    if data.len() < 1 + dict_size {
57        return Err(ArrowError::InvalidData(
58            "Truncated dictionary".to_string()
59        ));
60    }
61
62    let dictionary = &data[1..1 + dict_size];
63    let indices = &data[1 + dict_size..];
64
65    let mut output = Vec::with_capacity(indices.len());
66    for &idx in indices {
67        if idx as usize >= dict_size {
68            return Err(ArrowError::InvalidData(
69                format!("Invalid index {} for dictionary size {}", idx, dict_size)
70            ));
71        }
72        output.push(dictionary[idx as usize]);
73    }
74
75    Ok(output)
76}
77
78/// Dictionary encoder for strings
79pub struct DictionaryEncoder {
80    dictionary: Vec<Vec<u8>>,
81    map: HashMap<Vec<u8>, u32>,
82    indices: Vec<u32>,
83}
84
85impl DictionaryEncoder {
86    /// Create new encoder
87    pub fn new() -> Self {
88        Self {
89            dictionary: Vec::new(),
90            map: HashMap::new(),
91            indices: Vec::new(),
92        }
93    }
94
95    /// Encode a string value
96    pub fn encode(&mut self, value: &[u8]) -> Result<()> {
97        let index = *self.map.entry(value.to_vec()).or_insert_with(|| {
98            let idx = self.dictionary.len() as u32;
99            self.dictionary.push(value.to_vec());
100            idx
101        });
102        self.indices.push(index);
103        Ok(())
104    }
105
106    /// Get dictionary size
107    pub fn dict_size(&self) -> usize {
108        self.dictionary.len()
109    }
110
111    /// Get encoded data
112    pub fn finish(self) -> (Vec<Vec<u8>>, Vec<u32>) {
113        (self.dictionary, self.indices)
114    }
115}
116
117impl Default for DictionaryEncoder {
118    fn default() -> Self {
119        Self::new()
120    }
121}
122
123/// Dictionary encoder for integers
124pub struct DictionaryEncoderI64 {
125    dictionary: Vec<i64>,
126    map: HashMap<i64, u32>,
127    indices: Vec<u32>,
128}
129
130impl DictionaryEncoderI64 {
131    /// Create new encoder
132    pub fn new() -> Self {
133        Self {
134            dictionary: Vec::new(),
135            map: HashMap::new(),
136            indices: Vec::new(),
137        }
138    }
139
140    /// Encode an i64 value
141    pub fn encode(&mut self, value: i64) {
142        let index = *self.map.entry(value).or_insert_with(|| {
143            let idx = self.dictionary.len() as u32;
144            self.dictionary.push(value);
145            idx
146        });
147        self.indices.push(index);
148    }
149
150    /// Get dictionary size
151    pub fn dict_size(&self) -> usize {
152        self.dictionary.len()
153    }
154
155    /// Get encoded data
156    pub fn finish(self) -> (Vec<i64>, Vec<u32>) {
157        (self.dictionary, self.indices)
158    }
159}
160
161impl Default for DictionaryEncoderI64 {
162    fn default() -> Self {
163        Self::new()
164    }
165}
166
167/// Dictionary encoder for f64
168pub struct DictionaryEncoderF64 {
169    dictionary: Vec<f64>,
170    map: HashMap<u64, u32>, // Use bits as key
171    indices: Vec<u32>,
172}
173
174impl DictionaryEncoderF64 {
175    /// Create new encoder
176    pub fn new() -> Self {
177        Self {
178            dictionary: Vec::new(),
179            map: HashMap::new(),
180            indices: Vec::new(),
181        }
182    }
183
184    /// Encode an f64 value
185    pub fn encode(&mut self, value: f64) {
186        let bits = value.to_bits();
187        let index = *self.map.entry(bits).or_insert_with(|| {
188            let idx = self.dictionary.len() as u32;
189            self.dictionary.push(value);
190            idx
191        });
192        self.indices.push(index);
193    }
194
195    /// Get dictionary size
196    pub fn dict_size(&self) -> usize {
197        self.dictionary.len()
198    }
199
200    /// Get encoded data
201    pub fn finish(self) -> (Vec<f64>, Vec<u32>) {
202        (self.dictionary, self.indices)
203    }
204}
205
206impl Default for DictionaryEncoderF64 {
207    fn default() -> Self {
208        Self::new()
209    }
210}
211
212#[cfg(test)]
213mod tests {
214    use super::*;
215
216    #[test]
217    fn test_dict_low_cardinality() {
218        let data = vec![1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3];
219        let encoded = encode(&data).unwrap();
220        // Small data may not compress due to overhead
221
222        let decoded = decode(&encoded).unwrap();
223        assert_eq!(decoded, data);
224    }
225
226    #[test]
227    fn test_dict_repeated() {
228        let data = vec![5u8; 100];
229        let encoded = encode(&data).unwrap();
230        // Small repeated data has overhead
231
232        let decoded = decode(&encoded).unwrap();
233        assert_eq!(decoded, data);
234    }
235
236    #[test]
237    fn test_dict_encoder_strings() {
238        let mut encoder = DictionaryEncoder::new();
239        encoder.encode(b"hello").unwrap();
240        encoder.encode(b"world").unwrap();
241        encoder.encode(b"hello").unwrap();
242        encoder.encode(b"world").unwrap();
243
244        assert_eq!(encoder.dict_size(), 2);
245        let (dict, indices) = encoder.finish();
246        assert_eq!(dict.len(), 2);
247        assert_eq!(indices, vec![0, 1, 0, 1]);
248    }
249
250    #[test]
251    fn test_dict_encoder_i64() {
252        let mut encoder = DictionaryEncoderI64::new();
253        for i in 0..100 {
254            encoder.encode(i % 10); // Only 10 unique values
255        }
256
257        assert_eq!(encoder.dict_size(), 10);
258        let (dict, indices) = encoder.finish();
259        assert_eq!(dict.len(), 10);
260        assert_eq!(indices.len(), 100);
261    }
262
263    #[test]
264    fn test_dict_encoder_f64() {
265        let mut encoder = DictionaryEncoderF64::new();
266        for i in 0..100 {
267            encoder.encode((i % 5) as f64 * 0.5); // Only 5 unique values
268        }
269
270        assert_eq!(encoder.dict_size(), 5);
271        let (dict, indices) = encoder.finish();
272        assert_eq!(dict.len(), 5);
273        assert_eq!(indices.len(), 100);
274    }
275}
276
277
278
279
280