avx_arrow/compression/
dictionary.rs1use crate::error::{ArrowError, Result};
6use std::collections::HashMap;
7
8pub fn encode(data: &[u8]) -> Result<Vec<u8>> {
10 if data.is_empty() {
11 return Ok(Vec::new());
12 }
13
14 let mut dictionary = Vec::new();
15 let mut map: HashMap<u8, u8> = HashMap::new();
16 let mut indices = Vec::with_capacity(data.len());
17
18 for &value in data {
20 let index = *map.entry(value).or_insert_with(|| {
21 let idx = dictionary.len() as u8;
22 dictionary.push(value);
23 idx
24 });
25 indices.push(index);
26 }
27
28 if dictionary.len() > 255 {
29 return Err(ArrowError::InvalidData(
30 "Dictionary too large (>255 entries)".to_string()
31 ));
32 }
33
34 let mut output = Vec::with_capacity(1 + dictionary.len() + indices.len());
36 output.push(dictionary.len() as u8);
37 output.extend_from_slice(&dictionary);
38 output.extend_from_slice(&indices);
39
40 Ok(output)
41}
42
43pub fn decode(data: &[u8]) -> Result<Vec<u8>> {
45 if data.is_empty() {
46 return Ok(Vec::new());
47 }
48
49 if data.len() < 1 {
50 return Err(ArrowError::InvalidData(
51 "Dictionary data too short".to_string()
52 ));
53 }
54
55 let dict_size = data[0] as usize;
56 if data.len() < 1 + dict_size {
57 return Err(ArrowError::InvalidData(
58 "Truncated dictionary".to_string()
59 ));
60 }
61
62 let dictionary = &data[1..1 + dict_size];
63 let indices = &data[1 + dict_size..];
64
65 let mut output = Vec::with_capacity(indices.len());
66 for &idx in indices {
67 if idx as usize >= dict_size {
68 return Err(ArrowError::InvalidData(
69 format!("Invalid index {} for dictionary size {}", idx, dict_size)
70 ));
71 }
72 output.push(dictionary[idx as usize]);
73 }
74
75 Ok(output)
76}
77
78pub struct DictionaryEncoder {
80 dictionary: Vec<Vec<u8>>,
81 map: HashMap<Vec<u8>, u32>,
82 indices: Vec<u32>,
83}
84
85impl DictionaryEncoder {
86 pub fn new() -> Self {
88 Self {
89 dictionary: Vec::new(),
90 map: HashMap::new(),
91 indices: Vec::new(),
92 }
93 }
94
95 pub fn encode(&mut self, value: &[u8]) -> Result<()> {
97 let index = *self.map.entry(value.to_vec()).or_insert_with(|| {
98 let idx = self.dictionary.len() as u32;
99 self.dictionary.push(value.to_vec());
100 idx
101 });
102 self.indices.push(index);
103 Ok(())
104 }
105
106 pub fn dict_size(&self) -> usize {
108 self.dictionary.len()
109 }
110
111 pub fn finish(self) -> (Vec<Vec<u8>>, Vec<u32>) {
113 (self.dictionary, self.indices)
114 }
115}
116
117impl Default for DictionaryEncoder {
118 fn default() -> Self {
119 Self::new()
120 }
121}
122
123pub struct DictionaryEncoderI64 {
125 dictionary: Vec<i64>,
126 map: HashMap<i64, u32>,
127 indices: Vec<u32>,
128}
129
130impl DictionaryEncoderI64 {
131 pub fn new() -> Self {
133 Self {
134 dictionary: Vec::new(),
135 map: HashMap::new(),
136 indices: Vec::new(),
137 }
138 }
139
140 pub fn encode(&mut self, value: i64) {
142 let index = *self.map.entry(value).or_insert_with(|| {
143 let idx = self.dictionary.len() as u32;
144 self.dictionary.push(value);
145 idx
146 });
147 self.indices.push(index);
148 }
149
150 pub fn dict_size(&self) -> usize {
152 self.dictionary.len()
153 }
154
155 pub fn finish(self) -> (Vec<i64>, Vec<u32>) {
157 (self.dictionary, self.indices)
158 }
159}
160
161impl Default for DictionaryEncoderI64 {
162 fn default() -> Self {
163 Self::new()
164 }
165}
166
167pub struct DictionaryEncoderF64 {
169 dictionary: Vec<f64>,
170 map: HashMap<u64, u32>, indices: Vec<u32>,
172}
173
174impl DictionaryEncoderF64 {
175 pub fn new() -> Self {
177 Self {
178 dictionary: Vec::new(),
179 map: HashMap::new(),
180 indices: Vec::new(),
181 }
182 }
183
184 pub fn encode(&mut self, value: f64) {
186 let bits = value.to_bits();
187 let index = *self.map.entry(bits).or_insert_with(|| {
188 let idx = self.dictionary.len() as u32;
189 self.dictionary.push(value);
190 idx
191 });
192 self.indices.push(index);
193 }
194
195 pub fn dict_size(&self) -> usize {
197 self.dictionary.len()
198 }
199
200 pub fn finish(self) -> (Vec<f64>, Vec<u32>) {
202 (self.dictionary, self.indices)
203 }
204}
205
206impl Default for DictionaryEncoderF64 {
207 fn default() -> Self {
208 Self::new()
209 }
210}
211
212#[cfg(test)]
213mod tests {
214 use super::*;
215
216 #[test]
217 fn test_dict_low_cardinality() {
218 let data = vec![1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3];
219 let encoded = encode(&data).unwrap();
220 let decoded = decode(&encoded).unwrap();
223 assert_eq!(decoded, data);
224 }
225
226 #[test]
227 fn test_dict_repeated() {
228 let data = vec![5u8; 100];
229 let encoded = encode(&data).unwrap();
230 let decoded = decode(&encoded).unwrap();
233 assert_eq!(decoded, data);
234 }
235
236 #[test]
237 fn test_dict_encoder_strings() {
238 let mut encoder = DictionaryEncoder::new();
239 encoder.encode(b"hello").unwrap();
240 encoder.encode(b"world").unwrap();
241 encoder.encode(b"hello").unwrap();
242 encoder.encode(b"world").unwrap();
243
244 assert_eq!(encoder.dict_size(), 2);
245 let (dict, indices) = encoder.finish();
246 assert_eq!(dict.len(), 2);
247 assert_eq!(indices, vec![0, 1, 0, 1]);
248 }
249
250 #[test]
251 fn test_dict_encoder_i64() {
252 let mut encoder = DictionaryEncoderI64::new();
253 for i in 0..100 {
254 encoder.encode(i % 10); }
256
257 assert_eq!(encoder.dict_size(), 10);
258 let (dict, indices) = encoder.finish();
259 assert_eq!(dict.len(), 10);
260 assert_eq!(indices.len(), 100);
261 }
262
263 #[test]
264 fn test_dict_encoder_f64() {
265 let mut encoder = DictionaryEncoderF64::new();
266 for i in 0..100 {
267 encoder.encode((i % 5) as f64 * 0.5); }
269
270 assert_eq!(encoder.dict_size(), 5);
271 let (dict, indices) = encoder.finish();
272 assert_eq!(dict.len(), 5);
273 assert_eq!(indices.len(), 100);
274 }
275}
276
277
278
279
280