grafeo_core/codec/
dictionary.rs1use std::collections::HashMap;
25use std::sync::Arc;
26
27#[derive(Debug, Clone)]
33pub struct DictionaryEncoding {
34 dictionary: Arc<[Arc<str>]>,
36 codes: Vec<u32>,
38 null_bitmap: Option<Vec<u64>>,
40}
41
42impl DictionaryEncoding {
43 pub fn new(dictionary: Arc<[Arc<str>]>, codes: Vec<u32>) -> Self {
45 Self {
46 dictionary,
47 codes,
48 null_bitmap: None,
49 }
50 }
51
52 pub fn with_nulls(mut self, null_bitmap: Vec<u64>) -> Self {
54 self.null_bitmap = Some(null_bitmap);
55 self
56 }
57
58 pub fn len(&self) -> usize {
60 self.codes.len()
61 }
62
63 pub fn is_empty(&self) -> bool {
65 self.codes.is_empty()
66 }
67
68 pub fn dictionary_size(&self) -> usize {
70 self.dictionary.len()
71 }
72
73 pub fn dictionary(&self) -> &Arc<[Arc<str>]> {
75 &self.dictionary
76 }
77
78 pub fn codes(&self) -> &[u32] {
80 &self.codes
81 }
82
83 pub fn is_null(&self, index: usize) -> bool {
85 if let Some(bitmap) = &self.null_bitmap {
86 let word_idx = index / 64;
87 let bit_idx = index % 64;
88 if word_idx < bitmap.len() {
89 return (bitmap[word_idx] & (1 << bit_idx)) != 0;
90 }
91 }
92 false
93 }
94
95 pub fn get(&self, index: usize) -> Option<&str> {
99 if self.is_null(index) {
100 return None;
101 }
102 let code = self.codes.get(index)? as &u32;
103 self.dictionary.get(*code as usize).map(|s| s.as_ref())
104 }
105
106 pub fn get_code(&self, index: usize) -> Option<u32> {
108 if self.is_null(index) {
109 return None;
110 }
111 self.codes.get(index).copied()
112 }
113
114 pub fn iter(&self) -> impl Iterator<Item = Option<&str>> {
116 (0..self.len()).map(move |i| self.get(i))
117 }
118
119 pub fn compression_ratio(&self) -> f64 {
123 if self.codes.is_empty() {
124 return 1.0;
125 }
126
127 let original_size: usize = self
129 .codes
130 .iter()
131 .map(|&code| {
132 if (code as usize) < self.dictionary.len() {
133 self.dictionary[code as usize].len()
134 } else {
135 0
136 }
137 })
138 .sum();
139
140 let dict_size: usize = self.dictionary.iter().map(|s| s.len()).sum();
142 let codes_size = self.codes.len() * std::mem::size_of::<u32>();
143 let compressed_size = dict_size + codes_size;
144
145 if compressed_size == 0 {
146 return 1.0;
147 }
148
149 original_size as f64 / compressed_size as f64
150 }
151
152 pub fn encode(&self, value: &str) -> Option<u32> {
154 self.dictionary
155 .iter()
156 .position(|s| s.as_ref() == value)
157 .and_then(|i| u32::try_from(i).ok())
158 }
159
160 pub fn filter_by_code(&self, predicate: impl Fn(u32) -> bool) -> Vec<usize> {
162 self.codes
163 .iter()
164 .enumerate()
165 .filter_map(|(i, &code)| {
166 if !self.is_null(i) && predicate(code) {
167 Some(i)
168 } else {
169 None
170 }
171 })
172 .collect()
173 }
174}
175
176#[derive(Debug)]
181pub struct DictionaryBuilder {
182 string_to_code: HashMap<Arc<str>, u32>,
184 dictionary: Vec<Arc<str>>,
186 codes: Vec<u32>,
188 null_positions: Vec<usize>,
190}
191
192impl DictionaryBuilder {
193 pub fn new() -> Self {
195 Self {
196 string_to_code: HashMap::new(),
197 dictionary: Vec::new(),
198 codes: Vec::new(),
199 null_positions: Vec::new(),
200 }
201 }
202
203 pub fn with_capacity(value_capacity: usize, dictionary_capacity: usize) -> Self {
205 Self {
206 string_to_code: HashMap::with_capacity(dictionary_capacity),
207 dictionary: Vec::with_capacity(dictionary_capacity),
208 codes: Vec::with_capacity(value_capacity),
209 null_positions: Vec::new(),
210 }
211 }
212
213 pub fn add(&mut self, value: &str) -> u32 {
217 if let Some(&code) = self.string_to_code.get(value) {
218 self.codes.push(code);
219 code
220 } else {
221 #[allow(clippy::cast_possible_truncation)]
223 let code = self.dictionary.len() as u32;
224 let arc_value: Arc<str> = value.into();
225 self.string_to_code.insert(arc_value.clone(), code);
226 self.dictionary.push(arc_value);
227 self.codes.push(code);
228 code
229 }
230 }
231
232 pub fn add_null(&mut self) {
234 let idx = self.codes.len();
235 self.null_positions.push(idx);
236 self.codes.push(0); }
238
239 pub fn add_optional(&mut self, value: Option<&str>) -> Option<u32> {
241 match value {
242 Some(v) => Some(self.add(v)),
243 None => {
244 self.add_null();
245 None
246 }
247 }
248 }
249
250 pub fn len(&self) -> usize {
252 self.codes.len()
253 }
254
255 pub fn is_empty(&self) -> bool {
257 self.codes.is_empty()
258 }
259
260 pub fn dictionary_size(&self) -> usize {
262 self.dictionary.len()
263 }
264
265 pub fn build(self) -> DictionaryEncoding {
267 let null_bitmap = if self.null_positions.is_empty() {
268 None
269 } else {
270 let num_words = (self.codes.len() + 63) / 64;
271 let mut bitmap = vec![0u64; num_words];
272 for &pos in &self.null_positions {
273 let word_idx = pos / 64;
274 let bit_idx = pos % 64;
275 bitmap[word_idx] |= 1 << bit_idx;
276 }
277 Some(bitmap)
278 };
279
280 let dict: Arc<[Arc<str>]> = self.dictionary.into();
281
282 let mut encoding = DictionaryEncoding::new(dict, self.codes);
283 if let Some(bitmap) = null_bitmap {
284 encoding = encoding.with_nulls(bitmap);
285 }
286 encoding
287 }
288
289 pub fn clear(&mut self) {
291 self.string_to_code.clear();
292 self.dictionary.clear();
293 self.codes.clear();
294 self.null_positions.clear();
295 }
296}
297
298impl Default for DictionaryBuilder {
299 fn default() -> Self {
300 Self::new()
301 }
302}
303
304pub trait IntoDictionaryEncoding {
306 fn into_dictionary_encoding(self) -> DictionaryEncoding;
308}
309
310impl<'a, I> IntoDictionaryEncoding for I
311where
312 I: IntoIterator<Item = &'a str>,
313{
314 fn into_dictionary_encoding(self) -> DictionaryEncoding {
315 let mut builder = DictionaryBuilder::new();
316 for s in self {
317 builder.add(s);
318 }
319 builder.build()
320 }
321}
322
323#[cfg(test)]
324mod tests {
325 use super::*;
326
327 #[test]
328 fn test_dictionary_builder_basic() {
329 let mut builder = DictionaryBuilder::new();
330 builder.add("apple");
331 builder.add("banana");
332 builder.add("apple");
333 builder.add("cherry");
334 builder.add("apple");
335
336 let dict = builder.build();
337
338 assert_eq!(dict.len(), 5);
339 assert_eq!(dict.dictionary_size(), 3);
340
341 assert_eq!(dict.get(0), Some("apple"));
342 assert_eq!(dict.get(1), Some("banana"));
343 assert_eq!(dict.get(2), Some("apple"));
344 assert_eq!(dict.get(3), Some("cherry"));
345 assert_eq!(dict.get(4), Some("apple"));
346 }
347
348 #[test]
349 fn test_dictionary_codes() {
350 let mut builder = DictionaryBuilder::new();
351 let code_apple = builder.add("apple");
352 let code_banana = builder.add("banana");
353 let code_apple2 = builder.add("apple");
354
355 assert_eq!(code_apple, code_apple2);
356 assert_ne!(code_apple, code_banana);
357
358 let dict = builder.build();
359 assert_eq!(dict.codes(), &[0, 1, 0]);
360 }
361
362 #[test]
363 fn test_dictionary_with_nulls() {
364 let mut builder = DictionaryBuilder::new();
365 builder.add("apple");
366 builder.add_null();
367 builder.add("banana");
368 builder.add_null();
369
370 let dict = builder.build();
371
372 assert_eq!(dict.len(), 4);
373 assert_eq!(dict.get(0), Some("apple"));
374 assert_eq!(dict.get(1), None);
375 assert!(dict.is_null(1));
376 assert_eq!(dict.get(2), Some("banana"));
377 assert_eq!(dict.get(3), None);
378 assert!(dict.is_null(3));
379 }
380
381 #[test]
382 fn test_dictionary_encode_lookup() {
383 let mut builder = DictionaryBuilder::new();
384 builder.add("apple");
385 builder.add("banana");
386 builder.add("cherry");
387
388 let dict = builder.build();
389
390 assert_eq!(dict.encode("apple"), Some(0));
391 assert_eq!(dict.encode("banana"), Some(1));
392 assert_eq!(dict.encode("cherry"), Some(2));
393 assert_eq!(dict.encode("date"), None);
394 }
395
396 #[test]
397 fn test_dictionary_filter_by_code() {
398 let mut builder = DictionaryBuilder::new();
399 builder.add("apple");
400 builder.add("banana");
401 builder.add("apple");
402 builder.add("cherry");
403 builder.add("apple");
404
405 let dict = builder.build();
406 let apple_code = dict.encode("apple").unwrap();
407
408 let indices = dict.filter_by_code(|code| code == apple_code);
409 assert_eq!(indices, vec![0, 2, 4]);
410 }
411
412 #[test]
413 fn test_compression_ratio() {
414 let mut builder = DictionaryBuilder::new();
415
416 for _ in 0..100 {
418 builder.add("this_is_a_very_long_string_that_repeats_many_times");
419 }
420
421 let dict = builder.build();
422
423 let ratio = dict.compression_ratio();
425 assert!(ratio > 1.0, "Expected compression ratio > 1, got {}", ratio);
426 }
427
428 #[test]
429 fn test_into_dictionary_encoding() {
430 let strings = vec!["apple", "banana", "apple", "cherry"];
431 let dict: DictionaryEncoding = strings.into_iter().into_dictionary_encoding();
432
433 assert_eq!(dict.len(), 4);
434 assert_eq!(dict.dictionary_size(), 3);
435 }
436
437 #[test]
438 fn test_empty_dictionary() {
439 let builder = DictionaryBuilder::new();
440 let dict = builder.build();
441
442 assert!(dict.is_empty());
443 assert_eq!(dict.dictionary_size(), 0);
444 assert_eq!(dict.get(0), None);
445 }
446
447 #[test]
448 fn test_single_value() {
449 let mut builder = DictionaryBuilder::new();
450 builder.add("only_value");
451
452 let dict = builder.build();
453
454 assert_eq!(dict.len(), 1);
455 assert_eq!(dict.dictionary_size(), 1);
456 assert_eq!(dict.get(0), Some("only_value"));
457 }
458
459 #[test]
460 fn test_all_unique() {
461 let mut builder = DictionaryBuilder::new();
462 builder.add("a");
463 builder.add("b");
464 builder.add("c");
465 builder.add("d");
466
467 let dict = builder.build();
468
469 assert_eq!(dict.len(), 4);
470 assert_eq!(dict.dictionary_size(), 4);
471 assert_eq!(dict.codes(), &[0, 1, 2, 3]);
472 }
473
474 #[test]
475 fn test_all_same() {
476 let mut builder = DictionaryBuilder::new();
477 for _ in 0..10 {
478 builder.add("same");
479 }
480
481 let dict = builder.build();
482
483 assert_eq!(dict.len(), 10);
484 assert_eq!(dict.dictionary_size(), 1);
485 assert!(dict.codes().iter().all(|&c| c == 0));
486 }
487}