grafeo_core/storage/
dictionary.rs1use std::collections::HashMap;
22use std::sync::Arc;
23
24#[derive(Debug, Clone)]
28pub struct DictionaryEncoding {
29 dictionary: Arc<[Arc<str>]>,
31 codes: Vec<u32>,
33 null_bitmap: Option<Vec<u64>>,
35}
36
37impl DictionaryEncoding {
38 pub fn new(dictionary: Arc<[Arc<str>]>, codes: Vec<u32>) -> Self {
40 Self {
41 dictionary,
42 codes,
43 null_bitmap: None,
44 }
45 }
46
47 pub fn with_nulls(mut self, null_bitmap: Vec<u64>) -> Self {
49 self.null_bitmap = Some(null_bitmap);
50 self
51 }
52
53 pub fn len(&self) -> usize {
55 self.codes.len()
56 }
57
58 pub fn is_empty(&self) -> bool {
60 self.codes.is_empty()
61 }
62
63 pub fn dictionary_size(&self) -> usize {
65 self.dictionary.len()
66 }
67
68 pub fn dictionary(&self) -> &Arc<[Arc<str>]> {
70 &self.dictionary
71 }
72
73 pub fn codes(&self) -> &[u32] {
75 &self.codes
76 }
77
78 pub fn is_null(&self, index: usize) -> bool {
80 if let Some(bitmap) = &self.null_bitmap {
81 let word_idx = index / 64;
82 let bit_idx = index % 64;
83 if word_idx < bitmap.len() {
84 return (bitmap[word_idx] & (1 << bit_idx)) != 0;
85 }
86 }
87 false
88 }
89
90 pub fn get(&self, index: usize) -> Option<&str> {
94 if self.is_null(index) {
95 return None;
96 }
97 let code = self.codes.get(index)? as &u32;
98 self.dictionary.get(*code as usize).map(|s| s.as_ref())
99 }
100
101 pub fn get_code(&self, index: usize) -> Option<u32> {
103 if self.is_null(index) {
104 return None;
105 }
106 self.codes.get(index).copied()
107 }
108
109 pub fn iter(&self) -> impl Iterator<Item = Option<&str>> {
111 (0..self.len()).map(move |i| self.get(i))
112 }
113
114 pub fn compression_ratio(&self) -> f64 {
118 if self.codes.is_empty() {
119 return 1.0;
120 }
121
122 let original_size: usize = self
124 .codes
125 .iter()
126 .map(|&code| {
127 if (code as usize) < self.dictionary.len() {
128 self.dictionary[code as usize].len()
129 } else {
130 0
131 }
132 })
133 .sum();
134
135 let dict_size: usize = self.dictionary.iter().map(|s| s.len()).sum();
137 let codes_size = self.codes.len() * std::mem::size_of::<u32>();
138 let compressed_size = dict_size + codes_size;
139
140 if compressed_size == 0 {
141 return 1.0;
142 }
143
144 original_size as f64 / compressed_size as f64
145 }
146
147 pub fn encode(&self, value: &str) -> Option<u32> {
149 self.dictionary
150 .iter()
151 .position(|s| s.as_ref() == value)
152 .map(|i| i as u32)
153 }
154
155 pub fn filter_by_code(&self, predicate: impl Fn(u32) -> bool) -> Vec<usize> {
157 self.codes
158 .iter()
159 .enumerate()
160 .filter_map(|(i, &code)| {
161 if !self.is_null(i) && predicate(code) {
162 Some(i)
163 } else {
164 None
165 }
166 })
167 .collect()
168 }
169}
170
171#[derive(Debug)]
173pub struct DictionaryBuilder {
174 string_to_code: HashMap<Arc<str>, u32>,
176 dictionary: Vec<Arc<str>>,
178 codes: Vec<u32>,
180 null_positions: Vec<usize>,
182}
183
184impl DictionaryBuilder {
185 pub fn new() -> Self {
187 Self {
188 string_to_code: HashMap::new(),
189 dictionary: Vec::new(),
190 codes: Vec::new(),
191 null_positions: Vec::new(),
192 }
193 }
194
195 pub fn with_capacity(value_capacity: usize, dictionary_capacity: usize) -> Self {
197 Self {
198 string_to_code: HashMap::with_capacity(dictionary_capacity),
199 dictionary: Vec::with_capacity(dictionary_capacity),
200 codes: Vec::with_capacity(value_capacity),
201 null_positions: Vec::new(),
202 }
203 }
204
205 pub fn add(&mut self, value: &str) -> u32 {
209 if let Some(&code) = self.string_to_code.get(value) {
210 self.codes.push(code);
211 code
212 } else {
213 let code = self.dictionary.len() as u32;
214 let arc_value: Arc<str> = value.into();
215 self.string_to_code.insert(arc_value.clone(), code);
216 self.dictionary.push(arc_value);
217 self.codes.push(code);
218 code
219 }
220 }
221
222 pub fn add_null(&mut self) {
224 let idx = self.codes.len();
225 self.null_positions.push(idx);
226 self.codes.push(0); }
228
229 pub fn add_optional(&mut self, value: Option<&str>) -> Option<u32> {
231 match value {
232 Some(v) => Some(self.add(v)),
233 None => {
234 self.add_null();
235 None
236 }
237 }
238 }
239
240 pub fn len(&self) -> usize {
242 self.codes.len()
243 }
244
245 pub fn is_empty(&self) -> bool {
247 self.codes.is_empty()
248 }
249
250 pub fn dictionary_size(&self) -> usize {
252 self.dictionary.len()
253 }
254
255 pub fn build(self) -> DictionaryEncoding {
257 let null_bitmap = if self.null_positions.is_empty() {
258 None
259 } else {
260 let num_words = (self.codes.len() + 63) / 64;
261 let mut bitmap = vec![0u64; num_words];
262 for &pos in &self.null_positions {
263 let word_idx = pos / 64;
264 let bit_idx = pos % 64;
265 bitmap[word_idx] |= 1 << bit_idx;
266 }
267 Some(bitmap)
268 };
269
270 let dict: Arc<[Arc<str>]> = self.dictionary.into();
271
272 let mut encoding = DictionaryEncoding::new(dict, self.codes);
273 if let Some(bitmap) = null_bitmap {
274 encoding = encoding.with_nulls(bitmap);
275 }
276 encoding
277 }
278
279 pub fn clear(&mut self) {
281 self.string_to_code.clear();
282 self.dictionary.clear();
283 self.codes.clear();
284 self.null_positions.clear();
285 }
286}
287
288impl Default for DictionaryBuilder {
289 fn default() -> Self {
290 Self::new()
291 }
292}
293
294pub trait IntoDictionaryEncoding {
296 fn into_dictionary_encoding(self) -> DictionaryEncoding;
298}
299
300impl<'a, I> IntoDictionaryEncoding for I
301where
302 I: IntoIterator<Item = &'a str>,
303{
304 fn into_dictionary_encoding(self) -> DictionaryEncoding {
305 let mut builder = DictionaryBuilder::new();
306 for s in self {
307 builder.add(s);
308 }
309 builder.build()
310 }
311}
312
313#[cfg(test)]
314mod tests {
315 use super::*;
316
317 #[test]
318 fn test_dictionary_builder_basic() {
319 let mut builder = DictionaryBuilder::new();
320 builder.add("apple");
321 builder.add("banana");
322 builder.add("apple");
323 builder.add("cherry");
324 builder.add("apple");
325
326 let dict = builder.build();
327
328 assert_eq!(dict.len(), 5);
329 assert_eq!(dict.dictionary_size(), 3);
330
331 assert_eq!(dict.get(0), Some("apple"));
332 assert_eq!(dict.get(1), Some("banana"));
333 assert_eq!(dict.get(2), Some("apple"));
334 assert_eq!(dict.get(3), Some("cherry"));
335 assert_eq!(dict.get(4), Some("apple"));
336 }
337
338 #[test]
339 fn test_dictionary_codes() {
340 let mut builder = DictionaryBuilder::new();
341 let code_apple = builder.add("apple");
342 let code_banana = builder.add("banana");
343 let code_apple2 = builder.add("apple");
344
345 assert_eq!(code_apple, code_apple2);
346 assert_ne!(code_apple, code_banana);
347
348 let dict = builder.build();
349 assert_eq!(dict.codes(), &[0, 1, 0]);
350 }
351
352 #[test]
353 fn test_dictionary_with_nulls() {
354 let mut builder = DictionaryBuilder::new();
355 builder.add("apple");
356 builder.add_null();
357 builder.add("banana");
358 builder.add_null();
359
360 let dict = builder.build();
361
362 assert_eq!(dict.len(), 4);
363 assert_eq!(dict.get(0), Some("apple"));
364 assert_eq!(dict.get(1), None);
365 assert!(dict.is_null(1));
366 assert_eq!(dict.get(2), Some("banana"));
367 assert_eq!(dict.get(3), None);
368 assert!(dict.is_null(3));
369 }
370
371 #[test]
372 fn test_dictionary_encode_lookup() {
373 let mut builder = DictionaryBuilder::new();
374 builder.add("apple");
375 builder.add("banana");
376 builder.add("cherry");
377
378 let dict = builder.build();
379
380 assert_eq!(dict.encode("apple"), Some(0));
381 assert_eq!(dict.encode("banana"), Some(1));
382 assert_eq!(dict.encode("cherry"), Some(2));
383 assert_eq!(dict.encode("date"), None);
384 }
385
386 #[test]
387 fn test_dictionary_filter_by_code() {
388 let mut builder = DictionaryBuilder::new();
389 builder.add("apple");
390 builder.add("banana");
391 builder.add("apple");
392 builder.add("cherry");
393 builder.add("apple");
394
395 let dict = builder.build();
396 let apple_code = dict.encode("apple").unwrap();
397
398 let indices = dict.filter_by_code(|code| code == apple_code);
399 assert_eq!(indices, vec![0, 2, 4]);
400 }
401
402 #[test]
403 fn test_compression_ratio() {
404 let mut builder = DictionaryBuilder::new();
405
406 for _ in 0..100 {
408 builder.add("this_is_a_very_long_string_that_repeats_many_times");
409 }
410
411 let dict = builder.build();
412
413 let ratio = dict.compression_ratio();
415 assert!(ratio > 1.0, "Expected compression ratio > 1, got {}", ratio);
416 }
417
418 #[test]
419 fn test_into_dictionary_encoding() {
420 let strings = vec!["apple", "banana", "apple", "cherry"];
421 let dict: DictionaryEncoding = strings.into_iter().into_dictionary_encoding();
422
423 assert_eq!(dict.len(), 4);
424 assert_eq!(dict.dictionary_size(), 3);
425 }
426
427 #[test]
428 fn test_empty_dictionary() {
429 let builder = DictionaryBuilder::new();
430 let dict = builder.build();
431
432 assert!(dict.is_empty());
433 assert_eq!(dict.dictionary_size(), 0);
434 assert_eq!(dict.get(0), None);
435 }
436
437 #[test]
438 fn test_single_value() {
439 let mut builder = DictionaryBuilder::new();
440 builder.add("only_value");
441
442 let dict = builder.build();
443
444 assert_eq!(dict.len(), 1);
445 assert_eq!(dict.dictionary_size(), 1);
446 assert_eq!(dict.get(0), Some("only_value"));
447 }
448
449 #[test]
450 fn test_all_unique() {
451 let mut builder = DictionaryBuilder::new();
452 builder.add("a");
453 builder.add("b");
454 builder.add("c");
455 builder.add("d");
456
457 let dict = builder.build();
458
459 assert_eq!(dict.len(), 4);
460 assert_eq!(dict.dictionary_size(), 4);
461 assert_eq!(dict.codes(), &[0, 1, 2, 3]);
462 }
463
464 #[test]
465 fn test_all_same() {
466 let mut builder = DictionaryBuilder::new();
467 for _ in 0..10 {
468 builder.add("same");
469 }
470
471 let dict = builder.build();
472
473 assert_eq!(dict.len(), 10);
474 assert_eq!(dict.dictionary_size(), 1);
475 assert!(dict.codes().iter().all(|&c| c == 0));
476 }
477}