grafeo_core/storage/
dictionary.rs1use std::collections::HashMap;
25use std::sync::Arc;
26
27#[derive(Debug, Clone)]
33pub struct DictionaryEncoding {
34 dictionary: Arc<[Arc<str>]>,
36 codes: Vec<u32>,
38 null_bitmap: Option<Vec<u64>>,
40}
41
42impl DictionaryEncoding {
43 pub fn new(dictionary: Arc<[Arc<str>]>, codes: Vec<u32>) -> Self {
45 Self {
46 dictionary,
47 codes,
48 null_bitmap: None,
49 }
50 }
51
52 pub fn with_nulls(mut self, null_bitmap: Vec<u64>) -> Self {
54 self.null_bitmap = Some(null_bitmap);
55 self
56 }
57
58 pub fn len(&self) -> usize {
60 self.codes.len()
61 }
62
63 pub fn is_empty(&self) -> bool {
65 self.codes.is_empty()
66 }
67
68 pub fn dictionary_size(&self) -> usize {
70 self.dictionary.len()
71 }
72
73 pub fn dictionary(&self) -> &Arc<[Arc<str>]> {
75 &self.dictionary
76 }
77
78 pub fn codes(&self) -> &[u32] {
80 &self.codes
81 }
82
83 pub fn is_null(&self, index: usize) -> bool {
85 if let Some(bitmap) = &self.null_bitmap {
86 let word_idx = index / 64;
87 let bit_idx = index % 64;
88 if word_idx < bitmap.len() {
89 return (bitmap[word_idx] & (1 << bit_idx)) != 0;
90 }
91 }
92 false
93 }
94
95 pub fn get(&self, index: usize) -> Option<&str> {
99 if self.is_null(index) {
100 return None;
101 }
102 let code = self.codes.get(index)? as &u32;
103 self.dictionary.get(*code as usize).map(|s| s.as_ref())
104 }
105
106 pub fn get_code(&self, index: usize) -> Option<u32> {
108 if self.is_null(index) {
109 return None;
110 }
111 self.codes.get(index).copied()
112 }
113
114 pub fn iter(&self) -> impl Iterator<Item = Option<&str>> {
116 (0..self.len()).map(move |i| self.get(i))
117 }
118
119 pub fn compression_ratio(&self) -> f64 {
123 if self.codes.is_empty() {
124 return 1.0;
125 }
126
127 let original_size: usize = self
129 .codes
130 .iter()
131 .map(|&code| {
132 if (code as usize) < self.dictionary.len() {
133 self.dictionary[code as usize].len()
134 } else {
135 0
136 }
137 })
138 .sum();
139
140 let dict_size: usize = self.dictionary.iter().map(|s| s.len()).sum();
142 let codes_size = self.codes.len() * std::mem::size_of::<u32>();
143 let compressed_size = dict_size + codes_size;
144
145 if compressed_size == 0 {
146 return 1.0;
147 }
148
149 original_size as f64 / compressed_size as f64
150 }
151
152 pub fn encode(&self, value: &str) -> Option<u32> {
154 self.dictionary
155 .iter()
156 .position(|s| s.as_ref() == value)
157 .map(|i| i as u32)
158 }
159
160 pub fn filter_by_code(&self, predicate: impl Fn(u32) -> bool) -> Vec<usize> {
162 self.codes
163 .iter()
164 .enumerate()
165 .filter_map(|(i, &code)| {
166 if !self.is_null(i) && predicate(code) {
167 Some(i)
168 } else {
169 None
170 }
171 })
172 .collect()
173 }
174}
175
176#[derive(Debug)]
181pub struct DictionaryBuilder {
182 string_to_code: HashMap<Arc<str>, u32>,
184 dictionary: Vec<Arc<str>>,
186 codes: Vec<u32>,
188 null_positions: Vec<usize>,
190}
191
192impl DictionaryBuilder {
193 pub fn new() -> Self {
195 Self {
196 string_to_code: HashMap::new(),
197 dictionary: Vec::new(),
198 codes: Vec::new(),
199 null_positions: Vec::new(),
200 }
201 }
202
203 pub fn with_capacity(value_capacity: usize, dictionary_capacity: usize) -> Self {
205 Self {
206 string_to_code: HashMap::with_capacity(dictionary_capacity),
207 dictionary: Vec::with_capacity(dictionary_capacity),
208 codes: Vec::with_capacity(value_capacity),
209 null_positions: Vec::new(),
210 }
211 }
212
213 pub fn add(&mut self, value: &str) -> u32 {
217 if let Some(&code) = self.string_to_code.get(value) {
218 self.codes.push(code);
219 code
220 } else {
221 let code = self.dictionary.len() as u32;
222 let arc_value: Arc<str> = value.into();
223 self.string_to_code.insert(arc_value.clone(), code);
224 self.dictionary.push(arc_value);
225 self.codes.push(code);
226 code
227 }
228 }
229
230 pub fn add_null(&mut self) {
232 let idx = self.codes.len();
233 self.null_positions.push(idx);
234 self.codes.push(0); }
236
237 pub fn add_optional(&mut self, value: Option<&str>) -> Option<u32> {
239 match value {
240 Some(v) => Some(self.add(v)),
241 None => {
242 self.add_null();
243 None
244 }
245 }
246 }
247
248 pub fn len(&self) -> usize {
250 self.codes.len()
251 }
252
253 pub fn is_empty(&self) -> bool {
255 self.codes.is_empty()
256 }
257
258 pub fn dictionary_size(&self) -> usize {
260 self.dictionary.len()
261 }
262
263 pub fn build(self) -> DictionaryEncoding {
265 let null_bitmap = if self.null_positions.is_empty() {
266 None
267 } else {
268 let num_words = (self.codes.len() + 63) / 64;
269 let mut bitmap = vec![0u64; num_words];
270 for &pos in &self.null_positions {
271 let word_idx = pos / 64;
272 let bit_idx = pos % 64;
273 bitmap[word_idx] |= 1 << bit_idx;
274 }
275 Some(bitmap)
276 };
277
278 let dict: Arc<[Arc<str>]> = self.dictionary.into();
279
280 let mut encoding = DictionaryEncoding::new(dict, self.codes);
281 if let Some(bitmap) = null_bitmap {
282 encoding = encoding.with_nulls(bitmap);
283 }
284 encoding
285 }
286
287 pub fn clear(&mut self) {
289 self.string_to_code.clear();
290 self.dictionary.clear();
291 self.codes.clear();
292 self.null_positions.clear();
293 }
294}
295
296impl Default for DictionaryBuilder {
297 fn default() -> Self {
298 Self::new()
299 }
300}
301
302pub trait IntoDictionaryEncoding {
304 fn into_dictionary_encoding(self) -> DictionaryEncoding;
306}
307
308impl<'a, I> IntoDictionaryEncoding for I
309where
310 I: IntoIterator<Item = &'a str>,
311{
312 fn into_dictionary_encoding(self) -> DictionaryEncoding {
313 let mut builder = DictionaryBuilder::new();
314 for s in self {
315 builder.add(s);
316 }
317 builder.build()
318 }
319}
320
321#[cfg(test)]
322mod tests {
323 use super::*;
324
325 #[test]
326 fn test_dictionary_builder_basic() {
327 let mut builder = DictionaryBuilder::new();
328 builder.add("apple");
329 builder.add("banana");
330 builder.add("apple");
331 builder.add("cherry");
332 builder.add("apple");
333
334 let dict = builder.build();
335
336 assert_eq!(dict.len(), 5);
337 assert_eq!(dict.dictionary_size(), 3);
338
339 assert_eq!(dict.get(0), Some("apple"));
340 assert_eq!(dict.get(1), Some("banana"));
341 assert_eq!(dict.get(2), Some("apple"));
342 assert_eq!(dict.get(3), Some("cherry"));
343 assert_eq!(dict.get(4), Some("apple"));
344 }
345
346 #[test]
347 fn test_dictionary_codes() {
348 let mut builder = DictionaryBuilder::new();
349 let code_apple = builder.add("apple");
350 let code_banana = builder.add("banana");
351 let code_apple2 = builder.add("apple");
352
353 assert_eq!(code_apple, code_apple2);
354 assert_ne!(code_apple, code_banana);
355
356 let dict = builder.build();
357 assert_eq!(dict.codes(), &[0, 1, 0]);
358 }
359
360 #[test]
361 fn test_dictionary_with_nulls() {
362 let mut builder = DictionaryBuilder::new();
363 builder.add("apple");
364 builder.add_null();
365 builder.add("banana");
366 builder.add_null();
367
368 let dict = builder.build();
369
370 assert_eq!(dict.len(), 4);
371 assert_eq!(dict.get(0), Some("apple"));
372 assert_eq!(dict.get(1), None);
373 assert!(dict.is_null(1));
374 assert_eq!(dict.get(2), Some("banana"));
375 assert_eq!(dict.get(3), None);
376 assert!(dict.is_null(3));
377 }
378
379 #[test]
380 fn test_dictionary_encode_lookup() {
381 let mut builder = DictionaryBuilder::new();
382 builder.add("apple");
383 builder.add("banana");
384 builder.add("cherry");
385
386 let dict = builder.build();
387
388 assert_eq!(dict.encode("apple"), Some(0));
389 assert_eq!(dict.encode("banana"), Some(1));
390 assert_eq!(dict.encode("cherry"), Some(2));
391 assert_eq!(dict.encode("date"), None);
392 }
393
394 #[test]
395 fn test_dictionary_filter_by_code() {
396 let mut builder = DictionaryBuilder::new();
397 builder.add("apple");
398 builder.add("banana");
399 builder.add("apple");
400 builder.add("cherry");
401 builder.add("apple");
402
403 let dict = builder.build();
404 let apple_code = dict.encode("apple").unwrap();
405
406 let indices = dict.filter_by_code(|code| code == apple_code);
407 assert_eq!(indices, vec![0, 2, 4]);
408 }
409
410 #[test]
411 fn test_compression_ratio() {
412 let mut builder = DictionaryBuilder::new();
413
414 for _ in 0..100 {
416 builder.add("this_is_a_very_long_string_that_repeats_many_times");
417 }
418
419 let dict = builder.build();
420
421 let ratio = dict.compression_ratio();
423 assert!(ratio > 1.0, "Expected compression ratio > 1, got {}", ratio);
424 }
425
426 #[test]
427 fn test_into_dictionary_encoding() {
428 let strings = vec!["apple", "banana", "apple", "cherry"];
429 let dict: DictionaryEncoding = strings.into_iter().into_dictionary_encoding();
430
431 assert_eq!(dict.len(), 4);
432 assert_eq!(dict.dictionary_size(), 3);
433 }
434
435 #[test]
436 fn test_empty_dictionary() {
437 let builder = DictionaryBuilder::new();
438 let dict = builder.build();
439
440 assert!(dict.is_empty());
441 assert_eq!(dict.dictionary_size(), 0);
442 assert_eq!(dict.get(0), None);
443 }
444
445 #[test]
446 fn test_single_value() {
447 let mut builder = DictionaryBuilder::new();
448 builder.add("only_value");
449
450 let dict = builder.build();
451
452 assert_eq!(dict.len(), 1);
453 assert_eq!(dict.dictionary_size(), 1);
454 assert_eq!(dict.get(0), Some("only_value"));
455 }
456
457 #[test]
458 fn test_all_unique() {
459 let mut builder = DictionaryBuilder::new();
460 builder.add("a");
461 builder.add("b");
462 builder.add("c");
463 builder.add("d");
464
465 let dict = builder.build();
466
467 assert_eq!(dict.len(), 4);
468 assert_eq!(dict.dictionary_size(), 4);
469 assert_eq!(dict.codes(), &[0, 1, 2, 3]);
470 }
471
472 #[test]
473 fn test_all_same() {
474 let mut builder = DictionaryBuilder::new();
475 for _ in 0..10 {
476 builder.add("same");
477 }
478
479 let dict = builder.build();
480
481 assert_eq!(dict.len(), 10);
482 assert_eq!(dict.dictionary_size(), 1);
483 assert!(dict.codes().iter().all(|&c| c == 0));
484 }
485}