grafeo_core/storage/
dictionary.rs1use std::collections::HashMap;
24use std::sync::Arc;
25
26#[derive(Debug, Clone)]
32pub struct DictionaryEncoding {
33 dictionary: Arc<[Arc<str>]>,
35 codes: Vec<u32>,
37 null_bitmap: Option<Vec<u64>>,
39}
40
41impl DictionaryEncoding {
42 pub fn new(dictionary: Arc<[Arc<str>]>, codes: Vec<u32>) -> Self {
44 Self {
45 dictionary,
46 codes,
47 null_bitmap: None,
48 }
49 }
50
51 pub fn with_nulls(mut self, null_bitmap: Vec<u64>) -> Self {
53 self.null_bitmap = Some(null_bitmap);
54 self
55 }
56
57 pub fn len(&self) -> usize {
59 self.codes.len()
60 }
61
62 pub fn is_empty(&self) -> bool {
64 self.codes.is_empty()
65 }
66
67 pub fn dictionary_size(&self) -> usize {
69 self.dictionary.len()
70 }
71
72 pub fn dictionary(&self) -> &Arc<[Arc<str>]> {
74 &self.dictionary
75 }
76
77 pub fn codes(&self) -> &[u32] {
79 &self.codes
80 }
81
82 pub fn is_null(&self, index: usize) -> bool {
84 if let Some(bitmap) = &self.null_bitmap {
85 let word_idx = index / 64;
86 let bit_idx = index % 64;
87 if word_idx < bitmap.len() {
88 return (bitmap[word_idx] & (1 << bit_idx)) != 0;
89 }
90 }
91 false
92 }
93
94 pub fn get(&self, index: usize) -> Option<&str> {
98 if self.is_null(index) {
99 return None;
100 }
101 let code = self.codes.get(index)? as &u32;
102 self.dictionary.get(*code as usize).map(|s| s.as_ref())
103 }
104
105 pub fn get_code(&self, index: usize) -> Option<u32> {
107 if self.is_null(index) {
108 return None;
109 }
110 self.codes.get(index).copied()
111 }
112
113 pub fn iter(&self) -> impl Iterator<Item = Option<&str>> {
115 (0..self.len()).map(move |i| self.get(i))
116 }
117
118 pub fn compression_ratio(&self) -> f64 {
122 if self.codes.is_empty() {
123 return 1.0;
124 }
125
126 let original_size: usize = self
128 .codes
129 .iter()
130 .map(|&code| {
131 if (code as usize) < self.dictionary.len() {
132 self.dictionary[code as usize].len()
133 } else {
134 0
135 }
136 })
137 .sum();
138
139 let dict_size: usize = self.dictionary.iter().map(|s| s.len()).sum();
141 let codes_size = self.codes.len() * std::mem::size_of::<u32>();
142 let compressed_size = dict_size + codes_size;
143
144 if compressed_size == 0 {
145 return 1.0;
146 }
147
148 original_size as f64 / compressed_size as f64
149 }
150
151 pub fn encode(&self, value: &str) -> Option<u32> {
153 self.dictionary
154 .iter()
155 .position(|s| s.as_ref() == value)
156 .map(|i| i as u32)
157 }
158
159 pub fn filter_by_code(&self, predicate: impl Fn(u32) -> bool) -> Vec<usize> {
161 self.codes
162 .iter()
163 .enumerate()
164 .filter_map(|(i, &code)| {
165 if !self.is_null(i) && predicate(code) {
166 Some(i)
167 } else {
168 None
169 }
170 })
171 .collect()
172 }
173}
174
175#[derive(Debug)]
180pub struct DictionaryBuilder {
181 string_to_code: HashMap<Arc<str>, u32>,
183 dictionary: Vec<Arc<str>>,
185 codes: Vec<u32>,
187 null_positions: Vec<usize>,
189}
190
191impl DictionaryBuilder {
192 pub fn new() -> Self {
194 Self {
195 string_to_code: HashMap::new(),
196 dictionary: Vec::new(),
197 codes: Vec::new(),
198 null_positions: Vec::new(),
199 }
200 }
201
202 pub fn with_capacity(value_capacity: usize, dictionary_capacity: usize) -> Self {
204 Self {
205 string_to_code: HashMap::with_capacity(dictionary_capacity),
206 dictionary: Vec::with_capacity(dictionary_capacity),
207 codes: Vec::with_capacity(value_capacity),
208 null_positions: Vec::new(),
209 }
210 }
211
212 pub fn add(&mut self, value: &str) -> u32 {
216 if let Some(&code) = self.string_to_code.get(value) {
217 self.codes.push(code);
218 code
219 } else {
220 let code = self.dictionary.len() as u32;
221 let arc_value: Arc<str> = value.into();
222 self.string_to_code.insert(arc_value.clone(), code);
223 self.dictionary.push(arc_value);
224 self.codes.push(code);
225 code
226 }
227 }
228
229 pub fn add_null(&mut self) {
231 let idx = self.codes.len();
232 self.null_positions.push(idx);
233 self.codes.push(0); }
235
236 pub fn add_optional(&mut self, value: Option<&str>) -> Option<u32> {
238 match value {
239 Some(v) => Some(self.add(v)),
240 None => {
241 self.add_null();
242 None
243 }
244 }
245 }
246
247 pub fn len(&self) -> usize {
249 self.codes.len()
250 }
251
252 pub fn is_empty(&self) -> bool {
254 self.codes.is_empty()
255 }
256
257 pub fn dictionary_size(&self) -> usize {
259 self.dictionary.len()
260 }
261
262 pub fn build(self) -> DictionaryEncoding {
264 let null_bitmap = if self.null_positions.is_empty() {
265 None
266 } else {
267 let num_words = (self.codes.len() + 63) / 64;
268 let mut bitmap = vec![0u64; num_words];
269 for &pos in &self.null_positions {
270 let word_idx = pos / 64;
271 let bit_idx = pos % 64;
272 bitmap[word_idx] |= 1 << bit_idx;
273 }
274 Some(bitmap)
275 };
276
277 let dict: Arc<[Arc<str>]> = self.dictionary.into();
278
279 let mut encoding = DictionaryEncoding::new(dict, self.codes);
280 if let Some(bitmap) = null_bitmap {
281 encoding = encoding.with_nulls(bitmap);
282 }
283 encoding
284 }
285
286 pub fn clear(&mut self) {
288 self.string_to_code.clear();
289 self.dictionary.clear();
290 self.codes.clear();
291 self.null_positions.clear();
292 }
293}
294
295impl Default for DictionaryBuilder {
296 fn default() -> Self {
297 Self::new()
298 }
299}
300
301pub trait IntoDictionaryEncoding {
303 fn into_dictionary_encoding(self) -> DictionaryEncoding;
305}
306
307impl<'a, I> IntoDictionaryEncoding for I
308where
309 I: IntoIterator<Item = &'a str>,
310{
311 fn into_dictionary_encoding(self) -> DictionaryEncoding {
312 let mut builder = DictionaryBuilder::new();
313 for s in self {
314 builder.add(s);
315 }
316 builder.build()
317 }
318}
319
320#[cfg(test)]
321mod tests {
322 use super::*;
323
324 #[test]
325 fn test_dictionary_builder_basic() {
326 let mut builder = DictionaryBuilder::new();
327 builder.add("apple");
328 builder.add("banana");
329 builder.add("apple");
330 builder.add("cherry");
331 builder.add("apple");
332
333 let dict = builder.build();
334
335 assert_eq!(dict.len(), 5);
336 assert_eq!(dict.dictionary_size(), 3);
337
338 assert_eq!(dict.get(0), Some("apple"));
339 assert_eq!(dict.get(1), Some("banana"));
340 assert_eq!(dict.get(2), Some("apple"));
341 assert_eq!(dict.get(3), Some("cherry"));
342 assert_eq!(dict.get(4), Some("apple"));
343 }
344
345 #[test]
346 fn test_dictionary_codes() {
347 let mut builder = DictionaryBuilder::new();
348 let code_apple = builder.add("apple");
349 let code_banana = builder.add("banana");
350 let code_apple2 = builder.add("apple");
351
352 assert_eq!(code_apple, code_apple2);
353 assert_ne!(code_apple, code_banana);
354
355 let dict = builder.build();
356 assert_eq!(dict.codes(), &[0, 1, 0]);
357 }
358
359 #[test]
360 fn test_dictionary_with_nulls() {
361 let mut builder = DictionaryBuilder::new();
362 builder.add("apple");
363 builder.add_null();
364 builder.add("banana");
365 builder.add_null();
366
367 let dict = builder.build();
368
369 assert_eq!(dict.len(), 4);
370 assert_eq!(dict.get(0), Some("apple"));
371 assert_eq!(dict.get(1), None);
372 assert!(dict.is_null(1));
373 assert_eq!(dict.get(2), Some("banana"));
374 assert_eq!(dict.get(3), None);
375 assert!(dict.is_null(3));
376 }
377
378 #[test]
379 fn test_dictionary_encode_lookup() {
380 let mut builder = DictionaryBuilder::new();
381 builder.add("apple");
382 builder.add("banana");
383 builder.add("cherry");
384
385 let dict = builder.build();
386
387 assert_eq!(dict.encode("apple"), Some(0));
388 assert_eq!(dict.encode("banana"), Some(1));
389 assert_eq!(dict.encode("cherry"), Some(2));
390 assert_eq!(dict.encode("date"), None);
391 }
392
393 #[test]
394 fn test_dictionary_filter_by_code() {
395 let mut builder = DictionaryBuilder::new();
396 builder.add("apple");
397 builder.add("banana");
398 builder.add("apple");
399 builder.add("cherry");
400 builder.add("apple");
401
402 let dict = builder.build();
403 let apple_code = dict.encode("apple").unwrap();
404
405 let indices = dict.filter_by_code(|code| code == apple_code);
406 assert_eq!(indices, vec![0, 2, 4]);
407 }
408
409 #[test]
410 fn test_compression_ratio() {
411 let mut builder = DictionaryBuilder::new();
412
413 for _ in 0..100 {
415 builder.add("this_is_a_very_long_string_that_repeats_many_times");
416 }
417
418 let dict = builder.build();
419
420 let ratio = dict.compression_ratio();
422 assert!(ratio > 1.0, "Expected compression ratio > 1, got {}", ratio);
423 }
424
425 #[test]
426 fn test_into_dictionary_encoding() {
427 let strings = vec!["apple", "banana", "apple", "cherry"];
428 let dict: DictionaryEncoding = strings.into_iter().into_dictionary_encoding();
429
430 assert_eq!(dict.len(), 4);
431 assert_eq!(dict.dictionary_size(), 3);
432 }
433
434 #[test]
435 fn test_empty_dictionary() {
436 let builder = DictionaryBuilder::new();
437 let dict = builder.build();
438
439 assert!(dict.is_empty());
440 assert_eq!(dict.dictionary_size(), 0);
441 assert_eq!(dict.get(0), None);
442 }
443
444 #[test]
445 fn test_single_value() {
446 let mut builder = DictionaryBuilder::new();
447 builder.add("only_value");
448
449 let dict = builder.build();
450
451 assert_eq!(dict.len(), 1);
452 assert_eq!(dict.dictionary_size(), 1);
453 assert_eq!(dict.get(0), Some("only_value"));
454 }
455
456 #[test]
457 fn test_all_unique() {
458 let mut builder = DictionaryBuilder::new();
459 builder.add("a");
460 builder.add("b");
461 builder.add("c");
462 builder.add("d");
463
464 let dict = builder.build();
465
466 assert_eq!(dict.len(), 4);
467 assert_eq!(dict.dictionary_size(), 4);
468 assert_eq!(dict.codes(), &[0, 1, 2, 3]);
469 }
470
471 #[test]
472 fn test_all_same() {
473 let mut builder = DictionaryBuilder::new();
474 for _ in 0..10 {
475 builder.add("same");
476 }
477
478 let dict = builder.build();
479
480 assert_eq!(dict.len(), 10);
481 assert_eq!(dict.dictionary_size(), 1);
482 assert!(dict.codes().iter().all(|&c| c == 0));
483 }
484}