Skip to main content

grafeo_core/storage/
dictionary.rs

1//! Dictionary encoding for repeated strings.
2//!
3//! If your data has lots of repeated strings (like node labels or edge types),
4//! dictionary encoding stores each unique string once and references it by a
5//! small integer code. A million "Person" labels becomes one string + a million
6//! 4-byte codes instead of a million strings.
7//!
8//! # Example
9//!
10//! ```ignore
11//! let mut builder = DictionaryBuilder::new();
12//! builder.add("Person");
13//! builder.add("Company");
14//! builder.add("Person");  // same as first - reuses code 0
15//! builder.add("Person");  // reuses code 0 again
16//!
17//! let dict = builder.build();
18//! // Dictionary: ["Person", "Company"]
19//! // Codes:      [0, 1, 0, 0]
20//! assert_eq!(dict.dictionary_size(), 2);  // Only 2 unique strings stored
21//! ```
22
23use std::collections::HashMap;
24use std::sync::Arc;
25
26/// Stores repeated strings efficiently by referencing them with integer codes.
27///
28/// Each unique string appears once in the dictionary. Values are stored as u32
29/// indices pointing into that dictionary. Great for labels, categories, and
30/// other low-cardinality string columns.
31#[derive(Debug, Clone)]
32pub struct DictionaryEncoding {
33    /// The dictionary of unique strings.
34    dictionary: Arc<[Arc<str>]>,
35    /// Encoded values as indices into the dictionary.
36    codes: Vec<u32>,
37    /// Null bitmap (bit set = null).
38    null_bitmap: Option<Vec<u64>>,
39}
40
41impl DictionaryEncoding {
42    /// Creates a new dictionary encoding from a dictionary and codes.
43    pub fn new(dictionary: Arc<[Arc<str>]>, codes: Vec<u32>) -> Self {
44        Self {
45            dictionary,
46            codes,
47            null_bitmap: None,
48        }
49    }
50
51    /// Creates a dictionary encoding with a null bitmap.
52    pub fn with_nulls(mut self, null_bitmap: Vec<u64>) -> Self {
53        self.null_bitmap = Some(null_bitmap);
54        self
55    }
56
57    /// Returns the number of values.
58    pub fn len(&self) -> usize {
59        self.codes.len()
60    }
61
62    /// Returns whether the encoding is empty.
63    pub fn is_empty(&self) -> bool {
64        self.codes.is_empty()
65    }
66
67    /// Returns the number of unique strings in the dictionary.
68    pub fn dictionary_size(&self) -> usize {
69        self.dictionary.len()
70    }
71
72    /// Returns the dictionary.
73    pub fn dictionary(&self) -> &Arc<[Arc<str>]> {
74        &self.dictionary
75    }
76
77    /// Returns the encoded values.
78    pub fn codes(&self) -> &[u32] {
79        &self.codes
80    }
81
82    /// Returns whether the value at index is null.
83    pub fn is_null(&self, index: usize) -> bool {
84        if let Some(bitmap) = &self.null_bitmap {
85            let word_idx = index / 64;
86            let bit_idx = index % 64;
87            if word_idx < bitmap.len() {
88                return (bitmap[word_idx] & (1 << bit_idx)) != 0;
89            }
90        }
91        false
92    }
93
94    /// Returns the string value at the given index.
95    ///
96    /// Returns `None` if the value is null.
97    pub fn get(&self, index: usize) -> Option<&str> {
98        if self.is_null(index) {
99            return None;
100        }
101        let code = self.codes.get(index)? as &u32;
102        self.dictionary.get(*code as usize).map(|s| s.as_ref())
103    }
104
105    /// Returns the code at the given index.
106    pub fn get_code(&self, index: usize) -> Option<u32> {
107        if self.is_null(index) {
108            return None;
109        }
110        self.codes.get(index).copied()
111    }
112
113    /// Iterates over all values, yielding `Option<&str>`.
114    pub fn iter(&self) -> impl Iterator<Item = Option<&str>> {
115        (0..self.len()).map(move |i| self.get(i))
116    }
117
118    /// Returns the compression ratio (original size / compressed size).
119    ///
120    /// A ratio > 1.0 means compression is effective.
121    pub fn compression_ratio(&self) -> f64 {
122        if self.codes.is_empty() {
123            return 1.0;
124        }
125
126        // Estimate original size: sum of string lengths
127        let original_size: usize = self
128            .codes
129            .iter()
130            .map(|&code| {
131                if (code as usize) < self.dictionary.len() {
132                    self.dictionary[code as usize].len()
133                } else {
134                    0
135                }
136            })
137            .sum();
138
139        // Compressed size: dictionary + codes
140        let dict_size: usize = self.dictionary.iter().map(|s| s.len()).sum();
141        let codes_size = self.codes.len() * std::mem::size_of::<u32>();
142        let compressed_size = dict_size + codes_size;
143
144        if compressed_size == 0 {
145            return 1.0;
146        }
147
148        original_size as f64 / compressed_size as f64
149    }
150
151    /// Encodes a lookup value into a code, if it exists in the dictionary.
152    pub fn encode(&self, value: &str) -> Option<u32> {
153        self.dictionary
154            .iter()
155            .position(|s| s.as_ref() == value)
156            .map(|i| i as u32)
157    }
158
159    /// Filters the encoding to only include rows matching a predicate code.
160    pub fn filter_by_code(&self, predicate: impl Fn(u32) -> bool) -> Vec<usize> {
161        self.codes
162            .iter()
163            .enumerate()
164            .filter_map(|(i, &code)| {
165                if !self.is_null(i) && predicate(code) {
166                    Some(i)
167                } else {
168                    None
169                }
170            })
171            .collect()
172    }
173}
174
175/// Builds a dictionary encoding by streaming values through.
176///
177/// Call [`add()`](Self::add) for each value - we'll automatically assign codes
178/// and build the dictionary. Then [`build()`](Self::build) to get the final encoding.
179#[derive(Debug)]
180pub struct DictionaryBuilder {
181    /// Map from string to code.
182    string_to_code: HashMap<Arc<str>, u32>,
183    /// Dictionary (code -> string).
184    dictionary: Vec<Arc<str>>,
185    /// Encoded values.
186    codes: Vec<u32>,
187    /// Null positions (for marking nulls).
188    null_positions: Vec<usize>,
189}
190
191impl DictionaryBuilder {
192    /// Creates a new dictionary builder.
193    pub fn new() -> Self {
194        Self {
195            string_to_code: HashMap::new(),
196            dictionary: Vec::new(),
197            codes: Vec::new(),
198            null_positions: Vec::new(),
199        }
200    }
201
202    /// Creates a new dictionary builder with estimated capacity.
203    pub fn with_capacity(value_capacity: usize, dictionary_capacity: usize) -> Self {
204        Self {
205            string_to_code: HashMap::with_capacity(dictionary_capacity),
206            dictionary: Vec::with_capacity(dictionary_capacity),
207            codes: Vec::with_capacity(value_capacity),
208            null_positions: Vec::new(),
209        }
210    }
211
212    /// Adds a string value to the encoding.
213    ///
214    /// Returns the code assigned to this value.
215    pub fn add(&mut self, value: &str) -> u32 {
216        if let Some(&code) = self.string_to_code.get(value) {
217            self.codes.push(code);
218            code
219        } else {
220            let code = self.dictionary.len() as u32;
221            let arc_value: Arc<str> = value.into();
222            self.string_to_code.insert(arc_value.clone(), code);
223            self.dictionary.push(arc_value);
224            self.codes.push(code);
225            code
226        }
227    }
228
229    /// Adds a null value.
230    pub fn add_null(&mut self) {
231        let idx = self.codes.len();
232        self.null_positions.push(idx);
233        self.codes.push(0); // Placeholder code
234    }
235
236    /// Adds an optional value.
237    pub fn add_optional(&mut self, value: Option<&str>) -> Option<u32> {
238        match value {
239            Some(v) => Some(self.add(v)),
240            None => {
241                self.add_null();
242                None
243            }
244        }
245    }
246
247    /// Returns the current number of values.
248    pub fn len(&self) -> usize {
249        self.codes.len()
250    }
251
252    /// Returns whether the builder is empty.
253    pub fn is_empty(&self) -> bool {
254        self.codes.is_empty()
255    }
256
257    /// Returns the current dictionary size.
258    pub fn dictionary_size(&self) -> usize {
259        self.dictionary.len()
260    }
261
262    /// Builds the dictionary encoding.
263    pub fn build(self) -> DictionaryEncoding {
264        let null_bitmap = if self.null_positions.is_empty() {
265            None
266        } else {
267            let num_words = (self.codes.len() + 63) / 64;
268            let mut bitmap = vec![0u64; num_words];
269            for &pos in &self.null_positions {
270                let word_idx = pos / 64;
271                let bit_idx = pos % 64;
272                bitmap[word_idx] |= 1 << bit_idx;
273            }
274            Some(bitmap)
275        };
276
277        let dict: Arc<[Arc<str>]> = self.dictionary.into();
278
279        let mut encoding = DictionaryEncoding::new(dict, self.codes);
280        if let Some(bitmap) = null_bitmap {
281            encoding = encoding.with_nulls(bitmap);
282        }
283        encoding
284    }
285
286    /// Clears the builder for reuse.
287    pub fn clear(&mut self) {
288        self.string_to_code.clear();
289        self.dictionary.clear();
290        self.codes.clear();
291        self.null_positions.clear();
292    }
293}
294
295impl Default for DictionaryBuilder {
296    fn default() -> Self {
297        Self::new()
298    }
299}
300
301/// Extension trait for building dictionary encodings from iterators.
302pub trait IntoDictionaryEncoding {
303    /// Creates a dictionary encoding from an iterator of strings.
304    fn into_dictionary_encoding(self) -> DictionaryEncoding;
305}
306
307impl<'a, I> IntoDictionaryEncoding for I
308where
309    I: IntoIterator<Item = &'a str>,
310{
311    fn into_dictionary_encoding(self) -> DictionaryEncoding {
312        let mut builder = DictionaryBuilder::new();
313        for s in self {
314            builder.add(s);
315        }
316        builder.build()
317    }
318}
319
320#[cfg(test)]
321mod tests {
322    use super::*;
323
324    #[test]
325    fn test_dictionary_builder_basic() {
326        let mut builder = DictionaryBuilder::new();
327        builder.add("apple");
328        builder.add("banana");
329        builder.add("apple");
330        builder.add("cherry");
331        builder.add("apple");
332
333        let dict = builder.build();
334
335        assert_eq!(dict.len(), 5);
336        assert_eq!(dict.dictionary_size(), 3);
337
338        assert_eq!(dict.get(0), Some("apple"));
339        assert_eq!(dict.get(1), Some("banana"));
340        assert_eq!(dict.get(2), Some("apple"));
341        assert_eq!(dict.get(3), Some("cherry"));
342        assert_eq!(dict.get(4), Some("apple"));
343    }
344
345    #[test]
346    fn test_dictionary_codes() {
347        let mut builder = DictionaryBuilder::new();
348        let code_apple = builder.add("apple");
349        let code_banana = builder.add("banana");
350        let code_apple2 = builder.add("apple");
351
352        assert_eq!(code_apple, code_apple2);
353        assert_ne!(code_apple, code_banana);
354
355        let dict = builder.build();
356        assert_eq!(dict.codes(), &[0, 1, 0]);
357    }
358
359    #[test]
360    fn test_dictionary_with_nulls() {
361        let mut builder = DictionaryBuilder::new();
362        builder.add("apple");
363        builder.add_null();
364        builder.add("banana");
365        builder.add_null();
366
367        let dict = builder.build();
368
369        assert_eq!(dict.len(), 4);
370        assert_eq!(dict.get(0), Some("apple"));
371        assert_eq!(dict.get(1), None);
372        assert!(dict.is_null(1));
373        assert_eq!(dict.get(2), Some("banana"));
374        assert_eq!(dict.get(3), None);
375        assert!(dict.is_null(3));
376    }
377
378    #[test]
379    fn test_dictionary_encode_lookup() {
380        let mut builder = DictionaryBuilder::new();
381        builder.add("apple");
382        builder.add("banana");
383        builder.add("cherry");
384
385        let dict = builder.build();
386
387        assert_eq!(dict.encode("apple"), Some(0));
388        assert_eq!(dict.encode("banana"), Some(1));
389        assert_eq!(dict.encode("cherry"), Some(2));
390        assert_eq!(dict.encode("date"), None);
391    }
392
393    #[test]
394    fn test_dictionary_filter_by_code() {
395        let mut builder = DictionaryBuilder::new();
396        builder.add("apple");
397        builder.add("banana");
398        builder.add("apple");
399        builder.add("cherry");
400        builder.add("apple");
401
402        let dict = builder.build();
403        let apple_code = dict.encode("apple").unwrap();
404
405        let indices = dict.filter_by_code(|code| code == apple_code);
406        assert_eq!(indices, vec![0, 2, 4]);
407    }
408
409    #[test]
410    fn test_compression_ratio() {
411        let mut builder = DictionaryBuilder::new();
412
413        // Add many repeated long strings
414        for _ in 0..100 {
415            builder.add("this_is_a_very_long_string_that_repeats_many_times");
416        }
417
418        let dict = builder.build();
419
420        // Compression ratio should be > 1 for highly repetitive data
421        let ratio = dict.compression_ratio();
422        assert!(ratio > 1.0, "Expected compression ratio > 1, got {}", ratio);
423    }
424
425    #[test]
426    fn test_into_dictionary_encoding() {
427        let strings = vec!["apple", "banana", "apple", "cherry"];
428        let dict: DictionaryEncoding = strings.into_iter().into_dictionary_encoding();
429
430        assert_eq!(dict.len(), 4);
431        assert_eq!(dict.dictionary_size(), 3);
432    }
433
434    #[test]
435    fn test_empty_dictionary() {
436        let builder = DictionaryBuilder::new();
437        let dict = builder.build();
438
439        assert!(dict.is_empty());
440        assert_eq!(dict.dictionary_size(), 0);
441        assert_eq!(dict.get(0), None);
442    }
443
444    #[test]
445    fn test_single_value() {
446        let mut builder = DictionaryBuilder::new();
447        builder.add("only_value");
448
449        let dict = builder.build();
450
451        assert_eq!(dict.len(), 1);
452        assert_eq!(dict.dictionary_size(), 1);
453        assert_eq!(dict.get(0), Some("only_value"));
454    }
455
456    #[test]
457    fn test_all_unique() {
458        let mut builder = DictionaryBuilder::new();
459        builder.add("a");
460        builder.add("b");
461        builder.add("c");
462        builder.add("d");
463
464        let dict = builder.build();
465
466        assert_eq!(dict.len(), 4);
467        assert_eq!(dict.dictionary_size(), 4);
468        assert_eq!(dict.codes(), &[0, 1, 2, 3]);
469    }
470
471    #[test]
472    fn test_all_same() {
473        let mut builder = DictionaryBuilder::new();
474        for _ in 0..10 {
475            builder.add("same");
476        }
477
478        let dict = builder.build();
479
480        assert_eq!(dict.len(), 10);
481        assert_eq!(dict.dictionary_size(), 1);
482        assert!(dict.codes().iter().all(|&c| c == 0));
483    }
484}