Skip to main content

grafeo_core/storage/
dictionary.rs

1//! Dictionary encoding for repeated strings.
2//!
3//! If your data has lots of repeated strings (like node labels or edge types),
4//! dictionary encoding stores each unique string once and references it by a
5//! small integer code. A million "Person" labels becomes one string + a million
6//! 4-byte codes instead of a million strings.
7//!
8//! # Example
9//!
10//! ```no_run
11//! # use grafeo_core::storage::dictionary::DictionaryBuilder;
12//! let mut builder = DictionaryBuilder::new();
13//! builder.add("Person");
14//! builder.add("Company");
15//! builder.add("Person");  // same as first - reuses code 0
16//! builder.add("Person");  // reuses code 0 again
17//!
18//! let dict = builder.build();
19//! // Dictionary: ["Person", "Company"]
20//! // Codes:      [0, 1, 0, 0]
21//! assert_eq!(dict.dictionary_size(), 2);  // Only 2 unique strings stored
22//! ```
23
24use std::collections::HashMap;
25use std::sync::Arc;
26
27/// Stores repeated strings efficiently by referencing them with integer codes.
28///
29/// Each unique string appears once in the dictionary. Values are stored as u32
30/// indices pointing into that dictionary. Great for labels, categories, and
31/// other low-cardinality string columns.
32#[derive(Debug, Clone)]
33pub struct DictionaryEncoding {
34    /// The dictionary of unique strings.
35    dictionary: Arc<[Arc<str>]>,
36    /// Encoded values as indices into the dictionary.
37    codes: Vec<u32>,
38    /// Null bitmap (bit set = null).
39    null_bitmap: Option<Vec<u64>>,
40}
41
42impl DictionaryEncoding {
43    /// Creates a new dictionary encoding from a dictionary and codes.
44    pub fn new(dictionary: Arc<[Arc<str>]>, codes: Vec<u32>) -> Self {
45        Self {
46            dictionary,
47            codes,
48            null_bitmap: None,
49        }
50    }
51
52    /// Creates a dictionary encoding with a null bitmap.
53    pub fn with_nulls(mut self, null_bitmap: Vec<u64>) -> Self {
54        self.null_bitmap = Some(null_bitmap);
55        self
56    }
57
58    /// Returns the number of values.
59    pub fn len(&self) -> usize {
60        self.codes.len()
61    }
62
63    /// Returns whether the encoding is empty.
64    pub fn is_empty(&self) -> bool {
65        self.codes.is_empty()
66    }
67
68    /// Returns the number of unique strings in the dictionary.
69    pub fn dictionary_size(&self) -> usize {
70        self.dictionary.len()
71    }
72
73    /// Returns the dictionary.
74    pub fn dictionary(&self) -> &Arc<[Arc<str>]> {
75        &self.dictionary
76    }
77
78    /// Returns the encoded values.
79    pub fn codes(&self) -> &[u32] {
80        &self.codes
81    }
82
83    /// Returns whether the value at index is null.
84    pub fn is_null(&self, index: usize) -> bool {
85        if let Some(bitmap) = &self.null_bitmap {
86            let word_idx = index / 64;
87            let bit_idx = index % 64;
88            if word_idx < bitmap.len() {
89                return (bitmap[word_idx] & (1 << bit_idx)) != 0;
90            }
91        }
92        false
93    }
94
95    /// Returns the string value at the given index.
96    ///
97    /// Returns `None` if the value is null.
98    pub fn get(&self, index: usize) -> Option<&str> {
99        if self.is_null(index) {
100            return None;
101        }
102        let code = self.codes.get(index)? as &u32;
103        self.dictionary.get(*code as usize).map(|s| s.as_ref())
104    }
105
106    /// Returns the code at the given index.
107    pub fn get_code(&self, index: usize) -> Option<u32> {
108        if self.is_null(index) {
109            return None;
110        }
111        self.codes.get(index).copied()
112    }
113
114    /// Iterates over all values, yielding `Option<&str>`.
115    pub fn iter(&self) -> impl Iterator<Item = Option<&str>> {
116        (0..self.len()).map(move |i| self.get(i))
117    }
118
119    /// Returns the compression ratio (original size / compressed size).
120    ///
121    /// A ratio > 1.0 means compression is effective.
122    pub fn compression_ratio(&self) -> f64 {
123        if self.codes.is_empty() {
124            return 1.0;
125        }
126
127        // Estimate original size: sum of string lengths
128        let original_size: usize = self
129            .codes
130            .iter()
131            .map(|&code| {
132                if (code as usize) < self.dictionary.len() {
133                    self.dictionary[code as usize].len()
134                } else {
135                    0
136                }
137            })
138            .sum();
139
140        // Compressed size: dictionary + codes
141        let dict_size: usize = self.dictionary.iter().map(|s| s.len()).sum();
142        let codes_size = self.codes.len() * std::mem::size_of::<u32>();
143        let compressed_size = dict_size + codes_size;
144
145        if compressed_size == 0 {
146            return 1.0;
147        }
148
149        original_size as f64 / compressed_size as f64
150    }
151
152    /// Encodes a lookup value into a code, if it exists in the dictionary.
153    pub fn encode(&self, value: &str) -> Option<u32> {
154        self.dictionary
155            .iter()
156            .position(|s| s.as_ref() == value)
157            .map(|i| i as u32)
158    }
159
160    /// Filters the encoding to only include rows matching a predicate code.
161    pub fn filter_by_code(&self, predicate: impl Fn(u32) -> bool) -> Vec<usize> {
162        self.codes
163            .iter()
164            .enumerate()
165            .filter_map(|(i, &code)| {
166                if !self.is_null(i) && predicate(code) {
167                    Some(i)
168                } else {
169                    None
170                }
171            })
172            .collect()
173    }
174}
175
176/// Builds a dictionary encoding by streaming values through.
177///
178/// Call [`add()`](Self::add) for each value - we'll automatically assign codes
179/// and build the dictionary. Then [`build()`](Self::build) to get the final encoding.
180#[derive(Debug)]
181pub struct DictionaryBuilder {
182    /// Map from string to code.
183    string_to_code: HashMap<Arc<str>, u32>,
184    /// Dictionary (code -> string).
185    dictionary: Vec<Arc<str>>,
186    /// Encoded values.
187    codes: Vec<u32>,
188    /// Null positions (for marking nulls).
189    null_positions: Vec<usize>,
190}
191
192impl DictionaryBuilder {
193    /// Creates a new dictionary builder.
194    pub fn new() -> Self {
195        Self {
196            string_to_code: HashMap::new(),
197            dictionary: Vec::new(),
198            codes: Vec::new(),
199            null_positions: Vec::new(),
200        }
201    }
202
203    /// Creates a new dictionary builder with estimated capacity.
204    pub fn with_capacity(value_capacity: usize, dictionary_capacity: usize) -> Self {
205        Self {
206            string_to_code: HashMap::with_capacity(dictionary_capacity),
207            dictionary: Vec::with_capacity(dictionary_capacity),
208            codes: Vec::with_capacity(value_capacity),
209            null_positions: Vec::new(),
210        }
211    }
212
213    /// Adds a string value to the encoding.
214    ///
215    /// Returns the code assigned to this value.
216    pub fn add(&mut self, value: &str) -> u32 {
217        if let Some(&code) = self.string_to_code.get(value) {
218            self.codes.push(code);
219            code
220        } else {
221            let code = self.dictionary.len() as u32;
222            let arc_value: Arc<str> = value.into();
223            self.string_to_code.insert(arc_value.clone(), code);
224            self.dictionary.push(arc_value);
225            self.codes.push(code);
226            code
227        }
228    }
229
230    /// Adds a null value.
231    pub fn add_null(&mut self) {
232        let idx = self.codes.len();
233        self.null_positions.push(idx);
234        self.codes.push(0); // Placeholder code
235    }
236
237    /// Adds an optional value.
238    pub fn add_optional(&mut self, value: Option<&str>) -> Option<u32> {
239        match value {
240            Some(v) => Some(self.add(v)),
241            None => {
242                self.add_null();
243                None
244            }
245        }
246    }
247
248    /// Returns the current number of values.
249    pub fn len(&self) -> usize {
250        self.codes.len()
251    }
252
253    /// Returns whether the builder is empty.
254    pub fn is_empty(&self) -> bool {
255        self.codes.is_empty()
256    }
257
258    /// Returns the current dictionary size.
259    pub fn dictionary_size(&self) -> usize {
260        self.dictionary.len()
261    }
262
263    /// Builds the dictionary encoding.
264    pub fn build(self) -> DictionaryEncoding {
265        let null_bitmap = if self.null_positions.is_empty() {
266            None
267        } else {
268            let num_words = (self.codes.len() + 63) / 64;
269            let mut bitmap = vec![0u64; num_words];
270            for &pos in &self.null_positions {
271                let word_idx = pos / 64;
272                let bit_idx = pos % 64;
273                bitmap[word_idx] |= 1 << bit_idx;
274            }
275            Some(bitmap)
276        };
277
278        let dict: Arc<[Arc<str>]> = self.dictionary.into();
279
280        let mut encoding = DictionaryEncoding::new(dict, self.codes);
281        if let Some(bitmap) = null_bitmap {
282            encoding = encoding.with_nulls(bitmap);
283        }
284        encoding
285    }
286
287    /// Clears the builder for reuse.
288    pub fn clear(&mut self) {
289        self.string_to_code.clear();
290        self.dictionary.clear();
291        self.codes.clear();
292        self.null_positions.clear();
293    }
294}
295
296impl Default for DictionaryBuilder {
297    fn default() -> Self {
298        Self::new()
299    }
300}
301
302/// Extension trait for building dictionary encodings from iterators.
303pub trait IntoDictionaryEncoding {
304    /// Creates a dictionary encoding from an iterator of strings.
305    fn into_dictionary_encoding(self) -> DictionaryEncoding;
306}
307
308impl<'a, I> IntoDictionaryEncoding for I
309where
310    I: IntoIterator<Item = &'a str>,
311{
312    fn into_dictionary_encoding(self) -> DictionaryEncoding {
313        let mut builder = DictionaryBuilder::new();
314        for s in self {
315            builder.add(s);
316        }
317        builder.build()
318    }
319}
320
321#[cfg(test)]
322mod tests {
323    use super::*;
324
325    #[test]
326    fn test_dictionary_builder_basic() {
327        let mut builder = DictionaryBuilder::new();
328        builder.add("apple");
329        builder.add("banana");
330        builder.add("apple");
331        builder.add("cherry");
332        builder.add("apple");
333
334        let dict = builder.build();
335
336        assert_eq!(dict.len(), 5);
337        assert_eq!(dict.dictionary_size(), 3);
338
339        assert_eq!(dict.get(0), Some("apple"));
340        assert_eq!(dict.get(1), Some("banana"));
341        assert_eq!(dict.get(2), Some("apple"));
342        assert_eq!(dict.get(3), Some("cherry"));
343        assert_eq!(dict.get(4), Some("apple"));
344    }
345
346    #[test]
347    fn test_dictionary_codes() {
348        let mut builder = DictionaryBuilder::new();
349        let code_apple = builder.add("apple");
350        let code_banana = builder.add("banana");
351        let code_apple2 = builder.add("apple");
352
353        assert_eq!(code_apple, code_apple2);
354        assert_ne!(code_apple, code_banana);
355
356        let dict = builder.build();
357        assert_eq!(dict.codes(), &[0, 1, 0]);
358    }
359
360    #[test]
361    fn test_dictionary_with_nulls() {
362        let mut builder = DictionaryBuilder::new();
363        builder.add("apple");
364        builder.add_null();
365        builder.add("banana");
366        builder.add_null();
367
368        let dict = builder.build();
369
370        assert_eq!(dict.len(), 4);
371        assert_eq!(dict.get(0), Some("apple"));
372        assert_eq!(dict.get(1), None);
373        assert!(dict.is_null(1));
374        assert_eq!(dict.get(2), Some("banana"));
375        assert_eq!(dict.get(3), None);
376        assert!(dict.is_null(3));
377    }
378
379    #[test]
380    fn test_dictionary_encode_lookup() {
381        let mut builder = DictionaryBuilder::new();
382        builder.add("apple");
383        builder.add("banana");
384        builder.add("cherry");
385
386        let dict = builder.build();
387
388        assert_eq!(dict.encode("apple"), Some(0));
389        assert_eq!(dict.encode("banana"), Some(1));
390        assert_eq!(dict.encode("cherry"), Some(2));
391        assert_eq!(dict.encode("date"), None);
392    }
393
394    #[test]
395    fn test_dictionary_filter_by_code() {
396        let mut builder = DictionaryBuilder::new();
397        builder.add("apple");
398        builder.add("banana");
399        builder.add("apple");
400        builder.add("cherry");
401        builder.add("apple");
402
403        let dict = builder.build();
404        let apple_code = dict.encode("apple").unwrap();
405
406        let indices = dict.filter_by_code(|code| code == apple_code);
407        assert_eq!(indices, vec![0, 2, 4]);
408    }
409
410    #[test]
411    fn test_compression_ratio() {
412        let mut builder = DictionaryBuilder::new();
413
414        // Add many repeated long strings
415        for _ in 0..100 {
416            builder.add("this_is_a_very_long_string_that_repeats_many_times");
417        }
418
419        let dict = builder.build();
420
421        // Compression ratio should be > 1 for highly repetitive data
422        let ratio = dict.compression_ratio();
423        assert!(ratio > 1.0, "Expected compression ratio > 1, got {}", ratio);
424    }
425
426    #[test]
427    fn test_into_dictionary_encoding() {
428        let strings = vec!["apple", "banana", "apple", "cherry"];
429        let dict: DictionaryEncoding = strings.into_iter().into_dictionary_encoding();
430
431        assert_eq!(dict.len(), 4);
432        assert_eq!(dict.dictionary_size(), 3);
433    }
434
435    #[test]
436    fn test_empty_dictionary() {
437        let builder = DictionaryBuilder::new();
438        let dict = builder.build();
439
440        assert!(dict.is_empty());
441        assert_eq!(dict.dictionary_size(), 0);
442        assert_eq!(dict.get(0), None);
443    }
444
445    #[test]
446    fn test_single_value() {
447        let mut builder = DictionaryBuilder::new();
448        builder.add("only_value");
449
450        let dict = builder.build();
451
452        assert_eq!(dict.len(), 1);
453        assert_eq!(dict.dictionary_size(), 1);
454        assert_eq!(dict.get(0), Some("only_value"));
455    }
456
457    #[test]
458    fn test_all_unique() {
459        let mut builder = DictionaryBuilder::new();
460        builder.add("a");
461        builder.add("b");
462        builder.add("c");
463        builder.add("d");
464
465        let dict = builder.build();
466
467        assert_eq!(dict.len(), 4);
468        assert_eq!(dict.dictionary_size(), 4);
469        assert_eq!(dict.codes(), &[0, 1, 2, 3]);
470    }
471
472    #[test]
473    fn test_all_same() {
474        let mut builder = DictionaryBuilder::new();
475        for _ in 0..10 {
476            builder.add("same");
477        }
478
479        let dict = builder.build();
480
481        assert_eq!(dict.len(), 10);
482        assert_eq!(dict.dictionary_size(), 1);
483        assert!(dict.codes().iter().all(|&c| c == 0));
484    }
485}