Skip to main content

grafeo_core/codec/
dictionary.rs

1//! Dictionary encoding for repeated strings.
2//!
3//! If your data has lots of repeated strings (like node labels or edge types),
4//! dictionary encoding stores each unique string once and references it by a
5//! small integer code. A million "Person" labels becomes one string + a million
6//! 4-byte codes instead of a million strings.
7//!
8//! # Example
9//!
10//! ```no_run
11//! # use grafeo_core::codec::dictionary::DictionaryBuilder;
12//! let mut builder = DictionaryBuilder::new();
13//! builder.add("Person");
14//! builder.add("Company");
15//! builder.add("Person");  // same as first - reuses code 0
16//! builder.add("Person");  // reuses code 0 again
17//!
18//! let dict = builder.build();
19//! // Dictionary: ["Person", "Company"]
20//! // Codes:      [0, 1, 0, 0]
21//! assert_eq!(dict.dictionary_size(), 2);  // Only 2 unique strings stored
22//! ```
23
24use std::collections::HashMap;
25use std::sync::Arc;
26
27/// Stores repeated strings efficiently by referencing them with integer codes.
28///
29/// Each unique string appears once in the dictionary. Values are stored as u32
30/// indices pointing into that dictionary. Great for labels, categories, and
31/// other low-cardinality string columns.
32#[derive(Debug, Clone)]
33pub struct DictionaryEncoding {
34    /// The dictionary of unique strings.
35    dictionary: Arc<[Arc<str>]>,
36    /// Encoded values as indices into the dictionary.
37    codes: Vec<u32>,
38    /// Null bitmap (bit set = null).
39    null_bitmap: Option<Vec<u64>>,
40}
41
42impl DictionaryEncoding {
43    /// Creates a new dictionary encoding from a dictionary and codes.
44    pub fn new(dictionary: Arc<[Arc<str>]>, codes: Vec<u32>) -> Self {
45        Self {
46            dictionary,
47            codes,
48            null_bitmap: None,
49        }
50    }
51
52    /// Creates a dictionary encoding with a null bitmap.
53    pub fn with_nulls(mut self, null_bitmap: Vec<u64>) -> Self {
54        self.null_bitmap = Some(null_bitmap);
55        self
56    }
57
58    /// Returns the number of values.
59    pub fn len(&self) -> usize {
60        self.codes.len()
61    }
62
63    /// Returns whether the encoding is empty.
64    pub fn is_empty(&self) -> bool {
65        self.codes.is_empty()
66    }
67
68    /// Returns the number of unique strings in the dictionary.
69    pub fn dictionary_size(&self) -> usize {
70        self.dictionary.len()
71    }
72
73    /// Returns the dictionary.
74    pub fn dictionary(&self) -> &Arc<[Arc<str>]> {
75        &self.dictionary
76    }
77
78    /// Returns the encoded values.
79    pub fn codes(&self) -> &[u32] {
80        &self.codes
81    }
82
83    /// Returns whether the value at index is null.
84    pub fn is_null(&self, index: usize) -> bool {
85        if let Some(bitmap) = &self.null_bitmap {
86            let word_idx = index / 64;
87            let bit_idx = index % 64;
88            if word_idx < bitmap.len() {
89                return (bitmap[word_idx] & (1 << bit_idx)) != 0;
90            }
91        }
92        false
93    }
94
95    /// Returns the string value at the given index.
96    ///
97    /// Returns `None` if the value is null.
98    pub fn get(&self, index: usize) -> Option<&str> {
99        if self.is_null(index) {
100            return None;
101        }
102        let code = self.codes.get(index)? as &u32;
103        self.dictionary.get(*code as usize).map(|s| s.as_ref())
104    }
105
106    /// Returns the code at the given index.
107    pub fn get_code(&self, index: usize) -> Option<u32> {
108        if self.is_null(index) {
109            return None;
110        }
111        self.codes.get(index).copied()
112    }
113
114    /// Iterates over all values, yielding `Option<&str>`.
115    pub fn iter(&self) -> impl Iterator<Item = Option<&str>> {
116        (0..self.len()).map(move |i| self.get(i))
117    }
118
119    /// Returns the compression ratio (original size / compressed size).
120    ///
121    /// A ratio > 1.0 means compression is effective.
122    pub fn compression_ratio(&self) -> f64 {
123        if self.codes.is_empty() {
124            return 1.0;
125        }
126
127        // Estimate original size: sum of string lengths
128        let original_size: usize = self
129            .codes
130            .iter()
131            .map(|&code| {
132                if (code as usize) < self.dictionary.len() {
133                    self.dictionary[code as usize].len()
134                } else {
135                    0
136                }
137            })
138            .sum();
139
140        // Compressed size: dictionary + codes
141        let dict_size: usize = self.dictionary.iter().map(|s| s.len()).sum();
142        let codes_size = self.codes.len() * std::mem::size_of::<u32>();
143        let compressed_size = dict_size + codes_size;
144
145        if compressed_size == 0 {
146            return 1.0;
147        }
148
149        original_size as f64 / compressed_size as f64
150    }
151
152    /// Encodes a lookup value into a code, if it exists in the dictionary.
153    pub fn encode(&self, value: &str) -> Option<u32> {
154        self.dictionary
155            .iter()
156            .position(|s| s.as_ref() == value)
157            .and_then(|i| u32::try_from(i).ok())
158    }
159
160    /// Filters the encoding to only include rows matching a predicate code.
161    pub fn filter_by_code(&self, predicate: impl Fn(u32) -> bool) -> Vec<usize> {
162        self.codes
163            .iter()
164            .enumerate()
165            .filter_map(|(i, &code)| {
166                if !self.is_null(i) && predicate(code) {
167                    Some(i)
168                } else {
169                    None
170                }
171            })
172            .collect()
173    }
174}
175
176/// Builds a dictionary encoding by streaming values through.
177///
178/// Call [`add()`](Self::add) for each value - we'll automatically assign codes
179/// and build the dictionary. Then [`build()`](Self::build) to get the final encoding.
180#[derive(Debug)]
181pub struct DictionaryBuilder {
182    /// Map from string to code.
183    string_to_code: HashMap<Arc<str>, u32>,
184    /// Dictionary (code -> string).
185    dictionary: Vec<Arc<str>>,
186    /// Encoded values.
187    codes: Vec<u32>,
188    /// Null positions (for marking nulls).
189    null_positions: Vec<usize>,
190}
191
192impl DictionaryBuilder {
193    /// Creates a new dictionary builder.
194    pub fn new() -> Self {
195        Self {
196            string_to_code: HashMap::new(),
197            dictionary: Vec::new(),
198            codes: Vec::new(),
199            null_positions: Vec::new(),
200        }
201    }
202
203    /// Creates a new dictionary builder with estimated capacity.
204    pub fn with_capacity(value_capacity: usize, dictionary_capacity: usize) -> Self {
205        Self {
206            string_to_code: HashMap::with_capacity(dictionary_capacity),
207            dictionary: Vec::with_capacity(dictionary_capacity),
208            codes: Vec::with_capacity(value_capacity),
209            null_positions: Vec::new(),
210        }
211    }
212
213    /// Adds a string value to the encoding.
214    ///
215    /// Returns the code assigned to this value.
216    pub fn add(&mut self, value: &str) -> u32 {
217        if let Some(&code) = self.string_to_code.get(value) {
218            self.codes.push(code);
219            code
220        } else {
221            // reason: dictionary size is bounded by u32 (codes are u32)
222            #[allow(clippy::cast_possible_truncation)]
223            let code = self.dictionary.len() as u32;
224            let arc_value: Arc<str> = value.into();
225            self.string_to_code.insert(arc_value.clone(), code);
226            self.dictionary.push(arc_value);
227            self.codes.push(code);
228            code
229        }
230    }
231
232    /// Adds a null value.
233    pub fn add_null(&mut self) {
234        let idx = self.codes.len();
235        self.null_positions.push(idx);
236        self.codes.push(0); // Placeholder code
237    }
238
239    /// Adds an optional value.
240    pub fn add_optional(&mut self, value: Option<&str>) -> Option<u32> {
241        match value {
242            Some(v) => Some(self.add(v)),
243            None => {
244                self.add_null();
245                None
246            }
247        }
248    }
249
250    /// Returns the current number of values.
251    pub fn len(&self) -> usize {
252        self.codes.len()
253    }
254
255    /// Returns whether the builder is empty.
256    pub fn is_empty(&self) -> bool {
257        self.codes.is_empty()
258    }
259
260    /// Returns the current dictionary size.
261    pub fn dictionary_size(&self) -> usize {
262        self.dictionary.len()
263    }
264
265    /// Builds the dictionary encoding.
266    pub fn build(self) -> DictionaryEncoding {
267        let null_bitmap = if self.null_positions.is_empty() {
268            None
269        } else {
270            let num_words = (self.codes.len() + 63) / 64;
271            let mut bitmap = vec![0u64; num_words];
272            for &pos in &self.null_positions {
273                let word_idx = pos / 64;
274                let bit_idx = pos % 64;
275                bitmap[word_idx] |= 1 << bit_idx;
276            }
277            Some(bitmap)
278        };
279
280        let dict: Arc<[Arc<str>]> = self.dictionary.into();
281
282        let mut encoding = DictionaryEncoding::new(dict, self.codes);
283        if let Some(bitmap) = null_bitmap {
284            encoding = encoding.with_nulls(bitmap);
285        }
286        encoding
287    }
288
289    /// Clears the builder for reuse.
290    pub fn clear(&mut self) {
291        self.string_to_code.clear();
292        self.dictionary.clear();
293        self.codes.clear();
294        self.null_positions.clear();
295    }
296}
297
298impl Default for DictionaryBuilder {
299    fn default() -> Self {
300        Self::new()
301    }
302}
303
304/// Extension trait for building dictionary encodings from iterators.
305pub trait IntoDictionaryEncoding {
306    /// Creates a dictionary encoding from an iterator of strings.
307    fn into_dictionary_encoding(self) -> DictionaryEncoding;
308}
309
310impl<'a, I> IntoDictionaryEncoding for I
311where
312    I: IntoIterator<Item = &'a str>,
313{
314    fn into_dictionary_encoding(self) -> DictionaryEncoding {
315        let mut builder = DictionaryBuilder::new();
316        for s in self {
317            builder.add(s);
318        }
319        builder.build()
320    }
321}
322
323#[cfg(test)]
324mod tests {
325    use super::*;
326
327    #[test]
328    fn test_dictionary_builder_basic() {
329        let mut builder = DictionaryBuilder::new();
330        builder.add("apple");
331        builder.add("banana");
332        builder.add("apple");
333        builder.add("cherry");
334        builder.add("apple");
335
336        let dict = builder.build();
337
338        assert_eq!(dict.len(), 5);
339        assert_eq!(dict.dictionary_size(), 3);
340
341        assert_eq!(dict.get(0), Some("apple"));
342        assert_eq!(dict.get(1), Some("banana"));
343        assert_eq!(dict.get(2), Some("apple"));
344        assert_eq!(dict.get(3), Some("cherry"));
345        assert_eq!(dict.get(4), Some("apple"));
346    }
347
348    #[test]
349    fn test_dictionary_codes() {
350        let mut builder = DictionaryBuilder::new();
351        let code_apple = builder.add("apple");
352        let code_banana = builder.add("banana");
353        let code_apple2 = builder.add("apple");
354
355        assert_eq!(code_apple, code_apple2);
356        assert_ne!(code_apple, code_banana);
357
358        let dict = builder.build();
359        assert_eq!(dict.codes(), &[0, 1, 0]);
360    }
361
362    #[test]
363    fn test_dictionary_with_nulls() {
364        let mut builder = DictionaryBuilder::new();
365        builder.add("apple");
366        builder.add_null();
367        builder.add("banana");
368        builder.add_null();
369
370        let dict = builder.build();
371
372        assert_eq!(dict.len(), 4);
373        assert_eq!(dict.get(0), Some("apple"));
374        assert_eq!(dict.get(1), None);
375        assert!(dict.is_null(1));
376        assert_eq!(dict.get(2), Some("banana"));
377        assert_eq!(dict.get(3), None);
378        assert!(dict.is_null(3));
379    }
380
381    #[test]
382    fn test_dictionary_encode_lookup() {
383        let mut builder = DictionaryBuilder::new();
384        builder.add("apple");
385        builder.add("banana");
386        builder.add("cherry");
387
388        let dict = builder.build();
389
390        assert_eq!(dict.encode("apple"), Some(0));
391        assert_eq!(dict.encode("banana"), Some(1));
392        assert_eq!(dict.encode("cherry"), Some(2));
393        assert_eq!(dict.encode("date"), None);
394    }
395
396    #[test]
397    fn test_dictionary_filter_by_code() {
398        let mut builder = DictionaryBuilder::new();
399        builder.add("apple");
400        builder.add("banana");
401        builder.add("apple");
402        builder.add("cherry");
403        builder.add("apple");
404
405        let dict = builder.build();
406        let apple_code = dict.encode("apple").unwrap();
407
408        let indices = dict.filter_by_code(|code| code == apple_code);
409        assert_eq!(indices, vec![0, 2, 4]);
410    }
411
412    #[test]
413    fn test_compression_ratio() {
414        let mut builder = DictionaryBuilder::new();
415
416        // Add many repeated long strings
417        for _ in 0..100 {
418            builder.add("this_is_a_very_long_string_that_repeats_many_times");
419        }
420
421        let dict = builder.build();
422
423        // Compression ratio should be > 1 for highly repetitive data
424        let ratio = dict.compression_ratio();
425        assert!(ratio > 1.0, "Expected compression ratio > 1, got {}", ratio);
426    }
427
428    #[test]
429    fn test_into_dictionary_encoding() {
430        let strings = vec!["apple", "banana", "apple", "cherry"];
431        let dict: DictionaryEncoding = strings.into_iter().into_dictionary_encoding();
432
433        assert_eq!(dict.len(), 4);
434        assert_eq!(dict.dictionary_size(), 3);
435    }
436
437    #[test]
438    fn test_empty_dictionary() {
439        let builder = DictionaryBuilder::new();
440        let dict = builder.build();
441
442        assert!(dict.is_empty());
443        assert_eq!(dict.dictionary_size(), 0);
444        assert_eq!(dict.get(0), None);
445    }
446
447    #[test]
448    fn test_single_value() {
449        let mut builder = DictionaryBuilder::new();
450        builder.add("only_value");
451
452        let dict = builder.build();
453
454        assert_eq!(dict.len(), 1);
455        assert_eq!(dict.dictionary_size(), 1);
456        assert_eq!(dict.get(0), Some("only_value"));
457    }
458
459    #[test]
460    fn test_all_unique() {
461        let mut builder = DictionaryBuilder::new();
462        builder.add("a");
463        builder.add("b");
464        builder.add("c");
465        builder.add("d");
466
467        let dict = builder.build();
468
469        assert_eq!(dict.len(), 4);
470        assert_eq!(dict.dictionary_size(), 4);
471        assert_eq!(dict.codes(), &[0, 1, 2, 3]);
472    }
473
474    #[test]
475    fn test_all_same() {
476        let mut builder = DictionaryBuilder::new();
477        for _ in 0..10 {
478            builder.add("same");
479        }
480
481        let dict = builder.build();
482
483        assert_eq!(dict.len(), 10);
484        assert_eq!(dict.dictionary_size(), 1);
485        assert!(dict.codes().iter().all(|&c| c == 0));
486    }
487}