Skip to main content

graphos_core/storage/
dictionary.rs

1//! Dictionary encoding for string compression.
2//!
3//! Dictionary encoding replaces repeated string values with integer codes,
4//! which is efficient for strings with low cardinality (many repeated values).
5//!
6//! # Example
7//!
8//! ```ignore
9//! let mut builder = DictionaryBuilder::new();
10//! builder.add("apple");
11//! builder.add("banana");
12//! builder.add("apple");  // repeated
13//! builder.add("cherry");
14//! builder.add("apple");  // repeated
15//!
16//! let dict = builder.build();
17//! // Dictionary: ["apple", "banana", "cherry"]
18//! // Encoded: [0, 1, 0, 2, 0]
19//! ```
20
21use std::collections::HashMap;
22use std::sync::Arc;
23
24/// A dictionary-encoded string column.
25///
26/// Stores unique strings in a dictionary and references them by integer codes.
27#[derive(Debug, Clone)]
28pub struct DictionaryEncoding {
29    /// The dictionary of unique strings.
30    dictionary: Arc<[Arc<str>]>,
31    /// Encoded values as indices into the dictionary.
32    codes: Vec<u32>,
33    /// Null bitmap (bit set = null).
34    null_bitmap: Option<Vec<u64>>,
35}
36
37impl DictionaryEncoding {
38    /// Creates a new dictionary encoding from a dictionary and codes.
39    pub fn new(dictionary: Arc<[Arc<str>]>, codes: Vec<u32>) -> Self {
40        Self {
41            dictionary,
42            codes,
43            null_bitmap: None,
44        }
45    }
46
47    /// Creates a dictionary encoding with a null bitmap.
48    pub fn with_nulls(mut self, null_bitmap: Vec<u64>) -> Self {
49        self.null_bitmap = Some(null_bitmap);
50        self
51    }
52
53    /// Returns the number of values.
54    pub fn len(&self) -> usize {
55        self.codes.len()
56    }
57
58    /// Returns whether the encoding is empty.
59    pub fn is_empty(&self) -> bool {
60        self.codes.is_empty()
61    }
62
63    /// Returns the number of unique strings in the dictionary.
64    pub fn dictionary_size(&self) -> usize {
65        self.dictionary.len()
66    }
67
68    /// Returns the dictionary.
69    pub fn dictionary(&self) -> &Arc<[Arc<str>]> {
70        &self.dictionary
71    }
72
73    /// Returns the encoded values.
74    pub fn codes(&self) -> &[u32] {
75        &self.codes
76    }
77
78    /// Returns whether the value at index is null.
79    pub fn is_null(&self, index: usize) -> bool {
80        if let Some(bitmap) = &self.null_bitmap {
81            let word_idx = index / 64;
82            let bit_idx = index % 64;
83            if word_idx < bitmap.len() {
84                return (bitmap[word_idx] & (1 << bit_idx)) != 0;
85            }
86        }
87        false
88    }
89
90    /// Returns the string value at the given index.
91    ///
92    /// Returns `None` if the value is null.
93    pub fn get(&self, index: usize) -> Option<&str> {
94        if self.is_null(index) {
95            return None;
96        }
97        let code = self.codes.get(index)? as &u32;
98        self.dictionary.get(*code as usize).map(|s| s.as_ref())
99    }
100
101    /// Returns the code at the given index.
102    pub fn get_code(&self, index: usize) -> Option<u32> {
103        if self.is_null(index) {
104            return None;
105        }
106        self.codes.get(index).copied()
107    }
108
109    /// Iterates over all values, yielding `Option<&str>`.
110    pub fn iter(&self) -> impl Iterator<Item = Option<&str>> {
111        (0..self.len()).map(move |i| self.get(i))
112    }
113
114    /// Returns the compression ratio (original size / compressed size).
115    ///
116    /// A ratio > 1.0 means compression is effective.
117    pub fn compression_ratio(&self) -> f64 {
118        if self.codes.is_empty() {
119            return 1.0;
120        }
121
122        // Estimate original size: sum of string lengths
123        let original_size: usize = self
124            .codes
125            .iter()
126            .map(|&code| {
127                if (code as usize) < self.dictionary.len() {
128                    self.dictionary[code as usize].len()
129                } else {
130                    0
131                }
132            })
133            .sum();
134
135        // Compressed size: dictionary + codes
136        let dict_size: usize = self.dictionary.iter().map(|s| s.len()).sum();
137        let codes_size = self.codes.len() * std::mem::size_of::<u32>();
138        let compressed_size = dict_size + codes_size;
139
140        if compressed_size == 0 {
141            return 1.0;
142        }
143
144        original_size as f64 / compressed_size as f64
145    }
146
147    /// Encodes a lookup value into a code, if it exists in the dictionary.
148    pub fn encode(&self, value: &str) -> Option<u32> {
149        self.dictionary
150            .iter()
151            .position(|s| s.as_ref() == value)
152            .map(|i| i as u32)
153    }
154
155    /// Filters the encoding to only include rows matching a predicate code.
156    pub fn filter_by_code(&self, predicate: impl Fn(u32) -> bool) -> Vec<usize> {
157        self.codes
158            .iter()
159            .enumerate()
160            .filter_map(|(i, &code)| {
161                if !self.is_null(i) && predicate(code) {
162                    Some(i)
163                } else {
164                    None
165                }
166            })
167            .collect()
168    }
169}
170
171/// Builder for creating dictionary encodings.
172#[derive(Debug)]
173pub struct DictionaryBuilder {
174    /// Map from string to code.
175    string_to_code: HashMap<Arc<str>, u32>,
176    /// Dictionary (code -> string).
177    dictionary: Vec<Arc<str>>,
178    /// Encoded values.
179    codes: Vec<u32>,
180    /// Null positions (for marking nulls).
181    null_positions: Vec<usize>,
182}
183
184impl DictionaryBuilder {
185    /// Creates a new dictionary builder.
186    pub fn new() -> Self {
187        Self {
188            string_to_code: HashMap::new(),
189            dictionary: Vec::new(),
190            codes: Vec::new(),
191            null_positions: Vec::new(),
192        }
193    }
194
195    /// Creates a new dictionary builder with estimated capacity.
196    pub fn with_capacity(value_capacity: usize, dictionary_capacity: usize) -> Self {
197        Self {
198            string_to_code: HashMap::with_capacity(dictionary_capacity),
199            dictionary: Vec::with_capacity(dictionary_capacity),
200            codes: Vec::with_capacity(value_capacity),
201            null_positions: Vec::new(),
202        }
203    }
204
205    /// Adds a string value to the encoding.
206    ///
207    /// Returns the code assigned to this value.
208    pub fn add(&mut self, value: &str) -> u32 {
209        if let Some(&code) = self.string_to_code.get(value) {
210            self.codes.push(code);
211            code
212        } else {
213            let code = self.dictionary.len() as u32;
214            let arc_value: Arc<str> = value.into();
215            self.string_to_code.insert(arc_value.clone(), code);
216            self.dictionary.push(arc_value);
217            self.codes.push(code);
218            code
219        }
220    }
221
222    /// Adds a null value.
223    pub fn add_null(&mut self) {
224        let idx = self.codes.len();
225        self.null_positions.push(idx);
226        self.codes.push(0); // Placeholder code
227    }
228
229    /// Adds an optional value.
230    pub fn add_optional(&mut self, value: Option<&str>) -> Option<u32> {
231        match value {
232            Some(v) => Some(self.add(v)),
233            None => {
234                self.add_null();
235                None
236            }
237        }
238    }
239
240    /// Returns the current number of values.
241    pub fn len(&self) -> usize {
242        self.codes.len()
243    }
244
245    /// Returns whether the builder is empty.
246    pub fn is_empty(&self) -> bool {
247        self.codes.is_empty()
248    }
249
250    /// Returns the current dictionary size.
251    pub fn dictionary_size(&self) -> usize {
252        self.dictionary.len()
253    }
254
255    /// Builds the dictionary encoding.
256    pub fn build(self) -> DictionaryEncoding {
257        let null_bitmap = if self.null_positions.is_empty() {
258            None
259        } else {
260            let num_words = (self.codes.len() + 63) / 64;
261            let mut bitmap = vec![0u64; num_words];
262            for &pos in &self.null_positions {
263                let word_idx = pos / 64;
264                let bit_idx = pos % 64;
265                bitmap[word_idx] |= 1 << bit_idx;
266            }
267            Some(bitmap)
268        };
269
270        let dict: Arc<[Arc<str>]> = self.dictionary.into();
271
272        let mut encoding = DictionaryEncoding::new(dict, self.codes);
273        if let Some(bitmap) = null_bitmap {
274            encoding = encoding.with_nulls(bitmap);
275        }
276        encoding
277    }
278
279    /// Clears the builder for reuse.
280    pub fn clear(&mut self) {
281        self.string_to_code.clear();
282        self.dictionary.clear();
283        self.codes.clear();
284        self.null_positions.clear();
285    }
286}
287
288impl Default for DictionaryBuilder {
289    fn default() -> Self {
290        Self::new()
291    }
292}
293
294/// Extension trait for building dictionary encodings from iterators.
295pub trait IntoDictionaryEncoding {
296    /// Creates a dictionary encoding from an iterator of strings.
297    fn into_dictionary_encoding(self) -> DictionaryEncoding;
298}
299
300impl<'a, I> IntoDictionaryEncoding for I
301where
302    I: IntoIterator<Item = &'a str>,
303{
304    fn into_dictionary_encoding(self) -> DictionaryEncoding {
305        let mut builder = DictionaryBuilder::new();
306        for s in self {
307            builder.add(s);
308        }
309        builder.build()
310    }
311}
312
313#[cfg(test)]
314mod tests {
315    use super::*;
316
317    #[test]
318    fn test_dictionary_builder_basic() {
319        let mut builder = DictionaryBuilder::new();
320        builder.add("apple");
321        builder.add("banana");
322        builder.add("apple");
323        builder.add("cherry");
324        builder.add("apple");
325
326        let dict = builder.build();
327
328        assert_eq!(dict.len(), 5);
329        assert_eq!(dict.dictionary_size(), 3);
330
331        assert_eq!(dict.get(0), Some("apple"));
332        assert_eq!(dict.get(1), Some("banana"));
333        assert_eq!(dict.get(2), Some("apple"));
334        assert_eq!(dict.get(3), Some("cherry"));
335        assert_eq!(dict.get(4), Some("apple"));
336    }
337
338    #[test]
339    fn test_dictionary_codes() {
340        let mut builder = DictionaryBuilder::new();
341        let code_apple = builder.add("apple");
342        let code_banana = builder.add("banana");
343        let code_apple2 = builder.add("apple");
344
345        assert_eq!(code_apple, code_apple2);
346        assert_ne!(code_apple, code_banana);
347
348        let dict = builder.build();
349        assert_eq!(dict.codes(), &[0, 1, 0]);
350    }
351
352    #[test]
353    fn test_dictionary_with_nulls() {
354        let mut builder = DictionaryBuilder::new();
355        builder.add("apple");
356        builder.add_null();
357        builder.add("banana");
358        builder.add_null();
359
360        let dict = builder.build();
361
362        assert_eq!(dict.len(), 4);
363        assert_eq!(dict.get(0), Some("apple"));
364        assert_eq!(dict.get(1), None);
365        assert!(dict.is_null(1));
366        assert_eq!(dict.get(2), Some("banana"));
367        assert_eq!(dict.get(3), None);
368        assert!(dict.is_null(3));
369    }
370
371    #[test]
372    fn test_dictionary_encode_lookup() {
373        let mut builder = DictionaryBuilder::new();
374        builder.add("apple");
375        builder.add("banana");
376        builder.add("cherry");
377
378        let dict = builder.build();
379
380        assert_eq!(dict.encode("apple"), Some(0));
381        assert_eq!(dict.encode("banana"), Some(1));
382        assert_eq!(dict.encode("cherry"), Some(2));
383        assert_eq!(dict.encode("date"), None);
384    }
385
386    #[test]
387    fn test_dictionary_filter_by_code() {
388        let mut builder = DictionaryBuilder::new();
389        builder.add("apple");
390        builder.add("banana");
391        builder.add("apple");
392        builder.add("cherry");
393        builder.add("apple");
394
395        let dict = builder.build();
396        let apple_code = dict.encode("apple").unwrap();
397
398        let indices = dict.filter_by_code(|code| code == apple_code);
399        assert_eq!(indices, vec![0, 2, 4]);
400    }
401
402    #[test]
403    fn test_compression_ratio() {
404        let mut builder = DictionaryBuilder::new();
405
406        // Add many repeated long strings
407        for _ in 0..100 {
408            builder.add("this_is_a_very_long_string_that_repeats_many_times");
409        }
410
411        let dict = builder.build();
412
413        // Compression ratio should be > 1 for highly repetitive data
414        let ratio = dict.compression_ratio();
415        assert!(ratio > 1.0, "Expected compression ratio > 1, got {}", ratio);
416    }
417
418    #[test]
419    fn test_into_dictionary_encoding() {
420        let strings = vec!["apple", "banana", "apple", "cherry"];
421        let dict: DictionaryEncoding = strings.into_iter().into_dictionary_encoding();
422
423        assert_eq!(dict.len(), 4);
424        assert_eq!(dict.dictionary_size(), 3);
425    }
426
427    #[test]
428    fn test_empty_dictionary() {
429        let builder = DictionaryBuilder::new();
430        let dict = builder.build();
431
432        assert!(dict.is_empty());
433        assert_eq!(dict.dictionary_size(), 0);
434        assert_eq!(dict.get(0), None);
435    }
436
437    #[test]
438    fn test_single_value() {
439        let mut builder = DictionaryBuilder::new();
440        builder.add("only_value");
441
442        let dict = builder.build();
443
444        assert_eq!(dict.len(), 1);
445        assert_eq!(dict.dictionary_size(), 1);
446        assert_eq!(dict.get(0), Some("only_value"));
447    }
448
449    #[test]
450    fn test_all_unique() {
451        let mut builder = DictionaryBuilder::new();
452        builder.add("a");
453        builder.add("b");
454        builder.add("c");
455        builder.add("d");
456
457        let dict = builder.build();
458
459        assert_eq!(dict.len(), 4);
460        assert_eq!(dict.dictionary_size(), 4);
461        assert_eq!(dict.codes(), &[0, 1, 2, 3]);
462    }
463
464    #[test]
465    fn test_all_same() {
466        let mut builder = DictionaryBuilder::new();
467        for _ in 0..10 {
468            builder.add("same");
469        }
470
471        let dict = builder.build();
472
473        assert_eq!(dict.len(), 10);
474        assert_eq!(dict.dictionary_size(), 1);
475        assert!(dict.codes().iter().all(|&c| c == 0));
476    }
477}