Skip to main content

cjc_data/
dict_encoding.rs

1//! Stable dictionary encoding for string columns.
2//!
3//! Uses `BTreeMap` for deterministic key ordering — the same input data always
4//! produces the same dictionary codes regardless of platform or run.
5
6use std::collections::BTreeMap;
7
8/// A dictionary-encoded string column.
9///
10/// Unique string values are assigned compact `u32` codes in **sorted order**
11/// (via `BTreeMap`), guaranteeing deterministic encoding across runs.
12#[derive(Debug, Clone)]
13pub struct DictEncoding {
14    /// Dictionary: sorted unique values -> compact u32 codes.
15    dict: BTreeMap<String, u32>,
16    /// Reverse lookup: code -> string (sorted order).
17    reverse: Vec<String>,
18    /// Encoded data: each row's string mapped to its u32 code.
19    codes: Vec<u32>,
20}
21
22impl DictEncoding {
23    /// Build a dictionary encoding from a string column.
24    ///
25    /// Unique values are sorted (BTreeMap natural ordering) and assigned
26    /// consecutive codes starting from 0.
27    pub fn encode(data: &[String]) -> Self {
28        // Collect unique values in sorted order via BTreeMap.
29        let mut unique: BTreeMap<String, u32> = BTreeMap::new();
30        for s in data {
31            unique.entry(s.clone()).or_insert(0);
32        }
33
34        // Assign consecutive codes in sorted order.
35        let mut reverse = Vec::with_capacity(unique.len());
36        for (i, (key, code)) in unique.iter_mut().enumerate() {
37            *code = i as u32;
38            reverse.push(key.clone());
39        }
40
41        // Encode the data.
42        let codes: Vec<u32> = data.iter().map(|s| unique[s]).collect();
43
44        DictEncoding {
45            dict: unique,
46            reverse,
47            codes,
48        }
49    }
50
51    /// Decode back to the original string values.
52    pub fn decode(&self) -> Vec<String> {
53        self.codes
54            .iter()
55            .map(|&c| self.reverse[c as usize].clone())
56            .collect()
57    }
58
59    /// Look up the code for a string value.
60    pub fn lookup(&self, value: &str) -> Option<u32> {
61        self.dict.get(value).copied()
62    }
63
64    /// Number of unique values in the dictionary.
65    pub fn cardinality(&self) -> usize {
66        self.reverse.len()
67    }
68
69    /// Get the encoded codes slice.
70    pub fn codes(&self) -> &[u32] {
71        &self.codes
72    }
73
74    /// Get the dictionary mapping (sorted).
75    pub fn dict(&self) -> &BTreeMap<String, u32> {
76        &self.dict
77    }
78
79    /// Get the reverse lookup table (code -> string).
80    pub fn reverse(&self) -> &[String] {
81        &self.reverse
82    }
83}
84
85#[cfg(test)]
86mod tests {
87    use super::*;
88
89    #[test]
90    fn test_roundtrip() {
91        let data: Vec<String> = vec!["banana", "apple", "cherry", "apple", "banana"]
92            .into_iter()
93            .map(String::from)
94            .collect();
95        let enc = DictEncoding::encode(&data);
96        assert_eq!(enc.decode(), data);
97    }
98
99    #[test]
100    fn test_sorted_codes() {
101        let data: Vec<String> = vec!["cherry", "apple", "banana"]
102            .into_iter()
103            .map(String::from)
104            .collect();
105        let enc = DictEncoding::encode(&data);
106        // BTreeMap sorts: apple=0, banana=1, cherry=2
107        assert_eq!(enc.lookup("apple"), Some(0));
108        assert_eq!(enc.lookup("banana"), Some(1));
109        assert_eq!(enc.lookup("cherry"), Some(2));
110        assert_eq!(enc.cardinality(), 3);
111    }
112
113    #[test]
114    fn test_empty() {
115        let enc = DictEncoding::encode(&[]);
116        assert_eq!(enc.cardinality(), 0);
117        assert!(enc.codes().is_empty());
118        assert!(enc.decode().is_empty());
119    }
120}