// cjc_data/dict_encoding.rs
use std::collections::BTreeMap;
7
/// Dictionary encoding of a sequence of strings.
///
/// Each distinct input string is assigned a `u32` code equal to its rank in
/// ascending `Ord` (byte-wise lexicographic) order among the distinct values.
/// The encoded sequence stores one code per input element instead of the
/// string itself.
#[derive(Debug, Clone)]
pub struct DictEncoding {
    /// Distinct value -> code.
    dict: BTreeMap<String, u32>,
    /// Code -> distinct value; `reverse[c as usize]` is the string for code `c`.
    reverse: Vec<String>,
    /// One code per input element, in input order.
    codes: Vec<u32>,
}

impl DictEncoding {
    /// Builds a dictionary encoding of `data`.
    ///
    /// Codes are assigned in sorted order of the distinct values, so the
    /// smallest string gets code `0`, the next one code `1`, and so on.
    /// An empty `data` yields an empty encoding.
    ///
    /// # Panics
    ///
    /// Panics if the number of distinct values does not fit the `u32` code
    /// space (instead of silently wrapping codes around).
    pub fn encode(data: &[String]) -> Self {
        // `TryFrom` only joined the prelude in edition 2021; importing it
        // locally keeps this function independent of the crate's edition.
        use std::convert::TryFrom;

        // Sort and dedup borrowed views first, so that only the distinct
        // values are ever cloned rather than every input element.
        let mut distinct: Vec<&str> = data.iter().map(String::as_str).collect();
        distinct.sort_unstable();
        distinct.dedup();

        let reverse: Vec<String> = distinct.iter().map(|&value| value.to_owned()).collect();

        let dict: BTreeMap<String, u32> = distinct
            .iter()
            .enumerate()
            .map(|(rank, &value)| {
                let code = u32::try_from(rank)
                    .expect("DictEncoding: distinct value count exceeds the u32 code space");
                (value.to_owned(), code)
            })
            .collect();

        // Every element of `data` was registered above, so indexing cannot panic.
        let codes: Vec<u32> = data.iter().map(|value| dict[value.as_str()]).collect();

        DictEncoding {
            dict,
            reverse,
            codes,
        }
    }

    /// Reconstructs the original sequence of strings, in input order.
    pub fn decode(&self) -> Vec<String> {
        self.codes
            .iter()
            .map(|&code| self.reverse[code as usize].clone())
            .collect()
    }

    /// Returns the code assigned to `value`, or `None` if `value` did not
    /// occur in the encoded data.
    pub fn lookup(&self, value: &str) -> Option<u32> {
        self.dict.get(value).copied()
    }

    /// Returns the number of distinct values in the dictionary.
    pub fn cardinality(&self) -> usize {
        self.reverse.len()
    }

    /// Returns the encoded sequence: one code per input element, in input order.
    pub fn codes(&self) -> &[u32] {
        &self.codes
    }

    /// Returns the value -> code mapping.
    pub fn dict(&self) -> &BTreeMap<String, u32> {
        &self.dict
    }

    /// Returns the code -> value table; the entry at index `c` is the string
    /// for code `c`.
    pub fn reverse(&self) -> &[String] {
        &self.reverse
    }
}
84
#[cfg(test)]
mod tests {
    use super::*;

    /// Converts string literals into the owned `Vec<String>` that `encode` takes.
    fn owned(values: &[&str]) -> Vec<String> {
        values.iter().map(|value| value.to_string()).collect()
    }

    /// Decoding an encoding must give back the input, duplicates included.
    #[test]
    fn test_roundtrip() {
        let data = owned(&["banana", "apple", "cherry", "apple", "banana"]);
        let enc = DictEncoding::encode(&data);
        assert_eq!(enc.decode(), data);
    }

    /// Codes are ranks in sorted order, regardless of input order.
    #[test]
    fn test_sorted_codes() {
        let data = owned(&["cherry", "apple", "banana"]);
        let enc = DictEncoding::encode(&data);
        assert_eq!(enc.lookup("apple"), Some(0));
        assert_eq!(enc.lookup("banana"), Some(1));
        assert_eq!(enc.lookup("cherry"), Some(2));
        assert_eq!(enc.cardinality(), 3);
    }

    /// The code sequence follows input order and repeats codes for duplicates.
    #[test]
    fn test_codes_follow_input_order() {
        let data = owned(&["banana", "apple", "cherry", "apple", "banana"]);
        let enc = DictEncoding::encode(&data);
        assert_eq!(enc.codes().to_vec(), vec![1u32, 0, 2, 0, 1]);
        assert_eq!(enc.reverse().to_vec(), owned(&["apple", "banana", "cherry"]));
        assert_eq!(enc.cardinality(), 3);
    }

    /// `dict` and `reverse` must describe the same mapping in both directions.
    #[test]
    fn test_dict_and_reverse_agree() {
        let data = owned(&["pear", "fig", "pear", "kiwi"]);
        let enc = DictEncoding::encode(&data);
        assert_eq!(enc.dict().len(), enc.reverse().len());
        for (value, &code) in enc.dict() {
            assert_eq!(&enc.reverse()[code as usize], value);
        }
    }

    /// Looking up a value that was never encoded yields `None`.
    #[test]
    fn test_lookup_missing() {
        let data = owned(&["apple", "banana"]);
        let enc = DictEncoding::encode(&data);
        assert_eq!(enc.lookup("cherry"), None);
        assert_eq!(enc.lookup(""), None);
    }

    /// Empty input produces an empty encoding.
    #[test]
    fn test_empty() {
        let enc = DictEncoding::encode(&[]);
        assert_eq!(enc.cardinality(), 0);
        assert!(enc.codes().is_empty());
        assert!(enc.decode().is_empty());
    }
}