1use std::collections::{HashMap, HashSet};
26use std::hash::{Hash, Hasher};
27
28use thiserror::Error;
29
30use crate::registry::FieldId;
31
32#[derive(Debug, Clone, PartialEq, Eq)]
34pub enum StoredValue {
35 DictRef(u32),
37 Inline(Vec<u8>),
39}
40
41#[derive(Debug, Clone, PartialEq, Eq)]
43pub struct ValueDictionaryConfig {
44 pub low_cardinality_threshold: usize,
46 pub min_len_for_dictionary: usize,
48}
49
50impl Default for ValueDictionaryConfig {
51 fn default() -> Self {
52 Self {
53 low_cardinality_threshold: 10_000,
54 min_len_for_dictionary: 5,
55 }
56 }
57}
58
59#[derive(Debug, Error, PartialEq, Eq)]
61pub enum DictionaryError {
62 #[error("dictionary id space exhausted")]
64 DictionaryIdOverflow,
65 #[error("unknown dictionary id {0}")]
67 UnknownDictionaryId(u32),
68}
69
70#[derive(Debug, Default)]
72pub struct ValueDictionary {
73 config: ValueDictionaryConfig,
74 id_by_value: HashMap<Vec<u8>, u32>,
75 value_by_id: HashMap<u32, Vec<u8>>,
76 field_unique_hashes: HashMap<FieldId, HashSet<u64>>,
77 next_id: u64,
78}
79
80impl ValueDictionary {
81 #[must_use]
83 pub fn new(config: ValueDictionaryConfig) -> Self {
84 Self {
85 config,
86 ..Self::default()
87 }
88 }
89
90 pub fn encode(
96 &mut self,
97 field_id: FieldId,
98 value: &[u8],
99 ) -> Result<StoredValue, DictionaryError> {
100 let cardinality = self.record_and_estimate_cardinality(field_id, value);
101 if cardinality <= self.config.low_cardinality_threshold
102 && value.len() >= self.config.min_len_for_dictionary
103 {
104 let id = self.get_or_insert(value)?;
105 return Ok(StoredValue::DictRef(id));
106 }
107
108 Ok(StoredValue::Inline(value.to_vec()))
109 }
110
111 pub fn decode(&self, stored: &StoredValue) -> Result<Vec<u8>, DictionaryError> {
113 match stored {
114 StoredValue::DictRef(id) => self
115 .value_by_id
116 .get(id)
117 .cloned()
118 .ok_or(DictionaryError::UnknownDictionaryId(*id)),
119 StoredValue::Inline(bytes) => Ok(bytes.clone()),
120 }
121 }
122
123 #[must_use]
125 pub fn cardinality_estimate(&self, field_id: FieldId) -> usize {
126 self.field_unique_hashes
127 .get(&field_id)
128 .map_or(0, HashSet::len)
129 }
130
131 #[must_use]
133 pub fn len(&self) -> usize {
134 self.id_by_value.len()
135 }
136
137 #[must_use]
139 pub fn is_empty(&self) -> bool {
140 self.id_by_value.is_empty()
141 }
142
143 fn get_or_insert(&mut self, value: &[u8]) -> Result<u32, DictionaryError> {
144 if let Some(id) = self.id_by_value.get(value) {
145 return Ok(*id);
146 }
147
148 let id = u32::try_from(self.next_id).map_err(|_| DictionaryError::DictionaryIdOverflow)?;
149 self.next_id += 1;
150
151 let owned = value.to_vec();
152 self.id_by_value.insert(owned.clone(), id);
153 self.value_by_id.insert(id, owned);
154 Ok(id)
155 }
156
157 fn record_and_estimate_cardinality(&mut self, field_id: FieldId, value: &[u8]) -> usize {
158 let hash = stable_hash(value);
159 let set = self.field_unique_hashes.entry(field_id).or_default();
160 set.insert(hash);
161 set.len()
162 }
163}
164
165fn stable_hash<T: Hash>(value: T) -> u64 {
166 let mut hasher = std::collections::hash_map::DefaultHasher::new();
167 value.hash(&mut hasher);
168 hasher.finish()
169}
170
171#[cfg(test)]
172mod tests {
173 use super::*;
174
175 #[test]
176 fn low_cardinality_values_use_dictionary() {
177 let mut dict = ValueDictionary::new(ValueDictionaryConfig {
178 low_cardinality_threshold: 10,
179 min_len_for_dictionary: 5,
180 });
181
182 let first = dict.encode(1, b"accra").expect("encoding should succeed");
183 let second = dict.encode(1, b"accra").expect("encoding should succeed");
184
185 assert_eq!(first, second);
186 assert_eq!(dict.len(), 1);
187 }
188
189 #[test]
190 fn high_cardinality_values_fall_back_to_inline() {
191 let mut dict = ValueDictionary::new(ValueDictionaryConfig {
192 low_cardinality_threshold: 1,
193 min_len_for_dictionary: 3,
194 });
195
196 let first = dict.encode(7, b"one").expect("encoding should succeed");
197 let second = dict.encode(7, b"two").expect("encoding should succeed");
198
199 assert!(matches!(first, StoredValue::DictRef(_)));
200 assert_eq!(second, StoredValue::Inline(b"two".to_vec()));
201 }
202
203 #[test]
204 fn decode_round_trip() {
205 let mut dict = ValueDictionary::default();
206 let encoded = dict.encode(11, b"london").expect("encoding should succeed");
207 let decoded = dict.decode(&encoded).expect("decode should succeed");
208 assert_eq!(decoded, b"london".to_vec());
209 }
210}