Skip to main content

kora_doc/
dictionary.rs

1//! Dictionary encoding for low-cardinality string field values.
2//!
3//! When a field's unique value count stays below a configurable threshold,
4//! storing the same string bytes in every packed document wastes space.
5//! [`ValueDictionary`] assigns a compact `u32` dictionary ID to each unique
6//! byte sequence and returns a [`StoredValue::DictRef`] instead of inlining
7//! the full payload. High-cardinality fields (or values shorter than
8//! `min_len_for_dictionary`) fall back to [`StoredValue::Inline`].
9//!
10//! ## Cardinality Estimation
11//!
12//! Per-field cardinality is tracked with a `HashSet<u64>` of value hashes,
13//! giving exact counts up to the threshold and constant-space tracking
14//! thereafter. Once the cardinality of a field exceeds
15//! `low_cardinality_threshold`, all subsequent values for that field are
16//! stored inline.
17//!
18//! ## Encoding / Decoding
19//!
20//! - **Encode:** `encode(field_id, value_bytes)` returns `DictRef(id)` or
21//!   `Inline(bytes)`.
22//! - **Decode:** `decode(stored)` maps `DictRef(id)` back to owned bytes, or
23//!   clones `Inline(bytes)` directly.
24
25use std::collections::{HashMap, HashSet};
26use std::hash::{Hash, Hasher};
27
28use thiserror::Error;
29
30use crate::registry::FieldId;
31
32/// Stored field value representation after dictionary encoding.
33#[derive(Debug, Clone, PartialEq, Eq)]
34pub enum StoredValue {
35    /// Reference to a dictionary value.
36    DictRef(u32),
37    /// Inline bytes stored directly in the packed payload.
38    Inline(Vec<u8>),
39}
40
41/// Value dictionary tuning parameters.
42#[derive(Debug, Clone, PartialEq, Eq)]
43pub struct ValueDictionaryConfig {
44    /// Maximum unique cardinality per field before dictionary encoding is disabled.
45    pub low_cardinality_threshold: usize,
46    /// Minimum byte length before dictionary encoding is considered.
47    pub min_len_for_dictionary: usize,
48}
49
50impl Default for ValueDictionaryConfig {
51    fn default() -> Self {
52        Self {
53            low_cardinality_threshold: 10_000,
54            min_len_for_dictionary: 5,
55        }
56    }
57}
58
59/// Errors returned by dictionary operations.
60#[derive(Debug, Error, PartialEq, Eq)]
61pub enum DictionaryError {
62    /// Dictionary ID space is exhausted.
63    #[error("dictionary id space exhausted")]
64    DictionaryIdOverflow,
65    /// The provided dictionary reference does not exist.
66    #[error("unknown dictionary id {0}")]
67    UnknownDictionaryId(u32),
68}
69
70/// Per-collection value dictionary.
71#[derive(Debug, Default)]
72pub struct ValueDictionary {
73    config: ValueDictionaryConfig,
74    id_by_value: HashMap<Vec<u8>, u32>,
75    value_by_id: HashMap<u32, Vec<u8>>,
76    field_unique_hashes: HashMap<FieldId, HashSet<u64>>,
77    next_id: u64,
78}
79
80impl ValueDictionary {
81    /// Create a dictionary with custom configuration.
82    #[must_use]
83    pub fn new(config: ValueDictionaryConfig) -> Self {
84        Self {
85            config,
86            ..Self::default()
87        }
88    }
89
90    /// Encode a value for a field.
91    ///
92    /// Values are dictionary-encoded only when:
93    /// - observed field cardinality is below `low_cardinality_threshold`, and
94    /// - value length is at least `min_len_for_dictionary`.
95    pub fn encode(
96        &mut self,
97        field_id: FieldId,
98        value: &[u8],
99    ) -> Result<StoredValue, DictionaryError> {
100        let cardinality = self.record_and_estimate_cardinality(field_id, value);
101        if cardinality <= self.config.low_cardinality_threshold
102            && value.len() >= self.config.min_len_for_dictionary
103        {
104            let id = self.get_or_insert(value)?;
105            return Ok(StoredValue::DictRef(id));
106        }
107
108        Ok(StoredValue::Inline(value.to_vec()))
109    }
110
111    /// Decode a stored value back to owned bytes.
112    pub fn decode(&self, stored: &StoredValue) -> Result<Vec<u8>, DictionaryError> {
113        match stored {
114            StoredValue::DictRef(id) => self
115                .value_by_id
116                .get(id)
117                .cloned()
118                .ok_or(DictionaryError::UnknownDictionaryId(*id)),
119            StoredValue::Inline(bytes) => Ok(bytes.clone()),
120        }
121    }
122
123    /// Return the configured cardinality estimate for a field.
124    #[must_use]
125    pub fn cardinality_estimate(&self, field_id: FieldId) -> usize {
126        self.field_unique_hashes
127            .get(&field_id)
128            .map_or(0, HashSet::len)
129    }
130
131    /// Number of unique values stored in the dictionary.
132    #[must_use]
133    pub fn len(&self) -> usize {
134        self.id_by_value.len()
135    }
136
137    /// Returns true when the dictionary has no values.
138    #[must_use]
139    pub fn is_empty(&self) -> bool {
140        self.id_by_value.is_empty()
141    }
142
143    fn get_or_insert(&mut self, value: &[u8]) -> Result<u32, DictionaryError> {
144        if let Some(id) = self.id_by_value.get(value) {
145            return Ok(*id);
146        }
147
148        let id = u32::try_from(self.next_id).map_err(|_| DictionaryError::DictionaryIdOverflow)?;
149        self.next_id += 1;
150
151        let owned = value.to_vec();
152        self.id_by_value.insert(owned.clone(), id);
153        self.value_by_id.insert(id, owned);
154        Ok(id)
155    }
156
157    fn record_and_estimate_cardinality(&mut self, field_id: FieldId, value: &[u8]) -> usize {
158        let hash = stable_hash(value);
159        let set = self.field_unique_hashes.entry(field_id).or_default();
160        set.insert(hash);
161        set.len()
162    }
163}
164
165fn stable_hash<T: Hash>(value: T) -> u64 {
166    let mut hasher = std::collections::hash_map::DefaultHasher::new();
167    value.hash(&mut hasher);
168    hasher.finish()
169}
170
171#[cfg(test)]
172mod tests {
173    use super::*;
174
175    #[test]
176    fn low_cardinality_values_use_dictionary() {
177        let mut dict = ValueDictionary::new(ValueDictionaryConfig {
178            low_cardinality_threshold: 10,
179            min_len_for_dictionary: 5,
180        });
181
182        let first = dict.encode(1, b"accra").expect("encoding should succeed");
183        let second = dict.encode(1, b"accra").expect("encoding should succeed");
184
185        assert_eq!(first, second);
186        assert_eq!(dict.len(), 1);
187    }
188
189    #[test]
190    fn high_cardinality_values_fall_back_to_inline() {
191        let mut dict = ValueDictionary::new(ValueDictionaryConfig {
192            low_cardinality_threshold: 1,
193            min_len_for_dictionary: 3,
194        });
195
196        let first = dict.encode(7, b"one").expect("encoding should succeed");
197        let second = dict.encode(7, b"two").expect("encoding should succeed");
198
199        assert!(matches!(first, StoredValue::DictRef(_)));
200        assert_eq!(second, StoredValue::Inline(b"two".to_vec()));
201    }
202
203    #[test]
204    fn decode_round_trip() {
205        let mut dict = ValueDictionary::default();
206        let encoded = dict.encode(11, b"london").expect("encoding should succeed");
207        let decoded = dict.decode(&encoded).expect("decode should succeed");
208        assert_eq!(decoded, b"london".to_vec());
209    }
210}