Skip to main content

oxirs_tsdb/compression/
dictionary.rs

1//! Dictionary compression for categorical time-series data
2//!
3//! Maps repeated string values to compact integer codes, enabling efficient
4//! storage of label / enumeration series (device states, alert levels, etc.).
5//!
6//! ## Design
7//!
8//! * A **dictionary** maps each unique string value to a `u32` code.
9//! * The encoded data stream stores `(timestamp_ms, code)` pairs.
10//! * The [`DictionaryBlock`] bundles dictionary + encoded stream and supports
11//!   efficient random-access lookup by timestamp.
12//!
13//! ## Compression characteristics
14//!
15//! If there are `V` distinct values and `N` total samples, storage is roughly:
16//!
17//! ```text
18//! V × avg_string_len  +  N × 4 bytes  (vs N × avg_string_len uncompressed)
19//! ```
20//!
21//! This is especially effective when `V << N`.
22
23use crate::error::{TsdbError, TsdbResult};
24use serde::{Deserialize, Serialize};
25use std::collections::HashMap;
26
27// ---------------------------------------------------------------------------
28// DictionaryEncoder
29// ---------------------------------------------------------------------------
30
31/// Streaming encoder for `(timestamp_ms, string_value)` pairs.
32///
33/// Assign compact `u32` codes to unique string values and build a
34/// [`DictionaryBlock`] via [`finish`](Self::finish).
35pub struct DictionaryEncoder {
36    /// Forward map: string value → code
37    dictionary: HashMap<String, u32>,
38    /// Reverse map: code → string value (index == code)
39    reverse: Vec<String>,
40    /// Encoded `(timestamp_ms, code)` pairs
41    encoded: Vec<(i64, u32)>,
42}
43
44impl DictionaryEncoder {
45    /// Create a new, empty encoder.
46    pub fn new() -> Self {
47        Self {
48            dictionary: HashMap::new(),
49            reverse: Vec::new(),
50            encoded: Vec::new(),
51        }
52    }
53
54    /// Encode a single `(timestamp_ms, value)` sample.
55    ///
56    /// Returns the assigned code for `value`.
57    pub fn encode(&mut self, timestamp: i64, value: &str) -> TsdbResult<u32> {
58        let code = match self.dictionary.get(value) {
59            Some(&c) => c,
60            None => {
61                let next_code = self.reverse.len() as u32;
62                if next_code == u32::MAX {
63                    return Err(TsdbError::Compression(
64                        "Dictionary overflow: more than u32::MAX distinct values".to_string(),
65                    ));
66                }
67                self.dictionary.insert(value.to_owned(), next_code);
68                self.reverse.push(value.to_owned());
69                next_code
70            }
71        };
72        self.encoded.push((timestamp, code));
73        Ok(code)
74    }
75
76    /// Consume the encoder and produce a [`DictionaryBlock`].
77    pub fn finish(self) -> DictionaryBlock {
78        DictionaryBlock {
79            dictionary: self.reverse,
80            encoded: self.encoded,
81        }
82    }
83
84    /// Number of distinct values seen so far.
85    pub fn cardinality(&self) -> usize {
86        self.reverse.len()
87    }
88}
89
90impl Default for DictionaryEncoder {
91    fn default() -> Self {
92        Self::new()
93    }
94}
95
96// ---------------------------------------------------------------------------
97// DictionaryBlock
98// ---------------------------------------------------------------------------
99
100/// A self-contained, serialisable dictionary-compressed block.
101#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
102pub struct DictionaryBlock {
103    /// Reverse dictionary: `dictionary[code]` → string value.
104    pub dictionary: Vec<String>,
105    /// Encoded `(timestamp_ms, code)` pairs.
106    pub encoded: Vec<(i64, u32)>,
107}
108
109impl DictionaryBlock {
110    /// Build a block from a slice of `(timestamp_ms, &str)` pairs.
111    pub fn from_data(data: &[(i64, &str)]) -> TsdbResult<Self> {
112        let mut enc = DictionaryEncoder::new();
113        for &(ts, val) in data {
114            enc.encode(ts, val)?;
115        }
116        Ok(enc.finish())
117    }
118
119    /// Decode all `(timestamp_ms, value_str)` pairs.
120    pub fn decode(&self) -> TsdbResult<Vec<(i64, &str)>> {
121        self.encoded
122            .iter()
123            .map(|&(ts, code)| {
124                self.lookup_code(code).map(|s| (ts, s)).ok_or_else(|| {
125                    TsdbError::Decompression(format!("Dictionary: unknown code {}", code))
126                })
127            })
128            .collect()
129    }
130
131    /// Decode all samples into owned `String` values.
132    pub fn decode_owned(&self) -> TsdbResult<Vec<(i64, String)>> {
133        self.encoded
134            .iter()
135            .map(|&(ts, code)| {
136                self.lookup_code(code)
137                    .map(|s| (ts, s.to_owned()))
138                    .ok_or_else(|| {
139                        TsdbError::Decompression(format!("Dictionary: unknown code {}", code))
140                    })
141            })
142            .collect()
143    }
144
145    /// Look up the string for a given code.
146    ///
147    /// Returns `None` if the code is out of range.
148    pub fn lookup_code(&self, code: u32) -> Option<&str> {
149        self.dictionary.get(code as usize).map(|s| s.as_str())
150    }
151
152    /// Find the code assigned to `value`, or `None` if it is not in the
153    /// dictionary.
154    pub fn find_code(&self, value: &str) -> Option<u32> {
155        self.dictionary
156            .iter()
157            .position(|s| s == value)
158            .map(|i| i as u32)
159    }
160
161    /// Return the string value at the given `timestamp_ms`, or `None` if no
162    /// sample exists at that exact timestamp.
163    pub fn get_value(&self, timestamp: i64) -> Option<&str> {
164        // Linear search – for large blocks callers should use `decode()` and
165        // build their own index.
166        let code = self
167            .encoded
168            .iter()
169            .find(|&&(ts, _)| ts == timestamp)
170            .map(|&(_, code)| code)?;
171        self.lookup_code(code)
172    }
173
174    /// Number of distinct values in the dictionary.
175    pub fn cardinality(&self) -> usize {
176        self.dictionary.len()
177    }
178
179    /// Total number of encoded samples.
180    pub fn len(&self) -> usize {
181        self.encoded.len()
182    }
183
184    /// Returns `true` if the block contains no samples.
185    pub fn is_empty(&self) -> bool {
186        self.encoded.is_empty()
187    }
188
189    /// Approximate compression ratio vs. storing raw strings.
190    ///
191    /// Assumes average string length of 10 bytes and 8-byte timestamp.
192    pub fn compression_ratio(&self, avg_string_len: usize) -> f64 {
193        let original = self.encoded.len() * (8 + avg_string_len);
194        // Dictionary size: sum of string lengths + 4 bytes for pointer
195        let dict_bytes: usize = self.dictionary.iter().map(|s| s.len() + 4).sum();
196        let encoded_bytes = self.encoded.len() * 12 + dict_bytes; // (i64, u32) = 12 bytes
197        if encoded_bytes == 0 {
198            1.0
199        } else {
200            original as f64 / encoded_bytes as f64
201        }
202    }
203
204    /// Filter encoded samples to a timestamp range `[start, end]` (inclusive).
205    pub fn filter_range(&self, start: i64, end: i64) -> TsdbResult<Vec<(i64, &str)>> {
206        self.encoded
207            .iter()
208            .filter(|&&(ts, _)| ts >= start && ts <= end)
209            .map(|&(ts, code)| {
210                self.lookup_code(code).map(|s| (ts, s)).ok_or_else(|| {
211                    TsdbError::Decompression(format!(
212                        "Dictionary: unknown code {} at ts {}",
213                        code, ts
214                    ))
215                })
216            })
217            .collect()
218    }
219
220    /// Return the frequency of each distinct value as `(value, count)` pairs,
221    /// sorted by count descending.
222    pub fn value_frequencies(&self) -> Vec<(&str, usize)> {
223        let mut counts: Vec<usize> = vec![0; self.dictionary.len()];
224        for &(_, code) in &self.encoded {
225            if (code as usize) < counts.len() {
226                counts[code as usize] += 1;
227            }
228        }
229        let mut freq: Vec<(&str, usize)> = self
230            .dictionary
231            .iter()
232            .enumerate()
233            .map(|(i, s)| (s.as_str(), counts[i]))
234            .collect();
235        freq.sort_by(|a, b| b.1.cmp(&a.1));
236        freq
237    }
238}
239
240// ---------------------------------------------------------------------------
241// Convenience functions
242// ---------------------------------------------------------------------------
243
244/// Encode a slice of `(timestamp_ms, &str)` pairs into a [`DictionaryBlock`].
245pub fn dict_encode(data: &[(i64, &str)]) -> TsdbResult<DictionaryBlock> {
246    DictionaryBlock::from_data(data)
247}
248
249/// Decode a [`DictionaryBlock`] into `(timestamp_ms, String)` pairs.
250pub fn dict_decode(block: &DictionaryBlock) -> TsdbResult<Vec<(i64, String)>> {
251    block.decode_owned()
252}
253
254// ---------------------------------------------------------------------------
255// Tests
256// ---------------------------------------------------------------------------
257
258#[cfg(test)]
259mod tests {
260    use super::*;
261
262    #[test]
263    fn test_empty_encode_decode() {
264        let block = DictionaryBlock::from_data(&[]).expect("build block");
265        assert!(block.is_empty());
266        assert_eq!(block.cardinality(), 0);
267        let decoded = block.decode().expect("decode");
268        assert!(decoded.is_empty());
269    }
270
271    #[test]
272    fn test_single_sample() {
273        let data = vec![(1000i64, "on")];
274        let block = DictionaryBlock::from_data(&data).expect("build");
275        assert_eq!(block.len(), 1);
276        assert_eq!(block.cardinality(), 1);
277        let decoded = block.decode().expect("decode");
278        assert_eq!(decoded, vec![(1000, "on")]);
279    }
280
281    #[test]
282    fn test_round_trip_multiple_values() {
283        let data = vec![
284            (0i64, "idle"),
285            (1000, "running"),
286            (2000, "running"),
287            (3000, "idle"),
288            (4000, "error"),
289            (5000, "idle"),
290            (6000, "running"),
291        ];
292        let block = DictionaryBlock::from_data(&data).expect("build");
293        // Only 3 distinct values
294        assert_eq!(block.cardinality(), 3);
295        let decoded = block.decode().expect("decode");
296        assert_eq!(
297            decoded,
298            data.iter().map(|&(t, s)| (t, s)).collect::<Vec<_>>()
299        );
300    }
301
302    #[test]
303    fn test_find_code_and_lookup() {
304        let data = vec![(0i64, "alpha"), (1000, "beta"), (2000, "alpha")];
305        let block = DictionaryBlock::from_data(&data).expect("build");
306        let code_alpha = block.find_code("alpha").expect("alpha should exist");
307        let code_beta = block.find_code("beta").expect("beta should exist");
308        assert_ne!(code_alpha, code_beta);
309        assert_eq!(block.lookup_code(code_alpha), Some("alpha"));
310        assert_eq!(block.lookup_code(code_beta), Some("beta"));
311        assert!(block.find_code("gamma").is_none());
312    }
313
314    #[test]
315    fn test_get_value_by_timestamp() {
316        let data = vec![(0i64, "off"), (1000, "on"), (2000, "standby")];
317        let block = DictionaryBlock::from_data(&data).expect("build");
318        assert_eq!(block.get_value(0), Some("off"));
319        assert_eq!(block.get_value(1000), Some("on"));
320        assert_eq!(block.get_value(2000), Some("standby"));
321        assert_eq!(block.get_value(9999), None);
322    }
323
324    #[test]
325    fn test_filter_range() {
326        let data = vec![
327            (0i64, "a"),
328            (1000, "b"),
329            (2000, "c"),
330            (3000, "d"),
331            (4000, "e"),
332        ];
333        let block = DictionaryBlock::from_data(&data).expect("build");
334        let filtered = block.filter_range(1000, 3000).expect("filter");
335        assert_eq!(filtered.len(), 3);
336        assert_eq!(filtered[0], (1000, "b"));
337        assert_eq!(filtered[1], (2000, "c"));
338        assert_eq!(filtered[2], (3000, "d"));
339    }
340
341    #[test]
342    fn test_value_frequencies() {
343        let data = vec![
344            (0i64, "a"),
345            (1000, "b"),
346            (2000, "a"),
347            (3000, "a"),
348            (4000, "c"),
349            (5000, "b"),
350        ];
351        let block = DictionaryBlock::from_data(&data).expect("build");
352        let freq = block.value_frequencies();
353        // "a" appears 3 times → should be first
354        assert_eq!(freq[0].0, "a");
355        assert_eq!(freq[0].1, 3);
356        // "b" appears 2 times → second
357        assert_eq!(freq[1].0, "b");
358        assert_eq!(freq[1].1, 2);
359        // "c" appears 1 time → third
360        assert_eq!(freq[2].0, "c");
361        assert_eq!(freq[2].1, 1);
362    }
363
364    #[test]
365    fn test_compression_ratio_high_repetition() {
366        // 1000 samples with only 2 distinct values (16 char average)
367        let data: Vec<(i64, &str)> = (0..1000i64)
368            .map(|i| {
369                (
370                    i * 1000,
371                    if i % 2 == 0 {
372                        "device_status_on"
373                    } else {
374                        "device_status_off"
375                    },
376                )
377            })
378            .collect();
379        let block = DictionaryBlock::from_data(&data).expect("build");
380        assert_eq!(block.cardinality(), 2);
381        let ratio = block.compression_ratio(16);
382        assert!(ratio > 1.0, "should be compressed: {:.2}", ratio);
383    }
384
385    #[test]
386    fn test_owned_decode() {
387        let data = vec![(0i64, "hello"), (1000, "world")];
388        let block = DictionaryBlock::from_data(&data).expect("build");
389        let decoded = dict_decode(&block).expect("decode");
390        assert_eq!(
391            decoded,
392            vec![(0, "hello".to_string()), (1000, "world".to_string())]
393        );
394    }
395
396    #[test]
397    fn test_encoder_encode_returns_code() {
398        let mut enc = DictionaryEncoder::new();
399        let code_a = enc.encode(0, "a").expect("ok");
400        let code_b = enc.encode(1000, "b").expect("ok");
401        let code_a2 = enc.encode(2000, "a").expect("ok");
402        // Same value should return same code
403        assert_eq!(code_a, code_a2);
404        assert_ne!(code_a, code_b);
405        assert_eq!(enc.cardinality(), 2);
406    }
407
408    #[test]
409    fn test_all_unique_values() {
410        let data: Vec<(i64, String)> = (0..100).map(|i| (i as i64, format!("val_{}", i))).collect();
411        let refs: Vec<(i64, &str)> = data.iter().map(|(t, s)| (*t, s.as_str())).collect();
412        let block = DictionaryBlock::from_data(&refs).expect("build");
413        assert_eq!(block.cardinality(), 100);
414        assert_eq!(block.len(), 100);
415        let decoded = block.decode().expect("decode");
416        for (i, (ts, val)) in decoded.iter().enumerate() {
417            assert_eq!(*ts, i as i64);
418            assert_eq!(*val, format!("val_{}", i));
419        }
420    }
421
422    #[test]
423    fn test_empty_string_value() {
424        let data = vec![(0i64, ""), (1000, "nonempty"), (2000, "")];
425        let block = DictionaryBlock::from_data(&data).expect("build");
426        assert_eq!(block.cardinality(), 2);
427        let decoded = block.decode().expect("decode");
428        assert_eq!(decoded, vec![(0, ""), (1000, "nonempty"), (2000, "")]);
429    }
430}