1use crate::error::{TsdbError, TsdbResult};
24use serde::{Deserialize, Serialize};
25use std::collections::HashMap;
26
27pub struct DictionaryEncoder {
36 dictionary: HashMap<String, u32>,
38 reverse: Vec<String>,
40 encoded: Vec<(i64, u32)>,
42}
43
44impl DictionaryEncoder {
45 pub fn new() -> Self {
47 Self {
48 dictionary: HashMap::new(),
49 reverse: Vec::new(),
50 encoded: Vec::new(),
51 }
52 }
53
54 pub fn encode(&mut self, timestamp: i64, value: &str) -> TsdbResult<u32> {
58 let code = match self.dictionary.get(value) {
59 Some(&c) => c,
60 None => {
61 let next_code = self.reverse.len() as u32;
62 if next_code == u32::MAX {
63 return Err(TsdbError::Compression(
64 "Dictionary overflow: more than u32::MAX distinct values".to_string(),
65 ));
66 }
67 self.dictionary.insert(value.to_owned(), next_code);
68 self.reverse.push(value.to_owned());
69 next_code
70 }
71 };
72 self.encoded.push((timestamp, code));
73 Ok(code)
74 }
75
76 pub fn finish(self) -> DictionaryBlock {
78 DictionaryBlock {
79 dictionary: self.reverse,
80 encoded: self.encoded,
81 }
82 }
83
84 pub fn cardinality(&self) -> usize {
86 self.reverse.len()
87 }
88}
89
90impl Default for DictionaryEncoder {
91 fn default() -> Self {
92 Self::new()
93 }
94}
95
96#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
102pub struct DictionaryBlock {
103 pub dictionary: Vec<String>,
105 pub encoded: Vec<(i64, u32)>,
107}
108
109impl DictionaryBlock {
110 pub fn from_data(data: &[(i64, &str)]) -> TsdbResult<Self> {
112 let mut enc = DictionaryEncoder::new();
113 for &(ts, val) in data {
114 enc.encode(ts, val)?;
115 }
116 Ok(enc.finish())
117 }
118
119 pub fn decode(&self) -> TsdbResult<Vec<(i64, &str)>> {
121 self.encoded
122 .iter()
123 .map(|&(ts, code)| {
124 self.lookup_code(code).map(|s| (ts, s)).ok_or_else(|| {
125 TsdbError::Decompression(format!("Dictionary: unknown code {}", code))
126 })
127 })
128 .collect()
129 }
130
131 pub fn decode_owned(&self) -> TsdbResult<Vec<(i64, String)>> {
133 self.encoded
134 .iter()
135 .map(|&(ts, code)| {
136 self.lookup_code(code)
137 .map(|s| (ts, s.to_owned()))
138 .ok_or_else(|| {
139 TsdbError::Decompression(format!("Dictionary: unknown code {}", code))
140 })
141 })
142 .collect()
143 }
144
145 pub fn lookup_code(&self, code: u32) -> Option<&str> {
149 self.dictionary.get(code as usize).map(|s| s.as_str())
150 }
151
152 pub fn find_code(&self, value: &str) -> Option<u32> {
155 self.dictionary
156 .iter()
157 .position(|s| s == value)
158 .map(|i| i as u32)
159 }
160
161 pub fn get_value(&self, timestamp: i64) -> Option<&str> {
164 let code = self
167 .encoded
168 .iter()
169 .find(|&&(ts, _)| ts == timestamp)
170 .map(|&(_, code)| code)?;
171 self.lookup_code(code)
172 }
173
174 pub fn cardinality(&self) -> usize {
176 self.dictionary.len()
177 }
178
179 pub fn len(&self) -> usize {
181 self.encoded.len()
182 }
183
184 pub fn is_empty(&self) -> bool {
186 self.encoded.is_empty()
187 }
188
189 pub fn compression_ratio(&self, avg_string_len: usize) -> f64 {
193 let original = self.encoded.len() * (8 + avg_string_len);
194 let dict_bytes: usize = self.dictionary.iter().map(|s| s.len() + 4).sum();
196 let encoded_bytes = self.encoded.len() * 12 + dict_bytes; if encoded_bytes == 0 {
198 1.0
199 } else {
200 original as f64 / encoded_bytes as f64
201 }
202 }
203
204 pub fn filter_range(&self, start: i64, end: i64) -> TsdbResult<Vec<(i64, &str)>> {
206 self.encoded
207 .iter()
208 .filter(|&&(ts, _)| ts >= start && ts <= end)
209 .map(|&(ts, code)| {
210 self.lookup_code(code).map(|s| (ts, s)).ok_or_else(|| {
211 TsdbError::Decompression(format!(
212 "Dictionary: unknown code {} at ts {}",
213 code, ts
214 ))
215 })
216 })
217 .collect()
218 }
219
220 pub fn value_frequencies(&self) -> Vec<(&str, usize)> {
223 let mut counts: Vec<usize> = vec![0; self.dictionary.len()];
224 for &(_, code) in &self.encoded {
225 if (code as usize) < counts.len() {
226 counts[code as usize] += 1;
227 }
228 }
229 let mut freq: Vec<(&str, usize)> = self
230 .dictionary
231 .iter()
232 .enumerate()
233 .map(|(i, s)| (s.as_str(), counts[i]))
234 .collect();
235 freq.sort_by(|a, b| b.1.cmp(&a.1));
236 freq
237 }
238}
239
240pub fn dict_encode(data: &[(i64, &str)]) -> TsdbResult<DictionaryBlock> {
246 DictionaryBlock::from_data(data)
247}
248
249pub fn dict_decode(block: &DictionaryBlock) -> TsdbResult<Vec<(i64, String)>> {
251 block.decode_owned()
252}
253
254#[cfg(test)]
259mod tests {
260 use super::*;
261
262 #[test]
263 fn test_empty_encode_decode() {
264 let block = DictionaryBlock::from_data(&[]).expect("build block");
265 assert!(block.is_empty());
266 assert_eq!(block.cardinality(), 0);
267 let decoded = block.decode().expect("decode");
268 assert!(decoded.is_empty());
269 }
270
271 #[test]
272 fn test_single_sample() {
273 let data = vec![(1000i64, "on")];
274 let block = DictionaryBlock::from_data(&data).expect("build");
275 assert_eq!(block.len(), 1);
276 assert_eq!(block.cardinality(), 1);
277 let decoded = block.decode().expect("decode");
278 assert_eq!(decoded, vec![(1000, "on")]);
279 }
280
281 #[test]
282 fn test_round_trip_multiple_values() {
283 let data = vec![
284 (0i64, "idle"),
285 (1000, "running"),
286 (2000, "running"),
287 (3000, "idle"),
288 (4000, "error"),
289 (5000, "idle"),
290 (6000, "running"),
291 ];
292 let block = DictionaryBlock::from_data(&data).expect("build");
293 assert_eq!(block.cardinality(), 3);
295 let decoded = block.decode().expect("decode");
296 assert_eq!(
297 decoded,
298 data.iter().map(|&(t, s)| (t, s)).collect::<Vec<_>>()
299 );
300 }
301
302 #[test]
303 fn test_find_code_and_lookup() {
304 let data = vec![(0i64, "alpha"), (1000, "beta"), (2000, "alpha")];
305 let block = DictionaryBlock::from_data(&data).expect("build");
306 let code_alpha = block.find_code("alpha").expect("alpha should exist");
307 let code_beta = block.find_code("beta").expect("beta should exist");
308 assert_ne!(code_alpha, code_beta);
309 assert_eq!(block.lookup_code(code_alpha), Some("alpha"));
310 assert_eq!(block.lookup_code(code_beta), Some("beta"));
311 assert!(block.find_code("gamma").is_none());
312 }
313
314 #[test]
315 fn test_get_value_by_timestamp() {
316 let data = vec![(0i64, "off"), (1000, "on"), (2000, "standby")];
317 let block = DictionaryBlock::from_data(&data).expect("build");
318 assert_eq!(block.get_value(0), Some("off"));
319 assert_eq!(block.get_value(1000), Some("on"));
320 assert_eq!(block.get_value(2000), Some("standby"));
321 assert_eq!(block.get_value(9999), None);
322 }
323
324 #[test]
325 fn test_filter_range() {
326 let data = vec![
327 (0i64, "a"),
328 (1000, "b"),
329 (2000, "c"),
330 (3000, "d"),
331 (4000, "e"),
332 ];
333 let block = DictionaryBlock::from_data(&data).expect("build");
334 let filtered = block.filter_range(1000, 3000).expect("filter");
335 assert_eq!(filtered.len(), 3);
336 assert_eq!(filtered[0], (1000, "b"));
337 assert_eq!(filtered[1], (2000, "c"));
338 assert_eq!(filtered[2], (3000, "d"));
339 }
340
341 #[test]
342 fn test_value_frequencies() {
343 let data = vec![
344 (0i64, "a"),
345 (1000, "b"),
346 (2000, "a"),
347 (3000, "a"),
348 (4000, "c"),
349 (5000, "b"),
350 ];
351 let block = DictionaryBlock::from_data(&data).expect("build");
352 let freq = block.value_frequencies();
353 assert_eq!(freq[0].0, "a");
355 assert_eq!(freq[0].1, 3);
356 assert_eq!(freq[1].0, "b");
358 assert_eq!(freq[1].1, 2);
359 assert_eq!(freq[2].0, "c");
361 assert_eq!(freq[2].1, 1);
362 }
363
364 #[test]
365 fn test_compression_ratio_high_repetition() {
366 let data: Vec<(i64, &str)> = (0..1000i64)
368 .map(|i| {
369 (
370 i * 1000,
371 if i % 2 == 0 {
372 "device_status_on"
373 } else {
374 "device_status_off"
375 },
376 )
377 })
378 .collect();
379 let block = DictionaryBlock::from_data(&data).expect("build");
380 assert_eq!(block.cardinality(), 2);
381 let ratio = block.compression_ratio(16);
382 assert!(ratio > 1.0, "should be compressed: {:.2}", ratio);
383 }
384
385 #[test]
386 fn test_owned_decode() {
387 let data = vec![(0i64, "hello"), (1000, "world")];
388 let block = DictionaryBlock::from_data(&data).expect("build");
389 let decoded = dict_decode(&block).expect("decode");
390 assert_eq!(
391 decoded,
392 vec![(0, "hello".to_string()), (1000, "world".to_string())]
393 );
394 }
395
396 #[test]
397 fn test_encoder_encode_returns_code() {
398 let mut enc = DictionaryEncoder::new();
399 let code_a = enc.encode(0, "a").expect("ok");
400 let code_b = enc.encode(1000, "b").expect("ok");
401 let code_a2 = enc.encode(2000, "a").expect("ok");
402 assert_eq!(code_a, code_a2);
404 assert_ne!(code_a, code_b);
405 assert_eq!(enc.cardinality(), 2);
406 }
407
408 #[test]
409 fn test_all_unique_values() {
410 let data: Vec<(i64, String)> = (0..100).map(|i| (i as i64, format!("val_{}", i))).collect();
411 let refs: Vec<(i64, &str)> = data.iter().map(|(t, s)| (*t, s.as_str())).collect();
412 let block = DictionaryBlock::from_data(&refs).expect("build");
413 assert_eq!(block.cardinality(), 100);
414 assert_eq!(block.len(), 100);
415 let decoded = block.decode().expect("decode");
416 for (i, (ts, val)) in decoded.iter().enumerate() {
417 assert_eq!(*ts, i as i64);
418 assert_eq!(*val, format!("val_{}", i));
419 }
420 }
421
422 #[test]
423 fn test_empty_string_value() {
424 let data = vec![(0i64, ""), (1000, "nonempty"), (2000, "")];
425 let block = DictionaryBlock::from_data(&data).expect("build");
426 assert_eq!(block.cardinality(), 2);
427 let decoded = block.decode().expect("decode");
428 assert_eq!(decoded, vec![(0, ""), (1000, "nonempty"), (2000, "")]);
429 }
430}