velesdb_core/compression/
dictionary.rs

1//! Dictionary Encoding for column compression.
2//!
3//! Replaces repeated values with compact integer codes.
4//! Ideal for columns with low cardinality (e.g., country, category).
5
6#![allow(clippy::cast_possible_truncation)]
7#![allow(clippy::cast_precision_loss)]
8
9use rustc_hash::FxHashMap;
10use std::hash::Hash;
11use std::mem::size_of;
12
/// Size figures describing how well a column was dictionary-encoded.
///
/// All fields are plain numbers, so two snapshots can be compared with `==`
/// (for example in assertions).
#[derive(Debug, Clone, Default, PartialEq)]
pub struct CompressionStats {
    /// Number of distinct values stored in the dictionary.
    pub unique_values: usize,
    /// Number of values that were encoded, duplicates included.
    pub total_values: usize,
    /// Size of the dictionary, in bytes.
    pub dictionary_size_bytes: usize,
    /// Size of the encoded codes, in bytes.
    pub encoded_size_bytes: usize,
    /// Compression ratio, computed as original size / compressed size.
    pub compression_ratio: f64,
}
27
/// Two-way lookup table between column values and their integer codes.
#[derive(Clone, Debug)]
pub struct DictCodebook<V> {
    /// Forward index: value -> code assigned to it.
    value_to_code: FxHashMap<V, u32>,
    /// Reverse index: values stored in code order, looked up by code.
    code_to_value: Vec<V>,
}
36
37impl<V: Hash + Eq + Clone> Default for DictCodebook<V> {
38    fn default() -> Self {
39        Self {
40            value_to_code: FxHashMap::default(),
41            code_to_value: Vec::new(),
42        }
43    }
44}
45
46/// Dictionary encoder for column compression.
47///
48/// Encodes values as compact integer codes using a codebook.
49#[derive(Debug, Clone)]
50pub struct DictionaryEncoder<V: Hash + Eq + Clone> {
51    /// The codebook.
52    codebook: DictCodebook<V>,
53    /// Number of values encoded (including duplicates).
54    total_encoded: usize,
55}
56
57impl<V: Hash + Eq + Clone> DictionaryEncoder<V> {
58    /// Create a new dictionary encoder.
59    #[must_use]
60    pub fn new() -> Self {
61        Self {
62            codebook: DictCodebook::default(),
63            total_encoded: 0,
64        }
65    }
66
67    /// Check if the dictionary is empty.
68    #[must_use]
69    pub fn is_empty(&self) -> bool {
70        self.codebook.code_to_value.is_empty()
71    }
72
73    /// Get the number of unique values in the dictionary.
74    #[must_use]
75    pub fn len(&self) -> usize {
76        self.codebook.code_to_value.len()
77    }
78
79    /// Encode a single value, returning its code.
80    ///
81    /// If the value is new, it's added to the dictionary.
82    pub fn encode(&mut self, value: V) -> u32 {
83        self.total_encoded += 1;
84
85        if let Some(&code) = self.codebook.value_to_code.get(&value) {
86            return code;
87        }
88
89        let code = self.codebook.code_to_value.len() as u32;
90        self.codebook.value_to_code.insert(value.clone(), code);
91        self.codebook.code_to_value.push(value);
92        code
93    }
94
95    /// Decode a code back to its value.
96    #[must_use]
97    pub fn decode(&self, code: u32) -> Option<&V> {
98        self.codebook.code_to_value.get(code as usize)
99    }
100
101    /// Encode a batch of values.
102    pub fn encode_batch(&mut self, values: &[V]) -> Vec<u32> {
103        values.iter().map(|v| self.encode(v.clone())).collect()
104    }
105
106    /// Decode a batch of codes.
107    #[must_use]
108    pub fn decode_batch(&self, codes: &[u32]) -> Vec<V> {
109        codes
110            .iter()
111            .filter_map(|&code| self.decode(code).cloned())
112            .collect()
113    }
114
115    /// Clear the encoder.
116    pub fn clear(&mut self) {
117        self.codebook.value_to_code.clear();
118        self.codebook.code_to_value.clear();
119        self.total_encoded = 0;
120    }
121
122    /// Get compression statistics.
123    #[must_use]
124    pub fn stats(&self) -> CompressionStats {
125        let unique = self.len();
126        let total = self.total_encoded;
127
128        // Estimate sizes
129        let value_size = size_of::<V>();
130        let original_size = total * value_size;
131        let dict_size = unique * value_size + unique * 4; // value + code
132        let encoded_size = total * 4; // u32 codes
133        let compressed_size = dict_size + encoded_size;
134
135        let ratio = if compressed_size > 0 {
136            original_size as f64 / compressed_size as f64
137        } else {
138            0.0
139        };
140
141        CompressionStats {
142            unique_values: unique,
143            total_values: total,
144            dictionary_size_bytes: dict_size,
145            encoded_size_bytes: encoded_size,
146            compression_ratio: ratio,
147        }
148    }
149
150    /// Get the codebook.
151    #[must_use]
152    pub fn codebook(&self) -> &DictCodebook<V> {
153        &self.codebook
154    }
155}
156
157impl<V: Hash + Eq + Clone> Default for DictionaryEncoder<V> {
158    fn default() -> Self {
159        Self::new()
160    }
161}