Skip to main content

delta_pack/
encoder.rs

1use crate::rle::RleWriter;
2use crate::varint::{write_uvarint_vec, write_varint_vec};
3
4/// Fast string hash (FxHash algorithm).
5#[inline]
6fn hash_str(s: &str) -> u64 {
7    const K: u64 = 0x517cc1b727220a95;
8    let bytes = s.as_bytes();
9    let mut hash: u64 = 0;
10    for chunk in bytes.chunks(8) {
11        let mut val = 0u64;
12        for (i, &b) in chunk.iter().enumerate() {
13            val |= (b as u64) << (i * 8);
14        }
15        hash = (hash.rotate_left(5) ^ val).wrapping_mul(K);
16    }
17    hash
18}
19
20/// Binary encoder with RLE bit packing.
21pub struct Encoder {
22    buffer: Vec<u8>,
23    dict: Vec<u64>,
24    rle: RleWriter,
25}
26
27impl Encoder {
28    /// Create a new encoder.
29    #[inline]
30    pub fn new() -> Self {
31        Self {
32            buffer: Vec::with_capacity(32),
33            dict: Vec::new(),
34            rle: RleWriter::new(),
35        }
36    }
37
38    /// Encode using a thread-local encoder for optimal performance.
39    /// This avoids allocation overhead by reusing the encoder across calls.
40    #[inline]
41    pub fn encode<F, R>(f: F) -> R
42    where
43        F: FnOnce(&mut Encoder) -> R,
44    {
45        use std::cell::RefCell;
46        thread_local! {
47            static ENCODER: RefCell<Encoder> = RefCell::new(Encoder::new());
48        }
49        ENCODER.with(|enc| f(&mut enc.borrow_mut()))
50    }
51
52    // Primitive methods
53
54    /// Push a string value with dictionary deduplication.
55    #[inline]
56    pub fn push_string(&mut self, val: &str) {
57        if val.is_empty() {
58            self.push_int(0);
59            return;
60        }
61
62        // Check dictionary for deduplication
63        let hash = hash_str(val);
64        if let Some(idx) = self.dict.iter().position(|&h| h == hash) {
65            self.push_int(-(idx as i64) - 1);
66            return;
67        }
68
69        self.dict.push(hash);
70        let bytes = val.as_bytes();
71        self.push_int(bytes.len() as i64);
72        self.buffer.extend_from_slice(bytes);
73    }
74
75    #[inline]
76    pub fn push_int(&mut self, val: i64) {
77        write_varint_vec(&mut self.buffer, val);
78    }
79
80    #[inline]
81    pub fn push_bounded_int(&mut self, val: i64, min: i64) {
82        assert!(val >= min, "int {} below minimum {}", val, min);
83        self.push_uint((val - min) as u64);
84    }
85
86    #[inline]
87    pub fn push_uint(&mut self, val: u64) {
88        write_uvarint_vec(&mut self.buffer, val);
89    }
90
91    #[inline]
92    pub fn push_float(&mut self, val: f32) {
93        self.buffer.extend_from_slice(&val.to_le_bytes());
94    }
95
96    #[inline]
97    pub fn push_float_quantized(&mut self, val: f32, precision: f32) {
98        assert!(val.is_finite(), "invalid quantized float: {}", val);
99        self.push_int((val / precision).round() as i64);
100    }
101
102    #[inline]
103    pub fn push_boolean(&mut self, val: bool) {
104        self.rle.push_bit(val);
105    }
106
107    #[inline]
108    pub fn push_enum(&mut self, val: u32, num_bits: u8) {
109        assert!(
110            (val as u64) < (1u64 << num_bits),
111            "value {} out of range for {} bits",
112            val,
113            num_bits
114        );
115        self.rle.push_bits(val, num_bits);
116    }
117
118    #[inline]
119    pub fn push_bit_packed_int(&mut self, val: i64, min: i64, max: i64, num_bits: u8) {
120        assert!(
121            val >= min && val <= max,
122            "int {} outside [{}, {}]",
123            val,
124            min,
125            max
126        );
127        self.rle.push_bits((val - min) as u32, num_bits);
128    }
129
130    // Diff methods (value-only - caller handles change bit for object fields)
131
132    #[inline]
133    pub fn push_string_diff(&mut self, a: &str, b: &str) {
134        // Ensure 'a' is in dictionary for decoder sync
135        let hash_a = hash_str(a);
136        if !self.dict.contains(&hash_a) {
137            self.dict.push(hash_a);
138        }
139        self.push_string(b);
140    }
141
142    #[inline]
143    pub fn push_int_diff(&mut self, _a: i64, b: i64) {
144        self.push_int(b);
145    }
146
147    #[inline]
148    pub fn push_uint_diff(&mut self, _a: u64, b: u64) {
149        self.push_uint(b);
150    }
151
152    #[inline]
153    pub fn push_bounded_int_diff(&mut self, _a: i64, b: i64, min: i64) {
154        self.push_bounded_int(b, min);
155    }
156
157    #[inline]
158    pub fn push_float_diff(&mut self, _a: f32, b: f32) {
159        self.push_float(b);
160    }
161
162    #[inline]
163    pub fn push_float_quantized_diff(&mut self, _a: f32, b: f32, precision: f32) {
164        self.push_float_quantized(b, precision);
165    }
166
167    #[inline]
168    pub fn push_boolean_diff(&mut self, a: bool, b: bool) {
169        // Boolean diff is special - change bit IS the diff
170        self.push_boolean(a != b);
171    }
172
173    #[inline]
174    pub fn push_enum_diff(&mut self, _a: u32, b: u32, num_bits: u8) {
175        self.push_enum(b, num_bits);
176    }
177
178    #[inline]
179    pub fn push_bit_packed_int_diff(&mut self, _a: i64, b: i64, min: i64, max: i64, num_bits: u8) {
180        self.push_bit_packed_int(b, min, max, num_bits);
181    }
182
183    // Object diff helper (wrap object encoding with change bit)
184
185    #[inline]
186    pub fn push_object_diff<T, E, F>(&mut self, a: &T, b: &T, equals: E, encode_diff: F)
187    where
188        E: FnOnce(&T, &T) -> bool,
189        F: FnOnce(&mut Self),
190    {
191        let changed = !equals(a, b);
192        self.push_boolean(changed);
193        if changed {
194            encode_diff(self);
195        }
196    }
197
198    // Field diff helper (wrap value-only diff with change bit)
199
200    #[inline]
201    pub fn push_field_diff<T, E, F>(&mut self, a: &T, b: &T, equals: E, encode_diff: F)
202    where
203        E: FnOnce(&T, &T) -> bool,
204        F: FnOnce(&mut Self, &T, &T),
205    {
206        let changed = !equals(a, b);
207        self.push_boolean(changed);
208        if changed {
209            encode_diff(self, a, b);
210        }
211    }
212
213    // Array helpers
214
215    /// Encode an array by writing length followed by each element.
216    #[inline]
217    pub fn push_array<T, F>(&mut self, arr: &[T], mut inner_write: F)
218    where
219        F: FnMut(&mut Self, &T),
220    {
221        self.push_uint(arr.len() as u64);
222        for item in arr {
223            inner_write(self, item);
224        }
225    }
226
227    /// Encode array diff by comparing lengths and elements.
228    /// Caller handles change bit.
229    #[inline]
230    pub fn push_array_diff<T, FW, FD, E>(
231        &mut self,
232        a: &[T],
233        b: &[T],
234        mut equals: E,
235        mut inner_write: FW,
236        mut inner_diff: FD,
237    ) where
238        FW: FnMut(&mut Self, &T),
239        FD: FnMut(&mut Self, &T, &T),
240        E: FnMut(&T, &T) -> bool,
241    {
242        self.push_uint(b.len() as u64);
243
244        // Collect changed indices (sparse encoding)
245        let min_len = a.len().min(b.len());
246        let updates: Vec<usize> = (0..min_len).filter(|&i| !equals(&a[i], &b[i])).collect();
247
248        // Write updates (sparse)
249        self.push_uint(updates.len() as u64);
250        for i in updates {
251            self.push_uint(i as u64);
252            inner_diff(self, &a[i], &b[i]);
253        }
254
255        // Write additions
256        for item in b.iter().skip(a.len()) {
257            inner_write(self, item);
258        }
259    }
260
261    // Optional helpers
262
263    /// Encode an optional value by writing presence flag followed by value if present.
264    #[inline]
265    pub fn push_optional<T, F>(&mut self, opt: &Option<T>, mut inner_write: F)
266    where
267        F: FnMut(&mut Self, &T),
268    {
269        self.push_boolean(opt.is_some());
270        if let Some(val) = opt {
271            inner_write(self, val);
272        }
273    }
274
275    /// Encode optional diff, matching TS/C# format.
276    /// Optimization: if a was None, we know b must be Some (else unchanged).
277    /// So skip the present bit in None→Some case.
278    #[inline]
279    pub fn push_optional_diff<T, FW, FD>(
280        &mut self,
281        a: &Option<T>,
282        b: &Option<T>,
283        mut inner_write: FW,
284        mut inner_diff: FD,
285    ) where
286        FW: FnMut(&mut Self, &T),
287        FD: FnMut(&mut Self, &T, &T),
288    {
289        match a {
290            None => {
291                // None → Some (b guaranteed Some by caller)
292                inner_write(self, b.as_ref().unwrap());
293            }
294            Some(av) => {
295                self.push_boolean(b.is_some());
296                if let Some(bv) = b {
297                    inner_diff(self, av, bv); // Some → Some
298                }
299                // else Some → None
300            }
301        }
302    }
303
304    // Record (map) helpers
305
306    /// Encode a record (map) by writing length followed by key-value pairs.
307    #[inline]
308    pub fn push_record<K, V, FK, FV>(
309        &mut self,
310        map: &indexmap::IndexMap<K, V>,
311        mut key_write: FK,
312        mut val_write: FV,
313    ) where
314        K: std::hash::Hash + Eq,
315        FK: FnMut(&mut Self, &K),
316        FV: FnMut(&mut Self, &V),
317    {
318        self.push_uint(map.len() as u64);
319        for (k, v) in map {
320            key_write(self, k);
321            val_write(self, v);
322        }
323    }
324
325    /// Encode record diff, matching TS/C# format.
326    /// Caller handles change bit.
327    #[inline]
328    pub fn push_record_diff<K, V, FK, FV, FVD, E>(
329        &mut self,
330        a: &indexmap::IndexMap<K, V>,
331        b: &indexmap::IndexMap<K, V>,
332        mut equals: E,
333        mut key_write: FK,
334        mut val_write: FV,
335        mut val_diff: FVD,
336    ) where
337        K: Clone + std::hash::Hash + Eq,
338        FK: FnMut(&mut Self, &K),
339        FV: FnMut(&mut Self, &V),
340        FVD: FnMut(&mut Self, &V, &V),
341        E: FnMut(&V, &V) -> bool,
342    {
343        let mut deletions: Vec<usize> = Vec::new();
344        let mut updates: Vec<(usize, K)> = Vec::new();
345        let mut additions = Vec::new();
346
347        for (idx, (k, av)) in a.iter().enumerate() {
348            if let Some(bv) = b.get(k) {
349                if !equals(av, bv) {
350                    updates.push((idx, k.clone()));
351                }
352            } else {
353                deletions.push(idx);
354            }
355        }
356
357        for (k, v) in b {
358            if !a.contains_key(k) {
359                additions.push((k.clone(), v));
360            }
361        }
362
363        // Write deletions and updates (only if a was non-empty)
364        if !a.is_empty() {
365            self.push_uint(deletions.len() as u64);
366            for del_idx in &deletions {
367                self.push_uint(*del_idx as u64);
368            }
369            self.push_uint(updates.len() as u64);
370            for (upd_idx, key) in &updates {
371                self.push_uint(*upd_idx as u64);
372                val_diff(self, a.get(key).unwrap(), b.get(key).unwrap());
373            }
374        }
375
376        // Write additions
377        self.push_uint(additions.len() as u64);
378        for (k, v) in additions {
379            key_write(self, &k);
380            val_write(self, v);
381        }
382    }
383
384    /// Finish encoding and return the buffer.
385    #[inline]
386    pub fn finish(&mut self) -> Vec<u8> {
387        self.rle.write_to_buffer(&mut self.buffer);
388        let result = std::mem::take(&mut self.buffer);
389        self.buffer = Vec::with_capacity(64);
390        self.dict.clear();
391        self.rle.reset();
392        result
393    }
394}
395
396impl Default for Encoder {
397    fn default() -> Self {
398        Self::new()
399    }
400}
401
402#[cfg(test)]
403mod tests {
404    use super::*;
405
406    #[test]
407    fn test_encode_string() {
408        let mut encoder = Encoder::new();
409        encoder.push_string("hello");
410        let buf = encoder.finish();
411        assert!(!buf.is_empty());
412    }
413
414    #[test]
415    fn test_encode_string_dictionary() {
416        let mut encoder = Encoder::new();
417        encoder.push_string("hello");
418        encoder.push_string("world");
419        encoder.push_string("hello"); // Should use dictionary
420        let buf = encoder.finish();
421        assert!(!buf.is_empty());
422    }
423
424    #[test]
425    fn test_encode_int() {
426        let mut encoder = Encoder::new();
427        encoder.push_int(42);
428        encoder.push_int(-100);
429        let buf = encoder.finish();
430        assert!(!buf.is_empty());
431    }
432
433    #[test]
434    fn test_encode_float() {
435        let mut encoder = Encoder::new();
436        encoder.push_float(3.14);
437        let buf = encoder.finish();
438        assert_eq!(buf.len(), 4 + 1); // 4 bytes float + 1 byte RLE (0 bits)
439    }
440
441    #[test]
442    fn test_encode_boolean() {
443        let mut encoder = Encoder::new();
444        encoder.push_boolean(true);
445        encoder.push_boolean(false);
446        encoder.push_boolean(true);
447        let buf = encoder.finish();
448        assert!(!buf.is_empty());
449    }
450}