Skip to main content

grafeo_core/codec/
delta.rs

1//! Delta encoding for sorted integer sequences.
2//!
3//! Instead of storing [100, 105, 107, 110], store the base (100) and the deltas
4//! [5, 2, 3]. The deltas are often tiny even when values are huge, making this
5//! a great first step before bit-packing.
6//!
7//! For signed integers, we use zig-zag encoding to map negative deltas to small
8//! positive numbers: 0→0, -1→1, 1→2, -2→3, etc.
9//!
10//! # Example
11//!
12//! ```no_run
13//! # use grafeo_core::codec::delta::DeltaEncoding;
14//! let values = vec![100u64, 105, 107, 110, 115];
15//! let encoded = DeltaEncoding::encode(&values);
16//! // base=100, deltas=[5, 2, 3, 5]
17//! assert_eq!(encoded.decode(), values);
18//! ```
19
20use std::io;
21
22/// Stores differences between consecutive values instead of the values themselves.
23///
24/// Pair this with [`BitPackedInts`](super::BitPackedInts) for maximum compression -
25/// use [`DeltaBitPacked`](super::DeltaBitPacked) for the combo.
26#[derive(Debug, Clone)]
27pub struct DeltaEncoding {
28    /// The first value in the sequence.
29    base: u64,
30    /// Deltas between consecutive values.
31    deltas: Vec<u64>,
32    /// Number of values (base + deltas.len()).
33    count: usize,
34}
35
36impl DeltaEncoding {
37    /// Encodes a slice of u64 values using delta encoding.
38    ///
39    /// # Requirements
40    ///
41    /// Values **must** be sorted in ascending order. Unsorted input will produce
42    /// incorrect results due to `saturating_sub` mapping negative deltas to zero.
43    /// A debug assertion checks this in debug builds.
44    ///
45    /// For signed or unsorted data, use [`encode_signed`](Self::encode_signed) instead.
46    #[must_use]
47    pub fn encode(values: &[u64]) -> Self {
48        if values.is_empty() {
49            return Self {
50                base: 0,
51                deltas: Vec::new(),
52                count: 0,
53            };
54        }
55
56        // Values must be sorted for unsigned delta encoding to work correctly
57        debug_assert!(
58            values.windows(2).all(|w| w[0] <= w[1]),
59            "DeltaEncoding::encode requires sorted input; use encode_signed for unsorted data"
60        );
61
62        let base = values[0];
63        let deltas: Vec<u64> = values
64            .windows(2)
65            .map(|w| w[1].saturating_sub(w[0]))
66            .collect();
67
68        Self {
69            base,
70            deltas,
71            count: values.len(),
72        }
73    }
74
75    /// Encodes a slice of i64 values using delta encoding.
76    ///
77    /// Computes signed deltas between consecutive values, then zig-zag encodes
78    /// the deltas to store them as unsigned integers.
79    #[must_use]
80    pub fn encode_signed(values: &[i64]) -> Self {
81        if values.is_empty() {
82            return Self {
83                base: 0,
84                deltas: Vec::new(),
85                count: 0,
86            };
87        }
88
89        // Store base as zig-zag encoded
90        let base = zigzag_encode(values[0]);
91
92        // Compute signed deltas, then zig-zag encode them
93        let deltas: Vec<u64> = values
94            .windows(2)
95            .map(|w| zigzag_encode(w[1] - w[0]))
96            .collect();
97
98        Self {
99            base,
100            deltas,
101            count: values.len(),
102        }
103    }
104
105    /// Decodes the delta-encoded values back to the original sequence.
106    #[must_use]
107    pub fn decode(&self) -> Vec<u64> {
108        if self.count == 0 {
109            return Vec::new();
110        }
111
112        let mut result = Vec::with_capacity(self.count);
113        let mut current = self.base;
114        result.push(current);
115
116        for &delta in &self.deltas {
117            current = current.wrapping_add(delta);
118            result.push(current);
119        }
120
121        result
122    }
123
124    /// Decodes to signed integers using zig-zag decoding.
125    ///
126    /// Assumes the encoding was created with `encode_signed`.
127    #[must_use]
128    pub fn decode_signed(&self) -> Vec<i64> {
129        if self.count == 0 {
130            return Vec::new();
131        }
132
133        let mut result = Vec::with_capacity(self.count);
134        let mut current = zigzag_decode(self.base);
135        result.push(current);
136
137        for &delta in &self.deltas {
138            current += zigzag_decode(delta);
139            result.push(current);
140        }
141
142        result
143    }
144
145    /// Returns the number of values.
146    #[must_use]
147    pub fn len(&self) -> usize {
148        self.count
149    }
150
151    /// Returns whether the encoding is empty.
152    #[must_use]
153    pub fn is_empty(&self) -> bool {
154        self.count == 0
155    }
156
157    /// Returns the base value.
158    #[must_use]
159    pub fn base(&self) -> u64 {
160        self.base
161    }
162
163    /// Returns the deltas.
164    #[must_use]
165    pub fn deltas(&self) -> &[u64] {
166        &self.deltas
167    }
168
169    /// Returns the maximum delta value.
170    ///
171    /// Useful for determining bit width for bit-packing.
172    #[must_use]
173    pub fn max_delta(&self) -> u64 {
174        self.deltas.iter().copied().max().unwrap_or(0)
175    }
176
177    /// Returns the number of bits needed to represent the largest delta.
178    #[must_use]
179    pub fn bits_for_max_delta(&self) -> u8 {
180        let max = self.max_delta();
181        if max == 0 {
182            1
183        } else {
184            // reason: leading_zeros() returns [0, 64], 64 - n fits u8
185            #[allow(clippy::cast_possible_truncation)]
186            {
187                64 - max.leading_zeros() as u8
188            }
189        }
190    }
191
192    /// Estimates the compression ratio.
193    ///
194    /// Returns the ratio of original size to compressed size.
195    #[must_use]
196    pub fn compression_ratio(&self) -> f64 {
197        if self.count == 0 {
198            return 1.0;
199        }
200
201        let original_size = self.count * 8; // 8 bytes per u64
202        let compressed_size = 8 + (self.deltas.len() * 8); // base + deltas
203
204        original_size as f64 / compressed_size as f64
205    }
206
207    /// Serializes the delta encoding to bytes.
208    pub fn to_bytes(&self) -> Vec<u8> {
209        let mut buf = Vec::with_capacity(8 + 4 + self.deltas.len() * 8);
210        buf.extend_from_slice(&self.base.to_le_bytes());
211        // reason: delta-encoded count is bounded by practical data sizes, fits u32
212        #[allow(clippy::cast_possible_truncation)]
213        buf.extend_from_slice(&(self.count as u32).to_le_bytes());
214        for &delta in &self.deltas {
215            buf.extend_from_slice(&delta.to_le_bytes());
216        }
217        buf
218    }
219
220    /// Deserializes a delta encoding from bytes.
221    ///
222    /// # Errors
223    ///
224    /// Returns `Err` if the byte slice is too short or contains invalid data.
225    pub fn from_bytes(bytes: &[u8]) -> io::Result<Self> {
226        if bytes.len() < 12 {
227            return Err(io::Error::new(
228                io::ErrorKind::InvalidData,
229                "Delta encoding too short",
230            ));
231        }
232
233        let base = u64::from_le_bytes(
234            bytes[0..8]
235                .try_into()
236                .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?,
237        );
238        let count = u32::from_le_bytes(
239            bytes[8..12]
240                .try_into()
241                .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?,
242        ) as usize;
243
244        let expected_len = 12 + (count.saturating_sub(1)) * 8;
245        if bytes.len() < expected_len {
246            return Err(io::Error::new(
247                io::ErrorKind::InvalidData,
248                "Delta encoding truncated",
249            ));
250        }
251
252        let mut deltas = Vec::with_capacity(count.saturating_sub(1));
253        let mut offset = 12;
254        for _ in 1..count {
255            let delta = u64::from_le_bytes(
256                bytes[offset..offset + 8]
257                    .try_into()
258                    .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?,
259            );
260            deltas.push(delta);
261            offset += 8;
262        }
263
264        Ok(Self {
265            base,
266            deltas,
267            count,
268        })
269    }
270}
271
272/// Zig-zag encodes a signed integer to unsigned.
273///
274/// Maps signed integers to unsigned: 0 -> 0, -1 -> 1, 1 -> 2, -2 -> 3, etc.
275#[inline]
276#[must_use]
277// reason: zig-zag encoding intentionally reinterprets bits, no data loss
278#[allow(clippy::cast_sign_loss)]
279pub fn zigzag_encode(value: i64) -> u64 {
280    ((value << 1) ^ (value >> 63)) as u64
281}
282
283/// Zig-zag decodes an unsigned integer to signed.
284#[inline]
285#[must_use]
286// reason: zig-zag decoding intentionally reinterprets bits, inverse of encode
287#[allow(clippy::cast_possible_wrap)]
288pub fn zigzag_decode(value: u64) -> i64 {
289    ((value >> 1) as i64) ^ (-((value & 1) as i64))
290}
291
292#[cfg(test)]
293mod tests {
294    use super::*;
295
296    #[test]
297    fn test_delta_encode_decode() {
298        let values = vec![100u64, 105, 107, 110, 115, 120, 128, 130];
299        let encoded = DeltaEncoding::encode(&values);
300        let decoded = encoded.decode();
301        assert_eq!(values, decoded);
302    }
303
304    #[test]
305    fn test_delta_empty() {
306        let values: Vec<u64> = vec![];
307        let encoded = DeltaEncoding::encode(&values);
308        assert!(encoded.is_empty());
309        assert_eq!(encoded.decode(), values);
310    }
311
312    #[test]
313    fn test_delta_single_value() {
314        let values = vec![42u64];
315        let encoded = DeltaEncoding::encode(&values);
316        assert_eq!(encoded.len(), 1);
317        assert_eq!(encoded.decode(), values);
318    }
319
320    #[test]
321    fn test_delta_sequential() {
322        let values: Vec<u64> = (0..100).collect();
323        let encoded = DeltaEncoding::encode(&values);
324        assert_eq!(encoded.max_delta(), 1);
325        assert_eq!(encoded.bits_for_max_delta(), 1);
326        assert_eq!(encoded.decode(), values);
327    }
328
329    #[test]
330    fn test_delta_large_gaps() {
331        let values = vec![0u64, 1000, 2000, 3000];
332        let encoded = DeltaEncoding::encode(&values);
333        assert_eq!(encoded.max_delta(), 1000);
334        assert_eq!(encoded.decode(), values);
335    }
336
337    #[test]
338    fn test_zigzag_encode_decode() {
339        assert_eq!(zigzag_encode(0), 0);
340        assert_eq!(zigzag_encode(-1), 1);
341        assert_eq!(zigzag_encode(1), 2);
342        assert_eq!(zigzag_encode(-2), 3);
343        assert_eq!(zigzag_encode(2), 4);
344
345        for v in [-100i64, -50, -1, 0, 1, 50, 100, i64::MIN, i64::MAX] {
346            assert_eq!(zigzag_decode(zigzag_encode(v)), v);
347        }
348    }
349
350    #[test]
351    fn test_delta_signed() {
352        let values = vec![-100i64, -50, 0, 50, 100];
353        let encoded = DeltaEncoding::encode_signed(&values);
354        let decoded = encoded.decode_signed();
355        assert_eq!(values, decoded);
356    }
357
358    #[test]
359    fn test_delta_serialization() {
360        let values = vec![100u64, 105, 107, 110, 115];
361        let encoded = DeltaEncoding::encode(&values);
362        let bytes = encoded.to_bytes();
363        let restored = DeltaEncoding::from_bytes(&bytes).unwrap();
364        assert_eq!(encoded.decode(), restored.decode());
365    }
366
367    #[test]
368    fn test_compression_ratio() {
369        // Sequential values should compress well
370        let sequential: Vec<u64> = (0..1000).collect();
371        let encoded = DeltaEncoding::encode(&sequential);
372        // Each delta is 1, so compression is minimal but base + deltas is stored
373        assert!(encoded.len() == 1000);
374    }
375}