Skip to main content

grafeo_core/codec/
delta.rs

1//! Delta encoding for sorted integer sequences.
2//!
3//! Instead of storing [100, 105, 107, 110], store the base (100) and the deltas
4//! [5, 2, 3]. The deltas are often tiny even when values are huge, making this
5//! a great first step before bit-packing.
6//!
7//! For signed integers, we use zig-zag encoding to map negative deltas to small
8//! positive numbers: 0→0, -1→1, 1→2, -2→3, etc.
9//!
10//! # Example
11//!
12//! ```no_run
13//! # use grafeo_core::codec::delta::DeltaEncoding;
14//! let values = vec![100u64, 105, 107, 110, 115];
15//! let encoded = DeltaEncoding::encode(&values);
16//! // base=100, deltas=[5, 2, 3, 5]
17//! assert_eq!(encoded.decode(), values);
18//! ```
19
20use std::io;
21
22/// Stores differences between consecutive values instead of the values themselves.
23///
24/// Pair this with [`BitPackedInts`](super::BitPackedInts) for maximum compression -
25/// use [`DeltaBitPacked`](super::DeltaBitPacked) for the combo.
26#[derive(Debug, Clone)]
27pub struct DeltaEncoding {
28    /// The first value in the sequence.
29    base: u64,
30    /// Deltas between consecutive values.
31    deltas: Vec<u64>,
32    /// Number of values (base + deltas.len()).
33    count: usize,
34}
35
36impl DeltaEncoding {
37    /// Encodes a slice of u64 values using delta encoding.
38    ///
39    /// # Requirements
40    ///
41    /// Values **must** be sorted in ascending order. Unsorted input will produce
42    /// incorrect results due to `saturating_sub` mapping negative deltas to zero.
43    /// A debug assertion checks this in debug builds.
44    ///
45    /// For signed or unsorted data, use [`encode_signed`](Self::encode_signed) instead.
46    #[must_use]
47    pub fn encode(values: &[u64]) -> Self {
48        if values.is_empty() {
49            return Self {
50                base: 0,
51                deltas: Vec::new(),
52                count: 0,
53            };
54        }
55
56        // Values must be sorted for unsigned delta encoding to work correctly
57        debug_assert!(
58            values.windows(2).all(|w| w[0] <= w[1]),
59            "DeltaEncoding::encode requires sorted input; use encode_signed for unsorted data"
60        );
61
62        let base = values[0];
63        let deltas: Vec<u64> = values
64            .windows(2)
65            .map(|w| w[1].saturating_sub(w[0]))
66            .collect();
67
68        Self {
69            base,
70            deltas,
71            count: values.len(),
72        }
73    }
74
75    /// Encodes a slice of i64 values using delta encoding.
76    ///
77    /// Computes signed deltas between consecutive values, then zig-zag encodes
78    /// the deltas to store them as unsigned integers.
79    #[must_use]
80    pub fn encode_signed(values: &[i64]) -> Self {
81        if values.is_empty() {
82            return Self {
83                base: 0,
84                deltas: Vec::new(),
85                count: 0,
86            };
87        }
88
89        // Store base as zig-zag encoded
90        let base = zigzag_encode(values[0]);
91
92        // Compute signed deltas, then zig-zag encode them
93        let deltas: Vec<u64> = values
94            .windows(2)
95            .map(|w| zigzag_encode(w[1] - w[0]))
96            .collect();
97
98        Self {
99            base,
100            deltas,
101            count: values.len(),
102        }
103    }
104
105    /// Decodes the delta-encoded values back to the original sequence.
106    #[must_use]
107    pub fn decode(&self) -> Vec<u64> {
108        if self.count == 0 {
109            return Vec::new();
110        }
111
112        let mut result = Vec::with_capacity(self.count);
113        let mut current = self.base;
114        result.push(current);
115
116        for &delta in &self.deltas {
117            current = current.wrapping_add(delta);
118            result.push(current);
119        }
120
121        result
122    }
123
124    /// Decodes to signed integers using zig-zag decoding.
125    ///
126    /// Assumes the encoding was created with `encode_signed`.
127    #[must_use]
128    pub fn decode_signed(&self) -> Vec<i64> {
129        if self.count == 0 {
130            return Vec::new();
131        }
132
133        let mut result = Vec::with_capacity(self.count);
134        let mut current = zigzag_decode(self.base);
135        result.push(current);
136
137        for &delta in &self.deltas {
138            current += zigzag_decode(delta);
139            result.push(current);
140        }
141
142        result
143    }
144
145    /// Returns the number of values.
146    #[must_use]
147    pub fn len(&self) -> usize {
148        self.count
149    }
150
151    /// Returns whether the encoding is empty.
152    #[must_use]
153    pub fn is_empty(&self) -> bool {
154        self.count == 0
155    }
156
157    /// Returns the base value.
158    #[must_use]
159    pub fn base(&self) -> u64 {
160        self.base
161    }
162
163    /// Returns the deltas.
164    #[must_use]
165    pub fn deltas(&self) -> &[u64] {
166        &self.deltas
167    }
168
169    /// Returns the maximum delta value.
170    ///
171    /// Useful for determining bit width for bit-packing.
172    #[must_use]
173    pub fn max_delta(&self) -> u64 {
174        self.deltas.iter().copied().max().unwrap_or(0)
175    }
176
177    /// Returns the number of bits needed to represent the largest delta.
178    #[must_use]
179    pub fn bits_for_max_delta(&self) -> u8 {
180        let max = self.max_delta();
181        if max == 0 {
182            1
183        } else {
184            64 - max.leading_zeros() as u8
185        }
186    }
187
188    /// Estimates the compression ratio.
189    ///
190    /// Returns the ratio of original size to compressed size.
191    #[must_use]
192    pub fn compression_ratio(&self) -> f64 {
193        if self.count == 0 {
194            return 1.0;
195        }
196
197        let original_size = self.count * 8; // 8 bytes per u64
198        let compressed_size = 8 + (self.deltas.len() * 8); // base + deltas
199
200        original_size as f64 / compressed_size as f64
201    }
202
203    /// Serializes the delta encoding to bytes.
204    pub fn to_bytes(&self) -> Vec<u8> {
205        let mut buf = Vec::with_capacity(8 + 4 + self.deltas.len() * 8);
206        buf.extend_from_slice(&self.base.to_le_bytes());
207        buf.extend_from_slice(&(self.count as u32).to_le_bytes());
208        for &delta in &self.deltas {
209            buf.extend_from_slice(&delta.to_le_bytes());
210        }
211        buf
212    }
213
214    /// Deserializes a delta encoding from bytes.
215    ///
216    /// # Errors
217    ///
218    /// Returns `Err` if the byte slice is too short or contains invalid data.
219    pub fn from_bytes(bytes: &[u8]) -> io::Result<Self> {
220        if bytes.len() < 12 {
221            return Err(io::Error::new(
222                io::ErrorKind::InvalidData,
223                "Delta encoding too short",
224            ));
225        }
226
227        let base = u64::from_le_bytes(
228            bytes[0..8]
229                .try_into()
230                .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?,
231        );
232        let count = u32::from_le_bytes(
233            bytes[8..12]
234                .try_into()
235                .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?,
236        ) as usize;
237
238        let expected_len = 12 + (count.saturating_sub(1)) * 8;
239        if bytes.len() < expected_len {
240            return Err(io::Error::new(
241                io::ErrorKind::InvalidData,
242                "Delta encoding truncated",
243            ));
244        }
245
246        let mut deltas = Vec::with_capacity(count.saturating_sub(1));
247        let mut offset = 12;
248        for _ in 1..count {
249            let delta = u64::from_le_bytes(
250                bytes[offset..offset + 8]
251                    .try_into()
252                    .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?,
253            );
254            deltas.push(delta);
255            offset += 8;
256        }
257
258        Ok(Self {
259            base,
260            deltas,
261            count,
262        })
263    }
264}
265
266/// Zig-zag encodes a signed integer to unsigned.
267///
268/// Maps signed integers to unsigned: 0 -> 0, -1 -> 1, 1 -> 2, -2 -> 3, etc.
269#[inline]
270#[must_use]
271pub fn zigzag_encode(value: i64) -> u64 {
272    ((value << 1) ^ (value >> 63)) as u64
273}
274
275/// Zig-zag decodes an unsigned integer to signed.
276#[inline]
277#[must_use]
278pub fn zigzag_decode(value: u64) -> i64 {
279    ((value >> 1) as i64) ^ (-((value & 1) as i64))
280}
281
282#[cfg(test)]
283mod tests {
284    use super::*;
285
286    #[test]
287    fn test_delta_encode_decode() {
288        let values = vec![100u64, 105, 107, 110, 115, 120, 128, 130];
289        let encoded = DeltaEncoding::encode(&values);
290        let decoded = encoded.decode();
291        assert_eq!(values, decoded);
292    }
293
294    #[test]
295    fn test_delta_empty() {
296        let values: Vec<u64> = vec![];
297        let encoded = DeltaEncoding::encode(&values);
298        assert!(encoded.is_empty());
299        assert_eq!(encoded.decode(), values);
300    }
301
302    #[test]
303    fn test_delta_single_value() {
304        let values = vec![42u64];
305        let encoded = DeltaEncoding::encode(&values);
306        assert_eq!(encoded.len(), 1);
307        assert_eq!(encoded.decode(), values);
308    }
309
310    #[test]
311    fn test_delta_sequential() {
312        let values: Vec<u64> = (0..100).collect();
313        let encoded = DeltaEncoding::encode(&values);
314        assert_eq!(encoded.max_delta(), 1);
315        assert_eq!(encoded.bits_for_max_delta(), 1);
316        assert_eq!(encoded.decode(), values);
317    }
318
319    #[test]
320    fn test_delta_large_gaps() {
321        let values = vec![0u64, 1000, 2000, 3000];
322        let encoded = DeltaEncoding::encode(&values);
323        assert_eq!(encoded.max_delta(), 1000);
324        assert_eq!(encoded.decode(), values);
325    }
326
327    #[test]
328    fn test_zigzag_encode_decode() {
329        assert_eq!(zigzag_encode(0), 0);
330        assert_eq!(zigzag_encode(-1), 1);
331        assert_eq!(zigzag_encode(1), 2);
332        assert_eq!(zigzag_encode(-2), 3);
333        assert_eq!(zigzag_encode(2), 4);
334
335        for v in [-100i64, -50, -1, 0, 1, 50, 100, i64::MIN, i64::MAX] {
336            assert_eq!(zigzag_decode(zigzag_encode(v)), v);
337        }
338    }
339
340    #[test]
341    fn test_delta_signed() {
342        let values = vec![-100i64, -50, 0, 50, 100];
343        let encoded = DeltaEncoding::encode_signed(&values);
344        let decoded = encoded.decode_signed();
345        assert_eq!(values, decoded);
346    }
347
348    #[test]
349    fn test_delta_serialization() {
350        let values = vec![100u64, 105, 107, 110, 115];
351        let encoded = DeltaEncoding::encode(&values);
352        let bytes = encoded.to_bytes();
353        let restored = DeltaEncoding::from_bytes(&bytes).unwrap();
354        assert_eq!(encoded.decode(), restored.decode());
355    }
356
357    #[test]
358    fn test_compression_ratio() {
359        // Sequential values should compress well
360        let sequential: Vec<u64> = (0..1000).collect();
361        let encoded = DeltaEncoding::encode(&sequential);
362        // Each delta is 1, so compression is minimal but base + deltas is stored
363        assert!(encoded.len() == 1000);
364    }
365}