Skip to main content

grafeo_core/storage/
delta.rs

1//! Delta encoding for sorted integer sequences.
2//!
3//! Instead of storing [100, 105, 107, 110], store the base (100) and the deltas
4//! [5, 2, 3]. The deltas are often tiny even when values are huge, making this
5//! a great first step before bit-packing.
6//!
7//! For signed integers, we use zig-zag encoding to map negative deltas to small
8//! positive numbers: 0→0, -1→1, 1→2, -2→3, etc.
9//!
10//! # Example
11//!
12//! ```no_run
13//! # use grafeo_core::storage::delta::DeltaEncoding;
14//! let values = vec![100u64, 105, 107, 110, 115];
15//! let encoded = DeltaEncoding::encode(&values);
16//! // base=100, deltas=[5, 2, 3, 5]
17//! assert_eq!(encoded.decode(), values);
18//! ```
19
20use std::io;
21
22/// Stores differences between consecutive values instead of the values themselves.
23///
24/// Pair this with [`BitPackedInts`](super::BitPackedInts) for maximum compression -
25/// use [`DeltaBitPacked`](super::DeltaBitPacked) for the combo.
26#[derive(Debug, Clone)]
27pub struct DeltaEncoding {
28    /// The first value in the sequence.
29    base: u64,
30    /// Deltas between consecutive values.
31    deltas: Vec<u64>,
32    /// Number of values (base + deltas.len()).
33    count: usize,
34}
35
36impl DeltaEncoding {
37    /// Encodes a slice of u64 values using delta encoding.
38    ///
39    /// # Requirements
40    ///
41    /// Values **must** be sorted in ascending order. Unsorted input will produce
42    /// incorrect results due to `saturating_sub` mapping negative deltas to zero.
43    /// A debug assertion checks this in debug builds.
44    ///
45    /// For signed or unsorted data, use [`encode_signed`](Self::encode_signed) instead.
46    #[must_use]
47    pub fn encode(values: &[u64]) -> Self {
48        if values.is_empty() {
49            return Self {
50                base: 0,
51                deltas: Vec::new(),
52                count: 0,
53            };
54        }
55
56        // Values must be sorted for unsigned delta encoding to work correctly
57        debug_assert!(
58            values.windows(2).all(|w| w[0] <= w[1]),
59            "DeltaEncoding::encode requires sorted input; use encode_signed for unsorted data"
60        );
61
62        let base = values[0];
63        let deltas: Vec<u64> = values
64            .windows(2)
65            .map(|w| w[1].saturating_sub(w[0]))
66            .collect();
67
68        Self {
69            base,
70            deltas,
71            count: values.len(),
72        }
73    }
74
75    /// Encodes a slice of i64 values using delta encoding.
76    ///
77    /// Computes signed deltas between consecutive values, then zig-zag encodes
78    /// the deltas to store them as unsigned integers.
79    #[must_use]
80    pub fn encode_signed(values: &[i64]) -> Self {
81        if values.is_empty() {
82            return Self {
83                base: 0,
84                deltas: Vec::new(),
85                count: 0,
86            };
87        }
88
89        // Store base as zig-zag encoded
90        let base = zigzag_encode(values[0]);
91
92        // Compute signed deltas, then zig-zag encode them
93        let deltas: Vec<u64> = values
94            .windows(2)
95            .map(|w| zigzag_encode(w[1] - w[0]))
96            .collect();
97
98        Self {
99            base,
100            deltas,
101            count: values.len(),
102        }
103    }
104
105    /// Decodes the delta-encoded values back to the original sequence.
106    #[must_use]
107    pub fn decode(&self) -> Vec<u64> {
108        if self.count == 0 {
109            return Vec::new();
110        }
111
112        let mut result = Vec::with_capacity(self.count);
113        let mut current = self.base;
114        result.push(current);
115
116        for &delta in &self.deltas {
117            current = current.wrapping_add(delta);
118            result.push(current);
119        }
120
121        result
122    }
123
124    /// Decodes to signed integers using zig-zag decoding.
125    ///
126    /// Assumes the encoding was created with `encode_signed`.
127    #[must_use]
128    pub fn decode_signed(&self) -> Vec<i64> {
129        if self.count == 0 {
130            return Vec::new();
131        }
132
133        let mut result = Vec::with_capacity(self.count);
134        let mut current = zigzag_decode(self.base);
135        result.push(current);
136
137        for &delta in &self.deltas {
138            current += zigzag_decode(delta);
139            result.push(current);
140        }
141
142        result
143    }
144
145    /// Returns the number of values.
146    #[must_use]
147    pub fn len(&self) -> usize {
148        self.count
149    }
150
151    /// Returns whether the encoding is empty.
152    #[must_use]
153    pub fn is_empty(&self) -> bool {
154        self.count == 0
155    }
156
157    /// Returns the base value.
158    #[must_use]
159    pub fn base(&self) -> u64 {
160        self.base
161    }
162
163    /// Returns the deltas.
164    #[must_use]
165    pub fn deltas(&self) -> &[u64] {
166        &self.deltas
167    }
168
169    /// Returns the maximum delta value.
170    ///
171    /// Useful for determining bit width for bit-packing.
172    #[must_use]
173    pub fn max_delta(&self) -> u64 {
174        self.deltas.iter().copied().max().unwrap_or(0)
175    }
176
177    /// Returns the number of bits needed to represent the largest delta.
178    #[must_use]
179    pub fn bits_for_max_delta(&self) -> u8 {
180        let max = self.max_delta();
181        if max == 0 {
182            1
183        } else {
184            64 - max.leading_zeros() as u8
185        }
186    }
187
188    /// Estimates the compression ratio.
189    ///
190    /// Returns the ratio of original size to compressed size.
191    #[must_use]
192    pub fn compression_ratio(&self) -> f64 {
193        if self.count == 0 {
194            return 1.0;
195        }
196
197        let original_size = self.count * 8; // 8 bytes per u64
198        let compressed_size = 8 + (self.deltas.len() * 8); // base + deltas
199
200        original_size as f64 / compressed_size as f64
201    }
202
203    /// Serializes the delta encoding to bytes.
204    pub fn to_bytes(&self) -> Vec<u8> {
205        let mut buf = Vec::with_capacity(8 + 4 + self.deltas.len() * 8);
206        buf.extend_from_slice(&self.base.to_le_bytes());
207        buf.extend_from_slice(&(self.count as u32).to_le_bytes());
208        for &delta in &self.deltas {
209            buf.extend_from_slice(&delta.to_le_bytes());
210        }
211        buf
212    }
213
214    /// Deserializes a delta encoding from bytes.
215    pub fn from_bytes(bytes: &[u8]) -> io::Result<Self> {
216        if bytes.len() < 12 {
217            return Err(io::Error::new(
218                io::ErrorKind::InvalidData,
219                "Delta encoding too short",
220            ));
221        }
222
223        let base = u64::from_le_bytes(
224            bytes[0..8]
225                .try_into()
226                .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?,
227        );
228        let count = u32::from_le_bytes(
229            bytes[8..12]
230                .try_into()
231                .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?,
232        ) as usize;
233
234        let expected_len = 12 + (count.saturating_sub(1)) * 8;
235        if bytes.len() < expected_len {
236            return Err(io::Error::new(
237                io::ErrorKind::InvalidData,
238                "Delta encoding truncated",
239            ));
240        }
241
242        let mut deltas = Vec::with_capacity(count.saturating_sub(1));
243        let mut offset = 12;
244        for _ in 1..count {
245            let delta = u64::from_le_bytes(
246                bytes[offset..offset + 8]
247                    .try_into()
248                    .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?,
249            );
250            deltas.push(delta);
251            offset += 8;
252        }
253
254        Ok(Self {
255            base,
256            deltas,
257            count,
258        })
259    }
260}
261
262/// Zig-zag encodes a signed integer to unsigned.
263///
264/// Maps signed integers to unsigned: 0 -> 0, -1 -> 1, 1 -> 2, -2 -> 3, etc.
265#[inline]
266#[must_use]
267pub fn zigzag_encode(value: i64) -> u64 {
268    ((value << 1) ^ (value >> 63)) as u64
269}
270
271/// Zig-zag decodes an unsigned integer to signed.
272#[inline]
273#[must_use]
274pub fn zigzag_decode(value: u64) -> i64 {
275    ((value >> 1) as i64) ^ (-((value & 1) as i64))
276}
277
278#[cfg(test)]
279mod tests {
280    use super::*;
281
282    #[test]
283    fn test_delta_encode_decode() {
284        let values = vec![100u64, 105, 107, 110, 115, 120, 128, 130];
285        let encoded = DeltaEncoding::encode(&values);
286        let decoded = encoded.decode();
287        assert_eq!(values, decoded);
288    }
289
290    #[test]
291    fn test_delta_empty() {
292        let values: Vec<u64> = vec![];
293        let encoded = DeltaEncoding::encode(&values);
294        assert!(encoded.is_empty());
295        assert_eq!(encoded.decode(), values);
296    }
297
298    #[test]
299    fn test_delta_single_value() {
300        let values = vec![42u64];
301        let encoded = DeltaEncoding::encode(&values);
302        assert_eq!(encoded.len(), 1);
303        assert_eq!(encoded.decode(), values);
304    }
305
306    #[test]
307    fn test_delta_sequential() {
308        let values: Vec<u64> = (0..100).collect();
309        let encoded = DeltaEncoding::encode(&values);
310        assert_eq!(encoded.max_delta(), 1);
311        assert_eq!(encoded.bits_for_max_delta(), 1);
312        assert_eq!(encoded.decode(), values);
313    }
314
315    #[test]
316    fn test_delta_large_gaps() {
317        let values = vec![0u64, 1000, 2000, 3000];
318        let encoded = DeltaEncoding::encode(&values);
319        assert_eq!(encoded.max_delta(), 1000);
320        assert_eq!(encoded.decode(), values);
321    }
322
323    #[test]
324    fn test_zigzag_encode_decode() {
325        assert_eq!(zigzag_encode(0), 0);
326        assert_eq!(zigzag_encode(-1), 1);
327        assert_eq!(zigzag_encode(1), 2);
328        assert_eq!(zigzag_encode(-2), 3);
329        assert_eq!(zigzag_encode(2), 4);
330
331        for v in [-100i64, -50, -1, 0, 1, 50, 100, i64::MIN, i64::MAX] {
332            assert_eq!(zigzag_decode(zigzag_encode(v)), v);
333        }
334    }
335
336    #[test]
337    fn test_delta_signed() {
338        let values = vec![-100i64, -50, 0, 50, 100];
339        let encoded = DeltaEncoding::encode_signed(&values);
340        let decoded = encoded.decode_signed();
341        assert_eq!(values, decoded);
342    }
343
344    #[test]
345    fn test_delta_serialization() {
346        let values = vec![100u64, 105, 107, 110, 115];
347        let encoded = DeltaEncoding::encode(&values);
348        let bytes = encoded.to_bytes();
349        let restored = DeltaEncoding::from_bytes(&bytes).unwrap();
350        assert_eq!(encoded.decode(), restored.decode());
351    }
352
353    #[test]
354    fn test_compression_ratio() {
355        // Sequential values should compress well
356        let sequential: Vec<u64> = (0..1000).collect();
357        let encoded = DeltaEncoding::encode(&sequential);
358        // Each delta is 1, so compression is minimal but base + deltas is stored
359        assert!(encoded.len() == 1000);
360    }
361}