Skip to main content

grafeo_core/storage/
delta.rs

1//! Delta encoding for sorted integer sequences.
2//!
3//! Instead of storing [100, 105, 107, 110], store the base (100) and the deltas
4//! [5, 2, 3]. The deltas are often tiny even when values are huge, making this
5//! a great first step before bit-packing.
6//!
7//! For signed integers, we use zig-zag encoding to map negative deltas to small
8//! positive numbers: 0→0, -1→1, 1→2, -2→3, etc.
9//!
10//! # Example
11//!
12//! ```no_run
13//! # use grafeo_core::storage::delta::DeltaEncoding;
14//! let values = vec![100u64, 105, 107, 110, 115];
15//! let encoded = DeltaEncoding::encode(&values);
16//! // base=100, deltas=[5, 2, 3, 5]
17//! assert_eq!(encoded.decode(), values);
18//! ```
19
20use std::io;
21
22/// Stores differences between consecutive values instead of the values themselves.
23///
24/// Pair this with [`BitPackedInts`](super::BitPackedInts) for maximum compression -
25/// use [`DeltaBitPacked`](super::DeltaBitPacked) for the combo.
26#[derive(Debug, Clone)]
27pub struct DeltaEncoding {
28    /// The first value in the sequence.
29    base: u64,
30    /// Deltas between consecutive values.
31    deltas: Vec<u64>,
32    /// Number of values (base + deltas.len()).
33    count: usize,
34}
35
36impl DeltaEncoding {
37    /// Encodes a slice of u64 values using delta encoding.
38    ///
39    /// # Requirements
40    ///
41    /// Values **must** be sorted in ascending order. Unsorted input will produce
42    /// incorrect results due to `saturating_sub` mapping negative deltas to zero.
43    /// A debug assertion checks this in debug builds.
44    ///
45    /// For signed or unsorted data, use [`encode_signed`](Self::encode_signed) instead.
46    #[must_use]
47    pub fn encode(values: &[u64]) -> Self {
48        if values.is_empty() {
49            return Self {
50                base: 0,
51                deltas: Vec::new(),
52                count: 0,
53            };
54        }
55
56        // Values must be sorted for unsigned delta encoding to work correctly
57        debug_assert!(
58            values.windows(2).all(|w| w[0] <= w[1]),
59            "DeltaEncoding::encode requires sorted input; use encode_signed for unsorted data"
60        );
61
62        let base = values[0];
63        let deltas: Vec<u64> = values
64            .windows(2)
65            .map(|w| w[1].saturating_sub(w[0]))
66            .collect();
67
68        Self {
69            base,
70            deltas,
71            count: values.len(),
72        }
73    }
74
75    /// Encodes a slice of i64 values using delta encoding.
76    ///
77    /// Computes signed deltas between consecutive values, then zig-zag encodes
78    /// the deltas to store them as unsigned integers.
79    #[must_use]
80    pub fn encode_signed(values: &[i64]) -> Self {
81        if values.is_empty() {
82            return Self {
83                base: 0,
84                deltas: Vec::new(),
85                count: 0,
86            };
87        }
88
89        // Store base as zig-zag encoded
90        let base = zigzag_encode(values[0]);
91
92        // Compute signed deltas, then zig-zag encode them
93        let deltas: Vec<u64> = values
94            .windows(2)
95            .map(|w| zigzag_encode(w[1] - w[0]))
96            .collect();
97
98        Self {
99            base,
100            deltas,
101            count: values.len(),
102        }
103    }
104
105    /// Decodes the delta-encoded values back to the original sequence.
106    #[must_use]
107    pub fn decode(&self) -> Vec<u64> {
108        if self.count == 0 {
109            return Vec::new();
110        }
111
112        let mut result = Vec::with_capacity(self.count);
113        let mut current = self.base;
114        result.push(current);
115
116        for &delta in &self.deltas {
117            current = current.wrapping_add(delta);
118            result.push(current);
119        }
120
121        result
122    }
123
124    /// Decodes to signed integers using zig-zag decoding.
125    ///
126    /// Assumes the encoding was created with `encode_signed`.
127    #[must_use]
128    pub fn decode_signed(&self) -> Vec<i64> {
129        if self.count == 0 {
130            return Vec::new();
131        }
132
133        let mut result = Vec::with_capacity(self.count);
134        let mut current = zigzag_decode(self.base);
135        result.push(current);
136
137        for &delta in &self.deltas {
138            current += zigzag_decode(delta);
139            result.push(current);
140        }
141
142        result
143    }
144
145    /// Returns the number of values.
146    #[must_use]
147    pub fn len(&self) -> usize {
148        self.count
149    }
150
151    /// Returns whether the encoding is empty.
152    #[must_use]
153    pub fn is_empty(&self) -> bool {
154        self.count == 0
155    }
156
157    /// Returns the base value.
158    #[must_use]
159    pub fn base(&self) -> u64 {
160        self.base
161    }
162
163    /// Returns the deltas.
164    #[must_use]
165    pub fn deltas(&self) -> &[u64] {
166        &self.deltas
167    }
168
169    /// Returns the maximum delta value.
170    ///
171    /// Useful for determining bit width for bit-packing.
172    #[must_use]
173    pub fn max_delta(&self) -> u64 {
174        self.deltas.iter().copied().max().unwrap_or(0)
175    }
176
177    /// Returns the number of bits needed to represent the largest delta.
178    #[must_use]
179    pub fn bits_for_max_delta(&self) -> u8 {
180        let max = self.max_delta();
181        if max == 0 {
182            1
183        } else {
184            64 - max.leading_zeros() as u8
185        }
186    }
187
188    /// Estimates the compression ratio.
189    ///
190    /// Returns the ratio of original size to compressed size.
191    #[must_use]
192    pub fn compression_ratio(&self) -> f64 {
193        if self.count == 0 {
194            return 1.0;
195        }
196
197        let original_size = self.count * 8; // 8 bytes per u64
198        let compressed_size = 8 + (self.deltas.len() * 8); // base + deltas
199
200        original_size as f64 / compressed_size as f64
201    }
202
203    /// Serializes the delta encoding to bytes.
204    pub fn to_bytes(&self) -> Vec<u8> {
205        let mut buf = Vec::with_capacity(8 + 4 + self.deltas.len() * 8);
206        buf.extend_from_slice(&self.base.to_le_bytes());
207        buf.extend_from_slice(&(self.count as u32).to_le_bytes());
208        for &delta in &self.deltas {
209            buf.extend_from_slice(&delta.to_le_bytes());
210        }
211        buf
212    }
213
214    /// Deserializes a delta encoding from bytes.
215    pub fn from_bytes(bytes: &[u8]) -> io::Result<Self> {
216        if bytes.len() < 12 {
217            return Err(io::Error::new(
218                io::ErrorKind::InvalidData,
219                "Delta encoding too short",
220            ));
221        }
222
223        let base = u64::from_le_bytes(bytes[0..8].try_into().unwrap());
224        let count = u32::from_le_bytes(bytes[8..12].try_into().unwrap()) as usize;
225
226        let expected_len = 12 + (count.saturating_sub(1)) * 8;
227        if bytes.len() < expected_len {
228            return Err(io::Error::new(
229                io::ErrorKind::InvalidData,
230                "Delta encoding truncated",
231            ));
232        }
233
234        let mut deltas = Vec::with_capacity(count.saturating_sub(1));
235        let mut offset = 12;
236        for _ in 1..count {
237            let delta = u64::from_le_bytes(bytes[offset..offset + 8].try_into().unwrap());
238            deltas.push(delta);
239            offset += 8;
240        }
241
242        Ok(Self {
243            base,
244            deltas,
245            count,
246        })
247    }
248}
249
250/// Zig-zag encodes a signed integer to unsigned.
251///
252/// Maps signed integers to unsigned: 0 -> 0, -1 -> 1, 1 -> 2, -2 -> 3, etc.
253#[inline]
254#[must_use]
255pub fn zigzag_encode(value: i64) -> u64 {
256    ((value << 1) ^ (value >> 63)) as u64
257}
258
259/// Zig-zag decodes an unsigned integer to signed.
260#[inline]
261#[must_use]
262pub fn zigzag_decode(value: u64) -> i64 {
263    ((value >> 1) as i64) ^ (-((value & 1) as i64))
264}
265
266#[cfg(test)]
267mod tests {
268    use super::*;
269
270    #[test]
271    fn test_delta_encode_decode() {
272        let values = vec![100u64, 105, 107, 110, 115, 120, 128, 130];
273        let encoded = DeltaEncoding::encode(&values);
274        let decoded = encoded.decode();
275        assert_eq!(values, decoded);
276    }
277
278    #[test]
279    fn test_delta_empty() {
280        let values: Vec<u64> = vec![];
281        let encoded = DeltaEncoding::encode(&values);
282        assert!(encoded.is_empty());
283        assert_eq!(encoded.decode(), values);
284    }
285
286    #[test]
287    fn test_delta_single_value() {
288        let values = vec![42u64];
289        let encoded = DeltaEncoding::encode(&values);
290        assert_eq!(encoded.len(), 1);
291        assert_eq!(encoded.decode(), values);
292    }
293
294    #[test]
295    fn test_delta_sequential() {
296        let values: Vec<u64> = (0..100).collect();
297        let encoded = DeltaEncoding::encode(&values);
298        assert_eq!(encoded.max_delta(), 1);
299        assert_eq!(encoded.bits_for_max_delta(), 1);
300        assert_eq!(encoded.decode(), values);
301    }
302
303    #[test]
304    fn test_delta_large_gaps() {
305        let values = vec![0u64, 1000, 2000, 3000];
306        let encoded = DeltaEncoding::encode(&values);
307        assert_eq!(encoded.max_delta(), 1000);
308        assert_eq!(encoded.decode(), values);
309    }
310
311    #[test]
312    fn test_zigzag_encode_decode() {
313        assert_eq!(zigzag_encode(0), 0);
314        assert_eq!(zigzag_encode(-1), 1);
315        assert_eq!(zigzag_encode(1), 2);
316        assert_eq!(zigzag_encode(-2), 3);
317        assert_eq!(zigzag_encode(2), 4);
318
319        for v in [-100i64, -50, -1, 0, 1, 50, 100, i64::MIN, i64::MAX] {
320            assert_eq!(zigzag_decode(zigzag_encode(v)), v);
321        }
322    }
323
324    #[test]
325    fn test_delta_signed() {
326        let values = vec![-100i64, -50, 0, 50, 100];
327        let encoded = DeltaEncoding::encode_signed(&values);
328        let decoded = encoded.decode_signed();
329        assert_eq!(values, decoded);
330    }
331
332    #[test]
333    fn test_delta_serialization() {
334        let values = vec![100u64, 105, 107, 110, 115];
335        let encoded = DeltaEncoding::encode(&values);
336        let bytes = encoded.to_bytes();
337        let restored = DeltaEncoding::from_bytes(&bytes).unwrap();
338        assert_eq!(encoded.decode(), restored.decode());
339    }
340
341    #[test]
342    fn test_compression_ratio() {
343        // Sequential values should compress well
344        let sequential: Vec<u64> = (0..1000).collect();
345        let encoded = DeltaEncoding::encode(&sequential);
346        // Each delta is 1, so compression is minimal but base + deltas is stored
347        assert!(encoded.len() == 1000);
348    }
349}