Skip to main content

tensor_compress/
delta.rs

1// SPDX-License-Identifier: MIT OR Apache-2.0
2//! Delta encoding with variable-length integers for sorted ID sequences.
3
4/// Delta-encode a sorted list of IDs.
5/// Stores first value followed by differences between consecutive values.
6#[must_use]
7pub fn delta_encode(ids: &[u64]) -> Vec<u64> {
8    if ids.is_empty() {
9        return Vec::new();
10    }
11
12    let mut result = Vec::with_capacity(ids.len());
13    result.push(ids[0]);
14
15    for window in ids.windows(2) {
16        result.push(window[1].saturating_sub(window[0]));
17    }
18
19    result
20}
21
22/// Decode delta-encoded IDs back to original sorted list.
23#[must_use]
24pub fn delta_decode(deltas: &[u64]) -> Vec<u64> {
25    if deltas.is_empty() {
26        return Vec::new();
27    }
28
29    let mut result = Vec::with_capacity(deltas.len());
30    let mut current = deltas[0];
31    result.push(current);
32
33    for &delta in &deltas[1..] {
34        current = current.saturating_add(delta);
35        result.push(current);
36    }
37
38    result
39}
40
41/// Variable-length encode u64 values.
42/// Uses 7 bits per byte with high bit as continuation flag.
43#[must_use]
44pub fn varint_encode(values: &[u64]) -> Vec<u8> {
45    let mut result = Vec::with_capacity(values.len() * 2);
46
47    for &value in values {
48        let mut v = value;
49        loop {
50            let byte = (v & 0x7f) as u8;
51            v >>= 7;
52            if v == 0 {
53                result.push(byte);
54                break;
55            }
56            result.push(byte | 0x80);
57        }
58    }
59
60    result
61}
62
63/// Decode variable-length encoded bytes back to u64 values.
64#[must_use]
65pub fn varint_decode(bytes: &[u8]) -> Vec<u64> {
66    let mut result = Vec::new();
67    let mut current: u64 = 0;
68    let mut shift: u32 = 0;
69
70    for &byte in bytes {
71        // Prevent shift overflow (max 10 bytes for u64, shift max 63)
72        if shift >= 64 {
73            // Malformed input: too many continuation bytes
74            // Skip remaining bytes of this value until we find a terminator
75            if byte & 0x80 == 0 {
76                result.push(current);
77                current = 0;
78                shift = 0;
79            }
80            continue;
81        }
82
83        current |= u64::from(byte & 0x7f) << shift;
84        shift += 7;
85
86        if byte & 0x80 == 0 {
87            result.push(current);
88            current = 0;
89            shift = 0;
90        }
91    }
92
93    result
94}
95
96/// Combined delta + varint encoding for maximum compression of sorted IDs.
97#[must_use]
98pub fn compress_ids(ids: &[u64]) -> Vec<u8> {
99    let deltas = delta_encode(ids);
100    varint_encode(&deltas)
101}
102
103/// Decompress delta + varint encoded IDs.
104#[must_use]
105pub fn decompress_ids(bytes: &[u8]) -> Vec<u64> {
106    let deltas = varint_decode(bytes);
107    delta_decode(&deltas)
108}
109
110#[cfg(test)]
111mod tests {
112    use super::*;
113
114    #[test]
115    fn test_delta_encode_empty() {
116        assert!(delta_encode(&[]).is_empty());
117    }
118
119    #[test]
120    fn test_delta_encode_single() {
121        assert_eq!(delta_encode(&[42]), vec![42]);
122    }
123
124    #[test]
125    fn test_delta_encode_sequential() {
126        let ids = vec![100, 101, 102, 103, 104];
127        let encoded = delta_encode(&ids);
128        assert_eq!(encoded, vec![100, 1, 1, 1, 1]);
129    }
130
131    #[test]
132    fn test_delta_encode_gaps() {
133        let ids = vec![10, 20, 100, 101, 200];
134        let encoded = delta_encode(&ids);
135        assert_eq!(encoded, vec![10, 10, 80, 1, 99]);
136    }
137
138    #[test]
139    fn test_delta_roundtrip() {
140        let original = vec![1, 5, 10, 100, 1000, 10000];
141        let encoded = delta_encode(&original);
142        let decoded = delta_decode(&encoded);
143        assert_eq!(original, decoded);
144    }
145
146    #[test]
147    fn test_delta_decode_empty() {
148        assert!(delta_decode(&[]).is_empty());
149    }
150
151    #[test]
152    fn test_varint_encode_small() {
153        let values = vec![0, 1, 127];
154        let encoded = varint_encode(&values);
155        assert_eq!(encoded, vec![0, 1, 127]);
156    }
157
158    #[test]
159    fn test_varint_encode_medium() {
160        let values = vec![128];
161        let encoded = varint_encode(&values);
162        assert_eq!(encoded, vec![0x80, 0x01]);
163    }
164
165    #[test]
166    fn test_varint_encode_large() {
167        let values = vec![16384];
168        let encoded = varint_encode(&values);
169        assert_eq!(encoded, vec![0x80, 0x80, 0x01]);
170    }
171
172    #[test]
173    fn test_varint_roundtrip() {
174        let original = vec![0, 1, 127, 128, 255, 256, 16383, 16384, u64::MAX];
175        let encoded = varint_encode(&original);
176        let decoded = varint_decode(&encoded);
177        assert_eq!(original, decoded);
178    }
179
180    #[test]
181    fn test_varint_empty() {
182        assert!(varint_encode(&[]).is_empty());
183        assert!(varint_decode(&[]).is_empty());
184    }
185
186    #[test]
187    fn test_compress_ids_sequential() {
188        let ids: Vec<u64> = (1000..1100).collect();
189        let compressed = compress_ids(&ids);
190        let decompressed = decompress_ids(&compressed);
191        assert_eq!(ids, decompressed);
192
193        // Sequential IDs should compress very well
194        let original_bytes = ids.len() * 8;
195        let compressed_bytes = compressed.len();
196        let ratio = original_bytes as f64 / compressed_bytes as f64;
197        assert!(ratio > 4.0, "Expected >4x compression, got {ratio:.2}x");
198    }
199
200    #[test]
201    fn test_compress_ids_sparse() {
202        let ids = vec![100, 1000, 10000, 100_000, 1_000_000];
203        let compressed = compress_ids(&ids);
204        let decompressed = decompress_ids(&compressed);
205        assert_eq!(ids, decompressed);
206    }
207
208    #[test]
209    fn test_compress_ids_empty() {
210        assert!(compress_ids(&[]).is_empty());
211        assert!(decompress_ids(&[]).is_empty());
212    }
213
214    #[test]
215    fn test_compression_ratio_best_case() {
216        // 10,000 sequential IDs starting at 0
217        let ids: Vec<u64> = (0..10_000).collect();
218        let compressed = compress_ids(&ids);
219
220        let original_bytes = ids.len() * 8;
221        let compressed_bytes = compressed.len();
222        let ratio = original_bytes as f64 / compressed_bytes as f64;
223
224        // Sequential should give ~8x compression (1 byte per delta)
225        assert!(ratio > 7.0, "Expected ~8x compression, got {ratio:.2}x");
226    }
227
228    #[test]
229    fn test_compression_ratio_worst_case() {
230        // Random large gaps (worst case for delta)
231        let ids = vec![0, u64::MAX / 4, u64::MAX / 2, u64::MAX];
232        let compressed = compress_ids(&ids);
233        let decompressed = decompress_ids(&compressed);
234        assert_eq!(ids, decompressed);
235    }
236
237    #[test]
238    fn test_varint_decode_malformed_overflow() {
239        // Create malformed input with too many continuation bytes (>10 for u64)
240        // Each byte has high bit set (continuation) except the terminator
241        let mut malformed = vec![0x80u8; 12]; // 12 continuation bytes
242        malformed.push(0x00); // Terminator
243        malformed.push(0x01); // A valid varint (value 1)
244
245        let decoded = varint_decode(&malformed);
246        // Should handle gracefully without panicking
247        // The malformed value should be skipped/truncated, but valid ones parsed
248        assert!(!decoded.is_empty());
249    }
250
251    #[test]
252    fn test_decompress_ids_malformed() {
253        // Malformed delta-encoded data with overflow varints
254        let mut malformed = vec![0x80u8; 15]; // Way too many continuation bytes
255        malformed.push(0x00); // Terminator
256
257        // Should not panic
258        let result = decompress_ids(&malformed);
259        // Result may be incomplete but should not crash
260        assert!(result.len() <= 2);
261    }
262}