Skip to main content

nodedb_codec/
alp_rd.rs

1// SPDX-License-Identifier: Apache-2.0
2
3//! ALP-RD (Real Doubles) codec for true arbitrary f64 values.
4//!
5//! For full-precision doubles (scientific data, vector embeddings) where
6//! ALP can't find a lossless decimal mapping, ALP-RD exploits the structure
7//! of IEEE 754: the front bits (sign + exponent + high mantissa) are
8//! predictable (few unique patterns), while the tail bits (low mantissa)
9//! are noisy.
10//!
11//! Approach:
12//! 1. Right-shift each f64's bits by `cut` positions (typically 44-52 bits),
13//!    producing a "front" value with few unique values.
14//! 2. Dictionary-encode the front values (usually <256 unique patterns).
15//! 3. Store the tail bits raw (the bottom `cut` bits of each value).
16//!
17//! Compression: ~54 bits/f64 vs 64 raw (~15% reduction). Modest but
18//! consistent and lossless.
19//!
20//! Wire format:
21//! ```text
22//! [4 bytes] value count (LE u32)
23//! [1 byte]  cut position (number of tail bits, 0-63)
24//! [2 bytes] dictionary size (LE u16)
25//! [dict_size × 8 bytes] dictionary entries (LE u64 front values)
26//! [count × 1-2 bytes] dictionary indices (u8 if dict ≤ 256, u16 otherwise)
27//! [count × ceil(cut/8) bytes] tail bits (packed, little-endian)
28//! ```
29
30use crate::error::CodecError;
31
32use crate::CODEC_SAMPLE_SIZE;
33
34// ---------------------------------------------------------------------------
35// Public API
36// ---------------------------------------------------------------------------
37
38/// Encode f64 values using ALP-RD (front-bit dictionary + raw tail bits).
39pub fn encode(values: &[f64]) -> Result<Vec<u8>, CodecError> {
40    let count = values.len() as u32;
41
42    if values.is_empty() {
43        let mut out = Vec::with_capacity(7);
44        out.extend_from_slice(&0u32.to_le_bytes());
45        out.push(0); // cut
46        out.extend_from_slice(&0u16.to_le_bytes());
47        return Ok(out);
48    }
49
50    // Find optimal cut position.
51    let cut = find_best_cut(values);
52    let bits: Vec<u64> = values.iter().map(|v| v.to_bits()).collect();
53
54    // Split into front and tail.
55    let front_mask: u64 = if cut == 64 { 0 } else { u64::MAX << cut };
56    let tail_mask: u64 = if cut == 0 { 0 } else { (1u64 << cut) - 1 };
57    let tail_bytes_per_value = (cut as usize).div_ceil(8);
58
59    let fronts: Vec<u64> = bits.iter().map(|&b| (b & front_mask) >> cut).collect();
60
61    // Build dictionary from unique front values.
62    let mut dict: Vec<u64> = fronts.clone();
63    dict.sort_unstable();
64    dict.dedup();
65
66    // Map front values to dictionary indices.
67    // Safety: `dict` is built from `fronts` via sort+dedup, so every front value
68    // is guaranteed to exist in the dictionary. binary_search cannot fail here.
69    let indices: Vec<u16> = fronts
70        .iter()
71        .map(|f| {
72            dict.binary_search(f)
73                .map(|idx| idx as u16)
74                .map_err(|_| CodecError::Corrupt {
75                    detail: "ALP-RD front value missing from dictionary".into(),
76                })
77        })
78        .collect::<Result<_, _>>()?;
79
80    let dict_size = dict.len() as u16;
81    let use_u8_indices = dict.len() <= 256;
82
83    // Build output.
84    let mut out = Vec::with_capacity(
85        7 + dict.len() * 8
86            + values.len() * if use_u8_indices { 1 } else { 2 }
87            + values.len() * tail_bytes_per_value,
88    );
89
90    // Header.
91    out.extend_from_slice(&count.to_le_bytes());
92    out.push(cut);
93    out.extend_from_slice(&dict_size.to_le_bytes());
94
95    // Dictionary.
96    for &entry in &dict {
97        out.extend_from_slice(&entry.to_le_bytes());
98    }
99
100    // Indices.
101    if use_u8_indices {
102        for &idx in &indices {
103            out.push(idx as u8);
104        }
105    } else {
106        for &idx in &indices {
107            out.extend_from_slice(&idx.to_le_bytes());
108        }
109    }
110
111    // Tail bits (packed little-endian, tail_bytes_per_value bytes each).
112    for &b in &bits {
113        let tail = b & tail_mask;
114        for byte_idx in 0..tail_bytes_per_value {
115            out.push((tail >> (byte_idx * 8)) as u8);
116        }
117    }
118
119    Ok(out)
120}
121
122/// Decode ALP-RD compressed data back to f64 values.
123pub fn decode(data: &[u8]) -> Result<Vec<f64>, CodecError> {
124    if data.len() < 7 {
125        return Err(CodecError::Truncated {
126            expected: 7,
127            actual: data.len(),
128        });
129    }
130
131    let count = u32::from_le_bytes([data[0], data[1], data[2], data[3]]) as usize;
132    let cut = data[4];
133    let dict_size = u16::from_le_bytes([data[5], data[6]]) as usize;
134
135    if count == 0 {
136        return Ok(Vec::new());
137    }
138
139    if cut > 64 {
140        return Err(CodecError::Corrupt {
141            detail: format!("invalid ALP-RD cut position: {cut}"),
142        });
143    }
144
145    let tail_bytes_per_value = (cut as usize).div_ceil(8);
146    let tail_mask: u64 = if cut == 0 { 0 } else { (1u64 << cut) - 1 };
147    let use_u8_indices = dict_size <= 256;
148
149    // Read dictionary.
150    let mut pos = 7;
151    let dict_bytes = dict_size * 8;
152    if pos + dict_bytes > data.len() {
153        return Err(CodecError::Truncated {
154            expected: pos + dict_bytes,
155            actual: data.len(),
156        });
157    }
158    let mut dict = Vec::with_capacity(dict_size);
159    for _ in 0..dict_size {
160        dict.push(u64::from_le_bytes([
161            data[pos],
162            data[pos + 1],
163            data[pos + 2],
164            data[pos + 3],
165            data[pos + 4],
166            data[pos + 5],
167            data[pos + 6],
168            data[pos + 7],
169        ]));
170        pos += 8;
171    }
172
173    // Read indices.
174    let index_bytes = count * if use_u8_indices { 1 } else { 2 };
175    if pos + index_bytes > data.len() {
176        return Err(CodecError::Truncated {
177            expected: pos + index_bytes,
178            actual: data.len(),
179        });
180    }
181    let mut indices = Vec::with_capacity(count);
182    if use_u8_indices {
183        for i in 0..count {
184            indices.push(data[pos + i] as usize);
185        }
186        pos += count;
187    } else {
188        for i in 0..count {
189            let idx_pos = pos + i * 2;
190            indices.push(u16::from_le_bytes([data[idx_pos], data[idx_pos + 1]]) as usize);
191        }
192        pos += count * 2;
193    }
194
195    // Read tail bits.
196    let tail_total = count * tail_bytes_per_value;
197    if pos + tail_total > data.len() {
198        return Err(CodecError::Truncated {
199            expected: pos + tail_total,
200            actual: data.len(),
201        });
202    }
203
204    let mut values = Vec::with_capacity(count);
205    for (i, &idx) in indices.iter().enumerate() {
206        if idx >= dict.len() {
207            return Err(CodecError::Corrupt {
208                detail: format!("ALP-RD dict index {idx} out of range (max {})", dict.len()),
209            });
210        }
211
212        let front = dict[idx] << cut;
213        let mut tail = 0u64;
214        let tail_pos = pos + i * tail_bytes_per_value;
215        for byte_idx in 0..tail_bytes_per_value {
216            tail |= (data[tail_pos + byte_idx] as u64) << (byte_idx * 8);
217        }
218        tail &= tail_mask;
219
220        values.push(f64::from_bits(front | tail));
221    }
222
223    Ok(values)
224}
225
226// ---------------------------------------------------------------------------
227// Cut position selection
228// ---------------------------------------------------------------------------
229
230/// Find the optimal cut position that minimizes the dictionary size.
231///
232/// Tries cut positions from 44 to 56 bits (typical for f64) and picks
233/// the one that produces the fewest unique front values.
234fn find_best_cut(values: &[f64]) -> u8 {
235    let sample_end = values.len().min(CODEC_SAMPLE_SIZE);
236    let sample = &values[..sample_end];
237    let bits: Vec<u64> = sample.iter().map(|v| v.to_bits()).collect();
238
239    let mut best_cut = 48u8;
240    let mut best_unique = usize::MAX;
241
242    // Try cut positions from 40 to 56 (covering typical f64 patterns).
243    for cut in 40..=56 {
244        let mut fronts: Vec<u64> = bits.iter().map(|&b| b >> cut).collect();
245        fronts.sort_unstable();
246        fronts.dedup();
247
248        if fronts.len() < best_unique {
249            best_unique = fronts.len();
250            best_cut = cut;
251        }
252    }
253
254    best_cut
255}
256
257#[cfg(test)]
258mod tests {
259    use super::*;
260
261    #[test]
262    fn empty_roundtrip() {
263        let encoded = encode(&[]).unwrap();
264        let decoded = decode(&encoded).unwrap();
265        assert!(decoded.is_empty());
266    }
267
268    #[test]
269    fn pi_multiples() {
270        let values: Vec<f64> = (1..1000).map(|i| std::f64::consts::PI * i as f64).collect();
271        let encoded = encode(&values).unwrap();
272        let decoded = decode(&encoded).unwrap();
273        for (i, (a, b)) in values.iter().zip(decoded.iter()).enumerate() {
274            assert_eq!(a.to_bits(), b.to_bits(), "mismatch at {i}");
275        }
276    }
277
278    #[test]
279    fn scientific_data() {
280        let values: Vec<f64> = (0..1000).map(|i| (i as f64 * 0.001).exp()).collect();
281        let encoded = encode(&values).unwrap();
282        let decoded = decode(&encoded).unwrap();
283        for (i, (a, b)) in values.iter().zip(decoded.iter()).enumerate() {
284            assert_eq!(a.to_bits(), b.to_bits(), "mismatch at {i}");
285        }
286    }
287
288    #[test]
289    fn compression_ratio() {
290        let values: Vec<f64> = (1..10_000)
291            .map(|i| std::f64::consts::E * i as f64 + (i as f64).sqrt())
292            .collect();
293        let encoded = encode(&values).unwrap();
294        let raw_size = values.len() * 8;
295        let ratio = raw_size as f64 / encoded.len() as f64;
296        // ALP-RD saves depend on front-bit dictionary size.
297        // For data with many unique exponents, savings may be minimal.
298        // The key test is lossless roundtrip, not compression ratio.
299        assert!(
300            ratio >= 0.95,
301            "ALP-RD should not expand >5%, got {ratio:.2}x"
302        );
303
304        let decoded = decode(&encoded).unwrap();
305        for (a, b) in values.iter().zip(decoded.iter()) {
306            assert_eq!(a.to_bits(), b.to_bits());
307        }
308    }
309
310    #[test]
311    fn special_values() {
312        let values = vec![
313            0.0,
314            -0.0,
315            f64::INFINITY,
316            f64::NEG_INFINITY,
317            f64::NAN,
318            f64::MIN,
319            f64::MAX,
320            f64::MIN_POSITIVE,
321        ];
322        let encoded = encode(&values).unwrap();
323        let decoded = decode(&encoded).unwrap();
324        for (i, (a, b)) in values.iter().zip(decoded.iter()).enumerate() {
325            assert_eq!(a.to_bits(), b.to_bits(), "mismatch at {i}");
326        }
327    }
328
329    #[test]
330    fn identical_values() {
331        let values = vec![42.0f64; 1000];
332        let encoded = encode(&values).unwrap();
333        let decoded = decode(&encoded).unwrap();
334        for (a, b) in values.iter().zip(decoded.iter()) {
335            assert_eq!(a.to_bits(), b.to_bits());
336        }
337        // Dictionary has 1 entry — should be smaller than raw.
338        assert!(encoded.len() < values.len() * 8);
339    }
340
341    #[test]
342    fn truncated_errors() {
343        assert!(decode(&[]).is_err());
344        assert!(decode(&[1, 0, 0, 0, 48, 0]).is_err()); // too short
345    }
346}