nodedb_codec/
alp_rd.rs

1//! ALP-RD (Real Doubles) codec for true arbitrary f64 values.
2//!
3//! For full-precision doubles (scientific data, vector embeddings) where
4//! ALP can't find a lossless decimal mapping, ALP-RD exploits the structure
5//! of IEEE 754: the front bits (sign + exponent + high mantissa) are
6//! predictable (few unique patterns), while the tail bits (low mantissa)
7//! are noisy.
8//!
9//! Approach:
10//! 1. Right-shift each f64's bits by `cut` positions (typically 44-52 bits),
11//!    producing a "front" value with few unique values.
12//! 2. Dictionary-encode the front values (usually <256 unique patterns).
13//! 3. Store the tail bits raw (the bottom `cut` bits of each value).
14//!
15//! Compression: ~54 bits/f64 vs 64 raw (~15% reduction). Modest but
16//! consistent and lossless.
17//!
18//! Wire format:
19//! ```text
20//! [4 bytes] value count (LE u32)
21//! [1 byte]  cut position (number of tail bits, 0-63)
22//! [2 bytes] dictionary size (LE u16)
23//! [dict_size × 8 bytes] dictionary entries (LE u64 front values)
24//! [count × 1-2 bytes] dictionary indices (u8 if dict ≤ 256, u16 otherwise)
25//! [count × ceil(cut/8) bytes] tail bits (packed, little-endian)
26//! ```
27
28use crate::error::CodecError;
29
30use crate::CODEC_SAMPLE_SIZE;
31
32// ---------------------------------------------------------------------------
33// Public API
34// ---------------------------------------------------------------------------
35
36/// Encode f64 values using ALP-RD (front-bit dictionary + raw tail bits).
37pub fn encode(values: &[f64]) -> Result<Vec<u8>, CodecError> {
38    let count = values.len() as u32;
39
40    if values.is_empty() {
41        let mut out = Vec::with_capacity(7);
42        out.extend_from_slice(&0u32.to_le_bytes());
43        out.push(0); // cut
44        out.extend_from_slice(&0u16.to_le_bytes());
45        return Ok(out);
46    }
47
48    // Find optimal cut position.
49    let cut = find_best_cut(values);
50    let bits: Vec<u64> = values.iter().map(|v| v.to_bits()).collect();
51
52    // Split into front and tail.
53    let front_mask: u64 = if cut == 64 { 0 } else { u64::MAX << cut };
54    let tail_mask: u64 = if cut == 0 { 0 } else { (1u64 << cut) - 1 };
55    let tail_bytes_per_value = (cut as usize).div_ceil(8);
56
57    let fronts: Vec<u64> = bits.iter().map(|&b| (b & front_mask) >> cut).collect();
58
59    // Build dictionary from unique front values.
60    let mut dict: Vec<u64> = fronts.clone();
61    dict.sort_unstable();
62    dict.dedup();
63
64    // Map front values to dictionary indices.
65    // Safety: `dict` is built from `fronts` via sort+dedup, so every front value
66    // is guaranteed to exist in the dictionary. binary_search cannot fail here.
67    let indices: Vec<u16> = fronts
68        .iter()
69        .map(|f| {
70            dict.binary_search(f)
71                .map(|idx| idx as u16)
72                .map_err(|_| CodecError::Corrupt {
73                    detail: "ALP-RD front value missing from dictionary".into(),
74                })
75        })
76        .collect::<Result<_, _>>()?;
77
78    let dict_size = dict.len() as u16;
79    let use_u8_indices = dict.len() <= 256;
80
81    // Build output.
82    let mut out = Vec::with_capacity(
83        7 + dict.len() * 8
84            + values.len() * if use_u8_indices { 1 } else { 2 }
85            + values.len() * tail_bytes_per_value,
86    );
87
88    // Header.
89    out.extend_from_slice(&count.to_le_bytes());
90    out.push(cut);
91    out.extend_from_slice(&dict_size.to_le_bytes());
92
93    // Dictionary.
94    for &entry in &dict {
95        out.extend_from_slice(&entry.to_le_bytes());
96    }
97
98    // Indices.
99    if use_u8_indices {
100        for &idx in &indices {
101            out.push(idx as u8);
102        }
103    } else {
104        for &idx in &indices {
105            out.extend_from_slice(&idx.to_le_bytes());
106        }
107    }
108
109    // Tail bits (packed little-endian, tail_bytes_per_value bytes each).
110    for &b in &bits {
111        let tail = b & tail_mask;
112        for byte_idx in 0..tail_bytes_per_value {
113            out.push((tail >> (byte_idx * 8)) as u8);
114        }
115    }
116
117    Ok(out)
118}
119
120/// Decode ALP-RD compressed data back to f64 values.
121pub fn decode(data: &[u8]) -> Result<Vec<f64>, CodecError> {
122    if data.len() < 7 {
123        return Err(CodecError::Truncated {
124            expected: 7,
125            actual: data.len(),
126        });
127    }
128
129    let count = u32::from_le_bytes([data[0], data[1], data[2], data[3]]) as usize;
130    let cut = data[4];
131    let dict_size = u16::from_le_bytes([data[5], data[6]]) as usize;
132
133    if count == 0 {
134        return Ok(Vec::new());
135    }
136
137    if cut > 64 {
138        return Err(CodecError::Corrupt {
139            detail: format!("invalid ALP-RD cut position: {cut}"),
140        });
141    }
142
143    let tail_bytes_per_value = (cut as usize).div_ceil(8);
144    let tail_mask: u64 = if cut == 0 { 0 } else { (1u64 << cut) - 1 };
145    let use_u8_indices = dict_size <= 256;
146
147    // Read dictionary.
148    let mut pos = 7;
149    let dict_bytes = dict_size * 8;
150    if pos + dict_bytes > data.len() {
151        return Err(CodecError::Truncated {
152            expected: pos + dict_bytes,
153            actual: data.len(),
154        });
155    }
156    let mut dict = Vec::with_capacity(dict_size);
157    for _ in 0..dict_size {
158        dict.push(u64::from_le_bytes([
159            data[pos],
160            data[pos + 1],
161            data[pos + 2],
162            data[pos + 3],
163            data[pos + 4],
164            data[pos + 5],
165            data[pos + 6],
166            data[pos + 7],
167        ]));
168        pos += 8;
169    }
170
171    // Read indices.
172    let index_bytes = count * if use_u8_indices { 1 } else { 2 };
173    if pos + index_bytes > data.len() {
174        return Err(CodecError::Truncated {
175            expected: pos + index_bytes,
176            actual: data.len(),
177        });
178    }
179    let mut indices = Vec::with_capacity(count);
180    if use_u8_indices {
181        for i in 0..count {
182            indices.push(data[pos + i] as usize);
183        }
184        pos += count;
185    } else {
186        for i in 0..count {
187            let idx_pos = pos + i * 2;
188            indices.push(u16::from_le_bytes([data[idx_pos], data[idx_pos + 1]]) as usize);
189        }
190        pos += count * 2;
191    }
192
193    // Read tail bits.
194    let tail_total = count * tail_bytes_per_value;
195    if pos + tail_total > data.len() {
196        return Err(CodecError::Truncated {
197            expected: pos + tail_total,
198            actual: data.len(),
199        });
200    }
201
202    let mut values = Vec::with_capacity(count);
203    for (i, &idx) in indices.iter().enumerate() {
204        if idx >= dict.len() {
205            return Err(CodecError::Corrupt {
206                detail: format!("ALP-RD dict index {idx} out of range (max {})", dict.len()),
207            });
208        }
209
210        let front = dict[idx] << cut;
211        let mut tail = 0u64;
212        let tail_pos = pos + i * tail_bytes_per_value;
213        for byte_idx in 0..tail_bytes_per_value {
214            tail |= (data[tail_pos + byte_idx] as u64) << (byte_idx * 8);
215        }
216        tail &= tail_mask;
217
218        values.push(f64::from_bits(front | tail));
219    }
220
221    Ok(values)
222}
223
224// ---------------------------------------------------------------------------
225// Cut position selection
226// ---------------------------------------------------------------------------
227
228/// Find the optimal cut position that minimizes the dictionary size.
229///
230/// Tries cut positions from 44 to 56 bits (typical for f64) and picks
231/// the one that produces the fewest unique front values.
232fn find_best_cut(values: &[f64]) -> u8 {
233    let sample_end = values.len().min(CODEC_SAMPLE_SIZE);
234    let sample = &values[..sample_end];
235    let bits: Vec<u64> = sample.iter().map(|v| v.to_bits()).collect();
236
237    let mut best_cut = 48u8;
238    let mut best_unique = usize::MAX;
239
240    // Try cut positions from 40 to 56 (covering typical f64 patterns).
241    for cut in 40..=56 {
242        let mut fronts: Vec<u64> = bits.iter().map(|&b| b >> cut).collect();
243        fronts.sort_unstable();
244        fronts.dedup();
245
246        if fronts.len() < best_unique {
247            best_unique = fronts.len();
248            best_cut = cut;
249        }
250    }
251
252    best_cut
253}
254
255#[cfg(test)]
256mod tests {
257    use super::*;
258
259    #[test]
260    fn empty_roundtrip() {
261        let encoded = encode(&[]).unwrap();
262        let decoded = decode(&encoded).unwrap();
263        assert!(decoded.is_empty());
264    }
265
266    #[test]
267    fn pi_multiples() {
268        let values: Vec<f64> = (1..1000).map(|i| std::f64::consts::PI * i as f64).collect();
269        let encoded = encode(&values).unwrap();
270        let decoded = decode(&encoded).unwrap();
271        for (i, (a, b)) in values.iter().zip(decoded.iter()).enumerate() {
272            assert_eq!(a.to_bits(), b.to_bits(), "mismatch at {i}");
273        }
274    }
275
276    #[test]
277    fn scientific_data() {
278        let values: Vec<f64> = (0..1000).map(|i| (i as f64 * 0.001).exp()).collect();
279        let encoded = encode(&values).unwrap();
280        let decoded = decode(&encoded).unwrap();
281        for (i, (a, b)) in values.iter().zip(decoded.iter()).enumerate() {
282            assert_eq!(a.to_bits(), b.to_bits(), "mismatch at {i}");
283        }
284    }
285
286    #[test]
287    fn compression_ratio() {
288        let values: Vec<f64> = (1..10_000)
289            .map(|i| std::f64::consts::E * i as f64 + (i as f64).sqrt())
290            .collect();
291        let encoded = encode(&values).unwrap();
292        let raw_size = values.len() * 8;
293        let ratio = raw_size as f64 / encoded.len() as f64;
294        // ALP-RD saves depend on front-bit dictionary size.
295        // For data with many unique exponents, savings may be minimal.
296        // The key test is lossless roundtrip, not compression ratio.
297        assert!(
298            ratio >= 0.95,
299            "ALP-RD should not expand >5%, got {ratio:.2}x"
300        );
301
302        let decoded = decode(&encoded).unwrap();
303        for (a, b) in values.iter().zip(decoded.iter()) {
304            assert_eq!(a.to_bits(), b.to_bits());
305        }
306    }
307
308    #[test]
309    fn special_values() {
310        let values = vec![
311            0.0,
312            -0.0,
313            f64::INFINITY,
314            f64::NEG_INFINITY,
315            f64::NAN,
316            f64::MIN,
317            f64::MAX,
318            f64::MIN_POSITIVE,
319        ];
320        let encoded = encode(&values).unwrap();
321        let decoded = decode(&encoded).unwrap();
322        for (i, (a, b)) in values.iter().zip(decoded.iter()).enumerate() {
323            assert_eq!(a.to_bits(), b.to_bits(), "mismatch at {i}");
324        }
325    }
326
327    #[test]
328    fn identical_values() {
329        let values = vec![42.0f64; 1000];
330        let encoded = encode(&values).unwrap();
331        let decoded = decode(&encoded).unwrap();
332        for (a, b) in values.iter().zip(decoded.iter()) {
333            assert_eq!(a.to_bits(), b.to_bits());
334        }
335        // Dictionary has 1 entry — should be smaller than raw.
336        assert!(encoded.len() < values.len() * 8);
337    }
338
339    #[test]
340    fn truncated_errors() {
341        assert!(decode(&[]).is_err());
342        assert!(decode(&[1, 0, 0, 0, 48, 0]).is_err()); // too short
343    }
344}
nodedb_codec/alp_rd.rs

nodedb_codec/
alp_rd.rs