nodedb_codec/
alp.rs

1//! ALP (Adaptive Lossless floating-Point) codec for f64 columns.
2//!
3//! Most real-world float metrics originate as fixed-point decimals (e.g.,
4//! `23.5`, `99.99`). ALP finds the optimal power-of-10 multiplier that
5//! converts them to integers **losslessly**: `23.5 × 10 = 235` round-trips
6//! exactly via `235 / 10 = 23.5`. The resulting integers have tiny bit-widths
7//! and compress 3-6x better than Gorilla with SIMD-friendly bit-packing.
8//!
9//! Values that don't round-trip exactly (true arbitrary doubles, NaN, Inf)
10//! are stored as exceptions — their original f64 bits are preserved separately.
11//!
12//! Wire format:
13//! ```text
14//! [4 bytes] total value count (LE u32)
15//! [1 byte]  exponent e (power of 10: factor = 10^e, range 0-18)
16//! [1 byte]  factor index f (combined encode/decode factor)
17//! [4 bytes] exception count (LE u32)
18//! [exception_count × 12 bytes] exceptions: (index: u32, value_bits: u64)
19//! [N bytes] FastLanes-packed encoded integers (FOR + bit-pack)
20//! ```
21//!
22//! Reference: "ALP: Adaptive Lossless floating-Point Compression"
23//! (Afroozeh et al., SIGMOD 2023)
24
25use crate::error::CodecError;
26use crate::fastlanes;
27
28/// Maximum exponent (10^18 fits in i64).
29const MAX_EXPONENT: u8 = 18;
30
31/// Powers of 10 as f64 for the encode multiplier.
32const POW10: [f64; 19] = [
33    1.0,
34    10.0,
35    100.0,
36    1_000.0,
37    10_000.0,
38    100_000.0,
39    1_000_000.0,
40    10_000_000.0,
41    100_000_000.0,
42    1_000_000_000.0,
43    10_000_000_000.0,
44    100_000_000_000.0,
45    1_000_000_000_000.0,
46    10_000_000_000_000.0,
47    100_000_000_000_000.0,
48    1_000_000_000_000_000.0,
49    10_000_000_000_000_000.0,
50    100_000_000_000_000_000.0,
51    1_000_000_000_000_000_000.0,
52];
53
54/// Inverse powers of 10 for the decode divisor.
55const INV_POW10: [f64; 19] = [
56    1.0,
57    0.1,
58    0.01,
59    0.001,
60    0.000_1,
61    0.000_01,
62    0.000_001,
63    0.000_000_1,
64    0.000_000_01,
65    0.000_000_001,
66    0.000_000_000_1,
67    0.000_000_000_01,
68    0.000_000_000_001,
69    0.000_000_000_000_1,
70    0.000_000_000_000_01,
71    0.000_000_000_000_001,
72    0.000_000_000_000_000_1,
73    0.000_000_000_000_000_01,
74    0.000_000_000_000_000_001,
75];
76
77use crate::CODEC_SAMPLE_SIZE;
78
79// ---------------------------------------------------------------------------
80// Public encode / decode API
81// ---------------------------------------------------------------------------
82
83/// Encode a slice of f64 values using ALP compression.
84///
85/// Finds the optimal decimal exponent, converts to integers, stores
86/// exceptions for values that don't round-trip, and bit-packs the integers
87/// via FastLanes.
88pub fn encode(values: &[f64]) -> Vec<u8> {
89    let count = values.len() as u32;
90
91    if values.is_empty() {
92        let mut out = Vec::with_capacity(11);
93        out.extend_from_slice(&0u32.to_le_bytes()); // count
94        out.push(0); // encode exponent
95        out.push(0); // decode exponent
96        out.push(0); // mode
97        out.extend_from_slice(&0u32.to_le_bytes()); // exception count
98        return out;
99    }
100
101    // Step 1: Find optimal parameters by sampling.
102    let params = find_best_params(values);
103    let factor = POW10[params.encode_exp as usize];
104
105    // Step 2: Encode values to integers, collect exceptions.
106    let mut encoded_ints = Vec::with_capacity(values.len());
107    let mut exceptions: Vec<(u32, u64)> = Vec::new();
108
109    for (i, &val) in values.iter().enumerate() {
110        let encoded = try_alp_encode(val, factor, params.decode_exp, params.mode);
111        match encoded {
112            Some(int_val) => {
113                encoded_ints.push(int_val);
114            }
115            None => {
116                // Exception: store original bits, use 0 as placeholder in int array.
117                exceptions.push((i as u32, val.to_bits()));
118                encoded_ints.push(0);
119            }
120        }
121    }
122
123    // Step 3: Build output.
124    let exception_count = exceptions.len() as u32;
125    let packed_ints = fastlanes::encode(&encoded_ints);
126
127    let mut out = Vec::with_capacity(10 + exceptions.len() * 12 + packed_ints.len());
128
129    // Header.
130    out.extend_from_slice(&count.to_le_bytes());
131    out.push(params.encode_exp);
132    out.push(params.decode_exp);
133    out.push(match params.mode {
134        DecodeMode::MultiplyInverse => 0,
135        DecodeMode::DivideByFactor => 1,
136    });
137    out.extend_from_slice(&exception_count.to_le_bytes());
138
139    // Exceptions.
140    for &(idx, bits) in &exceptions {
141        out.extend_from_slice(&idx.to_le_bytes());
142        out.extend_from_slice(&bits.to_le_bytes());
143    }
144
145    // Packed integers.
146    out.extend_from_slice(&packed_ints);
147
148    out
149}
150
151/// Decode ALP-compressed bytes back to f64 values.
152pub fn decode(data: &[u8]) -> Result<Vec<f64>, CodecError> {
153    const HEADER_SIZE: usize = 11; // 4 + 1 + 1 + 1 + 4
154
155    if data.len() < HEADER_SIZE {
156        return Err(CodecError::Truncated {
157            expected: HEADER_SIZE,
158            actual: data.len(),
159        });
160    }
161
162    let count = u32::from_le_bytes([data[0], data[1], data[2], data[3]]) as usize;
163    let _encode_exp = data[4];
164    let decode_exp = data[5];
165    let mode = match data[6] {
166        0 => DecodeMode::MultiplyInverse,
167        1 => DecodeMode::DivideByFactor,
168        m => {
169            return Err(CodecError::Corrupt {
170                detail: format!("invalid ALP decode mode {m}"),
171            });
172        }
173    };
174    let exception_count = u32::from_le_bytes([data[7], data[8], data[9], data[10]]) as usize;
175
176    if count == 0 {
177        return Ok(Vec::new());
178    }
179
180    if decode_exp > MAX_EXPONENT {
181        return Err(CodecError::Corrupt {
182            detail: format!("invalid ALP decode_exp {decode_exp}"),
183        });
184    }
185
186    // Read exceptions.
187    let exceptions_size = exception_count * 12;
188    let exceptions_end = HEADER_SIZE + exceptions_size;
189    if data.len() < exceptions_end {
190        return Err(CodecError::Truncated {
191            expected: exceptions_end,
192            actual: data.len(),
193        });
194    }
195
196    let mut exceptions = Vec::with_capacity(exception_count);
197    let mut pos = HEADER_SIZE;
198    for _ in 0..exception_count {
199        let idx =
200            u32::from_le_bytes([data[pos], data[pos + 1], data[pos + 2], data[pos + 3]]) as usize;
201        let bits = u64::from_le_bytes([
202            data[pos + 4],
203            data[pos + 5],
204            data[pos + 6],
205            data[pos + 7],
206            data[pos + 8],
207            data[pos + 9],
208            data[pos + 10],
209            data[pos + 11],
210        ]);
211        exceptions.push((idx, bits));
212        pos += 12;
213    }
214
215    // Decode FastLanes-packed integers.
216    let encoded_ints =
217        fastlanes::decode(&data[exceptions_end..]).map_err(|e| CodecError::Corrupt {
218            detail: format!("ALP fastlanes decode: {e}"),
219        })?;
220
221    if encoded_ints.len() != count {
222        return Err(CodecError::Corrupt {
223            detail: format!(
224                "ALP int count mismatch: expected {count}, got {}",
225                encoded_ints.len()
226            ),
227        });
228    }
229
230    // Reconstruct f64 values using the same decode mode that was selected during encode.
231    let mut values = Vec::with_capacity(count);
232    for &int_val in &encoded_ints {
233        values.push(alp_decode_value(int_val, decode_exp, mode));
234    }
235
236    // Patch exceptions.
237    for &(idx, bits) in &exceptions {
238        if idx < values.len() {
239            values[idx] = f64::from_bits(bits);
240        }
241    }
242
243    Ok(values)
244}
245
246/// Check what fraction of values can be ALP-encoded at the best exponent.
247///
248/// Returns a value between 0.0 and 1.0. Values > 0.95 mean ALP is a good
249/// fit. Used by auto-detection to choose between ALP and Gorilla.
250pub fn alp_encodability(values: &[f64]) -> f64 {
251    if values.is_empty() {
252        return 1.0;
253    }
254
255    let sample_end = values.len().min(CODEC_SAMPLE_SIZE);
256    let sample = &values[..sample_end];
257    let params = find_best_params(sample);
258    let factor = POW10[params.encode_exp as usize];
259
260    let encodable = sample
261        .iter()
262        .filter(|&&v| try_alp_encode(v, factor, params.decode_exp, params.mode).is_some())
263        .count();
264
265    encodable as f64 / sample.len() as f64
266}
267
268// ---------------------------------------------------------------------------
269// ALP core: exponent selection + encode/decode
270// ---------------------------------------------------------------------------
271
272/// Decode mode: whether to multiply by inverse or divide by factor.
273///
274/// IEEE 754 multiplication and division produce different bit patterns
275/// for the same logical value (e.g., `3 * 0.1 != 3 / 10.0` at the bit level).
276/// ALP tries both and picks whichever matches the original data.
277#[derive(Debug, Clone, Copy, PartialEq, Eq)]
278enum DecodeMode {
279    /// Decode via `encoded * INV_POW10[e]` (matches data computed as `x * 0.1`).
280    MultiplyInverse,
281    /// Decode via `encoded / POW10[e]` (matches data computed as `x / 10`).
282    DivideByFactor,
283}
284
285/// Try to ALP-encode a single f64 value.
286///
287/// Returns `Some(i64)` if the value round-trips exactly, `None` otherwise.
288#[inline]
289fn try_alp_encode(value: f64, factor: f64, decode_exp: u8, mode: DecodeMode) -> Option<i64> {
290    if !value.is_finite() {
291        return None;
292    }
293
294    let scaled = value * factor;
295    if scaled < i64::MIN as f64 || scaled > i64::MAX as f64 {
296        return None;
297    }
298
299    let encoded = scaled.round() as i64;
300
301    let decoded = match mode {
302        DecodeMode::MultiplyInverse => encoded as f64 * INV_POW10[decode_exp as usize],
303        DecodeMode::DivideByFactor => encoded as f64 / POW10[decode_exp as usize],
304    };
305
306    if decoded.to_bits() == value.to_bits() {
307        Some(encoded)
308    } else {
309        None
310    }
311}
312
313/// Reconstruct a single f64 from an ALP-encoded integer.
314#[inline]
315fn alp_decode_value(encoded: i64, decode_exp: u8, mode: DecodeMode) -> f64 {
316    match mode {
317        DecodeMode::MultiplyInverse => encoded as f64 * INV_POW10[decode_exp as usize],
318        DecodeMode::DivideByFactor => encoded as f64 / POW10[decode_exp as usize],
319    }
320}
321
322/// Find the best (exponent, factor_index) pair for a set of values.
323///
324/// Tries all exponents 0-18 and picks the one that encodes the most
325/// values losslessly. The factor_index may differ from exponent when
326/// the decode factor produces better roundtrips.
327/// ALP parameters determined by sampling.
328struct AlpParams {
329    encode_exp: u8,
330    decode_exp: u8,
331    mode: DecodeMode,
332}
333
334/// Find the best ALP parameters by trying all exponents and both decode modes.
335fn find_best_params(values: &[f64]) -> AlpParams {
336    let sample_end = values.len().min(CODEC_SAMPLE_SIZE);
337    let sample = &values[..sample_end];
338
339    let mut best = AlpParams {
340        encode_exp: 0,
341        decode_exp: 0,
342        mode: DecodeMode::MultiplyInverse,
343    };
344    let mut best_count: usize = 0;
345
346    for e in 0..=MAX_EXPONENT {
347        let factor = POW10[e as usize];
348
349        for mode in [DecodeMode::MultiplyInverse, DecodeMode::DivideByFactor] {
350            // Try same decode exponent.
351            let count = sample
352                .iter()
353                .filter(|&&v| try_alp_encode(v, factor, e, mode).is_some())
354                .count();
355
356            if count > best_count {
357                best_count = count;
358                best = AlpParams {
359                    encode_exp: e,
360                    decode_exp: e,
361                    mode,
362                };
363            }
364
365            // Try neighboring decode exponents.
366            if e > 0 {
367                let count_alt = sample
368                    .iter()
369                    .filter(|&&v| try_alp_encode(v, factor, e - 1, mode).is_some())
370                    .count();
371                if count_alt > best_count {
372                    best_count = count_alt;
373                    best = AlpParams {
374                        encode_exp: e,
375                        decode_exp: e - 1,
376                        mode,
377                    };
378                }
379            }
380            if e < MAX_EXPONENT {
381                let count_alt = sample
382                    .iter()
383                    .filter(|&&v| try_alp_encode(v, factor, e + 1, mode).is_some())
384                    .count();
385                if count_alt > best_count {
386                    best_count = count_alt;
387                    best = AlpParams {
388                        encode_exp: e,
389                        decode_exp: e + 1,
390                        mode,
391                    };
392                }
393            }
394        }
395    }
396
397    best
398}
399
400#[cfg(test)]
401mod tests {
402    use super::*;
403
404    #[test]
405    fn empty_roundtrip() {
406        let encoded = encode(&[]);
407        let decoded = decode(&encoded).unwrap();
408        assert!(decoded.is_empty());
409    }
410
411    #[test]
412    fn single_value() {
413        let values = vec![23.5f64];
414        let encoded = encode(&values);
415        let decoded = decode(&encoded).unwrap();
416        assert_eq!(decoded[0].to_bits(), values[0].to_bits());
417    }
418
419    #[test]
420    fn integer_values() {
421        // Integer values: exponent=0, factor=1.
422        let values: Vec<f64> = (0..1000).map(|i| i as f64).collect();
423        let encoded = encode(&values);
424        let decoded = decode(&encoded).unwrap();
425        for (a, b) in values.iter().zip(decoded.iter()) {
426            assert_eq!(a.to_bits(), b.to_bits(), "mismatch: {a} vs {b}");
427        }
428    }
429
430    #[test]
431    fn decimal_values_one_digit() {
432        // Values like 23.5, 99.1 — exponent=1, factor=10.
433        let values: Vec<f64> = (0..1000).map(|i| i as f64 * 0.1).collect();
434        let encoded = encode(&values);
435        let decoded = decode(&encoded).unwrap();
436        for (i, (a, b)) in values.iter().zip(decoded.iter()).enumerate() {
437            assert_eq!(a.to_bits(), b.to_bits(), "mismatch at {i}: {a} vs {b}");
438        }
439    }
440
441    #[test]
442    fn decimal_values_two_digits() {
443        // Values like 99.99 — exponent=2, factor=100.
444        let values: Vec<f64> = (0..1000).map(|i| i as f64 * 0.01).collect();
445        let encoded = encode(&values);
446        let decoded = decode(&encoded).unwrap();
447        for (i, (a, b)) in values.iter().zip(decoded.iter()).enumerate() {
448            assert_eq!(a.to_bits(), b.to_bits(), "mismatch at {i}: {a} vs {b}");
449        }
450    }
451
452    #[test]
453    fn typical_cpu_metrics() {
454        // CPU percentages: 0.0 to 100.0 with 0.1 resolution.
455        let mut values = Vec::with_capacity(10_000);
456        let mut rng: u64 = 42;
457        for _ in 0..10_000 {
458            rng = rng.wrapping_mul(6364136223846793005).wrapping_add(1);
459            let cpu = ((rng >> 33) as f64 / (u32::MAX as f64)) * 100.0;
460            // Round to 0.1 to simulate typical metric resolution.
461            let cpu = (cpu * 10.0).round() / 10.0;
462            values.push(cpu);
463        }
464        let encoded = encode(&values);
465        let decoded = decode(&encoded).unwrap();
466        for (i, (a, b)) in values.iter().zip(decoded.iter()).enumerate() {
467            assert_eq!(a.to_bits(), b.to_bits(), "mismatch at {i}: {a} vs {b}");
468        }
469
470        // ALP should compress much better than raw.
471        let raw_size = values.len() * 8;
472        let ratio = raw_size as f64 / encoded.len() as f64;
473        assert!(
474            ratio > 3.0,
475            "ALP should compress CPU metrics >3x, got {ratio:.1}x"
476        );
477    }
478
479    #[test]
480    fn temperature_readings() {
481        // Temperatures: -40.0 to 50.0 with 0.01 resolution.
482        let mut values = Vec::with_capacity(10_000);
483        let mut rng: u64 = 99;
484        for _ in 0..10_000 {
485            rng = rng.wrapping_mul(6364136223846793005).wrapping_add(1);
486            let temp = ((rng >> 33) as f64 / (u32::MAX as f64)) * 90.0 - 40.0;
487            let temp = (temp * 100.0).round() / 100.0;
488            values.push(temp);
489        }
490        let encoded = encode(&values);
491        let decoded = decode(&encoded).unwrap();
492        for (i, (a, b)) in values.iter().zip(decoded.iter()).enumerate() {
493            assert_eq!(a.to_bits(), b.to_bits(), "mismatch at {i}: {a} vs {b}");
494        }
495    }
496
497    #[test]
498    fn exception_handling() {
499        // Mix of ALP-encodable and non-encodable values.
500        let mut values = vec![23.5, 99.9, 100.1, 50.0];
501        values.push(f64::NAN);
502        values.push(f64::INFINITY);
503        values.push(f64::NEG_INFINITY);
504        values.push(std::f64::consts::PI); // Irrational — likely exception.
505
506        let encoded = encode(&values);
507        let decoded = decode(&encoded).unwrap();
508
509        for (i, (a, b)) in values.iter().zip(decoded.iter()).enumerate() {
510            assert_eq!(
511                a.to_bits(),
512                b.to_bits(),
513                "mismatch at {i}: {a} ({:016x}) vs {b} ({:016x})",
514                a.to_bits(),
515                b.to_bits()
516            );
517        }
518    }
519
520    #[test]
521    fn all_exceptions() {
522        // All values are irrational — all exceptions.
523        let values: Vec<f64> = (1..100).map(|i| std::f64::consts::PI * i as f64).collect();
524        let encoded = encode(&values);
525        let decoded = decode(&encoded).unwrap();
526        for (i, (a, b)) in values.iter().zip(decoded.iter()).enumerate() {
527            assert_eq!(a.to_bits(), b.to_bits(), "mismatch at {i}");
528        }
529    }
530
531    #[test]
532    fn encodability_check() {
533        // Decimal values should have high encodability.
534        let decimal_values: Vec<f64> = (0..1000).map(|i| i as f64 * 0.1).collect();
535        assert!(alp_encodability(&decimal_values) > 0.95);
536
537        // PI multiples: may still be encodable at high exponents, but the
538        // resulting integers will have wide bit-widths (poor compression).
539        // The key metric is that decimals have HIGHER encodability than irrationals.
540        let irrational_values: Vec<f64> =
541            (1..1000).map(|i| std::f64::consts::PI * i as f64).collect();
542        let irrational_enc = alp_encodability(&irrational_values);
543        // Decimal values should always be better than irrational ones.
544        assert!(
545            alp_encodability(&decimal_values) >= irrational_enc,
546            "decimal encodability should be >= irrational"
547        );
548    }
549
550    #[test]
551    fn better_than_gorilla_for_decimals() {
552        let values: Vec<f64> = (0..10_000).map(|i| i as f64 * 0.1).collect();
553        let alp_encoded = encode(&values);
554        let gorilla_encoded = crate::gorilla::encode_f64(&values);
555
556        assert!(
557            alp_encoded.len() < gorilla_encoded.len(),
558            "ALP ({} bytes) should beat Gorilla ({} bytes) on decimal data",
559            alp_encoded.len(),
560            gorilla_encoded.len()
561        );
562    }
563
564    #[test]
565    fn zero_values() {
566        let values = vec![0.0f64; 1000];
567        let encoded = encode(&values);
568        let decoded = decode(&encoded).unwrap();
569        for (a, b) in values.iter().zip(decoded.iter()) {
570            assert_eq!(a.to_bits(), b.to_bits());
571        }
572    }
573
574    #[test]
575    fn negative_zero() {
576        let values = vec![-0.0f64, 0.0, -0.0];
577        let encoded = encode(&values);
578        let decoded = decode(&encoded).unwrap();
579        for (i, (a, b)) in values.iter().zip(decoded.iter()).enumerate() {
580            assert_eq!(a.to_bits(), b.to_bits(), "mismatch at {i}");
581        }
582    }
583
584    #[test]
585    fn truncated_input_errors() {
586        assert!(decode(&[]).is_err());
587        assert!(decode(&[1, 0, 0, 0, 0, 0, 0, 0, 0]).is_err()); // too short
588    }
589
590    #[test]
591    fn large_dataset() {
592        let mut values = Vec::with_capacity(100_000);
593        let mut rng: u64 = 12345;
594        for _ in 0..100_000 {
595            rng = rng.wrapping_mul(6364136223846793005).wrapping_add(1);
596            let val = ((rng >> 33) as f64 / (u32::MAX as f64)) * 1000.0;
597            let val = (val * 100.0).round() / 100.0; // 2 decimal places
598            values.push(val);
599        }
600        let encoded = encode(&values);
601        let decoded = decode(&encoded).unwrap();
602        assert_eq!(decoded.len(), values.len());
603        for (i, (a, b)) in values.iter().zip(decoded.iter()).enumerate() {
604            assert_eq!(a.to_bits(), b.to_bits(), "mismatch at {i}");
605        }
606    }
607}
nodedb_codec/alp.rs

nodedb_codec/
alp.rs