Skip to main content

vortex_alp/alp/
compress.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use itertools::Itertools;
5use vortex_array::ArrayRef;
6use vortex_array::IntoArray;
7use vortex_array::arrays::PrimitiveArray;
8use vortex_array::dtype::PType;
9use vortex_array::patches::Patches;
10use vortex_array::validity::Validity;
11use vortex_buffer::Buffer;
12use vortex_buffer::BufferMut;
13use vortex_error::VortexResult;
14use vortex_error::vortex_bail;
15use vortex_mask::Mask;
16
17use crate::ALP;
18use crate::Exponents;
19use crate::alp::ALPArray;
20use crate::alp::ALPFloat;
21
22#[macro_export]
23macro_rules! match_each_alp_float_ptype {
24    ($self:expr, | $enc:ident | $body:block) => {{
25        use vortex_array::dtype::PType;
26        use vortex_error::vortex_panic;
27        let ptype = $self;
28        match ptype {
29            PType::F32 => {
30                type $enc = f32;
31                $body
32            }
33            PType::F64 => {
34                type $enc = f64;
35                $body
36            }
37            _ => vortex_panic!("ALP can only encode f32 and f64, got {}", ptype),
38        }
39    }};
40}
41
42pub fn alp_encode(parray: &PrimitiveArray, exponents: Option<Exponents>) -> VortexResult<ALPArray> {
43    let (exponents, encoded, patches) = match parray.ptype() {
44        PType::F32 => alp_encode_components_typed::<f32>(parray, exponents)?,
45        PType::F64 => alp_encode_components_typed::<f64>(parray, exponents)?,
46        _ => vortex_bail!("ALP can only encode f32 and f64"),
47    };
48
49    // SAFETY: alp_encode_components_typed must return well-formed components
50    unsafe { Ok(ALP::new_unchecked(encoded, exponents, patches)) }
51}
52
53#[expect(
54    clippy::cast_possible_truncation,
55    reason = "u64 index cast to usize is safe for reasonable array sizes"
56)]
57fn alp_encode_components_typed<T>(
58    values: &PrimitiveArray,
59    exponents: Option<Exponents>,
60) -> VortexResult<(Exponents, ArrayRef, Option<Patches>)>
61where
62    T: ALPFloat,
63{
64    let values_slice = values.as_slice::<T>();
65
66    let (exponents, encoded, exceptional_positions, exceptional_values, mut chunk_offsets) =
67        T::encode(values_slice, exponents);
68
69    let encoded_array = PrimitiveArray::new(encoded, values.validity()?).into_array();
70
71    let validity = values.validity_mask()?;
72    // exceptional_positions may contain exceptions at invalid positions (which contain garbage
73    // data). We remove null exceptions in order to keep the Patches small.
74    let (valid_exceptional_positions, valid_exceptional_values): (Buffer<u64>, Buffer<T>) =
75        match validity {
76            Mask::AllTrue(_) => (exceptional_positions, exceptional_values),
77            Mask::AllFalse(_) => {
78                // no valid positions, ergo nothing worth patching
79                (Buffer::empty(), Buffer::empty())
80            }
81            Mask::Values(is_valid) => {
82                let (pos, vals): (BufferMut<u64>, BufferMut<T>) = exceptional_positions
83                    .into_iter()
84                    .zip_eq(exceptional_values)
85                    .filter(|(index, _)| {
86                        let is_valid = is_valid.value(*index as usize);
87                        if !is_valid {
88                            let patch_chunk = *index as usize / 1024;
89                            for chunk_idx in (patch_chunk + 1)..chunk_offsets.len() {
90                                chunk_offsets[chunk_idx] -= 1;
91                            }
92                        }
93                        is_valid
94                    })
95                    .unzip();
96                (pos.freeze(), vals.freeze())
97            }
98        };
99    let patches = if valid_exceptional_positions.is_empty() {
100        None
101    } else {
102        let patches_validity = if values.dtype().is_nullable() {
103            Validity::AllValid
104        } else {
105            Validity::NonNullable
106        };
107        let valid_exceptional_values =
108            PrimitiveArray::new(valid_exceptional_values, patches_validity).into_array();
109
110        Some(Patches::new(
111            values_slice.len(),
112            0,
113            valid_exceptional_positions.into_array(),
114            valid_exceptional_values,
115            Some(chunk_offsets.into_array()),
116        )?)
117    };
118    Ok((exponents, encoded_array, patches))
119}
120
121#[cfg(test)]
122mod tests {
123    use core::f64;
124
125    use f64::consts::E;
126    use f64::consts::PI;
127    use vortex_array::LEGACY_SESSION;
128    use vortex_array::ToCanonical;
129    use vortex_array::VortexSessionExecute;
130    use vortex_array::assert_arrays_eq;
131    use vortex_array::dtype::NativePType;
132    use vortex_array::validity::Validity;
133    use vortex_buffer::Buffer;
134    use vortex_buffer::buffer;
135
136    use super::*;
137    use crate::alp::array::ALPArrayExt;
138    use crate::alp::array::ALPArraySlotsExt;
139    use crate::decompress_into_array;
140
141    #[test]
142    fn test_compress() {
143        let array = PrimitiveArray::new(buffer![1.234f32; 1025], Validity::NonNullable);
144        let encoded = alp_encode(&array, None).unwrap();
145        assert!(encoded.patches().is_none());
146        let expected_encoded = PrimitiveArray::from_iter(vec![1234i32; 1025]);
147        assert_arrays_eq!(encoded.encoded(), expected_encoded);
148        assert_eq!(encoded.exponents(), Exponents { e: 9, f: 6 });
149
150        let decoded =
151            decompress_into_array(encoded, &mut LEGACY_SESSION.create_execution_ctx()).unwrap();
152        assert_arrays_eq!(decoded, array);
153    }
154
155    #[test]
156    fn test_nullable_compress() {
157        let array = PrimitiveArray::from_option_iter([None, Some(1.234f32), None]);
158        let encoded = alp_encode(&array, None).unwrap();
159        assert!(encoded.patches().is_none());
160        let expected_encoded = PrimitiveArray::from_option_iter([None, Some(1234i32), None]);
161        assert_arrays_eq!(encoded.encoded(), expected_encoded);
162        assert_eq!(encoded.exponents(), Exponents { e: 9, f: 6 });
163
164        let decoded =
165            decompress_into_array(encoded, &mut LEGACY_SESSION.create_execution_ctx()).unwrap();
166        let expected = PrimitiveArray::from_option_iter(vec![None, Some(1.234f32), None]);
167        assert_arrays_eq!(decoded, expected);
168    }
169
170    #[test]
171    #[allow(clippy::approx_constant)] // Clippy objects to 2.718, an approximation of e, the base of the natural logarithm.
172    fn test_patched_compress() {
173        let values = buffer![1.234f64, 2.718, PI, 4.0];
174        let array = PrimitiveArray::new(values.clone(), Validity::NonNullable);
175        let encoded = alp_encode(&array, None).unwrap();
176        assert!(encoded.patches().is_some());
177        let expected_encoded = PrimitiveArray::from_iter(vec![1234i64, 2718, 1234, 4000]);
178        assert_arrays_eq!(encoded.encoded(), expected_encoded);
179        assert_eq!(encoded.exponents(), Exponents { e: 16, f: 13 });
180
181        let decoded =
182            decompress_into_array(encoded, &mut LEGACY_SESSION.create_execution_ctx()).unwrap();
183        let expected_decoded = PrimitiveArray::new(values, Validity::NonNullable);
184        assert_arrays_eq!(decoded, expected_decoded);
185    }
186
187    #[test]
188    #[allow(clippy::approx_constant)] // Clippy objects to 2.718, an approximation of e, the base of the natural logarithm.
189    fn test_compress_ignores_invalid_exceptional_values() {
190        let values = buffer![1.234f64, 2.718, PI, 4.0];
191        let array = PrimitiveArray::new(values, Validity::from_iter([true, true, false, true]));
192        let encoded = alp_encode(&array, None).unwrap();
193        assert!(encoded.patches().is_none());
194        let expected_encoded =
195            PrimitiveArray::from_option_iter(buffer![Some(1234i64), Some(2718), None, Some(4000)]);
196        assert_arrays_eq!(encoded.encoded(), expected_encoded);
197        assert_eq!(encoded.exponents(), Exponents { e: 16, f: 13 });
198
199        let decoded =
200            decompress_into_array(encoded, &mut LEGACY_SESSION.create_execution_ctx()).unwrap();
201        assert_arrays_eq!(decoded, array);
202    }
203
204    #[test]
205    #[allow(clippy::approx_constant)] // ALP doesn't like E
206    fn test_nullable_patched_scalar_at() {
207        let array = PrimitiveArray::from_option_iter([
208            Some(1.234f64),
209            Some(2.718),
210            Some(PI),
211            Some(4.0),
212            None,
213        ]);
214        let encoded = alp_encode(&array, None).unwrap();
215        assert!(encoded.patches().is_some());
216
217        assert_eq!(encoded.exponents(), Exponents { e: 16, f: 13 });
218
219        assert_arrays_eq!(encoded, array);
220
221        let _decoded =
222            decompress_into_array(encoded, &mut LEGACY_SESSION.create_execution_ctx()).unwrap();
223    }
224
225    #[test]
226    fn roundtrips_close_fractional() {
227        let original = PrimitiveArray::from_iter([195.26274f32, 195.27837, -48.815685]);
228        let alp_arr = alp_encode(&original, None).unwrap();
229        assert_arrays_eq!(alp_arr, original);
230    }
231
232    #[test]
233    fn roundtrips_all_null() {
234        let original =
235            PrimitiveArray::new(buffer![195.26274f64, PI, -48.815685], Validity::AllInvalid);
236        let alp_arr = alp_encode(&original, None).unwrap();
237        let decompressed = alp_arr.into_array().to_primitive();
238
239        assert_eq!(
240            // The second and third values become exceptions and are replaced
241            [195.26274, 195.26274, 195.26274],
242            decompressed.as_slice::<f64>()
243        );
244
245        assert_arrays_eq!(decompressed, original);
246    }
247
248    #[test]
249    fn non_finite_numbers() {
250        let original = PrimitiveArray::new(
251            buffer![0.0f32, -0.0, f32::NAN, f32::NEG_INFINITY, f32::INFINITY],
252            Validity::NonNullable,
253        );
254        let encoded = alp_encode(&original, None).unwrap();
255        let decoded = encoded.as_array().to_primitive();
256        for idx in 0..original.len() {
257            let decoded_val = decoded.as_slice::<f32>()[idx];
258            let original_val = original.as_slice::<f32>()[idx];
259            assert!(
260                decoded_val.is_eq(original_val),
261                "Expected {original_val} but got {decoded_val}"
262            );
263        }
264    }
265
266    #[test]
267    fn test_chunk_offsets() {
268        let mut values = vec![1.0f64; 3072];
269
270        values[1023] = PI;
271        values[1024] = E;
272        values[1025] = PI;
273
274        let array = PrimitiveArray::new(Buffer::from(values), Validity::NonNullable);
275        let encoded = alp_encode(&array, None).unwrap();
276        let patches = encoded.patches().unwrap();
277
278        let chunk_offsets = patches.chunk_offsets().clone().unwrap().to_primitive();
279        let expected_offsets = PrimitiveArray::from_iter(vec![0u64, 1, 3]);
280        assert_arrays_eq!(chunk_offsets, expected_offsets);
281
282        let patch_indices = patches.indices().to_primitive();
283        let expected_indices = PrimitiveArray::from_iter(vec![1023u64, 1024, 1025]);
284        assert_arrays_eq!(patch_indices, expected_indices);
285
286        let patch_values = patches.values().to_primitive();
287        let expected_values = PrimitiveArray::from_iter(vec![PI, E, PI]);
288        assert_arrays_eq!(patch_values, expected_values);
289    }
290
291    #[test]
292    fn test_chunk_offsets_no_patches_in_middle() {
293        let mut values = vec![1.0f64; 3072];
294        values[0] = PI;
295        values[2048] = E;
296
297        let array = PrimitiveArray::new(Buffer::from(values), Validity::NonNullable);
298        let encoded = alp_encode(&array, None).unwrap();
299        let patches = encoded.patches().unwrap();
300
301        let chunk_offsets = patches.chunk_offsets().clone().unwrap().to_primitive();
302        let expected_offsets = PrimitiveArray::from_iter(vec![0u64, 1, 1]);
303        assert_arrays_eq!(chunk_offsets, expected_offsets);
304
305        let patch_indices = patches.indices().to_primitive();
306        let expected_indices = PrimitiveArray::from_iter(vec![0u64, 2048]);
307        assert_arrays_eq!(patch_indices, expected_indices);
308
309        let patch_values = patches.values().to_primitive();
310        let expected_values = PrimitiveArray::from_iter(vec![PI, E]);
311        assert_arrays_eq!(patch_values, expected_values);
312    }
313
314    #[test]
315    fn test_chunk_offsets_trailing_empty_chunks() {
316        let mut values = vec![1.0f64; 3072];
317        values[0] = PI;
318
319        let array = PrimitiveArray::new(Buffer::from(values), Validity::NonNullable);
320        let encoded = alp_encode(&array, None).unwrap();
321        let patches = encoded.patches().unwrap();
322
323        let chunk_offsets = patches.chunk_offsets().clone().unwrap().to_primitive();
324        let expected_offsets = PrimitiveArray::from_iter(vec![0u64, 1, 1]);
325        assert_arrays_eq!(chunk_offsets, expected_offsets);
326
327        let patch_indices = patches.indices().to_primitive();
328        let expected_indices = PrimitiveArray::from_iter(vec![0u64]);
329        assert_arrays_eq!(patch_indices, expected_indices);
330
331        let patch_values = patches.values().to_primitive();
332        let expected_values = PrimitiveArray::from_iter(vec![PI]);
333        assert_arrays_eq!(patch_values, expected_values);
334    }
335
336    #[test]
337    fn test_chunk_offsets_single_chunk() {
338        let mut values = vec![1.0f64; 512];
339        values[0] = PI;
340        values[100] = E;
341
342        let array = PrimitiveArray::new(Buffer::from(values), Validity::NonNullable);
343        let encoded = alp_encode(&array, None).unwrap();
344        let patches = encoded.patches().unwrap();
345
346        let chunk_offsets = patches.chunk_offsets().clone().unwrap().to_primitive();
347        let expected_offsets = PrimitiveArray::from_iter(vec![0u64]);
348        assert_arrays_eq!(chunk_offsets, expected_offsets);
349
350        let patch_indices = patches.indices().to_primitive();
351        let expected_indices = PrimitiveArray::from_iter(vec![0u64, 100]);
352        assert_arrays_eq!(patch_indices, expected_indices);
353
354        let patch_values = patches.values().to_primitive();
355        let expected_values = PrimitiveArray::from_iter(vec![PI, E]);
356        assert_arrays_eq!(patch_values, expected_values);
357    }
358
359    #[test]
360    fn test_slice_half_chunk_f32_roundtrip() {
361        // Create 1024 elements, encode, slice to first 512, then decode
362        let values = vec![1.234f32; 1024];
363        let original = PrimitiveArray::new(Buffer::from(values), Validity::NonNullable);
364        let encoded = alp_encode(&original, None).unwrap();
365
366        let sliced_alp = encoded.slice(512..1024).unwrap();
367
368        let expected_slice = original.slice(512..1024).unwrap();
369        assert_arrays_eq!(sliced_alp, expected_slice);
370    }
371
372    #[test]
373    fn test_slice_half_chunk_f64_roundtrip() {
374        let values = vec![5.678f64; 1024];
375        let original = PrimitiveArray::new(Buffer::from(values), Validity::NonNullable);
376        let encoded = alp_encode(&original, None).unwrap();
377
378        let sliced_alp = encoded.slice(512..1024).unwrap();
379
380        let expected_slice = original.slice(512..1024).unwrap();
381        assert_arrays_eq!(sliced_alp, expected_slice);
382    }
383
384    #[test]
385    fn test_slice_half_chunk_with_patches_roundtrip() {
386        let mut values = vec![1.0f64; 1024];
387        values[100] = PI;
388        values[200] = E;
389        values[600] = 42.42;
390
391        let original = PrimitiveArray::new(Buffer::from(values), Validity::NonNullable);
392        let encoded = alp_encode(&original, None).unwrap();
393
394        let sliced_alp = encoded.slice(512..1024).unwrap();
395
396        let expected_slice = original.slice(512..1024).unwrap();
397        assert_arrays_eq!(sliced_alp, expected_slice);
398        assert!(encoded.patches().is_some());
399    }
400
401    #[test]
402    fn test_slice_across_chunks_with_patches_roundtrip() {
403        let mut values = vec![1.0f64; 2048];
404        values[100] = PI;
405        values[200] = E;
406        values[600] = 42.42;
407        values[800] = 42.42;
408        values[1000] = 42.42;
409        values[1023] = 42.42;
410
411        let original = PrimitiveArray::new(Buffer::from(values), Validity::NonNullable);
412        let encoded = alp_encode(&original, None).unwrap();
413
414        let sliced_alp = encoded.slice(1023..1025).unwrap();
415
416        let expected_slice = original.slice(1023..1025).unwrap();
417        assert_arrays_eq!(sliced_alp, expected_slice);
418        assert!(encoded.patches().is_some());
419    }
420
421    #[test]
422    fn test_slice_half_chunk_nullable_roundtrip() {
423        let values = (0..1024)
424            .map(|i| if i % 3 == 0 { None } else { Some(2.5f32) })
425            .collect::<Vec<_>>();
426
427        let original = PrimitiveArray::from_option_iter(values);
428        let encoded = alp_encode(&original, None).unwrap();
429
430        let sliced_alp = encoded.slice(512..1024).unwrap();
431        let decoded = sliced_alp.to_primitive();
432
433        let expected_slice = original.slice(512..1024).unwrap();
434        assert_arrays_eq!(decoded, expected_slice);
435    }
436
437    #[test]
438    fn test_large_f32_array_uniform_values() {
439        let size = 10_000;
440        let array = PrimitiveArray::new(buffer![42.125f32; size], Validity::NonNullable);
441        let encoded = alp_encode(&array, None).unwrap();
442
443        assert!(encoded.patches().is_none());
444        let decoded =
445            decompress_into_array(encoded, &mut LEGACY_SESSION.create_execution_ctx()).unwrap();
446        assert_arrays_eq!(decoded, array);
447    }
448
449    #[test]
450    fn test_large_f64_array_uniform_values() {
451        let size = 50_000;
452        let array = PrimitiveArray::new(buffer![123.456789f64; size], Validity::NonNullable);
453        let encoded = alp_encode(&array, None).unwrap();
454
455        assert!(encoded.patches().is_none());
456        let decoded =
457            decompress_into_array(encoded, &mut LEGACY_SESSION.create_execution_ctx()).unwrap();
458        assert_arrays_eq!(decoded, array);
459    }
460
461    #[test]
462    fn test_large_f32_array_with_patches() {
463        let size = 5_000;
464        let mut values = vec![1.5f32; size];
465        values[100] = std::f32::consts::PI;
466        values[1500] = std::f32::consts::E;
467        values[3000] = f32::NEG_INFINITY;
468        values[4500] = f32::INFINITY;
469
470        let array = PrimitiveArray::new(Buffer::from(values), Validity::NonNullable);
471        let encoded = alp_encode(&array, None).unwrap();
472
473        assert!(encoded.patches().is_some());
474        let decoded =
475            decompress_into_array(encoded, &mut LEGACY_SESSION.create_execution_ctx()).unwrap();
476        assert_arrays_eq!(decoded, array);
477    }
478
479    #[test]
480    fn test_large_f64_array_with_patches() {
481        let size = 8_000;
482        let mut values = vec![2.2184f64; size];
483        values[0] = PI;
484        values[1000] = E;
485        values[2000] = f64::NAN;
486        values[3000] = f64::INFINITY;
487        values[4000] = f64::NEG_INFINITY;
488        values[5000] = 0.0;
489        values[6000] = -0.0;
490        values[7000] = 999.999999999;
491
492        let array = PrimitiveArray::new(Buffer::from(values.clone()), Validity::NonNullable);
493        let encoded = alp_encode(&array, None).unwrap();
494
495        assert!(encoded.patches().is_some());
496        let decoded =
497            decompress_into_array(encoded, &mut LEGACY_SESSION.create_execution_ctx()).unwrap();
498
499        for idx in 0..size {
500            let decoded_val = decoded.as_slice::<f64>()[idx];
501            let original_val = values[idx];
502            assert!(
503                decoded_val.is_eq(original_val),
504                "At index {idx}: Expected {original_val} but got {decoded_val}"
505            );
506        }
507    }
508
509    #[test]
510    fn test_large_nullable_array() {
511        let size = 12_000;
512        let values: Vec<Option<f32>> = (0..size)
513            .map(|i| {
514                if i % 7 == 0 {
515                    None
516                } else {
517                    Some((i as f32) * 0.1)
518                }
519            })
520            .collect();
521
522        let array = PrimitiveArray::from_option_iter(values);
523        let encoded = alp_encode(&array, None).unwrap();
524        let decoded =
525            decompress_into_array(encoded, &mut LEGACY_SESSION.create_execution_ctx()).unwrap();
526
527        assert_arrays_eq!(decoded, array);
528    }
529
530    #[test]
531    fn test_large_mixed_validity_with_patches() {
532        let size = 6_000;
533        let mut values = vec![10.125f64; size];
534
535        values[500] = PI;
536        values[1500] = E;
537        values[2500] = f64::INFINITY;
538        values[3500] = f64::NEG_INFINITY;
539        values[4500] = f64::NAN;
540
541        let validity = Validity::from_iter((0..size).map(|i| !matches!(i, 500 | 2500)));
542
543        let array = PrimitiveArray::new(Buffer::from(values), validity);
544        let encoded = alp_encode(&array, None).unwrap();
545        let decoded =
546            decompress_into_array(encoded, &mut LEGACY_SESSION.create_execution_ctx()).unwrap();
547
548        assert_arrays_eq!(decoded, array);
549    }
550
551    /// Regression test for patch_chunk index-out-of-bounds when slicing a multi-chunk
552    /// ALP array mid-chunk with patches in the trailing chunk.
553    ///
554    /// The bug: chunk_offsets are sliced at chunk granularity (1024-row boundaries)
555    /// but patches indices/values are sliced at element granularity. When a slice ends
556    /// mid-chunk, patches_end_idx could exceed patches_indices.len(), causing OOB panic
557    /// during decompression.
558    #[test]
559    fn test_slice_mid_chunk_with_patches_in_trailing_chunk() {
560        // 3 chunks (3072 elements), patches scattered across all chunks.
561        let mut values = vec![1.0f64; 3072];
562        // Chunk 0 patches (indices 0..1024)
563        values[100] = PI;
564        values[500] = E;
565        // Chunk 1 patches (indices 1024..2048)
566        values[1100] = PI;
567        values[1500] = E;
568        values[1900] = PI;
569        // Chunk 2 patches (indices 2048..3072)
570        values[2100] = PI;
571        values[2500] = E;
572        values[2900] = PI;
573
574        let original = PrimitiveArray::new(Buffer::from(values), Validity::NonNullable);
575        let encoded = alp_encode(&original, None).unwrap();
576        assert!(encoded.patches().is_some());
577
578        // Slice ending mid-chunk-2 (element 2500 is inside chunk 2 = 2048..3072).
579        // This creates a mismatch: chunk_offsets includes the full chunk 2 offset,
580        // but patches_indices only includes patches up to element 2500.
581        let sliced_alp = encoded.slice(0..2500).unwrap();
582        let expected = original.slice(0..2500).unwrap();
583        assert_arrays_eq!(sliced_alp, expected);
584
585        // Also test slicing that starts mid-chunk (both start and end mid-chunk).
586        let sliced_alp = encoded.slice(500..2500).unwrap();
587        let expected = original.slice(500..2500).unwrap();
588        assert_arrays_eq!(sliced_alp, expected);
589    }
590}