vortex_fastlanes/rle/array/
mod.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use vortex_array::Array;
5use vortex_array::ArrayRef;
6use vortex_array::stats::ArrayStats;
7use vortex_dtype::DType;
8use vortex_dtype::PType;
9use vortex_error::VortexResult;
10use vortex_error::vortex_ensure;
11
12use crate::FL_CHUNK_SIZE;
13
14pub mod rle_compress;
15pub mod rle_decompress;
16
17#[derive(Clone, Debug)]
18pub struct RLEArray {
19    pub(super) dtype: DType,
20    /// Run value in the dictionary.
21    pub(super) values: ArrayRef,
22    /// Chunk-local indices from all chunks. The start of each chunk is looked up in `values_idx_offsets`.
23    pub(super) indices: ArrayRef,
24    /// Index start positions of each value chunk.
25    ///
26    /// # Example
27    /// ```
28    /// // Chunk 0: [10, 20] (starts at index 0)
29    /// // Chunk 1: [30, 40] (starts at index 2)
30    /// let values = [10, 20, 30, 40];           // Global values array
31    /// let values_idx_offsets = [0, 2];         // Chunk 0 starts at index 0, Chunk 1 starts at index 2
32    /// ```
33    pub(super) values_idx_offsets: ArrayRef,
34
35    pub(super) stats_set: ArrayStats,
36    // Offset relative to the start of the chunk.
37    pub(super) offset: usize,
38    pub(super) length: usize,
39}
40
41impl RLEArray {
42    fn validate(
43        values: &dyn Array,
44        indices: &dyn Array,
45        value_idx_offsets: &dyn Array,
46        offset: usize,
47    ) -> VortexResult<()> {
48        vortex_ensure!(
49            offset < 1024,
50            "Offset must be smaller than 1024, got {}",
51            offset
52        );
53
54        vortex_ensure!(
55            values.dtype().is_primitive(),
56            "RLE values must be a primitive type, got {}",
57            values.dtype()
58        );
59
60        vortex_ensure!(
61            matches!(indices.dtype().as_ptype(), PType::U8 | PType::U16),
62            "RLE indices must be u8 or u16, got {}",
63            indices.dtype()
64        );
65
66        vortex_ensure!(
67            value_idx_offsets.dtype().is_unsigned_int() && !value_idx_offsets.dtype().is_nullable(),
68            "RLE value idx offsets must be non-nullable unsigned integer, got {}",
69            value_idx_offsets.dtype()
70        );
71
72        vortex_ensure!(
73            indices.len().div_ceil(FL_CHUNK_SIZE) == value_idx_offsets.len(),
74            "RLE must have one value idx offset per chunk, got {}",
75            value_idx_offsets.len()
76        );
77
78        vortex_ensure!(
79            indices.len() >= values.len(),
80            "RLE must have at least as many indices as values, got {} indices and {} values",
81            indices.len(),
82            values.len()
83        );
84
85        Ok(())
86    }
87
88    /// Create a new chunk-based RLE array from its components.
89    ///
90    /// # Arguments
91    ///
92    /// * `values` - Unique values from all chunks
93    /// * `indices` - Chunk-local indices from all chunks
94    /// * `values_idx_offsets` - Start indices for each value chunk.
95    /// * `offset` - Offset into the first chunk
96    /// * `length` - Array length
97    pub fn try_new(
98        values: ArrayRef,
99        indices: ArrayRef,
100        values_idx_offsets: ArrayRef,
101        offset: usize,
102        length: usize,
103    ) -> VortexResult<Self> {
104        assert_eq!(indices.len() % FL_CHUNK_SIZE, 0);
105        Self::validate(&values, &indices, &values_idx_offsets, offset)?;
106
107        // Ensure that the DType has the same nullability as the indices array.
108        let dtype = DType::Primitive(values.dtype().as_ptype(), indices.dtype().nullability());
109
110        Ok(Self {
111            dtype,
112            values,
113            indices,
114            values_idx_offsets,
115            stats_set: ArrayStats::default(),
116            offset,
117            length,
118        })
119    }
120
121    /// Create a new RLEArray without validation.
122    ///
123    /// # Safety
124    /// The caller must ensure that:
125    /// - `offset + length` does not exceed the length of the indices array
126    /// - The `dtype` is consistent with the values array's primitive type and validity nullability
127    /// - The `indices` array contains valid indices into chunks of the `values` array
128    /// - The `values_idx_offsets` array contains valid chunk start offsets
129    /// - The `validity` array has the same length as `length`
130    pub unsafe fn new_unchecked(
131        values: ArrayRef,
132        indices: ArrayRef,
133        values_idx_offsets: ArrayRef,
134        dtype: DType,
135        offset: usize,
136        length: usize,
137    ) -> Self {
138        Self {
139            dtype,
140            values,
141            indices,
142            values_idx_offsets,
143            stats_set: ArrayStats::default(),
144            offset,
145            length,
146        }
147    }
148
149    #[inline]
150    pub fn len(&self) -> usize {
151        self.length
152    }
153
154    #[inline]
155    pub fn is_empty(&self) -> bool {
156        self.length == 0
157    }
158
159    #[inline]
160    pub fn dtype(&self) -> &DType {
161        &self.dtype
162    }
163
164    #[inline]
165    pub fn values(&self) -> &ArrayRef {
166        &self.values
167    }
168
169    #[inline]
170    pub fn indices(&self) -> &ArrayRef {
171        &self.indices
172    }
173
174    #[inline]
175    pub fn values_idx_offsets(&self) -> &ArrayRef {
176        &self.values_idx_offsets
177    }
178
179    /// Values index offset relative to the first chunk.
180    ///
181    /// Offsets in `values_idx_offsets` are absolute and need to be shifted
182    /// by the offset of the first chunk, respective the current slice, in
183    /// order to make them relative.
184    #[expect(
185        clippy::expect_used,
186        reason = "expect is safe here as scalar_at returns a valid primitive"
187    )]
188    pub(crate) fn values_idx_offset(&self, chunk_idx: usize) -> usize {
189        self.values_idx_offsets
190            .scalar_at(chunk_idx)
191            .as_primitive()
192            .as_::<usize>()
193            .expect("index must be of type usize")
194            - self
195                .values_idx_offsets
196                .scalar_at(0)
197                .as_primitive()
198                .as_::<usize>()
199                .expect("index must be of type usize")
200    }
201
202    /// Index offset into the array
203    #[inline]
204    pub fn offset(&self) -> usize {
205        self.offset
206    }
207
208    #[inline]
209    pub(crate) fn stats_set(&self) -> &ArrayStats {
210        &self.stats_set
211    }
212}
213
214#[cfg(test)]
215mod tests {
216    use vortex_array::Array;
217    use vortex_array::ArrayContext;
218    use vortex_array::IntoArray;
219    use vortex_array::ToCanonical;
220    use vortex_array::arrays::PrimitiveArray;
221    use vortex_array::serde::ArrayParts;
222    use vortex_array::serde::SerializeOptions;
223    use vortex_array::validity::Validity;
224    use vortex_array::vtable::ArrayVTableExt;
225    use vortex_buffer::Buffer;
226    use vortex_buffer::ByteBufferMut;
227    use vortex_dtype::DType;
228    use vortex_dtype::Nullability;
229    use vortex_dtype::PType;
230
231    use crate::RLEArray;
232    use crate::RLEVTable;
233
234    #[test]
235    fn test_try_new() {
236        let values = PrimitiveArray::from_iter([10u32, 20, 30]).into_array();
237
238        // Pad indices to 1024 chunk.
239        let indices =
240            PrimitiveArray::from_iter([0u16, 0, 1, 1, 2].iter().cycle().take(1024).copied())
241                .into_array();
242        let values_idx_offsets = PrimitiveArray::from_iter([0u64]).into_array();
243        let rle_array = RLEArray::try_new(values, indices, values_idx_offsets, 0, 5).unwrap();
244
245        assert_eq!(rle_array.len(), 5);
246        assert_eq!(rle_array.values().len(), 3);
247        assert_eq!(rle_array.values().dtype().as_ptype(), PType::U32);
248    }
249
250    #[test]
251    fn test_try_new_with_validity() {
252        let values = PrimitiveArray::from_iter([10u32, 20]).into_array();
253        let values_idx_offsets = PrimitiveArray::from_iter([0u64]).into_array();
254
255        let indices_pattern = [0u16, 1, 0];
256        let validity_pattern = [true, false, true];
257
258        // Pad indices to 1024 chunk.
259        let indices_with_validity = PrimitiveArray::new(
260            indices_pattern
261                .iter()
262                .cycle()
263                .take(1024)
264                .copied()
265                .collect::<Buffer<u16>>(),
266            Validity::from_iter(validity_pattern.iter().cycle().take(1024).copied()),
267        )
268        .into_array();
269
270        let rle_array = RLEArray::try_new(
271            values.clone(),
272            indices_with_validity,
273            values_idx_offsets,
274            0,
275            3,
276        )
277        .unwrap();
278
279        assert_eq!(rle_array.len(), 3);
280        assert_eq!(rle_array.values().len(), 2);
281        assert!(rle_array.is_valid(0));
282        assert!(!rle_array.is_valid(1));
283        assert!(rle_array.is_valid(2));
284    }
285
286    #[test]
287    fn test_all_valid() {
288        let values = PrimitiveArray::from_iter([10u32, 20, 30]).into_array();
289        let values_idx_offsets = PrimitiveArray::from_iter([0u64]).into_array();
290
291        let indices_pattern = [0u16, 1, 2, 0, 1];
292        let validity_pattern = [true, true, true, false, false];
293
294        // Pad indices to 1024 chunk.
295        let indices_with_validity = PrimitiveArray::new(
296            indices_pattern
297                .iter()
298                .cycle()
299                .take(1024)
300                .copied()
301                .collect::<Buffer<u16>>(),
302            Validity::from_iter(validity_pattern.iter().cycle().take(1024).copied()),
303        )
304        .into_array();
305
306        let rle_array = RLEArray::try_new(
307            values.clone(),
308            indices_with_validity,
309            values_idx_offsets,
310            0,
311            5,
312        )
313        .unwrap();
314
315        let valid_slice = rle_array.slice(0..3);
316        assert!(valid_slice.all_valid());
317
318        let mixed_slice = rle_array.slice(1..5);
319        assert!(!mixed_slice.all_valid());
320    }
321
322    #[test]
323    fn test_all_invalid() {
324        let values = PrimitiveArray::from_iter([10u32, 20, 30]).into_array();
325        let values_idx_offsets = PrimitiveArray::from_iter([0u64]).into_array();
326
327        // Pad indices to 1024 chunk.
328        let indices_pattern = [0u16, 1, 2, 0, 1];
329        let validity_pattern = [true, true, false, false, false];
330
331        let indices_with_validity = PrimitiveArray::new(
332            indices_pattern
333                .iter()
334                .cycle()
335                .take(1024)
336                .copied()
337                .collect::<Buffer<u16>>(),
338            Validity::from_iter(validity_pattern.iter().cycle().take(1024).copied()),
339        )
340        .into_array();
341
342        let rle_array = RLEArray::try_new(
343            values.clone(),
344            indices_with_validity,
345            values_idx_offsets,
346            0,
347            5,
348        )
349        .unwrap();
350
351        let invalid_slice = rle_array.slice(2..5);
352        assert!(invalid_slice.all_invalid());
353
354        let mixed_slice = rle_array.slice(1..4);
355        assert!(!mixed_slice.all_invalid());
356    }
357
358    #[test]
359    fn test_validity_mask() {
360        let values = PrimitiveArray::from_iter([10u32, 20, 30]).into_array();
361        let values_idx_offsets = PrimitiveArray::from_iter([0u64]).into_array();
362
363        // Pad indices to 1024 chunk.
364        let indices_pattern = [0u16, 1, 2, 0];
365        let validity_pattern = [true, false, true, false];
366
367        let indices_with_validity = PrimitiveArray::new(
368            indices_pattern
369                .iter()
370                .cycle()
371                .take(1024)
372                .copied()
373                .collect::<Buffer<u16>>(),
374            Validity::from_iter(validity_pattern.iter().cycle().take(1024).copied()),
375        )
376        .into_array();
377
378        let rle_array = RLEArray::try_new(
379            values.clone(),
380            indices_with_validity,
381            values_idx_offsets,
382            0,
383            4,
384        )
385        .unwrap();
386
387        let sliced_array = rle_array.slice(1..4);
388        let validity_mask = sliced_array.validity_mask();
389
390        let expected_mask = Validity::from_iter([false, true, false]).to_mask(3);
391        assert_eq!(validity_mask.len(), expected_mask.len());
392        assert_eq!(validity_mask, expected_mask);
393    }
394
395    #[test]
396    fn test_try_new_empty() {
397        let values = PrimitiveArray::from_iter(Vec::<u32>::new()).into_array();
398        let indices = PrimitiveArray::from_iter(Vec::<u16>::new()).into_array();
399        let values_idx_offsets = PrimitiveArray::from_iter(Vec::<u64>::new()).into_array();
400        let rle_array = RLEArray::try_new(
401            values,
402            indices.clone(),
403            values_idx_offsets,
404            0,
405            indices.len(),
406        )
407        .unwrap();
408
409        assert_eq!(rle_array.len(), 0);
410        assert_eq!(rle_array.values().len(), 0);
411    }
412
413    #[test]
414    fn test_multi_chunk_two_chunks() {
415        let values = PrimitiveArray::from_iter([10u32, 20, 30, 40]).into_array();
416        let indices = PrimitiveArray::from_iter([0u16, 1].repeat(1024)).into_array();
417        let values_idx_offsets = PrimitiveArray::from_iter([0u64, 2]).into_array();
418        let rle_array = RLEArray::try_new(values, indices, values_idx_offsets, 0, 2048).unwrap();
419
420        assert_eq!(rle_array.len(), 2048);
421        assert_eq!(rle_array.values().len(), 4);
422
423        assert_eq!(rle_array.values_idx_offset(0), 0);
424        assert_eq!(rle_array.values_idx_offset(1), 2);
425    }
426
427    #[test]
428    fn test_rle_serialization() {
429        let primitive = PrimitiveArray::from_iter((0..2048).map(|i| (i / 100) as u32));
430        let rle_array = RLEArray::encode(&primitive).unwrap();
431        assert_eq!(rle_array.len(), 2048);
432
433        let original_data = rle_array.to_primitive();
434        let original_values = original_data.as_slice::<u32>();
435
436        let ctx = ArrayContext::empty().with(RLEVTable.as_vtable());
437        let serialized = rle_array
438            .to_array()
439            .serialize(&ctx, &SerializeOptions::default())
440            .unwrap();
441
442        let mut concat = ByteBufferMut::empty();
443        for buf in serialized {
444            concat.extend_from_slice(buf.as_ref());
445        }
446        let concat = concat.freeze();
447
448        let parts = ArrayParts::try_from(concat).unwrap();
449        let decoded = parts
450            .decode(
451                &ctx,
452                &DType::Primitive(PType::U32, Nullability::NonNullable),
453                2048,
454            )
455            .unwrap();
456
457        let decoded_data = decoded.to_primitive();
458        let decoded_values = decoded_data.as_slice::<u32>();
459
460        assert_eq!(original_values, decoded_values);
461    }
462
463    #[test]
464    fn test_rle_serialization_slice() {
465        let primitive = PrimitiveArray::from_iter((0..2048).map(|i| (i / 100) as u32));
466        let rle_array = RLEArray::encode(&primitive).unwrap();
467        let sliced = rle_array.slice(100..200);
468        assert_eq!(sliced.len(), 100);
469
470        let ctx = ArrayContext::empty().with(RLEVTable.as_vtable());
471        let serialized = sliced
472            .serialize(&ctx, &SerializeOptions::default())
473            .unwrap();
474
475        let mut concat = ByteBufferMut::empty();
476        for buf in serialized {
477            concat.extend_from_slice(buf.as_ref());
478        }
479        let concat = concat.freeze();
480
481        let parts = ArrayParts::try_from(concat).unwrap();
482        let decoded = parts.decode(&ctx, sliced.dtype(), sliced.len()).unwrap();
483
484        let original_data = sliced.to_primitive();
485        let decoded_data = decoded.to_primitive();
486
487        let original_values = original_data.as_slice::<u32>();
488        let decoded_values = decoded_data.as_slice::<u32>();
489
490        assert_eq!(original_values, decoded_values);
491    }
492}