Skip to main content

vortex_array/arrays/listview/
rebuild.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use num_traits::FromPrimitive;
5use vortex_buffer::BufferMut;
6use vortex_dtype::IntegerPType;
7use vortex_dtype::Nullability;
8use vortex_dtype::match_each_integer_ptype;
9use vortex_error::VortexExpect;
10use vortex_error::VortexResult;
11
12use crate::Array;
13use crate::IntoArray;
14use crate::ToCanonical;
15use crate::arrays::ListViewArray;
16use crate::builders::builder_with_capacity;
17use crate::compute;
18use crate::scalar::Scalar;
19use crate::vtable::ValidityHelper;
20
21/// Modes for rebuilding a [`ListViewArray`].
22pub enum ListViewRebuildMode {
23    /// Removes all unused data and flattens out all list data, such that the array is zero-copyable
24    /// to a [`ListArray`].
25    ///
26    /// This mode will deduplicate all overlapping list views, such that the [`ListViewArray`] looks
27    /// like a [`ListArray`] but with an additional `sizes` array.
28    ///
29    /// [`ListArray`]: crate::arrays::ListArray
30    MakeZeroCopyToList,
31
32    /// Removes any leading or trailing elements that are unused / not referenced by any views in
33    /// the [`ListViewArray`].
34    TrimElements,
35
36    /// Equivalent to `MakeZeroCopyToList` plus `TrimElements`.
37    ///
38    /// This is useful when concatenating multiple [`ListViewArray`]s together to create a new
39    /// [`ListViewArray`] that is also zero-copy to a [`ListArray`].
40    ///
41    /// [`ListArray`]: crate::arrays::ListArray
42    MakeExact,
43
44    // TODO(connor)[ListView]: Implement some version of this.
45    /// Finds the shortest packing / overlapping of list elements.
46    ///
47    /// This problem is known to be NP-hard, so maybe when someone proves that P=NP we can implement
48    /// this algorithm (but in all seriousness there are many approximate algorithms that could
49    /// work well here).
50    OverlapCompression,
51}
52
53impl ListViewArray {
54    /// Rebuilds the [`ListViewArray`] according to the specified mode.
55    pub fn rebuild(&self, mode: ListViewRebuildMode) -> VortexResult<ListViewArray> {
56        if self.is_empty() {
57            return Ok(self.clone());
58        }
59
60        match mode {
61            ListViewRebuildMode::MakeZeroCopyToList => self.rebuild_zero_copy_to_list(),
62            ListViewRebuildMode::TrimElements => self.rebuild_trim_elements(),
63            ListViewRebuildMode::MakeExact => self.rebuild_make_exact(),
64            ListViewRebuildMode::OverlapCompression => unimplemented!("Does P=NP?"),
65        }
66    }
67
68    /// Rebuilds a [`ListViewArray`], removing all data overlaps and creating a flattened layout.
69    ///
70    /// This is useful when the `elements` child array of the [`ListViewArray`] might have
71    /// overlapping, duplicate, and garbage data, and we want to have fully sequential data like
72    /// a [`ListArray`].
73    ///
74    /// [`ListArray`]: crate::arrays::ListArray
75    fn rebuild_zero_copy_to_list(&self) -> VortexResult<ListViewArray> {
76        if self.is_zero_copy_to_list() {
77            // Note that since everything in `ListViewArray` is `Arc`ed, this is quite cheap.
78            return Ok(self.clone());
79        }
80
81        let offsets_ptype = self.offsets().dtype().as_ptype();
82        let sizes_ptype = self.sizes().dtype().as_ptype();
83
84        // One of the main purposes behind adding this "zero-copyable to `ListArray`" optimization
85        // is that we want to pass data to systems that expect Arrow data.
86        // The arrow specification only allows for `i32` and `i64` offset and sizes types, so in
87        // order to also make `ListView` zero-copyable to **Arrow**'s `ListArray` (not just Vortex's
88        // `ListArray`), we rebuild the offsets as 32-bit or 64-bit integer types.
89        // TODO(connor)[ListView]: This is true for `sizes` as well, we should do this conversion
90        // for sizes as well.
91        match_each_integer_ptype!(sizes_ptype, |S| {
92            match offsets_ptype {
93                PType::U8 => self.naive_rebuild::<u8, u32, S>(),
94                PType::U16 => self.naive_rebuild::<u16, u32, S>(),
95                PType::U32 => self.naive_rebuild::<u32, u32, S>(),
96                PType::U64 => self.naive_rebuild::<u64, u64, S>(),
97                PType::I8 => self.naive_rebuild::<i8, i32, S>(),
98                PType::I16 => self.naive_rebuild::<i16, i32, S>(),
99                PType::I32 => self.naive_rebuild::<i32, i32, S>(),
100                PType::I64 => self.naive_rebuild::<i64, i64, S>(),
101                _ => unreachable!("invalid offsets PType"),
102            }
103        })
104    }
105
106    // TODO(connor)[ListView]: We should benchmark if it is faster to use `take` on the elements
107    // instead of using a builder.
108    /// The inner function for `rebuild_zero_copy_to_list`, which rebuilds a `ListViewArray` piece
109    /// by piece.
110    fn naive_rebuild<O: IntegerPType, NewOffset: IntegerPType, S: IntegerPType>(
111        &self,
112    ) -> VortexResult<ListViewArray> {
113        let element_dtype = self
114            .dtype()
115            .as_list_element_opt()
116            .vortex_expect("somehow had a canonical list that was not a list");
117
118        let offsets_canonical = self.offsets().to_primitive();
119        let offsets_slice = offsets_canonical.as_slice::<O>();
120        let sizes_canonical = self.sizes().to_primitive();
121        let sizes_slice = sizes_canonical.as_slice::<S>();
122
123        let len = offsets_slice.len();
124
125        let mut new_offsets = BufferMut::<NewOffset>::with_capacity(len);
126        // TODO(connor)[ListView]: Do we really need to do this?
127        // The only reason we need to rebuild the sizes here is that the validity may indicate that
128        // a list is null even though it has a non-zero size. This rebuild will set the size of all
129        // null lists to 0.
130        let mut new_sizes = BufferMut::<S>::with_capacity(len);
131
132        // Canonicalize the elements up front as we will be slicing the elements quite a lot.
133        let elements_canonical = self
134            .elements()
135            .to_canonical()
136            .vortex_expect("canonicalize elements for rebuild")
137            .into_array();
138
139        // Note that we do not know what the exact capacity should be of the new elements since
140        // there could be overlaps in the existing `ListViewArray`.
141        let mut new_elements_builder =
142            builder_with_capacity(element_dtype.as_ref(), self.elements().len());
143
144        let mut n_elements = NewOffset::zero();
145        for index in 0..len {
146            if !self.is_valid(index)? {
147                // For NULL lists, place them after the previous item's data to maintain the
148                // no-overlap invariant for zero-copy to `ListArray` arrays.
149                new_offsets.push(n_elements);
150                new_sizes.push(S::zero());
151                continue;
152            }
153
154            let offset = offsets_slice[index];
155            let size = sizes_slice[index];
156
157            let start = offset.as_();
158            let stop = start + size.as_();
159
160            new_offsets.push(n_elements);
161            new_sizes.push(size);
162            new_elements_builder.extend_from_array(&elements_canonical.slice(start..stop)?);
163
164            n_elements += num_traits::cast(size).vortex_expect("Cast failed");
165        }
166
167        let offsets = new_offsets.into_array();
168        let sizes = new_sizes.into_array();
169        let elements = new_elements_builder.finish();
170
171        debug_assert_eq!(
172            n_elements.as_(),
173            elements.len(),
174            "The accumulated elements somehow had the wrong length"
175        );
176
177        // SAFETY:
178        // - All offsets are sequential and non-overlapping (`n_elements` tracks running total).
179        // - Each `offset[i] + size[i]` equals `offset[i+1]` for all valid indices (including null
180        //   lists).
181        // - All elements referenced by (offset, size) pairs exist within the new `elements` array.
182        // - The validity array is preserved from the original array unchanged
183        // - The array satisfies the zero-copy-to-list property by having sorted offsets, no gaps,
184        //   and no overlaps.
185        Ok(unsafe {
186            ListViewArray::new_unchecked(elements, offsets, sizes, self.validity.clone())
187                .with_zero_copy_to_list(true)
188        })
189    }
190
191    /// Rebuilds a [`ListViewArray`] by trimming any unused / unreferenced leading and trailing
192    /// elements, which is defined as a contiguous run of values in the `elements` array that are
193    /// not referecened by any views in the corresponding [`ListViewArray`].
194    fn rebuild_trim_elements(&self) -> VortexResult<ListViewArray> {
195        let start = if self.is_zero_copy_to_list() {
196            // If offsets are sorted, then the minimum offset is the first offset.
197            // Note that even if the first view is null, offsets must always be valid, so it is
198            // completely fine for us to use this as a lower-bounded start of the `elements`.
199            self.offset_at(0)
200        } else {
201            self.offsets().statistics().compute_min().vortex_expect(
202                "[ListViewArray::rebuild]: `offsets` must report min statistic that is a `usize`",
203            )
204        };
205
206        let end = if self.is_zero_copy_to_list() {
207            // If offsets are sorted and there are no overlaps (views are always "increasing"), we
208            // can just grab the last offset and last size.
209            let last_offset = self.offset_at(self.len() - 1);
210            let last_size = self.size_at(self.len() - 1);
211            last_offset + last_size
212        } else {
213            let min_max = compute::min_max(
214                &compute::add(self.offsets(), self.sizes())
215                    .vortex_expect("`offsets + sizes` somehow overflowed"),
216            )
217            .vortex_expect("Something went wrong while computing min and max")
218            .vortex_expect("We checked that the array was not empty in the top-level `rebuild`");
219
220            min_max
221                .max
222                .as_primitive()
223                .as_::<usize>()
224                .vortex_expect("unable to interpret the max `offset + size` as a `usize`")
225        };
226
227        let adjusted_offsets = match_each_integer_ptype!(self.offsets().dtype().as_ptype(), |O| {
228            let offset = <O as FromPrimitive>::from_usize(start)
229                .vortex_expect("unable to convert the min offset `start` into a `usize`");
230            let scalar = Scalar::primitive(offset, Nullability::NonNullable);
231
232            compute::sub_scalar(self.offsets(), scalar)
233                .vortex_expect("was somehow unable to adjust offsets down by their minimum")
234        });
235
236        let sliced_elements = self.elements().slice(start..end)?;
237
238        // SAFETY: The only thing we changed was the elements (which we verify through mins and
239        // maxes that all adjusted offsets + sizes are within the correct bounds), so the parameters
240        // are valid. And if the original array was zero-copyable to list, trimming elements doesn't
241        // change that property.
242        Ok(unsafe {
243            ListViewArray::new_unchecked(
244                sliced_elements,
245                adjusted_offsets,
246                self.sizes().clone(),
247                self.validity().clone(),
248            )
249            .with_zero_copy_to_list(self.is_zero_copy_to_list())
250        })
251    }
252
253    fn rebuild_make_exact(&self) -> VortexResult<ListViewArray> {
254        if self.is_zero_copy_to_list() {
255            self.rebuild_trim_elements()
256        } else {
257            // When we completely rebuild the `ListViewArray`, we get the benefit that we also trim
258            // any leading and trailing garbage data.
259            self.rebuild_zero_copy_to_list()
260        }
261    }
262}
263
264#[cfg(test)]
265mod tests {
266    use vortex_buffer::BitBuffer;
267    use vortex_dtype::Nullability;
268    use vortex_error::VortexResult;
269
270    use super::ListViewRebuildMode;
271    use crate::IntoArray;
272    use crate::ToCanonical;
273    use crate::arrays::ListViewArray;
274    use crate::arrays::PrimitiveArray;
275    use crate::assert_arrays_eq;
276    use crate::validity::Validity;
277    use crate::vtable::ValidityHelper;
278
279    #[test]
280    fn test_rebuild_flatten_removes_overlaps() -> VortexResult<()> {
281        // Create a list view with overlapping lists: [A, B, C]
282        // List 0: offset=0, size=3 -> [A, B, C]
283        // List 1: offset=1, size=2 -> [B, C] (overlaps with List 0)
284        let elements = PrimitiveArray::from_iter(vec![1i32, 2, 3]).into_array();
285        let offsets = PrimitiveArray::from_iter(vec![0u32, 1]).into_array();
286        let sizes = PrimitiveArray::from_iter(vec![3u32, 2]).into_array();
287
288        let listview = ListViewArray::new(elements, offsets, sizes, Validity::NonNullable);
289
290        let flattened = listview.rebuild(ListViewRebuildMode::MakeZeroCopyToList)?;
291
292        // After flatten: elements should be [A, B, C, B, C] = [1, 2, 3, 2, 3]
293        // Lists should be sequential with no overlaps
294        assert_eq!(flattened.elements().len(), 5);
295
296        // Offsets should be sequential
297        assert_eq!(flattened.offset_at(0), 0);
298        assert_eq!(flattened.size_at(0), 3);
299        assert_eq!(flattened.offset_at(1), 3);
300        assert_eq!(flattened.size_at(1), 2);
301
302        // Verify the data is correct
303        assert_arrays_eq!(
304            flattened.list_elements_at(0).unwrap(),
305            PrimitiveArray::from_iter([1i32, 2, 3])
306        );
307
308        assert_arrays_eq!(
309            flattened.list_elements_at(1).unwrap(),
310            PrimitiveArray::from_iter([2i32, 3])
311        );
312        Ok(())
313    }
314
315    #[test]
316    fn test_rebuild_flatten_with_nullable() -> VortexResult<()> {
317        use crate::arrays::BoolArray;
318
319        // Create a nullable list view with a null list
320        let elements = PrimitiveArray::from_iter(vec![1i32, 2, 3]).into_array();
321        let offsets = PrimitiveArray::from_iter(vec![0u32, 1, 2]).into_array();
322        let sizes = PrimitiveArray::from_iter(vec![2u32, 1, 1]).into_array();
323        let validity = Validity::Array(
324            BoolArray::new(
325                BitBuffer::from(vec![true, false, true]),
326                Validity::NonNullable,
327            )
328            .into_array(),
329        );
330
331        let listview = ListViewArray::new(elements, offsets, sizes, validity);
332
333        let flattened = listview.rebuild(ListViewRebuildMode::MakeZeroCopyToList)?;
334
335        // Verify nullability is preserved
336        assert_eq!(flattened.dtype().nullability(), Nullability::Nullable);
337        assert!(flattened.validity().is_valid(0).unwrap());
338        assert!(!flattened.validity().is_valid(1).unwrap());
339        assert!(flattened.validity().is_valid(2).unwrap());
340
341        // Verify valid lists contain correct data
342        assert_arrays_eq!(
343            flattened.list_elements_at(0).unwrap(),
344            PrimitiveArray::from_iter([1i32, 2])
345        );
346
347        assert_arrays_eq!(
348            flattened.list_elements_at(2).unwrap(),
349            PrimitiveArray::from_iter([3i32])
350        );
351        Ok(())
352    }
353
354    #[test]
355    fn test_rebuild_trim_elements_basic() -> VortexResult<()> {
356        // Test trimming both leading and trailing unused elements while preserving gaps in the
357        // middle.
358        // Elements: [_, _, A, B, _, C, D, _, _]
359        //            0  1  2  3  4  5  6  7  8
360        // List 0: offset=2, size=2 -> [A, B]
361        // List 1: offset=5, size=2 -> [C, D]
362        // Should trim to: [A, B, _, C, D] with adjusted offsets.
363        let elements =
364            PrimitiveArray::from_iter(vec![99i32, 98, 1, 2, 97, 3, 4, 96, 95]).into_array();
365        let offsets = PrimitiveArray::from_iter(vec![2u32, 5]).into_array();
366        let sizes = PrimitiveArray::from_iter(vec![2u32, 2]).into_array();
367
368        let listview = ListViewArray::new(elements, offsets, sizes, Validity::NonNullable);
369
370        let trimmed = listview.rebuild(ListViewRebuildMode::TrimElements)?;
371
372        // After trimming: elements should be [A, B, _, C, D] = [1, 2, 97, 3, 4].
373        assert_eq!(trimmed.elements().len(), 5);
374
375        // Offsets should be adjusted: old offset 2 -> new offset 0, old offset 5 -> new offset 3.
376        assert_eq!(trimmed.offset_at(0), 0);
377        assert_eq!(trimmed.size_at(0), 2);
378        assert_eq!(trimmed.offset_at(1), 3);
379        assert_eq!(trimmed.size_at(1), 2);
380
381        // Verify the data is correct.
382        assert_arrays_eq!(
383            trimmed.list_elements_at(0).unwrap(),
384            PrimitiveArray::from_iter([1i32, 2])
385        );
386
387        assert_arrays_eq!(
388            trimmed.list_elements_at(1).unwrap(),
389            PrimitiveArray::from_iter([3i32, 4])
390        );
391
392        // Note that element at index 2 (97) is preserved as a gap.
393        let all_elements = trimmed.elements().to_primitive();
394        assert_eq!(all_elements.scalar_at(2).unwrap(), 97i32.into());
395        Ok(())
396    }
397
398    #[test]
399    fn test_rebuild_with_trailing_nulls_regression() -> VortexResult<()> {
400        // Regression test for issue #5412
401        // Tests that zero-copy-to-list arrays with trailing NULLs correctly calculate
402        // offsets for NULL items to maintain no-overlap invariant
403
404        // Create a ListViewArray with trailing NULLs
405        let elements = PrimitiveArray::from_iter(vec![1i32, 2, 3, 4]).into_array();
406        let offsets = PrimitiveArray::from_iter(vec![0u32, 2, 0, 0]).into_array();
407        let sizes = PrimitiveArray::from_iter(vec![2u32, 2, 0, 0]).into_array();
408        let validity = Validity::from_iter(vec![true, true, false, false]);
409
410        let listview = ListViewArray::new(elements, offsets, sizes, validity);
411
412        // First rebuild to make it zero-copy-to-list
413        let rebuilt = listview.rebuild(ListViewRebuildMode::MakeZeroCopyToList)?;
414        assert!(rebuilt.is_zero_copy_to_list());
415
416        // Verify NULL items have correct offsets (should not reuse previous offsets)
417        // After rebuild: offsets should be [0, 2, 4, 4] for zero-copy-to-list
418        assert_eq!(rebuilt.offset_at(0), 0);
419        assert_eq!(rebuilt.offset_at(1), 2);
420        assert_eq!(rebuilt.offset_at(2), 4); // NULL should be at position 4
421        assert_eq!(rebuilt.offset_at(3), 4); // Second NULL also at position 4
422
423        // All sizes should be correct
424        assert_eq!(rebuilt.size_at(0), 2);
425        assert_eq!(rebuilt.size_at(1), 2);
426        assert_eq!(rebuilt.size_at(2), 0); // NULL has size 0
427        assert_eq!(rebuilt.size_at(3), 0); // NULL has size 0
428
429        // Now rebuild with MakeExact (which calls naive_rebuild then trim_elements)
430        // This should not panic (issue #5412)
431        let exact = rebuilt.rebuild(ListViewRebuildMode::MakeExact)?;
432
433        // Verify the result is still valid
434        assert!(exact.is_valid(0).unwrap());
435        assert!(exact.is_valid(1).unwrap());
436        assert!(!exact.is_valid(2).unwrap());
437        assert!(!exact.is_valid(3).unwrap());
438
439        // Verify data is preserved
440        assert_arrays_eq!(
441            exact.list_elements_at(0).unwrap(),
442            PrimitiveArray::from_iter([1i32, 2])
443        );
444
445        assert_arrays_eq!(
446            exact.list_elements_at(1).unwrap(),
447            PrimitiveArray::from_iter([3i32, 4])
448        );
449        Ok(())
450    }
451}