vortex_array/arrays/listview/
rebuild.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use num_traits::FromPrimitive;
5use vortex_buffer::BufferMut;
6use vortex_dtype::IntegerPType;
7use vortex_dtype::Nullability;
8use vortex_dtype::match_each_integer_ptype;
9use vortex_error::VortexExpect;
10use vortex_scalar::Scalar;
11
12use crate::Array;
13use crate::IntoArray;
14use crate::ToCanonical;
15use crate::arrays::ListViewArray;
16use crate::builders::builder_with_capacity;
17use crate::compute;
18use crate::vtable::ValidityHelper;
19
20/// Modes for rebuilding a [`ListViewArray`].
21pub enum ListViewRebuildMode {
22    /// Removes all unused data and flattens out all list data, such that the array is zero-copyable
23    /// to a [`ListArray`].
24    ///
25    /// This mode will deduplicate all overlapping list views, such that the [`ListViewArray`] looks
26    /// like a [`ListArray`] but with an additional `sizes` array.
27    ///
28    /// [`ListArray`]: crate::arrays::ListArray
29    MakeZeroCopyToList,
30
31    /// Removes any leading or trailing elements that are unused / not referenced by any views in
32    /// the [`ListViewArray`].
33    TrimElements,
34
35    /// Equivalent to `MakeZeroCopyToList` plus `TrimElements`.
36    ///
37    /// This is useful when concatenating multiple [`ListViewArray`]s together to create a new
38    /// [`ListViewArray`] that is also zero-copy to a [`ListArray`].
39    ///
40    /// [`ListArray`]: crate::arrays::ListArray
41    MakeExact,
42
43    // TODO(connor)[ListView]: Implement some version of this.
44    /// Finds the shortest packing / overlapping of list elements.
45    ///
46    /// This problem is known to be NP-hard, so maybe when someone proves that P=NP we can implement
47    /// this algorithm (but in all seriousness there are many approximate algorithms that could
48    /// work well here).
49    OverlapCompression,
50}
51
52impl ListViewArray {
53    /// Rebuilds the [`ListViewArray`] according to the specified mode.
54    pub fn rebuild(&self, mode: ListViewRebuildMode) -> ListViewArray {
55        if self.is_empty() {
56            return self.clone();
57        }
58
59        match mode {
60            ListViewRebuildMode::MakeZeroCopyToList => self.rebuild_zero_copy_to_list(),
61            ListViewRebuildMode::TrimElements => self.rebuild_trim_elements(),
62            ListViewRebuildMode::MakeExact => self.rebuild_make_exact(),
63            ListViewRebuildMode::OverlapCompression => unimplemented!("Does P=NP?"),
64        }
65    }
66
67    /// Rebuilds a [`ListViewArray`], removing all data overlaps and creating a flattened layout.
68    ///
69    /// This is useful when the `elements` child array of the [`ListViewArray`] might have
70    /// overlapping, duplicate, and garbage data, and we want to have fully sequential data like
71    /// a [`ListArray`].
72    ///
73    /// [`ListArray`]: crate::arrays::ListArray
74    fn rebuild_zero_copy_to_list(&self) -> ListViewArray {
75        if self.is_zero_copy_to_list() {
76            // Note that since everything in `ListViewArray` is `Arc`ed, this is quite cheap.
77            return self.clone();
78        }
79
80        let offsets_ptype = self.offsets().dtype().as_ptype();
81        let sizes_ptype = self.sizes().dtype().as_ptype();
82
83        // One of the main purposes behind adding this "zero-copyable to `ListArray`" optimization
84        // is that we want to pass data to systems that expect Arrow data.
85        // The arrow specification only allows for `i32` and `i64` offset and sizes types, so in
86        // order to also make `ListView` zero-copyable to **Arrow**'s `ListArray` (not just Vortex's
87        // `ListArray`), we rebuild the offsets as 32-bit or 64-bit integer types.
88        // TODO(connor)[ListView]: This is true for `sizes` as well, we should do this conversion
89        // for sizes as well.
90        match_each_integer_ptype!(sizes_ptype, |S| {
91            match offsets_ptype {
92                PType::U8 => self.naive_rebuild::<u8, u32, S>(),
93                PType::U16 => self.naive_rebuild::<u16, u32, S>(),
94                PType::U32 => self.naive_rebuild::<u32, u32, S>(),
95                PType::U64 => self.naive_rebuild::<u64, u64, S>(),
96                PType::I8 => self.naive_rebuild::<i8, i32, S>(),
97                PType::I16 => self.naive_rebuild::<i16, i32, S>(),
98                PType::I32 => self.naive_rebuild::<i32, i32, S>(),
99                PType::I64 => self.naive_rebuild::<i64, i64, S>(),
100                _ => unreachable!("invalid offsets PType"),
101            }
102        })
103    }
104
105    // TODO(connor)[ListView]: We should benchmark if it is faster to use `take` on the elements
106    // instead of using a builder.
107    /// The inner function for `rebuild_zero_copy_to_list`, which rebuilds a `ListViewArray` piece
108    /// by piece.
109    fn naive_rebuild<O: IntegerPType, NewOffset: IntegerPType, S: IntegerPType>(
110        &self,
111    ) -> ListViewArray {
112        let element_dtype = self
113            .dtype()
114            .as_list_element_opt()
115            .vortex_expect("somehow had a canonical list that was not a list");
116
117        let offsets_canonical = self.offsets().to_primitive();
118        let offsets_slice = offsets_canonical.as_slice::<O>();
119        let sizes_canonical = self.sizes().to_primitive();
120        let sizes_slice = sizes_canonical.as_slice::<S>();
121
122        let len = offsets_slice.len();
123
124        let mut new_offsets = BufferMut::<NewOffset>::with_capacity(len);
125        // TODO(connor)[ListView]: Do we really need to do this?
126        // The only reason we need to rebuild the sizes here is that the validity may indicate that
127        // a list is null even though it has a non-zero size. This rebuild will set the size of all
128        // null lists to 0.
129        let mut new_sizes = BufferMut::<S>::with_capacity(len);
130
131        // Canonicalize the elements up front as we will be slicing the elements quite a lot.
132        let elements_canonical = self.elements().to_canonical().into_array();
133
134        // Note that we do not know what the exact capacity should be of the new elements since
135        // there could be overlaps in the existing `ListViewArray`.
136        let mut new_elements_builder =
137            builder_with_capacity(element_dtype.as_ref(), self.elements().len());
138
139        let mut n_elements = NewOffset::zero();
140        for index in 0..len {
141            if !self.is_valid(index) {
142                // For NULL lists, place them after the previous item's data to maintain the
143                // no-overlap invariant for zero-copy to `ListArray` arrays.
144                new_offsets.push(n_elements);
145                new_sizes.push(S::zero());
146                continue;
147            }
148
149            let offset = offsets_slice[index];
150            let size = sizes_slice[index];
151
152            let start = offset.as_();
153            let stop = start + size.as_();
154
155            new_offsets.push(n_elements);
156            new_sizes.push(size);
157            new_elements_builder.extend_from_array(&elements_canonical.slice(start..stop));
158
159            n_elements += num_traits::cast(size).vortex_expect("Cast failed");
160        }
161
162        let offsets = new_offsets.into_array();
163        let sizes = new_sizes.into_array();
164        let elements = new_elements_builder.finish();
165
166        debug_assert_eq!(
167            n_elements.as_(),
168            elements.len(),
169            "The accumulated elements somehow had the wrong length"
170        );
171
172        // SAFETY:
173        // - All offsets are sequential and non-overlapping (`n_elements` tracks running total).
174        // - Each `offset[i] + size[i]` equals `offset[i+1]` for all valid indices (including null
175        //   lists).
176        // - All elements referenced by (offset, size) pairs exist within the new `elements` array.
177        // - The validity array is preserved from the original array unchanged
178        // - The array satisfies the zero-copy-to-list property by having sorted offsets, no gaps,
179        //   and no overlaps.
180        unsafe {
181            ListViewArray::new_unchecked(elements, offsets, sizes, self.validity.clone())
182                .with_zero_copy_to_list(true)
183        }
184    }
185
186    /// Rebuilds a [`ListViewArray`] by trimming any unused / unreferenced leading and trailing
187    /// elements, which is defined as a contiguous run of values in the `elements` array that are
188    /// not referecened by any views in the corresponding [`ListViewArray`].
189    fn rebuild_trim_elements(&self) -> ListViewArray {
190        let start = if self.is_zero_copy_to_list() {
191            // If offsets are sorted, then the minimum offset is the first offset.
192            // Note that even if the first view is null, offsets must always be valid, so it is
193            // completely fine for us to use this as a lower-bounded start of the `elements`.
194            self.offset_at(0)
195        } else {
196            self.offsets().statistics().compute_min().vortex_expect(
197                "[ListViewArray::rebuild]: `offsets` must report min statistic that is a `usize`",
198            )
199        };
200
201        let end = if self.is_zero_copy_to_list() {
202            // If offsets are sorted and there are no overlaps (views are always "increasing"), we
203            // can just grab the last offset and last size.
204            let last_offset = self.offset_at(self.len() - 1);
205            let last_size = self.size_at(self.len() - 1);
206            last_offset + last_size
207        } else {
208            let min_max = compute::min_max(
209                &compute::add(self.offsets(), self.sizes())
210                    .vortex_expect("`offsets + sizes` somehow overflowed"),
211            )
212            .vortex_expect("Something went wrong while computing min and max")
213            .vortex_expect("We checked that the array was not empty in the top-level `rebuild`");
214
215            min_max
216                .max
217                .as_primitive()
218                .as_::<usize>()
219                .vortex_expect("unable to interpret the max `offset + size` as a `usize`")
220        };
221
222        let adjusted_offsets = match_each_integer_ptype!(self.offsets().dtype().as_ptype(), |O| {
223            let offset = <O as FromPrimitive>::from_usize(start)
224                .vortex_expect("unable to convert the min offset `start` into a `usize`");
225            let scalar = Scalar::primitive(offset, Nullability::NonNullable);
226
227            compute::sub_scalar(self.offsets(), scalar)
228                .vortex_expect("was somehow unable to adjust offsets down by their minimum")
229        });
230
231        let sliced_elements = self.elements().slice(start..end);
232
233        // SAFETY: The only thing we changed was the elements (which we verify through mins and
234        // maxes that all adjusted offsets + sizes are within the correct bounds), so the parameters
235        // are valid. And if the original array was zero-copyable to list, trimming elements doesn't
236        // change that property.
237        unsafe {
238            ListViewArray::new_unchecked(
239                sliced_elements,
240                adjusted_offsets,
241                self.sizes().clone(),
242                self.validity().clone(),
243            )
244            .with_zero_copy_to_list(self.is_zero_copy_to_list())
245        }
246    }
247
248    fn rebuild_make_exact(&self) -> ListViewArray {
249        if self.is_zero_copy_to_list() {
250            self.rebuild_trim_elements()
251        } else {
252            // When we completely rebuild the `ListViewArray`, we get the benefit that we also trim
253            // any leading and trailing garbage data.
254            self.rebuild_zero_copy_to_list()
255        }
256    }
257}
258
259#[cfg(test)]
260mod tests {
261    use vortex_buffer::BitBuffer;
262    use vortex_dtype::Nullability;
263
264    use super::ListViewRebuildMode;
265    use crate::IntoArray;
266    use crate::ToCanonical;
267    use crate::arrays::ListViewArray;
268    use crate::arrays::PrimitiveArray;
269    use crate::validity::Validity;
270    use crate::vtable::ValidityHelper;
271
272    #[test]
273    fn test_rebuild_flatten_removes_overlaps() {
274        // Create a list view with overlapping lists: [A, B, C]
275        // List 0: offset=0, size=3 -> [A, B, C]
276        // List 1: offset=1, size=2 -> [B, C] (overlaps with List 0)
277        let elements = PrimitiveArray::from_iter(vec![1i32, 2, 3]).into_array();
278        let offsets = PrimitiveArray::from_iter(vec![0u32, 1]).into_array();
279        let sizes = PrimitiveArray::from_iter(vec![3u32, 2]).into_array();
280
281        let listview = ListViewArray::new(elements, offsets, sizes, Validity::NonNullable);
282
283        let flattened = listview.rebuild(ListViewRebuildMode::MakeZeroCopyToList);
284
285        // After flatten: elements should be [A, B, C, B, C] = [1, 2, 3, 2, 3]
286        // Lists should be sequential with no overlaps
287        assert_eq!(flattened.elements().len(), 5);
288
289        // Offsets should be sequential
290        assert_eq!(flattened.offset_at(0), 0);
291        assert_eq!(flattened.size_at(0), 3);
292        assert_eq!(flattened.offset_at(1), 3);
293        assert_eq!(flattened.size_at(1), 2);
294
295        // Verify the data is correct
296        let list0 = flattened.list_elements_at(0).to_primitive();
297        assert_eq!(list0.as_slice::<i32>(), &[1, 2, 3]);
298
299        let list1 = flattened.list_elements_at(1).to_primitive();
300        assert_eq!(list1.as_slice::<i32>(), &[2, 3]);
301    }
302
303    #[test]
304    fn test_rebuild_flatten_with_nullable() {
305        use crate::arrays::BoolArray;
306
307        // Create a nullable list view with a null list
308        let elements = PrimitiveArray::from_iter(vec![1i32, 2, 3]).into_array();
309        let offsets = PrimitiveArray::from_iter(vec![0u32, 1, 2]).into_array();
310        let sizes = PrimitiveArray::from_iter(vec![2u32, 1, 1]).into_array();
311        let validity = Validity::Array(
312            BoolArray::from_bit_buffer(
313                BitBuffer::from(vec![true, false, true]),
314                Validity::NonNullable,
315            )
316            .into_array(),
317        );
318
319        let listview = ListViewArray::new(elements, offsets, sizes, validity);
320
321        let flattened = listview.rebuild(ListViewRebuildMode::MakeZeroCopyToList);
322
323        // Verify nullability is preserved
324        assert_eq!(flattened.dtype().nullability(), Nullability::Nullable);
325        assert!(flattened.validity().is_valid(0));
326        assert!(!flattened.validity().is_valid(1));
327        assert!(flattened.validity().is_valid(2));
328
329        // Verify valid lists contain correct data
330        let list0 = flattened.list_elements_at(0).to_primitive();
331        assert_eq!(list0.as_slice::<i32>(), &[1, 2]);
332
333        let list2 = flattened.list_elements_at(2).to_primitive();
334        assert_eq!(list2.as_slice::<i32>(), &[3]);
335    }
336
337    #[test]
338    fn test_rebuild_trim_elements_basic() {
339        // Test trimming both leading and trailing unused elements while preserving gaps in the
340        // middle.
341        // Elements: [_, _, A, B, _, C, D, _, _]
342        //            0  1  2  3  4  5  6  7  8
343        // List 0: offset=2, size=2 -> [A, B]
344        // List 1: offset=5, size=2 -> [C, D]
345        // Should trim to: [A, B, _, C, D] with adjusted offsets.
346        let elements =
347            PrimitiveArray::from_iter(vec![99i32, 98, 1, 2, 97, 3, 4, 96, 95]).into_array();
348        let offsets = PrimitiveArray::from_iter(vec![2u32, 5]).into_array();
349        let sizes = PrimitiveArray::from_iter(vec![2u32, 2]).into_array();
350
351        let listview = ListViewArray::new(elements, offsets, sizes, Validity::NonNullable);
352
353        let trimmed = listview.rebuild(ListViewRebuildMode::TrimElements);
354
355        // After trimming: elements should be [A, B, _, C, D] = [1, 2, 97, 3, 4].
356        assert_eq!(trimmed.elements().len(), 5);
357
358        // Offsets should be adjusted: old offset 2 -> new offset 0, old offset 5 -> new offset 3.
359        assert_eq!(trimmed.offset_at(0), 0);
360        assert_eq!(trimmed.size_at(0), 2);
361        assert_eq!(trimmed.offset_at(1), 3);
362        assert_eq!(trimmed.size_at(1), 2);
363
364        // Verify the data is correct.
365        let list0 = trimmed.list_elements_at(0).to_primitive();
366        assert_eq!(list0.as_slice::<i32>(), &[1, 2]);
367
368        let list1 = trimmed.list_elements_at(1).to_primitive();
369        assert_eq!(list1.as_slice::<i32>(), &[3, 4]);
370
371        // Note that element at index 2 (97) is preserved as a gap.
372        let all_elements = trimmed.elements().to_primitive();
373        assert_eq!(all_elements.scalar_at(2), 97i32.into());
374    }
375
376    #[test]
377    fn test_rebuild_with_trailing_nulls_regression() {
378        // Regression test for issue #5412
379        // Tests that zero-copy-to-list arrays with trailing NULLs correctly calculate
380        // offsets for NULL items to maintain no-overlap invariant
381
382        // Create a ListViewArray with trailing NULLs
383        let elements = PrimitiveArray::from_iter(vec![1i32, 2, 3, 4]).into_array();
384        let offsets = PrimitiveArray::from_iter(vec![0u32, 2, 0, 0]).into_array();
385        let sizes = PrimitiveArray::from_iter(vec![2u32, 2, 0, 0]).into_array();
386        let validity = Validity::from_iter(vec![true, true, false, false]);
387
388        let listview = ListViewArray::new(elements, offsets, sizes, validity);
389
390        // First rebuild to make it zero-copy-to-list
391        let rebuilt = listview.rebuild(ListViewRebuildMode::MakeZeroCopyToList);
392        assert!(rebuilt.is_zero_copy_to_list());
393
394        // Verify NULL items have correct offsets (should not reuse previous offsets)
395        // After rebuild: offsets should be [0, 2, 4, 4] for zero-copy-to-list
396        assert_eq!(rebuilt.offset_at(0), 0);
397        assert_eq!(rebuilt.offset_at(1), 2);
398        assert_eq!(rebuilt.offset_at(2), 4); // NULL should be at position 4
399        assert_eq!(rebuilt.offset_at(3), 4); // Second NULL also at position 4
400
401        // All sizes should be correct
402        assert_eq!(rebuilt.size_at(0), 2);
403        assert_eq!(rebuilt.size_at(1), 2);
404        assert_eq!(rebuilt.size_at(2), 0); // NULL has size 0
405        assert_eq!(rebuilt.size_at(3), 0); // NULL has size 0
406
407        // Now rebuild with MakeExact (which calls naive_rebuild then trim_elements)
408        // This should not panic (issue #5412)
409        let exact = rebuilt.rebuild(ListViewRebuildMode::MakeExact);
410
411        // Verify the result is still valid
412        assert!(exact.is_valid(0));
413        assert!(exact.is_valid(1));
414        assert!(!exact.is_valid(2));
415        assert!(!exact.is_valid(3));
416
417        // Verify data is preserved
418        let list0 = exact.list_elements_at(0).to_primitive();
419        assert_eq!(list0.as_slice::<i32>(), &[1, 2]);
420
421        let list1 = exact.list_elements_at(1).to_primitive();
422        assert_eq!(list1.as_slice::<i32>(), &[3, 4]);
423    }
424}