Skip to main content

vortex_array/arrays/listview/
rebuild.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use num_traits::FromPrimitive;
5use vortex_buffer::BufferMut;
6use vortex_error::VortexExpect;
7use vortex_error::VortexResult;
8
9use crate::Array;
10use crate::IntoArray;
11use crate::ToCanonical;
12use crate::arrays::ConstantArray;
13use crate::arrays::ListViewArray;
14use crate::builders::builder_with_capacity;
15use crate::builtins::ArrayBuiltins;
16use crate::compute;
17use crate::dtype::IntegerPType;
18use crate::dtype::Nullability;
19use crate::match_each_integer_ptype;
20use crate::scalar::Scalar;
21use crate::scalar_fn::fns::operators::Operator;
22use crate::vtable::ValidityHelper;
23
24/// Modes for rebuilding a [`ListViewArray`].
25pub enum ListViewRebuildMode {
26    /// Removes all unused data and flattens out all list data, such that the array is zero-copyable
27    /// to a [`ListArray`].
28    ///
29    /// This mode will deduplicate all overlapping list views, such that the [`ListViewArray`] looks
30    /// like a [`ListArray`] but with an additional `sizes` array.
31    ///
32    /// [`ListArray`]: crate::arrays::ListArray
33    MakeZeroCopyToList,
34
35    /// Removes any leading or trailing elements that are unused / not referenced by any views in
36    /// the [`ListViewArray`].
37    TrimElements,
38
39    /// Equivalent to `MakeZeroCopyToList` plus `TrimElements`.
40    ///
41    /// This is useful when concatenating multiple [`ListViewArray`]s together to create a new
42    /// [`ListViewArray`] that is also zero-copy to a [`ListArray`].
43    ///
44    /// [`ListArray`]: crate::arrays::ListArray
45    MakeExact,
46
47    // TODO(connor)[ListView]: Implement some version of this.
48    /// Finds the shortest packing / overlapping of list elements.
49    ///
50    /// This problem is known to be NP-hard, so maybe when someone proves that P=NP we can implement
51    /// this algorithm (but in all seriousness there are many approximate algorithms that could
52    /// work well here).
53    OverlapCompression,
54}
55
56impl ListViewArray {
57    /// Rebuilds the [`ListViewArray`] according to the specified mode.
58    pub fn rebuild(&self, mode: ListViewRebuildMode) -> VortexResult<ListViewArray> {
59        if self.is_empty() {
60            return Ok(self.clone());
61        }
62
63        match mode {
64            ListViewRebuildMode::MakeZeroCopyToList => self.rebuild_zero_copy_to_list(),
65            ListViewRebuildMode::TrimElements => self.rebuild_trim_elements(),
66            ListViewRebuildMode::MakeExact => self.rebuild_make_exact(),
67            ListViewRebuildMode::OverlapCompression => unimplemented!("Does P=NP?"),
68        }
69    }
70
71    /// Rebuilds a [`ListViewArray`], removing all data overlaps and creating a flattened layout.
72    ///
73    /// This is useful when the `elements` child array of the [`ListViewArray`] might have
74    /// overlapping, duplicate, and garbage data, and we want to have fully sequential data like
75    /// a [`ListArray`].
76    ///
77    /// [`ListArray`]: crate::arrays::ListArray
78    fn rebuild_zero_copy_to_list(&self) -> VortexResult<ListViewArray> {
79        if self.is_zero_copy_to_list() {
80            // Note that since everything in `ListViewArray` is `Arc`ed, this is quite cheap.
81            return Ok(self.clone());
82        }
83
84        let offsets_ptype = self.offsets().dtype().as_ptype();
85        let sizes_ptype = self.sizes().dtype().as_ptype();
86
87        // One of the main purposes behind adding this "zero-copyable to `ListArray`" optimization
88        // is that we want to pass data to systems that expect Arrow data.
89        // The arrow specification only allows for `i32` and `i64` offset and sizes types, so in
90        // order to also make `ListView` zero-copyable to **Arrow**'s `ListArray` (not just Vortex's
91        // `ListArray`), we rebuild the offsets as 32-bit or 64-bit integer types.
92        // TODO(connor)[ListView]: This is true for `sizes` as well, we should do this conversion
93        // for sizes as well.
94        match_each_integer_ptype!(sizes_ptype, |S| {
95            match offsets_ptype {
96                PType::U8 => self.naive_rebuild::<u8, u32, S>(),
97                PType::U16 => self.naive_rebuild::<u16, u32, S>(),
98                PType::U32 => self.naive_rebuild::<u32, u32, S>(),
99                PType::U64 => self.naive_rebuild::<u64, u64, S>(),
100                PType::I8 => self.naive_rebuild::<i8, i32, S>(),
101                PType::I16 => self.naive_rebuild::<i16, i32, S>(),
102                PType::I32 => self.naive_rebuild::<i32, i32, S>(),
103                PType::I64 => self.naive_rebuild::<i64, i64, S>(),
104                _ => unreachable!("invalid offsets PType"),
105            }
106        })
107    }
108
109    /// Picks between [`rebuild_with_take`](Self::rebuild_with_take) and
110    /// [`rebuild_list_by_list`](Self::rebuild_list_by_list) based on element dtype and average
111    /// list size.
112    fn naive_rebuild<O: IntegerPType, NewOffset: IntegerPType, S: IntegerPType>(
113        &self,
114    ) -> VortexResult<ListViewArray> {
115        let sizes_canonical = self.sizes().to_primitive();
116        let total: u64 = sizes_canonical
117            .as_slice::<S>()
118            .iter()
119            .map(|s| (*s).as_() as u64)
120            .sum();
121        if Self::should_use_take(total, self.len()) {
122            self.rebuild_with_take::<O, NewOffset, S>()
123        } else {
124            self.rebuild_list_by_list::<O, NewOffset, S>()
125        }
126    }
127
128    /// Returns `true` when we are confident that `rebuild_with_take` will
129    /// outperform `rebuild_list_by_list`.
130    ///
131    /// Take is dramatically faster for small lists (often 10-100×) because it
132    /// avoids per-list builder overhead. LBL is the safer default for larger
133    /// lists since its sequential memcpy scales well. We only choose take when
134    /// the average list size is small enough that take clearly dominates.
135    fn should_use_take(total_output_elements: u64, num_lists: usize) -> bool {
136        if num_lists == 0 {
137            return true;
138        }
139        let avg = total_output_elements / num_lists as u64;
140        avg < 128
141    }
142
143    /// Rebuilds elements using a single bulk `take`: collect all element indices into a flat
144    /// `BufferMut<u64>`, perform a single `take`.
145    fn rebuild_with_take<O: IntegerPType, NewOffset: IntegerPType, S: IntegerPType>(
146        &self,
147    ) -> VortexResult<ListViewArray> {
148        let offsets_canonical = self.offsets().to_primitive();
149        let offsets_slice = offsets_canonical.as_slice::<O>();
150        let sizes_canonical = self.sizes().to_primitive();
151        let sizes_slice = sizes_canonical.as_slice::<S>();
152
153        let len = offsets_slice.len();
154
155        let mut new_offsets = BufferMut::<NewOffset>::with_capacity(len);
156        let mut new_sizes = BufferMut::<S>::with_capacity(len);
157        let mut take_indices = BufferMut::<u64>::with_capacity(self.elements().len());
158
159        let mut n_elements = NewOffset::zero();
160        for index in 0..len {
161            if !self.is_valid(index)? {
162                new_offsets.push(n_elements);
163                new_sizes.push(S::zero());
164                continue;
165            }
166
167            let offset = offsets_slice[index];
168            let size = sizes_slice[index];
169            let start = offset.as_();
170            let stop = start + size.as_();
171
172            new_offsets.push(n_elements);
173            new_sizes.push(size);
174            take_indices.extend(start as u64..stop as u64);
175            n_elements += num_traits::cast(size).vortex_expect("Cast failed");
176        }
177
178        let elements = self.elements().take(take_indices.into_array())?;
179        let offsets = new_offsets.into_array();
180        let sizes = new_sizes.into_array();
181
182        // SAFETY: same invariants as `rebuild_list_by_list` — offsets are sequential and
183        // non-overlapping, all (offset, size) pairs reference valid elements, and the validity
184        // array is preserved from the original.
185        Ok(unsafe {
186            ListViewArray::new_unchecked(elements, offsets, sizes, self.validity.clone())
187                .with_zero_copy_to_list(true)
188        })
189    }
190
191    /// Rebuilds elements list-by-list: canonicalize elements upfront, then for each list `slice`
192    /// the relevant range and `extend_from_array` into a typed builder.
193    fn rebuild_list_by_list<O: IntegerPType, NewOffset: IntegerPType, S: IntegerPType>(
194        &self,
195    ) -> VortexResult<ListViewArray> {
196        let element_dtype = self
197            .dtype()
198            .as_list_element_opt()
199            .vortex_expect("somehow had a canonical list that was not a list");
200
201        let offsets_canonical = self.offsets().to_primitive();
202        let offsets_slice = offsets_canonical.as_slice::<O>();
203        let sizes_canonical = self.sizes().to_primitive();
204        let sizes_slice = sizes_canonical.as_slice::<S>();
205
206        let len = offsets_slice.len();
207
208        let mut new_offsets = BufferMut::<NewOffset>::with_capacity(len);
209        // TODO(connor)[ListView]: Do we really need to do this?
210        // The only reason we need to rebuild the sizes here is that the validity may indicate that
211        // a list is null even though it has a non-zero size. This rebuild will set the size of all
212        // null lists to 0.
213        let mut new_sizes = BufferMut::<S>::with_capacity(len);
214
215        // Canonicalize the elements up front as we will be slicing the elements quite a lot.
216        let elements_canonical = self
217            .elements()
218            .to_canonical()
219            .vortex_expect("canonicalize elements for rebuild")
220            .into_array();
221
222        // Note that we do not know what the exact capacity should be of the new elements since
223        // there could be overlaps in the existing `ListViewArray`.
224        let mut new_elements_builder =
225            builder_with_capacity(element_dtype.as_ref(), self.elements().len());
226
227        let mut n_elements = NewOffset::zero();
228        for index in 0..len {
229            if !self.is_valid(index)? {
230                // For NULL lists, place them after the previous item's data to maintain the
231                // no-overlap invariant for zero-copy to `ListArray` arrays.
232                new_offsets.push(n_elements);
233                new_sizes.push(S::zero());
234                continue;
235            }
236
237            let offset = offsets_slice[index];
238            let size = sizes_slice[index];
239
240            let start = offset.as_();
241            let stop = start + size.as_();
242
243            new_offsets.push(n_elements);
244            new_sizes.push(size);
245            new_elements_builder.extend_from_array(&elements_canonical.slice(start..stop)?);
246
247            n_elements += num_traits::cast(size).vortex_expect("Cast failed");
248        }
249
250        let offsets = new_offsets.into_array();
251        let sizes = new_sizes.into_array();
252        let elements = new_elements_builder.finish();
253
254        debug_assert_eq!(
255            n_elements.as_(),
256            elements.len(),
257            "The accumulated elements somehow had the wrong length"
258        );
259
260        // SAFETY:
261        // - All offsets are sequential and non-overlapping (`n_elements` tracks running total).
262        // - Each `offset[i] + size[i]` equals `offset[i+1]` for all valid indices (including null
263        //   lists).
264        // - All elements referenced by (offset, size) pairs exist within the new `elements` array.
265        // - The validity array is preserved from the original array unchanged
266        // - The array satisfies the zero-copy-to-list property by having sorted offsets, no gaps,
267        //   and no overlaps.
268        Ok(unsafe {
269            ListViewArray::new_unchecked(elements, offsets, sizes, self.validity.clone())
270                .with_zero_copy_to_list(true)
271        })
272    }
273
274    /// Rebuilds a [`ListViewArray`] by trimming any unused / unreferenced leading and trailing
275    /// elements, which is defined as a contiguous run of values in the `elements` array that are
276    /// not referecened by any views in the corresponding [`ListViewArray`].
277    fn rebuild_trim_elements(&self) -> VortexResult<ListViewArray> {
278        let start = if self.is_zero_copy_to_list() {
279            // If offsets are sorted, then the minimum offset is the first offset.
280            // Note that even if the first view is null, offsets must always be valid, so it is
281            // completely fine for us to use this as a lower-bounded start of the `elements`.
282            self.offset_at(0)
283        } else {
284            self.offsets().statistics().compute_min().vortex_expect(
285                "[ListViewArray::rebuild]: `offsets` must report min statistic that is a `usize`",
286            )
287        };
288
289        let end = if self.is_zero_copy_to_list() {
290            // If offsets are sorted and there are no overlaps (views are always "increasing"), we
291            // can just grab the last offset and last size.
292            let last_offset = self.offset_at(self.len() - 1);
293            let last_size = self.size_at(self.len() - 1);
294            last_offset + last_size
295        } else {
296            let min_max = compute::min_max(
297                &self
298                    .offsets()
299                    .clone()
300                    .binary(self.sizes().clone(), Operator::Add)
301                    .vortex_expect("`offsets + sizes` somehow overflowed"),
302            )
303            .vortex_expect("Something went wrong while computing min and max")
304            .vortex_expect("We checked that the array was not empty in the top-level `rebuild`");
305
306            min_max
307                .max
308                .as_primitive()
309                .as_::<usize>()
310                .vortex_expect("unable to interpret the max `offset + size` as a `usize`")
311        };
312
313        let adjusted_offsets = match_each_integer_ptype!(self.offsets().dtype().as_ptype(), |O| {
314            let offset = <O as FromPrimitive>::from_usize(start)
315                .vortex_expect("unable to convert the min offset `start` into a `usize`");
316            let scalar = Scalar::primitive(offset, Nullability::NonNullable);
317
318            self.offsets()
319                .to_array()
320                .binary(
321                    ConstantArray::new(scalar, self.offsets().len()).into_array(),
322                    Operator::Sub,
323                )
324                .vortex_expect("was somehow unable to adjust offsets down by their minimum")
325        });
326
327        let sliced_elements = self.elements().slice(start..end)?;
328
329        // SAFETY: The only thing we changed was the elements (which we verify through mins and
330        // maxes that all adjusted offsets + sizes are within the correct bounds), so the parameters
331        // are valid. And if the original array was zero-copyable to list, trimming elements doesn't
332        // change that property.
333        Ok(unsafe {
334            ListViewArray::new_unchecked(
335                sliced_elements,
336                adjusted_offsets,
337                self.sizes().clone(),
338                self.validity().clone(),
339            )
340            .with_zero_copy_to_list(self.is_zero_copy_to_list())
341        })
342    }
343
344    fn rebuild_make_exact(&self) -> VortexResult<ListViewArray> {
345        if self.is_zero_copy_to_list() {
346            self.rebuild_trim_elements()
347        } else {
348            // When we completely rebuild the `ListViewArray`, we get the benefit that we also trim
349            // any leading and trailing garbage data.
350            self.rebuild_zero_copy_to_list()
351        }
352    }
353}
354
355#[cfg(test)]
356#[allow(clippy::cast_possible_truncation)]
357mod tests {
358    use vortex_buffer::BitBuffer;
359    use vortex_error::VortexResult;
360
361    use super::ListViewRebuildMode;
362    use crate::IntoArray;
363    use crate::ToCanonical;
364    use crate::arrays::ListViewArray;
365    use crate::arrays::PrimitiveArray;
366    use crate::assert_arrays_eq;
367    use crate::dtype::Nullability;
368    use crate::validity::Validity;
369    use crate::vtable::ValidityHelper;
370
371    #[test]
372    fn test_rebuild_flatten_removes_overlaps() -> VortexResult<()> {
373        // Create a list view with overlapping lists: [A, B, C]
374        // List 0: offset=0, size=3 -> [A, B, C]
375        // List 1: offset=1, size=2 -> [B, C] (overlaps with List 0)
376        let elements = PrimitiveArray::from_iter(vec![1i32, 2, 3]).into_array();
377        let offsets = PrimitiveArray::from_iter(vec![0u32, 1]).into_array();
378        let sizes = PrimitiveArray::from_iter(vec![3u32, 2]).into_array();
379
380        let listview = ListViewArray::new(elements, offsets, sizes, Validity::NonNullable);
381
382        let flattened = listview.rebuild(ListViewRebuildMode::MakeZeroCopyToList)?;
383
384        // After flatten: elements should be [A, B, C, B, C] = [1, 2, 3, 2, 3]
385        // Lists should be sequential with no overlaps
386        assert_eq!(flattened.elements().len(), 5);
387
388        // Offsets should be sequential
389        assert_eq!(flattened.offset_at(0), 0);
390        assert_eq!(flattened.size_at(0), 3);
391        assert_eq!(flattened.offset_at(1), 3);
392        assert_eq!(flattened.size_at(1), 2);
393
394        // Verify the data is correct
395        assert_arrays_eq!(
396            flattened.list_elements_at(0).unwrap(),
397            PrimitiveArray::from_iter([1i32, 2, 3])
398        );
399
400        assert_arrays_eq!(
401            flattened.list_elements_at(1).unwrap(),
402            PrimitiveArray::from_iter([2i32, 3])
403        );
404        Ok(())
405    }
406
407    #[test]
408    fn test_rebuild_flatten_with_nullable() -> VortexResult<()> {
409        use crate::arrays::BoolArray;
410
411        // Create a nullable list view with a null list
412        let elements = PrimitiveArray::from_iter(vec![1i32, 2, 3]).into_array();
413        let offsets = PrimitiveArray::from_iter(vec![0u32, 1, 2]).into_array();
414        let sizes = PrimitiveArray::from_iter(vec![2u32, 1, 1]).into_array();
415        let validity = Validity::Array(
416            BoolArray::new(
417                BitBuffer::from(vec![true, false, true]),
418                Validity::NonNullable,
419            )
420            .into_array(),
421        );
422
423        let listview = ListViewArray::new(elements, offsets, sizes, validity);
424
425        let flattened = listview.rebuild(ListViewRebuildMode::MakeZeroCopyToList)?;
426
427        // Verify nullability is preserved
428        assert_eq!(flattened.dtype().nullability(), Nullability::Nullable);
429        assert!(flattened.validity().is_valid(0).unwrap());
430        assert!(!flattened.validity().is_valid(1).unwrap());
431        assert!(flattened.validity().is_valid(2).unwrap());
432
433        // Verify valid lists contain correct data
434        assert_arrays_eq!(
435            flattened.list_elements_at(0).unwrap(),
436            PrimitiveArray::from_iter([1i32, 2])
437        );
438
439        assert_arrays_eq!(
440            flattened.list_elements_at(2).unwrap(),
441            PrimitiveArray::from_iter([3i32])
442        );
443        Ok(())
444    }
445
446    #[test]
447    fn test_rebuild_trim_elements_basic() -> VortexResult<()> {
448        // Test trimming both leading and trailing unused elements while preserving gaps in the
449        // middle.
450        // Elements: [_, _, A, B, _, C, D, _, _]
451        //            0  1  2  3  4  5  6  7  8
452        // List 0: offset=2, size=2 -> [A, B]
453        // List 1: offset=5, size=2 -> [C, D]
454        // Should trim to: [A, B, _, C, D] with adjusted offsets.
455        let elements =
456            PrimitiveArray::from_iter(vec![99i32, 98, 1, 2, 97, 3, 4, 96, 95]).into_array();
457        let offsets = PrimitiveArray::from_iter(vec![2u32, 5]).into_array();
458        let sizes = PrimitiveArray::from_iter(vec![2u32, 2]).into_array();
459
460        let listview = ListViewArray::new(elements, offsets, sizes, Validity::NonNullable);
461
462        let trimmed = listview.rebuild(ListViewRebuildMode::TrimElements)?;
463
464        // After trimming: elements should be [A, B, _, C, D] = [1, 2, 97, 3, 4].
465        assert_eq!(trimmed.elements().len(), 5);
466
467        // Offsets should be adjusted: old offset 2 -> new offset 0, old offset 5 -> new offset 3.
468        assert_eq!(trimmed.offset_at(0), 0);
469        assert_eq!(trimmed.size_at(0), 2);
470        assert_eq!(trimmed.offset_at(1), 3);
471        assert_eq!(trimmed.size_at(1), 2);
472
473        // Verify the data is correct.
474        assert_arrays_eq!(
475            trimmed.list_elements_at(0).unwrap(),
476            PrimitiveArray::from_iter([1i32, 2])
477        );
478
479        assert_arrays_eq!(
480            trimmed.list_elements_at(1).unwrap(),
481            PrimitiveArray::from_iter([3i32, 4])
482        );
483
484        // Note that element at index 2 (97) is preserved as a gap.
485        let all_elements = trimmed.elements().to_primitive();
486        assert_eq!(all_elements.scalar_at(2).unwrap(), 97i32.into());
487        Ok(())
488    }
489
490    #[test]
491    fn test_rebuild_with_trailing_nulls_regression() -> VortexResult<()> {
492        // Regression test for issue #5412
493        // Tests that zero-copy-to-list arrays with trailing NULLs correctly calculate
494        // offsets for NULL items to maintain no-overlap invariant
495
496        // Create a ListViewArray with trailing NULLs
497        let elements = PrimitiveArray::from_iter(vec![1i32, 2, 3, 4]).into_array();
498        let offsets = PrimitiveArray::from_iter(vec![0u32, 2, 0, 0]).into_array();
499        let sizes = PrimitiveArray::from_iter(vec![2u32, 2, 0, 0]).into_array();
500        let validity = Validity::from_iter(vec![true, true, false, false]);
501
502        let listview = ListViewArray::new(elements, offsets, sizes, validity);
503
504        // First rebuild to make it zero-copy-to-list
505        let rebuilt = listview.rebuild(ListViewRebuildMode::MakeZeroCopyToList)?;
506        assert!(rebuilt.is_zero_copy_to_list());
507
508        // Verify NULL items have correct offsets (should not reuse previous offsets)
509        // After rebuild: offsets should be [0, 2, 4, 4] for zero-copy-to-list
510        assert_eq!(rebuilt.offset_at(0), 0);
511        assert_eq!(rebuilt.offset_at(1), 2);
512        assert_eq!(rebuilt.offset_at(2), 4); // NULL should be at position 4
513        assert_eq!(rebuilt.offset_at(3), 4); // Second NULL also at position 4
514
515        // All sizes should be correct
516        assert_eq!(rebuilt.size_at(0), 2);
517        assert_eq!(rebuilt.size_at(1), 2);
518        assert_eq!(rebuilt.size_at(2), 0); // NULL has size 0
519        assert_eq!(rebuilt.size_at(3), 0); // NULL has size 0
520
521        // Now rebuild with MakeExact (which calls naive_rebuild then trim_elements)
522        // This should not panic (issue #5412)
523        let exact = rebuilt.rebuild(ListViewRebuildMode::MakeExact)?;
524
525        // Verify the result is still valid
526        assert!(exact.is_valid(0).unwrap());
527        assert!(exact.is_valid(1).unwrap());
528        assert!(!exact.is_valid(2).unwrap());
529        assert!(!exact.is_valid(3).unwrap());
530
531        // Verify data is preserved
532        assert_arrays_eq!(
533            exact.list_elements_at(0).unwrap(),
534            PrimitiveArray::from_iter([1i32, 2])
535        );
536
537        assert_arrays_eq!(
538            exact.list_elements_at(1).unwrap(),
539            PrimitiveArray::from_iter([3i32, 4])
540        );
541        Ok(())
542    }
543
544    // ── should_use_take heuristic tests ────────────────────────────────────
545
546    #[test]
547    fn heuristic_zero_lists_uses_take() {
548        assert!(ListViewArray::should_use_take(0, 0));
549    }
550
551    #[test]
552    fn heuristic_small_lists_use_take() {
553        // avg = 127 → take
554        assert!(ListViewArray::should_use_take(127_000, 1_000));
555        // avg = 128 → LBL
556        assert!(!ListViewArray::should_use_take(128_000, 1_000));
557    }
558}