Skip to main content

vortex_array/arrays/listview/
rebuild.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use num_traits::FromPrimitive;
5use vortex_buffer::BufferMut;
6use vortex_error::VortexExpect;
7use vortex_error::VortexResult;
8
9use crate::IntoArray;
10use crate::LEGACY_SESSION;
11#[expect(deprecated)]
12use crate::ToCanonical as _;
13use crate::VortexSessionExecute;
14use crate::arrays::ConstantArray;
15use crate::arrays::ListViewArray;
16use crate::arrays::PrimitiveArray;
17use crate::arrays::listview::ListViewArrayExt;
18use crate::arrays::primitive::PrimitiveArrayExt;
19use crate::builders::builder_with_capacity;
20use crate::builtins::ArrayBuiltins;
21use crate::dtype::IntegerPType;
22use crate::dtype::Nullability;
23use crate::dtype::PType;
24use crate::match_each_integer_ptype;
25use crate::match_each_unsigned_integer_ptype;
26use crate::scalar::Scalar;
27use crate::scalar_fn::fns::operators::Operator;
28use crate::validity::Validity;
29
30/// The widened offset type used for a rebuilt, zero-copy-to-list array.
31///
32/// Offsets are widened to at least 32 bits (Arrow only permits 32/64-bit list offsets) while
33/// preserving signedness, since a signed result keeps the array zero-copyable to Arrow's
34/// `ListArray`.
35fn rebuilt_offset_ptype(offsets_ptype: PType) -> PType {
36    match offsets_ptype {
37        PType::U8 | PType::U16 | PType::U32 => PType::U32,
38        PType::U64 => PType::U64,
39        PType::I8 | PType::I16 | PType::I32 => PType::I32,
40        PType::I64 => PType::I64,
41        _ => unreachable!("invalid offsets PType"),
42    }
43}
44
45/// Density threshold to decide whether to rebuild a sparse `ListViewArray`.
46///
47/// A `ListViewArray` can accumulate unreferenced bytes in its `elements` buffer after
48/// metadata-only operations like `take` and `filter`. When density (referenced fraction of `elements`)
49/// falls below this threshold, the benefits of a rebuild may outweigh its cost.
50///
51/// This is a somewhat arbitrary rule-of-thumb and may be suboptimal depending on different use cases and
52/// list element dtypes.
53pub const DEFAULT_REBUILD_DENSITY_THRESHOLD: f32 = 0.1;
54
55/// Waste threshold to decide whether to trim a zero-copy-to-list `ListViewArray`.
56///
57/// A zero-copy-to-list array has no overlaps and no interior gaps, so its only unreferenced bytes
58/// are leading and trailing elements. Trimming those is much cheaper than a full rebuild (a lazy
59/// `elements` slice plus an `O(num_lists)` offset adjustment, with no element data copy), so we use
60/// a more aggressive threshold than [`DEFAULT_REBUILD_DENSITY_THRESHOLD`].
61///
62/// When the unreferenced (leading + trailing) fraction of `elements` exceeds this threshold, we trim.
63pub const DEFAULT_TRIM_ELEMENTS_THRESHOLD: f32 = 0.05;
64
65/// Modes for rebuilding a [`ListViewArray`].
66pub enum ListViewRebuildMode {
67    /// Removes all unused data and flattens out all list data, such that the array is zero-copyable
68    /// to a [`ListArray`].
69    ///
70    /// This mode will deduplicate all overlapping list views, such that the [`ListViewArray`] looks
71    /// like a [`ListArray`] but with an additional `sizes` array.
72    ///
73    /// [`ListArray`]: crate::arrays::ListArray
74    MakeZeroCopyToList,
75
76    /// Removes any leading or trailing elements that are unused / not referenced by any views in
77    /// the [`ListViewArray`].
78    ///
79    /// If the referenced `[start, end)` bounds are already known, prefer calling
80    /// [`trim_elements`](ListViewArray::trim_elements) directly to avoid recomputing them.
81    TrimElements,
82
83    /// Equivalent to `MakeZeroCopyToList` plus `TrimElements`.
84    ///
85    /// This is useful when concatenating multiple [`ListViewArray`]s together to create a new
86    /// [`ListViewArray`] that is also zero-copy to a [`ListArray`].
87    ///
88    /// [`ListArray`]: crate::arrays::ListArray
89    MakeExact,
90
91    // TODO(connor)[ListView]: Implement some version of this.
92    /// Finds the shortest packing / overlapping of list elements.
93    ///
94    /// This problem is known to be NP-hard, so maybe when someone proves that P=NP we can implement
95    /// this algorithm (but in all seriousness there are many approximate algorithms that could
96    /// work well here).
97    OverlapCompression,
98}
99
100impl ListViewArray {
101    /// Rebuilds the [`ListViewArray`] according to the specified mode.
102    pub fn rebuild(&self, mode: ListViewRebuildMode) -> VortexResult<ListViewArray> {
103        if self.is_empty() {
104            // SAFETY: An empty array is trivially zero-copyable to a `ListArray`.
105            return Ok(unsafe { self.clone().with_zero_copy_to_list(true) });
106        }
107
108        match mode {
109            ListViewRebuildMode::MakeZeroCopyToList => self.rebuild_zero_copy_to_list(),
110            ListViewRebuildMode::TrimElements => self.rebuild_trim_elements(),
111            ListViewRebuildMode::MakeExact => self.rebuild_make_exact(),
112            ListViewRebuildMode::OverlapCompression => unimplemented!("Does P=NP?"),
113        }
114    }
115
116    /// Rebuilds a [`ListViewArray`], removing all data overlaps and creating a flattened layout.
117    ///
118    /// This is useful when the `elements` child array of the [`ListViewArray`] might have
119    /// overlapping, duplicate, and garbage data, and we want to have fully sequential data like
120    /// a [`ListArray`].
121    ///
122    /// [`ListArray`]: crate::arrays::ListArray
123    fn rebuild_zero_copy_to_list(&self) -> VortexResult<ListViewArray> {
124        if self.is_zero_copy_to_list() {
125            // Note that since everything in `ListViewArray` is `Arc`ed, this is quite cheap.
126            return Ok(self.clone());
127        }
128
129        let offsets_ptype = self.offsets().dtype().as_ptype();
130        let sizes_ptype = self.sizes().dtype().as_ptype();
131
132        // One of the main purposes behind adding this "zero-copyable to `ListArray`" optimization
133        // is that we want to pass data to systems that expect Arrow data.
134        // The arrow specification only allows for `i32` and `i64` offset and sizes types, so in
135        // order to also make `ListView` zero-copyable to **Arrow**'s `ListArray` (not just Vortex's
136        // `ListArray`), we rebuild the offsets as 32-bit or 64-bit integer types.
137        // TODO(connor)[ListView]: This is true for `sizes` as well, we should do this conversion
138        // for sizes as well.
139        match_each_unsigned_integer_ptype!(sizes_ptype.to_unsigned(), |S| {
140            match offsets_ptype.to_unsigned() {
141                PType::U8 => self.naive_rebuild::<u8, u32, S>(),
142                PType::U16 => self.naive_rebuild::<u16, u32, S>(),
143                PType::U32 => self.naive_rebuild::<u32, u32, S>(),
144                PType::U64 => self.naive_rebuild::<u64, u64, S>(),
145                _ => unreachable!("invalid offsets PType"),
146            }
147        })
148    }
149
150    /// Picks between [`rebuild_with_take`](Self::rebuild_with_take) and
151    /// [`rebuild_list_by_list`](Self::rebuild_list_by_list) based on element dtype and average
152    /// list size.
153    fn naive_rebuild<O: IntegerPType, NewOffset: IntegerPType, S: IntegerPType>(
154        &self,
155    ) -> VortexResult<ListViewArray> {
156        #[expect(deprecated)]
157        let sizes_canonical = self.sizes().to_primitive();
158        let sizes_canonical =
159            sizes_canonical.reinterpret_cast(sizes_canonical.ptype().to_unsigned());
160        let total: u64 = sizes_canonical
161            .as_slice::<S>()
162            .iter()
163            .map(|s| (*s).as_() as u64)
164            .sum();
165        if Self::should_use_take(total, self.len()) {
166            self.rebuild_with_take::<O, NewOffset, S>()
167        } else {
168            self.rebuild_list_by_list::<O, NewOffset, S>()
169        }
170    }
171
172    /// Returns `true` when we are confident that `rebuild_with_take` will
173    /// outperform `rebuild_list_by_list`.
174    ///
175    /// Take is dramatically faster for small lists (often 10-100×) because it
176    /// avoids per-list builder overhead. LBL is the safer default for larger
177    /// lists since its sequential memcpy scales well. We only choose take when
178    /// the average list size is small enough that take clearly dominates.
179    fn should_use_take(total_output_elements: u64, num_lists: usize) -> bool {
180        if num_lists == 0 {
181            return true;
182        }
183        let avg = total_output_elements / num_lists as u64;
184        avg < 128
185    }
186
187    /// Rebuilds elements using a single bulk `take`: collect all element indices into a flat
188    /// `BufferMut<u64>`, perform a single `take`.
189    fn rebuild_with_take<O: IntegerPType, NewOffset: IntegerPType, S: IntegerPType>(
190        &self,
191    ) -> VortexResult<ListViewArray> {
192        let new_offset_ptype = rebuilt_offset_ptype(self.offsets().dtype().as_ptype());
193        let size_ptype = self.sizes().dtype().as_ptype();
194
195        #[expect(deprecated)]
196        let offsets_canonical = self.offsets().to_primitive();
197        let offsets_canonical =
198            offsets_canonical.reinterpret_cast(offsets_canonical.ptype().to_unsigned());
199        let offsets_slice = offsets_canonical.as_slice::<O>();
200        #[expect(deprecated)]
201        let sizes_canonical = self.sizes().to_primitive();
202        let sizes_canonical =
203            sizes_canonical.reinterpret_cast(sizes_canonical.ptype().to_unsigned());
204        let sizes_slice = sizes_canonical.as_slice::<S>();
205
206        let len = offsets_slice.len();
207
208        let mut new_offsets = BufferMut::<NewOffset>::with_capacity(len);
209        let mut new_sizes = BufferMut::<S>::with_capacity(len);
210        let mut take_indices = BufferMut::<u64>::with_capacity(self.elements().len());
211
212        let mut n_elements = NewOffset::zero();
213        for index in 0..len {
214            if !self.validity()?.is_valid(index)? {
215                new_offsets.push(n_elements);
216                new_sizes.push(S::zero());
217                continue;
218            }
219
220            let offset = offsets_slice[index];
221            let size = sizes_slice[index];
222            let start = offset.as_();
223            let stop = start + size.as_();
224
225            new_offsets.push(n_elements);
226            new_sizes.push(size);
227            take_indices.extend(start as u64..stop as u64);
228            n_elements += num_traits::cast(size).vortex_expect("Cast failed");
229        }
230
231        let elements = self.elements().take(take_indices.into_array())?;
232        // Built unsigned; reinterpret back to the signed-preserving result types.
233        let offsets = PrimitiveArray::new(new_offsets.freeze(), Validity::NonNullable)
234            .reinterpret_cast(new_offset_ptype)
235            .into_array();
236        let sizes = PrimitiveArray::new(new_sizes.freeze(), Validity::NonNullable)
237            .reinterpret_cast(size_ptype)
238            .into_array();
239
240        // SAFETY: same invariants as `rebuild_list_by_list` — offsets are sequential and
241        // non-overlapping, all (offset, size) pairs reference valid elements, and the validity
242        // array is preserved from the original.
243        Ok(unsafe {
244            ListViewArray::new_unchecked(elements, offsets, sizes, self.validity()?)
245                .with_zero_copy_to_list(true)
246        })
247    }
248
249    /// Rebuilds elements list-by-list: canonicalize elements upfront, then for each list `slice`
250    /// the relevant range and `extend_from_array` into a typed builder.
251    fn rebuild_list_by_list<O: IntegerPType, NewOffset: IntegerPType, S: IntegerPType>(
252        &self,
253    ) -> VortexResult<ListViewArray> {
254        let element_dtype = self
255            .dtype()
256            .as_list_element_opt()
257            .vortex_expect("somehow had a canonical list that was not a list");
258
259        let new_offset_ptype = rebuilt_offset_ptype(self.offsets().dtype().as_ptype());
260        let size_ptype = self.sizes().dtype().as_ptype();
261
262        #[expect(deprecated)]
263        let offsets_canonical = self.offsets().to_primitive();
264        let offsets_canonical =
265            offsets_canonical.reinterpret_cast(offsets_canonical.ptype().to_unsigned());
266        let offsets_slice = offsets_canonical.as_slice::<O>();
267        #[expect(deprecated)]
268        let sizes_canonical = self.sizes().to_primitive();
269        let sizes_canonical =
270            sizes_canonical.reinterpret_cast(sizes_canonical.ptype().to_unsigned());
271        let sizes_slice = sizes_canonical.as_slice::<S>();
272
273        let len = offsets_slice.len();
274
275        let mut new_offsets = BufferMut::<NewOffset>::with_capacity(len);
276        // TODO(connor)[ListView]: Do we really need to do this?
277        // The only reason we need to rebuild the sizes here is that the validity may indicate that
278        // a list is null even though it has a non-zero size. This rebuild will set the size of all
279        // null lists to 0.
280        let mut new_sizes = BufferMut::<S>::with_capacity(len);
281
282        // Canonicalize the elements up front as we will be slicing the elements quite a lot.
283        #[expect(deprecated)]
284        let elements_canonical = self
285            .elements()
286            .to_canonical()
287            .vortex_expect("canonicalize elements for rebuild")
288            .into_array();
289
290        // Note that we do not know what the exact capacity should be of the new elements since
291        // there could be overlaps in the existing `ListViewArray`.
292        let mut new_elements_builder =
293            builder_with_capacity(element_dtype.as_ref(), self.elements().len());
294
295        let mut n_elements = NewOffset::zero();
296        for index in 0..len {
297            if !self.validity()?.is_valid(index)? {
298                // For NULL lists, place them after the previous item's data to maintain the
299                // no-overlap invariant for zero-copy to `ListArray` arrays.
300                new_offsets.push(n_elements);
301                new_sizes.push(S::zero());
302                continue;
303            }
304
305            let offset = offsets_slice[index];
306            let size = sizes_slice[index];
307
308            let start = offset.as_();
309            let stop = start + size.as_();
310
311            new_offsets.push(n_elements);
312            new_sizes.push(size);
313            new_elements_builder.extend_from_array(&elements_canonical.slice(start..stop)?);
314
315            n_elements += num_traits::cast(size).vortex_expect("Cast failed");
316        }
317
318        // Built unsigned; reinterpret back to the signed-preserving result types.
319        let offsets = PrimitiveArray::new(new_offsets.freeze(), Validity::NonNullable)
320            .reinterpret_cast(new_offset_ptype)
321            .into_array();
322        let sizes = PrimitiveArray::new(new_sizes.freeze(), Validity::NonNullable)
323            .reinterpret_cast(size_ptype)
324            .into_array();
325        let elements = new_elements_builder.finish();
326
327        debug_assert_eq!(
328            n_elements.as_(),
329            elements.len(),
330            "The accumulated elements somehow had the wrong length"
331        );
332
333        // SAFETY:
334        // - All offsets are sequential and non-overlapping (`n_elements` tracks running total).
335        // - Each `offset[i] + size[i]` equals `offset[i+1]` for all valid indices (including null
336        //   lists).
337        // - All elements referenced by (offset, size) pairs exist within the new `elements` array.
338        // - The validity array is preserved from the original array unchanged
339        // - The array satisfies the zero-copy-to-list property by having sorted offsets, no gaps,
340        //   and no overlaps.
341        Ok(unsafe {
342            ListViewArray::new_unchecked(elements, offsets, sizes, self.validity()?)
343                .with_zero_copy_to_list(true)
344        })
345    }
346
347    /// Rebuilds a [`ListViewArray`] by trimming any unused / unreferenced leading and trailing
348    /// elements, which is defined as a contiguous run of values in the `elements` array that are
349    /// not referenced by any views in the corresponding [`ListViewArray`].
350    fn rebuild_trim_elements(&self) -> VortexResult<ListViewArray> {
351        let mut ctx = LEGACY_SESSION.create_execution_ctx();
352        let (start, end) = self.referenced_element_bounds(&mut ctx)?;
353
354        // SAFETY: we calculated valid start and end bounds
355        unsafe { self.trim_elements(start, end) }
356    }
357
358    /// Unsafely trims `elements` to the referenced half-open range `[start, end)`, adjusting every offset
359    /// down by `start`. The result preserves the original `is_zero_copy_to_list` flag.
360    ///
361    /// # SAFETY
362    ///
363    /// `start` must be the minimum value of `offsets`, and end should be the maximum value of `offsets[i] + size[i]`
364    /// over all indices `i` of offsets. Otherwise, `offsets` and `sizes` may hold references to elements that no longer exist
365    /// and the array will be corrupted.
366    pub unsafe fn trim_elements(&self, start: usize, end: usize) -> VortexResult<ListViewArray> {
367        let adjusted_offsets = match_each_integer_ptype!(self.offsets().dtype().as_ptype(), |O| {
368            let offset = <O as FromPrimitive>::from_usize(start)
369                .vortex_expect("unable to convert the min offset `start` into a `usize`");
370            let scalar = Scalar::primitive(offset, Nullability::NonNullable);
371
372            self.offsets()
373                .clone()
374                .binary(
375                    ConstantArray::new(scalar, self.offsets().len()).into_array(),
376                    Operator::Sub,
377                )
378                .vortex_expect("was somehow unable to adjust offsets down by their minimum")
379        });
380
381        let sliced_elements = self.elements().slice(start..end)?;
382
383        // SAFETY: The only thing we changed was the elements (which we verify through mins and
384        // maxes that all adjusted offsets + sizes are within the correct bounds), so the parameters
385        // are valid. And if the original array was zero-copyable to list, trimming elements doesn't
386        // change that property.
387        Ok(unsafe {
388            ListViewArray::new_unchecked(
389                sliced_elements,
390                adjusted_offsets,
391                self.sizes().clone(),
392                self.validity()?,
393            )
394            .with_zero_copy_to_list(self.is_zero_copy_to_list())
395        })
396    }
397
398    fn rebuild_make_exact(&self) -> VortexResult<ListViewArray> {
399        if self.is_zero_copy_to_list() {
400            self.rebuild_trim_elements()
401        } else {
402            // When we completely rebuild the `ListViewArray`, we get the benefit that we also trim
403            // any leading and trailing garbage data.
404            self.rebuild_zero_copy_to_list()
405        }
406    }
407}
408
409#[cfg(test)]
410mod tests {
411    use vortex_buffer::BitBuffer;
412    use vortex_error::VortexResult;
413
414    use super::ListViewRebuildMode;
415    use crate::IntoArray;
416    use crate::LEGACY_SESSION;
417    #[expect(deprecated)]
418    use crate::ToCanonical as _;
419    use crate::VortexSessionExecute;
420    use crate::arrays::ListViewArray;
421    use crate::arrays::PrimitiveArray;
422    use crate::arrays::listview::ListViewArrayExt;
423    use crate::assert_arrays_eq;
424    use crate::dtype::Nullability;
425    use crate::validity::Validity;
426
427    #[test]
428    fn test_rebuild_flatten_removes_overlaps() -> VortexResult<()> {
429        // Create a list view with overlapping lists: [A, B, C]
430        // List 0: offset=0, size=3 -> [A, B, C]
431        // List 1: offset=1, size=2 -> [B, C] (overlaps with List 0)
432        let elements = PrimitiveArray::from_iter(vec![1i32, 2, 3]).into_array();
433        let offsets = PrimitiveArray::from_iter(vec![0u32, 1]).into_array();
434        let sizes = PrimitiveArray::from_iter(vec![3u32, 2]).into_array();
435
436        let listview = ListViewArray::new(elements, offsets, sizes, Validity::NonNullable);
437
438        let flattened = listview.rebuild(ListViewRebuildMode::MakeZeroCopyToList)?;
439
440        // After flatten: elements should be [A, B, C, B, C] = [1, 2, 3, 2, 3]
441        // Lists should be sequential with no overlaps
442        assert_eq!(flattened.elements().len(), 5);
443
444        // Offsets should be sequential
445        assert_eq!(flattened.offset_at(0), 0);
446        assert_eq!(flattened.size_at(0), 3);
447        assert_eq!(flattened.offset_at(1), 3);
448        assert_eq!(flattened.size_at(1), 2);
449
450        // Verify the data is correct
451        assert_arrays_eq!(
452            flattened.list_elements_at(0)?,
453            PrimitiveArray::from_iter([1i32, 2, 3])
454        );
455
456        assert_arrays_eq!(
457            flattened.list_elements_at(1)?,
458            PrimitiveArray::from_iter([2i32, 3])
459        );
460        Ok(())
461    }
462
463    #[test]
464    fn test_rebuild_flatten_with_nullable() -> VortexResult<()> {
465        use crate::arrays::BoolArray;
466
467        // Create a nullable list view with a null list
468        let elements = PrimitiveArray::from_iter(vec![1i32, 2, 3]).into_array();
469        let offsets = PrimitiveArray::from_iter(vec![0u32, 1, 2]).into_array();
470        let sizes = PrimitiveArray::from_iter(vec![2u32, 1, 1]).into_array();
471        let validity = Validity::Array(
472            BoolArray::new(
473                BitBuffer::from(vec![true, false, true]),
474                Validity::NonNullable,
475            )
476            .into_array(),
477        );
478
479        let listview = ListViewArray::new(elements, offsets, sizes, validity);
480
481        let flattened = listview.rebuild(ListViewRebuildMode::MakeZeroCopyToList)?;
482
483        // Verify nullability is preserved
484        assert_eq!(flattened.dtype().nullability(), Nullability::Nullable);
485        assert!(flattened.validity()?.is_valid(0)?);
486        assert!(!flattened.validity()?.is_valid(1)?);
487        assert!(flattened.validity()?.is_valid(2)?);
488
489        // Verify valid lists contain correct data
490        assert_arrays_eq!(
491            flattened.list_elements_at(0)?,
492            PrimitiveArray::from_iter([1i32, 2])
493        );
494
495        assert_arrays_eq!(
496            flattened.list_elements_at(2)?,
497            PrimitiveArray::from_iter([3i32])
498        );
499        Ok(())
500    }
501
502    #[test]
503    fn test_rebuild_trim_elements_basic() -> VortexResult<()> {
504        // Test trimming both leading and trailing unused elements while preserving gaps in the
505        // middle.
506        // Elements: [_, _, A, B, _, C, D, _, _]
507        //            0  1  2  3  4  5  6  7  8
508        // List 0: offset=2, size=2 -> [A, B]
509        // List 1: offset=5, size=2 -> [C, D]
510        // Should trim to: [A, B, _, C, D] with adjusted offsets.
511        let elements =
512            PrimitiveArray::from_iter(vec![99i32, 98, 1, 2, 97, 3, 4, 96, 95]).into_array();
513        let offsets = PrimitiveArray::from_iter(vec![2u32, 5]).into_array();
514        let sizes = PrimitiveArray::from_iter(vec![2u32, 2]).into_array();
515
516        let listview = ListViewArray::new(elements, offsets, sizes, Validity::NonNullable);
517
518        let trimmed = listview.rebuild(ListViewRebuildMode::TrimElements)?;
519
520        // After trimming: elements should be [A, B, _, C, D] = [1, 2, 97, 3, 4].
521        assert_eq!(trimmed.elements().len(), 5);
522
523        // Offsets should be adjusted: old offset 2 -> new offset 0, old offset 5 -> new offset 3.
524        assert_eq!(trimmed.offset_at(0), 0);
525        assert_eq!(trimmed.size_at(0), 2);
526        assert_eq!(trimmed.offset_at(1), 3);
527        assert_eq!(trimmed.size_at(1), 2);
528
529        // Verify the data is correct.
530        assert_arrays_eq!(
531            trimmed.list_elements_at(0)?,
532            PrimitiveArray::from_iter([1i32, 2])
533        );
534
535        assert_arrays_eq!(
536            trimmed.list_elements_at(1)?,
537            PrimitiveArray::from_iter([3i32, 4])
538        );
539
540        // Note that element at index 2 (97) is preserved as a gap.
541        #[expect(deprecated)]
542        let all_elements = trimmed.elements().to_primitive();
543        assert_eq!(
544            all_elements.execute_scalar(2, &mut LEGACY_SESSION.create_execution_ctx())?,
545            97i32.into()
546        );
547        Ok(())
548    }
549
550    #[test]
551    fn test_rebuild_with_trailing_nulls_regression() -> VortexResult<()> {
552        // Regression test for issue #5412
553        // Tests that zero-copy-to-list arrays with trailing NULLs correctly calculate
554        // offsets for NULL items to maintain no-overlap invariant
555
556        // Create a ListViewArray with trailing NULLs
557        let elements = PrimitiveArray::from_iter(vec![1i32, 2, 3, 4]).into_array();
558        let offsets = PrimitiveArray::from_iter(vec![0u32, 2, 0, 0]).into_array();
559        let sizes = PrimitiveArray::from_iter(vec![2u32, 2, 0, 0]).into_array();
560        let validity = Validity::from_iter(vec![true, true, false, false]);
561
562        let listview = ListViewArray::new(elements, offsets, sizes, validity);
563
564        // First rebuild to make it zero-copy-to-list
565        let rebuilt = listview.rebuild(ListViewRebuildMode::MakeZeroCopyToList)?;
566        assert!(rebuilt.is_zero_copy_to_list());
567
568        // Verify NULL items have correct offsets (should not reuse previous offsets)
569        // After rebuild: offsets should be [0, 2, 4, 4] for zero-copy-to-list
570        assert_eq!(rebuilt.offset_at(0), 0);
571        assert_eq!(rebuilt.offset_at(1), 2);
572        assert_eq!(rebuilt.offset_at(2), 4); // NULL should be at position 4
573        assert_eq!(rebuilt.offset_at(3), 4); // Second NULL also at position 4
574
575        // All sizes should be correct
576        assert_eq!(rebuilt.size_at(0), 2);
577        assert_eq!(rebuilt.size_at(1), 2);
578        assert_eq!(rebuilt.size_at(2), 0); // NULL has size 0
579        assert_eq!(rebuilt.size_at(3), 0); // NULL has size 0
580
581        // Now rebuild with MakeExact (which calls naive_rebuild then trim_elements)
582        // This should not panic (issue #5412)
583        let exact = rebuilt.rebuild(ListViewRebuildMode::MakeExact)?;
584
585        // Verify the result is still valid
586        assert!(exact.is_valid(0, &mut LEGACY_SESSION.create_execution_ctx())?);
587        assert!(exact.is_valid(1, &mut LEGACY_SESSION.create_execution_ctx())?);
588        assert!(!exact.is_valid(2, &mut LEGACY_SESSION.create_execution_ctx())?);
589        assert!(!exact.is_valid(3, &mut LEGACY_SESSION.create_execution_ctx())?);
590
591        // Verify data is preserved
592        assert_arrays_eq!(
593            exact.list_elements_at(0)?,
594            PrimitiveArray::from_iter([1i32, 2])
595        );
596
597        assert_arrays_eq!(
598            exact.list_elements_at(1)?,
599            PrimitiveArray::from_iter([3i32, 4])
600        );
601        Ok(())
602    }
603
604    /// Regression test for <https://github.com/vortex-data/vortex/issues/6773>.
605    /// u32 offsets exceed u16::MAX, so u16 sizes are widened to u32 for the add.
606    #[test]
607    fn test_rebuild_trim_elements_offsets_wider_than_sizes() -> VortexResult<()> {
608        let mut elems = vec![0i32; 70_005];
609        elems[70_000] = 10;
610        elems[70_001] = 20;
611        elems[70_002] = 30;
612        elems[70_003] = 40;
613        let elements = PrimitiveArray::from_iter(elems).into_array();
614        let offsets = PrimitiveArray::from_iter(vec![70_000u32, 70_002]).into_array();
615        let sizes = PrimitiveArray::from_iter(vec![2u16, 2]).into_array();
616
617        let listview = ListViewArray::new(elements, offsets, sizes, Validity::NonNullable);
618        let trimmed = listview.rebuild(ListViewRebuildMode::TrimElements)?;
619        assert_arrays_eq!(
620            trimmed.list_elements_at(1)?,
621            PrimitiveArray::from_iter([30i32, 40])
622        );
623        Ok(())
624    }
625
626    /// Regression test for <https://github.com/vortex-data/vortex/issues/6773>.
627    /// u32 sizes exceed u16::MAX, so u16 offsets are widened to u32 for the add.
628    #[test]
629    fn test_rebuild_trim_elements_sizes_wider_than_offsets() -> VortexResult<()> {
630        let mut elems = vec![0i32; 70_001];
631        elems[3] = 30;
632        elems[4] = 40;
633        let elements = PrimitiveArray::from_iter(elems).into_array();
634        let offsets = PrimitiveArray::from_iter(vec![1u16, 3]).into_array();
635        let sizes = PrimitiveArray::from_iter(vec![70_000u32, 2]).into_array();
636
637        let listview = ListViewArray::new(elements, offsets, sizes, Validity::NonNullable);
638        let trimmed = listview.rebuild(ListViewRebuildMode::TrimElements)?;
639        assert_arrays_eq!(
640            trimmed.list_elements_at(1)?,
641            PrimitiveArray::from_iter([30i32, 40])
642        );
643        Ok(())
644    }
645
646    /// Rebuild with signed offsets/sizes: exercises the unsigned-reinterpret read path and asserts
647    /// the result offset/size dtypes preserve signedness (widened to >=32-bit for offsets).
648    #[test]
649    fn test_rebuild_preserves_signed_offset_and_size_types() -> VortexResult<()> {
650        use crate::dtype::PType;
651
652        // Overlapping lists force an actual rebuild rather than the zero-copy fast path.
653        let elements = PrimitiveArray::from_iter(vec![1i32, 2, 3]).into_array();
654        let offsets = PrimitiveArray::from_iter(vec![0i32, 1]).into_array();
655        let sizes = PrimitiveArray::from_iter(vec![3i16, 2]).into_array();
656
657        let listview = ListViewArray::new(elements, offsets, sizes, Validity::NonNullable);
658        let rebuilt = listview.rebuild(ListViewRebuildMode::MakeZeroCopyToList)?;
659
660        // Values: [1,2,3] and [2,3].
661        assert_arrays_eq!(
662            rebuilt.list_elements_at(0)?,
663            PrimitiveArray::from_iter([1i32, 2, 3])
664        );
665        assert_arrays_eq!(
666            rebuilt.list_elements_at(1)?,
667            PrimitiveArray::from_iter([2i32, 3])
668        );
669
670        // Signed input -> signed result (offsets widened to i32, sizes kept i16).
671        assert_eq!(rebuilt.offsets().dtype().as_ptype(), PType::I32);
672        assert_eq!(rebuilt.sizes().dtype().as_ptype(), PType::I16);
673        Ok(())
674    }
675
676    // ── should_use_take heuristic tests ────────────────────────────────────
677
678    #[test]
679    fn heuristic_zero_lists_uses_take() {
680        assert!(ListViewArray::should_use_take(0, 0));
681    }
682
683    #[test]
684    fn heuristic_small_lists_use_take() {
685        // avg = 127 → take
686        assert!(ListViewArray::should_use_take(127_000, 1_000));
687        // avg = 128 → LBL
688        assert!(!ListViewArray::should_use_take(128_000, 1_000));
689    }
690
691    /// Regression test for <https://github.com/vortex-data/vortex/issues/6973>.
692    /// Both offsets and sizes are u8, and offset + size exceeds u8::MAX.
693    #[test]
694    fn test_rebuild_trim_elements_sum_overflows_type() -> VortexResult<()> {
695        let elements = PrimitiveArray::from_iter(vec![0i32; 261]).into_array();
696        let offsets = PrimitiveArray::from_iter(vec![215u8, 0]).into_array();
697        let sizes = PrimitiveArray::from_iter(vec![46u8, 10]).into_array();
698
699        let listview = ListViewArray::new(elements, offsets, sizes, Validity::NonNullable);
700        let trimmed = listview.rebuild(ListViewRebuildMode::TrimElements)?;
701
702        // min(offsets) = 0, so nothing to trim; output should equal input.
703        assert_arrays_eq!(trimmed, listview);
704        Ok(())
705    }
706}