Skip to main content

vortex_array/arrays/listview/
rebuild.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use num_traits::FromPrimitive;
5use vortex_buffer::BufferMut;
6use vortex_error::VortexExpect;
7use vortex_error::VortexResult;
8
9use crate::Canonical;
10use crate::ExecutionCtx;
11use crate::IntoArray;
12use crate::arrays::ConstantArray;
13use crate::arrays::ListViewArray;
14use crate::arrays::PrimitiveArray;
15use crate::arrays::listview::ListViewArrayExt;
16use crate::arrays::primitive::PrimitiveArrayExt;
17use crate::builders::builder_with_capacity;
18use crate::builtins::ArrayBuiltins;
19use crate::dtype::IntegerPType;
20use crate::dtype::Nullability;
21use crate::dtype::PType;
22use crate::match_each_integer_ptype;
23use crate::match_each_unsigned_integer_ptype;
24use crate::scalar::Scalar;
25use crate::scalar_fn::fns::operators::Operator;
26use crate::validity::Validity;
27
28/// The widened offset type used for a rebuilt, zero-copy-to-list array.
29///
30/// Offsets are widened to at least 32 bits (Arrow only permits 32/64-bit list offsets) while
31/// preserving signedness, since a signed result keeps the array zero-copyable to Arrow's
32/// `ListArray`.
33fn rebuilt_offset_ptype(offsets_ptype: PType) -> PType {
34    match offsets_ptype {
35        PType::U8 | PType::U16 | PType::U32 => PType::U32,
36        PType::U64 => PType::U64,
37        PType::I8 | PType::I16 | PType::I32 => PType::I32,
38        PType::I64 => PType::I64,
39        _ => unreachable!("invalid offsets PType"),
40    }
41}
42
43/// Density threshold to decide whether to rebuild a sparse `ListViewArray`.
44///
45/// A `ListViewArray` can accumulate unreferenced bytes in its `elements` buffer after
46/// metadata-only operations like `take` and `filter`. When density (referenced fraction of `elements`)
47/// falls below this threshold, the benefits of a rebuild may outweigh its cost.
48///
49/// This is a somewhat arbitrary rule-of-thumb and may be suboptimal depending on different use cases and
50/// list element dtypes.
51pub const DEFAULT_REBUILD_DENSITY_THRESHOLD: f32 = 0.1;
52
53/// Waste threshold to decide whether to trim a zero-copy-to-list `ListViewArray`.
54///
55/// A zero-copy-to-list array has no overlaps and no interior gaps, so its only unreferenced bytes
56/// are leading and trailing elements. Trimming those is much cheaper than a full rebuild (a lazy
57/// `elements` slice plus an `O(num_lists)` offset adjustment, with no element data copy), so we use
58/// a more aggressive threshold than [`DEFAULT_REBUILD_DENSITY_THRESHOLD`].
59///
60/// When the unreferenced (leading + trailing) fraction of `elements` exceeds this threshold, we trim.
61pub const DEFAULT_TRIM_ELEMENTS_THRESHOLD: f32 = 0.05;
62
63/// Modes for rebuilding a [`ListViewArray`].
64pub enum ListViewRebuildMode {
65    /// Removes all unused data and flattens out all list data, such that the array is zero-copyable
66    /// to a [`ListArray`].
67    ///
68    /// This mode will deduplicate all overlapping list views, such that the [`ListViewArray`] looks
69    /// like a [`ListArray`] but with an additional `sizes` array.
70    ///
71    /// [`ListArray`]: crate::arrays::ListArray
72    MakeZeroCopyToList,
73
74    /// Removes any leading or trailing elements that are unused / not referenced by any views in
75    /// the [`ListViewArray`].
76    ///
77    /// If the referenced `[start, end)` bounds are already known, prefer calling
78    /// [`trim_elements`](ListViewArray::trim_elements) directly to avoid recomputing them.
79    TrimElements,
80
81    /// Equivalent to `MakeZeroCopyToList` plus `TrimElements`.
82    ///
83    /// This is useful when concatenating multiple [`ListViewArray`]s together to create a new
84    /// [`ListViewArray`] that is also zero-copy to a [`ListArray`].
85    ///
86    /// [`ListArray`]: crate::arrays::ListArray
87    MakeExact,
88
89    // TODO(connor)[ListView]: Implement some version of this.
90    /// Finds the shortest packing / overlapping of list elements.
91    ///
92    /// This problem is known to be NP-hard, so maybe when someone proves that P=NP we can implement
93    /// this algorithm (but in all seriousness there are many approximate algorithms that could
94    /// work well here).
95    OverlapCompression,
96}
97
98impl ListViewArray {
99    /// Rebuilds the [`ListViewArray`] according to the specified mode.
100    pub fn rebuild(
101        &self,
102        mode: ListViewRebuildMode,
103        ctx: &mut ExecutionCtx,
104    ) -> VortexResult<ListViewArray> {
105        if self.is_empty() {
106            // SAFETY: An empty array is trivially zero-copyable to a `ListArray`.
107            return Ok(unsafe { self.clone().with_zero_copy_to_list(true) });
108        }
109
110        match mode {
111            ListViewRebuildMode::MakeZeroCopyToList => self.rebuild_zero_copy_to_list(ctx),
112            ListViewRebuildMode::TrimElements => self.rebuild_trim_elements(ctx),
113            ListViewRebuildMode::MakeExact => self.rebuild_make_exact(ctx),
114            ListViewRebuildMode::OverlapCompression => unimplemented!("Does P=NP?"),
115        }
116    }
117
118    /// Rebuilds a [`ListViewArray`], removing all data overlaps and creating a flattened layout.
119    ///
120    /// This is useful when the `elements` child array of the [`ListViewArray`] might have
121    /// overlapping, duplicate, and garbage data, and we want to have fully sequential data like
122    /// a [`ListArray`].
123    ///
124    /// [`ListArray`]: crate::arrays::ListArray
125    fn rebuild_zero_copy_to_list(&self, ctx: &mut ExecutionCtx) -> VortexResult<ListViewArray> {
126        if self.is_zero_copy_to_list() {
127            // Note that since everything in `ListViewArray` is `Arc`ed, this is quite cheap.
128            return Ok(self.clone());
129        }
130
131        let offsets_ptype = self.offsets().dtype().as_ptype();
132        let sizes_ptype = self.sizes().dtype().as_ptype();
133
134        // One of the main purposes behind adding this "zero-copyable to `ListArray`" optimization
135        // is that we want to pass data to systems that expect Arrow data.
136        // The arrow specification only allows for `i32` and `i64` offset and sizes types, so in
137        // order to also make `ListView` zero-copyable to **Arrow**'s `ListArray` (not just Vortex's
138        // `ListArray`), we rebuild the offsets as 32-bit or 64-bit integer types.
139        // TODO(connor)[ListView]: This is true for `sizes` as well, we should do this conversion
140        // for sizes as well.
141        match_each_unsigned_integer_ptype!(sizes_ptype.to_unsigned(), |S| {
142            match offsets_ptype.to_unsigned() {
143                PType::U8 => self.naive_rebuild::<u8, u32, S>(ctx),
144                PType::U16 => self.naive_rebuild::<u16, u32, S>(ctx),
145                PType::U32 => self.naive_rebuild::<u32, u32, S>(ctx),
146                PType::U64 => self.naive_rebuild::<u64, u64, S>(ctx),
147                _ => unreachable!("invalid offsets PType"),
148            }
149        })
150    }
151
152    /// Picks between [`rebuild_with_take`](Self::rebuild_with_take) and
153    /// [`rebuild_list_by_list`](Self::rebuild_list_by_list) based on element dtype and average
154    /// list size.
155    fn naive_rebuild<O: IntegerPType, NewOffset: IntegerPType, S: IntegerPType>(
156        &self,
157        ctx: &mut ExecutionCtx,
158    ) -> VortexResult<ListViewArray> {
159        let sizes_canonical = self.sizes().clone().execute::<PrimitiveArray>(ctx)?;
160        let sizes_canonical =
161            sizes_canonical.reinterpret_cast(sizes_canonical.ptype().to_unsigned());
162        let total: u64 = sizes_canonical
163            .as_slice::<S>()
164            .iter()
165            .map(|s| (*s).as_() as u64)
166            .sum();
167        if Self::should_use_take(total, self.len()) {
168            self.rebuild_with_take::<O, NewOffset, S>(ctx)
169        } else {
170            self.rebuild_list_by_list::<O, NewOffset, S>(ctx)
171        }
172    }
173
174    /// Returns `true` when we are confident that `rebuild_with_take` will
175    /// outperform `rebuild_list_by_list`.
176    ///
177    /// Take is dramatically faster for small lists (often 10-100×) because it
178    /// avoids per-list builder overhead. LBL is the safer default for larger
179    /// lists since its sequential memcpy scales well. We only choose take when
180    /// the average list size is small enough that take clearly dominates.
181    fn should_use_take(total_output_elements: u64, num_lists: usize) -> bool {
182        if num_lists == 0 {
183            return true;
184        }
185        let avg = total_output_elements / num_lists as u64;
186        avg < 128
187    }
188
189    /// Rebuilds elements using a single bulk `take`: collect all element indices into a flat
190    /// `BufferMut<u64>`, perform a single `take`.
191    fn rebuild_with_take<O: IntegerPType, NewOffset: IntegerPType, S: IntegerPType>(
192        &self,
193        ctx: &mut ExecutionCtx,
194    ) -> VortexResult<ListViewArray> {
195        let new_offset_ptype = rebuilt_offset_ptype(self.offsets().dtype().as_ptype());
196        let size_ptype = self.sizes().dtype().as_ptype();
197
198        let offsets_canonical = self.offsets().clone().execute::<PrimitiveArray>(ctx)?;
199        let offsets_canonical =
200            offsets_canonical.reinterpret_cast(offsets_canonical.ptype().to_unsigned());
201        let offsets_slice = offsets_canonical.as_slice::<O>();
202        let sizes_canonical = self.sizes().clone().execute::<PrimitiveArray>(ctx)?;
203        let sizes_canonical =
204            sizes_canonical.reinterpret_cast(sizes_canonical.ptype().to_unsigned());
205        let sizes_slice = sizes_canonical.as_slice::<S>();
206
207        let len = offsets_slice.len();
208
209        let mut new_offsets = BufferMut::<NewOffset>::with_capacity(len);
210        let mut new_sizes = BufferMut::<S>::with_capacity(len);
211        let mut take_indices = BufferMut::<u64>::with_capacity(self.elements().len());
212
213        // Resolve validity to a mask once instead of probing it per row: `execute_is_valid`
214        // executes a scalar on every call for array-backed validity, which is O(len) work repeated
215        // `len` times.
216        let validity = self.validity()?.execute_mask(len, ctx)?;
217
218        let mut n_elements = NewOffset::zero();
219        for index in 0..len {
220            if !validity.value(index) {
221                new_offsets.push(n_elements);
222                new_sizes.push(S::zero());
223                continue;
224            }
225
226            let offset = offsets_slice[index];
227            let size = sizes_slice[index];
228            let start = offset.as_();
229            let stop = start + size.as_();
230
231            new_offsets.push(n_elements);
232            new_sizes.push(size);
233            take_indices.extend(start as u64..stop as u64);
234            n_elements += num_traits::cast(size).vortex_expect("Cast failed");
235        }
236
237        let elements = self.elements().take(take_indices.into_array())?;
238        // Built unsigned; reinterpret back to the signed-preserving result types.
239        let offsets = PrimitiveArray::new(new_offsets.freeze(), Validity::NonNullable)
240            .reinterpret_cast(new_offset_ptype)
241            .into_array();
242        let sizes = PrimitiveArray::new(new_sizes.freeze(), Validity::NonNullable)
243            .reinterpret_cast(size_ptype)
244            .into_array();
245
246        // SAFETY: same invariants as `rebuild_list_by_list` — offsets are sequential and
247        // non-overlapping, all (offset, size) pairs reference valid elements, and the validity
248        // array is preserved from the original.
249        Ok(unsafe {
250            ListViewArray::new_unchecked(elements, offsets, sizes, self.validity()?)
251                .with_zero_copy_to_list(true)
252        })
253    }
254
255    /// Rebuilds elements list-by-list: canonicalize elements upfront, then for each list `slice`
256    /// the relevant range and `extend_from_array` into a typed builder.
257    fn rebuild_list_by_list<O: IntegerPType, NewOffset: IntegerPType, S: IntegerPType>(
258        &self,
259        ctx: &mut ExecutionCtx,
260    ) -> VortexResult<ListViewArray> {
261        let element_dtype = self
262            .dtype()
263            .as_list_element_opt()
264            .vortex_expect("somehow had a canonical list that was not a list");
265
266        let new_offset_ptype = rebuilt_offset_ptype(self.offsets().dtype().as_ptype());
267        let size_ptype = self.sizes().dtype().as_ptype();
268
269        let offsets_canonical = self.offsets().clone().execute::<PrimitiveArray>(ctx)?;
270        let offsets_canonical =
271            offsets_canonical.reinterpret_cast(offsets_canonical.ptype().to_unsigned());
272        let offsets_slice = offsets_canonical.as_slice::<O>();
273        let sizes_canonical = self.sizes().clone().execute::<PrimitiveArray>(ctx)?;
274        let sizes_canonical =
275            sizes_canonical.reinterpret_cast(sizes_canonical.ptype().to_unsigned());
276        let sizes_slice = sizes_canonical.as_slice::<S>();
277
278        let len = offsets_slice.len();
279
280        let mut new_offsets = BufferMut::<NewOffset>::with_capacity(len);
281        // TODO(connor)[ListView]: Do we really need to do this?
282        // The only reason we need to rebuild the sizes here is that the validity may indicate that
283        // a list is null even though it has a non-zero size. This rebuild will set the size of all
284        // null lists to 0.
285        let mut new_sizes = BufferMut::<S>::with_capacity(len);
286
287        // Canonicalize the elements up front as we will be slicing the elements quite a lot.
288        let elements_canonical = self
289            .elements()
290            .clone()
291            .execute::<Canonical>(ctx)?
292            .into_array();
293
294        // Note that we do not know what the exact capacity should be of the new elements since
295        // there could be overlaps in the existing `ListViewArray`.
296        let mut new_elements_builder =
297            builder_with_capacity(element_dtype.as_ref(), self.elements().len());
298
299        // Resolve validity to a mask once instead of probing it per row (see `rebuild_with_take`).
300        let validity = self.validity()?.execute_mask(len, ctx)?;
301
302        let mut n_elements = NewOffset::zero();
303        for index in 0..len {
304            if !validity.value(index) {
305                // For NULL lists, place them after the previous item's data to maintain the
306                // no-overlap invariant for zero-copy to `ListArray` arrays.
307                new_offsets.push(n_elements);
308                new_sizes.push(S::zero());
309                continue;
310            }
311
312            let offset = offsets_slice[index];
313            let size = sizes_slice[index];
314
315            let start = offset.as_();
316            let stop = start + size.as_();
317
318            new_offsets.push(n_elements);
319            new_sizes.push(size);
320            new_elements_builder.extend_from_array(&elements_canonical.slice(start..stop)?);
321
322            n_elements += num_traits::cast(size).vortex_expect("Cast failed");
323        }
324
325        // Built unsigned; reinterpret back to the signed-preserving result types.
326        let offsets = PrimitiveArray::new(new_offsets.freeze(), Validity::NonNullable)
327            .reinterpret_cast(new_offset_ptype)
328            .into_array();
329        let sizes = PrimitiveArray::new(new_sizes.freeze(), Validity::NonNullable)
330            .reinterpret_cast(size_ptype)
331            .into_array();
332        let elements = new_elements_builder.finish();
333
334        debug_assert_eq!(
335            n_elements.as_(),
336            elements.len(),
337            "The accumulated elements somehow had the wrong length"
338        );
339
340        // SAFETY:
341        // - All offsets are sequential and non-overlapping (`n_elements` tracks running total).
342        // - Each `offset[i] + size[i]` equals `offset[i+1]` for all valid indices (including null
343        //   lists).
344        // - All elements referenced by (offset, size) pairs exist within the new `elements` array.
345        // - The validity array is preserved from the original array unchanged
346        // - The array satisfies the zero-copy-to-list property by having sorted offsets, no gaps,
347        //   and no overlaps.
348        Ok(unsafe {
349            ListViewArray::new_unchecked(elements, offsets, sizes, self.validity()?)
350                .with_zero_copy_to_list(true)
351        })
352    }
353
354    /// Rebuilds a [`ListViewArray`] by trimming any unused / unreferenced leading and trailing
355    /// elements, which is defined as a contiguous run of values in the `elements` array that are
356    /// not referenced by any views in the corresponding [`ListViewArray`].
357    fn rebuild_trim_elements(&self, ctx: &mut ExecutionCtx) -> VortexResult<ListViewArray> {
358        let (start, end) = self.referenced_element_bounds(ctx)?;
359
360        // SAFETY: we calculated valid start and end bounds
361        unsafe { self.trim_elements(start, end) }
362    }
363
364    /// Unsafely trims `elements` to the referenced half-open range `[start, end)`, adjusting every offset
365    /// down by `start`. The result preserves the original `is_zero_copy_to_list` flag.
366    ///
367    /// # SAFETY
368    ///
369    /// `start` must be the minimum value of `offsets`, and end should be the maximum value of `offsets[i] + size[i]`
370    /// over all indices `i` of offsets. Otherwise, `offsets` and `sizes` may hold references to elements that no longer exist
371    /// and the array will be corrupted.
372    pub unsafe fn trim_elements(&self, start: usize, end: usize) -> VortexResult<ListViewArray> {
373        let adjusted_offsets = match_each_integer_ptype!(self.offsets().dtype().as_ptype(), |O| {
374            let offset = <O as FromPrimitive>::from_usize(start)
375                .vortex_expect("unable to convert the min offset `start` into a `usize`");
376            let scalar = Scalar::primitive(offset, Nullability::NonNullable);
377
378            self.offsets()
379                .clone()
380                .binary(
381                    ConstantArray::new(scalar, self.offsets().len()).into_array(),
382                    Operator::Sub,
383                )
384                .vortex_expect("was somehow unable to adjust offsets down by their minimum")
385        });
386
387        let sliced_elements = self.elements().slice(start..end)?;
388
389        // SAFETY: The only thing we changed was the elements (which we verify through mins and
390        // maxes that all adjusted offsets + sizes are within the correct bounds), so the parameters
391        // are valid. And if the original array was zero-copyable to list, trimming elements doesn't
392        // change that property.
393        Ok(unsafe {
394            ListViewArray::new_unchecked(
395                sliced_elements,
396                adjusted_offsets,
397                self.sizes().clone(),
398                self.validity()?,
399            )
400            .with_zero_copy_to_list(self.is_zero_copy_to_list())
401        })
402    }
403
404    fn rebuild_make_exact(&self, ctx: &mut ExecutionCtx) -> VortexResult<ListViewArray> {
405        if self.is_zero_copy_to_list() {
406            self.rebuild_trim_elements(ctx)
407        } else {
408            // When we completely rebuild the `ListViewArray`, we get the benefit that we also trim
409            // any leading and trailing garbage data.
410            self.rebuild_zero_copy_to_list(ctx)
411        }
412    }
413}
414
415#[cfg(test)]
416mod tests {
417    use vortex_buffer::BitBuffer;
418    use vortex_error::VortexResult;
419
420    use super::super::tests::common::SESSION;
421    use super::ListViewRebuildMode;
422    use crate::IntoArray;
423    use crate::VortexSessionExecute;
424    use crate::arrays::ListViewArray;
425    use crate::arrays::PrimitiveArray;
426    use crate::arrays::listview::ListViewArrayExt;
427    use crate::assert_arrays_eq;
428    use crate::dtype::Nullability;
429    use crate::validity::Validity;
430
431    #[test]
432    fn test_rebuild_flatten_removes_overlaps() -> VortexResult<()> {
433        // Create a list view with overlapping lists: [A, B, C]
434        // List 0: offset=0, size=3 -> [A, B, C]
435        // List 1: offset=1, size=2 -> [B, C] (overlaps with List 0)
436        let elements = PrimitiveArray::from_iter(vec![1i32, 2, 3]).into_array();
437        let offsets = PrimitiveArray::from_iter(vec![0u32, 1]).into_array();
438        let sizes = PrimitiveArray::from_iter(vec![3u32, 2]).into_array();
439
440        let listview = ListViewArray::new(elements, offsets, sizes, Validity::NonNullable);
441
442        let mut ctx = SESSION.create_execution_ctx();
443        let flattened = listview.rebuild(ListViewRebuildMode::MakeZeroCopyToList, &mut ctx)?;
444
445        // After flatten: elements should be [A, B, C, B, C] = [1, 2, 3, 2, 3]
446        // Lists should be sequential with no overlaps
447        assert_eq!(flattened.elements().len(), 5);
448
449        // Offsets should be sequential
450        assert_eq!(flattened.offset_at(0), 0);
451        assert_eq!(flattened.size_at(0), 3);
452        assert_eq!(flattened.offset_at(1), 3);
453        assert_eq!(flattened.size_at(1), 2);
454
455        // Verify the data is correct
456        assert_arrays_eq!(
457            flattened.list_elements_at(0)?,
458            PrimitiveArray::from_iter([1i32, 2, 3])
459        );
460
461        assert_arrays_eq!(
462            flattened.list_elements_at(1)?,
463            PrimitiveArray::from_iter([2i32, 3])
464        );
465        Ok(())
466    }
467
468    #[test]
469    fn test_rebuild_flatten_with_nullable() -> VortexResult<()> {
470        use crate::arrays::BoolArray;
471
472        // Create a nullable list view with a null list
473        let elements = PrimitiveArray::from_iter(vec![1i32, 2, 3]).into_array();
474        let offsets = PrimitiveArray::from_iter(vec![0u32, 1, 2]).into_array();
475        let sizes = PrimitiveArray::from_iter(vec![2u32, 1, 1]).into_array();
476        let validity = Validity::Array(
477            BoolArray::new(
478                BitBuffer::from(vec![true, false, true]),
479                Validity::NonNullable,
480            )
481            .into_array(),
482        );
483
484        let listview = ListViewArray::new(elements, offsets, sizes, validity);
485
486        let mut ctx = SESSION.create_execution_ctx();
487        let flattened = listview.rebuild(ListViewRebuildMode::MakeZeroCopyToList, &mut ctx)?;
488
489        // Verify nullability is preserved
490        assert_eq!(flattened.dtype().nullability(), Nullability::Nullable);
491        assert!(flattened.validity()?.execute_is_valid(0, &mut ctx)?);
492        assert!(!flattened.validity()?.execute_is_valid(1, &mut ctx)?);
493        assert!(flattened.validity()?.execute_is_valid(2, &mut ctx)?);
494
495        // Verify valid lists contain correct data
496        assert_arrays_eq!(
497            flattened.list_elements_at(0)?,
498            PrimitiveArray::from_iter([1i32, 2])
499        );
500
501        assert_arrays_eq!(
502            flattened.list_elements_at(2)?,
503            PrimitiveArray::from_iter([3i32])
504        );
505        Ok(())
506    }
507
508    #[test]
509    fn test_rebuild_trim_elements_basic() -> VortexResult<()> {
510        // Test trimming both leading and trailing unused elements while preserving gaps in the
511        // middle.
512        // Elements: [_, _, A, B, _, C, D, _, _]
513        //            0  1  2  3  4  5  6  7  8
514        // List 0: offset=2, size=2 -> [A, B]
515        // List 1: offset=5, size=2 -> [C, D]
516        // Should trim to: [A, B, _, C, D] with adjusted offsets.
517        let elements =
518            PrimitiveArray::from_iter(vec![99i32, 98, 1, 2, 97, 3, 4, 96, 95]).into_array();
519        let offsets = PrimitiveArray::from_iter(vec![2u32, 5]).into_array();
520        let sizes = PrimitiveArray::from_iter(vec![2u32, 2]).into_array();
521
522        let listview = ListViewArray::new(elements, offsets, sizes, Validity::NonNullable);
523
524        let mut ctx = SESSION.create_execution_ctx();
525        let trimmed = listview.rebuild(ListViewRebuildMode::TrimElements, &mut ctx)?;
526
527        // After trimming: elements should be [A, B, _, C, D] = [1, 2, 97, 3, 4].
528        assert_eq!(trimmed.elements().len(), 5);
529
530        // Offsets should be adjusted: old offset 2 -> new offset 0, old offset 5 -> new offset 3.
531        assert_eq!(trimmed.offset_at(0), 0);
532        assert_eq!(trimmed.size_at(0), 2);
533        assert_eq!(trimmed.offset_at(1), 3);
534        assert_eq!(trimmed.size_at(1), 2);
535
536        // Verify the data is correct.
537        assert_arrays_eq!(
538            trimmed.list_elements_at(0)?,
539            PrimitiveArray::from_iter([1i32, 2])
540        );
541
542        assert_arrays_eq!(
543            trimmed.list_elements_at(1)?,
544            PrimitiveArray::from_iter([3i32, 4])
545        );
546
547        // Note that element at index 2 (97) is preserved as a gap.
548        let all_elements = trimmed
549            .elements()
550            .clone()
551            .execute::<PrimitiveArray>(&mut ctx)?;
552        assert_eq!(all_elements.execute_scalar(2, &mut ctx)?, 97i32.into());
553        Ok(())
554    }
555
556    #[test]
557    fn test_rebuild_with_trailing_nulls_regression() -> VortexResult<()> {
558        // Regression test for issue #5412
559        // Tests that zero-copy-to-list arrays with trailing NULLs correctly calculate
560        // offsets for NULL items to maintain no-overlap invariant
561
562        // Create a ListViewArray with trailing NULLs
563        let elements = PrimitiveArray::from_iter(vec![1i32, 2, 3, 4]).into_array();
564        let offsets = PrimitiveArray::from_iter(vec![0u32, 2, 0, 0]).into_array();
565        let sizes = PrimitiveArray::from_iter(vec![2u32, 2, 0, 0]).into_array();
566        let validity = Validity::from_iter(vec![true, true, false, false]);
567
568        let listview = ListViewArray::new(elements, offsets, sizes, validity);
569
570        // First rebuild to make it zero-copy-to-list
571        let mut ctx = SESSION.create_execution_ctx();
572        let rebuilt = listview.rebuild(ListViewRebuildMode::MakeZeroCopyToList, &mut ctx)?;
573        assert!(rebuilt.is_zero_copy_to_list());
574
575        // Verify NULL items have correct offsets (should not reuse previous offsets)
576        // After rebuild: offsets should be [0, 2, 4, 4] for zero-copy-to-list
577        assert_eq!(rebuilt.offset_at(0), 0);
578        assert_eq!(rebuilt.offset_at(1), 2);
579        assert_eq!(rebuilt.offset_at(2), 4); // NULL should be at position 4
580        assert_eq!(rebuilt.offset_at(3), 4); // Second NULL also at position 4
581
582        // All sizes should be correct
583        assert_eq!(rebuilt.size_at(0), 2);
584        assert_eq!(rebuilt.size_at(1), 2);
585        assert_eq!(rebuilt.size_at(2), 0); // NULL has size 0
586        assert_eq!(rebuilt.size_at(3), 0); // NULL has size 0
587
588        // Now rebuild with MakeExact (which calls naive_rebuild then trim_elements)
589        // This should not panic (issue #5412)
590        let exact = rebuilt.rebuild(ListViewRebuildMode::MakeExact, &mut ctx)?;
591
592        // Verify the result is still valid
593        assert!(exact.is_valid(0, &mut ctx)?);
594        assert!(exact.is_valid(1, &mut ctx)?);
595        assert!(!exact.is_valid(2, &mut ctx)?);
596        assert!(!exact.is_valid(3, &mut ctx)?);
597
598        // Verify data is preserved
599        assert_arrays_eq!(
600            exact.list_elements_at(0)?,
601            PrimitiveArray::from_iter([1i32, 2])
602        );
603
604        assert_arrays_eq!(
605            exact.list_elements_at(1)?,
606            PrimitiveArray::from_iter([3i32, 4])
607        );
608        Ok(())
609    }
610
611    /// Regression test for <https://github.com/vortex-data/vortex/issues/6773>.
612    /// u32 offsets exceed u16::MAX, so u16 sizes are widened to u32 for the add.
613    #[test]
614    fn test_rebuild_trim_elements_offsets_wider_than_sizes() -> VortexResult<()> {
615        let mut elems = vec![0i32; 70_005];
616        elems[70_000] = 10;
617        elems[70_001] = 20;
618        elems[70_002] = 30;
619        elems[70_003] = 40;
620        let elements = PrimitiveArray::from_iter(elems).into_array();
621        let offsets = PrimitiveArray::from_iter(vec![70_000u32, 70_002]).into_array();
622        let sizes = PrimitiveArray::from_iter(vec![2u16, 2]).into_array();
623
624        let listview = ListViewArray::new(elements, offsets, sizes, Validity::NonNullable);
625        let mut ctx = SESSION.create_execution_ctx();
626        let trimmed = listview.rebuild(ListViewRebuildMode::TrimElements, &mut ctx)?;
627        assert_arrays_eq!(
628            trimmed.list_elements_at(1)?,
629            PrimitiveArray::from_iter([30i32, 40])
630        );
631        Ok(())
632    }
633
634    /// Regression test for <https://github.com/vortex-data/vortex/issues/6773>.
635    /// u32 sizes exceed u16::MAX, so u16 offsets are widened to u32 for the add.
636    #[test]
637    fn test_rebuild_trim_elements_sizes_wider_than_offsets() -> VortexResult<()> {
638        let mut elems = vec![0i32; 70_001];
639        elems[3] = 30;
640        elems[4] = 40;
641        let elements = PrimitiveArray::from_iter(elems).into_array();
642        let offsets = PrimitiveArray::from_iter(vec![1u16, 3]).into_array();
643        let sizes = PrimitiveArray::from_iter(vec![70_000u32, 2]).into_array();
644
645        let listview = ListViewArray::new(elements, offsets, sizes, Validity::NonNullable);
646        let mut ctx = SESSION.create_execution_ctx();
647        let trimmed = listview.rebuild(ListViewRebuildMode::TrimElements, &mut ctx)?;
648        assert_arrays_eq!(
649            trimmed.list_elements_at(1)?,
650            PrimitiveArray::from_iter([30i32, 40])
651        );
652        Ok(())
653    }
654
655    /// Rebuild with signed offsets/sizes: exercises the unsigned-reinterpret read path and asserts
656    /// the result offset/size dtypes preserve signedness (widened to >=32-bit for offsets).
657    #[test]
658    fn test_rebuild_preserves_signed_offset_and_size_types() -> VortexResult<()> {
659        use crate::dtype::PType;
660
661        // Overlapping lists force an actual rebuild rather than the zero-copy fast path.
662        let elements = PrimitiveArray::from_iter(vec![1i32, 2, 3]).into_array();
663        let offsets = PrimitiveArray::from_iter(vec![0i32, 1]).into_array();
664        let sizes = PrimitiveArray::from_iter(vec![3i16, 2]).into_array();
665
666        let listview = ListViewArray::new(elements, offsets, sizes, Validity::NonNullable);
667        let mut ctx = SESSION.create_execution_ctx();
668        let rebuilt = listview.rebuild(ListViewRebuildMode::MakeZeroCopyToList, &mut ctx)?;
669
670        // Values: [1,2,3] and [2,3].
671        assert_arrays_eq!(
672            rebuilt.list_elements_at(0)?,
673            PrimitiveArray::from_iter([1i32, 2, 3])
674        );
675        assert_arrays_eq!(
676            rebuilt.list_elements_at(1)?,
677            PrimitiveArray::from_iter([2i32, 3])
678        );
679
680        // Signed input -> signed result (offsets widened to i32, sizes kept i16).
681        assert_eq!(rebuilt.offsets().dtype().as_ptype(), PType::I32);
682        assert_eq!(rebuilt.sizes().dtype().as_ptype(), PType::I16);
683        Ok(())
684    }
685
686    // ── should_use_take heuristic tests ────────────────────────────────────
687
688    #[test]
689    fn heuristic_zero_lists_uses_take() {
690        assert!(ListViewArray::should_use_take(0, 0));
691    }
692
693    #[test]
694    fn heuristic_small_lists_use_take() {
695        // avg = 127 → take
696        assert!(ListViewArray::should_use_take(127_000, 1_000));
697        // avg = 128 → LBL
698        assert!(!ListViewArray::should_use_take(128_000, 1_000));
699    }
700
701    /// Regression test for <https://github.com/vortex-data/vortex/issues/6973>.
702    /// Both offsets and sizes are u8, and offset + size exceeds u8::MAX.
703    #[test]
704    fn test_rebuild_trim_elements_sum_overflows_type() -> VortexResult<()> {
705        let elements = PrimitiveArray::from_iter(vec![0i32; 261]).into_array();
706        let offsets = PrimitiveArray::from_iter(vec![215u8, 0]).into_array();
707        let sizes = PrimitiveArray::from_iter(vec![46u8, 10]).into_array();
708
709        let listview = ListViewArray::new(elements, offsets, sizes, Validity::NonNullable);
710        let mut ctx = SESSION.create_execution_ctx();
711        let trimmed = listview.rebuild(ListViewRebuildMode::TrimElements, &mut ctx)?;
712
713        // min(offsets) = 0, so nothing to trim; output should equal input.
714        assert_arrays_eq!(trimmed, listview);
715        Ok(())
716    }
717}