Skip to main content

vortex_array/arrays/listview/
array.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::fmt::Display;
5use std::fmt::Formatter;
6use std::sync::Arc;
7
8use num_traits::AsPrimitive;
9use smallvec::smallvec;
10use vortex_buffer::BitBufferMut;
11use vortex_error::VortexExpect;
12use vortex_error::VortexResult;
13use vortex_error::vortex_bail;
14use vortex_error::vortex_ensure;
15use vortex_error::vortex_err;
16use vortex_mask::Mask;
17
18use crate::ArrayRef;
19use crate::ArraySlots;
20use crate::ExecutionCtx;
21use crate::LEGACY_SESSION;
22#[expect(deprecated)]
23use crate::ToCanonical as _;
24use crate::VortexSessionExecute;
25use crate::aggregate_fn::fns::min_max::min_max;
26use crate::array::Array;
27use crate::array::ArrayParts;
28use crate::array::TypedArrayRef;
29use crate::array::child_to_validity;
30use crate::array::validity_to_child;
31use crate::arrays::ListView;
32use crate::arrays::Primitive;
33use crate::arrays::PrimitiveArray;
34use crate::arrays::bool;
35use crate::arrays::primitive::PrimitiveArrayExt;
36use crate::builtins::ArrayBuiltins;
37use crate::dtype::DType;
38use crate::dtype::IntegerPType;
39use crate::dtype::PType;
40use crate::expr::stats::Stat;
41use crate::match_each_integer_ptype;
42use crate::match_each_unsigned_integer_ptype;
43use crate::scalar_fn::fns::operators::Operator;
44use crate::validity::Validity;
45
46/// The `elements` data array, where each list scalar is a _slice_ of the `elements` array, and
47/// each inner list element is a _scalar_ of the `elements` array.
48pub(super) const ELEMENTS_SLOT: usize = 0;
49/// The `offsets` array indicating the start position of each list in elements.
50///
51/// Since we also store `sizes`, this `offsets` field is allowed to be stored out-of-order
52/// (which is different from [`ListArray`](crate::arrays::ListArray)).
53pub(super) const OFFSETS_SLOT: usize = 1;
54/// The `sizes` array indicating the length of each list.
55///
56/// This field is intended to be paired with a corresponding offset to determine the list scalar
57/// we want to access.
58pub(super) const SIZES_SLOT: usize = 2;
59/// The validity bitmap indicating which list elements are non-null.
60pub(super) const VALIDITY_SLOT: usize = 3;
61pub(super) const NUM_SLOTS: usize = 4;
62pub(super) const SLOT_NAMES: [&str; NUM_SLOTS] = ["elements", "offsets", "sizes", "validity"];
63
64/// The canonical encoding for variable-length list arrays.
65///
66/// The `ListViewArray` encoding differs from [`ListArray`] in that it stores a child `sizes` array
67/// in addition to a child `offsets` array (which is the _only_ child in [`ListArray`]).
68///
69/// In the past, we used [`ListArray`] as the canonical encoding for [`DType::List`], but we have
70/// since migrated to `ListViewArray` for a few reasons:
71///
72/// - Enables better SIMD vectorization (no sequential dependency when reading `offsets`)
73/// - Allows out-of-order offsets for better compression (we can shuffle the buffers)
74/// - Supports different integer types for offsets vs sizes
75///
76/// It is worth mentioning that this encoding mirrors Apache Arrow's `ListView` array type, but does
77/// not exactly mirror the similar type found in DuckDB and Velox, which stores the pair of offset
78/// and size in a row-major fashion rather than column-major. More specifically, the row-major
79/// layout has a single child array with alternating offset and size next to each other.
80///
81/// We choose the column-major layout as it allows better compressability, as well as using
82/// different (logical) integer widths for our `offsets` and `sizes` buffers (note that the
83/// compressor will likely compress to a different bit-packed width, but this is speaking strictly
84/// about flexibility in the logcial type).
85///
86/// # Examples
87///
88/// ```
89/// # fn main() -> vortex_error::VortexResult<()> {
90/// # use vortex_array::arrays::{ListViewArray, PrimitiveArray};
91/// # use vortex_array::arrays::listview::ListViewArrayExt;
92/// # use vortex_array::validity::Validity;
93/// # use vortex_array::IntoArray;
94/// # use vortex_buffer::buffer;
95/// # use std::sync::Arc;
96/// #
97/// // Create a list view array representing [[3, 4], [1], [2, 3]].
98/// // Note: Unlike `ListArray`, offsets don't need to be monotonic.
99///
100/// let elements = buffer![1i32, 2, 3, 4, 5].into_array();
101/// let offsets = buffer![2u32, 0, 1].into_array();  // Out-of-order offsets
102/// let sizes = buffer![2u32, 1, 2].into_array();  // The sizes cause overlaps
103///
104/// let list_view = ListViewArray::new(
105///     elements.into_array(),
106///     offsets.into_array(),
107///     sizes.into_array(),
108///     Validity::NonNullable,
109/// );
110///
111/// assert_eq!(list_view.len(), 3);
112///
113/// // Access individual lists
114/// let first_list = list_view.list_elements_at(0)?;
115/// assert_eq!(first_list.len(), 2);
116/// // First list contains elements[2..4] = [3, 4]
117///
118/// let first_offset = list_view.offset_at(0);
119/// let first_size = list_view.size_at(0);
120/// assert_eq!(first_offset, 2);
121/// assert_eq!(first_size, 2);
122/// # Ok(())
123/// # }
124/// ```
125///
126/// [`ListArray`]: crate::arrays::ListArray
127#[derive(Clone, Debug)]
128pub struct ListViewData {
129    // TODO(connor)[ListView]: Add the n+1 memory allocation optimization.
130    /// A flag denoting if the array is zero-copyable* to a [`ListArray`](crate::arrays::ListArray).
131    ///
132    /// We use this information to help us more efficiently rebuild / compact our data.
133    ///
134    /// When this flag is true (indicating sorted offsets with no gaps and no overlaps and all
135    /// `offsets[i] + sizes[i]` are in order), conversions can bypass the very expensive rebuild
136    /// process which must rebuild the array from scratch.
137    is_zero_copy_to_list: bool,
138}
139
140impl Display for ListViewData {
141    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
142        write!(f, "is_zero_copy_to_list: {}", self.is_zero_copy_to_list)
143    }
144}
145
146pub struct ListViewDataParts {
147    pub elements_dtype: Arc<DType>,
148
149    /// See `ListViewArray::elements`
150    pub elements: ArrayRef,
151
152    /// See `ListViewArray::offsets`
153    pub offsets: ArrayRef,
154
155    /// See `ListViewArray::sizes`
156    pub sizes: ArrayRef,
157
158    /// See `ListViewArray::validity`
159    pub validity: Validity,
160}
161
162impl ListViewData {
163    pub(crate) fn make_slots(
164        elements: &ArrayRef,
165        offsets: &ArrayRef,
166        sizes: &ArrayRef,
167        validity: &Validity,
168        len: usize,
169    ) -> ArraySlots {
170        smallvec![
171            Some(elements.clone()),
172            Some(offsets.clone()),
173            Some(sizes.clone()),
174            validity_to_child(validity, len),
175        ]
176    }
177
178    /// Creates a new `ListViewArray`.
179    ///
180    /// # Panics
181    ///
182    /// Panics if the provided components do not satisfy the invariants documented
183    /// in `ListViewArray::new_unchecked`.
184    pub fn new() -> Self {
185        Self {
186            is_zero_copy_to_list: false,
187        }
188    }
189
190    /// Constructs a new `ListViewArray`.
191    ///
192    /// # Errors
193    ///
194    /// Returns an error if the provided components do not satisfy the invariants documented
195    /// in `ListViewArray::new_unchecked`.
196    pub fn try_new() -> VortexResult<Self> {
197        Ok(Self::new())
198    }
199
200    /// Creates a new `ListViewArray` without validation.
201    ///
202    /// This unsafe function does not check the validity of the data. Prefer calling [`new()`] or
203    /// [`try_new()`] over this function, as they will check the validity of the data.
204    ///
205    /// [`ListArray`]: crate::arrays::ListArray
206    /// [`new()`]: Self::new
207    /// [`try_new()`]: Self::try_new
208    ///
209    /// # Safety
210    ///
211    /// The caller must ensure all of the following invariants are satisfied:
212    ///
213    /// - `offsets` and `sizes` must be non-nullable integer arrays.
214    /// - `offsets` and `sizes` must have the same length.
215    /// - Size integer width must be smaller than or equal to offset type (to prevent overflow).
216    /// - For each `i`, `offsets[i] + sizes[i]` must not overflow and must be `<= elements.len()`
217    ///   (even if the corresponding view is defined as null by the validity array).
218    /// - If validity is an array, its length must equal `offsets.len()`.
219    pub unsafe fn new_unchecked() -> Self {
220        Self::new()
221    }
222
223    /// Validates the components that would be used to create a `ListViewArray`.
224    pub fn validate(
225        elements: &ArrayRef,
226        offsets: &ArrayRef,
227        sizes: &ArrayRef,
228        validity: &Validity,
229    ) -> VortexResult<()> {
230        // Check that offsets and sizes are integer arrays and non-nullable.
231        vortex_ensure!(
232            offsets.dtype().is_int() && !offsets.dtype().is_nullable(),
233            "offsets must be non-nullable integer array, got {}",
234            offsets.dtype()
235        );
236        vortex_ensure!(
237            sizes.dtype().is_int() && !sizes.dtype().is_nullable(),
238            "sizes must be non-nullable integer array, got {}",
239            sizes.dtype()
240        );
241
242        // Check that they have the same length.
243        vortex_ensure!(
244            offsets.len() == sizes.len(),
245            "offsets and sizes must have the same length, got {} and {}",
246            offsets.len(),
247            sizes.len()
248        );
249
250        // If a validity array is present, it must be the same length as the `ListViewArray`.
251        if let Some(validity_len) = validity.maybe_len() {
252            vortex_ensure!(
253                validity_len == offsets.len(),
254                "validity with size {validity_len} does not match array size {}",
255                offsets.len()
256            );
257        }
258
259        // Skip host-only validation when offsets/sizes are not host-resident.
260        if offsets.is_host() && sizes.is_host() {
261            #[expect(deprecated)]
262            let offsets_primitive = offsets.to_primitive();
263            #[expect(deprecated)]
264            let sizes_primitive = sizes.to_primitive();
265            // Offsets and sizes are non-negative; reinterpret to unsigned to dispatch over 4 widths
266            // each (4x4 instead of 8x8). This is a read-only validation, so result types are moot.
267            let offsets_primitive =
268                offsets_primitive.reinterpret_cast(offsets_primitive.ptype().to_unsigned());
269            let sizes_primitive =
270                sizes_primitive.reinterpret_cast(sizes_primitive.ptype().to_unsigned());
271
272            // Validate the `offsets` and `sizes` arrays.
273            match_each_unsigned_integer_ptype!(offsets_primitive.ptype(), |O| {
274                match_each_unsigned_integer_ptype!(sizes_primitive.ptype(), |S| {
275                    let offsets_slice = offsets_primitive.as_slice::<O>();
276                    let sizes_slice = sizes_primitive.as_slice::<S>();
277
278                    validate_offsets_and_sizes::<O, S>(
279                        offsets_slice,
280                        sizes_slice,
281                        elements.len() as u64,
282                    )?;
283                })
284            });
285        }
286
287        Ok(())
288    }
289
290    /// Sets whether this `ListViewArray` is zero-copyable to a [`ListArray`].
291    ///
292    /// This is an optimization flag that enables more efficient conversion to [`ListArray`] without
293    /// needing to copy or reorganize the data.
294    ///
295    /// [`ListArray`]: crate::arrays::ListArray
296    ///
297    /// # Safety
298    ///
299    /// When setting `is_zctl` to `true`, the caller must ensure that the `ListViewArray` is
300    /// actually zero-copyable to a [`ListArray`]. This means:
301    ///
302    /// - Offsets must be sorted (but not strictly sorted, zero-length lists are allowed).
303    /// - `offsets[i] + sizes[i] == offsets[i + 1]` for all `i`.
304    /// - No gaps in elements between first and last referenced elements.
305    /// - No overlapping list views (each element referenced at most once).
306    ///
307    /// Note that leading and trailing unreferenced elements **ARE** allowed.
308    pub unsafe fn with_zero_copy_to_list(mut self, is_zctl: bool) -> Self {
309        self.is_zero_copy_to_list = is_zctl;
310        self
311    }
312
313    /// Returns true if the `ListViewArray` is zero-copyable to a
314    /// [`ListArray`](crate::arrays::ListArray).
315    pub fn is_zero_copy_to_list(&self) -> bool {
316        self.is_zero_copy_to_list
317    }
318}
319
320impl Default for ListViewData {
321    fn default() -> Self {
322        Self::new()
323    }
324}
325
326/// Walks parallel `(offset, size)` slices and sets each range `[offset, offset + size]` in `buf`.
327///
328/// **Preconditions**
329///
330/// `offsets` and `sizes` must be the same length (which is always the case in valid `ListViewArray`s).
331fn fill_referenced_mask<O: IntegerPType, S: IntegerPType>(
332    buf: &mut BitBufferMut,
333    offsets: &[O],
334    sizes: &[S],
335) {
336    let len = offsets.len();
337
338    assert_eq!(
339        len,
340        sizes.len(),
341        "offsets and sizes must be the same length"
342    );
343
344    for i in 0..len {
345        let start: usize = offsets[i].as_();
346        let size: usize = sizes[i].as_();
347        buf.fill_range(start, start + size, true);
348    }
349}
350
351pub trait ListViewArrayExt: TypedArrayRef<ListView> {
352    fn nullability(&self) -> crate::dtype::Nullability {
353        match self.as_ref().dtype() {
354            DType::List(_, nullability) => *nullability,
355            _ => unreachable!("ListViewArrayExt requires a list dtype"),
356        }
357    }
358
359    fn elements(&self) -> &ArrayRef {
360        self.as_ref().slots()[ELEMENTS_SLOT]
361            .as_ref()
362            .vortex_expect("ListViewArray elements slot")
363    }
364
365    fn offsets(&self) -> &ArrayRef {
366        self.as_ref().slots()[OFFSETS_SLOT]
367            .as_ref()
368            .vortex_expect("ListViewArray offsets slot")
369    }
370
371    fn sizes(&self) -> &ArrayRef {
372        self.as_ref().slots()[SIZES_SLOT]
373            .as_ref()
374            .vortex_expect("ListViewArray sizes slot")
375    }
376
377    fn listview_validity(&self) -> Validity {
378        child_to_validity(
379            self.as_ref().slots()[VALIDITY_SLOT].as_ref(),
380            self.nullability(),
381        )
382    }
383
384    fn offset_at(&self, index: usize) -> usize {
385        assert!(
386            index < self.as_ref().len(),
387            "Index {index} out of bounds 0..{}",
388            self.as_ref().len()
389        );
390        self.offsets()
391            .as_opt::<Primitive>()
392            .map(|p| match_each_integer_ptype!(p.ptype(), |P| { p.as_slice::<P>()[index].as_() }))
393            .unwrap_or_else(|| {
394                self.offsets()
395                    .execute_scalar(index, &mut LEGACY_SESSION.create_execution_ctx())
396                    .vortex_expect("offsets must support execute_scalar")
397                    .as_primitive()
398                    .as_::<usize>()
399                    .vortex_expect("offset must fit in usize")
400            })
401    }
402
403    fn size_at(&self, index: usize) -> usize {
404        assert!(
405            index < self.as_ref().len(),
406            "Index {} out of bounds 0..{}",
407            index,
408            self.as_ref().len()
409        );
410        self.sizes()
411            .as_opt::<Primitive>()
412            .map(|p| match_each_integer_ptype!(p.ptype(), |P| { p.as_slice::<P>()[index].as_() }))
413            .unwrap_or_else(|| {
414                self.sizes()
415                    .execute_scalar(index, &mut LEGACY_SESSION.create_execution_ctx())
416                    .vortex_expect("sizes must support execute_scalar")
417                    .as_primitive()
418                    .as_::<usize>()
419                    .vortex_expect("size must fit in usize")
420            })
421    }
422
423    fn list_elements_at(&self, index: usize) -> VortexResult<ArrayRef> {
424        let offset = self.offset_at(index);
425        let size = self.size_at(index);
426        self.elements().slice(offset..offset + size)
427    }
428
429    fn verify_is_zero_copy_to_list(&self) -> bool {
430        #[expect(deprecated)]
431        let offsets_primitive = self.offsets().to_primitive();
432        #[expect(deprecated)]
433        let sizes_primitive = self.sizes().to_primitive();
434        validate_zctl(self.elements(), offsets_primitive, sizes_primitive).is_ok()
435    }
436
437    /// Returns a [`Mask`] of length `elements.len()` where each bit is set iff that
438    /// position in `elements` is referenced by at least one view. Caller must ensure `elements`
439    /// is non-empty.
440    ///
441    /// Walks every `(offset, size)` pair, canonicalizes both `offsets` and `sizes`,
442    /// and allocates a `BitBuffer` of length `elements.len()`, so it is extremely costly.
443    ///
444    /// **Preconditions**
445    ///
446    /// `self.elements()` must be non-empty.
447    fn compute_referenced_elements_mask(&self, ctx: &mut ExecutionCtx) -> VortexResult<Mask> {
448        assert!(!self.elements().is_empty());
449        let len = self.elements().len();
450
451        let offsets_primitive = self.offsets().clone().execute::<PrimitiveArray>(ctx)?;
452        let sizes_primitive = self.sizes().clone().execute::<PrimitiveArray>(ctx)?;
453
454        let mut buf = BitBufferMut::new_unset(len);
455
456        // Offsets/sizes are non-negative; reinterpret to unsigned (4x4 instead of 8x8).
457        let offsets_primitive =
458            offsets_primitive.reinterpret_cast(offsets_primitive.ptype().to_unsigned());
459        let sizes_primitive =
460            sizes_primitive.reinterpret_cast(sizes_primitive.ptype().to_unsigned());
461        match_each_unsigned_integer_ptype!(offsets_primitive.ptype(), |O| {
462            match_each_unsigned_integer_ptype!(sizes_primitive.ptype(), |S| {
463                fill_referenced_mask::<O, S>(
464                    &mut buf,
465                    offsets_primitive.as_slice::<O>(),
466                    sizes_primitive.as_slice::<S>(),
467                );
468            })
469        });
470
471        Ok(Mask::from_buffer(buf.freeze()))
472    }
473
474    /// Exact fraction of `elements` referenced by some view, in `[0.0, 1.0]`. Extremely costly.
475    ///
476    /// Returns `Ok(1.0)` when `elements` is empty instead of dividing by 0.
477    fn compute_density(&self, ctx: &mut ExecutionCtx) -> VortexResult<f32> {
478        if self.elements().is_empty() {
479            return Ok(1.0);
480        }
481
482        if self.sizes().is_empty() {
483            return Ok(0.0);
484        }
485
486        let density = match self.compute_referenced_elements_mask(ctx)? {
487            Mask::AllTrue(_) => 1.0,
488            Mask::AllFalse(_) => 0.0,
489            Mask::Values(values) => values.true_count() as f32 / self.elements().len() as f32,
490        };
491
492        Ok(density)
493    }
494
495    /// Upper-bound estimate of [`compute_density`](Self::compute_density) via
496    /// `sum(sizes) / elements.len()`, clamped to `[0.0, 1.0]`.
497    ///
498    /// Exact for non-overlapping views, but overcounts when multiple views share the same elements.
499    ///
500    /// Returns `Ok(1.0)` when `elements` is empty instead of dividing by 0.
501    fn upper_bound_density(&self, ctx: &mut ExecutionCtx) -> VortexResult<f32> {
502        let n_elts = self.elements().len();
503        if n_elts == 0 {
504            return Ok(1.0);
505        }
506
507        let sizes = self.sizes();
508        if sizes.is_empty() {
509            return Ok(0.0);
510        }
511
512        // compute_stat short-circuits on a cached exact Sum and otherwise computes
513        let sizes_sum = sizes
514            .statistics()
515            .compute_stat(Stat::Sum, ctx)?
516            .vortex_expect("sizes array has integer ptype elements")
517            .as_primitive()
518            .as_::<u64>()
519            .vortex_expect("integer ptypes can be upcast to u64");
520
521        // if the same elements are referenced more than once the estimate may be
522        // greater than 1.0, so clamp
523        let estimate = (sizes_sum as f32 / n_elts as f32).min(1.0);
524
525        debug_assert!(estimate >= 0.0);
526
527        Ok(estimate)
528    }
529
530    /// Returns the half-open range `[start, end)` of `elements` indices referenced by any view:
531    /// the minimum offset and the maximum `offset + size`. Elements outside this range are
532    /// unreferenced leading or trailing slack that a
533    /// [`TrimElements`](super::ListViewRebuildMode::TrimElements) rebuild would reclaim.
534    ///
535    /// For **zero-copy-to-list** arrays this is `O(1)`: views are sorted and non-overlapping with
536    /// no interior gaps, so the bounds are exactly `[first_offset, last_offset + last_size)`.
537    /// Otherwise it computes min/max statistics over `offsets` and `offsets + sizes`.
538    ///
539    /// # Preconditions
540    ///
541    /// The array must contain at least one list (`len() > 0`).
542    fn referenced_element_bounds(&self, ctx: &mut ExecutionCtx) -> VortexResult<(usize, usize)> {
543        let n_lists = self.as_ref().len();
544        vortex_ensure!(
545            n_lists > 0,
546            "referenced_element_bounds requires a non-empty array"
547        );
548
549        if self.is_zero_copy_to_list() {
550            let start = self.offset_at(0);
551            let end = self.offset_at(n_lists - 1) + self.size_at(n_lists - 1);
552            return Ok((start, end));
553        }
554
555        let start = self
556            .offsets()
557            .statistics()
558            .compute_min::<usize>(ctx)
559            .vortex_expect("offsets must report a usize min statistic");
560
561        // Cast offsets and sizes to the widest integer type so that `offset + size` cannot overflow
562        // the narrower input width.
563        let wide_dtype = DType::from(if self.offsets().dtype().as_ptype().is_unsigned_int() {
564            PType::U64
565        } else {
566            PType::I64
567        });
568        let offsets = self.offsets().cast(wide_dtype.clone())?;
569        let sizes = self.sizes().cast(wide_dtype)?;
570        let end = min_max(&offsets.binary(sizes, Operator::Add)?, ctx)?
571            .vortex_expect("non-empty array must report a min/max")
572            .max
573            .as_primitive()
574            .as_::<usize>()
575            .vortex_expect("max `offset + size` must fit in a usize");
576
577        Ok((start, end))
578    }
579}
580impl<T: TypedArrayRef<ListView>> ListViewArrayExt for T {}
581
582impl Array<ListView> {
583    /// Creates a new `ListViewArray`.
584    pub fn new(elements: ArrayRef, offsets: ArrayRef, sizes: ArrayRef, validity: Validity) -> Self {
585        let dtype = DType::List(Arc::new(elements.dtype().clone()), validity.nullability());
586        let len = offsets.len();
587        let slots = ListViewData::make_slots(&elements, &offsets, &sizes, &validity, len);
588        ListViewData::validate(&elements, &offsets, &sizes, &validity)
589            .vortex_expect("`ListViewArray` construction failed");
590        let data = ListViewData::new();
591        unsafe {
592            Array::from_parts_unchecked(
593                ArrayParts::new(ListView, dtype, len, data).with_slots(slots),
594            )
595        }
596    }
597
598    /// Constructs a new `ListViewArray`.
599    pub fn try_new(
600        elements: ArrayRef,
601        offsets: ArrayRef,
602        sizes: ArrayRef,
603        validity: Validity,
604    ) -> VortexResult<Self> {
605        let dtype = DType::List(Arc::new(elements.dtype().clone()), validity.nullability());
606        let len = offsets.len();
607        let slots = ListViewData::make_slots(&elements, &offsets, &sizes, &validity, len);
608        ListViewData::validate(&elements, &offsets, &sizes, &validity)?;
609        let data = ListViewData::try_new()?;
610        Ok(unsafe {
611            Array::from_parts_unchecked(
612                ArrayParts::new(ListView, dtype, len, data).with_slots(slots),
613            )
614        })
615    }
616
617    /// Creates a new `ListViewArray` without validation.
618    ///
619    /// # Safety
620    ///
621    /// See [`ListViewData::new_unchecked`].
622    pub unsafe fn new_unchecked(
623        elements: ArrayRef,
624        offsets: ArrayRef,
625        sizes: ArrayRef,
626        validity: Validity,
627    ) -> Self {
628        let dtype = DType::List(Arc::new(elements.dtype().clone()), validity.nullability());
629        let len = offsets.len();
630        let slots = ListViewData::make_slots(&elements, &offsets, &sizes, &validity, len);
631        let data = unsafe { ListViewData::new_unchecked() };
632        unsafe {
633            Array::from_parts_unchecked(
634                ArrayParts::new(ListView, dtype, len, data).with_slots(slots),
635            )
636        }
637    }
638
639    /// Mark whether this list view can be zero-copy converted to a list.
640    ///
641    /// # Safety
642    ///
643    /// See [`ListViewData::with_zero_copy_to_list`].
644    pub unsafe fn with_zero_copy_to_list(self, is_zctl: bool) -> Self {
645        if cfg!(debug_assertions) && is_zctl {
646            #[expect(deprecated)]
647            let offsets_primitive = self.offsets().to_primitive();
648            #[expect(deprecated)]
649            let sizes_primitive = self.sizes().to_primitive();
650            validate_zctl(self.elements(), offsets_primitive, sizes_primitive)
651                .vortex_expect("Failed to validate zero-copy to list flag");
652        }
653        let dtype = self.dtype().clone();
654        let len = self.len();
655        let slots: ArraySlots = self.slots().iter().cloned().collect();
656        let data = unsafe { self.into_data().with_zero_copy_to_list(is_zctl) };
657        unsafe {
658            Array::from_parts_unchecked(
659                ArrayParts::new(ListView, dtype, len, data).with_slots(slots),
660            )
661        }
662    }
663
664    pub fn into_data_parts(self) -> ListViewDataParts {
665        let elements = self.slots()[ELEMENTS_SLOT]
666            .clone()
667            .vortex_expect("ListViewArray elements slot");
668        let offsets = self.slots()[OFFSETS_SLOT]
669            .clone()
670            .vortex_expect("ListViewArray offsets slot");
671        let sizes = self.slots()[SIZES_SLOT]
672            .clone()
673            .vortex_expect("ListViewArray sizes slot");
674        let validity = self.listview_validity();
675        ListViewDataParts {
676            elements_dtype: Arc::new(elements.dtype().clone()),
677            elements,
678            offsets,
679            sizes,
680            validity,
681        }
682    }
683}
684
685/// Helper function to validate `offsets` and `sizes` with specific types.
686fn validate_offsets_and_sizes<O, S>(
687    offsets_slice: &[O],
688    sizes_slice: &[S],
689    elements_len: u64,
690) -> VortexResult<()>
691where
692    O: IntegerPType,
693    S: IntegerPType,
694{
695    debug_assert_eq!(offsets_slice.len(), sizes_slice.len());
696
697    #[allow(clippy::absurd_extreme_comparisons, unused_comparisons)]
698    for i in 0..offsets_slice.len() {
699        let offset = offsets_slice[i];
700        let size = sizes_slice[i];
701
702        vortex_ensure!(offset >= O::zero(), "cannot have negative offsets");
703        vortex_ensure!(size >= S::zero(), "cannot have negative size");
704
705        let offset_u64 = offset
706            .to_u64()
707            .ok_or_else(|| vortex_err!("offset[{i}] = {offset:?} cannot be converted to u64"))?;
708
709        let size_u64 = size
710            .to_u64()
711            .ok_or_else(|| vortex_err!("size[{i}] = {size:?} cannot be converted to u64"))?;
712
713        // Check for overflow when adding offset + size.
714        let end = offset_u64.checked_add(size_u64).ok_or_else(|| {
715            vortex_err!("offset[{i}] ({offset_u64}) + size[{i}] ({size_u64}) would overflow u64")
716        })?;
717
718        if offset_u64 == elements_len {
719            vortex_ensure!(
720                size_u64 == 0,
721                "views to the end of the elements array (length {elements_len}) must have size 0 \
722                    (had size {size_u64})"
723            );
724        }
725
726        vortex_ensure!(
727            end <= elements_len,
728            "offset[{i}] + size[{i}] = {offset_u64} + {size_u64} = {end} \
729            exceeds elements length {elements_len}",
730        );
731    }
732
733    Ok(())
734}
735
736/// Helper function to validate if the `ListViewArray` components are actually zero-copyable to
737/// [`ListArray`](crate::arrays::ListArray).
738fn validate_zctl(
739    elements: &ArrayRef,
740    offsets_primitive: PrimitiveArray,
741    sizes_primitive: PrimitiveArray,
742) -> VortexResult<()> {
743    // Offsets must be sorted (but not strictly sorted, zero-length lists are allowed), even
744    // if there are null views.
745    let mut ctx = LEGACY_SESSION.create_execution_ctx();
746    if let Some(is_sorted) = offsets_primitive.statistics().compute_is_sorted(&mut ctx) {
747        vortex_ensure!(is_sorted, "offsets must be sorted");
748    } else {
749        vortex_bail!("offsets must report is_sorted statistic");
750    }
751
752    // Validate that offset[i] + size[i] <= offset[i+1] for all items
753    // This ensures views are non-overlapping and properly ordered for zero-copy-to-list
754    fn validate_monotonic_ends<O: IntegerPType, S: IntegerPType>(
755        offsets_slice: &[O],
756        sizes_slice: &[S],
757        len: usize,
758    ) -> VortexResult<()> {
759        let mut max_end = 0usize;
760
761        for i in 0..len {
762            let offset = offsets_slice[i].to_usize().unwrap_or(usize::MAX);
763            let size = sizes_slice[i].to_usize().unwrap_or(usize::MAX);
764
765            // Check that this view starts at or after the previous view ended
766            vortex_ensure!(
767                offset >= max_end,
768                "Zero-copy-to-list requires views to be non-overlapping and ordered: \
769                 view[{}] starts at {} but previous views extend to {}",
770                i,
771                offset,
772                max_end
773            );
774
775            // Update max_end for the next iteration
776            let end = offset.saturating_add(size);
777            max_end = max_end.max(end);
778        }
779
780        Ok(())
781    }
782
783    let offsets_dtype = offsets_primitive.dtype();
784    let sizes_dtype = sizes_primitive.dtype();
785    let len = offsets_primitive.len();
786
787    // Offsets/sizes are non-negative; reinterpret to unsigned (4x4 instead of 8x8).
788    let offsets_unsigned =
789        offsets_primitive.reinterpret_cast(offsets_dtype.as_ptype().to_unsigned());
790    let sizes_unsigned = sizes_primitive.reinterpret_cast(sizes_dtype.as_ptype().to_unsigned());
791
792    // Check that offset + size values are monotonic (no overlaps)
793    match_each_unsigned_integer_ptype!(offsets_unsigned.ptype(), |O| {
794        match_each_unsigned_integer_ptype!(sizes_unsigned.ptype(), |S| {
795            let offsets_slice = offsets_unsigned.as_slice::<O>();
796            let sizes_slice = sizes_unsigned.as_slice::<S>();
797
798            validate_monotonic_ends(offsets_slice, sizes_slice, len)?;
799        })
800    });
801
802    // TODO(connor)[ListView]: Making this allocation is expensive, but the more efficient
803    // implementation would be even more complicated than this. We could use a bit buffer denoting
804    // if positions in `elements` are used, and then additionally store a separate flag that tells
805    // us if a position is used more than once.
806    let mut element_references = vec![0u8; elements.len()];
807
808    fn count_references<O: IntegerPType, S: IntegerPType>(
809        element_references: &mut [u8],
810        offsets_primitive: PrimitiveArray,
811        sizes_primitive: PrimitiveArray,
812    ) {
813        let offsets_slice = offsets_primitive.as_slice::<O>();
814        let sizes_slice = sizes_primitive.as_slice::<S>();
815
816        // Note that we ignore nulls here, as the "null" view metadata must still maintain the same
817        // invariants as non-null views, even for a `bool` information.
818        for i in 0..offsets_slice.len() {
819            let offset: usize = offsets_slice[i].as_();
820            let size: usize = sizes_slice[i].as_();
821            for j in offset..offset + size {
822                element_references[j] = element_references[j].saturating_add(1);
823            }
824        }
825    }
826
827    match_each_unsigned_integer_ptype!(offsets_unsigned.ptype(), |O| {
828        match_each_unsigned_integer_ptype!(sizes_unsigned.ptype(), |S| {
829            count_references::<O, S>(&mut element_references, offsets_unsigned, sizes_unsigned);
830        })
831    });
832
833    // Allow leading and trailing unreferenced elements, but not gaps in the middle.
834    let leftmost_used = element_references
835        .iter()
836        .position(|&references| references != 0);
837    let rightmost_used = element_references
838        .iter()
839        .rposition(|&references| references != 0);
840
841    if let (Some(first_ref), Some(last_ref)) = (leftmost_used, rightmost_used) {
842        vortex_ensure!(
843            element_references[first_ref..=last_ref]
844                .iter()
845                .all(|&references| references != 0),
846            "found gap in elements array between first and last referenced elements"
847        );
848    }
849
850    vortex_ensure!(element_references.iter().all(|&references| references <= 1));
851
852    Ok(())
853}