vortex_array/arrays/listview/array.rs
1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::sync::Arc;
5
6use num_traits::AsPrimitive;
7use vortex_dtype::{DType, IntegerPType, match_each_integer_ptype};
8use vortex_error::{VortexExpect, VortexResult, vortex_ensure, vortex_err};
9
10use crate::arrays::PrimitiveVTable;
11use crate::stats::ArrayStats;
12use crate::validity::Validity;
13use crate::{Array, ArrayRef, ToCanonical};
14
15/// The canonical encoding for variable-length list arrays.
16///
17/// The `ListViewArray` encoding differs from [`ListArray`] in that it stores a child `sizes` array
18/// in addition to a child `offsets` array (which is the _only_ child in [`ListArray`]).
19///
20/// In the past, we used [`ListArray`] as the canonical encoding for [`DType::List`], but we have
21/// since migrated to `ListViewArray` for a few reasons:
22///
23/// - Enables better SIMD vectorization (no sequential dependency when reading `offsets`)
24/// - Allows out-of-order offsets for better compression (we can shuffle the buffers)
25/// - Supports different integer types for offsets vs sizes
26///
27/// It is worth mentioning that this encoding mirrors Apache Arrow's `ListView` array type, but does
28/// not exactly mirror the similar type found in DuckDB and Velox, which stores the pair of offset
29/// and size in a row-major fashion rather than column-major. More specifically, the row-major
30/// layout has a single child array with alternating offset and size next to each other.
31///
32/// We choose the column-major layout as it allows better compressability, as well as using
33/// different (logical) integer widths for our `offsets` and `sizes` buffers (note that the
34/// compressor will likely compress to a different bit-packed width, but this is speaking strictly
35/// about flexibility in the logcial type).
36///
37/// # Examples
38///
39/// ```
40/// # use vortex_array::arrays::{ListViewArray, PrimitiveArray};
41/// # use vortex_array::validity::Validity;
42/// # use vortex_array::IntoArray;
43/// # use vortex_buffer::buffer;
44/// # use std::sync::Arc;
45/// #
46/// // Create a list view array representing [[3, 4], [1], [2, 3]].
47/// // Note: Unlike `ListArray`, offsets don't need to be monotonic.
48///
49/// let elements = buffer![1i32, 2, 3, 4, 5].into_array();
50/// let offsets = buffer![2u32, 0, 1].into_array(); // Out-of-order offsets
51/// let sizes = buffer![2u32, 1, 2].into_array(); // The sizes cause overlaps
52///
53/// let list_view = ListViewArray::try_new(
54/// elements.into_array(),
55/// offsets.into_array(),
56/// sizes.into_array(),
57/// Validity::NonNullable,
58/// ).unwrap();
59///
60/// assert_eq!(list_view.len(), 3);
61///
62/// // Access individual lists
63/// let first_list = list_view.list_elements_at(0);
64/// assert_eq!(first_list.len(), 2);
65/// // First list contains elements[2..4] = [3, 4]
66///
67/// let first_offset = list_view.offset_at(0);
68/// let first_size = list_view.size_at(0);
69/// assert_eq!(first_offset, 2);
70/// assert_eq!(first_size, 2);
71/// ```
72///
73/// [`ListArray`]: crate::arrays::ListArray
74#[derive(Clone, Debug)]
75pub struct ListViewArray {
76 /// The [`DType`] of the list array.
77 ///
78 /// This type **must** be the variant [`DType::List`].
79 pub(super) dtype: DType,
80
81 /// The `elements` data array, where each list scalar is a _slice_ of the `elements` array, and
82 /// each inner list element is a _scalar_ of the `elements` array.
83 elements: ArrayRef,
84
85 /// The `offsets` array indicating the start position of each list in elements.
86 ///
87 /// Since we also store `sizes`, this `offsets` field is allowed to be stored out-of-order
88 /// (which is different from [`ListArray`](crate::arrays::ListArray)),
89 offsets: ArrayRef,
90
91 /// The `sizes` array indicating the length of each list.
92 ///
93 /// This field is intended to be paired with a corresponding offset to determine the list scalar
94 /// we want to access.
95 sizes: ArrayRef,
96
97 /// The validity / null map of the array.
98 ///
99 /// Note that this null map refers to which list scalars are null, **not** which sub-elements of
100 /// list scalars are null. The `elements` array will track individual value nullability.
101 pub(super) validity: Validity,
102
103 /// The stats for this array.
104 pub(super) stats_set: ArrayStats,
105}
106
107impl ListViewArray {
108 /// Creates a new [`ListViewArray`].
109 ///
110 /// # Panics
111 ///
112 /// Panics if the provided components do not satisfy the invariants documented
113 /// in [`ListViewArray::new_unchecked`].
114 pub fn new(elements: ArrayRef, offsets: ArrayRef, sizes: ArrayRef, validity: Validity) -> Self {
115 Self::try_new(elements, offsets, sizes, validity)
116 .vortex_expect("ListViewArray construction failed")
117 }
118
119 /// Constructs a new `ListViewArray`.
120 ///
121 /// # Errors
122 ///
123 /// Returns an error if the provided components do not satisfy the invariants documented
124 /// in [`ListViewArray::new_unchecked`].
125 pub fn try_new(
126 elements: ArrayRef,
127 offsets: ArrayRef,
128 sizes: ArrayRef,
129 validity: Validity,
130 ) -> VortexResult<Self> {
131 Self::validate(&elements, &offsets, &sizes, &validity)?;
132
133 // SAFETY: validate ensures all invariants are met.
134 Ok(unsafe { Self::new_unchecked(elements, offsets, sizes, validity) })
135 }
136
137 /// Creates a new [`ListViewArray`] without validation.
138 ///
139 /// # Safety
140 ///
141 /// The caller must ensure all of the following invariants are satisfied:
142 ///
143 /// - `offsets` and `sizes` must be non-nullable integer arrays.
144 /// - `offsets` and `sizes` must have the same length.
145 /// - Size integer width must be smaller than or equal to offset type (to prevent overflow).
146 /// - For each `i`, `offsets[i] + sizes[i]` must not overflow and must be `<= elements.len()`.
147 /// - If validity is an array, its length must equal `offsets.len()`.
148 pub unsafe fn new_unchecked(
149 elements: ArrayRef,
150 offsets: ArrayRef,
151 sizes: ArrayRef,
152 validity: Validity,
153 ) -> Self {
154 #[cfg(debug_assertions)]
155 Self::validate(&elements, &offsets, &sizes, &validity)
156 .vortex_expect("[Debug Assertion]: Invalid `ListViewArray` parameters");
157
158 Self {
159 dtype: DType::List(Arc::new(elements.dtype().clone()), validity.nullability()),
160 elements,
161 offsets,
162 sizes,
163 validity,
164 stats_set: Default::default(),
165 }
166 }
167
168 /// Validates the components that would be used to create a [`ListViewArray`].
169 pub fn validate(
170 elements: &dyn Array,
171 offsets: &dyn Array,
172 sizes: &dyn Array,
173 validity: &Validity,
174 ) -> VortexResult<()> {
175 // Check that offsets and sizes are integer arrays and non-nullable.
176 vortex_ensure!(
177 offsets.dtype().is_int() && !offsets.dtype().is_nullable(),
178 "offsets must be non-nullable integer array, got {}",
179 offsets.dtype()
180 );
181 vortex_ensure!(
182 sizes.dtype().is_int() && !sizes.dtype().is_nullable(),
183 "sizes must be non-nullable integer array, got {}",
184 sizes.dtype()
185 );
186
187 // Check that they have the same length.
188 vortex_ensure!(
189 offsets.len() == sizes.len(),
190 "offsets and sizes must have the same length, got {} and {}",
191 offsets.len(),
192 sizes.len()
193 );
194
195 // Check that the size type can fit within the offset type to prevent overflows.
196 let size_ptype = sizes.dtype().as_ptype();
197 let offset_ptype = offsets.dtype().as_ptype();
198 let size_max = sizes.dtype().as_ptype().max_value_as_u64();
199 let offset_max = offsets.dtype().as_ptype().max_value_as_u64();
200
201 vortex_ensure!(
202 size_max <= offset_max,
203 "size type {:?} (max {}) must fit within offset type {:?} (max {})",
204 size_ptype,
205 size_max,
206 offset_ptype,
207 offset_max
208 );
209
210 let offsets_primitive = offsets.to_primitive();
211 let sizes_primitive = sizes.to_primitive();
212
213 // Validate the `offsets` and `sizes` arrays.
214 match_each_integer_ptype!(offset_ptype, |O| {
215 match_each_integer_ptype!(size_ptype, |S| {
216 let offsets_slice = offsets_primitive.as_slice::<O>();
217 let sizes_slice = sizes_primitive.as_slice::<S>();
218
219 validate_offsets_and_sizes::<O, S>(
220 offsets_slice,
221 sizes_slice,
222 elements.len() as u64,
223 )?;
224 })
225 });
226
227 // If a validity array is present, it must be the same length as the ListView.
228 if let Some(validity_len) = validity.maybe_len() {
229 vortex_ensure!(
230 validity_len == offsets.len(),
231 "validity with size {validity_len} does not match array size {}",
232 offsets.len()
233 );
234 }
235
236 Ok(())
237 }
238
239 /// Returns the offset at the given index.
240 ///
241 /// Note that it is possible the corresponding list view is null (which is only defined by the
242 /// validity map). Regardless, we are still guaranteed that this offset is valid by the
243 /// invariants of [`ListViewArray`].
244 pub fn offset_at(&self, index: usize) -> usize {
245 assert!(
246 index < self.len(),
247 "Index {index} out of bounds 0..{}",
248 self.len()
249 );
250
251 // Fast path for `PrimitiveArray`.
252 self.offsets
253 .as_opt::<PrimitiveVTable>()
254 .map(|p| match_each_integer_ptype!(p.ptype(), |P| { p.as_slice::<P>()[index].as_() }))
255 .unwrap_or_else(|| {
256 // Slow path: use `scalar_at` if we can't downcast directly to `PrimitiveArray`.
257 self.offsets
258 .scalar_at(index)
259 .as_primitive()
260 .as_::<usize>()
261 .vortex_expect("offset must fit in usize")
262 })
263 }
264
265 /// Returns the size at the given index.
266 ///
267 /// Note that it is possible the corresponding list view is null (which is only defined by the
268 /// validity map). Regardless, we are still guaranteed that this size is valid by the invariants
269 /// of [`ListViewArray`].
270 pub fn size_at(&self, index: usize) -> usize {
271 assert!(
272 index < self.len(),
273 "Index {} out of bounds 0..{}",
274 index,
275 self.len()
276 );
277
278 // Fast path for `PrimitiveArray`.
279 self.sizes
280 .as_opt::<PrimitiveVTable>()
281 .map(|p| match_each_integer_ptype!(p.ptype(), |P| { p.as_slice::<P>()[index].as_() }))
282 .unwrap_or_else(|| {
283 // Slow path: use `scalar_at` if we can't downcast directly to `PrimitiveArray`.
284 self.sizes
285 .scalar_at(index)
286 .as_primitive()
287 .as_::<usize>()
288 .vortex_expect("size must fit in usize")
289 })
290 }
291
292 /// Returns the elements at the given index from the list array.
293 pub fn list_elements_at(&self, index: usize) -> ArrayRef {
294 let offset = self.offset_at(index);
295 let size = self.size_at(index);
296 self.elements().slice(offset..offset + size)
297 }
298
299 /// Returns the offsets array.
300 pub fn offsets(&self) -> &ArrayRef {
301 &self.offsets
302 }
303
304 /// Returns the sizes array.
305 pub fn sizes(&self) -> &ArrayRef {
306 &self.sizes
307 }
308
309 /// Returns the elements array.
310 pub fn elements(&self) -> &ArrayRef {
311 &self.elements
312 }
313}
314
315/// Helper function to validate `offsets` and `sizes` with specific types.
316fn validate_offsets_and_sizes<O, S>(
317 offsets_slice: &[O],
318 sizes_slice: &[S],
319 elements_len: u64,
320) -> VortexResult<()>
321where
322 O: IntegerPType,
323 S: IntegerPType,
324{
325 debug_assert_eq!(offsets_slice.len(), sizes_slice.len());
326
327 #[allow(clippy::absurd_extreme_comparisons, unused_comparisons)]
328 for i in 0..offsets_slice.len() {
329 let offset = offsets_slice[i];
330 let size = sizes_slice[i];
331
332 vortex_ensure!(offset >= O::zero(), "cannot have negative offsets");
333 vortex_ensure!(size >= S::zero(), "cannot have negative size");
334
335 let offset_u64 = offset
336 .to_u64()
337 .ok_or_else(|| vortex_err!("offset[{i}] = {offset:?} cannot be converted to u64"))?;
338
339 let size_u64 = size
340 .to_u64()
341 .ok_or_else(|| vortex_err!("size[{i}] = {size:?} cannot be converted to u64"))?;
342
343 // Check for overflow when adding offset + size.
344 let end = offset_u64.checked_add(size_u64).ok_or_else(|| {
345 vortex_err!("offset[{i}] ({offset_u64}) + size[{i}] ({size_u64}) would overflow u64")
346 })?;
347
348 vortex_ensure!(
349 end <= elements_len,
350 "offset[{i}] + size[{i}] = {end} exceeds elements length {elements_len}",
351 );
352 }
353
354 Ok(())
355}