vortex_array/builders/
listview.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! ListView Builder Implementation.
5//!
6//! A builder for [`ListViewArray`] that tracks both offsets and sizes.
7//!
8//! Unlike [`ListArray`] which only tracks offsets, [`ListViewArray`] stores both offsets and sizes
9//! in separate arrays for better compression.
10//!
11//! [`ListArray`]: crate::arrays::ListArray
12
13use std::sync::Arc;
14
15use vortex_dtype::{DType, IntegerPType, Nullability, match_each_integer_ptype};
16use vortex_error::{VortexExpect, VortexResult, vortex_ensure, vortex_panic};
17use vortex_mask::Mask;
18use vortex_scalar::{ListScalar, Scalar};
19
20use crate::array::{Array, ArrayRef, IntoArray};
21use crate::arrays::{ListViewArray, ListViewRebuildMode, PrimitiveArray};
22use crate::builders::lazy_null_builder::LazyBitBufferBuilder;
23use crate::builders::{
24    ArrayBuilder, DEFAULT_BUILDER_CAPACITY, PrimitiveBuilder, UninitRange, builder_with_capacity,
25};
26use crate::{Canonical, ToCanonical, compute};
27
28/// A builder for creating [`ListViewArray`] instances, parameterized by the [`IntegerPType`] of
29/// the `offsets` and the `sizes` builders.
30///
31/// This builder tracks both offsets and sizes using potentially different integer types for memory
32/// efficiency. For example, you might use `u64` for offsets but only `u8` for sizes if your lists
33/// are small.
34///
35/// Any combination of [`IntegerPType`] types are valid, as long as the type of `sizes` can fit into
36/// the type of `offsets`.
37pub struct ListViewBuilder<O: IntegerPType, S: IntegerPType> {
38    /// The [`DType`] of the [`ListViewArray`]. This **must** be a [`DType::List`].
39    dtype: DType,
40
41    /// The builder for the underlying elements of the [`ListArray`](crate::arrays::ListArray).
42    elements_builder: Box<dyn ArrayBuilder>,
43
44    /// The builder for the `offsets` into the `elements` array.
45    offsets_builder: PrimitiveBuilder<O>,
46
47    /// The builder for the `sizes` of each list view.
48    sizes_builder: PrimitiveBuilder<S>,
49
50    /// The null map builder of the [`ListViewArray`].
51    nulls: LazyBitBufferBuilder,
52}
53
54impl<O: IntegerPType, S: IntegerPType> ListViewBuilder<O, S> {
55    /// Creates a new `ListViewBuilder` with a capacity of [`DEFAULT_BUILDER_CAPACITY`].
56    pub fn new(element_dtype: Arc<DType>, nullability: Nullability) -> Self {
57        Self::with_capacity(
58            element_dtype,
59            nullability,
60            // We arbitrarily choose 2 times the number of list scalars for the capacity of the
61            // elements builder since we cannot know this ahead of time.
62            DEFAULT_BUILDER_CAPACITY * 2,
63            DEFAULT_BUILDER_CAPACITY,
64        )
65    }
66
67    /// Create a new [`ListViewArray`] builder with a with the given `capacity`, as well as an
68    /// initial capacity for the `elements` builder (since we cannot know that ahead of time solely
69    /// based on the outer array `capacity`).
70    ///
71    /// # Panics
72    ///
73    /// Panics if the size type `S` cannot fit within the offset type `O`.
74    pub fn with_capacity(
75        element_dtype: Arc<DType>,
76        nullability: Nullability,
77        elements_capacity: usize,
78        capacity: usize,
79    ) -> Self {
80        // Validate that size type's maximum value fits within offset type's maximum value.
81        // Since offsets are non-negative, we only need to check max values.
82        assert!(
83            S::max_value_as_u64() <= O::max_value_as_u64(),
84            "Size type {:?} (max offset {}) must fit within offset type {:?} (max offset {})",
85            S::PTYPE,
86            S::max_value_as_u64(),
87            O::PTYPE,
88            O::max_value_as_u64()
89        );
90
91        let elements_builder = builder_with_capacity(&element_dtype, elements_capacity);
92
93        let offsets_builder =
94            PrimitiveBuilder::<O>::with_capacity(Nullability::NonNullable, capacity);
95        let sizes_builder =
96            PrimitiveBuilder::<S>::with_capacity(Nullability::NonNullable, capacity);
97
98        let nulls = LazyBitBufferBuilder::new(capacity);
99
100        Self {
101            dtype: DType::List(element_dtype, nullability),
102            elements_builder,
103            offsets_builder,
104            sizes_builder,
105            nulls,
106        }
107    }
108
109    /// Append a list of values to the builder.
110    ///
111    /// This method extends the value builder with the provided values and records
112    /// the offset and size of the new list.
113    pub fn append_value(&mut self, value: ListScalar) -> VortexResult<()> {
114        let Some(elements) = value.elements() else {
115            // If `elements` is `None`, then the `value` is a null value.
116            vortex_ensure!(
117                self.dtype.is_nullable(),
118                "Cannot append null value to non-nullable list builder"
119            );
120            self.append_null();
121            return Ok(());
122        };
123
124        let curr_offset = self.elements_builder.len();
125        let num_elements = elements.len();
126
127        // We must assert this even in release mode to ensure that the safety comment in
128        // `finish_into_listview` is correct.
129        assert!(
130            ((curr_offset + num_elements) as u64) < O::max_value_as_u64(),
131            "appending this list would cause an offset overflow"
132        );
133
134        for scalar in elements {
135            self.elements_builder.append_scalar(&scalar)?;
136        }
137        self.nulls.append_non_null();
138
139        self.offsets_builder.append_value(
140            O::from_usize(curr_offset).vortex_expect("Failed to convert from usize to `O`"),
141        );
142        self.sizes_builder.append_value(
143            S::from_usize(num_elements).vortex_expect("Failed to convert from usize to `S`"),
144        );
145
146        Ok(())
147    }
148
149    /// Finishes the builder directly into a [`ListViewArray`].
150    pub fn finish_into_listview(&mut self) -> ListViewArray {
151        debug_assert_eq!(self.offsets_builder.len(), self.sizes_builder.len());
152        debug_assert_eq!(self.offsets_builder.len(), self.nulls.len());
153
154        let elements = self.elements_builder.finish();
155        let offsets = self.offsets_builder.finish();
156        let sizes = self.sizes_builder.finish();
157        let validity = self.nulls.finish_with_nullability(self.dtype.nullability());
158
159        // SAFETY:
160        // - Both the offsets and the sizes are non-nullable.
161        // - The offsets, sizes, and validity have the same length since we always appended the same
162        //   amount.
163        // - We checked on construction that the sizes type fits into the offsets.
164        // - In every method that adds values to this builder (`append_value`, `append_scalar`, and
165        //   `extend_from_array_unchecked`), we checked that `offset + size` does not overflow.
166        // - We constructed everything in a way that builds the `ListViewArray` similar to the shape
167        //   of a `ListArray`, so we know the resulting array is zero-copyable to a `ListArray`.
168        unsafe {
169            ListViewArray::new_unchecked(elements, offsets, sizes, validity)
170                .with_zero_copy_to_list(true)
171        }
172    }
173
174    /// The [`DType`] of the inner elements. Note that this is **not** the same as the [`DType`] of
175    /// the outer `FixedSizeList`.
176    pub fn element_dtype(&self) -> &DType {
177        let DType::List(element_dtype, ..) = &self.dtype else {
178            vortex_panic!("`ListViewBuilder` has an incorrect dtype: {}", self.dtype);
179        };
180
181        element_dtype
182    }
183}
184
185impl<O: IntegerPType, S: IntegerPType> ArrayBuilder for ListViewBuilder<O, S> {
186    fn as_any(&self) -> &dyn std::any::Any {
187        self
188    }
189
190    fn as_any_mut(&mut self) -> &mut dyn std::any::Any {
191        self
192    }
193
194    fn dtype(&self) -> &DType {
195        &self.dtype
196    }
197
198    fn len(&self) -> usize {
199        self.offsets_builder.len()
200    }
201
202    fn append_zeros(&mut self, n: usize) {
203        debug_assert_eq!(self.offsets_builder.len(), self.sizes_builder.len());
204        debug_assert_eq!(self.offsets_builder.len(), self.nulls.len());
205
206        // Get the current position in the elements array.
207        let curr_offset = self.elements_builder.len();
208
209        // Since we consider the "zero" element of a list an empty list, we simply update the
210        // `offsets` and `sizes` metadata to add an empty list.
211        for _ in 0..n {
212            self.offsets_builder.append_value(
213                O::from_usize(curr_offset).vortex_expect("Failed to convert from usize to `O`"),
214            );
215            self.sizes_builder.append_value(S::zero());
216        }
217
218        self.nulls.append_n_non_nulls(n);
219    }
220
221    unsafe fn append_nulls_unchecked(&mut self, n: usize) {
222        debug_assert_eq!(self.offsets_builder.len(), self.sizes_builder.len());
223        debug_assert_eq!(self.offsets_builder.len(), self.nulls.len());
224
225        // Get the current position in the elements array.
226        let curr_offset = self.elements_builder.len();
227
228        // A null list can have any representation, but we choose to use the zero representation.
229        for _ in 0..n {
230            self.offsets_builder.append_value(
231                O::from_usize(curr_offset).vortex_expect("Failed to convert from usize to `O`"),
232            );
233            self.sizes_builder.append_value(S::zero());
234        }
235
236        // This is the only difference from `append_zeros`.
237        self.nulls.append_n_nulls(n);
238    }
239
240    fn append_scalar(&mut self, scalar: &Scalar) -> VortexResult<()> {
241        vortex_ensure!(
242            scalar.dtype() == self.dtype(),
243            "ListViewBuilder expected scalar with dtype {:?}, got {:?}",
244            self.dtype(),
245            scalar.dtype()
246        );
247
248        let list_scalar = scalar.as_list();
249        self.append_value(list_scalar)
250    }
251
252    unsafe fn extend_from_array_unchecked(&mut self, array: &dyn Array) {
253        let listview = array.to_listview();
254        if listview.is_empty() {
255            return;
256        }
257
258        // If we do not have the guarantee that the array is zero-copyable to a list, then we have
259        // to manually append each scalar.
260        if !listview.is_zero_copy_to_list() {
261            for i in 0..listview.len() {
262                let list = listview.scalar_at(i);
263
264                self.append_scalar(&list)
265                    .vortex_expect("was unable to extend the `ListViewBuilder`")
266            }
267
268            return;
269        }
270
271        // Otherwise, after removing any leading and trailing elements, we can simply bulk append
272        // the entire array.
273        let listview = listview.rebuild(ListViewRebuildMode::MakeExact);
274        debug_assert!(listview.is_zero_copy_to_list());
275
276        self.nulls.append_validity_mask(array.validity_mask());
277
278        // Bulk append the new elements (which should have no gaps or overlaps).
279        let old_elements_len = self.elements_builder.len();
280        self.elements_builder
281            .reserve_exact(listview.elements().len());
282        self.elements_builder.extend_from_array(listview.elements());
283        let new_elements_len = self.elements_builder.len();
284
285        // Reserve enough space for the new views.
286        let extend_length = listview.len();
287        self.sizes_builder.reserve_exact(extend_length);
288        self.offsets_builder.reserve_exact(extend_length);
289
290        // The incoming sizes might have a different type than the builder, so we need to cast.
291        let cast_sizes = compute::cast(listview.sizes(), self.sizes_builder.dtype()).vortex_expect(
292            "was somehow unable to cast the new sizes to the type of the builder sizes",
293        );
294        self.sizes_builder.extend_from_array(cast_sizes.as_ref());
295
296        // Now we need to adjust all of the offsets by adding the current number of elements in the
297        // builder.
298
299        let uninit_range = self.offsets_builder.uninit_range(extend_length);
300
301        // This should be cheap because we didn't compress after rebuilding.
302        let new_offsets = listview.offsets().to_primitive();
303
304        match_each_integer_ptype!(new_offsets.ptype(), |A| {
305            adjust_and_extend_offsets::<O, A>(
306                uninit_range,
307                new_offsets,
308                old_elements_len,
309                new_elements_len,
310            );
311        })
312    }
313
314    fn reserve_exact(&mut self, capacity: usize) {
315        self.elements_builder.reserve_exact(capacity * 2);
316        self.offsets_builder.reserve_exact(capacity);
317        self.sizes_builder.reserve_exact(capacity);
318        self.nulls.reserve_exact(capacity);
319    }
320
321    unsafe fn set_validity_unchecked(&mut self, validity: Mask) {
322        self.nulls = LazyBitBufferBuilder::new(validity.len());
323        self.nulls.append_validity_mask(validity);
324    }
325
326    fn finish(&mut self) -> ArrayRef {
327        self.finish_into_listview().into_array()
328    }
329
330    fn finish_into_canonical(&mut self) -> Canonical {
331        Canonical::List(self.finish_into_listview())
332    }
333}
334
335/// Given new offsets, adds them to the `UninitRange` after adding the `old_elements_len` to each
336/// offset.
337fn adjust_and_extend_offsets<'a, O: IntegerPType, A: IntegerPType>(
338    mut uninit_range: UninitRange<'a, O>,
339    new_offsets: PrimitiveArray,
340    old_elements_len: usize,
341    new_elements_len: usize,
342) {
343    let new_offsets_slice = new_offsets.as_slice::<A>();
344    let old_elements_len = O::from_usize(old_elements_len)
345        .vortex_expect("the old elements length did not fit into the offset type (impossible)");
346    let new_elements_len = O::from_usize(new_elements_len)
347        .vortex_expect("the current elements length did not fit into the offset type (impossible)");
348
349    for i in 0..uninit_range.len() {
350        let new_offset = O::from_usize(
351            new_offsets_slice[i]
352                .to_usize()
353                .vortex_expect("Offsets must always fit in usize"),
354        )
355        .vortex_expect("New offset somehow did not fit into the builder's offset type");
356
357        // We have to check this even in release mode to ensure the final `new_unchecked`
358        // construction in `finish_into_listview` is valid.
359        let adjusted_new_offset = new_offset + old_elements_len;
360        assert!(
361            adjusted_new_offset <= new_elements_len,
362            "[{i}/{}]: {new_offset} + {old_elements_len} \
363                = {adjusted_new_offset} <= {new_elements_len} failed",
364            uninit_range.len()
365        );
366
367        uninit_range.set_value(i, adjusted_new_offset);
368    }
369
370    // SAFETY: We have set all the values in the range, and since `offsets` are non-nullable, we are
371    // done.
372    unsafe { uninit_range.finish() };
373}
374
375#[cfg(test)]
376mod tests {
377    use std::sync::Arc;
378
379    use vortex_dtype::DType;
380    use vortex_dtype::Nullability::{NonNullable, Nullable};
381    use vortex_dtype::PType::I32;
382    use vortex_scalar::Scalar;
383
384    use super::ListViewBuilder;
385    use crate::IntoArray;
386    use crate::array::Array;
387    use crate::arrays::ListArray;
388    use crate::builders::ArrayBuilder;
389    use crate::vtable::ValidityHelper;
390
391    #[test]
392    fn test_empty() {
393        let mut builder =
394            ListViewBuilder::<u32, u32>::with_capacity(Arc::new(I32.into()), NonNullable, 0, 0);
395
396        let listview = builder.finish();
397        assert_eq!(listview.len(), 0);
398    }
399
400    #[test]
401    fn test_basic_append_and_nulls() {
402        let dtype: Arc<DType> = Arc::new(I32.into());
403        let mut builder = ListViewBuilder::<u32, u32>::with_capacity(dtype.clone(), Nullable, 0, 0);
404
405        // Append a regular list.
406        builder
407            .append_value(
408                Scalar::list(
409                    dtype.clone(),
410                    vec![1i32.into(), 2i32.into(), 3i32.into()],
411                    NonNullable,
412                )
413                .as_list(),
414            )
415            .unwrap();
416
417        // Append an empty list.
418        builder
419            .append_value(Scalar::list_empty(dtype.clone(), NonNullable).as_list())
420            .unwrap();
421
422        // Append a null list.
423        builder.append_null();
424
425        // Append another regular list.
426        builder
427            .append_value(
428                Scalar::list(dtype, vec![4i32.into(), 5i32.into()], NonNullable).as_list(),
429            )
430            .unwrap();
431
432        let listview = builder.finish_into_listview();
433        assert_eq!(listview.len(), 4);
434
435        // Check first list: [1, 2, 3].
436        let first_list = listview.list_elements_at(0);
437        assert_eq!(first_list.len(), 3);
438        assert_eq!(first_list.scalar_at(0), 1i32.into());
439        assert_eq!(first_list.scalar_at(1), 2i32.into());
440        assert_eq!(first_list.scalar_at(2), 3i32.into());
441
442        // Check empty list.
443        let empty_list = listview.list_elements_at(1);
444        assert_eq!(empty_list.len(), 0);
445
446        // Check null list.
447        assert!(!listview.validity().is_valid(2));
448
449        // Check last list: [4, 5].
450        let last_list = listview.list_elements_at(3);
451        assert_eq!(last_list.len(), 2);
452        assert_eq!(last_list.scalar_at(0), 4i32.into());
453        assert_eq!(last_list.scalar_at(1), 5i32.into());
454    }
455
456    #[test]
457    fn test_different_offset_size_types() {
458        // Test u32 offsets with u8 sizes.
459        let dtype: Arc<DType> = Arc::new(I32.into());
460        let mut builder =
461            ListViewBuilder::<u32, u8>::with_capacity(dtype.clone(), NonNullable, 0, 0);
462
463        builder
464            .append_value(
465                Scalar::list(dtype.clone(), vec![1i32.into(), 2i32.into()], NonNullable).as_list(),
466            )
467            .unwrap();
468
469        builder
470            .append_value(
471                Scalar::list(
472                    dtype,
473                    vec![3i32.into(), 4i32.into(), 5i32.into()],
474                    NonNullable,
475                )
476                .as_list(),
477            )
478            .unwrap();
479
480        let listview = builder.finish_into_listview();
481        assert_eq!(listview.len(), 2);
482
483        // Verify first list: [1, 2].
484        let first = listview.list_elements_at(0);
485        assert_eq!(first.scalar_at(0), 1i32.into());
486        assert_eq!(first.scalar_at(1), 2i32.into());
487
488        // Verify second list: [3, 4, 5].
489        let second = listview.list_elements_at(1);
490        assert_eq!(second.scalar_at(0), 3i32.into());
491        assert_eq!(second.scalar_at(1), 4i32.into());
492        assert_eq!(second.scalar_at(2), 5i32.into());
493
494        // Test u64 offsets with u16 sizes.
495        let dtype2: Arc<DType> = Arc::new(I32.into());
496        let mut builder2 =
497            ListViewBuilder::<u64, u16>::with_capacity(dtype2.clone(), NonNullable, 0, 0);
498
499        for i in 0..5 {
500            builder2
501                .append_value(
502                    Scalar::list(dtype2.clone(), vec![(i * 10).into()], NonNullable).as_list(),
503                )
504                .unwrap();
505        }
506
507        let listview2 = builder2.finish_into_listview();
508        assert_eq!(listview2.len(), 5);
509
510        // Verify the values: [0], [10], [20], [30], [40].
511        for i in 0..5i32 {
512            let list = listview2.list_elements_at(i as usize);
513            assert_eq!(list.len(), 1);
514            assert_eq!(list.scalar_at(0), (i * 10).into());
515        }
516    }
517
518    #[test]
519    fn test_builder_trait_methods() {
520        let dtype: Arc<DType> = Arc::new(I32.into());
521        let mut builder = ListViewBuilder::<u32, u32>::with_capacity(dtype.clone(), Nullable, 0, 0);
522
523        // Test append_zeros (creates empty lists).
524        builder.append_zeros(2);
525        assert_eq!(builder.len(), 2);
526
527        // Test append_nulls.
528        unsafe {
529            builder.append_nulls_unchecked(2);
530        }
531        assert_eq!(builder.len(), 4);
532
533        // Test append_scalar.
534        let list_scalar = Scalar::list(dtype, vec![10i32.into(), 20i32.into()], Nullable);
535        builder.append_scalar(&list_scalar).unwrap();
536        assert_eq!(builder.len(), 5);
537
538        let listview = builder.finish_into_listview();
539        assert_eq!(listview.len(), 5);
540
541        // First two are empty lists (from append_zeros).
542        assert_eq!(listview.list_elements_at(0).len(), 0);
543        assert_eq!(listview.list_elements_at(1).len(), 0);
544
545        // Next two are nulls.
546        assert!(!listview.validity().is_valid(2));
547        assert!(!listview.validity().is_valid(3));
548
549        // Last is the regular list: [10, 20].
550        let last_list = listview.list_elements_at(4);
551        assert_eq!(last_list.len(), 2);
552        assert_eq!(last_list.scalar_at(0), 10i32.into());
553        assert_eq!(last_list.scalar_at(1), 20i32.into());
554    }
555
556    #[test]
557    fn test_extend_from_array() {
558        let dtype: Arc<DType> = Arc::new(I32.into());
559
560        // Create a source ListArray.
561        let source = ListArray::from_iter_opt_slow::<u32, _, Vec<i32>>(
562            [Some(vec![1, 2, 3]), None, Some(vec![4, 5])],
563            Arc::new(I32.into()),
564        )
565        .unwrap();
566
567        let mut builder = ListViewBuilder::<u32, u32>::with_capacity(dtype.clone(), Nullable, 0, 0);
568
569        // Add initial data.
570        builder
571            .append_value(Scalar::list(dtype, vec![0i32.into()], NonNullable).as_list())
572            .unwrap();
573
574        // Extend from the ListArray.
575        builder.extend_from_array(&source.into_array());
576
577        // Extend from empty array (should be no-op).
578        let empty_source = ListArray::from_iter_opt_slow::<u32, _, Vec<i32>>(
579            std::iter::empty::<Option<Vec<i32>>>(),
580            Arc::new(I32.into()),
581        )
582        .unwrap();
583        builder.extend_from_array(&empty_source.into_array());
584
585        let listview = builder.finish_into_listview();
586        assert_eq!(listview.len(), 4);
587
588        // Check the extended data.
589        // First list: [0] (initial data).
590        let first = listview.list_elements_at(0);
591        assert_eq!(first.len(), 1);
592        assert_eq!(first.scalar_at(0), 0i32.into());
593
594        // Second list: [1, 2, 3] (from source).
595        let second = listview.list_elements_at(1);
596        assert_eq!(second.len(), 3);
597        assert_eq!(second.scalar_at(0), 1i32.into());
598        assert_eq!(second.scalar_at(1), 2i32.into());
599        assert_eq!(second.scalar_at(2), 3i32.into());
600
601        // Third list: null (from source).
602        assert!(!listview.validity().is_valid(2));
603
604        // Fourth list: [4, 5] (from source).
605        let fourth = listview.list_elements_at(3);
606        assert_eq!(fourth.len(), 2);
607        assert_eq!(fourth.scalar_at(0), 4i32.into());
608        assert_eq!(fourth.scalar_at(1), 5i32.into());
609    }
610
611    #[test]
612    fn test_error_append_null_to_non_nullable() {
613        let dtype: Arc<DType> = Arc::new(I32.into());
614        let mut builder =
615            ListViewBuilder::<u32, u32>::with_capacity(dtype.clone(), NonNullable, 0, 0);
616
617        // Create a null list with nullable type (since Scalar::null requires nullable type).
618        let null_scalar = Scalar::null(DType::List(dtype, Nullable));
619        let null_list = null_scalar.as_list();
620
621        // This should fail because we're trying to append a null to a non-nullable builder.
622        let result = builder.append_value(null_list);
623        assert!(result.is_err());
624        assert!(
625            result
626                .unwrap_err()
627                .to_string()
628                .contains("null value to non-nullable")
629        );
630    }
631
632    #[test]
633    #[should_panic(
634        expected = "Size type I32 (max offset 2147483647) must fit within offset type I16 (max offset 32767)"
635    )]
636    fn test_error_invalid_type_combination() {
637        let dtype: Arc<DType> = Arc::new(I32.into());
638        // This should panic because i32 (4 bytes) cannot fit within i16 (2 bytes).
639        let _builder = ListViewBuilder::<i16, i32>::with_capacity(dtype, NonNullable, 0, 0);
640    }
641}