vortex_array/builders/
list.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::any::Any;
5use std::sync::Arc;
6
7use vortex_dtype::Nullability::NonNullable;
8use vortex_dtype::{DType, IntegerPType, Nullability, match_each_integer_ptype};
9use vortex_error::{VortexExpect, VortexResult, vortex_bail, vortex_ensure, vortex_panic};
10use vortex_mask::Mask;
11use vortex_scalar::{ListScalar, Scalar};
12
13use crate::arrays::{ListArray, list_view_from_list};
14use crate::builders::{
15    ArrayBuilder, DEFAULT_BUILDER_CAPACITY, LazyNullBufferBuilder, PrimitiveBuilder,
16    builder_with_capacity,
17};
18use crate::canonical::{Canonical, ToCanonical};
19use crate::{Array, ArrayRef, IntoArray};
20
21/// The builder for building a [`ListArray`], parametrized by the [`IntegerPType`] of the `offsets`
22/// builder.
23pub struct ListBuilder<O: IntegerPType> {
24    /// The [`DType`] of the [`ListArray`]. This **must** be a [`DType::List`].
25    dtype: DType,
26
27    /// The builder for the underlying elements of the [`ListArray`].
28    elements_builder: Box<dyn ArrayBuilder>,
29
30    /// The builder for the `offsets` into the `elements` array.
31    offsets_builder: PrimitiveBuilder<O>,
32
33    /// The null map builder of the [`ListArray`].
34    nulls: LazyNullBufferBuilder,
35}
36
37impl<O: IntegerPType> ListBuilder<O> {
38    /// Creates a new `ListBuilder` with a capacity of [`DEFAULT_BUILDER_CAPACITY`].
39    pub fn new(value_dtype: Arc<DType>, nullability: Nullability) -> Self {
40        Self::with_capacity(
41            value_dtype,
42            nullability,
43            // We arbitrarily choose 2 times the number of list scalars for the capacity of the
44            // elements builder since we cannot know this ahead of time.
45            DEFAULT_BUILDER_CAPACITY * 2,
46            DEFAULT_BUILDER_CAPACITY,
47        )
48    }
49
50    /// Create a new [`ListArray`] builder with a with the given `capacity`, as well as an initial
51    /// capacity for the `elements` builder (since we cannot know that ahead of time solely based on
52    /// the outer array `capacity`).
53    ///
54    /// # Notes
55    ///
56    /// The number of offsets is one more than the length (# of list scalars) in the array.
57    pub fn with_capacity(
58        value_dtype: Arc<DType>,
59        nullability: Nullability,
60        elements_capacity: usize,
61        capacity: usize,
62    ) -> Self {
63        let elements_builder = builder_with_capacity(value_dtype.as_ref(), elements_capacity);
64        let mut offsets_builder = PrimitiveBuilder::<O>::with_capacity(NonNullable, capacity + 1);
65
66        // The first offset is always 0 and represents an empty list.
67        offsets_builder.append_zero();
68
69        Self {
70            elements_builder,
71            offsets_builder,
72            nulls: LazyNullBufferBuilder::new(capacity),
73            dtype: DType::List(value_dtype, nullability),
74        }
75    }
76
77    /// Appends a list `value` to the builder.
78    pub fn append_value(&mut self, value: ListScalar) -> VortexResult<()> {
79        match value.elements() {
80            None => {
81                if self.dtype.nullability() == NonNullable {
82                    vortex_bail!("Cannot append null value to non-nullable list");
83                }
84                self.append_null();
85            }
86            Some(elements) => {
87                for scalar in elements {
88                    // TODO(connor): This is slow, we should be able to append multiple values at
89                    // once, or the list scalar should hold an Array
90                    self.elements_builder.append_scalar(&scalar)?;
91                }
92
93                self.nulls.append_non_null();
94                self.offsets_builder.append_value(
95                    O::from_usize(self.elements_builder.len())
96                        .vortex_expect("Failed to convert from usize to O"),
97                );
98            }
99        }
100
101        Ok(())
102    }
103
104    /// Finishes the builder directly into a [`ListArray`].
105    pub fn finish_into_list(&mut self) -> ListArray {
106        assert_eq!(
107            self.offsets_builder.len(),
108            self.nulls.len() + 1,
109            "offsets length must be one more than nulls length."
110        );
111
112        ListArray::try_new(
113            self.elements_builder.finish(),
114            self.offsets_builder.finish(),
115            self.nulls.finish_with_nullability(self.dtype.nullability()),
116        )
117        .vortex_expect("Buffer, offsets, and validity must have same length.")
118    }
119
120    /// The [`DType`] of the inner elements. Note that this is **not** the same as the [`DType`] of
121    /// the outer `List`.
122    pub fn element_dtype(&self) -> &DType {
123        let DType::List(element_dtype, _) = &self.dtype else {
124            vortex_panic!("`ListBuilder` has an incorrect dtype: {}", self.dtype);
125        };
126
127        element_dtype
128    }
129}
130
131impl<O: IntegerPType> ArrayBuilder for ListBuilder<O> {
132    fn as_any(&self) -> &dyn Any {
133        self
134    }
135
136    fn as_any_mut(&mut self) -> &mut dyn Any {
137        self
138    }
139
140    fn dtype(&self) -> &DType {
141        &self.dtype
142    }
143
144    fn len(&self) -> usize {
145        self.nulls.len()
146    }
147
148    fn append_zeros(&mut self, n: usize) {
149        let curr_len = self.elements_builder.len();
150        for _ in 0..n {
151            self.offsets_builder.append_value(
152                O::from_usize(curr_len).vortex_expect("Failed to convert from usize to <O>"),
153            )
154        }
155        self.nulls.append_n_non_nulls(n);
156    }
157
158    unsafe fn append_nulls_unchecked(&mut self, n: usize) {
159        let curr_len = self.elements_builder.len();
160        for _ in 0..n {
161            // A list with a null element is can be a list with a zero-span offset and a validity
162            // bit set
163            self.offsets_builder.append_value(
164                O::from_usize(curr_len).vortex_expect("Failed to convert from usize to <O>"),
165            )
166        }
167        self.nulls.append_n_nulls(n);
168    }
169
170    fn append_scalar(&mut self, scalar: &Scalar) -> VortexResult<()> {
171        vortex_ensure!(
172            scalar.dtype() == self.dtype(),
173            "ListBuilder expected scalar with dtype {:?}, got {:?}",
174            self.dtype(),
175            scalar.dtype()
176        );
177
178        let list_scalar = ListScalar::try_from(scalar)?;
179        self.append_value(list_scalar)
180    }
181
182    unsafe fn extend_from_array_unchecked(&mut self, array: &dyn Array) {
183        let list = array.to_listview();
184        if list.is_empty() {
185            return;
186        }
187
188        // Append validity information.
189        self.nulls.append_validity_mask(array.validity_mask());
190
191        // Note that `ListViewArray` has `n` offsets and sizes, not `n+1` offsets like `ListArray`.
192        let elements = list.elements();
193        let offsets = list.offsets().to_primitive();
194        let sizes = list.sizes().to_primitive();
195
196        fn extend_inner<O, OffsetType, SizeType>(
197            builder: &mut ListBuilder<O>,
198            new_elements: &ArrayRef,
199            new_offsets: &[OffsetType],
200            new_sizes: &[SizeType],
201        ) where
202            O: IntegerPType,
203            OffsetType: IntegerPType,
204            SizeType: IntegerPType,
205        {
206            let num_lists = new_offsets.len();
207            debug_assert_eq!(num_lists, new_sizes.len());
208
209            let mut curr_offset = builder.elements_builder.len();
210            let mut offsets_range = builder.offsets_builder.uninit_range(num_lists);
211
212            // We need to append each list individually, converting from `ListViewArray` format to
213            // the `ListArray` format that `ListBuilder` expects.
214            for i in 0..new_offsets.len() {
215                let offset: usize = new_offsets[i].as_();
216                let size: usize = new_sizes[i].as_();
217
218                if size > 0 {
219                    let list_elements = new_elements.slice(offset..offset + size);
220                    builder.elements_builder.extend_from_array(&list_elements);
221                    curr_offset += size;
222                }
223
224                let new_offset =
225                    O::from_usize(curr_offset).vortex_expect("Failed to convert offset");
226
227                offsets_range.set_value(i, new_offset);
228            }
229
230            // SAFETY: We have initialized all `num_lists` values, and since the `offsets` array is
231            // non-nullable, we are done.
232            unsafe { offsets_range.finish() };
233        }
234
235        match_each_integer_ptype!(offsets.ptype(), |OffsetType| {
236            match_each_integer_ptype!(sizes.ptype(), |SizeType| {
237                extend_inner(
238                    self,
239                    elements,
240                    offsets.as_slice::<OffsetType>(),
241                    sizes.as_slice::<SizeType>(),
242                )
243            })
244        })
245    }
246
247    fn reserve_exact(&mut self, additional: usize) {
248        self.elements_builder.reserve_exact(additional);
249        self.offsets_builder.reserve_exact(additional);
250        self.nulls.reserve_exact(additional);
251    }
252
253    unsafe fn set_validity_unchecked(&mut self, validity: Mask) {
254        self.nulls = LazyNullBufferBuilder::new(validity.len());
255        self.nulls.append_validity_mask(validity);
256    }
257
258    fn finish(&mut self) -> ArrayRef {
259        self.finish_into_list().into_array()
260    }
261
262    fn finish_into_canonical(&mut self) -> Canonical {
263        Canonical::List(list_view_from_list(self.finish_into_list()))
264    }
265}
266
267#[cfg(test)]
268mod tests {
269    use std::sync::Arc;
270
271    use Nullability::{NonNullable, Nullable};
272    use vortex_buffer::buffer;
273    use vortex_dtype::PType::I32;
274    use vortex_dtype::{DType, IntegerPType, Nullability};
275    use vortex_scalar::Scalar;
276
277    use crate::array::Array;
278    use crate::arrays::{ChunkedArray, ListArray};
279    use crate::builders::ArrayBuilder;
280    use crate::builders::list::ListBuilder;
281    use crate::validity::Validity;
282    use crate::vtable::ValidityHelper;
283    use crate::{IntoArray, ToCanonical};
284
285    #[test]
286    fn test_empty() {
287        let mut builder =
288            ListBuilder::<u32>::with_capacity(Arc::new(I32.into()), NonNullable, 0, 0);
289
290        let list = builder.finish();
291        assert_eq!(list.len(), 0);
292    }
293
294    #[test]
295    fn test_values() {
296        let dtype: Arc<DType> = Arc::new(I32.into());
297        let mut builder = ListBuilder::<u32>::with_capacity(dtype.clone(), NonNullable, 0, 0);
298
299        builder
300            .append_value(
301                Scalar::list(
302                    dtype.clone(),
303                    vec![1i32.into(), 2i32.into(), 3i32.into()],
304                    NonNullable,
305                )
306                .as_list(),
307            )
308            .unwrap();
309
310        builder
311            .append_value(
312                Scalar::list(
313                    dtype,
314                    vec![4i32.into(), 5i32.into(), 6i32.into()],
315                    NonNullable,
316                )
317                .as_list(),
318            )
319            .unwrap();
320
321        let list = builder.finish();
322        assert_eq!(list.len(), 2);
323
324        let list_array = list.to_listview();
325
326        assert_eq!(list_array.list_elements_at(0).len(), 3);
327        assert_eq!(list_array.list_elements_at(1).len(), 3);
328    }
329
330    #[test]
331    fn test_append_empty_list() {
332        let dtype: Arc<DType> = Arc::new(I32.into());
333        let mut builder = ListBuilder::<u32>::with_capacity(dtype.clone(), NonNullable, 0, 0);
334
335        assert!(
336            builder
337                .append_value(Scalar::list_empty(dtype, NonNullable).as_list())
338                .is_ok()
339        )
340    }
341
342    #[test]
343    fn test_nullable_values() {
344        let dtype: Arc<DType> = Arc::new(I32.into());
345        let mut builder = ListBuilder::<u32>::with_capacity(dtype.clone(), Nullable, 0, 0);
346
347        builder
348            .append_value(
349                Scalar::list(
350                    dtype.clone(),
351                    vec![1i32.into(), 2i32.into(), 3i32.into()],
352                    NonNullable,
353                )
354                .as_list(),
355            )
356            .unwrap();
357
358        builder
359            .append_value(Scalar::list_empty(dtype.clone(), NonNullable).as_list())
360            .unwrap();
361
362        builder
363            .append_value(
364                Scalar::list(
365                    dtype,
366                    vec![4i32.into(), 5i32.into(), 6i32.into()],
367                    NonNullable,
368                )
369                .as_list(),
370            )
371            .unwrap();
372
373        let list = builder.finish();
374        assert_eq!(list.len(), 3);
375
376        let list_array = list.to_listview();
377
378        assert_eq!(list_array.list_elements_at(0).len(), 3);
379        assert_eq!(list_array.list_elements_at(1).len(), 0);
380        assert_eq!(list_array.list_elements_at(2).len(), 3);
381    }
382
383    fn test_extend_builder_gen<O: IntegerPType>() {
384        let list = ListArray::from_iter_opt_slow::<O, _, _>(
385            [Some(vec![0, 1, 2]), None, Some(vec![4, 5])],
386            Arc::new(I32.into()),
387        )
388        .unwrap();
389        assert_eq!(list.len(), 3);
390
391        let mut builder = ListBuilder::<O>::with_capacity(Arc::new(I32.into()), Nullable, 18, 9);
392
393        builder.extend_from_array(&list);
394        builder.extend_from_array(&list);
395        builder.extend_from_array(&list.slice(0..0));
396        builder.extend_from_array(&list.slice(1..3));
397
398        let expected = ListArray::from_iter_opt_slow::<O, _, _>(
399            [
400                Some(vec![0, 1, 2]),
401                None,
402                Some(vec![4, 5]),
403                Some(vec![0, 1, 2]),
404                None,
405                Some(vec![4, 5]),
406                None,
407                Some(vec![4, 5]),
408            ],
409            Arc::new(DType::Primitive(I32, NonNullable)),
410        )
411        .unwrap()
412        .to_listview();
413
414        let actual = builder.finish_into_canonical().into_listview();
415
416        assert_eq!(
417            actual.elements().to_primitive().as_slice::<i32>(),
418            expected.elements().to_primitive().as_slice::<i32>()
419        );
420
421        assert_eq!(
422            actual.offsets().to_primitive().as_slice::<O>(),
423            expected.offsets().to_primitive().as_slice::<O>()
424        );
425
426        assert_eq!(actual.validity(), expected.validity())
427    }
428
429    #[test]
430    fn test_extend_builder() {
431        test_extend_builder_gen::<i8>();
432        test_extend_builder_gen::<i16>();
433        test_extend_builder_gen::<i32>();
434        test_extend_builder_gen::<i64>();
435
436        test_extend_builder_gen::<u8>();
437        test_extend_builder_gen::<u16>();
438        test_extend_builder_gen::<u32>();
439        test_extend_builder_gen::<u64>();
440    }
441
442    #[test]
443    pub fn test_array_with_gap() {
444        let one_trailing_unused_element = ListArray::try_new(
445            buffer![1, 2, 3, 4].into_array(),
446            buffer![0, 3].into_array(),
447            Validity::NonNullable,
448        )
449        .unwrap();
450
451        let second_array = ListArray::try_new(
452            buffer![5, 6].into_array(),
453            buffer![0, 2].into_array(),
454            Validity::NonNullable,
455        )
456        .unwrap();
457
458        let chunked_list = ChunkedArray::try_new(
459            vec![
460                one_trailing_unused_element.clone().into_array(),
461                second_array.clone().into_array(),
462            ],
463            DType::List(Arc::new(DType::Primitive(I32, NonNullable)), NonNullable),
464        );
465
466        let canon_values = chunked_list.unwrap().to_listview();
467
468        assert_eq!(
469            one_trailing_unused_element.scalar_at(0),
470            canon_values.scalar_at(0)
471        );
472        assert_eq!(second_array.scalar_at(0), canon_values.scalar_at(1));
473    }
474
475    #[test]
476    fn test_append_scalar() {
477        let dtype: Arc<DType> = Arc::new(I32.into());
478        let mut builder = ListBuilder::<u64>::with_capacity(dtype.clone(), Nullable, 20, 10);
479
480        // Test appending a valid list.
481        let list_scalar1 = Scalar::list(dtype.clone(), vec![1i32.into(), 2i32.into()], Nullable);
482        builder.append_scalar(&list_scalar1).unwrap();
483
484        // Test appending another list.
485        let list_scalar2 = Scalar::list(
486            dtype.clone(),
487            vec![3i32.into(), 4i32.into(), 5i32.into()],
488            Nullable,
489        );
490        builder.append_scalar(&list_scalar2).unwrap();
491
492        // Test appending null value.
493        let null_scalar = Scalar::null(DType::List(dtype.clone(), Nullable));
494        builder.append_scalar(&null_scalar).unwrap();
495
496        let array = builder.finish_into_list();
497        assert_eq!(array.len(), 3);
498
499        // Check actual values using scalar_at.
500
501        let scalar0 = array.scalar_at(0);
502        let list0 = scalar0.as_list();
503        assert_eq!(list0.len(), 2);
504        if let Some(list0_items) = list0.elements() {
505            assert_eq!(list0_items[0].as_primitive().typed_value::<i32>(), Some(1));
506            assert_eq!(list0_items[1].as_primitive().typed_value::<i32>(), Some(2));
507        }
508
509        let scalar1 = array.scalar_at(1);
510        let list1 = scalar1.as_list();
511        assert_eq!(list1.len(), 3);
512        if let Some(list1_items) = list1.elements() {
513            assert_eq!(list1_items[0].as_primitive().typed_value::<i32>(), Some(3));
514            assert_eq!(list1_items[1].as_primitive().typed_value::<i32>(), Some(4));
515            assert_eq!(list1_items[2].as_primitive().typed_value::<i32>(), Some(5));
516        }
517
518        let scalar2 = array.scalar_at(2);
519        let list2 = scalar2.as_list();
520        assert!(list2.is_null()); // This should be null.
521
522        // Check validity.
523        assert!(array.validity().is_valid(0));
524        assert!(array.validity().is_valid(1));
525        assert!(!array.validity().is_valid(2));
526
527        // Test wrong dtype error.
528        let mut builder = ListBuilder::<u64>::with_capacity(dtype, NonNullable, 20, 10);
529        let wrong_scalar = Scalar::from(42i32);
530        assert!(builder.append_scalar(&wrong_scalar).is_err());
531    }
532}