Skip to main content

vortex_array/builders/
list.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::any::Any;
5use std::sync::Arc;
6
7use vortex_error::VortexExpect;
8use vortex_error::VortexResult;
9use vortex_error::vortex_bail;
10use vortex_error::vortex_ensure;
11use vortex_error::vortex_panic;
12use vortex_mask::Mask;
13
14use crate::ArrayRef;
15use crate::Canonical;
16use crate::IntoArray;
17use crate::LEGACY_SESSION;
18use crate::VortexSessionExecute;
19use crate::arrays::ListArray;
20use crate::arrays::listview::ListViewArrayExt;
21use crate::builders::ArrayBuilder;
22use crate::builders::DEFAULT_BUILDER_CAPACITY;
23use crate::builders::LazyBitBufferBuilder;
24use crate::builders::PrimitiveBuilder;
25use crate::builders::builder_with_capacity;
26use crate::canonical::ToCanonical;
27use crate::dtype::DType;
28use crate::dtype::IntegerPType;
29use crate::dtype::Nullability;
30use crate::dtype::Nullability::NonNullable;
31use crate::match_each_integer_ptype;
32use crate::scalar::ListScalar;
33use crate::scalar::Scalar;
34
35/// The builder for building a [`ListArray`], parametrized by the [`IntegerPType`] of the `offsets`
36/// builder.
37pub struct ListBuilder<O: IntegerPType> {
38    /// The [`DType`] of the [`ListArray`]. This **must** be a [`DType::List`].
39    dtype: DType,
40
41    /// The builder for the underlying elements of the [`ListArray`].
42    elements_builder: Box<dyn ArrayBuilder>,
43
44    /// The builder for the `offsets` into the `elements` array.
45    offsets_builder: PrimitiveBuilder<O>,
46
47    /// The null map builder of the [`ListArray`].
48    nulls: LazyBitBufferBuilder,
49}
50
51impl<O: IntegerPType> ListBuilder<O> {
52    /// Creates a new `ListBuilder` with a capacity of [`DEFAULT_BUILDER_CAPACITY`].
53    pub fn new(value_dtype: Arc<DType>, nullability: Nullability) -> Self {
54        Self::with_capacity(
55            value_dtype,
56            nullability,
57            // We arbitrarily choose 2 times the number of list scalars for the capacity of the
58            // elements builder since we cannot know this ahead of time.
59            DEFAULT_BUILDER_CAPACITY * 2,
60            DEFAULT_BUILDER_CAPACITY,
61        )
62    }
63
64    /// Create a new [`ListArray`] builder with a with the given `capacity`, as well as an initial
65    /// capacity for the `elements` builder (since we cannot know that ahead of time solely based on
66    /// the outer array `capacity`).
67    ///
68    /// # Notes
69    ///
70    /// The number of offsets is one more than the length (# of list scalars) in the array.
71    pub fn with_capacity(
72        value_dtype: Arc<DType>,
73        nullability: Nullability,
74        elements_capacity: usize,
75        capacity: usize,
76    ) -> Self {
77        let elements_builder = builder_with_capacity(value_dtype.as_ref(), elements_capacity);
78        let mut offsets_builder = PrimitiveBuilder::<O>::with_capacity(NonNullable, capacity + 1);
79
80        // The first offset is always 0 and represents an empty list.
81        offsets_builder.append_zero();
82
83        Self {
84            elements_builder,
85            offsets_builder,
86            nulls: LazyBitBufferBuilder::new(capacity),
87            dtype: DType::List(value_dtype, nullability),
88        }
89    }
90
91    /// Appends an array as a single non-null list entry to the builder.
92    ///
93    /// The input `array` must have the same dtype as the element dtype of this list builder.
94    ///
95    /// Note that the list entry will be non-null but the elements themselves are allowed to be null
96    /// (only if the elements [`DType`] in nullable, of course).
97    pub fn append_array_as_list(&mut self, array: &ArrayRef) -> VortexResult<()> {
98        vortex_ensure!(
99            array.dtype() == self.element_dtype(),
100            "Array dtype {:?} does not match list element dtype {:?}",
101            array.dtype(),
102            self.element_dtype()
103        );
104
105        self.elements_builder.extend_from_array(array);
106        self.nulls.append_non_null();
107        self.offsets_builder.append_value(
108            O::from_usize(self.elements_builder.len())
109                .vortex_expect("Failed to convert from usize to O"),
110        );
111
112        Ok(())
113    }
114
115    /// Appends a list `value` to the builder.
116    pub fn append_value(&mut self, value: ListScalar) -> VortexResult<()> {
117        match value.elements() {
118            None => {
119                if self.dtype.nullability() == NonNullable {
120                    vortex_bail!("Cannot append null value to non-nullable list");
121                }
122                self.append_null();
123            }
124            Some(elements) => {
125                for scalar in elements {
126                    // TODO(connor): This is slow, we should be able to append multiple values at
127                    // once, or the list scalar should hold an Array
128                    self.elements_builder.append_scalar(&scalar)?;
129                }
130
131                self.nulls.append_non_null();
132                self.offsets_builder.append_value(
133                    O::from_usize(self.elements_builder.len())
134                        .vortex_expect("Failed to convert from usize to O"),
135                );
136            }
137        }
138
139        Ok(())
140    }
141
142    /// Finishes the builder directly into a [`ListArray`].
143    pub fn finish_into_list(&mut self) -> ListArray {
144        assert_eq!(
145            self.offsets_builder.len(),
146            self.nulls.len() + 1,
147            "offsets length must be one more than nulls length."
148        );
149
150        ListArray::try_new(
151            self.elements_builder.finish(),
152            self.offsets_builder.finish(),
153            self.nulls.finish_with_nullability(self.dtype.nullability()),
154        )
155        .vortex_expect("Buffer, offsets, and validity must have same length.")
156    }
157
158    /// The [`DType`] of the inner elements. Note that this is **not** the same as the [`DType`] of
159    /// the outer `List`.
160    pub fn element_dtype(&self) -> &DType {
161        let DType::List(element_dtype, _) = &self.dtype else {
162            vortex_panic!("`ListBuilder` has an incorrect dtype: {}", self.dtype);
163        };
164
165        element_dtype
166    }
167}
168
169impl<O: IntegerPType> ArrayBuilder for ListBuilder<O> {
170    fn as_any(&self) -> &dyn Any {
171        self
172    }
173
174    fn as_any_mut(&mut self) -> &mut dyn Any {
175        self
176    }
177
178    fn dtype(&self) -> &DType {
179        &self.dtype
180    }
181
182    fn len(&self) -> usize {
183        self.nulls.len()
184    }
185
186    fn append_zeros(&mut self, n: usize) {
187        let curr_len = self.elements_builder.len();
188        for _ in 0..n {
189            self.offsets_builder.append_value(
190                O::from_usize(curr_len).vortex_expect("Failed to convert from usize to <O>"),
191            )
192        }
193        self.nulls.append_n_non_nulls(n);
194    }
195
196    unsafe fn append_nulls_unchecked(&mut self, n: usize) {
197        let curr_len = self.elements_builder.len();
198        for _ in 0..n {
199            // A list with a null element is can be a list with a zero-span offset and a validity
200            // bit set
201            self.offsets_builder.append_value(
202                O::from_usize(curr_len).vortex_expect("Failed to convert from usize to <O>"),
203            )
204        }
205        self.nulls.append_n_nulls(n);
206    }
207
208    fn append_scalar(&mut self, scalar: &Scalar) -> VortexResult<()> {
209        vortex_ensure!(
210            scalar.dtype() == self.dtype(),
211            "ListBuilder expected scalar with dtype {}, got {}",
212            self.dtype(),
213            scalar.dtype()
214        );
215
216        self.append_value(scalar.as_list())
217    }
218
219    unsafe fn extend_from_array_unchecked(&mut self, array: &ArrayRef) {
220        let list = array.to_listview();
221        if list.is_empty() {
222            return;
223        }
224
225        // Append validity information.
226        self.nulls.append_validity_mask(
227            array
228                .validity()
229                .vortex_expect("validity_mask in extend_from_array_unchecked")
230                .to_mask(array.len(), &mut LEGACY_SESSION.create_execution_ctx())
231                .vortex_expect("Failed to compute validity mask"),
232        );
233
234        // Note that `ListViewArray` has `n` offsets and sizes, not `n+1` offsets like `ListArray`.
235        let elements = list.elements();
236        let offsets = list.offsets().to_primitive();
237        let sizes = list.sizes().to_primitive();
238
239        fn extend_inner<O, OffsetType, SizeType>(
240            builder: &mut ListBuilder<O>,
241            new_elements: &ArrayRef,
242            new_offsets: &[OffsetType],
243            new_sizes: &[SizeType],
244        ) where
245            O: IntegerPType,
246            OffsetType: IntegerPType,
247            SizeType: IntegerPType,
248        {
249            let num_lists = new_offsets.len();
250            debug_assert_eq!(num_lists, new_sizes.len());
251
252            let mut curr_offset = builder.elements_builder.len();
253            let mut offsets_range = builder.offsets_builder.uninit_range(num_lists);
254
255            // We need to append each list individually, converting from `ListViewArray` format to
256            // the `ListArray` format that `ListBuilder` expects.
257            for i in 0..new_offsets.len() {
258                let offset: usize = new_offsets[i].as_();
259                let size: usize = new_sizes[i].as_();
260
261                if size > 0 {
262                    let list_elements = new_elements
263                        .slice(offset..offset + size)
264                        .vortex_expect("list builder slice");
265                    builder.elements_builder.extend_from_array(&list_elements);
266                    curr_offset += size;
267                }
268
269                let new_offset =
270                    O::from_usize(curr_offset).vortex_expect("Failed to convert offset");
271
272                offsets_range.set_value(i, new_offset);
273            }
274
275            // SAFETY: We have initialized all `num_lists` values, and since the `offsets` array is
276            // non-nullable, we are done.
277            unsafe { offsets_range.finish() };
278        }
279
280        match_each_integer_ptype!(offsets.ptype(), |OffsetType| {
281            match_each_integer_ptype!(sizes.ptype(), |SizeType| {
282                extend_inner(
283                    self,
284                    elements,
285                    offsets.as_slice::<OffsetType>(),
286                    sizes.as_slice::<SizeType>(),
287                )
288            })
289        })
290    }
291
292    fn reserve_exact(&mut self, additional: usize) {
293        self.elements_builder.reserve_exact(additional);
294        self.offsets_builder.reserve_exact(additional);
295        self.nulls.reserve_exact(additional);
296    }
297
298    unsafe fn set_validity_unchecked(&mut self, validity: Mask) {
299        self.nulls = LazyBitBufferBuilder::new(validity.len());
300        self.nulls.append_validity_mask(validity);
301    }
302
303    fn finish(&mut self) -> ArrayRef {
304        self.finish_into_list().into_array()
305    }
306
307    fn finish_into_canonical(&mut self) -> Canonical {
308        Canonical::List(self.finish_into_list().into_array().to_listview())
309    }
310}
311
312#[cfg(test)]
313mod tests {
314    use std::sync::Arc;
315
316    use Nullability::NonNullable;
317    use Nullability::Nullable;
318    use vortex_buffer::buffer;
319    use vortex_error::VortexExpect;
320
321    use crate::IntoArray;
322    use crate::LEGACY_SESSION;
323    use crate::ToCanonical;
324    use crate::arrays::ChunkedArray;
325    use crate::arrays::PrimitiveArray;
326    use crate::arrays::list::ListArrayExt;
327    use crate::arrays::listview::ListViewArrayExt;
328    use crate::assert_arrays_eq;
329    use crate::builders::ArrayBuilder;
330    use crate::builders::list::ListArray;
331    use crate::builders::list::ListBuilder;
332    use crate::dtype::DType;
333    use crate::dtype::IntegerPType;
334    use crate::dtype::Nullability;
335    use crate::dtype::PType::I32;
336    use crate::executor::VortexSessionExecute;
337    use crate::scalar::Scalar;
338    use crate::validity::Validity;
339
340    #[test]
341    fn test_empty() {
342        let mut builder =
343            ListBuilder::<u32>::with_capacity(Arc::new(I32.into()), NonNullable, 0, 0);
344
345        let list = builder.finish();
346        assert_eq!(list.len(), 0);
347    }
348
349    #[test]
350    fn test_values() {
351        let dtype: Arc<DType> = Arc::new(I32.into());
352        let mut builder = ListBuilder::<u32>::with_capacity(Arc::clone(&dtype), NonNullable, 0, 0);
353
354        builder
355            .append_value(
356                Scalar::list(
357                    Arc::clone(&dtype),
358                    vec![1i32.into(), 2i32.into(), 3i32.into()],
359                    NonNullable,
360                )
361                .as_list(),
362            )
363            .unwrap();
364
365        builder
366            .append_value(
367                Scalar::list(
368                    dtype,
369                    vec![4i32.into(), 5i32.into(), 6i32.into()],
370                    NonNullable,
371                )
372                .as_list(),
373            )
374            .unwrap();
375
376        let list = builder.finish();
377        assert_eq!(list.len(), 2);
378
379        let list_array = list.to_listview();
380
381        assert_eq!(list_array.list_elements_at(0).unwrap().len(), 3);
382        assert_eq!(list_array.list_elements_at(1).unwrap().len(), 3);
383    }
384
385    #[test]
386    fn test_append_empty_list() {
387        let dtype: Arc<DType> = Arc::new(I32.into());
388        let mut builder = ListBuilder::<u32>::with_capacity(Arc::clone(&dtype), NonNullable, 0, 0);
389
390        assert!(
391            builder
392                .append_value(Scalar::list_empty(dtype, NonNullable).as_list())
393                .is_ok()
394        )
395    }
396
397    #[test]
398    fn test_nullable_values() {
399        let dtype: Arc<DType> = Arc::new(I32.into());
400        let mut builder = ListBuilder::<u32>::with_capacity(Arc::clone(&dtype), Nullable, 0, 0);
401
402        builder
403            .append_value(
404                Scalar::list(
405                    Arc::clone(&dtype),
406                    vec![1i32.into(), 2i32.into(), 3i32.into()],
407                    NonNullable,
408                )
409                .as_list(),
410            )
411            .unwrap();
412
413        builder
414            .append_value(Scalar::list_empty(Arc::clone(&dtype), NonNullable).as_list())
415            .unwrap();
416
417        builder
418            .append_value(
419                Scalar::list(
420                    dtype,
421                    vec![4i32.into(), 5i32.into(), 6i32.into()],
422                    NonNullable,
423                )
424                .as_list(),
425            )
426            .unwrap();
427
428        let list = builder.finish();
429        assert_eq!(list.len(), 3);
430
431        let list_array = list.to_listview();
432
433        assert_eq!(list_array.list_elements_at(0).unwrap().len(), 3);
434        assert_eq!(list_array.list_elements_at(1).unwrap().len(), 0);
435        assert_eq!(list_array.list_elements_at(2).unwrap().len(), 3);
436    }
437
438    fn test_extend_builder_gen<O: IntegerPType>() {
439        let list = ListArray::from_iter_opt_slow::<O, _, _>(
440            [Some(vec![0, 1, 2]), None, Some(vec![4, 5])],
441            Arc::new(I32.into()),
442        )
443        .unwrap();
444        assert_eq!(list.len(), 3);
445
446        let mut ctx = LEGACY_SESSION.create_execution_ctx();
447
448        let mut builder = ListBuilder::<O>::with_capacity(Arc::new(I32.into()), Nullable, 18, 9);
449        builder.extend_from_array(&list);
450        builder.extend_from_array(&list);
451        builder.extend_from_array(&list.slice(0..0).unwrap());
452        builder.extend_from_array(&list.slice(1..3).unwrap());
453
454        let expected = ListArray::from_iter_opt_slow::<O, _, _>(
455            [
456                Some(vec![0, 1, 2]),
457                None,
458                Some(vec![4, 5]),
459                Some(vec![0, 1, 2]),
460                None,
461                Some(vec![4, 5]),
462                None,
463                Some(vec![4, 5]),
464            ],
465            Arc::new(DType::Primitive(I32, NonNullable)),
466        )
467        .unwrap()
468        .to_listview();
469
470        let actual = builder.finish_into_canonical().into_listview();
471
472        assert_arrays_eq!(actual.elements(), expected.elements());
473
474        assert_arrays_eq!(actual.offsets(), expected.offsets());
475
476        assert!(
477            actual
478                .validity()
479                .vortex_expect("list validity should be derivable")
480                .mask_eq(
481                    &expected
482                        .validity()
483                        .vortex_expect("list validity should be derivable"),
484                    &mut ctx,
485                )
486                .unwrap(),
487        );
488    }
489
490    #[test]
491    fn test_extend_builder() {
492        test_extend_builder_gen::<i8>();
493        test_extend_builder_gen::<i16>();
494        test_extend_builder_gen::<i32>();
495        test_extend_builder_gen::<i64>();
496
497        test_extend_builder_gen::<u8>();
498        test_extend_builder_gen::<u16>();
499        test_extend_builder_gen::<u32>();
500        test_extend_builder_gen::<u64>();
501    }
502
503    #[test]
504    pub fn test_array_with_gap() {
505        let one_trailing_unused_element = ListArray::try_new(
506            buffer![1, 2, 3, 4].into_array(),
507            buffer![0, 3].into_array(),
508            Validity::NonNullable,
509        )
510        .unwrap();
511
512        let second_array = ListArray::try_new(
513            buffer![5, 6].into_array(),
514            buffer![0, 2].into_array(),
515            Validity::NonNullable,
516        )
517        .unwrap();
518
519        let chunked_list = ChunkedArray::try_new(
520            vec![
521                one_trailing_unused_element.clone().into_array(),
522                second_array.clone().into_array(),
523            ],
524            DType::List(Arc::new(DType::Primitive(I32, NonNullable)), NonNullable),
525        );
526
527        let canon_values = chunked_list.unwrap().as_array().to_listview();
528
529        assert_eq!(
530            one_trailing_unused_element
531                .execute_scalar(0, &mut LEGACY_SESSION.create_execution_ctx())
532                .unwrap(),
533            canon_values
534                .execute_scalar(0, &mut LEGACY_SESSION.create_execution_ctx())
535                .unwrap()
536        );
537        assert_eq!(
538            second_array
539                .execute_scalar(0, &mut LEGACY_SESSION.create_execution_ctx())
540                .unwrap(),
541            canon_values
542                .execute_scalar(1, &mut LEGACY_SESSION.create_execution_ctx())
543                .unwrap()
544        );
545    }
546
547    #[test]
548    fn test_append_scalar() {
549        let dtype: Arc<DType> = Arc::new(I32.into());
550        let mut builder = ListBuilder::<u64>::with_capacity(Arc::clone(&dtype), Nullable, 20, 10);
551
552        // Test appending a valid list.
553        let list_scalar1 =
554            Scalar::list(Arc::clone(&dtype), vec![1i32.into(), 2i32.into()], Nullable);
555        builder.append_scalar(&list_scalar1).unwrap();
556
557        // Test appending another list.
558        let list_scalar2 = Scalar::list(
559            Arc::clone(&dtype),
560            vec![3i32.into(), 4i32.into(), 5i32.into()],
561            Nullable,
562        );
563        builder.append_scalar(&list_scalar2).unwrap();
564
565        // Test appending null value.
566        let null_scalar = Scalar::null(DType::List(Arc::clone(&dtype), Nullable));
567        builder.append_scalar(&null_scalar).unwrap();
568
569        let array = builder.finish_into_list();
570        assert_eq!(array.len(), 3);
571
572        // Check actual values using scalar_at.
573
574        let scalar0 = array
575            .execute_scalar(0, &mut LEGACY_SESSION.create_execution_ctx())
576            .unwrap();
577        let list0 = scalar0.as_list();
578        assert_eq!(list0.len(), 2);
579        if let Some(list0_items) = list0.elements() {
580            assert_eq!(list0_items[0].as_primitive().typed_value::<i32>(), Some(1));
581            assert_eq!(list0_items[1].as_primitive().typed_value::<i32>(), Some(2));
582        }
583
584        let scalar1 = array
585            .execute_scalar(1, &mut LEGACY_SESSION.create_execution_ctx())
586            .unwrap();
587        let list1 = scalar1.as_list();
588        assert_eq!(list1.len(), 3);
589        if let Some(list1_items) = list1.elements() {
590            assert_eq!(list1_items[0].as_primitive().typed_value::<i32>(), Some(3));
591            assert_eq!(list1_items[1].as_primitive().typed_value::<i32>(), Some(4));
592            assert_eq!(list1_items[2].as_primitive().typed_value::<i32>(), Some(5));
593        }
594
595        let scalar2 = array
596            .execute_scalar(2, &mut LEGACY_SESSION.create_execution_ctx())
597            .unwrap();
598        let list2 = scalar2.as_list();
599        assert!(list2.is_null()); // This should be null.
600
601        // Check validity.
602        assert!(
603            array
604                .validity()
605                .vortex_expect("list validity should be derivable")
606                .is_valid(0)
607                .unwrap()
608        );
609        assert!(
610            array
611                .validity()
612                .vortex_expect("list validity should be derivable")
613                .is_valid(1)
614                .unwrap()
615        );
616        assert!(
617            !array
618                .validity()
619                .vortex_expect("list validity should be derivable")
620                .is_valid(2)
621                .unwrap()
622        );
623
624        // Test wrong dtype error.
625        let mut builder = ListBuilder::<u64>::with_capacity(dtype, NonNullable, 20, 10);
626        let wrong_scalar = Scalar::from(42i32);
627        assert!(builder.append_scalar(&wrong_scalar).is_err());
628    }
629
630    #[test]
631    fn test_append_array_as_list() {
632        let dtype: Arc<DType> = Arc::new(I32.into());
633        let mut builder =
634            ListBuilder::<u32>::with_capacity(Arc::clone(&dtype), NonNullable, 20, 10);
635
636        // Append a primitive array as a single list entry.
637        let arr1 = buffer![1i32, 2, 3].into_array();
638        builder.append_array_as_list(&arr1).unwrap();
639
640        // Interleave with a list scalar.
641        builder
642            .append_value(
643                Scalar::list(
644                    Arc::clone(&dtype),
645                    vec![10i32.into(), 11i32.into()],
646                    NonNullable,
647                )
648                .as_list(),
649            )
650            .unwrap();
651
652        // Append another primitive array as a single list entry.
653        let arr2 = buffer![4i32, 5].into_array();
654        builder.append_array_as_list(&arr2).unwrap();
655
656        // Append an empty array as a single list entry (empty list).
657        let arr3 = buffer![0i32; 0].into_array();
658        builder.append_array_as_list(&arr3).unwrap();
659
660        // Interleave with another list scalar (empty list).
661        builder
662            .append_value(Scalar::list_empty(Arc::clone(&dtype), NonNullable).as_list())
663            .unwrap();
664
665        let list = builder.finish_into_list();
666        assert_eq!(list.len(), 5);
667
668        // Verify elements array: [1, 2, 3, 10, 11, 4, 5].
669        assert_arrays_eq!(
670            list.elements(),
671            PrimitiveArray::from_iter([1i32, 2, 3, 10, 11, 4, 5])
672        );
673
674        // Verify offsets array.
675        assert_arrays_eq!(
676            list.offsets(),
677            PrimitiveArray::from_iter([0u32, 3, 5, 7, 7, 7])
678        );
679
680        // Test dtype mismatch error.
681        let mut builder = ListBuilder::<u32>::with_capacity(dtype, NonNullable, 20, 10);
682        let wrong_dtype_arr = buffer![1i64, 2, 3].into_array();
683        assert!(builder.append_array_as_list(&wrong_dtype_arr).is_err());
684    }
685}