Skip to main content

vortex_array/builders/
list.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::any::Any;
5use std::sync::Arc;
6
7use vortex_error::VortexExpect;
8use vortex_error::VortexResult;
9use vortex_error::vortex_bail;
10use vortex_error::vortex_ensure;
11use vortex_error::vortex_panic;
12use vortex_mask::Mask;
13
14use crate::ArrayRef;
15use crate::Canonical;
16use crate::IntoArray;
17use crate::arrays::ListArray;
18use crate::arrays::listview::ListViewArrayExt;
19use crate::builders::ArrayBuilder;
20use crate::builders::DEFAULT_BUILDER_CAPACITY;
21use crate::builders::LazyBitBufferBuilder;
22use crate::builders::PrimitiveBuilder;
23use crate::builders::builder_with_capacity;
24use crate::canonical::ToCanonical;
25use crate::dtype::DType;
26use crate::dtype::IntegerPType;
27use crate::dtype::Nullability;
28use crate::dtype::Nullability::NonNullable;
29use crate::match_each_integer_ptype;
30use crate::scalar::ListScalar;
31use crate::scalar::Scalar;
32
33/// The builder for building a [`ListArray`], parametrized by the [`IntegerPType`] of the `offsets`
34/// builder.
35pub struct ListBuilder<O: IntegerPType> {
36    /// The [`DType`] of the [`ListArray`]. This **must** be a [`DType::List`].
37    dtype: DType,
38
39    /// The builder for the underlying elements of the [`ListArray`].
40    elements_builder: Box<dyn ArrayBuilder>,
41
42    /// The builder for the `offsets` into the `elements` array.
43    offsets_builder: PrimitiveBuilder<O>,
44
45    /// The null map builder of the [`ListArray`].
46    nulls: LazyBitBufferBuilder,
47}
48
49impl<O: IntegerPType> ListBuilder<O> {
50    /// Creates a new `ListBuilder` with a capacity of [`DEFAULT_BUILDER_CAPACITY`].
51    pub fn new(value_dtype: Arc<DType>, nullability: Nullability) -> Self {
52        Self::with_capacity(
53            value_dtype,
54            nullability,
55            // We arbitrarily choose 2 times the number of list scalars for the capacity of the
56            // elements builder since we cannot know this ahead of time.
57            DEFAULT_BUILDER_CAPACITY * 2,
58            DEFAULT_BUILDER_CAPACITY,
59        )
60    }
61
62    /// Create a new [`ListArray`] builder with a with the given `capacity`, as well as an initial
63    /// capacity for the `elements` builder (since we cannot know that ahead of time solely based on
64    /// the outer array `capacity`).
65    ///
66    /// # Notes
67    ///
68    /// The number of offsets is one more than the length (# of list scalars) in the array.
69    pub fn with_capacity(
70        value_dtype: Arc<DType>,
71        nullability: Nullability,
72        elements_capacity: usize,
73        capacity: usize,
74    ) -> Self {
75        let elements_builder = builder_with_capacity(value_dtype.as_ref(), elements_capacity);
76        let mut offsets_builder = PrimitiveBuilder::<O>::with_capacity(NonNullable, capacity + 1);
77
78        // The first offset is always 0 and represents an empty list.
79        offsets_builder.append_zero();
80
81        Self {
82            elements_builder,
83            offsets_builder,
84            nulls: LazyBitBufferBuilder::new(capacity),
85            dtype: DType::List(value_dtype, nullability),
86        }
87    }
88
89    /// Appends an array as a single non-null list entry to the builder.
90    ///
91    /// The input `array` must have the same dtype as the element dtype of this list builder.
92    ///
93    /// Note that the list entry will be non-null but the elements themselves are allowed to be null
94    /// (only if the elements [`DType`] in nullable, of course).
95    pub fn append_array_as_list(&mut self, array: &ArrayRef) -> VortexResult<()> {
96        vortex_ensure!(
97            array.dtype() == self.element_dtype(),
98            "Array dtype {:?} does not match list element dtype {:?}",
99            array.dtype(),
100            self.element_dtype()
101        );
102
103        self.elements_builder.extend_from_array(array);
104        self.nulls.append_non_null();
105        self.offsets_builder.append_value(
106            O::from_usize(self.elements_builder.len())
107                .vortex_expect("Failed to convert from usize to O"),
108        );
109
110        Ok(())
111    }
112
113    /// Appends a list `value` to the builder.
114    pub fn append_value(&mut self, value: ListScalar) -> VortexResult<()> {
115        match value.elements() {
116            None => {
117                if self.dtype.nullability() == NonNullable {
118                    vortex_bail!("Cannot append null value to non-nullable list");
119                }
120                self.append_null();
121            }
122            Some(elements) => {
123                for scalar in elements {
124                    // TODO(connor): This is slow, we should be able to append multiple values at
125                    // once, or the list scalar should hold an Array
126                    self.elements_builder.append_scalar(&scalar)?;
127                }
128
129                self.nulls.append_non_null();
130                self.offsets_builder.append_value(
131                    O::from_usize(self.elements_builder.len())
132                        .vortex_expect("Failed to convert from usize to O"),
133                );
134            }
135        }
136
137        Ok(())
138    }
139
140    /// Finishes the builder directly into a [`ListArray`].
141    pub fn finish_into_list(&mut self) -> ListArray {
142        assert_eq!(
143            self.offsets_builder.len(),
144            self.nulls.len() + 1,
145            "offsets length must be one more than nulls length."
146        );
147
148        ListArray::try_new(
149            self.elements_builder.finish(),
150            self.offsets_builder.finish(),
151            self.nulls.finish_with_nullability(self.dtype.nullability()),
152        )
153        .vortex_expect("Buffer, offsets, and validity must have same length.")
154    }
155
156    /// The [`DType`] of the inner elements. Note that this is **not** the same as the [`DType`] of
157    /// the outer `List`.
158    pub fn element_dtype(&self) -> &DType {
159        let DType::List(element_dtype, _) = &self.dtype else {
160            vortex_panic!("`ListBuilder` has an incorrect dtype: {}", self.dtype);
161        };
162
163        element_dtype
164    }
165}
166
167impl<O: IntegerPType> ArrayBuilder for ListBuilder<O> {
168    fn as_any(&self) -> &dyn Any {
169        self
170    }
171
172    fn as_any_mut(&mut self) -> &mut dyn Any {
173        self
174    }
175
176    fn dtype(&self) -> &DType {
177        &self.dtype
178    }
179
180    fn len(&self) -> usize {
181        self.nulls.len()
182    }
183
184    fn append_zeros(&mut self, n: usize) {
185        let curr_len = self.elements_builder.len();
186        for _ in 0..n {
187            self.offsets_builder.append_value(
188                O::from_usize(curr_len).vortex_expect("Failed to convert from usize to <O>"),
189            )
190        }
191        self.nulls.append_n_non_nulls(n);
192    }
193
194    unsafe fn append_nulls_unchecked(&mut self, n: usize) {
195        let curr_len = self.elements_builder.len();
196        for _ in 0..n {
197            // A list with a null element is can be a list with a zero-span offset and a validity
198            // bit set
199            self.offsets_builder.append_value(
200                O::from_usize(curr_len).vortex_expect("Failed to convert from usize to <O>"),
201            )
202        }
203        self.nulls.append_n_nulls(n);
204    }
205
206    fn append_scalar(&mut self, scalar: &Scalar) -> VortexResult<()> {
207        vortex_ensure!(
208            scalar.dtype() == self.dtype(),
209            "ListBuilder expected scalar with dtype {}, got {}",
210            self.dtype(),
211            scalar.dtype()
212        );
213
214        self.append_value(scalar.as_list())
215    }
216
217    unsafe fn extend_from_array_unchecked(&mut self, array: &ArrayRef) {
218        let list = array.to_listview();
219        if list.is_empty() {
220            return;
221        }
222
223        // Append validity information.
224        self.nulls.append_validity_mask(
225            array
226                .validity_mask()
227                .vortex_expect("validity_mask in extend_from_array_unchecked"),
228        );
229
230        // Note that `ListViewArray` has `n` offsets and sizes, not `n+1` offsets like `ListArray`.
231        let elements = list.elements();
232        let offsets = list.offsets().to_primitive();
233        let sizes = list.sizes().to_primitive();
234
235        fn extend_inner<O, OffsetType, SizeType>(
236            builder: &mut ListBuilder<O>,
237            new_elements: &ArrayRef,
238            new_offsets: &[OffsetType],
239            new_sizes: &[SizeType],
240        ) where
241            O: IntegerPType,
242            OffsetType: IntegerPType,
243            SizeType: IntegerPType,
244        {
245            let num_lists = new_offsets.len();
246            debug_assert_eq!(num_lists, new_sizes.len());
247
248            let mut curr_offset = builder.elements_builder.len();
249            let mut offsets_range = builder.offsets_builder.uninit_range(num_lists);
250
251            // We need to append each list individually, converting from `ListViewArray` format to
252            // the `ListArray` format that `ListBuilder` expects.
253            for i in 0..new_offsets.len() {
254                let offset: usize = new_offsets[i].as_();
255                let size: usize = new_sizes[i].as_();
256
257                if size > 0 {
258                    let list_elements = new_elements
259                        .slice(offset..offset + size)
260                        .vortex_expect("list builder slice");
261                    builder.elements_builder.extend_from_array(&list_elements);
262                    curr_offset += size;
263                }
264
265                let new_offset =
266                    O::from_usize(curr_offset).vortex_expect("Failed to convert offset");
267
268                offsets_range.set_value(i, new_offset);
269            }
270
271            // SAFETY: We have initialized all `num_lists` values, and since the `offsets` array is
272            // non-nullable, we are done.
273            unsafe { offsets_range.finish() };
274        }
275
276        match_each_integer_ptype!(offsets.ptype(), |OffsetType| {
277            match_each_integer_ptype!(sizes.ptype(), |SizeType| {
278                extend_inner(
279                    self,
280                    elements,
281                    offsets.as_slice::<OffsetType>(),
282                    sizes.as_slice::<SizeType>(),
283                )
284            })
285        })
286    }
287
288    fn reserve_exact(&mut self, additional: usize) {
289        self.elements_builder.reserve_exact(additional);
290        self.offsets_builder.reserve_exact(additional);
291        self.nulls.reserve_exact(additional);
292    }
293
294    unsafe fn set_validity_unchecked(&mut self, validity: Mask) {
295        self.nulls = LazyBitBufferBuilder::new(validity.len());
296        self.nulls.append_validity_mask(validity);
297    }
298
299    fn finish(&mut self) -> ArrayRef {
300        self.finish_into_list().into_array()
301    }
302
303    fn finish_into_canonical(&mut self) -> Canonical {
304        Canonical::List(self.finish_into_list().into_array().to_listview())
305    }
306}
307
308#[cfg(test)]
309mod tests {
310    use std::sync::Arc;
311
312    use Nullability::NonNullable;
313    use Nullability::Nullable;
314    use vortex_buffer::buffer;
315    use vortex_error::VortexExpect;
316
317    use crate::IntoArray;
318    use crate::LEGACY_SESSION;
319    use crate::ToCanonical;
320    use crate::arrays::ChunkedArray;
321    use crate::arrays::PrimitiveArray;
322    use crate::arrays::list::ListArrayExt;
323    use crate::arrays::listview::ListViewArrayExt;
324    use crate::assert_arrays_eq;
325    use crate::builders::ArrayBuilder;
326    use crate::builders::list::ListArray;
327    use crate::builders::list::ListBuilder;
328    use crate::dtype::DType;
329    use crate::dtype::IntegerPType;
330    use crate::dtype::Nullability;
331    use crate::dtype::PType::I32;
332    use crate::executor::VortexSessionExecute;
333    use crate::scalar::Scalar;
334    use crate::validity::Validity;
335
336    #[test]
337    fn test_empty() {
338        let mut builder =
339            ListBuilder::<u32>::with_capacity(Arc::new(I32.into()), NonNullable, 0, 0);
340
341        let list = builder.finish();
342        assert_eq!(list.len(), 0);
343    }
344
345    #[test]
346    fn test_values() {
347        let dtype: Arc<DType> = Arc::new(I32.into());
348        let mut builder = ListBuilder::<u32>::with_capacity(Arc::clone(&dtype), NonNullable, 0, 0);
349
350        builder
351            .append_value(
352                Scalar::list(
353                    Arc::clone(&dtype),
354                    vec![1i32.into(), 2i32.into(), 3i32.into()],
355                    NonNullable,
356                )
357                .as_list(),
358            )
359            .unwrap();
360
361        builder
362            .append_value(
363                Scalar::list(
364                    dtype,
365                    vec![4i32.into(), 5i32.into(), 6i32.into()],
366                    NonNullable,
367                )
368                .as_list(),
369            )
370            .unwrap();
371
372        let list = builder.finish();
373        assert_eq!(list.len(), 2);
374
375        let list_array = list.to_listview();
376
377        assert_eq!(list_array.list_elements_at(0).unwrap().len(), 3);
378        assert_eq!(list_array.list_elements_at(1).unwrap().len(), 3);
379    }
380
381    #[test]
382    fn test_append_empty_list() {
383        let dtype: Arc<DType> = Arc::new(I32.into());
384        let mut builder = ListBuilder::<u32>::with_capacity(Arc::clone(&dtype), NonNullable, 0, 0);
385
386        assert!(
387            builder
388                .append_value(Scalar::list_empty(dtype, NonNullable).as_list())
389                .is_ok()
390        )
391    }
392
393    #[test]
394    fn test_nullable_values() {
395        let dtype: Arc<DType> = Arc::new(I32.into());
396        let mut builder = ListBuilder::<u32>::with_capacity(Arc::clone(&dtype), Nullable, 0, 0);
397
398        builder
399            .append_value(
400                Scalar::list(
401                    Arc::clone(&dtype),
402                    vec![1i32.into(), 2i32.into(), 3i32.into()],
403                    NonNullable,
404                )
405                .as_list(),
406            )
407            .unwrap();
408
409        builder
410            .append_value(Scalar::list_empty(Arc::clone(&dtype), NonNullable).as_list())
411            .unwrap();
412
413        builder
414            .append_value(
415                Scalar::list(
416                    dtype,
417                    vec![4i32.into(), 5i32.into(), 6i32.into()],
418                    NonNullable,
419                )
420                .as_list(),
421            )
422            .unwrap();
423
424        let list = builder.finish();
425        assert_eq!(list.len(), 3);
426
427        let list_array = list.to_listview();
428
429        assert_eq!(list_array.list_elements_at(0).unwrap().len(), 3);
430        assert_eq!(list_array.list_elements_at(1).unwrap().len(), 0);
431        assert_eq!(list_array.list_elements_at(2).unwrap().len(), 3);
432    }
433
434    fn test_extend_builder_gen<O: IntegerPType>() {
435        let list = ListArray::from_iter_opt_slow::<O, _, _>(
436            [Some(vec![0, 1, 2]), None, Some(vec![4, 5])],
437            Arc::new(I32.into()),
438        )
439        .unwrap();
440        assert_eq!(list.len(), 3);
441
442        let mut ctx = LEGACY_SESSION.create_execution_ctx();
443
444        let mut builder = ListBuilder::<O>::with_capacity(Arc::new(I32.into()), Nullable, 18, 9);
445        builder.extend_from_array(&list);
446        builder.extend_from_array(&list);
447        builder.extend_from_array(&list.slice(0..0).unwrap());
448        builder.extend_from_array(&list.slice(1..3).unwrap());
449
450        let expected = ListArray::from_iter_opt_slow::<O, _, _>(
451            [
452                Some(vec![0, 1, 2]),
453                None,
454                Some(vec![4, 5]),
455                Some(vec![0, 1, 2]),
456                None,
457                Some(vec![4, 5]),
458                None,
459                Some(vec![4, 5]),
460            ],
461            Arc::new(DType::Primitive(I32, NonNullable)),
462        )
463        .unwrap()
464        .to_listview();
465
466        let actual = builder.finish_into_canonical().into_listview();
467
468        assert_arrays_eq!(actual.elements(), expected.elements());
469
470        assert_arrays_eq!(actual.offsets(), expected.offsets());
471
472        assert!(
473            actual
474                .validity()
475                .vortex_expect("list validity should be derivable")
476                .mask_eq(
477                    &expected
478                        .validity()
479                        .vortex_expect("list validity should be derivable"),
480                    &mut ctx,
481                )
482                .unwrap(),
483        );
484    }
485
486    #[test]
487    fn test_extend_builder() {
488        test_extend_builder_gen::<i8>();
489        test_extend_builder_gen::<i16>();
490        test_extend_builder_gen::<i32>();
491        test_extend_builder_gen::<i64>();
492
493        test_extend_builder_gen::<u8>();
494        test_extend_builder_gen::<u16>();
495        test_extend_builder_gen::<u32>();
496        test_extend_builder_gen::<u64>();
497    }
498
499    #[test]
500    pub fn test_array_with_gap() {
501        let one_trailing_unused_element = ListArray::try_new(
502            buffer![1, 2, 3, 4].into_array(),
503            buffer![0, 3].into_array(),
504            Validity::NonNullable,
505        )
506        .unwrap();
507
508        let second_array = ListArray::try_new(
509            buffer![5, 6].into_array(),
510            buffer![0, 2].into_array(),
511            Validity::NonNullable,
512        )
513        .unwrap();
514
515        let chunked_list = ChunkedArray::try_new(
516            vec![
517                one_trailing_unused_element.clone().into_array(),
518                second_array.clone().into_array(),
519            ],
520            DType::List(Arc::new(DType::Primitive(I32, NonNullable)), NonNullable),
521        );
522
523        let canon_values = chunked_list.unwrap().as_array().to_listview();
524
525        assert_eq!(
526            one_trailing_unused_element.scalar_at(0).unwrap(),
527            canon_values.scalar_at(0).unwrap()
528        );
529        assert_eq!(
530            second_array.scalar_at(0).unwrap(),
531            canon_values.scalar_at(1).unwrap()
532        );
533    }
534
535    #[test]
536    fn test_append_scalar() {
537        let dtype: Arc<DType> = Arc::new(I32.into());
538        let mut builder = ListBuilder::<u64>::with_capacity(Arc::clone(&dtype), Nullable, 20, 10);
539
540        // Test appending a valid list.
541        let list_scalar1 =
542            Scalar::list(Arc::clone(&dtype), vec![1i32.into(), 2i32.into()], Nullable);
543        builder.append_scalar(&list_scalar1).unwrap();
544
545        // Test appending another list.
546        let list_scalar2 = Scalar::list(
547            Arc::clone(&dtype),
548            vec![3i32.into(), 4i32.into(), 5i32.into()],
549            Nullable,
550        );
551        builder.append_scalar(&list_scalar2).unwrap();
552
553        // Test appending null value.
554        let null_scalar = Scalar::null(DType::List(Arc::clone(&dtype), Nullable));
555        builder.append_scalar(&null_scalar).unwrap();
556
557        let array = builder.finish_into_list();
558        assert_eq!(array.len(), 3);
559
560        // Check actual values using scalar_at.
561
562        let scalar0 = array.scalar_at(0).unwrap();
563        let list0 = scalar0.as_list();
564        assert_eq!(list0.len(), 2);
565        if let Some(list0_items) = list0.elements() {
566            assert_eq!(list0_items[0].as_primitive().typed_value::<i32>(), Some(1));
567            assert_eq!(list0_items[1].as_primitive().typed_value::<i32>(), Some(2));
568        }
569
570        let scalar1 = array.scalar_at(1).unwrap();
571        let list1 = scalar1.as_list();
572        assert_eq!(list1.len(), 3);
573        if let Some(list1_items) = list1.elements() {
574            assert_eq!(list1_items[0].as_primitive().typed_value::<i32>(), Some(3));
575            assert_eq!(list1_items[1].as_primitive().typed_value::<i32>(), Some(4));
576            assert_eq!(list1_items[2].as_primitive().typed_value::<i32>(), Some(5));
577        }
578
579        let scalar2 = array.scalar_at(2).unwrap();
580        let list2 = scalar2.as_list();
581        assert!(list2.is_null()); // This should be null.
582
583        // Check validity.
584        assert!(
585            array
586                .validity()
587                .vortex_expect("list validity should be derivable")
588                .is_valid(0)
589                .unwrap()
590        );
591        assert!(
592            array
593                .validity()
594                .vortex_expect("list validity should be derivable")
595                .is_valid(1)
596                .unwrap()
597        );
598        assert!(
599            !array
600                .validity()
601                .vortex_expect("list validity should be derivable")
602                .is_valid(2)
603                .unwrap()
604        );
605
606        // Test wrong dtype error.
607        let mut builder = ListBuilder::<u64>::with_capacity(dtype, NonNullable, 20, 10);
608        let wrong_scalar = Scalar::from(42i32);
609        assert!(builder.append_scalar(&wrong_scalar).is_err());
610    }
611
612    #[test]
613    fn test_append_array_as_list() {
614        let dtype: Arc<DType> = Arc::new(I32.into());
615        let mut builder =
616            ListBuilder::<u32>::with_capacity(Arc::clone(&dtype), NonNullable, 20, 10);
617
618        // Append a primitive array as a single list entry.
619        let arr1 = buffer![1i32, 2, 3].into_array();
620        builder.append_array_as_list(&arr1).unwrap();
621
622        // Interleave with a list scalar.
623        builder
624            .append_value(
625                Scalar::list(
626                    Arc::clone(&dtype),
627                    vec![10i32.into(), 11i32.into()],
628                    NonNullable,
629                )
630                .as_list(),
631            )
632            .unwrap();
633
634        // Append another primitive array as a single list entry.
635        let arr2 = buffer![4i32, 5].into_array();
636        builder.append_array_as_list(&arr2).unwrap();
637
638        // Append an empty array as a single list entry (empty list).
639        let arr3 = buffer![0i32; 0].into_array();
640        builder.append_array_as_list(&arr3).unwrap();
641
642        // Interleave with another list scalar (empty list).
643        builder
644            .append_value(Scalar::list_empty(Arc::clone(&dtype), NonNullable).as_list())
645            .unwrap();
646
647        let list = builder.finish_into_list();
648        assert_eq!(list.len(), 5);
649
650        // Verify elements array: [1, 2, 3, 10, 11, 4, 5].
651        assert_arrays_eq!(
652            list.elements(),
653            PrimitiveArray::from_iter([1i32, 2, 3, 10, 11, 4, 5])
654        );
655
656        // Verify offsets array.
657        assert_arrays_eq!(
658            list.offsets(),
659            PrimitiveArray::from_iter([0u32, 3, 5, 7, 7, 7])
660        );
661
662        // Test dtype mismatch error.
663        let mut builder = ListBuilder::<u32>::with_capacity(dtype, NonNullable, 20, 10);
664        let wrong_dtype_arr = buffer![1i64, 2, 3].into_array();
665        assert!(builder.append_array_as_list(&wrong_dtype_arr).is_err());
666    }
667}