vortex_array/builders/
list.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::any::Any;
5use std::sync::Arc;
6
7use vortex_dtype::Nullability::NonNullable;
8use vortex_dtype::{DType, NativePType, Nullability};
9use vortex_error::{VortexExpect, VortexResult, vortex_bail, vortex_ensure, vortex_panic};
10use vortex_mask::Mask;
11use vortex_scalar::{ListScalar, Scalar};
12
13use crate::arrays::ListArray;
14use crate::builders::{
15    ArrayBuilder, DEFAULT_BUILDER_CAPACITY, LazyNullBufferBuilder, PrimitiveBuilder,
16    builder_with_capacity,
17};
18use crate::canonical::{Canonical, ToCanonical};
19use crate::compute::{add_scalar, cast, sub_scalar};
20use crate::{Array, ArrayRef, IntoArray, OffsetPType};
21
22/// The builder for building a [`ListArray`], parametrized by the `PType` of the offsets buffer.
23pub struct ListBuilder<O: NativePType> {
24    dtype: DType,
25    /// The values of the list.
26    value_builder: Box<dyn ArrayBuilder>,
27    /// Represents the offsets into the values array.
28    index_builder: PrimitiveBuilder<O>,
29    nulls: LazyNullBufferBuilder,
30}
31
32impl<O: OffsetPType> ListBuilder<O> {
33    /// Creates a new `ListBuilder` with a capacity of [`DEFAULT_BUILDER_CAPACITY`].
34    pub fn new(value_dtype: Arc<DType>, nullability: Nullability) -> Self {
35        Self::with_capacity(value_dtype, nullability, DEFAULT_BUILDER_CAPACITY)
36    }
37
38    /// Creates a new `ListBuilder` with the given `capacity`.
39    ///
40    /// # Notes
41    ///
42    /// The number of indices is one more than the number of lists in the array!
43    ///
44    /// See also: [`ListBuilder::with_values_and_index_capacity`].
45    pub fn with_capacity(
46        value_dtype: Arc<DType>,
47        nullability: Nullability,
48        index_capacity: usize,
49    ) -> Self {
50        Self::with_values_and_index_capacity(
51            value_dtype,
52            nullability,
53            // We choose an arbitrary capacity for values since we can't know the true capacity.
54            2 * index_capacity,
55            index_capacity,
56        )
57    }
58
59    /// Create a ListBuilder with the specified capacity for indices and values.
60    ///
61    /// # Notes
62    ///
63    /// The number of indices is one more than the number of lists in the array!
64    pub fn with_values_and_index_capacity(
65        value_dtype: Arc<DType>,
66        nullability: Nullability,
67        values_capacity: usize,
68        index_capacity: usize,
69    ) -> Self {
70        let value_builder = builder_with_capacity(value_dtype.as_ref(), values_capacity);
71        let mut index_builder = PrimitiveBuilder::<O>::with_capacity(NonNullable, index_capacity);
72
73        // The first index of the list, which is always 0 and represents an empty list.
74        index_builder.append_zero();
75
76        Self {
77            value_builder,
78            index_builder,
79            nulls: LazyNullBufferBuilder::new(index_capacity),
80            dtype: DType::List(value_dtype, nullability),
81        }
82    }
83
84    /// Appends a list `value` to the builder.
85    pub fn append_value(&mut self, value: ListScalar) -> VortexResult<()> {
86        match value.elements() {
87            None => {
88                if self.dtype.nullability() == NonNullable {
89                    vortex_bail!("Cannot append null value to non-nullable list");
90                }
91                self.append_null();
92            }
93            Some(elements) => {
94                for scalar in elements {
95                    // TODO(connor): This is slow, we should be able to append multiple values at
96                    // once, or the list scalar should hold an Array
97                    self.value_builder.append_scalar(&scalar)?;
98                }
99
100                self.nulls.append_non_null();
101                self.index_builder.append_value(
102                    O::from_usize(self.value_builder.len())
103                        .vortex_expect("Failed to convert from usize to O"),
104                );
105            }
106        }
107
108        Ok(())
109    }
110
111    /// Finishes the builder directly into a [`ListArray`].
112    pub fn finish_into_list(&mut self) -> ListArray {
113        assert_eq!(
114            self.index_builder.len(),
115            self.nulls.len() + 1,
116            "Indices length must be one more than nulls length."
117        );
118
119        ListArray::try_new(
120            self.value_builder.finish(),
121            self.index_builder.finish(),
122            self.nulls.finish_with_nullability(self.dtype.nullability()),
123        )
124        .vortex_expect("Buffer, offsets, and validity must have same length.")
125    }
126
127    /// The [`DType`] of the inner elements. Note that this is **not** the same as the [`DType`] of
128    /// the outer `List`.
129    pub fn element_dtype(&self) -> &DType {
130        let DType::List(element_dtype, _) = &self.dtype else {
131            vortex_panic!("`ListBuilder` has an incorrect dtype: {}", self.dtype);
132        };
133
134        element_dtype
135    }
136}
137
138impl<O: OffsetPType> ArrayBuilder for ListBuilder<O> {
139    fn as_any(&self) -> &dyn Any {
140        self
141    }
142
143    fn as_any_mut(&mut self) -> &mut dyn Any {
144        self
145    }
146
147    fn dtype(&self) -> &DType {
148        &self.dtype
149    }
150
151    fn len(&self) -> usize {
152        self.nulls.len()
153    }
154
155    fn append_zeros(&mut self, n: usize) {
156        let curr_len = self.value_builder.len();
157        for _ in 0..n {
158            self.index_builder.append_value(
159                O::from_usize(curr_len).vortex_expect("Failed to convert from usize to <O>"),
160            )
161        }
162        self.nulls.append_n_non_nulls(n);
163    }
164
165    unsafe fn append_nulls_unchecked(&mut self, n: usize) {
166        let count = self.value_builder.len();
167        for _ in 0..n {
168            // A list with a null element is can be a list with a zero-span offset and a validity
169            // bit set
170            self.index_builder.append_value(
171                O::from_usize(count).vortex_expect("Failed to convert from usize to <O>"),
172            )
173        }
174        self.nulls.append_n_nulls(n);
175    }
176
177    fn append_scalar(&mut self, scalar: &Scalar) -> VortexResult<()> {
178        vortex_ensure!(
179            scalar.dtype() == self.dtype(),
180            "ListBuilder expected scalar with dtype {:?}, got {:?}",
181            self.dtype(),
182            scalar.dtype()
183        );
184
185        let list_scalar = ListScalar::try_from(scalar)?;
186        self.append_value(list_scalar)
187    }
188
189    unsafe fn extend_from_array_unchecked(&mut self, array: &dyn Array) {
190        let list = array.to_list();
191        if list.is_empty() {
192            return;
193        }
194
195        let n_already_added_values = self.value_builder.len();
196        let Some(n_already_added_values) = O::from_usize(n_already_added_values) else {
197            vortex_panic!(
198                "cannot convert length {} to type {:?}",
199                n_already_added_values,
200                O::PTYPE
201            )
202        };
203
204        let offsets = list.offsets();
205        let elements = list.elements();
206
207        let index_dtype = self.index_builder.dtype();
208
209        let n_leading_junk_values_scalar = offsets
210            .scalar_at(0)
211            .cast(index_dtype)
212            .vortex_expect("Must cast to index dtype");
213        let n_leading_junk_values =
214            usize::try_from(&n_leading_junk_values_scalar).vortex_expect("Offset must be a usize");
215
216        let casted_offsets = cast(&offsets.slice(1..offsets.len()), index_dtype)
217            .vortex_expect("Offsets must be an index dtype");
218        let offsets_without_leading_junk =
219            sub_scalar(&casted_offsets, n_leading_junk_values_scalar)
220                .vortex_expect("Offsets must be able to subtract leading offset");
221        let offsets_into_builder =
222            add_scalar(&offsets_without_leading_junk, n_already_added_values.into())
223                .vortex_expect("Offsets must be able to add existing values offsets");
224
225        let last_offset = offsets.scalar_at(offsets.len() - 1);
226        let last_offset = usize::try_from(&last_offset).vortex_expect("Offset must be a usize");
227        let non_junk_values = elements.slice(n_leading_junk_values..last_offset);
228
229        self.nulls.append_validity_mask(array.validity_mask());
230        self.index_builder.extend_from_array(&offsets_into_builder);
231        self.value_builder.ensure_capacity(non_junk_values.len());
232        self.value_builder.extend_from_array(&non_junk_values);
233    }
234
235    fn ensure_capacity(&mut self, capacity: usize) {
236        self.value_builder.ensure_capacity(capacity);
237        self.index_builder.ensure_capacity(capacity);
238        self.nulls.ensure_capacity(capacity);
239    }
240
241    fn set_validity(&mut self, validity: Mask) {
242        self.nulls = LazyNullBufferBuilder::new(validity.len());
243        self.nulls.append_validity_mask(validity);
244    }
245
246    fn finish(&mut self) -> ArrayRef {
247        self.finish_into_list().into_array()
248    }
249
250    fn finish_into_canonical(&mut self) -> Canonical {
251        Canonical::List(self.finish_into_list())
252    }
253}
254
255#[cfg(test)]
256mod tests {
257    use std::sync::Arc;
258
259    use Nullability::{NonNullable, Nullable};
260    use vortex_buffer::buffer;
261    use vortex_dtype::PType::I32;
262    use vortex_dtype::{DType, Nullability};
263    use vortex_scalar::Scalar;
264
265    use crate::array::Array;
266    use crate::arrays::{ChunkedArray, ListArray};
267    use crate::builders::ArrayBuilder;
268    use crate::builders::list::ListBuilder;
269    use crate::validity::Validity;
270    use crate::vtable::ValidityHelper;
271    use crate::{IntoArray as _, OffsetPType, ToCanonical};
272
273    #[test]
274    fn test_empty() {
275        let mut builder = ListBuilder::<u32>::with_capacity(Arc::new(I32.into()), NonNullable, 0);
276
277        let list = builder.finish();
278        assert_eq!(list.len(), 0);
279    }
280
281    #[test]
282    fn test_values() {
283        let dtype: Arc<DType> = Arc::new(I32.into());
284        let mut builder = ListBuilder::<u32>::with_capacity(dtype.clone(), NonNullable, 0);
285
286        builder
287            .append_value(
288                Scalar::list(
289                    dtype.clone(),
290                    vec![1i32.into(), 2i32.into(), 3i32.into()],
291                    NonNullable,
292                )
293                .as_list(),
294            )
295            .unwrap();
296
297        builder
298            .append_value(
299                Scalar::list(
300                    dtype,
301                    vec![4i32.into(), 5i32.into(), 6i32.into()],
302                    NonNullable,
303                )
304                .as_list(),
305            )
306            .unwrap();
307
308        let list = builder.finish();
309        assert_eq!(list.len(), 2);
310
311        let list_array = list.to_list();
312
313        assert_eq!(list_array.list_elements_at(0).len(), 3);
314        assert_eq!(list_array.list_elements_at(1).len(), 3);
315    }
316
317    #[test]
318    fn test_append_empty_list() {
319        let dtype: Arc<DType> = Arc::new(I32.into());
320        let mut builder = ListBuilder::<u32>::with_capacity(dtype.clone(), NonNullable, 0);
321
322        assert!(
323            builder
324                .append_value(Scalar::list_empty(dtype, NonNullable).as_list())
325                .is_ok()
326        )
327    }
328
329    #[test]
330    fn test_nullable_values() {
331        let dtype: Arc<DType> = Arc::new(I32.into());
332        let mut builder = ListBuilder::<u32>::with_capacity(dtype.clone(), Nullable, 0);
333
334        builder
335            .append_value(
336                Scalar::list(
337                    dtype.clone(),
338                    vec![1i32.into(), 2i32.into(), 3i32.into()],
339                    NonNullable,
340                )
341                .as_list(),
342            )
343            .unwrap();
344
345        builder
346            .append_value(Scalar::list_empty(dtype.clone(), NonNullable).as_list())
347            .unwrap();
348
349        builder
350            .append_value(
351                Scalar::list(
352                    dtype,
353                    vec![4i32.into(), 5i32.into(), 6i32.into()],
354                    NonNullable,
355                )
356                .as_list(),
357            )
358            .unwrap();
359
360        let list = builder.finish();
361        assert_eq!(list.len(), 3);
362
363        let list_array = list.to_list();
364
365        assert_eq!(list_array.list_elements_at(0).len(), 3);
366        assert_eq!(list_array.list_elements_at(1).len(), 0);
367        assert_eq!(list_array.list_elements_at(2).len(), 3);
368    }
369
370    fn test_extend_builder_gen<O: OffsetPType>() {
371        let list = ListArray::from_iter_opt_slow::<O, _, _>(
372            [Some(vec![0, 1, 2]), None, Some(vec![4, 5])],
373            Arc::new(I32.into()),
374        )
375        .unwrap();
376
377        let mut builder = ListBuilder::<O>::with_capacity(Arc::new(I32.into()), Nullable, 6);
378
379        builder.extend_from_array(&list);
380        builder.extend_from_array(&list);
381        builder.extend_from_array(&list.slice(0..0));
382        builder.extend_from_array(&list.slice(1..3));
383
384        let expected = ListArray::from_iter_opt_slow::<O, _, _>(
385            [
386                Some(vec![0, 1, 2]),
387                None,
388                Some(vec![4, 5]),
389                Some(vec![0, 1, 2]),
390                None,
391                Some(vec![4, 5]),
392                None,
393                Some(vec![4, 5]),
394            ],
395            Arc::new(DType::Primitive(I32, NonNullable)),
396        )
397        .unwrap()
398        .to_list();
399
400        let actual = builder.finish_into_canonical().into_list();
401
402        assert_eq!(
403            actual.elements().to_primitive().as_slice::<i32>(),
404            expected.elements().to_primitive().as_slice::<i32>()
405        );
406
407        assert_eq!(
408            actual.offsets().to_primitive().as_slice::<O>(),
409            expected.offsets().to_primitive().as_slice::<O>()
410        );
411
412        assert_eq!(actual.validity(), expected.validity())
413    }
414
415    #[test]
416    fn test_extend_builder() {
417        test_extend_builder_gen::<i8>();
418        test_extend_builder_gen::<i16>();
419        test_extend_builder_gen::<i32>();
420        test_extend_builder_gen::<i64>();
421
422        test_extend_builder_gen::<u8>();
423        test_extend_builder_gen::<u16>();
424        test_extend_builder_gen::<u32>();
425        test_extend_builder_gen::<u64>();
426    }
427
428    #[test]
429    pub fn test_array_with_gap() {
430        let one_trailing_unused_element = ListArray::try_new(
431            buffer![1, 2, 3, 4].into_array(),
432            buffer![0, 3].into_array(),
433            Validity::NonNullable,
434        )
435        .unwrap();
436
437        let second_array = ListArray::try_new(
438            buffer![5, 6].into_array(),
439            buffer![0, 2].into_array(),
440            Validity::NonNullable,
441        )
442        .unwrap();
443
444        let chunked_list = ChunkedArray::try_new(
445            vec![
446                one_trailing_unused_element.clone().into_array(),
447                second_array.clone().into_array(),
448            ],
449            DType::List(Arc::new(DType::Primitive(I32, NonNullable)), NonNullable),
450        );
451
452        let canon_values = chunked_list.unwrap().to_list();
453
454        assert_eq!(
455            one_trailing_unused_element.scalar_at(0),
456            canon_values.scalar_at(0)
457        );
458        assert_eq!(second_array.scalar_at(0), canon_values.scalar_at(1));
459    }
460
461    #[test]
462    fn test_append_scalar() {
463        let dtype: Arc<DType> = Arc::new(I32.into());
464        let mut builder = ListBuilder::<u64>::with_capacity(dtype.clone(), Nullable, 10);
465
466        // Test appending a valid list.
467        let list_scalar1 = Scalar::list(dtype.clone(), vec![1i32.into(), 2i32.into()], Nullable);
468        builder.append_scalar(&list_scalar1).unwrap();
469
470        // Test appending another list.
471        let list_scalar2 = Scalar::list(
472            dtype.clone(),
473            vec![3i32.into(), 4i32.into(), 5i32.into()],
474            Nullable,
475        );
476        builder.append_scalar(&list_scalar2).unwrap();
477
478        // Test appending null value.
479        let null_scalar = Scalar::null(DType::List(dtype.clone(), Nullable));
480        builder.append_scalar(&null_scalar).unwrap();
481
482        let array = builder.finish_into_list();
483        assert_eq!(array.len(), 3);
484
485        // Check actual values using scalar_at.
486
487        let scalar0 = array.scalar_at(0);
488        let list0 = scalar0.as_list();
489        assert_eq!(list0.len(), 2);
490        if let Some(list0_items) = list0.elements() {
491            assert_eq!(list0_items[0].as_primitive().typed_value::<i32>(), Some(1));
492            assert_eq!(list0_items[1].as_primitive().typed_value::<i32>(), Some(2));
493        }
494
495        let scalar1 = array.scalar_at(1);
496        let list1 = scalar1.as_list();
497        assert_eq!(list1.len(), 3);
498        if let Some(list1_items) = list1.elements() {
499            assert_eq!(list1_items[0].as_primitive().typed_value::<i32>(), Some(3));
500            assert_eq!(list1_items[1].as_primitive().typed_value::<i32>(), Some(4));
501            assert_eq!(list1_items[2].as_primitive().typed_value::<i32>(), Some(5));
502        }
503
504        let scalar2 = array.scalar_at(2);
505        let list2 = scalar2.as_list();
506        assert!(list2.is_null()); // This should be null.
507
508        // Check validity.
509        assert!(array.validity().is_valid(0));
510        assert!(array.validity().is_valid(1));
511        assert!(!array.validity().is_valid(2));
512
513        // Test wrong dtype error.
514        let mut builder = ListBuilder::<u64>::with_capacity(dtype, NonNullable, 10);
515        let wrong_scalar = Scalar::from(42i32);
516        assert!(builder.append_scalar(&wrong_scalar).is_err());
517    }
518}