vortex_array/builders/
list.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::any::Any;
5use std::sync::Arc;
6
7use vortex_dtype::Nullability::NonNullable;
8use vortex_dtype::{DType, NativePType, Nullability};
9use vortex_error::{VortexExpect, VortexResult, vortex_bail, vortex_panic};
10use vortex_mask::Mask;
11use vortex_scalar::ListScalar;
12
13use crate::arrays::{ListArray, OffsetPType};
14use crate::builders::{
15    ArrayBuilder, DEFAULT_BUILDER_CAPACITY, LazyNullBufferBuilder, PrimitiveBuilder,
16    builder_with_capacity,
17};
18use crate::canonical::{Canonical, ToCanonical};
19use crate::compute::{add_scalar, cast, sub_scalar};
20use crate::{Array, ArrayRef, IntoArray};
21
22/// The builder for building a [`ListArray`], parametrized by the `PType` of the offsets buffer.
23pub struct ListBuilder<O: NativePType> {
24    dtype: DType,
25    /// The values of the list.
26    value_builder: Box<dyn ArrayBuilder>,
27    /// Represents the offsets into the values array.
28    index_builder: PrimitiveBuilder<O>,
29    nulls: LazyNullBufferBuilder,
30}
31
32impl<O: OffsetPType> ListBuilder<O> {
33    /// Creates a new `ListBuilder` with a capacity of [`DEFAULT_BUILDER_CAPACITY`].
34    pub fn new(value_dtype: Arc<DType>, nullability: Nullability) -> Self {
35        Self::with_capacity(value_dtype, nullability, DEFAULT_BUILDER_CAPACITY)
36    }
37
38    /// Creates a new `ListBuilder` with the given `capacity`.
39    ///
40    /// # Notes
41    ///
42    /// The number of indices is one more than the number of lists in the array!
43    ///
44    /// See also: [`ListBuilder::with_values_and_index_capacity`].
45    pub fn with_capacity(
46        value_dtype: Arc<DType>,
47        nullability: Nullability,
48        index_capacity: usize,
49    ) -> Self {
50        Self::with_values_and_index_capacity(
51            value_dtype,
52            nullability,
53            // We choose an arbitrary capacity for values since we can't know the true capacity.
54            2 * index_capacity,
55            index_capacity,
56        )
57    }
58
59    /// Create a ListBuilder with the specified capacity for indices and values.
60    ///
61    /// # Notes
62    ///
63    /// The number of indices is one more than the number of lists in the array!
64    pub fn with_values_and_index_capacity(
65        value_dtype: Arc<DType>,
66        nullability: Nullability,
67        values_capacity: usize,
68        index_capacity: usize,
69    ) -> Self {
70        let value_builder = builder_with_capacity(value_dtype.as_ref(), values_capacity);
71        let mut index_builder = PrimitiveBuilder::<O>::with_capacity(NonNullable, index_capacity);
72
73        // The first index of the list, which is always 0 and represents an empty list.
74        index_builder.append_zero();
75
76        Self {
77            value_builder,
78            index_builder,
79            nulls: LazyNullBufferBuilder::new(index_capacity),
80            dtype: DType::List(value_dtype, nullability),
81        }
82    }
83
84    /// Appends a list `value` to the builder.
85    pub fn append_value(&mut self, value: ListScalar) -> VortexResult<()> {
86        match value.elements() {
87            None => {
88                if self.dtype.nullability() == NonNullable {
89                    vortex_bail!("Cannot append null value to non-nullable list");
90                }
91                self.append_null();
92            }
93            Some(elements) => {
94                for scalar in elements {
95                    // TODO(connor): This is slow, we should be able to append multiple values at
96                    // once, or the list scalar should hold an Array
97                    self.value_builder.append_scalar(&scalar)?;
98                }
99
100                self.nulls.append_non_null();
101                self.index_builder.append_value(
102                    O::from_usize(self.value_builder.len())
103                        .vortex_expect("Failed to convert from usize to O"),
104                );
105            }
106        }
107
108        Ok(())
109    }
110
111    /// Appends an optional list value to the builder.
112    ///
113    /// If the value is `Some`, it appends the list. If the value is `None`, it appends a null.
114    ///
115    /// # Panics
116    ///
117    /// This method will panic if the input is `None` and the builder is non-nullable.
118    pub fn append_option(&mut self, value: Option<ListScalar>) -> VortexResult<()> {
119        match value {
120            Some(value) => self.append_value(value),
121            None => {
122                self.append_null();
123                Ok(())
124            }
125        }
126    }
127
128    /// Finishes the builder directly into a [`ListArray`].
129    pub fn finish_into_list(&mut self) -> ListArray {
130        assert_eq!(
131            self.index_builder.len(),
132            self.nulls.len() + 1,
133            "Indices length must be one more than nulls length."
134        );
135
136        // TODO(connor): Use `new_unchecked` here.
137        ListArray::try_new(
138            self.value_builder.finish(),
139            self.index_builder.finish(),
140            self.nulls.finish_with_nullability(self.dtype.nullability()),
141        )
142        .vortex_expect("Buffer, offsets, and validity must have same length.")
143    }
144
145    /// The [`DType`] of the inner elements. Note that this is **not** the same as the [`DType`] of
146    /// the outer `List`.
147    pub fn element_dtype(&self) -> &DType {
148        let DType::List(element_dtype, _) = &self.dtype else {
149            vortex_panic!("`ListBuilder` has an incorrect dtype: {}", self.dtype);
150        };
151
152        element_dtype
153    }
154}
155
156impl<O: OffsetPType> ArrayBuilder for ListBuilder<O> {
157    fn as_any(&self) -> &dyn Any {
158        self
159    }
160
161    fn as_any_mut(&mut self) -> &mut dyn Any {
162        self
163    }
164
165    fn dtype(&self) -> &DType {
166        &self.dtype
167    }
168
169    fn len(&self) -> usize {
170        self.nulls.len()
171    }
172
173    fn append_zeros(&mut self, n: usize) {
174        let count = self.value_builder.len();
175        // TODO(connor): this is incorrect, as it creates lists of size 1 instead of 0.
176        self.value_builder.append_zeros(n);
177        for i in 0..n {
178            self.index_builder.append_value(
179                O::from_usize(count + i + 1).vortex_expect("Failed to convert from usize to <O>"),
180            )
181        }
182        self.nulls.append_n_non_nulls(n);
183    }
184
185    unsafe fn append_nulls_unchecked(&mut self, n: usize) {
186        let count = self.value_builder.len();
187        for _ in 0..n {
188            // A list with a null element is can be a list with a zero-span offset and a validity
189            // bit set
190            self.index_builder.append_value(
191                O::from_usize(count).vortex_expect("Failed to convert from usize to <O>"),
192            )
193        }
194        self.nulls.append_n_nulls(n);
195    }
196
197    unsafe fn extend_from_array_unchecked(&mut self, array: &dyn Array) {
198        let list = array.to_list();
199        if list.is_empty() {
200            return;
201        }
202
203        let n_already_added_values = self.value_builder.len();
204        let Some(n_already_added_values) = O::from_usize(n_already_added_values) else {
205            vortex_panic!(
206                "cannot convert length {} to type {:?}",
207                n_already_added_values,
208                O::PTYPE
209            )
210        };
211
212        let offsets = list.offsets();
213        let elements = list.elements();
214
215        let index_dtype = self.index_builder.dtype();
216
217        let n_leading_junk_values_scalar = offsets
218            .scalar_at(0)
219            .cast(index_dtype)
220            .vortex_expect("Must cast to index dtype");
221        let n_leading_junk_values =
222            usize::try_from(&n_leading_junk_values_scalar).vortex_expect("Offset must be a usize");
223
224        let casted_offsets = cast(&offsets.slice(1..offsets.len()), index_dtype)
225            .vortex_expect("Offsets must be an index dtype");
226        let offsets_without_leading_junk =
227            sub_scalar(&casted_offsets, n_leading_junk_values_scalar)
228                .vortex_expect("Offsets must be able to subtract leading offset");
229        let offsets_into_builder =
230            add_scalar(&offsets_without_leading_junk, n_already_added_values.into())
231                .vortex_expect("Offsets must be able to add existing values offsets");
232
233        let last_offset = offsets.scalar_at(offsets.len() - 1);
234        let last_offset = usize::try_from(&last_offset).vortex_expect("Offset must be a usize");
235        let non_junk_values = elements.slice(n_leading_junk_values..last_offset);
236
237        self.nulls.append_validity_mask(array.validity_mask());
238        self.index_builder.extend_from_array(&offsets_into_builder);
239        self.value_builder.ensure_capacity(non_junk_values.len());
240        self.value_builder.extend_from_array(&non_junk_values);
241    }
242
243    fn ensure_capacity(&mut self, capacity: usize) {
244        self.value_builder.ensure_capacity(capacity);
245        self.index_builder.ensure_capacity(capacity);
246        self.nulls.ensure_capacity(capacity);
247    }
248
249    fn set_validity(&mut self, validity: Mask) {
250        self.nulls = LazyNullBufferBuilder::new(validity.len());
251        self.nulls.append_validity_mask(validity);
252    }
253
254    fn finish(&mut self) -> ArrayRef {
255        self.finish_into_list().into_array()
256    }
257
258    fn finish_into_canonical(&mut self) -> Canonical {
259        Canonical::List(self.finish_into_list())
260    }
261}
262
263#[cfg(test)]
264mod tests {
265    use std::sync::Arc;
266
267    use Nullability::{NonNullable, Nullable};
268    use vortex_buffer::buffer;
269    use vortex_dtype::PType::I32;
270    use vortex_dtype::{DType, Nullability};
271    use vortex_scalar::Scalar;
272
273    use crate::array::Array;
274    use crate::arrays::{ChunkedArray, ListArray, OffsetPType};
275    use crate::builders::ArrayBuilder;
276    use crate::builders::list::ListBuilder;
277    use crate::validity::Validity;
278    use crate::vtable::ValidityHelper;
279    use crate::{IntoArray as _, ToCanonical};
280
281    #[test]
282    fn test_empty() {
283        let mut builder = ListBuilder::<u32>::with_capacity(Arc::new(I32.into()), NonNullable, 0);
284
285        let list = builder.finish();
286        assert_eq!(list.len(), 0);
287    }
288
289    #[test]
290    fn test_values() {
291        let dtype: Arc<DType> = Arc::new(I32.into());
292        let mut builder = ListBuilder::<u32>::with_capacity(dtype.clone(), NonNullable, 0);
293
294        builder
295            .append_value(
296                Scalar::list(
297                    dtype.clone(),
298                    vec![1i32.into(), 2i32.into(), 3i32.into()],
299                    NonNullable,
300                )
301                .as_list(),
302            )
303            .unwrap();
304
305        builder
306            .append_value(
307                Scalar::list(
308                    dtype,
309                    vec![4i32.into(), 5i32.into(), 6i32.into()],
310                    NonNullable,
311                )
312                .as_list(),
313            )
314            .unwrap();
315
316        let list = builder.finish();
317        assert_eq!(list.len(), 2);
318
319        let list_array = list.to_list();
320
321        assert_eq!(list_array.elements_at(0).len(), 3);
322        assert_eq!(list_array.elements_at(1).len(), 3);
323    }
324
325    #[test]
326    fn test_append_empty_list() {
327        let dtype: Arc<DType> = Arc::new(I32.into());
328        let mut builder = ListBuilder::<u32>::with_capacity(dtype.clone(), NonNullable, 0);
329
330        assert!(
331            builder
332                .append_value(Scalar::list_empty(dtype, NonNullable).as_list())
333                .is_ok()
334        )
335    }
336
337    #[test]
338    fn test_nullable_values() {
339        let dtype: Arc<DType> = Arc::new(I32.into());
340        let mut builder = ListBuilder::<u32>::with_capacity(dtype.clone(), Nullable, 0);
341
342        builder
343            .append_value(
344                Scalar::list(
345                    dtype.clone(),
346                    vec![1i32.into(), 2i32.into(), 3i32.into()],
347                    NonNullable,
348                )
349                .as_list(),
350            )
351            .unwrap();
352
353        builder
354            .append_value(Scalar::list_empty(dtype.clone(), NonNullable).as_list())
355            .unwrap();
356
357        builder
358            .append_value(
359                Scalar::list(
360                    dtype,
361                    vec![4i32.into(), 5i32.into(), 6i32.into()],
362                    NonNullable,
363                )
364                .as_list(),
365            )
366            .unwrap();
367
368        let list = builder.finish();
369        assert_eq!(list.len(), 3);
370
371        let list_array = list.to_list();
372
373        assert_eq!(list_array.elements_at(0).len(), 3);
374        assert_eq!(list_array.elements_at(1).len(), 0);
375        assert_eq!(list_array.elements_at(2).len(), 3);
376    }
377
378    fn test_extend_builder_gen<O: OffsetPType>() {
379        let list = ListArray::from_iter_opt_slow::<O, _, _>(
380            [Some(vec![0, 1, 2]), None, Some(vec![4, 5])],
381            Arc::new(I32.into()),
382        )
383        .unwrap();
384
385        let mut builder = ListBuilder::<O>::with_capacity(Arc::new(I32.into()), Nullable, 6);
386
387        builder.extend_from_array(&list);
388        builder.extend_from_array(&list);
389        builder.extend_from_array(&list.slice(0..0));
390        builder.extend_from_array(&list.slice(1..3));
391
392        let expected = ListArray::from_iter_opt_slow::<O, _, _>(
393            [
394                Some(vec![0, 1, 2]),
395                None,
396                Some(vec![4, 5]),
397                Some(vec![0, 1, 2]),
398                None,
399                Some(vec![4, 5]),
400                None,
401                Some(vec![4, 5]),
402            ],
403            Arc::new(DType::Primitive(I32, NonNullable)),
404        )
405        .unwrap()
406        .to_list();
407
408        let actual = builder.finish_into_canonical().into_list();
409
410        assert_eq!(
411            actual.elements().to_primitive().as_slice::<i32>(),
412            expected.elements().to_primitive().as_slice::<i32>()
413        );
414
415        assert_eq!(
416            actual.offsets().to_primitive().as_slice::<O>(),
417            expected.offsets().to_primitive().as_slice::<O>()
418        );
419
420        assert_eq!(actual.validity(), expected.validity())
421    }
422
423    #[test]
424    fn test_extend_builder() {
425        test_extend_builder_gen::<i8>();
426        test_extend_builder_gen::<i16>();
427        test_extend_builder_gen::<i32>();
428        test_extend_builder_gen::<i64>();
429
430        test_extend_builder_gen::<u8>();
431        test_extend_builder_gen::<u16>();
432        test_extend_builder_gen::<u32>();
433        test_extend_builder_gen::<u64>();
434    }
435
436    #[test]
437    pub fn test_array_with_gap() {
438        let one_trailing_unused_element = ListArray::try_new(
439            buffer![1, 2, 3, 4].into_array(),
440            buffer![0, 3].into_array(),
441            Validity::NonNullable,
442        )
443        .unwrap();
444
445        let second_array = ListArray::try_new(
446            buffer![5, 6].into_array(),
447            buffer![0, 2].into_array(),
448            Validity::NonNullable,
449        )
450        .unwrap();
451
452        let chunked_list = ChunkedArray::try_new(
453            vec![
454                one_trailing_unused_element.clone().into_array(),
455                second_array.clone().into_array(),
456            ],
457            DType::List(Arc::new(DType::Primitive(I32, NonNullable)), NonNullable),
458        );
459
460        let canon_values = chunked_list.unwrap().to_list();
461
462        assert_eq!(
463            one_trailing_unused_element.scalar_at(0),
464            canon_values.scalar_at(0)
465        );
466        assert_eq!(second_array.scalar_at(0), canon_values.scalar_at(1));
467    }
468}