vortex_array/builders/
list.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::any::Any;
5use std::sync::Arc;
6
7use vortex_dtype::Nullability::NonNullable;
8use vortex_dtype::{DType, NativePType, Nullability};
9use vortex_error::{VortexExpect, VortexResult, vortex_bail};
10use vortex_mask::Mask;
11use vortex_scalar::ListScalar;
12
13use crate::arrays::{ListArray, OffsetPType};
14use crate::builders::lazy_validity_builder::LazyNullBufferBuilder;
15use crate::builders::{ArrayBuilder, ArrayBuilderExt, PrimitiveBuilder, builder_with_capacity};
16use crate::compute::{add_scalar, cast, sub_scalar};
17use crate::{Array, ArrayRef, IntoArray, ToCanonical};
18
19pub struct ListBuilder<O: NativePType> {
20    /// The values of the list.
21    value_builder: Box<dyn ArrayBuilder>,
22    /// Represents the offsets into the values array.
23    index_builder: PrimitiveBuilder<O>,
24    nulls: LazyNullBufferBuilder,
25    // TODO(connor): This can probably be removed since we store it in the `dtype` field as well.
26    nullability: Nullability,
27    dtype: DType,
28}
29
30impl<O: OffsetPType> ListBuilder<O> {
31    /// Create a ListBuilder with the specified capacity for indices.
32    ///
33    /// # Notes
34    ///
35    /// The number of indices is one more than the number of lists in the array!
36    ///
37    /// See also: [ListBuilder::with_values_and_index_capacity].
38    pub fn with_capacity(
39        value_dtype: Arc<DType>,
40        nullability: Nullability,
41        index_capacity: usize,
42    ) -> Self {
43        // I would expect the list to have more than one value per index
44        Self::with_values_and_index_capacity(
45            value_dtype,
46            nullability,
47            2 * index_capacity,
48            index_capacity,
49        )
50    }
51
52    /// Create a ListBuilder with the specified capacity for indices and values.
53    ///
54    /// # Notes
55    ///
56    /// The number of indices is one more than the number of lists in the array!
57    pub fn with_values_and_index_capacity(
58        value_dtype: Arc<DType>,
59        nullability: Nullability,
60        values_capacity: usize,
61        index_capacity: usize,
62    ) -> Self {
63        let value_builder = builder_with_capacity(value_dtype.as_ref(), values_capacity);
64        let mut index_builder = PrimitiveBuilder::with_capacity(NonNullable, index_capacity);
65
66        // The first index of the list, which is always 0 and represents an empty list.
67        index_builder.append_zero();
68
69        Self {
70            value_builder,
71            index_builder,
72            nulls: LazyNullBufferBuilder::new(index_capacity),
73            nullability,
74            dtype: DType::List(value_dtype, nullability),
75        }
76    }
77
78    pub fn append_value(&mut self, value: ListScalar) -> VortexResult<()> {
79        match value.elements() {
80            None => {
81                if self.nullability == NonNullable {
82                    vortex_bail!("Cannot append null value to non-nullable list");
83                }
84                self.append_null();
85                Ok(())
86            }
87            Some(elements) => {
88                for scalar in elements {
89                    // TODO(connor): This is slow, we should be able to append multiple values at
90                    // once, or the list scalar should hold an Array
91                    self.value_builder.append_scalar(&scalar)?;
92                }
93                self.nulls.append_non_null();
94                self.append_index(
95                    O::from_usize(self.value_builder.len())
96                        .vortex_expect("Failed to convert from usize to O"),
97                )
98            }
99        }
100    }
101
102    fn append_index(&mut self, index: O) -> VortexResult<()> {
103        self.index_builder.append_scalar(&index.into())
104    }
105}
106
107impl<O: OffsetPType> ArrayBuilder for ListBuilder<O> {
108    fn as_any(&self) -> &dyn Any {
109        self
110    }
111
112    fn as_any_mut(&mut self) -> &mut dyn Any {
113        self
114    }
115
116    fn dtype(&self) -> &DType {
117        &self.dtype
118    }
119
120    fn len(&self) -> usize {
121        self.nulls.len()
122    }
123
124    fn append_zeros(&mut self, n: usize) {
125        let count = self.value_builder.len();
126        // TODO(connor): this is incorrect, as it creates lists of size 1 instead of 0.
127        self.value_builder.append_zeros(n);
128        for i in 0..n {
129            self.append_index(
130                O::from_usize(count + i + 1).vortex_expect("Failed to convert from usize to <O>"),
131            )
132            .vortex_expect("Failed to append index");
133        }
134        self.nulls.append_n_non_nulls(n);
135    }
136
137    fn append_nulls(&mut self, n: usize) {
138        let count = self.value_builder.len();
139        for _ in 0..n {
140            // A list with a null element is can be a list with a zero-span offset and a validity
141            // bit set
142            self.append_index(
143                O::from_usize(count).vortex_expect("Failed to convert from usize to <O>"),
144            )
145            .vortex_expect("Failed to append index");
146        }
147        self.nulls.append_n_nulls(n);
148    }
149
150    fn extend_from_array(&mut self, array: &dyn Array) -> VortexResult<()> {
151        let list = array.to_list()?;
152        if list.is_empty() {
153            return Ok(());
154        }
155
156        let n_already_added_values = self.value_builder.len();
157        let Some(n_already_added_values) = O::from_usize(n_already_added_values) else {
158            vortex_bail!(
159                "cannot convert length {} to type {:?}",
160                n_already_added_values,
161                O::PTYPE
162            )
163        };
164
165        let offsets = list.offsets();
166        let elements = list.elements();
167
168        let index_dtype = self.index_builder.dtype();
169
170        let n_leading_junk_values_scalar = offsets.scalar_at(0).cast(index_dtype)?;
171        let n_leading_junk_values = usize::try_from(&n_leading_junk_values_scalar)?;
172
173        let casted_offsets = cast(&offsets.slice(1..offsets.len()), index_dtype)?;
174        let offsets_without_leading_junk =
175            sub_scalar(&casted_offsets, n_leading_junk_values_scalar)?;
176        let offsets_into_builder =
177            add_scalar(&offsets_without_leading_junk, n_already_added_values.into())?;
178
179        let last_offset = offsets.scalar_at(offsets.len() - 1);
180        let last_offset = usize::try_from(&last_offset)?;
181        let non_junk_values = elements.slice(n_leading_junk_values..last_offset);
182
183        self.nulls.append_validity_mask(array.validity_mask());
184        self.index_builder
185            .extend_from_array(&offsets_into_builder)?;
186        self.value_builder.ensure_capacity(non_junk_values.len());
187        self.value_builder.extend_from_array(&non_junk_values)?;
188
189        Ok(())
190    }
191
192    fn ensure_capacity(&mut self, capacity: usize) {
193        self.index_builder.ensure_capacity(capacity);
194        self.value_builder.ensure_capacity(capacity);
195        self.nulls.ensure_capacity(capacity);
196    }
197
198    fn set_validity(&mut self, validity: Mask) {
199        self.nulls = LazyNullBufferBuilder::new(validity.len());
200        self.nulls.append_validity_mask(validity);
201    }
202
203    fn finish(&mut self) -> ArrayRef {
204        assert_eq!(
205            self.index_builder.len(),
206            self.nulls.len() + 1,
207            "Indices length must be one more than nulls length."
208        );
209
210        // TODO(connor): Use `new_unchecked` here.
211        ListArray::try_new(
212            self.value_builder.finish(),
213            self.index_builder.finish(),
214            self.nulls.finish_with_nullability(self.nullability),
215        )
216        .vortex_expect("Buffer, offsets, and validity must have same length.")
217        .into_array()
218    }
219}
220
221#[cfg(test)]
222mod tests {
223    use std::sync::Arc;
224
225    use Nullability::{NonNullable, Nullable};
226    use vortex_buffer::buffer;
227    use vortex_dtype::PType::I32;
228    use vortex_dtype::{DType, Nullability};
229    use vortex_scalar::Scalar;
230
231    use crate::array::Array;
232    use crate::arrays::{ChunkedArray, ListArray, OffsetPType};
233    use crate::builders::ArrayBuilder;
234    use crate::builders::list::ListBuilder;
235    use crate::validity::Validity;
236    use crate::vtable::ValidityHelper;
237    use crate::{IntoArray as _, ToCanonical};
238
239    #[test]
240    fn test_empty() {
241        let mut builder = ListBuilder::<u32>::with_capacity(Arc::new(I32.into()), NonNullable, 0);
242
243        let list = builder.finish();
244        assert_eq!(list.len(), 0);
245    }
246
247    #[test]
248    fn test_values() {
249        let dtype: Arc<DType> = Arc::new(I32.into());
250        let mut builder = ListBuilder::<u32>::with_capacity(dtype.clone(), NonNullable, 0);
251
252        builder
253            .append_value(
254                Scalar::list(
255                    dtype.clone(),
256                    vec![1i32.into(), 2i32.into(), 3i32.into()],
257                    NonNullable,
258                )
259                .as_list(),
260            )
261            .unwrap();
262
263        builder
264            .append_value(
265                Scalar::list(
266                    dtype,
267                    vec![4i32.into(), 5i32.into(), 6i32.into()],
268                    NonNullable,
269                )
270                .as_list(),
271            )
272            .unwrap();
273
274        let list = builder.finish();
275        assert_eq!(list.len(), 2);
276
277        let list_array = list.to_list().unwrap();
278
279        assert_eq!(list_array.elements_at(0).len(), 3);
280        assert_eq!(list_array.elements_at(1).len(), 3);
281    }
282
283    #[test]
284    fn test_append_empty_list() {
285        let dtype: Arc<DType> = Arc::new(I32.into());
286        let mut builder = ListBuilder::<u32>::with_capacity(dtype.clone(), NonNullable, 0);
287
288        assert!(
289            builder
290                .append_value(Scalar::list_empty(dtype, NonNullable).as_list())
291                .is_ok()
292        )
293    }
294
295    #[test]
296    fn test_nullable_values() {
297        let dtype: Arc<DType> = Arc::new(I32.into());
298        let mut builder = ListBuilder::<u32>::with_capacity(dtype.clone(), Nullable, 0);
299
300        builder
301            .append_value(
302                Scalar::list(
303                    dtype.clone(),
304                    vec![1i32.into(), 2i32.into(), 3i32.into()],
305                    NonNullable,
306                )
307                .as_list(),
308            )
309            .unwrap();
310
311        builder
312            .append_value(Scalar::list_empty(dtype.clone(), NonNullable).as_list())
313            .unwrap();
314
315        builder
316            .append_value(
317                Scalar::list(
318                    dtype,
319                    vec![4i32.into(), 5i32.into(), 6i32.into()],
320                    NonNullable,
321                )
322                .as_list(),
323            )
324            .unwrap();
325
326        let list = builder.finish();
327        assert_eq!(list.len(), 3);
328
329        let list_array = list.to_list().unwrap();
330
331        assert_eq!(list_array.elements_at(0).len(), 3);
332        assert_eq!(list_array.elements_at(1).len(), 0);
333        assert_eq!(list_array.elements_at(2).len(), 3);
334    }
335
336    fn test_extend_builder_gen<O: OffsetPType>() {
337        let list = ListArray::from_iter_opt_slow::<O, _, _>(
338            [Some(vec![0, 1, 2]), None, Some(vec![4, 5])],
339            Arc::new(I32.into()),
340        )
341        .unwrap();
342
343        let mut builder = ListBuilder::<O>::with_capacity(Arc::new(I32.into()), Nullable, 6);
344
345        builder.extend_from_array(&list).unwrap();
346        builder.extend_from_array(&list).unwrap();
347        builder.extend_from_array(&list.slice(0..0)).unwrap();
348        builder.extend_from_array(&list.slice(1..3)).unwrap();
349
350        let expected = ListArray::from_iter_opt_slow::<O, _, _>(
351            [
352                Some(vec![0, 1, 2]),
353                None,
354                Some(vec![4, 5]),
355                Some(vec![0, 1, 2]),
356                None,
357                Some(vec![4, 5]),
358                None,
359                Some(vec![4, 5]),
360            ],
361            Arc::new(DType::Primitive(I32, NonNullable)),
362        )
363        .unwrap()
364        .to_list()
365        .unwrap();
366
367        let actual = builder
368            .finish()
369            .to_canonical()
370            .unwrap()
371            .into_list()
372            .unwrap();
373
374        assert_eq!(
375            actual.elements().to_primitive().unwrap().as_slice::<i32>(),
376            expected
377                .elements()
378                .to_primitive()
379                .unwrap()
380                .as_slice::<i32>()
381        );
382
383        assert_eq!(
384            actual.offsets().to_primitive().unwrap().as_slice::<O>(),
385            expected.offsets().to_primitive().unwrap().as_slice::<O>()
386        );
387
388        assert_eq!(actual.validity(), expected.validity())
389    }
390
391    #[test]
392    fn test_extend_builder() {
393        test_extend_builder_gen::<i8>();
394        test_extend_builder_gen::<i16>();
395        test_extend_builder_gen::<i32>();
396        test_extend_builder_gen::<i64>();
397
398        test_extend_builder_gen::<u8>();
399        test_extend_builder_gen::<u16>();
400        test_extend_builder_gen::<u32>();
401        test_extend_builder_gen::<u64>();
402    }
403
404    #[test]
405    pub fn test_array_with_gap() {
406        let one_trailing_unused_element = ListArray::try_new(
407            buffer![1, 2, 3, 4].into_array(),
408            buffer![0, 3].into_array(),
409            Validity::NonNullable,
410        )
411        .unwrap();
412
413        let second_array = ListArray::try_new(
414            buffer![5, 6].into_array(),
415            buffer![0, 2].into_array(),
416            Validity::NonNullable,
417        )
418        .unwrap();
419
420        let chunked_list = ChunkedArray::try_new(
421            vec![
422                one_trailing_unused_element.clone().into_array(),
423                second_array.clone().into_array(),
424            ],
425            DType::List(Arc::new(DType::Primitive(I32, NonNullable)), NonNullable),
426        );
427
428        let canon_values = chunked_list.unwrap().to_list().unwrap();
429
430        assert_eq!(
431            one_trailing_unused_element.scalar_at(0),
432            canon_values.scalar_at(0)
433        );
434        assert_eq!(second_array.scalar_at(0), canon_values.scalar_at(1));
435    }
436}