vortex_array/builders/
list.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::any::Any;
5use std::sync::Arc;
6
7use vortex_dtype::Nullability::NonNullable;
8use vortex_dtype::{DType, NativePType, Nullability};
9use vortex_error::{VortexExpect, VortexResult, vortex_bail};
10use vortex_mask::Mask;
11use vortex_scalar::ListScalar;
12
13use crate::arrays::{ListArray, OffsetPType};
14use crate::builders::lazy_validity_builder::LazyNullBufferBuilder;
15use crate::builders::{ArrayBuilder, ArrayBuilderExt, PrimitiveBuilder, builder_with_capacity};
16use crate::compute::{add_scalar, cast, sub_scalar};
17use crate::{Array, ArrayRef, IntoArray, ToCanonical};
18
19pub struct ListBuilder<O: NativePType> {
20    /// The values of the list.
21    value_builder: Box<dyn ArrayBuilder>,
22    /// Represents the offsets into the values array.
23    index_builder: PrimitiveBuilder<O>,
24    nulls: LazyNullBufferBuilder,
25    nullability: Nullability,
26    dtype: DType,
27}
28
29impl<O: OffsetPType> ListBuilder<O> {
30    /// Create a ListBuilder with the specified capacity for indices.
31    ///
32    /// # Notes
33    ///
34    /// The number of indices is one more than the number of lists in the array!
35    ///
36    /// See also: [ListBuilder::with_values_and_index_capacity].
37    pub fn with_capacity(
38        value_dtype: Arc<DType>,
39        nullability: Nullability,
40        index_capacity: usize,
41    ) -> Self {
42        // I would expect the list to have more than one value per index
43        Self::with_values_and_index_capacity(
44            value_dtype,
45            nullability,
46            2 * index_capacity,
47            index_capacity,
48        )
49    }
50
51    /// Create a ListBuilder with the specified capacity for indices and values.
52    ///
53    /// # Notes
54    ///
55    /// The number of indices is one more than the number of lists in the array!
56    pub fn with_values_and_index_capacity(
57        value_dtype: Arc<DType>,
58        nullability: Nullability,
59        values_capacity: usize,
60        index_capacity: usize,
61    ) -> Self {
62        let value_builder = builder_with_capacity(value_dtype.as_ref(), values_capacity);
63        let mut index_builder = PrimitiveBuilder::with_capacity(NonNullable, index_capacity);
64
65        // The first index of the list, which is always 0 and represents an empty list.
66        index_builder.append_zero();
67
68        Self {
69            value_builder,
70            index_builder,
71            nulls: LazyNullBufferBuilder::new(index_capacity),
72            nullability,
73            dtype: DType::List(value_dtype, nullability),
74        }
75    }
76
77    pub fn append_value(&mut self, value: ListScalar) -> VortexResult<()> {
78        match value.elements() {
79            None => {
80                if self.nullability == NonNullable {
81                    vortex_bail!("Cannot append null value to non-nullable list");
82                }
83                self.append_null();
84                Ok(())
85            }
86            Some(elements) => {
87                for scalar in elements {
88                    // TODO(joe): This is slow, we should be able to append multiple values at once,
89                    // or the list scalar should hold an Array
90                    self.value_builder.append_scalar(&scalar)?;
91                }
92                self.nulls.append_non_null();
93                self.append_index(
94                    O::from_usize(self.value_builder.len())
95                        .vortex_expect("Failed to convert from usize to O"),
96                )
97            }
98        }
99    }
100
101    fn append_index(&mut self, index: O) -> VortexResult<()> {
102        self.index_builder.append_scalar(&index.into())
103    }
104}
105
106impl<O: OffsetPType> ArrayBuilder for ListBuilder<O> {
107    fn as_any(&self) -> &dyn Any {
108        self
109    }
110
111    fn as_any_mut(&mut self) -> &mut dyn Any {
112        self
113    }
114
115    fn dtype(&self) -> &DType {
116        &self.dtype
117    }
118
119    fn len(&self) -> usize {
120        self.nulls.len()
121    }
122
123    fn append_zeros(&mut self, n: usize) {
124        let count = self.value_builder.len();
125        self.value_builder.append_zeros(n);
126        for i in 0..n {
127            self.append_index(
128                O::from_usize(count + i + 1).vortex_expect("Failed to convert from usize to <O>"),
129            )
130            .vortex_expect("Failed to append index");
131        }
132        self.nulls.append_n_non_nulls(n);
133    }
134
135    fn append_nulls(&mut self, n: usize) {
136        let count = self.value_builder.len();
137        for _ in 0..n {
138            // A list with a null element is can be a list with a zero-span offset and a validity
139            // bit set
140            self.append_index(
141                O::from_usize(count).vortex_expect("Failed to convert from usize to <O>"),
142            )
143            .vortex_expect("Failed to append index");
144        }
145        self.nulls.append_n_nulls(n);
146    }
147
148    fn extend_from_array(&mut self, array: &dyn Array) -> VortexResult<()> {
149        let list = array.to_list()?;
150        if list.is_empty() {
151            return Ok(());
152        }
153
154        let n_already_added_values = self.value_builder.len();
155        let Some(n_already_added_values) = O::from_usize(n_already_added_values) else {
156            vortex_bail!(
157                "cannot convert length {} to type {:?}",
158                n_already_added_values,
159                O::PTYPE
160            )
161        };
162
163        let offsets = list.offsets();
164        let elements = list.elements();
165
166        let index_dtype = self.index_builder.dtype();
167
168        let n_leading_junk_values_scalar = offsets.scalar_at(0).cast(index_dtype)?;
169        let n_leading_junk_values = usize::try_from(&n_leading_junk_values_scalar)?;
170
171        let casted_offsets = cast(&offsets.slice(1, offsets.len()), index_dtype)?;
172        let offsets_without_leading_junk =
173            sub_scalar(&casted_offsets, n_leading_junk_values_scalar)?;
174        let offsets_into_builder =
175            add_scalar(&offsets_without_leading_junk, n_already_added_values.into())?;
176
177        let last_offset = offsets.scalar_at(offsets.len() - 1);
178        let last_offset = usize::try_from(&last_offset)?;
179        let non_junk_values = elements.slice(n_leading_junk_values, last_offset);
180
181        self.nulls.append_validity_mask(array.validity_mask()?);
182        self.index_builder
183            .extend_from_array(&offsets_into_builder)?;
184        self.value_builder.ensure_capacity(non_junk_values.len());
185        self.value_builder.extend_from_array(&non_junk_values)?;
186
187        Ok(())
188    }
189
190    fn ensure_capacity(&mut self, capacity: usize) {
191        self.index_builder.ensure_capacity(capacity);
192        self.value_builder.ensure_capacity(capacity);
193        self.nulls.ensure_capacity(capacity);
194    }
195
196    fn set_validity(&mut self, validity: Mask) {
197        self.nulls = LazyNullBufferBuilder::new(validity.len());
198        self.nulls.append_validity_mask(validity);
199    }
200
201    fn finish(&mut self) -> ArrayRef {
202        assert_eq!(
203            self.index_builder.len(),
204            self.nulls.len() + 1,
205            "Indices length must be one more than nulls length."
206        );
207
208        ListArray::try_new(
209            self.value_builder.finish(),
210            self.index_builder.finish(),
211            self.nulls.finish_with_nullability(self.nullability),
212        )
213        .vortex_expect("Buffer, offsets, and validity must have same length.")
214        .into_array()
215    }
216}
217
218#[cfg(test)]
219mod tests {
220    use std::sync::Arc;
221
222    use Nullability::{NonNullable, Nullable};
223    use vortex_buffer::buffer;
224    use vortex_dtype::PType::I32;
225    use vortex_dtype::{DType, Nullability};
226    use vortex_scalar::Scalar;
227
228    use crate::array::Array;
229    use crate::arrays::{ChunkedArray, ListArray, OffsetPType};
230    use crate::builders::ArrayBuilder;
231    use crate::builders::list::ListBuilder;
232    use crate::validity::Validity;
233    use crate::vtable::ValidityHelper;
234    use crate::{IntoArray as _, ToCanonical};
235
236    #[test]
237    fn test_empty() {
238        let mut builder = ListBuilder::<u32>::with_capacity(Arc::new(I32.into()), NonNullable, 0);
239
240        let list = builder.finish();
241        assert_eq!(list.len(), 0);
242    }
243
244    #[test]
245    fn test_values() {
246        let dtype: Arc<DType> = Arc::new(I32.into());
247        let mut builder = ListBuilder::<u32>::with_capacity(dtype.clone(), NonNullable, 0);
248
249        builder
250            .append_value(
251                Scalar::list(
252                    dtype.clone(),
253                    vec![1i32.into(), 2i32.into(), 3i32.into()],
254                    NonNullable,
255                )
256                .as_list(),
257            )
258            .unwrap();
259
260        builder
261            .append_value(
262                Scalar::list(
263                    dtype,
264                    vec![4i32.into(), 5i32.into(), 6i32.into()],
265                    NonNullable,
266                )
267                .as_list(),
268            )
269            .unwrap();
270
271        let list = builder.finish();
272        assert_eq!(list.len(), 2);
273
274        let list_array = list.to_list().unwrap();
275
276        assert_eq!(list_array.elements_at(0).len(), 3);
277        assert_eq!(list_array.elements_at(1).len(), 3);
278    }
279
280    #[test]
281    fn test_append_empty_list() {
282        let dtype: Arc<DType> = Arc::new(I32.into());
283        let mut builder = ListBuilder::<u32>::with_capacity(dtype.clone(), NonNullable, 0);
284
285        assert!(
286            builder
287                .append_value(Scalar::list_empty(dtype, NonNullable).as_list())
288                .is_ok()
289        )
290    }
291
292    #[test]
293    fn test_nullable_values() {
294        let dtype: Arc<DType> = Arc::new(I32.into());
295        let mut builder = ListBuilder::<u32>::with_capacity(dtype.clone(), Nullable, 0);
296
297        builder
298            .append_value(
299                Scalar::list(
300                    dtype.clone(),
301                    vec![1i32.into(), 2i32.into(), 3i32.into()],
302                    NonNullable,
303                )
304                .as_list(),
305            )
306            .unwrap();
307
308        builder
309            .append_value(Scalar::list_empty(dtype.clone(), NonNullable).as_list())
310            .unwrap();
311
312        builder
313            .append_value(
314                Scalar::list(
315                    dtype,
316                    vec![4i32.into(), 5i32.into(), 6i32.into()],
317                    NonNullable,
318                )
319                .as_list(),
320            )
321            .unwrap();
322
323        let list = builder.finish();
324        assert_eq!(list.len(), 3);
325
326        let list_array = list.to_list().unwrap();
327
328        assert_eq!(list_array.elements_at(0).len(), 3);
329        assert_eq!(list_array.elements_at(1).len(), 0);
330        assert_eq!(list_array.elements_at(2).len(), 3);
331    }
332
333    fn test_extend_builder_gen<O: OffsetPType>() {
334        let list = ListArray::from_iter_opt_slow::<O, _, _>(
335            [Some(vec![0, 1, 2]), None, Some(vec![4, 5])],
336            Arc::new(I32.into()),
337        )
338        .unwrap();
339
340        let mut builder = ListBuilder::<O>::with_capacity(Arc::new(I32.into()), Nullable, 6);
341
342        builder.extend_from_array(&list).unwrap();
343        builder.extend_from_array(&list).unwrap();
344        builder.extend_from_array(&list.slice(0, 0)).unwrap();
345        builder.extend_from_array(&list.slice(1, 3)).unwrap();
346
347        let expected = ListArray::from_iter_opt_slow::<O, _, _>(
348            [
349                Some(vec![0, 1, 2]),
350                None,
351                Some(vec![4, 5]),
352                Some(vec![0, 1, 2]),
353                None,
354                Some(vec![4, 5]),
355                None,
356                Some(vec![4, 5]),
357            ],
358            Arc::new(DType::Primitive(I32, NonNullable)),
359        )
360        .unwrap()
361        .to_list()
362        .unwrap();
363
364        let actual = builder
365            .finish()
366            .to_canonical()
367            .unwrap()
368            .into_list()
369            .unwrap();
370
371        assert_eq!(
372            actual.elements().to_primitive().unwrap().as_slice::<i32>(),
373            expected
374                .elements()
375                .to_primitive()
376                .unwrap()
377                .as_slice::<i32>()
378        );
379
380        assert_eq!(
381            actual.offsets().to_primitive().unwrap().as_slice::<O>(),
382            expected.offsets().to_primitive().unwrap().as_slice::<O>()
383        );
384
385        assert_eq!(actual.validity(), expected.validity())
386    }
387
388    #[test]
389    fn test_extend_builder() {
390        test_extend_builder_gen::<i8>();
391        test_extend_builder_gen::<i16>();
392        test_extend_builder_gen::<i32>();
393        test_extend_builder_gen::<i64>();
394
395        test_extend_builder_gen::<u8>();
396        test_extend_builder_gen::<u16>();
397        test_extend_builder_gen::<u32>();
398        test_extend_builder_gen::<u64>();
399    }
400
401    #[test]
402    pub fn test_array_with_gap() {
403        let one_trailing_unused_element = ListArray::try_new(
404            buffer![1, 2, 3, 4].into_array(),
405            buffer![0, 3].into_array(),
406            Validity::NonNullable,
407        )
408        .unwrap();
409
410        let second_array = ListArray::try_new(
411            buffer![5, 6].into_array(),
412            buffer![0, 2].into_array(),
413            Validity::NonNullable,
414        )
415        .unwrap();
416
417        let chunked_list = ChunkedArray::try_new(
418            vec![
419                one_trailing_unused_element.clone().into_array(),
420                second_array.clone().into_array(),
421            ],
422            DType::List(Arc::new(DType::Primitive(I32, NonNullable)), NonNullable),
423        );
424
425        let canon_values = chunked_list.unwrap().to_list().unwrap();
426
427        assert_eq!(
428            one_trailing_unused_element.scalar_at(0),
429            canon_values.scalar_at(0)
430        );
431        assert_eq!(second_array.scalar_at(0), canon_values.scalar_at(1));
432    }
433}