vortex_array/arrays/chunked/vtable/
canonical.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use vortex_buffer::BufferMut;
5use vortex_dtype::{DType, Nullability, PType, StructFields};
6use vortex_error::VortexExpect;
7
8use crate::arrays::{
9    ChunkedArray, ChunkedVTable, ListViewArray, ListViewRebuildMode, PrimitiveArray, StructArray,
10};
11use crate::builders::{ArrayBuilder, builder_with_capacity};
12use crate::compute::cast;
13use crate::validity::Validity;
14use crate::vtable::CanonicalVTable;
15use crate::{Array, ArrayRef, Canonical, IntoArray, ToCanonical};
16
17impl CanonicalVTable<ChunkedVTable> for ChunkedVTable {
18    fn canonicalize(array: &ChunkedArray) -> Canonical {
19        if array.nchunks() == 0 {
20            return Canonical::empty(array.dtype());
21        }
22        if array.nchunks() == 1 {
23            return array.chunks()[0].to_canonical();
24        }
25
26        match array.dtype() {
27            DType::Struct(struct_dtype, _) => {
28                let struct_array = pack_struct_chunks(
29                    array.chunks(),
30                    Validity::copy_from_array(array.as_ref()),
31                    struct_dtype,
32                );
33                Canonical::Struct(struct_array)
34            }
35            DType::List(elem_dtype, _) => Canonical::List(swizzle_list_chunks(
36                array.chunks(),
37                Validity::copy_from_array(array.as_ref()),
38                elem_dtype,
39            )),
40            _ => {
41                let mut builder = builder_with_capacity(array.dtype(), array.len());
42                array.append_to_builder(builder.as_mut());
43                builder.finish_into_canonical()
44            }
45        }
46    }
47
48    fn append_to_builder(array: &ChunkedArray, builder: &mut dyn ArrayBuilder) {
49        for chunk in array.chunks() {
50            chunk.append_to_builder(builder);
51        }
52    }
53}
54
55/// Packs many [`StructArray`]s to instead be a single [`StructArray`], where the [`Array`] for each
56/// field is a [`ChunkedArray`].
57///
58/// The caller guarantees there are at least 2 chunks.
59fn pack_struct_chunks(
60    chunks: &[ArrayRef],
61    validity: Validity,
62    struct_dtype: &StructFields,
63) -> StructArray {
64    let len = chunks.iter().map(|chunk| chunk.len()).sum();
65    let mut field_arrays = Vec::new();
66
67    for (field_idx, field_dtype) in struct_dtype.fields().enumerate() {
68        let field_chunks = chunks
69            .iter()
70            .map(|c| {
71                c.to_struct()
72                    .fields()
73                    .get(field_idx)
74                    .vortex_expect("Invalid field index")
75                    .to_array()
76            })
77            .collect::<Vec<_>>();
78
79        // SAFETY: field_chunks are extracted from valid StructArrays with matching dtypes.
80        // Each chunk's field array is guaranteed to be valid for field_dtype.
81        let field_array = unsafe { ChunkedArray::new_unchecked(field_chunks, field_dtype.clone()) };
82        field_arrays.push(field_array.into_array());
83    }
84
85    // SAFETY: field_arrays are built from corresponding chunks of same length, dtypes match by
86    // construction.
87    unsafe { StructArray::new_unchecked(field_arrays, struct_dtype.clone(), len, validity) }
88}
89
90/// Packs [`ListViewArray`]s together into a chunked `ListViewArray`.
91///
92/// We use the existing arrays (chunks) to form a chunked array of `elements` (the child array).
93///
94/// The caller guarantees there are at least 2 chunks.
95fn swizzle_list_chunks(
96    chunks: &[ArrayRef],
97    validity: Validity,
98    elem_dtype: &DType,
99) -> ListViewArray {
100    let len: usize = chunks.iter().map(|c| c.len()).sum();
101
102    assert_eq!(
103        chunks[0]
104            .dtype()
105            .as_list_element_opt()
106            .vortex_expect("DType was somehow not a list")
107            .as_ref(),
108        elem_dtype
109    );
110
111    // Since each list array in `chunks` has offsets local to each array, we can reuse the existing
112    // array's child `elements` as the chunks and recompute offsets.
113
114    let mut list_elements_chunks = Vec::with_capacity(chunks.len());
115    let mut num_elements = 0;
116
117    // TODO(connor)[ListView]: We could potentially choose a smaller type here, but that would make
118    // this much more complicated.
119    // We (somewhat arbitrarily) choose `u64` for our offsets and sizes here. These can always be
120    // narrowed later by the compressor.
121    let mut offsets = BufferMut::<u64>::with_capacity(len);
122    let mut sizes = BufferMut::<u64>::with_capacity(len);
123
124    for chunk in chunks {
125        let chunk_array = chunk.to_listview();
126        // By rebuilding as zero-copy to `List` and trimming all elements (to prevent gaps), we make
127        // the final output `ListView` also zero-copyable to `List`.
128        let chunk_array = chunk_array.rebuild(ListViewRebuildMode::MakeExact);
129
130        // Add the `elements` of the current array as a new chunk.
131        list_elements_chunks.push(chunk_array.elements().clone());
132
133        // Cast offsets and sizes to `u64`.
134        let offsets_arr = cast(
135            chunk_array.offsets(),
136            &DType::Primitive(PType::U64, Nullability::NonNullable),
137        )
138        .vortex_expect("Must be able to fit array offsets in u64")
139        .to_primitive();
140
141        let sizes_arr = cast(
142            chunk_array.sizes(),
143            &DType::Primitive(PType::U64, Nullability::NonNullable),
144        )
145        .vortex_expect("Must be able to fit array offsets in u64")
146        .to_primitive();
147
148        let offsets_slice = offsets_arr.as_slice::<u64>();
149        let sizes_slice = sizes_arr.as_slice::<u64>();
150
151        // Append offsets and sizes, adjusting offsets to point into the combined array.
152        offsets.extend(offsets_slice.iter().map(|o| o + num_elements));
153        sizes.extend(sizes_slice);
154
155        num_elements += chunk_array.elements().len() as u64;
156    }
157
158    // SAFETY: elements are sliced from valid `ListViewArray`s (from `to_listview()`).
159    let chunked_elements =
160        unsafe { ChunkedArray::new_unchecked(list_elements_chunks, elem_dtype.clone()) }
161            .into_array();
162
163    let offsets = PrimitiveArray::new(offsets.freeze(), Validity::NonNullable).into_array();
164    let sizes = PrimitiveArray::new(sizes.freeze(), Validity::NonNullable).into_array();
165
166    // SAFETY:
167    // - `offsets` and `sizes` are non-nullable u64 arrays of the same length
168    // - Each `offset[i] + size[i]` list view is within bounds of elements array because it came
169    //   from valid chunks
170    // - Validity came from the outer chunked array so it must have the same length
171    // - Since we made sure that all chunks were zero-copyable to a list above, we know that the
172    //   final concatenated output is also zero-copyable to a list.
173    unsafe {
174        ListViewArray::new_unchecked(chunked_elements, offsets, sizes, validity)
175            .with_zero_copy_to_list(true)
176    }
177}
178
179#[cfg(test)]
180mod tests {
181    use std::sync::Arc;
182
183    use vortex_buffer::buffer;
184    use vortex_dtype::DType::{List, Primitive};
185    use vortex_dtype::Nullability::NonNullable;
186    use vortex_dtype::PType::I32;
187
188    use crate::accessor::ArrayAccessor;
189    use crate::arrays::{ChunkedArray, ListArray, StructArray, VarBinViewArray};
190    use crate::validity::Validity;
191    use crate::{IntoArray, ToCanonical};
192
193    #[test]
194    pub fn pack_nested_structs() {
195        let struct_array = StructArray::try_new(
196            ["a"].into(),
197            vec![VarBinViewArray::from_iter_str(["foo", "bar", "baz", "quak"]).into_array()],
198            4,
199            Validity::NonNullable,
200        )
201        .unwrap();
202        let dtype = struct_array.dtype().clone();
203        let chunked = ChunkedArray::try_new(
204            vec![
205                ChunkedArray::try_new(vec![struct_array.to_array()], dtype.clone())
206                    .unwrap()
207                    .into_array(),
208            ],
209            dtype,
210        )
211        .unwrap()
212        .into_array();
213        let canonical_struct = chunked.to_struct();
214        let canonical_varbin = canonical_struct.fields()[0].to_varbinview();
215        let original_varbin = struct_array.fields()[0].to_varbinview();
216        let orig_values = original_varbin
217            .with_iterator(|it| it.map(|a| a.map(|v| v.to_vec())).collect::<Vec<_>>());
218        let canon_values = canonical_varbin
219            .with_iterator(|it| it.map(|a| a.map(|v| v.to_vec())).collect::<Vec<_>>());
220        assert_eq!(orig_values, canon_values);
221    }
222
223    #[test]
224    pub fn pack_nested_lists() {
225        let l1 = ListArray::try_new(
226            buffer![1, 2, 3, 4].into_array(),
227            buffer![0, 3].into_array(),
228            Validity::NonNullable,
229        )
230        .unwrap();
231
232        let l2 = ListArray::try_new(
233            buffer![5, 6].into_array(),
234            buffer![0, 2].into_array(),
235            Validity::NonNullable,
236        )
237        .unwrap();
238
239        let chunked_list = ChunkedArray::try_new(
240            vec![l1.clone().into_array(), l2.clone().into_array()],
241            List(Arc::new(Primitive(I32, NonNullable)), NonNullable),
242        );
243
244        let canon_values = chunked_list.unwrap().to_listview();
245
246        assert_eq!(l1.scalar_at(0), canon_values.scalar_at(0));
247        assert_eq!(l2.scalar_at(0), canon_values.scalar_at(1));
248    }
249}