Skip to main content

vortex_array/arrays/chunked/vtable/
mod.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::hash::Hasher;
5
6use itertools::Itertools;
7use smallvec::SmallVec;
8use vortex_error::VortexExpect;
9use vortex_error::VortexResult;
10use vortex_error::vortex_bail;
11use vortex_error::vortex_ensure;
12use vortex_error::vortex_err;
13use vortex_error::vortex_panic;
14use vortex_session::VortexSession;
15use vortex_session::registry::CachedId;
16
17use crate::ArrayEq;
18use crate::ArrayHash;
19use crate::ArrayRef;
20use crate::Canonical;
21use crate::EqMode;
22use crate::ExecutionCtx;
23use crate::ExecutionResult;
24use crate::IntoArray;
25#[expect(deprecated)]
26use crate::ToCanonical as _;
27use crate::array::Array;
28use crate::array::ArrayId;
29use crate::array::ArrayParts;
30use crate::array::ArrayView;
31use crate::array::VTable;
32use crate::array::with_empty_buffers;
33use crate::arrays::chunked::ChunkedArrayExt;
34use crate::arrays::chunked::ChunkedData;
35use crate::arrays::chunked::array::CHUNK_OFFSETS_SLOT;
36use crate::arrays::chunked::array::CHUNKS_OFFSET;
37use crate::arrays::chunked::compute::rules::PARENT_RULES;
38use crate::arrays::chunked::vtable::canonical::_canonicalize;
39use crate::buffer::BufferHandle;
40use crate::builders::ArrayBuilder;
41use crate::dtype::DType;
42use crate::dtype::Nullability;
43use crate::dtype::PType;
44use crate::serde::ArrayChildren;
45mod canonical;
46mod operations;
47mod validity;
48
/// A [`Chunked`]-encoded Vortex array.
pub type ChunkedArray = Array<Chunked>;

/// Marker type for the chunked encoding.
///
/// It carries no fields; the [`VTable`] implementation in this module attaches the encoding's
/// behavior to it and registers it under the id `"vortex.chunked"`.
#[derive(Clone, Debug)]
pub struct Chunked;
54
/// Hashing of the chunked encoding's typed data, which intentionally contributes nothing.
impl ArrayHash for ChunkedData {
    /// Leaves `_state` untouched, whatever `_accuracy` is requested.
    fn array_hash<H: Hasher>(&self, _state: &mut H, _accuracy: EqMode) {
        // Chunk offsets are cached derived data. Slot 0 already stores the logical offsets array,
        // and ArrayData hashing includes every slot before TypedArrayData.
    }
}
61
/// Equality of the chunked encoding's typed data, which never distinguishes two values.
impl ArrayEq for ChunkedData {
    /// Always returns `true`, whatever `_other` and `_accuracy` are.
    fn array_eq(&self, _other: &Self, _accuracy: EqMode) -> bool {
        // Chunk offsets are cached derived data. Slot 0 already stores the logical offsets array,
        // and ArrayData equality compares every slot before TypedArrayData.
        true
    }
}
69
70impl VTable for Chunked {
71    type TypedArrayData = ChunkedData;
72
73    type OperationsVTable = Self;
74    type ValidityVTable = Self;
75    fn id(&self) -> ArrayId {
76        static ID: CachedId = CachedId::new("vortex.chunked");
77        *ID
78    }
79
80    fn validate(
81        &self,
82        data: &ChunkedData,
83        dtype: &DType,
84        len: usize,
85        slots: &[Option<ArrayRef>],
86    ) -> VortexResult<()> {
87        vortex_ensure!(
88            !slots.is_empty(),
89            "ChunkedArray must have at least a chunk offsets slot"
90        );
91        let chunk_offsets = slots[CHUNK_OFFSETS_SLOT]
92            .as_ref()
93            .vortex_expect("validated chunk offsets slot");
94        vortex_ensure!(
95            chunk_offsets.dtype() == &DType::Primitive(PType::U64, Nullability::NonNullable),
96            "ChunkedArray chunk offsets must be non-nullable u64, found {}",
97            chunk_offsets.dtype()
98        );
99        vortex_ensure!(
100            chunk_offsets.len() == data.chunk_offsets.len(),
101            "ChunkedArray chunk offsets slot length {} does not match cached offsets length {}",
102            chunk_offsets.len(),
103            data.chunk_offsets.len()
104        );
105        vortex_ensure!(
106            data.chunk_offsets.len() == slots.len() - CHUNKS_OFFSET + 1,
107            "ChunkedArray chunk offsets length {} does not match {} chunks",
108            data.chunk_offsets.len(),
109            slots.len() - CHUNKS_OFFSET
110        );
111        vortex_ensure!(
112            data.chunk_offsets
113                .last()
114                .copied()
115                .vortex_expect("chunked arrays always have a leading 0 offset")
116                == len,
117            "ChunkedArray length {} does not match outer length {}",
118            data.chunk_offsets.last().copied().unwrap_or_default(),
119            len
120        );
121        for (idx, (start, end)) in data
122            .chunk_offsets
123            .iter()
124            .copied()
125            .tuple_windows()
126            .enumerate()
127        {
128            let chunk = slots[CHUNKS_OFFSET + idx]
129                .as_ref()
130                .vortex_expect("validated chunk slot");
131            vortex_ensure!(
132                chunk.dtype() == dtype,
133                "ChunkedArray chunk dtype {} does not match outer dtype {}",
134                chunk.dtype(),
135                dtype
136            );
137            vortex_ensure!(
138                chunk.len() == end - start,
139                "ChunkedArray chunk {} len {} does not match offsets span {}",
140                idx,
141                chunk.len(),
142                end - start
143            );
144        }
145        Ok(())
146    }
147
148    fn nbuffers(_array: ArrayView<'_, Self>) -> usize {
149        0
150    }
151
152    fn buffer(_array: ArrayView<'_, Self>, idx: usize) -> BufferHandle {
153        vortex_panic!("ChunkedArray buffer index {idx} out of bounds")
154    }
155
156    fn buffer_name(_array: ArrayView<'_, Self>, idx: usize) -> Option<String> {
157        vortex_panic!("ChunkedArray buffer_name index {idx} out of bounds")
158    }
159
160    fn with_buffers(
161        &self,
162        array: ArrayView<'_, Self>,
163        buffers: &[BufferHandle],
164    ) -> VortexResult<ArrayParts<Self>> {
165        with_empty_buffers(self, array, buffers)
166    }
167
168    fn serialize(
169        _array: ArrayView<'_, Self>,
170        _session: &VortexSession,
171    ) -> VortexResult<Option<Vec<u8>>> {
172        Ok(Some(vec![]))
173    }
174
175    fn deserialize(
176        &self,
177        dtype: &DType,
178        len: usize,
179        metadata: &[u8],
180        _buffers: &[BufferHandle],
181        children: &dyn ArrayChildren,
182        _session: &VortexSession,
183    ) -> VortexResult<ArrayParts<Self>> {
184        if !metadata.is_empty() {
185            vortex_bail!(
186                "ChunkedArray expects empty metadata, got {} bytes",
187                metadata.len()
188            );
189        }
190        if children.is_empty() {
191            vortex_bail!("Chunked array needs at least one child");
192        }
193
194        let nchunks = children.len() - 1;
195        let chunk_offsets = children.get(
196            CHUNK_OFFSETS_SLOT,
197            &DType::Primitive(PType::U64, Nullability::NonNullable),
198            nchunks + 1,
199        )?;
200        #[expect(deprecated)]
201        let chunk_offsets_buf = chunk_offsets.to_primitive().to_buffer::<u64>();
202        let chunk_offsets_usize = chunk_offsets_buf
203            .iter()
204            .copied()
205            .map(|offset| {
206                usize::try_from(offset)
207                    .map_err(|_| vortex_err!("chunk offset {offset} exceeds usize range"))
208            })
209            .collect::<VortexResult<Vec<_>>>()?;
210        let mut slots = SmallVec::with_capacity(children.len());
211        slots.push(Some(chunk_offsets));
212        for (idx, (start, end)) in chunk_offsets_usize
213            .iter()
214            .copied()
215            .tuple_windows()
216            .enumerate()
217        {
218            let chunk_len = end - start;
219            slots.push(Some(children.get(idx + CHUNKS_OFFSET, dtype, chunk_len)?));
220        }
221
222        Ok(ArrayParts::new(
223            self.clone(),
224            dtype.clone(),
225            len,
226            ChunkedData::new(chunk_offsets_usize),
227        )
228        .with_slots(slots))
229    }
230
231    fn append_to_builder(
232        array: ArrayView<'_, Self>,
233        builder: &mut dyn ArrayBuilder,
234        ctx: &mut ExecutionCtx,
235    ) -> VortexResult<()> {
236        for chunk in array.iter_chunks() {
237            chunk.append_to_builder(builder, ctx)?;
238        }
239        Ok(())
240    }
241
242    fn slot_name(_array: ArrayView<'_, Self>, idx: usize) -> String {
243        match idx {
244            CHUNK_OFFSETS_SLOT => "chunk_offsets".to_string(),
245            n => format!("chunks[{}]", n - CHUNKS_OFFSET),
246        }
247    }
248
249    fn execute(array: Array<Self>, ctx: &mut ExecutionCtx) -> VortexResult<ExecutionResult> {
250        match array.dtype() {
251            // Struct, List, FixedSizeList, and Variant need child swizzling that the builder path
252            // cannot express.
253            DType::Struct(..) | DType::List(..) | DType::FixedSizeList(..) | DType::Variant(..) => {
254                // TODO(joe)[#7674]: iterative execution here too
255                Ok(ExecutionResult::done(_canonicalize(array.as_view(), ctx)?))
256            }
257            // For all other types, use the builder path via AppendChild.
258            _ => {
259                let slot_idx = array.next_builder_slot.max(CHUNKS_OFFSET);
260                if slot_idx < array.slots().len() {
261                    Ok(ExecutionResult::append_child(
262                        array.with_next_builder_slot(slot_idx + 1),
263                        slot_idx,
264                    ))
265                } else {
266                    Ok(ExecutionResult::done(
267                        Canonical::empty(array.dtype()).into_array(),
268                    ))
269                }
270            }
271        }
272    }
273
274    fn reduce(array: ArrayView<'_, Self>) -> VortexResult<Option<ArrayRef>> {
275        Ok(match array.nchunks() {
276            0 => Some(Canonical::empty(array.dtype()).into_array()),
277            1 => Some(array.chunk(0).clone()),
278            _ => None,
279        })
280    }
281
282    fn reduce_parent(
283        array: ArrayView<'_, Self>,
284        parent: &ArrayRef,
285        child_idx: usize,
286    ) -> VortexResult<Option<ArrayRef>> {
287        PARENT_RULES.evaluate(array, parent, child_idx)
288    }
289}