Skip to main content

vortex_array/vtable/
mod.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! This module contains the VTable definitions for a Vortex encoding.
5
6mod dyn_;
7mod operations;
8mod validity;
9
10use std::fmt::Debug;
11use std::hash::Hasher;
12use std::ops::Deref;
13
14pub use dyn_::*;
15pub use operations::*;
16pub use validity::*;
17use vortex_error::VortexExpect;
18use vortex_error::VortexResult;
19use vortex_error::vortex_panic;
20use vortex_session::VortexSession;
21
22use crate::Array;
23use crate::ArrayRef;
24use crate::Canonical;
25use crate::IntoArray;
26use crate::Precision;
27use crate::arrays::ConstantArray;
28use crate::buffer::BufferHandle;
29use crate::builders::ArrayBuilder;
30use crate::dtype::DType;
31use crate::executor::ExecutionCtx;
32use crate::patches::Patches;
33use crate::serde::ArrayChildren;
34use crate::stats::StatsSetRef;
35use crate::validity::Validity;
36
37/// The array [`VTable`] encapsulates logic for an Array type within Vortex.
38///
39/// The logic is split across several "VTable" traits to enable easier code organization than
40/// simply lumping everything into a single trait.
41///
42/// From this [`VTable`] trait, we derive implementations for the sealed [`Array`] and [`DynVTable`]
43/// traits.
44///
45/// The functions defined in these vtable traits will typically document their pre- and
46/// post-conditions. The pre-conditions are validated inside the [`Array`] and [`DynVTable`]
47/// implementations so do not need to be checked in the vtable implementations (for example, index
48/// out of bounds). Post-conditions are validated after invocation of the vtable function and will
49/// panic if violated.
50pub trait VTable: 'static + Sized + Send + Sync + Debug {
51    type Array: 'static + Send + Sync + Clone + Debug + Deref<Target = dyn Array> + IntoArray;
52    type Metadata: Debug;
53
54    type OperationsVTable: OperationsVTable<Self>;
55    type ValidityVTable: ValidityVTable<Self>;
56
57    /// Returns the ID of the array.
58    fn id(array: &Self::Array) -> ArrayId;
59
60    /// Returns the length of the array.
61    fn len(array: &Self::Array) -> usize;
62
63    /// Returns the DType of the array.
64    fn dtype(array: &Self::Array) -> &DType;
65
66    /// Returns the stats set for the array.
67    fn stats(array: &Self::Array) -> StatsSetRef<'_>;
68
69    /// Hashes the array contents.
70    fn array_hash<H: Hasher>(array: &Self::Array, state: &mut H, precision: Precision);
71
72    /// Compares two arrays of the same type for equality.
73    fn array_eq(array: &Self::Array, other: &Self::Array, precision: Precision) -> bool;
74
75    /// Returns the number of buffers in the array.
76    fn nbuffers(array: &Self::Array) -> usize;
77
78    /// Returns the buffer at the given index.
79    ///
80    /// # Panics
81    /// Panics if `idx >= nbuffers(array)`.
82    fn buffer(array: &Self::Array, idx: usize) -> BufferHandle;
83
84    /// Returns the name of the buffer at the given index, or `None` if unnamed.
85    fn buffer_name(array: &Self::Array, idx: usize) -> Option<String>;
86
87    /// Returns the number of children in the array.
88    fn nchildren(array: &Self::Array) -> usize;
89
90    /// Returns the child at the given index.
91    ///
92    /// # Panics
93    /// Panics if `idx >= nchildren(array)`.
94    fn child(array: &Self::Array, idx: usize) -> ArrayRef;
95
96    /// Returns the name of the child at the given index.
97    ///
98    /// # Panics
99    /// Panics if `idx >= nchildren(array)`.
100    fn child_name(array: &Self::Array, idx: usize) -> String;
101
102    /// Exports metadata for an array.
103    ///
104    /// * If the array does not contain metadata, it should return
105    ///   [`crate::metadata::EmptyMetadata`].
106    fn metadata(array: &Self::Array) -> VortexResult<Self::Metadata>;
107
108    /// Serialize metadata into a byte buffer for IPC or file storage.
109    /// Return `None` if the array cannot be serialized.
110    fn serialize(metadata: Self::Metadata) -> VortexResult<Option<Vec<u8>>>;
111
112    /// Deserialize array metadata from a byte buffer.
113    ///
114    /// To reduce the serialized form, arrays do not store their own DType and length. Instead,
115    /// this is passed down from the parent array during deserialization. These properties are
116    /// exposed here for use during deserialization.
117    fn deserialize(
118        bytes: &[u8],
119        _dtype: &DType,
120        _len: usize,
121        _buffers: &[BufferHandle],
122        _session: &VortexSession,
123    ) -> VortexResult<Self::Metadata>;
124
125    /// Writes the array into a canonical builder.
126    ///
127    /// ## Post-conditions
128    /// - The length of the builder is incremented by the length of the input array.
129    fn append_to_builder(
130        array: &Self::Array,
131        builder: &mut dyn ArrayBuilder,
132        ctx: &mut ExecutionCtx,
133    ) -> VortexResult<()> {
134        let canonical = array.to_array().execute::<Canonical>(ctx)?.into_array();
135        builder.extend_from_array(&canonical);
136        Ok(())
137    }
138
139    /// Build an array from components.
140    ///
141    /// This is called on the file and IPC deserialization pathways, to reconstruct the array from
142    /// type-erased components.
143    ///
144    /// Encoding implementers should take note that all validation necessary to ensure the encoding
145    /// is safe to read should happen inside of this method.
146    ///
147    /// # Safety and correctness
148    ///
149    /// This method should *never* panic, it must always return an error or else it returns a
150    /// valid `Array` that meets all the encoding's preconditions.
151    ///
152    /// For example, the `build` implementation for a dictionary encoding should ensure that all
153    /// codes lie in the valid range. For a UTF-8 array, it should check the bytes to ensure they
154    /// are all valid string data bytes. Any corrupt files or malformed data buffers should be
155    /// caught here, before returning the deserialized array.
156    ///
157    /// # Validation
158    ///
159    /// Validation is mainly meant to ensure that all internal pointers in the encoding reference
160    /// valid ranges of data, and that all data conforms to its DType constraints. These ensure
161    /// that no array operations will panic at runtime, or yield undefined behavior when unsafe
162    /// operations like `get_unchecked` use indices in the array buffer.
163    ///
164    /// Examples of the kinds of validation that should be part of the `build` step:
165    ///
166    /// * Checking that any offsets buffers point to valid offsets in some other child array
167    /// * Checking that any buffers for data or validity have the appropriate size for the
168    ///   encoding
169    /// * Running UTF-8 validation for any buffers that are expected to hold flat UTF-8 data
170    // TODO(ngates): take the parts by ownership, since most arrays need them anyway
171    fn build(
172        dtype: &DType,
173        len: usize,
174        metadata: &Self::Metadata,
175        buffers: &[BufferHandle],
176        children: &dyn ArrayChildren,
177    ) -> VortexResult<Self::Array>;
178
179    /// Replaces the children in `array` with `children`. The count must be the same and types
180    /// of children must be expected.
181    fn with_children(array: &mut Self::Array, children: Vec<ArrayRef>) -> VortexResult<()>;
182
183    /// Execute this array to produce an [`ArrayRef`].
184    ///
185    /// Array execution is designed such that repeated execution of an array will eventually
186    /// converge to a canonical representation. Implementations of this function should therefore
187    /// ensure they make progress towards that goal.
188    ///
189    /// This includes fully evaluating the array, such us decoding run-end encoding, or executing
190    /// one of the array's children and re-building the array with the executed child.
191    ///
192    /// It is recommended to only perform a single step of execution per call to this function,
193    /// such that surrounding arrays have an opportunity to perform their own parent reduction
194    /// or execution logic.
195    ///
196    /// The returned array must be logically equivalent to the input array. In other words, the
197    /// recursively canonicalized forms of both arrays must be equal.
198    ///
199    /// Debug builds will panic if the returned array is of the wrong type, wrong length, or
200    /// incorrectly contains null values.
201    ///
202    // TODO(ngates): in the future, we may pass a "target encoding hint" such that this array
203    //  can produce a more optimal representation for the parent. This could be used to preserve
204    //  varbin vs varbinview or list vs listview encodings when the parent knows it prefers
205    //  one representation over another, such as when exporting to a specific Arrow array.
206    fn execute(array: &Self::Array, ctx: &mut ExecutionCtx) -> VortexResult<ArrayRef>;
207
208    /// Attempt to execute the parent of this array.
209    ///
210    /// This function allows arrays to plug in specialized execution logic for their parent. For
211    /// example, strings compressed as FSST arrays can implement a custom equality comparison when
212    /// the comparing against a scalar string.
213    ///
214    /// Returns `Ok(None)` if no specialized execution is possible.
215    fn execute_parent(
216        array: &Self::Array,
217        parent: &ArrayRef,
218        child_idx: usize,
219        ctx: &mut ExecutionCtx,
220    ) -> VortexResult<Option<ArrayRef>> {
221        _ = (array, parent, child_idx, ctx);
222        Ok(None)
223    }
224
225    /// Attempt to reduce the array to a more simple representation.
226    ///
227    /// Returns `Ok(None)` if no reduction is possible.
228    fn reduce(array: &Self::Array) -> VortexResult<Option<ArrayRef>> {
229        _ = array;
230        Ok(None)
231    }
232
233    /// Attempt to perform a reduction of the parent of this array.
234    ///
235    /// This function allows arrays to plug in reduction rules to their parents, for example
236    /// run-end arrays can pull-down scalar functions and apply them only over their values.
237    ///
238    /// Returns `Ok(None)` if no reduction is possible.
239    fn reduce_parent(
240        array: &Self::Array,
241        parent: &ArrayRef,
242        child_idx: usize,
243    ) -> VortexResult<Option<ArrayRef>> {
244        _ = (array, parent, child_idx);
245        Ok(None)
246    }
247}
248
249/// Placeholder type used to indicate when a particular vtable is not supported by the encoding.
250pub struct NotSupported;
251
252/// Returns the validity as a child array if it produces one.
253///
254/// - `NonNullable` and `AllValid` produce no child (returns `None`)
255/// - `AllInvalid` produces a `ConstantArray` of `false` values
256/// - `Array` returns the validity array
257#[inline]
258pub fn validity_to_child(validity: &Validity, len: usize) -> Option<ArrayRef> {
259    match validity {
260        Validity::NonNullable | Validity::AllValid => None,
261        Validity::AllInvalid => Some(ConstantArray::new(false, len).into_array()),
262        Validity::Array(array) => Some(array.clone()),
263    }
264}
265
266/// Returns 1 if validity produces a child, 0 otherwise.
267#[inline]
268pub fn validity_nchildren(validity: &Validity) -> usize {
269    match validity {
270        Validity::NonNullable | Validity::AllValid => 0,
271        Validity::AllInvalid | Validity::Array(_) => 1,
272    }
273}
274
275/// Returns the number of children produced by patches.
276#[inline]
277pub fn patches_nchildren(patches: &Patches) -> usize {
278    2 + patches.chunk_offsets().is_some() as usize
279}
280
281/// Returns the child at the given index within a patches component.
282///
283/// Index 0 = patch_indices, 1 = patch_values, 2 = patch_chunk_offsets (if present).
284#[inline]
285pub fn patches_child(patches: &Patches, idx: usize) -> ArrayRef {
286    match idx {
287        0 => patches.indices().clone(),
288        1 => patches.values().clone(),
289        2 => patches
290            .chunk_offsets()
291            .as_ref()
292            .vortex_expect("patch_chunk_offsets child out of bounds")
293            .clone(),
294        _ => vortex_panic!("patches child index {idx} out of bounds"),
295    }
296}
297
298/// Returns the name of the child at the given index within a patches component.
299#[inline]
300pub fn patches_child_name(idx: usize) -> &'static str {
301    match idx {
302        0 => "patch_indices",
303        1 => "patch_values",
304        2 => "patch_chunk_offsets",
305        _ => vortex_panic!("patches child name index {idx} out of bounds"),
306    }
307}
308
309#[macro_export]
310macro_rules! vtable {
311    ($V:ident) => {
312        $crate::aliases::paste::paste! {
313            impl AsRef<dyn $crate::Array> for [<$V Array>] {
314                fn as_ref(&self) -> &dyn $crate::Array {
315                    // We can unsafe cast ourselves to an ArrayAdapter.
316                    unsafe { &*(self as *const [<$V Array>] as *const $crate::ArrayAdapter<[<$V VTable>]>) }
317                }
318            }
319
320            impl std::ops::Deref for [<$V Array>] {
321                type Target = dyn $crate::Array;
322
323                fn deref(&self) -> &Self::Target {
324                    // We can unsafe cast ourselves to an ArrayAdapter.
325                    unsafe { &*(self as *const [<$V Array>] as *const $crate::ArrayAdapter<[<$V VTable>]>) }
326                }
327            }
328
329            impl $crate::IntoArray for [<$V Array>] {
330                fn into_array(self) -> $crate::ArrayRef {
331                    // We can unsafe transmute ourselves to an ArrayAdapter.
332                    std::sync::Arc::new(unsafe { std::mem::transmute::<[<$V Array>], $crate::ArrayAdapter::<[<$V VTable>]>>(self) })
333                }
334            }
335
336            impl From<[<$V Array>]> for $crate::ArrayRef {
337                fn from(value: [<$V Array>]) -> $crate::ArrayRef {
338                    use $crate::IntoArray;
339                    value.into_array()
340                }
341            }
342        }
343    };
344}