Skip to main content

vortex_array/vtable/
mod.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! This module contains the VTable definitions for a Vortex encoding.
5
6mod dyn_;
7mod operations;
8mod validity;
9
10use std::fmt::Debug;
11use std::hash::Hasher;
12use std::ops::Deref;
13use std::sync::Arc;
14
15pub use dyn_::*;
16pub use operations::*;
17pub use validity::*;
18use vortex_error::VortexExpect;
19use vortex_error::VortexResult;
20use vortex_error::vortex_panic;
21use vortex_session::VortexSession;
22
23use crate::ArrayRef;
24use crate::Canonical;
25use crate::DynArray;
26use crate::ExecutionResult;
27use crate::IntoArray;
28use crate::Precision;
29use crate::arrays::ConstantArray;
30use crate::buffer::BufferHandle;
31use crate::builders::ArrayBuilder;
32use crate::dtype::DType;
33use crate::executor::ExecutionCtx;
34use crate::patches::Patches;
35use crate::serde::ArrayChildren;
36use crate::stats::StatsSetRef;
37use crate::validity::Validity;
38
39/// The array [`VTable`] encapsulates logic for an Array type within Vortex.
40///
41/// The logic is split across several "VTable" traits to enable easier code organization than
42/// simply lumping everything into a single trait.
43///
44/// From this [`VTable`] trait, we derive implementations for the sealed [`DynArray`] and [`DynVTable`]
45/// traits.
46///
47/// The functions defined in these vtable traits will typically document their pre- and
48/// post-conditions. The pre-conditions are validated inside the [`DynArray`] and [`DynVTable`]
49/// implementations so do not need to be checked in the vtable implementations (for example, index
50/// out of bounds). Post-conditions are validated after invocation of the vtable function and will
51/// panic if violated.
52pub trait VTable: 'static + Clone + Sized + Send + Sync + Debug {
53    type Array: 'static + Send + Sync + Clone + Debug + Deref<Target = dyn DynArray> + IntoArray;
54    type Metadata: Debug;
55
56    type OperationsVTable: OperationsVTable<Self>;
57    type ValidityVTable: ValidityVTable<Self>;
58
59    /// Returns the VTable from the array instance.
60    ///
61    // NOTE(ngates): this function is temporary while we migrate Arrays over to the unified vtable
62    fn vtable(array: &Self::Array) -> &Self;
63
64    /// Returns the ID of the array.
65    fn id(&self) -> ArrayId;
66
67    /// Returns the length of the array.
68    fn len(array: &Self::Array) -> usize;
69
70    /// Returns the DType of the array.
71    fn dtype(array: &Self::Array) -> &DType;
72
73    /// Returns the stats set for the array.
74    fn stats(array: &Self::Array) -> StatsSetRef<'_>;
75
76    /// Hashes the array contents.
77    fn array_hash<H: Hasher>(array: &Self::Array, state: &mut H, precision: Precision);
78
79    /// Compares two arrays of the same type for equality.
80    fn array_eq(array: &Self::Array, other: &Self::Array, precision: Precision) -> bool;
81
82    /// Returns the number of buffers in the array.
83    fn nbuffers(array: &Self::Array) -> usize;
84
85    /// Returns the buffer at the given index.
86    ///
87    /// # Panics
88    /// Panics if `idx >= nbuffers(array)`.
89    fn buffer(array: &Self::Array, idx: usize) -> BufferHandle;
90
91    /// Returns the name of the buffer at the given index, or `None` if unnamed.
92    fn buffer_name(array: &Self::Array, idx: usize) -> Option<String>;
93
94    /// Returns the number of children in the array.
95    fn nchildren(array: &Self::Array) -> usize;
96
97    /// Returns the child at the given index.
98    ///
99    /// # Panics
100    /// Panics if `idx >= nchildren(array)`.
101    fn child(array: &Self::Array, idx: usize) -> ArrayRef;
102
103    /// Returns the name of the child at the given index.
104    ///
105    /// # Panics
106    /// Panics if `idx >= nchildren(array)`.
107    fn child_name(array: &Self::Array, idx: usize) -> String;
108
109    /// Exports metadata for an array.
110    ///
111    /// * If the array does not contain metadata, it should return
112    ///   [`crate::metadata::EmptyMetadata`].
113    fn metadata(array: &Self::Array) -> VortexResult<Self::Metadata>;
114
115    /// Serialize metadata into a byte buffer for IPC or file storage.
116    /// Return `None` if the array cannot be serialized.
117    fn serialize(metadata: Self::Metadata) -> VortexResult<Option<Vec<u8>>>;
118
119    /// Deserialize array metadata from a byte buffer.
120    ///
121    /// To reduce the serialized form, arrays do not store their own DType and length. Instead,
122    /// this is passed down from the parent array during deserialization. These properties are
123    /// exposed here for use during deserialization.
124    fn deserialize(
125        bytes: &[u8],
126        _dtype: &DType,
127        _len: usize,
128        _buffers: &[BufferHandle],
129        _session: &VortexSession,
130    ) -> VortexResult<Self::Metadata>;
131
132    /// Writes the array into a canonical builder.
133    ///
134    /// ## Post-conditions
135    /// - The length of the builder is incremented by the length of the input array.
136    fn append_to_builder(
137        array: &Self::Array,
138        builder: &mut dyn ArrayBuilder,
139        ctx: &mut ExecutionCtx,
140    ) -> VortexResult<()> {
141        let canonical = array.to_array().execute::<Canonical>(ctx)?.into_array();
142        builder.extend_from_array(&canonical);
143        Ok(())
144    }
145
146    /// Build an array from components.
147    ///
148    /// This is called on the file and IPC deserialization pathways, to reconstruct the array from
149    /// type-erased components.
150    ///
151    /// Encoding implementers should take note that all validation necessary to ensure the encoding
152    /// is safe to read should happen inside of this method.
153    ///
154    /// # Safety and correctness
155    ///
156    /// This method should *never* panic, it must always return an error or else it returns a
157    /// valid `Array` that meets all the encoding's preconditions.
158    ///
159    /// For example, the `build` implementation for a dictionary encoding should ensure that all
160    /// codes lie in the valid range. For a UTF-8 array, it should check the bytes to ensure they
161    /// are all valid string data bytes. Any corrupt files or malformed data buffers should be
162    /// caught here, before returning the deserialized array.
163    ///
164    /// # Validation
165    ///
166    /// Validation is mainly meant to ensure that all internal pointers in the encoding reference
167    /// valid ranges of data, and that all data conforms to its DType constraints. These ensure
168    /// that no array operations will panic at runtime, or yield undefined behavior when unsafe
169    /// operations like `get_unchecked` use indices in the array buffer.
170    ///
171    /// Examples of the kinds of validation that should be part of the `build` step:
172    ///
173    /// * Checking that any offsets buffers point to valid offsets in some other child array
174    /// * Checking that any buffers for data or validity have the appropriate size for the
175    ///   encoding
176    /// * Running UTF-8 validation for any buffers that are expected to hold flat UTF-8 data
177    // TODO(ngates): take the parts by ownership, since most arrays need them anyway
178    fn build(
179        dtype: &DType,
180        len: usize,
181        metadata: &Self::Metadata,
182        buffers: &[BufferHandle],
183        children: &dyn ArrayChildren,
184    ) -> VortexResult<Self::Array>;
185
186    /// Replaces the children in `array` with `children`. The count must be the same and types
187    /// of children must be expected.
188    fn with_children(array: &mut Self::Array, children: Vec<ArrayRef>) -> VortexResult<()>;
189
190    /// Execute this array by returning an [`ExecutionResult`] that tells the scheduler what to
191    /// do next.
192    ///
193    /// Instead of recursively executing children, implementations should return
194    /// [`ExecutionResult::execute_child`] to request that the scheduler execute a child first,
195    /// or [`ExecutionResult::done`] when the encoding can produce a result directly.
196    ///
197    /// Array execution is designed such that repeated execution of an array will eventually
198    /// converge to a canonical representation. Implementations of this function should therefore
199    /// ensure they make progress towards that goal.
200    ///
201    /// The returned array (in `Done`) must be logically equivalent to the input array. In other
202    /// words, the recursively canonicalized forms of both arrays must be equal.
203    ///
204    /// Debug builds will panic if the returned array is of the wrong type, wrong length, or
205    /// incorrectly contains null values.
206    fn execute(array: Arc<Self::Array>, ctx: &mut ExecutionCtx) -> VortexResult<ExecutionResult>;
207
208    /// Attempt to execute the parent of this array.
209    ///
210    /// This function allows arrays to plug in specialized execution logic for their parent. For
211    /// example, strings compressed as FSST arrays can implement a custom equality comparison when
212    /// the comparing against a scalar string.
213    ///
214    /// Returns `Ok(None)` if no specialized execution is possible.
215    fn execute_parent(
216        array: &Self::Array,
217        parent: &ArrayRef,
218        child_idx: usize,
219        ctx: &mut ExecutionCtx,
220    ) -> VortexResult<Option<ArrayRef>> {
221        _ = (array, parent, child_idx, ctx);
222        Ok(None)
223    }
224
225    /// Attempt to reduce the array to a more simple representation.
226    ///
227    /// Returns `Ok(None)` if no reduction is possible.
228    fn reduce(array: &Self::Array) -> VortexResult<Option<ArrayRef>> {
229        _ = array;
230        Ok(None)
231    }
232
233    /// Attempt to perform a reduction of the parent of this array.
234    ///
235    /// This function allows arrays to plug in reduction rules to their parents, for example
236    /// run-end arrays can pull-down scalar functions and apply them only over their values.
237    ///
238    /// Returns `Ok(None)` if no reduction is possible.
239    fn reduce_parent(
240        array: &Self::Array,
241        parent: &ArrayRef,
242        child_idx: usize,
243    ) -> VortexResult<Option<ArrayRef>> {
244        _ = (array, parent, child_idx);
245        Ok(None)
246    }
247}
248
249/// Placeholder type used to indicate when a particular vtable is not supported by the encoding.
250pub struct NotSupported;
251
252/// Returns the validity as a child array if it produces one.
253///
254/// - `NonNullable` and `AllValid` produce no child (returns `None`)
255/// - `AllInvalid` produces a `ConstantArray` of `false` values
256/// - `Array` returns the validity array
257#[inline]
258pub fn validity_to_child(validity: &Validity, len: usize) -> Option<ArrayRef> {
259    match validity {
260        Validity::NonNullable | Validity::AllValid => None,
261        Validity::AllInvalid => Some(ConstantArray::new(false, len).into_array()),
262        Validity::Array(array) => Some(array.clone()),
263    }
264}
265
266/// Returns 1 if validity produces a child, 0 otherwise.
267#[inline]
268pub fn validity_nchildren(validity: &Validity) -> usize {
269    match validity {
270        Validity::NonNullable | Validity::AllValid => 0,
271        Validity::AllInvalid | Validity::Array(_) => 1,
272    }
273}
274
275/// Returns the number of children produced by patches.
276#[inline]
277pub fn patches_nchildren(patches: &Patches) -> usize {
278    2 + patches.chunk_offsets().is_some() as usize
279}
280
281/// Returns the child at the given index within a patches component.
282///
283/// Index 0 = patch_indices, 1 = patch_values, 2 = patch_chunk_offsets (if present).
284#[inline]
285pub fn patches_child(patches: &Patches, idx: usize) -> ArrayRef {
286    match idx {
287        0 => patches.indices().clone(),
288        1 => patches.values().clone(),
289        2 => patches
290            .chunk_offsets()
291            .as_ref()
292            .vortex_expect("patch_chunk_offsets child out of bounds")
293            .clone(),
294        _ => vortex_panic!("patches child index {idx} out of bounds"),
295    }
296}
297
298/// Returns the name of the child at the given index within a patches component.
299#[inline]
300pub fn patches_child_name(idx: usize) -> &'static str {
301    match idx {
302        0 => "patch_indices",
303        1 => "patch_values",
304        2 => "patch_chunk_offsets",
305        _ => vortex_panic!("patches child name index {idx} out of bounds"),
306    }
307}
308
309#[macro_export]
310macro_rules! vtable {
311    ($V:ident) => {
312        $crate::vtable!($V, $V);
313    };
314    ($Base:ident, $VT:ident) => {
315        $crate::aliases::paste::paste! {
316            impl AsRef<dyn $crate::DynArray> for [<$Base Array>] {
317                fn as_ref(&self) -> &dyn $crate::DynArray {
318                    // We can unsafe cast ourselves to an ArrayAdapter.
319                    unsafe { &*(self as *const [<$Base Array>] as *const $crate::ArrayAdapter<$VT>) }
320                }
321            }
322
323            impl std::ops::Deref for [<$Base Array>] {
324                type Target = dyn $crate::DynArray;
325
326                fn deref(&self) -> &Self::Target {
327                    // We can unsafe cast ourselves to an ArrayAdapter.
328                    unsafe { &*(self as *const [<$Base Array>] as *const $crate::ArrayAdapter<$VT>) }
329                }
330            }
331
332            impl $crate::IntoArray for [<$Base Array>] {
333                fn into_array(self) -> $crate::ArrayRef {
334                    // We can unsafe transmute ourselves to an ArrayAdapter.
335                    std::sync::Arc::new(unsafe { std::mem::transmute::<[<$Base Array>], $crate::ArrayAdapter::<$VT>>(self) })
336                }
337            }
338
339            impl From<[<$Base Array>]> for $crate::ArrayRef {
340                fn from(value: [<$Base Array>]) -> $crate::ArrayRef {
341                    use $crate::IntoArray;
342                    value.into_array()
343                }
344            }
345
346            impl [<$Base Array>] {
347                #[deprecated(note = "use `.into_array()` (owned) or `.clone().into_array()` (ref) to make clones explicit")]
348                pub fn to_array(&self) -> $crate::ArrayRef {
349                    use $crate::IntoArray;
350                    self.clone().into_array()
351                }
352
353                /// Upcasts an `Arc<Self>` to an [`ArrayRef`] without cloning.
354                pub fn into_array_ref(self: std::sync::Arc<Self>) -> $crate::ArrayRef {
355                    // SAFETY: ArrayAdapter<V> is #[repr(transparent)] over V::Array,
356                    // so Arc<V::Array> and Arc<ArrayAdapter<V>> have identical layout.
357                    let raw = std::sync::Arc::into_raw(self) as *const $crate::ArrayAdapter<$VT>;
358                    unsafe { std::sync::Arc::from_raw(raw) }
359                }
360            }
361        }
362    };
363}