vortex_array/vtable/mod.rs
1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! This module contains the VTable definitions for a Vortex encoding.
5
6mod array;
7mod dyn_;
8mod operations;
9mod validity;
10mod visitor;
11
12use std::fmt::Debug;
13use std::ops::Deref;
14
15pub use array::*;
16pub use dyn_::*;
17pub use operations::*;
18pub use validity::*;
19pub use visitor::*;
20use vortex_error::VortexResult;
21use vortex_session::VortexSession;
22
23use crate::Array;
24use crate::ArrayRef;
25use crate::Canonical;
26use crate::IntoArray;
27use crate::buffer::BufferHandle;
28use crate::builders::ArrayBuilder;
29use crate::dtype::DType;
30use crate::executor::ExecutionCtx;
31use crate::serde::ArrayChildren;
32
33/// The array [`VTable`] encapsulates logic for an Array type within Vortex.
34///
35/// The logic is split across several "VTable" traits to enable easier code organization than
36/// simply lumping everything into a single trait.
37///
38/// From this [`VTable`] trait, we derive implementations for the sealed [`Array`] and [`DynVTable`]
39/// traits.
40///
41/// The functions defined in these vtable traits will typically document their pre- and
42/// post-conditions. The pre-conditions are validated inside the [`Array`] and [`DynVTable`]
43/// implementations so do not need to be checked in the vtable implementations (for example, index
44/// out of bounds). Post-conditions are validated after invocation of the vtable function and will
45/// panic if violated.
46pub trait VTable: 'static + Sized + Send + Sync + Debug {
47 type Array: 'static + Send + Sync + Clone + Debug + Deref<Target = dyn Array> + IntoArray;
48 type Metadata: Debug;
49
50 type ArrayVTable: BaseArrayVTable<Self>;
51 type OperationsVTable: OperationsVTable<Self>;
52 type ValidityVTable: ValidityVTable<Self>;
53 type VisitorVTable: VisitorVTable<Self>;
54
55 /// Returns the ID of the array.
56 fn id(array: &Self::Array) -> ArrayId;
57
58 /// Exports metadata for an array.
59 ///
60 /// All other parts of the array are exported using the [`crate::vtable::VisitorVTable`].
61 ///
62 /// * If the array does not contain metadata, it should return
63 /// [`crate::metadata::EmptyMetadata`].
64 fn metadata(array: &Self::Array) -> VortexResult<Self::Metadata>;
65
66 /// Serialize metadata into a byte buffer for IPC or file storage.
67 /// Return `None` if the array cannot be serialized.
68 fn serialize(metadata: Self::Metadata) -> VortexResult<Option<Vec<u8>>>;
69
70 /// Deserialize array metadata from a byte buffer.
71 ///
72 /// To reduce the serialized form, arrays do not store their own DType and length. Instead,
73 /// this is passed down from the parent array during deserialization. These properties are
74 /// exposed here for use during deserialization.
75 fn deserialize(
76 bytes: &[u8],
77 _dtype: &DType,
78 _len: usize,
79 _buffers: &[BufferHandle],
80 _session: &VortexSession,
81 ) -> VortexResult<Self::Metadata>;
82
83 /// Writes the array into a canonical builder.
84 ///
85 /// ## Post-conditions
86 /// - The length of the builder is incremented by the length of the input array.
87 fn append_to_builder(
88 array: &Self::Array,
89 builder: &mut dyn ArrayBuilder,
90 ctx: &mut ExecutionCtx,
91 ) -> VortexResult<()> {
92 let canonical = array.to_array().execute::<Canonical>(ctx)?;
93 builder.extend_from_array(canonical.as_ref());
94 Ok(())
95 }
96
97 /// Build an array from components.
98 ///
99 /// This is called on the file and IPC deserialization pathways, to reconstruct the array from
100 /// type-erased components.
101 ///
102 /// Encoding implementers should take note that all validation necessary to ensure the encoding
103 /// is safe to read should happen inside of this method.
104 ///
105 /// # Safety and correctness
106 ///
107 /// This method should *never* panic, it must always return an error or else it returns a
108 /// valid `Array` that meets all the encoding's preconditions.
109 ///
110 /// For example, the `build` implementation for a dictionary encoding should ensure that all
111 /// codes lie in the valid range. For a UTF-8 array, it should check the bytes to ensure they
112 /// are all valid string data bytes. Any corrupt files or malformed data buffers should be
113 /// caught here, before returning the deserialized array.
114 ///
115 /// # Validation
116 ///
117 /// Validation is mainly meant to ensure that all internal pointers in the encoding reference
118 /// valid ranges of data, and that all data conforms to its DType constraints. These ensure
119 /// that no array operations will panic at runtime, or yield undefined behavior when unsafe
120 /// operations like `get_unchecked` use indices in the array buffer.
121 ///
122 /// Examples of the kinds of validation that should be part of the `build` step:
123 ///
124 /// * Checking that any offsets buffers point to valid offsets in some other child array
125 /// * Checking that any buffers for data or validity have the appropriate size for the
126 /// encoding
127 /// * Running UTF-8 validation for any buffers that are expected to hold flat UTF-8 data
128 // TODO(ngates): take the parts by ownership, since most arrays need them anyway
129 fn build(
130 dtype: &DType,
131 len: usize,
132 metadata: &Self::Metadata,
133 buffers: &[BufferHandle],
134 children: &dyn ArrayChildren,
135 ) -> VortexResult<Self::Array>;
136
137 /// Replaces the children in `array` with `children`. The count must be the same and types
138 /// of children must be expected.
139 fn with_children(array: &mut Self::Array, children: Vec<ArrayRef>) -> VortexResult<()>;
140
141 /// Execute this array to produce an [`ArrayRef`].
142 ///
143 /// Array execution is designed such that repeated execution of an array will eventually
144 /// converge to a canonical representation. Implementations of this function should therefore
145 /// ensure they make progress towards that goal.
146 ///
147 /// This includes fully evaluating the array, such us decoding run-end encoding, or executing
148 /// one of the array's children and re-building the array with the executed child.
149 ///
150 /// It is recommended to only perform a single step of execution per call to this function,
151 /// such that surrounding arrays have an opportunity to perform their own parent reduction
152 /// or execution logic.
153 ///
154 /// The returned array must be logically equivalent to the input array. In other words, the
155 /// recursively canonicalized forms of both arrays must be equal.
156 ///
157 /// Debug builds will panic if the returned array is of the wrong type, wrong length, or
158 /// incorrectly contains null values.
159 ///
160 // TODO(ngates): in the future, we may pass a "target encoding hint" such that this array
161 // can produce a more optimal representation for the parent. This could be used to preserve
162 // varbin vs varbinview or list vs listview encodings when the parent knows it prefers
163 // one representation over another, such as when exporting to a specific Arrow array.
164 fn execute(array: &Self::Array, ctx: &mut ExecutionCtx) -> VortexResult<ArrayRef>;
165
166 /// Attempt to execute the parent of this array.
167 ///
168 /// This function allows arrays to plug in specialized execution logic for their parent. For
169 /// example, strings compressed as FSST arrays can implement a custom equality comparison when
170 /// the comparing against a scalar string.
171 ///
172 /// Returns `Ok(None)` if no specialized execution is possible.
173 fn execute_parent(
174 array: &Self::Array,
175 parent: &ArrayRef,
176 child_idx: usize,
177 ctx: &mut ExecutionCtx,
178 ) -> VortexResult<Option<ArrayRef>> {
179 _ = (array, parent, child_idx, ctx);
180 Ok(None)
181 }
182
183 /// Attempt to reduce the array to a more simple representation.
184 ///
185 /// Returns `Ok(None)` if no reduction is possible.
186 fn reduce(array: &Self::Array) -> VortexResult<Option<ArrayRef>> {
187 _ = array;
188 Ok(None)
189 }
190
191 /// Attempt to perform a reduction of the parent of this array.
192 ///
193 /// This function allows arrays to plug in reduction rules to their parents, for example
194 /// run-end arrays can pull-down scalar functions and apply them only over their values.
195 ///
196 /// Returns `Ok(None)` if no reduction is possible.
197 fn reduce_parent(
198 array: &Self::Array,
199 parent: &ArrayRef,
200 child_idx: usize,
201 ) -> VortexResult<Option<ArrayRef>> {
202 _ = (array, parent, child_idx);
203 Ok(None)
204 }
205}
206
/// Marker type that an encoding uses in place of a vtable it does not support.
pub struct NotSupported;
209
/// Generates the conversion boilerplate for an encoding named `$V`.
///
/// Given `$V`, the macro refers to the types `<$V>Array` and `<$V>VTable` and implements, for
/// `<$V>Array`:
///
/// * `AsRef<dyn Array>`
/// * `Deref<Target = dyn Array>`
/// * `IntoArray`
/// * `From<<$V>Array> for ArrayRef`
///
/// Each of these views or converts the array as an `ArrayAdapter<<$V>VTable>`.
#[macro_export]
macro_rules! vtable {
    ($V:ident) => {
        $crate::aliases::paste::paste! {
            impl AsRef<dyn $crate::Array> for [<$V Array>] {
                fn as_ref(&self) -> &dyn $crate::Array {
                    let adapter = self as *const Self as *const $crate::ArrayAdapter<[<$V VTable>]>;
                    // SAFETY: relies on `ArrayAdapter<V>` having the same layout as the array
                    // type it wraps (presumably `#[repr(transparent)]` -- confirm at its
                    // definition). The pointer originates from `&self`, so it is non-null and
                    // valid for the returned borrow.
                    unsafe { &*adapter }
                }
            }

            impl std::ops::Deref for [<$V Array>] {
                type Target = dyn $crate::Array;

                fn deref(&self) -> &Self::Target {
                    let adapter = self as *const Self as *const $crate::ArrayAdapter<[<$V VTable>]>;
                    // SAFETY: same layout assumption as in `as_ref` above (presumably
                    // `#[repr(transparent)]` on `ArrayAdapter` -- confirm at its definition).
                    unsafe { &*adapter }
                }
            }

            impl $crate::IntoArray for [<$V Array>] {
                fn into_array(self) -> $crate::ArrayRef {
                    // SAFETY: relies on `ArrayAdapter<V>` and the array type having identical
                    // size and layout (presumably `#[repr(transparent)]` -- confirm at its
                    // definition); `transmute` itself enforces equal size at compile time.
                    let adapter = unsafe {
                        std::mem::transmute::<Self, $crate::ArrayAdapter<[<$V VTable>]>>(self)
                    };
                    std::sync::Arc::new(adapter)
                }
            }

            impl From<[<$V Array>]> for $crate::ArrayRef {
                fn from(value: [<$V Array>]) -> $crate::ArrayRef {
                    $crate::IntoArray::into_array(value)
                }
            }
        }
    };
}