vortex_array/vtable/mod.rs
1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! This module contains the VTable definitions for a Vortex encoding.
5
6mod dyn_;
7mod operations;
8mod validity;
9
10use std::fmt::Debug;
11use std::hash::Hasher;
12use std::ops::Deref;
13
14pub use dyn_::*;
15pub use operations::*;
16pub use validity::*;
17use vortex_error::VortexExpect;
18use vortex_error::VortexResult;
19use vortex_error::vortex_panic;
20use vortex_session::VortexSession;
21
22use crate::Array;
23use crate::ArrayRef;
24use crate::Canonical;
25use crate::IntoArray;
26use crate::Precision;
27use crate::arrays::ConstantArray;
28use crate::buffer::BufferHandle;
29use crate::builders::ArrayBuilder;
30use crate::dtype::DType;
31use crate::executor::ExecutionCtx;
32use crate::patches::Patches;
33use crate::serde::ArrayChildren;
34use crate::stats::StatsSetRef;
35use crate::validity::Validity;
36
37/// The array [`VTable`] encapsulates logic for an Array type within Vortex.
38///
39/// The logic is split across several "VTable" traits to enable easier code organization than
40/// simply lumping everything into a single trait.
41///
42/// From this [`VTable`] trait, we derive implementations for the sealed [`Array`] and [`DynVTable`]
43/// traits.
44///
45/// The functions defined in these vtable traits will typically document their pre- and
46/// post-conditions. The pre-conditions are validated inside the [`Array`] and [`DynVTable`]
47/// implementations so do not need to be checked in the vtable implementations (for example, index
48/// out of bounds). Post-conditions are validated after invocation of the vtable function and will
49/// panic if violated.
50pub trait VTable: 'static + Sized + Send + Sync + Debug {
51 type Array: 'static + Send + Sync + Clone + Debug + Deref<Target = dyn Array> + IntoArray;
52 type Metadata: Debug;
53
54 type OperationsVTable: OperationsVTable<Self>;
55 type ValidityVTable: ValidityVTable<Self>;
56
57 /// Returns the ID of the array.
58 fn id(array: &Self::Array) -> ArrayId;
59
60 /// Returns the length of the array.
61 fn len(array: &Self::Array) -> usize;
62
63 /// Returns the DType of the array.
64 fn dtype(array: &Self::Array) -> &DType;
65
66 /// Returns the stats set for the array.
67 fn stats(array: &Self::Array) -> StatsSetRef<'_>;
68
69 /// Hashes the array contents.
70 fn array_hash<H: Hasher>(array: &Self::Array, state: &mut H, precision: Precision);
71
72 /// Compares two arrays of the same type for equality.
73 fn array_eq(array: &Self::Array, other: &Self::Array, precision: Precision) -> bool;
74
75 /// Returns the number of buffers in the array.
76 fn nbuffers(array: &Self::Array) -> usize;
77
78 /// Returns the buffer at the given index.
79 ///
80 /// # Panics
81 /// Panics if `idx >= nbuffers(array)`.
82 fn buffer(array: &Self::Array, idx: usize) -> BufferHandle;
83
84 /// Returns the name of the buffer at the given index, or `None` if unnamed.
85 fn buffer_name(array: &Self::Array, idx: usize) -> Option<String>;
86
87 /// Returns the number of children in the array.
88 fn nchildren(array: &Self::Array) -> usize;
89
90 /// Returns the child at the given index.
91 ///
92 /// # Panics
93 /// Panics if `idx >= nchildren(array)`.
94 fn child(array: &Self::Array, idx: usize) -> ArrayRef;
95
96 /// Returns the name of the child at the given index.
97 ///
98 /// # Panics
99 /// Panics if `idx >= nchildren(array)`.
100 fn child_name(array: &Self::Array, idx: usize) -> String;
101
102 /// Exports metadata for an array.
103 ///
104 /// * If the array does not contain metadata, it should return
105 /// [`crate::metadata::EmptyMetadata`].
106 fn metadata(array: &Self::Array) -> VortexResult<Self::Metadata>;
107
108 /// Serialize metadata into a byte buffer for IPC or file storage.
109 /// Return `None` if the array cannot be serialized.
110 fn serialize(metadata: Self::Metadata) -> VortexResult<Option<Vec<u8>>>;
111
112 /// Deserialize array metadata from a byte buffer.
113 ///
114 /// To reduce the serialized form, arrays do not store their own DType and length. Instead,
115 /// this is passed down from the parent array during deserialization. These properties are
116 /// exposed here for use during deserialization.
117 fn deserialize(
118 bytes: &[u8],
119 _dtype: &DType,
120 _len: usize,
121 _buffers: &[BufferHandle],
122 _session: &VortexSession,
123 ) -> VortexResult<Self::Metadata>;
124
125 /// Writes the array into a canonical builder.
126 ///
127 /// ## Post-conditions
128 /// - The length of the builder is incremented by the length of the input array.
129 fn append_to_builder(
130 array: &Self::Array,
131 builder: &mut dyn ArrayBuilder,
132 ctx: &mut ExecutionCtx,
133 ) -> VortexResult<()> {
134 let canonical = array.to_array().execute::<Canonical>(ctx)?.into_array();
135 builder.extend_from_array(&canonical);
136 Ok(())
137 }
138
139 /// Build an array from components.
140 ///
141 /// This is called on the file and IPC deserialization pathways, to reconstruct the array from
142 /// type-erased components.
143 ///
144 /// Encoding implementers should take note that all validation necessary to ensure the encoding
145 /// is safe to read should happen inside of this method.
146 ///
147 /// # Safety and correctness
148 ///
149 /// This method should *never* panic, it must always return an error or else it returns a
150 /// valid `Array` that meets all the encoding's preconditions.
151 ///
152 /// For example, the `build` implementation for a dictionary encoding should ensure that all
153 /// codes lie in the valid range. For a UTF-8 array, it should check the bytes to ensure they
154 /// are all valid string data bytes. Any corrupt files or malformed data buffers should be
155 /// caught here, before returning the deserialized array.
156 ///
157 /// # Validation
158 ///
159 /// Validation is mainly meant to ensure that all internal pointers in the encoding reference
160 /// valid ranges of data, and that all data conforms to its DType constraints. These ensure
161 /// that no array operations will panic at runtime, or yield undefined behavior when unsafe
162 /// operations like `get_unchecked` use indices in the array buffer.
163 ///
164 /// Examples of the kinds of validation that should be part of the `build` step:
165 ///
166 /// * Checking that any offsets buffers point to valid offsets in some other child array
167 /// * Checking that any buffers for data or validity have the appropriate size for the
168 /// encoding
169 /// * Running UTF-8 validation for any buffers that are expected to hold flat UTF-8 data
170 // TODO(ngates): take the parts by ownership, since most arrays need them anyway
171 fn build(
172 dtype: &DType,
173 len: usize,
174 metadata: &Self::Metadata,
175 buffers: &[BufferHandle],
176 children: &dyn ArrayChildren,
177 ) -> VortexResult<Self::Array>;
178
179 /// Replaces the children in `array` with `children`. The count must be the same and types
180 /// of children must be expected.
181 fn with_children(array: &mut Self::Array, children: Vec<ArrayRef>) -> VortexResult<()>;
182
183 /// Execute this array to produce an [`ArrayRef`].
184 ///
185 /// Array execution is designed such that repeated execution of an array will eventually
186 /// converge to a canonical representation. Implementations of this function should therefore
187 /// ensure they make progress towards that goal.
188 ///
189 /// This includes fully evaluating the array, such us decoding run-end encoding, or executing
190 /// one of the array's children and re-building the array with the executed child.
191 ///
192 /// It is recommended to only perform a single step of execution per call to this function,
193 /// such that surrounding arrays have an opportunity to perform their own parent reduction
194 /// or execution logic.
195 ///
196 /// The returned array must be logically equivalent to the input array. In other words, the
197 /// recursively canonicalized forms of both arrays must be equal.
198 ///
199 /// Debug builds will panic if the returned array is of the wrong type, wrong length, or
200 /// incorrectly contains null values.
201 ///
202 // TODO(ngates): in the future, we may pass a "target encoding hint" such that this array
203 // can produce a more optimal representation for the parent. This could be used to preserve
204 // varbin vs varbinview or list vs listview encodings when the parent knows it prefers
205 // one representation over another, such as when exporting to a specific Arrow array.
206 fn execute(array: &Self::Array, ctx: &mut ExecutionCtx) -> VortexResult<ArrayRef>;
207
208 /// Attempt to execute the parent of this array.
209 ///
210 /// This function allows arrays to plug in specialized execution logic for their parent. For
211 /// example, strings compressed as FSST arrays can implement a custom equality comparison when
212 /// the comparing against a scalar string.
213 ///
214 /// Returns `Ok(None)` if no specialized execution is possible.
215 fn execute_parent(
216 array: &Self::Array,
217 parent: &ArrayRef,
218 child_idx: usize,
219 ctx: &mut ExecutionCtx,
220 ) -> VortexResult<Option<ArrayRef>> {
221 _ = (array, parent, child_idx, ctx);
222 Ok(None)
223 }
224
225 /// Attempt to reduce the array to a more simple representation.
226 ///
227 /// Returns `Ok(None)` if no reduction is possible.
228 fn reduce(array: &Self::Array) -> VortexResult<Option<ArrayRef>> {
229 _ = array;
230 Ok(None)
231 }
232
233 /// Attempt to perform a reduction of the parent of this array.
234 ///
235 /// This function allows arrays to plug in reduction rules to their parents, for example
236 /// run-end arrays can pull-down scalar functions and apply them only over their values.
237 ///
238 /// Returns `Ok(None)` if no reduction is possible.
239 fn reduce_parent(
240 array: &Self::Array,
241 parent: &ArrayRef,
242 child_idx: usize,
243 ) -> VortexResult<Option<ArrayRef>> {
244 _ = (array, parent, child_idx);
245 Ok(None)
246 }
247}
248
249/// Placeholder type used to indicate when a particular vtable is not supported by the encoding.
250pub struct NotSupported;
251
252/// Returns the validity as a child array if it produces one.
253///
254/// - `NonNullable` and `AllValid` produce no child (returns `None`)
255/// - `AllInvalid` produces a `ConstantArray` of `false` values
256/// - `Array` returns the validity array
257#[inline]
258pub fn validity_to_child(validity: &Validity, len: usize) -> Option<ArrayRef> {
259 match validity {
260 Validity::NonNullable | Validity::AllValid => None,
261 Validity::AllInvalid => Some(ConstantArray::new(false, len).into_array()),
262 Validity::Array(array) => Some(array.clone()),
263 }
264}
265
266/// Returns 1 if validity produces a child, 0 otherwise.
267#[inline]
268pub fn validity_nchildren(validity: &Validity) -> usize {
269 match validity {
270 Validity::NonNullable | Validity::AllValid => 0,
271 Validity::AllInvalid | Validity::Array(_) => 1,
272 }
273}
274
275/// Returns the number of children produced by patches.
276#[inline]
277pub fn patches_nchildren(patches: &Patches) -> usize {
278 2 + patches.chunk_offsets().is_some() as usize
279}
280
281/// Returns the child at the given index within a patches component.
282///
283/// Index 0 = patch_indices, 1 = patch_values, 2 = patch_chunk_offsets (if present).
284#[inline]
285pub fn patches_child(patches: &Patches, idx: usize) -> ArrayRef {
286 match idx {
287 0 => patches.indices().clone(),
288 1 => patches.values().clone(),
289 2 => patches
290 .chunk_offsets()
291 .as_ref()
292 .vortex_expect("patch_chunk_offsets child out of bounds")
293 .clone(),
294 _ => vortex_panic!("patches child index {idx} out of bounds"),
295 }
296}
297
298/// Returns the name of the child at the given index within a patches component.
299#[inline]
300pub fn patches_child_name(idx: usize) -> &'static str {
301 match idx {
302 0 => "patch_indices",
303 1 => "patch_values",
304 2 => "patch_chunk_offsets",
305 _ => vortex_panic!("patches child name index {idx} out of bounds"),
306 }
307}
308
309#[macro_export]
310macro_rules! vtable {
311 ($V:ident) => {
312 $crate::aliases::paste::paste! {
313 impl AsRef<dyn $crate::Array> for [<$V Array>] {
314 fn as_ref(&self) -> &dyn $crate::Array {
315 // We can unsafe cast ourselves to an ArrayAdapter.
316 unsafe { &*(self as *const [<$V Array>] as *const $crate::ArrayAdapter<[<$V VTable>]>) }
317 }
318 }
319
320 impl std::ops::Deref for [<$V Array>] {
321 type Target = dyn $crate::Array;
322
323 fn deref(&self) -> &Self::Target {
324 // We can unsafe cast ourselves to an ArrayAdapter.
325 unsafe { &*(self as *const [<$V Array>] as *const $crate::ArrayAdapter<[<$V VTable>]>) }
326 }
327 }
328
329 impl $crate::IntoArray for [<$V Array>] {
330 fn into_array(self) -> $crate::ArrayRef {
331 // We can unsafe transmute ourselves to an ArrayAdapter.
332 std::sync::Arc::new(unsafe { std::mem::transmute::<[<$V Array>], $crate::ArrayAdapter::<[<$V VTable>]>>(self) })
333 }
334 }
335
336 impl From<[<$V Array>]> for $crate::ArrayRef {
337 fn from(value: [<$V Array>]) -> $crate::ArrayRef {
338 use $crate::IntoArray;
339 value.into_array()
340 }
341 }
342 }
343 };
344}