vortex_array/arrays/extension/mod.rs
1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::sync::Arc;
5
6use vortex_dtype::{DType, ExtDType, ExtID};
7use vortex_error::VortexResult;
8use vortex_scalar::Scalar;
9
10use crate::stats::{ArrayStats, StatsSetRef};
11use crate::vtable::{
12 ArrayVTable, CanonicalVTable, NotSupported, OperationsVTable, VTable, ValidityChild,
13 ValidityVTableFromChild, VisitorVTable,
14};
15use crate::{
16 Array, ArrayBufferVisitor, ArrayChildVisitor, ArrayRef, Canonical, EncodingId, EncodingRef,
17 IntoArray, vtable,
18};
19
20mod compute;
21mod serde;
22
23vtable!(Extension);
24
25impl VTable for ExtensionVTable {
26 type Array = ExtensionArray;
27 type Encoding = ExtensionEncoding;
28
29 type ArrayVTable = Self;
30 type CanonicalVTable = Self;
31 type OperationsVTable = Self;
32 type ValidityVTable = ValidityVTableFromChild;
33 type VisitorVTable = Self;
34 type ComputeVTable = NotSupported;
35 type EncodeVTable = NotSupported;
36 type SerdeVTable = Self;
37
38 fn id(_encoding: &Self::Encoding) -> EncodingId {
39 EncodingId::new_ref("vortex.ext")
40 }
41
42 fn encoding(_array: &Self::Array) -> EncodingRef {
43 EncodingRef::new_ref(ExtensionEncoding.as_ref())
44 }
45}
46
/// Stateless marker type naming the encoding of [`ExtensionArray`].
///
/// It is the `Encoding` associated type of `ExtensionVTable`, whose id is `"vortex.ext"`.
#[derive(Debug, Clone)]
pub struct ExtensionEncoding;
49
50/// An extension array that wraps another array with additional type information.
51///
52/// **⚠️ Unstable API**: This is an experimental feature that may change significantly
53/// in future versions. The extension type system is still evolving.
54///
55/// Unlike Apache Arrow's extension arrays, Vortex extension arrays provide a more flexible
56/// mechanism for adding semantic meaning to existing array types without requiring
57/// changes to the core type system.
58///
59/// ## Design Philosophy
60///
61/// Extension arrays serve as a type-safe wrapper that:
62/// - Preserves the underlying storage format and operations
63/// - Adds semantic type information via `ExtDType`
64/// - Enables custom serialization and deserialization logic
65/// - Allows domain-specific interpretations of generic data
66///
67/// ## Storage and Type Relationship
68///
69/// The extension array maintains a strict contract:
70/// - **Storage array**: Contains the actual data in a standard Vortex encoding
71/// - **Extension type**: Defines how to interpret the storage data semantically
72/// - **Type safety**: The storage array's dtype must match the extension type's storage dtype
73///
74/// ## Use Cases
75///
76/// Extension arrays are ideal for:
77/// - **Custom numeric types**: Units of measurement, currencies
78/// - **Temporal types**: Custom date/time formats, time zones, calendars
79/// - **Domain-specific types**: UUIDs, IP addresses, geographic coordinates
80/// - **Encoded types**: Base64 strings, compressed data, encrypted values
81///
82/// ## Validity and Operations
83///
84/// Extension arrays delegate validity and most operations to their storage array:
85/// - Validity is inherited from the underlying storage
86/// - Slicing preserves the extension type
87/// - Scalar access wraps storage scalars with extension metadata
88///
89/// # Examples
90///
91/// ```
92/// use std::sync::Arc;
93/// use vortex_array::arrays::{ExtensionArray, PrimitiveArray};
94/// use vortex_dtype::{ExtDType, ExtID, DType, Nullability, PType};
95/// use vortex_array::validity::Validity;
96/// use vortex_array::IntoArray;
97/// use vortex_buffer::buffer;
98///
99/// // Define a custom extension type for representing currency values
100/// let currency_id = ExtID::from("example.currency");
101/// let currency_dtype = Arc::new(ExtDType::new(
102/// currency_id,
103/// Arc::new(DType::Primitive(PType::I64, Nullability::NonNullable)), // Storage as i64 cents
104/// None, // No additional metadata needed
105/// ));
106///
107/// // Create storage array with currency values in cents
108/// let cents_storage = PrimitiveArray::new(
109/// buffer![12345i64, 67890, 99999], // $123.45, $678.90, $999.99
110/// Validity::NonNullable
111/// );
112///
113/// // Wrap with extension type
114/// let currency_array = ExtensionArray::new(
115/// currency_dtype.clone(),
116/// cents_storage.into_array()
117/// );
118///
119/// assert_eq!(currency_array.len(), 3);
120/// assert_eq!(currency_array.id().as_ref(), "example.currency");
121///
122/// // Access maintains extension type information
123/// let first_value = currency_array.scalar_at(0).unwrap();
124/// assert!(first_value.as_extension_opt().is_some());
125/// ```
126#[derive(Clone, Debug)]
127pub struct ExtensionArray {
128 dtype: DType,
129 storage: ArrayRef,
130 stats_set: ArrayStats,
131}
132
133impl ExtensionArray {
134 pub fn new(ext_dtype: Arc<ExtDType>, storage: ArrayRef) -> Self {
135 assert_eq!(
136 ext_dtype.storage_dtype(),
137 storage.dtype(),
138 "ExtensionArray: storage_dtype must match storage array DType",
139 );
140 Self {
141 dtype: DType::Extension(ext_dtype),
142 storage,
143 stats_set: ArrayStats::default(),
144 }
145 }
146
147 pub fn ext_dtype(&self) -> &Arc<ExtDType> {
148 let DType::Extension(ext) = &self.dtype else {
149 unreachable!("ExtensionArray: dtype must be an ExtDType")
150 };
151 ext
152 }
153
154 pub fn storage(&self) -> &ArrayRef {
155 &self.storage
156 }
157
158 #[allow(dead_code)]
159 #[inline]
160 pub fn id(&self) -> &ExtID {
161 self.ext_dtype().id()
162 }
163}
164
165impl ArrayVTable<ExtensionVTable> for ExtensionVTable {
166 fn len(array: &ExtensionArray) -> usize {
167 array.storage.len()
168 }
169
170 fn dtype(array: &ExtensionArray) -> &DType {
171 &array.dtype
172 }
173
174 fn stats(array: &ExtensionArray) -> StatsSetRef<'_> {
175 array.stats_set.to_ref(array.as_ref())
176 }
177}
178
179impl ValidityChild<ExtensionVTable> for ExtensionVTable {
180 fn validity_child(array: &ExtensionArray) -> &dyn Array {
181 array.storage.as_ref()
182 }
183}
184
185impl CanonicalVTable<ExtensionVTable> for ExtensionVTable {
186 fn canonicalize(array: &ExtensionArray) -> VortexResult<Canonical> {
187 Ok(Canonical::Extension(array.clone()))
188 }
189}
190
191impl OperationsVTable<ExtensionVTable> for ExtensionVTable {
192 fn slice(array: &ExtensionArray, start: usize, stop: usize) -> VortexResult<ArrayRef> {
193 Ok(ExtensionArray::new(
194 array.ext_dtype().clone(),
195 array.storage().slice(start, stop)?,
196 )
197 .into_array())
198 }
199
200 fn scalar_at(array: &ExtensionArray, index: usize) -> VortexResult<Scalar> {
201 Ok(Scalar::extension(
202 array.ext_dtype().clone(),
203 array.storage().scalar_at(index)?,
204 ))
205 }
206}
207
208impl VisitorVTable<ExtensionVTable> for ExtensionVTable {
209 fn visit_buffers(_array: &ExtensionArray, _visitor: &mut dyn ArrayBufferVisitor) {}
210
211 fn visit_children(array: &ExtensionArray, visitor: &mut dyn ArrayChildVisitor) {
212 visitor.visit_child("storage", array.storage.as_ref());
213 }
214}