vortex_array/arrays/extension/mod.rs
1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::ops::Range;
5use std::sync::Arc;
6
7use vortex_dtype::{DType, ExtDType, ExtID};
8use vortex_error::VortexResult;
9use vortex_scalar::Scalar;
10
11use crate::stats::{ArrayStats, StatsSetRef};
12use crate::vtable::{
13 ArrayVTable, CanonicalVTable, NotSupported, OperationsVTable, VTable, ValidityChild,
14 ValidityVTableFromChild, VisitorVTable,
15};
16use crate::{
17 Array, ArrayBufferVisitor, ArrayChildVisitor, ArrayRef, Canonical, EncodingId, EncodingRef,
18 IntoArray, vtable,
19};
20
21mod compute;
22mod serde;
23
24vtable!(Extension);
25
26impl VTable for ExtensionVTable {
27 type Array = ExtensionArray;
28 type Encoding = ExtensionEncoding;
29
30 type ArrayVTable = Self;
31 type CanonicalVTable = Self;
32 type OperationsVTable = Self;
33 type ValidityVTable = ValidityVTableFromChild;
34 type VisitorVTable = Self;
35 type ComputeVTable = NotSupported;
36 type EncodeVTable = NotSupported;
37 type PipelineVTable = NotSupported;
38 type SerdeVTable = Self;
39
40 fn id(_encoding: &Self::Encoding) -> EncodingId {
41 EncodingId::new_ref("vortex.ext")
42 }
43
44 fn encoding(_array: &Self::Array) -> EncodingRef {
45 EncodingRef::new_ref(ExtensionEncoding.as_ref())
46 }
47}
48
/// Unit struct naming the extension encoding; it carries no state.
///
/// Used as the `Encoding` associated type of `ExtensionVTable`.
#[derive(Debug, Clone)]
pub struct ExtensionEncoding;
51
52/// An extension array that wraps another array with additional type information.
53///
54/// **⚠️ Unstable API**: This is an experimental feature that may change significantly
55/// in future versions. The extension type system is still evolving.
56///
57/// Unlike Apache Arrow's extension arrays, Vortex extension arrays provide a more flexible
58/// mechanism for adding semantic meaning to existing array types without requiring
59/// changes to the core type system.
60///
61/// ## Design Philosophy
62///
63/// Extension arrays serve as a type-safe wrapper that:
64/// - Preserves the underlying storage format and operations
65/// - Adds semantic type information via `ExtDType`
66/// - Enables custom serialization and deserialization logic
67/// - Allows domain-specific interpretations of generic data
68///
69/// ## Storage and Type Relationship
70///
71/// The extension array maintains a strict contract:
72/// - **Storage array**: Contains the actual data in a standard Vortex encoding
73/// - **Extension type**: Defines how to interpret the storage data semantically
74/// - **Type safety**: The storage array's dtype must match the extension type's storage dtype
75///
76/// ## Use Cases
77///
78/// Extension arrays are ideal for:
79/// - **Custom numeric types**: Units of measurement, currencies
80/// - **Temporal types**: Custom date/time formats, time zones, calendars
81/// - **Domain-specific types**: UUIDs, IP addresses, geographic coordinates
82/// - **Encoded types**: Base64 strings, compressed data, encrypted values
83///
84/// ## Validity and Operations
85///
86/// Extension arrays delegate validity and most operations to their storage array:
87/// - Validity is inherited from the underlying storage
88/// - Slicing preserves the extension type
89/// - Scalar access wraps storage scalars with extension metadata
90///
91/// # Examples
92///
93/// ```
94/// use std::sync::Arc;
95/// use vortex_array::arrays::{ExtensionArray, PrimitiveArray};
96/// use vortex_dtype::{ExtDType, ExtID, DType, Nullability, PType};
97/// use vortex_array::validity::Validity;
98/// use vortex_array::IntoArray;
99/// use vortex_buffer::buffer;
100///
101/// // Define a custom extension type for representing currency values
102/// let currency_id = ExtID::from("example.currency");
103/// let currency_dtype = Arc::new(ExtDType::new(
104/// currency_id,
105/// Arc::new(DType::Primitive(PType::I64, Nullability::NonNullable)), // Storage as i64 cents
106/// None, // No additional metadata needed
107/// ));
108///
109/// // Create storage array with currency values in cents
110/// let cents_storage = PrimitiveArray::new(
111/// buffer![12345i64, 67890, 99999], // $123.45, $678.90, $999.99
112/// Validity::NonNullable
113/// );
114///
115/// // Wrap with extension type
116/// let currency_array = ExtensionArray::new(
117/// currency_dtype.clone(),
118/// cents_storage.into_array()
119/// );
120///
121/// assert_eq!(currency_array.len(), 3);
122/// assert_eq!(currency_array.id().as_ref(), "example.currency");
123///
124/// // Access maintains extension type information
125/// let first_value = currency_array.scalar_at(0);
126/// assert!(first_value.as_extension_opt().is_some());
127/// ```
128#[derive(Clone, Debug)]
129pub struct ExtensionArray {
130 dtype: DType,
131 storage: ArrayRef,
132 stats_set: ArrayStats,
133}
134
135impl ExtensionArray {
136 pub fn new(ext_dtype: Arc<ExtDType>, storage: ArrayRef) -> Self {
137 assert_eq!(
138 ext_dtype.storage_dtype(),
139 storage.dtype(),
140 "ExtensionArray: storage_dtype must match storage array DType",
141 );
142 Self {
143 dtype: DType::Extension(ext_dtype),
144 storage,
145 stats_set: ArrayStats::default(),
146 }
147 }
148
149 pub fn ext_dtype(&self) -> &Arc<ExtDType> {
150 let DType::Extension(ext) = &self.dtype else {
151 unreachable!("ExtensionArray: dtype must be an ExtDType")
152 };
153 ext
154 }
155
156 pub fn storage(&self) -> &ArrayRef {
157 &self.storage
158 }
159
160 #[allow(dead_code)]
161 #[inline]
162 pub fn id(&self) -> &ExtID {
163 self.ext_dtype().id()
164 }
165}
166
167impl ArrayVTable<ExtensionVTable> for ExtensionVTable {
168 fn len(array: &ExtensionArray) -> usize {
169 array.storage.len()
170 }
171
172 fn dtype(array: &ExtensionArray) -> &DType {
173 &array.dtype
174 }
175
176 fn stats(array: &ExtensionArray) -> StatsSetRef<'_> {
177 array.stats_set.to_ref(array.as_ref())
178 }
179}
180
181impl ValidityChild<ExtensionVTable> for ExtensionVTable {
182 fn validity_child(array: &ExtensionArray) -> &dyn Array {
183 array.storage.as_ref()
184 }
185}
186
187impl CanonicalVTable<ExtensionVTable> for ExtensionVTable {
188 fn canonicalize(array: &ExtensionArray) -> VortexResult<Canonical> {
189 Ok(Canonical::Extension(array.clone()))
190 }
191}
192
193impl OperationsVTable<ExtensionVTable> for ExtensionVTable {
194 fn slice(array: &ExtensionArray, range: Range<usize>) -> ArrayRef {
195 ExtensionArray::new(array.ext_dtype().clone(), array.storage().slice(range)).into_array()
196 }
197
198 fn scalar_at(array: &ExtensionArray, index: usize) -> Scalar {
199 Scalar::extension(array.ext_dtype().clone(), array.storage().scalar_at(index))
200 }
201}
202
203impl VisitorVTable<ExtensionVTable> for ExtensionVTable {
204 fn visit_buffers(_array: &ExtensionArray, _visitor: &mut dyn ArrayBufferVisitor) {}
205
206 fn visit_children(array: &ExtensionArray, visitor: &mut dyn ArrayChildVisitor) {
207 visitor.visit_child("storage", array.storage.as_ref());
208 }
209}