vortex_array/arrays/extension/mod.rs
1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::sync::Arc;
5
6use vortex_dtype::{DType, ExtDType, ExtID};
7use vortex_error::VortexResult;
8use vortex_scalar::Scalar;
9
10use crate::stats::{ArrayStats, StatsSetRef};
11use crate::vtable::{
12 ArrayVTable, CanonicalVTable, NotSupported, OperationsVTable, VTable, ValidityChild,
13 ValidityVTableFromChild, VisitorVTable,
14};
15use crate::{
16 Array, ArrayBufferVisitor, ArrayChildVisitor, ArrayRef, Canonical, EncodingId, EncodingRef,
17 IntoArray, vtable,
18};
19
20mod compute;
21mod serde;
22
23vtable!(Extension);
24
25impl VTable for ExtensionVTable {
26 type Array = ExtensionArray;
27 type Encoding = ExtensionEncoding;
28
29 type ArrayVTable = Self;
30 type CanonicalVTable = Self;
31 type OperationsVTable = Self;
32 type ValidityVTable = ValidityVTableFromChild;
33 type VisitorVTable = Self;
34 type ComputeVTable = NotSupported;
35 type EncodeVTable = NotSupported;
36 type PipelineVTable = NotSupported;
37 type SerdeVTable = Self;
38
39 fn id(_encoding: &Self::Encoding) -> EncodingId {
40 EncodingId::new_ref("vortex.ext")
41 }
42
43 fn encoding(_array: &Self::Array) -> EncodingRef {
44 EncodingRef::new_ref(ExtensionEncoding.as_ref())
45 }
46}
47
/// Marker type for the extension encoding; it carries no data.
#[derive(Debug, Clone)]
pub struct ExtensionEncoding;
50
51/// An extension array that wraps another array with additional type information.
52///
53/// **⚠️ Unstable API**: This is an experimental feature that may change significantly
54/// in future versions. The extension type system is still evolving.
55///
56/// Unlike Apache Arrow's extension arrays, Vortex extension arrays provide a more flexible
57/// mechanism for adding semantic meaning to existing array types without requiring
58/// changes to the core type system.
59///
60/// ## Design Philosophy
61///
62/// Extension arrays serve as a type-safe wrapper that:
63/// - Preserves the underlying storage format and operations
64/// - Adds semantic type information via `ExtDType`
65/// - Enables custom serialization and deserialization logic
66/// - Allows domain-specific interpretations of generic data
67///
68/// ## Storage and Type Relationship
69///
70/// The extension array maintains a strict contract:
71/// - **Storage array**: Contains the actual data in a standard Vortex encoding
72/// - **Extension type**: Defines how to interpret the storage data semantically
73/// - **Type safety**: The storage array's dtype must match the extension type's storage dtype
74///
75/// ## Use Cases
76///
77/// Extension arrays are ideal for:
78/// - **Custom numeric types**: Units of measurement, currencies
79/// - **Temporal types**: Custom date/time formats, time zones, calendars
80/// - **Domain-specific types**: UUIDs, IP addresses, geographic coordinates
81/// - **Encoded types**: Base64 strings, compressed data, encrypted values
82///
83/// ## Validity and Operations
84///
85/// Extension arrays delegate validity and most operations to their storage array:
86/// - Validity is inherited from the underlying storage
87/// - Slicing preserves the extension type
88/// - Scalar access wraps storage scalars with extension metadata
89///
90/// # Examples
91///
92/// ```
93/// use std::sync::Arc;
94/// use vortex_array::arrays::{ExtensionArray, PrimitiveArray};
95/// use vortex_dtype::{ExtDType, ExtID, DType, Nullability, PType};
96/// use vortex_array::validity::Validity;
97/// use vortex_array::IntoArray;
98/// use vortex_buffer::buffer;
99///
100/// // Define a custom extension type for representing currency values
101/// let currency_id = ExtID::from("example.currency");
102/// let currency_dtype = Arc::new(ExtDType::new(
103/// currency_id,
104/// Arc::new(DType::Primitive(PType::I64, Nullability::NonNullable)), // Storage as i64 cents
105/// None, // No additional metadata needed
106/// ));
107///
108/// // Create storage array with currency values in cents
109/// let cents_storage = PrimitiveArray::new(
110/// buffer![12345i64, 67890, 99999], // $123.45, $678.90, $999.99
111/// Validity::NonNullable
112/// );
113///
114/// // Wrap with extension type
115/// let currency_array = ExtensionArray::new(
116/// currency_dtype.clone(),
117/// cents_storage.into_array()
118/// );
119///
120/// assert_eq!(currency_array.len(), 3);
121/// assert_eq!(currency_array.id().as_ref(), "example.currency");
122///
123/// // Access maintains extension type information
124/// let first_value = currency_array.scalar_at(0);
125/// assert!(first_value.as_extension_opt().is_some());
126/// ```
127#[derive(Clone, Debug)]
128pub struct ExtensionArray {
129 dtype: DType,
130 storage: ArrayRef,
131 stats_set: ArrayStats,
132}
133
134impl ExtensionArray {
135 pub fn new(ext_dtype: Arc<ExtDType>, storage: ArrayRef) -> Self {
136 assert_eq!(
137 ext_dtype.storage_dtype(),
138 storage.dtype(),
139 "ExtensionArray: storage_dtype must match storage array DType",
140 );
141 Self {
142 dtype: DType::Extension(ext_dtype),
143 storage,
144 stats_set: ArrayStats::default(),
145 }
146 }
147
148 pub fn ext_dtype(&self) -> &Arc<ExtDType> {
149 let DType::Extension(ext) = &self.dtype else {
150 unreachable!("ExtensionArray: dtype must be an ExtDType")
151 };
152 ext
153 }
154
155 pub fn storage(&self) -> &ArrayRef {
156 &self.storage
157 }
158
159 #[allow(dead_code)]
160 #[inline]
161 pub fn id(&self) -> &ExtID {
162 self.ext_dtype().id()
163 }
164}
165
166impl ArrayVTable<ExtensionVTable> for ExtensionVTable {
167 fn len(array: &ExtensionArray) -> usize {
168 array.storage.len()
169 }
170
171 fn dtype(array: &ExtensionArray) -> &DType {
172 &array.dtype
173 }
174
175 fn stats(array: &ExtensionArray) -> StatsSetRef<'_> {
176 array.stats_set.to_ref(array.as_ref())
177 }
178}
179
180impl ValidityChild<ExtensionVTable> for ExtensionVTable {
181 fn validity_child(array: &ExtensionArray) -> &dyn Array {
182 array.storage.as_ref()
183 }
184}
185
186impl CanonicalVTable<ExtensionVTable> for ExtensionVTable {
187 fn canonicalize(array: &ExtensionArray) -> VortexResult<Canonical> {
188 Ok(Canonical::Extension(array.clone()))
189 }
190}
191
192impl OperationsVTable<ExtensionVTable> for ExtensionVTable {
193 fn slice(array: &ExtensionArray, start: usize, stop: usize) -> ArrayRef {
194 ExtensionArray::new(
195 array.ext_dtype().clone(),
196 array.storage().slice(start, stop),
197 )
198 .into_array()
199 }
200
201 fn scalar_at(array: &ExtensionArray, index: usize) -> Scalar {
202 Scalar::extension(array.ext_dtype().clone(), array.storage().scalar_at(index))
203 }
204}
205
206impl VisitorVTable<ExtensionVTable> for ExtensionVTable {
207 fn visit_buffers(_array: &ExtensionArray, _visitor: &mut dyn ArrayBufferVisitor) {}
208
209 fn visit_children(array: &ExtensionArray, visitor: &mut dyn ArrayChildVisitor) {
210 visitor.visit_child("storage", array.storage.as_ref());
211 }
212}