vortex_array/arrays/extension/
mod.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::ops::Range;
5use std::sync::Arc;
6
7use vortex_dtype::{DType, ExtDType, ExtID};
8use vortex_error::VortexResult;
9use vortex_scalar::Scalar;
10
11use crate::stats::{ArrayStats, StatsSetRef};
12use crate::vtable::{
13    ArrayVTable, CanonicalVTable, NotSupported, OperationsVTable, VTable, ValidityChild,
14    ValidityVTableFromChild, VisitorVTable,
15};
16use crate::{
17    Array, ArrayBufferVisitor, ArrayChildVisitor, ArrayRef, Canonical, EncodingId, EncodingRef,
18    IntoArray, vtable,
19};
20
21mod compute;
22mod serde;
23
24vtable!(Extension);
25
26impl VTable for ExtensionVTable {
27    type Array = ExtensionArray;
28    type Encoding = ExtensionEncoding;
29
30    type ArrayVTable = Self;
31    type CanonicalVTable = Self;
32    type OperationsVTable = Self;
33    type ValidityVTable = ValidityVTableFromChild;
34    type VisitorVTable = Self;
35    type ComputeVTable = NotSupported;
36    type EncodeVTable = NotSupported;
37    type PipelineVTable = NotSupported;
38    type SerdeVTable = Self;
39
40    fn id(_encoding: &Self::Encoding) -> EncodingId {
41        EncodingId::new_ref("vortex.ext")
42    }
43
44    fn encoding(_array: &Self::Array) -> EncodingRef {
45        EncodingRef::new_ref(ExtensionEncoding.as_ref())
46    }
47}
48
/// Marker type for the extension encoding (`"vortex.ext"`); it carries no state.
#[derive(Debug, Clone)]
pub struct ExtensionEncoding;
51
52/// An extension array that wraps another array with additional type information.
53///
54/// **⚠️ Unstable API**: This is an experimental feature that may change significantly
55/// in future versions. The extension type system is still evolving.
56///
57/// Unlike Apache Arrow's extension arrays, Vortex extension arrays provide a more flexible
58/// mechanism for adding semantic meaning to existing array types without requiring
59/// changes to the core type system.
60///
61/// ## Design Philosophy
62///
63/// Extension arrays serve as a type-safe wrapper that:
64/// - Preserves the underlying storage format and operations
65/// - Adds semantic type information via `ExtDType`
66/// - Enables custom serialization and deserialization logic
67/// - Allows domain-specific interpretations of generic data
68///
69/// ## Storage and Type Relationship
70///
71/// The extension array maintains a strict contract:
72/// - **Storage array**: Contains the actual data in a standard Vortex encoding
73/// - **Extension type**: Defines how to interpret the storage data semantically
74/// - **Type safety**: The storage array's dtype must match the extension type's storage dtype
75///
76/// ## Use Cases
77///
78/// Extension arrays are ideal for:
79/// - **Custom numeric types**: Units of measurement, currencies
80/// - **Temporal types**: Custom date/time formats, time zones, calendars
81/// - **Domain-specific types**: UUIDs, IP addresses, geographic coordinates
82/// - **Encoded types**: Base64 strings, compressed data, encrypted values
83///
84/// ## Validity and Operations
85///
86/// Extension arrays delegate validity and most operations to their storage array:
87/// - Validity is inherited from the underlying storage
88/// - Slicing preserves the extension type
89/// - Scalar access wraps storage scalars with extension metadata
90///
91/// # Examples
92///
93/// ```
94/// use std::sync::Arc;
95/// use vortex_array::arrays::{ExtensionArray, PrimitiveArray};
96/// use vortex_dtype::{ExtDType, ExtID, DType, Nullability, PType};
97/// use vortex_array::validity::Validity;
98/// use vortex_array::IntoArray;
99/// use vortex_buffer::buffer;
100///
101/// // Define a custom extension type for representing currency values
102/// let currency_id = ExtID::from("example.currency");
103/// let currency_dtype = Arc::new(ExtDType::new(
104///     currency_id,
105///     Arc::new(DType::Primitive(PType::I64, Nullability::NonNullable)), // Storage as i64 cents
106///     None, // No additional metadata needed
107/// ));
108///
109/// // Create storage array with currency values in cents
110/// let cents_storage = PrimitiveArray::new(
111///     buffer![12345i64, 67890, 99999], // $123.45, $678.90, $999.99
112///     Validity::NonNullable
113/// );
114///
115/// // Wrap with extension type
116/// let currency_array = ExtensionArray::new(
117///     currency_dtype.clone(),
118///     cents_storage.into_array()
119/// );
120///
121/// assert_eq!(currency_array.len(), 3);
122/// assert_eq!(currency_array.id().as_ref(), "example.currency");
123///
124/// // Access maintains extension type information
125/// let first_value = currency_array.scalar_at(0);
126/// assert!(first_value.as_extension_opt().is_some());
127/// ```
128#[derive(Clone, Debug)]
129pub struct ExtensionArray {
130    dtype: DType,
131    storage: ArrayRef,
132    stats_set: ArrayStats,
133}
134
135impl ExtensionArray {
136    pub fn new(ext_dtype: Arc<ExtDType>, storage: ArrayRef) -> Self {
137        assert_eq!(
138            ext_dtype.storage_dtype(),
139            storage.dtype(),
140            "ExtensionArray: storage_dtype must match storage array DType",
141        );
142        Self {
143            dtype: DType::Extension(ext_dtype),
144            storage,
145            stats_set: ArrayStats::default(),
146        }
147    }
148
149    pub fn ext_dtype(&self) -> &Arc<ExtDType> {
150        let DType::Extension(ext) = &self.dtype else {
151            unreachable!("ExtensionArray: dtype must be an ExtDType")
152        };
153        ext
154    }
155
156    pub fn storage(&self) -> &ArrayRef {
157        &self.storage
158    }
159
160    #[allow(dead_code)]
161    #[inline]
162    pub fn id(&self) -> &ExtID {
163        self.ext_dtype().id()
164    }
165}
166
167impl ArrayVTable<ExtensionVTable> for ExtensionVTable {
168    fn len(array: &ExtensionArray) -> usize {
169        array.storage.len()
170    }
171
172    fn dtype(array: &ExtensionArray) -> &DType {
173        &array.dtype
174    }
175
176    fn stats(array: &ExtensionArray) -> StatsSetRef<'_> {
177        array.stats_set.to_ref(array.as_ref())
178    }
179}
180
181impl ValidityChild<ExtensionVTable> for ExtensionVTable {
182    fn validity_child(array: &ExtensionArray) -> &dyn Array {
183        array.storage.as_ref()
184    }
185}
186
187impl CanonicalVTable<ExtensionVTable> for ExtensionVTable {
188    fn canonicalize(array: &ExtensionArray) -> VortexResult<Canonical> {
189        Ok(Canonical::Extension(array.clone()))
190    }
191}
192
193impl OperationsVTable<ExtensionVTable> for ExtensionVTable {
194    fn slice(array: &ExtensionArray, range: Range<usize>) -> ArrayRef {
195        ExtensionArray::new(array.ext_dtype().clone(), array.storage().slice(range)).into_array()
196    }
197
198    fn scalar_at(array: &ExtensionArray, index: usize) -> Scalar {
199        Scalar::extension(array.ext_dtype().clone(), array.storage().scalar_at(index))
200    }
201}
202
203impl VisitorVTable<ExtensionVTable> for ExtensionVTable {
204    fn visit_buffers(_array: &ExtensionArray, _visitor: &mut dyn ArrayBufferVisitor) {}
205
206    fn visit_children(array: &ExtensionArray, visitor: &mut dyn ArrayChildVisitor) {
207        visitor.visit_child("storage", array.storage.as_ref());
208    }
209}