vortex_array/arrays/extension/
mod.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::sync::Arc;
5
6use vortex_dtype::{DType, ExtDType, ExtID};
7use vortex_error::VortexResult;
8use vortex_scalar::Scalar;
9
10use crate::stats::{ArrayStats, StatsSetRef};
11use crate::vtable::{
12    ArrayVTable, CanonicalVTable, NotSupported, OperationsVTable, VTable, ValidityChild,
13    ValidityVTableFromChild, VisitorVTable,
14};
15use crate::{
16    Array, ArrayBufferVisitor, ArrayChildVisitor, ArrayRef, Canonical, EncodingId, EncodingRef,
17    IntoArray, vtable,
18};
19
20mod compute;
21mod serde;
22
23vtable!(Extension);
24
25impl VTable for ExtensionVTable {
26    type Array = ExtensionArray;
27    type Encoding = ExtensionEncoding;
28
29    type ArrayVTable = Self;
30    type CanonicalVTable = Self;
31    type OperationsVTable = Self;
32    type ValidityVTable = ValidityVTableFromChild;
33    type VisitorVTable = Self;
34    type ComputeVTable = NotSupported;
35    type EncodeVTable = NotSupported;
36    type SerdeVTable = Self;
37
38    fn id(_encoding: &Self::Encoding) -> EncodingId {
39        EncodingId::new_ref("vortex.ext")
40    }
41
42    fn encoding(_array: &Self::Array) -> EncodingRef {
43        EncodingRef::new_ref(ExtensionEncoding.as_ref())
44    }
45}
46
/// Marker type for the extension encoding; it is a unit struct and carries no state.
#[derive(Debug, Clone)]
pub struct ExtensionEncoding;
49
50/// An extension array that wraps another array with additional type information.
51///
52/// **⚠️ Unstable API**: This is an experimental feature that may change significantly
53/// in future versions. The extension type system is still evolving.
54///
55/// Unlike Apache Arrow's extension arrays, Vortex extension arrays provide a more flexible
56/// mechanism for adding semantic meaning to existing array types without requiring
57/// changes to the core type system.
58///
59/// ## Design Philosophy
60///
61/// Extension arrays serve as a type-safe wrapper that:
62/// - Preserves the underlying storage format and operations
63/// - Adds semantic type information via `ExtDType`
64/// - Enables custom serialization and deserialization logic
65/// - Allows domain-specific interpretations of generic data
66///
67/// ## Storage and Type Relationship
68///
69/// The extension array maintains a strict contract:
70/// - **Storage array**: Contains the actual data in a standard Vortex encoding
71/// - **Extension type**: Defines how to interpret the storage data semantically
72/// - **Type safety**: The storage array's dtype must match the extension type's storage dtype
73///
74/// ## Use Cases
75///
76/// Extension arrays are ideal for:
77/// - **Custom numeric types**: Units of measurement, currencies
78/// - **Temporal types**: Custom date/time formats, time zones, calendars
79/// - **Domain-specific types**: UUIDs, IP addresses, geographic coordinates
80/// - **Encoded types**: Base64 strings, compressed data, encrypted values
81///
82/// ## Validity and Operations
83///
84/// Extension arrays delegate validity and most operations to their storage array:
85/// - Validity is inherited from the underlying storage
86/// - Slicing preserves the extension type
87/// - Scalar access wraps storage scalars with extension metadata
88///
89/// # Examples
90///
91/// ```
92/// use std::sync::Arc;
93/// use vortex_array::arrays::{ExtensionArray, PrimitiveArray};
94/// use vortex_dtype::{ExtDType, ExtID, DType, Nullability, PType};
95/// use vortex_array::validity::Validity;
96/// use vortex_array::IntoArray;
97/// use vortex_buffer::buffer;
98///
99/// // Define a custom extension type for representing currency values
100/// let currency_id = ExtID::from("example.currency");
101/// let currency_dtype = Arc::new(ExtDType::new(
102///     currency_id,
103///     Arc::new(DType::Primitive(PType::I64, Nullability::NonNullable)), // Storage as i64 cents
104///     None, // No additional metadata needed
105/// ));
106///
107/// // Create storage array with currency values in cents
108/// let cents_storage = PrimitiveArray::new(
109///     buffer![12345i64, 67890, 99999], // $123.45, $678.90, $999.99
110///     Validity::NonNullable
111/// );
112///
113/// // Wrap with extension type
114/// let currency_array = ExtensionArray::new(
115///     currency_dtype.clone(),
116///     cents_storage.into_array()
117/// );
118///
119/// assert_eq!(currency_array.len(), 3);
120/// assert_eq!(currency_array.id().as_ref(), "example.currency");
121///
122/// // Access maintains extension type information
123/// let first_value = currency_array.scalar_at(0).unwrap();
124/// assert!(first_value.as_extension_opt().is_some());
125/// ```
126#[derive(Clone, Debug)]
127pub struct ExtensionArray {
128    dtype: DType,
129    storage: ArrayRef,
130    stats_set: ArrayStats,
131}
132
133impl ExtensionArray {
134    pub fn new(ext_dtype: Arc<ExtDType>, storage: ArrayRef) -> Self {
135        assert_eq!(
136            ext_dtype.storage_dtype(),
137            storage.dtype(),
138            "ExtensionArray: storage_dtype must match storage array DType",
139        );
140        Self {
141            dtype: DType::Extension(ext_dtype),
142            storage,
143            stats_set: ArrayStats::default(),
144        }
145    }
146
147    pub fn ext_dtype(&self) -> &Arc<ExtDType> {
148        let DType::Extension(ext) = &self.dtype else {
149            unreachable!("ExtensionArray: dtype must be an ExtDType")
150        };
151        ext
152    }
153
154    pub fn storage(&self) -> &ArrayRef {
155        &self.storage
156    }
157
158    #[allow(dead_code)]
159    #[inline]
160    pub fn id(&self) -> &ExtID {
161        self.ext_dtype().id()
162    }
163}
164
165impl ArrayVTable<ExtensionVTable> for ExtensionVTable {
166    fn len(array: &ExtensionArray) -> usize {
167        array.storage.len()
168    }
169
170    fn dtype(array: &ExtensionArray) -> &DType {
171        &array.dtype
172    }
173
174    fn stats(array: &ExtensionArray) -> StatsSetRef<'_> {
175        array.stats_set.to_ref(array.as_ref())
176    }
177}
178
179impl ValidityChild<ExtensionVTable> for ExtensionVTable {
180    fn validity_child(array: &ExtensionArray) -> &dyn Array {
181        array.storage.as_ref()
182    }
183}
184
185impl CanonicalVTable<ExtensionVTable> for ExtensionVTable {
186    fn canonicalize(array: &ExtensionArray) -> VortexResult<Canonical> {
187        Ok(Canonical::Extension(array.clone()))
188    }
189}
190
191impl OperationsVTable<ExtensionVTable> for ExtensionVTable {
192    fn slice(array: &ExtensionArray, start: usize, stop: usize) -> VortexResult<ArrayRef> {
193        Ok(ExtensionArray::new(
194            array.ext_dtype().clone(),
195            array.storage().slice(start, stop)?,
196        )
197        .into_array())
198    }
199
200    fn scalar_at(array: &ExtensionArray, index: usize) -> VortexResult<Scalar> {
201        Ok(Scalar::extension(
202            array.ext_dtype().clone(),
203            array.storage().scalar_at(index)?,
204        ))
205    }
206}
207
208impl VisitorVTable<ExtensionVTable> for ExtensionVTable {
209    fn visit_buffers(_array: &ExtensionArray, _visitor: &mut dyn ArrayBufferVisitor) {}
210
211    fn visit_children(array: &ExtensionArray, visitor: &mut dyn ArrayChildVisitor) {
212        visitor.visit_child("storage", array.storage.as_ref());
213    }
214}