vortex_array/arrays/extension/
mod.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::sync::Arc;
5
6use vortex_dtype::{DType, ExtDType, ExtID};
7use vortex_error::VortexResult;
8use vortex_scalar::Scalar;
9
10use crate::stats::{ArrayStats, StatsSetRef};
11use crate::vtable::{
12    ArrayVTable, CanonicalVTable, NotSupported, OperationsVTable, VTable, ValidityChild,
13    ValidityVTableFromChild, VisitorVTable,
14};
15use crate::{
16    Array, ArrayBufferVisitor, ArrayChildVisitor, ArrayRef, Canonical, EncodingId, EncodingRef,
17    IntoArray, vtable,
18};
19
20mod compute;
21mod serde;
22
23vtable!(Extension);
24
25impl VTable for ExtensionVTable {
26    type Array = ExtensionArray;
27    type Encoding = ExtensionEncoding;
28
29    type ArrayVTable = Self;
30    type CanonicalVTable = Self;
31    type OperationsVTable = Self;
32    type ValidityVTable = ValidityVTableFromChild;
33    type VisitorVTable = Self;
34    type ComputeVTable = NotSupported;
35    type EncodeVTable = NotSupported;
36    type PipelineVTable = NotSupported;
37    type SerdeVTable = Self;
38
39    fn id(_encoding: &Self::Encoding) -> EncodingId {
40        EncodingId::new_ref("vortex.ext")
41    }
42
43    fn encoding(_array: &Self::Array) -> EncodingRef {
44        EncodingRef::new_ref(ExtensionEncoding.as_ref())
45    }
46}
47
/// Zero-sized marker type naming the extension encoding.
///
/// It is used as `VTable::Encoding` for `ExtensionVTable` and carries no state.
#[derive(Debug, Clone)]
pub struct ExtensionEncoding;
50
/// An extension array that wraps another array with additional type information.
///
/// **⚠️ Unstable API**: This is an experimental feature that may change significantly
/// in future versions. The extension type system is still evolving.
///
/// Unlike Apache Arrow's extension arrays, Vortex extension arrays provide a more flexible
/// mechanism for adding semantic meaning to existing array types without requiring
/// changes to the core type system.
///
/// ## Design Philosophy
///
/// Extension arrays serve as a type-safe wrapper that:
/// - Preserves the underlying storage format and operations
/// - Adds semantic type information via `ExtDType`
/// - Enables custom serialization and deserialization logic
/// - Allows domain-specific interpretations of generic data
///
/// ## Storage and Type Relationship
///
/// The extension array maintains a strict contract:
/// - **Storage array**: Contains the actual data in a standard Vortex encoding
/// - **Extension type**: Defines how to interpret the storage data semantically
/// - **Type safety**: The storage array's dtype must match the extension type's storage dtype
///   (`ExtensionArray::new` asserts this and panics on a mismatch)
///
/// ## Use Cases
///
/// Extension arrays are ideal for:
/// - **Custom numeric types**: Units of measurement, currencies
/// - **Temporal types**: Custom date/time formats, time zones, calendars
/// - **Domain-specific types**: UUIDs, IP addresses, geographic coordinates
/// - **Encoded types**: Base64 strings, compressed data, encrypted values
///
/// ## Validity and Operations
///
/// Extension arrays delegate validity and most operations to their storage array:
/// - Validity is inherited from the underlying storage
/// - Slicing preserves the extension type
/// - Scalar access wraps storage scalars with extension metadata
///
/// # Examples
///
/// ```
/// use std::sync::Arc;
/// use vortex_array::arrays::{ExtensionArray, PrimitiveArray};
/// use vortex_dtype::{ExtDType, ExtID, DType, Nullability, PType};
/// use vortex_array::validity::Validity;
/// use vortex_array::IntoArray;
/// use vortex_buffer::buffer;
///
/// // Define a custom extension type for representing currency values
/// let currency_id = ExtID::from("example.currency");
/// let currency_dtype = Arc::new(ExtDType::new(
///     currency_id,
///     Arc::new(DType::Primitive(PType::I64, Nullability::NonNullable)), // Storage as i64 cents
///     None, // No additional metadata needed
/// ));
///
/// // Create storage array with currency values in cents
/// let cents_storage = PrimitiveArray::new(
///     buffer![12345i64, 67890, 99999], // $123.45, $678.90, $999.99
///     Validity::NonNullable
/// );
///
/// // Wrap with extension type
/// let currency_array = ExtensionArray::new(
///     currency_dtype.clone(),
///     cents_storage.into_array()
/// );
///
/// assert_eq!(currency_array.len(), 3);
/// assert_eq!(currency_array.id().as_ref(), "example.currency");
///
/// // Access maintains extension type information
/// let first_value = currency_array.scalar_at(0);
/// assert!(first_value.as_extension_opt().is_some());
/// ```
#[derive(Clone, Debug)]
pub struct ExtensionArray {
    /// The array's logical type. `new` always stores a `DType::Extension(..)` here, and
    /// `ext_dtype` relies on that variant being present.
    dtype: DType,
    /// The wrapped array holding the actual values; length and validity come from it.
    storage: ArrayRef,
    /// Statistics for this array; initialized to `ArrayStats::default()` by `new`.
    stats_set: ArrayStats,
}
133
134impl ExtensionArray {
135    pub fn new(ext_dtype: Arc<ExtDType>, storage: ArrayRef) -> Self {
136        assert_eq!(
137            ext_dtype.storage_dtype(),
138            storage.dtype(),
139            "ExtensionArray: storage_dtype must match storage array DType",
140        );
141        Self {
142            dtype: DType::Extension(ext_dtype),
143            storage,
144            stats_set: ArrayStats::default(),
145        }
146    }
147
148    pub fn ext_dtype(&self) -> &Arc<ExtDType> {
149        let DType::Extension(ext) = &self.dtype else {
150            unreachable!("ExtensionArray: dtype must be an ExtDType")
151        };
152        ext
153    }
154
155    pub fn storage(&self) -> &ArrayRef {
156        &self.storage
157    }
158
159    #[allow(dead_code)]
160    #[inline]
161    pub fn id(&self) -> &ExtID {
162        self.ext_dtype().id()
163    }
164}
165
166impl ArrayVTable<ExtensionVTable> for ExtensionVTable {
167    fn len(array: &ExtensionArray) -> usize {
168        array.storage.len()
169    }
170
171    fn dtype(array: &ExtensionArray) -> &DType {
172        &array.dtype
173    }
174
175    fn stats(array: &ExtensionArray) -> StatsSetRef<'_> {
176        array.stats_set.to_ref(array.as_ref())
177    }
178}
179
180impl ValidityChild<ExtensionVTable> for ExtensionVTable {
181    fn validity_child(array: &ExtensionArray) -> &dyn Array {
182        array.storage.as_ref()
183    }
184}
185
186impl CanonicalVTable<ExtensionVTable> for ExtensionVTable {
187    fn canonicalize(array: &ExtensionArray) -> VortexResult<Canonical> {
188        Ok(Canonical::Extension(array.clone()))
189    }
190}
191
192impl OperationsVTable<ExtensionVTable> for ExtensionVTable {
193    fn slice(array: &ExtensionArray, start: usize, stop: usize) -> ArrayRef {
194        ExtensionArray::new(
195            array.ext_dtype().clone(),
196            array.storage().slice(start, stop),
197        )
198        .into_array()
199    }
200
201    fn scalar_at(array: &ExtensionArray, index: usize) -> Scalar {
202        Scalar::extension(array.ext_dtype().clone(), array.storage().scalar_at(index))
203    }
204}
205
206impl VisitorVTable<ExtensionVTable> for ExtensionVTable {
207    fn visit_buffers(_array: &ExtensionArray, _visitor: &mut dyn ArrayBufferVisitor) {}
208
209    fn visit_children(array: &ExtensionArray, visitor: &mut dyn ArrayChildVisitor) {
210        visitor.visit_child("storage", array.storage.as_ref());
211    }
212}