vortex_array/arrays/extension/array.rs
1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::fmt::Display;
5use std::fmt::Formatter;
6
7use vortex_error::VortexExpect;
8use vortex_error::VortexResult;
9
10use crate::ArrayRef;
11use crate::array::Array;
12use crate::array::ArrayParts;
13use crate::array::TypedArrayRef;
14use crate::arrays::Extension;
15use crate::dtype::DType;
16use crate::dtype::extension::ExtDTypeRef;
17
18/// The backing storage array for this extension array.
19pub(super) const STORAGE_SLOT: usize = 0;
20pub(super) const NUM_SLOTS: usize = 1;
21pub(super) const SLOT_NAMES: [&str; NUM_SLOTS] = ["storage"];
22
23/// An extension array that wraps another array with additional type information.
24///
25/// **⚠️ Unstable API**: This is an experimental feature that may change significantly
26/// in future versions. The extension type system is still evolving.
27///
28/// Unlike Apache Arrow's extension arrays, Vortex extension arrays provide a more flexible
29/// mechanism for adding semantic meaning to existing array types without requiring
30/// changes to the core type system.
31///
32/// ## Design Philosophy
33///
34/// Extension arrays serve as a type-safe wrapper that:
35/// - Preserves the underlying storage format and operations
36/// - Adds semantic type information via `ExtDType`
37/// - Enables custom serialization and deserialization logic
38/// - Allows domain-specific interpretations of generic data
39///
40/// ## Storage and Type Relationship
41///
42/// The extension array maintains a strict contract:
43/// - **Storage array**: Contains the actual data in a standard Vortex encoding
44/// - **Extension type**: Defines how to interpret the storage data semantically
45/// - **Type safety**: The storage array's dtype must match the extension type's storage dtype
46///
47/// ## Use Cases
48///
49/// Extension arrays are ideal for:
50/// - **Custom numeric types**: Units of measurement, currencies
51/// - **Temporal types**: Custom date/time formats, time zones, calendars
52/// - **Domain-specific types**: UUIDs, IP addresses, geographic coordinates
53/// - **Encoded types**: Base64 strings, compressed data, encrypted values
54///
55/// ## Validity and Operations
56///
57/// Extension arrays delegate validity and most operations to their storage array:
58/// - Validity is inherited from the underlying storage
59/// - Slicing preserves the extension type
60/// - Scalar access wraps storage scalars with extension metadata
61#[derive(Clone, Debug)]
62pub struct ExtensionData {
63 /// The storage dtype. This **must** be a [`Extension::DType`] variant.
64 pub(super) ext_dtype: ExtDTypeRef,
65}
66
67impl Display for ExtensionData {
68 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
69 write!(f, "ext_dtype: {}", self.ext_dtype)
70 }
71}
72
73impl ExtensionData {
74 /// Constructs a new `ExtensionArray`.
75 ///
76 /// # Panics
77 ///
78 /// Panics if the storage array in not compatible with the extension dtype.
79 pub fn new(ext_dtype: ExtDTypeRef, storage_dtype: &DType) -> Self {
80 Self::try_new(ext_dtype, storage_dtype).vortex_expect("Failed to create `ExtensionArray`")
81 }
82
83 /// Tries to construct a new `ExtensionArray`.
84 ///
85 /// # Errors
86 ///
87 /// Returns an error if the storage array in not compatible with the extension dtype.
88 pub fn try_new(ext_dtype: ExtDTypeRef, storage_dtype: &DType) -> VortexResult<Self> {
89 // TODO(connor): Replace these statements once we add `validate_storage_array`.
90 // ext_dtype.validate_storage_array(&storage_array)?;
91 assert_eq!(
92 ext_dtype.storage_dtype(),
93 storage_dtype,
94 "ExtensionArray: storage_dtype must match storage array DType",
95 );
96
97 // SAFETY: we validate that the inputs are valid above.
98 Ok(unsafe { Self::new_unchecked(ext_dtype, storage_dtype) })
99 }
100
101 /// Creates a new `ExtensionArray`.
102 ///
103 /// # Safety
104 ///
105 /// The caller must ensure that the storage array is compatible with the extension dtype. In
106 /// other words, they must know that `ext_dtype.validate_storage_array(&storage_array)` has been
107 /// called successfully on this storage array.
108 pub unsafe fn new_unchecked(ext_dtype: ExtDTypeRef, storage_dtype: &DType) -> Self {
109 // TODO(connor): Replace these statements once we add `validate_storage_array`.
110 // #[cfg(debug_assertions)]
111 // ext_dtype
112 // .validate_storage_array(&storage_array)
113 // .vortex_expect("[Debug Assertion]: Invalid storage array for `ExtensionArray`");
114 debug_assert_eq!(
115 ext_dtype.storage_dtype(),
116 storage_dtype,
117 "ExtensionArray: storage_dtype must match storage array DType",
118 );
119
120 Self { ext_dtype }
121 }
122
123 /// The extension dtype of this array.
124 pub fn ext_dtype(&self) -> &ExtDTypeRef {
125 &self.ext_dtype
126 }
127}
128
129pub trait ExtensionArrayExt: TypedArrayRef<Extension> {
130 fn storage_array(&self) -> &ArrayRef {
131 self.as_ref().slots()[STORAGE_SLOT]
132 .as_ref()
133 .vortex_expect("ExtensionArray storage slot")
134 }
135}
136impl<T: TypedArrayRef<Extension>> ExtensionArrayExt for T {}
137
138impl Array<Extension> {
139 /// Constructs a new `ExtensionArray`.
140 ///
141 /// # Panics
142 ///
143 /// Panics if the storage array is not compatible with the extension dtype.
144 pub fn new(ext_dtype: ExtDTypeRef, storage_array: ArrayRef) -> Self {
145 let dtype = DType::Extension(ext_dtype.clone());
146 let len = storage_array.len();
147 let data = ExtensionData::new(ext_dtype, storage_array.dtype());
148 unsafe {
149 Array::from_parts_unchecked(
150 ArrayParts::new(Extension, dtype, len, data).with_slots(vec![Some(storage_array)]),
151 )
152 }
153 }
154
155 /// Tries to construct a new `ExtensionArray`.
156 pub fn try_new(ext_dtype: ExtDTypeRef, storage_array: ArrayRef) -> VortexResult<Self> {
157 let dtype = DType::Extension(ext_dtype.clone());
158 let len = storage_array.len();
159 let data = ExtensionData::try_new(ext_dtype, storage_array.dtype())?;
160 Ok(unsafe {
161 Array::from_parts_unchecked(
162 ArrayParts::new(Extension, dtype, len, data).with_slots(vec![Some(storage_array)]),
163 )
164 })
165 }
166}