vortex_array/arrow/
array.rs

1use arrow_array::array::{
2    Array as ArrowArray, ArrayRef as ArrowArrayRef, ArrowPrimitiveType,
3    BooleanArray as ArrowBooleanArray, GenericByteArray, NullArray as ArrowNullArray,
4    OffsetSizeTrait, PrimitiveArray as ArrowPrimitiveArray, StructArray as ArrowStructArray,
5};
6use arrow_array::cast::{AsArray, as_null_array};
7use arrow_array::types::{
8    ByteArrayType, ByteViewType, Date32Type, Date64Type, DurationMicrosecondType,
9    DurationMillisecondType, DurationNanosecondType, DurationSecondType, Float16Type, Float32Type,
10    Float64Type, Int8Type, Int16Type, Int32Type, Int64Type, Time32MillisecondType,
11    Time32SecondType, Time64MicrosecondType, Time64NanosecondType, TimestampMicrosecondType,
12    TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, UInt8Type, UInt16Type,
13    UInt32Type, UInt64Type,
14};
15use arrow_array::{BinaryViewArray, GenericByteViewArray, GenericListArray, StringViewArray};
16use arrow_buffer::buffer::{NullBuffer, OffsetBuffer};
17use arrow_buffer::{ArrowNativeType, BooleanBuffer, Buffer as ArrowBuffer, ScalarBuffer};
18use arrow_schema::{DataType, TimeUnit as ArrowTimeUnit};
19use vortex_buffer::{Alignment, Buffer, ByteBuffer};
20use vortex_dtype::datetime::TimeUnit;
21use vortex_dtype::{DType, NativePType, PType};
22use vortex_error::{VortexExpect as _, vortex_panic};
23
24use crate::arrays::{
25    BoolArray, ListArray, NullArray, PrimitiveArray, StructArray, TemporalArray, VarBinArray,
26    VarBinViewArray,
27};
28use crate::arrow::FromArrowArray;
29use crate::validity::Validity;
30use crate::{Array, ArrayRef, IntoArray};
31
32impl IntoArray for ArrowBuffer {
33    fn into_array(self) -> ArrayRef {
34        PrimitiveArray::from_byte_buffer(
35            ByteBuffer::from_arrow_buffer(self, Alignment::of::<u8>()),
36            PType::U8,
37            Validity::NonNullable,
38        )
39        .into_array()
40    }
41}
42
43impl IntoArray for BooleanBuffer {
44    fn into_array(self) -> ArrayRef {
45        BoolArray::new(self, Validity::NonNullable).into_array()
46    }
47}
48
49impl<T> IntoArray for ScalarBuffer<T>
50where
51    T: ArrowNativeType + NativePType,
52{
53    fn into_array(self) -> ArrayRef {
54        PrimitiveArray::new(
55            Buffer::<T>::from_arrow_scalar_buffer(self),
56            Validity::NonNullable,
57        )
58        .into_array()
59    }
60}
61
62impl<O> IntoArray for OffsetBuffer<O>
63where
64    O: NativePType + OffsetSizeTrait,
65{
66    fn into_array(self) -> ArrayRef {
67        let primitive = PrimitiveArray::new(
68            Buffer::from_arrow_scalar_buffer(self.into_inner()),
69            Validity::NonNullable,
70        );
71
72        primitive.into_array()
73    }
74}
75
76impl<T: ArrowPrimitiveType> FromArrowArray<&ArrowPrimitiveArray<T>> for ArrayRef
77where
78    <T as ArrowPrimitiveType>::Native: NativePType,
79{
80    fn from_arrow(value: &ArrowPrimitiveArray<T>, nullable: bool) -> Self {
81        let arr = PrimitiveArray::new(
82            Buffer::from_arrow_scalar_buffer(value.values().clone()),
83            nulls(value.nulls(), nullable),
84        );
85
86        if T::DATA_TYPE.is_numeric() {
87            return arr.into_array();
88        }
89
90        match T::DATA_TYPE {
91            DataType::Timestamp(time_unit, tz) => {
92                let tz = tz.map(|s| s.to_string());
93                TemporalArray::new_timestamp(arr.into_array(), time_unit.into(), tz).into()
94            }
95            DataType::Time32(time_unit) => {
96                TemporalArray::new_time(arr.into_array(), time_unit.into()).into()
97            }
98            DataType::Time64(time_unit) => {
99                TemporalArray::new_time(arr.into_array(), time_unit.into()).into()
100            }
101            DataType::Date32 => TemporalArray::new_date(arr.into_array(), TimeUnit::D).into(),
102            DataType::Date64 => TemporalArray::new_date(arr.into_array(), TimeUnit::Ms).into(),
103            DataType::Duration(_) => unimplemented!(),
104            DataType::Interval(_) => unimplemented!(),
105            _ => vortex_panic!("Invalid data type for PrimitiveArray: {}", T::DATA_TYPE),
106        }
107    }
108}
109
110impl<T: ByteArrayType> FromArrowArray<&GenericByteArray<T>> for ArrayRef
111where
112    <T as ByteArrayType>::Offset: NativePType,
113{
114    fn from_arrow(value: &GenericByteArray<T>, nullable: bool) -> Self {
115        let dtype = match T::DATA_TYPE {
116            DataType::Binary | DataType::LargeBinary => DType::Binary(nullable.into()),
117            DataType::Utf8 | DataType::LargeUtf8 => DType::Utf8(nullable.into()),
118            _ => vortex_panic!("Invalid data type for ByteArray: {}", T::DATA_TYPE),
119        };
120        VarBinArray::try_new(
121            value.offsets().clone().into_array(),
122            ByteBuffer::from_arrow_buffer(value.values().clone(), Alignment::of::<u8>()),
123            dtype,
124            nulls(value.nulls(), nullable),
125        )
126        .vortex_expect("Failed to convert Arrow GenericByteArray to Vortex VarBinArray")
127        .into_array()
128    }
129}
130
131impl<T: ByteViewType> FromArrowArray<&GenericByteViewArray<T>> for ArrayRef {
132    fn from_arrow(value: &GenericByteViewArray<T>, nullable: bool) -> Self {
133        let dtype = match T::DATA_TYPE {
134            DataType::BinaryView => DType::Binary(nullable.into()),
135            DataType::Utf8View => DType::Utf8(nullable.into()),
136            _ => vortex_panic!("Invalid data type for ByteViewArray: {}", T::DATA_TYPE),
137        };
138
139        let views_buffer = Buffer::from_byte_buffer(
140            Buffer::from_arrow_scalar_buffer(value.views().clone()).into_byte_buffer(),
141        );
142
143        VarBinViewArray::try_new(
144            views_buffer,
145            value
146                .data_buffers()
147                .iter()
148                .map(|b| ByteBuffer::from_arrow_buffer(b.clone(), Alignment::of::<u8>()))
149                .collect::<Vec<_>>(),
150            dtype,
151            nulls(value.nulls(), nullable),
152        )
153        .vortex_expect("Failed to convert Arrow GenericByteViewArray to Vortex VarBinViewArray")
154        .into_array()
155    }
156}
157
158impl FromArrowArray<&ArrowBooleanArray> for ArrayRef {
159    fn from_arrow(value: &ArrowBooleanArray, nullable: bool) -> Self {
160        BoolArray::new(value.values().clone(), nulls(value.nulls(), nullable)).into_array()
161    }
162}
163
164impl FromArrowArray<&ArrowStructArray> for ArrayRef {
165    fn from_arrow(value: &ArrowStructArray, nullable: bool) -> Self {
166        StructArray::try_new(
167            value.column_names().iter().map(|s| (*s).into()).collect(),
168            value
169                .columns()
170                .iter()
171                .zip(value.fields())
172                .map(|(c, field)| Self::from_arrow(c.clone(), field.is_nullable()))
173                .collect(),
174            value.len(),
175            nulls(value.nulls(), nullable),
176        )
177        .vortex_expect("Failed to convert Arrow StructArray to Vortex StructArray")
178        .into_array()
179    }
180}
181
182impl<O: OffsetSizeTrait + NativePType> FromArrowArray<&GenericListArray<O>> for ArrayRef {
183    fn from_arrow(value: &GenericListArray<O>, nullable: bool) -> Self {
184        // Extract the validity of the underlying element array
185        let elem_nullable = match value.data_type() {
186            DataType::List(field) => field.is_nullable(),
187            DataType::LargeList(field) => field.is_nullable(),
188            dt => vortex_panic!("Invalid data type for ListArray: {dt}"),
189        };
190        ListArray::try_new(
191            Self::from_arrow(value.values().clone(), elem_nullable),
192            // offsets are always non-nullable
193            value.offsets().clone().into_array(),
194            nulls(value.nulls(), nullable),
195        )
196        .vortex_expect("Failed to convert Arrow StructArray to Vortex StructArray")
197        .into_array()
198    }
199}
200
201impl FromArrowArray<&ArrowNullArray> for ArrayRef {
202    fn from_arrow(value: &ArrowNullArray, nullable: bool) -> Self {
203        assert!(nullable);
204        NullArray::new(value.len()).into_array()
205    }
206}
207
208fn nulls(nulls: Option<&NullBuffer>, nullable: bool) -> Validity {
209    if nullable {
210        nulls
211            .map(|nulls| {
212                if nulls.null_count() == nulls.len() {
213                    Validity::AllInvalid
214                } else {
215                    Validity::from(nulls.inner().clone())
216                }
217            })
218            .unwrap_or_else(|| Validity::AllValid)
219    } else {
220        assert!(nulls.map(|x| x.null_count() == 0).unwrap_or(true));
221        Validity::NonNullable
222    }
223}
224
225impl FromArrowArray<ArrowArrayRef> for ArrayRef {
226    fn from_arrow(array: ArrowArrayRef, nullable: bool) -> Self {
227        match array.data_type() {
228            DataType::Boolean => Self::from_arrow(array.as_boolean(), nullable),
229            DataType::UInt8 => Self::from_arrow(array.as_primitive::<UInt8Type>(), nullable),
230            DataType::UInt16 => Self::from_arrow(array.as_primitive::<UInt16Type>(), nullable),
231            DataType::UInt32 => Self::from_arrow(array.as_primitive::<UInt32Type>(), nullable),
232            DataType::UInt64 => Self::from_arrow(array.as_primitive::<UInt64Type>(), nullable),
233            DataType::Int8 => Self::from_arrow(array.as_primitive::<Int8Type>(), nullable),
234            DataType::Int16 => Self::from_arrow(array.as_primitive::<Int16Type>(), nullable),
235            DataType::Int32 => Self::from_arrow(array.as_primitive::<Int32Type>(), nullable),
236            DataType::Int64 => Self::from_arrow(array.as_primitive::<Int64Type>(), nullable),
237            DataType::Float16 => Self::from_arrow(array.as_primitive::<Float16Type>(), nullable),
238            DataType::Float32 => Self::from_arrow(array.as_primitive::<Float32Type>(), nullable),
239            DataType::Float64 => Self::from_arrow(array.as_primitive::<Float64Type>(), nullable),
240            DataType::Utf8 => Self::from_arrow(array.as_string::<i32>(), nullable),
241            DataType::LargeUtf8 => Self::from_arrow(array.as_string::<i64>(), nullable),
242            DataType::Binary => Self::from_arrow(array.as_binary::<i32>(), nullable),
243            DataType::LargeBinary => Self::from_arrow(array.as_binary::<i64>(), nullable),
244            DataType::BinaryView => Self::from_arrow(
245                array
246                    .as_any()
247                    .downcast_ref::<BinaryViewArray>()
248                    .vortex_expect("Expected Arrow BinaryViewArray for DataType::BinaryView"),
249                nullable,
250            ),
251            DataType::Utf8View => Self::from_arrow(
252                array
253                    .as_any()
254                    .downcast_ref::<StringViewArray>()
255                    .vortex_expect("Expected Arrow StringViewArray for DataType::Utf8View"),
256                nullable,
257            ),
258            DataType::Struct(_) => Self::from_arrow(array.as_struct(), nullable),
259            DataType::List(_) => Self::from_arrow(array.as_list::<i32>(), nullable),
260            DataType::LargeList(_) => Self::from_arrow(array.as_list::<i64>(), nullable),
261            DataType::Null => Self::from_arrow(as_null_array(&array), nullable),
262            DataType::Timestamp(u, _) => match u {
263                ArrowTimeUnit::Second => {
264                    Self::from_arrow(array.as_primitive::<TimestampSecondType>(), nullable)
265                }
266                ArrowTimeUnit::Millisecond => {
267                    Self::from_arrow(array.as_primitive::<TimestampMillisecondType>(), nullable)
268                }
269                ArrowTimeUnit::Microsecond => {
270                    Self::from_arrow(array.as_primitive::<TimestampMicrosecondType>(), nullable)
271                }
272                ArrowTimeUnit::Nanosecond => {
273                    Self::from_arrow(array.as_primitive::<TimestampNanosecondType>(), nullable)
274                }
275            },
276            DataType::Date32 => Self::from_arrow(array.as_primitive::<Date32Type>(), nullable),
277            DataType::Date64 => Self::from_arrow(array.as_primitive::<Date64Type>(), nullable),
278            DataType::Time32(u) => match u {
279                ArrowTimeUnit::Second => {
280                    Self::from_arrow(array.as_primitive::<Time32SecondType>(), nullable)
281                }
282                ArrowTimeUnit::Millisecond => {
283                    Self::from_arrow(array.as_primitive::<Time32MillisecondType>(), nullable)
284                }
285                _ => unreachable!(),
286            },
287            DataType::Time64(u) => match u {
288                ArrowTimeUnit::Microsecond => {
289                    Self::from_arrow(array.as_primitive::<Time64MicrosecondType>(), nullable)
290                }
291                ArrowTimeUnit::Nanosecond => {
292                    Self::from_arrow(array.as_primitive::<Time64NanosecondType>(), nullable)
293                }
294                _ => unreachable!(),
295            },
296            DataType::Duration(u) => match u {
297                ArrowTimeUnit::Second => {
298                    Self::from_arrow(array.as_primitive::<DurationSecondType>(), nullable)
299                }
300                ArrowTimeUnit::Millisecond => {
301                    Self::from_arrow(array.as_primitive::<DurationMillisecondType>(), nullable)
302                }
303                ArrowTimeUnit::Microsecond => {
304                    Self::from_arrow(array.as_primitive::<DurationMicrosecondType>(), nullable)
305                }
306                ArrowTimeUnit::Nanosecond => {
307                    Self::from_arrow(array.as_primitive::<DurationNanosecondType>(), nullable)
308                }
309            },
310            _ => vortex_panic!(
311                "Array encoding not implemented for Arrow data type {}",
312                array.data_type().clone()
313            ),
314        }
315    }
316}