Skip to main content

arrow_convert/deserialize/
mod.rs

1//! Implementation and traits for deserializing from Arrow.
2mod iterable;
3pub use iterable::*;
4
5use arrow_array::{types, ArrowPrimitiveType, *};
6use arrow_buffer::{ArrowNativeType, Buffer, ScalarBuffer};
7use chrono::{DateTime, NaiveDate, NaiveDateTime, TimeZone, Utc};
8
9use crate::field::*;
10
/// Implemented by [`ArrowField`] that can be deserialized from arrow
///
/// An implementor names the arrow array type it reads from (`ArrayType`) and converts
/// one iterated item of that array at a time.
pub trait ArrowDeserialize: ArrowField + Sized
where
    Self::ArrayType: ArrowArray,
{
    /// The `arrow::Array` type corresponding to this field
    type ArrayType;

    /// Deserialize this field from arrow
    ///
    /// `v` is a single item yielded while iterating `Self::ArrayType`. The result is
    /// `None` when the item does not produce a value; the implementations in this module
    /// return `None` for null items.
    fn arrow_deserialize(v: <Self::ArrayType as ArrowArrayIterable>::Item<'_>) -> Option<<Self as ArrowField>::Type>;

    #[inline]
    #[doc(hidden)]
    /// For internal use only
    ///
    /// This is an ugly hack to allow generating a blanket Option<T> deserialize.
    /// Ideally we would be able to capture the optional field of the iterator via
    /// something like  T::ArrayType: ArrowArrayIterable<Item=Option<E>>,
    /// However, the E parameter seems to confuse the borrow checker if it's a reference.
    ///
    /// # Panics
    ///
    /// The default implementation unwraps the result of `arrow_deserialize`, so it panics
    /// whenever that call returns `None`.
    fn arrow_deserialize_internal(v: <Self::ArrayType as ArrowArrayIterable>::Item<'_>) -> <Self as ArrowField>::Type {
        Self::arrow_deserialize(v).unwrap()
    }
}
34
/// Internal trait used to support deserialization and iteration of structs, and nested struct lists
///
/// Trivial pass-thru implementations are provided for arrow arrays that implement ArrowArrayIterable.
///
/// The derive macro generates implementations for typed struct arrays.
#[doc(hidden)]
pub trait ArrowArray
where
    Self: ArrowArrayIterable,
{
    /// The concrete arrow array type that the untyped `&dyn Array` is downcast to by the
    /// implementations in this module.
    type BaseArrayType: Array;

    // Returns a typed iterator to the underlying elements of the array from an untyped Array reference.
    //
    // The implementations in this module unwrap the downcast to `BaseArrayType`, so they
    // panic when `b` is not of that concrete type.
    fn iter_from_array_ref(b: &dyn Array) -> <Self as ArrowArrayIterable>::Iter<'_>;
}
50
// Macro to facilitate implementation for numeric types and numeric arrays.
//
// `$physical_type` is the Rust type that receives the `ArrowDeserialize` impl and
// `$primitive_type` is the arrow primitive type whose `PrimitiveArray` it reads from.
// The expansion also registers that `PrimitiveArray` as an `ArrowArray` through
// `impl_arrow_array!`.
macro_rules! impl_arrow_deserialize_primitive {
    ($physical_type:ty, $primitive_type:ty) => {
        impl ArrowDeserialize for $physical_type {
            type ArrayType = PrimitiveArray<$primitive_type>;

            // The iterated item already has the target shape, so it is passed through unchanged.
            #[inline]
            fn arrow_deserialize<'a>(v: Option<<$primitive_type as ArrowPrimitiveType>::Native>) -> Option<Self> {
                v
            }
        }

        impl_arrow_array!(PrimitiveArray<$primitive_type>);
    };
}
66
// Generates the pass-through `ArrowArray` implementation for a concrete arrow array type:
// the array is its own `BaseArrayType` and iteration is delegated to `ArrowArrayIterable`.
macro_rules! impl_arrow_array {
    ($array:ty) => {
        impl ArrowArray for $array {
            type BaseArrayType = Self;

            // Panics (via `unwrap`) when `b` is not an instance of this array type.
            #[inline]
            fn iter_from_array_ref(b: &dyn Array) -> <Self as ArrowArrayIterable>::Iter<'_> {
                let typed = b.as_any().downcast_ref::<Self::BaseArrayType>().unwrap();
                <Self as ArrowArrayIterable>::iter(typed)
            }
        }
    };
}
80
81// blanket implementation for optional fields
82impl<T> ArrowDeserialize for Option<T>
83where
84    T: ArrowDeserialize,
85    T::ArrayType: 'static + ArrowArray,
86    T::ArrayType: ArrowArrayIterable,
87{
88    type ArrayType = <T as ArrowDeserialize>::ArrayType;
89
90    #[inline]
91    fn arrow_deserialize(v: <Self::ArrayType as ArrowArrayIterable>::Item<'_>) -> Option<<Self as ArrowField>::Type> {
92        Self::arrow_deserialize_internal(v).map(Some)
93    }
94
95    #[inline]
96    fn arrow_deserialize_internal(v: <Self::ArrayType as ArrowArrayIterable>::Item<'_>) -> <Self as ArrowField>::Type {
97        <T as ArrowDeserialize>::arrow_deserialize(v)
98    }
99}
100
// Primitive numeric types: each Rust type deserializes from the `PrimitiveArray` of the
// arrow type listed next to it.
impl_arrow_deserialize_primitive!(u8, types::UInt8Type);
impl_arrow_deserialize_primitive!(u16, types::UInt16Type);
impl_arrow_deserialize_primitive!(u32, types::UInt32Type);
impl_arrow_deserialize_primitive!(u64, types::UInt64Type);
impl_arrow_deserialize_primitive!(i8, types::Int8Type);
impl_arrow_deserialize_primitive!(i16, types::Int16Type);
impl_arrow_deserialize_primitive!(i32, types::Int32Type);
impl_arrow_deserialize_primitive!(i64, types::Int64Type);
impl_arrow_deserialize_primitive!(half::f16, types::Float16Type);
impl_arrow_deserialize_primitive!(f32, types::Float32Type);
impl_arrow_deserialize_primitive!(f64, types::Float64Type);
112
113impl<const PRECISION: u8, const SCALE: i8> ArrowDeserialize for I128<PRECISION, SCALE> {
114    type ArrayType = PrimitiveArray<types::Decimal128Type>;
115
116    #[inline]
117    fn arrow_deserialize<'a>(v: Option<i128>) -> Option<i128> {
118        v
119    }
120}
121
122impl_arrow_array!(PrimitiveArray<types::Decimal128Type>);
123
124impl ArrowDeserialize for String {
125    type ArrayType = StringArray;
126
127    #[inline]
128    fn arrow_deserialize(v: Option<&str>) -> Option<Self> {
129        v.map(|t| t.to_string())
130    }
131}
132
133impl ArrowDeserialize for LargeString {
134    type ArrayType = LargeStringArray;
135
136    #[inline]
137    fn arrow_deserialize(v: Option<&str>) -> Option<String> {
138        v.map(|t| t.to_string())
139    }
140}
141
// Booleans are read from a `BooleanArray`.
impl ArrowDeserialize for bool {
    type ArrayType = BooleanArray;

    // The iterated item is already an `Option<bool>`, so it is returned unchanged.
    #[inline]
    fn arrow_deserialize(v: Option<bool>) -> Option<Self> {
        v
    }
}
150
151impl ArrowDeserialize for NaiveDateTime {
152    type ArrayType = TimestampNanosecondArray;
153
154    #[inline]
155    fn arrow_deserialize(v: Option<i64>) -> Option<Self> {
156        v.and_then(arrow_array::temporal_conversions::timestamp_ns_to_datetime)
157    }
158}
159
160impl ArrowDeserialize for DateTime<Utc> {
161    type ArrayType = TimestampNanosecondArray;
162
163    #[inline]
164    fn arrow_deserialize(v: Option<i64>) -> Option<Self> {
165        v.map(|ns| Utc.timestamp_nanos(ns))
166    }
167}
168
169impl ArrowDeserialize for NaiveDate {
170    type ArrayType = Date32Array;
171
172    #[inline]
173    fn arrow_deserialize(v: Option<i32>) -> Option<Self> {
174        v.and_then(|t| arrow_array::temporal_conversions::as_date::<types::Date32Type>(t as i64))
175    }
176}
177
/// Iterator for [`BufferBinaryArray`]
///
/// Walks a borrowed `BinaryArray` by position, starting from `index`.
pub struct BufferBinaryArrayIter<'a> {
    // Position of the next element to yield.
    index: usize,
    // The array being iterated.
    array: &'a BinaryArray,
}
183
184impl<'a> Iterator for BufferBinaryArrayIter<'a> {
185    type Item = Option<&'a [u8]>;
186
187    fn next(&mut self) -> Option<Self::Item> {
188        if self.index >= self.array.len() {
189            None
190        } else if self.array.is_valid(self.index) {
191            // self.array.iter
192            let value = self.array.value(self.index);
193            self.index += 1;
194            Some(Some(value))
195        } else {
196            self.index += 1;
197            Some(None)
198        }
199    }
200}
201
202/// Internal `ArrowArray` helper to iterate over a `BinaryArray` while exposing Buffer slices
203pub struct BufferBinaryArray;
204
205impl ArrowArray for BufferBinaryArray {
206    type BaseArrayType = BinaryArray;
207    #[inline]
208    fn iter_from_array_ref(a: &dyn Array) -> <Self as ArrowArrayIterable>::Iter<'_> {
209        let b = a.as_any().downcast_ref::<Self::BaseArrayType>().unwrap();
210
211        BufferBinaryArrayIter { index: 0, array: b }
212    }
213}
214
215// Treat both Buffer and ScalarBuffer<u8> the same
216impl ArrowDeserialize for Buffer {
217    type ArrayType = BufferBinaryArray;
218
219    #[inline]
220    fn arrow_deserialize(v: Option<&[u8]>) -> Option<Self> {
221        v.map(|t| t.into())
222    }
223}
224impl ArrowDeserialize for ScalarBuffer<u8> {
225    type ArrayType = BufferBinaryArray;
226
227    #[inline]
228    fn arrow_deserialize(v: Option<&[u8]>) -> Option<Self> {
229        v.map(|t| ScalarBuffer::from(t.to_vec()))
230    }
231}
232
233impl ArrowDeserialize for Vec<u8> {
234    type ArrayType = BinaryArray;
235
236    #[inline]
237    fn arrow_deserialize(v: Option<&[u8]>) -> Option<Self> {
238        v.map(|t| t.to_vec())
239    }
240}
241
242impl ArrowDeserialize for LargeBinary {
243    type ArrayType = LargeBinaryArray;
244
245    #[inline]
246    fn arrow_deserialize(v: Option<&[u8]>) -> Option<Vec<u8>> {
247        v.map(|t| t.to_vec())
248    }
249}
250
251impl<const SIZE: i32> ArrowDeserialize for FixedSizeBinary<SIZE> {
252    type ArrayType = FixedSizeBinaryArray;
253
254    #[inline]
255    fn arrow_deserialize(v: Option<&[u8]>) -> Option<Vec<u8>> {
256        v.map(|t| t.to_vec())
257    }
258}
259
260impl<const SIZE: usize> ArrowDeserialize for [u8; SIZE] {
261    type ArrayType = FixedSizeBinaryArray;
262
263    #[inline]
264    fn arrow_deserialize(v: Option<&[u8]>) -> Option<[u8; SIZE]> {
265        v.map(|t| t.to_vec().try_into().unwrap())
266    }
267}
268
269pub(crate) fn arrow_deserialize_vec_helper<T>(v: Option<ArrayRef>) -> Option<<Vec<T> as ArrowField>::Type>
270where
271    T: ArrowDeserialize + ArrowEnableVecForType + 'static,
272    T::ArrayType: ArrowArrayIterable,
273{
274    use std::ops::Deref;
275    v.map(|t| {
276        arrow_array_deserialize_iterator_internal::<<T as ArrowField>::Type, T>(t.deref())
277            .collect::<Vec<<T as ArrowField>::Type>>()
278    })
279}
280
281// Blanket implementation for ScalarBuffer
282impl<T, K> ArrowDeserialize for ScalarBuffer<T>
283where
284    K: ArrowPrimitiveType<Native = T>,
285    T: ArrowDeserialize<ArrayType = PrimitiveArray<K>> + ArrowNativeType + ArrowEnableVecForType,
286    <T as ArrowDeserialize>::ArrayType: ArrowArrayIterable,
287{
288    type ArrayType = ListArray;
289
290    #[inline]
291    fn arrow_deserialize(v: <Self::ArrayType as ArrowArrayIterable>::Item<'_>) -> Option<<Self as ArrowField>::Type> {
292        let t = v?;
293        let array = t.as_any().downcast_ref::<PrimitiveArray<K>>().unwrap().values().clone();
294        Some(array)
295    }
296}
297
// Blanket implementation for Vec
//
// Reads from a `ListArray`; the per-element work is delegated to
// `arrow_deserialize_vec_helper`.
impl<T> ArrowDeserialize for Vec<T>
where
    T: ArrowDeserialize + ArrowEnableVecForType + 'static,
    <T as ArrowDeserialize>::ArrayType: 'static,
    <T as ArrowDeserialize>::ArrayType: ArrowArrayIterable,
{
    type ArrayType = ListArray;

    // `v` is one list element's child array, or `None` for a null element.
    fn arrow_deserialize(v: Option<ArrayRef>) -> Option<<Self as ArrowField>::Type> {
        arrow_deserialize_vec_helper::<T>(v)
    }
}
311
// `LargeVec` marker: same element handling as `Vec<T>`, but reading from a `LargeListArray`.
impl<T> ArrowDeserialize for LargeVec<T>
where
    T: ArrowDeserialize + ArrowEnableVecForType + 'static,
    <T as ArrowDeserialize>::ArrayType: 'static,
    <T as ArrowDeserialize>::ArrayType: ArrowArrayIterable,
{
    type ArrayType = LargeListArray;

    // `v` is one list element's child array, or `None` for a null element.
    fn arrow_deserialize(v: Option<ArrayRef>) -> Option<<Self as ArrowField>::Type> {
        arrow_deserialize_vec_helper::<T>(v)
    }
}
324
// `FixedSizeVec` marker: same element handling as `Vec<T>`, but reading from a
// `FixedSizeListArray`. This impl does not itself check the element count against `SIZE`.
impl<T, const SIZE: i32> ArrowDeserialize for FixedSizeVec<T, SIZE>
where
    T: ArrowDeserialize + ArrowEnableVecForType + 'static,
    <T as ArrowDeserialize>::ArrayType: 'static,
    <T as ArrowDeserialize>::ArrayType: ArrowArrayIterable,
{
    type ArrayType = FixedSizeListArray;

    // `v` is one list element's child array, or `None` for a null element.
    fn arrow_deserialize(v: Option<ArrayRef>) -> Option<<Self as ArrowField>::Type> {
        arrow_deserialize_vec_helper::<T>(v)
    }
}
337impl<T, const SIZE: usize> ArrowDeserialize for [T; SIZE]
338where
339    T: ArrowDeserialize + ArrowEnableVecForType + 'static,
340    <T as ArrowDeserialize>::ArrayType: 'static,
341    <T as ArrowDeserialize>::ArrayType: ArrowArrayIterable,
342{
343    type ArrayType = FixedSizeListArray;
344
345    fn arrow_deserialize(v: Option<ArrayRef>) -> Option<<Self as ArrowField>::Type> {
346        let result = arrow_deserialize_vec_helper::<T>(v)?;
347        let length = result.len();
348
349        match <[<T as ArrowField>::Type; SIZE]>::try_from(result).ok() {
350            None => panic!(
351                "Expected size of {} deserializing array of type `{}`, got {}",
352                std::any::type_name::<T>(),
353                SIZE,
354                length
355            ),
356            array => array,
357        }
358    }
359}
360
// Pass-through `ArrowArray` implementations for the concrete arrow array types below.
impl_arrow_array!(BooleanArray);
impl_arrow_array!(StringArray);
impl_arrow_array!(LargeStringArray);
impl_arrow_array!(BinaryArray);
impl_arrow_array!(LargeBinaryArray);
impl_arrow_array!(FixedSizeBinaryArray);
impl_arrow_array!(ListArray);
impl_arrow_array!(LargeListArray);
impl_arrow_array!(FixedSizeListArray);
impl_arrow_array!(Date32Array);
impl_arrow_array!(Date64Array);
impl_arrow_array!(TimestampSecondArray);
impl_arrow_array!(TimestampMillisecondArray);
impl_arrow_array!(TimestampMicrosecondArray);
impl_arrow_array!(TimestampNanosecondArray);
376
/// Top-level API to deserialize from Arrow
///
/// `Collection` is the container being built and `Element` is the Rust type of each
/// deserialized item.
pub trait TryIntoCollection<Collection, Element>
where
    Collection: FromIterator<Element>,
{
    /// Convert from a `arrow::Array` to any collection that implements the `FromIterator` trait
    ///
    /// # Errors
    ///
    /// Returns an [`arrow_schema::ArrowError`] when the conversion cannot be performed.
    fn try_into_collection(self) -> Result<Collection, arrow_schema::ArrowError>
    where
        Element: ArrowDeserialize + ArrowField<Type = Element> + 'static;

    /// Same as `try_into_collection` except can coerce the conversion to a specific Arrow type. This is
    /// useful when the same rust type maps to one or more Arrow types for example `LargeString`.
    ///
    /// # Errors
    ///
    /// Returns an [`arrow_schema::ArrowError`] when the conversion cannot be performed.
    fn try_into_collection_as_type<ArrowType>(self) -> Result<Collection, arrow_schema::ArrowError>
    where
        ArrowType: ArrowDeserialize + ArrowField<Type = Element> + 'static;
    //  <ArrowType as ArrowDeserialize>::ArrayType: ArrowArrayIterable;
}
394
395/// Helper to return an iterator for elements from a [`arrow::array::Array`].
396fn arrow_array_deserialize_iterator_internal<Element, Field>(b: &dyn Array) -> impl Iterator<Item = Element> + '_
397where
398    Field: ArrowDeserialize + ArrowField<Type = Element> + 'static,
399    <Field as ArrowDeserialize>::ArrayType: ArrowArrayIterable,
400{
401    <<Field as ArrowDeserialize>::ArrayType as ArrowArray>::iter_from_array_ref(b)
402        .map(<Field as ArrowDeserialize>::arrow_deserialize_internal)
403}
404
405/// Returns a typed iterator to a target type from an `arrow::Array`
406pub fn arrow_array_deserialize_iterator_as_type<Element, ArrowType>(
407    arr: &dyn Array,
408) -> Result<impl Iterator<Item = Element> + '_, arrow_schema::ArrowError>
409where
410    Element: 'static,
411    ArrowType: ArrowDeserialize + ArrowField<Type = Element> + 'static,
412    <ArrowType as ArrowDeserialize>::ArrayType: ArrowArrayIterable,
413{
414    if &<ArrowType as ArrowField>::data_type() != arr.data_type() {
415        Err(arrow_schema::ArrowError::InvalidArgumentError(format!(
416            "Data type mismatch. Expected type={:#?} is_nullable={}, but was type={:#?} is_nullable={}",
417            &<ArrowType as ArrowField>::data_type(),
418            &<ArrowType as ArrowField>::is_nullable(),
419            arr.data_type(),
420            arr.is_nullable()
421        )))
422    } else {
423        Ok(arrow_array_deserialize_iterator_internal::<Element, ArrowType>(
424            arr,
425        ))
426    }
427}
428
/// Return an iterator that deserializes an [`Array`] to an element of type T
///
/// This is `arrow_array_deserialize_iterator_as_type` with `T` used as both the element
/// type and the arrow field type.
///
/// # Errors
///
/// Propagates the error returned by `arrow_array_deserialize_iterator_as_type`.
pub fn arrow_array_deserialize_iterator<T>(
    arr: &dyn Array,
) -> Result<impl Iterator<Item = T> + '_, arrow_schema::ArrowError>
where
    T: ArrowDeserialize + ArrowField<Type = T> + 'static,
    <T as ArrowDeserialize>::ArrayType: ArrowArrayIterable,
{
    arrow_array_deserialize_iterator_as_type::<T, T>(arr)
}
439
440impl<Collection, Element, ArrowArray> TryIntoCollection<Collection, Element> for ArrowArray
441where
442    Element: 'static,
443    ArrowArray: std::borrow::Borrow<dyn Array>,
444    Collection: FromIterator<Element>,
445{
446    fn try_into_collection(self) -> Result<Collection, arrow_schema::ArrowError>
447    where
448        Element: ArrowDeserialize + ArrowField<Type = Element> + 'static,
449        <Element as ArrowDeserialize>::ArrayType: ArrowArrayIterable,
450    {
451        Ok(arrow_array_deserialize_iterator::<Element>(self.borrow())?.collect())
452    }
453
454    fn try_into_collection_as_type<ArrowType>(self) -> Result<Collection, arrow_schema::ArrowError>
455    where
456        ArrowType: ArrowDeserialize + ArrowField<Type = Element> + 'static,
457        <ArrowType as ArrowDeserialize>::ArrayType: ArrowArrayIterable,
458    {
459        Ok(arrow_array_deserialize_iterator_as_type::<Element, ArrowType>(self.borrow())?.collect())
460    }
461}