polars_arrow/ffi/
array.rs

1//! Contains functionality to load an ArrayData from the C Data Interface
2use std::sync::Arc;
3
4use polars_error::{PolarsResult, polars_bail};
5
6use super::ArrowArray;
7use crate::array::*;
8use crate::bitmap::Bitmap;
9use crate::bitmap::utils::bytes_for;
10use crate::buffer::Buffer;
11use crate::datatypes::{ArrowDataType, PhysicalType};
12use crate::ffi::schema::get_child;
13use crate::storage::SharedStorage;
14use crate::types::NativeType;
15use crate::{ffi, match_integer_type, with_match_primitive_type_full};
16
/// Reads a valid `ffi` interface into a `Box<dyn Array>`.
///
/// The concrete array type is selected from the physical type of `array.dtype()`, and the
/// matching `try_from_ffi` constructor performs the actual import.
///
/// # Errors
/// If and only if:
/// * the interface is not valid (e.g. a null pointer)
///
/// # Safety
/// NOTE(review): no explicit contract is stated in this file. Presumably `array` must describe
/// memory that is valid according to the Arrow C Data Interface for its declared dtype —
/// confirm against the individual `try_from_ffi` implementations.
pub unsafe fn try_from<A: ArrowArrayRef>(array: A) -> PolarsResult<Box<dyn Array>> {
    use PhysicalType::*;
    // One arm per physical type; every arm boxes the imported array as `dyn Array`.
    Ok(match array.dtype().to_physical_type() {
        Null => Box::new(NullArray::try_from_ffi(array)?),
        Boolean => Box::new(BooleanArray::try_from_ffi(array)?),
        // The macro expands to one branch per primitive native type `$T`.
        Primitive(primitive) => with_match_primitive_type_full!(primitive, |$T| {
            Box::new(PrimitiveArray::<$T>::try_from_ffi(array)?)
        }),
        Utf8 => Box::new(Utf8Array::<i32>::try_from_ffi(array)?),
        LargeUtf8 => Box::new(Utf8Array::<i64>::try_from_ffi(array)?),
        Binary => Box::new(BinaryArray::<i32>::try_from_ffi(array)?),
        LargeBinary => Box::new(BinaryArray::<i64>::try_from_ffi(array)?),
        FixedSizeBinary => Box::new(FixedSizeBinaryArray::try_from_ffi(array)?),
        List => Box::new(ListArray::<i32>::try_from_ffi(array)?),
        LargeList => Box::new(ListArray::<i64>::try_from_ffi(array)?),
        FixedSizeList => Box::new(FixedSizeListArray::try_from_ffi(array)?),
        Struct => Box::new(StructArray::try_from_ffi(array)?),
        // The macro expands to one branch per dictionary key integer type `$T`.
        Dictionary(key_type) => {
            match_integer_type!(key_type, |$T| {
                Box::new(DictionaryArray::<$T>::try_from_ffi(array)?)
            })
        },
        Union => Box::new(UnionArray::try_from_ffi(array)?),
        Map => Box::new(MapArray::try_from_ffi(array)?),
        BinaryView => Box::new(BinaryViewArray::try_from_ffi(array)?),
        Utf8View => Box::new(Utf8ViewArray::try_from_ffi(array)?),
    })
}
49
// `ArrowArray` stores raw pointers, so `Send` and `Sync` are asserted manually here.
// Sound because the Arrow specification does not allow multiple implementations
// to change this struct.
// This is intrinsically impossible to prove because the implementations agree
// on this as part of the Arrow specification.
unsafe impl Send for ArrowArray {}
unsafe impl Sync for ArrowArray {}
56
57impl Drop for ArrowArray {
58    fn drop(&mut self) {
59        match self.release {
60            None => (),
61            Some(release) => unsafe { release(self) },
62        };
63    }
64}
65
66// callback used to drop [ArrowArray] when it is exported
67unsafe extern "C" fn c_release_array(array: *mut ArrowArray) {
68    if array.is_null() {
69        return;
70    }
71    let array = &mut *array;
72
73    // take ownership of `private_data`, therefore dropping it
74    let private = Box::from_raw(array.private_data as *mut PrivateData);
75    for child in private.children_ptr.iter() {
76        let _ = Box::from_raw(*child);
77    }
78
79    if let Some(ptr) = private.dictionary_ptr {
80        let _ = Box::from_raw(ptr);
81    }
82
83    array.release = None;
84}
85
/// Rust-side state stored behind `ArrowArray::private_data` for arrays exported by
/// `ArrowArray::new`; it is re-boxed and dropped by `c_release_array`.
// Some fields are only held to keep memory alive and are never read, hence `dead_code`.
#[allow(dead_code)]
struct PrivateData {
    /// The exported array itself.
    array: Box<dyn Array>,
    /// The table of buffer pointers that `ArrowArray::buffers` points to.
    buffers_ptr: Box<[*const std::os::raw::c_void]>,
    /// The table of child pointers that `ArrowArray::children` points to; each entry comes
    /// from `Box::into_raw` and is freed in `c_release_array`.
    children_ptr: Box<[*mut ArrowArray]>,
    /// The exported dictionary, if any; comes from `Box::into_raw` and is freed in
    /// `c_release_array`.
    dictionary_ptr: Option<*mut ArrowArray>,
    /// Backing storage of the extra buffer pushed for `BinaryView`/`Utf8View` arrays
    /// (empty for every other dtype).
    variadic_buffer_sizes: Box<[i64]>,
}
94
95impl ArrowArray {
96    /// creates a new `ArrowArray` from existing data.
97    ///
98    /// # Safety
99    /// This method releases `buffers`. Consumers of this struct *must* call `release` before
100    /// releasing this struct, or contents in `buffers` leak.
101    pub(crate) fn new(array: Box<dyn Array>) -> Self {
102        #[allow(unused_mut)]
103        let (offset, mut buffers, children, dictionary) =
104            offset_buffers_children_dictionary(array.as_ref());
105
106        let variadic_buffer_sizes = match array.dtype() {
107            ArrowDataType::BinaryView => {
108                let arr = array.as_any().downcast_ref::<BinaryViewArray>().unwrap();
109                let boxed = arr.variadic_buffer_lengths().into_boxed_slice();
110                let ptr = boxed.as_ptr().cast::<u8>();
111                buffers.push(Some(ptr));
112                boxed
113            },
114            ArrowDataType::Utf8View => {
115                let arr = array.as_any().downcast_ref::<Utf8ViewArray>().unwrap();
116                let boxed = arr.variadic_buffer_lengths().into_boxed_slice();
117                let ptr = boxed.as_ptr().cast::<u8>();
118                buffers.push(Some(ptr));
119                boxed
120            },
121            _ => Box::new([]),
122        };
123
124        let buffers_ptr = buffers
125            .iter()
126            .map(|maybe_buffer| match maybe_buffer {
127                Some(b) => *b as *const std::os::raw::c_void,
128                None => std::ptr::null(),
129            })
130            .collect::<Box<[_]>>();
131        let n_buffers = buffers.len() as i64;
132
133        let children_ptr = children
134            .into_iter()
135            .map(|child| {
136                Box::into_raw(Box::new(ArrowArray::new(ffi::align_to_c_data_interface(
137                    child,
138                ))))
139            })
140            .collect::<Box<_>>();
141        let n_children = children_ptr.len() as i64;
142
143        let dictionary_ptr = dictionary.map(|array| {
144            Box::into_raw(Box::new(ArrowArray::new(ffi::align_to_c_data_interface(
145                array,
146            ))))
147        });
148
149        let length = array.len() as i64;
150        let null_count = array.null_count() as i64;
151
152        let mut private_data = Box::new(PrivateData {
153            array,
154            buffers_ptr,
155            children_ptr,
156            dictionary_ptr,
157            variadic_buffer_sizes,
158        });
159
160        Self {
161            length,
162            null_count,
163            offset: offset as i64,
164            n_buffers,
165            n_children,
166            buffers: private_data.buffers_ptr.as_mut_ptr(),
167            children: private_data.children_ptr.as_mut_ptr(),
168            dictionary: private_data.dictionary_ptr.unwrap_or(std::ptr::null_mut()),
169            release: Some(c_release_array),
170            private_data: Box::into_raw(private_data) as *mut ::std::os::raw::c_void,
171        }
172    }
173
174    /// creates an empty [`ArrowArray`], which can be used to import data into
175    pub fn empty() -> Self {
176        Self {
177            length: 0,
178            null_count: 0,
179            offset: 0,
180            n_buffers: 0,
181            n_children: 0,
182            buffers: std::ptr::null_mut(),
183            children: std::ptr::null_mut(),
184            dictionary: std::ptr::null_mut(),
185            release: None,
186            private_data: std::ptr::null_mut(),
187        }
188    }
189
190    /// the length of the array
191    pub(crate) fn len(&self) -> usize {
192        self.length as usize
193    }
194
195    /// the offset of the array
196    pub(crate) fn offset(&self) -> usize {
197        self.offset as usize
198    }
199
200    /// the null count of the array
201    pub(crate) fn null_count(&self) -> usize {
202        self.null_count as usize
203    }
204}
205
206/// # Safety
207/// The caller must ensure that the buffer at index `i` is not mutably shared.
208unsafe fn get_buffer_ptr<T: NativeType>(
209    array: &ArrowArray,
210    dtype: &ArrowDataType,
211    index: usize,
212) -> PolarsResult<*mut T> {
213    if array.buffers.is_null() {
214        polars_bail!( ComputeError:
215            "an ArrowArray of type {dtype:?} must have non-null buffers"
216        );
217    }
218
219    if array.buffers.align_offset(align_of::<*mut *const u8>()) != 0 {
220        polars_bail!( ComputeError:
221            "an ArrowArray of type {dtype:?}
222            must have buffer {index} aligned to type {}",
223            std::any::type_name::<*mut *const u8>()
224        );
225    }
226    let buffers = array.buffers as *mut *const u8;
227
228    if index >= array.n_buffers as usize {
229        polars_bail!(ComputeError:
230            "An ArrowArray of type {dtype:?}
231             must have buffer {index}."
232        )
233    }
234
235    let ptr = *buffers.add(index);
236    if ptr.is_null() {
237        polars_bail!(ComputeError:
238            "An array of type {dtype:?}
239            must have a non-null buffer {index}"
240        )
241    }
242
243    // note: we can't prove that this pointer is not mutably shared - part of the safety invariant
244    Ok(ptr as *mut T)
245}
246
247unsafe fn create_buffer_known_len<T: NativeType>(
248    array: &ArrowArray,
249    dtype: &ArrowDataType,
250    owner: InternalArrowArray,
251    len: usize,
252    index: usize,
253) -> PolarsResult<Buffer<T>> {
254    if len == 0 {
255        return Ok(Buffer::new());
256    }
257    let ptr: *mut T = get_buffer_ptr(array, dtype, index)?;
258    let storage = SharedStorage::from_internal_arrow_array(ptr, len, owner);
259    Ok(Buffer::from_storage(storage))
260}
261
262/// returns the buffer `i` of `array` interpreted as a [`Buffer`].
263/// # Safety
264/// This function is safe iff:
265/// * the buffers up to position `index` are valid for the declared length
266/// * the buffers' pointers are not mutably shared for the lifetime of `owner`
267unsafe fn create_buffer<T: NativeType>(
268    array: &ArrowArray,
269    dtype: &ArrowDataType,
270    owner: InternalArrowArray,
271    index: usize,
272) -> PolarsResult<Buffer<T>> {
273    let len = buffer_len(array, dtype, index)?;
274
275    if len == 0 {
276        return Ok(Buffer::new());
277    }
278
279    let offset = buffer_offset(array, dtype, index);
280    let ptr: *mut T = get_buffer_ptr(array, dtype, index)?;
281
282    // We have to check alignment.
283    // This is the zero-copy path.
284    if ptr.align_offset(align_of::<T>()) == 0 {
285        let storage = SharedStorage::from_internal_arrow_array(ptr, len, owner);
286        Ok(Buffer::from_storage(storage).sliced(offset, len - offset))
287    }
288    // This is the path where alignment isn't correct.
289    // We copy the data to a new vec
290    else {
291        let buf = std::slice::from_raw_parts(ptr, len - offset).to_vec();
292        Ok(Buffer::from(buf))
293    }
294}
295
296/// returns the buffer `i` of `array` interpreted as a [`Bitmap`].
297/// # Safety
298/// This function is safe iff:
299/// * the buffer at position `index` is valid for the declared length
300/// * the buffers' pointer is not mutable for the lifetime of `owner`
301unsafe fn create_bitmap(
302    array: &ArrowArray,
303    dtype: &ArrowDataType,
304    owner: InternalArrowArray,
305    index: usize,
306    // if this is the validity bitmap
307    // we can use the null count directly
308    is_validity: bool,
309) -> PolarsResult<Bitmap> {
310    let len: usize = array.length.try_into().expect("length to fit in `usize`");
311    if len == 0 {
312        return Ok(Bitmap::new());
313    }
314    let ptr = get_buffer_ptr(array, dtype, index)?;
315
316    // Pointer of u8 has alignment 1, so we don't have to check alignment.
317
318    let offset: usize = array.offset.try_into().expect("offset to fit in `usize`");
319    let bytes_len = bytes_for(offset + len);
320    let storage = SharedStorage::from_internal_arrow_array(ptr, bytes_len, owner);
321
322    let null_count = if is_validity {
323        Some(array.null_count())
324    } else {
325        None
326    };
327    Ok(Bitmap::from_inner_unchecked(
328        storage, offset, len, null_count,
329    ))
330}
331
332fn buffer_offset(array: &ArrowArray, dtype: &ArrowDataType, i: usize) -> usize {
333    use PhysicalType::*;
334    match (dtype.to_physical_type(), i) {
335        (LargeUtf8, 2) | (LargeBinary, 2) | (Utf8, 2) | (Binary, 2) => 0,
336        (FixedSizeBinary, 1) => {
337            if let ArrowDataType::FixedSizeBinary(size) = dtype.to_logical_type() {
338                let offset: usize = array.offset.try_into().expect("Offset to fit in `usize`");
339                offset * *size
340            } else {
341                unreachable!()
342            }
343        },
344        _ => array.offset.try_into().expect("Offset to fit in `usize`"),
345    }
346}
347
348/// Returns the length, in slots, of the buffer `i` (indexed according to the C data interface)
349unsafe fn buffer_len(array: &ArrowArray, dtype: &ArrowDataType, i: usize) -> PolarsResult<usize> {
350    Ok(match (dtype.to_physical_type(), i) {
351        (PhysicalType::FixedSizeBinary, 1) => {
352            if let ArrowDataType::FixedSizeBinary(size) = dtype.to_logical_type() {
353                *size * (array.offset as usize + array.length as usize)
354            } else {
355                unreachable!()
356            }
357        },
358        (PhysicalType::FixedSizeList, 1) => {
359            if let ArrowDataType::FixedSizeList(_, size) = dtype.to_logical_type() {
360                *size * (array.offset as usize + array.length as usize)
361            } else {
362                unreachable!()
363            }
364        },
365        (PhysicalType::Utf8, 1)
366        | (PhysicalType::LargeUtf8, 1)
367        | (PhysicalType::Binary, 1)
368        | (PhysicalType::LargeBinary, 1)
369        | (PhysicalType::List, 1)
370        | (PhysicalType::LargeList, 1)
371        | (PhysicalType::Map, 1) => {
372            // the len of the offset buffer (buffer 1) equals length + 1
373            array.offset as usize + array.length as usize + 1
374        },
375        (PhysicalType::BinaryView, 1) | (PhysicalType::Utf8View, 1) => {
376            array.offset as usize + array.length as usize
377        },
378        (PhysicalType::Utf8, 2) | (PhysicalType::Binary, 2) => {
379            // the len of the data buffer (buffer 2) equals the last value of the offset buffer (buffer 1)
380            let len = buffer_len(array, dtype, 1)?;
381            // first buffer is the null buffer => add(1)
382            let offset_buffer = unsafe { *(array.buffers as *mut *const u8).add(1) };
383            // interpret as i32
384            let offset_buffer = offset_buffer as *const i32;
385            // get last offset
386
387            (unsafe { *offset_buffer.add(len - 1) }) as usize
388        },
389        (PhysicalType::LargeUtf8, 2) | (PhysicalType::LargeBinary, 2) => {
390            // the len of the data buffer (buffer 2) equals the last value of the offset buffer (buffer 1)
391            let len = buffer_len(array, dtype, 1)?;
392            // first buffer is the null buffer => add(1)
393            let offset_buffer = unsafe { *(array.buffers as *mut *const u8).add(1) };
394            // interpret as i64
395            let offset_buffer = offset_buffer as *const i64;
396            // get last offset
397            (unsafe { *offset_buffer.add(len - 1) }) as usize
398        },
399        // buffer len of primitive types
400        _ => array.offset as usize + array.length as usize,
401    })
402}
403
404/// # Safety
405///
406/// This function is safe iff:
407/// * `array.children` at `index` is valid
408/// * `array.children` is not mutably shared for the lifetime of `parent`
409/// * the pointer of `array.children` at `index` is valid
410/// * the pointer of `array.children` at `index` is not mutably shared for the lifetime of `parent`
411unsafe fn create_child(
412    array: &ArrowArray,
413    dtype: &ArrowDataType,
414    parent: InternalArrowArray,
415    index: usize,
416) -> PolarsResult<ArrowArrayChild<'static>> {
417    let dtype = get_child(dtype, index)?;
418
419    // catch what we can
420    if array.children.is_null() {
421        polars_bail!(ComputeError: "an ArrowArray of type {dtype:?} must have non-null children");
422    }
423
424    if index >= array.n_children as usize {
425        polars_bail!(ComputeError:
426            "an ArrowArray of type {dtype:?}
427             must have child {index}."
428        );
429    }
430
431    // SAFETY: part of the invariant
432    let arr_ptr = unsafe { *array.children.add(index) };
433
434    // catch what we can
435    if arr_ptr.is_null() {
436        polars_bail!(ComputeError:
437            "an array of type {dtype:?}
438            must have a non-null child {index}"
439        )
440    }
441
442    // SAFETY: invariant of this function
443    let arr_ptr = unsafe { &*arr_ptr };
444    Ok(ArrowArrayChild::new(arr_ptr, dtype, parent))
445}
446
447/// # Safety
448///
449/// This function is safe iff:
450/// * `array.dictionary` is valid
451/// * `array.dictionary` is not mutably shared for the lifetime of `parent`
452unsafe fn create_dictionary(
453    array: &ArrowArray,
454    dtype: &ArrowDataType,
455    parent: InternalArrowArray,
456) -> PolarsResult<Option<ArrowArrayChild<'static>>> {
457    if let ArrowDataType::Dictionary(_, values, _) = dtype {
458        let dtype = values.as_ref().clone();
459        // catch what we can
460        if array.dictionary.is_null() {
461            polars_bail!(ComputeError:
462                "an array of type {dtype:?}
463                must have a non-null dictionary"
464            )
465        }
466
467        // SAFETY: part of the invariant
468        let array = unsafe { &*array.dictionary };
469        Ok(Some(ArrowArrayChild::new(array, dtype, parent)))
470    } else {
471        Ok(None)
472    }
473}
474
475pub trait ArrowArrayRef: std::fmt::Debug {
476    fn owner(&self) -> InternalArrowArray {
477        (*self.parent()).clone()
478    }
479
480    /// returns the null bit buffer.
481    /// Rust implementation uses a buffer that is not part of the array of buffers.
482    /// The C Data interface's null buffer is part of the array of buffers.
483    ///
484    /// # Safety
485    /// The caller must guarantee that the buffer `index` corresponds to a bitmap.
486    /// This function assumes that the bitmap created from FFI is valid; this is impossible to prove.
487    unsafe fn validity(&self) -> PolarsResult<Option<Bitmap>> {
488        if self.array().null_count() == 0 {
489            Ok(None)
490        } else {
491            create_bitmap(self.array(), self.dtype(), self.owner(), 0, true).map(Some)
492        }
493    }
494
495    /// # Safety
496    /// The caller must guarantee that the buffer `index` corresponds to a buffer.
497    /// This function assumes that the buffer created from FFI is valid; this is impossible to prove.
498    unsafe fn buffer<T: NativeType>(&self, index: usize) -> PolarsResult<Buffer<T>> {
499        create_buffer::<T>(self.array(), self.dtype(), self.owner(), index)
500    }
501
502    /// # Safety
503    /// The caller must guarantee that the buffer `index` corresponds to a buffer.
504    /// This function assumes that the buffer created from FFI is valid; this is impossible to prove.
505    unsafe fn buffer_known_len<T: NativeType>(
506        &self,
507        index: usize,
508        len: usize,
509    ) -> PolarsResult<Buffer<T>> {
510        create_buffer_known_len::<T>(self.array(), self.dtype(), self.owner(), len, index)
511    }
512
513    /// # Safety
514    /// This function is safe iff:
515    /// * the buffer at position `index` is valid for the declared length
516    /// * the buffers' pointer is not mutable for the lifetime of `owner`
517    unsafe fn bitmap(&self, index: usize) -> PolarsResult<Bitmap> {
518        create_bitmap(self.array(), self.dtype(), self.owner(), index, false)
519    }
520
521    /// # Safety
522    /// * `array.children` at `index` is valid
523    /// * `array.children` is not mutably shared for the lifetime of `parent`
524    /// * the pointer of `array.children` at `index` is valid
525    /// * the pointer of `array.children` at `index` is not mutably shared for the lifetime of `parent`
526    unsafe fn child(&self, index: usize) -> PolarsResult<ArrowArrayChild> {
527        create_child(self.array(), self.dtype(), self.parent().clone(), index)
528    }
529
530    unsafe fn dictionary(&self) -> PolarsResult<Option<ArrowArrayChild>> {
531        create_dictionary(self.array(), self.dtype(), self.parent().clone())
532    }
533
534    fn n_buffers(&self) -> usize;
535
536    fn offset(&self) -> usize;
537    fn length(&self) -> usize;
538
539    fn parent(&self) -> &InternalArrowArray;
540    fn array(&self) -> &ArrowArray;
541    fn dtype(&self) -> &ArrowDataType;
542}
543
/// Struct used to move an Array from and to the C Data Interface.
/// Its main responsibility is to expose functionality that requires
/// both [ArrowArray] and [ArrowSchema].
///
/// This struct has two main paths:
///
/// ## Import from the C Data Interface
/// * [InternalArrowArray::empty] to allocate memory to be filled by an external call
/// * [InternalArrowArray::try_from_raw] to consume two non-null allocated pointers
/// ## Export to the C Data Interface
/// * [InternalArrowArray::try_new] to create a new [InternalArrowArray] from Rust-specific information
/// * [InternalArrowArray::into_raw] to expose two pointers for [ArrowArray] and [ArrowSchema].
///
/// NOTE(review): only [InternalArrowArray::new] is defined in this file; the methods listed
/// above are not visible here — confirm that they still exist or update this list.
///
/// # Safety
/// Whoever creates this struct is responsible for releasing their resources. Specifically,
/// consumers *must* call [InternalArrowArray::into_raw] and take ownership of the individual pointers,
/// calling [ArrowArray::release] and [ArrowSchema::release] accordingly.
///
/// Furthermore, this struct assumes that the incoming data agrees with the C data interface.
#[derive(Debug, Clone)]
pub struct InternalArrowArray {
    // Arc is used for sharability since this is immutable
    array: Arc<ArrowArray>,
    // Arced to reduce cost of cloning
    dtype: Arc<ArrowDataType>,
}
570
571impl InternalArrowArray {
572    pub fn new(array: ArrowArray, dtype: ArrowDataType) -> Self {
573        Self {
574            array: Arc::new(array),
575            dtype: Arc::new(dtype),
576        }
577    }
578}
579
580impl ArrowArrayRef for InternalArrowArray {
581    /// the dtype as declared in the schema
582    fn dtype(&self) -> &ArrowDataType {
583        &self.dtype
584    }
585
586    fn parent(&self) -> &InternalArrowArray {
587        self
588    }
589
590    fn array(&self) -> &ArrowArray {
591        self.array.as_ref()
592    }
593
594    fn n_buffers(&self) -> usize {
595        self.array.n_buffers as usize
596    }
597
598    fn offset(&self) -> usize {
599        self.array.offset as usize
600    }
601
602    fn length(&self) -> usize {
603        self.array.length as usize
604    }
605}
606
/// A borrowed child (or dictionary) of an FFI array, bundled with its own dtype and the
/// top-level [`InternalArrowArray`] it was obtained from.
#[derive(Debug)]
pub struct ArrowArrayChild<'a> {
    /// The FFI struct of the child itself.
    array: &'a ArrowArray,
    /// The dtype of the child.
    dtype: ArrowDataType,
    /// The top-level array this child was read from; cloned into imported buffers as their owner.
    parent: InternalArrowArray,
}
613
614impl ArrowArrayRef for ArrowArrayChild<'_> {
615    /// the dtype as declared in the schema
616    fn dtype(&self) -> &ArrowDataType {
617        &self.dtype
618    }
619
620    fn parent(&self) -> &InternalArrowArray {
621        &self.parent
622    }
623
624    fn array(&self) -> &ArrowArray {
625        self.array
626    }
627
628    fn n_buffers(&self) -> usize {
629        self.array.n_buffers as usize
630    }
631
632    fn offset(&self) -> usize {
633        self.array.offset as usize
634    }
635
636    fn length(&self) -> usize {
637        self.array.length as usize
638    }
639}
640
641impl<'a> ArrowArrayChild<'a> {
642    fn new(array: &'a ArrowArray, dtype: ArrowDataType, parent: InternalArrowArray) -> Self {
643        Self {
644            array,
645            dtype,
646            parent,
647        }
648    }
649}