arrow2/ffi/
array.rs

1//! Contains functionality to load an ArrayData from the C Data Interface
2use std::sync::Arc;
3
4use crate::bitmap::utils::count_zeros;
5use crate::buffer::BytesAllocator;
6use crate::{
7    array::*,
8    bitmap::{utils::bytes_for, Bitmap},
9    buffer::{Buffer, Bytes},
10    datatypes::{DataType, PhysicalType},
11    error::{Error, Result},
12    ffi::schema::get_child,
13    types::NativeType,
14};
15
16use super::ArrowArray;
17
18/// Reads a valid `ffi` interface into a `Box<dyn Array>`
19/// # Errors
20/// If and only if:
21/// * the interface is not valid (e.g. a null pointer)
22pub unsafe fn try_from<A: ArrowArrayRef>(array: A) -> Result<Box<dyn Array>> {
23    use PhysicalType::*;
24    Ok(match array.data_type().to_physical_type() {
25        Null => Box::new(NullArray::try_from_ffi(array)?),
26        Boolean => Box::new(BooleanArray::try_from_ffi(array)?),
27        Primitive(primitive) => with_match_primitive_type!(primitive, |$T| {
28            Box::new(PrimitiveArray::<$T>::try_from_ffi(array)?)
29        }),
30        Utf8 => Box::new(Utf8Array::<i32>::try_from_ffi(array)?),
31        LargeUtf8 => Box::new(Utf8Array::<i64>::try_from_ffi(array)?),
32        Binary => Box::new(BinaryArray::<i32>::try_from_ffi(array)?),
33        LargeBinary => Box::new(BinaryArray::<i64>::try_from_ffi(array)?),
34        FixedSizeBinary => Box::new(FixedSizeBinaryArray::try_from_ffi(array)?),
35        List => Box::new(ListArray::<i32>::try_from_ffi(array)?),
36        LargeList => Box::new(ListArray::<i64>::try_from_ffi(array)?),
37        FixedSizeList => Box::new(FixedSizeListArray::try_from_ffi(array)?),
38        Struct => Box::new(StructArray::try_from_ffi(array)?),
39        Dictionary(key_type) => {
40            match_integer_type!(key_type, |$T| {
41                Box::new(DictionaryArray::<$T>::try_from_ffi(array)?)
42            })
43        }
44        Union => Box::new(UnionArray::try_from_ffi(array)?),
45        Map => Box::new(MapArray::try_from_ffi(array)?),
46    })
47}
48
// SAFETY: considered sound because the Arrow specification does not allow
// multiple implementations to change this struct.
// This cannot be proven from the Rust side: it holds only because all
// implementations agree on it as part of the Arrow specification.
unsafe impl Send for ArrowArray {}
unsafe impl Sync for ArrowArray {}
55
56impl Drop for ArrowArray {
57    fn drop(&mut self) {
58        match self.release {
59            None => (),
60            Some(release) => unsafe { release(self) },
61        };
62    }
63}
64
65// callback used to drop [ArrowArray] when it is exported
66unsafe extern "C" fn c_release_array(array: *mut ArrowArray) {
67    if array.is_null() {
68        return;
69    }
70    let array = &mut *array;
71
72    // take ownership of `private_data`, therefore dropping it
73    let private = Box::from_raw(array.private_data as *mut PrivateData);
74    for child in private.children_ptr.iter() {
75        let _ = Box::from_raw(*child);
76    }
77
78    if let Some(ptr) = private.dictionary_ptr {
79        let _ = Box::from_raw(ptr);
80    }
81
82    array.release = None;
83}
84
/// State stored behind `ArrowArray::private_data` for arrays exported by
/// `ArrowArray::new`; it is reclaimed and dropped by `c_release_array`.
// Some fields are only held, never read, hence `dead_code` is allowed.
#[allow(dead_code)]
struct PrivateData {
    /// The exported array. NOTE(review): presumably kept so that the memory
    /// behind the exported buffer pointers stays alive - confirm.
    array: Box<dyn Array>,
    /// Table of buffer pointers that `ArrowArray::buffers` points into.
    buffers_ptr: Box<[*const std::os::raw::c_void]>,
    /// Pointers to the exported children, each created with `Box::into_raw`;
    /// `ArrowArray::children` points into this table.
    children_ptr: Box<[*mut ArrowArray]>,
    /// Pointer to the exported dictionary (created with `Box::into_raw`), if any.
    dictionary_ptr: Option<*mut ArrowArray>,
}
92
93impl ArrowArray {
94    /// creates a new `ArrowArray` from existing data.
95    /// # Safety
96    /// This method releases `buffers`. Consumers of this struct *must* call `release` before
97    /// releasing this struct, or contents in `buffers` leak.
98    pub(crate) fn new(array: Box<dyn Array>) -> Self {
99        let (offset, buffers, children, dictionary) =
100            offset_buffers_children_dictionary(array.as_ref());
101
102        let buffers_ptr = buffers
103            .iter()
104            .map(|maybe_buffer| match maybe_buffer {
105                Some(b) => *b as *const std::os::raw::c_void,
106                None => std::ptr::null(),
107            })
108            .collect::<Box<[_]>>();
109        let n_buffers = buffers.len() as i64;
110
111        let children_ptr = children
112            .into_iter()
113            .map(|child| Box::into_raw(Box::new(ArrowArray::new(child))))
114            .collect::<Box<_>>();
115        let n_children = children_ptr.len() as i64;
116
117        let dictionary_ptr =
118            dictionary.map(|array| Box::into_raw(Box::new(ArrowArray::new(array))));
119
120        let length = array.len() as i64;
121        let null_count = array.null_count() as i64;
122
123        let mut private_data = Box::new(PrivateData {
124            array,
125            buffers_ptr,
126            children_ptr,
127            dictionary_ptr,
128        });
129
130        Self {
131            length,
132            null_count,
133            offset: offset as i64,
134            n_buffers,
135            n_children,
136            buffers: private_data.buffers_ptr.as_mut_ptr(),
137            children: private_data.children_ptr.as_mut_ptr(),
138            dictionary: private_data.dictionary_ptr.unwrap_or(std::ptr::null_mut()),
139            release: Some(c_release_array),
140            private_data: Box::into_raw(private_data) as *mut ::std::os::raw::c_void,
141        }
142    }
143
144    /// creates an empty [`ArrowArray`], which can be used to import data into
145    pub fn empty() -> Self {
146        Self {
147            length: 0,
148            null_count: 0,
149            offset: 0,
150            n_buffers: 0,
151            n_children: 0,
152            buffers: std::ptr::null_mut(),
153            children: std::ptr::null_mut(),
154            dictionary: std::ptr::null_mut(),
155            release: None,
156            private_data: std::ptr::null_mut(),
157        }
158    }
159
160    /// the length of the array
161    pub(crate) fn len(&self) -> usize {
162        self.length as usize
163    }
164
165    /// the offset of the array
166    pub(crate) fn offset(&self) -> usize {
167        self.offset as usize
168    }
169
170    /// the null count of the array
171    pub(crate) fn null_count(&self) -> usize {
172        self.null_count as usize
173    }
174}
175
176/// # Safety
177/// The caller must ensure that the buffer at index `i` is not mutably shared.
178unsafe fn get_buffer_ptr<T: NativeType>(
179    array: &ArrowArray,
180    data_type: &DataType,
181    index: usize,
182) -> Result<*mut T> {
183    if array.buffers.is_null() {
184        return Err(Error::oos(format!(
185            "An ArrowArray of type {data_type:?} must have non-null buffers"
186        )));
187    }
188
189    if array
190        .buffers
191        .align_offset(std::mem::align_of::<*mut *const u8>())
192        != 0
193    {
194        return Err(Error::oos(format!(
195            "An ArrowArray of type {data_type:?}
196            must have buffer {index} aligned to type {}",
197            std::any::type_name::<*mut *const u8>()
198        )));
199    }
200    let buffers = array.buffers as *mut *const u8;
201
202    if index >= array.n_buffers as usize {
203        return Err(Error::oos(format!(
204            "An ArrowArray of type {data_type:?} 
205             must have buffer {index}."
206        )));
207    }
208
209    let ptr = *buffers.add(index);
210    if ptr.is_null() {
211        return Err(Error::oos(format!(
212            "An array of type {data_type:?} 
213            must have a non-null buffer {index}"
214        )));
215    }
216
217    // note: we can't prove that this pointer is not mutably shared - part of the safety invariant
218    Ok(ptr as *mut T)
219}
220
221/// returns the buffer `i` of `array` interpreted as a [`Buffer`].
222/// # Safety
223/// This function is safe iff:
224/// * the buffers up to position `index` are valid for the declared length
225/// * the buffers' pointers are not mutably shared for the lifetime of `owner`
226unsafe fn create_buffer<T: NativeType>(
227    array: &ArrowArray,
228    data_type: &DataType,
229    owner: InternalArrowArray,
230    index: usize,
231) -> Result<Buffer<T>> {
232    let len = buffer_len(array, data_type, index)?;
233
234    if len == 0 {
235        return Ok(Buffer::new());
236    }
237
238    let offset = buffer_offset(array, data_type, index);
239    let ptr: *mut T = get_buffer_ptr(array, data_type, index)?;
240
241    // We have to check alignment.
242    // This is the zero-copy path.
243    if ptr.align_offset(std::mem::align_of::<T>()) == 0 {
244        let bytes = Bytes::from_foreign(ptr, len, BytesAllocator::InternalArrowArray(owner));
245        Ok(Buffer::from_bytes(bytes).sliced(offset, len - offset))
246    }
247    // This is the path where alignment isn't correct.
248    // We copy the data to a new vec
249    else {
250        let buf = std::slice::from_raw_parts(ptr, len - offset).to_vec();
251        Ok(Buffer::from(buf))
252    }
253}
254
255/// returns the buffer `i` of `array` interpreted as a [`Bitmap`].
256/// # Safety
257/// This function is safe iff:
258/// * the buffer at position `index` is valid for the declared length
259/// * the buffers' pointer is not mutable for the lifetime of `owner`
260unsafe fn create_bitmap(
261    array: &ArrowArray,
262    data_type: &DataType,
263    owner: InternalArrowArray,
264    index: usize,
265    // if this is the validity bitmap
266    // we can use the null count directly
267    is_validity: bool,
268) -> Result<Bitmap> {
269    let len: usize = array.length.try_into().expect("length to fit in `usize`");
270    if len == 0 {
271        return Ok(Bitmap::new());
272    }
273    let ptr = get_buffer_ptr(array, data_type, index)?;
274
275    // Pointer of u8 has alignment 1, so we don't have to check alignment.
276
277    let offset: usize = array.offset.try_into().expect("offset to fit in `usize`");
278    let bytes_len = bytes_for(offset + len);
279    let bytes = Bytes::from_foreign(ptr, bytes_len, BytesAllocator::InternalArrowArray(owner));
280
281    let null_count: usize = if is_validity {
282        array.null_count()
283    } else {
284        count_zeros(bytes.as_ref(), offset, len)
285    };
286    Bitmap::from_inner(Arc::new(bytes), offset, len, null_count)
287}
288
289fn buffer_offset(array: &ArrowArray, data_type: &DataType, i: usize) -> usize {
290    use PhysicalType::*;
291    match (data_type.to_physical_type(), i) {
292        (LargeUtf8, 2) | (LargeBinary, 2) | (Utf8, 2) | (Binary, 2) => 0,
293        (FixedSizeBinary, 1) => {
294            if let DataType::FixedSizeBinary(size) = data_type.to_logical_type() {
295                let offset: usize = array.offset.try_into().expect("Offset to fit in `usize`");
296                offset * *size
297            } else {
298                unreachable!()
299            }
300        }
301        _ => array.offset.try_into().expect("Offset to fit in `usize`"),
302    }
303}
304
305/// Returns the length, in slots, of the buffer `i` (indexed according to the C data interface)
306unsafe fn buffer_len(array: &ArrowArray, data_type: &DataType, i: usize) -> Result<usize> {
307    Ok(match (data_type.to_physical_type(), i) {
308        (PhysicalType::FixedSizeBinary, 1) => {
309            if let DataType::FixedSizeBinary(size) = data_type.to_logical_type() {
310                *size * (array.offset as usize + array.length as usize)
311            } else {
312                unreachable!()
313            }
314        }
315        (PhysicalType::FixedSizeList, 1) => {
316            if let DataType::FixedSizeList(_, size) = data_type.to_logical_type() {
317                *size * (array.offset as usize + array.length as usize)
318            } else {
319                unreachable!()
320            }
321        }
322        (PhysicalType::Utf8, 1)
323        | (PhysicalType::LargeUtf8, 1)
324        | (PhysicalType::Binary, 1)
325        | (PhysicalType::LargeBinary, 1)
326        | (PhysicalType::List, 1)
327        | (PhysicalType::LargeList, 1)
328        | (PhysicalType::Map, 1) => {
329            // the len of the offset buffer (buffer 1) equals length + 1
330            array.offset as usize + array.length as usize + 1
331        }
332        (PhysicalType::Utf8, 2) | (PhysicalType::Binary, 2) => {
333            // the len of the data buffer (buffer 2) equals the last value of the offset buffer (buffer 1)
334            let len = buffer_len(array, data_type, 1)?;
335            // first buffer is the null buffer => add(1)
336            let offset_buffer = unsafe { *(array.buffers as *mut *const u8).add(1) };
337            // interpret as i32
338            let offset_buffer = offset_buffer as *const i32;
339            // get last offset
340
341            (unsafe { *offset_buffer.add(len - 1) }) as usize
342        }
343        (PhysicalType::LargeUtf8, 2) | (PhysicalType::LargeBinary, 2) => {
344            // the len of the data buffer (buffer 2) equals the last value of the offset buffer (buffer 1)
345            let len = buffer_len(array, data_type, 1)?;
346            // first buffer is the null buffer => add(1)
347            let offset_buffer = unsafe { *(array.buffers as *mut *const u8).add(1) };
348            // interpret as i64
349            let offset_buffer = offset_buffer as *const i64;
350            // get last offset
351            (unsafe { *offset_buffer.add(len - 1) }) as usize
352        }
353        // buffer len of primitive types
354        _ => array.offset as usize + array.length as usize,
355    })
356}
357
358/// Safety
359/// This function is safe iff:
360/// * `array.children` at `index` is valid
361/// * `array.children` is not mutably shared for the lifetime of `parent`
362/// * the pointer of `array.children` at `index` is valid
363/// * the pointer of `array.children` at `index` is not mutably shared for the lifetime of `parent`
364unsafe fn create_child(
365    array: &ArrowArray,
366    data_type: &DataType,
367    parent: InternalArrowArray,
368    index: usize,
369) -> Result<ArrowArrayChild<'static>> {
370    let data_type = get_child(data_type, index)?;
371
372    // catch what we can
373    if array.children.is_null() {
374        return Err(Error::oos(format!(
375            "An ArrowArray of type {data_type:?} must have non-null children"
376        )));
377    }
378
379    if index >= array.n_children as usize {
380        return Err(Error::oos(format!(
381            "An ArrowArray of type {data_type:?} 
382             must have child {index}."
383        )));
384    }
385
386    // Safety - part of the invariant
387    let arr_ptr = unsafe { *array.children.add(index) };
388
389    // catch what we can
390    if arr_ptr.is_null() {
391        return Err(Error::oos(format!(
392            "An array of type {data_type:?}
393            must have a non-null child {index}"
394        )));
395    }
396
397    // Safety - invariant of this function
398    let arr_ptr = unsafe { &*arr_ptr };
399    Ok(ArrowArrayChild::new(arr_ptr, data_type, parent))
400}
401
402/// Safety
403/// This function is safe iff:
404/// * `array.dictionary` is valid
405/// * `array.dictionary` is not mutably shared for the lifetime of `parent`
406unsafe fn create_dictionary(
407    array: &ArrowArray,
408    data_type: &DataType,
409    parent: InternalArrowArray,
410) -> Result<Option<ArrowArrayChild<'static>>> {
411    if let DataType::Dictionary(_, values, _) = data_type {
412        let data_type = values.as_ref().clone();
413        // catch what we can
414        if array.dictionary.is_null() {
415            return Err(Error::oos(format!(
416                "An array of type {data_type:?}
417                must have a non-null dictionary"
418            )));
419        }
420
421        // safety: part of the invariant
422        let array = unsafe { &*array.dictionary };
423        Ok(Some(ArrowArrayChild::new(array, data_type, parent)))
424    } else {
425        Ok(None)
426    }
427}
428
429pub trait ArrowArrayRef: std::fmt::Debug {
430    fn owner(&self) -> InternalArrowArray {
431        (*self.parent()).clone()
432    }
433
434    /// returns the null bit buffer.
435    /// Rust implementation uses a buffer that is not part of the array of buffers.
436    /// The C Data interface's null buffer is part of the array of buffers.
437    /// # Safety
438    /// The caller must guarantee that the buffer `index` corresponds to a bitmap.
439    /// This function assumes that the bitmap created from FFI is valid; this is impossible to prove.
440    unsafe fn validity(&self) -> Result<Option<Bitmap>> {
441        if self.array().null_count() == 0 {
442            Ok(None)
443        } else {
444            create_bitmap(self.array(), self.data_type(), self.owner(), 0, true).map(Some)
445        }
446    }
447
448    /// # Safety
449    /// The caller must guarantee that the buffer `index` corresponds to a buffer.
450    /// This function assumes that the buffer created from FFI is valid; this is impossible to prove.
451    unsafe fn buffer<T: NativeType>(&self, index: usize) -> Result<Buffer<T>> {
452        create_buffer::<T>(self.array(), self.data_type(), self.owner(), index)
453    }
454
455    /// # Safety
456    /// This function is safe iff:
457    /// * the buffer at position `index` is valid for the declared length
458    /// * the buffers' pointer is not mutable for the lifetime of `owner`
459    unsafe fn bitmap(&self, index: usize) -> Result<Bitmap> {
460        create_bitmap(self.array(), self.data_type(), self.owner(), index, false)
461    }
462
463    /// # Safety
464    /// * `array.children` at `index` is valid
465    /// * `array.children` is not mutably shared for the lifetime of `parent`
466    /// * the pointer of `array.children` at `index` is valid
467    /// * the pointer of `array.children` at `index` is not mutably shared for the lifetime of `parent`
468    unsafe fn child(&self, index: usize) -> Result<ArrowArrayChild> {
469        create_child(self.array(), self.data_type(), self.parent().clone(), index)
470    }
471
472    unsafe fn dictionary(&self) -> Result<Option<ArrowArrayChild>> {
473        create_dictionary(self.array(), self.data_type(), self.parent().clone())
474    }
475
476    fn n_buffers(&self) -> usize;
477
478    fn parent(&self) -> &InternalArrowArray;
479    fn array(&self) -> &ArrowArray;
480    fn data_type(&self) -> &DataType;
481}
482
/// Struct used to move an Array from and to the C Data Interface.
/// Its main responsibility is to expose functionality that requires
/// both an [`ArrowArray`] and the [`DataType`] declared for it.
///
/// This struct has two main paths:
///
/// ## Import from the C Data Interface
/// * [InternalArrowArray::empty] to allocate memory to be filled by an external call
/// * [InternalArrowArray::try_from_raw] to consume two non-null allocated pointers
/// ## Export to the C Data Interface
/// * [InternalArrowArray::try_new] to create a new [InternalArrowArray] from Rust-specific information
/// * [InternalArrowArray::into_raw] to expose two pointers for [ArrowArray] and [ArrowSchema].
///
/// NOTE(review): of the functions listed above, none is defined in this file;
/// the only constructor visible here is [`InternalArrowArray::new`] - confirm
/// whether the list is stale.
///
/// # Safety
/// Whoever creates this struct is responsible for releasing their resources. Specifically,
/// consumers *must* call [InternalArrowArray::into_raw] and take ownership of the individual pointers,
/// calling [ArrowArray::release] and [ArrowSchema::release] accordingly.
///
/// Furthermore, this struct assumes that the incoming data agrees with the C data interface.
#[derive(Debug, Clone)]
pub struct InternalArrowArray {
    // Arc is used for sharability since this is immutable
    array: Arc<ArrowArray>,
    // Arced to reduce cost of cloning
    data_type: Arc<DataType>,
}
509
510impl InternalArrowArray {
511    pub fn new(array: ArrowArray, data_type: DataType) -> Self {
512        Self {
513            array: Arc::new(array),
514            data_type: Arc::new(data_type),
515        }
516    }
517}
518
519impl ArrowArrayRef for InternalArrowArray {
520    /// the data_type as declared in the schema
521    fn data_type(&self) -> &DataType {
522        &self.data_type
523    }
524
525    fn parent(&self) -> &InternalArrowArray {
526        self
527    }
528
529    fn array(&self) -> &ArrowArray {
530        self.array.as_ref()
531    }
532
533    fn n_buffers(&self) -> usize {
534        self.array.n_buffers as usize
535    }
536}
537
/// A borrowed [`ArrowArray`] (a child or a dictionary of another array)
/// together with its data type and the [`InternalArrowArray`] it was reached from.
#[derive(Debug)]
pub struct ArrowArrayChild<'a> {
    /// the borrowed child (or dictionary) array
    array: &'a ArrowArray,
    /// the data type of `array`
    data_type: DataType,
    /// the array this child was obtained from; returned by `parent()`
    parent: InternalArrowArray,
}
544
545impl<'a> ArrowArrayRef for ArrowArrayChild<'a> {
546    /// the data_type as declared in the schema
547    fn data_type(&self) -> &DataType {
548        &self.data_type
549    }
550
551    fn parent(&self) -> &InternalArrowArray {
552        &self.parent
553    }
554
555    fn array(&self) -> &ArrowArray {
556        self.array
557    }
558
559    fn n_buffers(&self) -> usize {
560        self.array.n_buffers as usize
561    }
562}
563
564impl<'a> ArrowArrayChild<'a> {
565    fn new(array: &'a ArrowArray, data_type: DataType, parent: InternalArrowArray) -> Self {
566        Self {
567            array,
568            data_type,
569            parent,
570        }
571    }
572}