use std::{
ffi::CStr,
ffi::CString,
iter,
mem::size_of,
ptr::{self, NonNull},
sync::Arc,
};
use crate::buffer::Buffer;
use crate::datatypes::{DataType, DateUnit, TimeUnit};
use crate::error::{ArrowError, Result};
use crate::util::bit_util;
/// C-layout (`#[repr(C)]`) description of an array's type that is exchanged
/// across the FFI boundary.
///
/// NOTE(review): the field set presumably mirrors `struct ArrowSchema` from the
/// Arrow C data interface; confirm field order against the specification
/// before changing anything here.
#[repr(C)]
#[derive(Debug)]
pub struct FFI_ArrowSchema {
/// Nul-terminated format string; decoded by `format()`.
format: *const ::std::os::raw::c_char,
/// Always null when the schema is created by this module.
name: *const ::std::os::raw::c_char,
/// Always null when the schema is created by this module.
metadata: *const ::std::os::raw::c_char,
/// Always `0` when the schema is created by this module.
flags: i64,
/// Always `0` when the schema is created by this module.
n_children: i64,
/// Always null when the schema is created by this module.
children: *mut *mut FFI_ArrowSchema,
/// Always null when the schema is created by this module.
dictionary: *mut FFI_ArrowSchema,
/// Callback invoked from `Drop`; `None` means there is nothing to release.
release: ::std::option::Option<unsafe extern "C" fn(arg1: *mut FFI_ArrowSchema)>,
/// Always null when the schema is created by this module.
private_data: *mut ::std::os::raw::c_void,
}
/// Release callback installed by `FFI_ArrowSchema::new`.
///
/// Frees the format string that `new` leaked with `CString::into_raw` and
/// marks the schema as released by clearing its `release` callback.
///
/// A null `schema` is ignored, matching the behaviour of `release_array`.
///
/// # Safety
///
/// `schema` must be null or point to a valid `FFI_ArrowSchema` whose `format`
/// field is either null or a pointer obtained from `CString::into_raw`.
unsafe extern "C" fn release_schema(schema: *mut FFI_ArrowSchema) {
    if schema.is_null() {
        return;
    }
    let schema = &mut *schema;
    if !schema.format.is_null() {
        // Re-take ownership of the string so that it is deallocated here.
        drop(CString::from_raw(
            schema.format as *mut std::os::raw::c_char,
        ));
        // Do not leave a dangling pointer behind in the released struct.
        schema.format = std::ptr::null();
    }
    schema.release = None;
}
impl FFI_ArrowSchema {
    /// Creates a schema that owns a heap-allocated copy of `format`.
    ///
    /// The string is leaked with `CString::into_raw` and reclaimed by
    /// `release_schema`, which is installed as the release callback.
    ///
    /// # Panics
    ///
    /// Panics if `format` contains an interior nul byte.
    fn new(format: &str) -> FFI_ArrowSchema {
        FFI_ArrowSchema {
            format: CString::new(format).unwrap().into_raw(),
            name: ptr::null(),
            metadata: ptr::null(),
            flags: 0,
            n_children: 0,
            children: ptr::null_mut(),
            dictionary: ptr::null_mut(),
            release: Some(release_schema),
            private_data: ptr::null_mut(),
        }
    }

    /// Creates a schema with every pointer null and no release callback.
    ///
    /// Such a schema has no format string, so `format()` must not be called
    /// on it before the format pointer has been filled in.
    fn empty() -> Self {
        Self {
            format: ptr::null(),
            name: ptr::null(),
            metadata: ptr::null(),
            flags: 0,
            n_children: 0,
            children: ptr::null_mut(),
            dictionary: ptr::null_mut(),
            release: None,
            private_data: ptr::null_mut(),
        }
    }

    /// Returns the format string of this schema.
    ///
    /// # Panics
    ///
    /// Panics if the format pointer is null (e.g. on a schema built by
    /// `empty()` that was never populated) or if the string is not valid
    /// UTF-8. Panicking on null replaces what would otherwise be undefined
    /// behaviour inside `CStr::from_ptr`.
    pub fn format(&self) -> &str {
        assert!(
            !self.format.is_null(),
            "The external API has a null pointer as format"
        );
        // The pointer is non-null (checked above); it is expected to point to
        // a nul-terminated string that lives at least as long as `self`.
        unsafe { CStr::from_ptr(self.format) }
            .to_str()
            .expect("The external API has a non-utf8 as format")
    }
}
/// Runs the schema's release callback, if one is still installed.
impl Drop for FFI_ArrowSchema {
    fn drop(&mut self) {
        if let Some(callback) = self.release {
            // The callback receives a pointer to this very struct.
            unsafe { callback(self) }
        }
    }
}
/// Maps a format string to the corresponding [`DataType`].
///
/// # Errors
///
/// Returns `ArrowError::CDataInterface` naming the offending format string
/// when it is not one of the formats handled below.
fn to_datatype(format: &str) -> Result<DataType> {
    Ok(match format {
        "n" => DataType::Null,
        "b" => DataType::Boolean,
        "c" => DataType::Int8,
        "C" => DataType::UInt8,
        "s" => DataType::Int16,
        "S" => DataType::UInt16,
        "i" => DataType::Int32,
        "I" => DataType::UInt32,
        "l" => DataType::Int64,
        "L" => DataType::UInt64,
        "e" => DataType::Float16,
        "f" => DataType::Float32,
        "g" => DataType::Float64,
        "z" => DataType::Binary,
        "Z" => DataType::LargeBinary,
        "u" => DataType::Utf8,
        "U" => DataType::LargeUtf8,
        "tdD" => DataType::Date32(DateUnit::Day),
        "tdm" => DataType::Date64(DateUnit::Millisecond),
        "tts" => DataType::Time32(TimeUnit::Second),
        "ttm" => DataType::Time32(TimeUnit::Millisecond),
        "ttu" => DataType::Time64(TimeUnit::Microsecond),
        "ttn" => DataType::Time64(TimeUnit::Nanosecond),
        other => {
            // `format!` (not `.to_string()`) so that the placeholder is
            // actually replaced by the unsupported format string.
            return Err(ArrowError::CDataInterface(format!(
                "The datatype \"{}\" is still not supported in Rust implementation",
                other
            )));
        }
    })
}
/// Produces the format string that encodes `datatype`.
///
/// # Errors
///
/// Returns `ArrowError::CDataInterface` for every data type that has no
/// entry in the table below.
fn from_datatype(datatype: &DataType) -> Result<String> {
    let code = match datatype {
        DataType::Null => "n",
        DataType::Boolean => "b",
        DataType::Int8 => "c",
        DataType::UInt8 => "C",
        DataType::Int16 => "s",
        DataType::UInt16 => "S",
        DataType::Int32 => "i",
        DataType::UInt32 => "I",
        DataType::Int64 => "l",
        DataType::UInt64 => "L",
        DataType::Float16 => "e",
        DataType::Float32 => "f",
        DataType::Float64 => "g",
        DataType::Binary => "z",
        DataType::LargeBinary => "Z",
        DataType::Utf8 => "u",
        DataType::LargeUtf8 => "U",
        DataType::Date32(DateUnit::Day) => "tdD",
        DataType::Date64(DateUnit::Millisecond) => "tdm",
        DataType::Time32(TimeUnit::Second) => "tts",
        DataType::Time32(TimeUnit::Millisecond) => "ttm",
        DataType::Time64(TimeUnit::Microsecond) => "ttu",
        DataType::Time64(TimeUnit::Nanosecond) => "ttn",
        unsupported => {
            return Err(ArrowError::CDataInterface(format!(
                "The datatype \"{:?}\" is still not supported in Rust implementation",
                unsupported
            )));
        }
    };
    Ok(String::from(code))
}
fn bit_width(data_type: &DataType, i: usize) -> Result<usize> {
Ok(match (data_type, i) {
(_, 0) => 1,
(DataType::Boolean, 1) => 1,
(DataType::UInt8, 1) => size_of::<u8>() * 8,
(DataType::UInt16, 1) => size_of::<u16>() * 8,
(DataType::UInt32, 1) => size_of::<u32>() * 8,
(DataType::UInt64, 1) => size_of::<u64>() * 8,
(DataType::Int8, 1) => size_of::<i8>() * 8,
(DataType::Int16, 1) => size_of::<i16>() * 8,
(DataType::Int32, 1) | (DataType::Date32(_), 1) | (DataType::Time32(_), 1) => size_of::<i32>() * 8,
(DataType::Int64, 1) | (DataType::Date64(_), 1) | (DataType::Time64(_), 1) => size_of::<i64>() * 8,
(DataType::Float32, 1) => size_of::<f32>() * 8,
(DataType::Float64, 1) => size_of::<f64>() * 8,
(DataType::Boolean, _) |
(DataType::UInt8, _) |
(DataType::UInt16, _) |
(DataType::UInt32, _) |
(DataType::UInt64, _) |
(DataType::Int8, _) |
(DataType::Int16, _) |
(DataType::Int32, _) | (DataType::Date32(_), _) | (DataType::Time32(_), _) |
(DataType::Int64, _) | (DataType::Date64(_), _) | (DataType::Time64(_), _) |
(DataType::Float32, _) |
(DataType::Float64, _) => {
return Err(ArrowError::CDataInterface(format!(
"The datatype \"{:?}\" expects 2 buffers, but requested {}. Please verify that the C data interface is correctly implemented.",
data_type, i
)))
}
(DataType::Utf8, 1) | (DataType::Binary, 1) => size_of::<i32>() * 8,
(DataType::Utf8, 2) | (DataType::Binary, 2) => size_of::<u8>() * 8,
(DataType::Utf8, _) | (DataType::Binary, _) => {
return Err(ArrowError::CDataInterface(format!(
"The datatype \"{:?}\" expects 3 buffers, but requested {}. Please verify that the C data interface is correctly implemented.",
data_type, i
)))
}
(DataType::LargeUtf8, 1) | (DataType::LargeBinary, 1) => size_of::<i64>() * 8,
(DataType::LargeUtf8, 2) | (DataType::LargeBinary, 2) => size_of::<u8>() * 8,
(DataType::LargeUtf8, _) | (DataType::LargeBinary, _) => {
return Err(ArrowError::CDataInterface(format!(
"The datatype \"{:?}\" expects 3 buffers, but requested {}. Please verify that the C data interface is correctly implemented.",
data_type, i
)))
}
_ => {
return Err(ArrowError::CDataInterface(format!(
"The datatype \"{:?}\" is still not supported in Rust implementation",
data_type
)))
}
})
}
/// C-layout (`#[repr(C)]`) description of an array's data that is exchanged
/// across the FFI boundary.
///
/// NOTE(review): the field set presumably mirrors `struct ArrowArray` from the
/// Arrow C data interface; confirm field order against the specification
/// before changing anything here.
#[repr(C)]
#[derive(Debug)]
pub struct FFI_ArrowArray {
/// Number of elements; exposed through `ArrowArray::len`.
pub(crate) length: i64,
/// Number of nulls; exposed through `ArrowArray::null_count`.
pub(crate) null_count: i64,
/// Element offset; exposed through `ArrowArray::offset`.
pub(crate) offset: i64,
/// Number of entries in `buffers`.
pub(crate) n_buffers: i64,
/// Always `0` when the array is created by this module.
pub(crate) n_children: i64,
/// Table of `n_buffers` buffer pointers; individual entries may be null.
pub(crate) buffers: *mut *const ::std::os::raw::c_void,
/// Always null when the array is created by this module.
children: *mut *mut FFI_ArrowArray,
/// Always null when the array is created by this module.
dictionary: *mut FFI_ArrowArray,
/// Callback invoked from `Drop`; `None` means there is nothing to release.
release: ::std::option::Option<unsafe extern "C" fn(arg1: *mut FFI_ArrowArray)>,
/// For arrays created by this module: a leaked `Box<PrivateData>`.
private_data: *mut ::std::os::raw::c_void,
}
/// Release callback installed by `FFI_ArrowArray::new`.
///
/// Reclaims the boxed `PrivateData` stored in `private_data` and clears the
/// `release` callback. A null `array` is a no-op.
unsafe extern "C" fn release_array(array: *mut FFI_ArrowArray) {
    if let Some(array) = array.as_mut() {
        // Rebuild the box so that the private data is dropped right here.
        drop(Box::from_raw(array.private_data as *mut PrivateData));
        array.release = None;
    }
}
/// State stored behind `FFI_ArrowArray::private_data` for arrays created by
/// `FFI_ArrowArray::new`; it is dropped by `release_array`.
struct PrivateData {
// The buffers themselves, held so that they stay alive until release.
buffers: Vec<Option<Buffer>>,
// The pointer table that `FFI_ArrowArray::buffers` points into.
buffers_ptr: Box<[*const std::os::raw::c_void]>,
}
impl FFI_ArrowArray {
    /// Builds an array that exposes `buffers` through a raw pointer table.
    ///
    /// Both the buffers and the pointer table are moved into a boxed
    /// `PrivateData`, which is leaked into `private_data` and reclaimed by
    /// `release_array`. A `None` buffer is exported as a null pointer.
    ///
    /// `n_buffers` is stored as given; it is not checked against
    /// `buffers.len()`.
    unsafe fn new(
        length: i64,
        null_count: i64,
        offset: i64,
        n_buffers: i64,
        buffers: Vec<Option<Buffer>>,
    ) -> Self {
        let buffers_ptr: Box<[*const std::os::raw::c_void]> = buffers
            .iter()
            .map(|slot| {
                slot.as_ref().map_or(std::ptr::null(), |buffer| {
                    buffer.as_ptr() as *const std::os::raw::c_void
                })
            })
            .collect();

        // Taken before the box is moved; the heap allocation itself stays put.
        let table = buffers_ptr.as_ptr() as *mut *const std::ffi::c_void;

        let owner = Box::into_raw(Box::new(PrivateData {
            buffers,
            buffers_ptr,
        }));

        Self {
            length,
            null_count,
            offset,
            n_buffers,
            n_children: 0,
            buffers: table,
            children: std::ptr::null_mut(),
            dictionary: std::ptr::null_mut(),
            release: Some(release_array),
            private_data: owner as *mut ::std::os::raw::c_void,
        }
    }

    /// Builds an array with zero counts, null pointers and no release callback.
    fn empty() -> Self {
        Self {
            release: None,
            length: 0,
            null_count: 0,
            offset: 0,
            n_buffers: 0,
            n_children: 0,
            buffers: std::ptr::null_mut(),
            children: std::ptr::null_mut(),
            dictionary: std::ptr::null_mut(),
            private_data: std::ptr::null_mut(),
        }
    }
}
/// Wraps entry `index` of the array's buffer table in a [`Buffer`] of `len`
/// bytes; `array` is handed over to `Buffer::from_unowned` together with the
/// pointer.
///
/// Returns `None` when the buffer table itself or the selected entry is null.
///
/// # Panics
///
/// Panics if the table is non-null and `index` is not below `n_buffers`.
unsafe fn create_buffer(
    array: Arc<FFI_ArrowArray>,
    index: usize,
    len: usize,
) -> Option<Buffer> {
    let table = array.buffers as *mut *const u8;
    if table.is_null() {
        return None;
    }
    assert!(index < array.n_buffers as usize);
    let data = NonNull::new(*table.add(index) as *mut u8)?;
    Some(Buffer::from_unowned(data, len, array))
}
/// Runs the array's release callback, if one is still installed.
impl Drop for FFI_ArrowArray {
    fn drop(&mut self) {
        if let Some(callback) = self.release {
            // The callback receives a pointer to this very struct.
            unsafe { callback(self) }
        }
    }
}
/// Pairs a reference-counted [`FFI_ArrowArray`] with the reference-counted
/// [`FFI_ArrowSchema`] that describes its type.
#[derive(Debug)]
pub struct ArrowArray {
// Data half: lengths, offset and the buffer pointer table.
array: Arc<FFI_ArrowArray>,
// Type half: carries the format string read by `data_type()`.
schema: Arc<FFI_ArrowSchema>,
}
impl ArrowArray {
    /// Creates a new `ArrowArray` that exports the given buffers.
    ///
    /// `null_buffer` becomes buffer 0 and `buffers` follow in order.
    /// `_child_data` is currently ignored.
    ///
    /// # Errors
    ///
    /// Returns an error when `data_type` has no format string (see
    /// `from_datatype`).
    ///
    /// # Safety
    ///
    /// NOTE(review): nothing here validates that `len`, `null_count`, `offset`
    /// and the buffers are consistent with `data_type`; the caller is
    /// presumably responsible for that.
    pub unsafe fn try_new(
        data_type: &DataType,
        len: usize,
        null_count: usize,
        null_buffer: Option<Buffer>,
        offset: usize,
        buffers: Vec<Buffer>,
        _child_data: Vec<ArrowArray>,
    ) -> Result<Self> {
        let format = from_datatype(data_type)?;
        // `buffers` is owned, so move its elements instead of cloning them.
        let new_buffers = iter::once(null_buffer)
            .chain(buffers.into_iter().map(Some))
            .collect::<Vec<_>>();
        let schema = Arc::new(FFI_ArrowSchema::new(&format));
        let array = Arc::new(FFI_ArrowArray::new(
            len as i64,
            null_count as i64,
            offset as i64,
            new_buffers.len() as i64,
            new_buffers,
        ));
        Ok(ArrowArray { schema, array })
    }

    /// Re-creates an `ArrowArray` from the two raw pointers.
    ///
    /// # Errors
    ///
    /// Returns `ArrowError::MemoryError` when either pointer is null.
    ///
    /// # Safety
    ///
    /// Both pointers are passed to `Arc::from_raw`, so each must have been
    /// obtained from `Arc::into_raw` (as done by [`ArrowArray::into_raw`]) and
    /// must not be used to rebuild an `Arc` more than once.
    pub unsafe fn try_from_raw(
        array: *const FFI_ArrowArray,
        schema: *const FFI_ArrowSchema,
    ) -> Result<Self> {
        if array.is_null() || schema.is_null() {
            return Err(ArrowError::MemoryError(
                "At least one of the pointers passed to `try_from_raw` is null"
                    .to_string(),
            ));
        };
        Ok(Self {
            array: Arc::from_raw(array as *mut FFI_ArrowArray),
            schema: Arc::from_raw(schema as *mut FFI_ArrowSchema),
        })
    }

    /// Creates an `ArrowArray` whose array and schema are both empty.
    ///
    /// # Safety
    ///
    /// The resulting schema has a null format pointer, so `data_type()` (and
    /// everything built on it) must not be used until it has been populated.
    pub unsafe fn empty() -> Self {
        let schema = Arc::new(FFI_ArrowSchema::empty());
        let array = Arc::new(FFI_ArrowArray::empty());
        ArrowArray { schema, array }
    }

    /// Consumes `this` and returns the raw pointers of its two `Arc`s.
    ///
    /// The memory is leaked unless the pointers are eventually given back to
    /// [`ArrowArray::try_from_raw`].
    pub fn into_raw(this: ArrowArray) -> (*const FFI_ArrowArray, *const FFI_ArrowSchema) {
        (Arc::into_raw(this.array), Arc::into_raw(this.schema))
    }

    /// Number of slots physically present in the buffers.
    ///
    /// The buffers are not sliced by `offset`: the logical array of `length`
    /// elements starts `offset` elements into them, so they span
    /// `offset + length` slots.
    fn slot_count(&self) -> usize {
        self.array.length as usize + self.array.offset as usize
    }

    /// Returns the null bitmap (buffer 0), or `None` when it is absent.
    pub fn null_bit_buffer(&self) -> Option<Buffer> {
        // One bit per physical slot, including the slots skipped by `offset`.
        let buffer_len = bit_util::ceil(self.slot_count(), 8);
        unsafe { create_buffer(self.array.clone(), 0, buffer_len) }
    }

    /// Computes the length, in bytes, of buffer `i`.
    ///
    /// For the data buffer (index 2) of string/binary arrays the length is the
    /// last entry of the offsets buffer (index 1), which is read through the
    /// raw pointer and must therefore be non-null.
    fn buffer_len(&self, i: usize) -> Result<usize> {
        let data_type = &self.data_type()?;
        Ok(match (data_type, i) {
            (DataType::Utf8, 1)
            | (DataType::LargeUtf8, 1)
            | (DataType::Binary, 1)
            | (DataType::LargeBinary, 1) => {
                let bits = bit_width(data_type, i)?;
                debug_assert_eq!(bits % 8, 0);
                // Offsets buffers hold one more entry than there are slots.
                (self.slot_count() + 1) * (bits / 8)
            }
            (DataType::Utf8, 2) | (DataType::Binary, 2) => {
                let len = self.buffer_len(1)?;
                // The offsets are `i32`s stored behind a `*const u8` table entry.
                #[allow(clippy::cast_ptr_alignment)]
                let offset_buffer = unsafe {
                    *(self.array.buffers as *mut *const u8).add(1) as *const i32
                };
                // The last offset is the total length of the data buffer.
                (unsafe { *offset_buffer.add(len / size_of::<i32>() - 1) }) as usize
            }
            (DataType::LargeUtf8, 2) | (DataType::LargeBinary, 2) => {
                let len = self.buffer_len(1)?;
                // The offsets are `i64`s stored behind a `*const u8` table entry.
                #[allow(clippy::cast_ptr_alignment)]
                let offset_buffer = unsafe {
                    *(self.array.buffers as *mut *const u8).add(1) as *const i64
                };
                // The last offset is the total length of the data buffer.
                (unsafe { *offset_buffer.add(len / size_of::<i64>() - 1) }) as usize
            }
            _ => {
                let bits = bit_width(data_type, i)?;
                bit_util::ceil(self.slot_count() * bits, 8)
            }
        })
    }

    /// Returns every buffer except the null bitmap (i.e. buffers `1..n_buffers`).
    ///
    /// # Errors
    ///
    /// Returns an error when a buffer length cannot be computed or when one of
    /// the buffers is null.
    pub fn buffers(&self) -> Result<Vec<Buffer>> {
        (0..self.array.n_buffers - 1)
            .map(|index| {
                // Skip buffer 0, which is the null bitmap.
                let index = (index + 1) as usize;
                let len = self.buffer_len(index)?;
                unsafe { create_buffer(self.array.clone(), index, len) }.ok_or_else(
                    || {
                        ArrowError::CDataInterface(format!(
                            "The external buffer at position {} is null.",
                            index - 1
                        ))
                    },
                )
            })
            .collect()
    }

    /// Returns the number of elements of the array.
    pub fn len(&self) -> usize {
        self.array.length as usize
    }

    /// Returns `true` when the array has no elements.
    pub fn is_empty(&self) -> bool {
        self.array.length == 0
    }

    /// Returns the offset of the array.
    pub fn offset(&self) -> usize {
        self.array.offset as usize
    }

    /// Returns the number of nulls of the array.
    pub fn null_count(&self) -> usize {
        self.array.null_count as usize
    }

    /// Returns the [`DataType`] decoded from the schema's format string.
    ///
    /// # Errors
    ///
    /// Returns an error when the format is not supported (see `to_datatype`).
    pub fn data_type(&self) -> Result<DataType> {
        to_datatype(self.schema.format())
    }
}
// Round-trip tests: each one converts an array into an `ArrowArray`, converts
// that back into `ArrayData`, rebuilds an array from it and runs a compute
// kernel on the result. The `TryFrom` conversions used here are implemented
// outside of this file.
#[cfg(test)]
mod tests {
use super::*;
use crate::array::{
make_array, Array, ArrayData, BinaryOffsetSizeTrait, BooleanArray,
GenericBinaryArray, GenericStringArray, Int32Array, StringOffsetSizeTrait,
Time32MillisecondArray,
};
use crate::compute::kernels;
use std::convert::TryFrom;
use std::sync::Arc;
// Int32 round trip; adding the imported array to itself must give [2, 4, 6].
#[test]
fn test_round_trip() -> Result<()> {
let array = Int32Array::from(vec![1, 2, 3]);
let array = ArrowArray::try_from(array.data().as_ref().clone())?;
let data = Arc::new(ArrayData::try_from(array)?);
let array = make_array(data);
let array = array.as_any().downcast_ref::<Int32Array>().unwrap();
let array = kernels::arithmetic::add(&array, &array).unwrap();
assert_eq!(array, Int32Array::from(vec![2, 4, 6]));
Ok(())
}
// Shared body of the string tests, generic over the offset type; the imported
// array (which contains a null) is concatenated with itself.
fn test_generic_string<Offset: StringOffsetSizeTrait>() -> Result<()> {
let array =
GenericStringArray::<Offset>::from(vec![Some("a"), None, Some("aaa")]);
let array = ArrowArray::try_from(array.data().as_ref().clone())?;
let data = Arc::new(ArrayData::try_from(array)?);
let array = make_array(data);
let array = kernels::concat::concat(&[array.as_ref(), array.as_ref()]).unwrap();
let array = array
.as_any()
.downcast_ref::<GenericStringArray<Offset>>()
.unwrap();
let expected = GenericStringArray::<Offset>::from(vec![
Some("a"),
None,
Some("aaa"),
Some("a"),
None,
Some("aaa"),
]);
assert_eq!(array, &expected);
Ok(())
}
// String round trip with `i32` offsets.
#[test]
fn test_string() -> Result<()> {
test_generic_string::<i32>()
}
// String round trip with `i64` offsets.
#[test]
fn test_large_string() -> Result<()> {
test_generic_string::<i64>()
}
// Shared body of the binary tests, generic over the offset type; the imported
// array (which contains a null) is concatenated with itself.
fn test_generic_binary<Offset: BinaryOffsetSizeTrait>() -> Result<()> {
let array: Vec<Option<&[u8]>> = vec![Some(b"a"), None, Some(b"aaa")];
let array = GenericBinaryArray::<Offset>::from(array);
let array = ArrowArray::try_from(array.data().as_ref().clone())?;
let data = Arc::new(ArrayData::try_from(array)?);
let array = make_array(data);
let array = kernels::concat::concat(&[array.as_ref(), array.as_ref()]).unwrap();
let array = array
.as_any()
.downcast_ref::<GenericBinaryArray<Offset>>()
.unwrap();
let expected: Vec<Option<&[u8]>> = vec![
Some(b"a"),
None,
Some(b"aaa"),
Some(b"a"),
None,
Some(b"aaa"),
];
let expected = GenericBinaryArray::<Offset>::from(expected);
assert_eq!(array, &expected);
Ok(())
}
// Binary round trip with `i32` offsets.
#[test]
fn test_binary() -> Result<()> {
test_generic_binary::<i32>()
}
// Binary round trip with `i64` offsets.
#[test]
fn test_large_binary() -> Result<()> {
test_generic_binary::<i64>()
}
// Boolean round trip; negating the imported array must keep the null in place.
#[test]
fn test_bool() -> Result<()> {
let array = BooleanArray::from(vec![None, Some(true), Some(false)]);
let array = ArrowArray::try_from(array.data().as_ref().clone())?;
let data = Arc::new(ArrayData::try_from(array)?);
let array = make_array(data);
let array = array.as_any().downcast_ref::<BooleanArray>().unwrap();
let array = kernels::boolean::not(&array)?;
assert_eq!(
array,
BooleanArray::from(vec![None, Some(false), Some(true)])
);
Ok(())
}
// Time32 (millisecond) round trip; the imported array is concatenated with
// itself.
#[test]
fn test_time32() -> Result<()> {
let array = Time32MillisecondArray::from(vec![None, Some(1), Some(2)]);
let array = ArrowArray::try_from(array.data().as_ref().clone())?;
let data = Arc::new(ArrayData::try_from(array)?);
let array = make_array(data);
let array = kernels::concat::concat(&[array.as_ref(), array.as_ref()]).unwrap();
let array = array
.as_any()
.downcast_ref::<Time32MillisecondArray>()
.unwrap();
assert_eq!(
array,
&Time32MillisecondArray::from(vec![
None,
Some(1),
Some(2),
None,
Some(1),
Some(2)
])
);
Ok(())
}
}