pyo3_arrow/
array.rs

1use std::fmt::Display;
2use std::sync::Arc;
3
4use arrow_array::types::{
5    Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, UInt16Type, UInt32Type,
6    UInt64Type, UInt8Type,
7};
8use arrow_array::{
9    Array, ArrayRef, BinaryArray, BinaryViewArray, BooleanArray, Datum, LargeBinaryArray,
10    LargeStringArray, PrimitiveArray, StringArray, StringViewArray,
11};
12use arrow_cast::cast;
13use arrow_schema::{ArrowError, DataType, Field, FieldRef};
14use arrow_select::concat::concat;
15use arrow_select::take::take;
16use numpy::PyUntypedArray;
17use pyo3::exceptions::{PyIndexError, PyNotImplementedError, PyValueError};
18use pyo3::prelude::*;
19use pyo3::types::{PyCapsule, PyTuple, PyType};
20use pyo3::{intern, IntoPyObjectExt};
21
22#[cfg(feature = "buffer_protocol")]
23use crate::buffer::AnyBufferProtocol;
24use crate::error::PyArrowResult;
25use crate::export::{Arro3Array, Arro3DataType, Arro3Field};
26use crate::ffi::from_python::utils::import_array_pycapsules;
27use crate::ffi::to_python::nanoarrow::to_nanoarrow_array;
28use crate::ffi::{to_array_pycapsules, to_schema_pycapsule};
29use crate::input::AnyArray;
30use crate::interop::numpy::from_numpy::from_numpy;
31use crate::interop::numpy::to_numpy::to_numpy;
32use crate::scalar::PyScalar;
33use crate::{PyDataType, PyField};
34
35/// A Python-facing Arrow array.
36///
37/// This is a wrapper around an [ArrayRef] and a [FieldRef].
38///
39/// It's important for this to wrap both an array _and_ a field so that it can faithfully store all
40/// data transmitted via the `__arrow_c_array__` Python method, which [exports both an Array and a
41/// Field](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrow_c_array__).
42/// In particular, storing a [FieldRef] is required to persist Arrow extension metadata through the
43/// C Data Interface.
44#[derive(Debug)]
45#[pyclass(module = "arro3.core._core", name = "Array", subclass, frozen)]
46pub struct PyArray {
47    array: ArrayRef,
48    field: FieldRef,
49}
50
51impl PyArray {
52    /// Create a new Python Array from an [ArrayRef] and a [FieldRef].
53    ///
54    /// This will panic if the array's data type does not match the field's data type.
55    pub fn new(array: ArrayRef, field: FieldRef) -> Self {
56        Self::try_new(array, field).unwrap()
57    }
58
59    /// Create a new Python Array from an [ArrayRef] and a [FieldRef].
60    pub fn try_new(array: ArrayRef, field: FieldRef) -> Result<Self, ArrowError> {
61        // Note: if the array and field data types don't match, you'll get an obscure FFI
62        // exception, because you might be describing a different array than you're actually
63        // providing.
64        if array.data_type() != field.data_type() {
65            return Err(ArrowError::SchemaError(
66                format!("Array DataType must match Field DataType. Array DataType is {}; field DataType is {}", array.data_type(), field.data_type())
67            ));
68        }
69        Ok(Self { array, field })
70    }
71
72    /// Create a new PyArray from an [ArrayRef], inferring its data type automatically.
73    pub fn from_array_ref(array: ArrayRef) -> Self {
74        let field = Field::new("", array.data_type().clone(), true);
75        Self::new(array, Arc::new(field))
76    }
77
78    /// Import from raw Arrow capsules
79    pub fn from_arrow_pycapsule(
80        schema_capsule: &Bound<PyCapsule>,
81        array_capsule: &Bound<PyCapsule>,
82    ) -> PyResult<Self> {
83        let (array, field, _data_len) = import_array_pycapsules(schema_capsule, array_capsule)?;
84        Ok(Self::new(array, Arc::new(field)))
85    }
86
87    /// Access the underlying [ArrayRef].
88    pub fn array(&self) -> &ArrayRef {
89        &self.array
90    }
91
92    /// Access the underlying [FieldRef].
93    pub fn field(&self) -> &FieldRef {
94        &self.field
95    }
96
97    /// Consume self to access the underlying [ArrayRef] and [FieldRef].
98    pub fn into_inner(self) -> (ArrayRef, FieldRef) {
99        (self.array, self.field)
100    }
101
102    /// Export to an arro3.core.Array.
103    ///
104    /// This requires that you depend on arro3-core from your Python package.
105    pub fn to_arro3<'py>(&'py self, py: Python<'py>) -> PyResult<Bound<'py, PyAny>> {
106        let arro3_mod = py.import(intern!(py, "arro3.core"))?;
107        arro3_mod.getattr(intern!(py, "Array"))?.call_method1(
108            intern!(py, "from_arrow_pycapsule"),
109            self.__arrow_c_array__(py, None)?,
110        )
111    }
112
113    /// Export to an arro3.core.Array.
114    ///
115    /// This requires that you depend on arro3-core from your Python package.
116    pub fn into_arro3(self, py: Python) -> PyResult<Bound<PyAny>> {
117        let arro3_mod = py.import(intern!(py, "arro3.core"))?;
118        let array_capsules = to_array_pycapsules(py, self.field.clone(), &self.array, None)?;
119        arro3_mod
120            .getattr(intern!(py, "Array"))?
121            .call_method1(intern!(py, "from_arrow_pycapsule"), array_capsules)
122    }
123
124    /// Export this to a Python `nanoarrow.Array`.
125    pub fn to_nanoarrow<'py>(&'py self, py: Python<'py>) -> PyResult<Bound<'py, PyAny>> {
126        to_nanoarrow_array(py, self.__arrow_c_array__(py, None)?)
127    }
128
129    /// Export to a pyarrow.Array
130    ///
131    /// Requires pyarrow >=14
132    pub fn to_pyarrow<'py>(&'py self, py: Python<'py>) -> PyResult<Bound<'py, PyAny>> {
133        let pyarrow_mod = py.import(intern!(py, "pyarrow"))?;
134        let cloned = Self::new(self.array.clone(), self.field.clone());
135        pyarrow_mod
136            .getattr(intern!(py, "array"))?
137            .call1(PyTuple::new(py, vec![cloned.into_pyobject(py)?])?)
138    }
139}
140
141impl From<ArrayRef> for PyArray {
142    fn from(value: ArrayRef) -> Self {
143        Self::from_array_ref(value)
144    }
145}
146
147impl AsRef<ArrayRef> for PyArray {
148    fn as_ref(&self) -> &ArrayRef {
149        &self.array
150    }
151}
152
153impl Display for PyArray {
154    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
155        write!(f, "arro3.core.Array<")?;
156        self.array.data_type().fmt(f)?;
157        writeln!(f, ">")?;
158        Ok(())
159    }
160}
161
162impl Datum for PyArray {
163    fn get(&self) -> (&dyn Array, bool) {
164        (self.array.as_ref(), false)
165    }
166}
167
168#[pymethods]
169impl PyArray {
170    #[new]
171    #[pyo3(signature = (obj, /, r#type = None, *))]
172    pub(crate) fn init(obj: &Bound<PyAny>, r#type: Option<PyField>) -> PyResult<Self> {
173        if let Ok(data) = obj.extract::<PyArray>() {
174            return Ok(data);
175        }
176
177        macro_rules! impl_primitive {
178            ($rust_type:ty, $arrow_type:ty) => {{
179                let values: Vec<Option<$rust_type>> = obj.extract()?;
180                Arc::new(PrimitiveArray::<$arrow_type>::from(values))
181            }};
182        }
183
184        let field = r#type
185            .ok_or(PyValueError::new_err(
186                "type must be passed for non-Arrow input",
187            ))?
188            .into_inner();
189        let array: ArrayRef = match field.data_type() {
190            DataType::Float32 => impl_primitive!(f32, Float32Type),
191            DataType::Float64 => impl_primitive!(f64, Float64Type),
192            DataType::UInt8 => impl_primitive!(u8, UInt8Type),
193            DataType::UInt16 => impl_primitive!(u16, UInt16Type),
194            DataType::UInt32 => impl_primitive!(u32, UInt32Type),
195            DataType::UInt64 => impl_primitive!(u64, UInt64Type),
196            DataType::Int8 => impl_primitive!(i8, Int8Type),
197            DataType::Int16 => impl_primitive!(i16, Int16Type),
198            DataType::Int32 => impl_primitive!(i32, Int32Type),
199            DataType::Int64 => impl_primitive!(i64, Int64Type),
200            DataType::Boolean => {
201                let values: Vec<Option<bool>> = obj.extract()?;
202                Arc::new(BooleanArray::from(values))
203            }
204            DataType::Binary => {
205                let values: Vec<Option<Vec<u8>>> = obj.extract()?;
206                let slices = values
207                    .iter()
208                    .map(|maybe_vec| maybe_vec.as_ref().map(|vec| vec.as_slice()))
209                    .collect::<Vec<_>>();
210                Arc::new(BinaryArray::from(slices))
211            }
212            DataType::LargeBinary => {
213                let values: Vec<Option<Vec<u8>>> = obj.extract()?;
214                let slices = values
215                    .iter()
216                    .map(|maybe_vec| maybe_vec.as_ref().map(|vec| vec.as_slice()))
217                    .collect::<Vec<_>>();
218                Arc::new(LargeBinaryArray::from(slices))
219            }
220            DataType::BinaryView => {
221                let values: Vec<Option<Vec<u8>>> = obj.extract()?;
222                let slices = values
223                    .iter()
224                    .map(|maybe_vec| maybe_vec.as_ref().map(|vec| vec.as_slice()))
225                    .collect::<Vec<_>>();
226                Arc::new(BinaryViewArray::from(slices))
227            }
228            DataType::Utf8 => {
229                let values: Vec<Option<String>> = obj.extract()?;
230                Arc::new(StringArray::from(values))
231            }
232            DataType::LargeUtf8 => {
233                let values: Vec<Option<String>> = obj.extract()?;
234                Arc::new(LargeStringArray::from(values))
235            }
236            DataType::Utf8View => {
237                let values: Vec<Option<String>> = obj.extract()?;
238                Arc::new(StringViewArray::from(values))
239            }
240            dt => {
241                return Err(PyNotImplementedError::new_err(format!(
242                    "Array constructor for {dt} not yet implemented."
243                )))
244            }
245        };
246        Ok(Self::new(array, field))
247    }
248
249    #[cfg(feature = "buffer_protocol")]
250    fn buffer(&self) -> crate::buffer::PyArrowBuffer {
251        use arrow_array::cast::AsArray;
252
253        match self.array.data_type() {
254            DataType::Int64 => {
255                let arr = self.array.as_primitive::<Int64Type>();
256                let values = arr.values();
257                let buffer = values.inner().clone();
258                crate::buffer::PyArrowBuffer::new(buffer)
259            }
260            _ => todo!(),
261        }
262    }
263
264    #[pyo3(signature = (dtype=None, copy=None))]
265    #[allow(unused_variables)]
266    fn __array__<'py>(
267        &'py self,
268        py: Python<'py>,
269        dtype: Option<Bound<'py, PyAny>>,
270        copy: Option<Bound<'py, PyAny>>,
271    ) -> PyResult<Bound<'py, PyAny>> {
272        to_numpy(py, &self.array)
273    }
274
275    #[pyo3(signature = (requested_schema=None))]
276    fn __arrow_c_array__<'py>(
277        &'py self,
278        py: Python<'py>,
279        requested_schema: Option<Bound<'py, PyCapsule>>,
280    ) -> PyArrowResult<Bound<'py, PyTuple>> {
281        to_array_pycapsules(py, self.field.clone(), &self.array, requested_schema)
282    }
283
284    fn __arrow_c_schema__<'py>(&'py self, py: Python<'py>) -> PyArrowResult<Bound<'py, PyCapsule>> {
285        to_schema_pycapsule(py, self.field.as_ref())
286    }
287
288    fn __eq__(&self, other: &PyArray) -> bool {
289        self.array.as_ref() == other.array.as_ref() && self.field == other.field
290    }
291
292    fn __getitem__(&self, i: isize) -> PyArrowResult<PyScalar> {
293        // Handle negative indexes from the end
294        let i = if i < 0 {
295            let i = self.array.len() as isize + i;
296            if i < 0 {
297                return Err(PyIndexError::new_err("Index out of range").into());
298            }
299            i as usize
300        } else {
301            i as usize
302        };
303        if i >= self.array.len() {
304            return Err(PyIndexError::new_err("Index out of range").into());
305        }
306        PyScalar::try_new(self.array.slice(i, 1), self.field.clone())
307    }
308
309    fn __len__(&self) -> usize {
310        self.array.len()
311    }
312
313    fn __repr__(&self) -> String {
314        self.to_string()
315    }
316
317    #[classmethod]
318    fn from_arrow(_cls: &Bound<PyType>, input: AnyArray) -> PyArrowResult<Self> {
319        match input {
320            AnyArray::Array(array) => Ok(array),
321            AnyArray::Stream(stream) => {
322                let chunked_array = stream.into_chunked_array()?;
323                let (chunks, field) = chunked_array.into_inner();
324                let chunk_refs = chunks.iter().map(|arr| arr.as_ref()).collect::<Vec<_>>();
325                let concatted = concat(chunk_refs.as_slice())?;
326                Ok(Self::new(concatted, field))
327            }
328        }
329    }
330
331    #[classmethod]
332    #[pyo3(name = "from_arrow_pycapsule")]
333    fn from_arrow_pycapsule_py(
334        _cls: &Bound<PyType>,
335        schema_capsule: &Bound<PyCapsule>,
336        array_capsule: &Bound<PyCapsule>,
337    ) -> PyResult<Self> {
338        Self::from_arrow_pycapsule(schema_capsule, array_capsule)
339    }
340
341    /// Import via buffer protocol
342    #[cfg(feature = "buffer_protocol")]
343    #[classmethod]
344    fn from_buffer(_cls: &Bound<PyType>, buffer: AnyBufferProtocol) -> PyArrowResult<Self> {
345        buffer.try_into()
346    }
347
348    #[classmethod]
349    fn from_numpy(
350        _cls: &Bound<PyType>,
351        py: Python,
352        array: Bound<'_, PyAny>,
353    ) -> PyArrowResult<Self> {
354        let mut numpy_array = array;
355        if numpy_array.hasattr("__array__")? {
356            numpy_array = numpy_array.call_method0("__array__")?;
357        };
358
359        // Prefer zero-copy route via buffer protocol, if possible
360        #[cfg(feature = "buffer_protocol")]
361        if let Ok(buf) = numpy_array.extract::<AnyBufferProtocol>() {
362            return buf.try_into();
363        }
364
365        let numpy_array: Bound<PyUntypedArray> = FromPyObject::extract_bound(&numpy_array)?;
366        let arrow_array = from_numpy(py, &numpy_array)?;
367        Ok(Self::from_array_ref(arrow_array))
368    }
369
370    fn cast(&self, target_type: PyField) -> PyArrowResult<Arro3Array> {
371        let new_field = target_type.into_inner();
372        let new_array = cast(self.as_ref(), new_field.data_type())?;
373        Ok(PyArray::new(new_array, new_field).into())
374    }
375
376    #[getter]
377    #[pyo3(name = "field")]
378    fn py_field(&self) -> Arro3Field {
379        PyField::new(self.field.clone()).into()
380    }
381
382    #[getter]
383    fn nbytes(&self) -> usize {
384        self.array.get_array_memory_size()
385    }
386
387    #[getter]
388    fn null_count(&self) -> usize {
389        self.array.null_count()
390    }
391
392    #[pyo3(signature = (offset=0, length=None))]
393    fn slice(&self, offset: usize, length: Option<usize>) -> Arro3Array {
394        let length = length.unwrap_or_else(|| self.array.len() - offset);
395        let new_array = self.array.slice(offset, length);
396        PyArray::new(new_array, self.field().clone()).into()
397    }
398
399    fn take(&self, indices: PyArray) -> PyArrowResult<Arro3Array> {
400        let new_array = take(self.as_ref(), indices.as_ref(), None)?;
401        Ok(PyArray::new(new_array, self.field.clone()).into())
402    }
403
404    fn to_numpy<'py>(&'py self, py: Python<'py>) -> PyResult<Bound<'py, PyAny>> {
405        self.__array__(py, None, None)
406    }
407
408    fn to_pylist(&self, py: Python) -> PyResult<PyObject> {
409        let mut scalars = Vec::with_capacity(self.array.len());
410        for i in 0..self.array.len() {
411            let scalar =
412                unsafe { PyScalar::new_unchecked(self.array.slice(i, 1), self.field.clone()) };
413            scalars.push(scalar.as_py(py)?);
414        }
415        scalars.into_py_any(py)
416    }
417
418    #[getter]
419    fn r#type(&self) -> Arro3DataType {
420        PyDataType::new(self.field.data_type().clone()).into()
421    }
422}