pyo3_arrow/
array.rs

1use std::fmt::Display;
2use std::sync::Arc;
3
4use arrow_array::types::{
5    Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, UInt16Type, UInt32Type,
6    UInt64Type, UInt8Type,
7};
8use arrow_array::{
9    Array, ArrayRef, BinaryArray, BinaryViewArray, BooleanArray, Datum, FixedSizeBinaryArray,
10    LargeBinaryArray, LargeStringArray, PrimitiveArray, StringArray, StringViewArray,
11};
12use arrow_cast::cast;
13use arrow_cast::display::ArrayFormatter;
14use arrow_schema::{ArrowError, DataType, Field, FieldRef};
15use arrow_select::concat::concat;
16use arrow_select::take::take;
17use numpy::PyUntypedArray;
18use pyo3::exceptions::{PyIndexError, PyNotImplementedError, PyValueError};
19use pyo3::intern;
20use pyo3::prelude::*;
21use pyo3::pybacked::{PyBackedBytes, PyBackedStr};
22use pyo3::types::{PyCapsule, PyTuple, PyType};
23
24#[cfg(feature = "buffer_protocol")]
25use crate::buffer::AnyBufferProtocol;
26use crate::error::PyArrowResult;
27use crate::export::{Arro3Array, Arro3DataType, Arro3Field};
28use crate::ffi::from_python::utils::import_array_pycapsules;
29use crate::ffi::to_python::nanoarrow::to_nanoarrow_array;
30use crate::ffi::{to_array_pycapsules, to_schema_pycapsule};
31use crate::input::AnyArray;
32use crate::interop::numpy::from_numpy::from_numpy;
33use crate::interop::numpy::to_numpy::to_numpy;
34use crate::scalar::PyScalar;
35use crate::utils::default_repr_options;
36use crate::{PyDataType, PyField};
37
38/// A Python-facing Arrow array.
39///
40/// This is a wrapper around an [ArrayRef] and a [FieldRef].
41///
42/// It's important for this to wrap both an array _and_ a field so that it can faithfully store all
43/// data transmitted via the `__arrow_c_array__` Python method, which [exports both an Array and a
44/// Field](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrow_c_array__).
45/// In particular, storing a [FieldRef] is required to persist Arrow extension metadata through the
46/// C Data Interface.
47#[derive(Debug)]
48#[pyclass(module = "arro3.core._core", name = "Array", subclass, frozen)]
49pub struct PyArray {
50    array: ArrayRef,
51    field: FieldRef,
52}
53
54impl PyArray {
55    /// Create a new Python Array from an [ArrayRef] and a [FieldRef].
56    ///
57    /// This will panic if the array's data type does not match the field's data type.
58    pub fn new(array: ArrayRef, field: FieldRef) -> Self {
59        Self::try_new(array, field).unwrap()
60    }
61
62    /// Create a new Python Array from an [ArrayRef] and a [FieldRef].
63    pub fn try_new(array: ArrayRef, field: FieldRef) -> Result<Self, ArrowError> {
64        // Note: if the array and field data types don't match, you'll get an obscure FFI
65        // exception, because you might be describing a different array than you're actually
66        // providing.
67        if array.data_type() != field.data_type() {
68            return Err(ArrowError::SchemaError(
69                format!("Array DataType must match Field DataType. Array DataType is {}; field DataType is {}", array.data_type(), field.data_type())
70            ));
71        }
72        Ok(Self { array, field })
73    }
74
75    /// Create a new PyArray from an [ArrayRef], inferring its data type automatically.
76    pub fn from_array_ref(array: ArrayRef) -> Self {
77        let field = Field::new("", array.data_type().clone(), true);
78        Self::new(array, Arc::new(field))
79    }
80
81    /// Import from raw Arrow capsules
82    pub fn from_arrow_pycapsule(
83        schema_capsule: &Bound<PyCapsule>,
84        array_capsule: &Bound<PyCapsule>,
85    ) -> PyResult<Self> {
86        let (array, field) = import_array_pycapsules(schema_capsule, array_capsule)?;
87        Ok(Self::new(array, Arc::new(field)))
88    }
89
90    /// Access the underlying [ArrayRef].
91    pub fn array(&self) -> &ArrayRef {
92        &self.array
93    }
94
95    /// Access the underlying [FieldRef].
96    pub fn field(&self) -> &FieldRef {
97        &self.field
98    }
99
100    /// Consume self to access the underlying [ArrayRef] and [FieldRef].
101    pub fn into_inner(self) -> (ArrayRef, FieldRef) {
102        (self.array, self.field)
103    }
104
105    /// Export to an arro3.core.Array.
106    ///
107    /// This requires that you depend on arro3-core from your Python package.
108    pub fn to_arro3<'py>(&'py self, py: Python<'py>) -> PyResult<Bound<'py, PyAny>> {
109        let arro3_mod = py.import(intern!(py, "arro3.core"))?;
110        arro3_mod.getattr(intern!(py, "Array"))?.call_method1(
111            intern!(py, "from_arrow_pycapsule"),
112            self.__arrow_c_array__(py, None)?,
113        )
114    }
115
116    /// Export to an arro3.core.Array.
117    ///
118    /// This requires that you depend on arro3-core from your Python package.
119    pub fn into_arro3(self, py: Python) -> PyResult<Bound<PyAny>> {
120        let arro3_mod = py.import(intern!(py, "arro3.core"))?;
121        let array_capsules = to_array_pycapsules(py, self.field.clone(), &self.array, None)?;
122        arro3_mod
123            .getattr(intern!(py, "Array"))?
124            .call_method1(intern!(py, "from_arrow_pycapsule"), array_capsules)
125    }
126
127    /// Export this to a Python `nanoarrow.Array`.
128    pub fn to_nanoarrow<'py>(&'py self, py: Python<'py>) -> PyResult<Bound<'py, PyAny>> {
129        to_nanoarrow_array(py, self.__arrow_c_array__(py, None)?)
130    }
131
132    /// Export to a pyarrow.Array
133    ///
134    /// Requires pyarrow >=14
135    pub fn to_pyarrow<'py>(&'py self, py: Python<'py>) -> PyResult<Bound<'py, PyAny>> {
136        let pyarrow_mod = py.import(intern!(py, "pyarrow"))?;
137        let cloned = Self::new(self.array.clone(), self.field.clone());
138        pyarrow_mod
139            .getattr(intern!(py, "array"))?
140            .call1(PyTuple::new(py, vec![cloned.into_pyobject(py)?])?)
141    }
142}
143
144impl From<ArrayRef> for PyArray {
145    fn from(value: ArrayRef) -> Self {
146        Self::from_array_ref(value)
147    }
148}
149
150impl AsRef<ArrayRef> for PyArray {
151    fn as_ref(&self) -> &ArrayRef {
152        &self.array
153    }
154}
155
156impl Display for PyArray {
157    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
158        write!(f, "arro3.core.Array<")?;
159        self.array.data_type().fmt(f)?;
160        writeln!(f, ">")?;
161
162        let options = default_repr_options();
163        let formatter =
164            ArrayFormatter::try_new(self.array.as_ref(), &options).map_err(|_| std::fmt::Error)?;
165
166        writeln!(f, "[")?;
167        for i in 0..self.array.len().min(10) {
168            let row = formatter.value(i);
169            writeln!(f, "  {},", row)?;
170        }
171        writeln!(f, "]")?;
172
173        Ok(())
174    }
175}
176
177impl Datum for PyArray {
178    fn get(&self) -> (&dyn Array, bool) {
179        (self.array.as_ref(), false)
180    }
181}
182
183#[pymethods]
184impl PyArray {
185    #[new]
186    #[pyo3(signature = (obj, /, r#type = None, *))]
187    pub(crate) fn init(
188        py: Python,
189        obj: &Bound<PyAny>,
190        r#type: Option<PyField>,
191    ) -> PyArrowResult<Self> {
192        // Need to check first if the object has the __arrow_c_array__ method, so that we can
193        // preserve any error upon calling it.
194        // Then we also need to check if we can extract a PyArray, since we also support buffer
195        // protocol input there.
196        if obj.hasattr(intern!(py, "__arrow_c_array__"))? {
197            return Ok(obj.extract::<PyArray>()?);
198        }
199        if let Ok(data) = obj.extract::<PyArray>() {
200            return Ok(data);
201        }
202
203        macro_rules! impl_primitive {
204            ($rust_type:ty, $arrow_type:ty) => {{
205                let values: Vec<Option<$rust_type>> = obj.extract()?;
206                Arc::new(PrimitiveArray::<$arrow_type>::from(values))
207            }};
208        }
209
210        let field = r#type
211            .ok_or(PyValueError::new_err(
212                "type must be passed for non-Arrow input",
213            ))?
214            .into_inner();
215        let array: ArrayRef = match field.data_type() {
216            DataType::Float32 => impl_primitive!(f32, Float32Type),
217            DataType::Float64 => impl_primitive!(f64, Float64Type),
218            DataType::UInt8 => impl_primitive!(u8, UInt8Type),
219            DataType::UInt16 => impl_primitive!(u16, UInt16Type),
220            DataType::UInt32 => impl_primitive!(u32, UInt32Type),
221            DataType::UInt64 => impl_primitive!(u64, UInt64Type),
222            DataType::Int8 => impl_primitive!(i8, Int8Type),
223            DataType::Int16 => impl_primitive!(i16, Int16Type),
224            DataType::Int32 => impl_primitive!(i32, Int32Type),
225            DataType::Int64 => impl_primitive!(i64, Int64Type),
226            DataType::Boolean => {
227                let values: Vec<Option<bool>> = obj.extract()?;
228                Arc::new(BooleanArray::from(values))
229            }
230            DataType::Binary => {
231                let values: Vec<Option<PyBackedBytes>> = obj.extract()?;
232                let slices = values
233                    .iter()
234                    .map(|maybe_vec| maybe_vec.as_ref().map(|vec| vec.as_ref()))
235                    .collect::<Vec<_>>();
236                Arc::new(BinaryArray::from(slices))
237            }
238            DataType::LargeBinary => {
239                let values: Vec<Option<PyBackedBytes>> = obj.extract()?;
240                let slices = values
241                    .iter()
242                    .map(|maybe_vec| maybe_vec.as_ref().map(|vec| vec.as_ref()))
243                    .collect::<Vec<_>>();
244                Arc::new(LargeBinaryArray::from(slices))
245            }
246            DataType::BinaryView => {
247                let values: Vec<Option<PyBackedBytes>> = obj.extract()?;
248                let slices = values
249                    .iter()
250                    .map(|maybe_vec| maybe_vec.as_ref().map(|vec| vec.as_ref()))
251                    .collect::<Vec<_>>();
252                Arc::new(BinaryViewArray::from(slices))
253            }
254            DataType::FixedSizeBinary(size) => {
255                let values: Vec<Option<PyBackedBytes>> = obj.extract()?;
256                Arc::new(FixedSizeBinaryArray::try_from_sparse_iter_with_size(
257                    values
258                        .iter()
259                        .map(|maybe_vec| maybe_vec.as_ref().map(|vec| vec.as_ref())),
260                    *size,
261                )?)
262            }
263            DataType::Utf8 => {
264                let values: Vec<Option<PyBackedStr>> = obj.extract()?;
265                let slices = values
266                    .iter()
267                    .map(|maybe_str| maybe_str.as_ref().map(|s| s.as_ref()))
268                    .collect::<Vec<_>>();
269                Arc::new(StringArray::from(slices))
270            }
271            DataType::LargeUtf8 => {
272                let values: Vec<Option<PyBackedStr>> = obj.extract()?;
273                let slices = values
274                    .iter()
275                    .map(|maybe_str| maybe_str.as_ref().map(|s| s.as_ref()))
276                    .collect::<Vec<_>>();
277                Arc::new(LargeStringArray::from(slices))
278            }
279            DataType::Utf8View => {
280                let values: Vec<Option<PyBackedStr>> = obj.extract()?;
281                let slices = values
282                    .iter()
283                    .map(|maybe_str| maybe_str.as_ref().map(|s| s.as_ref()))
284                    .collect::<Vec<_>>();
285                Arc::new(StringViewArray::from(slices))
286            }
287            dt => {
288                return Err(PyNotImplementedError::new_err(format!(
289                    "Array constructor for {dt} not yet implemented."
290                ))
291                .into())
292            }
293        };
294        Ok(Self::new(array, field))
295    }
296
297    #[cfg(feature = "buffer_protocol")]
298    fn buffer(&self) -> crate::buffer::PyArrowBuffer {
299        use arrow_array::cast::AsArray;
300
301        match self.array.data_type() {
302            DataType::Int64 => {
303                let arr = self.array.as_primitive::<Int64Type>();
304                let values = arr.values();
305                let buffer = values.inner().clone();
306                crate::buffer::PyArrowBuffer::new(buffer)
307            }
308            _ => todo!(),
309        }
310    }
311
312    #[pyo3(signature = (dtype=None, copy=None))]
313    #[allow(unused_variables)]
314    fn __array__<'py>(
315        &'py self,
316        py: Python<'py>,
317        dtype: Option<Bound<'py, PyAny>>,
318        copy: Option<Bound<'py, PyAny>>,
319    ) -> PyResult<Bound<'py, PyAny>> {
320        to_numpy(py, &self.array)
321    }
322
323    #[pyo3(signature = (requested_schema=None))]
324    fn __arrow_c_array__<'py>(
325        &'py self,
326        py: Python<'py>,
327        requested_schema: Option<Bound<'py, PyCapsule>>,
328    ) -> PyArrowResult<Bound<'py, PyTuple>> {
329        to_array_pycapsules(py, self.field.clone(), &self.array, requested_schema)
330    }
331
332    fn __arrow_c_schema__<'py>(&'py self, py: Python<'py>) -> PyArrowResult<Bound<'py, PyCapsule>> {
333        to_schema_pycapsule(py, self.field.as_ref())
334    }
335
336    fn __eq__(&self, other: &PyArray) -> bool {
337        self.array.as_ref() == other.array.as_ref() && self.field == other.field
338    }
339
340    fn __getitem__(&self, i: isize) -> PyArrowResult<PyScalar> {
341        // Handle negative indexes from the end
342        let i = if i < 0 {
343            let i = self.array.len() as isize + i;
344            if i < 0 {
345                return Err(PyIndexError::new_err("Index out of range").into());
346            }
347            i as usize
348        } else {
349            i as usize
350        };
351        if i >= self.array.len() {
352            return Err(PyIndexError::new_err("Index out of range").into());
353        }
354        PyScalar::try_new(self.array.slice(i, 1), self.field.clone())
355    }
356
357    fn __len__(&self) -> usize {
358        self.array.len()
359    }
360
361    fn __repr__(&self) -> String {
362        self.to_string()
363    }
364
365    #[classmethod]
366    fn from_arrow(_cls: &Bound<PyType>, input: AnyArray) -> PyArrowResult<Self> {
367        match input {
368            AnyArray::Array(array) => Ok(array),
369            AnyArray::Stream(stream) => {
370                let chunked_array = stream.into_chunked_array()?;
371                let (chunks, field) = chunked_array.into_inner();
372                let chunk_refs = chunks.iter().map(|arr| arr.as_ref()).collect::<Vec<_>>();
373                let concatted = concat(chunk_refs.as_slice())?;
374                Ok(Self::new(concatted, field))
375            }
376        }
377    }
378
379    #[classmethod]
380    #[pyo3(name = "from_arrow_pycapsule")]
381    fn from_arrow_pycapsule_py(
382        _cls: &Bound<PyType>,
383        schema_capsule: &Bound<PyCapsule>,
384        array_capsule: &Bound<PyCapsule>,
385    ) -> PyResult<Self> {
386        Self::from_arrow_pycapsule(schema_capsule, array_capsule)
387    }
388
389    /// Import via buffer protocol
390    #[cfg(feature = "buffer_protocol")]
391    #[classmethod]
392    fn from_buffer(_cls: &Bound<PyType>, buffer: AnyBufferProtocol) -> PyArrowResult<Self> {
393        buffer.try_into()
394    }
395
396    #[classmethod]
397    fn from_numpy(
398        _cls: &Bound<PyType>,
399        py: Python,
400        array: Bound<'_, PyAny>,
401    ) -> PyArrowResult<Self> {
402        let mut numpy_array = array;
403        if numpy_array.hasattr("__array__")? {
404            numpy_array = numpy_array.call_method0("__array__")?;
405        };
406
407        // Prefer zero-copy route via buffer protocol, if possible
408        #[cfg(feature = "buffer_protocol")]
409        if let Ok(buf) = numpy_array.extract::<AnyBufferProtocol>() {
410            return buf.try_into();
411        }
412
413        let numpy_array: Bound<PyUntypedArray> = numpy_array.extract()?;
414        let arrow_array = from_numpy(py, &numpy_array)?;
415        Ok(Self::from_array_ref(arrow_array))
416    }
417
418    fn cast(&self, target_type: PyField) -> PyArrowResult<Arro3Array> {
419        let new_field = target_type.into_inner();
420        let new_array = cast(self.as_ref(), new_field.data_type())?;
421        Ok(PyArray::new(new_array, new_field).into())
422    }
423
424    #[getter]
425    #[pyo3(name = "field")]
426    fn py_field(&self) -> Arro3Field {
427        PyField::new(self.field.clone()).into()
428    }
429
430    #[getter]
431    fn nbytes(&self) -> usize {
432        self.array.get_array_memory_size()
433    }
434
435    #[getter]
436    fn null_count(&self) -> usize {
437        self.array.null_count()
438    }
439
440    #[pyo3(signature = (offset=0, length=None))]
441    fn slice(&self, offset: usize, length: Option<usize>) -> Arro3Array {
442        let length = length.unwrap_or_else(|| self.array.len() - offset);
443        let new_array = self.array.slice(offset, length);
444        PyArray::new(new_array, self.field().clone()).into()
445    }
446
447    fn take(&self, indices: PyArray) -> PyArrowResult<Arro3Array> {
448        let new_array = take(self.as_ref(), indices.as_ref(), None)?;
449        Ok(PyArray::new(new_array, self.field.clone()).into())
450    }
451
452    fn to_numpy<'py>(&'py self, py: Python<'py>) -> PyResult<Bound<'py, PyAny>> {
453        self.__array__(py, None, None)
454    }
455
456    fn to_pylist(&self, py: Python) -> PyResult<Vec<Py<PyAny>>> {
457        let mut scalars = Vec::with_capacity(self.array.len());
458        for i in 0..self.array.len() {
459            let scalar =
460                unsafe { PyScalar::new_unchecked(self.array.slice(i, 1), self.field.clone()) };
461            scalars.push(scalar.as_py(py)?);
462        }
463        Ok(scalars)
464    }
465
466    #[getter]
467    fn r#type(&self) -> Arro3DataType {
468        PyDataType::new(self.field.data_type().clone()).into()
469    }
470}