Skip to main content

pyo3_geoarrow/
chunked_array.rs

1use std::sync::Arc;
2
3use arrow_schema::ArrowError;
4use geoarrow_array::GeoArrowArray;
5use geoarrow_array::array::from_arrow_array;
6use geoarrow_cast::downcast::NativeType;
7use geoarrow_schema::{
8    BoxType, GeoArrowType, GeometryCollectionType, LineStringType, MultiLineStringType,
9    MultiPointType, MultiPolygonType, PointType, PolygonType,
10};
11use pyo3::exceptions::{PyIndexError, PyTypeError};
12use pyo3::intern;
13use pyo3::prelude::*;
14use pyo3::types::{PyCapsule, PyTuple, PyType};
15use pyo3_arrow::ffi::{ArrayIterator, to_stream_pycapsule};
16use pyo3_arrow::input::AnyArray;
17use pyo3_arrow::{PyArrayReader, PyChunkedArray};
18
19use crate::data_type::PyGeoType;
20use crate::error::{PyGeoArrowError, PyGeoArrowResult};
21use crate::input::AnyGeoArray;
22use crate::scalar::PyGeoScalar;
23use crate::utils::text_repr::text_repr;
24use crate::{PyCoordType, PyGeoArray};
25
26/// Python wrapper for a chunked GeoArrow geometry array.
27///
28/// A chunked array is a collection of contiguous arrays of the same type.
29#[pyclass(
30    module = "geoarrow.rust.core",
31    name = "GeoChunkedArray",
32    subclass,
33    frozen
34)]
35pub struct PyGeoChunkedArray {
36    chunks: Vec<Arc<dyn GeoArrowArray>>,
37    data_type: GeoArrowType,
38}
39
40impl PyGeoChunkedArray {
41    /// Construct a new [PyGeoChunkedArray] from existing chunks and a field.
42    pub fn try_new(chunks: Vec<Arc<dyn GeoArrowArray>>, data_type: GeoArrowType) -> PyResult<Self> {
43        if !chunks.iter().all(|chunk| chunk.data_type() == data_type) {
44            return Err(PyTypeError::new_err("All chunks must have same data type"));
45        }
46
47        Ok(Self { chunks, data_type })
48    }
49
50    /// Import from a raw Arrow C Stream capsule
51    pub fn from_arrow_pycapsule(capsule: &Bound<PyCapsule>) -> PyGeoArrowResult<Self> {
52        PyChunkedArray::from_arrow_pycapsule(capsule)?.try_into()
53    }
54
55    /// Export to a geoarrow.rust.core.GeometryArray.
56    ///
57    /// This requires that you depend on geoarrow-rust-core from your Python package.
58    pub fn to_geoarrow<'py>(&'py self, py: Python<'py>) -> PyResult<Bound<'py, PyAny>> {
59        let geoarrow_mod = py.import(intern!(py, "geoarrow.rust.core"))?;
60        geoarrow_mod
61            .getattr(intern!(py, "GeoChunkedArray"))?
62            .call_method1(
63                intern!(py, "from_arrow_pycapsule"),
64                PyTuple::new(py, vec![self.__arrow_c_stream__(py, None)?])?,
65            )
66    }
67
68    /// Create a new PyChunkedArray from a vec of [ArrayRef][arrow_array::array::ArrayRef]s,
69    /// inferring their data type automatically.
70    pub fn from_arrays(chunks: Vec<Arc<dyn GeoArrowArray>>) -> PyGeoArrowResult<Self> {
71        if chunks.is_empty() {
72            return Err(ArrowError::SchemaError(
73                "Cannot infer data type from empty Vec<Arc<dyn GeoArrowArray>>".to_string(),
74            )
75            .into());
76        }
77
78        if !chunks
79            .windows(2)
80            .all(|w| w[0].data_type() == w[1].data_type())
81        {
82            return Err(ArrowError::SchemaError("Mismatched data types".to_string()).into());
83        }
84
85        let data_type = chunks[0].data_type();
86        Ok(Self::try_new(chunks, data_type)?)
87    }
88
89    /// Consume this wrapper and return the underlying chunks and data type.
90    pub fn into_inner(self) -> (Vec<Arc<dyn GeoArrowArray>>, GeoArrowType) {
91        (self.chunks, self.data_type)
92    }
93}
94
95#[pymethods]
96impl PyGeoChunkedArray {
97    #[new]
98    #[pyo3(signature = (arrays, r#type=None))]
99    fn init(
100        py: Python,
101        arrays: &Bound<PyAny>,
102        r#type: Option<PyGeoType>,
103    ) -> PyGeoArrowResult<Self> {
104        if arrays.hasattr(intern!(py, "__arrow_c_array__"))?
105            || arrays.hasattr(intern!(py, "__arrow_c_stream__"))?
106        {
107            Ok(arrays.extract::<AnyGeoArray>()?.into_chunked_array()?)
108        } else if let Ok(geo_arrays) = arrays.extract::<Vec<PyGeoArray>>() {
109            let geo_arrays = geo_arrays
110                .into_iter()
111                .map(|arr| arr.into_inner())
112                .collect::<Vec<_>>();
113
114            if !geo_arrays
115                .windows(2)
116                .all(|w| w[0].data_type() == w[1].data_type())
117            {
118                return Err(PyTypeError::new_err(
119                    "Cannot create a ChunkedArray with differing data types.",
120                )
121                .into());
122            }
123
124            let geo_type = r#type
125                .map(|py_data_type| py_data_type.into_inner())
126                .unwrap_or_else(|| geo_arrays[0].data_type());
127
128            Ok(Self::try_new(geo_arrays, geo_type)?)
129        } else {
130            Err(
131                PyTypeError::new_err("Expected ChunkedArray-like input or sequence of arrays.")
132                    .into(),
133            )
134        }
135    }
136
137    #[pyo3(signature = (requested_schema=None))]
138    fn __arrow_c_stream__<'py>(
139        &self,
140        py: Python<'py>,
141        requested_schema: Option<Bound<'py, PyCapsule>>,
142    ) -> PyResult<Bound<'py, PyCapsule>> {
143        let field = Arc::new(self.data_type.to_field("", true));
144        let arrow_chunks = self
145            .chunks
146            .iter()
147            .map(|x| x.to_array_ref())
148            .collect::<Vec<_>>();
149
150        let array_reader = Box::new(ArrayIterator::new(arrow_chunks.into_iter().map(Ok), field));
151        Ok(to_stream_pycapsule(py, array_reader, requested_schema)?)
152    }
153
154    /// Check for equality with other object.
155    fn __eq__(&self, other: &Bound<PyAny>) -> bool {
156        // Do extraction within body because `__eq__` should never raise an exception.
157        if let Ok(other) = other.extract::<Self>() {
158            self.data_type == other.data_type
159                && self.chunks.len() == other.chunks.len()
160                && self
161                    .chunks
162                    .iter()
163                    .zip(other.chunks)
164                    .all(|(left, right)| left.to_array_ref() == right.to_array_ref())
165        } else {
166            false
167        }
168    }
169
170    fn __getitem__(&self, i: isize) -> PyGeoArrowResult<PyGeoScalar> {
171        // Handle negative indexes from the end
172        let mut i = if i < 0 {
173            let i = self.__len__() as isize + i;
174            if i < 0 {
175                return Err(PyIndexError::new_err("Index out of range").into());
176            }
177            i as usize
178        } else {
179            i as usize
180        };
181        if i >= self.__len__() {
182            return Err(PyIndexError::new_err("Index out of range").into());
183        }
184
185        for chunk in self.chunks() {
186            if i < chunk.inner().len() {
187                return PyGeoScalar::try_new(chunk.inner().slice(i, 1));
188            }
189            i -= chunk.inner().len();
190        }
191        unreachable!("index in range but past end of last chunk")
192    }
193
194    fn __len__(&self) -> usize {
195        self.chunks.iter().fold(0, |acc, arr| acc + arr.len())
196    }
197
198    fn __repr__(&self) -> String {
199        format!("GeoChunkedArray({})", text_repr(&self.data_type))
200    }
201
202    #[classmethod]
203    fn from_arrow(_cls: &Bound<PyType>, data: Self) -> Self {
204        data
205    }
206
207    #[classmethod]
208    #[pyo3(name = "from_arrow_pycapsule")]
209    fn from_arrow_pycapsule_py(
210        _cls: &Bound<PyType>,
211        capsule: &Bound<PyCapsule>,
212    ) -> PyGeoArrowResult<Self> {
213        Self::from_arrow_pycapsule(capsule)
214    }
215
216    #[getter]
217    fn null_count(&self) -> usize {
218        self.chunks
219            .iter()
220            .map(|chunk| chunk.logical_null_count())
221            .sum()
222    }
223
224    #[getter]
225    fn num_chunks(&self) -> usize {
226        self.chunks.len()
227    }
228
229    fn chunk(&self, i: usize) -> PyGeoArray {
230        PyGeoArray::new(self.chunks[i].clone())
231    }
232
233    fn chunks(&self) -> Vec<PyGeoArray> {
234        self.chunks
235            .iter()
236            .map(|chunk| PyGeoArray::new(chunk.clone()))
237            .collect()
238    }
239
240    #[pyo3(signature = (to_type, /))]
241    fn cast(&self, to_type: PyGeoType) -> PyGeoArrowResult<Self> {
242        let casted = self
243            .chunks
244            .iter()
245            .map(|chunk| geoarrow_cast::cast::cast(chunk.as_ref(), to_type.as_ref()))
246            .collect::<Result<Vec<_>, _>>()?;
247
248        Self::from_arrays(casted)
249    }
250
251    #[pyo3(
252        signature = (*, coord_type = PyCoordType::Separated),
253        text_signature = "(*, coord_type='separated')"
254    )]
255    fn downcast(&self, coord_type: PyCoordType) -> PyGeoArrowResult<Self> {
256        if let Some((native_type, dim)) =
257            geoarrow_cast::downcast::infer_downcast_type(self.chunks.iter().map(|x| x.as_ref()))?
258        {
259            let metadata = self.data_type.metadata().clone();
260            let coord_type = coord_type.into();
261            let to_type = match native_type {
262                NativeType::Point => PointType::new(dim, metadata)
263                    .with_coord_type(coord_type)
264                    .into(),
265                NativeType::LineString => LineStringType::new(dim, metadata)
266                    .with_coord_type(coord_type)
267                    .into(),
268                NativeType::Polygon => PolygonType::new(dim, metadata)
269                    .with_coord_type(coord_type)
270                    .into(),
271                NativeType::MultiPoint => MultiPointType::new(dim, metadata)
272                    .with_coord_type(coord_type)
273                    .into(),
274                NativeType::MultiLineString => MultiLineStringType::new(dim, metadata)
275                    .with_coord_type(coord_type)
276                    .into(),
277                NativeType::MultiPolygon => MultiPolygonType::new(dim, metadata)
278                    .with_coord_type(coord_type)
279                    .into(),
280                NativeType::GeometryCollection => GeometryCollectionType::new(dim, metadata)
281                    .with_coord_type(coord_type)
282                    .into(),
283                NativeType::Rect => BoxType::new(dim, metadata).into(),
284            };
285            self.cast(PyGeoType::new(to_type))
286        } else {
287            Ok(Self::try_new(self.chunks.clone(), self.data_type.clone())?)
288        }
289    }
290
291    #[getter]
292    fn r#type(&self) -> PyGeoType {
293        self.data_type.clone().into()
294    }
295}
296
297impl<'py> FromPyObject<'_, 'py> for PyGeoChunkedArray {
298    type Error = PyErr;
299
300    fn extract(ob: Borrowed<'_, 'py, PyAny>) -> PyResult<Self> {
301        let chunked_array = ob.extract::<AnyArray>()?.into_chunked_array()?;
302        chunked_array.try_into().map_err(PyErr::from)
303    }
304}
305
306impl TryFrom<PyChunkedArray> for PyGeoChunkedArray {
307    type Error = PyGeoArrowError;
308
309    fn try_from(value: PyChunkedArray) -> Result<Self, Self::Error> {
310        let (chunks, field) = value.into_inner();
311        let geo_chunks = chunks
312            .iter()
313            .map(|array| from_arrow_array(&array, &field))
314            .collect::<Result<Vec<_>, _>>()?;
315        let geo_data_type = GeoArrowType::try_from(field.as_ref())?;
316        Ok(Self {
317            chunks: geo_chunks,
318            data_type: geo_data_type,
319        })
320    }
321}
322
323impl TryFrom<PyArrayReader> for PyGeoChunkedArray {
324    type Error = PyGeoArrowError;
325
326    fn try_from(value: PyArrayReader) -> Result<Self, Self::Error> {
327        value.into_chunked_array()?.try_into()
328    }
329}