// minarrow_pyo3/lib.rs
// Copyright 2025 Peter Garfield Bower
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//! # minarrow-pyo3 - PyO3 Bindings for MinArrow
//!
//! Zero-copy Python bindings for MinArrow via the Arrow C Data Interface and PyCapsules.
//!
//! This crate provides transparent wrapper types that enable zero-copy conversion
//! between MinArrow's Rust types and PyArrow's Python types.
//!
//! ## Features
//!
//! - **Zero-copy data transfer** via Arrow C Data Interface
//! - **Transparent wrappers** (`PyArray`, `PyRecordBatch`) implementing PyO3 traits
//! - **Idiomatic Rust API** for building Python extensions
//!
//! ## Copy Semantics
//!
//! ### Zero-copy
//!
//! All primary data buffers are transferred without copying in both directions.
//! This applies to all export paths, single array imports, ChunkedArray chunk
//! imports, and RecordBatch/Table column imports via both the PyCapsule stream
//! and legacy `_import_from_c` paths.
//!
//! ### Copied by design
//!
//! The following are copied during import because they require structural
//! transformation between MinArrow and Arrow representations:
//!
//! - **Null bitmasks** — reconstructed into MinArrow's `Bitmask` type on import.
//!   These are small: ceil(N/8) bytes for N elements.
//! - **String offsets** — reconstructed into MinArrow's offset representation.
//! - **Categorical dictionary strings** — Arrow stores dictionaries as contiguous
//!   offsets+data; MinArrow stores them as `Vec64<String>` with individual heap
//!   allocations. The integer codes buffer is zero-copy.
//! - **Field metadata** — names, types, and flags are lightweight and always copied.
//!
//! ## Type Mappings
//!
//! MinArrow calls an object with a header, rows, and columns a 'Table', favouring
//! plain matter-of-factness. Apache Arrow calls the same layout a 'RecordBatch',
//! and (at least in PyArrow) reserves 'Table' for a chunked composition of those
//! RecordBatches. Below is how the equivalent memory and object layouts map to
//! one another.
//!
//! | MinArrow | PyArrow | Wrapper Type |
//! |----------|---------|--------------|
//! | `Array` | `pa.Array` | `PyArray` |
//! | `Table` | `pa.RecordBatch` | `PyRecordBatch` |
//! | `SuperTable` | `pa.Table` | `PyTable` |
//! | `SuperArray` | `pa.ChunkedArray` | `PyChunkedArray` |
//!
//! ## Conversion Protocols
//!
//! Two protocols are supported for data exchange:
//!
//! 1. **Arrow PyCapsule Interface** - the standard `__arrow_c_array__` / `__arrow_c_stream__`
//!    protocol. Works with any Arrow-compatible Python library including PyArrow, Polars,
//!    DuckDB, nanoarrow, and pandas with ArrowDtype.
//!
//! 2. **Legacy `_export_to_c`** - PyArrow-specific fallback using raw pointer integers.
//!
//! Import functions try the PyCapsule protocol first, falling back to the legacy approach
//! for older PyArrow versions.
//!
//! For the complete array data type mapping including numeric, temporal, boolean, text,
//! and categorical types, see the [`ffi`] module documentation.
//!
//! ## Example
//!
//! ```ignore
//! use minarrow_pyo3::{PyArray, PyRecordBatch};
//! use pyo3::prelude::*;
//!
//! #[pyfunction]
//! fn process_batch(input: PyRecordBatch) -> PyResult<PyRecordBatch> {
//!     let table: minarrow::Table = input.into();
//!     // Process the table using MinArrow...
//!     Ok(PyRecordBatch::from(table))
//! }
//!
//! #[pymodule]
//! fn my_extension(m: &Bound<'_, PyModule>) -> PyResult<()> {
//!     m.add_function(wrap_pyfunction!(process_batch, m)?)?;
//!     Ok(())
//! }
//! ```
//!
//! In Python:
//! ```python
//! import pyarrow as pa
//! import my_extension
//!
//! batch = pa.RecordBatch.from_pydict({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]})
//! result = my_extension.process_batch(batch)
//! ```
#![feature(allocator_api)]
#![feature(slice_ptr_get)]
#![feature(portable_simd)]

use pyo3::prelude::*;
use std::sync::Arc;

pub mod error;
pub mod ffi;
pub mod types;

// Re-export the main types for ease of use
pub use error::{PyMinarrowError, PyMinarrowResult};
pub use types::{PyArray, PyChunkedArray, PyField, PyRecordBatch, PyTable};

// Re-export minarrow types that users might need
pub use minarrow::{Array, Field, FieldArray, MaskedArray, NumericArray, SuperArray, SuperTable, Table, TextArray};

127/// Echo back a PyArrow array after roundtrip through MinArrow.
128/// Used to test that conversion works correctly.
129#[pyfunction]
130fn echo_array(arr: PyArray) -> PyResult<PyArray> {
131    // The array is converted to MinArrow on input and back to PyArrow on output
132    Ok(arr)
133}
134
135/// Echo back a PyArrow RecordBatch after roundtrip through MinArrow.
136/// Used to test that conversion works correctly.
137#[pyfunction]
138fn echo_batch(batch: PyRecordBatch) -> PyResult<PyRecordBatch> {
139    // The batch is converted to MinArrow Table on input and back to PyArrow on output
140    Ok(batch)
141}
142
143/// Get information about a PyArrow array after converting to MinArrow.
144#[pyfunction]
145fn array_info(arr: PyArray) -> PyResult<String> {
146    let inner = arr.inner();
147    Ok(format!(
148        "MinArrow Array: len={}, null_count={}",
149        inner.len(),
150        inner.null_count()
151    ))
152}
153
154/// Get information about a PyArrow RecordBatch after converting to MinArrow.
155#[pyfunction]
156fn batch_info(batch: PyRecordBatch) -> PyResult<String> {
157    let inner = batch.inner();
158    Ok(format!(
159        "MinArrow Table: rows={}, cols={}",
160        inner.n_rows(),
161        inner.n_cols()
162    ))
163}
164
165/// Echo back a PyArrow Table after roundtrip through MinArrow.
166/// Used to test that conversion works correctly.
167#[pyfunction]
168fn echo_table(table: PyTable) -> PyResult<PyTable> {
169    // The table is converted to MinArrow SuperTable on input and back to PyArrow on output
170    Ok(table)
171}
172
173/// Echo back a PyArrow ChunkedArray after roundtrip through MinArrow.
174/// Used to test that conversion works correctly.
175#[pyfunction]
176fn echo_chunked(arr: PyChunkedArray) -> PyResult<PyChunkedArray> {
177    // The array is converted to MinArrow SuperArray on input and back to PyArrow on output
178    Ok(arr)
179}
180
181/// Get information about a PyArrow Table after converting to MinArrow.
182#[pyfunction]
183fn table_info(table: PyTable) -> PyResult<String> {
184    let inner = table.inner();
185    Ok(format!(
186        "MinArrow SuperTable: batches={}, rows={}, cols={}",
187        inner.batches.len(),
188        inner.n_rows,
189        inner.schema.len()
190    ))
191}
192
193/// Get information about a PyArrow ChunkedArray after converting to MinArrow.
194#[pyfunction]
195fn chunked_info(arr: PyChunkedArray) -> PyResult<String> {
196    let inner = arr.inner();
197    Ok(format!(
198        "MinArrow SuperArray: chunks={}, len={}",
199        inner.n_chunks(),
200        inner.len()
201    ))
202}
203
204/// Export a MinArrow array as a pair of Arrow PyCapsules (schema, array).
205///
206/// The returned tuple follows the Arrow PyCapsule Interface and can be
207/// consumed by any library supporting the protocol.
208#[pyfunction]
209fn export_array_capsule(py: Python, arr: PyArray) -> PyResult<ArrowArrayWrapper> {
210    let fa = arr.field_array();
211    let array = Arc::new(fa.array.clone());
212    let (schema_capsule, array_capsule) = ffi::to_py::array_to_capsules(array, &fa.field, py)?;
213    Ok(ArrowArrayWrapper {
214        schema_capsule: Some(schema_capsule),
215        array_capsule: Some(array_capsule),
216    })
217}
218
219/// Export a MinArrow RecordBatch as an ArrowArrayStream PyCapsule.
220///
221/// The stream yields one struct array representing the record batch.
222#[pyfunction]
223fn export_batch_stream_capsule(py: Python, batch: PyRecordBatch) -> PyResult<ArrowStream> {
224    let table = batch.inner();
225    let capsule = ffi::to_py::table_to_stream_capsule(table, py)?;
226    Ok(ArrowStream {
227        capsule: Some(capsule),
228    })
229}
230
231/// Export a MinArrow Table as an ArrowArrayStream PyCapsule.
232///
233/// The stream yields one struct array per batch in the table.
234#[pyfunction]
235fn export_table_stream_capsule(py: Python, table: PyTable) -> PyResult<ArrowStream> {
236    let super_table = table.inner();
237    let capsule = ffi::to_py::super_table_to_stream_capsule(super_table, py)?;
238    Ok(ArrowStream {
239        capsule: Some(capsule),
240    })
241}
242
243/// Export a MinArrow ChunkedArray as an ArrowArrayStream PyCapsule.
244///
245/// The stream yields one plain array per chunk.
246#[pyfunction]
247fn export_chunked_stream_capsule(py: Python, arr: PyChunkedArray) -> PyResult<ArrowStream> {
248    let super_array = arr.inner();
249    let capsule = ffi::to_py::super_array_to_stream_capsule(super_array, py)?;
250    Ok(ArrowStream {
251        capsule: Some(capsule),
252    })
253}
254
// PyCapsule protocol wrapper types

257/// Python-visible wrapper implementing `__arrow_c_stream__`.
258///
259/// Any Arrow-compatible Python library can consume this object directly,
260/// e.g. `pa.RecordBatchReader.from_stream(obj)` or `nanoarrow.ArrayStream(obj)`.
261#[pyclass(name = "ArrowStream")]
262struct ArrowStream {
263    capsule: Option<PyObject>,
264}
265
266#[pymethods]
267impl ArrowStream {
268    /// Arrow PyCapsule stream protocol.
269    ///
270    /// Returns the underlying ArrowArrayStream capsule. The capsule can
271    /// only be consumed once - subsequent calls raise ValueError.
272    #[pyo3(signature = (requested_schema=None))]
273    fn __arrow_c_stream__(&mut self, requested_schema: Option<PyObject>) -> PyResult<PyObject> {
274        let _ = requested_schema;
275        self.capsule.take().ok_or_else(|| {
276            pyo3::exceptions::PyValueError::new_err(
277                "ArrowStream capsule has already been consumed",
278            )
279        })
280    }
281}
282
283/// Python-visible wrapper implementing `__arrow_c_array__`.
284///
285/// Any Arrow-compatible Python library can consume this object directly,
286/// e.g. `pa.array(obj)`.
287#[pyclass(name = "ArrowArray")]
288struct ArrowArrayWrapper {
289    schema_capsule: Option<PyObject>,
290    array_capsule: Option<PyObject>,
291}
292
293#[pymethods]
294impl ArrowArrayWrapper {
295    /// Arrow PyCapsule array protocol.
296    ///
297    /// Returns (schema_capsule, array_capsule). The capsules can only
298    /// be consumed once - subsequent calls raise ValueError.
299    #[pyo3(signature = (requested_schema=None))]
300    fn __arrow_c_array__(
301        &mut self,
302        requested_schema: Option<PyObject>,
303    ) -> PyResult<(PyObject, PyObject)> {
304        let _ = requested_schema;
305        let schema = self.schema_capsule.take();
306        let array = self.array_capsule.take();
307        match (schema, array) {
308            (Some(s), Some(a)) => Ok((s, a)),
309            _ => Err(pyo3::exceptions::PyValueError::new_err(
310                "ArrowArray capsules have already been consumed",
311            )),
312        }
313    }
314}

// Data generators - these produce protocol-conforming objects

318/// Generate sample data entirely in Rust and return as an ArrowStream.
319///
320/// The returned object implements `__arrow_c_stream__` and can be consumed
321/// by any Arrow-compatible Python library. The consumer never touches
322/// minarrow types directly.
323///
324/// Contains a batch with columns: id (int64), score (float64), label (utf8).
325#[pyfunction]
326fn generate_sample_batch(py: Python) -> PyResult<ArrowStream> {
327    use minarrow::ffi::arrow_dtype::ArrowType;
328
329    let mut ids = minarrow::IntegerArray::<i64>::default();
330    let mut scores = minarrow::FloatArray::<f64>::default();
331    let mut labels = minarrow::StringArray::<u32>::default();
332    for i in 0..5 {
333        ids.push(i + 1);
334        scores.push((i as f64 + 1.0) * 1.1);
335        labels.push_str(&format!("item_{}", i + 1));
336    }
337
338    let table = Table::new(
339        "sample".to_string(),
340        Some(vec![
341            FieldArray::new(
342                Field::new("id", ArrowType::Int64, false, None),
343                Array::from_int64(ids),
344            ),
345            FieldArray::new(
346                Field::new("score", ArrowType::Float64, false, None),
347                Array::from_float64(scores),
348            ),
349            FieldArray::new(
350                Field::new("label", ArrowType::String, false, None),
351                Array::from_string32(labels),
352            ),
353        ]),
354    );
355
356    let capsule = ffi::to_py::table_to_stream_capsule(&table, py)?;
357    Ok(ArrowStream {
358        capsule: Some(capsule),
359    })
360}
361
362/// Generate a sample array with nulls in Rust and return as an ArrowArray.
363///
364/// The returned object implements `__arrow_c_array__` and can be consumed
365/// by `pa.array(obj)` or any Arrow-compatible library.
366///
367/// Contains a nullable int64 array: [10, null, 30, null, 50].
368#[pyfunction]
369fn generate_nullable_array(py: Python) -> PyResult<ArrowArrayWrapper> {
370    use minarrow::ffi::arrow_dtype::ArrowType;
371
372    let mut arr = minarrow::IntegerArray::<i64>::default();
373    arr.push(10);
374    arr.push_null();
375    arr.push(30);
376    arr.push_null();
377    arr.push(50);
378
379    let array = Array::from_int64(arr);
380    let field = Field::new("values", ArrowType::Int64, true, None);
381    let (schema_capsule, array_capsule) =
382        ffi::to_py::array_to_capsules(Arc::new(array), &field, py)?;
383    Ok(ArrowArrayWrapper {
384        schema_capsule: Some(schema_capsule),
385        array_capsule: Some(array_capsule),
386    })
387}
388
389/// Python module definition for minarrow_pyo3.
390///
391/// This module primarily provides type conversion capabilities via the
392/// `PyArray` and `PyRecordBatch` wrapper types. The actual conversions
393/// happen automatically when these types are used as function parameters
394/// or return values in PyO3 functions.
395#[pymodule]
396fn minarrow_pyo3(m: &Bound<'_, PyModule>) -> PyResult<()> {
397    // Module-level docstring
398    m.add("__doc__", "PyO3 bindings for MinArrow - zero-copy Arrow interop with Python")?;
399    m.add("__version__", env!("CARGO_PKG_VERSION"))?;
400
401    // Add test functions
402    m.add_function(wrap_pyfunction!(echo_array, m)?)?;
403    m.add_function(wrap_pyfunction!(echo_batch, m)?)?;
404    m.add_function(wrap_pyfunction!(echo_table, m)?)?;
405    m.add_function(wrap_pyfunction!(echo_chunked, m)?)?;
406    m.add_function(wrap_pyfunction!(array_info, m)?)?;
407    m.add_function(wrap_pyfunction!(batch_info, m)?)?;
408    m.add_function(wrap_pyfunction!(table_info, m)?)?;
409    m.add_function(wrap_pyfunction!(chunked_info, m)?)?;
410
411    // PyCapsule export functions
412    m.add_function(wrap_pyfunction!(export_array_capsule, m)?)?;
413    m.add_function(wrap_pyfunction!(export_batch_stream_capsule, m)?)?;
414    m.add_function(wrap_pyfunction!(export_table_stream_capsule, m)?)?;
415    m.add_function(wrap_pyfunction!(export_chunked_stream_capsule, m)?)?;
416
417    // PyCapsule protocol wrapper types
418    m.add_class::<ArrowStream>()?;
419    m.add_class::<ArrowArrayWrapper>()?;
420
421    // Data generators (return protocol-conforming objects)
422    m.add_function(wrap_pyfunction!(generate_sample_batch, m)?)?;
423    m.add_function(wrap_pyfunction!(generate_nullable_array, m)?)?;
424
425    Ok(())
426}