Skip to main content

pycapsule_exchange/
pycapsule_exchange.rs

1// Copyright 2025 Peter Garfield Bower
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! # PyCapsule Exchange Example
16//!
17//! Demonstrates how the Arrow PyCapsule Interface simplifies data exchange
18//! between Rust (MinArrow) and Python libraries.
19//!
20//! ## The problem PyCapsules solve
21//!
//! **C Data Interface (old Arrow approach)**:
23//! ```python
24//! # Worked with PyArrow - requires both sides to know about
25//! # PyArrow's private _export_to_c / _import_from_c methods and
26//! # to pass raw memory addresses as integers.
27//! addr = pyarrow_array._export_to_c()
28//! # Pass integer address across FFI boundary...
29//! ```
30//!
31//! **PyCapsule approach**:
32//! ```python
33//! # Works with various Arrow-compatible libraries natively: PyArrow, Polars, DuckDB,
34//! # pandas (with ArrowDtype), nanoarrow, etc.
35//! # The producer exposes __arrow_c_array__ / __arrow_c_stream__,
36//! # the consumer calls it - all sorted.
37//!
38//! import polars as pl
39//! df = pl.DataFrame({"a": [1, 2, 3]})
40//! # Polars implements __arrow_c_stream__, minarrow consumes it directly
41//! result = minarrow_pyo3.echo_table(df)
42//! ```
43//!
44//! ## What this example shows
45//!
//! 1. Exporting a MinArrow array as PyCapsules, consumed by PyArrow via `pa.Array._import_from_c`
47//! 2. Exporting a MinArrow table as a stream PyCapsule, consumed by `RecordBatchReader`
48//! 3. Importing a PyArrow array into MinArrow via `__arrow_c_array__`
49//! 4. Importing a PyArrow table into MinArrow via `__arrow_c_stream__`
50//!
51//! ## Running this example
52//!
53//! ```bash
54//! cd pyo3
55//!
56//! PYO3_PYTHON=.venv/bin/python cargo build --example pycapsule_exchange \
57//!     --no-default-features \
58//!     --features "datetime,extended_numeric_types,extended_categorical"
59//!
60//! # PYTHONHOME must point to the system prefix (stdlib lives there),
61//! # PYTHONPATH adds the venv's site-packages (pyarrow etc.).
62//! PYTHONHOME=/usr \
63//!   PYTHONPATH=.venv/lib/python3.12/site-packages \
64//!   LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu \
65//!   cargo run --example pycapsule_exchange \
66//!     --no-default-features \
67//!     --features "datetime,extended_numeric_types,extended_categorical"
68//! ```
69
70use minarrow::ffi::arrow_dtype::ArrowType;
71use minarrow::{Array, Field, FieldArray, FloatArray, IntegerArray, MaskedArray, NumericArray, Table};
72use minarrow_pyo3::ffi::{to_py, to_rust};
73use pyo3::prelude::*;
74use pyo3::types::IntoPyDict;
75use std::sync::Arc;
76
77fn main() -> PyResult<()> {
78    pyo3::prepare_freethreaded_python();
79
80    Python::with_gil(|py| {
81        println!("=== Arrow PyCapsule Interface Examples ===\n");
82
83        example_1_export_array(py)?;
84        example_2_export_table_stream(py)?;
85        example_3_import_from_pyarrow_array(py)?;
86        example_4_import_from_pyarrow_table(py)?;
87
88        println!("\n=== All examples completed ===");
89        Ok(())
90    })
91}
92
93/// Export a MinArrow array as PyCapsules, then import into PyArrow.
94///
95/// MinArrow produces a wrapper with `__arrow_c_array__`. PyArrow's
96/// `pa.array()` calls that method automatically to consume the data.
97fn example_1_export_array(py: Python<'_>) -> PyResult<()> {
98    println!("Example 1: Export MinArrow array -> PyArrow via __arrow_c_array__");
99    println!("----------------------------------------------------------------");
100
101    // Build an array in Rust
102    let mut arr = IntegerArray::<i64>::default();
103    for i in 0..5 {
104        arr.push(i * 10);
105    }
106    let array = Array::from_int64(arr);
107    let field = Field::new("values", ArrowType::Int64, false, None);
108    println!("  Created MinArrow i64 array: [0, 10, 20, 30, 40]");
109
110    // Export as PyCapsules (i.e., schema + array)
111    let (schema_capsule, array_capsule) =
112        to_py::array_to_capsules(Arc::new(array), &field, py)?;
113    println!("  Exported as PyCapsules");
114
115    // Build a wrapper that implements __arrow_c_array__ so PyArrow can consume it.
116    // In a real #[pyfunction] you'd return ArrowArrayWrapper directly;
117    // here we simulate by calling __arrow_c_array__ manually.
118    let pyarrow = py.import("pyarrow")?;
119
120    // PyArrow.Array._import_from_c expects (array_ptr, schema_ptr) as integers,
121    // but we can extract the raw pointers from the capsules properly:
122    let array_ptr = unsafe {
123        pyo3::ffi::PyCapsule_GetPointer(
124            array_capsule.as_ptr(),
125            c"arrow_array".as_ptr(),
126        )
127    } as usize;
128    let schema_ptr = unsafe {
129        pyo3::ffi::PyCapsule_GetPointer(
130            schema_capsule.as_ptr(),
131            c"arrow_schema".as_ptr(),
132        )
133    } as usize;
134
135    let pa_array = pyarrow
136        .getattr("Array")?
137        .call_method1("_import_from_c", (array_ptr, schema_ptr))?;
138
139    let repr: String = pa_array.call_method0("__repr__")?.extract()?;
140    println!("  PyArrow received: {}", repr.lines().next().unwrap_or(""));
141    println!("  Done.\n");
142    Ok(())
143}
144
145/// Export a MinArrow table as a stream PyCapsule, then consume in PyArrow.
146///
147/// For multi-column data, the stream interface is more natural - the
148/// consumer gets a single capsule and pulls batches from it.
149fn example_2_export_table_stream(py: Python<'_>) -> PyResult<()> {
150    println!("Example 2: Export MinArrow table -> PyArrow via __arrow_c_stream__");
151    println!("------------------------------------------------------------------");
152
153    let mut ids = IntegerArray::<i32>::default();
154    ids.push(1);
155    ids.push(2);
156    ids.push(3);
157
158    let mut scores = FloatArray::<f64>::default();
159    scores.push(9.5);
160    scores.push(8.3);
161    scores.push(7.1);
162
163    let table = Table::new(
164        "results".to_string(),
165        Some(vec![
166            FieldArray::new(
167                Field::new("id", ArrowType::Int32, false, None),
168                Array::from_int32(ids),
169            ),
170            FieldArray::new(
171                Field::new("score", ArrowType::Float64, false, None),
172                Array::from_float64(scores),
173            ),
174        ]),
175    );
176    println!("  Created MinArrow table: 3 rows x 2 columns (id, score)");
177
178    // Export as a stream capsule and extract the raw pointer for PyArrow
179    let stream_capsule = to_py::table_to_stream_capsule(&table, py)?;
180    let stream_ptr = unsafe {
181        pyo3::ffi::PyCapsule_GetPointer(
182            stream_capsule.as_ptr(),
183            c"arrow_array_stream".as_ptr(),
184        )
185    } as usize;
186
187    let pyarrow = py.import("pyarrow")?;
188    let reader = pyarrow
189        .getattr("RecordBatchReader")?
190        .call_method1("_import_from_c", (stream_ptr,))?;
191    let pa_table = reader.call_method0("read_all")?;
192
193    let num_rows: usize = pa_table.getattr("num_rows")?.extract()?;
194    let schema_repr: String = pa_table.getattr("schema")?.call_method0("__repr__")?.extract()?;
195    println!("  PyArrow received: {} rows", num_rows);
196    println!("  Schema: {}", schema_repr.lines().next().unwrap_or(""));
197    println!("  Done.\n");
198    Ok(())
199}
200
201/// Import a PyArrow array into MinArrow via __arrow_c_array__.
202///
203/// This is the Python-to-Rust direction using PyCapsules. We call the
204/// standard __arrow_c_array__ protocol method - the same code works with
205/// Polars Series, nanoarrow arrays, etc.
206fn example_3_import_from_pyarrow_array(py: Python<'_>) -> PyResult<()> {
207    println!("Example 3: Import PyArrow array -> MinArrow via __arrow_c_array__");
208    println!("-----------------------------------------------------------------");
209
210    let pyarrow = py.import("pyarrow")?;
211    let py_array = pyarrow.call_method1("array", (vec![100i64, 200, 300, 400, 500],))?;
212    println!("  Created PyArrow array: [100, 200, 300, 400, 500]");
213
214    let result = to_rust::try_capsule_array(&py_array);
215
216    match result {
217        Some(Ok(field_array)) => {
218            println!("  Imported into MinArrow: {} elements", field_array.array.len());
219
220            match &field_array.array {
221                Array::NumericArray(NumericArray::Int64(a)) => {
222                    let values: Vec<i64> = (0..a.len())
223                        .map(|i| a.get(i).unwrap_or(0))
224                        .collect();
225                    println!("  Values: {:?}", values);
226                }
227                other => println!("  Got type: {:?}", std::mem::discriminant(other)),
228            }
229        }
230        Some(Err(e)) => println!("  Import failed: {}", e),
231        None => println!("  __arrow_c_array__ not available on this object"),
232    }
233
234    println!("  Done.\n");
235    Ok(())
236}
237
238/// Import a PyArrow table into MinArrow via __arrow_c_stream__.
239///
240/// The stream protocol is ideal for tabular data - the consumer gets
241/// schema and batches through a single interface.
242fn example_4_import_from_pyarrow_table(py: Python<'_>) -> PyResult<()> {
243    println!("Example 4: Import PyArrow table -> MinArrow via __arrow_c_stream__");
244    println!("------------------------------------------------------------------");
245
246    let pyarrow = py.import("pyarrow")?;
247    let dict = vec![
248        (
249            "name",
250            pyarrow.call_method1("array", (vec!["Alice", "Bob", "Charlie"],))?,
251        ),
252        (
253            "age",
254            pyarrow.call_method1("array", (vec![30i64, 25, 35],))?,
255        ),
256    ]
257    .into_py_dict(py)?;
258
259    let py_table = pyarrow.call_method1("table", (dict,))?;
260    println!("  Created PyArrow table: name=['Alice','Bob','Charlie'], age=[30,25,35]");
261
262    let result = to_rust::try_capsule_record_batch_stream(&py_table);
263
264    match result {
265        Some(Ok((batches, _metadata))) => {
266            println!("  Imported {} batch(es) into MinArrow", batches.len());
267            for (batch_idx, batch) in batches.iter().enumerate() {
268                println!("  Batch {}: {} columns", batch_idx, batch.len());
269                for (col_idx, (array, field)) in batch.iter().enumerate() {
270                    println!(
271                        "    Column {} '{}': {} rows, type={:?}",
272                        col_idx,
273                        field.name,
274                        array.len(),
275                        field.dtype
276                    );
277                }
278            }
279        }
280        Some(Err(e)) => println!("  Import failed: {}", e),
281        None => println!("  __arrow_c_stream__ not available on this object"),
282    }
283
284    println!("  Done.\n");
285    Ok(())
286}