// df_interchange/lib.rs
#![allow(clippy::useless_transmute)]
#![allow(dead_code)]
#![allow(unused_macros)]

//! # DataFrame Interchange
//!
//! This crate allows for seamless interoperability between any version of [Polars (>=0.40)](https://docs.rs/polars/latest/polars/) and any version of [Arrow (>=50)](https://docs.rs/arrow/latest/arrow/), including between versions of the same crate (e.g. `Polars 0.40` to `Polars 0.46`), using the [Arrow C Data Interchange](https://arrow.apache.org/docs/format/CDataInterface.html) format.
//!
//! Supported versions:
//! * Arrow: "50", "51", "52", "53", "54"
//! * Polars: "0.40", "0.41", "0.42", "0.43", "0.44", "0.45", "0.46"
//!
//! ## Polars and Arrow Rust ecosystem
//!
//! Since both `Polars` (pre-1.0) and `Arrow` make SemVer-breaking API updates in Rust every few months, the crates in the Rust ecosystem that depend on them update at different rates and are consistently incompatible with each other (e.g. one crate outputs `Polars 0.45` and another crate takes `Polars 0.43` as input). For crates that take these types as input or provide them as output, updating should be considered an API break and requires a major version bump. This has a cascading effect over the whole ecosystem.
//!
//! For example, attempting to pass `Polars 0.45` to a crate that uses `Polars 0.43`, or vice versa, will give an [error\[E0308\]: mismatched types](https://doc.rust-lang.org/error_codes/E0308.html) error with the note "'DataFrame' and 'DataFrame' have similar names, but are actually distinct types".
//!
//! This crate is meant to solve the interoperability issue and prevent the need for the entirety of the ecosystem to update at the same speed.
//!
//! ## Usage
//!
//! Enable the correct features (e.g. `Polars 0.43`, `Polars 0.46` and `Arrow 54`):
//!
//! ```toml
//! [dependencies]
//! polars = "0.43"
//! arrow = "54"
//! df-interchange = { version = "0.1.0", features = ["polars_0_43", "polars_0_46", "arrow_54"] }
//! ```
//! Then use the `from_polars_0_43`, `from_arrow_54` and `to_polars_0_46` methods of `Interchange` to convert between types:
//!
//! ```no_run
//! use std::sync::Arc;
//! use arrow::{array::{ArrayRef, Int32Array, Int64Array}, record_batch::RecordBatch}; // Arrow 54
//! use polars::prelude::*; // Polars 0.43
//! use df_interchange::Interchange;
//!
//! // Create Polars 0.43 data
//! let polars_0_43 = DataFrame::new(vec![
//!     Series::new("test_i32".into(), [-1i32, 0, 1]),
//!     Series::new("test_i64".into(), [-1i64, 0, 1]),
//! ])
//! .unwrap();
//!
//! // Create arrow 54 data
//! let arrow_54: Vec<_> = vec![RecordBatch::try_from_iter(vec![
//!     ("test_i32", Arc::new(Int32Array::from(vec![-1i32, 0, 1])) as ArrayRef),
//!     ("test_i64", Arc::new(Int64Array::from(vec![-1i64, 0, 1])) as ArrayRef),
//! ])
//! .unwrap()];
//!
//! // Convert Polars 0.43 to Polars 0.46
//! let df_polars = Interchange::from_polars_0_43(polars_0_43)?.to_polars_0_46()?;
//!
//! // Convert Arrow 54 to Polars 0.46
//! let df_arrow = Interchange::from_arrow_54(arrow_54)?.to_polars_0_46()?;
//!
//! // Compare the two DataFrames (not possible prior to conversion to Polars 0.46)
//! assert!(df_polars.equals_missing(&df_arrow));
//!
//! ```
//! ## Technical info
//!
//! ### Features
//!
//! Since Rust features are [additive](https://doc.rust-lang.org/cargo/reference/features.html#feature-unification), you can enable features on Arrow or Polars crates by adding them to your own `Cargo.toml`.
//!
//! For example, you can enable the `lazy` feature on the Polars version you receive from `df-interchange`.
//!
//! ```toml
//! [dependencies]
//! polars = { version = "0.46", features = ["lazy"] }
//! polars_old = { package = "polars", version = "0.45", features = ["lazy"] }
//! df-interchange = { version = "0.1.0", features = ["polars_0_45", "polars_0_46"] }
//! ```
//!
//! Because `.lazy()` is provided by the `IntoLazy` trait for `DataFrame`, and both Polars versions define that trait, you have to [disambiguate the trait](https://doc.rust-lang.org/rust-by-example/trait/disambiguating.html) when calling it on the older version, using `<polars_old::prelude::DataFrame as polars_old::prelude::IntoLazy>::lazy(df)`:
//!
//! ```no_run
//! use df_interchange::Interchange;
//! use polars::prelude::*;
//!
//! let df_0_46 = DataFrame::new(vec![
//!     Column::new("test_i32".into(), [1i32, 2, 3, 4]),
//!     Column::new("test_i64".into(), [1i64, 2, 3, 4]),
//! ])
//! .unwrap()
//! .lazy();
//!
//! let df_0_45 = Interchange::from_polars_0_46(df_0_46.collect().unwrap())?.to_polars_0_45()?;
//!
//! let lf = <polars_old::prelude::DataFrame as polars_old::prelude::IntoLazy>::lazy(df_0_45);
//! ```
//!
//! During conversion, you may encounter errors based on data type conversions enabled by features. For example, if you convert a column of `i8` from a `Polars 0.46` that enables the `dtype-i8` feature to a `Polars 0.43` that does not enable `dtype-i8`, you will get an `Error(ComputeError(ErrString("cannot create series from Int8")))`. You can enable this feature on both versions of the crate to solve the issue.
//!
//! ```toml
//! [dependencies]
//! polars = { version = "0.46", features = ["dtype-i8"] }
//! polars_0_43 = { package = "polars", version = "0.43", features = ["dtype-i8"] }
//! df-interchange = { version = "0.1.0", features = ["polars_0_43", "polars_0_46"] }
//! ```
//!
//! ```no_run
//! use polars::prelude::*; // Polars 0.46
//! use df_interchange::Interchange;
//!
//! let df_0_46 = DataFrame::new(vec![
//!     Column::new("test_i8".into(), [1i8, 2, 3, 4]),
//!     Column::new("test_i64".into(), [1i64, 2, 3, 4]),
//! ])
//! .unwrap();
//!
//! let df_0_43 = Interchange::from_polars_0_46(df_0_46)?.to_polars_0_43()?;
//! ```
//!
// Error type of the crate; the module itself stays private and only
// `InterchangeError` is re-exported at the crate root.
mod error;
pub use error::InterchangeError;

// Compiled only when at least one supported Arrow version feature is enabled.
#[cfg(any(
    feature = "arrow_50",
    feature = "arrow_51",
    feature = "arrow_52",
    feature = "arrow_53",
    feature = "arrow_54"
))]
mod from_arrow;

// Compiled only when at least one supported Polars version feature is enabled.
#[cfg(any(
    feature = "polars_0_40",
    feature = "polars_0_41",
    feature = "polars_0_42",
    feature = "polars_0_43",
    feature = "polars_0_44",
    feature = "polars_0_45",
    feature = "polars_0_46"
))]
mod from_polars;

// Same Arrow feature gate as `from_arrow`.
#[cfg(any(
    feature = "arrow_50",
    feature = "arrow_51",
    feature = "arrow_52",
    feature = "arrow_53",
    feature = "arrow_54"
))]
mod to_arrow;

// Same Polars feature gate as `from_polars`.
#[cfg(any(
    feature = "polars_0_40",
    feature = "polars_0_41",
    feature = "polars_0_42",
    feature = "polars_0_43",
    feature = "polars_0_44",
    feature = "polars_0_45",
    feature = "polars_0_46"
))]
mod to_polars;

/// Crate-local mirror of the `ArrowArray` struct from the Arrow C Data
/// Interface.
///
/// The struct is `#[repr(C)]`, so the fields are laid out in declaration
/// order with C layout rules. The field order and types below follow the
/// C definition in the Arrow specification and must not be reordered.
#[repr(C)]
struct ArrowArray {
    /// Logical length of the array (per the Arrow specification).
    length: i64,
    /// Number of null items, as defined by the Arrow specification.
    null_count: i64,
    /// Logical offset into the buffers.
    offset: i64,
    /// Number of entries behind `buffers`.
    n_buffers: i64,
    /// Number of entries behind `children`.
    n_children: i64,
    /// Pointer to an array of untyped buffer pointers.
    buffers: *mut *const std::ffi::c_void,
    /// Pointer to an array of pointers to child arrays.
    children: *mut *mut Self,
    /// Pointer to the dictionary array, if any.
    dictionary: *mut Self,
    /// C callback taking a pointer to this struct; `None` maps to a null
    /// function pointer.
    release: Option<unsafe extern "C" fn(*mut Self)>,
    /// Opaque pointer reserved for the producer of the struct.
    private_data: *mut std::ffi::c_void,
}

/// Crate-local mirror of the `ArrowSchema` struct from the Arrow C Data
/// Interface.
///
/// The struct is `#[repr(C)]`, so the fields are laid out in declaration
/// order with C layout rules. The field order and types below follow the
/// C definition in the Arrow specification and must not be reordered.
#[repr(C)]
struct ArrowSchema {
    /// C string describing the data type (format string in the Arrow
    /// specification).
    format: *const std::ffi::c_char,
    /// C string holding the field or array name.
    name: *const std::ffi::c_char,
    /// Pointer to the metadata blob defined by the Arrow specification.
    metadata: *const std::ffi::c_char,
    /// Bitfield of flags defined by the Arrow specification.
    flags: i64,
    /// Number of entries behind `children`.
    n_children: i64,
    /// Pointer to an array of pointers to child schemas.
    children: *mut *mut Self,
    /// Pointer to the dictionary schema, if any.
    dictionary: *mut Self,
    /// C callback taking a pointer to this struct; `None` maps to a null
    /// function pointer.
    release: Option<unsafe extern "C" fn(*mut Self)>,
    /// Opaque pointer reserved for the producer of the struct.
    private_data: *mut std::ffi::c_void,
}

/// Intermediate, version-independent container used to move data between
/// Polars and Arrow versions.
///
/// As shown in the crate-level examples, a value is created with one of the
/// `from_*` constructors (e.g. `Interchange::from_polars_0_43`) and turned
/// into a concrete type with one of the `to_*` methods
/// (e.g. `to_polars_0_46`).
pub struct Interchange {
    // Named entries, each paired with a list of C Data Interface
    // array/schema structs.
    // NOTE(review): the `String` is presumably the column name and the inner
    // `Vec` presumably holds one (array, schema) pair per chunk — confirm
    // against the `from_*` / `to_*` modules.
    ffi: Vec<(String, Vec<(ArrowArray, ArrowSchema)>)>,
}