polars_python/interop/numpy/
to_numpy_df.rs

1use ndarray::IntoDimension;
2use numpy::npyffi::flags;
3use numpy::{Element, IntoPyArray, PyArray1};
4use polars_core::prelude::*;
5use polars_core::utils::dtypes_to_supertype;
6use polars_core::with_match_physical_numeric_polars_type;
7use pyo3::exceptions::PyRuntimeError;
8use pyo3::prelude::*;
9use pyo3::types::PyList;
10use pyo3::{IntoPyObjectExt, intern};
11
12use super::to_numpy_series::series_to_numpy;
13use super::utils::{
14    create_borrowed_np_array, dtype_supports_view, polars_dtype_to_np_temporal_dtype,
15};
16use crate::conversion::Wrap;
17use crate::dataframe::PyDataFrame;
18
19#[pymethods]
20impl PyDataFrame {
21    /// Convert this DataFrame to a NumPy ndarray.
22    fn to_numpy(
23        &self,
24        py: Python<'_>,
25        order: Wrap<IndexOrder>,
26        writable: bool,
27        allow_copy: bool,
28    ) -> PyResult<PyObject> {
29        df_to_numpy(py, &self.df, order.0, writable, allow_copy)
30    }
31}
32
33pub(super) fn df_to_numpy(
34    py: Python<'_>,
35    df: &DataFrame,
36    order: IndexOrder,
37    writable: bool,
38    allow_copy: bool,
39) -> PyResult<PyObject> {
40    if df.is_empty() {
41        // Take this path to ensure a writable array.
42        // This does not actually copy data for an empty DataFrame.
43        return df_to_numpy_with_copy(py, df, order, true);
44    }
45
46    if matches!(order, IndexOrder::Fortran) {
47        if let Some(mut arr) = try_df_to_numpy_view(py, df, false) {
48            if writable {
49                if !allow_copy {
50                    return Err(PyRuntimeError::new_err(
51                        "copy not allowed: cannot create a writable array without copying data",
52                    ));
53                }
54                arr = arr.call_method0(py, intern!(py, "copy"))?;
55            }
56            return Ok(arr);
57        }
58    }
59
60    if !allow_copy {
61        return Err(PyRuntimeError::new_err(
62            "copy not allowed: cannot convert to a NumPy array without copying data",
63        ));
64    }
65
66    df_to_numpy_with_copy(py, df, order, writable)
67}
68
69/// Create a NumPy view of the given DataFrame.
70fn try_df_to_numpy_view(py: Python<'_>, df: &DataFrame, allow_nulls: bool) -> Option<PyObject> {
71    let first_dtype = check_df_dtypes_support_view(df)?;
72
73    // TODO: Check for nested nulls using `series_contains_null` util when we support Array types.
74    if !allow_nulls && df.get_columns().iter().any(|s| s.null_count() > 0) {
75        return None;
76    }
77    if !check_df_columns_contiguous(df) {
78        return None;
79    }
80
81    let owner = PyDataFrame::from(df.clone()).into_py_any(py).ok()?; // Keep the DataFrame memory alive.
82
83    let arr = match first_dtype {
84        dt if dt.is_primitive_numeric() => {
85            with_match_physical_numpy_polars_type!(first_dtype, |$T| {
86                numeric_df_to_numpy_view::<$T>(py, df, owner)
87            })
88        },
89        DataType::Datetime(_, _) | DataType::Duration(_) => {
90            temporal_df_to_numpy_view(py, df, owner)
91        },
92        _ => unreachable!(),
93    };
94    Some(arr)
95}
96/// Check whether the data types of the DataFrame allow for creating a NumPy view.
97///
98/// Returns the common data type if it is supported, otherwise returns `None`.
99fn check_df_dtypes_support_view(df: &DataFrame) -> Option<&DataType> {
100    let columns = df.get_columns();
101    let first_dtype = columns.first()?.dtype();
102
103    // TODO: Support viewing Array types
104    if first_dtype.is_array() || !dtype_supports_view(first_dtype) {
105        return None;
106    }
107    if columns.iter().any(|s| s.dtype() != first_dtype) {
108        return None;
109    }
110    Some(first_dtype)
111}
112/// Returns whether all columns of the dataframe are contiguous in memory.
113fn check_df_columns_contiguous(df: &DataFrame) -> bool {
114    let columns = df.get_columns();
115
116    if columns
117        .iter()
118        .any(|s| s.as_materialized_series().n_chunks() > 1)
119    {
120        return false;
121    }
122    if columns.len() <= 1 {
123        return true;
124    }
125
126    match columns.first().unwrap().dtype() {
127        dt if dt.is_primitive_numeric() => {
128            with_match_physical_numeric_polars_type!(dt, |$T| {
129                let slices = columns
130                    .iter()
131                    .map(|s| {
132                        let ca: &ChunkedArray<$T> = s.as_materialized_series().unpack().unwrap();
133                        ca.data_views().next().unwrap()
134                    })
135                    .collect::<Vec<_>>();
136
137                check_slices_contiguous::<$T>(slices)
138            })
139        },
140        DataType::Datetime(_, _) | DataType::Duration(_) => {
141            let phys: Vec<_> = columns.iter().map(|s| s.to_physical_repr()).collect();
142            let slices = phys
143                .iter()
144                .map(|s| {
145                    let ca = s.i64().unwrap();
146                    ca.data_views().next().unwrap()
147                })
148                .collect::<Vec<_>>();
149
150            check_slices_contiguous::<Int64Type>(slices)
151        },
152        _ => panic!("invalid data type"),
153    }
154}
155/// Returns whether the end and start pointers of all consecutive slices match.
156fn check_slices_contiguous<T>(slices: Vec<&[T::Native]>) -> bool
157where
158    T: PolarsNumericType,
159{
160    let first_slice = slices.first().unwrap();
161
162    // Check whether all arrays are from the same buffer.
163    let mut end_ptr = unsafe { first_slice.as_ptr().add(first_slice.len()) };
164    slices[1..].iter().all(|slice| {
165        let slice_ptr = slice.as_ptr();
166        let valid = std::ptr::eq(slice_ptr, end_ptr);
167
168        end_ptr = unsafe { slice_ptr.add(slice.len()) };
169
170        valid
171    })
172}
173
174/// Create a NumPy view of a numeric DataFrame.
175fn numeric_df_to_numpy_view<T>(py: Python<'_>, df: &DataFrame, owner: PyObject) -> PyObject
176where
177    T: PolarsNumericType,
178    T::Native: Element,
179{
180    let ca: &ChunkedArray<T> = df
181        .get_columns()
182        .first()
183        .unwrap()
184        .as_materialized_series()
185        .unpack()
186        .unwrap();
187    let first_slice = ca.data_views().next().unwrap();
188
189    let start_ptr = first_slice.as_ptr();
190    let np_dtype = T::Native::get_dtype(py);
191    let dims = [first_slice.len(), df.width()].into_dimension();
192
193    unsafe {
194        create_borrowed_np_array::<_>(
195            py,
196            np_dtype,
197            dims,
198            flags::NPY_ARRAY_FARRAY_RO,
199            start_ptr as _,
200            owner,
201        )
202    }
203}
204/// Create a NumPy view of a Datetime or Duration DataFrame.
205fn temporal_df_to_numpy_view(py: Python<'_>, df: &DataFrame, owner: PyObject) -> PyObject {
206    let s = df.get_columns().first().unwrap();
207    let phys = s.to_physical_repr();
208    let ca = phys.i64().unwrap();
209    let first_slice = ca.data_views().next().unwrap();
210
211    let start_ptr = first_slice.as_ptr();
212    let np_dtype = polars_dtype_to_np_temporal_dtype(py, s.dtype());
213    let dims = [first_slice.len(), df.width()].into_dimension();
214
215    unsafe {
216        create_borrowed_np_array::<_>(
217            py,
218            np_dtype,
219            dims,
220            flags::NPY_ARRAY_FARRAY_RO,
221            start_ptr as _,
222            owner,
223        )
224    }
225}
226
227fn df_to_numpy_with_copy(
228    py: Python<'_>,
229    df: &DataFrame,
230    order: IndexOrder,
231    writable: bool,
232) -> PyResult<PyObject> {
233    if let Some(arr) = try_df_to_numpy_numeric_supertype(py, df, order) {
234        Ok(arr)
235    } else {
236        df_columns_to_numpy(py, df, order, writable)
237    }
238}
239fn try_df_to_numpy_numeric_supertype(
240    py: Python<'_>,
241    df: &DataFrame,
242    order: IndexOrder,
243) -> Option<PyObject> {
244    let st = dtypes_to_supertype(df.iter().map(|s| s.dtype())).ok()?;
245
246    let np_array = match st {
247        dt if dt.is_primitive_numeric() => with_match_physical_numpy_polars_type!(dt, |$T| {
248            df.to_ndarray::<$T>(order).ok()?.into_pyarray(py).into_py_any(py).ok()?
249        }),
250        _ => return None,
251    };
252    Some(np_array)
253}
254
255fn df_columns_to_numpy(
256    py: Python<'_>,
257    df: &DataFrame,
258    order: IndexOrder,
259    writable: bool,
260) -> PyResult<PyObject> {
261    let np_arrays = df.iter().map(|s| {
262        let mut arr = series_to_numpy(py, s, writable, true).unwrap();
263
264        // Convert multidimensional arrays to 1D object arrays.
265        let shape: Vec<usize> = arr
266            .getattr(py, intern!(py, "shape"))
267            .unwrap()
268            .extract(py)
269            .unwrap();
270        if shape.len() > 1 {
271            // TODO: Downcast the NumPy array to Rust and split without calling into Python.
272            let subarrays = (0..shape[0]).map(|idx| {
273                arr.call_method1(py, intern!(py, "__getitem__"), (idx,))
274                    .unwrap()
275            });
276            arr = PyArray1::from_iter(py, subarrays).into_py_any(py).unwrap();
277        }
278        arr
279    });
280
281    let numpy = PyModule::import(py, intern!(py, "numpy"))?;
282    let np_array = match order {
283        IndexOrder::C => numpy
284            .getattr(intern!(py, "column_stack"))?
285            .call1((PyList::new(py, np_arrays)?,))?,
286        IndexOrder::Fortran => numpy
287            .getattr(intern!(py, "vstack"))?
288            .call1((PyList::new(py, np_arrays)?,))?
289            .getattr(intern!(py, "T"))?,
290    };
291
292    Ok(np_array.into())
293}