Skip to main content

polars_python/interop/numpy/
to_numpy_df.rs

1use ndarray::IntoDimension;
2use numpy::npyffi::flags;
3use numpy::{Element, IntoPyArray, PyArray1};
4use polars_core::prelude::*;
5use polars_core::utils::dtypes_to_supertype;
6use polars_core::with_match_physical_numeric_polars_type;
7use pyo3::exceptions::PyRuntimeError;
8use pyo3::prelude::*;
9use pyo3::types::{PyList, PyTuple};
10use pyo3::{IntoPyObjectExt, intern};
11
12use super::to_numpy_series::series_to_numpy;
13use super::utils::{
14    create_borrowed_np_array, dtype_supports_view, polars_dtype_to_np_temporal_dtype,
15};
16use crate::conversion::Wrap;
17use crate::dataframe::PyDataFrame;
18
19#[pymethods]
20impl PyDataFrame {
21    /// Convert this DataFrame to a NumPy ndarray.
22    fn to_numpy(
23        &self,
24        py: Python<'_>,
25        order: Wrap<IndexOrder>,
26        writable: bool,
27        allow_copy: bool,
28    ) -> PyResult<Py<PyAny>> {
29        df_to_numpy(py, &self.df.read(), order.0, writable, allow_copy)
30    }
31}
32
33pub(super) fn df_to_numpy(
34    py: Python<'_>,
35    df: &DataFrame,
36    order: IndexOrder,
37    writable: bool,
38    allow_copy: bool,
39) -> PyResult<Py<PyAny>> {
40    if df.shape_has_zero() {
41        if df.width() == 0 {
42            let shape = PyTuple::new(py, [df.height(), df.width()])?;
43            let numpy = super::utils::get_numpy_module(py)?;
44
45            return Ok(numpy
46                .call_method1(
47                    intern!(py, "zeros"),
48                    (shape, numpy.getattr(intern!(py, "int8"))?),
49                )?
50                .unbind());
51        }
52        // Take this path to ensure a writable array.
53        // This does not actually copy data for an empty DataFrame.
54        return df_to_numpy_with_copy(py, df, order, true);
55    }
56
57    if matches!(order, IndexOrder::Fortran) {
58        if let Some(mut arr) = try_df_to_numpy_view(py, df, false) {
59            if writable {
60                if !allow_copy {
61                    return Err(PyRuntimeError::new_err(
62                        "copy not allowed: cannot create a writable array without copying data",
63                    ));
64                }
65                arr = arr.call_method0(py, intern!(py, "copy"))?;
66            }
67            return Ok(arr);
68        }
69    }
70
71    if !allow_copy {
72        return Err(PyRuntimeError::new_err(
73            "copy not allowed: cannot convert to a NumPy array without copying data",
74        ));
75    }
76
77    df_to_numpy_with_copy(py, df, order, writable)
78}
79
80/// Create a NumPy view of the given DataFrame.
81fn try_df_to_numpy_view(py: Python<'_>, df: &DataFrame, allow_nulls: bool) -> Option<Py<PyAny>> {
82    let first_dtype = check_df_dtypes_support_view(df)?;
83
84    // TODO: Check for nested nulls using `series_contains_null` util when we support Array types.
85    if !allow_nulls && df.columns().iter().any(|s| s.null_count() > 0) {
86        return None;
87    }
88    if !check_df_columns_contiguous(df) {
89        return None;
90    }
91
92    let owner = PyDataFrame::from(df.clone()).into_py_any(py).ok()?; // Keep the DataFrame memory alive.
93
94    let arr = match first_dtype {
95        dt if dt.is_primitive_numeric() => {
96            with_match_physical_numpy_polars_type!(first_dtype, |$T| {
97                numeric_df_to_numpy_view::<$T>(py, df, owner)
98            })
99        },
100        DataType::Datetime(_, _) | DataType::Duration(_) => {
101            temporal_df_to_numpy_view(py, df, owner)
102        },
103        _ => unreachable!(),
104    };
105    Some(arr)
106}
107/// Check whether the data types of the DataFrame allow for creating a NumPy view.
108///
109/// Returns the common data type if it is supported, otherwise returns `None`.
110fn check_df_dtypes_support_view(df: &DataFrame) -> Option<&DataType> {
111    let columns = df.columns();
112    let first_dtype = columns.first()?.dtype();
113
114    // TODO: Support viewing Array types
115    if first_dtype.is_array() || !dtype_supports_view(first_dtype) {
116        return None;
117    }
118    if columns.iter().any(|s| s.dtype() != first_dtype) {
119        return None;
120    }
121    Some(first_dtype)
122}
123/// Returns whether all columns of the dataframe are contiguous in memory.
124fn check_df_columns_contiguous(df: &DataFrame) -> bool {
125    let columns = df.columns();
126
127    if columns
128        .iter()
129        .any(|s| s.as_materialized_series().n_chunks() > 1)
130    {
131        return false;
132    }
133    if columns.len() <= 1 {
134        return true;
135    }
136
137    match columns.first().unwrap().dtype() {
138        dt if dt.is_primitive_numeric() => {
139            with_match_physical_numeric_polars_type!(dt, |$T| {
140                let slices = columns
141                    .iter()
142                    .map(|s| {
143                        let ca: &ChunkedArray<$T> = s.as_materialized_series().unpack().unwrap();
144                        ca.data_views().next().unwrap()
145                    })
146                    .collect::<Vec<_>>();
147
148                check_slices_contiguous::<$T>(slices)
149            })
150        },
151        DataType::Datetime(_, _) | DataType::Duration(_) => {
152            let phys: Vec<_> = columns.iter().map(|s| s.to_physical_repr()).collect();
153            let slices = phys
154                .iter()
155                .map(|s| {
156                    let ca = s.i64().unwrap();
157                    ca.data_views().next().unwrap()
158                })
159                .collect::<Vec<_>>();
160
161            check_slices_contiguous::<Int64Type>(slices)
162        },
163        _ => panic!("invalid data type"),
164    }
165}
166/// Returns whether the end and start pointers of all consecutive slices match.
167fn check_slices_contiguous<T>(slices: Vec<&[T::Native]>) -> bool
168where
169    T: PolarsNumericType,
170{
171    let first_slice = slices.first().unwrap();
172
173    // Check whether all arrays are from the same buffer.
174    let mut end_ptr = unsafe { first_slice.as_ptr().add(first_slice.len()) };
175    slices[1..].iter().all(|slice| {
176        let slice_ptr = slice.as_ptr();
177        let valid = std::ptr::eq(slice_ptr, end_ptr);
178
179        end_ptr = unsafe { slice_ptr.add(slice.len()) };
180
181        valid
182    })
183}
184
185/// Create a NumPy view of a numeric DataFrame.
186fn numeric_df_to_numpy_view<T>(py: Python<'_>, df: &DataFrame, owner: Py<PyAny>) -> Py<PyAny>
187where
188    T: PolarsNumericType,
189    T::Native: Element,
190{
191    let ca: &ChunkedArray<T> = df
192        .columns()
193        .first()
194        .unwrap()
195        .as_materialized_series()
196        .unpack()
197        .unwrap();
198    let first_slice = ca.data_views().next().unwrap();
199
200    let start_ptr = first_slice.as_ptr();
201    let np_dtype = T::Native::get_dtype(py);
202    let dims = [first_slice.len(), df.width()].into_dimension();
203
204    unsafe {
205        create_borrowed_np_array::<_>(
206            py,
207            np_dtype,
208            dims,
209            flags::NPY_ARRAY_FARRAY_RO,
210            start_ptr as _,
211            owner,
212        )
213    }
214}
215/// Create a NumPy view of a Datetime or Duration DataFrame.
216fn temporal_df_to_numpy_view(py: Python<'_>, df: &DataFrame, owner: Py<PyAny>) -> Py<PyAny> {
217    let s = df.columns().first().unwrap();
218    let phys = s.to_physical_repr();
219    let ca = phys.i64().unwrap();
220    let first_slice = ca.data_views().next().unwrap();
221
222    let start_ptr = first_slice.as_ptr();
223    let np_dtype = polars_dtype_to_np_temporal_dtype(py, s.dtype());
224    let dims = [first_slice.len(), df.width()].into_dimension();
225
226    unsafe {
227        create_borrowed_np_array::<_>(
228            py,
229            np_dtype,
230            dims,
231            flags::NPY_ARRAY_FARRAY_RO,
232            start_ptr as _,
233            owner,
234        )
235    }
236}
237
238fn df_to_numpy_with_copy(
239    py: Python<'_>,
240    df: &DataFrame,
241    order: IndexOrder,
242    writable: bool,
243) -> PyResult<Py<PyAny>> {
244    if let Some(arr) = try_df_to_numpy_numeric_supertype(py, df, order) {
245        Ok(arr)
246    } else {
247        df_columns_to_numpy(py, df, order, writable)
248    }
249}
250fn try_df_to_numpy_numeric_supertype(
251    py: Python<'_>,
252    df: &DataFrame,
253    order: IndexOrder,
254) -> Option<Py<PyAny>> {
255    let st = dtypes_to_supertype(df.columns().iter().map(|s| s.dtype())).ok()?;
256
257    let np_array = match st {
258        dt if dt.is_primitive_numeric() => with_match_physical_numpy_polars_type!(dt, |$T| {
259            df.to_ndarray::<$T>(order).ok()?.into_pyarray(py).into_py_any(py).ok()?
260        }),
261        _ => return None,
262    };
263    Some(np_array)
264}
265
266fn df_columns_to_numpy(
267    py: Python<'_>,
268    df: &DataFrame,
269    order: IndexOrder,
270    writable: bool,
271) -> PyResult<Py<PyAny>> {
272    let np_arrays = df.columns().iter().map(|c| {
273        let mut arr = series_to_numpy(py, c.as_materialized_series(), writable, true).unwrap();
274
275        // Convert multidimensional arrays to 1D object arrays.
276        let shape: Vec<usize> = arr
277            .getattr(py, intern!(py, "shape"))
278            .unwrap()
279            .extract(py)
280            .unwrap();
281        if shape.len() > 1 {
282            // TODO: Downcast the NumPy array to Rust and split without calling into Python.
283            let subarrays = (0..shape[0]).map(|idx| {
284                arr.call_method1(py, intern!(py, "__getitem__"), (idx,))
285                    .unwrap()
286            });
287            arr = PyArray1::from_iter(py, subarrays).into_py_any(py).unwrap();
288        }
289        arr
290    });
291
292    let numpy = super::utils::get_numpy_module(py)?;
293    let np_array = match order {
294        IndexOrder::C => numpy
295            .getattr(intern!(py, "column_stack"))?
296            .call1((PyList::new(py, np_arrays)?,))?,
297        IndexOrder::Fortran => numpy
298            .getattr(intern!(py, "vstack"))?
299            .call1((PyList::new(py, np_arrays)?,))?
300            .getattr(intern!(py, "T"))?,
301    };
302
303    Ok(np_array.into())
304}