Skip to main content

polars_python/interop/numpy/
to_numpy_df.rs

1use ndarray::IntoDimension;
2use numpy::npyffi::flags;
3use numpy::{Element, IntoPyArray, PyArray1};
4use polars_core::prelude::*;
5use polars_core::utils::dtypes_to_supertype;
6use polars_core::with_match_physical_numeric_polars_type;
7use pyo3::exceptions::PyRuntimeError;
8use pyo3::prelude::*;
9use pyo3::types::{PyList, PyTuple};
10use pyo3::{IntoPyObjectExt, intern};
11
12use super::to_numpy_series::series_to_numpy;
13use super::utils::{
14    create_borrowed_np_array, dtype_supports_view, polars_dtype_to_np_temporal_dtype,
15};
16use crate::conversion::Wrap;
17use crate::dataframe::PyDataFrame;
18use crate::interned;
19use crate::utils::EnterPolarsExt;
20
21#[pymethods]
22impl PyDataFrame {
23    /// Convert this DataFrame to a NumPy ndarray.
24    fn to_numpy(
25        &self,
26        py: Python<'_>,
27        order: Wrap<IndexOrder>,
28        writable: bool,
29        allow_copy: bool,
30    ) -> PyResult<Py<PyAny>> {
31        df_to_numpy(py, &self.df.read(), order.0, writable, allow_copy)
32    }
33}
34
35pub(super) fn df_to_numpy(
36    py: Python<'_>,
37    df: &DataFrame,
38    order: IndexOrder,
39    writable: bool,
40    allow_copy: bool,
41) -> PyResult<Py<PyAny>> {
42    if df.shape_has_zero() {
43        if df.width() == 0 {
44            let shape = PyTuple::new(py, [df.height(), df.width()])?;
45            let numpy = super::utils::get_numpy_module(py)?;
46
47            return Ok(numpy
48                .call_method1(
49                    intern!(py, "zeros"),
50                    (shape, numpy.getattr(intern!(py, "int8"))?),
51                )?
52                .unbind());
53        }
54        // Take this path to ensure a writable array.
55        // This does not actually copy data for an empty DataFrame.
56        return df_to_numpy_with_copy(py, df, order, true);
57    }
58
59    if matches!(order, IndexOrder::Fortran) {
60        if let Some(mut arr) = try_df_to_numpy_view(py, df, false) {
61            if writable {
62                if !allow_copy {
63                    return Err(PyRuntimeError::new_err(
64                        "copy not allowed: cannot create a writable array without copying data",
65                    ));
66                }
67                arr = arr.call_method0(py, interned::COPY.get(py))?;
68            }
69            return Ok(arr);
70        }
71    }
72
73    if !allow_copy {
74        return Err(PyRuntimeError::new_err(
75            "copy not allowed: cannot convert to a NumPy array without copying data",
76        ));
77    }
78
79    df_to_numpy_with_copy(py, df, order, writable)
80}
81
82/// Create a NumPy view of the given DataFrame.
83fn try_df_to_numpy_view(py: Python<'_>, df: &DataFrame, allow_nulls: bool) -> Option<Py<PyAny>> {
84    let first_dtype = check_df_dtypes_support_view(df)?;
85
86    // TODO: Check for nested nulls using `series_contains_null` util when we support Array types.
87    if !allow_nulls && df.columns().iter().any(|s| s.null_count() > 0) {
88        return None;
89    }
90    if !check_df_columns_contiguous(df) {
91        return None;
92    }
93
94    let owner = PyDataFrame::from(df.clone()).into_py_any(py).ok()?; // Keep the DataFrame memory alive.
95
96    let arr = match first_dtype {
97        dt if dt.is_primitive_numeric() => {
98            with_match_physical_numpy_polars_type!(first_dtype, |$T| {
99                numeric_df_to_numpy_view::<$T>(py, df, owner)
100            })
101        },
102        DataType::Datetime(_, _) | DataType::Duration(_) => {
103            temporal_df_to_numpy_view(py, df, owner)
104        },
105        _ => unreachable!(),
106    };
107    Some(arr)
108}
109/// Check whether the data types of the DataFrame allow for creating a NumPy view.
110///
111/// Returns the common data type if it is supported, otherwise returns `None`.
112fn check_df_dtypes_support_view(df: &DataFrame) -> Option<&DataType> {
113    let columns = df.columns();
114    let first_dtype = columns.first()?.dtype();
115
116    // TODO: Support viewing Array types
117    if first_dtype.is_array() || !dtype_supports_view(first_dtype) {
118        return None;
119    }
120    if columns.iter().any(|s| s.dtype() != first_dtype) {
121        return None;
122    }
123    Some(first_dtype)
124}
125/// Returns whether all columns of the dataframe are contiguous in memory.
126fn check_df_columns_contiguous(df: &DataFrame) -> bool {
127    let columns = df.columns();
128
129    if columns
130        .iter()
131        .any(|s| s.as_materialized_series().n_chunks() > 1)
132    {
133        return false;
134    }
135    if columns.len() <= 1 {
136        return true;
137    }
138
139    match columns.first().unwrap().dtype() {
140        dt if dt.is_primitive_numeric() => {
141            with_match_physical_numeric_polars_type!(dt, |$T| {
142                let slices = columns
143                    .iter()
144                    .map(|s| {
145                        let ca: &ChunkedArray<$T> = s.as_materialized_series().unpack().unwrap();
146                        ca.data_views().next().unwrap()
147                    })
148                    .collect::<Vec<_>>();
149
150                check_slices_contiguous::<$T>(slices)
151            })
152        },
153        DataType::Datetime(_, _) | DataType::Duration(_) => {
154            let phys: Vec<_> = columns.iter().map(|s| s.to_physical_repr()).collect();
155            let slices = phys
156                .iter()
157                .map(|s| {
158                    let ca = s.i64().unwrap();
159                    ca.data_views().next().unwrap()
160                })
161                .collect::<Vec<_>>();
162
163            check_slices_contiguous::<Int64Type>(slices)
164        },
165        _ => panic!("invalid data type"),
166    }
167}
168/// Returns whether the end and start pointers of all consecutive slices match.
169fn check_slices_contiguous<T>(slices: Vec<&[T::Native]>) -> bool
170where
171    T: PolarsNumericType,
172{
173    let first_slice = slices.first().unwrap();
174
175    // Check whether all arrays are from the same buffer.
176    let mut end_ptr = unsafe { first_slice.as_ptr().add(first_slice.len()) };
177    slices[1..].iter().all(|slice| {
178        let slice_ptr = slice.as_ptr();
179        let valid = std::ptr::eq(slice_ptr, end_ptr);
180
181        end_ptr = unsafe { slice_ptr.add(slice.len()) };
182
183        valid
184    })
185}
186
187/// Create a NumPy view of a numeric DataFrame.
188fn numeric_df_to_numpy_view<T>(py: Python<'_>, df: &DataFrame, owner: Py<PyAny>) -> Py<PyAny>
189where
190    T: PolarsNumericType,
191    T::Native: Element,
192{
193    let ca: &ChunkedArray<T> = df
194        .columns()
195        .first()
196        .unwrap()
197        .as_materialized_series()
198        .unpack()
199        .unwrap();
200    let first_slice = ca.data_views().next().unwrap();
201
202    let start_ptr = first_slice.as_ptr();
203    let np_dtype = T::Native::get_dtype(py);
204    let dims = [first_slice.len(), df.width()].into_dimension();
205
206    unsafe {
207        create_borrowed_np_array::<_>(
208            py,
209            np_dtype,
210            dims,
211            flags::NPY_ARRAY_FARRAY_RO,
212            start_ptr as _,
213            owner,
214        )
215    }
216}
217/// Create a NumPy view of a Datetime or Duration DataFrame.
218fn temporal_df_to_numpy_view(py: Python<'_>, df: &DataFrame, owner: Py<PyAny>) -> Py<PyAny> {
219    let s = df.columns().first().unwrap();
220    let phys = s.to_physical_repr();
221    let ca = phys.i64().unwrap();
222    let first_slice = ca.data_views().next().unwrap();
223
224    let start_ptr = first_slice.as_ptr();
225    let np_dtype = polars_dtype_to_np_temporal_dtype(py, s.dtype());
226    let dims = [first_slice.len(), df.width()].into_dimension();
227
228    unsafe {
229        create_borrowed_np_array::<_>(
230            py,
231            np_dtype,
232            dims,
233            flags::NPY_ARRAY_FARRAY_RO,
234            start_ptr as _,
235            owner,
236        )
237    }
238}
239
240fn df_to_numpy_with_copy(
241    py: Python<'_>,
242    df: &DataFrame,
243    order: IndexOrder,
244    writable: bool,
245) -> PyResult<Py<PyAny>> {
246    if let Some(arr) = try_df_to_numpy_numeric_supertype(py, df, order) {
247        Ok(arr)
248    } else {
249        df_columns_to_numpy(py, df, order, writable)
250    }
251}
252fn try_df_to_numpy_numeric_supertype(
253    py: Python<'_>,
254    df: &DataFrame,
255    order: IndexOrder,
256) -> Option<Py<PyAny>> {
257    let st = dtypes_to_supertype(df.columns().iter().map(|s| s.dtype())).ok()?;
258
259    let np_array = match st {
260        dt if dt.is_primitive_numeric() => with_match_physical_numpy_polars_type!(dt, |$T| {
261            let arr = py.enter_polars(|| df.to_ndarray::<$T>(order)).ok()?;
262            arr.into_pyarray(py).into_py_any(py).ok()?
263        }),
264        _ => return None,
265    };
266    Some(np_array)
267}
268
269fn df_columns_to_numpy(
270    py: Python<'_>,
271    df: &DataFrame,
272    order: IndexOrder,
273    writable: bool,
274) -> PyResult<Py<PyAny>> {
275    let np_arrays = df.columns().iter().map(|c| {
276        let mut arr = series_to_numpy(py, c.as_materialized_series(), writable, true).unwrap();
277
278        // Convert multidimensional arrays to 1D object arrays.
279        let shape: Vec<usize> = arr
280            .getattr(py, interned::SHAPE.get(py))
281            .unwrap()
282            .extract(py)
283            .unwrap();
284        if shape.len() > 1 {
285            // TODO: Downcast the NumPy array to Rust and split without calling into Python.
286            let subarrays = (0..shape[0]).map(|idx| {
287                arr.call_method1(py, interned::DUNDER_GETITEM.get(py), (idx,))
288                    .unwrap()
289            });
290            arr = PyArray1::from_iter(py, subarrays).into_py_any(py).unwrap();
291        }
292        arr
293    });
294
295    let numpy = super::utils::get_numpy_module(py)?;
296    let np_array = match order {
297        IndexOrder::C => numpy
298            .getattr(intern!(py, "column_stack"))?
299            .call1((PyList::new(py, np_arrays)?,))?,
300        IndexOrder::Fortran => numpy
301            .getattr(intern!(py, "vstack"))?
302            .call1((PyList::new(py, np_arrays)?,))?
303            .getattr(intern!(py, "T"))?,
304    };
305
306    Ok(np_array.into())
307}