polars_python/dataframe/
export.rs

1use arrow::datatypes::IntegerType;
2use arrow::record_batch::RecordBatch;
3use polars::prelude::*;
4use polars_compute::cast::CastOptionsImpl;
5use pyo3::IntoPyObjectExt;
6use pyo3::prelude::*;
7use pyo3::types::{PyCapsule, PyList, PyTuple};
8
9use super::PyDataFrame;
10use crate::conversion::{ObjectValue, Wrap};
11use crate::error::PyPolarsErr;
12use crate::interop;
13use crate::interop::arrow::to_py::dataframe_to_stream;
14use crate::prelude::PyCompatLevel;
15use crate::utils::EnterPolarsExt;
16
17#[pymethods]
18impl PyDataFrame {
19    #[cfg(feature = "object")]
20    pub fn row_tuple<'py>(&self, idx: i64, py: Python<'py>) -> PyResult<Bound<'py, PyTuple>> {
21        let idx = if idx < 0 {
22            (self.df.height() as i64 + idx) as usize
23        } else {
24            idx as usize
25        };
26        if idx >= self.df.height() {
27            return Err(PyPolarsErr::from(polars_err!(oob = idx, self.df.height())).into());
28        }
29        PyTuple::new(
30            py,
31            self.df.get_columns().iter().map(|s| match s.dtype() {
32                DataType::Object(_) => {
33                    let obj: Option<&ObjectValue> = s.get_object(idx).map(|any| any.into());
34                    obj.into_py_any(py).unwrap()
35                },
36                _ => Wrap(s.get(idx).unwrap()).into_py_any(py).unwrap(),
37            }),
38        )
39    }
40
41    #[cfg(feature = "object")]
42    pub fn row_tuples<'py>(&self, py: Python<'py>) -> PyResult<Bound<'py, PyList>> {
43        let mut rechunked;
44        // Rechunk if random access would become rather expensive.
45        // TODO: iterate over the chunks directly instead of using random access.
46        let df = if self.df.max_n_chunks() > 16 {
47            rechunked = self.df.clone();
48            rechunked.as_single_chunk_par();
49            &rechunked
50        } else {
51            &self.df
52        };
53        PyList::new(
54            py,
55            (0..df.height()).map(|idx| {
56                PyTuple::new(
57                    py,
58                    df.get_columns().iter().map(|c| match c.dtype() {
59                        DataType::Null => py.None(),
60                        DataType::Object(_) => {
61                            let obj: Option<&ObjectValue> = c.get_object(idx).map(|any| any.into());
62                            obj.into_py_any(py).unwrap()
63                        },
64                        _ => {
65                            // SAFETY: we are in bounds.
66                            let av = unsafe { c.get_unchecked(idx) };
67                            Wrap(av).into_py_any(py).unwrap()
68                        },
69                    }),
70                )
71                .unwrap()
72            }),
73        )
74    }
75
76    #[allow(clippy::wrong_self_convention)]
77    pub fn to_arrow(
78        &mut self,
79        py: Python<'_>,
80        compat_level: PyCompatLevel,
81    ) -> PyResult<Vec<PyObject>> {
82        py.enter_polars_ok(|| self.df.align_chunks_par())?;
83        let pyarrow = py.import("pyarrow")?;
84
85        let rbs = self
86            .df
87            .iter_chunks(compat_level.0, true)
88            .map(|rb| interop::arrow::to_py::to_py_rb(&rb, py, &pyarrow))
89            .collect::<PyResult<_>>()?;
90        Ok(rbs)
91    }
92
93    /// Create a `Vec` of PyArrow RecordBatch instances.
94    ///
95    /// Note this will give bad results for columns with dtype `pl.Object`,
96    /// since those can't be converted correctly via PyArrow. The calling Python
97    /// code should make sure these are not included.
98    #[allow(clippy::wrong_self_convention)]
99    pub fn to_pandas(&mut self, py: Python) -> PyResult<Vec<PyObject>> {
100        py.enter_polars_ok(|| self.df.as_single_chunk_par())?;
101        Python::with_gil(|py| {
102            let pyarrow = py.import("pyarrow")?;
103            let cat_columns = self
104                .df
105                .get_columns()
106                .iter()
107                .enumerate()
108                .filter(|(_i, s)| {
109                    matches!(
110                        s.dtype(),
111                        DataType::Categorical(_, _) | DataType::Enum(_, _)
112                    )
113                })
114                .map(|(i, _)| i)
115                .collect::<Vec<_>>();
116
117            let enum_and_categorical_dtype = ArrowDataType::Dictionary(
118                IntegerType::Int64,
119                Box::new(ArrowDataType::LargeUtf8),
120                false,
121            );
122
123            let mut replaced_schema = None;
124            let rbs = self
125                .df
126                .iter_chunks(CompatLevel::oldest(), true)
127                .map(|rb| {
128                    let length = rb.len();
129                    let (schema, mut arrays) = rb.into_schema_and_arrays();
130
131                    // Pandas does not allow unsigned dictionary indices so we replace them.
132                    replaced_schema =
133                        (replaced_schema.is_none() && !cat_columns.is_empty()).then(|| {
134                            let mut schema = schema.as_ref().clone();
135                            for i in &cat_columns {
136                                let (_, field) = schema.get_at_index_mut(*i).unwrap();
137                                field.dtype = enum_and_categorical_dtype.clone();
138                            }
139                            Arc::new(schema)
140                        });
141
142                    for i in &cat_columns {
143                        let arr = arrays.get_mut(*i).unwrap();
144                        let out = polars_compute::cast::cast(
145                            &**arr,
146                            &enum_and_categorical_dtype,
147                            CastOptionsImpl::default(),
148                        )
149                        .unwrap();
150                        *arr = out;
151                    }
152                    let schema = replaced_schema
153                        .as_ref()
154                        .map_or(schema, |replaced| replaced.clone());
155                    let rb = RecordBatch::new(length, schema, arrays);
156
157                    interop::arrow::to_py::to_py_rb(&rb, py, &pyarrow)
158                })
159                .collect::<PyResult<_>>()?;
160            Ok(rbs)
161        })
162    }
163
164    #[allow(unused_variables)]
165    #[pyo3(signature = (requested_schema=None))]
166    fn __arrow_c_stream__<'py>(
167        &mut self,
168        py: Python<'py>,
169        requested_schema: Option<PyObject>,
170    ) -> PyResult<Bound<'py, PyCapsule>> {
171        py.enter_polars_ok(|| self.df.align_chunks_par())?;
172        dataframe_to_stream(&self.df, py)
173    }
174}