polars_python/dataframe/
general.rs

1use std::hash::BuildHasher;
2
3use arrow::bitmap::MutableBitmap;
4use either::Either;
5use polars::prelude::*;
6use polars_ffi::version_0::SeriesExport;
7#[cfg(feature = "pivot")]
8use polars_lazy::frame::pivot::{pivot, pivot_stable};
9use pyo3::IntoPyObjectExt;
10use pyo3::exceptions::PyIndexError;
11use pyo3::prelude::*;
12use pyo3::pybacked::PyBackedStr;
13use pyo3::types::{PyList, PyType};
14
15use self::row_encode::{_get_rows_encoded_ca, _get_rows_encoded_ca_unordered};
16use super::PyDataFrame;
17use crate::conversion::Wrap;
18use crate::error::PyPolarsErr;
19use crate::map::dataframe::{
20    apply_lambda_unknown, apply_lambda_with_bool_out_type, apply_lambda_with_primitive_out_type,
21    apply_lambda_with_string_out_type,
22};
23use crate::prelude::strings_to_pl_smallstr;
24use crate::py_modules::polars;
25use crate::series::{PySeries, ToPySeries, ToSeries};
26use crate::utils::EnterPolarsExt;
27use crate::{PyExpr, PyLazyFrame};
28
29#[pymethods]
30impl PyDataFrame {
31    #[new]
32    pub fn __init__(columns: Vec<PySeries>) -> PyResult<Self> {
33        let columns = columns.to_series();
34        // @scalar-opt
35        let columns = columns.into_iter().map(|s| s.into()).collect();
36        let df = DataFrame::new(columns).map_err(PyPolarsErr::from)?;
37        Ok(PyDataFrame::new(df))
38    }
39
40    pub fn estimated_size(&self) -> usize {
41        self.df.estimated_size()
42    }
43
44    pub fn dtype_strings(&self) -> Vec<String> {
45        self.df
46            .get_columns()
47            .iter()
48            .map(|s| format!("{}", s.dtype()))
49            .collect()
50    }
51
52    pub fn add(&self, py: Python<'_>, s: &PySeries) -> PyResult<Self> {
53        py.enter_polars_df(|| &self.df + &s.series)
54    }
55
56    pub fn sub(&self, py: Python<'_>, s: &PySeries) -> PyResult<Self> {
57        py.enter_polars_df(|| &self.df - &s.series)
58    }
59
60    pub fn mul(&self, py: Python<'_>, s: &PySeries) -> PyResult<Self> {
61        py.enter_polars_df(|| &self.df * &s.series)
62    }
63
64    pub fn div(&self, py: Python<'_>, s: &PySeries) -> PyResult<Self> {
65        py.enter_polars_df(|| &self.df / &s.series)
66    }
67
68    pub fn rem(&self, py: Python<'_>, s: &PySeries) -> PyResult<Self> {
69        py.enter_polars_df(|| &self.df % &s.series)
70    }
71
72    pub fn add_df(&self, py: Python<'_>, s: &Self) -> PyResult<Self> {
73        py.enter_polars_df(|| &self.df + &s.df)
74    }
75
76    pub fn sub_df(&self, py: Python<'_>, s: &Self) -> PyResult<Self> {
77        py.enter_polars_df(|| &self.df - &s.df)
78    }
79
80    pub fn mul_df(&self, py: Python<'_>, s: &Self) -> PyResult<Self> {
81        py.enter_polars_df(|| &self.df * &s.df)
82    }
83
84    pub fn div_df(&self, py: Python<'_>, s: &Self) -> PyResult<Self> {
85        py.enter_polars_df(|| &self.df / &s.df)
86    }
87
88    pub fn rem_df(&self, py: Python<'_>, s: &Self) -> PyResult<Self> {
89        py.enter_polars_df(|| &self.df % &s.df)
90    }
91
92    #[pyo3(signature = (n, with_replacement, shuffle, seed=None))]
93    pub fn sample_n(
94        &self,
95        py: Python<'_>,
96        n: &PySeries,
97        with_replacement: bool,
98        shuffle: bool,
99        seed: Option<u64>,
100    ) -> PyResult<Self> {
101        py.enter_polars_df(|| self.df.sample_n(&n.series, with_replacement, shuffle, seed))
102    }
103
104    #[pyo3(signature = (frac, with_replacement, shuffle, seed=None))]
105    pub fn sample_frac(
106        &self,
107        py: Python<'_>,
108        frac: &PySeries,
109        with_replacement: bool,
110        shuffle: bool,
111        seed: Option<u64>,
112    ) -> PyResult<Self> {
113        py.enter_polars_df(|| {
114            self.df
115                .sample_frac(&frac.series, with_replacement, shuffle, seed)
116        })
117    }
118
119    pub fn rechunk(&self, py: Python) -> PyResult<Self> {
120        py.enter_polars_df(|| {
121            let mut df = self.df.clone();
122            df.as_single_chunk_par();
123            Ok(df)
124        })
125    }
126
127    /// Format `DataFrame` as String
128    pub fn as_str(&self) -> String {
129        format!("{:?}", self.df)
130    }
131
132    pub fn get_columns(&self) -> Vec<PySeries> {
133        let cols = self.df.get_columns().to_vec();
134        cols.to_pyseries()
135    }
136
137    /// Get column names
138    pub fn columns(&self) -> Vec<&str> {
139        self.df.get_column_names_str()
140    }
141
142    /// set column names
143    pub fn set_column_names(&mut self, names: Vec<PyBackedStr>) -> PyResult<()> {
144        self.df
145            .set_column_names(names.iter().map(|x| &**x))
146            .map_err(PyPolarsErr::from)?;
147        Ok(())
148    }
149
150    /// Get datatypes
151    pub fn dtypes<'py>(&self, py: Python<'py>) -> PyResult<Bound<'py, PyList>> {
152        let iter = self
153            .df
154            .iter()
155            .map(|s| Wrap(s.dtype().clone()).into_pyobject(py).unwrap());
156        PyList::new(py, iter)
157    }
158
159    pub fn n_chunks(&self) -> usize {
160        self.df.first_col_n_chunks()
161    }
162
163    pub fn shape(&self) -> (usize, usize) {
164        self.df.shape()
165    }
166
167    pub fn height(&self) -> usize {
168        self.df.height()
169    }
170
171    pub fn width(&self) -> usize {
172        self.df.width()
173    }
174
175    pub fn is_empty(&self) -> bool {
176        self.df.is_empty()
177    }
178
179    pub fn hstack(&self, py: Python<'_>, columns: Vec<PySeries>) -> PyResult<Self> {
180        let columns = columns.to_series();
181        // @scalar-opt
182        let columns = columns.into_iter().map(Into::into).collect::<Vec<_>>();
183        py.enter_polars_df(|| self.df.hstack(&columns))
184    }
185
186    pub fn hstack_mut(&mut self, py: Python<'_>, columns: Vec<PySeries>) -> PyResult<()> {
187        let columns = columns.to_series();
188        // @scalar-opt
189        let columns = columns.into_iter().map(Into::into).collect::<Vec<_>>();
190        py.enter_polars(|| self.df.hstack_mut(&columns))?;
191        Ok(())
192    }
193
194    pub fn vstack(&self, py: Python<'_>, other: &PyDataFrame) -> PyResult<Self> {
195        py.enter_polars_df(|| self.df.vstack(&other.df))
196    }
197
198    pub fn vstack_mut(&mut self, py: Python<'_>, other: &PyDataFrame) -> PyResult<()> {
199        py.enter_polars(|| self.df.vstack_mut(&other.df))?;
200        Ok(())
201    }
202
203    pub fn extend(&mut self, py: Python<'_>, other: &PyDataFrame) -> PyResult<()> {
204        py.enter_polars(|| self.df.extend(&other.df))?;
205        Ok(())
206    }
207
208    pub fn drop_in_place(&mut self, name: &str) -> PyResult<PySeries> {
209        let s = self.df.drop_in_place(name).map_err(PyPolarsErr::from)?;
210        let s = s.take_materialized_series();
211        Ok(PySeries { series: s })
212    }
213
214    pub fn to_series(&self, index: isize) -> PyResult<PySeries> {
215        let df = &self.df;
216
217        let index_adjusted = if index < 0 {
218            df.width().checked_sub(index.unsigned_abs())
219        } else {
220            Some(usize::try_from(index).unwrap())
221        };
222
223        let s = index_adjusted.and_then(|i| df.select_at_idx(i));
224        match s {
225            Some(s) => Ok(PySeries::new(s.as_materialized_series().clone())),
226            None => Err(PyIndexError::new_err(
227                polars_err!(oob = index, df.width()).to_string(),
228            )),
229        }
230    }
231
232    pub fn get_column_index(&self, name: &str) -> PyResult<usize> {
233        Ok(self
234            .df
235            .try_get_column_index(name)
236            .map_err(PyPolarsErr::from)?)
237    }
238
239    pub fn get_column(&self, name: &str) -> PyResult<PySeries> {
240        let series = self
241            .df
242            .column(name)
243            .map(|s| PySeries::new(s.as_materialized_series().clone()))
244            .map_err(PyPolarsErr::from)?;
245        Ok(series)
246    }
247
248    pub fn select(&self, py: Python<'_>, columns: Vec<PyBackedStr>) -> PyResult<Self> {
249        py.enter_polars_df(|| self.df.select(columns.iter().map(|x| &**x)))
250    }
251
252    pub fn gather(&self, py: Python<'_>, indices: Wrap<Vec<IdxSize>>) -> PyResult<Self> {
253        let indices = indices.0;
254        let indices = IdxCa::from_vec("".into(), indices);
255        py.enter_polars_df(|| self.df.take(&indices))
256    }
257
258    pub fn gather_with_series(&self, py: Python<'_>, indices: &PySeries) -> PyResult<Self> {
259        let indices = indices.series.idx().map_err(PyPolarsErr::from)?;
260        py.enter_polars_df(|| self.df.take(indices))
261    }
262
263    pub fn replace(&mut self, column: &str, new_col: PySeries) -> PyResult<()> {
264        self.df
265            .replace(column, new_col.series)
266            .map_err(PyPolarsErr::from)?;
267        Ok(())
268    }
269
270    pub fn replace_column(&mut self, index: usize, new_column: PySeries) -> PyResult<()> {
271        self.df
272            .replace_column(index, new_column.series)
273            .map_err(PyPolarsErr::from)?;
274        Ok(())
275    }
276
277    pub fn insert_column(&mut self, index: usize, column: PySeries) -> PyResult<()> {
278        self.df
279            .insert_column(index, column.series)
280            .map_err(PyPolarsErr::from)?;
281        Ok(())
282    }
283
284    #[pyo3(signature = (offset, length=None))]
285    pub fn slice(&self, py: Python<'_>, offset: i64, length: Option<usize>) -> PyResult<Self> {
286        py.enter_polars_df(|| {
287            Ok(self
288                .df
289                .slice(offset, length.unwrap_or_else(|| self.df.height())))
290        })
291    }
292
293    pub fn head(&self, py: Python<'_>, n: usize) -> PyResult<Self> {
294        py.enter_polars_df(|| Ok(self.df.head(Some(n))))
295    }
296
297    pub fn tail(&self, py: Python<'_>, n: usize) -> PyResult<Self> {
298        py.enter_polars_df(|| Ok(self.df.tail(Some(n))))
299    }
300
301    pub fn is_unique(&self, py: Python) -> PyResult<PySeries> {
302        py.enter_polars_series(|| self.df.is_unique())
303    }
304
305    pub fn is_duplicated(&self, py: Python) -> PyResult<PySeries> {
306        py.enter_polars_series(|| self.df.is_duplicated())
307    }
308
309    pub fn equals(&self, py: Python<'_>, other: &PyDataFrame, null_equal: bool) -> PyResult<bool> {
310        if null_equal {
311            py.enter_polars_ok(|| self.df.equals_missing(&other.df))
312        } else {
313            py.enter_polars_ok(|| self.df.equals(&other.df))
314        }
315    }
316
317    #[pyo3(signature = (name, offset=None))]
318    pub fn with_row_index(
319        &self,
320        py: Python<'_>,
321        name: &str,
322        offset: Option<IdxSize>,
323    ) -> PyResult<Self> {
324        py.enter_polars_df(|| self.df.with_row_index(name.into(), offset))
325    }
326
327    pub fn _to_metadata(&self) -> Self {
328        Self {
329            df: self.df._to_metadata(),
330        }
331    }
332
333    pub fn group_by_map_groups(
334        &self,
335        by: Vec<PyBackedStr>,
336        lambda: PyObject,
337        maintain_order: bool,
338    ) -> PyResult<Self> {
339        let gb = if maintain_order {
340            self.df.group_by_stable(by.iter().map(|x| &**x))
341        } else {
342            self.df.group_by(by.iter().map(|x| &**x))
343        }
344        .map_err(PyPolarsErr::from)?;
345
346        let function = move |df: DataFrame| {
347            Python::with_gil(|py| {
348                let pypolars = polars(py).bind(py);
349                let pydf = PyDataFrame::new(df);
350                let python_df_wrapper =
351                    pypolars.getattr("wrap_df").unwrap().call1((pydf,)).unwrap();
352
353                // Call the lambda and get a python-side DataFrame wrapper.
354                let result_df_wrapper = match lambda.call1(py, (python_df_wrapper,)) {
355                    Ok(pyobj) => pyobj,
356                    Err(e) => panic!("UDF failed: {}", e.value(py)),
357                };
358                let py_pydf = result_df_wrapper.getattr(py, "_df").expect(
359                    "Could not get DataFrame attribute '_df'. Make sure that you return a DataFrame object.",
360                );
361
362                let pydf = py_pydf.extract::<PyDataFrame>(py).unwrap();
363                Ok(pydf.df)
364            })
365        };
366        // We don't use `py.allow_threads(|| gb.par_apply(..)` because that segfaulted
367        // due to code related to Pyo3 or rayon, cannot reproduce it in native polars.
368        // So we lose parallelism, but it doesn't really matter because we are GIL bound anyways
369        // and this function should not be used in idiomatic polars anyway.
370        let df = gb.apply(function).map_err(PyPolarsErr::from)?;
371
372        Ok(df.into())
373    }
374
375    #[allow(clippy::should_implement_trait)]
376    pub fn clone(&self) -> Self {
377        PyDataFrame::new(self.df.clone())
378    }
379
380    #[cfg(feature = "pivot")]
381    #[pyo3(signature = (on, index, value_name=None, variable_name=None))]
382    pub fn unpivot(
383        &self,
384        py: Python<'_>,
385        on: Vec<PyBackedStr>,
386        index: Vec<PyBackedStr>,
387        value_name: Option<&str>,
388        variable_name: Option<&str>,
389    ) -> PyResult<Self> {
390        use polars_ops::pivot::UnpivotDF;
391        let args = UnpivotArgsIR {
392            on: strings_to_pl_smallstr(on),
393            index: strings_to_pl_smallstr(index),
394            value_name: value_name.map(|s| s.into()),
395            variable_name: variable_name.map(|s| s.into()),
396        };
397
398        py.enter_polars_df(|| self.df.unpivot2(args))
399    }
400
401    #[cfg(feature = "pivot")]
402    #[pyo3(signature = (on, index, values, maintain_order, sort_columns, aggregate_expr, separator))]
403    pub fn pivot_expr(
404        &self,
405        py: Python<'_>,
406        on: Vec<String>,
407        index: Option<Vec<String>>,
408        values: Option<Vec<String>>,
409        maintain_order: bool,
410        sort_columns: bool,
411        aggregate_expr: Option<PyExpr>,
412        separator: Option<&str>,
413    ) -> PyResult<Self> {
414        let fun = if maintain_order { pivot_stable } else { pivot };
415        let agg_expr = aggregate_expr.map(|expr| expr.inner);
416        py.enter_polars_df(|| {
417            fun(
418                &self.df,
419                on,
420                index,
421                values,
422                sort_columns,
423                agg_expr,
424                separator,
425            )
426        })
427    }
428
429    pub fn partition_by(
430        &self,
431        py: Python<'_>,
432        by: Vec<String>,
433        maintain_order: bool,
434        include_key: bool,
435    ) -> PyResult<Vec<Self>> {
436        let out = py.enter_polars(|| {
437            if maintain_order {
438                self.df.partition_by_stable(by, include_key)
439            } else {
440                self.df.partition_by(by, include_key)
441            }
442        })?;
443
444        // SAFETY: PyDataFrame is a repr(transparent) DataFrame.
445        Ok(unsafe { std::mem::transmute::<Vec<DataFrame>, Vec<PyDataFrame>>(out) })
446    }
447
448    pub fn lazy(&self) -> PyLazyFrame {
449        self.df.clone().lazy().into()
450    }
451
452    #[pyo3(signature = (columns, separator, drop_first=false))]
453    pub fn to_dummies(
454        &self,
455        py: Python<'_>,
456        columns: Option<Vec<String>>,
457        separator: Option<&str>,
458        drop_first: bool,
459    ) -> PyResult<Self> {
460        py.enter_polars_df(|| match columns {
461            Some(cols) => self.df.columns_to_dummies(
462                cols.iter().map(|x| x as &str).collect(),
463                separator,
464                drop_first,
465            ),
466            None => self.df.to_dummies(separator, drop_first),
467        })
468    }
469
470    pub fn null_count(&self, py: Python) -> PyResult<Self> {
471        py.enter_polars_df(|| Ok(self.df.null_count()))
472    }
473
474    #[pyo3(signature = (lambda, output_type, inference_size))]
475    pub fn map_rows(
476        &mut self,
477        lambda: Bound<PyAny>,
478        output_type: Option<Wrap<DataType>>,
479        inference_size: usize,
480    ) -> PyResult<(PyObject, bool)> {
481        Python::with_gil(|py| {
482            // needed for series iter
483            self.df.as_single_chunk_par();
484            let df = &self.df;
485
486            use apply_lambda_with_primitive_out_type as apply;
487            #[rustfmt::skip]
488            let out = match output_type.map(|dt| dt.0) {
489                Some(DataType::Int32) => apply::<Int32Type>(df, py, lambda, 0, None)?.into_series(),
490                Some(DataType::Int64) => apply::<Int64Type>(df, py, lambda, 0, None)?.into_series(),
491                Some(DataType::UInt32) => apply::<UInt32Type>(df, py, lambda, 0, None)?.into_series(),
492                Some(DataType::UInt64) => apply::<UInt64Type>(df, py, lambda, 0, None)?.into_series(),
493                Some(DataType::Float32) => apply::<Float32Type>(df, py, lambda, 0, None)?.into_series(),
494                Some(DataType::Float64) => apply::<Float64Type>(df, py, lambda, 0, None)?.into_series(),
495                Some(DataType::Date) => apply::<Int32Type>(df, py, lambda, 0, None)?.into_date().into_series(),
496                Some(DataType::Datetime(tu, tz)) => apply::<Int64Type>(df, py, lambda, 0, None)?.into_datetime(tu, tz).into_series(),
497                Some(DataType::Boolean) => apply_lambda_with_bool_out_type(df, py, lambda, 0, None)?.into_series(),
498                Some(DataType::String) => apply_lambda_with_string_out_type(df, py, lambda, 0, None)?.into_series(),
499                _ => return apply_lambda_unknown(df, py, lambda, inference_size),
500            };
501
502            Ok((PySeries::from(out).into_py_any(py)?, false))
503        })
504    }
505
506    pub fn shrink_to_fit(&mut self, py: Python) -> PyResult<()> {
507        py.enter_polars_ok(|| self.df.shrink_to_fit())
508    }
509
510    pub fn hash_rows(
511        &mut self,
512        py: Python<'_>,
513        k0: u64,
514        k1: u64,
515        k2: u64,
516        k3: u64,
517    ) -> PyResult<PySeries> {
518        // TODO: don't expose all these seeds.
519        let seed = PlFixedStateQuality::default().hash_one((k0, k1, k2, k3));
520        let hb = PlSeedableRandomStateQuality::seed_from_u64(seed);
521        py.enter_polars_series(|| self.df.hash_rows(Some(hb)))
522    }
523
524    #[pyo3(signature = (keep_names_as, column_names))]
525    pub fn transpose(
526        &mut self,
527        py: Python<'_>,
528        keep_names_as: Option<&str>,
529        column_names: &Bound<PyAny>,
530    ) -> PyResult<Self> {
531        let new_col_names = if let Ok(name) = column_names.extract::<Vec<String>>() {
532            Some(Either::Right(name))
533        } else if let Ok(name) = column_names.extract::<String>() {
534            Some(Either::Left(name))
535        } else {
536            None
537        };
538        py.enter_polars_df(|| self.df.transpose(keep_names_as, new_col_names))
539    }
540
541    pub fn upsample(
542        &self,
543        py: Python<'_>,
544        by: Vec<String>,
545        index_column: &str,
546        every: &str,
547        stable: bool,
548    ) -> PyResult<Self> {
549        let every = Duration::try_parse(every).map_err(PyPolarsErr::from)?;
550        py.enter_polars_df(|| {
551            if stable {
552                self.df.upsample_stable(by, index_column, every)
553            } else {
554                self.df.upsample(by, index_column, every)
555            }
556        })
557    }
558
559    pub fn to_struct(
560        &self,
561        py: Python<'_>,
562        name: &str,
563        invalid_indices: Vec<usize>,
564    ) -> PyResult<PySeries> {
565        py.enter_polars_series(|| {
566            let mut ca = self.df.clone().into_struct(name.into());
567
568            if !invalid_indices.is_empty() {
569                let mut validity = MutableBitmap::with_capacity(ca.len());
570                validity.extend_constant(ca.len(), true);
571                for i in invalid_indices {
572                    validity.set(i, false);
573                }
574                ca.rechunk_mut();
575                Ok(ca.with_outer_validity(Some(validity.freeze())))
576            } else {
577                Ok(ca)
578            }
579        })
580    }
581
582    pub fn clear(&self, py: Python) -> PyResult<Self> {
583        py.enter_polars_df(|| Ok(self.df.clear()))
584    }
585
    /// Export the columns via polars-ffi.
    ///
    /// One `SeriesExport` is written per column, in column order, into
    /// consecutive slots starting at the address `location`.
    ///
    /// # Safety
    /// Needs a preallocated *mut SeriesExport that has allocated space for n_columns.
    pub unsafe fn _export_columns(&mut self, location: usize) {
        use polars_ffi::version_0::export_column;

        let cols = self.df.get_columns();

        // The address arrives as a plain integer; reinterpret it as the destination pointer.
        let location = location as *mut SeriesExport;

        for (i, col) in cols.iter().enumerate() {
            let e = export_column(col);
            // SAFETY:
            // Caller should ensure address is allocated.
            // Be careful not to drop `e` here as that should be dropped by the ffi consumer
            unsafe { core::ptr::write(location.add(i), e) };
        }
    }
604
    /// Import [`Self`] via polars-ffi.
    ///
    /// The address and column count are handed to `import_df`; an error from it
    /// is returned as a Python error.
    ///
    /// # Safety
    /// `location` should be an address that contains `width` properly initialized
    /// [`SeriesExport`]s
    #[classmethod]
    pub unsafe fn _import_columns(
        _cls: &Bound<PyType>,
        location: usize,
        width: usize,
    ) -> PyResult<Self> {
        use polars_ffi::version_0::import_df;

        // The address arrives as a plain integer; reinterpret it as the source pointer.
        let location = location as *mut SeriesExport;

        // SAFETY: the caller guarantees `location` and `width` per the contract above.
        let df = unsafe { import_df(location, width) }.map_err(PyPolarsErr::from)?;
        Ok(PyDataFrame { df })
    }
622
623    /// Internal utility function to allow direct access to the row encoding from python.
624    #[pyo3(signature = (opts))]
625    fn _row_encode(&self, py: Python<'_>, opts: Vec<(bool, bool, bool)>) -> PyResult<PySeries> {
626        py.enter_polars_series(|| {
627            let name = PlSmallStr::from_static("row_enc");
628            let is_unordered = opts.first().is_some_and(|(_, _, v)| *v);
629
630            let ca = if is_unordered {
631                _get_rows_encoded_ca_unordered(name, self.df.get_columns())
632            } else {
633                let descending = opts.iter().map(|(v, _, _)| *v).collect::<Vec<_>>();
634                let nulls_last = opts.iter().map(|(_, v, _)| *v).collect::<Vec<_>>();
635
636                _get_rows_encoded_ca(
637                    name,
638                    self.df.get_columns(),
639                    descending.as_slice(),
640                    nulls_last.as_slice(),
641                )
642            }?;
643
644            Ok(ca)
645        })
646    }
647}