Skip to main content

polars_python/dataframe/
general.rs

1use std::hash::BuildHasher;
2
3use arrow::bitmap::MutableBitmap;
4use either::Either;
5use parking_lot::RwLock;
6use polars::prelude::*;
7use polars_ffi::version_0::SeriesExport;
8use pyo3::exceptions::PyIndexError;
9use pyo3::prelude::*;
10use pyo3::pybacked::PyBackedStr;
11use pyo3::types::{PyList, PyType};
12
13use self::row_encode::{_get_rows_encoded_ca, _get_rows_encoded_ca_unordered};
14use super::PyDataFrame;
15use crate::PyLazyFrame;
16use crate::conversion::Wrap;
17use crate::error::PyPolarsErr;
18use crate::prelude::strings_to_pl_smallstr;
19use crate::py_modules::polars;
20use crate::series::{PySeries, ToPySeries, ToSeries};
21use crate::utils::{EnterPolarsExt, to_py_err};
22
23#[pymethods]
24impl PyDataFrame {
25    #[new]
26    pub fn __init__(columns: Vec<PySeries>) -> PyResult<Self> {
27        let columns = columns.to_series();
28        // @scalar-opt
29        let columns = columns.into_iter().map(|s| s.into()).collect();
30        let df = DataFrame::new_infer_height(columns).map_err(PyPolarsErr::from)?;
31        Ok(PyDataFrame::new(df))
32    }
33
34    #[staticmethod]
35    pub fn empty_with_height(height: u64) -> PyResult<Self> {
36        Ok(PyDataFrame::new(DataFrame::empty_with_height(
37            IdxSize::try_from(height)
38                .map_err(|_| polars_err!(bigidx, ctx = "DataFrame(height = _)", size = height))
39                .map_err(to_py_err)? as usize,
40        )))
41    }
42
43    pub fn estimated_size(&self) -> usize {
44        self.df.read().estimated_size()
45    }
46
47    pub fn dtype_strings(&self) -> Vec<String> {
48        self.df
49            .read()
50            .columns()
51            .iter()
52            .map(|s| format!("{}", s.dtype()))
53            .collect()
54    }
55
56    pub fn add(&self, py: Python<'_>, s: &PySeries) -> PyResult<Self> {
57        py.enter_polars_df(|| &*self.df.read() + &*s.series.read())
58    }
59
60    pub fn sub(&self, py: Python<'_>, s: &PySeries) -> PyResult<Self> {
61        py.enter_polars_df(|| &*self.df.read() - &*s.series.read())
62    }
63
64    pub fn mul(&self, py: Python<'_>, s: &PySeries) -> PyResult<Self> {
65        py.enter_polars_df(|| &*self.df.read() * &*s.series.read())
66    }
67
68    pub fn div(&self, py: Python<'_>, s: &PySeries) -> PyResult<Self> {
69        py.enter_polars_df(|| &*self.df.read() / &*s.series.read())
70    }
71
72    pub fn rem(&self, py: Python<'_>, s: &PySeries) -> PyResult<Self> {
73        py.enter_polars_df(|| &*self.df.read() % &*s.series.read())
74    }
75
76    pub fn add_df(&self, py: Python<'_>, s: &Self) -> PyResult<Self> {
77        py.enter_polars_df(|| &*self.df.read() + &*s.df.read())
78    }
79
80    pub fn sub_df(&self, py: Python<'_>, s: &Self) -> PyResult<Self> {
81        py.enter_polars_df(|| &*self.df.read() - &*s.df.read())
82    }
83
84    pub fn mul_df(&self, py: Python<'_>, s: &Self) -> PyResult<Self> {
85        py.enter_polars_df(|| &*self.df.read() * &*s.df.read())
86    }
87
88    pub fn div_df(&self, py: Python<'_>, s: &Self) -> PyResult<Self> {
89        py.enter_polars_df(|| &*self.df.read() / &*s.df.read())
90    }
91
92    pub fn rem_df(&self, py: Python<'_>, s: &Self) -> PyResult<Self> {
93        py.enter_polars_df(|| &*self.df.read() % &*s.df.read())
94    }
95
96    #[pyo3(signature = (n, with_replacement, shuffle, seed=None))]
97    pub fn sample_n(
98        &self,
99        py: Python<'_>,
100        n: &PySeries,
101        with_replacement: bool,
102        shuffle: bool,
103        seed: Option<u64>,
104    ) -> PyResult<Self> {
105        py.enter_polars_df(|| {
106            self.df
107                .read()
108                .sample_n(&n.series.read(), with_replacement, shuffle, seed)
109        })
110    }
111
112    #[pyo3(signature = (frac, with_replacement, shuffle, seed=None))]
113    pub fn sample_frac(
114        &self,
115        py: Python<'_>,
116        frac: &PySeries,
117        with_replacement: bool,
118        shuffle: bool,
119        seed: Option<u64>,
120    ) -> PyResult<Self> {
121        py.enter_polars_df(|| {
122            self.df
123                .read()
124                .sample_frac(&frac.series.read(), with_replacement, shuffle, seed)
125        })
126    }
127
128    pub fn rechunk(&self, py: Python) -> PyResult<Self> {
129        py.enter_polars_df(|| {
130            let mut df = self.df.read().clone();
131            df.rechunk_mut_par();
132            Ok(df)
133        })
134    }
135
136    /// Format `DataFrame` as String
137    pub fn as_str(&self) -> String {
138        format!("{:?}", self.df.read())
139    }
140
141    pub fn get_columns(&self) -> Vec<PySeries> {
142        let cols = self.df.read().columns().to_vec();
143        cols.to_pyseries()
144    }
145
146    /// Get column names
147    pub fn columns(&self) -> Vec<String> {
148        self.df
149            .read()
150            .columns()
151            .iter()
152            .map(|s| s.name().to_string())
153            .collect()
154    }
155
156    /// set column names
157    pub fn set_column_names(&self, names: Vec<PyBackedStr>) -> PyResult<()> {
158        self.df
159            .write()
160            .set_column_names(&names)
161            .map_err(PyPolarsErr::from)?;
162        Ok(())
163    }
164
165    /// Get datatypes
166    pub fn dtypes<'py>(&self, py: Python<'py>) -> PyResult<Bound<'py, PyList>> {
167        let df = self.df.read();
168        let iter = df
169            .columns()
170            .iter()
171            .map(|s| Wrap(s.dtype().clone()).into_pyobject(py).unwrap());
172        PyList::new(py, iter)
173    }
174
175    pub fn n_chunks(&self) -> usize {
176        self.df.read().first_col_n_chunks()
177    }
178
179    pub fn shape(&self) -> (usize, usize) {
180        self.df.read().shape()
181    }
182
183    pub fn height(&self) -> usize {
184        self.df.read().height()
185    }
186
187    pub fn width(&self) -> usize {
188        self.df.read().width()
189    }
190
191    pub fn is_empty(&self) -> bool {
192        self.df.read().shape_has_zero()
193    }
194
195    pub fn hstack(&self, py: Python<'_>, columns: Vec<PySeries>) -> PyResult<Self> {
196        let columns = columns.to_series();
197        // @scalar-opt
198        let columns = columns.into_iter().map(Into::into).collect::<Vec<_>>();
199        py.enter_polars_df(|| self.df.read().hstack(&columns))
200    }
201
202    pub fn hstack_mut(&self, py: Python<'_>, columns: Vec<PySeries>) -> PyResult<()> {
203        let columns = columns.to_series();
204        // @scalar-opt
205        let columns = columns.into_iter().map(Into::into).collect::<Vec<_>>();
206        py.enter_polars(|| self.df.write().hstack_mut(&columns).map(drop))?;
207        Ok(())
208    }
209
210    pub fn vstack(&self, py: Python<'_>, other: &PyDataFrame) -> PyResult<Self> {
211        py.enter_polars_df(|| self.df.read().vstack(&other.df.read()))
212    }
213
214    pub fn vstack_mut(&self, py: Python<'_>, other: &PyDataFrame) -> PyResult<()> {
215        py.enter_polars(|| {
216            // Prevent self-vstack deadlocks.
217            let other = other.df.read().clone();
218            self.df.write().vstack_mut_owned(other)?;
219            PolarsResult::Ok(())
220        })?;
221        Ok(())
222    }
223
224    pub fn extend(&self, py: Python<'_>, other: &PyDataFrame) -> PyResult<()> {
225        py.enter_polars(|| {
226            // Prevent self-extend deadlocks.
227            let other = other.df.read().clone();
228            self.df.write().extend(&other)
229        })?;
230        Ok(())
231    }
232
233    pub fn drop_in_place(&self, name: &str) -> PyResult<PySeries> {
234        let s = self
235            .df
236            .write()
237            .drop_in_place(name)
238            .map_err(PyPolarsErr::from)?;
239        let s = s.take_materialized_series();
240        Ok(PySeries::from(s))
241    }
242
243    pub fn to_series(&self, index: isize) -> PyResult<PySeries> {
244        let df = &self.df.read();
245
246        let index_adjusted = if index < 0 {
247            df.width().checked_sub(index.unsigned_abs())
248        } else {
249            Some(usize::try_from(index).unwrap())
250        };
251
252        let s = index_adjusted.and_then(|i| df.select_at_idx(i));
253        match s {
254            Some(s) => Ok(PySeries::new(s.as_materialized_series().clone())),
255            None => Err(PyIndexError::new_err(
256                polars_err!(oob = index, df.width()).to_string(),
257            )),
258        }
259    }
260
261    pub fn get_column_index(&self, name: &str) -> PyResult<usize> {
262        Ok(self
263            .df
264            .read()
265            .try_get_column_index(name)
266            .map_err(PyPolarsErr::from)?)
267    }
268
269    pub fn get_column(&self, name: &str) -> PyResult<PySeries> {
270        let series = self
271            .df
272            .read()
273            .column(name)
274            .map(|s| PySeries::new(s.as_materialized_series().clone()))
275            .map_err(PyPolarsErr::from)?;
276        Ok(series)
277    }
278
279    pub fn select(&self, py: Python<'_>, columns: Vec<PyBackedStr>) -> PyResult<Self> {
280        py.enter_polars_df(|| self.df.read().select(columns.iter().map(|x| &**x)))
281    }
282
283    pub fn gather(&self, py: Python<'_>, indices: Wrap<Vec<IdxSize>>) -> PyResult<Self> {
284        let indices = indices.0;
285        let indices = IdxCa::from_vec("".into(), indices);
286        py.enter_polars_df(|| self.df.read().take(&indices))
287    }
288
289    pub fn gather_with_series(&self, py: Python<'_>, indices: &PySeries) -> PyResult<Self> {
290        let idx_s = indices.series.read();
291        let indices = idx_s.idx().map_err(PyPolarsErr::from)?;
292        py.enter_polars_df(|| self.df.read().take(indices))
293    }
294
295    pub fn replace(&self, column: &str, new_col: PySeries) -> PyResult<()> {
296        self.df
297            .write()
298            .replace(column, new_col.series.into_inner().into_column())
299            .map_err(PyPolarsErr::from)?;
300        Ok(())
301    }
302
303    pub fn replace_column(&self, index: usize, new_column: PySeries) -> PyResult<()> {
304        self.df
305            .write()
306            .replace_column(index, new_column.series.into_inner().into_column())
307            .map_err(PyPolarsErr::from)?;
308        Ok(())
309    }
310
311    pub fn insert_column(&self, index: usize, column: PySeries) -> PyResult<()> {
312        self.df
313            .write()
314            .insert_column(index, column.series.into_inner().into_column())
315            .map_err(PyPolarsErr::from)?;
316        Ok(())
317    }
318
319    #[pyo3(signature = (offset, length))]
320    pub fn slice(&self, py: Python<'_>, offset: i64, length: Option<usize>) -> PyResult<Self> {
321        py.enter_polars_df(|| {
322            let df = self.df.read();
323            let len = length.unwrap_or(usize::MAX);
324            Ok(df.slice(offset, len))
325        })
326    }
327
328    pub fn head(&self, py: Python<'_>, n: usize) -> PyResult<Self> {
329        py.enter_polars_df(|| Ok(self.df.read().head(Some(n))))
330    }
331
332    pub fn tail(&self, py: Python<'_>, n: usize) -> PyResult<Self> {
333        py.enter_polars_df(|| Ok(self.df.read().tail(Some(n))))
334    }
335
336    pub fn is_unique(&self, py: Python) -> PyResult<PySeries> {
337        py.enter_polars_series(|| self.df.read().is_unique())
338    }
339
340    pub fn is_duplicated(&self, py: Python) -> PyResult<PySeries> {
341        py.enter_polars_series(|| self.df.read().is_duplicated())
342    }
343
344    pub fn equals(&self, py: Python<'_>, other: &PyDataFrame, null_equal: bool) -> PyResult<bool> {
345        if null_equal {
346            py.enter_polars_ok(|| self.df.read().equals_missing(&other.df.read()))
347        } else {
348            py.enter_polars_ok(|| self.df.read().equals(&other.df.read()))
349        }
350    }
351
352    #[pyo3(signature = (name, offset=None))]
353    pub fn with_row_index(
354        &self,
355        py: Python<'_>,
356        name: &str,
357        offset: Option<IdxSize>,
358    ) -> PyResult<Self> {
359        py.enter_polars_df(|| self.df.read().with_row_index(name.into(), offset))
360    }
361
362    pub fn _to_metadata(&self) -> Self {
363        Self {
364            df: RwLock::new(self.df.read()._to_metadata()),
365        }
366    }
367
368    pub fn group_by_map_groups(
369        &self,
370        py: Python<'_>,
371        by: Vec<PyBackedStr>,
372        lambda: Py<PyAny>,
373        maintain_order: bool,
374    ) -> PyResult<Self> {
375        py.enter_polars_df(|| {
376            let df = self.df.read().clone(); // Clone so we can't deadlock on re-entrance from lambda.
377            let gb = if maintain_order {
378                df.group_by_stable(by.iter().map(|x| &**x))
379            } else {
380                df.group_by(by.iter().map(|x| &**x))
381            }?;
382
383            let function = move |df: DataFrame| {
384                Python::attach(|py| {
385                    let pypolars = polars(py).bind(py);
386                    let pydf = PyDataFrame::new(df);
387                    let python_df_wrapper =
388                        pypolars.getattr("wrap_df").unwrap().call1((pydf,)).unwrap();
389
390                    // Call the lambda and get a python-side DataFrame wrapper.
391                    let result_df_wrapper = match lambda.call1(py, (python_df_wrapper,)) {
392                        Ok(pyobj) => pyobj,
393                        Err(e) => panic!("UDF failed: {}", e.value(py)),
394                    };
395                    let py_pydf = result_df_wrapper.getattr(py, "_df").expect(
396                        "Could not get DataFrame attribute '_df'. Make sure that you return a DataFrame object.",
397                    );
398
399                    let pydf = py_pydf.extract::<PyDataFrame>(py).unwrap();
400                    Ok(pydf.df.into_inner())
401                })
402            };
403
404            gb.apply(function)
405        })
406    }
407
408    #[allow(clippy::should_implement_trait)]
409    pub fn clone(&self) -> Self {
410        Clone::clone(self)
411    }
412
413    #[cfg(feature = "pivot")]
414    #[pyo3(signature = (on, index, value_name=None, variable_name=None))]
415    pub fn unpivot(
416        &self,
417        py: Python<'_>,
418        on: Option<Vec<PyBackedStr>>,
419        index: Vec<PyBackedStr>,
420        value_name: Option<&str>,
421        variable_name: Option<&str>,
422    ) -> PyResult<Self> {
423        use polars_ops::unpivot::UnpivotDF;
424        let args = UnpivotArgsIR::new(
425            self.df.read().get_column_names_owned(),
426            on.map(strings_to_pl_smallstr),
427            strings_to_pl_smallstr(index),
428            value_name.map(|s| s.into()),
429            variable_name.map(|s| s.into()),
430        );
431
432        py.enter_polars_df(|| self.df.read().unpivot2(args))
433    }
434
435    pub fn partition_by(
436        &self,
437        py: Python<'_>,
438        by: Vec<String>,
439        maintain_order: bool,
440        include_key: bool,
441    ) -> PyResult<Vec<Self>> {
442        let out = py.enter_polars(|| {
443            if maintain_order {
444                self.df.read().partition_by_stable(by, include_key)
445            } else {
446                self.df.read().partition_by(by, include_key)
447            }
448        })?;
449
450        Ok(out.into_iter().map(PyDataFrame::from).collect())
451    }
452
453    pub fn lazy(&self) -> PyLazyFrame {
454        self.df.read().clone().lazy().into()
455    }
456
457    #[pyo3(signature = (columns, separator, drop_first, drop_nulls))]
458    pub fn to_dummies(
459        &self,
460        py: Python<'_>,
461        columns: Option<Vec<String>>,
462        separator: Option<&str>,
463        drop_first: bool,
464        drop_nulls: bool,
465    ) -> PyResult<Self> {
466        py.enter_polars_df(|| match columns {
467            Some(cols) => self.df.read().columns_to_dummies(
468                cols.iter().map(|x| x as &str).collect(),
469                separator,
470                drop_first,
471                drop_nulls,
472            ),
473            None => self.df.read().to_dummies(separator, drop_first, drop_nulls),
474        })
475    }
476
477    pub fn null_count(&self, py: Python) -> PyResult<Self> {
478        py.enter_polars_df(|| Ok(self.df.read().null_count()))
479    }
480
481    pub fn shrink_to_fit(&self, py: Python) -> PyResult<()> {
482        py.enter_polars_ok(|| self.df.write().shrink_to_fit())
483    }
484
485    pub fn hash_rows(
486        &self,
487        py: Python<'_>,
488        k0: u64,
489        k1: u64,
490        k2: u64,
491        k3: u64,
492    ) -> PyResult<PySeries> {
493        // TODO: don't expose all these seeds.
494        let seed = PlFixedStateQuality::default().hash_one((k0, k1, k2, k3));
495        let hb = PlSeedableRandomStateQuality::seed_from_u64(seed);
496        py.enter_polars_series(|| self.df.write().hash_rows(Some(hb)))
497    }
498
499    #[pyo3(signature = (keep_names_as, column_names))]
500    pub fn transpose(
501        &self,
502        py: Python<'_>,
503        keep_names_as: Option<&str>,
504        column_names: &Bound<PyAny>,
505    ) -> PyResult<Self> {
506        let new_col_names = if let Ok(name) = column_names.extract::<Vec<String>>() {
507            Some(Either::Right(name))
508        } else if let Ok(name) = column_names.extract::<String>() {
509            Some(Either::Left(name))
510        } else {
511            None
512        };
513        py.enter_polars_df(|| self.df.write().transpose(keep_names_as, new_col_names))
514    }
515
516    pub fn upsample(
517        &self,
518        py: Python<'_>,
519        by: Vec<String>,
520        index_column: &str,
521        every: &str,
522        stable: bool,
523    ) -> PyResult<Self> {
524        let every = Duration::try_parse(every).map_err(PyPolarsErr::from)?;
525        py.enter_polars_df(|| {
526            if stable {
527                self.df.read().upsample_stable(by, index_column, every)
528            } else {
529                self.df.read().upsample(by, index_column, every)
530            }
531        })
532    }
533
534    pub fn to_struct(
535        &self,
536        py: Python<'_>,
537        name: &str,
538        invalid_indices: Vec<usize>,
539    ) -> PyResult<PySeries> {
540        py.enter_polars_series(|| {
541            let mut ca = self.df.read().clone().into_struct(name.into());
542
543            if !invalid_indices.is_empty() {
544                let mut validity = MutableBitmap::with_capacity(ca.len());
545                validity.extend_constant(ca.len(), true);
546                for i in invalid_indices {
547                    validity.set(i, false);
548                }
549                ca.rechunk_mut();
550                Ok(ca.with_outer_validity(Some(validity.freeze())))
551            } else {
552                Ok(ca)
553            }
554        })
555    }
556
557    pub fn clear(&self, py: Python) -> PyResult<Self> {
558        py.enter_polars_df(|| Ok(self.df.read().clear()))
559    }
560
561    /// Export the columns via polars-ffi
562    /// # Safety
563    /// Needs a preallocated *mut SeriesExport that has allocated space for n_columns.
564    pub unsafe fn _export_columns(&self, location: usize) {
565        use polars_ffi::version_0::export_column;
566
567        let df = self.df.read();
568        let cols = df.columns();
569
570        let location = location as *mut SeriesExport;
571
572        for (i, col) in cols.iter().enumerate() {
573            let e = export_column(col);
574            // SAFETY:
575            // Caller should ensure address is allocated.
576            // Be careful not to drop `e` here as that should be dropped by the ffi consumer
577            unsafe { core::ptr::write(location.add(i), e) };
578        }
579    }
580
581    /// Import [`Self`] via polars-ffi
582    /// # Safety
583    /// [`location`] should be an address that contains [`width`] properly initialized
584    /// [`SeriesExport`]s
585    #[classmethod]
586    pub unsafe fn _import_columns(
587        _cls: &Bound<PyType>,
588        location: usize,
589        width: usize,
590    ) -> PyResult<Self> {
591        use polars_ffi::version_0::import_df;
592
593        let location = location as *mut SeriesExport;
594
595        let df = unsafe { import_df(location, width) }.map_err(PyPolarsErr::from)?;
596        Ok(PyDataFrame::from(df))
597    }
598
599    /// Internal utility function to allow direct access to the row encoding from python.
600    #[pyo3(signature = (opts))]
601    fn _row_encode(&self, py: Python<'_>, opts: Vec<(bool, bool, bool)>) -> PyResult<PySeries> {
602        py.enter_polars_series(|| {
603            let name = PlSmallStr::from_static("row_enc");
604            let is_unordered = opts.first().is_some_and(|(_, _, v)| *v);
605
606            let ca = if is_unordered {
607                _get_rows_encoded_ca_unordered(name, self.df.read().columns())
608            } else {
609                let descending = opts.iter().map(|(v, _, _)| *v).collect::<Vec<_>>();
610                let nulls_last = opts.iter().map(|(_, v, _)| *v).collect::<Vec<_>>();
611
612                _get_rows_encoded_ca(
613                    name,
614                    self.df.read().columns(),
615                    descending.as_slice(),
616                    nulls_last.as_slice(),
617                    false,
618                )
619            }?;
620
621            Ok(ca)
622        })
623    }
624}