1use std::hash::BuildHasher;
2
3use arrow::bitmap::MutableBitmap;
4use either::Either;
5use polars::prelude::*;
6use polars_ffi::version_0::SeriesExport;
7#[cfg(feature = "pivot")]
8use polars_lazy::frame::pivot::{pivot, pivot_stable};
9use pyo3::IntoPyObjectExt;
10use pyo3::exceptions::PyIndexError;
11use pyo3::prelude::*;
12use pyo3::pybacked::PyBackedStr;
13use pyo3::types::{PyList, PyType};
14
15use self::row_encode::{_get_rows_encoded_ca, _get_rows_encoded_ca_unordered};
16use super::PyDataFrame;
17use crate::conversion::Wrap;
18use crate::error::PyPolarsErr;
19use crate::map::dataframe::{
20 apply_lambda_unknown, apply_lambda_with_bool_out_type, apply_lambda_with_primitive_out_type,
21 apply_lambda_with_string_out_type,
22};
23use crate::prelude::strings_to_pl_smallstr;
24use crate::py_modules::polars;
25use crate::series::{PySeries, ToPySeries, ToSeries};
26use crate::utils::EnterPolarsExt;
27use crate::{PyExpr, PyLazyFrame};
28
29#[pymethods]
30impl PyDataFrame {
31 #[new]
32 pub fn __init__(columns: Vec<PySeries>) -> PyResult<Self> {
33 let columns = columns.to_series();
34 let columns = columns.into_iter().map(|s| s.into()).collect();
36 let df = DataFrame::new(columns).map_err(PyPolarsErr::from)?;
37 Ok(PyDataFrame::new(df))
38 }
39
40 pub fn estimated_size(&self) -> usize {
41 self.df.estimated_size()
42 }
43
44 pub fn dtype_strings(&self) -> Vec<String> {
45 self.df
46 .get_columns()
47 .iter()
48 .map(|s| format!("{}", s.dtype()))
49 .collect()
50 }
51
52 pub fn add(&self, py: Python<'_>, s: &PySeries) -> PyResult<Self> {
53 py.enter_polars_df(|| &self.df + &s.series)
54 }
55
56 pub fn sub(&self, py: Python<'_>, s: &PySeries) -> PyResult<Self> {
57 py.enter_polars_df(|| &self.df - &s.series)
58 }
59
60 pub fn mul(&self, py: Python<'_>, s: &PySeries) -> PyResult<Self> {
61 py.enter_polars_df(|| &self.df * &s.series)
62 }
63
64 pub fn div(&self, py: Python<'_>, s: &PySeries) -> PyResult<Self> {
65 py.enter_polars_df(|| &self.df / &s.series)
66 }
67
68 pub fn rem(&self, py: Python<'_>, s: &PySeries) -> PyResult<Self> {
69 py.enter_polars_df(|| &self.df % &s.series)
70 }
71
72 pub fn add_df(&self, py: Python<'_>, s: &Self) -> PyResult<Self> {
73 py.enter_polars_df(|| &self.df + &s.df)
74 }
75
76 pub fn sub_df(&self, py: Python<'_>, s: &Self) -> PyResult<Self> {
77 py.enter_polars_df(|| &self.df - &s.df)
78 }
79
80 pub fn mul_df(&self, py: Python<'_>, s: &Self) -> PyResult<Self> {
81 py.enter_polars_df(|| &self.df * &s.df)
82 }
83
84 pub fn div_df(&self, py: Python<'_>, s: &Self) -> PyResult<Self> {
85 py.enter_polars_df(|| &self.df / &s.df)
86 }
87
88 pub fn rem_df(&self, py: Python<'_>, s: &Self) -> PyResult<Self> {
89 py.enter_polars_df(|| &self.df % &s.df)
90 }
91
92 #[pyo3(signature = (n, with_replacement, shuffle, seed=None))]
93 pub fn sample_n(
94 &self,
95 py: Python<'_>,
96 n: &PySeries,
97 with_replacement: bool,
98 shuffle: bool,
99 seed: Option<u64>,
100 ) -> PyResult<Self> {
101 py.enter_polars_df(|| self.df.sample_n(&n.series, with_replacement, shuffle, seed))
102 }
103
104 #[pyo3(signature = (frac, with_replacement, shuffle, seed=None))]
105 pub fn sample_frac(
106 &self,
107 py: Python<'_>,
108 frac: &PySeries,
109 with_replacement: bool,
110 shuffle: bool,
111 seed: Option<u64>,
112 ) -> PyResult<Self> {
113 py.enter_polars_df(|| {
114 self.df
115 .sample_frac(&frac.series, with_replacement, shuffle, seed)
116 })
117 }
118
119 pub fn rechunk(&self, py: Python) -> PyResult<Self> {
120 py.enter_polars_df(|| {
121 let mut df = self.df.clone();
122 df.as_single_chunk_par();
123 Ok(df)
124 })
125 }
126
127 pub fn as_str(&self) -> String {
129 format!("{:?}", self.df)
130 }
131
132 pub fn get_columns(&self) -> Vec<PySeries> {
133 let cols = self.df.get_columns().to_vec();
134 cols.to_pyseries()
135 }
136
137 pub fn columns(&self) -> Vec<&str> {
139 self.df.get_column_names_str()
140 }
141
142 pub fn set_column_names(&mut self, names: Vec<PyBackedStr>) -> PyResult<()> {
144 self.df
145 .set_column_names(names.iter().map(|x| &**x))
146 .map_err(PyPolarsErr::from)?;
147 Ok(())
148 }
149
150 pub fn dtypes<'py>(&self, py: Python<'py>) -> PyResult<Bound<'py, PyList>> {
152 let iter = self
153 .df
154 .iter()
155 .map(|s| Wrap(s.dtype().clone()).into_pyobject(py).unwrap());
156 PyList::new(py, iter)
157 }
158
159 pub fn n_chunks(&self) -> usize {
160 self.df.first_col_n_chunks()
161 }
162
163 pub fn shape(&self) -> (usize, usize) {
164 self.df.shape()
165 }
166
167 pub fn height(&self) -> usize {
168 self.df.height()
169 }
170
171 pub fn width(&self) -> usize {
172 self.df.width()
173 }
174
175 pub fn is_empty(&self) -> bool {
176 self.df.is_empty()
177 }
178
179 pub fn hstack(&self, py: Python<'_>, columns: Vec<PySeries>) -> PyResult<Self> {
180 let columns = columns.to_series();
181 let columns = columns.into_iter().map(Into::into).collect::<Vec<_>>();
183 py.enter_polars_df(|| self.df.hstack(&columns))
184 }
185
186 pub fn hstack_mut(&mut self, py: Python<'_>, columns: Vec<PySeries>) -> PyResult<()> {
187 let columns = columns.to_series();
188 let columns = columns.into_iter().map(Into::into).collect::<Vec<_>>();
190 py.enter_polars(|| self.df.hstack_mut(&columns))?;
191 Ok(())
192 }
193
194 pub fn vstack(&self, py: Python<'_>, other: &PyDataFrame) -> PyResult<Self> {
195 py.enter_polars_df(|| self.df.vstack(&other.df))
196 }
197
198 pub fn vstack_mut(&mut self, py: Python<'_>, other: &PyDataFrame) -> PyResult<()> {
199 py.enter_polars(|| self.df.vstack_mut(&other.df))?;
200 Ok(())
201 }
202
203 pub fn extend(&mut self, py: Python<'_>, other: &PyDataFrame) -> PyResult<()> {
204 py.enter_polars(|| self.df.extend(&other.df))?;
205 Ok(())
206 }
207
208 pub fn drop_in_place(&mut self, name: &str) -> PyResult<PySeries> {
209 let s = self.df.drop_in_place(name).map_err(PyPolarsErr::from)?;
210 let s = s.take_materialized_series();
211 Ok(PySeries { series: s })
212 }
213
214 pub fn to_series(&self, index: isize) -> PyResult<PySeries> {
215 let df = &self.df;
216
217 let index_adjusted = if index < 0 {
218 df.width().checked_sub(index.unsigned_abs())
219 } else {
220 Some(usize::try_from(index).unwrap())
221 };
222
223 let s = index_adjusted.and_then(|i| df.select_at_idx(i));
224 match s {
225 Some(s) => Ok(PySeries::new(s.as_materialized_series().clone())),
226 None => Err(PyIndexError::new_err(
227 polars_err!(oob = index, df.width()).to_string(),
228 )),
229 }
230 }
231
232 pub fn get_column_index(&self, name: &str) -> PyResult<usize> {
233 Ok(self
234 .df
235 .try_get_column_index(name)
236 .map_err(PyPolarsErr::from)?)
237 }
238
239 pub fn get_column(&self, name: &str) -> PyResult<PySeries> {
240 let series = self
241 .df
242 .column(name)
243 .map(|s| PySeries::new(s.as_materialized_series().clone()))
244 .map_err(PyPolarsErr::from)?;
245 Ok(series)
246 }
247
248 pub fn select(&self, py: Python<'_>, columns: Vec<PyBackedStr>) -> PyResult<Self> {
249 py.enter_polars_df(|| self.df.select(columns.iter().map(|x| &**x)))
250 }
251
252 pub fn gather(&self, py: Python<'_>, indices: Wrap<Vec<IdxSize>>) -> PyResult<Self> {
253 let indices = indices.0;
254 let indices = IdxCa::from_vec("".into(), indices);
255 py.enter_polars_df(|| self.df.take(&indices))
256 }
257
258 pub fn gather_with_series(&self, py: Python<'_>, indices: &PySeries) -> PyResult<Self> {
259 let indices = indices.series.idx().map_err(PyPolarsErr::from)?;
260 py.enter_polars_df(|| self.df.take(indices))
261 }
262
263 pub fn replace(&mut self, column: &str, new_col: PySeries) -> PyResult<()> {
264 self.df
265 .replace(column, new_col.series)
266 .map_err(PyPolarsErr::from)?;
267 Ok(())
268 }
269
270 pub fn replace_column(&mut self, index: usize, new_column: PySeries) -> PyResult<()> {
271 self.df
272 .replace_column(index, new_column.series)
273 .map_err(PyPolarsErr::from)?;
274 Ok(())
275 }
276
277 pub fn insert_column(&mut self, index: usize, column: PySeries) -> PyResult<()> {
278 self.df
279 .insert_column(index, column.series)
280 .map_err(PyPolarsErr::from)?;
281 Ok(())
282 }
283
284 #[pyo3(signature = (offset, length=None))]
285 pub fn slice(&self, py: Python<'_>, offset: i64, length: Option<usize>) -> PyResult<Self> {
286 py.enter_polars_df(|| {
287 Ok(self
288 .df
289 .slice(offset, length.unwrap_or_else(|| self.df.height())))
290 })
291 }
292
293 pub fn head(&self, py: Python<'_>, n: usize) -> PyResult<Self> {
294 py.enter_polars_df(|| Ok(self.df.head(Some(n))))
295 }
296
297 pub fn tail(&self, py: Python<'_>, n: usize) -> PyResult<Self> {
298 py.enter_polars_df(|| Ok(self.df.tail(Some(n))))
299 }
300
301 pub fn is_unique(&self, py: Python) -> PyResult<PySeries> {
302 py.enter_polars_series(|| self.df.is_unique())
303 }
304
305 pub fn is_duplicated(&self, py: Python) -> PyResult<PySeries> {
306 py.enter_polars_series(|| self.df.is_duplicated())
307 }
308
309 pub fn equals(&self, py: Python<'_>, other: &PyDataFrame, null_equal: bool) -> PyResult<bool> {
310 if null_equal {
311 py.enter_polars_ok(|| self.df.equals_missing(&other.df))
312 } else {
313 py.enter_polars_ok(|| self.df.equals(&other.df))
314 }
315 }
316
317 #[pyo3(signature = (name, offset=None))]
318 pub fn with_row_index(
319 &self,
320 py: Python<'_>,
321 name: &str,
322 offset: Option<IdxSize>,
323 ) -> PyResult<Self> {
324 py.enter_polars_df(|| self.df.with_row_index(name.into(), offset))
325 }
326
327 pub fn _to_metadata(&self) -> Self {
328 Self {
329 df: self.df._to_metadata(),
330 }
331 }
332
333 pub fn group_by_map_groups(
334 &self,
335 by: Vec<PyBackedStr>,
336 lambda: PyObject,
337 maintain_order: bool,
338 ) -> PyResult<Self> {
339 let gb = if maintain_order {
340 self.df.group_by_stable(by.iter().map(|x| &**x))
341 } else {
342 self.df.group_by(by.iter().map(|x| &**x))
343 }
344 .map_err(PyPolarsErr::from)?;
345
346 let function = move |df: DataFrame| {
347 Python::with_gil(|py| {
348 let pypolars = polars(py).bind(py);
349 let pydf = PyDataFrame::new(df);
350 let python_df_wrapper =
351 pypolars.getattr("wrap_df").unwrap().call1((pydf,)).unwrap();
352
353 let result_df_wrapper = match lambda.call1(py, (python_df_wrapper,)) {
355 Ok(pyobj) => pyobj,
356 Err(e) => panic!("UDF failed: {}", e.value(py)),
357 };
358 let py_pydf = result_df_wrapper.getattr(py, "_df").expect(
359 "Could not get DataFrame attribute '_df'. Make sure that you return a DataFrame object.",
360 );
361
362 let pydf = py_pydf.extract::<PyDataFrame>(py).unwrap();
363 Ok(pydf.df)
364 })
365 };
366 let df = gb.apply(function).map_err(PyPolarsErr::from)?;
371
372 Ok(df.into())
373 }
374
375 #[allow(clippy::should_implement_trait)]
376 pub fn clone(&self) -> Self {
377 PyDataFrame::new(self.df.clone())
378 }
379
380 #[cfg(feature = "pivot")]
381 #[pyo3(signature = (on, index, value_name=None, variable_name=None))]
382 pub fn unpivot(
383 &self,
384 py: Python<'_>,
385 on: Vec<PyBackedStr>,
386 index: Vec<PyBackedStr>,
387 value_name: Option<&str>,
388 variable_name: Option<&str>,
389 ) -> PyResult<Self> {
390 use polars_ops::pivot::UnpivotDF;
391 let args = UnpivotArgsIR {
392 on: strings_to_pl_smallstr(on),
393 index: strings_to_pl_smallstr(index),
394 value_name: value_name.map(|s| s.into()),
395 variable_name: variable_name.map(|s| s.into()),
396 };
397
398 py.enter_polars_df(|| self.df.unpivot2(args))
399 }
400
401 #[cfg(feature = "pivot")]
402 #[pyo3(signature = (on, index, values, maintain_order, sort_columns, aggregate_expr, separator))]
403 pub fn pivot_expr(
404 &self,
405 py: Python<'_>,
406 on: Vec<String>,
407 index: Option<Vec<String>>,
408 values: Option<Vec<String>>,
409 maintain_order: bool,
410 sort_columns: bool,
411 aggregate_expr: Option<PyExpr>,
412 separator: Option<&str>,
413 ) -> PyResult<Self> {
414 let fun = if maintain_order { pivot_stable } else { pivot };
415 let agg_expr = aggregate_expr.map(|expr| expr.inner);
416 py.enter_polars_df(|| {
417 fun(
418 &self.df,
419 on,
420 index,
421 values,
422 sort_columns,
423 agg_expr,
424 separator,
425 )
426 })
427 }
428
429 pub fn partition_by(
430 &self,
431 py: Python<'_>,
432 by: Vec<String>,
433 maintain_order: bool,
434 include_key: bool,
435 ) -> PyResult<Vec<Self>> {
436 let out = py.enter_polars(|| {
437 if maintain_order {
438 self.df.partition_by_stable(by, include_key)
439 } else {
440 self.df.partition_by(by, include_key)
441 }
442 })?;
443
444 Ok(unsafe { std::mem::transmute::<Vec<DataFrame>, Vec<PyDataFrame>>(out) })
446 }
447
448 pub fn lazy(&self) -> PyLazyFrame {
449 self.df.clone().lazy().into()
450 }
451
452 #[pyo3(signature = (columns, separator, drop_first=false))]
453 pub fn to_dummies(
454 &self,
455 py: Python<'_>,
456 columns: Option<Vec<String>>,
457 separator: Option<&str>,
458 drop_first: bool,
459 ) -> PyResult<Self> {
460 py.enter_polars_df(|| match columns {
461 Some(cols) => self.df.columns_to_dummies(
462 cols.iter().map(|x| x as &str).collect(),
463 separator,
464 drop_first,
465 ),
466 None => self.df.to_dummies(separator, drop_first),
467 })
468 }
469
470 pub fn null_count(&self, py: Python) -> PyResult<Self> {
471 py.enter_polars_df(|| Ok(self.df.null_count()))
472 }
473
474 #[pyo3(signature = (lambda, output_type, inference_size))]
475 pub fn map_rows(
476 &mut self,
477 lambda: Bound<PyAny>,
478 output_type: Option<Wrap<DataType>>,
479 inference_size: usize,
480 ) -> PyResult<(PyObject, bool)> {
481 Python::with_gil(|py| {
482 self.df.as_single_chunk_par();
484 let df = &self.df;
485
486 use apply_lambda_with_primitive_out_type as apply;
487 #[rustfmt::skip]
488 let out = match output_type.map(|dt| dt.0) {
489 Some(DataType::Int32) => apply::<Int32Type>(df, py, lambda, 0, None)?.into_series(),
490 Some(DataType::Int64) => apply::<Int64Type>(df, py, lambda, 0, None)?.into_series(),
491 Some(DataType::UInt32) => apply::<UInt32Type>(df, py, lambda, 0, None)?.into_series(),
492 Some(DataType::UInt64) => apply::<UInt64Type>(df, py, lambda, 0, None)?.into_series(),
493 Some(DataType::Float32) => apply::<Float32Type>(df, py, lambda, 0, None)?.into_series(),
494 Some(DataType::Float64) => apply::<Float64Type>(df, py, lambda, 0, None)?.into_series(),
495 Some(DataType::Date) => apply::<Int32Type>(df, py, lambda, 0, None)?.into_date().into_series(),
496 Some(DataType::Datetime(tu, tz)) => apply::<Int64Type>(df, py, lambda, 0, None)?.into_datetime(tu, tz).into_series(),
497 Some(DataType::Boolean) => apply_lambda_with_bool_out_type(df, py, lambda, 0, None)?.into_series(),
498 Some(DataType::String) => apply_lambda_with_string_out_type(df, py, lambda, 0, None)?.into_series(),
499 _ => return apply_lambda_unknown(df, py, lambda, inference_size),
500 };
501
502 Ok((PySeries::from(out).into_py_any(py)?, false))
503 })
504 }
505
506 pub fn shrink_to_fit(&mut self, py: Python) -> PyResult<()> {
507 py.enter_polars_ok(|| self.df.shrink_to_fit())
508 }
509
510 pub fn hash_rows(
511 &mut self,
512 py: Python<'_>,
513 k0: u64,
514 k1: u64,
515 k2: u64,
516 k3: u64,
517 ) -> PyResult<PySeries> {
518 let seed = PlFixedStateQuality::default().hash_one((k0, k1, k2, k3));
520 let hb = PlSeedableRandomStateQuality::seed_from_u64(seed);
521 py.enter_polars_series(|| self.df.hash_rows(Some(hb)))
522 }
523
524 #[pyo3(signature = (keep_names_as, column_names))]
525 pub fn transpose(
526 &mut self,
527 py: Python<'_>,
528 keep_names_as: Option<&str>,
529 column_names: &Bound<PyAny>,
530 ) -> PyResult<Self> {
531 let new_col_names = if let Ok(name) = column_names.extract::<Vec<String>>() {
532 Some(Either::Right(name))
533 } else if let Ok(name) = column_names.extract::<String>() {
534 Some(Either::Left(name))
535 } else {
536 None
537 };
538 py.enter_polars_df(|| self.df.transpose(keep_names_as, new_col_names))
539 }
540
541 pub fn upsample(
542 &self,
543 py: Python<'_>,
544 by: Vec<String>,
545 index_column: &str,
546 every: &str,
547 stable: bool,
548 ) -> PyResult<Self> {
549 let every = Duration::try_parse(every).map_err(PyPolarsErr::from)?;
550 py.enter_polars_df(|| {
551 if stable {
552 self.df.upsample_stable(by, index_column, every)
553 } else {
554 self.df.upsample(by, index_column, every)
555 }
556 })
557 }
558
559 pub fn to_struct(
560 &self,
561 py: Python<'_>,
562 name: &str,
563 invalid_indices: Vec<usize>,
564 ) -> PyResult<PySeries> {
565 py.enter_polars_series(|| {
566 let mut ca = self.df.clone().into_struct(name.into());
567
568 if !invalid_indices.is_empty() {
569 let mut validity = MutableBitmap::with_capacity(ca.len());
570 validity.extend_constant(ca.len(), true);
571 for i in invalid_indices {
572 validity.set(i, false);
573 }
574 ca.rechunk_mut();
575 Ok(ca.with_outer_validity(Some(validity.freeze())))
576 } else {
577 Ok(ca)
578 }
579 })
580 }
581
582 pub fn clear(&self, py: Python) -> PyResult<Self> {
583 py.enter_polars_df(|| Ok(self.df.clear()))
584 }
585
586 pub unsafe fn _export_columns(&mut self, location: usize) {
590 use polars_ffi::version_0::export_column;
591
592 let cols = self.df.get_columns();
593
594 let location = location as *mut SeriesExport;
595
596 for (i, col) in cols.iter().enumerate() {
597 let e = export_column(col);
598 unsafe { core::ptr::write(location.add(i), e) };
602 }
603 }
604
605 #[classmethod]
610 pub unsafe fn _import_columns(
611 _cls: &Bound<PyType>,
612 location: usize,
613 width: usize,
614 ) -> PyResult<Self> {
615 use polars_ffi::version_0::import_df;
616
617 let location = location as *mut SeriesExport;
618
619 let df = unsafe { import_df(location, width) }.map_err(PyPolarsErr::from)?;
620 Ok(PyDataFrame { df })
621 }
622
623 #[pyo3(signature = (opts))]
625 fn _row_encode(&self, py: Python<'_>, opts: Vec<(bool, bool, bool)>) -> PyResult<PySeries> {
626 py.enter_polars_series(|| {
627 let name = PlSmallStr::from_static("row_enc");
628 let is_unordered = opts.first().is_some_and(|(_, _, v)| *v);
629
630 let ca = if is_unordered {
631 _get_rows_encoded_ca_unordered(name, self.df.get_columns())
632 } else {
633 let descending = opts.iter().map(|(v, _, _)| *v).collect::<Vec<_>>();
634 let nulls_last = opts.iter().map(|(_, v, _)| *v).collect::<Vec<_>>();
635
636 _get_rows_encoded_ca(
637 name,
638 self.df.get_columns(),
639 descending.as_slice(),
640 nulls_last.as_slice(),
641 )
642 }?;
643
644 Ok(ca)
645 })
646 }
647}