1use std::hash::BuildHasher;
2
3use arrow::bitmap::MutableBitmap;
4use either::Either;
5use parking_lot::RwLock;
6use polars::prelude::*;
7use polars_ffi::version_0::SeriesExport;
8use pyo3::exceptions::PyIndexError;
9use pyo3::prelude::*;
10use pyo3::pybacked::PyBackedStr;
11use pyo3::types::{PyList, PyType};
12
13use self::row_encode::{_get_rows_encoded_ca, _get_rows_encoded_ca_unordered};
14use super::PyDataFrame;
15use crate::PyLazyFrame;
16use crate::conversion::Wrap;
17use crate::error::PyPolarsErr;
18use crate::prelude::strings_to_pl_smallstr;
19use crate::py_modules::polars;
20use crate::series::{PySeries, ToPySeries, ToSeries};
21use crate::utils::{EnterPolarsExt, to_py_err};
22
23#[pymethods]
24impl PyDataFrame {
25 #[new]
26 pub fn __init__(columns: Vec<PySeries>) -> PyResult<Self> {
27 let columns = columns.to_series();
28 let columns = columns.into_iter().map(|s| s.into()).collect();
30 let df = DataFrame::new_infer_height(columns).map_err(PyPolarsErr::from)?;
31 Ok(PyDataFrame::new(df))
32 }
33
34 #[staticmethod]
35 pub fn empty_with_height(height: u64) -> PyResult<Self> {
36 Ok(PyDataFrame::new(DataFrame::empty_with_height(
37 IdxSize::try_from(height)
38 .map_err(|_| polars_err!(bigidx, ctx = "DataFrame(height = _)", size = height))
39 .map_err(to_py_err)? as usize,
40 )))
41 }
42
43 pub fn estimated_size(&self) -> usize {
44 self.df.read().estimated_size()
45 }
46
47 pub fn dtype_strings(&self) -> Vec<String> {
48 self.df
49 .read()
50 .columns()
51 .iter()
52 .map(|s| format!("{}", s.dtype()))
53 .collect()
54 }
55
56 pub fn add(&self, py: Python<'_>, s: &PySeries) -> PyResult<Self> {
57 py.enter_polars_df(|| &*self.df.read() + &*s.series.read())
58 }
59
60 pub fn sub(&self, py: Python<'_>, s: &PySeries) -> PyResult<Self> {
61 py.enter_polars_df(|| &*self.df.read() - &*s.series.read())
62 }
63
64 pub fn mul(&self, py: Python<'_>, s: &PySeries) -> PyResult<Self> {
65 py.enter_polars_df(|| &*self.df.read() * &*s.series.read())
66 }
67
68 pub fn div(&self, py: Python<'_>, s: &PySeries) -> PyResult<Self> {
69 py.enter_polars_df(|| &*self.df.read() / &*s.series.read())
70 }
71
72 pub fn rem(&self, py: Python<'_>, s: &PySeries) -> PyResult<Self> {
73 py.enter_polars_df(|| &*self.df.read() % &*s.series.read())
74 }
75
76 pub fn add_df(&self, py: Python<'_>, s: &Self) -> PyResult<Self> {
77 py.enter_polars_df(|| &*self.df.read() + &*s.df.read())
78 }
79
80 pub fn sub_df(&self, py: Python<'_>, s: &Self) -> PyResult<Self> {
81 py.enter_polars_df(|| &*self.df.read() - &*s.df.read())
82 }
83
84 pub fn mul_df(&self, py: Python<'_>, s: &Self) -> PyResult<Self> {
85 py.enter_polars_df(|| &*self.df.read() * &*s.df.read())
86 }
87
88 pub fn div_df(&self, py: Python<'_>, s: &Self) -> PyResult<Self> {
89 py.enter_polars_df(|| &*self.df.read() / &*s.df.read())
90 }
91
92 pub fn rem_df(&self, py: Python<'_>, s: &Self) -> PyResult<Self> {
93 py.enter_polars_df(|| &*self.df.read() % &*s.df.read())
94 }
95
96 #[pyo3(signature = (n, with_replacement, shuffle, seed=None))]
97 pub fn sample_n(
98 &self,
99 py: Python<'_>,
100 n: &PySeries,
101 with_replacement: bool,
102 shuffle: bool,
103 seed: Option<u64>,
104 ) -> PyResult<Self> {
105 py.enter_polars_df(|| {
106 self.df
107 .read()
108 .sample_n(&n.series.read(), with_replacement, shuffle, seed)
109 })
110 }
111
112 #[pyo3(signature = (frac, with_replacement, shuffle, seed=None))]
113 pub fn sample_frac(
114 &self,
115 py: Python<'_>,
116 frac: &PySeries,
117 with_replacement: bool,
118 shuffle: bool,
119 seed: Option<u64>,
120 ) -> PyResult<Self> {
121 py.enter_polars_df(|| {
122 self.df
123 .read()
124 .sample_frac(&frac.series.read(), with_replacement, shuffle, seed)
125 })
126 }
127
128 pub fn rechunk(&self, py: Python) -> PyResult<Self> {
129 py.enter_polars_df(|| {
130 let mut df = self.df.read().clone();
131 df.rechunk_mut_par();
132 Ok(df)
133 })
134 }
135
136 pub fn as_str(&self) -> String {
138 format!("{:?}", self.df.read())
139 }
140
141 pub fn get_columns(&self) -> Vec<PySeries> {
142 let cols = self.df.read().columns().to_vec();
143 cols.to_pyseries()
144 }
145
146 pub fn columns(&self) -> Vec<String> {
148 self.df
149 .read()
150 .columns()
151 .iter()
152 .map(|s| s.name().to_string())
153 .collect()
154 }
155
156 pub fn set_column_names(&self, names: Vec<PyBackedStr>) -> PyResult<()> {
158 self.df
159 .write()
160 .set_column_names(&names)
161 .map_err(PyPolarsErr::from)?;
162 Ok(())
163 }
164
165 pub fn dtypes<'py>(&self, py: Python<'py>) -> PyResult<Bound<'py, PyList>> {
167 let df = self.df.read();
168 let iter = df
169 .columns()
170 .iter()
171 .map(|s| Wrap(s.dtype().clone()).into_pyobject(py).unwrap());
172 PyList::new(py, iter)
173 }
174
175 pub fn n_chunks(&self) -> usize {
176 self.df.read().first_col_n_chunks()
177 }
178
179 pub fn shape(&self) -> (usize, usize) {
180 self.df.read().shape()
181 }
182
183 pub fn height(&self) -> usize {
184 self.df.read().height()
185 }
186
187 pub fn width(&self) -> usize {
188 self.df.read().width()
189 }
190
191 pub fn is_empty(&self) -> bool {
192 self.df.read().shape_has_zero()
193 }
194
195 pub fn hstack(&self, py: Python<'_>, columns: Vec<PySeries>) -> PyResult<Self> {
196 let columns = columns.to_series();
197 let columns = columns.into_iter().map(Into::into).collect::<Vec<_>>();
199 py.enter_polars_df(|| self.df.read().hstack(&columns))
200 }
201
202 pub fn hstack_mut(&self, py: Python<'_>, columns: Vec<PySeries>) -> PyResult<()> {
203 let columns = columns.to_series();
204 let columns = columns.into_iter().map(Into::into).collect::<Vec<_>>();
206 py.enter_polars(|| self.df.write().hstack_mut(&columns).map(drop))?;
207 Ok(())
208 }
209
210 pub fn vstack(&self, py: Python<'_>, other: &PyDataFrame) -> PyResult<Self> {
211 py.enter_polars_df(|| self.df.read().vstack(&other.df.read()))
212 }
213
214 pub fn vstack_mut(&self, py: Python<'_>, other: &PyDataFrame) -> PyResult<()> {
215 py.enter_polars(|| {
216 let other = other.df.read().clone();
218 self.df.write().vstack_mut_owned(other)?;
219 PolarsResult::Ok(())
220 })?;
221 Ok(())
222 }
223
224 pub fn extend(&self, py: Python<'_>, other: &PyDataFrame) -> PyResult<()> {
225 py.enter_polars(|| {
226 let other = other.df.read().clone();
228 self.df.write().extend(&other)
229 })?;
230 Ok(())
231 }
232
233 pub fn drop_in_place(&self, name: &str) -> PyResult<PySeries> {
234 let s = self
235 .df
236 .write()
237 .drop_in_place(name)
238 .map_err(PyPolarsErr::from)?;
239 let s = s.take_materialized_series();
240 Ok(PySeries::from(s))
241 }
242
243 pub fn to_series(&self, index: isize) -> PyResult<PySeries> {
244 let df = &self.df.read();
245
246 let index_adjusted = if index < 0 {
247 df.width().checked_sub(index.unsigned_abs())
248 } else {
249 Some(usize::try_from(index).unwrap())
250 };
251
252 let s = index_adjusted.and_then(|i| df.select_at_idx(i));
253 match s {
254 Some(s) => Ok(PySeries::new(s.as_materialized_series().clone())),
255 None => Err(PyIndexError::new_err(
256 polars_err!(oob = index, df.width()).to_string(),
257 )),
258 }
259 }
260
261 pub fn get_column_index(&self, name: &str) -> PyResult<usize> {
262 Ok(self
263 .df
264 .read()
265 .try_get_column_index(name)
266 .map_err(PyPolarsErr::from)?)
267 }
268
269 pub fn get_column(&self, name: &str) -> PyResult<PySeries> {
270 let series = self
271 .df
272 .read()
273 .column(name)
274 .map(|s| PySeries::new(s.as_materialized_series().clone()))
275 .map_err(PyPolarsErr::from)?;
276 Ok(series)
277 }
278
279 pub fn select(&self, py: Python<'_>, columns: Vec<PyBackedStr>) -> PyResult<Self> {
280 py.enter_polars_df(|| self.df.read().select(columns.iter().map(|x| &**x)))
281 }
282
283 pub fn gather(&self, py: Python<'_>, indices: Wrap<Vec<IdxSize>>) -> PyResult<Self> {
284 let indices = indices.0;
285 let indices = IdxCa::from_vec("".into(), indices);
286 py.enter_polars_df(|| self.df.read().take(&indices))
287 }
288
289 pub fn gather_with_series(&self, py: Python<'_>, indices: &PySeries) -> PyResult<Self> {
290 let idx_s = indices.series.read();
291 let indices = idx_s.idx().map_err(PyPolarsErr::from)?;
292 py.enter_polars_df(|| self.df.read().take(indices))
293 }
294
295 pub fn replace(&self, column: &str, new_col: PySeries) -> PyResult<()> {
296 self.df
297 .write()
298 .replace(column, new_col.series.into_inner().into_column())
299 .map_err(PyPolarsErr::from)?;
300 Ok(())
301 }
302
303 pub fn replace_column(&self, index: usize, new_column: PySeries) -> PyResult<()> {
304 self.df
305 .write()
306 .replace_column(index, new_column.series.into_inner().into_column())
307 .map_err(PyPolarsErr::from)?;
308 Ok(())
309 }
310
311 pub fn insert_column(&self, index: usize, column: PySeries) -> PyResult<()> {
312 self.df
313 .write()
314 .insert_column(index, column.series.into_inner().into_column())
315 .map_err(PyPolarsErr::from)?;
316 Ok(())
317 }
318
319 #[pyo3(signature = (offset, length))]
320 pub fn slice(&self, py: Python<'_>, offset: i64, length: Option<usize>) -> PyResult<Self> {
321 py.enter_polars_df(|| {
322 let df = self.df.read();
323 let len = length.unwrap_or(usize::MAX);
324 Ok(df.slice(offset, len))
325 })
326 }
327
328 pub fn head(&self, py: Python<'_>, n: usize) -> PyResult<Self> {
329 py.enter_polars_df(|| Ok(self.df.read().head(Some(n))))
330 }
331
332 pub fn tail(&self, py: Python<'_>, n: usize) -> PyResult<Self> {
333 py.enter_polars_df(|| Ok(self.df.read().tail(Some(n))))
334 }
335
336 pub fn is_unique(&self, py: Python) -> PyResult<PySeries> {
337 py.enter_polars_series(|| self.df.read().is_unique())
338 }
339
340 pub fn is_duplicated(&self, py: Python) -> PyResult<PySeries> {
341 py.enter_polars_series(|| self.df.read().is_duplicated())
342 }
343
344 pub fn equals(&self, py: Python<'_>, other: &PyDataFrame, null_equal: bool) -> PyResult<bool> {
345 if null_equal {
346 py.enter_polars_ok(|| self.df.read().equals_missing(&other.df.read()))
347 } else {
348 py.enter_polars_ok(|| self.df.read().equals(&other.df.read()))
349 }
350 }
351
352 #[pyo3(signature = (name, offset=None))]
353 pub fn with_row_index(
354 &self,
355 py: Python<'_>,
356 name: &str,
357 offset: Option<IdxSize>,
358 ) -> PyResult<Self> {
359 py.enter_polars_df(|| self.df.read().with_row_index(name.into(), offset))
360 }
361
362 pub fn _to_metadata(&self) -> Self {
363 Self {
364 df: RwLock::new(self.df.read()._to_metadata()),
365 }
366 }
367
368 pub fn group_by_map_groups(
369 &self,
370 py: Python<'_>,
371 by: Vec<PyBackedStr>,
372 lambda: Py<PyAny>,
373 maintain_order: bool,
374 ) -> PyResult<Self> {
375 py.enter_polars_df(|| {
376 let df = self.df.read().clone(); let gb = if maintain_order {
378 df.group_by_stable(by.iter().map(|x| &**x))
379 } else {
380 df.group_by(by.iter().map(|x| &**x))
381 }?;
382
383 let function = move |df: DataFrame| {
384 Python::attach(|py| {
385 let pypolars = polars(py).bind(py);
386 let pydf = PyDataFrame::new(df);
387 let python_df_wrapper =
388 pypolars.getattr("wrap_df").unwrap().call1((pydf,)).unwrap();
389
390 let result_df_wrapper = match lambda.call1(py, (python_df_wrapper,)) {
392 Ok(pyobj) => pyobj,
393 Err(e) => panic!("UDF failed: {}", e.value(py)),
394 };
395 let py_pydf = result_df_wrapper.getattr(py, "_df").expect(
396 "Could not get DataFrame attribute '_df'. Make sure that you return a DataFrame object.",
397 );
398
399 let pydf = py_pydf.extract::<PyDataFrame>(py).unwrap();
400 Ok(pydf.df.into_inner())
401 })
402 };
403
404 gb.apply(function)
405 })
406 }
407
408 #[allow(clippy::should_implement_trait)]
409 pub fn clone(&self) -> Self {
410 Clone::clone(self)
411 }
412
/// Returns the result of `UnpivotDF::unpivot2` on the wrapped frame.
///
/// The arguments are assembled with `UnpivotArgsIR::new` from the frame's
/// current column names, `on`, `index`, and the optional value/variable names.
#[cfg(feature = "pivot")]
#[pyo3(signature = (on, index, value_name=None, variable_name=None))]
pub fn unpivot(
    &self,
    py: Python<'_>,
    on: Option<Vec<PyBackedStr>>,
    index: Vec<PyBackedStr>,
    value_name: Option<&str>,
    variable_name: Option<&str>,
) -> PyResult<Self> {
    use polars_ops::unpivot::UnpivotDF;

    let all_names = self.df.read().get_column_names_owned();
    let on = on.map(strings_to_pl_smallstr);
    let index = strings_to_pl_smallstr(index);
    let args = UnpivotArgsIR::new(
        all_names,
        on,
        index,
        value_name.map(Into::into),
        variable_name.map(Into::into),
    );

    py.enter_polars_df(|| {
        let frame = self.df.read();
        frame.unpivot2(args)
    })
}
434
435 pub fn partition_by(
436 &self,
437 py: Python<'_>,
438 by: Vec<String>,
439 maintain_order: bool,
440 include_key: bool,
441 ) -> PyResult<Vec<Self>> {
442 let out = py.enter_polars(|| {
443 if maintain_order {
444 self.df.read().partition_by_stable(by, include_key)
445 } else {
446 self.df.read().partition_by(by, include_key)
447 }
448 })?;
449
450 Ok(out.into_iter().map(PyDataFrame::from).collect())
451 }
452
453 pub fn lazy(&self) -> PyLazyFrame {
454 self.df.read().clone().lazy().into()
455 }
456
457 #[pyo3(signature = (columns, separator, drop_first, drop_nulls))]
458 pub fn to_dummies(
459 &self,
460 py: Python<'_>,
461 columns: Option<Vec<String>>,
462 separator: Option<&str>,
463 drop_first: bool,
464 drop_nulls: bool,
465 ) -> PyResult<Self> {
466 py.enter_polars_df(|| match columns {
467 Some(cols) => self.df.read().columns_to_dummies(
468 cols.iter().map(|x| x as &str).collect(),
469 separator,
470 drop_first,
471 drop_nulls,
472 ),
473 None => self.df.read().to_dummies(separator, drop_first, drop_nulls),
474 })
475 }
476
477 pub fn null_count(&self, py: Python) -> PyResult<Self> {
478 py.enter_polars_df(|| Ok(self.df.read().null_count()))
479 }
480
481 pub fn shrink_to_fit(&self, py: Python) -> PyResult<()> {
482 py.enter_polars_ok(|| self.df.write().shrink_to_fit())
483 }
484
485 pub fn hash_rows(
486 &self,
487 py: Python<'_>,
488 k0: u64,
489 k1: u64,
490 k2: u64,
491 k3: u64,
492 ) -> PyResult<PySeries> {
493 let seed = PlFixedStateQuality::default().hash_one((k0, k1, k2, k3));
495 let hb = PlSeedableRandomStateQuality::seed_from_u64(seed);
496 py.enter_polars_series(|| self.df.write().hash_rows(Some(hb)))
497 }
498
499 #[pyo3(signature = (keep_names_as, column_names))]
500 pub fn transpose(
501 &self,
502 py: Python<'_>,
503 keep_names_as: Option<&str>,
504 column_names: &Bound<PyAny>,
505 ) -> PyResult<Self> {
506 let new_col_names = if let Ok(name) = column_names.extract::<Vec<String>>() {
507 Some(Either::Right(name))
508 } else if let Ok(name) = column_names.extract::<String>() {
509 Some(Either::Left(name))
510 } else {
511 None
512 };
513 py.enter_polars_df(|| self.df.write().transpose(keep_names_as, new_col_names))
514 }
515
516 pub fn upsample(
517 &self,
518 py: Python<'_>,
519 by: Vec<String>,
520 index_column: &str,
521 every: &str,
522 stable: bool,
523 ) -> PyResult<Self> {
524 let every = Duration::try_parse(every).map_err(PyPolarsErr::from)?;
525 py.enter_polars_df(|| {
526 if stable {
527 self.df.read().upsample_stable(by, index_column, every)
528 } else {
529 self.df.read().upsample(by, index_column, every)
530 }
531 })
532 }
533
534 pub fn to_struct(
535 &self,
536 py: Python<'_>,
537 name: &str,
538 invalid_indices: Vec<usize>,
539 ) -> PyResult<PySeries> {
540 py.enter_polars_series(|| {
541 let mut ca = self.df.read().clone().into_struct(name.into());
542
543 if !invalid_indices.is_empty() {
544 let mut validity = MutableBitmap::with_capacity(ca.len());
545 validity.extend_constant(ca.len(), true);
546 for i in invalid_indices {
547 validity.set(i, false);
548 }
549 ca.rechunk_mut();
550 Ok(ca.with_outer_validity(Some(validity.freeze())))
551 } else {
552 Ok(ca)
553 }
554 })
555 }
556
557 pub fn clear(&self, py: Python) -> PyResult<Self> {
558 py.enter_polars_df(|| Ok(self.df.read().clear()))
559 }
560
561 pub unsafe fn _export_columns(&self, location: usize) {
565 use polars_ffi::version_0::export_column;
566
567 let df = self.df.read();
568 let cols = df.columns();
569
570 let location = location as *mut SeriesExport;
571
572 for (i, col) in cols.iter().enumerate() {
573 let e = export_column(col);
574 unsafe { core::ptr::write(location.add(i), e) };
578 }
579 }
580
581 #[classmethod]
586 pub unsafe fn _import_columns(
587 _cls: &Bound<PyType>,
588 location: usize,
589 width: usize,
590 ) -> PyResult<Self> {
591 use polars_ffi::version_0::import_df;
592
593 let location = location as *mut SeriesExport;
594
595 let df = unsafe { import_df(location, width) }.map_err(PyPolarsErr::from)?;
596 Ok(PyDataFrame::from(df))
597 }
598
599 #[pyo3(signature = (opts))]
601 fn _row_encode(&self, py: Python<'_>, opts: Vec<(bool, bool, bool)>) -> PyResult<PySeries> {
602 py.enter_polars_series(|| {
603 let name = PlSmallStr::from_static("row_enc");
604 let is_unordered = opts.first().is_some_and(|(_, _, v)| *v);
605
606 let ca = if is_unordered {
607 _get_rows_encoded_ca_unordered(name, self.df.read().columns())
608 } else {
609 let descending = opts.iter().map(|(v, _, _)| *v).collect::<Vec<_>>();
610 let nulls_last = opts.iter().map(|(_, v, _)| *v).collect::<Vec<_>>();
611
612 _get_rows_encoded_ca(
613 name,
614 self.df.read().columns(),
615 descending.as_slice(),
616 nulls_last.as_slice(),
617 false,
618 )
619 }?;
620
621 Ok(ca)
622 })
623 }
624}