Module polars::docs::eager


Polars Eager cookbook

This page serves as a cookbook to quickly get you started with the most fundamental operations executed on a ChunkedArray, Series or DataFrame.


Creation of Data structures

ChunkedArray

use polars::prelude::*;

// use iterators
let ca: UInt32Chunked = (0..10).map(Some).collect();

// from slices
let ca = UInt32Chunked::new_from_slice("foo", &[1, 2, 3]);

// use builders
let mut builder = PrimitiveChunkedBuilder::<UInt32Type>::new("foo", 10);
for value in 0..10 {
    builder.append_value(value);
}
let ca = builder.finish();

Series

use polars::prelude::*;

// use iterators
let s: Series = (0..10).map(Some).collect();

// from slices
let s = Series::new("foo", &[1, 2, 3]);

// from a chunked-array
let ca = UInt32Chunked::new_from_opt_slice("foo", &[Some(1), None, Some(3)]);
let s = ca.into_series();

DataFrame

use polars::prelude::*;
use polars::df;

// use macro
let df = df! [
    "names" => ["a", "b", "c"],
    "values" => [1, 2, 3],
    "values_nulls" => [Some(1), None, Some(3)]
]?;

// from a Vec<Series>
let s1 = Series::new("names", &["a", "b", "c"]);
let s2 = Series::new("values", &[Some(1), None, Some(3)]);
let df = DataFrame::new(vec![s1, s2])?;

Arithmetic

Arithmetic can be done on both Series and ChunkedArrays. The most notable difference is that a Series coerces the operands to a matching data type before the operation, whereas ChunkedArrays must first be cast to the same type.

use polars::prelude::*;
let s_int = Series::new("a", &[1, 2, 3]);
let s_flt = Series::new("b", &[1.0, 2.0, 3.0]);

let added = &s_int + &s_flt;
let subtracted = &s_int - &s_flt;
let multiplied = &s_int * &s_flt;
let divided = &s_int / &s_flt;
let moduloed = &s_int % &s_flt;


// on chunked-arrays we first need to cast to same types
let ca_int = s_int.i32()?;
let ca_flt = s_flt.f64()?;

&ca_int.cast::<Float64Type>()? * ca_flt;
&ca_flt.cast::<Int32Type>()? * ca_int;

// we can also do arithmetic with numeric values
let multiplied = ca_int * 2.0;
let multiplied = s_flt * 2.0;

// or broadcast a Series of length one to match the other operand
let added = &s_int + &Series::new("broadcast_me", &[10]);

Because Rust's orphan rule doesn't allow us to implement left-hand-side operations, we need to call such operations directly.

use polars::prelude::*;

let series = Series::new("foo", &[1, 2, 3]);

// 1 / s
let divide_one_by_s = 1.div(&series);

// 1 - s
let subtract_one_by_s = 1.sub(&series);

For ChunkedArrays these left-hand-side operations can be done with the apply method.

use polars::prelude::*;

let ca = UInt32Chunked::new_from_slice("foo", &[1, 2, 3]);

// 1 / ca
let divide_one_by_ca = ca.apply(|rhs| 1 / rhs);

Comparisons

Series and ChunkedArrays can be used in comparison operations to create boolean masks/predicates.

use polars::prelude::*;

let s = Series::new("a", &[1, 2, 3]);
let ca = UInt32Chunked::new_from_opt_slice("b", &[Some(3), None, Some(1)]);

// compare Series with numeric values
// ==
s.eq(2);
// !=
s.neq(2);
// >
s.gt(2);
// >=
s.gt_eq(2);
// <
s.lt(2);
// <=
s.lt_eq(2);


// compare Series with Series
// ==
s.eq(&s);
// !=
s.neq(&s);
// >
s.gt(&s);
// >=
s.gt_eq(&s);
// <
s.lt(&s);
// <=
s.lt_eq(&s);


// compare chunked-array with numeric values
// ==
ca.eq(2);
// !=
ca.neq(2);
// >
ca.gt(2);
// >=
ca.gt_eq(2);
// <
ca.lt(2);
// <=
ca.lt_eq(2);

// compare chunked-array with chunked-array
// ==
ca.eq(&ca);
// !=
ca.neq(&ca);
// >
ca.gt(&ca);
// >=
ca.gt_eq(&ca);
// <
ca.lt(&ca);
// <=
ca.lt_eq(&ca);

// use iterators
let a: BooleanChunked = ca.into_iter()
    .map(|opt_value| {
        match opt_value {
            Some(value) => value < 10,
            None => false,
        }
    })
    .collect();
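
Masks produced by these comparisons can be combined into more complex predicates with bitwise operators. The following is a minimal sketch, assuming BitAnd and BitOr are implemented for the BooleanChunked returned by the comparisons in this version:

use polars::prelude::*;

let s = Series::new("a", &[1, 2, 3]);

// values strictly between 1 and 3 -> [false, true, false]
// (assumes `&` is implemented for BooleanChunked)
let between = s.gt(1) & s.lt(3);

// values outside that range -> [true, false, true]
let outside = s.lt_eq(1) | s.gt_eq(3);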

Apply functions/closures

See the ChunkApply trait for all possible apply methods.

Series / ChunkedArrays

use polars::prelude::*;

// apply a closure over all values
let s = Series::new("foo", &[Some(1), Some(2), None]);
s.i32()?.apply(|value| value * 20);

// count string lengths
let s = Series::new("foo", &["foo", "bar", "foobar"]);
s.utf8()?.apply_cast_numeric::<_, UInt64Type>(|str_val| str_val.len() as u64);

DataFrame

use polars::prelude::*;
use polars::df;

let mut df = df![
    "letters" => ["a", "b", "c", "d"],
    "numbers" => [1, 2, 3, 4]
]?;


// coerce numbers to floats
df.may_apply("number", |s: &Series| s.cast::<Float64Type>())?;

// transform letters to uppercase letters
df.may_apply("letters", |s: &Series| {
    Ok(s.utf8()?.to_uppercase())
});

Filter

use polars::prelude::*;

// (assumes `df` is an already-loaded DataFrame, e.g. the iris dataset)
// create a mask to filter out null values
let mask = df.column("sepal.width")?.is_not_null();

// select column
let s = df.column("sepal.length")?;

// apply filter on a Series
let filtered_series = s.filter(&mask);

// apply the filter on a DataFrame
let filtered_df = df.filter(&mask)?;

Sort

use polars::prelude::*;
use polars::df;

let df = df![
    "a" => [1, 2, 3],
    "b" => ["a", "a", "b"]
]?;
// sort this DataFrame by multiple columns

// sort order per column: true = descending, false = ascending
let reverse = &[true, false];
// columns to sort by
let by = &["b", "a"];
// do the sort operation
let sorted = df.sort(by, reverse)?;

// sorted:

// ╭─────┬─────╮
// │ a   ┆ b   │
// │ --- ┆ --- │
// │ i64 ┆ str │
// ╞═════╪═════╡
// │ 3   ┆ "b" │
// ├╌╌╌╌╌┼╌╌╌╌╌┤
// │ 1   ┆ "a" │
// ├╌╌╌╌╌┼╌╌╌╌╌┤
// │ 2   ┆ "a" │
// ╰─────┴─────╯

Joins

use polars::prelude::*;
use polars::df;

// Create first df.
let temp = df!("days" => &[0, 1, 2, 3, 4],
               "temp" => &[22.1, 19.9, 7., 2., 3.],
               "other" => &[1, 2, 3, 4, 5]
)?;

// Create second df.
let rain = df!("days" => &[1, 2],
               "rain" => &[0.1, 0.2],
               "other" => &[1, 2, 3, 4, 5]
)?;

// join on a single column
temp.left_join(&rain, "days", "days");
temp.inner_join(&rain, "days", "days");
temp.outer_join(&rain, "days", "days");

// join on multiple columns
temp.join(&rain, vec!["days", "other"], vec!["days", "other"], JoinType::Left);

Groupby

Note that Polars lazy is a lot more powerful and more performant in groupby operations. In lazy, a myriad of aggregations can be combined from expressions.

See more in:

GroupBy

use polars::prelude::*;

 // groupby "groups" | sum "foo"
 let out = df.groupby("groups")?
 .select("foo")
 .sum();
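
The same aggregation written with lazy expressions could look roughly like the sketch below. This assumes the lazy feature is enabled and that groupby/agg accept a Vec<Expr> in this version:

use polars::prelude::*;

// groupby "groups" | sum "foo", expressed as lazy expressions
// (sketch only; `df` is the same DataFrame as in the eager example above)
let out = df
    .lazy()
    .groupby(vec![col("groups")])
    .agg(vec![col("foo").sum()])
    .collect()?;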

Pivot

use polars::prelude::*;
use polars::df;

 let df = df!("foo" => ["A", "A", "B", "B", "C"],
     "N" => [1, 2, 2, 4, 2],
     "bar" => ["k", "l", "m", "n", "0"]
     )?;

// groupby "foo" | pivot "bar" column | aggregate "N"
 let pivoted = df.groupby("groups")?
    .pivot("bar", "N")
    .first();

// pivoted:
// +-----+------+------+------+------+------+
// | foo | o    | n    | m    | l    | k    |
// | --- | ---  | ---  | ---  | ---  | ---  |
// | str | i32  | i32  | i32  | i32  | i32  |
// +=====+======+======+======+======+======+
// | "A" | null | null | null | 2    | 1    |
// +-----+------+------+------+------+------+
// | "B" | null | 4    | 2    | null | null |
// +-----+------+------+------+------+------+
// | "C" | 2    | null | null | null | null |
// +-----+------+------+------+------+------+

Downsample

Downsample the DataFrame given some frequency or downsample rule.

use polars::prelude::*;
use polars::df;
use polars::frame::groupby::resample::SampleRule;

// given an input dataframe

// ╭─────────────────────┬─────╮
// │ ms                  ┆ i   │
// │ ---                 ┆ --- │
// │ date64(ms)          ┆ u8  │
// ╞═════════════════════╪═════╡
// │ 2000-01-01 00:00:00 ┆ 0   │
// ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
// │ 2000-01-01 00:01:00 ┆ 1   │
// ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
// │ 2000-01-01 00:02:00 ┆ 2   │
// ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
// │ 2000-01-01 00:03:00 ┆ 3   │
// ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
// │ ...                 ┆ ... │
// ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
// │ 2000-01-01 00:15:00 ┆ 15  │
// ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
// │ 2000-01-01 00:16:00 ┆ 16  │
// ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
// │ 2000-01-01 00:17:00 ┆ 17  │
// ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
// │ 2000-01-01 00:18:00 ┆ 18  │
// ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
// │ 2000-01-01 00:19:00 ┆ 19  │
// ╰─────────────────────┴─────╯


let downsampled = df.downsample("ms", SampleRule::Minute(5))?
    .first()?
    .sort("ms", false)?;

// downsampled:

// ╭─────────────────────┬─────────╮
// │ ms                  ┆ i_first │
// │ ---                 ┆ ---     │
// │ date64(ms)          ┆ u8      │
// ╞═════════════════════╪═════════╡
// │ 2000-01-01 00:00:00 ┆ 0       │
// ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
// │ 2000-01-01 00:05:00 ┆ 5       │
// ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
// │ 2000-01-01 00:10:00 ┆ 10      │
// ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
// │ 2000-01-01 00:15:00 ┆ 15      │
// ╰─────────────────────┴─────────╯

Melt

use polars::prelude::*;
use polars::df;

let df = df!["A" => &["a", "b", "a"],
             "B" => &[1, 3, 5],
             "C" => &[10, 11, 12],
             "D" => &[2, 4, 6]
    ]?;

let melted = df.melt(&["A", "B"], &["C", "D"]).unwrap();
// melted:

// +-----+-----+----------+-------+
// | A   | B   | variable | value |
// | --- | --- | ---      | ---   |
// | str | i32 | str      | i32   |
// +=====+=====+==========+=======+
// | "a" | 1   | "C"      | 10    |
// +-----+-----+----------+-------+
// | "b" | 3   | "C"      | 11    |
// +-----+-----+----------+-------+
// | "a" | 5   | "C"      | 12    |
// +-----+-----+----------+-------+
// | "a" | 1   | "D"      | 2     |
// +-----+-----+----------+-------+
// | "b" | 3   | "D"      | 4     |
// +-----+-----+----------+-------+
// | "a" | 5   | "D"      | 6     |
// +-----+-----+----------+-------+

Explode

use polars::prelude::*;
use polars::df;

let s0 = Series::new("a", &[1i64, 2, 3]);
let s1 = Series::new("b", &[1i64, 1, 1]);
let s2 = Series::new("c", &[2i64, 2, 2]);
// construct a new ListChunked from a slice of Series.
let list = Series::new("foo", &[s0, s1, s2]);

// construct a few more Series.
let s0 = Series::new("B", [1, 2, 3]);
let s1 = Series::new("C", [1, 1, 1]);
let df = DataFrame::new(vec![list, s0, s1])?;

let exploded = df.explode("foo")?;
// exploded:

// +-----+-----+-----+
// | foo | B   | C   |
// | --- | --- | --- |
// | i64 | i32 | i32 |
// +=====+=====+=====+
// | 1   | 1   | 1   |
// +-----+-----+-----+
// | 2   | 1   | 1   |
// +-----+-----+-----+
// | 3   | 1   | 1   |
// +-----+-----+-----+
// | 1   | 2   | 1   |
// +-----+-----+-----+
// | 1   | 2   | 1   |
// +-----+-----+-----+
// | 1   | 2   | 1   |
// +-----+-----+-----+
// | 2   | 3   | 1   |
// +-----+-----+-----+
// | 2   | 3   | 1   |
// +-----+-----+-----+
// | 2   | 3   | 1   |
// +-----+-----+-----+

IO

Read CSV

use polars::prelude::*;

// read from path
let df = CsvReader::from_path("iris_csv")?
            .infer_schema(None)
            .has_header(true)
            .finish()?;

Write CSV

use polars::prelude::*;
use std::fs::File;

// create a file
let mut file = File::create("example.csv").expect("could not create file");

// write DataFrame to file
CsvWriter::new(&mut file)
    .has_headers(true)
    .with_delimiter(b',')
    .finish(df)?;

Read IPC

use polars::prelude::*;
use std::fs::File;

// open file
let file = File::open("file.ipc").expect("file not found");

// read to DataFrame
let df = IpcReader::new(file)
   .finish()?;

Write IPC

use polars::prelude::*;
use std::fs::File;

// create a file
let mut file = File::create("file.ipc").expect("could not create file");

// write DataFrame to file
IpcWriter::new(&mut file)
    .finish(df)?;

Read Parquet

use polars::prelude::*;
use std::fs::File;

// open file
let file = File::open("some_file.parquet").unwrap();

// read to DataFrame
let df = ParquetReader::new(file).finish()?;

Write Parquet

use polars::prelude::*;
use std::fs::File;

// create a file
let file = File::create("example.parquet").expect("could not create file");

ParquetWriter::new(file)
    .finish(df)?;

Various

Replace NaN with Missing.

The floating point value NaN (Not a Number) is conceptually different from missing data in Polars. In the snippet below we show how to replace NaN values with missing values by setting them to None.

use polars::prelude::*;
use polars::df;

/// Replaces NaN with missing values.
fn fill_nan_with_nulls() -> Result<DataFrame> {
    let nan = f64::NAN;

    let mut df = df! {
       "a" => [nan, 1.0, 2.0],
       "b" => [nan, 1.0, 2.0]
    }
    .unwrap();

    for idx in 0..df.width() {
        df.may_apply_at_idx(idx, |series| {
            let mask = series.is_nan()?;
            let ca = series.f64()?;
            ca.set(&mask, None)
        })?;
    }
    Ok(df)
}

Extracting data

To be able to extract data out of a Series, either by iterating over it or by converting it to another data type like a Vec<T>, we first need to downcast it to a ChunkedArray<T>. This is needed because we don't know the data type held by the Series.

use polars::prelude::*;
use polars::df;

fn extract_data() -> Result<()> {
    let df = df! [
       "a" => [None, Some(1.0f32), Some(2.0)],
       "str" => ["foo", "bar", "ham"]
    ]?;

    // first extract ChunkedArray to get the inner type.
    let ca = df.column("a").f32();

    // Then convert to vec
    let to_vec: Vec<Option<f32>> = Vec::from(ca);

    // We can also do this with iterators
    let ca = df.column("str").utf8();
    let to_vec: Vec<Option<&str>> = ca.into_iter().collect();
    let to_vec_no_options: Vec<&str> = ca.into_no_null_iter().collect();

    Ok(())
}