polars-rows-iter 0.13.2

Library for easy and convenient row iteration of polars dataframes
Documentation
use itertools::Itertools;
use polars::prelude::*;
use rand::{
    distr::{Alphanumeric, Distribution, StandardUniform},
    rngs::StdRng,
    RngExt, SeedableRng,
};
use std::collections::HashMap;

/// Flag telling the generators in this file whether a column may contain null values.
pub type IsOptional = bool;

/// Number of nanoseconds in 24 hours; used as the exclusive upper bound when generating
/// random values for `DataType::Time` columns.
const TIME64_MAX_VALUE: i64 = 24 * 60 * 60 * 1_000_000_000;

/// Specification of a column to generate: its data type (field `0`) and whether the
/// column may contain null values (field `1`).
#[derive(Debug, Clone)]
pub struct ColumnType(pub DataType, pub IsOptional);

/// Builds a vector of `height` elements by calling `get_value` once per element, in order.
///
/// The generator is not called at all when `height` is zero.
pub fn create_values<T, F>(height: usize, mut get_value: F) -> Vec<T>
where
    F: FnMut() -> T,
{
    (0..height).map(|_| get_value()).collect()
}

/// Draws a nullable boolean from `rng`.
///
/// The first draw (probability 0.5) decides whether the result is `None`; the boolean
/// payload is drawn only when a value is actually produced.
pub fn create_optional_bool(rng: &mut StdRng) -> Option<bool> {
    if rng.random_bool(0.5) {
        return None;
    }

    Some(rng.random_bool(0.5))
}

/// Draws a nullable value of type `T` from `rng`.
///
/// A first draw (probability 0.5) decides whether a value is present; only then is a
/// `T` sampled from the standard uniform distribution.
pub fn create_optional_number<T>(rng: &mut StdRng) -> Option<T>
where
    StandardUniform: Distribution<T>,
{
    let has_value = !rng.random_bool(0.5);
    has_value.then(|| rng.random())
}

/// Produces either `None` or a value built by `create_value`.
///
/// A first draw from `rng` (probability 0.5) selects `None`; otherwise `create_value` is
/// invoked with the same generator and its result is wrapped in `Some`.
pub fn create_optional<T, F>(rng: &mut StdRng, mut create_value: F) -> Option<T>
where
    F: FnMut(&mut StdRng) -> T,
{
    let skip_value = rng.random_bool(0.5);
    if skip_value {
        None
    } else {
        Some(create_value(rng))
    }
}

/// Generates a random string whose length is drawn from `4..32` and whose characters are
/// sampled from the `Alphanumeric` distribution.
pub fn create_random_string(rng: &mut StdRng) -> String {
    let length: usize = rng.random_range(4..32);
    let characters = rng.sample_iter(&Alphanumeric).take(length).map(char::from);

    characters.collect()
}

/// Generates a random byte vector whose length is drawn from `4..32` and whose bytes are
/// sampled from the `Alphanumeric` distribution.
pub fn create_random_binary(rng: &mut StdRng) -> Vec<u8> {
    let length: usize = rng.random_range(4..32);
    let bytes = rng.sample_iter(&Alphanumeric).take(length);

    bytes.collect()
}

/// Picks `height` random category names for an enum column.
///
/// Each value is obtained by drawing an index in `0..number_of_categories` and resolving
/// it to its string through `mapping`.
///
/// # Panics
///
/// Panics if `mapping` has no string for a drawn index.
pub fn create_enum_values<'a>(
    categories: &'a FrozenCategories,
    mapping: &'a CategoricalMapping,
    height: usize,
    rng: &mut StdRng,
) -> Vec<&'a str> {
    let category_count = categories.categories().len() as u32;

    let mut picked = Vec::with_capacity(height);
    for _ in 0..height {
        let index = rng.random_range(0..category_count);
        picked.push(mapping.cat_to_str(index).unwrap());
    }

    picked
}

/// Picks `height` random, possibly missing, category names for an enum column.
///
/// For every element an index in `0..number_of_categories` is drawn first, then a second
/// draw (probability 0.5) decides whether the resolved name or `None` is emitted.
pub fn create_optional_enum_values<'a>(
    categories: &'a FrozenCategories,
    mapping: &'a CategoricalMapping,
    height: usize,
    rng: &mut StdRng,
) -> Vec<Option<&'a str>> {
    let category_count = categories.categories().len() as u32;

    let mut picked = Vec::with_capacity(height);
    for _ in 0..height {
        // The index is always drawn, even for elements that end up as `None`.
        let index = rng.random_range(0..category_count);
        let value = if rng.random_bool(0.5) {
            mapping.cat_to_str(index)
        } else {
            None
        };
        picked.push(value);
    }

    picked
}

pub fn create_column(name: &str, dtype: &DataType, optional: IsOptional, height: usize, rng: &mut StdRng) -> Column {
    // println!("Creating column {name} with type {dtype} (optional: {optional})");
    let name = name.into();
    match dtype {
        DataType::Boolean => match optional {
            true => Column::new(name, create_values(height, || create_optional_bool(rng))),
            false => Column::new(name, create_values(height, || rng.random_bool(0.5))),
        },
        DataType::UInt8 => match optional {
            true => Column::new(name, create_values(height, || create_optional_number::<u8>(rng))),
            false => Column::new(name, create_values(height, || rng.random::<u8>())),
        },
        DataType::UInt16 => match optional {
            true => Column::new(name, create_values(height, || create_optional_number::<u16>(rng))),
            false => Column::new(name, create_values(height, || rng.random::<u16>())),
        },
        DataType::UInt32 => match optional {
            true => Column::new(name, create_values(height, || create_optional_number::<u32>(rng))),
            false => Column::new(name, create_values(height, || rng.random::<u32>())),
        },
        DataType::UInt64 => match optional {
            true => Column::new(name, create_values(height, || create_optional_number::<u64>(rng))),
            false => Column::new(name, create_values(height, || rng.random::<u64>())),
        },
        DataType::Int8 => match optional {
            true => Column::new(name, create_values(height, || create_optional_number::<i8>(rng))),
            false => Column::new(name, create_values(height, || rng.random::<i8>())),
        },
        DataType::Int16 => match optional {
            true => Column::new(name, create_values(height, || create_optional_number::<i16>(rng))),
            false => Column::new(name, create_values(height, || rng.random::<i16>())),
        },
        DataType::Int32 => match optional {
            true => Column::new(name, create_values(height, || create_optional_number::<i32>(rng))),
            false => Column::new(name, create_values(height, || rng.random::<i32>())),
        },
        DataType::Int64 => match optional {
            true => Column::new(name, create_values(height, || create_optional_number::<i64>(rng))),
            false => Column::new(name, create_values(height, || rng.random::<i64>())),
        },
        DataType::Float32 => match optional {
            true => Column::new(name, create_values(height, || create_optional_number::<f32>(rng))),
            false => Column::new(name, create_values(height, || rng.random::<f32>())),
        },
        DataType::Float64 => match optional {
            true => Column::new(name, create_values(height, || create_optional_number::<f64>(rng))),
            false => Column::new(name, create_values(height, || rng.random::<f64>())),
        },
        DataType::String => match optional {
            true => Column::new(
                name,
                create_values(height, || create_optional(rng, create_random_string)),
            ),
            false => Column::new(name, create_values(height, || create_random_string(rng))),
        },
        DataType::Categorical(mapping, ordering) => match optional {
            true => Column::new(
                name,
                create_values(height, || create_optional(rng, create_random_string)),
            )
            .cast(&DataType::Categorical(mapping.clone(), ordering.clone()))
            .unwrap(),
            false => Column::new(name, create_values(height, || create_random_string(rng)))
                .cast(&DataType::Categorical(mapping.clone(), ordering.clone()))
                .unwrap(),
        },
        DataType::Enum(categories, mapping) => match optional {
            true => Column::new(
                name,
                create_optional_enum_values(categories.as_ref(), mapping.as_ref(), height, rng),
            )
            .cast(&DataType::Enum(categories.clone(), mapping.clone()))
            .unwrap(),
            false => Column::new(
                name,
                create_enum_values(categories.as_ref(), mapping.as_ref(), height, rng),
            )
            .cast(&DataType::Enum(categories.clone(), mapping.clone()))
            .unwrap(),
        },
        DataType::Datetime(unit, zone) => match optional {
            true => Column::new(name, create_values(height, || create_optional_number::<i64>(rng)))
                .cast(&DataType::Datetime(*unit, zone.clone()))
                .unwrap(),
            false => Column::new(name, create_values(height, || rng.random::<i64>()))
                .cast(&DataType::Datetime(*unit, zone.clone()))
                .unwrap(),
        },
        DataType::Date => match optional {
            true => Column::new(name, create_values(height, || create_optional_number::<i32>(rng)))
                .cast(&DataType::Date)
                .unwrap(),
            false => Column::new(name, create_values(height, || rng.random::<i32>()))
                .cast(&DataType::Date)
                .unwrap(),
        },
        DataType::Time => match optional {
            true => Column::new(
                name,
                create_values(height, || {
                    create_optional(rng, |rng| {
                        let value: i64 = rng.random_range(0..TIME64_MAX_VALUE);
                        value
                    })
                }),
            )
            .cast(&DataType::Time)
            .unwrap(),
            false => Column::new(
                name,
                create_values(height, || {
                    let value: i64 = rng.random_range(0..TIME64_MAX_VALUE);
                    value
                }),
            )
            .cast(&DataType::Time)
            .unwrap(),
        },
        DataType::Duration(unit) => match optional {
            true => Column::new(name, create_values(height, || create_optional_number::<i64>(rng)))
                .cast(&DataType::Duration(*unit))
                .unwrap(),
            false => Column::new(name, create_values(height, || rng.random::<i64>()))
                .cast(&DataType::Duration(*unit))
                .unwrap(),
        },
        DataType::Binary => match optional {
            true => Column::new(
                name,
                create_values(height, || create_optional(rng, create_random_binary)),
            ),
            false => Column::new(name, create_values(height, || create_random_binary(rng))),
        },
        DataType::BinaryOffset => match optional {
            true => {
                let values: LargeBinaryArray = create_values(height, || create_optional(rng, create_random_binary))
                    .into_iter()
                    .collect();

                BinaryOffsetChunked::from(values).into_column().with_name(name)
            }
            false => {
                let values: LargeBinaryArray = create_values(height, || Some(create_random_binary(rng)))
                    .into_iter()
                    .collect();

                BinaryOffsetChunked::from(values).into_column().with_name(name)
            }
        },
        DataType::List(dtype) => {
            if optional {
                let series = (0..height)
                    .map(|_| {
                        if rng.random_bool(0.5) {
                            Some(
                                create_column("", dtype.as_ref(), optional, height, rng)
                                    .as_materialized_series()
                                    .clone(),
                            )
                        } else {
                            None
                        }
                    })
                    .collect_vec();

                Column::new(name, series)
            } else {
                let series = (0..height)
                    .map(|_| {
                        create_column("", dtype.as_ref(), optional, height, rng)
                            .as_materialized_series()
                            .clone()
                    })
                    .collect_vec();

                Column::new(name, series)
            }
        }
        _ => todo!(),
    }
}

/// Builds a dataframe of `height` rows with one randomly generated column per entry of
/// `columns` (column name mapped to its [`ColumnType`]).
///
/// All columns share a single generator seeded with the fixed value `0`. Because the
/// iteration order of a `HashMap` is unspecified, the specifications are sorted by column
/// name before generation; this keeps both the column order and the portion of the random
/// stream consumed by each column the same on every call.
///
/// # Panics
///
/// Panics if `create_column` panics for one of the requested types or if
/// `DataFrame::new` rejects the generated columns.
pub fn create_dataframe(columns: HashMap<&str, ColumnType>, height: usize) -> DataFrame {
    let mut rng = StdRng::seed_from_u64(0);

    // Map keys are unique, so an unstable sort still yields a single well-defined order.
    let mut specs: Vec<(&str, ColumnType)> = columns.into_iter().collect();
    specs.sort_unstable_by(|(left, _), (right, _)| left.cmp(right));

    let columns: Vec<Column> = specs
        .into_iter()
        .map(|(name, ColumnType(dtype, optional))| create_column(name, &dtype, optional, height, &mut rng))
        .collect();

    DataFrame::new(height, columns).unwrap()
}