use crate::core::data_value::DataValue;
use std::collections::HashMap;
/// Logical element type tag of a column, as reported by `ColumnOps::dtype`.
///
/// The enum is fieldless, so it derives `Copy` (like the other option enums in
/// this module) and `Hash`, allowing it to be passed by value freely and used
/// as a key in hashed collections.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ColumnType {
    /// 64-bit signed integers.
    Int64,
    /// 32-bit signed integers.
    Int32,
    /// 64-bit floating-point numbers.
    Float64,
    /// 32-bit floating-point numbers.
    Float32,
    /// Text values.
    String,
    /// `true` / `false` values.
    Boolean,
    /// Timestamp values.
    DateTime,
    /// Values drawn from a set of categories.
    Categorical,
    /// NOTE(review): nothing in this file constructs or inspects this variant;
    /// presumably a catch-all for untyped values — confirm.
    Object,
}
/// Selector passed to `ColumnOps::duplicated`.
///
/// NOTE(review): the variant names match the `keep` argument of pandas
/// `duplicated`; the semantics described below are presumed from that
/// convention — confirm against the implementors.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DuplicateKeep {
    /// Presumably: the first occurrence is not flagged as a duplicate.
    First,
    /// Presumably: the last occurrence is not flagged as a duplicate.
    Last,
    /// Presumably: every occurrence of a repeated value is flagged.
    ///
    /// Not to be confused with `Option::None`.
    None,
}
/// Side selector accepted by `StringColumnOps::pad`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum PadSide {
    /// Left-hand side of the string.
    Left,
    /// Right-hand side of the string.
    Right,
    /// Both sides of the string.
    Both,
}
/// Core operations shared by every column whose elements have type `T`.
///
/// Only signatures are declared here. Behavioral notes below are inferred from
/// the signatures and from the method names (which follow pandas `Series`
/// conventions); confirm them against the implementors.
///
/// NOTE(review): `eq`/`ne`/`lt`/`le`/`gt`/`ge` share their names with the
/// prelude's `PartialEq`/`PartialOrd` methods, so a type implementing both may
/// require fully qualified calls such as `ColumnOps::eq(&a, &b)`.
pub trait ColumnOps<T> {
    /// Column type produced by operations that return a new column.
    type Output: ColumnOps<T>;
    /// Error type returned by the fallible operations.
    type Error: std::error::Error;

    /// Returns a reference to the element at `index`, or `None`
    /// (presumably when `index` is out of range — confirm).
    fn get(&self, index: usize) -> Option<&T>;
    /// Returns a reference to the element at `index` without an `Option`.
    ///
    /// NOTE(review): despite the name this is a safe `fn`, so implementations
    /// must not exhibit undefined behavior for a bad index — presumably they
    /// panic; confirm.
    fn get_unchecked(&self, index: usize) -> &T;
    /// Number of elements in the column.
    fn len(&self) -> usize;
    /// Whether the column has no elements.
    fn is_empty(&self) -> bool;
    /// Logical element type of the column.
    fn dtype(&self) -> ColumnType;
    /// Name of the column, if one has been set.
    fn name(&self) -> Option<&str>;
    /// Sets the name of the column.
    fn set_name(&mut self, name: String);

    /// Whether the element at `index` is null.
    fn is_null(&self, index: usize) -> bool;
    /// Number of null elements.
    fn null_count(&self) -> usize;
    /// Whether the column contains any null element.
    fn has_nulls(&self) -> bool;
    /// Returns a new column, presumably with null elements removed.
    fn dropna(&self) -> std::result::Result<Self::Output, Self::Error>;
    /// Returns a new column, presumably with null elements replaced by `value`.
    fn fillna(&self, value: &T) -> std::result::Result<Self::Output, Self::Error>;

    /// Adds `value` to the column in place (presumably at the end).
    fn append(&mut self, value: T) -> std::result::Result<(), Self::Error>;
    /// Adds every element of `values` to the column in place.
    fn extend_from_slice(&mut self, values: &[T]) -> std::result::Result<(), Self::Error>;
    /// Inserts `value` at position `index` in place.
    fn insert(&mut self, index: usize, value: T) -> std::result::Result<(), Self::Error>;
    /// Removes the element at `index` in place and returns it.
    fn remove(&mut self, index: usize) -> std::result::Result<T, Self::Error>;

    /// Applies `func` to elements and returns a type-erased result.
    ///
    /// The return value is a `Box<dyn Any>`; callers must downcast it.
    /// NOTE(review): the concrete type behind the box is not established in
    /// this file — confirm with the implementors.
    fn map<U, F>(&self, func: F) -> std::result::Result<Box<dyn std::any::Any>, Self::Error>
    where
        F: Fn(&T) -> U,
        U: Clone + Send + Sync + 'static;
    /// Returns a new column selected by the boolean `mask`
    /// (presumably keeping positions where the mask is `true`).
    fn filter(&self, mask: &[bool]) -> std::result::Result<Self::Output, Self::Error>;
    /// Returns a new column built from the elements at `indices`.
    fn take(&self, indices: &[usize]) -> std::result::Result<Self::Output, Self::Error>;
    /// Returns a new column covering the range `start`..`end`
    /// (presumably half-open — confirm).
    fn slice(&self, start: usize, end: usize) -> std::result::Result<Self::Output, Self::Error>;

    /// Comparison against `other` yielding a boolean column (`==`).
    fn eq(&self, other: &Self) -> std::result::Result<BooleanColumn, Self::Error>
    where
        T: PartialEq;
    /// Comparison against `other` yielding a boolean column (`!=`).
    fn ne(&self, other: &Self) -> std::result::Result<BooleanColumn, Self::Error>
    where
        T: PartialEq;
    /// Comparison against `other` yielding a boolean column (`<`).
    fn lt(&self, other: &Self) -> std::result::Result<BooleanColumn, Self::Error>
    where
        T: PartialOrd;
    /// Comparison against `other` yielding a boolean column (`<=`).
    fn le(&self, other: &Self) -> std::result::Result<BooleanColumn, Self::Error>
    where
        T: PartialOrd;
    /// Comparison against `other` yielding a boolean column (`>`).
    fn gt(&self, other: &Self) -> std::result::Result<BooleanColumn, Self::Error>
    where
        T: PartialOrd;
    /// Comparison against `other` yielding a boolean column (`>=`).
    fn ge(&self, other: &Self) -> std::result::Result<BooleanColumn, Self::Error>
    where
        T: PartialOrd;

    /// Returns a new column, presumably holding each distinct value once.
    fn unique(&self) -> std::result::Result<Self::Output, Self::Error>
    where
        T: Eq + std::hash::Hash;
    /// Number of distinct values.
    fn nunique(&self) -> usize
    where
        T: Eq + std::hash::Hash;
    /// Whether no value occurs more than once.
    fn is_unique(&self) -> bool
    where
        T: Eq + std::hash::Hash;
    /// Returns a boolean column marking duplicates according to `keep`.
    fn duplicated(&self, keep: DuplicateKeep) -> std::result::Result<BooleanColumn, Self::Error>
    where
        T: Eq + std::hash::Hash;

    /// Returns a sorted copy of the column; `ascending` selects the direction.
    fn sort(&self, ascending: bool) -> std::result::Result<Self::Output, Self::Error>
    where
        T: Ord;
    /// Returns the indices that would sort the column; `ascending` selects the
    /// direction.
    fn argsort(&self, ascending: bool) -> std::result::Result<Vec<usize>, Self::Error>
    where
        T: Ord;

    /// Memory used by the column (presumably in bytes — confirm).
    fn memory_usage(&self) -> usize;
    /// Releases unused capacity held by the column, in place.
    fn shrink_to_fit(&mut self);
}
/// Arithmetic, aggregation and math operations for columns of numeric `T`.
///
/// Only signatures are declared here. The one-line descriptions are inferred
/// from the method names (pandas/numpy conventions) and signatures; null
/// handling, overflow and length-mismatch behavior are not established in this
/// file — confirm with the implementors.
pub trait NumericColumnOps<T>: ColumnOps<T>
where
    T: num_traits::Num + Copy + PartialOrd + Send + Sync + 'static,
{
    // --- column-with-column arithmetic -----------------------------------

    /// Addition with `other`, yielding a new column.
    fn add(&self, other: &Self) -> std::result::Result<Self::Output, Self::Error>;
    /// Subtraction of `other`, yielding a new column.
    fn sub(&self, other: &Self) -> std::result::Result<Self::Output, Self::Error>;
    /// Multiplication with `other`, yielding a new column.
    fn mul(&self, other: &Self) -> std::result::Result<Self::Output, Self::Error>;
    /// Division by `other`, yielding a new column.
    fn div(&self, other: &Self) -> std::result::Result<Self::Output, Self::Error>;
    /// Raises values to `exponent`, yielding a new column.
    fn pow(&self, exponent: f64) -> std::result::Result<Self::Output, Self::Error>;

    // --- column-with-scalar arithmetic -----------------------------------

    /// Addition of `scalar`, yielding a new column.
    fn add_scalar(&self, scalar: T) -> std::result::Result<Self::Output, Self::Error>;
    /// Subtraction of `scalar`, yielding a new column.
    fn sub_scalar(&self, scalar: T) -> std::result::Result<Self::Output, Self::Error>;
    /// Multiplication by `scalar`, yielding a new column.
    fn mul_scalar(&self, scalar: T) -> std::result::Result<Self::Output, Self::Error>;
    /// Division by `scalar`, yielding a new column.
    fn div_scalar(&self, scalar: T) -> std::result::Result<Self::Output, Self::Error>;

    // --- aggregations (`None` presumably means "no values" — confirm) ----

    /// Sum of the values.
    fn sum(&self) -> Option<T>;
    /// Arithmetic mean of the values.
    fn mean(&self) -> Option<f64>;
    /// Median of the values.
    fn median(&self) -> Option<f64>;
    /// Standard deviation; `ddof` is presumably the delta degrees of freedom.
    fn std(&self, ddof: usize) -> Option<f64>;
    /// Variance; `ddof` is presumably the delta degrees of freedom.
    fn var(&self, ddof: usize) -> Option<f64>;
    /// Smallest value.
    fn min(&self) -> Option<T>;
    /// Largest value.
    fn max(&self) -> Option<T>;
    /// Quantile `q` of the values (presumably `q` in `0.0..=1.0` — confirm).
    fn quantile(&self, q: f64) -> Option<f64>;

    // --- cumulative operations -------------------------------------------

    /// Cumulative sum, yielding a new column.
    fn cumsum(&self) -> std::result::Result<Self::Output, Self::Error>;
    /// Cumulative product, yielding a new column.
    fn cumprod(&self) -> std::result::Result<Self::Output, Self::Error>;
    /// Cumulative maximum, yielding a new column.
    fn cummax(&self) -> std::result::Result<Self::Output, Self::Error>;
    /// Cumulative minimum, yielding a new column.
    fn cummin(&self) -> std::result::Result<Self::Output, Self::Error>;

    // --- math functions (available only for the bounded `T`) --------------

    /// Rounds values to `decimals` decimal places.
    fn round(&self, decimals: i32) -> std::result::Result<Self::Output, Self::Error>
    where
        T: num_traits::Float;
    /// Floor of each value.
    fn floor(&self) -> std::result::Result<Self::Output, Self::Error>
    where
        T: num_traits::Float;
    /// Ceiling of each value.
    fn ceil(&self) -> std::result::Result<Self::Output, Self::Error>
    where
        T: num_traits::Float;
    /// Absolute value of each value.
    fn abs(&self) -> std::result::Result<Self::Output, Self::Error>
    where
        T: num_traits::Signed;
    /// Square root of each value.
    fn sqrt(&self) -> std::result::Result<Self::Output, Self::Error>
    where
        T: num_traits::Float;
    /// Exponential of each value.
    fn exp(&self) -> std::result::Result<Self::Output, Self::Error>
    where
        T: num_traits::Float;
    /// Logarithm of each value (presumably natural log — confirm).
    fn log(&self) -> std::result::Result<Self::Output, Self::Error>
    where
        T: num_traits::Float;
    /// Sine of each value.
    fn sin(&self) -> std::result::Result<Self::Output, Self::Error>
    where
        T: num_traits::Float;
    /// Cosine of each value.
    fn cos(&self) -> std::result::Result<Self::Output, Self::Error>
    where
        T: num_traits::Float;
}
/// String-specific operations for columns of `String`.
///
/// Only signatures are declared here. Method names follow Python/pandas string
/// conventions; the descriptions are inferred from those names and from the
/// signatures — confirm details (null handling, byte vs. character indexing,
/// regex dialect) with the implementors.
pub trait StringColumnOps: ColumnOps<String> {
    /// Length of each string as an integer column (presumably in characters,
    /// per the method name).
    fn len_chars(&self) -> std::result::Result<Int64Column, Self::Error>;
    /// Lower-cased copy of the column.
    fn lower(&self) -> std::result::Result<Self::Output, Self::Error>;
    /// Upper-cased copy of the column.
    fn upper(&self) -> std::result::Result<Self::Output, Self::Error>;
    /// Copy with both ends of each string stripped.
    fn strip(&self) -> std::result::Result<Self::Output, Self::Error>;
    /// Copy with the left end of each string stripped.
    fn lstrip(&self) -> std::result::Result<Self::Output, Self::Error>;
    /// Copy with the right end of each string stripped.
    fn rstrip(&self) -> std::result::Result<Self::Output, Self::Error>;

    /// Boolean column telling whether each string contains `pattern`;
    /// `regex` presumably selects regex vs. literal matching.
    fn contains(
        &self,
        pattern: &str,
        regex: bool,
    ) -> std::result::Result<BooleanColumn, Self::Error>;
    /// Boolean column telling whether each string starts with `prefix`.
    fn startswith(&self, prefix: &str) -> std::result::Result<BooleanColumn, Self::Error>;
    /// Boolean column telling whether each string ends with `suffix`.
    fn endswith(&self, suffix: &str) -> std::result::Result<BooleanColumn, Self::Error>;
    /// Integer column with the position of `substring` in each string.
    ///
    /// NOTE(review): the "not found" encoding is not established here
    /// (Python's `str.find` uses `-1`) — confirm.
    fn find(&self, substring: &str) -> std::result::Result<Int64Column, Self::Error>;
    /// Copy with `pattern` replaced by `replacement`; `regex` presumably
    /// selects regex vs. literal matching.
    fn replace(
        &self,
        pattern: &str,
        replacement: &str,
        regex: bool,
    ) -> std::result::Result<Self::Output, Self::Error>;
    /// Copy with each string cut to the optional `start`/`end` bounds.
    fn slice_str(
        &self,
        start: Option<usize>,
        end: Option<usize>,
    ) -> std::result::Result<Self::Output, Self::Error>;
    /// Splits each string on `delimiter`, returning several columns.
    ///
    /// NOTE(review): whether the returned `Vec` is per-row or per-part is not
    /// established in this file — confirm.
    fn split(&self, delimiter: &str) -> std::result::Result<Vec<Self::Output>, Self::Error>;
    /// Joins the column's strings into one `String` using `separator`.
    fn join(&self, separator: &str) -> String;

    /// Frequency table of the values as a `DataFrame`.
    fn value_counts(&self) -> std::result::Result<crate::dataframe::DataFrame, Self::Error>;
    /// Converts the column to a categorical column.
    fn to_categorical(&self) -> std::result::Result<CategoricalColumn, Self::Error>;

    /// Copy with each string padded to `width` on `side` using `fillchar`.
    fn pad(
        &self,
        width: usize,
        side: PadSide,
        fillchar: char,
    ) -> std::result::Result<Self::Output, Self::Error>;
    /// Copy with each string centered in `width` using `fillchar`.
    fn center(
        &self,
        width: usize,
        fillchar: char,
    ) -> std::result::Result<Self::Output, Self::Error>;
    /// Copy with each string left-justified in `width` using `fillchar`.
    fn ljust(&self, width: usize, fillchar: char)
        -> std::result::Result<Self::Output, Self::Error>;
    /// Copy with each string right-justified in `width` using `fillchar`.
    fn rjust(&self, width: usize, fillchar: char)
        -> std::result::Result<Self::Output, Self::Error>;
}
/// Date/time operations for columns of `chrono::DateTime<chrono::Utc>`.
///
/// Only signatures are declared here; descriptions are inferred from the
/// method names and signatures — confirm numbering conventions (e.g. which
/// weekday is 0) and time-zone semantics with the implementors.
pub trait DateTimeColumnOps: ColumnOps<chrono::DateTime<chrono::Utc>> {
    // --- component extraction --------------------------------------------

    /// Year of each timestamp.
    fn year(&self) -> std::result::Result<Int64Column, Self::Error>;
    /// Month of each timestamp.
    fn month(&self) -> std::result::Result<Int64Column, Self::Error>;
    /// Day of month of each timestamp.
    fn day(&self) -> std::result::Result<Int64Column, Self::Error>;
    /// Hour of each timestamp.
    fn hour(&self) -> std::result::Result<Int64Column, Self::Error>;
    /// Minute of each timestamp.
    fn minute(&self) -> std::result::Result<Int64Column, Self::Error>;
    /// Second of each timestamp.
    fn second(&self) -> std::result::Result<Int64Column, Self::Error>;
    /// Day of week of each timestamp.
    fn weekday(&self) -> std::result::Result<Int64Column, Self::Error>;
    /// Day of year of each timestamp.
    fn dayofyear(&self) -> std::result::Result<Int64Column, Self::Error>;

    // --- conversions -----------------------------------------------------

    /// Formats each timestamp with `format`, yielding a string column.
    fn strftime(&self, format: &str) -> std::result::Result<StringColumn, Self::Error>;
    /// Converts to a date column.
    fn to_date(&self) -> std::result::Result<DateColumn, Self::Error>;
    /// Converts to a time column.
    fn to_time(&self) -> std::result::Result<TimeColumn, Self::Error>;
    /// Time-zone localization to `tz` (pandas naming) — semantics to confirm.
    fn tz_localize(&self, tz: &str) -> std::result::Result<Self::Output, Self::Error>;
    /// Time-zone conversion to `tz` (pandas naming) — semantics to confirm.
    fn tz_convert(&self, tz: &str) -> std::result::Result<Self::Output, Self::Error>;

    // --- arithmetic and comparison ---------------------------------------

    /// Copy shifted by `days` days (signed).
    fn add_days(&self, days: i64) -> std::result::Result<Self::Output, Self::Error>;
    /// Copy shifted by `months` months (signed).
    fn add_months(&self, months: i64) -> std::result::Result<Self::Output, Self::Error>;
    /// Copy shifted by `years` years (signed).
    fn add_years(&self, years: i64) -> std::result::Result<Self::Output, Self::Error>;
    /// Boolean column telling whether each timestamp lies between `start` and
    /// `end` (bound inclusivity not established here — confirm).
    fn between(
        &self,
        start: &chrono::DateTime<chrono::Utc>,
        end: &chrono::DateTime<chrono::Utc>,
    ) -> std::result::Result<BooleanColumn, Self::Error>;
    /// Integer column with the business-day count between this column and
    /// `end`.
    fn business_day_count(&self, end: &Self) -> std::result::Result<Int64Column, Self::Error>;
}
/// Logical operations and reductions for columns of `bool`.
///
/// Only signatures are declared here; null handling and length-mismatch
/// behavior are not established in this file — confirm with the implementors.
pub trait BooleanColumnOps: ColumnOps<bool> {
    /// Logical AND with `other`, yielding a new column.
    fn and(&self, other: &Self) -> std::result::Result<Self::Output, Self::Error>;
    /// Logical OR with `other`, yielding a new column.
    fn or(&self, other: &Self) -> std::result::Result<Self::Output, Self::Error>;
    /// Logical XOR with `other`, yielding a new column.
    fn xor(&self, other: &Self) -> std::result::Result<Self::Output, Self::Error>;
    /// Logical negation, yielding a new column.
    fn not(&self) -> std::result::Result<Self::Output, Self::Error>;

    /// Whether any element is `true`.
    fn any(&self) -> bool;
    /// Whether all elements are `true`.
    fn all(&self) -> bool;
    /// Number of `true` elements.
    fn count_true(&self) -> usize;
    /// Number of `false` elements.
    fn count_false(&self) -> usize;

    /// Converts to an integer column (presumably `true` → 1, `false` → 0).
    fn to_int(&self) -> std::result::Result<Int64Column, Self::Error>;
    /// Converts to a float column (presumably `true` → 1.0, `false` → 0.0).
    fn to_float(&self) -> std::result::Result<Float64Column, Self::Error>;
}
/// Category management for categorical columns over values of type `T`.
///
/// Only signatures are declared here. Names follow the pandas categorical
/// accessor; what happens to existing values when categories change is not
/// established in this file — confirm with the implementors.
pub trait CategoricalColumnOps<T>: ColumnOps<T>
where
    T: Clone + Eq + std::hash::Hash + Send + Sync + 'static,
{
    /// Returns the categories as an owned `Vec`.
    fn categories(&self) -> Vec<T>;
    /// Adds `categories` to the column's category set, in place.
    fn add_categories(&mut self, categories: &[T]) -> std::result::Result<(), Self::Error>;
    /// Removes `categories` from the column's category set, in place.
    fn remove_categories(&mut self, categories: &[T]) -> std::result::Result<(), Self::Error>;
    /// Replaces the category set with `categories` and sets the `ordered`
    /// flag, in place.
    fn set_categories(
        &mut self,
        categories: Vec<T>,
        ordered: bool,
    ) -> std::result::Result<(), Self::Error>;

    /// Whether the categories are ordered.
    fn is_ordered(&self) -> bool;
    /// Sets the ordered flag.
    fn set_ordered(&mut self, ordered: bool);

    /// Integer codes of the values (presumably indices into `categories()`).
    fn codes(&self) -> std::result::Result<Int64Column, Self::Error>;
    /// Reorders the categories to `new_categories`, in place.
    fn reorder_categories(
        &mut self,
        new_categories: Vec<T>,
    ) -> std::result::Result<(), Self::Error>;
    /// Renames the categories via `rename_func`, in place.
    fn rename_categories<F>(&mut self, rename_func: F) -> std::result::Result<(), Self::Error>
    where
        F: Fn(&T) -> T;
}
/// Wrapper around `crate::column::Int64Column`; the boxed form is exposed as
/// this module's `Int64Column` alias.
///
/// Note that the module-level `Int64Column` alias and
/// `crate::column::Int64Column` are different types.
#[derive(Debug, Clone)]
pub struct ConcreteInt64Column {
    /// Underlying storage column.
    inner: crate::column::Int64Column,
}
/// Wrapper for 32-bit integer columns; the boxed form is exposed as this
/// module's `Int32Column` alias.
///
/// Values are held in a 64-bit column: `DefaultColumnFactory::create_int32`
/// widens every `i32` to `i64` before storing it.
#[derive(Debug, Clone)]
pub struct ConcreteInt32Column {
    /// Underlying storage column (64-bit).
    inner: crate::column::Int64Column,
}
/// Wrapper around `crate::column::Float64Column`; the boxed form is exposed as
/// this module's `Float64Column` alias.
///
/// Note that the module-level `Float64Column` alias and
/// `crate::column::Float64Column` are different types.
#[derive(Debug, Clone)]
pub struct ConcreteFloat64Column {
    /// Underlying storage column.
    inner: crate::column::Float64Column,
}
/// Wrapper for 32-bit float columns; the boxed form is exposed as this
/// module's `Float32Column` alias.
///
/// Values are held in a 64-bit column: `DefaultColumnFactory::create_float32`
/// widens every `f32` to `f64` before storing it.
#[derive(Debug, Clone)]
pub struct ConcreteFloat32Column {
    /// Underlying storage column (64-bit).
    inner: crate::column::Float64Column,
}
/// Wrapper around `crate::column::StringColumn`; the boxed form is exposed as
/// this module's `StringColumn` alias.
///
/// Note that the module-level `StringColumn` alias and
/// `crate::column::StringColumn` are different types.
#[derive(Debug, Clone)]
pub struct ConcreteStringColumn {
    /// Underlying storage column.
    inner: crate::column::StringColumn,
}
/// Wrapper around `crate::column::BooleanColumn`; the boxed form is exposed as
/// this module's `BooleanColumn` alias.
///
/// Note that the module-level `BooleanColumn` alias and
/// `crate::column::BooleanColumn` are different types.
#[derive(Debug, Clone)]
pub struct ConcreteBooleanColumn {
    /// Underlying storage column.
    inner: crate::column::BooleanColumn,
}
/// Wrapper for date-time columns; the boxed form is exposed as this module's
/// `DateTimeColumn` alias.
///
/// Timestamps are held as text: `DefaultColumnFactory::create_datetime`
/// formats every value with `to_rfc3339()` before storing it.
#[derive(Debug, Clone)]
pub struct ConcreteDateTimeColumn {
    /// Underlying string storage column.
    inner: crate::column::StringColumn,
}
/// Wrapper for date columns; the boxed form is exposed as this module's
/// `DateColumn` alias.
///
/// NOTE(review): values are held in a string column, but nothing in this file
/// constructs one, so the text format is not established here — confirm.
#[derive(Debug, Clone)]
pub struct ConcreteDateColumn {
    /// Underlying string storage column.
    inner: crate::column::StringColumn,
}
/// Wrapper for time-of-day columns; the boxed form is exposed as this
/// module's `TimeColumn` alias.
///
/// NOTE(review): values are held in a string column, but nothing in this file
/// constructs one, so the text format is not established here — confirm.
#[derive(Debug, Clone)]
pub struct ConcreteTimeColumn {
    /// Underlying string storage column.
    inner: crate::column::StringColumn,
}
/// Wrapper for categorical columns; the boxed form is exposed as this
/// module's `CategoricalColumn` alias.
///
/// Values are held as text: `DefaultColumnFactory::create_categorical`
/// converts every value to a `String`. The struct has no field for the
/// category list, and that factory method discards its `categories` argument.
#[derive(Debug, Clone)]
pub struct ConcreteCategoricalColumn {
    /// Underlying string storage column.
    inner: crate::column::StringColumn,
}
// Boxed column aliases used throughout the trait signatures in this module.
// Each alias is a `Box` around the matching `Concrete*Column` wrapper. These
// names are distinct from the same-named types under `crate::column`, which
// the wrappers store internally.

/// Boxed 64-bit integer column.
pub type Int64Column = Box<ConcreteInt64Column>;
/// Boxed 32-bit integer column.
pub type Int32Column = Box<ConcreteInt32Column>;
/// Boxed 64-bit float column.
pub type Float64Column = Box<ConcreteFloat64Column>;
/// Boxed 32-bit float column.
pub type Float32Column = Box<ConcreteFloat32Column>;
/// Boxed string column.
pub type StringColumn = Box<ConcreteStringColumn>;
/// Boxed boolean column.
pub type BooleanColumn = Box<ConcreteBooleanColumn>;
/// Boxed date-time column.
pub type DateTimeColumn = Box<ConcreteDateTimeColumn>;
/// Boxed date column.
pub type DateColumn = Box<ConcreteDateColumn>;
/// Boxed time-of-day column.
pub type TimeColumn = Box<ConcreteTimeColumn>;
/// Boxed categorical column.
pub type CategoricalColumn = Box<ConcreteCategoricalColumn>;
/// Backend that hands out, resizes and reclaims storage for columns.
///
/// Only signatures are declared here; units of `capacity`, `new_size` and
/// `memory_usage` are not established in this file — confirm with the
/// implementors.
pub trait ColumnStorage {
    /// Handle type representing one allocation made by this backend.
    type StorageType;

    /// Allocates storage for `capacity` and returns its handle, or an error.
    fn allocate(
        &mut self,
        capacity: usize,
    ) -> std::result::Result<Self::StorageType, Box<dyn std::error::Error>>;

    /// Takes back ownership of `storage`, releasing it.
    fn deallocate(&mut self, storage: Self::StorageType);

    /// Resizes `storage` in place to `new_size`, or returns an error.
    fn resize(
        &mut self,
        storage: &mut Self::StorageType,
        new_size: usize,
    ) -> std::result::Result<(), Box<dyn std::error::Error>>;

    /// Memory currently accounted to this backend.
    fn memory_usage(&self) -> usize;
}
/// Conversions between a column and plain `Vec<T>` / slice data.
///
/// NOTE(review): `extend_from_slice` is also declared (with a `Result` return
/// type) on the supertrait `ColumnOps`, so method-call syntax on an
/// implementor may be ambiguous and need a fully qualified call such as
/// `TypedColumn::extend_from_slice(&mut col, values)`.
pub trait TypedColumn<T>: ColumnOps<T> {
    /// Borrows the elements as a slice, or `None` (presumably when the data is
    /// not available as one contiguous slice — confirm).
    fn as_slice(&self) -> Option<&[T]>;

    /// Adds `value` to the column; infallible, unlike `ColumnOps::append`.
    fn push(&mut self, value: T);

    /// Adds every element of `values` to the column; infallible, unlike
    /// `ColumnOps::extend_from_slice`.
    fn extend_from_slice(&mut self, values: &[T]);

    /// Consumes the column and returns its elements.
    fn into_vec(self) -> Vec<T>
    where
        Self: Sized;

    /// Builds a column from `data` with an optional `name`.
    fn from_vec(data: Vec<T>, name: Option<String>) -> Self
    where
        Self: Sized;
}
/// Casting of a column between element types.
///
/// `T` and `U` do not appear in any method signature; presumably they are the
/// source and target element types — confirm. Every method returns a
/// type-erased `Box<dyn Any>` that callers must downcast.
///
/// NOTE(review): how `cast` and `try_cast` differ is not established in this
/// file, since both have the same fallible signature — confirm with the
/// implementors.
pub trait ColumnCast<T, U> {
    /// Error type returned when a cast fails.
    type Error: std::error::Error;

    /// Casts the column, returning the type-erased result.
    fn cast(&self) -> std::result::Result<Box<dyn std::any::Any>, Self::Error>;

    /// Attempts to cast the column, returning the type-erased result.
    fn try_cast(&self) -> std::result::Result<Box<dyn std::any::Any>, Self::Error>;

    /// Casts the column, handling failures according to `errors`.
    fn safe_cast(
        &self,
        errors: CastErrorBehavior,
    ) -> std::result::Result<Box<dyn std::any::Any>, Self::Error>;
}
/// Failure-handling policy passed to `ColumnCast::safe_cast`.
///
/// NOTE(review): the variant names match the `errors=` argument of pandas
/// conversion functions (`raise` / `coerce` / `ignore`); the semantics noted
/// below are presumed from that convention — confirm with the implementors.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CastErrorBehavior {
    /// Presumably: report the failure as an error.
    Raise,
    /// Presumably: replace values that cannot be converted (e.g. with null).
    Coerce,
    /// Presumably: leave values that cannot be converted untouched.
    Ignore,
}
/// Constructors for the boxed column types of this module.
///
/// Every method takes the element data plus an optional column name and
/// returns the boxed column; none of them is fallible.
pub trait ColumnFactory {
    /// Builds a 64-bit integer column from `data`.
    fn create_int64(&self, data: Vec<i64>, name: Option<String>) -> Int64Column;

    /// Builds a 32-bit integer column from `data`.
    fn create_int32(&self, data: Vec<i32>, name: Option<String>) -> Int32Column;

    /// Builds a 64-bit float column from `data`.
    fn create_float64(&self, data: Vec<f64>, name: Option<String>) -> Float64Column;

    /// Builds a 32-bit float column from `data`.
    fn create_float32(&self, data: Vec<f32>, name: Option<String>) -> Float32Column;

    /// Builds a string column from `data`.
    fn create_string(&self, data: Vec<String>, name: Option<String>) -> StringColumn;

    /// Builds a boolean column from `data`.
    fn create_boolean(&self, data: Vec<bool>, name: Option<String>) -> BooleanColumn;

    /// Builds a date-time column from UTC timestamps.
    fn create_datetime(
        &self,
        data: Vec<chrono::DateTime<chrono::Utc>>,
        name: Option<String>,
    ) -> DateTimeColumn;

    /// Builds a categorical column from `data` and the given `categories`.
    ///
    /// `T` must be convertible into `String`.
    fn create_categorical<T>(
        &self,
        data: Vec<T>,
        categories: Vec<T>,
        name: Option<String>,
    ) -> CategoricalColumn
    where
        T: Clone + Eq + std::hash::Hash + Send + Sync + 'static + Into<String>;
}
/// Stateless [`ColumnFactory`] that builds columns on top of the
/// `crate::column` storage types.
///
/// The type is a zero-sized unit struct, so it also derives `Clone` and `Copy`
/// and can be passed around by value without re-constructing it.
#[derive(Debug, Default, Clone, Copy)]
pub struct DefaultColumnFactory;
/// Builds each column by constructing the matching `crate::column` storage
/// type, applying the optional name, and boxing it inside the corresponding
/// `Concrete*Column` wrapper.
impl ColumnFactory for DefaultColumnFactory {
    /// Stores `data` as-is in a 64-bit integer column.
    fn create_int64(&self, data: Vec<i64>, name: Option<String>) -> Int64Column {
        let mut inner = crate::column::Int64Column::new(data);
        if let Some(label) = name {
            inner.set_name(label);
        }
        Box::new(ConcreteInt64Column { inner })
    }

    /// Widens every `i32` to `i64` (lossless) and stores the result in a
    /// 64-bit integer column.
    fn create_int32(&self, data: Vec<i32>, name: Option<String>) -> Int32Column {
        let widened: Vec<i64> = data.into_iter().map(i64::from).collect();
        let mut inner = crate::column::Int64Column::new(widened);
        if let Some(label) = name {
            inner.set_name(label);
        }
        Box::new(ConcreteInt32Column { inner })
    }

    /// Stores `data` as-is in a 64-bit float column.
    fn create_float64(&self, data: Vec<f64>, name: Option<String>) -> Float64Column {
        let mut inner = crate::column::Float64Column::new(data);
        if let Some(label) = name {
            inner.set_name(label);
        }
        Box::new(ConcreteFloat64Column { inner })
    }

    /// Widens every `f32` to `f64` (lossless) and stores the result in a
    /// 64-bit float column.
    fn create_float32(&self, data: Vec<f32>, name: Option<String>) -> Float32Column {
        let widened: Vec<f64> = data.into_iter().map(f64::from).collect();
        let mut inner = crate::column::Float64Column::new(widened);
        if let Some(label) = name {
            inner.set_name(label);
        }
        Box::new(ConcreteFloat32Column { inner })
    }

    /// Stores `data` as-is in a string column.
    fn create_string(&self, data: Vec<String>, name: Option<String>) -> StringColumn {
        let mut inner = crate::column::StringColumn::new(data);
        if let Some(label) = name {
            inner.set_name(label);
        }
        Box::new(ConcreteStringColumn { inner })
    }

    /// Stores `data` as-is in a boolean column.
    fn create_boolean(&self, data: Vec<bool>, name: Option<String>) -> BooleanColumn {
        let mut inner = crate::column::BooleanColumn::new(data);
        if let Some(label) = name {
            inner.set_name(label);
        }
        Box::new(ConcreteBooleanColumn { inner })
    }

    /// Formats every timestamp as an RFC 3339 string and stores the text in a
    /// string column.
    fn create_datetime(
        &self,
        data: Vec<chrono::DateTime<chrono::Utc>>,
        name: Option<String>,
    ) -> DateTimeColumn {
        let formatted: Vec<String> = data
            .into_iter()
            .map(|stamp| stamp.to_rfc3339())
            .collect();
        let mut inner = crate::column::StringColumn::new(formatted);
        if let Some(label) = name {
            inner.set_name(label);
        }
        Box::new(ConcreteDateTimeColumn { inner })
    }

    /// Converts every value to a `String` and stores the text in a string
    /// column.
    ///
    /// NOTE(review): `_categories` is accepted but discarded — the wrapper has
    /// no field to hold a category list, and `data` is not validated against
    /// it. Confirm whether this is intentional.
    fn create_categorical<T>(
        &self,
        data: Vec<T>,
        _categories: Vec<T>,
        name: Option<String>,
    ) -> CategoricalColumn
    where
        T: Clone + Eq + std::hash::Hash + Send + Sync + 'static + Into<String>,
    {
        let labels: Vec<String> = data.into_iter().map(Into::into).collect();
        let mut inner = crate::column::StringColumn::new(labels);
        if let Some(label) = name {
            inner.set_name(label);
        }
        Box::new(ConcreteCategoricalColumn { inner })
    }
}
// Smoke tests for the derived equality of the option enums in this module.
#[cfg(test)]
mod tests {
    use super::*;

    /// `ColumnType` variants compare equal to themselves and unequal to others.
    #[test]
    fn test_column_type() {
        assert_eq!(ColumnType::Int64, ColumnType::Int64);
        assert_ne!(ColumnType::Int64, ColumnType::Float64);
    }

    /// `DuplicateKeep` variants compare equal to themselves and unequal to others.
    #[test]
    fn test_duplicate_keep() {
        assert_eq!(DuplicateKeep::First, DuplicateKeep::First);
        assert_ne!(DuplicateKeep::First, DuplicateKeep::Last);
    }

    /// `PadSide` variants compare equal to themselves and unequal to others.
    #[test]
    fn test_pad_side() {
        assert_eq!(PadSide::Left, PadSide::Left);
        assert_ne!(PadSide::Left, PadSide::Right);
    }

    /// `CastErrorBehavior` variants compare equal to themselves and unequal to others.
    #[test]
    fn test_cast_error_behavior() {
        assert_eq!(CastErrorBehavior::Raise, CastErrorBehavior::Raise);
        assert_ne!(CastErrorBehavior::Raise, CastErrorBehavior::Coerce);
    }
}