use crate::core::data_value::DataValue;
use crate::core::error::{Error, Result};
use std::collections::HashMap;
/// Dimension selector handed to axis-aware operations such as
/// `DataFrameOps::dropna`, `DataFrameOps::apply` and `DataFrameAdvancedOps::concat`.
///
/// The discriminants are pinned explicitly, so `Axis::Row as usize == 0` and
/// `Axis::Column as usize == 1`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Axis {
    /// The row dimension (discriminant `0`).
    Row = 0,
    /// The column dimension (discriminant `1`).
    Column = 1,
}
/// Join strategy accepted by `DataFrameAdvancedOps::merge` (its `how` argument).
///
/// The variant names use the usual relational-join vocabulary; how each one is
/// carried out is decided by the `merge` implementation.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum JoinType {
    /// Inner join.
    Inner,
    /// Left join.
    Left,
    /// Right join.
    Right,
    /// Outer join.
    Outer,
    /// Cross join.
    Cross,
}
/// The `how` argument of `DataFrameOps::dropna`.
///
/// NOTE(review): by name, `Any` drops on a single missing value and `All` only
/// when everything is missing — confirm against the concrete implementation.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DropNaHow {
    /// "Any" mode.
    Any,
    /// "All" mode.
    All,
}
/// Optional fill strategy accepted by `DataFrameOps::fillna` (its `method`
/// argument).
///
/// The variants are plain tags; how each strategy fills missing values is up
/// to the `fillna` implementation.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FillMethod {
    /// "Forward" strategy.
    Forward,
    /// "Backward" strategy.
    Backward,
    /// "Zero" strategy.
    Zero,
    /// "Mean" strategy.
    Mean,
    /// "Interpolate" strategy.
    Interpolate,
}
/// Summary record returned by `DataFrameOps::info`.
///
/// This is a plain data carrier: all fields are public and nothing is validated.
#[derive(Debug, Clone)]
pub struct DataFrameInfo {
    /// Frame dimensions as a pair (presumably `(rows, columns)` — confirm with the producer).
    pub shape: (usize, usize),
    /// Memory footprint figure (unit not fixed here; presumably bytes — confirm).
    pub memory_usage: usize,
    /// Count of null values, keyed by `String` (presumably the column name).
    pub null_counts: HashMap<String, usize>,
    /// Data-type description, keyed by `String` (presumably column name → type name).
    pub dtypes: HashMap<String, String>,
}
/// Aggregation selector accepted by `GroupByOps::agg` and `Resampler::agg`.
///
/// The built-in variants are plain tags. `Custom` carries a free-form string
/// whose interpretation is left to whoever consumes the value.
#[derive(Debug, Clone)]
pub enum AggFunc {
    /// Sum aggregation.
    Sum,
    /// Mean aggregation.
    Mean,
    /// Median aggregation.
    Median,
    /// Minimum aggregation.
    Min,
    /// Maximum aggregation.
    Max,
    /// Count aggregation.
    Count,
    /// Standard-deviation aggregation.
    Std,
    /// Variance aggregation.
    Var,
    /// Caller-defined aggregation identified by a string (presumably a name — confirm).
    Custom(String),
}
/// Baseline operation set for a tabular data type.
///
/// Method names follow pandas' `DataFrame` vocabulary. The trait fixes
/// signatures only: every operation borrows `self` immutably and the precise
/// semantics are defined by the implementor. The one-line notes below state the
/// intent suggested by each name and parameter list.
pub trait DataFrameOps {
    /// Frame type produced by the operations; it must implement `DataFrameOps` too.
    type Output: DataFrameOps;

    /// Error type associated with the implementation.
    ///
    /// NOTE(review): no method of this trait mentions it (they all return the
    /// crate-level `Result`), so its intended role should be confirmed.
    type Error: std::error::Error + Send + Sync + 'static;

    // ---- selection and ordering -------------------------------------------

    /// Intended to keep only the named `columns`.
    fn select(&self, columns: &[&str]) -> Result<Self::Output>;

    /// Intended to remove the named `columns`.
    fn drop(&self, columns: &[&str]) -> Result<Self::Output>;

    /// Intended to rename columns via `mapping` (presumably old name → new name; confirm).
    fn rename(&self, mapping: &HashMap<String, String>) -> Result<Self::Output>;

    /// Intended to keep the data for which `predicate` returns `true`.
    ///
    /// The predicate is handed one `&dyn DataValue` at a time and must be
    /// `Send + Sync`.
    fn filter<F>(&self, predicate: F) -> Result<Self::Output>
    where
        F: Fn(&dyn DataValue) -> bool + Send + Sync;

    /// Intended to return the leading `n` rows.
    fn head(&self, n: usize) -> Result<Self::Output>;

    /// Intended to return the trailing `n` rows.
    fn tail(&self, n: usize) -> Result<Self::Output>;

    /// Intended to draw `n` rows; `random_state` is presumably an optional seed.
    fn sample(&self, n: usize, random_state: Option<u64>) -> Result<Self::Output>;

    /// Intended to sort by the columns in `by`; `ascending` presumably pairs one
    /// flag with each entry of `by` (confirm how length mismatches are handled).
    fn sort_values(&self, by: &[&str], ascending: &[bool]) -> Result<Self::Output>;

    /// Intended to sort by the index.
    fn sort_index(&self) -> Result<Self::Output>;

    // ---- introspection (infallible) ---------------------------------------

    /// Frame dimensions as a pair (presumably `(rows, columns)`).
    fn shape(&self) -> (usize, usize);

    /// Column names as owned strings.
    fn columns(&self) -> Vec<String>;

    /// String-keyed map of data-type descriptions (presumably column → type name).
    fn dtypes(&self) -> HashMap<String, String>;

    /// Summary of the frame as a `DataFrameInfo`.
    fn info(&self) -> DataFrameInfo;

    // ---- missing data -----------------------------------------------------

    /// Intended to drop entries with missing values along the optional `axis`,
    /// using the `how` mode.
    fn dropna(&self, axis: Option<Axis>, how: DropNaHow) -> Result<Self::Output>;

    /// Intended to fill missing values with `value`, optionally following `method`.
    ///
    /// NOTE(review): how `value` and `method` interact when both are meaningful
    /// is not fixed by this signature.
    fn fillna(&self, value: &dyn DataValue, method: Option<FillMethod>) -> Result<Self::Output>;

    /// Intended to report which values are missing, as a frame.
    fn isna(&self) -> Result<Self::Output>;

    // ---- function application ---------------------------------------------

    /// Intended to transform values one at a time: `func` maps a
    /// `&dyn DataValue` to a boxed replacement value.
    fn map<F>(&self, func: F) -> Result<Self::Output>
    where
        F: Fn(&dyn DataValue) -> Box<dyn DataValue> + Send + Sync;

    /// Intended to apply `func` along `axis`; `func` receives a `&Self::Output`
    /// and returns one boxed value.
    fn apply<F>(&self, func: F, axis: Axis) -> Result<Self::Output>
    where
        F: Fn(&Self::Output) -> Box<dyn DataValue> + Send + Sync;
}
/// Reshaping, combining, grouping, windowing and index operations layered on
/// top of `DataFrameOps`.
///
/// As with the base trait, only signatures are fixed here; the semantics of
/// each operation belong to the implementor. Methods that name `Self` by value
/// in their return type carry a `Self: Sized` bound.
pub trait DataFrameAdvancedOps: DataFrameOps {
    /// Handle type returned by `group_by`; it must implement `GroupByOps<Self>`.
    type GroupBy: GroupByOps<Self>
    where
        Self: Sized;

    // ---- combining frames -------------------------------------------------

    /// Intended to join `self` with `other` on the key columns `on`, using the
    /// join strategy `how`.
    fn merge(&self, other: &Self, on: &[&str], how: JoinType) -> Result<Self::Output>;

    /// Intended to concatenate `self` with `others` along `axis`.
    fn concat(&self, others: &[&Self], axis: Axis) -> Result<Self::Output>;

    // ---- reshaping --------------------------------------------------------

    /// Intended to pivot using the given `index`, `columns` and `values` column names.
    fn pivot(&self, index: &[&str], columns: &[&str], values: &[&str]) -> Result<Self::Output>;

    /// Intended to unpivot: `id_vars` name identifier columns and `value_vars`
    /// the columns to melt.
    fn melt(&self, id_vars: &[&str], value_vars: &[&str]) -> Result<Self::Output>;

    /// Intended to stack at the optional `level`.
    fn stack(&self, level: Option<usize>) -> Result<Self::Output>;

    /// Intended to unstack at the optional `level`.
    fn unstack(&self, level: Option<usize>) -> Result<Self::Output>;

    // ---- grouping and windows ---------------------------------------------

    /// Intended to group by the columns named in `by`, returning a `Self::GroupBy` handle.
    fn group_by(&self, by: &[&str]) -> Result<Self::GroupBy>
    where
        Self: Sized;

    /// Returns a `RollingWindow` over this frame type for the given `window`.
    fn rolling(&self, window: usize) -> Result<RollingWindow<Self>>
    where
        Self: Sized;

    /// Returns an `ExpandingWindow` over this frame type.
    fn expanding(&self) -> Result<ExpandingWindow<Self>>
    where
        Self: Sized;

    /// Returns a `Resampler` over this frame type for the frequency string `freq`.
    ///
    /// NOTE(review): the accepted `freq` syntax is not defined in this file.
    fn resample(&self, freq: &str) -> Result<Resampler<Self>>
    where
        Self: Sized;

    // ---- positional and index manipulation --------------------------------

    /// Intended to shift the data by `periods` (signed).
    fn shift(&self, periods: i64) -> Result<Self::Output>;

    /// Intended to make the columns named in `keys` the index.
    fn set_index(&self, keys: &[&str]) -> Result<Self::Output>;

    /// Intended to reset the index; `drop` presumably discards the old index
    /// instead of keeping it as a column (confirm).
    fn reset_index(&self, drop: bool) -> Result<Self::Output>;

    /// Intended to conform the frame to `index`, any type implementing `IndexTrait`.
    fn reindex<I>(&self, index: &I) -> Result<Self::Output>
    where
        I: IndexTrait;
}
/// Operations available on a grouped frame.
///
/// `T` is the frame type that was grouped. Aggregations produce
/// `Self::GroupByResult`, while `transform`, `filter` and `get_group` hand back
/// a `T`. Only signatures are fixed here; semantics belong to the implementor.
pub trait GroupByOps<T>
where
    T: DataFrameOps,
{
    /// Frame type produced by aggregations.
    type GroupByResult: DataFrameOps;

    /// Error type associated with the implementation.
    ///
    /// NOTE(review): unused by the method signatures below, which all return
    /// the crate-level `Result`; confirm its intended role.
    type Error: std::error::Error + Send + Sync + 'static;

    // ---- fixed aggregations -----------------------------------------------

    /// Per-group sum.
    fn sum(&self) -> Result<Self::GroupByResult>;

    /// Per-group mean.
    fn mean(&self) -> Result<Self::GroupByResult>;

    /// Per-group median.
    fn median(&self) -> Result<Self::GroupByResult>;

    /// Per-group standard deviation.
    fn std(&self) -> Result<Self::GroupByResult>;

    /// Per-group variance.
    fn var(&self) -> Result<Self::GroupByResult>;

    /// Per-group minimum.
    fn min(&self) -> Result<Self::GroupByResult>;

    /// Per-group maximum.
    fn max(&self) -> Result<Self::GroupByResult>;

    /// Per-group count.
    fn count(&self) -> Result<Self::GroupByResult>;

    /// Per-group number of unique values.
    fn nunique(&self) -> Result<Self::GroupByResult>;

    /// Intended to apply each of the aggregations listed in `funcs`.
    fn agg(&self, funcs: &[AggFunc]) -> Result<Self::GroupByResult>;

    // ---- user-supplied functions ------------------------------------------

    /// Intended to run `func` on each group (a `&T`) and combine the returned
    /// frames into a single `T`.
    fn transform<F>(&self, func: F) -> Result<T>
    where
        F: Fn(&T) -> T + Send + Sync;

    /// Intended to reduce each group (a `&T`) to one boxed value via `func`.
    fn apply<F>(&self, func: F) -> Result<Self::GroupByResult>
    where
        F: Fn(&T) -> Box<dyn DataValue> + Send + Sync;

    /// Intended to keep the groups for which `func` returns `true`.
    fn filter<F>(&self, func: F) -> Result<T>
    where
        F: Fn(&T) -> bool + Send + Sync;

    // ---- group access -----------------------------------------------------

    /// Returns an iterator of `(GroupKey, T)` pairs; available when `T: Clone`.
    fn groups(&self) -> GroupIterator<T>
    where
        T: Clone;

    /// Looks up the group identified by `key`.
    fn get_group(&self, key: &GroupKey) -> Result<T>;

    // ---- summaries --------------------------------------------------------

    /// Intended to produce descriptive statistics per group.
    fn describe(&self) -> Result<Self::GroupByResult>;

    /// Intended to compute the quantile `q` per group (presumably `0.0..=1.0` — confirm).
    fn quantile(&self, q: f64) -> Result<Self::GroupByResult>;
}
/// Positional, label-based, boolean and expression-based access to a frame.
///
/// Method names mirror the pandas accessors of the same name. Only signatures
/// are fixed here; bounds handling and error conditions are implementor-defined.
pub trait IndexingOps {
    /// Type produced by the non-scalar accessors.
    type Output;

    /// Error type associated with the implementation.
    ///
    /// NOTE(review): unused by the method signatures below, which all return
    /// the crate-level `Result`; confirm its intended role.
    type Error: std::error::Error + Send + Sync + 'static;

    // ---- positional access ------------------------------------------------

    /// Intended to select by integer position using a row and a column indexer.
    fn iloc(&self, row_indexer: &RowIndexer, col_indexer: &ColIndexer) -> Result<Self::Output>;

    /// Intended to fetch the single value at position (`row`, `col`).
    fn iloc_scalar(&self, row: usize, col: usize) -> Result<Box<dyn DataValue>>;

    /// Intended to fetch the single value at position (`row`, `col`).
    ///
    /// NOTE(review): same signature as `iloc_scalar`; whether the two differ in
    /// behavior is not visible here.
    fn iat(&self, row: usize, col: usize) -> Result<Box<dyn DataValue>>;

    // ---- label access -----------------------------------------------------

    /// Intended to select by label using a row and a column `LabelIndexer`.
    fn loc(&self, row_indexer: &LabelIndexer, col_indexer: &LabelIndexer) -> Result<Self::Output>;

    /// Intended to fetch the single value at (`row_label`, `col_label`).
    fn loc_scalar(&self, row_label: &str, col_label: &str) -> Result<Box<dyn DataValue>>;

    /// Intended to fetch the single value at (`row_label`, `col_label`).
    ///
    /// NOTE(review): same signature as `loc_scalar`; whether the two differ in
    /// behavior is not visible here.
    fn at(&self, row_label: &str, col_label: &str) -> Result<Box<dyn DataValue>>;

    // ---- boolean and expression selection ---------------------------------

    /// Intended to select using a `BooleanMask`.
    fn mask(&self, mask: &BooleanMask) -> Result<Self::Output>;

    /// Intended to select using `condition`, which is evaluated on one
    /// `&dyn DataValue` at a time.
    fn where_condition<F>(&self, condition: F) -> Result<Self::Output>
    where
        F: Fn(&dyn DataValue) -> bool + Send + Sync;

    /// Intended to select using a textual `expression`.
    ///
    /// NOTE(review): the expression grammar is not defined in this file.
    fn query(&self, expression: &str) -> Result<Self::Output>;

    /// Intended to evaluate a textual `expression` against the frame.
    ///
    /// NOTE(review): the expression grammar is not defined in this file.
    fn eval(&self, expression: &str) -> Result<Self::Output>;
}
/// Minimal interface of an index whose entries are exposed as string labels.
///
/// Implementors must be `Clone + Send + Sync`. The trait is accepted by
/// `DataFrameAdvancedOps::reindex`.
pub trait IndexTrait: Clone + Send + Sync {
    /// Number of entries in the index.
    fn len(&self) -> usize;

    /// Returns `true` when the index has no entries.
    ///
    /// Provided as a default built on `len`, so existing implementors keep
    /// compiling; override it if emptiness can be answered more cheaply.
    fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Label stored at position `pos`, or `None` when there is none
    /// (presumably when `pos` is out of range — implementor-defined).
    fn get(&self, pos: usize) -> Option<&str>;

    /// Position of `label`, or `None` when it is not found.
    ///
    /// NOTE(review): which position is reported for duplicate labels is left to
    /// the implementor.
    fn position(&self, label: &str) -> Option<usize>;
}
/// Positional row selector consumed by `IndexingOps::iloc`.
///
/// The enum only carries the selection; bounds checking and interpretation are
/// done by the `iloc` implementation.
#[derive(Debug, Clone)]
pub enum RowIndexer {
    /// A single position.
    Single(usize),
    /// A half-open range of positions.
    Range(std::ops::Range<usize>),
    /// An explicit list of positions.
    List(Vec<usize>),
    /// Three-part slice: a mandatory `usize`, an optional `usize` and an
    /// optional signed value (presumably start, stop and step — confirm).
    Slice(usize, Option<usize>, Option<isize>),
}
/// Positional column selector consumed by `IndexingOps::iloc`.
///
/// The enum only carries the selection; bounds checking and interpretation are
/// done by the `iloc` implementation.
#[derive(Debug, Clone)]
pub enum ColIndexer {
    /// A single position.
    Single(usize),
    /// A half-open range of positions.
    Range(std::ops::Range<usize>),
    /// An explicit list of positions.
    List(Vec<usize>),
    /// No restriction: every column.
    All,
}
/// Label-based selector consumed by `IndexingOps::loc`, for rows and columns alike.
///
/// The enum only carries the selection; label lookup is done by the `loc`
/// implementation.
#[derive(Debug, Clone)]
pub enum LabelIndexer {
    /// A single label.
    Single(String),
    /// An explicit list of labels.
    List(Vec<String>),
    /// Two-part slice: a mandatory label and an optional one (presumably start
    /// and end; whether the end is inclusive is implementor-defined — confirm).
    Slice(String, Option<String>),
    /// No restriction: every label.
    All,
}
/// Owned sequence of boolean flags, accepted by `IndexingOps::mask` and
/// returned by `StatisticalOps::duplicated`.
#[derive(Debug, Clone)]
pub struct BooleanMask {
    /// The flags, in order.
    pub mask: Vec<bool>,
}

impl BooleanMask {
    /// Wraps `mask` without copying or validating it.
    pub fn new(mask: Vec<bool>) -> Self {
        Self { mask }
    }

    /// Number of flags in the mask.
    pub fn len(&self) -> usize {
        self.mask.len()
    }

    /// Returns `true` when the mask holds no flags.
    ///
    /// Companion to [`len`](Self::len), so callers need not write `len() == 0`.
    pub fn is_empty(&self) -> bool {
        self.mask.is_empty()
    }

    /// Flag at `index`, or `None` when `index` is past the end.
    pub fn get(&self, index: usize) -> Option<bool> {
        self.mask.get(index).copied()
    }
}
/// Identifier of one group, stored as an ordered list of strings.
///
/// Equality and hashing are derived from `values`, so keys with the same
/// strings in the same order compare equal and can serve as hash-map keys.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct GroupKey {
    /// Components of the key (presumably one per grouping column — confirm).
    pub values: Vec<String>,
}

impl GroupKey {
    /// Builds a key that takes ownership of `values` as given.
    pub fn new(values: Vec<String>) -> Self {
        GroupKey { values }
    }
}
/// Iterator over `(GroupKey, T)` pairs, as returned by `GroupByOps::groups`.
///
/// The pairs are held in a vector and handed out front to back; each yielded
/// item is a clone of the stored pair, which is why `T: Clone` is required.
pub struct GroupIterator<T>
where
    T: Clone,
{
    /// All groups, in iteration order.
    groups: Vec<(GroupKey, T)>,
    /// Index of the next pair to yield.
    current: usize,
}

impl<T> GroupIterator<T>
where
    T: Clone,
{
    /// Creates an iterator positioned at the first entry of `groups`.
    pub fn new(groups: Vec<(GroupKey, T)>) -> Self {
        Self { groups, current: 0 }
    }
}

impl<T> Iterator for GroupIterator<T>
where
    T: Clone,
{
    type Item = (GroupKey, T);

    /// Yields a clone of the next stored pair, or `None` once all pairs have
    /// been handed out. Keeps returning `None` afterwards.
    fn next(&mut self) -> Option<Self::Item> {
        // `get` already covers the "past the end" case, so one lookup suffices.
        let item = self.groups.get(self.current)?.clone();
        self.current += 1;
        Some(item)
    }

    /// Reports the exact number of pairs still to be yielded, letting
    /// consumers such as `collect` reserve the right capacity up front.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let remaining = self.groups.len().saturating_sub(self.current);
        (remaining, Some(remaining))
    }
}
pub struct RollingWindow<T> {
dataframe: T,
window_size: usize,
min_periods: Option<usize>,
}
impl<T> RollingWindow<T> {
pub fn new(dataframe: T, window_size: usize) -> Self {
Self {
dataframe,
window_size,
min_periods: None,
}
}
pub fn min_periods(mut self, min_periods: usize) -> Self {
self.min_periods = Some(min_periods);
self
}
}
impl<T: DataFrameOps> RollingWindow<T> {
pub fn sum(&self) -> Result<T::Output> {
Err(Error::NotImplemented(
"Rolling sum requires DataFrame-specific implementation".to_string(),
))
}
pub fn mean(&self) -> Result<T::Output> {
Err(Error::NotImplemented(
"Rolling mean requires DataFrame-specific implementation".to_string(),
))
}
pub fn std(&self) -> Result<T::Output> {
Err(Error::NotImplemented(
"Rolling std requires DataFrame-specific implementation".to_string(),
))
}
pub fn min(&self) -> Result<T::Output> {
Err(Error::NotImplemented(
"Rolling min requires DataFrame-specific implementation".to_string(),
))
}
pub fn max(&self) -> Result<T::Output> {
Err(Error::NotImplemented(
"Rolling max requires DataFrame-specific implementation".to_string(),
))
}
pub fn count(&self) -> Result<T::Output> {
Err(Error::NotImplemented(
"Rolling count requires DataFrame-specific implementation".to_string(),
))
}
pub fn apply<F>(&self, _func: F) -> Result<T::Output>
where
F: Fn(&[f64]) -> f64 + Send + Sync,
{
Err(Error::NotImplemented(
"Rolling apply requires DataFrame-specific implementation".to_string(),
))
}
}
pub struct ExpandingWindow<T> {
dataframe: T,
min_periods: Option<usize>,
}
impl<T> ExpandingWindow<T> {
pub fn new(dataframe: T) -> Self {
Self {
dataframe,
min_periods: None,
}
}
pub fn min_periods(mut self, min_periods: usize) -> Self {
self.min_periods = Some(min_periods);
self
}
}
impl<T: DataFrameOps> ExpandingWindow<T> {
pub fn sum(&self) -> Result<T::Output> {
Err(Error::NotImplemented(
"Expanding sum requires DataFrame-specific implementation".to_string(),
))
}
pub fn mean(&self) -> Result<T::Output> {
Err(Error::NotImplemented(
"Expanding mean requires DataFrame-specific implementation".to_string(),
))
}
pub fn std(&self) -> Result<T::Output> {
Err(Error::NotImplemented(
"Expanding std requires DataFrame-specific implementation".to_string(),
))
}
pub fn var(&self) -> Result<T::Output> {
Err(Error::NotImplemented(
"Expanding var requires DataFrame-specific implementation".to_string(),
))
}
pub fn min(&self) -> Result<T::Output> {
Err(Error::NotImplemented(
"Expanding min requires DataFrame-specific implementation".to_string(),
))
}
pub fn max(&self) -> Result<T::Output> {
Err(Error::NotImplemented(
"Expanding max requires DataFrame-specific implementation".to_string(),
))
}
pub fn count(&self) -> Result<T::Output> {
Err(Error::NotImplemented(
"Expanding count requires DataFrame-specific implementation".to_string(),
))
}
pub fn apply<F>(&self, _func: F) -> Result<T::Output>
where
F: Fn(&[f64]) -> f64 + Send + Sync,
{
Err(Error::NotImplemented(
"Expanding apply requires DataFrame-specific implementation".to_string(),
))
}
}
pub struct Resampler<T> {
dataframe: T,
frequency: String,
}
impl<T> Resampler<T> {
pub fn new(dataframe: T, frequency: String) -> Self {
Self {
dataframe,
frequency,
}
}
}
impl<T: DataFrameOps> Resampler<T> {
pub fn sum(&self) -> Result<T::Output> {
Err(Error::NotImplemented(
"Resample sum requires DataFrame-specific implementation".to_string(),
))
}
pub fn mean(&self) -> Result<T::Output> {
Err(Error::NotImplemented(
"Resample mean requires DataFrame-specific implementation".to_string(),
))
}
pub fn first(&self) -> Result<T::Output> {
Err(Error::NotImplemented(
"Resample first requires DataFrame-specific implementation".to_string(),
))
}
pub fn last(&self) -> Result<T::Output> {
Err(Error::NotImplemented(
"Resample last requires DataFrame-specific implementation".to_string(),
))
}
pub fn std(&self) -> Result<T::Output> {
Err(Error::NotImplemented(
"Resample std requires DataFrame-specific implementation".to_string(),
))
}
pub fn var(&self) -> Result<T::Output> {
Err(Error::NotImplemented(
"Resample var requires DataFrame-specific implementation".to_string(),
))
}
pub fn min(&self) -> Result<T::Output> {
Err(Error::NotImplemented(
"Resample min requires DataFrame-specific implementation".to_string(),
))
}
pub fn max(&self) -> Result<T::Output> {
Err(Error::NotImplemented(
"Resample max requires DataFrame-specific implementation".to_string(),
))
}
pub fn count(&self) -> Result<T::Output> {
Err(Error::NotImplemented(
"Resample count requires DataFrame-specific implementation".to_string(),
))
}
pub fn agg(&self, _funcs: &[AggFunc]) -> Result<T::Output> {
Err(Error::NotImplemented(
"Resample agg requires DataFrame-specific implementation".to_string(),
))
}
pub fn frequency(&self) -> &str {
&self.frequency
}
}
/// Descriptive statistics and duplicate handling layered on `DataFrameOps`.
///
/// Only signatures are fixed here; which columns take part and how missing
/// values are treated is up to the implementor.
pub trait StatisticalOps: DataFrameOps {
    // ---- summary statistics -----------------------------------------------

    /// Intended to produce descriptive statistics for the frame.
    fn describe(&self) -> Result<Self::Output>;

    /// Intended to compute correlations using the chosen `method`.
    fn corr(&self, method: CorrelationMethod) -> Result<Self::Output>;

    /// Intended to compute covariances.
    fn cov(&self) -> Result<Self::Output>;

    /// Intended to compute each quantile listed in `q` (presumably values in
    /// `0.0..=1.0` — confirm).
    fn quantile(&self, q: &[f64]) -> Result<Self::Output>;

    /// Intended to count occurrences of each distinct value in `column`.
    fn value_counts(&self, column: &str) -> Result<Self::Output>;

    // ---- duplicates -------------------------------------------------------

    /// Intended to flag duplicate rows as a `BooleanMask`; `subset` optionally
    /// restricts the columns considered.
    fn duplicated(&self, subset: Option<&[&str]>) -> Result<BooleanMask>;

    /// Intended to remove duplicate rows; `subset` optionally restricts the
    /// columns considered.
    fn drop_duplicates(&self, subset: Option<&[&str]>) -> Result<Self::Output>;
}
/// Correlation method accepted by `StatisticalOps::corr`.
///
/// The variants are plain tags named after the Pearson, Spearman and Kendall
/// correlation coefficients; the computation lives in the `corr` implementation.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CorrelationMethod {
    /// Pearson correlation.
    Pearson,
    /// Spearman correlation.
    Spearman,
    /// Kendall correlation.
    Kendall,
}
/// File-based input and output layered on `DataFrameOps`.
///
/// Readers are associated functions (no `self`) that return `Self::Output`;
/// writers borrow the frame and return `()` on success. Paths are plain
/// `&str`, and each format takes its options struct by value.
pub trait DataFrameIO: DataFrameOps {
    // ---- CSV --------------------------------------------------------------

    /// Intended to load a frame from the CSV file at `path`.
    fn read_csv(path: &str, options: CsvReadOptions) -> Result<Self::Output>;

    /// Intended to write the frame as CSV to `path`.
    fn to_csv(&self, path: &str, options: CsvWriteOptions) -> Result<()>;

    // ---- JSON -------------------------------------------------------------

    /// Intended to load a frame from the JSON file at `path`.
    fn read_json(path: &str, options: JsonReadOptions) -> Result<Self::Output>;

    /// Intended to write the frame as JSON to `path`.
    fn to_json(&self, path: &str, options: JsonWriteOptions) -> Result<()>;

    // ---- Parquet ----------------------------------------------------------

    /// Intended to load a frame from the Parquet file at `path`.
    fn read_parquet(path: &str, options: ParquetReadOptions) -> Result<Self::Output>;

    /// Intended to write the frame as Parquet to `path`.
    fn to_parquet(&self, path: &str, options: ParquetWriteOptions) -> Result<()>;
}
/// Options for `DataFrameIO::read_csv`.
///
/// Every field is optional and `Default` leaves them all `None`; what a
/// missing value falls back to is decided by the reader implementation.
#[derive(Debug, Clone, Default)]
pub struct CsvReadOptions {
    /// Field delimiter character.
    pub delimiter: Option<char>,
    /// Header flag (presumably whether the input has a header row — confirm).
    pub header: Option<bool>,
    /// Number of rows to skip (presumably leading rows — confirm).
    pub skip_rows: Option<usize>,
    /// Row limit (presumably the maximum number of rows to read — confirm).
    pub nrows: Option<usize>,
    /// Text encoding name.
    pub encoding: Option<String>,
}
/// Options for `DataFrameIO::to_csv`.
///
/// Every field is optional and `Default` leaves them all `None`; what a
/// missing value falls back to is decided by the writer implementation.
#[derive(Debug, Clone, Default)]
pub struct CsvWriteOptions {
    /// Field delimiter character.
    pub delimiter: Option<char>,
    /// Header flag (presumably whether to emit a header row — confirm).
    pub header: Option<bool>,
    /// Index flag (presumably whether to emit the index — confirm).
    pub index: Option<bool>,
    /// Text encoding name.
    pub encoding: Option<String>,
}
/// Options for `DataFrameIO::read_json`.
///
/// Both fields are optional and `Default` leaves them `None`; fallbacks are
/// decided by the reader implementation.
#[derive(Debug, Clone, Default)]
pub struct JsonReadOptions {
    /// Layout of the JSON document (accepted values are not defined in this file).
    pub orient: Option<String>,
    /// Text encoding name.
    pub encoding: Option<String>,
}
/// Options for `DataFrameIO::to_json`.
///
/// Every field is optional and `Default` leaves them all `None`; fallbacks are
/// decided by the writer implementation.
#[derive(Debug, Clone, Default)]
pub struct JsonWriteOptions {
    /// Layout of the JSON document (accepted values are not defined in this file).
    pub orient: Option<String>,
    /// Text encoding name.
    pub encoding: Option<String>,
    /// Indentation setting (presumably the number of spaces per level — confirm).
    pub indent: Option<usize>,
}
/// Options for `DataFrameIO::read_parquet`.
///
/// Both fields are optional and `Default` leaves them `None`; fallbacks are
/// decided by the reader implementation.
#[derive(Debug, Clone, Default)]
pub struct ParquetReadOptions {
    /// Column names (presumably the subset of columns to load — confirm).
    pub columns: Option<Vec<String>>,
    /// Threading flag (presumably whether the reader may use multiple threads — confirm).
    pub use_threads: Option<bool>,
}
/// Options for `DataFrameIO::to_parquet`.
///
/// Both fields are optional and `Default` leaves them `None`; fallbacks are
/// decided by the writer implementation.
#[derive(Debug, Clone, Default)]
pub struct ParquetWriteOptions {
    /// Compression codec name (accepted values are not defined in this file).
    pub compression: Option<String>,
    /// Dictionary flag (presumably whether to use dictionary encoding — confirm).
    pub use_dictionary: Option<bool>,
}
#[cfg(test)]
mod tests {
    use super::*;

    /// `BooleanMask` reports its length and answers `None` past the end.
    #[test]
    fn test_boolean_mask() {
        let flags = vec![true, false, true, false];
        let mask = BooleanMask::new(flags);

        assert_eq!(mask.len(), 4);
        assert_eq!(mask.get(0), Some(true));
        assert_eq!(mask.get(1), Some(false));
        // Index 4 is one past the last flag.
        assert_eq!(mask.get(4), None);
    }

    /// `GroupKey` equality is structural over its string components.
    #[test]
    fn test_group_key() {
        let make = |first: &str, second: &str| {
            GroupKey::new(vec![first.to_string(), second.to_string()])
        };
        let key1 = make("A", "1");
        let key2 = make("A", "1");
        let key3 = make("B", "2");

        assert_eq!(key1, key2);
        assert_ne!(key1, key3);
    }

    /// Indexer variants keep the payload they were built with.
    #[test]
    fn test_indexers() {
        let row_indexer = RowIndexer::Range(0..5);
        let col_indexer = ColIndexer::List(vec![0, 2, 4]);

        if let RowIndexer::Range(range) = row_indexer {
            assert_eq!(range, 0..5);
        } else {
            panic!("Wrong indexer type");
        }

        if let ColIndexer::List(list) = col_indexer {
            assert_eq!(list, vec![0, 2, 4]);
        } else {
            panic!("Wrong indexer type");
        }
    }
}