use data_value::{DataValue, Extract};
use halfbrown::HashMap;
use ndarray::{Array1, Array2, ArrayView1};
use std::fmt;
pub mod colums_store;
pub mod index;
pub mod join;
pub mod key;
use crate::{error::Error, CandidateData};
#[cfg(feature = "python")]
pub mod python;
#[cfg(feature = "python")]
use pyo3::prelude::*;
use crate::{
dataframe::{colums_store::ColumnFrame, join::JoinRelation, key::Key},
MLChefMap,
};
/// Column-oriented table (`dataframe`) bundled with two side maps:
/// `constants` and `metadata`.
///
/// Derives `Debug`, `Clone`, `PartialEq`, `Default` and the serde
/// `Serialize`/`Deserialize` traits. When the `python` feature is enabled the
/// struct is additionally annotated with `pyclass`.
#[derive(Debug, Clone, PartialEq, Default, serde::Serialize, serde::Deserialize)]
#[cfg_attr(feature = "python", pyclass)]
pub struct DataFrame {
/// Values stored per [`Key`], written by `insert_constant` and merged from
/// the other frame by `join`.
// NOTE(review): presumably values that apply to every row — confirm intent.
pub constants: HashMap<Key, DataValue>,
/// The underlying column storage; almost every method delegates to it.
pub dataframe: ColumnFrame,
/// Free-form values keyed by string, accessed through `add_metadata` and
/// `get_metadata`.
pub metadata: HashMap<String, DataValue>,
}
/// Formats a [`DataFrame`] by forwarding to the `fmt` of its inner column
/// frame.
///
/// Only `self.dataframe` is rendered here; `constants` and `metadata` are not
/// written to the formatter by this impl.
impl fmt::Display for DataFrame {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
// Delegate entirely to the column frame's own formatting routine.
self.dataframe.fmt(f)
}
}
impl DataFrame {
    /// Builds a frame around `dataframe` (anything convertible into a
    /// [`ColumnFrame`]) with empty `constants` and `metadata` maps.
    pub fn new<C: Into<ColumnFrame>>(dataframe: C) -> Self {
        Self {
            constants: HashMap::new(),
            dataframe: dataframe.into(),
            metadata: HashMap::new(),
        }
    }

    /// Forwards to `ColumnFrame::shrink` on the inner column frame.
    pub fn shrink(&mut self) {
        self.dataframe.shrink();
    }

    /// Stores `value` in the metadata map under `key`.
    ///
    /// Whatever the map's `insert` returns is discarded.
    pub fn add_metadata(&mut self, key: String, value: DataValue) {
        let _ = self.metadata.insert(key, value);
    }

    /// Looks up `key` in the metadata map; `None` when it is absent.
    pub fn get_metadata(&self, key: &str) -> Option<&DataValue> {
        self.metadata.get(key)
    }

    /// Joins `other` into this frame.
    ///
    /// Every constant of `other` is inserted into `self.constants` first, then
    /// the column data is joined through `ColumnFrame::join` using
    /// `join_type`.
    ///
    /// # Errors
    ///
    /// Returns whatever error `ColumnFrame::join` reports. The constants have
    /// already been merged by that point.
    // NOTE(review): `other.metadata` is not merged and is dropped — confirm
    // this is intended.
    pub fn join(&mut self, other: Self, join_type: &JoinRelation) -> Result<(), Error> {
        let Self {
            constants,
            dataframe,
            ..
        } = other;
        for (key, value) in constants {
            self.constants.insert(key, value);
        }
        self.dataframe.join(dataframe, join_type)
    }

    /// Hands `keys` and a mutable borrow of `func` to
    /// `ColumnFrame::apply_function` on the inner column frame.
    ///
    /// # Errors
    ///
    /// Returns the error produced by `ColumnFrame::apply_function`.
    pub fn apply_function<F>(&mut self, keys: &[Key], mut func: F) -> Result<(), Error>
    where
        F: FnMut(&[Key], &mut ColumnFrame) -> Result<(), Error>,
    {
        let callback = &mut func;
        self.dataframe.apply_function(keys, callback)
    }

    /// Returns the result of `ColumnFrame::select(keys)` wrapped in `Ok`.
    ///
    /// As written, this method never produces an `Err` itself.
    pub fn select(&self, keys: Option<&[Key]>) -> Result<Array2<DataValue>, Error> {
        let selection = self.dataframe.select(keys);
        Ok(selection)
    }

    /// Forwards to `ColumnFrame::select_transposed_typed::<D>` for `keys`.
    pub fn select_transposed_typed<D: Extract>(&self, keys: &[Key]) -> Vec<Vec<D>> {
        self.dataframe.select_transposed_typed::<D>(keys)
    }

    /// Returns a view of the column stored under `key`, as reported by
    /// `ColumnFrame::select_column`.
    ///
    /// `key` is taken by value and only borrowed for the lookup.
    pub fn select_column(&self, key: Key) -> Option<ndarray::ArrayView1<DataValue>> {
        let wanted = &key;
        self.dataframe.select_column(wanted)
    }

    /// Forwards to `ColumnFrame::select_transposed` for `keys`.
    ///
    /// # Errors
    ///
    /// Returns the error produced by `ColumnFrame::select_transposed`.
    pub fn select_transposed(&self, keys: Option<&[Key]>) -> Result<Array2<DataValue>, Error> {
        self.dataframe.select_transposed(keys)
    }

    /// Stores `value` in the constants map under `key`.
    ///
    /// Whatever the map's `insert` returns is discarded.
    pub fn insert_constant(&mut self, key: Key, value: DataValue) {
        let _ = self.constants.insert(key, value);
    }

    /// Appends `item` through `ColumnFrame::push`.
    ///
    /// # Errors
    ///
    /// Returns the error produced by `ColumnFrame::push`.
    pub fn push<C: CandidateData>(&mut self, item: C) -> Result<(), Error> {
        self.dataframe.push(item)
    }

    /// Calls `ColumnFrame::remove_column(keys)` and converts its successful
    /// result into a new [`DataFrame`].
    ///
    /// # Errors
    ///
    /// Returns the error produced by `ColumnFrame::remove_column`.
    pub fn remove_column(&mut self, keys: &[Key]) -> Result<Self, Error> {
        let removed = self.dataframe.remove_column(keys)?;
        Ok(removed.into())
    }

    /// Extends the inner column frame with the column data of `items` via
    /// `ColumnFrame::extend`.
    ///
    /// # Errors
    ///
    /// Returns the error produced by `ColumnFrame::extend`.
    // NOTE(review): only `items.dataframe` is used; the constants and metadata
    // of `items` are dropped — confirm this is intended.
    pub fn extend(&mut self, items: Self) -> Result<(), Error> {
        let Self {
            dataframe: incoming,
            ..
        } = items;
        self.dataframe.extend(incoming)
    }

    /// Length reported by the inner column frame.
    pub fn len(&self) -> usize {
        self.dataframe.len()
    }

    /// Whether the inner column frame reports itself as empty.
    pub fn is_empty(&self) -> bool {
        self.dataframe.is_empty()
    }

    /// Adds `values` as a column named `key` through
    /// `ColumnFrame::add_single_column`.
    ///
    /// # Errors
    ///
    /// Returns the error produced by `ColumnFrame::add_single_column`.
    pub fn add_single_column<K: Into<Key>>(
        &mut self,
        key: K,
        values: Array1<DataValue>,
    ) -> Result<(), Error> {
        self.dataframe.add_single_column(key, values)
    }

    /// Returns a view of the column stored under `key`, as reported by
    /// `ColumnFrame::get_single_column`.
    pub fn get_single_column(&self, key: &Key) -> Option<ArrayView1<DataValue>> {
        self.dataframe.get_single_column(key)
    }
}
impl From<ColumnFrame> for DataFrame {
fn from(dataframe: ColumnFrame) -> Self {
Self::new(dataframe)
}
}
impl From<Vec<std::collections::HashMap<Key, DataValue>>> for DataFrame {
    /// Converts a list of standard-library maps into a [`ColumnFrame`] and
    /// wraps it with [`DataFrame::new`].
    fn from(rows: Vec<std::collections::HashMap<Key, DataValue>>) -> Self {
        let columns = ColumnFrame::from(rows);
        Self::new(columns)
    }
}
impl From<Vec<HashMap<Key, DataValue>>> for DataFrame {
    /// Converts a list of `halfbrown` maps into a [`ColumnFrame`] and wraps it
    /// with [`DataFrame::new`].
    fn from(rows: Vec<HashMap<Key, DataValue>>) -> Self {
        let columns = ColumnFrame::from(rows);
        Self::new(columns)
    }
}
impl From<std::collections::HashMap<String, Vec<DataValue>>> for DataFrame {
    /// Converts a standard-library map of name to value list into a
    /// [`ColumnFrame`] and wraps it with [`DataFrame::new`].
    fn from(by_name: std::collections::HashMap<String, Vec<DataValue>>) -> Self {
        let columns = ColumnFrame::from(by_name);
        Self::new(columns)
    }
}
impl From<MLChefMap> for DataFrame {
    /// Converts an `MLChefMap` into a [`ColumnFrame`] and wraps it with
    /// [`DataFrame::new`].
    // NOTE(review): `MLChefMap` is defined elsewhere in the crate; presumably a
    // map from column key to its values — confirm against the alias.
    fn from(map: MLChefMap) -> Self {
        let columns = ColumnFrame::from(map);
        Self::new(columns)
    }
}
impl From<Vec<(Key, Vec<DataValue>)>> for DataFrame {
    /// Converts a list of `(key, values)` pairs into a [`ColumnFrame`] and
    /// wraps it with [`DataFrame::new`].
    fn from(pairs: Vec<(Key, Vec<DataValue>)>) -> Self {
        let columns = ColumnFrame::from(pairs);
        Self::new(columns)
    }
}
#[cfg(test)]
mod test {
    use super::*;
    use halfbrown::hashmap;
    use rstest::*;
    use tracing_test::traced_test;

    /// Seed data shared by several tests: two rows, each with `key1` and
    /// `key2`.
    #[fixture]
    fn dummy_candidates() -> ColumnFrame {
        ColumnFrame::from(vec![
            hashmap! {
                "key1".into() => 1.into(),
                "key2".into() => "a".into(),
            },
            hashmap! {
                "key1".into() => 2.into(),
                "key2".into() => "b".into(),
            },
        ])
    }

    /// The frame every conversion case is expected to produce: `key1` is set
    /// on both rows while `key2` is only given for the first one.
    fn frame_with_partial_key2() -> DataFrame {
        DataFrame {
            constants: HashMap::new(),
            dataframe: ColumnFrame::from(vec![
                hashmap! {
                    "key1".into() => 1.into(),
                    "key2".into() => "a".into(),
                },
                hashmap! {
                    "key1".into() => 2.into(),
                },
            ]),
            metadata: HashMap::new(),
        }
    }

    /// Every supported input shape converts to the same frame, and `key1` can
    /// be read back as a single column.
    #[rstest]
    #[case(hashmap!("key1".into() => vec![1.into(), 2.into()], "key2".into() => vec!["a".into()]))]
    #[case(data_value::stdhashmap!("key1" => vec![1, 2], "key2" => vec!["a"]))]
    #[case(vec![hashmap! {
        "key1".into() => 1.into(),
        "key2".into() => "a".into(),
    },
    hashmap! {
        "key1".into() => 2.into(),
    },])]
    #[case(vec![data_value::stdhashmap! {
        "key1" => DataValue::from(1),
        "key2" => DataValue::from("a"),
    },data_value::stdhashmap! {
        "key1" => DataValue::from(2),
    },])]
    #[case(vec![("key1".into(), vec! [DataValue::from(1), DataValue::from(2)]), ("key2".into(),
    vec![DataValue::from("a")])])]
    fn test_select_column<T: Into<DataFrame>>(#[case] input: T) {
        let frame: DataFrame = input.into();
        assert_eq!(frame, frame_with_partial_key2());

        let column = frame.select_column("key1".into());
        assert!(column.is_some());
        let column = column.unwrap();
        assert_eq!(column.len(), 2);
        assert_eq!(column, ndarray::array![1.into(), 2.into()]);
    }

    /// Every supported input shape converts to the same frame, and `key1` can
    /// be extracted as typed `i32` values.
    #[rstest]
    #[case(hashmap!("key1".into() => vec![1.into(), 2.into()], "key2".into() => vec!["a".into()]))]
    #[case(data_value::stdhashmap!("key1" => vec![1, 2], "key2" => vec!["a"]))]
    #[case(vec![hashmap! {
        "key1".into() => 1.into(),
        "key2".into() => "a".into(),
    },
    hashmap! {
        "key1".into() => 2.into(),
    },])]
    #[case(vec![data_value::stdhashmap! {
        "key1" => DataValue::from(1),
        "key2" => DataValue::from("a"),
    },data_value::stdhashmap! {
        "key1" => DataValue::from(2),
    },])]
    #[case(vec![("key1".into(), vec! [DataValue::from(1), DataValue::from(2)]), ("key2".into(), vec![DataValue::from("a")])])]
    fn test_from_conversion<T: Into<DataFrame>>(#[case] input: T) {
        let frame: DataFrame = input.into();
        assert_eq!(frame, frame_with_partial_key2());

        let typed = frame.select_transposed_typed::<i32>(&["key1".into()]);
        assert_eq!(typed.len(), 2);
        println!("{:?}", typed);
        assert_eq!(typed, vec![vec![1], vec![2]]);
    }

    /// Walks through extend, push, constants, `apply_function`, column removal
    /// and a join that restores the original frame.
    #[rstest]
    fn test_dataframe(dummy_candidates: ColumnFrame) {
        let mut dataframe = DataFrame::default();
        assert!(dataframe.is_empty());

        assert!(dataframe.extend(dummy_candidates.into()).is_ok());
        assert_eq!(dataframe.len(), 2);

        let candidate = hashmap! {
            "key1".into() => 3.into(),
            "key2".into() => "c".into(),
        };
        assert!(dataframe.push(candidate).is_ok());
        assert_eq!(dataframe.len(), 3);
        assert!(!dataframe.is_empty());

        dataframe.insert_constant("key3".into(), 4.into());
        assert_eq!(dataframe.constants.len(), 1);

        // Derive `key5` from `key1` by adding one to every value.
        let applied = dataframe.apply_function(&["key1".into()], |keys, df| {
            let incremented = df
                .get_single_column(&keys[0])
                .expect("BUG: Cannot get column")
                .to_owned()
                .mapv(|x| x + DataValue::from(1));
            df.add_single_column("key5", incremented)?;
            Ok(())
        });
        assert!(applied.is_ok());

        let original = dataframe.clone();
        dataframe.shrink();

        let remove_df = dataframe.remove_column(&["key1".into()]);
        assert!(remove_df.is_ok());
        let mut remove_df = remove_df.unwrap();
        assert_eq!(remove_df.len(), 3);

        let selected = dataframe.select(Some(&["key2".into()]));
        assert!(selected.is_ok());
        let selected = selected.unwrap();
        println!("{:?}", selected);
        assert_eq!(selected.len(), 3);

        // Joining the remaining columns back must reproduce the snapshot.
        let relation = JoinRelation::new(crate::JoinBy::AddColumns);
        let joined_result = remove_df.join(dataframe, &relation);
        assert!(joined_result.is_ok(), "{:?}", joined_result);
        assert_eq!(original, remove_df);
    }

    /// Metadata round-trips on one frame, is absent on a freshly built frame,
    /// and `select_transposed(None)` yields the expected 2x2 array.
    #[rstest]
    fn test_metadata(dummy_candidates: ColumnFrame) {
        let mut dataframe = DataFrame::default();
        assert!(dataframe.is_empty());
        println!("{:?}", dataframe);

        assert!(dataframe.extend(dummy_candidates.into()).is_ok());
        println!("{:?}", dataframe);
        assert_eq!(dataframe.len(), 2);

        dataframe.add_metadata("test".into(), 1.into());
        assert_eq!(dataframe.get_metadata("test"), Some(&1.into()));

        // A frame built from scratch carries no metadata.
        let fresh = DataFrame::new(ColumnFrame::from(vec![
            hashmap! {
                "key1".into() => 1.into(),
                "key2".into() => "a".into(),
            },
            hashmap! {
                "key1".into() => 2.into(),
                "key2".into() => "b".into(),
            },
        ]));
        assert_eq!(fresh.get_metadata("test"), None);

        let tt = fresh.select_transposed(None);
        assert!(tt.is_ok());
        let tt = tt.unwrap();
        assert_eq!(tt.shape(), [2, 2]);
        assert_eq!(
            tt,
            Array2::from_shape_vec((2, 2), vec![1.into(), 2.into(), "a".into(), "b".into()])
                .unwrap()
        );
    }

    /// `add_single_column` accepts a new column, rejects a duplicate key and
    /// rejects a column whose length differs from the existing ones.
    #[rstest]
    #[traced_test]
    fn add_single_column_test() {
        let mut dataframe = DataFrame::default();

        let values = Array1::from(vec![1.into(), 2.into()]);
        let r = dataframe.add_single_column("key1", values);
        assert!(r.is_ok(), "{r:?}");

        let selected = dataframe.select(None);
        assert!(selected.is_ok());
        let selected = selected.unwrap();
        assert_eq!(selected.shape(), [2, 1]);
        assert_eq!(
            selected,
            Array2::from_shape_vec((2, 1), vec![1.into(), 2.into()]).unwrap()
        );

        // Same key again: rejected.
        let duplicate = Array1::from(vec![1.into(), 2.into()]);
        assert!(dataframe.add_single_column("key1", duplicate).is_err());

        // New key, matching length: accepted.
        let second = Array1::from(vec![3.into(), 4.into()]);
        assert!(dataframe.add_single_column("key2", second).is_ok());

        // New key, but only one value for a two-row frame: rejected.
        let too_short = Array1::from(vec![3.into()]);
        assert!(dataframe.add_single_column("key3", too_short).is_err());
    }
}