// trs-dataframe 0.10.2
//
// Dataframe library for Teiresias.
// Documentation
use data_value::DataValue;
use halfbrown::HashMap;
use ndarray::{Array1, Array2};

use crate::{detect_dtype, detect_dtype_arr, Key, MLChefMap};

use super::{ColumnFrame, KeyIndex};

/// Builds a [`ColumnFrame`] from a list of rows, where each row maps a column
/// [`Key`] to the value of that cell.
///
/// The column set is the union of the keys of all rows. A cell whose key is
/// missing from its row keeps the array's default value.
///
/// NOTE: Because the key order of a hash map is random, the column keys are
/// sorted (and de-duplicated) before the index is built.
impl From<Vec<std::collections::HashMap<Key, DataValue>>> for ColumnFrame {
    fn from(dataframe: Vec<std::collections::HashMap<Key, DataValue>>) -> Self {
        let mut keys = dataframe
            .iter()
            .flat_map(|row| row.keys())
            .cloned()
            .collect::<Vec<_>>();
        keys.sort();
        keys.dedup();
        let index = KeyIndex::new(keys);
        // One row per input map, one column per distinct key.
        let mut data_frame = Array2::default((dataframe.len(), index.len()));
        // The input is owned, so consume it and move each value into its cell
        // instead of cloning it.
        for (row_idx, row) in dataframe.into_iter().enumerate() {
            for (key, value) in row {
                if let Some(column) = index.get_column_index(&key) {
                    if let Some(cell) = data_frame.get_mut((row_idx, column)) {
                        *cell = value;
                    }
                }
            }
        }
        Self::new(index, data_frame)
    }
}

/// Builds a [`ColumnFrame`] from a list of rows stored as `halfbrown`
/// hash maps, where each row maps a column [`Key`] to the value of that cell.
///
/// The column set is the union of the keys of all rows. A cell whose key is
/// missing from its row keeps the array's default value.
///
/// NOTE: Because the key order of a hash map is random, the column keys are
/// sorted (and de-duplicated) before the index is built.
impl From<Vec<HashMap<Key, DataValue>>> for ColumnFrame {
    fn from(dataframe: Vec<HashMap<Key, DataValue>>) -> Self {
        let mut keys = dataframe
            .iter()
            .flat_map(|row| row.keys())
            .cloned()
            .collect::<Vec<_>>();
        keys.sort();
        keys.dedup();
        let index = KeyIndex::new(keys);
        // One row per input map, one column per distinct key.
        let mut data_frame = Array2::default((dataframe.len(), index.len()));
        // The input is owned, so consume it and move each value into its cell
        // instead of cloning it.
        for (row_idx, row) in dataframe.into_iter().enumerate() {
            for (key, value) in row {
                if let Some(column) = index.get_column_index(&key) {
                    if let Some(cell) = data_frame.get_mut((row_idx, column)) {
                        *cell = value;
                    }
                }
            }
        }
        Self::new(index, data_frame)
    }
}

/// Builds a [`ColumnFrame`] from a map of column name to column values.
///
/// The frame gets as many rows as the longest column; shorter columns leave
/// their remaining cells at the array's default value. For every column the
/// dtype is detected with `detect_dtype_arr`, a typed [`Key`] is built from
/// the name and that dtype, and the index entry is updated through
/// `rename_key` (its result is deliberately ignored).
///
/// NOTE: Because the key order of a hash map is random, the column keys are
/// sorted (and de-duplicated) before the index is built.
impl From<std::collections::HashMap<String, Vec<DataValue>>> for ColumnFrame {
    fn from(dataframe: std::collections::HashMap<String, Vec<DataValue>>) -> Self {
        let mut keys = dataframe
            .keys()
            .map(|name| name.as_str().into())
            .collect::<Vec<_>>();
        keys.sort();
        keys.dedup();
        let mut index = KeyIndex::new(keys);
        // Row count is the length of the longest column (0 for an empty map).
        let size = dataframe
            .values()
            .map(|column| column.len())
            .max()
            .unwrap_or_default();
        let mut data_frame = Array2::default((size, index.len()));
        for (name, values) in dataframe {
            let dtype = detect_dtype_arr(&values);
            let new_key = Key::new(&name, dtype);
            // The column position does not depend on the row, so look it up
            // once per column rather than once per cell.
            if let Some(column) = index.get_column_index(&new_key) {
                for (row_idx, value) in values.into_iter().enumerate() {
                    if let Some(cell) = data_frame.get_mut((row_idx, column)) {
                        // `value` is already owned here: move it, no clone.
                        *cell = value;
                    }
                }
            }
            let _ = index.rename_key(name.as_str(), new_key);
        }
        Self::new(index, data_frame)
    }
}

/// Builds a [`ColumnFrame`] from an [`MLChefMap`] (column name to column
/// values).
///
/// The column keys are sorted and de-duplicated before the index is built.
/// The frame gets as many rows as the longest column; cells that are never
/// written keep the array's default value. For every column the dtype is
/// detected with `detect_dtype_arr`, a typed [`Key`] is built from it, and the
/// index entry is updated through `rename_key` (its result is ignored).
impl From<MLChefMap> for ColumnFrame {
    fn from(dataframe: MLChefMap) -> Self {
        let mut sorted_keys: Vec<_> = dataframe
            .keys()
            .map(|name| name.as_str().into())
            .collect();
        sorted_keys.sort();
        sorted_keys.dedup();
        let mut index = KeyIndex::new(sorted_keys);

        // Row count is the length of the longest column (0 for an empty map).
        let rows = dataframe
            .values()
            .map(|column| column.len())
            .max()
            .unwrap_or_default();
        let mut data_frame = Array2::default((rows, index.len()));

        for (name, column_values) in dataframe.into_iter() {
            let typed_key = Key::new(&name, detect_dtype_arr(&column_values));
            // The column position is the same for every row of this column.
            if let Some(column) = index.get_column_index(&typed_key) {
                for (row, value) in column_values.into_iter().enumerate() {
                    if let Some(cell) = data_frame.get_mut((row, column)) {
                        // NOTE(review): if `MLChefMap` stores `Vec<DataValue>`
                        // this clone is redundant — confirm and drop it.
                        *cell = value.clone();
                    }
                }
            }
            let _ = index.rename_key(name.as_str(), typed_key);
        }
        Self::new(index, data_frame)
    }
}

/// Builds a [`ColumnFrame`] from an ordered list of `(key, column values)`
/// pairs.
///
/// An empty list yields `Self::default()`. Otherwise the number of rows is the
/// length of the first column and the number of columns is the number of
/// pairs. Keys are stored in the index in input order, without sorting.
///
/// # Panics
///
/// Panics if the index cannot resolve a key right after storing it (the
/// `"BUG: Defined above!"` expectation). It also panics, per ndarray's
/// documented `assign` behaviour, if a column cannot be broadcast to the
/// row count taken from the first column.
impl From<Vec<(Key, Vec<DataValue>)>> for ColumnFrame {
    fn from(dataframe: Vec<(Key, Vec<DataValue>)>) -> Self {
        if dataframe.is_empty() {
            return Self::default();
        }

        // Shape: rows from the first column, one column per input pair.
        let rows = dataframe.first().expect("Expects data").1.len();
        let columns = dataframe.len();
        let mut data_frame = Array2::default((rows, columns));

        let mut index = KeyIndex::new(vec![]);
        for (key, column_values) in dataframe {
            index.store_key(key.clone());
            let position = index.get_column_index(&key).expect("BUG: Defined above!");
            // Overwrite the whole column in one go.
            data_frame
                .slice_mut(ndarray::s![.., position])
                .assign(&Array1::from_vec(column_values));
        }
        Self::new(index, data_frame)
    }
}

/// Builds a [`ColumnFrame`] from a map of column name to a one-dimensional
/// array of values.
///
/// The keys are handed to `KeyIndex::new` in the map's iteration order (they
/// are not sorted here). The number of rows is the length of whichever column
/// the map yields first, or 0 for an empty map. Each key's `ctype` is
/// detected from the first value of its column, falling back to
/// `DataValue::Null` when the column is empty.
///
/// # Panics
///
/// Per ndarray's documented `assign` behaviour, panics if a column cannot be
/// broadcast to the row count chosen above.
impl From<std::collections::HashMap<String, Array1<DataValue>>> for ColumnFrame {
    fn from(mut dataframe: std::collections::HashMap<String, Array1<DataValue>>) -> Self {
        let keys = dataframe.keys().map(|key| key.into()).collect::<Vec<_>>();
        let mut index = KeyIndex::new(keys);
        let mut arr = Array2::default((
            dataframe.values().next().map_or(0, |v| v.len()),
            index.len(),
        ));

        for (column_index, key) in index.get_keys_mut().iter_mut().enumerate() {
            if let Some(value) = dataframe.remove(key.name()) {
                // Detect the dtype from the incoming column. Reading it from
                // the destination slice before `assign` would only ever see
                // the default-initialised cell, never the real data.
                key.ctype = detect_dtype(value.get(0).unwrap_or(&DataValue::Null));
                arr.slice_mut(ndarray::s![.., column_index]).assign(&value);
            }
        }
        ColumnFrame::new(index, arr)
    }
}

#[cfg(feature = "polars-df")]
/// Converts a polars `DataFrame` into a [`ColumnFrame`] (only compiled with
/// the `polars-df` feature).
///
/// The index is built from the frame's column names and the array has
/// `dataframe.height()` rows. Every column found in the index is converted
/// value by value with `from_polars_value`, its dtype is detected with
/// `detect_dtype_arr`, and the index entry is then updated through
/// `rename_key` with the typed key. A column that is not found in the index
/// gets `DataType::Unknown`.
impl From<polars::prelude::DataFrame> for ColumnFrame {
    fn from(dataframe: polars::prelude::DataFrame) -> Self {
        use crate::detect_dtype_arr;

        let column_keys: Vec<_> = dataframe
            .get_column_names()
            .iter()
            .map(|name| name.as_str().into())
            .collect();
        let mut index = KeyIndex::new(column_keys);
        let mut arr = Array2::default((dataframe.height(), index.len()));

        for column in dataframe.iter() {
            let mut key = Key::from(column.name().as_str());
            let detected = match index.get_column_index(&key) {
                Some(column_index) => {
                    let values: Vec<DataValue> = column
                        .iter()
                        .map(crate::dataframe::from_polars_value)
                        .collect();
                    let dtype = detect_dtype_arr(&values);
                    // Overwrite the whole target column with the converted values.
                    arr.slice_mut(ndarray::s![.., column_index])
                        .assign(&Array1::from_vec(values));
                    dtype
                }
                None => crate::DataType::Unknown,
            };
            key.ctype = detected;
            let _ = index.rename_key(column.name(), key);
        }
        ColumnFrame::new(index, arr)
    }
}