// trs-dataframe 0.11.1
//
// Dataframe library for Teiresias
// Documentation
use data_value::DataValue;
use halfbrown::HashMap;

use crate::{detect_dtype_arr, Key, MLChefMap};

use super::{typed_array::TypedDataArray, ColumnFrame, KeyIndex};

/// Builds a `ColumnFrame` from a list of rows, where each row maps a `Key`
/// to the value of that column in the row.
///
/// NOTE: Because the key order of a hash map is random, the keys of all rows
/// are collected, sorted and de-duplicated before the index is built.
/// Cells whose key is missing from a row keep `DataValue::default()`.
impl From<Vec<std::collections::HashMap<Key, DataValue>>> for ColumnFrame {
    fn from(dataframe: Vec<std::collections::HashMap<Key, DataValue>>) -> Self {
        let mut keys: Vec<Key> = dataframe
            .iter()
            .flat_map(|row| row.keys())
            .cloned()
            .collect();
        keys.sort();
        keys.dedup();
        let index = KeyIndex::new(keys);
        // Read the row count before the rows are consumed below.
        let nrows = dataframe.len();
        let ncols = index.len();
        let mut columns: Vec<Vec<DataValue>> = vec![vec![DataValue::default(); nrows]; ncols];
        // The input is owned, so each value is moved into its cell instead of
        // being cloned out of a borrowed row.
        for (row_idx, row) in dataframe.into_iter().enumerate() {
            for (key, value) in row {
                if let Some(col_idx) = index.get_column_index(&key) {
                    columns[col_idx][row_idx] = value;
                }
            }
        }
        Self::new(index, columns)
    }
}

/// Builds a `ColumnFrame` from a list of rows stored as `halfbrown` hash
/// maps, where each row maps a `Key` to the value of that column in the row.
///
/// NOTE: Because the key order of a hash map is random, the keys of all rows
/// are collected, sorted and de-duplicated before the index is built.
/// Cells whose key is missing from a row keep `DataValue::default()`.
impl From<Vec<HashMap<Key, DataValue>>> for ColumnFrame {
    fn from(dataframe: Vec<HashMap<Key, DataValue>>) -> Self {
        let mut keys: Vec<Key> = dataframe
            .iter()
            .flat_map(|row| row.keys())
            .cloned()
            .collect();
        keys.sort();
        keys.dedup();
        let index = KeyIndex::new(keys);
        // Read the row count before the rows are consumed below.
        let nrows = dataframe.len();
        let ncols = index.len();
        let mut columns: Vec<Vec<DataValue>> = vec![vec![DataValue::default(); nrows]; ncols];
        // The input is owned, so each value is moved into its cell instead of
        // being cloned out of a borrowed row.
        for (row_idx, row) in dataframe.into_iter().enumerate() {
            for (key, value) in row {
                if let Some(col_idx) = index.get_column_index(&key) {
                    columns[col_idx][row_idx] = value;
                }
            }
        }
        Self::new(index, columns)
    }
}

/// Assembles a `ColumnFrame` from `(name, values)` column pairs.
///
/// `sorted_keys` seeds the key index. Each input column is looked up in the
/// index by name; when found, it is padded with `DataValue::default()` up to
/// the length of the longest input column and stored at its index position.
/// Entries whose name is not found in the index are skipped. After all
/// columns are placed, each matched key is renamed to a key carrying the
/// dtype detected from the unpadded input values.
fn from_string_keyed<S: AsRef<str>>(
    dataframe: impl IntoIterator<Item = (S, Vec<DataValue>)>,
    sorted_keys: Vec<Key>,
) -> ColumnFrame {
    let mut index = KeyIndex::new(sorted_keys);
    // Materialised up front because the input is walked twice: once for the
    // longest column length, once to place the columns.
    let pairs: Vec<(S, Vec<DataValue>)> = dataframe.into_iter().collect();
    let height = pairs
        .iter()
        .fold(0usize, |longest, (_, values)| longest.max(values.len()));
    let width = index.len();
    let mut columns: Vec<Vec<DataValue>> = vec![vec![DataValue::Null; height]; width];
    // Renames are applied only after every column is placed; the column data
    // itself stays untyped, so the `Null` padding is preserved.
    let mut renames: Vec<(String, Key)> = Vec::new();
    for (raw_name, mut values) in pairs {
        let name: &str = raw_name.as_ref();
        let lookup: Key = name.into();
        let position = match index.get_column_index(&lookup) {
            Some(position) => position,
            None => continue,
        };
        // Detect the dtype before padding so the filler values do not
        // influence the result.
        let dtype = detect_dtype_arr(&values);
        values.resize(height, DataValue::default());
        columns[position] = values;
        renames.push((name.to_string(), Key::new(name, dtype)));
    }
    for (old_name, typed_key) in renames {
        let _ = index.rename_key(&old_name, typed_key);
    }
    ColumnFrame::new(index, columns)
}

/// NOTE: Because of randomnes of the key order in the hashmap, the order of
/// the keys are sorted!
impl From<std::collections::HashMap<String, Vec<DataValue>>> for ColumnFrame {
    fn from(dataframe: std::collections::HashMap<String, Vec<DataValue>>) -> Self {
        let mut keys = dataframe
            .keys()
            .map(|x| x.as_str().into())
            .collect::<Vec<_>>();
        keys.sort();
        keys.dedup();
        from_string_keyed(dataframe, keys)
    }
}

impl From<MLChefMap> for ColumnFrame {
    fn from(dataframe: MLChefMap) -> Self {
        let mut keys = dataframe
            .keys()
            .map(|x| x.as_str().into())
            .collect::<Vec<_>>();
        keys.sort();
        keys.dedup();
        from_string_keyed(dataframe, keys)
    }
}

/// Converts a list of `(key, values)` columns into a `ColumnFrame`.
///
/// Columns are processed in input order: each key is stored in the index and
/// its values become a `TypedDataArray` typed with the key's `ctype`.
/// An empty input yields `ColumnFrame::default()`.
impl From<Vec<(Key, Vec<DataValue>)>> for ColumnFrame {
    fn from(dataframe: Vec<(Key, Vec<DataValue>)>) -> Self {
        if dataframe.is_empty() {
            return Self::default();
        }
        let mut index = KeyIndex::new(Vec::new());
        let typed_columns: Vec<TypedDataArray> = dataframe
            .into_iter()
            .map(|(key, values)| {
                // Copy the dtype out before the key is moved into the index.
                let dtype = key.ctype;
                index.store_key(key);
                TypedDataArray::new(dtype, values)
            })
            .collect();
        ColumnFrame::new(index, typed_columns)
    }
}

impl From<std::collections::HashMap<String, ndarray::Array1<DataValue>>> for ColumnFrame {
    fn from(mut dataframe: std::collections::HashMap<String, ndarray::Array1<DataValue>>) -> Self {
        let keys = dataframe.keys().map(|key| key.into()).collect::<Vec<_>>();
        let index = KeyIndex::new(keys);
        let nrows = dataframe.values().next().map_or(0, |v| v.len());
        let ncols = index.len();
        let mut data_frame: Vec<Vec<DataValue>> =
            (0..ncols).map(|_| vec![DataValue::Null; nrows]).collect();

        for (column_index, key) in index.get_keys().iter().enumerate() {
            if let Some(value) = dataframe.remove(key.name()) {
                data_frame[column_index] = value.to_vec();
            }
        }
        ColumnFrame::new(index, data_frame)
    }
}

#[cfg(feature = "polars-df")]
/// Converts a polars `DataFrame` into a `ColumnFrame`.
///
/// The index is seeded with the names returned by `get_column_names()`.
/// Each column found in the index is converted value by value with
/// `from_polars_value`, its dtype is detected from the converted values, and
/// the data is stored as a `TypedDataArray` of that dtype. Every column's key
/// is then renamed to a key carrying the detected dtype (`DataType::Unknown`
/// when the column was not found in the index).
impl From<polars::prelude::DataFrame> for ColumnFrame {
    fn from(dataframe: polars::prelude::DataFrame) -> Self {
        use crate::detect_dtype_arr;

        let names: Vec<Key> = dataframe
            .get_column_names()
            .iter()
            .map(|name| name.as_str().into())
            .collect();
        let mut index = KeyIndex::new(names);
        let width = index.len();
        let height = dataframe.height();

        // Start with one default-initialised array per index position; a
        // position without a key falls back to `TypedDataArray::default()`.
        let mut typed_columns: Vec<TypedDataArray> = (0..width)
            .map(|position| {
                index
                    .get_keys()
                    .get(position)
                    .map_or_else(TypedDataArray::default, |key| {
                        TypedDataArray::default_init(key, height)
                    })
            })
            .collect();

        for column in dataframe.iter() {
            let mut key = Key::from(column.name().as_str());
            let dtype = match index.get_column_index(&key) {
                Some(position) => {
                    let values: Vec<DataValue> = column
                        .iter()
                        .map(crate::dataframe::from_polars_value)
                        .collect();
                    let detected = detect_dtype_arr(&values);
                    typed_columns[position] = TypedDataArray::new(detected, values);
                    detected
                }
                None => crate::DataType::Unknown,
            };
            key.ctype = dtype;
            // The rename is attempted for every column; its result is ignored.
            let _ = index.rename_key(column.name(), key);
        }
        ColumnFrame::new(index, typed_columns)
    }
}