// trs-dataframe 0.11.1

// Dataframe library for Teiresias
// Documentation
use data_value::DataValue;
use halfbrown::HashMap;
use ndarray::ArrayView1;
use serde::{Deserialize, Serialize};

use crate::{error::Error, Key};

/// [`KeyIndex`] stores the column keys of a [`super::ColumnFrame`].
///
/// The keys are kept in a [`Vec`] in the order they were added, and a
/// [`HashMap`] maps every key name to its column position. The positions are
/// what is used to access the data in the [`super::ColumnFrame`] by column [`Key`].
///
/// NOTE: Key names are unique - when a key with an already stored name is
/// added, the new key is dropped and the first occurrence is kept.
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
pub struct KeyIndex {
    /// Keys in insertion order.
    pub keys: Vec<Key>,
    /// Column position for every stored key, looked up by key name.
    indexes: HashMap<String, usize>,
    /// Alternative names: maps an alias to the name of the key it refers to.
    pub alias: HashMap<String, String>,
}

impl KeyIndex {
    /// Builds a [`KeyIndex`] from `keys`, preserving their order.
    ///
    /// When several keys share a name only the first one is kept; the
    /// positions of the remaining keys are contiguous, starting at `0`.
    /// The alias table of the new index is empty.
    pub fn new(keys: Vec<Key>) -> Self {
        let capacity = keys.len();
        let mut unique = Vec::with_capacity(capacity);
        let mut positions = HashMap::with_capacity(capacity);
        for candidate in keys {
            if positions.contains_key(candidate.name()) {
                // A key with this name was already stored - drop the repeat.
                continue;
            }
            // The next free slot is exactly the number of keys kept so far.
            positions.insert(candidate.name().to_string(), unique.len());
            unique.push(candidate);
        }
        Self {
            keys: unique,
            indexes: positions,
            alias: HashMap::new(),
        }
    }

    /// Returns the number of stored keys.
    pub fn len(&self) -> usize {
        self.keys.as_slice().len()
    }
    /// Returns `true` when no key is stored.
    pub fn is_empty(&self) -> bool {
        self.keys.as_slice().is_empty()
    }

    /// Looks up the column position of `key` by its name.
    ///
    /// Only the name of `key` takes part in the lookup; aliases are resolved
    /// the same way as in [`Self::get_column_index_by_name`].
    pub fn get_column_index(&self, key: &Key) -> Option<usize> {
        let name = key.name();
        self.get_column_index_by_name(name)
    }

    /// Looks up the column position stored for the name `key`.
    ///
    /// A directly stored name wins. Otherwise `key` is treated as an alias and
    /// the position of the aliased key is returned. `None` is returned when
    /// neither lookup succeeds.
    pub fn get_column_index_by_name(&self, key: &str) -> Option<usize> {
        self.indexes.get(key).copied().or_else(|| {
            // Not a stored name - try to resolve it through the alias table.
            let target = self.alias.get(key)?;
            self.indexes.get(target).copied()
        })
    }

    /// Returns the stored keys in insertion order.
    pub fn get_keys(&self) -> &[Key] {
        self.keys.as_slice()
    }
    /// Returns the stored keys as a mutable slice for in-place edits.
    ///
    /// The name-to-position map is not touched by edits made through this slice.
    pub fn get_keys_mut(&mut self) -> &mut [Key] {
        self.keys.as_mut_slice()
    }

    /// Returns a clone of the key stored at position `idx`, or `None` when
    /// `idx` is past the end of the key list.
    pub fn get_key(&self, idx: usize) -> Option<Key> {
        let stored = self.keys.get(idx)?;
        Some(stored.clone())
    }
    /// Returns clones of all stored keys that are not contained in `keys`,
    /// in the order they are stored.
    pub fn get_complement_keys(&self, keys: &[Key]) -> Vec<Key> {
        let mut remaining = Vec::new();
        for stored in &self.keys {
            if keys.contains(stored) {
                continue;
            }
            remaining.push(stored.clone());
        }
        remaining
    }

    /// Creates a new [`KeyIndex`] restricted to the requested `keys`.
    ///
    /// Every requested key is resolved by name, first directly and then
    /// through the alias table. Keys that cannot be resolved are skipped.
    /// The positions are copied from this index unchanged, so they still
    /// describe the column layout of the index the selection was made from.
    /// A key requested through an alias is stored under the alias name, and
    /// the alias table of the returned index is empty.
    pub fn select(&self, keys: &[Key]) -> KeyIndex {
        let mut picked = Vec::with_capacity(keys.len());
        let mut positions = HashMap::with_capacity(keys.len());

        for requested in keys {
            // Direct lookup first, alias lookup as the fallback.
            let resolved = match self.indexes.get(requested.name()) {
                Some(position) => Some(*position),
                None => self
                    .alias
                    .get(requested.name())
                    .and_then(|target| self.indexes.get(target).copied()),
            };
            if let Some(position) = resolved {
                positions.insert(requested.name().to_string(), position);
                picked.push(requested.clone());
            }
        }

        Self {
            keys: picked,
            indexes: positions,
            alias: HashMap::new(),
        }
    }

    /// Returns all stored column positions.
    ///
    /// The positions come out in the iteration order of the internal map.
    // NOTE(review): that order is presumably not guaranteed to match the key order - confirm before relying on it.
    pub fn indexes(&self) -> Vec<usize> {
        let mut positions = Vec::with_capacity(self.indexes.len());
        positions.extend(self.indexes.values().copied());
        positions
    }

    /// Appends `key` to the index.
    ///
    /// The call is a no-op when a key with the same name is already stored,
    /// even if the new key differs in other respects. Otherwise the key is
    /// pushed to the end of the key list and its name is mapped to that
    /// position.
    pub fn store_key(&mut self, key: Key) {
        if self.indexes.contains_key(key.name()) {
            return;
        }
        // Record the name first so the owned key can be moved into `keys`
        // without an intermediate clone.
        let position = self.keys.len();
        self.indexes.insert(key.name().to_string(), position);
        self.keys.push(key);
    }

    /// Removes the key with the same name as `key` from the index.
    ///
    /// Returns the removed key together with the position it occupied, or
    /// `None` when no key with that name is stored. The positions of all keys
    /// stored behind the removed one are decremented so that they keep
    /// matching the key list.
    ///
    /// # Panics
    ///
    /// Panics when the stored position is not a valid offset into the key list.
    // NOTE(review): positions equal key-list offsets for indexes built via `new`/`store_key`;
    // indexes produced by `select` keep the positions of their source index - confirm callers.
    pub fn remove_key(&mut self, key: &Key) -> Option<(Key, usize)> {
        let idx = self.indexes.remove(key.name())?;
        let current = self.keys.remove(idx);

        // `Vec::remove` shifted every later key one slot to the left; without
        // this adjustment their entries would point one column too far.
        for position in self.indexes.values_mut() {
            if *position > idx {
                *position -= 1;
            }
        }

        Some((current, idx))
    }

    pub fn get_as_candidate(&self, row: ArrayView1<DataValue>) -> HashMap<Key, DataValue> {
        let mut result = HashMap::with_capacity(self.keys.len());
        for (key, idx) in self.indexes.iter() {
            result.insert(key.into(), row[*idx].clone());
        }
        result
    }

    pub fn to_vec_row(&self, candidate: HashMap<Key, DataValue>) -> Vec<DataValue> {
        self.keys
            .iter()
            .map(|key| candidate.get(key).cloned().unwrap_or_default())
            .collect()
    }

    pub fn check_order_of_indexes(&self, other: &Self) -> Result<(), Error> {
        for (self_key, other_key) in self.keys.iter().zip(other.keys.iter()) {
            if self_key != other_key {
                return Err(Error::IndexOutOfOrder(
                    self.keys.clone(),
                    other.keys.clone(),
                ));
            }
        }
        Ok(())
    }

    pub fn rename_key(&mut self, key: &str, new_key: Key) -> Result<(), Error> {
        if let Some(idx) = self.indexes.remove(key) {
            self.indexes.insert(new_key.to_string(), idx);
            self.keys[idx] = new_key;
            Ok(())
        } else {
            Err(Error::NotFound(key.into()))
        }
    }

    pub fn add_alias(&mut self, key: &str, alias: &str) -> Result<(), Error> {
        if !self.indexes.contains_key(key) {
            return Err(Error::NotFound(key.into()));
        }
        self.alias.insert(alias.to_string(), key.to_string());
        Ok(())
    }
}

impl From<Vec<Key>> for KeyIndex {
    fn from(keys: Vec<Key>) -> Self {
        Self::new(keys)
    }
}

#[cfg(test)]
mod test {
    use crate::DataType;

    use super::*;
    use rstest::*;

    /// An alias can only target a stored key and resolves to that key's position.
    #[rstest]
    fn test_alias() {
        let key = Key::new("a", DataType::U32);
        let mut key_index = KeyIndex::new(vec![key.clone()]);
        assert_eq!(key_index.add_alias(key.name(), "alias"), Ok(()));
        assert!(key_index.add_alias("c", "alias").is_err());
        assert_eq!(key_index.alias.get("alias"), Some(&key.name().to_string()));
        assert_eq!(key_index.get_column_index(&key), Some(0));
        assert_eq!(
            key_index.get_column_index(&Key::new("alias", DataType::U32)),
            Some(0)
        );
    }

    /// Renaming keeps the column position; renaming an unknown key fails.
    #[rstest]
    fn test_rename() {
        let key = Key::new("a", DataType::U32);
        let mut key_index = KeyIndex::new(vec![key.clone(), Key::new("b", DataType::U32)]);
        assert_eq!(key_index.rename_key("a", "new_key".into()), Ok(()));
        assert_eq!(
            key_index.get_column_index(&Key::new("new_key", DataType::U32)),
            Some(0)
        );
        assert!(key_index.rename_key("c", "alias".into()).is_err());
    }

    /// Unique keys are stored unchanged and in order.
    #[rstest]
    #[case(
        vec![Key::new("a", DataType::U32), Key::new("b", DataType::U32)],
        vec![Key::new("a", DataType::U32), Key::new("b", DataType::U32)]
    )]
    fn test_key_index_new(#[case] keys: Vec<Key>, #[case] expected: Vec<Key>) {
        let key_index = KeyIndex::new(keys);
        assert_eq!(key_index.keys, expected);
        assert_eq!(key_index.get_keys(), expected);
    }

    /// Identical key order passes, swapped order reports both key lists.
    #[rstest]
    #[case(
        vec![Key::new("a", DataType::U32), Key::new("b", DataType::U32)],
        vec![Key::new("a", DataType::U32), Key::new("b", DataType::U32)],
        Ok(())
    )]
    #[case(
        vec![Key::new("a", DataType::U32), Key::new("b", DataType::U32)],
        vec![Key::new("b", DataType::U32), Key::new("a", DataType::U32)],
        Err(Error::IndexOutOfOrder(
            vec![Key::new("a", DataType::U32), Key::new("b", DataType::U32)],
            vec![Key::new("b", DataType::U32), Key::new("a", DataType::U32)]
        ))
    )]
    fn test_key_index_check_order_of_indexes(
        #[case] keys: Vec<Key>,
        #[case] other_keys: Vec<Key>,
        #[case] expected: Result<(), Error>,
    ) {
        let key_index: KeyIndex = keys.into();
        let other_key_index = KeyIndex::new(other_keys);
        assert_eq!(key_index.check_order_of_indexes(&other_key_index), expected);
    }

    /// Removing a key returns the key together with the position it occupied.
    #[rstest]
    #[case(
        KeyIndex::new(vec![Key::new("a", DataType::U32), Key::new("b", DataType::U32)]),
        (Key::new("a", DataType::U32), 0)
    )]
    #[case(
        KeyIndex::new(vec![Key::new("a", DataType::U32), Key::new("b", DataType::U32)]),
        (Key::new("b", DataType::U32), 1)
    )]
    fn test_key_index_remove_key(#[case] mut key_index: KeyIndex, #[case] expected: (Key, usize)) {
        let key = expected.0.clone();
        assert_eq!(key_index.remove_key(&key), Some(expected));
    }

    /// Repeated names are collapsed to their first occurrence.
    #[rstest]
    fn new_dedupes_repeated_keys() {
        let keys = vec![
            Key::new("a", DataType::U32),
            Key::new("a", DataType::U32),
            Key::new("b", DataType::U32),
        ];
        let key_index = KeyIndex::new(keys);
        assert_eq!(key_index.len(), 2);
        assert_eq!(
            key_index.get_keys(),
            &[Key::new("a", DataType::U32), Key::new("b", DataType::U32)]
        );
    }

    /// Storing a key whose name already exists does not grow the index.
    #[rstest]
    fn store_key_is_idempotent_for_existing_name() {
        let mut idx = KeyIndex::new(vec![Key::new("a", DataType::U32)]);
        idx.store_key(Key::new("a", DataType::I32));
        // Length stays at 1 — the second store with the same name is a no-op.
        assert_eq!(idx.len(), 1);
    }

    /// Edits made through the mutable slice are visible through `get_keys`.
    #[rstest]
    fn get_keys_mut_allows_inplace_edit() {
        let mut idx = KeyIndex::new(vec![Key::new("a", DataType::U32)]);
        let keys = idx.get_keys_mut();
        keys[0].ctype = DataType::I32;
        assert_eq!(idx.get_keys()[0].ctype, DataType::I32);
    }

    /// Each key receives the value found at its own column position.
    #[rstest]
    fn get_as_candidate_maps_row_view() {
        use ndarray::array;
        let idx = KeyIndex::new(vec![
            Key::new("a", DataType::U32),
            Key::new("b", DataType::U32),
        ]);
        let row = array![DataValue::I32(1), DataValue::I32(2)];
        let mut got = idx.get_as_candidate(row.view());
        // The result has one entry per stored index. Ordering doesn't matter.
        assert_eq!(got.len(), 2);
        let a = got.remove(&Key::new("a", DataType::Unknown)).unwrap();
        let b = got.remove(&Key::new("b", DataType::Unknown)).unwrap();
        // Pin the exact value per key so a swapped mapping fails the test.
        assert_eq!(a, DataValue::I32(1));
        assert_eq!(b, DataValue::I32(2));
    }

    /// The row follows the key order of the index, not the map order.
    #[rstest]
    fn to_vec_row_orders_by_index_keys() {
        let idx = KeyIndex::new(vec![
            Key::new("a", DataType::U32),
            Key::new("b", DataType::U32),
        ]);
        let mut candidate: HashMap<Key, DataValue> = HashMap::new();
        candidate.insert(Key::new("b", DataType::U32), DataValue::I32(20));
        candidate.insert(Key::new("a", DataType::U32), DataValue::I32(10));
        let row = idx.to_vec_row(candidate);
        assert_eq!(row, vec![DataValue::I32(10), DataValue::I32(20)]);
    }

    /// Keys without a value in the candidate map yield the default value.
    #[rstest]
    fn to_vec_row_fills_missing_keys_with_default() {
        let idx = KeyIndex::new(vec![
            Key::new("a", DataType::U32),
            Key::new("missing", DataType::U32),
        ]);
        let mut candidate: HashMap<Key, DataValue> = HashMap::new();
        candidate.insert(Key::new("a", DataType::U32), DataValue::I32(1));
        let row = idx.to_vec_row(candidate);
        assert_eq!(row, vec![DataValue::I32(1), DataValue::default()]);
    }

    /// A key requested by alias is selected and keeps the aliased position.
    #[rstest]
    fn select_resolves_via_alias() {
        let mut idx = KeyIndex::new(vec![Key::new("a", DataType::U32)]);
        idx.add_alias("a", "alias_a").unwrap();
        let selected = idx.select(&[Key::new("alias_a", DataType::U32)]);
        assert_eq!(selected.len(), 1);
        assert_eq!(
            selected.get_column_index(&Key::new("alias_a", DataType::U32)),
            Some(0)
        );
    }

    /// Keys that are neither stored nor aliased are left out of the selection.
    #[rstest]
    fn select_skips_unknown_keys() {
        let idx = KeyIndex::new(vec![Key::new("a", DataType::U32)]);
        let selected = idx.select(&[
            Key::new("a", DataType::U32),
            Key::new("ghost", DataType::U32),
        ]);
        assert_eq!(selected.len(), 1);
    }

    /// The complement contains every stored key that was not passed in.
    #[rstest]
    fn complement_keys_excludes_provided_set() {
        let idx = KeyIndex::new(vec![
            Key::new("a", DataType::U32),
            Key::new("b", DataType::U32),
            Key::new("c", DataType::U32),
        ]);
        let comp = idx.get_complement_keys(&[Key::new("b", DataType::U32)]);
        assert_eq!(
            comp,
            vec![Key::new("a", DataType::U32), Key::new("c", DataType::U32)]
        );
    }
}