Skip to main content

trs_dataframe/dataframe/column_store/
from.rs

1use data_value::DataValue;
2use halfbrown::HashMap;
3
4use crate::{detect_dtype_arr, Key, MLChefMap};
5
6use super::{typed_array::TypedDataArray, ColumnFrame, KeyIndex};
7
8/// NOTE: Because of randomnes of the key order in the hashmap, the order of
9/// the keys are sorted!
10impl From<Vec<std::collections::HashMap<Key, DataValue>>> for ColumnFrame {
11    fn from(dataframe: Vec<std::collections::HashMap<Key, DataValue>>) -> Self {
12        let mut keys = dataframe
13            .iter()
14            .flat_map(|x| x.keys())
15            .cloned()
16            .collect::<Vec<_>>();
17        keys.sort();
18        keys.dedup();
19        let index = KeyIndex::new(keys);
20        let nrows = dataframe.len();
21        let ncols = index.len();
22        let mut columns: Vec<Vec<DataValue>> = vec![vec![DataValue::default(); nrows]; ncols];
23        for (row_idx, row) in dataframe.iter().enumerate() {
24            for (key, value) in row.iter() {
25                if let Some(col_idx) = index.get_column_index(key) {
26                    columns[col_idx][row_idx] = value.clone();
27                }
28            }
29        }
30        Self::new(index, columns)
31    }
32}
33
34/// NOTE: Because of randomnes of the key order in the hashmap, the order of
35/// the keys are sorted!
36impl From<Vec<HashMap<Key, DataValue>>> for ColumnFrame {
37    fn from(dataframe: Vec<HashMap<Key, DataValue>>) -> Self {
38        let mut keys = dataframe
39            .iter()
40            .flat_map(|x| x.keys())
41            .cloned()
42            .collect::<Vec<_>>();
43        keys.sort();
44        keys.dedup();
45        let index = KeyIndex::new(keys);
46        let nrows = dataframe.len();
47        let ncols = index.len();
48        let mut columns: Vec<Vec<DataValue>> = vec![vec![DataValue::default(); nrows]; ncols];
49        for (row_idx, row) in dataframe.iter().enumerate() {
50            for (key, value) in row.iter() {
51                if let Some(col_idx) = index.get_column_index(key) {
52                    columns[col_idx][row_idx] = value.clone();
53                }
54            }
55        }
56        Self::new(index, columns)
57    }
58}
59
60fn from_string_keyed<S: AsRef<str>>(
61    dataframe: impl IntoIterator<Item = (S, Vec<DataValue>)>,
62    sorted_keys: Vec<Key>,
63) -> ColumnFrame {
64    let mut index = KeyIndex::new(sorted_keys);
65    let entries: Vec<(S, Vec<DataValue>)> = dataframe.into_iter().collect();
66    let size = entries
67        .iter()
68        .map(|(_, v)| v.len())
69        .max()
70        .unwrap_or_default();
71    let ncols = index.len();
72    let mut data_frame: Vec<Vec<DataValue>> =
73        (0..ncols).map(|_| vec![DataValue::Null; size]).collect();
74    // Track detected dtypes so we can update the index keys without
75    // forcing the column data to a typed representation. This preserves
76    // `Null` values when the column has been resized to match the widest
77    // input vector.
78    let mut detected: Vec<(String, Key)> = Vec::new();
79    for (key, mut value) in entries {
80        let key_ref: Key = key.as_ref().into();
81        if let Some(col_idx) = index.get_column_index(&key_ref) {
82            let dtype_from_input = detect_dtype_arr(&value);
83            value.resize(size, DataValue::default());
84            data_frame[col_idx] = value;
85            detected.push((
86                key.as_ref().to_string(),
87                Key::new(key.as_ref(), dtype_from_input),
88            ));
89        }
90    }
91    for (name, new_key) in detected {
92        let _ = index.rename_key(&name, new_key);
93    }
94    ColumnFrame::new(index, data_frame)
95}
96
97/// NOTE: Because of randomnes of the key order in the hashmap, the order of
98/// the keys are sorted!
99impl From<std::collections::HashMap<String, Vec<DataValue>>> for ColumnFrame {
100    fn from(dataframe: std::collections::HashMap<String, Vec<DataValue>>) -> Self {
101        let mut keys = dataframe
102            .keys()
103            .map(|x| x.as_str().into())
104            .collect::<Vec<_>>();
105        keys.sort();
106        keys.dedup();
107        from_string_keyed(dataframe, keys)
108    }
109}
110
111impl From<MLChefMap> for ColumnFrame {
112    fn from(dataframe: MLChefMap) -> Self {
113        let mut keys = dataframe
114            .keys()
115            .map(|x| x.as_str().into())
116            .collect::<Vec<_>>();
117        keys.sort();
118        keys.dedup();
119        from_string_keyed(dataframe, keys)
120    }
121}
122
123impl From<Vec<(Key, Vec<DataValue>)>> for ColumnFrame {
124    fn from(dataframe: Vec<(Key, Vec<DataValue>)>) -> Self {
125        if dataframe.is_empty() {
126            return Self::default();
127        }
128        let mut index = KeyIndex::new(vec![]);
129        let mut data_frame: Vec<TypedDataArray> = Vec::with_capacity(dataframe.len());
130        for (key, value) in dataframe {
131            let dtype = key.ctype;
132            index.store_key(key);
133            data_frame.push(TypedDataArray::new(dtype, value));
134        }
135        ColumnFrame::new(index, data_frame)
136    }
137}
138
139impl From<std::collections::HashMap<String, ndarray::Array1<DataValue>>> for ColumnFrame {
140    fn from(mut dataframe: std::collections::HashMap<String, ndarray::Array1<DataValue>>) -> Self {
141        let keys = dataframe.keys().map(|key| key.into()).collect::<Vec<_>>();
142        let index = KeyIndex::new(keys);
143        let nrows = dataframe.values().next().map_or(0, |v| v.len());
144        let ncols = index.len();
145        let mut data_frame: Vec<Vec<DataValue>> =
146            (0..ncols).map(|_| vec![DataValue::Null; nrows]).collect();
147
148        for (column_index, key) in index.get_keys().iter().enumerate() {
149            if let Some(value) = dataframe.remove(key.name()) {
150                data_frame[column_index] = value.to_vec();
151            }
152        }
153        ColumnFrame::new(index, data_frame)
154    }
155}
156
/// Converts a polars `DataFrame` into a [`ColumnFrame`].
///
/// Columns keep the order reported by `get_column_names()`. Every polars
/// value is converted with `crate::dataframe::from_polars_value`, the dtype of
/// each column is detected from the converted values, and the matching index
/// key is renamed to carry that dtype.
#[cfg(feature = "polars-df")]
impl From<polars::prelude::DataFrame> for ColumnFrame {
    fn from(dataframe: polars::prelude::DataFrame) -> Self {
        let column_keys = dataframe
            .get_column_names()
            .iter()
            .map(|name| name.as_str().into())
            .collect::<Vec<_>>();
        let mut index = KeyIndex::new(column_keys);
        let ncols = index.len();
        let nrows = dataframe.height();

        // Pre-fill one placeholder array per index slot; real data replaces it below.
        let mut columns: Vec<TypedDataArray> = Vec::with_capacity(ncols);
        for position in 0..ncols {
            let placeholder = match index.get_keys().get(position) {
                Some(key) => TypedDataArray::default_init(key, nrows),
                None => TypedDataArray::default(),
            };
            columns.push(placeholder);
        }

        for column in dataframe.iter() {
            let mut key = Key::from(column.name().as_str());
            let dtype = match index.get_column_index(&key) {
                Some(col_idx) => {
                    let values: Vec<DataValue> = column
                        .iter()
                        .map(crate::dataframe::from_polars_value)
                        .collect();
                    let detected = crate::detect_dtype_arr(&values);
                    columns[col_idx] = TypedDataArray::new(detected, values);
                    detected
                }
                // A column without an index slot is left with an unknown dtype.
                None => crate::DataType::Unknown,
            };
            key.ctype = dtype;
            // The rename is attempted for every column; its outcome is deliberately ignored.
            let _ = index.rename_key(column.name(), key);
        }
        ColumnFrame::new(index, columns)
    }
}