trs_dataframe/
dataframe.rs

1use column_store::sorted_df::SortedDataFrame;
2use data_value::{DataValue, Extract};
3use halfbrown::HashMap;
4use ndarray::{Array1, Array2, ArrayView1};
5use std::fmt;
6pub mod column_store;
7pub mod index;
8pub mod join;
9pub mod key;
10use crate::{error::Error, CandidateData};
11#[cfg(feature = "python")]
12pub mod python;
13
14#[cfg(feature = "python")]
15use pyo3::prelude::*;
16
17use crate::{
18    dataframe::{column_store::ColumnFrame, join::JoinRelation, key::Key},
19    MLChefMap,
20};
21
/// Selector for a number of items counted from one end of a sequence.
///
/// NOTE(review): the consumers of this enum are not visible in this file; the
/// per-variant descriptions follow the variant names and should be confirmed.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum TopN {
    /// Presumably the first `n` items.
    First(usize),
    /// Presumably the last `n` items.
    Last(usize),
}
27
28/// DataFrame holds information about [`ColumnFrame`].
29/// This is used to store the data and the metadata for the candidates.
30#[derive(Debug, Clone, PartialEq, Default, serde::Serialize, serde::Deserialize)]
31#[cfg_attr(feature = "python", pyclass)]
32pub struct DataFrame {
33    /// Constants for the dataframe - mikro optimization for the data
34    /// Values which is constant for the whole dataframe are stored here
35    pub constants: HashMap<Key, DataValue>,
36    /// Dataframe with the candidates
37    //pub dataframe: Candidates<CandidateItem>,
38    pub dataframe: ColumnFrame,
39    /// Metadata for the dataframe. Here you can store the information about the dataframe
40    pub metadata: HashMap<String, DataValue>,
41}
42
43impl fmt::Display for DataFrame {
44    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
45        self.dataframe.fmt(f)
46    }
47}
48
49impl DataFrame {
50    pub fn new<C: Into<ColumnFrame>>(dataframe: C) -> Self {
51        Self {
52            constants: HashMap::new(),
53            dataframe: dataframe.into(),
54            metadata: HashMap::new(),
55        }
56    }
57
58    pub fn shrink(&mut self) {
59        self.dataframe.shrink();
60    }
61
62    pub fn add_metadata(&mut self, key: String, value: DataValue) {
63        self.metadata.insert(key, value);
64    }
65
66    pub fn get_metadata(&self, key: &str) -> Option<&DataValue> {
67        self.metadata.get(key)
68    }
69
70    pub fn join(&mut self, other: Self, join_type: &JoinRelation) -> Result<(), Error> {
71        other.constants.into_iter().for_each(|(key, value)| {
72            self.constants.insert(key, value);
73        });
74        self.dataframe.join(other.dataframe, join_type)
75    }
76
77    pub fn apply_function<F>(&mut self, keys: &[Key], mut func: F) -> Result<(), Error>
78    where
79        F: FnMut(&[Key], &mut ColumnFrame) -> Result<(), Error>,
80    {
81        self.dataframe.apply_function(keys, &mut func)
82    }
83
84    pub fn select(&self, keys: Option<&[Key]>) -> Result<Array2<DataValue>, Error> {
85        Ok(self.dataframe.select(keys))
86    }
87
88    pub fn select_transposed_typed<D: Extract>(&self, keys: &[Key]) -> Vec<Vec<D>> {
89        self.dataframe.select_transposed_typed::<D>(keys)
90    }
91
92    pub fn select_column(&self, key: Key) -> Option<ndarray::ArrayView1<DataValue>> {
93        self.dataframe.select_column(&key)
94    }
95
96    pub fn select_transposed(&self, keys: Option<&[Key]>) -> Result<Array2<DataValue>, Error> {
97        self.dataframe.select_transposed(keys)
98    }
99
100    pub fn insert_constant(&mut self, key: Key, value: DataValue) {
101        self.constants.insert(key, value);
102    }
103
104    pub fn push<C: CandidateData>(&mut self, item: C) -> Result<(), Error> {
105        self.dataframe.push(item)
106    }
107
108    pub fn remove_column(&mut self, keys: &[Key]) -> Result<Self, Error> {
109        self.dataframe.remove_column(keys).map(|x| x.into())
110    }
111
112    pub fn extend(&mut self, items: Self) -> Result<(), Error> {
113        self.dataframe.extend(items.dataframe)
114    }
115
116    pub fn len(&self) -> usize {
117        self.dataframe.len()
118    }
119
120    pub fn is_empty(&self) -> bool {
121        self.dataframe.is_empty()
122    }
123
124    pub fn add_single_column<K: Into<Key>>(
125        &mut self,
126        key: K,
127        values: Array1<DataValue>,
128    ) -> Result<(), Error> {
129        self.dataframe.add_single_column(key, values)
130    }
131
132    pub fn get_single_column(&self, key: &Key) -> Option<ArrayView1<DataValue>> {
133        self.dataframe.get_single_column(key)
134    }
135
136    pub fn sorted(&self, key: &Key) -> Result<SortedDataFrame<'_>, Error> {
137        self.dataframe.sorted(key)
138    }
139
140    pub fn filter(&self, filter: &crate::filter::FilterRules) -> Result<Self, Error> {
141        let filtered_df = self.dataframe.filter(filter)?;
142        Ok(Self {
143            constants: self.constants.clone(),
144            dataframe: filtered_df,
145            metadata: self.metadata.clone(),
146        })
147    }
148
149    #[cfg(feature = "polars-df")]
150    pub fn as_polars(&self) -> Result<polars::prelude::DataFrame, Error> {
151        let mut columns = vec![];
152        for key in self.dataframe.keys() {
153            columns.push(polars::prelude::Column::new(
154                key.name().into(),
155                self.dataframe
156                    .get_single_column(key)
157                    .ok_or_else(|| Error::NotFound(key.clone()))?
158                    .into_iter()
159                    .map(|x| into_polars_value(x.clone()))
160                    .collect::<Vec<_>>(),
161            ));
162        }
163
164        Ok(polars::prelude::DataFrame::new(columns)?)
165    }
166}
167
/// Converts a [`DataValue`] into an owned polars `AnyValue`.
///
/// `DataValue::U8` maps to `AnyValue::UInt8`, the variant that [`from_polars_value`]
/// converts back into `DataValue::U8`, so the value keeps its type on a round trip.
/// A `DataValue::Vec` becomes a polars list backed by a series named `"v"`.
///
/// # Panics
/// Panics via `unimplemented!` for `DataValue::EnumNumber`, `DataValue::U128` and
/// `DataValue::Map`. Building the list series for `DataValue::Vec` goes through
/// `Series::new` and may panic when polars cannot build a series from the elements.
#[cfg(feature = "polars-df")]
pub fn into_polars_value(dv: DataValue) -> polars::prelude::AnyValue<'static> {
    use polars::prelude::AnyValue::*;
    use polars::prelude::NamedFrom;
    match dv {
        DataValue::String(smart_string) => StringOwned(smart_string.as_str().into()),
        DataValue::Bytes(items) => BinaryOwned(items),
        // Same width on both sides: widening to UInt16 would come back as `DataValue::U32`.
        DataValue::U8(x) => UInt8(x),
        DataValue::Bool(x) => Boolean(x),
        DataValue::I32(x) => Int32(x),
        DataValue::U32(x) => UInt32(x),
        DataValue::I64(x) => Int64(x),
        DataValue::U64(x) => UInt64(x),
        DataValue::I128(x) => Int128(x),
        DataValue::F32(x) => Float32(x),
        DataValue::F64(x) => Float64(x),
        DataValue::Null => Null,
        DataValue::Vec(data_values) => List(polars::series::Series::new(
            "v".into(),
            data_values
                .into_iter()
                .map(into_polars_value)
                .collect::<Vec<_>>(),
        )),
        DataValue::EnumNumber(_) => unimplemented!(),
        DataValue::U128(_) => unimplemented!(),
        DataValue::Map(_) => unimplemented!(),
    }
}
197
/// Converts a polars `AnyValue` into a [`DataValue`].
///
/// `UInt16` is widened to `DataValue::U32`; `Int8` and `Int16` are widened to `i32` before
/// the conversion. Any variant without a dedicated arm is logged with `tracing::warn!`
/// and turned into `DataValue::Null`.
#[cfg(feature = "polars-df")]
pub fn from_polars_value(dv: polars::prelude::AnyValue<'_>) -> DataValue {
    use polars::prelude::AnyValue;
    match dv {
        AnyValue::Null => DataValue::Null,
        AnyValue::Boolean(v) => DataValue::from(v),
        AnyValue::String(v) => DataValue::String(v.into()),
        AnyValue::StringOwned(v) => DataValue::String(v.as_str().into()),
        AnyValue::Binary(v) => DataValue::Bytes(v.to_owned()),
        AnyValue::BinaryOwned(v) => DataValue::Bytes(v),
        AnyValue::UInt8(v) => DataValue::U8(v),
        AnyValue::UInt16(v) => DataValue::U32(u32::from(v)),
        AnyValue::UInt32(v) => DataValue::from(v),
        AnyValue::UInt64(v) => DataValue::from(v),
        AnyValue::Int8(v) => DataValue::from(i32::from(v)),
        AnyValue::Int16(v) => DataValue::from(i32::from(v)),
        AnyValue::Int32(v) => DataValue::from(v),
        AnyValue::Int64(v) => DataValue::from(v),
        AnyValue::Int128(v) => DataValue::from(v),
        AnyValue::Float32(v) => DataValue::from(v),
        AnyValue::Float64(v) => DataValue::from(v),
        AnyValue::List(series) => {
            DataValue::Vec(series.iter().map(from_polars_value).collect::<Vec<_>>())
        }
        // NOTE: there is no dedicated arm for `AnyValue::Array`; such values take the
        // fallback below.
        e => {
            tracing::warn!("Unsupported polars value: {e:?}");
            DataValue::Null
        }
    }
}
229
230impl From<ColumnFrame> for DataFrame {
231    fn from(dataframe: ColumnFrame) -> Self {
232        Self::new(dataframe)
233    }
234}
235
236impl From<Vec<std::collections::HashMap<Key, DataValue>>> for DataFrame {
237    fn from(dataframe: Vec<std::collections::HashMap<Key, DataValue>>) -> Self {
238        Self::new(ColumnFrame::from(dataframe))
239    }
240}
241
242impl From<Vec<HashMap<Key, DataValue>>> for DataFrame {
243    fn from(dataframe: Vec<HashMap<Key, DataValue>>) -> Self {
244        Self::new(ColumnFrame::from(dataframe))
245    }
246}
247
248impl From<std::collections::HashMap<String, Vec<DataValue>>> for DataFrame {
249    fn from(dataframe: std::collections::HashMap<String, Vec<DataValue>>) -> Self {
250        Self::new(ColumnFrame::from(dataframe))
251    }
252}
253
254impl From<MLChefMap> for DataFrame {
255    fn from(dataframe: MLChefMap) -> Self {
256        Self::new(ColumnFrame::from(dataframe))
257    }
258}
259impl From<Vec<(Key, Vec<DataValue>)>> for DataFrame {
260    fn from(dataframe: Vec<(Key, Vec<DataValue>)>) -> Self {
261        Self::new(ColumnFrame::from(dataframe))
262    }
263}
264
265impl From<std::collections::HashMap<String, Array1<DataValue>>> for DataFrame {
266    fn from(dataframe: std::collections::HashMap<String, Array1<DataValue>>) -> Self {
267        Self::new(ColumnFrame::from(dataframe))
268    }
269}
270
// Builds a dataframe from a polars dataframe, converted through `ColumnFrame`.
#[cfg(feature = "polars-df")]
impl From<polars::prelude::DataFrame> for DataFrame {
    fn from(polars_df: polars::prelude::DataFrame) -> Self {
        let column_frame = ColumnFrame::from(polars_df);
        Self::new(column_frame)
    }
}
277#[cfg(test)]
278mod test {
279    use crate::filter::FilterRules;
280
281    use super::*;
282    use halfbrown::hashmap;
283    #[cfg(feature = "polars-df")]
284    use polars::prelude::NamedFrom as _;
285    use rstest::*;
286    use tracing_test::traced_test;
287    #[fixture]
288    fn dummy_candidates() -> ColumnFrame {
289        ColumnFrame::from(vec![
290            hashmap! {
291                "key1".into() => 1.into(),
292                "key2".into() => "a".into(),
293            },
294            hashmap! {
295                "key1".into() => 2.into(),
296                "key2".into() => "b".into(),
297            },
298        ])
299    }
300
301    #[rstest]
302    fn test_serde() {
303        let df = crate::df! {
304            "a" => [1u64, 2u64, 3u64],
305            "b" => [4u64, 5u64, 6u64],
306            "c" => [7u64, 8u64, 9u64]
307        };
308
309        let serialized = serde_json::to_string(&df).expect("BUG: Unable to serialize dataframe");
310
311        let deserialized =
312            serde_json::from_str(&serialized).expect("BUG: Unable to deserialize dataframe");
313
314        assert_eq!(df, deserialized);
315    }
316
317    #[cfg(feature = "polars-df")]
318    #[rstest]
319    fn test_polars() {
320        let expected = crate::df! {
321            "a" => [1u64, 2u64, 3u64],
322            "b" => [4f64, 5f64, 6f64],
323            "c" => [7i64, 8i64, 9i64]
324        };
325
326        let polars_df = polars::df!(
327            "a" => [1u64, 2u64, 3u64],
328            "b" => [4f64, 5f64, 6f64],
329            "c" => [7i64, 8i64, 9i64]
330        )
331        .expect("BUG: should be ok");
332        let as_df: DataFrame = polars_df.into();
333        let keys: Vec<Key> = vec!["a".into(), "b".into(), "c".into()];
334        assert_eq!(
335            as_df.select(Some(keys.as_slice())),
336            expected.select(Some(keys.as_slice()))
337        );
338    }
339
340    #[cfg(feature = "polars-df")]
341    #[rstest]
342    #[case::str(DataValue::String("test".into()), polars::prelude::AnyValue::String("test".into()))]
343    #[case::u32(DataValue::U32(u32::MAX), polars::prelude::AnyValue::UInt32(u32::MAX))]
344    #[case::i32(DataValue::I32(i32::MIN), polars::prelude::AnyValue::Int32(i32::MIN))]
345    #[case::i64(DataValue::I64(i64::MIN), polars::prelude::AnyValue::Int64(i64::MIN))]
346    #[case::u64(DataValue::U64(u64::MIN), polars::prelude::AnyValue::UInt64(u64::MIN))]
347    #[case::f32(DataValue::F32(f32::MIN), polars::prelude::AnyValue::Float32(f32::MIN))]
348    #[case::f64(DataValue::F64(f64::MIN), polars::prelude::AnyValue::Float64(f64::MIN))]
349    #[case::null(DataValue::Null, polars::prelude::AnyValue::Null)]
350    #[case::i128(
351        DataValue::I128(i128::MIN),
352        polars::prelude::AnyValue::Int128(i128::MIN)
353    )]
354    #[case::u8(DataValue::U8(255), polars::prelude::AnyValue::UInt8(255))]
355    #[case::bool(DataValue::Bool(true), polars::prelude::AnyValue::Boolean(true))]
356    #[case::bytes(DataValue::Bytes("aaaaa".as_bytes().to_vec()), polars::prelude::AnyValue::BinaryOwned("aaaaa".as_bytes().to_vec()))]
357    #[case::vec_uints(DataValue::Vec(vec![DataValue::U32(0), DataValue::U32(1)]), polars::prelude::AnyValue::List(polars::series::Series::new("v".into(), vec![polars::prelude::AnyValue::UInt32(0u32), polars::prelude::AnyValue::UInt32(1)])))]
358    // polars converts all by first element type
359    //#[case::vec_diff_int(DataValue::Vec(vec![ DataValue::I32(1), DataValue::U32(0)]), polars::prelude::AnyValue::List(polars::series::Series::new("v".into(), vec![polars::prelude::AnyValue::Int32(1i32), polars::prelude::AnyValue::UInt32(0u32)])))]
360    //#[case::vec_int_str(DataValue::Vec(vec![DataValue::U32(0), DataValue::String("1".into())]), polars::prelude::AnyValue::List(polars::series::Series::new("v".into(), vec![polars::prelude::AnyValue::UInt32(0u32), polars::prelude::AnyValue::StringOwned("1".into())])))]
361    fn into_polars_value_test(
362        #[case] input: DataValue,
363        #[case] output: polars::prelude::AnyValue<'static>,
364    ) {
365        assert_eq!(into_polars_value(input.clone()), output);
366        assert_eq!(from_polars_value(output), input);
367    }
368
369    #[rstest]
370    #[case(
371        DataFrame::new(crate::column_frame! {
372            "a" => [1f64, 2f64, 3f64],
373            "b" => [4i64, 5i64, 6i64],
374            "c" => [7i64, 8i64, 9i64]
375        }),
376        DataFrame::new(crate::column_frame! {
377            "a" => [1f64, 2f64],
378            "b" => [4i64, 5i64],
379            "c" => [7i64, 8i64]
380        }),
381        FilterRules::try_from("a >= 1f64 && (b <= 5 || c <= 8) && b >= 4").expect("BUG: cannot create filter rules"),
382    )]
383    #[case(
384        DataFrame::new(crate::column_frame! {
385            "a" => [1f64, 2f64, 3f64],
386            "b" => [4i64, 5i64, 6i64],
387            "c" => [7i64, 8i64, 9i64]
388        }),
389        DataFrame::new(crate::column_frame! {
390            "a" => [2f64],
391            "b" => [5i64],
392            "c" => [8i64]
393        }),
394        FilterRules::try_from("a % 2f64 == 0f64").expect("BUG: cannot create filter rules"),
395    )]
396    #[traced_test]
397    fn filter_test(
398        #[case] df: DataFrame,
399        #[case] expected: DataFrame,
400        #[case] filter: FilterRules,
401    ) {
402        let filtered = df.filter(&filter).expect("BUG: cannot filter");
403        assert_eq!(filtered, expected);
404    }
405
406    #[rstest]
407    fn test_serde_complex() {
408        let simple = r#"
409{
410    "constants": {},
411    "dataframe": {
412        "index": {
413            "keys": [
414                {
415                    "key": 3162770485,
416                    "name": "a",
417                    "ctype": "U32"
418                },
419                {
420                    "key": 2279056742,
421                    "name": "b",
422                    "ctype": "F64"
423                },
424                {
425                    "key": 2994984227,
426                    "name": "c",
427                    "ctype": "U64"
428                },
429                {
430                    "key": 3319645144,
431                    "name": "d",
432                    "ctype": "F64"
433                },
434                {
435                    "key": 1291847470,
436                    "name": "e",
437                    "ctype": "U32"
438                },
439                {
440                    "key": 874241070,
441                    "name": "f",
442                    "ctype": "Bool"
443                }
444            ],
445            "indexes": {
446                "a": 0,
447                "b": 1,
448                "c": 2,
449                "d": 3,
450                "e": 4,
451                "f": 5
452            },
453            "alias": {}
454        },
455        "data_frame": {
456            "v": 1,
457            "dim": [
458                2,
459                6
460            ],
461            "data": [
462                253780,
463                0.009369421750307085,
464                1633222860381359,
465                8,
466                5,
467                true,
468                64512,
469                0.003391335718333721,
470                1633222860810557,
471                8,
472                5,
473                null
474            ]
475        }
476    },
477    "metadata": {}
478}
479        "#;
480
481        let simple_deserialized: DataFrame =
482            serde_json::from_str(simple).expect("BUG: Unable to deserialize dataframe");
483
484        println!("deserialized: {simple_deserialized:?}");
485        let array = format!("[{}, {}, {}]", simple, simple, simple);
486        let deserialized: Vec<DataFrame> =
487            serde_json::from_str(&array).expect("BUG: Unable to deserialize dataframe");
488
489        println!("deserialized: {deserialized:?}");
490        assert_eq!(deserialized.len(), 3);
491        assert_eq!(simple_deserialized, deserialized[0]);
492    }
493
494    #[rstest]
495    #[case(hashmap!("key1".into() => vec![1.into(), 2.into()], "key2".into() => vec!["a".into()]))]
496    #[case(data_value::stdhashmap!("key1" => vec![1, 2], "key2" => vec!["a"]))]
497    #[case(vec![hashmap! {
498        "key1".into() => 1.into(),
499        "key2".into() => "a".into(),
500    },
501    hashmap! {
502        "key1".into() => 2.into(),
503    },])]
504    #[case(vec![data_value::stdhashmap! {
505        "key1" => DataValue::from(1),
506        "key2" => DataValue::from("a"),
507    },data_value::stdhashmap! {
508        "key1" => DataValue::from(2),
509    },])]
510    #[case(vec![("key1".into(), vec! [DataValue::from(1), DataValue::from(2)]), ("key2".into(),
511    vec![DataValue::from("a"), DataValue::Null])])]
512    fn test_select_column<T: Into<DataFrame>>(#[case] input: T) {
513        let df: DataFrame = input.into();
514        assert_eq!(
515            df,
516            DataFrame {
517                constants: HashMap::new(),
518                dataframe: ColumnFrame::from(vec![
519                    hashmap! {
520                        "key1".into() => 1.into(),
521                        "key2".into() => "a".into(),
522                    },
523                    hashmap! {
524                        "key1".into() => 2.into(),
525                    },
526                ]),
527                metadata: HashMap::new(),
528            }
529        );
530        let selected_transposed = df.select_column("key1".into());
531        assert!(selected_transposed.is_some());
532        let selected_transposed = selected_transposed.unwrap();
533        assert_eq!(selected_transposed.len(), 2);
534        assert_eq!(selected_transposed, ndarray::array![1.into(), 2.into()]);
535    }
536
537    #[rstest]
538    #[case::hhm(hashmap!("key1".into() => vec![1.into(), 2.into()], "key2".into() => vec!["a".into()]))]
539    #[case::stdhm(data_value::stdhashmap!("key1" => vec![1, 2], "key2" => vec!["a"]))]
540    #[case::hm({
541        let hm: std::collections::HashMap<String, Array1<DataValue>> = data_value::stdhashmap!("key1".to_string() => Array1::from_vec(vec![DataValue::from(1), DataValue::from(2)]), "key2".to_string() => Array1::from_vec(vec![DataValue::from("a"), DataValue::Null]));
542        hm
543    })]
544    #[case::vec_hhm(vec![hashmap! {
545        "key1".into() => 1.into(),
546        "key2".into() => "a".into(),
547    },
548    hashmap! {
549        "key1".into() => 2.into(),
550    },])]
551    #[case::vec_hme(vec![data_value::stdhashmap! {
552        "key1" => DataValue::from(1),
553        "key2" => DataValue::from("a"),
554    },data_value::stdhashmap! {
555        "key1" => DataValue::from(2),
556    },])]
557    #[case::vec_vec(vec![("key1".into(), vec! [DataValue::from(1), DataValue::from(2)]), ("key2".into(), vec![DataValue::from("a"), DataValue::Null])])]
558    fn test_from_conversion<T: Into<DataFrame>>(#[case] input: T) {
559        let df: DataFrame = input.into();
560        let expected: DataFrame = DataFrame {
561            constants: HashMap::new(),
562            dataframe: ColumnFrame::from(vec![
563                hashmap! {
564                    "key1".into() => 1.into(),
565                    "key2".into() => "a".into(),
566                },
567                hashmap! {
568                    "key1".into() => 2.into(),
569                },
570            ]),
571            metadata: HashMap::new(),
572        };
573        assert_eq!(
574            df.select(Some(&["key1".into(), "key2".into()])),
575            expected.select(Some(&["key1".into(), "key2".into()])),
576            "{df} vs {expected}"
577        );
578        let selected_transposed = df.select_transposed_typed::<i32>(&["key1".into()]);
579        assert_eq!(selected_transposed.len(), 2);
580        println!("{:?}", selected_transposed);
581        assert_eq!(selected_transposed, vec![vec![1], vec![2]]);
582    }
583    #[rstest]
584    fn test_dataframe(dummy_candidates: ColumnFrame) {
585        let mut dataframe: DataFrame = DataFrame::default();
586        assert!(dataframe.is_empty());
587        assert!(dataframe.extend(dummy_candidates.into()).is_ok());
588        assert_eq!(dataframe.len(), 2);
589
590        let candidate = hashmap! {
591            "key1".into() => 3.into(),
592            "key2".into() => "c".into(),
593        };
594
595        assert!(dataframe.push(candidate).is_ok());
596        assert_eq!(dataframe.len(), 3);
597        assert!(!dataframe.is_empty());
598
599        dataframe.insert_constant("key3".into(), 4.into());
600        assert_eq!(dataframe.constants.len(), 1);
601        assert!(dataframe
602            .apply_function(&["key1".into()], |keys, df| {
603                let key = keys[0].clone();
604                let s = df
605                    .get_single_column(&key)
606                    .expect("BUG: Cannot get column")
607                    .to_owned();
608                let s = s.mapv(|x| x + DataValue::from(1));
609                df.add_single_column("key5", s)?;
610                Ok(())
611            })
612            .is_ok());
613        let original = dataframe.clone();
614        dataframe.shrink();
615        let remove_df = dataframe.remove_column(&["key1".into()]);
616        assert!(remove_df.is_ok());
617        let mut remove_df = remove_df.unwrap();
618        assert_eq!(remove_df.len(), 3);
619        let selected = dataframe.select(Some(&["key2".into()]));
620        assert!(selected.is_ok());
621        let selected = selected.unwrap();
622        println!("{:?}", selected);
623        assert_eq!(selected.len(), 3);
624
625        // fixme later
626        let joined_result =
627            remove_df.join(dataframe, &JoinRelation::new(crate::JoinBy::AddColumns));
628        assert!(joined_result.is_ok(), "{:?}", joined_result);
629        assert_eq!(original, remove_df);
630    }
631
632    #[rstest]
633    fn test_metadata(dummy_candidates: ColumnFrame) {
634        let mut dataframe: DataFrame = DataFrame::default();
635        assert!(dataframe.is_empty());
636        println!("{:?}", dataframe);
637        assert!(dataframe.extend(dummy_candidates.into()).is_ok());
638        println!("{:?}", dataframe);
639        assert_eq!(dataframe.len(), 2);
640
641        dataframe.add_metadata("test".into(), 1.into());
642        assert_eq!(dataframe.get_metadata("test"), Some(&1.into()));
643        let dataframe = DataFrame::new(ColumnFrame::from(vec![
644            hashmap! {
645                "key1".into() => 1.into(),
646                "key2".into() => "a".into(),
647            },
648            hashmap! {
649                "key1".into() => 2.into(),
650                "key2".into() => "b".into(),
651            },
652        ]));
653        assert_eq!(dataframe.get_metadata("test"), None);
654        let tt = dataframe.select_transposed(None);
655        assert!(tt.is_ok());
656        let tt = tt.unwrap();
657        assert_eq!(tt.shape(), [2, 2]);
658        assert_eq!(
659            tt,
660            Array2::from_shape_vec((2, 2), vec![1.into(), 2.into(), "a".into(), "b".into()])
661                .unwrap()
662        );
663    }
664
665    #[rstest]
666    #[traced_test]
667    fn add_single_column_test() {
668        let mut dataframe = DataFrame::default();
669        let values = Array1::from(vec![1.into(), 2.into(), 3.into()]);
670        let r = dataframe.add_single_column("key1", values);
671        assert!(r.is_ok(), "{r:?}");
672        let selected = dataframe.select(None);
673        assert!(selected.is_ok());
674        let selected = selected.unwrap();
675        assert_eq!(selected.shape(), [3, 1]);
676        assert_eq!(
677            selected,
678            Array2::from_shape_vec((3, 1), vec![1.into(), 2.into(), 3.into()]).unwrap()
679        );
680        let values = Array1::from(vec![1.into(), 2.into()]);
681        assert!(dataframe.add_single_column("key1", values).is_err());
682        let values = Array1::from(vec![3.into(), 4.into(), 5.into()]);
683        assert!(dataframe.add_single_column("key2", values).is_ok());
684        let values = Array1::from(vec![3.into()]);
685        assert!(dataframe.add_single_column("key3", values).is_err());
686    }
687
688    #[rstest]
689    #[traced_test]
690    fn add_single_column_empty_test() {
691        let mut dataframe = DataFrame::default();
692        let values = Array1::from(vec![]);
693        let r = dataframe.add_single_column("key1", values);
694        assert!(r.is_ok(), "{r:?}");
695        let selected = dataframe.select(None);
696        assert!(selected.is_ok());
697        let selected = selected.unwrap();
698        assert_eq!(selected.shape(), [0, 1]);
699        assert_eq!(selected, Array2::from_shape_vec((0, 1), vec![]).unwrap());
700        let values = Array1::from(vec![1.into(), 2.into()]);
701        assert!(dataframe.add_single_column("key1", values).is_err());
702        let values = Array1::from(vec![3.into(), 4.into(), 5.into()]);
703        assert!(dataframe.add_single_column("key2", values).is_ok());
704        let values = Array1::from(vec![3.into(), 4.into()]);
705        assert!(dataframe.add_single_column("key3", values).is_err());
706        let values = Array1::from(vec![3.into(), 4.into(), 5.into()]);
707        assert!(dataframe.add_single_column("key3", values).is_ok());
708
709        assert_eq!(
710            dataframe
711                .select_column("key1".into())
712                .expect("BUG: has to exists"),
713            ndarray::arr1(&[DataValue::Null, DataValue::Null, DataValue::Null]),
714        );
715        assert_eq!(
716            dataframe
717                .select_column("key2".into())
718                .expect("BUG: has to exists"),
719            ndarray::arr1(&[3.into(), 4.into(), 5.into()]),
720        );
721        assert_eq!(
722            dataframe.select(None).expect("BUG: cannot get data"),
723            ndarray::arr2(&[
724                [DataValue::Null, 3.into(), 3.into()],
725                [DataValue::Null, 4.into(), 4.into()],
726                [DataValue::Null, 5.into(), 5.into()],
727            ])
728        );
729    }
730
731    #[rstest]
732    #[case(
733        DataFrame::new(ColumnFrame::from(vec![
734            hashmap! {
735                "k".into() => 1.into(),
736                "k2".into() => 2.into(),
737                "k3".into() => 2.2.into(),
738            },
739            hashmap! {
740                "k".into() => 11.into(),
741                "k2".into() => 3.into(),
742            },
743            hashmap! {
744                "k".into() => 4.into(),
745                "k2".into() => 5.into(),
746                "k3".into() => 2.3.into(),
747            },
748            hashmap! {
749                "k".into() => 4.into(),
750                "k2".into() => 5.into(),
751                "k3".into() => 2.4.into(),
752            },
753        ])),
754        vec!["k".into(), "k2".into()],
755        Array2::from_shape_vec((4, 2), vec![1.into(), 2.into(), 11.into(), 3.into(), 4.into(), 5.into(), 4.into(), 5.into()]).unwrap()
756    )]
757    #[case(
758        DataFrame::new(ColumnFrame::from(vec![
759            hashmap! {
760                "k".into() => 1.into(),
761                "k2".into() => 2.into(),
762                "k3".into() => 2.2.into(),
763            },
764            hashmap! {
765                "k".into() => 11.into(),
766                "k2".into() => 3.into(),
767            },
768            hashmap! {
769                "k".into() => 4.into(),
770                "k2".into() => 5.into(),
771                "k3".into() => 2.3.into(),
772            },
773            hashmap! {
774                "k".into() => 4.into(),
775                "k2".into() => 5.into(),
776                "k3".into() => 2.4.into(),
777            },
778        ])),
779        vec!["k2".into(), "k3".into(), "nonexist1".into(), "nonexists2".into(), "k".into()],
780        Array2::from_shape_vec((4, 5), vec![
781            2.into(), 2.2.into(), DataValue::Null, DataValue::Null, 1.into(),
782            3.into(), DataValue::Null, DataValue::Null, DataValue::Null, 11.into(),
783            5.into(), 2.3.into(),  DataValue::Null, DataValue::Null, 4.into(),
784            5.into(), 2.4.into(), DataValue::Null, DataValue::Null, 4.into()]).unwrap()
785    )]
786    #[traced_test]
787    fn select_multiple(
788        #[case] input: DataFrame,
789        #[case] columns: Vec<Key>,
790        #[case] expected: Array2<DataValue>,
791    ) {
792        let selected = input.select(Some(&columns));
793        assert!(selected.is_ok());
794        let selected = selected.unwrap();
795
796        assert_eq!(selected, expected);
797    }
798
799    #[rstest]
800    #[case(
801        DataFrame::new(ColumnFrame::from(vec![
802            hashmap! {
803                "k".into() => 1.into(),
804                "k2".into() => 2.into(),
805                "k3".into() => 2.2.into(),
806            },
807            hashmap! {
808                "k".into() => 11.into(),
809                "k2".into() => 3.into(),
810            },
811            hashmap! {
812                "k".into() => 4.into(),
813                "k2".into() => 5.into(),
814                "k3".into() => 2.3.into(),
815            },
816            hashmap! {
817                "k".into() => 4.into(),
818                "k2".into() => 5.into(),
819                "k3".into() => 2.4.into(),
820            },
821        ])),
822        "k".into(),
823        Array2::from_shape_vec((4, 3), vec![
824            1.into(), 2.into(), 2.2.into(),
825            4.into(), 5.into(), 2.3.into(),
826            4.into(), 5.into(), 2.4.into(),
827            11.into(), 3.into(), DataValue::Null,
828            ]
829        ).unwrap(),
830        vec!["k".into(), "k2".into(), "k3".into()],
831    )]
832    #[rstest]
833    #[case(
834        DataFrame::new(ColumnFrame::from(vec![
835            hashmap! {
836                "k".into() => 1.into(),
837                "k2".into() => 2.into(),
838                "k3".into() => 2.2.into(),
839            },
840            hashmap! {
841                "k".into() => 11.into(),
842                "k2".into() => 3.into(),
843            },
844            hashmap! {
845                "k".into() => 4.into(),
846                "k2".into() => 5.into(),
847                "k3".into() => 2.3.into(),
848            },
849            hashmap! {
850                "k".into() => 4.into(),
851                "k2".into() => 5.into(),
852                "k3".into() => 2.4.into(),
853            },
854        ])),
855        "k3".into(),
856        Array2::from_shape_vec((4, 3), vec![
857            11.into(), 3.into(), DataValue::Null,
858            1.into(), 2.into(), 2.2.into(),
859            4.into(), 5.into(), 2.3.into(),
860            4.into(), 5.into(), 2.4.into(),
861            ]
862        ).unwrap(),
863        vec!["k".into(), "k2".into(), "k3".into()],
864    )]
865    #[case(
866        DataFrame::new(ColumnFrame::from(vec![
867            hashmap! {
868                "k".into() => 2.into(),
869                "k2".into() => 0.000001.into(),
870            },
871            hashmap! {
872                "k".into() => 1.into(),
873                "k2".into() =>0.0000001.into(),
874            },
875            hashmap! {
876                "k".into() => 3.into(),
877                "k2".into() => 0.00001.into(),
878            },
879            hashmap! {
880                "k".into() => 4.into(),
881                "k2".into() => 0.001.into(),
882            },
883        ])),
884        "k2".into(),
885        Array2::from_shape_vec((4, 2), vec![
886            1.into(), 0.0000001.into(),
887            2.into(), 0.000001.into(),
888            3.into(), 0.00001.into(),
889            4.into(), 0.001.into(),
890            ]
891        ).unwrap(),
892        vec!["k".into(), "k2".into()],
893    )]
894    #[case(
895        DataFrame::new(ColumnFrame::from(vec![
896            hashmap! {
897                "k".into() => 2.into(),
898                "k2".into() => "b".into(),
899            },
900            hashmap! {
901                "k".into() => 1.into(),
902                "k2".into() =>"a".into(),
903            },
904            hashmap! {
905                "k".into() => 3.into(),
906                "k2".into() =>"c".into(),
907            },
908            hashmap! {
909                "k".into() => 4.into(),
910                "k2".into() =>"z".into(),
911            },
912        ])),
913        "k2".into(),
914        Array2::from_shape_vec((4, 2), vec![
915            1.into(),"a".into(),
916            2.into(), "b".into(),
917            3.into(), "c".into(),
918            4.into(), "z".into(),
919            ]
920        ).unwrap(),
921        vec!["k".into(), "k2".into()],
922    )]
923    #[traced_test]
924    fn sort_by(
925        #[case] input: DataFrame,
926        #[case] column: Key,
927        #[case] expected: Array2<DataValue>,
928        #[case] columns: Vec<Key>,
929    ) {
930        let result = input.sorted(&column);
931        assert!(result.is_ok(), "{result:?}");
932        let result = result.unwrap().get_sorted();
933        let selected = result.select(Some(&columns));
934
935        assert_eq!(selected, expected);
936    }
937    #[rstest]
938    #[case(
939        DataFrame::new(ColumnFrame::from(vec![
940            hashmap! {
941                "k".into() => 2.into(),
942                "k2".into() => 0.000001.into(),
943            },
944            hashmap! {
945                "k".into() => 1.into(),
946                "k2".into() =>0.0000001.into(),
947            },
948            hashmap! {
949                "k".into() => 3.into(),
950                "k2".into() => 0.00001.into(),
951            },
952            hashmap! {
953                "k".into() => 4.into(),
954                "k2".into() => 0.001.into(),
955            },
956        ])),
957        "k2".into(),
958        TopN::Last(1),
959        Array2::from_shape_vec((1, 2), vec![
960            4.into(), 0.001.into(),
961            ]
962        ).unwrap(),
963        vec!["k".into(), "k2".into()],
964    )]
965    #[case(
966        DataFrame::new(ColumnFrame::from(vec![
967            hashmap! {
968                "k".into() => 2.into(),
969                "k2".into() => 0.000001.into(),
970            },
971            hashmap! {
972                "k".into() => 1.into(),
973                "k2".into() =>0.0000001.into(),
974            },
975            hashmap! {
976                "k".into() => 3.into(),
977                "k2".into() => 0.00001.into(),
978            },
979            hashmap! {
980                "k".into() => 4.into(),
981                "k2".into() => 0.001.into(),
982            },
983        ])),
984        "k2".into(),
985        TopN::Last(2),
986        Array2::from_shape_vec((2, 2), vec![
987            4.into(), 0.001.into(),
988            3.into(), 0.00001.into(),
989            ]
990        ).unwrap(),
991        vec!["k".into(), "k2".into()],
992    )]
993    #[case(
994        DataFrame::new(ColumnFrame::from(vec![
995            hashmap! {
996                "k".into() => 2.into(),
997                "k2".into() => "b".into(),
998            },
999            hashmap! {
1000                "k".into() => 1.into(),
1001                "k2".into() =>"a".into(),
1002            },
1003            hashmap! {
1004                "k".into() => 3.into(),
1005                "k2".into() =>"c".into(),
1006            },
1007            hashmap! {
1008                "k".into() => 4.into(),
1009                "k2".into() =>"z".into(),
1010            },
1011        ])),
1012        "k2".into(),
1013        TopN::First(1),
1014        Array2::from_shape_vec((1, 2), vec![
1015            1.into(),"a".into(),
1016            ]
1017        ).unwrap(),
1018        vec!["k".into(), "k2".into()],
1019    )]
1020    #[case(
1021        DataFrame::new(ColumnFrame::from(vec![
1022            hashmap! {
1023                "k".into() => 2.into(),
1024                "k2".into() => "b".into(),
1025            },
1026            hashmap! {
1027                "k".into() => 1.into(),
1028                "k2".into() =>"a".into(),
1029            },
1030            hashmap! {
1031                "k".into() => 3.into(),
1032                "k2".into() =>"c".into(),
1033            },
1034            hashmap! {
1035                "k".into() => 4.into(),
1036                "k2".into() =>"z".into(),
1037            },
1038        ])),
1039        "k2".into(),
1040        TopN::First(2),
1041        Array2::from_shape_vec((2, 2), vec![
1042            1.into(),"a".into(),
1043            2.into(),"b".into(),
1044            ]
1045        ).unwrap(),
1046        vec!["k".into(), "k2".into()],
1047    )]
1048    #[traced_test]
1049    fn top_n(
1050        #[case] input: DataFrame,
1051        #[case] column: Key,
1052        #[case] topn: TopN,
1053        #[case] expected: Array2<DataValue>,
1054        #[case] columns: Vec<Key>,
1055    ) {
1056        let result = input.sorted(&column);
1057        assert!(result.is_ok(), "{result:?}");
1058        let result = result.unwrap();
1059        let first = result.topn(topn).unwrap();
1060        let selected = first.select(Some(&columns));
1061        assert_eq!(selected, expected);
1062    }
1063}