trs_dataframe/
dataframe.rs

1use column_store::sorted_df::SortedDataFrame;
2use data_value::{DataValue, Extract};
3use halfbrown::HashMap;
4use ndarray::{Array1, Array2, ArrayView1};
5use std::fmt;
6pub mod column_store;
7pub mod index;
8pub mod join;
9pub mod key;
10use crate::{error::Error, CandidateData};
11#[cfg(feature = "python")]
12pub mod python;
13
14#[cfg(feature = "python")]
15use pyo3::prelude::*;
16
17use crate::{
18    dataframe::{column_store::ColumnFrame, join::JoinRelation, key::Key},
19    MLChefMap,
20};
21
/// Carries a count together with the end it is counted from.
///
/// NOTE(review): no use of this enum is visible in this file; presumably
/// `First(n)` / `Last(n)` select the first / last `n` entries of a frame -
/// confirm against the call sites.
#[derive(Debug, Clone, PartialEq, Eq, Copy)]
pub enum TopN {
    /// Count taken from the beginning.
    First(usize),
    /// Count taken from the end.
    Last(usize),
}
27
/// DataFrame holds information about [`ColumnFrame`].
/// This is used to store the data and the metadata for the candidates.
#[derive(Debug, Clone, PartialEq, Default, serde::Serialize, serde::Deserialize)]
#[cfg_attr(feature = "python", pyclass)]
pub struct DataFrame {
    /// Constants for the dataframe - a micro-optimization for the data.
    /// Values which are constant for the whole dataframe are stored here.
    pub constants: HashMap<Key, DataValue>,
    /// Dataframe with the candidates
    //pub dataframe: Candidates<CandidateItem>,
    pub dataframe: ColumnFrame,
    /// Metadata for the dataframe. Here you can store information about the dataframe.
    pub metadata: HashMap<String, DataValue>,
}
42
/// Formats a [`DataFrame`] by forwarding to the `fmt` of the inner [`ColumnFrame`].
impl fmt::Display for DataFrame {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Only the column data is written; `constants` and `metadata` are not read here.
        self.dataframe.fmt(f)
    }
}
48
49impl DataFrame {
50    pub fn new<C: Into<ColumnFrame>>(dataframe: C) -> Self {
51        Self {
52            constants: HashMap::new(),
53            dataframe: dataframe.into(),
54            metadata: HashMap::new(),
55        }
56    }
57
58    pub fn shrink(&mut self) {
59        self.dataframe.shrink();
60    }
61
62    pub fn add_metadata(&mut self, key: String, value: DataValue) {
63        self.metadata.insert(key, value);
64    }
65
66    pub fn get_metadata(&self, key: &str) -> Option<&DataValue> {
67        self.metadata.get(key)
68    }
69
70    pub fn join(&mut self, other: Self, join_type: &JoinRelation) -> Result<(), Error> {
71        other.constants.into_iter().for_each(|(key, value)| {
72            self.constants.insert(key, value);
73        });
74        self.dataframe.join(other.dataframe, join_type)
75    }
76
77    pub fn apply_function<F>(&mut self, keys: &[Key], mut func: F) -> Result<(), Error>
78    where
79        F: FnMut(&[Key], &mut ColumnFrame) -> Result<(), Error>,
80    {
81        self.dataframe.apply_function(keys, &mut func)
82    }
83
84    pub fn select(&self, keys: Option<&[Key]>) -> Result<Array2<DataValue>, Error> {
85        Ok(self.dataframe.select(keys))
86    }
87
88    pub fn select_transposed_typed<D: Extract>(&self, keys: &[Key]) -> Vec<Vec<D>> {
89        self.dataframe.select_transposed_typed::<D>(keys)
90    }
91
92    pub fn select_column(&self, key: Key) -> Option<ndarray::ArrayView1<'_, DataValue>> {
93        self.dataframe.select_column(&key)
94    }
95
96    pub fn select_transposed(&self, keys: Option<&[Key]>) -> Result<Array2<DataValue>, Error> {
97        self.dataframe.select_transposed(keys)
98    }
99
100    pub fn insert_constant(&mut self, key: Key, value: DataValue) {
101        self.constants.insert(key, value);
102    }
103
104    pub fn push<C: CandidateData>(&mut self, item: C) -> Result<(), Error> {
105        self.dataframe.push(item)
106    }
107
108    pub fn remove_column(&mut self, keys: &[Key]) -> Result<Self, Error> {
109        self.dataframe.remove_column(keys).map(|x| x.into())
110    }
111
112    pub fn extend(&mut self, items: Self) -> Result<(), Error> {
113        self.dataframe.extend(items.dataframe)
114    }
115
116    pub fn len(&self) -> usize {
117        self.dataframe.len()
118    }
119
120    pub fn is_empty(&self) -> bool {
121        self.dataframe.is_empty()
122    }
123
124    pub fn add_single_column<K: Into<Key>>(
125        &mut self,
126        key: K,
127        values: Array1<DataValue>,
128    ) -> Result<(), Error> {
129        self.dataframe.add_single_column(key, values)
130    }
131
132    pub fn get_single_column(&self, key: &Key) -> Option<ArrayView1<'_, DataValue>> {
133        self.dataframe.get_single_column(key)
134    }
135
136    pub fn sorted(&self, key: &Key) -> Result<SortedDataFrame<'_>, Error> {
137        self.dataframe.sorted(key)
138    }
139
140    pub fn filter(&self, filter: &crate::filter::FilterRules) -> Result<Self, Error> {
141        let filtered_df = self.dataframe.filter(filter)?;
142        Ok(Self {
143            constants: self.constants.clone(),
144            dataframe: filtered_df,
145            metadata: self.metadata.clone(),
146        })
147    }
148
149    #[cfg(feature = "polars-df")]
150    pub fn as_polars(&self) -> Result<polars::prelude::DataFrame, Error> {
151        let mut columns = vec![];
152        for key in self.dataframe.keys() {
153            columns.push(polars::prelude::Column::new(
154                key.name().into(),
155                self.dataframe
156                    .get_single_column(key)
157                    .ok_or_else(|| Error::NotFound(key.clone()))?
158                    .into_iter()
159                    .map(|x| into_polars_value(x.clone()))
160                    .collect::<Vec<_>>(),
161            ));
162        }
163
164        Ok(polars::prelude::DataFrame::new(columns)?)
165    }
166}
/// Maps a crate [`DataType`](crate::DataType) onto the polars data type used for it.
///
/// `Vec` becomes a `List` whose inner type is `Unknown(Any)`, `Map` becomes a
/// `Struct` without fields, and `Unknown` becomes `Null`.
#[cfg(feature = "polars-df")]
pub fn polars_dtype(dtype: crate::DataType) -> polars::prelude::DataType {
    use crate::DataType as Src;
    use polars::prelude::DataType as Dst;
    match dtype {
        Src::Bool => Dst::Boolean,
        Src::U32 => Dst::UInt32,
        Src::I32 => Dst::Int32,
        Src::U8 => Dst::UInt8,
        Src::U64 => Dst::UInt64,
        Src::I64 => Dst::Int64,
        Src::F32 => Dst::Float32,
        Src::F64 => Dst::Float64,
        Src::String => Dst::String,
        Src::Bytes => Dst::Binary,
        Src::Unknown => Dst::Null,
        Src::Vec => {
            let inner = Dst::Unknown(polars::prelude::UnknownKind::Any);
            Dst::List(Box::new(inner))
        }
        Src::Map => Dst::Struct(vec![]),
    }
}
189
/// Converts an owned [`DataValue`] into an owned polars `AnyValue`.
///
/// * Scalars map onto the polars variant of the same width and signedness
///   (`U8` -> `UInt8`, `I32` -> `Int32`, ...); `EnumNumber` is emitted as `Int32`.
/// * `Vec` becomes a `List` wrapping a series named `"v"` built from the
///   recursively converted elements.
/// * `Map` becomes a `StructOwned` whose fields are ordered by key, with each
///   field's dtype derived via `crate::detect_dtype` and [`polars_dtype`].
#[cfg(feature = "polars-df")]
pub fn into_polars_value(dv: DataValue) -> polars::prelude::AnyValue<'static> {
    use polars::prelude::AnyValue::*;
    use polars::prelude::{Field, NamedFrom};
    match dv {
        DataValue::String(smart_string) => StringOwned(smart_string.as_str().into()),
        DataValue::Bytes(items) => BinaryOwned(items),
        // Keep the width: `polars_dtype` maps `U8` to `UInt8` and
        // `from_polars_value` maps `UInt8` back to `U8`.
        DataValue::U8(x) => UInt8(x),
        DataValue::Bool(x) => Boolean(x),
        DataValue::I32(x) => Int32(x),
        DataValue::U32(x) => UInt32(x),
        DataValue::I64(x) => Int64(x),
        DataValue::U64(x) => UInt64(x),
        DataValue::I128(x) => Int128(x),
        DataValue::F32(x) => Float32(x),
        DataValue::F64(x) => Float64(x),
        DataValue::Null => Null,
        DataValue::Vec(data_values) => List(polars::series::Series::new(
            "v".into(),
            data_values
                .into_iter()
                .map(into_polars_value)
                .collect::<Vec<_>>(),
        )),
        DataValue::EnumNumber(x) => Int32(x),
        DataValue::U128(x) => UInt128(x),
        DataValue::Map(x) => {
            // Sort the entries by key so the field order does not depend on
            // the map's iteration order.
            let mut entries = x.iter().collect::<Vec<_>>();
            entries.sort_unstable_by(|a, b| a.0.cmp(b.0));

            let mut values = Vec::with_capacity(entries.len());
            let mut fields = Vec::with_capacity(entries.len());
            for (key, value) in entries {
                let dtype = crate::detect_dtype(value);
                values.push(into_polars_value(value.clone()));
                fields.push(Field::new(key.as_str().into(), polars_dtype(dtype)));
            }
            StructOwned(Box::new((values, fields)))
        }
    }
}
233
/// Converts a polars `AnyValue` into a [`DataValue`].
///
/// `UInt16` is widened to `U32`, `Int8`/`Int16` are widened to `i32` before
/// conversion, lists are converted element by element and owned structs
/// become a `Map` keyed by field name. Any variant not listed below is logged
/// with `tracing::warn!` and converted to `DataValue::Null`.
#[cfg(feature = "polars-df")]
pub fn from_polars_value(dv: polars::prelude::AnyValue<'_>) -> DataValue {
    use polars::prelude::AnyValue as Any;
    match dv {
        Any::Null => DataValue::Null,
        Any::Boolean(flag) => flag.into(),
        Any::String(text) => DataValue::String(text.into()),
        Any::UInt8(n) => DataValue::U8(n),
        Any::UInt16(n) => DataValue::U32(n as u32),
        Any::UInt32(n) => n.into(),
        Any::UInt64(n) => n.into(),
        Any::Int8(n) => (n as i32).into(),
        Any::Int16(n) => (n as i32).into(),
        Any::Int32(n) => n.into(),
        Any::Int64(n) => n.into(),
        Any::Float32(n) => n.into(),
        Any::Float64(n) => n.into(),
        Any::Int128(n) => n.into(),
        Any::List(series) => {
            let items = series.iter().map(from_polars_value).collect::<Vec<_>>();
            DataValue::Vec(items)
        }
        // `Array(series, _)` is intentionally not handled here and falls
        // through to the catch-all arm below.
        Any::StringOwned(text) => DataValue::String(text.as_str().into()),
        Any::Binary(bytes) => DataValue::Bytes(bytes.to_owned()),
        Any::BinaryOwned(bytes) => DataValue::Bytes(bytes),
        Any::StructOwned(m) => {
            // `m.1` holds the fields and `m.0` the values, pairwise aligned.
            let hm: std::collections::HashMap<smartstring::alias::String, DataValue> = m
                .1
                .into_iter()
                .zip(m.0.into_iter())
                .map(|(field, value)| (field.name.as_str().into(), from_polars_value(value)))
                .collect();
            DataValue::Map(hm)
        }
        e => {
            tracing::warn!("Unsupported polars value: {e:?}");
            DataValue::Null
        }
    }
}
273
274impl From<ColumnFrame> for DataFrame {
275    fn from(dataframe: ColumnFrame) -> Self {
276        Self::new(dataframe)
277    }
278}
279
280impl From<Vec<std::collections::HashMap<Key, DataValue>>> for DataFrame {
281    fn from(dataframe: Vec<std::collections::HashMap<Key, DataValue>>) -> Self {
282        Self::new(ColumnFrame::from(dataframe))
283    }
284}
285
286impl From<Vec<HashMap<Key, DataValue>>> for DataFrame {
287    fn from(dataframe: Vec<HashMap<Key, DataValue>>) -> Self {
288        Self::new(ColumnFrame::from(dataframe))
289    }
290}
291
292impl From<std::collections::HashMap<String, Vec<DataValue>>> for DataFrame {
293    fn from(dataframe: std::collections::HashMap<String, Vec<DataValue>>) -> Self {
294        Self::new(ColumnFrame::from(dataframe))
295    }
296}
297
298impl From<MLChefMap> for DataFrame {
299    fn from(dataframe: MLChefMap) -> Self {
300        Self::new(ColumnFrame::from(dataframe))
301    }
302}
303impl From<Vec<(Key, Vec<DataValue>)>> for DataFrame {
304    fn from(dataframe: Vec<(Key, Vec<DataValue>)>) -> Self {
305        Self::new(ColumnFrame::from(dataframe))
306    }
307}
308
309impl From<std::collections::HashMap<String, Array1<DataValue>>> for DataFrame {
310    fn from(dataframe: std::collections::HashMap<String, Array1<DataValue>>) -> Self {
311        Self::new(ColumnFrame::from(dataframe))
312    }
313}
314
/// Builds a dataframe from a polars dataframe by first converting it into a
/// [`ColumnFrame`].
#[cfg(feature = "polars-df")]
impl From<polars::prelude::DataFrame> for DataFrame {
    fn from(dataframe: polars::prelude::DataFrame) -> Self {
        let frame: ColumnFrame = dataframe.into();
        Self::new(frame)
    }
}
321#[cfg(test)]
322mod test {
323    use crate::filter::FilterRules;
324
325    use super::*;
326    use halfbrown::hashmap;
327    #[cfg(feature = "polars-df")]
328    use polars::prelude::NamedFrom as _;
329    use rstest::*;
330    use tracing_test::traced_test;
331    #[fixture]
332    fn dummy_candidates() -> ColumnFrame {
333        ColumnFrame::from(vec![
334            hashmap! {
335                "key1".into() => 1.into(),
336                "key2".into() => "a".into(),
337            },
338            hashmap! {
339                "key1".into() => 2.into(),
340                "key2".into() => "b".into(),
341            },
342        ])
343    }
344
345    #[rstest]
346    fn test_serde() {
347        let df = crate::df! {
348            "a" => [1u64, 2u64, 3u64],
349            "b" => [4u64, 5u64, 6u64],
350            "c" => [7u64, 8u64, 9u64]
351        };
352
353        let serialized = serde_json::to_string(&df).expect("BUG: Unable to serialize dataframe");
354
355        let deserialized =
356            serde_json::from_str(&serialized).expect("BUG: Unable to deserialize dataframe");
357
358        assert_eq!(df, deserialized);
359    }
360
361    #[cfg(feature = "polars-df")]
362    #[rstest]
363    fn test_polars() {
364        let expected = crate::df! {
365            "a" => [1u64, 2u64, 3u64],
366            "b" => [4f64, 5f64, 6f64],
367            "c" => [7i64, 8i64, 9i64]
368        };
369
370        let polars_df = polars::df!(
371            "a" => [1u64, 2u64, 3u64],
372            "b" => [4f64, 5f64, 6f64],
373            "c" => [7i64, 8i64, 9i64]
374        )
375        .expect("BUG: should be ok");
376        let as_df: DataFrame = polars_df.into();
377        let keys: Vec<Key> = vec!["a".into(), "b".into(), "c".into()];
378        assert_eq!(
379            as_df.select(Some(keys.as_slice())),
380            expected.select(Some(keys.as_slice()))
381        );
382    }
383
384    #[cfg(feature = "polars-df")]
385    #[rstest]
386    #[case::str(DataValue::String("test".into()), polars::prelude::AnyValue::String("test".into()))]
387    #[case::u32(DataValue::U32(u32::MAX), polars::prelude::AnyValue::UInt32(u32::MAX))]
388    #[case::i32(DataValue::I32(i32::MIN), polars::prelude::AnyValue::Int32(i32::MIN))]
389    #[case::i64(DataValue::I64(i64::MIN), polars::prelude::AnyValue::Int64(i64::MIN))]
390    #[case::u64(DataValue::U64(u64::MIN), polars::prelude::AnyValue::UInt64(u64::MIN))]
391    #[case::f32(DataValue::F32(f32::MIN), polars::prelude::AnyValue::Float32(f32::MIN))]
392    #[case::f64(DataValue::F64(f64::MIN), polars::prelude::AnyValue::Float64(f64::MIN))]
393    #[case::null(DataValue::Null, polars::prelude::AnyValue::Null)]
394    #[case::i128(
395        DataValue::I128(i128::MIN),
396        polars::prelude::AnyValue::Int128(i128::MIN)
397    )]
398    #[case::u8(DataValue::U8(255), polars::prelude::AnyValue::UInt8(255))]
399    #[case::bool(DataValue::Bool(true), polars::prelude::AnyValue::Boolean(true))]
400    #[case::bytes(DataValue::Bytes("aaaaa".as_bytes().to_vec()), polars::prelude::AnyValue::BinaryOwned("aaaaa".as_bytes().to_vec()))]
401    #[case::vec_uints(DataValue::Vec(vec![DataValue::U32(0), DataValue::U32(1)]), polars::prelude::AnyValue::List(polars::series::Series::new("v".into(), vec![polars::prelude::AnyValue::UInt32(0u32), polars::prelude::AnyValue::UInt32(1)])))]
402    #[case::map(DataValue::Map(data_value::stdhashmap!("a" => 0u64, "b" => "s")), polars::prelude::AnyValue::StructOwned(Box::new((
403        vec![polars::prelude::AnyValue::UInt64(0u64), polars::prelude::AnyValue::String("s".into())],
404        vec![polars::prelude::Field::new("a".into(), polars::prelude::DataType::UInt64), polars::prelude::Field::new("b".into(), polars::prelude::DataType::String)]))))]
405    // polars converts all by first element type
406    // #[case::vec_diff_int(DataValue::Vec(vec![ DataValue::I32(1), DataValue::U32(0)]), polars::prelude::AnyValue::List(polars::series::Series::new("v".into(), vec![polars::prelude::AnyValue::Int32(1i32), polars::prelude::AnyValue::UInt32(0u32)])))]
407    //#[case::vec_int_str(DataValue::Vec(vec![DataValue::U32(0), DataValue::String("1".into())]), polars::prelude::AnyValue::List(polars::series::Series::new("v".into(), vec![polars::prelude::AnyValue::UInt32(0u32), polars::prelude::AnyValue::StringOwned("1".into())])))]
408    fn into_polars_value_test(
409        #[case] input: DataValue,
410        #[case] output: polars::prelude::AnyValue<'static>,
411    ) {
412        assert_eq!(into_polars_value(input.clone()), output);
413        assert_eq!(from_polars_value(output), input);
414    }
415
416    #[rstest]
417    #[case(
418        DataFrame::new(crate::column_frame! {
419            "a" => [1f64, 2f64, 3f64],
420            "b" => [4i64, 5i64, 6i64],
421            "c" => [7i64, 8i64, 9i64]
422        }),
423        DataFrame::new(crate::column_frame! {
424            "a" => [1f64, 2f64],
425            "b" => [4i64, 5i64],
426            "c" => [7i64, 8i64]
427        }),
428        FilterRules::try_from("a >= 1f64 && (b <= 5 || c <= 8) && b >= 4").expect("BUG: cannot create filter rules"),
429    )]
430    #[case(
431        DataFrame::new(crate::column_frame! {
432            "a" => [1f64, 2f64, 3f64],
433            "b" => [4i64, 5i64, 6i64],
434            "c" => [7i64, 8i64, 9i64]
435        }),
436        DataFrame::new(crate::column_frame! {
437            "a" => [2f64],
438            "b" => [5i64],
439            "c" => [8i64]
440        }),
441        FilterRules::try_from("a % 2f64 == 0f64").expect("BUG: cannot create filter rules"),
442    )]
443    #[traced_test]
444    fn filter_test(
445        #[case] df: DataFrame,
446        #[case] expected: DataFrame,
447        #[case] filter: FilterRules,
448    ) {
449        let filtered = df.filter(&filter).expect("BUG: cannot filter");
450        assert_eq!(filtered, expected);
451    }
452
453    #[rstest]
    fn test_serde_complex() {
        // Hand-written JSON document for one dataframe: six keys (`a`..`f`),
        // a data block of dimension [2, 6] and a `null` in the last cell.
        let simple = r#"
{
    "constants": {},
    "dataframe": {
        "index": {
            "keys": [
                {
                    "key": 3162770485,
                    "name": "a",
                    "ctype": "U32"
                },
                {
                    "key": 2279056742,
                    "name": "b",
                    "ctype": "F64"
                },
                {
                    "key": 2994984227,
                    "name": "c",
                    "ctype": "U64"
                },
                {
                    "key": 3319645144,
                    "name": "d",
                    "ctype": "F64"
                },
                {
                    "key": 1291847470,
                    "name": "e",
                    "ctype": "U32"
                },
                {
                    "key": 874241070,
                    "name": "f",
                    "ctype": "Bool"
                }
            ],
            "indexes": {
                "a": 0,
                "b": 1,
                "c": 2,
                "d": 3,
                "e": 4,
                "f": 5
            },
            "alias": {}
        },
        "data_frame": {
            "v": 1,
            "dim": [
                2,
                6
            ],
            "data": [
                253780,
                0.009369421750307085,
                1633222860381359,
                8,
                5,
                true,
                64512,
                0.003391335718333721,
                1633222860810557,
                8,
                5,
                null
            ]
        }
    },
    "metadata": {}
}
        "#;

        // The document must deserialize on its own ...
        let simple_deserialized: DataFrame =
            serde_json::from_str(simple).expect("BUG: Unable to deserialize dataframe");

        println!("deserialized: {simple_deserialized:?}");
        // ... and also when embedded three times in a JSON array.
        let array = format!("[{}, {}, {}]", simple, simple, simple);
        let deserialized: Vec<DataFrame> =
            serde_json::from_str(&array).expect("BUG: Unable to deserialize dataframe");

        println!("deserialized: {deserialized:?}");
        assert_eq!(deserialized.len(), 3);
        // An array element equals the standalone frame.
        assert_eq!(simple_deserialized, deserialized[0]);
    }
540
541    #[rstest]
542    #[case(hashmap!("key1".into() => vec![1.into(), 2.into()], "key2".into() => vec!["a".into()]))]
543    #[case(data_value::stdhashmap!("key1" => vec![1, 2], "key2" => vec!["a"]))]
544    #[case(vec![hashmap! {
545        "key1".into() => 1.into(),
546        "key2".into() => "a".into(),
547    },
548    hashmap! {
549        "key1".into() => 2.into(),
550    },])]
551    #[case(vec![data_value::stdhashmap! {
552        "key1" => DataValue::from(1),
553        "key2" => DataValue::from("a"),
554    },data_value::stdhashmap! {
555        "key1" => DataValue::from(2),
556    },])]
557    #[case(vec![("key1".into(), vec! [DataValue::from(1), DataValue::from(2)]), ("key2".into(),
558    vec![DataValue::from("a"), DataValue::Null])])]
559    fn test_select_column<T: Into<DataFrame>>(#[case] input: T) {
560        let df: DataFrame = input.into();
561        assert_eq!(
562            df,
563            DataFrame {
564                constants: HashMap::new(),
565                dataframe: ColumnFrame::from(vec![
566                    hashmap! {
567                        "key1".into() => 1.into(),
568                        "key2".into() => "a".into(),
569                    },
570                    hashmap! {
571                        "key1".into() => 2.into(),
572                    },
573                ]),
574                metadata: HashMap::new(),
575            }
576        );
577        let selected_transposed = df.select_column("key1".into());
578        assert!(selected_transposed.is_some());
579        let selected_transposed = selected_transposed.unwrap();
580        assert_eq!(selected_transposed.len(), 2);
581        assert_eq!(selected_transposed, ndarray::array![1.into(), 2.into()]);
582    }
583
584    #[rstest]
585    #[case::hhm(hashmap!("key1".into() => vec![1.into(), 2.into()], "key2".into() => vec!["a".into()]))]
586    #[case::stdhm(data_value::stdhashmap!("key1" => vec![1, 2], "key2" => vec!["a"]))]
587    #[case::hm({
588        let hm: std::collections::HashMap<String, Array1<DataValue>> = data_value::stdhashmap!("key1".to_string() => Array1::from_vec(vec![DataValue::from(1), DataValue::from(2)]), "key2".to_string() => Array1::from_vec(vec![DataValue::from("a"), DataValue::Null]));
589        hm
590    })]
591    #[case::vec_hhm(vec![hashmap! {
592        "key1".into() => 1.into(),
593        "key2".into() => "a".into(),
594    },
595    hashmap! {
596        "key1".into() => 2.into(),
597    },])]
598    #[case::vec_hme(vec![data_value::stdhashmap! {
599        "key1" => DataValue::from(1),
600        "key2" => DataValue::from("a"),
601    },data_value::stdhashmap! {
602        "key1" => DataValue::from(2),
603    },])]
604    #[case::vec_vec(vec![("key1".into(), vec! [DataValue::from(1), DataValue::from(2)]), ("key2".into(), vec![DataValue::from("a"), DataValue::Null])])]
605    fn test_from_conversion<T: Into<DataFrame>>(#[case] input: T) {
606        let df: DataFrame = input.into();
607        let expected: DataFrame = DataFrame {
608            constants: HashMap::new(),
609            dataframe: ColumnFrame::from(vec![
610                hashmap! {
611                    "key1".into() => 1.into(),
612                    "key2".into() => "a".into(),
613                },
614                hashmap! {
615                    "key1".into() => 2.into(),
616                },
617            ]),
618            metadata: HashMap::new(),
619        };
620        assert_eq!(
621            df.select(Some(&["key1".into(), "key2".into()])),
622            expected.select(Some(&["key1".into(), "key2".into()])),
623            "{df} vs {expected}"
624        );
625        let selected_transposed = df.select_transposed_typed::<i32>(&["key1".into()]);
626        assert_eq!(selected_transposed.len(), 2);
627        println!("{:?}", selected_transposed);
628        assert_eq!(selected_transposed, vec![vec![1], vec![2]]);
629    }
630    #[rstest]
    fn test_dataframe(dummy_candidates: ColumnFrame) {
        // Start from an empty frame and extend it with the two fixture rows.
        let mut dataframe: DataFrame = DataFrame::default();
        assert!(dataframe.is_empty());
        assert!(dataframe.extend(dummy_candidates.into()).is_ok());
        assert_eq!(dataframe.len(), 2);

        // Pushing a single candidate adds a third row.
        let candidate = hashmap! {
            "key1".into() => 3.into(),
            "key2".into() => "c".into(),
        };

        assert!(dataframe.push(candidate).is_ok());
        assert_eq!(dataframe.len(), 3);
        assert!(!dataframe.is_empty());

        // Constants are tracked separately from the rows.
        dataframe.insert_constant("key3".into(), 4.into());
        assert_eq!(dataframe.constants.len(), 1);
        // Derive a new column `key5` as `key1 + 1` through `apply_function`.
        assert!(dataframe
            .apply_function(&["key1".into()], |keys, df| {
                let key = keys[0].clone();
                let s = df
                    .get_single_column(&key)
                    .expect("BUG: Cannot get column")
                    .to_owned();
                let s = s.mapv(|x| x + DataValue::from(1));
                df.add_single_column("key5", s)?;
                Ok(())
            })
            .is_ok());
        // Snapshot taken before `shrink` and `remove_column` mutate the frame.
        let original = dataframe.clone();
        dataframe.shrink();
        let remove_df = dataframe.remove_column(&["key1".into()]);
        assert!(remove_df.is_ok());
        let mut remove_df = remove_df.unwrap();
        assert_eq!(remove_df.len(), 3);
        let selected = dataframe.select(Some(&["key2".into()]));
        assert!(selected.is_ok());
        let selected = selected.unwrap();
        println!("{:?}", selected);
        assert_eq!(selected.len(), 3);

        // fixme later
        // Joining the remaining frame back onto the result of `remove_column`
        // is expected to reproduce the snapshot taken above.
        let joined_result =
            remove_df.join(dataframe, &JoinRelation::new(crate::JoinBy::AddColumns));
        assert!(joined_result.is_ok(), "{:?}", joined_result);
        assert_eq!(original, remove_df);
    }
678
679    #[rstest]
680    fn test_metadata(dummy_candidates: ColumnFrame) {
681        let mut dataframe: DataFrame = DataFrame::default();
682        assert!(dataframe.is_empty());
683        println!("{:?}", dataframe);
684        assert!(dataframe.extend(dummy_candidates.into()).is_ok());
685        println!("{:?}", dataframe);
686        assert_eq!(dataframe.len(), 2);
687
688        dataframe.add_metadata("test".into(), 1.into());
689        assert_eq!(dataframe.get_metadata("test"), Some(&1.into()));
690        let dataframe = DataFrame::new(ColumnFrame::from(vec![
691            hashmap! {
692                "key1".into() => 1.into(),
693                "key2".into() => "a".into(),
694            },
695            hashmap! {
696                "key1".into() => 2.into(),
697                "key2".into() => "b".into(),
698            },
699        ]));
700        assert_eq!(dataframe.get_metadata("test"), None);
701        let tt = dataframe.select_transposed(None);
702        assert!(tt.is_ok());
703        let tt = tt.unwrap();
704        assert_eq!(tt.shape(), [2, 2]);
705        assert_eq!(
706            tt,
707            Array2::from_shape_vec((2, 2), vec![1.into(), 2.into(), "a".into(), "b".into()])
708                .unwrap()
709        );
710    }
711
712    #[rstest]
713    #[traced_test]
714    fn add_single_column_test() {
715        let mut dataframe = DataFrame::default();
716        let values = Array1::from(vec![1.into(), 2.into(), 3.into()]);
717        let r = dataframe.add_single_column("key1", values);
718        assert!(r.is_ok(), "{r:?}");
719        let selected = dataframe.select(None);
720        assert!(selected.is_ok());
721        let selected = selected.unwrap();
722        assert_eq!(selected.shape(), [3, 1]);
723        assert_eq!(
724            selected,
725            Array2::from_shape_vec((3, 1), vec![1.into(), 2.into(), 3.into()]).unwrap()
726        );
727        let values = Array1::from(vec![1.into(), 2.into()]);
728        assert!(dataframe.add_single_column("key1", values).is_err());
729        let values = Array1::from(vec![3.into(), 4.into(), 5.into()]);
730        assert!(dataframe.add_single_column("key2", values).is_ok());
731        let values = Array1::from(vec![3.into()]);
732        assert!(dataframe.add_single_column("key3", values).is_err());
733    }
734
735    #[rstest]
736    #[traced_test]
737    fn add_single_column_empty_test() {
738        let mut dataframe = DataFrame::default();
739        let values = Array1::from(vec![]);
740        let r = dataframe.add_single_column("key1", values);
741        assert!(r.is_ok(), "{r:?}");
742        let selected = dataframe.select(None);
743        assert!(selected.is_ok());
744        let selected = selected.unwrap();
745        assert_eq!(selected.shape(), [0, 1]);
746        assert_eq!(selected, Array2::from_shape_vec((0, 1), vec![]).unwrap());
747        let values = Array1::from(vec![1.into(), 2.into()]);
748        assert!(dataframe.add_single_column("key1", values).is_err());
749        let values = Array1::from(vec![3.into(), 4.into(), 5.into()]);
750        assert!(dataframe.add_single_column("key2", values).is_ok());
751        let values = Array1::from(vec![3.into(), 4.into()]);
752        assert!(dataframe.add_single_column("key3", values).is_err());
753        let values = Array1::from(vec![3.into(), 4.into(), 5.into()]);
754        assert!(dataframe.add_single_column("key3", values).is_ok());
755
756        assert_eq!(
757            dataframe
758                .select_column("key1".into())
759                .expect("BUG: has to exists"),
760            ndarray::arr1(&[DataValue::Null, DataValue::Null, DataValue::Null]),
761        );
762        assert_eq!(
763            dataframe
764                .select_column("key2".into())
765                .expect("BUG: has to exists"),
766            ndarray::arr1(&[3.into(), 4.into(), 5.into()]),
767        );
768        assert_eq!(
769            dataframe.select(None).expect("BUG: cannot get data"),
770            ndarray::arr2(&[
771                [DataValue::Null, 3.into(), 3.into()],
772                [DataValue::Null, 4.into(), 4.into()],
773                [DataValue::Null, 5.into(), 5.into()],
774            ])
775        );
776    }
777
778    #[rstest]
779    #[case(
780        DataFrame::new(ColumnFrame::from(vec![
781            hashmap! {
782                "k".into() => 1.into(),
783                "k2".into() => 2.into(),
784                "k3".into() => 2.2.into(),
785            },
786            hashmap! {
787                "k".into() => 11.into(),
788                "k2".into() => 3.into(),
789            },
790            hashmap! {
791                "k".into() => 4.into(),
792                "k2".into() => 5.into(),
793                "k3".into() => 2.3.into(),
794            },
795            hashmap! {
796                "k".into() => 4.into(),
797                "k2".into() => 5.into(),
798                "k3".into() => 2.4.into(),
799            },
800        ])),
801        vec!["k".into(), "k2".into()],
802        Array2::from_shape_vec((4, 2), vec![1.into(), 2.into(), 11.into(), 3.into(), 4.into(), 5.into(), 4.into(), 5.into()]).unwrap()
803    )]
804    #[case(
805        DataFrame::new(ColumnFrame::from(vec![
806            hashmap! {
807                "k".into() => 1.into(),
808                "k2".into() => 2.into(),
809                "k3".into() => 2.2.into(),
810            },
811            hashmap! {
812                "k".into() => 11.into(),
813                "k2".into() => 3.into(),
814            },
815            hashmap! {
816                "k".into() => 4.into(),
817                "k2".into() => 5.into(),
818                "k3".into() => 2.3.into(),
819            },
820            hashmap! {
821                "k".into() => 4.into(),
822                "k2".into() => 5.into(),
823                "k3".into() => 2.4.into(),
824            },
825        ])),
826        vec!["k2".into(), "k3".into(), "nonexist1".into(), "nonexists2".into(), "k".into()],
827        Array2::from_shape_vec((4, 5), vec![
828            2.into(), 2.2.into(), DataValue::Null, DataValue::Null, 1.into(),
829            3.into(), DataValue::Null, DataValue::Null, DataValue::Null, 11.into(),
830            5.into(), 2.3.into(),  DataValue::Null, DataValue::Null, 4.into(),
831            5.into(), 2.4.into(), DataValue::Null, DataValue::Null, 4.into()]).unwrap()
832    )]
833    #[traced_test]
834    fn select_multiple(
835        #[case] input: DataFrame,
836        #[case] columns: Vec<Key>,
837        #[case] expected: Array2<DataValue>,
838    ) {
839        let selected = input.select(Some(&columns));
840        assert!(selected.is_ok());
841        let selected = selected.unwrap();
842
843        assert_eq!(selected, expected);
844    }
845
846    #[rstest]
847    #[case(
848        DataFrame::new(ColumnFrame::from(vec![
849            hashmap! {
850                "k".into() => 1.into(),
851                "k2".into() => 2.into(),
852                "k3".into() => 2.2.into(),
853            },
854            hashmap! {
855                "k".into() => 11.into(),
856                "k2".into() => 3.into(),
857            },
858            hashmap! {
859                "k".into() => 4.into(),
860                "k2".into() => 5.into(),
861                "k3".into() => 2.3.into(),
862            },
863            hashmap! {
864                "k".into() => 4.into(),
865                "k2".into() => 5.into(),
866                "k3".into() => 2.4.into(),
867            },
868        ])),
869        "k".into(),
870        Array2::from_shape_vec((4, 3), vec![
871            1.into(), 2.into(), 2.2.into(),
872            4.into(), 5.into(), 2.3.into(),
873            4.into(), 5.into(), 2.4.into(),
874            11.into(), 3.into(), DataValue::Null,
875            ]
876        ).unwrap(),
877        vec!["k".into(), "k2".into(), "k3".into()],
878    )]
879    #[rstest]
880    #[case(
881        DataFrame::new(ColumnFrame::from(vec![
882            hashmap! {
883                "k".into() => 1.into(),
884                "k2".into() => 2.into(),
885                "k3".into() => 2.2.into(),
886            },
887            hashmap! {
888                "k".into() => 11.into(),
889                "k2".into() => 3.into(),
890            },
891            hashmap! {
892                "k".into() => 4.into(),
893                "k2".into() => 5.into(),
894                "k3".into() => 2.3.into(),
895            },
896            hashmap! {
897                "k".into() => 4.into(),
898                "k2".into() => 5.into(),
899                "k3".into() => 2.4.into(),
900            },
901        ])),
902        "k3".into(),
903        Array2::from_shape_vec((4, 3), vec![
904            11.into(), 3.into(), DataValue::Null,
905            1.into(), 2.into(), 2.2.into(),
906            4.into(), 5.into(), 2.3.into(),
907            4.into(), 5.into(), 2.4.into(),
908            ]
909        ).unwrap(),
910        vec!["k".into(), "k2".into(), "k3".into()],
911    )]
912    #[case(
913        DataFrame::new(ColumnFrame::from(vec![
914            hashmap! {
915                "k".into() => 2.into(),
916                "k2".into() => 0.000001.into(),
917            },
918            hashmap! {
919                "k".into() => 1.into(),
920                "k2".into() =>0.0000001.into(),
921            },
922            hashmap! {
923                "k".into() => 3.into(),
924                "k2".into() => 0.00001.into(),
925            },
926            hashmap! {
927                "k".into() => 4.into(),
928                "k2".into() => 0.001.into(),
929            },
930        ])),
931        "k2".into(),
932        Array2::from_shape_vec((4, 2), vec![
933            1.into(), 0.0000001.into(),
934            2.into(), 0.000001.into(),
935            3.into(), 0.00001.into(),
936            4.into(), 0.001.into(),
937            ]
938        ).unwrap(),
939        vec!["k".into(), "k2".into()],
940    )]
941    #[case(
942        DataFrame::new(ColumnFrame::from(vec![
943            hashmap! {
944                "k".into() => 2.into(),
945                "k2".into() => "b".into(),
946            },
947            hashmap! {
948                "k".into() => 1.into(),
949                "k2".into() =>"a".into(),
950            },
951            hashmap! {
952                "k".into() => 3.into(),
953                "k2".into() =>"c".into(),
954            },
955            hashmap! {
956                "k".into() => 4.into(),
957                "k2".into() =>"z".into(),
958            },
959        ])),
960        "k2".into(),
961        Array2::from_shape_vec((4, 2), vec![
962            1.into(),"a".into(),
963            2.into(), "b".into(),
964            3.into(), "c".into(),
965            4.into(), "z".into(),
966            ]
967        ).unwrap(),
968        vec!["k".into(), "k2".into()],
969    )]
970    #[traced_test]
971    fn sort_by(
972        #[case] input: DataFrame,
973        #[case] column: Key,
974        #[case] expected: Array2<DataValue>,
975        #[case] columns: Vec<Key>,
976    ) {
977        let result = input.sorted(&column);
978        assert!(result.is_ok(), "{result:?}");
979        let result = result.unwrap().get_sorted();
980        let selected = result.select(Some(&columns));
981
982        assert_eq!(selected, expected);
983    }
984    #[rstest]
985    #[case(
986        DataFrame::new(ColumnFrame::from(vec![
987            hashmap! {
988                "k".into() => 2.into(),
989                "k2".into() => 0.000001.into(),
990            },
991            hashmap! {
992                "k".into() => 1.into(),
993                "k2".into() =>0.0000001.into(),
994            },
995            hashmap! {
996                "k".into() => 3.into(),
997                "k2".into() => 0.00001.into(),
998            },
999            hashmap! {
1000                "k".into() => 4.into(),
1001                "k2".into() => 0.001.into(),
1002            },
1003        ])),
1004        "k2".into(),
1005        TopN::Last(1),
1006        Array2::from_shape_vec((1, 2), vec![
1007            4.into(), 0.001.into(),
1008            ]
1009        ).unwrap(),
1010        vec!["k".into(), "k2".into()],
1011    )]
1012    #[case(
1013        DataFrame::new(ColumnFrame::from(vec![
1014            hashmap! {
1015                "k".into() => 2.into(),
1016                "k2".into() => 0.000001.into(),
1017            },
1018            hashmap! {
1019                "k".into() => 1.into(),
1020                "k2".into() =>0.0000001.into(),
1021            },
1022            hashmap! {
1023                "k".into() => 3.into(),
1024                "k2".into() => 0.00001.into(),
1025            },
1026            hashmap! {
1027                "k".into() => 4.into(),
1028                "k2".into() => 0.001.into(),
1029            },
1030        ])),
1031        "k2".into(),
1032        TopN::Last(2),
1033        Array2::from_shape_vec((2, 2), vec![
1034            4.into(), 0.001.into(),
1035            3.into(), 0.00001.into(),
1036            ]
1037        ).unwrap(),
1038        vec!["k".into(), "k2".into()],
1039    )]
1040    #[case(
1041        DataFrame::new(ColumnFrame::from(vec![
1042            hashmap! {
1043                "k".into() => 2.into(),
1044                "k2".into() => "b".into(),
1045            },
1046            hashmap! {
1047                "k".into() => 1.into(),
1048                "k2".into() =>"a".into(),
1049            },
1050            hashmap! {
1051                "k".into() => 3.into(),
1052                "k2".into() =>"c".into(),
1053            },
1054            hashmap! {
1055                "k".into() => 4.into(),
1056                "k2".into() =>"z".into(),
1057            },
1058        ])),
1059        "k2".into(),
1060        TopN::First(1),
1061        Array2::from_shape_vec((1, 2), vec![
1062            1.into(),"a".into(),
1063            ]
1064        ).unwrap(),
1065        vec!["k".into(), "k2".into()],
1066    )]
1067    #[case(
1068        DataFrame::new(ColumnFrame::from(vec![
1069            hashmap! {
1070                "k".into() => 2.into(),
1071                "k2".into() => "b".into(),
1072            },
1073            hashmap! {
1074                "k".into() => 1.into(),
1075                "k2".into() =>"a".into(),
1076            },
1077            hashmap! {
1078                "k".into() => 3.into(),
1079                "k2".into() =>"c".into(),
1080            },
1081            hashmap! {
1082                "k".into() => 4.into(),
1083                "k2".into() =>"z".into(),
1084            },
1085        ])),
1086        "k2".into(),
1087        TopN::First(2),
1088        Array2::from_shape_vec((2, 2), vec![
1089            1.into(),"a".into(),
1090            2.into(),"b".into(),
1091            ]
1092        ).unwrap(),
1093        vec!["k".into(), "k2".into()],
1094    )]
1095    #[traced_test]
1096    fn top_n(
1097        #[case] input: DataFrame,
1098        #[case] column: Key,
1099        #[case] topn: TopN,
1100        #[case] expected: Array2<DataValue>,
1101        #[case] columns: Vec<Key>,
1102    ) {
1103        let result = input.sorted(&column);
1104        assert!(result.is_ok(), "{result:?}");
1105        let result = result.unwrap();
1106        let first = result.topn(topn).unwrap();
1107        let selected = first.select(Some(&columns));
1108        assert_eq!(selected, expected);
1109    }
1110}