trs_dataframe/
dataframe.rs

1use column_store::sorted_df::SortedDataFrame;
2use data_value::{DataValue, Extract};
3use halfbrown::HashMap;
4use ndarray::{Array1, Array2, ArrayView1};
5use std::fmt;
6pub mod column_store;
7pub mod index;
8pub mod join;
9pub mod key;
10use crate::{error::Error, CandidateData};
11#[cfg(feature = "python")]
12pub mod python;
13
14#[cfg(feature = "python")]
15use pyo3::prelude::*;
16
17use crate::{
18    dataframe::{column_store::ColumnFrame, join::JoinRelation, key::Key},
19    MLChefMap,
20};
21
/// Selector carrying a count together with the end of a sequence it refers to.
///
/// NOTE(review): this enum is not used in the visible part of this file; the
/// variant names suggest "take the first / last N items" — confirm against callers.
#[derive(Debug, Clone, PartialEq, Eq, Copy)]
pub enum TopN {
    /// Count associated with the beginning of the sequence (presumably the first N items).
    First(usize),
    /// Count associated with the end of the sequence (presumably the last N items).
    Last(usize),
}
27
/// `DataFrame` wraps a [`ColumnFrame`] together with frame-wide constants and
/// free-form metadata.
/// It is used to store the data and the metadata for the candidates.
#[derive(Debug, Clone, PartialEq, Default, serde::Serialize, serde::Deserialize)]
#[cfg_attr(feature = "python", pyclass)]
pub struct DataFrame {
    /// Constants for the dataframe — a micro-optimization for the data.
    /// Values which are constant for the whole dataframe are stored here
    /// instead of being repeated in every row.
    pub constants: HashMap<Key, DataValue>,
    /// Dataframe with the candidates.
    // Historical note: this field used to be `Candidates<CandidateItem>`.
    pub dataframe: ColumnFrame,
    /// Metadata for the dataframe. Arbitrary information about the dataframe
    /// can be stored here, keyed by name.
    pub metadata: HashMap<String, DataValue>,
}
42
/// Formats a [`DataFrame`] by delegating to the `fmt` of the wrapped
/// [`ColumnFrame`]; `constants` and `metadata` are not written by this impl.
impl fmt::Display for DataFrame {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Only the inner column store is rendered.
        self.dataframe.fmt(f)
    }
}
48
49impl DataFrame {
50    pub fn new<C: Into<ColumnFrame>>(dataframe: C) -> Self {
51        Self {
52            constants: HashMap::new(),
53            dataframe: dataframe.into(),
54            metadata: HashMap::new(),
55        }
56    }
57
58    pub fn shrink(&mut self) {
59        self.dataframe.shrink();
60    }
61
62    pub fn add_metadata(&mut self, key: String, value: DataValue) {
63        self.metadata.insert(key, value);
64    }
65
66    pub fn get_metadata(&self, key: &str) -> Option<&DataValue> {
67        self.metadata.get(key)
68    }
69
70    pub fn join(&mut self, other: Self, join_type: &JoinRelation) -> Result<(), Error> {
71        other.constants.into_iter().for_each(|(key, value)| {
72            self.constants.insert(key, value);
73        });
74        self.dataframe.join(other.dataframe, join_type)
75    }
76
77    pub fn apply_function<F>(&mut self, keys: &[Key], mut func: F) -> Result<(), Error>
78    where
79        F: FnMut(&[Key], &mut ColumnFrame) -> Result<(), Error>,
80    {
81        self.dataframe.apply_function(keys, &mut func)
82    }
83
84    pub fn select(&self, keys: Option<&[Key]>) -> Result<Array2<DataValue>, Error> {
85        Ok(self.dataframe.select(keys))
86    }
87
88    pub fn select_transposed_typed<D: Extract>(&self, keys: &[Key]) -> Vec<Vec<D>> {
89        self.dataframe.select_transposed_typed::<D>(keys)
90    }
91
92    pub fn select_column(&self, key: Key) -> Option<ndarray::ArrayView1<'_, DataValue>> {
93        self.dataframe.select_column(&key)
94    }
95
96    pub fn select_transposed(&self, keys: Option<&[Key]>) -> Result<Array2<DataValue>, Error> {
97        self.dataframe.select_transposed(keys)
98    }
99
100    pub fn insert_constant(&mut self, key: Key, value: DataValue) {
101        self.constants.insert(key, value);
102    }
103
104    pub fn push<C: CandidateData>(&mut self, item: C) -> Result<(), Error> {
105        self.dataframe.push(item)
106    }
107
108    pub fn remove_column(&mut self, keys: &[Key]) -> Result<Self, Error> {
109        self.dataframe.remove_column(keys).map(|x| x.into())
110    }
111
112    pub fn extend(&mut self, items: Self) -> Result<(), Error> {
113        self.dataframe.extend(items.dataframe)
114    }
115
116    pub fn len(&self) -> usize {
117        self.dataframe.len()
118    }
119
120    pub fn is_empty(&self) -> bool {
121        self.dataframe.is_empty()
122    }
123
124    pub fn add_single_column<K: Into<Key>>(
125        &mut self,
126        key: K,
127        values: Array1<DataValue>,
128    ) -> Result<(), Error> {
129        self.dataframe.add_single_column(key, values)
130    }
131
132    pub fn get_single_column(&self, key: &Key) -> Option<ArrayView1<'_, DataValue>> {
133        self.dataframe.get_single_column(key)
134    }
135
136    pub fn sorted(&self, key: &Key) -> Result<SortedDataFrame<'_>, Error> {
137        self.dataframe.sorted(key)
138    }
139
140    pub fn filter(&self, filter: &crate::filter::FilterRules) -> Result<Self, Error> {
141        let filtered_df = self.dataframe.filter(filter)?;
142        Ok(Self {
143            constants: self.constants.clone(),
144            dataframe: filtered_df,
145            metadata: self.metadata.clone(),
146        })
147    }
148
149    #[cfg(feature = "polars-df")]
150    pub fn as_polars(&self) -> Result<polars::prelude::DataFrame, Error> {
151        let mut columns = vec![];
152        for key in self.dataframe.keys() {
153            let values = self
154                .dataframe
155                .get_single_column(key)
156                .ok_or_else(|| Error::NotFound(key.clone()))?
157                .into_iter()
158                .map(|x| into_polars_value(key, x.clone()))
159                .collect::<Vec<_>>();
160            let s = polars::prelude::Column::new(key.name().into(), values);
161
162            columns.push(s);
163        }
164
165        Ok(polars::prelude::DataFrame::new(columns)?)
166    }
167}
/// Maps a crate [`crate::DataType`] onto the corresponding polars `DataType`.
///
/// `Unknown` maps to polars `Null`, `Vec` to a `List` of unknown element type,
/// and `Map` to a `Struct` with no fields.
#[cfg(feature = "polars-df")]
pub fn polars_dtype(dtype: crate::DataType) -> polars::prelude::DataType {
    use crate::DataType as Source;
    use polars::prelude::DataType as Target;

    match dtype {
        Source::Bool => Target::Boolean,
        Source::U32 => Target::UInt32,
        Source::I32 => Target::Int32,
        Source::U8 => Target::UInt8,
        Source::U64 => Target::UInt64,
        Source::I64 => Target::Int64,
        Source::F32 => Target::Float32,
        Source::F64 => Target::Float64,
        Source::U128 => Target::UInt128,
        Source::I128 => Target::Int128,
        Source::String => Target::String,
        Source::Bytes => Target::Binary,
        Source::Unknown => Target::Null,
        Source::Vec => {
            let element = Target::Unknown(polars::prelude::UnknownKind::Any);
            Target::List(Box::new(element))
        }
        Source::Map => Target::Struct(vec![]),
    }
}
192
/// Converts a [`DataValue`] into an owned polars `AnyValue`.
///
/// The value is first passed through `convert_dv_to_dtype` with `key`, then
/// mapped variant by variant:
/// * `Vec` becomes a `List` series named after `key`; its elements are converted
///   with a key whose dtype is the first non-`Unknown` dtype detected among them.
/// * `Map` becomes a `StructOwned` whose fields are ordered by map key, each with
///   the dtype detected for its value.
///
/// # Panics
/// Panics when polars cannot build the series for a `Vec` value.
#[cfg(feature = "polars-df")]
pub fn into_polars_value(key: &Key, dv: DataValue) -> polars::prelude::AnyValue<'static> {
    use polars::prelude::AnyValue::*;
    use polars::prelude::Field;

    use crate::dataframe::column_store::convert_dv_to_dtype;
    let dv = convert_dv_to_dtype(key, dv);
    match dv {
        DataValue::String(smart_string) => StringOwned(smart_string.as_str().into()),
        DataValue::Bytes(items) => BinaryOwned(items),
        // NOTE(review): `U8` is widened to `UInt32` here while `polars_dtype` maps
        // `U8` to `UInt8` — confirm whether the widening is intentional.
        DataValue::U8(x) => UInt32(x as _),
        DataValue::Bool(x) => Boolean(x),
        DataValue::I32(x) => Int32(x),
        DataValue::U32(x) => UInt32(x),
        DataValue::I64(x) => Int64(x),
        DataValue::U64(x) => UInt64(x),
        DataValue::I128(x) => Int128(x),
        DataValue::F32(x) => Float32(x),
        DataValue::F64(x) => Float64(x),
        DataValue::Null => Null,
        DataValue::Vec(data_values) => {
            // The first element with a known dtype decides how all elements are converted.
            let dt = data_values
                .iter()
                .map(|d| crate::detect_dtype(d))
                .find(|detected| !matches!(detected, crate::DataType::Unknown))
                .unwrap_or(crate::DataType::Unknown);
            let vec_key = Key::new(key.name(), dt);
            let values = data_values
                .into_iter()
                .map(|x| into_polars_value(&vec_key, x))
                .collect::<Vec<_>>();
            // Build the panic message lazily: `expect(&format!(..))` would format
            // (and allocate) on every call, even when the series is created fine.
            let series = polars::series::Series::from_any_values(key.name().into(), &values, true)
                .unwrap_or_else(|err| panic!("Cannot create series for {key:?}: {err:?}"));
            List(series)
        }
        DataValue::EnumNumber(x) => Int32(x),
        DataValue::U128(x) => UInt128(x),
        DataValue::Map(x) => {
            // Sort the entries themselves by key so no second lookup (and no
            // `expect`) is needed to fetch each value.
            let mut entries = x.iter().collect::<Vec<_>>();
            entries.sort_by(|left, right| left.0.cmp(right.0));

            let mut values = Vec::with_capacity(entries.len());
            let mut fields = Vec::with_capacity(entries.len());
            for (name, value) in entries {
                let dtype = crate::detect_dtype(value);
                let k = Key::new(name, dtype);
                values.push(into_polars_value(&k, value.to_owned()));
                fields.push(Field::new(k.name().into(), polars_dtype(dtype)));
            }
            StructOwned(Box::new((values, fields)))
        }
    }
}
253
/// Converts a polars `AnyValue` into a [`DataValue`].
///
/// Narrow integer variants are widened (`UInt16` to `U32`, `Int8`/`Int16` to
/// `I32`). `List` values are converted element by element and `StructOwned`
/// values become a `DataValue::Map` keyed by field name. Any variant without an
/// explicit arm is logged with `tracing::warn!` and converted to `DataValue::Null`.
#[cfg(feature = "polars-df")]
pub fn from_polars_value(dv: polars::prelude::AnyValue<'_>) -> DataValue {
    use polars::prelude::AnyValue::*;
    match dv {
        Null => DataValue::Null,
        Boolean(v) => v.into(),
        String(v) => DataValue::String(v.into()),
        UInt8(v) => DataValue::U8(v),
        UInt16(v) => DataValue::U32(v as u32),
        UInt32(v) => v.into(),
        UInt64(v) => v.into(),
        Int8(v) => (v as i32).into(),
        Int16(v) => (v as i32).into(),
        Int32(v) => v.into(),
        Int64(v) => v.into(),
        Float32(v) => v.into(),
        Float64(v) => v.into(),
        Int128(v) => v.into(),
        // Mirror of `DataValue::U128 => UInt128` in `into_polars_value`; without
        // this arm a round-tripped u128 hit the catch-all and became `Null`.
        UInt128(v) => DataValue::U128(v),
        List(series) => DataValue::Vec(series.iter().map(from_polars_value).collect::<Vec<_>>()),
        // Array(series, _) => {
        //     DataValue::Vec(series.iter().map(from_polars_value).collect::<Vec<_>>())
        // }
        StringOwned(v) => DataValue::String(v.as_str().into()),
        Binary(v) => DataValue::Bytes(v.to_owned()),
        BinaryOwned(v) => DataValue::Bytes(v),
        StructOwned(m) => {
            // The boxed tuple holds (values, fields); pair each field name with its value.
            let (values, fields) = *m;
            let hm: std::collections::HashMap<smartstring::alias::String, DataValue> = fields
                .into_iter()
                .zip(values)
                .map(|(field, value)| (field.name.as_str().into(), from_polars_value(value)))
                .collect();
            DataValue::Map(hm)
        }
        e => {
            tracing::warn!("Unsupported polars value: {e:?}");
            DataValue::Null
        }
    }
}
293
294impl From<ColumnFrame> for DataFrame {
295    fn from(dataframe: ColumnFrame) -> Self {
296        Self::new(dataframe)
297    }
298}
299
300impl From<Vec<std::collections::HashMap<Key, DataValue>>> for DataFrame {
301    fn from(dataframe: Vec<std::collections::HashMap<Key, DataValue>>) -> Self {
302        Self::new(ColumnFrame::from(dataframe))
303    }
304}
305
306impl From<Vec<HashMap<Key, DataValue>>> for DataFrame {
307    fn from(dataframe: Vec<HashMap<Key, DataValue>>) -> Self {
308        Self::new(ColumnFrame::from(dataframe))
309    }
310}
311
312impl From<std::collections::HashMap<String, Vec<DataValue>>> for DataFrame {
313    fn from(dataframe: std::collections::HashMap<String, Vec<DataValue>>) -> Self {
314        Self::new(ColumnFrame::from(dataframe))
315    }
316}
317
318impl From<MLChefMap> for DataFrame {
319    fn from(dataframe: MLChefMap) -> Self {
320        Self::new(ColumnFrame::from(dataframe))
321    }
322}
323impl From<Vec<(Key, Vec<DataValue>)>> for DataFrame {
324    fn from(dataframe: Vec<(Key, Vec<DataValue>)>) -> Self {
325        Self::new(ColumnFrame::from(dataframe))
326    }
327}
328
329impl From<std::collections::HashMap<String, Array1<DataValue>>> for DataFrame {
330    fn from(dataframe: std::collections::HashMap<String, Array1<DataValue>>) -> Self {
331        Self::new(ColumnFrame::from(dataframe))
332    }
333}
334
#[cfg(feature = "polars-df")]
impl From<polars::prelude::DataFrame> for DataFrame {
    /// Builds a frame from a polars `DataFrame`, using the matching
    /// [`ColumnFrame`] conversion.
    fn from(dataframe: polars::prelude::DataFrame) -> Self {
        let columns: ColumnFrame = dataframe.into();
        Self::new(columns)
    }
}
341#[cfg(test)]
342mod test {
343    use crate::filter::FilterRules;
344
345    use super::*;
346    use halfbrown::hashmap;
347    #[cfg(feature = "polars-df")]
348    use polars::prelude::NamedFrom as _;
349    use rstest::*;
350    use tracing_test::traced_test;
351    #[fixture]
352    fn dummy_candidates() -> ColumnFrame {
353        ColumnFrame::from(vec![
354            hashmap! {
355                "key1".into() => 1.into(),
356                "key2".into() => "a".into(),
357            },
358            hashmap! {
359                "key1".into() => 2.into(),
360                "key2".into() => "b".into(),
361            },
362        ])
363    }
364
365    #[rstest]
366    fn test_serde() {
367        let df = crate::df! {
368            "a" => [1u64, 2u64, 3u64],
369            "b" => [4u64, 5u64, 6u64],
370            "c" => [7u64, 8u64, 9u64]
371        };
372
373        let serialized = serde_json::to_string(&df).expect("BUG: Unable to serialize dataframe");
374
375        let deserialized =
376            serde_json::from_str(&serialized).expect("BUG: Unable to deserialize dataframe");
377
378        assert_eq!(df, deserialized);
379    }
380
381    #[cfg(feature = "polars-df")]
382    #[rstest]
383    fn test_polars() {
384        let expected = crate::df! {
385            "a" => [1u64, 2u64, 3u64],
386            "b" => [4f64, 5f64, 6f64],
387            "c" => [7i64, 8i64, 9i64]
388        };
389
390        let polars_df = polars::df!(
391            "a" => [1u64, 2u64, 3u64],
392            "b" => [4f64, 5f64, 6f64],
393            "c" => [7i64, 8i64, 9i64]
394        )
395        .expect("BUG: should be ok");
396        let as_df: DataFrame = polars_df.into();
397        let keys: Vec<Key> = vec!["a".into(), "b".into(), "c".into()];
398        assert_eq!(
399            as_df.select(Some(keys.as_slice())),
400            expected.select(Some(keys.as_slice()))
401        );
402    }
403    use crate::DataType;
404    #[cfg(feature = "polars-df")]
405    #[rstest]
406    #[case::str(Key::new("a", DataType::String), DataValue::String("test".into()), polars::prelude::AnyValue::String("test".into()))]
407    #[case::u32(
408        Key::new("a", DataType::U32),
409        DataValue::U32(u32::MAX),
410        polars::prelude::AnyValue::UInt32(u32::MAX)
411    )]
412    #[case::i32(
413        Key::new("a", DataType::I32),
414        DataValue::I32(i32::MIN),
415        polars::prelude::AnyValue::Int32(i32::MIN)
416    )]
417    #[case::i64(
418        Key::new("a", DataType::I64),
419        DataValue::I64(i64::MIN),
420        polars::prelude::AnyValue::Int64(i64::MIN)
421    )]
422    #[case::u64(
423        Key::new("a", DataType::U64),
424        DataValue::U64(u64::MIN),
425        polars::prelude::AnyValue::UInt64(u64::MIN)
426    )]
427    #[case::f32(
428        Key::new("a", DataType::F32),
429        DataValue::F32(f32::MIN),
430        polars::prelude::AnyValue::Float32(f32::MIN)
431    )]
432    #[case::f64(
433        Key::new("a", DataType::F64),
434        DataValue::F64(f64::MIN),
435        polars::prelude::AnyValue::Float64(f64::MIN)
436    )]
437    #[case::null(
438        Key::new("a", DataType::Unknown),
439        DataValue::Null,
440        polars::prelude::AnyValue::Null
441    )]
442    #[case::i128(
443        Key::new("a", DataType::I128),
444        DataValue::I128(i128::MIN),
445        polars::prelude::AnyValue::Int128(i128::MIN)
446    )]
447    #[case::u8(
448        Key::new("a", DataType::U8),
449        DataValue::U8(255),
450        polars::prelude::AnyValue::UInt8(255)
451    )]
452    #[case::bool(
453        Key::new("a", DataType::Bool),
454        DataValue::Bool(true),
455        polars::prelude::AnyValue::Boolean(true)
456    )]
457    #[case::bytes(Key::new("a", DataType::Bytes), DataValue::Bytes("aaaaa".as_bytes().to_vec()), polars::prelude::AnyValue::BinaryOwned("aaaaa".as_bytes().to_vec()))]
458    #[case::vec_uints(Key::new("a", DataType::Vec), DataValue::Vec(vec![DataValue::U32(0), DataValue::U32(1)]), polars::prelude::AnyValue::List(polars::series::Series::new("v".into(), vec![polars::prelude::AnyValue::UInt32(0u32), polars::prelude::AnyValue::UInt32(1)])))]
459    #[case::map(Key::new("a", DataType::Map), DataValue::Map(data_value::stdhashmap!("a" => 0u64, "b" => "s")), polars::prelude::AnyValue::StructOwned(Box::new((
460        vec![polars::prelude::AnyValue::UInt64(0u64), polars::prelude::AnyValue::String("s".into())],
461        vec![polars::prelude::Field::new("a".into(), polars::prelude::DataType::UInt64), polars::prelude::Field::new("b".into(), polars::prelude::DataType::String)]))))]
462    // polars converts all by first element type
463    // #[case::vec_diff_int(DataValue::Vec(vec![ DataValue::I32(1), DataValue::U32(0)]), polars::prelude::AnyValue::List(polars::series::Series::new("v".into(), vec![polars::prelude::AnyValue::Int32(1i32), polars::prelude::AnyValue::UInt32(0u32)])))]
464    //#[case::vec_int_str(DataValue::Vec(vec![DataValue::U32(0), DataValue::String("1".into())]), polars::prelude::AnyValue::List(polars::series::Series::new("v".into(), vec![polars::prelude::AnyValue::UInt32(0u32), polars::prelude::AnyValue::StringOwned("1".into())])))]
465    fn into_polars_value_test(
466        #[case] key: Key,
467        #[case] input: DataValue,
468        #[case] output: polars::prelude::AnyValue<'static>,
469    ) {
470        assert_eq!(into_polars_value(&key, input.clone()), output);
471        assert_eq!(from_polars_value(output), input);
472    }
473
474    // #[cfg(feature = "polars-df")]
475    // #[rstest]
476    // fn as_polars() {
477    //     let state = include_bytes!("../part_00330.dfb");
478    //     let df: Result<DataFrame, _> = rmp_serde::decode::from_slice(state);
479    //     assert!(df.is_ok());
480    //     let df = df.unwrap();
481    //     println!("{df}");
482    //     let polars_df = df.as_polars();
483    //     assert!(polars_df.is_ok(), "{polars_df:?}");
484    // }
485    #[rstest]
486    #[case(
487        DataFrame::new(crate::column_frame! {
488            "a" => [1f64, 2f64, 3f64],
489            "b" => [4i64, 5i64, 6i64],
490            "c" => [7i64, 8i64, 9i64]
491        }),
492        DataFrame::new(crate::column_frame! {
493            "a" => [1f64, 2f64],
494            "b" => [4i64, 5i64],
495            "c" => [7i64, 8i64]
496        }),
497        FilterRules::try_from("a >= 1f64 && (b <= 5 || c <= 8) && b >= 4").expect("BUG: cannot create filter rules"),
498    )]
499    #[case(
500        DataFrame::new(crate::column_frame! {
501            "a" => [1f64, 2f64, 3f64],
502            "b" => [4i64, 5i64, 6i64],
503            "c" => [7i64, 8i64, 9i64]
504        }),
505        DataFrame::new(crate::column_frame! {
506            "a" => [2f64],
507            "b" => [5i64],
508            "c" => [8i64]
509        }),
510        FilterRules::try_from("a % 2f64 == 0f64").expect("BUG: cannot create filter rules"),
511    )]
512    #[traced_test]
513    fn filter_test(
514        #[case] df: DataFrame,
515        #[case] expected: DataFrame,
516        #[case] filter: FilterRules,
517    ) {
518        let filtered = df.filter(&filter).expect("BUG: cannot filter");
519        assert_eq!(filtered, expected);
520    }
521
522    #[rstest]
523    fn test_serde_complex() {
524        let simple = r#"
525{
526    "constants": {},
527    "dataframe": {
528        "index": {
529            "keys": [
530                {
531                    "key": 3162770485,
532                    "name": "a",
533                    "ctype": "U32"
534                },
535                {
536                    "key": 2279056742,
537                    "name": "b",
538                    "ctype": "F64"
539                },
540                {
541                    "key": 2994984227,
542                    "name": "c",
543                    "ctype": "U64"
544                },
545                {
546                    "key": 3319645144,
547                    "name": "d",
548                    "ctype": "F64"
549                },
550                {
551                    "key": 1291847470,
552                    "name": "e",
553                    "ctype": "U32"
554                },
555                {
556                    "key": 874241070,
557                    "name": "f",
558                    "ctype": "Bool"
559                }
560            ],
561            "indexes": {
562                "a": 0,
563                "b": 1,
564                "c": 2,
565                "d": 3,
566                "e": 4,
567                "f": 5
568            },
569            "alias": {}
570        },
571        "data_frame": {
572            "v": 1,
573            "dim": [
574                2,
575                6
576            ],
577            "data": [
578                253780,
579                0.009369421750307085,
580                1633222860381359,
581                8,
582                5,
583                true,
584                64512,
585                0.003391335718333721,
586                1633222860810557,
587                8,
588                5,
589                null
590            ]
591        }
592    },
593    "metadata": {}
594}
595        "#;
596
597        let simple_deserialized: DataFrame =
598            serde_json::from_str(simple).expect("BUG: Unable to deserialize dataframe");
599
600        println!("deserialized: {simple_deserialized:?}");
601        let array = format!("[{}, {}, {}]", simple, simple, simple);
602        let deserialized: Vec<DataFrame> =
603            serde_json::from_str(&array).expect("BUG: Unable to deserialize dataframe");
604
605        println!("deserialized: {deserialized:?}");
606        assert_eq!(deserialized.len(), 3);
607        assert_eq!(simple_deserialized, deserialized[0]);
608    }
609
610    #[rstest]
611    #[case(hashmap!("key1".into() => vec![1.into(), 2.into()], "key2".into() => vec!["a".into()]))]
612    #[case(data_value::stdhashmap!("key1" => vec![1, 2], "key2" => vec!["a"]))]
613    #[case(vec![hashmap! {
614        "key1".into() => 1.into(),
615        "key2".into() => "a".into(),
616    },
617    hashmap! {
618        "key1".into() => 2.into(),
619    },])]
620    #[case(vec![data_value::stdhashmap! {
621        "key1" => DataValue::from(1),
622        "key2" => DataValue::from("a"),
623    },data_value::stdhashmap! {
624        "key1" => DataValue::from(2),
625    },])]
626    #[case(vec![("key1".into(), vec! [DataValue::from(1), DataValue::from(2)]), ("key2".into(),
627    vec![DataValue::from("a"), DataValue::Null])])]
628    fn test_select_column<T: Into<DataFrame>>(#[case] input: T) {
629        let df: DataFrame = input.into();
630        assert_eq!(
631            df,
632            DataFrame {
633                constants: HashMap::new(),
634                dataframe: ColumnFrame::from(vec![
635                    hashmap! {
636                        "key1".into() => 1.into(),
637                        "key2".into() => "a".into(),
638                    },
639                    hashmap! {
640                        "key1".into() => 2.into(),
641                    },
642                ]),
643                metadata: HashMap::new(),
644            }
645        );
646        let selected_transposed = df.select_column("key1".into());
647        assert!(selected_transposed.is_some());
648        let selected_transposed = selected_transposed.unwrap();
649        assert_eq!(selected_transposed.len(), 2);
650        assert_eq!(selected_transposed, ndarray::array![1.into(), 2.into()]);
651    }
652
653    #[rstest]
654    #[case::hhm(hashmap!("key1".into() => vec![1.into(), 2.into()], "key2".into() => vec!["a".into()]))]
655    #[case::stdhm(data_value::stdhashmap!("key1" => vec![1, 2], "key2" => vec!["a"]))]
656    #[case::hm({
657        let hm: std::collections::HashMap<String, Array1<DataValue>> = data_value::stdhashmap!("key1".to_string() => Array1::from_vec(vec![DataValue::from(1), DataValue::from(2)]), "key2".to_string() => Array1::from_vec(vec![DataValue::from("a"), DataValue::Null]));
658        hm
659    })]
660    #[case::vec_hhm(vec![hashmap! {
661        "key1".into() => 1.into(),
662        "key2".into() => "a".into(),
663    },
664    hashmap! {
665        "key1".into() => 2.into(),
666    },])]
667    #[case::vec_hme(vec![data_value::stdhashmap! {
668        "key1" => DataValue::from(1),
669        "key2" => DataValue::from("a"),
670    },data_value::stdhashmap! {
671        "key1" => DataValue::from(2),
672    },])]
673    #[case::vec_vec(vec![("key1".into(), vec! [DataValue::from(1), DataValue::from(2)]), ("key2".into(), vec![DataValue::from("a"), DataValue::Null])])]
674    fn test_from_conversion<T: Into<DataFrame>>(#[case] input: T) {
675        let df: DataFrame = input.into();
676        let expected: DataFrame = DataFrame {
677            constants: HashMap::new(),
678            dataframe: ColumnFrame::from(vec![
679                hashmap! {
680                    "key1".into() => 1.into(),
681                    "key2".into() => "a".into(),
682                },
683                hashmap! {
684                    "key1".into() => 2.into(),
685                },
686            ]),
687            metadata: HashMap::new(),
688        };
689        assert_eq!(
690            df.select(Some(&["key1".into(), "key2".into()])),
691            expected.select(Some(&["key1".into(), "key2".into()])),
692            "{df} vs {expected}"
693        );
694        let selected_transposed = df.select_transposed_typed::<i32>(&["key1".into()]);
695        assert_eq!(selected_transposed.len(), 2);
696        println!("{:?}", selected_transposed);
697        assert_eq!(selected_transposed, vec![vec![1], vec![2]]);
698    }
699    #[rstest]
700    fn test_dataframe(dummy_candidates: ColumnFrame) {
701        let mut dataframe: DataFrame = DataFrame::default();
702        assert!(dataframe.is_empty());
703        assert!(dataframe.extend(dummy_candidates.into()).is_ok());
704        assert_eq!(dataframe.len(), 2);
705
706        let candidate = hashmap! {
707            "key1".into() => 3.into(),
708            "key2".into() => "c".into(),
709        };
710
711        assert!(dataframe.push(candidate).is_ok());
712        assert_eq!(dataframe.len(), 3);
713        assert!(!dataframe.is_empty());
714
715        dataframe.insert_constant("key3".into(), 4.into());
716        assert_eq!(dataframe.constants.len(), 1);
717        assert!(dataframe
718            .apply_function(&["key1".into()], |keys, df| {
719                let key = keys[0].clone();
720                let s = df
721                    .get_single_column(&key)
722                    .expect("BUG: Cannot get column")
723                    .to_owned();
724                let s = s.mapv(|x| x + DataValue::from(1));
725                df.add_single_column("key5", s)?;
726                Ok(())
727            })
728            .is_ok());
729        let original = dataframe.clone();
730        dataframe.shrink();
731        let remove_df = dataframe.remove_column(&["key1".into()]);
732        assert!(remove_df.is_ok());
733        let mut remove_df = remove_df.unwrap();
734        assert_eq!(remove_df.len(), 3);
735        let selected = dataframe.select(Some(&["key2".into()]));
736        assert!(selected.is_ok());
737        let selected = selected.unwrap();
738        println!("{:?}", selected);
739        assert_eq!(selected.len(), 3);
740
741        // fixme later
742        let joined_result =
743            remove_df.join(dataframe, &JoinRelation::new(crate::JoinBy::AddColumns));
744        assert!(joined_result.is_ok(), "{:?}", joined_result);
745        assert_eq!(original, remove_df);
746    }
747
748    #[rstest]
749    fn test_metadata(dummy_candidates: ColumnFrame) {
750        let mut dataframe: DataFrame = DataFrame::default();
751        assert!(dataframe.is_empty());
752        println!("{:?}", dataframe);
753        assert!(dataframe.extend(dummy_candidates.into()).is_ok());
754        println!("{:?}", dataframe);
755        assert_eq!(dataframe.len(), 2);
756
757        dataframe.add_metadata("test".into(), 1.into());
758        assert_eq!(dataframe.get_metadata("test"), Some(&1.into()));
759        let dataframe = DataFrame::new(ColumnFrame::from(vec![
760            hashmap! {
761                "key1".into() => 1.into(),
762                "key2".into() => "a".into(),
763            },
764            hashmap! {
765                "key1".into() => 2.into(),
766                "key2".into() => "b".into(),
767            },
768        ]));
769        assert_eq!(dataframe.get_metadata("test"), None);
770        let tt = dataframe.select_transposed(None);
771        assert!(tt.is_ok());
772        let tt = tt.unwrap();
773        assert_eq!(tt.shape(), [2, 2]);
774        assert_eq!(
775            tt,
776            Array2::from_shape_vec((2, 2), vec![1.into(), 2.into(), "a".into(), "b".into()])
777                .unwrap()
778        );
779    }
780
781    #[rstest]
782    #[traced_test]
783    fn add_single_column_test() {
784        let mut dataframe = DataFrame::default();
785        let values = Array1::from(vec![1.into(), 2.into(), 3.into()]);
786        let r = dataframe.add_single_column("key1", values);
787        assert!(r.is_ok(), "{r:?}");
788        let selected = dataframe.select(None);
789        assert!(selected.is_ok());
790        let selected = selected.unwrap();
791        assert_eq!(selected.shape(), [3, 1]);
792        assert_eq!(
793            selected,
794            Array2::from_shape_vec((3, 1), vec![1.into(), 2.into(), 3.into()]).unwrap()
795        );
796        let values = Array1::from(vec![1.into(), 2.into()]);
797        assert!(dataframe.add_single_column("key1", values).is_err());
798        let values = Array1::from(vec![3.into(), 4.into(), 5.into()]);
799        assert!(dataframe.add_single_column("key2", values).is_ok());
800        let values = Array1::from(vec![3.into()]);
801        assert!(dataframe.add_single_column("key3", values).is_err());
802    }
803
804    #[rstest]
805    #[traced_test]
806    fn add_single_column_empty_test() {
807        let mut dataframe = DataFrame::default();
808        let values = Array1::from(vec![]);
809        let r = dataframe.add_single_column("key1", values);
810        assert!(r.is_ok(), "{r:?}");
811        let selected = dataframe.select(None);
812        assert!(selected.is_ok());
813        let selected = selected.unwrap();
814        assert_eq!(selected.shape(), [0, 1]);
815        assert_eq!(selected, Array2::from_shape_vec((0, 1), vec![]).unwrap());
816        let values = Array1::from(vec![1.into(), 2.into()]);
817        assert!(dataframe.add_single_column("key1", values).is_err());
818        let values = Array1::from(vec![3.into(), 4.into(), 5.into()]);
819        assert!(dataframe.add_single_column("key2", values).is_ok());
820        let values = Array1::from(vec![3.into(), 4.into()]);
821        assert!(dataframe.add_single_column("key3", values).is_err());
822        let values = Array1::from(vec![3.into(), 4.into(), 5.into()]);
823        assert!(dataframe.add_single_column("key3", values).is_ok());
824
825        assert_eq!(
826            dataframe
827                .select_column("key1".into())
828                .expect("BUG: has to exists"),
829            ndarray::arr1(&[DataValue::Null, DataValue::Null, DataValue::Null]),
830        );
831        assert_eq!(
832            dataframe
833                .select_column("key2".into())
834                .expect("BUG: has to exists"),
835            ndarray::arr1(&[3.into(), 4.into(), 5.into()]),
836        );
837        assert_eq!(
838            dataframe.select(None).expect("BUG: cannot get data"),
839            ndarray::arr2(&[
840                [DataValue::Null, 3.into(), 3.into()],
841                [DataValue::Null, 4.into(), 4.into()],
842                [DataValue::Null, 5.into(), 5.into()],
843            ])
844        );
845    }
846
847    #[rstest]
848    #[case(
849        DataFrame::new(ColumnFrame::from(vec![
850            hashmap! {
851                "k".into() => 1.into(),
852                "k2".into() => 2.into(),
853                "k3".into() => 2.2.into(),
854            },
855            hashmap! {
856                "k".into() => 11.into(),
857                "k2".into() => 3.into(),
858            },
859            hashmap! {
860                "k".into() => 4.into(),
861                "k2".into() => 5.into(),
862                "k3".into() => 2.3.into(),
863            },
864            hashmap! {
865                "k".into() => 4.into(),
866                "k2".into() => 5.into(),
867                "k3".into() => 2.4.into(),
868            },
869        ])),
870        vec!["k".into(), "k2".into()],
871        Array2::from_shape_vec((4, 2), vec![1.into(), 2.into(), 11.into(), 3.into(), 4.into(), 5.into(), 4.into(), 5.into()]).unwrap()
872    )]
873    #[case(
874        DataFrame::new(ColumnFrame::from(vec![
875            hashmap! {
876                "k".into() => 1.into(),
877                "k2".into() => 2.into(),
878                "k3".into() => 2.2.into(),
879            },
880            hashmap! {
881                "k".into() => 11.into(),
882                "k2".into() => 3.into(),
883            },
884            hashmap! {
885                "k".into() => 4.into(),
886                "k2".into() => 5.into(),
887                "k3".into() => 2.3.into(),
888            },
889            hashmap! {
890                "k".into() => 4.into(),
891                "k2".into() => 5.into(),
892                "k3".into() => 2.4.into(),
893            },
894        ])),
895        vec!["k2".into(), "k3".into(), "nonexist1".into(), "nonexists2".into(), "k".into()],
896        Array2::from_shape_vec((4, 5), vec![
897            2.into(), 2.2.into(), DataValue::Null, DataValue::Null, 1.into(),
898            3.into(), DataValue::Null, DataValue::Null, DataValue::Null, 11.into(),
899            5.into(), 2.3.into(),  DataValue::Null, DataValue::Null, 4.into(),
900            5.into(), 2.4.into(), DataValue::Null, DataValue::Null, 4.into()]).unwrap()
901    )]
902    #[traced_test]
903    fn select_multiple(
904        #[case] input: DataFrame,
905        #[case] columns: Vec<Key>,
906        #[case] expected: Array2<DataValue>,
907    ) {
908        let selected = input.select(Some(&columns));
909        assert!(selected.is_ok());
910        let selected = selected.unwrap();
911
912        assert_eq!(selected, expected);
913    }
914
915    #[rstest]
916    #[case(
917        DataFrame::new(ColumnFrame::from(vec![
918            hashmap! {
919                "k".into() => 1.into(),
920                "k2".into() => 2.into(),
921                "k3".into() => 2.2.into(),
922            },
923            hashmap! {
924                "k".into() => 11.into(),
925                "k2".into() => 3.into(),
926            },
927            hashmap! {
928                "k".into() => 4.into(),
929                "k2".into() => 5.into(),
930                "k3".into() => 2.3.into(),
931            },
932            hashmap! {
933                "k".into() => 4.into(),
934                "k2".into() => 5.into(),
935                "k3".into() => 2.4.into(),
936            },
937        ])),
938        "k".into(),
939        Array2::from_shape_vec((4, 3), vec![
940            1.into(), 2.into(), 2.2.into(),
941            4.into(), 5.into(), 2.3.into(),
942            4.into(), 5.into(), 2.4.into(),
943            11.into(), 3.into(), DataValue::Null,
944            ]
945        ).unwrap(),
946        vec!["k".into(), "k2".into(), "k3".into()],
947    )]
948    #[rstest]
949    #[case(
950        DataFrame::new(ColumnFrame::from(vec![
951            hashmap! {
952                "k".into() => 1.into(),
953                "k2".into() => 2.into(),
954                "k3".into() => 2.2.into(),
955            },
956            hashmap! {
957                "k".into() => 11.into(),
958                "k2".into() => 3.into(),
959            },
960            hashmap! {
961                "k".into() => 4.into(),
962                "k2".into() => 5.into(),
963                "k3".into() => 2.3.into(),
964            },
965            hashmap! {
966                "k".into() => 4.into(),
967                "k2".into() => 5.into(),
968                "k3".into() => 2.4.into(),
969            },
970        ])),
971        "k3".into(),
972        Array2::from_shape_vec((4, 3), vec![
973            11.into(), 3.into(), DataValue::Null,
974            1.into(), 2.into(), 2.2.into(),
975            4.into(), 5.into(), 2.3.into(),
976            4.into(), 5.into(), 2.4.into(),
977            ]
978        ).unwrap(),
979        vec!["k".into(), "k2".into(), "k3".into()],
980    )]
981    #[case(
982        DataFrame::new(ColumnFrame::from(vec![
983            hashmap! {
984                "k".into() => 2.into(),
985                "k2".into() => 0.000001.into(),
986            },
987            hashmap! {
988                "k".into() => 1.into(),
989                "k2".into() =>0.0000001.into(),
990            },
991            hashmap! {
992                "k".into() => 3.into(),
993                "k2".into() => 0.00001.into(),
994            },
995            hashmap! {
996                "k".into() => 4.into(),
997                "k2".into() => 0.001.into(),
998            },
999        ])),
1000        "k2".into(),
1001        Array2::from_shape_vec((4, 2), vec![
1002            1.into(), 0.0000001.into(),
1003            2.into(), 0.000001.into(),
1004            3.into(), 0.00001.into(),
1005            4.into(), 0.001.into(),
1006            ]
1007        ).unwrap(),
1008        vec!["k".into(), "k2".into()],
1009    )]
1010    #[case(
1011        DataFrame::new(ColumnFrame::from(vec![
1012            hashmap! {
1013                "k".into() => 2.into(),
1014                "k2".into() => "b".into(),
1015            },
1016            hashmap! {
1017                "k".into() => 1.into(),
1018                "k2".into() =>"a".into(),
1019            },
1020            hashmap! {
1021                "k".into() => 3.into(),
1022                "k2".into() =>"c".into(),
1023            },
1024            hashmap! {
1025                "k".into() => 4.into(),
1026                "k2".into() =>"z".into(),
1027            },
1028        ])),
1029        "k2".into(),
1030        Array2::from_shape_vec((4, 2), vec![
1031            1.into(),"a".into(),
1032            2.into(), "b".into(),
1033            3.into(), "c".into(),
1034            4.into(), "z".into(),
1035            ]
1036        ).unwrap(),
1037        vec!["k".into(), "k2".into()],
1038    )]
1039    #[traced_test]
1040    fn sort_by(
1041        #[case] input: DataFrame,
1042        #[case] column: Key,
1043        #[case] expected: Array2<DataValue>,
1044        #[case] columns: Vec<Key>,
1045    ) {
1046        let result = input.sorted(&column);
1047        assert!(result.is_ok(), "{result:?}");
1048        let result = result.unwrap().get_sorted();
1049        let selected = result.select(Some(&columns));
1050
1051        assert_eq!(selected, expected);
1052    }
1053    #[rstest]
1054    #[case(
1055        DataFrame::new(ColumnFrame::from(vec![
1056            hashmap! {
1057                "k".into() => 2.into(),
1058                "k2".into() => 0.000001.into(),
1059            },
1060            hashmap! {
1061                "k".into() => 1.into(),
1062                "k2".into() =>0.0000001.into(),
1063            },
1064            hashmap! {
1065                "k".into() => 3.into(),
1066                "k2".into() => 0.00001.into(),
1067            },
1068            hashmap! {
1069                "k".into() => 4.into(),
1070                "k2".into() => 0.001.into(),
1071            },
1072        ])),
1073        "k2".into(),
1074        TopN::Last(1),
1075        Array2::from_shape_vec((1, 2), vec![
1076            4.into(), 0.001.into(),
1077            ]
1078        ).unwrap(),
1079        vec!["k".into(), "k2".into()],
1080    )]
1081    #[case(
1082        DataFrame::new(ColumnFrame::from(vec![
1083            hashmap! {
1084                "k".into() => 2.into(),
1085                "k2".into() => 0.000001.into(),
1086            },
1087            hashmap! {
1088                "k".into() => 1.into(),
1089                "k2".into() =>0.0000001.into(),
1090            },
1091            hashmap! {
1092                "k".into() => 3.into(),
1093                "k2".into() => 0.00001.into(),
1094            },
1095            hashmap! {
1096                "k".into() => 4.into(),
1097                "k2".into() => 0.001.into(),
1098            },
1099        ])),
1100        "k2".into(),
1101        TopN::Last(2),
1102        Array2::from_shape_vec((2, 2), vec![
1103            4.into(), 0.001.into(),
1104            3.into(), 0.00001.into(),
1105            ]
1106        ).unwrap(),
1107        vec!["k".into(), "k2".into()],
1108    )]
1109    #[case(
1110        DataFrame::new(ColumnFrame::from(vec![
1111            hashmap! {
1112                "k".into() => 2.into(),
1113                "k2".into() => "b".into(),
1114            },
1115            hashmap! {
1116                "k".into() => 1.into(),
1117                "k2".into() =>"a".into(),
1118            },
1119            hashmap! {
1120                "k".into() => 3.into(),
1121                "k2".into() =>"c".into(),
1122            },
1123            hashmap! {
1124                "k".into() => 4.into(),
1125                "k2".into() =>"z".into(),
1126            },
1127        ])),
1128        "k2".into(),
1129        TopN::First(1),
1130        Array2::from_shape_vec((1, 2), vec![
1131            1.into(),"a".into(),
1132            ]
1133        ).unwrap(),
1134        vec!["k".into(), "k2".into()],
1135    )]
1136    #[case(
1137        DataFrame::new(ColumnFrame::from(vec![
1138            hashmap! {
1139                "k".into() => 2.into(),
1140                "k2".into() => "b".into(),
1141            },
1142            hashmap! {
1143                "k".into() => 1.into(),
1144                "k2".into() =>"a".into(),
1145            },
1146            hashmap! {
1147                "k".into() => 3.into(),
1148                "k2".into() =>"c".into(),
1149            },
1150            hashmap! {
1151                "k".into() => 4.into(),
1152                "k2".into() =>"z".into(),
1153            },
1154        ])),
1155        "k2".into(),
1156        TopN::First(2),
1157        Array2::from_shape_vec((2, 2), vec![
1158            1.into(),"a".into(),
1159            2.into(),"b".into(),
1160            ]
1161        ).unwrap(),
1162        vec!["k".into(), "k2".into()],
1163    )]
1164    #[traced_test]
1165    fn top_n(
1166        #[case] input: DataFrame,
1167        #[case] column: Key,
1168        #[case] topn: TopN,
1169        #[case] expected: Array2<DataValue>,
1170        #[case] columns: Vec<Key>,
1171    ) {
1172        let result = input.sorted(&column);
1173        assert!(result.is_ok(), "{result:?}");
1174        let result = result.unwrap();
1175        let first = result.topn(topn).unwrap();
1176        let selected = first.select(Some(&columns));
1177        assert_eq!(selected, expected);
1178    }
1179}