Skip to main content

trs_dataframe/
dataframe.rs

1use column_store::sorted_df::SortedDataFrame;
2use data_value::{DataValue, Extract};
3use halfbrown::HashMap;
4use ndarray::{Array1, Array2, ArrayView1};
5use std::fmt;
6pub mod column_store;
7pub mod index;
8pub mod join;
9pub mod key;
10use crate::{error::Error, CandidateData};
11#[cfg(feature = "python")]
12pub mod python;
13
14#[cfg(feature = "python")]
15use pyo3::prelude::*;
16
17use crate::{
18    dataframe::{column_store::ColumnFrame, join::JoinRelation, key::Key},
19    MLChefMap,
20};
21
/// Selector carrying a count together with the end it applies to.
///
/// NOTE(review): the consumer of this enum is not visible in this file; the
/// "first N" / "last N" reading is inferred from the variant names — confirm.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum TopN {
    /// Count associated with the leading end.
    First(usize),
    /// Count associated with the trailing end.
    Last(usize),
}
27
28/// DataFrame holds information about [`ColumnFrame`].
29/// This is used to store the data and the metadata for the candidates.
30///
31/// # Columns Storage
32/// The underlying data is stored in row-major order using ndarray's Array2.
33/// Use `select()` for row-oriented access and `select_transposed()` for column-oriented access.
34///
35/// # Example
36/// ```
37/// use trs_dataframe::{DataFrame, column_frame};
38///
39/// let df = DataFrame::new(column_frame! {
40///     "a" => [1, 2, 3],
41///     "b" => [4, 5, 6]
42/// });
43///
44/// // Get all data as 2D array (rows x columns)
45/// let all_data = df.select(None);
46///
47/// // Get specific columns
48/// let keys = vec!["a".into(), "b".into()];
49/// let selected = df.select(Some(&keys));
50/// ```
51#[derive(Debug, Clone, PartialEq, Eq, Default, serde::Serialize, serde::Deserialize)]
52#[cfg_attr(feature = "python", pyclass)]
53pub struct DataFrame {
54    /// Constants for the dataframe - mikro optimization for the data
55    /// Values which is constant for the whole dataframe are stored here
56    /// These values are applied to all rows without storing them per-row
57    pub constants: HashMap<Key, DataValue>,
58    /// Internal columnar storage for row data
59    pub dataframe: ColumnFrame,
60    /// Metadata for the dataframe. Here you can store the information about the dataframe
61    /// This is user-defined key-value metadata that doesn't affect data operations
62    pub metadata: HashMap<String, DataValue>,
63}
64
65impl fmt::Display for DataFrame {
66    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
67        self.dataframe.fmt(f)
68    }
69}
70
71impl DataFrame {
72    pub fn new<C: Into<ColumnFrame>>(dataframe: C) -> Self {
73        Self {
74            constants: HashMap::new(),
75            dataframe: dataframe.into(),
76            metadata: HashMap::new(),
77        }
78    }
79
80    /// Returns the number of columns which dataframe contains.
81    pub fn n_columns(&self) -> usize {
82        self.dataframe.data_frame.ncols()
83    }
84
85    /// Returns the number of rows which dataframe contains.
86    pub fn n_rows(&self) -> usize {
87        self.dataframe.data_frame.nrows()
88    }
89
90    pub fn shrink(&mut self) {
91        self.dataframe.shrink();
92    }
93
94    pub fn add_metadata(&mut self, key: String, value: DataValue) {
95        self.metadata.insert(key, value);
96    }
97
98    pub fn get_metadata(&self, key: &str) -> Option<&DataValue> {
99        self.metadata.get(key)
100    }
101
102    pub fn join(&mut self, other: Self, join_type: &JoinRelation) -> Result<(), Error> {
103        for (key, value) in other.constants {
104            self.constants.insert(key, value);
105        }
106        self.dataframe.join(other.dataframe, join_type)
107    }
108
109    pub fn apply_function<F>(&mut self, keys: &[Key], mut func: F) -> Result<(), Error>
110    where
111        F: FnMut(&[Key], &mut ColumnFrame) -> Result<(), Error>,
112    {
113        self.dataframe.apply_function(keys, &mut func)
114    }
115
116    pub fn select(&self, keys: Option<&[Key]>) -> Result<Array2<DataValue>, Error> {
117        Ok(self.dataframe.select(keys))
118    }
119
120    // pub fn select_view(&self, keys: Option<&[Key]>) -> Result<ArrayView2<'_, DataValue>, Error> {
121    //     Ok(self.dataframe.select_view(keys))
122    // }
123
124    pub fn select_transposed_typed<D: Extract>(&self, keys: &[Key]) -> Vec<Vec<D>> {
125        self.dataframe.select_transposed_typed::<D>(keys)
126    }
127
128    pub fn select_column(&self, key: Key) -> Option<ndarray::ArrayView1<'_, DataValue>> {
129        self.dataframe.select_column(&key)
130    }
131
132    pub fn select_transposed(&self, keys: Option<&[Key]>) -> Result<Array2<DataValue>, Error> {
133        self.dataframe.select_transposed(keys)
134    }
135
136    pub fn insert_constant(&mut self, key: Key, value: DataValue) {
137        self.constants.insert(key, value);
138    }
139
140    pub fn push<C: CandidateData>(&mut self, item: C) -> Result<(), Error> {
141        self.dataframe.push(item)
142    }
143
144    pub fn remove_column(&mut self, keys: &[Key]) -> Result<Self, Error> {
145        self.dataframe.remove_column(keys).map(|x| x.into())
146    }
147
148    pub fn extend(&mut self, items: Self) -> Result<(), Error> {
149        self.dataframe.extend(items.dataframe)
150    }
151
152    pub fn len(&self) -> usize {
153        self.dataframe.len()
154    }
155
156    pub fn is_empty(&self) -> bool {
157        self.dataframe.is_empty()
158    }
159
160    pub fn add_single_column<K: Into<Key>>(
161        &mut self,
162        key: K,
163        values: Array1<DataValue>,
164    ) -> Result<(), Error> {
165        self.dataframe.add_single_column(key, values)
166    }
167
168    pub fn get_single_column(&self, key: &Key) -> Option<ArrayView1<'_, DataValue>> {
169        self.dataframe.get_single_column(key)
170    }
171
172    pub fn sorted(&self, key: &Key) -> Result<SortedDataFrame<'_>, Error> {
173        self.dataframe.sorted(key)
174    }
175
176    pub fn filter(&self, filter: &crate::filter::FilterRules) -> Result<Self, Error> {
177        let filtered_df = self.dataframe.filter(filter)?;
178        Ok(Self {
179            constants: self.constants.clone(),
180            dataframe: filtered_df,
181            metadata: self.metadata.clone(),
182        })
183    }
184
185    #[cfg(feature = "polars-df")]
186    pub fn as_polars(&self) -> Result<polars::prelude::DataFrame, Error> {
187        let mut columns = vec![];
188        for key in self.dataframe.keys() {
189            let values = self
190                .dataframe
191                .get_single_column(key)
192                .ok_or_else(|| Error::NotFound(key.clone()))?
193                .into_iter()
194                .map(|x| into_polars_value(key, x.clone()))
195                .collect::<Vec<_>>();
196            let s = polars::prelude::Column::new(key.name().into(), values);
197
198            columns.push(s);
199        }
200
201        Ok(polars::prelude::DataFrame::new(columns)?)
202    }
203}
/// Maps a crate [`crate::DataType`] onto the corresponding polars data type.
///
/// `Vec` becomes a `List` with an unknown inner type and `Map` becomes a
/// `Struct` without fields; `Unknown` maps to polars `Null`.
#[cfg(feature = "polars-df")]
pub fn polars_dtype(dtype: crate::DataType) -> polars::prelude::DataType {
    use crate::DataType as Src;
    use polars::prelude::DataType as Dst;
    use polars::prelude::UnknownKind;

    match dtype {
        Src::Bool => Dst::Boolean,
        Src::U32 => Dst::UInt32,
        Src::I32 => Dst::Int32,
        Src::U8 => Dst::UInt8,
        Src::U64 => Dst::UInt64,
        Src::I64 => Dst::Int64,
        Src::F32 => Dst::Float32,
        Src::F64 => Dst::Float64,
        Src::U128 => Dst::UInt128,
        Src::I128 => Dst::Int128,
        Src::String => Dst::String,
        Src::Bytes => Dst::Binary,
        Src::Unknown => Dst::Null,
        Src::Vec => Dst::List(Box::new(Dst::Unknown(UnknownKind::Any))),
        Src::Map => Dst::Struct(Vec::new()),
    }
}
228
/// Converts a [`DataValue`] into an owned polars `AnyValue`.
///
/// The value is first passed through `convert_dv_to_dtype` with `key`, then
/// mapped variant by variant. Lists and maps are converted recursively:
/// * `Vec` becomes a `List` series whose element key uses the first detectable
///   element dtype (`Unknown` when none is detectable);
/// * `Map` becomes a `StructOwned` whose fields are ordered by key name.
///
/// # Panics
/// Panics when polars cannot build a series from the converted list elements.
#[cfg(feature = "polars-df")]
pub fn into_polars_value(key: &Key, dv: DataValue) -> polars::prelude::AnyValue<'static> {
    use polars::prelude::AnyValue::*;
    use polars::prelude::Field;

    use crate::dataframe::column_store::convert_dv_to_dtype;
    let dv = convert_dv_to_dtype(key, dv);
    match dv {
        DataValue::String(smart_string) => StringOwned(smart_string.as_str().into()),
        DataValue::Bytes(items) => BinaryOwned(items),
        // NOTE(review): U8 is widened to UInt32 here, while `polars_dtype` maps
        // U8 to UInt8 and `from_polars_value` maps UInt8 back to U8 — confirm
        // the widening is intended.
        DataValue::U8(x) => UInt32(x as _),
        DataValue::Bool(x) => Boolean(x),
        DataValue::I32(x) => Int32(x),
        DataValue::U32(x) => UInt32(x),
        DataValue::I64(x) => Int64(x),
        DataValue::U64(x) => UInt64(x),
        DataValue::I128(x) => Int128(x),
        DataValue::F32(x) => Float32(x),
        DataValue::F64(x) => Float64(x),
        DataValue::Null => Null,
        DataValue::Vec(data_values) => {
            // The first element with a detectable dtype decides the element key.
            let mut dt = crate::DataType::Unknown;
            for d in data_values.iter() {
                match crate::detect_dtype(d) {
                    crate::DataType::Unknown => continue,
                    e => {
                        dt = e;
                        break;
                    }
                }
            }
            let vec_key = Key::new(key.name(), dt);
            let s = polars::series::Series::from_any_values(
                key.name().into(),
                &data_values
                    .into_iter()
                    .map(|x| into_polars_value(&vec_key, x))
                    .collect::<Vec<_>>(),
                true,
            );
            // Build the panic message only on failure instead of on every call.
            List(s.unwrap_or_else(|e| panic!("Cannot create series for {key:?}: {e:?}")))
        }
        DataValue::EnumNumber(x) => Int32(x),
        DataValue::U128(x) => UInt128(x),
        DataValue::Map(x) => {
            // Take ownership of the entries and order them by key so the struct
            // layout is deterministic; this avoids a second lookup per key and
            // a clone of every value.
            let mut entries = x.into_iter().collect::<Vec<_>>();
            entries.sort_unstable_by(|a, b| a.0.cmp(&b.0));
            let mut values = Vec::with_capacity(entries.len());
            let mut fields = Vec::with_capacity(entries.len());
            for (name, value) in entries {
                let dtype = crate::detect_dtype(&value);
                let k = Key::new(&name, dtype);
                values.push(into_polars_value(&k, value));
                fields.push(Field::new(k.name().into(), polars_dtype(dtype)));
            }
            StructOwned(Box::new((values, fields)))
        }
    }
}
289
/// Converts a polars `AnyValue` into a [`DataValue`].
///
/// Narrow integers are widened (`UInt16` to `U32`, `Int8`/`Int16` to `I32`),
/// lists are converted element by element and owned structs become a map keyed
/// by field name. Any variant without an explicit arm is logged with
/// `tracing::warn!` and converted to `DataValue::Null`.
#[cfg(feature = "polars-df")]
pub fn from_polars_value(dv: polars::prelude::AnyValue<'_>) -> DataValue {
    use polars::prelude::AnyValue::*;
    match dv {
        Null => DataValue::Null,
        Boolean(v) => v.into(),
        String(v) => DataValue::String(v.into()),
        UInt8(v) => DataValue::U8(v),
        UInt16(v) => DataValue::U32(v as u32),
        UInt32(v) => v.into(),
        UInt64(v) => v.into(),
        Int8(v) => (v as i32).into(),
        Int16(v) => (v as i32).into(),
        Int32(v) => v.into(),
        Int64(v) => v.into(),
        Float32(v) => v.into(),
        Float64(v) => v.into(),
        Int128(v) => v.into(),
        // Counterpart of `into_polars_value`, which emits `UInt128` for
        // `DataValue::U128`; without this arm such values came back as `Null`.
        UInt128(v) => DataValue::U128(v),
        List(series) => DataValue::Vec(series.iter().map(from_polars_value).collect::<Vec<_>>()),
        // Array(series, _) => {
        //     DataValue::Vec(series.iter().map(from_polars_value).collect::<Vec<_>>())
        // }
        StringOwned(v) => DataValue::String(v.as_str().into()),
        Binary(v) => DataValue::Bytes(v.to_owned()),
        BinaryOwned(v) => DataValue::Bytes(v),
        StructOwned(m) => {
            // `m.0` holds the values and `m.1` the fields; pair them up by position.
            let mut hm: std::collections::HashMap<smartstring::alias::String, DataValue> =
                std::collections::HashMap::new();
            for (k, v) in m.1.into_iter().zip(m.0.into_iter()) {
                hm.insert(k.name.as_str().into(), from_polars_value(v));
            }
            DataValue::Map(hm)
        }
        e => {
            tracing::warn!("Unsupported polars value: {e:?}");
            DataValue::Null
        }
    }
}
329
330impl From<ColumnFrame> for DataFrame {
331    fn from(dataframe: ColumnFrame) -> Self {
332        Self::new(dataframe)
333    }
334}
335
336impl From<Vec<std::collections::HashMap<Key, DataValue>>> for DataFrame {
337    fn from(dataframe: Vec<std::collections::HashMap<Key, DataValue>>) -> Self {
338        Self::new(ColumnFrame::from(dataframe))
339    }
340}
341
342impl From<Vec<HashMap<Key, DataValue>>> for DataFrame {
343    fn from(dataframe: Vec<HashMap<Key, DataValue>>) -> Self {
344        Self::new(ColumnFrame::from(dataframe))
345    }
346}
347
348impl From<std::collections::HashMap<String, Vec<DataValue>>> for DataFrame {
349    fn from(dataframe: std::collections::HashMap<String, Vec<DataValue>>) -> Self {
350        Self::new(ColumnFrame::from(dataframe))
351    }
352}
353
354impl From<MLChefMap> for DataFrame {
355    fn from(dataframe: MLChefMap) -> Self {
356        Self::new(ColumnFrame::from(dataframe))
357    }
358}
359impl From<Vec<(Key, Vec<DataValue>)>> for DataFrame {
360    fn from(dataframe: Vec<(Key, Vec<DataValue>)>) -> Self {
361        Self::new(ColumnFrame::from(dataframe))
362    }
363}
364
365impl From<std::collections::HashMap<String, Array1<DataValue>>> for DataFrame {
366    fn from(dataframe: std::collections::HashMap<String, Array1<DataValue>>) -> Self {
367        Self::new(ColumnFrame::from(dataframe))
368    }
369}
370
/// Builds a frame from a polars `DataFrame` by converting it into a
/// [`ColumnFrame`] first.
#[cfg(feature = "polars-df")]
impl From<polars::prelude::DataFrame> for DataFrame {
    fn from(dataframe: polars::prelude::DataFrame) -> Self {
        let frame = ColumnFrame::from(dataframe);
        DataFrame::new(frame)
    }
}
377#[cfg(test)]
378mod test {
379    use crate::filter::FilterRules;
380
381    use super::*;
382    use halfbrown::hashmap;
383    #[cfg(feature = "polars-df")]
384    use polars::prelude::NamedFrom as _;
385    use rstest::*;
386    use tracing_test::traced_test;
387    #[fixture]
388    fn dummy_candidates() -> ColumnFrame {
389        ColumnFrame::from(vec![
390            hashmap! {
391                "key1".into() => 1.into(),
392                "key2".into() => "a".into(),
393            },
394            hashmap! {
395                "key1".into() => 2.into(),
396                "key2".into() => "b".into(),
397            },
398        ])
399    }
400
401    #[rstest]
402    fn test_serde() {
403        let df = crate::df! {
404            "a" => [1u64, 2u64, 3u64],
405            "b" => [4u64, 5u64, 6u64],
406            "c" => [7u64, 8u64, 9u64]
407        };
408
409        let serialized = serde_json::to_string(&df).expect("BUG: Unable to serialize dataframe");
410
411        let deserialized =
412            serde_json::from_str(&serialized).expect("BUG: Unable to deserialize dataframe");
413
414        assert_eq!(df, deserialized);
415    }
416
417    #[cfg(feature = "polars-df")]
418    #[rstest]
419    fn test_polars() {
420        let expected = crate::df! {
421            "a" => [1u64, 2u64, 3u64],
422            "b" => [4f64, 5f64, 6f64],
423            "c" => [7i64, 8i64, 9i64]
424        };
425
426        let polars_df = polars::df!(
427            "a" => [1u64, 2u64, 3u64],
428            "b" => [4f64, 5f64, 6f64],
429            "c" => [7i64, 8i64, 9i64]
430        )
431        .expect("BUG: should be ok");
432        let as_df: DataFrame = polars_df.into();
433        let keys: Vec<Key> = vec!["a".into(), "b".into(), "c".into()];
434        assert_eq!(
435            as_df.select(Some(keys.as_slice())),
436            expected.select(Some(keys.as_slice()))
437        );
438    }
439    #[cfg(feature = "polars-df")]
440    use crate::DataType;
441    #[cfg(feature = "polars-df")]
442    #[rstest]
443    #[case::str(Key::new("a", DataType::String), DataValue::String("test".into()), polars::prelude::AnyValue::String("test".into()))]
444    #[case::u32(
445        Key::new("a", DataType::U32),
446        DataValue::U32(u32::MAX),
447        polars::prelude::AnyValue::UInt32(u32::MAX)
448    )]
449    #[case::i32(
450        Key::new("a", DataType::I32),
451        DataValue::I32(i32::MIN),
452        polars::prelude::AnyValue::Int32(i32::MIN)
453    )]
454    #[case::i64(
455        Key::new("a", DataType::I64),
456        DataValue::I64(i64::MIN),
457        polars::prelude::AnyValue::Int64(i64::MIN)
458    )]
459    #[case::u64(
460        Key::new("a", DataType::U64),
461        DataValue::U64(u64::MIN),
462        polars::prelude::AnyValue::UInt64(u64::MIN)
463    )]
464    #[case::f32(
465        Key::new("a", DataType::F32),
466        DataValue::F32(f32::MIN),
467        polars::prelude::AnyValue::Float32(f32::MIN)
468    )]
469    #[case::f64(
470        Key::new("a", DataType::F64),
471        DataValue::F64(f64::MIN),
472        polars::prelude::AnyValue::Float64(f64::MIN)
473    )]
474    #[case::null(
475        Key::new("a", DataType::Unknown),
476        DataValue::Null,
477        polars::prelude::AnyValue::Null
478    )]
479    #[case::i128(
480        Key::new("a", DataType::I128),
481        DataValue::I128(i128::MIN),
482        polars::prelude::AnyValue::Int128(i128::MIN)
483    )]
484    #[case::u8(
485        Key::new("a", DataType::U8),
486        DataValue::U8(255),
487        polars::prelude::AnyValue::UInt8(255)
488    )]
489    #[case::bool(
490        Key::new("a", DataType::Bool),
491        DataValue::Bool(true),
492        polars::prelude::AnyValue::Boolean(true)
493    )]
494    #[case::bytes(Key::new("a", DataType::Bytes), DataValue::Bytes("aaaaa".as_bytes().to_vec()), polars::prelude::AnyValue::BinaryOwned("aaaaa".as_bytes().to_vec()))]
495    #[case::vec_uints(Key::new("a", DataType::Vec), DataValue::Vec(vec![DataValue::U32(0), DataValue::U32(1)]), polars::prelude::AnyValue::List(polars::series::Series::new("v".into(), vec![polars::prelude::AnyValue::UInt32(0u32), polars::prelude::AnyValue::UInt32(1)])))]
496    #[case::map(Key::new("a", DataType::Map), DataValue::Map(data_value::stdhashmap!("a" => 0u64, "b" => "s")), polars::prelude::AnyValue::StructOwned(Box::new((
497        vec![polars::prelude::AnyValue::UInt64(0u64), polars::prelude::AnyValue::String("s".into())],
498        vec![polars::prelude::Field::new("a".into(), polars::prelude::DataType::UInt64), polars::prelude::Field::new("b".into(), polars::prelude::DataType::String)]))))]
499    // polars converts all by first element type
500    // #[case::vec_diff_int(DataValue::Vec(vec![ DataValue::I32(1), DataValue::U32(0)]), polars::prelude::AnyValue::List(polars::series::Series::new("v".into(), vec![polars::prelude::AnyValue::Int32(1i32), polars::prelude::AnyValue::UInt32(0u32)])))]
501    //#[case::vec_int_str(DataValue::Vec(vec![DataValue::U32(0), DataValue::String("1".into())]), polars::prelude::AnyValue::List(polars::series::Series::new("v".into(), vec![polars::prelude::AnyValue::UInt32(0u32), polars::prelude::AnyValue::StringOwned("1".into())])))]
502    fn into_polars_value_test(
503        #[case] key: Key,
504        #[case] input: DataValue,
505        #[case] output: polars::prelude::AnyValue<'static>,
506    ) {
507        assert_eq!(into_polars_value(&key, input.clone()), output);
508        assert_eq!(from_polars_value(output), input);
509    }
510
511    // #[cfg(feature = "polars-df")]
512    // #[rstest]
513    // fn as_polars() {
514    //     let state = include_bytes!("../part_00330.dfb");
515    //     let df: Result<DataFrame, _> = rmp_serde::decode::from_slice(state);
516    //     assert!(df.is_ok());
517    //     let df = df.unwrap();
518    //     println!("{df}");
519    //     let polars_df = df.as_polars();
520    //     assert!(polars_df.is_ok(), "{polars_df:?}");
521    // }
522    #[rstest]
523    #[case(
524        DataFrame::new(crate::column_frame! {
525            "a" => [1f64, 2f64, 3f64],
526            "b" => [4i64, 5i64, 6i64],
527            "c" => [7i64, 8i64, 9i64]
528        }),
529        DataFrame::new(crate::column_frame! {
530            "a" => [1f64, 2f64],
531            "b" => [4i64, 5i64],
532            "c" => [7i64, 8i64]
533        }),
534        FilterRules::try_from("a >= 1f64 && (b <= 5 || c <= 8) && b >= 4").expect("BUG: cannot create filter rules"),
535    )]
536    #[case(
537        DataFrame::new(crate::column_frame! {
538            "a" => [1f64, 2f64, 3f64],
539            "b" => [4i64, 5i64, 6i64],
540            "c" => [7i64, 8i64, 9i64]
541        }),
542        DataFrame::new(crate::column_frame! {
543            "a" => [2f64],
544            "b" => [5i64],
545            "c" => [8i64]
546        }),
547        FilterRules::try_from("a % 2f64 == 0f64").expect("BUG: cannot create filter rules"),
548    )]
549    #[traced_test]
550    fn filter_test(
551        #[case] df: DataFrame,
552        #[case] expected: DataFrame,
553        #[case] filter: FilterRules,
554    ) {
555        let filtered = df.filter(&filter).expect("BUG: cannot filter");
556        assert_eq!(filtered, expected);
557    }
558
559    #[rstest]
560    fn test_serde_complex() {
561        let simple = r#"
562{
563    "constants": {},
564    "dataframe": {
565        "index": {
566            "keys": [
567                {
568                    "key": 3162770485,
569                    "name": "a",
570                    "ctype": "U32"
571                },
572                {
573                    "key": 2279056742,
574                    "name": "b",
575                    "ctype": "F64"
576                },
577                {
578                    "key": 2994984227,
579                    "name": "c",
580                    "ctype": "U64"
581                },
582                {
583                    "key": 3319645144,
584                    "name": "d",
585                    "ctype": "F64"
586                },
587                {
588                    "key": 1291847470,
589                    "name": "e",
590                    "ctype": "U32"
591                },
592                {
593                    "key": 874241070,
594                    "name": "f",
595                    "ctype": "Bool"
596                }
597            ],
598            "indexes": {
599                "a": 0,
600                "b": 1,
601                "c": 2,
602                "d": 3,
603                "e": 4,
604                "f": 5
605            },
606            "alias": {}
607        },
608        "data_frame": {
609            "v": 1,
610            "dim": [
611                2,
612                6
613            ],
614            "data": [
615                253780,
616                0.009369421750307085,
617                1633222860381359,
618                8,
619                5,
620                true,
621                64512,
622                0.003391335718333721,
623                1633222860810557,
624                8,
625                5,
626                null
627            ]
628        }
629    },
630    "metadata": {}
631}
632        "#;
633
634        let simple_deserialized: DataFrame =
635            serde_json::from_str(simple).expect("BUG: Unable to deserialize dataframe");
636
637        println!("deserialized: {simple_deserialized:?}");
638        let array = format!("[{}, {}, {}]", simple, simple, simple);
639        let deserialized: Vec<DataFrame> =
640            serde_json::from_str(&array).expect("BUG: Unable to deserialize dataframe");
641
642        println!("deserialized: {deserialized:?}");
643        assert_eq!(deserialized.len(), 3);
644        assert_eq!(simple_deserialized, deserialized[0]);
645    }
646
647    #[rstest]
648    #[case(hashmap!("key1".into() => vec![1.into(), 2.into()], "key2".into() => vec!["a".into()]))]
649    #[case(data_value::stdhashmap!("key1" => vec![1, 2], "key2" => vec!["a"]))]
650    #[case(vec![hashmap! {
651        "key1".into() => 1.into(),
652        "key2".into() => "a".into(),
653    },
654    hashmap! {
655        "key1".into() => 2.into(),
656    },])]
657    #[case(vec![data_value::stdhashmap! {
658        "key1" => DataValue::from(1),
659        "key2" => DataValue::from("a"),
660    },data_value::stdhashmap! {
661        "key1" => DataValue::from(2),
662    },])]
663    #[case(vec![("key1".into(), vec! [DataValue::from(1), DataValue::from(2)]), ("key2".into(),
664    vec![DataValue::from("a"), DataValue::Null])])]
665    fn test_select_column<T: Into<DataFrame>>(#[case] input: T) {
666        let df: DataFrame = input.into();
667        assert_eq!(
668            df,
669            DataFrame {
670                constants: HashMap::new(),
671                dataframe: ColumnFrame::from(vec![
672                    hashmap! {
673                        "key1".into() => 1.into(),
674                        "key2".into() => "a".into(),
675                    },
676                    hashmap! {
677                        "key1".into() => 2.into(),
678                    },
679                ]),
680                metadata: HashMap::new(),
681            }
682        );
683        let selected_transposed = df.select_column("key1".into());
684        assert!(selected_transposed.is_some());
685        let selected_transposed = selected_transposed.unwrap();
686        assert_eq!(selected_transposed.len(), 2);
687        assert_eq!(selected_transposed, ndarray::array![1.into(), 2.into()]);
688    }
689
690    #[rstest]
691    #[case::hhm(hashmap!("key1".into() => vec![1.into(), 2.into()], "key2".into() => vec!["a".into()]))]
692    #[case::stdhm(data_value::stdhashmap!("key1" => vec![1, 2], "key2" => vec!["a"]))]
693    #[case::hm({
694        let hm: std::collections::HashMap<String, Array1<DataValue>> = data_value::stdhashmap!("key1".to_string() => Array1::from_vec(vec![DataValue::from(1), DataValue::from(2)]), "key2".to_string() => Array1::from_vec(vec![DataValue::from("a"), DataValue::Null]));
695        hm
696    })]
697    #[case::vec_hhm(vec![hashmap! {
698        "key1".into() => 1.into(),
699        "key2".into() => "a".into(),
700    },
701    hashmap! {
702        "key1".into() => 2.into(),
703    },])]
704    #[case::vec_hme(vec![data_value::stdhashmap! {
705        "key1" => DataValue::from(1),
706        "key2" => DataValue::from("a"),
707    },data_value::stdhashmap! {
708        "key1" => DataValue::from(2),
709    },])]
710    #[case::vec_vec(vec![("key1".into(), vec! [DataValue::from(1), DataValue::from(2)]), ("key2".into(), vec![DataValue::from("a"), DataValue::Null])])]
711    fn test_from_conversion<T: Into<DataFrame>>(#[case] input: T) {
712        let df: DataFrame = input.into();
713        let expected: DataFrame = DataFrame {
714            constants: HashMap::new(),
715            dataframe: ColumnFrame::from(vec![
716                hashmap! {
717                    "key1".into() => 1.into(),
718                    "key2".into() => "a".into(),
719                },
720                hashmap! {
721                    "key1".into() => 2.into(),
722                },
723            ]),
724            metadata: HashMap::new(),
725        };
726        assert_eq!(
727            df.select(Some(&["key1".into(), "key2".into()])),
728            expected.select(Some(&["key1".into(), "key2".into()])),
729            "{df} vs {expected}"
730        );
731        let selected_transposed = df.select_transposed_typed::<i32>(&["key1".into()]);
732        assert_eq!(selected_transposed.len(), 2);
733        println!("{:?}", selected_transposed);
734        assert_eq!(selected_transposed, vec![vec![1], vec![2]]);
735    }
736    #[rstest]
737    fn test_dataframe(dummy_candidates: ColumnFrame) {
738        let mut dataframe: DataFrame = DataFrame::default();
739        assert!(dataframe.is_empty());
740        assert!(dataframe.extend(dummy_candidates.into()).is_ok());
741        assert_eq!(dataframe.len(), 2);
742
743        let candidate = hashmap! {
744            "key1".into() => 3.into(),
745            "key2".into() => "c".into(),
746        };
747
748        assert!(dataframe.push(candidate).is_ok());
749        assert_eq!(dataframe.len(), 3);
750        assert!(!dataframe.is_empty());
751
752        dataframe.insert_constant("key3".into(), 4.into());
753        assert_eq!(dataframe.constants.len(), 1);
754        assert!(dataframe
755            .apply_function(&["key1".into()], |keys, df| {
756                let key = keys[0].clone();
757                let s = df
758                    .get_single_column(&key)
759                    .expect("BUG: Cannot get column")
760                    .to_owned();
761                let s = s.mapv(|x| x + DataValue::from(1));
762                df.add_single_column("key5", s)?;
763                Ok(())
764            })
765            .is_ok());
766        let original = dataframe.clone();
767        dataframe.shrink();
768        let remove_df = dataframe.remove_column(&["key1".into()]);
769        assert!(remove_df.is_ok());
770        let mut remove_df = remove_df.unwrap();
771        assert_eq!(remove_df.len(), 3);
772        let selected = dataframe.select(Some(&["key2".into()]));
773        assert!(selected.is_ok());
774        let selected = selected.unwrap();
775        println!("{:?}", selected);
776
777        // fixme later
778        let joined_result =
779            remove_df.join(dataframe, &JoinRelation::new(crate::JoinBy::AddColumns));
780        assert!(joined_result.is_ok(), "{:?}", joined_result);
781        let keys = vec!["key1".into(), "key2".into(), "key5".into()];
782        assert_eq!(
783            original.select(Some(keys.as_slice())),
784            remove_df.select(Some(keys.as_slice()))
785        );
786    }
787
788    #[rstest]
789    fn test_size_methods() {
790        let candidate = hashmap! {
791            "key1".into() => 3.into(),
792            "key2".into() => "c".into(),
793            "key3".into() => false.into()
794        };
795
796        let dataframe: DataFrame = vec![candidate].into();
797
798        assert_eq!(dataframe.n_columns(), 3);
799        assert_eq!(dataframe.n_rows(), 1);
800    }
801
802    #[rstest]
803    fn test_metadata(dummy_candidates: ColumnFrame) {
804        let mut dataframe: DataFrame = DataFrame::default();
805        assert!(dataframe.is_empty());
806        println!("{:?}", dataframe);
807        assert!(dataframe.extend(dummy_candidates.into()).is_ok());
808        println!("{:?}", dataframe);
809        assert_eq!(dataframe.len(), 2);
810
811        dataframe.add_metadata("test".into(), 1.into());
812        assert_eq!(dataframe.get_metadata("test"), Some(&1.into()));
813        let dataframe = DataFrame::new(ColumnFrame::from(vec![
814            hashmap! {
815                "key1".into() => 1.into(),
816                "key2".into() => "a".into(),
817            },
818            hashmap! {
819                "key1".into() => 2.into(),
820                "key2".into() => "b".into(),
821            },
822        ]));
823        assert_eq!(dataframe.get_metadata("test"), None);
824        let tt = dataframe.select_transposed(None);
825        assert!(tt.is_ok());
826        let tt = tt.unwrap();
827        assert_eq!(tt.shape(), [2, 2]);
828        assert_eq!(
829            tt,
830            Array2::from_shape_vec((2, 2), vec![1.into(), 2.into(), "a".into(), "b".into()])
831                .unwrap()
832        );
833    }
834
835    #[rstest]
836    #[traced_test]
837    fn add_single_column_test() {
838        let mut dataframe = DataFrame::default();
839        let values = Array1::from(vec![1.into(), 2.into(), 3.into()]);
840        let r = dataframe.add_single_column("key1", values);
841        assert!(r.is_ok(), "{r:?}");
842        let selected = dataframe.select(None);
843        assert!(selected.is_ok());
844        let selected = selected.unwrap();
845        assert_eq!(selected.shape(), [3, 1]);
846        assert_eq!(
847            selected,
848            Array2::from_shape_vec((3, 1), vec![1.into(), 2.into(), 3.into()]).unwrap()
849        );
850        let values = Array1::from(vec![1.into(), 2.into()]);
851        assert!(dataframe.add_single_column("key1", values).is_err());
852        let values = Array1::from(vec![3.into(), 4.into(), 5.into()]);
853        assert!(dataframe.add_single_column("key2", values).is_ok());
854        let values = Array1::from(vec![3.into()]);
855        assert!(dataframe.add_single_column("key3", values).is_err());
856    }
857
858    #[rstest]
859    #[traced_test]
860    fn add_single_column_empty_test() {
861        let mut dataframe = DataFrame::default();
862        let values = Array1::from(vec![]);
863        let r = dataframe.add_single_column("key1", values);
864        assert!(r.is_ok(), "{r:?}");
865        let selected = dataframe.select(None);
866        assert!(selected.is_ok());
867        let selected = selected.unwrap();
868        assert_eq!(selected.shape(), [0, 1]);
869        assert_eq!(selected, Array2::from_shape_vec((0, 1), vec![]).unwrap());
870        let values = Array1::from(vec![1.into(), 2.into()]);
871        assert!(dataframe.add_single_column("key1", values).is_err());
872        let values = Array1::from(vec![3.into(), 4.into(), 5.into()]);
873        assert!(dataframe.add_single_column("key2", values).is_ok());
874        let values = Array1::from(vec![3.into(), 4.into()]);
875        assert!(dataframe.add_single_column("key3", values).is_err());
876        let values = Array1::from(vec![3.into(), 4.into(), 5.into()]);
877        assert!(dataframe.add_single_column("key3", values).is_ok());
878
879        assert_eq!(
880            dataframe
881                .select_column("key1".into())
882                .expect("BUG: has to exists"),
883            ndarray::arr1(&[DataValue::Null, DataValue::Null, DataValue::Null]),
884        );
885        assert_eq!(
886            dataframe
887                .select_column("key2".into())
888                .expect("BUG: has to exists"),
889            ndarray::arr1(&[3.into(), 4.into(), 5.into()]),
890        );
891        assert_eq!(
892            dataframe.select(None).expect("BUG: cannot get data"),
893            ndarray::arr2(&[
894                [DataValue::Null, 3.into(), 3.into()],
895                [DataValue::Null, 4.into(), 4.into()],
896                [DataValue::Null, 5.into(), 5.into()],
897            ])
898        );
899    }
900
901    #[rstest]
902    #[case(
903        DataFrame::new(ColumnFrame::from(vec![
904            hashmap! {
905                "k".into() => 1.into(),
906                "k2".into() => 2.into(),
907                "k3".into() => 2.2.into(),
908            },
909            hashmap! {
910                "k".into() => 11.into(),
911                "k2".into() => 3.into(),
912            },
913            hashmap! {
914                "k".into() => 4.into(),
915                "k2".into() => 5.into(),
916                "k3".into() => 2.3.into(),
917            },
918            hashmap! {
919                "k".into() => 4.into(),
920                "k2".into() => 5.into(),
921                "k3".into() => 2.4.into(),
922            },
923        ])),
924        vec!["k".into(), "k2".into()],
925        Array2::from_shape_vec((4, 2), vec![1.into(), 2.into(), 11.into(), 3.into(), 4.into(), 5.into(), 4.into(), 5.into()]).unwrap()
926    )]
927    #[case(
928        DataFrame::new(ColumnFrame::from(vec![
929            hashmap! {
930                "k".into() => 1.into(),
931                "k2".into() => 2.into(),
932                "k3".into() => 2.2.into(),
933            },
934            hashmap! {
935                "k".into() => 11.into(),
936                "k2".into() => 3.into(),
937            },
938            hashmap! {
939                "k".into() => 4.into(),
940                "k2".into() => 5.into(),
941                "k3".into() => 2.3.into(),
942            },
943            hashmap! {
944                "k".into() => 4.into(),
945                "k2".into() => 5.into(),
946                "k3".into() => 2.4.into(),
947            },
948        ])),
949        vec!["k2".into(), "k3".into(), "nonexist1".into(), "nonexists2".into(), "k".into()],
950        Array2::from_shape_vec((4, 5), vec![
951            2.into(), 2.2.into(), DataValue::Null, DataValue::Null, 1.into(),
952            3.into(), DataValue::Null, DataValue::Null, DataValue::Null, 11.into(),
953            5.into(), 2.3.into(),  DataValue::Null, DataValue::Null, 4.into(),
954            5.into(), 2.4.into(), DataValue::Null, DataValue::Null, 4.into()]).unwrap()
955    )]
956    #[traced_test]
957    fn select_multiple(
958        #[case] input: DataFrame,
959        #[case] columns: Vec<Key>,
960        #[case] expected: Array2<DataValue>,
961    ) {
962        let selected = input.select(Some(&columns));
963        assert!(selected.is_ok());
964        let selected = selected.unwrap();
965
966        assert_eq!(selected, expected);
967    }
968
969    #[rstest]
970    #[case(
971        DataFrame::new(ColumnFrame::from(vec![
972            hashmap! {
973                "k".into() => 1.into(),
974                "k2".into() => 2.into(),
975                "k3".into() => 2.2.into(),
976            },
977            hashmap! {
978                "k".into() => 11.into(),
979                "k2".into() => 3.into(),
980            },
981            hashmap! {
982                "k".into() => 4.into(),
983                "k2".into() => 5.into(),
984                "k3".into() => 2.3.into(),
985            },
986            hashmap! {
987                "k".into() => 4.into(),
988                "k2".into() => 5.into(),
989                "k3".into() => 2.4.into(),
990            },
991        ])),
992        "k".into(),
993        Array2::from_shape_vec((4, 3), vec![
994            1.into(), 2.into(), 2.2.into(),
995            4.into(), 5.into(), 2.3.into(),
996            4.into(), 5.into(), 2.4.into(),
997            11.into(), 3.into(), DataValue::Null,
998            ]
999        ).unwrap(),
1000        vec!["k".into(), "k2".into(), "k3".into()],
1001    )]
1002    #[rstest]
1003    #[case(
1004        DataFrame::new(ColumnFrame::from(vec![
1005            hashmap! {
1006                "k".into() => 1.into(),
1007                "k2".into() => 2.into(),
1008                "k3".into() => 2.2.into(),
1009            },
1010            hashmap! {
1011                "k".into() => 11.into(),
1012                "k2".into() => 3.into(),
1013            },
1014            hashmap! {
1015                "k".into() => 4.into(),
1016                "k2".into() => 5.into(),
1017                "k3".into() => 2.3.into(),
1018            },
1019            hashmap! {
1020                "k".into() => 4.into(),
1021                "k2".into() => 5.into(),
1022                "k3".into() => 2.4.into(),
1023            },
1024        ])),
1025        "k3".into(),
1026        Array2::from_shape_vec((4, 3), vec![
1027            11.into(), 3.into(), DataValue::Null,
1028            1.into(), 2.into(), 2.2.into(),
1029            4.into(), 5.into(), 2.3.into(),
1030            4.into(), 5.into(), 2.4.into(),
1031            ]
1032        ).unwrap(),
1033        vec!["k".into(), "k2".into(), "k3".into()],
1034    )]
1035    #[case(
1036        DataFrame::new(ColumnFrame::from(vec![
1037            hashmap! {
1038                "k".into() => 2.into(),
1039                "k2".into() => 0.000001.into(),
1040            },
1041            hashmap! {
1042                "k".into() => 1.into(),
1043                "k2".into() =>0.0000001.into(),
1044            },
1045            hashmap! {
1046                "k".into() => 3.into(),
1047                "k2".into() => 0.00001.into(),
1048            },
1049            hashmap! {
1050                "k".into() => 4.into(),
1051                "k2".into() => 0.001.into(),
1052            },
1053        ])),
1054        "k2".into(),
1055        Array2::from_shape_vec((4, 2), vec![
1056            1.into(), 0.0000001.into(),
1057            2.into(), 0.000001.into(),
1058            3.into(), 0.00001.into(),
1059            4.into(), 0.001.into(),
1060            ]
1061        ).unwrap(),
1062        vec!["k".into(), "k2".into()],
1063    )]
1064    #[case(
1065        DataFrame::new(ColumnFrame::from(vec![
1066            hashmap! {
1067                "k".into() => 2.into(),
1068                "k2".into() => "b".into(),
1069            },
1070            hashmap! {
1071                "k".into() => 1.into(),
1072                "k2".into() =>"a".into(),
1073            },
1074            hashmap! {
1075                "k".into() => 3.into(),
1076                "k2".into() =>"c".into(),
1077            },
1078            hashmap! {
1079                "k".into() => 4.into(),
1080                "k2".into() =>"z".into(),
1081            },
1082        ])),
1083        "k2".into(),
1084        Array2::from_shape_vec((4, 2), vec![
1085            1.into(),"a".into(),
1086            2.into(), "b".into(),
1087            3.into(), "c".into(),
1088            4.into(), "z".into(),
1089            ]
1090        ).unwrap(),
1091        vec!["k".into(), "k2".into()],
1092    )]
1093    #[traced_test]
1094    fn sort_by(
1095        #[case] input: DataFrame,
1096        #[case] column: Key,
1097        #[case] expected: Array2<DataValue>,
1098        #[case] columns: Vec<Key>,
1099    ) {
1100        let result = input.sorted(&column);
1101        assert!(result.is_ok(), "{result:?}");
1102        let result = result.unwrap().get_sorted();
1103        let selected = result.select(Some(&columns));
1104
1105        assert_eq!(selected, expected);
1106    }
1107    #[rstest]
1108    #[case(
1109        DataFrame::new(ColumnFrame::from(vec![
1110            hashmap! {
1111                "k".into() => 2.into(),
1112                "k2".into() => 0.000001.into(),
1113            },
1114            hashmap! {
1115                "k".into() => 1.into(),
1116                "k2".into() =>0.0000001.into(),
1117            },
1118            hashmap! {
1119                "k".into() => 3.into(),
1120                "k2".into() => 0.00001.into(),
1121            },
1122            hashmap! {
1123                "k".into() => 4.into(),
1124                "k2".into() => 0.001.into(),
1125            },
1126        ])),
1127        "k2".into(),
1128        TopN::Last(1),
1129        Array2::from_shape_vec((1, 2), vec![
1130            4.into(), 0.001.into(),
1131            ]
1132        ).unwrap(),
1133        vec!["k".into(), "k2".into()],
1134    )]
1135    #[case(
1136        DataFrame::new(ColumnFrame::from(vec![
1137            hashmap! {
1138                "k".into() => 2.into(),
1139                "k2".into() => 0.000001.into(),
1140            },
1141            hashmap! {
1142                "k".into() => 1.into(),
1143                "k2".into() =>0.0000001.into(),
1144            },
1145            hashmap! {
1146                "k".into() => 3.into(),
1147                "k2".into() => 0.00001.into(),
1148            },
1149            hashmap! {
1150                "k".into() => 4.into(),
1151                "k2".into() => 0.001.into(),
1152            },
1153        ])),
1154        "k2".into(),
1155        TopN::Last(2),
1156        Array2::from_shape_vec((2, 2), vec![
1157            4.into(), 0.001.into(),
1158            3.into(), 0.00001.into(),
1159            ]
1160        ).unwrap(),
1161        vec!["k".into(), "k2".into()],
1162    )]
1163    #[case(
1164        DataFrame::new(ColumnFrame::from(vec![
1165            hashmap! {
1166                "k".into() => 2.into(),
1167                "k2".into() => "b".into(),
1168            },
1169            hashmap! {
1170                "k".into() => 1.into(),
1171                "k2".into() =>"a".into(),
1172            },
1173            hashmap! {
1174                "k".into() => 3.into(),
1175                "k2".into() =>"c".into(),
1176            },
1177            hashmap! {
1178                "k".into() => 4.into(),
1179                "k2".into() =>"z".into(),
1180            },
1181        ])),
1182        "k2".into(),
1183        TopN::First(1),
1184        Array2::from_shape_vec((1, 2), vec![
1185            1.into(),"a".into(),
1186            ]
1187        ).unwrap(),
1188        vec!["k".into(), "k2".into()],
1189    )]
1190    #[case(
1191        DataFrame::new(ColumnFrame::from(vec![
1192            hashmap! {
1193                "k".into() => 2.into(),
1194                "k2".into() => "b".into(),
1195            },
1196            hashmap! {
1197                "k".into() => 1.into(),
1198                "k2".into() =>"a".into(),
1199            },
1200            hashmap! {
1201                "k".into() => 3.into(),
1202                "k2".into() =>"c".into(),
1203            },
1204            hashmap! {
1205                "k".into() => 4.into(),
1206                "k2".into() =>"z".into(),
1207            },
1208        ])),
1209        "k2".into(),
1210        TopN::First(2),
1211        Array2::from_shape_vec((2, 2), vec![
1212            1.into(),"a".into(),
1213            2.into(),"b".into(),
1214            ]
1215        ).unwrap(),
1216        vec!["k".into(), "k2".into()],
1217    )]
1218    #[traced_test]
1219    fn top_n(
1220        #[case] input: DataFrame,
1221        #[case] column: Key,
1222        #[case] topn: TopN,
1223        #[case] expected: Array2<DataValue>,
1224        #[case] columns: Vec<Key>,
1225    ) {
1226        let result = input.sorted(&column);
1227        assert!(result.is_ok(), "{result:?}");
1228        let result = result.unwrap();
1229        let first = result.topn(topn).unwrap();
1230        let selected = first.select(Some(&columns));
1231        assert_eq!(selected, expected);
1232    }
1233}