Skip to main content

trs_dataframe/
dataframe.rs

1use column_store::sorted_df::SortedDataFrame;
2use data_value::{DataValue, Extract};
3use halfbrown::HashMap;
4use ndarray::{Array1, Array2, ArrayView1};
5use std::fmt;
6pub mod column_store;
7pub mod index;
8pub mod join;
9pub mod key;
10use crate::{error::Error, CandidateData};
11#[cfg(feature = "python")]
12pub mod python;
13
14#[cfg(feature = "python")]
15use pyo3::prelude::*;
16
17use crate::{
18    dataframe::{column_store::ColumnFrame, join::JoinRelation, key::Key},
19    MLChefMap,
20};
21
/// Selector that carries a count for either the leading or the trailing end.
///
/// NOTE(review): nothing in the visible part of this file uses this type;
/// presumably it picks the first/last N rows of a frame — confirm against callers.
#[derive(Debug, Clone, PartialEq, Eq, Copy)]
pub enum TopN {
    /// Count associated with the "first" end.
    First(usize),
    /// Count associated with the "last" end.
    Last(usize),
}
27
/// DataFrame holds information about [`ColumnFrame`].
/// This is used to store the data and the metadata for the candidates.
///
/// # Columns Storage
/// The underlying data is stored in row-major order using ndarray's Array2.
/// Use `select()` for row-oriented access and `select_transposed()` for column-oriented access.
///
/// # Example
/// ```
/// use trs_dataframe::{DataFrame, column_frame};
///
/// let df = DataFrame::new(column_frame! {
///     "a" => [1, 2, 3],
///     "b" => [4, 5, 6]
/// });
///
/// // Get all data as a 2D array (rows x columns); `select` returns a `Result`
/// let all_data = df.select(None);
///
/// // Get specific columns
/// let keys = vec!["a".into(), "b".into()];
/// let selected = df.select(Some(&keys));
/// ```
#[derive(Debug, Clone, PartialEq, Eq, Default, serde::Serialize, serde::Deserialize)]
#[cfg_attr(feature = "python", pyclass)]
pub struct DataFrame {
    /// Constants for the dataframe - a micro-optimization for the data.
    /// Values that are constant for the whole dataframe are stored here.
    /// These values are applied to all rows without storing them per-row.
    pub constants: HashMap<Key, DataValue>,
    /// Internal columnar storage for row data
    pub dataframe: ColumnFrame,
    /// Metadata for the dataframe. Here you can store information about the dataframe.
    /// This is user-defined key-value metadata that doesn't affect data operations.
    pub metadata: HashMap<String, DataValue>,
}
64
/// Formats the frame by delegating to the `fmt` of the inner [`ColumnFrame`].
/// `constants` and `metadata` are not written by this implementation.
impl fmt::Display for DataFrame {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Forward verbatim to the column storage's own formatter.
        self.dataframe.fmt(f)
    }
}
70
71impl DataFrame {
72    pub fn new<C: Into<ColumnFrame>>(dataframe: C) -> Self {
73        Self {
74            constants: HashMap::new(),
75            dataframe: dataframe.into(),
76            metadata: HashMap::new(),
77        }
78    }
79
80    /// Returns the number of columns which dataframe contains.
81    pub fn n_columns(&self) -> usize {
82        self.dataframe.data_frame.ncols()
83    }
84
85    /// Returns the number of rows which dataframe contains.
86    pub fn n_rows(&self) -> usize {
87        self.dataframe.data_frame.nrows()
88    }
89
90    pub fn shrink(&mut self) {
91        self.dataframe.shrink();
92    }
93
94    pub fn add_metadata(&mut self, key: String, value: DataValue) {
95        self.metadata.insert(key, value);
96    }
97
98    pub fn get_metadata(&self, key: &str) -> Option<&DataValue> {
99        self.metadata.get(key)
100    }
101
102    pub fn join(&mut self, other: Self, join_type: &JoinRelation) -> Result<(), Error> {
103        for (key, value) in other.constants {
104            self.constants.insert(key, value);
105        }
106        self.dataframe.join(other.dataframe, join_type)
107    }
108
109    pub fn apply_function<F>(&mut self, keys: &[Key], mut func: F) -> Result<(), Error>
110    where
111        F: FnMut(&[Key], &mut ColumnFrame) -> Result<(), Error>,
112    {
113        self.dataframe.apply_function(keys, &mut func)
114    }
115
116    pub fn select(&self, keys: Option<&[Key]>) -> Result<Array2<DataValue>, Error> {
117        Ok(self.dataframe.select(keys))
118    }
119
120    // pub fn select_view(&self, keys: Option<&[Key]>) -> Result<ArrayView2<'_, DataValue>, Error> {
121    //     Ok(self.dataframe.select_view(keys))
122    // }
123
124    pub fn select_transposed_typed<D: Extract>(&self, keys: &[Key]) -> Vec<Vec<D>> {
125        self.dataframe.select_transposed_typed::<D>(keys)
126    }
127
128    pub fn select_column(&self, key: Key) -> Option<ndarray::ArrayView1<'_, DataValue>> {
129        self.dataframe.select_column(&key)
130    }
131
132    pub fn select_transposed(&self, keys: Option<&[Key]>) -> Result<Array2<DataValue>, Error> {
133        self.dataframe.select_transposed(keys)
134    }
135
136    pub fn insert_constant(&mut self, key: Key, value: DataValue) {
137        self.constants.insert(key, value);
138    }
139
140    pub fn push<C: CandidateData>(&mut self, item: C) -> Result<(), Error> {
141        self.dataframe.push(item)
142    }
143
144    pub fn remove_column(&mut self, keys: &[Key]) -> Result<Self, Error> {
145        self.dataframe.remove_column(keys).map(|x| x.into())
146    }
147
148    pub fn extend(&mut self, items: Self) -> Result<(), Error> {
149        self.dataframe.extend(items.dataframe)
150    }
151
152    pub fn len(&self) -> usize {
153        self.dataframe.len()
154    }
155
156    pub fn is_empty(&self) -> bool {
157        self.dataframe.is_empty()
158    }
159
160    pub fn add_single_column<K: Into<Key>>(
161        &mut self,
162        key: K,
163        values: Array1<DataValue>,
164    ) -> Result<(), Error> {
165        self.dataframe.add_single_column(key, values)
166    }
167
168    pub fn get_single_column(&self, key: &Key) -> Option<ArrayView1<'_, DataValue>> {
169        self.dataframe.get_single_column(key)
170    }
171
172    pub fn sorted(&self, key: &Key) -> Result<SortedDataFrame<'_>, Error> {
173        self.dataframe.sorted(key)
174    }
175
176    pub fn filter(&self, filter: &crate::filter::FilterRules) -> Result<Self, Error> {
177        let filtered_df = self.dataframe.filter(filter)?;
178        Ok(Self {
179            constants: self.constants.clone(),
180            dataframe: filtered_df,
181            metadata: self.metadata.clone(),
182        })
183    }
184
185    #[cfg(feature = "polars-df")]
186    pub fn as_polars(&self) -> Result<polars::prelude::DataFrame, Error> {
187        let mut columns = vec![];
188        for key in self.dataframe.keys() {
189            let values = self
190                .dataframe
191                .get_single_column(key)
192                .ok_or_else(|| Error::NotFound(key.clone()))?
193                .into_iter()
194                .map(|x| into_polars_value(key, x.clone()))
195                .collect::<Vec<_>>();
196            let s = polars::prelude::Column::new(key.name().into(), values);
197
198            columns.push(s);
199        }
200
201        Ok(polars::prelude::DataFrame::new(columns)?)
202    }
203
204    pub fn load_from_messagepack(bytes: &[u8]) -> Result<Self, Error> {
205        rmp_serde::decode::from_slice(bytes).map_err(|e| Error::UnknownError(format!("{e:?}")))
206    }
207
208    pub fn store_into_messagepack(&self) -> Result<Vec<u8>, Error> {
209        rmp_serde::encode::to_vec(&self).map_err(|e| Error::UnknownError(format!("{e:?}")))
210    }
211}
/// Maps this crate's [`crate::DataType`] onto the polars data type used to
/// represent it.
///
/// `Vec` becomes a list with an unknown inner type and `Map` becomes a struct
/// without fields.
#[cfg(feature = "polars-df")]
pub fn polars_dtype(dtype: crate::DataType) -> polars::prelude::DataType {
    use crate::DataType as Src;
    use polars::prelude::{DataType as Pl, UnknownKind};

    match dtype {
        Src::Bool => Pl::Boolean,
        Src::U32 => Pl::UInt32,
        Src::I32 => Pl::Int32,
        Src::U8 => Pl::UInt8,
        Src::U64 => Pl::UInt64,
        Src::I64 => Pl::Int64,
        Src::F32 => Pl::Float32,
        Src::F64 => Pl::Float64,
        Src::U128 => Pl::UInt128,
        Src::I128 => Pl::Int128,
        Src::String => Pl::String,
        Src::Bytes => Pl::Binary,
        Src::Unknown => Pl::Null,
        Src::Vec => Pl::List(Box::new(Pl::Unknown(UnknownKind::Any))),
        Src::Map => Pl::Struct(Vec::new()),
    }
}
236
/// Converts a [`DataValue`] into an owned polars `AnyValue`.
///
/// The value is first passed through `convert_dv_to_dtype` together with `key`
/// and the result is mapped variant by variant:
///
/// * `U8` maps to `UInt8`, matching [`polars_dtype`] and [`from_polars_value`].
/// * `EnumNumber` maps to `Int32`.
/// * `Vec` becomes a `List`; its element key uses the first element dtype
///   that is not `Unknown`.
/// * `Map` becomes a `StructOwned` whose fields are ordered by key.
///
/// # Panics
/// Panics when polars cannot build a series from the elements of a `Vec`.
#[cfg(feature = "polars-df")]
pub fn into_polars_value(key: &Key, dv: DataValue) -> polars::prelude::AnyValue<'static> {
    use polars::prelude::AnyValue::*;
    use polars::prelude::Field;

    use crate::dataframe::column_store::convert_dv_to_dtype;
    let dv = convert_dv_to_dtype(key, dv);
    match dv {
        DataValue::String(smart_string) => StringOwned(smart_string.as_str().into()),
        DataValue::Bytes(items) => BinaryOwned(items),
        // Keep the 8-bit width: `polars_dtype` declares U8 as UInt8 and
        // `from_polars_value` only maps UInt8 back to U8.
        DataValue::U8(x) => UInt8(x),
        DataValue::Bool(x) => Boolean(x),
        DataValue::I32(x) => Int32(x),
        DataValue::U32(x) => UInt32(x),
        DataValue::I64(x) => Int64(x),
        DataValue::U64(x) => UInt64(x),
        DataValue::I128(x) => Int128(x),
        DataValue::F32(x) => Float32(x),
        DataValue::F64(x) => Float64(x),
        DataValue::Null => Null,
        DataValue::Vec(data_values) => {
            // The first element with a detectable dtype decides the element key.
            let element_dtype = data_values
                .iter()
                .map(crate::detect_dtype)
                .find(|dtype| !matches!(dtype, crate::DataType::Unknown))
                .unwrap_or(crate::DataType::Unknown);
            let vec_key = Key::new(key.name(), element_dtype);
            let elements = data_values
                .into_iter()
                .map(|x| into_polars_value(&vec_key, x))
                .collect::<Vec<_>>();
            let series =
                polars::series::Series::from_any_values(key.name().into(), &elements, true)
                    // Build the message only on failure instead of on every call.
                    .unwrap_or_else(|e| panic!("Cannot create series for {key:?}: {e:?}"));
            List(series)
        }
        DataValue::EnumNumber(x) => Int32(x),
        DataValue::U128(x) => UInt128(x),
        DataValue::Map(x) => {
            // Sort by key so the struct field order does not depend on hash order.
            // Consuming the map avoids a second lookup and a clone per value.
            let mut entries = x.into_iter().collect::<Vec<_>>();
            entries.sort_unstable_by(|(left, _), (right, _)| left.cmp(right));

            let mut values = Vec::with_capacity(entries.len());
            let mut fields = Vec::with_capacity(entries.len());
            for (name, value) in entries {
                let dtype = crate::detect_dtype(&value);
                let field_key = Key::new(&name, dtype);
                values.push(into_polars_value(&field_key, value));
                fields.push(Field::new(field_key.name().into(), polars_dtype(dtype)));
            }
            StructOwned(Box::new((values, fields)))
        }
    }
}
297
/// Converts a polars `AnyValue` into a [`DataValue`].
///
/// Narrow integers are widened (`UInt16` to `U32`, `Int8`/`Int16` to `I32`).
/// Lists are converted element by element and owned structs become a
/// `DataValue::Map` keyed by field name. `UInt128` maps to `U128`, mirroring
/// what [`into_polars_value`] emits for that variant.
///
/// Any variant without an explicit arm is logged with `tracing::warn!` and
/// converted to `DataValue::Null`.
#[cfg(feature = "polars-df")]
pub fn from_polars_value(dv: polars::prelude::AnyValue<'_>) -> DataValue {
    use polars::prelude::AnyValue::*;
    match dv {
        Null => DataValue::Null,
        Boolean(v) => v.into(),
        String(v) => DataValue::String(v.into()),
        UInt8(v) => DataValue::U8(v),
        UInt16(v) => DataValue::U32(u32::from(v)),
        UInt32(v) => v.into(),
        UInt64(v) => v.into(),
        // Without this arm a U128 written by `into_polars_value` came back as Null.
        UInt128(v) => DataValue::U128(v),
        Int8(v) => i32::from(v).into(),
        Int16(v) => i32::from(v).into(),
        Int32(v) => v.into(),
        Int64(v) => v.into(),
        Float32(v) => v.into(),
        Float64(v) => v.into(),
        Int128(v) => v.into(),
        List(series) => DataValue::Vec(series.iter().map(from_polars_value).collect::<Vec<_>>()),
        // Array(series, _) => {
        //     DataValue::Vec(series.iter().map(from_polars_value).collect::<Vec<_>>())
        // }
        StringOwned(v) => DataValue::String(v.as_str().into()),
        Binary(v) => DataValue::Bytes(v.to_owned()),
        BinaryOwned(v) => DataValue::Bytes(v),
        StructOwned(payload) => {
            // The payload is (values, fields); pair each field name with its value.
            let (values, fields) = *payload;
            let hm: std::collections::HashMap<smartstring::alias::String, DataValue> = fields
                .into_iter()
                .zip(values)
                .map(|(field, value)| (field.name.as_str().into(), from_polars_value(value)))
                .collect();
            DataValue::Map(hm)
        }
        e => {
            tracing::warn!("Unsupported polars value: {e:?}");
            DataValue::Null
        }
    }
}
337
338impl From<ColumnFrame> for DataFrame {
339    fn from(dataframe: ColumnFrame) -> Self {
340        Self::new(dataframe)
341    }
342}
343
344impl From<Vec<std::collections::HashMap<Key, DataValue>>> for DataFrame {
345    fn from(dataframe: Vec<std::collections::HashMap<Key, DataValue>>) -> Self {
346        Self::new(ColumnFrame::from(dataframe))
347    }
348}
349
350impl From<Vec<HashMap<Key, DataValue>>> for DataFrame {
351    fn from(dataframe: Vec<HashMap<Key, DataValue>>) -> Self {
352        Self::new(ColumnFrame::from(dataframe))
353    }
354}
355
356impl From<std::collections::HashMap<String, Vec<DataValue>>> for DataFrame {
357    fn from(dataframe: std::collections::HashMap<String, Vec<DataValue>>) -> Self {
358        Self::new(ColumnFrame::from(dataframe))
359    }
360}
361
362impl From<MLChefMap> for DataFrame {
363    fn from(dataframe: MLChefMap) -> Self {
364        Self::new(ColumnFrame::from(dataframe))
365    }
366}
367impl From<Vec<(Key, Vec<DataValue>)>> for DataFrame {
368    fn from(dataframe: Vec<(Key, Vec<DataValue>)>) -> Self {
369        Self::new(ColumnFrame::from(dataframe))
370    }
371}
372
373impl From<std::collections::HashMap<String, Array1<DataValue>>> for DataFrame {
374    fn from(dataframe: std::collections::HashMap<String, Array1<DataValue>>) -> Self {
375        Self::new(ColumnFrame::from(dataframe))
376    }
377}
378
#[cfg(feature = "polars-df")]
impl From<polars::prelude::DataFrame> for DataFrame {
    /// Builds the frame from a polars `DataFrame` via the matching
    /// [`ColumnFrame`] conversion.
    fn from(dataframe: polars::prelude::DataFrame) -> Self {
        let frame: ColumnFrame = dataframe.into();
        Self::new(frame)
    }
}
385#[cfg(test)]
386mod test {
387    use crate::filter::FilterRules;
388
389    use super::*;
390    use halfbrown::hashmap;
391    #[cfg(feature = "polars-df")]
392    use polars::prelude::NamedFrom as _;
393    use rstest::*;
394    use tracing_test::traced_test;
395    #[fixture]
396    fn dummy_candidates() -> ColumnFrame {
397        ColumnFrame::from(vec![
398            hashmap! {
399                "key1".into() => 1.into(),
400                "key2".into() => "a".into(),
401            },
402            hashmap! {
403                "key1".into() => 2.into(),
404                "key2".into() => "b".into(),
405            },
406        ])
407    }
408
409    #[rstest]
410    fn test_serde() {
411        let df = crate::df! {
412            "a" => [1u64, 2u64, 3u64],
413            "b" => [4u64, 5u64, 6u64],
414            "c" => [7u64, 8u64, 9u64]
415        };
416
417        let serialized = serde_json::to_string(&df).expect("BUG: Unable to serialize dataframe");
418
419        let deserialized =
420            serde_json::from_str(&serialized).expect("BUG: Unable to deserialize dataframe");
421
422        assert_eq!(df, deserialized);
423    }
424
425    #[cfg(feature = "polars-df")]
426    #[rstest]
427    fn test_polars() {
428        let expected = crate::df! {
429            "a" => [1u64, 2u64, 3u64],
430            "b" => [4f64, 5f64, 6f64],
431            "c" => [7i64, 8i64, 9i64]
432        };
433
434        let polars_df = polars::df!(
435            "a" => [1u64, 2u64, 3u64],
436            "b" => [4f64, 5f64, 6f64],
437            "c" => [7i64, 8i64, 9i64]
438        )
439        .expect("BUG: should be ok");
440        let as_df: DataFrame = polars_df.into();
441        let keys: Vec<Key> = vec!["a".into(), "b".into(), "c".into()];
442        assert_eq!(
443            as_df.select(Some(keys.as_slice())),
444            expected.select(Some(keys.as_slice()))
445        );
446    }
447    #[cfg(feature = "polars-df")]
448    use crate::DataType;
    /// Round-trip table for the polars value conversion.
    ///
    /// Each case supplies a key, a `DataValue` and the polars `AnyValue` it
    /// should correspond to. The test checks both directions:
    /// `into_polars_value(key, input) == output` and
    /// `from_polars_value(output) == input`.
    #[cfg(feature = "polars-df")]
    #[rstest]
    #[case::str(Key::new("a", DataType::String), DataValue::String("test".into()), polars::prelude::AnyValue::String("test".into()))]
    #[case::u32(
        Key::new("a", DataType::U32),
        DataValue::U32(u32::MAX),
        polars::prelude::AnyValue::UInt32(u32::MAX)
    )]
    #[case::i32(
        Key::new("a", DataType::I32),
        DataValue::I32(i32::MIN),
        polars::prelude::AnyValue::Int32(i32::MIN)
    )]
    #[case::i64(
        Key::new("a", DataType::I64),
        DataValue::I64(i64::MIN),
        polars::prelude::AnyValue::Int64(i64::MIN)
    )]
    #[case::u64(
        Key::new("a", DataType::U64),
        DataValue::U64(u64::MIN),
        polars::prelude::AnyValue::UInt64(u64::MIN)
    )]
    #[case::f32(
        Key::new("a", DataType::F32),
        DataValue::F32(f32::MIN),
        polars::prelude::AnyValue::Float32(f32::MIN)
    )]
    #[case::f64(
        Key::new("a", DataType::F64),
        DataValue::F64(f64::MIN),
        polars::prelude::AnyValue::Float64(f64::MIN)
    )]
    #[case::null(
        Key::new("a", DataType::Unknown),
        DataValue::Null,
        polars::prelude::AnyValue::Null
    )]
    #[case::i128(
        Key::new("a", DataType::I128),
        DataValue::I128(i128::MIN),
        polars::prelude::AnyValue::Int128(i128::MIN)
    )]
    #[case::u8(
        Key::new("a", DataType::U8),
        DataValue::U8(255),
        polars::prelude::AnyValue::UInt8(255)
    )]
    #[case::bool(
        Key::new("a", DataType::Bool),
        DataValue::Bool(true),
        polars::prelude::AnyValue::Boolean(true)
    )]
    #[case::bytes(Key::new("a", DataType::Bytes), DataValue::Bytes("aaaaa".as_bytes().to_vec()), polars::prelude::AnyValue::BinaryOwned("aaaaa".as_bytes().to_vec()))]
    #[case::vec_uints(Key::new("a", DataType::Vec), DataValue::Vec(vec![DataValue::U32(0), DataValue::U32(1)]), polars::prelude::AnyValue::List(polars::series::Series::new("v".into(), vec![polars::prelude::AnyValue::UInt32(0u32), polars::prelude::AnyValue::UInt32(1)])))]
    #[case::map(Key::new("a", DataType::Map), DataValue::Map(data_value::stdhashmap!("a" => 0u64, "b" => "s")), polars::prelude::AnyValue::StructOwned(Box::new((
        vec![polars::prelude::AnyValue::UInt64(0u64), polars::prelude::AnyValue::String("s".into())],
        vec![polars::prelude::Field::new("a".into(), polars::prelude::DataType::UInt64), polars::prelude::Field::new("b".into(), polars::prelude::DataType::String)]))))]
    // polars converts all by first element type
    // #[case::vec_diff_int(DataValue::Vec(vec![ DataValue::I32(1), DataValue::U32(0)]), polars::prelude::AnyValue::List(polars::series::Series::new("v".into(), vec![polars::prelude::AnyValue::Int32(1i32), polars::prelude::AnyValue::UInt32(0u32)])))]
    //#[case::vec_int_str(DataValue::Vec(vec![DataValue::U32(0), DataValue::String("1".into())]), polars::prelude::AnyValue::List(polars::series::Series::new("v".into(), vec![polars::prelude::AnyValue::UInt32(0u32), polars::prelude::AnyValue::StringOwned("1".into())])))]
    fn into_polars_value_test(
        #[case] key: Key,
        #[case] input: DataValue,
        #[case] output: polars::prelude::AnyValue<'static>,
    ) {
        // Forward direction: crate value -> polars value.
        assert_eq!(into_polars_value(&key, input.clone()), output);
        // Backward direction: polars value -> crate value.
        assert_eq!(from_polars_value(output), input);
    }
518
519    // #[cfg(feature = "polars-df")]
520    // #[rstest]
521    // fn as_polars() {
522    //     let state = include_bytes!("../part_00330.dfb");
523    //     let df: Result<DataFrame, _> = rmp_serde::decode::from_slice(state);
524    //     assert!(df.is_ok());
525    //     let df = df.unwrap();
526    //     println!("{df}");
527    //     let polars_df = df.as_polars();
528    //     assert!(polars_df.is_ok(), "{polars_df:?}");
529    // }
530    #[rstest]
531    #[case(
532        DataFrame::new(crate::column_frame! {
533            "a" => [1f64, 2f64, 3f64],
534            "b" => [4i64, 5i64, 6i64],
535            "c" => [7i64, 8i64, 9i64]
536        }),
537        DataFrame::new(crate::column_frame! {
538            "a" => [1f64, 2f64],
539            "b" => [4i64, 5i64],
540            "c" => [7i64, 8i64]
541        }),
542        FilterRules::try_from("a >= 1f64 && (b <= 5 || c <= 8) && b >= 4").expect("BUG: cannot create filter rules"),
543    )]
544    #[case(
545        DataFrame::new(crate::column_frame! {
546            "a" => [1f64, 2f64, 3f64],
547            "b" => [4i64, 5i64, 6i64],
548            "c" => [7i64, 8i64, 9i64]
549        }),
550        DataFrame::new(crate::column_frame! {
551            "a" => [2f64],
552            "b" => [5i64],
553            "c" => [8i64]
554        }),
555        FilterRules::try_from("a % 2f64 == 0f64").expect("BUG: cannot create filter rules"),
556    )]
557    #[traced_test]
558    fn filter_test(
559        #[case] df: DataFrame,
560        #[case] expected: DataFrame,
561        #[case] filter: FilterRules,
562    ) {
563        let filtered = df.filter(&filter).expect("BUG: cannot filter");
564        assert_eq!(filtered, expected);
565    }
566
567    #[rstest]
568    fn test_serde_complex() {
569        let simple = r#"
570{
571    "constants": {},
572    "dataframe": {
573        "index": {
574            "keys": [
575                {
576                    "key": 3162770485,
577                    "name": "a",
578                    "ctype": "U32"
579                },
580                {
581                    "key": 2279056742,
582                    "name": "b",
583                    "ctype": "F64"
584                },
585                {
586                    "key": 2994984227,
587                    "name": "c",
588                    "ctype": "U64"
589                },
590                {
591                    "key": 3319645144,
592                    "name": "d",
593                    "ctype": "F64"
594                },
595                {
596                    "key": 1291847470,
597                    "name": "e",
598                    "ctype": "U32"
599                },
600                {
601                    "key": 874241070,
602                    "name": "f",
603                    "ctype": "Bool"
604                }
605            ],
606            "indexes": {
607                "a": 0,
608                "b": 1,
609                "c": 2,
610                "d": 3,
611                "e": 4,
612                "f": 5
613            },
614            "alias": {}
615        },
616        "data_frame": {
617            "v": 1,
618            "dim": [
619                2,
620                6
621            ],
622            "data": [
623                253780,
624                0.009369421750307085,
625                1633222860381359,
626                8,
627                5,
628                true,
629                64512,
630                0.003391335718333721,
631                1633222860810557,
632                8,
633                5,
634                null
635            ]
636        }
637    },
638    "metadata": {}
639}
640        "#;
641
642        let simple_deserialized: DataFrame =
643            serde_json::from_str(simple).expect("BUG: Unable to deserialize dataframe");
644
645        println!("deserialized: {simple_deserialized:?}");
646        let array = format!("[{}, {}, {}]", simple, simple, simple);
647        let deserialized: Vec<DataFrame> =
648            serde_json::from_str(&array).expect("BUG: Unable to deserialize dataframe");
649
650        println!("deserialized: {deserialized:?}");
651        assert_eq!(deserialized.len(), 3);
652        assert_eq!(simple_deserialized, deserialized[0]);
653    }
654
655    #[rstest]
656    #[case(hashmap!("key1".into() => vec![1.into(), 2.into()], "key2".into() => vec!["a".into()]))]
657    #[case(data_value::stdhashmap!("key1" => vec![1, 2], "key2" => vec!["a"]))]
658    #[case(vec![hashmap! {
659        "key1".into() => 1.into(),
660        "key2".into() => "a".into(),
661    },
662    hashmap! {
663        "key1".into() => 2.into(),
664    },])]
665    #[case(vec![data_value::stdhashmap! {
666        "key1" => DataValue::from(1),
667        "key2" => DataValue::from("a"),
668    },data_value::stdhashmap! {
669        "key1" => DataValue::from(2),
670    },])]
671    #[case(vec![("key1".into(), vec! [DataValue::from(1), DataValue::from(2)]), ("key2".into(),
672    vec![DataValue::from("a"), DataValue::Null])])]
673    fn test_select_column<T: Into<DataFrame>>(#[case] input: T) {
674        let df: DataFrame = input.into();
675        assert_eq!(
676            df,
677            DataFrame {
678                constants: HashMap::new(),
679                dataframe: ColumnFrame::from(vec![
680                    hashmap! {
681                        "key1".into() => 1.into(),
682                        "key2".into() => "a".into(),
683                    },
684                    hashmap! {
685                        "key1".into() => 2.into(),
686                    },
687                ]),
688                metadata: HashMap::new(),
689            }
690        );
691        let selected_transposed = df.select_column("key1".into());
692        assert!(selected_transposed.is_some());
693        let selected_transposed = selected_transposed.unwrap();
694        assert_eq!(selected_transposed.len(), 2);
695        assert_eq!(selected_transposed, ndarray::array![1.into(), 2.into()]);
696    }
697
698    #[rstest]
699    #[case::hhm(hashmap!("key1".into() => vec![1.into(), 2.into()], "key2".into() => vec!["a".into()]))]
700    #[case::stdhm(data_value::stdhashmap!("key1" => vec![1, 2], "key2" => vec!["a"]))]
701    #[case::hm({
702        let hm: std::collections::HashMap<String, Array1<DataValue>> = data_value::stdhashmap!("key1".to_string() => Array1::from_vec(vec![DataValue::from(1), DataValue::from(2)]), "key2".to_string() => Array1::from_vec(vec![DataValue::from("a"), DataValue::Null]));
703        hm
704    })]
705    #[case::vec_hhm(vec![hashmap! {
706        "key1".into() => 1.into(),
707        "key2".into() => "a".into(),
708    },
709    hashmap! {
710        "key1".into() => 2.into(),
711    },])]
712    #[case::vec_hme(vec![data_value::stdhashmap! {
713        "key1" => DataValue::from(1),
714        "key2" => DataValue::from("a"),
715    },data_value::stdhashmap! {
716        "key1" => DataValue::from(2),
717    },])]
718    #[case::vec_vec(vec![("key1".into(), vec! [DataValue::from(1), DataValue::from(2)]), ("key2".into(), vec![DataValue::from("a"), DataValue::Null])])]
719    fn test_from_conversion<T: Into<DataFrame>>(#[case] input: T) {
720        let df: DataFrame = input.into();
721        let expected: DataFrame = DataFrame {
722            constants: HashMap::new(),
723            dataframe: ColumnFrame::from(vec![
724                hashmap! {
725                    "key1".into() => 1.into(),
726                    "key2".into() => "a".into(),
727                },
728                hashmap! {
729                    "key1".into() => 2.into(),
730                },
731            ]),
732            metadata: HashMap::new(),
733        };
734        assert_eq!(
735            df.select(Some(&["key1".into(), "key2".into()])),
736            expected.select(Some(&["key1".into(), "key2".into()])),
737            "{df} vs {expected}"
738        );
739        let selected_transposed = df.select_transposed_typed::<i32>(&["key1".into()]);
740        assert_eq!(selected_transposed.len(), 2);
741        println!("{:?}", selected_transposed);
742        assert_eq!(selected_transposed, vec![vec![1], vec![2]]);
743    }
744    #[rstest]
745    fn test_dataframe(dummy_candidates: ColumnFrame) {
746        let mut dataframe: DataFrame = DataFrame::default();
747        assert!(dataframe.is_empty());
748        assert!(dataframe.extend(dummy_candidates.into()).is_ok());
749        assert_eq!(dataframe.len(), 2);
750
751        let candidate = hashmap! {
752            "key1".into() => 3.into(),
753            "key2".into() => "c".into(),
754        };
755
756        assert!(dataframe.push(candidate).is_ok());
757        assert_eq!(dataframe.len(), 3);
758        assert!(!dataframe.is_empty());
759
760        dataframe.insert_constant("key3".into(), 4.into());
761        assert_eq!(dataframe.constants.len(), 1);
762        assert!(dataframe
763            .apply_function(&["key1".into()], |keys, df| {
764                let key = keys[0].clone();
765                let s = df
766                    .get_single_column(&key)
767                    .expect("BUG: Cannot get column")
768                    .to_owned();
769                let s = s.mapv(|x| x + DataValue::from(1));
770                df.add_single_column("key5", s)?;
771                Ok(())
772            })
773            .is_ok());
774        let original = dataframe.clone();
775        dataframe.shrink();
776        let remove_df = dataframe.remove_column(&["key1".into()]);
777        assert!(remove_df.is_ok());
778        let mut remove_df = remove_df.unwrap();
779        assert_eq!(remove_df.len(), 3);
780        let selected = dataframe.select(Some(&["key2".into()]));
781        assert!(selected.is_ok());
782        let selected = selected.unwrap();
783        println!("{:?}", selected);
784
785        // fixme later
786        let joined_result =
787            remove_df.join(dataframe, &JoinRelation::new(crate::JoinBy::AddColumns));
788        assert!(joined_result.is_ok(), "{:?}", joined_result);
789        let keys = vec!["key1".into(), "key2".into(), "key5".into()];
790        assert_eq!(
791            original.select(Some(keys.as_slice())),
792            remove_df.select(Some(keys.as_slice()))
793        );
794    }
795
796    #[rstest]
797    fn test_size_methods() {
798        let candidate = hashmap! {
799            "key1".into() => 3.into(),
800            "key2".into() => "c".into(),
801            "key3".into() => false.into()
802        };
803
804        let dataframe: DataFrame = vec![candidate].into();
805
806        assert_eq!(dataframe.n_columns(), 3);
807        assert_eq!(dataframe.n_rows(), 1);
808    }
809
810    #[rstest]
811    fn test_metadata(dummy_candidates: ColumnFrame) {
812        let mut dataframe: DataFrame = DataFrame::default();
813        assert!(dataframe.is_empty());
814        println!("{:?}", dataframe);
815        assert!(dataframe.extend(dummy_candidates.into()).is_ok());
816        println!("{:?}", dataframe);
817        assert_eq!(dataframe.len(), 2);
818
819        dataframe.add_metadata("test".into(), 1.into());
820        assert_eq!(dataframe.get_metadata("test"), Some(&1.into()));
821        let dataframe = DataFrame::new(ColumnFrame::from(vec![
822            hashmap! {
823                "key1".into() => 1.into(),
824                "key2".into() => "a".into(),
825            },
826            hashmap! {
827                "key1".into() => 2.into(),
828                "key2".into() => "b".into(),
829            },
830        ]));
831        assert_eq!(dataframe.get_metadata("test"), None);
832        let tt = dataframe.select_transposed(None);
833        assert!(tt.is_ok());
834        let tt = tt.unwrap();
835        assert_eq!(tt.shape(), [2, 2]);
836        assert_eq!(
837            tt,
838            Array2::from_shape_vec((2, 2), vec![1.into(), 2.into(), "a".into(), "b".into()])
839                .unwrap()
840        );
841    }
842
843    #[rstest]
844    #[traced_test]
845    fn add_single_column_test() {
846        let mut dataframe = DataFrame::default();
847        let values = Array1::from(vec![1.into(), 2.into(), 3.into()]);
848        let r = dataframe.add_single_column("key1", values);
849        assert!(r.is_ok(), "{r:?}");
850        let selected = dataframe.select(None);
851        assert!(selected.is_ok());
852        let selected = selected.unwrap();
853        assert_eq!(selected.shape(), [3, 1]);
854        assert_eq!(
855            selected,
856            Array2::from_shape_vec((3, 1), vec![1.into(), 2.into(), 3.into()]).unwrap()
857        );
858        let values = Array1::from(vec![1.into(), 2.into()]);
859        assert!(dataframe.add_single_column("key1", values).is_err());
860        let values = Array1::from(vec![3.into(), 4.into(), 5.into()]);
861        assert!(dataframe.add_single_column("key2", values).is_ok());
862        let values = Array1::from(vec![3.into()]);
863        assert!(dataframe.add_single_column("key3", values).is_err());
864    }
865
866    #[rstest]
867    #[traced_test]
868    fn add_single_column_empty_test() {
869        let mut dataframe = DataFrame::default();
870        let values = Array1::from(vec![]);
871        let r = dataframe.add_single_column("key1", values);
872        assert!(r.is_ok(), "{r:?}");
873        let selected = dataframe.select(None);
874        assert!(selected.is_ok());
875        let selected = selected.unwrap();
876        assert_eq!(selected.shape(), [0, 1]);
877        assert_eq!(selected, Array2::from_shape_vec((0, 1), vec![]).unwrap());
878        let values = Array1::from(vec![1.into(), 2.into()]);
879        assert!(dataframe.add_single_column("key1", values).is_err());
880        let values = Array1::from(vec![3.into(), 4.into(), 5.into()]);
881        assert!(dataframe.add_single_column("key2", values).is_ok());
882        let values = Array1::from(vec![3.into(), 4.into()]);
883        assert!(dataframe.add_single_column("key3", values).is_err());
884        let values = Array1::from(vec![3.into(), 4.into(), 5.into()]);
885        assert!(dataframe.add_single_column("key3", values).is_ok());
886
887        assert_eq!(
888            dataframe
889                .select_column("key1".into())
890                .expect("BUG: has to exists"),
891            ndarray::arr1(&[DataValue::Null, DataValue::Null, DataValue::Null]),
892        );
893        assert_eq!(
894            dataframe
895                .select_column("key2".into())
896                .expect("BUG: has to exists"),
897            ndarray::arr1(&[3.into(), 4.into(), 5.into()]),
898        );
899        assert_eq!(
900            dataframe.select(None).expect("BUG: cannot get data"),
901            ndarray::arr2(&[
902                [DataValue::Null, 3.into(), 3.into()],
903                [DataValue::Null, 4.into(), 4.into()],
904                [DataValue::Null, 5.into(), 5.into()],
905            ])
906        );
907    }
908
909    #[rstest]
910    #[case(
911        DataFrame::new(ColumnFrame::from(vec![
912            hashmap! {
913                "k".into() => 1.into(),
914                "k2".into() => 2.into(),
915                "k3".into() => 2.2.into(),
916            },
917            hashmap! {
918                "k".into() => 11.into(),
919                "k2".into() => 3.into(),
920            },
921            hashmap! {
922                "k".into() => 4.into(),
923                "k2".into() => 5.into(),
924                "k3".into() => 2.3.into(),
925            },
926            hashmap! {
927                "k".into() => 4.into(),
928                "k2".into() => 5.into(),
929                "k3".into() => 2.4.into(),
930            },
931        ])),
932        vec!["k".into(), "k2".into()],
933        Array2::from_shape_vec((4, 2), vec![1.into(), 2.into(), 11.into(), 3.into(), 4.into(), 5.into(), 4.into(), 5.into()]).unwrap()
934    )]
935    #[case(
936        DataFrame::new(ColumnFrame::from(vec![
937            hashmap! {
938                "k".into() => 1.into(),
939                "k2".into() => 2.into(),
940                "k3".into() => 2.2.into(),
941            },
942            hashmap! {
943                "k".into() => 11.into(),
944                "k2".into() => 3.into(),
945            },
946            hashmap! {
947                "k".into() => 4.into(),
948                "k2".into() => 5.into(),
949                "k3".into() => 2.3.into(),
950            },
951            hashmap! {
952                "k".into() => 4.into(),
953                "k2".into() => 5.into(),
954                "k3".into() => 2.4.into(),
955            },
956        ])),
957        vec!["k2".into(), "k3".into(), "nonexist1".into(), "nonexists2".into(), "k".into()],
958        Array2::from_shape_vec((4, 5), vec![
959            2.into(), 2.2.into(), DataValue::Null, DataValue::Null, 1.into(),
960            3.into(), DataValue::Null, DataValue::Null, DataValue::Null, 11.into(),
961            5.into(), 2.3.into(),  DataValue::Null, DataValue::Null, 4.into(),
962            5.into(), 2.4.into(), DataValue::Null, DataValue::Null, 4.into()]).unwrap()
963    )]
964    #[traced_test]
965    fn select_multiple(
966        #[case] input: DataFrame,
967        #[case] columns: Vec<Key>,
968        #[case] expected: Array2<DataValue>,
969    ) {
970        let selected = input.select(Some(&columns));
971        assert!(selected.is_ok());
972        let selected = selected.unwrap();
973
974        assert_eq!(selected, expected);
975    }
976
977    #[rstest]
978    #[case(
979        DataFrame::new(ColumnFrame::from(vec![
980            hashmap! {
981                "k".into() => 1.into(),
982                "k2".into() => 2.into(),
983                "k3".into() => 2.2.into(),
984            },
985            hashmap! {
986                "k".into() => 11.into(),
987                "k2".into() => 3.into(),
988            },
989            hashmap! {
990                "k".into() => 4.into(),
991                "k2".into() => 5.into(),
992                "k3".into() => 2.3.into(),
993            },
994            hashmap! {
995                "k".into() => 4.into(),
996                "k2".into() => 5.into(),
997                "k3".into() => 2.4.into(),
998            },
999        ])),
1000        "k".into(),
1001        Array2::from_shape_vec((4, 3), vec![
1002            1.into(), 2.into(), 2.2.into(),
1003            4.into(), 5.into(), 2.3.into(),
1004            4.into(), 5.into(), 2.4.into(),
1005            11.into(), 3.into(), DataValue::Null,
1006            ]
1007        ).unwrap(),
1008        vec!["k".into(), "k2".into(), "k3".into()],
1009    )]
1010    #[rstest]
1011    #[case(
1012        DataFrame::new(ColumnFrame::from(vec![
1013            hashmap! {
1014                "k".into() => 1.into(),
1015                "k2".into() => 2.into(),
1016                "k3".into() => 2.2.into(),
1017            },
1018            hashmap! {
1019                "k".into() => 11.into(),
1020                "k2".into() => 3.into(),
1021            },
1022            hashmap! {
1023                "k".into() => 4.into(),
1024                "k2".into() => 5.into(),
1025                "k3".into() => 2.3.into(),
1026            },
1027            hashmap! {
1028                "k".into() => 4.into(),
1029                "k2".into() => 5.into(),
1030                "k3".into() => 2.4.into(),
1031            },
1032        ])),
1033        "k3".into(),
1034        Array2::from_shape_vec((4, 3), vec![
1035            11.into(), 3.into(), DataValue::Null,
1036            1.into(), 2.into(), 2.2.into(),
1037            4.into(), 5.into(), 2.3.into(),
1038            4.into(), 5.into(), 2.4.into(),
1039            ]
1040        ).unwrap(),
1041        vec!["k".into(), "k2".into(), "k3".into()],
1042    )]
1043    #[case(
1044        DataFrame::new(ColumnFrame::from(vec![
1045            hashmap! {
1046                "k".into() => 2.into(),
1047                "k2".into() => 0.000001.into(),
1048            },
1049            hashmap! {
1050                "k".into() => 1.into(),
1051                "k2".into() =>0.0000001.into(),
1052            },
1053            hashmap! {
1054                "k".into() => 3.into(),
1055                "k2".into() => 0.00001.into(),
1056            },
1057            hashmap! {
1058                "k".into() => 4.into(),
1059                "k2".into() => 0.001.into(),
1060            },
1061        ])),
1062        "k2".into(),
1063        Array2::from_shape_vec((4, 2), vec![
1064            1.into(), 0.0000001.into(),
1065            2.into(), 0.000001.into(),
1066            3.into(), 0.00001.into(),
1067            4.into(), 0.001.into(),
1068            ]
1069        ).unwrap(),
1070        vec!["k".into(), "k2".into()],
1071    )]
1072    #[case(
1073        DataFrame::new(ColumnFrame::from(vec![
1074            hashmap! {
1075                "k".into() => 2.into(),
1076                "k2".into() => "b".into(),
1077            },
1078            hashmap! {
1079                "k".into() => 1.into(),
1080                "k2".into() =>"a".into(),
1081            },
1082            hashmap! {
1083                "k".into() => 3.into(),
1084                "k2".into() =>"c".into(),
1085            },
1086            hashmap! {
1087                "k".into() => 4.into(),
1088                "k2".into() =>"z".into(),
1089            },
1090        ])),
1091        "k2".into(),
1092        Array2::from_shape_vec((4, 2), vec![
1093            1.into(),"a".into(),
1094            2.into(), "b".into(),
1095            3.into(), "c".into(),
1096            4.into(), "z".into(),
1097            ]
1098        ).unwrap(),
1099        vec!["k".into(), "k2".into()],
1100    )]
1101    #[traced_test]
1102    fn sort_by(
1103        #[case] input: DataFrame,
1104        #[case] column: Key,
1105        #[case] expected: Array2<DataValue>,
1106        #[case] columns: Vec<Key>,
1107    ) {
1108        let result = input.sorted(&column);
1109        assert!(result.is_ok(), "{result:?}");
1110        let result = result.unwrap().get_sorted();
1111        let selected = result.select(Some(&columns));
1112
1113        assert_eq!(selected, expected);
1114    }
1115    #[rstest]
1116    #[case(
1117        DataFrame::new(ColumnFrame::from(vec![
1118            hashmap! {
1119                "k".into() => 2.into(),
1120                "k2".into() => 0.000001.into(),
1121            },
1122            hashmap! {
1123                "k".into() => 1.into(),
1124                "k2".into() =>0.0000001.into(),
1125            },
1126            hashmap! {
1127                "k".into() => 3.into(),
1128                "k2".into() => 0.00001.into(),
1129            },
1130            hashmap! {
1131                "k".into() => 4.into(),
1132                "k2".into() => 0.001.into(),
1133            },
1134        ])),
1135        "k2".into(),
1136        TopN::Last(1),
1137        Array2::from_shape_vec((1, 2), vec![
1138            4.into(), 0.001.into(),
1139            ]
1140        ).unwrap(),
1141        vec!["k".into(), "k2".into()],
1142    )]
1143    #[case(
1144        DataFrame::new(ColumnFrame::from(vec![
1145            hashmap! {
1146                "k".into() => 2.into(),
1147                "k2".into() => 0.000001.into(),
1148            },
1149            hashmap! {
1150                "k".into() => 1.into(),
1151                "k2".into() =>0.0000001.into(),
1152            },
1153            hashmap! {
1154                "k".into() => 3.into(),
1155                "k2".into() => 0.00001.into(),
1156            },
1157            hashmap! {
1158                "k".into() => 4.into(),
1159                "k2".into() => 0.001.into(),
1160            },
1161        ])),
1162        "k2".into(),
1163        TopN::Last(2),
1164        Array2::from_shape_vec((2, 2), vec![
1165            4.into(), 0.001.into(),
1166            3.into(), 0.00001.into(),
1167            ]
1168        ).unwrap(),
1169        vec!["k".into(), "k2".into()],
1170    )]
1171    #[case(
1172        DataFrame::new(ColumnFrame::from(vec![
1173            hashmap! {
1174                "k".into() => 2.into(),
1175                "k2".into() => "b".into(),
1176            },
1177            hashmap! {
1178                "k".into() => 1.into(),
1179                "k2".into() =>"a".into(),
1180            },
1181            hashmap! {
1182                "k".into() => 3.into(),
1183                "k2".into() =>"c".into(),
1184            },
1185            hashmap! {
1186                "k".into() => 4.into(),
1187                "k2".into() =>"z".into(),
1188            },
1189        ])),
1190        "k2".into(),
1191        TopN::First(1),
1192        Array2::from_shape_vec((1, 2), vec![
1193            1.into(),"a".into(),
1194            ]
1195        ).unwrap(),
1196        vec!["k".into(), "k2".into()],
1197    )]
1198    #[case(
1199        DataFrame::new(ColumnFrame::from(vec![
1200            hashmap! {
1201                "k".into() => 2.into(),
1202                "k2".into() => "b".into(),
1203            },
1204            hashmap! {
1205                "k".into() => 1.into(),
1206                "k2".into() =>"a".into(),
1207            },
1208            hashmap! {
1209                "k".into() => 3.into(),
1210                "k2".into() =>"c".into(),
1211            },
1212            hashmap! {
1213                "k".into() => 4.into(),
1214                "k2".into() =>"z".into(),
1215            },
1216        ])),
1217        "k2".into(),
1218        TopN::First(2),
1219        Array2::from_shape_vec((2, 2), vec![
1220            1.into(),"a".into(),
1221            2.into(),"b".into(),
1222            ]
1223        ).unwrap(),
1224        vec!["k".into(), "k2".into()],
1225    )]
1226    #[traced_test]
1227    fn top_n(
1228        #[case] input: DataFrame,
1229        #[case] column: Key,
1230        #[case] topn: TopN,
1231        #[case] expected: Array2<DataValue>,
1232        #[case] columns: Vec<Key>,
1233    ) {
1234        let result = input.sorted(&column);
1235        assert!(result.is_ok(), "{result:?}");
1236        let result = result.unwrap();
1237        let first = result.topn(topn).unwrap();
1238        let selected = first.select(Some(&columns));
1239        assert_eq!(selected, expected);
1240    }
1241
1242    #[rstest]
1243    fn test_messagepack_roundtrip_empty_dataframe() {
1244        let df = DataFrame::default();
1245
1246        let bytes = df
1247            .store_into_messagepack()
1248            .expect("failed to serialize empty df");
1249        let restored =
1250            DataFrame::load_from_messagepack(&bytes).expect("failed to deserialize empty df");
1251        assert_eq!(df, restored);
1252        assert!(restored.is_empty());
1253    }
1254
1255    #[rstest]
1256    fn test_messagepack_roundtrip_strings_and_bools() {
1257        // Strings and bools are preserved exactly by messagepack
1258        let df = DataFrame::new(ColumnFrame::from(vec![
1259            hashmap! {
1260                "str".into() => DataValue::String("hello".into()),
1261                "bool".into() => DataValue::Bool(true),
1262            },
1263            hashmap! {
1264                "str".into() => DataValue::String("".into()),
1265                "bool".into() => DataValue::Bool(false),
1266            },
1267        ]));
1268
1269        let bytes = df.store_into_messagepack().expect("failed to serialize");
1270        let restored = DataFrame::load_from_messagepack(&bytes).expect("failed to deserialize");
1271        assert_eq!(df, restored);
1272    }
1273
1274    #[rstest]
1275    fn test_messagepack_roundtrip_f64_values() {
1276        let df = DataFrame::new(ColumnFrame::from(vec![
1277            hashmap! {
1278                "a".into() => DataValue::F64(3.14),
1279            },
1280            hashmap! {
1281                "a".into() => DataValue::F64(-2.718),
1282            },
1283        ]));
1284
1285        let bytes = df.store_into_messagepack().expect("failed to serialize");
1286        let restored = DataFrame::load_from_messagepack(&bytes).expect("failed to deserialize");
1287        assert_eq!(df, restored);
1288    }
1289
1290    #[rstest]
1291    fn test_messagepack_f64_special_values_survive_roundtrip() {
1292        // f64::INFINITY serializes/deserializes but PartialEq may differ due to
1293        // DataValue Eq semantics; verify at the value level
1294        let df = DataFrame::new(ColumnFrame::from(vec![hashmap! {
1295            "a".into() => DataValue::F64(f64::INFINITY),
1296        }]));
1297
1298        let bytes = df.store_into_messagepack().expect("failed to serialize");
1299        let restored = DataFrame::load_from_messagepack(&bytes).expect("failed to deserialize");
1300        assert_eq!(restored.len(), 1);
1301        let col = restored.select_column("a".into()).expect("col exists");
1302        match &col[0] {
1303            DataValue::F64(v) => assert!(v.is_infinite() && v.is_sign_positive()),
1304            other => panic!("expected F64, got {other:?}"),
1305        }
1306    }
1307
1308    #[rstest]
1309    fn test_messagepack_roundtrip_with_nulls() {
1310        let df = DataFrame::new(ColumnFrame::from(vec![
1311            hashmap! {
1312                "a".into() => DataValue::String("x".into()),
1313                "b".into() => DataValue::String("y".into()),
1314            },
1315            hashmap! {
1316                "a".into() => DataValue::String("z".into()),
1317                // "b" missing => Null
1318            },
1319        ]));
1320
1321        let bytes = df.store_into_messagepack().expect("failed to serialize");
1322        let restored = DataFrame::load_from_messagepack(&bytes).expect("failed to deserialize");
1323        assert_eq!(df, restored);
1324    }
1325
1326    #[rstest]
1327    fn test_messagepack_roundtrip_with_metadata() {
1328        let mut df = DataFrame::new(crate::column_frame! {
1329            "col" => ["a", "b"]
1330        });
1331        df.add_metadata("name".into(), DataValue::String("test_df".into()));
1332        df.add_metadata("flag".into(), DataValue::Bool(true));
1333
1334        let bytes = df.store_into_messagepack().expect("failed to serialize");
1335        let restored = DataFrame::load_from_messagepack(&bytes).expect("failed to deserialize");
1336        assert_eq!(df, restored);
1337        assert_eq!(
1338            restored.get_metadata("name"),
1339            Some(&DataValue::String("test_df".into()))
1340        );
1341        assert_eq!(restored.get_metadata("flag"), Some(&DataValue::Bool(true)));
1342    }
1343
1344    #[rstest]
1345    fn test_messagepack_roundtrip_with_constants() {
1346        let mut df = DataFrame::new(crate::column_frame! {
1347            "x" => ["a", "b"]
1348        });
1349        df.insert_constant("const_key".into(), DataValue::String("const_val".into()));
1350        df.insert_constant("const_flag".into(), DataValue::Bool(false));
1351
1352        let bytes = df.store_into_messagepack().expect("failed to serialize");
1353        let restored = DataFrame::load_from_messagepack(&bytes).expect("failed to deserialize");
1354        assert_eq!(df, restored);
1355        assert_eq!(
1356            restored.constants.get(&"const_key".into()),
1357            Some(&DataValue::String("const_val".into()))
1358        );
1359    }
1360
1361    #[rstest]
1362    fn test_messagepack_integer_type_coercion() {
1363        // MessagePack uses compact integer encoding: small I64 values may
1364        // deserialize as U8/U32 etc. This test documents this lossy behavior.
1365        let df = crate::df! {
1366            "a" => [1i64, 2i64, 3i64]
1367        };
1368
1369        let bytes = df.store_into_messagepack().expect("failed to serialize");
1370        let restored = DataFrame::load_from_messagepack(&bytes).expect("failed to deserialize");
1371
1372        // The row count is preserved even if integer types differ
1373        assert_eq!(restored.len(), 3);
1374
1375        // Values that fit in u8 get coerced to U8 by messagepack
1376        let col = restored
1377            .select_column("a".into())
1378            .expect("column should exist");
1379        // Values are semantically equivalent but may be different DataValue variants
1380        assert_ne!(
1381            col[0],
1382            DataValue::I64(1),
1383            "messagepack coerces small ints to compact types"
1384        );
1385    }
1386
1387    #[rstest]
1388    fn test_messagepack_large_i64_preserved() {
1389        // Values that exceed u32 range stay as large integer types
1390        let df = DataFrame::new(ColumnFrame::from(vec![hashmap! {
1391            "big".into() => DataValue::I64(i64::MIN),
1392        }]));
1393
1394        let bytes = df.store_into_messagepack().expect("failed to serialize");
1395        let restored = DataFrame::load_from_messagepack(&bytes).expect("failed to deserialize");
1396        assert_eq!(df, restored);
1397    }
1398
1399    #[rstest]
1400    fn test_messagepack_load_invalid_bytes() {
1401        let result = DataFrame::load_from_messagepack(&[0xFF, 0xFE, 0xFD, 0x00]);
1402        assert!(result.is_err());
1403    }
1404
1405    #[rstest]
1406    fn test_messagepack_load_empty_bytes() {
1407        let result = DataFrame::load_from_messagepack(&[]);
1408        assert!(result.is_err());
1409    }
1410
1411    #[rstest]
1412    fn test_messagepack_load_truncated_bytes() {
1413        let df = DataFrame::new(ColumnFrame::from(vec![
1414            hashmap! {
1415                "a".into() => DataValue::String("hello world".into()),
1416                "b".into() => DataValue::Bool(true),
1417            },
1418            hashmap! {
1419                "a".into() => DataValue::String("test".into()),
1420                "b".into() => DataValue::Bool(false),
1421            },
1422        ]));
1423        let bytes = df.store_into_messagepack().expect("failed to serialize");
1424        // Truncate to half
1425        let truncated = &bytes[..bytes.len() / 2];
1426        let result = DataFrame::load_from_messagepack(truncated);
1427        assert!(result.is_err());
1428    }
1429
1430    #[rstest]
1431    fn test_messagepack_roundtrip_with_nested_vec_data() {
1432        let df = DataFrame::new(ColumnFrame::from(vec![hashmap! {
1433            "vec_col".into() => DataValue::Vec(vec![
1434                DataValue::String("a".into()),
1435                DataValue::String("b".into()),
1436            ]),
1437            "bytes_col".into() => DataValue::Bytes(vec![0, 1, 255]),
1438        }]));
1439
1440        let bytes = df.store_into_messagepack().expect("failed to serialize");
1441        let restored = DataFrame::load_from_messagepack(&bytes).expect("failed to deserialize");
1442        assert_eq!(df, restored);
1443    }
1444
1445    #[rstest]
1446    fn test_messagepack_roundtrip_preserves_row_count() {
1447        let df = DataFrame::new(ColumnFrame::from(vec![
1448            hashmap! { "a".into() => DataValue::String("x".into()) },
1449            hashmap! { "a".into() => DataValue::String("y".into()) },
1450            hashmap! { "a".into() => DataValue::String("z".into()) },
1451        ]));
1452
1453        let bytes = df.store_into_messagepack().expect("failed to serialize");
1454        let restored = DataFrame::load_from_messagepack(&bytes).expect("failed to deserialize");
1455        assert_eq!(restored.len(), 3);
1456        assert_eq!(restored.n_rows(), 3);
1457        assert_eq!(restored.n_columns(), 1);
1458    }
1459
1460    #[rstest]
1461    fn test_messagepack_idempotent_double_roundtrip() {
1462        // Use types that survive messagepack coercion (strings, bools, bytes)
1463        let mut df = DataFrame::new(ColumnFrame::from(vec![
1464            hashmap! {
1465                "a".into() => DataValue::String("hello".into()),
1466                "b".into() => DataValue::Bool(true),
1467            },
1468            hashmap! {
1469                "a".into() => DataValue::String("world".into()),
1470                "b".into() => DataValue::Bool(false),
1471            },
1472        ]));
1473        df.add_metadata("meta".into(), DataValue::Bool(true));
1474        df.insert_constant("c".into(), DataValue::String("const".into()));
1475
1476        let bytes1 = df.store_into_messagepack().expect("first serialize");
1477        let restored1 = DataFrame::load_from_messagepack(&bytes1).expect("first deserialize");
1478        let bytes2 = restored1
1479            .store_into_messagepack()
1480            .expect("second serialize");
1481        let restored2 = DataFrame::load_from_messagepack(&bytes2).expect("second deserialize");
1482
1483        assert_eq!(df, restored2);
1484        assert_eq!(bytes1, bytes2);
1485    }
1486
1487    #[rstest]
1488    fn test_messagepack_single_byte_payload() {
1489        // A single valid msgpack byte (e.g. fixint) should fail as incomplete DataFrame
1490        let result = DataFrame::load_from_messagepack(&[0x01]);
1491        assert!(result.is_err());
1492    }
1493
1494    // === hash_datavalue public API edge case tests ===
1495
1496    #[rstest]
1497    fn test_hash_datavalue_public_api_accessible() {
1498        // Verify the re-exported function works from the crate root
1499        let val = DataValue::I32(42);
1500        let h = crate::hash_datavalue(&val);
1501        // Deterministic
1502        assert_eq!(h, crate::hash_datavalue(&DataValue::I32(42)));
1503    }
1504
1505    #[rstest]
1506    fn test_hash_datavalue_vec_length_matters() {
1507        // [1] and [1, Null] should produce different hashes
1508        let short = DataValue::Vec(vec![DataValue::I32(1)]);
1509        let long = DataValue::Vec(vec![DataValue::I32(1), DataValue::Null]);
1510        assert_ne!(crate::hash_datavalue(&short), crate::hash_datavalue(&long));
1511    }
1512
1513    #[rstest]
1514    fn test_hash_datavalue_map_different_keys_same_values() {
1515        let mut m1 = std::collections::HashMap::new();
1516        m1.insert("a".into(), DataValue::I32(1));
1517        let mut m2 = std::collections::HashMap::new();
1518        m2.insert("b".into(), DataValue::I32(1));
1519
1520        assert_ne!(
1521            crate::hash_datavalue(&DataValue::Map(m1)),
1522            crate::hash_datavalue(&DataValue::Map(m2))
1523        );
1524    }
1525
1526    #[rstest]
1527    fn test_hash_datavalue_empty_string_vs_empty_bytes() {
1528        let empty_str = DataValue::String("".into());
1529        let empty_bytes = DataValue::Bytes(vec![]);
1530        assert_ne!(
1531            crate::hash_datavalue(&empty_str),
1532            crate::hash_datavalue(&empty_bytes)
1533        );
1534    }
1535
1536    #[rstest]
1537    fn test_hash_datavalue_empty_vec_vs_empty_map() {
1538        let empty_vec = DataValue::Vec(vec![]);
1539        let empty_map = DataValue::Map(std::collections::HashMap::new());
1540        assert_ne!(
1541            crate::hash_datavalue(&empty_vec),
1542            crate::hash_datavalue(&empty_map)
1543        );
1544    }
1545
1546    #[rstest]
1547    fn test_hash_datavalue_i128_boundary_values() {
1548        let max = DataValue::I128(i128::MAX);
1549        let min = DataValue::I128(i128::MIN);
1550        let zero = DataValue::I128(0);
1551        let neg_one = DataValue::I128(-1);
1552
1553        // All distinct
1554        let hashes: std::collections::HashSet<u64> = [&max, &min, &zero, &neg_one]
1555            .iter()
1556            .map(|v| crate::hash_datavalue(v))
1557            .collect();
1558        assert_eq!(hashes.len(), 4);
1559    }
1560
1561    #[rstest]
1562    fn test_hash_datavalue_u128_boundary_values() {
1563        let max = DataValue::U128(u128::MAX);
1564        let zero = DataValue::U128(0);
1565        let one = DataValue::U128(1);
1566        // u128::MAX is all bits set; ensure it differs from i128(-1) which is also all bits
1567        let i128_neg1 = DataValue::I128(-1);
1568
1569        assert_ne!(
1570            crate::hash_datavalue(&max),
1571            crate::hash_datavalue(&i128_neg1)
1572        );
1573        let hashes: std::collections::HashSet<u64> = [&max, &zero, &one]
1574            .iter()
1575            .map(|v| crate::hash_datavalue(v))
1576            .collect();
1577        assert_eq!(hashes.len(), 3);
1578    }
1579
1580    #[rstest]
1581    fn test_hash_datavalue_f64_special_values() {
1582        // NaN bit patterns: NaN == NaN for hashing since we use to_bits()
1583        let nan1 = DataValue::F64(f64::NAN);
1584        let nan2 = DataValue::F64(f64::NAN);
1585        assert_eq!(crate::hash_datavalue(&nan1), crate::hash_datavalue(&nan2));
1586
1587        // subnormal
1588        let subnormal = DataValue::F64(f64::MIN_POSITIVE / 2.0);
1589        let normal = DataValue::F64(f64::MIN_POSITIVE);
1590        assert_ne!(
1591            crate::hash_datavalue(&subnormal),
1592            crate::hash_datavalue(&normal)
1593        );
1594    }
1595
1596    #[rstest]
1597    fn test_hash_datavalue_enum_number_vs_i32_same_value() {
1598        // EnumNumber(42) and I32(42) should hash differently (different discriminant)
1599        let enum_val = DataValue::EnumNumber(42);
1600        let i32_val = DataValue::I32(42);
1601        assert_ne!(
1602            crate::hash_datavalue(&enum_val),
1603            crate::hash_datavalue(&i32_val)
1604        );
1605    }
1606}