1use column_store::sorted_df::SortedDataFrame;
2use data_value::{DataValue, Extract};
3use halfbrown::HashMap;
4use ndarray::{Array1, Array2, ArrayView1};
5use std::fmt;
6pub mod column_store;
7pub mod index;
8pub mod join;
9pub mod key;
10use crate::{error::Error, CandidateData};
11#[cfg(feature = "python")]
12pub mod python;
13
14#[cfg(feature = "python")]
15use pyo3::prelude::*;
16
17use crate::{
18 dataframe::{column_store::ColumnFrame, join::JoinRelation, key::Key},
19 MLChefMap,
20};
21
/// Selector carrying a row count for one of two ends of an ordering.
///
/// NOTE(review): judging only by the `top_n` test cases in this file,
/// `First(n)` keeps the first `n` rows of the ascending order and `Last(n)`
/// keeps the last `n` (largest first) — confirm against the consumer.
#[derive(Debug, Clone, PartialEq, Eq, Copy)]
pub enum TopN {
    /// Take `n` entries from the front.
    First(usize),
    /// Take `n` entries from the back.
    Last(usize),
}
33
/// A [`ColumnFrame`] bundled with per-frame constants and free-form metadata.
///
/// Serializable through serde; exposed as a Python class when the `python`
/// feature is enabled.
#[derive(Debug, Clone, PartialEq, Eq, Default, serde::Serialize, serde::Deserialize)]
#[cfg_attr(feature = "python", pyclass)]
pub struct DataFrame {
    /// Values keyed by [`Key`]; written by `insert_constant` and merged in `join`.
    pub constants: HashMap<Key, DataValue>,
    /// The underlying columnar storage all data operations delegate to.
    pub dataframe: ColumnFrame,
    /// String-keyed metadata; written by `add_metadata`, read by `get_metadata`.
    pub metadata: HashMap<String, DataValue>,
}
70
/// Formats the frame by forwarding to the inner `dataframe`'s `fmt`;
/// `constants` and `metadata` are not part of the output produced here.
impl fmt::Display for DataFrame {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        self.dataframe.fmt(f)
    }
}
76
77impl DataFrame {
78 pub fn new<C: Into<ColumnFrame>>(dataframe: C) -> Self {
93 Self {
94 constants: HashMap::new(),
95 dataframe: dataframe.into(),
96 metadata: HashMap::new(),
97 }
98 }
99
100 pub fn n_columns(&self) -> usize {
102 self.dataframe.data_frame.ncols()
103 }
104
105 pub fn n_rows(&self) -> usize {
107 self.dataframe.data_frame.nrows()
108 }
109
110 pub fn shrink(&mut self) {
113 self.dataframe.shrink();
114 }
115
116 pub fn add_metadata(&mut self, key: String, value: DataValue) {
121 self.metadata.insert(key, value);
122 }
123
124 pub fn get_metadata(&self, key: &str) -> Option<&DataValue> {
126 self.metadata.get(key)
127 }
128
129 pub fn join(&mut self, other: Self, join_type: &JoinRelation) -> Result<(), Error> {
141 for (key, value) in other.constants {
142 self.constants.insert(key, value);
143 }
144 self.dataframe.join(other.dataframe, join_type)
145 }
146
147 pub fn apply_function<F>(&mut self, keys: &[Key], mut func: F) -> Result<(), Error>
152 where
153 F: FnMut(&[Key], &mut ColumnFrame) -> Result<(), Error>,
154 {
155 self.dataframe.apply_function(keys, &mut func)
156 }
157
158 pub fn select(&self, keys: Option<&[Key]>) -> Result<Array2<DataValue>, Error> {
173 Ok(self.dataframe.select(keys))
174 }
175
176 pub fn select_typed<T: Extract>(&self, keys: Option<&[Key]>) -> Result<Array2<T>, Error> {
203 Ok(self.dataframe.select_typed(keys))
204 }
205
206 pub fn select_transposed_typed<D: Extract>(&self, keys: &[Key]) -> Vec<Vec<D>> {
215 self.dataframe.select_transposed_typed::<D>(keys)
216 }
217
218 pub fn select_column(&self, key: Key) -> Option<ndarray::ArrayView1<'_, DataValue>> {
223 self.dataframe.select_column(&key)
224 }
225
226 pub fn select_transposed(&self, keys: Option<&[Key]>) -> Result<Array2<DataValue>, Error> {
231 self.dataframe.select_transposed(keys)
232 }
233
234 pub fn insert_constant(&mut self, key: Key, value: DataValue) {
240 self.constants.insert(key, value);
241 }
242
243 pub fn push<C: CandidateData>(&mut self, item: C) -> Result<(), Error> {
249 self.dataframe.push(item)
250 }
251
252 pub fn remove_column(&mut self, keys: &[Key]) -> Result<Self, Error> {
255 self.dataframe.remove_column(keys).map(|x| x.into())
256 }
257
258 pub fn extend(&mut self, items: Self) -> Result<(), Error> {
263 self.dataframe.extend(items.dataframe)
264 }
265
266 pub fn len(&self) -> usize {
268 self.dataframe.len()
269 }
270
271 pub fn is_empty(&self) -> bool {
273 self.dataframe.is_empty()
274 }
275
276 pub fn add_single_column<K: Into<Key>>(
281 &mut self,
282 key: K,
283 values: Array1<DataValue>,
284 ) -> Result<(), Error> {
285 self.dataframe.add_single_column(key, values)
286 }
287
288 pub fn get_single_column(&self, key: &Key) -> Option<ArrayView1<'_, DataValue>> {
293 self.dataframe.get_single_column(key)
294 }
295
296 pub fn get_single_column_typed<T: Extract>(&self, key: &Key) -> Option<Array1<T>> {
323 self.dataframe.get_single_column_typed(key)
324 }
325
326 pub fn sorted(&self, key: &Key) -> Result<SortedDataFrame<'_>, Error> {
331 self.dataframe.sorted(key)
332 }
333
334 pub fn filter(&self, filter: &crate::filter::FilterRules) -> Result<Self, Error> {
341 let filtered_df = self.dataframe.filter(filter)?;
342 Ok(Self {
343 constants: self.constants.clone(),
344 dataframe: filtered_df,
345 metadata: self.metadata.clone(),
346 })
347 }
348
349 #[cfg(feature = "polars-df")]
354 pub fn as_polars(&self) -> Result<polars::prelude::DataFrame, Error> {
355 let mut columns = vec![];
356 for key in self.dataframe.keys() {
357 let values = self
358 .dataframe
359 .get_single_column(key)
360 .ok_or_else(|| Error::NotFound(key.clone()))?
361 .into_iter()
362 .map(|x| into_polars_value(key, x.clone()))
363 .collect::<Vec<_>>();
364 let s = polars::prelude::Column::new(key.name().into(), values);
365
366 columns.push(s);
367 }
368
369 Ok(polars::prelude::DataFrame::new(columns)?)
370 }
371
372 pub fn load_from_messagepack(bytes: &[u8]) -> Result<Self, Error> {
377 rmp_serde::decode::from_slice(bytes).map_err(|e| Error::UnknownError(format!("{e:?}")))
378 }
379
380 pub fn store_into_messagepack(&self) -> Result<Vec<u8>, Error> {
385 rmp_serde::encode::to_vec(&self).map_err(|e| Error::UnknownError(format!("{e:?}")))
386 }
387}
388
/// Maps this crate's `DataType` onto the polars `DataType` used for it.
///
/// `Vec` becomes a `List` of unknown inner type and `Map` becomes a `Struct`
/// without fields.
#[cfg(feature = "polars-df")]
pub fn polars_dtype(dtype: crate::DataType) -> polars::prelude::DataType {
    use crate::DataType as Source;
    use polars::prelude::{DataType as Target, UnknownKind};

    match dtype {
        Source::Bool => Target::Boolean,
        Source::U32 => Target::UInt32,
        Source::I32 => Target::Int32,
        Source::U8 => Target::UInt8,
        Source::U64 => Target::UInt64,
        Source::I64 => Target::Int64,
        Source::F32 => Target::Float32,
        Source::F64 => Target::Float64,
        Source::U128 => Target::UInt128,
        Source::I128 => Target::Int128,
        Source::String => Target::String,
        Source::Bytes => Target::Binary,
        Source::Unknown => Target::Null,
        Source::Vec => Target::List(Box::new(Target::Unknown(UnknownKind::Any))),
        Source::Map => Target::Struct(vec![]),
    }
}
416
/// Converts a [`DataValue`] into an owned polars `AnyValue`.
///
/// The value is first passed through `convert_dv_to_dtype` with `key`, then
/// mapped variant by variant:
/// * a `Vec` becomes a `List` series named after `key`; its elements are
///   converted with a key whose dtype is the first non-`Unknown` dtype
///   detected among them (`Unknown` if there is none);
/// * a `Map` becomes a `StructOwned` whose fields are ordered by map key, each
///   typed via `detect_dtype` / [`polars_dtype`].
///
/// # Panics
/// Panics when polars cannot build a series from the converted list elements.
#[cfg(feature = "polars-df")]
pub fn into_polars_value(key: &Key, dv: DataValue) -> polars::prelude::AnyValue<'static> {
    use polars::prelude::{AnyValue, Field};

    use crate::dataframe::column_store::convert_dv_to_dtype;

    match convert_dv_to_dtype(key, dv) {
        DataValue::String(smart_string) => AnyValue::StringOwned(smart_string.as_str().into()),
        DataValue::Bytes(items) => AnyValue::BinaryOwned(items),
        // NOTE(review): u8 is widened to UInt32 here although `polars_dtype`
        // maps U8 to UInt8 — presumably deliberate; confirm before changing.
        DataValue::U8(x) => AnyValue::UInt32(x as _),
        DataValue::Bool(x) => AnyValue::Boolean(x),
        DataValue::I32(x) => AnyValue::Int32(x),
        DataValue::U32(x) => AnyValue::UInt32(x),
        DataValue::I64(x) => AnyValue::Int64(x),
        DataValue::U64(x) => AnyValue::UInt64(x),
        DataValue::I128(x) => AnyValue::Int128(x),
        DataValue::F32(x) => AnyValue::Float32(x),
        DataValue::F64(x) => AnyValue::Float64(x),
        DataValue::Null => AnyValue::Null,
        DataValue::Vec(data_values) => {
            // The first element with a detectable dtype decides the element key.
            let element_dtype = data_values
                .iter()
                .map(|d| crate::detect_dtype(d))
                .find(|dt| !matches!(dt, crate::DataType::Unknown))
                .unwrap_or(crate::DataType::Unknown);
            let vec_key = Key::new(key.name(), element_dtype);
            let elements = data_values
                .into_iter()
                .map(|x| into_polars_value(&vec_key, x))
                .collect::<Vec<_>>();
            let series =
                polars::series::Series::from_any_values(key.name().into(), &elements, true)
                    // Build the panic message only on failure.
                    .unwrap_or_else(|e| panic!("Cannot create series for {key:?}: {e:?}"));
            AnyValue::List(series)
        }
        DataValue::EnumNumber(x) => AnyValue::Int32(x),
        DataValue::U128(x) => AnyValue::UInt128(x),
        DataValue::Map(x) => {
            // Sort the entries by key so the struct field order is deterministic.
            let mut entries = x.iter().collect::<Vec<_>>();
            entries.sort_unstable_by(|a, b| a.0.cmp(b.0));

            let mut values = Vec::with_capacity(entries.len());
            let mut fields = Vec::with_capacity(entries.len());
            for (name, value) in entries {
                let dtype = crate::detect_dtype(value);
                let k = Key::new(name, dtype);
                values.push(into_polars_value(&k, value.to_owned()));
                fields.push(Field::new(k.name().into(), polars_dtype(dtype)));
            }
            AnyValue::StructOwned(Box::new((values, fields)))
        }
    }
}
481
/// Converts a polars `AnyValue` into a [`DataValue`].
///
/// `UInt16` is widened to `U32`, `Int8`/`Int16` to `i32`. Lists are converted
/// element by element and owned structs become a `Map` keyed by field name.
/// Any variant without an arm is logged with `tracing::warn!` and mapped to
/// `DataValue::Null`.
#[cfg(feature = "polars-df")]
pub fn from_polars_value(dv: polars::prelude::AnyValue<'_>) -> DataValue {
    use polars::prelude::AnyValue;

    match dv {
        AnyValue::Null => DataValue::Null,
        AnyValue::Boolean(v) => v.into(),
        AnyValue::String(v) => DataValue::String(v.into()),
        AnyValue::UInt8(v) => DataValue::U8(v),
        AnyValue::UInt16(v) => DataValue::U32(v as u32),
        AnyValue::UInt32(v) => v.into(),
        AnyValue::UInt64(v) => v.into(),
        AnyValue::Int8(v) => (v as i32).into(),
        AnyValue::Int16(v) => (v as i32).into(),
        AnyValue::Int32(v) => v.into(),
        AnyValue::Int64(v) => v.into(),
        AnyValue::Float32(v) => v.into(),
        AnyValue::Float64(v) => v.into(),
        AnyValue::Int128(v) => v.into(),
        AnyValue::List(series) => {
            DataValue::Vec(series.iter().map(from_polars_value).collect::<Vec<_>>())
        }
        AnyValue::StringOwned(v) => DataValue::String(v.as_str().into()),
        AnyValue::Binary(v) => DataValue::Bytes(v.to_owned()),
        AnyValue::BinaryOwned(v) => DataValue::Bytes(v),
        AnyValue::StructOwned(m) => {
            // The boxed pair is (values, fields); pair each field with its value.
            let (values, fields) = *m;
            let hm: std::collections::HashMap<smartstring::alias::String, DataValue> = fields
                .into_iter()
                .zip(values)
                .map(|(field, value)| (field.name.as_str().into(), from_polars_value(value)))
                .collect();
            DataValue::Map(hm)
        }
        e => {
            tracing::warn!("Unsupported polars value: {e:?}");
            DataValue::Null
        }
    }
}
524
525impl From<ColumnFrame> for DataFrame {
526 fn from(dataframe: ColumnFrame) -> Self {
527 Self::new(dataframe)
528 }
529}
530
531impl From<Vec<std::collections::HashMap<Key, DataValue>>> for DataFrame {
532 fn from(dataframe: Vec<std::collections::HashMap<Key, DataValue>>) -> Self {
533 Self::new(ColumnFrame::from(dataframe))
534 }
535}
536
537impl From<Vec<HashMap<Key, DataValue>>> for DataFrame {
538 fn from(dataframe: Vec<HashMap<Key, DataValue>>) -> Self {
539 Self::new(ColumnFrame::from(dataframe))
540 }
541}
542
543impl From<std::collections::HashMap<String, Vec<DataValue>>> for DataFrame {
544 fn from(dataframe: std::collections::HashMap<String, Vec<DataValue>>) -> Self {
545 Self::new(ColumnFrame::from(dataframe))
546 }
547}
548
549impl From<MLChefMap> for DataFrame {
550 fn from(dataframe: MLChefMap) -> Self {
551 Self::new(ColumnFrame::from(dataframe))
552 }
553}
554impl From<Vec<(Key, Vec<DataValue>)>> for DataFrame {
555 fn from(dataframe: Vec<(Key, Vec<DataValue>)>) -> Self {
556 Self::new(ColumnFrame::from(dataframe))
557 }
558}
559
560impl From<std::collections::HashMap<String, Array1<DataValue>>> for DataFrame {
561 fn from(dataframe: std::collections::HashMap<String, Array1<DataValue>>) -> Self {
562 Self::new(ColumnFrame::from(dataframe))
563 }
564}
565
/// Builds a frame from a polars `DataFrame`,
/// going through the matching `ColumnFrame` conversion.
#[cfg(feature = "polars-df")]
impl From<polars::prelude::DataFrame> for DataFrame {
    fn from(polars_frame: polars::prelude::DataFrame) -> Self {
        Self::new(polars_frame)
    }
}
572#[cfg(test)]
573mod test {
574 use crate::filter::FilterRules;
575
576 use super::*;
577 use halfbrown::hashmap;
578 #[cfg(feature = "polars-df")]
579 use polars::prelude::NamedFrom as _;
580 use rstest::*;
581 use tracing_test::traced_test;
582 #[fixture]
583 fn dummy_candidates() -> ColumnFrame {
584 ColumnFrame::from(vec![
585 hashmap! {
586 "key1".into() => 1.into(),
587 "key2".into() => "a".into(),
588 },
589 hashmap! {
590 "key1".into() => 2.into(),
591 "key2".into() => "b".into(),
592 },
593 ])
594 }
595
596 #[rstest]
597 fn test_serde() {
598 let df = crate::df! {
599 "a" => [1u64, 2u64, 3u64],
600 "b" => [4u64, 5u64, 6u64],
601 "c" => [7u64, 8u64, 9u64]
602 };
603
604 let serialized = serde_json::to_string(&df).expect("BUG: Unable to serialize dataframe");
605
606 let deserialized =
607 serde_json::from_str(&serialized).expect("BUG: Unable to deserialize dataframe");
608
609 assert_eq!(df, deserialized);
610 }
611
612 #[cfg(feature = "polars-df")]
613 #[rstest]
614 fn test_polars() {
615 let expected = crate::df! {
616 "a" => [1u64, 2u64, 3u64],
617 "b" => [4f64, 5f64, 6f64],
618 "c" => [7i64, 8i64, 9i64]
619 };
620
621 let polars_df = polars::df!(
622 "a" => [1u64, 2u64, 3u64],
623 "b" => [4f64, 5f64, 6f64],
624 "c" => [7i64, 8i64, 9i64]
625 )
626 .expect("BUG: should be ok");
627 let as_df: DataFrame = polars_df.into();
628 let keys: Vec<Key> = vec!["a".into(), "b".into(), "c".into()];
629 assert_eq!(
630 as_df.select(Some(keys.as_slice())),
631 expected.select(Some(keys.as_slice()))
632 );
633 }
634 #[cfg(feature = "polars-df")]
635 use crate::DataType;
636 #[cfg(feature = "polars-df")]
637 #[rstest]
638 #[case::str(Key::new("a", DataType::String), DataValue::String("test".into()), polars::prelude::AnyValue::String("test".into()))]
639 #[case::u32(
640 Key::new("a", DataType::U32),
641 DataValue::U32(u32::MAX),
642 polars::prelude::AnyValue::UInt32(u32::MAX)
643 )]
644 #[case::i32(
645 Key::new("a", DataType::I32),
646 DataValue::I32(i32::MIN),
647 polars::prelude::AnyValue::Int32(i32::MIN)
648 )]
649 #[case::i64(
650 Key::new("a", DataType::I64),
651 DataValue::I64(i64::MIN),
652 polars::prelude::AnyValue::Int64(i64::MIN)
653 )]
654 #[case::u64(
655 Key::new("a", DataType::U64),
656 DataValue::U64(u64::MIN),
657 polars::prelude::AnyValue::UInt64(u64::MIN)
658 )]
659 #[case::f32(
660 Key::new("a", DataType::F32),
661 DataValue::F32(f32::MIN),
662 polars::prelude::AnyValue::Float32(f32::MIN)
663 )]
664 #[case::f64(
665 Key::new("a", DataType::F64),
666 DataValue::F64(f64::MIN),
667 polars::prelude::AnyValue::Float64(f64::MIN)
668 )]
669 #[case::null(
670 Key::new("a", DataType::Unknown),
671 DataValue::Null,
672 polars::prelude::AnyValue::Null
673 )]
674 #[case::i128(
675 Key::new("a", DataType::I128),
676 DataValue::I128(i128::MIN),
677 polars::prelude::AnyValue::Int128(i128::MIN)
678 )]
679 #[case::u8(
680 Key::new("a", DataType::U8),
681 DataValue::U8(255),
682 polars::prelude::AnyValue::UInt8(255)
683 )]
684 #[case::bool(
685 Key::new("a", DataType::Bool),
686 DataValue::Bool(true),
687 polars::prelude::AnyValue::Boolean(true)
688 )]
689 #[case::bytes(Key::new("a", DataType::Bytes), DataValue::Bytes("aaaaa".as_bytes().to_vec()), polars::prelude::AnyValue::BinaryOwned("aaaaa".as_bytes().to_vec()))]
690 #[case::vec_uints(Key::new("a", DataType::Vec), DataValue::Vec(vec![DataValue::U32(0), DataValue::U32(1)]), polars::prelude::AnyValue::List(polars::series::Series::new("v".into(), vec![polars::prelude::AnyValue::UInt32(0u32), polars::prelude::AnyValue::UInt32(1)])))]
691 #[case::map(Key::new("a", DataType::Map), DataValue::Map(data_value::stdhashmap!("a" => 0u64, "b" => "s")), polars::prelude::AnyValue::StructOwned(Box::new((
692 vec![polars::prelude::AnyValue::UInt64(0u64), polars::prelude::AnyValue::String("s".into())],
693 vec![polars::prelude::Field::new("a".into(), polars::prelude::DataType::UInt64), polars::prelude::Field::new("b".into(), polars::prelude::DataType::String)]))))]
694 fn into_polars_value_test(
698 #[case] key: Key,
699 #[case] input: DataValue,
700 #[case] output: polars::prelude::AnyValue<'static>,
701 ) {
702 assert_eq!(into_polars_value(&key, input.clone()), output);
703 assert_eq!(from_polars_value(output), input);
704 }
705
706 #[rstest]
718 #[case(
719 DataFrame::new(crate::column_frame! {
720 "a" => [1f64, 2f64, 3f64],
721 "b" => [4i64, 5i64, 6i64],
722 "c" => [7i64, 8i64, 9i64]
723 }),
724 DataFrame::new(crate::column_frame! {
725 "a" => [1f64, 2f64],
726 "b" => [4i64, 5i64],
727 "c" => [7i64, 8i64]
728 }),
729 FilterRules::try_from("a >= 1f64 && (b <= 5 || c <= 8) && b >= 4").expect("BUG: cannot create filter rules"),
730 )]
731 #[case(
732 DataFrame::new(crate::column_frame! {
733 "a" => [1f64, 2f64, 3f64],
734 "b" => [4i64, 5i64, 6i64],
735 "c" => [7i64, 8i64, 9i64]
736 }),
737 DataFrame::new(crate::column_frame! {
738 "a" => [2f64],
739 "b" => [5i64],
740 "c" => [8i64]
741 }),
742 FilterRules::try_from("a % 2f64 == 0f64").expect("BUG: cannot create filter rules"),
743 )]
744 #[traced_test]
745 fn filter_test(
746 #[case] df: DataFrame,
747 #[case] expected: DataFrame,
748 #[case] filter: FilterRules,
749 ) {
750 let filtered = df.filter(&filter).expect("BUG: cannot filter");
751 assert_eq!(filtered, expected);
752 }
753
754 #[rstest]
755 fn test_serde_complex() {
756 let simple = r#"
757{
758 "constants": {},
759 "dataframe": {
760 "index": {
761 "keys": [
762 {
763 "key": 3162770485,
764 "name": "a",
765 "ctype": "U32"
766 },
767 {
768 "key": 2279056742,
769 "name": "b",
770 "ctype": "F64"
771 },
772 {
773 "key": 2994984227,
774 "name": "c",
775 "ctype": "U64"
776 },
777 {
778 "key": 3319645144,
779 "name": "d",
780 "ctype": "F64"
781 },
782 {
783 "key": 1291847470,
784 "name": "e",
785 "ctype": "U32"
786 },
787 {
788 "key": 874241070,
789 "name": "f",
790 "ctype": "Bool"
791 }
792 ],
793 "indexes": {
794 "a": 0,
795 "b": 1,
796 "c": 2,
797 "d": 3,
798 "e": 4,
799 "f": 5
800 },
801 "alias": {}
802 },
803 "data_frame": {
804 "v": 1,
805 "dim": [
806 2,
807 6
808 ],
809 "data": [
810 253780,
811 0.009369421750307085,
812 1633222860381359,
813 8,
814 5,
815 true,
816 64512,
817 0.003391335718333721,
818 1633222860810557,
819 8,
820 5,
821 null
822 ]
823 }
824 },
825 "metadata": {}
826}
827 "#;
828
829 let simple_deserialized: DataFrame =
830 serde_json::from_str(simple).expect("BUG: Unable to deserialize dataframe");
831
832 println!("deserialized: {simple_deserialized:?}");
833 let array = format!("[{}, {}, {}]", simple, simple, simple);
834 let deserialized: Vec<DataFrame> =
835 serde_json::from_str(&array).expect("BUG: Unable to deserialize dataframe");
836
837 println!("deserialized: {deserialized:?}");
838 assert_eq!(deserialized.len(), 3);
839 assert_eq!(simple_deserialized, deserialized[0]);
840 }
841
842 #[rstest]
843 #[case(hashmap!("key1".into() => vec![1.into(), 2.into()], "key2".into() => vec!["a".into()]))]
844 #[case(data_value::stdhashmap!("key1" => vec![1, 2], "key2" => vec!["a"]))]
845 #[case(vec![hashmap! {
846 "key1".into() => 1.into(),
847 "key2".into() => "a".into(),
848 },
849 hashmap! {
850 "key1".into() => 2.into(),
851 },])]
852 #[case(vec![data_value::stdhashmap! {
853 "key1" => DataValue::from(1),
854 "key2" => DataValue::from("a"),
855 },data_value::stdhashmap! {
856 "key1" => DataValue::from(2),
857 },])]
858 #[case(vec![("key1".into(), vec! [DataValue::from(1), DataValue::from(2)]), ("key2".into(),
859 vec![DataValue::from("a"), DataValue::Null])])]
860 fn test_select_column<T: Into<DataFrame>>(#[case] input: T) {
861 let df: DataFrame = input.into();
862 assert_eq!(
863 df,
864 DataFrame {
865 constants: HashMap::new(),
866 dataframe: ColumnFrame::from(vec![
867 hashmap! {
868 "key1".into() => 1.into(),
869 "key2".into() => "a".into(),
870 },
871 hashmap! {
872 "key1".into() => 2.into(),
873 },
874 ]),
875 metadata: HashMap::new(),
876 }
877 );
878 let selected_transposed = df.select_column("key1".into());
879 assert!(selected_transposed.is_some());
880 let selected_transposed = selected_transposed.unwrap();
881 assert_eq!(selected_transposed.len(), 2);
882 assert_eq!(selected_transposed, ndarray::array![1.into(), 2.into()]);
883 }
884
885 #[rstest]
886 #[case::hhm(hashmap!("key1".into() => vec![1.into(), 2.into()], "key2".into() => vec!["a".into()]))]
887 #[case::stdhm(data_value::stdhashmap!("key1" => vec![1, 2], "key2" => vec!["a"]))]
888 #[case::hm({
889 let hm: std::collections::HashMap<String, Array1<DataValue>> = data_value::stdhashmap!("key1".to_string() => Array1::from_vec(vec![DataValue::from(1), DataValue::from(2)]), "key2".to_string() => Array1::from_vec(vec![DataValue::from("a"), DataValue::Null]));
890 hm
891 })]
892 #[case::vec_hhm(vec![hashmap! {
893 "key1".into() => 1.into(),
894 "key2".into() => "a".into(),
895 },
896 hashmap! {
897 "key1".into() => 2.into(),
898 },])]
899 #[case::vec_hme(vec![data_value::stdhashmap! {
900 "key1" => DataValue::from(1),
901 "key2" => DataValue::from("a"),
902 },data_value::stdhashmap! {
903 "key1" => DataValue::from(2),
904 },])]
905 #[case::vec_vec(vec![("key1".into(), vec! [DataValue::from(1), DataValue::from(2)]), ("key2".into(), vec![DataValue::from("a"), DataValue::Null])])]
906 fn test_from_conversion<T: Into<DataFrame>>(#[case] input: T) {
907 let df: DataFrame = input.into();
908 let expected: DataFrame = DataFrame {
909 constants: HashMap::new(),
910 dataframe: ColumnFrame::from(vec![
911 hashmap! {
912 "key1".into() => 1.into(),
913 "key2".into() => "a".into(),
914 },
915 hashmap! {
916 "key1".into() => 2.into(),
917 },
918 ]),
919 metadata: HashMap::new(),
920 };
921 assert_eq!(
922 df.select(Some(&["key1".into(), "key2".into()])),
923 expected.select(Some(&["key1".into(), "key2".into()])),
924 "{df} vs {expected}"
925 );
926 let selected_transposed = df.select_transposed_typed::<i32>(&["key1".into()]);
927 assert_eq!(selected_transposed.len(), 2);
928 println!("{:?}", selected_transposed);
929 assert_eq!(selected_transposed, vec![vec![1], vec![2]]);
930 }
931 #[rstest]
932 fn test_dataframe(dummy_candidates: ColumnFrame) {
933 let mut dataframe: DataFrame = DataFrame::default();
934 assert!(dataframe.is_empty());
935 assert!(dataframe.extend(dummy_candidates.into()).is_ok());
936 assert_eq!(dataframe.len(), 2);
937
938 let candidate = hashmap! {
939 "key1".into() => 3.into(),
940 "key2".into() => "c".into(),
941 };
942
943 assert!(dataframe.push(candidate).is_ok());
944 assert_eq!(dataframe.len(), 3);
945 assert!(!dataframe.is_empty());
946
947 dataframe.insert_constant("key3".into(), 4.into());
948 assert_eq!(dataframe.constants.len(), 1);
949 assert!(dataframe
950 .apply_function(&["key1".into()], |keys, df| {
951 let key = keys[0].clone();
952 let s = df
953 .get_single_column(&key)
954 .expect("BUG: Cannot get column")
955 .to_owned();
956 let s = s.mapv(|x| x + DataValue::from(1));
957 df.add_single_column("key5", s)?;
958 Ok(())
959 })
960 .is_ok());
961 let original = dataframe.clone();
962 dataframe.shrink();
963 let remove_df = dataframe.remove_column(&["key1".into()]);
964 assert!(remove_df.is_ok());
965 let mut remove_df = remove_df.unwrap();
966 assert_eq!(remove_df.len(), 3);
967 let selected = dataframe.select(Some(&["key2".into()]));
968 assert!(selected.is_ok());
969 let selected = selected.unwrap();
970 println!("{:?}", selected);
971
972 let joined_result =
974 remove_df.join(dataframe, &JoinRelation::new(crate::JoinBy::AddColumns));
975 assert!(joined_result.is_ok(), "{:?}", joined_result);
976 let keys = vec!["key1".into(), "key2".into(), "key5".into()];
977 assert_eq!(
978 original.select(Some(keys.as_slice())),
979 remove_df.select(Some(keys.as_slice()))
980 );
981 }
982
983 #[rstest]
984 fn test_size_methods() {
985 let candidate = hashmap! {
986 "key1".into() => 3.into(),
987 "key2".into() => "c".into(),
988 "key3".into() => false.into()
989 };
990
991 let dataframe: DataFrame = vec![candidate].into();
992
993 assert_eq!(dataframe.n_columns(), 3);
994 assert_eq!(dataframe.n_rows(), 1);
995 }
996
997 #[rstest]
998 fn test_metadata(dummy_candidates: ColumnFrame) {
999 let mut dataframe: DataFrame = DataFrame::default();
1000 assert!(dataframe.is_empty());
1001 println!("{:?}", dataframe);
1002 assert!(dataframe.extend(dummy_candidates.into()).is_ok());
1003 println!("{:?}", dataframe);
1004 assert_eq!(dataframe.len(), 2);
1005
1006 dataframe.add_metadata("test".into(), 1.into());
1007 assert_eq!(dataframe.get_metadata("test"), Some(&1.into()));
1008 let dataframe = DataFrame::new(ColumnFrame::from(vec![
1009 hashmap! {
1010 "key1".into() => 1.into(),
1011 "key2".into() => "a".into(),
1012 },
1013 hashmap! {
1014 "key1".into() => 2.into(),
1015 "key2".into() => "b".into(),
1016 },
1017 ]));
1018 assert_eq!(dataframe.get_metadata("test"), None);
1019 let tt = dataframe.select_transposed(None);
1020 assert!(tt.is_ok());
1021 let tt = tt.unwrap();
1022 assert_eq!(tt.shape(), [2, 2]);
1023 assert_eq!(
1024 tt,
1025 Array2::from_shape_vec((2, 2), vec![1.into(), 2.into(), "a".into(), "b".into()])
1026 .unwrap()
1027 );
1028 }
1029
1030 #[rstest]
1031 #[traced_test]
1032 fn add_single_column_test() {
1033 let mut dataframe = DataFrame::default();
1034 let values = Array1::from(vec![1.into(), 2.into(), 3.into()]);
1035 let r = dataframe.add_single_column("key1", values);
1036 assert!(r.is_ok(), "{r:?}");
1037 let selected = dataframe.select(None);
1038 assert!(selected.is_ok());
1039 let selected = selected.unwrap();
1040 assert_eq!(selected.shape(), [3, 1]);
1041 assert_eq!(
1042 selected,
1043 Array2::from_shape_vec((3, 1), vec![1.into(), 2.into(), 3.into()]).unwrap()
1044 );
1045 let values = Array1::from(vec![1.into(), 2.into()]);
1046 assert!(dataframe.add_single_column("key1", values).is_err());
1047 let values = Array1::from(vec![3.into(), 4.into(), 5.into()]);
1048 assert!(dataframe.add_single_column("key2", values).is_ok());
1049 let values = Array1::from(vec![3.into()]);
1050 assert!(dataframe.add_single_column("key3", values).is_err());
1051 }
1052
1053 #[rstest]
1054 #[traced_test]
1055 fn add_single_column_empty_test() {
1056 let mut dataframe = DataFrame::default();
1057 let values = Array1::from(vec![]);
1058 let r = dataframe.add_single_column("key1", values);
1059 assert!(r.is_ok(), "{r:?}");
1060 let selected = dataframe.select(None);
1061 assert!(selected.is_ok());
1062 let selected = selected.unwrap();
1063 assert_eq!(selected.shape(), [0, 1]);
1064 assert_eq!(selected, Array2::from_shape_vec((0, 1), vec![]).unwrap());
1065 let values = Array1::from(vec![1.into(), 2.into()]);
1066 assert!(dataframe.add_single_column("key1", values).is_err());
1067 let values = Array1::from(vec![3.into(), 4.into(), 5.into()]);
1068 assert!(dataframe.add_single_column("key2", values).is_ok());
1069 let values = Array1::from(vec![3.into(), 4.into()]);
1070 assert!(dataframe.add_single_column("key3", values).is_err());
1071 let values = Array1::from(vec![3.into(), 4.into(), 5.into()]);
1072 assert!(dataframe.add_single_column("key3", values).is_ok());
1073
1074 assert_eq!(
1075 dataframe
1076 .select_column("key1".into())
1077 .expect("BUG: has to exists"),
1078 ndarray::arr1(&[DataValue::Null, DataValue::Null, DataValue::Null]),
1079 );
1080 assert_eq!(
1081 dataframe
1082 .select_column("key2".into())
1083 .expect("BUG: has to exists"),
1084 ndarray::arr1(&[3.into(), 4.into(), 5.into()]),
1085 );
1086 assert_eq!(
1087 dataframe.select(None).expect("BUG: cannot get data"),
1088 ndarray::arr2(&[
1089 [DataValue::Null, 3.into(), 3.into()],
1090 [DataValue::Null, 4.into(), 4.into()],
1091 [DataValue::Null, 5.into(), 5.into()],
1092 ])
1093 );
1094 }
1095
1096 #[rstest]
1097 #[case(
1098 DataFrame::new(ColumnFrame::from(vec![
1099 hashmap! {
1100 "k".into() => 1.into(),
1101 "k2".into() => 2.into(),
1102 "k3".into() => 2.2.into(),
1103 },
1104 hashmap! {
1105 "k".into() => 11.into(),
1106 "k2".into() => 3.into(),
1107 },
1108 hashmap! {
1109 "k".into() => 4.into(),
1110 "k2".into() => 5.into(),
1111 "k3".into() => 2.3.into(),
1112 },
1113 hashmap! {
1114 "k".into() => 4.into(),
1115 "k2".into() => 5.into(),
1116 "k3".into() => 2.4.into(),
1117 },
1118 ])),
1119 vec!["k".into(), "k2".into()],
1120 Array2::from_shape_vec((4, 2), vec![1.into(), 2.into(), 11.into(), 3.into(), 4.into(), 5.into(), 4.into(), 5.into()]).unwrap()
1121 )]
1122 #[case(
1123 DataFrame::new(ColumnFrame::from(vec![
1124 hashmap! {
1125 "k".into() => 1.into(),
1126 "k2".into() => 2.into(),
1127 "k3".into() => 2.2.into(),
1128 },
1129 hashmap! {
1130 "k".into() => 11.into(),
1131 "k2".into() => 3.into(),
1132 },
1133 hashmap! {
1134 "k".into() => 4.into(),
1135 "k2".into() => 5.into(),
1136 "k3".into() => 2.3.into(),
1137 },
1138 hashmap! {
1139 "k".into() => 4.into(),
1140 "k2".into() => 5.into(),
1141 "k3".into() => 2.4.into(),
1142 },
1143 ])),
1144 vec!["k2".into(), "k3".into(), "nonexist1".into(), "nonexists2".into(), "k".into()],
1145 Array2::from_shape_vec((4, 5), vec![
1146 2.into(), 2.2.into(), DataValue::Null, DataValue::Null, 1.into(),
1147 3.into(), DataValue::Null, DataValue::Null, DataValue::Null, 11.into(),
1148 5.into(), 2.3.into(), DataValue::Null, DataValue::Null, 4.into(),
1149 5.into(), 2.4.into(), DataValue::Null, DataValue::Null, 4.into()]).unwrap()
1150 )]
1151 #[traced_test]
1152 fn select_multiple(
1153 #[case] input: DataFrame,
1154 #[case] columns: Vec<Key>,
1155 #[case] expected: Array2<DataValue>,
1156 ) {
1157 let selected = input.select(Some(&columns));
1158 assert!(selected.is_ok());
1159 let selected = selected.unwrap();
1160
1161 assert_eq!(selected, expected);
1162 }
1163
1164 #[rstest]
1165 #[case(
1166 DataFrame::new(ColumnFrame::from(vec![
1167 hashmap! {
1168 "k".into() => 1.into(),
1169 "k2".into() => 2.into(),
1170 "k3".into() => 2.2.into(),
1171 },
1172 hashmap! {
1173 "k".into() => 11.into(),
1174 "k2".into() => 3.into(),
1175 },
1176 hashmap! {
1177 "k".into() => 4.into(),
1178 "k2".into() => 5.into(),
1179 "k3".into() => 2.3.into(),
1180 },
1181 hashmap! {
1182 "k".into() => 4.into(),
1183 "k2".into() => 5.into(),
1184 "k3".into() => 2.4.into(),
1185 },
1186 ])),
1187 "k".into(),
1188 Array2::from_shape_vec((4, 3), vec![
1189 1.into(), 2.into(), 2.2.into(),
1190 4.into(), 5.into(), 2.3.into(),
1191 4.into(), 5.into(), 2.4.into(),
1192 11.into(), 3.into(), DataValue::Null,
1193 ]
1194 ).unwrap(),
1195 vec!["k".into(), "k2".into(), "k3".into()],
1196 )]
1197 #[rstest]
1198 #[case(
1199 DataFrame::new(ColumnFrame::from(vec![
1200 hashmap! {
1201 "k".into() => 1.into(),
1202 "k2".into() => 2.into(),
1203 "k3".into() => 2.2.into(),
1204 },
1205 hashmap! {
1206 "k".into() => 11.into(),
1207 "k2".into() => 3.into(),
1208 },
1209 hashmap! {
1210 "k".into() => 4.into(),
1211 "k2".into() => 5.into(),
1212 "k3".into() => 2.3.into(),
1213 },
1214 hashmap! {
1215 "k".into() => 4.into(),
1216 "k2".into() => 5.into(),
1217 "k3".into() => 2.4.into(),
1218 },
1219 ])),
1220 "k3".into(),
1221 Array2::from_shape_vec((4, 3), vec![
1222 11.into(), 3.into(), DataValue::Null,
1223 1.into(), 2.into(), 2.2.into(),
1224 4.into(), 5.into(), 2.3.into(),
1225 4.into(), 5.into(), 2.4.into(),
1226 ]
1227 ).unwrap(),
1228 vec!["k".into(), "k2".into(), "k3".into()],
1229 )]
1230 #[case(
1231 DataFrame::new(ColumnFrame::from(vec![
1232 hashmap! {
1233 "k".into() => 2.into(),
1234 "k2".into() => 0.000001.into(),
1235 },
1236 hashmap! {
1237 "k".into() => 1.into(),
1238 "k2".into() =>0.0000001.into(),
1239 },
1240 hashmap! {
1241 "k".into() => 3.into(),
1242 "k2".into() => 0.00001.into(),
1243 },
1244 hashmap! {
1245 "k".into() => 4.into(),
1246 "k2".into() => 0.001.into(),
1247 },
1248 ])),
1249 "k2".into(),
1250 Array2::from_shape_vec((4, 2), vec![
1251 1.into(), 0.0000001.into(),
1252 2.into(), 0.000001.into(),
1253 3.into(), 0.00001.into(),
1254 4.into(), 0.001.into(),
1255 ]
1256 ).unwrap(),
1257 vec!["k".into(), "k2".into()],
1258 )]
1259 #[case(
1260 DataFrame::new(ColumnFrame::from(vec![
1261 hashmap! {
1262 "k".into() => 2.into(),
1263 "k2".into() => "b".into(),
1264 },
1265 hashmap! {
1266 "k".into() => 1.into(),
1267 "k2".into() =>"a".into(),
1268 },
1269 hashmap! {
1270 "k".into() => 3.into(),
1271 "k2".into() =>"c".into(),
1272 },
1273 hashmap! {
1274 "k".into() => 4.into(),
1275 "k2".into() =>"z".into(),
1276 },
1277 ])),
1278 "k2".into(),
1279 Array2::from_shape_vec((4, 2), vec![
1280 1.into(),"a".into(),
1281 2.into(), "b".into(),
1282 3.into(), "c".into(),
1283 4.into(), "z".into(),
1284 ]
1285 ).unwrap(),
1286 vec!["k".into(), "k2".into()],
1287 )]
1288 #[traced_test]
1289 fn sort_by(
1290 #[case] input: DataFrame,
1291 #[case] column: Key,
1292 #[case] expected: Array2<DataValue>,
1293 #[case] columns: Vec<Key>,
1294 ) {
1295 let result = input.sorted(&column);
1296 assert!(result.is_ok(), "{result:?}");
1297 let result = result.unwrap().get_sorted();
1298 let selected = result.select(Some(&columns));
1299
1300 assert_eq!(selected, expected);
1301 }
1302 #[rstest]
1303 #[case(
1304 DataFrame::new(ColumnFrame::from(vec![
1305 hashmap! {
1306 "k".into() => 2.into(),
1307 "k2".into() => 0.000001.into(),
1308 },
1309 hashmap! {
1310 "k".into() => 1.into(),
1311 "k2".into() =>0.0000001.into(),
1312 },
1313 hashmap! {
1314 "k".into() => 3.into(),
1315 "k2".into() => 0.00001.into(),
1316 },
1317 hashmap! {
1318 "k".into() => 4.into(),
1319 "k2".into() => 0.001.into(),
1320 },
1321 ])),
1322 "k2".into(),
1323 TopN::Last(1),
1324 Array2::from_shape_vec((1, 2), vec![
1325 4.into(), 0.001.into(),
1326 ]
1327 ).unwrap(),
1328 vec!["k".into(), "k2".into()],
1329 )]
1330 #[case(
1331 DataFrame::new(ColumnFrame::from(vec![
1332 hashmap! {
1333 "k".into() => 2.into(),
1334 "k2".into() => 0.000001.into(),
1335 },
1336 hashmap! {
1337 "k".into() => 1.into(),
1338 "k2".into() =>0.0000001.into(),
1339 },
1340 hashmap! {
1341 "k".into() => 3.into(),
1342 "k2".into() => 0.00001.into(),
1343 },
1344 hashmap! {
1345 "k".into() => 4.into(),
1346 "k2".into() => 0.001.into(),
1347 },
1348 ])),
1349 "k2".into(),
1350 TopN::Last(2),
1351 Array2::from_shape_vec((2, 2), vec![
1352 4.into(), 0.001.into(),
1353 3.into(), 0.00001.into(),
1354 ]
1355 ).unwrap(),
1356 vec!["k".into(), "k2".into()],
1357 )]
1358 #[case(
1359 DataFrame::new(ColumnFrame::from(vec![
1360 hashmap! {
1361 "k".into() => 2.into(),
1362 "k2".into() => "b".into(),
1363 },
1364 hashmap! {
1365 "k".into() => 1.into(),
1366 "k2".into() =>"a".into(),
1367 },
1368 hashmap! {
1369 "k".into() => 3.into(),
1370 "k2".into() =>"c".into(),
1371 },
1372 hashmap! {
1373 "k".into() => 4.into(),
1374 "k2".into() =>"z".into(),
1375 },
1376 ])),
1377 "k2".into(),
1378 TopN::First(1),
1379 Array2::from_shape_vec((1, 2), vec![
1380 1.into(),"a".into(),
1381 ]
1382 ).unwrap(),
1383 vec!["k".into(), "k2".into()],
1384 )]
1385 #[case(
1386 DataFrame::new(ColumnFrame::from(vec![
1387 hashmap! {
1388 "k".into() => 2.into(),
1389 "k2".into() => "b".into(),
1390 },
1391 hashmap! {
1392 "k".into() => 1.into(),
1393 "k2".into() =>"a".into(),
1394 },
1395 hashmap! {
1396 "k".into() => 3.into(),
1397 "k2".into() =>"c".into(),
1398 },
1399 hashmap! {
1400 "k".into() => 4.into(),
1401 "k2".into() =>"z".into(),
1402 },
1403 ])),
1404 "k2".into(),
1405 TopN::First(2),
1406 Array2::from_shape_vec((2, 2), vec![
1407 1.into(),"a".into(),
1408 2.into(),"b".into(),
1409 ]
1410 ).unwrap(),
1411 vec!["k".into(), "k2".into()],
1412 )]
1413 #[traced_test]
1414 fn top_n(
1415 #[case] input: DataFrame,
1416 #[case] column: Key,
1417 #[case] topn: TopN,
1418 #[case] expected: Array2<DataValue>,
1419 #[case] columns: Vec<Key>,
1420 ) {
1421 let result = input.sorted(&column);
1422 assert!(result.is_ok(), "{result:?}");
1423 let result = result.unwrap();
1424 let first = result.topn(topn).unwrap();
1425 let selected = first.select(Some(&columns));
1426 assert_eq!(selected, expected);
1427 }
1428
1429 #[rstest]
1430 fn test_messagepack_roundtrip_empty_dataframe() {
1431 let df = DataFrame::default();
1432
1433 let bytes = df
1434 .store_into_messagepack()
1435 .expect("failed to serialize empty df");
1436 let restored =
1437 DataFrame::load_from_messagepack(&bytes).expect("failed to deserialize empty df");
1438 assert_eq!(df, restored);
1439 assert!(restored.is_empty());
1440 }
1441
1442 #[rstest]
1443 fn test_messagepack_roundtrip_strings_and_bools() {
1444 let df = DataFrame::new(ColumnFrame::from(vec![
1446 hashmap! {
1447 "str".into() => DataValue::String("hello".into()),
1448 "bool".into() => DataValue::Bool(true),
1449 },
1450 hashmap! {
1451 "str".into() => DataValue::String("".into()),
1452 "bool".into() => DataValue::Bool(false),
1453 },
1454 ]));
1455
1456 let bytes = df.store_into_messagepack().expect("failed to serialize");
1457 let restored = DataFrame::load_from_messagepack(&bytes).expect("failed to deserialize");
1458 assert_eq!(df, restored);
1459 }
1460
1461 #[rstest]
1462 fn test_messagepack_roundtrip_f64_values() {
1463 let df = DataFrame::new(ColumnFrame::from(vec![
1464 hashmap! {
1465 "a".into() => DataValue::F64(3.14),
1466 },
1467 hashmap! {
1468 "a".into() => DataValue::F64(-2.718),
1469 },
1470 ]));
1471
1472 let bytes = df.store_into_messagepack().expect("failed to serialize");
1473 let restored = DataFrame::load_from_messagepack(&bytes).expect("failed to deserialize");
1474 assert_eq!(df, restored);
1475 }
1476
1477 #[rstest]
1478 fn test_messagepack_f64_special_values_survive_roundtrip() {
1479 let df = DataFrame::new(ColumnFrame::from(vec![hashmap! {
1482 "a".into() => DataValue::F64(f64::INFINITY),
1483 }]));
1484
1485 let bytes = df.store_into_messagepack().expect("failed to serialize");
1486 let restored = DataFrame::load_from_messagepack(&bytes).expect("failed to deserialize");
1487 assert_eq!(restored.len(), 1);
1488 let col = restored.select_column("a".into()).expect("col exists");
1489 match &col[0] {
1490 DataValue::F64(v) => assert!(v.is_infinite() && v.is_sign_positive()),
1491 other => panic!("expected F64, got {other:?}"),
1492 }
1493 }
1494
1495 #[rstest]
1496 fn test_messagepack_roundtrip_with_nulls() {
1497 let df = DataFrame::new(ColumnFrame::from(vec![
1498 hashmap! {
1499 "a".into() => DataValue::String("x".into()),
1500 "b".into() => DataValue::String("y".into()),
1501 },
1502 hashmap! {
1503 "a".into() => DataValue::String("z".into()),
1504 },
1506 ]));
1507
1508 let bytes = df.store_into_messagepack().expect("failed to serialize");
1509 let restored = DataFrame::load_from_messagepack(&bytes).expect("failed to deserialize");
1510 assert_eq!(df, restored);
1511 }
1512
1513 #[rstest]
1514 fn test_messagepack_roundtrip_with_metadata() {
1515 let mut df = DataFrame::new(crate::column_frame! {
1516 "col" => ["a", "b"]
1517 });
1518 df.add_metadata("name".into(), DataValue::String("test_df".into()));
1519 df.add_metadata("flag".into(), DataValue::Bool(true));
1520
1521 let bytes = df.store_into_messagepack().expect("failed to serialize");
1522 let restored = DataFrame::load_from_messagepack(&bytes).expect("failed to deserialize");
1523 assert_eq!(df, restored);
1524 assert_eq!(
1525 restored.get_metadata("name"),
1526 Some(&DataValue::String("test_df".into()))
1527 );
1528 assert_eq!(restored.get_metadata("flag"), Some(&DataValue::Bool(true)));
1529 }
1530
1531 #[rstest]
1532 fn test_messagepack_roundtrip_with_constants() {
1533 let mut df = DataFrame::new(crate::column_frame! {
1534 "x" => ["a", "b"]
1535 });
1536 df.insert_constant("const_key".into(), DataValue::String("const_val".into()));
1537 df.insert_constant("const_flag".into(), DataValue::Bool(false));
1538
1539 let bytes = df.store_into_messagepack().expect("failed to serialize");
1540 let restored = DataFrame::load_from_messagepack(&bytes).expect("failed to deserialize");
1541 assert_eq!(df, restored);
1542 assert_eq!(
1543 restored.constants.get(&"const_key".into()),
1544 Some(&DataValue::String("const_val".into()))
1545 );
1546 }
1547
1548 #[rstest]
1549 fn test_messagepack_integer_type_coercion() {
1550 let df = crate::df! {
1553 "a" => [1i64, 2i64, 3i64]
1554 };
1555
1556 let bytes = df.store_into_messagepack().expect("failed to serialize");
1557 let restored = DataFrame::load_from_messagepack(&bytes).expect("failed to deserialize");
1558
1559 assert_eq!(restored.len(), 3);
1561
1562 let col = restored
1564 .select_column("a".into())
1565 .expect("column should exist");
1566 assert_ne!(
1568 col[0],
1569 DataValue::I64(1),
1570 "messagepack coerces small ints to compact types"
1571 );
1572 }
1573
1574 #[rstest]
1575 fn test_messagepack_large_i64_preserved() {
1576 let df = DataFrame::new(ColumnFrame::from(vec![hashmap! {
1578 "big".into() => DataValue::I64(i64::MIN),
1579 }]));
1580
1581 let bytes = df.store_into_messagepack().expect("failed to serialize");
1582 let restored = DataFrame::load_from_messagepack(&bytes).expect("failed to deserialize");
1583 assert_eq!(df, restored);
1584 }
1585
1586 #[rstest]
1587 fn test_messagepack_load_invalid_bytes() {
1588 let result = DataFrame::load_from_messagepack(&[0xFF, 0xFE, 0xFD, 0x00]);
1589 assert!(result.is_err());
1590 }
1591
1592 #[rstest]
1593 fn test_messagepack_load_empty_bytes() {
1594 let result = DataFrame::load_from_messagepack(&[]);
1595 assert!(result.is_err());
1596 }
1597
1598 #[rstest]
1599 fn test_messagepack_load_truncated_bytes() {
1600 let df = DataFrame::new(ColumnFrame::from(vec![
1601 hashmap! {
1602 "a".into() => DataValue::String("hello world".into()),
1603 "b".into() => DataValue::Bool(true),
1604 },
1605 hashmap! {
1606 "a".into() => DataValue::String("test".into()),
1607 "b".into() => DataValue::Bool(false),
1608 },
1609 ]));
1610 let bytes = df.store_into_messagepack().expect("failed to serialize");
1611 let truncated = &bytes[..bytes.len() / 2];
1613 let result = DataFrame::load_from_messagepack(truncated);
1614 assert!(result.is_err());
1615 }
1616
1617 #[rstest]
1618 fn test_messagepack_roundtrip_with_nested_vec_data() {
1619 let df = DataFrame::new(ColumnFrame::from(vec![hashmap! {
1620 "vec_col".into() => DataValue::Vec(vec![
1621 DataValue::String("a".into()),
1622 DataValue::String("b".into()),
1623 ]),
1624 "bytes_col".into() => DataValue::Bytes(vec![0, 1, 255]),
1625 }]));
1626
1627 let bytes = df.store_into_messagepack().expect("failed to serialize");
1628 let restored = DataFrame::load_from_messagepack(&bytes).expect("failed to deserialize");
1629 assert_eq!(df, restored);
1630 }
1631
1632 #[rstest]
1633 fn test_messagepack_roundtrip_preserves_row_count() {
1634 let df = DataFrame::new(ColumnFrame::from(vec![
1635 hashmap! { "a".into() => DataValue::String("x".into()) },
1636 hashmap! { "a".into() => DataValue::String("y".into()) },
1637 hashmap! { "a".into() => DataValue::String("z".into()) },
1638 ]));
1639
1640 let bytes = df.store_into_messagepack().expect("failed to serialize");
1641 let restored = DataFrame::load_from_messagepack(&bytes).expect("failed to deserialize");
1642 assert_eq!(restored.len(), 3);
1643 assert_eq!(restored.n_rows(), 3);
1644 assert_eq!(restored.n_columns(), 1);
1645 }
1646
1647 #[rstest]
1648 fn test_messagepack_idempotent_double_roundtrip() {
1649 let mut df = DataFrame::new(ColumnFrame::from(vec![
1651 hashmap! {
1652 "a".into() => DataValue::String("hello".into()),
1653 "b".into() => DataValue::Bool(true),
1654 },
1655 hashmap! {
1656 "a".into() => DataValue::String("world".into()),
1657 "b".into() => DataValue::Bool(false),
1658 },
1659 ]));
1660 df.add_metadata("meta".into(), DataValue::Bool(true));
1661 df.insert_constant("c".into(), DataValue::String("const".into()));
1662
1663 let bytes1 = df.store_into_messagepack().expect("first serialize");
1664 let restored1 = DataFrame::load_from_messagepack(&bytes1).expect("first deserialize");
1665 let bytes2 = restored1
1666 .store_into_messagepack()
1667 .expect("second serialize");
1668 let restored2 = DataFrame::load_from_messagepack(&bytes2).expect("second deserialize");
1669
1670 assert_eq!(df, restored2);
1671 assert_eq!(bytes1, bytes2);
1672 }
1673
1674 #[rstest]
1675 fn test_messagepack_single_byte_payload() {
1676 let result = DataFrame::load_from_messagepack(&[0x01]);
1678 assert!(result.is_err());
1679 }
1680
1681 #[rstest]
1684 fn test_hash_datavalue_public_api_accessible() {
1685 let val = DataValue::I32(42);
1687 let h = crate::hash_datavalue(&val);
1688 assert_eq!(h, crate::hash_datavalue(&DataValue::I32(42)));
1690 }
1691
1692 #[rstest]
1693 fn test_hash_datavalue_vec_length_matters() {
1694 let short = DataValue::Vec(vec![DataValue::I32(1)]);
1696 let long = DataValue::Vec(vec![DataValue::I32(1), DataValue::Null]);
1697 assert_ne!(crate::hash_datavalue(&short), crate::hash_datavalue(&long));
1698 }
1699
1700 #[rstest]
1701 fn test_hash_datavalue_map_different_keys_same_values() {
1702 let mut m1 = std::collections::HashMap::new();
1703 m1.insert("a".into(), DataValue::I32(1));
1704 let mut m2 = std::collections::HashMap::new();
1705 m2.insert("b".into(), DataValue::I32(1));
1706
1707 assert_ne!(
1708 crate::hash_datavalue(&DataValue::Map(m1)),
1709 crate::hash_datavalue(&DataValue::Map(m2))
1710 );
1711 }
1712
1713 #[rstest]
1714 fn test_hash_datavalue_empty_string_vs_empty_bytes() {
1715 let empty_str = DataValue::String("".into());
1716 let empty_bytes = DataValue::Bytes(vec![]);
1717 assert_ne!(
1718 crate::hash_datavalue(&empty_str),
1719 crate::hash_datavalue(&empty_bytes)
1720 );
1721 }
1722
1723 #[rstest]
1724 fn test_hash_datavalue_empty_vec_vs_empty_map() {
1725 let empty_vec = DataValue::Vec(vec![]);
1726 let empty_map = DataValue::Map(std::collections::HashMap::new());
1727 assert_ne!(
1728 crate::hash_datavalue(&empty_vec),
1729 crate::hash_datavalue(&empty_map)
1730 );
1731 }
1732
1733 #[rstest]
1734 fn test_hash_datavalue_i128_boundary_values() {
1735 let max = DataValue::I128(i128::MAX);
1736 let min = DataValue::I128(i128::MIN);
1737 let zero = DataValue::I128(0);
1738 let neg_one = DataValue::I128(-1);
1739
1740 let hashes: std::collections::HashSet<u64> = [&max, &min, &zero, &neg_one]
1742 .iter()
1743 .map(|v| crate::hash_datavalue(v))
1744 .collect();
1745 assert_eq!(hashes.len(), 4);
1746 }
1747
1748 #[rstest]
1749 fn test_hash_datavalue_u128_boundary_values() {
1750 let max = DataValue::U128(u128::MAX);
1751 let zero = DataValue::U128(0);
1752 let one = DataValue::U128(1);
1753 let i128_neg1 = DataValue::I128(-1);
1755
1756 assert_ne!(
1757 crate::hash_datavalue(&max),
1758 crate::hash_datavalue(&i128_neg1)
1759 );
1760 let hashes: std::collections::HashSet<u64> = [&max, &zero, &one]
1761 .iter()
1762 .map(|v| crate::hash_datavalue(v))
1763 .collect();
1764 assert_eq!(hashes.len(), 3);
1765 }
1766
1767 #[rstest]
1768 fn test_hash_datavalue_f64_special_values() {
1769 let nan1 = DataValue::F64(f64::NAN);
1771 let nan2 = DataValue::F64(f64::NAN);
1772 assert_eq!(crate::hash_datavalue(&nan1), crate::hash_datavalue(&nan2));
1773
1774 let subnormal = DataValue::F64(f64::MIN_POSITIVE / 2.0);
1776 let normal = DataValue::F64(f64::MIN_POSITIVE);
1777 assert_ne!(
1778 crate::hash_datavalue(&subnormal),
1779 crate::hash_datavalue(&normal)
1780 );
1781 }
1782
1783 #[rstest]
1784 fn test_hash_datavalue_enum_number_vs_i32_same_value() {
1785 let enum_val = DataValue::EnumNumber(42);
1787 let i32_val = DataValue::I32(42);
1788 assert_ne!(
1789 crate::hash_datavalue(&enum_val),
1790 crate::hash_datavalue(&i32_val)
1791 );
1792 }
1793
1794 #[rstest]
1795 fn get_single_column_typed_f64_from_i32() {
1796 let df = crate::df! {
1797 "a" => [1i32, 2i32, 3i32]
1798 };
1799 let key: Key = "a".into();
1800 let col = df.get_single_column_typed::<f64>(&key).unwrap();
1801 assert_eq!(col, ndarray::arr1(&[1.0f64, 2.0, 3.0]));
1802 }
1803
1804 #[rstest]
1805 fn get_single_column_typed_string() {
1806 let df = crate::df! {
1807 "name" => ["alice", "bob"]
1808 };
1809 let key: Key = "name".into();
1810 let col = df.get_single_column_typed::<String>(&key).unwrap();
1811 assert_eq!(
1812 col,
1813 ndarray::arr1(&["alice".to_string(), "bob".to_string()])
1814 );
1815 }
1816
1817 #[rstest]
1818 fn get_single_column_typed_missing_key() {
1819 let df = crate::df! {
1820 "a" => [1u64, 2u64]
1821 };
1822 let missing: Key = "z".into();
1823 assert!(df.get_single_column_typed::<u64>(&missing).is_none());
1824 }
1825
1826 #[rstest]
1827 fn get_single_column_typed_matches_untyped() {
1828 let df = crate::df! {
1829 "v" => [10u64, 20u64, 30u64]
1830 };
1831 let key: Key = "v".into();
1832 let typed = df.get_single_column_typed::<u64>(&key).unwrap();
1833 let untyped = df.get_single_column(&key).unwrap();
1834 for (t, u) in typed.iter().zip(untyped.iter()) {
1835 assert_eq!(*t, u64::extract(u));
1836 }
1837 }
1838
1839 #[rstest]
1840 fn get_single_column_typed_bool_from_i32() {
1841 let df = crate::df! {
1842 "flag" => [1i32, 0i32, 1i32, 0i32]
1843 };
1844 let key: Key = "flag".into();
1845 let col = df.get_single_column_typed::<bool>(&key).unwrap();
1846 assert_eq!(col, ndarray::arr1(&[true, false, true, false]));
1847 }
1848
1849 #[rstest]
1850 fn get_single_column_typed_i64_from_u32() {
1851 let df = crate::df! {
1852 "x" => [10u32, 20u32, 30u32]
1853 };
1854 let key: Key = "x".into();
1855 let col = df.get_single_column_typed::<i64>(&key).unwrap();
1856 assert_eq!(col, ndarray::arr1(&[10i64, 20i64, 30i64]));
1857 }
1858
1859 #[rstest]
1860 fn get_single_column_typed_f64_truncation_to_i32() {
1861 let df = crate::df! {
1862 "v" => [1.9f64, 2.1f64, 3.7f64]
1863 };
1864 let key: Key = "v".into();
1865 let col = df.get_single_column_typed::<i32>(&key).unwrap();
1866 assert_eq!(col, ndarray::arr1(&[1i32, 2i32, 3i32]));
1867 }
1868
1869 #[rstest]
1870 fn get_single_column_typed_single_element() {
1871 let df = crate::df! {
1872 "solo" => [42u64]
1873 };
1874 let key: Key = "solo".into();
1875 let col = df.get_single_column_typed::<f64>(&key).unwrap();
1876 assert_eq!(col.len(), 1);
1877 assert_eq!(col[0], 42.0);
1878 }
1879
1880 #[rstest]
1881 fn select_typed_all_columns() {
1882 let df = crate::df! {
1883 "a" => [1i32, 2i32, 3i32],
1884 "b" => [4i32, 5i32, 6i32]
1885 };
1886 let result = df.select_typed::<f64>(None).unwrap();
1887 assert_eq!(result.nrows(), 3);
1888 assert_eq!(result.ncols(), 2);
1889 assert_eq!(result[[0, 0]], 1.0);
1890 assert_eq!(result[[0, 1]], 4.0);
1891 assert_eq!(result[[2, 0]], 3.0);
1892 assert_eq!(result[[2, 1]], 6.0);
1893 }
1894
1895 #[rstest]
1896 fn select_typed_specific_keys() {
1897 let df = crate::df! {
1898 "x" => [10u64, 20u64],
1899 "y" => [30u64, 40u64],
1900 "z" => [50u64, 60u64]
1901 };
1902 let keys: Vec<Key> = vec!["x".into(), "z".into()];
1903 let result = df.select_typed::<i64>(Some(&keys)).unwrap();
1904 assert_eq!(result.nrows(), 2);
1905 assert_eq!(result.ncols(), 2);
1906 assert_eq!(result[[0, 0]], 10i64);
1907 assert_eq!(result[[0, 1]], 50i64);
1908 assert_eq!(result[[1, 0]], 20i64);
1909 assert_eq!(result[[1, 1]], 60i64);
1910 }
1911
1912 #[rstest]
1913 fn select_typed_nonexistent_key_gives_empty() {
1914 let df = crate::df! {
1915 "a" => [1i32, 2i32]
1916 };
1917 let keys: Vec<Key> = vec!["missing".into()];
1918 let result = df.select_typed::<f64>(Some(&keys)).unwrap();
1919 assert_eq!(result.shape(), &[0, 0]);
1920 }
1921
1922 #[rstest]
1923 fn select_typed_matches_select_with_extract() {
1924 let df = crate::df! {
1925 "a" => [1u64, 2u64, 3u64],
1926 "b" => [4u64, 5u64, 6u64]
1927 };
1928 let typed = df.select_typed::<f64>(None).unwrap();
1929 let manual = df.select(None).unwrap().mapv(|v| f64::extract(&v));
1930 assert_eq!(typed, manual);
1931 }
1932
1933 #[rstest]
1934 fn select_typed_string_values() {
1935 let df = crate::df! {
1936 "name" => ["alice", "bob", "carol"]
1937 };
1938 let result = df.select_typed::<String>(None).unwrap();
1939 assert_eq!(result[[0, 0]], "alice");
1940 assert_eq!(result[[1, 0]], "bob");
1941 assert_eq!(result[[2, 0]], "carol");
1942 }
1943
1944 #[rstest]
1945 fn select_typed_cross_numeric_coercion() {
1946 let df = crate::df! {
1948 "a" => [1i32, 2i32, 3i32]
1949 };
1950 let result = df.select_typed::<u64>(None).unwrap();
1951 assert_eq!(result[[0, 0]], 1u64);
1952 assert_eq!(result[[1, 0]], 2u64);
1953 assert_eq!(result[[2, 0]], 3u64);
1954 }
1955}