1use column_store::sorted_df::SortedDataFrame;
2use data_value::{DataValue, Extract};
3use halfbrown::HashMap;
4use ndarray::{Array1, Array2};
5use std::fmt;
6pub mod column_store;
8pub mod index;
10pub mod join;
12pub mod key;
14use crate::{error::Error, CandidateData};
15#[cfg(feature = "python")]
16pub mod python;
17
18#[cfg(feature = "python")]
19use pyo3::prelude::*;
20
21use crate::{
22 dataframe::{
23 column_store::typed_array::TypedDataArray, column_store::ColumnFrame,
24 column_store::MaybeView, join::JoinRelation, key::Key,
25 },
26 MLChefMap,
27};
28
/// Picks a number of entries from one end of an ordered sequence.
///
/// NOTE(review): the code that consumes this enum is not visible in this part
/// of the file; the variant docs below follow the variant names only.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum TopN {
    /// Take the given number of entries counted from the start.
    First(usize),
    /// Take the given number of entries counted from the end.
    Last(usize),
}
40
41#[derive(Debug, Clone, PartialEq, Default, serde::Serialize, serde::Deserialize)]
67#[cfg_attr(feature = "python", pyclass)]
68pub struct DataFrame {
69 pub constants: HashMap<Key, DataValue>,
73 pub dataframe: ColumnFrame,
75 pub metadata: HashMap<String, DataValue>,
77}
78
79impl fmt::Display for DataFrame {
80 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
81 self.dataframe.fmt(f)
82 }
83}
84
85impl DataFrame {
86 pub fn new<C: Into<ColumnFrame>>(dataframe: C) -> Self {
101 Self {
102 constants: HashMap::new(),
103 dataframe: dataframe.into(),
104 metadata: HashMap::new(),
105 }
106 }
107
108 pub fn n_columns(&self) -> usize {
110 self.dataframe.ncolumns()
111 }
112
113 pub fn n_rows(&self) -> usize {
115 self.dataframe.nrows()
116 }
117
118 pub fn shrink(&mut self) {
121 self.dataframe.shrink();
122 }
123
124 pub fn add_metadata(&mut self, key: String, value: DataValue) {
129 self.metadata.insert(key, value);
130 }
131
132 pub fn get_metadata(&self, key: &str) -> Option<&DataValue> {
134 self.metadata.get(key)
135 }
136
137 pub fn join(&mut self, other: Self, join_type: &JoinRelation) -> Result<(), Error> {
149 for (key, value) in other.constants {
150 self.constants.insert(key, value);
151 }
152 self.dataframe.join(other.dataframe, join_type)
153 }
154
155 pub fn apply_function<F>(&mut self, keys: &[Key], mut func: F) -> Result<(), Error>
160 where
161 F: FnMut(&[Key], &mut ColumnFrame) -> Result<(), Error>,
162 {
163 self.dataframe.apply_function(keys, &mut func)
164 }
165
166 pub fn select(&self, keys: Option<&[Key]>) -> Result<Array2<DataValue>, Error> {
181 Ok(self.dataframe.select(keys))
182 }
183
184 pub fn select_typed<T: Extract + Clone>(
211 &self,
212 keys: Option<&[Key]>,
213 ) -> Result<Array2<T>, Error> {
214 Ok(self.dataframe.select_typed(keys))
215 }
216
217 pub fn select_view(&self, keys: Option<&[Key]>) -> Result<MaybeView<'_>, Error> {
245 self.dataframe.select_view(keys)
246 }
247
248 pub fn select_vec_view(
278 &self,
279 keys: Option<&[Key]>,
280 ) -> Result<Vec<Option<&TypedDataArray>>, Error> {
281 self.dataframe.select_vec_view(keys)
282 }
283
284 pub fn select_transposed_typed<D: Extract>(&self, keys: &[Key]) -> Vec<Vec<D>> {
290 self.dataframe.select_transposed_typed::<D>(keys)
291 }
292
293 pub fn select_column(&self, key: Key) -> Option<ndarray::Array1<DataValue>> {
299 #[allow(deprecated)]
300 self.dataframe.select_column(&key)
301 }
302
303 pub fn select_transposed(&self, keys: Option<&[Key]>) -> Result<Array2<DataValue>, Error> {
309 self.dataframe.select_transposed(keys)
310 }
311
312 pub fn insert_constant(&mut self, key: Key, value: DataValue) {
318 self.constants.insert(key, value);
319 }
320
321 pub fn push<C: CandidateData>(&mut self, item: C) -> Result<(), Error> {
327 self.dataframe.push(item)
328 }
329
330 pub fn remove_column(&mut self, keys: &[Key]) -> Result<Self, Error> {
333 self.dataframe.remove_column(keys).map(|x| x.into())
334 }
335
336 pub fn extend(&mut self, items: Self) -> Result<(), Error> {
341 self.dataframe.extend(items.dataframe)
342 }
343
344 pub fn len(&self) -> usize {
346 self.dataframe.nrows()
347 }
348
349 pub fn is_empty(&self) -> bool {
351 self.dataframe.is_empty()
352 }
353
354 pub fn add_single_column<K, V>(&mut self, key: K, values: V) -> Result<(), Error>
364 where
365 K: Into<Key>,
366 V: Into<TypedDataArray>,
367 {
368 self.dataframe.add_single_column(key, values)
369 }
370
371 pub fn get_column(&self, key: &Key) -> Option<&TypedDataArray> {
380 self.dataframe.get_column(key).ok()
381 }
382
383 pub fn get_single_column(&self, key: &Key) -> Option<Array1<DataValue>> {
389 #[allow(deprecated)]
390 self.dataframe.get_single_column(key)
391 }
392
393 pub fn get_single_column_typed<T: Extract>(&self, key: &Key) -> Option<Array1<T>> {
420 self.dataframe.get_single_column_typed(key)
421 }
422
423 pub fn sorted(&self, key: &Key) -> Result<SortedDataFrame<'_>, Error> {
428 self.dataframe.sorted(key)
429 }
430
431 pub fn filter(&self, filter: &crate::filter::FilterRules) -> Result<Self, Error> {
438 let filtered_df = self.dataframe.filter(filter)?;
439 Ok(Self {
440 constants: self.constants.clone(),
441 dataframe: filtered_df,
442 metadata: self.metadata.clone(),
443 })
444 }
445
446 #[cfg(feature = "polars-df")]
451 #[allow(deprecated)]
452 pub fn as_polars(&self) -> Result<polars::prelude::DataFrame, Error> {
453 let mut columns = vec![];
454 for key in self.dataframe.keys() {
455 let values = self
456 .dataframe
457 .get_single_column(key)
458 .ok_or_else(|| Error::NotFound(key.clone()))?
459 .into_iter()
460 .map(|x| into_polars_value(key, x.clone()))
461 .collect::<Vec<_>>();
462 let s = polars::prelude::Column::new(key.name().into(), values);
463
464 columns.push(s);
465 }
466
467 Ok(polars::prelude::DataFrame::new(columns)?)
468 }
469
470 pub fn load_from_messagepack(bytes: &[u8]) -> Result<Self, Error> {
475 rmp_serde::decode::from_slice(bytes).map_err(|e| Error::UnknownError(format!("{e:?}")))
476 }
477
478 pub fn store_into_messagepack(&self) -> Result<Vec<u8>, Error> {
483 rmp_serde::encode::to_vec(&self).map_err(|e| Error::UnknownError(format!("{e:?}")))
484 }
485}
486
/// Maps a crate-level `DataType` onto the corresponding polars data type.
///
/// `Vec` becomes a list with an unknown element type, `Map` becomes a struct
/// with no fields, and `Unknown` becomes polars' `Null`.
#[cfg(feature = "polars-df")]
pub fn polars_dtype(dtype: crate::DataType) -> polars::prelude::DataType {
    // Aliases keep both enums apart; several variant names (`String`,
    // `Unknown`) exist on both sides.
    use crate::DataType as Src;
    use polars::prelude::DataType as Dst;

    match dtype {
        Src::Bool => Dst::Boolean,
        Src::U32 => Dst::UInt32,
        Src::I32 => Dst::Int32,
        Src::U8 => Dst::UInt8,
        Src::U64 => Dst::UInt64,
        Src::I64 => Dst::Int64,
        Src::F32 => Dst::Float32,
        Src::F64 => Dst::Float64,
        Src::U128 => Dst::UInt128,
        Src::I128 => Dst::Int128,
        Src::String => Dst::String,
        Src::Bytes => Dst::Binary,
        Src::Unknown => Dst::Null,
        Src::Vec => Dst::List(Box::new(Dst::Unknown(polars::prelude::UnknownKind::Any))),
        Src::Map => Dst::Struct(vec![]),
    }
}
514
/// Converts a [`DataValue`] into an owned polars `AnyValue`.
///
/// The value is first passed through `convert_dv_to_dtype` with `key`, then
/// mapped variant by variant:
///
/// * scalars map to the polars variant of the same width (`U8` -> `UInt8`),
/// * `EnumNumber` maps to `Int32`,
/// * `Vec` becomes a `List` series named after `key`; the element key uses the
///   first non-`Unknown` dtype detected among the elements,
/// * `Map` becomes a `StructOwned` whose fields are ordered by key name.
///
/// # Panics
///
/// Panics when polars cannot build a series from the elements of a `Vec` value.
#[cfg(feature = "polars-df")]
pub fn into_polars_value(key: &Key, dv: DataValue) -> polars::prelude::AnyValue<'static> {
    use polars::prelude::AnyValue::*;
    use polars::prelude::Field;

    use crate::dataframe::column_store::convert_dv_to_dtype;
    let dv = convert_dv_to_dtype(key, dv);
    match dv {
        DataValue::String(smart_string) => StringOwned(smart_string.as_str().into()),
        DataValue::Bytes(items) => BinaryOwned(items),
        // Keep the 8-bit width so the value agrees with `polars_dtype` and
        // round-trips through `from_polars_value`.
        DataValue::U8(x) => UInt8(x),
        DataValue::Bool(x) => Boolean(x),
        DataValue::I32(x) => Int32(x),
        DataValue::U32(x) => UInt32(x),
        DataValue::I64(x) => Int64(x),
        DataValue::U64(x) => UInt64(x),
        DataValue::I128(x) => Int128(x),
        DataValue::F32(x) => Float32(x),
        DataValue::F64(x) => Float64(x),
        DataValue::Null => Null,
        DataValue::Vec(data_values) => {
            // First element with a detectable dtype decides the element key.
            let element_dtype = data_values
                .iter()
                .map(|d| crate::detect_dtype(d))
                .find(|dt| !matches!(dt, crate::DataType::Unknown))
                .unwrap_or(crate::DataType::Unknown);
            let vec_key = Key::new(key.name(), element_dtype);
            let elements = data_values
                .into_iter()
                .map(|x| into_polars_value(&vec_key, x))
                .collect::<Vec<_>>();
            let series =
                polars::series::Series::from_any_values(key.name().into(), &elements, true);
            // Message is only built on failure.
            List(series.unwrap_or_else(|e| panic!("Cannot create series for {key:?}: {e:?}")))
        }
        DataValue::EnumNumber(x) => Int32(x),
        DataValue::U128(x) => UInt128(x),
        DataValue::Map(x) => {
            // Consume the map and order entries by name for a deterministic
            // field layout; no second lookup and no value clone needed.
            let mut entries = x.into_iter().collect::<Vec<_>>();
            entries.sort_unstable_by(|(a, _), (b, _)| a.cmp(b));

            let mut values = Vec::with_capacity(entries.len());
            let mut fields = Vec::with_capacity(entries.len());
            for (name, value) in entries {
                let dtype = crate::detect_dtype(&value);
                let field_key = Key::new(&name, dtype);
                values.push(into_polars_value(&field_key, value));
                fields.push(Field::new(field_key.name().into(), polars_dtype(dtype)));
            }
            StructOwned(Box::new((values, fields)))
        }
    }
}
579
/// Converts a polars `AnyValue` into a [`DataValue`].
///
/// Narrow integers are widened (`UInt16` -> `U32`, `Int8`/`Int16` -> 32-bit),
/// lists are converted element by element, and owned structs become a map
/// keyed by field name. Any variant without an explicit arm is logged with
/// `tracing::warn!` and converted to `DataValue::Null`.
#[cfg(feature = "polars-df")]
pub fn from_polars_value(dv: polars::prelude::AnyValue<'_>) -> DataValue {
    use polars::prelude::AnyValue::*;
    match dv {
        Null => DataValue::Null,
        Boolean(v) => v.into(),
        String(v) => DataValue::String(v.into()),
        UInt8(v) => DataValue::U8(v),
        UInt16(v) => DataValue::U32(v as u32),
        UInt32(v) => v.into(),
        UInt64(v) => v.into(),
        Int8(v) => (v as i32).into(),
        Int16(v) => (v as i32).into(),
        Int32(v) => v.into(),
        Int64(v) => v.into(),
        Float32(v) => v.into(),
        Float64(v) => v.into(),
        Int128(v) => v.into(),
        // Counterpart of the `U128 -> UInt128` arm in `into_polars_value`;
        // without it 128-bit unsigned values silently turned into `Null`.
        UInt128(v) => DataValue::U128(v),
        List(series) => DataValue::Vec(series.iter().map(from_polars_value).collect::<Vec<_>>()),
        StringOwned(v) => DataValue::String(v.as_str().into()),
        Binary(v) => DataValue::Bytes(v.to_owned()),
        BinaryOwned(v) => DataValue::Bytes(v),
        StructOwned(m) => {
            let (values, fields) = *m;
            // Pair every field with its value; a repeated field name keeps the last value.
            let hm: std::collections::HashMap<smartstring::alias::String, DataValue> = fields
                .into_iter()
                .zip(values)
                .map(|(field, value)| (field.name.as_str().into(), from_polars_value(value)))
                .collect();
            DataValue::Map(hm)
        }
        e => {
            tracing::warn!("Unsupported polars value: {e:?}");
            DataValue::Null
        }
    }
}
622
623impl From<ColumnFrame> for DataFrame {
624 fn from(dataframe: ColumnFrame) -> Self {
625 Self::new(dataframe)
626 }
627}
628
629impl From<Vec<std::collections::HashMap<Key, DataValue>>> for DataFrame {
630 fn from(dataframe: Vec<std::collections::HashMap<Key, DataValue>>) -> Self {
631 Self::new(ColumnFrame::from(dataframe))
632 }
633}
634
635impl From<Vec<HashMap<Key, DataValue>>> for DataFrame {
636 fn from(dataframe: Vec<HashMap<Key, DataValue>>) -> Self {
637 Self::new(ColumnFrame::from(dataframe))
638 }
639}
640
641impl From<std::collections::HashMap<String, Vec<DataValue>>> for DataFrame {
642 fn from(dataframe: std::collections::HashMap<String, Vec<DataValue>>) -> Self {
643 Self::new(ColumnFrame::from(dataframe))
644 }
645}
646
647impl From<MLChefMap> for DataFrame {
648 fn from(dataframe: MLChefMap) -> Self {
649 Self::new(ColumnFrame::from(dataframe))
650 }
651}
652impl From<Vec<(Key, Vec<DataValue>)>> for DataFrame {
653 fn from(dataframe: Vec<(Key, Vec<DataValue>)>) -> Self {
654 Self::new(ColumnFrame::from(dataframe))
655 }
656}
657
658impl From<std::collections::HashMap<String, Array1<DataValue>>> for DataFrame {
659 fn from(dataframe: std::collections::HashMap<String, Array1<DataValue>>) -> Self {
660 Self::new(ColumnFrame::from(dataframe))
661 }
662}
663
/// Builds a frame from a polars `DataFrame` via the corresponding
/// `ColumnFrame` conversion.
#[cfg(feature = "polars-df")]
impl From<polars::prelude::DataFrame> for DataFrame {
    fn from(polars_frame: polars::prelude::DataFrame) -> Self {
        let columns = ColumnFrame::from(polars_frame);
        Self::new(columns)
    }
}
670#[cfg(test)]
671#[allow(deprecated)]
672mod test {
673 use crate::filter::FilterRules;
674
675 use super::*;
676 use halfbrown::hashmap;
677 #[cfg(feature = "polars-df")]
678 use polars::prelude::NamedFrom as _;
679 use rstest::*;
680 use tracing_test::traced_test;
681 #[fixture]
682 fn dummy_candidates() -> ColumnFrame {
683 ColumnFrame::from(vec![
684 hashmap! {
685 "key1".into() => 1.into(),
686 "key2".into() => "a".into(),
687 },
688 hashmap! {
689 "key1".into() => 2.into(),
690 "key2".into() => "b".into(),
691 },
692 ])
693 }
694
695 #[rstest]
696 fn test_serde() {
697 let df = crate::df! {
698 "a" => [1u64, 2u64, 3u64],
699 "b" => [4u64, 5u64, 6u64],
700 "c" => [7u64, 8u64, 9u64]
701 };
702
703 let serialized = serde_json::to_string(&df).expect("BUG: Unable to serialize dataframe");
704
705 let deserialized =
706 serde_json::from_str(&serialized).expect("BUG: Unable to deserialize dataframe");
707
708 assert_eq!(df, deserialized);
709 }
710
711 #[cfg(feature = "polars-df")]
712 #[rstest]
713 fn test_polars() {
714 let expected = crate::df! {
715 "a" => [1u64, 2u64, 3u64],
716 "b" => [4f64, 5f64, 6f64],
717 "c" => [7i64, 8i64, 9i64]
718 };
719
720 let polars_df = polars::df!(
721 "a" => [1u64, 2u64, 3u64],
722 "b" => [4f64, 5f64, 6f64],
723 "c" => [7i64, 8i64, 9i64]
724 )
725 .expect("BUG: should be ok");
726 let as_df: DataFrame = polars_df.into();
727 let keys: Vec<Key> = vec!["a".into(), "b".into(), "c".into()];
728 assert_eq!(
729 as_df.select(Some(keys.as_slice())),
730 expected.select(Some(keys.as_slice()))
731 );
732 }
733 #[cfg(feature = "polars-df")]
734 use crate::DataType;
735 #[cfg(feature = "polars-df")]
736 #[rstest]
737 #[case::str(Key::new("a", DataType::String), DataValue::String("test".into()), polars::prelude::AnyValue::String("test".into()))]
738 #[case::u32(
739 Key::new("a", DataType::U32),
740 DataValue::U32(u32::MAX),
741 polars::prelude::AnyValue::UInt32(u32::MAX)
742 )]
743 #[case::i32(
744 Key::new("a", DataType::I32),
745 DataValue::I32(i32::MIN),
746 polars::prelude::AnyValue::Int32(i32::MIN)
747 )]
748 #[case::i64(
749 Key::new("a", DataType::I64),
750 DataValue::I64(i64::MIN),
751 polars::prelude::AnyValue::Int64(i64::MIN)
752 )]
753 #[case::u64(
754 Key::new("a", DataType::U64),
755 DataValue::U64(u64::MIN),
756 polars::prelude::AnyValue::UInt64(u64::MIN)
757 )]
758 #[case::f32(
759 Key::new("a", DataType::F32),
760 DataValue::F32(f32::MIN),
761 polars::prelude::AnyValue::Float32(f32::MIN)
762 )]
763 #[case::f64(
764 Key::new("a", DataType::F64),
765 DataValue::F64(f64::MIN),
766 polars::prelude::AnyValue::Float64(f64::MIN)
767 )]
768 #[case::null(
769 Key::new("a", DataType::Unknown),
770 DataValue::Null,
771 polars::prelude::AnyValue::Null
772 )]
773 #[case::i128(
774 Key::new("a", DataType::I128),
775 DataValue::I128(i128::MIN),
776 polars::prelude::AnyValue::Int128(i128::MIN)
777 )]
778 #[case::u8(
779 Key::new("a", DataType::U8),
780 DataValue::U8(255),
781 polars::prelude::AnyValue::UInt8(255)
782 )]
783 #[case::bool(
784 Key::new("a", DataType::Bool),
785 DataValue::Bool(true),
786 polars::prelude::AnyValue::Boolean(true)
787 )]
788 #[case::bytes(Key::new("a", DataType::Bytes), DataValue::Bytes("aaaaa".as_bytes().to_vec()), polars::prelude::AnyValue::BinaryOwned("aaaaa".as_bytes().to_vec()))]
789 #[case::vec_uints(Key::new("a", DataType::Vec), DataValue::Vec(vec![DataValue::U32(0), DataValue::U32(1)]), polars::prelude::AnyValue::List(polars::series::Series::new("v".into(), vec![polars::prelude::AnyValue::UInt32(0u32), polars::prelude::AnyValue::UInt32(1)])))]
790 #[case::map(Key::new("a", DataType::Map), DataValue::Map(data_value::stdhashmap!("a" => 0u64, "b" => "s")), polars::prelude::AnyValue::StructOwned(Box::new((
791 vec![polars::prelude::AnyValue::UInt64(0u64), polars::prelude::AnyValue::String("s".into())],
792 vec![polars::prelude::Field::new("a".into(), polars::prelude::DataType::UInt64), polars::prelude::Field::new("b".into(), polars::prelude::DataType::String)]))))]
793 fn into_polars_value_test(
797 #[case] key: Key,
798 #[case] input: DataValue,
799 #[case] output: polars::prelude::AnyValue<'static>,
800 ) {
801 assert_eq!(into_polars_value(&key, input.clone()), output);
802 assert_eq!(from_polars_value(output), input);
803 }
804
805 #[rstest]
817 #[case(
818 DataFrame::new(crate::column_frame! {
819 "a" => [1f64, 2f64, 3f64],
820 "b" => [4i64, 5i64, 6i64],
821 "c" => [7i64, 8i64, 9i64]
822 }),
823 DataFrame::new(crate::column_frame! {
824 "a" => [1f64, 2f64],
825 "b" => [4i64, 5i64],
826 "c" => [7i64, 8i64]
827 }),
828 FilterRules::try_from("a >= 1f64 && (b <= 5 || c <= 8) && b >= 4").expect("BUG: cannot create filter rules"),
829 )]
830 #[case(
831 DataFrame::new(crate::column_frame! {
832 "a" => [1f64, 2f64, 3f64],
833 "b" => [4i64, 5i64, 6i64],
834 "c" => [7i64, 8i64, 9i64]
835 }),
836 DataFrame::new(crate::column_frame! {
837 "a" => [2f64],
838 "b" => [5i64],
839 "c" => [8i64]
840 }),
841 FilterRules::try_from("a % 2f64 == 0f64").expect("BUG: cannot create filter rules"),
842 )]
843 #[traced_test]
844 fn filter_test(
845 #[case] df: DataFrame,
846 #[case] expected: DataFrame,
847 #[case] filter: FilterRules,
848 ) {
849 let filtered = df.filter(&filter).expect("BUG: cannot filter");
850 assert_eq!(filtered, expected);
851 }
852
853 #[rstest]
854 fn test_serde_complex() {
855 let simple = r#"
856{
857 "constants": {},
858 "dataframe": {
859 "index": {
860 "keys": [
861 {
862 "key": 3162770485,
863 "name": "a",
864 "ctype": "U32"
865 },
866 {
867 "key": 2279056742,
868 "name": "b",
869 "ctype": "F64"
870 },
871 {
872 "key": 2994984227,
873 "name": "c",
874 "ctype": "U64"
875 },
876 {
877 "key": 3319645144,
878 "name": "d",
879 "ctype": "F64"
880 },
881 {
882 "key": 1291847470,
883 "name": "e",
884 "ctype": "U32"
885 },
886 {
887 "key": 874241070,
888 "name": "f",
889 "ctype": "Bool"
890 }
891 ],
892 "indexes": {
893 "a": 0,
894 "b": 1,
895 "c": 2,
896 "d": 3,
897 "e": 4,
898 "f": 5
899 },
900 "alias": {}
901 },
902 "data_frame": {
903 "v": 1,
904 "dim": [
905 2,
906 6
907 ],
908 "data": [
909 253780,
910 0.009369421750307085,
911 1633222860381359,
912 8,
913 5,
914 true,
915 64512,
916 0.003391335718333721,
917 1633222860810557,
918 8,
919 5,
920 null
921 ]
922 }
923 },
924 "metadata": {}
925}
926 "#;
927
928 let simple_deserialized: DataFrame =
929 serde_json::from_str(simple).expect("BUG: Unable to deserialize dataframe");
930
931 println!("deserialized: {simple_deserialized:?}");
932 let array = format!("[{}, {}, {}]", simple, simple, simple);
933 let deserialized: Vec<DataFrame> =
934 serde_json::from_str(&array).expect("BUG: Unable to deserialize dataframe");
935
936 println!("deserialized: {deserialized:?}");
937 assert_eq!(deserialized.len(), 3);
938 assert_eq!(simple_deserialized, deserialized[0]);
939 }
940
941 #[rstest]
942 #[case(hashmap!("key1".into() => vec![1.into(), 2.into()], "key2".into() => vec!["a".into()]))]
943 #[case(data_value::stdhashmap!("key1" => vec![1, 2], "key2" => vec!["a"]))]
944 #[case(vec![hashmap! {
945 "key1".into() => 1.into(),
946 "key2".into() => "a".into(),
947 },
948 hashmap! {
949 "key1".into() => 2.into(),
950 },])]
951 #[case(vec![data_value::stdhashmap! {
952 "key1" => DataValue::from(1),
953 "key2" => DataValue::from("a"),
954 },data_value::stdhashmap! {
955 "key1" => DataValue::from(2),
956 },])]
957 #[case(vec![("key1".into(), vec! [DataValue::from(1), DataValue::from(2)]), ("key2".into(),
958 vec![DataValue::from("a"), DataValue::Null])])]
959 fn test_select_column<T: Into<DataFrame>>(#[case] input: T) {
960 let df: DataFrame = input.into();
961 assert_eq!(
962 df,
963 DataFrame {
964 constants: HashMap::new(),
965 dataframe: ColumnFrame::from(vec![
966 hashmap! {
967 "key1".into() => 1.into(),
968 "key2".into() => "a".into(),
969 },
970 hashmap! {
971 "key1".into() => 2.into(),
972 },
973 ]),
974 metadata: HashMap::new(),
975 }
976 );
977 let selected_transposed = df.select_column("key1".into());
978 assert!(selected_transposed.is_some());
979 let selected_transposed = selected_transposed.unwrap();
980 assert_eq!(selected_transposed.len(), 2);
981 assert_eq!(selected_transposed, ndarray::array![1.into(), 2.into()]);
982 }
983
984 #[rstest]
985 #[case::hhm(hashmap!("key1".into() => vec![1.into(), 2.into()], "key2".into() => vec!["a".into()]))]
986 #[case::stdhm(data_value::stdhashmap!("key1" => vec![1, 2], "key2" => vec!["a"]))]
987 #[case::hm({
988 let hm: std::collections::HashMap<String, Array1<DataValue>> = data_value::stdhashmap!("key1".to_string() => Array1::from_vec(vec![DataValue::from(1), DataValue::from(2)]), "key2".to_string() => Array1::from_vec(vec![DataValue::from("a"), DataValue::Null]));
989 hm
990 })]
991 #[case::vec_hhm(vec![hashmap! {
992 "key1".into() => 1.into(),
993 "key2".into() => "a".into(),
994 },
995 hashmap! {
996 "key1".into() => 2.into(),
997 },])]
998 #[case::vec_hme(vec![data_value::stdhashmap! {
999 "key1" => DataValue::from(1),
1000 "key2" => DataValue::from("a"),
1001 },data_value::stdhashmap! {
1002 "key1" => DataValue::from(2),
1003 },])]
1004 #[case::vec_vec(vec![("key1".into(), vec! [DataValue::from(1), DataValue::from(2)]), ("key2".into(), vec![DataValue::from("a"), DataValue::Null])])]
1005 fn test_from_conversion<T: Into<DataFrame>>(#[case] input: T) {
1006 let df: DataFrame = input.into();
1007 let expected: DataFrame = DataFrame {
1008 constants: HashMap::new(),
1009 dataframe: ColumnFrame::from(vec![
1010 hashmap! {
1011 "key1".into() => 1.into(),
1012 "key2".into() => "a".into(),
1013 },
1014 hashmap! {
1015 "key1".into() => 2.into(),
1016 },
1017 ]),
1018 metadata: HashMap::new(),
1019 };
1020 assert_eq!(
1021 df.select(Some(&["key1".into(), "key2".into()])),
1022 expected.select(Some(&["key1".into(), "key2".into()])),
1023 "{df} vs {expected}"
1024 );
1025 let selected_transposed = df.select_transposed_typed::<i32>(&["key1".into()]);
1026 assert_eq!(selected_transposed.len(), 2);
1027 println!("{:?}", selected_transposed);
1028 assert_eq!(selected_transposed, vec![vec![1], vec![2]]);
1029 }
1030 #[rstest]
1031 fn test_dataframe(dummy_candidates: ColumnFrame) {
1032 let mut dataframe: DataFrame = DataFrame::default();
1033 assert!(dataframe.is_empty());
1034 assert!(dataframe.extend(dummy_candidates.into()).is_ok());
1035 assert_eq!(dataframe.len(), 2);
1036
1037 let candidate = hashmap! {
1038 "key1".into() => 3.into(),
1039 "key2".into() => "c".into(),
1040 };
1041
1042 assert!(dataframe.push(candidate).is_ok());
1043 assert_eq!(dataframe.len(), 3);
1044 assert!(!dataframe.is_empty());
1045
1046 dataframe.insert_constant("key3".into(), 4.into());
1047 assert_eq!(dataframe.constants.len(), 1);
1048 assert!(dataframe
1049 .apply_function(&["key1".into()], |keys, df| {
1050 let key = keys[0].clone();
1051 let s = df
1052 .get_single_column(&key)
1053 .expect("BUG: Cannot get column")
1054 .to_owned();
1055 let s = s.mapv(|x| x + DataValue::from(1));
1056 df.add_single_column("key5", s)?;
1057 Ok(())
1058 })
1059 .is_ok());
1060 let original = dataframe.clone();
1061 dataframe.shrink();
1062 let remove_df = dataframe.remove_column(&["key1".into()]);
1063 assert!(remove_df.is_ok());
1064 let mut remove_df = remove_df.unwrap();
1065 assert_eq!(remove_df.len(), 3);
1066 let selected = dataframe.select(Some(&["key2".into()]));
1067 assert!(selected.is_ok());
1068 let selected = selected.unwrap();
1069 println!("{:?}", selected);
1070
1071 let joined_result =
1073 remove_df.join(dataframe, &JoinRelation::new(crate::JoinBy::AddColumns));
1074 assert!(joined_result.is_ok(), "{:?}", joined_result);
1075 let keys = vec!["key1".into(), "key2".into(), "key5".into()];
1076 assert_eq!(
1077 original.select(Some(keys.as_slice())),
1078 remove_df.select(Some(keys.as_slice()))
1079 );
1080 }
1081
1082 #[rstest]
1083 fn test_size_methods() {
1084 let candidate = hashmap! {
1085 "key1".into() => 3.into(),
1086 "key2".into() => "c".into(),
1087 "key3".into() => false.into()
1088 };
1089
1090 let dataframe: DataFrame = vec![candidate].into();
1091
1092 assert_eq!(dataframe.n_columns(), 3);
1093 assert_eq!(dataframe.n_rows(), 1);
1094 }
1095
1096 #[rstest]
1097 fn test_metadata(dummy_candidates: ColumnFrame) {
1098 let mut dataframe: DataFrame = DataFrame::default();
1099 assert!(dataframe.is_empty());
1100 println!("{:?}", dataframe);
1101 assert!(dataframe.extend(dummy_candidates.into()).is_ok());
1102 println!("{:?}", dataframe);
1103 assert_eq!(dataframe.len(), 2);
1104
1105 dataframe.add_metadata("test".into(), 1.into());
1106 assert_eq!(dataframe.get_metadata("test"), Some(&1.into()));
1107 let dataframe = DataFrame::new(ColumnFrame::from(vec![
1108 hashmap! {
1109 "key1".into() => 1.into(),
1110 "key2".into() => "a".into(),
1111 },
1112 hashmap! {
1113 "key1".into() => 2.into(),
1114 "key2".into() => "b".into(),
1115 },
1116 ]));
1117 assert_eq!(dataframe.get_metadata("test"), None);
1118 let tt = dataframe.select_transposed(None);
1119 assert!(tt.is_ok());
1120 let tt = tt.unwrap();
1121 assert_eq!(tt.shape(), [2, 2]);
1122 assert_eq!(
1123 tt,
1124 Array2::from_shape_vec((2, 2), vec![1.into(), 2.into(), "a".into(), "b".into()])
1125 .unwrap()
1126 );
1127 }
1128
1129 #[rstest]
1130 #[traced_test]
1131 fn add_single_column_test() {
1132 let mut dataframe = DataFrame::default();
1133 let values: Array1<DataValue> = Array1::from_vec(vec![1.into(), 2.into(), 3.into()]);
1134 let r = dataframe.add_single_column("key1", values);
1135 assert!(r.is_ok(), "{r:?}");
1136 let selected = dataframe.select(None);
1137 assert!(selected.is_ok());
1138 let selected = selected.unwrap();
1139 assert_eq!(selected.shape(), [3, 1]);
1140 assert_eq!(
1141 selected,
1142 Array2::from_shape_vec((3, 1), vec![1.into(), 2.into(), 3.into()]).unwrap()
1143 );
1144 let values: Array1<i32> = Array1::from_vec(vec![1, 2]);
1145 assert!(dataframe.add_single_column("key1", values).is_err());
1146 let values: Vec<i32> = vec![3i32, 4, 5];
1147 assert!(dataframe.add_single_column("key2", values).is_ok());
1148 let values: Array1<i32> = Array1::from_vec(vec![3i32]);
1149 assert!(dataframe.add_single_column("key3", values).is_err());
1150 }
1151
1152 #[rstest]
1153 #[traced_test]
1154 fn add_single_column_empty_test() {
1155 let mut dataframe = DataFrame::default();
1156 let values: Array1<DataValue> = Array1::from(vec![]);
1157 let r = dataframe.add_single_column("key1", values);
1158 assert!(r.is_ok(), "{r:?}");
1159 let selected = dataframe.select(None);
1160 assert!(selected.is_ok());
1161 let selected = selected.unwrap();
1162 assert_eq!(selected.shape(), [0, 1]);
1163 assert_eq!(selected, Array2::from_shape_vec((0, 1), vec![]).unwrap());
1164 let values: Array1<DataValue> = Array1::from(vec![1.into(), 2.into()]);
1165 assert!(dataframe.add_single_column("key1", values).is_err());
1166 let values: Array1<DataValue> = Array1::from(vec![3.into(), 4.into(), 5.into()]);
1167 assert!(dataframe.add_single_column("key2", values).is_ok());
1168 let values: Array1<DataValue> = Array1::from(vec![3.into(), 4.into()]);
1169 assert!(dataframe.add_single_column("key3", values).is_err());
1170 let values: Array1<DataValue> = Array1::from(vec![3.into(), 4.into(), 5.into()]);
1171 assert!(dataframe.add_single_column("key3", values).is_ok());
1172
1173 assert_eq!(
1174 dataframe
1175 .select_column("key1".into())
1176 .expect("BUG: has to exists"),
1177 ndarray::arr1(&[DataValue::Null, DataValue::Null, DataValue::Null]),
1178 );
1179 assert_eq!(
1180 dataframe
1181 .select_column("key2".into())
1182 .expect("BUG: has to exists"),
1183 ndarray::arr1(&[3.into(), 4.into(), 5.into()]),
1184 );
1185 assert_eq!(
1186 dataframe.select(None).expect("BUG: cannot get data"),
1187 ndarray::arr2(&[
1188 [DataValue::Null, 3.into(), 3.into()],
1189 [DataValue::Null, 4.into(), 4.into()],
1190 [DataValue::Null, 5.into(), 5.into()],
1191 ])
1192 );
1193 }
1194
1195 #[rstest]
1196 #[case(
1197 DataFrame::new(ColumnFrame::from(vec![
1198 hashmap! {
1199 "k".into() => 1.into(),
1200 "k2".into() => 2.into(),
1201 "k3".into() => 2.2.into(),
1202 },
1203 hashmap! {
1204 "k".into() => 11.into(),
1205 "k2".into() => 3.into(),
1206 },
1207 hashmap! {
1208 "k".into() => 4.into(),
1209 "k2".into() => 5.into(),
1210 "k3".into() => 2.3.into(),
1211 },
1212 hashmap! {
1213 "k".into() => 4.into(),
1214 "k2".into() => 5.into(),
1215 "k3".into() => 2.4.into(),
1216 },
1217 ])),
1218 vec!["k".into(), "k2".into()],
1219 Array2::from_shape_vec((4, 2), vec![1.into(), 2.into(), 11.into(), 3.into(), 4.into(), 5.into(), 4.into(), 5.into()]).unwrap()
1220 )]
1221 #[case(
1222 DataFrame::new(ColumnFrame::from(vec![
1223 hashmap! {
1224 "k".into() => 1.into(),
1225 "k2".into() => 2.into(),
1226 "k3".into() => 2.2.into(),
1227 },
1228 hashmap! {
1229 "k".into() => 11.into(),
1230 "k2".into() => 3.into(),
1231 },
1232 hashmap! {
1233 "k".into() => 4.into(),
1234 "k2".into() => 5.into(),
1235 "k3".into() => 2.3.into(),
1236 },
1237 hashmap! {
1238 "k".into() => 4.into(),
1239 "k2".into() => 5.into(),
1240 "k3".into() => 2.4.into(),
1241 },
1242 ])),
1243 vec!["k2".into(), "k3".into(), "nonexist1".into(), "nonexists2".into(), "k".into()],
1244 Array2::from_shape_vec((4, 5), vec![
1245 2.into(), 2.2.into(), DataValue::Null, DataValue::Null, 1.into(),
1246 3.into(), DataValue::Null, DataValue::Null, DataValue::Null, 11.into(),
1247 5.into(), 2.3.into(), DataValue::Null, DataValue::Null, 4.into(),
1248 5.into(), 2.4.into(), DataValue::Null, DataValue::Null, 4.into()]).unwrap()
1249 )]
1250 #[traced_test]
1251 fn select_multiple(
1252 #[case] input: DataFrame,
1253 #[case] columns: Vec<Key>,
1254 #[case] expected: Array2<DataValue>,
1255 ) {
1256 let selected = input.select(Some(&columns));
1257 assert!(selected.is_ok());
1258 let selected = selected.unwrap();
1259
1260 assert_eq!(selected, expected);
1261 }
1262
1263 #[rstest]
1264 #[case(
1265 DataFrame::new(ColumnFrame::from(vec![
1266 hashmap! {
1267 "k".into() => 1.into(),
1268 "k2".into() => 2.into(),
1269 "k3".into() => 2.2.into(),
1270 },
1271 hashmap! {
1272 "k".into() => 11.into(),
1273 "k2".into() => 3.into(),
1274 },
1275 hashmap! {
1276 "k".into() => 4.into(),
1277 "k2".into() => 5.into(),
1278 "k3".into() => 2.3.into(),
1279 },
1280 hashmap! {
1281 "k".into() => 4.into(),
1282 "k2".into() => 5.into(),
1283 "k3".into() => 2.4.into(),
1284 },
1285 ])),
1286 "k".into(),
1287 Array2::from_shape_vec((4, 3), vec![
1288 1.into(), 2.into(), 2.2.into(),
1289 4.into(), 5.into(), 2.3.into(),
1290 4.into(), 5.into(), 2.4.into(),
1291 11.into(), 3.into(), DataValue::Null,
1292 ]
1293 ).unwrap(),
1294 vec!["k".into(), "k2".into(), "k3".into()],
1295 )]
1296 #[rstest]
1297 #[case(
1298 DataFrame::new(ColumnFrame::from(vec![
1299 hashmap! {
1300 "k".into() => 1.into(),
1301 "k2".into() => 2.into(),
1302 "k3".into() => 2.2.into(),
1303 },
1304 hashmap! {
1305 "k".into() => 11.into(),
1306 "k2".into() => 3.into(),
1307 },
1308 hashmap! {
1309 "k".into() => 4.into(),
1310 "k2".into() => 5.into(),
1311 "k3".into() => 2.3.into(),
1312 },
1313 hashmap! {
1314 "k".into() => 4.into(),
1315 "k2".into() => 5.into(),
1316 "k3".into() => 2.4.into(),
1317 },
1318 ])),
1319 "k3".into(),
1320 Array2::from_shape_vec((4, 3), vec![
1321 11.into(), 3.into(), DataValue::Null,
1322 1.into(), 2.into(), 2.2.into(),
1323 4.into(), 5.into(), 2.3.into(),
1324 4.into(), 5.into(), 2.4.into(),
1325 ]
1326 ).unwrap(),
1327 vec!["k".into(), "k2".into(), "k3".into()],
1328 )]
1329 #[case(
1330 DataFrame::new(ColumnFrame::from(vec![
1331 hashmap! {
1332 "k".into() => 2.into(),
1333 "k2".into() => 0.000001.into(),
1334 },
1335 hashmap! {
1336 "k".into() => 1.into(),
1337 "k2".into() =>0.0000001.into(),
1338 },
1339 hashmap! {
1340 "k".into() => 3.into(),
1341 "k2".into() => 0.00001.into(),
1342 },
1343 hashmap! {
1344 "k".into() => 4.into(),
1345 "k2".into() => 0.001.into(),
1346 },
1347 ])),
1348 "k2".into(),
1349 Array2::from_shape_vec((4, 2), vec![
1350 1.into(), 0.0000001.into(),
1351 2.into(), 0.000001.into(),
1352 3.into(), 0.00001.into(),
1353 4.into(), 0.001.into(),
1354 ]
1355 ).unwrap(),
1356 vec!["k".into(), "k2".into()],
1357 )]
1358 #[case(
1359 DataFrame::new(ColumnFrame::from(vec![
1360 hashmap! {
1361 "k".into() => 2.into(),
1362 "k2".into() => "b".into(),
1363 },
1364 hashmap! {
1365 "k".into() => 1.into(),
1366 "k2".into() =>"a".into(),
1367 },
1368 hashmap! {
1369 "k".into() => 3.into(),
1370 "k2".into() =>"c".into(),
1371 },
1372 hashmap! {
1373 "k".into() => 4.into(),
1374 "k2".into() =>"z".into(),
1375 },
1376 ])),
1377 "k2".into(),
1378 Array2::from_shape_vec((4, 2), vec![
1379 1.into(),"a".into(),
1380 2.into(), "b".into(),
1381 3.into(), "c".into(),
1382 4.into(), "z".into(),
1383 ]
1384 ).unwrap(),
1385 vec!["k".into(), "k2".into()],
1386 )]
1387 #[traced_test]
1388 fn sort_by(
1389 #[case] input: DataFrame,
1390 #[case] column: Key,
1391 #[case] expected: Array2<DataValue>,
1392 #[case] columns: Vec<Key>,
1393 ) {
1394 let result = input.sorted(&column);
1395 assert!(result.is_ok(), "{result:?}");
1396 let result = result.unwrap().get_sorted();
1397 let selected = result.select(Some(&columns));
1398
1399 assert_eq!(selected, expected);
1400 }
1401 #[rstest]
1402 #[case(
1403 DataFrame::new(ColumnFrame::from(vec![
1404 hashmap! {
1405 "k".into() => 2.into(),
1406 "k2".into() => 0.000001.into(),
1407 },
1408 hashmap! {
1409 "k".into() => 1.into(),
1410 "k2".into() =>0.0000001.into(),
1411 },
1412 hashmap! {
1413 "k".into() => 3.into(),
1414 "k2".into() => 0.00001.into(),
1415 },
1416 hashmap! {
1417 "k".into() => 4.into(),
1418 "k2".into() => 0.001.into(),
1419 },
1420 ])),
1421 "k2".into(),
1422 TopN::Last(1),
1423 Array2::from_shape_vec((1, 2), vec![
1424 4.into(), 0.001.into(),
1425 ]
1426 ).unwrap(),
1427 vec!["k".into(), "k2".into()],
1428 )]
1429 #[case(
1430 DataFrame::new(ColumnFrame::from(vec![
1431 hashmap! {
1432 "k".into() => 2.into(),
1433 "k2".into() => 0.000001.into(),
1434 },
1435 hashmap! {
1436 "k".into() => 1.into(),
1437 "k2".into() =>0.0000001.into(),
1438 },
1439 hashmap! {
1440 "k".into() => 3.into(),
1441 "k2".into() => 0.00001.into(),
1442 },
1443 hashmap! {
1444 "k".into() => 4.into(),
1445 "k2".into() => 0.001.into(),
1446 },
1447 ])),
1448 "k2".into(),
1449 TopN::Last(2),
1450 Array2::from_shape_vec((2, 2), vec![
1451 4.into(), 0.001.into(),
1452 3.into(), 0.00001.into(),
1453 ]
1454 ).unwrap(),
1455 vec!["k".into(), "k2".into()],
1456 )]
1457 #[case(
1458 DataFrame::new(ColumnFrame::from(vec![
1459 hashmap! {
1460 "k".into() => 2.into(),
1461 "k2".into() => "b".into(),
1462 },
1463 hashmap! {
1464 "k".into() => 1.into(),
1465 "k2".into() =>"a".into(),
1466 },
1467 hashmap! {
1468 "k".into() => 3.into(),
1469 "k2".into() =>"c".into(),
1470 },
1471 hashmap! {
1472 "k".into() => 4.into(),
1473 "k2".into() =>"z".into(),
1474 },
1475 ])),
1476 "k2".into(),
1477 TopN::First(1),
1478 Array2::from_shape_vec((1, 2), vec![
1479 1.into(),"a".into(),
1480 ]
1481 ).unwrap(),
1482 vec!["k".into(), "k2".into()],
1483 )]
1484 #[case(
1485 DataFrame::new(ColumnFrame::from(vec![
1486 hashmap! {
1487 "k".into() => 2.into(),
1488 "k2".into() => "b".into(),
1489 },
1490 hashmap! {
1491 "k".into() => 1.into(),
1492 "k2".into() =>"a".into(),
1493 },
1494 hashmap! {
1495 "k".into() => 3.into(),
1496 "k2".into() =>"c".into(),
1497 },
1498 hashmap! {
1499 "k".into() => 4.into(),
1500 "k2".into() =>"z".into(),
1501 },
1502 ])),
1503 "k2".into(),
1504 TopN::First(2),
1505 Array2::from_shape_vec((2, 2), vec![
1506 1.into(),"a".into(),
1507 2.into(),"b".into(),
1508 ]
1509 ).unwrap(),
1510 vec!["k".into(), "k2".into()],
1511 )]
1512 #[traced_test]
1513 fn top_n(
1514 #[case] input: DataFrame,
1515 #[case] column: Key,
1516 #[case] topn: TopN,
1517 #[case] expected: Array2<DataValue>,
1518 #[case] columns: Vec<Key>,
1519 ) {
1520 let result = input.sorted(&column);
1521 assert!(result.is_ok(), "{result:?}");
1522 let result = result.unwrap();
1523 let first = result.topn(topn).unwrap();
1524 let selected = first.select(Some(&columns));
1525 assert_eq!(selected, expected);
1526 }
1527
1528 #[rstest]
1529 fn test_messagepack_roundtrip_empty_dataframe() {
1530 let df = DataFrame::default();
1531
1532 let bytes = df
1533 .store_into_messagepack()
1534 .expect("failed to serialize empty df");
1535 let restored =
1536 DataFrame::load_from_messagepack(&bytes).expect("failed to deserialize empty df");
1537 assert_eq!(df, restored);
1538 assert!(restored.is_empty());
1539 }
1540
1541 #[rstest]
1542 fn test_messagepack_roundtrip_strings_and_bools() {
1543 let df = DataFrame::new(ColumnFrame::from(vec![
1545 hashmap! {
1546 "str".into() => DataValue::String("hello".into()),
1547 "bool".into() => DataValue::Bool(true),
1548 },
1549 hashmap! {
1550 "str".into() => DataValue::String("".into()),
1551 "bool".into() => DataValue::Bool(false),
1552 },
1553 ]));
1554
1555 let bytes = df.store_into_messagepack().expect("failed to serialize");
1556 let restored = DataFrame::load_from_messagepack(&bytes).expect("failed to deserialize");
1557 assert_eq!(df, restored);
1558 }
1559
1560 #[rstest]
1561 fn test_messagepack_roundtrip_f64_values() {
1562 let df = DataFrame::new(ColumnFrame::from(vec![
1563 hashmap! {
1564 "a".into() => DataValue::F64(3.14),
1565 },
1566 hashmap! {
1567 "a".into() => DataValue::F64(-2.718),
1568 },
1569 ]));
1570
1571 let bytes = df.store_into_messagepack().expect("failed to serialize");
1572 let restored = DataFrame::load_from_messagepack(&bytes).expect("failed to deserialize");
1573 assert_eq!(df, restored);
1574 }
1575
1576 #[rstest]
1577 fn test_messagepack_f64_special_values_survive_roundtrip() {
1578 let df = DataFrame::new(ColumnFrame::from(vec![hashmap! {
1581 "a".into() => DataValue::F64(f64::INFINITY),
1582 }]));
1583
1584 let bytes = df.store_into_messagepack().expect("failed to serialize");
1585 let restored = DataFrame::load_from_messagepack(&bytes).expect("failed to deserialize");
1586 assert_eq!(restored.len(), 1);
1587 let col = restored.select_column("a".into()).expect("col exists");
1588 match &col[0] {
1589 DataValue::F64(v) => assert!(v.is_infinite() && v.is_sign_positive()),
1590 other => panic!("expected F64, got {other:?}"),
1591 }
1592 }
1593
1594 #[rstest]
1595 fn test_messagepack_roundtrip_with_nulls() {
1596 let df = DataFrame::new(ColumnFrame::from(vec![
1597 hashmap! {
1598 "a".into() => DataValue::String("x".into()),
1599 "b".into() => DataValue::String("y".into()),
1600 },
1601 hashmap! {
1602 "a".into() => DataValue::String("z".into()),
1603 },
1605 ]));
1606
1607 let bytes = df.store_into_messagepack().expect("failed to serialize");
1608 let restored = DataFrame::load_from_messagepack(&bytes).expect("failed to deserialize");
1609 assert_eq!(df, restored);
1610 }
1611
1612 #[rstest]
1613 fn test_messagepack_roundtrip_with_metadata() {
1614 let mut df = DataFrame::new(crate::column_frame! {
1615 "col" => ["a", "b"]
1616 });
1617 df.add_metadata("name".into(), DataValue::String("test_df".into()));
1618 df.add_metadata("flag".into(), DataValue::Bool(true));
1619
1620 let bytes = df.store_into_messagepack().expect("failed to serialize");
1621 let restored = DataFrame::load_from_messagepack(&bytes).expect("failed to deserialize");
1622 assert_eq!(df, restored);
1623 assert_eq!(
1624 restored.get_metadata("name"),
1625 Some(&DataValue::String("test_df".into()))
1626 );
1627 assert_eq!(restored.get_metadata("flag"), Some(&DataValue::Bool(true)));
1628 }
1629
1630 #[rstest]
1631 fn test_messagepack_roundtrip_with_constants() {
1632 let mut df = DataFrame::new(crate::column_frame! {
1633 "x" => ["a", "b"]
1634 });
1635 df.insert_constant("const_key".into(), DataValue::String("const_val".into()));
1636 df.insert_constant("const_flag".into(), DataValue::Bool(false));
1637
1638 let bytes = df.store_into_messagepack().expect("failed to serialize");
1639 let restored = DataFrame::load_from_messagepack(&bytes).expect("failed to deserialize");
1640 assert_eq!(df, restored);
1641 assert_eq!(
1642 restored.constants.get(&"const_key".into()),
1643 Some(&DataValue::String("const_val".into()))
1644 );
1645 }
1646
1647 #[rstest]
1648 fn test_messagepack_integer_type_coercion() {
1649 let df = crate::df! {
1652 "a" => [1i64, 2i64, 3i64]
1653 };
1654
1655 let bytes = df.store_into_messagepack().expect("failed to serialize");
1656 let restored = DataFrame::load_from_messagepack(&bytes).expect("failed to deserialize");
1657
1658 assert_eq!(restored.len(), 3);
1660
1661 let col = restored
1663 .select_column("a".into())
1664 .expect("column should exist");
1665 assert_ne!(
1667 col[0],
1668 DataValue::I64(1),
1669 "messagepack coerces small ints to compact types"
1670 );
1671 }
1672
1673 #[rstest]
1674 fn test_messagepack_large_i64_preserved() {
1675 let df = DataFrame::new(ColumnFrame::from(vec![hashmap! {
1677 "big".into() => DataValue::I64(i64::MIN),
1678 }]));
1679
1680 let bytes = df.store_into_messagepack().expect("failed to serialize");
1681 let restored = DataFrame::load_from_messagepack(&bytes).expect("failed to deserialize");
1682 assert_eq!(df, restored);
1683 }
1684
1685 #[rstest]
1686 fn test_messagepack_load_invalid_bytes() {
1687 let result = DataFrame::load_from_messagepack(&[0xFF, 0xFE, 0xFD, 0x00]);
1688 assert!(result.is_err());
1689 }
1690
1691 #[rstest]
1692 fn test_messagepack_load_empty_bytes() {
1693 let result = DataFrame::load_from_messagepack(&[]);
1694 assert!(result.is_err());
1695 }
1696
1697 #[rstest]
1698 fn test_messagepack_load_truncated_bytes() {
1699 let df = DataFrame::new(ColumnFrame::from(vec![
1700 hashmap! {
1701 "a".into() => DataValue::String("hello world".into()),
1702 "b".into() => DataValue::Bool(true),
1703 },
1704 hashmap! {
1705 "a".into() => DataValue::String("test".into()),
1706 "b".into() => DataValue::Bool(false),
1707 },
1708 ]));
1709 let bytes = df.store_into_messagepack().expect("failed to serialize");
1710 let truncated = &bytes[..bytes.len() / 2];
1712 let result = DataFrame::load_from_messagepack(truncated);
1713 assert!(result.is_err());
1714 }
1715
1716 #[rstest]
1717 fn test_messagepack_roundtrip_with_nested_vec_data() {
1718 let df = DataFrame::new(ColumnFrame::from(vec![hashmap! {
1719 "vec_col".into() => DataValue::Vec(vec![
1720 DataValue::String("a".into()),
1721 DataValue::String("b".into()),
1722 ]),
1723 "bytes_col".into() => DataValue::Bytes(vec![0, 1, 255]),
1724 }]));
1725
1726 let bytes = df.store_into_messagepack().expect("failed to serialize");
1727 let restored = DataFrame::load_from_messagepack(&bytes).expect("failed to deserialize");
1728 assert_eq!(df, restored);
1729 }
1730
1731 #[rstest]
1732 fn test_messagepack_roundtrip_preserves_row_count() {
1733 let df = DataFrame::new(ColumnFrame::from(vec![
1734 hashmap! { "a".into() => DataValue::String("x".into()) },
1735 hashmap! { "a".into() => DataValue::String("y".into()) },
1736 hashmap! { "a".into() => DataValue::String("z".into()) },
1737 ]));
1738
1739 let bytes = df.store_into_messagepack().expect("failed to serialize");
1740 let restored = DataFrame::load_from_messagepack(&bytes).expect("failed to deserialize");
1741 assert_eq!(restored.len(), 3);
1742 assert_eq!(restored.n_rows(), 3);
1743 assert_eq!(restored.n_columns(), 1);
1744 }
1745
1746 #[rstest]
1747 fn test_messagepack_idempotent_double_roundtrip() {
1748 let mut df = DataFrame::new(ColumnFrame::from(vec![
1750 hashmap! {
1751 "a".into() => DataValue::String("hello".into()),
1752 "b".into() => DataValue::Bool(true),
1753 },
1754 hashmap! {
1755 "a".into() => DataValue::String("world".into()),
1756 "b".into() => DataValue::Bool(false),
1757 },
1758 ]));
1759 df.add_metadata("meta".into(), DataValue::Bool(true));
1760 df.insert_constant("c".into(), DataValue::String("const".into()));
1761
1762 let bytes1 = df.store_into_messagepack().expect("first serialize");
1763 let restored1 = DataFrame::load_from_messagepack(&bytes1).expect("first deserialize");
1764 let bytes2 = restored1
1765 .store_into_messagepack()
1766 .expect("second serialize");
1767 let restored2 = DataFrame::load_from_messagepack(&bytes2).expect("second deserialize");
1768
1769 assert_eq!(df, restored2);
1770 assert_eq!(bytes1, bytes2);
1771 }
1772
1773 #[rstest]
1774 fn test_messagepack_single_byte_payload() {
1775 let result = DataFrame::load_from_messagepack(&[0x01]);
1777 assert!(result.is_err());
1778 }
1779
1780 #[rstest]
1783 fn test_hash_datavalue_public_api_accessible() {
1784 let val = DataValue::I32(42);
1786 let h = crate::hash_datavalue(&val);
1787 assert_eq!(h, crate::hash_datavalue(&DataValue::I32(42)));
1789 }
1790
1791 #[rstest]
1792 fn test_hash_datavalue_vec_length_matters() {
1793 let short = DataValue::Vec(vec![DataValue::I32(1)]);
1795 let long = DataValue::Vec(vec![DataValue::I32(1), DataValue::Null]);
1796 assert_ne!(crate::hash_datavalue(&short), crate::hash_datavalue(&long));
1797 }
1798
1799 #[rstest]
1800 fn test_hash_datavalue_map_different_keys_same_values() {
1801 let mut m1 = std::collections::HashMap::new();
1802 m1.insert("a".into(), DataValue::I32(1));
1803 let mut m2 = std::collections::HashMap::new();
1804 m2.insert("b".into(), DataValue::I32(1));
1805
1806 assert_ne!(
1807 crate::hash_datavalue(&DataValue::Map(m1)),
1808 crate::hash_datavalue(&DataValue::Map(m2))
1809 );
1810 }
1811
1812 #[rstest]
1813 fn test_hash_datavalue_empty_string_vs_empty_bytes() {
1814 let empty_str = DataValue::String("".into());
1815 let empty_bytes = DataValue::Bytes(vec![]);
1816 assert_ne!(
1817 crate::hash_datavalue(&empty_str),
1818 crate::hash_datavalue(&empty_bytes)
1819 );
1820 }
1821
1822 #[rstest]
1823 fn test_hash_datavalue_empty_vec_vs_empty_map() {
1824 let empty_vec = DataValue::Vec(vec![]);
1825 let empty_map = DataValue::Map(std::collections::HashMap::new());
1826 assert_ne!(
1827 crate::hash_datavalue(&empty_vec),
1828 crate::hash_datavalue(&empty_map)
1829 );
1830 }
1831
1832 #[rstest]
1833 fn test_hash_datavalue_i128_boundary_values() {
1834 let max = DataValue::I128(i128::MAX);
1835 let min = DataValue::I128(i128::MIN);
1836 let zero = DataValue::I128(0);
1837 let neg_one = DataValue::I128(-1);
1838
1839 let hashes: std::collections::HashSet<u64> = [&max, &min, &zero, &neg_one]
1841 .iter()
1842 .map(|v| crate::hash_datavalue(v))
1843 .collect();
1844 assert_eq!(hashes.len(), 4);
1845 }
1846
1847 #[rstest]
1848 fn test_hash_datavalue_u128_boundary_values() {
1849 let max = DataValue::U128(u128::MAX);
1850 let zero = DataValue::U128(0);
1851 let one = DataValue::U128(1);
1852 let i128_neg1 = DataValue::I128(-1);
1854
1855 assert_ne!(
1856 crate::hash_datavalue(&max),
1857 crate::hash_datavalue(&i128_neg1)
1858 );
1859 let hashes: std::collections::HashSet<u64> = [&max, &zero, &one]
1860 .iter()
1861 .map(|v| crate::hash_datavalue(v))
1862 .collect();
1863 assert_eq!(hashes.len(), 3);
1864 }
1865
1866 #[rstest]
1867 fn test_hash_datavalue_f64_special_values() {
1868 let nan1 = DataValue::F64(f64::NAN);
1870 let nan2 = DataValue::F64(f64::NAN);
1871 assert_eq!(crate::hash_datavalue(&nan1), crate::hash_datavalue(&nan2));
1872
1873 let subnormal = DataValue::F64(f64::MIN_POSITIVE / 2.0);
1875 let normal = DataValue::F64(f64::MIN_POSITIVE);
1876 assert_ne!(
1877 crate::hash_datavalue(&subnormal),
1878 crate::hash_datavalue(&normal)
1879 );
1880 }
1881
1882 #[rstest]
1883 fn test_hash_datavalue_enum_number_vs_i32_same_value() {
1884 let enum_val = DataValue::EnumNumber(42);
1886 let i32_val = DataValue::I32(42);
1887 assert_ne!(
1888 crate::hash_datavalue(&enum_val),
1889 crate::hash_datavalue(&i32_val)
1890 );
1891 }
1892
1893 #[rstest]
1894 fn get_single_column_typed_f64_from_i32() {
1895 let df = crate::df! {
1896 "a" => [1i32, 2i32, 3i32]
1897 };
1898 let key: Key = "a".into();
1899 let col = df.get_single_column_typed::<f64>(&key).unwrap();
1900 assert_eq!(col, ndarray::arr1(&[1.0f64, 2.0, 3.0]));
1901 }
1902
1903 #[rstest]
1904 fn get_single_column_typed_string() {
1905 let df = crate::df! {
1906 "name" => ["alice", "bob"]
1907 };
1908 let key: Key = "name".into();
1909 let col = df.get_single_column_typed::<String>(&key).unwrap();
1910 assert_eq!(
1911 col,
1912 ndarray::arr1(&["alice".to_string(), "bob".to_string()])
1913 );
1914 }
1915
1916 #[rstest]
1917 fn get_single_column_typed_missing_key() {
1918 let df = crate::df! {
1919 "a" => [1u64, 2u64]
1920 };
1921 let missing: Key = "z".into();
1922 assert!(df.get_single_column_typed::<u64>(&missing).is_none());
1923 }
1924
1925 #[rstest]
1926 fn get_single_column_typed_matches_untyped() {
1927 let df = crate::df! {
1928 "v" => [10u64, 20u64, 30u64]
1929 };
1930 let key: Key = "v".into();
1931 let typed = df.get_single_column_typed::<u64>(&key).unwrap();
1932 let untyped = df.get_single_column(&key).unwrap();
1933 for (t, u) in typed.iter().zip(untyped.iter()) {
1934 assert_eq!(*t, u64::extract(u));
1935 }
1936 }
1937
1938 #[rstest]
1939 fn get_single_column_typed_bool_from_i32() {
1940 let df = crate::df! {
1941 "flag" => [1i32, 0i32, 1i32, 0i32]
1942 };
1943 let key: Key = "flag".into();
1944 let col = df.get_single_column_typed::<bool>(&key).unwrap();
1945 assert_eq!(col, ndarray::arr1(&[true, false, true, false]));
1946 }
1947
1948 #[rstest]
1949 fn get_single_column_typed_i64_from_u32() {
1950 let df = crate::df! {
1951 "x" => [10u32, 20u32, 30u32]
1952 };
1953 let key: Key = "x".into();
1954 let col = df.get_single_column_typed::<i64>(&key).unwrap();
1955 assert_eq!(col, ndarray::arr1(&[10i64, 20i64, 30i64]));
1956 }
1957
1958 #[rstest]
1959 fn get_single_column_typed_f64_truncation_to_i32() {
1960 let df = crate::df! {
1961 "v" => [1.9f64, 2.1f64, 3.7f64]
1962 };
1963 let key: Key = "v".into();
1964 let col = df.get_single_column_typed::<i32>(&key).unwrap();
1965 assert_eq!(col, ndarray::arr1(&[1i32, 2i32, 3i32]));
1966 }
1967
1968 #[rstest]
1969 fn get_single_column_typed_single_element() {
1970 let df = crate::df! {
1971 "solo" => [42u64]
1972 };
1973 let key: Key = "solo".into();
1974 let col = df.get_single_column_typed::<f64>(&key).unwrap();
1975 assert_eq!(col.len(), 1);
1976 assert_eq!(col[0], 42.0);
1977 }
1978
1979 #[rstest]
1980 fn select_typed_all_columns() {
1981 let df = crate::df! {
1982 "a" => [1i32, 2i32, 3i32],
1983 "b" => [4i32, 5i32, 6i32]
1984 };
1985 let result = df.select_typed::<f64>(None).unwrap();
1986 assert_eq!(result.nrows(), 3);
1987 assert_eq!(result.ncols(), 2);
1988 assert_eq!(result[[0, 0]], 1.0);
1989 assert_eq!(result[[0, 1]], 4.0);
1990 assert_eq!(result[[2, 0]], 3.0);
1991 assert_eq!(result[[2, 1]], 6.0);
1992 }
1993
1994 #[rstest]
1995 fn select_typed_specific_keys() {
1996 let df = crate::df! {
1997 "x" => [10u64, 20u64],
1998 "y" => [30u64, 40u64],
1999 "z" => [50u64, 60u64]
2000 };
2001 let keys: Vec<Key> = vec!["x".into(), "z".into()];
2002 let result = df.select_typed::<i64>(Some(&keys)).unwrap();
2003 assert_eq!(result.nrows(), 2);
2004 assert_eq!(result.ncols(), 2);
2005 assert_eq!(result[[0, 0]], 10i64);
2006 assert_eq!(result[[0, 1]], 50i64);
2007 assert_eq!(result[[1, 0]], 20i64);
2008 assert_eq!(result[[1, 1]], 60i64);
2009 }
2010
2011 #[rstest]
2012 fn select_typed_nonexistent_key_gives_empty() {
2013 let df = crate::df! {
2014 "a" => [1i32, 2i32]
2015 };
2016 let keys: Vec<Key> = vec!["missing".into()];
2017 let result = df.select_typed::<f64>(Some(&keys)).unwrap();
2018 assert_eq!(result.shape(), &[0, 0]);
2019 }
2020
2021 #[rstest]
2022 fn select_typed_matches_select_with_extract() {
2023 let df = crate::df! {
2024 "a" => [1u64, 2u64, 3u64],
2025 "b" => [4u64, 5u64, 6u64]
2026 };
2027 let typed = df.select_typed::<f64>(None).unwrap();
2028 let manual = df.select(None).unwrap().mapv(|v| f64::extract(&v));
2029 assert_eq!(typed, manual);
2030 }
2031
2032 #[rstest]
2033 fn select_typed_string_values() {
2034 let df = crate::df! {
2035 "name" => ["alice", "bob", "carol"]
2036 };
2037 let result = df.select_typed::<String>(None).unwrap();
2038 assert_eq!(result[[0, 0]], "alice");
2039 assert_eq!(result[[1, 0]], "bob");
2040 assert_eq!(result[[2, 0]], "carol");
2041 }
2042
2043 #[rstest]
2044 fn select_typed_cross_numeric_coercion() {
2045 let df = crate::df! {
2047 "a" => [1i32, 2i32, 3i32]
2048 };
2049 let result = df.select_typed::<u64>(None).unwrap();
2050 assert_eq!(result[[0, 0]], 1u64);
2051 assert_eq!(result[[1, 0]], 2u64);
2052 assert_eq!(result[[2, 0]], 3u64);
2053 }
2054}