1use column_store::sorted_df::SortedDataFrame;
2use data_value::{DataValue, Extract};
3use halfbrown::HashMap;
4use ndarray::{Array1, Array2};
5use std::fmt;
6pub mod column_store;
8pub mod index;
10pub mod join;
12pub mod key;
14use crate::{error::Error, CandidateData};
15#[cfg(feature = "python")]
16pub mod python;
17
18#[cfg(feature = "python")]
19use pyo3::prelude::*;
20
21use crate::{
22 dataframe::{
23 column_store::typed_array::TypedDataArray, column_store::ColumnFrame,
24 column_store::MaybeView, join::JoinRelation, key::Key,
25 },
26 MLChefMap,
27};
28
/// Selects a number of rows counted from one end of an ordered sequence.
///
/// NOTE(review): presumably applied to a sorted frame, where `First(n)` means the
/// leading `n` rows and `Last(n)` the trailing `n` rows — confirm against the consumer.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum TopN {
    /// Count taken from the front.
    First(usize),
    /// Count taken from the back.
    Last(usize),
}
40
41#[derive(Debug, Clone, PartialEq, Default, serde::Serialize, serde::Deserialize)]
67#[cfg_attr(feature = "python", pyclass)]
68pub struct DataFrame {
69 pub constants: HashMap<Key, DataValue>,
73 pub dataframe: ColumnFrame,
75 pub metadata: HashMap<String, DataValue>,
77}
78
79impl fmt::Display for DataFrame {
80 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
81 self.dataframe.fmt(f)
82 }
83}
84
85impl DataFrame {
86 pub fn new<C: Into<ColumnFrame>>(dataframe: C) -> Self {
101 Self {
102 constants: HashMap::new(),
103 dataframe: dataframe.into(),
104 metadata: HashMap::new(),
105 }
106 }
107
108 pub fn n_columns(&self) -> usize {
110 self.dataframe.ncolumns()
111 }
112
113 pub fn n_rows(&self) -> usize {
115 self.dataframe.nrows()
116 }
117
118 pub fn shrink(&mut self) {
121 self.dataframe.shrink();
122 }
123
124 pub fn add_metadata(&mut self, key: String, value: DataValue) {
129 self.metadata.insert(key, value);
130 }
131
132 pub fn get_metadata(&self, key: &str) -> Option<&DataValue> {
134 self.metadata.get(key)
135 }
136
137 pub fn join(&mut self, other: Self, join_type: &JoinRelation) -> Result<(), Error> {
149 for (key, value) in other.constants {
150 self.constants.insert(key, value);
151 }
152 self.dataframe.join(other.dataframe, join_type)
153 }
154
155 pub fn apply_function<F>(&mut self, keys: &[Key], mut func: F) -> Result<(), Error>
160 where
161 F: FnMut(&[Key], &mut ColumnFrame) -> Result<(), Error>,
162 {
163 self.dataframe.apply_function(keys, &mut func)
164 }
165
166 pub fn select(&self, keys: Option<&[Key]>) -> Result<Array2<DataValue>, Error> {
181 Ok(self.dataframe.select(keys))
182 }
183
184 pub fn select_typed<T: Extract + Clone>(
211 &self,
212 keys: Option<&[Key]>,
213 ) -> Result<Array2<T>, Error> {
214 Ok(self.dataframe.select_typed(keys))
215 }
216
217 pub fn select_view(&self, keys: Option<&[Key]>) -> Result<MaybeView<'_>, Error> {
245 self.dataframe.select_view(keys)
246 }
247
248 pub fn select_vec_view(
278 &self,
279 keys: Option<&[Key]>,
280 ) -> Result<Vec<Option<&TypedDataArray>>, Error> {
281 self.dataframe.select_vec_view(keys)
282 }
283
284 pub fn select_transposed_typed<D: Extract>(&self, keys: &[Key]) -> Vec<Vec<D>> {
290 self.dataframe.select_transposed_typed::<D>(keys)
291 }
292
293 #[deprecated(note = "allocates O(n); use get_column() for zero-copy typed access")]
299 pub fn select_column(&self, key: Key) -> Option<ndarray::Array1<DataValue>> {
300 #[allow(deprecated)]
301 self.dataframe.select_column(&key)
302 }
303
304 pub fn select_transposed(&self, keys: Option<&[Key]>) -> Result<Array2<DataValue>, Error> {
310 self.dataframe.select_transposed(keys)
311 }
312
313 pub fn insert_constant(&mut self, key: Key, value: DataValue) {
319 self.constants.insert(key, value);
320 }
321
322 pub fn push<C: CandidateData>(&mut self, item: C) -> Result<(), Error> {
328 self.dataframe.push(item)
329 }
330
331 pub fn remove_column(&mut self, keys: &[Key]) -> Result<Self, Error> {
334 self.dataframe.remove_column(keys).map(|x| x.into())
335 }
336
337 pub fn extend(&mut self, items: Self) -> Result<(), Error> {
342 self.dataframe.extend(items.dataframe)
343 }
344
345 pub fn len(&self) -> usize {
347 self.dataframe.nrows()
348 }
349
350 pub fn is_empty(&self) -> bool {
352 self.dataframe.is_empty()
353 }
354
355 pub fn add_single_column<K, V>(&mut self, key: K, values: V) -> Result<(), Error>
365 where
366 K: Into<Key>,
367 V: Into<TypedDataArray>,
368 {
369 self.dataframe.add_single_column(key, values)
370 }
371
372 pub fn get_column(&self, key: &Key) -> Option<&TypedDataArray> {
381 self.dataframe.get_column(key).ok()
382 }
383
384 #[deprecated(note = "allocates O(n); use get_column() for zero-copy typed access")]
390 pub fn get_single_column(&self, key: &Key) -> Option<Array1<DataValue>> {
391 #[allow(deprecated)]
392 self.dataframe.get_single_column(key)
393 }
394
395 pub fn get_single_column_typed<T: Extract>(&self, key: &Key) -> Option<Array1<T>> {
422 self.dataframe.get_single_column_typed(key)
423 }
424
425 pub fn sorted(&self, key: &Key) -> Result<SortedDataFrame<'_>, Error> {
430 self.dataframe.sorted(key)
431 }
432
433 pub fn filter(&self, filter: &crate::filter::FilterRules) -> Result<Self, Error> {
440 let filtered_df = self.dataframe.filter(filter)?;
441 Ok(Self {
442 constants: self.constants.clone(),
443 dataframe: filtered_df,
444 metadata: self.metadata.clone(),
445 })
446 }
447
448 #[cfg(feature = "polars-df")]
453 #[allow(deprecated)]
454 pub fn as_polars(&self) -> Result<polars::prelude::DataFrame, Error> {
455 let mut columns = vec![];
456 for key in self.dataframe.keys() {
457 let values = self
458 .dataframe
459 .get_single_column(key)
460 .ok_or_else(|| Error::NotFound(key.clone()))?
461 .into_iter()
462 .map(|x| into_polars_value(key, x.clone()))
463 .collect::<Vec<_>>();
464 let s = polars::prelude::Column::new(key.name().into(), values);
465
466 columns.push(s);
467 }
468
469 Ok(polars::prelude::DataFrame::new(columns)?)
470 }
471
472 pub fn load_from_messagepack(bytes: &[u8]) -> Result<Self, Error> {
477 rmp_serde::decode::from_slice(bytes).map_err(|e| Error::UnknownError(format!("{e:?}")))
478 }
479
480 pub fn store_into_messagepack(&self) -> Result<Vec<u8>, Error> {
485 rmp_serde::encode::to_vec(&self).map_err(|e| Error::UnknownError(format!("{e:?}")))
486 }
487}
488
/// Maps a crate [`crate::DataType`] onto the polars data type used to represent it.
///
/// `Vec` becomes a list with an unknown inner type, `Map` an empty struct and
/// `Unknown` the polars null type.
#[cfg(feature = "polars-df")]
pub fn polars_dtype(dtype: crate::DataType) -> polars::prelude::DataType {
    // Both enums share variant names (`String`, `Unknown`), so use explicit aliases
    // instead of glob imports.
    use crate::DataType as Src;
    use polars::prelude::DataType as Dst;

    match dtype {
        Src::Bool => Dst::Boolean,
        Src::U8 => Dst::UInt8,
        Src::U32 => Dst::UInt32,
        Src::U64 => Dst::UInt64,
        Src::U128 => Dst::UInt128,
        Src::I32 => Dst::Int32,
        Src::I64 => Dst::Int64,
        Src::I128 => Dst::Int128,
        Src::F32 => Dst::Float32,
        Src::F64 => Dst::Float64,
        Src::String => Dst::String,
        Src::Bytes => Dst::Binary,
        Src::Unknown => Dst::Null,
        Src::Vec => Dst::List(Box::new(Dst::Unknown(polars::prelude::UnknownKind::Any))),
        Src::Map => Dst::Struct(vec![]),
    }
}
516
/// Converts a [`DataValue`] into an owned polars `AnyValue`.
///
/// The value is first passed through `convert_dv_to_dtype` with `key`, then mapped
/// variant by variant:
/// * scalars map to the polars scalar of the same width (`U8` → `UInt8`, matching
///   [`polars_dtype`] and [`from_polars_value`]); `EnumNumber` maps to `Int32`;
/// * `Vec` becomes a `List` series named after `key`; the element key carries the
///   dtype of the first element whose detected dtype is not `Unknown`;
/// * `Map` becomes a `StructOwned` whose fields are ordered by map key so the
///   result is deterministic.
///
/// # Panics
/// Panics when polars cannot build a series from the elements of a `Vec` value.
#[cfg(feature = "polars-df")]
pub fn into_polars_value(key: &Key, dv: DataValue) -> polars::prelude::AnyValue<'static> {
    use polars::prelude::AnyValue::*;
    use polars::prelude::Field;

    use crate::dataframe::column_store::convert_dv_to_dtype;

    match convert_dv_to_dtype(key, dv) {
        DataValue::String(smart_string) => StringOwned(smart_string.as_str().into()),
        DataValue::Bytes(items) => BinaryOwned(items),
        DataValue::Bool(x) => Boolean(x),
        DataValue::U8(x) => UInt8(x),
        DataValue::U32(x) => UInt32(x),
        DataValue::U64(x) => UInt64(x),
        DataValue::U128(x) => UInt128(x),
        DataValue::I32(x) => Int32(x),
        DataValue::I64(x) => Int64(x),
        DataValue::I128(x) => Int128(x),
        DataValue::F32(x) => Float32(x),
        DataValue::F64(x) => Float64(x),
        DataValue::EnumNumber(x) => Int32(x),
        DataValue::Null => Null,
        DataValue::Vec(data_values) => {
            // The first element with a known dtype decides the dtype of the element key.
            let element_dtype = data_values
                .iter()
                .map(|d| crate::detect_dtype(d))
                .find(|dt| !matches!(dt, crate::DataType::Unknown))
                .unwrap_or(crate::DataType::Unknown);
            let vec_key = Key::new(key.name(), element_dtype);
            let elements = data_values
                .into_iter()
                .map(|x| into_polars_value(&vec_key, x))
                .collect::<Vec<_>>();
            // Build the panic message lazily: it is only needed on failure.
            let series =
                polars::series::Series::from_any_values(key.name().into(), &elements, true)
                    .unwrap_or_else(|e| panic!("Cannot create series for {key:?}: {e:?}"));
            List(series)
        }
        DataValue::Map(x) => {
            // Take ownership of the entries: no second lookup, no clone, no panic path.
            let mut entries = x.into_iter().collect::<Vec<_>>();
            // Keys are unique, so an unstable sort gives the same order as a stable one.
            entries.sort_unstable_by(|(a, _), (b, _)| a.cmp(b));

            let mut values = Vec::with_capacity(entries.len());
            let mut fields = Vec::with_capacity(entries.len());
            for (name, value) in entries {
                let dtype = crate::detect_dtype(&value);
                let field_key = Key::new(&name, dtype);
                values.push(into_polars_value(&field_key, value));
                fields.push(Field::new(field_key.name().into(), polars_dtype(dtype)));
            }
            StructOwned(Box::new((values, fields)))
        }
    }
}
581
/// Converts a polars `AnyValue` into a [`DataValue`].
///
/// Narrow integers are widened losslessly (`UInt16` → `U32`, `Int8`/`Int16` → `I32`),
/// lists are converted element by element and owned structs become a map keyed by
/// field name. `UInt128` is mapped to `U128` so that values produced by
/// [`into_polars_value`] survive the round trip.
///
/// Any other variant is logged with `tracing::warn!` and converted to
/// [`DataValue::Null`].
#[cfg(feature = "polars-df")]
pub fn from_polars_value(dv: polars::prelude::AnyValue<'_>) -> DataValue {
    use polars::prelude::AnyValue::*;
    match dv {
        Null => DataValue::Null,
        Boolean(v) => v.into(),
        String(v) => DataValue::String(v.into()),
        StringOwned(v) => DataValue::String(v.as_str().into()),
        Binary(v) => DataValue::Bytes(v.to_owned()),
        BinaryOwned(v) => DataValue::Bytes(v),
        UInt8(v) => DataValue::U8(v),
        UInt16(v) => DataValue::U32(u32::from(v)),
        UInt32(v) => v.into(),
        UInt64(v) => v.into(),
        UInt128(v) => DataValue::U128(v),
        Int8(v) => i32::from(v).into(),
        Int16(v) => i32::from(v).into(),
        Int32(v) => v.into(),
        Int64(v) => v.into(),
        Int128(v) => v.into(),
        Float32(v) => v.into(),
        Float64(v) => v.into(),
        List(series) => DataValue::Vec(series.iter().map(from_polars_value).collect::<Vec<_>>()),
        StructOwned(m) => {
            let (values, fields) = *m;
            // Pair every field with its value; a later duplicate field name wins.
            let hm: std::collections::HashMap<smartstring::alias::String, DataValue> = fields
                .into_iter()
                .zip(values)
                .map(|(k, v)| (k.name.as_str().into(), from_polars_value(v)))
                .collect();
            DataValue::Map(hm)
        }
        e => {
            tracing::warn!("Unsupported polars value: {e:?}");
            DataValue::Null
        }
    }
}
624
625impl From<ColumnFrame> for DataFrame {
626 fn from(dataframe: ColumnFrame) -> Self {
627 Self::new(dataframe)
628 }
629}
630
631impl From<Vec<std::collections::HashMap<Key, DataValue>>> for DataFrame {
632 fn from(dataframe: Vec<std::collections::HashMap<Key, DataValue>>) -> Self {
633 Self::new(ColumnFrame::from(dataframe))
634 }
635}
636
637impl From<Vec<HashMap<Key, DataValue>>> for DataFrame {
638 fn from(dataframe: Vec<HashMap<Key, DataValue>>) -> Self {
639 Self::new(ColumnFrame::from(dataframe))
640 }
641}
642
643impl From<std::collections::HashMap<String, Vec<DataValue>>> for DataFrame {
644 fn from(dataframe: std::collections::HashMap<String, Vec<DataValue>>) -> Self {
645 Self::new(ColumnFrame::from(dataframe))
646 }
647}
648
649impl From<MLChefMap> for DataFrame {
650 fn from(dataframe: MLChefMap) -> Self {
651 Self::new(ColumnFrame::from(dataframe))
652 }
653}
654impl From<Vec<(Key, Vec<DataValue>)>> for DataFrame {
655 fn from(dataframe: Vec<(Key, Vec<DataValue>)>) -> Self {
656 Self::new(ColumnFrame::from(dataframe))
657 }
658}
659
660impl From<std::collections::HashMap<String, Array1<DataValue>>> for DataFrame {
661 fn from(dataframe: std::collections::HashMap<String, Array1<DataValue>>) -> Self {
662 Self::new(ColumnFrame::from(dataframe))
663 }
664}
665
/// Builds a frame from a polars `DataFrame` via its [`ColumnFrame`] conversion.
#[cfg(feature = "polars-df")]
impl From<polars::prelude::DataFrame> for DataFrame {
    fn from(dataframe: polars::prelude::DataFrame) -> Self {
        let columns: ColumnFrame = dataframe.into();
        Self::new(columns)
    }
}
672#[cfg(test)]
673#[allow(deprecated)]
674mod test {
675 use crate::filter::FilterRules;
676
677 use super::*;
678 use halfbrown::hashmap;
679 #[cfg(feature = "polars-df")]
680 use polars::prelude::NamedFrom as _;
681 use rstest::*;
682 use tracing_test::traced_test;
683 #[fixture]
684 fn dummy_candidates() -> ColumnFrame {
685 ColumnFrame::from(vec![
686 hashmap! {
687 "key1".into() => 1.into(),
688 "key2".into() => "a".into(),
689 },
690 hashmap! {
691 "key1".into() => 2.into(),
692 "key2".into() => "b".into(),
693 },
694 ])
695 }
696
697 #[rstest]
698 fn test_serde() {
699 let df = crate::df! {
700 "a" => [1u64, 2u64, 3u64],
701 "b" => [4u64, 5u64, 6u64],
702 "c" => [7u64, 8u64, 9u64]
703 };
704
705 let serialized = serde_json::to_string(&df).expect("BUG: Unable to serialize dataframe");
706
707 let deserialized =
708 serde_json::from_str(&serialized).expect("BUG: Unable to deserialize dataframe");
709
710 assert_eq!(df, deserialized);
711 }
712
/// A polars frame converted into a `DataFrame` must select the same values as the
/// equivalent frame built with `df!`.
#[cfg(feature = "polars-df")]
#[rstest]
fn test_polars() {
    let expected = crate::df! {
        "a" => [1u64, 2u64, 3u64],
        "b" => [4f64, 5f64, 6f64],
        "c" => [7i64, 8i64, 9i64]
    };

    let polars_df = polars::df!(
        "a" => [1u64, 2u64, 3u64],
        "b" => [4f64, 5f64, 6f64],
        "c" => [7i64, 8i64, 9i64]
    )
    .expect("BUG: should be ok");
    let as_df: DataFrame = polars_df.into();

    let keys: Vec<Key> = vec!["a".into(), "b".into(), "c".into()];
    let converted = as_df.select(Some(keys.as_slice()));
    let reference = expected.select(Some(keys.as_slice()));
    assert_eq!(converted, reference);
}
735 #[cfg(feature = "polars-df")]
736 use crate::DataType;
/// Each case checks both directions: `DataValue` → polars value and back again.
#[cfg(feature = "polars-df")]
#[rstest]
#[case::str(
    Key::new("a", DataType::String),
    DataValue::String("test".into()),
    polars::prelude::AnyValue::String("test".into())
)]
#[case::u32(
    Key::new("a", DataType::U32),
    DataValue::U32(u32::MAX),
    polars::prelude::AnyValue::UInt32(u32::MAX)
)]
#[case::i32(
    Key::new("a", DataType::I32),
    DataValue::I32(i32::MIN),
    polars::prelude::AnyValue::Int32(i32::MIN)
)]
#[case::i64(
    Key::new("a", DataType::I64),
    DataValue::I64(i64::MIN),
    polars::prelude::AnyValue::Int64(i64::MIN)
)]
#[case::u64(
    Key::new("a", DataType::U64),
    DataValue::U64(u64::MIN),
    polars::prelude::AnyValue::UInt64(u64::MIN)
)]
#[case::f32(
    Key::new("a", DataType::F32),
    DataValue::F32(f32::MIN),
    polars::prelude::AnyValue::Float32(f32::MIN)
)]
#[case::f64(
    Key::new("a", DataType::F64),
    DataValue::F64(f64::MIN),
    polars::prelude::AnyValue::Float64(f64::MIN)
)]
#[case::null(
    Key::new("a", DataType::Unknown),
    DataValue::Null,
    polars::prelude::AnyValue::Null
)]
#[case::i128(
    Key::new("a", DataType::I128),
    DataValue::I128(i128::MIN),
    polars::prelude::AnyValue::Int128(i128::MIN)
)]
#[case::u8(
    Key::new("a", DataType::U8),
    DataValue::U8(255),
    polars::prelude::AnyValue::UInt8(255)
)]
#[case::bool(
    Key::new("a", DataType::Bool),
    DataValue::Bool(true),
    polars::prelude::AnyValue::Boolean(true)
)]
#[case::bytes(
    Key::new("a", DataType::Bytes),
    DataValue::Bytes("aaaaa".as_bytes().to_vec()),
    polars::prelude::AnyValue::BinaryOwned("aaaaa".as_bytes().to_vec())
)]
#[case::vec_uints(
    Key::new("a", DataType::Vec),
    DataValue::Vec(vec![DataValue::U32(0), DataValue::U32(1)]),
    polars::prelude::AnyValue::List(polars::series::Series::new(
        "v".into(),
        vec![polars::prelude::AnyValue::UInt32(0u32), polars::prelude::AnyValue::UInt32(1)]
    ))
)]
#[case::map(
    Key::new("a", DataType::Map),
    DataValue::Map(data_value::stdhashmap!("a" => 0u64, "b" => "s")),
    polars::prelude::AnyValue::StructOwned(Box::new((
        vec![polars::prelude::AnyValue::UInt64(0u64), polars::prelude::AnyValue::String("s".into())],
        vec![
            polars::prelude::Field::new("a".into(), polars::prelude::DataType::UInt64),
            polars::prelude::Field::new("b".into(), polars::prelude::DataType::String)
        ]
    )))
)]
fn into_polars_value_test(
    #[case] key: Key,
    #[case] input: DataValue,
    #[case] output: polars::prelude::AnyValue<'static>,
) {
    // Forward direction: crate value -> polars value.
    let converted = into_polars_value(&key, input.clone());
    assert_eq!(converted, output);

    // Backward direction: polars value -> crate value.
    let restored = from_polars_value(output);
    assert_eq!(restored, input);
}
806
807 #[rstest]
819 #[case(
820 DataFrame::new(crate::column_frame! {
821 "a" => [1f64, 2f64, 3f64],
822 "b" => [4i64, 5i64, 6i64],
823 "c" => [7i64, 8i64, 9i64]
824 }),
825 DataFrame::new(crate::column_frame! {
826 "a" => [1f64, 2f64],
827 "b" => [4i64, 5i64],
828 "c" => [7i64, 8i64]
829 }),
830 FilterRules::try_from("a >= 1f64 && (b <= 5 || c <= 8) && b >= 4").expect("BUG: cannot create filter rules"),
831 )]
832 #[case(
833 DataFrame::new(crate::column_frame! {
834 "a" => [1f64, 2f64, 3f64],
835 "b" => [4i64, 5i64, 6i64],
836 "c" => [7i64, 8i64, 9i64]
837 }),
838 DataFrame::new(crate::column_frame! {
839 "a" => [2f64],
840 "b" => [5i64],
841 "c" => [8i64]
842 }),
843 FilterRules::try_from("a % 2f64 == 0f64").expect("BUG: cannot create filter rules"),
844 )]
845 #[traced_test]
846 fn filter_test(
847 #[case] df: DataFrame,
848 #[case] expected: DataFrame,
849 #[case] filter: FilterRules,
850 ) {
851 let filtered = df.filter(&filter).expect("BUG: cannot filter");
852 assert_eq!(filtered, expected);
853 }
854
855 #[rstest]
856 fn test_serde_complex() {
857 let simple = r#"
858{
859 "constants": {},
860 "dataframe": {
861 "index": {
862 "keys": [
863 {
864 "key": 3162770485,
865 "name": "a",
866 "ctype": "U32"
867 },
868 {
869 "key": 2279056742,
870 "name": "b",
871 "ctype": "F64"
872 },
873 {
874 "key": 2994984227,
875 "name": "c",
876 "ctype": "U64"
877 },
878 {
879 "key": 3319645144,
880 "name": "d",
881 "ctype": "F64"
882 },
883 {
884 "key": 1291847470,
885 "name": "e",
886 "ctype": "U32"
887 },
888 {
889 "key": 874241070,
890 "name": "f",
891 "ctype": "Bool"
892 }
893 ],
894 "indexes": {
895 "a": 0,
896 "b": 1,
897 "c": 2,
898 "d": 3,
899 "e": 4,
900 "f": 5
901 },
902 "alias": {}
903 },
904 "data_frame": {
905 "v": 1,
906 "dim": [
907 2,
908 6
909 ],
910 "data": [
911 253780,
912 0.009369421750307085,
913 1633222860381359,
914 8,
915 5,
916 true,
917 64512,
918 0.003391335718333721,
919 1633222860810557,
920 8,
921 5,
922 null
923 ]
924 }
925 },
926 "metadata": {}
927}
928 "#;
929
930 let simple_deserialized: DataFrame =
931 serde_json::from_str(simple).expect("BUG: Unable to deserialize dataframe");
932
933 println!("deserialized: {simple_deserialized:?}");
934 let array = format!("[{}, {}, {}]", simple, simple, simple);
935 let deserialized: Vec<DataFrame> =
936 serde_json::from_str(&array).expect("BUG: Unable to deserialize dataframe");
937
938 println!("deserialized: {deserialized:?}");
939 assert_eq!(deserialized.len(), 3);
940 assert_eq!(simple_deserialized, deserialized[0]);
941 }
942
943 #[rstest]
944 #[case(hashmap!("key1".into() => vec![1.into(), 2.into()], "key2".into() => vec!["a".into()]))]
945 #[case(data_value::stdhashmap!("key1" => vec![1, 2], "key2" => vec!["a"]))]
946 #[case(vec![hashmap! {
947 "key1".into() => 1.into(),
948 "key2".into() => "a".into(),
949 },
950 hashmap! {
951 "key1".into() => 2.into(),
952 },])]
953 #[case(vec![data_value::stdhashmap! {
954 "key1" => DataValue::from(1),
955 "key2" => DataValue::from("a"),
956 },data_value::stdhashmap! {
957 "key1" => DataValue::from(2),
958 },])]
959 #[case(vec![("key1".into(), vec! [DataValue::from(1), DataValue::from(2)]), ("key2".into(),
960 vec![DataValue::from("a"), DataValue::Null])])]
961 fn test_select_column<T: Into<DataFrame>>(#[case] input: T) {
962 let df: DataFrame = input.into();
963 assert_eq!(
964 df,
965 DataFrame {
966 constants: HashMap::new(),
967 dataframe: ColumnFrame::from(vec![
968 hashmap! {
969 "key1".into() => 1.into(),
970 "key2".into() => "a".into(),
971 },
972 hashmap! {
973 "key1".into() => 2.into(),
974 },
975 ]),
976 metadata: HashMap::new(),
977 }
978 );
979 let selected_transposed = df.select_column("key1".into());
980 assert!(selected_transposed.is_some());
981 let selected_transposed = selected_transposed.unwrap();
982 assert_eq!(selected_transposed.len(), 2);
983 assert_eq!(selected_transposed, ndarray::array![1.into(), 2.into()]);
984 }
985
986 #[rstest]
987 #[case::hhm(hashmap!("key1".into() => vec![1.into(), 2.into()], "key2".into() => vec!["a".into()]))]
988 #[case::stdhm(data_value::stdhashmap!("key1" => vec![1, 2], "key2" => vec!["a"]))]
989 #[case::hm({
990 let hm: std::collections::HashMap<String, Array1<DataValue>> = data_value::stdhashmap!("key1".to_string() => Array1::from_vec(vec![DataValue::from(1), DataValue::from(2)]), "key2".to_string() => Array1::from_vec(vec![DataValue::from("a"), DataValue::Null]));
991 hm
992 })]
993 #[case::vec_hhm(vec![hashmap! {
994 "key1".into() => 1.into(),
995 "key2".into() => "a".into(),
996 },
997 hashmap! {
998 "key1".into() => 2.into(),
999 },])]
1000 #[case::vec_hme(vec![data_value::stdhashmap! {
1001 "key1" => DataValue::from(1),
1002 "key2" => DataValue::from("a"),
1003 },data_value::stdhashmap! {
1004 "key1" => DataValue::from(2),
1005 },])]
1006 #[case::vec_vec(vec![("key1".into(), vec! [DataValue::from(1), DataValue::from(2)]), ("key2".into(), vec![DataValue::from("a"), DataValue::Null])])]
1007 fn test_from_conversion<T: Into<DataFrame>>(#[case] input: T) {
1008 let df: DataFrame = input.into();
1009 let expected: DataFrame = DataFrame {
1010 constants: HashMap::new(),
1011 dataframe: ColumnFrame::from(vec![
1012 hashmap! {
1013 "key1".into() => 1.into(),
1014 "key2".into() => "a".into(),
1015 },
1016 hashmap! {
1017 "key1".into() => 2.into(),
1018 },
1019 ]),
1020 metadata: HashMap::new(),
1021 };
1022 assert_eq!(
1023 df.select(Some(&["key1".into(), "key2".into()])),
1024 expected.select(Some(&["key1".into(), "key2".into()])),
1025 "{df} vs {expected}"
1026 );
1027 let selected_transposed = df.select_transposed_typed::<i32>(&["key1".into()]);
1028 assert_eq!(selected_transposed.len(), 2);
1029 println!("{:?}", selected_transposed);
1030 assert_eq!(selected_transposed, vec![vec![1], vec![2]]);
1031 }
1032 #[rstest]
1033 fn test_dataframe(dummy_candidates: ColumnFrame) {
1034 let mut dataframe: DataFrame = DataFrame::default();
1035 assert!(dataframe.is_empty());
1036 assert!(dataframe.extend(dummy_candidates.into()).is_ok());
1037 assert_eq!(dataframe.len(), 2);
1038
1039 let candidate = hashmap! {
1040 "key1".into() => 3.into(),
1041 "key2".into() => "c".into(),
1042 };
1043
1044 assert!(dataframe.push(candidate).is_ok());
1045 assert_eq!(dataframe.len(), 3);
1046 assert!(!dataframe.is_empty());
1047
1048 dataframe.insert_constant("key3".into(), 4.into());
1049 assert_eq!(dataframe.constants.len(), 1);
1050 assert!(dataframe
1051 .apply_function(&["key1".into()], |keys, df| {
1052 let key = keys[0].clone();
1053 let s = df
1054 .get_single_column(&key)
1055 .expect("BUG: Cannot get column")
1056 .to_owned();
1057 let s = s.mapv(|x| x + DataValue::from(1));
1058 df.add_single_column("key5", s)?;
1059 Ok(())
1060 })
1061 .is_ok());
1062 let original = dataframe.clone();
1063 dataframe.shrink();
1064 let remove_df = dataframe.remove_column(&["key1".into()]);
1065 assert!(remove_df.is_ok());
1066 let mut remove_df = remove_df.unwrap();
1067 assert_eq!(remove_df.len(), 3);
1068 let selected = dataframe.select(Some(&["key2".into()]));
1069 assert!(selected.is_ok());
1070 let selected = selected.unwrap();
1071 println!("{:?}", selected);
1072
1073 let joined_result =
1075 remove_df.join(dataframe, &JoinRelation::new(crate::JoinBy::AddColumns));
1076 assert!(joined_result.is_ok(), "{:?}", joined_result);
1077 let keys = vec!["key1".into(), "key2".into(), "key5".into()];
1078 assert_eq!(
1079 original.select(Some(keys.as_slice())),
1080 remove_df.select(Some(keys.as_slice()))
1081 );
1082 }
1083
1084 #[rstest]
1085 fn test_size_methods() {
1086 let candidate = hashmap! {
1087 "key1".into() => 3.into(),
1088 "key2".into() => "c".into(),
1089 "key3".into() => false.into()
1090 };
1091
1092 let dataframe: DataFrame = vec![candidate].into();
1093
1094 assert_eq!(dataframe.n_columns(), 3);
1095 assert_eq!(dataframe.n_rows(), 1);
1096 }
1097
1098 #[rstest]
1099 fn test_metadata(dummy_candidates: ColumnFrame) {
1100 let mut dataframe: DataFrame = DataFrame::default();
1101 assert!(dataframe.is_empty());
1102 println!("{:?}", dataframe);
1103 assert!(dataframe.extend(dummy_candidates.into()).is_ok());
1104 println!("{:?}", dataframe);
1105 assert_eq!(dataframe.len(), 2);
1106
1107 dataframe.add_metadata("test".into(), 1.into());
1108 assert_eq!(dataframe.get_metadata("test"), Some(&1.into()));
1109 let dataframe = DataFrame::new(ColumnFrame::from(vec![
1110 hashmap! {
1111 "key1".into() => 1.into(),
1112 "key2".into() => "a".into(),
1113 },
1114 hashmap! {
1115 "key1".into() => 2.into(),
1116 "key2".into() => "b".into(),
1117 },
1118 ]));
1119 assert_eq!(dataframe.get_metadata("test"), None);
1120 let tt = dataframe.select_transposed(None);
1121 assert!(tt.is_ok());
1122 let tt = tt.unwrap();
1123 assert_eq!(tt.shape(), [2, 2]);
1124 assert_eq!(
1125 tt,
1126 Array2::from_shape_vec((2, 2), vec![1.into(), 2.into(), "a".into(), "b".into()])
1127 .unwrap()
1128 );
1129 }
1130
1131 #[rstest]
1132 #[traced_test]
1133 fn add_single_column_test() {
1134 let mut dataframe = DataFrame::default();
1135 let values: Array1<DataValue> = Array1::from_vec(vec![1.into(), 2.into(), 3.into()]);
1136 let r = dataframe.add_single_column("key1", values);
1137 assert!(r.is_ok(), "{r:?}");
1138 let selected = dataframe.select(None);
1139 assert!(selected.is_ok());
1140 let selected = selected.unwrap();
1141 assert_eq!(selected.shape(), [3, 1]);
1142 assert_eq!(
1143 selected,
1144 Array2::from_shape_vec((3, 1), vec![1.into(), 2.into(), 3.into()]).unwrap()
1145 );
1146 let values: Array1<i32> = Array1::from_vec(vec![1, 2]);
1147 assert!(dataframe.add_single_column("key1", values).is_err());
1148 let values: Vec<i32> = vec![3i32, 4, 5];
1149 assert!(dataframe.add_single_column("key2", values).is_ok());
1150 let values: Array1<i32> = Array1::from_vec(vec![3i32]);
1151 assert!(dataframe.add_single_column("key3", values).is_err());
1152 }
1153
1154 #[rstest]
1155 #[traced_test]
1156 fn add_single_column_empty_test() {
1157 let mut dataframe = DataFrame::default();
1158 let values: Array1<DataValue> = Array1::from(vec![]);
1159 let r = dataframe.add_single_column("key1", values);
1160 assert!(r.is_ok(), "{r:?}");
1161 let selected = dataframe.select(None);
1162 assert!(selected.is_ok());
1163 let selected = selected.unwrap();
1164 assert_eq!(selected.shape(), [0, 1]);
1165 assert_eq!(selected, Array2::from_shape_vec((0, 1), vec![]).unwrap());
1166 let values: Array1<DataValue> = Array1::from(vec![1.into(), 2.into()]);
1167 assert!(dataframe.add_single_column("key1", values).is_err());
1168 let values: Array1<DataValue> = Array1::from(vec![3.into(), 4.into(), 5.into()]);
1169 assert!(dataframe.add_single_column("key2", values).is_ok());
1170 let values: Array1<DataValue> = Array1::from(vec![3.into(), 4.into()]);
1171 assert!(dataframe.add_single_column("key3", values).is_err());
1172 let values: Array1<DataValue> = Array1::from(vec![3.into(), 4.into(), 5.into()]);
1173 assert!(dataframe.add_single_column("key3", values).is_ok());
1174
1175 assert_eq!(
1176 dataframe
1177 .select_column("key1".into())
1178 .expect("BUG: has to exists"),
1179 ndarray::arr1(&[DataValue::Null, DataValue::Null, DataValue::Null]),
1180 );
1181 assert_eq!(
1182 dataframe
1183 .select_column("key2".into())
1184 .expect("BUG: has to exists"),
1185 ndarray::arr1(&[3.into(), 4.into(), 5.into()]),
1186 );
1187 assert_eq!(
1188 dataframe.select(None).expect("BUG: cannot get data"),
1189 ndarray::arr2(&[
1190 [DataValue::Null, 3.into(), 3.into()],
1191 [DataValue::Null, 4.into(), 4.into()],
1192 [DataValue::Null, 5.into(), 5.into()],
1193 ])
1194 );
1195 }
1196
1197 #[rstest]
1198 #[case(
1199 DataFrame::new(ColumnFrame::from(vec![
1200 hashmap! {
1201 "k".into() => 1.into(),
1202 "k2".into() => 2.into(),
1203 "k3".into() => 2.2.into(),
1204 },
1205 hashmap! {
1206 "k".into() => 11.into(),
1207 "k2".into() => 3.into(),
1208 },
1209 hashmap! {
1210 "k".into() => 4.into(),
1211 "k2".into() => 5.into(),
1212 "k3".into() => 2.3.into(),
1213 },
1214 hashmap! {
1215 "k".into() => 4.into(),
1216 "k2".into() => 5.into(),
1217 "k3".into() => 2.4.into(),
1218 },
1219 ])),
1220 vec!["k".into(), "k2".into()],
1221 Array2::from_shape_vec((4, 2), vec![1.into(), 2.into(), 11.into(), 3.into(), 4.into(), 5.into(), 4.into(), 5.into()]).unwrap()
1222 )]
1223 #[case(
1224 DataFrame::new(ColumnFrame::from(vec![
1225 hashmap! {
1226 "k".into() => 1.into(),
1227 "k2".into() => 2.into(),
1228 "k3".into() => 2.2.into(),
1229 },
1230 hashmap! {
1231 "k".into() => 11.into(),
1232 "k2".into() => 3.into(),
1233 },
1234 hashmap! {
1235 "k".into() => 4.into(),
1236 "k2".into() => 5.into(),
1237 "k3".into() => 2.3.into(),
1238 },
1239 hashmap! {
1240 "k".into() => 4.into(),
1241 "k2".into() => 5.into(),
1242 "k3".into() => 2.4.into(),
1243 },
1244 ])),
1245 vec!["k2".into(), "k3".into(), "nonexist1".into(), "nonexists2".into(), "k".into()],
1246 Array2::from_shape_vec((4, 5), vec![
1247 2.into(), 2.2.into(), DataValue::Null, DataValue::Null, 1.into(),
1248 3.into(), DataValue::Null, DataValue::Null, DataValue::Null, 11.into(),
1249 5.into(), 2.3.into(), DataValue::Null, DataValue::Null, 4.into(),
1250 5.into(), 2.4.into(), DataValue::Null, DataValue::Null, 4.into()]).unwrap()
1251 )]
1252 #[traced_test]
1253 fn select_multiple(
1254 #[case] input: DataFrame,
1255 #[case] columns: Vec<Key>,
1256 #[case] expected: Array2<DataValue>,
1257 ) {
1258 let selected = input.select(Some(&columns));
1259 assert!(selected.is_ok());
1260 let selected = selected.unwrap();
1261
1262 assert_eq!(selected, expected);
1263 }
1264
1265 #[rstest]
1266 #[case(
1267 DataFrame::new(ColumnFrame::from(vec![
1268 hashmap! {
1269 "k".into() => 1.into(),
1270 "k2".into() => 2.into(),
1271 "k3".into() => 2.2.into(),
1272 },
1273 hashmap! {
1274 "k".into() => 11.into(),
1275 "k2".into() => 3.into(),
1276 },
1277 hashmap! {
1278 "k".into() => 4.into(),
1279 "k2".into() => 5.into(),
1280 "k3".into() => 2.3.into(),
1281 },
1282 hashmap! {
1283 "k".into() => 4.into(),
1284 "k2".into() => 5.into(),
1285 "k3".into() => 2.4.into(),
1286 },
1287 ])),
1288 "k".into(),
1289 Array2::from_shape_vec((4, 3), vec![
1290 1.into(), 2.into(), 2.2.into(),
1291 4.into(), 5.into(), 2.3.into(),
1292 4.into(), 5.into(), 2.4.into(),
1293 11.into(), 3.into(), DataValue::Null,
1294 ]
1295 ).unwrap(),
1296 vec!["k".into(), "k2".into(), "k3".into()],
1297 )]
1298 #[rstest]
1299 #[case(
1300 DataFrame::new(ColumnFrame::from(vec![
1301 hashmap! {
1302 "k".into() => 1.into(),
1303 "k2".into() => 2.into(),
1304 "k3".into() => 2.2.into(),
1305 },
1306 hashmap! {
1307 "k".into() => 11.into(),
1308 "k2".into() => 3.into(),
1309 },
1310 hashmap! {
1311 "k".into() => 4.into(),
1312 "k2".into() => 5.into(),
1313 "k3".into() => 2.3.into(),
1314 },
1315 hashmap! {
1316 "k".into() => 4.into(),
1317 "k2".into() => 5.into(),
1318 "k3".into() => 2.4.into(),
1319 },
1320 ])),
1321 "k3".into(),
1322 Array2::from_shape_vec((4, 3), vec![
1323 11.into(), 3.into(), DataValue::Null,
1324 1.into(), 2.into(), 2.2.into(),
1325 4.into(), 5.into(), 2.3.into(),
1326 4.into(), 5.into(), 2.4.into(),
1327 ]
1328 ).unwrap(),
1329 vec!["k".into(), "k2".into(), "k3".into()],
1330 )]
1331 #[case(
1332 DataFrame::new(ColumnFrame::from(vec![
1333 hashmap! {
1334 "k".into() => 2.into(),
1335 "k2".into() => 0.000001.into(),
1336 },
1337 hashmap! {
1338 "k".into() => 1.into(),
1339 "k2".into() =>0.0000001.into(),
1340 },
1341 hashmap! {
1342 "k".into() => 3.into(),
1343 "k2".into() => 0.00001.into(),
1344 },
1345 hashmap! {
1346 "k".into() => 4.into(),
1347 "k2".into() => 0.001.into(),
1348 },
1349 ])),
1350 "k2".into(),
1351 Array2::from_shape_vec((4, 2), vec![
1352 1.into(), 0.0000001.into(),
1353 2.into(), 0.000001.into(),
1354 3.into(), 0.00001.into(),
1355 4.into(), 0.001.into(),
1356 ]
1357 ).unwrap(),
1358 vec!["k".into(), "k2".into()],
1359 )]
1360 #[case(
1361 DataFrame::new(ColumnFrame::from(vec![
1362 hashmap! {
1363 "k".into() => 2.into(),
1364 "k2".into() => "b".into(),
1365 },
1366 hashmap! {
1367 "k".into() => 1.into(),
1368 "k2".into() =>"a".into(),
1369 },
1370 hashmap! {
1371 "k".into() => 3.into(),
1372 "k2".into() =>"c".into(),
1373 },
1374 hashmap! {
1375 "k".into() => 4.into(),
1376 "k2".into() =>"z".into(),
1377 },
1378 ])),
1379 "k2".into(),
1380 Array2::from_shape_vec((4, 2), vec![
1381 1.into(),"a".into(),
1382 2.into(), "b".into(),
1383 3.into(), "c".into(),
1384 4.into(), "z".into(),
1385 ]
1386 ).unwrap(),
1387 vec!["k".into(), "k2".into()],
1388 )]
1389 #[traced_test]
1390 fn sort_by(
1391 #[case] input: DataFrame,
1392 #[case] column: Key,
1393 #[case] expected: Array2<DataValue>,
1394 #[case] columns: Vec<Key>,
1395 ) {
1396 let result = input.sorted(&column);
1397 assert!(result.is_ok(), "{result:?}");
1398 let result = result.unwrap().get_sorted();
1399 let selected = result.select(Some(&columns));
1400
1401 assert_eq!(selected, expected);
1402 }
1403 #[rstest]
1404 #[case(
1405 DataFrame::new(ColumnFrame::from(vec![
1406 hashmap! {
1407 "k".into() => 2.into(),
1408 "k2".into() => 0.000001.into(),
1409 },
1410 hashmap! {
1411 "k".into() => 1.into(),
1412 "k2".into() =>0.0000001.into(),
1413 },
1414 hashmap! {
1415 "k".into() => 3.into(),
1416 "k2".into() => 0.00001.into(),
1417 },
1418 hashmap! {
1419 "k".into() => 4.into(),
1420 "k2".into() => 0.001.into(),
1421 },
1422 ])),
1423 "k2".into(),
1424 TopN::Last(1),
1425 Array2::from_shape_vec((1, 2), vec![
1426 4.into(), 0.001.into(),
1427 ]
1428 ).unwrap(),
1429 vec!["k".into(), "k2".into()],
1430 )]
1431 #[case(
1432 DataFrame::new(ColumnFrame::from(vec![
1433 hashmap! {
1434 "k".into() => 2.into(),
1435 "k2".into() => 0.000001.into(),
1436 },
1437 hashmap! {
1438 "k".into() => 1.into(),
1439 "k2".into() =>0.0000001.into(),
1440 },
1441 hashmap! {
1442 "k".into() => 3.into(),
1443 "k2".into() => 0.00001.into(),
1444 },
1445 hashmap! {
1446 "k".into() => 4.into(),
1447 "k2".into() => 0.001.into(),
1448 },
1449 ])),
1450 "k2".into(),
1451 TopN::Last(2),
1452 Array2::from_shape_vec((2, 2), vec![
1453 4.into(), 0.001.into(),
1454 3.into(), 0.00001.into(),
1455 ]
1456 ).unwrap(),
1457 vec!["k".into(), "k2".into()],
1458 )]
1459 #[case(
1460 DataFrame::new(ColumnFrame::from(vec![
1461 hashmap! {
1462 "k".into() => 2.into(),
1463 "k2".into() => "b".into(),
1464 },
1465 hashmap! {
1466 "k".into() => 1.into(),
1467 "k2".into() =>"a".into(),
1468 },
1469 hashmap! {
1470 "k".into() => 3.into(),
1471 "k2".into() =>"c".into(),
1472 },
1473 hashmap! {
1474 "k".into() => 4.into(),
1475 "k2".into() =>"z".into(),
1476 },
1477 ])),
1478 "k2".into(),
1479 TopN::First(1),
1480 Array2::from_shape_vec((1, 2), vec![
1481 1.into(),"a".into(),
1482 ]
1483 ).unwrap(),
1484 vec!["k".into(), "k2".into()],
1485 )]
1486 #[case(
1487 DataFrame::new(ColumnFrame::from(vec![
1488 hashmap! {
1489 "k".into() => 2.into(),
1490 "k2".into() => "b".into(),
1491 },
1492 hashmap! {
1493 "k".into() => 1.into(),
1494 "k2".into() =>"a".into(),
1495 },
1496 hashmap! {
1497 "k".into() => 3.into(),
1498 "k2".into() =>"c".into(),
1499 },
1500 hashmap! {
1501 "k".into() => 4.into(),
1502 "k2".into() =>"z".into(),
1503 },
1504 ])),
1505 "k2".into(),
1506 TopN::First(2),
1507 Array2::from_shape_vec((2, 2), vec![
1508 1.into(),"a".into(),
1509 2.into(),"b".into(),
1510 ]
1511 ).unwrap(),
1512 vec!["k".into(), "k2".into()],
1513 )]
1514 #[traced_test]
1515 fn top_n(
1516 #[case] input: DataFrame,
1517 #[case] column: Key,
1518 #[case] topn: TopN,
1519 #[case] expected: Array2<DataValue>,
1520 #[case] columns: Vec<Key>,
1521 ) {
1522 let result = input.sorted(&column);
1523 assert!(result.is_ok(), "{result:?}");
1524 let result = result.unwrap();
1525 let first = result.topn(topn).unwrap();
1526 let selected = first.select(Some(&columns));
1527 assert_eq!(selected, expected);
1528 }
1529
1530 #[rstest]
1531 fn test_messagepack_roundtrip_empty_dataframe() {
1532 let df = DataFrame::default();
1533
1534 let bytes = df
1535 .store_into_messagepack()
1536 .expect("failed to serialize empty df");
1537 let restored =
1538 DataFrame::load_from_messagepack(&bytes).expect("failed to deserialize empty df");
1539 assert_eq!(df, restored);
1540 assert!(restored.is_empty());
1541 }
1542
1543 #[rstest]
1544 fn test_messagepack_roundtrip_strings_and_bools() {
1545 let df = DataFrame::new(ColumnFrame::from(vec![
1547 hashmap! {
1548 "str".into() => DataValue::String("hello".into()),
1549 "bool".into() => DataValue::Bool(true),
1550 },
1551 hashmap! {
1552 "str".into() => DataValue::String("".into()),
1553 "bool".into() => DataValue::Bool(false),
1554 },
1555 ]));
1556
1557 let bytes = df.store_into_messagepack().expect("failed to serialize");
1558 let restored = DataFrame::load_from_messagepack(&bytes).expect("failed to deserialize");
1559 assert_eq!(df, restored);
1560 }
1561
1562 #[rstest]
1563 fn test_messagepack_roundtrip_f64_values() {
1564 let df = DataFrame::new(ColumnFrame::from(vec![
1565 hashmap! {
1566 "a".into() => DataValue::F64(3.14),
1567 },
1568 hashmap! {
1569 "a".into() => DataValue::F64(-2.718),
1570 },
1571 ]));
1572
1573 let bytes = df.store_into_messagepack().expect("failed to serialize");
1574 let restored = DataFrame::load_from_messagepack(&bytes).expect("failed to deserialize");
1575 assert_eq!(df, restored);
1576 }
1577
1578 #[rstest]
1579 fn test_messagepack_f64_special_values_survive_roundtrip() {
1580 let df = DataFrame::new(ColumnFrame::from(vec![hashmap! {
1583 "a".into() => DataValue::F64(f64::INFINITY),
1584 }]));
1585
1586 let bytes = df.store_into_messagepack().expect("failed to serialize");
1587 let restored = DataFrame::load_from_messagepack(&bytes).expect("failed to deserialize");
1588 assert_eq!(restored.len(), 1);
1589 let col = restored.select_column("a".into()).expect("col exists");
1590 match &col[0] {
1591 DataValue::F64(v) => assert!(v.is_infinite() && v.is_sign_positive()),
1592 other => panic!("expected F64, got {other:?}"),
1593 }
1594 }
1595
1596 #[rstest]
1597 fn test_messagepack_roundtrip_with_nulls() {
1598 let df = DataFrame::new(ColumnFrame::from(vec![
1599 hashmap! {
1600 "a".into() => DataValue::String("x".into()),
1601 "b".into() => DataValue::String("y".into()),
1602 },
1603 hashmap! {
1604 "a".into() => DataValue::String("z".into()),
1605 },
1607 ]));
1608
1609 let bytes = df.store_into_messagepack().expect("failed to serialize");
1610 let restored = DataFrame::load_from_messagepack(&bytes).expect("failed to deserialize");
1611 assert_eq!(df, restored);
1612 }
1613
1614 #[rstest]
1615 fn test_messagepack_roundtrip_with_metadata() {
1616 let mut df = DataFrame::new(crate::column_frame! {
1617 "col" => ["a", "b"]
1618 });
1619 df.add_metadata("name".into(), DataValue::String("test_df".into()));
1620 df.add_metadata("flag".into(), DataValue::Bool(true));
1621
1622 let bytes = df.store_into_messagepack().expect("failed to serialize");
1623 let restored = DataFrame::load_from_messagepack(&bytes).expect("failed to deserialize");
1624 assert_eq!(df, restored);
1625 assert_eq!(
1626 restored.get_metadata("name"),
1627 Some(&DataValue::String("test_df".into()))
1628 );
1629 assert_eq!(restored.get_metadata("flag"), Some(&DataValue::Bool(true)));
1630 }
1631
1632 #[rstest]
1633 fn test_messagepack_roundtrip_with_constants() {
1634 let mut df = DataFrame::new(crate::column_frame! {
1635 "x" => ["a", "b"]
1636 });
1637 df.insert_constant("const_key".into(), DataValue::String("const_val".into()));
1638 df.insert_constant("const_flag".into(), DataValue::Bool(false));
1639
1640 let bytes = df.store_into_messagepack().expect("failed to serialize");
1641 let restored = DataFrame::load_from_messagepack(&bytes).expect("failed to deserialize");
1642 assert_eq!(df, restored);
1643 assert_eq!(
1644 restored.constants.get(&"const_key".into()),
1645 Some(&DataValue::String("const_val".into()))
1646 );
1647 }
1648
1649 #[rstest]
1650 fn test_messagepack_integer_type_coercion() {
1651 let df = crate::df! {
1654 "a" => [1i64, 2i64, 3i64]
1655 };
1656
1657 let bytes = df.store_into_messagepack().expect("failed to serialize");
1658 let restored = DataFrame::load_from_messagepack(&bytes).expect("failed to deserialize");
1659
1660 assert_eq!(restored.len(), 3);
1662
1663 let col = restored
1665 .select_column("a".into())
1666 .expect("column should exist");
1667 assert_ne!(
1669 col[0],
1670 DataValue::I64(1),
1671 "messagepack coerces small ints to compact types"
1672 );
1673 }
1674
1675 #[rstest]
1676 fn test_messagepack_large_i64_preserved() {
1677 let df = DataFrame::new(ColumnFrame::from(vec![hashmap! {
1679 "big".into() => DataValue::I64(i64::MIN),
1680 }]));
1681
1682 let bytes = df.store_into_messagepack().expect("failed to serialize");
1683 let restored = DataFrame::load_from_messagepack(&bytes).expect("failed to deserialize");
1684 assert_eq!(df, restored);
1685 }
1686
1687 #[rstest]
1688 fn test_messagepack_load_invalid_bytes() {
1689 let result = DataFrame::load_from_messagepack(&[0xFF, 0xFE, 0xFD, 0x00]);
1690 assert!(result.is_err());
1691 }
1692
1693 #[rstest]
1694 fn test_messagepack_load_empty_bytes() {
1695 let result = DataFrame::load_from_messagepack(&[]);
1696 assert!(result.is_err());
1697 }
1698
1699 #[rstest]
1700 fn test_messagepack_load_truncated_bytes() {
1701 let df = DataFrame::new(ColumnFrame::from(vec![
1702 hashmap! {
1703 "a".into() => DataValue::String("hello world".into()),
1704 "b".into() => DataValue::Bool(true),
1705 },
1706 hashmap! {
1707 "a".into() => DataValue::String("test".into()),
1708 "b".into() => DataValue::Bool(false),
1709 },
1710 ]));
1711 let bytes = df.store_into_messagepack().expect("failed to serialize");
1712 let truncated = &bytes[..bytes.len() / 2];
1714 let result = DataFrame::load_from_messagepack(truncated);
1715 assert!(result.is_err());
1716 }
1717
1718 #[rstest]
1719 fn test_messagepack_roundtrip_with_nested_vec_data() {
1720 let df = DataFrame::new(ColumnFrame::from(vec![hashmap! {
1721 "vec_col".into() => DataValue::Vec(vec![
1722 DataValue::String("a".into()),
1723 DataValue::String("b".into()),
1724 ]),
1725 "bytes_col".into() => DataValue::Bytes(vec![0, 1, 255]),
1726 }]));
1727
1728 let bytes = df.store_into_messagepack().expect("failed to serialize");
1729 let restored = DataFrame::load_from_messagepack(&bytes).expect("failed to deserialize");
1730 assert_eq!(df, restored);
1731 }
1732
1733 #[rstest]
1734 fn test_messagepack_roundtrip_preserves_row_count() {
1735 let df = DataFrame::new(ColumnFrame::from(vec![
1736 hashmap! { "a".into() => DataValue::String("x".into()) },
1737 hashmap! { "a".into() => DataValue::String("y".into()) },
1738 hashmap! { "a".into() => DataValue::String("z".into()) },
1739 ]));
1740
1741 let bytes = df.store_into_messagepack().expect("failed to serialize");
1742 let restored = DataFrame::load_from_messagepack(&bytes).expect("failed to deserialize");
1743 assert_eq!(restored.len(), 3);
1744 assert_eq!(restored.n_rows(), 3);
1745 assert_eq!(restored.n_columns(), 1);
1746 }
1747
1748 #[rstest]
1749 fn test_messagepack_idempotent_double_roundtrip() {
1750 let mut df = DataFrame::new(ColumnFrame::from(vec![
1752 hashmap! {
1753 "a".into() => DataValue::String("hello".into()),
1754 "b".into() => DataValue::Bool(true),
1755 },
1756 hashmap! {
1757 "a".into() => DataValue::String("world".into()),
1758 "b".into() => DataValue::Bool(false),
1759 },
1760 ]));
1761 df.add_metadata("meta".into(), DataValue::Bool(true));
1762 df.insert_constant("c".into(), DataValue::String("const".into()));
1763
1764 let bytes1 = df.store_into_messagepack().expect("first serialize");
1765 let restored1 = DataFrame::load_from_messagepack(&bytes1).expect("first deserialize");
1766 let bytes2 = restored1
1767 .store_into_messagepack()
1768 .expect("second serialize");
1769 let restored2 = DataFrame::load_from_messagepack(&bytes2).expect("second deserialize");
1770
1771 assert_eq!(df, restored2);
1772 assert_eq!(bytes1, bytes2);
1773 }
1774
1775 #[rstest]
1776 fn test_messagepack_single_byte_payload() {
1777 let result = DataFrame::load_from_messagepack(&[0x01]);
1779 assert!(result.is_err());
1780 }
1781
1782 #[rstest]
1785 fn test_hash_datavalue_public_api_accessible() {
1786 let val = DataValue::I32(42);
1788 let h = crate::hash_datavalue(&val);
1789 assert_eq!(h, crate::hash_datavalue(&DataValue::I32(42)));
1791 }
1792
1793 #[rstest]
1794 fn test_hash_datavalue_vec_length_matters() {
1795 let short = DataValue::Vec(vec![DataValue::I32(1)]);
1797 let long = DataValue::Vec(vec![DataValue::I32(1), DataValue::Null]);
1798 assert_ne!(crate::hash_datavalue(&short), crate::hash_datavalue(&long));
1799 }
1800
1801 #[rstest]
1802 fn test_hash_datavalue_map_different_keys_same_values() {
1803 let mut m1 = std::collections::HashMap::new();
1804 m1.insert("a".into(), DataValue::I32(1));
1805 let mut m2 = std::collections::HashMap::new();
1806 m2.insert("b".into(), DataValue::I32(1));
1807
1808 assert_ne!(
1809 crate::hash_datavalue(&DataValue::Map(m1)),
1810 crate::hash_datavalue(&DataValue::Map(m2))
1811 );
1812 }
1813
1814 #[rstest]
1815 fn test_hash_datavalue_empty_string_vs_empty_bytes() {
1816 let empty_str = DataValue::String("".into());
1817 let empty_bytes = DataValue::Bytes(vec![]);
1818 assert_ne!(
1819 crate::hash_datavalue(&empty_str),
1820 crate::hash_datavalue(&empty_bytes)
1821 );
1822 }
1823
1824 #[rstest]
1825 fn test_hash_datavalue_empty_vec_vs_empty_map() {
1826 let empty_vec = DataValue::Vec(vec![]);
1827 let empty_map = DataValue::Map(std::collections::HashMap::new());
1828 assert_ne!(
1829 crate::hash_datavalue(&empty_vec),
1830 crate::hash_datavalue(&empty_map)
1831 );
1832 }
1833
1834 #[rstest]
1835 fn test_hash_datavalue_i128_boundary_values() {
1836 let max = DataValue::I128(i128::MAX);
1837 let min = DataValue::I128(i128::MIN);
1838 let zero = DataValue::I128(0);
1839 let neg_one = DataValue::I128(-1);
1840
1841 let hashes: std::collections::HashSet<u64> = [&max, &min, &zero, &neg_one]
1843 .iter()
1844 .map(|v| crate::hash_datavalue(v))
1845 .collect();
1846 assert_eq!(hashes.len(), 4);
1847 }
1848
1849 #[rstest]
1850 fn test_hash_datavalue_u128_boundary_values() {
1851 let max = DataValue::U128(u128::MAX);
1852 let zero = DataValue::U128(0);
1853 let one = DataValue::U128(1);
1854 let i128_neg1 = DataValue::I128(-1);
1856
1857 assert_ne!(
1858 crate::hash_datavalue(&max),
1859 crate::hash_datavalue(&i128_neg1)
1860 );
1861 let hashes: std::collections::HashSet<u64> = [&max, &zero, &one]
1862 .iter()
1863 .map(|v| crate::hash_datavalue(v))
1864 .collect();
1865 assert_eq!(hashes.len(), 3);
1866 }
1867
1868 #[rstest]
1869 fn test_hash_datavalue_f64_special_values() {
1870 let nan1 = DataValue::F64(f64::NAN);
1872 let nan2 = DataValue::F64(f64::NAN);
1873 assert_eq!(crate::hash_datavalue(&nan1), crate::hash_datavalue(&nan2));
1874
1875 let subnormal = DataValue::F64(f64::MIN_POSITIVE / 2.0);
1877 let normal = DataValue::F64(f64::MIN_POSITIVE);
1878 assert_ne!(
1879 crate::hash_datavalue(&subnormal),
1880 crate::hash_datavalue(&normal)
1881 );
1882 }
1883
1884 #[rstest]
1885 fn test_hash_datavalue_enum_number_vs_i32_same_value() {
1886 let enum_val = DataValue::EnumNumber(42);
1888 let i32_val = DataValue::I32(42);
1889 assert_ne!(
1890 crate::hash_datavalue(&enum_val),
1891 crate::hash_datavalue(&i32_val)
1892 );
1893 }
1894
1895 #[rstest]
1896 fn get_single_column_typed_f64_from_i32() {
1897 let df = crate::df! {
1898 "a" => [1i32, 2i32, 3i32]
1899 };
1900 let key: Key = "a".into();
1901 let col = df.get_single_column_typed::<f64>(&key).unwrap();
1902 assert_eq!(col, ndarray::arr1(&[1.0f64, 2.0, 3.0]));
1903 }
1904
1905 #[rstest]
1906 fn get_single_column_typed_string() {
1907 let df = crate::df! {
1908 "name" => ["alice", "bob"]
1909 };
1910 let key: Key = "name".into();
1911 let col = df.get_single_column_typed::<String>(&key).unwrap();
1912 assert_eq!(
1913 col,
1914 ndarray::arr1(&["alice".to_string(), "bob".to_string()])
1915 );
1916 }
1917
1918 #[rstest]
1919 fn get_single_column_typed_missing_key() {
1920 let df = crate::df! {
1921 "a" => [1u64, 2u64]
1922 };
1923 let missing: Key = "z".into();
1924 assert!(df.get_single_column_typed::<u64>(&missing).is_none());
1925 }
1926
1927 #[rstest]
1928 fn get_single_column_typed_matches_untyped() {
1929 let df = crate::df! {
1930 "v" => [10u64, 20u64, 30u64]
1931 };
1932 let key: Key = "v".into();
1933 let typed = df.get_single_column_typed::<u64>(&key).unwrap();
1934 let untyped = df.get_single_column(&key).unwrap();
1935 for (t, u) in typed.iter().zip(untyped.iter()) {
1936 assert_eq!(*t, u64::extract(u));
1937 }
1938 }
1939
1940 #[rstest]
1941 fn get_single_column_typed_bool_from_i32() {
1942 let df = crate::df! {
1943 "flag" => [1i32, 0i32, 1i32, 0i32]
1944 };
1945 let key: Key = "flag".into();
1946 let col = df.get_single_column_typed::<bool>(&key).unwrap();
1947 assert_eq!(col, ndarray::arr1(&[true, false, true, false]));
1948 }
1949
1950 #[rstest]
1951 fn get_single_column_typed_i64_from_u32() {
1952 let df = crate::df! {
1953 "x" => [10u32, 20u32, 30u32]
1954 };
1955 let key: Key = "x".into();
1956 let col = df.get_single_column_typed::<i64>(&key).unwrap();
1957 assert_eq!(col, ndarray::arr1(&[10i64, 20i64, 30i64]));
1958 }
1959
1960 #[rstest]
1961 fn get_single_column_typed_f64_truncation_to_i32() {
1962 let df = crate::df! {
1963 "v" => [1.9f64, 2.1f64, 3.7f64]
1964 };
1965 let key: Key = "v".into();
1966 let col = df.get_single_column_typed::<i32>(&key).unwrap();
1967 assert_eq!(col, ndarray::arr1(&[1i32, 2i32, 3i32]));
1968 }
1969
1970 #[rstest]
1971 fn get_single_column_typed_single_element() {
1972 let df = crate::df! {
1973 "solo" => [42u64]
1974 };
1975 let key: Key = "solo".into();
1976 let col = df.get_single_column_typed::<f64>(&key).unwrap();
1977 assert_eq!(col.len(), 1);
1978 assert_eq!(col[0], 42.0);
1979 }
1980
1981 #[rstest]
1982 fn select_typed_all_columns() {
1983 let df = crate::df! {
1984 "a" => [1i32, 2i32, 3i32],
1985 "b" => [4i32, 5i32, 6i32]
1986 };
1987 let result = df.select_typed::<f64>(None).unwrap();
1988 assert_eq!(result.nrows(), 3);
1989 assert_eq!(result.ncols(), 2);
1990 assert_eq!(result[[0, 0]], 1.0);
1991 assert_eq!(result[[0, 1]], 4.0);
1992 assert_eq!(result[[2, 0]], 3.0);
1993 assert_eq!(result[[2, 1]], 6.0);
1994 }
1995
1996 #[rstest]
1997 fn select_typed_specific_keys() {
1998 let df = crate::df! {
1999 "x" => [10u64, 20u64],
2000 "y" => [30u64, 40u64],
2001 "z" => [50u64, 60u64]
2002 };
2003 let keys: Vec<Key> = vec!["x".into(), "z".into()];
2004 let result = df.select_typed::<i64>(Some(&keys)).unwrap();
2005 assert_eq!(result.nrows(), 2);
2006 assert_eq!(result.ncols(), 2);
2007 assert_eq!(result[[0, 0]], 10i64);
2008 assert_eq!(result[[0, 1]], 50i64);
2009 assert_eq!(result[[1, 0]], 20i64);
2010 assert_eq!(result[[1, 1]], 60i64);
2011 }
2012
2013 #[rstest]
2014 fn select_typed_nonexistent_key_gives_empty() {
2015 let df = crate::df! {
2016 "a" => [1i32, 2i32]
2017 };
2018 let keys: Vec<Key> = vec!["missing".into()];
2019 let result = df.select_typed::<f64>(Some(&keys)).unwrap();
2020 assert_eq!(result.shape(), &[0, 0]);
2021 }
2022
2023 #[rstest]
2024 fn select_typed_matches_select_with_extract() {
2025 let df = crate::df! {
2026 "a" => [1u64, 2u64, 3u64],
2027 "b" => [4u64, 5u64, 6u64]
2028 };
2029 let typed = df.select_typed::<f64>(None).unwrap();
2030 let manual = df.select(None).unwrap().mapv(|v| f64::extract(&v));
2031 assert_eq!(typed, manual);
2032 }
2033
2034 #[rstest]
2035 fn select_typed_string_values() {
2036 let df = crate::df! {
2037 "name" => ["alice", "bob", "carol"]
2038 };
2039 let result = df.select_typed::<String>(None).unwrap();
2040 assert_eq!(result[[0, 0]], "alice");
2041 assert_eq!(result[[1, 0]], "bob");
2042 assert_eq!(result[[2, 0]], "carol");
2043 }
2044
2045 #[rstest]
2046 fn select_typed_cross_numeric_coercion() {
2047 let df = crate::df! {
2049 "a" => [1i32, 2i32, 3i32]
2050 };
2051 let result = df.select_typed::<u64>(None).unwrap();
2052 assert_eq!(result[[0, 0]], 1u64);
2053 assert_eq!(result[[1, 0]], 2u64);
2054 assert_eq!(result[[2, 0]], 3u64);
2055 }
2056}