1use column_store::sorted_df::SortedDataFrame;
2use data_value::{DataValue, Extract};
3use halfbrown::HashMap;
4use ndarray::{Array1, Array2, ArrayView1};
5use std::fmt;
6pub mod column_store;
7pub mod index;
8pub mod join;
9pub mod key;
10use crate::{error::Error, CandidateData};
11#[cfg(feature = "python")]
12pub mod python;
13
14#[cfg(feature = "python")]
15use pyo3::prelude::*;
16
17use crate::{
18 dataframe::{column_store::ColumnFrame, join::JoinRelation, key::Key},
19 MLChefMap,
20};
21
/// Selects a number of rows from one end of a sorted frame.
///
/// NOTE(review): the selection itself is implemented by the sorted frame's
/// `topn` method, which is not visible in this file; the variant docs below
/// are inferred from the variant names — confirm against that implementation.
#[derive(Debug, Clone, PartialEq, Eq, Copy)]
pub enum TopN {
    /// Presumably the given number of rows from the start of the sort order.
    First(usize),
    /// Presumably the given number of rows from the end of the sort order.
    Last(usize),
}
27
/// A [`ColumnFrame`] bundled with per-frame constants and free-form metadata.
///
/// The type is serializable with serde and, when the `python` feature is
/// enabled, is additionally exposed as a `pyclass`.
#[derive(Debug, Clone, PartialEq, Eq, Default, serde::Serialize, serde::Deserialize)]
#[cfg_attr(feature = "python", pyclass)]
pub struct DataFrame {
    /// Values keyed by [`Key`]; filled through `insert_constant` and merged
    /// from the other frame by `join`.
    pub constants: HashMap<Key, DataValue>,
    /// Columnar storage that the row/column operations of this type delegate to.
    pub dataframe: ColumnFrame,
    /// String-keyed values written by `add_metadata` and read by `get_metadata`.
    pub metadata: HashMap<String, DataValue>,
}
64
/// Formatting is delegated to the wrapped column frame; constants and
/// metadata are not part of the output produced here.
impl fmt::Display for DataFrame {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // NOTE(review): which `fmt` this resolves to depends on what is in
        // scope for `ColumnFrame` — confirm before changing the call form.
        self.dataframe.fmt(f)
    }
}
70
71impl DataFrame {
72 pub fn new<C: Into<ColumnFrame>>(dataframe: C) -> Self {
73 Self {
74 constants: HashMap::new(),
75 dataframe: dataframe.into(),
76 metadata: HashMap::new(),
77 }
78 }
79
80 pub fn n_columns(&self) -> usize {
82 self.dataframe.data_frame.ncols()
83 }
84
85 pub fn n_rows(&self) -> usize {
87 self.dataframe.data_frame.nrows()
88 }
89
90 pub fn shrink(&mut self) {
91 self.dataframe.shrink();
92 }
93
94 pub fn add_metadata(&mut self, key: String, value: DataValue) {
95 self.metadata.insert(key, value);
96 }
97
98 pub fn get_metadata(&self, key: &str) -> Option<&DataValue> {
99 self.metadata.get(key)
100 }
101
102 pub fn join(&mut self, other: Self, join_type: &JoinRelation) -> Result<(), Error> {
103 for (key, value) in other.constants {
104 self.constants.insert(key, value);
105 }
106 self.dataframe.join(other.dataframe, join_type)
107 }
108
109 pub fn apply_function<F>(&mut self, keys: &[Key], mut func: F) -> Result<(), Error>
110 where
111 F: FnMut(&[Key], &mut ColumnFrame) -> Result<(), Error>,
112 {
113 self.dataframe.apply_function(keys, &mut func)
114 }
115
116 pub fn select(&self, keys: Option<&[Key]>) -> Result<Array2<DataValue>, Error> {
117 Ok(self.dataframe.select(keys))
118 }
119
120 pub fn select_transposed_typed<D: Extract>(&self, keys: &[Key]) -> Vec<Vec<D>> {
125 self.dataframe.select_transposed_typed::<D>(keys)
126 }
127
128 pub fn select_column(&self, key: Key) -> Option<ndarray::ArrayView1<'_, DataValue>> {
129 self.dataframe.select_column(&key)
130 }
131
132 pub fn select_transposed(&self, keys: Option<&[Key]>) -> Result<Array2<DataValue>, Error> {
133 self.dataframe.select_transposed(keys)
134 }
135
136 pub fn insert_constant(&mut self, key: Key, value: DataValue) {
137 self.constants.insert(key, value);
138 }
139
140 pub fn push<C: CandidateData>(&mut self, item: C) -> Result<(), Error> {
141 self.dataframe.push(item)
142 }
143
144 pub fn remove_column(&mut self, keys: &[Key]) -> Result<Self, Error> {
145 self.dataframe.remove_column(keys).map(|x| x.into())
146 }
147
148 pub fn extend(&mut self, items: Self) -> Result<(), Error> {
149 self.dataframe.extend(items.dataframe)
150 }
151
152 pub fn len(&self) -> usize {
153 self.dataframe.len()
154 }
155
156 pub fn is_empty(&self) -> bool {
157 self.dataframe.is_empty()
158 }
159
160 pub fn add_single_column<K: Into<Key>>(
161 &mut self,
162 key: K,
163 values: Array1<DataValue>,
164 ) -> Result<(), Error> {
165 self.dataframe.add_single_column(key, values)
166 }
167
168 pub fn get_single_column(&self, key: &Key) -> Option<ArrayView1<'_, DataValue>> {
169 self.dataframe.get_single_column(key)
170 }
171
172 pub fn sorted(&self, key: &Key) -> Result<SortedDataFrame<'_>, Error> {
173 self.dataframe.sorted(key)
174 }
175
176 pub fn filter(&self, filter: &crate::filter::FilterRules) -> Result<Self, Error> {
177 let filtered_df = self.dataframe.filter(filter)?;
178 Ok(Self {
179 constants: self.constants.clone(),
180 dataframe: filtered_df,
181 metadata: self.metadata.clone(),
182 })
183 }
184
185 #[cfg(feature = "polars-df")]
186 pub fn as_polars(&self) -> Result<polars::prelude::DataFrame, Error> {
187 let mut columns = vec![];
188 for key in self.dataframe.keys() {
189 let values = self
190 .dataframe
191 .get_single_column(key)
192 .ok_or_else(|| Error::NotFound(key.clone()))?
193 .into_iter()
194 .map(|x| into_polars_value(key, x.clone()))
195 .collect::<Vec<_>>();
196 let s = polars::prelude::Column::new(key.name().into(), values);
197
198 columns.push(s);
199 }
200
201 Ok(polars::prelude::DataFrame::new(columns)?)
202 }
203
204 pub fn load_from_messagepack(bytes: &[u8]) -> Result<Self, Error> {
205 rmp_serde::decode::from_slice(bytes).map_err(|e| Error::UnknownError(format!("{e:?}")))
206 }
207
208 pub fn store_into_messagepack(&self) -> Result<Vec<u8>, Error> {
209 rmp_serde::encode::to_vec(&self).map_err(|e| Error::UnknownError(format!("{e:?}")))
210 }
211}
/// Maps a crate-level [`crate::DataType`] to the polars data type used for it.
///
/// `Vec` becomes a list whose inner type is left unknown and `Map` becomes a
/// struct without fields, because the element/field types are not part of the
/// crate-level type tag.
#[cfg(feature = "polars-df")]
pub fn polars_dtype(dtype: crate::DataType) -> polars::prelude::DataType {
    use crate::DataType as Src;
    use polars::prelude::DataType as Dst;
    match dtype {
        Src::Bool => Dst::Boolean,
        Src::U32 => Dst::UInt32,
        Src::I32 => Dst::Int32,
        Src::U8 => Dst::UInt8,
        Src::U64 => Dst::UInt64,
        Src::I64 => Dst::Int64,
        Src::F32 => Dst::Float32,
        Src::F64 => Dst::Float64,
        Src::U128 => Dst::UInt128,
        Src::I128 => Dst::Int128,
        Src::String => Dst::String,
        Src::Bytes => Dst::Binary,
        Src::Unknown => Dst::Null,
        Src::Vec => Dst::List(Box::new(Dst::Unknown(polars::prelude::UnknownKind::Any))),
        Src::Map => Dst::Struct(vec![]),
    }
}
236
/// Converts a [`DataValue`] into an owned polars `AnyValue`.
///
/// The value is first coerced with `convert_dv_to_dtype` using `key`, then
/// mapped variant by variant. Vectors become polars lists (element type taken
/// from the first element whose type is not `Unknown`) and maps become owned
/// structs whose fields are ordered by field name.
///
/// # Panics
/// Panics when polars cannot build a series from the elements of a
/// `DataValue::Vec`.
#[cfg(feature = "polars-df")]
pub fn into_polars_value(key: &Key, dv: DataValue) -> polars::prelude::AnyValue<'static> {
    use polars::prelude::AnyValue::*;
    use polars::prelude::Field;

    use crate::dataframe::column_store::convert_dv_to_dtype;
    let dv = convert_dv_to_dtype(key, dv);
    match dv {
        DataValue::String(smart_string) => StringOwned(smart_string.as_str().into()),
        DataValue::Bytes(items) => BinaryOwned(items),
        // NOTE(review): U8 is widened to UInt32 here although `polars_dtype`
        // maps U8 to UInt8 and `from_polars_value` maps UInt8 back to U8 —
        // confirm whether this asymmetry is intentional.
        DataValue::U8(x) => UInt32(u32::from(x)),
        DataValue::Bool(x) => Boolean(x),
        DataValue::I32(x) => Int32(x),
        DataValue::U32(x) => UInt32(x),
        DataValue::I64(x) => Int64(x),
        DataValue::U64(x) => UInt64(x),
        DataValue::I128(x) => Int128(x),
        DataValue::F32(x) => Float32(x),
        DataValue::F64(x) => Float64(x),
        DataValue::Null => Null,
        DataValue::Vec(data_values) => {
            // The element key uses the first detectable element type; a vector
            // of only undetectable values keeps `Unknown`.
            let dt = data_values
                .iter()
                .map(|d| crate::detect_dtype(d))
                .find(|d| !matches!(d, crate::DataType::Unknown))
                .unwrap_or(crate::DataType::Unknown);
            let vec_key = Key::new(key.name(), dt);
            let elements = data_values
                .into_iter()
                .map(|x| into_polars_value(&vec_key, x))
                .collect::<Vec<_>>();
            let series =
                polars::series::Series::from_any_values(key.name().into(), &elements, true);
            // The panic message is only built on failure; it keeps the
            // "<message>: <error>" layout that `expect` produced.
            List(series.unwrap_or_else(|e| panic!("Cannot create series for {key:?}: {e:?}")))
        }
        DataValue::EnumNumber(x) => Int32(x),
        DataValue::U128(x) => UInt128(x),
        DataValue::Map(x) => {
            // Consume the map and order the entries by field name so the
            // struct layout does not depend on hash iteration order. Owning
            // the entries avoids a second lookup and a clone per value.
            let mut entries = x.into_iter().collect::<Vec<_>>();
            entries.sort_unstable_by(|a, b| a.0.cmp(&b.0));
            let mut values = Vec::with_capacity(entries.len());
            let mut fields = Vec::with_capacity(entries.len());
            for (name, value) in entries {
                let dtype = crate::detect_dtype(&value);
                let k = Key::new(&name, dtype);
                fields.push(Field::new(k.name().into(), polars_dtype(dtype)));
                values.push(into_polars_value(&k, value));
            }
            StructOwned(Box::new((values, fields)))
        }
    }
}
297
/// Converts a polars `AnyValue` into a [`DataValue`].
///
/// Narrow integer types that have no dedicated `DataValue` variant in this
/// mapping are widened losslessly (`UInt16` to `U32`, `Int8`/`Int16` to
/// `I32`). Lists are converted element by element and owned structs become
/// maps keyed by field name. `UInt128` is handled so values produced by
/// `into_polars_value` for `DataValue::U128` convert back instead of
/// degrading to `Null`.
///
/// Any other polars value is logged with `tracing::warn!` and converted to
/// `DataValue::Null`.
#[cfg(feature = "polars-df")]
pub fn from_polars_value(dv: polars::prelude::AnyValue<'_>) -> DataValue {
    use polars::prelude::AnyValue::*;
    match dv {
        Null => DataValue::Null,
        Boolean(v) => v.into(),
        String(v) => DataValue::String(v.into()),
        UInt8(v) => DataValue::U8(v),
        UInt16(v) => DataValue::U32(u32::from(v)),
        UInt32(v) => v.into(),
        UInt64(v) => v.into(),
        UInt128(v) => DataValue::U128(v),
        Int8(v) => i32::from(v).into(),
        Int16(v) => i32::from(v).into(),
        Int32(v) => v.into(),
        Int64(v) => v.into(),
        Float32(v) => v.into(),
        Float64(v) => v.into(),
        Int128(v) => v.into(),
        List(series) => DataValue::Vec(series.iter().map(from_polars_value).collect::<Vec<_>>()),
        StringOwned(v) => DataValue::String(v.as_str().into()),
        Binary(v) => DataValue::Bytes(v.to_owned()),
        BinaryOwned(v) => DataValue::Bytes(v),
        StructOwned(m) => {
            // The boxed tuple holds (values, fields); pair each field name
            // with its value.
            let (values, fields) = *m;
            let hm: std::collections::HashMap<smartstring::alias::String, DataValue> = fields
                .into_iter()
                .zip(values)
                .map(|(field, value)| (field.name.as_str().into(), from_polars_value(value)))
                .collect();
            DataValue::Map(hm)
        }
        e => {
            tracing::warn!("Unsupported polars value: {e:?}");
            DataValue::Null
        }
    }
}
337
338impl From<ColumnFrame> for DataFrame {
339 fn from(dataframe: ColumnFrame) -> Self {
340 Self::new(dataframe)
341 }
342}
343
344impl From<Vec<std::collections::HashMap<Key, DataValue>>> for DataFrame {
345 fn from(dataframe: Vec<std::collections::HashMap<Key, DataValue>>) -> Self {
346 Self::new(ColumnFrame::from(dataframe))
347 }
348}
349
350impl From<Vec<HashMap<Key, DataValue>>> for DataFrame {
351 fn from(dataframe: Vec<HashMap<Key, DataValue>>) -> Self {
352 Self::new(ColumnFrame::from(dataframe))
353 }
354}
355
356impl From<std::collections::HashMap<String, Vec<DataValue>>> for DataFrame {
357 fn from(dataframe: std::collections::HashMap<String, Vec<DataValue>>) -> Self {
358 Self::new(ColumnFrame::from(dataframe))
359 }
360}
361
362impl From<MLChefMap> for DataFrame {
363 fn from(dataframe: MLChefMap) -> Self {
364 Self::new(ColumnFrame::from(dataframe))
365 }
366}
367impl From<Vec<(Key, Vec<DataValue>)>> for DataFrame {
368 fn from(dataframe: Vec<(Key, Vec<DataValue>)>) -> Self {
369 Self::new(ColumnFrame::from(dataframe))
370 }
371}
372
373impl From<std::collections::HashMap<String, Array1<DataValue>>> for DataFrame {
374 fn from(dataframe: std::collections::HashMap<String, Array1<DataValue>>) -> Self {
375 Self::new(ColumnFrame::from(dataframe))
376 }
377}
378
/// Builds a frame from a polars data frame via the column frame conversion.
#[cfg(feature = "polars-df")]
impl From<polars::prelude::DataFrame> for DataFrame {
    fn from(dataframe: polars::prelude::DataFrame) -> Self {
        let columns = ColumnFrame::from(dataframe);
        Self::new(columns)
    }
}
385#[cfg(test)]
386mod test {
387 use crate::filter::FilterRules;
388
389 use super::*;
390 use halfbrown::hashmap;
391 #[cfg(feature = "polars-df")]
392 use polars::prelude::NamedFrom as _;
393 use rstest::*;
394 use tracing_test::traced_test;
395 #[fixture]
396 fn dummy_candidates() -> ColumnFrame {
397 ColumnFrame::from(vec![
398 hashmap! {
399 "key1".into() => 1.into(),
400 "key2".into() => "a".into(),
401 },
402 hashmap! {
403 "key1".into() => 2.into(),
404 "key2".into() => "b".into(),
405 },
406 ])
407 }
408
409 #[rstest]
410 fn test_serde() {
411 let df = crate::df! {
412 "a" => [1u64, 2u64, 3u64],
413 "b" => [4u64, 5u64, 6u64],
414 "c" => [7u64, 8u64, 9u64]
415 };
416
417 let serialized = serde_json::to_string(&df).expect("BUG: Unable to serialize dataframe");
418
419 let deserialized =
420 serde_json::from_str(&serialized).expect("BUG: Unable to deserialize dataframe");
421
422 assert_eq!(df, deserialized);
423 }
424
425 #[cfg(feature = "polars-df")]
426 #[rstest]
427 fn test_polars() {
428 let expected = crate::df! {
429 "a" => [1u64, 2u64, 3u64],
430 "b" => [4f64, 5f64, 6f64],
431 "c" => [7i64, 8i64, 9i64]
432 };
433
434 let polars_df = polars::df!(
435 "a" => [1u64, 2u64, 3u64],
436 "b" => [4f64, 5f64, 6f64],
437 "c" => [7i64, 8i64, 9i64]
438 )
439 .expect("BUG: should be ok");
440 let as_df: DataFrame = polars_df.into();
441 let keys: Vec<Key> = vec!["a".into(), "b".into(), "c".into()];
442 assert_eq!(
443 as_df.select(Some(keys.as_slice())),
444 expected.select(Some(keys.as_slice()))
445 );
446 }
447 #[cfg(feature = "polars-df")]
448 use crate::DataType;
    // Round-trip check between `DataValue` and polars `AnyValue`: each case
    // provides a key, a `DataValue` and the polars value it is expected to
    // compare equal to; the test converts in both directions.
    #[cfg(feature = "polars-df")]
    #[rstest]
    #[case::str(Key::new("a", DataType::String), DataValue::String("test".into()), polars::prelude::AnyValue::String("test".into()))]
    #[case::u32(
        Key::new("a", DataType::U32),
        DataValue::U32(u32::MAX),
        polars::prelude::AnyValue::UInt32(u32::MAX)
    )]
    #[case::i32(
        Key::new("a", DataType::I32),
        DataValue::I32(i32::MIN),
        polars::prelude::AnyValue::Int32(i32::MIN)
    )]
    #[case::i64(
        Key::new("a", DataType::I64),
        DataValue::I64(i64::MIN),
        polars::prelude::AnyValue::Int64(i64::MIN)
    )]
    #[case::u64(
        Key::new("a", DataType::U64),
        DataValue::U64(u64::MIN),
        polars::prelude::AnyValue::UInt64(u64::MIN)
    )]
    #[case::f32(
        Key::new("a", DataType::F32),
        DataValue::F32(f32::MIN),
        polars::prelude::AnyValue::Float32(f32::MIN)
    )]
    #[case::f64(
        Key::new("a", DataType::F64),
        DataValue::F64(f64::MIN),
        polars::prelude::AnyValue::Float64(f64::MIN)
    )]
    #[case::null(
        Key::new("a", DataType::Unknown),
        DataValue::Null,
        polars::prelude::AnyValue::Null
    )]
    #[case::i128(
        Key::new("a", DataType::I128),
        DataValue::I128(i128::MIN),
        polars::prelude::AnyValue::Int128(i128::MIN)
    )]
    // NOTE(review): `into_polars_value` produces `UInt32` for a `U8` input, so
    // this case presumably relies on polars comparing numeric values across
    // integer widths — confirm.
    #[case::u8(
        Key::new("a", DataType::U8),
        DataValue::U8(255),
        polars::prelude::AnyValue::UInt8(255)
    )]
    #[case::bool(
        Key::new("a", DataType::Bool),
        DataValue::Bool(true),
        polars::prelude::AnyValue::Boolean(true)
    )]
    #[case::bytes(Key::new("a", DataType::Bytes), DataValue::Bytes("aaaaa".as_bytes().to_vec()), polars::prelude::AnyValue::BinaryOwned("aaaaa".as_bytes().to_vec()))]
    #[case::vec_uints(Key::new("a", DataType::Vec), DataValue::Vec(vec![DataValue::U32(0), DataValue::U32(1)]), polars::prelude::AnyValue::List(polars::series::Series::new("v".into(), vec![polars::prelude::AnyValue::UInt32(0u32), polars::prelude::AnyValue::UInt32(1)])))]
    #[case::map(Key::new("a", DataType::Map), DataValue::Map(data_value::stdhashmap!("a" => 0u64, "b" => "s")), polars::prelude::AnyValue::StructOwned(Box::new((
        vec![polars::prelude::AnyValue::UInt64(0u64), polars::prelude::AnyValue::String("s".into())],
        vec![polars::prelude::Field::new("a".into(), polars::prelude::DataType::UInt64), polars::prelude::Field::new("b".into(), polars::prelude::DataType::String)]))))]
    fn into_polars_value_test(
        #[case] key: Key,
        #[case] input: DataValue,
        #[case] output: polars::prelude::AnyValue<'static>,
    ) {
        // Forward direction: crate value -> polars value.
        assert_eq!(into_polars_value(&key, input.clone()), output);
        // Backward direction: polars value -> crate value.
        assert_eq!(from_polars_value(output), input);
    }
518
    // Applies parsed filter rules to a frame and compares the outcome with the
    // expected frame.
    #[rstest]
    // Compound boolean expression: the expected frame keeps the first two rows.
    #[case(
        DataFrame::new(crate::column_frame! {
            "a" => [1f64, 2f64, 3f64],
            "b" => [4i64, 5i64, 6i64],
            "c" => [7i64, 8i64, 9i64]
        }),
        DataFrame::new(crate::column_frame! {
            "a" => [1f64, 2f64],
            "b" => [4i64, 5i64],
            "c" => [7i64, 8i64]
        }),
        FilterRules::try_from("a >= 1f64 && (b <= 5 || c <= 8) && b >= 4").expect("BUG: cannot create filter rules"),
    )]
    // Modulo expression: the expected frame keeps only the row with `a == 2`.
    #[case(
        DataFrame::new(crate::column_frame! {
            "a" => [1f64, 2f64, 3f64],
            "b" => [4i64, 5i64, 6i64],
            "c" => [7i64, 8i64, 9i64]
        }),
        DataFrame::new(crate::column_frame! {
            "a" => [2f64],
            "b" => [5i64],
            "c" => [8i64]
        }),
        FilterRules::try_from("a % 2f64 == 0f64").expect("BUG: cannot create filter rules"),
    )]
    #[traced_test]
    fn filter_test(
        #[case] df: DataFrame,
        #[case] expected: DataFrame,
        #[case] filter: FilterRules,
    ) {
        let filtered = df.filter(&filter).expect("BUG: cannot filter");
        assert_eq!(filtered, expected);
    }
566
    // Deserializes a hand-written JSON document (six typed columns, two rows,
    // one `null` cell) both on its own and as three copies inside a JSON
    // array, and checks that the array elements match the single frame.
    #[rstest]
    fn test_serde_complex() {
        let simple = r#"
{
    "constants": {},
    "dataframe": {
        "index": {
            "keys": [
                {
                    "key": 3162770485,
                    "name": "a",
                    "ctype": "U32"
                },
                {
                    "key": 2279056742,
                    "name": "b",
                    "ctype": "F64"
                },
                {
                    "key": 2994984227,
                    "name": "c",
                    "ctype": "U64"
                },
                {
                    "key": 3319645144,
                    "name": "d",
                    "ctype": "F64"
                },
                {
                    "key": 1291847470,
                    "name": "e",
                    "ctype": "U32"
                },
                {
                    "key": 874241070,
                    "name": "f",
                    "ctype": "Bool"
                }
            ],
            "indexes": {
                "a": 0,
                "b": 1,
                "c": 2,
                "d": 3,
                "e": 4,
                "f": 5
            },
            "alias": {}
        },
        "data_frame": {
            "v": 1,
            "dim": [
                2,
                6
            ],
            "data": [
                253780,
                0.009369421750307085,
                1633222860381359,
                8,
                5,
                true,
                64512,
                0.003391335718333721,
                1633222860810557,
                8,
                5,
                null
            ]
        }
    },
    "metadata": {}
}
    "#;

        // Single document.
        let simple_deserialized: DataFrame =
            serde_json::from_str(simple).expect("BUG: Unable to deserialize dataframe");

        println!("deserialized: {simple_deserialized:?}");
        // The same document three times inside a JSON array.
        let array = format!("[{}, {}, {}]", simple, simple, simple);
        let deserialized: Vec<DataFrame> =
            serde_json::from_str(&array).expect("BUG: Unable to deserialize dataframe");

        println!("deserialized: {deserialized:?}");
        assert_eq!(deserialized.len(), 3);
        assert_eq!(simple_deserialized, deserialized[0]);
    }
654
    // Every supported input shape must convert into the same two-row frame
    // (the second row has no `key2`), and `select_column("key1")` must then
    // yield the two `key1` values.
    #[rstest]
    // Column-oriented halfbrown map with a shorter `key2` column.
    #[case(hashmap!("key1".into() => vec![1.into(), 2.into()], "key2".into() => vec!["a".into()]))]
    // Column-oriented standard-library map.
    #[case(data_value::stdhashmap!("key1" => vec![1, 2], "key2" => vec!["a"]))]
    // Row-oriented halfbrown maps.
    #[case(vec![hashmap! {
        "key1".into() => 1.into(),
        "key2".into() => "a".into(),
    },
    hashmap! {
        "key1".into() => 2.into(),
    },])]
    // Row-oriented standard-library maps.
    #[case(vec![data_value::stdhashmap! {
        "key1" => DataValue::from(1),
        "key2" => DataValue::from("a"),
    },data_value::stdhashmap! {
        "key1" => DataValue::from(2),
    },])]
    // List of (key, column values) pairs with an explicit `Null`.
    #[case(vec![("key1".into(), vec! [DataValue::from(1), DataValue::from(2)]), ("key2".into(),
        vec![DataValue::from("a"), DataValue::Null])])]
    fn test_select_column<T: Into<DataFrame>>(#[case] input: T) {
        let df: DataFrame = input.into();
        assert_eq!(
            df,
            DataFrame {
                constants: HashMap::new(),
                dataframe: ColumnFrame::from(vec![
                    hashmap! {
                        "key1".into() => 1.into(),
                        "key2".into() => "a".into(),
                    },
                    hashmap! {
                        "key1".into() => 2.into(),
                    },
                ]),
                metadata: HashMap::new(),
            }
        );
        // Despite the local's name, this is a plain single-column selection.
        let selected_transposed = df.select_column("key1".into());
        assert!(selected_transposed.is_some());
        let selected_transposed = selected_transposed.unwrap();
        assert_eq!(selected_transposed.len(), 2);
        assert_eq!(selected_transposed, ndarray::array![1.into(), 2.into()]);
    }
697
    // Every `From` conversion must yield a frame that selects the same data
    // as the reference frame, and a typed transposed selection of `key1` must
    // return one single-element vector per row.
    #[rstest]
    #[case::hhm(hashmap!("key1".into() => vec![1.into(), 2.into()], "key2".into() => vec!["a".into()]))]
    #[case::stdhm(data_value::stdhashmap!("key1" => vec![1, 2], "key2" => vec!["a"]))]
    #[case::hm({
        let hm: std::collections::HashMap<String, Array1<DataValue>> = data_value::stdhashmap!("key1".to_string() => Array1::from_vec(vec![DataValue::from(1), DataValue::from(2)]), "key2".to_string() => Array1::from_vec(vec![DataValue::from("a"), DataValue::Null]));
        hm
    })]
    #[case::vec_hhm(vec![hashmap! {
        "key1".into() => 1.into(),
        "key2".into() => "a".into(),
    },
    hashmap! {
        "key1".into() => 2.into(),
    },])]
    #[case::vec_hme(vec![data_value::stdhashmap! {
        "key1" => DataValue::from(1),
        "key2" => DataValue::from("a"),
    },data_value::stdhashmap! {
        "key1" => DataValue::from(2),
    },])]
    #[case::vec_vec(vec![("key1".into(), vec! [DataValue::from(1), DataValue::from(2)]), ("key2".into(), vec![DataValue::from("a"), DataValue::Null])])]
    fn test_from_conversion<T: Into<DataFrame>>(#[case] input: T) {
        let df: DataFrame = input.into();
        // Reference frame: two rows, the second one without `key2`.
        let expected: DataFrame = DataFrame {
            constants: HashMap::new(),
            dataframe: ColumnFrame::from(vec![
                hashmap! {
                    "key1".into() => 1.into(),
                    "key2".into() => "a".into(),
                },
                hashmap! {
                    "key1".into() => 2.into(),
                },
            ]),
            metadata: HashMap::new(),
        };
        // Compared through `select` with an explicit column order rather than
        // through frame equality.
        assert_eq!(
            df.select(Some(&["key1".into(), "key2".into()])),
            expected.select(Some(&["key1".into(), "key2".into()])),
            "{df} vs {expected}"
        );
        let selected_transposed = df.select_transposed_typed::<i32>(&["key1".into()]);
        assert_eq!(selected_transposed.len(), 2);
        println!("{:?}", selected_transposed);
        assert_eq!(selected_transposed, vec![vec![1], vec![2]]);
    }
    // End-to-end walk through the mutating API: extend, push, constants,
    // apply_function, shrink, remove_column and join.
    #[rstest]
    fn test_dataframe(dummy_candidates: ColumnFrame) {
        // Start empty and load the two fixture rows.
        let mut dataframe: DataFrame = DataFrame::default();
        assert!(dataframe.is_empty());
        assert!(dataframe.extend(dummy_candidates.into()).is_ok());
        assert_eq!(dataframe.len(), 2);

        let candidate = hashmap! {
            "key1".into() => 3.into(),
            "key2".into() => "c".into(),
        };

        // Pushing a third row grows the frame by one.
        assert!(dataframe.push(candidate).is_ok());
        assert_eq!(dataframe.len(), 3);
        assert!(!dataframe.is_empty());

        dataframe.insert_constant("key3".into(), 4.into());
        assert_eq!(dataframe.constants.len(), 1);
        // Derive `key5` as `key1 + 1` through the callback API.
        assert!(dataframe
            .apply_function(&["key1".into()], |keys, df| {
                let key = keys[0].clone();
                let s = df
                    .get_single_column(&key)
                    .expect("BUG: Cannot get column")
                    .to_owned();
                let s = s.mapv(|x| x + DataValue::from(1));
                df.add_single_column("key5", s)?;
                Ok(())
            })
            .is_ok());
        // Snapshot taken before columns are removed; used as the reference
        // for the final comparison.
        let original = dataframe.clone();
        dataframe.shrink();
        let remove_df = dataframe.remove_column(&["key1".into()]);
        assert!(remove_df.is_ok());
        let mut remove_df = remove_df.unwrap();
        assert_eq!(remove_df.len(), 3);
        let selected = dataframe.select(Some(&["key2".into()]));
        assert!(selected.is_ok());
        let selected = selected.unwrap();
        println!("{:?}", selected);

        // Joining the remaining frame back onto the removed part must
        // reproduce the snapshot's data for the three columns.
        let joined_result =
            remove_df.join(dataframe, &JoinRelation::new(crate::JoinBy::AddColumns));
        assert!(joined_result.is_ok(), "{:?}", joined_result);
        let keys = vec!["key1".into(), "key2".into(), "key5".into()];
        assert_eq!(
            original.select(Some(keys.as_slice())),
            remove_df.select(Some(keys.as_slice()))
        );
    }
795
796 #[rstest]
797 fn test_size_methods() {
798 let candidate = hashmap! {
799 "key1".into() => 3.into(),
800 "key2".into() => "c".into(),
801 "key3".into() => false.into()
802 };
803
804 let dataframe: DataFrame = vec![candidate].into();
805
806 assert_eq!(dataframe.n_columns(), 3);
807 assert_eq!(dataframe.n_rows(), 1);
808 }
809
810 #[rstest]
811 fn test_metadata(dummy_candidates: ColumnFrame) {
812 let mut dataframe: DataFrame = DataFrame::default();
813 assert!(dataframe.is_empty());
814 println!("{:?}", dataframe);
815 assert!(dataframe.extend(dummy_candidates.into()).is_ok());
816 println!("{:?}", dataframe);
817 assert_eq!(dataframe.len(), 2);
818
819 dataframe.add_metadata("test".into(), 1.into());
820 assert_eq!(dataframe.get_metadata("test"), Some(&1.into()));
821 let dataframe = DataFrame::new(ColumnFrame::from(vec![
822 hashmap! {
823 "key1".into() => 1.into(),
824 "key2".into() => "a".into(),
825 },
826 hashmap! {
827 "key1".into() => 2.into(),
828 "key2".into() => "b".into(),
829 },
830 ]));
831 assert_eq!(dataframe.get_metadata("test"), None);
832 let tt = dataframe.select_transposed(None);
833 assert!(tt.is_ok());
834 let tt = tt.unwrap();
835 assert_eq!(tt.shape(), [2, 2]);
836 assert_eq!(
837 tt,
838 Array2::from_shape_vec((2, 2), vec![1.into(), 2.into(), "a".into(), "b".into()])
839 .unwrap()
840 );
841 }
842
843 #[rstest]
844 #[traced_test]
845 fn add_single_column_test() {
846 let mut dataframe = DataFrame::default();
847 let values = Array1::from(vec![1.into(), 2.into(), 3.into()]);
848 let r = dataframe.add_single_column("key1", values);
849 assert!(r.is_ok(), "{r:?}");
850 let selected = dataframe.select(None);
851 assert!(selected.is_ok());
852 let selected = selected.unwrap();
853 assert_eq!(selected.shape(), [3, 1]);
854 assert_eq!(
855 selected,
856 Array2::from_shape_vec((3, 1), vec![1.into(), 2.into(), 3.into()]).unwrap()
857 );
858 let values = Array1::from(vec![1.into(), 2.into()]);
859 assert!(dataframe.add_single_column("key1", values).is_err());
860 let values = Array1::from(vec![3.into(), 4.into(), 5.into()]);
861 assert!(dataframe.add_single_column("key2", values).is_ok());
862 let values = Array1::from(vec![3.into()]);
863 assert!(dataframe.add_single_column("key3", values).is_err());
864 }
865
866 #[rstest]
867 #[traced_test]
868 fn add_single_column_empty_test() {
869 let mut dataframe = DataFrame::default();
870 let values = Array1::from(vec![]);
871 let r = dataframe.add_single_column("key1", values);
872 assert!(r.is_ok(), "{r:?}");
873 let selected = dataframe.select(None);
874 assert!(selected.is_ok());
875 let selected = selected.unwrap();
876 assert_eq!(selected.shape(), [0, 1]);
877 assert_eq!(selected, Array2::from_shape_vec((0, 1), vec![]).unwrap());
878 let values = Array1::from(vec![1.into(), 2.into()]);
879 assert!(dataframe.add_single_column("key1", values).is_err());
880 let values = Array1::from(vec![3.into(), 4.into(), 5.into()]);
881 assert!(dataframe.add_single_column("key2", values).is_ok());
882 let values = Array1::from(vec![3.into(), 4.into()]);
883 assert!(dataframe.add_single_column("key3", values).is_err());
884 let values = Array1::from(vec![3.into(), 4.into(), 5.into()]);
885 assert!(dataframe.add_single_column("key3", values).is_ok());
886
887 assert_eq!(
888 dataframe
889 .select_column("key1".into())
890 .expect("BUG: has to exists"),
891 ndarray::arr1(&[DataValue::Null, DataValue::Null, DataValue::Null]),
892 );
893 assert_eq!(
894 dataframe
895 .select_column("key2".into())
896 .expect("BUG: has to exists"),
897 ndarray::arr1(&[3.into(), 4.into(), 5.into()]),
898 );
899 assert_eq!(
900 dataframe.select(None).expect("BUG: cannot get data"),
901 ndarray::arr2(&[
902 [DataValue::Null, 3.into(), 3.into()],
903 [DataValue::Null, 4.into(), 4.into()],
904 [DataValue::Null, 5.into(), 5.into()],
905 ])
906 );
907 }
908
    // Multi-column selection: the result follows the requested column order,
    // and cells without a value (missing in a row, or an unknown column) are
    // expected to be `Null`.
    #[rstest]
    // Two existing columns in storage order.
    #[case(
        DataFrame::new(ColumnFrame::from(vec![
            hashmap! {
                "k".into() => 1.into(),
                "k2".into() => 2.into(),
                "k3".into() => 2.2.into(),
            },
            hashmap! {
                "k".into() => 11.into(),
                "k2".into() => 3.into(),
            },
            hashmap! {
                "k".into() => 4.into(),
                "k2".into() => 5.into(),
                "k3".into() => 2.3.into(),
            },
            hashmap! {
                "k".into() => 4.into(),
                "k2".into() => 5.into(),
                "k3".into() => 2.4.into(),
            },
        ])),
        vec!["k".into(), "k2".into()],
        Array2::from_shape_vec((4, 2), vec![1.into(), 2.into(), 11.into(), 3.into(), 4.into(), 5.into(), 4.into(), 5.into()]).unwrap()
    )]
    // Reordered columns mixed with two keys that do not exist in the frame.
    #[case(
        DataFrame::new(ColumnFrame::from(vec![
            hashmap! {
                "k".into() => 1.into(),
                "k2".into() => 2.into(),
                "k3".into() => 2.2.into(),
            },
            hashmap! {
                "k".into() => 11.into(),
                "k2".into() => 3.into(),
            },
            hashmap! {
                "k".into() => 4.into(),
                "k2".into() => 5.into(),
                "k3".into() => 2.3.into(),
            },
            hashmap! {
                "k".into() => 4.into(),
                "k2".into() => 5.into(),
                "k3".into() => 2.4.into(),
            },
        ])),
        vec!["k2".into(), "k3".into(), "nonexist1".into(), "nonexists2".into(), "k".into()],
        Array2::from_shape_vec((4, 5), vec![
            2.into(), 2.2.into(), DataValue::Null, DataValue::Null, 1.into(),
            3.into(), DataValue::Null, DataValue::Null, DataValue::Null, 11.into(),
            5.into(), 2.3.into(), DataValue::Null, DataValue::Null, 4.into(),
            5.into(), 2.4.into(), DataValue::Null, DataValue::Null, 4.into()]).unwrap()
    )]
    #[traced_test]
    fn select_multiple(
        #[case] input: DataFrame,
        #[case] columns: Vec<Key>,
        #[case] expected: Array2<DataValue>,
    ) {
        let selected = input.select(Some(&columns));
        assert!(selected.is_ok());
        let selected = selected.unwrap();

        assert_eq!(selected, expected);
    }
976
977 #[rstest]
978 #[case(
979 DataFrame::new(ColumnFrame::from(vec![
980 hashmap! {
981 "k".into() => 1.into(),
982 "k2".into() => 2.into(),
983 "k3".into() => 2.2.into(),
984 },
985 hashmap! {
986 "k".into() => 11.into(),
987 "k2".into() => 3.into(),
988 },
989 hashmap! {
990 "k".into() => 4.into(),
991 "k2".into() => 5.into(),
992 "k3".into() => 2.3.into(),
993 },
994 hashmap! {
995 "k".into() => 4.into(),
996 "k2".into() => 5.into(),
997 "k3".into() => 2.4.into(),
998 },
999 ])),
1000 "k".into(),
1001 Array2::from_shape_vec((4, 3), vec![
1002 1.into(), 2.into(), 2.2.into(),
1003 4.into(), 5.into(), 2.3.into(),
1004 4.into(), 5.into(), 2.4.into(),
1005 11.into(), 3.into(), DataValue::Null,
1006 ]
1007 ).unwrap(),
1008 vec!["k".into(), "k2".into(), "k3".into()],
1009 )]
1010 #[rstest]
1011 #[case(
1012 DataFrame::new(ColumnFrame::from(vec![
1013 hashmap! {
1014 "k".into() => 1.into(),
1015 "k2".into() => 2.into(),
1016 "k3".into() => 2.2.into(),
1017 },
1018 hashmap! {
1019 "k".into() => 11.into(),
1020 "k2".into() => 3.into(),
1021 },
1022 hashmap! {
1023 "k".into() => 4.into(),
1024 "k2".into() => 5.into(),
1025 "k3".into() => 2.3.into(),
1026 },
1027 hashmap! {
1028 "k".into() => 4.into(),
1029 "k2".into() => 5.into(),
1030 "k3".into() => 2.4.into(),
1031 },
1032 ])),
1033 "k3".into(),
1034 Array2::from_shape_vec((4, 3), vec![
1035 11.into(), 3.into(), DataValue::Null,
1036 1.into(), 2.into(), 2.2.into(),
1037 4.into(), 5.into(), 2.3.into(),
1038 4.into(), 5.into(), 2.4.into(),
1039 ]
1040 ).unwrap(),
1041 vec!["k".into(), "k2".into(), "k3".into()],
1042 )]
1043 #[case(
1044 DataFrame::new(ColumnFrame::from(vec![
1045 hashmap! {
1046 "k".into() => 2.into(),
1047 "k2".into() => 0.000001.into(),
1048 },
1049 hashmap! {
1050 "k".into() => 1.into(),
1051 "k2".into() =>0.0000001.into(),
1052 },
1053 hashmap! {
1054 "k".into() => 3.into(),
1055 "k2".into() => 0.00001.into(),
1056 },
1057 hashmap! {
1058 "k".into() => 4.into(),
1059 "k2".into() => 0.001.into(),
1060 },
1061 ])),
1062 "k2".into(),
1063 Array2::from_shape_vec((4, 2), vec![
1064 1.into(), 0.0000001.into(),
1065 2.into(), 0.000001.into(),
1066 3.into(), 0.00001.into(),
1067 4.into(), 0.001.into(),
1068 ]
1069 ).unwrap(),
1070 vec!["k".into(), "k2".into()],
1071 )]
1072 #[case(
1073 DataFrame::new(ColumnFrame::from(vec![
1074 hashmap! {
1075 "k".into() => 2.into(),
1076 "k2".into() => "b".into(),
1077 },
1078 hashmap! {
1079 "k".into() => 1.into(),
1080 "k2".into() =>"a".into(),
1081 },
1082 hashmap! {
1083 "k".into() => 3.into(),
1084 "k2".into() =>"c".into(),
1085 },
1086 hashmap! {
1087 "k".into() => 4.into(),
1088 "k2".into() =>"z".into(),
1089 },
1090 ])),
1091 "k2".into(),
1092 Array2::from_shape_vec((4, 2), vec![
1093 1.into(),"a".into(),
1094 2.into(), "b".into(),
1095 3.into(), "c".into(),
1096 4.into(), "z".into(),
1097 ]
1098 ).unwrap(),
1099 vec!["k".into(), "k2".into()],
1100 )]
1101 #[traced_test]
1102 fn sort_by(
1103 #[case] input: DataFrame,
1104 #[case] column: Key,
1105 #[case] expected: Array2<DataValue>,
1106 #[case] columns: Vec<Key>,
1107 ) {
1108 let result = input.sorted(&column);
1109 assert!(result.is_ok(), "{result:?}");
1110 let result = result.unwrap().get_sorted();
1111 let selected = result.select(Some(&columns));
1112
1113 assert_eq!(selected, expected);
1114 }
    // Top-N selection on a sorted frame: each case gives the input frame, the
    // sort column, the `TopN` request, the expected rows and the columns to
    // select.
    #[rstest]
    // `Last(1)` on float column `k2`: the row with the largest value.
    #[case(
        DataFrame::new(ColumnFrame::from(vec![
            hashmap! {
                "k".into() => 2.into(),
                "k2".into() => 0.000001.into(),
            },
            hashmap! {
                "k".into() => 1.into(),
                "k2".into() =>0.0000001.into(),
            },
            hashmap! {
                "k".into() => 3.into(),
                "k2".into() => 0.00001.into(),
            },
            hashmap! {
                "k".into() => 4.into(),
                "k2".into() => 0.001.into(),
            },
        ])),
        "k2".into(),
        TopN::Last(1),
        Array2::from_shape_vec((1, 2), vec![
            4.into(), 0.001.into(),
            ]
        ).unwrap(),
        vec!["k".into(), "k2".into()],
    )]
    // `Last(2)`: the two largest values, expected largest first.
    #[case(
        DataFrame::new(ColumnFrame::from(vec![
            hashmap! {
                "k".into() => 2.into(),
                "k2".into() => 0.000001.into(),
            },
            hashmap! {
                "k".into() => 1.into(),
                "k2".into() =>0.0000001.into(),
            },
            hashmap! {
                "k".into() => 3.into(),
                "k2".into() => 0.00001.into(),
            },
            hashmap! {
                "k".into() => 4.into(),
                "k2".into() => 0.001.into(),
            },
        ])),
        "k2".into(),
        TopN::Last(2),
        Array2::from_shape_vec((2, 2), vec![
            4.into(), 0.001.into(),
            3.into(), 0.00001.into(),
            ]
        ).unwrap(),
        vec!["k".into(), "k2".into()],
    )]
    // `First(1)` on string column `k2`: the row with "a".
    #[case(
        DataFrame::new(ColumnFrame::from(vec![
            hashmap! {
                "k".into() => 2.into(),
                "k2".into() => "b".into(),
            },
            hashmap! {
                "k".into() => 1.into(),
                "k2".into() =>"a".into(),
            },
            hashmap! {
                "k".into() => 3.into(),
                "k2".into() =>"c".into(),
            },
            hashmap! {
                "k".into() => 4.into(),
                "k2".into() =>"z".into(),
            },
        ])),
        "k2".into(),
        TopN::First(1),
        Array2::from_shape_vec((1, 2), vec![
            1.into(),"a".into(),
            ]
        ).unwrap(),
        vec!["k".into(), "k2".into()],
    )]
    // `First(2)`: the rows with "a" and "b", in that order.
    #[case(
        DataFrame::new(ColumnFrame::from(vec![
            hashmap! {
                "k".into() => 2.into(),
                "k2".into() => "b".into(),
            },
            hashmap! {
                "k".into() => 1.into(),
                "k2".into() =>"a".into(),
            },
            hashmap! {
                "k".into() => 3.into(),
                "k2".into() =>"c".into(),
            },
            hashmap! {
                "k".into() => 4.into(),
                "k2".into() =>"z".into(),
            },
        ])),
        "k2".into(),
        TopN::First(2),
        Array2::from_shape_vec((2, 2), vec![
            1.into(),"a".into(),
            2.into(),"b".into(),
            ]
        ).unwrap(),
        vec!["k".into(), "k2".into()],
    )]
    #[traced_test]
    fn top_n(
        #[case] input: DataFrame,
        #[case] column: Key,
        #[case] topn: TopN,
        #[case] expected: Array2<DataValue>,
        #[case] columns: Vec<Key>,
    ) {
        let result = input.sorted(&column);
        assert!(result.is_ok(), "{result:?}");
        let result = result.unwrap();
        // `topn` is applied to the sorted view before selecting columns.
        let first = result.topn(topn).unwrap();
        let selected = first.select(Some(&columns));
        assert_eq!(selected, expected);
    }
1241
1242 #[rstest]
1243 fn test_messagepack_roundtrip_empty_dataframe() {
1244 let df = DataFrame::default();
1245
1246 let bytes = df
1247 .store_into_messagepack()
1248 .expect("failed to serialize empty df");
1249 let restored =
1250 DataFrame::load_from_messagepack(&bytes).expect("failed to deserialize empty df");
1251 assert_eq!(df, restored);
1252 assert!(restored.is_empty());
1253 }
1254
1255 #[rstest]
1256 fn test_messagepack_roundtrip_strings_and_bools() {
1257 let df = DataFrame::new(ColumnFrame::from(vec![
1259 hashmap! {
1260 "str".into() => DataValue::String("hello".into()),
1261 "bool".into() => DataValue::Bool(true),
1262 },
1263 hashmap! {
1264 "str".into() => DataValue::String("".into()),
1265 "bool".into() => DataValue::Bool(false),
1266 },
1267 ]));
1268
1269 let bytes = df.store_into_messagepack().expect("failed to serialize");
1270 let restored = DataFrame::load_from_messagepack(&bytes).expect("failed to deserialize");
1271 assert_eq!(df, restored);
1272 }
1273
1274 #[rstest]
1275 fn test_messagepack_roundtrip_f64_values() {
1276 let df = DataFrame::new(ColumnFrame::from(vec![
1277 hashmap! {
1278 "a".into() => DataValue::F64(3.14),
1279 },
1280 hashmap! {
1281 "a".into() => DataValue::F64(-2.718),
1282 },
1283 ]));
1284
1285 let bytes = df.store_into_messagepack().expect("failed to serialize");
1286 let restored = DataFrame::load_from_messagepack(&bytes).expect("failed to deserialize");
1287 assert_eq!(df, restored);
1288 }
1289
1290 #[rstest]
1291 fn test_messagepack_f64_special_values_survive_roundtrip() {
1292 let df = DataFrame::new(ColumnFrame::from(vec![hashmap! {
1295 "a".into() => DataValue::F64(f64::INFINITY),
1296 }]));
1297
1298 let bytes = df.store_into_messagepack().expect("failed to serialize");
1299 let restored = DataFrame::load_from_messagepack(&bytes).expect("failed to deserialize");
1300 assert_eq!(restored.len(), 1);
1301 let col = restored.select_column("a".into()).expect("col exists");
1302 match &col[0] {
1303 DataValue::F64(v) => assert!(v.is_infinite() && v.is_sign_positive()),
1304 other => panic!("expected F64, got {other:?}"),
1305 }
1306 }
1307
1308 #[rstest]
1309 fn test_messagepack_roundtrip_with_nulls() {
1310 let df = DataFrame::new(ColumnFrame::from(vec![
1311 hashmap! {
1312 "a".into() => DataValue::String("x".into()),
1313 "b".into() => DataValue::String("y".into()),
1314 },
1315 hashmap! {
1316 "a".into() => DataValue::String("z".into()),
1317 },
1319 ]));
1320
1321 let bytes = df.store_into_messagepack().expect("failed to serialize");
1322 let restored = DataFrame::load_from_messagepack(&bytes).expect("failed to deserialize");
1323 assert_eq!(df, restored);
1324 }
1325
1326 #[rstest]
1327 fn test_messagepack_roundtrip_with_metadata() {
1328 let mut df = DataFrame::new(crate::column_frame! {
1329 "col" => ["a", "b"]
1330 });
1331 df.add_metadata("name".into(), DataValue::String("test_df".into()));
1332 df.add_metadata("flag".into(), DataValue::Bool(true));
1333
1334 let bytes = df.store_into_messagepack().expect("failed to serialize");
1335 let restored = DataFrame::load_from_messagepack(&bytes).expect("failed to deserialize");
1336 assert_eq!(df, restored);
1337 assert_eq!(
1338 restored.get_metadata("name"),
1339 Some(&DataValue::String("test_df".into()))
1340 );
1341 assert_eq!(restored.get_metadata("flag"), Some(&DataValue::Bool(true)));
1342 }
1343
1344 #[rstest]
1345 fn test_messagepack_roundtrip_with_constants() {
1346 let mut df = DataFrame::new(crate::column_frame! {
1347 "x" => ["a", "b"]
1348 });
1349 df.insert_constant("const_key".into(), DataValue::String("const_val".into()));
1350 df.insert_constant("const_flag".into(), DataValue::Bool(false));
1351
1352 let bytes = df.store_into_messagepack().expect("failed to serialize");
1353 let restored = DataFrame::load_from_messagepack(&bytes).expect("failed to deserialize");
1354 assert_eq!(df, restored);
1355 assert_eq!(
1356 restored.constants.get(&"const_key".into()),
1357 Some(&DataValue::String("const_val".into()))
1358 );
1359 }
1360
1361 #[rstest]
1362 fn test_messagepack_integer_type_coercion() {
1363 let df = crate::df! {
1366 "a" => [1i64, 2i64, 3i64]
1367 };
1368
1369 let bytes = df.store_into_messagepack().expect("failed to serialize");
1370 let restored = DataFrame::load_from_messagepack(&bytes).expect("failed to deserialize");
1371
1372 assert_eq!(restored.len(), 3);
1374
1375 let col = restored
1377 .select_column("a".into())
1378 .expect("column should exist");
1379 assert_ne!(
1381 col[0],
1382 DataValue::I64(1),
1383 "messagepack coerces small ints to compact types"
1384 );
1385 }
1386
1387 #[rstest]
1388 fn test_messagepack_large_i64_preserved() {
1389 let df = DataFrame::new(ColumnFrame::from(vec![hashmap! {
1391 "big".into() => DataValue::I64(i64::MIN),
1392 }]));
1393
1394 let bytes = df.store_into_messagepack().expect("failed to serialize");
1395 let restored = DataFrame::load_from_messagepack(&bytes).expect("failed to deserialize");
1396 assert_eq!(df, restored);
1397 }
1398
1399 #[rstest]
1400 fn test_messagepack_load_invalid_bytes() {
1401 let result = DataFrame::load_from_messagepack(&[0xFF, 0xFE, 0xFD, 0x00]);
1402 assert!(result.is_err());
1403 }
1404
1405 #[rstest]
1406 fn test_messagepack_load_empty_bytes() {
1407 let result = DataFrame::load_from_messagepack(&[]);
1408 assert!(result.is_err());
1409 }
1410
1411 #[rstest]
1412 fn test_messagepack_load_truncated_bytes() {
1413 let df = DataFrame::new(ColumnFrame::from(vec![
1414 hashmap! {
1415 "a".into() => DataValue::String("hello world".into()),
1416 "b".into() => DataValue::Bool(true),
1417 },
1418 hashmap! {
1419 "a".into() => DataValue::String("test".into()),
1420 "b".into() => DataValue::Bool(false),
1421 },
1422 ]));
1423 let bytes = df.store_into_messagepack().expect("failed to serialize");
1424 let truncated = &bytes[..bytes.len() / 2];
1426 let result = DataFrame::load_from_messagepack(truncated);
1427 assert!(result.is_err());
1428 }
1429
1430 #[rstest]
1431 fn test_messagepack_roundtrip_with_nested_vec_data() {
1432 let df = DataFrame::new(ColumnFrame::from(vec![hashmap! {
1433 "vec_col".into() => DataValue::Vec(vec![
1434 DataValue::String("a".into()),
1435 DataValue::String("b".into()),
1436 ]),
1437 "bytes_col".into() => DataValue::Bytes(vec![0, 1, 255]),
1438 }]));
1439
1440 let bytes = df.store_into_messagepack().expect("failed to serialize");
1441 let restored = DataFrame::load_from_messagepack(&bytes).expect("failed to deserialize");
1442 assert_eq!(df, restored);
1443 }
1444
1445 #[rstest]
1446 fn test_messagepack_roundtrip_preserves_row_count() {
1447 let df = DataFrame::new(ColumnFrame::from(vec![
1448 hashmap! { "a".into() => DataValue::String("x".into()) },
1449 hashmap! { "a".into() => DataValue::String("y".into()) },
1450 hashmap! { "a".into() => DataValue::String("z".into()) },
1451 ]));
1452
1453 let bytes = df.store_into_messagepack().expect("failed to serialize");
1454 let restored = DataFrame::load_from_messagepack(&bytes).expect("failed to deserialize");
1455 assert_eq!(restored.len(), 3);
1456 assert_eq!(restored.n_rows(), 3);
1457 assert_eq!(restored.n_columns(), 1);
1458 }
1459
1460 #[rstest]
1461 fn test_messagepack_idempotent_double_roundtrip() {
1462 let mut df = DataFrame::new(ColumnFrame::from(vec![
1464 hashmap! {
1465 "a".into() => DataValue::String("hello".into()),
1466 "b".into() => DataValue::Bool(true),
1467 },
1468 hashmap! {
1469 "a".into() => DataValue::String("world".into()),
1470 "b".into() => DataValue::Bool(false),
1471 },
1472 ]));
1473 df.add_metadata("meta".into(), DataValue::Bool(true));
1474 df.insert_constant("c".into(), DataValue::String("const".into()));
1475
1476 let bytes1 = df.store_into_messagepack().expect("first serialize");
1477 let restored1 = DataFrame::load_from_messagepack(&bytes1).expect("first deserialize");
1478 let bytes2 = restored1
1479 .store_into_messagepack()
1480 .expect("second serialize");
1481 let restored2 = DataFrame::load_from_messagepack(&bytes2).expect("second deserialize");
1482
1483 assert_eq!(df, restored2);
1484 assert_eq!(bytes1, bytes2);
1485 }
1486
1487 #[rstest]
1488 fn test_messagepack_single_byte_payload() {
1489 let result = DataFrame::load_from_messagepack(&[0x01]);
1491 assert!(result.is_err());
1492 }
1493
1494 #[rstest]
1497 fn test_hash_datavalue_public_api_accessible() {
1498 let val = DataValue::I32(42);
1500 let h = crate::hash_datavalue(&val);
1501 assert_eq!(h, crate::hash_datavalue(&DataValue::I32(42)));
1503 }
1504
1505 #[rstest]
1506 fn test_hash_datavalue_vec_length_matters() {
1507 let short = DataValue::Vec(vec![DataValue::I32(1)]);
1509 let long = DataValue::Vec(vec![DataValue::I32(1), DataValue::Null]);
1510 assert_ne!(crate::hash_datavalue(&short), crate::hash_datavalue(&long));
1511 }
1512
1513 #[rstest]
1514 fn test_hash_datavalue_map_different_keys_same_values() {
1515 let mut m1 = std::collections::HashMap::new();
1516 m1.insert("a".into(), DataValue::I32(1));
1517 let mut m2 = std::collections::HashMap::new();
1518 m2.insert("b".into(), DataValue::I32(1));
1519
1520 assert_ne!(
1521 crate::hash_datavalue(&DataValue::Map(m1)),
1522 crate::hash_datavalue(&DataValue::Map(m2))
1523 );
1524 }
1525
1526 #[rstest]
1527 fn test_hash_datavalue_empty_string_vs_empty_bytes() {
1528 let empty_str = DataValue::String("".into());
1529 let empty_bytes = DataValue::Bytes(vec![]);
1530 assert_ne!(
1531 crate::hash_datavalue(&empty_str),
1532 crate::hash_datavalue(&empty_bytes)
1533 );
1534 }
1535
1536 #[rstest]
1537 fn test_hash_datavalue_empty_vec_vs_empty_map() {
1538 let empty_vec = DataValue::Vec(vec![]);
1539 let empty_map = DataValue::Map(std::collections::HashMap::new());
1540 assert_ne!(
1541 crate::hash_datavalue(&empty_vec),
1542 crate::hash_datavalue(&empty_map)
1543 );
1544 }
1545
1546 #[rstest]
1547 fn test_hash_datavalue_i128_boundary_values() {
1548 let max = DataValue::I128(i128::MAX);
1549 let min = DataValue::I128(i128::MIN);
1550 let zero = DataValue::I128(0);
1551 let neg_one = DataValue::I128(-1);
1552
1553 let hashes: std::collections::HashSet<u64> = [&max, &min, &zero, &neg_one]
1555 .iter()
1556 .map(|v| crate::hash_datavalue(v))
1557 .collect();
1558 assert_eq!(hashes.len(), 4);
1559 }
1560
1561 #[rstest]
1562 fn test_hash_datavalue_u128_boundary_values() {
1563 let max = DataValue::U128(u128::MAX);
1564 let zero = DataValue::U128(0);
1565 let one = DataValue::U128(1);
1566 let i128_neg1 = DataValue::I128(-1);
1568
1569 assert_ne!(
1570 crate::hash_datavalue(&max),
1571 crate::hash_datavalue(&i128_neg1)
1572 );
1573 let hashes: std::collections::HashSet<u64> = [&max, &zero, &one]
1574 .iter()
1575 .map(|v| crate::hash_datavalue(v))
1576 .collect();
1577 assert_eq!(hashes.len(), 3);
1578 }
1579
1580 #[rstest]
1581 fn test_hash_datavalue_f64_special_values() {
1582 let nan1 = DataValue::F64(f64::NAN);
1584 let nan2 = DataValue::F64(f64::NAN);
1585 assert_eq!(crate::hash_datavalue(&nan1), crate::hash_datavalue(&nan2));
1586
1587 let subnormal = DataValue::F64(f64::MIN_POSITIVE / 2.0);
1589 let normal = DataValue::F64(f64::MIN_POSITIVE);
1590 assert_ne!(
1591 crate::hash_datavalue(&subnormal),
1592 crate::hash_datavalue(&normal)
1593 );
1594 }
1595
1596 #[rstest]
1597 fn test_hash_datavalue_enum_number_vs_i32_same_value() {
1598 let enum_val = DataValue::EnumNumber(42);
1600 let i32_val = DataValue::I32(42);
1601 assert_ne!(
1602 crate::hash_datavalue(&enum_val),
1603 crate::hash_datavalue(&i32_val)
1604 );
1605 }
1606}