1use column_store::sorted_df::SortedDataFrame;
2use data_value::{DataValue, Extract};
3use halfbrown::HashMap;
4use ndarray::{Array1, Array2, ArrayView1};
5use std::fmt;
6pub mod column_store;
7pub mod index;
8pub mod join;
9pub mod key;
10use crate::{error::Error, CandidateData};
11#[cfg(feature = "python")]
12pub mod python;
13
14#[cfg(feature = "python")]
15use pyo3::prelude::*;
16
17use crate::{
18 dataframe::{column_store::ColumnFrame, join::JoinRelation, key::Key},
19 MLChefMap,
20};
21
/// Chooses which end of a sorted frame a fixed number of rows is taken from.
///
/// The tests in this file pass it to `topn` on the result of
/// `DataFrame::sorted`: `First(2)` yields the two smallest values in ascending
/// order, while `Last(2)` yields the two largest values, largest first.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum TopN {
    /// Take this many rows from the start of the sort order.
    First(usize),
    /// Take this many rows from the end of the sort order.
    Last(usize),
}
27
/// Tabular data (`ColumnFrame`) bundled with frame-wide constants and
/// free-form metadata.
///
/// The type is serde-serializable; the field order below is the order in which
/// the derived serializer emits the fields, so keep it stable.
#[derive(Debug, Clone, PartialEq, Eq, Default, serde::Serialize, serde::Deserialize)]
#[cfg_attr(feature = "python", pyclass)]
pub struct DataFrame {
    /// Values keyed by `Key` that are kept next to, not inside, the table.
    pub constants: HashMap<Key, DataValue>,
    /// The table itself; almost every method of `DataFrame` delegates to it.
    pub dataframe: ColumnFrame,
    /// Free-form annotations keyed by name (see `add_metadata` / `get_metadata`).
    pub metadata: HashMap<String, DataValue>,
}
64
/// Formats the frame by delegating to the inner `ColumnFrame`'s `fmt`.
/// `constants` and `metadata` are not part of the rendered output.
impl fmt::Display for DataFrame {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        self.dataframe.fmt(f)
    }
}
70
71impl DataFrame {
72 pub fn new<C: Into<ColumnFrame>>(dataframe: C) -> Self {
73 Self {
74 constants: HashMap::new(),
75 dataframe: dataframe.into(),
76 metadata: HashMap::new(),
77 }
78 }
79
80 pub fn n_columns(&self) -> usize {
82 self.dataframe.data_frame.ncols()
83 }
84
85 pub fn n_rows(&self) -> usize {
87 self.dataframe.data_frame.nrows()
88 }
89
90 pub fn shrink(&mut self) {
91 self.dataframe.shrink();
92 }
93
94 pub fn add_metadata(&mut self, key: String, value: DataValue) {
95 self.metadata.insert(key, value);
96 }
97
98 pub fn get_metadata(&self, key: &str) -> Option<&DataValue> {
99 self.metadata.get(key)
100 }
101
102 pub fn join(&mut self, other: Self, join_type: &JoinRelation) -> Result<(), Error> {
103 for (key, value) in other.constants {
104 self.constants.insert(key, value);
105 }
106 self.dataframe.join(other.dataframe, join_type)
107 }
108
109 pub fn apply_function<F>(&mut self, keys: &[Key], mut func: F) -> Result<(), Error>
110 where
111 F: FnMut(&[Key], &mut ColumnFrame) -> Result<(), Error>,
112 {
113 self.dataframe.apply_function(keys, &mut func)
114 }
115
116 pub fn select(&self, keys: Option<&[Key]>) -> Result<Array2<DataValue>, Error> {
117 Ok(self.dataframe.select(keys))
118 }
119
120 pub fn select_transposed_typed<D: Extract>(&self, keys: &[Key]) -> Vec<Vec<D>> {
125 self.dataframe.select_transposed_typed::<D>(keys)
126 }
127
128 pub fn select_column(&self, key: Key) -> Option<ndarray::ArrayView1<'_, DataValue>> {
129 self.dataframe.select_column(&key)
130 }
131
132 pub fn select_transposed(&self, keys: Option<&[Key]>) -> Result<Array2<DataValue>, Error> {
133 self.dataframe.select_transposed(keys)
134 }
135
136 pub fn insert_constant(&mut self, key: Key, value: DataValue) {
137 self.constants.insert(key, value);
138 }
139
140 pub fn push<C: CandidateData>(&mut self, item: C) -> Result<(), Error> {
141 self.dataframe.push(item)
142 }
143
144 pub fn remove_column(&mut self, keys: &[Key]) -> Result<Self, Error> {
145 self.dataframe.remove_column(keys).map(|x| x.into())
146 }
147
148 pub fn extend(&mut self, items: Self) -> Result<(), Error> {
149 self.dataframe.extend(items.dataframe)
150 }
151
152 pub fn len(&self) -> usize {
153 self.dataframe.len()
154 }
155
156 pub fn is_empty(&self) -> bool {
157 self.dataframe.is_empty()
158 }
159
160 pub fn add_single_column<K: Into<Key>>(
161 &mut self,
162 key: K,
163 values: Array1<DataValue>,
164 ) -> Result<(), Error> {
165 self.dataframe.add_single_column(key, values)
166 }
167
168 pub fn get_single_column(&self, key: &Key) -> Option<ArrayView1<'_, DataValue>> {
169 self.dataframe.get_single_column(key)
170 }
171
172 pub fn sorted(&self, key: &Key) -> Result<SortedDataFrame<'_>, Error> {
173 self.dataframe.sorted(key)
174 }
175
176 pub fn filter(&self, filter: &crate::filter::FilterRules) -> Result<Self, Error> {
177 let filtered_df = self.dataframe.filter(filter)?;
178 Ok(Self {
179 constants: self.constants.clone(),
180 dataframe: filtered_df,
181 metadata: self.metadata.clone(),
182 })
183 }
184
185 #[cfg(feature = "polars-df")]
186 pub fn as_polars(&self) -> Result<polars::prelude::DataFrame, Error> {
187 let mut columns = vec![];
188 for key in self.dataframe.keys() {
189 let values = self
190 .dataframe
191 .get_single_column(key)
192 .ok_or_else(|| Error::NotFound(key.clone()))?
193 .into_iter()
194 .map(|x| into_polars_value(key, x.clone()))
195 .collect::<Vec<_>>();
196 let s = polars::prelude::Column::new(key.name().into(), values);
197
198 columns.push(s);
199 }
200
201 Ok(polars::prelude::DataFrame::new(columns)?)
202 }
203}
/// Maps a crate `DataType` onto the polars `DataType` used for it.
///
/// `Unknown` becomes polars `Null`, `Vec` becomes a `List` with an unknown
/// inner type, and `Map` becomes a `Struct` without fields.
#[cfg(feature = "polars-df")]
pub fn polars_dtype(dtype: crate::DataType) -> polars::prelude::DataType {
    use crate::DataType as Src;
    use polars::prelude::DataType as Dst;

    match dtype {
        Src::Bool => Dst::Boolean,
        Src::U32 => Dst::UInt32,
        Src::I32 => Dst::Int32,
        Src::U8 => Dst::UInt8,
        Src::U64 => Dst::UInt64,
        Src::I64 => Dst::Int64,
        Src::F32 => Dst::Float32,
        Src::F64 => Dst::Float64,
        Src::U128 => Dst::UInt128,
        Src::I128 => Dst::Int128,
        Src::String => Dst::String,
        Src::Bytes => Dst::Binary,
        Src::Unknown => Dst::Null,
        Src::Vec => Dst::List(Box::new(Dst::Unknown(polars::prelude::UnknownKind::Any))),
        Src::Map => Dst::Struct(vec![]),
    }
}
228
/// Converts a `DataValue` into an owned polars `AnyValue`.
///
/// The value is first passed through `convert_dv_to_dtype` together with
/// `key`, then mapped variant by variant:
///
/// * scalars map to the polars variant of the same width, except `U8`
///   (emitted as `UInt32`) and `EnumNumber` (emitted as `Int32`);
/// * `Vec` becomes a `List` series named after `key`; its elements are
///   converted recursively with a key whose dtype is that of the first element
///   for which `detect_dtype` does not return `Unknown`;
/// * `Map` becomes a `StructOwned` whose fields are ordered by map key, so the
///   output does not depend on hash-map iteration order.
///
/// # Panics
/// Panics if polars cannot build a series from the converted list elements.
#[cfg(feature = "polars-df")]
pub fn into_polars_value(key: &Key, dv: DataValue) -> polars::prelude::AnyValue<'static> {
    use polars::prelude::AnyValue::*;
    use polars::prelude::Field;

    use crate::dataframe::column_store::convert_dv_to_dtype;
    let dv = convert_dv_to_dtype(key, dv);
    match dv {
        DataValue::String(smart_string) => StringOwned(smart_string.as_str().into()),
        DataValue::Bytes(items) => BinaryOwned(items),
        // NOTE(review): `polars_dtype` maps `U8` to `UInt8` and
        // `from_polars_value` maps `UInt8` back to `U8`, yet this arm widens to
        // `UInt32`. Kept as is; confirm whether the widening is intentional.
        DataValue::U8(x) => UInt32(x as _),
        DataValue::Bool(x) => Boolean(x),
        DataValue::I32(x) => Int32(x),
        DataValue::U32(x) => UInt32(x),
        DataValue::I64(x) => Int64(x),
        DataValue::U64(x) => UInt64(x),
        DataValue::I128(x) => Int128(x),
        DataValue::F32(x) => Float32(x),
        DataValue::F64(x) => Float64(x),
        DataValue::Null => Null,
        DataValue::Vec(data_values) => {
            // Element dtype: first element with a detectable dtype, else `Unknown`.
            let dt = data_values
                .iter()
                .map(|d| crate::detect_dtype(d))
                .find(|d| !matches!(d, crate::DataType::Unknown))
                .unwrap_or(crate::DataType::Unknown);
            let vec_key = Key::new(key.name(), dt);
            let values = data_values
                .into_iter()
                .map(|x| into_polars_value(&vec_key, x))
                .collect::<Vec<_>>();
            let series =
                polars::series::Series::from_any_values(key.name().into(), &values, true);
            // The message is only formatted on failure; `expect(&format!(..))`
            // would allocate it for every converted list.
            List(series.unwrap_or_else(|e| panic!("Cannot create series for {key:?}: {e:?}")))
        }
        DataValue::EnumNumber(x) => Int32(x),
        DataValue::U128(x) => UInt128(x),
        DataValue::Map(x) => {
            // The map is owned: take the entries, order them by key and move
            // the values out instead of re-looking-up and cloning each one.
            let mut entries = x.into_iter().collect::<Vec<_>>();
            entries.sort_unstable_by(|(a, _), (b, _)| a.cmp(b));

            let mut values = Vec::with_capacity(entries.len());
            let mut fields = Vec::with_capacity(entries.len());
            for (name, value) in entries {
                let dtype = crate::detect_dtype(&value);
                let k = Key::new(&name, dtype);
                fields.push(Field::new(k.name().into(), polars_dtype(dtype)));
                values.push(into_polars_value(&k, value));
            }
            StructOwned(Box::new((values, fields)))
        }
    }
}
289
/// Converts a polars `AnyValue` into a `DataValue`.
///
/// Narrow integers are widened losslessly (`UInt16` to `U32`, `Int8`/`Int16`
/// to the 32-bit signed variant), lists and owned structs are converted
/// recursively, and 128-bit integers of both signs are preserved so that
/// values produced by `into_polars_value` survive a round trip.
///
/// Any variant without an arm below is logged with `tracing::warn!` and
/// returned as `DataValue::Null`.
#[cfg(feature = "polars-df")]
pub fn from_polars_value(dv: polars::prelude::AnyValue<'_>) -> DataValue {
    use polars::prelude::AnyValue::*;
    match dv {
        Null => DataValue::Null,
        Boolean(v) => v.into(),
        String(v) => DataValue::String(v.into()),
        UInt8(v) => DataValue::U8(v),
        UInt16(v) => DataValue::U32(u32::from(v)),
        UInt32(v) => v.into(),
        UInt64(v) => v.into(),
        // Counterpart of `DataValue::U128 => UInt128` in `into_polars_value`;
        // without this arm the value fell through to the `Null` fallback.
        UInt128(v) => DataValue::U128(v),
        Int8(v) => i32::from(v).into(),
        Int16(v) => i32::from(v).into(),
        Int32(v) => v.into(),
        Int64(v) => v.into(),
        Float32(v) => v.into(),
        Float64(v) => v.into(),
        Int128(v) => v.into(),
        List(series) => DataValue::Vec(series.iter().map(from_polars_value).collect::<Vec<_>>()),
        StringOwned(v) => DataValue::String(v.as_str().into()),
        Binary(v) => DataValue::Bytes(v.to_owned()),
        BinaryOwned(v) => DataValue::Bytes(v),
        StructOwned(m) => {
            // `m` holds (values, fields); pair each field name with its value.
            let mut hm: std::collections::HashMap<smartstring::alias::String, DataValue> =
                std::collections::HashMap::new();
            for (k, v) in m.1.into_iter().zip(m.0.into_iter()) {
                hm.insert(k.name.as_str().into(), from_polars_value(v));
            }
            DataValue::Map(hm)
        }
        e => {
            tracing::warn!("Unsupported polars value: {e:?}");
            DataValue::Null
        }
    }
}
329
/// Wraps an existing `ColumnFrame`; constants and metadata start empty
/// (see `DataFrame::new`).
impl From<ColumnFrame> for DataFrame {
    fn from(dataframe: ColumnFrame) -> Self {
        Self::new(dataframe)
    }
}
335
336impl From<Vec<std::collections::HashMap<Key, DataValue>>> for DataFrame {
337 fn from(dataframe: Vec<std::collections::HashMap<Key, DataValue>>) -> Self {
338 Self::new(ColumnFrame::from(dataframe))
339 }
340}
341
342impl From<Vec<HashMap<Key, DataValue>>> for DataFrame {
343 fn from(dataframe: Vec<HashMap<Key, DataValue>>) -> Self {
344 Self::new(ColumnFrame::from(dataframe))
345 }
346}
347
348impl From<std::collections::HashMap<String, Vec<DataValue>>> for DataFrame {
349 fn from(dataframe: std::collections::HashMap<String, Vec<DataValue>>) -> Self {
350 Self::new(ColumnFrame::from(dataframe))
351 }
352}
353
354impl From<MLChefMap> for DataFrame {
355 fn from(dataframe: MLChefMap) -> Self {
356 Self::new(ColumnFrame::from(dataframe))
357 }
358}
359impl From<Vec<(Key, Vec<DataValue>)>> for DataFrame {
360 fn from(dataframe: Vec<(Key, Vec<DataValue>)>) -> Self {
361 Self::new(ColumnFrame::from(dataframe))
362 }
363}
364
365impl From<std::collections::HashMap<String, Array1<DataValue>>> for DataFrame {
366 fn from(dataframe: std::collections::HashMap<String, Array1<DataValue>>) -> Self {
367 Self::new(ColumnFrame::from(dataframe))
368 }
369}
370
/// Builds a frame from a polars `DataFrame`, converting it through
/// `ColumnFrame`'s own `From` implementation.
#[cfg(feature = "polars-df")]
impl From<polars::prelude::DataFrame> for DataFrame {
    fn from(polars_df: polars::prelude::DataFrame) -> Self {
        // `new` performs the `Into<ColumnFrame>` conversion itself.
        Self::new(polars_df)
    }
}
377#[cfg(test)]
378mod test {
379 use crate::filter::FilterRules;
380
381 use super::*;
382 use halfbrown::hashmap;
383 #[cfg(feature = "polars-df")]
384 use polars::prelude::NamedFrom as _;
385 use rstest::*;
386 use tracing_test::traced_test;
    // Fixture: a two-row `ColumnFrame` with `key1` = 1, 2 and `key2` = "a", "b".
    #[fixture]
    fn dummy_candidates() -> ColumnFrame {
        ColumnFrame::from(vec![
            hashmap! {
                "key1".into() => 1.into(),
                "key2".into() => "a".into(),
            },
            hashmap! {
                "key1".into() => 2.into(),
                "key2".into() => "b".into(),
            },
        ])
    }
400
    // A frame serialized to JSON and parsed back must compare equal to the
    // original.
    #[rstest]
    fn test_serde() {
        let df = crate::df! {
            "a" => [1u64, 2u64, 3u64],
            "b" => [4u64, 5u64, 6u64],
            "c" => [7u64, 8u64, 9u64]
        };

        let serialized = serde_json::to_string(&df).expect("BUG: Unable to serialize dataframe");

        // The target type is inferred from the comparison with `df` below.
        let deserialized =
            serde_json::from_str(&serialized).expect("BUG: Unable to deserialize dataframe");

        assert_eq!(df, deserialized);
    }
416
    // Converting a polars frame into a `DataFrame` must select the same data
    // as a frame built natively from the same literals.
    #[cfg(feature = "polars-df")]
    #[rstest]
    fn test_polars() {
        let expected = crate::df! {
            "a" => [1u64, 2u64, 3u64],
            "b" => [4f64, 5f64, 6f64],
            "c" => [7i64, 8i64, 9i64]
        };

        let polars_df = polars::df!(
            "a" => [1u64, 2u64, 3u64],
            "b" => [4f64, 5f64, 6f64],
            "c" => [7i64, 8i64, 9i64]
        )
        .expect("BUG: should be ok");
        let as_df: DataFrame = polars_df.into();
        let keys: Vec<Key> = vec!["a".into(), "b".into(), "c".into()];
        // Compared through `select` rather than by frame equality.
        assert_eq!(
            as_df.select(Some(keys.as_slice())),
            expected.select(Some(keys.as_slice()))
        );
    }
439 #[cfg(feature = "polars-df")]
440 use crate::DataType;
    // Table-driven check of both conversion directions: each case gives a key,
    // a `DataValue` and the polars value it corresponds to.
    #[cfg(feature = "polars-df")]
    #[rstest]
    #[case::str(Key::new("a", DataType::String), DataValue::String("test".into()), polars::prelude::AnyValue::String("test".into()))]
    #[case::u32(
        Key::new("a", DataType::U32),
        DataValue::U32(u32::MAX),
        polars::prelude::AnyValue::UInt32(u32::MAX)
    )]
    #[case::i32(
        Key::new("a", DataType::I32),
        DataValue::I32(i32::MIN),
        polars::prelude::AnyValue::Int32(i32::MIN)
    )]
    #[case::i64(
        Key::new("a", DataType::I64),
        DataValue::I64(i64::MIN),
        polars::prelude::AnyValue::Int64(i64::MIN)
    )]
    #[case::u64(
        Key::new("a", DataType::U64),
        DataValue::U64(u64::MIN),
        polars::prelude::AnyValue::UInt64(u64::MIN)
    )]
    #[case::f32(
        Key::new("a", DataType::F32),
        DataValue::F32(f32::MIN),
        polars::prelude::AnyValue::Float32(f32::MIN)
    )]
    #[case::f64(
        Key::new("a", DataType::F64),
        DataValue::F64(f64::MIN),
        polars::prelude::AnyValue::Float64(f64::MIN)
    )]
    #[case::null(
        Key::new("a", DataType::Unknown),
        DataValue::Null,
        polars::prelude::AnyValue::Null
    )]
    #[case::i128(
        Key::new("a", DataType::I128),
        DataValue::I128(i128::MIN),
        polars::prelude::AnyValue::Int128(i128::MIN)
    )]
    // NOTE(review): `into_polars_value` emits `UInt32` for `U8`, so this case
    // relies on polars comparing the two integer widths as equal - confirm.
    #[case::u8(
        Key::new("a", DataType::U8),
        DataValue::U8(255),
        polars::prelude::AnyValue::UInt8(255)
    )]
    #[case::bool(
        Key::new("a", DataType::Bool),
        DataValue::Bool(true),
        polars::prelude::AnyValue::Boolean(true)
    )]
    #[case::bytes(Key::new("a", DataType::Bytes), DataValue::Bytes("aaaaa".as_bytes().to_vec()), polars::prelude::AnyValue::BinaryOwned("aaaaa".as_bytes().to_vec()))]
    #[case::vec_uints(Key::new("a", DataType::Vec), DataValue::Vec(vec![DataValue::U32(0), DataValue::U32(1)]), polars::prelude::AnyValue::List(polars::series::Series::new("v".into(), vec![polars::prelude::AnyValue::UInt32(0u32), polars::prelude::AnyValue::UInt32(1)])))]
    #[case::map(Key::new("a", DataType::Map), DataValue::Map(data_value::stdhashmap!("a" => 0u64, "b" => "s")), polars::prelude::AnyValue::StructOwned(Box::new((
        vec![polars::prelude::AnyValue::UInt64(0u64), polars::prelude::AnyValue::String("s".into())],
        vec![polars::prelude::Field::new("a".into(), polars::prelude::DataType::UInt64), polars::prelude::Field::new("b".into(), polars::prelude::DataType::String)]))))]
    fn into_polars_value_test(
        #[case] key: Key,
        #[case] input: DataValue,
        #[case] output: polars::prelude::AnyValue<'static>,
    ) {
        // Forward direction: crate value -> polars value.
        assert_eq!(into_polars_value(&key, input.clone()), output);
        // Backward direction starts from the expected polars value, not from
        // the result of the forward conversion.
        assert_eq!(from_polars_value(output), input);
    }
510
    // Each case: input frame, expected frame after filtering, filter rules
    // parsed from a string expression.
    #[rstest]
    #[case(
        DataFrame::new(crate::column_frame! {
            "a" => [1f64, 2f64, 3f64],
            "b" => [4i64, 5i64, 6i64],
            "c" => [7i64, 8i64, 9i64]
        }),
        DataFrame::new(crate::column_frame! {
            "a" => [1f64, 2f64],
            "b" => [4i64, 5i64],
            "c" => [7i64, 8i64]
        }),
        FilterRules::try_from("a >= 1f64 && (b <= 5 || c <= 8) && b >= 4").expect("BUG: cannot create filter rules"),
    )]
    #[case(
        DataFrame::new(crate::column_frame! {
            "a" => [1f64, 2f64, 3f64],
            "b" => [4i64, 5i64, 6i64],
            "c" => [7i64, 8i64, 9i64]
        }),
        DataFrame::new(crate::column_frame! {
            "a" => [2f64],
            "b" => [5i64],
            "c" => [8i64]
        }),
        FilterRules::try_from("a % 2f64 == 0f64").expect("BUG: cannot create filter rules"),
    )]
    #[traced_test]
    fn filter_test(
        #[case] df: DataFrame,
        #[case] expected: DataFrame,
        #[case] filter: FilterRules,
    ) {
        let filtered = df.filter(&filter).expect("BUG: cannot filter");
        // Whole-frame equality, so constants and metadata (both empty here)
        // are compared as well.
        assert_eq!(filtered, expected);
    }
558
    // Deserializes a hand-written JSON document (2 rows x 6 columns, including
    // a `null` cell) both on its own and as an element of a JSON array.
    #[rstest]
    fn test_serde_complex() {
        let simple = r#"
{
  "constants": {},
  "dataframe": {
    "index": {
      "keys": [
        {
          "key": 3162770485,
          "name": "a",
          "ctype": "U32"
        },
        {
          "key": 2279056742,
          "name": "b",
          "ctype": "F64"
        },
        {
          "key": 2994984227,
          "name": "c",
          "ctype": "U64"
        },
        {
          "key": 3319645144,
          "name": "d",
          "ctype": "F64"
        },
        {
          "key": 1291847470,
          "name": "e",
          "ctype": "U32"
        },
        {
          "key": 874241070,
          "name": "f",
          "ctype": "Bool"
        }
      ],
      "indexes": {
        "a": 0,
        "b": 1,
        "c": 2,
        "d": 3,
        "e": 4,
        "f": 5
      },
      "alias": {}
    },
    "data_frame": {
      "v": 1,
      "dim": [
        2,
        6
      ],
      "data": [
        253780,
        0.009369421750307085,
        1633222860381359,
        8,
        5,
        true,
        64512,
        0.003391335718333721,
        1633222860810557,
        8,
        5,
        null
      ]
    }
  },
  "metadata": {}
}
        "#;

        let simple_deserialized: DataFrame =
            serde_json::from_str(simple).expect("BUG: Unable to deserialize dataframe");

        println!("deserialized: {simple_deserialized:?}");
        // Same document three times inside a JSON array.
        let array = format!("[{}, {}, {}]", simple, simple, simple);
        let deserialized: Vec<DataFrame> =
            serde_json::from_str(&array).expect("BUG: Unable to deserialize dataframe");

        println!("deserialized: {deserialized:?}");
        assert_eq!(deserialized.len(), 3);
        assert_eq!(simple_deserialized, deserialized[0]);
    }
646
    // Every supported input shape must convert into the same two-row frame
    // (second row has no `key2`), and `select_column` must return `key1`.
    #[rstest]
    #[case(hashmap!("key1".into() => vec![1.into(), 2.into()], "key2".into() => vec!["a".into()]))]
    #[case(data_value::stdhashmap!("key1" => vec![1, 2], "key2" => vec!["a"]))]
    #[case(vec![hashmap! {
        "key1".into() => 1.into(),
        "key2".into() => "a".into(),
    },
    hashmap! {
        "key1".into() => 2.into(),
    },])]
    #[case(vec![data_value::stdhashmap! {
        "key1" => DataValue::from(1),
        "key2" => DataValue::from("a"),
    },data_value::stdhashmap! {
        "key1" => DataValue::from(2),
    },])]
    #[case(vec![("key1".into(), vec! [DataValue::from(1), DataValue::from(2)]), ("key2".into(),
        vec![DataValue::from("a"), DataValue::Null])])]
    fn test_select_column<T: Into<DataFrame>>(#[case] input: T) {
        let df: DataFrame = input.into();
        assert_eq!(
            df,
            DataFrame {
                constants: HashMap::new(),
                dataframe: ColumnFrame::from(vec![
                    hashmap! {
                        "key1".into() => 1.into(),
                        "key2".into() => "a".into(),
                    },
                    hashmap! {
                        "key1".into() => 2.into(),
                    },
                ]),
                metadata: HashMap::new(),
            }
        );
        // Despite its name, this binding holds a single column view.
        let selected_transposed = df.select_column("key1".into());
        assert!(selected_transposed.is_some());
        let selected_transposed = selected_transposed.unwrap();
        assert_eq!(selected_transposed.len(), 2);
        assert_eq!(selected_transposed, ndarray::array![1.into(), 2.into()]);
    }
689
    // Every `From` input shape must select the same `key1`/`key2` data as the
    // reference frame, and `key1` must be extractable as `i32`.
    #[rstest]
    #[case::hhm(hashmap!("key1".into() => vec![1.into(), 2.into()], "key2".into() => vec!["a".into()]))]
    #[case::stdhm(data_value::stdhashmap!("key1" => vec![1, 2], "key2" => vec!["a"]))]
    #[case::hm({
        let hm: std::collections::HashMap<String, Array1<DataValue>> = data_value::stdhashmap!("key1".to_string() => Array1::from_vec(vec![DataValue::from(1), DataValue::from(2)]), "key2".to_string() => Array1::from_vec(vec![DataValue::from("a"), DataValue::Null]));
        hm
    })]
    #[case::vec_hhm(vec![hashmap! {
        "key1".into() => 1.into(),
        "key2".into() => "a".into(),
    },
    hashmap! {
        "key1".into() => 2.into(),
    },])]
    #[case::vec_hme(vec![data_value::stdhashmap! {
        "key1" => DataValue::from(1),
        "key2" => DataValue::from("a"),
    },data_value::stdhashmap! {
        "key1" => DataValue::from(2),
    },])]
    #[case::vec_vec(vec![("key1".into(), vec! [DataValue::from(1), DataValue::from(2)]), ("key2".into(), vec![DataValue::from("a"), DataValue::Null])])]
    fn test_from_conversion<T: Into<DataFrame>>(#[case] input: T) {
        let df: DataFrame = input.into();
        let expected: DataFrame = DataFrame {
            constants: HashMap::new(),
            dataframe: ColumnFrame::from(vec![
                hashmap! {
                    "key1".into() => 1.into(),
                    "key2".into() => "a".into(),
                },
                hashmap! {
                    "key1".into() => 2.into(),
                },
            ]),
            metadata: HashMap::new(),
        };
        // Compared through an explicit column selection, not frame equality.
        assert_eq!(
            df.select(Some(&["key1".into(), "key2".into()])),
            expected.select(Some(&["key1".into(), "key2".into()])),
            "{df} vs {expected}"
        );
        let selected_transposed = df.select_transposed_typed::<i32>(&["key1".into()]);
        assert_eq!(selected_transposed.len(), 2);
        println!("{:?}", selected_transposed);
        assert_eq!(selected_transposed, vec![vec![1], vec![2]]);
    }
    // End-to-end walk through extend, push, constants, apply_function,
    // remove_column and join on one frame.
    #[rstest]
    fn test_dataframe(dummy_candidates: ColumnFrame) {
        let mut dataframe: DataFrame = DataFrame::default();
        assert!(dataframe.is_empty());
        assert!(dataframe.extend(dummy_candidates.into()).is_ok());
        assert_eq!(dataframe.len(), 2);

        let candidate = hashmap! {
            "key1".into() => 3.into(),
            "key2".into() => "c".into(),
        };

        assert!(dataframe.push(candidate).is_ok());
        assert_eq!(dataframe.len(), 3);
        assert!(!dataframe.is_empty());

        dataframe.insert_constant("key3".into(), 4.into());
        assert_eq!(dataframe.constants.len(), 1);
        // Derive `key5` = `key1` + 1 through the closure-based API.
        assert!(dataframe
            .apply_function(&["key1".into()], |keys, df| {
                let key = keys[0].clone();
                let s = df
                    .get_single_column(&key)
                    .expect("BUG: Cannot get column")
                    .to_owned();
                let s = s.mapv(|x| x + DataValue::from(1));
                df.add_single_column("key5", s)?;
                Ok(())
            })
            .is_ok());
        // Snapshot taken before any column is removed; used as the reference
        // at the end.
        let original = dataframe.clone();
        dataframe.shrink();
        let remove_df = dataframe.remove_column(&["key1".into()]);
        assert!(remove_df.is_ok());
        let mut remove_df = remove_df.unwrap();
        assert_eq!(remove_df.len(), 3);
        let selected = dataframe.select(Some(&["key2".into()]));
        assert!(selected.is_ok());
        let selected = selected.unwrap();
        println!("{:?}", selected);

        // Joining the two parts back together must reproduce the snapshot for
        // the three data columns.
        let joined_result =
            remove_df.join(dataframe, &JoinRelation::new(crate::JoinBy::AddColumns));
        assert!(joined_result.is_ok(), "{:?}", joined_result);
        let keys = vec!["key1".into(), "key2".into(), "key5".into()];
        assert_eq!(
            original.select(Some(keys.as_slice())),
            remove_df.select(Some(keys.as_slice()))
        );
    }
787
788 #[rstest]
789 fn test_size_methods() {
790 let candidate = hashmap! {
791 "key1".into() => 3.into(),
792 "key2".into() => "c".into(),
793 "key3".into() => false.into()
794 };
795
796 let dataframe: DataFrame = vec![candidate].into();
797
798 assert_eq!(dataframe.n_columns(), 3);
799 assert_eq!(dataframe.n_rows(), 1);
800 }
801
    // Metadata set on one frame is readable from it and absent from a freshly
    // built frame; also checks the layout of `select_transposed(None)`.
    #[rstest]
    fn test_metadata(dummy_candidates: ColumnFrame) {
        let mut dataframe: DataFrame = DataFrame::default();
        assert!(dataframe.is_empty());
        println!("{:?}", dataframe);
        assert!(dataframe.extend(dummy_candidates.into()).is_ok());
        println!("{:?}", dataframe);
        assert_eq!(dataframe.len(), 2);

        dataframe.add_metadata("test".into(), 1.into());
        assert_eq!(dataframe.get_metadata("test"), Some(&1.into()));
        // Shadows the frame above with a new one that never had metadata.
        let dataframe = DataFrame::new(ColumnFrame::from(vec![
            hashmap! {
                "key1".into() => 1.into(),
                "key2".into() => "a".into(),
            },
            hashmap! {
                "key1".into() => 2.into(),
                "key2".into() => "b".into(),
            },
        ]));
        assert_eq!(dataframe.get_metadata("test"), None);
        let tt = dataframe.select_transposed(None);
        assert!(tt.is_ok());
        let tt = tt.unwrap();
        assert_eq!(tt.shape(), [2, 2]);
        // Transposed layout: one array row per column.
        assert_eq!(
            tt,
            Array2::from_shape_vec((2, 2), vec![1.into(), 2.into(), "a".into(), "b".into()])
                .unwrap()
        );
    }
834
    // Adding columns to an initially empty frame: the first column fixes the
    // row count (3); columns of any other length are rejected.
    #[rstest]
    #[traced_test]
    fn add_single_column_test() {
        let mut dataframe = DataFrame::default();
        let values = Array1::from(vec![1.into(), 2.into(), 3.into()]);
        let r = dataframe.add_single_column("key1", values);
        assert!(r.is_ok(), "{r:?}");
        let selected = dataframe.select(None);
        assert!(selected.is_ok());
        let selected = selected.unwrap();
        assert_eq!(selected.shape(), [3, 1]);
        assert_eq!(
            selected,
            Array2::from_shape_vec((3, 1), vec![1.into(), 2.into(), 3.into()]).unwrap()
        );
        // Existing key, wrong length.
        let values = Array1::from(vec![1.into(), 2.into()]);
        assert!(dataframe.add_single_column("key1", values).is_err());
        // New key, matching length.
        let values = Array1::from(vec![3.into(), 4.into(), 5.into()]);
        assert!(dataframe.add_single_column("key2", values).is_ok());
        // New key, wrong length.
        let values = Array1::from(vec![3.into()]);
        assert!(dataframe.add_single_column("key3", values).is_err());
    }
857
    // Starting from an empty column: the frame has 0 rows and 1 column, a
    // later 3-element column for a new key is accepted, and the originally
    // empty column then reads as three `Null`s.
    #[rstest]
    #[traced_test]
    fn add_single_column_empty_test() {
        let mut dataframe = DataFrame::default();
        let values = Array1::from(vec![]);
        let r = dataframe.add_single_column("key1", values);
        assert!(r.is_ok(), "{r:?}");
        let selected = dataframe.select(None);
        assert!(selected.is_ok());
        let selected = selected.unwrap();
        assert_eq!(selected.shape(), [0, 1]);
        assert_eq!(selected, Array2::from_shape_vec((0, 1), vec![]).unwrap());
        // Re-adding the existing key with data is rejected.
        let values = Array1::from(vec![1.into(), 2.into()]);
        assert!(dataframe.add_single_column("key1", values).is_err());
        let values = Array1::from(vec![3.into(), 4.into(), 5.into()]);
        assert!(dataframe.add_single_column("key2", values).is_ok());
        // From here on the row count is 3, so a 2-element column fails.
        let values = Array1::from(vec![3.into(), 4.into()]);
        assert!(dataframe.add_single_column("key3", values).is_err());
        let values = Array1::from(vec![3.into(), 4.into(), 5.into()]);
        assert!(dataframe.add_single_column("key3", values).is_ok());

        assert_eq!(
            dataframe
                .select_column("key1".into())
                .expect("BUG: has to exists"),
            ndarray::arr1(&[DataValue::Null, DataValue::Null, DataValue::Null]),
        );
        assert_eq!(
            dataframe
                .select_column("key2".into())
                .expect("BUG: has to exists"),
            ndarray::arr1(&[3.into(), 4.into(), 5.into()]),
        );
        assert_eq!(
            dataframe.select(None).expect("BUG: cannot get data"),
            ndarray::arr2(&[
                [DataValue::Null, 3.into(), 3.into()],
                [DataValue::Null, 4.into(), 4.into()],
                [DataValue::Null, 5.into(), 5.into()],
            ])
        );
    }
900
    // Each case: input frame, keys to select, expected rows x keys array.
    // Missing cells and unknown keys are expected to come back as `Null`.
    #[rstest]
    #[case(
        DataFrame::new(ColumnFrame::from(vec![
            hashmap! {
                "k".into() => 1.into(),
                "k2".into() => 2.into(),
                "k3".into() => 2.2.into(),
            },
            hashmap! {
                "k".into() => 11.into(),
                "k2".into() => 3.into(),
            },
            hashmap! {
                "k".into() => 4.into(),
                "k2".into() => 5.into(),
                "k3".into() => 2.3.into(),
            },
            hashmap! {
                "k".into() => 4.into(),
                "k2".into() => 5.into(),
                "k3".into() => 2.4.into(),
            },
        ])),
        vec!["k".into(), "k2".into()],
        Array2::from_shape_vec((4, 2), vec![1.into(), 2.into(), 11.into(), 3.into(), 4.into(), 5.into(), 4.into(), 5.into()]).unwrap()
    )]
    // Same frame, keys in a different order plus two keys that do not exist.
    #[case(
        DataFrame::new(ColumnFrame::from(vec![
            hashmap! {
                "k".into() => 1.into(),
                "k2".into() => 2.into(),
                "k3".into() => 2.2.into(),
            },
            hashmap! {
                "k".into() => 11.into(),
                "k2".into() => 3.into(),
            },
            hashmap! {
                "k".into() => 4.into(),
                "k2".into() => 5.into(),
                "k3".into() => 2.3.into(),
            },
            hashmap! {
                "k".into() => 4.into(),
                "k2".into() => 5.into(),
                "k3".into() => 2.4.into(),
            },
        ])),
        vec!["k2".into(), "k3".into(), "nonexist1".into(), "nonexists2".into(), "k".into()],
        Array2::from_shape_vec((4, 5), vec![
            2.into(), 2.2.into(), DataValue::Null, DataValue::Null, 1.into(),
            3.into(), DataValue::Null, DataValue::Null, DataValue::Null, 11.into(),
            5.into(), 2.3.into(), DataValue::Null, DataValue::Null, 4.into(),
            5.into(), 2.4.into(), DataValue::Null, DataValue::Null, 4.into()]).unwrap()
    )]
    #[traced_test]
    fn select_multiple(
        #[case] input: DataFrame,
        #[case] columns: Vec<Key>,
        #[case] expected: Array2<DataValue>,
    ) {
        let selected = input.select(Some(&columns));
        assert!(selected.is_ok());
        let selected = selected.unwrap();

        assert_eq!(selected, expected);
    }
968
969 #[rstest]
970 #[case(
971 DataFrame::new(ColumnFrame::from(vec![
972 hashmap! {
973 "k".into() => 1.into(),
974 "k2".into() => 2.into(),
975 "k3".into() => 2.2.into(),
976 },
977 hashmap! {
978 "k".into() => 11.into(),
979 "k2".into() => 3.into(),
980 },
981 hashmap! {
982 "k".into() => 4.into(),
983 "k2".into() => 5.into(),
984 "k3".into() => 2.3.into(),
985 },
986 hashmap! {
987 "k".into() => 4.into(),
988 "k2".into() => 5.into(),
989 "k3".into() => 2.4.into(),
990 },
991 ])),
992 "k".into(),
993 Array2::from_shape_vec((4, 3), vec![
994 1.into(), 2.into(), 2.2.into(),
995 4.into(), 5.into(), 2.3.into(),
996 4.into(), 5.into(), 2.4.into(),
997 11.into(), 3.into(), DataValue::Null,
998 ]
999 ).unwrap(),
1000 vec!["k".into(), "k2".into(), "k3".into()],
1001 )]
1002 #[rstest]
1003 #[case(
1004 DataFrame::new(ColumnFrame::from(vec![
1005 hashmap! {
1006 "k".into() => 1.into(),
1007 "k2".into() => 2.into(),
1008 "k3".into() => 2.2.into(),
1009 },
1010 hashmap! {
1011 "k".into() => 11.into(),
1012 "k2".into() => 3.into(),
1013 },
1014 hashmap! {
1015 "k".into() => 4.into(),
1016 "k2".into() => 5.into(),
1017 "k3".into() => 2.3.into(),
1018 },
1019 hashmap! {
1020 "k".into() => 4.into(),
1021 "k2".into() => 5.into(),
1022 "k3".into() => 2.4.into(),
1023 },
1024 ])),
1025 "k3".into(),
1026 Array2::from_shape_vec((4, 3), vec![
1027 11.into(), 3.into(), DataValue::Null,
1028 1.into(), 2.into(), 2.2.into(),
1029 4.into(), 5.into(), 2.3.into(),
1030 4.into(), 5.into(), 2.4.into(),
1031 ]
1032 ).unwrap(),
1033 vec!["k".into(), "k2".into(), "k3".into()],
1034 )]
1035 #[case(
1036 DataFrame::new(ColumnFrame::from(vec![
1037 hashmap! {
1038 "k".into() => 2.into(),
1039 "k2".into() => 0.000001.into(),
1040 },
1041 hashmap! {
1042 "k".into() => 1.into(),
1043 "k2".into() =>0.0000001.into(),
1044 },
1045 hashmap! {
1046 "k".into() => 3.into(),
1047 "k2".into() => 0.00001.into(),
1048 },
1049 hashmap! {
1050 "k".into() => 4.into(),
1051 "k2".into() => 0.001.into(),
1052 },
1053 ])),
1054 "k2".into(),
1055 Array2::from_shape_vec((4, 2), vec![
1056 1.into(), 0.0000001.into(),
1057 2.into(), 0.000001.into(),
1058 3.into(), 0.00001.into(),
1059 4.into(), 0.001.into(),
1060 ]
1061 ).unwrap(),
1062 vec!["k".into(), "k2".into()],
1063 )]
1064 #[case(
1065 DataFrame::new(ColumnFrame::from(vec![
1066 hashmap! {
1067 "k".into() => 2.into(),
1068 "k2".into() => "b".into(),
1069 },
1070 hashmap! {
1071 "k".into() => 1.into(),
1072 "k2".into() =>"a".into(),
1073 },
1074 hashmap! {
1075 "k".into() => 3.into(),
1076 "k2".into() =>"c".into(),
1077 },
1078 hashmap! {
1079 "k".into() => 4.into(),
1080 "k2".into() =>"z".into(),
1081 },
1082 ])),
1083 "k2".into(),
1084 Array2::from_shape_vec((4, 2), vec![
1085 1.into(),"a".into(),
1086 2.into(), "b".into(),
1087 3.into(), "c".into(),
1088 4.into(), "z".into(),
1089 ]
1090 ).unwrap(),
1091 vec!["k".into(), "k2".into()],
1092 )]
1093 #[traced_test]
1094 fn sort_by(
1095 #[case] input: DataFrame,
1096 #[case] column: Key,
1097 #[case] expected: Array2<DataValue>,
1098 #[case] columns: Vec<Key>,
1099 ) {
1100 let result = input.sorted(&column);
1101 assert!(result.is_ok(), "{result:?}");
1102 let result = result.unwrap().get_sorted();
1103 let selected = result.select(Some(&columns));
1104
1105 assert_eq!(selected, expected);
1106 }
    // Each case: input frame, key to sort by, `TopN` selector, expected rows,
    // keys to select from the result.
    #[rstest]
    // `Last(1)`: the row with the largest `k2`.
    #[case(
        DataFrame::new(ColumnFrame::from(vec![
            hashmap! {
                "k".into() => 2.into(),
                "k2".into() => 0.000001.into(),
            },
            hashmap! {
                "k".into() => 1.into(),
                "k2".into() =>0.0000001.into(),
            },
            hashmap! {
                "k".into() => 3.into(),
                "k2".into() => 0.00001.into(),
            },
            hashmap! {
                "k".into() => 4.into(),
                "k2".into() => 0.001.into(),
            },
        ])),
        "k2".into(),
        TopN::Last(1),
        Array2::from_shape_vec((1, 2), vec![
            4.into(), 0.001.into(),
            ]
        ).unwrap(),
        vec!["k".into(), "k2".into()],
    )]
    // `Last(2)`: the two largest, largest first.
    #[case(
        DataFrame::new(ColumnFrame::from(vec![
            hashmap! {
                "k".into() => 2.into(),
                "k2".into() => 0.000001.into(),
            },
            hashmap! {
                "k".into() => 1.into(),
                "k2".into() =>0.0000001.into(),
            },
            hashmap! {
                "k".into() => 3.into(),
                "k2".into() => 0.00001.into(),
            },
            hashmap! {
                "k".into() => 4.into(),
                "k2".into() => 0.001.into(),
            },
        ])),
        "k2".into(),
        TopN::Last(2),
        Array2::from_shape_vec((2, 2), vec![
            4.into(), 0.001.into(),
            3.into(), 0.00001.into(),
            ]
        ).unwrap(),
        vec!["k".into(), "k2".into()],
    )]
    // `First(1)`: the row with the smallest `k2`.
    #[case(
        DataFrame::new(ColumnFrame::from(vec![
            hashmap! {
                "k".into() => 2.into(),
                "k2".into() => "b".into(),
            },
            hashmap! {
                "k".into() => 1.into(),
                "k2".into() =>"a".into(),
            },
            hashmap! {
                "k".into() => 3.into(),
                "k2".into() =>"c".into(),
            },
            hashmap! {
                "k".into() => 4.into(),
                "k2".into() =>"z".into(),
            },
        ])),
        "k2".into(),
        TopN::First(1),
        Array2::from_shape_vec((1, 2), vec![
            1.into(),"a".into(),
            ]
        ).unwrap(),
        vec!["k".into(), "k2".into()],
    )]
    // `First(2)`: the two smallest, in ascending order.
    #[case(
        DataFrame::new(ColumnFrame::from(vec![
            hashmap! {
                "k".into() => 2.into(),
                "k2".into() => "b".into(),
            },
            hashmap! {
                "k".into() => 1.into(),
                "k2".into() =>"a".into(),
            },
            hashmap! {
                "k".into() => 3.into(),
                "k2".into() =>"c".into(),
            },
            hashmap! {
                "k".into() => 4.into(),
                "k2".into() =>"z".into(),
            },
        ])),
        "k2".into(),
        TopN::First(2),
        Array2::from_shape_vec((2, 2), vec![
            1.into(),"a".into(),
            2.into(),"b".into(),
            ]
        ).unwrap(),
        vec!["k".into(), "k2".into()],
    )]
    #[traced_test]
    fn top_n(
        #[case] input: DataFrame,
        #[case] column: Key,
        #[case] topn: TopN,
        #[case] expected: Array2<DataValue>,
        #[case] columns: Vec<Key>,
    ) {
        let result = input.sorted(&column);
        assert!(result.is_ok(), "{result:?}");
        let result = result.unwrap();
        let first = result.topn(topn).unwrap();
        let selected = first.select(Some(&columns));
        assert_eq!(selected, expected);
    }
1233}