1use column_store::sorted_df::SortedDataFrame;
2use data_value::{DataValue, Extract};
3use halfbrown::HashMap;
4use ndarray::{Array1, Array2, ArrayView1};
5use std::fmt;
6pub mod column_store;
7pub mod index;
8pub mod join;
9pub mod key;
10use crate::{error::Error, CandidateData};
11#[cfg(feature = "python")]
12pub mod python;
13
14#[cfg(feature = "python")]
15use pyo3::prelude::*;
16
17use crate::{
18 dataframe::{column_store::ColumnFrame, join::JoinRelation, key::Key},
19 MLChefMap,
20};
21
/// Selects how many rows to take from one end of a sorted frame.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum TopN {
    /// Take the given number of rows counted from the start of the ordering.
    First(usize),
    /// Take the given number of rows counted from the end of the ordering.
    Last(usize),
}
27
28#[derive(Debug, Clone, PartialEq, Default, serde::Serialize, serde::Deserialize)]
31#[cfg_attr(feature = "python", pyclass)]
32pub struct DataFrame {
33 pub constants: HashMap<Key, DataValue>,
36 pub dataframe: ColumnFrame,
39 pub metadata: HashMap<String, DataValue>,
41}
42
43impl fmt::Display for DataFrame {
44 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
45 self.dataframe.fmt(f)
46 }
47}
48
49impl DataFrame {
50 pub fn new<C: Into<ColumnFrame>>(dataframe: C) -> Self {
51 Self {
52 constants: HashMap::new(),
53 dataframe: dataframe.into(),
54 metadata: HashMap::new(),
55 }
56 }
57
58 pub fn shrink(&mut self) {
59 self.dataframe.shrink();
60 }
61
62 pub fn add_metadata(&mut self, key: String, value: DataValue) {
63 self.metadata.insert(key, value);
64 }
65
66 pub fn get_metadata(&self, key: &str) -> Option<&DataValue> {
67 self.metadata.get(key)
68 }
69
70 pub fn join(&mut self, other: Self, join_type: &JoinRelation) -> Result<(), Error> {
71 other.constants.into_iter().for_each(|(key, value)| {
72 self.constants.insert(key, value);
73 });
74 self.dataframe.join(other.dataframe, join_type)
75 }
76
77 pub fn apply_function<F>(&mut self, keys: &[Key], mut func: F) -> Result<(), Error>
78 where
79 F: FnMut(&[Key], &mut ColumnFrame) -> Result<(), Error>,
80 {
81 self.dataframe.apply_function(keys, &mut func)
82 }
83
84 pub fn select(&self, keys: Option<&[Key]>) -> Result<Array2<DataValue>, Error> {
85 Ok(self.dataframe.select(keys))
86 }
87
88 pub fn select_transposed_typed<D: Extract>(&self, keys: &[Key]) -> Vec<Vec<D>> {
89 self.dataframe.select_transposed_typed::<D>(keys)
90 }
91
92 pub fn select_column(&self, key: Key) -> Option<ndarray::ArrayView1<'_, DataValue>> {
93 self.dataframe.select_column(&key)
94 }
95
96 pub fn select_transposed(&self, keys: Option<&[Key]>) -> Result<Array2<DataValue>, Error> {
97 self.dataframe.select_transposed(keys)
98 }
99
100 pub fn insert_constant(&mut self, key: Key, value: DataValue) {
101 self.constants.insert(key, value);
102 }
103
104 pub fn push<C: CandidateData>(&mut self, item: C) -> Result<(), Error> {
105 self.dataframe.push(item)
106 }
107
108 pub fn remove_column(&mut self, keys: &[Key]) -> Result<Self, Error> {
109 self.dataframe.remove_column(keys).map(|x| x.into())
110 }
111
112 pub fn extend(&mut self, items: Self) -> Result<(), Error> {
113 self.dataframe.extend(items.dataframe)
114 }
115
116 pub fn len(&self) -> usize {
117 self.dataframe.len()
118 }
119
120 pub fn is_empty(&self) -> bool {
121 self.dataframe.is_empty()
122 }
123
124 pub fn add_single_column<K: Into<Key>>(
125 &mut self,
126 key: K,
127 values: Array1<DataValue>,
128 ) -> Result<(), Error> {
129 self.dataframe.add_single_column(key, values)
130 }
131
132 pub fn get_single_column(&self, key: &Key) -> Option<ArrayView1<'_, DataValue>> {
133 self.dataframe.get_single_column(key)
134 }
135
136 pub fn sorted(&self, key: &Key) -> Result<SortedDataFrame<'_>, Error> {
137 self.dataframe.sorted(key)
138 }
139
140 pub fn filter(&self, filter: &crate::filter::FilterRules) -> Result<Self, Error> {
141 let filtered_df = self.dataframe.filter(filter)?;
142 Ok(Self {
143 constants: self.constants.clone(),
144 dataframe: filtered_df,
145 metadata: self.metadata.clone(),
146 })
147 }
148
149 #[cfg(feature = "polars-df")]
150 pub fn as_polars(&self) -> Result<polars::prelude::DataFrame, Error> {
151 let mut columns = vec![];
152 for key in self.dataframe.keys() {
153 columns.push(polars::prelude::Column::new(
154 key.name().into(),
155 self.dataframe
156 .get_single_column(key)
157 .ok_or_else(|| Error::NotFound(key.clone()))?
158 .into_iter()
159 .map(|x| into_polars_value(x.clone()))
160 .collect::<Vec<_>>(),
161 ));
162 }
163
164 Ok(polars::prelude::DataFrame::new(columns)?)
165 }
166}
/// Maps a crate-level `DataType` onto the corresponding polars data type.
///
/// `Unknown` becomes polars `Null`, `Vec` becomes a list of unknown element
/// type and `Map` becomes a struct without fields.
#[cfg(feature = "polars-df")]
pub fn polars_dtype(dtype: crate::DataType) -> polars::prelude::DataType {
    use crate::DataType::*;
    use polars::prelude::DataType as Polars;
    match dtype {
        Bool => Polars::Boolean,
        U32 => Polars::UInt32,
        I32 => Polars::Int32,
        U8 => Polars::UInt8,
        U64 => Polars::UInt64,
        I64 => Polars::Int64,
        F32 => Polars::Float32,
        F64 => Polars::Float64,
        crate::DataType::String => Polars::String,
        Bytes => Polars::Binary,
        crate::DataType::Unknown => Polars::Null,
        Vec => {
            let element = Polars::Unknown(polars::prelude::UnknownKind::Any);
            Polars::List(Box::new(element))
        }
        Map => Polars::Struct(vec![]),
    }
}
189
/// Converts an owned [`DataValue`] into an owned polars `AnyValue`.
///
/// * Scalars map onto the polars variant of the same width, so a `U8` becomes
///   `UInt8`. This matches [`polars_dtype`] and lets the value round-trip
///   through [`from_polars_value`].
/// * `EnumNumber` is represented as `Int32`.
/// * `Vec` becomes a `List` holding a series named `"v"`, converted
///   element by element.
/// * `Map` becomes a `StructOwned` whose fields are ordered by key, so the
///   result does not depend on hash-map iteration order.
#[cfg(feature = "polars-df")]
pub fn into_polars_value(dv: DataValue) -> polars::prelude::AnyValue<'static> {
    use polars::prelude::AnyValue::*;
    use polars::prelude::{Field, NamedFrom};
    match dv {
        DataValue::String(smart_string) => StringOwned(smart_string.as_str().into()),
        DataValue::Bytes(items) => BinaryOwned(items),
        // Keep the 8-bit width: widening to UInt16 would disagree with the
        // dtype reported by `polars_dtype` and break the round trip.
        DataValue::U8(x) => UInt8(x),
        DataValue::Bool(x) => Boolean(x),
        DataValue::I32(x) => Int32(x),
        DataValue::U32(x) => UInt32(x),
        DataValue::I64(x) => Int64(x),
        DataValue::U64(x) => UInt64(x),
        DataValue::I128(x) => Int128(x),
        DataValue::F32(x) => Float32(x),
        DataValue::F64(x) => Float64(x),
        DataValue::Null => Null,
        DataValue::Vec(data_values) => List(polars::series::Series::new(
            "v".into(),
            data_values
                .into_iter()
                .map(into_polars_value)
                .collect::<Vec<_>>(),
        )),
        DataValue::EnumNumber(x) => Int32(x),
        DataValue::U128(x) => UInt128(x),
        DataValue::Map(x) => {
            // Take ownership of the entries so no value has to be cloned and
            // no second lookup (with a panicking `expect`) is needed.
            let mut entries: Vec<_> = x.into_iter().collect();
            // Keys are unique, so an unstable sort gives a deterministic order.
            entries.sort_unstable_by(|(left, _), (right, _)| left.cmp(right));

            let mut values = Vec::with_capacity(entries.len());
            let mut fields = Vec::with_capacity(entries.len());
            for (key, value) in entries {
                let dtype = crate::detect_dtype(&value);
                fields.push(Field::new(key.as_str().into(), polars_dtype(dtype)));
                values.push(into_polars_value(value));
            }
            StructOwned(Box::new((values, fields)))
        }
    }
}
233
/// Converts a polars `AnyValue` into a [`DataValue`].
///
/// * `UInt16` is widened to `U32`; `Int8` and `Int16` are widened to `I32`.
/// * `UInt128` maps to `U128`, mirroring what [`into_polars_value`] emits for
///   `DataValue::U128`.
/// * `List` is converted element by element into `DataValue::Vec`.
/// * `StructOwned` becomes a `DataValue::Map` keyed by field name; when two
///   fields share a name the later one wins.
/// * Any other variant is logged with `tracing::warn!` and converted to
///   `DataValue::Null`.
#[cfg(feature = "polars-df")]
pub fn from_polars_value(dv: polars::prelude::AnyValue<'_>) -> DataValue {
    use polars::prelude::AnyValue::*;
    match dv {
        Null => DataValue::Null,
        Boolean(v) => v.into(),
        String(v) => DataValue::String(v.into()),
        UInt8(v) => DataValue::U8(v),
        UInt16(v) => DataValue::U32(u32::from(v)),
        UInt32(v) => v.into(),
        UInt64(v) => v.into(),
        // Without this arm a U128 written by `into_polars_value` would come
        // back as `Null` through the catch-all below.
        UInt128(v) => DataValue::U128(v),
        Int8(v) => i32::from(v).into(),
        Int16(v) => i32::from(v).into(),
        Int32(v) => v.into(),
        Int64(v) => v.into(),
        Float32(v) => v.into(),
        Float64(v) => v.into(),
        Int128(v) => v.into(),
        List(series) => DataValue::Vec(series.iter().map(from_polars_value).collect::<Vec<_>>()),
        StringOwned(v) => DataValue::String(v.as_str().into()),
        Binary(v) => DataValue::Bytes(v.to_owned()),
        BinaryOwned(v) => DataValue::Bytes(v),
        StructOwned(m) => {
            // The box holds (values, fields) in matching order.
            let (values, fields) = *m;
            let hm: std::collections::HashMap<smartstring::alias::String, DataValue> = fields
                .into_iter()
                .zip(values)
                .map(|(field, value)| (field.name.as_str().into(), from_polars_value(value)))
                .collect();
            DataValue::Map(hm)
        }
        e => {
            tracing::warn!("Unsupported polars value: {e:?}");
            DataValue::Null
        }
    }
}
273
274impl From<ColumnFrame> for DataFrame {
275 fn from(dataframe: ColumnFrame) -> Self {
276 Self::new(dataframe)
277 }
278}
279
280impl From<Vec<std::collections::HashMap<Key, DataValue>>> for DataFrame {
281 fn from(dataframe: Vec<std::collections::HashMap<Key, DataValue>>) -> Self {
282 Self::new(ColumnFrame::from(dataframe))
283 }
284}
285
286impl From<Vec<HashMap<Key, DataValue>>> for DataFrame {
287 fn from(dataframe: Vec<HashMap<Key, DataValue>>) -> Self {
288 Self::new(ColumnFrame::from(dataframe))
289 }
290}
291
292impl From<std::collections::HashMap<String, Vec<DataValue>>> for DataFrame {
293 fn from(dataframe: std::collections::HashMap<String, Vec<DataValue>>) -> Self {
294 Self::new(ColumnFrame::from(dataframe))
295 }
296}
297
298impl From<MLChefMap> for DataFrame {
299 fn from(dataframe: MLChefMap) -> Self {
300 Self::new(ColumnFrame::from(dataframe))
301 }
302}
303impl From<Vec<(Key, Vec<DataValue>)>> for DataFrame {
304 fn from(dataframe: Vec<(Key, Vec<DataValue>)>) -> Self {
305 Self::new(ColumnFrame::from(dataframe))
306 }
307}
308
309impl From<std::collections::HashMap<String, Array1<DataValue>>> for DataFrame {
310 fn from(dataframe: std::collections::HashMap<String, Array1<DataValue>>) -> Self {
311 Self::new(ColumnFrame::from(dataframe))
312 }
313}
314
/// Builds a frame from a polars `DataFrame` via its `ColumnFrame` conversion.
#[cfg(feature = "polars-df")]
impl From<polars::prelude::DataFrame> for DataFrame {
    fn from(dataframe: polars::prelude::DataFrame) -> Self {
        let frame: ColumnFrame = dataframe.into();
        DataFrame::new(frame)
    }
}
321#[cfg(test)]
322mod test {
323 use crate::filter::FilterRules;
324
325 use super::*;
326 use halfbrown::hashmap;
327 #[cfg(feature = "polars-df")]
328 use polars::prelude::NamedFrom as _;
329 use rstest::*;
330 use tracing_test::traced_test;
331 #[fixture]
332 fn dummy_candidates() -> ColumnFrame {
333 ColumnFrame::from(vec![
334 hashmap! {
335 "key1".into() => 1.into(),
336 "key2".into() => "a".into(),
337 },
338 hashmap! {
339 "key1".into() => 2.into(),
340 "key2".into() => "b".into(),
341 },
342 ])
343 }
344
345 #[rstest]
346 fn test_serde() {
347 let df = crate::df! {
348 "a" => [1u64, 2u64, 3u64],
349 "b" => [4u64, 5u64, 6u64],
350 "c" => [7u64, 8u64, 9u64]
351 };
352
353 let serialized = serde_json::to_string(&df).expect("BUG: Unable to serialize dataframe");
354
355 let deserialized =
356 serde_json::from_str(&serialized).expect("BUG: Unable to deserialize dataframe");
357
358 assert_eq!(df, deserialized);
359 }
360
/// Converting a polars frame must give the same selection as the natively
/// built frame with identical content.
#[cfg(feature = "polars-df")]
#[rstest]
fn test_polars() {
    let native = crate::df! {
        "a" => [1u64, 2u64, 3u64],
        "b" => [4f64, 5f64, 6f64],
        "c" => [7i64, 8i64, 9i64]
    };

    let from_polars: DataFrame = polars::df!(
        "a" => [1u64, 2u64, 3u64],
        "b" => [4f64, 5f64, 6f64],
        "c" => [7i64, 8i64, 9i64]
    )
    .expect("BUG: should be ok")
    .into();

    let keys: Vec<Key> = vec!["a".into(), "b".into(), "c".into()];
    let converted_selection = from_polars.select(Some(keys.as_slice()));
    let native_selection = native.select(Some(keys.as_slice()));
    assert_eq!(converted_selection, native_selection);
}
383
/// Every supported `DataValue` must convert to the expected polars value and
/// convert back to the original value.
#[cfg(feature = "polars-df")]
#[rstest]
#[case::str(
    DataValue::String("test".into()),
    polars::prelude::AnyValue::String("test".into())
)]
#[case::u32(DataValue::U32(u32::MAX), polars::prelude::AnyValue::UInt32(u32::MAX))]
#[case::i32(DataValue::I32(i32::MIN), polars::prelude::AnyValue::Int32(i32::MIN))]
#[case::i64(DataValue::I64(i64::MIN), polars::prelude::AnyValue::Int64(i64::MIN))]
#[case::u64(DataValue::U64(u64::MIN), polars::prelude::AnyValue::UInt64(u64::MIN))]
#[case::f32(DataValue::F32(f32::MIN), polars::prelude::AnyValue::Float32(f32::MIN))]
#[case::f64(DataValue::F64(f64::MIN), polars::prelude::AnyValue::Float64(f64::MIN))]
#[case::null(DataValue::Null, polars::prelude::AnyValue::Null)]
#[case::i128(
    DataValue::I128(i128::MIN),
    polars::prelude::AnyValue::Int128(i128::MIN)
)]
#[case::u8(DataValue::U8(255), polars::prelude::AnyValue::UInt8(255))]
#[case::bool(DataValue::Bool(true), polars::prelude::AnyValue::Boolean(true))]
#[case::bytes(
    DataValue::Bytes("aaaaa".as_bytes().to_vec()),
    polars::prelude::AnyValue::BinaryOwned("aaaaa".as_bytes().to_vec())
)]
#[case::vec_uints(
    DataValue::Vec(vec![DataValue::U32(0), DataValue::U32(1)]),
    polars::prelude::AnyValue::List(polars::series::Series::new(
        "v".into(),
        vec![polars::prelude::AnyValue::UInt32(0u32), polars::prelude::AnyValue::UInt32(1)]
    ))
)]
#[case::map(
    DataValue::Map(data_value::stdhashmap!("a" => 0u64, "b" => "s")),
    polars::prelude::AnyValue::StructOwned(Box::new((
        vec![polars::prelude::AnyValue::UInt64(0u64), polars::prelude::AnyValue::String("s".into())],
        vec![
            polars::prelude::Field::new("a".into(), polars::prelude::DataType::UInt64),
            polars::prelude::Field::new("b".into(), polars::prelude::DataType::String)
        ]
    )))
)]
fn into_polars_value_test(
    #[case] input: DataValue,
    #[case] output: polars::prelude::AnyValue<'static>,
) {
    // Forward direction.
    let converted = into_polars_value(input.clone());
    assert_eq!(converted, output);

    // Backward direction.
    let restored = from_polars_value(output);
    assert_eq!(restored, input);
}
415
416 #[rstest]
417 #[case(
418 DataFrame::new(crate::column_frame! {
419 "a" => [1f64, 2f64, 3f64],
420 "b" => [4i64, 5i64, 6i64],
421 "c" => [7i64, 8i64, 9i64]
422 }),
423 DataFrame::new(crate::column_frame! {
424 "a" => [1f64, 2f64],
425 "b" => [4i64, 5i64],
426 "c" => [7i64, 8i64]
427 }),
428 FilterRules::try_from("a >= 1f64 && (b <= 5 || c <= 8) && b >= 4").expect("BUG: cannot create filter rules"),
429 )]
430 #[case(
431 DataFrame::new(crate::column_frame! {
432 "a" => [1f64, 2f64, 3f64],
433 "b" => [4i64, 5i64, 6i64],
434 "c" => [7i64, 8i64, 9i64]
435 }),
436 DataFrame::new(crate::column_frame! {
437 "a" => [2f64],
438 "b" => [5i64],
439 "c" => [8i64]
440 }),
441 FilterRules::try_from("a % 2f64 == 0f64").expect("BUG: cannot create filter rules"),
442 )]
443 #[traced_test]
444 fn filter_test(
445 #[case] df: DataFrame,
446 #[case] expected: DataFrame,
447 #[case] filter: FilterRules,
448 ) {
449 let filtered = df.filter(&filter).expect("BUG: cannot filter");
450 assert_eq!(filtered, expected);
451 }
452
453 #[rstest]
454 fn test_serde_complex() {
455 let simple = r#"
456{
457 "constants": {},
458 "dataframe": {
459 "index": {
460 "keys": [
461 {
462 "key": 3162770485,
463 "name": "a",
464 "ctype": "U32"
465 },
466 {
467 "key": 2279056742,
468 "name": "b",
469 "ctype": "F64"
470 },
471 {
472 "key": 2994984227,
473 "name": "c",
474 "ctype": "U64"
475 },
476 {
477 "key": 3319645144,
478 "name": "d",
479 "ctype": "F64"
480 },
481 {
482 "key": 1291847470,
483 "name": "e",
484 "ctype": "U32"
485 },
486 {
487 "key": 874241070,
488 "name": "f",
489 "ctype": "Bool"
490 }
491 ],
492 "indexes": {
493 "a": 0,
494 "b": 1,
495 "c": 2,
496 "d": 3,
497 "e": 4,
498 "f": 5
499 },
500 "alias": {}
501 },
502 "data_frame": {
503 "v": 1,
504 "dim": [
505 2,
506 6
507 ],
508 "data": [
509 253780,
510 0.009369421750307085,
511 1633222860381359,
512 8,
513 5,
514 true,
515 64512,
516 0.003391335718333721,
517 1633222860810557,
518 8,
519 5,
520 null
521 ]
522 }
523 },
524 "metadata": {}
525}
526 "#;
527
528 let simple_deserialized: DataFrame =
529 serde_json::from_str(simple).expect("BUG: Unable to deserialize dataframe");
530
531 println!("deserialized: {simple_deserialized:?}");
532 let array = format!("[{}, {}, {}]", simple, simple, simple);
533 let deserialized: Vec<DataFrame> =
534 serde_json::from_str(&array).expect("BUG: Unable to deserialize dataframe");
535
536 println!("deserialized: {deserialized:?}");
537 assert_eq!(deserialized.len(), 3);
538 assert_eq!(simple_deserialized, deserialized[0]);
539 }
540
541 #[rstest]
542 #[case(hashmap!("key1".into() => vec![1.into(), 2.into()], "key2".into() => vec!["a".into()]))]
543 #[case(data_value::stdhashmap!("key1" => vec![1, 2], "key2" => vec!["a"]))]
544 #[case(vec![hashmap! {
545 "key1".into() => 1.into(),
546 "key2".into() => "a".into(),
547 },
548 hashmap! {
549 "key1".into() => 2.into(),
550 },])]
551 #[case(vec![data_value::stdhashmap! {
552 "key1" => DataValue::from(1),
553 "key2" => DataValue::from("a"),
554 },data_value::stdhashmap! {
555 "key1" => DataValue::from(2),
556 },])]
557 #[case(vec![("key1".into(), vec! [DataValue::from(1), DataValue::from(2)]), ("key2".into(),
558 vec![DataValue::from("a"), DataValue::Null])])]
559 fn test_select_column<T: Into<DataFrame>>(#[case] input: T) {
560 let df: DataFrame = input.into();
561 assert_eq!(
562 df,
563 DataFrame {
564 constants: HashMap::new(),
565 dataframe: ColumnFrame::from(vec![
566 hashmap! {
567 "key1".into() => 1.into(),
568 "key2".into() => "a".into(),
569 },
570 hashmap! {
571 "key1".into() => 2.into(),
572 },
573 ]),
574 metadata: HashMap::new(),
575 }
576 );
577 let selected_transposed = df.select_column("key1".into());
578 assert!(selected_transposed.is_some());
579 let selected_transposed = selected_transposed.unwrap();
580 assert_eq!(selected_transposed.len(), 2);
581 assert_eq!(selected_transposed, ndarray::array![1.into(), 2.into()]);
582 }
583
584 #[rstest]
585 #[case::hhm(hashmap!("key1".into() => vec![1.into(), 2.into()], "key2".into() => vec!["a".into()]))]
586 #[case::stdhm(data_value::stdhashmap!("key1" => vec![1, 2], "key2" => vec!["a"]))]
587 #[case::hm({
588 let hm: std::collections::HashMap<String, Array1<DataValue>> = data_value::stdhashmap!("key1".to_string() => Array1::from_vec(vec![DataValue::from(1), DataValue::from(2)]), "key2".to_string() => Array1::from_vec(vec![DataValue::from("a"), DataValue::Null]));
589 hm
590 })]
591 #[case::vec_hhm(vec![hashmap! {
592 "key1".into() => 1.into(),
593 "key2".into() => "a".into(),
594 },
595 hashmap! {
596 "key1".into() => 2.into(),
597 },])]
598 #[case::vec_hme(vec![data_value::stdhashmap! {
599 "key1" => DataValue::from(1),
600 "key2" => DataValue::from("a"),
601 },data_value::stdhashmap! {
602 "key1" => DataValue::from(2),
603 },])]
604 #[case::vec_vec(vec![("key1".into(), vec! [DataValue::from(1), DataValue::from(2)]), ("key2".into(), vec![DataValue::from("a"), DataValue::Null])])]
605 fn test_from_conversion<T: Into<DataFrame>>(#[case] input: T) {
606 let df: DataFrame = input.into();
607 let expected: DataFrame = DataFrame {
608 constants: HashMap::new(),
609 dataframe: ColumnFrame::from(vec![
610 hashmap! {
611 "key1".into() => 1.into(),
612 "key2".into() => "a".into(),
613 },
614 hashmap! {
615 "key1".into() => 2.into(),
616 },
617 ]),
618 metadata: HashMap::new(),
619 };
620 assert_eq!(
621 df.select(Some(&["key1".into(), "key2".into()])),
622 expected.select(Some(&["key1".into(), "key2".into()])),
623 "{df} vs {expected}"
624 );
625 let selected_transposed = df.select_transposed_typed::<i32>(&["key1".into()]);
626 assert_eq!(selected_transposed.len(), 2);
627 println!("{:?}", selected_transposed);
628 assert_eq!(selected_transposed, vec![vec![1], vec![2]]);
629 }
630 #[rstest]
631 fn test_dataframe(dummy_candidates: ColumnFrame) {
632 let mut dataframe: DataFrame = DataFrame::default();
633 assert!(dataframe.is_empty());
634 assert!(dataframe.extend(dummy_candidates.into()).is_ok());
635 assert_eq!(dataframe.len(), 2);
636
637 let candidate = hashmap! {
638 "key1".into() => 3.into(),
639 "key2".into() => "c".into(),
640 };
641
642 assert!(dataframe.push(candidate).is_ok());
643 assert_eq!(dataframe.len(), 3);
644 assert!(!dataframe.is_empty());
645
646 dataframe.insert_constant("key3".into(), 4.into());
647 assert_eq!(dataframe.constants.len(), 1);
648 assert!(dataframe
649 .apply_function(&["key1".into()], |keys, df| {
650 let key = keys[0].clone();
651 let s = df
652 .get_single_column(&key)
653 .expect("BUG: Cannot get column")
654 .to_owned();
655 let s = s.mapv(|x| x + DataValue::from(1));
656 df.add_single_column("key5", s)?;
657 Ok(())
658 })
659 .is_ok());
660 let original = dataframe.clone();
661 dataframe.shrink();
662 let remove_df = dataframe.remove_column(&["key1".into()]);
663 assert!(remove_df.is_ok());
664 let mut remove_df = remove_df.unwrap();
665 assert_eq!(remove_df.len(), 3);
666 let selected = dataframe.select(Some(&["key2".into()]));
667 assert!(selected.is_ok());
668 let selected = selected.unwrap();
669 println!("{:?}", selected);
670 assert_eq!(selected.len(), 3);
671
672 let joined_result =
674 remove_df.join(dataframe, &JoinRelation::new(crate::JoinBy::AddColumns));
675 assert!(joined_result.is_ok(), "{:?}", joined_result);
676 assert_eq!(original, remove_df);
677 }
678
679 #[rstest]
680 fn test_metadata(dummy_candidates: ColumnFrame) {
681 let mut dataframe: DataFrame = DataFrame::default();
682 assert!(dataframe.is_empty());
683 println!("{:?}", dataframe);
684 assert!(dataframe.extend(dummy_candidates.into()).is_ok());
685 println!("{:?}", dataframe);
686 assert_eq!(dataframe.len(), 2);
687
688 dataframe.add_metadata("test".into(), 1.into());
689 assert_eq!(dataframe.get_metadata("test"), Some(&1.into()));
690 let dataframe = DataFrame::new(ColumnFrame::from(vec![
691 hashmap! {
692 "key1".into() => 1.into(),
693 "key2".into() => "a".into(),
694 },
695 hashmap! {
696 "key1".into() => 2.into(),
697 "key2".into() => "b".into(),
698 },
699 ]));
700 assert_eq!(dataframe.get_metadata("test"), None);
701 let tt = dataframe.select_transposed(None);
702 assert!(tt.is_ok());
703 let tt = tt.unwrap();
704 assert_eq!(tt.shape(), [2, 2]);
705 assert_eq!(
706 tt,
707 Array2::from_shape_vec((2, 2), vec![1.into(), 2.into(), "a".into(), "b".into()])
708 .unwrap()
709 );
710 }
711
712 #[rstest]
713 #[traced_test]
714 fn add_single_column_test() {
715 let mut dataframe = DataFrame::default();
716 let values = Array1::from(vec![1.into(), 2.into(), 3.into()]);
717 let r = dataframe.add_single_column("key1", values);
718 assert!(r.is_ok(), "{r:?}");
719 let selected = dataframe.select(None);
720 assert!(selected.is_ok());
721 let selected = selected.unwrap();
722 assert_eq!(selected.shape(), [3, 1]);
723 assert_eq!(
724 selected,
725 Array2::from_shape_vec((3, 1), vec![1.into(), 2.into(), 3.into()]).unwrap()
726 );
727 let values = Array1::from(vec![1.into(), 2.into()]);
728 assert!(dataframe.add_single_column("key1", values).is_err());
729 let values = Array1::from(vec![3.into(), 4.into(), 5.into()]);
730 assert!(dataframe.add_single_column("key2", values).is_ok());
731 let values = Array1::from(vec![3.into()]);
732 assert!(dataframe.add_single_column("key3", values).is_err());
733 }
734
735 #[rstest]
736 #[traced_test]
737 fn add_single_column_empty_test() {
738 let mut dataframe = DataFrame::default();
739 let values = Array1::from(vec![]);
740 let r = dataframe.add_single_column("key1", values);
741 assert!(r.is_ok(), "{r:?}");
742 let selected = dataframe.select(None);
743 assert!(selected.is_ok());
744 let selected = selected.unwrap();
745 assert_eq!(selected.shape(), [0, 1]);
746 assert_eq!(selected, Array2::from_shape_vec((0, 1), vec![]).unwrap());
747 let values = Array1::from(vec![1.into(), 2.into()]);
748 assert!(dataframe.add_single_column("key1", values).is_err());
749 let values = Array1::from(vec![3.into(), 4.into(), 5.into()]);
750 assert!(dataframe.add_single_column("key2", values).is_ok());
751 let values = Array1::from(vec![3.into(), 4.into()]);
752 assert!(dataframe.add_single_column("key3", values).is_err());
753 let values = Array1::from(vec![3.into(), 4.into(), 5.into()]);
754 assert!(dataframe.add_single_column("key3", values).is_ok());
755
756 assert_eq!(
757 dataframe
758 .select_column("key1".into())
759 .expect("BUG: has to exists"),
760 ndarray::arr1(&[DataValue::Null, DataValue::Null, DataValue::Null]),
761 );
762 assert_eq!(
763 dataframe
764 .select_column("key2".into())
765 .expect("BUG: has to exists"),
766 ndarray::arr1(&[3.into(), 4.into(), 5.into()]),
767 );
768 assert_eq!(
769 dataframe.select(None).expect("BUG: cannot get data"),
770 ndarray::arr2(&[
771 [DataValue::Null, 3.into(), 3.into()],
772 [DataValue::Null, 4.into(), 4.into()],
773 [DataValue::Null, 5.into(), 5.into()],
774 ])
775 );
776 }
777
778 #[rstest]
779 #[case(
780 DataFrame::new(ColumnFrame::from(vec![
781 hashmap! {
782 "k".into() => 1.into(),
783 "k2".into() => 2.into(),
784 "k3".into() => 2.2.into(),
785 },
786 hashmap! {
787 "k".into() => 11.into(),
788 "k2".into() => 3.into(),
789 },
790 hashmap! {
791 "k".into() => 4.into(),
792 "k2".into() => 5.into(),
793 "k3".into() => 2.3.into(),
794 },
795 hashmap! {
796 "k".into() => 4.into(),
797 "k2".into() => 5.into(),
798 "k3".into() => 2.4.into(),
799 },
800 ])),
801 vec!["k".into(), "k2".into()],
802 Array2::from_shape_vec((4, 2), vec![1.into(), 2.into(), 11.into(), 3.into(), 4.into(), 5.into(), 4.into(), 5.into()]).unwrap()
803 )]
804 #[case(
805 DataFrame::new(ColumnFrame::from(vec![
806 hashmap! {
807 "k".into() => 1.into(),
808 "k2".into() => 2.into(),
809 "k3".into() => 2.2.into(),
810 },
811 hashmap! {
812 "k".into() => 11.into(),
813 "k2".into() => 3.into(),
814 },
815 hashmap! {
816 "k".into() => 4.into(),
817 "k2".into() => 5.into(),
818 "k3".into() => 2.3.into(),
819 },
820 hashmap! {
821 "k".into() => 4.into(),
822 "k2".into() => 5.into(),
823 "k3".into() => 2.4.into(),
824 },
825 ])),
826 vec!["k2".into(), "k3".into(), "nonexist1".into(), "nonexists2".into(), "k".into()],
827 Array2::from_shape_vec((4, 5), vec![
828 2.into(), 2.2.into(), DataValue::Null, DataValue::Null, 1.into(),
829 3.into(), DataValue::Null, DataValue::Null, DataValue::Null, 11.into(),
830 5.into(), 2.3.into(), DataValue::Null, DataValue::Null, 4.into(),
831 5.into(), 2.4.into(), DataValue::Null, DataValue::Null, 4.into()]).unwrap()
832 )]
833 #[traced_test]
834 fn select_multiple(
835 #[case] input: DataFrame,
836 #[case] columns: Vec<Key>,
837 #[case] expected: Array2<DataValue>,
838 ) {
839 let selected = input.select(Some(&columns));
840 assert!(selected.is_ok());
841 let selected = selected.unwrap();
842
843 assert_eq!(selected, expected);
844 }
845
846 #[rstest]
847 #[case(
848 DataFrame::new(ColumnFrame::from(vec![
849 hashmap! {
850 "k".into() => 1.into(),
851 "k2".into() => 2.into(),
852 "k3".into() => 2.2.into(),
853 },
854 hashmap! {
855 "k".into() => 11.into(),
856 "k2".into() => 3.into(),
857 },
858 hashmap! {
859 "k".into() => 4.into(),
860 "k2".into() => 5.into(),
861 "k3".into() => 2.3.into(),
862 },
863 hashmap! {
864 "k".into() => 4.into(),
865 "k2".into() => 5.into(),
866 "k3".into() => 2.4.into(),
867 },
868 ])),
869 "k".into(),
870 Array2::from_shape_vec((4, 3), vec![
871 1.into(), 2.into(), 2.2.into(),
872 4.into(), 5.into(), 2.3.into(),
873 4.into(), 5.into(), 2.4.into(),
874 11.into(), 3.into(), DataValue::Null,
875 ]
876 ).unwrap(),
877 vec!["k".into(), "k2".into(), "k3".into()],
878 )]
879 #[rstest]
880 #[case(
881 DataFrame::new(ColumnFrame::from(vec![
882 hashmap! {
883 "k".into() => 1.into(),
884 "k2".into() => 2.into(),
885 "k3".into() => 2.2.into(),
886 },
887 hashmap! {
888 "k".into() => 11.into(),
889 "k2".into() => 3.into(),
890 },
891 hashmap! {
892 "k".into() => 4.into(),
893 "k2".into() => 5.into(),
894 "k3".into() => 2.3.into(),
895 },
896 hashmap! {
897 "k".into() => 4.into(),
898 "k2".into() => 5.into(),
899 "k3".into() => 2.4.into(),
900 },
901 ])),
902 "k3".into(),
903 Array2::from_shape_vec((4, 3), vec![
904 11.into(), 3.into(), DataValue::Null,
905 1.into(), 2.into(), 2.2.into(),
906 4.into(), 5.into(), 2.3.into(),
907 4.into(), 5.into(), 2.4.into(),
908 ]
909 ).unwrap(),
910 vec!["k".into(), "k2".into(), "k3".into()],
911 )]
912 #[case(
913 DataFrame::new(ColumnFrame::from(vec![
914 hashmap! {
915 "k".into() => 2.into(),
916 "k2".into() => 0.000001.into(),
917 },
918 hashmap! {
919 "k".into() => 1.into(),
920 "k2".into() =>0.0000001.into(),
921 },
922 hashmap! {
923 "k".into() => 3.into(),
924 "k2".into() => 0.00001.into(),
925 },
926 hashmap! {
927 "k".into() => 4.into(),
928 "k2".into() => 0.001.into(),
929 },
930 ])),
931 "k2".into(),
932 Array2::from_shape_vec((4, 2), vec![
933 1.into(), 0.0000001.into(),
934 2.into(), 0.000001.into(),
935 3.into(), 0.00001.into(),
936 4.into(), 0.001.into(),
937 ]
938 ).unwrap(),
939 vec!["k".into(), "k2".into()],
940 )]
941 #[case(
942 DataFrame::new(ColumnFrame::from(vec![
943 hashmap! {
944 "k".into() => 2.into(),
945 "k2".into() => "b".into(),
946 },
947 hashmap! {
948 "k".into() => 1.into(),
949 "k2".into() =>"a".into(),
950 },
951 hashmap! {
952 "k".into() => 3.into(),
953 "k2".into() =>"c".into(),
954 },
955 hashmap! {
956 "k".into() => 4.into(),
957 "k2".into() =>"z".into(),
958 },
959 ])),
960 "k2".into(),
961 Array2::from_shape_vec((4, 2), vec![
962 1.into(),"a".into(),
963 2.into(), "b".into(),
964 3.into(), "c".into(),
965 4.into(), "z".into(),
966 ]
967 ).unwrap(),
968 vec!["k".into(), "k2".into()],
969 )]
970 #[traced_test]
971 fn sort_by(
972 #[case] input: DataFrame,
973 #[case] column: Key,
974 #[case] expected: Array2<DataValue>,
975 #[case] columns: Vec<Key>,
976 ) {
977 let result = input.sorted(&column);
978 assert!(result.is_ok(), "{result:?}");
979 let result = result.unwrap().get_sorted();
980 let selected = result.select(Some(&columns));
981
982 assert_eq!(selected, expected);
983 }
984 #[rstest]
985 #[case(
986 DataFrame::new(ColumnFrame::from(vec![
987 hashmap! {
988 "k".into() => 2.into(),
989 "k2".into() => 0.000001.into(),
990 },
991 hashmap! {
992 "k".into() => 1.into(),
993 "k2".into() =>0.0000001.into(),
994 },
995 hashmap! {
996 "k".into() => 3.into(),
997 "k2".into() => 0.00001.into(),
998 },
999 hashmap! {
1000 "k".into() => 4.into(),
1001 "k2".into() => 0.001.into(),
1002 },
1003 ])),
1004 "k2".into(),
1005 TopN::Last(1),
1006 Array2::from_shape_vec((1, 2), vec![
1007 4.into(), 0.001.into(),
1008 ]
1009 ).unwrap(),
1010 vec!["k".into(), "k2".into()],
1011 )]
1012 #[case(
1013 DataFrame::new(ColumnFrame::from(vec![
1014 hashmap! {
1015 "k".into() => 2.into(),
1016 "k2".into() => 0.000001.into(),
1017 },
1018 hashmap! {
1019 "k".into() => 1.into(),
1020 "k2".into() =>0.0000001.into(),
1021 },
1022 hashmap! {
1023 "k".into() => 3.into(),
1024 "k2".into() => 0.00001.into(),
1025 },
1026 hashmap! {
1027 "k".into() => 4.into(),
1028 "k2".into() => 0.001.into(),
1029 },
1030 ])),
1031 "k2".into(),
1032 TopN::Last(2),
1033 Array2::from_shape_vec((2, 2), vec![
1034 4.into(), 0.001.into(),
1035 3.into(), 0.00001.into(),
1036 ]
1037 ).unwrap(),
1038 vec!["k".into(), "k2".into()],
1039 )]
1040 #[case(
1041 DataFrame::new(ColumnFrame::from(vec![
1042 hashmap! {
1043 "k".into() => 2.into(),
1044 "k2".into() => "b".into(),
1045 },
1046 hashmap! {
1047 "k".into() => 1.into(),
1048 "k2".into() =>"a".into(),
1049 },
1050 hashmap! {
1051 "k".into() => 3.into(),
1052 "k2".into() =>"c".into(),
1053 },
1054 hashmap! {
1055 "k".into() => 4.into(),
1056 "k2".into() =>"z".into(),
1057 },
1058 ])),
1059 "k2".into(),
1060 TopN::First(1),
1061 Array2::from_shape_vec((1, 2), vec![
1062 1.into(),"a".into(),
1063 ]
1064 ).unwrap(),
1065 vec!["k".into(), "k2".into()],
1066 )]
1067 #[case(
1068 DataFrame::new(ColumnFrame::from(vec![
1069 hashmap! {
1070 "k".into() => 2.into(),
1071 "k2".into() => "b".into(),
1072 },
1073 hashmap! {
1074 "k".into() => 1.into(),
1075 "k2".into() =>"a".into(),
1076 },
1077 hashmap! {
1078 "k".into() => 3.into(),
1079 "k2".into() =>"c".into(),
1080 },
1081 hashmap! {
1082 "k".into() => 4.into(),
1083 "k2".into() =>"z".into(),
1084 },
1085 ])),
1086 "k2".into(),
1087 TopN::First(2),
1088 Array2::from_shape_vec((2, 2), vec![
1089 1.into(),"a".into(),
1090 2.into(),"b".into(),
1091 ]
1092 ).unwrap(),
1093 vec!["k".into(), "k2".into()],
1094 )]
1095 #[traced_test]
1096 fn top_n(
1097 #[case] input: DataFrame,
1098 #[case] column: Key,
1099 #[case] topn: TopN,
1100 #[case] expected: Array2<DataValue>,
1101 #[case] columns: Vec<Key>,
1102 ) {
1103 let result = input.sorted(&column);
1104 assert!(result.is_ok(), "{result:?}");
1105 let result = result.unwrap();
1106 let first = result.topn(topn).unwrap();
1107 let selected = first.select(Some(&columns));
1108 assert_eq!(selected, expected);
1109 }
1110}