1use column_store::sorted_df::SortedDataFrame;
2use data_value::{DataValue, Extract};
3use halfbrown::HashMap;
4use ndarray::{Array1, Array2, ArrayView1};
5use std::fmt;
6pub mod column_store;
7pub mod index;
8pub mod join;
9pub mod key;
10use crate::{error::Error, CandidateData};
11#[cfg(feature = "python")]
12pub mod python;
13
14#[cfg(feature = "python")]
15use pyo3::prelude::*;
16
17use crate::{
18 dataframe::{column_store::ColumnFrame, join::JoinRelation, key::Key},
19 MLChefMap,
20};
21
/// Requests a fixed number of rows from one end of a sorted frame.
///
/// The payload of each variant is the number of rows requested. The `top_n`
/// test in this module passes it to `SortedDataFrame::topn`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum TopN {
    /// Take rows from the start of the sort order.
    First(usize),
    /// Take rows from the end of the sort order.
    Last(usize),
}
27
28#[derive(Debug, Clone, PartialEq, Default, serde::Serialize, serde::Deserialize)]
31#[cfg_attr(feature = "python", pyclass)]
32pub struct DataFrame {
33 pub constants: HashMap<Key, DataValue>,
36 pub dataframe: ColumnFrame,
39 pub metadata: HashMap<String, DataValue>,
41}
42
43impl fmt::Display for DataFrame {
44 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
45 self.dataframe.fmt(f)
46 }
47}
48
49impl DataFrame {
50 pub fn new<C: Into<ColumnFrame>>(dataframe: C) -> Self {
51 Self {
52 constants: HashMap::new(),
53 dataframe: dataframe.into(),
54 metadata: HashMap::new(),
55 }
56 }
57
58 pub fn shrink(&mut self) {
59 self.dataframe.shrink();
60 }
61
62 pub fn add_metadata(&mut self, key: String, value: DataValue) {
63 self.metadata.insert(key, value);
64 }
65
66 pub fn get_metadata(&self, key: &str) -> Option<&DataValue> {
67 self.metadata.get(key)
68 }
69
70 pub fn join(&mut self, other: Self, join_type: &JoinRelation) -> Result<(), Error> {
71 other.constants.into_iter().for_each(|(key, value)| {
72 self.constants.insert(key, value);
73 });
74 self.dataframe.join(other.dataframe, join_type)
75 }
76
77 pub fn apply_function<F>(&mut self, keys: &[Key], mut func: F) -> Result<(), Error>
78 where
79 F: FnMut(&[Key], &mut ColumnFrame) -> Result<(), Error>,
80 {
81 self.dataframe.apply_function(keys, &mut func)
82 }
83
84 pub fn select(&self, keys: Option<&[Key]>) -> Result<Array2<DataValue>, Error> {
85 Ok(self.dataframe.select(keys))
86 }
87
88 pub fn select_transposed_typed<D: Extract>(&self, keys: &[Key]) -> Vec<Vec<D>> {
89 self.dataframe.select_transposed_typed::<D>(keys)
90 }
91
92 pub fn select_column(&self, key: Key) -> Option<ndarray::ArrayView1<DataValue>> {
93 self.dataframe.select_column(&key)
94 }
95
96 pub fn select_transposed(&self, keys: Option<&[Key]>) -> Result<Array2<DataValue>, Error> {
97 self.dataframe.select_transposed(keys)
98 }
99
100 pub fn insert_constant(&mut self, key: Key, value: DataValue) {
101 self.constants.insert(key, value);
102 }
103
104 pub fn push<C: CandidateData>(&mut self, item: C) -> Result<(), Error> {
105 self.dataframe.push(item)
106 }
107
108 pub fn remove_column(&mut self, keys: &[Key]) -> Result<Self, Error> {
109 self.dataframe.remove_column(keys).map(|x| x.into())
110 }
111
112 pub fn extend(&mut self, items: Self) -> Result<(), Error> {
113 self.dataframe.extend(items.dataframe)
114 }
115
116 pub fn len(&self) -> usize {
117 self.dataframe.len()
118 }
119
120 pub fn is_empty(&self) -> bool {
121 self.dataframe.is_empty()
122 }
123
124 pub fn add_single_column<K: Into<Key>>(
125 &mut self,
126 key: K,
127 values: Array1<DataValue>,
128 ) -> Result<(), Error> {
129 self.dataframe.add_single_column(key, values)
130 }
131
132 pub fn get_single_column(&self, key: &Key) -> Option<ArrayView1<DataValue>> {
133 self.dataframe.get_single_column(key)
134 }
135
136 pub fn sorted(&self, key: &Key) -> Result<SortedDataFrame<'_>, Error> {
137 self.dataframe.sorted(key)
138 }
139
140 pub fn filter(&self, filter: &crate::filter::FilterRules) -> Result<Self, Error> {
141 let filtered_df = self.dataframe.filter(filter)?;
142 Ok(Self {
143 constants: self.constants.clone(),
144 dataframe: filtered_df,
145 metadata: self.metadata.clone(),
146 })
147 }
148
149 #[cfg(feature = "polars-df")]
150 pub fn as_polars(&self) -> Result<polars::prelude::DataFrame, Error> {
151 let mut columns = vec![];
152 for key in self.dataframe.keys() {
153 columns.push(polars::prelude::Column::new(
154 key.name().into(),
155 self.dataframe
156 .get_single_column(key)
157 .ok_or_else(|| Error::NotFound(key.clone()))?
158 .into_iter()
159 .map(|x| into_polars_value(x.clone()))
160 .collect::<Vec<_>>(),
161 ));
162 }
163
164 Ok(polars::prelude::DataFrame::new(columns)?)
165 }
166}
167
/// Converts a [`DataValue`] into an owned polars `AnyValue`.
///
/// Every supported variant maps to the polars variant of the same width, so
/// that [`from_polars_value`] restores the original value. Nested
/// `DataValue::Vec` values are converted recursively into a polars `List`
/// backed by a series named `"v"`.
///
/// # Panics
/// Panics (`unimplemented!`) for `DataValue::EnumNumber`, `DataValue::U128`
/// and `DataValue::Map`, which have no conversion here.
#[cfg(feature = "polars-df")]
pub fn into_polars_value(dv: DataValue) -> polars::prelude::AnyValue<'static> {
    use polars::prelude::AnyValue::*;
    // `NamedFrom` provides `Series::new(name, values)`.
    use polars::prelude::NamedFrom;
    match dv {
        DataValue::String(smart_string) => StringOwned(smart_string.as_str().into()),
        DataValue::Bytes(items) => BinaryOwned(items),
        // Keep the 8-bit width: widening to UInt16 would come back from
        // `from_polars_value` as `DataValue::U32` instead of `DataValue::U8`.
        DataValue::U8(x) => UInt8(x),
        DataValue::Bool(x) => Boolean(x),
        DataValue::I32(x) => Int32(x),
        DataValue::U32(x) => UInt32(x),
        DataValue::I64(x) => Int64(x),
        DataValue::U64(x) => UInt64(x),
        DataValue::I128(x) => Int128(x),
        DataValue::F32(x) => Float32(x),
        DataValue::F64(x) => Float64(x),
        DataValue::Null => Null,
        DataValue::Vec(data_values) => List(polars::series::Series::new(
            "v".into(),
            data_values
                .into_iter()
                .map(into_polars_value)
                .collect::<Vec<_>>(),
        )),
        DataValue::EnumNumber(_) => unimplemented!(),
        DataValue::U128(_) => unimplemented!(),
        DataValue::Map(_) => unimplemented!(),
    }
}
197
/// Converts a polars `AnyValue` into a [`DataValue`].
///
/// Narrow integer types without a dedicated `DataValue` variant are widened
/// losslessly (`UInt16` to `U32`, `Int8`/`Int16` to `I32`). Any polars
/// variant not listed below is logged with `tracing::warn!` and mapped to
/// `DataValue::Null`.
#[cfg(feature = "polars-df")]
pub fn from_polars_value(dv: polars::prelude::AnyValue<'_>) -> DataValue {
    use polars::prelude::AnyValue::*;
    match dv {
        Null => DataValue::Null,
        Boolean(v) => v.into(),

        // Unsigned integers.
        UInt8(v) => DataValue::U8(v),
        UInt16(v) => DataValue::U32(u32::from(v)),
        UInt32(v) => v.into(),
        UInt64(v) => v.into(),

        // Signed integers.
        Int8(v) => i32::from(v).into(),
        Int16(v) => i32::from(v).into(),
        Int32(v) => v.into(),
        Int64(v) => v.into(),
        Int128(v) => v.into(),

        // Floating point.
        Float32(v) => v.into(),
        Float64(v) => v.into(),

        // Text and raw bytes, borrowed or owned.
        String(v) => DataValue::String(v.into()),
        StringOwned(v) => DataValue::String(v.as_str().into()),
        Binary(v) => DataValue::Bytes(v.to_owned()),
        BinaryOwned(v) => DataValue::Bytes(v),

        // Nested lists are converted element by element.
        List(series) => DataValue::Vec(series.iter().map(from_polars_value).collect::<Vec<_>>()),

        e => {
            tracing::warn!("Unsupported polars value: {e:?}");
            DataValue::Null
        }
    }
}
229
230impl From<ColumnFrame> for DataFrame {
231 fn from(dataframe: ColumnFrame) -> Self {
232 Self::new(dataframe)
233 }
234}
235
236impl From<Vec<std::collections::HashMap<Key, DataValue>>> for DataFrame {
237 fn from(dataframe: Vec<std::collections::HashMap<Key, DataValue>>) -> Self {
238 Self::new(ColumnFrame::from(dataframe))
239 }
240}
241
242impl From<Vec<HashMap<Key, DataValue>>> for DataFrame {
243 fn from(dataframe: Vec<HashMap<Key, DataValue>>) -> Self {
244 Self::new(ColumnFrame::from(dataframe))
245 }
246}
247
248impl From<std::collections::HashMap<String, Vec<DataValue>>> for DataFrame {
249 fn from(dataframe: std::collections::HashMap<String, Vec<DataValue>>) -> Self {
250 Self::new(ColumnFrame::from(dataframe))
251 }
252}
253
254impl From<MLChefMap> for DataFrame {
255 fn from(dataframe: MLChefMap) -> Self {
256 Self::new(ColumnFrame::from(dataframe))
257 }
258}
259impl From<Vec<(Key, Vec<DataValue>)>> for DataFrame {
260 fn from(dataframe: Vec<(Key, Vec<DataValue>)>) -> Self {
261 Self::new(ColumnFrame::from(dataframe))
262 }
263}
264
265impl From<std::collections::HashMap<String, Array1<DataValue>>> for DataFrame {
266 fn from(dataframe: std::collections::HashMap<String, Array1<DataValue>>) -> Self {
267 Self::new(ColumnFrame::from(dataframe))
268 }
269}
270
/// Imports a polars `DataFrame` by way of the corresponding `ColumnFrame`
/// conversion. Only available with the `polars-df` feature.
#[cfg(feature = "polars-df")]
impl From<polars::prelude::DataFrame> for DataFrame {
    fn from(polars_frame: polars::prelude::DataFrame) -> Self {
        // `new` performs the `Into<ColumnFrame>` conversion itself.
        Self::new(polars_frame)
    }
}
277#[cfg(test)]
278mod test {
279 use crate::filter::FilterRules;
280
281 use super::*;
282 use halfbrown::hashmap;
283 #[cfg(feature = "polars-df")]
284 use polars::prelude::NamedFrom as _;
285 use rstest::*;
286 use tracing_test::traced_test;
287 #[fixture]
288 fn dummy_candidates() -> ColumnFrame {
289 ColumnFrame::from(vec![
290 hashmap! {
291 "key1".into() => 1.into(),
292 "key2".into() => "a".into(),
293 },
294 hashmap! {
295 "key1".into() => 2.into(),
296 "key2".into() => "b".into(),
297 },
298 ])
299 }
300
301 #[rstest]
302 fn test_serde() {
303 let df = crate::df! {
304 "a" => [1u64, 2u64, 3u64],
305 "b" => [4u64, 5u64, 6u64],
306 "c" => [7u64, 8u64, 9u64]
307 };
308
309 let serialized = serde_json::to_string(&df).expect("BUG: Unable to serialize dataframe");
310
311 let deserialized =
312 serde_json::from_str(&serialized).expect("BUG: Unable to deserialize dataframe");
313
314 assert_eq!(df, deserialized);
315 }
316
/// A polars frame converted into a `DataFrame` must select the same values
/// as a frame built natively from identical data.
#[cfg(feature = "polars-df")]
#[rstest]
fn test_polars() {
    let native = crate::df! {
        "a" => [1u64, 2u64, 3u64],
        "b" => [4f64, 5f64, 6f64],
        "c" => [7i64, 8i64, 9i64]
    };

    let imported: DataFrame = polars::df!(
        "a" => [1u64, 2u64, 3u64],
        "b" => [4f64, 5f64, 6f64],
        "c" => [7i64, 8i64, 9i64]
    )
    .expect("BUG: should be ok")
    .into();

    let keys: Vec<Key> = vec!["a".into(), "b".into(), "c".into()];
    let from_polars = imported.select(Some(keys.as_slice()));
    let from_native = native.select(Some(keys.as_slice()));
    assert_eq!(from_polars, from_native);
}
339
/// Round-trip check: each `DataValue` converts to the listed polars value,
/// and that polars value converts back to the original `DataValue`.
#[cfg(feature = "polars-df")]
#[rstest]
#[case::str(DataValue::String("test".into()), polars::prelude::AnyValue::String("test".into()))]
#[case::u32(DataValue::U32(u32::MAX), polars::prelude::AnyValue::UInt32(u32::MAX))]
#[case::i32(DataValue::I32(i32::MIN), polars::prelude::AnyValue::Int32(i32::MIN))]
#[case::i64(DataValue::I64(i64::MIN), polars::prelude::AnyValue::Int64(i64::MIN))]
#[case::u64(DataValue::U64(u64::MIN), polars::prelude::AnyValue::UInt64(u64::MIN))]
#[case::f32(DataValue::F32(f32::MIN), polars::prelude::AnyValue::Float32(f32::MIN))]
#[case::f64(DataValue::F64(f64::MIN), polars::prelude::AnyValue::Float64(f64::MIN))]
#[case::null(DataValue::Null, polars::prelude::AnyValue::Null)]
#[case::i128(
    DataValue::I128(i128::MIN),
    polars::prelude::AnyValue::Int128(i128::MIN)
)]
#[case::u8(DataValue::U8(255), polars::prelude::AnyValue::UInt8(255))]
#[case::bool(DataValue::Bool(true), polars::prelude::AnyValue::Boolean(true))]
#[case::bytes(DataValue::Bytes("aaaaa".as_bytes().to_vec()), polars::prelude::AnyValue::BinaryOwned("aaaaa".as_bytes().to_vec()))]
#[case::vec_uints(DataValue::Vec(vec![DataValue::U32(0), DataValue::U32(1)]), polars::prelude::AnyValue::List(polars::series::Series::new("v".into(), vec![polars::prelude::AnyValue::UInt32(0u32), polars::prelude::AnyValue::UInt32(1)])))]
fn into_polars_value_test(
    #[case] input: DataValue,
    #[case] output: polars::prelude::AnyValue<'static>,
) {
    // Forward direction.
    let converted = into_polars_value(input.clone());
    assert_eq!(converted, output);

    // Backward direction.
    let restored = from_polars_value(output);
    assert_eq!(restored, input);
}
368
369 #[rstest]
370 #[case(
371 DataFrame::new(crate::column_frame! {
372 "a" => [1f64, 2f64, 3f64],
373 "b" => [4i64, 5i64, 6i64],
374 "c" => [7i64, 8i64, 9i64]
375 }),
376 DataFrame::new(crate::column_frame! {
377 "a" => [1f64, 2f64],
378 "b" => [4i64, 5i64],
379 "c" => [7i64, 8i64]
380 }),
381 FilterRules::try_from("a >= 1f64 && (b <= 5 || c <= 8) && b >= 4").expect("BUG: cannot create filter rules"),
382 )]
383 #[case(
384 DataFrame::new(crate::column_frame! {
385 "a" => [1f64, 2f64, 3f64],
386 "b" => [4i64, 5i64, 6i64],
387 "c" => [7i64, 8i64, 9i64]
388 }),
389 DataFrame::new(crate::column_frame! {
390 "a" => [2f64],
391 "b" => [5i64],
392 "c" => [8i64]
393 }),
394 FilterRules::try_from("a % 2f64 == 0f64").expect("BUG: cannot create filter rules"),
395 )]
396 #[traced_test]
397 fn filter_test(
398 #[case] df: DataFrame,
399 #[case] expected: DataFrame,
400 #[case] filter: FilterRules,
401 ) {
402 let filtered = df.filter(&filter).expect("BUG: cannot filter");
403 assert_eq!(filtered, expected);
404 }
405
406 #[rstest]
407 fn test_serde_complex() {
408 let simple = r#"
409{
410 "constants": {},
411 "dataframe": {
412 "index": {
413 "keys": [
414 {
415 "key": 3162770485,
416 "name": "a",
417 "ctype": "U32"
418 },
419 {
420 "key": 2279056742,
421 "name": "b",
422 "ctype": "F64"
423 },
424 {
425 "key": 2994984227,
426 "name": "c",
427 "ctype": "U64"
428 },
429 {
430 "key": 3319645144,
431 "name": "d",
432 "ctype": "F64"
433 },
434 {
435 "key": 1291847470,
436 "name": "e",
437 "ctype": "U32"
438 },
439 {
440 "key": 874241070,
441 "name": "f",
442 "ctype": "Bool"
443 }
444 ],
445 "indexes": {
446 "a": 0,
447 "b": 1,
448 "c": 2,
449 "d": 3,
450 "e": 4,
451 "f": 5
452 },
453 "alias": {}
454 },
455 "data_frame": {
456 "v": 1,
457 "dim": [
458 2,
459 6
460 ],
461 "data": [
462 253780,
463 0.009369421750307085,
464 1633222860381359,
465 8,
466 5,
467 true,
468 64512,
469 0.003391335718333721,
470 1633222860810557,
471 8,
472 5,
473 null
474 ]
475 }
476 },
477 "metadata": {}
478}
479 "#;
480
481 let simple_deserialized: DataFrame =
482 serde_json::from_str(simple).expect("BUG: Unable to deserialize dataframe");
483
484 println!("deserialized: {simple_deserialized:?}");
485 let array = format!("[{}, {}, {}]", simple, simple, simple);
486 let deserialized: Vec<DataFrame> =
487 serde_json::from_str(&array).expect("BUG: Unable to deserialize dataframe");
488
489 println!("deserialized: {deserialized:?}");
490 assert_eq!(deserialized.len(), 3);
491 assert_eq!(simple_deserialized, deserialized[0]);
492 }
493
494 #[rstest]
495 #[case(hashmap!("key1".into() => vec![1.into(), 2.into()], "key2".into() => vec!["a".into()]))]
496 #[case(data_value::stdhashmap!("key1" => vec![1, 2], "key2" => vec!["a"]))]
497 #[case(vec![hashmap! {
498 "key1".into() => 1.into(),
499 "key2".into() => "a".into(),
500 },
501 hashmap! {
502 "key1".into() => 2.into(),
503 },])]
504 #[case(vec![data_value::stdhashmap! {
505 "key1" => DataValue::from(1),
506 "key2" => DataValue::from("a"),
507 },data_value::stdhashmap! {
508 "key1" => DataValue::from(2),
509 },])]
510 #[case(vec![("key1".into(), vec! [DataValue::from(1), DataValue::from(2)]), ("key2".into(),
511 vec![DataValue::from("a"), DataValue::Null])])]
512 fn test_select_column<T: Into<DataFrame>>(#[case] input: T) {
513 let df: DataFrame = input.into();
514 assert_eq!(
515 df,
516 DataFrame {
517 constants: HashMap::new(),
518 dataframe: ColumnFrame::from(vec![
519 hashmap! {
520 "key1".into() => 1.into(),
521 "key2".into() => "a".into(),
522 },
523 hashmap! {
524 "key1".into() => 2.into(),
525 },
526 ]),
527 metadata: HashMap::new(),
528 }
529 );
530 let selected_transposed = df.select_column("key1".into());
531 assert!(selected_transposed.is_some());
532 let selected_transposed = selected_transposed.unwrap();
533 assert_eq!(selected_transposed.len(), 2);
534 assert_eq!(selected_transposed, ndarray::array![1.into(), 2.into()]);
535 }
536
537 #[rstest]
538 #[case::hhm(hashmap!("key1".into() => vec![1.into(), 2.into()], "key2".into() => vec!["a".into()]))]
539 #[case::stdhm(data_value::stdhashmap!("key1" => vec![1, 2], "key2" => vec!["a"]))]
540 #[case::hm({
541 let hm: std::collections::HashMap<String, Array1<DataValue>> = data_value::stdhashmap!("key1".to_string() => Array1::from_vec(vec![DataValue::from(1), DataValue::from(2)]), "key2".to_string() => Array1::from_vec(vec![DataValue::from("a"), DataValue::Null]));
542 hm
543 })]
544 #[case::vec_hhm(vec![hashmap! {
545 "key1".into() => 1.into(),
546 "key2".into() => "a".into(),
547 },
548 hashmap! {
549 "key1".into() => 2.into(),
550 },])]
551 #[case::vec_hme(vec![data_value::stdhashmap! {
552 "key1" => DataValue::from(1),
553 "key2" => DataValue::from("a"),
554 },data_value::stdhashmap! {
555 "key1" => DataValue::from(2),
556 },])]
557 #[case::vec_vec(vec![("key1".into(), vec! [DataValue::from(1), DataValue::from(2)]), ("key2".into(), vec![DataValue::from("a"), DataValue::Null])])]
558 fn test_from_conversion<T: Into<DataFrame>>(#[case] input: T) {
559 let df: DataFrame = input.into();
560 let expected: DataFrame = DataFrame {
561 constants: HashMap::new(),
562 dataframe: ColumnFrame::from(vec![
563 hashmap! {
564 "key1".into() => 1.into(),
565 "key2".into() => "a".into(),
566 },
567 hashmap! {
568 "key1".into() => 2.into(),
569 },
570 ]),
571 metadata: HashMap::new(),
572 };
573 assert_eq!(
574 df.select(Some(&["key1".into(), "key2".into()])),
575 expected.select(Some(&["key1".into(), "key2".into()])),
576 "{df} vs {expected}"
577 );
578 let selected_transposed = df.select_transposed_typed::<i32>(&["key1".into()]);
579 assert_eq!(selected_transposed.len(), 2);
580 println!("{:?}", selected_transposed);
581 assert_eq!(selected_transposed, vec![vec![1], vec![2]]);
582 }
583 #[rstest]
584 fn test_dataframe(dummy_candidates: ColumnFrame) {
585 let mut dataframe: DataFrame = DataFrame::default();
586 assert!(dataframe.is_empty());
587 assert!(dataframe.extend(dummy_candidates.into()).is_ok());
588 assert_eq!(dataframe.len(), 2);
589
590 let candidate = hashmap! {
591 "key1".into() => 3.into(),
592 "key2".into() => "c".into(),
593 };
594
595 assert!(dataframe.push(candidate).is_ok());
596 assert_eq!(dataframe.len(), 3);
597 assert!(!dataframe.is_empty());
598
599 dataframe.insert_constant("key3".into(), 4.into());
600 assert_eq!(dataframe.constants.len(), 1);
601 assert!(dataframe
602 .apply_function(&["key1".into()], |keys, df| {
603 let key = keys[0].clone();
604 let s = df
605 .get_single_column(&key)
606 .expect("BUG: Cannot get column")
607 .to_owned();
608 let s = s.mapv(|x| x + DataValue::from(1));
609 df.add_single_column("key5", s)?;
610 Ok(())
611 })
612 .is_ok());
613 let original = dataframe.clone();
614 dataframe.shrink();
615 let remove_df = dataframe.remove_column(&["key1".into()]);
616 assert!(remove_df.is_ok());
617 let mut remove_df = remove_df.unwrap();
618 assert_eq!(remove_df.len(), 3);
619 let selected = dataframe.select(Some(&["key2".into()]));
620 assert!(selected.is_ok());
621 let selected = selected.unwrap();
622 println!("{:?}", selected);
623 assert_eq!(selected.len(), 3);
624
625 let joined_result =
627 remove_df.join(dataframe, &JoinRelation::new(crate::JoinBy::AddColumns));
628 assert!(joined_result.is_ok(), "{:?}", joined_result);
629 assert_eq!(original, remove_df);
630 }
631
632 #[rstest]
633 fn test_metadata(dummy_candidates: ColumnFrame) {
634 let mut dataframe: DataFrame = DataFrame::default();
635 assert!(dataframe.is_empty());
636 println!("{:?}", dataframe);
637 assert!(dataframe.extend(dummy_candidates.into()).is_ok());
638 println!("{:?}", dataframe);
639 assert_eq!(dataframe.len(), 2);
640
641 dataframe.add_metadata("test".into(), 1.into());
642 assert_eq!(dataframe.get_metadata("test"), Some(&1.into()));
643 let dataframe = DataFrame::new(ColumnFrame::from(vec![
644 hashmap! {
645 "key1".into() => 1.into(),
646 "key2".into() => "a".into(),
647 },
648 hashmap! {
649 "key1".into() => 2.into(),
650 "key2".into() => "b".into(),
651 },
652 ]));
653 assert_eq!(dataframe.get_metadata("test"), None);
654 let tt = dataframe.select_transposed(None);
655 assert!(tt.is_ok());
656 let tt = tt.unwrap();
657 assert_eq!(tt.shape(), [2, 2]);
658 assert_eq!(
659 tt,
660 Array2::from_shape_vec((2, 2), vec![1.into(), 2.into(), "a".into(), "b".into()])
661 .unwrap()
662 );
663 }
664
665 #[rstest]
666 #[traced_test]
667 fn add_single_column_test() {
668 let mut dataframe = DataFrame::default();
669 let values = Array1::from(vec![1.into(), 2.into(), 3.into()]);
670 let r = dataframe.add_single_column("key1", values);
671 assert!(r.is_ok(), "{r:?}");
672 let selected = dataframe.select(None);
673 assert!(selected.is_ok());
674 let selected = selected.unwrap();
675 assert_eq!(selected.shape(), [3, 1]);
676 assert_eq!(
677 selected,
678 Array2::from_shape_vec((3, 1), vec![1.into(), 2.into(), 3.into()]).unwrap()
679 );
680 let values = Array1::from(vec![1.into(), 2.into()]);
681 assert!(dataframe.add_single_column("key1", values).is_err());
682 let values = Array1::from(vec![3.into(), 4.into(), 5.into()]);
683 assert!(dataframe.add_single_column("key2", values).is_ok());
684 let values = Array1::from(vec![3.into()]);
685 assert!(dataframe.add_single_column("key3", values).is_err());
686 }
687
688 #[rstest]
689 #[traced_test]
690 fn add_single_column_empty_test() {
691 let mut dataframe = DataFrame::default();
692 let values = Array1::from(vec![]);
693 let r = dataframe.add_single_column("key1", values);
694 assert!(r.is_ok(), "{r:?}");
695 let selected = dataframe.select(None);
696 assert!(selected.is_ok());
697 let selected = selected.unwrap();
698 assert_eq!(selected.shape(), [0, 1]);
699 assert_eq!(selected, Array2::from_shape_vec((0, 1), vec![]).unwrap());
700 let values = Array1::from(vec![1.into(), 2.into()]);
701 assert!(dataframe.add_single_column("key1", values).is_err());
702 let values = Array1::from(vec![3.into(), 4.into(), 5.into()]);
703 assert!(dataframe.add_single_column("key2", values).is_ok());
704 let values = Array1::from(vec![3.into(), 4.into()]);
705 assert!(dataframe.add_single_column("key3", values).is_err());
706 let values = Array1::from(vec![3.into(), 4.into(), 5.into()]);
707 assert!(dataframe.add_single_column("key3", values).is_ok());
708
709 assert_eq!(
710 dataframe
711 .select_column("key1".into())
712 .expect("BUG: has to exists"),
713 ndarray::arr1(&[DataValue::Null, DataValue::Null, DataValue::Null]),
714 );
715 assert_eq!(
716 dataframe
717 .select_column("key2".into())
718 .expect("BUG: has to exists"),
719 ndarray::arr1(&[3.into(), 4.into(), 5.into()]),
720 );
721 assert_eq!(
722 dataframe.select(None).expect("BUG: cannot get data"),
723 ndarray::arr2(&[
724 [DataValue::Null, 3.into(), 3.into()],
725 [DataValue::Null, 4.into(), 4.into()],
726 [DataValue::Null, 5.into(), 5.into()],
727 ])
728 );
729 }
730
731 #[rstest]
732 #[case(
733 DataFrame::new(ColumnFrame::from(vec![
734 hashmap! {
735 "k".into() => 1.into(),
736 "k2".into() => 2.into(),
737 "k3".into() => 2.2.into(),
738 },
739 hashmap! {
740 "k".into() => 11.into(),
741 "k2".into() => 3.into(),
742 },
743 hashmap! {
744 "k".into() => 4.into(),
745 "k2".into() => 5.into(),
746 "k3".into() => 2.3.into(),
747 },
748 hashmap! {
749 "k".into() => 4.into(),
750 "k2".into() => 5.into(),
751 "k3".into() => 2.4.into(),
752 },
753 ])),
754 vec!["k".into(), "k2".into()],
755 Array2::from_shape_vec((4, 2), vec![1.into(), 2.into(), 11.into(), 3.into(), 4.into(), 5.into(), 4.into(), 5.into()]).unwrap()
756 )]
757 #[case(
758 DataFrame::new(ColumnFrame::from(vec![
759 hashmap! {
760 "k".into() => 1.into(),
761 "k2".into() => 2.into(),
762 "k3".into() => 2.2.into(),
763 },
764 hashmap! {
765 "k".into() => 11.into(),
766 "k2".into() => 3.into(),
767 },
768 hashmap! {
769 "k".into() => 4.into(),
770 "k2".into() => 5.into(),
771 "k3".into() => 2.3.into(),
772 },
773 hashmap! {
774 "k".into() => 4.into(),
775 "k2".into() => 5.into(),
776 "k3".into() => 2.4.into(),
777 },
778 ])),
779 vec!["k2".into(), "k3".into(), "nonexist1".into(), "nonexists2".into(), "k".into()],
780 Array2::from_shape_vec((4, 5), vec![
781 2.into(), 2.2.into(), DataValue::Null, DataValue::Null, 1.into(),
782 3.into(), DataValue::Null, DataValue::Null, DataValue::Null, 11.into(),
783 5.into(), 2.3.into(), DataValue::Null, DataValue::Null, 4.into(),
784 5.into(), 2.4.into(), DataValue::Null, DataValue::Null, 4.into()]).unwrap()
785 )]
786 #[traced_test]
787 fn select_multiple(
788 #[case] input: DataFrame,
789 #[case] columns: Vec<Key>,
790 #[case] expected: Array2<DataValue>,
791 ) {
792 let selected = input.select(Some(&columns));
793 assert!(selected.is_ok());
794 let selected = selected.unwrap();
795
796 assert_eq!(selected, expected);
797 }
798
799 #[rstest]
800 #[case(
801 DataFrame::new(ColumnFrame::from(vec![
802 hashmap! {
803 "k".into() => 1.into(),
804 "k2".into() => 2.into(),
805 "k3".into() => 2.2.into(),
806 },
807 hashmap! {
808 "k".into() => 11.into(),
809 "k2".into() => 3.into(),
810 },
811 hashmap! {
812 "k".into() => 4.into(),
813 "k2".into() => 5.into(),
814 "k3".into() => 2.3.into(),
815 },
816 hashmap! {
817 "k".into() => 4.into(),
818 "k2".into() => 5.into(),
819 "k3".into() => 2.4.into(),
820 },
821 ])),
822 "k".into(),
823 Array2::from_shape_vec((4, 3), vec![
824 1.into(), 2.into(), 2.2.into(),
825 4.into(), 5.into(), 2.3.into(),
826 4.into(), 5.into(), 2.4.into(),
827 11.into(), 3.into(), DataValue::Null,
828 ]
829 ).unwrap(),
830 vec!["k".into(), "k2".into(), "k3".into()],
831 )]
832 #[rstest]
833 #[case(
834 DataFrame::new(ColumnFrame::from(vec![
835 hashmap! {
836 "k".into() => 1.into(),
837 "k2".into() => 2.into(),
838 "k3".into() => 2.2.into(),
839 },
840 hashmap! {
841 "k".into() => 11.into(),
842 "k2".into() => 3.into(),
843 },
844 hashmap! {
845 "k".into() => 4.into(),
846 "k2".into() => 5.into(),
847 "k3".into() => 2.3.into(),
848 },
849 hashmap! {
850 "k".into() => 4.into(),
851 "k2".into() => 5.into(),
852 "k3".into() => 2.4.into(),
853 },
854 ])),
855 "k3".into(),
856 Array2::from_shape_vec((4, 3), vec![
857 11.into(), 3.into(), DataValue::Null,
858 1.into(), 2.into(), 2.2.into(),
859 4.into(), 5.into(), 2.3.into(),
860 4.into(), 5.into(), 2.4.into(),
861 ]
862 ).unwrap(),
863 vec!["k".into(), "k2".into(), "k3".into()],
864 )]
865 #[case(
866 DataFrame::new(ColumnFrame::from(vec![
867 hashmap! {
868 "k".into() => 2.into(),
869 "k2".into() => 0.000001.into(),
870 },
871 hashmap! {
872 "k".into() => 1.into(),
873 "k2".into() =>0.0000001.into(),
874 },
875 hashmap! {
876 "k".into() => 3.into(),
877 "k2".into() => 0.00001.into(),
878 },
879 hashmap! {
880 "k".into() => 4.into(),
881 "k2".into() => 0.001.into(),
882 },
883 ])),
884 "k2".into(),
885 Array2::from_shape_vec((4, 2), vec![
886 1.into(), 0.0000001.into(),
887 2.into(), 0.000001.into(),
888 3.into(), 0.00001.into(),
889 4.into(), 0.001.into(),
890 ]
891 ).unwrap(),
892 vec!["k".into(), "k2".into()],
893 )]
894 #[case(
895 DataFrame::new(ColumnFrame::from(vec![
896 hashmap! {
897 "k".into() => 2.into(),
898 "k2".into() => "b".into(),
899 },
900 hashmap! {
901 "k".into() => 1.into(),
902 "k2".into() =>"a".into(),
903 },
904 hashmap! {
905 "k".into() => 3.into(),
906 "k2".into() =>"c".into(),
907 },
908 hashmap! {
909 "k".into() => 4.into(),
910 "k2".into() =>"z".into(),
911 },
912 ])),
913 "k2".into(),
914 Array2::from_shape_vec((4, 2), vec![
915 1.into(),"a".into(),
916 2.into(), "b".into(),
917 3.into(), "c".into(),
918 4.into(), "z".into(),
919 ]
920 ).unwrap(),
921 vec!["k".into(), "k2".into()],
922 )]
923 #[traced_test]
924 fn sort_by(
925 #[case] input: DataFrame,
926 #[case] column: Key,
927 #[case] expected: Array2<DataValue>,
928 #[case] columns: Vec<Key>,
929 ) {
930 let result = input.sorted(&column);
931 assert!(result.is_ok(), "{result:?}");
932 let result = result.unwrap().get_sorted();
933 let selected = result.select(Some(&columns));
934
935 assert_eq!(selected, expected);
936 }
937 #[rstest]
938 #[case(
939 DataFrame::new(ColumnFrame::from(vec![
940 hashmap! {
941 "k".into() => 2.into(),
942 "k2".into() => 0.000001.into(),
943 },
944 hashmap! {
945 "k".into() => 1.into(),
946 "k2".into() =>0.0000001.into(),
947 },
948 hashmap! {
949 "k".into() => 3.into(),
950 "k2".into() => 0.00001.into(),
951 },
952 hashmap! {
953 "k".into() => 4.into(),
954 "k2".into() => 0.001.into(),
955 },
956 ])),
957 "k2".into(),
958 TopN::Last(1),
959 Array2::from_shape_vec((1, 2), vec![
960 4.into(), 0.001.into(),
961 ]
962 ).unwrap(),
963 vec!["k".into(), "k2".into()],
964 )]
965 #[case(
966 DataFrame::new(ColumnFrame::from(vec![
967 hashmap! {
968 "k".into() => 2.into(),
969 "k2".into() => 0.000001.into(),
970 },
971 hashmap! {
972 "k".into() => 1.into(),
973 "k2".into() =>0.0000001.into(),
974 },
975 hashmap! {
976 "k".into() => 3.into(),
977 "k2".into() => 0.00001.into(),
978 },
979 hashmap! {
980 "k".into() => 4.into(),
981 "k2".into() => 0.001.into(),
982 },
983 ])),
984 "k2".into(),
985 TopN::Last(2),
986 Array2::from_shape_vec((2, 2), vec![
987 4.into(), 0.001.into(),
988 3.into(), 0.00001.into(),
989 ]
990 ).unwrap(),
991 vec!["k".into(), "k2".into()],
992 )]
993 #[case(
994 DataFrame::new(ColumnFrame::from(vec![
995 hashmap! {
996 "k".into() => 2.into(),
997 "k2".into() => "b".into(),
998 },
999 hashmap! {
1000 "k".into() => 1.into(),
1001 "k2".into() =>"a".into(),
1002 },
1003 hashmap! {
1004 "k".into() => 3.into(),
1005 "k2".into() =>"c".into(),
1006 },
1007 hashmap! {
1008 "k".into() => 4.into(),
1009 "k2".into() =>"z".into(),
1010 },
1011 ])),
1012 "k2".into(),
1013 TopN::First(1),
1014 Array2::from_shape_vec((1, 2), vec![
1015 1.into(),"a".into(),
1016 ]
1017 ).unwrap(),
1018 vec!["k".into(), "k2".into()],
1019 )]
1020 #[case(
1021 DataFrame::new(ColumnFrame::from(vec![
1022 hashmap! {
1023 "k".into() => 2.into(),
1024 "k2".into() => "b".into(),
1025 },
1026 hashmap! {
1027 "k".into() => 1.into(),
1028 "k2".into() =>"a".into(),
1029 },
1030 hashmap! {
1031 "k".into() => 3.into(),
1032 "k2".into() =>"c".into(),
1033 },
1034 hashmap! {
1035 "k".into() => 4.into(),
1036 "k2".into() =>"z".into(),
1037 },
1038 ])),
1039 "k2".into(),
1040 TopN::First(2),
1041 Array2::from_shape_vec((2, 2), vec![
1042 1.into(),"a".into(),
1043 2.into(),"b".into(),
1044 ]
1045 ).unwrap(),
1046 vec!["k".into(), "k2".into()],
1047 )]
1048 #[traced_test]
1049 fn top_n(
1050 #[case] input: DataFrame,
1051 #[case] column: Key,
1052 #[case] topn: TopN,
1053 #[case] expected: Array2<DataValue>,
1054 #[case] columns: Vec<Key>,
1055 ) {
1056 let result = input.sorted(&column);
1057 assert!(result.is_ok(), "{result:?}");
1058 let result = result.unwrap();
1059 let first = result.topn(topn).unwrap();
1060 let selected = first.select(Some(&columns));
1061 assert_eq!(selected, expected);
1062 }
1063}