1use column_store::sorted_df::SortedDataFrame;
2use data_value::{DataValue, Extract};
3use halfbrown::HashMap;
4use ndarray::{Array1, Array2, ArrayView1};
5use std::fmt;
6pub mod column_store;
7pub mod index;
8pub mod join;
9pub mod key;
10use crate::{error::Error, CandidateData};
11#[cfg(feature = "python")]
12pub mod python;
13
14#[cfg(feature = "python")]
15use pyo3::prelude::*;
16
17use crate::{
18 dataframe::{column_store::ColumnFrame, join::JoinRelation, key::Key},
19 MLChefMap,
20};
21
/// Selects which end of a sorted frame to keep; the payload is the number of rows requested.
///
/// NOTE(review): the actual row selection lives in `SortedDataFrame::topn`, which is not
/// defined in this file. The `top_n` test here expects `Last(2)` to return the largest row first.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum TopN {
    /// Take rows from the start of the sort order.
    First(usize),
    /// Take rows from the end of the sort order.
    Last(usize),
}
27
/// A [`ColumnFrame`] bundled with frame-wide constants and free-form metadata.
///
/// The type is serde-(de)serializable and, when the `python` feature is enabled, is
/// additionally annotated as a `pyclass`.
#[derive(Debug, Clone, PartialEq, Default, serde::Serialize, serde::Deserialize)]
#[cfg_attr(feature = "python", pyclass)]
pub struct DataFrame {
    /// Constant values keyed by [`Key`]; written by `insert_constant` and merged by `join`.
    pub constants: HashMap<Key, DataValue>,
    /// The columnar data; row and column operations of this type are forwarded to it.
    pub dataframe: ColumnFrame,
    /// String-keyed values attached through `add_metadata` and read through `get_metadata`.
    pub metadata: HashMap<String, DataValue>,
}
42
/// Formats a [`DataFrame`] by forwarding to the inner `dataframe` field.
///
/// `constants` and `metadata` are not written.
impl fmt::Display for DataFrame {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // NOTE(review): which `fmt` this resolves to is decided by `ColumnFrame` (defined in
        // `column_store`, not visible here) — presumably its `Display` impl; confirm.
        self.dataframe.fmt(f)
    }
}
48
49impl DataFrame {
50 pub fn new<C: Into<ColumnFrame>>(dataframe: C) -> Self {
51 Self {
52 constants: HashMap::new(),
53 dataframe: dataframe.into(),
54 metadata: HashMap::new(),
55 }
56 }
57
58 pub fn shrink(&mut self) {
59 self.dataframe.shrink();
60 }
61
62 pub fn add_metadata(&mut self, key: String, value: DataValue) {
63 self.metadata.insert(key, value);
64 }
65
66 pub fn get_metadata(&self, key: &str) -> Option<&DataValue> {
67 self.metadata.get(key)
68 }
69
70 pub fn join(&mut self, other: Self, join_type: &JoinRelation) -> Result<(), Error> {
71 other.constants.into_iter().for_each(|(key, value)| {
72 self.constants.insert(key, value);
73 });
74 self.dataframe.join(other.dataframe, join_type)
75 }
76
77 pub fn apply_function<F>(&mut self, keys: &[Key], mut func: F) -> Result<(), Error>
78 where
79 F: FnMut(&[Key], &mut ColumnFrame) -> Result<(), Error>,
80 {
81 self.dataframe.apply_function(keys, &mut func)
82 }
83
84 pub fn select(&self, keys: Option<&[Key]>) -> Result<Array2<DataValue>, Error> {
85 Ok(self.dataframe.select(keys))
86 }
87
88 pub fn select_transposed_typed<D: Extract>(&self, keys: &[Key]) -> Vec<Vec<D>> {
89 self.dataframe.select_transposed_typed::<D>(keys)
90 }
91
92 pub fn select_column(&self, key: Key) -> Option<ndarray::ArrayView1<DataValue>> {
93 self.dataframe.select_column(&key)
94 }
95
96 pub fn select_transposed(&self, keys: Option<&[Key]>) -> Result<Array2<DataValue>, Error> {
97 self.dataframe.select_transposed(keys)
98 }
99
100 pub fn insert_constant(&mut self, key: Key, value: DataValue) {
101 self.constants.insert(key, value);
102 }
103
104 pub fn push<C: CandidateData>(&mut self, item: C) -> Result<(), Error> {
105 self.dataframe.push(item)
106 }
107
108 pub fn remove_column(&mut self, keys: &[Key]) -> Result<Self, Error> {
109 self.dataframe.remove_column(keys).map(|x| x.into())
110 }
111
112 pub fn extend(&mut self, items: Self) -> Result<(), Error> {
113 self.dataframe.extend(items.dataframe)
114 }
115
116 pub fn len(&self) -> usize {
117 self.dataframe.len()
118 }
119
120 pub fn is_empty(&self) -> bool {
121 self.dataframe.is_empty()
122 }
123
124 pub fn add_single_column<K: Into<Key>>(
125 &mut self,
126 key: K,
127 values: Array1<DataValue>,
128 ) -> Result<(), Error> {
129 self.dataframe.add_single_column(key, values)
130 }
131
132 pub fn get_single_column(&self, key: &Key) -> Option<ArrayView1<DataValue>> {
133 self.dataframe.get_single_column(key)
134 }
135
136 pub fn sorted(&self, key: &Key) -> Result<SortedDataFrame<'_>, Error> {
137 self.dataframe.sorted(key)
138 }
139
140 pub fn filter(&self, filter: &crate::filter::FilterRules) -> Result<Self, Error> {
141 let filtered_df = self.dataframe.filter(filter)?;
142 Ok(Self {
143 constants: self.constants.clone(),
144 dataframe: filtered_df,
145 metadata: self.metadata.clone(),
146 })
147 }
148
149 #[cfg(feature = "polars-df")]
150 pub fn as_polars(&self) -> Result<polars::prelude::DataFrame, Error> {
151 let mut columns = vec![];
152 for key in self.dataframe.keys() {
153 columns.push(polars::prelude::Column::new(
154 key.name().into(),
155 self.dataframe
156 .get_single_column(key)
157 .ok_or_else(|| Error::NotFound(key.clone()))?
158 .into_iter()
159 .map(|x| into_polars_value(x.clone()))
160 .collect::<Vec<_>>(),
161 ));
162 }
163
164 Ok(polars::prelude::DataFrame::new(columns)?)
165 }
166}
167
/// Converts a [`DataValue`] into an owned polars `AnyValue`.
///
/// Every supported variant maps to the polars variant of the same width, so that
/// [`from_polars_value`] restores the original variant. In particular `U8` becomes `UInt8`;
/// emitting `UInt16` would come back as `U32`. `Vec` becomes a `List` holding a series named
/// `"v"`, with every element converted recursively.
///
/// # Panics
/// Panics with `unimplemented!` for `DataValue::EnumNumber`, `DataValue::U128` and
/// `DataValue::Map`, including when they are nested inside a `DataValue::Vec`.
#[cfg(feature = "polars-df")]
pub fn into_polars_value(dv: DataValue) -> polars::prelude::AnyValue<'static> {
    use polars::prelude::AnyValue::*;
    // `NamedFrom` provides the `Series::new` constructor used for the list case.
    use polars::prelude::NamedFrom;
    match dv {
        DataValue::String(smart_string) => StringOwned(smart_string.as_str().into()),
        DataValue::Bytes(items) => BinaryOwned(items),
        // Keep the 8-bit width so the value round-trips as `U8`.
        DataValue::U8(x) => UInt8(x),
        DataValue::Bool(x) => Boolean(x),
        DataValue::I32(x) => Int32(x),
        DataValue::U32(x) => UInt32(x),
        DataValue::I64(x) => Int64(x),
        DataValue::U64(x) => UInt64(x),
        DataValue::I128(x) => Int128(x),
        DataValue::F32(x) => Float32(x),
        DataValue::F64(x) => Float64(x),
        DataValue::Null => Null,
        DataValue::Vec(data_values) => List(polars::series::Series::new(
            "v".into(),
            data_values
                .into_iter()
                .map(into_polars_value)
                .collect::<Vec<_>>(),
        )),
        DataValue::EnumNumber(_) => unimplemented!(),
        DataValue::U128(_) => unimplemented!(),
        DataValue::Map(_) => unimplemented!(),
    }
}
197
/// Converts a polars `AnyValue` into a [`DataValue`].
///
/// Narrow integers are widened losslessly (`UInt16` to `U32`, `Int8`/`Int16` to `i32` before
/// conversion). Borrowed strings and binaries are copied into owned values, and `List` is
/// converted element by element. Any other variant is logged with `tracing::warn!` and mapped
/// to `DataValue::Null`.
#[cfg(feature = "polars-df")]
pub fn from_polars_value(dv: polars::prelude::AnyValue<'_>) -> DataValue {
    use polars::prelude::AnyValue::*;
    match dv {
        Null => DataValue::Null,
        Boolean(v) => v.into(),

        // Text and binary payloads, borrowed or owned.
        String(v) => DataValue::String(v.into()),
        StringOwned(v) => DataValue::String(v.as_str().into()),
        Binary(v) => DataValue::Bytes(v.to_vec()),
        BinaryOwned(v) => DataValue::Bytes(v),

        // Unsigned integers.
        UInt8(v) => DataValue::U8(v),
        UInt16(v) => DataValue::U32(u32::from(v)),
        UInt32(v) => v.into(),
        UInt64(v) => v.into(),

        // Signed integers; the 8- and 16-bit widths are widened to `i32` first.
        Int8(v) => i32::from(v).into(),
        Int16(v) => i32::from(v).into(),
        Int32(v) => v.into(),
        Int64(v) => v.into(),
        Int128(v) => v.into(),

        // Floating point.
        Float32(v) => v.into(),
        Float64(v) => v.into(),

        List(series) => {
            let items = series.iter().map(from_polars_value).collect::<Vec<_>>();
            DataValue::Vec(items)
        }

        // Everything else is reported and degraded to `Null` instead of failing.
        e => {
            tracing::warn!("Unsupported polars value: {e:?}");
            DataValue::Null
        }
    }
}
229
230impl From<ColumnFrame> for DataFrame {
231 fn from(dataframe: ColumnFrame) -> Self {
232 Self::new(dataframe)
233 }
234}
235
236impl From<Vec<std::collections::HashMap<Key, DataValue>>> for DataFrame {
237 fn from(dataframe: Vec<std::collections::HashMap<Key, DataValue>>) -> Self {
238 Self::new(ColumnFrame::from(dataframe))
239 }
240}
241
242impl From<Vec<HashMap<Key, DataValue>>> for DataFrame {
243 fn from(dataframe: Vec<HashMap<Key, DataValue>>) -> Self {
244 Self::new(ColumnFrame::from(dataframe))
245 }
246}
247
248impl From<std::collections::HashMap<String, Vec<DataValue>>> for DataFrame {
249 fn from(dataframe: std::collections::HashMap<String, Vec<DataValue>>) -> Self {
250 Self::new(ColumnFrame::from(dataframe))
251 }
252}
253
254impl From<MLChefMap> for DataFrame {
255 fn from(dataframe: MLChefMap) -> Self {
256 Self::new(ColumnFrame::from(dataframe))
257 }
258}
259impl From<Vec<(Key, Vec<DataValue>)>> for DataFrame {
260 fn from(dataframe: Vec<(Key, Vec<DataValue>)>) -> Self {
261 Self::new(ColumnFrame::from(dataframe))
262 }
263}
264
265impl From<std::collections::HashMap<String, Array1<DataValue>>> for DataFrame {
266 fn from(dataframe: std::collections::HashMap<String, Array1<DataValue>>) -> Self {
267 Self::new(ColumnFrame::from(dataframe))
268 }
269}
270
/// Builds a frame from a polars `DataFrame`, using the matching `ColumnFrame` conversion;
/// constants and metadata start empty. Only available with the `polars-df` feature.
#[cfg(feature = "polars-df")]
impl From<polars::prelude::DataFrame> for DataFrame {
    fn from(polars_frame: polars::prelude::DataFrame) -> Self {
        let frame: ColumnFrame = polars_frame.into();
        Self::new(frame)
    }
}
277#[cfg(test)]
278mod test {
279 use crate::filter::FilterRules;
280
281 use super::*;
282 use halfbrown::hashmap;
283 #[cfg(feature = "polars-df")]
284 use polars::prelude::NamedFrom as _;
285 use rstest::*;
286 use tracing_test::traced_test;
287 #[fixture]
288 fn dummy_candidates() -> ColumnFrame {
289 ColumnFrame::from(vec![
290 hashmap! {
291 "key1".into() => 1.into(),
292 "key2".into() => "a".into(),
293 },
294 hashmap! {
295 "key1".into() => 2.into(),
296 "key2".into() => "b".into(),
297 },
298 ])
299 }
300
301 #[rstest]
302 fn test_serde() {
303 let df = crate::df! {
304 "a" => [1u64, 2u64, 3u64],
305 "b" => [4u64, 5u64, 6u64],
306 "c" => [7u64, 8u64, 9u64]
307 };
308
309 let serialized = serde_json::to_string(&df).expect("BUG: Unable to serialize dataframe");
310
311 let deserialized =
312 serde_json::from_str(&serialized).expect("BUG: Unable to deserialize dataframe");
313
314 assert_eq!(df, deserialized);
315 }
316
317 #[cfg(feature = "polars-df")]
318 #[rstest]
319 fn test_polars() {
320 let expected = crate::df! {
321 "a" => [1u64, 2u64, 3u64],
322 "b" => [4f64, 5f64, 6f64],
323 "c" => [7i64, 8i64, 9i64]
324 };
325
326 let polars_df = polars::df!(
327 "a" => [1u64, 2u64, 3u64],
328 "b" => [4f64, 5f64, 6f64],
329 "c" => [7i64, 8i64, 9i64]
330 )
331 .expect("BUG: should be ok");
332 let as_df: DataFrame = polars_df.into();
333 let keys: Vec<Key> = vec!["a".into(), "b".into(), "c".into()];
334 assert_eq!(
335 as_df.select(Some(keys.as_slice())),
336 expected.select(Some(keys.as_slice()))
337 );
338 }
339
340 #[cfg(feature = "polars-df")]
341 #[rstest]
342 #[case::str(DataValue::String("test".into()), polars::prelude::AnyValue::String("test".into()))]
343 #[case::u32(DataValue::U32(u32::MAX), polars::prelude::AnyValue::UInt32(u32::MAX))]
344 #[case::i32(DataValue::I32(i32::MIN), polars::prelude::AnyValue::Int32(i32::MIN))]
345 #[case::i64(DataValue::I64(i64::MIN), polars::prelude::AnyValue::Int64(i64::MIN))]
346 #[case::u64(DataValue::U64(u64::MIN), polars::prelude::AnyValue::UInt64(u64::MIN))]
347 #[case::f32(DataValue::F32(f32::MIN), polars::prelude::AnyValue::Float32(f32::MIN))]
348 #[case::f64(DataValue::F64(f64::MIN), polars::prelude::AnyValue::Float64(f64::MIN))]
349 #[case::null(DataValue::Null, polars::prelude::AnyValue::Null)]
350 #[case::i128(
351 DataValue::I128(i128::MIN),
352 polars::prelude::AnyValue::Int128(i128::MIN)
353 )]
354 #[case::u8(DataValue::U8(255), polars::prelude::AnyValue::UInt8(255))]
355 #[case::bool(DataValue::Bool(true), polars::prelude::AnyValue::Boolean(true))]
356 #[case::bytes(DataValue::Bytes("aaaaa".as_bytes().to_vec()), polars::prelude::AnyValue::BinaryOwned("aaaaa".as_bytes().to_vec()))]
357 #[case::vec_uints(DataValue::Vec(vec![DataValue::U32(0), DataValue::U32(1)]), polars::prelude::AnyValue::List(polars::series::Series::new("v".into(), vec![polars::prelude::AnyValue::UInt32(0u32), polars::prelude::AnyValue::UInt32(1)])))]
358 fn into_polars_value_test(
362 #[case] input: DataValue,
363 #[case] output: polars::prelude::AnyValue<'static>,
364 ) {
365 assert_eq!(into_polars_value(input.clone()), output);
366 assert_eq!(from_polars_value(output), input);
367 }
368
369 #[rstest]
370 #[case(
371 DataFrame::new(crate::column_frame! {
372 "a" => [1f64, 2f64, 3f64],
373 "b" => [4i64, 5i64, 6i64],
374 "c" => [7i64, 8i64, 9i64]
375 }),
376 DataFrame::new(crate::column_frame! {
377 "a" => [1f64, 2f64],
378 "b" => [4i64, 5i64],
379 "c" => [7i64, 8i64]
380 }),
381 FilterRules::try_from("a >= 1f64 && (b <= 5 || c <= 8) && b >= 4").expect("BUG: cannot create filter rules"),
382 )]
383 #[traced_test]
384 fn filter_test(
385 #[case] df: DataFrame,
386 #[case] expected: DataFrame,
387 #[case] filter: FilterRules,
388 ) {
389 let filtered = df.filter(&filter).expect("BUG: cannot filter");
390 assert_eq!(filtered, expected);
391 }
392
    // Deserializes a hand-written JSON document (six keys, a 2x6 data payload including a
    // `null` cell) both on its own and as a three-element JSON array, and checks that the
    // array elements match the stand-alone result.
    #[rstest]
    fn test_serde_complex() {
        // Fixture document; its whitespace is not significant to the JSON parser.
        let simple = r#"
{
 "constants": {},
 "dataframe": {
 "index": {
 "keys": [
 {
 "key": 3162770485,
 "name": "a",
 "ctype": "U32"
 },
 {
 "key": 2279056742,
 "name": "b",
 "ctype": "F64"
 },
 {
 "key": 2994984227,
 "name": "c",
 "ctype": "U64"
 },
 {
 "key": 3319645144,
 "name": "d",
 "ctype": "F64"
 },
 {
 "key": 1291847470,
 "name": "e",
 "ctype": "U32"
 },
 {
 "key": 874241070,
 "name": "f",
 "ctype": "Bool"
 }
 ],
 "indexes": {
 "a": 0,
 "b": 1,
 "c": 2,
 "d": 3,
 "e": 4,
 "f": 5
 },
 "alias": {}
 },
 "data_frame": {
 "v": 1,
 "dim": [
 2,
 6
 ],
 "data": [
 253780,
 0.009369421750307085,
 1633222860381359,
 8,
 5,
 true,
 64512,
 0.003391335718333721,
 1633222860810557,
 8,
 5,
 null
 ]
 }
 },
 "metadata": {}
}
 "#;

        // Stand-alone document.
        let simple_deserialized: DataFrame =
            serde_json::from_str(simple).expect("BUG: Unable to deserialize dataframe");

        println!("deserialized: {simple_deserialized:?}");
        // The same document repeated three times inside a JSON array.
        let array = format!("[{}, {}, {}]", simple, simple, simple);
        let deserialized: Vec<DataFrame> =
            serde_json::from_str(&array).expect("BUG: Unable to deserialize dataframe");

        println!("deserialized: {deserialized:?}");
        assert_eq!(deserialized.len(), 3);
        assert_eq!(simple_deserialized, deserialized[0]);
    }
480
481 #[rstest]
482 #[case(hashmap!("key1".into() => vec![1.into(), 2.into()], "key2".into() => vec!["a".into()]))]
483 #[case(data_value::stdhashmap!("key1" => vec![1, 2], "key2" => vec!["a"]))]
484 #[case(vec![hashmap! {
485 "key1".into() => 1.into(),
486 "key2".into() => "a".into(),
487 },
488 hashmap! {
489 "key1".into() => 2.into(),
490 },])]
491 #[case(vec![data_value::stdhashmap! {
492 "key1" => DataValue::from(1),
493 "key2" => DataValue::from("a"),
494 },data_value::stdhashmap! {
495 "key1" => DataValue::from(2),
496 },])]
497 #[case(vec![("key1".into(), vec! [DataValue::from(1), DataValue::from(2)]), ("key2".into(),
498 vec![DataValue::from("a"), DataValue::Null])])]
499 fn test_select_column<T: Into<DataFrame>>(#[case] input: T) {
500 let df: DataFrame = input.into();
501 assert_eq!(
502 df,
503 DataFrame {
504 constants: HashMap::new(),
505 dataframe: ColumnFrame::from(vec![
506 hashmap! {
507 "key1".into() => 1.into(),
508 "key2".into() => "a".into(),
509 },
510 hashmap! {
511 "key1".into() => 2.into(),
512 },
513 ]),
514 metadata: HashMap::new(),
515 }
516 );
517 let selected_transposed = df.select_column("key1".into());
518 assert!(selected_transposed.is_some());
519 let selected_transposed = selected_transposed.unwrap();
520 assert_eq!(selected_transposed.len(), 2);
521 assert_eq!(selected_transposed, ndarray::array![1.into(), 2.into()]);
522 }
523
524 #[rstest]
525 #[case::hhm(hashmap!("key1".into() => vec![1.into(), 2.into()], "key2".into() => vec!["a".into()]))]
526 #[case::stdhm(data_value::stdhashmap!("key1" => vec![1, 2], "key2" => vec!["a"]))]
527 #[case::hm({
528 let hm: std::collections::HashMap<String, Array1<DataValue>> = data_value::stdhashmap!("key1".to_string() => Array1::from_vec(vec![DataValue::from(1), DataValue::from(2)]), "key2".to_string() => Array1::from_vec(vec![DataValue::from("a"), DataValue::Null]));
529 hm
530 })]
531 #[case::vec_hhm(vec![hashmap! {
532 "key1".into() => 1.into(),
533 "key2".into() => "a".into(),
534 },
535 hashmap! {
536 "key1".into() => 2.into(),
537 },])]
538 #[case::vec_hme(vec![data_value::stdhashmap! {
539 "key1" => DataValue::from(1),
540 "key2" => DataValue::from("a"),
541 },data_value::stdhashmap! {
542 "key1" => DataValue::from(2),
543 },])]
544 #[case::vec_vec(vec![("key1".into(), vec! [DataValue::from(1), DataValue::from(2)]), ("key2".into(), vec![DataValue::from("a"), DataValue::Null])])]
545 fn test_from_conversion<T: Into<DataFrame>>(#[case] input: T) {
546 let df: DataFrame = input.into();
547 let expected: DataFrame = DataFrame {
548 constants: HashMap::new(),
549 dataframe: ColumnFrame::from(vec![
550 hashmap! {
551 "key1".into() => 1.into(),
552 "key2".into() => "a".into(),
553 },
554 hashmap! {
555 "key1".into() => 2.into(),
556 },
557 ]),
558 metadata: HashMap::new(),
559 };
560 assert_eq!(
561 df.select(Some(&["key1".into(), "key2".into()])),
562 expected.select(Some(&["key1".into(), "key2".into()])),
563 "{df} vs {expected}"
564 );
565 let selected_transposed = df.select_transposed_typed::<i32>(&["key1".into()]);
566 assert_eq!(selected_transposed.len(), 2);
567 println!("{:?}", selected_transposed);
568 assert_eq!(selected_transposed, vec![vec![1], vec![2]]);
569 }
570 #[rstest]
571 fn test_dataframe(dummy_candidates: ColumnFrame) {
572 let mut dataframe: DataFrame = DataFrame::default();
573 assert!(dataframe.is_empty());
574 assert!(dataframe.extend(dummy_candidates.into()).is_ok());
575 assert_eq!(dataframe.len(), 2);
576
577 let candidate = hashmap! {
578 "key1".into() => 3.into(),
579 "key2".into() => "c".into(),
580 };
581
582 assert!(dataframe.push(candidate).is_ok());
583 assert_eq!(dataframe.len(), 3);
584 assert!(!dataframe.is_empty());
585
586 dataframe.insert_constant("key3".into(), 4.into());
587 assert_eq!(dataframe.constants.len(), 1);
588 assert!(dataframe
589 .apply_function(&["key1".into()], |keys, df| {
590 let key = keys[0].clone();
591 let s = df
592 .get_single_column(&key)
593 .expect("BUG: Cannot get column")
594 .to_owned();
595 let s = s.mapv(|x| x + DataValue::from(1));
596 df.add_single_column("key5", s)?;
597 Ok(())
598 })
599 .is_ok());
600 let original = dataframe.clone();
601 dataframe.shrink();
602 let remove_df = dataframe.remove_column(&["key1".into()]);
603 assert!(remove_df.is_ok());
604 let mut remove_df = remove_df.unwrap();
605 assert_eq!(remove_df.len(), 3);
606 let selected = dataframe.select(Some(&["key2".into()]));
607 assert!(selected.is_ok());
608 let selected = selected.unwrap();
609 println!("{:?}", selected);
610 assert_eq!(selected.len(), 3);
611
612 let joined_result =
614 remove_df.join(dataframe, &JoinRelation::new(crate::JoinBy::AddColumns));
615 assert!(joined_result.is_ok(), "{:?}", joined_result);
616 assert_eq!(original, remove_df);
617 }
618
619 #[rstest]
620 fn test_metadata(dummy_candidates: ColumnFrame) {
621 let mut dataframe: DataFrame = DataFrame::default();
622 assert!(dataframe.is_empty());
623 println!("{:?}", dataframe);
624 assert!(dataframe.extend(dummy_candidates.into()).is_ok());
625 println!("{:?}", dataframe);
626 assert_eq!(dataframe.len(), 2);
627
628 dataframe.add_metadata("test".into(), 1.into());
629 assert_eq!(dataframe.get_metadata("test"), Some(&1.into()));
630 let dataframe = DataFrame::new(ColumnFrame::from(vec![
631 hashmap! {
632 "key1".into() => 1.into(),
633 "key2".into() => "a".into(),
634 },
635 hashmap! {
636 "key1".into() => 2.into(),
637 "key2".into() => "b".into(),
638 },
639 ]));
640 assert_eq!(dataframe.get_metadata("test"), None);
641 let tt = dataframe.select_transposed(None);
642 assert!(tt.is_ok());
643 let tt = tt.unwrap();
644 assert_eq!(tt.shape(), [2, 2]);
645 assert_eq!(
646 tt,
647 Array2::from_shape_vec((2, 2), vec![1.into(), 2.into(), "a".into(), "b".into()])
648 .unwrap()
649 );
650 }
651
652 #[rstest]
653 #[traced_test]
654 fn add_single_column_test() {
655 let mut dataframe = DataFrame::default();
656 let values = Array1::from(vec![1.into(), 2.into(), 3.into()]);
657 let r = dataframe.add_single_column("key1", values);
658 assert!(r.is_ok(), "{r:?}");
659 let selected = dataframe.select(None);
660 assert!(selected.is_ok());
661 let selected = selected.unwrap();
662 assert_eq!(selected.shape(), [3, 1]);
663 assert_eq!(
664 selected,
665 Array2::from_shape_vec((3, 1), vec![1.into(), 2.into(), 3.into()]).unwrap()
666 );
667 let values = Array1::from(vec![1.into(), 2.into()]);
668 assert!(dataframe.add_single_column("key1", values).is_err());
669 let values = Array1::from(vec![3.into(), 4.into(), 5.into()]);
670 assert!(dataframe.add_single_column("key2", values).is_ok());
671 let values = Array1::from(vec![3.into()]);
672 assert!(dataframe.add_single_column("key3", values).is_err());
673 }
674
675 #[rstest]
676 #[traced_test]
677 fn add_single_column_empty_test() {
678 let mut dataframe = DataFrame::default();
679 let values = Array1::from(vec![]);
680 let r = dataframe.add_single_column("key1", values);
681 assert!(r.is_ok(), "{r:?}");
682 let selected = dataframe.select(None);
683 assert!(selected.is_ok());
684 let selected = selected.unwrap();
685 assert_eq!(selected.shape(), [0, 1]);
686 assert_eq!(selected, Array2::from_shape_vec((0, 1), vec![]).unwrap());
687 let values = Array1::from(vec![1.into(), 2.into()]);
688 assert!(dataframe.add_single_column("key1", values).is_err());
689 let values = Array1::from(vec![3.into(), 4.into(), 5.into()]);
690 assert!(dataframe.add_single_column("key2", values).is_ok());
691 let values = Array1::from(vec![3.into(), 4.into()]);
692 assert!(dataframe.add_single_column("key3", values).is_err());
693 let values = Array1::from(vec![3.into(), 4.into(), 5.into()]);
694 assert!(dataframe.add_single_column("key3", values).is_ok());
695
696 assert_eq!(
697 dataframe
698 .select_column("key1".into())
699 .expect("BUG: has to exists"),
700 ndarray::arr1(&[DataValue::Null, DataValue::Null, DataValue::Null]),
701 );
702 assert_eq!(
703 dataframe
704 .select_column("key2".into())
705 .expect("BUG: has to exists"),
706 ndarray::arr1(&[3.into(), 4.into(), 5.into()]),
707 );
708 assert_eq!(
709 dataframe.select(None).expect("BUG: cannot get data"),
710 ndarray::arr2(&[
711 [DataValue::Null, 3.into(), 3.into()],
712 [DataValue::Null, 4.into(), 4.into()],
713 [DataValue::Null, 5.into(), 5.into()],
714 ])
715 );
716 }
717
718 #[rstest]
719 #[case(
720 DataFrame::new(ColumnFrame::from(vec![
721 hashmap! {
722 "k".into() => 1.into(),
723 "k2".into() => 2.into(),
724 "k3".into() => 2.2.into(),
725 },
726 hashmap! {
727 "k".into() => 11.into(),
728 "k2".into() => 3.into(),
729 },
730 hashmap! {
731 "k".into() => 4.into(),
732 "k2".into() => 5.into(),
733 "k3".into() => 2.3.into(),
734 },
735 hashmap! {
736 "k".into() => 4.into(),
737 "k2".into() => 5.into(),
738 "k3".into() => 2.4.into(),
739 },
740 ])),
741 vec!["k".into(), "k2".into()],
742 Array2::from_shape_vec((4, 2), vec![1.into(), 2.into(), 11.into(), 3.into(), 4.into(), 5.into(), 4.into(), 5.into()]).unwrap()
743 )]
744 #[case(
745 DataFrame::new(ColumnFrame::from(vec![
746 hashmap! {
747 "k".into() => 1.into(),
748 "k2".into() => 2.into(),
749 "k3".into() => 2.2.into(),
750 },
751 hashmap! {
752 "k".into() => 11.into(),
753 "k2".into() => 3.into(),
754 },
755 hashmap! {
756 "k".into() => 4.into(),
757 "k2".into() => 5.into(),
758 "k3".into() => 2.3.into(),
759 },
760 hashmap! {
761 "k".into() => 4.into(),
762 "k2".into() => 5.into(),
763 "k3".into() => 2.4.into(),
764 },
765 ])),
766 vec!["k2".into(), "k3".into(), "nonexist1".into(), "nonexists2".into(), "k".into()],
767 Array2::from_shape_vec((4, 5), vec![
768 2.into(), 2.2.into(), DataValue::Null, DataValue::Null, 1.into(),
769 3.into(), DataValue::Null, DataValue::Null, DataValue::Null, 11.into(),
770 5.into(), 2.3.into(), DataValue::Null, DataValue::Null, 4.into(),
771 5.into(), 2.4.into(), DataValue::Null, DataValue::Null, 4.into()]).unwrap()
772 )]
773 #[traced_test]
774 fn select_multiple(
775 #[case] input: DataFrame,
776 #[case] columns: Vec<Key>,
777 #[case] expected: Array2<DataValue>,
778 ) {
779 let selected = input.select(Some(&columns));
780 assert!(selected.is_ok());
781 let selected = selected.unwrap();
782
783 assert_eq!(selected, expected);
784 }
785
786 #[rstest]
787 #[case(
788 DataFrame::new(ColumnFrame::from(vec![
789 hashmap! {
790 "k".into() => 1.into(),
791 "k2".into() => 2.into(),
792 "k3".into() => 2.2.into(),
793 },
794 hashmap! {
795 "k".into() => 11.into(),
796 "k2".into() => 3.into(),
797 },
798 hashmap! {
799 "k".into() => 4.into(),
800 "k2".into() => 5.into(),
801 "k3".into() => 2.3.into(),
802 },
803 hashmap! {
804 "k".into() => 4.into(),
805 "k2".into() => 5.into(),
806 "k3".into() => 2.4.into(),
807 },
808 ])),
809 "k".into(),
810 Array2::from_shape_vec((4, 3), vec![
811 1.into(), 2.into(), 2.2.into(),
812 4.into(), 5.into(), 2.3.into(),
813 4.into(), 5.into(), 2.4.into(),
814 11.into(), 3.into(), DataValue::Null,
815 ]
816 ).unwrap(),
817 vec!["k".into(), "k2".into(), "k3".into()],
818 )]
819 #[rstest]
820 #[case(
821 DataFrame::new(ColumnFrame::from(vec![
822 hashmap! {
823 "k".into() => 1.into(),
824 "k2".into() => 2.into(),
825 "k3".into() => 2.2.into(),
826 },
827 hashmap! {
828 "k".into() => 11.into(),
829 "k2".into() => 3.into(),
830 },
831 hashmap! {
832 "k".into() => 4.into(),
833 "k2".into() => 5.into(),
834 "k3".into() => 2.3.into(),
835 },
836 hashmap! {
837 "k".into() => 4.into(),
838 "k2".into() => 5.into(),
839 "k3".into() => 2.4.into(),
840 },
841 ])),
842 "k3".into(),
843 Array2::from_shape_vec((4, 3), vec![
844 11.into(), 3.into(), DataValue::Null,
845 1.into(), 2.into(), 2.2.into(),
846 4.into(), 5.into(), 2.3.into(),
847 4.into(), 5.into(), 2.4.into(),
848 ]
849 ).unwrap(),
850 vec!["k".into(), "k2".into(), "k3".into()],
851 )]
852 #[case(
853 DataFrame::new(ColumnFrame::from(vec![
854 hashmap! {
855 "k".into() => 2.into(),
856 "k2".into() => 0.000001.into(),
857 },
858 hashmap! {
859 "k".into() => 1.into(),
860 "k2".into() =>0.0000001.into(),
861 },
862 hashmap! {
863 "k".into() => 3.into(),
864 "k2".into() => 0.00001.into(),
865 },
866 hashmap! {
867 "k".into() => 4.into(),
868 "k2".into() => 0.001.into(),
869 },
870 ])),
871 "k2".into(),
872 Array2::from_shape_vec((4, 2), vec![
873 1.into(), 0.0000001.into(),
874 2.into(), 0.000001.into(),
875 3.into(), 0.00001.into(),
876 4.into(), 0.001.into(),
877 ]
878 ).unwrap(),
879 vec!["k".into(), "k2".into()],
880 )]
881 #[case(
882 DataFrame::new(ColumnFrame::from(vec![
883 hashmap! {
884 "k".into() => 2.into(),
885 "k2".into() => "b".into(),
886 },
887 hashmap! {
888 "k".into() => 1.into(),
889 "k2".into() =>"a".into(),
890 },
891 hashmap! {
892 "k".into() => 3.into(),
893 "k2".into() =>"c".into(),
894 },
895 hashmap! {
896 "k".into() => 4.into(),
897 "k2".into() =>"z".into(),
898 },
899 ])),
900 "k2".into(),
901 Array2::from_shape_vec((4, 2), vec![
902 1.into(),"a".into(),
903 2.into(), "b".into(),
904 3.into(), "c".into(),
905 4.into(), "z".into(),
906 ]
907 ).unwrap(),
908 vec!["k".into(), "k2".into()],
909 )]
910 #[traced_test]
911 fn sort_by(
912 #[case] input: DataFrame,
913 #[case] column: Key,
914 #[case] expected: Array2<DataValue>,
915 #[case] columns: Vec<Key>,
916 ) {
917 let result = input.sorted(&column);
918 assert!(result.is_ok(), "{result:?}");
919 let result = result.unwrap().get_sorted();
920 let selected = result.select(Some(&columns));
921
922 assert_eq!(selected, expected);
923 }
924 #[rstest]
925 #[case(
926 DataFrame::new(ColumnFrame::from(vec![
927 hashmap! {
928 "k".into() => 2.into(),
929 "k2".into() => 0.000001.into(),
930 },
931 hashmap! {
932 "k".into() => 1.into(),
933 "k2".into() =>0.0000001.into(),
934 },
935 hashmap! {
936 "k".into() => 3.into(),
937 "k2".into() => 0.00001.into(),
938 },
939 hashmap! {
940 "k".into() => 4.into(),
941 "k2".into() => 0.001.into(),
942 },
943 ])),
944 "k2".into(),
945 TopN::Last(1),
946 Array2::from_shape_vec((1, 2), vec![
947 4.into(), 0.001.into(),
948 ]
949 ).unwrap(),
950 vec!["k".into(), "k2".into()],
951 )]
952 #[case(
953 DataFrame::new(ColumnFrame::from(vec![
954 hashmap! {
955 "k".into() => 2.into(),
956 "k2".into() => 0.000001.into(),
957 },
958 hashmap! {
959 "k".into() => 1.into(),
960 "k2".into() =>0.0000001.into(),
961 },
962 hashmap! {
963 "k".into() => 3.into(),
964 "k2".into() => 0.00001.into(),
965 },
966 hashmap! {
967 "k".into() => 4.into(),
968 "k2".into() => 0.001.into(),
969 },
970 ])),
971 "k2".into(),
972 TopN::Last(2),
973 Array2::from_shape_vec((2, 2), vec![
974 4.into(), 0.001.into(),
975 3.into(), 0.00001.into(),
976 ]
977 ).unwrap(),
978 vec!["k".into(), "k2".into()],
979 )]
980 #[case(
981 DataFrame::new(ColumnFrame::from(vec![
982 hashmap! {
983 "k".into() => 2.into(),
984 "k2".into() => "b".into(),
985 },
986 hashmap! {
987 "k".into() => 1.into(),
988 "k2".into() =>"a".into(),
989 },
990 hashmap! {
991 "k".into() => 3.into(),
992 "k2".into() =>"c".into(),
993 },
994 hashmap! {
995 "k".into() => 4.into(),
996 "k2".into() =>"z".into(),
997 },
998 ])),
999 "k2".into(),
1000 TopN::First(1),
1001 Array2::from_shape_vec((1, 2), vec![
1002 1.into(),"a".into(),
1003 ]
1004 ).unwrap(),
1005 vec!["k".into(), "k2".into()],
1006 )]
1007 #[case(
1008 DataFrame::new(ColumnFrame::from(vec![
1009 hashmap! {
1010 "k".into() => 2.into(),
1011 "k2".into() => "b".into(),
1012 },
1013 hashmap! {
1014 "k".into() => 1.into(),
1015 "k2".into() =>"a".into(),
1016 },
1017 hashmap! {
1018 "k".into() => 3.into(),
1019 "k2".into() =>"c".into(),
1020 },
1021 hashmap! {
1022 "k".into() => 4.into(),
1023 "k2".into() =>"z".into(),
1024 },
1025 ])),
1026 "k2".into(),
1027 TopN::First(2),
1028 Array2::from_shape_vec((2, 2), vec![
1029 1.into(),"a".into(),
1030 2.into(),"b".into(),
1031 ]
1032 ).unwrap(),
1033 vec!["k".into(), "k2".into()],
1034 )]
1035 #[traced_test]
1036 fn top_n(
1037 #[case] input: DataFrame,
1038 #[case] column: Key,
1039 #[case] topn: TopN,
1040 #[case] expected: Array2<DataValue>,
1041 #[case] columns: Vec<Key>,
1042 ) {
1043 let result = input.sorted(&column);
1044 assert!(result.is_ok(), "{result:?}");
1045 let result = result.unwrap();
1046 let first = result.topn(topn).unwrap();
1047 let selected = first.select(Some(&columns));
1048 assert_eq!(selected, expected);
1049 }
1050}