1use column_store::sorted_df::SortedDataFrame;
2use data_value::{DataValue, Extract};
3use halfbrown::HashMap;
4use ndarray::{Array1, Array2, ArrayView1};
5use std::fmt;
6pub mod column_store;
7pub mod index;
8pub mod join;
9pub mod key;
10use crate::{error::Error, CandidateData};
11#[cfg(feature = "python")]
12pub mod python;
13
14#[cfg(feature = "python")]
15use pyo3::prelude::*;
16
17use crate::{
18 dataframe::{column_store::ColumnFrame, join::JoinRelation, key::Key},
19 MLChefMap,
20};
21
/// Selects a number of rows from one end of a sorted frame.
///
/// Passed to `SortedDataFrame::topn`. The `top_n` test in this file expects
/// `First(n)` to yield the first `n` rows of the sort order and `Last(n)` to
/// yield the last `n` rows, final row first.
#[derive(Debug, Clone, PartialEq, Eq, Copy)]
pub enum TopN {
    /// Take this many rows from the start of the sort order.
    First(usize),
    /// Take this many rows from the end of the sort order.
    Last(usize),
}
27
/// A [`ColumnFrame`] bundled with frame-level constants and string-keyed metadata.
///
/// Serializable with serde; exposed as a Python class when the `python`
/// feature is enabled.
#[derive(Debug, Clone, PartialEq, Default, serde::Serialize, serde::Deserialize)]
#[cfg_attr(feature = "python", pyclass)]
pub struct DataFrame {
    /// Values keyed by [`Key`] that live outside the row data.
    /// NOTE(review): presumably values that apply to every row — confirm with callers.
    pub constants: HashMap<Key, DataValue>,
    /// The columnar row storage; most methods on this type delegate to it.
    pub dataframe: ColumnFrame,
    /// Free-form annotations, written by `add_metadata` and read by `get_metadata`.
    pub metadata: HashMap<String, DataValue>,
}
42
43impl fmt::Display for DataFrame {
44 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
45 self.dataframe.fmt(f)
46 }
47}
48
49impl DataFrame {
50 pub fn new<C: Into<ColumnFrame>>(dataframe: C) -> Self {
51 Self {
52 constants: HashMap::new(),
53 dataframe: dataframe.into(),
54 metadata: HashMap::new(),
55 }
56 }
57
58 pub fn shrink(&mut self) {
59 self.dataframe.shrink();
60 }
61
62 pub fn add_metadata(&mut self, key: String, value: DataValue) {
63 self.metadata.insert(key, value);
64 }
65
66 pub fn get_metadata(&self, key: &str) -> Option<&DataValue> {
67 self.metadata.get(key)
68 }
69
70 pub fn join(&mut self, other: Self, join_type: &JoinRelation) -> Result<(), Error> {
71 other.constants.into_iter().for_each(|(key, value)| {
72 self.constants.insert(key, value);
73 });
74 self.dataframe.join(other.dataframe, join_type)
75 }
76
77 pub fn apply_function<F>(&mut self, keys: &[Key], mut func: F) -> Result<(), Error>
78 where
79 F: FnMut(&[Key], &mut ColumnFrame) -> Result<(), Error>,
80 {
81 self.dataframe.apply_function(keys, &mut func)
82 }
83
84 pub fn select(&self, keys: Option<&[Key]>) -> Result<Array2<DataValue>, Error> {
85 Ok(self.dataframe.select(keys))
86 }
87
88 pub fn select_transposed_typed<D: Extract>(&self, keys: &[Key]) -> Vec<Vec<D>> {
89 self.dataframe.select_transposed_typed::<D>(keys)
90 }
91
92 pub fn select_column(&self, key: Key) -> Option<ndarray::ArrayView1<'_, DataValue>> {
93 self.dataframe.select_column(&key)
94 }
95
96 pub fn select_transposed(&self, keys: Option<&[Key]>) -> Result<Array2<DataValue>, Error> {
97 self.dataframe.select_transposed(keys)
98 }
99
100 pub fn insert_constant(&mut self, key: Key, value: DataValue) {
101 self.constants.insert(key, value);
102 }
103
104 pub fn push<C: CandidateData>(&mut self, item: C) -> Result<(), Error> {
105 self.dataframe.push(item)
106 }
107
108 pub fn remove_column(&mut self, keys: &[Key]) -> Result<Self, Error> {
109 self.dataframe.remove_column(keys).map(|x| x.into())
110 }
111
112 pub fn extend(&mut self, items: Self) -> Result<(), Error> {
113 self.dataframe.extend(items.dataframe)
114 }
115
116 pub fn len(&self) -> usize {
117 self.dataframe.len()
118 }
119
120 pub fn is_empty(&self) -> bool {
121 self.dataframe.is_empty()
122 }
123
124 pub fn add_single_column<K: Into<Key>>(
125 &mut self,
126 key: K,
127 values: Array1<DataValue>,
128 ) -> Result<(), Error> {
129 self.dataframe.add_single_column(key, values)
130 }
131
132 pub fn get_single_column(&self, key: &Key) -> Option<ArrayView1<'_, DataValue>> {
133 self.dataframe.get_single_column(key)
134 }
135
136 pub fn sorted(&self, key: &Key) -> Result<SortedDataFrame<'_>, Error> {
137 self.dataframe.sorted(key)
138 }
139
140 pub fn filter(&self, filter: &crate::filter::FilterRules) -> Result<Self, Error> {
141 let filtered_df = self.dataframe.filter(filter)?;
142 Ok(Self {
143 constants: self.constants.clone(),
144 dataframe: filtered_df,
145 metadata: self.metadata.clone(),
146 })
147 }
148
149 #[cfg(feature = "polars-df")]
150 pub fn as_polars(&self) -> Result<polars::prelude::DataFrame, Error> {
151 let mut columns = vec![];
152 for key in self.dataframe.keys() {
153 let values = self
154 .dataframe
155 .get_single_column(key)
156 .ok_or_else(|| Error::NotFound(key.clone()))?
157 .into_iter()
158 .map(|x| into_polars_value(key, x.clone()))
159 .collect::<Vec<_>>();
160 let s = polars::prelude::Column::new(key.name().into(), values);
161
162 columns.push(s);
163 }
164
165 Ok(polars::prelude::DataFrame::new(columns)?)
166 }
167}
/// Maps this crate's [`crate::DataType`] onto the corresponding polars dtype.
///
/// `Unknown` becomes polars `Null`, `Vec` becomes a list of unknown element
/// type, and `Map` becomes a struct with no declared fields.
#[cfg(feature = "polars-df")]
pub fn polars_dtype(dtype: crate::DataType) -> polars::prelude::DataType {
    // Aliases instead of two glob imports: both enums have `String`/`Unknown`
    // variants, so every arm names its enum explicitly.
    use crate::DataType as Src;
    use polars::prelude::{DataType as Dst, UnknownKind};

    match dtype {
        Src::Bool => Dst::Boolean,
        Src::U32 => Dst::UInt32,
        Src::I32 => Dst::Int32,
        Src::U8 => Dst::UInt8,
        Src::U64 => Dst::UInt64,
        Src::I64 => Dst::Int64,
        Src::F32 => Dst::Float32,
        Src::F64 => Dst::Float64,
        Src::U128 => Dst::UInt128,
        Src::I128 => Dst::Int128,
        Src::String => Dst::String,
        Src::Bytes => Dst::Binary,
        Src::Unknown => Dst::Null,
        Src::Vec => Dst::List(Box::new(Dst::Unknown(UnknownKind::Any))),
        Src::Map => Dst::Struct(Vec::new()),
    }
}
192
/// Converts a [`DataValue`] into an owned polars `AnyValue`.
///
/// The value is first passed through `convert_dv_to_dtype` for `key`, then
/// mapped variant by variant:
/// * scalars map to the polars variant of the same width, except `U8`, which is
///   widened to `UInt32`, and `EnumNumber`, which becomes `Int32`;
/// * `Vec` becomes a `List` whose element key uses the dtype of the first
///   element with a detectable dtype (or `Unknown` when there is none);
/// * `Map` becomes a `StructOwned` whose fields are ordered by key so the
///   output is deterministic.
///
/// # Panics
/// Panics if polars cannot build a series from the elements of a `Vec` value.
#[cfg(feature = "polars-df")]
pub fn into_polars_value(key: &Key, dv: DataValue) -> polars::prelude::AnyValue<'static> {
    use polars::prelude::AnyValue::*;
    use polars::prelude::Field;

    use crate::dataframe::column_store::convert_dv_to_dtype;
    let dv = convert_dv_to_dtype(key, dv);
    match dv {
        DataValue::String(smart_string) => StringOwned(smart_string.as_str().into()),
        DataValue::Bytes(items) => BinaryOwned(items),
        // NOTE(review): `polars_dtype` reports `UInt8` for `U8`, and
        // `from_polars_value` maps `UInt8` back to `U8`, yet the value is widened
        // to `UInt32` here. Kept as-is because it may be deliberate (e.g. polars
        // built without u8 support) — confirm and align one way or the other.
        DataValue::U8(x) => UInt32(u32::from(x)),
        DataValue::Bool(x) => Boolean(x),
        DataValue::I32(x) => Int32(x),
        DataValue::U32(x) => UInt32(x),
        DataValue::I64(x) => Int64(x),
        DataValue::U64(x) => UInt64(x),
        DataValue::I128(x) => Int128(x),
        DataValue::F32(x) => Float32(x),
        DataValue::F64(x) => Float64(x),
        DataValue::Null => Null,
        DataValue::Vec(data_values) => {
            // The first element with a known dtype decides the element key.
            let element_dtype = data_values
                .iter()
                .map(|d| crate::detect_dtype(d))
                .find(|dt| !matches!(dt, crate::DataType::Unknown))
                .unwrap_or(crate::DataType::Unknown);
            let vec_key = Key::new(key.name(), element_dtype);
            let elements = data_values
                .into_iter()
                .map(|x| into_polars_value(&vec_key, x))
                .collect::<Vec<_>>();
            let series =
                polars::series::Series::from_any_values(key.name().into(), &elements, true);
            // Build the panic message only on failure.
            List(series.unwrap_or_else(|err| {
                panic!("Cannot create series for {key:?}: {err:?}")
            }))
        }
        DataValue::EnumNumber(x) => Int32(x),
        DataValue::U128(x) => UInt128(x),
        DataValue::Map(x) => {
            // Consume the map and order entries by key; this removes the second
            // lookup and the per-value clone.
            let mut entries = x.into_iter().collect::<Vec<_>>();
            entries.sort_by(|(a, _), (b, _)| a.cmp(b));

            let mut values = Vec::with_capacity(entries.len());
            let mut fields = Vec::with_capacity(entries.len());
            for (name, value) in entries {
                let dtype = crate::detect_dtype(&value);
                let field_key = Key::new(&name, dtype);
                fields.push(Field::new(field_key.name().into(), polars_dtype(dtype)));
                values.push(into_polars_value(&field_key, value));
            }
            StructOwned(Box::new((values, fields)))
        }
    }
}
253
/// Converts a polars `AnyValue` into this crate's [`DataValue`].
///
/// Narrow integers are widened (`UInt16` to `U32`, `Int8`/`Int16` to `I32`),
/// lists are converted element by element, and owned structs become a `Map`
/// keyed by field name. Variants without an arm are logged with
/// `tracing::warn!` and converted to `DataValue::Null`.
#[cfg(feature = "polars-df")]
pub fn from_polars_value(dv: polars::prelude::AnyValue<'_>) -> DataValue {
    use polars::prelude::AnyValue::*;
    match dv {
        Null => DataValue::Null,
        Boolean(v) => v.into(),
        String(v) => DataValue::String(v.into()),
        UInt8(v) => DataValue::U8(v),
        UInt16(v) => DataValue::U32(u32::from(v)),
        UInt32(v) => v.into(),
        UInt64(v) => v.into(),
        // Counterpart of `DataValue::U128 => UInt128` in `into_polars_value`;
        // without this arm a u128 round-trips to `Null`.
        UInt128(v) => DataValue::U128(v),
        Int8(v) => i32::from(v).into(),
        Int16(v) => i32::from(v).into(),
        Int32(v) => v.into(),
        Int64(v) => v.into(),
        Float32(v) => v.into(),
        Float64(v) => v.into(),
        Int128(v) => v.into(),
        List(series) => DataValue::Vec(series.iter().map(from_polars_value).collect::<Vec<_>>()),
        StringOwned(v) => DataValue::String(v.as_str().into()),
        Binary(v) => DataValue::Bytes(v.to_owned()),
        BinaryOwned(v) => DataValue::Bytes(v),
        StructOwned(m) => {
            // The box holds (values, fields) in matching order.
            let (values, fields) = *m;
            let hm: std::collections::HashMap<smartstring::alias::String, DataValue> = fields
                .into_iter()
                .zip(values)
                .map(|(field, value)| (field.name.as_str().into(), from_polars_value(value)))
                .collect();
            DataValue::Map(hm)
        }
        e => {
            tracing::warn!("Unsupported polars value: {e:?}");
            DataValue::Null
        }
    }
}
293
294impl From<ColumnFrame> for DataFrame {
295 fn from(dataframe: ColumnFrame) -> Self {
296 Self::new(dataframe)
297 }
298}
299
300impl From<Vec<std::collections::HashMap<Key, DataValue>>> for DataFrame {
301 fn from(dataframe: Vec<std::collections::HashMap<Key, DataValue>>) -> Self {
302 Self::new(ColumnFrame::from(dataframe))
303 }
304}
305
306impl From<Vec<HashMap<Key, DataValue>>> for DataFrame {
307 fn from(dataframe: Vec<HashMap<Key, DataValue>>) -> Self {
308 Self::new(ColumnFrame::from(dataframe))
309 }
310}
311
312impl From<std::collections::HashMap<String, Vec<DataValue>>> for DataFrame {
313 fn from(dataframe: std::collections::HashMap<String, Vec<DataValue>>) -> Self {
314 Self::new(ColumnFrame::from(dataframe))
315 }
316}
317
318impl From<MLChefMap> for DataFrame {
319 fn from(dataframe: MLChefMap) -> Self {
320 Self::new(ColumnFrame::from(dataframe))
321 }
322}
323impl From<Vec<(Key, Vec<DataValue>)>> for DataFrame {
324 fn from(dataframe: Vec<(Key, Vec<DataValue>)>) -> Self {
325 Self::new(ColumnFrame::from(dataframe))
326 }
327}
328
329impl From<std::collections::HashMap<String, Array1<DataValue>>> for DataFrame {
330 fn from(dataframe: std::collections::HashMap<String, Array1<DataValue>>) -> Self {
331 Self::new(ColumnFrame::from(dataframe))
332 }
333}
334
/// Imports a polars `DataFrame`; constants and metadata start empty.
#[cfg(feature = "polars-df")]
impl From<polars::prelude::DataFrame> for DataFrame {
    fn from(dataframe: polars::prelude::DataFrame) -> Self {
        // `new` performs the `ColumnFrame` conversion through `Into`.
        Self::new(dataframe)
    }
}
#[cfg(test)]
mod test {
    use crate::filter::FilterRules;
    // Only the polars conversion test names `DataType`; gate the import so it is
    // not reported as unused when the feature is disabled.
    #[cfg(feature = "polars-df")]
    use crate::DataType;

    use super::*;
    use halfbrown::hashmap;
    #[cfg(feature = "polars-df")]
    use polars::prelude::NamedFrom as _;
    use rstest::*;
    use tracing_test::traced_test;

    /// Two complete rows over `key1` (ints) and `key2` (strings).
    #[fixture]
    fn dummy_candidates() -> ColumnFrame {
        ColumnFrame::from(vec![
            hashmap! {
                "key1".into() => 1.into(),
                "key2".into() => "a".into(),
            },
            hashmap! {
                "key1".into() => 2.into(),
                "key2".into() => "b".into(),
            },
        ])
    }

    /// Four rows over `k`, `k2`, `k3`; the second row has no `k3` value.
    /// Shared by the `select_multiple` and `sort_by` cases.
    fn sparse_frame() -> DataFrame {
        DataFrame::new(ColumnFrame::from(vec![
            hashmap! {
                "k".into() => 1.into(),
                "k2".into() => 2.into(),
                "k3".into() => 2.2.into(),
            },
            hashmap! {
                "k".into() => 11.into(),
                "k2".into() => 3.into(),
            },
            hashmap! {
                "k".into() => 4.into(),
                "k2".into() => 5.into(),
                "k3".into() => 2.3.into(),
            },
            hashmap! {
                "k".into() => 4.into(),
                "k2".into() => 5.into(),
                "k3".into() => 2.4.into(),
            },
        ]))
    }

    /// Four rows with integer `k` and small float `k2`, deliberately unsorted.
    /// Shared by the `sort_by` and `top_n` cases.
    fn float_frame() -> DataFrame {
        DataFrame::new(ColumnFrame::from(vec![
            hashmap! {
                "k".into() => 2.into(),
                "k2".into() => 0.000001.into(),
            },
            hashmap! {
                "k".into() => 1.into(),
                "k2".into() => 0.0000001.into(),
            },
            hashmap! {
                "k".into() => 3.into(),
                "k2".into() => 0.00001.into(),
            },
            hashmap! {
                "k".into() => 4.into(),
                "k2".into() => 0.001.into(),
            },
        ]))
    }

    /// Four rows with integer `k` and single-letter `k2`, deliberately unsorted.
    /// Shared by the `sort_by` and `top_n` cases.
    fn letter_frame() -> DataFrame {
        DataFrame::new(ColumnFrame::from(vec![
            hashmap! {
                "k".into() => 2.into(),
                "k2".into() => "b".into(),
            },
            hashmap! {
                "k".into() => 1.into(),
                "k2".into() => "a".into(),
            },
            hashmap! {
                "k".into() => 3.into(),
                "k2".into() => "c".into(),
            },
            hashmap! {
                "k".into() => 4.into(),
                "k2".into() => "z".into(),
            },
        ]))
    }

    /// A frame survives a JSON round trip unchanged.
    #[rstest]
    fn test_serde() {
        let df = crate::df! {
            "a" => [1u64, 2u64, 3u64],
            "b" => [4u64, 5u64, 6u64],
            "c" => [7u64, 8u64, 9u64]
        };

        let serialized = serde_json::to_string(&df).expect("BUG: Unable to serialize dataframe");

        let deserialized =
            serde_json::from_str(&serialized).expect("BUG: Unable to deserialize dataframe");

        assert_eq!(df, deserialized);
    }

    /// Importing a polars frame yields the same cells as the native macro.
    #[cfg(feature = "polars-df")]
    #[rstest]
    fn test_polars() {
        let expected = crate::df! {
            "a" => [1u64, 2u64, 3u64],
            "b" => [4f64, 5f64, 6f64],
            "c" => [7i64, 8i64, 9i64]
        };

        let polars_df = polars::df!(
            "a" => [1u64, 2u64, 3u64],
            "b" => [4f64, 5f64, 6f64],
            "c" => [7i64, 8i64, 9i64]
        )
        .expect("BUG: should be ok");
        let as_df: DataFrame = polars_df.into();
        let keys: Vec<Key> = vec!["a".into(), "b".into(), "c".into()];
        assert_eq!(
            as_df.select(Some(keys.as_slice())),
            expected.select(Some(keys.as_slice()))
        );
    }

    /// Each case converts `input` to polars and `output` back again.
    #[cfg(feature = "polars-df")]
    #[rstest]
    #[case::str(Key::new("a", DataType::String), DataValue::String("test".into()), polars::prelude::AnyValue::String("test".into()))]
    #[case::u32(
        Key::new("a", DataType::U32),
        DataValue::U32(u32::MAX),
        polars::prelude::AnyValue::UInt32(u32::MAX)
    )]
    #[case::i32(
        Key::new("a", DataType::I32),
        DataValue::I32(i32::MIN),
        polars::prelude::AnyValue::Int32(i32::MIN)
    )]
    #[case::i64(
        Key::new("a", DataType::I64),
        DataValue::I64(i64::MIN),
        polars::prelude::AnyValue::Int64(i64::MIN)
    )]
    #[case::u64(
        Key::new("a", DataType::U64),
        DataValue::U64(u64::MIN),
        polars::prelude::AnyValue::UInt64(u64::MIN)
    )]
    #[case::f32(
        Key::new("a", DataType::F32),
        DataValue::F32(f32::MIN),
        polars::prelude::AnyValue::Float32(f32::MIN)
    )]
    #[case::f64(
        Key::new("a", DataType::F64),
        DataValue::F64(f64::MIN),
        polars::prelude::AnyValue::Float64(f64::MIN)
    )]
    #[case::null(
        Key::new("a", DataType::Unknown),
        DataValue::Null,
        polars::prelude::AnyValue::Null
    )]
    #[case::i128(
        Key::new("a", DataType::I128),
        DataValue::I128(i128::MIN),
        polars::prelude::AnyValue::Int128(i128::MIN)
    )]
    #[case::u8(
        Key::new("a", DataType::U8),
        DataValue::U8(255),
        polars::prelude::AnyValue::UInt8(255)
    )]
    #[case::bool(
        Key::new("a", DataType::Bool),
        DataValue::Bool(true),
        polars::prelude::AnyValue::Boolean(true)
    )]
    #[case::bytes(Key::new("a", DataType::Bytes), DataValue::Bytes("aaaaa".as_bytes().to_vec()), polars::prelude::AnyValue::BinaryOwned("aaaaa".as_bytes().to_vec()))]
    #[case::vec_uints(Key::new("a", DataType::Vec), DataValue::Vec(vec![DataValue::U32(0), DataValue::U32(1)]), polars::prelude::AnyValue::List(polars::series::Series::new("v".into(), vec![polars::prelude::AnyValue::UInt32(0u32), polars::prelude::AnyValue::UInt32(1)])))]
    #[case::map(Key::new("a", DataType::Map), DataValue::Map(data_value::stdhashmap!("a" => 0u64, "b" => "s")), polars::prelude::AnyValue::StructOwned(Box::new((
        vec![polars::prelude::AnyValue::UInt64(0u64), polars::prelude::AnyValue::String("s".into())],
        vec![polars::prelude::Field::new("a".into(), polars::prelude::DataType::UInt64), polars::prelude::Field::new("b".into(), polars::prelude::DataType::String)]))))]
    fn into_polars_value_test(
        #[case] key: Key,
        #[case] input: DataValue,
        #[case] output: polars::prelude::AnyValue<'static>,
    ) {
        assert_eq!(into_polars_value(&key, input.clone()), output);
        assert_eq!(from_polars_value(output), input);
    }

    /// Filtering keeps exactly the rows matched by the rule expression.
    #[rstest]
    #[case(
        DataFrame::new(crate::column_frame! {
            "a" => [1f64, 2f64, 3f64],
            "b" => [4i64, 5i64, 6i64],
            "c" => [7i64, 8i64, 9i64]
        }),
        DataFrame::new(crate::column_frame! {
            "a" => [1f64, 2f64],
            "b" => [4i64, 5i64],
            "c" => [7i64, 8i64]
        }),
        FilterRules::try_from("a >= 1f64 && (b <= 5 || c <= 8) && b >= 4").expect("BUG: cannot create filter rules"),
    )]
    #[case(
        DataFrame::new(crate::column_frame! {
            "a" => [1f64, 2f64, 3f64],
            "b" => [4i64, 5i64, 6i64],
            "c" => [7i64, 8i64, 9i64]
        }),
        DataFrame::new(crate::column_frame! {
            "a" => [2f64],
            "b" => [5i64],
            "c" => [8i64]
        }),
        FilterRules::try_from("a % 2f64 == 0f64").expect("BUG: cannot create filter rules"),
    )]
    #[traced_test]
    fn filter_test(
        #[case] df: DataFrame,
        #[case] expected: DataFrame,
        #[case] filter: FilterRules,
    ) {
        let filtered = df.filter(&filter).expect("BUG: cannot filter");
        assert_eq!(filtered, expected);
    }

    /// A hand-written JSON document deserializes, both alone and inside an array.
    #[rstest]
    fn test_serde_complex() {
        let simple = r#"
{
    "constants": {},
    "dataframe": {
        "index": {
            "keys": [
                {
                    "key": 3162770485,
                    "name": "a",
                    "ctype": "U32"
                },
                {
                    "key": 2279056742,
                    "name": "b",
                    "ctype": "F64"
                },
                {
                    "key": 2994984227,
                    "name": "c",
                    "ctype": "U64"
                },
                {
                    "key": 3319645144,
                    "name": "d",
                    "ctype": "F64"
                },
                {
                    "key": 1291847470,
                    "name": "e",
                    "ctype": "U32"
                },
                {
                    "key": 874241070,
                    "name": "f",
                    "ctype": "Bool"
                }
            ],
            "indexes": {
                "a": 0,
                "b": 1,
                "c": 2,
                "d": 3,
                "e": 4,
                "f": 5
            },
            "alias": {}
        },
        "data_frame": {
            "v": 1,
            "dim": [
                2,
                6
            ],
            "data": [
                253780,
                0.009369421750307085,
                1633222860381359,
                8,
                5,
                true,
                64512,
                0.003391335718333721,
                1633222860810557,
                8,
                5,
                null
            ]
        }
    },
    "metadata": {}
}
        "#;

        let simple_deserialized: DataFrame =
            serde_json::from_str(simple).expect("BUG: Unable to deserialize dataframe");

        println!("deserialized: {simple_deserialized:?}");
        let array = format!("[{}, {}, {}]", simple, simple, simple);
        let deserialized: Vec<DataFrame> =
            serde_json::from_str(&array).expect("BUG: Unable to deserialize dataframe");

        println!("deserialized: {deserialized:?}");
        assert_eq!(deserialized.len(), 3);
        assert_eq!(simple_deserialized, deserialized[0]);
    }

    /// Every supported input shape converts to the same frame, and `key1` can be
    /// read back as a column.
    #[rstest]
    #[case(hashmap!("key1".into() => vec![1.into(), 2.into()], "key2".into() => vec!["a".into()]))]
    #[case(data_value::stdhashmap!("key1" => vec![1, 2], "key2" => vec!["a"]))]
    #[case(vec![hashmap! {
        "key1".into() => 1.into(),
        "key2".into() => "a".into(),
    },
    hashmap! {
        "key1".into() => 2.into(),
    },])]
    #[case(vec![data_value::stdhashmap! {
        "key1" => DataValue::from(1),
        "key2" => DataValue::from("a"),
    },data_value::stdhashmap! {
        "key1" => DataValue::from(2),
    },])]
    #[case(vec![("key1".into(), vec! [DataValue::from(1), DataValue::from(2)]), ("key2".into(),
        vec![DataValue::from("a"), DataValue::Null])])]
    fn test_select_column<T: Into<DataFrame>>(#[case] input: T) {
        let df: DataFrame = input.into();
        assert_eq!(
            df,
            DataFrame {
                constants: HashMap::new(),
                dataframe: ColumnFrame::from(vec![
                    hashmap! {
                        "key1".into() => 1.into(),
                        "key2".into() => "a".into(),
                    },
                    hashmap! {
                        "key1".into() => 2.into(),
                    },
                ]),
                metadata: HashMap::new(),
            }
        );
        let selected_transposed = df.select_column("key1".into());
        assert!(selected_transposed.is_some());
        let selected_transposed = selected_transposed.unwrap();
        assert_eq!(selected_transposed.len(), 2);
        assert_eq!(selected_transposed, ndarray::array![1.into(), 2.into()]);
    }

    /// Every supported input shape selects identically and extracts as `i32`.
    #[rstest]
    #[case::hhm(hashmap!("key1".into() => vec![1.into(), 2.into()], "key2".into() => vec!["a".into()]))]
    #[case::stdhm(data_value::stdhashmap!("key1" => vec![1, 2], "key2" => vec!["a"]))]
    #[case::hm({
        let hm: std::collections::HashMap<String, Array1<DataValue>> = data_value::stdhashmap!("key1".to_string() => Array1::from_vec(vec![DataValue::from(1), DataValue::from(2)]), "key2".to_string() => Array1::from_vec(vec![DataValue::from("a"), DataValue::Null]));
        hm
    })]
    #[case::vec_hhm(vec![hashmap! {
        "key1".into() => 1.into(),
        "key2".into() => "a".into(),
    },
    hashmap! {
        "key1".into() => 2.into(),
    },])]
    #[case::vec_hme(vec![data_value::stdhashmap! {
        "key1" => DataValue::from(1),
        "key2" => DataValue::from("a"),
    },data_value::stdhashmap! {
        "key1" => DataValue::from(2),
    },])]
    #[case::vec_vec(vec![("key1".into(), vec! [DataValue::from(1), DataValue::from(2)]), ("key2".into(), vec![DataValue::from("a"), DataValue::Null])])]
    fn test_from_conversion<T: Into<DataFrame>>(#[case] input: T) {
        let df: DataFrame = input.into();
        let expected: DataFrame = DataFrame {
            constants: HashMap::new(),
            dataframe: ColumnFrame::from(vec![
                hashmap! {
                    "key1".into() => 1.into(),
                    "key2".into() => "a".into(),
                },
                hashmap! {
                    "key1".into() => 2.into(),
                },
            ]),
            metadata: HashMap::new(),
        };
        assert_eq!(
            df.select(Some(&["key1".into(), "key2".into()])),
            expected.select(Some(&["key1".into(), "key2".into()])),
            "{df} vs {expected}"
        );
        let selected_transposed = df.select_transposed_typed::<i32>(&["key1".into()]);
        assert_eq!(selected_transposed.len(), 2);
        println!("{:?}", selected_transposed);
        assert_eq!(selected_transposed, vec![vec![1], vec![2]]);
    }

    /// End-to-end walk: extend, push, constants, apply, remove a column, re-join.
    #[rstest]
    fn test_dataframe(dummy_candidates: ColumnFrame) {
        let mut dataframe: DataFrame = DataFrame::default();
        assert!(dataframe.is_empty());
        assert!(dataframe.extend(dummy_candidates.into()).is_ok());
        assert_eq!(dataframe.len(), 2);

        let candidate = hashmap! {
            "key1".into() => 3.into(),
            "key2".into() => "c".into(),
        };

        assert!(dataframe.push(candidate).is_ok());
        assert_eq!(dataframe.len(), 3);
        assert!(!dataframe.is_empty());

        dataframe.insert_constant("key3".into(), 4.into());
        assert_eq!(dataframe.constants.len(), 1);
        assert!(dataframe
            .apply_function(&["key1".into()], |keys, df| {
                let key = keys[0].clone();
                let s = df
                    .get_single_column(&key)
                    .expect("BUG: Cannot get column")
                    .to_owned();
                let s = s.mapv(|x| x + DataValue::from(1));
                df.add_single_column("key5", s)?;
                Ok(())
            })
            .is_ok());
        let original = dataframe.clone();
        dataframe.shrink();
        let remove_df = dataframe.remove_column(&["key1".into()]);
        assert!(remove_df.is_ok());
        let mut remove_df = remove_df.unwrap();
        assert_eq!(remove_df.len(), 3);
        let selected = dataframe.select(Some(&["key2".into()]));
        assert!(selected.is_ok());
        let selected = selected.unwrap();
        println!("{:?}", selected);
        assert_eq!(selected.len(), 3);

        let joined_result =
            remove_df.join(dataframe, &JoinRelation::new(crate::JoinBy::AddColumns));
        assert!(joined_result.is_ok(), "{:?}", joined_result);
        assert_eq!(original, remove_df);
    }

    /// Metadata is per frame; a freshly built frame has none.
    #[rstest]
    fn test_metadata(dummy_candidates: ColumnFrame) {
        let mut dataframe: DataFrame = DataFrame::default();
        assert!(dataframe.is_empty());
        println!("{:?}", dataframe);
        assert!(dataframe.extend(dummy_candidates.into()).is_ok());
        println!("{:?}", dataframe);
        assert_eq!(dataframe.len(), 2);

        dataframe.add_metadata("test".into(), 1.into());
        assert_eq!(dataframe.get_metadata("test"), Some(&1.into()));
        let dataframe = DataFrame::new(ColumnFrame::from(vec![
            hashmap! {
                "key1".into() => 1.into(),
                "key2".into() => "a".into(),
            },
            hashmap! {
                "key1".into() => 2.into(),
                "key2".into() => "b".into(),
            },
        ]));
        assert_eq!(dataframe.get_metadata("test"), None);
        let tt = dataframe.select_transposed(None);
        assert!(tt.is_ok());
        let tt = tt.unwrap();
        assert_eq!(tt.shape(), [2, 2]);
        assert_eq!(
            tt,
            Array2::from_shape_vec((2, 2), vec![1.into(), 2.into(), "a".into(), "b".into()])
                .unwrap()
        );
    }

    /// Columns must match the existing row count and cannot overwrite data.
    #[rstest]
    #[traced_test]
    fn add_single_column_test() {
        let mut dataframe = DataFrame::default();
        let values = Array1::from(vec![1.into(), 2.into(), 3.into()]);
        let r = dataframe.add_single_column("key1", values);
        assert!(r.is_ok(), "{r:?}");
        let selected = dataframe.select(None);
        assert!(selected.is_ok());
        let selected = selected.unwrap();
        assert_eq!(selected.shape(), [3, 1]);
        assert_eq!(
            selected,
            Array2::from_shape_vec((3, 1), vec![1.into(), 2.into(), 3.into()]).unwrap()
        );
        let values = Array1::from(vec![1.into(), 2.into()]);
        assert!(dataframe.add_single_column("key1", values).is_err());
        let values = Array1::from(vec![3.into(), 4.into(), 5.into()]);
        assert!(dataframe.add_single_column("key2", values).is_ok());
        let values = Array1::from(vec![3.into()]);
        assert!(dataframe.add_single_column("key3", values).is_err());
    }

    /// Starting from an empty column: later columns fix the row count and the
    /// empty column reads back as nulls.
    #[rstest]
    #[traced_test]
    fn add_single_column_empty_test() {
        let mut dataframe = DataFrame::default();
        let values = Array1::from(vec![]);
        let r = dataframe.add_single_column("key1", values);
        assert!(r.is_ok(), "{r:?}");
        let selected = dataframe.select(None);
        assert!(selected.is_ok());
        let selected = selected.unwrap();
        assert_eq!(selected.shape(), [0, 1]);
        assert_eq!(selected, Array2::from_shape_vec((0, 1), vec![]).unwrap());
        let values = Array1::from(vec![1.into(), 2.into()]);
        assert!(dataframe.add_single_column("key1", values).is_err());
        let values = Array1::from(vec![3.into(), 4.into(), 5.into()]);
        assert!(dataframe.add_single_column("key2", values).is_ok());
        let values = Array1::from(vec![3.into(), 4.into()]);
        assert!(dataframe.add_single_column("key3", values).is_err());
        let values = Array1::from(vec![3.into(), 4.into(), 5.into()]);
        assert!(dataframe.add_single_column("key3", values).is_ok());

        assert_eq!(
            dataframe
                .select_column("key1".into())
                .expect("BUG: has to exists"),
            ndarray::arr1(&[DataValue::Null, DataValue::Null, DataValue::Null]),
        );
        assert_eq!(
            dataframe
                .select_column("key2".into())
                .expect("BUG: has to exists"),
            ndarray::arr1(&[3.into(), 4.into(), 5.into()]),
        );
        assert_eq!(
            dataframe.select(None).expect("BUG: cannot get data"),
            ndarray::arr2(&[
                [DataValue::Null, 3.into(), 3.into()],
                [DataValue::Null, 4.into(), 4.into()],
                [DataValue::Null, 5.into(), 5.into()],
            ])
        );
    }

    /// Selecting several columns keeps the requested order; unknown columns and
    /// missing cells come back as nulls.
    #[rstest]
    #[case(
        sparse_frame(),
        vec!["k".into(), "k2".into()],
        Array2::from_shape_vec((4, 2), vec![1.into(), 2.into(), 11.into(), 3.into(), 4.into(), 5.into(), 4.into(), 5.into()]).unwrap()
    )]
    #[case(
        sparse_frame(),
        vec!["k2".into(), "k3".into(), "nonexist1".into(), "nonexists2".into(), "k".into()],
        Array2::from_shape_vec((4, 5), vec![
            2.into(), 2.2.into(), DataValue::Null, DataValue::Null, 1.into(),
            3.into(), DataValue::Null, DataValue::Null, DataValue::Null, 11.into(),
            5.into(), 2.3.into(), DataValue::Null, DataValue::Null, 4.into(),
            5.into(), 2.4.into(), DataValue::Null, DataValue::Null, 4.into()]).unwrap()
    )]
    #[traced_test]
    fn select_multiple(
        #[case] input: DataFrame,
        #[case] columns: Vec<Key>,
        #[case] expected: Array2<DataValue>,
    ) {
        let selected = input.select(Some(&columns));
        assert!(selected.is_ok());
        let selected = selected.unwrap();

        assert_eq!(selected, expected);
    }

    /// Sorting by a column orders rows ascending; a row whose sort cell is
    /// missing sorts first.
    #[rstest]
    #[case(
        sparse_frame(),
        "k".into(),
        Array2::from_shape_vec((4, 3), vec![
            1.into(), 2.into(), 2.2.into(),
            4.into(), 5.into(), 2.3.into(),
            4.into(), 5.into(), 2.4.into(),
            11.into(), 3.into(), DataValue::Null,
            ]
        ).unwrap(),
        vec!["k".into(), "k2".into(), "k3".into()],
    )]
    #[case(
        sparse_frame(),
        "k3".into(),
        Array2::from_shape_vec((4, 3), vec![
            11.into(), 3.into(), DataValue::Null,
            1.into(), 2.into(), 2.2.into(),
            4.into(), 5.into(), 2.3.into(),
            4.into(), 5.into(), 2.4.into(),
            ]
        ).unwrap(),
        vec!["k".into(), "k2".into(), "k3".into()],
    )]
    #[case(
        float_frame(),
        "k2".into(),
        Array2::from_shape_vec((4, 2), vec![
            1.into(), 0.0000001.into(),
            2.into(), 0.000001.into(),
            3.into(), 0.00001.into(),
            4.into(), 0.001.into(),
            ]
        ).unwrap(),
        vec!["k".into(), "k2".into()],
    )]
    #[case(
        letter_frame(),
        "k2".into(),
        Array2::from_shape_vec((4, 2), vec![
            1.into(),"a".into(),
            2.into(), "b".into(),
            3.into(), "c".into(),
            4.into(), "z".into(),
            ]
        ).unwrap(),
        vec!["k".into(), "k2".into()],
    )]
    #[traced_test]
    fn sort_by(
        #[case] input: DataFrame,
        #[case] column: Key,
        #[case] expected: Array2<DataValue>,
        #[case] columns: Vec<Key>,
    ) {
        let result = input.sorted(&column);
        assert!(result.is_ok(), "{result:?}");
        let result = result.unwrap().get_sorted();
        let selected = result.select(Some(&columns));

        assert_eq!(selected, expected);
    }

    /// `TopN::First(n)` takes rows from the start of the sort order, `Last(n)`
    /// from the end (final row first).
    #[rstest]
    #[case(
        float_frame(),
        "k2".into(),
        TopN::Last(1),
        Array2::from_shape_vec((1, 2), vec![
            4.into(), 0.001.into(),
            ]
        ).unwrap(),
        vec!["k".into(), "k2".into()],
    )]
    #[case(
        float_frame(),
        "k2".into(),
        TopN::Last(2),
        Array2::from_shape_vec((2, 2), vec![
            4.into(), 0.001.into(),
            3.into(), 0.00001.into(),
            ]
        ).unwrap(),
        vec!["k".into(), "k2".into()],
    )]
    #[case(
        letter_frame(),
        "k2".into(),
        TopN::First(1),
        Array2::from_shape_vec((1, 2), vec![
            1.into(),"a".into(),
            ]
        ).unwrap(),
        vec!["k".into(), "k2".into()],
    )]
    #[case(
        letter_frame(),
        "k2".into(),
        TopN::First(2),
        Array2::from_shape_vec((2, 2), vec![
            1.into(),"a".into(),
            2.into(),"b".into(),
            ]
        ).unwrap(),
        vec!["k".into(), "k2".into()],
    )]
    #[traced_test]
    fn top_n(
        #[case] input: DataFrame,
        #[case] column: Key,
        #[case] topn: TopN,
        #[case] expected: Array2<DataValue>,
        #[case] columns: Vec<Key>,
    ) {
        let result = input.sorted(&column);
        assert!(result.is_ok(), "{result:?}");
        let result = result.unwrap();
        let first = result.topn(topn).unwrap();
        let selected = first.select(Some(&columns));
        assert_eq!(selected, expected);
    }
}