blackjack/dataframe/
mod.rs

1//! DataFrame object and associated functionality
2//!
3//!
4
5use baggie::Baggie;
6use num::*;
7use serde::Deserialize;
8
9use crate::prelude::*;
10
11pub mod dataframe_groupby;
12pub mod io;
13pub use self::dataframe_groupby::*;
14pub use self::io::*;
15
16/// The container for `Series<T>` objects, allowing for additional functionality
17#[derive(Default, Debug)]
18pub struct DataFrame<I>
19where
20    I: PartialOrd + PartialEq + BlackJackData,
21{
22    index: Series<I>,
23    meta: Vec<SeriesMeta>,
24    data: Baggie<String>,
25}
26
27impl<I: PartialOrd + PartialEq + BlackJackData> DataFrame<I> {
28    /// Create a new `DataFrame` struct
29    ///
30    /// ## Example
31    /// ```
32    /// use blackjack::prelude::*;
33    ///
34    /// let mut df: DataFrame<i32> = DataFrame::new();  // `i32` indicates index type of DataFrame
35    /// ```
36    pub fn new() -> Self {
37        DataFrame {
38            index: Series::default(),
39            data: Baggie::new(),
40            meta: vec![],
41        }
42    }
43
44    /// Filter the dataframe by iterating over its `Row`s.
45    ///
46    /// ## Example
47    ///
48    /// ```
49    /// # use blackjack::prelude::*;
50    /// let mut s1 = Series::from(0..5);
51    /// s1.set_name("col1");
52    ///
53    /// let mut s2 = Series::from(10..15);
54    /// s2.set_name("col2");
55    ///
56    /// let mut s3 = Series::from_vec(vec![
57    ///     "foo".to_string(),
58    ///     "bar".to_string(),
59    ///     "foo".to_string(),
60    ///     "bar".to_string(),
61    ///     "foo".to_string(),
62    /// ]);
63    /// s3.set_name("col3");
64    ///
65    /// let mut df = DataFrame::new();
66    /// assert!(df.add_column(s1).is_ok());
67    /// assert!(df.add_column(s2).is_ok());
68    /// assert!(df.add_column(s3).is_ok());
69    ///
70    /// // Before filtering, we're len 5
71    /// assert_eq!(df.len(), 5);
72    ///
73    /// df.filter_by_row(|row| row["col1"] == Datum::I32(&0));
74    ///
75    /// // After filtering, we're len 4 and first element of 'col1' is now 1
76    /// assert_eq!(df.len(), 4);
77    ///
78    /// // Filter by string foo,
79    /// df.filter_by_row(|row| row["col3"] != Datum::STR(&"foo".to_string()));
80    /// assert_eq!(df.len(), 2);
81    /// ```
82    pub fn filter_by_row<F>(&mut self, condition: F) -> ()
83    where
84        F: Fn(&Row<'_>) -> bool,
85    {
86        let positions_to_drop = self
87            .iter_rows()
88            .enumerate()
89            .filter(|(_idx, row)| condition(row))
90            .map(|(idx, _)| idx)
91            .collect::<Vec<usize>>();
92
93        self.drop_positions(positions_to_drop.into_iter())
94    }
95
96    /// Drop positions within the `Series`
97    ///
98    /// ## Example
99    /// ```
100    /// # use blackjack::prelude::*;
101    ///
102    /// let mut df = DataFrame::new();
103    /// assert!(df.add_column(Series::from(0..10)).is_ok());
104    ///
105    /// assert_eq!(df.len(), 10);
106    /// df.drop_positions(0..5);  // Iterator of `usize` items
107    /// assert_eq!(df.len(), 5);
108    /// ```
109    pub fn drop_positions(&mut self, positions: impl Iterator<Item = usize>) -> () {
110        let positions = positions.into_iter().collect::<Vec<usize>>();
111        for meta in self.meta.clone() {
112            match meta.dtype {
113                DType::F64 => {
114                    let s: &mut Series<f64> = &mut self.get_column_mut(meta.name.as_str()).unwrap();
115                    s.drop_positions(positions.clone())
116                }
117                DType::I64 => {
118                    let s: &mut Series<i64> = &mut self.get_column_mut(meta.name.as_str()).unwrap();
119                    s.drop_positions(positions.clone())
120                }
121                DType::F32 => {
122                    let s: &mut Series<f32> = &mut self.get_column_mut(meta.name.as_str()).unwrap();
123                    s.drop_positions(positions.clone())
124                }
125                DType::I32 => {
126                    let s: &mut Series<i32> = &mut self.get_column_mut(meta.name.as_str()).unwrap();
127                    s.drop_positions(positions.clone())
128                }
129                DType::STRING => {
130                    let s: &mut Series<String> =
131                        &mut self.get_column_mut(meta.name.as_str()).unwrap();
132                    s.drop_positions(positions.clone())
133                }
134            };
135        }
136        self.index.drop_positions(positions);
137    }
138
139    /// Iterator over rows of a dataframe where each element contained is a reference
140    ///
141    /// ## Example
142    /// ```
143    /// # use blackjack::prelude::*;
144    /// # let mut df = DataFrame::new();
145    /// # let s1 = Series::from_vec(vec![0, 1, 2, 3]);
146    /// # let s2 = Series::from_vec(vec![1, 2, 3, 4]);
147    /// # assert!(df.add_column(s1).is_ok());
148    /// # assert!(df.add_column(s2).is_ok());
149    ///
150    /// let rows = df.iter_rows().collect::<Vec<Row>>();
151    /// assert_eq!(rows.len(), 4);  // Four rows
152    /// assert!(rows.iter().all(|r| r.data.len() == 2));  // Each row has two elements
153    /// ```
154    pub fn iter_rows(&self) -> impl Iterator<Item = Row<'_>> {
155        (0..self.len()).map(move |idx| {
156            let mut row = Row::new();
157            for meta in self.meta.iter() {
158                match meta.dtype {
159                    DType::F64 => {
160                        let series: &Series<f64> = self.data.get(&meta.name).unwrap();
161                        row.add(Element::new(meta.name.clone(), Datum::F64(&series[idx])))
162                    }
163                    DType::I64 => {
164                        let series: &Series<i64> = self.data.get(&meta.name).unwrap();
165                        row.add(Element::new(meta.name.clone(), Datum::I64(&series[idx])))
166                    }
167                    DType::F32 => {
168                        let series: &Series<f32> = self.data.get(&meta.name).unwrap();
169                        row.add(Element::new(meta.name.clone(), Datum::F32(&series[idx])))
170                    }
171                    DType::I32 => {
172                        let series: &Series<i32> = self.data.get(&meta.name).unwrap();
173                        row.add(Element::new(meta.name.clone(), Datum::I32(&series[idx])))
174                    }
175                    DType::STRING => {
176                        let series: &Series<String> = self.data.get(&meta.name).unwrap();
177                        row.add(Element::new(meta.name.clone(), Datum::STR(&series[idx])))
178                    }
179                }
180            }
181            row
182        })
183    }
184
185    /// Select rows of the DataFrame based on positional index
186    ///
187    /// ## Example
188    /// ```
189    /// use blackjack::prelude::*;
190    ///
191    /// let mut df = DataFrame::new();
192    ///  let s1 = Series::from_vec(vec![0, 1, 2, 3]);
193    ///  let s2 = Series::from_vec(vec![1, 2, 3, 4]);
194    ///
195    ///  assert!(df.add_column(s1).is_ok());
196    ///  assert!(df.add_column(s2).is_ok());
197    ///
198    ///  let rows = df.iloc(vec![1]).collect::<Vec<Row>>();
199    ///
200    ///  // First column is s1, second element is 1
201    ///  if let Datum::I32(val) = rows[0].data[0].data {
202    ///      assert_eq!(val, &1);
203    ///  }
204    ///
205    ///  // second column is s2, second element is 2
206    ///  if let Datum::I32(val) = rows[0].data[1].data {
207    ///      assert_eq!(val, &2);
208    ///  }
209    /// ```
210    pub fn iloc<Idx>(&self, idx: Idx) -> impl Iterator<Item = Row<'_>>
211    where
212        Idx: IntoIterator<Item = usize>,
213    {
214        let indexes = idx.into_iter().collect::<Vec<usize>>();
215
216        self.iter_rows()
217            .enumerate()
218            .filter(move |(idx, _row)| indexes.contains(&idx))
219            .map(|(_idx, row)| row)
220    }
221
222    /// Length of the dataframe
223    ///
224    /// ## Example
225    /// ```
226    /// use blackjack::prelude::*;
227    ///
228    /// let mut df = DataFrame::new();
229    /// assert_eq!(df.len(), 0);
230    ///
231    /// let series: Series<i32> = Series::arange(0, 10);
232    /// df.add_column(series).unwrap();
233    ///
234    /// assert_eq!(df.len(), 10);
235    /// ```
236    pub fn len(&self) -> usize {
237        self.index.len()
238    }
239
240    /// Quickly identify if the dataframe is empty.
241    pub fn is_empty(&self) -> bool {
242        !self.len() > 0
243    }
244
245    /// Add a column to this dataframe.
246    pub fn add_column<T: BlackJackData + 'static>(
247        &mut self,
248        series: Series<T>,
249    ) -> Result<(), BlackJackError>
250    where
251        Vec<I>: std::iter::FromIterator<i32>,
252    {
253        let mut series = series;
254
255        // Ensure length is a match if we have columns
256        if self.len() > 0 && self.len() != series.len() {
257            return Err(BlackJackError::LengthMismatch(format!(
258                "DataFrame has length: {}, cannot add series of length: {}",
259                self.len(),
260                series.len()
261            )));
262        } else {
263            self.index = Series::from_vec((0..series.len() as i32).collect::<Vec<I>>())
264        }
265
266        if let None = series.name() {
267            series.set_name(&format!("col_{}", self.n_columns()))
268        }
269
270        let meta = SeriesMeta::from(&series);
271        self.data.insert(meta.name.clone(), series);
272        self.meta.push(meta);
273
274        Ok(())
275    }
276
277    /// Retrieves a mutable reference to the column
278    pub fn get_column_mut<'a, T>(&mut self, name: impl Into<&'a str>) -> Option<&mut Series<T>>
279    where
280        T: BlackJackData + 'static,
281    {
282        let name = name.into();
283        for meta in &self.meta {
284            if meta.name == name {
285                let series: Option<&mut Series<T>> = self.data.get_mut(&meta.name);
286                return series;
287            }
288        }
289        None
290    }
291
292    /// Retrieves a reference to a column
293    pub fn get_column<'a, T>(&self, name: impl Into<&'a str>) -> Option<&Series<T>>
294    where
295        T: BlackJackData + 'static,
296    {
297        let name = name.into();
298        for meta in &self.meta {
299            if meta.name == name {
300                let series: Option<&Series<T>> = self.data.get(&meta.name);
301                return series;
302            }
303        }
304        None
305    }
306
307    /// Get column, infer
308    pub fn get_column_infer<'a>(&self, name: impl Into<&'a str>) -> Option<GenericSeriesContainer> {
309        let name = name.into();
310        if self.data.contains_key(name) {
311            let meta: &SeriesMeta = self.meta.iter().filter(|m| m.name == name).last()?;
312            let container = match meta.dtype {
313                DType::I64 => {
314                    GenericSeriesContainer::I64(self.data.get::<Series<i64>, _>(name)?.clone())
315                }
316                DType::F64 => {
317                    GenericSeriesContainer::F64(self.data.get::<Series<f64>, _>(name)?.clone())
318                }
319                DType::I32 => {
320                    GenericSeriesContainer::I32(self.data.get::<Series<i32>, _>(name)?.clone())
321                }
322                DType::F32 => {
323                    GenericSeriesContainer::F32(self.data.get::<Series<f32>, _>(name)?.clone())
324                }
325                DType::STRING => GenericSeriesContainer::STRING(
326                    self.data.get::<Series<String>, _>(name).unwrap().clone(),
327                ),
328            };
329            Some(container)
330        } else {
331            None
332        }
333    }
334
335    /// Get a list of column names in this dataframe as an iterator
336    pub fn columns(&self) -> impl Iterator<Item = &str> {
337        self.data.keys().map(|c| c.as_str())
338    }
339
340    /// Get the number of columns for this dataframe
341    pub fn n_columns(&self) -> usize {
342        self.data.len()
343    }
344
345    /// Group by method for grouping [`Series`] in a [`DataFrame`]
346    /// by key.
347    pub fn groupby<T>(&self, keys: &Series<T>) -> DataFrameGroupBy<T>
348    where
349        for<'de> T: BlackJackData + Deserialize<'de> + ToPrimitive + 'static,
350    {
351        let groups = self
352            .columns()
353            .map(|col_name| {
354                let series = self.get_column(col_name).unwrap();
355                series.groupby(keys)
356            })
357            .collect::<Vec<SeriesGroupBy<T>>>();
358
359        DataFrameGroupBy::new(groups)
360    }
361}