blackjack/dataframe/mod.rs
1//! DataFrame object and associated functionality
2//!
3//!
4
5use baggie::Baggie;
6use num::*;
7use serde::Deserialize;
8
9use crate::prelude::*;
10
11pub mod dataframe_groupby;
12pub mod io;
13pub use self::dataframe_groupby::*;
14pub use self::io::*;
15
16/// The container for `Series<T>` objects, allowing for additional functionality
17#[derive(Default, Debug)]
18pub struct DataFrame<I>
19where
20 I: PartialOrd + PartialEq + BlackJackData,
21{
22 index: Series<I>,
23 meta: Vec<SeriesMeta>,
24 data: Baggie<String>,
25}
26
27impl<I: PartialOrd + PartialEq + BlackJackData> DataFrame<I> {
28 /// Create a new `DataFrame` struct
29 ///
30 /// ## Example
31 /// ```
32 /// use blackjack::prelude::*;
33 ///
34 /// let mut df: DataFrame<i32> = DataFrame::new(); // `i32` indicates index type of DataFrame
35 /// ```
36 pub fn new() -> Self {
37 DataFrame {
38 index: Series::default(),
39 data: Baggie::new(),
40 meta: vec![],
41 }
42 }
43
44 /// Filter the dataframe by iterating over its `Row`s.
45 ///
46 /// ## Example
47 ///
48 /// ```
49 /// # use blackjack::prelude::*;
50 /// let mut s1 = Series::from(0..5);
51 /// s1.set_name("col1");
52 ///
53 /// let mut s2 = Series::from(10..15);
54 /// s2.set_name("col2");
55 ///
56 /// let mut s3 = Series::from_vec(vec![
57 /// "foo".to_string(),
58 /// "bar".to_string(),
59 /// "foo".to_string(),
60 /// "bar".to_string(),
61 /// "foo".to_string(),
62 /// ]);
63 /// s3.set_name("col3");
64 ///
65 /// let mut df = DataFrame::new();
66 /// assert!(df.add_column(s1).is_ok());
67 /// assert!(df.add_column(s2).is_ok());
68 /// assert!(df.add_column(s3).is_ok());
69 ///
70 /// // Before filtering, we're len 5
71 /// assert_eq!(df.len(), 5);
72 ///
73 /// df.filter_by_row(|row| row["col1"] == Datum::I32(&0));
74 ///
75 /// // After filtering, we're len 4 and first element of 'col1' is now 1
76 /// assert_eq!(df.len(), 4);
77 ///
78 /// // Filter by string foo,
79 /// df.filter_by_row(|row| row["col3"] != Datum::STR(&"foo".to_string()));
80 /// assert_eq!(df.len(), 2);
81 /// ```
82 pub fn filter_by_row<F>(&mut self, condition: F) -> ()
83 where
84 F: Fn(&Row<'_>) -> bool,
85 {
86 let positions_to_drop = self
87 .iter_rows()
88 .enumerate()
89 .filter(|(_idx, row)| condition(row))
90 .map(|(idx, _)| idx)
91 .collect::<Vec<usize>>();
92
93 self.drop_positions(positions_to_drop.into_iter())
94 }
95
96 /// Drop positions within the `Series`
97 ///
98 /// ## Example
99 /// ```
100 /// # use blackjack::prelude::*;
101 ///
102 /// let mut df = DataFrame::new();
103 /// assert!(df.add_column(Series::from(0..10)).is_ok());
104 ///
105 /// assert_eq!(df.len(), 10);
106 /// df.drop_positions(0..5); // Iterator of `usize` items
107 /// assert_eq!(df.len(), 5);
108 /// ```
109 pub fn drop_positions(&mut self, positions: impl Iterator<Item = usize>) -> () {
110 let positions = positions.into_iter().collect::<Vec<usize>>();
111 for meta in self.meta.clone() {
112 match meta.dtype {
113 DType::F64 => {
114 let s: &mut Series<f64> = &mut self.get_column_mut(meta.name.as_str()).unwrap();
115 s.drop_positions(positions.clone())
116 }
117 DType::I64 => {
118 let s: &mut Series<i64> = &mut self.get_column_mut(meta.name.as_str()).unwrap();
119 s.drop_positions(positions.clone())
120 }
121 DType::F32 => {
122 let s: &mut Series<f32> = &mut self.get_column_mut(meta.name.as_str()).unwrap();
123 s.drop_positions(positions.clone())
124 }
125 DType::I32 => {
126 let s: &mut Series<i32> = &mut self.get_column_mut(meta.name.as_str()).unwrap();
127 s.drop_positions(positions.clone())
128 }
129 DType::STRING => {
130 let s: &mut Series<String> =
131 &mut self.get_column_mut(meta.name.as_str()).unwrap();
132 s.drop_positions(positions.clone())
133 }
134 };
135 }
136 self.index.drop_positions(positions);
137 }
138
139 /// Iterator over rows of a dataframe where each element contained is a reference
140 ///
141 /// ## Example
142 /// ```
143 /// # use blackjack::prelude::*;
144 /// # let mut df = DataFrame::new();
145 /// # let s1 = Series::from_vec(vec![0, 1, 2, 3]);
146 /// # let s2 = Series::from_vec(vec![1, 2, 3, 4]);
147 /// # assert!(df.add_column(s1).is_ok());
148 /// # assert!(df.add_column(s2).is_ok());
149 ///
150 /// let rows = df.iter_rows().collect::<Vec<Row>>();
151 /// assert_eq!(rows.len(), 4); // Four rows
152 /// assert!(rows.iter().all(|r| r.data.len() == 2)); // Each row has two elements
153 /// ```
154 pub fn iter_rows(&self) -> impl Iterator<Item = Row<'_>> {
155 (0..self.len()).map(move |idx| {
156 let mut row = Row::new();
157 for meta in self.meta.iter() {
158 match meta.dtype {
159 DType::F64 => {
160 let series: &Series<f64> = self.data.get(&meta.name).unwrap();
161 row.add(Element::new(meta.name.clone(), Datum::F64(&series[idx])))
162 }
163 DType::I64 => {
164 let series: &Series<i64> = self.data.get(&meta.name).unwrap();
165 row.add(Element::new(meta.name.clone(), Datum::I64(&series[idx])))
166 }
167 DType::F32 => {
168 let series: &Series<f32> = self.data.get(&meta.name).unwrap();
169 row.add(Element::new(meta.name.clone(), Datum::F32(&series[idx])))
170 }
171 DType::I32 => {
172 let series: &Series<i32> = self.data.get(&meta.name).unwrap();
173 row.add(Element::new(meta.name.clone(), Datum::I32(&series[idx])))
174 }
175 DType::STRING => {
176 let series: &Series<String> = self.data.get(&meta.name).unwrap();
177 row.add(Element::new(meta.name.clone(), Datum::STR(&series[idx])))
178 }
179 }
180 }
181 row
182 })
183 }
184
185 /// Select rows of the DataFrame based on positional index
186 ///
187 /// ## Example
188 /// ```
189 /// use blackjack::prelude::*;
190 ///
191 /// let mut df = DataFrame::new();
192 /// let s1 = Series::from_vec(vec![0, 1, 2, 3]);
193 /// let s2 = Series::from_vec(vec![1, 2, 3, 4]);
194 ///
195 /// assert!(df.add_column(s1).is_ok());
196 /// assert!(df.add_column(s2).is_ok());
197 ///
198 /// let rows = df.iloc(vec![1]).collect::<Vec<Row>>();
199 ///
200 /// // First column is s1, second element is 1
201 /// if let Datum::I32(val) = rows[0].data[0].data {
202 /// assert_eq!(val, &1);
203 /// }
204 ///
205 /// // second column is s2, second element is 2
206 /// if let Datum::I32(val) = rows[0].data[1].data {
207 /// assert_eq!(val, &2);
208 /// }
209 /// ```
210 pub fn iloc<Idx>(&self, idx: Idx) -> impl Iterator<Item = Row<'_>>
211 where
212 Idx: IntoIterator<Item = usize>,
213 {
214 let indexes = idx.into_iter().collect::<Vec<usize>>();
215
216 self.iter_rows()
217 .enumerate()
218 .filter(move |(idx, _row)| indexes.contains(&idx))
219 .map(|(_idx, row)| row)
220 }
221
222 /// Length of the dataframe
223 ///
224 /// ## Example
225 /// ```
226 /// use blackjack::prelude::*;
227 ///
228 /// let mut df = DataFrame::new();
229 /// assert_eq!(df.len(), 0);
230 ///
231 /// let series: Series<i32> = Series::arange(0, 10);
232 /// df.add_column(series).unwrap();
233 ///
234 /// assert_eq!(df.len(), 10);
235 /// ```
236 pub fn len(&self) -> usize {
237 self.index.len()
238 }
239
240 /// Quickly identify if the dataframe is empty.
241 pub fn is_empty(&self) -> bool {
242 !self.len() > 0
243 }
244
245 /// Add a column to this dataframe.
246 pub fn add_column<T: BlackJackData + 'static>(
247 &mut self,
248 series: Series<T>,
249 ) -> Result<(), BlackJackError>
250 where
251 Vec<I>: std::iter::FromIterator<i32>,
252 {
253 let mut series = series;
254
255 // Ensure length is a match if we have columns
256 if self.len() > 0 && self.len() != series.len() {
257 return Err(BlackJackError::LengthMismatch(format!(
258 "DataFrame has length: {}, cannot add series of length: {}",
259 self.len(),
260 series.len()
261 )));
262 } else {
263 self.index = Series::from_vec((0..series.len() as i32).collect::<Vec<I>>())
264 }
265
266 if let None = series.name() {
267 series.set_name(&format!("col_{}", self.n_columns()))
268 }
269
270 let meta = SeriesMeta::from(&series);
271 self.data.insert(meta.name.clone(), series);
272 self.meta.push(meta);
273
274 Ok(())
275 }
276
277 /// Retrieves a mutable reference to the column
278 pub fn get_column_mut<'a, T>(&mut self, name: impl Into<&'a str>) -> Option<&mut Series<T>>
279 where
280 T: BlackJackData + 'static,
281 {
282 let name = name.into();
283 for meta in &self.meta {
284 if meta.name == name {
285 let series: Option<&mut Series<T>> = self.data.get_mut(&meta.name);
286 return series;
287 }
288 }
289 None
290 }
291
292 /// Retrieves a reference to a column
293 pub fn get_column<'a, T>(&self, name: impl Into<&'a str>) -> Option<&Series<T>>
294 where
295 T: BlackJackData + 'static,
296 {
297 let name = name.into();
298 for meta in &self.meta {
299 if meta.name == name {
300 let series: Option<&Series<T>> = self.data.get(&meta.name);
301 return series;
302 }
303 }
304 None
305 }
306
307 /// Get column, infer
308 pub fn get_column_infer<'a>(&self, name: impl Into<&'a str>) -> Option<GenericSeriesContainer> {
309 let name = name.into();
310 if self.data.contains_key(name) {
311 let meta: &SeriesMeta = self.meta.iter().filter(|m| m.name == name).last()?;
312 let container = match meta.dtype {
313 DType::I64 => {
314 GenericSeriesContainer::I64(self.data.get::<Series<i64>, _>(name)?.clone())
315 }
316 DType::F64 => {
317 GenericSeriesContainer::F64(self.data.get::<Series<f64>, _>(name)?.clone())
318 }
319 DType::I32 => {
320 GenericSeriesContainer::I32(self.data.get::<Series<i32>, _>(name)?.clone())
321 }
322 DType::F32 => {
323 GenericSeriesContainer::F32(self.data.get::<Series<f32>, _>(name)?.clone())
324 }
325 DType::STRING => GenericSeriesContainer::STRING(
326 self.data.get::<Series<String>, _>(name).unwrap().clone(),
327 ),
328 };
329 Some(container)
330 } else {
331 None
332 }
333 }
334
335 /// Get a list of column names in this dataframe as an iterator
336 pub fn columns(&self) -> impl Iterator<Item = &str> {
337 self.data.keys().map(|c| c.as_str())
338 }
339
340 /// Get the number of columns for this dataframe
341 pub fn n_columns(&self) -> usize {
342 self.data.len()
343 }
344
345 /// Group by method for grouping [`Series`] in a [`DataFrame`]
346 /// by key.
347 pub fn groupby<T>(&self, keys: &Series<T>) -> DataFrameGroupBy<T>
348 where
349 for<'de> T: BlackJackData + Deserialize<'de> + ToPrimitive + 'static,
350 {
351 let groups = self
352 .columns()
353 .map(|col_name| {
354 let series = self.get_column(col_name).unwrap();
355 series.groupby(keys)
356 })
357 .collect::<Vec<SeriesGroupBy<T>>>();
358
359 DataFrameGroupBy::new(groups)
360 }
361}