tubular/dataframe.rs
1use std::fmt::{self, Display};
2use std::collections::HashMap;
3use std::ops::Index;
4
5#[cfg(feature = "serde")]
6use serde::{Serialize, Deserialize};
7
8use crate::{
9 Column,
10 Rows,
11 fmt::left_pad,
12};
13
/// [`Column`](enum.Column.html) id used to look up and describe columns in a [`DataFrame`](struct.DataFrame.html)
15pub type Header = String;
16
17/// A 2D matrix of cells of mixed types useful for exploratory data analysis.
18///
19/// ## Basic Concept
20///
21/// A `DataFrame` consists of 0 or more [`Column`]s in a specific order. The DataFrame
22/// keeps track of a [`Header`] associated with each of its columns. The `Header` can
/// be used to display the whole `DataFrame` or it can be used to look up a single `Column`.
24///
25/// [`Row`]s can be constructed by gathering up one cell from all columns at the same
26/// position. Since `Row`s are composed of many possibly different types, in the normal case,
/// each cell in the row is of a different type which can't be known until runtime.
28/// Tubular tries to make this as ergonomic as possible, but working with Rows will always
/// be a bit more ceremonious than working with Columns.
30///
31/// Here's how to think about the structure of a `DataFrame`:
32///
/// ```text
34/// DataFrame
35///
36/// + - - - -+- - - - + - - - -+
37/// | Header | Header | Header |
38/// + - - - -+- - - - + - - - -+
39/// | Column | Column | Column |
40/// | bool | u32 | String |
41/// | ------ | ------ | ------ |
42/// | |cell| | |cell| | |cell| |
43/// | ------ | ------ | ------ |
44/// | |cell| | |cell| | |cell| |
45/// | ------ | ------ | ------ |
46/// | ================================
47/// | |cell| |cell| |cell | Row |
48/// | |bool| |u32 | |String| |
49/// | ================================
50/// | ------ | ------ | ------ |
51/// | |cell| | |cell| | |cell| |
52/// | ------ | ------ | ------ |
53/// + - - - -+- - - - + - - - -+
54///
55/// ```
56///
57/// ## Constructing a DataFrame
58///
59/// `DataFrame` implements `Default`, which is generally the easiest way to
60/// create one from scratch:
61///
62/// ```
63/// use tubular::DataFrame;
64///
65/// let df = DataFrame::default();
66/// ```
67///
68/// To add columns, use [`push()`](#method.push):
69///
70/// ```
71/// # use tubular::DataFrame;
72/// # let mut df = DataFrame::default();
73/// df.push("Fruits", &["apple", "banana", "pear"]);
74/// ```
75///
76/// Although you could try using Serde to load a DataFrame, the current implementation
77/// is very basic and not likely to work as you expect.
78///
79/// ## Exploring
80///
81/// The easiest way to figure out what's in a `DataFrame` is to print it out:
82///
83/// ```
84/// # use tubular::DataFrame;
85/// # let mut df = DataFrame::default();
86/// # df.push("Fruits", &["apple", "banana", "pear"]);
87/// # df.push("Organic", &[true, false, true]);
88/// # df.push("Quantity", &[16, 30, 10]);
89/// println!("{}", &df);
90/// ```
91///
92/// The result will be a table looking something like this:
93///
/// ```text
95/// Fruits Organic Quantity
96/// apple true 16
97/// banana false 30
98/// pear true 10
99/// ```
100///
101/// ## Iteration
102///
103/// `DataFrame`s can be used in `for` loops to iterate one column at a time:
104///
105/// ```
106/// # use tubular::DataFrame;
107/// # let mut df = DataFrame::default();
108/// # df.push("Fruits", &["apple", "banana", "pear"]);
109/// # df.push("Organic", &[true, false, true]);
110/// # df.push("Quantity", &[16, 30, 10]);
111/// for column in &df {
112/// println!("{:?}", column);
113/// }
114/// ```
115///
116/// [`Column`]: enum.Column.html
117/// [`Header`]: type.Header.html
118/// [`Row`]: struct.Row.html
119#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
120#[derive(Default, Debug, Clone, PartialEq)]
121pub struct DataFrame {
122 columns: HashMap<Header, Column>,
123 order: Vec<Header>,
124}
125
126impl DataFrame {
127 /// Adds a new `Column` to the end of the `DataFrame`.
128 ///
129 /// Any iterator over items that implement [`ColumnType`](trait.ColumnType.html)
130 /// can be passed as argument for the new column:
131 ///
132 /// ```
133 /// use tubular::DataFrame;
134 /// use std::sync::mpsc::channel;
135 /// use std::thread;
136 ///
137 /// let mut df = DataFrame::default();
138 ///
139 /// // Add normal "list"-like iterators to a DataFrame
140 /// df.push("Fruits", &["apple", "banana", "pear"]);
141 /// df.push("Organic", vec![true, false, true]);
142 /// df.push("Quantity", [16, 30, 10].iter());
143 ///
144 /// // Or other less obvious sequences
145 /// df.push("Sku", 1..4);
146 /// df.push("Log Lines", "192.12.78.1 - 200\n25.31.197.245 - 200\n78.95.83.123 - 304".lines());
147 /// df.push("Words", "abc1def2ghi".split(char::is_numeric));
148 ///
149 /// // ...Or real crazy iterators
150 /// let (sender, recv) = channel();
151 /// thread::spawn(move || {
152 /// sender.send(10.3).unwrap();
153 /// sender.send(97.2).unwrap();
154 /// sender.send(-15.3).unwrap();
155 /// });
156 /// df.push("Temperatures", recv);
157 /// ```
158 pub fn push(&mut self, header: impl Into<Header>, column: impl Into<Column>) {
159 let header = header.into();
160 let column = column.into();
161 self.order.push(header.clone());
162 self.columns.insert(header, column);
163 }
164
165 /// Returns the number of `Column`s in the `DataFrame`
166 ///
167 /// ```
168 /// use tubular::DataFrame;
169 /// let mut df = DataFrame::default();
170 /// df.push("Words", "abc1def2ghi".split(char::is_numeric));
171 /// assert_eq!(df.len(), 1);
172 /// ```
173 pub fn len(&self) -> usize {
174 self.order.len()
175 }
176
177 /// Provides all the headers
178 ///
179 /// ```
180 /// use tubular::DataFrame;
181 /// let mut df = DataFrame::default();
182 /// df.push("Fruits", &["apple", "banana", "pear"]);
183 /// df.push("Organic", &[true, false, true]);
184 /// df.push("Quantity", &[16, 30, 10]);
185 /// assert_eq!(df.headers(), &vec![
186 /// "Fruits".to_string(),
187 /// "Organic".to_string(),
188 /// "Quantity".to_string()
189 /// ]);
190 /// ```
191 pub fn headers(&self) -> &Vec<Header> {
192 &self.order
193 }
194
195 /// Returns the number of rows in the `DataFrame`.
196 ///
197 /// NOTE: This method is unstable is likely to be removed or changed semantically
198 /// in the near future.
199 pub fn row_len(&self) -> usize {
200 if self.columns.len() == 0 {
201 return 0;
202 }
203 self[0].len()
204 }
205
206 /// Allows iteration over [`Row`](struct.Row.html) objects
207 ///
208 /// ```
209 /// use tubular::DataFrame;
210 /// let mut df = DataFrame::default();
211 /// df.push("Fruits", &["apple"]);
212 /// df.push("Organic", &[true]);
213 /// df.push("Quantity", &[16]);
214 /// for row in df.rows() {
215 /// assert_eq!(row.column_name::<String>("Fruits"), "apple");
216 /// assert_eq!(row.column_name::<bool>("Organic"), &true);
217 /// assert_eq!(row.column_name::<i32>("Quantity"), &16);
218 /// }
219 pub fn rows(&self) -> Rows {
220 From::from(self)
221 }
222}
223
224impl Index<&'static str> for DataFrame {
225 type Output = Column;
226
227 fn index(&self, index: &'static str) -> &Self::Output {
228 &self.columns[index]
229 }
230}
231
232impl Index<String> for DataFrame {
233 type Output = Column;
234
235 fn index(&self, index: String) -> &Self::Output {
236 &self.columns[&index]
237 }
238}
239
240impl Index<usize> for DataFrame {
241 type Output = Column;
242
243 fn index(&self, index: usize) -> &Self::Output {
244 &self.columns[&self.order[index]]
245 }
246}
247
248impl Display for DataFrame {
249 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
250 let headers = self.headers();
251 let table_height = self.row_len() + 1;
252 let mut lines: Vec<String> = std::iter::repeat(String::new()).take(table_height).collect();
253 for (header, column) in headers.iter().zip(self) {
254 let width = column.display_width().max(header.len()) + 1;
255 let strings: Vec<String> = (&column).into();
256 lines[0] += &left_pad(header, width);
257 for (index, cell) in strings.iter().enumerate() {
258 lines[index + 1] += &left_pad(cell, width);
259 }
260 }
261 write!(f, "{}", lines.join("\n"))
262 }
263}
264
265impl<'d> IntoIterator for &'d DataFrame {
266 type IntoIter = IntoIter<'d>;
267 type Item = Column;
268
269 fn into_iter(self) -> IntoIter<'d> {
270 IntoIter {
271 df: self,
272 index: 0,
273 }
274 }
275}
276
277pub struct IntoIter<'d> {
278 index: usize,
279 df: &'d DataFrame,
280}
281
282impl<'d> Iterator for IntoIter<'d> {
283 type Item = Column;
284
285 fn next(&mut self) -> Option<Column> {
286 if self.index >= self.df.len() {
287 return None;
288 }
289
290 let index = self.index;
291 self.index += 1;
292 Some(self.df[index].clone())
293 }
294}