tubular/
dataframe.rs

1use std::fmt::{self, Display};
2use std::collections::HashMap;
3use std::ops::Index;
4
5#[cfg(feature = "serde")]
6use serde::{Serialize, Deserialize};
7
8use crate::{
9    Column,
10    Rows,
11    fmt::left_pad,
12};
13
/// [`Column`](enum.Column.html) id used to look up and describe columns in a [`DataFrame`](struct.DataFrame.html)
15pub type Header = String;
16
17/// A 2D matrix of cells of mixed types useful for exploratory data analysis.
18///
19/// ## Basic Concept
20///
21/// A `DataFrame` consists of 0 or more [`Column`]s in a specific order. The DataFrame
22/// keeps track of a [`Header`] associated with each of its columns. The `Header` can
23/// be used to display the whole `DataFrame` or it can be used to lookup a single `Column`.
24///
/// [`Row`]s can be constructed by gathering up one cell from all columns at the same
/// position. Since `Row`s are composed of many possibly different types, in the normal case,
/// each cell in the row is of a different type which can't be known until runtime.
/// Tubular tries to make this as ergonomic as possible, but working with Rows will always
/// be a bit more ceremonious than working with Columns.
30///
31/// Here's how to think about the structure of a `DataFrame`:
32///
/// ```text
34/// DataFrame
35///
36///  + - - - -+- - - - + - - - -+
37///  | Header | Header | Header |
38///  + - - - -+- - - - + - - - -+
39///  | Column | Column | Column |
40///  |  bool  |  u32   | String |
41///  | ------ | ------ | ------ |
42///  | |cell| | |cell| | |cell| |
43///  | ------ | ------ | ------ |
44///  | |cell| | |cell| | |cell| |
45///  | ------ | ------ | ------ |
46///  | ================================
47///  | |cell|   |cell|   |cell  | Row |
48///  | |bool|   |u32 |   |String|     |
49///  | ================================
50///  | ------ | ------ | ------ |
51///  | |cell| | |cell| | |cell| |
52///  | ------ | ------ | ------ |
53///  + - - - -+- - - - + - - - -+
54///
55/// ```
56///
57/// ## Constructing a DataFrame
58///
59/// `DataFrame` implements `Default`, which is generally the easiest way to
60/// create one from scratch:
61///
62/// ```
63/// use tubular::DataFrame;
64///
65/// let df = DataFrame::default();
66/// ```
67///
68/// To add columns, use [`push()`](#method.push):
69///
70/// ```
71/// # use tubular::DataFrame;
72/// # let mut df = DataFrame::default();
73/// df.push("Fruits", &["apple", "banana", "pear"]);
74/// ```
75///
76/// Although you could try using Serde to load a DataFrame, the current implementation
77/// is very basic and not likely to work as you expect.
78///
79/// ## Exploring
80///
81/// The easiest way to figure out what's in a `DataFrame` is to print it out:
82///
83/// ```
84/// # use tubular::DataFrame;
85/// # let mut df = DataFrame::default();
86/// # df.push("Fruits", &["apple", "banana", "pear"]);
87/// # df.push("Organic", &[true, false, true]);
88/// # df.push("Quantity", &[16, 30, 10]);
89/// println!("{}", &df);
90/// ```
91///
92/// The result will be a table looking something like this:
93///
/// ```text
95/// Fruits Organic Quantity
96///  apple    true       16
97/// banana   false       30
98///   pear    true       10
99/// ```
100///
101/// ## Iteration
102///
103/// `DataFrame`s can be used in `for` loops to iterate one column at a time:
104///
105/// ```
106/// # use tubular::DataFrame;
107/// # let mut df = DataFrame::default();
108/// # df.push("Fruits", &["apple", "banana", "pear"]);
109/// # df.push("Organic", &[true, false, true]);
110/// # df.push("Quantity", &[16, 30, 10]);
111/// for column in &df {
112///     println!("{:?}", column);
113/// }
114/// ```
115///
116/// [`Column`]: enum.Column.html
117/// [`Header`]: type.Header.html
118/// [`Row`]: struct.Row.html
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[derive(Default, Debug, Clone, PartialEq)]
pub struct DataFrame {
    // Column storage keyed by header; the map itself carries no ordering.
    columns: HashMap<Header, Column>,
    // Headers in the order they were pushed; positional indexing and
    // iteration resolve a position to a header through this list.
    order: Vec<Header>,
}
125
126impl DataFrame {
127    /// Adds a new `Column` to the end of the `DataFrame`.
128    ///
129    /// Any iterator over items that implement [`ColumnType`](trait.ColumnType.html)
130    /// can be passed as argument for the new column:
131    ///
132    /// ```
133    /// use tubular::DataFrame;
134    /// use std::sync::mpsc::channel;
135    /// use std::thread;
136    ///
137    /// let mut df = DataFrame::default();
138    ///
139    /// // Add normal "list"-like iterators to a DataFrame
140    /// df.push("Fruits", &["apple", "banana", "pear"]);
141    /// df.push("Organic", vec![true, false, true]);
142    /// df.push("Quantity", [16, 30, 10].iter());
143    ///
144    /// // Or other less obvious sequences
145    /// df.push("Sku", 1..4);
146    /// df.push("Log Lines", "192.12.78.1 - 200\n25.31.197.245 - 200\n78.95.83.123 - 304".lines());
147    /// df.push("Words", "abc1def2ghi".split(char::is_numeric));
148    ///
149    /// // ...Or real crazy iterators
150    /// let (sender, recv) = channel();
151    /// thread::spawn(move || {
152    ///     sender.send(10.3).unwrap();
153    ///     sender.send(97.2).unwrap();
154    ///     sender.send(-15.3).unwrap();
155    /// });
156    /// df.push("Temperatures", recv);
157    /// ```
158    pub fn push(&mut self, header: impl Into<Header>, column: impl Into<Column>) {
159        let header = header.into();
160        let column = column.into();
161        self.order.push(header.clone());
162        self.columns.insert(header, column);
163    }
164
165    /// Returns the number of `Column`s in the `DataFrame`
166    ///
167    /// ```
168    /// use tubular::DataFrame;
169    /// let mut df = DataFrame::default();
170    /// df.push("Words", "abc1def2ghi".split(char::is_numeric));
171    /// assert_eq!(df.len(), 1);
172    /// ```
173    pub fn len(&self) -> usize {
174        self.order.len()
175    }
176
177    /// Provides all the headers
178    ///
179    /// ```
180    /// use tubular::DataFrame;
181    /// let mut df = DataFrame::default();
182    /// df.push("Fruits", &["apple", "banana", "pear"]);
183    /// df.push("Organic", &[true, false, true]);
184    /// df.push("Quantity", &[16, 30, 10]);
185    /// assert_eq!(df.headers(), &vec![
186    ///     "Fruits".to_string(),
187    ///     "Organic".to_string(),
188    ///     "Quantity".to_string()
189    /// ]);
190    /// ```
191    pub fn headers(&self) -> &Vec<Header> {
192        &self.order
193    }
194
195    /// Returns the number of rows in the `DataFrame`.
196    ///
197    /// NOTE: This method is unstable is likely to be removed or changed semantically
198    /// in the near future.
199    pub fn row_len(&self) -> usize {
200        if self.columns.len() == 0 {
201            return 0;
202        }
203        self[0].len()
204    }
205
206    /// Allows iteration over [`Row`](struct.Row.html) objects
207    ///
208    /// ```
209    /// use tubular::DataFrame;
210    /// let mut df = DataFrame::default();
211    /// df.push("Fruits", &["apple"]);
212    /// df.push("Organic", &[true]);
213    /// df.push("Quantity", &[16]);
214    /// for row in df.rows() {
215    ///     assert_eq!(row.column_name::<String>("Fruits"), "apple");
216    ///     assert_eq!(row.column_name::<bool>("Organic"), &true);
217    ///     assert_eq!(row.column_name::<i32>("Quantity"), &16);
218    /// }
219    pub fn rows(&self) -> Rows {
220        From::from(self)
221    }
222}
223
224impl Index<&'static str> for DataFrame {
225    type Output = Column;
226
227    fn index(&self, index: &'static str) -> &Self::Output {
228        &self.columns[index]
229    }
230}
231
232impl Index<String> for DataFrame {
233    type Output = Column;
234
235    fn index(&self, index: String) -> &Self::Output {
236        &self.columns[&index]
237    }
238}
239
240impl Index<usize> for DataFrame {
241    type Output = Column;
242
243    fn index(&self, index: usize) -> &Self::Output {
244        &self.columns[&self.order[index]]
245    }
246}
247
248impl Display for DataFrame {
249    fn fmt(&self,  f: &mut fmt::Formatter) -> fmt::Result {
250        let headers = self.headers();
251        let table_height = self.row_len() + 1;
252        let mut lines: Vec<String> = std::iter::repeat(String::new()).take(table_height).collect();
253        for (header, column) in headers.iter().zip(self) {
254            let width = column.display_width().max(header.len()) + 1;
255            let strings: Vec<String> = (&column).into();
256            lines[0] += &left_pad(header, width);
257            for (index, cell) in strings.iter().enumerate() {
258                lines[index + 1] += &left_pad(cell, width);
259            }
260        }
261        write!(f, "{}", lines.join("\n"))
262    }
263}
264
265impl<'d> IntoIterator for &'d DataFrame {
266    type IntoIter = IntoIter<'d>;
267    type Item = Column;
268
269    fn into_iter(self) -> IntoIter<'d> {
270        IntoIter {
271            df: self,
272            index: 0,
273        }
274    }
275}
276
/// Iterator over the columns of a borrowed [`DataFrame`](struct.DataFrame.html).
///
/// Produced by calling `into_iter()` on a `&DataFrame`, which is what a
/// `for` loop over `&df` does.
pub struct IntoIter<'d> {
    // Position of the next column to yield.
    index: usize,
    // The frame being iterated.
    df: &'d DataFrame,
}
281
282impl<'d> Iterator for IntoIter<'d> {
283    type Item = Column;
284
285    fn next(&mut self) -> Option<Column> {
286        if self.index >= self.df.len() {
287            return None;
288        }
289
290        let index = self.index;
291        self.index += 1;
292        Some(self.df[index].clone())
293    }
294}