example_data/
lib.rs

1//! example-data is created to easily load common datasets.
2//!
3//! We use the [Apache Arrow](https://docs.rs/arrow/3.0.0/arrow/index.html) memory format
4//! which allows for simple conversion to multiple dataframe implementations.
5//!
6//! ```rust
7//! use example_data::{Repo};
8//! use arrow::record_batch::RecordBatch;
9//!
10//! let iris = Repo::default().load_table("iris").unwrap();
11//! let batches : Vec<RecordBatch> = iris.data();
12//! let doc : &str = iris.doc().unwrap();
13//! ```
14//!
15//! ### Supported datatables
16//!
17//! - iris
18//! - boston
19//!
20//!
21extern crate arrow;
22
23mod datatables;
24
25use arrow::datatypes::SchemaRef;
26use arrow::record_batch::RecordBatch;
27
28// This structure is currently not public
29// The idea is to expose it in the future
30#[derive(Clone, Debug)]
31pub struct DataSet {
32    tables: Vec<DataTable>,
33    doc: Option<String>,
34    name: String,
35}
36
37impl DataSet {
38    pub fn new(name: String, tables: Vec<DataTable>, doc: Option<String>) -> Self {
39        Self { tables, name, doc }
40    }
41
42    pub fn name(&self) -> &str {
43        &self.name
44    }
45
46    pub fn doc(&self) -> Option<&str> {
47        self.doc.as_deref()
48    }
49
50    pub fn tables(&self) -> &[DataTable] {
51        &self.tables
52    }
53}
54
55/// A table with data
56///
57/// This corresponds to a DataFrame
58#[derive(Clone, Debug)]
59pub struct DataTable {
60    batches: Vec<RecordBatch>,
61    doc: Option<String>,
62    name: String,
63}
64
65impl DataTable {
66    /// The content of the DataTable
67    ///
68    /// It is guarnateed that all batches have
69    /// exactly the same [`Schema`](arrow::datatypes::Schema)
70    pub fn data(&self) -> Vec<RecordBatch> {
71        // Note, that the cloning is not ridiculously expensive.
72        //
73        // we clone RecordBatches here
74        // The data in all recordbatches is stored as an ArrayRef
75        // We are just adding the counters in the Arc
76        self.batches.iter().map(|x| x.clone()).collect()
77    }
78
79    /// The content of the DataTable
80    ///
81    /// It is guarnateed that all batches have
82    /// exactly the same [`Schema`](arrow::datatypes::Schema)
83    pub fn data_ref(&self) -> &[RecordBatch] {
84        &self.batches
85    }
86
87    /// The name of the DataTable
88    pub fn name(&self) -> &str {
89        &self.name
90    }
91
92    /// The number of rows in the DataTable
93    pub fn num_rows(&self) -> usize {
94        self.batches.iter().map(|x| x.num_rows()).sum()
95    }
96
97    /// The number of columns in the DataTable
98    pub fn num_columns(&self) -> usize {
99        self.batches[0].num_columns()
100    }
101
102    /// The schema of the DataTable
103    pub fn schema(&self) -> SchemaRef {
104        self.batches[0].schema()
105    }
106
107    /// The documentation of the DataTable
108    pub fn doc(&self) -> Option<&str> {
109        self.doc.as_deref()
110    }
111}
112
113struct DataTableBuilder {
114    batches: Option<Vec<RecordBatch>>,
115    doc: Option<String>,
116    name: Option<String>,
117}
118
119impl DataTableBuilder {
120    fn new() -> Self {
121        DataTableBuilder {
122            batches: None,
123            doc: None,
124            name: None,
125        }
126    }
127
128    fn with_name(mut self, name: String) -> Self {
129        self.name = Some(name);
130        self
131    }
132
133    fn with_doc(mut self, doc: String) -> Self {
134        self.doc = Some(doc);
135        self
136    }
137
138    fn with_batches(mut self, batches: Vec<RecordBatch>) -> Self {
139        self.batches = Some(batches);
140        self
141    }
142
143    fn build(self) -> Result<DataTable, String> {
144        let batches = self
145            .batches
146            .ok_or_else(|| String::from("Cannot create DataTable without data/batches"))?;
147        let name = self
148            .name
149            .ok_or_else(|| String::from("Cannot create DataTable without a name."))?;
150
151        let table = DataTable {
152            name,
153            batches,
154            doc: self.doc,
155        };
156
157        Ok(table)
158    }
159}
160
161/// Repo is a collection of [`DataTable`](DataTable)s
162pub trait Repo {
163    /// Loads the [`DataTable`](DataTable) with matching name
164    fn load_table(&self, name: &str) -> Result<DataTable, String>;
165
166    /// Loads the [`DataTable`](DataSet) with matching name
167    fn load_data_set(&self, name: &str) -> Result<DataSet, String>;
168}
169
170impl dyn Repo {
171    /// Gets the default repository
172    ///
173    /// Currently, this is the only supported repository.
174    /// In the current set-up all data-tables are included in the
175    /// binary.
176    ///
177    /// This means that no network connection is required to connect to the Repo
178    pub fn default() -> impl Repo {
179        DefaultRepo {}
180    }
181}
182
183struct DefaultRepo {}
184
185impl Repo for DefaultRepo {
186    /// Loads the [`DataTable`](DataTable) with corresponding name
187    fn load_table(&self, name: &str) -> Result<DataTable, String> {
188        match name {
189            "iris" => crate::datatables::iris::load_table(),
190            "boston" => crate::datatables::boston::load_table(),
191            _ => Err(format!("{} could not be found in default-repository", name)),
192        }
193    }
194
195    /// Loads the [`DataSet`](DataSet) with corresponding name
196    fn load_data_set(&self, name: &str) -> Result<DataSet, String> {
197        Err(format!("Failed to find dataset {:}", name))
198    }
199}
200
#[cfg(test)]
mod tests {

    use super::*;

    use arrow::datatypes::DataType;

    /// Iris must load with the expected shape, column names and column types.
    #[test]
    fn test_can_load_iris() {
        // `<dyn Repo>::` names the trait-object type explicitly; the bare
        // `Repo::default()` form is deprecated trait-object syntax.
        let repo = <dyn Repo>::default();
        let table: DataTable = repo.load_table("iris").unwrap();

        assert_eq!(
            table.num_rows(),
            150,
            "Iris is supposed to have 150 observations"
        );
        assert_eq!(
            table.num_columns(),
            5,
            "Iris is supposed to have 5 features"
        );

        // Fetch the schema once instead of once per assertion
        let schema = table.schema();
        let expected_fields = [
            ("sepal_length", DataType::Float64),
            ("sepal_width", DataType::Float64),
            ("petal_length", DataType::Float64),
            ("petal_width", DataType::Float64),
            ("variety", DataType::Utf8),
        ];

        for (index, (name, data_type)) in expected_fields.iter().enumerate() {
            let field = schema.field(index);

            // Checking field-names
            assert_eq!(field.name(), *name, "unexpected name for field {}", index);

            // Checking field data-types
            assert_eq!(
                field.data_type(),
                data_type,
                "unexpected data type for field {}",
                index
            );
        }
    }

    /// Boston housing must load with the expected shape.
    #[test]
    fn test_can_load_boston_housing() {
        let repo = <dyn Repo>::default();
        let table: DataTable = repo.load_table("boston").unwrap();

        assert_eq!(table.num_rows(), 506);
        assert_eq!(table.num_columns(), 14);
    }
}