1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
//! example-data is created to easily load common datasets.
//!
//! We use the [Apache Arrow](https://docs.rs/arrow/3.0.0/arrow/index.html) memory format
//! which allows for simple conversion to multiple dataframe implementations.
//!
//! ```rust
//! use example_data::{Repo};
//! use arrow::record_batch::RecordBatch;
//!
//! let iris = Repo::default().load_table("iris").unwrap();
//! let batches : Vec<RecordBatch> = iris.data();
//! let doc : &str = iris.doc().unwrap();
//! ```
//!
//! ### Supported datatables
//!
//! - iris
//! - boston
//!
//!
extern crate arrow;

mod datatables;

use arrow::datatypes::SchemaRef;
use arrow::record_batch::RecordBatch;

/// A named collection of [`DataTable`](DataTable)s with optional documentation.
///
/// NOTE(review): an earlier comment here said this structure "is currently
/// not public", but it is declared `pub` — confirm the intended visibility.
#[derive(Clone, Debug)]
pub struct DataSet {
    tables: Vec<DataTable>, // the tables that make up this data-set
    doc: Option<String>,    // human-readable documentation, if any
    name: String,           // identifier used to look the data-set up
}

impl DataSet {
    /// Creates a new data-set from its name, tables, and optional documentation.
    pub fn new(name: String, tables: Vec<DataTable>, doc: Option<String>) -> Self {
        Self { tables, doc, name }
    }

    /// The name of this data-set.
    pub fn name(&self) -> &str {
        self.name.as_str()
    }

    /// The documentation of this data-set, if any was provided.
    pub fn doc(&self) -> Option<&str> {
        self.doc.as_ref().map(String::as_str)
    }

    /// The tables contained in this data-set.
    pub fn tables(&self) -> &[DataTable] {
        self.tables.as_slice()
    }
}

/// A table with data
///
/// This corresponds to a DataFrame
#[derive(Clone, Debug)]
pub struct DataTable {
    batches: Vec<RecordBatch>, // the table content; all batches share one schema
    doc: Option<String>,       // human-readable documentation, if any
    name: String,              // identifier used to look the table up
}

impl DataTable {
    /// The content of the DataTable
    ///
    /// It is guaranteed that all batches have
    /// exactly the same [`Schema`](arrow::datatypes::Schema)
    pub fn data(&self) -> Vec<RecordBatch> {
        // Cloning is not ridiculously expensive: the data in every
        // RecordBatch is stored behind ArrayRefs (Arc-backed), so a clone
        // only bumps reference counts rather than copying the arrays.
        // `clone()` on the Vec replaces the former
        // `iter().map(|x| x.clone()).collect()` (clippy: map_clone).
        self.batches.clone()
    }

    /// The content of the DataTable
    ///
    /// It is guaranteed that all batches have
    /// exactly the same [`Schema`](arrow::datatypes::Schema)
    pub fn data_ref(&self) -> &[RecordBatch] {
        &self.batches
    }

    /// The name of the DataTable
    pub fn name(&self) -> &str {
        &self.name
    }

    /// The number of rows in the DataTable
    ///
    /// Sums the rows over all record batches; an empty table yields 0.
    pub fn num_rows(&self) -> usize {
        self.batches.iter().map(|x| x.num_rows()).sum()
    }

    /// The number of columns in the DataTable
    ///
    /// # Panics
    ///
    /// Panics if the table contains no record batches.
    pub fn num_columns(&self) -> usize {
        self.batches[0].num_columns()
    }

    /// The schema of the DataTable
    ///
    /// # Panics
    ///
    /// Panics if the table contains no record batches.
    pub fn schema(&self) -> SchemaRef {
        self.batches[0].schema()
    }

    /// The documentation of the DataTable
    pub fn doc(&self) -> Option<&str> {
        self.doc.as_deref()
    }
}

/// Incrementally assembles a [`DataTable`](DataTable).
///
/// `name` and `batches` are required by `build`; `doc` is optional.
/// Presumably consumed by the `datatables` module — verify against callers.
struct DataTableBuilder {
    batches: Option<Vec<RecordBatch>>, // required: the table content
    doc: Option<String>,               // optional documentation
    name: Option<String>,              // required: the table name
}

impl DataTableBuilder {
    /// Creates an empty builder; `name` and `batches` must be supplied
    /// before [`build`](Self::build) can succeed.
    fn new() -> Self {
        Self {
            name: None,
            doc: None,
            batches: None,
        }
    }

    /// Sets the (required) table name.
    fn with_name(mut self, name: String) -> Self {
        self.name = Some(name);
        self
    }

    /// Attaches optional documentation to the table.
    fn with_doc(mut self, doc: String) -> Self {
        self.doc = Some(doc);
        self
    }

    /// Sets the (required) record batches that make up the table content.
    fn with_batches(mut self, batches: Vec<RecordBatch>) -> Self {
        self.batches = Some(batches);
        self
    }

    /// Consumes the builder and produces a [`DataTable`](DataTable),
    /// failing with a message if a required field was never set.
    fn build(self) -> Result<DataTable, String> {
        let Self { batches, doc, name } = self;

        let batches = match batches {
            Some(b) => b,
            None => return Err(String::from("Cannot create DataTable without data/batches")),
        };
        let name = match name {
            Some(n) => n,
            None => return Err(String::from("Cannot create DataTable without a name.")),
        };

        Ok(DataTable { name, batches, doc })
    }
}

/// Repo is a collection of [`DataTable`](DataTable)s
/// Repo is a collection of [`DataTable`](DataTable)s
pub trait Repo {
    /// Loads the [`DataTable`](DataTable) with matching name
    fn load_table(&self, name: &str) -> Result<DataTable, String>;

    /// Loads the [`DataSet`](DataSet) with matching name
    fn load_data_set(&self, name: &str) -> Result<DataSet, String>;
}

impl dyn Repo {
    /// Returns the default repository.
    ///
    /// This is currently the only supported repository. Every data-table
    /// it serves is compiled into the binary, so loading data requires
    /// no network connection.
    pub fn default() -> impl Repo {
        DefaultRepo {}
    }
}

/// The built-in repository serving the data bundled into this crate.
struct DefaultRepo {}

impl Repo for DefaultRepo {
    /// Loads the [`DataTable`](DataTable) with corresponding name
    ///
    /// Supported names are `"iris"` and `"boston"`; any other name
    /// yields an `Err` describing the failed lookup.
    fn load_table(&self, name: &str) -> Result<DataTable, String> {
        match name {
            "iris" => crate::datatables::iris::load_table(),
            "boston" => crate::datatables::boston::load_table(),
            _ => Err(format!("{} could not be found in default-repository", name)),
        }
    }

    /// Loads the [`DataSet`](DataSet) with corresponding name
    ///
    /// No data-sets are bundled yet, so this currently always fails.
    fn load_data_set(&self, name: &str) -> Result<DataSet, String> {
        // `{}` replaces the redundant `{:}` spec; the formatted output
        // is byte-identical.
        Err(format!("Failed to find dataset {}", name))
    }
}

#[cfg(test)]
mod tests {

    use super::*;

    use arrow::datatypes::DataType;

    /// Loads the bundled iris table and checks its shape, field names,
    /// and field data types.
    #[test]
    fn test_can_load_iris() {
        let repo = Repo::default();
        let table: DataTable = repo.load_table("iris").unwrap();

        assert_eq!(
            table.num_rows(),
            150,
            // Fixed: the message previously claimed 64 observations
            // while asserting 150.
            "Iris is supposed to have 150 observations"
        );
        assert_eq!(
            table.num_columns(),
            5,
            "Iris is supposed to have 5 columns (4 features + variety)"
        );

        // Fetch the schema once instead of re-deriving it per assertion.
        let schema = table.schema();

        // Checking field names
        assert_eq!(schema.field(0).name(), "sepal_length");
        assert_eq!(schema.field(1).name(), "sepal_width");
        assert_eq!(schema.field(2).name(), "petal_length");
        assert_eq!(schema.field(3).name(), "petal_width");
        assert_eq!(schema.field(4).name(), "variety");

        // Checking field data types (this block was mislabeled "field-names")
        assert_eq!(schema.field(0).data_type(), &DataType::Float64);
        assert_eq!(schema.field(1).data_type(), &DataType::Float64);
        assert_eq!(schema.field(2).data_type(), &DataType::Float64);
        assert_eq!(schema.field(3).data_type(), &DataType::Float64);
        assert_eq!(schema.field(4).data_type(), &DataType::Utf8);
    }

    /// Loads the bundled Boston housing table and checks its shape.
    #[test]
    fn test_can_load_boston_housing() {
        let repo = Repo::default();
        let table: DataTable = repo.load_table("boston").unwrap();

        assert_eq!(table.num_rows(), 506);
        assert_eq!(table.num_columns(), 14);
    }
}