extern crate arrow;
mod datatables;
use arrow::datatypes::SchemaRef;
use arrow::record_batch::RecordBatch;
/// A named collection of [`DataTable`]s with optional documentation text.
#[derive(Clone, Debug)]
pub struct DataSet {
/// The tables that belong to this data set.
tables: Vec<DataTable>,
/// Optional documentation text for the data set.
doc: Option<String>,
/// Name of the data set.
name: String,
}
impl DataSet {
pub fn new(name: String, tables: Vec<DataTable>, doc: Option<String>) -> Self {
Self { tables, name, doc }
}
pub fn name(&self) -> &str {
&self.name
}
pub fn doc(&self) -> Option<&str> {
self.doc.as_deref()
}
pub fn tables(&self) -> &[DataTable] {
&self.tables
}
}
/// A named table whose data is held as a list of Arrow [`RecordBatch`]es,
/// with optional documentation text.
#[derive(Clone, Debug)]
pub struct DataTable {
/// The record batches holding the table's data.
batches: Vec<RecordBatch>,
/// Optional documentation text for the table.
doc: Option<String>,
/// Name of the table.
name: String,
}
impl DataTable {
pub fn data(&self) -> Vec<RecordBatch> {
self.batches.iter().map(|x| x.clone()).collect()
}
pub fn data_ref(&self) -> &[RecordBatch] {
&self.batches
}
pub fn name(&self) -> &str {
&self.name
}
pub fn num_rows(&self) -> usize {
self.batches.iter().map(|x| x.num_rows()).sum()
}
pub fn num_columns(&self) -> usize {
self.batches[0].num_columns()
}
pub fn schema(&self) -> SchemaRef {
self.batches[0].schema()
}
pub fn doc(&self) -> Option<&str> {
self.doc.as_deref()
}
}
/// Builder that collects the parts of a [`DataTable`]; every part starts out
/// unset and `build` reports an error when the batches or the name are missing.
struct DataTableBuilder {
/// Record batches for the table; required by `build`.
batches: Option<Vec<RecordBatch>>,
/// Optional documentation text for the table.
doc: Option<String>,
/// Name of the table; required by `build`.
name: Option<String>,
}
impl DataTableBuilder {
    /// Starts a builder with no batches, no documentation and no name.
    fn new() -> Self {
        Self {
            batches: None,
            doc: None,
            name: None,
        }
    }

    /// Sets the table name, replacing any name set earlier.
    fn with_name(self, name: String) -> Self {
        Self {
            name: Some(name),
            ..self
        }
    }

    /// Sets the documentation text, replacing any text set earlier.
    fn with_doc(self, doc: String) -> Self {
        Self {
            doc: Some(doc),
            ..self
        }
    }

    /// Sets the record batches, replacing any batches set earlier.
    fn with_batches(self, batches: Vec<RecordBatch>) -> Self {
        Self {
            batches: Some(batches),
            ..self
        }
    }

    /// Consumes the builder and produces the [`DataTable`].
    ///
    /// # Errors
    ///
    /// Returns an error message when the batches were never set, or, if the
    /// batches are present, when the name was never set. The documentation
    /// text is optional.
    fn build(self) -> Result<DataTable, String> {
        let DataTableBuilder {
            batches: maybe_batches,
            doc,
            name: maybe_name,
        } = self;

        // Batches are validated before the name, so a builder missing both
        // reports the missing batches.
        let batches = match maybe_batches {
            Some(found) => found,
            None => {
                return Err(String::from(
                    "Cannot create DataTable without data/batches",
                ))
            }
        };
        let name = match maybe_name {
            Some(found) => found,
            None => return Err(String::from("Cannot create DataTable without a name.")),
        };

        Ok(DataTable { name, batches, doc })
    }
}
/// A repository from which tables and data sets can be loaded by name.
pub trait Repo {
/// Loads the table identified by `name`, or returns an error message on failure.
fn load_table(&self, name: &str) -> Result<DataTable, String>;
/// Loads the data set identified by `name`, or returns an error message on failure.
fn load_data_set(&self, name: &str) -> Result<DataSet, String>;
}
impl dyn Repo {
    /// Returns the built-in repository implementation.
    ///
    /// This is an inherent function on the trait-object type, so it is called
    /// as `<dyn Repo>::default()`.
    pub fn default() -> impl Repo {
        DefaultRepo {}
    }
}
/// Field-less [`Repo`] implementation handed out by the inherent `default` function on `dyn Repo`.
struct DefaultRepo {}
impl Repo for DefaultRepo {
    /// Dispatches to the loader in `crate::datatables` that matches `name`.
    ///
    /// # Errors
    ///
    /// Returns an error message when `name` is neither `"iris"` nor
    /// `"boston"`, or forwards whatever error the selected loader returns.
    fn load_table(&self, name: &str) -> Result<DataTable, String> {
        if name == "iris" {
            return crate::datatables::iris::load_table();
        }
        if name == "boston" {
            return crate::datatables::boston::load_table();
        }
        Err(format!("{} could not be found in default-repository", name))
    }

    /// Always fails: this repository does not provide any data sets.
    ///
    /// # Errors
    ///
    /// Returns an error message naming the requested data set.
    fn load_data_set(&self, name: &str) -> Result<DataSet, String> {
        let message = format!("Failed to find dataset {:}", name);
        Err(message)
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use arrow::datatypes::DataType;

    /// The iris table loads with the expected shape, column names and types.
    #[test]
    fn test_can_load_iris() {
        // `<dyn Repo>::` names the trait-object type explicitly; the bare
        // `Repo::default()` form is deprecated and rejected by edition 2021.
        let repo = <dyn Repo>::default();
        let table: DataTable = repo.load_table("iris").unwrap();
        assert_eq!(
            table.num_rows(),
            150,
            "Iris is supposed to have 150 observations"
        );
        assert_eq!(
            table.num_columns(),
            5,
            "Iris is supposed to have 5 features"
        );

        // Fetch the schema once instead of once per assertion.
        let schema = table.schema();
        assert_eq!(schema.field(0).name(), "sepal_length");
        assert_eq!(schema.field(1).name(), "sepal_width");
        assert_eq!(schema.field(2).name(), "petal_length");
        assert_eq!(schema.field(3).name(), "petal_width");
        assert_eq!(schema.field(4).name(), "variety");
        assert_eq!(schema.field(0).data_type(), &DataType::Float64);
        assert_eq!(schema.field(1).data_type(), &DataType::Float64);
        assert_eq!(schema.field(2).data_type(), &DataType::Float64);
        assert_eq!(schema.field(3).data_type(), &DataType::Float64);
        assert_eq!(schema.field(4).data_type(), &DataType::Utf8);
    }

    /// The Boston housing table loads with the expected shape.
    #[test]
    fn test_can_load_boston_housing() {
        let repo = <dyn Repo>::default();
        let table: DataTable = repo.load_table("boston").unwrap();
        assert_eq!(table.num_rows(), 506);
        assert_eq!(table.num_columns(), 14);
    }
}