use std::fmt;
use std::path::Path;
use std::cmp;
use std::fs::File;
use std::io::{Read, BufReader, BufRead};
use std::str::FromStr;
use crate::tensor::Tensor;
use crate::utils;
use crate::random::Rand;
/// Role a column plays when a [`Dataset`] is turned into tensors.
///
/// The type is a plain tag, so it is `Copy` like its sibling `RowType`; this
/// lets callers reuse a value after passing it by value (e.g. to
/// `Dataset::get_tensor`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ColumnType {
    /// Input column; counted by `Dataset::get_number_features`.
    Feature,
    /// Output column; counted by `Dataset::get_number_targets`.
    Target,
    /// Column that is neither a feature nor a target.
    Skip,
}
/// Partition a row belongs to.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum RowType {
    /// Row used for training.
    Train,
    /// Row held out for testing.
    Test,
    /// Row that belongs to neither partition.
    Skip,
}
/// Descriptive information attached to one column of a dataset.
#[derive(Debug)]
pub struct ColumnMetadata {
    /// Column label, used as the header when the dataset is displayed.
    name: String,
    /// Whether the column is a feature, a target, or skipped.
    column_type: ColumnType,
}
/// One observation of a dataset.
#[derive(Debug)]
pub struct Row {
    /// Cell values, one per column, in column order.
    data: Vec<f64>,
    /// Partition (train / test / skip) this observation belongs to.
    row_type: RowType,
}
/// Errors produced while building or loading a dataset.
#[derive(Debug)]
pub enum DatasetError {
    /// A required input file could not be opened.
    FileNotFound,
    /// The input was readable but its content is not a valid dataset; the
    /// payload is a human-readable explanation.
    BadFormat(String),
}

impl fmt::Display for DatasetError {
    /// Writes a short, user-facing description of the error.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            DatasetError::FileNotFound => write!(f, "dataset file not found"),
            DatasetError::BadFormat(reason) => write!(f, "bad dataset format: {}", reason),
        }
    }
}

// Implementing `Error` lets callers use `?` with `Box<dyn Error>` and similar.
impl std::error::Error for DatasetError {}
/// In-memory table of `f64` values with per-row and per-column annotations.
pub struct Dataset {
    /// Observations, in insertion order.
    data: Vec<Row>,
    /// One entry per column, in the same order as the values inside each row.
    columns_metadata: Vec<ColumnMetadata>,
}
impl Dataset {
pub fn from_raw_data(data: Vec<Vec<f64>>) -> Result<Dataset, DatasetError> {
let cols = data[0].len();
let mut columns_metadata = Vec::new();
if data.iter().any(|ref v| v.len() != data[0].len()) {
return Err(DatasetError::BadFormat(format!("All rows must have equal lengths.")));
}
for i in 0..cols - 1 {
columns_metadata.push(ColumnMetadata {name: format!("X_{}", i), column_type: ColumnType::Feature})
}
columns_metadata.push(ColumnMetadata {name: format!("Y"), column_type: ColumnType::Target});
let mut rows = Vec::new();
for el in data {
rows.push(Row{data: el, row_type: RowType::Train});
}
Ok(Dataset {
data: rows,
columns_metadata,
})
}
pub fn from_csv(path: &Path, header: bool) -> Result<Dataset, DatasetError> {
let delimiter = ";";
let input_file = File::open(path).unwrap();
let buffered = BufReader::new(input_file);
let mut data: Vec<Vec<f64>> = Vec::new();
for (i, line) in buffered.lines().enumerate() {
let l = line.unwrap();
let l = l.split(&delimiter);
let row_vec_str: Vec<&str> = l.collect();
if i == 0 {
continue; }
let row_vec_f64: Vec<f64> = row_vec_str.iter().map(|x| f64::from_str(x).unwrap()).collect();
data.push(row_vec_f64);
}
let dataset = Dataset::from_raw_data(data).unwrap();
Ok(dataset)
}
fn load_ubyte(path: &Path, dataset: String) -> Result<Dataset, DatasetError> {
let mut labels_file = File::open(path.join(format!("{}-labels-idx1-ubyte", dataset))).unwrap();
let mut images_file = File::open(path.join(format!("{}-images-idx3-ubyte", dataset))).unwrap();
let mut buf = [0u8;4];
images_file.read(&mut buf).unwrap();
let magic_number = utils::swap_endian(utils::as_u32_le(&buf));
assert_eq!(magic_number, 2051, "Incorrect magic number for a image file.");
let mut buf = [0u8;4];
labels_file.read(&mut buf).unwrap();
let magic_number = utils::swap_endian(utils::as_u32_le(&buf));
assert_eq!(magic_number, 2049, "Incorrect magic number for a label file.");
images_file.read(&mut buf).unwrap();
let number_images = utils::swap_endian(utils::as_u32_le(&buf));
labels_file.read(&mut buf).unwrap();
let number_labels = utils::swap_endian(utils::as_u32_le(&buf));
assert_eq!(number_images, number_labels, "Number of images and label must be identical.");
images_file.read(&mut buf).unwrap();
let rows = utils::swap_endian(utils::as_u32_le(&buf));
images_file.read(&mut buf).unwrap();
let cols = utils::swap_endian(utils::as_u32_le(&buf));
let vector_size = (rows * cols) as usize;
let mut data: Vec<Vec<f64>> = Vec::new();
for _ in 0..number_images {
let mut buf = vec![0u8;vector_size];
images_file.read(&mut buf).unwrap();
let mut pixels = utils::to_vec_f64(&buf);
pixels = pixels.into_iter().map(|x| x / 255.0).collect();
let mut label = vec![0u8;1];
labels_file.read(&mut label).unwrap();
pixels.append(&mut utils::to_vec_f64(&label));
data.push(pixels);
}
let mut dataset = Dataset::from_raw_data(data).unwrap();
dataset.one_hot_encode(vector_size);
Ok(dataset)
}
pub fn from_ubyte(path: &Path) -> Result<Dataset, DatasetError> {
let train_dataset = Dataset::load_ubyte(path, "train".to_string()).unwrap();
let mut test_dataset = Dataset::load_ubyte(path, "t10k".to_string()).unwrap();
test_dataset.set_all_rows_type(RowType::Test);
test_dataset.concatenate(train_dataset);
Ok(test_dataset)
}
pub fn set_row_type(&mut self, row_type: RowType, index: usize) {
self.data[index].row_type = row_type;
}
pub fn set_all_rows_type(&mut self, row_type: RowType) {
for row in self.data.iter_mut() {
row.row_type = row_type;
}
}
pub fn concatenate(&mut self, other: Dataset) {
self.data.extend(other.data);
}
pub fn one_hot_encode(&mut self, index: usize) {
let distinct_values = self.get_distinct_values(index);
let number_distinct_values = distinct_values.len();
for row in self.data.iter_mut() {
let value_to_encode = row.data[index];
let position = distinct_values.iter().position(|&x| x == value_to_encode).unwrap();
let mut one_hot = vec![0.0f64; number_distinct_values];
one_hot[position] = 1.0;
row.data.append(&mut one_hot);
}
for _col in 0..number_distinct_values {
let name = format!("Y"); let column_type = ColumnType::Target; self.columns_metadata.push(ColumnMetadata {name, column_type});
}
self.remove_column(index)
}
pub fn remove_column(&mut self, index: usize) {
self.columns_metadata.remove(index);
for row in self.data.iter_mut() {
row.data.remove(index);
}
}
pub fn get_distinct_values(&self, index: usize) -> Vec<f64> {
let mut result = Vec::new();
for row in &self.data {
let value = row.data[index];
if !result.contains(&value) {
result.push(value);
}
}
result.sort_by(|a, b| a.partial_cmp(b).unwrap());
result
}
pub fn get_tensor(&self, row_type: RowType, col_type: ColumnType) -> Tensor {
let rows = self.count_row_type(&row_type);
let cols = self.count_column_type(&col_type);
let shape = vec![rows, cols];
let mut col_indexes = Vec::new();
for (i, col) in self.columns_metadata.iter().enumerate() {
if col.column_type == col_type {
col_indexes.push(i);
}
}
let mut result = Vec::new();
for row in &self.data {
if row.row_type == row_type {
for col in &col_indexes {
result.push(row.data[*col]);
}
}
}
Tensor::new(result, shape)
}
pub fn split_train_test(&mut self, percentage: f64, shuffle: bool) {
let mut index = (0..self.data.len()).collect::<Vec<usize>>();
if shuffle {
let mut rand = Rand::new(18);
rand.shuffle(&mut index[..]);
}
let stop_index = (percentage * index.len() as f64) as usize;
for i in 0..self.data.len() {
let idx = index[i];
if i < stop_index {
self.data[idx].row_type = RowType::Train;
}
else {
self.data[idx].row_type = RowType::Test;
}
}
}
fn count_column_type(&self, col_type: &ColumnType) -> usize {
self.columns_metadata.iter().filter(|&n| n.column_type == *col_type).count()
}
pub fn count_row_type(&self, row_type: &RowType) -> usize {
self.data.iter().filter(|&r| r.row_type == *row_type).count()
}
pub fn get_number_features(&self) -> usize {
self.count_column_type(&ColumnType::Feature)
}
pub fn get_number_targets(&self) -> usize {
self.count_column_type(&ColumnType::Target)
}
pub fn get_row_count(&self) -> usize {
self.data.len()
}
}
impl fmt::Debug for Dataset {
    /// Writes the `Display` preview of the dataset followed by a summary:
    /// row count (with train/test breakdown), feature count and target count.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let observations = self.get_row_count();
        let train = self.count_row_type(&RowType::Train);
        let test = self.count_row_type(&RowType::Test);
        let features = self.get_number_features();
        let targets = self.get_number_targets();

        writeln!(f, "{}", self)?;
        // The trailing space before the line break is part of the output format.
        writeln!(
            f,
            "Observation(s): {} ({} train + {} test) ",
            observations, train, test
        )?;
        writeln!(f, "Feature(s): {}", features)?;
        writeln!(f, "Target(s): {}", targets)
    }
}
impl fmt::Display for Dataset {
    /// Writes a small preview table: a header line with the column names,
    /// then at most 4 rows, limited to the first 12 columns.
    ///
    /// Each cell is the value's default text form, truncated or right-padded
    /// with spaces to the byte length of its column name; cells are joined
    /// with `" | "`.
    ///
    /// An empty dataset prints only the header, and a row shorter than the
    /// header prints only the cells it has, so formatting never panics.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let sep = " | ";
        let rows = cmp::min(self.data.len(), 4);

        // Use the first row's width when there is one, but never exceed the
        // number of known column names (the header slice below relies on it).
        let first_row_len = self
            .data
            .first()
            .map_or(self.columns_metadata.len(), |row| row.data.len());
        let cols = cmp::min(cmp::min(first_row_len, self.columns_metadata.len()), 12);

        let headers: Vec<&str> = self.columns_metadata[..cols]
            .iter()
            .map(|c| c.name.as_str())
            .collect();
        write!(f, "{}", headers.join(sep))?;

        for row in &self.data[..rows] {
            let cells: Vec<String> = headers
                .iter()
                .zip(&row.data)
                .map(|(header, value)| {
                    let width = header.len();
                    // On a string, precision truncates and width pads.
                    format!("{:<width$.width$}", value.to_string(), width = width)
                })
                .collect();
            write!(f, "\n{}", cells.join(sep))?;
        }
        Ok(())
    }
}