use crate::orderbooks::Orderbook;
use csv::{Reader, ReaderBuilder, Writer};
use std::{
error::Error,
fs,
io::{BufReader, Write},
};
use toml;
pub mod loaders;
#[cfg(feature = "torch")]
use tch::{Kind, Tensor};
/// Feature-scaling strategies understood by `Dataset::transform` and the
/// torch-level `transform` helper.
///
/// NOTE(review): `Standarize` is a misspelling of "Standardize"; the variant
/// name is preserved because renaming it would break existing callers.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Transformation {
    /// Subtract the mean and divide by the standard deviation (z-score).
    Standarize,
    /// Divide by the maximum value.
    Scale,
}
/// An in-memory tabular dataset: a row index, a feature matrix (one inner
/// `Vec` per sample), and one target value per sample.
#[derive(Debug, Clone)]
pub struct Dataset {
    /// Row identifiers; auto-generated `0..n` by the builder unless an
    /// explicit index is supplied (may be empty with auto-index disabled).
    pub index: Vec<u32>,
    /// Feature rows; `DatasetBuilder::build` enforces equal row lengths.
    pub features: Vec<Vec<f64>>,
    /// Target value for each feature row (same length as `features`).
    pub target: Vec<f64>,
}
/// Fluent builder for [`Dataset`]; validates lengths on `build`.
#[derive(Debug)]
pub struct DatasetBuilder {
    // Explicit row index, if the caller provided one.
    index: Option<Vec<u32>>,
    // Feature matrix, required before `build`.
    features: Option<Vec<Vec<f64>>>,
    // Target vector, required before `build`.
    target: Option<Vec<f64>>,
    // When true (the default) and no index was given, `build` generates 0..n.
    auto_index: bool,
}
impl Default for DatasetBuilder {
fn default() -> Self {
Self::new()
}
}
impl DatasetBuilder {
    /// Creates a builder with no data and auto-indexing enabled.
    pub fn new() -> Self {
        DatasetBuilder {
            index: None,
            features: None,
            target: None,
            auto_index: true,
        }
    }

    /// Sets an explicit index column (must match the sample count on build).
    pub fn index(mut self, index: Vec<u32>) -> Self {
        self.index = Some(index);
        self
    }

    /// Sets the feature matrix, one inner `Vec` per sample.
    pub fn features(mut self, features: Vec<Vec<f64>>) -> Self {
        self.features = Some(features);
        self
    }

    /// Sets the target vector, one value per sample.
    pub fn target(mut self, target: Vec<f64>) -> Self {
        self.target = Some(target);
        self
    }

    /// Suppresses the automatic `0..n` index when no index is supplied;
    /// the built dataset's index will then be empty.
    pub fn disable_auto_index(mut self) -> Self {
        self.auto_index = false;
        self
    }

    /// Validates the collected pieces and assembles the `Dataset`.
    ///
    /// Errors when features/target are missing, their lengths differ,
    /// feature rows are ragged, or an explicit index has the wrong length.
    pub fn build(self) -> Result<Dataset, String> {
        let features = self.features.ok_or("Missing features")?;
        let target = self.target.ok_or("Missing target")?;

        if features.len() != target.len() {
            return Err(format!(
                "features and target length mismatch: {:?} vs {:?}",
                features.len(),
                target.len()
            ));
        }

        // Every row must be as wide as the first one.
        if let Some(first_row) = features.first() {
            let width = first_row.len();
            for (i, row) in features.iter().enumerate() {
                if row.len() != width {
                    return Err(format!(
                        "feature vector at index {:?} has length {:?}, expected {:?}",
                        i,
                        row.len(),
                        width
                    ));
                }
            }
        }

        let index = if let Some(idx) = self.index {
            if idx.len() != features.len() {
                return Err(format!(
                    "Index length {:?} doesn't match data length {:?}",
                    idx.len(),
                    features.len()
                ));
            }
            idx
        } else if self.auto_index {
            (0..features.len() as u32).collect()
        } else {
            Vec::new()
        };

        Ok(Dataset {
            index,
            features,
            target,
        })
    }
}
impl Dataset {
pub fn builder() -> DatasetBuilder {
DatasetBuilder::new()
}
pub fn transform(&mut self, transformation: Transformation) -> Vec<(f64, f64)> {
let n_features = self.features[0].len();
let epsilon = 1e-8;
match transformation {
Transformation::Standarize => {
let transposed: Vec<Vec<f64>> = (0..n_features)
.map(|i| self.features.iter().map(|row| row[i]).collect())
.collect();
let stats: Vec<(f64, f64)> = transposed
.iter()
.map(|feature_col| {
let n = feature_col.len() as f64;
let mean: f64 = feature_col.iter().sum::<f64>() / n;
let variance =
feature_col.iter().map(|x| (x - mean).powi(2)).sum::<f64>()
/ n;
let std_dev = variance.sqrt().max(epsilon);
(mean, std_dev)
})
.collect();
self.features = self
.features
.iter()
.map(|row| {
row.iter()
.enumerate()
.map(|(i, &x)| {
let (mean, std_dev) = &stats[i];
(x - mean) / std_dev
})
.collect()
})
.collect();
stats
}
Transformation::Scale => {
let transposed: Vec<Vec<f64>> = (0..n_features)
.map(|i| self.features.iter().map(|row| row[i]).collect())
.collect();
let maxs: Vec<(f64, f64)> = transposed
.iter()
.map(|feature_col| {
let max: f64 = feature_col
.iter()
.cloned()
.fold(f64::NEG_INFINITY, f64::max)
.max(epsilon);
(0.0, max)
})
.collect();
self.features = self
.features
.iter()
.map(|row| {
row.iter()
.enumerate()
.map(|(i, &x)| {
let (_mean, max) = maxs[i];
x / max
})
.collect()
})
.collect();
maxs
}
}
}
    /// Loads a `Dataset` from a CSV file.
    ///
    /// Column roles come from `column_types` (`0` = index, `1` = feature,
    /// `2` = target): a single-element Vec broadcasts one role to every
    /// column, a Vec matching the column count is used as-is, and anything
    /// else falls back to "first column = index, last column = target, rest
    /// features". `target_column` (default: last column) overrides the role
    /// of that one column regardless of `column_types`.
    ///
    /// Parsing is lossy by design: a bad index field becomes `0`, a bad
    /// feature/target field becomes `NaN`, and rows with no index column use
    /// their position. Errors are returned only for I/O / CSV-format
    /// problems, or an empty file when `header` is false.
    pub fn from_csv(
        file_route: &str,
        header: bool,
        column_types: Option<Vec<u32>>,
        target_column: Option<u32>,
    ) -> Result<Self, Box<dyn Error>> {
        // First pass: open the file just to discover the column count.
        let file = fs::File::open(file_route)?;
        let mut rdr = ReaderBuilder::new().has_headers(header).from_reader(file);
        let col_count = if header {
            rdr.headers()?.len()
        } else {
            // Without a header the first data record is consumed here; the
            // file is reopened below, so that record is still processed.
            let mut records = rdr.records();
            let first = records.next().ok_or("CSV file is empty")??;
            first.len()
        };
        let col_types: Vec<u32> = match column_types {
            Some(ref v) if v.len() == 1 => vec![v[0]; col_count],
            Some(ref v) if v.len() == col_count => v.clone(),
            _ => {
                // Default layout: index, features..., target.
                // NOTE(review): panics if col_count == 0 (e.g. empty header
                // row) — confirm whether that input can occur.
                let mut types = vec![1; col_count];
                types[0] = 0; types[col_count - 1] = 2; types
            }
        };
        let target_col = target_column.unwrap_or((col_count - 1) as u32);
        let mut index = Vec::new();
        let mut features = Vec::new();
        let mut target = Vec::new();
        // Second pass: reopen the file and read every record.
        let mut rdr = ReaderBuilder::new()
            .has_headers(header)
            .from_path(file_route)?;
        for result in rdr.records() {
            let record = result?;
            let mut row_features = Vec::new();
            let mut row_index: Option<u32> = None;
            let mut row_target: Option<f64> = None;
            for (i, field) in record.iter().enumerate() {
                // The explicit target column wins over `col_types`.
                let col_type = if i == target_col as usize {
                    2
                } else {
                    col_types.get(i).copied().unwrap_or(1)
                };
                match col_type {
                    0 => {
                        // Index column; unparseable values fall back to 0.
                        let idx: u32 = field.parse().unwrap_or(0);
                        row_index = Some(idx);
                    }
                    1 => {
                        // Feature column; unparseable values become NaN.
                        let val: f64 = field.parse().unwrap_or(f64::NAN);
                        row_features.push(val);
                    }
                    2 => {
                        // Target column; unparseable values become NaN.
                        let val: f64 = field.parse().unwrap_or(f64::NAN);
                        row_target = Some(val);
                    }
                    _ => {}
                }
            }
            // Missing index -> positional fallback; missing target -> NaN.
            index.push(row_index.unwrap_or(index.len() as u32));
            features.push(row_features);
            target.push(row_target.unwrap_or(f64::NAN));
        }
        Ok(Dataset {
            index,
            features,
            target,
        })
    }
#[cfg(feature = "torch")]
pub fn from_vec_to_tensor(self) -> (Tensor, Tensor) {
let d_features = self.features;
let d_targets = self.target;
let num_samples = d_features.len() as i64;
let num_features = d_features[0].len() as i64;
let flat_features: Vec<f64> = d_features.into_iter().flatten().clone().collect();
let features_tensor = Tensor::from_slice(&flat_features)
.reshape([num_samples, num_features])
.to_kind(Kind::Float);
let targets_tensor = Tensor::from_slice(&d_targets)
.reshape([num_samples])
.to_kind(Kind::Float);
(features_tensor, targets_tensor)
}
pub fn from_vectors(
self,
features: Vec<Vec<f64>>,
target: Vec<f64>,
) -> Result<Self, String> {
Self::builder().features(features).target(target).build()
}
pub fn get_pairs(&self) -> Vec<(Vec<f64>, f64)> {
self.features
.iter()
.zip(self.target.iter())
.map(|(features, &target)| (features.clone(), target))
.collect()
}
pub fn get_parirs_ref(&self) -> Vec<(&Vec<f64>, f64)> {
self.features
.iter()
.zip(self.target.iter())
.map(|(features, &target)| (features, target))
.collect()
}
pub fn len(&self) -> usize {
self.features.len()
}
pub fn is_empty(&self) -> bool {
self.features.is_empty()
}
pub fn feature_count(&self) -> usize {
self.features.first().map_or(0, |f| f.len())
}
    /// Borrows the full feature matrix.
    pub fn get_features(&self) -> &Vec<Vec<f64>> {
        &self.features
    }
    /// Borrows the target vector.
    pub fn get_target(&self) -> &Vec<f64> {
        &self.target
    }
    /// Borrows the index column.
    pub fn get_index(&self) -> &Vec<u32> {
        &self.index
    }
pub fn get_sample(&self, idx: usize) -> Option<(&Vec<f64>, f64)> {
if idx < self.len() {
Some((&self.features[idx], self.target[idx]))
} else {
None
}
}
pub fn get_sample_by_index(&self, index_value: u32) -> Option<(&Vec<f64>, f64)> {
self.index
.iter()
.position(|&idx| idx == index_value)
.and_then(|pos| self.get_sample(pos))
}
pub fn shift_features(&self) -> Dataset {
if self.features.len() < 2 {
return Dataset {
index: Vec::new(),
features: Vec::new(),
target: Vec::new(),
};
}
let shifted_features = self.features[1..].to_vec();
let aligned_targets = self.target[..self.target.len() - 1].to_vec();
let shifted_index = (0..shifted_features.len() as u32).collect();
Dataset {
index: shifted_index,
features: shifted_features,
target: aligned_targets,
}
}
}
/// Applies `operation` to the whole tensor at once (not per column).
#[cfg(feature = "torch")]
pub fn transform(data: &Tensor, operation: Transformation) -> Tensor {
    let epsilon = 1e-8;
    match operation {
        Transformation::Standarize => {
            // Z-score over every element; epsilon guards division by zero.
            let centered = data - data.mean(Kind::Float);
            let spread = data.std(true) + epsilon;
            (centered / spread).to_kind(Kind::Float)
        }
        Transformation::Scale => data / data.max(),
    }
}
/// Truncates `num` toward zero, keeping at most `decimal_places`
/// fractional digits.
pub fn truncate_to_decimal(num: f64, decimal_places: u32) -> f64 {
    let factor = 10_f64.powi(decimal_places as i32);
    let scaled = (num * factor).trunc();
    scaled / factor
}
/// Reads `file_route` and validates that its contents parse as TOML.
///
/// Fix: the previous version deserialized into `()`, which fails for
/// ordinary TOML tables and so rejected valid documents; parsing into
/// `toml::Value` accepts any well-formed document while still surfacing
/// syntax errors.
pub fn load_from_toml(file_route: &str) -> Result<(), Box<dyn Error>> {
    let contents = fs::read_to_string(file_route)?;
    contents.parse::<toml::Value>()?;
    Ok(())
}
/// Deserializes a JSON file into a vector of `Orderbook`s.
pub fn load_from_json(file_route: &str) -> Result<Vec<Orderbook>, Box<dyn Error>> {
    let reader = BufReader::new(fs::File::open(file_route)?);
    Ok(serde_json::from_reader(reader)?)
}
/// Serializes the order books to JSON and writes them to `file_route`.
///
/// Improvements: accepts `&[Orderbook]` (callers passing `&Vec` still
/// compile via deref coercion), uses `fs::write` instead of manual
/// create/write_all, and `expect` messages explain panics.
///
/// # Panics
/// Panics if serialization or the file write fails.
pub fn write_to_json(ob_data: &[Orderbook], file_route: &str) {
    let ob_json =
        serde_json::to_string(ob_data).expect("failed to serialize orderbooks to JSON");
    fs::write(file_route, ob_json).expect("failed to write JSON file");
}
/// Reads a CSV file into rows of floats, skipping the first column of
/// every record (presumably an index column — verify against writers).
/// The first unparseable field aborts the load with an error.
pub fn load_from_csv(file_route: &str) -> Result<Vec<Vec<f64>>, Box<dyn Error>> {
    let mut reader = Reader::from_path(file_route)?;
    let mut rows = Vec::new();
    for record in reader.records() {
        let record = record?;
        let mut row = Vec::with_capacity(record.len().saturating_sub(1));
        for field in record.iter().skip(1) {
            row.push(field.parse::<f64>()?);
        }
        rows.push(row);
    }
    Ok(rows)
}
/// Writes a `Dataset` to CSV with an `index,feature_0..,target` header.
///
/// Fix: datasets built with `disable_auto_index` have an empty `index`,
/// so `data.index[i]` panicked; the row position is now used as a
/// fallback identifier. `unwrap()`s replaced with descriptive `expect`s.
///
/// # Panics
/// Panics if the file cannot be created or written.
pub fn write_to_csv(data: &Dataset, file_route: &str) {
    let mut wtr = Writer::from_path(file_route).expect("failed to create CSV file");
    if !data.features.is_empty() {
        let mut header = Vec::with_capacity(data.features[0].len() + 2);
        header.push("index".to_string());
        for i in 0..data.features[0].len() {
            header.push(format!("feature_{i}"));
        }
        header.push("target".to_string());
        wtr.write_record(&header).expect("failed to write CSV header");
    }
    for (i, row) in data.features.iter().enumerate() {
        let mut csv_row = Vec::with_capacity(row.len() + 2);
        // Fall back to the positional index when the index column is
        // missing or shorter than the feature matrix.
        let idx = data
            .index
            .get(i)
            .map_or_else(|| i.to_string(), |v| v.to_string());
        csv_row.push(idx);
        for value in row {
            csv_row.push(value.to_string());
        }
        csv_row.push(data.target[i].to_string());
        wtr.write_record(&csv_row).expect("failed to write CSV row");
    }
    wtr.flush().expect("failed to flush CSV writer");
}