use nalgebra::DVector;
#[cfg(feature = "datasets")]
pub mod csv;
#[cfg(feature = "datasets")]
pub mod image;
pub mod preprocessing;
/// Asynchronous loader that turns a source descriptor of type `T`
/// (for example a file path or URL) into an in-memory [`Dataset`].
#[async_trait::async_trait]
pub trait DatasetLoader<T> {
/// Loader-specific error type surfaced by the `load*` methods.
type Error: std::error::Error + Send + Sync + 'static;
/// Loads the dataset from `source` as-is, with no preprocessing applied.
async fn load(&self, source: T) -> Result<Dataset, Self::Error>;
/// Loads the dataset from `source` and applies the steps described by
/// `preprocessing` before returning it.
async fn load_with_preprocessing(
&self,
source: T,
preprocessing: &PreprocessingConfig
) -> Result<Dataset, Self::Error>;
}
/// An in-memory dataset: one feature vector per sample, with optional
/// targets and optional human-readable column names.
#[derive(Debug, Clone)]
pub struct Dataset {
/// One feature vector per sample. All vectors are expected to share the
/// same length — TODO confirm; `Dataset::new` derives `feature_count`
/// from the first vector only and does not validate the rest.
pub features: Vec<DVector<f64>>,
/// One target vector per sample, when the dataset is labeled.
pub targets: Option<Vec<DVector<f64>>>,
/// Optional names for the feature columns, indexed like each feature vector.
pub feature_names: Option<Vec<String>>,
/// Optional names for the target columns.
pub target_names: Option<Vec<String>>,
/// Derived counts plus provenance information; see [`DatasetMetadata`].
pub metadata: DatasetMetadata,
}
/// Summary information about a [`Dataset`], derived from its contents at
/// construction time (see `Dataset::new`).
#[derive(Debug, Clone, Default)]
pub struct DatasetMetadata {
/// Number of samples (length of `Dataset::features`).
pub sample_count: usize,
/// Length of the first feature vector; 0 when the dataset is empty.
pub feature_count: usize,
/// Length of the first target vector, when targets are present.
pub target_count: Option<usize>,
/// Where the data came from (e.g. a file path), when known.
pub source: Option<String>,
/// Free-form description of the data kind, when known.
pub data_type: Option<String>,
}
/// Options describing which preprocessing steps to apply when loading a
/// dataset (consumed by [`DatasetLoader::load_with_preprocessing`]).
///
/// The exact scaling/encoding algorithms live in the implementing loaders
/// and the `preprocessing` module, not here.
#[derive(Debug, Clone, Default)]
pub struct PreprocessingConfig {
    /// Request feature normalization (presumably min–max scaling — confirm
    /// against the `preprocessing` module).
    pub normalize: bool,
    /// Request feature standardization (presumably zero mean / unit variance
    /// — confirm against the `preprocessing` module).
    pub standardize: bool,
    /// Indices of feature columns to one-hot encode.
    pub one_hot_encode: Vec<usize>,
    /// How to fill missing values, if at all.
    pub fill_missing: Option<FillStrategy>,
    /// Shuffle the samples after loading.
    pub shuffle: bool,
    /// Optional (train, validation, test) split ratios.
    pub split_ratios: Option<(f64, f64, f64)>,
}
/// Strategy used to replace missing values during preprocessing
/// (see [`PreprocessingConfig::fill_missing`]).
#[derive(Debug, Clone)]
pub enum FillStrategy {
/// Replace every missing value with the given constant.
Constant(f64),
/// Replace with the column mean.
Mean,
/// Replace with the column median.
Median,
/// Replace with the most frequent value in the column.
Mode,
/// Carry the previous (earlier-row) value forward.
ForwardFill,
/// Carry the next (later-row) value backward.
BackwardFill,
}
impl Dataset {
pub fn new(
features: Vec<DVector<f64>>,
targets: Option<Vec<DVector<f64>>>,
) -> Self {
let sample_count = features.len();
let feature_count = features.first().map(|f| f.len()).unwrap_or(0);
let target_count = targets.as_ref().and_then(|t| t.first().map(|t| t.len()));
Self {
features,
targets,
feature_names: None,
target_names: None,
metadata: DatasetMetadata {
sample_count,
feature_count,
target_count,
source: None,
data_type: None,
},
}
}
pub fn subset(&self, indices: &[usize]) -> Dataset {
let features: Vec<DVector<f64>> = indices.iter()
.filter_map(|&i| self.features.get(i).cloned())
.collect();
let targets = self.targets.as_ref().map(|targets| {
indices.iter()
.filter_map(|&i| targets.get(i).cloned())
.collect()
});
let mut subset = Dataset::new(features, targets);
subset.feature_names = self.feature_names.clone();
subset.target_names = self.target_names.clone();
subset.metadata.source = self.metadata.source.clone();
subset.metadata.data_type = self.metadata.data_type.clone();
subset
}
pub fn train_test_split(&self, train_ratio: f64) -> (Dataset, Dataset) {
let n_samples = self.features.len();
let n_train = (n_samples as f64 * train_ratio) as usize;
let train_indices: Vec<usize> = (0..n_train).collect();
let test_indices: Vec<usize> = (n_train..n_samples).collect();
(self.subset(&train_indices), self.subset(&test_indices))
}
pub fn describe(&self) -> DatasetStats {
DatasetStats::from_dataset(self)
}
}
/// Per-feature summary statistics for a [`Dataset`], produced by
/// [`Dataset::describe`] / [`DatasetStats::from_dataset`].
#[derive(Debug, Clone)]
pub struct DatasetStats {
/// Number of samples in the described dataset.
pub sample_count: usize,
/// Number of feature columns (from the dataset metadata).
pub feature_count: usize,
/// One entry per feature column that had at least one value present.
pub feature_stats: Vec<FeatureStats>,
}
/// Summary statistics for a single feature column.
#[derive(Debug, Clone)]
pub struct FeatureStats {
/// Column name, when the dataset carries feature names.
pub name: Option<String>,
/// Arithmetic mean of the present values.
pub mean: f64,
/// Population standard deviation (variance divides by n, not n - 1).
pub std: f64,
/// Smallest present value.
pub min: f64,
/// Largest present value.
pub max: f64,
/// Number of samples with no value for this column.
pub missing_count: usize,
}
impl DatasetStats {
    /// Computes per-feature summary statistics for `dataset`.
    ///
    /// A value counts as missing when the sample's feature vector is too
    /// short to contain that column, or when the stored value is NaN.
    /// (Previously NaN values were kept, poisoning mean/std/min/max while
    /// `missing_count` reported zero.) Exactly one [`FeatureStats`] entry is
    /// produced per feature column, so `feature_stats[i]` always describes
    /// feature `i`; a column with no usable values gets NaN statistics and a
    /// full `missing_count` instead of being silently skipped (which used to
    /// misalign the entries).
    pub fn from_dataset(dataset: &Dataset) -> Self {
        let sample_count = dataset.features.len();
        let feature_count = dataset.metadata.feature_count;
        let mut feature_stats = Vec::with_capacity(feature_count);
        for feature_idx in 0..feature_count {
            // Present, non-NaN values for this column.
            let values: Vec<f64> = dataset.features.iter()
                .filter_map(|feature| feature.get(feature_idx).copied())
                .filter(|v| !v.is_nan())
                .collect();
            // Absent entries and NaN entries both count as missing.
            let missing_count = sample_count - values.len();
            let name = dataset.feature_names.as_ref()
                .and_then(|names| names.get(feature_idx))
                .cloned();
            if values.is_empty() {
                // Placeholder keeps feature_stats aligned with column indices.
                feature_stats.push(FeatureStats {
                    name,
                    mean: f64::NAN,
                    std: f64::NAN,
                    min: f64::NAN,
                    max: f64::NAN,
                    missing_count,
                });
                continue;
            }
            let mean = values.iter().sum::<f64>() / values.len() as f64;
            // Population variance: divides by n, not n - 1.
            let variance = values.iter()
                .map(|x| (x - mean).powi(2))
                .sum::<f64>() / values.len() as f64;
            let std = variance.sqrt();
            let min = values.iter().fold(f64::INFINITY, |a, &b| a.min(b));
            let max = values.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
            feature_stats.push(FeatureStats {
                name,
                mean,
                std,
                min,
                max,
                missing_count,
            });
        }
        Self {
            sample_count,
            feature_count,
            feature_stats,
        }
    }
}
/// Errors produced while loading or validating datasets.
///
/// The CSV and image variants exist only with the `datasets` feature
/// enabled, matching the feature-gated `csv` and `image` submodules.
#[derive(Debug, thiserror::Error)]
pub enum DatasetError {
/// Underlying I/O failure (file open/read, etc.).
#[error("IO error: {0}")]
Io(#[from] std::io::Error),
/// The source data could not be parsed; the message describes the problem.
#[error("Parse error: {0}")]
Parse(String),
/// A loader or preprocessing configuration was invalid.
#[error("Invalid configuration: {0}")]
Configuration(String),
/// Loaded data failed a validation check.
#[error("Data validation error: {0}")]
Validation(String),
/// CSV-specific failure (stringified; no `#[from]` conversion here).
#[cfg(feature = "datasets")]
#[error("CSV error: {0}")]
CsvError(String),
/// Image decoding/processing failure from the `image` crate.
#[cfg(feature = "datasets")]
#[error("Image error: {0}")]
Image(#[from] ::image::ImageError),
}