use crate::dataset::Dataset;
use torsh_core::{
dtype::TensorElement,
error::{Result, TorshError},
};
use torsh_tensor::Tensor;
#[cfg(not(feature = "std"))]
use alloc::{boxed::Box, string::String, vec::Vec};
use std::path::Path;
/// In-memory dataset loaded from a CSV file.
///
/// Each row's non-target columns are stored as `f32` features; the optional
/// target column is stored separately. Actual CSV parsing requires the
/// `dataframe` feature (see `CSVDataset::new`).
pub struct CSVDataset {
    // One Vec<f32> per CSV row, containing only the feature columns.
    data: Vec<Vec<f32>>,
    // Target value per row, present only when a target column was selected.
    targets: Option<Vec<f32>>,
    // Feature column names (synthesized as "col_i" when the file lacks a header).
    feature_names: Vec<String>,
    // Name of the target column, if one was selected.
    target_name: Option<String>,
}
impl CSVDataset {
    /// Loads a CSV file from `path`.
    ///
    /// When `target_column` names an existing column, that column's values
    /// become the targets and the remaining columns the features. When
    /// `has_header` is false, column names are synthesized as `col_0`,
    /// `col_1`, ... and the first record is treated as data.
    ///
    /// # Errors
    /// Returns `TorshError::IoError` on read/parse failures (or an empty
    /// header-less file), and `TorshError::UnsupportedOperation` when the
    /// `dataframe` feature is disabled.
    pub fn new<P: AsRef<Path>>(
        path: P,
        target_column: Option<&str>,
        has_header: bool,
    ) -> Result<Self> {
        #[cfg(feature = "dataframe")]
        {
            let mut reader = csv::ReaderBuilder::new()
                .has_headers(has_header)
                .from_path(path.as_ref())
                .map_err(|e| TorshError::IoError(e.to_string()))?;
            // In the header-less case the first record is only *peeked* to
            // learn the column count; it is buffered and replayed below so it
            // is not silently dropped from the data (previous bug: the first
            // data row was consumed and lost).
            let (headers, first_record) = if has_header {
                let names = reader
                    .headers()
                    .map_err(|e| TorshError::IoError(e.to_string()))?
                    .iter()
                    .map(|s| s.to_string())
                    .collect::<Vec<String>>();
                (names, None)
            } else {
                let record = reader
                    .records()
                    .next()
                    .ok_or_else(|| TorshError::IoError("Empty CSV file".to_string()))?
                    .map_err(|e| TorshError::IoError(e.to_string()))?;
                let names = (0..record.len()).map(|i| format!("col_{}", i)).collect();
                (names, Some(record))
            };
            // Split header names into feature columns and the optional target.
            let mut feature_names = Vec::new();
            let mut target_name = None;
            let mut target_idx = None;
            for (idx, col_name) in headers.iter().enumerate() {
                if target_column == Some(col_name.as_str()) {
                    target_name = Some(col_name.clone());
                    target_idx = Some(idx);
                } else {
                    feature_names.push(col_name.clone());
                }
            }
            let mut all_data: Vec<Vec<f32>> = Vec::new();
            let mut target_data: Vec<f32> = Vec::new();
            // Replay the buffered first record (header-less case) ahead of
            // the remaining records.
            let records = first_record.into_iter().map(Ok).chain(reader.records());
            for result in records {
                let record = result.map_err(|e| TorshError::IoError(e.to_string()))?;
                let mut row_features = Vec::with_capacity(record.len());
                for (idx, field) in record.iter().enumerate() {
                    // NOTE(review): unparsable fields silently become 0.0 —
                    // kept for backward compatibility.
                    let value: f32 = field.parse().unwrap_or(0.0);
                    if Some(idx) == target_idx {
                        target_data.push(value);
                    } else {
                        row_features.push(value);
                    }
                }
                all_data.push(row_features);
            }
            let targets = target_idx.map(|_| target_data);
            Ok(Self {
                data: all_data,
                targets,
                feature_names,
                target_name,
            })
        }
        #[cfg(not(feature = "dataframe"))]
        {
            // Consume the arguments so the non-dataframe build compiles
            // without unused-variable warnings.
            let _ = (path, target_column, has_header);
            Err(TorshError::UnsupportedOperation {
                op: "CSV loading".to_string(),
                dtype: "DataFrame".to_string(),
            })
        }
    }

    /// Names of the feature columns, in file order (target excluded).
    pub fn feature_names(&self) -> &[String] {
        &self.feature_names
    }

    /// Name of the target column, if one was selected.
    pub fn target_name(&self) -> Option<&String> {
        self.target_name.as_ref()
    }

    /// Number of feature columns.
    pub fn num_features(&self) -> usize {
        self.feature_names.len()
    }

    /// Drops the targets, keeping only the feature matrix.
    pub fn features_only(mut self) -> Self {
        self.targets = None;
        self.target_name = None;
        self
    }
}
impl Dataset for CSVDataset {
    type Item = (Tensor<f32>, Option<Tensor<f32>>);

    /// Number of rows loaded from the file.
    fn len(&self) -> usize {
        self.data.len()
    }

    /// Returns `(features, target)` for one row as CPU tensors; the target
    /// is `None` when no target column was selected at load time.
    fn get(&self, index: usize) -> Result<Self::Item> {
        let size = self.data.len();
        if index >= size {
            return Err(TorshError::IndexError { index, size });
        }
        let row = self.data[index].clone();
        let features = Tensor::from_data(
            row,
            vec![self.num_features()],
            torsh_core::device::DeviceType::Cpu,
        )?;
        let target = match self.targets.as_ref() {
            Some(targets) => Some(Tensor::from_data(
                vec![targets[index]],
                vec![1],
                torsh_core::device::DeviceType::Cpu,
            )?),
            None => None,
        };
        Ok((features, target))
    }
}
/// Dataset backed by pre-built tensors.
pub struct ArrayDataset<T: TensorElement> {
    // Feature tensor; axis 0 is the sample/batch dimension.
    features: Tensor<T>,
    // Optional target tensor; `new` enforces it has the same size(0) as `features`.
    targets: Option<Tensor<T>>,
}
impl<T: TensorElement> ArrayDataset<T> {
    /// Creates a dataset from a feature tensor and optional targets.
    ///
    /// # Errors
    /// Returns `TorshError::ShapeMismatch` when the targets' batch dimension
    /// (axis 0) differs from the features'.
    pub fn new(features: Tensor<T>, targets: Option<Tensor<T>>) -> Result<Self> {
        if let Some(ref t) = targets {
            let n_feature_rows = features.size(0)?;
            let n_target_rows = t.size(0)?;
            if n_feature_rows != n_target_rows {
                return Err(TorshError::ShapeMismatch {
                    expected: vec![n_feature_rows],
                    got: vec![n_target_rows],
                });
            }
        }
        Ok(Self { features, targets })
    }

    /// Creates a dataset with features only (no targets).
    pub fn features_only(features: Tensor<T>) -> Self {
        Self {
            features,
            targets: None,
        }
    }

    /// Number of feature columns: `size(1)` for 2-D (or higher) features,
    /// 1 for scalar/vector features.
    pub fn num_features(&self) -> Result<usize> {
        match self.features.ndim() {
            0 | 1 => Ok(1),
            _ => self.features.size(1),
        }
    }
}
impl<T: TensorElement> Dataset for ArrayDataset<T> {
    type Item = (Tensor<T>, Option<Tensor<T>>);
    /// Number of samples: size of axis 0 (0 if the size query fails).
    fn len(&self) -> usize {
        self.features.size(0).unwrap_or(0)
    }
    /// Returns the `index`-th sample as `(features, targets)`.
    ///
    /// FIXME(review): for 2-D features this returns *zero-filled* tensors of
    /// the right shapes rather than the actual row data — presumably a
    /// placeholder awaiting a tensor row-slicing API (only `zeros` is used
    /// here). For non-2-D features it returns a clone of the ENTIRE
    /// feature/target tensors instead of a single sample. Callers relying on
    /// real per-row values will get wrong results; confirm intent upstream.
    fn get(&self, index: usize) -> Result<Self::Item> {
        if index >= self.len() {
            return Err(TorshError::IndexError {
                index,
                size: self.len(),
            });
        }
        let features_shape_ref = self.features.shape();
        let features_shape = features_shape_ref.dims();
        if features_shape.len() == 2 && index < features_shape[0] {
            let num_features = features_shape[1];
            // FIXME(review): placeholder — zeros, not the actual row values.
            let row_features = torsh_tensor::creation::zeros::<T>(&[num_features])?;
            let targets = if let Some(ref target_tensor) = self.targets {
                let target_shape_ref = target_tensor.shape();
                let target_shape = target_shape_ref.dims();
                if target_shape.len() == 1 && index < target_shape[0] {
                    // 1-D targets: one scalar target per sample.
                    Some(torsh_tensor::creation::zeros::<T>(&[1])?)
                } else if target_shape.len() == 2 && index < target_shape[0] {
                    // 2-D targets: one target row per sample.
                    let target_features = target_shape[1];
                    Some(torsh_tensor::creation::zeros::<T>(&[target_features])?)
                } else {
                    // Target shape doesn't cover this index: no target.
                    None
                }
            } else {
                None
            };
            Ok((row_features, targets))
        } else {
            // FIXME(review): clones the whole tensors instead of one sample.
            let features = self.features.clone();
            let targets = self.targets.clone();
            Ok((features, targets))
        }
    }
}
pub mod preprocessing {
use super::*;
/// Standardizes features to zero mean and unit variance (z-score scaling).
pub struct StandardScaler {
    // Per-feature mean, computed by `fit`.
    mean: Vec<f32>,
    // Per-feature population standard deviation; zeros are replaced by 1.0
    // so constant features scale to 0 instead of NaN.
    std: Vec<f32>,
    fitted: bool,
}

impl Default for StandardScaler {
    fn default() -> Self {
        Self::new()
    }
}

impl StandardScaler {
    /// Creates an unfitted scaler.
    pub fn new() -> Self {
        Self {
            mean: Vec::new(),
            std: Vec::new(),
            fitted: false,
        }
    }

    /// Computes per-feature mean and (population) standard deviation.
    ///
    /// # Errors
    /// Fails when `data` is empty or when rows have inconsistent lengths
    /// (previously a longer ragged row panicked out of bounds and a shorter
    /// one silently skewed the statistics).
    pub fn fit(&mut self, data: &[Vec<f32>]) -> Result<()> {
        if data.is_empty() {
            return Err(TorshError::InvalidArgument(
                "Cannot fit scaler to empty data".to_string(),
            ));
        }
        let num_features = data[0].len();
        if data.iter().any(|row| row.len() != num_features) {
            return Err(TorshError::InvalidArgument(
                "All samples must have the same number of features".to_string(),
            ));
        }
        let num_samples = data.len() as f32;
        self.mean = vec![0.0; num_features];
        for sample in data {
            for (i, &value) in sample.iter().enumerate() {
                self.mean[i] += value;
            }
        }
        for mean in &mut self.mean {
            *mean /= num_samples;
        }
        self.std = vec![0.0; num_features];
        for sample in data {
            for (i, &value) in sample.iter().enumerate() {
                let diff = value - self.mean[i];
                self.std[i] += diff * diff;
            }
        }
        for std in &mut self.std {
            *std = (*std / num_samples).sqrt();
            if *std == 0.0 {
                // Constant feature: avoid division by zero in `transform`.
                *std = 1.0;
            }
        }
        self.fitted = true;
        Ok(())
    }

    /// Scales one sample: `(x - mean) / std` per feature.
    ///
    /// # Errors
    /// Fails when the scaler is unfitted or `data` has the wrong width.
    pub fn transform(&self, data: Vec<f32>) -> Result<Vec<f32>> {
        if !self.fitted {
            return Err(TorshError::InvalidArgument(
                "Scaler must be fitted before transform".to_string(),
            ));
        }
        if data.len() != self.mean.len() {
            return Err(TorshError::InvalidArgument(
                "Data dimensions don't match fitted scaler".to_string(),
            ));
        }
        let mut scaled = Vec::with_capacity(data.len());
        for (i, &value) in data.iter().enumerate() {
            scaled.push((value - self.mean[i]) / self.std[i]);
        }
        Ok(scaled)
    }

    /// Fits on `data` and returns every sample transformed.
    pub fn fit_transform(&mut self, data: &[Vec<f32>]) -> Result<Vec<Vec<f32>>> {
        self.fit(data)?;
        data.iter()
            .map(|sample| self.transform(sample.clone()))
            .collect()
    }
}
/// Scales features to the [0, 1] range using per-feature min/max.
pub struct MinMaxScaler {
    // Per-feature minimum observed during `fit`.
    min: Vec<f32>,
    // Per-feature maximum observed during `fit`.
    max: Vec<f32>,
    fitted: bool,
}

impl Default for MinMaxScaler {
    fn default() -> Self {
        Self::new()
    }
}

impl MinMaxScaler {
    /// Creates an unfitted scaler.
    pub fn new() -> Self {
        Self {
            min: Vec::new(),
            max: Vec::new(),
            fitted: false,
        }
    }

    /// Records per-feature minima and maxima.
    ///
    /// # Errors
    /// Fails when `data` is empty.
    pub fn fit(&mut self, data: &[Vec<f32>]) -> Result<()> {
        if data.is_empty() {
            return Err(TorshError::InvalidArgument(
                "Cannot fit scaler to empty data".to_string(),
            ));
        }
        let num_features = data[0].len();
        self.min = vec![f32::INFINITY; num_features];
        self.max = vec![f32::NEG_INFINITY; num_features];
        for sample in data {
            for (i, &value) in sample.iter().enumerate() {
                self.min[i] = self.min[i].min(value);
                self.max[i] = self.max[i].max(value);
            }
        }
        self.fitted = true;
        Ok(())
    }

    /// Scales one sample into [0, 1]; constant features map to 0.0.
    ///
    /// # Errors
    /// Fails when the scaler is unfitted, or when `data` has a different
    /// number of features than the fitted data (previously a longer sample
    /// panicked and a shorter one silently returned a truncated result —
    /// now checked, consistent with `StandardScaler::transform`).
    pub fn transform(&self, data: Vec<f32>) -> Result<Vec<f32>> {
        if !self.fitted {
            return Err(TorshError::InvalidArgument(
                "Scaler must be fitted before transform".to_string(),
            ));
        }
        if data.len() != self.min.len() {
            return Err(TorshError::InvalidArgument(
                "Data dimensions don't match fitted scaler".to_string(),
            ));
        }
        let mut scaled = Vec::with_capacity(data.len());
        for (i, &value) in data.iter().enumerate() {
            let range = self.max[i] - self.min[i];
            let scaled_value = if range > 0.0 {
                (value - self.min[i]) / range
            } else {
                // Constant feature: defined as 0.0 to avoid division by zero.
                0.0
            };
            scaled.push(scaled_value);
        }
        Ok(scaled)
    }
}
/// Maps string labels to integer indices based on sorted unique classes.
pub struct LabelEncoder {
    // Sorted, de-duplicated class names learned by `fit`.
    classes: Vec<String>,
    fitted: bool,
}

impl Default for LabelEncoder {
    fn default() -> Self {
        Self::new()
    }
}

impl LabelEncoder {
    /// Creates an unfitted encoder.
    pub fn new() -> Self {
        Self {
            classes: Vec::new(),
            fitted: false,
        }
    }

    /// Learns the sorted, de-duplicated class list from `labels`.
    pub fn fit(&mut self, labels: &[String]) -> Result<()> {
        let mut classes = labels.to_vec();
        classes.sort();
        classes.dedup();
        self.classes = classes;
        self.fitted = true;
        Ok(())
    }

    /// Encodes each label as its index in the sorted class list.
    ///
    /// # Errors
    /// Fails when unfitted, or when a label was not seen during `fit`.
    pub fn transform(&self, labels: &[String]) -> Result<Vec<usize>> {
        if !self.fitted {
            return Err(TorshError::InvalidArgument(
                "Encoder must be fitted before transform".to_string(),
            ));
        }
        labels
            .iter()
            .map(|label| {
                self.classes
                    .iter()
                    .position(|c| c == label)
                    .ok_or_else(|| {
                        TorshError::InvalidArgument(format!("Unknown label: {}", label))
                    })
            })
            .collect()
    }

    /// The learned classes, sorted alphabetically.
    pub fn classes(&self) -> &[String] {
        &self.classes
    }
}
/// One-hot encodes string labels over an alphabetically sorted class set.
pub struct OneHotEncoder {
    // Sorted, de-duplicated class names learned by `fit`.
    classes: Vec<String>,
    fitted: bool,
}

impl Default for OneHotEncoder {
    fn default() -> Self {
        Self::new()
    }
}

impl OneHotEncoder {
    /// Creates an unfitted encoder.
    pub fn new() -> Self {
        Self {
            classes: Vec::new(),
            fitted: false,
        }
    }

    /// Learns the sorted, de-duplicated class set from `labels`.
    pub fn fit(&mut self, labels: &[String]) -> Result<()> {
        let mut classes = labels.to_vec();
        classes.sort();
        classes.dedup();
        self.classes = classes;
        self.fitted = true;
        Ok(())
    }

    /// Encodes each label as a one-hot vector of length `n_features()`.
    ///
    /// # Errors
    /// Fails when unfitted, or when a label was not seen during `fit`.
    pub fn transform(&self, labels: &[String]) -> Result<Vec<Vec<f32>>> {
        if !self.fitted {
            return Err(TorshError::InvalidArgument(
                "Encoder must be fitted before transform".to_string(),
            ));
        }
        labels
            .iter()
            .map(|label| {
                let idx = self
                    .classes
                    .iter()
                    .position(|c| c == label)
                    .ok_or_else(|| {
                        TorshError::InvalidArgument(format!("Unknown label: {}", label))
                    })?;
                let mut row = vec![0.0f32; self.classes.len()];
                row[idx] = 1.0;
                Ok(row)
            })
            .collect()
    }

    /// The learned classes, sorted alphabetically.
    pub fn classes(&self) -> &[String] {
        &self.classes
    }

    /// Width of each one-hot vector (number of classes).
    pub fn n_features(&self) -> usize {
        self.classes.len()
    }
}
/// Replaces missing (`None`) values with per-feature statistics.
pub struct SimpleImputer {
    strategy: ImputeStrategy,
    // Per-feature replacement value computed by `fit`.
    fill_values: Vec<f32>,
    fitted: bool,
}

/// How `SimpleImputer` computes each feature's fill value.
#[derive(Clone, Copy, Debug)]
pub enum ImputeStrategy {
    /// Arithmetic mean of the observed values.
    Mean,
    /// Median of the observed values.
    Median,
    /// Most frequent value, quantized to a resolution of 0.001.
    Mode,
    /// A caller-supplied constant.
    Constant(f32),
}

impl Default for SimpleImputer {
    fn default() -> Self {
        Self::new(ImputeStrategy::Mean)
    }
}

impl SimpleImputer {
    /// Creates an unfitted imputer with the given strategy.
    pub fn new(strategy: ImputeStrategy) -> Self {
        Self {
            strategy,
            fill_values: Vec::new(),
            fitted: false,
        }
    }

    /// Computes a fill value for each feature from the observed (non-`None`)
    /// entries; features with no observed values fall back to 0.0.
    ///
    /// # Errors
    /// Fails when `data` is empty.
    pub fn fit(&mut self, data: &[Vec<Option<f32>>]) -> Result<()> {
        if data.is_empty() {
            return Err(TorshError::InvalidArgument(
                "Cannot fit imputer to empty data".to_string(),
            ));
        }
        let num_features = data[0].len();
        self.fill_values = vec![0.0; num_features];
        for feature_idx in 0..num_features {
            let feature_values: Vec<f32> =
                data.iter().filter_map(|row| row[feature_idx]).collect();
            if feature_values.is_empty() {
                // No observed values for this feature: default fill of 0.0.
                self.fill_values[feature_idx] = 0.0;
                continue;
            }
            match self.strategy {
                ImputeStrategy::Mean => {
                    self.fill_values[feature_idx] =
                        feature_values.iter().sum::<f32>() / feature_values.len() as f32;
                }
                ImputeStrategy::Median => {
                    let mut sorted = feature_values.clone();
                    sorted.sort_by(|a, b| {
                        a.partial_cmp(b)
                            .expect("comparison should succeed for finite values")
                    });
                    let mid = sorted.len() / 2;
                    // Even count: average the two middle values.
                    self.fill_values[feature_idx] = if sorted.len() % 2 == 0 {
                        (sorted[mid - 1] + sorted[mid]) / 2.0
                    } else {
                        sorted[mid]
                    };
                }
                ImputeStrategy::Mode => {
                    // Quantize to milli-units so f32 values can be counted in
                    // a HashMap (f32 itself is not hashable).
                    let mut counts = std::collections::HashMap::new();
                    for &val in &feature_values {
                        *counts.entry((val * 1000.0) as i32).or_insert(0) += 1;
                    }
                    // Break count ties deterministically by preferring the
                    // smallest key: HashMap iteration order is unspecified,
                    // so the previous plain max-by-count picked an arbitrary
                    // mode when several values were equally frequent.
                    let mode_key = counts
                        .iter()
                        .max_by_key(|&(&key, &count)| (count, std::cmp::Reverse(key)))
                        .map(|(&key, _)| key);
                    self.fill_values[feature_idx] =
                        mode_key.map_or(0.0, |k| k as f32 / 1000.0);
                }
                ImputeStrategy::Constant(value) => {
                    self.fill_values[feature_idx] = value;
                }
            }
        }
        self.fitted = true;
        Ok(())
    }

    /// Replaces every `None` in `data` with the fitted fill value for that
    /// position; observed values pass through unchanged.
    ///
    /// # Errors
    /// Fails when the imputer is unfitted, or when `data` has a different
    /// number of features than the fitted data.
    pub fn transform(&self, data: Vec<Option<f32>>) -> Result<Vec<f32>> {
        if !self.fitted {
            return Err(TorshError::InvalidArgument(
                "Imputer must be fitted before transform".to_string(),
            ));
        }
        if data.len() != self.fill_values.len() {
            return Err(TorshError::InvalidArgument(
                "Data dimensions don't match fitted imputer".to_string(),
            ));
        }
        let imputed = data
            .iter()
            .enumerate()
            .map(|(i, &val)| val.unwrap_or(self.fill_values[i]))
            .collect();
        Ok(imputed)
    }
}
/// Drops features whose population variance does not exceed a threshold.
pub struct VarianceThreshold {
    threshold: f32,
    // Per-feature population variances computed by `fit`.
    variances: Vec<f32>,
    // Indices of features whose variance exceeds `threshold`.
    selected_features: Vec<usize>,
    fitted: bool,
}

impl VarianceThreshold {
    /// Creates a selector keeping features with variance > `threshold`.
    pub fn new(threshold: f32) -> Self {
        Self {
            threshold,
            variances: Vec::new(),
            selected_features: Vec::new(),
            fitted: false,
        }
    }

    /// Computes every feature column's population variance and records the
    /// columns that pass the threshold.
    ///
    /// # Errors
    /// Fails when `data` is empty.
    pub fn fit(&mut self, data: &[Vec<f32>]) -> Result<()> {
        if data.is_empty() {
            return Err(TorshError::InvalidArgument(
                "Cannot fit selector to empty data".to_string(),
            ));
        }
        let num_features = data[0].len();
        let n = data.len() as f32;
        let mut variances = Vec::with_capacity(num_features);
        for col in 0..num_features {
            let mean = data.iter().map(|row| row[col]).sum::<f32>() / n;
            let variance = data
                .iter()
                .map(|row| (row[col] - mean).powi(2))
                .sum::<f32>()
                / n;
            variances.push(variance);
        }
        self.selected_features = variances
            .iter()
            .enumerate()
            .filter_map(|(idx, &var)| if var > self.threshold { Some(idx) } else { None })
            .collect();
        self.variances = variances;
        self.fitted = true;
        Ok(())
    }

    /// Keeps only the selected feature positions from `data`.
    ///
    /// # Errors
    /// Fails when the selector is unfitted.
    pub fn transform(&self, data: Vec<f32>) -> Result<Vec<f32>> {
        if !self.fitted {
            return Err(TorshError::InvalidArgument(
                "Selector must be fitted before transform".to_string(),
            ));
        }
        Ok(self
            .selected_features
            .iter()
            .filter_map(|&idx| data.get(idx).copied())
            .collect())
    }

    /// Indices of the retained features.
    pub fn get_support(&self) -> &[usize] {
        &self.selected_features
    }

    /// Per-feature variances computed by `fit`.
    pub fn variances(&self) -> &[f32] {
        &self.variances
    }
}
/// Keeps the `k` features with the strongest correlation-style scores
/// against the targets.
pub struct SelectKBest {
    k: usize,
    // Per-feature scores computed by `fit`.
    scores: Vec<f32>,
    // Indices of the top-k features, sorted ascending.
    selected_features: Vec<usize>,
    fitted: bool,
}

impl SelectKBest {
    /// Creates an unfitted selector keeping the top `k` features.
    pub fn new(k: usize) -> Self {
        Self {
            k,
            scores: Vec::new(),
            selected_features: Vec::new(),
            fitted: false,
        }
    }

    /// Scores every feature column against `targets` and records the `k`
    /// highest-scoring columns.
    ///
    /// # Errors
    /// Fails when `features` is empty or its length differs from `targets`.
    pub fn fit(&mut self, features: &[Vec<f32>], targets: &[f32]) -> Result<()> {
        if features.is_empty() || features.len() != targets.len() {
            return Err(TorshError::InvalidArgument(
                "Features and targets must have same length".to_string(),
            ));
        }
        let num_features = features[0].len();
        let mut scores = Vec::with_capacity(num_features);
        for col in 0..num_features {
            let column: Vec<f32> = features.iter().map(|row| row[col]).collect();
            scores.push(self.calculate_f_score(&column, targets));
        }
        self.scores = scores;
        // Rank features by descending score, then keep the top k in
        // ascending index order.
        let mut ranked: Vec<(usize, f32)> = self.scores.iter().copied().enumerate().collect();
        ranked.sort_by(|a, b| {
            b.1.partial_cmp(&a.1)
                .expect("comparison should succeed for finite values")
        });
        let mut selected: Vec<usize> = ranked
            .into_iter()
            .take(self.k)
            .map(|(idx, _)| idx)
            .collect();
        selected.sort();
        self.selected_features = selected;
        self.fitted = true;
        Ok(())
    }

    /// Keeps only the selected feature positions from `data`.
    ///
    /// # Errors
    /// Fails when the selector is unfitted.
    pub fn transform(&self, data: Vec<f32>) -> Result<Vec<f32>> {
        if !self.fitted {
            return Err(TorshError::InvalidArgument(
                "Selector must be fitted before transform".to_string(),
            ));
        }
        Ok(self
            .selected_features
            .iter()
            .filter_map(|&idx| data.get(idx).copied())
            .collect())
    }

    /// Absolute Pearson correlation between one feature column and the
    /// targets; 0.0 when either side has zero variance.
    fn calculate_f_score(&self, feature_values: &[f32], targets: &[f32]) -> f32 {
        let f_mean = feature_values.iter().sum::<f32>() / feature_values.len() as f32;
        let t_mean = targets.iter().sum::<f32>() / targets.len() as f32;
        let cov: f32 = feature_values
            .iter()
            .zip(targets)
            .map(|(&f, &t)| (f - f_mean) * (t - t_mean))
            .sum();
        let f_ss: f32 = feature_values.iter().map(|&f| (f - f_mean).powi(2)).sum();
        let t_ss: f32 = targets.iter().map(|&t| (t - t_mean).powi(2)).sum();
        if f_ss == 0.0 || t_ss == 0.0 {
            0.0
        } else {
            cov.abs() / (f_ss * t_ss).sqrt()
        }
    }

    /// Indices of the retained features.
    pub fn get_support(&self) -> &[usize] {
        &self.selected_features
    }

    /// Per-feature scores computed by `fit`.
    pub fn scores(&self) -> &[f32] {
        &self.scores
    }
}
}
/// Randomly splits `dataset` into `(train, test)` subsets, where the test
/// subset holds roughly `test_size` of the samples (rounded).
///
/// # Errors
/// Fails when `test_size` is outside `[0, 1)`, or when the underlying
/// `random_split` fails.
pub fn train_test_split<D: Dataset + Clone>(
    dataset: D,
    test_size: f32,
    random_state: Option<u64>,
) -> Result<(crate::dataset::Subset<D>, crate::dataset::Subset<D>)> {
    if !(0.0..1.0).contains(&test_size) {
        return Err(TorshError::InvalidArgument(
            "test_size must be between 0 and 1".to_string(),
        ));
    }
    let total = dataset.len();
    let test_len = (total as f32 * test_size).round() as usize;
    let train_len = total - test_len;
    let splits = crate::dataset::random_split(dataset, &[train_len, test_len], random_state)?;
    Ok((splits[0].clone(), splits[1].clone()))
}
#[cfg(test)]
mod tests {
    use super::*;
    use torsh_tensor::creation::*;

    #[test]
    fn test_array_dataset() {
        let x = rand::<f32>(&[100, 10]).unwrap();
        let y = rand::<f32>(&[100, 1]).unwrap();
        let ds = ArrayDataset::new(x, Some(y)).unwrap();
        assert_eq!(ds.len(), 100);
        assert_eq!(ds.num_features().unwrap(), 10);
        let (_features, target) = ds.get(0).unwrap();
        assert!(target.is_some());
    }

    #[test]
    fn test_preprocessing() {
        let data = vec![
            vec![1.0, 2.0, 3.0],
            vec![4.0, 5.0, 6.0],
            vec![7.0, 8.0, 9.0],
        ];

        let mut standard = preprocessing::StandardScaler::new();
        let standardized = standard.fit_transform(&data).unwrap();
        assert_eq!(standardized.len(), 3);
        assert_eq!(standardized[0].len(), 3);

        // The per-feature minima of `data` scale to exactly 0.
        let mut minmax = preprocessing::MinMaxScaler::new();
        minmax.fit(&data).unwrap();
        let scaled = minmax.transform(vec![1.0, 2.0, 3.0]).unwrap();
        assert_eq!(scaled, vec![0.0, 0.0, 0.0]);

        let labels = vec!["cat".to_string(), "dog".to_string(), "cat".to_string()];
        let mut encoder = preprocessing::LabelEncoder::new();
        encoder.fit(&labels).unwrap();
        assert_eq!(encoder.transform(&labels).unwrap(), vec![0, 1, 0]);
    }

    #[test]
    fn test_train_test_split() {
        let ds = crate::dataset::TensorDataset::from_tensor(ones::<f32>(&[100, 10]).unwrap());
        let (train, test) = train_test_split(ds, 0.2, Some(42)).unwrap();
        assert_eq!(train.len(), 80);
        assert_eq!(test.len(), 20);
    }

    #[test]
    fn test_one_hot_encoder() {
        let labels = vec![
            "cat".to_string(),
            "dog".to_string(),
            "cat".to_string(),
            "bird".to_string(),
        ];
        let mut encoder = preprocessing::OneHotEncoder::new();
        encoder.fit(&labels).unwrap();
        // Classes are sorted alphabetically.
        assert_eq!(encoder.classes(), &["bird", "cat", "dog"]);
        assert_eq!(encoder.n_features(), 3);
        let encoded = encoder
            .transform(&["cat".to_string(), "dog".to_string()])
            .unwrap();
        assert_eq!(encoded.len(), 2);
        assert_eq!(encoded[0], vec![0.0, 1.0, 0.0]);
        assert_eq!(encoded[1], vec![0.0, 0.0, 1.0]);
    }

    #[test]
    fn test_simple_imputer() {
        let data = vec![
            vec![Some(1.0), Some(2.0), None],
            vec![Some(4.0), None, Some(6.0)],
            vec![None, Some(8.0), Some(9.0)],
        ];

        let mut mean_imputer =
            preprocessing::SimpleImputer::new(preprocessing::ImputeStrategy::Mean);
        mean_imputer.fit(&data).unwrap();
        let filled = mean_imputer.transform(vec![None, None, None]).unwrap();
        assert_eq!(filled.len(), 3);
        assert!((filled[0] - 2.5).abs() < 1e-6);
        assert!((filled[1] - 5.0).abs() < 1e-6);
        assert!((filled[2] - 7.5).abs() < 1e-6);

        let mut const_imputer =
            preprocessing::SimpleImputer::new(preprocessing::ImputeStrategy::Constant(-1.0));
        const_imputer.fit(&data).unwrap();
        assert_eq!(
            const_imputer.transform(vec![None, None, None]).unwrap(),
            vec![-1.0, -1.0, -1.0]
        );
    }

    #[test]
    fn test_variance_threshold() {
        // Column 1 is constant and should be dropped.
        let data = vec![
            vec![1.0, 0.0, 3.0],
            vec![2.0, 0.0, 4.0],
            vec![3.0, 0.0, 5.0],
        ];
        let mut selector = preprocessing::VarianceThreshold::new(0.1);
        selector.fit(&data).unwrap();
        assert_eq!(selector.get_support(), &[0, 2]);
        let transformed = selector.transform(vec![10.0, 20.0, 30.0]).unwrap();
        assert_eq!(transformed, vec![10.0, 30.0]);
    }

    #[test]
    fn test_select_k_best() {
        let features = vec![
            vec![1.0, 10.0, 100.0],
            vec![2.0, 20.0, 200.0],
            vec![3.0, 30.0, 300.0],
        ];
        let targets = vec![1.0, 2.0, 3.0];
        let mut selector = preprocessing::SelectKBest::new(2);
        selector.fit(&features, &targets).unwrap();
        assert_eq!(selector.get_support().len(), 2);
        let transformed = selector.transform(vec![5.0, 50.0, 500.0]).unwrap();
        assert_eq!(transformed.len(), 2);
    }

    #[test]
    fn test_median_imputation() {
        let data = vec![
            vec![Some(1.0), Some(2.0)],
            vec![Some(2.0), Some(4.0)],
            vec![Some(3.0), Some(6.0)],
            vec![None, None],
        ];
        let mut imputer =
            preprocessing::SimpleImputer::new(preprocessing::ImputeStrategy::Median);
        imputer.fit(&data).unwrap();
        assert_eq!(imputer.transform(vec![None, None]).unwrap(), vec![2.0, 4.0]);
    }
}