use crate::error::{ClusterError, ClusterResult};
use torsh_tensor::Tensor;
/// Feature preprocessing strategies supported by this module.
///
/// Each variant corresponds to one of the free functions in this file
/// that implements the actual transformation.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum PreprocessingMethod {
/// Zero mean, unit variance per feature column (`standardize_features`).
StandardScaler,
/// Min-max scaling of each feature column (`normalize_features`).
MinMaxScaler,
/// Per-sample (row) L2 unit-norm scaling (`unit_normalize`).
Normalizer,
/// Median / interquartile-range scaling per feature column (`robust_scale`).
RobustScaler,
}
/// Standardize each feature column to zero mean and unit variance
/// (z-score with population variance, i.e. divisor `n_samples`).
///
/// Expects a 2-D tensor of shape `[n_samples, n_features]` stored
/// row-major. Statistics are accumulated in `f64` for numerical
/// stability and cast back to `f32`. Constant (zero-variance) columns
/// are mapped to all zeros instead of dividing by zero.
///
/// # Errors
/// Returns `ClusterError::TensorError` if the tensor data cannot be
/// read or the result tensor cannot be constructed.
pub fn standardize_features(data: &Tensor) -> ClusterResult<Tensor> {
    let n_samples = data.shape().dims()[0];
    let n_features = data.shape().dims()[1];
    let data_vec = data.to_vec().map_err(ClusterError::TensorError)?;
    // Write results directly at their row-major positions; this avoids
    // the intermediate column-major buffer and a second reorder pass.
    let mut standardized = vec![0.0f32; n_samples * n_features];
    for j in 0..n_features {
        let column: Vec<f64> = (0..n_samples)
            .map(|i| data_vec[i * n_features + j] as f64)
            .collect();
        let mean = column.iter().sum::<f64>() / n_samples as f64;
        let variance =
            column.iter().map(|&x| (x - mean).powi(2)).sum::<f64>() / n_samples as f64;
        let std = variance.sqrt();
        for (i, &x) in column.iter().enumerate() {
            // Guard against zero std (constant column): map to 0.0.
            standardized[i * n_features + j] = if std > 0.0 {
                ((x - mean) / std) as f32
            } else {
                0.0
            };
        }
    }
    Tensor::from_vec(standardized, &[n_samples, n_features]).map_err(ClusterError::TensorError)
}
/// Scale each feature column into `[0, 1]` via min-max normalization.
///
/// Expects a 2-D tensor of shape `[n_samples, n_features]` stored
/// row-major. Arithmetic runs in `f64` and is cast back to `f32`.
/// Constant columns (zero range) are mapped to all zeros instead of
/// dividing by zero.
///
/// # Errors
/// Returns `ClusterError::TensorError` if the tensor data cannot be
/// read or the result tensor cannot be constructed.
pub fn normalize_features(data: &Tensor) -> ClusterResult<Tensor> {
    let n_samples = data.shape().dims()[0];
    let n_features = data.shape().dims()[1];
    let data_vec = data.to_vec().map_err(ClusterError::TensorError)?;
    // Write results directly at their row-major positions; this avoids
    // the intermediate column-major buffer and a second reorder pass.
    let mut normalized = vec![0.0f32; n_samples * n_features];
    for j in 0..n_features {
        let column: Vec<f64> = (0..n_samples)
            .map(|i| data_vec[i * n_features + j] as f64)
            .collect();
        let min_val = column.iter().fold(f64::INFINITY, |a, &b| a.min(b));
        let max_val = column.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
        let range = max_val - min_val;
        for (i, &x) in column.iter().enumerate() {
            // Guard against zero range (constant column): map to 0.0.
            normalized[i * n_features + j] = if range > 0.0 {
                ((x - min_val) / range) as f32
            } else {
                0.0
            };
        }
    }
    Tensor::from_vec(normalized, &[n_samples, n_features]).map_err(ClusterError::TensorError)
}
/// Scale each sample (row) to unit Euclidean (L2) norm.
///
/// Expects a 2-D tensor of shape `[n_samples, n_features]` stored
/// row-major. The norm is accumulated in `f64` and the result is cast
/// back to `f32`. All-zero rows have no direction and are left as zeros.
///
/// # Errors
/// Returns `ClusterError::TensorError` if the tensor data cannot be
/// read or the result tensor cannot be constructed.
pub fn unit_normalize(data: &Tensor) -> ClusterResult<Tensor> {
    let n_samples = data.shape().dims()[0];
    let n_features = data.shape().dims()[1];
    let data_vec = data.to_vec().map_err(ClusterError::TensorError)?;
    let mut out = Vec::with_capacity(data_vec.len());
    for row in 0..n_samples {
        let start = row * n_features;
        let values = &data_vec[start..start + n_features];
        // L2 norm of this row, computed in f64.
        let norm = values
            .iter()
            .map(|&v| {
                let v = v as f64;
                v * v
            })
            .sum::<f64>()
            .sqrt();
        if norm > 0.0 {
            out.extend(values.iter().map(|&v| (v as f64 / norm) as f32));
        } else {
            // A zero row cannot be normalized; keep it at zero.
            out.extend(std::iter::repeat(0.0f32).take(n_features));
        }
    }
    Tensor::from_vec(out, &[n_samples, n_features]).map_err(ClusterError::TensorError)
}
/// Scale each feature column by its median and interquartile range
/// (IQR = Q3 - Q1), which is robust to outliers.
///
/// Expects a 2-D tensor of shape `[n_samples, n_features]` stored
/// row-major. Quartiles are taken at the raw sorted indices
/// `n_samples / 4` and `3 * n_samples / 4` (no interpolation).
/// Columns with zero IQR are mapped to all zeros instead of dividing
/// by zero.
///
/// # Errors
/// Returns `ClusterError::TensorError` if the tensor data cannot be
/// read or the result tensor cannot be constructed.
pub fn robust_scale(data: &Tensor) -> ClusterResult<Tensor> {
    let n_samples = data.shape().dims()[0];
    let n_features = data.shape().dims()[1];
    let data_vec = data.to_vec().map_err(ClusterError::TensorError)?;
    // Guard the empty case up front: with n_samples == 0 the median
    // index arithmetic below (`n_samples / 2 - 1`) underflows usize.
    if n_samples == 0 {
        return Tensor::from_vec(Vec::new(), &[0, n_features]).map_err(ClusterError::TensorError);
    }
    // Write results directly at their row-major positions; this avoids
    // the intermediate column-major buffer and a second reorder pass.
    let mut scaled = vec![0.0f32; n_samples * n_features];
    for j in 0..n_features {
        let mut sorted: Vec<f64> = (0..n_samples)
            .map(|i| data_vec[i * n_features + j] as f64)
            .collect();
        // Unstable sort is fine: ties are equal values, so the quartile
        // reads below are unaffected. NaNs compare as Equal.
        sorted.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
        let median = if n_samples % 2 == 0 {
            (sorted[n_samples / 2 - 1] + sorted[n_samples / 2]) / 2.0
        } else {
            sorted[n_samples / 2]
        };
        let iqr = sorted[3 * n_samples / 4] - sorted[n_samples / 4];
        for i in 0..n_samples {
            let x = data_vec[i * n_features + j] as f64;
            // Guard against zero IQR (near-constant column): map to 0.0.
            scaled[i * n_features + j] = if iqr > 0.0 {
                ((x - median) / iqr) as f32
            } else {
                0.0
            };
        }
    }
    Tensor::from_vec(scaled, &[n_samples, n_features]).map_err(ClusterError::TensorError)
}