use scirs2_core::ndarray::Array2;
use statrs::statistics::Statistics;
/// Standardizes every column of `data` in place.
///
/// Each column has its mean subtracted and is then divided by its standard
/// deviation computed with `ddof = 0.0`. A NaN mean is replaced by `0.0`
/// before centering. Columns whose standard deviation is not greater than
/// `1e-10` (for example constant columns) are left exactly as they were.
#[allow(dead_code)]
pub fn normalize(data: &mut Array2<f64>) {
    for col_idx in 0..data.ncols() {
        let mut feature = data.column_mut(col_idx);

        let raw_mean = feature.view().mean();
        let center = if raw_mean.is_nan() { 0.0 } else { raw_mean };

        let spread = feature.view().std(0.0);
        // A NaN spread must skip the column too, hence the explicit NaN test
        // next to the threshold comparison.
        if spread.is_nan() || spread <= 1e-10 {
            continue;
        }

        feature.mapv_inplace(|value| (value - center) / spread);
    }
}
/// Linearly rescales every column of `_data` in place into `featurerange`.
///
/// `featurerange` is `(lower, upper)`: the smallest value of a column is
/// mapped to `lower` and the largest to `upper`. A column whose maximum and
/// minimum differ by no more than `1e-10` carries no scale information and is
/// filled with the midpoint of the target range instead.
#[allow(dead_code)]
pub fn min_max_scale(_data: &mut Array2<f64>, featurerange: (f64, f64)) {
    let (target_lo, target_hi) = featurerange;
    let target_span = target_hi - target_lo;

    for col_idx in 0.._data.ncols() {
        let mut feature = _data.column_mut(col_idx);

        // Collect both extrema in a single pass over the column.
        let (lowest, highest) = feature.iter().fold(
            (f64::INFINITY, f64::NEG_INFINITY),
            |(lo, hi), &value| (lo.min(value), hi.max(value)),
        );
        let observed_span = highest - lowest;

        if observed_span.abs() > 1e-10 {
            feature
                .mapv_inplace(|value| (value - lowest) / observed_span * target_span + target_lo);
        } else {
            feature.fill(target_lo + target_span / 2.0);
        }
    }
}
/// Scales every column of `data` in place using robust statistics.
///
/// Each value has the column median subtracted and is then divided by the
/// spread between the sorted values at indices `3 * n / 4` and `n / 4`
/// (quartile estimates taken without interpolation). When that spread is not
/// greater than `1e-10` the column is only centered on its median.
///
/// NaN entries are treated as missing values: they are ignored while the
/// median and quartiles are estimated and remain NaN in the output. Columns
/// without any non-NaN value (including empty columns) are left untouched.
#[allow(dead_code)]
pub fn robust_scale(data: &mut Array2<f64>) {
    for j in 0..data.ncols() {
        // Dropping NaN up front keeps the comparison below total, so a single
        // missing value can no longer abort the whole scaling pass.
        let mut column_values: Vec<f64> = data
            .column(j)
            .iter()
            .copied()
            .filter(|value| !value.is_nan())
            .collect();
        let n = column_values.len();
        if n == 0 {
            continue;
        }
        column_values.sort_by(|a, b| {
            a.partial_cmp(b)
                .expect("NaN values were removed before sorting")
        });

        let median = if n.is_multiple_of(2) {
            (column_values[n / 2 - 1] + column_values[n / 2]) / 2.0
        } else {
            column_values[n / 2]
        };

        // Both indices are in bounds for every n >= 1 because 3 * n / 4 < n.
        let q1 = column_values[n / 4];
        let q3 = column_values[3 * n / 4];
        let iqr = q3 - q1;

        let mut column = data.column_mut(j);
        if iqr > 1e-10 {
            column.mapv_inplace(|x| (x - median) / iqr);
        } else {
            // Degenerate spread: centering is still meaningful, dividing is not.
            column.mapv_inplace(|x| x - median);
        }
    }
}
/// Basic descriptive statistics for one-dimensional `f64` data.
///
/// NOTE(review): this file also imports `statrs::statistics::Statistics`,
/// which has a by-value `mean` method. The tests in this file treat
/// `view.mean()` on an owned view as an `f64`, so method-call syntax appears
/// to resolve to the statrs method rather than this trait; calling
/// `StatsExt::mean(&view)` selects this trait explicitly. Confirm before
/// relying on either form.
pub trait StatsExt {
/// Returns the arithmetic mean as an `Option`; the implementation in this
/// file returns `None` for an empty view.
fn mean(&self) -> Option<f64>;
/// Returns the standard deviation for the given `ddof`; the implementation
/// in this file divides the sum of squared deviations by `len - ddof`.
fn standard_deviation(&self, ddof: f64) -> f64;
}
/// [`StatsExt`] for one-dimensional `f64` array views.
impl StatsExt for scirs2_core::ndarray::ArrayView1<'_, f64> {
    /// Arithmetic mean of the elements, or `None` when the view is empty.
    fn mean(&self) -> Option<f64> {
        match self.len() {
            0 => None,
            count => Some(self.sum() / count as f64),
        }
    }

    /// Standard deviation using `len - ddof` as the divisor.
    ///
    /// Returns `0.0` for an empty view and whenever `len - ddof` is not
    /// strictly positive.
    fn standard_deviation(&self, ddof: f64) -> f64 {
        use scirs2_core::ndarray::ArrayStatCompat;

        if self.is_empty() {
            return 0.0;
        }
        let divisor = self.len() as f64 - ddof;
        if divisor <= 0.0 {
            return 0.0;
        }

        let center = self.mean_or(0.0);
        // In-order accumulation of squared deviations from the mean.
        let squared_deviations = self.iter().fold(0.0_f64, |acc, &value| {
            let delta = value - center;
            acc + delta * delta
        });
        (squared_deviations / divisor).sqrt()
    }
}
/// Unit tests for the scaling helpers and the `StatsExt` implementation.
#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::{array, Array1};

    /// Difference between the largest and the smallest value yielded by `values`.
    fn value_range<'a>(values: impl IntoIterator<Item = &'a f64>) -> f64 {
        let (lowest, highest) = values.into_iter().fold(
            (f64::INFINITY, f64::NEG_INFINITY),
            |(lo, hi), &value| (lo.min(value), hi.max(value)),
        );
        highest - lowest
    }

    #[test]
    fn test_normalize() {
        let mut data = Array2::from_shape_vec((3, 2), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
            .expect("Operation failed");
        normalize(&mut data);
        for j in 0..data.ncols() {
            let column = data.column(j);
            let std = column.std(0.0);
            let mean = column.mean();
            assert!(mean.abs() < 1e-10);
            // Standardized columns must also have unit spread, not just zero mean.
            assert!((std - 1.0).abs() < 1e-10);
        }
    }

    #[test]
    fn test_normalize_constant_values() {
        let mut data =
            Array2::from_shape_vec((3, 1), vec![5.0, 5.0, 5.0]).expect("Operation failed");
        normalize(&mut data);
        // A zero-variance column is left untouched.
        for i in 0..data.nrows() {
            assert_eq!(data[[i, 0]], 5.0);
        }
    }

    #[test]
    fn test_min_max_scale() {
        let mut data = Array2::from_shape_vec((3, 2), vec![1.0, 10.0, 2.0, 20.0, 3.0, 30.0])
            .expect("Operation failed");
        min_max_scale(&mut data, (0.0, 1.0));
        for i in 0..data.nrows() {
            for j in 0..data.ncols() {
                let value = data[[i, j]];
                assert!((0.0..=1.0).contains(&value));
            }
        }
        assert!((data[[0, 0]] - 0.0).abs() < 1e-10);
        assert!((data[[1, 0]] - 0.5).abs() < 1e-10);
        assert!((data[[2, 0]] - 1.0).abs() < 1e-10);
    }

    #[test]
    fn test_min_max_scale_custom_range() {
        let mut data =
            Array2::from_shape_vec((3, 1), vec![1.0, 2.0, 3.0]).expect("Operation failed");
        min_max_scale(&mut data, (-1.0, 1.0));
        assert!((data[[0, 0]] - (-1.0)).abs() < 1e-10);
        assert!((data[[1, 0]] - 0.0).abs() < 1e-10);
        assert!((data[[2, 0]] - 1.0).abs() < 1e-10);
    }

    #[test]
    fn test_min_max_scale_constant_values() {
        let mut data =
            Array2::from_shape_vec((3, 1), vec![5.0, 5.0, 5.0]).expect("Operation failed");
        min_max_scale(&mut data, (0.0, 1.0));
        // A constant column is mapped to the midpoint of the target range.
        for i in 0..data.nrows() {
            assert!((data[[i, 0]] - 0.5).abs() < 1e-10);
        }
    }

    #[test]
    fn test_robust_scale() {
        let mut data = Array2::from_shape_vec(
            (5, 2),
            vec![1.0, 10.0, 2.0, 20.0, 3.0, 30.0, 4.0, 40.0, 100.0, 500.0],
        )
        .expect("Failed to get slice");
        robust_scale(&mut data);
        let col1_values: Vec<f64> = data.column(0).to_vec();
        let col2_values: Vec<f64> = data.column(1).to_vec();
        let col1_range = value_range(&col1_values);
        let col2_range = value_range(&col2_values);
        assert!(col1_range.is_finite());
        assert!(col2_range.is_finite());
        assert!(col1_range > 0.0);
        assert!(col2_range > 0.0);
    }

    #[test]
    fn test_robust_scale_constant_values() {
        let mut data =
            Array2::from_shape_vec((3, 1), vec![5.0, 5.0, 5.0]).expect("Operation failed");
        robust_scale(&mut data);
        // With no spread the column is only centered on its median.
        for i in 0..data.nrows() {
            assert!((data[[i, 0]] - 0.0).abs() < 1e-10);
        }
    }

    #[test]
    fn test_robust_vs_standard_scaling() {
        let mut data_robust = Array2::from_shape_vec((5, 1), vec![1.0, 2.0, 3.0, 4.0, 100.0])
            .expect("Failed to get slice");
        let mut data_standard = data_robust.clone();
        robust_scale(&mut data_robust);
        normalize(&mut data_standard);
        let robust_range = value_range(data_robust.iter());
        let standard_range = value_range(data_standard.iter());
        assert!(robust_range.is_finite());
        assert!(standard_range.is_finite());
        assert!(robust_range > 0.0);
        assert!(standard_range > 0.0);
        assert!(data_robust[[0, 0]] != 1.0);
        assert!(data_standard[[0, 0]] != 1.0);
    }

    #[test]
    fn test_stats_ext_trait() {
        let data: Array1<f64> = array![1.0, 2.0, 3.0, 4.0, 5.0];
        let view = data.view();

        // Fully qualified on purpose: with `Statistics` in scope, `view.mean()`
        // picks the statrs by-value method and would bypass `StatsExt`.
        let mean = StatsExt::mean(&view).expect("a non-empty view has a mean");
        assert!((mean - 3.0_f64).abs() < 1e-10);

        let std = view.standard_deviation(0.0);
        let expected_std = (10.0_f64 / 5.0).sqrt();
        assert!((std - expected_std).abs() < 1e-10);
        // The trait implementation must agree with ndarray's own `std`.
        assert!((view.std(0.0) - std).abs() < 1e-10);

        let std_sample = view.standard_deviation(1.0);
        let expected_std_sample = (10.0_f64 / 4.0).sqrt();
        assert!((std_sample - expected_std_sample).abs() < 1e-10);
    }

    #[test]
    fn test_stats_ext_empty_array() {
        let data: Array1<f64> = array![];
        let view = data.view();
        assert!(view.mean().is_nan());
        assert_eq!(StatsExt::mean(&view), None);
        assert_eq!(view.standard_deviation(0.0), 0.0);
    }

    #[test]
    fn test_scaling_pipeline() {
        let mut data1 =
            Array2::from_shape_vec((4, 2), vec![1.0, 10.0, 2.0, 20.0, 3.0, 30.0, 4.0, 40.0])
                .expect("Failed to get slice");
        let mut data2 = data1.clone();
        let mut data3 = data1.clone();
        normalize(&mut data1);
        min_max_scale(&mut data2, (0.0, 1.0));
        robust_scale(&mut data3);
        assert!(data1.iter().all(|&x| x.is_finite()));
        assert!(data2.iter().all(|&x| x.is_finite()));
        assert!(data3.iter().all(|&x| x.is_finite()));
        assert!(data2.iter().all(|&x| (0.0..=1.0).contains(&x)));
        assert_eq!(data1.shape(), data2.shape());
        assert_eq!(data2.shape(), data3.shape());
    }
}