use scirs2_core::ndarray::{Array1, Array2, ArrayView2, Axis};
use scirs2_core::numeric::{Float, FromPrimitive};
use std::fmt::Debug;
use crate::error::{ClusteringError, Result};
/// Rescales every feature (column) of `data` by its sample standard deviation.
///
/// The per-column scale comes from `standard_deviation` along axis 0. A column
/// whose scale is at or below `F::epsilon()` is copied through unchanged
/// instead of being divided.
///
/// # Arguments
///
/// * `data` - Input matrix laid out as `(n_samples, n_features)`.
/// * `check_finite` - When `true`, reject input containing NaN or infinite values.
///
/// # Errors
///
/// Returns `ClusteringError::InvalidInput` when either dimension of `data` is
/// zero, or when `check_finite` is set and a non-finite element is present.
/// Errors reported by `standard_deviation` are propagated unchanged.
#[allow(dead_code)]
pub fn whiten<F: Float + FromPrimitive + Debug>(
    data: ArrayView2<F>,
    check_finite: bool,
) -> Result<Array2<F>> {
    let (rows, cols) = data.dim();
    if rows == 0 || cols == 0 {
        return Err(ClusteringError::InvalidInput("Input data is empty".into()));
    }

    if check_finite && data.iter().any(|value| !value.is_finite()) {
        return Err(ClusteringError::InvalidInput(
            "Input data contains NaN or infinite values".into(),
        ));
    }

    let scales = standard_deviation(data, Axis(0))?;

    // Build the output element by element; the column index selects the scale.
    let whitened = Array2::from_shape_fn((rows, cols), |(row, col)| {
        let value = data[[row, col]];
        let scale = scales[col];
        if scale <= F::epsilon() {
            value
        } else {
            value / scale
        }
    });

    Ok(whitened)
}
/// Centers every feature (column) of `data` on zero and scales it by its
/// sample standard deviation.
///
/// Each element becomes `(x - mean) / std`, with the mean and standard
/// deviation taken per column (axis 0). A column whose standard deviation is
/// at or below `F::epsilon()` is only centered, not divided.
///
/// # Arguments
///
/// * `data` - Input matrix laid out as `(n_samples, n_features)`.
/// * `check_finite` - When `true`, reject input containing NaN or infinite values.
///
/// # Errors
///
/// Returns `ClusteringError::InvalidInput` when either dimension of `data` is
/// zero, or when `check_finite` is set and a non-finite element is present.
/// Errors reported by `standard_deviation` are propagated unchanged.
#[allow(dead_code)]
pub fn standardize<F: Float + FromPrimitive + Debug>(
    data: ArrayView2<F>,
    check_finite: bool,
) -> Result<Array2<F>> {
    let (rows, cols) = data.dim();
    if rows == 0 || cols == 0 {
        return Err(ClusteringError::InvalidInput("Input data is empty".into()));
    }

    if check_finite && data.iter().any(|value| !value.is_finite()) {
        return Err(ClusteringError::InvalidInput(
            "Input data contains NaN or infinite values".into(),
        ));
    }

    // The emptiness guard above ensures axis 0 has at least one element.
    let centers = data.mean_axis(Axis(0)).expect("Operation failed");
    let scales = standard_deviation(data, Axis(0))?;

    let standardized = Array2::from_shape_fn((rows, cols), |(row, col)| {
        let centered = data[[row, col]] - centers[col];
        let scale = scales[col];
        if scale <= F::epsilon() {
            centered
        } else {
            centered / scale
        }
    });

    Ok(standardized)
}
#[allow(dead_code)]
pub fn normalize<F: Float + FromPrimitive + Debug>(
data: ArrayView2<F>,
norm: NormType,
check_finite: bool,
) -> Result<Array2<F>> {
let n_samples = data.shape()[0];
let n_features = data.shape()[1];
if n_samples == 0 || n_features == 0 {
return Err(ClusteringError::InvalidInput("Input data is empty".into()));
}
if check_finite {
for element in data.iter() {
if !element.is_finite() {
return Err(ClusteringError::InvalidInput(
"Input data contains NaN or infinite values".into(),
));
}
}
}
let norms = match norm {
NormType::L1 => {
let mut norms = Array1::zeros(n_samples);
for i in 0..n_samples {
let row = data.row(i);
let row_norm = row.iter().fold(F::zero(), |acc, &x| acc + x.abs());
norms[i] = row_norm;
}
norms
}
NormType::L2 => {
let mut norms = Array1::zeros(n_samples);
for i in 0..n_samples {
let row = data.row(i);
let row_norm = row.iter().fold(F::zero(), |acc, &x| acc + x * x).sqrt();
norms[i] = row_norm;
}
norms
}
NormType::Max => {
let mut norms = Array1::zeros(n_samples);
for i in 0..n_samples {
let row = data.row(i);
let row_norm = row.iter().fold(F::zero(), |acc, &x| acc.max(x.abs()));
norms[i] = row_norm;
}
norms
}
};
let mut result = Array2::zeros(data.dim());
for i in 0..n_samples {
let norm_i = norms[i];
if norm_i <= F::epsilon() {
for j in 0..n_features {
result[[i, j]] = data[[i, j]];
}
} else {
for j in 0..n_features {
result[[i, j]] = data[[i, j]] / norm_i;
}
}
}
Ok(result)
}
#[allow(dead_code)]
pub fn min_max_scale<F: Float + FromPrimitive + Debug>(
data: ArrayView2<F>,
feature_range: (f64, f64),
check_finite: bool,
) -> Result<Array2<F>> {
let n_samples = data.shape()[0];
let n_features = data.shape()[1];
if n_samples == 0 || n_features == 0 {
return Err(ClusteringError::InvalidInput("Input data is empty".into()));
}
if check_finite {
for element in data.iter() {
if !element.is_finite() {
return Err(ClusteringError::InvalidInput(
"Input data contains NaN or infinite values".into(),
));
}
}
}
let (min_val, max_val) = feature_range;
if min_val >= max_val {
return Err(ClusteringError::InvalidInput(
"Feature range minimum must be less than maximum".into(),
));
}
let feature_min = F::from_f64(min_val).expect("Operation failed");
let feature_max = F::from_f64(max_val).expect("Operation failed");
let mut min_values = Array1::zeros(n_features);
let mut max_values = Array1::zeros(n_features);
for j in 0..n_features {
let column = data.column(j);
let (min_j, max_j) = column.iter().fold(
(F::infinity(), F::neg_infinity()),
|(min_val, max_val), &x| (min_val.min(x), max_val.max(x)),
);
min_values[j] = min_j;
max_values[j] = max_j;
}
let mut result = Array2::zeros(data.dim());
for j in 0..n_features {
let min_j = min_values[j];
let max_j = max_values[j];
let range_j = max_j - min_j;
if range_j <= F::epsilon() {
let middle = (feature_min + feature_max) / F::from_f64(2.0).expect("Operation failed");
for i in 0..n_samples {
result[[i, j]] = middle;
}
} else {
for i in 0..n_samples {
let scaled = (data[[i, j]] - min_j) / range_j;
result[[i, j]] = scaled * (feature_max - feature_min) + feature_min;
}
}
}
Ok(result)
}
/// Selects which per-row norm `normalize` divides each sample by.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum NormType {
    /// Sum of the absolute values of the row's elements.
    L1,
    /// Square root of the sum of the squared elements of the row.
    L2,
    /// Largest absolute value among the row's elements.
    Max,
}
/// Computes the sample standard deviation of `data` along `axis`.
///
/// The variance uses the `n - 1` divisor, where `n` is the length of `axis`;
/// when `n` is not greater than one the variance is taken to be zero. Any
/// resulting standard deviation at or below `F::epsilon()` is replaced by one,
/// so callers can divide by the result without hitting zero.
///
/// # Arguments
///
/// * `data` - Two-dimensional input.
/// * `axis` - Axis to reduce over: `Axis(0)` yields one value per column,
///   `Axis(1)` yields one value per row.
///
/// # Errors
///
/// Returns `ClusteringError::InvalidInput` when `axis` is neither `Axis(0)`
/// nor `Axis(1)`, when `data` has length zero along `axis`, or when that
/// length cannot be represented in `F`.
#[allow(dead_code)]
fn standard_deviation<F: Float + FromPrimitive + Debug>(
    data: ArrayView2<F>,
    axis: Axis,
) -> Result<Array1<F>> {
    // Validate the axis before touching the array: `mean_axis` panics on an
    // out-of-range axis, which would make the error below unreachable.
    // `lane_axis` is the axis that is kept; iterating over it yields the 1-D
    // lanes that run along `axis`.
    let lane_axis = match axis {
        Axis(0) => Axis(1),
        Axis(1) => Axis(0),
        _ => return Err(ClusteringError::InvalidInput("Invalid axis".into())),
    };

    // `mean_axis` returns `None` when the reduced axis has length zero.
    let mean = data
        .mean_axis(axis)
        .ok_or_else(|| ClusteringError::InvalidInput("Input data is empty".into()))?;

    let n = F::from_usize(data.len_of(axis)).ok_or_else(|| {
        ClusteringError::InvalidInput(
            "Axis length is not representable in the floating-point type".into(),
        )
    })?;
    let has_spread = n > F::one();
    let divisor = n - F::one();

    let std_dev: Array1<F> = data
        .axis_iter(lane_axis)
        .zip(mean.iter())
        .map(|(lane, &center)| {
            let variance = if has_spread {
                let sum_squared_diff = lane.iter().fold(F::zero(), |acc, &x| {
                    let diff = x - center;
                    acc + diff * diff
                });
                sum_squared_diff / divisor
            } else {
                F::zero()
            };

            // Guard against division by (near) zero in callers.
            let deviation = variance.sqrt();
            if deviation <= F::epsilon() {
                F::one()
            } else {
                deviation
            }
        })
        .collect();

    Ok(std_dev)
}
#[cfg(test)]
mod tests {
    use super::*;
    use approx::assert_abs_diff_eq;
    use scirs2_core::ndarray::Array2;

    /// Builds the 3x3 fixture shared by every test in this module.
    fn sample_matrix() -> Array2<f64> {
        Array2::from_shape_vec((3, 3), vec![1.9, 2.3, 1.7, 1.5, 2.5, 2.2, 0.8, 0.6, 1.7])
            .expect("Operation failed")
    }

    /// After whitening, every column must have unit standard deviation.
    #[test]
    fn test_whiten() {
        let data = sample_matrix();
        let whitened = whiten(data.view(), true).expect("Operation failed");

        let spread = standard_deviation(whitened.view(), Axis(0)).expect("Operation failed");
        for &column_std in spread.iter() {
            assert_abs_diff_eq!(column_std, 1.0, epsilon = 1e-10);
        }
    }

    /// After standardizing, every column must have zero mean and unit
    /// standard deviation.
    #[test]
    fn test_standardize() {
        let data = sample_matrix();
        let standardized = standardize(data.view(), true).expect("Operation failed");

        let centers = standardized.mean_axis(Axis(0)).expect("Operation failed");
        for &column_mean in centers.iter() {
            assert_abs_diff_eq!(column_mean, 0.0, epsilon = 1e-10);
        }

        let spread = standard_deviation(standardized.view(), Axis(0)).expect("Operation failed");
        for &column_std in spread.iter() {
            assert_abs_diff_eq!(column_std, 1.0, epsilon = 1e-10);
        }
    }

    /// After L2 normalization, every row must have unit Euclidean length.
    #[test]
    fn test_normalize_l2() {
        let data = sample_matrix();
        let normalized = normalize(data.view(), NormType::L2, true).expect("Operation failed");

        for row in normalized.outer_iter() {
            let squared: f64 = row.iter().map(|&x| x * x).fold(0.0, |acc, term| acc + term);
            assert_abs_diff_eq!(squared.sqrt(), 1.0, epsilon = 1e-10);
        }
    }

    /// After scaling into [0, 1], every value lies in the interval and every
    /// non-constant column spans it exactly.
    #[test]
    fn test_min_max_scale() {
        let data = sample_matrix();
        let scaled = min_max_scale(data.view(), (0.0, 1.0), true).expect("Operation failed");

        for &value in scaled.iter() {
            assert!(value >= 0.0 && value <= 1.0);
        }

        for (scaled_column, raw_column) in scaled.axis_iter(Axis(1)).zip(data.axis_iter(Axis(1))) {
            let lowest = scaled_column
                .iter()
                .fold(f64::INFINITY, |acc, &x| acc.min(x));
            let highest = scaled_column
                .iter()
                .fold(f64::NEG_INFINITY, |acc, &x| acc.max(x));

            // Only columns with at least two distinct input values must span
            // the full target range.
            let first = raw_column[0];
            if raw_column.iter().any(|&x| x != first) {
                assert_abs_diff_eq!(lowest, 0.0, epsilon = 1e-10);
                assert_abs_diff_eq!(highest, 1.0, epsilon = 1e-10);
            }
        }
    }
}