Skip to main content

matten_mlprep/
scale.rs

1//! Per-column scaling (RFC-028 §4.1, §4.2).
2//!
3//! Both functions follow the `rows = samples`, `columns = features` convention
4//! and reject constant columns explicitly via [`MattenMlprepError::ZeroVariance`]
5//! rather than silently emitting a zero column. Per-column statistics reuse core
6//! `matten` axis reductions (RFC-019), so `NaN`/`Inf` propagate exactly as in
7//! core (`mean_axis` / `min_axis` / `max_axis`).
8
9use crate::error::MattenMlprepError;
10use crate::util::matrix_dims;
11use matten::Tensor;
12
/// Reads element `(i, j)` from `data`, treating it as a row-major buffer whose
/// rows are `cols` elements wide.
///
/// Panics (via slice indexing) if the computed offset is out of bounds.
#[inline]
fn at(data: &[f64], i: usize, j: usize, cols: usize) -> f64 {
    // Row-major layout: skip `i` full rows, then step `j` entries into the row.
    let offset = i * cols + j;
    data[offset]
}
17
18/// Standardizes each column to zero mean and unit (population) standard
19/// deviation: `out[i,j] = (x[i,j] - mean_j) / std_j`.
20///
21/// `std_j` uses the population formula (divide by `n`), matching scikit-learn's
22/// `StandardScaler`.
23///
24/// # Errors
25///
26/// - [`MattenMlprepError::ExpectedMatrix`] if `x` is not rank-2.
27/// - [`MattenMlprepError::ZeroVariance`] if any column is constant.
28/// - [`MattenMlprepError::DynamicTensor`] (with the `dynamic` feature) if `x` is dynamic.
29///
30/// ```
31/// use matten::Tensor;
32/// use matten_mlprep::standardize_columns;
33///
34/// // Column 0: [1, 3] -> mean 2, std 1 -> [-1, 1]; column 1: [10, 20] -> [-1, 1].
35/// let x = Tensor::new(vec![1.0, 10.0, 3.0, 20.0], &[2, 2]);
36/// let z = standardize_columns(&x).unwrap();
37/// assert_eq!(z.as_slice(), &[-1.0, -1.0, 1.0, 1.0]);
38/// ```
39pub fn standardize_columns(x: &Tensor) -> Result<Tensor, MattenMlprepError> {
40    let (rows, cols) = matrix_dims(x)?;
41    let data = x.as_slice();
42
43    // Per-column means via core axis reduction (NaN propagates as in core).
44    let means = x.mean_axis(0);
45    let means = means.as_slice();
46
47    let mut out = vec![0.0f64; rows * cols];
48    for j in 0..cols {
49        let mean = means[j];
50        let var = (0..rows)
51            .map(|i| {
52                let d = at(data, i, j, cols) - mean;
53                d * d
54            })
55            .sum::<f64>()
56            / rows as f64;
57        let std = var.sqrt();
58        if std == 0.0 {
59            return Err(MattenMlprepError::ZeroVariance { column: j });
60        }
61        for i in 0..rows {
62            out[i * cols + j] = (at(data, i, j, cols) - mean) / std;
63        }
64    }
65
66    Tensor::try_new(out, &[rows, cols]).map_err(MattenMlprepError::Matten)
67}
68
69/// Scales each column to the `[0, 1]` range:
70/// `out[i,j] = (x[i,j] - min_j) / (max_j - min_j)`.
71///
72/// # Errors
73///
74/// - [`MattenMlprepError::ExpectedMatrix`] if `x` is not rank-2.
75/// - [`MattenMlprepError::ZeroVariance`] if any column is constant (zero range).
76/// - [`MattenMlprepError::DynamicTensor`] (with the `dynamic` feature) if `x` is dynamic.
77///
78/// ```
79/// use matten::Tensor;
80/// use matten_mlprep::minmax_scale_columns;
81///
82/// // Column 0: [0, 5, 10] -> [0, 0.5, 1].
83/// let x = Tensor::new(vec![0.0, 5.0, 10.0], &[3, 1]);
84/// let s = minmax_scale_columns(&x).unwrap();
85/// assert_eq!(s.as_slice(), &[0.0, 0.5, 1.0]);
86/// ```
87pub fn minmax_scale_columns(x: &Tensor) -> Result<Tensor, MattenMlprepError> {
88    let (rows, cols) = matrix_dims(x)?;
89    let data = x.as_slice();
90
91    // Per-column min/max via core axis reductions (NaN propagates as in core).
92    let mins = x.min_axis(0);
93    let maxs = x.max_axis(0);
94    let mins = mins.as_slice();
95    let maxs = maxs.as_slice();
96
97    let mut out = vec![0.0f64; rows * cols];
98    for j in 0..cols {
99        let range = maxs[j] - mins[j];
100        if range == 0.0 {
101            return Err(MattenMlprepError::ZeroVariance { column: j });
102        }
103        for i in 0..rows {
104            out[i * cols + j] = (at(data, i, j, cols) - mins[j]) / range;
105        }
106    }
107
108    Tensor::try_new(out, &[rows, cols]).map_err(MattenMlprepError::Matten)
109}