// sklears_python/preprocessing/standard_scaler.rs

//! Python bindings for StandardScaler
//!
//! This module provides Python bindings for StandardScaler,
//! offering scikit-learn compatible standardization (z-score normalization).

use super::common::*;
use scirs2_core::ndarray::{Array1, Array2, Axis};
8
9/// StandardScaler state after fitting
10#[derive(Debug, Clone)]
11struct StandardScalerState {
12    mean: Array1<f64>,
13    scale: Array1<f64>,
14    var: Array1<f64>,
15    n_features: usize,
16    n_samples_seen: usize,
17}
18
/// Standardize features by removing the mean and scaling to unit variance.
///
/// The standard score of a sample `x` is calculated as:
///
///     z = (x - u) / s
///
/// where `u` is the mean of the training samples or zero if `with_mean=False`,
/// and `s` is the standard deviation of the training samples or one if
/// `with_std=False`.
///
/// Centering and scaling happen independently on each feature by computing
/// the relevant statistics on the samples in the training set. Mean and
/// standard deviation are then stored to be used on later data using
/// :meth:`transform`.
///
/// Standardization of a dataset is a common requirement for many
/// machine learning estimators: they might behave badly if the
/// individual features do not more or less look like standard normally
/// distributed data (e.g. Gaussian with 0 mean and unit variance).
///
/// Parameters
/// ----------
/// copy : bool, default=True
///     If False, try to avoid a copy and do inplace scaling instead.
///     NOTE(review): this binding stores the flag but every transform
///     currently returns a new array — confirm before relying on
///     inplace behavior.
///
/// with_mean : bool, default=True
///     If True, center the data before scaling.
///
/// with_std : bool, default=True
///     If True, scale the data to unit variance (or equivalently,
///     unit standard deviation).
///
/// Attributes
/// ----------
/// scale_ : ndarray of shape (n_features,)
///     Per feature relative scaling of the data to achieve zero mean and unit
///     variance. Generally this is calculated using `np.sqrt(var_)`. If a
///     variance is zero, we can't achieve unit variance, and the data is left
///     as-is, giving a scaling factor of 1. Unlike scikit-learn (which uses
///     ``None``), this binding returns an array of ones when
///     ``with_std=False``.
///
/// mean_ : ndarray of shape (n_features,)
///     The mean value for each feature in the training set.
///     Unlike scikit-learn (which uses ``None``), this binding returns an
///     array of zeros when ``with_mean=False``.
///
/// var_ : ndarray of shape (n_features,)
///     The variance for each feature in the training set. Used to compute
///     `scale_`. Unlike scikit-learn (which uses ``None``), this binding
///     returns an array of ones when ``with_std=False``.
///
/// n_features_in_ : int
///     Number of features seen during :term:`fit`.
///
/// n_samples_seen_ : int
///     The number of samples processed by the estimator. Reset on each call
///     to ``fit`` (``partial_fit`` is not implemented by this binding).
///
/// Examples
/// --------
/// >>> from sklears_python import StandardScaler
/// >>> import numpy as np
/// >>> data = [[0, 0], [0, 0], [1, 1], [1, 1]]
/// >>> scaler = StandardScaler()
/// >>> scaler.fit(data)
/// >>> print(scaler.mean_)
/// [0.5 0.5]
/// >>> print(scaler.transform(data))
/// [[-1. -1.]
///  [-1. -1.]
///  [ 1.  1.]
///  [ 1.  1.]]
/// >>> print(scaler.transform([[2, 2]]))
/// [[3. 3.]]
#[pyclass(name = "StandardScaler")]
pub struct PyStandardScaler {
    // Constructor flags, scikit-learn compatible.
    copy: bool,
    with_mean: bool,
    with_std: bool,
    // Fitted statistics; `None` until `fit()` succeeds.
    state: Option<StandardScalerState>,
}
107
108#[pymethods]
109impl PyStandardScaler {
110    #[new]
111    #[pyo3(signature = (copy=true, with_mean=true, with_std=true))]
112    fn new(copy: bool, with_mean: bool, with_std: bool) -> Self {
113        Self {
114            copy,
115            with_mean,
116            with_std,
117            state: None,
118        }
119    }
120
121    /// Compute the mean and std to be used for later scaling.
122    ///
123    /// Parameters
124    /// ----------
125    /// X : {array-like, sparse matrix} of shape (n_samples, n_features)
126    ///     The data used to compute the mean and standard deviation
127    ///     used for later scaling along the features axis.
128    ///
129    /// y : None
130    ///     Ignored.
131    ///
132    /// sample_weight : array-like of shape (n_samples,), default=None
133    ///     Individual weights for each sample.
134    ///
135    /// Returns
136    /// -------
137    /// self : object
138    ///     Fitted scaler.
139    fn fit(&mut self, x: PyReadonlyArray2<f64>) -> PyResult<()> {
140        let x_array = pyarray_to_core_array2(&x)?;
141        validate_fit_array(&x_array)?;
142
143        let n_samples = x_array.nrows();
144        let n_features = x_array.ncols();
145
146        // Compute mean
147        let mean = if self.with_mean {
148            x_array.mean_axis(Axis(0)).unwrap()
149        } else {
150            Array1::zeros(n_features)
151        };
152
153        // Compute variance and scale
154        let (var, scale) = if self.with_std {
155            // Calculate variance: E[(X - mean)^2]
156            let mut var = Array1::zeros(n_features);
157            for j in 0..n_features {
158                let col = x_array.column(j);
159                let mean_j = mean[j];
160                let sum_sq_diff: f64 = col.iter().map(|&x| (x - mean_j).powi(2)).sum();
161                var[j] = sum_sq_diff / n_samples as f64;
162            }
163
164            // Calculate scale (std dev), but avoid division by zero
165            let scale = var.mapv(|v| {
166                let std = v.sqrt();
167                if std < 1e-10 {
168                    1.0 // Avoid division by zero
169                } else {
170                    std
171                }
172            });
173
174            (var, scale)
175        } else {
176            (Array1::ones(n_features), Array1::ones(n_features))
177        };
178
179        self.state = Some(StandardScalerState {
180            mean,
181            scale,
182            var,
183            n_features,
184            n_samples_seen: n_samples,
185        });
186
187        Ok(())
188    }
189
190    /// Perform standardization by centering and scaling.
191    ///
192    /// Parameters
193    /// ----------
194    /// X : {array-like, sparse matrix} of shape (n_samples, n_features)
195    ///     The data used to scale along the features axis.
196    ///
197    /// copy : bool, default=None
198    ///     Copy the input X or not.
199    ///
200    /// Returns
201    /// -------
202    /// X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)
203    ///     Transformed array.
204    fn transform(&self, py: Python<'_>, x: PyReadonlyArray2<f64>) -> PyResult<Py<PyArray2<f64>>> {
205        let state = self
206            .state
207            .as_ref()
208            .ok_or_else(|| PyValueError::new_err("Scaler not fitted. Call fit() first."))?;
209
210        let x_array = pyarray_to_core_array2(&x)?;
211        validate_transform_array(&x_array, state.n_features)?;
212
213        let mut transformed = x_array.clone();
214
215        // Center the data
216        if self.with_mean {
217            for j in 0..state.n_features {
218                for i in 0..transformed.nrows() {
219                    transformed[[i, j]] -= state.mean[j];
220                }
221            }
222        }
223
224        // Scale the data
225        if self.with_std {
226            for j in 0..state.n_features {
227                for i in 0..transformed.nrows() {
228                    transformed[[i, j]] /= state.scale[j];
229                }
230            }
231        }
232
233        core_array2_to_py(py, &transformed)
234    }
235
236    /// Fit to data, then transform it.
237    ///
238    /// Fits transformer to `X` and returns a transformed version of `X`.
239    ///
240    /// Parameters
241    /// ----------
242    /// X : {array-like, sparse matrix} of shape (n_samples, n_features)
243    ///     Input samples.
244    ///
245    /// y :  array-like of shape (n_samples,) or (n_samples, n_outputs), default=None
246    ///     Target values (None for unsupervised transformations).
247    ///
248    /// Returns
249    /// -------
250    /// X_new : ndarray array of shape (n_samples, n_features_new)
251    ///     Transformed array.
252    fn fit_transform(
253        &mut self,
254        py: Python<'_>,
255        x: PyReadonlyArray2<f64>,
256    ) -> PyResult<Py<PyArray2<f64>>> {
257        // Create copy of x for transform since fit consumes x
258        let x_array = pyarray_to_core_array2(&x)?;
259        self.fit(x)?;
260
261        // Transform using the saved x_array
262        let state = self
263            .state
264            .as_ref()
265            .ok_or_else(|| PyValueError::new_err("Scaler not fitted. Call fit() first."))?;
266
267        let mut transformed = x_array.clone();
268
269        // Center the data
270        if self.with_mean {
271            for j in 0..state.n_features {
272                for i in 0..transformed.nrows() {
273                    transformed[[i, j]] -= state.mean[j];
274                }
275            }
276        }
277
278        // Scale the data
279        if self.with_std {
280            for j in 0..state.n_features {
281                for i in 0..transformed.nrows() {
282                    transformed[[i, j]] /= state.scale[j];
283                }
284            }
285        }
286
287        core_array2_to_py(py, &transformed)
288    }
289
290    /// Scale back the data to the original representation.
291    ///
292    /// Parameters
293    /// ----------
294    /// X : {array-like, sparse matrix} of shape (n_samples, n_features)
295    ///     The data used to scale along the features axis.
296    ///
297    /// copy : bool, default=None
298    ///     Copy the input X or not.
299    ///
300    /// Returns
301    /// -------
302    /// X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)
303    ///     Transformed array.
304    fn inverse_transform(
305        &self,
306        py: Python<'_>,
307        x: PyReadonlyArray2<f64>,
308    ) -> PyResult<Py<PyArray2<f64>>> {
309        let state = self
310            .state
311            .as_ref()
312            .ok_or_else(|| PyValueError::new_err("Scaler not fitted. Call fit() first."))?;
313
314        let x_array = pyarray_to_core_array2(&x)?;
315        validate_transform_array(&x_array, state.n_features)?;
316
317        let mut inverse = x_array.clone();
318
319        // Reverse scaling
320        if self.with_std {
321            for j in 0..state.n_features {
322                for i in 0..inverse.nrows() {
323                    inverse[[i, j]] *= state.scale[j];
324                }
325            }
326        }
327
328        // Reverse centering
329        if self.with_mean {
330            for j in 0..state.n_features {
331                for i in 0..inverse.nrows() {
332                    inverse[[i, j]] += state.mean[j];
333                }
334            }
335        }
336
337        core_array2_to_py(py, &inverse)
338    }
339
340    /// The mean value for each feature in the training set.
341    #[getter]
342    fn mean_(&self, py: Python<'_>) -> PyResult<Py<PyArray1<f64>>> {
343        let state = self
344            .state
345            .as_ref()
346            .ok_or_else(|| PyValueError::new_err("Scaler not fitted. Call fit() first."))?;
347
348        Ok(core_array1_to_py(py, &state.mean))
349    }
350
351    /// Per feature relative scaling of the data.
352    #[getter]
353    fn scale_(&self, py: Python<'_>) -> PyResult<Py<PyArray1<f64>>> {
354        let state = self
355            .state
356            .as_ref()
357            .ok_or_else(|| PyValueError::new_err("Scaler not fitted. Call fit() first."))?;
358
359        Ok(core_array1_to_py(py, &state.scale))
360    }
361
362    /// The variance for each feature in the training set.
363    #[getter]
364    fn var_(&self, py: Python<'_>) -> PyResult<Py<PyArray1<f64>>> {
365        let state = self
366            .state
367            .as_ref()
368            .ok_or_else(|| PyValueError::new_err("Scaler not fitted. Call fit() first."))?;
369
370        Ok(core_array1_to_py(py, &state.var))
371    }
372
373    /// Number of features seen during fit.
374    #[getter]
375    fn n_features_in_(&self) -> PyResult<usize> {
376        let state = self
377            .state
378            .as_ref()
379            .ok_or_else(|| PyValueError::new_err("Scaler not fitted. Call fit() first."))?;
380
381        Ok(state.n_features)
382    }
383
384    /// The number of samples processed by the estimator.
385    #[getter]
386    fn n_samples_seen_(&self) -> PyResult<usize> {
387        let state = self
388            .state
389            .as_ref()
390            .ok_or_else(|| PyValueError::new_err("Scaler not fitted. Call fit() first."))?;
391
392        Ok(state.n_samples_seen)
393    }
394
395    /// String representation
396    fn __repr__(&self) -> String {
397        format!(
398            "StandardScaler(copy={}, with_mean={}, with_std={})",
399            self.copy, self.with_mean, self.with_std
400        )
401    }
402}