sklears_python/preprocessing/standard_scaler.rs
1//! Python bindings for StandardScaler
2//!
3//! This module provides Python bindings for StandardScaler,
4//! offering scikit-learn compatible standardization (z-score normalization).
5
use super::common::*;
use scirs2_core::ndarray::{Array1, Array2, Axis};
8
9/// StandardScaler state after fitting
10#[derive(Debug, Clone)]
11struct StandardScalerState {
12 mean: Array1<f64>,
13 scale: Array1<f64>,
14 var: Array1<f64>,
15 n_features: usize,
16 n_samples_seen: usize,
17}
18
19/// Standardize features by removing the mean and scaling to unit variance.
20///
21/// The standard score of a sample `x` is calculated as:
22///
23/// z = (x - u) / s
24///
25/// where `u` is the mean of the training samples or zero if `with_mean=False`,
26/// and `s` is the standard deviation of the training samples or one if
27/// `with_std=False`.
28///
29/// Centering and scaling happen independently on each feature by computing
30/// the relevant statistics on the samples in the training set. Mean and
31/// standard deviation are then stored to be used on later data using
32/// :meth:`transform`.
33///
34/// Standardization of a dataset is a common requirement for many
35/// machine learning estimators: they might behave badly if the
36/// individual features do not more or less look like standard normally
37/// distributed data (e.g. Gaussian with 0 mean and unit variance).
38///
39/// Parameters
40/// ----------
41/// copy : bool, default=True
42/// If False, try to avoid a copy and do inplace scaling instead.
43/// This is not guaranteed to always work inplace; e.g. if the data is
44/// not a NumPy array or scipy.sparse CSR matrix, a copy may still be
45/// returned.
46///
47/// with_mean : bool, default=True
48/// If True, center the data before scaling.
49/// This does not work (and will raise an exception) when attempted on
50/// sparse matrices, because centering them entails building a dense
51/// matrix which in common use cases is likely to be too large to fit in
52/// memory.
53///
54/// with_std : bool, default=True
55/// If True, scale the data to unit variance (or equivalently,
56/// unit standard deviation).
57///
58/// Attributes
59/// ----------
60/// scale_ : ndarray of shape (n_features,) or None
61/// Per feature relative scaling of the data to achieve zero mean and unit
62/// variance. Generally this is calculated using `np.sqrt(var_)`. If a
63/// variance is zero, we can't achieve unit variance, and the data is left
64/// as-is, giving a scaling factor of 1. `scale_` is equal to `None`
65/// when `with_std=False`.
66///
67/// mean_ : ndarray of shape (n_features,) or None
68/// The mean value for each feature in the training set.
69/// Equal to ``None`` when ``with_mean=False``.
70///
71/// var_ : ndarray of shape (n_features,) or None
72/// The variance for each feature in the training set. Used to compute
73/// `scale_`. Equal to ``None`` when ``with_std=False``.
74///
75/// n_features_in_ : int
76/// Number of features seen during :term:`fit`.
77///
78/// n_samples_seen_ : int
79/// The number of samples processed by the estimator.
80/// It will be reset on new calls to fit, but increments across
81/// ``partial_fit`` calls.
82///
83/// Examples
84/// --------
85/// >>> from sklears_python import StandardScaler
86/// >>> import numpy as np
87/// >>> data = [[0, 0], [0, 0], [1, 1], [1, 1]]
88/// >>> scaler = StandardScaler()
89/// >>> scaler.fit(data)
90/// StandardScaler()
91/// >>> print(scaler.mean_)
92/// [0.5 0.5]
93/// >>> print(scaler.transform(data))
94/// [[-1. -1.]
95/// [-1. -1.]
96/// [ 1. 1.]
97/// [ 1. 1.]]
98/// >>> print(scaler.transform([[2, 2]]))
99/// [[3. 3.]]
100#[pyclass(name = "StandardScaler")]
101pub struct PyStandardScaler {
102 copy: bool,
103 with_mean: bool,
104 with_std: bool,
105 state: Option<StandardScalerState>,
106}
107
108#[pymethods]
109impl PyStandardScaler {
110 #[new]
111 #[pyo3(signature = (copy=true, with_mean=true, with_std=true))]
112 fn new(copy: bool, with_mean: bool, with_std: bool) -> Self {
113 Self {
114 copy,
115 with_mean,
116 with_std,
117 state: None,
118 }
119 }
120
121 /// Compute the mean and std to be used for later scaling.
122 ///
123 /// Parameters
124 /// ----------
125 /// X : {array-like, sparse matrix} of shape (n_samples, n_features)
126 /// The data used to compute the mean and standard deviation
127 /// used for later scaling along the features axis.
128 ///
129 /// y : None
130 /// Ignored.
131 ///
132 /// sample_weight : array-like of shape (n_samples,), default=None
133 /// Individual weights for each sample.
134 ///
135 /// Returns
136 /// -------
137 /// self : object
138 /// Fitted scaler.
139 fn fit(&mut self, x: PyReadonlyArray2<f64>) -> PyResult<()> {
140 let x_array = pyarray_to_core_array2(&x)?;
141 validate_fit_array(&x_array)?;
142
143 let n_samples = x_array.nrows();
144 let n_features = x_array.ncols();
145
146 // Compute mean
147 let mean = if self.with_mean {
148 x_array.mean_axis(Axis(0)).unwrap()
149 } else {
150 Array1::zeros(n_features)
151 };
152
153 // Compute variance and scale
154 let (var, scale) = if self.with_std {
155 // Calculate variance: E[(X - mean)^2]
156 let mut var = Array1::zeros(n_features);
157 for j in 0..n_features {
158 let col = x_array.column(j);
159 let mean_j = mean[j];
160 let sum_sq_diff: f64 = col.iter().map(|&x| (x - mean_j).powi(2)).sum();
161 var[j] = sum_sq_diff / n_samples as f64;
162 }
163
164 // Calculate scale (std dev), but avoid division by zero
165 let scale = var.mapv(|v| {
166 let std = v.sqrt();
167 if std < 1e-10 {
168 1.0 // Avoid division by zero
169 } else {
170 std
171 }
172 });
173
174 (var, scale)
175 } else {
176 (Array1::ones(n_features), Array1::ones(n_features))
177 };
178
179 self.state = Some(StandardScalerState {
180 mean,
181 scale,
182 var,
183 n_features,
184 n_samples_seen: n_samples,
185 });
186
187 Ok(())
188 }
189
190 /// Perform standardization by centering and scaling.
191 ///
192 /// Parameters
193 /// ----------
194 /// X : {array-like, sparse matrix} of shape (n_samples, n_features)
195 /// The data used to scale along the features axis.
196 ///
197 /// copy : bool, default=None
198 /// Copy the input X or not.
199 ///
200 /// Returns
201 /// -------
202 /// X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)
203 /// Transformed array.
204 fn transform(&self, py: Python<'_>, x: PyReadonlyArray2<f64>) -> PyResult<Py<PyArray2<f64>>> {
205 let state = self
206 .state
207 .as_ref()
208 .ok_or_else(|| PyValueError::new_err("Scaler not fitted. Call fit() first."))?;
209
210 let x_array = pyarray_to_core_array2(&x)?;
211 validate_transform_array(&x_array, state.n_features)?;
212
213 let mut transformed = x_array.clone();
214
215 // Center the data
216 if self.with_mean {
217 for j in 0..state.n_features {
218 for i in 0..transformed.nrows() {
219 transformed[[i, j]] -= state.mean[j];
220 }
221 }
222 }
223
224 // Scale the data
225 if self.with_std {
226 for j in 0..state.n_features {
227 for i in 0..transformed.nrows() {
228 transformed[[i, j]] /= state.scale[j];
229 }
230 }
231 }
232
233 core_array2_to_py(py, &transformed)
234 }
235
236 /// Fit to data, then transform it.
237 ///
238 /// Fits transformer to `X` and returns a transformed version of `X`.
239 ///
240 /// Parameters
241 /// ----------
242 /// X : {array-like, sparse matrix} of shape (n_samples, n_features)
243 /// Input samples.
244 ///
245 /// y : array-like of shape (n_samples,) or (n_samples, n_outputs), default=None
246 /// Target values (None for unsupervised transformations).
247 ///
248 /// Returns
249 /// -------
250 /// X_new : ndarray array of shape (n_samples, n_features_new)
251 /// Transformed array.
252 fn fit_transform(
253 &mut self,
254 py: Python<'_>,
255 x: PyReadonlyArray2<f64>,
256 ) -> PyResult<Py<PyArray2<f64>>> {
257 // Create copy of x for transform since fit consumes x
258 let x_array = pyarray_to_core_array2(&x)?;
259 self.fit(x)?;
260
261 // Transform using the saved x_array
262 let state = self
263 .state
264 .as_ref()
265 .ok_or_else(|| PyValueError::new_err("Scaler not fitted. Call fit() first."))?;
266
267 let mut transformed = x_array.clone();
268
269 // Center the data
270 if self.with_mean {
271 for j in 0..state.n_features {
272 for i in 0..transformed.nrows() {
273 transformed[[i, j]] -= state.mean[j];
274 }
275 }
276 }
277
278 // Scale the data
279 if self.with_std {
280 for j in 0..state.n_features {
281 for i in 0..transformed.nrows() {
282 transformed[[i, j]] /= state.scale[j];
283 }
284 }
285 }
286
287 core_array2_to_py(py, &transformed)
288 }
289
290 /// Scale back the data to the original representation.
291 ///
292 /// Parameters
293 /// ----------
294 /// X : {array-like, sparse matrix} of shape (n_samples, n_features)
295 /// The data used to scale along the features axis.
296 ///
297 /// copy : bool, default=None
298 /// Copy the input X or not.
299 ///
300 /// Returns
301 /// -------
302 /// X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)
303 /// Transformed array.
304 fn inverse_transform(
305 &self,
306 py: Python<'_>,
307 x: PyReadonlyArray2<f64>,
308 ) -> PyResult<Py<PyArray2<f64>>> {
309 let state = self
310 .state
311 .as_ref()
312 .ok_or_else(|| PyValueError::new_err("Scaler not fitted. Call fit() first."))?;
313
314 let x_array = pyarray_to_core_array2(&x)?;
315 validate_transform_array(&x_array, state.n_features)?;
316
317 let mut inverse = x_array.clone();
318
319 // Reverse scaling
320 if self.with_std {
321 for j in 0..state.n_features {
322 for i in 0..inverse.nrows() {
323 inverse[[i, j]] *= state.scale[j];
324 }
325 }
326 }
327
328 // Reverse centering
329 if self.with_mean {
330 for j in 0..state.n_features {
331 for i in 0..inverse.nrows() {
332 inverse[[i, j]] += state.mean[j];
333 }
334 }
335 }
336
337 core_array2_to_py(py, &inverse)
338 }
339
340 /// The mean value for each feature in the training set.
341 #[getter]
342 fn mean_(&self, py: Python<'_>) -> PyResult<Py<PyArray1<f64>>> {
343 let state = self
344 .state
345 .as_ref()
346 .ok_or_else(|| PyValueError::new_err("Scaler not fitted. Call fit() first."))?;
347
348 Ok(core_array1_to_py(py, &state.mean))
349 }
350
351 /// Per feature relative scaling of the data.
352 #[getter]
353 fn scale_(&self, py: Python<'_>) -> PyResult<Py<PyArray1<f64>>> {
354 let state = self
355 .state
356 .as_ref()
357 .ok_or_else(|| PyValueError::new_err("Scaler not fitted. Call fit() first."))?;
358
359 Ok(core_array1_to_py(py, &state.scale))
360 }
361
362 /// The variance for each feature in the training set.
363 #[getter]
364 fn var_(&self, py: Python<'_>) -> PyResult<Py<PyArray1<f64>>> {
365 let state = self
366 .state
367 .as_ref()
368 .ok_or_else(|| PyValueError::new_err("Scaler not fitted. Call fit() first."))?;
369
370 Ok(core_array1_to_py(py, &state.var))
371 }
372
373 /// Number of features seen during fit.
374 #[getter]
375 fn n_features_in_(&self) -> PyResult<usize> {
376 let state = self
377 .state
378 .as_ref()
379 .ok_or_else(|| PyValueError::new_err("Scaler not fitted. Call fit() first."))?;
380
381 Ok(state.n_features)
382 }
383
384 /// The number of samples processed by the estimator.
385 #[getter]
386 fn n_samples_seen_(&self) -> PyResult<usize> {
387 let state = self
388 .state
389 .as_ref()
390 .ok_or_else(|| PyValueError::new_err("Scaler not fitted. Call fit() first."))?;
391
392 Ok(state.n_samples_seen)
393 }
394
395 /// String representation
396 fn __repr__(&self) -> String {
397 format!(
398 "StandardScaler(copy={}, with_mean={}, with_std={})",
399 self.copy, self.with_mean, self.with_std
400 )
401 }
402}