scirs2_datasets/utils/scaling.rs

//! Data scaling and normalization utilities
//!
//! This module provides various methods for scaling and normalizing data to improve
//! the performance of machine learning algorithms. It includes standard normalization
//! (z-score), min-max scaling, and robust scaling that is resistant to outliers.
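//!
//! For example, each scaler can be applied in-place to a copy of the same matrix
//! (a minimal sketch using the same imports as the function examples below):
//!
//! ```rust
//! use scirs2_core::ndarray::Array2;
//! use scirs2_datasets::utils::{min_max_scale, normalize, robust_scale};
//!
//! let data = Array2::from_shape_vec((3, 2), vec![1.0, 10.0, 2.0, 20.0, 3.0, 30.0]).unwrap();
//!
//! let mut standardized = data.clone();
//! normalize(&mut standardized); // zero mean, unit variance per column
//!
//! let mut scaled = data.clone();
//! min_max_scale(&mut scaled, (0.0, 1.0)); // each column mapped to [0, 1]
//!
//! let mut robust = data.clone();
//! robust_scale(&mut robust); // (x - median) / IQR per column
//! ```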

use scirs2_core::ndarray::Array2;

/// Helper function to normalize data (zero mean, unit variance)
///
/// This function normalizes each feature (column) in the dataset to have zero mean
/// and unit variance. This is commonly used as a preprocessing step for machine learning.
/// Also known as z-score normalization or standardization.
/// The transformation is: X_scaled = (X - mean) / std, applied per column.
///
/// # Arguments
///
/// * `data` - A mutable reference to the data array to normalize in-place
///
/// # Examples
///
/// ```rust
/// use scirs2_core::ndarray::Array2;
/// use scirs2_datasets::utils::normalize;
///
/// let mut data = Array2::from_shape_vec((3, 2), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).unwrap();
/// normalize(&mut data);
/// // data is now normalized with zero mean and unit variance for each feature
/// ```
#[allow(dead_code)]
pub fn normalize(data: &mut Array2<f64>) {
    let n_features = data.ncols();

    for j in 0..n_features {
        let mut column = data.column_mut(j);

        // Calculate mean and standard deviation; `mean` returns `None` for an
        // empty column, in which case we fall back to 0.0
        let mean = column.mean().unwrap_or(0.0);
        let std = column.std(0.0);

        // Avoid division by zero for (near-)constant columns
        if std > 1e-10 {
            column.mapv_inplace(|x| (x - mean) / std);
        }
    }
}

/// Performs Min-Max scaling to scale features to a specified range
///
/// Transforms features by scaling each feature to a given range, typically [0, 1].
/// The transformation is: X_scaled = (X - X_min) / (X_max - X_min) * (max - min) + min
///
/// # Arguments
///
/// * `data` - Feature matrix to scale in-place (n_samples, n_features)
/// * `feature_range` - Target range as (min, max) tuple
///
/// # Examples
///
/// ```rust
/// use scirs2_core::ndarray::Array2;
/// use scirs2_datasets::utils::min_max_scale;
///
/// let mut data = Array2::from_shape_vec((3, 2), vec![1.0, 10.0, 2.0, 20.0, 3.0, 30.0]).unwrap();
/// min_max_scale(&mut data, (0.0, 1.0));
/// // Features are now scaled to [0, 1] range
/// ```
#[allow(dead_code)]
pub fn min_max_scale(data: &mut Array2<f64>, feature_range: (f64, f64)) {
    let (range_min, range_max) = feature_range;
    let range_size = range_max - range_min;

    for j in 0..data.ncols() {
        let mut column = data.column_mut(j);

        // Find min and max values in the column
        let col_min = column.iter().fold(f64::INFINITY, |a, &b| a.min(b));
        let col_max = column.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));

        // Avoid division by zero
        if (col_max - col_min).abs() > 1e-10 {
            column.mapv_inplace(|x| (x - col_min) / (col_max - col_min) * range_size + range_min);
        } else {
            // If all values are the same, set to the middle of the range
            column.fill(range_min + range_size / 2.0);
        }
    }
}

/// Performs robust scaling using median and interquartile range
///
/// Scales features using statistics that are robust to outliers. Each feature is
/// scaled by: X_scaled = (X - median) / IQR, where IQR is the interquartile range.
/// This scaling method is less sensitive to outliers compared to standard normalization.
///
/// # Arguments
///
/// * `data` - Feature matrix to scale in-place (n_samples, n_features)
///
/// # Examples
///
/// ```rust
/// use scirs2_core::ndarray::Array2;
/// use scirs2_datasets::utils::robust_scale;
///
/// let mut data = Array2::from_shape_vec((5, 2), vec![1.0, 10.0, 2.0, 20.0, 3.0, 30.0, 4.0, 40.0, 100.0, 500.0]).unwrap();
/// robust_scale(&mut data);
/// // Features are now robustly scaled using median and IQR
/// ```
#[allow(dead_code)]
pub fn robust_scale(data: &mut Array2<f64>) {
    for j in 0..data.ncols() {
        let mut column_values: Vec<f64> = data.column(j).to_vec();
        column_values.sort_by(|a, b| a.total_cmp(b));

        let n = column_values.len();
        if n == 0 {
            continue;
        }

        // Calculate median
        let median = if n % 2 == 0 {
            (column_values[n / 2 - 1] + column_values[n / 2]) / 2.0
        } else {
            column_values[n / 2]
        };

        // Calculate Q1 and Q3 using a simple nearest-rank approximation (no interpolation)
        let q1_idx = n / 4;
        let q3_idx = 3 * n / 4;
        let q1 = column_values[q1_idx];
        let q3 = column_values[q3_idx];
        let iqr = q3 - q1;

        // Scale the column
        let mut column = data.column_mut(j);
        if iqr > 1e-10 {
            column.mapv_inplace(|x| (x - median) / iqr);
        } else {
            // If IQR is zero, center around median but don't scale
            column.mapv_inplace(|x| x - median);
        }
    }
}

/// Trait extension for ndarray's `ArrayView1` to calculate mean and standard deviation
///
/// This trait provides statistical methods for ndarray's ArrayView1 type,
/// enabling easy calculation of mean and standard deviation for scaling operations.
///
/// Note: the method is named `standard_deviation` rather than `std` to avoid clashing with
/// ndarray's built-in `std` method. The `mean` method shares its name with ndarray's inherent
/// `mean`, which takes precedence in method-call syntax, so use `StatsExt::mean(&view)` when
/// this trait's version is specifically wanted.
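///
/// # Examples
///
/// A minimal sketch; this assumes `StatsExt` is re-exported from `scirs2_datasets::utils`
/// in the same way as the scaling functions above:
///
/// ```rust
/// use scirs2_core::ndarray::array;
/// use scirs2_datasets::utils::StatsExt;
///
/// let data = array![1.0, 2.0, 3.0, 4.0, 5.0];
/// let view = data.view();
///
/// // The trait's `mean` returns `Some(mean)` for non-empty arrays and `None` otherwise.
/// assert_eq!(StatsExt::mean(&view), Some(3.0));
///
/// // Population standard deviation (ddof = 0): sqrt(10 / 5) = sqrt(2).
/// assert!((view.standard_deviation(0.0) - 2.0_f64.sqrt()).abs() < 1e-10);
/// ```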
pub trait StatsExt {
    /// Calculate the mean of the array
    fn mean(&self) -> Option<f64>;
    /// Calculate the standard deviation with specified degrees of freedom
    fn standard_deviation(&self, ddof: f64) -> f64;
}

impl StatsExt for scirs2_core::ndarray::ArrayView1<'_, f64> {
    /// Calculate the mean of the array
    ///
    /// # Returns
    ///
    /// Some(mean) if the array is not empty, None otherwise
    fn mean(&self) -> Option<f64> {
        if self.is_empty() {
            return None;
        }

        let sum: f64 = self.sum();
        Some(sum / self.len() as f64)
    }

    /// Calculate the standard deviation
    ///
    /// # Arguments
    ///
    /// * `ddof` - Degrees of freedom (delta degrees of freedom)
    ///
    /// # Returns
    ///
    /// Standard deviation of the array
    fn standard_deviation(&self, ddof: f64) -> f64 {
        if self.is_empty() {
            return 0.0;
        }

        let n = self.len() as f64;
        let mean = match self.mean() {
            Some(val) if !val.is_nan() => val,
            _ => 0.0,
        };

        let mut sum_sq = 0.0;
        for &x in self.iter() {
            let diff = x - mean;
            sum_sq += diff * diff;
        }

        let divisor = n - ddof;
        if divisor <= 0.0 {
            return 0.0;
        }

        (sum_sq / divisor).sqrt()
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::{array, Array1};

    #[test]
    fn test_normalize() {
        let mut data = Array2::from_shape_vec((3, 2), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).unwrap();
        normalize(&mut data);

        // Check that each column has approximately zero mean
        for j in 0..data.ncols() {
            let column = data.column(j);
            let mean = column.mean().unwrap();
            assert!(mean.abs() < 1e-10);
        }
    }

    #[test]
    fn test_normalize_constant_values() {
        let mut data = Array2::from_shape_vec((3, 1), vec![5.0, 5.0, 5.0]).unwrap();
        normalize(&mut data);

        // Constant values should remain unchanged (avoid division by zero)
        for i in 0..data.nrows() {
            assert_eq!(data[[i, 0]], 5.0);
        }
    }

    #[test]
    fn test_min_max_scale() {
        let mut data =
            Array2::from_shape_vec((3, 2), vec![1.0, 10.0, 2.0, 20.0, 3.0, 30.0]).unwrap();
        min_max_scale(&mut data, (0.0, 1.0));

        // Check that values are scaled to [0, 1]
        for i in 0..data.nrows() {
            for j in 0..data.ncols() {
                let value = data[[i, j]];
                assert!((0.0..=1.0).contains(&value));
            }
        }

        // Check specific scaling: first column should be [0, 0.5, 1]
        assert!((data[[0, 0]] - 0.0).abs() < 1e-10);
        assert!((data[[1, 0]] - 0.5).abs() < 1e-10);
        assert!((data[[2, 0]] - 1.0).abs() < 1e-10);
    }

    #[test]
    fn test_min_max_scale_custom_range() {
        let mut data = Array2::from_shape_vec((3, 1), vec![1.0, 2.0, 3.0]).unwrap();
        min_max_scale(&mut data, (-1.0, 1.0));

        // Check that values are scaled to [-1, 1]
        assert!((data[[0, 0]] - (-1.0)).abs() < 1e-10);
        assert!((data[[1, 0]] - 0.0).abs() < 1e-10);
        assert!((data[[2, 0]] - 1.0).abs() < 1e-10);
    }

    #[test]
    fn test_min_max_scale_constant_values() {
        let mut data = Array2::from_shape_vec((3, 1), vec![5.0, 5.0, 5.0]).unwrap();
        min_max_scale(&mut data, (0.0, 1.0));

        // All values should be 0.5 (middle of range) when all values are the same
        for i in 0..data.nrows() {
            assert!((data[[i, 0]] - 0.5).abs() < 1e-10);
        }
    }

    #[test]
    fn test_robust_scale() {
        let mut data = Array2::from_shape_vec(
            (5, 2),
            vec![1.0, 10.0, 2.0, 20.0, 3.0, 30.0, 4.0, 40.0, 100.0, 500.0],
        )
        .unwrap(); // Last row has outliers

        robust_scale(&mut data);

        // Check that the scaling was applied (data should have different values than original)
        // and that extreme outliers have limited influence
        let col1_values: Vec<f64> = data.column(0).to_vec();
        let col2_values: Vec<f64> = data.column(1).to_vec();

        // Verify that the data has been transformed (not all values are the same)
        let col1_range = col1_values.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b))
            - col1_values.iter().fold(f64::INFINITY, |a, &b| a.min(b));
        let col2_range = col2_values.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b))
            - col2_values.iter().fold(f64::INFINITY, |a, &b| a.min(b));

        // After robust scaling, the range should be reasonable (not infinite)
        assert!(col1_range.is_finite());
        assert!(col2_range.is_finite());
        assert!(col1_range > 0.0); // Some variation should remain
        assert!(col2_range > 0.0); // Some variation should remain
    }

    #[test]
    fn test_robust_scale_constant_values() {
        let mut data = Array2::from_shape_vec((3, 1), vec![5.0, 5.0, 5.0]).unwrap();
        robust_scale(&mut data);

        // With constant values, robust scaling should center around 0 (median subtraction)
        for i in 0..data.nrows() {
            assert!((data[[i, 0]] - 0.0).abs() < 1e-10);
        }
    }

    #[test]
    fn test_robust_vs_standard_scaling() {
        // Create data with outliers
        let mut data_robust = Array2::from_shape_vec(
            (5, 1),
            vec![1.0, 2.0, 3.0, 4.0, 100.0], // 100.0 is an outlier
        )
        .unwrap();
        let mut data_standard = data_robust.clone();

        // Apply different scaling methods
        robust_scale(&mut data_robust);
        normalize(&mut data_standard); // Standard z-score normalization

        // Both scaling methods should produce finite, transformed data
        let robust_range = data_robust.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b))
            - data_robust.iter().fold(f64::INFINITY, |a, &b| a.min(b));
        let standard_range = data_standard
            .iter()
            .fold(f64::NEG_INFINITY, |a, &b| a.max(b))
            - data_standard.iter().fold(f64::INFINITY, |a, &b| a.min(b));

        // Both scaling methods should produce finite ranges
        assert!(robust_range.is_finite());
        assert!(standard_range.is_finite());
        assert!(robust_range > 0.0);
        assert!(standard_range > 0.0);

        // The scaled data should be different from the original
        assert!(data_robust[[0, 0]] != 1.0); // First value should be transformed
        assert!(data_standard[[0, 0]] != 1.0); // First value should be transformed
    }

    #[test]
    fn test_stats_ext_trait() {
        let data = array![1.0, 2.0, 3.0, 4.0, 5.0];
        let view = data.view();

        // Test mean calculation (call the trait method explicitly, since ndarray's
        // inherent `mean` would otherwise take precedence)
        let mean = StatsExt::mean(&view).unwrap();
        assert!((mean - 3.0_f64).abs() < 1e-10);

        // Test standard deviation calculation (population standard deviation, ddof = 0)
        let std = view.standard_deviation(0.0);
        let expected_std = (10.0_f64 / 5.0).sqrt(); // sqrt(variance)
        assert!((std - expected_std).abs() < 1e-10);

        // Test with ddof = 1 (sample standard deviation)
        let std_sample = view.standard_deviation(1.0);
        let expected_std_sample = (10.0_f64 / 4.0).sqrt();
        assert!((std_sample - expected_std_sample).abs() < 1e-10);
    }

    #[test]
    fn test_stats_ext_empty_array() {
        let data: Array1<f64> = array![];
        let view = data.view();

        // Mean of an empty array should be None
        assert!(StatsExt::mean(&view).is_none());

        // Standard deviation of an empty array should be 0
        assert_eq!(view.standard_deviation(0.0), 0.0);
    }

    #[test]
    fn test_scaling_pipeline() {
        // Test a complete scaling pipeline
        let mut data1 =
            Array2::from_shape_vec((4, 2), vec![1.0, 10.0, 2.0, 20.0, 3.0, 30.0, 4.0, 40.0])
                .unwrap();
        let mut data2 = data1.clone();
        let mut data3 = data1.clone();

        // Apply different scaling methods
        normalize(&mut data1); // Z-score normalization
        min_max_scale(&mut data2, (0.0, 1.0)); // Min-max scaling
        robust_scale(&mut data3); // Robust scaling

        // All methods should produce finite transformed data
        assert!(data1.iter().all(|&x| x.is_finite()));
        assert!(data2.iter().all(|&x| x.is_finite()));
        assert!(data3.iter().all(|&x| x.is_finite()));

        // Min-max scaled data should be in [0, 1] range
        assert!(data2.iter().all(|&x| (0.0..=1.0).contains(&x)));

        // All scaling methods should preserve shape
        assert_eq!(data1.shape(), data2.shape());
        assert_eq!(data2.shape(), data3.shape());
    }
}