// scirs2_datasets/utils/scaling.rs

//! Data scaling and normalization utilities
//!
//! This module provides various methods for scaling and normalizing data to improve
//! the performance of machine learning algorithms. It includes standard normalization
//! (z-score), min-max scaling, and robust scaling that is resistant to outliers.

use ndarray::Array2;

9/// Helper function to normalize data (zero mean, unit variance)
10///
11/// This function normalizes each feature (column) in the dataset to have zero mean
12/// and unit variance. This is commonly used as a preprocessing step for machine learning.
13/// Also known as z-score normalization or standardization.
14///
15/// # Arguments
16///
17/// * `data` - A mutable reference to the data array to normalize in-place
18///
19/// # Examples
20///
21/// ```rust
22/// use ndarray::Array2;
23/// use scirs2_datasets::utils::normalize;
24///
25/// let mut data = Array2::from_shape_vec((3, 2), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).unwrap();
26/// normalize(&mut data);
27/// // data is now normalized with zero mean and unit variance for each feature
28/// ```
29pub fn normalize(data: &mut Array2<f64>) {
30    let n_features = data.ncols();
31
32    for j in 0..n_features {
33        let mut column = data.column_mut(j);
34
35        // Calculate mean and std
36        let mean = column.mean().unwrap_or(0.0);
37        let std = column.std(0.0);
38
39        // Avoid division by zero
40        if std > 1e-10 {
41            column.mapv_inplace(|x| (x - mean) / std);
42        }
43    }
44}
45
46/// Performs Min-Max scaling to scale features to a specified range
47///
48/// Transforms features by scaling each feature to a given range, typically [0, 1].
49/// The transformation is: X_scaled = (X - X_min) / (X_max - X_min) * (max - min) + min
50///
51/// # Arguments
52///
53/// * `data` - Feature matrix to scale in-place (n_samples, n_features)
54/// * `feature_range` - Target range as (min, max) tuple
55///
56/// # Examples
57///
58/// ```rust
59/// use ndarray::Array2;
60/// use scirs2_datasets::utils::min_max_scale;
61///
62/// let mut data = Array2::from_shape_vec((3, 2), vec![1.0, 10.0, 2.0, 20.0, 3.0, 30.0]).unwrap();
63/// min_max_scale(&mut data, (0.0, 1.0));
64/// // Features are now scaled to [0, 1] range
65/// ```
66pub fn min_max_scale(data: &mut Array2<f64>, feature_range: (f64, f64)) {
67    let (range_min, range_max) = feature_range;
68    let range_size = range_max - range_min;
69
70    for j in 0..data.ncols() {
71        let mut column = data.column_mut(j);
72
73        // Find min and max values in the column
74        let col_min = column.iter().fold(f64::INFINITY, |a, &b| a.min(b));
75        let col_max = column.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
76
77        // Avoid division by zero
78        if (col_max - col_min).abs() > 1e-10 {
79            column.mapv_inplace(|x| (x - col_min) / (col_max - col_min) * range_size + range_min);
80        } else {
81            // If all values are the same, set to the middle of the range
82            column.fill(range_min + range_size / 2.0);
83        }
84    }
85}
86
87/// Performs robust scaling using median and interquartile range
88///
89/// Scales features using statistics that are robust to outliers. Each feature is
90/// scaled by: X_scaled = (X - median) / IQR, where IQR is the interquartile range.
91/// This scaling method is less sensitive to outliers compared to standard normalization.
92///
93/// # Arguments
94///
95/// * `data` - Feature matrix to scale in-place (n_samples, n_features)
96///
97/// # Examples
98///
99/// ```rust
100/// use ndarray::Array2;
101/// use scirs2_datasets::utils::robust_scale;
102///
103/// let mut data = Array2::from_shape_vec((5, 2), vec![1.0, 10.0, 2.0, 20.0, 3.0, 30.0, 4.0, 40.0, 100.0, 500.0]).unwrap();
104/// robust_scale(&mut data);
105/// // Features are now robustly scaled using median and IQR
106/// ```
107pub fn robust_scale(data: &mut Array2<f64>) {
108    for j in 0..data.ncols() {
109        let mut column_values: Vec<f64> = data.column(j).to_vec();
110        column_values.sort_by(|a, b| a.partial_cmp(b).unwrap());
111
112        let n = column_values.len();
113        if n == 0 {
114            continue;
115        }
116
117        // Calculate median
118        let median = if n % 2 == 0 {
119            (column_values[n / 2 - 1] + column_values[n / 2]) / 2.0
120        } else {
121            column_values[n / 2]
122        };
123
124        // Calculate Q1 and Q3
125        let q1_idx = n / 4;
126        let q3_idx = 3 * n / 4;
127        let q1 = column_values[q1_idx];
128        let q3 = column_values[q3_idx];
129        let iqr = q3 - q1;
130
131        // Scale the column
132        let mut column = data.column_mut(j);
133        if iqr > 1e-10 {
134            column.mapv_inplace(|x| (x - median) / iqr);
135        } else {
136            // If IQR is zero, center around median but don't scale
137            column.mapv_inplace(|x| x - median);
138        }
139    }
140}
141
/// Trait extension for 1-D array views to calculate mean and standard deviation
///
/// This trait provides statistical methods for ndarray's ArrayView1 type,
/// enabling easy calculation of mean and standard deviation for scaling operations.
pub trait StatsExt {
    /// Calculate the arithmetic mean of the array; `None` if the array is empty
    fn mean(&self) -> Option<f64>;
    /// Calculate the standard deviation with `ddof` delta degrees of freedom
    /// (divisor is `len - ddof`; ddof = 0 → population, ddof = 1 → sample)
    fn std(&self, ddof: f64) -> f64;
}
152
153impl StatsExt for ndarray::ArrayView1<'_, f64> {
154    /// Calculate the mean of the array
155    ///
156    /// # Returns
157    ///
158    /// Some(mean) if the array is not empty, None otherwise
159    fn mean(&self) -> Option<f64> {
160        if self.is_empty() {
161            return None;
162        }
163
164        let sum: f64 = self.sum();
165        Some(sum / self.len() as f64)
166    }
167
168    /// Calculate the standard deviation
169    ///
170    /// # Arguments
171    ///
172    /// * `ddof` - Degrees of freedom (delta degrees of freedom)
173    ///
174    /// # Returns
175    ///
176    /// Standard deviation of the array
177    fn std(&self, ddof: f64) -> f64 {
178        if self.is_empty() {
179            return 0.0;
180        }
181
182        let n = self.len() as f64;
183        let mean = self.mean().unwrap_or(0.0);
184
185        let mut sum_sq = 0.0;
186        for &x in self.iter() {
187            let diff = x - mean;
188            sum_sq += diff * diff;
189        }
190
191        let divisor = n - ddof;
192        if divisor <= 0.0 {
193            return 0.0;
194        }
195
196        (sum_sq / divisor).sqrt()
197    }
198}
199
#[cfg(test)]
mod tests {
    use super::*;
    use ndarray::{array, Array1};

    #[test]
    fn test_normalize() {
        let mut data = Array2::from_shape_vec((3, 2), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).unwrap();
        normalize(&mut data);

        // Check that each column has approximately zero mean AND unit variance
        // (the previous version only asserted the mean, leaving the scaling
        // half of the contract untested).
        for j in 0..data.ncols() {
            let column = data.column(j);
            let mean = column.mean().unwrap();
            assert!(mean.abs() < 1e-10);
            // normalize() divides by the population std (ddof = 0), so the
            // scaled column's population std must be 1.
            let std = StatsExt::std(&column, 0.0);
            assert!((std - 1.0).abs() < 1e-10);
        }
    }

    #[test]
    fn test_normalize_constant_values() {
        let mut data = Array2::from_shape_vec((3, 1), vec![5.0, 5.0, 5.0]).unwrap();
        normalize(&mut data);

        // Constant values should remain unchanged (avoid division by zero)
        for i in 0..data.nrows() {
            assert_eq!(data[[i, 0]], 5.0);
        }
    }

    #[test]
    fn test_min_max_scale() {
        let mut data =
            Array2::from_shape_vec((3, 2), vec![1.0, 10.0, 2.0, 20.0, 3.0, 30.0]).unwrap();
        min_max_scale(&mut data, (0.0, 1.0));

        // Check that values are scaled to [0, 1]
        for i in 0..data.nrows() {
            for j in 0..data.ncols() {
                let value = data[[i, j]];
                assert!((0.0..=1.0).contains(&value));
            }
        }

        // Check specific scaling: first column should be [0, 0.5, 1]
        assert!((data[[0, 0]] - 0.0).abs() < 1e-10);
        assert!((data[[1, 0]] - 0.5).abs() < 1e-10);
        assert!((data[[2, 0]] - 1.0).abs() < 1e-10);
    }

    #[test]
    fn test_min_max_scale_custom_range() {
        let mut data = Array2::from_shape_vec((3, 1), vec![1.0, 2.0, 3.0]).unwrap();
        min_max_scale(&mut data, (-1.0, 1.0));

        // Check that values are scaled to [-1, 1]
        assert!((data[[0, 0]] - (-1.0)).abs() < 1e-10);
        assert!((data[[1, 0]] - 0.0).abs() < 1e-10);
        assert!((data[[2, 0]] - 1.0).abs() < 1e-10);
    }

    #[test]
    fn test_min_max_scale_constant_values() {
        let mut data = Array2::from_shape_vec((3, 1), vec![5.0, 5.0, 5.0]).unwrap();
        min_max_scale(&mut data, (0.0, 1.0));

        // All values should be 0.5 (middle of range) when all values are the same
        for i in 0..data.nrows() {
            assert!((data[[i, 0]] - 0.5).abs() < 1e-10);
        }
    }

    #[test]
    fn test_robust_scale() {
        let mut data = Array2::from_shape_vec(
            (5, 2),
            vec![1.0, 10.0, 2.0, 20.0, 3.0, 30.0, 4.0, 40.0, 100.0, 500.0],
        )
        .unwrap(); // Last row has outliers

        robust_scale(&mut data);

        // Check that the scaling was applied (data should have different values than original)
        // and that extreme outliers have limited influence
        let col1_values: Vec<f64> = data.column(0).to_vec();
        let col2_values: Vec<f64> = data.column(1).to_vec();

        // Verify that the data has been transformed (not all values are the same)
        let col1_range = col1_values.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b))
            - col1_values.iter().fold(f64::INFINITY, |a, &b| a.min(b));
        let col2_range = col2_values.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b))
            - col2_values.iter().fold(f64::INFINITY, |a, &b| a.min(b));

        // After robust scaling, the range should be reasonable (not infinite)
        assert!(col1_range.is_finite());
        assert!(col2_range.is_finite());
        assert!(col1_range > 0.0); // Some variation should remain
        assert!(col2_range > 0.0); // Some variation should remain
    }

    #[test]
    fn test_robust_scale_constant_values() {
        let mut data = Array2::from_shape_vec((3, 1), vec![5.0, 5.0, 5.0]).unwrap();
        robust_scale(&mut data);

        // With constant values, robust scaling should center around 0 (median subtraction)
        for i in 0..data.nrows() {
            assert!((data[[i, 0]] - 0.0).abs() < 1e-10);
        }
    }

    #[test]
    fn test_robust_vs_standard_scaling() {
        // Create data with outliers
        let mut data_robust = Array2::from_shape_vec(
            (5, 1),
            vec![1.0, 2.0, 3.0, 4.0, 100.0], // 100.0 is an outlier
        )
        .unwrap();
        let mut data_standard = data_robust.clone();

        // Apply different scaling methods
        robust_scale(&mut data_robust);
        normalize(&mut data_standard); // Standard z-score normalization

        // Both scaling methods should produce finite, transformed data
        let robust_range = data_robust.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b))
            - data_robust.iter().fold(f64::INFINITY, |a, &b| a.min(b));
        let standard_range = data_standard
            .iter()
            .fold(f64::NEG_INFINITY, |a, &b| a.max(b))
            - data_standard.iter().fold(f64::INFINITY, |a, &b| a.min(b));

        // Both scaling methods should produce finite ranges
        assert!(robust_range.is_finite());
        assert!(standard_range.is_finite());
        assert!(robust_range > 0.0);
        assert!(standard_range > 0.0);

        // The scaled data should be different from the original
        assert!(data_robust[[0, 0]] != 1.0); // First value should be transformed
        assert!(data_standard[[0, 0]] != 1.0); // First value should be transformed
    }

    #[test]
    fn test_stats_ext_trait() {
        let data = array![1.0, 2.0, 3.0, 4.0, 5.0];
        let view = data.view();

        // Test mean calculation
        let mean = view.mean().unwrap();
        assert!((mean - 3.0_f64).abs() < 1e-10);

        // Test standard deviation calculation
        let std = view.std(0.0); // Population standard deviation
        let expected_std = (10.0_f64 / 5.0).sqrt(); // sqrt(variance)
        assert!((std - expected_std).abs() < 1e-10);

        // Test with ddof = 1 (sample standard deviation)
        let std_sample = view.std(1.0);
        let expected_std_sample = (10.0_f64 / 4.0).sqrt();
        assert!((std_sample - expected_std_sample).abs() < 1e-10);
    }

    #[test]
    fn test_stats_ext_empty_array() {
        let data: Array1<f64> = array![];
        let view = data.view();

        // Mean of empty array should be None
        assert!(StatsExt::mean(&view).is_none());

        // Standard deviation of empty array should be 0
        assert_eq!(StatsExt::std(&view, 0.0), 0.0);
    }

    #[test]
    fn test_scaling_pipeline() {
        // Test a complete scaling pipeline
        let mut data1 =
            Array2::from_shape_vec((4, 2), vec![1.0, 10.0, 2.0, 20.0, 3.0, 30.0, 4.0, 40.0])
                .unwrap();
        let mut data2 = data1.clone();
        let mut data3 = data1.clone();

        // Apply different scaling methods
        normalize(&mut data1); // Z-score normalization
        min_max_scale(&mut data2, (0.0, 1.0)); // Min-max scaling
        robust_scale(&mut data3); // Robust scaling

        // All methods should produce finite transformed data
        assert!(data1.iter().all(|&x| x.is_finite()));
        assert!(data2.iter().all(|&x| x.is_finite()));
        assert!(data3.iter().all(|&x| x.is_finite()));

        // Min-max scaled data should be in [0, 1] range
        assert!(data2.iter().all(|&x| (0.0..=1.0).contains(&x)));

        // All scaling methods should preserve shape
        assert_eq!(data1.shape(), data2.shape());
        assert_eq!(data2.shape(), data3.shape());
    }
}