1use scirs2_core::ndarray::Array2;
8use statrs::statistics::Statistics;
9
10#[allow(dead_code)]
31pub fn normalize(data: &mut Array2<f64>) {
32 let n_features = data.ncols();
33
34 for j in 0..n_features {
35 let mut column = data.column_mut(j);
36
37 let mean = {
39 let val = column.view().mean();
40 if val.is_nan() {
41 0.0
42 } else {
43 val
44 }
45 };
46 let std = column.view().std(0.0);
47
48 if std > 1e-10 {
50 column.mapv_inplace(|x| (x - mean) / std);
51 }
52 }
53}
54
55#[allow(dead_code)]
76pub fn min_max_scale(_data: &mut Array2<f64>, featurerange: (f64, f64)) {
77 let (range_min, range_max) = featurerange;
78 let range_size = range_max - range_min;
79
80 for j in 0.._data.ncols() {
81 let mut column = _data.column_mut(j);
82
83 let col_min = column.iter().fold(f64::INFINITY, |a, &b| a.min(b));
85 let col_max = column.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
86
87 if (col_max - col_min).abs() > 1e-10 {
89 column.mapv_inplace(|x| (x - col_min) / (col_max - col_min) * range_size + range_min);
90 } else {
91 column.fill(range_min + range_size / 2.0);
93 }
94 }
95}
96
97#[allow(dead_code)]
118pub fn robust_scale(data: &mut Array2<f64>) {
119 for j in 0..data.ncols() {
120 let mut column_values: Vec<f64> = data.column(j).to_vec();
121 column_values.sort_by(|a, b| a.partial_cmp(b).unwrap());
122
123 let n = column_values.len();
124 if n == 0 {
125 continue;
126 }
127
128 let median = if n.is_multiple_of(2) {
130 (column_values[n / 2 - 1] + column_values[n / 2]) / 2.0
131 } else {
132 column_values[n / 2]
133 };
134
135 let q1_idx = n / 4;
137 let q3_idx = 3 * n / 4;
138 let q1 = column_values[q1_idx];
139 let q3 = column_values[q3_idx];
140 let iqr = q3 - q1;
141
142 let mut column = data.column_mut(j);
144 if iqr > 1e-10 {
145 column.mapv_inplace(|x| (x - median) / iqr);
146 } else {
147 column.mapv_inplace(|x| x - median);
149 }
150 }
151}
152
/// Extension trait exposing a mean and a standard deviation on a collection of
/// `f64` values.
///
/// NOTE(review): elsewhere in this file, method-call syntax such as
/// `view.mean()` on a by-value view is used as a plain `f64` (presumably
/// resolved to the imported `Statistics` trait), so it does not reach
/// [`StatsExt::mean`]. Call it as `StatsExt::mean(&view)` when this trait's
/// `Option`-returning version is wanted.
pub trait StatsExt {
    /// Returns the mean as `Some(value)`, or `None` when no mean is available
    /// (the implementation in this file returns `None` for an empty view).
    fn mean(&self) -> Option<f64>;
    /// Returns the standard deviation using `ddof` delta degrees of freedom,
    /// i.e. the sum of squared deviations is divided by `len - ddof`
    /// (as done by the implementation in this file).
    fn standard_deviation(&self, ddof: f64) -> f64;
}
165
166impl StatsExt for scirs2_core::ndarray::ArrayView1<'_, f64> {
167 fn mean(&self) -> Option<f64> {
173 if self.is_empty() {
174 return None;
175 }
176
177 let sum: f64 = self.sum();
178 Some(sum / self.len() as f64)
179 }
180
181 fn standard_deviation(&self, ddof: f64) -> f64 {
191 if self.is_empty() {
192 return 0.0;
193 }
194
195 let n = self.len() as f64;
196 let mean = {
197 match self.mean() {
198 Some(val) if !val.is_nan() => val,
199 _ => 0.0,
200 }
201 };
202
203 let mut sum_sq = 0.0;
204 for &x in self.iter() {
205 let diff = x - mean;
206 sum_sq += diff * diff;
207 }
208
209 let divisor = n - ddof;
210 if divisor <= 0.0 {
211 return 0.0;
212 }
213
214 (sum_sq / divisor).sqrt()
215 }
216}
217
#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::{array, Array1};

    /// After normalization every column should have (numerically) zero mean.
    #[test]
    fn test_normalize() {
        let mut data = Array2::from_shape_vec((3, 2), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).unwrap();
        normalize(&mut data);

        for j in 0..data.ncols() {
            let column = data.column(j);
            let mean = column.mean();
            assert!(mean.abs() < 1e-10);
        }
    }

    /// A constant column has zero spread and must be left unchanged.
    #[test]
    fn test_normalize_constant_values() {
        let mut data = Array2::from_shape_vec((3, 1), vec![5.0, 5.0, 5.0]).unwrap();
        normalize(&mut data);

        for i in 0..data.nrows() {
            assert_eq!(data[[i, 0]], 5.0);
        }
    }

    /// Scaling to (0, 1) keeps every value in range and maps min/mid/max exactly.
    #[test]
    fn test_min_max_scale() {
        let mut data =
            Array2::from_shape_vec((3, 2), vec![1.0, 10.0, 2.0, 20.0, 3.0, 30.0]).unwrap();
        min_max_scale(&mut data, (0.0, 1.0));

        for i in 0..data.nrows() {
            for j in 0..data.ncols() {
                let value = data[[i, j]];
                assert!((0.0..=1.0).contains(&value));
            }
        }

        assert!((data[[0, 0]] - 0.0).abs() < 1e-10);
        assert!((data[[1, 0]] - 0.5).abs() < 1e-10);
        assert!((data[[2, 0]] - 1.0).abs() < 1e-10);
    }

    /// A custom target range (-1, 1) is honoured.
    #[test]
    fn test_min_max_scale_custom_range() {
        let mut data = Array2::from_shape_vec((3, 1), vec![1.0, 2.0, 3.0]).unwrap();
        min_max_scale(&mut data, (-1.0, 1.0));

        assert!((data[[0, 0]] - (-1.0)).abs() < 1e-10);
        assert!((data[[1, 0]] - 0.0).abs() < 1e-10);
        assert!((data[[2, 0]] - 1.0).abs() < 1e-10);
    }

    /// A constant column is mapped to the midpoint of the target range.
    #[test]
    fn test_min_max_scale_constant_values() {
        let mut data = Array2::from_shape_vec((3, 1), vec![5.0, 5.0, 5.0]).unwrap();
        min_max_scale(&mut data, (0.0, 1.0));

        for i in 0..data.nrows() {
            assert!((data[[i, 0]] - 0.5).abs() < 1e-10);
        }
    }

    /// Robust scaling of data with an outlier yields finite, non-degenerate columns.
    #[test]
    fn test_robust_scale() {
        let mut data = Array2::from_shape_vec(
            (5, 2),
            vec![1.0, 10.0, 2.0, 20.0, 3.0, 30.0, 4.0, 40.0, 100.0, 500.0],
        )
        .unwrap();
        robust_scale(&mut data);

        let col1_values: Vec<f64> = data.column(0).to_vec();
        let col2_values: Vec<f64> = data.column(1).to_vec();

        let col1_range = col1_values.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b))
            - col1_values.iter().fold(f64::INFINITY, |a, &b| a.min(b));
        let col2_range = col2_values.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b))
            - col2_values.iter().fold(f64::INFINITY, |a, &b| a.min(b));

        assert!(col1_range.is_finite());
        assert!(col2_range.is_finite());
        assert!(col1_range > 0.0);
        assert!(col2_range > 0.0);
    }

    /// A constant column has zero IQR and is only centred on its median.
    #[test]
    fn test_robust_scale_constant_values() {
        let mut data = Array2::from_shape_vec((3, 1), vec![5.0, 5.0, 5.0]).unwrap();
        robust_scale(&mut data);

        for i in 0..data.nrows() {
            assert!((data[[i, 0]] - 0.0).abs() < 1e-10);
        }
    }

    /// Both scalers transform outlier-laden data into finite, non-degenerate output.
    #[test]
    fn test_robust_vs_standard_scaling() {
        let mut data_robust =
            Array2::from_shape_vec((5, 1), vec![1.0, 2.0, 3.0, 4.0, 100.0]).unwrap();
        let mut data_standard = data_robust.clone();

        robust_scale(&mut data_robust);
        normalize(&mut data_standard);

        let robust_range = data_robust.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b))
            - data_robust.iter().fold(f64::INFINITY, |a, &b| a.min(b));
        let standard_range = data_standard
            .iter()
            .fold(f64::NEG_INFINITY, |a, &b| a.max(b))
            - data_standard.iter().fold(f64::INFINITY, |a, &b| a.min(b));

        assert!(robust_range.is_finite());
        assert!(standard_range.is_finite());
        assert!(robust_range > 0.0);
        assert!(standard_range > 0.0);

        // The first value must actually have been transformed by each scaler.
        assert!(data_robust[[0, 0]] != 1.0);
        assert!(data_standard[[0, 0]] != 1.0);
    }

    /// Checks mean and standard deviation on a small known sample.
    #[test]
    fn test_stats_ext_trait() {
        let data = array![1.0, 2.0, 3.0, 4.0, 5.0];
        let view = data.view();

        let mean = view.mean();
        assert!((mean - 3.0_f64).abs() < 1e-10);

        // Sum of squared deviations from the mean (3.0) is 10.0.
        let std = view.std(0.0);
        let expected_std = (10.0_f64 / 5.0).sqrt();
        assert!((std - expected_std).abs() < 1e-10);

        let std_sample = view.std(1.0);
        let expected_std_sample = (10.0_f64 / 4.0).sqrt();
        assert!((std_sample - expected_std_sample).abs() < 1e-10);

        // The method-call forms above yield a plain `f64` mean and use `std`,
        // so they never reach `StatsExt`; exercise the trait explicitly.
        assert_eq!(StatsExt::mean(&view), Some(3.0));
        assert!((view.standard_deviation(0.0) - expected_std).abs() < 1e-10);
        assert!((view.standard_deviation(1.0) - expected_std_sample).abs() < 1e-10);
    }

    /// Empty input: NaN from the plain mean, `None` / `0.0` from `StatsExt`.
    #[test]
    fn test_stats_ext_empty_array() {
        let data: Array1<f64> = array![];
        let view = data.view();

        assert!(view.mean().is_nan());

        assert_eq!(StatsExt::mean(&view), None);
        assert_eq!(view.standard_deviation(0.0), 0.0);
    }

    /// All three scalers produce finite output of unchanged shape on the same data.
    #[test]
    fn test_scaling_pipeline() {
        let mut data1 =
            Array2::from_shape_vec((4, 2), vec![1.0, 10.0, 2.0, 20.0, 3.0, 30.0, 4.0, 40.0])
                .unwrap();
        let mut data2 = data1.clone();
        let mut data3 = data1.clone();

        normalize(&mut data1);
        min_max_scale(&mut data2, (0.0, 1.0));
        robust_scale(&mut data3);

        assert!(data1.iter().all(|&x| x.is_finite()));
        assert!(data2.iter().all(|&x| x.is_finite()));
        assert!(data3.iter().all(|&x| x.is_finite()));

        assert!(data2.iter().all(|&x| (0.0..=1.0).contains(&x)));

        assert_eq!(data1.shape(), data2.shape());
        assert_eq!(data2.shape(), data3.shape());
    }
}