1use ndarray::Array2;
8
9pub fn normalize(data: &mut Array2<f64>) {
30 let n_features = data.ncols();
31
32 for j in 0..n_features {
33 let mut column = data.column_mut(j);
34
35 let mean = column.mean().unwrap_or(0.0);
37 let std = column.std(0.0);
38
39 if std > 1e-10 {
41 column.mapv_inplace(|x| (x - mean) / std);
42 }
43 }
44}
45
46pub fn min_max_scale(data: &mut Array2<f64>, feature_range: (f64, f64)) {
67 let (range_min, range_max) = feature_range;
68 let range_size = range_max - range_min;
69
70 for j in 0..data.ncols() {
71 let mut column = data.column_mut(j);
72
73 let col_min = column.iter().fold(f64::INFINITY, |a, &b| a.min(b));
75 let col_max = column.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
76
77 if (col_max - col_min).abs() > 1e-10 {
79 column.mapv_inplace(|x| (x - col_min) / (col_max - col_min) * range_size + range_min);
80 } else {
81 column.fill(range_min + range_size / 2.0);
83 }
84 }
85}
86
87pub fn robust_scale(data: &mut Array2<f64>) {
108 for j in 0..data.ncols() {
109 let mut column_values: Vec<f64> = data.column(j).to_vec();
110 column_values.sort_by(|a, b| a.partial_cmp(b).unwrap());
111
112 let n = column_values.len();
113 if n == 0 {
114 continue;
115 }
116
117 let median = if n % 2 == 0 {
119 (column_values[n / 2 - 1] + column_values[n / 2]) / 2.0
120 } else {
121 column_values[n / 2]
122 };
123
124 let q1_idx = n / 4;
126 let q3_idx = 3 * n / 4;
127 let q1 = column_values[q1_idx];
128 let q3 = column_values[q3_idx];
129 let iqr = q3 - q1;
130
131 let mut column = data.column_mut(j);
133 if iqr > 1e-10 {
134 column.mapv_inplace(|x| (x - median) / iqr);
135 } else {
136 column.mapv_inplace(|x| x - median);
138 }
139 }
140}
141
/// Basic descriptive statistics over a sequence of `f64` values.
pub trait StatsExt {
    /// Arithmetic mean of the elements.
    ///
    /// The `ArrayView1` implementation in this file returns `None` when the
    /// view has no elements.
    fn mean(&self) -> Option<f64>;

    /// Standard deviation of the elements.
    ///
    /// `ddof` is subtracted from the element count to form the divisor of the
    /// squared-deviation sum (`0.0` for the population form, `1.0` for the
    /// sample form). The `ArrayView1` implementation in this file returns
    /// `0.0` for an empty view or when that divisor is not positive.
    fn std(&self, ddof: f64) -> f64;
}
152
153impl StatsExt for ndarray::ArrayView1<'_, f64> {
154 fn mean(&self) -> Option<f64> {
160 if self.is_empty() {
161 return None;
162 }
163
164 let sum: f64 = self.sum();
165 Some(sum / self.len() as f64)
166 }
167
168 fn std(&self, ddof: f64) -> f64 {
178 if self.is_empty() {
179 return 0.0;
180 }
181
182 let n = self.len() as f64;
183 let mean = self.mean().unwrap_or(0.0);
184
185 let mut sum_sq = 0.0;
186 for &x in self.iter() {
187 let diff = x - mean;
188 sum_sq += diff * diff;
189 }
190
191 let divisor = n - ddof;
192 if divisor <= 0.0 {
193 return 0.0;
194 }
195
196 (sum_sq / divisor).sqrt()
197 }
198}
199
#[cfg(test)]
mod tests {
    use super::*;
    use ndarray::{array, Array1};

    /// Absolute tolerance for approximate floating-point comparisons.
    const TOL: f64 = 1e-10;

    /// Returns `max - min` over `values`.
    fn value_range(values: impl IntoIterator<Item = f64>) -> f64 {
        let (lo, hi) = values
            .into_iter()
            .fold((f64::INFINITY, f64::NEG_INFINITY), |(lo, hi), v| {
                (lo.min(v), hi.max(v))
            });
        hi - lo
    }

    #[test]
    fn test_normalize() {
        let mut data =
            Array2::from_shape_vec((3, 2), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).unwrap();
        normalize(&mut data);

        // Every column should end up with zero mean and unit deviation.
        for j in 0..data.ncols() {
            let column = data.column(j);
            let mean = column.mean().unwrap();
            assert!(mean.abs() < TOL);
            assert!((StatsExt::std(&column, 0.0) - 1.0).abs() < TOL);
        }
    }

    #[test]
    fn test_normalize_constant_values() {
        let mut data = Array2::from_shape_vec((3, 1), vec![5.0, 5.0, 5.0]).unwrap();
        normalize(&mut data);

        // A zero-deviation column must be left untouched.
        for i in 0..data.nrows() {
            assert_eq!(data[[i, 0]], 5.0);
        }
    }

    #[test]
    fn test_min_max_scale() {
        let mut data =
            Array2::from_shape_vec((3, 2), vec![1.0, 10.0, 2.0, 20.0, 3.0, 30.0]).unwrap();
        min_max_scale(&mut data, (0.0, 1.0));

        assert!(data.iter().all(|value| (0.0..=1.0).contains(value)));

        assert!((data[[0, 0]] - 0.0).abs() < TOL);
        assert!((data[[1, 0]] - 0.5).abs() < TOL);
        assert!((data[[2, 0]] - 1.0).abs() < TOL);
    }

    #[test]
    fn test_min_max_scale_custom_range() {
        let mut data = Array2::from_shape_vec((3, 1), vec![1.0, 2.0, 3.0]).unwrap();
        min_max_scale(&mut data, (-1.0, 1.0));

        assert!((data[[0, 0]] - (-1.0)).abs() < TOL);
        assert!((data[[1, 0]] - 0.0).abs() < TOL);
        assert!((data[[2, 0]] - 1.0).abs() < TOL);
    }

    #[test]
    fn test_min_max_scale_constant_values() {
        let mut data = Array2::from_shape_vec((3, 1), vec![5.0, 5.0, 5.0]).unwrap();
        min_max_scale(&mut data, (0.0, 1.0));

        // A constant column collapses onto the midpoint of the target range.
        for i in 0..data.nrows() {
            assert!((data[[i, 0]] - 0.5).abs() < TOL);
        }
    }

    #[test]
    fn test_robust_scale() {
        let mut data = Array2::from_shape_vec(
            (5, 2),
            vec![1.0, 10.0, 2.0, 20.0, 3.0, 30.0, 4.0, 40.0, 100.0, 500.0],
        )
        .unwrap();
        robust_scale(&mut data);

        let col1_range = value_range(data.column(0).iter().copied());
        let col2_range = value_range(data.column(1).iter().copied());

        assert!(col1_range.is_finite());
        assert!(col2_range.is_finite());
        assert!(col1_range > 0.0);
        assert!(col2_range > 0.0);
    }

    #[test]
    fn test_robust_scale_constant_values() {
        let mut data = Array2::from_shape_vec((3, 1), vec![5.0, 5.0, 5.0]).unwrap();
        robust_scale(&mut data);

        // With zero IQR the column is only centered on its median.
        for i in 0..data.nrows() {
            assert!((data[[i, 0]] - 0.0).abs() < TOL);
        }
    }

    #[test]
    fn test_robust_vs_standard_scaling() {
        let mut data_robust =
            Array2::from_shape_vec((5, 1), vec![1.0, 2.0, 3.0, 4.0, 100.0]).unwrap();
        let mut data_standard = data_robust.clone();

        robust_scale(&mut data_robust);
        normalize(&mut data_standard);

        let robust_range = value_range(data_robust.iter().copied());
        let standard_range = value_range(data_standard.iter().copied());

        assert!(robust_range.is_finite());
        assert!(standard_range.is_finite());
        assert!(robust_range > 0.0);
        assert!(standard_range > 0.0);

        // Both scalers must actually have transformed the data.
        assert!(data_robust[[0, 0]] != 1.0);
        assert!(data_standard[[0, 0]] != 1.0);
    }

    #[test]
    fn test_stats_ext_trait() {
        let data = array![1.0, 2.0, 3.0, 4.0, 5.0];
        let view = data.view();

        // The trait methods are called through `StatsExt::` on purpose:
        // `view.mean()` / `view.std(..)` would resolve to ndarray's inherent
        // methods of the same name, and the trait implementation would never
        // run.
        let mean = StatsExt::mean(&view).unwrap();
        assert!((mean - 3.0_f64).abs() < TOL);

        // Population form (ddof = 0): squared deviations sum to 10 over n = 5.
        let std = StatsExt::std(&view, 0.0);
        let expected_std = (10.0_f64 / 5.0).sqrt();
        assert!((std - expected_std).abs() < TOL);

        // Sample form (ddof = 1): same sum over n - 1 = 4.
        let std_sample = StatsExt::std(&view, 1.0);
        let expected_std_sample = (10.0_f64 / 4.0).sqrt();
        assert!((std_sample - expected_std_sample).abs() < TOL);
    }

    #[test]
    fn test_stats_ext_empty_array() {
        let data: Array1<f64> = array![];
        let view = data.view();

        assert!(StatsExt::mean(&view).is_none());

        assert_eq!(StatsExt::std(&view, 0.0), 0.0);
    }

    #[test]
    fn test_scaling_pipeline() {
        let mut data1 =
            Array2::from_shape_vec((4, 2), vec![1.0, 10.0, 2.0, 20.0, 3.0, 30.0, 4.0, 40.0])
                .unwrap();
        let mut data2 = data1.clone();
        let mut data3 = data1.clone();

        normalize(&mut data1);
        min_max_scale(&mut data2, (0.0, 1.0));
        robust_scale(&mut data3);

        assert!(data1.iter().all(|&x| x.is_finite()));
        assert!(data2.iter().all(|&x| x.is_finite()));
        assert!(data3.iter().all(|&x| x.is_finite()));

        assert!(data2.iter().all(|&x| (0.0..=1.0).contains(&x)));

        assert_eq!(data1.shape(), data2.shape());
        assert_eq!(data2.shape(), data3.shape());
    }
}