scirs2_cluster/preprocess/
mod.rs1use scirs2_core::ndarray::{Array1, Array2, ArrayView2, Axis};
10use scirs2_core::numeric::{Float, FromPrimitive};
11use std::fmt::Debug;
12
13use crate::error::{ClusteringError, Result};
14
15#[allow(dead_code)]
47pub fn whiten<F: Float + FromPrimitive + Debug>(
48 data: ArrayView2<F>,
49 check_finite: bool,
50) -> Result<Array2<F>> {
51 let n_samples = data.shape()[0];
52 let n_features = data.shape()[1];
53
54 if n_samples == 0 || n_features == 0 {
55 return Err(ClusteringError::InvalidInput("Input data is empty".into()));
56 }
57
58 if check_finite {
59 for element in data.iter() {
61 if !element.is_finite() {
62 return Err(ClusteringError::InvalidInput(
63 "Input data contains NaN or infinite values".into(),
64 ));
65 }
66 }
67 }
68
69 let std_dev = standard_deviation(data, Axis(0))?;
71
72 let mut result = Array2::zeros(data.dim());
74
75 for j in 0..n_features {
77 let std_j = std_dev[j];
78 if std_j <= F::epsilon() {
79 for i in 0..n_samples {
81 result[[i, j]] = data[[i, j]];
82 }
83 } else {
84 for i in 0..n_samples {
85 result[[i, j]] = data[[i, j]] / std_j;
86 }
87 }
88 }
89
90 Ok(result)
91}
92
93#[allow(dead_code)]
121pub fn standardize<F: Float + FromPrimitive + Debug>(
122 data: ArrayView2<F>,
123 check_finite: bool,
124) -> Result<Array2<F>> {
125 let n_samples = data.shape()[0];
126 let n_features = data.shape()[1];
127
128 if n_samples == 0 || n_features == 0 {
129 return Err(ClusteringError::InvalidInput("Input data is empty".into()));
130 }
131
132 if check_finite {
133 for element in data.iter() {
135 if !element.is_finite() {
136 return Err(ClusteringError::InvalidInput(
137 "Input data contains NaN or infinite values".into(),
138 ));
139 }
140 }
141 }
142
143 let mean = data.mean_axis(Axis(0)).unwrap();
145
146 let std_dev = standard_deviation(data, Axis(0))?;
148
149 let mut result = Array2::zeros(data.dim());
151
152 for j in 0..n_features {
154 let mean_j = mean[j];
155 let std_j = std_dev[j];
156
157 if std_j <= F::epsilon() {
158 for i in 0..n_samples {
160 result[[i, j]] = data[[i, j]] - mean_j;
161 }
162 } else {
163 for i in 0..n_samples {
164 result[[i, j]] = (data[[i, j]] - mean_j) / std_j;
165 }
166 }
167 }
168
169 Ok(result)
170}
171
172#[allow(dead_code)]
201pub fn normalize<F: Float + FromPrimitive + Debug>(
202 data: ArrayView2<F>,
203 norm: NormType,
204 check_finite: bool,
205) -> Result<Array2<F>> {
206 let n_samples = data.shape()[0];
207 let n_features = data.shape()[1];
208
209 if n_samples == 0 || n_features == 0 {
210 return Err(ClusteringError::InvalidInput("Input data is empty".into()));
211 }
212
213 if check_finite {
214 for element in data.iter() {
216 if !element.is_finite() {
217 return Err(ClusteringError::InvalidInput(
218 "Input data contains NaN or infinite values".into(),
219 ));
220 }
221 }
222 }
223
224 let norms = match norm {
226 NormType::L1 => {
227 let mut norms = Array1::zeros(n_samples);
229 for i in 0..n_samples {
230 let row = data.row(i);
231 let row_norm = row.iter().fold(F::zero(), |acc, &x| acc + x.abs());
232 norms[i] = row_norm;
233 }
234 norms
235 }
236 NormType::L2 => {
237 let mut norms = Array1::zeros(n_samples);
239 for i in 0..n_samples {
240 let row = data.row(i);
241 let row_norm = row.iter().fold(F::zero(), |acc, &x| acc + x * x).sqrt();
242 norms[i] = row_norm;
243 }
244 norms
245 }
246 NormType::Max => {
247 let mut norms = Array1::zeros(n_samples);
249 for i in 0..n_samples {
250 let row = data.row(i);
251 let row_norm = row.iter().fold(F::zero(), |acc, &x| acc.max(x.abs()));
252 norms[i] = row_norm;
253 }
254 norms
255 }
256 };
257
258 let mut result = Array2::zeros(data.dim());
260
261 for i in 0..n_samples {
263 let norm_i = norms[i];
264 if norm_i <= F::epsilon() {
265 for j in 0..n_features {
267 result[[i, j]] = data[[i, j]];
268 }
269 } else {
270 for j in 0..n_features {
271 result[[i, j]] = data[[i, j]] / norm_i;
272 }
273 }
274 }
275
276 Ok(result)
277}
278
279#[allow(dead_code)]
308pub fn min_max_scale<F: Float + FromPrimitive + Debug>(
309 data: ArrayView2<F>,
310 feature_range: (f64, f64),
311 check_finite: bool,
312) -> Result<Array2<F>> {
313 let n_samples = data.shape()[0];
314 let n_features = data.shape()[1];
315
316 if n_samples == 0 || n_features == 0 {
317 return Err(ClusteringError::InvalidInput("Input data is empty".into()));
318 }
319
320 if check_finite {
321 for element in data.iter() {
323 if !element.is_finite() {
324 return Err(ClusteringError::InvalidInput(
325 "Input data contains NaN or infinite values".into(),
326 ));
327 }
328 }
329 }
330
331 let (min_val, max_val) = feature_range;
332 if min_val >= max_val {
333 return Err(ClusteringError::InvalidInput(
334 "Feature range minimum must be less than maximum".into(),
335 ));
336 }
337
338 let feature_min = F::from_f64(min_val).unwrap();
339 let feature_max = F::from_f64(max_val).unwrap();
340
341 let mut min_values = Array1::zeros(n_features);
343 let mut max_values = Array1::zeros(n_features);
344
345 for j in 0..n_features {
346 let column = data.column(j);
347 let (min_j, max_j) = column.iter().fold(
348 (F::infinity(), F::neg_infinity()),
349 |(min_val, max_val), &x| (min_val.min(x), max_val.max(x)),
350 );
351 min_values[j] = min_j;
352 max_values[j] = max_j;
353 }
354
355 let mut result = Array2::zeros(data.dim());
357
358 for j in 0..n_features {
360 let min_j = min_values[j];
361 let max_j = max_values[j];
362 let range_j = max_j - min_j;
363
364 if range_j <= F::epsilon() {
365 let middle = (feature_min + feature_max) / F::from_f64(2.0).unwrap();
367 for i in 0..n_samples {
368 result[[i, j]] = middle;
369 }
370 } else {
371 for i in 0..n_samples {
372 let scaled = (data[[i, j]] - min_j) / range_j;
374 result[[i, j]] = scaled * (feature_max - feature_min) + feature_min;
376 }
377 }
378 }
379
380 Ok(result)
381}
382
/// Selects which vector norm `normalize` divides each row by.
///
/// `Hash` is derived alongside `Eq` so values can be used as keys in hashed
/// collections.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum NormType {
    /// Sum of the absolute values of the row's elements.
    L1,
    /// Square root of the sum of the squared elements of the row.
    L2,
    /// Largest absolute value among the row's elements.
    Max,
}
393
394#[allow(dead_code)]
396fn standard_deviation<F: Float + FromPrimitive + Debug>(
397 data: ArrayView2<F>,
398 axis: Axis,
399) -> Result<Array1<F>> {
400 let mean = data.mean_axis(axis).unwrap();
401 let n = F::from_usize(match axis {
402 Axis(0) => data.shape()[0],
403 Axis(1) => data.shape()[1],
404 _ => return Err(ClusteringError::InvalidInput("Invalid axis".into())),
405 })
406 .unwrap();
407
408 let mut variance = match axis {
409 Axis(0) => Array1::zeros(data.shape()[1]),
410 Axis(1) => Array1::zeros(data.shape()[0]),
411 _ => return Err(ClusteringError::InvalidInput("Invalid axis".into())),
412 };
413
414 if axis == Axis(0) {
415 let n_features = data.shape()[1];
417 for j in 0..n_features {
418 let mut sum_squared_diff = F::zero();
419 for i in 0..data.shape()[0] {
420 let diff = data[[i, j]] - mean[j];
421 sum_squared_diff = sum_squared_diff + diff * diff;
422 }
423 if n > F::one() {
425 variance[j] = sum_squared_diff / (n - F::one());
426 } else {
427 variance[j] = F::zero();
428 }
429 }
430 } else {
431 let n_samples = data.shape()[0];
433 for i in 0..n_samples {
434 let mut sum_squared_diff = F::zero();
435 for j in 0..data.shape()[1] {
436 let diff = data[[i, j]] - mean[i];
437 sum_squared_diff = sum_squared_diff + diff * diff;
438 }
439 if n > F::one() {
441 variance[i] = sum_squared_diff / (n - F::one());
442 } else {
443 variance[i] = F::zero();
444 }
445 }
446 }
447
448 let std_dev = variance.mapv(|x| x.sqrt());
450
451 let std_dev = std_dev.mapv(|x| if x <= F::epsilon() { F::one() } else { x });
453
454 Ok(std_dev)
455}
456
457#[cfg(test)]
458mod tests {
459 use super::*;
460 use approx::assert_abs_diff_eq;
461 use scirs2_core::ndarray::Array2;
462
463 #[test]
464 fn test_whiten() {
465 let data =
466 Array2::from_shape_vec((3, 3), vec![1.9, 2.3, 1.7, 1.5, 2.5, 2.2, 0.8, 0.6, 1.7])
467 .unwrap();
468
469 let whitened = whiten(data.view(), true).unwrap();
470
471 let std_dev = standard_deviation(whitened.view(), Axis(0)).unwrap();
473 for &std in std_dev.iter() {
474 assert_abs_diff_eq!(std, 1.0, epsilon = 1e-10);
475 }
476 }
477
478 #[test]
479 fn test_standardize() {
480 let data =
481 Array2::from_shape_vec((3, 3), vec![1.9, 2.3, 1.7, 1.5, 2.5, 2.2, 0.8, 0.6, 1.7])
482 .unwrap();
483
484 let standardized = standardize(data.view(), true).unwrap();
485
486 let mean = standardized.mean_axis(Axis(0)).unwrap();
488 for mean_val in mean.iter() {
489 assert_abs_diff_eq!(*mean_val, 0.0, epsilon = 1e-10);
490 }
491
492 let std_dev = standard_deviation(standardized.view(), Axis(0)).unwrap();
494 for std_val in std_dev.iter() {
495 assert_abs_diff_eq!(*std_val, 1.0, epsilon = 1e-10);
496 }
497 }
498
499 #[test]
500 fn test_normalize_l2() {
501 let data =
502 Array2::from_shape_vec((3, 3), vec![1.9, 2.3, 1.7, 1.5, 2.5, 2.2, 0.8, 0.6, 1.7])
503 .unwrap();
504
505 let normalized = normalize(data.view(), NormType::L2, true).unwrap();
506
507 for i in 0..data.shape()[0] {
509 let row = normalized.row(i);
510 let norm_sq: f64 = row.iter().fold(0.0, |acc, &x| acc + x * x);
511 let norm = norm_sq.sqrt();
512 assert_abs_diff_eq!(norm, 1.0, epsilon = 1e-10);
513 }
514 }
515
516 #[test]
517 fn test_min_max_scale() {
518 let data =
519 Array2::from_shape_vec((3, 3), vec![1.9, 2.3, 1.7, 1.5, 2.5, 2.2, 0.8, 0.6, 1.7])
520 .unwrap();
521
522 let scaled = min_max_scale(data.view(), (0.0, 1.0), true).unwrap();
523
524 for val in scaled.iter() {
526 assert!(*val >= 0.0 && *val <= 1.0);
527 }
528
529 for j in 0..data.shape()[1] {
531 let column = scaled.column(j);
532
533 let column_values: Vec<f64> = column.iter().copied().collect();
535
536 if !column_values.is_empty() {
537 let min_val = column_values
538 .iter()
539 .fold(f64::INFINITY, |min, &x| min.min(x));
540 let max_val = column_values
541 .iter()
542 .fold(f64::NEG_INFINITY, |max, &x| max.max(x));
543
544 if data.column(j).iter().any(|&x| x != data[[0, j]]) {
546 assert_abs_diff_eq!(min_val, 0.0, epsilon = 1e-10);
547 assert_abs_diff_eq!(max_val, 1.0, epsilon = 1e-10);
548 }
549 }
550 }
551 }
552}