quantrs2_ml/utils/preprocessing.rs

use crate::error::{MLError, Result};
use scirs2_core::ndarray::{Array1, Array2, Axis};

use super::*;
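
/// Standardizes each column to zero mean and unit variance (z-score scaling).
/// A small epsilon (1e-8) is added to the standard deviation so that constant
/// columns do not cause a division by zero.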
pub fn standardize(data: &Array2<f64>) -> Result<Array2<f64>> {
    let mean = data
        .mean_axis(Axis(0))
        .ok_or_else(|| MLError::ComputationError("Failed to compute mean".to_string()))?;
    let std = data.std_axis(Axis(0), 0.0);
    let mut normalized = data.clone();
    for i in 0..data.nrows() {
        for j in 0..data.ncols() {
            normalized[(i, j)] = (data[(i, j)] - mean[j]) / (std[j] + 1e-8);
        }
    }
    Ok(normalized)
}
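
/// Scales each column linearly into the [0, 1] range. Columns whose range is
/// below 1e-10 are left unchanged.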
pub fn min_max_normalize(data: &Array2<f64>) -> Result<Array2<f64>> {
    let mut normalized = data.clone();
    for j in 0..data.ncols() {
        let column = data.column(j);
        let min_val = column.iter().cloned().fold(f64::INFINITY, f64::min);
        let max_val = column.iter().cloned().fold(f64::NEG_INFINITY, f64::max);
        let range = max_val - min_val;
        if range > 1e-10 {
            for i in 0..data.nrows() {
                normalized[(i, j)] = (data[(i, j)] - min_val) / range;
            }
        }
    }
    Ok(normalized)
}
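
/// Scales each column using the median and interquartile range (IQR), which is
/// less sensitive to outliers than mean/std standardization. When the IQR is
/// (near) zero, the column is only centered on its median.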
pub fn robust_scale(data: &Array2<f64>) -> Result<Array2<f64>> {
    let mut normalized = data.clone();
    for j in 0..data.ncols() {
        let mut column_values: Vec<f64> = data.column(j).to_vec();
        column_values.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
        let n = column_values.len();
        let median = if n % 2 == 0 {
            (column_values[n / 2 - 1] + column_values[n / 2]) / 2.0
        } else {
            column_values[n / 2]
        };
        // Quartiles are approximated by the sorted values at index positions n/4 and 3n/4.
        let q1_idx = n / 4;
        let q3_idx = (3 * n) / 4;
        let q1 = column_values[q1_idx];
        let q3 = column_values[q3_idx];
        let iqr = q3 - q1;
        if iqr > 1e-10 {
            for i in 0..data.nrows() {
                normalized[(i, j)] = (data[(i, j)] - median) / iqr;
            }
        } else {
            // Degenerate IQR: only center the column on its median.
            for i in 0..data.nrows() {
                normalized[(i, j)] = data[(i, j)] - median;
            }
        }
    }
    Ok(normalized)
}
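
/// Quantile-normalizes the columns so that they all share the same empirical
/// distribution: the value at each rank is replaced by the mean of the values
/// at that rank across all columns.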
pub fn quantile_normalize(data: &Array2<f64>) -> Result<Array2<f64>> {
    let nrows = data.nrows();
    let ncols = data.ncols();
    let mut normalized = Array2::zeros((nrows, ncols));
    // Sort each column while remembering the original row index of every value.
    let mut sorted_columns: Vec<Vec<(usize, f64)>> = Vec::with_capacity(ncols);
    for j in 0..ncols {
        let mut indexed_values: Vec<(usize, f64)> = data
            .column(j)
            .iter()
            .enumerate()
            .map(|(i, &v)| (i, v))
            .collect();
        indexed_values.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal));
        sorted_columns.push(indexed_values);
    }
    // Mean value at each rank across all columns.
    let mut rank_means = vec![0.0; nrows];
    for i in 0..nrows {
        let mut sum = 0.0;
        for col in &sorted_columns {
            sum += col[i].1;
        }
        rank_means[i] = sum / ncols as f64;
    }
    // Replace every value with the mean of its rank, restoring the original row order.
    for j in 0..ncols {
        for (rank, &(original_idx, _)) in sorted_columns[j].iter().enumerate() {
            normalized[(original_idx, j)] = rank_means[rank];
        }
    }
    Ok(normalized)
}
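
/// Scales each column by its maximum absolute value so that entries lie in
/// [-1, 1]. Columns whose maximum absolute value is below 1e-10 are left unchanged.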
pub fn max_abs_scale(data: &Array2<f64>) -> Result<Array2<f64>> {
    let mut normalized = data.clone();
    for j in 0..data.ncols() {
        let max_abs = data
            .column(j)
            .iter()
            .map(|x| x.abs())
            .fold(0.0f64, f64::max);
        if max_abs > 1e-10 {
            for i in 0..data.nrows() {
                normalized[(i, j)] = data[(i, j)] / max_abs;
            }
        }
    }
    Ok(normalized)
}
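
/// Normalizes each row to unit L1 norm (sum of absolute values equals 1).
/// Rows with a near-zero norm are left unchanged.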
pub fn l1_normalize(data: &Array2<f64>) -> Result<Array2<f64>> {
    let mut normalized = data.clone();
    for i in 0..data.nrows() {
        let l1_norm: f64 = data.row(i).iter().map(|x| x.abs()).sum();
        if l1_norm > 1e-10 {
            for j in 0..data.ncols() {
                normalized[(i, j)] = data[(i, j)] / l1_norm;
            }
        }
    }
    Ok(normalized)
}
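
/// Normalizes each row to unit L2 norm (Euclidean length equals 1).
/// Rows with a near-zero norm are left unchanged.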
pub fn l2_normalize(data: &Array2<f64>) -> Result<Array2<f64>> {
    let mut normalized = data.clone();
    for i in 0..data.nrows() {
        let l2_norm: f64 = data.row(i).iter().map(|x| x * x).sum::<f64>().sqrt();
        if l2_norm > 1e-10 {
            for j in 0..data.ncols() {
                normalized[(i, j)] = data[(i, j)] / l2_norm;
            }
        }
    }
    Ok(normalized)
}
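
// A minimal usage sketch (assumed layout): an inline test module exercising a few
// of the helpers above. The expected values follow from the definitions in this
// file; the tolerances and the use of `Array2::from_shape_vec` are illustrative
// choices, not an established convention of this crate.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn standardize_centers_columns() {
        let data = Array2::from_shape_vec((3, 2), vec![1.0, 10.0, 2.0, 20.0, 3.0, 30.0])
            .expect("shape matches data length");
        let out = standardize(&data).expect("standardize should succeed");
        // Every column mean should be approximately zero after standardization.
        let means = out.mean_axis(Axis(0)).expect("non-empty input");
        assert!(means.iter().all(|m| m.abs() < 1e-6));
    }

    #[test]
    fn min_max_maps_into_unit_interval() {
        let data = Array2::from_shape_vec((3, 2), vec![1.0, -5.0, 2.0, 0.0, 3.0, 5.0])
            .expect("shape matches data length");
        let out = min_max_normalize(&data).expect("min_max_normalize should succeed");
        // All scaled entries fall inside [0, 1].
        assert!(out.iter().all(|&v| (0.0..=1.0).contains(&v)));
    }

    #[test]
    fn l2_normalize_gives_unit_rows() {
        let data = Array2::from_shape_vec((2, 2), vec![3.0, 4.0, 0.0, 0.0])
            .expect("shape matches data length");
        let out = l2_normalize(&data).expect("l2_normalize should succeed");
        let norm: f64 = out.row(0).iter().map(|x| x * x).sum::<f64>().sqrt();
        assert!((norm - 1.0).abs() < 1e-9);
        // The all-zero row is left unchanged by design.
        assert_eq!(out.row(1).to_vec(), vec![0.0, 0.0]);
    }
}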