//! torsh_cluster/utils/preprocessing.rs — feature preprocessing utilities.

use crate::error::{ClusterError, ClusterResult};
use torsh_tensor::Tensor;

/// Feature preprocessing strategies supported by this module.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum PreprocessingMethod {
    /// Zero-mean, unit-variance scaling per feature (z-scoring).
    StandardScaler,
    /// Min-max scaling of each feature into the [0, 1] range.
    MinMaxScaler,
    /// Per-sample scaling to unit L2 norm.
    Normalizer,
    /// Median/IQR-based scaling per feature, robust to outliers.
    RobustScaler,
}
18
19pub fn standardize_features(data: &Tensor) -> ClusterResult<Tensor> {
21 let n_samples = data.shape().dims()[0];
22 let n_features = data.shape().dims()[1];
23 let data_vec = data.to_vec().map_err(ClusterError::TensorError)?;
24
25 let mut standardized_data = Vec::with_capacity(data_vec.len());
26
27 for j in 0..n_features {
29 let mut feature_values = Vec::with_capacity(n_samples);
30 for i in 0..n_samples {
31 feature_values.push(data_vec[i * n_features + j] as f64);
32 }
33
34 let mean = feature_values.iter().sum::<f64>() / n_samples as f64;
36
37 let variance = feature_values
39 .iter()
40 .map(|&x| (x - mean).powi(2))
41 .sum::<f64>()
42 / n_samples as f64;
43 let std = variance.sqrt();
44
45 for &feature_val in feature_values.iter() {
47 let standardized_val = if std > 0.0 {
48 (feature_val - mean) / std
49 } else {
50 0.0
51 };
52 standardized_data.push(standardized_val as f32);
53 }
54 }
55
56 let mut reordered_data = vec![0.0; n_samples * n_features];
58 for i in 0..n_samples {
59 for j in 0..n_features {
60 reordered_data[i * n_features + j] = standardized_data[j * n_samples + i];
61 }
62 }
63
64 Tensor::from_vec(reordered_data, &[n_samples, n_features]).map_err(ClusterError::TensorError)
65}
66
67pub fn normalize_features(data: &Tensor) -> ClusterResult<Tensor> {
69 let n_samples = data.shape().dims()[0];
70 let n_features = data.shape().dims()[1];
71 let data_vec = data.to_vec().map_err(ClusterError::TensorError)?;
72
73 let mut normalized_data = Vec::with_capacity(data_vec.len());
74
75 for j in 0..n_features {
77 let mut feature_values = Vec::with_capacity(n_samples);
78 for i in 0..n_samples {
79 feature_values.push(data_vec[i * n_features + j] as f64);
80 }
81
82 let min_val = feature_values.iter().fold(f64::INFINITY, |a, &b| a.min(b));
83 let max_val = feature_values
84 .iter()
85 .fold(f64::NEG_INFINITY, |a, &b| a.max(b));
86 let range = max_val - min_val;
87
88 for &feature_val in feature_values.iter() {
90 let normalized_val = if range > 0.0 {
91 (feature_val - min_val) / range
92 } else {
93 0.0
94 };
95 normalized_data.push(normalized_val as f32);
96 }
97 }
98
99 let mut reordered_data = vec![0.0; n_samples * n_features];
101 for i in 0..n_samples {
102 for j in 0..n_features {
103 reordered_data[i * n_features + j] = normalized_data[j * n_samples + i];
104 }
105 }
106
107 Tensor::from_vec(reordered_data, &[n_samples, n_features]).map_err(ClusterError::TensorError)
108}
109
110pub fn unit_normalize(data: &Tensor) -> ClusterResult<Tensor> {
112 let n_samples = data.shape().dims()[0];
113 let n_features = data.shape().dims()[1];
114 let data_vec = data.to_vec().map_err(ClusterError::TensorError)?;
115
116 let mut normalized_data = Vec::with_capacity(data_vec.len());
117
118 for i in 0..n_samples {
119 let mut norm = 0.0;
121 for j in 0..n_features {
122 let val = data_vec[i * n_features + j] as f64;
123 norm += val * val;
124 }
125 norm = norm.sqrt();
126
127 for j in 0..n_features {
129 let normalized_val = if norm > 0.0 {
130 data_vec[i * n_features + j] as f64 / norm
131 } else {
132 0.0
133 };
134 normalized_data.push(normalized_val as f32);
135 }
136 }
137
138 Tensor::from_vec(normalized_data, &[n_samples, n_features]).map_err(ClusterError::TensorError)
139}
140
141pub fn robust_scale(data: &Tensor) -> ClusterResult<Tensor> {
143 let n_samples = data.shape().dims()[0];
144 let n_features = data.shape().dims()[1];
145 let data_vec = data.to_vec().map_err(ClusterError::TensorError)?;
146
147 let mut scaled_data = Vec::with_capacity(data_vec.len());
148
149 for j in 0..n_features {
151 let mut feature_values = Vec::with_capacity(n_samples);
152 for i in 0..n_samples {
153 feature_values.push(data_vec[i * n_features + j] as f64);
154 }
155
156 feature_values.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
158
159 let median = if n_samples % 2 == 0 {
160 (feature_values[n_samples / 2 - 1] + feature_values[n_samples / 2]) / 2.0
161 } else {
162 feature_values[n_samples / 2]
163 };
164
165 let q1_idx = n_samples / 4;
166 let q3_idx = 3 * n_samples / 4;
167 let iqr = feature_values[q3_idx] - feature_values[q1_idx];
168
169 for i in 0..n_samples {
171 let original_val = data_vec[i * n_features + j] as f64;
172 let scaled_val = if iqr > 0.0 {
173 (original_val - median) / iqr
174 } else {
175 0.0
176 };
177 scaled_data.push(scaled_val as f32);
178 }
179 }
180
181 let mut reordered_data = vec![0.0; n_samples * n_features];
183 for i in 0..n_samples {
184 for j in 0..n_features {
185 reordered_data[i * n_features + j] = scaled_data[j * n_samples + i];
186 }
187 }
188
189 Tensor::from_vec(reordered_data, &[n_samples, n_features]).map_err(ClusterError::TensorError)
190}