1use ghostflow_core::Tensor;
4
5pub struct StandardScaler {
7 pub mean_: Option<Vec<f32>>,
8 pub std_: Option<Vec<f32>>,
9 pub with_mean: bool,
10 pub with_std: bool,
11}
12
13impl StandardScaler {
14 pub fn new() -> Self {
15 StandardScaler {
16 mean_: None,
17 std_: None,
18 with_mean: true,
19 with_std: true,
20 }
21 }
22
23 pub fn with_mean(mut self, with_mean: bool) -> Self {
24 self.with_mean = with_mean;
25 self
26 }
27
28 pub fn with_std(mut self, with_std: bool) -> Self {
29 self.with_std = with_std;
30 self
31 }
32
33 pub fn fit(&mut self, x: &Tensor) {
34 let x_data = x.data_f32();
35 let n_samples = x.dims()[0];
36 let n_features = x.dims()[1];
37
38 let mut mean = vec![0.0f32; n_features];
39 let mut std = vec![0.0f32; n_features];
40
41 for i in 0..n_samples {
43 for j in 0..n_features {
44 mean[j] += x_data[i * n_features + j];
45 }
46 }
47 for j in 0..n_features {
48 mean[j] /= n_samples as f32;
49 }
50
51 for i in 0..n_samples {
53 for j in 0..n_features {
54 let diff = x_data[i * n_features + j] - mean[j];
55 std[j] += diff * diff;
56 }
57 }
58 for j in 0..n_features {
59 std[j] = (std[j] / n_samples as f32).sqrt().max(1e-10);
60 }
61
62 self.mean_ = Some(mean);
63 self.std_ = Some(std);
64 }
65
66 pub fn transform(&self, x: &Tensor) -> Tensor {
67 let x_data = x.data_f32();
68 let n_samples = x.dims()[0];
69 let n_features = x.dims()[1];
70
71 let mean = self.mean_.as_ref().expect("Scaler not fitted");
72 let std = self.std_.as_ref().expect("Scaler not fitted");
73
74 let mut result = vec![0.0f32; n_samples * n_features];
75
76 for i in 0..n_samples {
77 for j in 0..n_features {
78 let mut val = x_data[i * n_features + j];
79 if self.with_mean {
80 val -= mean[j];
81 }
82 if self.with_std {
83 val /= std[j];
84 }
85 result[i * n_features + j] = val;
86 }
87 }
88
89 Tensor::from_slice(&result, &[n_samples, n_features]).unwrap()
90 }
91
92 pub fn fit_transform(&mut self, x: &Tensor) -> Tensor {
93 self.fit(x);
94 self.transform(x)
95 }
96
97 pub fn inverse_transform(&self, x: &Tensor) -> Tensor {
98 let x_data = x.data_f32();
99 let n_samples = x.dims()[0];
100 let n_features = x.dims()[1];
101
102 let mean = self.mean_.as_ref().expect("Scaler not fitted");
103 let std = self.std_.as_ref().expect("Scaler not fitted");
104
105 let mut result = vec![0.0f32; n_samples * n_features];
106
107 for i in 0..n_samples {
108 for j in 0..n_features {
109 let mut val = x_data[i * n_features + j];
110 if self.with_std {
111 val *= std[j];
112 }
113 if self.with_mean {
114 val += mean[j];
115 }
116 result[i * n_features + j] = val;
117 }
118 }
119
120 Tensor::from_slice(&result, &[n_samples, n_features]).unwrap()
121 }
122}
123
124impl Default for StandardScaler {
125 fn default() -> Self {
126 Self::new()
127 }
128}
129
130pub struct MinMaxScaler {
132 pub min_: Option<Vec<f32>>,
133 pub max_: Option<Vec<f32>>,
134 pub feature_range: (f32, f32),
135}
136
137impl MinMaxScaler {
138 pub fn new() -> Self {
139 MinMaxScaler {
140 min_: None,
141 max_: None,
142 feature_range: (0.0, 1.0),
143 }
144 }
145
146 pub fn feature_range(mut self, min: f32, max: f32) -> Self {
147 self.feature_range = (min, max);
148 self
149 }
150
151 pub fn fit(&mut self, x: &Tensor) {
152 let x_data = x.data_f32();
153 let n_samples = x.dims()[0];
154 let n_features = x.dims()[1];
155
156 let mut min = vec![f32::INFINITY; n_features];
157 let mut max = vec![f32::NEG_INFINITY; n_features];
158
159 for i in 0..n_samples {
160 for j in 0..n_features {
161 let val = x_data[i * n_features + j];
162 min[j] = min[j].min(val);
163 max[j] = max[j].max(val);
164 }
165 }
166
167 self.min_ = Some(min);
168 self.max_ = Some(max);
169 }
170
171 pub fn transform(&self, x: &Tensor) -> Tensor {
172 let x_data = x.data_f32();
173 let n_samples = x.dims()[0];
174 let n_features = x.dims()[1];
175
176 let min = self.min_.as_ref().expect("Scaler not fitted");
177 let max = self.max_.as_ref().expect("Scaler not fitted");
178 let (range_min, range_max) = self.feature_range;
179
180 let mut result = vec![0.0f32; n_samples * n_features];
181
182 for i in 0..n_samples {
183 for j in 0..n_features {
184 let val = x_data[i * n_features + j];
185 let scale = max[j] - min[j];
186 let scaled = if scale > 1e-10 {
187 (val - min[j]) / scale
188 } else {
189 0.5
190 };
191 result[i * n_features + j] = scaled * (range_max - range_min) + range_min;
192 }
193 }
194
195 Tensor::from_slice(&result, &[n_samples, n_features]).unwrap()
196 }
197
198 pub fn fit_transform(&mut self, x: &Tensor) -> Tensor {
199 self.fit(x);
200 self.transform(x)
201 }
202}
203
204impl Default for MinMaxScaler {
205 fn default() -> Self {
206 Self::new()
207 }
208}
209
210
211pub struct Normalizer {
213 pub norm: Norm,
214}
215
216#[derive(Clone, Copy, Debug)]
217pub enum Norm {
218 L1,
219 L2,
220 Max,
221}
222
223impl Normalizer {
224 pub fn new(norm: Norm) -> Self {
225 Normalizer { norm }
226 }
227
228 pub fn transform(&self, x: &Tensor) -> Tensor {
229 let x_data = x.data_f32();
230 let n_samples = x.dims()[0];
231 let n_features = x.dims()[1];
232
233 let mut result = vec![0.0f32; n_samples * n_features];
234
235 for i in 0..n_samples {
236 let row = &x_data[i * n_features..(i + 1) * n_features];
237
238 let norm_val = match self.norm {
239 Norm::L1 => row.iter().map(|&x| x.abs()).sum::<f32>(),
240 Norm::L2 => row.iter().map(|&x| x * x).sum::<f32>().sqrt(),
241 Norm::Max => row.iter().map(|&x| x.abs()).fold(0.0f32, f32::max),
242 };
243
244 let norm_val = norm_val.max(1e-10);
245
246 for j in 0..n_features {
247 result[i * n_features + j] = row[j] / norm_val;
248 }
249 }
250
251 Tensor::from_slice(&result, &[n_samples, n_features]).unwrap()
252 }
253}
254
/// Maps string labels to integer indices and back.
///
/// The index of a label is its position in `classes_`, which `fit` fills with
/// the sorted, de-duplicated labels it was given.
pub struct LabelEncoder {
    /// Known classes in index order; `None` until the encoder is fitted.
    pub classes_: Option<Vec<String>>,
}

impl LabelEncoder {
    /// Creates an unfitted encoder.
    pub fn new() -> Self {
        LabelEncoder { classes_: None }
    }

    /// Learns the set of classes from `labels` (sorted, duplicates removed).
    pub fn fit(&mut self, labels: &[String]) {
        let mut classes: Vec<String> = labels.to_vec();
        // Equal strings are indistinguishable, so an unstable sort gives the
        // same result without the extra allocation of a stable sort.
        classes.sort_unstable();
        classes.dedup();
        self.classes_ = Some(classes);
    }

    /// Encodes each label as its index in `classes_`.
    ///
    /// Labels that are not among the fitted classes are encoded as `0`, which
    /// callers cannot distinguish from the first class.
    ///
    /// # Panics
    ///
    /// Panics with "Encoder not fitted" if `fit` has not been called.
    pub fn transform(&self, labels: &[String]) -> Vec<usize> {
        let classes = self.classes_.as_ref().expect("Encoder not fitted");

        // Build the label -> index table once so each lookup is O(1) instead
        // of a linear scan over all classes per label.
        let mut index_of: std::collections::HashMap<&str, usize> =
            std::collections::HashMap::with_capacity(classes.len());
        for (idx, class) in classes.iter().enumerate() {
            // `or_insert` keeps the first index for a repeated entry, matching
            // a left-to-right search even if `classes_` was assigned by hand.
            index_of.entry(class.as_str()).or_insert(idx);
        }

        labels
            .iter()
            .map(|label| index_of.get(label.as_str()).copied().unwrap_or(0))
            .collect()
    }

    /// Fits the encoder on `labels` and returns their encoding.
    pub fn fit_transform(&mut self, labels: &[String]) -> Vec<usize> {
        self.fit(labels);
        self.transform(labels)
    }

    /// Decodes indices back to labels.
    ///
    /// An index outside the fitted classes decodes to an empty string.
    ///
    /// # Panics
    ///
    /// Panics with "Encoder not fitted" if `fit` has not been called.
    pub fn inverse_transform(&self, encoded: &[usize]) -> Vec<String> {
        let classes = self.classes_.as_ref().expect("Encoder not fitted");

        encoded
            .iter()
            .map(|&idx| classes.get(idx).cloned().unwrap_or_default())
            .collect()
    }
}

impl Default for LabelEncoder {
    /// Equivalent to [`LabelEncoder::new`].
    fn default() -> Self {
        Self::new()
    }
}
303
304pub struct OneHotEncoder {
306 pub n_categories_: Option<Vec<usize>>,
307}
308
309impl OneHotEncoder {
310 pub fn new() -> Self {
311 OneHotEncoder { n_categories_: None }
312 }
313
314 pub fn fit(&mut self, x: &Tensor) {
315 let x_data = x.data_f32();
316 let n_samples = x.dims()[0];
317 let n_features = x.dims()[1];
318
319 let mut n_categories = vec![0usize; n_features];
320
321 for j in 0..n_features {
322 let max_val = (0..n_samples)
323 .map(|i| x_data[i * n_features + j] as usize)
324 .max()
325 .unwrap_or(0);
326 n_categories[j] = max_val + 1;
327 }
328
329 self.n_categories_ = Some(n_categories);
330 }
331
332 pub fn transform(&self, x: &Tensor) -> Tensor {
333 let x_data = x.data_f32();
334 let n_samples = x.dims()[0];
335 let n_features = x.dims()[1];
336
337 let n_categories = self.n_categories_.as_ref().expect("Encoder not fitted");
338 let total_cols: usize = n_categories.iter().sum();
339
340 let mut result = vec![0.0f32; n_samples * total_cols];
341
342 for i in 0..n_samples {
343 let mut col_offset = 0;
344 for j in 0..n_features {
345 let category = x_data[i * n_features + j] as usize;
346 if category < n_categories[j] {
347 result[i * total_cols + col_offset + category] = 1.0;
348 }
349 col_offset += n_categories[j];
350 }
351 }
352
353 Tensor::from_slice(&result, &[n_samples, total_cols]).unwrap()
354 }
355
356 pub fn fit_transform(&mut self, x: &Tensor) -> Tensor {
357 self.fit(x);
358 self.transform(x)
359 }
360}
361
362impl Default for OneHotEncoder {
363 fn default() -> Self {
364 Self::new()
365 }
366}
367
368pub fn train_test_split(
370 x: &Tensor,
371 y: &Tensor,
372 test_size: f32,
373 shuffle: bool,
374) -> (Tensor, Tensor, Tensor, Tensor) {
375 let x_data = x.data_f32();
376 let y_data = y.data_f32();
377 let n_samples = x.dims()[0];
378 let n_features = x.dims()[1];
379
380 let mut indices: Vec<usize> = (0..n_samples).collect();
381
382 if shuffle {
383 use rand::seq::SliceRandom;
384 let mut rng = rand::thread_rng();
385 indices.shuffle(&mut rng);
386 }
387
388 let n_test = (n_samples as f32 * test_size).round() as usize;
389 let n_train = n_samples - n_test;
390
391 let mut x_train = vec![0.0f32; n_train * n_features];
392 let mut x_test = vec![0.0f32; n_test * n_features];
393 let mut y_train = vec![0.0f32; n_train];
394 let mut y_test = vec![0.0f32; n_test];
395
396 for (new_idx, &orig_idx) in indices.iter().enumerate() {
397 if new_idx < n_train {
398 for j in 0..n_features {
399 x_train[new_idx * n_features + j] = x_data[orig_idx * n_features + j];
400 }
401 y_train[new_idx] = y_data[orig_idx];
402 } else {
403 let test_idx = new_idx - n_train;
404 for j in 0..n_features {
405 x_test[test_idx * n_features + j] = x_data[orig_idx * n_features + j];
406 }
407 y_test[test_idx] = y_data[orig_idx];
408 }
409 }
410
411 (
412 Tensor::from_slice(&x_train, &[n_train, n_features]).unwrap(),
413 Tensor::from_slice(&x_test, &[n_test, n_features]).unwrap(),
414 Tensor::from_slice(&y_train, &[n_train]).unwrap(),
415 Tensor::from_slice(&y_test, &[n_test]).unwrap(),
416 )
417}
418
#[cfg(test)]
mod tests {
    use super::*;

    /// Standardizing a small 3x2 matrix keeps its shape and centers the data.
    #[test]
    fn test_standard_scaler() {
        let raw = [1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0];
        let input = Tensor::from_slice(&raw, &[3, 2]).unwrap();

        let mut scaler = StandardScaler::new();
        let output = scaler.fit_transform(&input);

        assert_eq!(output.dims(), &[3, 2]);

        // Every column is centered, so the overall average must be near zero.
        let flat = output.storage().as_slice::<f32>().to_vec();
        let total: f32 = flat.iter().sum();
        let average = total / flat.len() as f32;
        assert!(average.abs() < 0.1);
    }

    /// With the default range every scaled value lies inside `[0, 1]`.
    #[test]
    fn test_minmax_scaler() {
        let raw = [0.0f32, 10.0, 5.0, 20.0, 10.0, 30.0];
        let input = Tensor::from_slice(&raw, &[3, 2]).unwrap();

        let mut scaler = MinMaxScaler::new();
        let output = scaler.fit_transform(&input);

        let flat = output.storage().as_slice::<f32>().to_vec();
        assert!(flat.iter().all(|v| (0.0..=1.0).contains(v)));
    }

    /// A 40% test share of five samples gives three training and two test rows.
    #[test]
    fn test_train_test_split() {
        let features = [1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0];
        let targets = [0.0f32, 1.0, 0.0, 1.0, 0.0];
        let x = Tensor::from_slice(&features, &[5, 2]).unwrap();
        let y = Tensor::from_slice(&targets, &[5]).unwrap();

        let (x_train, x_test, y_train, y_test) = train_test_split(&x, &y, 0.4, false);

        assert_eq!(x_train.dims()[0], 3);
        assert_eq!(x_test.dims()[0], 2);
        assert_eq!(y_train.dims()[0], 3);
        assert_eq!(y_test.dims()[0], 2);
    }
}
474
475