// ghostflow_ml/preprocessing.rs

//! Data preprocessing utilities

use ghostflow_core::Tensor;
/// Standard Scaler - standardize features by removing mean and scaling to unit variance.
///
/// `mean_` and `std_` hold the fitted per-feature statistics and stay `None`
/// until `fit` has been called.
#[derive(Debug, Clone)]
pub struct StandardScaler {
    /// Per-feature mean learned by `fit` (`None` before fitting).
    pub mean_: Option<Vec<f32>>,
    /// Per-feature standard deviation learned by `fit` (`None` before fitting).
    pub std_: Option<Vec<f32>>,
    /// Subtract the mean during `transform`?
    pub with_mean: bool,
    /// Divide by the standard deviation during `transform`?
    pub with_std: bool,
}
12
13impl StandardScaler {
14    pub fn new() -> Self {
15        StandardScaler {
16            mean_: None,
17            std_: None,
18            with_mean: true,
19            with_std: true,
20        }
21    }
22
23    pub fn with_mean(mut self, with_mean: bool) -> Self {
24        self.with_mean = with_mean;
25        self
26    }
27
28    pub fn with_std(mut self, with_std: bool) -> Self {
29        self.with_std = with_std;
30        self
31    }
32
33    pub fn fit(&mut self, x: &Tensor) {
34        let x_data = x.data_f32();
35        let n_samples = x.dims()[0];
36        let n_features = x.dims()[1];
37
38        let mut mean = vec![0.0f32; n_features];
39        let mut std = vec![0.0f32; n_features];
40
41        // Compute mean
42        for i in 0..n_samples {
43            for j in 0..n_features {
44                mean[j] += x_data[i * n_features + j];
45            }
46        }
47        for j in 0..n_features {
48            mean[j] /= n_samples as f32;
49        }
50
51        // Compute std
52        for i in 0..n_samples {
53            for j in 0..n_features {
54                let diff = x_data[i * n_features + j] - mean[j];
55                std[j] += diff * diff;
56            }
57        }
58        for j in 0..n_features {
59            std[j] = (std[j] / n_samples as f32).sqrt().max(1e-10);
60        }
61
62        self.mean_ = Some(mean);
63        self.std_ = Some(std);
64    }
65
66    pub fn transform(&self, x: &Tensor) -> Tensor {
67        let x_data = x.data_f32();
68        let n_samples = x.dims()[0];
69        let n_features = x.dims()[1];
70
71        let mean = self.mean_.as_ref().expect("Scaler not fitted");
72        let std = self.std_.as_ref().expect("Scaler not fitted");
73
74        let mut result = vec![0.0f32; n_samples * n_features];
75
76        for i in 0..n_samples {
77            for j in 0..n_features {
78                let mut val = x_data[i * n_features + j];
79                if self.with_mean {
80                    val -= mean[j];
81                }
82                if self.with_std {
83                    val /= std[j];
84                }
85                result[i * n_features + j] = val;
86            }
87        }
88
89        Tensor::from_slice(&result, &[n_samples, n_features]).unwrap()
90    }
91
92    pub fn fit_transform(&mut self, x: &Tensor) -> Tensor {
93        self.fit(x);
94        self.transform(x)
95    }
96
97    pub fn inverse_transform(&self, x: &Tensor) -> Tensor {
98        let x_data = x.data_f32();
99        let n_samples = x.dims()[0];
100        let n_features = x.dims()[1];
101
102        let mean = self.mean_.as_ref().expect("Scaler not fitted");
103        let std = self.std_.as_ref().expect("Scaler not fitted");
104
105        let mut result = vec![0.0f32; n_samples * n_features];
106
107        for i in 0..n_samples {
108            for j in 0..n_features {
109                let mut val = x_data[i * n_features + j];
110                if self.with_std {
111                    val *= std[j];
112                }
113                if self.with_mean {
114                    val += mean[j];
115                }
116                result[i * n_features + j] = val;
117            }
118        }
119
120        Tensor::from_slice(&result, &[n_samples, n_features]).unwrap()
121    }
122}
123
124impl Default for StandardScaler {
125    fn default() -> Self {
126        Self::new()
127    }
128}
129
/// MinMax Scaler - scale features to a given range.
///
/// `min_` / `max_` hold the fitted per-feature extrema and stay `None`
/// until `fit` has been called.
#[derive(Debug, Clone)]
pub struct MinMaxScaler {
    /// Per-feature minimum observed during `fit`.
    pub min_: Option<Vec<f32>>,
    /// Per-feature maximum observed during `fit`.
    pub max_: Option<Vec<f32>>,
    /// Target output range `(min, max)`; `new` defaults it to `(0.0, 1.0)`.
    pub feature_range: (f32, f32),
}
136
137impl MinMaxScaler {
138    pub fn new() -> Self {
139        MinMaxScaler {
140            min_: None,
141            max_: None,
142            feature_range: (0.0, 1.0),
143        }
144    }
145
146    pub fn feature_range(mut self, min: f32, max: f32) -> Self {
147        self.feature_range = (min, max);
148        self
149    }
150
151    pub fn fit(&mut self, x: &Tensor) {
152        let x_data = x.data_f32();
153        let n_samples = x.dims()[0];
154        let n_features = x.dims()[1];
155
156        let mut min = vec![f32::INFINITY; n_features];
157        let mut max = vec![f32::NEG_INFINITY; n_features];
158
159        for i in 0..n_samples {
160            for j in 0..n_features {
161                let val = x_data[i * n_features + j];
162                min[j] = min[j].min(val);
163                max[j] = max[j].max(val);
164            }
165        }
166
167        self.min_ = Some(min);
168        self.max_ = Some(max);
169    }
170
171    pub fn transform(&self, x: &Tensor) -> Tensor {
172        let x_data = x.data_f32();
173        let n_samples = x.dims()[0];
174        let n_features = x.dims()[1];
175
176        let min = self.min_.as_ref().expect("Scaler not fitted");
177        let max = self.max_.as_ref().expect("Scaler not fitted");
178        let (range_min, range_max) = self.feature_range;
179
180        let mut result = vec![0.0f32; n_samples * n_features];
181
182        for i in 0..n_samples {
183            for j in 0..n_features {
184                let val = x_data[i * n_features + j];
185                let scale = max[j] - min[j];
186                let scaled = if scale > 1e-10 {
187                    (val - min[j]) / scale
188                } else {
189                    0.5
190                };
191                result[i * n_features + j] = scaled * (range_max - range_min) + range_min;
192            }
193        }
194
195        Tensor::from_slice(&result, &[n_samples, n_features]).unwrap()
196    }
197
198    pub fn fit_transform(&mut self, x: &Tensor) -> Tensor {
199        self.fit(x);
200        self.transform(x)
201    }
202}
203
204impl Default for MinMaxScaler {
205    fn default() -> Self {
206        Self::new()
207    }
208}
209
210
211/// Normalizer - normalize samples individually to unit norm
212pub struct Normalizer {
213    pub norm: Norm,
214}
215
/// Norm used by [`Normalizer`] to scale each sample.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum Norm {
    /// Sum of absolute values.
    L1,
    /// Euclidean norm (square root of the sum of squares).
    L2,
    /// Largest absolute value.
    Max,
}
222
223impl Normalizer {
224    pub fn new(norm: Norm) -> Self {
225        Normalizer { norm }
226    }
227
228    pub fn transform(&self, x: &Tensor) -> Tensor {
229        let x_data = x.data_f32();
230        let n_samples = x.dims()[0];
231        let n_features = x.dims()[1];
232
233        let mut result = vec![0.0f32; n_samples * n_features];
234
235        for i in 0..n_samples {
236            let row = &x_data[i * n_features..(i + 1) * n_features];
237            
238            let norm_val = match self.norm {
239                Norm::L1 => row.iter().map(|&x| x.abs()).sum::<f32>(),
240                Norm::L2 => row.iter().map(|&x| x * x).sum::<f32>().sqrt(),
241                Norm::Max => row.iter().map(|&x| x.abs()).fold(0.0f32, f32::max),
242            };
243
244            let norm_val = norm_val.max(1e-10);
245
246            for j in 0..n_features {
247                result[i * n_features + j] = row[j] / norm_val;
248            }
249        }
250
251        Tensor::from_slice(&result, &[n_samples, n_features]).unwrap()
252    }
253}
254
/// Label Encoder - encode target labels with value between 0 and n_classes-1.
#[derive(Debug, Clone)]
pub struct LabelEncoder {
    /// Sorted, deduplicated class vocabulary; `None` until `fit` has run.
    pub classes_: Option<Vec<String>>,
}
259
260impl LabelEncoder {
261    pub fn new() -> Self {
262        LabelEncoder { classes_: None }
263    }
264
265    pub fn fit(&mut self, labels: &[String]) {
266        let mut classes: Vec<String> = labels.to_vec();
267        classes.sort();
268        classes.dedup();
269        self.classes_ = Some(classes);
270    }
271
272    pub fn transform(&self, labels: &[String]) -> Vec<usize> {
273        let classes = self.classes_.as_ref().expect("Encoder not fitted");
274        
275        labels.iter()
276            .map(|label| {
277                classes.iter().position(|c| c == label).unwrap_or(0)
278            })
279            .collect()
280    }
281
282    pub fn fit_transform(&mut self, labels: &[String]) -> Vec<usize> {
283        self.fit(labels);
284        self.transform(labels)
285    }
286
287    pub fn inverse_transform(&self, encoded: &[usize]) -> Vec<String> {
288        let classes = self.classes_.as_ref().expect("Encoder not fitted");
289        
290        encoded.iter()
291            .map(|&idx| {
292                classes.get(idx).cloned().unwrap_or_default()
293            })
294            .collect()
295    }
296}
297
298impl Default for LabelEncoder {
299    fn default() -> Self {
300        Self::new()
301    }
302}
303
/// One-Hot Encoder.
#[derive(Debug, Clone)]
pub struct OneHotEncoder {
    /// Per-feature category counts inferred by `fit`; `None` until fitted.
    pub n_categories_: Option<Vec<usize>>,
}
308
309impl OneHotEncoder {
310    pub fn new() -> Self {
311        OneHotEncoder { n_categories_: None }
312    }
313
314    pub fn fit(&mut self, x: &Tensor) {
315        let x_data = x.data_f32();
316        let n_samples = x.dims()[0];
317        let n_features = x.dims()[1];
318
319        let mut n_categories = vec![0usize; n_features];
320
321        for j in 0..n_features {
322            let max_val = (0..n_samples)
323                .map(|i| x_data[i * n_features + j] as usize)
324                .max()
325                .unwrap_or(0);
326            n_categories[j] = max_val + 1;
327        }
328
329        self.n_categories_ = Some(n_categories);
330    }
331
332    pub fn transform(&self, x: &Tensor) -> Tensor {
333        let x_data = x.data_f32();
334        let n_samples = x.dims()[0];
335        let n_features = x.dims()[1];
336
337        let n_categories = self.n_categories_.as_ref().expect("Encoder not fitted");
338        let total_cols: usize = n_categories.iter().sum();
339
340        let mut result = vec![0.0f32; n_samples * total_cols];
341
342        for i in 0..n_samples {
343            let mut col_offset = 0;
344            for j in 0..n_features {
345                let category = x_data[i * n_features + j] as usize;
346                if category < n_categories[j] {
347                    result[i * total_cols + col_offset + category] = 1.0;
348                }
349                col_offset += n_categories[j];
350            }
351        }
352
353        Tensor::from_slice(&result, &[n_samples, total_cols]).unwrap()
354    }
355
356    pub fn fit_transform(&mut self, x: &Tensor) -> Tensor {
357        self.fit(x);
358        self.transform(x)
359    }
360}
361
362impl Default for OneHotEncoder {
363    fn default() -> Self {
364        Self::new()
365    }
366}
367
368/// Train-test split utility
369pub fn train_test_split(
370    x: &Tensor,
371    y: &Tensor,
372    test_size: f32,
373    shuffle: bool,
374) -> (Tensor, Tensor, Tensor, Tensor) {
375    let x_data = x.data_f32();
376    let y_data = y.data_f32();
377    let n_samples = x.dims()[0];
378    let n_features = x.dims()[1];
379
380    let mut indices: Vec<usize> = (0..n_samples).collect();
381    
382    if shuffle {
383        use rand::seq::SliceRandom;
384        let mut rng = rand::thread_rng();
385        indices.shuffle(&mut rng);
386    }
387
388    let n_test = (n_samples as f32 * test_size).round() as usize;
389    let n_train = n_samples - n_test;
390
391    let mut x_train = vec![0.0f32; n_train * n_features];
392    let mut x_test = vec![0.0f32; n_test * n_features];
393    let mut y_train = vec![0.0f32; n_train];
394    let mut y_test = vec![0.0f32; n_test];
395
396    for (new_idx, &orig_idx) in indices.iter().enumerate() {
397        if new_idx < n_train {
398            for j in 0..n_features {
399                x_train[new_idx * n_features + j] = x_data[orig_idx * n_features + j];
400            }
401            y_train[new_idx] = y_data[orig_idx];
402        } else {
403            let test_idx = new_idx - n_train;
404            for j in 0..n_features {
405                x_test[test_idx * n_features + j] = x_data[orig_idx * n_features + j];
406            }
407            y_test[test_idx] = y_data[orig_idx];
408        }
409    }
410
411    (
412        Tensor::from_slice(&x_train, &[n_train, n_features]).unwrap(),
413        Tensor::from_slice(&x_test, &[n_test, n_features]).unwrap(),
414        Tensor::from_slice(&y_train, &[n_train]).unwrap(),
415        Tensor::from_slice(&y_test, &[n_test]).unwrap(),
416    )
417}
418
#[cfg(test)]
mod tests {
    use super::*;

    // Fitting + transforming should z-score the data (overall mean ~ 0).
    #[test]
    fn test_standard_scaler() {
        let x = Tensor::from_slice(&[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0], &[3, 2]).unwrap();

        let mut scaler = StandardScaler::new();
        let scaled = scaler.fit_transform(&x);

        assert_eq!(scaled.dims(), &[3, 2]);

        let values = scaled.storage().as_slice::<f32>().to_vec();
        let overall_mean = values.iter().sum::<f32>() / values.len() as f32;
        assert!(overall_mean.abs() < 0.1);
    }

    // With the default range, every scaled value must land in [0, 1].
    #[test]
    fn test_minmax_scaler() {
        let x = Tensor::from_slice(&[0.0f32, 10.0, 5.0, 20.0, 10.0, 30.0], &[3, 2]).unwrap();

        let mut scaler = MinMaxScaler::new();
        let scaled = scaler.fit_transform(&x);

        let values = scaled.storage().as_slice::<f32>().to_vec();
        assert!(values.iter().all(|&v| (0.0..=1.0).contains(&v)));
    }

    // A 40% test split of 5 samples yields 3 train rows and 2 test rows.
    #[test]
    fn test_train_test_split() {
        let x = Tensor::from_slice(
            &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
            &[5, 2],
        )
        .unwrap();
        let y = Tensor::from_slice(&[0.0f32, 1.0, 0.0, 1.0, 0.0], &[5]).unwrap();

        let (x_train, x_test, y_train, y_test) = train_test_split(&x, &y, 0.4, false);

        assert_eq!(x_train.dims()[0], 3);
        assert_eq!(x_test.dims()[0], 2);
        assert_eq!(y_train.dims()[0], 3);
        assert_eq!(y_test.dims()[0], 2);
    }
}
474
475