// ghostflow_ml/preprocessing_extended.rs

1//! Extended Preprocessing - RobustScaler, MaxAbsScaler, OrdinalEncoder, etc.
2
3use ghostflow_core::Tensor;
4use std::collections::HashMap;
5
/// Robust Scaler - scales using statistics robust to outliers
/// Uses median and IQR instead of mean and std
pub struct RobustScaler {
    /// If true, subtract each feature's median during `transform`.
    pub with_centering: bool,
    /// If true, divide each feature by its interquartile range during `transform`.
    pub with_scaling: bool,
    /// Percentile pair (low, high) used to compute the scaling range; defaults to (25.0, 75.0).
    pub quantile_range: (f32, f32),
    // Per-feature medians learned by `fit` (None until fitted).
    center_: Option<Vec<f32>>,
    // Per-feature IQRs learned by `fit` (None until fitted).
    scale_: Option<Vec<f32>>,
}
15
16impl RobustScaler {
17    pub fn new() -> Self {
18        RobustScaler {
19            with_centering: true,
20            with_scaling: true,
21            quantile_range: (25.0, 75.0),
22            center_: None,
23            scale_: None,
24        }
25    }
26
27    pub fn with_centering(mut self, c: bool) -> Self { self.with_centering = c; self }
28    pub fn with_scaling(mut self, s: bool) -> Self { self.with_scaling = s; self }
29    pub fn quantile_range(mut self, low: f32, high: f32) -> Self { 
30        self.quantile_range = (low, high); 
31        self 
32    }
33
34    fn quantile(sorted: &[f32], q: f32) -> f32 {
35        if sorted.is_empty() { return 0.0; }
36        let idx = (q / 100.0 * (sorted.len() - 1) as f32) as usize;
37        let idx = idx.min(sorted.len() - 1);
38        sorted[idx]
39    }
40
41    pub fn fit(&mut self, x: &Tensor) {
42        let x_data = x.data_f32();
43        let n_samples = x.dims()[0];
44        let n_features = x.dims()[1];
45
46        let mut center = vec![0.0f32; n_features];
47        let mut scale = vec![1.0f32; n_features];
48
49        for j in 0..n_features {
50            let mut values: Vec<f32> = (0..n_samples)
51                .map(|i| x_data[i * n_features + j])
52                .collect();
53            values.sort_by(|a, b| a.partial_cmp(b).unwrap());
54
55            if self.with_centering {
56                center[j] = Self::quantile(&values, 50.0); // Median
57            }
58
59            if self.with_scaling {
60                let q_low = Self::quantile(&values, self.quantile_range.0);
61                let q_high = Self::quantile(&values, self.quantile_range.1);
62                let iqr = q_high - q_low;
63                scale[j] = if iqr > 1e-10 { iqr } else { 1.0 };
64            }
65        }
66
67        self.center_ = Some(center);
68        self.scale_ = Some(scale);
69    }
70
71    pub fn transform(&self, x: &Tensor) -> Tensor {
72        let x_data = x.data_f32();
73        let n_samples = x.dims()[0];
74        let n_features = x.dims()[1];
75
76        let center = self.center_.as_ref().expect("Scaler not fitted");
77        let scale = self.scale_.as_ref().unwrap();
78
79        let result: Vec<f32> = (0..n_samples)
80            .flat_map(|i| {
81                (0..n_features).map(|j| {
82                    let mut val = x_data[i * n_features + j];
83                    if self.with_centering {
84                        val -= center[j];
85                    }
86                    if self.with_scaling {
87                        val /= scale[j];
88                    }
89                    val
90                }).collect::<Vec<_>>()
91            })
92            .collect();
93
94        Tensor::from_slice(&result, &[n_samples, n_features]).unwrap()
95    }
96
97    pub fn fit_transform(&mut self, x: &Tensor) -> Tensor {
98        self.fit(x);
99        self.transform(x)
100    }
101
102    pub fn inverse_transform(&self, x: &Tensor) -> Tensor {
103        let x_data = x.data_f32();
104        let n_samples = x.dims()[0];
105        let n_features = x.dims()[1];
106
107        let center = self.center_.as_ref().expect("Scaler not fitted");
108        let scale = self.scale_.as_ref().unwrap();
109
110        let result: Vec<f32> = (0..n_samples)
111            .flat_map(|i| {
112                (0..n_features).map(|j| {
113                    let mut val = x_data[i * n_features + j];
114                    if self.with_scaling {
115                        val *= scale[j];
116                    }
117                    if self.with_centering {
118                        val += center[j];
119                    }
120                    val
121                }).collect::<Vec<_>>()
122            })
123            .collect();
124
125        Tensor::from_slice(&result, &[n_samples, n_features]).unwrap()
126    }
127}
128
129impl Default for RobustScaler {
130    fn default() -> Self { Self::new() }
131}
132
133/// MaxAbs Scaler - scales by maximum absolute value
134pub struct MaxAbsScaler {
135    max_abs_: Option<Vec<f32>>,
136}
137
138impl MaxAbsScaler {
139    pub fn new() -> Self {
140        MaxAbsScaler { max_abs_: None }
141    }
142
143    pub fn fit(&mut self, x: &Tensor) {
144        let x_data = x.data_f32();
145        let n_samples = x.dims()[0];
146        let n_features = x.dims()[1];
147
148        let max_abs: Vec<f32> = (0..n_features)
149            .map(|j| {
150                (0..n_samples)
151                    .map(|i| x_data[i * n_features + j].abs())
152                    .fold(0.0f32, f32::max)
153                    .max(1e-10)
154            })
155            .collect();
156
157        self.max_abs_ = Some(max_abs);
158    }
159
160    pub fn transform(&self, x: &Tensor) -> Tensor {
161        let x_data = x.data_f32();
162        let n_samples = x.dims()[0];
163        let n_features = x.dims()[1];
164
165        let max_abs = self.max_abs_.as_ref().expect("Scaler not fitted");
166
167        let result: Vec<f32> = (0..n_samples)
168            .flat_map(|i| {
169                (0..n_features).map(|j| x_data[i * n_features + j] / max_abs[j]).collect::<Vec<_>>()
170            })
171            .collect();
172
173        Tensor::from_slice(&result, &[n_samples, n_features]).unwrap()
174    }
175
176    pub fn fit_transform(&mut self, x: &Tensor) -> Tensor {
177        self.fit(x);
178        self.transform(x)
179    }
180
181    pub fn inverse_transform(&self, x: &Tensor) -> Tensor {
182        let x_data = x.data_f32();
183        let n_samples = x.dims()[0];
184        let n_features = x.dims()[1];
185
186        let max_abs = self.max_abs_.as_ref().expect("Scaler not fitted");
187
188        let result: Vec<f32> = (0..n_samples)
189            .flat_map(|i| {
190                (0..n_features).map(|j| x_data[i * n_features + j] * max_abs[j]).collect::<Vec<_>>()
191            })
192            .collect();
193
194        Tensor::from_slice(&result, &[n_samples, n_features]).unwrap()
195    }
196}
197
198impl Default for MaxAbsScaler {
199    fn default() -> Self { Self::new() }
200}
201
/// Ordinal Encoder - encodes categorical features as integers
///
/// Each feature's categories are sorted lexicographically and mapped to
/// `0.0, 1.0, 2.0, ...` in that order.
pub struct OrdinalEncoder {
    /// Policy for categories seen at transform time but absent from `fit`.
    pub handle_unknown: HandleUnknown,
    /// Code emitted for unknown categories under `UseEncodedValue`
    /// (defaults to -1.0 when `None`).
    pub unknown_value: Option<f32>,
    // Sorted, deduplicated category vocabulary per feature (None until fitted).
    categories_: Option<Vec<Vec<String>>>,
}

/// What `transform` does when it meets a category that was not fitted.
#[derive(Clone, Copy)]
pub enum HandleUnknown {
    /// Panic with a descriptive message.
    Error,
    /// Emit `unknown_value` (or -1.0 if it is unset).
    UseEncodedValue,
}

impl OrdinalEncoder {
    /// Creates an encoder that panics on unknown categories.
    pub fn new() -> Self {
        OrdinalEncoder {
            handle_unknown: HandleUnknown::Error,
            unknown_value: None,
            categories_: None,
        }
    }

    /// Builder: set the unknown-category policy and optional fallback code.
    pub fn handle_unknown(mut self, h: HandleUnknown, value: Option<f32>) -> Self {
        self.handle_unknown = h;
        self.unknown_value = value;
        self
    }

    /// Learns the sorted category vocabulary of every feature.
    ///
    /// Collect-then-`sort`/`dedup` is O(n log n) per feature, replacing the
    /// previous `contains` + `push` scan which was O(n * m) in the number of
    /// distinct categories.
    pub fn fit(&mut self, x: &[Vec<String>]) {
        let n_features = if x.is_empty() { 0 } else { x[0].len() };

        let mut categories: Vec<Vec<String>> = vec![Vec::new(); n_features];

        for row in x {
            for (j, val) in row.iter().enumerate() {
                categories[j].push(val.clone());
            }
        }

        // Sorted + deduplicated: stable codes and binary-search lookups.
        for cats in &mut categories {
            cats.sort();
            cats.dedup();
        }

        self.categories_ = Some(categories);
    }

    /// Encodes each value as the index of its category in the sorted
    /// vocabulary. Panics if not fitted, or on an unknown category under
    /// the `Error` policy.
    pub fn transform(&self, x: &[Vec<String>]) -> Vec<Vec<f32>> {
        let categories = self.categories_.as_ref().expect("Encoder not fitted");

        x.iter()
            .map(|row| {
                row.iter()
                    .enumerate()
                    .map(|(j, val)| {
                        // The vocabulary is sorted and unique, so binary
                        // search yields the same index as a linear scan
                        // in O(log n).
                        match categories[j].binary_search(val) {
                            Ok(idx) => idx as f32,
                            Err(_) => match self.handle_unknown {
                                HandleUnknown::UseEncodedValue => {
                                    self.unknown_value.unwrap_or(-1.0)
                                }
                                HandleUnknown::Error => {
                                    panic!("Unknown category: {}", val);
                                }
                            },
                        }
                    })
                    .collect()
            })
            .collect()
    }

    /// Convenience: `fit` followed by `transform` on the same data.
    pub fn fit_transform(&mut self, x: &[Vec<String>]) -> Vec<Vec<f32>> {
        self.fit(x);
        self.transform(x)
    }

    /// Maps codes back to category strings; out-of-range codes become the
    /// literal string "unknown". Panics if not fitted.
    pub fn inverse_transform(&self, x: &[Vec<f32>]) -> Vec<Vec<String>> {
        let categories = self.categories_.as_ref().expect("Encoder not fitted");

        x.iter()
            .map(|row| {
                row.iter()
                    .enumerate()
                    .map(|(j, &val)| {
                        let idx = val as usize;
                        if idx < categories[j].len() {
                            categories[j][idx].clone()
                        } else {
                            "unknown".to_string()
                        }
                    })
                    .collect()
            })
            .collect()
    }
}

impl Default for OrdinalEncoder {
    fn default() -> Self { Self::new() }
}
307
/// Target Encoder - encodes categorical features using target statistics
///
/// Each category is replaced by a smoothed mean of the target `y` over the
/// rows where the category occurs; categories unseen at fit time fall back
/// to the global target mean.
pub struct TargetEncoder {
    /// Smoothing strength: larger values pull rare categories toward the
    /// global mean; 0.0 yields the raw per-category mean.
    pub smooth: f32,
    /// Declared kind of target. Not consulted by the current
    /// implementation - both variants are encoded identically.
    pub target_type: TargetType,
    // Per-feature map: category -> smoothed target mean (None until fitted).
    encodings_: Option<Vec<HashMap<String, f32>>>,
    // Mean of `y` over the fitted samples; fallback for unknown categories.
    global_mean_: f32,
}

/// Kind of prediction target the encoder is used with.
#[derive(Clone, Copy)]
pub enum TargetType {
    Continuous,
    Binary,
}

impl TargetEncoder {
    /// Creates an encoder with smoothing 1.0 and a continuous target.
    pub fn new() -> Self {
        TargetEncoder {
            smooth: 1.0,
            target_type: TargetType::Continuous,
            encodings_: None,
            global_mean_: 0.0,
        }
    }

    /// Builder: set the smoothing strength.
    pub fn smooth(mut self, s: f32) -> Self { self.smooth = s; self }

    /// Learns per-category smoothed target means.
    ///
    /// `x` is row-major categorical data (`n_samples x n_features`) and `y`
    /// the aligned target values. Fitting on empty data now leaves a global
    /// mean of 0.0; previously the unconditional `sum / 0` produced NaN and
    /// poisoned every subsequent encoding.
    pub fn fit(&mut self, x: &[Vec<String>], y: &[f32]) {
        let n_samples = x.len();
        let n_features = if x.is_empty() { 0 } else { x[0].len() };

        // Guard the empty case: 0.0 / 0 is NaN in f32.
        self.global_mean_ = if n_samples == 0 {
            0.0
        } else {
            y.iter().sum::<f32>() / n_samples as f32
        };

        let mut encodings: Vec<HashMap<String, f32>> = vec![HashMap::new(); n_features];

        for j in 0..n_features {
            // Accumulate (target sum, row count) per category of feature j.
            let mut category_stats: HashMap<String, (f32, usize)> = HashMap::new();

            for (i, row) in x.iter().enumerate() {
                let cat = &row[j];
                let entry = category_stats.entry(cat.clone()).or_insert((0.0, 0));
                entry.0 += y[i];
                entry.1 += 1;
            }

            // Smoothed mean: (count * cat_mean + smooth * global_mean) / (count + smooth)
            for (cat, (sum, count)) in category_stats {
                let cat_mean = sum / count as f32;
                let smoothed = (count as f32 * cat_mean + self.smooth * self.global_mean_)
                    / (count as f32 + self.smooth);
                encodings[j].insert(cat, smoothed);
            }
        }

        self.encodings_ = Some(encodings);
    }

    /// Replaces each category with its learned encoding; categories unseen
    /// during `fit` map to the global target mean. Panics if not fitted.
    pub fn transform(&self, x: &[Vec<String>]) -> Vec<Vec<f32>> {
        let encodings = self.encodings_.as_ref().expect("Encoder not fitted");

        x.iter()
            .map(|row| {
                row.iter()
                    .enumerate()
                    .map(|(j, cat)| *encodings[j].get(cat).unwrap_or(&self.global_mean_))
                    .collect()
            })
            .collect()
    }

    /// Convenience: `fit` followed by `transform` on the same data.
    pub fn fit_transform(&mut self, x: &[Vec<String>], y: &[f32]) -> Vec<Vec<f32>> {
        self.fit(x, y);
        self.transform(x)
    }
}

impl Default for TargetEncoder {
    fn default() -> Self { Self::new() }
}
390
/// KBins Discretizer - bins continuous features into discrete intervals
pub struct KBinsDiscretizer {
    /// Number of bins per feature.
    pub n_bins: usize,
    /// How bin edges are placed during `fit`.
    pub strategy: BinStrategy,
    /// Output representation produced by `transform`.
    pub encode: BinEncode,
    // Learned edges, `n_bins + 1` values per feature (None until fitted).
    bin_edges_: Option<Vec<Vec<f32>>>,
}

/// Strategy for choosing bin edges during `fit`.
#[derive(Clone, Copy)]
pub enum BinStrategy {
    /// Equal-width bins spanning [min, max] of each feature.
    Uniform,
    /// Edges at equally spaced sample quantiles.
    Quantile,
    /// 1-D k-means edges; `fit` currently implements this as a uniform fallback.
    KMeans,
}

/// Output encoding for the discretized values.
#[derive(Clone, Copy)]
pub enum BinEncode {
    /// One column per feature holding the bin index as f32.
    Ordinal,
    /// `n_bins` indicator columns per feature.
    OneHot,
}
411
412impl KBinsDiscretizer {
413    pub fn new(n_bins: usize) -> Self {
414        KBinsDiscretizer {
415            n_bins,
416            strategy: BinStrategy::Quantile,
417            encode: BinEncode::Ordinal,
418            bin_edges_: None,
419        }
420    }
421
422    pub fn strategy(mut self, s: BinStrategy) -> Self { self.strategy = s; self }
423    pub fn encode(mut self, e: BinEncode) -> Self { self.encode = e; self }
424
425    pub fn fit(&mut self, x: &Tensor) {
426        let x_data = x.data_f32();
427        let n_samples = x.dims()[0];
428        let n_features = x.dims()[1];
429
430        let mut bin_edges: Vec<Vec<f32>> = Vec::with_capacity(n_features);
431
432        for j in 0..n_features {
433            let mut values: Vec<f32> = (0..n_samples)
434                .map(|i| x_data[i * n_features + j])
435                .collect();
436            values.sort_by(|a, b| a.partial_cmp(b).unwrap());
437
438            let edges = match self.strategy {
439                BinStrategy::Uniform => {
440                    let min_val = values[0];
441                    let max_val = values[values.len() - 1];
442                    let step = (max_val - min_val) / self.n_bins as f32;
443                    (0..=self.n_bins).map(|i| min_val + i as f32 * step).collect()
444                }
445                BinStrategy::Quantile => {
446                    (0..=self.n_bins)
447                        .map(|i| {
448                            let q = i as f32 / self.n_bins as f32;
449                            let idx = ((n_samples - 1) as f32 * q) as usize;
450                            values[idx]
451                        })
452                        .collect()
453                }
454                BinStrategy::KMeans => {
455                    // Simplified: use uniform for now
456                    let min_val = values[0];
457                    let max_val = values[values.len() - 1];
458                    let step = (max_val - min_val) / self.n_bins as f32;
459                    (0..=self.n_bins).map(|i| min_val + i as f32 * step).collect()
460                }
461            };
462
463            bin_edges.push(edges);
464        }
465
466        self.bin_edges_ = Some(bin_edges);
467    }
468
469    pub fn transform(&self, x: &Tensor) -> Tensor {
470        let x_data = x.data_f32();
471        let n_samples = x.dims()[0];
472        let n_features = x.dims()[1];
473
474        let bin_edges = self.bin_edges_.as_ref().expect("Discretizer not fitted");
475
476        match self.encode {
477            BinEncode::Ordinal => {
478                let result: Vec<f32> = (0..n_samples)
479                    .flat_map(|i| {
480                        (0..n_features).map(|j| {
481                            let val = x_data[i * n_features + j];
482                            let edges = &bin_edges[j];
483                            let mut bin = 0;
484                            for k in 1..edges.len() {
485                                if val >= edges[k] {
486                                    bin = k;
487                                } else {
488                                    break;
489                                }
490                            }
491                            bin.min(self.n_bins - 1) as f32
492                        }).collect::<Vec<_>>()
493                    })
494                    .collect();
495
496                Tensor::from_slice(&result, &[n_samples, n_features]).unwrap()
497            }
498            BinEncode::OneHot => {
499                let n_output = n_features * self.n_bins;
500                let mut result = vec![0.0f32; n_samples * n_output];
501
502                for i in 0..n_samples {
503                    for j in 0..n_features {
504                        let val = x_data[i * n_features + j];
505                        let edges = &bin_edges[j];
506                        let mut bin = 0;
507                        for k in 1..edges.len() {
508                            if val >= edges[k] {
509                                bin = k;
510                            } else {
511                                break;
512                            }
513                        }
514                        bin = bin.min(self.n_bins - 1);
515                        result[i * n_output + j * self.n_bins + bin] = 1.0;
516                    }
517                }
518
519                Tensor::from_slice(&result, &[n_samples, n_output]).unwrap()
520            }
521        }
522    }
523
524    pub fn fit_transform(&mut self, x: &Tensor) -> Tensor {
525        self.fit(x);
526        self.transform(x)
527    }
528}
529
530/// Binarizer - threshold features to binary values
531pub struct Binarizer {
532    pub threshold: f32,
533}
534
535impl Binarizer {
536    pub fn new(threshold: f32) -> Self {
537        Binarizer { threshold }
538    }
539
540    pub fn transform(&self, x: &Tensor) -> Tensor {
541        let x_data = x.data_f32();
542        let result: Vec<f32> = x_data.iter()
543            .map(|&v| if v > self.threshold { 1.0 } else { 0.0 })
544            .collect();
545        Tensor::from_slice(&result, x.dims()).unwrap()
546    }
547}
548
#[cfg(test)]
mod tests {
    use super::*;

    // Smoke test: robust scaling preserves the input shape even when an
    // outlier (100.0) is present in the data.
    #[test]
    fn test_robust_scaler() {
        let x = Tensor::from_slice(&[1.0f32, 2.0, 3.0, 4.0, 100.0, 6.0], &[3, 2]).unwrap();
        let mut scaler = RobustScaler::new();
        let result = scaler.fit_transform(&x);
        assert_eq!(result.dims(), &[3, 2]);
    }

    // After max-abs scaling every value must lie within [-1, 1].
    #[test]
    fn test_max_abs_scaler() {
        let x = Tensor::from_slice(&[-1.0f32, 2.0, -3.0, 4.0], &[2, 2]).unwrap();
        let mut scaler = MaxAbsScaler::new();
        let result = scaler.fit_transform(&x);
        
        let data = result.storage().as_slice::<f32>().to_vec();
        assert!(data.iter().all(|&v| v.abs() <= 1.0));
    }

    // Default (ordinal) discretization keeps the input shape: one bin-index
    // column per original feature.
    #[test]
    fn test_kbins_discretizer() {
        let x = Tensor::from_slice(&[0.0f32, 0.5, 1.0, 1.5, 2.0, 2.5], &[3, 2]).unwrap();
        let mut disc = KBinsDiscretizer::new(3);
        let result = disc.fit_transform(&x);
        assert_eq!(result.dims(), &[3, 2]);
    }

    // Thresholding is strict: a value equal to the threshold maps to 0.0.
    #[test]
    fn test_binarizer() {
        let x = Tensor::from_slice(&[0.0f32, 0.5, 1.0, 1.5], &[2, 2]).unwrap();
        let binarizer = Binarizer::new(0.5);
        let result = binarizer.transform(&x);
        
        let data = result.storage().as_slice::<f32>().to_vec();
        assert_eq!(data, &[0.0, 0.0, 1.0, 1.0]);
    }
}
589
590