1use ghostflow_core::Tensor;
4use std::collections::HashMap;
5
/// Scales features using statistics that are robust to outliers:
/// centers on the per-feature median and scales by an inter-quantile
/// range (IQR by default), mirroring scikit-learn's `RobustScaler`.
pub struct RobustScaler {
    /// If true, subtract the per-feature median during `transform`.
    pub with_centering: bool,
    /// If true, divide by the per-feature quantile range during `transform`.
    pub with_scaling: bool,
    /// `(low, high)` percentiles, in percent, used to compute the scale
    /// (default `(25.0, 75.0)` — the inter-quartile range).
    pub quantile_range: (f32, f32),
    // Per-feature medians learned by `fit` (None until fitted).
    center_: Option<Vec<f32>>,
    // Per-feature quantile ranges learned by `fit` (None until fitted).
    scale_: Option<Vec<f32>>,
}
15
16impl RobustScaler {
17 pub fn new() -> Self {
18 RobustScaler {
19 with_centering: true,
20 with_scaling: true,
21 quantile_range: (25.0, 75.0),
22 center_: None,
23 scale_: None,
24 }
25 }
26
27 pub fn with_centering(mut self, c: bool) -> Self { self.with_centering = c; self }
28 pub fn with_scaling(mut self, s: bool) -> Self { self.with_scaling = s; self }
29 pub fn quantile_range(mut self, low: f32, high: f32) -> Self {
30 self.quantile_range = (low, high);
31 self
32 }
33
34 fn quantile(sorted: &[f32], q: f32) -> f32 {
35 if sorted.is_empty() { return 0.0; }
36 let idx = (q / 100.0 * (sorted.len() - 1) as f32) as usize;
37 let idx = idx.min(sorted.len() - 1);
38 sorted[idx]
39 }
40
41 pub fn fit(&mut self, x: &Tensor) {
42 let x_data = x.data_f32();
43 let n_samples = x.dims()[0];
44 let n_features = x.dims()[1];
45
46 let mut center = vec![0.0f32; n_features];
47 let mut scale = vec![1.0f32; n_features];
48
49 for j in 0..n_features {
50 let mut values: Vec<f32> = (0..n_samples)
51 .map(|i| x_data[i * n_features + j])
52 .collect();
53 values.sort_by(|a, b| a.partial_cmp(b).unwrap());
54
55 if self.with_centering {
56 center[j] = Self::quantile(&values, 50.0); }
58
59 if self.with_scaling {
60 let q_low = Self::quantile(&values, self.quantile_range.0);
61 let q_high = Self::quantile(&values, self.quantile_range.1);
62 let iqr = q_high - q_low;
63 scale[j] = if iqr > 1e-10 { iqr } else { 1.0 };
64 }
65 }
66
67 self.center_ = Some(center);
68 self.scale_ = Some(scale);
69 }
70
71 pub fn transform(&self, x: &Tensor) -> Tensor {
72 let x_data = x.data_f32();
73 let n_samples = x.dims()[0];
74 let n_features = x.dims()[1];
75
76 let center = self.center_.as_ref().expect("Scaler not fitted");
77 let scale = self.scale_.as_ref().unwrap();
78
79 let result: Vec<f32> = (0..n_samples)
80 .flat_map(|i| {
81 (0..n_features).map(|j| {
82 let mut val = x_data[i * n_features + j];
83 if self.with_centering {
84 val -= center[j];
85 }
86 if self.with_scaling {
87 val /= scale[j];
88 }
89 val
90 }).collect::<Vec<_>>()
91 })
92 .collect();
93
94 Tensor::from_slice(&result, &[n_samples, n_features]).unwrap()
95 }
96
97 pub fn fit_transform(&mut self, x: &Tensor) -> Tensor {
98 self.fit(x);
99 self.transform(x)
100 }
101
102 pub fn inverse_transform(&self, x: &Tensor) -> Tensor {
103 let x_data = x.data_f32();
104 let n_samples = x.dims()[0];
105 let n_features = x.dims()[1];
106
107 let center = self.center_.as_ref().expect("Scaler not fitted");
108 let scale = self.scale_.as_ref().unwrap();
109
110 let result: Vec<f32> = (0..n_samples)
111 .flat_map(|i| {
112 (0..n_features).map(|j| {
113 let mut val = x_data[i * n_features + j];
114 if self.with_scaling {
115 val *= scale[j];
116 }
117 if self.with_centering {
118 val += center[j];
119 }
120 val
121 }).collect::<Vec<_>>()
122 })
123 .collect();
124
125 Tensor::from_slice(&result, &[n_samples, n_features]).unwrap()
126 }
127}
128
129impl Default for RobustScaler {
130 fn default() -> Self { Self::new() }
131}
132
/// Scales each feature by its maximum absolute value so that the
/// transformed data lies in `[-1, 1]`. Does not shift or center the data.
pub struct MaxAbsScaler {
    // Per-feature max absolute values learned by `fit` (None until fitted).
    max_abs_: Option<Vec<f32>>,
}
137
138impl MaxAbsScaler {
139 pub fn new() -> Self {
140 MaxAbsScaler { max_abs_: None }
141 }
142
143 pub fn fit(&mut self, x: &Tensor) {
144 let x_data = x.data_f32();
145 let n_samples = x.dims()[0];
146 let n_features = x.dims()[1];
147
148 let max_abs: Vec<f32> = (0..n_features)
149 .map(|j| {
150 (0..n_samples)
151 .map(|i| x_data[i * n_features + j].abs())
152 .fold(0.0f32, f32::max)
153 .max(1e-10)
154 })
155 .collect();
156
157 self.max_abs_ = Some(max_abs);
158 }
159
160 pub fn transform(&self, x: &Tensor) -> Tensor {
161 let x_data = x.data_f32();
162 let n_samples = x.dims()[0];
163 let n_features = x.dims()[1];
164
165 let max_abs = self.max_abs_.as_ref().expect("Scaler not fitted");
166
167 let result: Vec<f32> = (0..n_samples)
168 .flat_map(|i| {
169 (0..n_features).map(|j| x_data[i * n_features + j] / max_abs[j]).collect::<Vec<_>>()
170 })
171 .collect();
172
173 Tensor::from_slice(&result, &[n_samples, n_features]).unwrap()
174 }
175
176 pub fn fit_transform(&mut self, x: &Tensor) -> Tensor {
177 self.fit(x);
178 self.transform(x)
179 }
180
181 pub fn inverse_transform(&self, x: &Tensor) -> Tensor {
182 let x_data = x.data_f32();
183 let n_samples = x.dims()[0];
184 let n_features = x.dims()[1];
185
186 let max_abs = self.max_abs_.as_ref().expect("Scaler not fitted");
187
188 let result: Vec<f32> = (0..n_samples)
189 .flat_map(|i| {
190 (0..n_features).map(|j| x_data[i * n_features + j] * max_abs[j]).collect::<Vec<_>>()
191 })
192 .collect();
193
194 Tensor::from_slice(&result, &[n_samples, n_features]).unwrap()
195 }
196}
197
198impl Default for MaxAbsScaler {
199 fn default() -> Self { Self::new() }
200}
201
/// Encodes categorical string features as 0-based integer codes, assigned
/// in lexicographically sorted category order (similar to scikit-learn's
/// `OrdinalEncoder`).
pub struct OrdinalEncoder {
    /// Policy for categories seen at transform time but not during `fit`.
    pub handle_unknown: HandleUnknown,
    /// Code substituted for unknown categories under
    /// `HandleUnknown::UseEncodedValue`; defaults to `-1.0` when `None`.
    pub unknown_value: Option<f32>,
    // Sorted category vocabulary per feature column (None until fitted).
    categories_: Option<Vec<Vec<String>>>,
}

/// How `OrdinalEncoder::transform` reacts to an unseen category.
#[derive(Clone, Copy)]
pub enum HandleUnknown {
    /// Panic when an unseen category is encountered.
    Error,
    /// Substitute `unknown_value` (or `-1.0` if unset).
    UseEncodedValue,
}

impl OrdinalEncoder {
    /// Creates an encoder that panics on unknown categories.
    pub fn new() -> Self {
        OrdinalEncoder {
            handle_unknown: HandleUnknown::Error,
            unknown_value: None,
            categories_: None,
        }
    }

    /// Builder: set the unknown-category policy and (optionally) the code
    /// used when `h` is `UseEncodedValue`.
    pub fn handle_unknown(mut self, h: HandleUnknown, value: Option<f32>) -> Self {
        self.handle_unknown = h;
        self.unknown_value = value;
        self
    }

    /// Learns the sorted category vocabulary of each feature column.
    /// `x` is row-major: `x[i][j]` is feature `j` of sample `i`; the
    /// feature count is taken from the first row.
    pub fn fit(&mut self, x: &[Vec<String>]) {
        let n_features = if x.is_empty() { 0 } else { x[0].len() };

        let mut categories: Vec<Vec<String>> = vec![Vec::new(); n_features];

        for row in x {
            for (j, val) in row.iter().enumerate() {
                if !categories[j].contains(val) {
                    categories[j].push(val.clone());
                }
            }
        }

        // Sort so codes are stable regardless of input row order.
        for cats in &mut categories {
            cats.sort();
        }

        self.categories_ = Some(categories);
    }

    /// Maps each category to its index in the fitted vocabulary.
    ///
    /// # Panics
    /// Panics if called before `fit`, or on an unseen category when
    /// `handle_unknown` is `HandleUnknown::Error`.
    pub fn transform(&self, x: &[Vec<String>]) -> Vec<Vec<f32>> {
        let categories = self.categories_.as_ref().expect("Encoder not fitted");

        x.iter()
            .map(|row| {
                row.iter()
                    .enumerate()
                    .map(|(j, val)| {
                        if let Some(idx) = categories[j].iter().position(|c| c == val) {
                            idx as f32
                        } else {
                            match self.handle_unknown {
                                HandleUnknown::UseEncodedValue => {
                                    self.unknown_value.unwrap_or(-1.0)
                                }
                                HandleUnknown::Error => {
                                    panic!("Unknown category: {}", val);
                                }
                            }
                        }
                    })
                    .collect()
            })
            .collect()
    }

    /// Convenience: `fit` followed by `transform` on the same data.
    pub fn fit_transform(&mut self, x: &[Vec<String>]) -> Vec<Vec<f32>> {
        self.fit(x);
        self.transform(x)
    }

    /// Maps codes back to category strings; negative or out-of-range codes
    /// decode to `"unknown"`.
    ///
    /// # Panics
    /// Panics if called before `fit`.
    pub fn inverse_transform(&self, x: &[Vec<f32>]) -> Vec<Vec<String>> {
        let categories = self.categories_.as_ref().expect("Encoder not fitted");

        x.iter()
            .map(|row| {
                row.iter()
                    .enumerate()
                    .map(|(j, &val)| {
                        // Fix: reject negative codes explicitly. A bare
                        // `val as usize` cast saturates -1.0 to 0, so the
                        // -1.0 unknown sentinel used to decode (wrongly)
                        // to the first category instead of "unknown".
                        if val >= 0.0 && (val as usize) < categories[j].len() {
                            categories[j][val as usize].clone()
                        } else {
                            "unknown".to_string()
                        }
                    })
                    .collect()
            })
            .collect()
    }
}

impl Default for OrdinalEncoder {
    /// Equivalent to [`OrdinalEncoder::new`].
    fn default() -> Self { Self::new() }
}
307
308pub struct TargetEncoder {
310 pub smooth: f32,
311 pub target_type: TargetType,
312 encodings_: Option<Vec<HashMap<String, f32>>>,
313 global_mean_: f32,
314}
315
316#[derive(Clone, Copy)]
317pub enum TargetType {
318 Continuous,
319 Binary,
320}
321
322impl TargetEncoder {
323 pub fn new() -> Self {
324 TargetEncoder {
325 smooth: 1.0,
326 target_type: TargetType::Continuous,
327 encodings_: None,
328 global_mean_: 0.0,
329 }
330 }
331
332 pub fn smooth(mut self, s: f32) -> Self { self.smooth = s; self }
333
334 pub fn fit(&mut self, x: &[Vec<String>], y: &[f32]) {
335 let n_samples = x.len();
336 let n_features = if x.is_empty() { 0 } else { x[0].len() };
337
338 self.global_mean_ = y.iter().sum::<f32>() / n_samples as f32;
339
340 let mut encodings: Vec<HashMap<String, f32>> = vec![HashMap::new(); n_features];
341
342 for j in 0..n_features {
343 let mut category_stats: HashMap<String, (f32, usize)> = HashMap::new();
345
346 for (i, row) in x.iter().enumerate() {
347 let cat = &row[j];
348 let entry = category_stats.entry(cat.clone()).or_insert((0.0, 0));
349 entry.0 += y[i];
350 entry.1 += 1;
351 }
352
353 for (cat, (sum, count)) in category_stats {
355 let cat_mean = sum / count as f32;
356 let smoothed = (count as f32 * cat_mean + self.smooth * self.global_mean_)
358 / (count as f32 + self.smooth);
359 encodings[j].insert(cat, smoothed);
360 }
361 }
362
363 self.encodings_ = Some(encodings);
364 }
365
366 pub fn transform(&self, x: &[Vec<String>]) -> Vec<Vec<f32>> {
367 let encodings = self.encodings_.as_ref().expect("Encoder not fitted");
368
369 x.iter()
370 .map(|row| {
371 row.iter()
372 .enumerate()
373 .map(|(j, cat)| {
374 *encodings[j].get(cat).unwrap_or(&self.global_mean_)
375 })
376 .collect()
377 })
378 .collect()
379 }
380
381 pub fn fit_transform(&mut self, x: &[Vec<String>], y: &[f32]) -> Vec<Vec<f32>> {
382 self.fit(x, y);
383 self.transform(x)
384 }
385}
386
387impl Default for TargetEncoder {
388 fn default() -> Self { Self::new() }
389}
390
/// Bins continuous features into discrete intervals, similar to
/// scikit-learn's `KBinsDiscretizer`.
pub struct KBinsDiscretizer {
    /// Number of bins per feature.
    pub n_bins: usize,
    /// How bin edges are placed (see `BinStrategy`).
    pub strategy: BinStrategy,
    /// Output encoding: ordinal bin indices or one-hot indicator columns.
    pub encode: BinEncode,
    // `n_bins + 1` sorted edges per feature, learned by `fit` (None until fitted).
    bin_edges_: Option<Vec<Vec<f32>>>,
}
398
/// Edge-placement strategies for `KBinsDiscretizer`.
#[derive(Clone, Copy)]
pub enum BinStrategy {
    /// Evenly spaced edges between the column minimum and maximum.
    Uniform,
    /// Edges at evenly spaced sample quantiles (by rank).
    Quantile,
    /// NOTE(review): `fit` currently falls back to uniform-width edges for
    /// this variant — true k-means edge placement is not implemented.
    KMeans,
}
405
/// Output encodings for `KBinsDiscretizer::transform`.
#[derive(Clone, Copy)]
pub enum BinEncode {
    /// One column per feature holding the bin index as an `f32`.
    Ordinal,
    /// `n_bins` indicator columns per feature (exactly one set to 1.0).
    OneHot,
}
411
412impl KBinsDiscretizer {
413 pub fn new(n_bins: usize) -> Self {
414 KBinsDiscretizer {
415 n_bins,
416 strategy: BinStrategy::Quantile,
417 encode: BinEncode::Ordinal,
418 bin_edges_: None,
419 }
420 }
421
422 pub fn strategy(mut self, s: BinStrategy) -> Self { self.strategy = s; self }
423 pub fn encode(mut self, e: BinEncode) -> Self { self.encode = e; self }
424
425 pub fn fit(&mut self, x: &Tensor) {
426 let x_data = x.data_f32();
427 let n_samples = x.dims()[0];
428 let n_features = x.dims()[1];
429
430 let mut bin_edges: Vec<Vec<f32>> = Vec::with_capacity(n_features);
431
432 for j in 0..n_features {
433 let mut values: Vec<f32> = (0..n_samples)
434 .map(|i| x_data[i * n_features + j])
435 .collect();
436 values.sort_by(|a, b| a.partial_cmp(b).unwrap());
437
438 let edges = match self.strategy {
439 BinStrategy::Uniform => {
440 let min_val = values[0];
441 let max_val = values[values.len() - 1];
442 let step = (max_val - min_val) / self.n_bins as f32;
443 (0..=self.n_bins).map(|i| min_val + i as f32 * step).collect()
444 }
445 BinStrategy::Quantile => {
446 (0..=self.n_bins)
447 .map(|i| {
448 let q = i as f32 / self.n_bins as f32;
449 let idx = ((n_samples - 1) as f32 * q) as usize;
450 values[idx]
451 })
452 .collect()
453 }
454 BinStrategy::KMeans => {
455 let min_val = values[0];
457 let max_val = values[values.len() - 1];
458 let step = (max_val - min_val) / self.n_bins as f32;
459 (0..=self.n_bins).map(|i| min_val + i as f32 * step).collect()
460 }
461 };
462
463 bin_edges.push(edges);
464 }
465
466 self.bin_edges_ = Some(bin_edges);
467 }
468
469 pub fn transform(&self, x: &Tensor) -> Tensor {
470 let x_data = x.data_f32();
471 let n_samples = x.dims()[0];
472 let n_features = x.dims()[1];
473
474 let bin_edges = self.bin_edges_.as_ref().expect("Discretizer not fitted");
475
476 match self.encode {
477 BinEncode::Ordinal => {
478 let result: Vec<f32> = (0..n_samples)
479 .flat_map(|i| {
480 (0..n_features).map(|j| {
481 let val = x_data[i * n_features + j];
482 let edges = &bin_edges[j];
483 let mut bin = 0;
484 for k in 1..edges.len() {
485 if val >= edges[k] {
486 bin = k;
487 } else {
488 break;
489 }
490 }
491 bin.min(self.n_bins - 1) as f32
492 }).collect::<Vec<_>>()
493 })
494 .collect();
495
496 Tensor::from_slice(&result, &[n_samples, n_features]).unwrap()
497 }
498 BinEncode::OneHot => {
499 let n_output = n_features * self.n_bins;
500 let mut result = vec![0.0f32; n_samples * n_output];
501
502 for i in 0..n_samples {
503 for j in 0..n_features {
504 let val = x_data[i * n_features + j];
505 let edges = &bin_edges[j];
506 let mut bin = 0;
507 for k in 1..edges.len() {
508 if val >= edges[k] {
509 bin = k;
510 } else {
511 break;
512 }
513 }
514 bin = bin.min(self.n_bins - 1);
515 result[i * n_output + j * self.n_bins + bin] = 1.0;
516 }
517 }
518
519 Tensor::from_slice(&result, &[n_samples, n_output]).unwrap()
520 }
521 }
522 }
523
524 pub fn fit_transform(&mut self, x: &Tensor) -> Tensor {
525 self.fit(x);
526 self.transform(x)
527 }
528}
529
/// Thresholds features to 0/1: values strictly greater than `threshold`
/// become 1.0, everything else 0.0. Stateless — no `fit` step.
pub struct Binarizer {
    /// Decision threshold (strict `>` comparison).
    pub threshold: f32,
}
534
535impl Binarizer {
536 pub fn new(threshold: f32) -> Self {
537 Binarizer { threshold }
538 }
539
540 pub fn transform(&self, x: &Tensor) -> Tensor {
541 let x_data = x.data_f32();
542 let result: Vec<f32> = x_data.iter()
543 .map(|&v| if v > self.threshold { 1.0 } else { 0.0 })
544 .collect();
545 Tensor::from_slice(&result, x.dims()).unwrap()
546 }
547}
548
#[cfg(test)]
mod tests {
    use super::*;

    // Column with an outlier (100.0): only the output shape is checked,
    // since exact values depend on the quantile implementation.
    #[test]
    fn test_robust_scaler() {
        let x = Tensor::from_slice(&[1.0, 2.0, 3.0, 4.0, 100.0, 6.0], &[3, 2]).unwrap();
        let mut scaler = RobustScaler::new();
        let result = scaler.fit_transform(&x);
        assert_eq!(result.dims(), &[3, 2]);
    }

    // After max-abs scaling every value must lie in [-1, 1].
    #[test]
    fn test_max_abs_scaler() {
        let x = Tensor::from_slice(&[-1.0, 2.0, -3.0, 4.0], &[2, 2]).unwrap();
        let mut scaler = MaxAbsScaler::new();
        let result = scaler.fit_transform(&x);

        let data = result.data_f32();
        assert!(data.iter().all(|&v| v.abs() <= 1.0));
    }

    // Default (quantile strategy, ordinal encoding): output shape matches input.
    #[test]
    fn test_kbins_discretizer() {
        let x = Tensor::from_slice(&[0.0, 0.5, 1.0, 1.5, 2.0, 2.5], &[3, 2]).unwrap();
        let mut disc = KBinsDiscretizer::new(3);
        let result = disc.fit_transform(&x);
        assert_eq!(result.dims(), &[3, 2]);
    }

    // Threshold is strict: 0.5 itself maps to 0.0.
    #[test]
    fn test_binarizer() {
        let x = Tensor::from_slice(&[0.0, 0.5, 1.0, 1.5], &[2, 2]).unwrap();
        let binarizer = Binarizer::new(0.5);
        let result = binarizer.transform(&x);

        let data = result.data_f32();
        assert_eq!(data, &[0.0, 0.0, 1.0, 1.0]);
    }
}