1use ndarray::{Array1, Array2, ArrayBase, Data, Ix2};
7use num_traits::{Float, NumCast};
8
9use crate::error::{Result, TransformError};
10
11const EPSILON: f64 = 1e-10;
13
14pub struct QuantileTransformer {
19 n_quantiles: usize,
21 output_distribution: String,
23 clip: bool,
25 quantiles: Option<Array2<f64>>,
27 references: Option<Array1<f64>>,
29}
30
31impl QuantileTransformer {
32 pub fn new(n_quantiles: usize, output_distribution: &str, clip: bool) -> Result<Self> {
42 if n_quantiles < 2 {
43 return Err(TransformError::InvalidInput(
44 "n_quantiles must be at least 2".to_string(),
45 ));
46 }
47
48 if output_distribution != "uniform" && output_distribution != "normal" {
49 return Err(TransformError::InvalidInput(
50 "output_distribution must be 'uniform' or 'normal'".to_string(),
51 ));
52 }
53
54 Ok(QuantileTransformer {
55 n_quantiles,
56 output_distribution: output_distribution.to_string(),
57 clip,
58 quantiles: None,
59 references: None,
60 })
61 }
62
63 pub fn fit<S>(&mut self, x: &ArrayBase<S, Ix2>) -> Result<()>
71 where
72 S: Data,
73 S::Elem: Float + NumCast,
74 {
75 let x_f64 = x.mapv(|x| num_traits::cast::<S::Elem, f64>(x).unwrap_or(0.0));
76
77 let n_samples = x_f64.shape()[0];
78 let n_features = x_f64.shape()[1];
79
80 if n_samples == 0 || n_features == 0 {
81 return Err(TransformError::InvalidInput("Empty input data".to_string()));
82 }
83
84 if self.n_quantiles > n_samples {
85 return Err(TransformError::InvalidInput(format!(
86 "n_quantiles ({}) cannot be greater than n_samples ({})",
87 self.n_quantiles, n_samples
88 )));
89 }
90
91 let mut quantiles = Array2::zeros((n_features, self.n_quantiles));
93
94 for j in 0..n_features {
95 let mut feature_data: Vec<f64> = x_f64.column(j).to_vec();
97 feature_data.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
98
99 for i in 0..self.n_quantiles {
101 let q = i as f64 / (self.n_quantiles - 1) as f64;
102 let idx = (q * (feature_data.len() - 1) as f64).round() as usize;
103 quantiles[[j, i]] = feature_data[idx];
104 }
105 }
106
107 let references = if self.output_distribution == "uniform" {
109 Array1::from_shape_fn(self.n_quantiles, |i| {
111 i as f64 / (self.n_quantiles - 1) as f64
112 })
113 } else {
114 Array1::from_shape_fn(self.n_quantiles, |i| {
116 let u = i as f64 / (self.n_quantiles - 1) as f64;
117 let u_clamped = u.clamp(1e-7, 1.0 - 1e-7);
119 inverse_normal_cdf(u_clamped)
120 })
121 };
122
123 self.quantiles = Some(quantiles);
124 self.references = Some(references);
125
126 Ok(())
127 }
128
129 pub fn transform<S>(&self, x: &ArrayBase<S, Ix2>) -> Result<Array2<f64>>
137 where
138 S: Data,
139 S::Elem: Float + NumCast,
140 {
141 let x_f64 = x.mapv(|x| num_traits::cast::<S::Elem, f64>(x).unwrap_or(0.0));
142
143 let n_samples = x_f64.shape()[0];
144 let n_features = x_f64.shape()[1];
145
146 if self.quantiles.is_none() || self.references.is_none() {
147 return Err(TransformError::TransformationError(
148 "QuantileTransformer has not been fitted".to_string(),
149 ));
150 }
151
152 let quantiles = self.quantiles.as_ref().unwrap();
153 let references = self.references.as_ref().unwrap();
154
155 if n_features != quantiles.shape()[0] {
156 return Err(TransformError::InvalidInput(format!(
157 "x has {} features, but QuantileTransformer was fitted with {} features",
158 n_features,
159 quantiles.shape()[0]
160 )));
161 }
162
163 let mut transformed = Array2::zeros((n_samples, n_features));
164
165 for i in 0..n_samples {
166 for j in 0..n_features {
167 let value = x_f64[[i, j]];
168
169 let feature_quantiles = quantiles.row(j);
171
172 let mut lower_idx = 0;
174 let mut upper_idx = self.n_quantiles - 1;
175
176 if value <= feature_quantiles[0] {
178 transformed[[i, j]] = references[0];
179 continue;
180 }
181 if value >= feature_quantiles[self.n_quantiles - 1] {
182 transformed[[i, j]] = references[self.n_quantiles - 1];
183 continue;
184 }
185
186 while upper_idx - lower_idx > 1 {
188 let mid = (lower_idx + upper_idx) / 2;
189 if value <= feature_quantiles[mid] {
190 upper_idx = mid;
191 } else {
192 lower_idx = mid;
193 }
194 }
195
196 let lower_quantile = feature_quantiles[lower_idx];
198 let upper_quantile = feature_quantiles[upper_idx];
199 let lower_ref = references[lower_idx];
200 let upper_ref = references[upper_idx];
201
202 if (upper_quantile - lower_quantile).abs() < EPSILON {
203 transformed[[i, j]] = lower_ref;
204 } else {
205 let ratio = (value - lower_quantile) / (upper_quantile - lower_quantile);
206 transformed[[i, j]] = lower_ref + ratio * (upper_ref - lower_ref);
207 }
208 }
209 }
210
211 if self.clip && self.output_distribution == "uniform" {
213 for i in 0..n_samples {
214 for j in 0..n_features {
215 transformed[[i, j]] = transformed[[i, j]].clamp(0.0, 1.0);
216 }
217 }
218 }
219
220 Ok(transformed)
221 }
222
223 pub fn fit_transform<S>(&mut self, x: &ArrayBase<S, Ix2>) -> Result<Array2<f64>>
231 where
232 S: Data,
233 S::Elem: Float + NumCast,
234 {
235 self.fit(x)?;
236 self.transform(x)
237 }
238
239 pub fn quantiles(&self) -> Option<&Array2<f64>> {
244 self.quantiles.as_ref()
245 }
246}
247
248fn inverse_normal_cdf(u: f64) -> f64 {
252 const A0: f64 = 2.50662823884;
254 const A1: f64 = -18.61500062529;
255 const A2: f64 = 41.39119773534;
256 const A3: f64 = -25.44106049637;
257 const B1: f64 = -8.47351093090;
258 const B2: f64 = 23.08336743743;
259 const B3: f64 = -21.06224101826;
260 const B4: f64 = 3.13082909833;
261 const C0: f64 = 0.3374754822726147;
262 const C1: f64 = 0.9761690190917186;
263 const C2: f64 = 0.1607979714918209;
264 const C3: f64 = 0.0276438810333863;
265 const C4: f64 = 0.0038405729373609;
266 const C5: f64 = 0.0003951896511919;
267 const C6: f64 = 0.0000321767881768;
268 const C7: f64 = 0.0000002888167364;
269 const C8: f64 = 0.0000003960315187;
270
271 let y = u - 0.5;
272
273 if y.abs() < 0.42 {
274 let r = y * y;
276 y * (((A3 * r + A2) * r + A1) * r + A0) / ((((B4 * r + B3) * r + B2) * r + B1) * r + 1.0)
277 } else {
278 let r = if y > 0.0 { 1.0 - u } else { u };
280 let r = (-r.ln()).ln();
281
282 let result = C0
283 + r * (C1 + r * (C2 + r * (C3 + r * (C4 + r * (C5 + r * (C6 + r * (C7 + r * C8)))))));
284
285 if y < 0.0 {
286 -result
287 } else {
288 result
289 }
290 }
291}
292
293pub struct MaxAbsScaler {
299 max_abs_: Option<Array1<f64>>,
301 scale_: Option<Array1<f64>>,
303}
304
305impl MaxAbsScaler {
306 pub fn new() -> Self {
318 MaxAbsScaler {
319 max_abs_: None,
320 scale_: None,
321 }
322 }
323
324 pub fn with_defaults() -> Self {
326 Self::new()
327 }
328
329 pub fn fit<S>(&mut self, x: &ArrayBase<S, Ix2>) -> Result<()>
337 where
338 S: Data,
339 S::Elem: Float + NumCast,
340 {
341 let x_f64 = x.mapv(|x| num_traits::cast::<S::Elem, f64>(x).unwrap_or(0.0));
342
343 let n_samples = x_f64.shape()[0];
344 let n_features = x_f64.shape()[1];
345
346 if n_samples == 0 || n_features == 0 {
347 return Err(TransformError::InvalidInput("Empty input data".to_string()));
348 }
349
350 let mut max_abs = Array1::zeros(n_features);
352
353 for j in 0..n_features {
354 let feature_data = x_f64.column(j);
355 let max_abs_value = feature_data
356 .iter()
357 .map(|&x| x.abs())
358 .fold(0.0, |acc, x| acc.max(x));
359
360 max_abs[j] = max_abs_value;
361 }
362
363 let scale = max_abs.mapv(|max_abs_val| {
365 if max_abs_val > EPSILON {
366 1.0 / max_abs_val
367 } else {
368 1.0 }
370 });
371
372 self.max_abs_ = Some(max_abs);
373 self.scale_ = Some(scale);
374
375 Ok(())
376 }
377
378 pub fn transform<S>(&self, x: &ArrayBase<S, Ix2>) -> Result<Array2<f64>>
386 where
387 S: Data,
388 S::Elem: Float + NumCast,
389 {
390 let x_f64 = x.mapv(|x| num_traits::cast::<S::Elem, f64>(x).unwrap_or(0.0));
391
392 let n_samples = x_f64.shape()[0];
393 let n_features = x_f64.shape()[1];
394
395 if self.scale_.is_none() {
396 return Err(TransformError::TransformationError(
397 "MaxAbsScaler has not been fitted".to_string(),
398 ));
399 }
400
401 let scale = self.scale_.as_ref().unwrap();
402
403 if n_features != scale.len() {
404 return Err(TransformError::InvalidInput(format!(
405 "x has {} features, but MaxAbsScaler was fitted with {} features",
406 n_features,
407 scale.len()
408 )));
409 }
410
411 let mut transformed = Array2::zeros((n_samples, n_features));
412
413 for i in 0..n_samples {
415 for j in 0..n_features {
416 transformed[[i, j]] = x_f64[[i, j]] * scale[j];
417 }
418 }
419
420 Ok(transformed)
421 }
422
423 pub fn fit_transform<S>(&mut self, x: &ArrayBase<S, Ix2>) -> Result<Array2<f64>>
431 where
432 S: Data,
433 S::Elem: Float + NumCast,
434 {
435 self.fit(x)?;
436 self.transform(x)
437 }
438
439 pub fn inverse_transform<S>(&self, x: &ArrayBase<S, Ix2>) -> Result<Array2<f64>>
447 where
448 S: Data,
449 S::Elem: Float + NumCast,
450 {
451 let x_f64 = x.mapv(|x| num_traits::cast::<S::Elem, f64>(x).unwrap_or(0.0));
452
453 let n_samples = x_f64.shape()[0];
454 let n_features = x_f64.shape()[1];
455
456 if self.max_abs_.is_none() {
457 return Err(TransformError::TransformationError(
458 "MaxAbsScaler has not been fitted".to_string(),
459 ));
460 }
461
462 let max_abs = self.max_abs_.as_ref().unwrap();
463
464 if n_features != max_abs.len() {
465 return Err(TransformError::InvalidInput(format!(
466 "x has {} features, but MaxAbsScaler was fitted with {} features",
467 n_features,
468 max_abs.len()
469 )));
470 }
471
472 let mut transformed = Array2::zeros((n_samples, n_features));
473
474 for i in 0..n_samples {
476 for j in 0..n_features {
477 transformed[[i, j]] = x_f64[[i, j]] * max_abs[j];
478 }
479 }
480
481 Ok(transformed)
482 }
483
484 pub fn max_abs(&self) -> Option<&Array1<f64>> {
489 self.max_abs_.as_ref()
490 }
491
492 pub fn scale(&self) -> Option<&Array1<f64>> {
497 self.scale_.as_ref()
498 }
499}
500
501impl Default for MaxAbsScaler {
502 fn default() -> Self {
503 Self::new()
504 }
505}
506
507#[cfg(test)]
508mod tests {
509 use super::*;
510 use approx::assert_abs_diff_eq;
511 use ndarray::Array;
512
513 #[test]
514 fn test_quantile_transformer_uniform() {
515 let data = Array::from_shape_vec(
517 (6, 2),
518 vec![
519 1.0, 10.0, 2.0, 20.0, 3.0, 30.0, 4.0, 40.0, 5.0, 50.0, 100.0, 1000.0,
520 ], )
522 .unwrap();
523
524 let mut transformer = QuantileTransformer::new(5, "uniform", true).unwrap();
525 let transformed = transformer.fit_transform(&data).unwrap();
526
527 assert_eq!(transformed.shape(), &[6, 2]);
529
530 for i in 0..6 {
532 for j in 0..2 {
533 assert!(
534 transformed[[i, j]] >= 0.0 && transformed[[i, j]] <= 1.0,
535 "Value at [{}, {}] = {} is not in [0, 1]",
536 i,
537 j,
538 transformed[[i, j]]
539 );
540 }
541 }
542
543 assert_abs_diff_eq!(transformed[[0, 0]], 0.0, epsilon = 1e-10); assert_abs_diff_eq!(transformed[[5, 0]], 1.0, epsilon = 1e-10); assert_abs_diff_eq!(transformed[[0, 1]], 0.0, epsilon = 1e-10); assert_abs_diff_eq!(transformed[[5, 1]], 1.0, epsilon = 1e-10); }
549
550 #[test]
551 fn test_quantile_transformer_normal() {
552 let data = Array::from_shape_vec((5, 1), vec![1.0, 2.0, 3.0, 4.0, 5.0]).unwrap();
554
555 let mut transformer = QuantileTransformer::new(5, "normal", false).unwrap();
556 let transformed = transformer.fit_transform(&data).unwrap();
557
558 assert_eq!(transformed.shape(), &[5, 1]);
560
561 assert_abs_diff_eq!(transformed[[2, 0]], 0.0, epsilon = 1e-10);
563 }
564
565 #[test]
566 fn test_quantile_transformer_errors() {
567 assert!(QuantileTransformer::new(1, "uniform", true).is_err());
569
570 assert!(QuantileTransformer::new(100, "invalid", true).is_err());
572
573 let small_data = Array::from_shape_vec((2, 1), vec![1.0, 2.0]).unwrap();
575 let mut transformer = QuantileTransformer::new(10, "uniform", true).unwrap();
576 assert!(transformer.fit(&small_data).is_err());
577 }
578
579 #[test]
580 fn test_inverse_normal_cdf() {
581 assert_abs_diff_eq!(inverse_normal_cdf(0.5), 0.0, epsilon = 1e-6);
583 assert!(inverse_normal_cdf(0.1) < 0.0); assert!(inverse_normal_cdf(0.9) > 0.0); }
586
587 #[test]
588 fn test_max_abs_scaler_basic() {
589 let data = Array::from_shape_vec(
593 (5, 2),
594 vec![-4.0, -10.0, -2.0, -5.0, 0.0, 0.0, 2.0, 5.0, 4.0, 10.0],
595 )
596 .unwrap();
597
598 let mut scaler = MaxAbsScaler::new();
599 let scaled = scaler.fit_transform(&data).unwrap();
600
601 assert_eq!(scaled.shape(), &[5, 2]);
603
604 let max_abs = scaler.max_abs().unwrap();
606 assert_abs_diff_eq!(max_abs[0], 4.0, epsilon = 1e-10);
607 assert_abs_diff_eq!(max_abs[1], 10.0, epsilon = 1e-10);
608
609 let scale = scaler.scale().unwrap();
611 assert_abs_diff_eq!(scale[0], 0.25, epsilon = 1e-10); assert_abs_diff_eq!(scale[1], 0.1, epsilon = 1e-10); for j in 0..2 {
616 let feature_max = scaled
617 .column(j)
618 .iter()
619 .map(|&x| x.abs())
620 .fold(0.0, f64::max);
621 assert_abs_diff_eq!(feature_max, 1.0, epsilon = 1e-10);
622 }
623
624 assert_abs_diff_eq!(scaled[[0, 0]], -1.0, epsilon = 1e-10); assert_abs_diff_eq!(scaled[[0, 1]], -1.0, epsilon = 1e-10); assert_abs_diff_eq!(scaled[[2, 0]], 0.0, epsilon = 1e-10); assert_abs_diff_eq!(scaled[[2, 1]], 0.0, epsilon = 1e-10); assert_abs_diff_eq!(scaled[[4, 0]], 1.0, epsilon = 1e-10); assert_abs_diff_eq!(scaled[[4, 1]], 1.0, epsilon = 1e-10); }
632
633 #[test]
634 fn test_max_abs_scaler_positive_only() {
635 let data = Array::from_shape_vec((3, 2), vec![1.0, 2.0, 3.0, 6.0, 5.0, 10.0]).unwrap();
637
638 let mut scaler = MaxAbsScaler::new();
639 let scaled = scaler.fit_transform(&data).unwrap();
640
641 let max_abs = scaler.max_abs().unwrap();
643 assert_abs_diff_eq!(max_abs[0], 5.0, epsilon = 1e-10);
644 assert_abs_diff_eq!(max_abs[1], 10.0, epsilon = 1e-10);
645
646 assert_abs_diff_eq!(scaled[[0, 0]], 0.2, epsilon = 1e-10); assert_abs_diff_eq!(scaled[[0, 1]], 0.2, epsilon = 1e-10); assert_abs_diff_eq!(scaled[[2, 0]], 1.0, epsilon = 1e-10); assert_abs_diff_eq!(scaled[[2, 1]], 1.0, epsilon = 1e-10); }
652
653 #[test]
654 fn test_max_abs_scaler_inverse_transform() {
655 let data = Array::from_shape_vec((3, 2), vec![-6.0, 8.0, 0.0, -4.0, 3.0, 12.0]).unwrap();
656
657 let mut scaler = MaxAbsScaler::new();
658 let scaled = scaler.fit_transform(&data).unwrap();
659 let inverse = scaler.inverse_transform(&scaled).unwrap();
660
661 assert_eq!(inverse.shape(), data.shape());
663 for i in 0..3 {
664 for j in 0..2 {
665 assert_abs_diff_eq!(inverse[[i, j]], data[[i, j]], epsilon = 1e-10);
666 }
667 }
668 }
669
670 #[test]
671 fn test_max_abs_scaler_constant_feature() {
672 let data = Array::from_shape_vec((3, 2), vec![0.0, 5.0, 0.0, 10.0, 0.0, 15.0]).unwrap();
674
675 let mut scaler = MaxAbsScaler::new();
676 let scaled = scaler.fit_transform(&data).unwrap();
677
678 for i in 0..3 {
680 assert_abs_diff_eq!(scaled[[i, 0]], 0.0, epsilon = 1e-10);
681 }
682
683 assert_abs_diff_eq!(scaled[[0, 1]], 1.0 / 3.0, epsilon = 1e-10); assert_abs_diff_eq!(scaled[[2, 1]], 1.0, epsilon = 1e-10); }
687
688 #[test]
689 fn test_max_abs_scaler_errors() {
690 let empty_data = Array::<f64, _>::zeros((0, 2));
692 let mut scaler = MaxAbsScaler::new();
693 assert!(scaler.fit(&empty_data).is_err());
694
695 let data = Array::from_shape_vec((2, 2), vec![1.0, 2.0, 3.0, 4.0]).unwrap();
697 let unfitted_scaler = MaxAbsScaler::new();
698 assert!(unfitted_scaler.transform(&data).is_err());
699 assert!(unfitted_scaler.inverse_transform(&data).is_err());
700
701 let train_data = Array::from_shape_vec((2, 3), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).unwrap();
703 let test_data = Array::from_shape_vec((2, 2), vec![1.0, 2.0, 3.0, 4.0]).unwrap();
704
705 let mut scaler = MaxAbsScaler::new();
706 scaler.fit(&train_data).unwrap();
707 assert!(scaler.transform(&test_data).is_err());
708 assert!(scaler.inverse_transform(&test_data).is_err());
709 }
710
711 #[test]
712 fn test_max_abs_scaler_single_feature() {
713 let data = Array::from_shape_vec((4, 1), vec![-8.0, -2.0, 4.0, 6.0]).unwrap();
715
716 let mut scaler = MaxAbsScaler::new();
717 let scaled = scaler.fit_transform(&data).unwrap();
718
719 let max_abs = scaler.max_abs().unwrap();
721 assert_abs_diff_eq!(max_abs[0], 8.0, epsilon = 1e-10);
722
723 assert_abs_diff_eq!(scaled[[0, 0]], -1.0, epsilon = 1e-10); assert_abs_diff_eq!(scaled[[1, 0]], -0.25, epsilon = 1e-10); assert_abs_diff_eq!(scaled[[2, 0]], 0.5, epsilon = 1e-10); assert_abs_diff_eq!(scaled[[3, 0]], 0.75, epsilon = 1e-10); }
729
730 #[test]
731 fn test_max_abs_scaler_sparse_preservation() {
732 let data = Array::from_shape_vec(
734 (4, 3),
735 vec![
736 0.0, 5.0, 0.0, 10.0, 0.0, -8.0, 0.0, 0.0, 4.0, -5.0, 10.0, 0.0, ],
741 )
742 .unwrap();
743
744 let mut scaler = MaxAbsScaler::new();
745 let scaled = scaler.fit_transform(&data).unwrap();
746
747 assert_abs_diff_eq!(scaled[[0, 0]], 0.0, epsilon = 1e-10);
749 assert_abs_diff_eq!(scaled[[0, 2]], 0.0, epsilon = 1e-10);
750 assert_abs_diff_eq!(scaled[[1, 1]], 0.0, epsilon = 1e-10);
751 assert_abs_diff_eq!(scaled[[2, 0]], 0.0, epsilon = 1e-10);
752 assert_abs_diff_eq!(scaled[[2, 1]], 0.0, epsilon = 1e-10);
753 assert_abs_diff_eq!(scaled[[3, 2]], 0.0, epsilon = 1e-10);
754
755 assert_abs_diff_eq!(scaled[[0, 1]], 0.5, epsilon = 1e-10); assert_abs_diff_eq!(scaled[[1, 0]], 1.0, epsilon = 1e-10); assert_abs_diff_eq!(scaled[[1, 2]], -1.0, epsilon = 1e-10); assert_abs_diff_eq!(scaled[[2, 2]], 0.5, epsilon = 1e-10); assert_abs_diff_eq!(scaled[[3, 0]], -0.5, epsilon = 1e-10); assert_abs_diff_eq!(scaled[[3, 1]], 1.0, epsilon = 1e-10); }
764}