use ndarray::{Array1, Array2, ArrayBase, Data, Ix2};
use num_traits::{Float, NumCast};
use scirs2_core::parallel_ops::*;

use crate::error::{Result, TransformError};

/// Tolerance used for floating-point comparisons throughout this module.
const EPSILON: f64 = 1e-10;

/// Generates polynomial and interaction features from an input matrix.
pub struct PolynomialFeatures {
    /// Maximum degree of the generated polynomial features.
    degree: usize,
    /// If true, only interaction terms (products of distinct features) are produced.
    interaction_only: bool,
    /// If true, a constant bias column of ones is prepended to the output.
    include_bias: bool,
}

impl PolynomialFeatures {
    /// Creates a new `PolynomialFeatures` transformer.
    pub fn new(degree: usize, interaction_only: bool, include_bias: bool) -> Self {
        PolynomialFeatures {
            degree,
            interaction_only,
            include_bias,
        }
    }

    /// Returns the number of output features produced for `n_features` inputs.
    pub fn n_output_features(&self, n_features: usize) -> usize {
        if n_features == 0 {
            return 0;
        }

        if self.interaction_only {
            // Optional bias plus, for each degree d, C(n_features, d) interaction terms.
            let mut n = if self.include_bias { 1 } else { 0 };
            for d in 1..=self.degree.min(n_features) {
                let mut b = 1;
                for i in 0..d {
                    b = b * (n_features - i) / (i + 1);
                }
                n += b;
            }
            n
        } else {
            // Optional bias plus, for each degree d, the number of monomials of
            // exact degree d over n_features variables: C(n_features + d - 1, d).
            // The degree-0 term must not be counted here, otherwise a bias
            // column would be counted even when `include_bias` is false.
            let n = if self.include_bias { 1 } else { 0 };
            n + (1..=self.degree)
                .map(|d| {
                    let mut b = 1;
                    for i in 0..d {
                        b = b * (n_features + d - 1 - i) / (i + 1);
                    }
                    b
                })
                .sum::<usize>()
        }
    }

    /// Transforms the input matrix into the polynomial feature space.
    ///
    /// Output columns are ordered as: the optional bias column, the original
    /// (degree-1) features, then all higher-order terms up to `degree`.
    pub fn transform<S>(&self, array: &ArrayBase<S, Ix2>) -> Result<Array2<f64>>
    where
        S: Data,
        S::Elem: Float + NumCast,
    {
        let array_f64 = array.mapv(|x| num_traits::cast::<S::Elem, f64>(x).unwrap_or(0.0));

        if !array_f64.is_standard_layout() {
            return Err(TransformError::InvalidInput(
                "Input array must be in standard memory layout".to_string(),
            ));
        }

        let shape = array_f64.shape();
        let n_samples = shape[0];
        let n_features = shape[1];

        let n_output_features = self.n_output_features(n_features);
        let mut result = Array2::zeros((n_samples, n_output_features));

        let mut powers = vec![0; n_features];
        let mut col_idx = 0;

        // Bias column of ones.
        if self.include_bias {
            for i in 0..n_samples {
                result[[i, col_idx]] = 1.0;
            }
            col_idx += 1;
        }

        // Degree-1 (linear) features are copied verbatim.
        if self.degree >= 1 {
            for i in 0..n_samples {
                for j in 0..n_features {
                    result[[i, col_idx + j]] = array_f64[[i, j]];
                }
            }
            col_idx += n_features;
        }

        if self.degree >= 2 {
            // Recursively enumerates exponent vectors summing to exactly
            // `degree`, assigning each chosen feature (from `start` onward)
            // a power of at least 1.
            #[allow(clippy::too_many_arguments)]
            fn generate_combinations(
                powers: &mut Vec<usize>,
                start: usize,
                degree: usize,
                max_degree: usize,
                interaction_only: bool,
                array: &Array2<f64>,
                result: &mut Array2<f64>,
                col_idx: &mut usize,
            ) {
                if degree == 0 {
                    let sum: usize = powers.iter().sum();
                    if sum >= 2 && sum <= max_degree {
                        for i in 0..array.shape()[0] {
                            let mut val = 1.0;
                            for (j, &p) in powers.iter().enumerate() {
                                val *= array[[i, j]].powi(p as i32);
                            }
                            result[[i, *col_idx]] = val;
                        }
                        *col_idx += 1;
                    }
                    return;
                }

                for j in start..powers.len() {
                    let max_power = if interaction_only { 1 } else { degree };
                    for p in 1..=max_power {
                        powers[j] += p;
                        generate_combinations(
                            powers,
                            j + 1,
                            degree - p,
                            max_degree,
                            interaction_only,
                            array,
                            result,
                            col_idx,
                        );
                        powers[j] -= p;
                    }
                }
            }

            // Enumerate every target degree from 2 up to `self.degree`.
            // The recursion only emits terms of exactly the target degree,
            // so iterating over all targets is required for lower-degree
            // cross terms (e.g. x0*x1 when degree = 3) to be included.
            let mut current_col_idx = col_idx;
            for d in 2..=self.degree {
                generate_combinations(
                    &mut powers,
                    0,
                    d,
                    self.degree,
                    self.interaction_only,
                    &array_f64,
                    &mut result,
                    &mut current_col_idx,
                );
            }
        }

        Ok(result)
    }
}

/// Binarizes values: entries strictly greater than `threshold` become 1.0,
/// all others become 0.0.
pub fn binarize<S>(array: &ArrayBase<S, Ix2>, threshold: f64) -> Result<Array2<f64>>
where
    S: Data,
    S::Elem: Float + NumCast,
{
    let array_f64 = array.mapv(|x| num_traits::cast::<S::Elem, f64>(x).unwrap_or(0.0));

    if !array_f64.is_standard_layout() {
        return Err(TransformError::InvalidInput(
            "Input array must be in standard memory layout".to_string(),
        ));
    }

    let shape = array_f64.shape();
    let mut binarized = Array2::zeros((shape[0], shape[1]));

    for i in 0..shape[0] {
        for j in 0..shape[1] {
            binarized[[i, j]] = if array_f64[[i, j]] > threshold {
                1.0
            } else {
                0.0
            };
        }
    }

    Ok(binarized)
}

/// Computes `n_quantiles` evenly spaced quantiles per column (`axis == 0`)
/// or per row (`axis == 1`), using nearest-rank interpolation.
fn compute_quantiles<S>(
    array: &ArrayBase<S, Ix2>,
    n_quantiles: usize,
    axis: usize,
) -> Result<Array2<f64>>
where
    S: Data,
    S::Elem: Float + NumCast,
{
    let array_f64 = array.mapv(|x| num_traits::cast::<S::Elem, f64>(x).unwrap_or(0.0));

    if n_quantiles < 2 {
        return Err(TransformError::InvalidInput(
            "n_quantiles must be at least 2".to_string(),
        ));
    }

    if axis >= 2 {
        return Err(TransformError::InvalidInput(
            "axis must be 0 or 1".to_string(),
        ));
    }

    let shape = array_f64.shape();
    let n_features = if axis == 0 { shape[1] } else { shape[0] };

    let mut quantiles = Array2::zeros((n_features, n_quantiles));

    for i in 0..n_features {
        let mut sorted_data: Vec<f64> = if axis == 0 {
            array_f64.column(i).to_vec()
        } else {
            array_f64.row(i).to_vec()
        };
        sorted_data.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));

        for j in 0..n_quantiles {
            let q = j as f64 / (n_quantiles - 1) as f64;
            let idx = (q * (sorted_data.len() - 1) as f64).round() as usize;
            quantiles[[i, j]] = sorted_data[idx];
        }
    }

    Ok(quantiles)
}

/// Discretizes values into `n_bins` equal-width bins.
///
/// With `encode == "ordinal"` each value is replaced by its bin index; with
/// `encode == "onehot"` each value is expanded into `n_bins` indicator
/// columns. `axis == 0` bins per column, `axis == 1` bins per row.
pub fn discretize_equal_width<S>(
    array: &ArrayBase<S, Ix2>,
    n_bins: usize,
    encode: &str,
    axis: usize,
) -> Result<Array2<f64>>
where
    S: Data,
    S::Elem: Float + NumCast,
{
    let array_f64 = array.mapv(|x| num_traits::cast::<S::Elem, f64>(x).unwrap_or(0.0));

    if !array_f64.is_standard_layout() {
        return Err(TransformError::InvalidInput(
            "Input array must be in standard memory layout".to_string(),
        ));
    }

    if n_bins < 2 {
        return Err(TransformError::InvalidInput(
            "n_bins must be at least 2".to_string(),
        ));
    }

    if encode != "onehot" && encode != "ordinal" {
        return Err(TransformError::InvalidInput(
            "encode must be 'onehot' or 'ordinal'".to_string(),
        ));
    }

    if axis >= 2 {
        return Err(TransformError::InvalidInput(
            "axis must be 0 or 1".to_string(),
        ));
    }

    let shape = array_f64.shape();
    let n_samples = shape[0];
    let n_features = shape[1];

    let n_output_features = if encode == "onehot" {
        if axis == 0 {
            n_features * n_bins
        } else {
            n_samples * n_bins
        }
    } else if axis == 0 {
        n_features
    } else {
        n_samples
    };

    // Per-column (or per-row) minima and maxima.
    let mut min_values = Array1::zeros(if axis == 0 { n_features } else { n_samples });
    let mut max_values = Array1::zeros(if axis == 0 { n_features } else { n_samples });

    for i in 0..(if axis == 0 { n_features } else { n_samples }) {
        let data = if axis == 0 {
            array_f64.column(i)
        } else {
            array_f64.row(i)
        };

        min_values[i] = data.fold(f64::INFINITY, |acc, &x| acc.min(x));
        max_values[i] = data.fold(f64::NEG_INFINITY, |acc, &x| acc.max(x));
    }

    // Equally spaced bin edges; constant features fall back to unit width.
    let mut bin_edges = Array2::zeros((if axis == 0 { n_features } else { n_samples }, n_bins + 1));
    for i in 0..(if axis == 0 { n_features } else { n_samples }) {
        let min_val = min_values[i];
        let max_val = max_values[i];
        let bin_width = if (max_val - min_val).abs() < EPSILON {
            1.0
        } else {
            (max_val - min_val) / n_bins as f64
        };

        for j in 0..=n_bins {
            bin_edges[[i, j]] = min_val + bin_width * j as f64;
        }

        // Extend the last edge slightly so the maximum lands in the top bin.
        bin_edges[[i, n_bins]] = bin_edges[[i, n_bins]].max(max_val + EPSILON);
    }

    let mut discretized = Array2::zeros((n_samples, n_output_features));

    if encode == "ordinal" {
        for i in 0..n_samples {
            for j in 0..n_features {
                let value = array_f64[[i, j]];
                let feature_idx = if axis == 0 { j } else { i };

                let mut bin_idx = 0;
                while bin_idx < n_bins && value > bin_edges[[feature_idx, bin_idx + 1]] {
                    bin_idx += 1;
                }

                let output_idx = if axis == 0 { j } else { i };
                discretized[[i, output_idx]] = bin_idx as f64;
            }
        }
    } else {
        for i in 0..n_samples {
            for j in 0..n_features {
                let value = array_f64[[i, j]];
                let feature_idx = if axis == 0 { j } else { i };

                let mut bin_idx = 0;
                while bin_idx < n_bins && value > bin_edges[[feature_idx, bin_idx + 1]] {
                    bin_idx += 1;
                }

                let output_idx = if axis == 0 {
                    j * n_bins + bin_idx
                } else {
                    i * n_bins + bin_idx
                };

                discretized[[i, output_idx]] = 1.0;
            }
        }
    }

    Ok(discretized)
}

/// Discretizes values into `n_bins` bins containing approximately equal
/// numbers of observations, using per-column (or per-row) quantiles as the
/// bin edges. Encoding and axis semantics match [`discretize_equal_width`].
pub fn discretize_equal_frequency<S>(
    array: &ArrayBase<S, Ix2>,
    n_bins: usize,
    encode: &str,
    axis: usize,
) -> Result<Array2<f64>>
where
    S: Data,
    S::Elem: Float + NumCast,
{
    let array_f64 = array.mapv(|x| num_traits::cast::<S::Elem, f64>(x).unwrap_or(0.0));

    if !array_f64.is_standard_layout() {
        return Err(TransformError::InvalidInput(
            "Input array must be in standard memory layout".to_string(),
        ));
    }

    if n_bins < 2 {
        return Err(TransformError::InvalidInput(
            "n_bins must be at least 2".to_string(),
        ));
    }

    if encode != "onehot" && encode != "ordinal" {
        return Err(TransformError::InvalidInput(
            "encode must be 'onehot' or 'ordinal'".to_string(),
        ));
    }

    if axis >= 2 {
        return Err(TransformError::InvalidInput(
            "axis must be 0 or 1".to_string(),
        ));
    }

    let shape = array_f64.shape();
    let n_samples = shape[0];
    let n_features = shape[1];

    let n_output_features = if encode == "onehot" {
        if axis == 0 {
            n_features * n_bins
        } else {
            n_samples * n_bins
        }
    } else if axis == 0 {
        n_features
    } else {
        n_samples
    };

    // n_bins bins require n_bins + 1 quantile edges.
    let quantiles = compute_quantiles(&array_f64, n_bins + 1, axis)?;

    let mut discretized = Array2::zeros((n_samples, n_output_features));

    if encode == "ordinal" {
        for i in 0..n_samples {
            for j in 0..n_features {
                let value = array_f64[[i, j]];
                let feature_idx = if axis == 0 { j } else { i };

                let mut bin_idx = 0;
                while bin_idx < n_bins && value > quantiles[[feature_idx, bin_idx + 1]] {
                    bin_idx += 1;
                }

                let output_idx = if axis == 0 { j } else { i };
                discretized[[i, output_idx]] = bin_idx as f64;
            }
        }
    } else {
        for i in 0..n_samples {
            for j in 0..n_features {
                let value = array_f64[[i, j]];
                let feature_idx = if axis == 0 { j } else { i };

                let mut bin_idx = 0;
                while bin_idx < n_bins && value > quantiles[[feature_idx, bin_idx + 1]] {
                    bin_idx += 1;
                }

                let output_idx = if axis == 0 {
                    j * n_bins + bin_idx
                } else {
                    i * n_bins + bin_idx
                };

                discretized[[i, output_idx]] = 1.0;
            }
        }
    }

    Ok(discretized)
}

/// Applies a power transform (Yeo-Johnson or Box-Cox) with fixed default
/// lambdas: 1.0 for Yeo-Johnson (which leaves the data unchanged) and 0.0
/// for Box-Cox (the natural logarithm). For per-feature lambdas estimated
/// by maximum likelihood, use [`PowerTransformer`] instead.
pub fn power_transform<S>(
    array: &ArrayBase<S, Ix2>,
    method: &str,
    standardize: bool,
) -> Result<Array2<f64>>
where
    S: Data,
    S::Elem: Float + NumCast,
{
    let array_f64 = array.mapv(|x| num_traits::cast::<S::Elem, f64>(x).unwrap_or(0.0));

    if !array_f64.is_standard_layout() {
        return Err(TransformError::InvalidInput(
            "Input array must be in standard memory layout".to_string(),
        ));
    }

    if method != "yeo-johnson" && method != "box-cox" {
        return Err(TransformError::InvalidInput(
            "method must be 'yeo-johnson' or 'box-cox'".to_string(),
        ));
    }

    if method == "box-cox" && array_f64.iter().any(|&x| x <= 0.0) {
        return Err(TransformError::InvalidInput(
            "Box-Cox transformation requires strictly positive data".to_string(),
        ));
    }

    let shape = array_f64.shape();
    let n_samples = shape[0];
    let n_features = shape[1];

    let mut transformed = Array2::zeros((n_samples, n_features));

    for j in 0..n_features {
        // Fixed default lambda; this helper performs no per-feature estimation.
        let lambda = if method == "yeo-johnson" { 1.0 } else { 0.0 };

        for i in 0..n_samples {
            let x = array_f64[[i, j]];

            transformed[[i, j]] = if method == "yeo-johnson" {
                yeo_johnson_transform(x, lambda)
            } else {
                box_cox_transform(x, lambda)
            };
        }

        if standardize {
            let mean = transformed.column(j).sum() / n_samples as f64;
            let variance = transformed
                .column(j)
                .iter()
                .map(|&x| (x - mean).powi(2))
                .sum::<f64>()
                / n_samples as f64;
            let std_dev = variance.sqrt();

            if std_dev > EPSILON {
                for i in 0..n_samples {
                    transformed[[i, j]] = (transformed[[i, j]] - mean) / std_dev;
                }
            }
        }
    }

    Ok(transformed)
}

/// Forward Yeo-Johnson transform for a single value.
fn yeo_johnson_transform(x: f64, lambda: f64) -> f64 {
    if x >= 0.0 {
        if lambda.abs() < EPSILON {
            (x + 1.0).ln()
        } else {
            ((x + 1.0).powf(lambda) - 1.0) / lambda
        }
    } else if (lambda - 2.0).abs() < EPSILON {
        -((-x + 1.0).ln())
    } else {
        -(((-x + 1.0).powf(2.0 - lambda) - 1.0) / (2.0 - lambda))
    }
}

/// Forward Box-Cox transform for a single (strictly positive) value.
fn box_cox_transform(x: f64, lambda: f64) -> f64 {
    if lambda.abs() < EPSILON {
        x.ln()
    } else {
        (x.powf(lambda) - 1.0) / lambda
    }
}

/// A fittable power transformer that estimates one lambda per feature by
/// maximum likelihood and can apply and invert the transform.
#[derive(Debug, Clone)]
pub struct PowerTransformer {
    /// Either "yeo-johnson" or "box-cox".
    method: String,
    /// Whether to standardize each transformed feature to zero mean and unit variance.
    standardize: bool,
    /// Per-feature lambdas estimated during `fit`.
    lambdas_: Option<Array1<f64>>,
    /// Per-feature means of the transformed data (when standardizing).
    means_: Option<Array1<f64>>,
    /// Per-feature standard deviations of the transformed data (when standardizing).
    stds_: Option<Array1<f64>>,
    /// Set to true once `fit` has completed.
    is_fitted: bool,
}

impl PowerTransformer {
    /// Creates a new transformer; `method` must be "yeo-johnson" or "box-cox".
    pub fn new(method: &str, standardize: bool) -> Result<Self> {
        if method != "yeo-johnson" && method != "box-cox" {
            return Err(TransformError::InvalidInput(
                "method must be 'yeo-johnson' or 'box-cox'".to_string(),
            ));
        }

        Ok(PowerTransformer {
            method: method.to_string(),
            standardize,
            lambdas_: None,
            means_: None,
            stds_: None,
            is_fitted: false,
        })
    }

    /// Convenience constructor for the Yeo-Johnson method.
    pub fn yeo_johnson(standardize: bool) -> Self {
        PowerTransformer {
            method: "yeo-johnson".to_string(),
            standardize,
            lambdas_: None,
            means_: None,
            stds_: None,
            is_fitted: false,
        }
    }

    /// Convenience constructor for the Box-Cox method.
    pub fn box_cox(standardize: bool) -> Self {
        PowerTransformer {
            method: "box-cox".to_string(),
            standardize,
            lambdas_: None,
            means_: None,
            stds_: None,
            is_fitted: false,
        }
    }

    /// Fits the transformer, estimating one lambda per feature by maximum
    /// likelihood (and the per-feature means and standard deviations when
    /// standardization is requested).
    pub fn fit<S>(&mut self, x: &ArrayBase<S, Ix2>) -> Result<()>
    where
        S: Data,
        S::Elem: Float + NumCast,
    {
        let x_f64 = x.mapv(|x| num_traits::cast::<S::Elem, f64>(x).unwrap_or(0.0));

        if !x_f64.is_standard_layout() {
            return Err(TransformError::InvalidInput(
                "Input array must be in standard memory layout".to_string(),
            ));
        }

        let shape = x_f64.shape();
        let n_samples = shape[0];
        let n_features = shape[1];

        if n_samples == 0 || n_features == 0 {
            return Err(TransformError::InvalidInput("Empty input data".to_string()));
        }

        if self.method == "box-cox" && x_f64.iter().any(|&x| x <= 0.0) {
            return Err(TransformError::InvalidInput(
                "Box-Cox transformation requires strictly positive data".to_string(),
            ));
        }

        // Estimate one lambda per feature in parallel.
        let lambdas: Vec<f64> = (0..n_features)
            .into_par_iter()
            .map(|j| {
                let feature = x_f64.column(j).to_vec();
                self.optimize_lambda(&feature)
            })
            .collect();

        self.lambdas_ = Some(Array1::from_vec(lambdas));

        if self.standardize {
            let mut means = Array1::zeros(n_features);
            let mut stds = Array1::zeros(n_features);

            for j in 0..n_features {
                let lambda = self.lambdas_.as_ref().unwrap()[j];
                let mut transformed_feature = Array1::zeros(n_samples);

                for i in 0..n_samples {
                    let x = x_f64[[i, j]];
                    transformed_feature[i] = if self.method == "yeo-johnson" {
                        yeo_johnson_transform(x, lambda)
                    } else {
                        box_cox_transform(x, lambda)
                    };
                }

                let mean = transformed_feature.sum() / n_samples as f64;
                let variance = transformed_feature
                    .iter()
                    .map(|&x| (x - mean).powi(2))
                    .sum::<f64>()
                    / n_samples as f64;
                let std_dev = variance.sqrt();

                means[j] = mean;
                stds[j] = if std_dev > EPSILON { std_dev } else { 1.0 };
            }

            self.means_ = Some(means);
            self.stds_ = Some(stds);
        }

        self.is_fitted = true;
        Ok(())
    }

    /// Applies the fitted power transform to `x`.
    pub fn transform<S>(&self, x: &ArrayBase<S, Ix2>) -> Result<Array2<f64>>
    where
        S: Data,
        S::Elem: Float + NumCast,
    {
        if !self.is_fitted {
            return Err(TransformError::InvalidInput(
                "PowerTransformer must be fitted before transform".to_string(),
            ));
        }

        let x_f64 = x.mapv(|x| num_traits::cast::<S::Elem, f64>(x).unwrap_or(0.0));

        if !x_f64.is_standard_layout() {
            return Err(TransformError::InvalidInput(
                "Input array must be in standard memory layout".to_string(),
            ));
        }

        let shape = x_f64.shape();
        let n_samples = shape[0];
        let n_features = shape[1];

        let lambdas = self.lambdas_.as_ref().unwrap();

        if n_features != lambdas.len() {
            return Err(TransformError::InvalidInput(
                "Number of features in transform data does not match fitted data".to_string(),
            ));
        }

        let mut transformed = Array2::zeros((n_samples, n_features));

        // Transform each feature in parallel with its fitted lambda.
        let transformed_data: Vec<Vec<f64>> = (0..n_features)
            .into_par_iter()
            .map(|j| {
                let lambda = lambdas[j];
                (0..n_samples)
                    .map(|i| {
                        let x = x_f64[[i, j]];
                        if self.method == "yeo-johnson" {
                            yeo_johnson_transform(x, lambda)
                        } else {
                            box_cox_transform(x, lambda)
                        }
                    })
                    .collect()
            })
            .collect();

        for j in 0..n_features {
            for i in 0..n_samples {
                transformed[[i, j]] = transformed_data[j][i];
            }
        }

        if self.standardize {
            let means = self.means_.as_ref().unwrap();
            let stds = self.stds_.as_ref().unwrap();

            for j in 0..n_features {
                let mean = means[j];
                let std = stds[j];

                for i in 0..n_samples {
                    transformed[[i, j]] = (transformed[[i, j]] - mean) / std;
                }
            }
        }

        Ok(transformed)
    }

    /// Fits the transformer to `x` and returns the transformed data.
    pub fn fit_transform<S>(&mut self, x: &ArrayBase<S, Ix2>) -> Result<Array2<f64>>
    where
        S: Data,
        S::Elem: Float + NumCast,
    {
        self.fit(x)?;
        self.transform(x)
    }

    /// Maps transformed data back to the original space, undoing
    /// standardization first when it was applied.
    pub fn inverse_transform<S>(&self, x: &ArrayBase<S, Ix2>) -> Result<Array2<f64>>
    where
        S: Data,
        S::Elem: Float + NumCast,
    {
        if !self.is_fitted {
            return Err(TransformError::InvalidInput(
                "PowerTransformer must be fitted before inverse_transform".to_string(),
            ));
        }

        let x_f64 = x.mapv(|x| num_traits::cast::<S::Elem, f64>(x).unwrap_or(0.0));

        if !x_f64.is_standard_layout() {
            return Err(TransformError::InvalidInput(
                "Input array must be in standard memory layout".to_string(),
            ));
        }

        let shape = x_f64.shape();
        let n_samples = shape[0];
        let n_features = shape[1];

        let lambdas = self.lambdas_.as_ref().unwrap();

        if n_features != lambdas.len() {
            return Err(TransformError::InvalidInput(
                "Number of features in inverse transform data does not match fitted data"
                    .to_string(),
            ));
        }

        let mut x_normalized = x_f64.clone();

        // Undo standardization before inverting the power transform.
        if self.standardize {
            let means = self.means_.as_ref().unwrap();
            let stds = self.stds_.as_ref().unwrap();

            for j in 0..n_features {
                let mean = means[j];
                let std = stds[j];

                for i in 0..n_samples {
                    x_normalized[[i, j]] = x_normalized[[i, j]] * std + mean;
                }
            }
        }

        let mut original = Array2::zeros((n_samples, n_features));

        // Invert each feature in parallel with its fitted lambda.
        let original_data: Vec<Vec<f64>> = (0..n_features)
            .into_par_iter()
            .map(|j| {
                let lambda = lambdas[j];
                (0..n_samples)
                    .map(|i| {
                        let y = x_normalized[[i, j]];
                        if self.method == "yeo-johnson" {
                            yeo_johnson_inverse_transform(y, lambda)
                        } else {
                            box_cox_inverse_transform(y, lambda)
                        }
                    })
                    .collect()
            })
            .collect();

        for j in 0..n_features {
            for i in 0..n_samples {
                original[[i, j]] = original_data[j][i];
            }
        }

        Ok(original)
    }

    /// Finds the lambda maximizing the log-likelihood via golden-section
    /// search over the interval [-2, 2].
    fn optimize_lambda(&self, data: &[f64]) -> f64 {
        let mut a = -2.0;
        let mut b = 2.0;
        let tolerance = 1e-6;
        let golden_ratio = (5.0_f64.sqrt() - 1.0) / 2.0;

        let mut c = b - golden_ratio * (b - a);
        let mut d = a + golden_ratio * (b - a);

        while (b - a).abs() > tolerance {
            // Minimize the negative log-likelihood.
            let fc = -self.log_likelihood(data, c);
            let fd = -self.log_likelihood(data, d);

            if fc < fd {
                b = d;
                d = c;
                c = b - golden_ratio * (b - a);
            } else {
                a = c;
                c = d;
                d = a + golden_ratio * (b - a);
            }
        }

        (a + b) / 2.0
    }

    /// Log-likelihood of `data` under a normal model after applying the power
    /// transform with the given `lambda`, including the log-Jacobian
    /// correction of the transform.
    fn log_likelihood(&self, data: &[f64], lambda: f64) -> f64 {
        let n = data.len() as f64;
        let mut log_likelihood = 0.0;

        let transformed: Vec<f64> = data
            .iter()
            .map(|&x| {
                if self.method == "yeo-johnson" {
                    yeo_johnson_transform(x, lambda)
                } else {
                    box_cox_transform(x, lambda)
                }
            })
            .collect();

        let mean = transformed.iter().sum::<f64>() / n;
        let variance = transformed.iter().map(|&x| (x - mean).powi(2)).sum::<f64>() / n;

        if variance <= 0.0 {
            return f64::NEG_INFINITY;
        }

        // Gaussian log-likelihood of the transformed data...
        log_likelihood -= 0.5 * n * (2.0 * std::f64::consts::PI * variance).ln();
        log_likelihood -=
            0.5 * transformed.iter().map(|&x| (x - mean).powi(2)).sum::<f64>() / variance;

        // ...plus the log-Jacobian of the transform for each observation.
        for &x in data {
            if self.method == "yeo-johnson" {
                log_likelihood += yeo_johnson_log_jacobian(x, lambda);
            } else {
                log_likelihood += box_cox_log_jacobian(x, lambda);
            }
        }

        log_likelihood
    }

    /// Returns the fitted per-feature lambdas, if the transformer is fitted.
    pub fn lambdas(&self) -> Option<&Array1<f64>> {
        self.lambdas_.as_ref()
    }

    /// Returns `true` once `fit` has completed successfully.
    pub fn is_fitted(&self) -> bool {
        self.is_fitted
    }
}

/// Inverse Yeo-Johnson transform for a single value.
fn yeo_johnson_inverse_transform(y: f64, lambda: f64) -> f64 {
    if y >= 0.0 {
        if lambda.abs() < EPSILON {
            y.exp() - 1.0
        } else {
            (lambda * y + 1.0).powf(1.0 / lambda) - 1.0
        }
    } else if (lambda - 2.0).abs() < EPSILON {
        1.0 - (-y).exp()
    } else {
        1.0 - (-(2.0 - lambda) * y + 1.0).powf(1.0 / (2.0 - lambda))
    }
}

/// Inverse Box-Cox transform for a single value.
fn box_cox_inverse_transform(y: f64, lambda: f64) -> f64 {
    if lambda.abs() < EPSILON {
        y.exp()
    } else {
        (lambda * y + 1.0).powf(1.0 / lambda)
    }
}

/// Log of the absolute derivative (Jacobian) of the Yeo-Johnson transform.
fn yeo_johnson_log_jacobian(x: f64, lambda: f64) -> f64 {
    if x >= 0.0 {
        (lambda - 1.0) * (x + 1.0).ln()
    } else {
        (1.0 - lambda) * (-x + 1.0).ln()
    }
}

/// Log of the derivative (Jacobian) of the Box-Cox transform.
fn box_cox_log_jacobian(x: f64, lambda: f64) -> f64 {
    (lambda - 1.0) * x.ln()
}

/// Applies an element-wise natural logarithm, `ln(x + epsilon)`, where
/// `epsilon` must be positive so the argument stays strictly positive for
/// non-negative inputs.
pub fn log_transform<S>(array: &ArrayBase<S, Ix2>, epsilon: f64) -> Result<Array2<f64>>
where
    S: Data,
    S::Elem: Float + NumCast,
{
    let array_f64 = array.mapv(|x| num_traits::cast::<S::Elem, f64>(x).unwrap_or(0.0));

    if !array_f64.is_standard_layout() {
        return Err(TransformError::InvalidInput(
            "Input array must be in standard memory layout".to_string(),
        ));
    }

    if epsilon <= 0.0 {
        return Err(TransformError::InvalidInput(
            "epsilon must be a positive value".to_string(),
        ));
    }

    let mut transformed = Array2::zeros(array_f64.raw_dim());

    for i in 0..array_f64.shape()[0] {
        for j in 0..array_f64.shape()[1] {
            transformed[[i, j]] = (array_f64[[i, j]] + epsilon).ln();
        }
    }

    Ok(transformed)
}

#[cfg(test)]
mod tests {
    use super::*;
    use approx::assert_abs_diff_eq;
    use ndarray::Array;

    #[test]
    fn test_polynomial_features() {
        let data = Array::from_shape_vec((2, 2), vec![1.0, 2.0, 3.0, 4.0]).unwrap();

        let poly = PolynomialFeatures::new(2, false, true);
        let transformed = poly.transform(&data).unwrap();

        assert_eq!(transformed.shape(), &[2, 6]);

        let expected_values = [1.0, 1.0, 2.0, 1.0, 2.0, 4.0];
        let mut found_values = [false; 6];

        for i in 0..6 {
            for j in 0..6 {
                if (transformed[[0, i]] - expected_values[j]).abs() < 1e-10 {
                    found_values[j] = true;
                }
            }
        }

        assert!(
            found_values.iter().all(|&x| x),
            "Not all expected values found in the first row"
        );

        let expected_values = [1.0, 3.0, 4.0, 9.0, 12.0, 16.0];
        let mut found_values = [false; 6];

        for i in 0..6 {
            for j in 0..6 {
                if (transformed[[1, i]] - expected_values[j]).abs() < 1e-10 {
                    found_values[j] = true;
                }
            }
        }

        assert!(
            found_values.iter().all(|&x| x),
            "Not all expected values found in the second row"
        );
    }
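
    // Added illustrative check: `n_output_features` should agree with the
    // column count produced by `transform`. The expected counts below follow
    // the standard combinatorics (bias + linear + higher-order terms).
    #[test]
    fn test_n_output_features_matches_transform() {
        let data = Array::from_shape_vec((2, 3), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).unwrap();

        // Full polynomial, degree 2, with bias: 1 + 3 + 6 = 10 columns.
        let poly = PolynomialFeatures::new(2, false, true);
        assert_eq!(poly.n_output_features(3), 10);
        assert_eq!(poly.transform(&data).unwrap().shape(), &[2, 10]);

        // Interaction-only, degree 2, no bias: 3 linear + 3 pairwise = 6 columns.
        let poly = PolynomialFeatures::new(2, true, false);
        assert_eq!(poly.n_output_features(3), 6);
        assert_eq!(poly.transform(&data).unwrap().shape(), &[2, 6]);
    }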

    #[test]
    fn test_binarize() {
        let data = Array::from_shape_vec((2, 3), vec![1.0, -1.0, 2.0, 2.0, 0.0, -3.0]).unwrap();

        let binarized = binarize(&data, 0.0).unwrap();

        assert_eq!(binarized.shape(), &[2, 3]);

        assert_abs_diff_eq!(binarized[[0, 0]], 1.0, epsilon = 1e-10);
        assert_abs_diff_eq!(binarized[[0, 1]], 0.0, epsilon = 1e-10);
        assert_abs_diff_eq!(binarized[[0, 2]], 1.0, epsilon = 1e-10);
        assert_abs_diff_eq!(binarized[[1, 0]], 1.0, epsilon = 1e-10);
        assert_abs_diff_eq!(binarized[[1, 1]], 0.0, epsilon = 1e-10);
        assert_abs_diff_eq!(binarized[[1, 2]], 0.0, epsilon = 1e-10);
    }
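
    // Added illustrative sketch of the private quantile helper: with
    // `n_quantiles = 3` and nearest-rank rounding, the per-column quantiles
    // are the minimum, the median, and the maximum.
    #[test]
    fn test_compute_quantiles_column_wise() {
        let data = Array::from_shape_vec((5, 1), vec![5.0, 1.0, 3.0, 2.0, 4.0]).unwrap();

        let quantiles = compute_quantiles(&data, 3, 0).unwrap();

        assert_eq!(quantiles.shape(), &[1, 3]);
        assert_abs_diff_eq!(quantiles[[0, 0]], 1.0, epsilon = 1e-10);
        assert_abs_diff_eq!(quantiles[[0, 1]], 3.0, epsilon = 1e-10);
        assert_abs_diff_eq!(quantiles[[0, 2]], 5.0, epsilon = 1e-10);
    }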

    #[test]
    fn test_discretize_equal_width() {
        let data = Array::from_shape_vec((3, 2), vec![0.0, 0.0, 3.0, 5.0, 6.0, 10.0]).unwrap();

        let discretized = discretize_equal_width(&data, 2, "ordinal", 0).unwrap();

        assert_eq!(discretized.shape(), &[3, 2]);

        assert_abs_diff_eq!(discretized[[0, 0]], 0.0, epsilon = 1e-10);
        assert_abs_diff_eq!(discretized[[0, 1]], 0.0, epsilon = 1e-10);
        assert_abs_diff_eq!(discretized[[1, 0]], 0.0, epsilon = 1e-10);
        assert_abs_diff_eq!(discretized[[1, 1]], 0.0, epsilon = 1e-10);
        assert_abs_diff_eq!(discretized[[2, 0]], 1.0, epsilon = 1e-10);
        assert_abs_diff_eq!(discretized[[2, 1]], 1.0, epsilon = 1e-10);

        let discretized = discretize_equal_width(&data, 2, "onehot", 0).unwrap();

        assert_eq!(discretized.shape(), &[3, 4]);

        assert_abs_diff_eq!(discretized[[0, 0]], 1.0, epsilon = 1e-10);
        assert_abs_diff_eq!(discretized[[0, 1]], 0.0, epsilon = 1e-10);
        assert_abs_diff_eq!(discretized[[0, 2]], 1.0, epsilon = 1e-10);
        assert_abs_diff_eq!(discretized[[0, 3]], 0.0, epsilon = 1e-10);

        assert_abs_diff_eq!(discretized[[1, 0]], 1.0, epsilon = 1e-10);
        assert_abs_diff_eq!(discretized[[1, 1]], 0.0, epsilon = 1e-10);
        assert_abs_diff_eq!(discretized[[1, 2]], 1.0, epsilon = 1e-10);
        assert_abs_diff_eq!(discretized[[1, 3]], 0.0, epsilon = 1e-10);

        assert_abs_diff_eq!(discretized[[2, 0]], 0.0, epsilon = 1e-10);
        assert_abs_diff_eq!(discretized[[2, 1]], 1.0, epsilon = 1e-10);
        assert_abs_diff_eq!(discretized[[2, 2]], 0.0, epsilon = 1e-10);
        assert_abs_diff_eq!(discretized[[2, 3]], 1.0, epsilon = 1e-10);
    }
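
    // Added illustrative sketch: equal-frequency binning with ordinal
    // encoding. With 2 bins the split point is the middle nearest-rank
    // quantile (here 3), so values at or below it land in bin 0 and larger
    // values in bin 1.
    #[test]
    fn test_discretize_equal_frequency_ordinal() {
        let data = Array::from_shape_vec((5, 1), vec![1.0, 2.0, 3.0, 4.0, 5.0]).unwrap();

        let discretized = discretize_equal_frequency(&data, 2, "ordinal", 0).unwrap();

        assert_eq!(discretized.shape(), &[5, 1]);
        assert_abs_diff_eq!(discretized[[0, 0]], 0.0, epsilon = 1e-10);
        assert_abs_diff_eq!(discretized[[1, 0]], 0.0, epsilon = 1e-10);
        assert_abs_diff_eq!(discretized[[2, 0]], 0.0, epsilon = 1e-10);
        assert_abs_diff_eq!(discretized[[3, 0]], 1.0, epsilon = 1e-10);
        assert_abs_diff_eq!(discretized[[4, 0]], 1.0, epsilon = 1e-10);
    }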

    #[test]
    fn test_log_transform() {
        let data = Array::from_shape_vec((2, 2), vec![0.0, 1.0, 2.0, 3.0]).unwrap();

        let transformed = log_transform(&data, 1.0).unwrap();

        assert_abs_diff_eq!(transformed[[0, 0]], (0.0 + 1.0).ln(), epsilon = 1e-10);
        assert_abs_diff_eq!(transformed[[0, 1]], (1.0 + 1.0).ln(), epsilon = 1e-10);
        assert_abs_diff_eq!(transformed[[1, 0]], (2.0 + 1.0).ln(), epsilon = 1e-10);
        assert_abs_diff_eq!(transformed[[1, 1]], (3.0 + 1.0).ln(), epsilon = 1e-10);
    }
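
    // Added illustrative sketch of the stateless `power_transform` helper:
    // it uses fixed default lambdas (see its doc comment), so with
    // `standardize = true` every output column should be zero-mean, and an
    // unknown method name must be rejected.
    #[test]
    fn test_power_transform_standardize() {
        let data =
            Array::from_shape_vec((4, 2), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]).unwrap();

        let transformed = power_transform(&data, "yeo-johnson", true).unwrap();
        assert_eq!(transformed.shape(), &[4, 2]);

        for j in 0..2 {
            let mean = transformed.column(j).sum() / 4.0;
            assert_abs_diff_eq!(mean, 0.0, epsilon = 1e-10);
        }

        assert!(power_transform(&data, "invalid", true).is_err());
    }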

    #[test]
    fn test_power_transformer_yeo_johnson() {
        let data =
            Array::from_shape_vec((4, 2), vec![1.0, -1.0, 2.0, 0.5, 3.0, -2.0, 0.1, 1.5]).unwrap();

        let mut transformer = PowerTransformer::yeo_johnson(false);

        assert!(!transformer.is_fitted());

        transformer.fit(&data).unwrap();
        assert!(transformer.is_fitted());

        let lambdas = transformer.lambdas().unwrap();
        assert_eq!(lambdas.len(), 2);

        let transformed = transformer.transform(&data).unwrap();
        assert_eq!(transformed.shape(), data.shape());

        let inverse = transformer.inverse_transform(&transformed).unwrap();
        assert_eq!(inverse.shape(), data.shape());

        for i in 0..data.shape()[0] {
            for j in 0..data.shape()[1] {
                assert_abs_diff_eq!(inverse[[i, j]], data[[i, j]], epsilon = 1e-6);
            }
        }
    }
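
    // Added illustrative check: `optimize_lambda` performs golden-section
    // search over [-2, 2], so every fitted lambda must lie in that interval.
    #[test]
    fn test_power_transformer_lambda_bounds() {
        let data =
            Array::from_shape_vec((4, 2), vec![1.0, -1.0, 2.0, 0.5, 3.0, -2.0, 0.1, 1.5]).unwrap();

        let mut transformer = PowerTransformer::yeo_johnson(false);
        transformer.fit(&data).unwrap();

        for &lambda in transformer.lambdas().unwrap() {
            assert!((-2.0..=2.0).contains(&lambda));
        }
    }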

    #[test]
    fn test_power_transformer_box_cox() {
        let data =
            Array::from_shape_vec((4, 2), vec![1.0, 2.0, 3.0, 4.0, 0.5, 1.5, 2.5, 3.5]).unwrap();

        let mut transformer = PowerTransformer::box_cox(false);

        transformer.fit(&data).unwrap();

        let transformed = transformer.transform(&data).unwrap();
        assert_eq!(transformed.shape(), data.shape());

        let inverse = transformer.inverse_transform(&transformed).unwrap();

        for i in 0..data.shape()[0] {
            for j in 0..data.shape()[1] {
                assert_abs_diff_eq!(inverse[[i, j]], data[[i, j]], epsilon = 1e-6);
            }
        }
    }

    #[test]
    fn test_power_transformer_standardized() {
        let data = Array::from_shape_vec(
            (5, 2),
            vec![1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 4.0, 5.0, 5.0, 6.0],
        )
        .unwrap();

        let mut transformer = PowerTransformer::yeo_johnson(true);

        let transformed = transformer.fit_transform(&data).unwrap();

        for j in 0..transformed.shape()[1] {
            let column = transformed.column(j);
            let mean = column.sum() / column.len() as f64;
            let variance =
                column.iter().map(|&x| (x - mean).powi(2)).sum::<f64>() / column.len() as f64;

            assert_abs_diff_eq!(mean, 0.0, epsilon = 1e-10);
            assert_abs_diff_eq!(variance, 1.0, epsilon = 1e-10);
        }
    }

    #[test]
    fn test_power_transformer_box_cox_negative_data() {
        let data = Array::from_shape_vec((2, 2), vec![1.0, -1.0, 2.0, 3.0]).unwrap();

        let mut transformer = PowerTransformer::box_cox(false);

        assert!(transformer.fit(&data).is_err());
    }

    #[test]
    fn test_power_transformer_fit_transform() {
        let data = Array::from_shape_vec((3, 2), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).unwrap();

        let mut transformer = PowerTransformer::yeo_johnson(false);

        let transformed1 = transformer.fit_transform(&data).unwrap();

        let mut transformer2 = PowerTransformer::yeo_johnson(false);
        transformer2.fit(&data).unwrap();
        let transformed2 = transformer2.transform(&data).unwrap();

        for i in 0..transformed1.shape()[0] {
            for j in 0..transformed1.shape()[1] {
                assert_abs_diff_eq!(transformed1[[i, j]], transformed2[[i, j]], epsilon = 1e-12);
            }
        }
    }

    #[test]
    fn test_power_transformer_different_data_sizes() {
        let train_data = Array::from_shape_vec((3, 2), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).unwrap();
        let test_data = Array::from_shape_vec((2, 2), vec![2.5, 3.5, 4.5, 5.5]).unwrap();

        let mut transformer = PowerTransformer::yeo_johnson(false);

        transformer.fit(&train_data).unwrap();

        let transformed = transformer.transform(&test_data).unwrap();
        assert_eq!(transformed.shape(), test_data.shape());

        let inverse = transformer.inverse_transform(&transformed).unwrap();

        for i in 0..test_data.shape()[0] {
            for j in 0..test_data.shape()[1] {
                assert_abs_diff_eq!(inverse[[i, j]], test_data[[i, j]], epsilon = 1e-6);
            }
        }
    }

    #[test]
    fn test_power_transformer_mismatched_features() {
        let train_data = Array::from_shape_vec((2, 2), vec![1.0, 2.0, 3.0, 4.0]).unwrap();
        let test_data = Array::from_shape_vec((2, 3), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).unwrap();

        let mut transformer = PowerTransformer::yeo_johnson(false);
        transformer.fit(&train_data).unwrap();

        assert!(transformer.transform(&test_data).is_err());
    }

    #[test]
    fn test_power_transformer_not_fitted() {
        let data = Array::from_shape_vec((2, 2), vec![1.0, 2.0, 3.0, 4.0]).unwrap();
        let transformer = PowerTransformer::yeo_johnson(false);

        assert!(transformer.transform(&data).is_err());
        assert!(transformer.inverse_transform(&data).is_err());
    }

    #[test]
    fn test_power_transformer_creation_methods() {
        let transformer1 = PowerTransformer::new("yeo-johnson", true).unwrap();
        let transformer2 = PowerTransformer::yeo_johnson(true);

        assert_eq!(transformer1.method, transformer2.method);
        assert_eq!(transformer1.standardize, transformer2.standardize);

        let transformer3 = PowerTransformer::new("box-cox", false).unwrap();
        let transformer4 = PowerTransformer::box_cox(false);

        assert_eq!(transformer3.method, transformer4.method);
        assert_eq!(transformer3.standardize, transformer4.standardize);

        assert!(PowerTransformer::new("invalid", false).is_err());
    }

    #[test]
    fn test_power_transformer_empty_data() {
        let empty_data = Array2::<f64>::zeros((0, 2));
        let mut transformer = PowerTransformer::yeo_johnson(false);

        assert!(transformer.fit(&empty_data).is_err());
    }

    #[test]
    fn test_yeo_johnson_inverse_functions() {
        let test_values = vec![-2.0, -0.5, 0.0, 0.5, 1.0, 2.0];
        let lambdas = vec![0.0, 0.5, 1.0, 1.5, 2.0];

        for &lambda in &lambdas {
            for &x in &test_values {
                let y = yeo_johnson_transform(x, lambda);
                let x_recovered = yeo_johnson_inverse_transform(y, lambda);
                assert_abs_diff_eq!(x_recovered, x, epsilon = 1e-10);
            }
        }
    }
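
    // Added illustrative check: the log-Jacobian should equal ln|dT/dx| of
    // the forward Yeo-Johnson transform. Here it is compared against a
    // central finite-difference approximation of the derivative.
    #[test]
    fn test_yeo_johnson_log_jacobian_matches_derivative() {
        let h = 1e-6;
        for &lambda in &[0.0, 0.5, 1.5] {
            for &x in &[-1.5, -0.25, 0.25, 1.5] {
                let derivative = (yeo_johnson_transform(x + h, lambda)
                    - yeo_johnson_transform(x - h, lambda))
                    / (2.0 * h);
                assert_abs_diff_eq!(
                    yeo_johnson_log_jacobian(x, lambda),
                    derivative.ln(),
                    epsilon = 1e-5
                );
            }
        }
    }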

    #[test]
    fn test_box_cox_inverse_functions() {
        let test_values = vec![0.1, 0.5, 1.0, 2.0, 5.0];
        let lambdas = vec![0.0, 0.5, 1.0, 1.5, 2.0];

        for &lambda in &lambdas {
            for &x in &test_values {
                let y = box_cox_transform(x, lambda);
                let x_recovered = box_cox_inverse_transform(y, lambda);
                assert_abs_diff_eq!(x_recovered, x, epsilon = 1e-10);
            }
        }
    }
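
    // Added illustrative check: with degree 3, lower-degree cross terms such
    // as x0*x1 must appear alongside the pure cubes (this exercises the
    // per-degree enumeration in `transform`).
    #[test]
    fn test_polynomial_features_degree_three_cross_terms() {
        let data = Array::from_shape_vec((1, 2), vec![2.0, 3.0]).unwrap();

        let poly = PolynomialFeatures::new(3, false, true);
        let transformed = poly.transform(&data).unwrap();

        // 1 bias + 2 linear + 3 quadratic + 4 cubic = 10 columns.
        assert_eq!(transformed.shape(), &[1, 10]);

        // x0*x1 = 6 and x0^2*x1 = 12 must both be present.
        for &expected in &[6.0, 12.0] {
            assert!(
                transformed
                    .row(0)
                    .iter()
                    .any(|&v| (v - expected).abs() < 1e-10),
                "expected value {expected} not found"
            );
        }
    }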
}