sklears_preprocessing/feature_engineering/power_transformer.rs

//! Power transformations to make data more Gaussian-like
//!
//! This module provides Box-Cox and Yeo-Johnson power transformations that can help
//! make data more normally distributed, improving the performance of many ML algorithms.
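//!
//! # Example
//!
//! A minimal usage sketch mirroring the unit tests at the bottom of this file
//! (marked `ignore` so it is not run as a doc test):
//!
//! ```ignore
//! use scirs2_core::ndarray::array;
//!
//! let x = array![[-1.0], [0.0], [1.0], [2.0]];
//! let fitted = PowerTransformer::new()
//!     .method(PowerMethod::YeoJohnson)
//!     .fit(&x, &())?;
//! let transformed = fitted.transform(&x)?;
//! ```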

use scirs2_core::ndarray::{Array1, Array2, Axis};
use sklears_core::{
    error::{Result, SklearsError},
    traits::{Fit, Trained, Transform, Untrained},
    types::Float,
};
use std::marker::PhantomData;

/// Configuration for PowerTransformer
#[derive(Debug, Clone)]
pub struct PowerTransformerConfig {
    /// Power transformation method
    pub method: PowerMethod,
    /// Whether to apply zero-mean, unit-variance normalization
    pub standardize: bool,
    /// Small constant to add for numerical stability
    pub eps: Float,
}

impl Default for PowerTransformerConfig {
    fn default() -> Self {
        Self {
            method: PowerMethod::YeoJohnson,
            standardize: true,
            eps: 1e-6,
        }
    }
}

/// Power transformation methods
#[derive(Debug, Clone, Copy)]
pub enum PowerMethod {
    /// Box-Cox transformation (requires strictly positive data)
    BoxCox,
    /// Yeo-Johnson transformation (works with any real-valued data)
    YeoJohnson,
}

/// PowerTransformer applies power transformations to make data more Gaussian
///
/// This transformer fits an optimal power transformation to each feature
/// independently, which can make skewed data more Gaussian-like and improve
/// the performance of many machine learning algorithms.
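///
/// A sketch of the builder-style configuration (all setters shown are defined
/// on the untrained transformer below; marked `ignore` so it is not run as a
/// doc test, and the `eps` value is just an example):
///
/// ```ignore
/// let transformer = PowerTransformer::new()
///     .method(PowerMethod::BoxCox)
///     .standardize(false)
///     .eps(1e-9);
/// ```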
#[derive(Debug, Clone)]
pub struct PowerTransformer<State = Untrained> {
    config: PowerTransformerConfig,
    state: PhantomData<State>,
    // Fitted parameters
    n_features_in_: Option<usize>,
    lambdas_: Option<Array1<Float>>, // Optimal lambda parameters for each feature
    means_: Option<Array1<Float>>,   // Means for standardization
    stds_: Option<Array1<Float>>,    // Standard deviations for standardization
}

impl PowerTransformer<Untrained> {
    /// Create a new PowerTransformer
    pub fn new() -> Self {
        Self {
            config: PowerTransformerConfig::default(),
            state: PhantomData,
            n_features_in_: None,
            lambdas_: None,
            means_: None,
            stds_: None,
        }
    }

    /// Create a new PowerTransformer with custom configuration
    pub fn with_config(config: PowerTransformerConfig) -> Self {
        Self {
            config,
            state: PhantomData,
            n_features_in_: None,
            lambdas_: None,
            means_: None,
            stds_: None,
        }
    }

    /// Set the transformation method
    pub fn method(mut self, method: PowerMethod) -> Self {
        self.config.method = method;
        self
    }

    /// Set whether to standardize after transformation
    pub fn standardize(mut self, standardize: bool) -> Self {
        self.config.standardize = standardize;
        self
    }

    /// Set the epsilon for numerical stability
    pub fn eps(mut self, eps: Float) -> Self {
        self.config.eps = eps;
        self
    }

    /// Find the optimal lambda for one feature by maximizing a simplified
    /// log-likelihood over a fixed grid of candidate values
    fn find_optimal_lambda(&self, x: &Array1<Float>) -> Float {
        let mut best_lambda = 0.0;
        let mut best_log_likelihood = Float::NEG_INFINITY;

        // Grid search over lambda in [-2.0, 2.0] with step 0.1
        let lambda_range = Array1::linspace(-2.0, 2.0, 41);

        for &lambda in lambda_range.iter() {
            let log_likelihood = self.compute_log_likelihood(x, lambda);
            if log_likelihood > best_log_likelihood {
                best_log_likelihood = log_likelihood;
                best_lambda = lambda;
            }
        }

        best_lambda
    }

    /// Compute log-likelihood for a given lambda
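    ///
    /// Assuming the transformed values `y` are i.i.d. normal, the score being
    /// maximized is the usual profiled log-likelihood (matching the code below):
    ///
    /// ```text
    /// LL(λ) = -(n/2)·ln(2π·σ²) - Σᵢ (yᵢ - ȳ)² / (2σ²) + log|J(λ)|
    /// ```
    ///
    /// where σ² is the biased sample variance of `y` and log|J(λ)| is the
    /// log-Jacobian of the chosen transform.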
    fn compute_log_likelihood(&self, x: &Array1<Float>, lambda: Float) -> Float {
        let transformed = self.apply_power_transform(x, lambda);

        // Log-likelihood under a normality assumption
        let mean = transformed.mean().unwrap_or(0.0);
        let variance = transformed.var(0.0);

        if variance <= 0.0 {
            return Float::NEG_INFINITY;
        }

        let n = x.len() as Float;
        let log_likelihood = -0.5 * n * (2.0 * std::f64::consts::PI * variance).ln()
            - 0.5
                * transformed
                    .iter()
                    .map(|&val| (val - mean).powi(2))
                    .sum::<Float>()
                / variance;

        // Add the log-Jacobian term of the transformation
        let jacobian = match self.config.method {
            PowerMethod::BoxCox => {
                (lambda - 1.0)
                    * x.iter()
                        .map(|&val| val.max(self.config.eps).ln())
                        .sum::<Float>()
            }
            PowerMethod::YeoJohnson => x
                .iter()
                .map(|&val| {
                    if val >= 0.0 {
                        (lambda - 1.0) * (val + 1.0).ln()
                    } else {
                        (1.0 - lambda) * (-val + 1.0).ln()
                    }
                })
                .sum::<Float>(),
        };

        log_likelihood + jacobian
    }

    /// Apply the power transformation with a given lambda
    fn apply_power_transform(&self, x: &Array1<Float>, lambda: Float) -> Array1<Float> {
        match self.config.method {
            PowerMethod::BoxCox => self.box_cox_transform(x, lambda),
            PowerMethod::YeoJohnson => self.yeo_johnson_transform(x, lambda),
        }
    }

    /// Box-Cox transformation
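    ///
    /// Values are first clamped to `eps` to keep them in the Box-Cox domain;
    /// then, elementwise:
    ///
    /// ```text
    /// y = (x^λ - 1) / λ   if λ ≠ 0
    /// y = ln(x)           if λ = 0
    /// ```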
    fn box_cox_transform(&self, x: &Array1<Float>, lambda: Float) -> Array1<Float> {
        x.mapv(|val| {
            let val_pos = val.max(self.config.eps);
            if lambda.abs() < 1e-8 {
                val_pos.ln()
            } else {
                (val_pos.powf(lambda) - 1.0) / lambda
            }
        })
    }

    /// Yeo-Johnson transformation
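    ///
    /// Elementwise, with a separate branch for negative values:
    ///
    /// ```text
    /// y = ((x + 1)^λ - 1) / λ              if x ≥ 0, λ ≠ 0
    /// y = ln(x + 1)                        if x ≥ 0, λ = 0
    /// y = -((1 - x)^(2-λ) - 1) / (2 - λ)   if x < 0, λ ≠ 2
    /// y = -ln(1 - x)                       if x < 0, λ = 2
    /// ```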
    fn yeo_johnson_transform(&self, x: &Array1<Float>, lambda: Float) -> Array1<Float> {
        x.mapv(|val| {
            if val >= 0.0 {
                if lambda.abs() < 1e-8 {
                    (val + 1.0).ln()
                } else {
                    ((val + 1.0).powf(lambda) - 1.0) / lambda
                }
            } else if (lambda - 2.0).abs() < 1e-8 {
                -(-val + 1.0).ln()
            } else {
                -((-val + 1.0).powf(2.0 - lambda) - 1.0) / (2.0 - lambda)
            }
        })
    }
}

impl PowerTransformer<Trained> {
    /// Get the number of input features
    pub fn n_features_in(&self) -> usize {
        self.n_features_in_
            .expect("PowerTransformer should be fitted")
    }

    /// Get the optimal lambda parameters
    pub fn lambdas(&self) -> &Array1<Float> {
        self.lambdas_
            .as_ref()
            .expect("PowerTransformer should be fitted")
    }

    /// Get the means used for standardization
    pub fn means(&self) -> Option<&Array1<Float>> {
        self.means_.as_ref()
    }

    /// Get the standard deviations used for standardization
    pub fn stds(&self) -> Option<&Array1<Float>> {
        self.stds_.as_ref()
    }

    /// Apply Box-Cox transformation
    fn box_cox_transform_fitted(&self, x: &Array1<Float>, lambda: Float) -> Array1<Float> {
        // FIXME: SIMD implementation disabled for compilation
        // Use CPU fallback for now
        self.box_cox_transform_cpu(x, lambda)
    }

    /// Apply Yeo-Johnson transformation
    fn yeo_johnson_transform_fitted(&self, x: &Array1<Float>, lambda: Float) -> Array1<Float> {
        // FIXME: SIMD implementation disabled for compilation
        // Use CPU fallback for now
        self.yeo_johnson_transform_cpu(x, lambda)
    }

    /// CPU implementation of Box-Cox transformation
    fn box_cox_transform_cpu(&self, x: &Array1<Float>, lambda: Float) -> Array1<Float> {
        x.mapv(|val| {
            let val_pos = val.max(self.config.eps);
            if lambda.abs() < 1e-8 {
                val_pos.ln()
            } else {
                (val_pos.powf(lambda) - 1.0) / lambda
            }
        })
    }

    /// CPU implementation of Yeo-Johnson transformation
    fn yeo_johnson_transform_cpu(&self, x: &Array1<Float>, lambda: Float) -> Array1<Float> {
        x.mapv(|val| {
            if val >= 0.0 {
                if lambda.abs() < 1e-8 {
                    (val + 1.0).ln()
                } else {
                    ((val + 1.0).powf(lambda) - 1.0) / lambda
                }
            } else if (lambda - 2.0).abs() < 1e-8 {
                -(-val + 1.0).ln()
            } else {
                -((-val + 1.0).powf(2.0 - lambda) - 1.0) / (2.0 - lambda)
            }
        })
    }

    /// Inverse Box-Cox transformation
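    ///
    /// Elementwise (for λ ≠ 0 the result is clamped to `eps` to guard against
    /// out-of-domain inputs):
    ///
    /// ```text
    /// x = exp(y)            if λ = 0
    /// x = (λ·y + 1)^(1/λ)   if λ ≠ 0
    /// ```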
    pub fn inverse_box_cox(&self, x: &Array1<Float>, lambda: Float) -> Array1<Float> {
        x.mapv(|val| {
            if lambda.abs() < 1e-8 {
                val.exp()
            } else {
                (lambda * val + 1.0).powf(1.0 / lambda).max(self.config.eps)
            }
        })
    }

    /// Inverse Yeo-Johnson transformation
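    ///
    /// Elementwise, inverting each Yeo-Johnson branch:
    ///
    /// ```text
    /// x = (λ·y + 1)^(1/λ) - 1                if y ≥ 0, λ ≠ 0
    /// x = exp(y) - 1                         if y ≥ 0, λ = 0
    /// x = 1 - ((2 - λ)·(-y) + 1)^(1/(2-λ))   if y < 0, λ ≠ 2
    /// x = 1 - exp(-y)                        if y < 0, λ = 2
    /// ```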
    pub fn inverse_yeo_johnson(&self, x: &Array1<Float>, lambda: Float) -> Array1<Float> {
        x.mapv(|val| {
            if val >= 0.0 {
                if lambda.abs() < 1e-8 {
                    val.exp() - 1.0
                } else {
                    (lambda * val + 1.0).powf(1.0 / lambda) - 1.0
                }
            } else if (lambda - 2.0).abs() < 1e-8 {
                -(-val).exp() + 1.0
            } else {
                -((2.0 - lambda) * (-val) + 1.0).powf(1.0 / (2.0 - lambda)) + 1.0
            }
        })
    }

    /// Apply the inverse transformation to an entire dataset, undoing
    /// standardization first (if it was applied) and then the per-feature
    /// power transformation
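    ///
    /// A minimal round-trip sketch (marked `ignore` so it is not run as a doc
    /// test; `fitted` is a trained transformer as in the tests below):
    ///
    /// ```ignore
    /// let transformed = fitted.transform(&x)?;
    /// let recovered = fitted.inverse_transform(&transformed)?;
    /// // `recovered` approximates `x` up to numerical error
    /// ```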
    pub fn inverse_transform(&self, x: &Array2<Float>) -> Result<Array2<Float>> {
        let (_n_samples, n_features) = x.dim();

        if n_features != self.n_features_in() {
            return Err(SklearsError::FeatureMismatch {
                expected: self.n_features_in(),
                actual: n_features,
            });
        }

        let lambdas = self.lambdas();
        let mut result = x.clone();

        // Apply inverse standardization if it was applied
        if self.config.standardize {
            if let (Some(means), Some(stds)) = (&self.means_, &self.stds_) {
                for j in 0..n_features {
                    let mut column = result.column_mut(j);
                    column.mapv_inplace(|val| val * stds[j] + means[j]);
                }
            }
        }

        // Apply inverse power transformation to each feature
        for j in 0..n_features {
            let feature_column = result.column(j).to_owned();
            let inverse_transformed_column = match self.config.method {
                PowerMethod::BoxCox => self.inverse_box_cox(&feature_column, lambdas[j]),
                PowerMethod::YeoJohnson => self.inverse_yeo_johnson(&feature_column, lambdas[j]),
            };
            result.column_mut(j).assign(&inverse_transformed_column);
        }

        Ok(result)
    }
}

impl Default for PowerTransformer<Untrained> {
    fn default() -> Self {
        Self::new()
    }
}

impl Fit<Array2<Float>, ()> for PowerTransformer<Untrained> {
    type Fitted = PowerTransformer<Trained>;

    fn fit(self, x: &Array2<Float>, _y: &()) -> Result<Self::Fitted> {
        let (n_samples, n_features) = x.dim();

        if n_samples == 0 {
            return Err(SklearsError::InvalidInput(
                "Cannot fit PowerTransformer on empty dataset".to_string(),
            ));
        }

        // Box-Cox is only defined for strictly positive data
        if matches!(self.config.method, PowerMethod::BoxCox) {
            for &val in x.iter() {
                if val <= 0.0 {
                    return Err(SklearsError::InvalidInput(
                        "Box-Cox transformation requires strictly positive data".to_string(),
                    ));
                }
            }
        }

        // Find the optimal lambda for each feature independently
        let mut lambdas = Array1::<Float>::zeros(n_features);
        for j in 0..n_features {
            let feature_column = x.column(j).to_owned();
            lambdas[j] = self.find_optimal_lambda(&feature_column);
        }

        // Compute means and stds for standardization if requested
        let (means, stds) = if self.config.standardize {
            // Transform the data first, then compute statistics on the result
            let mut transformed_data = Array2::<Float>::zeros(x.dim());
            for j in 0..n_features {
                let feature_column = x.column(j).to_owned();
                let transformed_column = self.apply_power_transform(&feature_column, lambdas[j]);
                transformed_data.column_mut(j).assign(&transformed_column);
            }

            let means = transformed_data.mean_axis(Axis(0)).unwrap();
            let stds = transformed_data.std_axis(Axis(0), 0.0);
            (Some(means), Some(stds))
        } else {
            (None, None)
        };

        Ok(PowerTransformer {
            config: self.config,
            state: PhantomData,
            n_features_in_: Some(n_features),
            lambdas_: Some(lambdas),
            means_: means,
            stds_: stds,
        })
    }
}

impl Transform<Array2<Float>, Array2<Float>> for PowerTransformer<Trained> {
    fn transform(&self, x: &Array2<Float>) -> Result<Array2<Float>> {
        let (_n_samples, n_features) = x.dim();

        if n_features != self.n_features_in() {
            return Err(SklearsError::FeatureMismatch {
                expected: self.n_features_in(),
                actual: n_features,
            });
        }

        let lambdas = self.lambdas();
        let mut result = Array2::<Float>::zeros(x.dim());

        // Apply the power transformation to each feature
        for j in 0..n_features {
            let feature_column = x.column(j).to_owned();
            let transformed_column = match self.config.method {
                PowerMethod::BoxCox => self.box_cox_transform_fitted(&feature_column, lambdas[j]),
                PowerMethod::YeoJohnson => {
                    self.yeo_johnson_transform_fitted(&feature_column, lambdas[j])
                }
            };
            result.column_mut(j).assign(&transformed_column);
        }

        // Apply standardization if requested (constant features with zero
        // standard deviation are left unscaled to avoid division by zero)
        if self.config.standardize {
            if let (Some(means), Some(stds)) = (&self.means_, &self.stds_) {
                for j in 0..n_features {
                    let mut column = result.column_mut(j);
                    let mean = means[j];
                    let std = stds[j];
                    if std > 0.0 {
                        column.mapv_inplace(|val| (val - mean) / std);
                    }
                }
            }
        }

        Ok(result)
    }
}

#[allow(non_snake_case)]
#[cfg(test)]
mod tests {
    use super::*;
    use approx::assert_abs_diff_eq;
    use scirs2_core::ndarray::array;

    #[test]
    fn test_power_transformer_yeo_johnson() -> Result<()> {
        let x = array![[-1.0], [0.0], [1.0], [2.0]];
        let transformer = PowerTransformer::new()
            .method(PowerMethod::YeoJohnson)
            .standardize(false);

        let fitted = transformer.fit(&x, &())?;
        let transformed = fitted.transform(&x)?;

        // Should transform without errors
        assert_eq!(transformed.nrows(), 4);
        assert_eq!(transformed.ncols(), 1);

        Ok(())
    }

    #[test]
    fn test_power_transformer_box_cox() -> Result<()> {
        let x = array![[0.1], [1.0], [2.0], [5.0]]; // Positive values only
        let transformer = PowerTransformer::new()
            .method(PowerMethod::BoxCox)
            .standardize(false);

        let fitted = transformer.fit(&x, &())?;
        let transformed = fitted.transform(&x)?;

        // Should transform without errors
        assert_eq!(transformed.nrows(), 4);
        assert_eq!(transformed.ncols(), 1);

        Ok(())
    }

    #[test]
    fn test_box_cox_negative_data_error() {
        let x = array![[-1.0], [1.0], [2.0]]; // Contains a negative value
        let transformer = PowerTransformer::new().method(PowerMethod::BoxCox);

        let result = transformer.fit(&x, &());
        assert!(result.is_err());
    }

    #[test]
    fn test_power_transformer_with_standardization() -> Result<()> {
        let x = array![[1.0], [2.0], [3.0], [4.0]];
        let transformer = PowerTransformer::new()
            .method(PowerMethod::YeoJohnson)
            .standardize(true);

        let fitted = transformer.fit(&x, &())?;
        let transformed = fitted.transform(&x)?;

        // Check that standardization was applied (mean ≈ 0, std ≈ 1)
        let mean = transformed.mean_axis(Axis(0)).unwrap();
        let std = transformed.std_axis(Axis(0), 0.0);

        assert_abs_diff_eq!(mean[0], 0.0, epsilon = 1e-10);
        assert_abs_diff_eq!(std[0], 1.0, epsilon = 1e-10);

        Ok(())
    }

    #[test]
    fn test_inverse_transform() -> Result<()> {
        let x = array![[0.5], [1.0], [1.5], [2.0]];
        let transformer = PowerTransformer::new()
            .method(PowerMethod::YeoJohnson)
            .standardize(false);

        let fitted = transformer.fit(&x, &())?;
        let transformed = fitted.transform(&x)?;
        let inverse_transformed = fitted.inverse_transform(&transformed)?;

        // Should recover the original data (approximately)
        for i in 0..x.nrows() {
            for j in 0..x.ncols() {
                assert_abs_diff_eq!(x[[i, j]], inverse_transformed[[i, j]], epsilon = 1e-6);
            }
        }

        Ok(())
    }
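
    // A Box-Cox round-trip sketch mirroring test_inverse_transform above;
    // it assumes the strictly positive inputs that Box-Cox requires.
    #[test]
    fn test_inverse_transform_box_cox() -> Result<()> {
        let x = array![[0.5], [1.0], [1.5], [2.0]];
        let transformer = PowerTransformer::new()
            .method(PowerMethod::BoxCox)
            .standardize(false);

        let fitted = transformer.fit(&x, &())?;
        let transformed = fitted.transform(&x)?;
        let recovered = fitted.inverse_transform(&transformed)?;

        // Box-Cox and its inverse are exact inverses for positive data,
        // so the original values should be recovered up to float error
        for i in 0..x.nrows() {
            assert_abs_diff_eq!(x[[i, 0]], recovered[[i, 0]], epsilon = 1e-6);
        }

        Ok(())
    }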

    #[test]
    fn test_optimal_lambda_finding() -> Result<()> {
        let x = array![[1.0], [2.0], [4.0], [8.0]]; // Exponential-like data
        let transformer = PowerTransformer::new().method(PowerMethod::BoxCox);

        let fitted = transformer.fit(&x, &())?;
        let lambdas = fitted.lambdas();

        // Lambda should be found (exact value depends on optimization)
        assert_eq!(lambdas.len(), 1);
        assert!(lambdas[0].is_finite());

        Ok(())
    }

    #[test]
    fn test_multiple_features() -> Result<()> {
        let x = array![[1.0, 0.5], [2.0, 1.0], [3.0, 1.5], [4.0, 2.0]];
        let transformer = PowerTransformer::new().method(PowerMethod::YeoJohnson);

        let fitted = transformer.fit(&x, &())?;
        let transformed = fitted.transform(&x)?;

        assert_eq!(transformed.nrows(), 4);
        assert_eq!(transformed.ncols(), 2);
        assert_eq!(fitted.lambdas().len(), 2);

        Ok(())
    }
}