sklears_preprocessing/feature_engineering/
power_transformer.rs1use scirs2_core::ndarray::{Array1, Array2, Axis};
7use sklears_core::{
8 error::{Result, SklearsError},
9 traits::{Fit, Trained, Transform, Untrained},
10 types::Float,
11};
12use std::marker::PhantomData;
13
/// Configuration for [`PowerTransformer`].
#[derive(Debug, Clone)]
pub struct PowerTransformerConfig {
    /// Which power-transform family to apply (Box-Cox or Yeo-Johnson).
    pub method: PowerMethod,
    /// When `true`, transformed features are rescaled using the per-feature
    /// mean/std computed on the training data during `fit`.
    pub standardize: bool,
    /// Small positive floor used to clamp Box-Cox inputs (and inverse-Box-Cox
    /// outputs) away from zero so `ln`/`powf` stay defined.
    pub eps: Float,
}
24
25impl Default for PowerTransformerConfig {
26 fn default() -> Self {
27 Self {
28 method: PowerMethod::YeoJohnson,
29 standardize: true,
30 eps: 1e-6,
31 }
32 }
33}
34
/// Power-transform family.
#[derive(Debug, Clone, Copy)]
pub enum PowerMethod {
    /// Box-Cox transform; fitting rejects data that is not strictly positive.
    BoxCox,
    /// Yeo-Johnson transform; handles negative, zero and positive values.
    YeoJohnson,
}
43
/// Power transformer that maps features toward a more Gaussian-like shape
/// via a per-feature maximum-likelihood power transform. The typestate
/// parameter (`Untrained` / `Trained`) makes transform-before-fit
/// unrepresentable at compile time.
#[derive(Debug, Clone)]
pub struct PowerTransformer<State = Untrained> {
    config: PowerTransformerConfig,
    // Zero-sized marker carrying the fit state in the type system.
    state: PhantomData<State>,
    // Set by `fit`; `None` while untrained.
    n_features_in_: Option<usize>,
    // Per-feature lambda chosen by log-likelihood grid search in `fit`.
    lambdas_: Option<Array1<Float>>,
    // Standardization statistics; `Some` only when `standardize` is enabled.
    means_: Option<Array1<Float>>,
    stds_: Option<Array1<Float>>,
}
59
60impl PowerTransformer<Untrained> {
61 pub fn new() -> Self {
63 Self {
64 config: PowerTransformerConfig::default(),
65 state: PhantomData,
66 n_features_in_: None,
67 lambdas_: None,
68 means_: None,
69 stds_: None,
70 }
71 }
72
73 pub fn with_config(config: PowerTransformerConfig) -> Self {
75 Self {
76 config,
77 state: PhantomData,
78 n_features_in_: None,
79 lambdas_: None,
80 means_: None,
81 stds_: None,
82 }
83 }
84
85 pub fn method(mut self, method: PowerMethod) -> Self {
87 self.config.method = method;
88 self
89 }
90
91 pub fn standardize(mut self, standardize: bool) -> Self {
93 self.config.standardize = standardize;
94 self
95 }
96
97 pub fn eps(mut self, eps: Float) -> Self {
99 self.config.eps = eps;
100 self
101 }
102
103 fn find_optimal_lambda(&self, x: &Array1<Float>) -> Float {
105 let mut best_lambda = 0.0;
106 let mut best_log_likelihood = Float::NEG_INFINITY;
107
108 let lambda_range = Array1::linspace(-2.0, 2.0, 41);
110
111 for &lambda in lambda_range.iter() {
112 let log_likelihood = self.compute_log_likelihood(x, lambda);
113 if log_likelihood > best_log_likelihood {
114 best_log_likelihood = log_likelihood;
115 best_lambda = lambda;
116 }
117 }
118
119 best_lambda
120 }
121
122 fn compute_log_likelihood(&self, x: &Array1<Float>, lambda: Float) -> Float {
124 let transformed = self.apply_power_transform(x, lambda);
125
126 let mean = transformed.mean().unwrap_or(0.0);
128 let variance = transformed.var(0.0);
129
130 if variance <= 0.0 {
131 return Float::NEG_INFINITY;
132 }
133
134 let n = x.len() as Float;
135 let log_likelihood = -0.5 * n * (2.0 * std::f64::consts::PI * variance).ln()
136 - 0.5
137 * transformed
138 .iter()
139 .map(|&val| (val - mean).powi(2))
140 .sum::<Float>()
141 / variance;
142
143 let jacobian = match self.config.method {
145 PowerMethod::BoxCox => {
146 (lambda - 1.0)
147 * x.iter()
148 .map(|&val| val.max(self.config.eps).ln())
149 .sum::<Float>()
150 }
151 PowerMethod::YeoJohnson => x
152 .iter()
153 .map(|&val| {
154 if val >= 0.0 {
155 (lambda - 1.0) * (val + 1.0).ln()
156 } else {
157 (1.0 - lambda) * (-val + 1.0).ln()
158 }
159 })
160 .sum::<Float>(),
161 };
162
163 log_likelihood + jacobian
164 }
165
166 fn apply_power_transform(&self, x: &Array1<Float>, lambda: Float) -> Array1<Float> {
168 match self.config.method {
169 PowerMethod::BoxCox => self.box_cox_transform(x, lambda),
170 PowerMethod::YeoJohnson => self.yeo_johnson_transform(x, lambda),
171 }
172 }
173
174 fn box_cox_transform(&self, x: &Array1<Float>, lambda: Float) -> Array1<Float> {
176 x.mapv(|val| {
177 let val_pos = val.max(self.config.eps);
178 if lambda.abs() < 1e-8 {
179 val_pos.ln()
180 } else {
181 (val_pos.powf(lambda) - 1.0) / lambda
182 }
183 })
184 }
185
186 fn yeo_johnson_transform(&self, x: &Array1<Float>, lambda: Float) -> Array1<Float> {
188 x.mapv(|val| {
189 if val >= 0.0 {
190 if lambda.abs() < 1e-8 {
191 (val + 1.0).ln()
192 } else {
193 ((val + 1.0).powf(lambda) - 1.0) / lambda
194 }
195 } else if (lambda - 2.0).abs() < 1e-8 {
196 -(-val + 1.0).ln()
197 } else {
198 -((-val + 1.0).powf(2.0 - lambda) - 1.0) / (2.0 - lambda)
199 }
200 })
201 }
202}
203
204impl PowerTransformer<Trained> {
205 pub fn n_features_in(&self) -> usize {
207 self.n_features_in_
208 .expect("PowerTransformer should be fitted")
209 }
210
211 pub fn lambdas(&self) -> &Array1<Float> {
213 self.lambdas_
214 .as_ref()
215 .expect("PowerTransformer should be fitted")
216 }
217
218 pub fn means(&self) -> Option<&Array1<Float>> {
220 self.means_.as_ref()
221 }
222
223 pub fn stds(&self) -> Option<&Array1<Float>> {
225 self.stds_.as_ref()
226 }
227
228 fn box_cox_transform_fitted(&self, x: &Array1<Float>, lambda: Float) -> Array1<Float> {
230 self.box_cox_transform_cpu(x, lambda)
233 }
234
235 fn yeo_johnson_transform_fitted(&self, x: &Array1<Float>, lambda: Float) -> Array1<Float> {
237 self.yeo_johnson_transform_cpu(x, lambda)
240 }
241
242 fn box_cox_transform_cpu(&self, x: &Array1<Float>, lambda: Float) -> Array1<Float> {
244 x.mapv(|val| {
245 let val_pos = val.max(self.config.eps);
246 if lambda.abs() < 1e-8 {
247 val_pos.ln()
248 } else {
249 (val_pos.powf(lambda) - 1.0) / lambda
250 }
251 })
252 }
253
254 fn yeo_johnson_transform_cpu(&self, x: &Array1<Float>, lambda: Float) -> Array1<Float> {
256 x.mapv(|val| {
257 if val >= 0.0 {
258 if lambda.abs() < 1e-8 {
259 (val + 1.0).ln()
260 } else {
261 ((val + 1.0).powf(lambda) - 1.0) / lambda
262 }
263 } else if (lambda - 2.0).abs() < 1e-8 {
264 -(-val + 1.0).ln()
265 } else {
266 -((-val + 1.0).powf(2.0 - lambda) - 1.0) / (2.0 - lambda)
267 }
268 })
269 }
270
271 pub fn inverse_box_cox(&self, x: &Array1<Float>, lambda: Float) -> Array1<Float> {
273 x.mapv(|val| {
274 if lambda.abs() < 1e-8 {
275 val.exp()
276 } else {
277 (lambda * val + 1.0).powf(1.0 / lambda).max(self.config.eps)
278 }
279 })
280 }
281
282 pub fn inverse_yeo_johnson(&self, x: &Array1<Float>, lambda: Float) -> Array1<Float> {
284 x.mapv(|val| {
285 if val >= 0.0 {
286 if lambda.abs() < 1e-8 {
287 val.exp() - 1.0
288 } else {
289 (lambda * val + 1.0).powf(1.0 / lambda) - 1.0
290 }
291 } else if (lambda - 2.0).abs() < 1e-8 {
292 -(-val).exp() + 1.0
293 } else {
294 -((2.0 - lambda) * (-val) + 1.0).powf(1.0 / (2.0 - lambda)) + 1.0
295 }
296 })
297 }
298
299 pub fn inverse_transform(&self, x: &Array2<Float>) -> Result<Array2<Float>> {
301 let (_n_samples, n_features) = x.dim();
302
303 if n_features != self.n_features_in() {
304 return Err(SklearsError::FeatureMismatch {
305 expected: self.n_features_in(),
306 actual: n_features,
307 });
308 }
309
310 let lambdas = self.lambdas();
311 let mut result = x.clone();
312
313 if self.config.standardize {
315 if let (Some(means), Some(stds)) = (&self.means_, &self.stds_) {
316 for j in 0..n_features {
317 let mut column = result.column_mut(j);
318 column.mapv_inplace(|val| val * stds[j] + means[j]);
319 }
320 }
321 }
322
323 for j in 0..n_features {
325 let feature_column = result.column(j).to_owned();
326 let inverse_transformed_column = match self.config.method {
327 PowerMethod::BoxCox => self.inverse_box_cox(&feature_column, lambdas[j]),
328 PowerMethod::YeoJohnson => self.inverse_yeo_johnson(&feature_column, lambdas[j]),
329 };
330 result.column_mut(j).assign(&inverse_transformed_column);
331 }
332
333 Ok(result)
334 }
335}
336
337impl Default for PowerTransformer<Untrained> {
338 fn default() -> Self {
339 Self::new()
340 }
341}
342
343impl Fit<Array2<Float>, ()> for PowerTransformer<Untrained> {
344 type Fitted = PowerTransformer<Trained>;
345
346 fn fit(self, x: &Array2<Float>, _y: &()) -> Result<Self::Fitted> {
347 let (n_samples, n_features) = x.dim();
348
349 if n_samples == 0 {
350 return Err(SklearsError::InvalidInput(
351 "Cannot fit PowerTransformer on empty dataset".to_string(),
352 ));
353 }
354
355 if matches!(self.config.method, PowerMethod::BoxCox) {
357 for &val in x.iter() {
358 if val <= 0.0 {
359 return Err(SklearsError::InvalidInput(
360 "Box-Cox transformation requires strictly positive data".to_string(),
361 ));
362 }
363 }
364 }
365
366 let mut lambdas = Array1::<Float>::zeros(n_features);
368 for j in 0..n_features {
369 let feature_column = x.column(j).to_owned();
370 lambdas[j] = self.find_optimal_lambda(&feature_column);
371 }
372
373 let (means, stds) = if self.config.standardize {
375 let mut transformed_data = Array2::<Float>::zeros(x.dim());
377 for j in 0..n_features {
378 let feature_column = x.column(j).to_owned();
379 let transformed_column = self.apply_power_transform(&feature_column, lambdas[j]);
380 transformed_data.column_mut(j).assign(&transformed_column);
381 }
382
383 let means = transformed_data.mean_axis(Axis(0)).unwrap();
384 let stds = transformed_data.std_axis(Axis(0), 0.0);
385 (Some(means), Some(stds))
386 } else {
387 (None, None)
388 };
389
390 Ok(PowerTransformer {
391 config: self.config,
392 state: PhantomData,
393 n_features_in_: Some(n_features),
394 lambdas_: Some(lambdas),
395 means_: means,
396 stds_: stds,
397 })
398 }
399}
400
401impl Transform<Array2<Float>, Array2<Float>> for PowerTransformer<Trained> {
402 fn transform(&self, x: &Array2<Float>) -> Result<Array2<Float>> {
403 let (_n_samples, n_features) = x.dim();
404
405 if n_features != self.n_features_in() {
406 return Err(SklearsError::FeatureMismatch {
407 expected: self.n_features_in(),
408 actual: n_features,
409 });
410 }
411
412 let lambdas = self.lambdas();
413 let mut result = Array2::<Float>::zeros(x.dim());
414
415 for j in 0..n_features {
417 let feature_column = x.column(j).to_owned();
418 let transformed_column = match self.config.method {
419 PowerMethod::BoxCox => self.box_cox_transform_fitted(&feature_column, lambdas[j]),
420 PowerMethod::YeoJohnson => {
421 self.yeo_johnson_transform_fitted(&feature_column, lambdas[j])
422 }
423 };
424 result.column_mut(j).assign(&transformed_column);
425 }
426
427 if self.config.standardize {
429 if let (Some(means), Some(stds)) = (&self.means_, &self.stds_) {
430 for j in 0..n_features {
431 let mut column = result.column_mut(j);
432 let mean = means[j];
433 let std = stds[j];
434 if std > 0.0 {
435 column.mapv_inplace(|val| (val - mean) / std);
436 }
437 }
438 }
439 }
440
441 Ok(result)
442 }
443}
444
#[allow(non_snake_case)]
#[cfg(test)]
mod tests {
    use super::*;
    use approx::assert_abs_diff_eq;
    use scirs2_core::ndarray::array;

    /// Yeo-Johnson accepts mixed-sign input and preserves the data shape.
    #[test]
    fn test_power_transformer_yeo_johnson() -> Result<()> {
        let x = array![[-1.0], [0.0], [1.0], [2.0]];
        let transformer = PowerTransformer::new()
            .method(PowerMethod::YeoJohnson)
            .standardize(false);

        let fitted = transformer.fit(&x, &())?;
        let transformed = fitted.transform(&x)?;

        // Output shape must match input shape.
        assert_eq!(transformed.nrows(), 4);
        assert_eq!(transformed.ncols(), 1);

        Ok(())
    }

    /// Box-Cox fits and transforms strictly positive data.
    #[test]
    fn test_power_transformer_box_cox() -> Result<()> {
        let x = array![[0.1], [1.0], [2.0], [5.0]];
        let transformer = PowerTransformer::new()
            .method(PowerMethod::BoxCox)
            .standardize(false);

        let fitted = transformer.fit(&x, &())?;
        let transformed = fitted.transform(&x)?;

        // Output shape must match input shape.
        assert_eq!(transformed.nrows(), 4);
        assert_eq!(transformed.ncols(), 1);

        Ok(())
    }

    /// Box-Cox fitting must reject non-positive input values.
    #[test]
    fn test_box_cox_negative_data_error() {
        let x = array![[-1.0], [1.0], [2.0]];
        let transformer = PowerTransformer::new().method(PowerMethod::BoxCox);

        let result = transformer.fit(&x, &());
        assert!(result.is_err());
    }

    /// With standardization enabled, the transformed training data has
    /// zero mean and unit (population) standard deviation.
    #[test]
    fn test_power_transformer_with_standardization() -> Result<()> {
        let x = array![[1.0], [2.0], [3.0], [4.0]];
        let transformer = PowerTransformer::new()
            .method(PowerMethod::YeoJohnson)
            .standardize(true);

        let fitted = transformer.fit(&x, &())?;
        let transformed = fitted.transform(&x)?;

        let mean = transformed.mean_axis(Axis(0)).unwrap();
        let std = transformed.std_axis(Axis(0), 0.0);

        assert_abs_diff_eq!(mean[0], 0.0, epsilon = 1e-10);
        assert_abs_diff_eq!(std[0], 1.0, epsilon = 1e-10);

        Ok(())
    }

    /// transform followed by inverse_transform recovers the original data.
    #[test]
    fn test_inverse_transform() -> Result<()> {
        let x = array![[0.5], [1.0], [1.5], [2.0]];
        let transformer = PowerTransformer::new()
            .method(PowerMethod::YeoJohnson)
            .standardize(false);

        let fitted = transformer.fit(&x, &())?;
        let transformed = fitted.transform(&x)?;
        let inverse_transformed = fitted.inverse_transform(&transformed)?;

        // Round-trip should match the input within numeric tolerance.
        for i in 0..x.nrows() {
            for j in 0..x.ncols() {
                assert_abs_diff_eq!(x[[i, j]], inverse_transformed[[i, j]], epsilon = 1e-6);
            }
        }

        Ok(())
    }

    /// The lambda grid search produces one finite lambda per feature.
    #[test]
    fn test_optimal_lambda_finding() -> Result<()> {
        let x = array![[1.0], [2.0], [4.0], [8.0]];
        let transformer = PowerTransformer::new().method(PowerMethod::BoxCox);

        let fitted = transformer.fit(&x, &())?;
        let lambdas = fitted.lambdas();

        assert!(lambdas.len() == 1);
        assert!(lambdas[0].is_finite());

        Ok(())
    }

    /// Each feature column gets its own lambda; shapes are preserved.
    #[test]
    fn test_multiple_features() -> Result<()> {
        let x = array![[1.0, 0.5], [2.0, 1.0], [3.0, 1.5], [4.0, 2.0]];
        let transformer = PowerTransformer::new().method(PowerMethod::YeoJohnson);

        let fitted = transformer.fit(&x, &())?;
        let transformed = fitted.transform(&x)?;

        assert_eq!(transformed.nrows(), 4);
        assert_eq!(transformed.ncols(), 2);
        assert_eq!(fitted.lambdas().len(), 2);

        Ok(())
    }
}