sklears_preprocessing/
outlier_transformation.rs

1//! Outlier transformation methods for handling extreme values
2//!
3//! This module provides various transformation methods specifically designed to handle
4//! outliers in data while preserving the overall structure and relationships. Unlike
5//! outlier detection which identifies outliers, these methods transform them to reduce
6//! their impact on downstream analysis.
7//!
8//! # Features
9//!
10//! - **Log Transformation**: Reduces impact of large outliers through logarithmic scaling
11//! - **Square Root Transformation**: Mild transformation for positive outliers
12//! - **Box-Cox Transformation**: Data-driven power transformation for normalization
13//! - **Quantile Transformation**: Maps to uniform or normal distribution
14//! - **Robust Scaling**: Scaling resistant to outliers using median and IQR
15//! - **Outlier Interpolation**: Replace outliers with interpolated values
16//! - **Outlier Smoothing**: Smooth outliers using neighboring values
17//! - **Trimmed Transformation**: Apply transformations after trimming extreme percentiles
18
19use scirs2_core::ndarray::{Array1, Array2, Axis};
20use sklears_core::{
21    error::{Result, SklearsError},
22    traits::{Fit, Trained, Transform, Untrained},
23    types::Float,
24};
25use std::marker::PhantomData;
26
/// Available outlier transformation methods.
///
/// The selected variant decides which branch the fitted transformer applies to
/// every finite input value; non-finite inputs are returned unchanged by the
/// per-value transform.
#[derive(Debug, Clone, Copy)]
pub enum OutlierTransformationMethod {
    /// Natural logarithm transformation (for positive values).
    /// Values that are not positive after the fitted shift become NaN.
    Log,
    /// Log1p transformation (log(1 + x), handles zeros)
    Log1p,
    /// Square root transformation (for positive values).
    /// Values that are negative after the fitted shift become NaN.
    Sqrt,
    /// Box-Cox transformation with automatic lambda estimation
    /// (grid search over lambda in [-2, 2] with step 0.1 during fit)
    BoxCox,
    /// Box-Cox transformation with fixed lambda
    BoxCoxFixed(Float),
    /// Quantile transformation to uniform distribution
    QuantileUniform,
    /// Quantile transformation to normal distribution
    QuantileNormal,
    /// Robust scaling using median and IQR
    RobustScale,
    /// Replace outliers with interpolated values.
    /// NOTE: the per-value transform currently clamps values to the fitted
    /// lower/upper outlier bounds.
    Interpolate,
    /// Smooth outliers using local neighborhood.
    /// NOTE: the per-value transform currently returns the value unchanged,
    /// because smoothing needs neighboring samples.
    Smooth,
    /// Trim extreme percentiles before transformation.
    /// Values are clipped to the fitted lower/upper bounds.
    Trim,
}
53
/// Configuration for outlier transformation.
///
/// All fields are public; the builder methods on the untrained transformer
/// write to them one at a time.
#[derive(Debug, Clone)]
pub struct OutlierTransformationConfig {
    /// Transformation method to apply
    pub method: OutlierTransformationMethod,
    /// Threshold for outlier detection (used with interpolation/smoothing).
    /// Multiplies the standard deviation ("z-score") or the IQR ("iqr") when
    /// the outlier bounds are computed during fit.
    pub outlier_threshold: Float,
    /// Method for outlier detection. Fitting accepts exactly "z-score", "iqr"
    /// or "percentile"; any other string makes fitting fail with an
    /// invalid-input error.
    pub detection_method: String,
    /// Lower percentile for trimming (default: 1.0).
    /// Read when `detection_method` is "percentile".
    pub lower_percentile: Float,
    /// Upper percentile for trimming (default: 99.0).
    /// Read when `detection_method` is "percentile".
    pub upper_percentile: Float,
    /// Window size for smoothing (default: 5)
    pub smoothing_window: usize,
    /// Number of quantiles for quantile transformation (default: 1000)
    pub n_quantiles: usize,
    /// Whether to handle negative values by shifting (default: true).
    /// When enabled, fit stores a shift that moves a non-positive minimum to
    /// `log_epsilon` for the log, square-root and Box-Cox methods.
    pub handle_negatives: bool,
    /// Small constant to add before log transformation to avoid zeros
    pub log_epsilon: Float,
    /// Whether to apply transformation feature-wise (default: true).
    /// When false, one parameter set is fitted on all values together.
    pub feature_wise: bool,
}
78
79impl Default for OutlierTransformationConfig {
80    fn default() -> Self {
81        Self {
82            method: OutlierTransformationMethod::Log1p,
83            outlier_threshold: 3.0,
84            detection_method: "z-score".to_string(),
85            lower_percentile: 1.0,
86            upper_percentile: 99.0,
87            smoothing_window: 5,
88            n_quantiles: 1000,
89            handle_negatives: true,
90            log_epsilon: 1e-8,
91            feature_wise: true,
92        }
93    }
94}
95
/// Outlier transformer for handling extreme values through transformation.
///
/// The `State` type parameter is a compile-time marker (`Untrained` by
/// default). Fitting consumes the untrained value and produces a `Trained`
/// one, which is the only state that implements `Transform`.
#[derive(Debug, Clone)]
pub struct OutlierTransformer<State = Untrained> {
    /// User-supplied configuration, carried over unchanged by `fit`.
    config: OutlierTransformationConfig,
    /// Zero-sized marker recording the training state in the type.
    state: PhantomData<State>,
    // Fitted parameters
    /// Parameters learned by `fit`; `None` until the transformer is fitted.
    transformation_params_: Option<TransformationParameters>,
    /// Number of columns seen by `fit`; `None` until the transformer is fitted.
    n_features_in_: Option<usize>,
}
105
/// Parameters learned during fitting for transformations
#[derive(Debug, Clone)]
pub struct TransformationParameters {
    /// Feature-wise transformation parameters.
    /// Holds one entry per input column when fitting feature-wise, otherwise a
    /// single entry fitted on all values together.
    pub feature_params: Vec<FeatureTransformationParams>,
    /// Global parameters (for non-feature-wise transformations).
    /// NOTE: `fit` currently always stores `None` here.
    pub global_params: Option<GlobalTransformationParams>,
}
114
/// Transformation parameters for a single feature.
///
/// All statistics are computed from the finite values of the feature only. If
/// a feature contains no finite value, every optional field stays `None` and
/// `shift` stays `0.0`.
#[derive(Debug, Clone)]
pub struct FeatureTransformationParams {
    /// Box-Cox lambda parameter (estimated or fixed, depending on the method)
    pub lambda: Option<Float>,
    /// Shift applied to handle negative values (added to each value before the
    /// log / sqrt / Box-Cox transform)
    pub shift: Float,
    /// Quantiles for quantile transformation (ascending values picked from the
    /// sorted training data)
    pub quantiles: Option<Array1<Float>>,
    /// Reference values for quantile transformation (the output value
    /// associated with each quantile position)
    pub references: Option<Array1<Float>>,
    /// Robust scaling parameters (median, IQR)
    pub median: Option<Float>,
    /// Interquartile range, computed as the sorted value at index `3n/4` minus
    /// the sorted value at index `n/4`
    pub iqr: Option<Float>,
    /// Outlier bounds for interpolation/smoothing
    pub lower_bound: Option<Float>,
    /// Upper counterpart of `lower_bound`
    pub upper_bound: Option<Float>,
    /// Statistics for outlier detection
    pub mean: Option<Float>,
    /// Population standard deviation (variance divided by `n`, not `n - 1`)
    pub std: Option<Float>,
}
136
/// Global transformation parameters.
///
/// NOTE(review): nothing in the visible fitting code constructs this type yet;
/// it appears to be reserved for future global transformations.
#[derive(Debug, Clone)]
pub struct GlobalTransformationParams {
    /// Global shift for handling negatives
    pub global_shift: Float,
    /// Global lambda for Box-Cox
    pub global_lambda: Option<Float>,
}
145
146impl OutlierTransformer<Untrained> {
147    /// Create a new OutlierTransformer with default configuration
148    pub fn new() -> Self {
149        Self {
150            config: OutlierTransformationConfig::default(),
151            state: PhantomData,
152            transformation_params_: None,
153            n_features_in_: None,
154        }
155    }
156
157    /// Create a log transformation for outliers
158    pub fn log() -> Self {
159        Self::new().method(OutlierTransformationMethod::Log)
160    }
161
162    /// Create a log1p transformation for outliers
163    pub fn log1p() -> Self {
164        Self::new().method(OutlierTransformationMethod::Log1p)
165    }
166
167    /// Create a square root transformation for outliers
168    pub fn sqrt() -> Self {
169        Self::new().method(OutlierTransformationMethod::Sqrt)
170    }
171
172    /// Create a Box-Cox transformation with automatic lambda
173    pub fn box_cox() -> Self {
174        Self::new().method(OutlierTransformationMethod::BoxCox)
175    }
176
177    /// Create a Box-Cox transformation with fixed lambda
178    pub fn box_cox_fixed(lambda: Float) -> Self {
179        Self::new().method(OutlierTransformationMethod::BoxCoxFixed(lambda))
180    }
181
182    /// Create a quantile transformation to uniform distribution
183    pub fn quantile_uniform(n_quantiles: usize) -> Self {
184        Self::new()
185            .method(OutlierTransformationMethod::QuantileUniform)
186            .n_quantiles(n_quantiles)
187    }
188
189    /// Create a quantile transformation to normal distribution
190    pub fn quantile_normal(n_quantiles: usize) -> Self {
191        Self::new()
192            .method(OutlierTransformationMethod::QuantileNormal)
193            .n_quantiles(n_quantiles)
194    }
195
196    /// Create a robust scaling transformation
197    pub fn robust_scale() -> Self {
198        Self::new().method(OutlierTransformationMethod::RobustScale)
199    }
200
201    /// Create an interpolation transformation
202    pub fn interpolate(threshold: Float, detection_method: &str) -> Self {
203        Self::new()
204            .method(OutlierTransformationMethod::Interpolate)
205            .outlier_threshold(threshold)
206            .detection_method(detection_method.to_string())
207    }
208
209    /// Create a smoothing transformation
210    pub fn smooth(window_size: usize, threshold: Float) -> Self {
211        Self::new()
212            .method(OutlierTransformationMethod::Smooth)
213            .smoothing_window(window_size)
214            .outlier_threshold(threshold)
215    }
216
217    /// Create a trimming transformation
218    pub fn trim(lower_percentile: Float, upper_percentile: Float) -> Self {
219        Self::new()
220            .method(OutlierTransformationMethod::Trim)
221            .lower_percentile(lower_percentile)
222            .upper_percentile(upper_percentile)
223    }
224
225    /// Set the transformation method
226    pub fn method(mut self, method: OutlierTransformationMethod) -> Self {
227        self.config.method = method;
228        self
229    }
230
231    /// Set the outlier detection threshold
232    pub fn outlier_threshold(mut self, threshold: Float) -> Self {
233        self.config.outlier_threshold = threshold;
234        self
235    }
236
237    /// Set the outlier detection method
238    pub fn detection_method(mut self, method: String) -> Self {
239        self.config.detection_method = method;
240        self
241    }
242
243    /// Set the lower percentile for trimming
244    pub fn lower_percentile(mut self, percentile: Float) -> Self {
245        self.config.lower_percentile = percentile;
246        self
247    }
248
249    /// Set the upper percentile for trimming
250    pub fn upper_percentile(mut self, percentile: Float) -> Self {
251        self.config.upper_percentile = percentile;
252        self
253    }
254
255    /// Set the smoothing window size
256    pub fn smoothing_window(mut self, window: usize) -> Self {
257        self.config.smoothing_window = window;
258        self
259    }
260
261    /// Set the number of quantiles
262    pub fn n_quantiles(mut self, n_quantiles: usize) -> Self {
263        self.config.n_quantiles = n_quantiles;
264        self
265    }
266
267    /// Set whether to handle negative values
268    pub fn handle_negatives(mut self, handle: bool) -> Self {
269        self.config.handle_negatives = handle;
270        self
271    }
272
273    /// Set the epsilon for log transformations
274    pub fn log_epsilon(mut self, epsilon: Float) -> Self {
275        self.config.log_epsilon = epsilon;
276        self
277    }
278
279    /// Set whether to apply transformations feature-wise
280    pub fn feature_wise(mut self, feature_wise: bool) -> Self {
281        self.config.feature_wise = feature_wise;
282        self
283    }
284}
285
286impl Fit<Array2<Float>, ()> for OutlierTransformer<Untrained> {
287    type Fitted = OutlierTransformer<Trained>;
288
289    fn fit(mut self, x: &Array2<Float>, _y: &()) -> Result<Self::Fitted> {
290        let (n_samples, n_features) = x.dim();
291
292        if n_samples == 0 || n_features == 0 {
293            return Err(SklearsError::InvalidInput(
294                "Input array is empty".to_string(),
295            ));
296        }
297
298        self.n_features_in_ = Some(n_features);
299
300        // Compute transformation parameters based on method
301        let feature_params = if self.config.feature_wise {
302            (0..n_features)
303                .map(|j| {
304                    self.fit_feature_params(
305                        x.column(j)
306                            .to_owned()
307                            .as_slice()
308                            .expect("matrix indexing should be valid"),
309                    )
310                })
311                .collect::<Result<Vec<_>>>()?
312        } else {
313            // For non-feature-wise, we'll use global parameters
314            vec![self.fit_feature_params(x.as_slice().expect("slice operation should succeed"))?]
315        };
316
317        self.transformation_params_ = Some(TransformationParameters {
318            feature_params,
319            global_params: None, // Could be used for global transformations
320        });
321
322        Ok(OutlierTransformer {
323            config: self.config,
324            state: PhantomData,
325            transformation_params_: self.transformation_params_,
326            n_features_in_: self.n_features_in_,
327        })
328    }
329}
330
331impl OutlierTransformer<Untrained> {
332    /// Fit parameters for a single feature
333    fn fit_feature_params(&self, data: &[Float]) -> Result<FeatureTransformationParams> {
334        let mut params = FeatureTransformationParams {
335            lambda: None,
336            shift: 0.0,
337            quantiles: None,
338            references: None,
339            median: None,
340            iqr: None,
341            lower_bound: None,
342            upper_bound: None,
343            mean: None,
344            std: None,
345        };
346
347        // Calculate basic statistics
348        let valid_data: Vec<Float> = data.iter().filter(|x| x.is_finite()).copied().collect();
349
350        if valid_data.is_empty() {
351            return Ok(params);
352        }
353
354        let mean = valid_data.iter().sum::<Float>() / valid_data.len() as Float;
355        let variance = valid_data.iter().map(|x| (x - mean).powi(2)).sum::<Float>()
356            / valid_data.len() as Float;
357        let std = variance.sqrt();
358
359        params.mean = Some(mean);
360        params.std = Some(std);
361
362        // Calculate median and IQR for robust methods
363        let mut sorted_data = valid_data.clone();
364        sorted_data.sort_by(|a, b| a.partial_cmp(b).expect("operation should succeed"));
365
366        let median = if sorted_data.len() % 2 == 0 {
367            let mid = sorted_data.len() / 2;
368            (sorted_data[mid - 1] + sorted_data[mid]) / 2.0
369        } else {
370            sorted_data[sorted_data.len() / 2]
371        };
372
373        let q1_idx = sorted_data.len() / 4;
374        let q3_idx = 3 * sorted_data.len() / 4;
375        let q1 = sorted_data[q1_idx];
376        let q3 = sorted_data[q3_idx];
377        let iqr = q3 - q1;
378
379        params.median = Some(median);
380        params.iqr = Some(iqr);
381
382        // Set outlier bounds based on detection method
383        match self.config.detection_method.as_str() {
384            "z-score" => {
385                params.lower_bound = Some(mean - self.config.outlier_threshold * std);
386                params.upper_bound = Some(mean + self.config.outlier_threshold * std);
387            }
388            "iqr" => {
389                params.lower_bound = Some(q1 - self.config.outlier_threshold * iqr);
390                params.upper_bound = Some(q3 + self.config.outlier_threshold * iqr);
391            }
392            "percentile" => {
393                let lower_idx =
394                    ((self.config.lower_percentile / 100.0) * sorted_data.len() as Float) as usize;
395                let upper_idx =
396                    ((self.config.upper_percentile / 100.0) * sorted_data.len() as Float) as usize;
397                params.lower_bound = Some(sorted_data[lower_idx.min(sorted_data.len() - 1)]);
398                params.upper_bound = Some(sorted_data[upper_idx.min(sorted_data.len() - 1)]);
399            }
400            _ => {
401                return Err(SklearsError::InvalidInput(format!(
402                    "Unknown detection method: {}",
403                    self.config.detection_method
404                )));
405            }
406        }
407
408        // Handle negative values for log/sqrt transformations
409        if self.config.handle_negatives {
410            match self.config.method {
411                OutlierTransformationMethod::Log | OutlierTransformationMethod::Sqrt => {
412                    let min_val = sorted_data[0];
413                    if min_val <= 0.0 {
414                        params.shift = -min_val + self.config.log_epsilon;
415                    }
416                }
417                OutlierTransformationMethod::BoxCox
418                | OutlierTransformationMethod::BoxCoxFixed(_) => {
419                    let min_val = sorted_data[0];
420                    if min_val <= 0.0 {
421                        params.shift = -min_val + self.config.log_epsilon;
422                    }
423                }
424                _ => {}
425            }
426        }
427
428        // Fit method-specific parameters
429        match self.config.method {
430            OutlierTransformationMethod::BoxCox => {
431                params.lambda = Some(self.estimate_box_cox_lambda(&valid_data, params.shift)?);
432            }
433            OutlierTransformationMethod::BoxCoxFixed(lambda) => {
434                params.lambda = Some(lambda);
435            }
436            OutlierTransformationMethod::QuantileUniform
437            | OutlierTransformationMethod::QuantileNormal => {
438                params.quantiles = Some(self.compute_quantiles(&sorted_data)?);
439                params.references = Some(self.compute_references()?);
440            }
441            _ => {}
442        }
443
444        Ok(params)
445    }
446
447    /// Estimate optimal lambda for Box-Cox transformation using maximum likelihood
448    fn estimate_box_cox_lambda(&self, data: &[Float], shift: Float) -> Result<Float> {
449        let shifted_data: Vec<Float> = data.iter().map(|x| x + shift).collect();
450
451        // Search for optimal lambda in range [-2, 2]
452        let lambda_range: Vec<Float> = (-20..=20).map(|i| i as Float * 0.1).collect();
453
454        let mut best_lambda = 0.0;
455        let mut best_llf = Float::NEG_INFINITY;
456
457        for &lambda in &lambda_range {
458            if let Ok(llf) = self.box_cox_log_likelihood(&shifted_data, lambda) {
459                if llf > best_llf {
460                    best_llf = llf;
461                    best_lambda = lambda;
462                }
463            }
464        }
465
466        Ok(best_lambda)
467    }
468
469    /// Compute log-likelihood for Box-Cox transformation
470    fn box_cox_log_likelihood(&self, data: &[Float], lambda: Float) -> Result<Float> {
471        let n = data.len() as Float;
472
473        // Transform data
474        let transformed: Vec<Float> = data
475            .iter()
476            .map(|&x| {
477                if x <= 0.0 {
478                    return Float::NAN;
479                }
480                if lambda.abs() < 1e-10 {
481                    x.ln()
482                } else {
483                    (x.powf(lambda) - 1.0) / lambda
484                }
485            })
486            .collect();
487
488        // Check for invalid transformations
489        if transformed.iter().any(|x| !x.is_finite()) {
490            return Err(SklearsError::InvalidInput(
491                "Invalid Box-Cox transformation".to_string(),
492            ));
493        }
494
495        // Compute log-likelihood
496        let mean = transformed.iter().sum::<Float>() / n;
497        let variance = transformed
498            .iter()
499            .map(|x| (x - mean).powi(2))
500            .sum::<Float>()
501            / n;
502
503        let log_jacobian = (lambda - 1.0) * data.iter().map(|x| x.ln()).sum::<Float>();
504        let llf = -0.5 * n * (2.0 * std::f64::consts::PI as Float).ln()
505            - 0.5 * n * variance.ln()
506            - 0.5 * n
507            + log_jacobian;
508
509        Ok(llf)
510    }
511
512    /// Compute quantiles for quantile transformation
513    fn compute_quantiles(&self, sorted_data: &[Float]) -> Result<Array1<Float>> {
514        let n_quantiles = self.config.n_quantiles.min(sorted_data.len());
515        let mut quantiles = Array1::zeros(n_quantiles);
516
517        for i in 0..n_quantiles {
518            let q = i as Float / (n_quantiles - 1) as Float;
519            let idx = (q * (sorted_data.len() - 1) as Float) as usize;
520            quantiles[i] = sorted_data[idx.min(sorted_data.len() - 1)];
521        }
522
523        Ok(quantiles)
524    }
525
526    /// Compute reference values for quantile transformation
527    fn compute_references(&self) -> Result<Array1<Float>> {
528        let n_quantiles = self.config.n_quantiles;
529        let mut references = Array1::zeros(n_quantiles);
530
531        match self.config.method {
532            OutlierTransformationMethod::QuantileUniform => {
533                for i in 0..n_quantiles {
534                    references[i] = i as Float / (n_quantiles - 1) as Float;
535                }
536            }
537            OutlierTransformationMethod::QuantileNormal => {
538                // Approximate normal quantiles
539                for i in 0..n_quantiles {
540                    let p = i as Float / (n_quantiles - 1) as Float;
541                    references[i] = self.inverse_normal_cdf(p);
542                }
543            }
544            _ => {
545                return Err(SklearsError::InvalidInput(
546                    "Invalid quantile method".to_string(),
547                ));
548            }
549        }
550
551        Ok(references)
552    }
553
554    /// Approximate inverse normal CDF using Beasley-Springer-Moro algorithm
555    fn inverse_normal_cdf(&self, p: Float) -> Float {
556        if p <= 0.0 {
557            return Float::NEG_INFINITY;
558        }
559        if p >= 1.0 {
560            return Float::INFINITY;
561        }
562        if p == 0.5 {
563            return 0.0;
564        }
565
566        // Use simple approximation for demonstration
567        // In production, use a more accurate method
568        let a = [
569            -3.969683028665376e+01,
570            2.209460984245205e+02,
571            -2.759285104469687e+02,
572            1.383577518672690e+02,
573            -3.066479806614716e+01,
574            2.506628277459239e+00,
575        ];
576        let b = [
577            -5.447609879822406e+01,
578            1.615858368580409e+02,
579            -1.556989798598866e+02,
580            6.680131188771972e+01,
581            -1.328068155288572e+01,
582        ];
583
584        let q = if p > 0.5 { 1.0 - p } else { p };
585        let t = (-2.0 * q.ln()).sqrt();
586
587        let mut num = a[5];
588        for i in (0..5).rev() {
589            num = num * t + a[i];
590        }
591
592        let mut den = 1.0;
593        for i in (0..5).rev() {
594            den = den * t + b[i];
595        }
596
597        let x = t - num / den;
598        if p > 0.5 {
599            x
600        } else {
601            -x
602        }
603    }
604}
605
606impl Transform<Array2<Float>, Array2<Float>> for OutlierTransformer<Trained> {
607    fn transform(&self, x: &Array2<Float>) -> Result<Array2<Float>> {
608        let (_n_samples, n_features) = x.dim();
609
610        if n_features != self.n_features_in().expect("operation should succeed") {
611            return Err(SklearsError::FeatureMismatch {
612                expected: self.n_features_in().expect("operation should succeed"),
613                actual: n_features,
614            });
615        }
616
617        let params = self
618            .transformation_params_
619            .as_ref()
620            .expect("operation should succeed");
621        let mut result = x.clone();
622
623        if self.config.feature_wise {
624            for j in 0..n_features {
625                let feature_params = &params.feature_params[j];
626                let mut column = result.column_mut(j);
627                self.transform_feature_inplace(&mut column, feature_params)?;
628            }
629        } else {
630            // Global transformation
631            let feature_params = &params.feature_params[0];
632            for mut row in result.axis_iter_mut(Axis(0)) {
633                for elem in row.iter_mut() {
634                    *elem = self.transform_value(*elem, feature_params)?;
635                }
636            }
637        }
638
639        Ok(result)
640    }
641}
642
643impl OutlierTransformer<Trained> {
644    /// Get the number of features seen during fit
645    pub fn n_features_in(&self) -> Option<usize> {
646        self.n_features_in_
647    }
648
649    /// Transform a single feature in-place
650    fn transform_feature_inplace(
651        &self,
652        column: &mut scirs2_core::ndarray::ArrayViewMut1<Float>,
653        params: &FeatureTransformationParams,
654    ) -> Result<()> {
655        for elem in column.iter_mut() {
656            *elem = self.transform_value(*elem, params)?;
657        }
658        Ok(())
659    }
660
661    /// Transform a single value
662    fn transform_value(&self, value: Float, params: &FeatureTransformationParams) -> Result<Float> {
663        if !value.is_finite() {
664            return Ok(value);
665        }
666
667        match self.config.method {
668            OutlierTransformationMethod::Log => {
669                let shifted = value + params.shift;
670                if shifted <= 0.0 {
671                    Ok(Float::NAN)
672                } else {
673                    Ok(shifted.ln())
674                }
675            }
676            OutlierTransformationMethod::Log1p => Ok((value + params.shift).ln_1p()),
677            OutlierTransformationMethod::Sqrt => {
678                let shifted = value + params.shift;
679                if shifted < 0.0 {
680                    Ok(Float::NAN)
681                } else {
682                    Ok(shifted.sqrt())
683                }
684            }
685            OutlierTransformationMethod::BoxCox | OutlierTransformationMethod::BoxCoxFixed(_) => {
686                let lambda = params.lambda.unwrap_or(0.0);
687                let shifted = value + params.shift;
688                if shifted <= 0.0 {
689                    return Ok(Float::NAN);
690                }
691                if lambda.abs() < 1e-10 {
692                    Ok(shifted.ln())
693                } else {
694                    Ok((shifted.powf(lambda) - 1.0) / lambda)
695                }
696            }
697            OutlierTransformationMethod::QuantileUniform
698            | OutlierTransformationMethod::QuantileNormal => {
699                self.quantile_transform_value(value, params)
700            }
701            OutlierTransformationMethod::RobustScale => {
702                let median = params.median.unwrap_or(0.0);
703                let iqr = params.iqr.unwrap_or(1.0);
704                if iqr > 0.0 {
705                    Ok((value - median) / iqr)
706                } else {
707                    Ok(0.0)
708                }
709            }
710            OutlierTransformationMethod::Interpolate => self.interpolate_value(value, params),
711            OutlierTransformationMethod::Smooth => {
712                // For single value, return as-is (smoothing requires neighborhood)
713                Ok(value)
714            }
715            OutlierTransformationMethod::Trim => {
716                let lower = params.lower_bound.unwrap_or(Float::NEG_INFINITY);
717                let upper = params.upper_bound.unwrap_or(Float::INFINITY);
718                Ok(value.max(lower).min(upper))
719            }
720        }
721    }
722
723    /// Transform value using quantile transformation
724    fn quantile_transform_value(
725        &self,
726        value: Float,
727        params: &FeatureTransformationParams,
728    ) -> Result<Float> {
729        let quantiles = params.quantiles.as_ref().expect("operation should succeed");
730        let references = params
731            .references
732            .as_ref()
733            .expect("operation should succeed");
734
735        // Find position in quantiles
736        let mut pos = 0;
737        for (i, &q) in quantiles.iter().enumerate() {
738            if value <= q {
739                pos = i;
740                break;
741            }
742            pos = i + 1;
743        }
744
745        pos = pos.min(references.len() - 1);
746        Ok(references[pos])
747    }
748
749    /// Interpolate outlier value
750    fn interpolate_value(
751        &self,
752        value: Float,
753        params: &FeatureTransformationParams,
754    ) -> Result<Float> {
755        let lower = params.lower_bound.unwrap_or(Float::NEG_INFINITY);
756        let upper = params.upper_bound.unwrap_or(Float::INFINITY);
757
758        if value < lower {
759            Ok(lower)
760        } else if value > upper {
761            Ok(upper)
762        } else {
763            Ok(value)
764        }
765    }
766
767    /// Get transformation parameters
768    pub fn transformation_params(&self) -> Option<&TransformationParameters> {
769        self.transformation_params_.as_ref()
770    }
771
772    /// Get transformation statistics for a specific feature
773    pub fn feature_stats(&self, feature_idx: usize) -> Option<&FeatureTransformationParams> {
774        self.transformation_params_
775            .as_ref()?
776            .feature_params
777            .get(feature_idx)
778    }
779}
780
781impl Default for OutlierTransformer<Untrained> {
782    fn default() -> Self {
783        Self::new()
784    }
785}
786
787#[allow(non_snake_case)]
788#[cfg(test)]
789mod tests {
790    use super::*;
791    use approx::assert_relative_eq;
792    use scirs2_core::ndarray::Array2;
793
794    #[test]
795    fn test_log_transformation() {
796        let data = Array2::from_shape_vec(
797            (5, 2),
798            vec![
799                1.0, 10.0, 2.0, 20.0, 3.0, 30.0, 100.0, 1000.0, // Outliers
800                4.0, 40.0,
801            ],
802        )
803        .expect("operation should succeed");
804
805        let transformer = OutlierTransformer::log();
806        let fitted = transformer
807            .fit(&data, &())
808            .expect("model fitting should succeed");
809        let result = fitted
810            .transform(&data)
811            .expect("transformation should succeed");
812
813        assert_eq!(result.dim(), data.dim());
814
815        // First value should be ln(1) = 0
816        assert_relative_eq!(result[[0, 0]], 1.0_f64.ln(), epsilon = 1e-10);
817
818        // Large outlier should be transformed: ln(100) ≈ 4.6
819        assert_relative_eq!(result[[3, 0]], 100.0_f64.ln(), epsilon = 1e-10);
820    }
821
822    #[test]
823    fn test_log1p_transformation() {
824        let data = Array2::from_shape_vec((4, 1), vec![0.0, 1.0, 10.0, 100.0])
825            .expect("shape and data length should match");
826
827        let transformer = OutlierTransformer::log1p();
828        let fitted = transformer
829            .fit(&data, &())
830            .expect("model fitting should succeed");
831        let result = fitted
832            .transform(&data)
833            .expect("transformation should succeed");
834
835        assert_eq!(result.dim(), data.dim());
836
837        // log1p(0) = ln(1) = 0
838        assert_relative_eq!(result[[0, 0]], 0.0, epsilon = 1e-10);
839
840        // log1p(1) = ln(2)
841        assert_relative_eq!(result[[1, 0]], (2.0_f64).ln(), epsilon = 1e-10);
842    }
843
844    #[test]
845    fn test_sqrt_transformation() {
846        let data = Array2::from_shape_vec((4, 1), vec![1.0, 4.0, 9.0, 100.0])
847            .expect("shape and data length should match");
848
849        let transformer = OutlierTransformer::sqrt();
850        let fitted = transformer
851            .fit(&data, &())
852            .expect("model fitting should succeed");
853        let result = fitted
854            .transform(&data)
855            .expect("transformation should succeed");
856
857        assert_eq!(result.dim(), data.dim());
858
859        // sqrt(1) = 1
860        assert_relative_eq!(result[[0, 0]], 1.0, epsilon = 1e-10);
861
862        // sqrt(100) = 10
863        assert_relative_eq!(result[[3, 0]], 10.0, epsilon = 1e-10);
864    }
865
866    #[test]
867    fn test_robust_scale_transformation() {
868        let data = Array2::from_shape_vec(
869            (7, 1),
870            vec![
871                1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 100.0, // 100 is outlier
872            ],
873        )
874        .expect("operation should succeed");
875
876        let transformer = OutlierTransformer::robust_scale();
877        let fitted = transformer
878            .fit(&data, &())
879            .expect("model fitting should succeed");
880        let result = fitted
881            .transform(&data)
882            .expect("transformation should succeed");
883
884        assert_eq!(result.dim(), data.dim());
885
886        // Should be scaled by median and IQR, making it robust to outliers
887        let params = fitted.feature_stats(0).expect("operation should succeed");
888        assert!(params.median.is_some());
889        assert!(params.iqr.is_some());
890    }
891
892    #[test]
893    fn test_interpolate_transformation() {
894        let data = Array2::from_shape_vec((5, 1), vec![1.0, 2.0, 3.0, 4.0, 100.0])
895            .expect("shape and data length should match");
896
897        let transformer = OutlierTransformer::interpolate(2.0, "z-score");
898        let fitted = transformer
899            .fit(&data, &())
900            .expect("model fitting should succeed");
901        let result = fitted
902            .transform(&data)
903            .expect("transformation should succeed");
904
905        assert_eq!(result.dim(), data.dim());
906
907        // Normal values should remain unchanged
908        assert_relative_eq!(result[[0, 0]], 1.0, epsilon = 1e-10);
909        assert_relative_eq!(result[[1, 0]], 2.0, epsilon = 1e-10);
910
911        // Outlier should be capped
912        let params = fitted.feature_stats(0).expect("operation should succeed");
913        assert!(params.upper_bound.is_some());
914    }
915
916    #[test]
917    fn test_trim_transformation() {
918        let data = Array2::from_shape_vec(
919            (11, 1),
920            (1..=10)
921                .map(|x| x as f64)
922                .chain(std::iter::once(1000.0))
923                .collect(),
924        )
925        .expect("operation should succeed");
926
927        let transformer = OutlierTransformer::trim(10.0, 90.0);
928        let fitted = transformer
929            .fit(&data, &())
930            .expect("model fitting should succeed");
931        let result = fitted
932            .transform(&data)
933            .expect("transformation should succeed");
934
935        assert_eq!(result.dim(), data.dim());
936
937        // Values should be trimmed to percentile bounds
938        let params = fitted.feature_stats(0).expect("operation should succeed");
939        assert!(params.lower_bound.is_some());
940        assert!(params.upper_bound.is_some());
941    }
942
943    #[test]
944    fn test_box_cox_transformation() {
945        let data = Array2::from_shape_vec((6, 1), vec![1.0, 2.0, 3.0, 4.0, 5.0, 100.0])
946            .expect("shape and data length should match");
947
948        let transformer = OutlierTransformer::box_cox_fixed(0.5);
949        let fitted = transformer
950            .fit(&data, &())
951            .expect("model fitting should succeed");
952        let result = fitted
953            .transform(&data)
954            .expect("transformation should succeed");
955
956        assert_eq!(result.dim(), data.dim());
957
958        let params = fitted.feature_stats(0).expect("operation should succeed");
959        assert!(params.lambda.is_some());
960        assert_relative_eq!(
961            params.lambda.expect("operation should succeed"),
962            0.5,
963            epsilon = 1e-10
964        );
965    }
966
967    #[test]
968    fn test_handle_negative_values() {
969        let data = Array2::from_shape_vec((4, 1), vec![-2.0, -1.0, 1.0, 100.0])
970            .expect("shape and data length should match");
971
972        let transformer = OutlierTransformer::log().handle_negatives(true);
973        let fitted = transformer
974            .fit(&data, &())
975            .expect("model fitting should succeed");
976        let result = fitted
977            .transform(&data)
978            .expect("transformation should succeed");
979
980        assert_eq!(result.dim(), data.dim());
981
982        // Should have applied shift to handle negatives
983        let params = fitted.feature_stats(0).expect("operation should succeed");
984        assert!(params.shift > 0.0);
985    }
986
987    #[test]
988    fn test_feature_wise_vs_global() {
989        let data =
990            Array2::from_shape_vec((4, 2), vec![1.0, 10.0, 2.0, 20.0, 3.0, 30.0, 100.0, 1000.0])
991                .expect("operation should succeed");
992
993        // Feature-wise transformation
994        let transformer_fw = OutlierTransformer::log().feature_wise(true);
995        let fitted_fw = transformer_fw
996            .fit(&data, &())
997            .expect("model fitting should succeed");
998        let result_fw = fitted_fw
999            .transform(&data)
1000            .expect("transformation should succeed");
1001
1002        // Global transformation
1003        let transformer_global = OutlierTransformer::log().feature_wise(false);
1004        let fitted_global = transformer_global
1005            .fit(&data, &())
1006            .expect("model fitting should succeed");
1007        let result_global = fitted_global
1008            .transform(&data)
1009            .expect("transformation should succeed");
1010
1011        assert_eq!(result_fw.dim(), data.dim());
1012        assert_eq!(result_global.dim(), data.dim());
1013
1014        // Results should be different for feature-wise vs global
1015        // (This is a basic check - specific values depend on implementation)
1016    }
1017
1018    #[test]
1019    fn test_transformation_error_handling() {
1020        let data = Array2::from_shape_vec((2, 2), vec![1.0, 2.0, 3.0, 4.0])
1021            .expect("shape and data length should match");
1022        let transformer = OutlierTransformer::log();
1023        let fitted = transformer
1024            .fit(&data, &())
1025            .expect("model fitting should succeed");
1026
1027        // Test dimension mismatch
1028        let wrong_data = Array2::from_shape_vec((2, 3), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
1029            .expect("shape and data length should match");
1030        assert!(fitted.transform(&wrong_data).is_err());
1031    }
1032
1033    #[test]
1034    fn test_detection_methods() {
1035        let data = Array2::from_shape_vec((7, 1), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 100.0])
1036            .expect("shape and data length should match");
1037
1038        // Test different detection methods
1039        let methods = vec!["z-score", "iqr", "percentile"];
1040
1041        for method in methods {
1042            let transformer = OutlierTransformer::interpolate(2.0, method);
1043            let fitted = transformer
1044                .fit(&data, &())
1045                .expect("model fitting should succeed");
1046            let result = fitted
1047                .transform(&data)
1048                .expect("transformation should succeed");
1049
1050            assert_eq!(result.dim(), data.dim());
1051
1052            let params = fitted.feature_stats(0).expect("operation should succeed");
1053            assert!(params.lower_bound.is_some());
1054            assert!(params.upper_bound.is_some());
1055        }
1056    }
1057}
sklears_preprocessing/outlier_transformation.rs

sklears_preprocessing/
outlier_transformation.rs