sklears_preprocessing/
outlier_transformation.rs

1//! Outlier transformation methods for handling extreme values
2//!
3//! This module provides various transformation methods specifically designed to handle
4//! outliers in data while preserving the overall structure and relationships. Unlike
5//! outlier detection which identifies outliers, these methods transform them to reduce
6//! their impact on downstream analysis.
7//!
8//! # Features
9//!
10//! - **Log Transformation**: Reduces impact of large outliers through logarithmic scaling
11//! - **Square Root Transformation**: Mild transformation for positive outliers
12//! - **Box-Cox Transformation**: Data-driven power transformation for normalization
13//! - **Quantile Transformation**: Maps to uniform or normal distribution
14//! - **Robust Scaling**: Scaling resistant to outliers using median and IQR
15//! - **Outlier Interpolation**: Replace outliers with interpolated values
16//! - **Outlier Smoothing**: Smooth outliers using neighboring values
17//! - **Trimmed Transformation**: Apply transformations after trimming extreme percentiles
18
19use scirs2_core::ndarray::{Array1, Array2, Axis};
20use sklears_core::{
21    error::{Result, SklearsError},
22    traits::{Fit, Trained, Transform, Untrained},
23    types::Float,
24};
25use std::marker::PhantomData;
26
27/// Available outlier transformation methods
28#[derive(Debug, Clone, Copy)]
29pub enum OutlierTransformationMethod {
30    /// Natural logarithm transformation (for positive values)
31    Log,
32    /// Log1p transformation (log(1 + x), handles zeros)
33    Log1p,
34    /// Square root transformation (for positive values)
35    Sqrt,
36    /// Box-Cox transformation with automatic lambda estimation
37    BoxCox,
38    /// Box-Cox transformation with fixed lambda
39    BoxCoxFixed(Float),
40    /// Quantile transformation to uniform distribution
41    QuantileUniform,
42    /// Quantile transformation to normal distribution
43    QuantileNormal,
44    /// Robust scaling using median and IQR
45    RobustScale,
46    /// Replace outliers with interpolated values
47    Interpolate,
48    /// Smooth outliers using local neighborhood
49    Smooth,
50    /// Trim extreme percentiles before transformation
51    Trim,
52}
53
54/// Configuration for outlier transformation
55#[derive(Debug, Clone)]
56pub struct OutlierTransformationConfig {
57    /// Transformation method to apply
58    pub method: OutlierTransformationMethod,
59    /// Threshold for outlier detection (used with interpolation/smoothing)
60    pub outlier_threshold: Float,
61    /// Method for outlier detection (z-score, iqr, percentile)
62    pub detection_method: String,
63    /// Lower percentile for trimming (default: 1.0)
64    pub lower_percentile: Float,
65    /// Upper percentile for trimming (default: 99.0)
66    pub upper_percentile: Float,
67    /// Window size for smoothing (default: 5)
68    pub smoothing_window: usize,
69    /// Number of quantiles for quantile transformation (default: 1000)
70    pub n_quantiles: usize,
71    /// Whether to handle negative values by shifting (default: true)
72    pub handle_negatives: bool,
73    /// Small constant to add before log transformation to avoid zeros
74    pub log_epsilon: Float,
75    /// Whether to apply transformation feature-wise (default: true)
76    pub feature_wise: bool,
77}
78
79impl Default for OutlierTransformationConfig {
80    fn default() -> Self {
81        Self {
82            method: OutlierTransformationMethod::Log1p,
83            outlier_threshold: 3.0,
84            detection_method: "z-score".to_string(),
85            lower_percentile: 1.0,
86            upper_percentile: 99.0,
87            smoothing_window: 5,
88            n_quantiles: 1000,
89            handle_negatives: true,
90            log_epsilon: 1e-8,
91            feature_wise: true,
92        }
93    }
94}
95
96/// Outlier transformer for handling extreme values through transformation
97#[derive(Debug, Clone)]
98pub struct OutlierTransformer<State = Untrained> {
99    config: OutlierTransformationConfig,
100    state: PhantomData<State>,
101    // Fitted parameters
102    transformation_params_: Option<TransformationParameters>,
103    n_features_in_: Option<usize>,
104}
105
106/// Parameters learned during fitting for transformations
107#[derive(Debug, Clone)]
108pub struct TransformationParameters {
109    /// Feature-wise transformation parameters
110    pub feature_params: Vec<FeatureTransformationParams>,
111    /// Global parameters (for non-feature-wise transformations)
112    pub global_params: Option<GlobalTransformationParams>,
113}
114
115/// Transformation parameters for a single feature
116#[derive(Debug, Clone)]
117pub struct FeatureTransformationParams {
118    /// Box-Cox lambda parameter
119    pub lambda: Option<Float>,
120    /// Shift applied to handle negative values
121    pub shift: Float,
122    /// Quantiles for quantile transformation
123    pub quantiles: Option<Array1<Float>>,
124    /// References values for quantile transformation
125    pub references: Option<Array1<Float>>,
126    /// Robust scaling parameters (median, IQR)
127    pub median: Option<Float>,
128    pub iqr: Option<Float>,
129    /// Outlier bounds for interpolation/smoothing
130    pub lower_bound: Option<Float>,
131    pub upper_bound: Option<Float>,
132    /// Statistics for outlier detection
133    pub mean: Option<Float>,
134    pub std: Option<Float>,
135}
136
137/// Global transformation parameters
138#[derive(Debug, Clone)]
139pub struct GlobalTransformationParams {
140    /// Global shift for handling negatives
141    pub global_shift: Float,
142    /// Global lambda for Box-Cox
143    pub global_lambda: Option<Float>,
144}
145
146impl OutlierTransformer<Untrained> {
147    /// Create a new OutlierTransformer with default configuration
148    pub fn new() -> Self {
149        Self {
150            config: OutlierTransformationConfig::default(),
151            state: PhantomData,
152            transformation_params_: None,
153            n_features_in_: None,
154        }
155    }
156
157    /// Create a log transformation for outliers
158    pub fn log() -> Self {
159        Self::new().method(OutlierTransformationMethod::Log)
160    }
161
162    /// Create a log1p transformation for outliers
163    pub fn log1p() -> Self {
164        Self::new().method(OutlierTransformationMethod::Log1p)
165    }
166
167    /// Create a square root transformation for outliers
168    pub fn sqrt() -> Self {
169        Self::new().method(OutlierTransformationMethod::Sqrt)
170    }
171
172    /// Create a Box-Cox transformation with automatic lambda
173    pub fn box_cox() -> Self {
174        Self::new().method(OutlierTransformationMethod::BoxCox)
175    }
176
177    /// Create a Box-Cox transformation with fixed lambda
178    pub fn box_cox_fixed(lambda: Float) -> Self {
179        Self::new().method(OutlierTransformationMethod::BoxCoxFixed(lambda))
180    }
181
182    /// Create a quantile transformation to uniform distribution
183    pub fn quantile_uniform(n_quantiles: usize) -> Self {
184        Self::new()
185            .method(OutlierTransformationMethod::QuantileUniform)
186            .n_quantiles(n_quantiles)
187    }
188
189    /// Create a quantile transformation to normal distribution
190    pub fn quantile_normal(n_quantiles: usize) -> Self {
191        Self::new()
192            .method(OutlierTransformationMethod::QuantileNormal)
193            .n_quantiles(n_quantiles)
194    }
195
196    /// Create a robust scaling transformation
197    pub fn robust_scale() -> Self {
198        Self::new().method(OutlierTransformationMethod::RobustScale)
199    }
200
201    /// Create an interpolation transformation
202    pub fn interpolate(threshold: Float, detection_method: &str) -> Self {
203        Self::new()
204            .method(OutlierTransformationMethod::Interpolate)
205            .outlier_threshold(threshold)
206            .detection_method(detection_method.to_string())
207    }
208
209    /// Create a smoothing transformation
210    pub fn smooth(window_size: usize, threshold: Float) -> Self {
211        Self::new()
212            .method(OutlierTransformationMethod::Smooth)
213            .smoothing_window(window_size)
214            .outlier_threshold(threshold)
215    }
216
217    /// Create a trimming transformation
218    pub fn trim(lower_percentile: Float, upper_percentile: Float) -> Self {
219        Self::new()
220            .method(OutlierTransformationMethod::Trim)
221            .lower_percentile(lower_percentile)
222            .upper_percentile(upper_percentile)
223    }
224
225    /// Set the transformation method
226    pub fn method(mut self, method: OutlierTransformationMethod) -> Self {
227        self.config.method = method;
228        self
229    }
230
231    /// Set the outlier detection threshold
232    pub fn outlier_threshold(mut self, threshold: Float) -> Self {
233        self.config.outlier_threshold = threshold;
234        self
235    }
236
237    /// Set the outlier detection method
238    pub fn detection_method(mut self, method: String) -> Self {
239        self.config.detection_method = method;
240        self
241    }
242
243    /// Set the lower percentile for trimming
244    pub fn lower_percentile(mut self, percentile: Float) -> Self {
245        self.config.lower_percentile = percentile;
246        self
247    }
248
249    /// Set the upper percentile for trimming
250    pub fn upper_percentile(mut self, percentile: Float) -> Self {
251        self.config.upper_percentile = percentile;
252        self
253    }
254
255    /// Set the smoothing window size
256    pub fn smoothing_window(mut self, window: usize) -> Self {
257        self.config.smoothing_window = window;
258        self
259    }
260
261    /// Set the number of quantiles
262    pub fn n_quantiles(mut self, n_quantiles: usize) -> Self {
263        self.config.n_quantiles = n_quantiles;
264        self
265    }
266
267    /// Set whether to handle negative values
268    pub fn handle_negatives(mut self, handle: bool) -> Self {
269        self.config.handle_negatives = handle;
270        self
271    }
272
273    /// Set the epsilon for log transformations
274    pub fn log_epsilon(mut self, epsilon: Float) -> Self {
275        self.config.log_epsilon = epsilon;
276        self
277    }
278
279    /// Set whether to apply transformations feature-wise
280    pub fn feature_wise(mut self, feature_wise: bool) -> Self {
281        self.config.feature_wise = feature_wise;
282        self
283    }
284}
285
286impl Fit<Array2<Float>, ()> for OutlierTransformer<Untrained> {
287    type Fitted = OutlierTransformer<Trained>;
288
289    fn fit(mut self, x: &Array2<Float>, _y: &()) -> Result<Self::Fitted> {
290        let (n_samples, n_features) = x.dim();
291
292        if n_samples == 0 || n_features == 0 {
293            return Err(SklearsError::InvalidInput(
294                "Input array is empty".to_string(),
295            ));
296        }
297
298        self.n_features_in_ = Some(n_features);
299
300        // Compute transformation parameters based on method
301        let feature_params = if self.config.feature_wise {
302            (0..n_features)
303                .map(|j| self.fit_feature_params(x.column(j).to_owned().as_slice().unwrap()))
304                .collect::<Result<Vec<_>>>()?
305        } else {
306            // For non-feature-wise, we'll use global parameters
307            vec![self.fit_feature_params(x.as_slice().unwrap())?]
308        };
309
310        self.transformation_params_ = Some(TransformationParameters {
311            feature_params,
312            global_params: None, // Could be used for global transformations
313        });
314
315        Ok(OutlierTransformer {
316            config: self.config,
317            state: PhantomData,
318            transformation_params_: self.transformation_params_,
319            n_features_in_: self.n_features_in_,
320        })
321    }
322}
323
324impl OutlierTransformer<Untrained> {
325    /// Fit parameters for a single feature
326    fn fit_feature_params(&self, data: &[Float]) -> Result<FeatureTransformationParams> {
327        let mut params = FeatureTransformationParams {
328            lambda: None,
329            shift: 0.0,
330            quantiles: None,
331            references: None,
332            median: None,
333            iqr: None,
334            lower_bound: None,
335            upper_bound: None,
336            mean: None,
337            std: None,
338        };
339
340        // Calculate basic statistics
341        let valid_data: Vec<Float> = data.iter().filter(|x| x.is_finite()).copied().collect();
342
343        if valid_data.is_empty() {
344            return Ok(params);
345        }
346
347        let mean = valid_data.iter().sum::<Float>() / valid_data.len() as Float;
348        let variance = valid_data.iter().map(|x| (x - mean).powi(2)).sum::<Float>()
349            / valid_data.len() as Float;
350        let std = variance.sqrt();
351
352        params.mean = Some(mean);
353        params.std = Some(std);
354
355        // Calculate median and IQR for robust methods
356        let mut sorted_data = valid_data.clone();
357        sorted_data.sort_by(|a, b| a.partial_cmp(b).unwrap());
358
359        let median = if sorted_data.len() % 2 == 0 {
360            let mid = sorted_data.len() / 2;
361            (sorted_data[mid - 1] + sorted_data[mid]) / 2.0
362        } else {
363            sorted_data[sorted_data.len() / 2]
364        };
365
366        let q1_idx = sorted_data.len() / 4;
367        let q3_idx = 3 * sorted_data.len() / 4;
368        let q1 = sorted_data[q1_idx];
369        let q3 = sorted_data[q3_idx];
370        let iqr = q3 - q1;
371
372        params.median = Some(median);
373        params.iqr = Some(iqr);
374
375        // Set outlier bounds based on detection method
376        match self.config.detection_method.as_str() {
377            "z-score" => {
378                params.lower_bound = Some(mean - self.config.outlier_threshold * std);
379                params.upper_bound = Some(mean + self.config.outlier_threshold * std);
380            }
381            "iqr" => {
382                params.lower_bound = Some(q1 - self.config.outlier_threshold * iqr);
383                params.upper_bound = Some(q3 + self.config.outlier_threshold * iqr);
384            }
385            "percentile" => {
386                let lower_idx =
387                    ((self.config.lower_percentile / 100.0) * sorted_data.len() as Float) as usize;
388                let upper_idx =
389                    ((self.config.upper_percentile / 100.0) * sorted_data.len() as Float) as usize;
390                params.lower_bound = Some(sorted_data[lower_idx.min(sorted_data.len() - 1)]);
391                params.upper_bound = Some(sorted_data[upper_idx.min(sorted_data.len() - 1)]);
392            }
393            _ => {
394                return Err(SklearsError::InvalidInput(format!(
395                    "Unknown detection method: {}",
396                    self.config.detection_method
397                )));
398            }
399        }
400
401        // Handle negative values for log/sqrt transformations
402        if self.config.handle_negatives {
403            match self.config.method {
404                OutlierTransformationMethod::Log | OutlierTransformationMethod::Sqrt => {
405                    let min_val = sorted_data[0];
406                    if min_val <= 0.0 {
407                        params.shift = -min_val + self.config.log_epsilon;
408                    }
409                }
410                OutlierTransformationMethod::BoxCox
411                | OutlierTransformationMethod::BoxCoxFixed(_) => {
412                    let min_val = sorted_data[0];
413                    if min_val <= 0.0 {
414                        params.shift = -min_val + self.config.log_epsilon;
415                    }
416                }
417                _ => {}
418            }
419        }
420
421        // Fit method-specific parameters
422        match self.config.method {
423            OutlierTransformationMethod::BoxCox => {
424                params.lambda = Some(self.estimate_box_cox_lambda(&valid_data, params.shift)?);
425            }
426            OutlierTransformationMethod::BoxCoxFixed(lambda) => {
427                params.lambda = Some(lambda);
428            }
429            OutlierTransformationMethod::QuantileUniform
430            | OutlierTransformationMethod::QuantileNormal => {
431                params.quantiles = Some(self.compute_quantiles(&sorted_data)?);
432                params.references = Some(self.compute_references()?);
433            }
434            _ => {}
435        }
436
437        Ok(params)
438    }
439
440    /// Estimate optimal lambda for Box-Cox transformation using maximum likelihood
441    fn estimate_box_cox_lambda(&self, data: &[Float], shift: Float) -> Result<Float> {
442        let shifted_data: Vec<Float> = data.iter().map(|x| x + shift).collect();
443
444        // Search for optimal lambda in range [-2, 2]
445        let lambda_range: Vec<Float> = (-20..=20).map(|i| i as Float * 0.1).collect();
446
447        let mut best_lambda = 0.0;
448        let mut best_llf = Float::NEG_INFINITY;
449
450        for &lambda in &lambda_range {
451            if let Ok(llf) = self.box_cox_log_likelihood(&shifted_data, lambda) {
452                if llf > best_llf {
453                    best_llf = llf;
454                    best_lambda = lambda;
455                }
456            }
457        }
458
459        Ok(best_lambda)
460    }
461
462    /// Compute log-likelihood for Box-Cox transformation
463    fn box_cox_log_likelihood(&self, data: &[Float], lambda: Float) -> Result<Float> {
464        let n = data.len() as Float;
465
466        // Transform data
467        let transformed: Vec<Float> = data
468            .iter()
469            .map(|&x| {
470                if x <= 0.0 {
471                    return Float::NAN;
472                }
473                if lambda.abs() < 1e-10 {
474                    x.ln()
475                } else {
476                    (x.powf(lambda) - 1.0) / lambda
477                }
478            })
479            .collect();
480
481        // Check for invalid transformations
482        if transformed.iter().any(|x| !x.is_finite()) {
483            return Err(SklearsError::InvalidInput(
484                "Invalid Box-Cox transformation".to_string(),
485            ));
486        }
487
488        // Compute log-likelihood
489        let mean = transformed.iter().sum::<Float>() / n;
490        let variance = transformed
491            .iter()
492            .map(|x| (x - mean).powi(2))
493            .sum::<Float>()
494            / n;
495
496        let log_jacobian = (lambda - 1.0) * data.iter().map(|x| x.ln()).sum::<Float>();
497        let llf = -0.5 * n * (2.0 * std::f64::consts::PI as Float).ln()
498            - 0.5 * n * variance.ln()
499            - 0.5 * n
500            + log_jacobian;
501
502        Ok(llf)
503    }
504
505    /// Compute quantiles for quantile transformation
506    fn compute_quantiles(&self, sorted_data: &[Float]) -> Result<Array1<Float>> {
507        let n_quantiles = self.config.n_quantiles.min(sorted_data.len());
508        let mut quantiles = Array1::zeros(n_quantiles);
509
510        for i in 0..n_quantiles {
511            let q = i as Float / (n_quantiles - 1) as Float;
512            let idx = (q * (sorted_data.len() - 1) as Float) as usize;
513            quantiles[i] = sorted_data[idx.min(sorted_data.len() - 1)];
514        }
515
516        Ok(quantiles)
517    }
518
519    /// Compute reference values for quantile transformation
520    fn compute_references(&self) -> Result<Array1<Float>> {
521        let n_quantiles = self.config.n_quantiles;
522        let mut references = Array1::zeros(n_quantiles);
523
524        match self.config.method {
525            OutlierTransformationMethod::QuantileUniform => {
526                for i in 0..n_quantiles {
527                    references[i] = i as Float / (n_quantiles - 1) as Float;
528                }
529            }
530            OutlierTransformationMethod::QuantileNormal => {
531                // Approximate normal quantiles
532                for i in 0..n_quantiles {
533                    let p = i as Float / (n_quantiles - 1) as Float;
534                    references[i] = self.inverse_normal_cdf(p);
535                }
536            }
537            _ => {
538                return Err(SklearsError::InvalidInput(
539                    "Invalid quantile method".to_string(),
540                ));
541            }
542        }
543
544        Ok(references)
545    }
546
547    /// Approximate inverse normal CDF using Beasley-Springer-Moro algorithm
548    fn inverse_normal_cdf(&self, p: Float) -> Float {
549        if p <= 0.0 {
550            return Float::NEG_INFINITY;
551        }
552        if p >= 1.0 {
553            return Float::INFINITY;
554        }
555        if p == 0.5 {
556            return 0.0;
557        }
558
559        // Use simple approximation for demonstration
560        // In production, use a more accurate method
561        let a = [
562            -3.969683028665376e+01,
563            2.209460984245205e+02,
564            -2.759285104469687e+02,
565            1.383577518672690e+02,
566            -3.066479806614716e+01,
567            2.506628277459239e+00,
568        ];
569        let b = [
570            -5.447609879822406e+01,
571            1.615858368580409e+02,
572            -1.556989798598866e+02,
573            6.680131188771972e+01,
574            -1.328068155288572e+01,
575        ];
576
577        let q = if p > 0.5 { 1.0 - p } else { p };
578        let t = (-2.0 * q.ln()).sqrt();
579
580        let mut num = a[5];
581        for i in (0..5).rev() {
582            num = num * t + a[i];
583        }
584
585        let mut den = 1.0;
586        for i in (0..5).rev() {
587            den = den * t + b[i];
588        }
589
590        let x = t - num / den;
591        if p > 0.5 {
592            x
593        } else {
594            -x
595        }
596    }
597}
598
599impl Transform<Array2<Float>, Array2<Float>> for OutlierTransformer<Trained> {
600    fn transform(&self, x: &Array2<Float>) -> Result<Array2<Float>> {
601        let (_n_samples, n_features) = x.dim();
602
603        if n_features != self.n_features_in().unwrap() {
604            return Err(SklearsError::FeatureMismatch {
605                expected: self.n_features_in().unwrap(),
606                actual: n_features,
607            });
608        }
609
610        let params = self.transformation_params_.as_ref().unwrap();
611        let mut result = x.clone();
612
613        if self.config.feature_wise {
614            for j in 0..n_features {
615                let feature_params = &params.feature_params[j];
616                let mut column = result.column_mut(j);
617                self.transform_feature_inplace(&mut column, feature_params)?;
618            }
619        } else {
620            // Global transformation
621            let feature_params = &params.feature_params[0];
622            for mut row in result.axis_iter_mut(Axis(0)) {
623                for elem in row.iter_mut() {
624                    *elem = self.transform_value(*elem, feature_params)?;
625                }
626            }
627        }
628
629        Ok(result)
630    }
631}
632
633impl OutlierTransformer<Trained> {
634    /// Get the number of features seen during fit
635    pub fn n_features_in(&self) -> Option<usize> {
636        self.n_features_in_
637    }
638
639    /// Transform a single feature in-place
640    fn transform_feature_inplace(
641        &self,
642        column: &mut scirs2_core::ndarray::ArrayViewMut1<Float>,
643        params: &FeatureTransformationParams,
644    ) -> Result<()> {
645        for elem in column.iter_mut() {
646            *elem = self.transform_value(*elem, params)?;
647        }
648        Ok(())
649    }
650
651    /// Transform a single value
652    fn transform_value(&self, value: Float, params: &FeatureTransformationParams) -> Result<Float> {
653        if !value.is_finite() {
654            return Ok(value);
655        }
656
657        match self.config.method {
658            OutlierTransformationMethod::Log => {
659                let shifted = value + params.shift;
660                if shifted <= 0.0 {
661                    Ok(Float::NAN)
662                } else {
663                    Ok(shifted.ln())
664                }
665            }
666            OutlierTransformationMethod::Log1p => Ok((value + params.shift).ln_1p()),
667            OutlierTransformationMethod::Sqrt => {
668                let shifted = value + params.shift;
669                if shifted < 0.0 {
670                    Ok(Float::NAN)
671                } else {
672                    Ok(shifted.sqrt())
673                }
674            }
675            OutlierTransformationMethod::BoxCox | OutlierTransformationMethod::BoxCoxFixed(_) => {
676                let lambda = params.lambda.unwrap_or(0.0);
677                let shifted = value + params.shift;
678                if shifted <= 0.0 {
679                    return Ok(Float::NAN);
680                }
681                if lambda.abs() < 1e-10 {
682                    Ok(shifted.ln())
683                } else {
684                    Ok((shifted.powf(lambda) - 1.0) / lambda)
685                }
686            }
687            OutlierTransformationMethod::QuantileUniform
688            | OutlierTransformationMethod::QuantileNormal => {
689                self.quantile_transform_value(value, params)
690            }
691            OutlierTransformationMethod::RobustScale => {
692                let median = params.median.unwrap_or(0.0);
693                let iqr = params.iqr.unwrap_or(1.0);
694                if iqr > 0.0 {
695                    Ok((value - median) / iqr)
696                } else {
697                    Ok(0.0)
698                }
699            }
700            OutlierTransformationMethod::Interpolate => self.interpolate_value(value, params),
701            OutlierTransformationMethod::Smooth => {
702                // For single value, return as-is (smoothing requires neighborhood)
703                Ok(value)
704            }
705            OutlierTransformationMethod::Trim => {
706                let lower = params.lower_bound.unwrap_or(Float::NEG_INFINITY);
707                let upper = params.upper_bound.unwrap_or(Float::INFINITY);
708                Ok(value.max(lower).min(upper))
709            }
710        }
711    }
712
713    /// Transform value using quantile transformation
714    fn quantile_transform_value(
715        &self,
716        value: Float,
717        params: &FeatureTransformationParams,
718    ) -> Result<Float> {
719        let quantiles = params.quantiles.as_ref().unwrap();
720        let references = params.references.as_ref().unwrap();
721
722        // Find position in quantiles
723        let mut pos = 0;
724        for (i, &q) in quantiles.iter().enumerate() {
725            if value <= q {
726                pos = i;
727                break;
728            }
729            pos = i + 1;
730        }
731
732        pos = pos.min(references.len() - 1);
733        Ok(references[pos])
734    }
735
736    /// Interpolate outlier value
737    fn interpolate_value(
738        &self,
739        value: Float,
740        params: &FeatureTransformationParams,
741    ) -> Result<Float> {
742        let lower = params.lower_bound.unwrap_or(Float::NEG_INFINITY);
743        let upper = params.upper_bound.unwrap_or(Float::INFINITY);
744
745        if value < lower {
746            Ok(lower)
747        } else if value > upper {
748            Ok(upper)
749        } else {
750            Ok(value)
751        }
752    }
753
754    /// Get transformation parameters
755    pub fn transformation_params(&self) -> Option<&TransformationParameters> {
756        self.transformation_params_.as_ref()
757    }
758
759    /// Get transformation statistics for a specific feature
760    pub fn feature_stats(&self, feature_idx: usize) -> Option<&FeatureTransformationParams> {
761        self.transformation_params_
762            .as_ref()?
763            .feature_params
764            .get(feature_idx)
765    }
766}
767
768impl Default for OutlierTransformer<Untrained> {
769    fn default() -> Self {
770        Self::new()
771    }
772}
773
774#[allow(non_snake_case)]
775#[cfg(test)]
776mod tests {
777    use super::*;
778    use approx::assert_relative_eq;
779    use scirs2_core::ndarray::Array2;
780
781    #[test]
782    fn test_log_transformation() {
783        let data = Array2::from_shape_vec(
784            (5, 2),
785            vec![
786                1.0, 10.0, 2.0, 20.0, 3.0, 30.0, 100.0, 1000.0, // Outliers
787                4.0, 40.0,
788            ],
789        )
790        .unwrap();
791
792        let transformer = OutlierTransformer::log();
793        let fitted = transformer.fit(&data, &()).unwrap();
794        let result = fitted.transform(&data).unwrap();
795
796        assert_eq!(result.dim(), data.dim());
797
798        // First value should be ln(1) = 0
799        assert_relative_eq!(result[[0, 0]], 1.0_f64.ln(), epsilon = 1e-10);
800
801        // Large outlier should be transformed: ln(100) ≈ 4.6
802        assert_relative_eq!(result[[3, 0]], 100.0_f64.ln(), epsilon = 1e-10);
803    }
804
805    #[test]
806    fn test_log1p_transformation() {
807        let data = Array2::from_shape_vec((4, 1), vec![0.0, 1.0, 10.0, 100.0]).unwrap();
808
809        let transformer = OutlierTransformer::log1p();
810        let fitted = transformer.fit(&data, &()).unwrap();
811        let result = fitted.transform(&data).unwrap();
812
813        assert_eq!(result.dim(), data.dim());
814
815        // log1p(0) = ln(1) = 0
816        assert_relative_eq!(result[[0, 0]], 0.0, epsilon = 1e-10);
817
818        // log1p(1) = ln(2)
819        assert_relative_eq!(result[[1, 0]], (2.0_f64).ln(), epsilon = 1e-10);
820    }
821
822    #[test]
823    fn test_sqrt_transformation() {
824        let data = Array2::from_shape_vec((4, 1), vec![1.0, 4.0, 9.0, 100.0]).unwrap();
825
826        let transformer = OutlierTransformer::sqrt();
827        let fitted = transformer.fit(&data, &()).unwrap();
828        let result = fitted.transform(&data).unwrap();
829
830        assert_eq!(result.dim(), data.dim());
831
832        // sqrt(1) = 1
833        assert_relative_eq!(result[[0, 0]], 1.0, epsilon = 1e-10);
834
835        // sqrt(100) = 10
836        assert_relative_eq!(result[[3, 0]], 10.0, epsilon = 1e-10);
837    }
838
839    #[test]
840    fn test_robust_scale_transformation() {
841        let data = Array2::from_shape_vec(
842            (7, 1),
843            vec![
844                1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 100.0, // 100 is outlier
845            ],
846        )
847        .unwrap();
848
849        let transformer = OutlierTransformer::robust_scale();
850        let fitted = transformer.fit(&data, &()).unwrap();
851        let result = fitted.transform(&data).unwrap();
852
853        assert_eq!(result.dim(), data.dim());
854
855        // Should be scaled by median and IQR, making it robust to outliers
856        let params = fitted.feature_stats(0).unwrap();
857        assert!(params.median.is_some());
858        assert!(params.iqr.is_some());
859    }
860
861    #[test]
862    fn test_interpolate_transformation() {
863        let data = Array2::from_shape_vec((5, 1), vec![1.0, 2.0, 3.0, 4.0, 100.0]).unwrap();
864
865        let transformer = OutlierTransformer::interpolate(2.0, "z-score");
866        let fitted = transformer.fit(&data, &()).unwrap();
867        let result = fitted.transform(&data).unwrap();
868
869        assert_eq!(result.dim(), data.dim());
870
871        // Normal values should remain unchanged
872        assert_relative_eq!(result[[0, 0]], 1.0, epsilon = 1e-10);
873        assert_relative_eq!(result[[1, 0]], 2.0, epsilon = 1e-10);
874
875        // Outlier should be capped
876        let params = fitted.feature_stats(0).unwrap();
877        assert!(params.upper_bound.is_some());
878    }
879
880    #[test]
881    fn test_trim_transformation() {
882        let data = Array2::from_shape_vec(
883            (11, 1),
884            (1..=10)
885                .map(|x| x as f64)
886                .chain(std::iter::once(1000.0))
887                .collect(),
888        )
889        .unwrap();
890
891        let transformer = OutlierTransformer::trim(10.0, 90.0);
892        let fitted = transformer.fit(&data, &()).unwrap();
893        let result = fitted.transform(&data).unwrap();
894
895        assert_eq!(result.dim(), data.dim());
896
897        // Values should be trimmed to percentile bounds
898        let params = fitted.feature_stats(0).unwrap();
899        assert!(params.lower_bound.is_some());
900        assert!(params.upper_bound.is_some());
901    }
902
903    #[test]
904    fn test_box_cox_transformation() {
905        let data = Array2::from_shape_vec((6, 1), vec![1.0, 2.0, 3.0, 4.0, 5.0, 100.0]).unwrap();
906
907        let transformer = OutlierTransformer::box_cox_fixed(0.5);
908        let fitted = transformer.fit(&data, &()).unwrap();
909        let result = fitted.transform(&data).unwrap();
910
911        assert_eq!(result.dim(), data.dim());
912
913        let params = fitted.feature_stats(0).unwrap();
914        assert!(params.lambda.is_some());
915        assert_relative_eq!(params.lambda.unwrap(), 0.5, epsilon = 1e-10);
916    }
917
918    #[test]
919    fn test_handle_negative_values() {
920        let data = Array2::from_shape_vec((4, 1), vec![-2.0, -1.0, 1.0, 100.0]).unwrap();
921
922        let transformer = OutlierTransformer::log().handle_negatives(true);
923        let fitted = transformer.fit(&data, &()).unwrap();
924        let result = fitted.transform(&data).unwrap();
925
926        assert_eq!(result.dim(), data.dim());
927
928        // Should have applied shift to handle negatives
929        let params = fitted.feature_stats(0).unwrap();
930        assert!(params.shift > 0.0);
931    }
932
933    #[test]
934    fn test_feature_wise_vs_global() {
935        let data =
936            Array2::from_shape_vec((4, 2), vec![1.0, 10.0, 2.0, 20.0, 3.0, 30.0, 100.0, 1000.0])
937                .unwrap();
938
939        // Feature-wise transformation
940        let transformer_fw = OutlierTransformer::log().feature_wise(true);
941        let fitted_fw = transformer_fw.fit(&data, &()).unwrap();
942        let result_fw = fitted_fw.transform(&data).unwrap();
943
944        // Global transformation
945        let transformer_global = OutlierTransformer::log().feature_wise(false);
946        let fitted_global = transformer_global.fit(&data, &()).unwrap();
947        let result_global = fitted_global.transform(&data).unwrap();
948
949        assert_eq!(result_fw.dim(), data.dim());
950        assert_eq!(result_global.dim(), data.dim());
951
952        // Results should be different for feature-wise vs global
953        // (This is a basic check - specific values depend on implementation)
954    }
955
956    #[test]
957    fn test_transformation_error_handling() {
958        let data = Array2::from_shape_vec((2, 2), vec![1.0, 2.0, 3.0, 4.0]).unwrap();
959        let transformer = OutlierTransformer::log();
960        let fitted = transformer.fit(&data, &()).unwrap();
961
962        // Test dimension mismatch
963        let wrong_data =
964            Array2::from_shape_vec((2, 3), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).unwrap();
965        assert!(fitted.transform(&wrong_data).is_err());
966    }
967
968    #[test]
969    fn test_detection_methods() {
970        let data =
971            Array2::from_shape_vec((7, 1), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 100.0]).unwrap();
972
973        // Test different detection methods
974        let methods = vec!["z-score", "iqr", "percentile"];
975
976        for method in methods {
977            let transformer = OutlierTransformer::interpolate(2.0, method);
978            let fitted = transformer.fit(&data, &()).unwrap();
979            let result = fitted.transform(&data).unwrap();
980
981            assert_eq!(result.dim(), data.dim());
982
983            let params = fitted.feature_stats(0).unwrap();
984            assert!(params.lower_bound.is_some());
985            assert!(params.upper_bound.is_some());
986        }
987    }
988}
sklears_preprocessing/outlier_transformation.rs

sklears_preprocessing/
outlier_transformation.rs