sklears_preprocessing/outlier_detection/
core.rs

//! Core types and configurations for outlier detection
2
3use sklears_core::types::Float;
4
/// Methods for outlier detection (both univariate and multivariate).
///
/// The default variant is [`OutlierDetectionMethod::ZScore`]. The type is
/// `Copy` and derives `PartialEq`, `Eq` and `Hash`, so values can be compared
/// directly and used as keys in hash-based collections.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash)]
pub enum OutlierDetectionMethod {
    /// Z-score based outlier detection (univariate)
    #[default]
    ZScore,
    /// Modified Z-score using median absolute deviation (univariate)
    ModifiedZScore,
    /// Interquartile Range (IQR) based detection (univariate)
    IQR,
    /// Percentile-based outlier detection (univariate)
    Percentile,
    /// Mahalanobis distance based detection (multivariate)
    MahalanobisDistance,
    /// Isolation Forest for anomaly detection (multivariate)
    IsolationForest,
    /// Local Outlier Factor (multivariate)
    LocalOutlierFactor,
    /// One-Class SVM for novelty detection (multivariate)
    OneClassSVM,
    /// Ensemble of multiple outlier detection methods
    Ensemble,
}
28
29/// Configuration for outlier detector (supports both univariate and multivariate methods)
30#[derive(Debug, Clone)]
31pub struct OutlierDetectorConfig {
32    /// Method for outlier detection
33    pub method: OutlierDetectionMethod,
34    /// Threshold for Z-score and Modified Z-score methods (default: 3.0)
35    pub threshold: Float,
36    /// Multiplier for IQR method (default: 1.5, commonly used value)
37    pub iqr_multiplier: Float,
38    /// Lower percentile for percentile method (default: 5.0)
39    pub lower_percentile: Float,
40    /// Upper percentile for percentile method (default: 95.0)
41    pub upper_percentile: Float,
42    /// Chi-squared threshold for Mahalanobis distance (default: based on degrees of freedom)
43    pub mahalanobis_threshold: Option<Float>,
44    /// Confidence level for automatic Mahalanobis threshold (default: 0.95)
45    pub confidence_level: Float,
46    /// Number of trees for Isolation Forest (default: 100)
47    pub n_estimators: usize,
48    /// Subsampling size for Isolation Forest (default: 256)
49    pub max_samples: usize,
50    /// Number of neighbors for LOF (default: 20)
51    pub n_neighbors: usize,
52    /// Contamination rate - expected proportion of outliers (default: 0.1)
53    pub contamination: Float,
54    /// Kernel for One-Class SVM (default: RBF)
55    pub svm_kernel: String,
56    /// Nu parameter for One-Class SVM (default: 0.05)
57    pub nu: Float,
58    /// Gamma parameter for RBF kernel (default: scale)
59    pub gamma: Float,
60    /// Ensemble methods to combine (for Ensemble method)
61    pub ensemble_methods: Vec<OutlierDetectionMethod>,
62    /// Voting strategy for ensemble: "majority" or "average" (default: "majority")
63    pub voting_strategy: String,
64}
65
66impl Default for OutlierDetectorConfig {
67    fn default() -> Self {
68        Self {
69            method: OutlierDetectionMethod::ZScore,
70            threshold: 3.0,
71            iqr_multiplier: 1.5,
72            lower_percentile: 5.0,
73            upper_percentile: 95.0,
74            mahalanobis_threshold: None,
75            confidence_level: 0.95,
76            n_estimators: 100,
77            max_samples: 256,
78            n_neighbors: 20,
79            contamination: 0.1,
80            svm_kernel: "rbf".to_string(),
81            nu: 0.05,
82            gamma: 1.0, // Will be set to 1/n_features by default
83            ensemble_methods: vec![
84                OutlierDetectionMethod::ZScore,
85                OutlierDetectionMethod::MahalanobisDistance,
86            ],
87            voting_strategy: "majority".to_string(),
88        }
89    }
90}
91
92/// Statistics computed for outlier detection
93#[derive(Debug, Clone)]
94pub struct OutlierStatistics {
95    pub n_outliers: usize,
96    pub outlier_fraction: Float,
97    pub feature_outlier_counts: Vec<usize>,
98}
99
100/// Parameters for multivariate outlier detection
101#[derive(Debug, Clone, Default)]
102pub struct MultivariateOutlierParams {
103    pub mean: Vec<Float>,
104    pub covariance: Vec<Vec<Float>>,
105    pub inv_covariance: Vec<Vec<Float>>,
106    pub threshold: Float,
107}
108
109/// Parameters for outlier detection on a single feature
110#[derive(Debug, Clone, Default)]
111pub struct FeatureOutlierParams {
112    pub mean: Option<Float>,
113    pub std: Option<Float>,
114    pub median: Option<Float>,
115    pub mad: Option<Float>, // Median Absolute Deviation
116    pub q1: Option<Float>,  // First quartile
117    pub q3: Option<Float>,  // Third quartile
118    pub iqr: Option<Float>, // Interquartile Range
119    pub lower_bound: Option<Float>,
120    pub upper_bound: Option<Float>,
121    pub lower_percentile_value: Option<Float>,
122    pub upper_percentile_value: Option<Float>,
123}
124
125/// Outlier detection results providing detailed information
126#[derive(Debug, Clone)]
127pub struct OutlierDetectionResult {
128    /// Boolean array indicating which samples are outliers
129    pub outliers: Vec<bool>,
130    /// Outlier scores for each sample (higher = more likely outlier)
131    pub scores: Vec<Float>,
132    /// Summary statistics
133    pub summary: OutlierSummary,
134}
135
136/// Summary of outlier detection results
137#[derive(Debug, Clone)]
138pub struct OutlierSummary {
139    /// Total number of samples
140    pub n_samples: usize,
141    /// Number of outliers detected
142    pub n_outliers: usize,
143    /// Fraction of samples that are outliers
144    pub outlier_fraction: Float,
145    /// Method used for detection
146    pub method: OutlierDetectionMethod,
147}