sklears_preprocessing/outlier_detection/core.rs
1//! Core types and configurations for outlier detection
2
3use sklears_core::types::Float;
4
/// Methods for outlier detection (both univariate and multivariate).
///
/// The enum is a plain, field-less tag, so it is `Copy` and can be compared
/// with `==` or used as a key in hashed collections. The default method is
/// [`OutlierDetectionMethod::ZScore`].
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash)]
pub enum OutlierDetectionMethod {
    /// Z-score based outlier detection (univariate)
    #[default]
    ZScore,
    /// Modified Z-score using median absolute deviation (univariate)
    ModifiedZScore,
    /// Interquartile Range (IQR) based detection (univariate)
    IQR,
    /// Percentile-based outlier detection (univariate)
    Percentile,
    /// Mahalanobis distance based detection (multivariate)
    MahalanobisDistance,
    /// Isolation Forest for anomaly detection (multivariate)
    IsolationForest,
    /// Local Outlier Factor (multivariate)
    LocalOutlierFactor,
    /// One-Class SVM for novelty detection (multivariate)
    OneClassSVM,
    /// Ensemble of multiple outlier detection methods
    Ensemble,
}
28
/// Configuration for outlier detector (supports both univariate and multivariate methods).
///
/// A single flat struct carries the knobs for every detection method; the
/// `method` field selects which one is used. The defaults quoted on each field
/// are the values assigned by this type's `Default` implementation.
#[derive(Debug, Clone)]
pub struct OutlierDetectorConfig {
    /// Method for outlier detection (default: `ZScore`)
    pub method: OutlierDetectionMethod,
    /// Threshold for Z-score and Modified Z-score methods (default: 3.0)
    pub threshold: Float,
    /// Multiplier for IQR method (default: 1.5, commonly used value)
    pub iqr_multiplier: Float,
    /// Lower percentile for percentile method (default: 5.0)
    pub lower_percentile: Float,
    /// Upper percentile for percentile method (default: 95.0)
    pub upper_percentile: Float,
    /// Chi-squared threshold for Mahalanobis distance (default: `None`).
    ///
    /// NOTE(review): when `None`, the threshold is presumably derived from
    /// `confidence_level` and the degrees of freedom — confirm in the detector.
    pub mahalanobis_threshold: Option<Float>,
    /// Confidence level for automatic Mahalanobis threshold (default: 0.95)
    pub confidence_level: Float,
    /// Number of trees for Isolation Forest (default: 100)
    pub n_estimators: usize,
    /// Subsampling size for Isolation Forest (default: 256)
    pub max_samples: usize,
    /// Number of neighbors for LOF (default: 20)
    pub n_neighbors: usize,
    /// Contamination rate - expected proportion of outliers (default: 0.1)
    pub contamination: Float,
    /// Kernel for One-Class SVM, given as a string (default: "rbf")
    pub svm_kernel: String,
    /// Nu parameter for One-Class SVM (default: 0.05)
    pub nu: Float,
    /// Gamma parameter for RBF kernel (default: 1.0).
    ///
    /// NOTE(review): a comment in the `Default` impl says this is replaced by
    /// 1/n_features; that substitution is not visible in this file — confirm.
    pub gamma: Float,
    /// Ensemble methods to combine (for Ensemble method);
    /// default: `ZScore` and `MahalanobisDistance`
    pub ensemble_methods: Vec<OutlierDetectionMethod>,
    /// Voting strategy for ensemble: "majority" or "average" (default: "majority")
    pub voting_strategy: String,
}
65
66impl Default for OutlierDetectorConfig {
67 fn default() -> Self {
68 Self {
69 method: OutlierDetectionMethod::ZScore,
70 threshold: 3.0,
71 iqr_multiplier: 1.5,
72 lower_percentile: 5.0,
73 upper_percentile: 95.0,
74 mahalanobis_threshold: None,
75 confidence_level: 0.95,
76 n_estimators: 100,
77 max_samples: 256,
78 n_neighbors: 20,
79 contamination: 0.1,
80 svm_kernel: "rbf".to_string(),
81 nu: 0.05,
82 gamma: 1.0, // Will be set to 1/n_features by default
83 ensemble_methods: vec![
84 OutlierDetectionMethod::ZScore,
85 OutlierDetectionMethod::MahalanobisDistance,
86 ],
87 voting_strategy: "majority".to_string(),
88 }
89 }
90}
91
92/// Statistics computed for outlier detection
93#[derive(Debug, Clone)]
94pub struct OutlierStatistics {
95 pub n_outliers: usize,
96 pub outlier_fraction: Float,
97 pub feature_outlier_counts: Vec<usize>,
98}
99
100/// Parameters for multivariate outlier detection
101#[derive(Debug, Clone, Default)]
102pub struct MultivariateOutlierParams {
103 pub mean: Vec<Float>,
104 pub covariance: Vec<Vec<Float>>,
105 pub inv_covariance: Vec<Vec<Float>>,
106 pub threshold: Float,
107}
108
109/// Parameters for outlier detection on a single feature
110#[derive(Debug, Clone, Default)]
111pub struct FeatureOutlierParams {
112 pub mean: Option<Float>,
113 pub std: Option<Float>,
114 pub median: Option<Float>,
115 pub mad: Option<Float>, // Median Absolute Deviation
116 pub q1: Option<Float>, // First quartile
117 pub q3: Option<Float>, // Third quartile
118 pub iqr: Option<Float>, // Interquartile Range
119 pub lower_bound: Option<Float>,
120 pub upper_bound: Option<Float>,
121 pub lower_percentile_value: Option<Float>,
122 pub upper_percentile_value: Option<Float>,
123}
124
/// Outlier detection results providing detailed information.
///
/// Bundles the per-sample outlier flags and scores with an aggregate summary.
#[derive(Debug, Clone)]
pub struct OutlierDetectionResult {
    /// Boolean array indicating which samples are outliers
    pub outliers: Vec<bool>,
    /// Outlier scores for each sample (higher = more likely outlier)
    pub scores: Vec<Float>,
    /// Summary statistics
    pub summary: OutlierSummary,
}
135
/// Summary of outlier detection results.
///
/// Holds the aggregate counts together with the method that produced them.
#[derive(Debug, Clone)]
pub struct OutlierSummary {
    /// Total number of samples
    pub n_samples: usize,
    /// Number of outliers detected
    pub n_outliers: usize,
    /// Fraction of samples that are outliers
    pub outlier_fraction: Float,
    /// Method used for detection
    pub method: OutlierDetectionMethod,
}