sklears_preprocessing/outlier_detection/
detector.rs1use scirs2_core::ndarray::{Array1, Array2, Axis};
7use sklears_core::{
8 error::{Result, SklearsError},
9 traits::{Fit, Trained, Transform, Untrained},
10 types::Float,
11};
12use std::marker::PhantomData;
13
14use super::core::*;
15use super::simd_operations::*;
16
17#[derive(Debug, Clone)]
25pub struct OutlierDetector<State = Untrained> {
26 config: OutlierDetectorConfig,
27 state: PhantomData<State>,
28 n_features_in_: Option<usize>,
30 statistics_: Option<OutlierStatistics>,
31 feature_params_: Option<Vec<FeatureOutlierParams>>,
32 multivariate_params_: Option<MultivariateOutlierParams>,
33}
34
35impl OutlierDetector<Untrained> {
36 pub fn new() -> Self {
38 Self {
39 config: OutlierDetectorConfig::default(),
40 state: PhantomData,
41 n_features_in_: None,
42 statistics_: None,
43 feature_params_: None,
44 multivariate_params_: None,
45 }
46 }
47
48 pub fn z_score(threshold: Float) -> Self {
50 Self {
51 config: OutlierDetectorConfig {
52 method: OutlierDetectionMethod::ZScore,
53 threshold,
54 ..Default::default()
55 },
56 state: PhantomData,
57 n_features_in_: None,
58 statistics_: None,
59 feature_params_: None,
60 multivariate_params_: None,
61 }
62 }
63
64 pub fn iqr(multiplier: Float) -> Self {
66 Self {
67 config: OutlierDetectorConfig {
68 method: OutlierDetectionMethod::IQR,
69 iqr_multiplier: multiplier,
70 ..Default::default()
71 },
72 state: PhantomData,
73 n_features_in_: None,
74 statistics_: None,
75 feature_params_: None,
76 multivariate_params_: None,
77 }
78 }
79
80 pub fn mahalanobis_distance(confidence_level: Float) -> Self {
82 Self {
83 config: OutlierDetectorConfig {
84 method: OutlierDetectionMethod::MahalanobisDistance,
85 confidence_level,
86 ..Default::default()
87 },
88 state: PhantomData,
89 n_features_in_: None,
90 statistics_: None,
91 feature_params_: None,
92 multivariate_params_: None,
93 }
94 }
95
96 pub fn method(mut self, method: OutlierDetectionMethod) -> Self {
98 self.config.method = method;
99 self
100 }
101
102 pub fn threshold(mut self, threshold: Float) -> Self {
104 self.config.threshold = threshold;
105 self
106 }
107
108 fn compute_feature_statistics(
110 &self,
111 feature_data: &Array1<Float>,
112 ) -> Result<FeatureOutlierParams> {
113 let mut params = FeatureOutlierParams::default();
114
115 match self.config.method {
116 OutlierDetectionMethod::ZScore => {
117 let mean = feature_data.mean().unwrap_or(0.0);
118 let std = feature_data.std(0.0);
119 params.mean = Some(mean);
120 params.std = Some(std);
121 }
122 OutlierDetectionMethod::ModifiedZScore => {
123 let median = self.compute_median(feature_data);
124 let mad = self.compute_mad(feature_data, median);
125 params.median = Some(median);
126 params.mad = Some(mad);
127 }
128 OutlierDetectionMethod::IQR => {
129 let (q1, q3, iqr) = self.compute_quartiles(feature_data);
130 params.q1 = Some(q1);
131 params.q3 = Some(q3);
132 params.iqr = Some(iqr);
133 params.lower_bound = Some(q1 - self.config.iqr_multiplier * iqr);
134 params.upper_bound = Some(q3 + self.config.iqr_multiplier * iqr);
135 }
136 OutlierDetectionMethod::Percentile => {
137 let (lower, upper) = self.compute_percentile_bounds(feature_data);
138 params.lower_percentile_value = Some(lower);
139 params.upper_percentile_value = Some(upper);
140 }
141 _ => {
142 return Err(SklearsError::InvalidParameter {
143 name: "method".to_string(),
144 reason: "Unsupported method for univariate detection".to_string(),
145 });
146 }
147 }
148
149 Ok(params)
150 }
151
152 fn compute_median(&self, data: &Array1<Float>) -> Float {
154 let mut sorted: Vec<Float> = data.to_vec();
155 sorted.sort_by(|a, b| a.partial_cmp(b).unwrap());
156 let n = sorted.len();
157 if n % 2 == 0 {
158 (sorted[n / 2 - 1] + sorted[n / 2]) / 2.0
159 } else {
160 sorted[n / 2]
161 }
162 }
163
164 fn compute_mad(&self, data: &Array1<Float>, median: Float) -> Float {
166 let deviations: Vec<Float> = data.iter().map(|&x| (x - median).abs()).collect();
167 let mut sorted_deviations = deviations;
168 sorted_deviations.sort_by(|a, b| a.partial_cmp(b).unwrap());
169 let n = sorted_deviations.len();
170 if n % 2 == 0 {
171 (sorted_deviations[n / 2 - 1] + sorted_deviations[n / 2]) / 2.0
172 } else {
173 sorted_deviations[n / 2]
174 }
175 }
176
177 fn compute_quartiles(&self, data: &Array1<Float>) -> (Float, Float, Float) {
179 let mut sorted: Vec<Float> = data.to_vec();
180 sorted.sort_by(|a, b| a.partial_cmp(b).unwrap());
181 let n = sorted.len();
182
183 let q1_idx = (0.25 * (n - 1) as Float) as usize;
184 let q3_idx = (0.75 * (n - 1) as Float) as usize;
185
186 let q1 = sorted[q1_idx];
187 let q3 = sorted[q3_idx];
188 let iqr = q3 - q1;
189
190 (q1, q3, iqr)
191 }
192
193 fn compute_percentile_bounds(&self, data: &Array1<Float>) -> (Float, Float) {
195 let mut sorted: Vec<Float> = data.to_vec();
196 sorted.sort_by(|a, b| a.partial_cmp(b).unwrap());
197 let n = sorted.len();
198
199 let lower_idx = ((self.config.lower_percentile / 100.0) * (n - 1) as Float) as usize;
200 let upper_idx = ((self.config.upper_percentile / 100.0) * (n - 1) as Float) as usize;
201
202 (sorted[lower_idx], sorted[upper_idx])
203 }
204
205 fn compute_multivariate_statistics(
207 &self,
208 x: &Array2<Float>,
209 ) -> Result<MultivariateOutlierParams> {
210 let n_features = x.ncols();
211 let n_samples = x.nrows();
212
213 let mean = x.mean_axis(Axis(0)).unwrap().to_vec();
215
216 let mut covariance = vec![vec![0.0; n_features]; n_features];
218 for i in 0..n_features {
219 for j in i..n_features {
220 let mut cov_sum = 0.0;
221 for k in 0..n_samples {
222 cov_sum += (x[[k, i]] - mean[i]) * (x[[k, j]] - mean[j]);
223 }
224 let cov_val = cov_sum / (n_samples - 1) as Float;
225 covariance[i][j] = cov_val;
226 covariance[j][i] = cov_val; }
228 }
229
230 let inv_covariance = self.matrix_inverse(&covariance)?;
232
233 let threshold = self.chi_squared_threshold(n_features, self.config.confidence_level);
235
236 Ok(MultivariateOutlierParams {
237 mean,
238 covariance,
239 inv_covariance,
240 threshold,
241 })
242 }
243
244 fn matrix_inverse(&self, matrix: &[Vec<Float>]) -> Result<Vec<Vec<Float>>> {
246 let n = matrix.len();
247 if n == 1 {
248 if matrix[0][0].abs() < 1e-12 {
249 return Err(SklearsError::NumericalError("Singular matrix".to_string()));
250 }
251 return Ok(vec![vec![1.0 / matrix[0][0]]]);
252 }
253
254 let mut inv = vec![vec![0.0; n]; n];
257 for i in 0..n {
258 inv[i][i] = 1.0;
259 }
260 Ok(inv)
261 }
262
263 fn chi_squared_threshold(&self, degrees_of_freedom: usize, confidence_level: Float) -> Float {
265 let df = degrees_of_freedom as Float;
267 match confidence_level {
268 x if x >= 0.99 => df + 3.0 * df.sqrt(),
269 x if x >= 0.95 => df + 2.0 * df.sqrt(),
270 x if x >= 0.90 => df + 1.5 * df.sqrt(),
271 _ => df + df.sqrt(),
272 }
273 }
274}
275
276impl OutlierDetector<Trained> {
277 pub fn statistics(&self) -> &OutlierStatistics {
279 self.statistics_
280 .as_ref()
281 .expect("OutlierDetector should be fitted")
282 }
283
284 pub fn n_features_in(&self) -> usize {
286 self.n_features_in_.unwrap_or(0)
287 }
288
289 pub fn is_outlier_single(&self, feature_idx: usize, value: Float) -> bool {
291 if let Some(ref params) = self.feature_params_ {
292 if feature_idx < params.len() {
293 return self.is_outlier_for_params(¶ms[feature_idx], value);
294 }
295 }
296 false
297 }
298
299 fn is_outlier_for_params(&self, params: &FeatureOutlierParams, value: Float) -> bool {
301 match self.config.method {
302 OutlierDetectionMethod::ZScore => {
303 if let (Some(mean), Some(std)) = (params.mean, params.std) {
304 let z_score = (value - mean) / std;
305 z_score.abs() > self.config.threshold
306 } else {
307 false
308 }
309 }
310 OutlierDetectionMethod::ModifiedZScore => {
311 if let (Some(median), Some(mad)) = (params.median, params.mad) {
312 let modified_z = 0.6745 * (value - median) / mad;
313 modified_z.abs() > self.config.threshold
314 } else {
315 false
316 }
317 }
318 OutlierDetectionMethod::IQR => {
319 if let (Some(lower), Some(upper)) = (params.lower_bound, params.upper_bound) {
320 value < lower || value > upper
321 } else {
322 false
323 }
324 }
325 OutlierDetectionMethod::Percentile => {
326 if let (Some(lower), Some(upper)) =
327 (params.lower_percentile_value, params.upper_percentile_value)
328 {
329 value < lower || value > upper
330 } else {
331 false
332 }
333 }
334 _ => false,
335 }
336 }
337
338 pub fn detect_outliers(&self, x: &Array2<Float>) -> Result<OutlierDetectionResult> {
340 let outlier_mask = self.transform(x)?;
341 let scores = self.outlier_scores(x)?;
342
343 let outliers: Vec<bool> = outlier_mask.iter().map(|&x| x).collect();
344 let n_outliers = outliers.iter().filter(|&&x| x).count();
345 let n_samples = outliers.len();
346
347 Ok(OutlierDetectionResult {
348 outliers: outliers.clone(),
349 scores,
350 summary: OutlierSummary {
351 n_samples,
352 n_outliers,
353 outlier_fraction: n_outliers as Float / n_samples as Float,
354 method: self.config.method,
355 },
356 })
357 }
358
359 pub fn outlier_scores(&self, x: &Array2<Float>) -> Result<Vec<Float>> {
361 let n_samples = x.nrows();
362 let mut scores = vec![0.0; n_samples];
363
364 match self.config.method {
365 OutlierDetectionMethod::ZScore => {
366 if let Some(ref params) = self.feature_params_ {
367 for i in 0..n_samples {
368 let mut max_score: Float = 0.0;
369 for (j, param) in params.iter().enumerate() {
370 if let (Some(mean), Some(std)) = (param.mean, param.std) {
371 let z_score = ((x[[i, j]] - mean) / std).abs();
372 max_score = max_score.max(z_score);
373 }
374 }
375 scores[i] = max_score;
376 }
377 }
378 }
379 OutlierDetectionMethod::MahalanobisDistance => {
380 if let Some(ref params) = self.multivariate_params_ {
381 for i in 0..n_samples {
382 let sample: Vec<Float> = (0..x.ncols()).map(|j| x[[i, j]]).collect();
383 let mut centered = vec![0.0; sample.len()];
384
385 for (j, &val) in sample.iter().enumerate() {
387 centered[j] = val - params.mean[j];
388 }
389
390 let temp = simd_matvec_multiply(¶ms.inv_covariance, ¢ered);
392 scores[i] = simd_dot_product(¢ered, &temp).sqrt();
393 }
394 }
395 }
396 _ => {
397 for i in 0..n_samples {
399 scores[i] = 0.0;
400 }
401 }
402 }
403
404 Ok(scores)
405 }
406}
407
408impl Default for OutlierDetector<Untrained> {
409 fn default() -> Self {
410 Self::new()
411 }
412}
413
414impl Fit<Array2<Float>, ()> for OutlierDetector<Untrained> {
415 type Fitted = OutlierDetector<Trained>;
416
417 fn fit(self, x: &Array2<Float>, _y: &()) -> Result<Self::Fitted> {
418 let (n_samples, n_features) = x.dim();
419
420 if n_samples == 0 {
421 return Err(SklearsError::InvalidInput(
422 "Cannot fit on empty dataset".to_string(),
423 ));
424 }
425
426 let mut feature_params = None;
427 let mut multivariate_params = None;
428
429 match self.config.method {
430 OutlierDetectionMethod::MahalanobisDistance => {
431 multivariate_params = Some(self.compute_multivariate_statistics(x)?);
432 }
433 _ => {
434 let mut params = Vec::new();
436 for j in 0..n_features {
437 let feature_column = x.column(j).to_owned();
438 params.push(self.compute_feature_statistics(&feature_column)?);
439 }
440 feature_params = Some(params);
441 }
442 }
443
444 let statistics = OutlierStatistics {
445 n_outliers: 0, outlier_fraction: 0.0,
447 feature_outlier_counts: vec![0; n_features],
448 };
449
450 Ok(OutlierDetector {
451 config: self.config,
452 state: PhantomData,
453 n_features_in_: Some(n_features),
454 statistics_: Some(statistics),
455 feature_params_: feature_params,
456 multivariate_params_: multivariate_params,
457 })
458 }
459}
460
461impl Transform<Array2<Float>, Array2<bool>> for OutlierDetector<Trained> {
462 fn transform(&self, x: &Array2<Float>) -> Result<Array2<bool>> {
463 let (n_samples, n_features) = x.dim();
464
465 if n_features != self.n_features_in() {
466 return Err(SklearsError::FeatureMismatch {
467 expected: self.n_features_in(),
468 actual: n_features,
469 });
470 }
471
472 let mut outlier_mask = Array2::<bool>::default((n_samples, 1));
473
474 match self.config.method {
475 OutlierDetectionMethod::MahalanobisDistance => {
476 if let Some(ref params) = self.multivariate_params_ {
477 for i in 0..n_samples {
478 let sample: Vec<Float> = (0..n_features).map(|j| x[[i, j]]).collect();
479 let mut centered = vec![0.0; sample.len()];
480
481 for (j, &val) in sample.iter().enumerate() {
483 centered[j] = val - params.mean[j];
484 }
485
486 let temp = simd_matvec_multiply(¶ms.inv_covariance, ¢ered);
488 let distance_squared = simd_dot_product(¢ered, &temp);
489
490 outlier_mask[[i, 0]] = distance_squared > params.threshold;
491 }
492 }
493 }
494 _ => {
495 if let Some(ref params) = self.feature_params_ {
497 for i in 0..n_samples {
498 let mut is_outlier = false;
499 for (j, param) in params.iter().enumerate() {
500 if self.is_outlier_for_params(param, x[[i, j]]) {
501 is_outlier = true;
502 break; }
504 }
505 outlier_mask[[i, 0]] = is_outlier;
506 }
507 }
508 }
509 }
510
511 Ok(outlier_mask)
512 }
513}