sklears_preprocessing/outlier_detection/
detector.rs1use scirs2_core::ndarray::{Array1, Array2, Axis};
7use sklears_core::{
8 error::{Result, SklearsError},
9 traits::{Fit, Trained, Transform, Untrained},
10 types::Float,
11};
12use std::marker::PhantomData;
13
14use super::core::*;
15use super::simd_operations::*;
16
17#[derive(Debug, Clone)]
25pub struct OutlierDetector<State = Untrained> {
26 config: OutlierDetectorConfig,
27 state: PhantomData<State>,
28 n_features_in_: Option<usize>,
30 statistics_: Option<OutlierStatistics>,
31 feature_params_: Option<Vec<FeatureOutlierParams>>,
32 multivariate_params_: Option<MultivariateOutlierParams>,
33}
34
35impl OutlierDetector<Untrained> {
36 pub fn new() -> Self {
38 Self {
39 config: OutlierDetectorConfig::default(),
40 state: PhantomData,
41 n_features_in_: None,
42 statistics_: None,
43 feature_params_: None,
44 multivariate_params_: None,
45 }
46 }
47
48 pub fn z_score(threshold: Float) -> Self {
50 Self {
51 config: OutlierDetectorConfig {
52 method: OutlierDetectionMethod::ZScore,
53 threshold,
54 ..Default::default()
55 },
56 state: PhantomData,
57 n_features_in_: None,
58 statistics_: None,
59 feature_params_: None,
60 multivariate_params_: None,
61 }
62 }
63
64 pub fn iqr(multiplier: Float) -> Self {
66 Self {
67 config: OutlierDetectorConfig {
68 method: OutlierDetectionMethod::IQR,
69 iqr_multiplier: multiplier,
70 ..Default::default()
71 },
72 state: PhantomData,
73 n_features_in_: None,
74 statistics_: None,
75 feature_params_: None,
76 multivariate_params_: None,
77 }
78 }
79
80 pub fn mahalanobis_distance(confidence_level: Float) -> Self {
82 Self {
83 config: OutlierDetectorConfig {
84 method: OutlierDetectionMethod::MahalanobisDistance,
85 confidence_level,
86 ..Default::default()
87 },
88 state: PhantomData,
89 n_features_in_: None,
90 statistics_: None,
91 feature_params_: None,
92 multivariate_params_: None,
93 }
94 }
95
96 pub fn method(mut self, method: OutlierDetectionMethod) -> Self {
98 self.config.method = method;
99 self
100 }
101
102 pub fn threshold(mut self, threshold: Float) -> Self {
104 self.config.threshold = threshold;
105 self
106 }
107
108 fn compute_feature_statistics(
110 &self,
111 feature_data: &Array1<Float>,
112 ) -> Result<FeatureOutlierParams> {
113 let mut params = FeatureOutlierParams::default();
114
115 match self.config.method {
116 OutlierDetectionMethod::ZScore => {
117 let mean = feature_data.mean().unwrap_or(0.0);
118 let std = feature_data.std(0.0);
119 params.mean = Some(mean);
120 params.std = Some(std);
121 }
122 OutlierDetectionMethod::ModifiedZScore => {
123 let median = self.compute_median(feature_data);
124 let mad = self.compute_mad(feature_data, median);
125 params.median = Some(median);
126 params.mad = Some(mad);
127 }
128 OutlierDetectionMethod::IQR => {
129 let (q1, q3, iqr) = self.compute_quartiles(feature_data);
130 params.q1 = Some(q1);
131 params.q3 = Some(q3);
132 params.iqr = Some(iqr);
133 params.lower_bound = Some(q1 - self.config.iqr_multiplier * iqr);
134 params.upper_bound = Some(q3 + self.config.iqr_multiplier * iqr);
135 }
136 OutlierDetectionMethod::Percentile => {
137 let (lower, upper) = self.compute_percentile_bounds(feature_data);
138 params.lower_percentile_value = Some(lower);
139 params.upper_percentile_value = Some(upper);
140 }
141 _ => {
142 return Err(SklearsError::InvalidParameter {
143 name: "method".to_string(),
144 reason: "Unsupported method for univariate detection".to_string(),
145 });
146 }
147 }
148
149 Ok(params)
150 }
151
152 fn compute_median(&self, data: &Array1<Float>) -> Float {
154 let mut sorted: Vec<Float> = data.to_vec();
155 sorted.sort_by(|a, b| a.partial_cmp(b).expect("operation should succeed"));
156 let n = sorted.len();
157 if n % 2 == 0 {
158 (sorted[n / 2 - 1] + sorted[n / 2]) / 2.0
159 } else {
160 sorted[n / 2]
161 }
162 }
163
164 fn compute_mad(&self, data: &Array1<Float>, median: Float) -> Float {
166 let deviations: Vec<Float> = data.iter().map(|&x| (x - median).abs()).collect();
167 let mut sorted_deviations = deviations;
168 sorted_deviations.sort_by(|a, b| a.partial_cmp(b).expect("operation should succeed"));
169 let n = sorted_deviations.len();
170 if n % 2 == 0 {
171 (sorted_deviations[n / 2 - 1] + sorted_deviations[n / 2]) / 2.0
172 } else {
173 sorted_deviations[n / 2]
174 }
175 }
176
177 fn compute_quartiles(&self, data: &Array1<Float>) -> (Float, Float, Float) {
179 let mut sorted: Vec<Float> = data.to_vec();
180 sorted.sort_by(|a, b| a.partial_cmp(b).expect("operation should succeed"));
181 let n = sorted.len();
182
183 let q1_idx = (0.25 * (n - 1) as Float) as usize;
184 let q3_idx = (0.75 * (n - 1) as Float) as usize;
185
186 let q1 = sorted[q1_idx];
187 let q3 = sorted[q3_idx];
188 let iqr = q3 - q1;
189
190 (q1, q3, iqr)
191 }
192
193 fn compute_percentile_bounds(&self, data: &Array1<Float>) -> (Float, Float) {
195 let mut sorted: Vec<Float> = data.to_vec();
196 sorted.sort_by(|a, b| a.partial_cmp(b).expect("operation should succeed"));
197 let n = sorted.len();
198
199 let lower_idx = ((self.config.lower_percentile / 100.0) * (n - 1) as Float) as usize;
200 let upper_idx = ((self.config.upper_percentile / 100.0) * (n - 1) as Float) as usize;
201
202 (sorted[lower_idx], sorted[upper_idx])
203 }
204
205 fn compute_multivariate_statistics(
207 &self,
208 x: &Array2<Float>,
209 ) -> Result<MultivariateOutlierParams> {
210 let n_features = x.ncols();
211 let n_samples = x.nrows();
212
213 let mean = x
215 .mean_axis(Axis(0))
216 .expect("array should have elements for mean computation")
217 .to_vec();
218
219 let mut covariance = vec![vec![0.0; n_features]; n_features];
221 for i in 0..n_features {
222 for j in i..n_features {
223 let mut cov_sum = 0.0;
224 for k in 0..n_samples {
225 cov_sum += (x[[k, i]] - mean[i]) * (x[[k, j]] - mean[j]);
226 }
227 let cov_val = cov_sum / (n_samples - 1) as Float;
228 covariance[i][j] = cov_val;
229 covariance[j][i] = cov_val; }
231 }
232
233 let inv_covariance = self.matrix_inverse(&covariance)?;
235
236 let threshold = self.chi_squared_threshold(n_features, self.config.confidence_level);
238
239 Ok(MultivariateOutlierParams {
240 mean,
241 covariance,
242 inv_covariance,
243 threshold,
244 })
245 }
246
247 fn matrix_inverse(&self, matrix: &[Vec<Float>]) -> Result<Vec<Vec<Float>>> {
249 let n = matrix.len();
250 if n == 1 {
251 if matrix[0][0].abs() < 1e-12 {
252 return Err(SklearsError::NumericalError("Singular matrix".to_string()));
253 }
254 return Ok(vec![vec![1.0 / matrix[0][0]]]);
255 }
256
257 let mut inv = vec![vec![0.0; n]; n];
260 for i in 0..n {
261 inv[i][i] = 1.0;
262 }
263 Ok(inv)
264 }
265
266 fn chi_squared_threshold(&self, degrees_of_freedom: usize, confidence_level: Float) -> Float {
268 let df = degrees_of_freedom as Float;
270 match confidence_level {
271 x if x >= 0.99 => df + 3.0 * df.sqrt(),
272 x if x >= 0.95 => df + 2.0 * df.sqrt(),
273 x if x >= 0.90 => df + 1.5 * df.sqrt(),
274 _ => df + df.sqrt(),
275 }
276 }
277}
278
279impl OutlierDetector<Trained> {
280 pub fn statistics(&self) -> &OutlierStatistics {
282 self.statistics_
283 .as_ref()
284 .expect("OutlierDetector should be fitted")
285 }
286
287 pub fn n_features_in(&self) -> usize {
289 self.n_features_in_.unwrap_or(0)
290 }
291
292 pub fn is_outlier_single(&self, feature_idx: usize, value: Float) -> bool {
294 if let Some(ref params) = self.feature_params_ {
295 if feature_idx < params.len() {
296 return self.is_outlier_for_params(¶ms[feature_idx], value);
297 }
298 }
299 false
300 }
301
302 fn is_outlier_for_params(&self, params: &FeatureOutlierParams, value: Float) -> bool {
304 match self.config.method {
305 OutlierDetectionMethod::ZScore => {
306 if let (Some(mean), Some(std)) = (params.mean, params.std) {
307 let z_score = (value - mean) / std;
308 z_score.abs() > self.config.threshold
309 } else {
310 false
311 }
312 }
313 OutlierDetectionMethod::ModifiedZScore => {
314 if let (Some(median), Some(mad)) = (params.median, params.mad) {
315 let modified_z = 0.6745 * (value - median) / mad;
316 modified_z.abs() > self.config.threshold
317 } else {
318 false
319 }
320 }
321 OutlierDetectionMethod::IQR => {
322 if let (Some(lower), Some(upper)) = (params.lower_bound, params.upper_bound) {
323 value < lower || value > upper
324 } else {
325 false
326 }
327 }
328 OutlierDetectionMethod::Percentile => {
329 if let (Some(lower), Some(upper)) =
330 (params.lower_percentile_value, params.upper_percentile_value)
331 {
332 value < lower || value > upper
333 } else {
334 false
335 }
336 }
337 _ => false,
338 }
339 }
340
341 pub fn detect_outliers(&self, x: &Array2<Float>) -> Result<OutlierDetectionResult> {
343 let outlier_mask = self.transform(x)?;
344 let scores = self.outlier_scores(x)?;
345
346 let outliers: Vec<bool> = outlier_mask.iter().map(|&x| x).collect();
347 let n_outliers = outliers.iter().filter(|&&x| x).count();
348 let n_samples = outliers.len();
349
350 Ok(OutlierDetectionResult {
351 outliers: outliers.clone(),
352 scores,
353 summary: OutlierSummary {
354 n_samples,
355 n_outliers,
356 outlier_fraction: n_outliers as Float / n_samples as Float,
357 method: self.config.method,
358 },
359 })
360 }
361
362 pub fn outlier_scores(&self, x: &Array2<Float>) -> Result<Vec<Float>> {
364 let n_samples = x.nrows();
365 let mut scores = vec![0.0; n_samples];
366
367 match self.config.method {
368 OutlierDetectionMethod::ZScore => {
369 if let Some(ref params) = self.feature_params_ {
370 for i in 0..n_samples {
371 let mut max_score: Float = 0.0;
372 for (j, param) in params.iter().enumerate() {
373 if let (Some(mean), Some(std)) = (param.mean, param.std) {
374 let z_score = ((x[[i, j]] - mean) / std).abs();
375 max_score = max_score.max(z_score);
376 }
377 }
378 scores[i] = max_score;
379 }
380 }
381 }
382 OutlierDetectionMethod::MahalanobisDistance => {
383 if let Some(ref params) = self.multivariate_params_ {
384 for i in 0..n_samples {
385 let sample: Vec<Float> = (0..x.ncols()).map(|j| x[[i, j]]).collect();
386 let mut centered = vec![0.0; sample.len()];
387
388 for (j, &val) in sample.iter().enumerate() {
390 centered[j] = val - params.mean[j];
391 }
392
393 let temp = simd_matvec_multiply(¶ms.inv_covariance, ¢ered);
395 scores[i] = simd_dot_product(¢ered, &temp).sqrt();
396 }
397 }
398 }
399 _ => {
400 for i in 0..n_samples {
402 scores[i] = 0.0;
403 }
404 }
405 }
406
407 Ok(scores)
408 }
409}
410
411impl Default for OutlierDetector<Untrained> {
412 fn default() -> Self {
413 Self::new()
414 }
415}
416
417impl Fit<Array2<Float>, ()> for OutlierDetector<Untrained> {
418 type Fitted = OutlierDetector<Trained>;
419
420 fn fit(self, x: &Array2<Float>, _y: &()) -> Result<Self::Fitted> {
421 let (n_samples, n_features) = x.dim();
422
423 if n_samples == 0 {
424 return Err(SklearsError::InvalidInput(
425 "Cannot fit on empty dataset".to_string(),
426 ));
427 }
428
429 let mut feature_params = None;
430 let mut multivariate_params = None;
431
432 match self.config.method {
433 OutlierDetectionMethod::MahalanobisDistance => {
434 multivariate_params = Some(self.compute_multivariate_statistics(x)?);
435 }
436 _ => {
437 let mut params = Vec::new();
439 for j in 0..n_features {
440 let feature_column = x.column(j).to_owned();
441 params.push(self.compute_feature_statistics(&feature_column)?);
442 }
443 feature_params = Some(params);
444 }
445 }
446
447 let statistics = OutlierStatistics {
448 n_outliers: 0, outlier_fraction: 0.0,
450 feature_outlier_counts: vec![0; n_features],
451 };
452
453 Ok(OutlierDetector {
454 config: self.config,
455 state: PhantomData,
456 n_features_in_: Some(n_features),
457 statistics_: Some(statistics),
458 feature_params_: feature_params,
459 multivariate_params_: multivariate_params,
460 })
461 }
462}
463
464impl Transform<Array2<Float>, Array2<bool>> for OutlierDetector<Trained> {
465 fn transform(&self, x: &Array2<Float>) -> Result<Array2<bool>> {
466 let (n_samples, n_features) = x.dim();
467
468 if n_features != self.n_features_in() {
469 return Err(SklearsError::FeatureMismatch {
470 expected: self.n_features_in(),
471 actual: n_features,
472 });
473 }
474
475 let mut outlier_mask = Array2::<bool>::default((n_samples, 1));
476
477 match self.config.method {
478 OutlierDetectionMethod::MahalanobisDistance => {
479 if let Some(ref params) = self.multivariate_params_ {
480 for i in 0..n_samples {
481 let sample: Vec<Float> = (0..n_features).map(|j| x[[i, j]]).collect();
482 let mut centered = vec![0.0; sample.len()];
483
484 for (j, &val) in sample.iter().enumerate() {
486 centered[j] = val - params.mean[j];
487 }
488
489 let temp = simd_matvec_multiply(¶ms.inv_covariance, ¢ered);
491 let distance_squared = simd_dot_product(¢ered, &temp);
492
493 outlier_mask[[i, 0]] = distance_squared > params.threshold;
494 }
495 }
496 }
497 _ => {
498 if let Some(ref params) = self.feature_params_ {
500 for i in 0..n_samples {
501 let mut is_outlier = false;
502 for (j, param) in params.iter().enumerate() {
503 if self.is_outlier_for_params(param, x[[i, j]]) {
504 is_outlier = true;
505 break; }
507 }
508 outlier_mask[[i, 0]] = is_outlier;
509 }
510 }
511 }
512 }
513
514 Ok(outlier_mask)
515 }
516}