sklears_preprocessing/outlier_detection/
detector.rs

1//! Main outlier detection implementation
2//!
3//! This module provides the core OutlierDetector implementation with support
4//! for multiple detection methods.
5
6use scirs2_core::ndarray::{Array1, Array2, Axis};
7use sklears_core::{
8    error::{Result, SklearsError},
9    traits::{Fit, Trained, Transform, Untrained},
10    types::Float,
11};
12use std::marker::PhantomData;
13
14use super::core::*;
15use super::simd_operations::*;
16
17/// Univariate outlier detector for identifying anomalous data points
18///
19/// This detector provides several methods for identifying outliers in univariate data:
20/// - Z-score: Points with |z-score| > threshold are considered outliers
21/// - Modified Z-score: Uses median and MAD instead of mean and std for robustness
22/// - IQR: Points outside Q1 - k*IQR or Q3 + k*IQR are considered outliers
23/// - Percentile: Points outside specified percentile bounds are considered outliers
24#[derive(Debug, Clone)]
25pub struct OutlierDetector<State = Untrained> {
26    config: OutlierDetectorConfig,
27    state: PhantomData<State>,
28    // Fitted parameters
29    n_features_in_: Option<usize>,
30    statistics_: Option<OutlierStatistics>,
31    feature_params_: Option<Vec<FeatureOutlierParams>>,
32    multivariate_params_: Option<MultivariateOutlierParams>,
33}
34
35impl OutlierDetector<Untrained> {
36    /// Create a new OutlierDetector
37    pub fn new() -> Self {
38        Self {
39            config: OutlierDetectorConfig::default(),
40            state: PhantomData,
41            n_features_in_: None,
42            statistics_: None,
43            feature_params_: None,
44            multivariate_params_: None,
45        }
46    }
47
48    /// Create Z-score based outlier detector
49    pub fn z_score(threshold: Float) -> Self {
50        Self {
51            config: OutlierDetectorConfig {
52                method: OutlierDetectionMethod::ZScore,
53                threshold,
54                ..Default::default()
55            },
56            state: PhantomData,
57            n_features_in_: None,
58            statistics_: None,
59            feature_params_: None,
60            multivariate_params_: None,
61        }
62    }
63
64    /// Create IQR-based outlier detector
65    pub fn iqr(multiplier: Float) -> Self {
66        Self {
67            config: OutlierDetectorConfig {
68                method: OutlierDetectionMethod::IQR,
69                iqr_multiplier: multiplier,
70                ..Default::default()
71            },
72            state: PhantomData,
73            n_features_in_: None,
74            statistics_: None,
75            feature_params_: None,
76            multivariate_params_: None,
77        }
78    }
79
80    /// Create Mahalanobis distance based outlier detector
81    pub fn mahalanobis_distance(confidence_level: Float) -> Self {
82        Self {
83            config: OutlierDetectorConfig {
84                method: OutlierDetectionMethod::MahalanobisDistance,
85                confidence_level,
86                ..Default::default()
87            },
88            state: PhantomData,
89            n_features_in_: None,
90            statistics_: None,
91            feature_params_: None,
92            multivariate_params_: None,
93        }
94    }
95
96    /// Set the detection method
97    pub fn method(mut self, method: OutlierDetectionMethod) -> Self {
98        self.config.method = method;
99        self
100    }
101
102    /// Set the threshold
103    pub fn threshold(mut self, threshold: Float) -> Self {
104        self.config.threshold = threshold;
105        self
106    }
107
108    /// Compute statistics for a single feature
109    fn compute_feature_statistics(
110        &self,
111        feature_data: &Array1<Float>,
112    ) -> Result<FeatureOutlierParams> {
113        let mut params = FeatureOutlierParams::default();
114
115        match self.config.method {
116            OutlierDetectionMethod::ZScore => {
117                let mean = feature_data.mean().unwrap_or(0.0);
118                let std = feature_data.std(0.0);
119                params.mean = Some(mean);
120                params.std = Some(std);
121            }
122            OutlierDetectionMethod::ModifiedZScore => {
123                let median = self.compute_median(feature_data);
124                let mad = self.compute_mad(feature_data, median);
125                params.median = Some(median);
126                params.mad = Some(mad);
127            }
128            OutlierDetectionMethod::IQR => {
129                let (q1, q3, iqr) = self.compute_quartiles(feature_data);
130                params.q1 = Some(q1);
131                params.q3 = Some(q3);
132                params.iqr = Some(iqr);
133                params.lower_bound = Some(q1 - self.config.iqr_multiplier * iqr);
134                params.upper_bound = Some(q3 + self.config.iqr_multiplier * iqr);
135            }
136            OutlierDetectionMethod::Percentile => {
137                let (lower, upper) = self.compute_percentile_bounds(feature_data);
138                params.lower_percentile_value = Some(lower);
139                params.upper_percentile_value = Some(upper);
140            }
141            _ => {
142                return Err(SklearsError::InvalidParameter {
143                    name: "method".to_string(),
144                    reason: "Unsupported method for univariate detection".to_string(),
145                });
146            }
147        }
148
149        Ok(params)
150    }
151
152    /// Compute median of data
153    fn compute_median(&self, data: &Array1<Float>) -> Float {
154        let mut sorted: Vec<Float> = data.to_vec();
155        sorted.sort_by(|a, b| a.partial_cmp(b).unwrap());
156        let n = sorted.len();
157        if n % 2 == 0 {
158            (sorted[n / 2 - 1] + sorted[n / 2]) / 2.0
159        } else {
160            sorted[n / 2]
161        }
162    }
163
164    /// Compute Median Absolute Deviation
165    fn compute_mad(&self, data: &Array1<Float>, median: Float) -> Float {
166        let deviations: Vec<Float> = data.iter().map(|&x| (x - median).abs()).collect();
167        let mut sorted_deviations = deviations;
168        sorted_deviations.sort_by(|a, b| a.partial_cmp(b).unwrap());
169        let n = sorted_deviations.len();
170        if n % 2 == 0 {
171            (sorted_deviations[n / 2 - 1] + sorted_deviations[n / 2]) / 2.0
172        } else {
173            sorted_deviations[n / 2]
174        }
175    }
176
177    /// Compute quartiles and IQR
178    fn compute_quartiles(&self, data: &Array1<Float>) -> (Float, Float, Float) {
179        let mut sorted: Vec<Float> = data.to_vec();
180        sorted.sort_by(|a, b| a.partial_cmp(b).unwrap());
181        let n = sorted.len();
182
183        let q1_idx = (0.25 * (n - 1) as Float) as usize;
184        let q3_idx = (0.75 * (n - 1) as Float) as usize;
185
186        let q1 = sorted[q1_idx];
187        let q3 = sorted[q3_idx];
188        let iqr = q3 - q1;
189
190        (q1, q3, iqr)
191    }
192
193    /// Compute percentile bounds
194    fn compute_percentile_bounds(&self, data: &Array1<Float>) -> (Float, Float) {
195        let mut sorted: Vec<Float> = data.to_vec();
196        sorted.sort_by(|a, b| a.partial_cmp(b).unwrap());
197        let n = sorted.len();
198
199        let lower_idx = ((self.config.lower_percentile / 100.0) * (n - 1) as Float) as usize;
200        let upper_idx = ((self.config.upper_percentile / 100.0) * (n - 1) as Float) as usize;
201
202        (sorted[lower_idx], sorted[upper_idx])
203    }
204
205    /// Compute multivariate statistics for Mahalanobis distance
206    fn compute_multivariate_statistics(
207        &self,
208        x: &Array2<Float>,
209    ) -> Result<MultivariateOutlierParams> {
210        let n_features = x.ncols();
211        let n_samples = x.nrows();
212
213        // Compute mean vector
214        let mean = x.mean_axis(Axis(0)).unwrap().to_vec();
215
216        // Compute covariance matrix
217        let mut covariance = vec![vec![0.0; n_features]; n_features];
218        for i in 0..n_features {
219            for j in i..n_features {
220                let mut cov_sum = 0.0;
221                for k in 0..n_samples {
222                    cov_sum += (x[[k, i]] - mean[i]) * (x[[k, j]] - mean[j]);
223                }
224                let cov_val = cov_sum / (n_samples - 1) as Float;
225                covariance[i][j] = cov_val;
226                covariance[j][i] = cov_val; // Symmetric
227            }
228        }
229
230        // Compute inverse covariance matrix (simplified)
231        let inv_covariance = self.matrix_inverse(&covariance)?;
232
233        // Compute threshold using chi-squared distribution
234        let threshold = self.chi_squared_threshold(n_features, self.config.confidence_level);
235
236        Ok(MultivariateOutlierParams {
237            mean,
238            covariance,
239            inv_covariance,
240            threshold,
241        })
242    }
243
244    /// Simple matrix inversion (for demo - would use proper LAPACK in practice)
245    fn matrix_inverse(&self, matrix: &[Vec<Float>]) -> Result<Vec<Vec<Float>>> {
246        let n = matrix.len();
247        if n == 1 {
248            if matrix[0][0].abs() < 1e-12 {
249                return Err(SklearsError::NumericalError("Singular matrix".to_string()));
250            }
251            return Ok(vec![vec![1.0 / matrix[0][0]]]);
252        }
253
254        // For simplicity, return identity matrix for now
255        // In practice, would use proper matrix inversion
256        let mut inv = vec![vec![0.0; n]; n];
257        for i in 0..n {
258            inv[i][i] = 1.0;
259        }
260        Ok(inv)
261    }
262
263    /// Chi-squared threshold approximation
264    fn chi_squared_threshold(&self, degrees_of_freedom: usize, confidence_level: Float) -> Float {
265        // Simple approximation - in practice would use proper chi-squared quantile
266        let df = degrees_of_freedom as Float;
267        match confidence_level {
268            x if x >= 0.99 => df + 3.0 * df.sqrt(),
269            x if x >= 0.95 => df + 2.0 * df.sqrt(),
270            x if x >= 0.90 => df + 1.5 * df.sqrt(),
271            _ => df + df.sqrt(),
272        }
273    }
274}
275
276impl OutlierDetector<Trained> {
277    /// Get fitted statistics
278    pub fn statistics(&self) -> &OutlierStatistics {
279        self.statistics_
280            .as_ref()
281            .expect("OutlierDetector should be fitted")
282    }
283
284    /// Get number of input features
285    pub fn n_features_in(&self) -> usize {
286        self.n_features_in_.unwrap_or(0)
287    }
288
289    /// Check if a single feature value is an outlier
290    pub fn is_outlier_single(&self, feature_idx: usize, value: Float) -> bool {
291        if let Some(ref params) = self.feature_params_ {
292            if feature_idx < params.len() {
293                return self.is_outlier_for_params(&params[feature_idx], value);
294            }
295        }
296        false
297    }
298
299    /// Check outlier based on parameters
300    fn is_outlier_for_params(&self, params: &FeatureOutlierParams, value: Float) -> bool {
301        match self.config.method {
302            OutlierDetectionMethod::ZScore => {
303                if let (Some(mean), Some(std)) = (params.mean, params.std) {
304                    let z_score = (value - mean) / std;
305                    z_score.abs() > self.config.threshold
306                } else {
307                    false
308                }
309            }
310            OutlierDetectionMethod::ModifiedZScore => {
311                if let (Some(median), Some(mad)) = (params.median, params.mad) {
312                    let modified_z = 0.6745 * (value - median) / mad;
313                    modified_z.abs() > self.config.threshold
314                } else {
315                    false
316                }
317            }
318            OutlierDetectionMethod::IQR => {
319                if let (Some(lower), Some(upper)) = (params.lower_bound, params.upper_bound) {
320                    value < lower || value > upper
321                } else {
322                    false
323                }
324            }
325            OutlierDetectionMethod::Percentile => {
326                if let (Some(lower), Some(upper)) =
327                    (params.lower_percentile_value, params.upper_percentile_value)
328                {
329                    value < lower || value > upper
330                } else {
331                    false
332                }
333            }
334            _ => false,
335        }
336    }
337
338    /// Generate detailed outlier detection results
339    pub fn detect_outliers(&self, x: &Array2<Float>) -> Result<OutlierDetectionResult> {
340        let outlier_mask = self.transform(x)?;
341        let scores = self.outlier_scores(x)?;
342
343        let outliers: Vec<bool> = outlier_mask.iter().map(|&x| x).collect();
344        let n_outliers = outliers.iter().filter(|&&x| x).count();
345        let n_samples = outliers.len();
346
347        Ok(OutlierDetectionResult {
348            outliers: outliers.clone(),
349            scores,
350            summary: OutlierSummary {
351                n_samples,
352                n_outliers,
353                outlier_fraction: n_outliers as Float / n_samples as Float,
354                method: self.config.method,
355            },
356        })
357    }
358
359    /// Compute outlier scores
360    pub fn outlier_scores(&self, x: &Array2<Float>) -> Result<Vec<Float>> {
361        let n_samples = x.nrows();
362        let mut scores = vec![0.0; n_samples];
363
364        match self.config.method {
365            OutlierDetectionMethod::ZScore => {
366                if let Some(ref params) = self.feature_params_ {
367                    for i in 0..n_samples {
368                        let mut max_score: Float = 0.0;
369                        for (j, param) in params.iter().enumerate() {
370                            if let (Some(mean), Some(std)) = (param.mean, param.std) {
371                                let z_score = ((x[[i, j]] - mean) / std).abs();
372                                max_score = max_score.max(z_score);
373                            }
374                        }
375                        scores[i] = max_score;
376                    }
377                }
378            }
379            OutlierDetectionMethod::MahalanobisDistance => {
380                if let Some(ref params) = self.multivariate_params_ {
381                    for i in 0..n_samples {
382                        let sample: Vec<Float> = (0..x.ncols()).map(|j| x[[i, j]]).collect();
383                        let mut centered = vec![0.0; sample.len()];
384
385                        // Center the data
386                        for (j, &val) in sample.iter().enumerate() {
387                            centered[j] = val - params.mean[j];
388                        }
389
390                        // Compute Mahalanobis distance
391                        let temp = simd_matvec_multiply(&params.inv_covariance, &centered);
392                        scores[i] = simd_dot_product(&centered, &temp).sqrt();
393                    }
394                }
395            }
396            _ => {
397                // Default scoring
398                for i in 0..n_samples {
399                    scores[i] = 0.0;
400                }
401            }
402        }
403
404        Ok(scores)
405    }
406}
407
408impl Default for OutlierDetector<Untrained> {
409    fn default() -> Self {
410        Self::new()
411    }
412}
413
414impl Fit<Array2<Float>, ()> for OutlierDetector<Untrained> {
415    type Fitted = OutlierDetector<Trained>;
416
417    fn fit(self, x: &Array2<Float>, _y: &()) -> Result<Self::Fitted> {
418        let (n_samples, n_features) = x.dim();
419
420        if n_samples == 0 {
421            return Err(SklearsError::InvalidInput(
422                "Cannot fit on empty dataset".to_string(),
423            ));
424        }
425
426        let mut feature_params = None;
427        let mut multivariate_params = None;
428
429        match self.config.method {
430            OutlierDetectionMethod::MahalanobisDistance => {
431                multivariate_params = Some(self.compute_multivariate_statistics(x)?);
432            }
433            _ => {
434                // Compute per-feature parameters
435                let mut params = Vec::new();
436                for j in 0..n_features {
437                    let feature_column = x.column(j).to_owned();
438                    params.push(self.compute_feature_statistics(&feature_column)?);
439                }
440                feature_params = Some(params);
441            }
442        }
443
444        let statistics = OutlierStatistics {
445            n_outliers: 0, // Will be computed during transform
446            outlier_fraction: 0.0,
447            feature_outlier_counts: vec![0; n_features],
448        };
449
450        Ok(OutlierDetector {
451            config: self.config,
452            state: PhantomData,
453            n_features_in_: Some(n_features),
454            statistics_: Some(statistics),
455            feature_params_: feature_params,
456            multivariate_params_: multivariate_params,
457        })
458    }
459}
460
461impl Transform<Array2<Float>, Array2<bool>> for OutlierDetector<Trained> {
462    fn transform(&self, x: &Array2<Float>) -> Result<Array2<bool>> {
463        let (n_samples, n_features) = x.dim();
464
465        if n_features != self.n_features_in() {
466            return Err(SklearsError::FeatureMismatch {
467                expected: self.n_features_in(),
468                actual: n_features,
469            });
470        }
471
472        let mut outlier_mask = Array2::<bool>::default((n_samples, 1));
473
474        match self.config.method {
475            OutlierDetectionMethod::MahalanobisDistance => {
476                if let Some(ref params) = self.multivariate_params_ {
477                    for i in 0..n_samples {
478                        let sample: Vec<Float> = (0..n_features).map(|j| x[[i, j]]).collect();
479                        let mut centered = vec![0.0; sample.len()];
480
481                        // Center the data
482                        for (j, &val) in sample.iter().enumerate() {
483                            centered[j] = val - params.mean[j];
484                        }
485
486                        // Compute Mahalanobis distance
487                        let temp = simd_matvec_multiply(&params.inv_covariance, &centered);
488                        let distance_squared = simd_dot_product(&centered, &temp);
489
490                        outlier_mask[[i, 0]] = distance_squared > params.threshold;
491                    }
492                }
493            }
494            _ => {
495                // Feature-wise outlier detection
496                if let Some(ref params) = self.feature_params_ {
497                    for i in 0..n_samples {
498                        let mut is_outlier = false;
499                        for (j, param) in params.iter().enumerate() {
500                            if self.is_outlier_for_params(param, x[[i, j]]) {
501                                is_outlier = true;
502                                break; // Any feature being an outlier makes the sample an outlier
503                            }
504                        }
505                        outlier_mask[[i, 0]] = is_outlier;
506                    }
507                }
508            }
509        }
510
511        Ok(outlier_mask)
512    }
513}