Skip to main content

sklears_preprocessing/outlier_detection/
detector.rs

1//! Main outlier detection implementation
2//!
3//! This module provides the core OutlierDetector implementation with support
4//! for multiple detection methods.
5
6use scirs2_core::ndarray::{Array1, Array2, Axis};
7use sklears_core::{
8    error::{Result, SklearsError},
9    traits::{Fit, Trained, Transform, Untrained},
10    types::Float,
11};
12use std::marker::PhantomData;
13
14use super::core::*;
15use super::simd_operations::*;
16
17/// Univariate outlier detector for identifying anomalous data points
18///
19/// This detector provides several methods for identifying outliers in univariate data:
20/// - Z-score: Points with |z-score| > threshold are considered outliers
21/// - Modified Z-score: Uses median and MAD instead of mean and std for robustness
22/// - IQR: Points outside Q1 - k*IQR or Q3 + k*IQR are considered outliers
23/// - Percentile: Points outside specified percentile bounds are considered outliers
24#[derive(Debug, Clone)]
25pub struct OutlierDetector<State = Untrained> {
26    config: OutlierDetectorConfig,
27    state: PhantomData<State>,
28    // Fitted parameters
29    n_features_in_: Option<usize>,
30    statistics_: Option<OutlierStatistics>,
31    feature_params_: Option<Vec<FeatureOutlierParams>>,
32    multivariate_params_: Option<MultivariateOutlierParams>,
33}
34
35impl OutlierDetector<Untrained> {
36    /// Create a new OutlierDetector
37    pub fn new() -> Self {
38        Self {
39            config: OutlierDetectorConfig::default(),
40            state: PhantomData,
41            n_features_in_: None,
42            statistics_: None,
43            feature_params_: None,
44            multivariate_params_: None,
45        }
46    }
47
48    /// Create Z-score based outlier detector
49    pub fn z_score(threshold: Float) -> Self {
50        Self {
51            config: OutlierDetectorConfig {
52                method: OutlierDetectionMethod::ZScore,
53                threshold,
54                ..Default::default()
55            },
56            state: PhantomData,
57            n_features_in_: None,
58            statistics_: None,
59            feature_params_: None,
60            multivariate_params_: None,
61        }
62    }
63
64    /// Create IQR-based outlier detector
65    pub fn iqr(multiplier: Float) -> Self {
66        Self {
67            config: OutlierDetectorConfig {
68                method: OutlierDetectionMethod::IQR,
69                iqr_multiplier: multiplier,
70                ..Default::default()
71            },
72            state: PhantomData,
73            n_features_in_: None,
74            statistics_: None,
75            feature_params_: None,
76            multivariate_params_: None,
77        }
78    }
79
80    /// Create Mahalanobis distance based outlier detector
81    pub fn mahalanobis_distance(confidence_level: Float) -> Self {
82        Self {
83            config: OutlierDetectorConfig {
84                method: OutlierDetectionMethod::MahalanobisDistance,
85                confidence_level,
86                ..Default::default()
87            },
88            state: PhantomData,
89            n_features_in_: None,
90            statistics_: None,
91            feature_params_: None,
92            multivariate_params_: None,
93        }
94    }
95
96    /// Set the detection method
97    pub fn method(mut self, method: OutlierDetectionMethod) -> Self {
98        self.config.method = method;
99        self
100    }
101
102    /// Set the threshold
103    pub fn threshold(mut self, threshold: Float) -> Self {
104        self.config.threshold = threshold;
105        self
106    }
107
108    /// Compute statistics for a single feature
109    fn compute_feature_statistics(
110        &self,
111        feature_data: &Array1<Float>,
112    ) -> Result<FeatureOutlierParams> {
113        let mut params = FeatureOutlierParams::default();
114
115        match self.config.method {
116            OutlierDetectionMethod::ZScore => {
117                let mean = feature_data.mean().unwrap_or(0.0);
118                let std = feature_data.std(0.0);
119                params.mean = Some(mean);
120                params.std = Some(std);
121            }
122            OutlierDetectionMethod::ModifiedZScore => {
123                let median = self.compute_median(feature_data);
124                let mad = self.compute_mad(feature_data, median);
125                params.median = Some(median);
126                params.mad = Some(mad);
127            }
128            OutlierDetectionMethod::IQR => {
129                let (q1, q3, iqr) = self.compute_quartiles(feature_data);
130                params.q1 = Some(q1);
131                params.q3 = Some(q3);
132                params.iqr = Some(iqr);
133                params.lower_bound = Some(q1 - self.config.iqr_multiplier * iqr);
134                params.upper_bound = Some(q3 + self.config.iqr_multiplier * iqr);
135            }
136            OutlierDetectionMethod::Percentile => {
137                let (lower, upper) = self.compute_percentile_bounds(feature_data);
138                params.lower_percentile_value = Some(lower);
139                params.upper_percentile_value = Some(upper);
140            }
141            _ => {
142                return Err(SklearsError::InvalidParameter {
143                    name: "method".to_string(),
144                    reason: "Unsupported method for univariate detection".to_string(),
145                });
146            }
147        }
148
149        Ok(params)
150    }
151
152    /// Compute median of data
153    fn compute_median(&self, data: &Array1<Float>) -> Float {
154        let mut sorted: Vec<Float> = data.to_vec();
155        sorted.sort_by(|a, b| a.partial_cmp(b).expect("operation should succeed"));
156        let n = sorted.len();
157        if n % 2 == 0 {
158            (sorted[n / 2 - 1] + sorted[n / 2]) / 2.0
159        } else {
160            sorted[n / 2]
161        }
162    }
163
164    /// Compute Median Absolute Deviation
165    fn compute_mad(&self, data: &Array1<Float>, median: Float) -> Float {
166        let deviations: Vec<Float> = data.iter().map(|&x| (x - median).abs()).collect();
167        let mut sorted_deviations = deviations;
168        sorted_deviations.sort_by(|a, b| a.partial_cmp(b).expect("operation should succeed"));
169        let n = sorted_deviations.len();
170        if n % 2 == 0 {
171            (sorted_deviations[n / 2 - 1] + sorted_deviations[n / 2]) / 2.0
172        } else {
173            sorted_deviations[n / 2]
174        }
175    }
176
177    /// Compute quartiles and IQR
178    fn compute_quartiles(&self, data: &Array1<Float>) -> (Float, Float, Float) {
179        let mut sorted: Vec<Float> = data.to_vec();
180        sorted.sort_by(|a, b| a.partial_cmp(b).expect("operation should succeed"));
181        let n = sorted.len();
182
183        let q1_idx = (0.25 * (n - 1) as Float) as usize;
184        let q3_idx = (0.75 * (n - 1) as Float) as usize;
185
186        let q1 = sorted[q1_idx];
187        let q3 = sorted[q3_idx];
188        let iqr = q3 - q1;
189
190        (q1, q3, iqr)
191    }
192
193    /// Compute percentile bounds
194    fn compute_percentile_bounds(&self, data: &Array1<Float>) -> (Float, Float) {
195        let mut sorted: Vec<Float> = data.to_vec();
196        sorted.sort_by(|a, b| a.partial_cmp(b).expect("operation should succeed"));
197        let n = sorted.len();
198
199        let lower_idx = ((self.config.lower_percentile / 100.0) * (n - 1) as Float) as usize;
200        let upper_idx = ((self.config.upper_percentile / 100.0) * (n - 1) as Float) as usize;
201
202        (sorted[lower_idx], sorted[upper_idx])
203    }
204
205    /// Compute multivariate statistics for Mahalanobis distance
206    fn compute_multivariate_statistics(
207        &self,
208        x: &Array2<Float>,
209    ) -> Result<MultivariateOutlierParams> {
210        let n_features = x.ncols();
211        let n_samples = x.nrows();
212
213        // Compute mean vector
214        let mean = x
215            .mean_axis(Axis(0))
216            .expect("array should have elements for mean computation")
217            .to_vec();
218
219        // Compute covariance matrix
220        let mut covariance = vec![vec![0.0; n_features]; n_features];
221        for i in 0..n_features {
222            for j in i..n_features {
223                let mut cov_sum = 0.0;
224                for k in 0..n_samples {
225                    cov_sum += (x[[k, i]] - mean[i]) * (x[[k, j]] - mean[j]);
226                }
227                let cov_val = cov_sum / (n_samples - 1) as Float;
228                covariance[i][j] = cov_val;
229                covariance[j][i] = cov_val; // Symmetric
230            }
231        }
232
233        // Compute inverse covariance matrix (simplified)
234        let inv_covariance = self.matrix_inverse(&covariance)?;
235
236        // Compute threshold using chi-squared distribution
237        let threshold = self.chi_squared_threshold(n_features, self.config.confidence_level);
238
239        Ok(MultivariateOutlierParams {
240            mean,
241            covariance,
242            inv_covariance,
243            threshold,
244        })
245    }
246
247    /// Simple matrix inversion (for demo - would use proper LAPACK in practice)
248    fn matrix_inverse(&self, matrix: &[Vec<Float>]) -> Result<Vec<Vec<Float>>> {
249        let n = matrix.len();
250        if n == 1 {
251            if matrix[0][0].abs() < 1e-12 {
252                return Err(SklearsError::NumericalError("Singular matrix".to_string()));
253            }
254            return Ok(vec![vec![1.0 / matrix[0][0]]]);
255        }
256
257        // For simplicity, return identity matrix for now
258        // In practice, would use proper matrix inversion
259        let mut inv = vec![vec![0.0; n]; n];
260        for i in 0..n {
261            inv[i][i] = 1.0;
262        }
263        Ok(inv)
264    }
265
266    /// Chi-squared threshold approximation
267    fn chi_squared_threshold(&self, degrees_of_freedom: usize, confidence_level: Float) -> Float {
268        // Simple approximation - in practice would use proper chi-squared quantile
269        let df = degrees_of_freedom as Float;
270        match confidence_level {
271            x if x >= 0.99 => df + 3.0 * df.sqrt(),
272            x if x >= 0.95 => df + 2.0 * df.sqrt(),
273            x if x >= 0.90 => df + 1.5 * df.sqrt(),
274            _ => df + df.sqrt(),
275        }
276    }
277}
278
279impl OutlierDetector<Trained> {
280    /// Get fitted statistics
281    pub fn statistics(&self) -> &OutlierStatistics {
282        self.statistics_
283            .as_ref()
284            .expect("OutlierDetector should be fitted")
285    }
286
287    /// Get number of input features
288    pub fn n_features_in(&self) -> usize {
289        self.n_features_in_.unwrap_or(0)
290    }
291
292    /// Check if a single feature value is an outlier
293    pub fn is_outlier_single(&self, feature_idx: usize, value: Float) -> bool {
294        if let Some(ref params) = self.feature_params_ {
295            if feature_idx < params.len() {
296                return self.is_outlier_for_params(&params[feature_idx], value);
297            }
298        }
299        false
300    }
301
302    /// Check outlier based on parameters
303    fn is_outlier_for_params(&self, params: &FeatureOutlierParams, value: Float) -> bool {
304        match self.config.method {
305            OutlierDetectionMethod::ZScore => {
306                if let (Some(mean), Some(std)) = (params.mean, params.std) {
307                    let z_score = (value - mean) / std;
308                    z_score.abs() > self.config.threshold
309                } else {
310                    false
311                }
312            }
313            OutlierDetectionMethod::ModifiedZScore => {
314                if let (Some(median), Some(mad)) = (params.median, params.mad) {
315                    let modified_z = 0.6745 * (value - median) / mad;
316                    modified_z.abs() > self.config.threshold
317                } else {
318                    false
319                }
320            }
321            OutlierDetectionMethod::IQR => {
322                if let (Some(lower), Some(upper)) = (params.lower_bound, params.upper_bound) {
323                    value < lower || value > upper
324                } else {
325                    false
326                }
327            }
328            OutlierDetectionMethod::Percentile => {
329                if let (Some(lower), Some(upper)) =
330                    (params.lower_percentile_value, params.upper_percentile_value)
331                {
332                    value < lower || value > upper
333                } else {
334                    false
335                }
336            }
337            _ => false,
338        }
339    }
340
341    /// Generate detailed outlier detection results
342    pub fn detect_outliers(&self, x: &Array2<Float>) -> Result<OutlierDetectionResult> {
343        let outlier_mask = self.transform(x)?;
344        let scores = self.outlier_scores(x)?;
345
346        let outliers: Vec<bool> = outlier_mask.iter().map(|&x| x).collect();
347        let n_outliers = outliers.iter().filter(|&&x| x).count();
348        let n_samples = outliers.len();
349
350        Ok(OutlierDetectionResult {
351            outliers: outliers.clone(),
352            scores,
353            summary: OutlierSummary {
354                n_samples,
355                n_outliers,
356                outlier_fraction: n_outliers as Float / n_samples as Float,
357                method: self.config.method,
358            },
359        })
360    }
361
362    /// Compute outlier scores
363    pub fn outlier_scores(&self, x: &Array2<Float>) -> Result<Vec<Float>> {
364        let n_samples = x.nrows();
365        let mut scores = vec![0.0; n_samples];
366
367        match self.config.method {
368            OutlierDetectionMethod::ZScore => {
369                if let Some(ref params) = self.feature_params_ {
370                    for i in 0..n_samples {
371                        let mut max_score: Float = 0.0;
372                        for (j, param) in params.iter().enumerate() {
373                            if let (Some(mean), Some(std)) = (param.mean, param.std) {
374                                let z_score = ((x[[i, j]] - mean) / std).abs();
375                                max_score = max_score.max(z_score);
376                            }
377                        }
378                        scores[i] = max_score;
379                    }
380                }
381            }
382            OutlierDetectionMethod::MahalanobisDistance => {
383                if let Some(ref params) = self.multivariate_params_ {
384                    for i in 0..n_samples {
385                        let sample: Vec<Float> = (0..x.ncols()).map(|j| x[[i, j]]).collect();
386                        let mut centered = vec![0.0; sample.len()];
387
388                        // Center the data
389                        for (j, &val) in sample.iter().enumerate() {
390                            centered[j] = val - params.mean[j];
391                        }
392
393                        // Compute Mahalanobis distance
394                        let temp = simd_matvec_multiply(&params.inv_covariance, &centered);
395                        scores[i] = simd_dot_product(&centered, &temp).sqrt();
396                    }
397                }
398            }
399            _ => {
400                // Default scoring
401                for i in 0..n_samples {
402                    scores[i] = 0.0;
403                }
404            }
405        }
406
407        Ok(scores)
408    }
409}
410
411impl Default for OutlierDetector<Untrained> {
412    fn default() -> Self {
413        Self::new()
414    }
415}
416
417impl Fit<Array2<Float>, ()> for OutlierDetector<Untrained> {
418    type Fitted = OutlierDetector<Trained>;
419
420    fn fit(self, x: &Array2<Float>, _y: &()) -> Result<Self::Fitted> {
421        let (n_samples, n_features) = x.dim();
422
423        if n_samples == 0 {
424            return Err(SklearsError::InvalidInput(
425                "Cannot fit on empty dataset".to_string(),
426            ));
427        }
428
429        let mut feature_params = None;
430        let mut multivariate_params = None;
431
432        match self.config.method {
433            OutlierDetectionMethod::MahalanobisDistance => {
434                multivariate_params = Some(self.compute_multivariate_statistics(x)?);
435            }
436            _ => {
437                // Compute per-feature parameters
438                let mut params = Vec::new();
439                for j in 0..n_features {
440                    let feature_column = x.column(j).to_owned();
441                    params.push(self.compute_feature_statistics(&feature_column)?);
442                }
443                feature_params = Some(params);
444            }
445        }
446
447        let statistics = OutlierStatistics {
448            n_outliers: 0, // Will be computed during transform
449            outlier_fraction: 0.0,
450            feature_outlier_counts: vec![0; n_features],
451        };
452
453        Ok(OutlierDetector {
454            config: self.config,
455            state: PhantomData,
456            n_features_in_: Some(n_features),
457            statistics_: Some(statistics),
458            feature_params_: feature_params,
459            multivariate_params_: multivariate_params,
460        })
461    }
462}
463
464impl Transform<Array2<Float>, Array2<bool>> for OutlierDetector<Trained> {
465    fn transform(&self, x: &Array2<Float>) -> Result<Array2<bool>> {
466        let (n_samples, n_features) = x.dim();
467
468        if n_features != self.n_features_in() {
469            return Err(SklearsError::FeatureMismatch {
470                expected: self.n_features_in(),
471                actual: n_features,
472            });
473        }
474
475        let mut outlier_mask = Array2::<bool>::default((n_samples, 1));
476
477        match self.config.method {
478            OutlierDetectionMethod::MahalanobisDistance => {
479                if let Some(ref params) = self.multivariate_params_ {
480                    for i in 0..n_samples {
481                        let sample: Vec<Float> = (0..n_features).map(|j| x[[i, j]]).collect();
482                        let mut centered = vec![0.0; sample.len()];
483
484                        // Center the data
485                        for (j, &val) in sample.iter().enumerate() {
486                            centered[j] = val - params.mean[j];
487                        }
488
489                        // Compute Mahalanobis distance
490                        let temp = simd_matvec_multiply(&params.inv_covariance, &centered);
491                        let distance_squared = simd_dot_product(&centered, &temp);
492
493                        outlier_mask[[i, 0]] = distance_squared > params.threshold;
494                    }
495                }
496            }
497            _ => {
498                // Feature-wise outlier detection
499                if let Some(ref params) = self.feature_params_ {
500                    for i in 0..n_samples {
501                        let mut is_outlier = false;
502                        for (j, param) in params.iter().enumerate() {
503                            if self.is_outlier_for_params(param, x[[i, j]]) {
504                                is_outlier = true;
505                                break; // Any feature being an outlier makes the sample an outlier
506                            }
507                        }
508                        outlier_mask[[i, 0]] = is_outlier;
509                    }
510                }
511            }
512        }
513
514        Ok(outlier_mask)
515    }
516}