// sklears_decomposition/robust_methods.rs

//! Robust Decomposition Methods
//!
//! This module provides robust decomposition techniques that are resistant to outliers, including:
//! - Robust PCA with L1 loss
//! - M-estimator based decomposition
//! - Outlier-resistant methods
//! - Breakdown point analysis
//! - Influence function diagnostics
10use scirs2_core::ndarray::{Array1, Array2, Axis};
11use scirs2_core::random::{thread_rng, RandNormal, Rng};
12use sklears_core::{
13    error::{Result, SklearsError},
14    types::Float,
15};
16
/// Configuration for robust decomposition methods
///
/// Shared by [`RobustPCA`], [`MEstimatorDecomposition`], and
/// [`BreakdownPointAnalysis`].
#[derive(Debug, Clone)]
pub struct RobustConfig {
    /// Maximum number of iterations for the iterative estimators
    pub max_iterations: usize,
    /// Convergence tolerance on successive parameter change
    pub tolerance: Float,
    /// Outlier detection threshold
    // NOTE(review): not consumed by any routine visible in this module —
    // confirm intended use before relying on it.
    pub outlier_threshold: Float,
    /// Robustness parameter for M-estimators (residuals are divided by this
    /// before the loss-specific weight is evaluated)
    pub tuning_constant: Float,
    /// Loss function type used to convert residuals into weights
    pub loss_function: LossFunction,
}
31
/// Loss function types for robust estimation
///
/// Each variant determines how a (scaled) residual magnitude is mapped to a
/// weight in the `robust_weight` helpers of this module.
#[derive(Debug, Clone, Copy)]
pub enum LossFunction {
    /// Huber loss function (full weight near zero, 1/|r| in the tails)
    Huber,
    /// Tukey's biweight function (redescending; zero weight beyond the tuning constant)
    Tukey,
    /// L1 (absolute) loss
    L1,
    /// Cauchy loss function (heavy-tailed; weight never reaches zero)
    Cauchy,
    /// Welsch loss function (exponentially decaying weight)
    Welsch,
}
46
47impl Default for RobustConfig {
48    fn default() -> Self {
49        Self {
50            max_iterations: 100,
51            tolerance: 1e-6,
52            outlier_threshold: 2.5,
53            tuning_constant: 1.345, // For Huber loss with 95% efficiency
54            loss_function: LossFunction::Huber,
55        }
56    }
57}
58
/// Robust Principal Component Analysis using L1 loss
///
/// This implementation uses iterative algorithms to find principal components
/// that are robust to outliers in the data.
///
/// # Examples
///
/// ```rust,ignore
/// use sklears_decomposition::RobustPCA;
/// use scirs2_core::ndarray::Array2;
///
/// let data = Array2::from_shape_vec((4, 3), vec![
///     1.0, 2.0, 3.0,
///     4.0, 5.0, 6.0,
///     7.0, 8.0, 9.0,
///     100.0, 2.0, 3.0  // Outlier
/// ]).unwrap();
///
/// let mut rpca = RobustPCA::new().n_components(2);
/// let result = rpca.fit_transform(&data).unwrap();
/// ```
pub struct RobustPCA {
    /// Robust-estimation settings shared by the internal routines
    config: RobustConfig,
    /// Requested number of components; defaults to min(n_samples, n_features)
    n_components: Option<usize>,
    // Fitted parameters (all None until `fit` succeeds)
    /// Component rows, shape (n_components, n_features)
    components_: Option<Array2<Float>>,
    /// Explained-variance ratio per component
    explained_variance_: Option<Array1<Float>>,
    /// Robust mean used to center the data
    mean_: Option<Array1<Float>>,
    /// Per-(sample, component) robust weights from the final iteration
    outlier_weights_: Option<Array2<Float>>,
}
89
90impl RobustPCA {
91    /// Create a new Robust PCA instance
92    pub fn new() -> Self {
93        Self {
94            config: RobustConfig::default(),
95            n_components: None,
96            components_: None,
97            explained_variance_: None,
98            mean_: None,
99            outlier_weights_: None,
100        }
101    }
102
103    /// Set the number of components
104    pub fn n_components(mut self, n_components: usize) -> Self {
105        self.n_components = Some(n_components);
106        self
107    }
108
109    /// Set the loss function
110    pub fn loss_function(mut self, loss_function: LossFunction) -> Self {
111        self.config.loss_function = loss_function;
112        self
113    }
114
115    /// Set the tuning constant for M-estimators
116    pub fn tuning_constant(mut self, tuning_constant: Float) -> Self {
117        self.config.tuning_constant = tuning_constant;
118        self
119    }
120
121    /// Set the outlier threshold
122    pub fn outlier_threshold(mut self, threshold: Float) -> Self {
123        self.config.outlier_threshold = threshold;
124        self
125    }
126
127    /// Fit robust PCA and transform the data
128    pub fn fit_transform(&mut self, data: &Array2<Float>) -> Result<RobustPCAResult> {
129        self.fit(data)?;
130        self.transform(data)
131    }
132
    /// Fit the robust PCA model
    ///
    /// Pipeline: robustly center the data (M-estimator mean), initialize
    /// random orthonormal components, then alternate between re-estimating
    /// components from robustly weighted samples and re-orthogonalizing,
    /// until the component change falls below the tolerance.
    pub fn fit(&mut self, data: &Array2<Float>) -> Result<()> {
        let (n_samples, n_features) = data.dim();
        // Default: retain as many components as the data supports.
        let n_components = self.n_components.unwrap_or(n_features.min(n_samples));

        if n_components > n_features.min(n_samples) {
            return Err(SklearsError::InvalidInput(
                "Number of components cannot exceed min(n_samples, n_features)".to_string(),
            ));
        }

        // Center the data using robust mean estimation (outlier-resistant).
        let robust_mean = self.compute_robust_mean(data)?;
        let centered_data = data - &robust_mean.view().insert_axis(Axis(0));

        // Initialize components with standard-normal noise.
        let mut rng = thread_rng();
        let mut components = Array2::zeros((n_components, n_features));
        for elem in components.iter_mut() {
            *elem = rng.sample(RandNormal::new(0.0, 1.0).unwrap());
        }

        // Orthogonalize initial components
        self.orthogonalize_components(&mut components);

        // Iterative robust estimation (iteratively reweighted updates).
        let mut prev_components = components.clone();
        // Start with all samples fully trusted (weight 1 everywhere).
        let mut outlier_weights = Array2::ones((n_samples, n_components));

        for _iter in 0..self.config.max_iterations {
            // Update each component from the currently weighted samples.
            for comp_idx in 0..n_components {
                let weights = outlier_weights.column(comp_idx);
                let updated_component =
                    self.update_component(&centered_data, &weights, comp_idx)?;
                components.row_mut(comp_idx).assign(&updated_component);
            }

            // Orthogonalize components
            self.orthogonalize_components(&mut components);

            // Down-weight samples with large residuals for the next pass.
            outlier_weights = self.compute_outlier_weights(&centered_data, &components)?;

            // Check convergence via the Frobenius norm of the component change.
            let component_change = self.compute_component_change(&components, &prev_components);
            if component_change < self.config.tolerance {
                break;
            }

            prev_components = components.clone();
        }

        // Ratios of robust component variance to robust total variance.
        let explained_variance = self.compute_explained_variance(&centered_data, &components)?;

        self.components_ = Some(components);
        self.explained_variance_ = Some(explained_variance);
        self.mean_ = Some(robust_mean);
        self.outlier_weights_ = Some(outlier_weights);

        Ok(())
    }
196
197    /// Transform data using fitted robust PCA model
198    pub fn transform(&self, data: &Array2<Float>) -> Result<RobustPCAResult> {
199        let components = self
200            .components_
201            .as_ref()
202            .ok_or_else(|| SklearsError::InvalidInput("Model must be fitted first".to_string()))?;
203
204        let mean = self
205            .mean_
206            .as_ref()
207            .ok_or_else(|| SklearsError::InvalidInput("Model must be fitted first".to_string()))?;
208
209        let explained_variance = self
210            .explained_variance_
211            .as_ref()
212            .ok_or_else(|| SklearsError::InvalidInput("Model must be fitted first".to_string()))?;
213
214        let outlier_weights = self
215            .outlier_weights_
216            .as_ref()
217            .ok_or_else(|| SklearsError::InvalidInput("Model must be fitted first".to_string()))?;
218
219        // Center the data
220        let centered_data = data - &mean.view().insert_axis(Axis(0));
221
222        // Project data onto robust components
223        let transformed = centered_data.dot(&components.t());
224
225        Ok(RobustPCAResult {
226            transformed_data: transformed,
227            components: components.clone(),
228            explained_variance: explained_variance.clone(),
229            mean: mean.clone(),
230            outlier_weights: outlier_weights.clone(),
231        })
232    }
233
    /// Compute robust mean using iterative M-estimator
    ///
    /// Iteratively reweighted mean: each sample's weight comes from the norm
    /// of its residual to the current mean via the configured loss, starting
    /// from the ordinary column mean. Stops when the mean stops moving.
    fn compute_robust_mean(&self, data: &Array2<Float>) -> Result<Array1<Float>> {
        let (n_samples, n_features) = data.dim();
        // Initial guess: the non-robust mean over samples.
        let mut mean = data.mean_axis(Axis(0)).unwrap();

        for _ in 0..self.config.max_iterations {
            let mut new_mean = Array1::zeros(n_features);
            let mut weight_sum = 0.0;

            for i in 0..n_samples {
                let residual = &data.row(i) - &mean;
                let residual_norm = residual.dot(&residual).sqrt();
                // Larger residual => smaller weight under a robust loss.
                let weight = self.robust_weight(residual_norm);

                new_mean += &(weight * &data.row(i));
                weight_sum += weight;
            }

            // Guard: if every weight collapsed to zero, keep the raw sum.
            if weight_sum > 0.0 {
                new_mean /= weight_sum;
            }

            // Euclidean distance between successive mean estimates.
            let change = (&new_mean - &mean).dot(&(&new_mean - &mean)).sqrt();
            mean = new_mean;

            if change < self.config.tolerance {
                break;
            }
        }

        Ok(mean)
    }
266
    /// Update a single component using robust estimation
    ///
    /// NOTE(review): this computes a weighted *mean* of the samples and
    /// normalizes it — a simplification rather than a true weighted
    /// principal-direction update; `_comp_idx` is currently unused.
    fn update_component(
        &self,
        data: &Array2<Float>,
        weights: &scirs2_core::ndarray::ArrayView1<Float>,
        _comp_idx: usize,
    ) -> Result<Array1<Float>> {
        let (n_samples, n_features) = data.dim();
        let mut component = Array1::zeros(n_features);
        let mut weight_sum = 0.0;

        // Weighted average of the samples (weights from the robust loss).
        for i in 0..n_samples {
            let weight = weights[i];
            let sample = data.row(i);
            component += &(weight * &sample);
            weight_sum += weight;
        }

        if weight_sum > 0.0 {
            component /= weight_sum;
        }

        // Normalize to unit length; skip near-zero vectors to avoid blow-up.
        let norm = component.dot(&component).sqrt();
        if norm > 1e-12 {
            component /= norm;
        }

        Ok(component)
    }
298
    /// Orthogonalize components using Gram-Schmidt process
    ///
    /// Rows are processed in order: each row is normalized, then its
    /// projections onto all earlier rows are subtracted, renormalizing after
    /// each subtraction. Rows with norm <= 1e-12 are left unscaled.
    fn orthogonalize_components(&self, components: &mut Array2<Float>) {
        let n_components = components.nrows();

        for i in 0..n_components {
            // Copy earlier rows first: we cannot hold shared borrows of
            // `components` while mutably borrowing row `i`.
            let prev_components: Vec<Array1<Float>> =
                (0..i).map(|j| components.row(j).to_owned()).collect();

            // Normalize current component
            let mut current = components.row_mut(i);
            let norm = current.dot(&current).sqrt();
            if norm > 1e-12 {
                current /= norm;
            }

            // Subtract the projection onto each previous component.
            for prev in prev_components {
                let projection = current.dot(&prev);
                current -= &(projection * &prev);

                // Renormalize so the next projection uses a unit vector.
                let norm = current.dot(&current).sqrt();
                if norm > 1e-12 {
                    current /= norm;
                }
            }
        }
    }
328
    /// Compute outlier weights based on residuals
    ///
    /// For every (sample, component) pair: project the sample onto that
    /// single component, measure the norm of the rank-1 reconstruction
    /// residual, and convert it to a weight via the configured robust loss.
    fn compute_outlier_weights(
        &self,
        data: &Array2<Float>,
        components: &Array2<Float>,
    ) -> Result<Array2<Float>> {
        let (n_samples, _) = data.dim();
        let n_components = components.nrows();
        let mut weights = Array2::zeros((n_samples, n_components));

        for i in 0..n_samples {
            let sample = data.row(i);

            for j in 0..n_components {
                let component = components.row(j);
                // Rank-1 reconstruction from component j alone.
                let projection = sample.dot(&component);
                let reconstructed = projection * &component;
                let residual = &sample - &reconstructed;
                let residual_norm = residual.dot(&residual).sqrt();

                weights[[i, j]] = self.robust_weight(residual_norm);
            }
        }

        Ok(weights)
    }
355
356    /// Compute robust weight based on loss function
357    fn robust_weight(&self, residual: Float) -> Float {
358        let standardized_residual = residual / self.config.tuning_constant;
359
360        match self.config.loss_function {
361            LossFunction::Huber => {
362                if standardized_residual.abs() <= 1.0 {
363                    1.0
364                } else {
365                    1.0 / standardized_residual.abs()
366                }
367            }
368            LossFunction::Tukey => {
369                if standardized_residual.abs() <= 1.0 {
370                    let x2 = standardized_residual * standardized_residual;
371                    (1.0 - x2) * (1.0 - x2)
372                } else {
373                    0.0
374                }
375            }
376            LossFunction::L1 => {
377                if standardized_residual.abs() > 1e-12 {
378                    1.0 / standardized_residual.abs()
379                } else {
380                    1.0
381                }
382            }
383            LossFunction::Cauchy => 1.0 / (1.0 + standardized_residual * standardized_residual),
384            LossFunction::Welsch => {
385                let x2 = standardized_residual * standardized_residual;
386                (-x2).exp()
387            }
388        }
389    }
390
391    /// Compute explained variance for robust components
392    fn compute_explained_variance(
393        &self,
394        data: &Array2<Float>,
395        components: &Array2<Float>,
396    ) -> Result<Array1<Float>> {
397        let n_components = components.nrows();
398        let mut explained_variance = Array1::zeros(n_components);
399
400        // Total variance (robust estimate)
401        let total_variance = self.compute_robust_variance(data);
402
403        for i in 0..n_components {
404            let component = components.row(i);
405            let projections = data.dot(&component);
406            let component_variance = self.compute_robust_variance_1d(&projections);
407            explained_variance[i] = component_variance / total_variance;
408        }
409
410        Ok(explained_variance)
411    }
412
413    /// Compute robust variance estimate for matrix
414    fn compute_robust_variance(&self, data: &Array2<Float>) -> Float {
415        let (_n_samples, n_features) = data.dim();
416        let mut total_variance = 0.0;
417
418        for j in 0..n_features {
419            let column = data.column(j);
420            total_variance += self.compute_robust_variance_1d(&column.to_owned());
421        }
422
423        total_variance / n_features as Float
424    }
425
426    /// Compute robust variance estimate for 1D array
427    fn compute_robust_variance_1d(&self, data: &Array1<Float>) -> Float {
428        let n = data.len();
429        if n < 2 {
430            return 0.0;
431        }
432
433        // Use median absolute deviation (MAD) as robust variance estimate
434        let median = self.compute_median(data);
435        let mut abs_deviations: Vec<Float> = data.iter().map(|&x| (x - median).abs()).collect();
436        abs_deviations.sort_by(|a, b| a.partial_cmp(b).unwrap());
437
438        let mad = if abs_deviations.len() % 2 == 0 {
439            let mid = abs_deviations.len() / 2;
440            (abs_deviations[mid - 1] + abs_deviations[mid]) / 2.0
441        } else {
442            abs_deviations[abs_deviations.len() / 2]
443        };
444
445        // Scale MAD to approximate standard deviation
446        1.4826 * mad * 1.4826 * mad // MAD^2 scaled
447    }
448
449    /// Compute median of 1D array
450    fn compute_median(&self, data: &Array1<Float>) -> Float {
451        let mut sorted_data: Vec<Float> = data.iter().cloned().collect();
452        sorted_data.sort_by(|a, b| a.partial_cmp(b).unwrap());
453
454        let n = sorted_data.len();
455        if n % 2 == 0 {
456            (sorted_data[n / 2 - 1] + sorted_data[n / 2]) / 2.0
457        } else {
458            sorted_data[n / 2]
459        }
460    }
461
462    /// Compute change in components for convergence check
463    fn compute_component_change(&self, current: &Array2<Float>, previous: &Array2<Float>) -> Float {
464        let diff = current - previous;
465        (diff.iter().map(|&x| x * x).sum::<Float>()).sqrt()
466    }
467
    /// Get the fitted components
    ///
    /// Returns `None` until `fit` has completed successfully.
    pub fn components(&self) -> Option<&Array2<Float>> {
        self.components_.as_ref()
    }
472
    /// Get the explained variance
    ///
    /// Returns `None` until `fit` has completed successfully.
    pub fn explained_variance(&self) -> Option<&Array1<Float>> {
        self.explained_variance_.as_ref()
    }
477
    /// Get the robust mean
    ///
    /// Returns `None` until `fit` has completed successfully.
    pub fn mean(&self) -> Option<&Array1<Float>> {
        self.mean_.as_ref()
    }
482
    /// Get the outlier weights
    ///
    /// Returns `None` until `fit` has completed successfully.
    pub fn outlier_weights(&self) -> Option<&Array2<Float>> {
        self.outlier_weights_.as_ref()
    }
487}
488
489impl Default for RobustPCA {
490    fn default() -> Self {
491        Self::new()
492    }
493}
494
/// Result of robust PCA decomposition
///
/// Bundles the projected data with the fitted model parameters so that
/// reconstruction and outlier diagnostics can be run without the estimator.
#[derive(Debug, Clone)]
pub struct RobustPCAResult {
    /// Transformed data in the robust principal component space
    pub transformed_data: Array2<Float>,
    /// Robust principal components (one per row)
    pub components: Array2<Float>,
    /// Explained variance ratios
    pub explained_variance: Array1<Float>,
    /// Robust mean of the data
    pub mean: Array1<Float>,
    /// Outlier weights for each sample and component (low = outlying)
    pub outlier_weights: Array2<Float>,
}
509
510impl RobustPCAResult {
511    /// Reconstruct data from robust principal components
512    pub fn inverse_transform(&self, transformed_data: &Array2<Float>) -> Array2<Float> {
513        let reconstructed = transformed_data.dot(&self.components);
514        reconstructed + self.mean.view().insert_axis(Axis(0))
515    }
516
517    /// Identify outliers based on weights
518    pub fn identify_outliers(&self, threshold: Float) -> Array1<bool> {
519        let (n_samples, _) = self.outlier_weights.dim();
520        let mut is_outlier = Array1::from_elem(n_samples, false);
521
522        for i in 0..n_samples {
523            let min_weight = self
524                .outlier_weights
525                .row(i)
526                .iter()
527                .cloned()
528                .fold(Float::INFINITY, Float::min);
529            if min_weight < threshold {
530                is_outlier[i] = true;
531            }
532        }
533
534        is_outlier
535    }
536
537    /// Compute reconstruction error for each sample
538    pub fn reconstruction_error(&self, original_data: &Array2<Float>) -> Result<Array1<Float>> {
539        let reconstructed = self.inverse_transform(&self.transformed_data);
540        let (n_samples, _) = original_data.dim();
541        let mut errors = Array1::zeros(n_samples);
542
543        for i in 0..n_samples {
544            let original_row = original_data.row(i);
545            let reconstructed_row = reconstructed.row(i);
546            let diff = &original_row - &reconstructed_row;
547            errors[i] = diff.dot(&diff).sqrt();
548        }
549
550        Ok(errors)
551    }
552}
553
/// M-estimator based matrix decomposition
///
/// Factors a matrix `X ≈ U · Vᵀ` of a chosen rank, estimating the factors
/// by iteratively reweighted updates under a robust loss.
pub struct MEstimatorDecomposition {
    /// Shared robust-estimation settings (loss, tolerance, iterations)
    config: RobustConfig,
    /// Target rank; `decompose` defaults to max(min(m, n) / 2, 1) when unset
    rank: Option<usize>,
}
559
560impl MEstimatorDecomposition {
561    /// Create a new M-estimator decomposition
562    pub fn new() -> Self {
563        Self {
564            config: RobustConfig::default(),
565            rank: None,
566        }
567    }
568
569    /// Set the target rank for decomposition
570    pub fn rank(mut self, rank: usize) -> Self {
571        self.rank = Some(rank);
572        self
573    }
574
575    /// Set the loss function
576    pub fn loss_function(mut self, loss_function: LossFunction) -> Self {
577        self.config.loss_function = loss_function;
578        self
579    }
580
    /// Decompose matrix using M-estimator
    ///
    /// Alternates robust updates of the two factors (U, then V) until their
    /// combined change falls below the tolerance, then reports the
    /// reconstruction, residuals, and element-wise robust weights.
    pub fn decompose(&self, matrix: &Array2<Float>) -> Result<MEstimatorResult> {
        let (m, n) = matrix.dim();
        // Default rank: half the smaller dimension, at least 1.
        let rank = self.rank.unwrap_or((m.min(n) / 2).max(1));

        // Initialize factors
        let mut rng = thread_rng();
        let mut u = Array2::zeros((m, rank));
        let mut v = Array2::zeros((n, rank));

        // Fill with standard-normal random values.
        for elem in u.iter_mut() {
            *elem = rng.sample(RandNormal::new(0.0, 1.0).unwrap());
        }
        for elem in v.iter_mut() {
            *elem = rng.sample(RandNormal::new(0.0, 1.0).unwrap());
        }

        // Iterative M-estimator updates (alternating, IRLS-style).
        for _ in 0..self.config.max_iterations {
            let prev_u = u.clone();
            let prev_v = v.clone();

            // Update U
            u = self.update_factor_u(matrix, &u, &v)?;

            // Update V
            v = self.update_factor_v(matrix, &u, &v)?;

            // Converged when both factors stop moving (Frobenius norm).
            let u_change = (&u - &prev_u).iter().map(|&x| x * x).sum::<Float>().sqrt();
            let v_change = (&v - &prev_v).iter().map(|&x| x * x).sum::<Float>().sqrt();

            if u_change + v_change < self.config.tolerance {
                break;
            }
        }

        // Compute residuals and element-wise robust weights.
        let reconstruction = u.dot(&v.t());
        let residuals = matrix - &reconstruction;
        let weights = self.compute_residual_weights(&residuals);

        Ok(MEstimatorResult {
            u_factor: u,
            v_factor: v,
            reconstruction,
            residuals,
            weights,
        })
    }
632
    /// Update U factor using robust estimation
    ///
    /// One IRLS sweep: a closed-form weighted least-squares update per
    /// entry, with weights derived from the residuals under the *current*
    /// factors. Entries with a near-zero denominator are left at 0.
    fn update_factor_u(
        &self,
        matrix: &Array2<Float>,
        u: &Array2<Float>,
        v: &Array2<Float>,
    ) -> Result<Array2<Float>> {
        let (m, rank) = u.dim();
        let mut new_u = Array2::zeros((m, rank));

        for i in 0..m {
            for r in 0..rank {
                let mut numerator = 0.0;
                let mut denominator = 0.0;

                for j in 0..matrix.ncols() {
                    // Residual under the incoming factors (not the partially
                    // updated new_u), keeping the sweep well-defined.
                    let residual = matrix[[i, j]] - u.row(i).dot(&v.row(j));
                    let weight = self.robust_weight(residual.abs());

                    numerator += weight * matrix[[i, j]] * v[[j, r]];
                    denominator += weight * v[[j, r]] * v[[j, r]];
                }

                if denominator > 1e-12 {
                    new_u[[i, r]] = numerator / denominator;
                }
            }
        }

        Ok(new_u)
    }
664
    /// Update V factor using robust estimation
    ///
    /// Mirror image of `update_factor_u`: one weighted least-squares sweep
    /// over the columns of the matrix, with robust weights recomputed from
    /// the residuals under the incoming factors.
    fn update_factor_v(
        &self,
        matrix: &Array2<Float>,
        u: &Array2<Float>,
        v: &Array2<Float>,
    ) -> Result<Array2<Float>> {
        let (n, rank) = v.dim();
        let mut new_v = Array2::zeros((n, rank));

        for j in 0..n {
            for r in 0..rank {
                let mut numerator = 0.0;
                let mut denominator = 0.0;

                for i in 0..matrix.nrows() {
                    // Residual under the incoming factors.
                    let residual = matrix[[i, j]] - u.row(i).dot(&v.row(j));
                    let weight = self.robust_weight(residual.abs());

                    numerator += weight * matrix[[i, j]] * u[[i, r]];
                    denominator += weight * u[[i, r]] * u[[i, r]];
                }

                // Leave the entry at 0 when the weighted column is degenerate.
                if denominator > 1e-12 {
                    new_v[[j, r]] = numerator / denominator;
                }
            }
        }

        Ok(new_v)
    }
696
697    /// Compute weights for residuals
698    fn compute_residual_weights(&self, residuals: &Array2<Float>) -> Array2<Float> {
699        residuals.mapv(|r| self.robust_weight(r.abs()))
700    }
701
702    /// Compute robust weight (same as in RobustPCA)
703    fn robust_weight(&self, residual: Float) -> Float {
704        let standardized_residual = residual / self.config.tuning_constant;
705
706        match self.config.loss_function {
707            LossFunction::Huber => {
708                if standardized_residual.abs() <= 1.0 {
709                    1.0
710                } else {
711                    1.0 / standardized_residual.abs()
712                }
713            }
714            LossFunction::Tukey => {
715                if standardized_residual.abs() <= 1.0 {
716                    let x2 = standardized_residual * standardized_residual;
717                    (1.0 - x2) * (1.0 - x2)
718                } else {
719                    0.0
720                }
721            }
722            LossFunction::L1 => {
723                if standardized_residual.abs() > 1e-12 {
724                    1.0 / standardized_residual.abs()
725                } else {
726                    1.0
727                }
728            }
729            LossFunction::Cauchy => 1.0 / (1.0 + standardized_residual * standardized_residual),
730            LossFunction::Welsch => {
731                let x2 = standardized_residual * standardized_residual;
732                (-x2).exp()
733            }
734        }
735    }
736}
737
738impl Default for MEstimatorDecomposition {
739    fn default() -> Self {
740        Self::new()
741    }
742}
743
/// Result of M-estimator decomposition
///
/// Holds the factors of `X ≈ U · Vᵀ` together with the reconstruction,
/// the residuals, and the element-wise robust weights.
#[derive(Debug, Clone)]
pub struct MEstimatorResult {
    /// U factor matrix, shape (m, rank)
    pub u_factor: Array2<Float>,
    /// V factor matrix, shape (n, rank)
    pub v_factor: Array2<Float>,
    /// Reconstructed matrix `U · Vᵀ`
    pub reconstruction: Array2<Float>,
    /// Residuals matrix (`original - reconstruction`)
    pub residuals: Array2<Float>,
    /// Robust weights for each element (low = outlying entry)
    pub weights: Array2<Float>,
}
758
759impl MEstimatorResult {
760    /// Compute the Frobenius norm of residuals
761    pub fn residual_norm(&self) -> Float {
762        self.residuals.iter().map(|&x| x * x).sum::<Float>().sqrt()
763    }
764
765    /// Compute weighted residual norm
766    pub fn weighted_residual_norm(&self) -> Float {
767        let weighted_residuals = &self.residuals * &self.weights;
768        weighted_residuals
769            .iter()
770            .map(|&x| x * x)
771            .sum::<Float>()
772            .sqrt()
773    }
774
775    /// Identify outlier elements based on low weights
776    pub fn identify_outlier_elements(&self, threshold: Float) -> Array2<bool> {
777        self.weights.mapv(|w| w < threshold)
778    }
779}
780
/// Breakdown point analysis for robust methods
///
/// Empirically probes how much contamination a robust estimator tolerates
/// before its principal directions diverge from a baseline.
pub struct BreakdownPointAnalysis {
    // Retained for future configuration of the analysis; currently unused.
    #[allow(dead_code)]
    config: RobustConfig,
}
786
787impl BreakdownPointAnalysis {
788    /// Create a new breakdown point analysis
789    pub fn new() -> Self {
790        Self {
791            config: RobustConfig::default(),
792        }
793    }
794
    /// Compute empirical breakdown point for robust PCA
    ///
    /// For each contamination level: replace a corresponding fraction of
    /// samples with artificial outliers, fit both robust PCA and the
    /// (simplified) standard-PCA baseline, and record the average angular
    /// difference between their principal directions.
    pub fn empirical_breakdown_point(
        &self,
        data: &Array2<Float>,
        contamination_levels: &[Float],
    ) -> Result<BreakdownResult> {
        let (n_samples, _) = data.dim();
        let mut breakdown_errors = Vec::new();
        let mut breakdown_levels = Vec::new();

        for &contamination_level in contamination_levels {
            let n_outliers = (contamination_level * n_samples as Float) as usize;

            // Skip levels that would contaminate the entire sample.
            if n_outliers >= n_samples {
                continue;
            }

            // Add outliers to data
            let contaminated_data = self.add_outliers(data, n_outliers)?;

            // Fit robust PCA (fixed at 2 components for the comparison).
            let mut robust_pca = RobustPCA::new().n_components(2);
            let robust_result = robust_pca.fit_transform(&contaminated_data)?;

            // Fit standard PCA for comparison
            let standard_result = self.standard_pca(&contaminated_data, 2)?;

            // Average angle between corresponding principal directions.
            let direction_error =
                self.compute_direction_error(&robust_result.components, &standard_result);

            breakdown_errors.push(direction_error);
            breakdown_levels.push(contamination_level);
        }

        Ok(BreakdownResult {
            contamination_levels: Array1::from_vec(breakdown_levels),
            direction_errors: Array1::from_vec(breakdown_errors),
        })
    }
835
    /// Add outliers to data for breakdown analysis
    ///
    /// NOTE(review): overwrites the *first* `n_outliers` rows (not a random
    /// subset) with points placed up to 10 data-scales from the center in
    /// each coordinate — confirm this bias is acceptable for the analysis.
    fn add_outliers(&self, data: &Array2<Float>, n_outliers: usize) -> Result<Array2<Float>> {
        let (n_samples, n_features) = data.dim();
        let mut contaminated_data = data.clone();

        // Generate outliers at large distances from the data center.
        let data_center = data.mean_axis(Axis(0)).unwrap();
        let data_scale = self.estimate_scale(data);
        let mut rng = thread_rng();

        for i in 0..n_outliers.min(n_samples) {
            for j in 0..n_features {
                // Uniform offset in [-10, 10] times the data scale.
                contaminated_data[[i, j]] =
                    data_center[j] + 10.0 * data_scale * (2.0 * (rng.gen::<Float>()) - 1.0);
            }
        }

        Ok(contaminated_data)
    }
856
857    /// Estimate data scale using MAD
858    fn estimate_scale(&self, data: &Array2<Float>) -> Float {
859        let center = data.mean_axis(Axis(0)).unwrap();
860        let distances: Vec<Float> = data
861            .axis_iter(Axis(0))
862            .map(|row| {
863                let diff = &row - &center;
864                diff.dot(&diff).sqrt()
865            })
866            .collect();
867
868        // Compute median of distances
869        let mut sorted_distances = distances;
870        sorted_distances.sort_by(|a, b| a.partial_cmp(b).unwrap());
871
872        let n = sorted_distances.len();
873        if n % 2 == 0 {
874            (sorted_distances[n / 2 - 1] + sorted_distances[n / 2]) / 2.0
875        } else {
876            sorted_distances[n / 2]
877        }
878    }
879
880    /// Simplified standard PCA for comparison
881    fn standard_pca(&self, data: &Array2<Float>, n_components: usize) -> Result<Array2<Float>> {
882        let (n_samples, n_features) = data.dim();
883
884        // Center data
885        let mean = data.mean_axis(Axis(0)).unwrap();
886        let centered_data = data - &mean.view().insert_axis(Axis(0));
887
888        // Compute covariance matrix
889        let _cov_matrix = centered_data.t().dot(&centered_data) / (n_samples - 1) as Float;
890
891        // Simplified eigendecomposition (in practice would use proper SVD/eigen)
892        let mut components = Array2::eye(n_features);
893        components = components
894            .slice(scirs2_core::ndarray::s![0..n_components, ..])
895            .to_owned();
896
897        Ok(components)
898    }
899
900    /// Compute angular error between principal directions
901    fn compute_direction_error(
902        &self,
903        robust_components: &Array2<Float>,
904        standard_components: &Array2<Float>,
905    ) -> Float {
906        let n_components = robust_components.nrows().min(standard_components.nrows());
907        let mut total_error = 0.0;
908
909        for i in 0..n_components {
910            let robust_dir = robust_components.row(i);
911            let standard_dir = standard_components.row(i);
912
913            // Compute cosine of angle between directions
914            let dot_product = robust_dir.dot(&standard_dir);
915            let robust_norm = robust_dir.dot(&robust_dir).sqrt();
916            let standard_norm = standard_dir.dot(&standard_dir).sqrt();
917
918            if robust_norm > 1e-12 && standard_norm > 1e-12 {
919                let cos_angle = dot_product / (robust_norm * standard_norm);
920                let angle = cos_angle.abs().acos();
921                total_error += angle;
922            }
923        }
924
925        total_error / n_components as Float
926    }
927}
928
929impl Default for BreakdownPointAnalysis {
930    fn default() -> Self {
931        Self::new()
932    }
933}
934
/// Result of breakdown point analysis
///
/// Pairs each tested contamination level with the direction error measured at
/// that level. The two arrays are index-aligned (same length, same order).
#[derive(Debug, Clone)]
pub struct BreakdownResult {
    /// Contamination levels tested (presumably fractions of contaminated
    /// samples, e.g. 0.1 — confirm against the producing analysis)
    pub contamination_levels: Array1<Float>,
    /// Direction errors at each contamination level
    pub direction_errors: Array1<Float>,
}
943
944impl BreakdownResult {
945    /// Find the breakdown point (contamination level where error exceeds threshold)
946    pub fn breakdown_point(&self, error_threshold: Float) -> Option<Float> {
947        for (level, error) in self
948            .contamination_levels
949            .iter()
950            .zip(self.direction_errors.iter())
951        {
952            if *error > error_threshold {
953                return Some(*level);
954            }
955        }
956        None
957    }
958
959    /// Compute the area under the error curve
960    pub fn robustness_score(&self) -> Float {
961        let n = self.contamination_levels.len();
962        if n < 2 {
963            return 0.0;
964        }
965
966        let mut area = 0.0;
967        for i in 1..n {
968            let dx = self.contamination_levels[i] - self.contamination_levels[i - 1];
969            let avg_error = (self.direction_errors[i] + self.direction_errors[i - 1]) / 2.0;
970            area += dx * avg_error;
971        }
972
973        // Return inverse of area (higher is more robust)
974        1.0 / (area + 1e-12)
975    }
976}
977
// NOTE(review): the previous `#[allow(non_snake_case)]` on this module was
// unjustified and unnecessary — every identifier below is snake_case — so it
// has been removed.
#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::Array2;

    /// Output shapes match the requested number of components.
    #[test]
    fn test_robust_pca_basic() {
        let data = Array2::from_shape_vec(
            (4, 3),
            vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 2.0, 3.0, 4.0],
        )
        .unwrap();

        let mut rpca = RobustPCA::new().n_components(2);
        let result = rpca.fit_transform(&data).unwrap();

        assert_eq!(result.transformed_data.dim(), (4, 2));
        assert_eq!(result.components.dim(), (2, 3));
        assert_eq!(result.explained_variance.len(), 2);
        assert_eq!(result.outlier_weights.dim(), (4, 2));
    }

    /// A gross outlier should be down-weighted and flagged.
    #[test]
    fn test_robust_pca_with_outliers() {
        let data = Array2::from_shape_vec(
            (5, 3),
            vec![
                1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 2.0, 3.0, 4.0, 100.0, 100.0,
                100.0, // Clear outlier
            ],
        )
        .unwrap();

        let mut rpca = RobustPCA::new()
            .n_components(2)
            .loss_function(LossFunction::Tukey);

        let result = rpca.fit_transform(&data).unwrap();

        // Check that outlier has low weights
        let outlier_weights = result.outlier_weights.row(4);
        let min_weight = outlier_weights
            .iter()
            .cloned()
            .fold(Float::INFINITY, Float::min);
        assert!(min_weight < 0.5); // Outlier should have low weight

        // Check outlier identification
        let outliers = result.identify_outliers(0.5);
        assert!(outliers[4]); // Last sample should be identified as outlier
    }

    /// M-estimator decomposition returns factors/residuals of the right shape.
    #[test]
    fn test_m_estimator_decomposition() {
        let matrix =
            Array2::from_shape_vec((3, 3), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0])
                .unwrap();

        let m_est = MEstimatorDecomposition::new()
            .rank(2)
            .loss_function(LossFunction::Huber);

        let result = m_est.decompose(&matrix).unwrap();

        assert_eq!(result.u_factor.dim(), (3, 2));
        assert_eq!(result.v_factor.dim(), (3, 2));
        assert_eq!(result.reconstruction.dim(), (3, 3));
        assert_eq!(result.residuals.dim(), (3, 3));
        assert_eq!(result.weights.dim(), (3, 3));
    }

    /// Robust weights are always in (0, 1].
    #[test]
    fn test_loss_functions() {
        let _config = RobustConfig::default();
        let rpca = RobustPCA::new();

        let residuals = vec![0.5, 1.0, 1.5, 2.0, 5.0];

        for residual in residuals {
            let huber_weight = rpca.robust_weight(residual);
            assert!(huber_weight > 0.0);
            assert!(huber_weight <= 1.0);
        }
    }

    /// Breakdown analysis reports one error per tested contamination level.
    #[test]
    fn test_breakdown_analysis() {
        let data = Array2::from_shape_vec(
            (10, 3),
            vec![
                1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0,
                16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0,
                30.0,
            ],
        )
        .unwrap();

        let breakdown_analysis = BreakdownPointAnalysis::new();
        let contamination_levels = vec![0.1, 0.2, 0.3];

        let result = breakdown_analysis
            .empirical_breakdown_point(&data, &contamination_levels)
            .unwrap();

        assert_eq!(
            result.contamination_levels.len(),
            result.direction_errors.len()
        );
        assert!(result.contamination_levels.len() <= contamination_levels.len());

        // Test robustness score
        let score = result.robustness_score();
        assert!(score > 0.0);
    }

    /// The robust mean should not be dragged toward a gross outlier.
    #[test]
    fn test_robust_mean_computation() {
        let data = Array2::from_shape_vec(
            (5, 2),
            vec![
                1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 2.0, 3.0, 100.0, 100.0, // Outlier
            ],
        )
        .unwrap();

        let rpca = RobustPCA::new();
        let robust_mean = rpca.compute_robust_mean(&data).unwrap();

        // Robust mean should be less affected by the outlier
        assert!(robust_mean[0] < 50.0); // Should be much less than simple mean
        assert!(robust_mean[1] < 50.0);
    }

    /// Reconstruction errors are per-sample and non-negative.
    #[test]
    fn test_reconstruction_error() {
        let data = Array2::from_shape_vec(
            (4, 3),
            vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 2.0, 3.0, 4.0],
        )
        .unwrap();

        let mut rpca = RobustPCA::new().n_components(2);
        let result = rpca.fit_transform(&data).unwrap();

        let errors = result.reconstruction_error(&data).unwrap();
        assert_eq!(errors.len(), 4);

        // All errors should be non-negative
        for error in errors.iter() {
            assert!(*error >= 0.0);
        }
    }
}