// quantrs2_ml/anomaly_detection/preprocessing.rs

//! Data preprocessing for quantum anomaly detection

3use crate::error::{MLError, Result};
4use scirs2_core::ndarray::{Array1, Array2, Axis};
5use scirs2_core::random::prelude::*;
6use scirs2_core::random::Rng;
7
8use super::config::{
9    DimensionalityReduction, FeatureSelection, MissingValueStrategy, NoiseFiltering,
10    NormalizationType, PreprocessingConfig,
11};
12
13/// Data preprocessor
14#[derive(Debug)]
15pub struct DataPreprocessor {
16    config: PreprocessingConfig,
17    fitted: bool,
18    normalization_params: Option<NormalizationParams>,
19    feature_selector: Option<FeatureSelector>,
20    dimensionality_reducer: Option<DimensionalityReducer>,
21}
22
23/// Normalization parameters
24#[derive(Debug, Clone)]
25pub struct NormalizationParams {
26    pub means: Array1<f64>,
27    pub stds: Array1<f64>,
28    pub mins: Array1<f64>,
29    pub maxs: Array1<f64>,
30}
31
32/// Feature selector
33#[derive(Debug)]
34pub struct FeatureSelector {
35    pub selected_features: Vec<usize>,
36    pub feature_scores: Array1<f64>,
37}
38
39/// Dimensionality reducer
40#[derive(Debug)]
41pub struct DimensionalityReducer {
42    pub components: Array2<f64>,
43    pub explained_variance: Array1<f64>,
44    pub target_dim: usize,
45}
46
47impl DataPreprocessor {
48    /// Create new preprocessor
49    pub fn new(config: PreprocessingConfig) -> Self {
50        DataPreprocessor {
51            config,
52            fitted: false,
53            normalization_params: None,
54            feature_selector: None,
55            dimensionality_reducer: None,
56        }
57    }
58
59    /// Fit and transform data
60    pub fn fit_transform(&mut self, data: &Array2<f64>) -> Result<Array2<f64>> {
61        self.fit(data)?;
62        self.transform(data)
63    }
64
65    /// Fit preprocessor to data
66    pub fn fit(&mut self, data: &Array2<f64>) -> Result<()> {
67        // Compute normalization parameters
68        self.normalization_params = Some(self.compute_normalization_params(data));
69
70        let mut current_data = data.clone();
71
72        // Apply normalization first
73        if let Some(ref params) = self.normalization_params {
74            current_data = self.apply_normalization(&current_data, params)?;
75        }
76
77        // Fit feature selector if configured
78        if self.config.feature_selection.is_some() {
79            self.feature_selector = Some(self.fit_feature_selector(&current_data)?);
80            // Apply feature selection to get the reduced data
81            if let Some(ref selector) = self.feature_selector {
82                current_data = self.apply_feature_selection(&current_data, selector)?;
83            }
84        }
85
86        // Fit dimensionality reducer if configured (on feature-selected data)
87        if self.config.dimensionality_reduction.is_some() {
88            self.dimensionality_reducer = Some(self.fit_dimensionality_reducer(&current_data)?);
89        }
90
91        self.fitted = true;
92        Ok(())
93    }
94
95    /// Transform data
96    pub fn transform(&self, data: &Array2<f64>) -> Result<Array2<f64>> {
97        if !self.fitted {
98            return Err(MLError::MLOperationError(
99                "Preprocessor must be fitted before transform".to_string(),
100            ));
101        }
102
103        let mut transformed = data.clone();
104
105        // Apply normalization
106        if let Some(ref params) = self.normalization_params {
107            transformed = self.apply_normalization(&transformed, params)?;
108        }
109
110        // Apply feature selection
111        if let Some(ref selector) = self.feature_selector {
112            transformed = self.apply_feature_selection(&transformed, selector)?;
113        }
114
115        // Apply dimensionality reduction
116        if let Some(ref reducer) = self.dimensionality_reducer {
117            transformed = self.apply_dimensionality_reduction(&transformed, reducer)?;
118        }
119
120        Ok(transformed)
121    }
122
123    /// Compute normalization parameters
124    fn compute_normalization_params(&self, data: &Array2<f64>) -> NormalizationParams {
125        let n_features = data.ncols();
126        let mut means = Array1::zeros(n_features);
127        let mut stds = Array1::zeros(n_features);
128        let mut mins = Array1::zeros(n_features);
129        let mut maxs = Array1::zeros(n_features);
130
131        for j in 0..n_features {
132            let column = data.column(j);
133            means[j] = column.mean().unwrap_or(0.0);
134            stds[j] = column.std(0.0);
135            mins[j] = column.fold(f64::INFINITY, |a, &b| a.min(b));
136            maxs[j] = column.fold(f64::NEG_INFINITY, |a, &b| a.max(b));
137        }
138
139        NormalizationParams {
140            means,
141            stds,
142            mins,
143            maxs,
144        }
145    }
146
147    /// Apply normalization
148    fn apply_normalization(
149        &self,
150        data: &Array2<f64>,
151        params: &NormalizationParams,
152    ) -> Result<Array2<f64>> {
153        let mut normalized = data.clone();
154
155        match self.config.normalization {
156            NormalizationType::ZScore => {
157                for j in 0..data.ncols() {
158                    let mut column = normalized.column_mut(j);
159                    if params.stds[j] > 1e-8 {
160                        column.mapv_inplace(|x| (x - params.means[j]) / params.stds[j]);
161                    }
162                }
163            }
164            NormalizationType::MinMax => {
165                for j in 0..data.ncols() {
166                    let mut column = normalized.column_mut(j);
167                    let range = params.maxs[j] - params.mins[j];
168                    if range > 1e-8 {
169                        column.mapv_inplace(|x| (x - params.mins[j]) / range);
170                    }
171                }
172            }
173            NormalizationType::Robust => {
174                // Robust scaling using median and IQR
175                for j in 0..data.ncols() {
176                    let mut column_data: Vec<f64> = data.column(j).to_vec();
177                    column_data
178                        .sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
179
180                    let median = if column_data.len() % 2 == 0 {
181                        (column_data[column_data.len() / 2 - 1]
182                            + column_data[column_data.len() / 2])
183                            / 2.0
184                    } else {
185                        column_data[column_data.len() / 2]
186                    };
187
188                    let q1 = column_data[column_data.len() / 4];
189                    let q3 = column_data[3 * column_data.len() / 4];
190                    let iqr = q3 - q1;
191
192                    let mut column = normalized.column_mut(j);
193                    if iqr > 1e-8 {
194                        column.mapv_inplace(|x| (x - median) / iqr);
195                    }
196                }
197            }
198            NormalizationType::Quantum => {
199                // Quantum normalization (placeholder - would use quantum circuits)
200                for j in 0..data.ncols() {
201                    let mut column = normalized.column_mut(j);
202                    let norm = column.dot(&column).sqrt();
203                    if norm > 1e-8 {
204                        column.mapv_inplace(|x| x / norm);
205                    }
206                }
207            }
208        }
209
210        Ok(normalized)
211    }
212
213    /// Fit feature selector
214    fn fit_feature_selector(&self, data: &Array2<f64>) -> Result<FeatureSelector> {
215        let n_features = data.ncols();
216
217        let feature_scores = match &self.config.feature_selection {
218            Some(FeatureSelection::Variance) => self.compute_variance_scores(data),
219            Some(FeatureSelection::Correlation) => self.compute_correlation_scores(data),
220            Some(FeatureSelection::MutualInformation) => {
221                self.compute_mutual_information_scores(data)
222            }
223            Some(FeatureSelection::QuantumInformation) => {
224                self.compute_quantum_information_scores(data)
225            }
226            None => Array1::zeros(n_features),
227        };
228
229        // Select top features
230        let mut indexed_scores: Vec<(usize, f64)> = feature_scores
231            .iter()
232            .enumerate()
233            .map(|(i, &score)| (i, score))
234            .collect();
235
236        indexed_scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
237
238        let num_selected = (n_features / 2).max(1);
239        let selected_features: Vec<usize> = indexed_scores
240            .into_iter()
241            .take(num_selected)
242            .map(|(idx, _)| idx)
243            .collect();
244
245        Ok(FeatureSelector {
246            selected_features,
247            feature_scores,
248        })
249    }
250
251    /// Apply feature selection
252    fn apply_feature_selection(
253        &self,
254        data: &Array2<f64>,
255        selector: &FeatureSelector,
256    ) -> Result<Array2<f64>> {
257        let selected_data = data.select(Axis(1), &selector.selected_features);
258        Ok(selected_data)
259    }
260
261    /// Fit dimensionality reducer
262    fn fit_dimensionality_reducer(&self, data: &Array2<f64>) -> Result<DimensionalityReducer> {
263        let n_features = data.ncols();
264        let target_dim = (n_features / 2).max(1);
265
266        match &self.config.dimensionality_reduction {
267            Some(DimensionalityReduction::PCA) => self.fit_pca(data, target_dim),
268            Some(DimensionalityReduction::ICA) => self.fit_ica(data, target_dim),
269            Some(DimensionalityReduction::UMAP) => self.fit_umap(data, target_dim),
270            Some(DimensionalityReduction::QuantumPCA) => self.fit_quantum_pca(data, target_dim),
271            Some(DimensionalityReduction::QuantumManifold) => {
272                self.fit_quantum_manifold(data, target_dim)
273            }
274            None => {
275                // Fallback to identity
276                let components = Array2::eye(n_features);
277                let explained_variance = Array1::ones(n_features);
278                Ok(DimensionalityReducer {
279                    components,
280                    explained_variance,
281                    target_dim: n_features,
282                })
283            }
284        }
285    }
286
287    /// Apply dimensionality reduction
288    fn apply_dimensionality_reduction(
289        &self,
290        data: &Array2<f64>,
291        reducer: &DimensionalityReducer,
292    ) -> Result<Array2<f64>> {
293        let reduced = data.dot(&reducer.components.t());
294        Ok(reduced)
295    }
296
297    // Helper methods for feature selection
298
299    fn compute_variance_scores(&self, data: &Array2<f64>) -> Array1<f64> {
300        let n_features = data.ncols();
301        let mut scores = Array1::zeros(n_features);
302
303        for j in 0..n_features {
304            let column = data.column(j);
305            scores[j] = column.var(0.0);
306        }
307
308        scores
309    }
310
311    fn compute_correlation_scores(&self, data: &Array2<f64>) -> Array1<f64> {
312        // Placeholder: compute feature correlations
313        let n_features = data.ncols();
314        Array1::from_vec((0..n_features).map(|_| thread_rng().gen::<f64>()).collect())
315    }
316
317    fn compute_mutual_information_scores(&self, data: &Array2<f64>) -> Array1<f64> {
318        // Placeholder: compute mutual information
319        let n_features = data.ncols();
320        Array1::from_vec((0..n_features).map(|_| thread_rng().gen::<f64>()).collect())
321    }
322
323    fn compute_quantum_information_scores(&self, data: &Array2<f64>) -> Array1<f64> {
324        // Placeholder: compute quantum information scores
325        let n_features = data.ncols();
326        Array1::from_vec((0..n_features).map(|_| thread_rng().gen::<f64>()).collect())
327    }
328
329    // Helper methods for dimensionality reduction
330
331    fn fit_pca(&self, data: &Array2<f64>, target_dim: usize) -> Result<DimensionalityReducer> {
332        // Placeholder PCA implementation
333        let n_features = data.ncols();
334        let components =
335            Array2::from_shape_fn(
336                (target_dim, n_features),
337                |(i, j)| {
338                    if i == j {
339                        1.0
340                    } else {
341                        0.0
342                    }
343                },
344            );
345
346        let explained_variance =
347            Array1::from_vec((0..target_dim).map(|i| 1.0 / (i + 1) as f64).collect());
348
349        Ok(DimensionalityReducer {
350            components,
351            explained_variance,
352            target_dim,
353        })
354    }
355
356    fn fit_ica(&self, data: &Array2<f64>, target_dim: usize) -> Result<DimensionalityReducer> {
357        // Placeholder ICA implementation
358        self.fit_pca(data, target_dim)
359    }
360
361    fn fit_umap(&self, data: &Array2<f64>, target_dim: usize) -> Result<DimensionalityReducer> {
362        // Placeholder UMAP implementation
363        self.fit_pca(data, target_dim)
364    }
365
366    fn fit_quantum_pca(
367        &self,
368        data: &Array2<f64>,
369        target_dim: usize,
370    ) -> Result<DimensionalityReducer> {
371        // Placeholder Quantum PCA implementation
372        self.fit_pca(data, target_dim)
373    }
374
375    fn fit_quantum_manifold(
376        &self,
377        data: &Array2<f64>,
378        target_dim: usize,
379    ) -> Result<DimensionalityReducer> {
380        // Placeholder Quantum Manifold implementation
381        self.fit_pca(data, target_dim)
382    }
383}