quantrs2_ml/anomaly_detection/preprocessing.rs

//! Data preprocessing for quantum anomaly detection
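//!
//! Typical usage (illustrative sketch; how `PreprocessingConfig` is built
//! depends on the config module's API and is assumed here):
//!
//! ```ignore
//! let mut preprocessor = DataPreprocessor::new(config);
//! let train_features = preprocessor.fit_transform(&train_data)?;
//! let test_features = preprocessor.transform(&test_data)?;
//! ```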

use crate::error::{MLError, Result};
use scirs2_core::ndarray::{Array1, Array2, Axis};
use scirs2_core::random::prelude::*;
use scirs2_core::random::Rng;

use super::config::{
    DimensionalityReduction, FeatureSelection, MissingValueStrategy, NoiseFiltering,
    NormalizationType, PreprocessingConfig,
};

/// Data preprocessor that normalizes features, selects informative ones, and
/// optionally reduces dimensionality, in that order.
#[derive(Debug)]
pub struct DataPreprocessor {
    config: PreprocessingConfig,
    fitted: bool,
    normalization_params: Option<NormalizationParams>,
    feature_selector: Option<FeatureSelector>,
    dimensionality_reducer: Option<DimensionalityReducer>,
}

/// Per-feature normalization statistics collected during `fit`.
#[derive(Debug, Clone)]
pub struct NormalizationParams {
    /// Per-feature means.
    pub means: Array1<f64>,
    /// Per-feature standard deviations (ddof = 0).
    pub stds: Array1<f64>,
    /// Per-feature minima.
    pub mins: Array1<f64>,
    /// Per-feature maxima.
    pub maxs: Array1<f64>,
}

/// Feature selector holding the indices chosen during `fit`.
#[derive(Debug)]
pub struct FeatureSelector {
    /// Indices of the retained features, ordered by descending score.
    pub selected_features: Vec<usize>,
    /// Score assigned to every original feature.
    pub feature_scores: Array1<f64>,
}

/// Linear dimensionality reducer: each row of `components` is a projection vector.
#[derive(Debug)]
pub struct DimensionalityReducer {
    /// Projection matrix of shape `(target_dim, n_features)`.
    pub components: Array2<f64>,
    /// Variance explained by each component.
    pub explained_variance: Array1<f64>,
    /// Output dimensionality after projection.
    pub target_dim: usize,
}

impl DataPreprocessor {
    /// Create a new, unfitted preprocessor from the given configuration.
    pub fn new(config: PreprocessingConfig) -> Self {
        DataPreprocessor {
            config,
            fitted: false,
            normalization_params: None,
            feature_selector: None,
            dimensionality_reducer: None,
        }
    }

    /// Fit and transform data
    pub fn fit_transform(&mut self, data: &Array2<f64>) -> Result<Array2<f64>> {
        self.fit(data)?;
        self.transform(data)
    }

    /// Fit the preprocessor to data.
    pub fn fit(&mut self, data: &Array2<f64>) -> Result<()> {
        // Compute per-feature normalization statistics on the raw data
        self.normalization_params = Some(self.compute_normalization_params(data));

        let mut current_data = data.clone();

        // Apply normalization first so downstream stages see scaled features
        if let Some(ref params) = self.normalization_params {
            current_data = self.apply_normalization(&current_data, params)?;
        }

        // Fit feature selector if configured
        if self.config.feature_selection.is_some() {
            self.feature_selector = Some(self.fit_feature_selector(&current_data)?);
            // Reduce the data so later stages are fitted on selected features only
            if let Some(ref selector) = self.feature_selector {
                current_data = self.apply_feature_selection(&current_data, selector)?;
            }
        }

        // Fit dimensionality reducer if configured (on feature-selected data)
        if self.config.dimensionality_reduction.is_some() {
            self.dimensionality_reducer = Some(self.fit_dimensionality_reducer(&current_data)?);
        }

        self.fitted = true;
        Ok(())
    }

    /// Transform data with the fitted pipeline.
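    ///
    /// Stages run in fitting order: normalization, feature selection, then
    /// dimensionality reduction. Errors if the preprocessor is not yet fitted.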
    pub fn transform(&self, data: &Array2<f64>) -> Result<Array2<f64>> {
        if !self.fitted {
            return Err(MLError::MLOperationError(
                "Preprocessor must be fitted before transform".to_string(),
            ));
        }

        let mut transformed = data.clone();

        // Apply normalization
        if let Some(ref params) = self.normalization_params {
            transformed = self.apply_normalization(&transformed, params)?;
        }

        // Apply feature selection
        if let Some(ref selector) = self.feature_selector {
            transformed = self.apply_feature_selection(&transformed, selector)?;
        }

        // Apply dimensionality reduction
        if let Some(ref reducer) = self.dimensionality_reducer {
            transformed = self.apply_dimensionality_reduction(&transformed, reducer)?;
        }

        Ok(transformed)
    }

    /// Compute normalization parameters
    fn compute_normalization_params(&self, data: &Array2<f64>) -> NormalizationParams {
        let n_features = data.ncols();
        let mut means = Array1::zeros(n_features);
        let mut stds = Array1::zeros(n_features);
        let mut mins = Array1::zeros(n_features);
        let mut maxs = Array1::zeros(n_features);

        for j in 0..n_features {
            let column = data.column(j);
            means[j] = column.mean().unwrap_or(0.0);
            // ddof = 0: population standard deviation
            stds[j] = column.std(0.0);
            mins[j] = column.fold(f64::INFINITY, |a, &b| a.min(b));
            maxs[j] = column.fold(f64::NEG_INFINITY, |a, &b| a.max(b));
        }

        NormalizationParams {
            means,
            stds,
            mins,
            maxs,
        }
    }

    /// Apply normalization
    fn apply_normalization(
        &self,
        data: &Array2<f64>,
        params: &NormalizationParams,
    ) -> Result<Array2<f64>> {
        let mut normalized = data.clone();

        match self.config.normalization {
            NormalizationType::ZScore => {
                // x -> (x - mean) / std, per feature
                for j in 0..data.ncols() {
                    let mut column = normalized.column_mut(j);
                    if params.stds[j] > 1e-8 {
                        column.mapv_inplace(|x| (x - params.means[j]) / params.stds[j]);
                    }
                }
            }
            NormalizationType::MinMax => {
                // x -> (x - min) / (max - min), per feature
                for j in 0..data.ncols() {
                    let mut column = normalized.column_mut(j);
                    let range = params.maxs[j] - params.mins[j];
                    if range > 1e-8 {
                        column.mapv_inplace(|x| (x - params.mins[j]) / range);
                    }
                }
            }
            NormalizationType::Robust => {
                // Robust scaling: x -> (x - median) / IQR, per feature
                for j in 0..data.ncols() {
                    let mut column_data: Vec<f64> = data.column(j).to_vec();
                    // total_cmp keeps the sort well-defined even if NaNs appear
                    column_data.sort_by(|a, b| a.total_cmp(b));

                    let median = if column_data.len() % 2 == 0 {
                        (column_data[column_data.len() / 2 - 1]
                            + column_data[column_data.len() / 2])
                            / 2.0
                    } else {
                        column_data[column_data.len() / 2]
                    };

                    // Approximate quartiles by index (no interpolation)
                    let q1 = column_data[column_data.len() / 4];
                    let q3 = column_data[3 * column_data.len() / 4];
                    let iqr = q3 - q1;

                    let mut column = normalized.column_mut(j);
                    if iqr > 1e-8 {
                        column.mapv_inplace(|x| (x - median) / iqr);
                    }
                }
            }
            NormalizationType::Quantum => {
                // Quantum normalization (placeholder - would use quantum circuits):
                // scale each column to unit L2 norm. Note this uses the incoming
                // data's norm rather than a statistic fitted during `fit`.
                for j in 0..data.ncols() {
                    let mut column = normalized.column_mut(j);
                    let norm = column.dot(&column).sqrt();
                    if norm > 1e-8 {
                        column.mapv_inplace(|x| x / norm);
                    }
                }
            }
        }

        Ok(normalized)
    }

    /// Fit feature selector
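    ///
    /// Scores every feature with the configured strategy, then keeps the top
    /// half of features (at least one), ranked by descending score.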
    fn fit_feature_selector(&self, data: &Array2<f64>) -> Result<FeatureSelector> {
        let n_features = data.ncols();

        let feature_scores = match &self.config.feature_selection {
            Some(FeatureSelection::Variance) => self.compute_variance_scores(data),
            Some(FeatureSelection::Correlation) => self.compute_correlation_scores(data),
            Some(FeatureSelection::MutualInformation) => {
                self.compute_mutual_information_scores(data)
            }
            Some(FeatureSelection::QuantumInformation) => {
                self.compute_quantum_information_scores(data)
            }
            None => Array1::zeros(n_features),
        };

        // Rank features by descending score
        let mut indexed_scores: Vec<(usize, f64)> = feature_scores
            .iter()
            .enumerate()
            .map(|(i, &score)| (i, score))
            .collect();

        indexed_scores.sort_by(|a, b| b.1.total_cmp(&a.1));

        // Keep the top half of the features, but always at least one
        let num_selected = (n_features / 2).max(1);
        let selected_features: Vec<usize> = indexed_scores
            .into_iter()
            .take(num_selected)
            .map(|(idx, _)| idx)
            .collect();

        Ok(FeatureSelector {
            selected_features,
            feature_scores,
        })
    }

    /// Apply feature selection
    fn apply_feature_selection(
        &self,
        data: &Array2<f64>,
        selector: &FeatureSelector,
    ) -> Result<Array2<f64>> {
        let selected_data = data.select(Axis(1), &selector.selected_features);
        Ok(selected_data)
    }

    /// Fit dimensionality reducer
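    ///
    /// The target dimensionality is fixed at half the input features (at least
    /// one). Every configured strategy currently routes to the identity-like
    /// PCA stub; `None` keeps all features.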
    fn fit_dimensionality_reducer(&self, data: &Array2<f64>) -> Result<DimensionalityReducer> {
        let n_features = data.ncols();
        let target_dim = (n_features / 2).max(1);

        match &self.config.dimensionality_reduction {
            Some(DimensionalityReduction::PCA) => self.fit_pca(data, target_dim),
            Some(DimensionalityReduction::ICA) => self.fit_ica(data, target_dim),
            Some(DimensionalityReduction::UMAP) => self.fit_umap(data, target_dim),
            Some(DimensionalityReduction::QuantumPCA) => self.fit_quantum_pca(data, target_dim),
            Some(DimensionalityReduction::QuantumManifold) => {
                self.fit_quantum_manifold(data, target_dim)
            }
            None => {
                // Fall back to an identity projection that keeps all features
                let components = Array2::eye(n_features);
                let explained_variance = Array1::ones(n_features);
                Ok(DimensionalityReducer {
                    components,
                    explained_variance,
                    target_dim: n_features,
                })
            }
        }
    }

    /// Apply dimensionality reduction
    fn apply_dimensionality_reduction(
        &self,
        data: &Array2<f64>,
        reducer: &DimensionalityReducer,
    ) -> Result<Array2<f64>> {
        // Project (n_samples, n_features) onto the (n_features, target_dim) basis
        let reduced = data.dot(&reducer.components.t());
        Ok(reduced)
    }

    // Helper methods for feature selection

    fn compute_variance_scores(&self, data: &Array2<f64>) -> Array1<f64> {
        let n_features = data.ncols();
        let mut scores = Array1::zeros(n_features);

        for j in 0..n_features {
            let column = data.column(j);
            // ddof = 0: population variance
            scores[j] = column.var(0.0);
        }

        scores
    }

    fn compute_correlation_scores(&self, data: &Array2<f64>) -> Array1<f64> {
        // Placeholder: a real implementation would score features by their
        // correlation structure; random scores stand in for now
        let n_features = data.ncols();
        Array1::from_vec((0..n_features).map(|_| thread_rng().gen::<f64>()).collect())
    }

    fn compute_mutual_information_scores(&self, data: &Array2<f64>) -> Array1<f64> {
        // Placeholder: a real implementation would estimate mutual information;
        // random scores stand in for now
        let n_features = data.ncols();
        Array1::from_vec((0..n_features).map(|_| thread_rng().gen::<f64>()).collect())
    }

    fn compute_quantum_information_scores(&self, data: &Array2<f64>) -> Array1<f64> {
        // Placeholder: a real implementation would derive quantum information
        // scores from encoded states; random scores stand in for now
        let n_features = data.ncols();
        Array1::from_vec((0..n_features).map(|_| thread_rng().gen::<f64>()).collect())
    }

    // Helper methods for dimensionality reduction

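    /// Placeholder PCA fit. A real implementation would center the data, form
    /// the covariance matrix, eigendecompose it, and keep the top `target_dim`
    /// eigenvectors as components (sketch of intent only; no eigensolver is
    /// wired up here).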
    fn fit_pca(&self, data: &Array2<f64>, target_dim: usize) -> Result<DimensionalityReducer> {
        // Identity-like projection onto the first `target_dim` coordinates
        let n_features = data.ncols();
        let components = Array2::from_shape_fn((target_dim, n_features), |(i, j)| {
            if i == j {
                1.0
            } else {
                0.0
            }
        });

        // Fake a decaying explained-variance profile (1, 1/2, 1/3, ...)
        let explained_variance =
            Array1::from_vec((0..target_dim).map(|i| 1.0 / (i + 1) as f64).collect());

        Ok(DimensionalityReducer {
            components,
            explained_variance,
            target_dim,
        })
    }

    fn fit_ica(&self, data: &Array2<f64>, target_dim: usize) -> Result<DimensionalityReducer> {
        // Placeholder ICA implementation: defer to the PCA stub
        self.fit_pca(data, target_dim)
    }

    fn fit_umap(&self, data: &Array2<f64>, target_dim: usize) -> Result<DimensionalityReducer> {
        // Placeholder UMAP implementation: defer to the PCA stub
        self.fit_pca(data, target_dim)
    }

    fn fit_quantum_pca(
        &self,
        data: &Array2<f64>,
        target_dim: usize,
    ) -> Result<DimensionalityReducer> {
        // Placeholder Quantum PCA implementation: defer to the PCA stub
        self.fit_pca(data, target_dim)
    }

    fn fit_quantum_manifold(
        &self,
        data: &Array2<f64>,
        target_dim: usize,
    ) -> Result<DimensionalityReducer> {
        // Placeholder Quantum Manifold implementation: defer to the PCA stub
        self.fit_pca(data, target_dim)
    }
}