// quantrs2_ml/anomaly_detection/preprocessing.rs

1//! Data preprocessing for quantum anomaly detection
2
3use crate::error::{MLError, Result};
4use ndarray::{Array1, Array2, Axis};
5use rand::Rng;
6
7use super::config::{
8    DimensionalityReduction, FeatureSelection, MissingValueStrategy, NoiseFiltering,
9    NormalizationType, PreprocessingConfig,
10};
11
/// Data preprocessor
///
/// Applies, in order: normalization, optional feature selection, and
/// optional dimensionality reduction, using parameters learned in `fit`.
#[derive(Debug)]
pub struct DataPreprocessor {
    // Configuration selecting which preprocessing steps run and how.
    config: PreprocessingConfig,
    // Set to true by `fit`; `transform` returns an error before fitting.
    fitted: bool,
    // Per-feature statistics computed during `fit` (always Some after `fit`).
    normalization_params: Option<NormalizationParams>,
    // Populated only when `config.feature_selection` is Some.
    feature_selector: Option<FeatureSelector>,
    // Populated only when `config.dimensionality_reduction` is Some.
    dimensionality_reducer: Option<DimensionalityReducer>,
}
21
/// Normalization parameters
///
/// Per-feature statistics gathered from the training matrix; each array
/// has one entry per column of the fitted data.
#[derive(Debug, Clone)]
pub struct NormalizationParams {
    // Column means (0.0 for an empty column).
    pub means: Array1<f64>,
    // Column population standard deviations (ddof = 0).
    pub stds: Array1<f64>,
    // Column minima (+inf for an empty column).
    pub mins: Array1<f64>,
    // Column maxima (-inf for an empty column).
    pub maxs: Array1<f64>,
}
30
/// Feature selector
///
/// Result of fitting a feature-selection criterion: which columns to keep
/// and the score each column received.
#[derive(Debug)]
pub struct FeatureSelector {
    // Column indices (into the normalized matrix) retained by selection.
    pub selected_features: Vec<usize>,
    // Score per original column, as produced by the configured criterion.
    pub feature_scores: Array1<f64>,
}
37
/// Dimensionality reducer
///
/// A fitted linear projection: data is mapped through `components`
/// (shape `(target_dim, n_features)`) during transform.
#[derive(Debug)]
pub struct DimensionalityReducer {
    // Projection matrix, one row per output dimension.
    pub components: Array2<f64>,
    // Variance attributed to each output dimension (placeholder values).
    pub explained_variance: Array1<f64>,
    // Number of output dimensions produced by the projection.
    pub target_dim: usize,
}
45
46impl DataPreprocessor {
47    /// Create new preprocessor
48    pub fn new(config: PreprocessingConfig) -> Self {
49        DataPreprocessor {
50            config,
51            fitted: false,
52            normalization_params: None,
53            feature_selector: None,
54            dimensionality_reducer: None,
55        }
56    }
57
58    /// Fit and transform data
59    pub fn fit_transform(&mut self, data: &Array2<f64>) -> Result<Array2<f64>> {
60        self.fit(data)?;
61        self.transform(data)
62    }
63
64    /// Fit preprocessor to data
65    pub fn fit(&mut self, data: &Array2<f64>) -> Result<()> {
66        // Compute normalization parameters
67        self.normalization_params = Some(self.compute_normalization_params(data));
68
69        let mut current_data = data.clone();
70
71        // Apply normalization first
72        if let Some(ref params) = self.normalization_params {
73            current_data = self.apply_normalization(&current_data, params)?;
74        }
75
76        // Fit feature selector if configured
77        if self.config.feature_selection.is_some() {
78            self.feature_selector = Some(self.fit_feature_selector(&current_data)?);
79            // Apply feature selection to get the reduced data
80            if let Some(ref selector) = self.feature_selector {
81                current_data = self.apply_feature_selection(&current_data, selector)?;
82            }
83        }
84
85        // Fit dimensionality reducer if configured (on feature-selected data)
86        if self.config.dimensionality_reduction.is_some() {
87            self.dimensionality_reducer = Some(self.fit_dimensionality_reducer(&current_data)?);
88        }
89
90        self.fitted = true;
91        Ok(())
92    }
93
94    /// Transform data
95    pub fn transform(&self, data: &Array2<f64>) -> Result<Array2<f64>> {
96        if !self.fitted {
97            return Err(MLError::MLOperationError(
98                "Preprocessor must be fitted before transform".to_string(),
99            ));
100        }
101
102        let mut transformed = data.clone();
103
104        // Apply normalization
105        if let Some(ref params) = self.normalization_params {
106            transformed = self.apply_normalization(&transformed, params)?;
107        }
108
109        // Apply feature selection
110        if let Some(ref selector) = self.feature_selector {
111            transformed = self.apply_feature_selection(&transformed, selector)?;
112        }
113
114        // Apply dimensionality reduction
115        if let Some(ref reducer) = self.dimensionality_reducer {
116            transformed = self.apply_dimensionality_reduction(&transformed, reducer)?;
117        }
118
119        Ok(transformed)
120    }
121
122    /// Compute normalization parameters
123    fn compute_normalization_params(&self, data: &Array2<f64>) -> NormalizationParams {
124        let n_features = data.ncols();
125        let mut means = Array1::zeros(n_features);
126        let mut stds = Array1::zeros(n_features);
127        let mut mins = Array1::zeros(n_features);
128        let mut maxs = Array1::zeros(n_features);
129
130        for j in 0..n_features {
131            let column = data.column(j);
132            means[j] = column.mean().unwrap_or(0.0);
133            stds[j] = column.std(0.0);
134            mins[j] = column.fold(f64::INFINITY, |a, &b| a.min(b));
135            maxs[j] = column.fold(f64::NEG_INFINITY, |a, &b| a.max(b));
136        }
137
138        NormalizationParams {
139            means,
140            stds,
141            mins,
142            maxs,
143        }
144    }
145
146    /// Apply normalization
147    fn apply_normalization(
148        &self,
149        data: &Array2<f64>,
150        params: &NormalizationParams,
151    ) -> Result<Array2<f64>> {
152        let mut normalized = data.clone();
153
154        match self.config.normalization {
155            NormalizationType::ZScore => {
156                for j in 0..data.ncols() {
157                    let mut column = normalized.column_mut(j);
158                    if params.stds[j] > 1e-8 {
159                        column.mapv_inplace(|x| (x - params.means[j]) / params.stds[j]);
160                    }
161                }
162            }
163            NormalizationType::MinMax => {
164                for j in 0..data.ncols() {
165                    let mut column = normalized.column_mut(j);
166                    let range = params.maxs[j] - params.mins[j];
167                    if range > 1e-8 {
168                        column.mapv_inplace(|x| (x - params.mins[j]) / range);
169                    }
170                }
171            }
172            NormalizationType::Robust => {
173                // Robust scaling using median and IQR
174                for j in 0..data.ncols() {
175                    let mut column_data: Vec<f64> = data.column(j).to_vec();
176                    column_data.sort_by(|a, b| a.partial_cmp(b).unwrap());
177
178                    let median = if column_data.len() % 2 == 0 {
179                        (column_data[column_data.len() / 2 - 1]
180                            + column_data[column_data.len() / 2])
181                            / 2.0
182                    } else {
183                        column_data[column_data.len() / 2]
184                    };
185
186                    let q1 = column_data[column_data.len() / 4];
187                    let q3 = column_data[3 * column_data.len() / 4];
188                    let iqr = q3 - q1;
189
190                    let mut column = normalized.column_mut(j);
191                    if iqr > 1e-8 {
192                        column.mapv_inplace(|x| (x - median) / iqr);
193                    }
194                }
195            }
196            NormalizationType::Quantum => {
197                // Quantum normalization (placeholder - would use quantum circuits)
198                for j in 0..data.ncols() {
199                    let mut column = normalized.column_mut(j);
200                    let norm = column.dot(&column).sqrt();
201                    if norm > 1e-8 {
202                        column.mapv_inplace(|x| x / norm);
203                    }
204                }
205            }
206        }
207
208        Ok(normalized)
209    }
210
211    /// Fit feature selector
212    fn fit_feature_selector(&self, data: &Array2<f64>) -> Result<FeatureSelector> {
213        let n_features = data.ncols();
214
215        let feature_scores = match &self.config.feature_selection {
216            Some(FeatureSelection::Variance) => self.compute_variance_scores(data),
217            Some(FeatureSelection::Correlation) => self.compute_correlation_scores(data),
218            Some(FeatureSelection::MutualInformation) => {
219                self.compute_mutual_information_scores(data)
220            }
221            Some(FeatureSelection::QuantumInformation) => {
222                self.compute_quantum_information_scores(data)
223            }
224            None => Array1::zeros(n_features),
225        };
226
227        // Select top features
228        let mut indexed_scores: Vec<(usize, f64)> = feature_scores
229            .iter()
230            .enumerate()
231            .map(|(i, &score)| (i, score))
232            .collect();
233
234        indexed_scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
235
236        let num_selected = (n_features / 2).max(1);
237        let selected_features: Vec<usize> = indexed_scores
238            .into_iter()
239            .take(num_selected)
240            .map(|(idx, _)| idx)
241            .collect();
242
243        Ok(FeatureSelector {
244            selected_features,
245            feature_scores,
246        })
247    }
248
249    /// Apply feature selection
250    fn apply_feature_selection(
251        &self,
252        data: &Array2<f64>,
253        selector: &FeatureSelector,
254    ) -> Result<Array2<f64>> {
255        let selected_data = data.select(Axis(1), &selector.selected_features);
256        Ok(selected_data)
257    }
258
259    /// Fit dimensionality reducer
260    fn fit_dimensionality_reducer(&self, data: &Array2<f64>) -> Result<DimensionalityReducer> {
261        let n_features = data.ncols();
262        let target_dim = (n_features / 2).max(1);
263
264        match &self.config.dimensionality_reduction {
265            Some(DimensionalityReduction::PCA) => self.fit_pca(data, target_dim),
266            Some(DimensionalityReduction::ICA) => self.fit_ica(data, target_dim),
267            Some(DimensionalityReduction::UMAP) => self.fit_umap(data, target_dim),
268            Some(DimensionalityReduction::QuantumPCA) => self.fit_quantum_pca(data, target_dim),
269            Some(DimensionalityReduction::QuantumManifold) => {
270                self.fit_quantum_manifold(data, target_dim)
271            }
272            None => {
273                // Fallback to identity
274                let components = Array2::eye(n_features);
275                let explained_variance = Array1::ones(n_features);
276                Ok(DimensionalityReducer {
277                    components,
278                    explained_variance,
279                    target_dim: n_features,
280                })
281            }
282        }
283    }
284
285    /// Apply dimensionality reduction
286    fn apply_dimensionality_reduction(
287        &self,
288        data: &Array2<f64>,
289        reducer: &DimensionalityReducer,
290    ) -> Result<Array2<f64>> {
291        let reduced = data.dot(&reducer.components.t());
292        Ok(reduced)
293    }
294
295    // Helper methods for feature selection
296
297    fn compute_variance_scores(&self, data: &Array2<f64>) -> Array1<f64> {
298        let n_features = data.ncols();
299        let mut scores = Array1::zeros(n_features);
300
301        for j in 0..n_features {
302            let column = data.column(j);
303            scores[j] = column.var(0.0);
304        }
305
306        scores
307    }
308
309    fn compute_correlation_scores(&self, data: &Array2<f64>) -> Array1<f64> {
310        // Placeholder: compute feature correlations
311        let n_features = data.ncols();
312        Array1::from_vec(
313            (0..n_features)
314                .map(|_| rand::thread_rng().gen::<f64>())
315                .collect(),
316        )
317    }
318
319    fn compute_mutual_information_scores(&self, data: &Array2<f64>) -> Array1<f64> {
320        // Placeholder: compute mutual information
321        let n_features = data.ncols();
322        Array1::from_vec(
323            (0..n_features)
324                .map(|_| rand::thread_rng().gen::<f64>())
325                .collect(),
326        )
327    }
328
329    fn compute_quantum_information_scores(&self, data: &Array2<f64>) -> Array1<f64> {
330        // Placeholder: compute quantum information scores
331        let n_features = data.ncols();
332        Array1::from_vec(
333            (0..n_features)
334                .map(|_| rand::thread_rng().gen::<f64>())
335                .collect(),
336        )
337    }
338
339    // Helper methods for dimensionality reduction
340
341    fn fit_pca(&self, data: &Array2<f64>, target_dim: usize) -> Result<DimensionalityReducer> {
342        // Placeholder PCA implementation
343        let n_features = data.ncols();
344        let components =
345            Array2::from_shape_fn(
346                (target_dim, n_features),
347                |(i, j)| {
348                    if i == j {
349                        1.0
350                    } else {
351                        0.0
352                    }
353                },
354            );
355
356        let explained_variance =
357            Array1::from_vec((0..target_dim).map(|i| 1.0 / (i + 1) as f64).collect());
358
359        Ok(DimensionalityReducer {
360            components,
361            explained_variance,
362            target_dim,
363        })
364    }
365
    /// Placeholder ICA: currently delegates to the PCA placeholder,
    /// forwarding `data` and `target_dim` unchanged.
    fn fit_ica(&self, data: &Array2<f64>, target_dim: usize) -> Result<DimensionalityReducer> {
        // Placeholder ICA implementation
        self.fit_pca(data, target_dim)
    }
370
    /// Placeholder UMAP: currently delegates to the PCA placeholder,
    /// forwarding `data` and `target_dim` unchanged.
    fn fit_umap(&self, data: &Array2<f64>, target_dim: usize) -> Result<DimensionalityReducer> {
        // Placeholder UMAP implementation
        self.fit_pca(data, target_dim)
    }
375
    /// Placeholder Quantum PCA: currently delegates to the classical PCA
    /// placeholder, forwarding `data` and `target_dim` unchanged.
    fn fit_quantum_pca(
        &self,
        data: &Array2<f64>,
        target_dim: usize,
    ) -> Result<DimensionalityReducer> {
        // Placeholder Quantum PCA implementation
        self.fit_pca(data, target_dim)
    }
384
    /// Placeholder Quantum Manifold learning: currently delegates to the
    /// classical PCA placeholder, forwarding `data` and `target_dim`.
    fn fit_quantum_manifold(
        &self,
        data: &Array2<f64>,
        target_dim: usize,
    ) -> Result<DimensionalityReducer> {
        // Placeholder Quantum Manifold implementation
        self.fit_pca(data, target_dim)
    }
393}