//! Data preprocessing pipeline for quantum anomaly detection
//! (normalization, feature selection, dimensionality reduction).

use crate::error::{MLError, Result};
use scirs2_core::ndarray::{Array1, Array2, Axis};
use scirs2_core::random::prelude::*;
use scirs2_core::random::Rng;

use super::config::{
    DimensionalityReduction, FeatureSelection, MissingValueStrategy, NoiseFiltering,
    NormalizationType, PreprocessingConfig,
};
12
13#[derive(Debug)]
15pub struct DataPreprocessor {
16 config: PreprocessingConfig,
17 fitted: bool,
18 normalization_params: Option<NormalizationParams>,
19 feature_selector: Option<FeatureSelector>,
20 dimensionality_reducer: Option<DimensionalityReducer>,
21}
22
23#[derive(Debug, Clone)]
25pub struct NormalizationParams {
26 pub means: Array1<f64>,
27 pub stds: Array1<f64>,
28 pub mins: Array1<f64>,
29 pub maxs: Array1<f64>,
30}
31
32#[derive(Debug)]
34pub struct FeatureSelector {
35 pub selected_features: Vec<usize>,
36 pub feature_scores: Array1<f64>,
37}
38
39#[derive(Debug)]
41pub struct DimensionalityReducer {
42 pub components: Array2<f64>,
43 pub explained_variance: Array1<f64>,
44 pub target_dim: usize,
45}
46
47impl DataPreprocessor {
48 pub fn new(config: PreprocessingConfig) -> Self {
50 DataPreprocessor {
51 config,
52 fitted: false,
53 normalization_params: None,
54 feature_selector: None,
55 dimensionality_reducer: None,
56 }
57 }
58
59 pub fn fit_transform(&mut self, data: &Array2<f64>) -> Result<Array2<f64>> {
61 self.fit(data)?;
62 self.transform(data)
63 }
64
65 pub fn fit(&mut self, data: &Array2<f64>) -> Result<()> {
67 self.normalization_params = Some(self.compute_normalization_params(data));
69
70 let mut current_data = data.clone();
71
72 if let Some(ref params) = self.normalization_params {
74 current_data = self.apply_normalization(¤t_data, params)?;
75 }
76
77 if self.config.feature_selection.is_some() {
79 self.feature_selector = Some(self.fit_feature_selector(¤t_data)?);
80 if let Some(ref selector) = self.feature_selector {
82 current_data = self.apply_feature_selection(¤t_data, selector)?;
83 }
84 }
85
86 if self.config.dimensionality_reduction.is_some() {
88 self.dimensionality_reducer = Some(self.fit_dimensionality_reducer(¤t_data)?);
89 }
90
91 self.fitted = true;
92 Ok(())
93 }
94
95 pub fn transform(&self, data: &Array2<f64>) -> Result<Array2<f64>> {
97 if !self.fitted {
98 return Err(MLError::MLOperationError(
99 "Preprocessor must be fitted before transform".to_string(),
100 ));
101 }
102
103 let mut transformed = data.clone();
104
105 if let Some(ref params) = self.normalization_params {
107 transformed = self.apply_normalization(&transformed, params)?;
108 }
109
110 if let Some(ref selector) = self.feature_selector {
112 transformed = self.apply_feature_selection(&transformed, selector)?;
113 }
114
115 if let Some(ref reducer) = self.dimensionality_reducer {
117 transformed = self.apply_dimensionality_reduction(&transformed, reducer)?;
118 }
119
120 Ok(transformed)
121 }
122
123 fn compute_normalization_params(&self, data: &Array2<f64>) -> NormalizationParams {
125 let n_features = data.ncols();
126 let mut means = Array1::zeros(n_features);
127 let mut stds = Array1::zeros(n_features);
128 let mut mins = Array1::zeros(n_features);
129 let mut maxs = Array1::zeros(n_features);
130
131 for j in 0..n_features {
132 let column = data.column(j);
133 means[j] = column.mean().unwrap_or(0.0);
134 stds[j] = column.std(0.0);
135 mins[j] = column.fold(f64::INFINITY, |a, &b| a.min(b));
136 maxs[j] = column.fold(f64::NEG_INFINITY, |a, &b| a.max(b));
137 }
138
139 NormalizationParams {
140 means,
141 stds,
142 mins,
143 maxs,
144 }
145 }
146
147 fn apply_normalization(
149 &self,
150 data: &Array2<f64>,
151 params: &NormalizationParams,
152 ) -> Result<Array2<f64>> {
153 let mut normalized = data.clone();
154
155 match self.config.normalization {
156 NormalizationType::ZScore => {
157 for j in 0..data.ncols() {
158 let mut column = normalized.column_mut(j);
159 if params.stds[j] > 1e-8 {
160 column.mapv_inplace(|x| (x - params.means[j]) / params.stds[j]);
161 }
162 }
163 }
164 NormalizationType::MinMax => {
165 for j in 0..data.ncols() {
166 let mut column = normalized.column_mut(j);
167 let range = params.maxs[j] - params.mins[j];
168 if range > 1e-8 {
169 column.mapv_inplace(|x| (x - params.mins[j]) / range);
170 }
171 }
172 }
173 NormalizationType::Robust => {
174 for j in 0..data.ncols() {
176 let mut column_data: Vec<f64> = data.column(j).to_vec();
177 column_data
178 .sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
179
180 let median = if column_data.len() % 2 == 0 {
181 (column_data[column_data.len() / 2 - 1]
182 + column_data[column_data.len() / 2])
183 / 2.0
184 } else {
185 column_data[column_data.len() / 2]
186 };
187
188 let q1 = column_data[column_data.len() / 4];
189 let q3 = column_data[3 * column_data.len() / 4];
190 let iqr = q3 - q1;
191
192 let mut column = normalized.column_mut(j);
193 if iqr > 1e-8 {
194 column.mapv_inplace(|x| (x - median) / iqr);
195 }
196 }
197 }
198 NormalizationType::Quantum => {
199 for j in 0..data.ncols() {
201 let mut column = normalized.column_mut(j);
202 let norm = column.dot(&column).sqrt();
203 if norm > 1e-8 {
204 column.mapv_inplace(|x| x / norm);
205 }
206 }
207 }
208 }
209
210 Ok(normalized)
211 }
212
213 fn fit_feature_selector(&self, data: &Array2<f64>) -> Result<FeatureSelector> {
215 let n_features = data.ncols();
216
217 let feature_scores = match &self.config.feature_selection {
218 Some(FeatureSelection::Variance) => self.compute_variance_scores(data),
219 Some(FeatureSelection::Correlation) => self.compute_correlation_scores(data),
220 Some(FeatureSelection::MutualInformation) => {
221 self.compute_mutual_information_scores(data)
222 }
223 Some(FeatureSelection::QuantumInformation) => {
224 self.compute_quantum_information_scores(data)
225 }
226 None => Array1::zeros(n_features),
227 };
228
229 let mut indexed_scores: Vec<(usize, f64)> = feature_scores
231 .iter()
232 .enumerate()
233 .map(|(i, &score)| (i, score))
234 .collect();
235
236 indexed_scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
237
238 let num_selected = (n_features / 2).max(1);
239 let selected_features: Vec<usize> = indexed_scores
240 .into_iter()
241 .take(num_selected)
242 .map(|(idx, _)| idx)
243 .collect();
244
245 Ok(FeatureSelector {
246 selected_features,
247 feature_scores,
248 })
249 }
250
251 fn apply_feature_selection(
253 &self,
254 data: &Array2<f64>,
255 selector: &FeatureSelector,
256 ) -> Result<Array2<f64>> {
257 let selected_data = data.select(Axis(1), &selector.selected_features);
258 Ok(selected_data)
259 }
260
261 fn fit_dimensionality_reducer(&self, data: &Array2<f64>) -> Result<DimensionalityReducer> {
263 let n_features = data.ncols();
264 let target_dim = (n_features / 2).max(1);
265
266 match &self.config.dimensionality_reduction {
267 Some(DimensionalityReduction::PCA) => self.fit_pca(data, target_dim),
268 Some(DimensionalityReduction::ICA) => self.fit_ica(data, target_dim),
269 Some(DimensionalityReduction::UMAP) => self.fit_umap(data, target_dim),
270 Some(DimensionalityReduction::QuantumPCA) => self.fit_quantum_pca(data, target_dim),
271 Some(DimensionalityReduction::QuantumManifold) => {
272 self.fit_quantum_manifold(data, target_dim)
273 }
274 None => {
275 let components = Array2::eye(n_features);
277 let explained_variance = Array1::ones(n_features);
278 Ok(DimensionalityReducer {
279 components,
280 explained_variance,
281 target_dim: n_features,
282 })
283 }
284 }
285 }
286
287 fn apply_dimensionality_reduction(
289 &self,
290 data: &Array2<f64>,
291 reducer: &DimensionalityReducer,
292 ) -> Result<Array2<f64>> {
293 let reduced = data.dot(&reducer.components.t());
294 Ok(reduced)
295 }
296
297 fn compute_variance_scores(&self, data: &Array2<f64>) -> Array1<f64> {
300 let n_features = data.ncols();
301 let mut scores = Array1::zeros(n_features);
302
303 for j in 0..n_features {
304 let column = data.column(j);
305 scores[j] = column.var(0.0);
306 }
307
308 scores
309 }
310
311 fn compute_correlation_scores(&self, data: &Array2<f64>) -> Array1<f64> {
312 let n_features = data.ncols();
314 Array1::from_vec((0..n_features).map(|_| thread_rng().gen::<f64>()).collect())
315 }
316
317 fn compute_mutual_information_scores(&self, data: &Array2<f64>) -> Array1<f64> {
318 let n_features = data.ncols();
320 Array1::from_vec((0..n_features).map(|_| thread_rng().gen::<f64>()).collect())
321 }
322
323 fn compute_quantum_information_scores(&self, data: &Array2<f64>) -> Array1<f64> {
324 let n_features = data.ncols();
326 Array1::from_vec((0..n_features).map(|_| thread_rng().gen::<f64>()).collect())
327 }
328
329 fn fit_pca(&self, data: &Array2<f64>, target_dim: usize) -> Result<DimensionalityReducer> {
332 let n_features = data.ncols();
334 let components =
335 Array2::from_shape_fn(
336 (target_dim, n_features),
337 |(i, j)| {
338 if i == j {
339 1.0
340 } else {
341 0.0
342 }
343 },
344 );
345
346 let explained_variance =
347 Array1::from_vec((0..target_dim).map(|i| 1.0 / (i + 1) as f64).collect());
348
349 Ok(DimensionalityReducer {
350 components,
351 explained_variance,
352 target_dim,
353 })
354 }
355
356 fn fit_ica(&self, data: &Array2<f64>, target_dim: usize) -> Result<DimensionalityReducer> {
357 self.fit_pca(data, target_dim)
359 }
360
361 fn fit_umap(&self, data: &Array2<f64>, target_dim: usize) -> Result<DimensionalityReducer> {
362 self.fit_pca(data, target_dim)
364 }
365
366 fn fit_quantum_pca(
367 &self,
368 data: &Array2<f64>,
369 target_dim: usize,
370 ) -> Result<DimensionalityReducer> {
371 self.fit_pca(data, target_dim)
373 }
374
375 fn fit_quantum_manifold(
376 &self,
377 data: &Array2<f64>,
378 target_dim: usize,
379 ) -> Result<DimensionalityReducer> {
380 self.fit_pca(data, target_dim)
382 }
383}