//! quantrs2_ml/anomaly_detection/preprocessing.rs — classical data-preprocessing pipeline
//! (normalization, feature selection, dimensionality reduction) for anomaly detection.
use crate::error::{MLError, Result};
4use scirs2_core::random::prelude::*;
5use scirs2_core::ndarray::{Array1, Array2, Axis};
6use scirs2_core::random::Rng;
7
8use super::config::{
9 DimensionalityReduction, FeatureSelection, MissingValueStrategy, NoiseFiltering,
10 NormalizationType, PreprocessingConfig,
11};
12
/// Stateful preprocessing pipeline: must be `fit` on training data before
/// `transform` can be applied to new data.
#[derive(Debug)]
pub struct DataPreprocessor {
    /// Pipeline configuration (normalization type, optional stages).
    config: PreprocessingConfig,
    /// Set to `true` once `fit` completes; `transform` refuses to run before then.
    fitted: bool,
    /// Per-feature statistics computed during `fit` (always populated by `fit`).
    normalization_params: Option<NormalizationParams>,
    /// Present only when feature selection is configured and has been fitted.
    feature_selector: Option<FeatureSelector>,
    /// Present only when dimensionality reduction is configured and has been fitted.
    dimensionality_reducer: Option<DimensionalityReducer>,
}
22
/// Per-feature statistics captured at fit time, one entry per column.
#[derive(Debug, Clone)]
pub struct NormalizationParams {
    /// Column means (used by z-score normalization).
    pub means: Array1<f64>,
    /// Column standard deviations, population form / ddof = 0 (used by z-score).
    pub stds: Array1<f64>,
    /// Column minima (used by min-max normalization).
    pub mins: Array1<f64>,
    /// Column maxima (used by min-max normalization).
    pub maxs: Array1<f64>,
}
31
/// Result of fitting a feature-selection strategy.
#[derive(Debug)]
pub struct FeatureSelector {
    /// Column indices to keep, ordered by descending score.
    pub selected_features: Vec<usize>,
    /// Score per original feature (higher = more informative).
    pub feature_scores: Array1<f64>,
}
38
/// Result of fitting a dimensionality-reduction strategy.
#[derive(Debug)]
pub struct DimensionalityReducer {
    /// Projection matrix of shape (target_dim, n_features); applied as `data · componentsᵀ`.
    pub components: Array2<f64>,
    /// Per-component explained-variance values (placeholder magnitudes in this implementation).
    pub explained_variance: Array1<f64>,
    /// Output dimensionality after projection.
    pub target_dim: usize,
}
46
47impl DataPreprocessor {
48 pub fn new(config: PreprocessingConfig) -> Self {
50 DataPreprocessor {
51 config,
52 fitted: false,
53 normalization_params: None,
54 feature_selector: None,
55 dimensionality_reducer: None,
56 }
57 }
58
59 pub fn fit_transform(&mut self, data: &Array2<f64>) -> Result<Array2<f64>> {
61 self.fit(data)?;
62 self.transform(data)
63 }
64
65 pub fn fit(&mut self, data: &Array2<f64>) -> Result<()> {
67 self.normalization_params = Some(self.compute_normalization_params(data));
69
70 let mut current_data = data.clone();
71
72 if let Some(ref params) = self.normalization_params {
74 current_data = self.apply_normalization(¤t_data, params)?;
75 }
76
77 if self.config.feature_selection.is_some() {
79 self.feature_selector = Some(self.fit_feature_selector(¤t_data)?);
80 if let Some(ref selector) = self.feature_selector {
82 current_data = self.apply_feature_selection(¤t_data, selector)?;
83 }
84 }
85
86 if self.config.dimensionality_reduction.is_some() {
88 self.dimensionality_reducer = Some(self.fit_dimensionality_reducer(¤t_data)?);
89 }
90
91 self.fitted = true;
92 Ok(())
93 }
94
95 pub fn transform(&self, data: &Array2<f64>) -> Result<Array2<f64>> {
97 if !self.fitted {
98 return Err(MLError::MLOperationError(
99 "Preprocessor must be fitted before transform".to_string(),
100 ));
101 }
102
103 let mut transformed = data.clone();
104
105 if let Some(ref params) = self.normalization_params {
107 transformed = self.apply_normalization(&transformed, params)?;
108 }
109
110 if let Some(ref selector) = self.feature_selector {
112 transformed = self.apply_feature_selection(&transformed, selector)?;
113 }
114
115 if let Some(ref reducer) = self.dimensionality_reducer {
117 transformed = self.apply_dimensionality_reduction(&transformed, reducer)?;
118 }
119
120 Ok(transformed)
121 }
122
123 fn compute_normalization_params(&self, data: &Array2<f64>) -> NormalizationParams {
125 let n_features = data.ncols();
126 let mut means = Array1::zeros(n_features);
127 let mut stds = Array1::zeros(n_features);
128 let mut mins = Array1::zeros(n_features);
129 let mut maxs = Array1::zeros(n_features);
130
131 for j in 0..n_features {
132 let column = data.column(j);
133 means[j] = column.mean().unwrap_or(0.0);
134 stds[j] = column.std(0.0);
135 mins[j] = column.fold(f64::INFINITY, |a, &b| a.min(b));
136 maxs[j] = column.fold(f64::NEG_INFINITY, |a, &b| a.max(b));
137 }
138
139 NormalizationParams {
140 means,
141 stds,
142 mins,
143 maxs,
144 }
145 }
146
147 fn apply_normalization(
149 &self,
150 data: &Array2<f64>,
151 params: &NormalizationParams,
152 ) -> Result<Array2<f64>> {
153 let mut normalized = data.clone();
154
155 match self.config.normalization {
156 NormalizationType::ZScore => {
157 for j in 0..data.ncols() {
158 let mut column = normalized.column_mut(j);
159 if params.stds[j] > 1e-8 {
160 column.mapv_inplace(|x| (x - params.means[j]) / params.stds[j]);
161 }
162 }
163 }
164 NormalizationType::MinMax => {
165 for j in 0..data.ncols() {
166 let mut column = normalized.column_mut(j);
167 let range = params.maxs[j] - params.mins[j];
168 if range > 1e-8 {
169 column.mapv_inplace(|x| (x - params.mins[j]) / range);
170 }
171 }
172 }
173 NormalizationType::Robust => {
174 for j in 0..data.ncols() {
176 let mut column_data: Vec<f64> = data.column(j).to_vec();
177 column_data.sort_by(|a, b| a.partial_cmp(b).unwrap());
178
179 let median = if column_data.len() % 2 == 0 {
180 (column_data[column_data.len() / 2 - 1]
181 + column_data[column_data.len() / 2])
182 / 2.0
183 } else {
184 column_data[column_data.len() / 2]
185 };
186
187 let q1 = column_data[column_data.len() / 4];
188 let q3 = column_data[3 * column_data.len() / 4];
189 let iqr = q3 - q1;
190
191 let mut column = normalized.column_mut(j);
192 if iqr > 1e-8 {
193 column.mapv_inplace(|x| (x - median) / iqr);
194 }
195 }
196 }
197 NormalizationType::Quantum => {
198 for j in 0..data.ncols() {
200 let mut column = normalized.column_mut(j);
201 let norm = column.dot(&column).sqrt();
202 if norm > 1e-8 {
203 column.mapv_inplace(|x| x / norm);
204 }
205 }
206 }
207 }
208
209 Ok(normalized)
210 }
211
212 fn fit_feature_selector(&self, data: &Array2<f64>) -> Result<FeatureSelector> {
214 let n_features = data.ncols();
215
216 let feature_scores = match &self.config.feature_selection {
217 Some(FeatureSelection::Variance) => self.compute_variance_scores(data),
218 Some(FeatureSelection::Correlation) => self.compute_correlation_scores(data),
219 Some(FeatureSelection::MutualInformation) => {
220 self.compute_mutual_information_scores(data)
221 }
222 Some(FeatureSelection::QuantumInformation) => {
223 self.compute_quantum_information_scores(data)
224 }
225 None => Array1::zeros(n_features),
226 };
227
228 let mut indexed_scores: Vec<(usize, f64)> = feature_scores
230 .iter()
231 .enumerate()
232 .map(|(i, &score)| (i, score))
233 .collect();
234
235 indexed_scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
236
237 let num_selected = (n_features / 2).max(1);
238 let selected_features: Vec<usize> = indexed_scores
239 .into_iter()
240 .take(num_selected)
241 .map(|(idx, _)| idx)
242 .collect();
243
244 Ok(FeatureSelector {
245 selected_features,
246 feature_scores,
247 })
248 }
249
250 fn apply_feature_selection(
252 &self,
253 data: &Array2<f64>,
254 selector: &FeatureSelector,
255 ) -> Result<Array2<f64>> {
256 let selected_data = data.select(Axis(1), &selector.selected_features);
257 Ok(selected_data)
258 }
259
260 fn fit_dimensionality_reducer(&self, data: &Array2<f64>) -> Result<DimensionalityReducer> {
262 let n_features = data.ncols();
263 let target_dim = (n_features / 2).max(1);
264
265 match &self.config.dimensionality_reduction {
266 Some(DimensionalityReduction::PCA) => self.fit_pca(data, target_dim),
267 Some(DimensionalityReduction::ICA) => self.fit_ica(data, target_dim),
268 Some(DimensionalityReduction::UMAP) => self.fit_umap(data, target_dim),
269 Some(DimensionalityReduction::QuantumPCA) => self.fit_quantum_pca(data, target_dim),
270 Some(DimensionalityReduction::QuantumManifold) => {
271 self.fit_quantum_manifold(data, target_dim)
272 }
273 None => {
274 let components = Array2::eye(n_features);
276 let explained_variance = Array1::ones(n_features);
277 Ok(DimensionalityReducer {
278 components,
279 explained_variance,
280 target_dim: n_features,
281 })
282 }
283 }
284 }
285
286 fn apply_dimensionality_reduction(
288 &self,
289 data: &Array2<f64>,
290 reducer: &DimensionalityReducer,
291 ) -> Result<Array2<f64>> {
292 let reduced = data.dot(&reducer.components.t());
293 Ok(reduced)
294 }
295
296 fn compute_variance_scores(&self, data: &Array2<f64>) -> Array1<f64> {
299 let n_features = data.ncols();
300 let mut scores = Array1::zeros(n_features);
301
302 for j in 0..n_features {
303 let column = data.column(j);
304 scores[j] = column.var(0.0);
305 }
306
307 scores
308 }
309
310 fn compute_correlation_scores(&self, data: &Array2<f64>) -> Array1<f64> {
311 let n_features = data.ncols();
313 Array1::from_vec(
314 (0..n_features)
315 .map(|_| thread_rng().gen::<f64>())
316 .collect(),
317 )
318 }
319
320 fn compute_mutual_information_scores(&self, data: &Array2<f64>) -> Array1<f64> {
321 let n_features = data.ncols();
323 Array1::from_vec(
324 (0..n_features)
325 .map(|_| thread_rng().gen::<f64>())
326 .collect(),
327 )
328 }
329
330 fn compute_quantum_information_scores(&self, data: &Array2<f64>) -> Array1<f64> {
331 let n_features = data.ncols();
333 Array1::from_vec(
334 (0..n_features)
335 .map(|_| thread_rng().gen::<f64>())
336 .collect(),
337 )
338 }
339
340 fn fit_pca(&self, data: &Array2<f64>, target_dim: usize) -> Result<DimensionalityReducer> {
343 let n_features = data.ncols();
345 let components =
346 Array2::from_shape_fn(
347 (target_dim, n_features),
348 |(i, j)| {
349 if i == j {
350 1.0
351 } else {
352 0.0
353 }
354 },
355 );
356
357 let explained_variance =
358 Array1::from_vec((0..target_dim).map(|i| 1.0 / (i + 1) as f64).collect());
359
360 Ok(DimensionalityReducer {
361 components,
362 explained_variance,
363 target_dim,
364 })
365 }
366
367 fn fit_ica(&self, data: &Array2<f64>, target_dim: usize) -> Result<DimensionalityReducer> {
368 self.fit_pca(data, target_dim)
370 }
371
372 fn fit_umap(&self, data: &Array2<f64>, target_dim: usize) -> Result<DimensionalityReducer> {
373 self.fit_pca(data, target_dim)
375 }
376
377 fn fit_quantum_pca(
378 &self,
379 data: &Array2<f64>,
380 target_dim: usize,
381 ) -> Result<DimensionalityReducer> {
382 self.fit_pca(data, target_dim)
384 }
385
386 fn fit_quantum_manifold(
387 &self,
388 data: &Array2<f64>,
389 target_dim: usize,
390 ) -> Result<DimensionalityReducer> {
391 self.fit_pca(data, target_dim)
393 }
394}