//! Data preprocessing pipeline for quantum anomaly detection
//! (`quantrs2_ml/anomaly_detection/preprocessing.rs`).

use crate::error::{MLError, Result};
use ndarray::{Array1, Array2, Axis};
use rand::Rng;

use super::config::{
    DimensionalityReduction, FeatureSelection, MissingValueStrategy, NoiseFiltering,
    NormalizationType, PreprocessingConfig,
};
/// Fit-then-transform preprocessing pipeline: normalization, optional
/// feature selection, and optional dimensionality reduction, in that order.
#[derive(Debug)]
pub struct DataPreprocessor {
    // Pipeline configuration: which stages run and with what strategy.
    config: PreprocessingConfig,
    // Set to true by `fit`; `transform` refuses to run until then.
    fitted: bool,
    // Per-feature statistics captured during `fit`.
    normalization_params: Option<NormalizationParams>,
    // Present only when feature selection is configured and fitted.
    feature_selector: Option<FeatureSelector>,
    // Present only when dimensionality reduction is configured and fitted.
    dimensionality_reducer: Option<DimensionalityReducer>,
}
21
/// Per-feature statistics captured at fit time and replayed at transform time.
#[derive(Debug, Clone)]
pub struct NormalizationParams {
    /// Per-feature means.
    pub means: Array1<f64>,
    /// Per-feature population standard deviations (ddof = 0).
    pub stds: Array1<f64>,
    /// Per-feature minima.
    pub mins: Array1<f64>,
    /// Per-feature maxima.
    pub maxs: Array1<f64>,
}
30
/// Result of fitting a feature-selection criterion: which columns to keep
/// and the score assigned to every original feature.
#[derive(Debug)]
pub struct FeatureSelector {
    /// Column indices (into the pre-selection feature space) that are kept.
    pub selected_features: Vec<usize>,
    /// Score of each original feature under the configured criterion.
    pub feature_scores: Array1<f64>,
}
37
/// A fitted linear projection used to reduce feature dimensionality.
#[derive(Debug)]
pub struct DimensionalityReducer {
    /// Projection matrix of shape (target_dim, n_features); data is
    /// projected via `data.dot(components.t())`.
    pub components: Array2<f64>,
    /// Per-component explained-variance values (synthetic for placeholders).
    pub explained_variance: Array1<f64>,
    /// Output dimensionality of the projection.
    pub target_dim: usize,
}
45
46impl DataPreprocessor {
47 pub fn new(config: PreprocessingConfig) -> Self {
49 DataPreprocessor {
50 config,
51 fitted: false,
52 normalization_params: None,
53 feature_selector: None,
54 dimensionality_reducer: None,
55 }
56 }
57
58 pub fn fit_transform(&mut self, data: &Array2<f64>) -> Result<Array2<f64>> {
60 self.fit(data)?;
61 self.transform(data)
62 }
63
64 pub fn fit(&mut self, data: &Array2<f64>) -> Result<()> {
66 self.normalization_params = Some(self.compute_normalization_params(data));
68
69 let mut current_data = data.clone();
70
71 if let Some(ref params) = self.normalization_params {
73 current_data = self.apply_normalization(¤t_data, params)?;
74 }
75
76 if self.config.feature_selection.is_some() {
78 self.feature_selector = Some(self.fit_feature_selector(¤t_data)?);
79 if let Some(ref selector) = self.feature_selector {
81 current_data = self.apply_feature_selection(¤t_data, selector)?;
82 }
83 }
84
85 if self.config.dimensionality_reduction.is_some() {
87 self.dimensionality_reducer = Some(self.fit_dimensionality_reducer(¤t_data)?);
88 }
89
90 self.fitted = true;
91 Ok(())
92 }
93
94 pub fn transform(&self, data: &Array2<f64>) -> Result<Array2<f64>> {
96 if !self.fitted {
97 return Err(MLError::MLOperationError(
98 "Preprocessor must be fitted before transform".to_string(),
99 ));
100 }
101
102 let mut transformed = data.clone();
103
104 if let Some(ref params) = self.normalization_params {
106 transformed = self.apply_normalization(&transformed, params)?;
107 }
108
109 if let Some(ref selector) = self.feature_selector {
111 transformed = self.apply_feature_selection(&transformed, selector)?;
112 }
113
114 if let Some(ref reducer) = self.dimensionality_reducer {
116 transformed = self.apply_dimensionality_reduction(&transformed, reducer)?;
117 }
118
119 Ok(transformed)
120 }
121
122 fn compute_normalization_params(&self, data: &Array2<f64>) -> NormalizationParams {
124 let n_features = data.ncols();
125 let mut means = Array1::zeros(n_features);
126 let mut stds = Array1::zeros(n_features);
127 let mut mins = Array1::zeros(n_features);
128 let mut maxs = Array1::zeros(n_features);
129
130 for j in 0..n_features {
131 let column = data.column(j);
132 means[j] = column.mean().unwrap_or(0.0);
133 stds[j] = column.std(0.0);
134 mins[j] = column.fold(f64::INFINITY, |a, &b| a.min(b));
135 maxs[j] = column.fold(f64::NEG_INFINITY, |a, &b| a.max(b));
136 }
137
138 NormalizationParams {
139 means,
140 stds,
141 mins,
142 maxs,
143 }
144 }
145
146 fn apply_normalization(
148 &self,
149 data: &Array2<f64>,
150 params: &NormalizationParams,
151 ) -> Result<Array2<f64>> {
152 let mut normalized = data.clone();
153
154 match self.config.normalization {
155 NormalizationType::ZScore => {
156 for j in 0..data.ncols() {
157 let mut column = normalized.column_mut(j);
158 if params.stds[j] > 1e-8 {
159 column.mapv_inplace(|x| (x - params.means[j]) / params.stds[j]);
160 }
161 }
162 }
163 NormalizationType::MinMax => {
164 for j in 0..data.ncols() {
165 let mut column = normalized.column_mut(j);
166 let range = params.maxs[j] - params.mins[j];
167 if range > 1e-8 {
168 column.mapv_inplace(|x| (x - params.mins[j]) / range);
169 }
170 }
171 }
172 NormalizationType::Robust => {
173 for j in 0..data.ncols() {
175 let mut column_data: Vec<f64> = data.column(j).to_vec();
176 column_data.sort_by(|a, b| a.partial_cmp(b).unwrap());
177
178 let median = if column_data.len() % 2 == 0 {
179 (column_data[column_data.len() / 2 - 1]
180 + column_data[column_data.len() / 2])
181 / 2.0
182 } else {
183 column_data[column_data.len() / 2]
184 };
185
186 let q1 = column_data[column_data.len() / 4];
187 let q3 = column_data[3 * column_data.len() / 4];
188 let iqr = q3 - q1;
189
190 let mut column = normalized.column_mut(j);
191 if iqr > 1e-8 {
192 column.mapv_inplace(|x| (x - median) / iqr);
193 }
194 }
195 }
196 NormalizationType::Quantum => {
197 for j in 0..data.ncols() {
199 let mut column = normalized.column_mut(j);
200 let norm = column.dot(&column).sqrt();
201 if norm > 1e-8 {
202 column.mapv_inplace(|x| x / norm);
203 }
204 }
205 }
206 }
207
208 Ok(normalized)
209 }
210
211 fn fit_feature_selector(&self, data: &Array2<f64>) -> Result<FeatureSelector> {
213 let n_features = data.ncols();
214
215 let feature_scores = match &self.config.feature_selection {
216 Some(FeatureSelection::Variance) => self.compute_variance_scores(data),
217 Some(FeatureSelection::Correlation) => self.compute_correlation_scores(data),
218 Some(FeatureSelection::MutualInformation) => {
219 self.compute_mutual_information_scores(data)
220 }
221 Some(FeatureSelection::QuantumInformation) => {
222 self.compute_quantum_information_scores(data)
223 }
224 None => Array1::zeros(n_features),
225 };
226
227 let mut indexed_scores: Vec<(usize, f64)> = feature_scores
229 .iter()
230 .enumerate()
231 .map(|(i, &score)| (i, score))
232 .collect();
233
234 indexed_scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
235
236 let num_selected = (n_features / 2).max(1);
237 let selected_features: Vec<usize> = indexed_scores
238 .into_iter()
239 .take(num_selected)
240 .map(|(idx, _)| idx)
241 .collect();
242
243 Ok(FeatureSelector {
244 selected_features,
245 feature_scores,
246 })
247 }
248
249 fn apply_feature_selection(
251 &self,
252 data: &Array2<f64>,
253 selector: &FeatureSelector,
254 ) -> Result<Array2<f64>> {
255 let selected_data = data.select(Axis(1), &selector.selected_features);
256 Ok(selected_data)
257 }
258
259 fn fit_dimensionality_reducer(&self, data: &Array2<f64>) -> Result<DimensionalityReducer> {
261 let n_features = data.ncols();
262 let target_dim = (n_features / 2).max(1);
263
264 match &self.config.dimensionality_reduction {
265 Some(DimensionalityReduction::PCA) => self.fit_pca(data, target_dim),
266 Some(DimensionalityReduction::ICA) => self.fit_ica(data, target_dim),
267 Some(DimensionalityReduction::UMAP) => self.fit_umap(data, target_dim),
268 Some(DimensionalityReduction::QuantumPCA) => self.fit_quantum_pca(data, target_dim),
269 Some(DimensionalityReduction::QuantumManifold) => {
270 self.fit_quantum_manifold(data, target_dim)
271 }
272 None => {
273 let components = Array2::eye(n_features);
275 let explained_variance = Array1::ones(n_features);
276 Ok(DimensionalityReducer {
277 components,
278 explained_variance,
279 target_dim: n_features,
280 })
281 }
282 }
283 }
284
285 fn apply_dimensionality_reduction(
287 &self,
288 data: &Array2<f64>,
289 reducer: &DimensionalityReducer,
290 ) -> Result<Array2<f64>> {
291 let reduced = data.dot(&reducer.components.t());
292 Ok(reduced)
293 }
294
295 fn compute_variance_scores(&self, data: &Array2<f64>) -> Array1<f64> {
298 let n_features = data.ncols();
299 let mut scores = Array1::zeros(n_features);
300
301 for j in 0..n_features {
302 let column = data.column(j);
303 scores[j] = column.var(0.0);
304 }
305
306 scores
307 }
308
309 fn compute_correlation_scores(&self, data: &Array2<f64>) -> Array1<f64> {
310 let n_features = data.ncols();
312 Array1::from_vec(
313 (0..n_features)
314 .map(|_| rand::thread_rng().gen::<f64>())
315 .collect(),
316 )
317 }
318
319 fn compute_mutual_information_scores(&self, data: &Array2<f64>) -> Array1<f64> {
320 let n_features = data.ncols();
322 Array1::from_vec(
323 (0..n_features)
324 .map(|_| rand::thread_rng().gen::<f64>())
325 .collect(),
326 )
327 }
328
329 fn compute_quantum_information_scores(&self, data: &Array2<f64>) -> Array1<f64> {
330 let n_features = data.ncols();
332 Array1::from_vec(
333 (0..n_features)
334 .map(|_| rand::thread_rng().gen::<f64>())
335 .collect(),
336 )
337 }
338
339 fn fit_pca(&self, data: &Array2<f64>, target_dim: usize) -> Result<DimensionalityReducer> {
342 let n_features = data.ncols();
344 let components =
345 Array2::from_shape_fn(
346 (target_dim, n_features),
347 |(i, j)| {
348 if i == j {
349 1.0
350 } else {
351 0.0
352 }
353 },
354 );
355
356 let explained_variance =
357 Array1::from_vec((0..target_dim).map(|i| 1.0 / (i + 1) as f64).collect());
358
359 Ok(DimensionalityReducer {
360 components,
361 explained_variance,
362 target_dim,
363 })
364 }
365
    /// Placeholder: ICA is not implemented; delegates to the truncated
    /// identity projection produced by `fit_pca`.
    fn fit_ica(&self, data: &Array2<f64>, target_dim: usize) -> Result<DimensionalityReducer> {
        self.fit_pca(data, target_dim)
    }
370
    /// Placeholder: UMAP is not implemented; delegates to the truncated
    /// identity projection produced by `fit_pca`.
    fn fit_umap(&self, data: &Array2<f64>, target_dim: usize) -> Result<DimensionalityReducer> {
        self.fit_pca(data, target_dim)
    }
375
    /// Placeholder: quantum PCA is not implemented; delegates to the
    /// truncated identity projection produced by `fit_pca`.
    fn fit_quantum_pca(
        &self,
        data: &Array2<f64>,
        target_dim: usize,
    ) -> Result<DimensionalityReducer> {
        self.fit_pca(data, target_dim)
    }
384
    /// Placeholder: quantum manifold learning is not implemented; delegates
    /// to the truncated identity projection produced by `fit_pca`.
    fn fit_quantum_manifold(
        &self,
        data: &Array2<f64>,
        target_dim: usize,
    ) -> Result<DimensionalityReducer> {
        self.fit_pca(data, target_dim)
    }
393}